From 42672d73de4fab730a2d6e91c92d19afebeb40bb Mon Sep 17 00:00:00 2001
From: V3n3RiX
Date: Sat, 9 Feb 2019 20:03:56 +0000
Subject: sys-kernel/linux-{image,sources}-redcore-lts : version bump to v4.14.95 && v4.19.20

---
 .../files/0001-BFQ-v8r12-20171108.patch | 25199 -------------------
 ...ersion-0.162-CPU-scheduler-linux-hardened.patch | 9560 -------
 .../files/0002-BFQ-v8r12-20180404.patch | 4611 ----
 .../0002-Make-preemptible-kernel-default.patch | 733 -
 ...-Expose-vmsplit-for-our-poor-32-bit-users.patch | 48 -
 ...res-timeout-variants-of-schedule_timeout-.patch | 153 -
 ...e-calls-of-schedule_timeout-1-to-use-the-.patch | 50 -
 ...onvert-msleep-to-use-hrtimers-when-active.patch | 54 -
 ...-schedule-timeout-1-with-schedule_min_hrt.patch | 529 -
 ...-calls-to-schedule_timeout_interruptible-.patch | 311 -
 ...-calls-to-schedule_timeout_uninterruptibl.patch | 160 -
 ...rtimer-overlay-when-pm_freezing-since-som.patch | 69 -
 ...r-granularity-and-minimum-hrtimeout-confi.patch | 136 -
 ...efault-Hz-of-100-in-combination-with-MuQS.patch | 81 -
 ...ed-IRQs-optionally-the-default-which-can-.patch | 61 -
 .../files/0014-Swap-sucks.patch | 25 -
 ...SS.c-needs-irq_regs.h-to-use-get_irq_regs.patch | 19 -
 .../0016-unfuck-MuQSS-on-linux-4_14_15+.patch | 48 -
 .../0017-unfuck-MuQSS-on-linux-4_14_75+.patch | 14 -
 .../files/4.14-0001-BFQ-v8r12-20171108.patch | 25199 +++++++++++++++++++
 ...ersion-0.162-CPU-scheduler-linux-hardened.patch | 9560 +++++++
 .../files/4.14-0002-BFQ-v8r12-20180404.patch | 4611 ++++
 ...4.14-0002-Make-preemptible-kernel-default.patch | 733 +
 ...-Expose-vmsplit-for-our-poor-32-bit-users.patch | 48 +
 ...res-timeout-variants-of-schedule_timeout-.patch | 153 +
 ...e-calls-of-schedule_timeout-1-to-use-the-.patch | 50 +
 ...onvert-msleep-to-use-hrtimers-when-active.patch | 54 +
 ...-schedule-timeout-1-with-schedule_min_hrt.patch | 529 +
 ...-calls-to-schedule_timeout_interruptible-.patch | 311 +
 ...-calls-to-schedule_timeout_uninterruptibl.patch | 160 +
 ...rtimer-overlay-when-pm_freezing-since-som.patch | 69 +
 ...r-granularity-and-minimum-hrtimeout-confi.patch | 136 +
 ...efault-Hz-of-100-in-combination-with-MuQS.patch | 81 +
 ...ed-IRQs-optionally-the-default-which-can-.patch | 61 +
 .../files/4.14-0014-Swap-sucks.patch | 25 +
 ...SS.c-needs-irq_regs.h-to-use-get_irq_regs.patch | 19 +
 .../4.14-0016-unfuck-MuQSS-on-linux-4_14_15+.patch | 48 +
 .../4.14-0017-unfuck-MuQSS-on-linux-4_14_75+.patch | 14 +
 ...h10k-activate-user-space-firmware-loading.patch | 13 +
 ...introduce-NUMA-identity-node-sched-domain.patch | 46 +
 .../files/4.14-k10temp-add-ZEN-support.patch | 177 +
 .../files/4.14-linux-hardened.patch | 2868 +++
 .../files/4.14-mute-pps_state_mismatch.patch | 16 +
 .../files/4.14-redcore-lts-amd64.config | 9106 +++++++
 ...4-restore-SD_PREFER_SIBLING-on-MC-domains.patch | 12 +
 .../files/4.14-uksm-linux-hardened.patch | 6919 +++++
 ...t-Scheduler-version-v0.180-linux-hardened.patch | 10305 ++++++++
 ...19-0002-Fix-Werror-build-failure-in-tools.patch | 25 +
 ...4.19-0003-Make-preemptible-kernel-default.patch | 4648 ++++
 ...-Expose-vmsplit-for-our-poor-32-bit-users.patch | 48 +
 ...res-timeout-variants-of-schedule_timeout-.patch | 153 +
 ...e-calls-of-schedule_timeout-1-to-use-the-.patch | 49 +
 ...onvert-msleep-to-use-hrtimers-when-active.patch | 54 +
 ...-schedule-timeout-1-with-schedule_min_hrt.patch | 1009 +
 ...-calls-to-schedule_timeout_interruptible-.patch | 311 +
 ...-calls-to-schedule_timeout_uninterruptibl.patch | 160 +
 ...rtimer-overlay-when-pm_freezing-since-som.patch | 69 +
 ...ed-IRQs-optionally-the-default-which-can-.patch | 67 +
 ...efault-Hz-of-100-in-combination-with-MuQS.patch | 81 +
 .../files/4.19-0014-Swap-sucks.patch | 25 +
 .../4.19-0015-unfuck-MuQSS-on-linux-4_19_10+.patch | 14 +
 ...ata-fix-NCQ-LOG-strings-and-move-to-debug.patch | 23 +
 ...-cd93b83ad927b2c7979e0add0343ace59328b461.patch | 74 +
 .../files/4.19-bfq-sq-mq-v9r1-2K190204-rc1.patch | 18511 ++++++++++++++
 .../files/4.19-drop_ancient-and-wrong-msg.patch | 29 +
 .../files/4.19-enable_alx_wol.patch | 485 +
 .../files/4.19-linux-hardened.patch | 2732 ++
 .../files/4.19-mute-pps_state_mismatch.patch | 16 +
 .../files/4.19-nouveau-pascal-backlight.patch | 11 +
 ..._aux_transfer_native-no-ratelimited_debug.patch | 13 +
 .../files/4.19-redcore-lts-amd64.config | 9348 +++++++
 ....19-revert-patches-causing-instant-reboot.patch | 314 +
 .../files/4.19-uksm-linux-hardened.patch | 6941 +++++
 ...h10k-activate-user-space-firmware-loading.patch | 13 -
 ...introduce-NUMA-identity-node-sched-domain.patch | 46 -
 .../files/k10temp-add-ZEN-support.patch | 177 -
 .../files/linux-hardened.patch | 2868 ---
 .../files/mute-pps_state_mismatch.patch | 16 -
 .../files/redcore-lts-amd64.config | 9106 -------
 .../restore-SD_PREFER_SIBLING-on-MC-domains.patch | 12 -
 .../files/uksm-linux-hardened.patch | 6919 ----
 81 files changed, 116533 insertions(+), 61018 deletions(-)
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/0001-BFQ-v8r12-20171108.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/0001-MuQSS-version-0.162-CPU-scheduler-linux-hardened.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/0002-BFQ-v8r12-20180404.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/0002-Make-preemptible-kernel-default.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/0003-Expose-vmsplit-for-our-poor-32-bit-users.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/0004-Create-highres-timeout-variants-of-schedule_timeout-.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/0005-Special-case-calls-of-schedule_timeout-1-to-use-the-.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/0006-Convert-msleep-to-use-hrtimers-when-active.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/0007-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/0008-Replace-all-calls-to-schedule_timeout_interruptible-.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/0009-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/0010-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/0011-Make-hrtimer-granularity-and-minimum-hrtimeout-confi.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/0012-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/0013-Make-threaded-IRQs-optionally-the-default-which-can-.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/0014-Swap-sucks.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/0015-MuQSS.c-needs-irq_regs.h-to-use-get_irq_regs.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/0016-unfuck-MuQSS-on-linux-4_14_15+.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/0017-unfuck-MuQSS-on-linux-4_14_75+.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-0001-BFQ-v8r12-20171108.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-0001-MuQSS-version-0.162-CPU-scheduler-linux-hardened.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-0002-BFQ-v8r12-20180404.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-0002-Make-preemptible-kernel-default.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-0003-Expose-vmsplit-for-our-poor-32-bit-users.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-0004-Create-highres-timeout-variants-of-schedule_timeout-.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-0005-Special-case-calls-of-schedule_timeout-1-to-use-the-.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-0006-Convert-msleep-to-use-hrtimers-when-active.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-0007-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-0008-Replace-all-calls-to-schedule_timeout_interruptible-.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-0009-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-0010-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-0011-Make-hrtimer-granularity-and-minimum-hrtimeout-confi.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-0012-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-0013-Make-threaded-IRQs-optionally-the-default-which-can-.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-0014-Swap-sucks.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-0015-MuQSS.c-needs-irq_regs.h-to-use-get_irq_regs.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-0016-unfuck-MuQSS-on-linux-4_14_15+.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-0017-unfuck-MuQSS-on-linux-4_14_75+.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-Revert-ath10k-activate-user-space-firmware-loading.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-introduce-NUMA-identity-node-sched-domain.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-k10temp-add-ZEN-support.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-linux-hardened.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-mute-pps_state_mismatch.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-redcore-lts-amd64.config
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-restore-SD_PREFER_SIBLING-on-MC-domains.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.14-uksm-linux-hardened.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-0001-MultiQueue-Skiplist-Scheduler-version-v0.180-linux-hardened.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-0002-Fix-Werror-build-failure-in-tools.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-0003-Make-preemptible-kernel-default.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-0004-Expose-vmsplit-for-our-poor-32-bit-users.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-0005-Create-highres-timeout-variants-of-schedule_timeout-.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-0006-Special-case-calls-of-schedule_timeout-1-to-use-the-.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-0007-Convert-msleep-to-use-hrtimers-when-active.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-0008-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-0009-Replace-all-calls-to-schedule_timeout_interruptible-.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-0010-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-0011-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-0012-Make-threaded-IRQs-optionally-the-default-which-can-.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-0013-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-0014-Swap-sucks.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-0015-unfuck-MuQSS-on-linux-4_19_10+.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-ata-fix-NCQ-LOG-strings-and-move-to-debug.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-ath10k-drop-WARN_ON-added-in-cd93b83ad927b2c7979e0add0343ace59328b461.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-bfq-sq-mq-v9r1-2K190204-rc1.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-drop_ancient-and-wrong-msg.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-enable_alx_wol.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-linux-hardened.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-mute-pps_state_mismatch.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-nouveau-pascal-backlight.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-radeon_dp_aux_transfer_native-no-ratelimited_debug.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-redcore-lts-amd64.config
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-revert-patches-causing-instant-reboot.patch
 create mode 100644 sys-kernel/linux-sources-redcore-lts/files/4.19-uksm-linux-hardened.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/Revert-ath10k-activate-user-space-firmware-loading.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/introduce-NUMA-identity-node-sched-domain.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/k10temp-add-ZEN-support.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/linux-hardened.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/mute-pps_state_mismatch.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/redcore-lts-amd64.config
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/restore-SD_PREFER_SIBLING-on-MC-domains.patch
 delete mode 100644 sys-kernel/linux-sources-redcore-lts/files/uksm-linux-hardened.patch
diff --git a/sys-kernel/linux-sources-redcore-lts/files/0001-BFQ-v8r12-20171108.patch b/sys-kernel/linux-sources-redcore-lts/files/0001-BFQ-v8r12-20171108.patch deleted file mode 100644 index db7d064b..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/0001-BFQ-v8r12-20171108.patch +++ /dev/null @@ -1,25199 +0,0 @@ -From c21f53f17430230dab50df29b8ea1b71f99d09d6 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Tue, 7 Apr 2015 13:39:12 +0200 -Subject: [PATCH 01/51] Add BFQ-v8r12 - -This commit is the result of the following operations. - -1. The squash of all the commits between "block: cgroups, kconfig, -build bits for BFQ-v7r11-4.5.0" and BFQ-v8r12 in the branch -bfq-mq-v8-v4.11 - -2. The renaming of two files (block/bfq-cgroup.c -> -block/bfq-cgroup-included.c and block/bfq-iosched.c -> -block/bfq-sq-iosched.c) and of one option (CONFIG_BFQ_GROUP_IOSCHED -> -CONFIG_BFQ_SQ_GROUP_IOSCHED), to avoid name clashes. These name -clashes are due to the presence of bfq in mainline from 4.12. - -3. The modification of block/Makefile and block/Kconfig.iosched to -comply with the above renaming. - -Signed-off-by: Mauro Andreolini -Signed-off-by: Arianna Avanzini -Signed-off-by: Linus Walleij -Signed-off-by: Paolo Valente ---- - Makefile | 2 +- - block/Kconfig.iosched | 31 + - block/bfq-cgroup-included.c | 1190 ++++++++++ - block/bfq-ioc.c | 36 + - block/bfq-sched.c | 2002 ++++++++++++++++ - block/bfq-sq-iosched.c | 5379 +++++++++++++++++++++++++++++++++++++++++++ - block/bfq.h | 948 ++++++++ - include/linux/blkdev.h | 2 +- - 9 files changed, 9589 insertions(+), 2 deletions(-) - create mode 100644 block/bfq-cgroup-included.c - create mode 100644 block/bfq-ioc.c - create mode 100644 block/bfq-sched.c - create mode 100644 block/bfq-sq-iosched.c - create mode 100644 block/bfq.h - -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched -index a4a8914bf7a4..9e3f4c2f7390 100644 ---- a/block/Kconfig.iosched -+++ b/block/Kconfig.iosched -@@ -40,6 +40,26 @@ config CFQ_GROUP_IOSCHED - ---help--- - Enable group IO scheduling in CFQ. - -+config IOSCHED_BFQ_SQ -+ tristate "BFQ-SQ I/O scheduler" -+ default n -+ ---help--- -+ The BFQ-SQ I/O scheduler (for legacy blk: SQ stands for -+ SingleQueue) distributes bandwidth among all processes -+ according to their weights, regardless of the device -+ parameters and with any workload. It also guarantees a low -+ latency to interactive and soft real-time applications. -+ Details in Documentation/block/bfq-iosched.txt -+ -+config BFQ_SQ_GROUP_IOSCHED -+ bool "BFQ-SQ hierarchical scheduling support" -+ depends on IOSCHED_BFQ_SQ && BLK_CGROUP -+ default n -+ ---help--- -+ -+ Enable hierarchical scheduling in BFQ-SQ, using the blkio -+ (cgroups-v1) or io (cgroups-v2) controller. -+ - choice - - prompt "Default I/O scheduler" -@@ -54,6 +74,16 @@ choice - config DEFAULT_CFQ - bool "CFQ" if IOSCHED_CFQ=y - -+ config DEFAULT_BFQ_SQ -+ bool "BFQ-SQ" if IOSCHED_BFQ_SQ=y -+ help -+ Selects BFQ-SQ as the default I/O scheduler which will be -+ used by default for all block devices. -+ The BFQ-SQ I/O scheduler aims at distributing the bandwidth -+ as desired, independently of the disk parameters and with -+ any workload. It also tries to guarantee low latency to -+ interactive and soft real-time applications. 
-+ - config DEFAULT_NOOP - bool "No-op" - -@@ -63,6 +93,7 @@ config DEFAULT_IOSCHED - string - default "deadline" if DEFAULT_DEADLINE - default "cfq" if DEFAULT_CFQ -+ default "bfq-sq" if DEFAULT_BFQ_SQ - default "noop" if DEFAULT_NOOP - - config MQ_IOSCHED_DEADLINE -diff --git a/block/Makefile b/block/Makefile -index 6a56303b9925..59026b425791 100644 ---- a/block/Makefile -+++ b/block/Makefile -@@ -24,6 +24,7 @@ obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o - obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o - bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o - obj-$(CONFIG_IOSCHED_BFQ) += bfq.o -+obj-$(CONFIG_IOSCHED_BFQ_SQ) += bfq-sq-iosched.o - - obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o - obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -new file mode 100644 -index 000000000000..af7c216a3540 ---- /dev/null -+++ b/block/bfq-cgroup-included.c -@@ -0,0 +1,1190 @@ -+/* -+ * BFQ: CGROUPS support. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe -+ * -+ * Copyright (C) 2008 Fabio Checconi -+ * Paolo Valente -+ * -+ * Copyright (C) 2015 Paolo Valente -+ * -+ * Copyright (C) 2016 Paolo Valente -+ * -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ -+ * file. -+ */ -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ -+/* bfqg stats flags */ -+enum bfqg_stats_flags { -+ BFQG_stats_waiting = 0, -+ BFQG_stats_idling, -+ BFQG_stats_empty, -+}; -+ -+#define BFQG_FLAG_FNS(name) \ -+static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ -+{ \ -+ stats->flags |= (1 << BFQG_stats_##name); \ -+} \ -+static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ -+{ \ -+ stats->flags &= ~(1 << BFQG_stats_##name); \ -+} \ -+static int bfqg_stats_##name(struct bfqg_stats *stats) \ -+{ \ -+ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ -+} \ -+ -+BFQG_FLAG_FNS(waiting) -+BFQG_FLAG_FNS(idling) -+BFQG_FLAG_FNS(empty) -+#undef BFQG_FLAG_FNS -+ -+/* This should be called with the queue_lock held. */ -+static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) -+{ -+ unsigned long long now; -+ -+ if (!bfqg_stats_waiting(stats)) -+ return; -+ -+ now = sched_clock(); -+ if (time_after64(now, stats->start_group_wait_time)) -+ blkg_stat_add(&stats->group_wait_time, -+ now - stats->start_group_wait_time); -+ bfqg_stats_clear_waiting(stats); -+} -+ -+/* This should be called with the queue_lock held. */ -+static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, -+ struct bfq_group *curr_bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ if (bfqg_stats_waiting(stats)) -+ return; -+ if (bfqg == curr_bfqg) -+ return; -+ stats->start_group_wait_time = sched_clock(); -+ bfqg_stats_mark_waiting(stats); -+} -+ -+/* This should be called with the queue_lock held. */ -+static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) -+{ -+ unsigned long long now; -+ -+ if (!bfqg_stats_empty(stats)) -+ return; -+ -+ now = sched_clock(); -+ if (time_after64(now, stats->start_empty_time)) -+ blkg_stat_add(&stats->empty_time, -+ now - stats->start_empty_time); -+ bfqg_stats_clear_empty(stats); -+} -+ -+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg) -+{ -+ blkg_stat_add(&bfqg->stats.dequeue, 1); -+} -+ -+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ if (blkg_rwstat_total(&stats->queued)) -+ return; -+ -+ /* -+ * group is already marked empty. 
This can happen if bfqq got new -+ * request in parent group and moved to this group while being added -+ * to service tree. Just ignore the event and move on. -+ */ -+ if (bfqg_stats_empty(stats)) -+ return; -+ -+ stats->start_empty_time = sched_clock(); -+ bfqg_stats_mark_empty(stats); -+} -+ -+static void bfqg_stats_update_idle_time(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ if (bfqg_stats_idling(stats)) { -+ unsigned long long now = sched_clock(); -+ -+ if (time_after64(now, stats->start_idle_time)) -+ blkg_stat_add(&stats->idle_time, -+ now - stats->start_idle_time); -+ bfqg_stats_clear_idling(stats); -+ } -+} -+ -+static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ stats->start_idle_time = sched_clock(); -+ bfqg_stats_mark_idling(stats); -+} -+ -+static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ blkg_stat_add(&stats->avg_queue_size_sum, -+ blkg_rwstat_total(&stats->queued)); -+ blkg_stat_add(&stats->avg_queue_size_samples, 1); -+ bfqg_stats_update_group_wait_time(stats); -+} -+ -+static struct blkcg_policy blkcg_policy_bfq; -+ -+/* -+ * blk-cgroup policy-related handlers -+ * The following functions help in converting between blk-cgroup -+ * internal structures and BFQ-specific structures. -+ */ -+ -+static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) -+{ -+ return pd ? container_of(pd, struct bfq_group, pd) : NULL; -+} -+ -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) -+{ -+ return pd_to_blkg(&bfqg->pd); -+} -+ -+static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) -+{ -+ struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); -+ -+ return pd_to_bfqg(pd); -+} -+ -+/* -+ * bfq_group handlers -+ * The following functions help in navigating the bfq_group hierarchy -+ * by allowing to find the parent of a bfq_group or the bfq_group -+ * associated to a bfq_queue. -+ */ -+ -+static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) -+{ -+ struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; -+ -+ return pblkg ? blkg_to_bfqg(pblkg) : NULL; -+} -+ -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *group_entity = bfqq->entity.parent; -+ -+ return group_entity ? container_of(group_entity, struct bfq_group, -+ entity) : -+ bfqq->bfqd->root_group; -+} -+ -+/* -+ * The following two functions handle get and put of a bfq_group by -+ * wrapping the related blk-cgroup hooks. 
-+ */ -+ -+static void bfqg_get(struct bfq_group *bfqg) -+{ -+ return blkg_get(bfqg_to_blkg(bfqg)); -+} -+ -+static void bfqg_put(struct bfq_group *bfqg) -+{ -+ return blkg_put(bfqg_to_blkg(bfqg)); -+} -+ -+static void bfqg_stats_update_io_add(struct bfq_group *bfqg, -+ struct bfq_queue *bfqq, -+ unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.queued, op, 1); -+ bfqg_stats_end_empty_time(&bfqg->stats); -+ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) -+ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); -+} -+ -+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.queued, op, -1); -+} -+ -+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.merged, op, 1); -+} -+ -+static void bfqg_stats_update_completion(struct bfq_group *bfqg, -+ uint64_t start_time, uint64_t io_start_time, -+ unsigned int op) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ unsigned long long now = sched_clock(); -+ -+ if (time_after64(now, io_start_time)) -+ blkg_rwstat_add(&stats->service_time, op, -+ now - io_start_time); -+ if (time_after64(io_start_time, start_time)) -+ blkg_rwstat_add(&stats->wait_time, op, -+ io_start_time - start_time); -+} -+ -+/* @stats = 0 */ -+static void bfqg_stats_reset(struct bfqg_stats *stats) -+{ -+ /* queued stats shouldn't be cleared */ -+ blkg_rwstat_reset(&stats->merged); -+ blkg_rwstat_reset(&stats->service_time); -+ blkg_rwstat_reset(&stats->wait_time); -+ blkg_stat_reset(&stats->time); -+ blkg_stat_reset(&stats->avg_queue_size_sum); -+ blkg_stat_reset(&stats->avg_queue_size_samples); -+ blkg_stat_reset(&stats->dequeue); -+ blkg_stat_reset(&stats->group_wait_time); -+ blkg_stat_reset(&stats->idle_time); -+ blkg_stat_reset(&stats->empty_time); -+} -+ -+/* @to += @from */ -+static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) -+{ -+ if (!to || !from) -+ return; -+ -+ /* queued stats shouldn't be cleared */ -+ blkg_rwstat_add_aux(&to->merged, &from->merged); -+ blkg_rwstat_add_aux(&to->service_time, &from->service_time); -+ blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); -+ blkg_stat_add_aux(&from->time, &from->time); -+ blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); -+ blkg_stat_add_aux(&to->avg_queue_size_samples, -+ &from->avg_queue_size_samples); -+ blkg_stat_add_aux(&to->dequeue, &from->dequeue); -+ blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); -+ blkg_stat_add_aux(&to->idle_time, &from->idle_time); -+ blkg_stat_add_aux(&to->empty_time, &from->empty_time); -+} -+ -+/* -+ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors' -+ * recursive stats can still account for the amount used by this bfqg after -+ * it's gone. 
-+ */ -+static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) -+{ -+ struct bfq_group *parent; -+ -+ if (!bfqg) /* root_group */ -+ return; -+ -+ parent = bfqg_parent(bfqg); -+ -+ lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); -+ -+ if (unlikely(!parent)) -+ return; -+ -+ bfqg_stats_add_aux(&parent->stats, &bfqg->stats); -+ bfqg_stats_reset(&bfqg->stats); -+} -+ -+static void bfq_init_entity(struct bfq_entity *entity, -+ struct bfq_group *bfqg) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ entity->weight = entity->new_weight; -+ entity->orig_weight = entity->new_weight; -+ if (bfqq) { -+ bfqq->ioprio = bfqq->new_ioprio; -+ bfqq->ioprio_class = bfqq->new_ioprio_class; -+ bfqg_get(bfqg); -+ } -+ entity->parent = bfqg->my_entity; /* NULL for root group */ -+ entity->sched_data = &bfqg->sched_data; -+} -+ -+static void bfqg_stats_exit(struct bfqg_stats *stats) -+{ -+ blkg_rwstat_exit(&stats->merged); -+ blkg_rwstat_exit(&stats->service_time); -+ blkg_rwstat_exit(&stats->wait_time); -+ blkg_rwstat_exit(&stats->queued); -+ blkg_stat_exit(&stats->time); -+ blkg_stat_exit(&stats->avg_queue_size_sum); -+ blkg_stat_exit(&stats->avg_queue_size_samples); -+ blkg_stat_exit(&stats->dequeue); -+ blkg_stat_exit(&stats->group_wait_time); -+ blkg_stat_exit(&stats->idle_time); -+ blkg_stat_exit(&stats->empty_time); -+} -+ -+static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) -+{ -+ if (blkg_rwstat_init(&stats->merged, gfp) || -+ blkg_rwstat_init(&stats->service_time, gfp) || -+ blkg_rwstat_init(&stats->wait_time, gfp) || -+ blkg_rwstat_init(&stats->queued, gfp) || -+ blkg_stat_init(&stats->time, gfp) || -+ blkg_stat_init(&stats->avg_queue_size_sum, gfp) || -+ blkg_stat_init(&stats->avg_queue_size_samples, gfp) || -+ blkg_stat_init(&stats->dequeue, gfp) || -+ blkg_stat_init(&stats->group_wait_time, gfp) || -+ blkg_stat_init(&stats->idle_time, gfp) || -+ blkg_stat_init(&stats->empty_time, gfp)) { -+ bfqg_stats_exit(stats); -+ return -ENOMEM; -+ } -+ -+ return 0; -+} -+ -+static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) -+{ -+ return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; -+} -+ -+static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) -+{ -+ return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); -+} -+ -+static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) -+{ -+ struct bfq_group_data *bgd; -+ -+ bgd = kzalloc(sizeof(*bgd), gfp); -+ if (!bgd) -+ return NULL; -+ return &bgd->pd; -+} -+ -+static void bfq_cpd_init(struct blkcg_policy_data *cpd) -+{ -+ struct bfq_group_data *d = cpd_to_bfqgd(cpd); -+ -+ d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? 
-+ CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; -+} -+ -+static void bfq_cpd_free(struct blkcg_policy_data *cpd) -+{ -+ kfree(cpd_to_bfqgd(cpd)); -+} -+ -+static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) -+{ -+ struct bfq_group *bfqg; -+ -+ bfqg = kzalloc_node(sizeof(*bfqg), gfp, node); -+ if (!bfqg) -+ return NULL; -+ -+ if (bfqg_stats_init(&bfqg->stats, gfp)) { -+ kfree(bfqg); -+ return NULL; -+ } -+ -+ return &bfqg->pd; -+} -+ -+static void bfq_pd_init(struct blkg_policy_data *pd) -+{ -+ struct blkcg_gq *blkg; -+ struct bfq_group *bfqg; -+ struct bfq_data *bfqd; -+ struct bfq_entity *entity; -+ struct bfq_group_data *d; -+ -+ blkg = pd_to_blkg(pd); -+ BUG_ON(!blkg); -+ bfqg = blkg_to_bfqg(blkg); -+ bfqd = blkg->q->elevator->elevator_data; -+ entity = &bfqg->entity; -+ d = blkcg_to_bfqgd(blkg->blkcg); -+ -+ entity->orig_weight = entity->weight = entity->new_weight = d->weight; -+ entity->my_sched_data = &bfqg->sched_data; -+ bfqg->my_entity = entity; /* -+ * the root_group's will be set to NULL -+ * in bfq_init_queue() -+ */ -+ bfqg->bfqd = bfqd; -+ bfqg->active_entities = 0; -+ bfqg->rq_pos_tree = RB_ROOT; -+} -+ -+static void bfq_pd_free(struct blkg_policy_data *pd) -+{ -+ struct bfq_group *bfqg = pd_to_bfqg(pd); -+ -+ bfqg_stats_exit(&bfqg->stats); -+ return kfree(bfqg); -+} -+ -+static void bfq_pd_reset_stats(struct blkg_policy_data *pd) -+{ -+ struct bfq_group *bfqg = pd_to_bfqg(pd); -+ -+ bfqg_stats_reset(&bfqg->stats); -+} -+ -+static void bfq_group_set_parent(struct bfq_group *bfqg, -+ struct bfq_group *parent) -+{ -+ struct bfq_entity *entity; -+ -+ BUG_ON(!parent); -+ BUG_ON(!bfqg); -+ BUG_ON(bfqg == parent); -+ -+ entity = &bfqg->entity; -+ entity->parent = parent->my_entity; -+ entity->sched_data = &parent->sched_data; -+} -+ -+static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, -+ struct blkcg *blkcg) -+{ -+ struct blkcg_gq *blkg; -+ -+ blkg = blkg_lookup(blkcg, bfqd->queue); -+ if (likely(blkg)) -+ return blkg_to_bfqg(blkg); -+ return NULL; -+} -+ -+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, -+ struct blkcg *blkcg) -+{ -+ struct bfq_group *bfqg, *parent; -+ struct bfq_entity *entity; -+ -+ assert_spin_locked(bfqd->queue->queue_lock); -+ -+ bfqg = bfq_lookup_bfqg(bfqd, blkcg); -+ -+ if (unlikely(!bfqg)) -+ return NULL; -+ -+ /* -+ * Update chain of bfq_groups as we might be handling a leaf group -+ * which, along with some of its relatives, has not been hooked yet -+ * to the private hierarchy of BFQ. -+ */ -+ entity = &bfqg->entity; -+ for_each_entity(entity) { -+ bfqg = container_of(entity, struct bfq_group, entity); -+ BUG_ON(!bfqg); -+ if (bfqg != bfqd->root_group) { -+ parent = bfqg_parent(bfqg); -+ if (!parent) -+ parent = bfqd->root_group; -+ BUG_ON(!parent); -+ bfq_group_set_parent(bfqg, parent); -+ } -+ } -+ -+ return bfqg; -+} -+ -+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq); -+ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason); -+ -+/** -+ * bfq_bfqq_move - migrate @bfqq to @bfqg. -+ * @bfqd: queue descriptor. -+ * @bfqq: the queue to move. -+ * @bfqg: the group to move to. -+ * -+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating -+ * it on the new one. Avoid putting the entity on the old group idle tree. -+ * -+ * Must be called under the queue lock; the cgroup owning @bfqg must -+ * not disappear (by now this just means that we are called under -+ * rcu_read_lock()). 
-+ */ -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_group *bfqg) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list)); -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st); -+ BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) -+ && entity->on_st && -+ bfqq != bfqd->in_service_queue); -+ BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue); -+ -+ /* If bfqq is empty, then bfq_bfqq_expire also invokes -+ * bfq_del_bfqq_busy, thereby removing bfqq and its entity -+ * from data structures related to current group. Otherwise we -+ * need to remove bfqq explicitly with bfq_deactivate_bfqq, as -+ * we do below. -+ */ -+ if (bfqq == bfqd->in_service_queue) -+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, -+ false, BFQ_BFQQ_PREEMPTED); -+ -+ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) -+ && &bfq_entity_service_tree(entity)->idle != -+ entity->tree); -+ -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); -+ -+ if (bfq_bfqq_busy(bfqq)) -+ bfq_deactivate_bfqq(bfqd, bfqq, false, false); -+ else if (entity->on_st) { -+ BUG_ON(&bfq_entity_service_tree(entity)->idle != -+ entity->tree); -+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); -+ } -+ bfqg_put(bfqq_group(bfqq)); -+ -+ /* -+ * Here we use a reference to bfqg. We don't need a refcounter -+ * as the cgroup reference will not be dropped, so that its -+ * destroy() callback will not be invoked. -+ */ -+ entity->parent = bfqg->my_entity; -+ entity->sched_data = &bfqg->sched_data; -+ bfqg_get(bfqg); -+ -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); -+ if (bfq_bfqq_busy(bfqq)) { -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ bfq_activate_bfqq(bfqd, bfqq); -+ } -+ -+ if (!bfqd->in_service_queue && !bfqd->rq_in_driver) -+ bfq_schedule_dispatch(bfqd); -+ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) -+ && &bfq_entity_service_tree(entity)->idle != -+ entity->tree); -+} -+ -+/** -+ * __bfq_bic_change_cgroup - move @bic to @cgroup. -+ * @bfqd: the queue descriptor. -+ * @bic: the bic to move. -+ * @blkcg: the blk-cgroup to move to. -+ * -+ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller -+ * has to make sure that the reference to cgroup is valid across the call. -+ * -+ * NOTE: an alternative approach might have been to store the current -+ * cgroup in bfqq and getting a reference to it, reducing the lookup -+ * time here, at the price of slightly more complex code. 
-+ */ -+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, -+ struct bfq_io_cq *bic, -+ struct blkcg *blkcg) -+{ -+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); -+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); -+ struct bfq_group *bfqg; -+ struct bfq_entity *entity; -+ -+ lockdep_assert_held(bfqd->queue->queue_lock); -+ -+ bfqg = bfq_find_set_group(bfqd, blkcg); -+ -+ if (unlikely(!bfqg)) -+ bfqg = bfqd->root_group; -+ -+ if (async_bfqq) { -+ entity = &async_bfqq->entity; -+ -+ if (entity->sched_data != &bfqg->sched_data) { -+ bic_set_bfqq(bic, NULL, 0); -+ bfq_log_bfqq(bfqd, async_bfqq, -+ "bic_change_group: %p %d", -+ async_bfqq, -+ async_bfqq->ref); -+ bfq_put_queue(async_bfqq); -+ } -+ } -+ -+ if (sync_bfqq) { -+ entity = &sync_bfqq->entity; -+ if (entity->sched_data != &bfqg->sched_data) -+ bfq_bfqq_move(bfqd, sync_bfqq, bfqg); -+ } -+ -+ return bfqg; -+} -+ -+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) -+{ -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct bfq_group *bfqg = NULL; -+ uint64_t serial_nr; -+ -+ rcu_read_lock(); -+ serial_nr = bio_blkcg(bio)->css.serial_nr; -+ -+ /* -+ * Check whether blkcg has changed. The condition may trigger -+ * spuriously on a newly created cic but there's no harm. -+ */ -+ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) -+ goto out; -+ -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); -+ bic->blkcg_serial_nr = serial_nr; -+out: -+ rcu_read_unlock(); -+} -+ -+/** -+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. -+ * @st: the service tree being flushed. -+ */ -+static void bfq_flush_idle_tree(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *entity = st->first_idle; -+ -+ for (; entity ; entity = st->first_idle) -+ __bfq_deactivate_entity(entity, false); -+} -+ -+/** -+ * bfq_reparent_leaf_entity - move leaf entity to the root_group. -+ * @bfqd: the device data structure with the root group. -+ * @entity: the entity to move. -+ */ -+static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ BUG_ON(!bfqq); -+ bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); -+} -+ -+/** -+ * bfq_reparent_active_entities - move to the root group all active -+ * entities. -+ * @bfqd: the device data structure with the root group. -+ * @bfqg: the group to move from. -+ * @st: the service tree with the entities. -+ * -+ * Needs queue_lock to be taken and reference to be valid over the call. -+ */ -+static void bfq_reparent_active_entities(struct bfq_data *bfqd, -+ struct bfq_group *bfqg, -+ struct bfq_service_tree *st) -+{ -+ struct rb_root *active = &st->active; -+ struct bfq_entity *entity = NULL; -+ -+ if (!RB_EMPTY_ROOT(&st->active)) -+ entity = bfq_entity_of(rb_first(active)); -+ -+ for (; entity ; entity = bfq_entity_of(rb_first(active))) -+ bfq_reparent_leaf_entity(bfqd, entity); -+ -+ if (bfqg->sched_data.in_service_entity) -+ bfq_reparent_leaf_entity(bfqd, -+ bfqg->sched_data.in_service_entity); -+} -+ -+/** -+ * bfq_pd_offline - deactivate the entity associated with @pd, -+ * and reparent its children entities. -+ * @pd: descriptor of the policy going offline. 
-+ * -+ * blkio already grabs the queue_lock for us, so no need to use -+ * RCU-based magic -+ */ -+static void bfq_pd_offline(struct blkg_policy_data *pd) -+{ -+ struct bfq_service_tree *st; -+ struct bfq_group *bfqg; -+ struct bfq_data *bfqd; -+ struct bfq_entity *entity; -+ int i; -+ -+ BUG_ON(!pd); -+ bfqg = pd_to_bfqg(pd); -+ BUG_ON(!bfqg); -+ bfqd = bfqg->bfqd; -+ BUG_ON(bfqd && !bfqd->root_group); -+ -+ entity = bfqg->my_entity; -+ -+ if (!entity) /* root group */ -+ return; -+ -+ /* -+ * Empty all service_trees belonging to this group before -+ * deactivating the group itself. -+ */ -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { -+ BUG_ON(!bfqg->sched_data.service_tree); -+ st = bfqg->sched_data.service_tree + i; -+ /* -+ * The idle tree may still contain bfq_queues belonging -+ * to exited task because they never migrated to a different -+ * cgroup from the one being destroyed now. No one else -+ * can access them so it's safe to act without any lock. -+ */ -+ bfq_flush_idle_tree(st); -+ -+ /* -+ * It may happen that some queues are still active -+ * (busy) upon group destruction (if the corresponding -+ * processes have been forced to terminate). We move -+ * all the leaf entities corresponding to these queues -+ * to the root_group. -+ * Also, it may happen that the group has an entity -+ * in service, which is disconnected from the active -+ * tree: it must be moved, too. -+ * There is no need to put the sync queues, as the -+ * scheduler has taken no reference. -+ */ -+ bfq_reparent_active_entities(bfqd, bfqg, st); -+ BUG_ON(!RB_EMPTY_ROOT(&st->active)); -+ BUG_ON(!RB_EMPTY_ROOT(&st->idle)); -+ } -+ BUG_ON(bfqg->sched_data.next_in_service); -+ BUG_ON(bfqg->sched_data.in_service_entity); -+ -+ __bfq_deactivate_entity(entity, false); -+ bfq_put_async_queues(bfqd, bfqg); -+ -+ /* -+ * @blkg is going offline and will be ignored by -+ * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so -+ * that they don't get lost. If IOs complete after this point, the -+ * stats for them will be lost. Oh well... -+ */ -+ bfqg_stats_xfer_dead(bfqg); -+} -+ -+static void bfq_end_wr_async(struct bfq_data *bfqd) -+{ -+ struct blkcg_gq *blkg; -+ -+ list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { -+ struct bfq_group *bfqg = blkg_to_bfqg(blkg); -+ BUG_ON(!bfqg); -+ -+ bfq_end_wr_async_queues(bfqd, bfqg); -+ } -+ bfq_end_wr_async_queues(bfqd, bfqd->root_group); -+} -+ -+static int bfq_io_show_weight(struct seq_file *sf, void *v) -+{ -+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); -+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); -+ unsigned int val = 0; -+ -+ if (bfqgd) -+ val = bfqgd->weight; -+ -+ seq_printf(sf, "%u\n", val); -+ -+ return 0; -+} -+ -+static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, -+ struct cftype *cftype, -+ u64 val) -+{ -+ struct blkcg *blkcg = css_to_blkcg(css); -+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); -+ struct blkcg_gq *blkg; -+ int ret = -ERANGE; -+ -+ if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) -+ return ret; -+ -+ ret = 0; -+ spin_lock_irq(&blkcg->lock); -+ bfqgd->weight = (unsigned short)val; -+ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { -+ struct bfq_group *bfqg = blkg_to_bfqg(blkg); -+ -+ if (!bfqg) -+ continue; -+ /* -+ * Setting the prio_changed flag of the entity -+ * to 1 with new_weight == weight would re-set -+ * the value of the weight to its ioprio mapping. -+ * Set the flag only if necessary. 
-+ */ -+ if ((unsigned short)val != bfqg->entity.new_weight) { -+ bfqg->entity.new_weight = (unsigned short)val; -+ /* -+ * Make sure that the above new value has been -+ * stored in bfqg->entity.new_weight before -+ * setting the prio_changed flag. In fact, -+ * this flag may be read asynchronously (in -+ * critical sections protected by a different -+ * lock than that held here), and finding this -+ * flag set may cause the execution of the code -+ * for updating parameters whose value may -+ * depend also on bfqg->entity.new_weight (in -+ * __bfq_entity_update_weight_prio). -+ * This barrier makes sure that the new value -+ * of bfqg->entity.new_weight is correctly -+ * seen in that code. -+ */ -+ smp_wmb(); -+ bfqg->entity.prio_changed = 1; -+ } -+ } -+ spin_unlock_irq(&blkcg->lock); -+ -+ return ret; -+} -+ -+static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, -+ char *buf, size_t nbytes, -+ loff_t off) -+{ -+ u64 weight; -+ /* First unsigned long found in the file is used */ -+ int ret = kstrtoull(strim(buf), 0, &weight); -+ -+ if (ret) -+ return ret; -+ -+ return bfq_io_set_weight_legacy(of_css(of), NULL, weight); -+} -+ -+static int bfqg_print_stat(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, -+ &blkcg_policy_bfq, seq_cft(sf)->private, false); -+ return 0; -+} -+ -+static int bfqg_print_rwstat(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, -+ &blkcg_policy_bfq, seq_cft(sf)->private, true); -+ return 0; -+} -+ -+static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), -+ &blkcg_policy_bfq, off); -+ return __blkg_prfill_u64(sf, pd, sum); -+} -+ -+static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), -+ &blkcg_policy_bfq, -+ off); -+ return __blkg_prfill_rwstat(sf, pd, &sum); -+} -+ -+static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_stat_recursive, &blkcg_policy_bfq, -+ seq_cft(sf)->private, false); -+ return 0; -+} -+ -+static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, -+ seq_cft(sf)->private, true); -+ return 0; -+} -+ -+static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, -+ int off) -+{ -+ u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); -+ -+ return __blkg_prfill_u64(sf, pd, sum >> 9); -+} -+ -+static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); -+ return 0; -+} -+ -+static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, -+ offsetof(struct blkcg_gq, stat_bytes)); -+ u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + -+ atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); -+ -+ return __blkg_prfill_u64(sf, pd, sum >> 9); -+} -+ -+static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, -+ false); -+ return 0; -+} -+ -+ 
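/*
 * Editorial aside, not part of the quoted patch: bfq_io_set_weight_legacy()
 * above stores bfqg->entity.new_weight *before* setting the prio_changed
 * flag, with an smp_wmb() in between, so that any code observing the flag
 * also observes the new weight. A minimal, hedged sketch of that
 * publish/consume ordering pattern; all "example_*" names are hypothetical
 * and only illustrate the idiom, not BFQ's actual data layout:
 */
static unsigned short example_new_weight;	/* payload, stored first */
static int example_prio_changed;		/* flag, stored last */

static void example_publish_weight(unsigned short w)
{
	example_new_weight = w;
	smp_wmb();	/* order the payload store before the flag store */
	example_prio_changed = 1;
}

static unsigned short example_read_weight(unsigned short old)
{
	if (example_prio_changed) {
		smp_rmb();	/* pairs with the smp_wmb() in the writer */
		return example_new_weight;
	}
	return old;	/* flag not seen: keep the previous weight */
}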
-+static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ struct bfq_group *bfqg = pd_to_bfqg(pd); -+ u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); -+ u64 v = 0; -+ -+ if (samples) { -+ v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); -+ v = div64_u64(v, samples); -+ } -+ __blkg_prfill_u64(sf, pd, v); -+ return 0; -+} -+ -+/* print avg_queue_size */ -+static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, -+ 0, false); -+ return 0; -+} -+ -+static struct bfq_group * -+bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -+{ -+ int ret; -+ -+ ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); -+ if (ret) -+ return NULL; -+ -+ return blkg_to_bfqg(bfqd->queue->root_blkg); -+} -+ -+static struct cftype bfq_blkcg_legacy_files[] = { -+ { -+ .name = "bfq.weight", -+ .flags = CFTYPE_NOT_ON_ROOT, -+ .seq_show = bfq_io_show_weight, -+ .write_u64 = bfq_io_set_weight_legacy, -+ }, -+ -+ /* statistics, covers only the tasks in the bfqg */ -+ { -+ .name = "bfq.time", -+ .private = offsetof(struct bfq_group, stats.time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.sectors", -+ .seq_show = bfqg_print_stat_sectors, -+ }, -+ { -+ .name = "bfq.io_service_bytes", -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_bytes, -+ }, -+ { -+ .name = "bfq.io_serviced", -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_ios, -+ }, -+ { -+ .name = "bfq.io_service_time", -+ .private = offsetof(struct bfq_group, stats.service_time), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = "bfq.io_wait_time", -+ .private = offsetof(struct bfq_group, stats.wait_time), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = "bfq.io_merged", -+ .private = offsetof(struct bfq_group, stats.merged), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = "bfq.io_queued", -+ .private = offsetof(struct bfq_group, stats.queued), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ -+ /* the same statictics which cover the bfqg and its descendants */ -+ { -+ .name = "bfq.time_recursive", -+ .private = offsetof(struct bfq_group, stats.time), -+ .seq_show = bfqg_print_stat_recursive, -+ }, -+ { -+ .name = "bfq.sectors_recursive", -+ .seq_show = bfqg_print_stat_sectors_recursive, -+ }, -+ { -+ .name = "bfq.io_service_bytes_recursive", -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_bytes_recursive, -+ }, -+ { -+ .name = "bfq.io_serviced_recursive", -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_ios_recursive, -+ }, -+ { -+ .name = "bfq.io_service_time_recursive", -+ .private = offsetof(struct bfq_group, stats.service_time), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.io_wait_time_recursive", -+ .private = offsetof(struct bfq_group, stats.wait_time), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.io_merged_recursive", -+ .private = offsetof(struct bfq_group, stats.merged), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.io_queued_recursive", -+ .private = offsetof(struct bfq_group, stats.queued), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.avg_queue_size", -+ .seq_show = bfqg_print_avg_queue_size, -+ }, -+ { -+ .name = "bfq.group_wait_time", -+ .private = offsetof(struct bfq_group, stats.group_wait_time), -+ 
.seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.idle_time", -+ .private = offsetof(struct bfq_group, stats.idle_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.empty_time", -+ .private = offsetof(struct bfq_group, stats.empty_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.dequeue", -+ .private = offsetof(struct bfq_group, stats.dequeue), -+ .seq_show = bfqg_print_stat, -+ }, -+ { } /* terminate */ -+}; -+ -+static struct cftype bfq_blkg_files[] = { -+ { -+ .name = "bfq.weight", -+ .flags = CFTYPE_NOT_ON_ROOT, -+ .seq_show = bfq_io_show_weight, -+ .write = bfq_io_set_weight, -+ }, -+ {} /* terminate */ -+}; -+ -+#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+ -+static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, -+ struct bfq_queue *bfqq, unsigned int op) { } -+static inline void -+bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } -+static inline void -+bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } -+static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, -+ uint64_t start_time, uint64_t io_start_time, -+ unsigned int op) { } -+static inline void -+bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, -+ struct bfq_group *curr_bfqg) { } -+static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } -+static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } -+ -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_group *bfqg) {} -+ -+static void bfq_init_entity(struct bfq_entity *entity, -+ struct bfq_group *bfqg) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ entity->weight = entity->new_weight; -+ entity->orig_weight = entity->new_weight; -+ if (bfqq) { -+ bfqq->ioprio = bfqq->new_ioprio; -+ bfqq->ioprio_class = bfqq->new_ioprio_class; -+ } -+ entity->sched_data = &bfqg->sched_data; -+} -+ -+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} -+ -+static void bfq_end_wr_async(struct bfq_data *bfqd) -+{ -+ bfq_end_wr_async_queues(bfqd, bfqd->root_group); -+} -+ -+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, -+ struct blkcg *blkcg) -+{ -+ return bfqd->root_group; -+} -+ -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) -+{ -+ return bfqq->bfqd->root_group; -+} -+ -+static struct bfq_group * -+bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -+{ -+ struct bfq_group *bfqg; -+ int i; -+ -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); -+ if (!bfqg) -+ return NULL; -+ -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -+ -+ return bfqg; -+} -+#endif -diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c -new file mode 100644 -index 000000000000..fb7bb8f08b75 ---- /dev/null -+++ b/block/bfq-ioc.c -@@ -0,0 +1,36 @@ -+/* -+ * BFQ: I/O context handling. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe -+ * -+ * Copyright (C) 2008 Fabio Checconi -+ * Paolo Valente -+ * -+ * Copyright (C) 2010 Paolo Valente -+ */ -+ -+/** -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. 
-+ * @icq: the iocontext queue. -+ */ -+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) -+{ -+ /* bic->icq is the first member, %NULL will convert to %NULL */ -+ return container_of(icq, struct bfq_io_cq, icq); -+} -+ -+/** -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. -+ * @bfqd: the lookup key. -+ * @ioc: the io_context of the process doing I/O. -+ * -+ * Queue lock must be held. -+ */ -+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, -+ struct io_context *ioc) -+{ -+ if (ioc) -+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); -+ return NULL; -+} -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -new file mode 100644 -index 000000000000..ac8991bca9fa ---- /dev/null -+++ b/block/bfq-sched.c -@@ -0,0 +1,2002 @@ -+/* -+ * BFQ: Hierarchical B-WF2Q+ scheduler. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe -+ * -+ * Copyright (C) 2008 Fabio Checconi -+ * Paolo Valente -+ * -+ * Copyright (C) 2015 Paolo Valente -+ * -+ * Copyright (C) 2016 Paolo Valente -+ */ -+ -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+ -+/** -+ * bfq_gt - compare two timestamps. -+ * @a: first ts. -+ * @b: second ts. -+ * -+ * Return @a > @b, dealing with wrapping correctly. -+ */ -+static int bfq_gt(u64 a, u64 b) -+{ -+ return (s64)(a - b) > 0; -+} -+ -+static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree) -+{ -+ struct rb_node *node = tree->rb_node; -+ -+ return rb_entry(node, struct bfq_entity, rb_node); -+} -+ -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd); -+ -+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); -+ -+/** -+ * bfq_update_next_in_service - update sd->next_in_service -+ * @sd: sched_data for which to perform the update. -+ * @new_entity: if not NULL, pointer to the entity whose activation, -+ * requeueing or repositionig triggered the invocation of -+ * this function. -+ * -+ * This function is called to update sd->next_in_service, which, in -+ * its turn, may change as a consequence of the insertion or -+ * extraction of an entity into/from one of the active trees of -+ * sd. These insertions/extractions occur as a consequence of -+ * activations/deactivations of entities, with some activations being -+ * 'true' activations, and other activations being requeueings (i.e., -+ * implementing the second, requeueing phase of the mechanism used to -+ * reposition an entity in its active tree; see comments on -+ * __bfq_activate_entity and __bfq_requeue_entity for details). In -+ * both the last two activation sub-cases, new_entity points to the -+ * just activated or requeued entity. -+ * -+ * Returns true if sd->next_in_service changes in such a way that -+ * entity->parent may become the next_in_service for its parent -+ * entity. -+ */ -+static bool bfq_update_next_in_service(struct bfq_sched_data *sd, -+ struct bfq_entity *new_entity) -+{ -+ struct bfq_entity *next_in_service = sd->next_in_service; -+ struct bfq_queue *bfqq; -+ bool parent_sched_may_change = false; -+ -+ /* -+ * If this update is triggered by the activation, requeueing -+ * or repositiong of an entity that does not coincide with -+ * sd->next_in_service, then a full lookup in the active tree -+ * can be avoided. 
In fact, it is enough to check whether the -+ * just-modified entity has a higher priority than -+ * sd->next_in_service, or, even if it has the same priority -+ * as sd->next_in_service, is eligible and has a lower virtual -+ * finish time than sd->next_in_service. If this compound -+ * condition holds, then the new entity becomes the new -+ * next_in_service. Otherwise no change is needed. -+ */ -+ if (new_entity && new_entity != sd->next_in_service) { -+ /* -+ * Flag used to decide whether to replace -+ * sd->next_in_service with new_entity. Tentatively -+ * set to true, and left as true if -+ * sd->next_in_service is NULL. -+ */ -+ bool replace_next = true; -+ -+ /* -+ * If there is already a next_in_service candidate -+ * entity, then compare class priorities or timestamps -+ * to decide whether to replace sd->service_tree with -+ * new_entity. -+ */ -+ if (next_in_service) { -+ unsigned int new_entity_class_idx = -+ bfq_class_idx(new_entity); -+ struct bfq_service_tree *st = -+ sd->service_tree + new_entity_class_idx; -+ -+ /* -+ * For efficiency, evaluate the most likely -+ * sub-condition first. -+ */ -+ replace_next = -+ (new_entity_class_idx == -+ bfq_class_idx(next_in_service) -+ && -+ !bfq_gt(new_entity->start, st->vtime) -+ && -+ bfq_gt(next_in_service->finish, -+ new_entity->finish)) -+ || -+ new_entity_class_idx < -+ bfq_class_idx(next_in_service); -+ } -+ -+ if (replace_next) -+ next_in_service = new_entity; -+ } else /* invoked because of a deactivation: lookup needed */ -+ next_in_service = bfq_lookup_next_entity(sd); -+ -+ if (next_in_service) { -+ parent_sched_may_change = !sd->next_in_service || -+ bfq_update_parent_budget(next_in_service); -+ } -+ -+ sd->next_in_service = next_in_service; -+ -+ if (!next_in_service) -+ return parent_sched_may_change; -+ -+ bfqq = bfq_entity_to_bfqq(next_in_service); -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "update_next_in_service: chosen this queue"); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(next_in_service, -+ struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "update_next_in_service: chosen this entity"); -+ } -+#endif -+ return parent_sched_may_change; -+} -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+/* both next loops stop at one of the child entities of the root group */ -+#define for_each_entity(entity) \ -+ for (; entity ; entity = entity->parent) -+ -+/* -+ * For each iteration, compute parent in advance, so as to be safe if -+ * entity is deallocated during the iteration. Such a deallocation may -+ * happen as a consequence of a bfq_put_queue that frees the bfq_queue -+ * containing entity. -+ */ -+#define for_each_entity_safe(entity, parent) \ -+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) -+ -+/* -+ * Returns true if this budget changes may let next_in_service->parent -+ * become the next_in_service entity for its parent entity. -+ */ -+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) -+{ -+ struct bfq_entity *bfqg_entity; -+ struct bfq_group *bfqg; -+ struct bfq_sched_data *group_sd; -+ bool ret = false; -+ -+ BUG_ON(!next_in_service); -+ -+ group_sd = next_in_service->sched_data; -+ -+ bfqg = container_of(group_sd, struct bfq_group, sched_data); -+ /* -+ * bfq_group's my_entity field is not NULL only if the group -+ * is not the root group. We must not touch the root entity -+ * as it must never become an in-service entity. 
-+ bfqg_entity = bfqg->my_entity; -+ if (bfqg_entity) { -+ if (bfqg_entity->budget > next_in_service->budget) -+ ret = true; -+ bfqg_entity->budget = next_in_service->budget; -+ } -+ -+ return ret; -+} -+ -+/* -+ * This function tells whether entity stops being a candidate for next -+ * service, according to the following logic. -+ * -+ * This function is invoked for an entity that is about to be set in -+ * service. If such an entity is a queue, then the entity is no longer -+ * a candidate for next service (i.e., a candidate entity to serve -+ * after the in-service entity is expired). The function then returns -+ * true. -+ * -+ * In contrast, the entity could still be a candidate for next service -+ * if it is not a queue, and has more than one child. In fact, even if -+ * one of its children is about to be set in service, other children -+ * may still be the next to serve. As a consequence, a non-queue -+ * entity is not a candidate for next-service only if it has only one -+ * child. And only if this condition holds does the function return -+ * true for a non-queue entity. -+ */ -+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) -+{ -+ struct bfq_group *bfqg; -+ -+ if (bfq_entity_to_bfqq(entity)) -+ return true; -+ -+ bfqg = container_of(entity, struct bfq_group, entity); -+ -+ BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group); -+ BUG_ON(bfqg->active_entities == 0); -+ if (bfqg->active_entities == 1) -+ return true; -+ -+ return false; -+} -+ -+#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#define for_each_entity(entity) \ -+ for (; entity ; entity = NULL) -+ -+#define for_each_entity_safe(entity, parent) \ -+ for (parent = NULL; entity ; entity = parent) -+ -+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) -+{ -+ return false; -+} -+ -+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) -+{ -+ return true; -+} -+ -+#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+ -+/* -+ * Shift for timestamp calculations. This actually limits the maximum -+ * service allowed in one timestamp delta (small shift values increase it), -+ * the maximum total weight that can be used for the queues in the system -+ * (big shift values increase it), and the period of virtual time -+ * wraparounds. -+ */ -+#define WFQ_SERVICE_SHIFT 22 -+ -+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = NULL; -+ -+ BUG_ON(!entity); -+ -+ if (!entity->my_sched_data) -+ bfqq = container_of(entity, struct bfq_queue, entity); -+ -+ return bfqq; -+} -+ -+ -+/** -+ * bfq_delta - map service into the virtual time domain. -+ * @service: amount of service. -+ * @weight: scale factor (weight of an entity or weight sum). -+ */ -+static u64 bfq_delta(unsigned long service, unsigned long weight) -+{ -+ u64 d = (u64)service << WFQ_SERVICE_SHIFT; -+ -+ do_div(d, weight); -+ return d; -+} -+ -+/** -+ * bfq_calc_finish - assign the finish time to an entity. -+ * @entity: the entity to act upon. -+ * @service: the service to be charged to the entity.
-+ */ -+static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned long long start, finish, delta; -+ -+ BUG_ON(entity->weight == 0); -+ -+ entity->finish = entity->start + -+ bfq_delta(service, entity->weight); -+ -+ start = ((entity->start>>10)*1000)>>12; -+ finish = ((entity->finish>>10)*1000)>>12; -+ delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12; -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "calc_finish: serv %lu, w %d", -+ service, entity->weight); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "calc_finish: start %llu, finish %llu, delta %llu", -+ start, finish, delta); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "calc_finish group: serv %lu, w %d", -+ service, entity->weight); -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "calc_finish group: start %llu, finish %llu, delta %llu", -+ start, finish, delta); -+#endif -+ } -+} -+ -+/** -+ * bfq_entity_of - get an entity from a node. -+ * @node: the node field of the entity. -+ * -+ * Convert a node pointer to the corresponding entity. This is used only -+ * to simplify the logic of some functions and not as the generic -+ * conversion mechanism because, e.g., in the tree walking functions, -+ * the check for a %NULL value would be redundant. -+ */ -+static struct bfq_entity *bfq_entity_of(struct rb_node *node) -+{ -+ struct bfq_entity *entity = NULL; -+ -+ if (node) -+ entity = rb_entry(node, struct bfq_entity, rb_node); -+ -+ return entity; -+} -+ -+/** -+ * bfq_extract - remove an entity from a tree. -+ * @root: the tree root. -+ * @entity: the entity to remove. -+ */ -+static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) -+{ -+ BUG_ON(entity->tree != root); -+ -+ entity->tree = NULL; -+ rb_erase(&entity->rb_node, root); -+} -+ -+/** -+ * bfq_idle_extract - extract an entity from the idle tree. -+ * @st: the service tree of the owning @entity. -+ * @entity: the entity being removed. -+ */ -+static void bfq_idle_extract(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct rb_node *next; -+ -+ BUG_ON(entity->tree != &st->idle); -+ -+ if (entity == st->first_idle) { -+ next = rb_next(&entity->rb_node); -+ st->first_idle = bfq_entity_of(next); -+ } -+ -+ if (entity == st->last_idle) { -+ next = rb_prev(&entity->rb_node); -+ st->last_idle = bfq_entity_of(next); -+ } -+ -+ bfq_extract(&st->idle, entity); -+ -+ if (bfqq) -+ list_del(&bfqq->bfqq_list); -+} -+ -+/** -+ * bfq_insert - generic tree insertion. -+ * @root: tree root. -+ * @entity: entity to insert. -+ * -+ * This is used for the idle and the active tree, since they are both -+ * ordered by finish time. -+ */ -+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) -+{ -+ struct bfq_entity *entry; -+ struct rb_node **node = &root->rb_node; -+ struct rb_node *parent = NULL; -+ -+ BUG_ON(entity->tree); -+ -+ while (*node) { -+ parent = *node; -+ entry = rb_entry(parent, struct bfq_entity, rb_node); -+ -+ if (bfq_gt(entry->finish, entity->finish)) -+ node = &parent->rb_left; -+ else -+ node = &parent->rb_right; -+ } -+ -+ rb_link_node(&entity->rb_node, parent, node); -+ rb_insert_color(&entity->rb_node, root); -+ -+ entity->tree = root; -+} -+ -+/** -+ * bfq_update_min - update the min_start field of an entity.
-+ * @entity: the entity to update. -+ * @node: one of its children. -+ * -+ * This function is called when @entity may store an invalid value for -+ * min_start due to updates to the active tree. The function assumes -+ * that the subtree rooted at @node (which may be its left or its right -+ * child) has a valid min_start value. -+ */ -+static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) -+{ -+ struct bfq_entity *child; -+ -+ if (node) { -+ child = rb_entry(node, struct bfq_entity, rb_node); -+ if (bfq_gt(entity->min_start, child->min_start)) -+ entity->min_start = child->min_start; -+ } -+} -+ -+/** -+ * bfq_update_active_node - recalculate min_start. -+ * @node: the node to update. -+ * -+ * @node may have changed position or one of its children may have moved; -+ * this function updates its min_start value. The left and right subtrees -+ * are assumed to hold a correct min_start value. -+ */ -+static void bfq_update_active_node(struct rb_node *node) -+{ -+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ entity->min_start = entity->start; -+ bfq_update_min(entity, node->rb_right); -+ bfq_update_min(entity, node->rb_left); -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "update_active_node: new min_start %llu", -+ ((entity->min_start>>10)*1000)>>12); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "update_active_node: new min_start %llu", -+ ((entity->min_start>>10)*1000)>>12); -+#endif -+ } -+} -+ -+/** -+ * bfq_update_active_tree - update min_start for the whole active tree. -+ * @node: the starting node. -+ * -+ * @node must be the deepest modified node after an update. This function -+ * updates its min_start using the values held by its children, assuming -+ * that they did not change, and then updates all the nodes that may have -+ * changed in the path to the root. The only nodes that may have changed -+ * are the ones in the path or their siblings. -+ */ -+static void bfq_update_active_tree(struct rb_node *node) -+{ -+ struct rb_node *parent; -+ -+up: -+ bfq_update_active_node(node); -+ -+ parent = rb_parent(node); -+ if (!parent) -+ return; -+ -+ if (node == parent->rb_left && parent->rb_right) -+ bfq_update_active_node(parent->rb_right); -+ else if (parent->rb_left) -+ bfq_update_active_node(parent->rb_left); -+ -+ node = parent; -+ goto up; -+} -+ -+static void bfq_weights_tree_add(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root); -+ -+static void bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root); -+ -+ -+/** -+ * bfq_active_insert - insert an entity in the active tree of its -+ * group/device. -+ * @st: the service tree of the entity. -+ * @entity: the entity being inserted. -+ * -+ * The active tree is ordered by finish time, but an extra key is kept -+ * for each node, containing the minimum value for the start times of -+ * its children (and the node itself), so it's possible to search for -+ * the eligible node with the lowest finish time in logarithmic time.
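The min_start maintenance just described is the classic augmented-search-tree pattern; the following stand-alone sketch (invented toy_ types, plain binary-tree children instead of rb_node, illustration only) shows the invariant that bfq_update_active_node() restores at each node:

	struct toy_node {
		unsigned long long start;
		unsigned long long min_start;	/* min of 'start' over this subtree */
		struct toy_node *left, *right;
	};

	/* Recompute a node's cached subtree minimum from its own start time
	 * and the caches of its children (the real code compares through
	 * bfq_gt() so that the comparison stays wrap-safe). */
	static void toy_update_min_start(struct toy_node *n)
	{
		n->min_start = n->start;
		if (n->left && n->left->min_start < n->min_start)
			n->min_start = n->left->min_start;
		if (n->right && n->right->min_start < n->min_start)
			n->min_start = n->right->min_start;
	}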
-+ */ -+static void bfq_active_insert(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct rb_node *node = &entity->rb_node; -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ struct bfq_sched_data *sd = NULL; -+ struct bfq_group *bfqg = NULL; -+ struct bfq_data *bfqd = NULL; -+#endif -+ -+ bfq_insert(&st->active, entity); -+ -+ if (node->rb_left) -+ node = node->rb_left; -+ else if (node->rb_right) -+ node = node->rb_right; -+ -+ bfq_update_active_tree(node); -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ sd = entity->sched_data; -+ bfqg = container_of(sd, struct bfq_group, sched_data); -+ BUG_ON(!bfqg); -+ bfqd = (struct bfq_data *)bfqg->bfqd; -+#endif -+ if (bfqq) -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { /* bfq_group */ -+ BUG_ON(!bfqd); -+ bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); -+ } -+ if (bfqg != bfqd->root_group) { -+ BUG_ON(!bfqg); -+ BUG_ON(!bfqd); -+ bfqg->active_entities++; -+ } -+#endif -+} -+ -+/** -+ * bfq_ioprio_to_weight - calc a weight from an ioprio. -+ * @ioprio: the ioprio value to convert. -+ */ -+static unsigned short bfq_ioprio_to_weight(int ioprio) -+{ -+ BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); -+ return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; -+} -+ -+/** -+ * bfq_weight_to_ioprio - calc an ioprio from a weight. -+ * @weight: the weight value to convert. -+ * -+ * To preserve as much as possible the old only-ioprio user interface, -+ * 0 is used as an escape ioprio value for weights (numerically) equal or -+ * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. -+ */ -+static unsigned short bfq_weight_to_ioprio(int weight) -+{ -+ BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); -+ return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ? -+ 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight; -+} -+ -+static void bfq_get_entity(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ if (bfqq) { -+ bfqq->ref++; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", -+ bfqq, bfqq->ref); -+ } -+} -+ -+/** -+ * bfq_find_deepest - find the deepest node that an extraction can modify. -+ * @node: the node being removed. -+ * -+ * Do the first step of an extraction in an rb tree, looking for the -+ * node that will replace @node, and returning the deepest node that -+ * the following modifications to the tree can touch. If @node is the -+ * last node in the tree return %NULL. -+ */ -+static struct rb_node *bfq_find_deepest(struct rb_node *node) -+{ -+ struct rb_node *deepest; -+ -+ if (!node->rb_right && !node->rb_left) -+ deepest = rb_parent(node); -+ else if (!node->rb_right) -+ deepest = node->rb_left; -+ else if (!node->rb_left) -+ deepest = node->rb_right; -+ else { -+ deepest = rb_next(node); -+ if (deepest->rb_right) -+ deepest = deepest->rb_right; -+ else if (rb_parent(deepest) != node) -+ deepest = rb_parent(deepest); -+ } -+ -+ return deepest; -+} -+ -+/** -+ * bfq_active_extract - remove an entity from the active tree. -+ * @st: the service_tree containing the tree. -+ * @entity: the entity being removed. 
-+ */ -+static void bfq_active_extract(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct rb_node *node; -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ struct bfq_sched_data *sd = NULL; -+ struct bfq_group *bfqg = NULL; -+ struct bfq_data *bfqd = NULL; -+#endif -+ -+ node = bfq_find_deepest(&entity->rb_node); -+ bfq_extract(&st->active, entity); -+ -+ if (node) -+ bfq_update_active_tree(node); -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ sd = entity->sched_data; -+ bfqg = container_of(sd, struct bfq_group, sched_data); -+ BUG_ON(!bfqg); -+ bfqd = (struct bfq_data *)bfqg->bfqd; -+#endif -+ if (bfqq) -+ list_del(&bfqq->bfqq_list); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { /* bfq_group */ -+ BUG_ON(!bfqd); -+ bfq_weights_tree_remove(bfqd, entity, -+ &bfqd->group_weights_tree); -+ } -+ if (bfqg != bfqd->root_group) { -+ BUG_ON(!bfqg); -+ BUG_ON(!bfqd); -+ BUG_ON(!bfqg->active_entities); -+ bfqg->active_entities--; -+ } -+#endif -+} -+ -+/** -+ * bfq_idle_insert - insert an entity into the idle tree. -+ * @st: the service tree containing the tree. -+ * @entity: the entity to insert. -+ */ -+static void bfq_idle_insert(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct bfq_entity *first_idle = st->first_idle; -+ struct bfq_entity *last_idle = st->last_idle; -+ -+ if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) -+ st->first_idle = entity; -+ if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) -+ st->last_idle = entity; -+ -+ bfq_insert(&st->idle, entity); -+ -+ if (bfqq) -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); -+} -+ -+/** -+ * bfq_forget_entity - do not consider entity any longer for scheduling -+ * @st: the service tree. -+ * @entity: the entity being removed. -+ * @is_in_service: true if entity is currently the in-service entity. -+ * -+ * Forget everything about @entity. In addition, if entity represents -+ * a queue, and the latter is not in service, then release the service -+ * reference to the queue (the one taken through bfq_get_entity). In -+ * fact, in this case, there is really no more service reference to -+ * the queue, as the latter is also outside any service tree. If, -+ * instead, the queue is in service, then __bfq_bfqd_reset_in_service -+ * will take care of putting the reference when the queue finally -+ * stops being served. -+ */ -+static void bfq_forget_entity(struct bfq_service_tree *st, -+ struct bfq_entity *entity, -+ bool is_in_service) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ BUG_ON(!entity->on_st); -+ -+ entity->on_st = false; -+ st->wsum -= entity->weight; -+ if (bfqq && !is_in_service) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity (before): %p %d", -+ bfqq, bfqq->ref); -+ bfq_put_queue(bfqq); -+ } -+} -+ -+/** -+ * bfq_put_idle_entity - release the idle tree ref of an entity. -+ * @st: service tree for the entity. -+ * @entity: the entity being released. -+ */ -+static void bfq_put_idle_entity(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ bfq_idle_extract(st, entity); -+ bfq_forget_entity(st, entity, -+ entity == entity->sched_data->in_service_entity); -+} -+ -+/** -+ * bfq_forget_idle - update the idle tree if necessary. -+ * @st: the service tree to act upon. -+ * -+ * To preserve the global O(log N) complexity we only remove one entry here; -+ * as the idle tree will not grow indefinitely this can be done safely. 
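For concreteness, the ioprio-to-weight mapping defined by bfq_ioprio_to_weight() above works out as follows with the in-tree constants (IOPRIO_BE_NR = 8, BFQ_WEIGHT_CONVERSION_COEFF = 10); the snippet is a stand-alone illustration, not part of the patch:

	#include <stdio.h>

	#define TOY_IOPRIO_BE_NR		8
	#define TOY_WEIGHT_CONVERSION_COEFF	10

	/* Mirror of bfq_ioprio_to_weight(): a lower ioprio value (= higher
	 * priority) maps to a proportionally higher weight. */
	static unsigned short toy_ioprio_to_weight(int ioprio)
	{
		return (TOY_IOPRIO_BE_NR - ioprio) * TOY_WEIGHT_CONVERSION_COEFF;
	}

	int main(void)
	{
		int ioprio;

		/* Prints weights 80, 70, 60, 50, 40, 30, 20, 10. */
		for (ioprio = 0; ioprio < TOY_IOPRIO_BE_NR; ioprio++)
			printf("ioprio %d -> weight %u\n",
			       ioprio, toy_ioprio_to_weight(ioprio));
		return 0;
	}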
-+ */ -+static void bfq_forget_idle(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *first_idle = st->first_idle; -+ struct bfq_entity *last_idle = st->last_idle; -+ -+ if (RB_EMPTY_ROOT(&st->active) && last_idle && -+ !bfq_gt(last_idle->finish, st->vtime)) { -+ /* -+ * Forget the whole idle tree, increasing the vtime past -+ * the last finish time of idle entities. -+ */ -+ st->vtime = last_idle->finish; -+ } -+ -+ if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) -+ bfq_put_idle_entity(st, first_idle); -+} -+ -+/* -+ * Update weight and priority of entity. If update_class_too is true, -+ * then update the ioprio_class of entity too. -+ * -+ * The reason why the update of ioprio_class is controlled through the -+ * last parameter is as follows. Changing the ioprio class of an -+ * entity implies changing the destination service trees for that -+ * entity. If such a change occurred when the entity is already on one -+ * of the service trees for its previous class, then the state of the -+ * entity would become more complex: none of the new possible service -+ * trees for the entity, according to bfq_entity_service_tree(), would -+ * match any of the possible service trees on which the entity -+ * is. Complex operations involving these trees, such as entity -+ * activations and deactivations, should take into account this -+ * additional complexity. To avoid this issue, this function is -+ * invoked with update_class_too unset in the points in the code where -+ * entity may happen to be on some tree. -+ */ -+static struct bfq_service_tree * -+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, -+ struct bfq_entity *entity, -+ bool update_class_too) -+{ -+ struct bfq_service_tree *new_st = old_st; -+ -+ if (entity->prio_changed) { -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned int prev_weight, new_weight; -+ struct bfq_data *bfqd = NULL; -+ struct rb_root *root; -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ struct bfq_sched_data *sd; -+ struct bfq_group *bfqg; -+#endif -+ -+ if (bfqq) -+ bfqd = bfqq->bfqd; -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ sd = entity->my_sched_data; -+ bfqg = container_of(sd, struct bfq_group, sched_data); -+ BUG_ON(!bfqg); -+ bfqd = (struct bfq_data *)bfqg->bfqd; -+ BUG_ON(!bfqd); -+ } -+#endif -+ -+ BUG_ON(old_st->wsum < entity->weight); -+ old_st->wsum -= entity->weight; -+ -+ if (entity->new_weight != entity->orig_weight) { -+ if (entity->new_weight < BFQ_MIN_WEIGHT || -+ entity->new_weight > BFQ_MAX_WEIGHT) { -+ pr_crit("update_weight_prio: new_weight %d\n", -+ entity->new_weight); -+ if (entity->new_weight < BFQ_MIN_WEIGHT) -+ entity->new_weight = BFQ_MIN_WEIGHT; -+ else -+ entity->new_weight = BFQ_MAX_WEIGHT; -+ } -+ entity->orig_weight = entity->new_weight; -+ if (bfqq) -+ bfqq->ioprio = -+ bfq_weight_to_ioprio(entity->orig_weight); -+ } -+ -+ if (bfqq && update_class_too) -+ bfqq->ioprio_class = bfqq->new_ioprio_class; -+ -+ /* -+ * Reset prio_changed only if the ioprio_class change -+ * is not pending any longer. -+ */ -+ if (!bfqq || bfqq->ioprio_class == bfqq->new_ioprio_class) -+ entity->prio_changed = 0; -+ -+ /* -+ * NOTE: here we may be changing the weight too early, -+ * this will cause unfairness. The correct approach -+ * would have required additional complexity to defer -+ * weight changes to the proper time instants (i.e., -+ * when entity->finish <= old_st->vtime). 
-+ */ -+ new_st = bfq_entity_service_tree(entity); -+ -+ prev_weight = entity->weight; -+ new_weight = entity->orig_weight * -+ (bfqq ? bfqq->wr_coeff : 1); -+ /* -+ * If the weight of the entity changes, remove the entity -+ * from its old weight counter (if there is a counter -+ * associated with the entity), and add it to the counter -+ * associated with its new weight. -+ */ -+ if (prev_weight != new_weight) { -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "weight changed %d %d(%d %d)", -+ prev_weight, new_weight, -+ entity->orig_weight, -+ bfqq->wr_coeff); -+ -+ root = bfqq ? &bfqd->queue_weights_tree : -+ &bfqd->group_weights_tree; -+ bfq_weights_tree_remove(bfqd, entity, root); -+ } -+ entity->weight = new_weight; -+ /* -+ * Add the entity to its weights tree only if it is -+ * not associated with a weight-raised queue. -+ */ -+ if (prev_weight != new_weight && -+ (bfqq ? bfqq->wr_coeff == 1 : 1)) -+ /* If we get here, root has been initialized. */ -+ bfq_weights_tree_add(bfqd, entity, root); -+ -+ new_st->wsum += entity->weight; -+ -+ if (new_st != old_st) -+ entity->start = new_st->vtime; -+ } -+ -+ return new_st; -+} -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); -+#endif -+ -+/** -+ * bfq_bfqq_served - update the scheduler status after selection for -+ * service. -+ * @bfqq: the queue being served. -+ * @served: bytes to transfer. -+ * -+ * NOTE: this can be optimized, as the timestamps of upper level entities -+ * are synchronized every time a new bfqq is selected for service. By now, -+ * we keep it to better check consistency. -+ */ -+static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st; -+ -+ for_each_entity(entity) { -+ st = bfq_entity_service_tree(entity); -+ -+ entity->service += served; -+ -+ BUG_ON(st->wsum == 0); -+ -+ st->vtime += bfq_delta(served, st->wsum); -+ bfq_forget_idle(st); -+ } -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); -+#endif -+ st = bfq_entity_service_tree(&bfqq->entity); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", -+ served, ((st->vtime>>10)*1000)>>12, st); -+} -+ -+/** -+ * bfq_bfqq_charge_time - charge an amount of service equivalent to the length -+ * of the time interval during which bfqq has been in -+ * service. -+ * @bfqd: the device -+ * @bfqq: the queue that needs a service update. -+ * @time_ms: the amount of time during which the queue has received service -+ * -+ * If a queue does not consume its budget fast enough, then providing -+ * the queue with service fairness may impair throughput, more or less -+ * severely. For this reason, queues that consume their budget slowly -+ * are provided with time fairness instead of service fairness. This -+ * goal is achieved through the BFQ scheduling engine, even if such an -+ * engine works in the service, and not in the time domain. The trick -+ * is charging these queues with an inflated amount of service, equal -+ * to the amount of service that they would have received during their -+ * service slot if they had been fast, i.e., if their requests had -+ * been dispatched at a rate equal to the estimated peak rate. -+ * -+ * It is worth noting that time fairness can cause important -+ * distortions in terms of bandwidth distribution, on devices with -+ * internal queueing. 
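To make the inflated charge below concrete, here is the same proportion computed stand-alone, with made-up numbers (illustration only): with a maximum budget of 16384 sectors and a 125 ms budget timeout, a queue that held the device for 50 ms is charged 6553 sectors, however few it actually transferred:

	/* Same proportion as in bfq_bfqq_charge_time() below; all constants
	 * here are made up for the example. */
	static int toy_serv_to_charge(void)
	{
		const int max_budget = 16384;		/* sectors */
		const unsigned int timeout_ms = 125;	/* budget timeout */
		const unsigned int time_ms = 50;	/* time actually spent in service */

		return (max_budget * time_ms) / timeout_ms;	/* = 6553 */
	}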
The reason is that I/O requests dispatched -+ * during the service slot of a queue may be served after that service -+ * slot is finished, and may have a total processing time loosely -+ * correlated with the duration of the service slot. This is -+ * especially true for short service slots. -+ */ -+static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ unsigned long time_ms) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ int tot_serv_to_charge = entity->service; -+ unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout); -+ -+ if (time_ms > 0 && time_ms < timeout_ms) -+ tot_serv_to_charge = -+ (bfqd->bfq_max_budget * time_ms) / timeout_ms; -+ -+ if (tot_serv_to_charge < entity->service) -+ tot_serv_to_charge = entity->service; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "charge_time: %lu/%u ms, %d/%d/%d sectors", -+ time_ms, timeout_ms, entity->service, -+ tot_serv_to_charge, entity->budget); -+ -+ /* Increase budget to avoid inconsistencies */ -+ if (tot_serv_to_charge > entity->budget) -+ entity->budget = tot_serv_to_charge; -+ -+ bfq_bfqq_served(bfqq, -+ max_t(int, 0, tot_serv_to_charge - entity->service)); -+} -+ -+static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, -+ struct bfq_service_tree *st, -+ bool backshifted) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct bfq_sched_data *sd = entity->sched_data; -+ -+ /* -+ * When this function is invoked, entity is not in any service -+ * tree, so it is safe to invoke the next function with the last -+ * parameter set (see the comments on the function). -+ */ -+ st = __bfq_entity_update_weight_prio(st, entity, true); -+ bfq_calc_finish(entity, entity->budget); -+ -+ /* -+ * If some queues enjoy backshifting for a while, then their -+ * (virtual) finish timestamps may happen to become lower and -+ * lower than the system virtual time. In particular, if -+ * these queues often happen to be idle for short time -+ * periods, and during such time periods other queues with -+ * higher timestamps happen to be busy, then the backshifted -+ * timestamps of the former queues can become much lower than -+ * the system virtual time. In fact, to serve the queues with -+ * higher timestamps while the ones with lower timestamps are -+ * idle, the system virtual time may be pushed up to much -+ * higher values than the finish timestamps of the idle -+ * queues. As a consequence, the finish timestamps of all new -+ * or newly activated queues may end up being much larger than -+ * those of lucky queues with backshifted timestamps. The -+ * latter queues may then monopolize the device for a lot of -+ * time. This would simply break service guarantees. -+ * -+ * To reduce this problem, push up a little bit the -+ * backshifted timestamps of the queue associated with this -+ * entity (only a queue can happen to have the backshifted -+ * flag set): just enough to let the finish timestamp of the -+ * queue be equal to the current value of the system virtual -+ * time. This may introduce a little unfairness among queues -+ * with backshifted timestamps, but it does not break -+ * worst-case fairness guarantees. -+ * -+ * As a special case, if bfqq is weight-raised, push up -+ * timestamps much less, to keep very low the probability that -+ * this push up causes the backshifted finish timestamps of -+ * weight-raised queues to become higher than the backshifted -+ * finish timestamps of non weight-raised queues.
-+ */ -+ if (backshifted && bfq_gt(st->vtime, entity->finish)) { -+ unsigned long delta = st->vtime - entity->finish; -+ -+ if (bfqq) -+ delta /= bfqq->wr_coeff; -+ -+ entity->start += delta; -+ entity->finish += delta; -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "__activate_entity: new queue finish %llu", -+ ((entity->finish>>10)*1000)>>12); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "__activate_entity: new group finish %llu", -+ ((entity->finish>>10)*1000)>>12); -+#endif -+ } -+ } -+ -+ bfq_active_insert(st, entity); -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "__activate_entity: queue %seligible in st %p", -+ entity->start <= st->vtime ? "" : "non ", st); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "__activate_entity: group %seligible in st %p", -+ entity->start <= st->vtime ? "" : "non ", st); -+#endif -+ } -+ BUG_ON(RB_EMPTY_ROOT(&st->active)); -+ BUG_ON(&st->active != &sd->service_tree->active && -+ &st->active != &(sd->service_tree+1)->active && -+ &st->active != &(sd->service_tree+2)->active); -+} -+ -+/** -+ * __bfq_activate_entity - handle activation of entity. -+ * @entity: the entity being activated. -+ * @non_blocking_wait_rq: true if entity was waiting for a request -+ * -+ * Called for a 'true' activation, i.e., if entity is not active and -+ * one of its children receives a new request. -+ * -+ * Basically, this function updates the timestamps of entity and -+ * inserts entity into its active tree, after possibly extracting it -+ * from its idle tree. -+ */ -+static void __bfq_activate_entity(struct bfq_entity *entity, -+ bool non_blocking_wait_rq) -+{ -+ struct bfq_sched_data *sd = entity->sched_data; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ bool backshifted = false; -+ unsigned long long min_vstart; -+ -+ BUG_ON(!sd); -+ BUG_ON(!st); -+ -+ /* See comments on bfq_bfqq_update_budg_for_activation */ -+ if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { -+ backshifted = true; -+ min_vstart = entity->finish; -+ } else -+ min_vstart = st->vtime; -+ -+ if (entity->tree == &st->idle) { -+ /* -+ * Must be on the idle tree, bfq_idle_extract() will -+ * check for that. -+ */ -+ bfq_idle_extract(st, entity); -+ entity->start = bfq_gt(min_vstart, entity->finish) ? -+ min_vstart : entity->finish; -+ } else { -+ /* -+ * The finish time of the entity may be invalid, and -+ * it is in the past for sure, otherwise the queue -+ * would have been on the idle tree. -+ */ -+ entity->start = min_vstart; -+ st->wsum += entity->weight; -+ /* -+ * entity is about to be inserted into a service tree, -+ * and then set in service: get a reference to make -+ * sure entity does not disappear until it is no -+ * longer in service or scheduled for service. -+ */
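A worked instance of the timestamp push-up performed in bfq_update_fin_time_enqueue() above (numbers made up for illustration): with st->vtime = 1000, entity->finish = 400 and wr_coeff = 3, the queue is moved forward by (1000 - 400) / 3 = 200 virtual-time units, while a non-raised queue (wr_coeff = 1) would be pushed the full 600:

	/* Mirror of the delta computation above: push a backshifted entity
	 * forward, but much less so if it is weight-raised. */
	static unsigned long long toy_backshift_delta(unsigned long long vtime,
						      unsigned long long finish,
						      unsigned int wr_coeff)
	{
		return (vtime - finish) / wr_coeff;
	}

	/* toy_backshift_delta(1000, 400, 3) == 200; with wr_coeff 1 it is 600. */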
-+ bfq_get_entity(entity); -+ -+ BUG_ON(entity->on_st && bfqq); -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ if (entity->on_st && !bfqq) { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, -+ entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, -+ bfqg, -+ "activate bug, class %d in_service %p", -+ bfq_class_idx(entity), sd->in_service_entity); -+ } -+#endif -+ BUG_ON(entity->on_st && !bfqq); -+ entity->on_st = true; -+ } -+ -+ bfq_update_fin_time_enqueue(entity, st, backshifted); -+} -+ -+/** -+ * __bfq_requeue_entity - handle requeueing or repositioning of an entity. -+ * @entity: the entity being requeued or repositioned. -+ * -+ * Requeueing is needed if this entity stops being served, which -+ * happens if a leaf descendant entity has expired. On the other hand, -+ * repositioning is needed if the next_in_service entity for the child -+ * entity has changed. See the comments inside the function for -+ * details. -+ * -+ * Basically, this function: 1) removes entity from its active tree if -+ * present there, 2) updates the timestamps of entity and 3) inserts -+ * entity back into its active tree (in the new, right position for -+ * the new values of the timestamps). -+ */ -+static void __bfq_requeue_entity(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sd = entity->sched_data; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ BUG_ON(!sd); -+ BUG_ON(!st); -+ -+ BUG_ON(entity != sd->in_service_entity && -+ entity->tree != &st->active); -+ -+ if (entity == sd->in_service_entity) { -+ /* -+ * We are requeueing the current in-service entity, -+ * which may have to be done for one of the following -+ * reasons: -+ * - entity represents the in-service queue, and the -+ * in-service queue is being requeued after an -+ * expiration; -+ * - entity represents a group, and its budget has -+ * changed because one of its child entities has -+ * just been either activated or requeued for some -+ * reason; the timestamps of the entity need then to -+ * be updated, and the entity needs to be enqueued -+ * or repositioned accordingly. -+ * -+ * In particular, before requeueing, the start time of -+ * the entity must be moved forward to account for the -+ * service that the entity has received while in -+ * service. This is done by the next instructions. The -+ * finish time will then be updated according to this -+ * new value of the start time, and to the budget of -+ * the entity. -+ */ -+ bfq_calc_finish(entity, entity->service); -+ entity->start = entity->finish; -+ BUG_ON(entity->tree && entity->tree != &st->active); -+ /* -+ * In addition, if the entity had more than one child -+ * when set in service, then it was not extracted from -+ * the active tree. This implies that the position of -+ * the entity in the active tree may need to be -+ * changed now, because we have just updated the start -+ * time of the entity, and we will update its finish -+ * time in a moment (the requeueing is then, more -+ * precisely, a repositioning in this case). To -+ * implement this repositioning, we: 1) dequeue the -+ * entity here, 2) update the finish time and -+ * requeue the entity according to the new -+ * timestamps below.
-+ */ -+ if (entity->tree) -+ bfq_active_extract(st, entity); -+ } else { /* The entity is already active, and not in service */ -+ /* -+ * In this case, this function gets called only if the -+ * next_in_service entity below this entity has -+ * changed, and this change has caused the budget of -+ * this entity to change, which, finally implies that -+ * the finish time of this entity must be -+ * updated. Such an update may cause the scheduling, -+ * i.e., the position in the active tree, of this -+ * entity to change. We handle this change by: 1) -+ * dequeueing the entity here, 2) updating the finish -+ * time and requeueing the entity according to the new -+ * timestamps below. This is the same approach as the -+ * non-extracted-entity sub-case above. -+ */ -+ bfq_active_extract(st, entity); -+ } -+ -+ bfq_update_fin_time_enqueue(entity, st, false); -+} -+ -+static void __bfq_activate_requeue_entity(struct bfq_entity *entity, -+ struct bfq_sched_data *sd, -+ bool non_blocking_wait_rq) -+{ -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ if (sd->in_service_entity == entity || entity->tree == &st->active) -+ /* -+ * in service or already queued on the active tree, -+ * requeue or reposition -+ */ -+ __bfq_requeue_entity(entity); -+ else -+ /* -+ * Not in service and not queued on its active tree: -+ * the activity is idle and this is a true activation. -+ */ -+ __bfq_activate_entity(entity, non_blocking_wait_rq); -+} -+ -+ -+/** -+ * bfq_activate_requeue_entity - activate or requeue an entity representing a -+ * bfq_queue, and activate, requeue or reposition all ancestors -+ * for which such an update becomes necessary. -+ * @entity: the entity to activate. -+ * @non_blocking_wait_rq: true if this entity was waiting for a request -+ * @requeue: true if this is a requeue, which implies that bfqq is -+ * being expired; thus ALL its ancestors stop being served and must -+ * therefore be requeued -+ */ -+static void bfq_activate_requeue_entity(struct bfq_entity *entity, -+ bool non_blocking_wait_rq, -+ bool requeue) -+{ -+ struct bfq_sched_data *sd; -+ -+ for_each_entity(entity) { -+ BUG_ON(!entity); -+ sd = entity->sched_data; -+ __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); -+ -+ BUG_ON(RB_EMPTY_ROOT(&sd->service_tree->active) && -+ RB_EMPTY_ROOT(&(sd->service_tree+1)->active) && -+ RB_EMPTY_ROOT(&(sd->service_tree+2)->active)); -+ -+ if (!bfq_update_next_in_service(sd, entity) && !requeue) { -+ BUG_ON(!sd->next_in_service); -+ break; -+ } -+ BUG_ON(!sd->next_in_service); -+ } -+} -+ -+/** -+ * __bfq_deactivate_entity - deactivate an entity from its service tree. -+ * @entity: the entity to deactivate. -+ * @ins_into_idle_tree: if false, the entity will not be put into the -+ * idle tree. -+ * -+ * Deactivates an entity, independently of its previous state. Must -+ * be invoked only if entity is on a service tree. Extracts the entity -+ * from that tree, and if necessary and allowed, puts it on the idle -+ * tree. -+ */ -+static bool __bfq_deactivate_entity(struct bfq_entity *entity, -+ bool ins_into_idle_tree) -+{ -+ struct bfq_sched_data *sd = entity->sched_data; -+ struct bfq_service_tree *st; -+ bool is_in_service; -+ -+ if (!entity->on_st) { /* entity never activated, or already inactive */ -+ BUG_ON(sd && entity == sd->in_service_entity); -+ return false; -+ } -+ -+ /* -+ * If we get here, then entity is active, which implies that -+ * bfq_group_set_parent has already been invoked for the group -+ * represented by entity.
Therefore, the field -+ * entity->sched_data has been set, and we can safely use it. -+ */ -+ st = bfq_entity_service_tree(entity); -+ is_in_service = entity == sd->in_service_entity; -+ -+ BUG_ON(is_in_service && entity->tree && entity->tree != &st->active); -+ -+ if (is_in_service) -+ bfq_calc_finish(entity, entity->service); -+ -+ if (entity->tree == &st->active) -+ bfq_active_extract(st, entity); -+ else if (!is_in_service && entity->tree == &st->idle) -+ bfq_idle_extract(st, entity); -+ else if (entity->tree) -+ BUG(); -+ -+ if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime)) -+ bfq_forget_entity(st, entity, is_in_service); -+ else -+ bfq_idle_insert(st, entity); -+ -+ return true; -+} -+ -+/** -+ * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. -+ * @entity: the entity to deactivate. -+ * @ins_into_idle_tree: true if the entity can be put on the idle tree -+ */ -+static void bfq_deactivate_entity(struct bfq_entity *entity, -+ bool ins_into_idle_tree, -+ bool expiration) -+{ -+ struct bfq_sched_data *sd; -+ struct bfq_entity *parent = NULL; -+ -+ for_each_entity_safe(entity, parent) { -+ sd = entity->sched_data; -+ -+ BUG_ON(sd == NULL); /* -+ * It would mean that this is the -+ * root group. -+ */ -+ -+ BUG_ON(expiration && entity != sd->in_service_entity); -+ -+ BUG_ON(entity != sd->in_service_entity && -+ entity->tree == -+ &bfq_entity_service_tree(entity)->active && -+ !sd->next_in_service); -+ -+ if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) { -+ /* -+ * entity is not in any tree any more, so -+ * this deactivation is a no-op, and there is -+ * nothing to change for upper-level entities -+ * (in case of expiration, this can never -+ * happen). -+ */ -+ BUG_ON(expiration); /* -+ * entity cannot be already out of -+ * any tree -+ */ -+ return; -+ } -+ -+ if (sd->next_in_service == entity) -+ /* -+ * entity was the next_in_service entity, -+ * then, since entity has just been -+ * deactivated, a new one must be found. -+ */ -+ bfq_update_next_in_service(sd, NULL); -+ -+ if (sd->next_in_service) { -+ /* -+ * The parent entity is still backlogged, -+ * because next_in_service is not NULL. So, no -+ * further upwards deactivation must be -+ * performed. Yet, next_in_service has -+ * changed. Then the schedule does need to be -+ * updated upwards. -+ */ -+ BUG_ON(sd->next_in_service == entity); -+ break; -+ } -+ -+ /* -+ * If we get here, then the parent is no longer -+ * backlogged and we need to propagate the -+ * deactivation upwards. Thus let the loop go on. -+ */ -+ -+ /* -+ * Also let parent be queued into the idle tree on -+ * deactivation, to preserve service guarantees, and -+ * assuming that the caller of this function does not -+ * need parent entities to be removed completely as well. -+ */ -+ ins_into_idle_tree = true; -+ } -+ -+ /* -+ * If the deactivation loop is fully executed, then there are -+ * no more entities to touch and next loop is not executed at -+ * all. Otherwise, requeue remaining entities if they are -+ * about to stop receiving service, or reposition them if this -+ * is not the case. -+ */
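The forget-vs-idle-tree decision that __bfq_deactivate_entity() takes above reduces to the following test; a stand-alone sketch with invented toy_ names, illustration only:

	/* Keep a deactivated entity on the idle tree only if the caller
	 * allows it and the entity's finish time is still in the virtual
	 * future; otherwise it is forgotten outright (the real code
	 * compares with bfq_gt() for wrap-safety). */
	static int toy_goes_to_idle_tree(int ins_into_idle_tree,
					 unsigned long long finish,
					 unsigned long long vtime)
	{
		return ins_into_idle_tree && finish > vtime;
	}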
-+ entity = parent; -+ for_each_entity(entity) { -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ /* -+ * Invoke __bfq_requeue_entity on entity, even if -+ * already active, to requeue/reposition it in the -+ * active tree (because sd->next_in_service has -+ * changed) -+ */ -+ __bfq_requeue_entity(entity); -+ -+ sd = entity->sched_data; -+ BUG_ON(expiration && sd->in_service_entity != entity); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "invoking update_next for this queue"); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, -+ struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "invoking update_next for this entity"); -+ } -+#endif -+ if (!bfq_update_next_in_service(sd, entity) && -+ !expiration) -+ /* -+ * next_in_service unchanged or not causing -+ * any change in entity->parent->sd, and no -+ * requeueing needed for expiration: stop -+ * here. -+ */ -+ break; -+ } -+} -+ -+/** -+ * bfq_calc_vtime_jump - compute the value to which the vtime should jump, -+ * if needed, to have at least one entity eligible. -+ * @st: the service tree to act upon. -+ * -+ * Assumes that st is not empty. -+ */ -+static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *root_entity = bfq_root_active_entity(&st->active); -+ -+ if (bfq_gt(root_entity->min_start, st->vtime)) { -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(root_entity); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "calc_vtime_jump: new value %llu", -+ root_entity->min_start); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(root_entity, struct bfq_group, -+ entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "calc_vtime_jump: new value %llu", -+ root_entity->min_start); -+ } -+#endif -+ return root_entity->min_start; -+ } -+ return st->vtime; -+} -+ -+static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value) -+{ -+ if (new_value > st->vtime) { -+ st->vtime = new_value; -+ bfq_forget_idle(st); -+ } -+} -+ -+/** -+ * bfq_first_active_entity - find the eligible entity with -+ * the smallest finish time -+ * @st: the service tree to select from. -+ * @vtime: the system virtual time to use as a reference for eligibility -+ * -+ * This function searches for the first schedulable entity, starting from the -+ * root of the tree and going on the left every time on this side there is -+ * a subtree with at least one eligible (start <= vtime) entity. The path on -+ * the right is followed only if a) the left subtree contains no eligible -+ * entities and b) no eligible entity has been found yet. -+ */ -+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, -+ u64 vtime) -+{ -+ struct bfq_entity *entry, *first = NULL; -+ struct rb_node *node = st->active.rb_node; -+ -+ while (node) { -+ entry = rb_entry(node, struct bfq_entity, rb_node); -+left: -+ if (!bfq_gt(entry->start, vtime)) -+ first = entry; -+ -+ BUG_ON(bfq_gt(entry->min_start, vtime)); -+ -+ if (node->rb_left) { -+ entry = rb_entry(node->rb_left, -+ struct bfq_entity, rb_node); -+ if (!bfq_gt(entry->min_start, vtime)) { -+ node = node->rb_left; -+ goto left; -+ } -+ } -+ if (first) -+ break; -+ node = node->rb_right; -+ } -+ -+ BUG_ON(!first && !RB_EMPTY_ROOT(&st->active)); -+ return first; -+} -+ -+/** -+ * __bfq_lookup_next_entity - return the first eligible entity in @st. -+ * @st: the service tree.
-+ * -+ * If there is no in-service entity for the sched_data st belongs to, -+ * then return the entity that will be set in service if: -+ * 1) the parent entity this st belongs to is set in service; -+ * 2) no entity belonging to such parent entity undergoes a state change -+ * that would influence the timestamps of the entity (e.g., becomes idle, -+ * becomes backlogged, changes its budget, ...). -+ * -+ * In this first case, update the virtual time in @st too (see the -+ * comments on this update inside the function). -+ * -+ * In contrast, if there is an in-service entity, then return the -+ * entity that would be set in service if not only the above -+ * conditions, but also the next one held true: the currently -+ * in-service entity, on expiration, -+ * 1) gets a finish time equal to the current one, or -+ * 2) is not eligible any more, or -+ * 3) is idle. -+ */ -+static struct bfq_entity * -+__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service -+#if 0 -+ , bool force -+#endif -+ ) -+{ -+ struct bfq_entity *entity -+#if 0 -+ , *new_next_in_service = NULL -+#endif -+ ; -+ u64 new_vtime; -+ struct bfq_queue *bfqq; -+ -+ if (RB_EMPTY_ROOT(&st->active)) -+ return NULL; -+ -+ /* -+ * Get the value of the system virtual time for which at -+ * least one entity is eligible. -+ */ -+ new_vtime = bfq_calc_vtime_jump(st); -+ -+ /* -+ * If there is no in-service entity for the sched_data this -+ * active tree belongs to, then push the system virtual time -+ * up to the value that guarantees that at least one entity is -+ * eligible. If, instead, there is an in-service entity, then -+ * do not make any such update, because there is already an -+ * eligible entity, namely the in-service one (even if the -+ * entity is not on st, because it was extracted when set in -+ * service). -+ */ -+ if (!in_service) -+ bfq_update_vtime(st, new_vtime); -+ -+ entity = bfq_first_active_entity(st, new_vtime); -+ BUG_ON(bfq_gt(entity->start, new_vtime)); -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "__lookup_next: start %llu vtime %llu st %p", -+ ((entity->start>>10)*1000)>>12, -+ ((new_vtime>>10)*1000)>>12, st); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "__lookup_next: start %llu vtime %llu st %p", -+ ((entity->start>>10)*1000)>>12, -+ ((new_vtime>>10)*1000)>>12, st); -+ } -+#endif -+ -+ BUG_ON(!entity); -+ -+ return entity; -+} -+ -+/** -+ * bfq_lookup_next_entity - return the first eligible entity in @sd. -+ * @sd: the sched_data. -+ * -+ * This function is invoked when there has been a change in the trees -+ * for sd, and we need to know what is the new next entity after this -+ * change. -+ */ -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) -+{ -+ struct bfq_service_tree *st = sd->service_tree; -+ struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); -+ struct bfq_entity *entity = NULL; -+ struct bfq_queue *bfqq; -+ int class_idx = 0; -+ -+ BUG_ON(!sd); -+ BUG_ON(!st); -+ /* -+ * Choose from idle class, if needed to guarantee a minimum -+ * bandwidth to this class (and if there is some active entity -+ * in idle class). This should also mitigate -+ * priority-inversion problems in case a low priority task is -+ * holding file system resources.
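The class scan implemented just below is strict priority with an anti-starvation escape hatch for CLASS_IDLE; a stand-alone sketch of the same selection order (invented toy_ names, illustration only):

	#define TOY_IOPRIO_CLASSES 3	/* scan order: RT, then BE, then IDLE */

	/* Pick the first backlogged class; if the idle class has waited
	 * longer than its timeout, start the scan directly from it. */
	static int toy_pick_class(const int backlogged[TOY_IOPRIO_CLASSES],
				  int idle_class_overdue)
	{
		int class_idx = idle_class_overdue ? TOY_IOPRIO_CLASSES - 1 : 0;

		for (; class_idx < TOY_IOPRIO_CLASSES; class_idx++)
			if (backlogged[class_idx])
				return class_idx;
		return -1;	/* nothing to serve */
	}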
-+ */ -+ if (time_is_before_jiffies(sd->bfq_class_idle_last_service + -+ BFQ_CL_IDLE_TIMEOUT)) { -+ if (!RB_EMPTY_ROOT(&idle_class_st->active)) -+ class_idx = BFQ_IOPRIO_CLASSES - 1; -+ /* About to be served if backlogged, or not yet backlogged */ -+ sd->bfq_class_idle_last_service = jiffies; -+ } -+ -+ /* -+ * Find the next entity to serve for the highest-priority -+ * class, unless the idle class needs to be served. -+ */ -+ for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { -+ entity = __bfq_lookup_next_entity(st + class_idx, -+ sd->in_service_entity); -+ -+ if (entity) -+ break; -+ } -+ -+ BUG_ON(!entity && -+ (!RB_EMPTY_ROOT(&st->active) || !RB_EMPTY_ROOT(&(st+1)->active) || -+ !RB_EMPTY_ROOT(&(st+2)->active))); -+ -+ if (!entity) -+ return NULL; -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d", -+ st + class_idx, class_idx); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "chosen from st %p %d", -+ st + class_idx, class_idx); -+ } -+#endif -+ -+ return entity; -+} -+ -+static bool next_queue_may_preempt(struct bfq_data *bfqd) -+{ -+ struct bfq_sched_data *sd = &bfqd->root_group->sched_data; -+ -+ return sd->next_in_service != sd->in_service_entity; -+} -+ -+/* -+ * Get next queue for service. -+ */ -+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_entity *entity = NULL; -+ struct bfq_sched_data *sd; -+ struct bfq_queue *bfqq; -+ -+ BUG_ON(bfqd->in_service_queue); -+ -+ if (bfqd->busy_queues == 0) -+ return NULL; -+ -+ /* -+ * Traverse the path from the root to the leaf entity to -+ * serve. Set in service all the entities visited along the -+ * way. -+ */ -+ sd = &bfqd->root_group->sched_data; -+ for (; sd ; sd = entity->my_sched_data) { -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ if (entity) { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg(bfqd, bfqg, -+ "get_next_queue: lookup in this group"); -+ if (!sd->next_in_service) -+ pr_crit("get_next_queue: lookup in this group"); -+ } else { -+ bfq_log_bfqg(bfqd, bfqd->root_group, -+ "get_next_queue: lookup in root group"); -+ if (!sd->next_in_service) -+ pr_crit("get_next_queue: lookup in root group"); -+ } -+#endif -+ -+ BUG_ON(!sd->next_in_service); -+ -+ /* -+ * WARNING. We are about to set the in-service entity -+ * to sd->next_in_service, i.e., to the (cached) value -+ * returned by bfq_lookup_next_entity(sd) the last -+ * time it was invoked, i.e., the last time when the -+ * service order in sd changed as a consequence of the -+ * activation or deactivation of an entity. In this -+ * respect, if we execute bfq_lookup_next_entity(sd) -+ * in this very moment, it may, although with low -+ * probability, yield a different entity than that -+ * pointed to by sd->next_in_service. This rare event -+ * happens in case there was no CLASS_IDLE entity to -+ * serve for sd when bfq_lookup_next_entity(sd) was -+ * invoked for the last time, while there is now one -+ * such entity. -+ * -+ * If the above event happens, then the scheduling of -+ * such entity in CLASS_IDLE is postponed until the -+ * service of the sd->next_in_service entity -+ * finishes. In fact, when the latter is expired, -+ * bfq_lookup_next_entity(sd) gets called again, -+ * exactly to update sd->next_in_service. 
-+ */ -+ -+ /* Make next_in_service entity become in_service_entity */ -+ entity = sd->next_in_service; -+ sd->in_service_entity = entity; -+ -+ /* -+ * Reset the accumulator of the amount of service that -+ * the entity is about to receive. -+ */ -+ entity->service = 0; -+ -+ /* -+ * If entity is no longer a candidate for next -+ * service, then we extract it from its active tree, -+ * for the following reason. To further boost the -+ * throughput in some special case, BFQ needs to know -+ * which is the next candidate entity to serve, while -+ * there is already an entity in service. In this -+ * respect, to make it easy to compute/update the next -+ * candidate entity to serve after the current -+ * candidate has been set in service, there is a case -+ * where it is necessary to extract the current -+ * candidate from its service tree. Such a case is -+ * when the entity just set in service cannot also be -+ * a candidate for next service. Details about when -+ * this condition holds are reported in the comments -+ * on the function bfq_no_longer_next_in_service() -+ * invoked below. -+ */ -+ if (bfq_no_longer_next_in_service(entity)) -+ bfq_active_extract(bfq_entity_service_tree(entity), -+ entity); -+ -+ /* -+ * For the same reason why we may have just extracted -+ * entity from its active tree, we may need to update -+ * next_in_service for the sched_data of entity too, -+ * regardless of whether entity has been extracted. -+ * In fact, even if entity has not been extracted, a -+ * descendant entity may get extracted. Such an event -+ * would cause a change in next_in_service for the -+ * level of the descendant entity, and thus possibly -+ * back to upper levels. -+ * -+ * We cannot perform the resulting needed update -+ * before the end of this loop, because, to know which -+ * is the correct next-to-serve candidate entity for -+ * each level, we need first to find the leaf entity -+ * to set in service. In fact, only after we know -+ * which is the next-to-serve leaf entity, we can -+ * discover whether the parent entity of the leaf -+ * entity becomes the next-to-serve, and so on. -+ */ -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, -+ "get_next_queue: this queue, finish %llu", -+ (((entity->finish>>10)*1000)>>10)>>2); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg(bfqd, bfqg, -+ "get_next_queue: this entity, finish %llu", -+ (((entity->finish>>10)*1000)>>10)>>2); -+ } -+#endif -+ -+ } -+ -+ BUG_ON(!entity); -+ bfqq = bfq_entity_to_bfqq(entity); -+ BUG_ON(!bfqq); -+ -+ /* -+ * We can finally update all next-to-serve entities along the -+ * path from the leaf entity just set in service to the root. -+ */
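The shift sequence in the log statements above is only a fixed-point rescale: with WFQ_SERVICE_SHIFT = 22, (((v>>10)*1000)>>10)>>2 equals v * 1000 / 2^22 up to truncation of the low bits, i.e. the timestamp in thousandths of a service unit. A stand-alone check:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long v = 1ULL << 22;	/* one service unit, fixed point */

		/* Both lines print 1000 here; in general they differ only by
		 * the truncation introduced by the early >>10. */
		printf("%llu\n", (((v >> 10) * 1000) >> 10) >> 2);
		printf("%llu\n", (v * 1000) >> 22);
		return 0;
	}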
-+ for_each_entity(entity) { -+ struct bfq_sched_data *sd = entity->sched_data; -+ -+ if (!bfq_update_next_in_service(sd, NULL)) -+ break; -+ } -+ -+ return bfqq; -+} -+ -+static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; -+ struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; -+ struct bfq_entity *entity = in_serv_entity; -+ -+ if (bfqd->in_service_bic) { -+ put_io_context(bfqd->in_service_bic->icq.ioc); -+ bfqd->in_service_bic = NULL; -+ } -+ -+ bfq_clear_bfqq_wait_request(in_serv_bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ bfqd->in_service_queue = NULL; -+ -+ /* -+ * When this function is called, all in-service entities have -+ * been properly deactivated or requeued, so we can safely -+ * execute the final step: reset in_service_entity along the -+ * path from entity to the root. -+ */ -+ for_each_entity(entity) -+ entity->sched_data->in_service_entity = NULL; -+ -+ /* -+ * in_serv_entity is no longer in service, so, if it is in no -+ * service tree either, then release the service reference to -+ * the queue it represents (taken with bfq_get_entity). -+ */ -+ if (!in_serv_entity->on_st) -+ bfq_put_queue(in_serv_bfqq); -+} -+ -+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool ins_into_idle_tree, bool expiration) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ bfq_deactivate_entity(entity, ins_into_idle_tree, expiration); -+} -+ -+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ BUG_ON(entity->tree != &st->active && entity->tree != &st->idle && -+ entity->on_st); -+ -+ bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), -+ false); -+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); -+} -+ -+static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ bfq_activate_requeue_entity(entity, false, -+ bfqq == bfqd->in_service_queue); -+} -+ -+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); -+ -+/* -+ * Called when the bfqq no longer has requests pending, remove it from -+ * the service tree. As a special case, it can be invoked during an -+ * expiration. -+ */ -+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool expiration) -+{ -+ BUG_ON(!bfq_bfqq_busy(bfqq)); -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ bfq_log_bfqq(bfqd, bfqq, "del from busy"); -+ -+ bfq_clear_bfqq_busy(bfqq); -+ -+ BUG_ON(bfqd->busy_queues == 0); -+ bfqd->busy_queues--; -+ -+ if (!bfqq->dispatched) -+ bfq_weights_tree_remove(bfqd, &bfqq->entity, -+ &bfqd->queue_weights_tree); -+ -+ if (bfqq->wr_coeff > 1) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+ -+ bfqg_stats_update_dequeue(bfqq_group(bfqq)); -+ -+ BUG_ON(bfqq->entity.budget < 0); -+ -+ bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); -+} -+ -+/* -+ * Called when an inactive queue receives a new request.
-+ */ -+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ BUG_ON(bfq_bfqq_busy(bfqq)); -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ -+ bfq_log_bfqq(bfqd, bfqq, "add to busy"); -+ -+ bfq_activate_bfqq(bfqd, bfqq); -+ -+ bfq_mark_bfqq_busy(bfqq); -+ bfqd->busy_queues++; -+ -+ if (!bfqq->dispatched) -+ if (bfqq->wr_coeff == 1) -+ bfq_weights_tree_add(bfqd, &bfqq->entity, -+ &bfqd->queue_weights_tree); -+ -+ if (bfqq->wr_coeff > 1) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); -+ } -+ -+} -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -new file mode 100644 -index 000000000000..65e7c7e77f3c ---- /dev/null -+++ b/block/bfq-sq-iosched.c -@@ -0,0 +1,5379 @@ -+/* -+ * Budget Fair Queueing (BFQ) I/O scheduler. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe -+ * -+ * Copyright (C) 2008 Fabio Checconi -+ * Paolo Valente -+ * -+ * Copyright (C) 2015 Paolo Valente -+ * -+ * Copyright (C) 2017 Paolo Valente -+ * -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ -+ * file. -+ * -+ * BFQ is a proportional-share I/O scheduler, with some extra -+ * low-latency capabilities. BFQ also supports full hierarchical -+ * scheduling through cgroups. Next paragraphs provide an introduction -+ * on BFQ inner workings. Details on BFQ benefits and usage can be -+ * found in Documentation/block/bfq-iosched.txt. -+ * -+ * BFQ is a proportional-share storage-I/O scheduling algorithm based -+ * on the slice-by-slice service scheme of CFQ. But BFQ assigns -+ * budgets, measured in number of sectors, to processes instead of -+ * time slices. The device is not granted to the in-service process -+ * for a given time slice, but until it has exhausted its assigned -+ * budget. This change from the time to the service domain enables BFQ -+ * to distribute the device throughput among processes as desired, -+ * without any distortion due to throughput fluctuations, or to device -+ * internal queueing. BFQ uses an ad hoc internal scheduler, called -+ * B-WF2Q+, to schedule processes according to their budgets. More -+ * precisely, BFQ schedules queues associated with processes. Thanks to -+ * the accurate policy of B-WF2Q+, BFQ can afford to assign high -+ * budgets to I/O-bound processes issuing sequential requests (to -+ * boost the throughput), and yet guarantee a low latency to -+ * interactive and soft real-time applications. -+ * -+ * NOTE: if the main or only goal, with a given device, is to achieve -+ * the maximum-possible throughput at all times, then do switch off -+ * all low-latency heuristics for that device, by setting low_latency -+ * to 0. -+ * -+ * BFQ is described in [1], where also a reference to the initial, more -+ * theoretical paper on BFQ can be found. The interested reader can find -+ * in the latter paper full details on the main algorithm, as well as -+ * formulas of the guarantees and formal proofs of all the properties. -+ * With respect to the version of BFQ presented in these papers, this -+ * implementation adds a few more heuristics, such as the one that -+ * guarantees a low latency to soft real-time applications, and a -+ * hierarchical extension based on H-WF2Q+. -+ * -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) -+ * complexity derives from the one introduced with EEVDF in [3]. -+ * -+ * [1] P. Valente, A. 
Avanzini, "Evolution of the BFQ Storage I/O -+ * Scheduler", Proceedings of the First Workshop on Mobile System -+ * Technologies (MST-2015), May 2015. -+ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf -+ * -+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf -+ * -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, -+ * Oct 1997. -+ * -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz -+ * -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline -+ * First: A Flexible and Accurate Mechanism for Proportional Share -+ * Resource Allocation,'' technical report. -+ * -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "blk.h" -+#include "bfq.h" -+ -+/* Expiration time of sync (0) and async (1) requests, in ns. */ -+static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; -+ -+/* Maximum backwards seek, in KiB. */ -+static const int bfq_back_max = (16 * 1024); -+ -+/* Penalty of a backwards seek, in number of sectors. */ -+static const int bfq_back_penalty = 2; -+ -+/* Idling period duration, in ns. */ -+static u32 bfq_slice_idle = (NSEC_PER_SEC / 125); -+ -+/* Minimum number of assigned budgets for which stats are safe to compute. */ -+static const int bfq_stats_min_budgets = 194; -+ -+/* Default maximum budget values, in sectors and number of requests. */ -+static const int bfq_default_max_budget = (16 * 1024); -+ -+/* -+ * Async to sync throughput distribution is controlled as follows: -+ * when an async request is served, the entity is charged the number -+ * of sectors of the request, multiplied by the factor below -+ */ -+static const int bfq_async_charge_factor = 10; -+ -+/* Default timeout values, in jiffies, approximating CFQ defaults. */ -+static const int bfq_timeout = (HZ / 8); -+ -+static struct kmem_cache *bfq_pool; -+ -+/* Below this threshold (in ns), we consider thinktime immediate. */ -+#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) -+ -+/* hw_tag detection: parallel requests threshold and min samples needed. */ -+#define BFQ_HW_QUEUE_THRESHOLD 4 -+#define BFQ_HW_QUEUE_SAMPLES 32 -+ -+#define BFQQ_SEEK_THR (sector_t)(8 * 100) -+#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) -+#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) -+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) -+ -+/* Min number of samples required to perform peak-rate update */ -+#define BFQ_RATE_MIN_SAMPLES 32 -+/* Min observation time interval required to perform a peak-rate update (ns) */ -+#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) -+/* Target observation time interval for a peak-rate update (ns) */ -+#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC -+ -+/* Shift used for peak rate fixed precision calculations. */ -+#define BFQ_RATE_SHIFT 16 -+ -+/* -+ * By default, BFQ computes the duration of the weight raising for -+ * interactive applications automatically, using the following formula: -+ * duration = (R / r) * T, where r is the peak rate of the device, and -+ * R and T are two reference parameters. 
-+ * In particular, R is the peak rate of the reference device (see below), -+ * and T is a reference time: given the systems that are likely to be -+ * installed on the reference device according to its speed class, T is -+ * about the maximum time needed, under BFQ and while reading two files in -+ * parallel, to load typical large applications on these systems. -+ * In practice, the slower/faster the device at hand is, the more/less it -+ * takes to load applications with respect to the reference device. -+ * Accordingly, the longer/shorter BFQ grants weight raising to interactive -+ * applications. -+ * -+ * BFQ uses four different reference pairs (R, T), depending on: -+ * . whether the device is rotational or non-rotational; -+ * . whether the device is slow, such as old or portable HDDs, as well as -+ * SD cards, or fast, such as newer HDDs and SSDs. -+ * -+ * The device's speed class is dynamically (re)detected in -+ * bfq_update_peak_rate() every time the estimated peak rate is updated. -+ * -+ * In the following definitions, R_slow[0]/R_fast[0] and -+ * T_slow[0]/T_fast[0] are the reference values for a slow/fast -+ * rotational device, whereas R_slow[1]/R_fast[1] and -+ * T_slow[1]/T_fast[1] are the reference values for a slow/fast -+ * non-rotational device. Finally, device_speed_thresh are the -+ * thresholds used to switch between speed classes. The reference -+ * rates are not the actual peak rates of the devices used as a -+ * reference, but slightly lower values. The reason for using these -+ * slightly lower values is that the peak-rate estimator tends to -+ * yield slightly lower values than the actual peak rate (it can yield -+ * the actual peak rate only if there is only one process doing I/O, -+ * and the process does sequential I/O). -+ * -+ * Both the reference peak rates and the thresholds are measured in -+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT. -+ */ -+static int R_slow[2] = {1000, 10700}; -+static int R_fast[2] = {14000, 33000}; -+/* -+ * To improve readability, a conversion function is used to initialize the -+ * following arrays, which entails that they can be initialized only in a -+ * function. -+ */ -+static int T_slow[2]; -+static int T_fast[2]; -+static int device_speed_thresh[2]; -+ -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) -+ -+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) -+ -+static void bfq_schedule_dispatch(struct bfq_data *bfqd); -+ -+#include "bfq-ioc.c" -+#include "bfq-sched.c" -+#include "bfq-cgroup-included.c" -+ -+#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -+#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) -+ -+#define bfq_sample_valid(samples) ((samples) > 80) -+ -+/* -+ * Scheduler run of queue, if there are requests pending and no one in the -+ * driver that will restart queueing. -+ */ -+static void bfq_schedule_dispatch(struct bfq_data *bfqd) -+{ -+ if (bfqd->queued != 0) { -+ bfq_log(bfqd, "schedule dispatch"); -+ kblockd_schedule_work(&bfqd->unplug_work); -+ } -+} -+ -+/* -+ * Lifted from AS - choose which of rq1 and rq2 that is best served now. -+ * We choose the request that is closesr to the head right now. Distance -+ * behind the head is penalized and only allowed to a certain extent. 
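The metric this comment describes is self-contained: forward distance counts as-is, a short backward seek is allowed but charged bfq_back_penalty times its distance, and anything farther behind than back_max is treated as wrapping. A standalone sketch, with the 16 MiB maximum (32768 sectors, since 1 KiB is 2 sectors) and the factor-2 penalty from the constants above baked into toy macros:

    #include <limits.h>
    #include <stdio.h>

    #define TOY_BACK_MAX     (16 * 1024 * 2) /* max backward seek, sectors */
    #define TOY_BACK_PENALTY 2               /* backward-distance multiplier */

    /*
     * Penalized distance from head position 'last' to sector 's', or
     * ULLONG_MAX when the seek is too far behind the head (the "wrapped"
     * case the function below distinguishes with a bit mask).
     */
    static unsigned long long toy_dist(unsigned long long s,
                                       unsigned long long last)
    {
            if (s >= last)
                    return s - last;                        /* forward */
            if (s + TOY_BACK_MAX >= last)
                    return (last - s) * TOY_BACK_PENALTY;   /* short back */
            return ULLONG_MAX;                              /* wrapped */
    }

    int main(void)
    {
            /* 100 sectors ahead beats 100 sectors behind. */
            printf("ahead: %llu, behind: %llu\n",
                   toy_dist(1100, 1000), toy_dist(900, 1000));
            return 0;
    }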
-+ */ -+static struct request *bfq_choose_req(struct bfq_data *bfqd, -+ struct request *rq1, -+ struct request *rq2, -+ sector_t last) -+{ -+ sector_t s1, s2, d1 = 0, d2 = 0; -+ unsigned long back_max; -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ -+ unsigned int wrap = 0; /* bit mask: requests behind the disk head? */ -+ -+ if (!rq1 || rq1 == rq2) -+ return rq2; -+ if (!rq2) -+ return rq1; -+ -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) -+ return rq1; -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) -+ return rq2; -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) -+ return rq1; -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) -+ return rq2; -+ -+ s1 = blk_rq_pos(rq1); -+ s2 = blk_rq_pos(rq2); -+ -+ /* -+ * By definition, 1KiB is 2 sectors. -+ */ -+ back_max = bfqd->bfq_back_max * 2; -+ -+ /* -+ * Strict one way elevator _except_ in the case where we allow -+ * short backward seeks which are biased as twice the cost of a -+ * similar forward seek. -+ */ -+ if (s1 >= last) -+ d1 = s1 - last; -+ else if (s1 + back_max >= last) -+ d1 = (last - s1) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ1_WRAP; -+ -+ if (s2 >= last) -+ d2 = s2 - last; -+ else if (s2 + back_max >= last) -+ d2 = (last - s2) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ2_WRAP; -+ -+ /* Found required data */ -+ -+ /* -+ * By doing switch() on the bit mask "wrap" we avoid having to -+ * check two variables for all permutations: --> faster! -+ */ -+ switch (wrap) { -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ -+ if (d1 < d2) -+ return rq1; -+ else if (d2 < d1) -+ return rq2; -+ -+ if (s1 >= s2) -+ return rq1; -+ else -+ return rq2; -+ -+ case BFQ_RQ2_WRAP: -+ return rq1; -+ case BFQ_RQ1_WRAP: -+ return rq2; -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ -+ default: -+ /* -+ * Since both rqs are wrapped, -+ * start with the one that's further behind head -+ * (--> only *one* back seek required), -+ * since back seek takes more time than forward. -+ */ -+ if (s1 <= s2) -+ return rq1; -+ else -+ return rq2; -+ } -+} -+ -+static struct bfq_queue * -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, -+ sector_t sector, struct rb_node **ret_parent, -+ struct rb_node ***rb_link) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *bfqq = NULL; -+ -+ parent = NULL; -+ p = &root->rb_node; -+ while (*p) { -+ struct rb_node **n; -+ -+ parent = *p; -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ -+ /* -+ * Sort strictly based on sector. Smallest to the left, -+ * largest to the right. -+ */ -+ if (sector > blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_right; -+ else if (sector < blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_left; -+ else -+ break; -+ p = n; -+ bfqq = NULL; -+ } -+ -+ *ret_parent = parent; -+ if (rb_link) -+ *rb_link = p; -+ -+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", -+ (unsigned long long) sector, -+ bfqq ? 
bfqq->pid : 0); -+ -+ return bfqq; -+} -+ -+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *__bfqq; -+ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ -+ if (bfq_class_idle(bfqq)) -+ return; -+ if (!bfqq->next_rq) -+ return; -+ -+ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, -+ blk_rq_pos(bfqq->next_rq), &parent, &p); -+ if (!__bfqq) { -+ rb_link_node(&bfqq->pos_node, parent, p); -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); -+ } else -+ bfqq->pos_root = NULL; -+} -+ -+/* -+ * Tell whether there are active queues or groups with differentiated weights. -+ */ -+static bool bfq_differentiated_weights(struct bfq_data *bfqd) -+{ -+ /* -+ * For weights to differ, at least one of the trees must contain -+ * at least two nodes. -+ */ -+ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && -+ (bfqd->queue_weights_tree.rb_node->rb_left || -+ bfqd->queue_weights_tree.rb_node->rb_right) -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ ) || -+ (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && -+ (bfqd->group_weights_tree.rb_node->rb_left || -+ bfqd->group_weights_tree.rb_node->rb_right) -+#endif -+ ); -+} -+ -+/* -+ * The following function returns true if every queue must receive the -+ * same share of the throughput (this condition is used when deciding -+ * whether idling may be disabled, see the comments in the function -+ * bfq_bfqq_may_idle()). -+ * -+ * Such a scenario occurs when: -+ * 1) all active queues have the same weight, -+ * 2) all active groups at the same level in the groups tree have the same -+ * weight, -+ * 3) all active groups at the same level in the groups tree have the same -+ * number of children. -+ * -+ * Unfortunately, keeping the necessary state for evaluating exactly the -+ * above symmetry conditions would be quite complex and time-consuming. -+ * Therefore this function evaluates, instead, the following stronger -+ * sub-conditions, for which it is much easier to maintain the needed -+ * state: -+ * 1) all active queues have the same weight, -+ * 2) all active groups have the same weight, -+ * 3) all active groups have at most one active child each. -+ * In particular, the last two conditions are always true if hierarchical -+ * support and the cgroups interface are not enabled, thus no state needs -+ * to be maintained in this case. -+ */ -+static bool bfq_symmetric_scenario(struct bfq_data *bfqd) -+{ -+ return !bfq_differentiated_weights(bfqd); -+} -+ -+/* -+ * If the weight-counter tree passed as input contains no counter for -+ * the weight of the input entity, then add that counter; otherwise just -+ * increment the existing counter. -+ * -+ * Note that weight-counter trees contain few nodes in mostly symmetric -+ * scenarios. For example, if all queues have the same weight, then the -+ * weight-counter tree for the queues may contain at most one node. -+ * This holds even if low_latency is on, because weight-raised queues -+ * are not inserted in the tree. -+ * In most scenarios, the rate at which nodes are created/destroyed -+ * should be low too. 
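The "at least two distinct weights" test used by bfq_differentiated_weights() above needs no traversal at all: a search tree holds two or more nodes exactly when it is non-empty and its root has at least one child. A toy version of just that test, with a plain binary-tree node standing in for struct rb_node:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* A toy binary-tree node, standing in for struct rb_node. */
    struct toy_node { struct toy_node *left, *right; };

    /* Non-empty tree whose root has a child => at least two nodes. */
    static bool toy_has_two_nodes(const struct toy_node *root)
    {
            return root && (root->left || root->right);
    }

    int main(void)
    {
            struct toy_node leaf = { NULL, NULL };
            struct toy_node root = { &leaf, NULL };

            printf("%d %d\n", toy_has_two_nodes(&leaf),
                   toy_has_two_nodes(&root));
            return 0;
    }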
-+ */ -+static void bfq_weights_tree_add(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root) -+{ -+ struct rb_node **new = &(root->rb_node), *parent = NULL; -+ -+ /* -+ * Do not insert if the entity is already associated with a -+ * counter, which happens if: -+ * 1) the entity is associated with a queue, -+ * 2) a request arrival has caused the queue to become both -+ * non-weight-raised, and hence change its weight, and -+ * backlogged; in this respect, each of the two events -+ * causes an invocation of this function, -+ * 3) this is the invocation of this function caused by the -+ * second event. This second invocation is actually useless, -+ * and we handle this fact by exiting immediately. More -+ * efficient or clearer solutions might possibly be adopted. -+ */ -+ if (entity->weight_counter) -+ return; -+ -+ while (*new) { -+ struct bfq_weight_counter *__counter = container_of(*new, -+ struct bfq_weight_counter, -+ weights_node); -+ parent = *new; -+ -+ if (entity->weight == __counter->weight) { -+ entity->weight_counter = __counter; -+ goto inc_counter; -+ } -+ if (entity->weight < __counter->weight) -+ new = &((*new)->rb_left); -+ else -+ new = &((*new)->rb_right); -+ } -+ -+ entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), -+ GFP_ATOMIC); -+ -+ /* -+ * In the unlucky event of an allocation failure, we just -+ * exit. This will cause the weight of entity to not be -+ * considered in bfq_differentiated_weights, which, in its -+ * turn, causes the scenario to be deemed wrongly symmetric in -+ * case entity's weight would have been the only weight making -+ * the scenario asymmetric. On the bright side, no unbalance -+ * will however occur when entity becomes inactive again (the -+ * invocation of this function is triggered by an activation -+ * of entity). In fact, bfq_weights_tree_remove does nothing -+ * if !entity->weight_counter. -+ */ -+ if (unlikely(!entity->weight_counter)) -+ return; -+ -+ entity->weight_counter->weight = entity->weight; -+ rb_link_node(&entity->weight_counter->weights_node, parent, new); -+ rb_insert_color(&entity->weight_counter->weights_node, root); -+ -+inc_counter: -+ entity->weight_counter->num_active++; -+} -+ -+/* -+ * Decrement the weight counter associated with the entity, and, if the -+ * counter reaches 0, remove the counter from the tree. -+ * See the comments to the function bfq_weights_tree_add() for considerations -+ * about overhead. -+ */ -+static void bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root) -+{ -+ if (!entity->weight_counter) -+ return; -+ -+ BUG_ON(RB_EMPTY_ROOT(root)); -+ BUG_ON(entity->weight_counter->weight != entity->weight); -+ -+ BUG_ON(!entity->weight_counter->num_active); -+ entity->weight_counter->num_active--; -+ if (entity->weight_counter->num_active > 0) -+ goto reset_entity_pointer; -+ -+ rb_erase(&entity->weight_counter->weights_node, root); -+ kfree(entity->weight_counter); -+ -+reset_entity_pointer: -+ entity->weight_counter = NULL; -+} -+ -+/* -+ * Return expired entry, or NULL to just start from scratch in rbtree. 
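A userspace rendition of the FIFO-expiry comparison that bfq_check_fifo() below performs against ktime_get_ns(); the 1/4 s sync deadline in the usage mirrors bfq_fifo_expire[0] above:

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    /* Monotonic clock in ns, a userspace stand-in for ktime_get_ns(). */
    static uint64_t toy_now_ns(void)
    {
            struct timespec ts;

            clock_gettime(CLOCK_MONOTONIC, &ts);
            return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
    }

    /*
     * A request is FIFO-expired once its deadline (enqueue time plus the
     * per-direction fifo_expire interval) is in the past, i.e. the
     * negation of the "ktime_get_ns() < rq->fifo_time" check below.
     */
    static int toy_fifo_expired(uint64_t fifo_time_ns)
    {
            return toy_now_ns() >= fifo_time_ns;
    }

    int main(void)
    {
            uint64_t deadline = toy_now_ns() + 250000000ull; /* sync: 1/4 s */

            printf("expired now? %d\n", toy_fifo_expired(deadline));
            return 0;
    }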
-+ */ -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct request *rq; -+ -+ if (bfq_bfqq_fifo_expire(bfqq)) -+ return NULL; -+ -+ bfq_mark_bfqq_fifo_expire(bfqq); -+ -+ rq = rq_entry_fifo(bfqq->fifo.next); -+ -+ if (rq == last || ktime_get_ns() < rq->fifo_time) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); -+ BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); -+ return rq; -+} -+ -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct rb_node *rbnext = rb_next(&last->rb_node); -+ struct rb_node *rbprev = rb_prev(&last->rb_node); -+ struct request *next, *prev = NULL; -+ -+ BUG_ON(list_empty(&bfqq->fifo)); -+ -+ /* Follow expired path, else get first next available. */ -+ next = bfq_check_fifo(bfqq, last); -+ if (next) { -+ BUG_ON(next == last); -+ return next; -+ } -+ -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); -+ -+ if (rbprev) -+ prev = rb_entry_rq(rbprev); -+ -+ if (rbnext) -+ next = rb_entry_rq(rbnext); -+ else { -+ rbnext = rb_first(&bfqq->sort_list); -+ if (rbnext && rbnext != &last->rb_node) -+ next = rb_entry_rq(rbnext); -+ } -+ -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); -+} -+ -+/* see the definition of bfq_async_charge_factor for details */ -+static unsigned long bfq_serv_to_charge(struct request *rq, -+ struct bfq_queue *bfqq) -+{ -+ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) -+ return blk_rq_sectors(rq); -+ -+ /* -+ * If there are no weight-raised queues, then amplify service -+ * by just the async charge factor; otherwise amplify service -+ * by twice the async charge factor, to further reduce latency -+ * for weight-raised queues. -+ */ -+ if (bfqq->bfqd->wr_busy_queues == 0) -+ return blk_rq_sectors(rq) * bfq_async_charge_factor; -+ -+ return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; -+} -+ -+/** -+ * bfq_updated_next_req - update the queue after a new next_rq selection. -+ * @bfqd: the device data the queue belongs to. -+ * @bfqq: the queue to update. -+ * -+ * If the first request of a queue changes we make sure that the queue -+ * has enough budget to serve at least its first request (if the -+ * request has grown). We do this because if the queue has not enough -+ * budget for its first request, it has to go through two dispatch -+ * rounds to actually get it dispatched. -+ */ -+static void bfq_updated_next_req(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ struct request *next_rq = bfqq->next_rq; -+ unsigned long new_budget; -+ -+ if (!next_rq) -+ return; -+ -+ if (bfqq == bfqd->in_service_queue) -+ /* -+ * In order not to break guarantees, budgets cannot be -+ * changed after an entity has been selected. -+ */ -+ return; -+ -+ BUG_ON(entity->tree != &st->active); -+ BUG_ON(entity == entity->sched_data->in_service_entity); -+ -+ new_budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ if (entity->budget != new_budget) { -+ entity->budget = new_budget; -+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", -+ new_budget); -+ bfq_requeue_bfqq(bfqd, bfqq); -+ } -+} -+ -+static unsigned int bfq_wr_duration(struct bfq_data *bfqd) -+{ -+ u64 dur; -+ -+ if (bfqd->bfq_wr_max_time > 0) -+ return bfqd->bfq_wr_max_time; -+ -+ dur = bfqd->RT_prod; -+ do_div(dur, bfqd->peak_rate); -+ -+ /* -+ * Limit duration between 3 and 13 seconds. 
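Putting the duration = (R / r) * T rule and the clamp just mentioned together, a toy fixed-point version working in milliseconds; the reference-pair values in the usage are hypothetical, in the spirit of the R_fast/T_fast tables above:

    #include <stdio.h>

    #define TOY_RATE_SHIFT 16

    /*
     * Scale a reference load time T by how much slower the device at
     * hand (peak rate r) is than the reference device (peak rate R),
     * then clamp to the [3 s, 13 s] window discussed above.
     */
    static unsigned toy_wr_duration_ms(unsigned long long r_ref,
                                       unsigned long long t_ref_ms,
                                       unsigned long long peak_rate)
    {
            unsigned long long dur = r_ref * t_ref_ms / peak_rate;

            if (dur > 13000)
                    dur = 13000;
            else if (dur < 3000)
                    dur = 3000;
            return (unsigned)dur;
    }

    int main(void)
    {
            /* Hypothetical reference pair (R, T) for a fast device. */
            unsigned long long R = 14000ull << TOY_RATE_SHIFT, T = 5000;

            printf("slow disk: %u ms\n",
                   toy_wr_duration_ms(R, T, 7000ull << TOY_RATE_SHIFT));
            printf("fast SSD:  %u ms\n",
                   toy_wr_duration_ms(R, T, 60000ull << TOY_RATE_SHIFT));
            return 0;
    }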
Tests show that
-+ * values higher than 13 seconds often yield the opposite of
-+ * the desired result, i.e., worsen responsiveness by letting
-+ * non-interactive and non-soft-real-time applications
-+ * preserve weight raising for too long a time interval.
-+ *
-+ * On the other hand, values lower than 3 seconds make it
-+ * difficult for most interactive tasks to complete their jobs
-+ * before weight-raising finishes.
-+ */
-+ if (dur > msecs_to_jiffies(13000))
-+ dur = msecs_to_jiffies(13000);
-+ else if (dur < msecs_to_jiffies(3000))
-+ dur = msecs_to_jiffies(3000);
-+
-+ return dur;
-+}
-+
-+static void
-+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
-+ struct bfq_io_cq *bic, bool bfq_already_existing)
-+{
-+ unsigned int old_wr_coeff;
-+ bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq);
-+
-+ if (bic->saved_idle_window)
-+ bfq_mark_bfqq_idle_window(bfqq);
-+ else
-+ bfq_clear_bfqq_idle_window(bfqq);
-+
-+ if (bic->saved_IO_bound)
-+ bfq_mark_bfqq_IO_bound(bfqq);
-+ else
-+ bfq_clear_bfqq_IO_bound(bfqq);
-+
-+ if (unlikely(busy))
-+ old_wr_coeff = bfqq->wr_coeff;
-+
-+ bfqq->wr_coeff = bic->saved_wr_coeff;
-+ bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
-+ BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt));
-+ bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
-+ bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
-+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
-+
-+ if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
-+ time_is_before_jiffies(bfqq->last_wr_start_finish +
-+ bfqq->wr_cur_max_time))) {
-+ bfq_log_bfqq(bfqq->bfqd, bfqq,
-+ "resume state: switching off wr (%lu + %lu < %lu)",
-+ bfqq->last_wr_start_finish, bfqq->wr_cur_max_time,
-+ jiffies);
-+
-+ bfqq->wr_coeff = 1;
-+ }
-+
-+ /* make sure the weight is updated, no matter how we got here */
-+ bfqq->entity.prio_changed = 1;
-+
-+ if (likely(!busy))
-+ return;
-+
-+ if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) {
-+ bfqd->wr_busy_queues++;
-+ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues);
-+ } else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) {
-+ bfqd->wr_busy_queues--;
-+ BUG_ON(bfqd->wr_busy_queues < 0);
-+ }
-+}
-+
-+static int bfqq_process_refs(struct bfq_queue *bfqq)
-+{
-+ int process_refs, io_refs;
-+
-+ lockdep_assert_held(bfqq->bfqd->queue->queue_lock);
-+
-+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
-+ process_refs = bfqq->ref - io_refs - bfqq->entity.on_st;
-+ BUG_ON(process_refs < 0);
-+ return process_refs;
-+}
-+
-+/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */
-+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-+{
-+ struct bfq_queue *item;
-+ struct hlist_node *n;
-+
-+ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
-+ hlist_del_init(&item->burst_list_node);
-+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
-+ bfqd->burst_size = 1;
-+ bfqd->burst_parent_entity = bfqq->entity.parent;
-+}
-+
-+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
-+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-+{
-+ /* Increment burst size to take into account also bfqq */
-+ bfqd->burst_size++;
-+
-+ bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size);
-+
-+ BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh);
-+
-+ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
-+ struct bfq_queue *pos, *bfqq_item;
-+ struct hlist_node *n;
-+
-+ /*
-+ *
Enough queues have been activated shortly after each -+ * other to consider this burst as large. -+ */ -+ bfqd->large_burst = true; -+ bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); -+ -+ /* -+ * We can now mark all queues in the burst list as -+ * belonging to a large burst. -+ */ -+ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, -+ burst_list_node) { -+ bfq_mark_bfqq_in_large_burst(bfqq_item); -+ bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); -+ } -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); -+ -+ /* -+ * From now on, and until the current burst finishes, any -+ * new queue being activated shortly after the last queue -+ * was inserted in the burst can be immediately marked as -+ * belonging to a large burst. So the burst list is not -+ * needed any more. Remove it. -+ */ -+ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, -+ burst_list_node) -+ hlist_del_init(&pos->burst_list_node); -+ } else /* -+ * Burst not yet large: add bfqq to the burst list. Do -+ * not increment the ref counter for bfqq, because bfqq -+ * is removed from the burst list before freeing bfqq -+ * in put_queue. -+ */ -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+} -+ -+/* -+ * If many queues belonging to the same group happen to be created -+ * shortly after each other, then the processes associated with these -+ * queues have typically a common goal. In particular, bursts of queue -+ * creations are usually caused by services or applications that spawn -+ * many parallel threads/processes. Examples are systemd during boot, -+ * or git grep. To help these processes get their job done as soon as -+ * possible, it is usually better to not grant either weight-raising -+ * or device idling to their queues. -+ * -+ * In this comment we describe, firstly, the reasons why this fact -+ * holds, and, secondly, the next function, which implements the main -+ * steps needed to properly mark these queues so that they can then be -+ * treated in a different way. -+ * -+ * The above services or applications benefit mostly from a high -+ * throughput: the quicker the requests of the activated queues are -+ * cumulatively served, the sooner the target job of these queues gets -+ * completed. As a consequence, weight-raising any of these queues, -+ * which also implies idling the device for it, is almost always -+ * counterproductive. In most cases it just lowers throughput. -+ * -+ * On the other hand, a burst of queue creations may be caused also by -+ * the start of an application that does not consist of a lot of -+ * parallel I/O-bound threads. In fact, with a complex application, -+ * several short processes may need to be executed to start-up the -+ * application. In this respect, to start an application as quickly as -+ * possible, the best thing to do is in any case to privilege the I/O -+ * related to the application with respect to all other -+ * I/O. Therefore, the best strategy to start as quickly as possible -+ * an application that causes a burst of queue creations is to -+ * weight-raise all the queues created during the burst. This is the -+ * exact opposite of the best strategy for the other type of bursts. -+ * -+ * In the end, to take the best action for each of the two cases, the -+ * two types of bursts need to be distinguished. Fortunately, this -+ * seems relatively easy, by looking at the sizes of the bursts. 
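Stripped of the list bookkeeping, the detector this comment goes on to describe is a counter with a timeout. A toy model; the "shortly after" interval and the large-burst threshold of 11 used here are illustrative assumptions (the real values live in bfqd->bfq_burst_interval and bfqd->bfq_large_burst_thresh):

    #include <stdio.h>

    #define TOY_BURST_INTERVAL_MS  180  /* assumed max gap within a burst */
    #define TOY_LARGE_BURST_THRESH 11   /* assumed large-burst threshold */

    struct toy_burst {
            long last_activation_ms;
            int size;
            int large;
    };

    /* Feed queue-activation times in; flag when a burst becomes large. */
    static void toy_activation(struct toy_burst *b, long now_ms)
    {
            if (now_ms - b->last_activation_ms > TOY_BURST_INTERVAL_MS) {
                    b->size = 1;    /* gap too long: a new, small burst */
                    b->large = 0;
            } else if (++b->size >= TOY_LARGE_BURST_THRESH) {
                    b->large = 1;
            }
            b->last_activation_ms = now_ms;
    }

    int main(void)
    {
            struct toy_burst b = { -1000, 0, 0 };
            long t;

            for (t = 0; t < 1200; t += 100) /* 12 activations, 100 ms apart */
                    toy_activation(&b, t);
            printf("size=%d large=%d\n", b.size, b.large);
            return 0;
    }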
In -+ * particular, we found a threshold such that only bursts with a -+ * larger size than that threshold are apparently caused by -+ * services or commands such as systemd or git grep. For brevity, -+ * hereafter we call just 'large' these bursts. BFQ *does not* -+ * weight-raise queues whose creation occurs in a large burst. In -+ * addition, for each of these queues BFQ performs or does not perform -+ * idling depending on which choice boosts the throughput more. The -+ * exact choice depends on the device and request pattern at -+ * hand. -+ * -+ * Unfortunately, false positives may occur while an interactive task -+ * is starting (e.g., an application is being started). The -+ * consequence is that the queues associated with the task do not -+ * enjoy weight raising as expected. Fortunately these false positives -+ * are very rare. They typically occur if some service happens to -+ * start doing I/O exactly when the interactive task starts. -+ * -+ * Turning back to the next function, it implements all the steps -+ * needed to detect the occurrence of a large burst and to properly -+ * mark all the queues belonging to it (so that they can then be -+ * treated in a different way). This goal is achieved by maintaining a -+ * "burst list" that holds, temporarily, the queues that belong to the -+ * burst in progress. The list is then used to mark these queues as -+ * belonging to a large burst if the burst does become large. The main -+ * steps are the following. -+ * -+ * . when the very first queue is created, the queue is inserted into the -+ * list (as it could be the first queue in a possible burst) -+ * -+ * . if the current burst has not yet become large, and a queue Q that does -+ * not yet belong to the burst is activated shortly after the last time -+ * at which a new queue entered the burst list, then the function appends -+ * Q to the burst list -+ * -+ * . if, as a consequence of the previous step, the burst size reaches -+ * the large-burst threshold, then -+ * -+ * . all the queues in the burst list are marked as belonging to a -+ * large burst -+ * -+ * . the burst list is deleted; in fact, the burst list already served -+ * its purpose (keeping temporarily track of the queues in a burst, -+ * so as to be able to mark them as belonging to a large burst in the -+ * previous sub-step), and now is not needed any more -+ * -+ * . the device enters a large-burst mode -+ * -+ * . if a queue Q that does not belong to the burst is created while -+ * the device is in large-burst mode and shortly after the last time -+ * at which a queue either entered the burst list or was marked as -+ * belonging to the current large burst, then Q is immediately marked -+ * as belonging to a large burst. -+ * -+ * . if a queue Q that does not belong to the burst is created a while -+ * later, i.e., not shortly after, than the last time at which a queue -+ * either entered the burst list or was marked as belonging to the -+ * current large burst, then the current burst is deemed as finished and: -+ * -+ * . the large-burst mode is reset if set -+ * -+ * . the burst list is emptied -+ * -+ * . Q is inserted in the burst list, as Q may be the first queue -+ * in a possible new burst (then the burst list contains just Q -+ * after this step). -+ */ -+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ /* -+ * If bfqq is already in the burst list or is part of a large -+ * burst, or finally has just been split, then there is -+ * nothing else to do. 
-+ */ -+ if (!hlist_unhashed(&bfqq->burst_list_node) || -+ bfq_bfqq_in_large_burst(bfqq) || -+ time_is_after_eq_jiffies(bfqq->split_time + -+ msecs_to_jiffies(10))) -+ return; -+ -+ /* -+ * If bfqq's creation happens late enough, or bfqq belongs to -+ * a different group than the burst group, then the current -+ * burst is finished, and related data structures must be -+ * reset. -+ * -+ * In this respect, consider the special case where bfqq is -+ * the very first queue created after BFQ is selected for this -+ * device. In this case, last_ins_in_burst and -+ * burst_parent_entity are not yet significant when we get -+ * here. But it is easy to verify that, whether or not the -+ * following condition is true, bfqq will end up being -+ * inserted into the burst list. In particular the list will -+ * happen to contain only bfqq. And this is exactly what has -+ * to happen, as bfqq may be the first queue of the first -+ * burst. -+ */ -+ if (time_is_before_jiffies(bfqd->last_ins_in_burst + -+ bfqd->bfq_burst_interval) || -+ bfqq->entity.parent != bfqd->burst_parent_entity) { -+ bfqd->large_burst = false; -+ bfq_reset_burst_list(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "handle_burst: late activation or different group"); -+ goto end; -+ } -+ -+ /* -+ * If we get here, then bfqq is being activated shortly after the -+ * last queue. So, if the current burst is also large, we can mark -+ * bfqq as belonging to this large burst immediately. -+ */ -+ if (bfqd->large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ goto end; -+ } -+ -+ /* -+ * If we get here, then a large-burst state has not yet been -+ * reached, but bfqq is being activated shortly after the last -+ * queue. Then we add bfqq to the burst. -+ */ -+ bfq_add_to_burst(bfqd, bfqq); -+end: -+ /* -+ * At this point, bfqq either has been added to the current -+ * burst or has caused the current burst to terminate and a -+ * possible new burst to start. In particular, in the second -+ * case, bfqq has become the first queue in the possible new -+ * burst. In both cases last_ins_in_burst needs to be moved -+ * forward. -+ */ -+ bfqd->last_ins_in_burst = jiffies; -+ -+} -+ -+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ return entity->budget - entity->service; -+} -+ -+/* -+ * If enough samples have been computed, return the current max budget -+ * stored in bfqd, which is dynamically updated according to the -+ * estimated disk peak rate; otherwise return the default max budget -+ */ -+static int bfq_max_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget; -+ else -+ return bfqd->bfq_max_budget; -+} -+ -+/* -+ * Return min budget, which is a fraction of the current or default -+ * max budget (trying with 1/32) -+ */ -+static int bfq_min_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget / 32; -+ else -+ return bfqd->bfq_max_budget / 32; -+} -+ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason); -+ -+/* -+ * The next function, invoked after the input queue bfqq switches from -+ * idle to busy, updates the budget of bfqq. The function also tells -+ * whether the in-service queue should be expired, by returning -+ * true. 
The purpose of expiring the in-service queue is to give bfqq -+ * the chance to possibly preempt the in-service queue, and the reason -+ * for preempting the in-service queue is to achieve one of the two -+ * goals below. -+ * -+ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has -+ * expired because it has remained idle. In particular, bfqq may have -+ * expired for one of the following two reasons: -+ * -+ * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and -+ * did not make it to issue a new request before its last request -+ * was served; -+ * -+ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue -+ * a new request before the expiration of the idling-time. -+ * -+ * Even if bfqq has expired for one of the above reasons, the process -+ * associated with the queue may be however issuing requests greedily, -+ * and thus be sensitive to the bandwidth it receives (bfqq may have -+ * remained idle for other reasons: CPU high load, bfqq not enjoying -+ * idling, I/O throttling somewhere in the path from the process to -+ * the I/O scheduler, ...). But if, after every expiration for one of -+ * the above two reasons, bfqq has to wait for the service of at least -+ * one full budget of another queue before being served again, then -+ * bfqq is likely to get a much lower bandwidth or resource time than -+ * its reserved ones. To address this issue, two countermeasures need -+ * to be taken. -+ * -+ * First, the budget and the timestamps of bfqq need to be updated in -+ * a special way on bfqq reactivation: they need to be updated as if -+ * bfqq did not remain idle and did not expire. In fact, if they are -+ * computed as if bfqq expired and remained idle until reactivation, -+ * then the process associated with bfqq is treated as if, instead of -+ * being greedy, it stopped issuing requests when bfqq remained idle, -+ * and restarts issuing requests only on this reactivation. In other -+ * words, the scheduler does not help the process recover the "service -+ * hole" between bfqq expiration and reactivation. As a consequence, -+ * the process receives a lower bandwidth than its reserved one. In -+ * contrast, to recover this hole, the budget must be updated as if -+ * bfqq was not expired at all before this reactivation, i.e., it must -+ * be set to the value of the remaining budget when bfqq was -+ * expired. Along the same line, timestamps need to be assigned the -+ * value they had the last time bfqq was selected for service, i.e., -+ * before last expiration. Thus timestamps need to be back-shifted -+ * with respect to their normal computation (see [1] for more details -+ * on this tricky aspect). -+ * -+ * Secondly, to allow the process to recover the hole, the in-service -+ * queue must be expired too, to give bfqq the chance to preempt it -+ * immediately. In fact, if bfqq has to wait for a full budget of the -+ * in-service queue to be completed, then it may become impossible to -+ * let the process recover the hole, even if the back-shifted -+ * timestamps of bfqq are lower than those of the in-service queue. If -+ * this happens for most or all of the holes, then the process may not -+ * receive its reserved bandwidth. In this respect, it is worth noting -+ * that, being the service of outstanding requests unpreemptible, a -+ * little fraction of the holes may however be unrecoverable, thereby -+ * causing a little loss of bandwidth. -+ * -+ * The last important point is detecting whether bfqq does need this -+ * bandwidth recovery. 
In this respect, the next function deems the -+ * process associated with bfqq greedy, and thus allows it to recover -+ * the hole, if: 1) the process is waiting for the arrival of a new -+ * request (which implies that bfqq expired for one of the above two -+ * reasons), and 2) such a request has arrived soon. The first -+ * condition is controlled through the flag non_blocking_wait_rq, -+ * while the second through the flag arrived_in_time. If both -+ * conditions hold, then the function computes the budget in the -+ * above-described special way, and signals that the in-service queue -+ * should be expired. Timestamp back-shifting is done later in -+ * __bfq_activate_entity. -+ * -+ * 2. Reduce latency. Even if timestamps are not backshifted to let -+ * the process associated with bfqq recover a service hole, bfqq may -+ * however happen to have, after being (re)activated, a lower finish -+ * timestamp than the in-service queue. That is, the next budget of -+ * bfqq may have to be completed before the one of the in-service -+ * queue. If this is the case, then preempting the in-service queue -+ * allows this goal to be achieved, apart from the unpreemptible, -+ * outstanding requests mentioned above. -+ * -+ * Unfortunately, regardless of which of the above two goals one wants -+ * to achieve, service trees need first to be updated to know whether -+ * the in-service queue must be preempted. To have service trees -+ * correctly updated, the in-service queue must be expired and -+ * rescheduled, and bfqq must be scheduled too. This is one of the -+ * most costly operations (in future versions, the scheduling -+ * mechanism may be re-designed in such a way to make it possible to -+ * know whether preemption is needed without needing to update service -+ * trees). In addition, queue preemptions almost always cause random -+ * I/O, and thus loss of throughput. Because of these facts, the next -+ * function adopts the following simple scheme to avoid both costly -+ * operations and too frequent preemptions: it requests the expiration -+ * of the in-service queue (unconditionally) only for queues that need -+ * to recover a hole, or that either are weight-raised or deserve to -+ * be weight-raised. -+ */ -+static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool arrived_in_time, -+ bool wr_or_deserves_wr) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { -+ /* -+ * We do not clear the flag non_blocking_wait_rq here, as -+ * the latter is used in bfq_activate_bfqq to signal -+ * that timestamps need to be back-shifted (and is -+ * cleared right after). -+ */ -+ -+ /* -+ * In next assignment we rely on that either -+ * entity->service or entity->budget are not updated -+ * on expiration if bfqq is empty (see -+ * __bfq_bfqq_recalc_budget). Thus both quantities -+ * remain unchanged after such an expiration, and the -+ * following statement therefore assigns to -+ * entity->budget the remaining budget on such an -+ * expiration. For clarity, entity->service is not -+ * updated on expiration in any case, and, in normal -+ * operation, is reset only when bfqq is selected for -+ * service (see bfq_get_next_queue). 
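The two budget assignments this comment leads up to condense into one userspace helper. A sketch: the 16 Ki-sector cap mirrors bfq_default_max_budget above, and recover_hole stands for the non_blocking_wait_rq && arrived_in_time condition:

    #include <stdio.h>

    #define TOY_MAX_BUDGET 16384    /* sectors, as bfq_default_max_budget */

    /*
     * On wakeup, a queue that was waiting for a request and got one in
     * time keeps its leftover budget (budget - service), capped by the
     * per-queue maximum, so it can recover its service hole; any other
     * queue simply gets at least enough budget for its next request.
     */
    static long toy_budget_on_activation(long budget, long service,
                                         long next_rq_charge,
                                         int recover_hole)
    {
            if (recover_hole) {
                    long left = budget - service;

                    return left < TOY_MAX_BUDGET ? left : TOY_MAX_BUDGET;
            }
            return next_rq_charge > TOY_MAX_BUDGET ?
                    next_rq_charge : TOY_MAX_BUDGET;
    }

    int main(void)
    {
            printf("recovering: %ld\n",
                   toy_budget_on_activation(8192, 2048, 512, 1));
            printf("fresh:      %ld\n",
                   toy_budget_on_activation(8192, 2048, 512, 0));
            return 0;
    }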
-+ */ -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = min_t(unsigned long, -+ bfq_bfqq_budget_left(bfqq), -+ bfqq->max_budget); -+ -+ BUG_ON(entity->budget < 0); -+ return true; -+ } -+ -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(bfqq->next_rq, bfqq)); -+ BUG_ON(entity->budget < 0); -+ -+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); -+ return wr_or_deserves_wr; -+} -+ -+static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ unsigned int old_wr_coeff, -+ bool wr_or_deserves_wr, -+ bool interactive, -+ bool in_burst, -+ bool soft_rt) -+{ -+ if (old_wr_coeff == 1 && wr_or_deserves_wr) { -+ /* start a weight-raising period */ -+ if (interactive) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else { -+ bfqq->wr_start_at_switch_to_srt = jiffies; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ } -+ /* -+ * If needed, further reduce budget to make sure it is -+ * close to bfqq's backlog, so as to reduce the -+ * scheduling-error component due to a too large -+ * budget. Do not care about throughput consequences, -+ * but only about latency. Finally, do not assign a -+ * too small budget either, to avoid increasing -+ * latency by causing too frequent expirations. -+ */ -+ bfqq->entity.budget = min_t(unsigned long, -+ bfqq->entity.budget, -+ 2 * bfq_min_budget(bfqd)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais starting at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } else if (old_wr_coeff > 1) { -+ if (interactive) { /* update wr coeff and duration */ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else if (in_burst) { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais ending at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq-> -+ wr_cur_max_time)); -+ } else if (soft_rt) { -+ /* -+ * The application is now or still meeting the -+ * requirements for being deemed soft rt. We -+ * can then correctly and safely (re)charge -+ * the weight-raising duration for the -+ * application with the weight-raising -+ * duration for soft rt applications. -+ * -+ * In particular, doing this recharge now, i.e., -+ * before the weight-raising period for the -+ * application finishes, reduces the probability -+ * of the following negative scenario: -+ * 1) the weight of a soft rt application is -+ * raised at startup (as for any newly -+ * created application), -+ * 2) since the application is not interactive, -+ * at a certain time weight-raising is -+ * stopped for the application, -+ * 3) at that time the application happens to -+ * still have pending requests, and hence -+ * is destined to not have a chance to be -+ * deemed soft rt before these requests are -+ * completed (see the comments to the -+ * function bfq_bfqq_softrt_next_start() -+ * for details on soft rt detection), -+ * 4) these pending requests experience a high -+ * latency because the application is not -+ * weight-raised while they are pending. 
-+ */ -+ if (bfqq->wr_cur_max_time != -+ bfqd->bfq_wr_rt_max_time) { -+ bfqq->wr_start_at_switch_to_srt = -+ bfqq->last_wr_start_finish; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfq_log_bfqq(bfqd, bfqq, -+ "switching to soft_rt wr"); -+ } else -+ bfq_log_bfqq(bfqd, bfqq, -+ "moving forward soft_rt wr duration"); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+} -+ -+static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ return bfqq->dispatched == 0 && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ bfqd->bfq_wr_min_idle_time); -+} -+ -+static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ int old_wr_coeff, -+ struct request *rq, -+ bool *interactive) -+{ -+ bool soft_rt, in_burst, wr_or_deserves_wr, -+ bfqq_wants_to_preempt, -+ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), -+ /* -+ * See the comments on -+ * bfq_bfqq_update_budg_for_activation for -+ * details on the usage of the next variable. -+ */ -+ arrived_in_time = ktime_get_ns() <= -+ RQ_BIC(rq)->ttime.last_end_request + -+ bfqd->bfq_slice_idle * 3; -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request non-busy: " -+ "jiffies %lu, in_time %d, idle_long %d busyw %d " -+ "wr_coeff %u", -+ jiffies, arrived_in_time, -+ idle_for_long_time, -+ bfq_bfqq_non_blocking_wait_rq(bfqq), -+ old_wr_coeff); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); -+ -+ /* -+ * bfqq deserves to be weight-raised if: -+ * - it is sync, -+ * - it does not belong to a large burst, -+ * - it has been idle for enough time or is soft real-time, -+ * - is linked to a bfq_io_cq (it is not shared in any sense) -+ */ -+ in_burst = bfq_bfqq_in_large_burst(bfqq); -+ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && -+ !in_burst && -+ time_is_before_jiffies(bfqq->soft_rt_next_start); -+ *interactive = -+ !in_burst && -+ idle_for_long_time; -+ wr_or_deserves_wr = bfqd->low_latency && -+ (bfqq->wr_coeff > 1 || -+ (bfq_bfqq_sync(bfqq) && -+ bfqq->bic && (*interactive || soft_rt))); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request: " -+ "in_burst %d, " -+ "soft_rt %d (next %lu), inter %d, bic %p", -+ bfq_bfqq_in_large_burst(bfqq), soft_rt, -+ bfqq->soft_rt_next_start, -+ *interactive, -+ bfqq->bic); -+ -+ /* -+ * Using the last flag, update budget and check whether bfqq -+ * may want to preempt the in-service queue. -+ */ -+ bfqq_wants_to_preempt = -+ bfq_bfqq_update_budg_for_activation(bfqd, bfqq, -+ arrived_in_time, -+ wr_or_deserves_wr); -+ -+ /* -+ * If bfqq happened to be activated in a burst, but has been -+ * idle for much more than an interactive queue, then we -+ * assume that, in the overall I/O initiated in the burst, the -+ * I/O associated with bfqq is finished. So bfqq does not need -+ * to be treated as a queue belonging to a burst -+ * anymore. Accordingly, we reset bfqq's in_large_burst flag -+ * if set, and remove bfqq from the burst list if it's -+ * there. We do not decrement burst_size, because the fact -+ * that bfqq does not need to belong to the burst list any -+ * more does not invalidate the fact that bfqq was created in -+ * a burst. 
-+ */ -+ if (likely(!bfq_bfqq_just_created(bfqq)) && -+ idle_for_long_time && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ msecs_to_jiffies(10000))) { -+ hlist_del_init(&bfqq->burst_list_node); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ } -+ -+ bfq_clear_bfqq_just_created(bfqq); -+ -+ if (!bfq_bfqq_IO_bound(bfqq)) { -+ if (arrived_in_time) { -+ bfqq->requests_within_timer++; -+ if (bfqq->requests_within_timer >= -+ bfqd->bfq_requests_within_timer) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ } else -+ bfqq->requests_within_timer = 0; -+ bfq_log_bfqq(bfqd, bfqq, "requests in time %d", -+ bfqq->requests_within_timer); -+ } -+ -+ if (bfqd->low_latency) { -+ if (unlikely(time_is_after_jiffies(bfqq->split_time))) -+ /* wraparound */ -+ bfqq->split_time = -+ jiffies - bfqd->bfq_wr_min_idle_time - 1; -+ -+ if (time_is_before_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) { -+ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, -+ old_wr_coeff, -+ wr_or_deserves_wr, -+ *interactive, -+ in_burst, -+ soft_rt); -+ -+ if (old_wr_coeff != bfqq->wr_coeff) -+ bfqq->entity.prio_changed = 1; -+ } -+ } -+ -+ bfqq->last_idle_bklogged = jiffies; -+ bfqq->service_from_backlogged = 0; -+ bfq_clear_bfqq_softrt_update(bfqq); -+ -+ bfq_add_bfqq_busy(bfqd, bfqq); -+ -+ /* -+ * Expire in-service queue only if preemption may be needed -+ * for guarantees. In this respect, the function -+ * next_queue_may_preempt just checks a simple, necessary -+ * condition, and not a sufficient condition based on -+ * timestamps. In fact, for the latter condition to be -+ * evaluated, timestamps would need first to be updated, and -+ * this operation is quite costly (see the comments on the -+ * function bfq_bfqq_update_budg_for_activation). -+ */ -+ if (bfqd->in_service_queue && bfqq_wants_to_preempt && -+ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && -+ next_queue_may_preempt(bfqd)) { -+ struct bfq_queue *in_serv = -+ bfqd->in_service_queue; -+ BUG_ON(in_serv == bfqq); -+ -+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, -+ false, BFQ_BFQQ_PREEMPTED); -+ } -+} -+ -+static void bfq_add_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *next_rq, *prev; -+ unsigned int old_wr_coeff = bfqq->wr_coeff; -+ bool interactive = false; -+ -+ bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", -+ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); -+ -+ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ bfqq->queued[rq_is_sync(rq)]++; -+ bfqd->queued++; -+ -+ elv_rb_add(&bfqq->sort_list, rq); -+ -+ /* -+ * Check if this request is a better next-to-serve candidate. -+ */ -+ prev = bfqq->next_rq; -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); -+ BUG_ON(!next_rq); -+ bfqq->next_rq = next_rq; -+ -+ /* -+ * Adjust priority tree position, if next_rq changes. -+ */ -+ if (prev != bfqq->next_rq) -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ -+ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... 
*/ -+ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, -+ rq, &interactive); -+ else { -+ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && -+ time_is_before_jiffies( -+ bfqq->last_wr_start_finish + -+ bfqd->bfq_wr_min_inter_arr_async)) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "non-idle wrais starting, " -+ "wr_max_time %u wr_busy %d", -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqd->wr_busy_queues); -+ } -+ if (prev != bfqq->next_rq) -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ /* -+ * Assign jiffies to last_wr_start_finish in the following -+ * cases: -+ * -+ * . if bfqq is not going to be weight-raised, because, for -+ * non weight-raised queues, last_wr_start_finish stores the -+ * arrival time of the last request; as of now, this piece -+ * of information is used only for deciding whether to -+ * weight-raise async queues -+ * -+ * . if bfqq is not weight-raised, because, if bfqq is now -+ * switching to weight-raised, then last_wr_start_finish -+ * stores the time when weight-raising starts -+ * -+ * . if bfqq is interactive, because, regardless of whether -+ * bfqq is currently weight-raised, the weight-raising -+ * period must start or restart (this case is considered -+ * separately because it is not detected by the above -+ * conditions, if bfqq is already weight-raised) -+ * -+ * last_wr_start_finish has to be updated also if bfqq is soft -+ * real-time, because the weight-raising period is constantly -+ * restarted on idle-to-busy transitions for these queues, but -+ * this is already done in bfq_bfqq_handle_idle_busy_switch if -+ * needed. 
-+ */ -+ if (bfqd->low_latency && -+ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) -+ bfqq->last_wr_start_finish = jiffies; -+} -+ -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, -+ struct bio *bio) -+{ -+ struct task_struct *tsk = current; -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq; -+ -+ bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ if (!bic) -+ return NULL; -+ -+ bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); -+ if (bfqq) -+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); -+ -+ return NULL; -+} -+ -+static sector_t get_sdist(sector_t last_pos, struct request *rq) -+{ -+ sector_t sdist = 0; -+ -+ if (last_pos) { -+ if (last_pos < blk_rq_pos(rq)) -+ sdist = blk_rq_pos(rq) - last_pos; -+ else -+ sdist = last_pos - blk_rq_pos(rq); -+ } -+ -+ return sdist; -+} -+ -+static void bfq_activate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ bfqd->rq_in_driver++; -+} -+ -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ -+ BUG_ON(bfqd->rq_in_driver == 0); -+ bfqd->rq_in_driver--; -+} -+ -+static void bfq_remove_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ const int sync = rq_is_sync(rq); -+ -+ BUG_ON(bfqq->entity.service > bfqq->entity.budget && -+ bfqq == bfqd->in_service_queue); -+ -+ if (bfqq->next_rq == rq) { -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ if (rq->queuelist.prev != &rq->queuelist) -+ list_del_init(&rq->queuelist); -+ BUG_ON(bfqq->queued[sync] == 0); -+ bfqq->queued[sync]--; -+ bfqd->queued--; -+ elv_rb_del(&bfqq->sort_list, rq); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ bfqq->next_rq = NULL; -+ -+ BUG_ON(bfqq->entity.budget < 0); -+ -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { -+ BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */ -+ bfq_del_bfqq_busy(bfqd, bfqq, false); -+ /* -+ * bfqq emptied. In normal operation, when -+ * bfqq is empty, bfqq->entity.service and -+ * bfqq->entity.budget must contain, -+ * respectively, the service received and the -+ * budget used last time bfqq emptied. These -+ * facts do not hold in this case, as at least -+ * this last removal occurred while bfqq is -+ * not in service. To avoid inconsistencies, -+ * reset both bfqq->entity.service and -+ * bfqq->entity.budget, if bfqq has still a -+ * process that may issue I/O requests to it. -+ */ -+ bfqq->entity.budget = bfqq->entity.service = 0; -+ } -+ -+ /* -+ * Remove queue from request-position tree as it is empty. 
-+ */ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ } -+ -+ if (rq->cmd_flags & REQ_META) { -+ BUG_ON(bfqq->meta_pending == 0); -+ bfqq->meta_pending--; -+ } -+ bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); -+} -+ -+static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct request *__rq; -+ -+ __rq = bfq_find_rq_fmerge(bfqd, bio); -+ if (__rq && elv_bio_merge_ok(__rq, bio)) { -+ *req = __rq; -+ return ELEVATOR_FRONT_MERGE; -+ } -+ -+ return ELEVATOR_NO_MERGE; -+} -+ -+static void bfq_merged_request(struct request_queue *q, struct request *req, -+ enum elv_merge type) -+{ -+ if (type == ELEVATOR_FRONT_MERGE && -+ rb_prev(&req->rb_node) && -+ blk_rq_pos(req) < -+ blk_rq_pos(container_of(rb_prev(&req->rb_node), -+ struct request, rb_node))) { -+ struct bfq_queue *bfqq = RQ_BFQQ(req); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *prev, *next_rq; -+ -+ /* Reposition request in its sort_list */ -+ elv_rb_del(&bfqq->sort_list, req); -+ elv_rb_add(&bfqq->sort_list, req); -+ /* Choose next request to be served for bfqq */ -+ prev = bfqq->next_rq; -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, -+ bfqd->last_position); -+ BUG_ON(!next_rq); -+ bfqq->next_rq = next_rq; -+ /* -+ * If next_rq changes, update both the queue's budget to -+ * fit the new request and the queue's position in its -+ * rq_pos_tree. -+ */ -+ if (prev != bfqq->next_rq) { -+ bfq_updated_next_req(bfqd, bfqq); -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ } -+} -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static void bfq_bio_merged(struct request_queue *q, struct request *req, -+ struct bio *bio) -+{ -+ bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); -+} -+#endif -+ -+static void bfq_merged_requests(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); -+ -+ /* -+ * If next and rq belong to the same bfq_queue and next is older -+ * than rq, then reposition rq in the fifo (by substituting next -+ * with rq). Otherwise, if next and rq belong to different -+ * bfq_queues, never reposition rq: in fact, we would have to -+ * reposition it with respect to next's position in its own fifo, -+ * which would most certainly be too expensive with respect to -+ * the benefits. -+ */ -+ if (bfqq == next_bfqq && -+ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && -+ next->fifo_time < rq->fifo_time) { -+ list_del_init(&rq->queuelist); -+ list_replace_init(&next->queuelist, &rq->queuelist); -+ rq->fifo_time = next->fifo_time; -+ } -+ -+ if (bfqq->next_rq == next) -+ bfqq->next_rq = rq; -+ -+ bfq_remove_request(next); -+ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -+} -+ -+/* Must be called with bfqq != NULL */ -+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) -+{ -+ BUG_ON(!bfqq); -+ -+ if (bfq_bfqq_busy(bfqq)) { -+ bfqq->bfqd->wr_busy_queues--; -+ BUG_ON(bfqq->bfqd->wr_busy_queues < 0); -+ } -+ bfqq->wr_coeff = 1; -+ bfqq->wr_cur_max_time = 0; -+ bfqq->last_wr_start_finish = jiffies; -+ /* -+ * Trigger a weight change on the next invocation of -+ * __bfq_entity_update_weight_prio. 
-+ */ -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "end_wr: wrais ending at %lu, rais_max_time %u", -+ bfqq->last_wr_start_finish, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", -+ bfqq->bfqd->wr_busy_queues); -+} -+ -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ if (bfqg->async_bfqq[i][j]) -+ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); -+ if (bfqg->async_idle_bfqq) -+ bfq_bfqq_end_wr(bfqg->async_idle_bfqq); -+} -+ -+static void bfq_end_wr(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ -+ spin_lock_irq(bfqd->queue->queue_lock); -+ -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ bfq_end_wr_async(bfqd); -+ -+ spin_unlock_irq(bfqd->queue->queue_lock); -+} -+ -+static sector_t bfq_io_struct_pos(void *io_struct, bool request) -+{ -+ if (request) -+ return blk_rq_pos(io_struct); -+ else -+ return ((struct bio *)io_struct)->bi_iter.bi_sector; -+} -+ -+static int bfq_rq_close_to_sector(void *io_struct, bool request, -+ sector_t sector) -+{ -+ return abs(bfq_io_struct_pos(io_struct, request) - sector) <= -+ BFQQ_CLOSE_THR; -+} -+ -+static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ sector_t sector) -+{ -+ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ struct rb_node *parent, *node; -+ struct bfq_queue *__bfqq; -+ -+ if (RB_EMPTY_ROOT(root)) -+ return NULL; -+ -+ /* -+ * First, if we find a request starting at the end of the last -+ * request, choose it. -+ */ -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); -+ if (__bfqq) -+ return __bfqq; -+ -+ /* -+ * If the exact sector wasn't found, the parent of the NULL leaf -+ * will contain the closest sector (rq_pos_tree sorted by -+ * next_request position). -+ */ -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ if (blk_rq_pos(__bfqq->next_rq) < sector) -+ node = rb_next(&__bfqq->pos_node); -+ else -+ node = rb_prev(&__bfqq->pos_node); -+ if (!node) -+ return NULL; -+ -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ return NULL; -+} -+ -+static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, -+ struct bfq_queue *cur_bfqq, -+ sector_t sector) -+{ -+ struct bfq_queue *bfqq; -+ -+ /* -+ * We shall notice if some of the queues are cooperating, -+ * e.g., working closely on the same area of the device. In -+ * that case, we can group them together and: 1) don't waste -+ * time idling, and 2) serve the union of their requests in -+ * the best possible order for throughput. -+ */ -+ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); -+ if (!bfqq || bfqq == cur_bfqq) -+ return NULL; -+ -+ return bfqq; -+} -+ -+static struct bfq_queue * -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ int process_refs, new_process_refs; -+ struct bfq_queue *__bfqq; -+ -+ /* -+ * If there are no process references on the new_bfqq, then it is -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain -+ * may have dropped their last reference (not just their last process -+ * reference). 
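As a side note on the lookup above: bfqq_find_close() tries an exact hit in the rq_pos_tree and then falls back to the nearest neighbours of the failed-search position. Below is a minimal userspace sketch of the same idea over a sorted array rather than an rb_root; all names are illustrative, and CLOSE_THR is an arbitrary stand-in for BFQQ_CLOSE_THR.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

#define CLOSE_THR 1024ULL /* plays the role of BFQQ_CLOSE_THR */

static bool close_to(sector_t a, sector_t b)
{
	return (a > b ? a - b : b - a) <= CLOSE_THR;
}

/*
 * Return the index of a queue whose next-request sector lies within
 * CLOSE_THR of 'sector', or -1: exact hit first, then the two
 * neighbours of the failed-search position, as in bfqq_find_close().
 */
static int find_close(const sector_t *pos, int n, sector_t sector)
{
	int lo = 0, hi = n - 1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;

		if (pos[mid] == sector)
			return mid;		/* exact hit */
		if (pos[mid] < sector)
			lo = mid + 1;
		else
			hi = mid - 1;
	}
	/* lo: first entry above 'sector'; hi: last entry below it */
	if (lo < n && close_to(pos[lo], sector))
		return lo;
	if (hi >= 0 && close_to(pos[hi], sector))
		return hi;
	return -1;
}

int main(void)
{
	sector_t pos[] = { 100, 5000, 90000 };

	printf("%d\n", find_close(pos, 3, 5500)); /* prints 1 */
	return 0;
}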
-+ */ -+ if (!bfqq_process_refs(new_bfqq)) -+ return NULL; -+ -+ /* Avoid a circular list and skip interim queue merges. */ -+ while ((__bfqq = new_bfqq->new_bfqq)) { -+ if (__bfqq == bfqq) -+ return NULL; -+ new_bfqq = __bfqq; -+ } -+ -+ process_refs = bfqq_process_refs(bfqq); -+ new_process_refs = bfqq_process_refs(new_bfqq); -+ /* -+ * If the process for the bfqq has gone away, there is no -+ * sense in merging the queues. -+ */ -+ if (process_refs == 0 || new_process_refs == 0) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", -+ new_bfqq->pid); -+ -+ /* -+ * Merging is just a redirection: the requests of the process -+ * owning one of the two queues are redirected to the other queue. -+ * The latter queue, in its turn, is set as shared if this is the -+ * first time that the requests of some process are redirected to -+ * it. -+ * -+ * We redirect bfqq to new_bfqq and not the opposite, because we -+ * are in the context of the process owning bfqq, hence we have -+ * the io_cq of this process. So we can immediately configure this -+ * io_cq to redirect the requests of the process to new_bfqq. -+ * -+ * NOTE, even if new_bfqq coincides with the in-service queue, the -+ * io_cq of new_bfqq is not available, because, if the in-service -+ * queue is shared, bfqd->in_service_bic may not point to the -+ * io_cq of the in-service queue. -+ * Redirecting the requests of the process owning bfqq to the -+ * currently in-service queue is in any case the best option, as -+ * we feed the in-service queue with new requests close to the -+ * last request served and, by doing so, hopefully increase the -+ * throughput. -+ */ -+ bfqq->new_bfqq = new_bfqq; -+ new_bfqq->ref += process_refs; -+ return new_bfqq; -+} -+ -+static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, -+ struct bfq_queue *new_bfqq) -+{ -+ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || -+ (bfqq->ioprio_class != new_bfqq->ioprio_class)) -+ return false; -+ -+ /* -+ * If either of the queues has already been detected as seeky, -+ * then merging it with the other queue is unlikely to lead to -+ * sequential I/O. -+ */ -+ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) -+ return false; -+ -+ /* -+ * Interleaved I/O is known to be done by (some) applications -+ * only for reads, so it does not make sense to merge async -+ * queues. -+ */ -+ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) -+ return false; -+ -+ return true; -+} -+ -+/* -+ * If this function returns true, then bfqq cannot be merged. The idea -+ * is that true cooperation happens very early after processes start -+ * to do I/O. Usually, late cooperations are just accidental false -+ * positives. In case bfqq is weight-raised, such false positives -+ * would evidently degrade latency guarantees for bfqq. -+ */ -+static bool wr_from_too_long(struct bfq_queue *bfqq) -+{ -+ return bfqq->wr_coeff > 1 && -+ time_is_before_jiffies(bfqq->last_wr_start_finish + -+ msecs_to_jiffies(100)); -+} -+ -+/* -+ * Attempt to schedule a merge of bfqq with the currently in-service -+ * queue or with a close queue among the scheduled queues. Return -+ * NULL if no merge was scheduled, a pointer to the shared bfq_queue -+ * structure otherwise. 
-+ * -+ * The OOM queue is not allowed to participate to cooperation: in fact, since -+ * the requests temporarily redirected to the OOM queue could be redirected -+ * again to dedicated queues at any time, the state needed to correctly -+ * handle merging with the OOM queue would be quite complex and expensive -+ * to maintain. Besides, in such a critical condition as an out of memory, -+ * the benefits of queue merging may be little relevant, or even negligible. -+ * -+ * Weight-raised queues can be merged only if their weight-raising -+ * period has just started. In fact cooperating processes are usually -+ * started together. Thus, with this filter we avoid false positives -+ * that would jeopardize low-latency guarantees. -+ * -+ * WARNING: queue merging may impair fairness among non-weight raised -+ * queues, for at least two reasons: 1) the original weight of a -+ * merged queue may change during the merged state, 2) even being the -+ * weight the same, a merged queue may be bloated with many more -+ * requests than the ones produced by its originally-associated -+ * process. -+ */ -+static struct bfq_queue * -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ void *io_struct, bool request) -+{ -+ struct bfq_queue *in_service_bfqq, *new_bfqq; -+ -+ if (bfqq->new_bfqq) -+ return bfqq->new_bfqq; -+ -+ if (io_struct && wr_from_too_long(bfqq) && -+ likely(bfqq != &bfqd->oom_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have looked for coop, but bfq%d wr", -+ bfqq->pid); -+ -+ if (!io_struct || -+ wr_from_too_long(bfqq) || -+ unlikely(bfqq == &bfqd->oom_bfqq)) -+ return NULL; -+ -+ /* If there is only one backlogged queue, don't search. */ -+ if (bfqd->busy_queues == 1) -+ return NULL; -+ -+ in_service_bfqq = bfqd->in_service_queue; -+ -+ if (in_service_bfqq && in_service_bfqq != bfqq && -+ bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) -+ && likely(in_service_bfqq == &bfqd->oom_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have tried merge with in-service-queue, but wr"); -+ -+ if (!in_service_bfqq || in_service_bfqq == bfqq || -+ !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || -+ unlikely(in_service_bfqq == &bfqd->oom_bfqq)) -+ goto check_scheduled; -+ -+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && -+ bfqq->entity.parent == in_service_bfqq->entity.parent && -+ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { -+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); -+ if (new_bfqq) -+ return new_bfqq; -+ } -+ /* -+ * Check whether there is a cooperator among currently scheduled -+ * queues. The only thing we need is that the bio/request is not -+ * NULL, as we need it to establish whether a cooperator exists. 
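The wr_from_too_long() filter relied on above reduces to simple jiffies arithmetic: a weight-raised queue is a merge candidate only within the first ~100 ms of its raising period, since genuine cooperators start (almost) together. A hedged userspace rendering follows; the names are illustrative, and the plain comparison ignores the jiffies wraparound that the kernel's time_is_before_jiffies() handles.

#include <stdbool.h>

/* 'now' and 'wr_start' are in jiffies; 'hundred_ms' is
 * msecs_to_jiffies(100) precomputed by the caller. */
static bool wr_from_too_long(unsigned int wr_coeff, unsigned long now,
			     unsigned long wr_start, unsigned long hundred_ms)
{
	return wr_coeff > 1 && now > wr_start + hundred_ms;
}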
-+ */ -+check_scheduled: -+ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, -+ bfq_io_struct_pos(io_struct, request)); -+ -+ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); -+ -+ if (new_bfqq && wr_from_too_long(new_bfqq) && -+ likely(new_bfqq != &bfqd->oom_bfqq) && -+ bfq_may_be_close_cooperator(bfqq, new_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have merged with bfq%d, but wr", -+ new_bfqq->pid); -+ -+ if (new_bfqq && !wr_from_too_long(new_bfqq) && -+ likely(new_bfqq != &bfqd->oom_bfqq) && -+ bfq_may_be_close_cooperator(bfqq, new_bfqq)) -+ return bfq_setup_merge(bfqq, new_bfqq); -+ -+ return NULL; -+} -+ -+static void bfq_bfqq_save_state(struct bfq_queue *bfqq) -+{ -+ struct bfq_io_cq *bic = bfqq->bic; -+ -+ /* -+ * If !bfqq->bic, the queue is already shared or its requests -+ * have already been redirected to a shared queue; both idle window -+ * and weight raising state have already been saved. Do nothing. -+ */ -+ if (!bic) -+ return; -+ -+ bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); -+ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); -+ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -+ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -+ bic->saved_wr_coeff = bfqq->wr_coeff; -+ bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; -+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+} -+ -+static void bfq_get_bic_reference(struct bfq_queue *bfqq) -+{ -+ /* -+ * If bfqq->bic has a non-NULL value, the bic to which it belongs -+ * is about to begin using a shared bfq_queue. -+ */ -+ if (bfqq->bic) -+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount); -+} -+ -+static void -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", -+ (unsigned long) new_bfqq->pid); -+ /* Save weight raising and idle window of the merged queues */ -+ bfq_bfqq_save_state(bfqq); -+ bfq_bfqq_save_state(new_bfqq); -+ if (bfq_bfqq_IO_bound(bfqq)) -+ bfq_mark_bfqq_IO_bound(new_bfqq); -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ /* -+ * If bfqq is weight-raised, then let new_bfqq inherit -+ * weight-raising. To reduce false positives, neglect the case -+ * where bfqq has just been created, but has not yet made it -+ * to be weight-raised (which may happen because EQM may merge -+ * bfqq even before bfq_add_request is executed for the first -+ * time for bfqq). Handling this case would however be very -+ * easy, thanks to the flag just_created. 
-+ */ -+ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { -+ new_bfqq->wr_coeff = bfqq->wr_coeff; -+ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; -+ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; -+ new_bfqq->wr_start_at_switch_to_srt = -+ bfqq->wr_start_at_switch_to_srt; -+ if (bfq_bfqq_busy(new_bfqq)) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); -+ } -+ -+ new_bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, new_bfqq, -+ "wr start after merge with %d, rais_max_time %u", -+ bfqq->pid, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ -+ bfqq->wr_coeff = 1; -+ bfqq->entity.prio_changed = 1; -+ if (bfq_bfqq_busy(bfqq)) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+ -+ } -+ -+ bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", -+ bfqd->wr_busy_queues); -+ -+ /* -+ * Grab a reference to the bic, to prevent it from being destroyed -+ * before being possibly touched by a bfq_split_bfqq(). -+ */ -+ bfq_get_bic_reference(bfqq); -+ bfq_get_bic_reference(new_bfqq); -+ /* -+ * Merge queues (that is, let bic redirect its requests to new_bfqq) -+ */ -+ bic_set_bfqq(bic, new_bfqq, 1); -+ bfq_mark_bfqq_coop(new_bfqq); -+ /* -+ * new_bfqq now belongs to at least two bics (it is a shared queue): -+ * set new_bfqq->bic to NULL. bfqq either: -+ * - does not belong to any bic any more, and hence bfqq->bic must -+ * be set to NULL, or -+ * - is a queue whose owning bics have already been redirected to a -+ * different queue, hence the queue is destined to not belong to -+ * any bic soon and bfqq->bic is already NULL (therefore the next -+ * assignment causes no harm). -+ */ -+ new_bfqq->bic = NULL; -+ bfqq->bic = NULL; -+ /* release process reference to bfqq */ -+ bfq_put_queue(bfqq); -+} -+ -+static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ bool is_sync = op_is_sync(bio->bi_opf); -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq, *new_bfqq; -+ -+ /* -+ * Disallow merge of a sync bio into an async request. -+ */ -+ if (is_sync && !rq_is_sync(rq)) -+ return false; -+ -+ /* -+ * Lookup the bfqq that this bio will be queued with. Allow -+ * merge only if rq is queued there. -+ * Queue lock is held here. -+ */ -+ bic = bfq_bic_lookup(bfqd, current->io_context); -+ if (!bic) -+ return false; -+ -+ bfqq = bic_to_bfqq(bic, is_sync); -+ /* -+ * We take advantage of this function to perform an early merge -+ * of the queues of possible cooperating processes. -+ */ -+ if (bfqq) { -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -+ if (new_bfqq) { -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); -+ /* -+ * If we get here, the bio will be queued in the -+ * shared queue, i.e., new_bfqq, so use new_bfqq -+ * to decide whether bio and rq can be merged. -+ */ -+ bfqq = new_bfqq; -+ } -+ } -+ -+ return bfqq == RQ_BFQQ(rq); -+} -+ -+static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ return RQ_BFQQ(rq) == RQ_BFQQ(next); -+} -+ -+/* -+ * Set the maximum time for the in-service queue to consume its -+ * budget. This prevents seeky processes from lowering the throughput. -+ * In practice, a time-slice service scheme is used with seeky -+ * processes. 
-+ */ -+static void bfq_set_budget_timeout(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ unsigned int timeout_coeff; -+ -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) -+ timeout_coeff = 1; -+ else -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; -+ -+ bfqd->last_budget_start = ktime_get(); -+ -+ bfqq->budget_timeout = jiffies + -+ bfqd->bfq_timeout * timeout_coeff; -+ -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", -+ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); -+} -+ -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ if (bfqq) { -+ bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); -+ bfq_mark_bfqq_must_alloc(bfqq); -+ bfq_clear_bfqq_fifo_expire(bfqq); -+ -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (time_is_before_jiffies(bfqq->last_wr_start_finish) && -+ bfqq->wr_coeff > 1 && -+ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_before_jiffies(bfqq->budget_timeout)) { -+ /* -+ * For soft real-time queues, move the start -+ * of the weight-raising period forward by the -+ * time the queue has not received any -+ * service. Otherwise, a relatively long -+ * service delay is likely to cause the -+ * weight-raising period of the queue to end, -+ * because of the short duration of the -+ * weight-raising period of a soft real-time -+ * queue. It is worth noting that this move -+ * is not so dangerous for the other queues, -+ * because soft real-time queues are not -+ * greedy. -+ * -+ * To not add a further variable, we use the -+ * overloaded field budget_timeout to -+ * determine for how long the queue has not -+ * received service, i.e., how much time has -+ * elapsed since the queue expired. However, -+ * this is a little imprecise, because -+ * budget_timeout is set to jiffies if bfqq -+ * not only expires, but also remains with no -+ * request. -+ */ -+ if (time_after(bfqq->budget_timeout, -+ bfqq->last_wr_start_finish)) -+ bfqq->last_wr_start_finish += -+ jiffies - bfqq->budget_timeout; -+ else -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { -+ pr_crit( -+ "BFQ WARNING:last %lu budget %lu jiffies %lu", -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout, -+ jiffies); -+ pr_crit("diff %lu", jiffies - -+ max_t(unsigned long, -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout)); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+ -+ bfq_set_budget_timeout(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_in_service_queue, cur-budget = %d", -+ bfqq->entity.budget); -+ } else -+ bfq_log(bfqd, "set_in_service_queue: NULL"); -+ -+ bfqd->in_service_queue = bfqq; -+} -+ -+/* -+ * Get and set a new queue for service. -+ */ -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); -+ -+ __bfq_set_in_service_queue(bfqd, bfqq); -+ return bfqq; -+} -+ -+static void bfq_arm_slice_timer(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfqd->in_service_queue; -+ struct bfq_io_cq *bic; -+ u32 sl; -+ -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ /* Processes have exited, don't wait. 
*/
-+ bic = bfqd->in_service_bic;
-+ if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
-+ return;
-+
-+ bfq_mark_bfqq_wait_request(bfqq);
-+
-+ /*
-+ * We don't want to idle for seeks, but we do want to allow
-+ * fair distribution of slice time for a process doing back-to-back
-+ * seeks. So allow a little bit of time for him to submit a new rq.
-+ *
-+ * To prevent processes with (partly) seeky workloads from
-+ * being too ill-treated, grant them a small fraction of the
-+ * assigned budget before reducing the waiting time to
-+ * BFQ_MIN_TT. This happened to help reduce latency.
-+ */
-+ sl = bfqd->bfq_slice_idle;
-+ /*
-+ * Unless the queue is being weight-raised or the scenario is
-+ * asymmetric, grant only minimum idle time if the queue
-+ * is seeky. A long idling is preserved for a weight-raised
-+ * queue, or, more in general, in an asymmetric scenario,
-+ * because a long idling is needed for guaranteeing to a queue
-+ * its reserved share of the throughput (in particular, it is
-+ * needed if the queue has a higher weight than some other
-+ * queue).
-+ */
-+ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
-+ bfq_symmetric_scenario(bfqd))
-+ sl = min_t(u32, sl, BFQ_MIN_TT);
-+
-+ bfqd->last_idling_start = ktime_get();
-+ hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
-+ HRTIMER_MODE_REL);
-+ bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
-+ bfq_log(bfqd, "arm idle: %ld/%ld ms",
-+ sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC);
-+}
-+
-+/*
-+ * In autotuning mode, max_budget is dynamically recomputed as the
-+ * amount of sectors transferred in timeout at the estimated peak
-+ * rate. This enables BFQ to utilize a full timeslice with a full
-+ * budget, even if the in-service queue is served at peak rate. And
-+ * this maximises throughput with sequential workloads.
-+ */
-+static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
-+{
-+ return (u64)bfqd->peak_rate * USEC_PER_MSEC *
-+ jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
-+}
-+
-+/*
-+ * Update parameters related to throughput and responsiveness, as a
-+ * function of the estimated peak rate. See comments on
-+ * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
-+ */
-+static void update_thr_responsiveness_params(struct bfq_data *bfqd)
-+{
-+ int dev_type = blk_queue_nonrot(bfqd->queue);
-+
-+ if (bfqd->bfq_user_max_budget == 0) {
-+ bfqd->bfq_max_budget =
-+ bfq_calc_max_budget(bfqd);
-+ BUG_ON(bfqd->bfq_max_budget < 0);
-+ bfq_log(bfqd, "new max_budget = %d",
-+ bfqd->bfq_max_budget);
-+ }
-+
-+ if (bfqd->device_speed == BFQ_BFQD_FAST &&
-+ bfqd->peak_rate < device_speed_thresh[dev_type]) {
-+ bfqd->device_speed = BFQ_BFQD_SLOW;
-+ bfqd->RT_prod = R_slow[dev_type] *
-+ T_slow[dev_type];
-+ } else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
-+ bfqd->peak_rate > device_speed_thresh[dev_type]) {
-+ bfqd->device_speed = BFQ_BFQD_FAST;
-+ bfqd->RT_prod = R_fast[dev_type] *
-+ T_fast[dev_type];
-+ }
-+
-+ bfq_log(bfqd,
-+"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu sects/sec",
-+ dev_type == 0 ? "ROT" : "NONROT",
-+ bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
-+ bfqd->device_speed == BFQ_BFQD_FAST ?
-+ (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
-+ (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
-+ (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
-+ BFQ_RATE_SHIFT);
-+}
-+
-+static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq)
-+{
-+ if (rq != NULL) { /* new rq dispatch now, reset accordingly */
-+ bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ;
-+ bfqd->peak_rate_samples = 1;
-+ bfqd->sequential_samples = 0;
-+ bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
-+ blk_rq_sectors(rq);
-+ } else /* no new rq dispatched, just reset the number of samples */
-+ bfqd->peak_rate_samples = 0; /* full re-init on next disp. */
-+
-+ bfq_log(bfqd,
-+ "reset_rate_computation at end, sample %u/%u tot_sects %llu",
-+ bfqd->peak_rate_samples, bfqd->sequential_samples,
-+ bfqd->tot_sectors_dispatched);
-+}
-+
-+static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
-+{
-+ u32 rate, weight, divisor;
-+
-+ /*
-+ * For the convergence property to hold (see comments on
-+ * bfq_update_peak_rate()) and for the assessment to be
-+ * reliable, a minimum number of samples must be present, and
-+ * a minimum amount of time must have elapsed. If not so, do
-+ * not compute new rate. Just reset parameters, to get ready
-+ * for a new evaluation attempt.
-+ */
-+ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
-+ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) {
-+ bfq_log(bfqd,
-+ "update_rate_reset: only resetting, delta_first %lluus samples %d",
-+ bfqd->delta_from_first>>10, bfqd->peak_rate_samples);
-+ goto reset_computation;
-+ }
-+
-+ /*
-+ * If a new request completion has occurred after last
-+ * dispatch, then, to approximate the rate at which requests
-+ * have been served by the device, it is more precise to
-+ * extend the observation interval to the last completion.
-+ */
-+ bfqd->delta_from_first =
-+ max_t(u64, bfqd->delta_from_first,
-+ bfqd->last_completion - bfqd->first_dispatch);
-+
-+ BUG_ON(bfqd->delta_from_first == 0);
-+ /*
-+ * Rate computed in sects/usec, and not sects/nsec, for
-+ * precision issues.
-+ */
-+ rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
-+ div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
-+
-+ bfq_log(bfqd,
-+"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)",
-+ bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10,
-+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
-+ rate > 20<<BFQ_RATE_SHIFT);
-+
-+ /*
-+ * Peak rate not updated if:
-+ * - the percentage of sequential dispatches is below 3/4 of the
-+ * total, and rate is below the current estimated peak rate
-+ * - rate is unreasonably high (> 20M sectors/sec)
-+ */
-+ if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
-+ rate <= bfqd->peak_rate) ||
-+ rate > 20<<BFQ_RATE_SHIFT) {
-+ bfq_log(bfqd,
-+ "update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu",
-+ bfqd->peak_rate_samples, bfqd->sequential_samples,
-+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
-+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
-+ goto reset_computation;
-+ } else {
-+ bfq_log(bfqd,
-+ "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu",
-+ bfqd->peak_rate_samples, bfqd->sequential_samples,
-+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
-+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
-+ }
-+
-+ /*
-+ * We have to update the peak rate, at last! To this purpose,
-+ * we use a low-pass filter. We compute the smoothing constant
-+ * of the filter as a function of the 'weight' of the new
-+ * measured rate.
-+ *
-+ * As can be seen in next formulas, we define this weight as a
-+ * quantity proportional to how sequential the workload is,
-+ * and to how long the observation time interval is.
-+ *
-+ * The weight runs from 0 to 8. The maximum value of the
-+ * weight, 8, yields the minimum value for the smoothing
-+ * constant. At this minimum value for the smoothing constant,
-+ * the measured rate contributes for half of the next value of
-+ * the estimated peak rate.
-+ *
-+ * So, the first step is to compute the weight as a function
-+ * of how sequential the workload is. Note that the weight
-+ * cannot reach 9, because bfqd->sequential_samples cannot
-+ * become equal to bfqd->peak_rate_samples, which, in its
-+ * turn, holds true because bfqd->sequential_samples is not
-+ * incremented for the first sample.
-+ */
-+ weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
-+
-+ /*
-+ * Second step: further refine the weight as a function of the
-+ * duration of the observation interval.
-+ */
-+ weight = min_t(u32, 8,
-+ div_u64(weight * bfqd->delta_from_first,
-+ BFQ_RATE_REF_INTERVAL));
-+
-+ /*
-+ * Divisor ranging from 10, for minimum weight, to 2, for
-+ * maximum weight.
-+ */
-+ divisor = 10 - weight;
-+ BUG_ON(divisor == 0);
-+
-+ /*
-+ * Finally, update peak rate:
-+ *
-+ * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor
-+ */
-+ bfqd->peak_rate *= divisor-1;
-+ bfqd->peak_rate /= divisor;
-+ rate /= divisor; /* smoothing constant alpha = 1/divisor */
-+
-+ bfq_log(bfqd,
-+ "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u",
-+ divisor,
-+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT),
-+ (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT));
-+
-+ BUG_ON(bfqd->peak_rate == 0);
-+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
-+
-+ bfqd->peak_rate += rate;
-+ update_thr_responsiveness_params(bfqd);
-+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
-+
-+reset_computation:
-+ bfq_reset_rate_computation(bfqd, rq);
-+}
-+
-+static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
-+{
-+ u64 now_ns = ktime_get_ns();
-+
-+ if (bfqd->peak_rate_samples == 0) { /* first dispatch */
-+ bfq_log(bfqd,
-+ "update_peak_rate: goto reset, samples %d",
-+ bfqd->peak_rate_samples) ;
-+ bfq_reset_rate_computation(bfqd, rq);
-+ goto update_last_values; /* will add one sample */
-+ }
-+
-+ /*
-+ * Device idle for very long: the observation interval lasting
-+ * up to this dispatch cannot be a valid observation interval
-+ * for computing a new peak rate (similarly to the late-
-+ * completion event in bfq_completed_request()).
Go to -+ * update_rate_and_reset to have the following three steps -+ * taken: -+ * - close the observation interval at the last (previous) -+ * request dispatch or completion -+ * - compute rate, if possible, for that observation interval -+ * - start a new observation interval with this dispatch -+ */ -+ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && -+ bfqd->rq_in_driver == 0) { -+ bfq_log(bfqd, -+"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d", -+ (now_ns - bfqd->last_dispatch)>>10, -+ bfqd->peak_rate_samples) ; -+ goto update_rate_and_reset; -+ } -+ -+ /* Update sampling information */ -+ bfqd->peak_rate_samples++; -+ -+ if ((bfqd->rq_in_driver > 0 || -+ now_ns - bfqd->last_completion < BFQ_MIN_TT) -+ && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR) -+ bfqd->sequential_samples++; -+ -+ bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); -+ -+ /* Reset max observed rq size every 32 dispatches */ -+ if (likely(bfqd->peak_rate_samples % 32)) -+ bfqd->last_rq_max_size = -+ max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); -+ else -+ bfqd->last_rq_max_size = blk_rq_sectors(rq); -+ -+ bfqd->delta_from_first = now_ns - bfqd->first_dispatch; -+ -+ bfq_log(bfqd, -+ "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ bfqd->tot_sectors_dispatched, -+ bfqd->delta_from_first>>10); -+ -+ /* Target observation interval not yet reached, go on sampling */ -+ if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) -+ goto update_last_values; -+ -+update_rate_and_reset: -+ bfq_update_rate_reset(bfqd, rq); -+update_last_values: -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ bfqd->last_dispatch = now_ns; -+ -+ bfq_log(bfqd, -+ "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu", -+ (now_ns - bfqd->first_dispatch)>>10, -+ (unsigned long long) bfqd->last_position, -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ bfq_log(bfqd, -+ "update_peak_rate: samples at end %d", bfqd->peak_rate_samples); -+} -+ -+/* -+ * Move request from internal lists to the dispatch list of the request queue -+ */ -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ /* -+ * For consistency, the next instruction should have been executed -+ * after removing the request from the queue and dispatching it. -+ * We execute instead this instruction before bfq_remove_request() -+ * (and hence introduce a temporary inconsistency), for efficiency. -+ * In fact, in a forced_dispatch, this prevents two counters related -+ * to bfqq->dispatched to risk to be uselessly decremented if bfqq -+ * is not in service, and then to be incremented again after -+ * incrementing bfqq->dispatched. -+ */ -+ bfqq->dispatched++; -+ bfq_update_peak_rate(q->elevator->elevator_data, rq); -+ -+ bfq_remove_request(rq); -+ elv_dispatch_sort(q, rq); -+} -+ -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ /* -+ * If this bfqq is shared between multiple processes, check -+ * to make sure that those processes are still issuing I/Os -+ * within the mean seek distance. If not, it may be time to -+ * break the queues apart again. 
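The peak-rate update above is a plain low-pass filter. Here is a toy, self-contained model of it, assuming the same 0..8 weight and the same 10-weight divisor and operation order as the patch; names and sample values are illustrative only.

#include <stdio.h>

/*
 * 'weight' in 0..8 measures how sequential and how long the
 * observation was; the divisor then runs from 10 (weight 0) down to
 * 2 (weight 8), so a new sample contributes between 1/10 and 1/2 of
 * the next estimate.
 */
static unsigned long long filter(unsigned long long peak,
				 unsigned long long rate,
				 unsigned int weight)
{
	unsigned int divisor = 10 - weight;

	peak *= divisor - 1;
	peak /= divisor;
	return peak + rate / divisor;	/* alpha = 1/divisor */
}

int main(void)
{
	unsigned long long peak = 0;

	/* fully sequential samples (weight 8): each counts for 1/2 */
	for (int i = 0; i < 5; i++) {
		peak = filter(peak, 1000, 8);
		printf("sample %d -> estimate %llu\n", i + 1, peak);
	}
	return 0;
}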
-+ */ -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) -+ bfq_mark_bfqq_split_coop(bfqq); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ if (bfqq->dispatched == 0) -+ /* -+ * Overloading budget_timeout field to store -+ * the time at which the queue remains with no -+ * backlog and no outstanding request; used by -+ * the weight-raising mechanism. -+ */ -+ bfqq->budget_timeout = jiffies; -+ -+ bfq_del_bfqq_busy(bfqd, bfqq, true); -+ } else { -+ bfq_requeue_bfqq(bfqd, bfqq); -+ /* -+ * Resort priority tree of potential close cooperators. -+ */ -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ -+ /* -+ * All in-service entities must have been properly deactivated -+ * or requeued before executing the next function, which -+ * resets all in-service entites as no more in service. -+ */ -+ __bfq_bfqd_reset_in_service(bfqd); -+} -+ -+/** -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. -+ * @bfqd: device data. -+ * @bfqq: queue to update. -+ * @reason: reason for expiration. -+ * -+ * Handle the feedback on @bfqq budget at queue expiration. -+ * See the body for detailed comments. -+ */ -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ enum bfqq_expiration reason) -+{ -+ struct request *next_rq; -+ int budget, min_budget; -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ min_budget = bfq_min_budget(bfqd); -+ -+ if (bfqq->wr_coeff == 1) -+ budget = bfqq->max_budget; -+ else /* -+ * Use a constant, low budget for weight-raised queues, -+ * to help achieve a low latency. Keep it slightly higher -+ * than the minimum possible budget, to cause a little -+ * bit fewer expirations. -+ */ -+ budget = 2 * min_budget; -+ -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", -+ budget, bfq_min_budget(bfqd)); -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); -+ -+ if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { -+ switch (reason) { -+ /* -+ * Caveat: in all the following cases we trade latency -+ * for throughput. -+ */ -+ case BFQ_BFQQ_TOO_IDLE: -+ /* -+ * This is the only case where we may reduce -+ * the budget: if there is no request of the -+ * process still waiting for completion, then -+ * we assume (tentatively) that the timer has -+ * expired because the batch of requests of -+ * the process could have been served with a -+ * smaller budget. Hence, betting that -+ * process will behave in the same way when it -+ * becomes backlogged again, we reduce its -+ * next budget. As long as we guess right, -+ * this budget cut reduces the latency -+ * experienced by the process. -+ * -+ * However, if there are still outstanding -+ * requests, then the process may have not yet -+ * issued its next request just because it is -+ * still waiting for the completion of some of -+ * the still outstanding ones. So in this -+ * subcase we do not reduce its budget, on the -+ * contrary we increase it to possibly boost -+ * the throughput, as discussed in the -+ * comments to the BUDGET_TIMEOUT case. 
-+ */ -+ if (bfqq->dispatched > 0) /* still outstanding reqs */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ else { -+ if (budget > 5 * min_budget) -+ budget -= 4 * min_budget; -+ else -+ budget = min_budget; -+ } -+ break; -+ case BFQ_BFQQ_BUDGET_TIMEOUT: -+ /* -+ * We double the budget here because it gives -+ * the chance to boost the throughput if this -+ * is not a seeky process (and has bumped into -+ * this timeout because of, e.g., ZBR). -+ */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_BUDGET_EXHAUSTED: -+ /* -+ * The process still has backlog, and did not -+ * let either the budget timeout or the disk -+ * idling timeout expire. Hence it is not -+ * seeky, has a short thinktime and may be -+ * happy with a higher budget too. So -+ * definitely increase the budget of this good -+ * candidate to boost the disk throughput. -+ */ -+ budget = min(budget * 4, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_NO_MORE_REQUESTS: -+ /* -+ * For queues that expire for this reason, it -+ * is particularly important to keep the -+ * budget close to the actual service they -+ * need. Doing so reduces the timestamp -+ * misalignment problem described in the -+ * comments in the body of -+ * __bfq_activate_entity. In fact, suppose -+ * that a queue systematically expires for -+ * BFQ_BFQQ_NO_MORE_REQUESTS and presents a -+ * new request in time to enjoy timestamp -+ * back-shifting. The larger the budget of the -+ * queue is with respect to the service the -+ * queue actually requests in each service -+ * slot, the more times the queue can be -+ * reactivated with the same virtual finish -+ * time. It follows that, even if this finish -+ * time is pushed to the system virtual time -+ * to reduce the consequent timestamp -+ * misalignment, the queue unjustly enjoys for -+ * many re-activations a lower finish time -+ * than all newly activated queues. -+ * -+ * The service needed by bfqq is measured -+ * quite precisely by bfqq->entity.service. -+ * Since bfqq does not enjoy device idling, -+ * bfqq->entity.service is equal to the number -+ * of sectors that the process associated with -+ * bfqq requested to read/write before waiting -+ * for request completions, or blocking for -+ * other reasons. -+ */ -+ budget = max_t(int, bfqq->entity.service, min_budget); -+ break; -+ default: -+ return; -+ } -+ } else if (!bfq_bfqq_sync(bfqq)) -+ /* -+ * Async queues get always the maximum possible -+ * budget, as for them we do not care about latency -+ * (in addition, their ability to dispatch is limited -+ * by the charging factor). -+ */ -+ budget = bfqd->bfq_max_budget; -+ -+ bfqq->max_budget = budget; -+ -+ if (bfqd->budgets_assigned >= bfq_stats_min_budgets && -+ !bfqd->bfq_user_max_budget) -+ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); -+ -+ /* -+ * If there is still backlog, then assign a new budget, making -+ * sure that it is large enough for the next request. Since -+ * the finish time of bfqq must be kept in sync with the -+ * budget, be sure to call __bfq_bfqq_expire() *after* this -+ * update. -+ * -+ * If there is no backlog, then no need to update the budget; -+ * it will be updated on the arrival of a new request. 
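The budget feedback implemented by __bfq_bfqq_recalc_budget() can be condensed, for a sync non-weight-raised queue, into the following free-standing sketch; the names and the stand-alone form are hypothetical, but the growth and shrink rules mirror the switch above.

enum expiry { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED, NO_MORE_REQUESTS };

static int clamp_max(int b, int max) { return b < max ? b : max; }

static int next_budget(enum expiry reason, int budget, int service,
		       int dispatched, int min_budget, int max_budget)
{
	switch (reason) {
	case TOO_IDLE:
		if (dispatched > 0)	/* still outstanding requests */
			budget = clamp_max(budget * 2, max_budget);
		else if (budget > 5 * min_budget)
			budget -= 4 * min_budget;	/* bet on smaller batches */
		else
			budget = min_budget;
		break;
	case BUDGET_TIMEOUT:	/* possibly just slow zones: double */
		budget = clamp_max(budget * 2, max_budget);
		break;
	case BUDGET_EXHAUSTED:	/* greedy, short thinktime: quadruple */
		budget = clamp_max(budget * 4, max_budget);
		break;
	case NO_MORE_REQUESTS:	/* track the actual service closely */
		budget = service > min_budget ? service : min_budget;
		break;
	}
	return budget;
}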
-+ */ -+ next_rq = bfqq->next_rq; -+ if (next_rq) { -+ BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || -+ reason == BFQ_BFQQ_NO_MORE_REQUESTS); -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ BUG_ON(!bfq_bfqq_busy(bfqq)); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", -+ next_rq ? blk_rq_sectors(next_rq) : 0, -+ bfqq->entity.budget); -+} -+ -+/* -+ * Return true if the process associated with bfqq is "slow". The slow -+ * flag is used, in addition to the budget timeout, to reduce the -+ * amount of service provided to seeky processes, and thus reduce -+ * their chances to lower the throughput. More details in the comments -+ * on the function bfq_bfqq_expire(). -+ * -+ * An important observation is in order: as discussed in the comments -+ * on the function bfq_update_peak_rate(), with devices with internal -+ * queues, it is hard if ever possible to know when and for how long -+ * an I/O request is processed by the device (apart from the trivial -+ * I/O pattern where a new request is dispatched only after the -+ * previous one has been completed). This makes it hard to evaluate -+ * the real rate at which the I/O requests of each bfq_queue are -+ * served. In fact, for an I/O scheduler like BFQ, serving a -+ * bfq_queue means just dispatching its requests during its service -+ * slot (i.e., until the budget of the queue is exhausted, or the -+ * queue remains idle, or, finally, a timeout fires). But, during the -+ * service slot of a bfq_queue, around 100 ms at most, the device may -+ * be even still processing requests of bfq_queues served in previous -+ * service slots. On the opposite end, the requests of the in-service -+ * bfq_queue may be completed after the service slot of the queue -+ * finishes. -+ * -+ * Anyway, unless more sophisticated solutions are used -+ * (where possible), the sum of the sizes of the requests dispatched -+ * during the service slot of a bfq_queue is probably the only -+ * approximation available for the service received by the bfq_queue -+ * during its service slot. And this sum is the quantity used in this -+ * function to evaluate the I/O speed of a process. -+ */ -+static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool compensate, enum bfqq_expiration reason, -+ unsigned long *delta_ms) -+{ -+ ktime_t delta_ktime; -+ u32 delta_usecs; -+ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ -+ -+ if (!bfq_bfqq_sync(bfqq)) -+ return false; -+ -+ if (compensate) -+ delta_ktime = bfqd->last_idling_start; -+ else -+ delta_ktime = ktime_get(); -+ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); -+ delta_usecs = ktime_to_us(delta_ktime); -+ -+ /* don't use too short time intervals */ -+ if (delta_usecs < 1000) { -+ if (blk_queue_nonrot(bfqd->queue)) -+ /* -+ * give same worst-case guarantees as idling -+ * for seeky -+ */ -+ *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; -+ else /* charge at least one seek */ -+ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; -+ -+ bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs); -+ -+ return slow; -+ } -+ -+ *delta_ms = delta_usecs / USEC_PER_MSEC; -+ -+ /* -+ * Use only long (> 20ms) intervals to filter out excessive -+ * spikes in service rate estimation. -+ */ -+ if (delta_usecs > 20000) { -+ /* -+ * Caveat for rotational devices: processes doing I/O -+ * in the slower disk zones tend to be slow(er) even -+ * if not seeky. 
In this respect, the estimated peak -+ * rate is likely to be an average over the disk -+ * surface. Accordingly, to not be too harsh with -+ * unlucky processes, a process is deemed slow only if -+ * its rate has been lower than half of the estimated -+ * peak rate. -+ */ -+ slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; -+ bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", -+ bfqq->entity.service, bfqd->bfq_max_budget); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); -+ -+ return slow; -+} -+ -+/* -+ * To be deemed as soft real-time, an application must meet two -+ * requirements. First, the application must not require an average -+ * bandwidth higher than the approximate bandwidth required to playback or -+ * record a compressed high-definition video. -+ * The next function is invoked on the completion of the last request of a -+ * batch, to compute the next-start time instant, soft_rt_next_start, such -+ * that, if the next request of the application does not arrive before -+ * soft_rt_next_start, then the above requirement on the bandwidth is met. -+ * -+ * The second requirement is that the request pattern of the application is -+ * isochronous, i.e., that, after issuing a request or a batch of requests, -+ * the application stops issuing new requests until all its pending requests -+ * have been completed. After that, the application may issue a new batch, -+ * and so on. -+ * For this reason the next function is invoked to compute -+ * soft_rt_next_start only for applications that meet this requirement, -+ * whereas soft_rt_next_start is set to infinity for applications that do -+ * not. -+ * -+ * Unfortunately, even a greedy application may happen to behave in an -+ * isochronous way if the CPU load is high. In fact, the application may -+ * stop issuing requests while the CPUs are busy serving other processes, -+ * then restart, then stop again for a while, and so on. In addition, if -+ * the disk achieves a low enough throughput with the request pattern -+ * issued by the application (e.g., because the request pattern is random -+ * and/or the device is slow), then the application may meet the above -+ * bandwidth requirement too. To prevent such a greedy application to be -+ * deemed as soft real-time, a further rule is used in the computation of -+ * soft_rt_next_start: soft_rt_next_start must be higher than the current -+ * time plus the maximum time for which the arrival of a request is waited -+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. -+ * This filters out greedy applications, as the latter issue instead their -+ * next request as soon as possible after the last one has been completed -+ * (in contrast, when a batch of requests is completed, a soft real-time -+ * application spends some time processing data). -+ * -+ * Unfortunately, the last filter may easily generate false positives if -+ * only bfqd->bfq_slice_idle is used as a reference time interval and one -+ * or both the following cases occur: -+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher -+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with -+ * HZ=100. -+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing -+ * for a while, then suddenly 'jump' by several units to recover the lost -+ * increments. This seems to happen, e.g., inside virtual machines. 
-+ * To address this issue, we do not use as a reference time interval just -+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In -+ * particular we add the minimum number of jiffies for which the filter -+ * seems to be quite precise also in embedded systems and KVM/QEMU virtual -+ * machines. -+ */ -+static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, -+"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", -+ bfqq->service_from_backlogged, -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies_to_msecs(HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate)); -+ -+ return max(bfqq->last_idle_bklogged + -+ HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); -+} -+ -+/* -+ * Return the farthest future time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_greatest_from_now(void) -+{ -+ return jiffies + MAX_JIFFY_OFFSET; -+} -+ -+/* -+ * Return the farthest past time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_smallest_from_now(void) -+{ -+ return jiffies - MAX_JIFFY_OFFSET; -+} -+ -+/** -+ * bfq_bfqq_expire - expire a queue. -+ * @bfqd: device owning the queue. -+ * @bfqq: the queue to expire. -+ * @compensate: if true, compensate for the time spent idling. -+ * @reason: the reason causing the expiration. -+ * -+ * If the process associated with bfqq does slow I/O (e.g., because it -+ * issues random requests), we charge bfqq with the time it has been -+ * in service instead of the service it has received (see -+ * bfq_bfqq_charge_time for details on how this goal is achieved). As -+ * a consequence, bfqq will typically get higher timestamps upon -+ * reactivation, and hence it will be rescheduled as if it had -+ * received more service than what it has actually received. In the -+ * end, bfqq receives less service in proportion to how slowly its -+ * associated process consumes its budgets (and hence how seriously it -+ * tends to lower the throughput). In addition, this time-charging -+ * strategy guarantees time fairness among slow processes. In -+ * contrast, if the process associated with bfqq is not slow, we -+ * charge bfqq exactly with the service it has received. -+ * -+ * Charging time to the first type of queues and the exact service to -+ * the other has the effect of using the WF2Q+ policy to schedule the -+ * former on a timeslice basis, without violating service domain -+ * guarantees among the latter. -+ */ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason) -+{ -+ bool slow; -+ unsigned long delta = 0; -+ struct bfq_entity *entity = &bfqq->entity; -+ int ref; -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ /* -+ * Check whether the process is slow (see bfq_bfqq_is_slow). -+ */ -+ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); -+ -+ /* -+ * Increase service_from_backlogged before next statement, -+ * because the possible next invocation of -+ * bfq_bfqq_charge_time would likely inflate -+ * entity->service. In contrast, service_from_backlogged must -+ * contain real service, to enable the soft real-time -+ * heuristic to correctly compute the bandwidth consumed by -+ * bfqq. 
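The soft_rt_next_start value computed above is the maximum of a bandwidth bound and a greediness bound. A worked example in plain units follows; HZ=250 and all numeric values are hypothetical, chosen only to make the arithmetic visible.

#include <stdio.h>

#define HZ 250UL

static unsigned long softrt_next_start(unsigned long last_idle_bklogged,
				       unsigned long serviced_sectors,
				       unsigned long max_softrt_rate, /* sectors/sec */
				       unsigned long now_jiffies,
				       unsigned long slice_idle_jiffies)
{
	/* earliest instant at which the measured bandwidth drops back
	 * below max_softrt_rate... */
	unsigned long bw_bound = last_idle_bklogged +
		HZ * serviced_sectors / max_softrt_rate;
	/* ...but never earlier than now + idle window + a few jiffies,
	 * to filter out greedy (non-isochronous) applications */
	unsigned long greedy_bound = now_jiffies + slice_idle_jiffies + 4;

	return bw_bound > greedy_bound ? bw_bound : greedy_bound;
}

int main(void)
{
	/* 10000 + 250*2048/8192 = 10062 > 10026, so the bandwidth bound wins */
	printf("next start: %lu\n",
	       softrt_next_start(10000, 2048, 8192, 10020, 2));
	return 0;
}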
-+ */ -+ bfqq->service_from_backlogged += entity->service; -+ -+ /* -+ * As above explained, charge slow (typically seeky) and -+ * timed-out queues with the time and not the service -+ * received, to favor sequential workloads. -+ * -+ * Processes doing I/O in the slower disk zones will tend to -+ * be slow(er) even if not seeky. Therefore, since the -+ * estimated peak rate is actually an average over the disk -+ * surface, these processes may timeout just for bad luck. To -+ * avoid punishing them, do not charge time to processes that -+ * succeeded in consuming at least 2/3 of their budget. This -+ * allows BFQ to preserve enough elasticity to still perform -+ * bandwidth, and not time, distribution with little unlucky -+ * or quasi-sequential processes. -+ */ -+ if (bfqq->wr_coeff == 1 && -+ (slow || -+ (reason == BFQ_BFQQ_BUDGET_TIMEOUT && -+ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) -+ bfq_bfqq_charge_time(bfqd, bfqq, delta); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ if (reason == BFQ_BFQQ_TOO_IDLE && -+ entity->service <= 2 * entity->budget / 10) -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ if (bfqd->low_latency && bfqq->wr_coeff == 1) -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && -+ RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ /* -+ * If we get here, and there are no outstanding -+ * requests, then the request pattern is isochronous -+ * (see the comments on the function -+ * bfq_bfqq_softrt_next_start()). Thus we can compute -+ * soft_rt_next_start. If, instead, the queue still -+ * has outstanding requests, then we have to wait for -+ * the completion of all the outstanding requests to -+ * discover whether the request pattern is actually -+ * isochronous. -+ */ -+ BUG_ON(bfqd->busy_queues < 1); -+ if (bfqq->dispatched == 0) { -+ bfqq->soft_rt_next_start = -+ bfq_bfqq_softrt_next_start(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", -+ bfqq->soft_rt_next_start); -+ } else { -+ /* -+ * The application is still waiting for the -+ * completion of one or more requests: -+ * prevent it from possibly being incorrectly -+ * deemed as soft real-time by setting its -+ * soft_rt_next_start to infinity. In fact, -+ * without this assignment, the application -+ * would be incorrectly deemed as soft -+ * real-time if: -+ * 1) it issued a new request before the -+ * completion of all its in-flight -+ * requests, and -+ * 2) at that time, its soft_rt_next_start -+ * happened to be in the past. -+ */ -+ bfqq->soft_rt_next_start = -+ bfq_greatest_from_now(); -+ /* -+ * Schedule an update of soft_rt_next_start to when -+ * the task may be discovered to be isochronous. -+ */ -+ bfq_mark_bfqq_softrt_update(bfqq); -+ } -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)", -+ reason, slow, bfqq->dispatched, -+ bfq_bfqq_idle_window(bfqq), entity->weight); -+ -+ /* -+ * Increase, decrease or leave budget unchanged according to -+ * reason. 
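For reference, the bfq_bfqq_is_slow() decision used in this expiration path can be modelled compactly as below: inconclusive intervals fall back on seekiness, and only intervals longer than 20 ms may mark a process slow, using the half-of-max-budget threshold that spares unlucky inner-zone I/O. The signature is illustrative.

#include <stdbool.h>

static bool queue_is_slow(unsigned int delta_usecs, int service,
			  int max_budget, bool seeky)
{
	bool slow = seeky;	/* default when the interval is inconclusive */

	if (delta_usecs < 1000)	/* too short to measure anything */
		return slow;
	if (delta_usecs > 20000)	/* long enough to filter out spikes */
		slow = service < max_budget / 2;
	return slow;
}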
-+ */ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); -+ BUG_ON(bfqq->next_rq == NULL && -+ bfqq->entity.budget < bfqq->entity.service); -+ ref = bfqq->ref; -+ __bfq_bfqq_expire(bfqd, bfqq); -+ -+ BUG_ON(ref > 1 && -+ !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && -+ !bfq_class_idle(bfqq)); -+ -+ /* mark bfqq as waiting a request only if a bic still points to it */ -+ if (ref > 1 && !bfq_bfqq_busy(bfqq) && -+ reason != BFQ_BFQQ_BUDGET_TIMEOUT && -+ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) -+ bfq_mark_bfqq_non_blocking_wait_rq(bfqq); -+} -+ -+/* -+ * Budget timeout is not implemented through a dedicated timer, but -+ * just checked on request arrivals and completions, as well as on -+ * idle timer expirations. -+ */ -+static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) -+{ -+ return time_is_before_eq_jiffies(bfqq->budget_timeout); -+} -+ -+/* -+ * If we expire a queue that is actively waiting (i.e., with the -+ * device idled) for the arrival of a new request, then we may incur -+ * the timestamp misalignment problem described in the body of the -+ * function __bfq_activate_entity. Hence we return true only if this -+ * condition does not hold, or if the queue is slow enough to deserve -+ * only to be kicked off for preserving a high throughput. -+ */ -+static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "may_budget_timeout: wait_request %d left %d timeout %d", -+ bfq_bfqq_wait_request(bfqq), -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, -+ bfq_bfqq_budget_timeout(bfqq)); -+ -+ return (!bfq_bfqq_wait_request(bfqq) || -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) -+ && -+ bfq_bfqq_budget_timeout(bfqq); -+} -+ -+/* -+ * For a queue that becomes empty, device idling is allowed only if -+ * this function returns true for that queue. As a consequence, since -+ * device idling plays a critical role for both throughput boosting -+ * and service guarantees, the return value of this function plays a -+ * critical role as well. -+ * -+ * In a nutshell, this function returns true only if idling is -+ * beneficial for throughput or, even if detrimental for throughput, -+ * idling is however necessary to preserve service guarantees (low -+ * latency, desired throughput distribution, ...). In particular, on -+ * NCQ-capable devices, this function tries to return false, so as to -+ * help keep the drives' internal queues full, whenever this helps the -+ * device boost the throughput without causing any service-guarantee -+ * issue. -+ * -+ * In more detail, the return value of this function is obtained by, -+ * first, computing a number of boolean variables that take into -+ * account throughput and service-guarantee issues, and, then, -+ * combining these variables in a logical expression. Most of the -+ * issues taken into account are not trivial. We discuss these issues -+ * while introducing the variables. -+ */ -+static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) -+{ -+ struct bfq_data *bfqd = bfqq->bfqd; -+ bool idling_boosts_thr, idling_boosts_thr_without_issues, -+ idling_needed_for_service_guarantees, -+ asymmetric_scenario; -+ -+ if (bfqd->strict_guarantees) -+ return true; -+ -+ /* -+ * The next variable takes into account the cases where idling -+ * boosts the throughput. 
-+ * -+ * The value of the variable is computed considering, first, that -+ * idling is virtually always beneficial for the throughput if: -+ * (a) the device is not NCQ-capable, or -+ * (b) regardless of the presence of NCQ, the device is rotational -+ * and the request pattern for bfqq is I/O-bound and sequential. -+ * -+ * Secondly, and in contrast to the above item (b), idling an -+ * NCQ-capable flash-based device would not boost the -+ * throughput even with sequential I/O; rather it would lower -+ * the throughput in proportion to how fast the device -+ * is. Accordingly, the next variable is true if any of the -+ * above conditions (a) and (b) is true, and, in particular, -+ * happens to be false if bfqd is an NCQ-capable flash-based -+ * device. -+ */ -+ idling_boosts_thr = !bfqd->hw_tag || -+ (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && -+ bfq_bfqq_idle_window(bfqq)); -+ -+ /* -+ * The value of the next variable, -+ * idling_boosts_thr_without_issues, is equal to that of -+ * idling_boosts_thr, unless a special case holds. In this -+ * special case, described below, idling may cause problems to -+ * weight-raised queues. -+ * -+ * When the request pool is saturated (e.g., in the presence -+ * of write hogs), if the processes associated with -+ * non-weight-raised queues ask for requests at a lower rate, -+ * then processes associated with weight-raised queues have a -+ * higher probability to get a request from the pool -+ * immediately (or at least soon) when they need one. Thus -+ * they have a higher probability to actually get a fraction -+ * of the device throughput proportional to their high -+ * weight. This is especially true with NCQ-capable drives, -+ * which enqueue several requests in advance, and further -+ * reorder internally-queued requests. -+ * -+ * For this reason, we force to false the value of -+ * idling_boosts_thr_without_issues if there are weight-raised -+ * busy queues. In this case, and if bfqq is not weight-raised, -+ * this guarantees that the device is not idled for bfqq (if, -+ * instead, bfqq is weight-raised, then idling will be -+ * guaranteed by another variable, see below). Combined with -+ * the timestamping rules of BFQ (see [1] for details), this -+ * behavior causes bfqq, and hence any sync non-weight-raised -+ * queue, to get a lower number of requests served, and thus -+ * to ask for a lower number of requests from the request -+ * pool, before the busy weight-raised queues get served -+ * again. This often mitigates starvation problems in the -+ * presence of heavy write workloads and NCQ, thereby -+ * guaranteeing a higher application and system responsiveness -+ * in these hostile scenarios. -+ */ -+ idling_boosts_thr_without_issues = idling_boosts_thr && -+ bfqd->wr_busy_queues == 0; -+ -+ /* -+ * There is then a case where idling must be performed not -+ * for throughput concerns, but to preserve service -+ * guarantees. -+ * -+ * To introduce this case, we can note that allowing the drive -+ * to enqueue more than one request at a time, and hence -+ * delegating de facto final scheduling decisions to the -+ * drive's internal scheduler, entails loss of control on the -+ * actual request service order. In particular, the critical -+ * situation is when requests from different processes happen -+ * to be present, at the same time, in the internal queue(s) -+ * of the drive. 
In such a situation, the drive, by deciding -+ * the service order of the internally-queued requests, does -+ * determine also the actual throughput distribution among -+ * these processes. But the drive typically has no notion or -+ * concern about per-process throughput distribution, and -+ * makes its decisions only on a per-request basis. Therefore, -+ * the service distribution enforced by the drive's internal -+ * scheduler is likely to coincide with the desired -+ * device-throughput distribution only in a completely -+ * symmetric scenario where: -+ * (i) each of these processes must get the same throughput as -+ * the others; -+ * (ii) all these processes have the same I/O pattern -+ * (either sequential or random). -+ * In fact, in such a scenario, the drive will tend to treat -+ * the requests of each of these processes in about the same -+ * way as the requests of the others, and thus to provide -+ * each of these processes with about the same throughput -+ * (which is exactly the desired throughput distribution). In -+ * contrast, in any asymmetric scenario, device idling is -+ * certainly needed to guarantee that bfqq receives its -+ * assigned fraction of the device throughput (see [1] for -+ * details). -+ * -+ * We address this issue by controlling, actually, only the -+ * symmetry sub-condition (i), i.e., provided that -+ * sub-condition (i) holds, idling is not performed, -+ * regardless of whether sub-condition (ii) holds. In other -+ * words, only if sub-condition (i) holds, then idling is -+ * allowed, and the device tends to be prevented from queueing -+ * many requests, possibly of several processes. The reason -+ * for not controlling also sub-condition (ii) is that we -+ * exploit preemption to preserve guarantees in case of -+ * symmetric scenarios, even if (ii) does not hold, as -+ * explained in the next two paragraphs. -+ * -+ * Even if a queue, say Q, is expired when it remains idle, Q -+ * can still preempt the new in-service queue if the next -+ * request of Q arrives soon (see the comments on -+ * bfq_bfqq_update_budg_for_activation). If all queues and -+ * groups have the same weight, this form of preemption, -+ * combined with the hole-recovery heuristic described in the -+ * comments on function bfq_bfqq_update_budg_for_activation, -+ * are enough to preserve a correct bandwidth distribution in -+ * the mid term, even without idling. In fact, even if not -+ * idling allows the internal queues of the device to contain -+ * many requests, and thus to reorder requests, we can rather -+ * safely assume that the internal scheduler still preserves a -+ * minimum of mid-term fairness. The motivation for using -+ * preemption instead of idling is that, by not idling, -+ * service guarantees are preserved without minimally -+ * sacrificing throughput. In other words, both a high -+ * throughput and its desired distribution are obtained. -+ * -+ * More precisely, this preemption-based, idleless approach -+ * provides fairness in terms of IOPS, and not sectors per -+ * second. This can be seen with a simple example. Suppose -+ * that there are two queues with the same weight, but that -+ * the first queue receives requests of 8 sectors, while the -+ * second queue receives requests of 1024 sectors. In -+ * addition, suppose that each of the two queues contains at -+ * most one request at a time, which implies that each queue -+ * always remains idle after it is served. Finally, after -+ * remaining idle, each queue receives very quickly a new -+ * request. 
It follows that the two queues are served
-+ * alternatively, preempting each other if needed. This
-+ * implies that, although both queues have the same weight,
-+ * the queue with large requests receives a service that is
-+ * 1024/8 times as high as the service received by the other
-+ * queue.
-+ *
-+ * On the other hand, device idling is performed, and thus
-+ * pure sector-domain guarantees are provided, for the
-+ * following queues, which are likely to need stronger
-+ * throughput guarantees: weight-raised queues, and queues
-+ * with a higher weight than other queues. When such queues
-+ * are active, sub-condition (i) is false, which triggers
-+ * device idling.
-+ *
-+ * According to the above considerations, the next variable is
-+ * true (only) if sub-condition (i) holds. To compute the
-+ * value of this variable, we not only use the return value of
-+ * the function bfq_symmetric_scenario(), but also check
-+ * whether bfqq is being weight-raised, because
-+ * bfq_symmetric_scenario() does not take into account also
-+ * weight-raised queues (see comments on
-+ * bfq_weights_tree_add()).
-+ *
-+ * As a side note, it is worth considering that the above
-+ * device-idling countermeasures may however fail in the
-+ * following unlucky scenario: if idling is (correctly)
-+ * disabled in a time period during which all symmetry
-+ * sub-conditions hold, and hence the device is allowed to
-+ * enqueue many requests, but at some later point in time some
-+ * sub-condition stops holding, then it may become impossible
-+ * to let requests be served in the desired order until all
-+ * the requests already queued in the device have been served.
-+ */
-+ asymmetric_scenario = bfqq->wr_coeff > 1 ||
-+ !bfq_symmetric_scenario(bfqd);
-+
-+ /*
-+ * Finally, there is a case where maximizing throughput is the
-+ * best choice even if it may cause unfairness toward
-+ * bfqq. Such a case is when bfqq became active in a burst of
-+ * queue activations. Queues that became active during a large
-+ * burst benefit only from throughput, as discussed in the
-+ * comments on bfq_handle_burst. Thus, if bfqq became active
-+ * in a burst and not idling the device maximizes throughput,
-+ * then the device must not be idled, because not idling the
-+ * device provides bfqq and all other queues in the burst with
-+ * maximum benefit. Combining this and the above case, we can
-+ * now establish when idling is actually needed to preserve
-+ * service guarantees.
-+ */
-+ idling_needed_for_service_guarantees =
-+ asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
-+
-+ /*
-+ * We now have all the components we need to compute the return
-+ * value of the function, which is true only if both the following
-+ * conditions hold:
-+ * 1) bfqq is sync, because idling makes sense only for sync queues;
-+ * 2) idling either boosts the throughput (without issues), or
-+ * is necessary to preserve service guarantees.
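
The 1024/8 example in the comment above can be replayed in a few lines of stand-alone C: with strictly alternating service and one request per queue per round, both queues complete the same number of requests, yet the sector-domain service differs by a factor of 1024/8 = 128. Purely illustrative numbers, not part of the patch.

#include <stdio.h>

/* Two queues of equal weight served strictly alternately, one
 * request each per round: equal IOPS, very different sector rates. */
int main(void)
{
        const long small_rq = 8, large_rq = 1024;  /* sectors per request */
        long rounds = 1000, small_sectors = 0, large_sectors = 0;

        for (long i = 0; i < rounds; i++) {
                small_sectors += small_rq;  /* one request from queue 1 */
                large_sectors += large_rq;  /* one request from queue 2 */
        }

        printf("requests: %ld vs %ld\n", rounds, rounds);
        printf("sector ratio: %ld\n", large_sectors / small_sectors); /* 128 */
        return 0;
}
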
-+ */ -+ bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", -+ bfq_bfqq_sync(bfqq), idling_boosts_thr); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", -+ bfqd->wr_busy_queues, -+ idling_boosts_thr_without_issues, -+ bfq_bfqq_IO_bound(bfqq), -+ idling_needed_for_service_guarantees); -+ -+ return bfq_bfqq_sync(bfqq) && -+ (idling_boosts_thr_without_issues || -+ idling_needed_for_service_guarantees); -+} -+ -+/* -+ * If the in-service queue is empty but the function bfq_bfqq_may_idle -+ * returns true, then: -+ * 1) the queue must remain in service and cannot be expired, and -+ * 2) the device must be idled to wait for the possible arrival of a new -+ * request for the queue. -+ * See the comments on the function bfq_bfqq_may_idle for the reasons -+ * why performing device idling is the best choice to boost the throughput -+ * and preserve service guarantees when bfq_bfqq_may_idle itself -+ * returns true. -+ */ -+static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) -+{ -+ struct bfq_data *bfqd = bfqq->bfqd; -+ -+ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && -+ bfq_bfqq_may_idle(bfqq); -+} -+ -+/* -+ * Select a queue for service. If we have a current queue in service, -+ * check whether to continue servicing it, or retrieve and set a new one. -+ */ -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ struct request *next_rq; -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ -+ bfqq = bfqd->in_service_queue; -+ if (!bfqq) -+ goto new_queue; -+ -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); -+ -+ if (bfq_may_expire_for_budg_timeout(bfqq) && -+ !hrtimer_active(&bfqd->idle_slice_timer) && -+ !bfq_bfqq_must_idle(bfqq)) -+ goto expire; -+ -+check_queue: -+ /* -+ * This loop is rarely executed more than once. Even when it -+ * happens, it is much more convenient to re-execute this loop -+ * than to return NULL and trigger a new dispatch to get a -+ * request served. -+ */ -+ next_rq = bfqq->next_rq; -+ /* -+ * If bfqq has requests queued and it has enough budget left to -+ * serve them, keep the queue, otherwise expire it. -+ */ -+ if (next_rq) { -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (bfq_serv_to_charge(next_rq, bfqq) > -+ bfq_bfqq_budget_left(bfqq)) { -+ /* -+ * Expire the queue for budget exhaustion, -+ * which makes sure that the next budget is -+ * enough to serve the next request, even if -+ * it comes from the fifo expired path. -+ */ -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; -+ goto expire; -+ } else { -+ /* -+ * The idle timer may be pending because we may -+ * not disable disk idling even when a new request -+ * arrives. -+ */ -+ if (bfq_bfqq_wait_request(bfqq)) { -+ BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); -+ /* -+ * If we get here: 1) at least a new request -+ * has arrived but we have not disabled the -+ * timer because the request was too small, -+ * 2) then the block layer has unplugged -+ * the device, causing the dispatch to be -+ * invoked. -+ * -+ * Since the device is unplugged, now the -+ * requests are probably large enough to -+ * provide a reasonable throughput. -+ * So we disable idling. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ } -+ goto keep_queue; -+ } -+ } -+ -+ /* -+ * No requests pending. 
However, if the in-service queue is idling -+ * for a new request, or has requests waiting for a completion and -+ * may idle after their completion, then keep it anyway. -+ */ -+ if (hrtimer_active(&bfqd->idle_slice_timer) || -+ (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { -+ bfqq = NULL; -+ goto keep_queue; -+ } -+ -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS; -+expire: -+ bfq_bfqq_expire(bfqd, bfqq, false, reason); -+new_queue: -+ bfqq = bfq_set_in_service_queue(bfqd); -+ if (bfqq) { -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); -+ goto check_queue; -+ } -+keep_queue: -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); -+ else -+ bfq_log(bfqd, "select_queue: no queue returned"); -+ -+ return bfqq; -+} -+ -+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ -+ BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != -+ entity->orig_weight * bfqq->wr_coeff); -+ if (entity->prio_changed) -+ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); -+ -+ /* -+ * If the queue was activated in a burst, or too much -+ * time has elapsed from the beginning of this -+ * weight-raising period, then end weight raising. -+ */ -+ if (bfq_bfqq_in_large_burst(bfqq)) -+ bfq_bfqq_end_wr(bfqq); -+ else if (time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time)) { -+ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || -+ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) -+ bfq_bfqq_end_wr(bfqq); -+ else { -+ /* switch back to interactive wr */ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ bfqq->last_wr_start_finish = -+ bfqq->wr_start_at_switch_to_srt; -+ BUG_ON(time_is_after_jiffies( -+ bfqq->last_wr_start_finish)); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "back to interactive wr"); -+ } -+ } -+ } -+ /* -+ * To improve latency (for this or other queues), immediately -+ * update weight both if it must be raised and if it must be -+ * lowered. Since, entity may be on some active tree here, and -+ * might have a pending change of its ioprio class, invoke -+ * next function with the last parameter unset (see the -+ * comments on the function). -+ */ -+ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) -+ __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity), -+ entity, false); -+} -+ -+/* -+ * Dispatch one request from bfqq, moving it to the request queue -+ * dispatch list. 
-+ */
-+static int bfq_dispatch_request(struct bfq_data *bfqd,
-+ struct bfq_queue *bfqq)
-+{
-+ int dispatched = 0;
-+ struct request *rq = bfqq->next_rq;
-+ unsigned long service_to_charge;
-+
-+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
-+ BUG_ON(!rq);
-+ service_to_charge = bfq_serv_to_charge(rq, bfqq);
-+
-+ BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq));
-+
-+ BUG_ON(bfqq->entity.budget < bfqq->entity.service);
-+
-+ bfq_bfqq_served(bfqq, service_to_charge);
-+
-+ BUG_ON(bfqq->entity.budget < bfqq->entity.service);
-+
-+ bfq_dispatch_insert(bfqd->queue, rq);
-+
-+ /*
-+ * If weight raising has to terminate for bfqq, then next
-+ * function causes an immediate update of bfqq's weight,
-+ * without waiting for next activation. As a consequence, on
-+ * expiration, bfqq will be timestamped as if it has never been
-+ * weight-raised during this service slot, even if it has
-+ * received part or even most of the service as a
-+ * weight-raised queue. This inflates bfqq's timestamps, which
-+ * is beneficial, as bfqq is then more willing to leave the
-+ * device immediately to possible other weight-raised queues.
-+ */
-+ bfq_update_wr_data(bfqd, bfqq);
-+
-+ bfq_log_bfqq(bfqd, bfqq,
-+ "dispatched %u sec req (%llu), budg left %d",
-+ blk_rq_sectors(rq),
-+ (unsigned long long) blk_rq_pos(rq),
-+ bfq_bfqq_budget_left(bfqq));
-+
-+ dispatched++;
-+
-+ if (!bfqd->in_service_bic) {
-+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
-+ bfqd->in_service_bic = RQ_BIC(rq);
-+ }
-+
-+ if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
-+ goto expire;
-+
-+ return dispatched;
-+
-+expire:
-+ bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED);
-+ return dispatched;
-+}
-+
-+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
-+{
-+ int dispatched = 0;
-+
-+ while (bfqq->next_rq) {
-+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
-+ dispatched++;
-+ }
-+
-+ BUG_ON(!list_empty(&bfqq->fifo));
-+ return dispatched;
-+}
-+
-+/*
-+ * Drain our current requests.
-+ * Used for barriers and when switching io schedulers on-the-fly.
-+ */
-+static int bfq_forced_dispatch(struct bfq_data *bfqd)
-+{
-+ struct bfq_queue *bfqq, *n;
-+ struct bfq_service_tree *st;
-+ int dispatched = 0;
-+
-+ bfqq = bfqd->in_service_queue;
-+ if (bfqq)
-+ __bfq_bfqq_expire(bfqd, bfqq);
-+
-+ /*
-+ * Loop through classes, and be careful to leave the scheduler
-+ * in a consistent state, as feedback mechanisms and vtime
-+ * updates cannot be disabled during the process.
-+ */
-+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
-+ st = bfq_entity_service_tree(&bfqq->entity);
-+
-+ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
-+
-+ bfqq->max_budget = bfq_max_budget(bfqd);
-+ bfq_forget_idle(st);
-+ }
-+
-+ BUG_ON(bfqd->busy_queues != 0);
-+
-+ return dispatched;
-+}
-+
-+static int bfq_dispatch_requests(struct request_queue *q, int force)
-+{
-+ struct bfq_data *bfqd = q->elevator->elevator_data;
-+ struct bfq_queue *bfqq;
-+
-+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
-+
-+ if (bfqd->busy_queues == 0)
-+ return 0;
-+
-+ if (unlikely(force))
-+ return bfq_forced_dispatch(bfqd);
-+
-+ /*
-+ * Force device to serve one request at a time if
-+ * strict_guarantees is true. Forcing this service scheme is
-+ * currently the ONLY way to guarantee that the request
-+ * service order enforced by the scheduler is respected by a
-+ * queueing device. Otherwise the device is free even to make
-+ * some unlucky request wait for as long as the device
-+ * wishes.
-+ *
-+ * Of course, serving one request at a time may cause loss of
-+ * throughput.
-+ */
-+ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
-+ return 0;
-+
-+ bfqq = bfq_select_queue(bfqd);
-+ if (!bfqq)
-+ return 0;
-+
-+ BUG_ON(bfqq->entity.budget < bfqq->entity.service);
-+
-+ BUG_ON(bfq_bfqq_wait_request(bfqq));
-+
-+ if (!bfq_dispatch_request(bfqd, bfqq))
-+ return 0;
-+
-+ bfq_log_bfqq(bfqd, bfqq, "dispatched %s request",
-+ bfq_bfqq_sync(bfqq) ? "sync" : "async");
-+
-+ BUG_ON(bfqq->next_rq == NULL &&
-+ bfqq->entity.budget < bfqq->entity.service);
-+ return 1;
-+}
-+
-+/*
-+ * Task holds one reference to the queue, dropped when task exits. Each rq
-+ * in-flight on this queue also holds a reference, dropped when rq is freed.
-+ *
-+ * Queue lock must be held here. Recall not to use bfqq after calling
-+ * this function on it.
-+ */
-+static void bfq_put_queue(struct bfq_queue *bfqq)
-+{
-+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED
-+ struct bfq_group *bfqg = bfqq_group(bfqq);
-+#endif
-+
-+ BUG_ON(bfqq->ref <= 0);
-+
-+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref);
-+ bfqq->ref--;
-+ if (bfqq->ref)
-+ return;
-+
-+ BUG_ON(rb_first(&bfqq->sort_list));
-+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
-+ BUG_ON(bfqq->entity.tree);
-+ BUG_ON(bfq_bfqq_busy(bfqq));
-+
-+ if (bfq_bfqq_sync(bfqq))
-+ /*
-+ * The fact that this queue is being destroyed does not
-+ * invalidate the fact that this queue may have been
-+ * activated during the current burst. As a consequence,
-+ * although the queue does not exist anymore, and hence
-+ * needs to be removed from the burst list if there,
-+ * the burst size must not be decremented.
-+ */
-+ hlist_del_init(&bfqq->burst_list_node);
-+
-+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
-+
-+ kmem_cache_free(bfq_pool, bfqq);
-+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED
-+ bfqg_put(bfqg);
-+#endif
-+}
-+
-+static void bfq_put_cooperator(struct bfq_queue *bfqq)
-+{
-+ struct bfq_queue *__bfqq, *next;
-+
-+ /*
-+ * If this queue was scheduled to merge with another queue, be
-+ * sure to drop the reference taken on that queue (and others in
-+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
-+ */
-+ __bfqq = bfqq->new_bfqq;
-+ while (__bfqq) {
-+ if (__bfqq == bfqq)
-+ break;
-+ next = __bfqq->new_bfqq;
-+ bfq_put_queue(__bfqq);
-+ __bfqq = next;
-+ }
-+}
-+
-+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-+{
-+ if (bfqq == bfqd->in_service_queue) {
-+ __bfq_bfqq_expire(bfqd, bfqq);
-+ bfq_schedule_dispatch(bfqd);
-+ }
-+
-+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
-+
-+ bfq_put_cooperator(bfqq);
-+
-+ bfq_put_queue(bfqq); /* release process reference */
-+}
-+
-+static void bfq_init_icq(struct io_cq *icq)
-+{
-+ icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32);
-+}
-+
-+static void bfq_exit_icq(struct io_cq *icq)
-+{
-+ struct bfq_io_cq *bic = icq_to_bic(icq);
-+ struct bfq_data *bfqd = bic_to_bfqd(bic);
-+
-+ if (bic_to_bfqq(bic, false)) {
-+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false));
-+ bic_set_bfqq(bic, NULL, false);
-+ }
-+
-+ if (bic_to_bfqq(bic, true)) {
-+ /*
-+ * If the bic is using a shared queue, put the reference
-+ * taken on the io_context when the bic started using a
-+ * shared bfq_queue.
-+ */ -+ if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) -+ put_io_context(icq->ioc); -+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); -+ bic_set_bfqq(bic, NULL, true); -+ } -+} -+ -+/* -+ * Update the entity prio values; note that the new values will not -+ * be used until the next (re)activation. -+ */ -+static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ struct task_struct *tsk = current; -+ int ioprio_class; -+ -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ switch (ioprio_class) { -+ default: -+ dev_err(bfqq->bfqd->queue->backing_dev_info->dev, -+ "bfq: bad prio class %d\n", ioprio_class); -+ case IOPRIO_CLASS_NONE: -+ /* -+ * No prio set, inherit CPU scheduling settings. -+ */ -+ bfqq->new_ioprio = task_nice_ioprio(tsk); -+ bfqq->new_ioprio_class = task_nice_ioclass(tsk); -+ break; -+ case IOPRIO_CLASS_RT: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_RT; -+ break; -+ case IOPRIO_CLASS_BE: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_BE; -+ break; -+ case IOPRIO_CLASS_IDLE: -+ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; -+ bfqq->new_ioprio = 7; -+ bfq_clear_bfqq_idle_window(bfqq); -+ break; -+ } -+ -+ if (bfqq->new_ioprio >= IOPRIO_BE_NR) { -+ pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", -+ bfqq->new_ioprio); -+ BUG(); -+ } -+ -+ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "set_next_ioprio_data: bic_class %d prio %d class %d", -+ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); -+} -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) -+{ -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct bfq_queue *bfqq; -+ unsigned long uninitialized_var(flags); -+ int ioprio = bic->icq.ioc->ioprio; -+ -+ /* -+ * This condition may trigger on a newly created bic, be sure to -+ * drop the lock before returning. 
-+ */ -+ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) -+ return; -+ -+ bic->ioprio = ioprio; -+ -+ bfqq = bic_to_bfqq(bic, false); -+ if (bfqq) { -+ /* release process reference on this queue */ -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); -+ bic_set_bfqq(bic, bfqq, false); -+ bfq_log_bfqq(bfqd, bfqq, -+ "check_ioprio_change: bfqq %p %d", -+ bfqq, bfqq->ref); -+ } -+ -+ bfqq = bic_to_bfqq(bic, true); -+ if (bfqq) -+ bfq_set_next_ioprio_data(bfqq, bic); -+} -+ -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic, pid_t pid, int is_sync) -+{ -+ RB_CLEAR_NODE(&bfqq->entity.rb_node); -+ INIT_LIST_HEAD(&bfqq->fifo); -+ INIT_HLIST_NODE(&bfqq->burst_list_node); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ -+ bfqq->ref = 0; -+ bfqq->bfqd = bfqd; -+ -+ if (bic) -+ bfq_set_next_ioprio_data(bfqq, bic); -+ -+ if (is_sync) { -+ if (!bfq_class_idle(bfqq)) -+ bfq_mark_bfqq_idle_window(bfqq); -+ bfq_mark_bfqq_sync(bfqq); -+ bfq_mark_bfqq_just_created(bfqq); -+ } else -+ bfq_clear_bfqq_sync(bfqq); -+ bfq_mark_bfqq_IO_bound(bfqq); -+ -+ /* Tentative initial value to trade off between thr and lat */ -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; -+ bfqq->pid = pid; -+ -+ bfqq->wr_coeff = 1; -+ bfqq->last_wr_start_finish = jiffies; -+ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); -+ bfqq->budget_timeout = bfq_smallest_from_now(); -+ bfqq->split_time = bfq_smallest_from_now(); -+ -+ /* -+ * Set to the value for which bfqq will not be deemed as -+ * soft rt when it becomes backlogged. -+ */ -+ bfqq->soft_rt_next_start = bfq_greatest_from_now(); -+ -+ /* first request is almost certainly seeky */ -+ bfqq->seek_history = 1; -+} -+ -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, -+ struct bfq_group *bfqg, -+ int ioprio_class, int ioprio) -+{ -+ switch (ioprio_class) { -+ case IOPRIO_CLASS_RT: -+ return &bfqg->async_bfqq[0][ioprio]; -+ case IOPRIO_CLASS_NONE: -+ ioprio = IOPRIO_NORM; -+ /* fall through */ -+ case IOPRIO_CLASS_BE: -+ return &bfqg->async_bfqq[1][ioprio]; -+ case IOPRIO_CLASS_IDLE: -+ return &bfqg->async_idle_bfqq; -+ default: -+ BUG(); -+ } -+} -+ -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, bool is_sync, -+ struct bfq_io_cq *bic) -+{ -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ struct bfq_queue **async_bfqq = NULL; -+ struct bfq_queue *bfqq; -+ struct bfq_group *bfqg; -+ -+ rcu_read_lock(); -+ -+ bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); -+ if (!bfqg) { -+ bfqq = &bfqd->oom_bfqq; -+ goto out; -+ } -+ -+ if (!is_sync) { -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, -+ ioprio); -+ bfqq = *async_bfqq; -+ if (bfqq) -+ goto out; -+ } -+ -+ bfqq = kmem_cache_alloc_node(bfq_pool, -+ GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, -+ bfqd->queue->node); -+ -+ if (bfqq) { -+ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, -+ is_sync); -+ bfq_init_entity(&bfqq->entity, bfqg); -+ bfq_log_bfqq(bfqd, bfqq, "allocated"); -+ } else { -+ bfqq = &bfqd->oom_bfqq; -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); -+ goto out; -+ } -+ -+ /* -+ * Pin the queue now that it's allocated, scheduler exit will -+ * prune it. -+ */ -+ if (async_bfqq) { -+ bfqq->ref++; /* -+ * Extra group reference, w.r.t. sync -+ * queue. This extra reference is removed -+ * only if bfqq->bfqg disappears, to -+ * guarantee that this queue is not freed -+ * until its group goes away. 
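
The slot selection in bfq_async_queue_prio() above can be mirrored in user space as follows. The class constants and array sizes here are illustrative stand-ins for the kernel's IOPRIO_* values, chosen only to match their usual order (NONE, RT, BE, IDLE); none of these names come from the patch.

#include <stdio.h>

enum { IO_NONE, IO_RT, IO_BE, IO_IDLE };        /* illustrative class ids */
#define IO_NORM   4                             /* default level in a class */
#define IO_LEVELS 8

static int rt_slot[IO_LEVELS], be_slot[IO_LEVELS], idle_slot;

/* Same shape as bfq_async_queue_prio(): RT and BE get one slot per
 * priority level, NONE maps to BE at the default level, and the
 * whole IDLE class shares a single slot. */
static int *async_slot(int class, int level)
{
        switch (class) {
        case IO_RT:
                return &rt_slot[level];
        case IO_NONE:
                level = IO_NORM;
                /* fall through */
        case IO_BE:
                return &be_slot[level];
        default:        /* IO_IDLE */
                return &idle_slot;
        }
}

int main(void)
{
        /* A NONE-class queue lands in the BE row at the default level. */
        printf("%d\n", async_slot(IO_NONE, 2) == &be_slot[IO_NORM]); /* 1 */
        return 0;
}
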
-+ */ -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", -+ bfqq, bfqq->ref); -+ *async_bfqq = bfqq; -+ } -+ -+out: -+ bfqq->ref++; /* get a process reference to this queue */ -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); -+ rcu_read_unlock(); -+ return bfqq; -+} -+ -+static void bfq_update_io_thinktime(struct bfq_data *bfqd, -+ struct bfq_io_cq *bic) -+{ -+ struct bfq_ttime *ttime = &bic->ttime; -+ u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; -+ -+ elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); -+ -+ ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; -+ ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); -+ ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, -+ ttime->ttime_samples); -+} -+ -+static void -+bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ bfqq->seek_history <<= 1; -+ bfqq->seek_history |= -+ get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR && -+ (!blk_queue_nonrot(bfqd->queue) || -+ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); -+} -+ -+/* -+ * Disable idle window if the process thinks too long or seeks so much that -+ * it doesn't matter. -+ */ -+static void bfq_update_idle_window(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ int enable_idle; -+ -+ /* Don't idle for async or idle io prio class. */ -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) -+ return; -+ -+ /* Idle window just restored, statistics are meaningless. */ -+ if (time_is_after_eq_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) -+ return; -+ -+ enable_idle = bfq_bfqq_idle_window(bfqq); -+ -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || -+ bfqd->bfq_slice_idle == 0 || -+ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && -+ bfqq->wr_coeff == 1)) -+ enable_idle = 0; -+ else if (bfq_sample_valid(bic->ttime.ttime_samples)) { -+ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && -+ bfqq->wr_coeff == 1) -+ enable_idle = 0; -+ else -+ enable_idle = 1; -+ } -+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", -+ enable_idle); -+ -+ if (enable_idle) -+ bfq_mark_bfqq_idle_window(bfqq); -+ else -+ bfq_clear_bfqq_idle_window(bfqq); -+} -+ -+/* -+ * Called when a new fs request (rq) is added to bfqq. Check if there's -+ * something we should do about it. -+ */ -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ struct bfq_io_cq *bic = RQ_BIC(rq); -+ -+ if (rq->cmd_flags & REQ_META) -+ bfqq->meta_pending++; -+ -+ bfq_update_io_thinktime(bfqd, bic); -+ bfq_update_io_seektime(bfqd, bfqq, rq); -+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || -+ !BFQQ_SEEKY(bfqq)) -+ bfq_update_idle_window(bfqd, bfqq, bic); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "rq_enqueued: idle_window=%d (seeky %d)", -+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); -+ -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ -+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { -+ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && -+ blk_rq_sectors(rq) < 32; -+ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); -+ -+ /* -+ * There is just this request queued: if the request -+ * is small and the queue is not to be expired, then -+ * just exit. -+ * -+ * In this way, if the device is being idled to wait -+ * for a new request from the in-service queue, we -+ * avoid unplugging the device and committing the -+ * device to serve just a small request. 
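
The think-time statistics updated in bfq_update_io_thinktime() above use a fixed-point exponentially weighted average: each update keeps 7/8 of the old value, and the factor 256 gives the sample counter and the mean 8 bits of fractional headroom. A stand-alone sketch of the same arithmetic (plain division instead of the kernel's div helpers; not part of the patch):

#include <stdint.h>
#include <stdio.h>

struct ttime {
        uint64_t samples;       /* saturates toward 256 */
        uint64_t total;         /* decayed sum, scaled by 256 */
        uint64_t mean;          /* total / samples, rounded */
};

/* Same update as above: 7/8 decay plus the new sample at weight 256. */
static void ttime_update(struct ttime *t, uint64_t elapsed_ns)
{
        t->samples = (7 * t->samples + 256) / 8;
        t->total = (7 * t->total + 256 * elapsed_ns) / 8;
        t->mean = (t->total + 128) / t->samples;
}

int main(void)
{
        struct ttime t = { 0, 0, 0 };

        for (int i = 0; i < 16; i++)
                ttime_update(&t, 1000000);      /* steady 1 ms think time */
        printf("mean ~ %llu ns\n", (unsigned long long)t.mean);
        return 0;
}
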
On the -+ * contrary, we wait for the block layer to decide -+ * when to unplug the device: hopefully, new requests -+ * will be merged to this one quickly, then the device -+ * will be unplugged and larger requests will be -+ * dispatched. -+ */ -+ if (small_req && !budget_timeout) -+ return; -+ -+ /* -+ * A large enough request arrived, or the queue is to -+ * be expired: in both cases disk idling is to be -+ * stopped, so clear wait_request flag and reset -+ * timer. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ -+ /* -+ * The queue is not empty, because a new request just -+ * arrived. Hence we can safely expire the queue, in -+ * case of budget timeout, without risking that the -+ * timestamps of the queue are not updated correctly. -+ * See [1] for more details. -+ */ -+ if (budget_timeout) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ -+ /* -+ * Let the request rip immediately, or let a new queue be -+ * selected if bfqq has just been expired. -+ */ -+ __blk_run_queue(bfqd->queue); -+ } -+} -+ -+static void bfq_insert_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; -+ -+ assert_spin_locked(bfqd->queue->queue_lock); -+ -+ /* -+ * An unplug may trigger a requeue of a request from the device -+ * driver: make sure we are in process context while trying to -+ * merge two bfq_queues. -+ */ -+ if (!in_interrupt()) { -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); -+ if (new_bfqq) { -+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) -+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); -+ /* -+ * Release the request's reference to the old bfqq -+ * and make sure one is taken to the shared queue. -+ */ -+ new_bfqq->allocated[rq_data_dir(rq)]++; -+ bfqq->allocated[rq_data_dir(rq)]--; -+ new_bfqq->ref++; -+ bfq_clear_bfqq_just_created(bfqq); -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), -+ bfqq, new_bfqq); -+ /* -+ * rq is about to be enqueued into new_bfqq, -+ * release rq reference on bfqq -+ */ -+ bfq_put_queue(bfqq); -+ rq->elv.priv[1] = new_bfqq; -+ bfqq = new_bfqq; -+ } -+ } -+ -+ bfq_add_request(rq); -+ -+ rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; -+ list_add_tail(&rq->queuelist, &bfqq->fifo); -+ -+ bfq_rq_enqueued(bfqd, bfqq, rq); -+} -+ -+static void bfq_update_hw_tag(struct bfq_data *bfqd) -+{ -+ bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, -+ bfqd->rq_in_driver); -+ -+ if (bfqd->hw_tag == 1) -+ return; -+ -+ /* -+ * This sample is valid if the number of outstanding requests -+ * is large enough to allow a queueing behavior. Note that the -+ * sum is not exact, as it's not taking into account deactivated -+ * requests. 
-+ */
-+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
-+ return;
-+
-+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
-+ return;
-+
-+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
-+ bfqd->max_rq_in_driver = 0;
-+ bfqd->hw_tag_samples = 0;
-+}
-+
-+static void bfq_completed_request(struct request_queue *q, struct request *rq)
-+{
-+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
-+ struct bfq_data *bfqd = bfqq->bfqd;
-+ u64 now_ns;
-+ u32 delta_us;
-+
-+ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left",
-+ blk_rq_sectors(rq));
-+
-+ assert_spin_locked(bfqd->queue->queue_lock);
-+ bfq_update_hw_tag(bfqd);
-+
-+ BUG_ON(!bfqd->rq_in_driver);
-+ BUG_ON(!bfqq->dispatched);
-+ bfqd->rq_in_driver--;
-+ bfqq->dispatched--;
-+ bfqg_stats_update_completion(bfqq_group(bfqq),
-+ rq_start_time_ns(rq),
-+ rq_io_start_time_ns(rq),
-+ rq->cmd_flags);
-+
-+ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
-+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
-+ /*
-+ * Set budget_timeout (which we overload to store the
-+ * time at which the queue remains with no backlog and
-+ * no outstanding request; used by the weight-raising
-+ * mechanism).
-+ */
-+ bfqq->budget_timeout = jiffies;
-+
-+ bfq_weights_tree_remove(bfqd, &bfqq->entity,
-+ &bfqd->queue_weights_tree);
-+ }
-+
-+ now_ns = ktime_get_ns();
-+
-+ RQ_BIC(rq)->ttime.last_end_request = now_ns;
-+
-+ /*
-+ * Using us instead of ns, to get a reasonable precision in
-+ * computing rate in next check.
-+ */
-+ delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
-+
-+ bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu",
-+ delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size,
-+ (USEC_PER_SEC*
-+ (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us))
-+ >>BFQ_RATE_SHIFT,
-+ (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT);
-+
-+ /*
-+ * If the request took rather long to complete, and, according
-+ * to the maximum request size recorded, this completion latency
-+ * implies that the request was certainly served at a very low
-+ * rate (less than 1M sectors/sec), then the whole observation
-+ * interval that lasts up to this time instant cannot be a
-+ * valid time interval for computing a new peak rate. Invoke
-+ * bfq_update_rate_reset to have the following three steps
-+ * taken:
-+ * - close the observation interval at the last (previous)
-+ * request dispatch or completion
-+ * - compute rate, if possible, for that observation interval
-+ * - reset to zero samples, which will trigger a proper
-+ * re-initialization of the observation interval on next
-+ * dispatch
-+ */
-+ if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
-+ (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
-+ 1UL<<(BFQ_RATE_SHIFT - 10))
-+ bfq_update_rate_reset(bfqd, NULL);
-+ bfqd->last_completion = now_ns;
-+
-+ /*
-+ * If we are waiting to discover whether the request pattern
-+ * of the task associated with the queue is actually
-+ * isochronous, and both requisites for this condition to hold
-+ * are now satisfied, then compute soft_rt_next_start (see the
-+ * comments on the function bfq_bfqq_softrt_next_start()). We
-+ * schedule this delayed check when bfqq expires, if it still
-+ * has in-flight requests.
-+ */
-+ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
-+ RB_EMPTY_ROOT(&bfqq->sort_list))
-+ bfqq->soft_rt_next_start =
-+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
-+
-+ /*
-+ * If this is the in-service queue, check if it needs to be expired,
-+ * or if we want to idle in case it has no pending requests.
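
The completion path above estimates the service rate in fixed point: sectors are scaled by 2^BFQ_RATE_SHIFT before dividing by the elapsed microseconds, and a completion counts as too slow when the scaled rate falls below 1UL<<(BFQ_RATE_SHIFT-10). A stand-alone sketch of the same trick, with an illustrative shift value (not the kernel's):

#include <stdint.h>
#include <stdio.h>

#define RATE_SHIFT 16   /* illustrative stand-in for BFQ_RATE_SHIFT */

/* Rate in sectors/usec, scaled by 2^RATE_SHIFT so that fractional
 * rates survive integer arithmetic, as in the check above. */
static uint64_t rate_fp(uint64_t sectors, uint64_t delta_us)
{
        return (sectors << RATE_SHIFT) / delta_us;
}

int main(void)
{
        uint64_t r = rate_fp(2048, 1000);       /* 2048 sectors in 1 ms */

        /* Mirrors the 1UL<<(RATE_SHIFT-10) cut-off above, i.e. about
         * 2^-10 sectors per microsecond. */
        if (r < (1ULL << (RATE_SHIFT - 10)))
                printf("too slow, reset rate sampling\n");
        else
                printf("rate ~ %llu sectors/s\n",
                       (unsigned long long)((r * 1000000) >> RATE_SHIFT));
        return 0;
}
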
-+ */ -+ if (bfqd->in_service_queue == bfqq) { -+ if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { -+ bfq_arm_slice_timer(bfqd); -+ goto out; -+ } else if (bfq_may_expire_for_budg_timeout(bfqq)) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && -+ (bfqq->dispatched == 0 || -+ !bfq_bfqq_may_idle(bfqq))) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_NO_MORE_REQUESTS); -+ } -+ -+ if (!bfqd->rq_in_driver) -+ bfq_schedule_dispatch(bfqd); -+ -+out: -+ return; -+} -+ -+static int __bfq_may_queue(struct bfq_queue *bfqq) -+{ -+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { -+ bfq_clear_bfqq_must_alloc(bfqq); -+ return ELV_MQUEUE_MUST; -+ } -+ -+ return ELV_MQUEUE_MAY; -+} -+ -+static int bfq_may_queue(struct request_queue *q, unsigned int op) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct task_struct *tsk = current; -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq; -+ -+ /* -+ * Don't force setup of a queue from here, as a call to may_queue -+ * does not necessarily imply that a request actually will be -+ * queued. So just lookup a possibly existing queue, or return -+ * 'may queue' if that fails. -+ */ -+ bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ if (!bic) -+ return ELV_MQUEUE_MAY; -+ -+ bfqq = bic_to_bfqq(bic, op_is_sync(op)); -+ if (bfqq) -+ return __bfq_may_queue(bfqq); -+ -+ return ELV_MQUEUE_MAY; -+} -+ -+/* -+ * Queue lock held here. -+ */ -+static void bfq_put_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ if (bfqq) { -+ const int rw = rq_data_dir(rq); -+ -+ BUG_ON(!bfqq->allocated[rw]); -+ bfqq->allocated[rw]--; -+ -+ rq->elv.priv[0] = NULL; -+ rq->elv.priv[1] = NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", -+ bfqq, bfqq->ref); -+ bfq_put_queue(bfqq); -+ } -+} -+ -+/* -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this -+ * was the last process referring to that bfqq. -+ */ -+static struct bfq_queue * -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); -+ -+ put_io_context(bic->icq.ioc); -+ -+ if (bfqq_process_refs(bfqq) == 1) { -+ bfqq->pid = current->pid; -+ bfq_clear_bfqq_coop(bfqq); -+ bfq_clear_bfqq_split_coop(bfqq); -+ return bfqq; -+ } -+ -+ bic_set_bfqq(bic, NULL, 1); -+ -+ bfq_put_cooperator(bfqq); -+ -+ bfq_put_queue(bfqq); -+ return NULL; -+} -+ -+/* -+ * Allocate bfq data structures associated with this request. 
-+ */ -+static int bfq_set_request(struct request_queue *q, struct request *rq, -+ struct bio *bio, gfp_t gfp_mask) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); -+ const int rw = rq_data_dir(rq); -+ const int is_sync = rq_is_sync(rq); -+ struct bfq_queue *bfqq; -+ unsigned long flags; -+ bool bfqq_already_existing = false, split = false; -+ -+ spin_lock_irqsave(q->queue_lock, flags); -+ -+ if (!bic) -+ goto queue_fail; -+ -+ bfq_check_ioprio_change(bic, bio); -+ -+ bfq_bic_update_cgroup(bic, bio); -+ -+new_queue: -+ bfqq = bic_to_bfqq(bic, is_sync); -+ if (!bfqq || bfqq == &bfqd->oom_bfqq) { -+ if (bfqq) -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ -+ bic_set_bfqq(bic, bfqq, is_sync); -+ if (split && is_sync) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_request: was_in_list %d " -+ "was_in_large_burst %d " -+ "large burst in progress %d", -+ bic->was_in_burst_list, -+ bic->saved_in_large_burst, -+ bfqd->large_burst); -+ -+ if ((bic->was_in_burst_list && bfqd->large_burst) || -+ bic->saved_in_large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_request: marking in " -+ "large burst"); -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ } else { -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_request: clearing in " -+ "large burst"); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ if (bic->was_in_burst_list) -+ hlist_add_head(&bfqq->burst_list_node, -+ &bfqd->burst_list); -+ } -+ bfqq->split_time = jiffies; -+ } -+ } else { -+ /* If the queue was seeky for too long, break it apart. */ -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); -+ -+ /* Update bic before losing reference to bfqq */ -+ if (bfq_bfqq_in_large_burst(bfqq)) -+ bic->saved_in_large_burst = true; -+ -+ bfqq = bfq_split_bfqq(bic, bfqq); -+ split = true; -+ if (!bfqq) -+ goto new_queue; -+ else -+ bfqq_already_existing = true; -+ } -+ } -+ -+ bfqq->allocated[rw]++; -+ bfqq->ref++; -+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); -+ -+ rq->elv.priv[0] = bic; -+ rq->elv.priv[1] = bfqq; -+ -+ /* -+ * If a bfq_queue has only one process reference, it is owned -+ * by only one bfq_io_cq: we can set the bic field of the -+ * bfq_queue to the address of that structure. Also, if the -+ * queue has just been split, mark a flag so that the -+ * information is available to the other scheduler hooks. -+ */ -+ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { -+ bfqq->bic = bic; -+ if (split) { -+ /* -+ * If the queue has just been split from a shared -+ * queue, restore the idle window and the possible -+ * weight raising period. -+ */ -+ bfq_bfqq_resume_state(bfqq, bfqd, bic, -+ bfqq_already_existing); -+ } -+ } -+ -+ if (unlikely(bfq_bfqq_just_created(bfqq))) -+ bfq_handle_burst(bfqd, bfqq); -+ -+ spin_unlock_irqrestore(q->queue_lock, flags); -+ -+ return 0; -+ -+queue_fail: -+ bfq_schedule_dispatch(bfqd); -+ spin_unlock_irqrestore(q->queue_lock, flags); -+ -+ return 1; -+} -+ -+static void bfq_kick_queue(struct work_struct *work) -+{ -+ struct bfq_data *bfqd = -+ container_of(work, struct bfq_data, unplug_work); -+ struct request_queue *q = bfqd->queue; -+ -+ spin_lock_irq(q->queue_lock); -+ __blk_run_queue(q); -+ spin_unlock_irq(q->queue_lock); -+} -+ -+/* -+ * Handler of the expiration of the timer running if the in-service queue -+ * is idling inside its time slice. 
-+ */ -+static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) -+{ -+ struct bfq_data *bfqd = container_of(timer, struct bfq_data, -+ idle_slice_timer); -+ struct bfq_queue *bfqq; -+ unsigned long flags; -+ enum bfqq_expiration reason; -+ -+ spin_lock_irqsave(bfqd->queue->queue_lock, flags); -+ -+ bfqq = bfqd->in_service_queue; -+ /* -+ * Theoretical race here: the in-service queue can be NULL or -+ * different from the queue that was idling if the timer handler -+ * spins on the queue_lock and a new request arrives for the -+ * current queue and there is a full dispatch cycle that changes -+ * the in-service queue. This can hardly happen, but in the worst -+ * case we just expire a queue too early. -+ */ -+ if (bfqq) { -+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); -+ bfq_clear_bfqq_wait_request(bfqq); -+ -+ if (bfq_bfqq_budget_timeout(bfqq)) -+ /* -+ * Also here the queue can be safely expired -+ * for budget timeout without wasting -+ * guarantees -+ */ -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) -+ /* -+ * The queue may not be empty upon timer expiration, -+ * because we may not disable the timer when the -+ * first request of the in-service queue arrives -+ * during disk idling. -+ */ -+ reason = BFQ_BFQQ_TOO_IDLE; -+ else -+ goto schedule_dispatch; -+ -+ bfq_bfqq_expire(bfqd, bfqq, true, reason); -+ } -+ -+schedule_dispatch: -+ bfq_schedule_dispatch(bfqd); -+ -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); -+ return HRTIMER_NORESTART; -+} -+ -+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) -+{ -+ hrtimer_cancel(&bfqd->idle_slice_timer); -+ cancel_work_sync(&bfqd->unplug_work); -+} -+ -+static void __bfq_put_async_bfqq(struct bfq_data *bfqd, -+ struct bfq_queue **bfqq_ptr) -+{ -+ struct bfq_group *root_group = bfqd->root_group; -+ struct bfq_queue *bfqq = *bfqq_ptr; -+ -+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq); -+ if (bfqq) { -+ bfq_bfqq_move(bfqd, bfqq, root_group); -+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", -+ bfqq, bfqq->ref); -+ bfq_put_queue(bfqq); -+ *bfqq_ptr = NULL; -+ } -+} -+ -+/* -+ * Release all the bfqg references to its async queues. If we are -+ * deallocating the group these queues may still contain requests, so -+ * we reparent them to the root cgroup (i.e., the only one that will -+ * exist for sure until all the requests on a device are gone). 
-+ */ -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); -+ -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); -+} -+ -+static void bfq_exit_queue(struct elevator_queue *e) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ struct request_queue *q = bfqd->queue; -+ struct bfq_queue *bfqq, *n; -+ -+ bfq_shutdown_timer_wq(bfqd); -+ -+ spin_lock_irq(q->queue_lock); -+ -+ BUG_ON(bfqd->in_service_queue); -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) -+ bfq_deactivate_bfqq(bfqd, bfqq, false, false); -+ -+ spin_unlock_irq(q->queue_lock); -+ -+ bfq_shutdown_timer_wq(bfqd); -+ -+ BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ blkcg_deactivate_policy(q, &blkcg_policy_bfq); -+#else -+ bfq_put_async_queues(bfqd, bfqd->root_group); -+ kfree(bfqd->root_group); -+#endif -+ -+ kfree(bfqd); -+} -+ -+static void bfq_init_root_group(struct bfq_group *root_group, -+ struct bfq_data *bfqd) -+{ -+ int i; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ root_group->entity.parent = NULL; -+ root_group->my_entity = NULL; -+ root_group->bfqd = bfqd; -+#endif -+ root_group->rq_pos_tree = RB_ROOT; -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) -+ root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -+ root_group->sched_data.bfq_class_idle_last_service = jiffies; -+} -+ -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) -+{ -+ struct bfq_data *bfqd; -+ struct elevator_queue *eq; -+ -+ eq = elevator_alloc(q, e); -+ if (!eq) -+ return -ENOMEM; -+ -+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); -+ if (!bfqd) { -+ kobject_put(&eq->kobj); -+ return -ENOMEM; -+ } -+ eq->elevator_data = bfqd; -+ -+ /* -+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. -+ * Grab a permanent reference to it, so that the normal code flow -+ * will not attempt to free it. -+ */ -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); -+ bfqd->oom_bfqq.ref++; -+ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; -+ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; -+ bfqd->oom_bfqq.entity.new_weight = -+ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); -+ -+ /* oom_bfqq does not participate to bursts */ -+ bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); -+ /* -+ * Trigger weight initialization, according to ioprio, at the -+ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio -+ * class won't be changed any more. 
-+ */ -+ bfqd->oom_bfqq.entity.prio_changed = 1; -+ -+ bfqd->queue = q; -+ -+ spin_lock_irq(q->queue_lock); -+ q->elevator = eq; -+ spin_unlock_irq(q->queue_lock); -+ -+ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); -+ if (!bfqd->root_group) -+ goto out_free; -+ bfq_init_root_group(bfqd->root_group, bfqd); -+ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); -+ -+ hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, -+ HRTIMER_MODE_REL); -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; -+ -+ bfqd->queue_weights_tree = RB_ROOT; -+ bfqd->group_weights_tree = RB_ROOT; -+ -+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); -+ -+ INIT_LIST_HEAD(&bfqd->active_list); -+ INIT_LIST_HEAD(&bfqd->idle_list); -+ INIT_HLIST_HEAD(&bfqd->burst_list); -+ -+ bfqd->hw_tag = -1; -+ -+ bfqd->bfq_max_budget = bfq_default_max_budget; -+ -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; -+ bfqd->bfq_back_max = bfq_back_max; -+ bfqd->bfq_back_penalty = bfq_back_penalty; -+ bfqd->bfq_slice_idle = bfq_slice_idle; -+ bfqd->bfq_timeout = bfq_timeout; -+ -+ bfqd->bfq_requests_within_timer = 120; -+ -+ bfqd->bfq_large_burst_thresh = 8; -+ bfqd->bfq_burst_interval = msecs_to_jiffies(180); -+ -+ bfqd->low_latency = true; -+ -+ /* -+ * Trade-off between responsiveness and fairness. -+ */ -+ bfqd->bfq_wr_coeff = 30; -+ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); -+ bfqd->bfq_wr_max_time = 0; -+ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); -+ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); -+ bfqd->bfq_wr_max_softrt_rate = 7000; /* -+ * Approximate rate required -+ * to playback or record a -+ * high-definition compressed -+ * video. -+ */ -+ bfqd->wr_busy_queues = 0; -+ -+ /* -+ * Begin by assuming, optimistically, that the device is a -+ * high-speed one, and that its peak rate is equal to 2/3 of -+ * the highest reference rate. -+ */ -+ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * -+ T_fast[blk_queue_nonrot(bfqd->queue)]; -+ bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; -+ bfqd->device_speed = BFQ_BFQD_FAST; -+ -+ return 0; -+ -+out_free: -+ kfree(bfqd); -+ kobject_put(&eq->kobj); -+ return -ENOMEM; -+} -+ -+static void bfq_slab_kill(void) -+{ -+ kmem_cache_destroy(bfq_pool); -+} -+ -+static int __init bfq_slab_setup(void) -+{ -+ bfq_pool = KMEM_CACHE(bfq_queue, 0); -+ if (!bfq_pool) -+ return -ENOMEM; -+ return 0; -+} -+ -+static ssize_t bfq_var_show(unsigned int var, char *page) -+{ -+ return sprintf(page, "%u\n", var); -+} -+ -+static ssize_t bfq_var_store(unsigned long *var, const char *page, -+ size_t count) -+{ -+ unsigned long new_val; -+ int ret = kstrtoul(page, 10, &new_val); -+ -+ if (ret == 0) -+ *var = new_val; -+ -+ return count; -+} -+ -+static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ -+ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? 
-+ jiffies_to_msecs(bfqd->bfq_wr_max_time) : -+ jiffies_to_msecs(bfq_wr_duration(bfqd))); -+} -+ -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) -+{ -+ struct bfq_queue *bfqq; -+ struct bfq_data *bfqd = e->elevator_data; -+ ssize_t num_char = 0; -+ -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", -+ bfqd->queued); -+ -+ spin_lock_irq(bfqd->queue->queue_lock); -+ -+ num_char += sprintf(page + num_char, "Active:\n"); -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { -+ num_char += sprintf(page + num_char, -+ "pid%d: weight %hu, nr_queued %d %d, ", -+ bfqq->pid, -+ bfqq->entity.weight, -+ bfqq->queued[0], -+ bfqq->queued[1]); -+ num_char += sprintf(page + num_char, -+ "dur %d/%u\n", -+ jiffies_to_msecs( -+ jiffies - -+ bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ num_char += sprintf(page + num_char, "Idle:\n"); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { -+ num_char += sprintf(page + num_char, -+ "pid%d: weight %hu, dur %d/%u\n", -+ bfqq->pid, -+ bfqq->entity.weight, -+ jiffies_to_msecs(jiffies - -+ bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ spin_unlock_irq(bfqd->queue->queue_lock); -+ -+ return num_char; -+} -+ -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ u64 __data = __VAR; \ -+ if (__CONV == 1) \ -+ __data = jiffies_to_msecs(__data); \ -+ else if (__CONV == 2) \ -+ __data = div_u64(__data, NSEC_PER_MSEC); \ -+ return bfq_var_show(__data, (page)); \ -+} -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); -+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); -+SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); -+SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); -+SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); -+SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); -+SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, -+ 1); -+SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); -+#undef SHOW_FUNCTION -+ -+#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ u64 __data = __VAR; \ -+ __data = div_u64(__data, NSEC_PER_USEC); \ -+ return bfq_var_show(__data, (page)); \ -+} -+USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); -+#undef USEC_SHOW_FUNCTION -+ -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -+static ssize_t \ -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ if (__CONV == 1) \ -+ *(__PTR) = msecs_to_jiffies(__data); \ -+ else if (__CONV == 2) \ -+ *(__PTR) = (u64)__data 
* NSEC_PER_MSEC; \ -+ else \ -+ *(__PTR) = __data; \ -+ return ret; \ -+} -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, -+ INT_MAX, 2); -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, -+ INT_MAX, 2); -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, -+ INT_MAX, 0); -+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); -+STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); -+STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, -+ 1); -+STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, -+ INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, -+ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, -+ INT_MAX, 0); -+#undef STORE_FUNCTION -+ -+#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ -+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ -+ return ret; \ -+} -+USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, -+ UINT_MAX); -+#undef USEC_STORE_FUNCTION -+ -+/* do nothing for the moment */ -+static ssize_t bfq_weights_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ return count; -+} -+ -+static ssize_t bfq_max_budget_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data == 0) -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); -+ else { -+ if (__data > INT_MAX) -+ __data = INT_MAX; -+ bfqd->bfq_max_budget = __data; -+ } -+ -+ bfqd->bfq_user_max_budget = __data; -+ -+ return ret; -+} -+ -+/* -+ * Leaving this name to preserve name compatibility with cfq -+ * parameters, but this timeout is used for both sync and async. 
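
The STORE_FUNCTION and USEC_STORE_FUNCTION macros above all follow one pattern: parse an unsigned decimal from the sysfs page, clamp it to [MIN, MAX], then store it after a unit conversion. The same pattern in stand-alone C, using the milliseconds-to-nanoseconds conversion of the __CONV == 2 case (names here are illustrative, not from the patch):

#include <stdio.h>
#include <stdlib.h>

#define NSEC_PER_MSEC 1000000ULL

/* Parse an unsigned decimal, clamp to [min, max], convert ms -> ns. */
static int store_msec_as_nsec(const char *page, unsigned long min,
                              unsigned long max, unsigned long long *dst)
{
        char *end;
        unsigned long val = strtoul(page, &end, 10);

        if (end == page)
                return -1;              /* no digits parsed */
        if (val < min)
                val = min;
        else if (val > max)
                val = max;
        *dst = (unsigned long long)val * NSEC_PER_MSEC;
        return 0;
}

int main(void)
{
        unsigned long long fifo_expire;

        /* sysfs writes typically arrive with a trailing newline */
        if (store_msec_as_nsec("250\n", 1, 100000, &fifo_expire) == 0)
                printf("%llu\n", fifo_expire);  /* 250000000 */
        return 0;
}
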
-+ */ -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data < 1) -+ __data = 1; -+ else if (__data > INT_MAX) -+ __data = INT_MAX; -+ -+ bfqd->bfq_timeout = msecs_to_jiffies(__data); -+ if (bfqd->bfq_user_max_budget == 0) -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); -+ -+ return ret; -+} -+ -+static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (!bfqd->strict_guarantees && __data == 1 -+ && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) -+ bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; -+ -+ bfqd->strict_guarantees = __data; -+ -+ return ret; -+} -+ -+static ssize_t bfq_low_latency_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (__data == 0 && bfqd->low_latency != 0) -+ bfq_end_wr(bfqd); -+ bfqd->low_latency = __data; -+ -+ return ret; -+} -+ -+#define BFQ_ATTR(name) \ -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) -+ -+static struct elv_fs_entry bfq_attrs[] = { -+ BFQ_ATTR(fifo_expire_sync), -+ BFQ_ATTR(fifo_expire_async), -+ BFQ_ATTR(back_seek_max), -+ BFQ_ATTR(back_seek_penalty), -+ BFQ_ATTR(slice_idle), -+ BFQ_ATTR(slice_idle_us), -+ BFQ_ATTR(max_budget), -+ BFQ_ATTR(timeout_sync), -+ BFQ_ATTR(strict_guarantees), -+ BFQ_ATTR(low_latency), -+ BFQ_ATTR(wr_coeff), -+ BFQ_ATTR(wr_max_time), -+ BFQ_ATTR(wr_rt_max_time), -+ BFQ_ATTR(wr_min_idle_time), -+ BFQ_ATTR(wr_min_inter_arr_async), -+ BFQ_ATTR(wr_max_softrt_rate), -+ BFQ_ATTR(weights), -+ __ATTR_NULL -+}; -+ -+static struct elevator_type iosched_bfq = { -+ .ops.sq = { -+ .elevator_merge_fn = bfq_merge, -+ .elevator_merged_fn = bfq_merged_request, -+ .elevator_merge_req_fn = bfq_merged_requests, -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ .elevator_bio_merged_fn = bfq_bio_merged, -+#endif -+ .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, -+ .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, -+ .elevator_dispatch_fn = bfq_dispatch_requests, -+ .elevator_add_req_fn = bfq_insert_request, -+ .elevator_activate_req_fn = bfq_activate_request, -+ .elevator_deactivate_req_fn = bfq_deactivate_request, -+ .elevator_completed_req_fn = bfq_completed_request, -+ .elevator_former_req_fn = elv_rb_former_request, -+ .elevator_latter_req_fn = elv_rb_latter_request, -+ .elevator_init_icq_fn = bfq_init_icq, -+ .elevator_exit_icq_fn = bfq_exit_icq, -+ .elevator_set_req_fn = bfq_set_request, -+ .elevator_put_req_fn = bfq_put_request, -+ .elevator_may_queue_fn = bfq_may_queue, -+ .elevator_init_fn = bfq_init_queue, -+ .elevator_exit_fn = bfq_exit_queue, -+ }, -+ .icq_size = sizeof(struct bfq_io_cq), -+ .icq_align = __alignof__(struct bfq_io_cq), -+ .elevator_attrs = bfq_attrs, -+ .elevator_name = "bfq-sq", -+ .elevator_owner = THIS_MODULE, -+}; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static struct blkcg_policy blkcg_policy_bfq = { -+ .dfl_cftypes = bfq_blkg_files, -+ .legacy_cftypes = bfq_blkcg_legacy_files, -+ -+ .cpd_alloc_fn = bfq_cpd_alloc, -+ .cpd_init_fn = bfq_cpd_init, -+ 
.cpd_bind_fn = bfq_cpd_init, -+ .cpd_free_fn = bfq_cpd_free, -+ -+ .pd_alloc_fn = bfq_pd_alloc, -+ .pd_init_fn = bfq_pd_init, -+ .pd_offline_fn = bfq_pd_offline, -+ .pd_free_fn = bfq_pd_free, -+ .pd_reset_stats_fn = bfq_pd_reset_stats, -+}; -+#endif -+ -+static int __init bfq_init(void) -+{ -+ int ret; -+ char msg[60] = "BFQ I/O-scheduler: v8r12"; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ ret = blkcg_policy_register(&blkcg_policy_bfq); -+ if (ret) -+ return ret; -+#endif -+ -+ ret = -ENOMEM; -+ if (bfq_slab_setup()) -+ goto err_pol_unreg; -+ -+ /* -+ * Times to load large popular applications for the typical -+ * systems installed on the reference devices (see the -+ * comments before the definitions of the next two -+ * arrays). Actually, we use slightly slower values, as the -+ * estimated peak rate tends to be smaller than the actual -+ * peak rate. The reason for this last fact is that estimates -+ * are computed over much shorter time intervals than the long -+ * intervals typically used for benchmarking. Why? First, to -+ * adapt more quickly to variations. Second, because an I/O -+ * scheduler cannot rely on a peak-rate-evaluation workload to -+ * be run for a long time. -+ */ -+ T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */ -+ T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */ -+ T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */ -+ T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */ -+ -+ /* -+ * Thresholds that determine the switch between speed classes -+ * (see the comments before the definition of the array -+ * device_speed_thresh). These thresholds are biased towards -+ * transitions to the fast class. This is safer than the -+ * opposite bias. In fact, a wrong transition to the slow -+ * class results in short weight-raising periods, because the -+ * speed of the device then tends to be higher that the -+ * reference peak rate. On the opposite end, a wrong -+ * transition to the fast class tends to increase -+ * weight-raising periods, because of the opposite reason. -+ */ -+ device_speed_thresh[0] = (4 * R_slow[0]) / 3; -+ device_speed_thresh[1] = (4 * R_slow[1]) / 3; -+ -+ ret = elv_register(&iosched_bfq); -+ if (ret) -+ goto err_pol_unreg; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ strcat(msg, " (with cgroups support)"); -+#endif -+ pr_info("%s", msg); -+ -+ return 0; -+ -+err_pol_unreg: -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ return ret; -+} -+ -+static void __exit bfq_exit(void) -+{ -+ elv_unregister(&iosched_bfq); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ bfq_slab_kill(); -+} -+ -+module_init(bfq_init); -+module_exit(bfq_exit); -+ -+MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); -+MODULE_LICENSE("GPL"); -diff --git a/block/bfq.h b/block/bfq.h -new file mode 100644 -index 000000000000..f5751ea59d98 ---- /dev/null -+++ b/block/bfq.h -@@ -0,0 +1,948 @@ -+/* -+ * BFQ v8r12 for 4.11.0: data structures and common functions prototypes. 
-+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe -+ * -+ * Copyright (C) 2008 Fabio Checconi -+ * Paolo Valente -+ * -+ * Copyright (C) 2015 Paolo Valente -+ * -+ * Copyright (C) 2017 Paolo Valente -+ */ -+ -+#ifndef _BFQ_H -+#define _BFQ_H -+ -+#include -+#include -+#include -+ -+#define BFQ_IOPRIO_CLASSES 3 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5) -+ -+#define BFQ_MIN_WEIGHT 1 -+#define BFQ_MAX_WEIGHT 1000 -+#define BFQ_WEIGHT_CONVERSION_COEFF 10 -+ -+#define BFQ_DEFAULT_QUEUE_IOPRIO 4 -+ -+#define BFQ_WEIGHT_LEGACY_DFL 100 -+#define BFQ_DEFAULT_GRP_IOPRIO 0 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE -+ -+/* -+ * Soft real-time applications are extremely more latency sensitive -+ * than interactive ones. Over-raise the weight of the former to -+ * privilege them against the latter. -+ */ -+#define BFQ_SOFTRT_WEIGHT_FACTOR 100 -+ -+struct bfq_entity; -+ -+/** -+ * struct bfq_service_tree - per ioprio_class service tree. -+ * -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each -+ * ioprio_class has its own independent scheduler, and so its own -+ * bfq_service_tree. All the fields are protected by the queue lock -+ * of the containing bfqd. -+ */ -+struct bfq_service_tree { -+ /* tree for active entities (i.e., those backlogged) */ -+ struct rb_root active; -+ /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ -+ struct rb_root idle; -+ -+ struct bfq_entity *first_idle; /* idle entity with minimum F_i */ -+ struct bfq_entity *last_idle; /* idle entity with maximum F_i */ -+ -+ u64 vtime; /* scheduler virtual time */ -+ /* scheduler weight sum; active and idle entities contribute to it */ -+ unsigned long wsum; -+}; -+ -+/** -+ * struct bfq_sched_data - multi-class scheduler. -+ * -+ * bfq_sched_data is the basic scheduler queue. It supports three -+ * ioprio_classes, and can be used either as a toplevel queue or as an -+ * intermediate queue on a hierarchical setup. @next_in_service -+ * points to the active entity of the sched_data service trees that -+ * will be scheduled next. It is used to reduce the number of steps -+ * needed for each hierarchical-schedule update. -+ * -+ * The supported ioprio_classes are the same as in CFQ, in descending -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. -+ * Requests from higher priority queues are served before all the -+ * requests from lower priority queues; among requests of the same -+ * queue requests are served according to B-WF2Q+. -+ * All the fields are protected by the queue lock of the containing bfqd. -+ */ -+struct bfq_sched_data { -+ struct bfq_entity *in_service_entity; /* entity in service */ -+ /* head-of-the-line entity in the scheduler (see comments above) */ -+ struct bfq_entity *next_in_service; -+ /* array of service trees, one per ioprio_class */ -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; -+ /* last time CLASS_IDLE was served */ -+ unsigned long bfq_class_idle_last_service; -+ -+}; -+ -+/** -+ * struct bfq_weight_counter - counter of the number of all active entities -+ * with a given weight. -+ */ -+struct bfq_weight_counter { -+ unsigned int weight; /* weight of the entities this counter refers to */ -+ unsigned int num_active; /* nr of active entities with this weight */ -+ /* -+ * Weights tree member (see bfq_data's @queue_weights_tree and -+ * @group_weights_tree) -+ */ -+ struct rb_node weights_node; -+}; -+ -+/** -+ * struct bfq_entity - schedulable entity. 
-+ * -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each -+ * entity belongs to the sched_data of the parent group in the cgroup -+ * hierarchy. Non-leaf entities have also their own sched_data, stored -+ * in @my_sched_data. -+ * -+ * Each entity stores independently its priority values; this would -+ * allow different weights on different devices, but this -+ * functionality is not exported to userspace by now. Priorities and -+ * weights are updated lazily, first storing the new values into the -+ * new_* fields, then setting the @prio_changed flag. As soon as -+ * there is a transition in the entity state that allows the priority -+ * update to take place the effective and the requested priority -+ * values are synchronized. -+ * -+ * Unless cgroups are used, the weight value is calculated from the -+ * ioprio to export the same interface as CFQ. When dealing with -+ * ``well-behaved'' queues (i.e., queues that do not spend too much -+ * time to consume their budget and have true sequential behavior, and -+ * when there are no external factors breaking anticipation) the -+ * relative weights at each level of the cgroups hierarchy should be -+ * guaranteed. All the fields are protected by the queue lock of the -+ * containing bfqd. -+ */ -+struct bfq_entity { -+ struct rb_node rb_node; /* service_tree member */ -+ /* pointer to the weight counter associated with this entity */ -+ struct bfq_weight_counter *weight_counter; -+ -+ /* -+ * Flag, true if the entity is on a tree (either the active or -+ * the idle one of its service_tree) or is in service. -+ */ -+ bool on_st; -+ -+ u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ -+ u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ -+ -+ /* tree the entity is enqueued into; %NULL if not on a tree */ -+ struct rb_root *tree; -+ -+ /* -+ * minimum start time of the (active) subtree rooted at this -+ * entity; used for O(log N) lookups into active trees -+ */ -+ u64 min_start; -+ -+ /* amount of service received during the last service slot */ -+ int service; -+ -+ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ -+ int budget; -+ -+ unsigned int weight; /* weight of the queue */ -+ unsigned int new_weight; /* next weight if a change is in progress */ -+ -+ /* original weight, used to implement weight boosting */ -+ unsigned int orig_weight; -+ -+ /* parent entity, for hierarchical scheduling */ -+ struct bfq_entity *parent; -+ -+ /* -+ * For non-leaf nodes in the hierarchy, the associated -+ * scheduler queue, %NULL on leaf nodes. -+ */ -+ struct bfq_sched_data *my_sched_data; -+ /* the scheduler queue this entity belongs to */ -+ struct bfq_sched_data *sched_data; -+ -+ /* flag, set to request a weight, ioprio or ioprio_class change */ -+ int prio_changed; -+}; -+ -+struct bfq_group; -+ -+/** -+ * struct bfq_queue - leaf schedulable entity. -+ * -+ * A bfq_queue is a leaf request queue; it can be associated with an -+ * io_context or more, if it is async or shared between cooperating -+ * processes. @cgroup holds a reference to the cgroup, to be sure that it -+ * does not disappear while a bfqq still references it (mostly to avoid -+ * races between request issuing and task migration followed by cgroup -+ * destruction). -+ * All the fields are protected by the queue lock of the containing bfqd. 
-+ */ -+struct bfq_queue { -+ /* reference counter */ -+ int ref; -+ /* parent bfq_data */ -+ struct bfq_data *bfqd; -+ -+ /* current ioprio and ioprio class */ -+ unsigned short ioprio, ioprio_class; -+ /* next ioprio and ioprio class if a change is in progress */ -+ unsigned short new_ioprio, new_ioprio_class; -+ -+ /* -+ * Shared bfq_queue if queue is cooperating with one or more -+ * other queues. -+ */ -+ struct bfq_queue *new_bfqq; -+ /* request-position tree member (see bfq_group's @rq_pos_tree) */ -+ struct rb_node pos_node; -+ /* request-position tree root (see bfq_group's @rq_pos_tree) */ -+ struct rb_root *pos_root; -+ -+ /* sorted list of pending requests */ -+ struct rb_root sort_list; -+ /* if fifo isn't expired, next request to serve */ -+ struct request *next_rq; -+ /* number of sync and async requests queued */ -+ int queued[2]; -+ /* number of sync and async requests currently allocated */ -+ int allocated[2]; -+ /* number of pending metadata requests */ -+ int meta_pending; -+ /* fifo list of requests in sort_list */ -+ struct list_head fifo; -+ -+ /* entity representing this queue in the scheduler */ -+ struct bfq_entity entity; -+ -+ /* maximum budget allowed from the feedback mechanism */ -+ int max_budget; -+ /* budget expiration (in jiffies) */ -+ unsigned long budget_timeout; -+ -+ /* number of requests on the dispatch list or inside driver */ -+ int dispatched; -+ -+ unsigned int flags; /* status flags.*/ -+ -+ /* node for active/idle bfqq list inside parent bfqd */ -+ struct list_head bfqq_list; -+ -+ /* bit vector: a 1 for each seeky requests in history */ -+ u32 seek_history; -+ -+ /* node for the device's burst list */ -+ struct hlist_node burst_list_node; -+ -+ /* position of the last request enqueued */ -+ sector_t last_request_pos; -+ -+ /* Number of consecutive pairs of request completion and -+ * arrival, such that the queue becomes idle after the -+ * completion, but the next request arrives within an idle -+ * time slice; used only if the queue's IO_bound flag has been -+ * cleared. -+ */ -+ unsigned int requests_within_timer; -+ -+ /* pid of the process owning the queue, used for logging purposes */ -+ pid_t pid; -+ -+ /* -+ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL -+ * if the queue is shared. -+ */ -+ struct bfq_io_cq *bic; -+ -+ /* current maximum weight-raising time for this queue */ -+ unsigned long wr_cur_max_time; -+ /* -+ * Minimum time instant such that, only if a new request is -+ * enqueued after this time instant in an idle @bfq_queue with -+ * no outstanding requests, then the task associated with the -+ * queue it is deemed as soft real-time (see the comments on -+ * the function bfq_bfqq_softrt_next_start()) -+ */ -+ unsigned long soft_rt_next_start; -+ /* -+ * Start time of the current weight-raising period if -+ * the @bfq-queue is being weight-raised, otherwise -+ * finish time of the last weight-raising period. -+ */ -+ unsigned long last_wr_start_finish; -+ /* factor by which the weight of this queue is multiplied */ -+ unsigned int wr_coeff; -+ /* -+ * Time of the last transition of the @bfq_queue from idle to -+ * backlogged. -+ */ -+ unsigned long last_idle_bklogged; -+ /* -+ * Cumulative service received from the @bfq_queue since the -+ * last transition from idle to backlogged. 
-+ */ -+ unsigned long service_from_backlogged; -+ /* -+ * Value of wr start time when switching to soft rt -+ */ -+ unsigned long wr_start_at_switch_to_srt; -+ -+ unsigned long split_time; /* time of last split */ -+}; -+ -+/** -+ * struct bfq_ttime - per process thinktime stats. -+ */ -+struct bfq_ttime { -+ u64 last_end_request; /* completion time of last request */ -+ -+ u64 ttime_total; /* total process thinktime */ -+ unsigned long ttime_samples; /* number of thinktime samples */ -+ u64 ttime_mean; /* average process thinktime */ -+ -+}; -+ -+/** -+ * struct bfq_io_cq - per (request_queue, io_context) structure. -+ */ -+struct bfq_io_cq { -+ /* associated io_cq structure */ -+ struct io_cq icq; /* must be the first member */ -+ /* array of two process queues, the sync and the async */ -+ struct bfq_queue *bfqq[2]; -+ /* associated @bfq_ttime struct */ -+ struct bfq_ttime ttime; -+ /* per (request_queue, blkcg) ioprio */ -+ int ioprio; -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ uint64_t blkcg_serial_nr; /* the current blkcg serial */ -+#endif -+ -+ /* -+ * Snapshot of the idle window before merging; taken to -+ * remember this value while the queue is merged, so as to be -+ * able to restore it in case of split. -+ */ -+ bool saved_idle_window; -+ /* -+ * Same purpose as the previous two fields for the I/O bound -+ * classification of a queue. -+ */ -+ bool saved_IO_bound; -+ -+ /* -+ * Same purpose as the previous fields for the value of the -+ * field keeping the queue's belonging to a large burst -+ */ -+ bool saved_in_large_burst; -+ /* -+ * True if the queue belonged to a burst list before its merge -+ * with another cooperating queue. -+ */ -+ bool was_in_burst_list; -+ -+ /* -+ * Similar to previous fields: save wr information. -+ */ -+ unsigned long saved_wr_coeff; -+ unsigned long saved_last_wr_start_finish; -+ unsigned long saved_wr_start_at_switch_to_srt; -+ unsigned int saved_wr_cur_max_time; -+}; -+ -+enum bfq_device_speed { -+ BFQ_BFQD_FAST, -+ BFQ_BFQD_SLOW, -+}; -+ -+/** -+ * struct bfq_data - per-device data structure. -+ * -+ * All the fields are protected by the @queue lock. -+ */ -+struct bfq_data { -+ /* request queue for the device */ -+ struct request_queue *queue; -+ -+ /* root bfq_group for the device */ -+ struct bfq_group *root_group; -+ -+ /* -+ * rbtree of weight counters of @bfq_queues, sorted by -+ * weight. Used to keep track of whether all @bfq_queues have -+ * the same weight. The tree contains one counter for each -+ * distinct weight associated to some active and not -+ * weight-raised @bfq_queue (see the comments to the functions -+ * bfq_weights_tree_[add|remove] for further details). -+ */ -+ struct rb_root queue_weights_tree; -+ /* -+ * rbtree of non-queue @bfq_entity weight counters, sorted by -+ * weight. Used to keep track of whether all @bfq_groups have -+ * the same weight. The tree contains one counter for each -+ * distinct weight associated to some active @bfq_group (see -+ * the comments to the functions bfq_weights_tree_[add|remove] -+ * for further details). -+ */ -+ struct rb_root group_weights_tree; -+ -+ /* -+ * Number of bfq_queues containing requests (including the -+ * queue in service, even if it is idling). 
-+ */ -+ int busy_queues; -+ /* number of weight-raised busy @bfq_queues */ -+ int wr_busy_queues; -+ /* number of queued requests */ -+ int queued; -+ /* number of requests dispatched and waiting for completion */ -+ int rq_in_driver; -+ -+ /* -+ * Maximum number of requests in driver in the last -+ * @hw_tag_samples completed requests. -+ */ -+ int max_rq_in_driver; -+ /* number of samples used to calculate hw_tag */ -+ int hw_tag_samples; -+ /* flag set to one if the driver is showing a queueing behavior */ -+ int hw_tag; -+ -+ /* number of budgets assigned */ -+ int budgets_assigned; -+ -+ /* -+ * Timer set when idling (waiting) for the next request from -+ * the queue in service. -+ */ -+ struct hrtimer idle_slice_timer; -+ /* delayed work to restart dispatching on the request queue */ -+ struct work_struct unplug_work; -+ -+ /* bfq_queue in service */ -+ struct bfq_queue *in_service_queue; -+ /* bfq_io_cq (bic) associated with the @in_service_queue */ -+ struct bfq_io_cq *in_service_bic; -+ -+ /* on-disk position of the last served request */ -+ sector_t last_position; -+ -+ /* time of last request completion (ns) */ -+ u64 last_completion; -+ -+ /* time of first rq dispatch in current observation interval (ns) */ -+ u64 first_dispatch; -+ /* time of last rq dispatch in current observation interval (ns) */ -+ u64 last_dispatch; -+ -+ /* beginning of the last budget */ -+ ktime_t last_budget_start; -+ /* beginning of the last idle slice */ -+ ktime_t last_idling_start; -+ -+ /* number of samples in current observation interval */ -+ int peak_rate_samples; -+ /* num of samples of seq dispatches in current observation interval */ -+ u32 sequential_samples; -+ /* total num of sectors transferred in current observation interval */ -+ u64 tot_sectors_dispatched; -+ /* max rq size seen during current observation interval (sectors) */ -+ u32 last_rq_max_size; -+ /* time elapsed from first dispatch in current observ. interval (us) */ -+ u64 delta_from_first; -+ /* current estimate of device peak rate */ -+ u32 peak_rate; -+ -+ /* maximum budget allotted to a bfq_queue before rescheduling */ -+ int bfq_max_budget; -+ -+ /* list of all the bfq_queues active on the device */ -+ struct list_head active_list; -+ /* list of all the bfq_queues idle on the device */ -+ struct list_head idle_list; -+ -+ /* -+ * Timeout for async/sync requests; when it fires, requests -+ * are served in fifo order. -+ */ -+ u64 bfq_fifo_expire[2]; -+ /* weight of backward seeks wrt forward ones */ -+ unsigned int bfq_back_penalty; -+ /* maximum allowed backward seek */ -+ unsigned int bfq_back_max; -+ /* maximum idling time */ -+ u32 bfq_slice_idle; -+ -+ /* user-configured max budget value (0 for auto-tuning) */ -+ int bfq_user_max_budget; -+ /* -+ * Timeout for bfq_queues to consume their budget; used to -+ * prevent seeky queues from imposing long latencies to -+ * sequential or quasi-sequential ones (this also implies that -+ * seeky queues cannot receive guarantees in the service -+ * domain; after a timeout they are charged for the time they -+ * have been in service, to preserve fairness among them, but -+ * without service-domain guarantees). -+ */ -+ unsigned int bfq_timeout; -+ -+ /* -+ * Number of consecutive requests that must be issued within -+ * the idle time slice to set again idling to a queue which -+ * was marked as non-I/O-bound (see the definition of the -+ * IO_bound flag for further details). 
-+ */ -+ unsigned int bfq_requests_within_timer; -+ -+ /* -+ * Force device idling whenever needed to provide accurate -+ * service guarantees, without caring about throughput -+ * issues. CAVEAT: this may even increase latencies, in case -+ * of useless idling for processes that did stop doing I/O. -+ */ -+ bool strict_guarantees; -+ -+ /* -+ * Last time at which a queue entered the current burst of -+ * queues being activated shortly after each other; for more -+ * details about this and the following parameters related to -+ * a burst of activations, see the comments on the function -+ * bfq_handle_burst. -+ */ -+ unsigned long last_ins_in_burst; -+ /* -+ * Reference time interval used to decide whether a queue has -+ * been activated shortly after @last_ins_in_burst. -+ */ -+ unsigned long bfq_burst_interval; -+ /* number of queues in the current burst of queue activations */ -+ int burst_size; -+ -+ /* common parent entity for the queues in the burst */ -+ struct bfq_entity *burst_parent_entity; -+ /* Maximum burst size above which the current queue-activation -+ * burst is deemed as 'large'. -+ */ -+ unsigned long bfq_large_burst_thresh; -+ /* true if a large queue-activation burst is in progress */ -+ bool large_burst; -+ /* -+ * Head of the burst list (as for the above fields, more -+ * details in the comments on the function bfq_handle_burst). -+ */ -+ struct hlist_head burst_list; -+ -+ /* if set to true, low-latency heuristics are enabled */ -+ bool low_latency; -+ /* -+ * Maximum factor by which the weight of a weight-raised queue -+ * is multiplied. -+ */ -+ unsigned int bfq_wr_coeff; -+ /* maximum duration of a weight-raising period (jiffies) */ -+ unsigned int bfq_wr_max_time; -+ -+ /* Maximum weight-raising duration for soft real-time processes */ -+ unsigned int bfq_wr_rt_max_time; -+ /* -+ * Minimum idle period after which weight-raising may be -+ * reactivated for a queue (in jiffies). -+ */ -+ unsigned int bfq_wr_min_idle_time; -+ /* -+ * Minimum period between request arrivals after which -+ * weight-raising may be reactivated for an already busy async -+ * queue (in jiffies). -+ */ -+ unsigned long bfq_wr_min_inter_arr_async; -+ -+ /* Max service-rate for a soft real-time queue, in sectors/sec */ -+ unsigned int bfq_wr_max_softrt_rate; -+ /* -+ * Cached value of the product R*T, used for computing the -+ * maximum duration of weight raising automatically. -+ */ -+ u64 RT_prod; -+ /* device-speed class for the low-latency heuristic */ -+ enum bfq_device_speed device_speed; -+ -+ /* fallback dummy bfqq for extreme OOM conditions */ -+ struct bfq_queue oom_bfqq; -+}; -+ -+enum bfqq_state_flags { -+ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ -+ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ -+ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* -+ * waiting for a request -+ * without idling the device -+ */ -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ -+ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ -+ BFQ_BFQQ_FLAG_IO_bound, /* -+ * bfqq has timed-out at least once -+ * having consumed at most 2/10 of -+ * its budget -+ */ -+ BFQ_BFQQ_FLAG_in_large_burst, /* -+ * bfqq activated in a large burst, -+ * see comments to bfq_handle_burst. 
-+ */ -+ BFQ_BFQQ_FLAG_softrt_update, /* -+ * may need softrt-next-start -+ * update -+ */ -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ -+ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ -+}; -+ -+#define BFQ_BFQQ_FNS(name) \ -+static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ -+{ \ -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ -+} -+ -+BFQ_BFQQ_FNS(just_created); -+BFQ_BFQQ_FNS(busy); -+BFQ_BFQQ_FNS(wait_request); -+BFQ_BFQQ_FNS(non_blocking_wait_rq); -+BFQ_BFQQ_FNS(must_alloc); -+BFQ_BFQQ_FNS(fifo_expire); -+BFQ_BFQQ_FNS(idle_window); -+BFQ_BFQQ_FNS(sync); -+BFQ_BFQQ_FNS(IO_bound); -+BFQ_BFQQ_FNS(in_large_burst); -+BFQ_BFQQ_FNS(coop); -+BFQ_BFQQ_FNS(split_coop); -+BFQ_BFQQ_FNS(softrt_update); -+#undef BFQ_BFQQ_FNS -+ -+/* Logging facilities. */ -+#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE -+ -+static const char *checked_dev_name(const struct device *dev) -+{ -+ static const char nodev[] = "nodev"; -+ -+ if (dev) -+ return dev_name(dev); -+ -+ return nodev; -+} -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ assert_spin_locked((bfqd)->queue->queue_lock); \ -+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -+ pr_crit("%s bfq%d%c %s " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -+ pr_crit("%s %s " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ pr_crit("%s bfq%d%c " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+ -+#define bfq_log(bfqd, fmt, args...) \ -+ pr_crit("%s bfq " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ ##args) -+ -+#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ assert_spin_locked((bfqd)->queue->queue_lock); \ -+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -+ blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ -+} while (0) -+ -+#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) 
\ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+ -+#define bfq_log(bfqd, fmt, args...) \ -+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+ -+/* Expiration reasons. */ -+enum bfqq_expiration { -+ BFQ_BFQQ_TOO_IDLE = 0, /* -+ * queue has been idling for -+ * too long -+ */ -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ -+ BFQ_BFQQ_PREEMPTED /* preemption in progress */ -+}; -+ -+ -+struct bfqg_stats { -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ /* number of ios merged */ -+ struct blkg_rwstat merged; -+ /* total time spent on device in ns, may not be accurate w/ queueing */ -+ struct blkg_rwstat service_time; -+ /* total time spent waiting in scheduler queue in ns */ -+ struct blkg_rwstat wait_time; -+ /* number of IOs queued up */ -+ struct blkg_rwstat queued; -+ /* total disk time and nr sectors dispatched by this group */ -+ struct blkg_stat time; -+ /* sum of number of ios queued across all samples */ -+ struct blkg_stat avg_queue_size_sum; -+ /* count of samples taken for average */ -+ struct blkg_stat avg_queue_size_samples; -+ /* how many times this group has been removed from service tree */ -+ struct blkg_stat dequeue; -+ /* total time spent waiting for it to be assigned a timeslice. */ -+ struct blkg_stat group_wait_time; -+ /* time spent idling for this blkcg_gq */ -+ struct blkg_stat idle_time; -+ /* total time with empty current active q with other requests queued */ -+ struct blkg_stat empty_time; -+ /* fields after this shouldn't be cleared on stat reset */ -+ uint64_t start_group_wait_time; -+ uint64_t start_idle_time; -+ uint64_t start_empty_time; -+ uint16_t flags; -+#endif -+}; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+/* -+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem. -+ * -+ * @ps: @blkcg_policy_storage that this structure inherits -+ * @weight: weight of the bfq_group -+ */ -+struct bfq_group_data { -+ /* must be the first member */ -+ struct blkcg_policy_data pd; -+ -+ unsigned int weight; -+}; -+ -+/** -+ * struct bfq_group - per (device, cgroup) data structure. -+ * @entity: schedulable entity to insert into the parent group sched_data. -+ * @sched_data: own sched_data, to contain child entities (they may be -+ * both bfq_queues and bfq_groups). -+ * @bfqd: the bfq_data for the device this group acts upon. -+ * @async_bfqq: array of async queues for all the tasks belonging to -+ * the group, one queue per ioprio value per ioprio_class, -+ * except for the idle class that has only one queue. -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used -+ * to avoid too many special cases during group creation/ -+ * migration. -+ * @active_entities: number of active entities belonging to the group; -+ * unused for the root group. Used to know whether there -+ * are groups with more than one active @bfq_entity -+ * (see the comments to the function -+ * bfq_bfqq_may_idle()). -+ * @rq_pos_tree: rbtree sorted by next_request position, used when -+ * determining if two or more queues have interleaving -+ * requests (see bfq_find_close_cooperator()). 
-+ * -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup -+ * there is a set of bfq_groups, each one collecting the lower-level -+ * entities belonging to the group that are acting on the same device. -+ * -+ * Locking works as follows: -+ * o @bfqd is protected by the queue lock, RCU is used to access it -+ * from the readers. -+ * o All the other fields are protected by the @bfqd queue lock. -+ */ -+struct bfq_group { -+ /* must be the first member */ -+ struct blkg_policy_data pd; -+ -+ struct bfq_entity entity; -+ struct bfq_sched_data sched_data; -+ -+ void *bfqd; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct bfq_entity *my_entity; -+ -+ int active_entities; -+ -+ struct rb_root rq_pos_tree; -+ -+ struct bfqg_stats stats; -+}; -+ -+#else -+struct bfq_group { -+ struct bfq_sched_data sched_data; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct rb_root rq_pos_tree; -+}; -+#endif -+ -+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); -+ -+static unsigned int bfq_class_idx(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ return bfqq ? bfqq->ioprio_class - 1 : -+ BFQ_DEFAULT_GRP_CLASS - 1; -+} -+ -+static struct bfq_service_tree * -+bfq_entity_service_tree(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sched_data = entity->sched_data; -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned int idx = bfq_class_idx(entity); -+ -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); -+ BUG_ON(sched_data == NULL); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "entity_service_tree %p %d", -+ sched_data->service_tree + idx, idx); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "entity_service_tree %p %d", -+ sched_data->service_tree + idx, idx); -+ } -+#endif -+ return sched_data->service_tree + idx; -+} -+ -+static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) -+{ -+ return bic->bfqq[is_sync]; -+} -+ -+static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, -+ bool is_sync) -+{ -+ bic->bfqq[is_sync] = bfqq; -+} -+ -+static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) -+{ -+ return bic->icq.q->elevator->elevator_data; -+} -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *group_entity = bfqq->entity.parent; -+ -+ if (!group_entity) -+ group_entity = &bfqq->bfqd->root_group->entity; -+ -+ return container_of(group_entity, struct bfq_group, entity); -+} -+ -+#else -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ return bfqq->bfqd->root_group; -+} -+ -+#endif -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); -+static void bfq_put_queue(struct bfq_queue *bfqq); -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, bool is_sync, -+ struct bfq_io_cq *bic); -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -+#endif -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); -+ -+#endif /* 
_BFQ_H */ -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 8da66379f7ea..bf000c58644b 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -54,7 +54,7 @@ struct blk_stat_callback; - * Maximum number of blkcg policies allowed to be registered concurrently. - * Defined here to simplify include dependency. - */ --#define BLKCG_MAX_POLS 3 -+#define BLKCG_MAX_POLS 4 - - typedef void (rq_end_io_fn)(struct request *, blk_status_t); - - -From 9916fed6c89c61a2b26053be04501784570bbec8 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Thu, 20 Jul 2017 10:46:39 +0200 -Subject: [PATCH 02/51] Add extra checks related to entity scheduling - -- extra checks related to ioprio-class changes -- specific check on st->idle in __bfq_requeue_entity - -Signed-off-by: Paolo Valente ---- - block/bfq-sched.c | 9 ++++++++- - 1 file changed, 8 insertions(+), 1 deletion(-) - -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index ac8991bca9fa..5ddf9af4261e 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -812,6 +812,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - } - #endif - -+ BUG_ON(entity->tree && update_class_too); - BUG_ON(old_st->wsum < entity->weight); - old_st->wsum -= entity->weight; - -@@ -883,8 +884,10 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - - new_st->wsum += entity->weight; - -- if (new_st != old_st) -+ if (new_st != old_st) { -+ BUG_ON(!update_class_too); - entity->start = new_st->vtime; -+ } - } - - return new_st; -@@ -993,6 +996,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - * tree, then it is safe to invoke next function with the last - * parameter set (see the comments on the function). - */ -+ BUG_ON(entity->tree); - st = __bfq_entity_update_weight_prio(st, entity, true); - bfq_calc_finish(entity, entity->budget); - -@@ -1113,9 +1117,11 @@ static void __bfq_activate_entity(struct bfq_entity *entity, - * check for that. - */ - bfq_idle_extract(st, entity); -+ BUG_ON(entity->tree); - entity->start = bfq_gt(min_vstart, entity->finish) ? - min_vstart : entity->finish; - } else { -+ BUG_ON(entity->tree); - /* - * The finish time of the entity may be invalid, and - * it is in the past for sure, otherwise the queue -@@ -1203,6 +1209,7 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) - */ - bfq_calc_finish(entity, entity->service); - entity->start = entity->finish; -+ BUG_ON(entity->tree && entity->tree == &st->idle); - BUG_ON(entity->tree && entity->tree != &st->active); - /* - * In addition, if the entity had more than one child - -From 8f5b2c25dcbe31dda524e85b921b3aa1fe11d111 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Fri, 21 Jul 2017 12:08:57 +0200 -Subject: [PATCH 03/51] block, bfq: reset in_service_entity if it becomes idle - -BFQ implements hierarchical scheduling by representing each group of -queues with a generic parent entity. For each parent entity, BFQ -maintains an in_service_entity pointer: if one of the child entities -happens to be in service, in_service_entity points to it. The -resetting of these pointers happens only on queue expirations: when -the in-service queue is expired, i.e., stops being the queue in -service, BFQ resets all in_service_entity pointers along the -parent-entity path from this queue to the root entity.
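As a reading aid, the walk just described can be condensed into a minimal user-space sketch; this is an illustration only, not the kernel code (both structures are cut down to the fields named above, the function name is invented for the sketch, and in BFQ the walk happens inside the queue-expiration path):

#include <stddef.h>

/* Cut-down stand-ins for the BFQ structures named above (sketch only). */
struct bfq_entity;

struct bfq_sched_data {
        struct bfq_entity *in_service_entity;  /* child currently in service */
};

struct bfq_entity {
        struct bfq_entity *parent;             /* NULL at the root entity */
        struct bfq_sched_data *sched_data;     /* sched_data this entity is queued in */
};

/*
 * On expiration of the in-service queue, clear in_service_entity in
 * every sched_data on the path from the expired queue up to the root.
 */
static void reset_in_service_path(struct bfq_entity *entity)
{
        for (; entity; entity = entity->parent)
                entity->sched_data->in_service_entity = NULL;
}

Since this walk runs only on expiration, any other event that empties a queue leaves the pointers above it untouched; that is the inconsistency described next.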
- -Functions handling the scheduling of entities assume, naturally, that -in-service entities are active, i.e., have pending I/O requests (or, -as a special case, even if they have no pending requests, they are -expected to receive a new request very soon, with the scheduler idling -the storage device while waiting for such an event). Unfortunately, -the above resetting scheme of the in_service_entity pointers may cause -this assumption to be violated. For example, the in-service queue may -happen to remain without requests because of a request merge. In this -case the queue does become idle, and all related data structures are -updated accordingly. But in_service_entity still points to the queue -in the parent entity. This inconsistency may even propagate to -higher-level parent entities, if they happen to become idle as well, -as a consequence of the leaf queue becoming idle. For this queue and -parent entities, scheduling functions have an undefined behaviour, -and, as reported, may easily lead to kernel crashes or hangs. - -This commit addresses this issue by simply resetting the -in_service_entity field also when it is detected to point to an entity -becoming idle (regardless of why the entity becomes idle). - -Reported-by: Laurentiu Nicola -Signed-off-by: Paolo Valente -Tested-by: Laurentiu Nicola ---- - block/bfq-sched.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 5ddf9af4261e..a07a06eb5c72 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -1336,8 +1336,10 @@ static bool __bfq_deactivate_entity(struct bfq_entity *entity, - - BUG_ON(is_in_service && entity->tree && entity->tree != &st->active); - -- if (is_in_service) -+ if (is_in_service) { - bfq_calc_finish(entity, entity->service); -+ sd->in_service_entity = NULL; -+ } - - if (entity->tree == &st->active) - bfq_active_extract(st, entity); - -From 600ea668e2d340c95724bcf981d88812d6900342 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Fri, 28 Jul 2017 21:09:51 +0200 -Subject: [PATCH 04/51] block, bfq: consider also in_service_entity to state - whether an entity is active - -Groups of BFQ queues are represented by generic entities in BFQ. When -a queue belonging to a parent entity is deactivated, the parent entity -may need to be deactivated too, in case the deactivated queue was the -only active queue for the parent entity. This deactivation may need to -be propagated upwards if the entity belongs, in its turn, to a further -higher-level entity, and so on. In particular, the upward propagation -of deactivation stops at the first parent entity that remains active -even if one of its child entities has been deactivated. - -To decide whether the last non-deactivation condition holds for a -parent entity, BFQ checks whether the field next_in_service is still -not NULL for the parent entity, after the deactivation of one of its -child entity. If it is not NULL, then there are certainly other active -entities in the parent entity, and deactivations can stop. - -Unfortunately, this check misses a corner case: if in_service_entity -is not NULL, then next_in_service may happen to be NULL, although the -parent entity is evidently active. This happens if: 1) the entity -pointed by in_service_entity is the only active entity in the parent -entity, and 2) according to the definition of next_in_service, the -in_service_entity cannot be considered as next_in_service. See the -comments on the definition of next_in_service for details on this -second point. 
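Reduced to code, the repaired activity test differs from the old one by a single disjunct. In the sketch below only the two pointer fields are kept and the helper name is invented, but the condition itself is the one this patch installs in bfq_deactivate_entity (see the hunk further down):

#include <stdbool.h>

struct bfq_entity;

/* Cut-down bfq_sched_data: just the two pointers the check reads. */
struct bfq_sched_data {
        struct bfq_entity *in_service_entity;  /* entity in service, if any */
        struct bfq_entity *next_in_service;    /* candidate for next service */
};

/*
 * A parent entity is certainly still active if either pointer is set.
 * The corner case is in_service_entity != NULL with next_in_service ==
 * NULL: the old test, which looked at next_in_service alone, wrongly
 * treated that as "no active children" and kept deactivating upwards.
 */
static bool bfq_parent_still_active(const struct bfq_sched_data *sd)
{
        return sd->next_in_service || sd->in_service_entity;
}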
- -Hitting the above corner case causes crashes. - -To address this issue, this commit: -1) Extends the above check on only next_in_service to controlling both -next_in_service and in_service_entity (if any of them is not NULL, -then no further deactivation is performed) -2) Improves the (important) comments on how next_in_service is defined -and updated; in particular it fixes a few rather obscure paragraphs - -Reported-by: Eric Wheeler -Reported-by: Rick Yiu -Reported-by: Tom X Nguyen -Signed-off-by: Paolo Valente -Tested-by: Eric Wheeler -Tested-by: Rick Yiu -Tested-by: Laurentiu Nicola -Tested-by: Tom X Nguyen ---- - block/bfq-sched.c | 140 ++++++++++++++++++++++++++++++------------------------ - block/bfq.h | 23 +++++++-- - 2 files changed, 95 insertions(+), 68 deletions(-) - -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index a07a06eb5c72..5c0f9290a79c 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -196,21 +196,23 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) - - /* - * This function tells whether entity stops being a candidate for next -- * service, according to the following logic. -+ * service, according to the restrictive definition of the field -+ * next_in_service. In particular, this function is invoked for an -+ * entity that is about to be set in service. - * -- * This function is invoked for an entity that is about to be set in -- * service. If such an entity is a queue, then the entity is no longer -- * a candidate for next service (i.e, a candidate entity to serve -- * after the in-service entity is expired). The function then returns -- * true. -+ * If entity is a queue, then the entity is no longer a candidate for -+ * next service according to the that definition, because entity is -+ * about to become the in-service queue. This function then returns -+ * true if entity is a queue. - * -- * In contrast, the entity could stil be a candidate for next service -- * if it is not a queue, and has more than one child. In fact, even if -- * one of its children is about to be set in service, other children -- * may still be the next to serve. As a consequence, a non-queue -- * entity is not a candidate for next-service only if it has only one -- * child. And only if this condition holds, then the function returns -- * true for a non-queue entity. -+ * In contrast, entity could still be a candidate for next service if -+ * it is not a queue, and has more than one active child. In fact, -+ * even if one of its children is about to be set in service, other -+ * active children may still be the next to serve, for the parent -+ * entity, even according to the above definition. As a consequence, a -+ * non-queue entity is not a candidate for next-service only if it has -+ * only one active child. And only if this condition holds, then this -+ * function returns true for a non-queue entity. - */ - static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) - { -@@ -223,6 +225,18 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) - - BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group); - BUG_ON(bfqg->active_entities == 0); -+ /* -+ * The field active_entities does not always contain the -+ * actual number of active children entities: it happens to -+ * not account for the in-service entity in case the latter is -+ * removed from its active tree (which may get done after -+ * invoking the function bfq_no_longer_next_in_service in -+ * bfq_get_next_queue). 
Fortunately, here, i.e., while -+ * bfq_no_longer_next_in_service is not yet completed in -+ * bfq_get_next_queue, bfq_active_extract has not yet been -+ * invoked, and thus active_entities still coincides with the -+ * actual number of active entities. -+ */ - if (bfqg->active_entities == 1) - return true; - -@@ -1089,7 +1103,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - * one of its children receives a new request. - * - * Basically, this function updates the timestamps of entity and -- * inserts entity into its active tree, ater possible extracting it -+ * inserts entity into its active tree, after possibly extracting it - * from its idle tree. - */ - static void __bfq_activate_entity(struct bfq_entity *entity, -@@ -1213,7 +1227,7 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) - BUG_ON(entity->tree && entity->tree != &st->active); - /* - * In addition, if the entity had more than one child -- * when set in service, then was not extracted from -+ * when set in service, then it was not extracted from - * the active tree. This implies that the position of - * the entity in the active tree may need to be - * changed now, because we have just updated the start -@@ -1221,9 +1235,8 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) - * time in a moment (the requeueing is then, more - * precisely, a repositioning in this case). To - * implement this repositioning, we: 1) dequeue the -- * entity here, 2) update the finish time and -- * requeue the entity according to the new -- * timestamps below. -+ * entity here, 2) update the finish time and requeue -+ * the entity according to the new timestamps below. - */ - if (entity->tree) - bfq_active_extract(st, entity); -@@ -1270,9 +1283,9 @@ static void __bfq_activate_requeue_entity(struct bfq_entity *entity, - - - /** -- * bfq_activate_entity - activate or requeue an entity representing a bfq_queue, -- * and activate, requeue or reposition all ancestors -- * for which such an update becomes necessary. -+ * bfq_activate_requeue_entity - activate or requeue an entity representing a bfq_queue, -+ * and activate, requeue or reposition all ancestors -+ * for which such an update becomes necessary. - * @entity: the entity to activate. - * @non_blocking_wait_rq: true if this entity was waiting for a request - * @requeue: true if this is a requeue, which implies that bfqq is -@@ -1308,9 +1321,9 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity, - * @ins_into_idle_tree: if false, the entity will not be put into the - * idle tree. - * -- * Deactivates an entity, independently from its previous state. Must -+ * Deactivates an entity, independently of its previous state. Must - * be invoked only if entity is on a service tree. Extracts the entity -- * from that tree, and if necessary and allowed, puts it on the idle -+ * from that tree, and if necessary and allowed, puts it into the idle - * tree. - */ - static bool __bfq_deactivate_entity(struct bfq_entity *entity, -@@ -1359,7 +1372,7 @@ static bool __bfq_deactivate_entity(struct bfq_entity *entity, - /** - * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. - * @entity: the entity to deactivate.
-- * @ins_into_idle_tree: true if the entity can be put on the idle tree -+ * @ins_into_idle_tree: true if the entity can be put into the idle tree - */ - static void bfq_deactivate_entity(struct bfq_entity *entity, - bool ins_into_idle_tree, -@@ -1406,16 +1419,29 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, - */ - bfq_update_next_in_service(sd, NULL); - -- if (sd->next_in_service) { -+ if (sd->next_in_service || sd->in_service_entity) { - /* -- * The parent entity is still backlogged, -- * because next_in_service is not NULL. So, no -- * further upwards deactivation must be -- * performed. Yet, next_in_service has -- * changed. Then the schedule does need to be -- * updated upwards. -+ * The parent entity is still active, because -+ * either next_in_service or in_service_entity -+ * is not NULL. So, no further upwards -+ * deactivation must be performed. Yet, -+ * next_in_service has changed. Then the -+ * schedule does need to be updated upwards. -+ * -+ * NOTE If in_service_entity is not NULL, then -+ * next_in_service may happen to be NULL, -+ * although the parent entity is evidently -+ * active. This happens if 1) the entity -+ * pointed by in_service_entity is the only -+ * active entity in the parent entity, and 2) -+ * according to the definition of -+ * next_in_service, the in_service_entity -+ * cannot be considered as -+ * next_in_service. See the comments on the -+ * definition of next_in_service for details. - */ - BUG_ON(sd->next_in_service == entity); -+ BUG_ON(sd->in_service_entity == entity); - break; - } - -@@ -1806,45 +1832,33 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - - /* - * If entity is no longer a candidate for next -- * service, then we extract it from its active tree, -- * for the following reason. To further boost the -- * throughput in some special case, BFQ needs to know -- * which is the next candidate entity to serve, while -- * there is already an entity in service. In this -- * respect, to make it easy to compute/update the next -- * candidate entity to serve after the current -- * candidate has been set in service, there is a case -- * where it is necessary to extract the current -- * candidate from its service tree. Such a case is -- * when the entity just set in service cannot be also -- * a candidate for next service. Details about when -- * this conditions holds are reported in the comments -- * on the function bfq_no_longer_next_in_service() -- * invoked below. -+ * service, then it must be extracted from its active -+ * tree, so as to make sure that it won't be -+ * considered when computing next_in_service. See the -+ * comments on the function -+ * bfq_no_longer_next_in_service() for details. - */ - if (bfq_no_longer_next_in_service(entity)) - bfq_active_extract(bfq_entity_service_tree(entity), - entity); - - /* -- * For the same reason why we may have just extracted -- * entity from its active tree, we may need to update -- * next_in_service for the sched_data of entity too, -- * regardless of whether entity has been extracted. -- * In fact, even if entity has not been extracted, a -- * descendant entity may get extracted. Such an event -- * would cause a change in next_in_service for the -- * level of the descendant entity, and thus possibly -- * back to upper levels. -+ * Even if entity is not to be extracted according to -+ * the above check, a descendant entity may get -+ * extracted in one of the next iterations of this -+ * loop. 
Such an event could cause a change in -+ * next_in_service for the level of the descendant -+ * entity, and thus possibly back to this level. - * -- * We cannot perform the resulting needed update -- * before the end of this loop, because, to know which -- * is the correct next-to-serve candidate entity for -- * each level, we need first to find the leaf entity -- * to set in service. In fact, only after we know -- * which is the next-to-serve leaf entity, we can -- * discover whether the parent entity of the leaf -- * entity becomes the next-to-serve, and so on. -+ * However, we cannot perform the resulting needed -+ * update of next_in_service for this level before the -+ * end of the whole loop, because, to know which is -+ * the correct next-to-serve candidate entity for each -+ * level, we need first to find the leaf entity to set -+ * in service. In fact, only after we know which is -+ * the next-to-serve leaf entity, we can discover -+ * whether the parent entity of the leaf entity -+ * becomes the next-to-serve, and so on. - */ - - /* Log some information */ -diff --git a/block/bfq.h b/block/bfq.h -index f5751ea59d98..ebd9688b9f61 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -68,17 +68,30 @@ struct bfq_service_tree { - * - * bfq_sched_data is the basic scheduler queue. It supports three - * ioprio_classes, and can be used either as a toplevel queue or as an -- * intermediate queue on a hierarchical setup. @next_in_service -- * points to the active entity of the sched_data service trees that -- * will be scheduled next. It is used to reduce the number of steps -- * needed for each hierarchical-schedule update. -+ * intermediate queue in a hierarchical setup. - * - * The supported ioprio_classes are the same as in CFQ, in descending - * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. - * Requests from higher priority queues are served before all the - * requests from lower priority queues; among requests of the same - * queue requests are served according to B-WF2Q+. -- * All the fields are protected by the queue lock of the containing bfqd. -+ * -+ * The schedule is implemented by the service trees, plus the field -+ * @next_in_service, which points to the entity on the active trees -+ * that will be served next, if 1) no changes in the schedule occur -+ * before the current in-service entity is expired, 2) the in-service -+ * queue becomes idle when it expires, and 3) if the entity pointed by -+ * in_service_entity is not a queue, then the in-service child entity -+ * of the entity pointed by in_service_entity becomes idle on -+ * expiration. This peculiar definition allows for the following -+ * optimization, not yet exploited: while a given entity is still in -+ * service, we already know which is the best candidate for next -+ * service among the other active entities in the same parent -+ * entity. We can then quickly compare the timestamps of the -+ * in-service entity with those of such best candidate. -+ * -+ * All the fields are protected by the queue lock of the containing -+ * bfqd.
- */ - struct bfq_sched_data { - struct bfq_entity *in_service_entity; /* entity in service */ - -From 6b5effd10bc6711a862e7cbd7cd2dd0146defa01 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Thu, 4 May 2017 10:53:43 +0200 -Subject: [PATCH 05/51] block, bfq: improve and refactor throughput-boosting - logic - -When a queue associated with a process remains empty, there are cases -where throughput gets boosted if the device is idled to await the -arrival of a new I/O request for that queue. Currently, BFQ assumes -that one of these cases is when the device has no internal queueing -(regardless of the properties of the I/O being served). Unfortunately, -this condition has proved to be too general. So, this commit refines it -as "the device has no internal queueing and is rotational". - -This refinement provides a significant throughput boost with random -I/O, on flash-based storage without internal queueing. For example, on -a HiKey board, throughput increases by up to 125%, growing, e.g., from -6.9MB/s to 15.6MB/s with two or three random readers in parallel. - -This commit also refactors the code related to device idling, for the -following reason. Finding the change that provides the above large -improvement has been slightly more difficult than it had to be, -because the logic that decides whether to idle the device is still -scattered across three functions. Almost all of the logic is in the -function bfq_bfqq_may_idle, but (1) part of the decision is made in -bfq_update_idle_window, and (2) the function bfq_bfqq_must_idle may -switch off idling regardless of the output of bfq_bfqq_may_idle. In -addition, both bfq_update_idle_window and bfq_bfqq_must_idle make -their decisions as a function of parameters that are used, for similar -purposes, also in bfq_bfqq_may_idle. This commit addresses this issue -by moving all the logic into bfq_bfqq_may_idle. 
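Stripped of the surrounding bookkeeping, the consolidated throughput check reduces to a boolean function of three properties. The sketch below mirrors conditions (a), (b) and (c) spelled out in the new comments in the hunks that follow; the function signature and plain flags are invented stand-ins for the kernel-side state (blk_queue_nonrot(), bfqd->hw_tag, and the sequential/I/O-bound classification of bfqq):

#include <stdbool.h>

/*
 * Does idling the device boost throughput for this queue? True if:
 * (a) the device is rotational and shows no internal queueing, or
 * (b) the device is rotational and the queue's I/O is sequential
 *     and I/O-bound, or
 * (c) the device shows no internal queueing and the queue's I/O is
 *     sequential and I/O-bound.
 */
static bool idling_boosts_thr(bool rotational, bool queueing,
                              bool seq_and_io_bound)
{
        bool rot_without_queueing = rotational && !queueing;

        return rot_without_queueing ||
               ((rotational || !queueing) && seq_and_io_bound);
}

Folding (a)-(c) into a single expression is what makes it practical to keep the whole decision inside bfq_bfqq_may_idle, instead of spreading it over three functions as before.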
- -Signed-off-by: Paolo Valente -Signed-off-by: Luca Miccio ---- - block/bfq-sq-iosched.c | 141 +++++++++++++++++++++++++++---------------------- - block/bfq.h | 12 ++--- - 2 files changed, 83 insertions(+), 70 deletions(-) - -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 65e7c7e77f3c..30d019fc67e0 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -684,10 +684,10 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - unsigned int old_wr_coeff; - bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); - -- if (bic->saved_idle_window) -- bfq_mark_bfqq_idle_window(bfqq); -+ if (bic->saved_has_short_ttime) -+ bfq_mark_bfqq_has_short_ttime(bfqq); - else -- bfq_clear_bfqq_idle_window(bfqq); -+ bfq_clear_bfqq_has_short_ttime(bfqq); - - if (bic->saved_IO_bound) - bfq_mark_bfqq_IO_bound(bfqq); -@@ -2047,7 +2047,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - if (!bic) - return; - -- bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); -+ bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); - bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -@@ -3214,9 +3214,9 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - } - - bfq_log_bfqq(bfqd, bfqq, -- "expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)", -+ "expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)", - reason, slow, bfqq->dispatched, -- bfq_bfqq_idle_window(bfqq), entity->weight); -+ bfq_bfqq_has_short_ttime(bfqq), entity->weight); - - /* - * Increase, decrease or leave budget unchanged according to -@@ -3298,7 +3298,10 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) - static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - { - struct bfq_data *bfqd = bfqq->bfqd; -- bool idling_boosts_thr, idling_boosts_thr_without_issues, -+ bool rot_without_queueing = -+ !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, -+ bfqq_sequential_and_IO_bound, -+ idling_boosts_thr, idling_boosts_thr_without_issues, - idling_needed_for_service_guarantees, - asymmetric_scenario; - -@@ -3306,27 +3309,44 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - return true; - - /* -+ * Idling is performed only if slice_idle > 0. In addition, we -+ * do not idle if -+ * (a) bfqq is async -+ * (b) bfqq is in the idle io prio class: in this case we do -+ * not idle because we want to minimize the bandwidth that -+ * queues in this class can steal to higher-priority queues -+ */ -+ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || -+ bfq_class_idle(bfqq)) -+ return false; -+ -+ bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && -+ bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); -+ /* - * The next variable takes into account the cases where idling - * boosts the throughput. - * - * The value of the variable is computed considering, first, that - * idling is virtually always beneficial for the throughput if: -- * (a) the device is not NCQ-capable, or -- * (b) regardless of the presence of NCQ, the device is rotational -- * and the request pattern for bfqq is I/O-bound and sequential. 
-+ * (a) the device is not NCQ-capable and rotational, or -+ * (b) regardless of the presence of NCQ, the device is rotational and -+ * the request pattern for bfqq is I/O-bound and sequential, or -+ * (c) regardless of whether it is rotational, the device is -+ * not NCQ-capable and the request pattern for bfqq is -+ * I/O-bound and sequential. - * - * Secondly, and in contrast to the above item (b), idling an - * NCQ-capable flash-based device would not boost the - * throughput even with sequential I/O; rather it would lower - * the throughput in proportion to how fast the device - * is. Accordingly, the next variable is true if any of the -- * above conditions (a) and (b) is true, and, in particular, -- * happens to be false if bfqd is an NCQ-capable flash-based -- * device. -+ * above conditions (a), (b) or (c) is true, and, in -+ * particular, happens to be false if bfqd is an NCQ-capable -+ * flash-based device. - */ -- idling_boosts_thr = !bfqd->hw_tag || -- (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && -- bfq_bfqq_idle_window(bfqq)); -+ idling_boosts_thr = rot_without_queueing || -+ ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && -+ bfqq_sequential_and_IO_bound); - - /* - * The value of the next variable, -@@ -3497,12 +3517,10 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); - - /* -- * We have now all the components we need to compute the return -- * value of the function, which is true only if both the following -- * conditions hold: -- * 1) bfqq is sync, because idling make sense only for sync queues; -- * 2) idling either boosts the throughput (without issues), or -- * is necessary to preserve service guarantees. -+ * We have now all the components we need to compute the -+ * return value of the function, which is true only if idling -+ * either boosts the throughput (without issues), or is -+ * necessary to preserve service guarantees. 
- */ - bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", - bfq_bfqq_sync(bfqq), idling_boosts_thr); -@@ -3514,9 +3532,8 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - bfq_bfqq_IO_bound(bfqq), - idling_needed_for_service_guarantees); - -- return bfq_bfqq_sync(bfqq) && -- (idling_boosts_thr_without_issues || -- idling_needed_for_service_guarantees); -+ return idling_boosts_thr_without_issues || -+ idling_needed_for_service_guarantees; - } - - /* -@@ -3532,10 +3549,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - */ - static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) - { -- struct bfq_data *bfqd = bfqq->bfqd; -- -- return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && -- bfq_bfqq_may_idle(bfqq); -+ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq); - } - - /* -@@ -3994,7 +4008,6 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, - case IOPRIO_CLASS_IDLE: - bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; - bfqq->new_ioprio = 7; -- bfq_clear_bfqq_idle_window(bfqq); - break; - } - -@@ -4058,8 +4071,14 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfq_set_next_ioprio_data(bfqq, bic); - - if (is_sync) { -+ /* -+ * No need to mark as has_short_ttime if in -+ * idle_class, because no device idling is performed -+ * for queues in idle class -+ */ - if (!bfq_class_idle(bfqq)) -- bfq_mark_bfqq_idle_window(bfqq); -+ /* tentatively mark as has_short_ttime */ -+ bfq_mark_bfqq_has_short_ttime(bfqq); - bfq_mark_bfqq_sync(bfqq); - bfq_mark_bfqq_just_created(bfqq); - } else -@@ -4195,18 +4214,19 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, - blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); - } - --/* -- * Disable idle window if the process thinks too long or seeks so much that -- * it doesn't matter. -- */ --static void bfq_update_idle_window(struct bfq_data *bfqd, -- struct bfq_queue *bfqq, -- struct bfq_io_cq *bic) -+static void bfq_update_has_short_ttime(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) - { -- int enable_idle; -+ bool has_short_ttime = true; - -- /* Don't idle for async or idle io prio class. */ -- if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) -+ /* -+ * No need to update has_short_ttime if bfqq is async or in -+ * idle io prio class, or if bfq_slice_idle is zero, because -+ * no device idling is performed for bfqq in this case. -+ */ -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || -+ bfqd->bfq_slice_idle == 0) - return; - - /* Idle window just restored, statistics are meaningless. */ -@@ -4214,27 +4234,22 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, - bfqd->bfq_wr_min_idle_time)) - return; - -- enable_idle = bfq_bfqq_idle_window(bfqq); -- -+ /* Think time is infinite if no process is linked to -+ * bfqq. 
Otherwise check average think time to
-+ * decide whether to mark as has_short_ttime
-+ */
- if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
-- bfqd->bfq_slice_idle == 0 ||
-- (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
-- bfqq->wr_coeff == 1))
-- enable_idle = 0;
-- else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
-- if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
-- bfqq->wr_coeff == 1)
-- enable_idle = 0;
-- else
-- enable_idle = 1;
-- }
-- bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
-- enable_idle);
-+ (bfq_sample_valid(bic->ttime.ttime_samples) &&
-+ bic->ttime.ttime_mean > bfqd->bfq_slice_idle))
-+ has_short_ttime = false;
-+
-+ bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d",
-+ has_short_ttime);
-
-- if (enable_idle)
-- bfq_mark_bfqq_idle_window(bfqq);
-+ if (has_short_ttime)
-+ bfq_mark_bfqq_has_short_ttime(bfqq);
- else
-- bfq_clear_bfqq_idle_window(bfqq);
-+ bfq_clear_bfqq_has_short_ttime(bfqq);
- }
-
- /*
-@@ -4250,14 +4265,12 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- bfqq->meta_pending++;
-
- bfq_update_io_thinktime(bfqd, bic);
-+ bfq_update_has_short_ttime(bfqd, bfqq, bic);
- bfq_update_io_seektime(bfqd, bfqq, rq);
-- if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
-- !BFQQ_SEEKY(bfqq))
-- bfq_update_idle_window(bfqd, bfqq, bic);
-
- bfq_log_bfqq(bfqd, bfqq,
-- "rq_enqueued: idle_window=%d (seeky %d)",
-- bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));
-+ "rq_enqueued: has_short_ttime=%d (seeky %d)",
-+ bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq));
-
- bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
-
-diff --git a/block/bfq.h b/block/bfq.h
-index ebd9688b9f61..34fc4697fd89 100644
---- a/block/bfq.h
-+++ b/block/bfq.h
-@@ -349,11 +349,11 @@ struct bfq_io_cq {
- #endif
-
- /*
-- * Snapshot of the idle window before merging; taken to
-- * remember this value while the queue is merged, so as to be
-- * able to restore it in case of split.
-+ * Snapshot of the has_short_ttime flag before merging; taken
-+ * to remember its value while the queue is merged, so as to
-+ * be able to restore it in case of split.
- */
-- bool saved_idle_window;
-+ bool saved_has_short_ttime;
- /*
- * Same purpose as the previous two fields for the I/O bound
- * classification of a queue.
-@@ -610,7 +610,7 @@ enum bfqq_state_flags {
- */
- BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
- BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
-- BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
-+ BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */
- BFQ_BFQQ_FLAG_sync, /* synchronous queue */
- BFQ_BFQQ_FLAG_IO_bound, /*
- * bfqq has timed-out at least once
-@@ -649,7 +649,7 @@ BFQ_BFQQ_FNS(wait_request);
- BFQ_BFQQ_FNS(non_blocking_wait_rq);
- BFQ_BFQQ_FNS(must_alloc);
- BFQ_BFQQ_FNS(fifo_expire);
--BFQ_BFQQ_FNS(idle_window);
-+BFQ_BFQQ_FNS(has_short_ttime);
- BFQ_BFQQ_FNS(sync);
- BFQ_BFQQ_FNS(IO_bound);
- BFQ_BFQQ_FNS(in_large_burst);
-
-From b5e746fa99d961a5642cffb27c19a77e8b638007 Mon Sep 17 00:00:00 2001
-From: Paolo Valente
-Date: Mon, 19 Dec 2016 16:59:33 +0100
-Subject: [PATCH 06/51] FIRST BFQ-MQ COMMIT: Copy bfq-sq-iosched.c as
- bfq-mq-iosched.c
-
-This commit introduces bfq-mq-iosched.c, the main source file that
-will contain the code of bfq for blk-mq. This version of bfq is
-tentatively named bfq-mq.
-
-For the moment, the file bfq-mq-iosched.c is just a copy of
-bfq-sq-iosched.c, i.e., of the main source file of bfq for blk.
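Restated outside the diff, the think-time classification that the previous patch introduces in bfq_update_has_short_ttime() amounts to the following helper (a sketch under assumed inputs; the real code reads them from the bfq_io_cq and bfq_data involved, and the 80-sample threshold is copied from the file's bfq_sample_valid() macro):

    #include <stdbool.h>
    #include <stdint.h>

    /* Same threshold as bfq_sample_valid() in bfq-sq-iosched.c. */
    #define SAMPLE_VALID(samples) ((samples) > 80)

    /*
     * Sketch: a sync queue keeps its has_short_ttime flag only if some
     * process is still linked to it and, once enough think-time samples
     * have accumulated, the mean think time does not exceed the idling
     * slice (both in ns).
     */
    static bool has_short_ttime_sketch(int active_refs, int ttime_samples,
                                       uint64_t ttime_mean_ns,
                                       uint64_t slice_idle_ns)
    {
        if (active_refs == 0)
            return false;  /* no process linked: think time is infinite */

        if (SAMPLE_VALID(ttime_samples) && ttime_mean_ns > slice_idle_ns)
            return false;  /* thinks for longer than we would idle */

        return true;
    }

Compared with the old enable_idle logic shown above, the wr_coeff == 1 tests are gone: weight-raised queues are no longer special-cased here, and any queue that thinks too long simply loses the flag.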
- -Signed-off-by: Paolo Valente ---- - block/bfq-mq-iosched.c | 5392 ++++++++++++++++++++++++++++++++++++++++++++++++ - 1 file changed, 5392 insertions(+) - create mode 100644 block/bfq-mq-iosched.c - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -new file mode 100644 -index 000000000000..30d019fc67e0 ---- /dev/null -+++ b/block/bfq-mq-iosched.c -@@ -0,0 +1,5392 @@ -+/* -+ * Budget Fair Queueing (BFQ) I/O scheduler. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe -+ * -+ * Copyright (C) 2008 Fabio Checconi -+ * Paolo Valente -+ * -+ * Copyright (C) 2015 Paolo Valente -+ * -+ * Copyright (C) 2017 Paolo Valente -+ * -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ -+ * file. -+ * -+ * BFQ is a proportional-share I/O scheduler, with some extra -+ * low-latency capabilities. BFQ also supports full hierarchical -+ * scheduling through cgroups. Next paragraphs provide an introduction -+ * on BFQ inner workings. Details on BFQ benefits and usage can be -+ * found in Documentation/block/bfq-iosched.txt. -+ * -+ * BFQ is a proportional-share storage-I/O scheduling algorithm based -+ * on the slice-by-slice service scheme of CFQ. But BFQ assigns -+ * budgets, measured in number of sectors, to processes instead of -+ * time slices. The device is not granted to the in-service process -+ * for a given time slice, but until it has exhausted its assigned -+ * budget. This change from the time to the service domain enables BFQ -+ * to distribute the device throughput among processes as desired, -+ * without any distortion due to throughput fluctuations, or to device -+ * internal queueing. BFQ uses an ad hoc internal scheduler, called -+ * B-WF2Q+, to schedule processes according to their budgets. More -+ * precisely, BFQ schedules queues associated with processes. Thanks to -+ * the accurate policy of B-WF2Q+, BFQ can afford to assign high -+ * budgets to I/O-bound processes issuing sequential requests (to -+ * boost the throughput), and yet guarantee a low latency to -+ * interactive and soft real-time applications. -+ * -+ * NOTE: if the main or only goal, with a given device, is to achieve -+ * the maximum-possible throughput at all times, then do switch off -+ * all low-latency heuristics for that device, by setting low_latency -+ * to 0. -+ * -+ * BFQ is described in [1], where also a reference to the initial, more -+ * theoretical paper on BFQ can be found. The interested reader can find -+ * in the latter paper full details on the main algorithm, as well as -+ * formulas of the guarantees and formal proofs of all the properties. -+ * With respect to the version of BFQ presented in these papers, this -+ * implementation adds a few more heuristics, such as the one that -+ * guarantees a low latency to soft real-time applications, and a -+ * hierarchical extension based on H-WF2Q+. -+ * -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) -+ * complexity derives from the one introduced with EEVDF in [3]. -+ * -+ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O -+ * Scheduler", Proceedings of the First Workshop on Mobile System -+ * Technologies (MST-2015), May 2015. -+ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf -+ * -+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf -+ * -+ * [2] Jon C.R. Bennett and H. 
Zhang, ``Hierarchical Packet Fair Queueing -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, -+ * Oct 1997. -+ * -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz -+ * -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline -+ * First: A Flexible and Accurate Mechanism for Proportional Share -+ * Resource Allocation,'' technical report. -+ * -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "blk.h" -+#include "bfq.h" -+ -+/* Expiration time of sync (0) and async (1) requests, in ns. */ -+static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; -+ -+/* Maximum backwards seek, in KiB. */ -+static const int bfq_back_max = (16 * 1024); -+ -+/* Penalty of a backwards seek, in number of sectors. */ -+static const int bfq_back_penalty = 2; -+ -+/* Idling period duration, in ns. */ -+static u32 bfq_slice_idle = (NSEC_PER_SEC / 125); -+ -+/* Minimum number of assigned budgets for which stats are safe to compute. */ -+static const int bfq_stats_min_budgets = 194; -+ -+/* Default maximum budget values, in sectors and number of requests. */ -+static const int bfq_default_max_budget = (16 * 1024); -+ -+/* -+ * Async to sync throughput distribution is controlled as follows: -+ * when an async request is served, the entity is charged the number -+ * of sectors of the request, multiplied by the factor below -+ */ -+static const int bfq_async_charge_factor = 10; -+ -+/* Default timeout values, in jiffies, approximating CFQ defaults. */ -+static const int bfq_timeout = (HZ / 8); -+ -+static struct kmem_cache *bfq_pool; -+ -+/* Below this threshold (in ns), we consider thinktime immediate. */ -+#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) -+ -+/* hw_tag detection: parallel requests threshold and min samples needed. */ -+#define BFQ_HW_QUEUE_THRESHOLD 4 -+#define BFQ_HW_QUEUE_SAMPLES 32 -+ -+#define BFQQ_SEEK_THR (sector_t)(8 * 100) -+#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) -+#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) -+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) -+ -+/* Min number of samples required to perform peak-rate update */ -+#define BFQ_RATE_MIN_SAMPLES 32 -+/* Min observation time interval required to perform a peak-rate update (ns) */ -+#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) -+/* Target observation time interval for a peak-rate update (ns) */ -+#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC -+ -+/* Shift used for peak rate fixed precision calculations. */ -+#define BFQ_RATE_SHIFT 16 -+ -+/* -+ * By default, BFQ computes the duration of the weight raising for -+ * interactive applications automatically, using the following formula: -+ * duration = (R / r) * T, where r is the peak rate of the device, and -+ * R and T are two reference parameters. -+ * In particular, R is the peak rate of the reference device (see below), -+ * and T is a reference time: given the systems that are likely to be -+ * installed on the reference device according to its speed class, T is -+ * about the maximum time needed, under BFQ and while reading two files in -+ * parallel, to load typical large applications on these systems. -+ * In practice, the slower/faster the device at hand is, the more/less it -+ * takes to load applications with respect to the reference device. -+ * Accordingly, the longer/shorter BFQ grants weight raising to interactive -+ * applications. 
-+ *
-+ * BFQ uses four different reference pairs (R, T), depending on:
-+ * . whether the device is rotational or non-rotational;
-+ * . whether the device is slow, such as old or portable HDDs, as well as
-+ * SD cards, or fast, such as newer HDDs and SSDs.
-+ *
-+ * The device's speed class is dynamically (re)detected in
-+ * bfq_update_peak_rate() every time the estimated peak rate is updated.
-+ *
-+ * In the following definitions, R_slow[0]/R_fast[0] and
-+ * T_slow[0]/T_fast[0] are the reference values for a slow/fast
-+ * rotational device, whereas R_slow[1]/R_fast[1] and
-+ * T_slow[1]/T_fast[1] are the reference values for a slow/fast
-+ * non-rotational device. Finally, device_speed_thresh are the
-+ * thresholds used to switch between speed classes. The reference
-+ * rates are not the actual peak rates of the devices used as a
-+ * reference, but slightly lower values. The reason for using these
-+ * slightly lower values is that the peak-rate estimator tends to
-+ * yield slightly lower values than the actual peak rate (it can yield
-+ * the actual peak rate only if there is only one process doing I/O,
-+ * and the process does sequential I/O).
-+ *
-+ * Both the reference peak rates and the thresholds are measured in
-+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
-+ */
-+static int R_slow[2] = {1000, 10700};
-+static int R_fast[2] = {14000, 33000};
-+/*
-+ * To improve readability, a conversion function is used to initialize the
-+ * following arrays, which entails that they can be initialized only in a
-+ * function.
-+ */
-+static int T_slow[2];
-+static int T_fast[2];
-+static int device_speed_thresh[2];
-+
-+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
-+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
-+
-+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
-+#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
-+
-+static void bfq_schedule_dispatch(struct bfq_data *bfqd);
-+
-+#include "bfq-ioc.c"
-+#include "bfq-sched.c"
-+#include "bfq-cgroup-included.c"
-+
-+#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
-+#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
-+
-+#define bfq_sample_valid(samples) ((samples) > 80)
-+
-+/*
-+ * Scheduler run of queue, if there are requests pending and no one in the
-+ * driver that will restart queueing.
-+ */
-+static void bfq_schedule_dispatch(struct bfq_data *bfqd)
-+{
-+ if (bfqd->queued != 0) {
-+ bfq_log(bfqd, "schedule dispatch");
-+ kblockd_schedule_work(&bfqd->unplug_work);
-+ }
-+}
-+
-+/*
-+ * Lifted from AS - choose which of rq1 and rq2 is best served now.
-+ * We choose the request that is closest to the head right now. Distance
-+ * behind the head is penalized and only allowed to a certain extent.
-+ */
-+static struct request *bfq_choose_req(struct bfq_data *bfqd,
-+ struct request *rq1,
-+ struct request *rq2,
-+ sector_t last)
-+{
-+ sector_t s1, s2, d1 = 0, d2 = 0;
-+ unsigned long back_max;
-+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
-+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
-+ unsigned int wrap = 0; /* bit mask: requests behind the disk head?
*/ -+ -+ if (!rq1 || rq1 == rq2) -+ return rq2; -+ if (!rq2) -+ return rq1; -+ -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) -+ return rq1; -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) -+ return rq2; -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) -+ return rq1; -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) -+ return rq2; -+ -+ s1 = blk_rq_pos(rq1); -+ s2 = blk_rq_pos(rq2); -+ -+ /* -+ * By definition, 1KiB is 2 sectors. -+ */ -+ back_max = bfqd->bfq_back_max * 2; -+ -+ /* -+ * Strict one way elevator _except_ in the case where we allow -+ * short backward seeks which are biased as twice the cost of a -+ * similar forward seek. -+ */ -+ if (s1 >= last) -+ d1 = s1 - last; -+ else if (s1 + back_max >= last) -+ d1 = (last - s1) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ1_WRAP; -+ -+ if (s2 >= last) -+ d2 = s2 - last; -+ else if (s2 + back_max >= last) -+ d2 = (last - s2) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ2_WRAP; -+ -+ /* Found required data */ -+ -+ /* -+ * By doing switch() on the bit mask "wrap" we avoid having to -+ * check two variables for all permutations: --> faster! -+ */ -+ switch (wrap) { -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ -+ if (d1 < d2) -+ return rq1; -+ else if (d2 < d1) -+ return rq2; -+ -+ if (s1 >= s2) -+ return rq1; -+ else -+ return rq2; -+ -+ case BFQ_RQ2_WRAP: -+ return rq1; -+ case BFQ_RQ1_WRAP: -+ return rq2; -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ -+ default: -+ /* -+ * Since both rqs are wrapped, -+ * start with the one that's further behind head -+ * (--> only *one* back seek required), -+ * since back seek takes more time than forward. -+ */ -+ if (s1 <= s2) -+ return rq1; -+ else -+ return rq2; -+ } -+} -+ -+static struct bfq_queue * -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, -+ sector_t sector, struct rb_node **ret_parent, -+ struct rb_node ***rb_link) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *bfqq = NULL; -+ -+ parent = NULL; -+ p = &root->rb_node; -+ while (*p) { -+ struct rb_node **n; -+ -+ parent = *p; -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ -+ /* -+ * Sort strictly based on sector. Smallest to the left, -+ * largest to the right. -+ */ -+ if (sector > blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_right; -+ else if (sector < blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_left; -+ else -+ break; -+ p = n; -+ bfqq = NULL; -+ } -+ -+ *ret_parent = parent; -+ if (rb_link) -+ *rb_link = p; -+ -+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", -+ (unsigned long long) sector, -+ bfqq ? bfqq->pid : 0); -+ -+ return bfqq; -+} -+ -+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *__bfqq; -+ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ -+ if (bfq_class_idle(bfqq)) -+ return; -+ if (!bfqq->next_rq) -+ return; -+ -+ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, -+ blk_rq_pos(bfqq->next_rq), &parent, &p); -+ if (!__bfqq) { -+ rb_link_node(&bfqq->pos_node, parent, p); -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); -+ } else -+ bfqq->pos_root = NULL; -+} -+ -+/* -+ * Tell whether there are active queues or groups with differentiated weights. 
-+ */ -+static bool bfq_differentiated_weights(struct bfq_data *bfqd) -+{ -+ /* -+ * For weights to differ, at least one of the trees must contain -+ * at least two nodes. -+ */ -+ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && -+ (bfqd->queue_weights_tree.rb_node->rb_left || -+ bfqd->queue_weights_tree.rb_node->rb_right) -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ ) || -+ (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && -+ (bfqd->group_weights_tree.rb_node->rb_left || -+ bfqd->group_weights_tree.rb_node->rb_right) -+#endif -+ ); -+} -+ -+/* -+ * The following function returns true if every queue must receive the -+ * same share of the throughput (this condition is used when deciding -+ * whether idling may be disabled, see the comments in the function -+ * bfq_bfqq_may_idle()). -+ * -+ * Such a scenario occurs when: -+ * 1) all active queues have the same weight, -+ * 2) all active groups at the same level in the groups tree have the same -+ * weight, -+ * 3) all active groups at the same level in the groups tree have the same -+ * number of children. -+ * -+ * Unfortunately, keeping the necessary state for evaluating exactly the -+ * above symmetry conditions would be quite complex and time-consuming. -+ * Therefore this function evaluates, instead, the following stronger -+ * sub-conditions, for which it is much easier to maintain the needed -+ * state: -+ * 1) all active queues have the same weight, -+ * 2) all active groups have the same weight, -+ * 3) all active groups have at most one active child each. -+ * In particular, the last two conditions are always true if hierarchical -+ * support and the cgroups interface are not enabled, thus no state needs -+ * to be maintained in this case. -+ */ -+static bool bfq_symmetric_scenario(struct bfq_data *bfqd) -+{ -+ return !bfq_differentiated_weights(bfqd); -+} -+ -+/* -+ * If the weight-counter tree passed as input contains no counter for -+ * the weight of the input entity, then add that counter; otherwise just -+ * increment the existing counter. -+ * -+ * Note that weight-counter trees contain few nodes in mostly symmetric -+ * scenarios. For example, if all queues have the same weight, then the -+ * weight-counter tree for the queues may contain at most one node. -+ * This holds even if low_latency is on, because weight-raised queues -+ * are not inserted in the tree. -+ * In most scenarios, the rate at which nodes are created/destroyed -+ * should be low too. -+ */ -+static void bfq_weights_tree_add(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root) -+{ -+ struct rb_node **new = &(root->rb_node), *parent = NULL; -+ -+ /* -+ * Do not insert if the entity is already associated with a -+ * counter, which happens if: -+ * 1) the entity is associated with a queue, -+ * 2) a request arrival has caused the queue to become both -+ * non-weight-raised, and hence change its weight, and -+ * backlogged; in this respect, each of the two events -+ * causes an invocation of this function, -+ * 3) this is the invocation of this function caused by the -+ * second event. This second invocation is actually useless, -+ * and we handle this fact by exiting immediately. More -+ * efficient or clearer solutions might possibly be adopted. 
-+ */ -+ if (entity->weight_counter) -+ return; -+ -+ while (*new) { -+ struct bfq_weight_counter *__counter = container_of(*new, -+ struct bfq_weight_counter, -+ weights_node); -+ parent = *new; -+ -+ if (entity->weight == __counter->weight) { -+ entity->weight_counter = __counter; -+ goto inc_counter; -+ } -+ if (entity->weight < __counter->weight) -+ new = &((*new)->rb_left); -+ else -+ new = &((*new)->rb_right); -+ } -+ -+ entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), -+ GFP_ATOMIC); -+ -+ /* -+ * In the unlucky event of an allocation failure, we just -+ * exit. This will cause the weight of entity to not be -+ * considered in bfq_differentiated_weights, which, in its -+ * turn, causes the scenario to be deemed wrongly symmetric in -+ * case entity's weight would have been the only weight making -+ * the scenario asymmetric. On the bright side, no unbalance -+ * will however occur when entity becomes inactive again (the -+ * invocation of this function is triggered by an activation -+ * of entity). In fact, bfq_weights_tree_remove does nothing -+ * if !entity->weight_counter. -+ */ -+ if (unlikely(!entity->weight_counter)) -+ return; -+ -+ entity->weight_counter->weight = entity->weight; -+ rb_link_node(&entity->weight_counter->weights_node, parent, new); -+ rb_insert_color(&entity->weight_counter->weights_node, root); -+ -+inc_counter: -+ entity->weight_counter->num_active++; -+} -+ -+/* -+ * Decrement the weight counter associated with the entity, and, if the -+ * counter reaches 0, remove the counter from the tree. -+ * See the comments to the function bfq_weights_tree_add() for considerations -+ * about overhead. -+ */ -+static void bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root) -+{ -+ if (!entity->weight_counter) -+ return; -+ -+ BUG_ON(RB_EMPTY_ROOT(root)); -+ BUG_ON(entity->weight_counter->weight != entity->weight); -+ -+ BUG_ON(!entity->weight_counter->num_active); -+ entity->weight_counter->num_active--; -+ if (entity->weight_counter->num_active > 0) -+ goto reset_entity_pointer; -+ -+ rb_erase(&entity->weight_counter->weights_node, root); -+ kfree(entity->weight_counter); -+ -+reset_entity_pointer: -+ entity->weight_counter = NULL; -+} -+ -+/* -+ * Return expired entry, or NULL to just start from scratch in rbtree. -+ */ -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct request *rq; -+ -+ if (bfq_bfqq_fifo_expire(bfqq)) -+ return NULL; -+ -+ bfq_mark_bfqq_fifo_expire(bfqq); -+ -+ rq = rq_entry_fifo(bfqq->fifo.next); -+ -+ if (rq == last || ktime_get_ns() < rq->fifo_time) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); -+ BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); -+ return rq; -+} -+ -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct rb_node *rbnext = rb_next(&last->rb_node); -+ struct rb_node *rbprev = rb_prev(&last->rb_node); -+ struct request *next, *prev = NULL; -+ -+ BUG_ON(list_empty(&bfqq->fifo)); -+ -+ /* Follow expired path, else get first next available. 
*/ -+ next = bfq_check_fifo(bfqq, last); -+ if (next) { -+ BUG_ON(next == last); -+ return next; -+ } -+ -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); -+ -+ if (rbprev) -+ prev = rb_entry_rq(rbprev); -+ -+ if (rbnext) -+ next = rb_entry_rq(rbnext); -+ else { -+ rbnext = rb_first(&bfqq->sort_list); -+ if (rbnext && rbnext != &last->rb_node) -+ next = rb_entry_rq(rbnext); -+ } -+ -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); -+} -+ -+/* see the definition of bfq_async_charge_factor for details */ -+static unsigned long bfq_serv_to_charge(struct request *rq, -+ struct bfq_queue *bfqq) -+{ -+ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) -+ return blk_rq_sectors(rq); -+ -+ /* -+ * If there are no weight-raised queues, then amplify service -+ * by just the async charge factor; otherwise amplify service -+ * by twice the async charge factor, to further reduce latency -+ * for weight-raised queues. -+ */ -+ if (bfqq->bfqd->wr_busy_queues == 0) -+ return blk_rq_sectors(rq) * bfq_async_charge_factor; -+ -+ return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; -+} -+ -+/** -+ * bfq_updated_next_req - update the queue after a new next_rq selection. -+ * @bfqd: the device data the queue belongs to. -+ * @bfqq: the queue to update. -+ * -+ * If the first request of a queue changes we make sure that the queue -+ * has enough budget to serve at least its first request (if the -+ * request has grown). We do this because if the queue has not enough -+ * budget for its first request, it has to go through two dispatch -+ * rounds to actually get it dispatched. -+ */ -+static void bfq_updated_next_req(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ struct request *next_rq = bfqq->next_rq; -+ unsigned long new_budget; -+ -+ if (!next_rq) -+ return; -+ -+ if (bfqq == bfqd->in_service_queue) -+ /* -+ * In order not to break guarantees, budgets cannot be -+ * changed after an entity has been selected. -+ */ -+ return; -+ -+ BUG_ON(entity->tree != &st->active); -+ BUG_ON(entity == entity->sched_data->in_service_entity); -+ -+ new_budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ if (entity->budget != new_budget) { -+ entity->budget = new_budget; -+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", -+ new_budget); -+ bfq_requeue_bfqq(bfqd, bfqq); -+ } -+} -+ -+static unsigned int bfq_wr_duration(struct bfq_data *bfqd) -+{ -+ u64 dur; -+ -+ if (bfqd->bfq_wr_max_time > 0) -+ return bfqd->bfq_wr_max_time; -+ -+ dur = bfqd->RT_prod; -+ do_div(dur, bfqd->peak_rate); -+ -+ /* -+ * Limit duration between 3 and 13 seconds. Tests show that -+ * higher values than 13 seconds often yield the opposite of -+ * the desired result, i.e., worsen responsiveness by letting -+ * non-interactive and non-soft-real-time applications -+ * preserve weight raising for a too long time interval. -+ * -+ * On the other end, lower values than 3 seconds make it -+ * difficult for most interactive tasks to complete their jobs -+ * before weight-raising finishes. 
-+ */ -+ if (dur > msecs_to_jiffies(13000)) -+ dur = msecs_to_jiffies(13000); -+ else if (dur < msecs_to_jiffies(3000)) -+ dur = msecs_to_jiffies(3000); -+ -+ return dur; -+} -+ -+static void -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, -+ struct bfq_io_cq *bic, bool bfq_already_existing) -+{ -+ unsigned int old_wr_coeff; -+ bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); -+ -+ if (bic->saved_has_short_ttime) -+ bfq_mark_bfqq_has_short_ttime(bfqq); -+ else -+ bfq_clear_bfqq_has_short_ttime(bfqq); -+ -+ if (bic->saved_IO_bound) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ else -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ if (unlikely(busy)) -+ old_wr_coeff = bfqq->wr_coeff; -+ -+ bfqq->wr_coeff = bic->saved_wr_coeff; -+ bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; -+ BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); -+ bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; -+ bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || -+ time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time))) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "resume state: switching off wr (%lu + %lu < %lu)", -+ bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, -+ jiffies); -+ -+ bfqq->wr_coeff = 1; -+ } -+ -+ /* make sure weight will be updated, however we got here */ -+ bfqq->entity.prio_changed = 1; -+ -+ if (likely(!busy)) -+ return; -+ -+ if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); -+ } else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+} -+ -+static int bfqq_process_refs(struct bfq_queue *bfqq) -+{ -+ int process_refs, io_refs; -+ -+ lockdep_assert_held(bfqq->bfqd->queue->queue_lock); -+ -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; -+ process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; -+ BUG_ON(process_refs < 0); -+ return process_refs; -+} -+ -+/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ -+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_queue *item; -+ struct hlist_node *n; -+ -+ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) -+ hlist_del_init(&item->burst_list_node); -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+ bfqd->burst_size = 1; -+ bfqd->burst_parent_entity = bfqq->entity.parent; -+} -+ -+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ -+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ /* Increment burst size to take into account also bfqq */ -+ bfqd->burst_size++; -+ -+ bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); -+ -+ BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); -+ -+ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { -+ struct bfq_queue *pos, *bfqq_item; -+ struct hlist_node *n; -+ -+ /* -+ * Enough queues have been activated shortly after each -+ * other to consider this burst as large. -+ */ -+ bfqd->large_burst = true; -+ bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); -+ -+ /* -+ * We can now mark all queues in the burst list as -+ * belonging to a large burst. 
-+ */ -+ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, -+ burst_list_node) { -+ bfq_mark_bfqq_in_large_burst(bfqq_item); -+ bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); -+ } -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); -+ -+ /* -+ * From now on, and until the current burst finishes, any -+ * new queue being activated shortly after the last queue -+ * was inserted in the burst can be immediately marked as -+ * belonging to a large burst. So the burst list is not -+ * needed any more. Remove it. -+ */ -+ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, -+ burst_list_node) -+ hlist_del_init(&pos->burst_list_node); -+ } else /* -+ * Burst not yet large: add bfqq to the burst list. Do -+ * not increment the ref counter for bfqq, because bfqq -+ * is removed from the burst list before freeing bfqq -+ * in put_queue. -+ */ -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+} -+ -+/* -+ * If many queues belonging to the same group happen to be created -+ * shortly after each other, then the processes associated with these -+ * queues have typically a common goal. In particular, bursts of queue -+ * creations are usually caused by services or applications that spawn -+ * many parallel threads/processes. Examples are systemd during boot, -+ * or git grep. To help these processes get their job done as soon as -+ * possible, it is usually better to not grant either weight-raising -+ * or device idling to their queues. -+ * -+ * In this comment we describe, firstly, the reasons why this fact -+ * holds, and, secondly, the next function, which implements the main -+ * steps needed to properly mark these queues so that they can then be -+ * treated in a different way. -+ * -+ * The above services or applications benefit mostly from a high -+ * throughput: the quicker the requests of the activated queues are -+ * cumulatively served, the sooner the target job of these queues gets -+ * completed. As a consequence, weight-raising any of these queues, -+ * which also implies idling the device for it, is almost always -+ * counterproductive. In most cases it just lowers throughput. -+ * -+ * On the other hand, a burst of queue creations may be caused also by -+ * the start of an application that does not consist of a lot of -+ * parallel I/O-bound threads. In fact, with a complex application, -+ * several short processes may need to be executed to start-up the -+ * application. In this respect, to start an application as quickly as -+ * possible, the best thing to do is in any case to privilege the I/O -+ * related to the application with respect to all other -+ * I/O. Therefore, the best strategy to start as quickly as possible -+ * an application that causes a burst of queue creations is to -+ * weight-raise all the queues created during the burst. This is the -+ * exact opposite of the best strategy for the other type of bursts. -+ * -+ * In the end, to take the best action for each of the two cases, the -+ * two types of bursts need to be distinguished. Fortunately, this -+ * seems relatively easy, by looking at the sizes of the bursts. In -+ * particular, we found a threshold such that only bursts with a -+ * larger size than that threshold are apparently caused by -+ * services or commands such as systemd or git grep. For brevity, -+ * hereafter we call just 'large' these bursts. BFQ *does not* -+ * weight-raise queues whose creation occurs in a large burst. 
In -+ * addition, for each of these queues BFQ performs or does not perform -+ * idling depending on which choice boosts the throughput more. The -+ * exact choice depends on the device and request pattern at -+ * hand. -+ * -+ * Unfortunately, false positives may occur while an interactive task -+ * is starting (e.g., an application is being started). The -+ * consequence is that the queues associated with the task do not -+ * enjoy weight raising as expected. Fortunately these false positives -+ * are very rare. They typically occur if some service happens to -+ * start doing I/O exactly when the interactive task starts. -+ * -+ * Turning back to the next function, it implements all the steps -+ * needed to detect the occurrence of a large burst and to properly -+ * mark all the queues belonging to it (so that they can then be -+ * treated in a different way). This goal is achieved by maintaining a -+ * "burst list" that holds, temporarily, the queues that belong to the -+ * burst in progress. The list is then used to mark these queues as -+ * belonging to a large burst if the burst does become large. The main -+ * steps are the following. -+ * -+ * . when the very first queue is created, the queue is inserted into the -+ * list (as it could be the first queue in a possible burst) -+ * -+ * . if the current burst has not yet become large, and a queue Q that does -+ * not yet belong to the burst is activated shortly after the last time -+ * at which a new queue entered the burst list, then the function appends -+ * Q to the burst list -+ * -+ * . if, as a consequence of the previous step, the burst size reaches -+ * the large-burst threshold, then -+ * -+ * . all the queues in the burst list are marked as belonging to a -+ * large burst -+ * -+ * . the burst list is deleted; in fact, the burst list already served -+ * its purpose (keeping temporarily track of the queues in a burst, -+ * so as to be able to mark them as belonging to a large burst in the -+ * previous sub-step), and now is not needed any more -+ * -+ * . the device enters a large-burst mode -+ * -+ * . if a queue Q that does not belong to the burst is created while -+ * the device is in large-burst mode and shortly after the last time -+ * at which a queue either entered the burst list or was marked as -+ * belonging to the current large burst, then Q is immediately marked -+ * as belonging to a large burst. -+ * -+ * . if a queue Q that does not belong to the burst is created a while -+ * later, i.e., not shortly after, than the last time at which a queue -+ * either entered the burst list or was marked as belonging to the -+ * current large burst, then the current burst is deemed as finished and: -+ * -+ * . the large-burst mode is reset if set -+ * -+ * . the burst list is emptied -+ * -+ * . Q is inserted in the burst list, as Q may be the first queue -+ * in a possible new burst (then the burst list contains just Q -+ * after this step). -+ */ -+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ /* -+ * If bfqq is already in the burst list or is part of a large -+ * burst, or finally has just been split, then there is -+ * nothing else to do. 
-+ */ -+ if (!hlist_unhashed(&bfqq->burst_list_node) || -+ bfq_bfqq_in_large_burst(bfqq) || -+ time_is_after_eq_jiffies(bfqq->split_time + -+ msecs_to_jiffies(10))) -+ return; -+ -+ /* -+ * If bfqq's creation happens late enough, or bfqq belongs to -+ * a different group than the burst group, then the current -+ * burst is finished, and related data structures must be -+ * reset. -+ * -+ * In this respect, consider the special case where bfqq is -+ * the very first queue created after BFQ is selected for this -+ * device. In this case, last_ins_in_burst and -+ * burst_parent_entity are not yet significant when we get -+ * here. But it is easy to verify that, whether or not the -+ * following condition is true, bfqq will end up being -+ * inserted into the burst list. In particular the list will -+ * happen to contain only bfqq. And this is exactly what has -+ * to happen, as bfqq may be the first queue of the first -+ * burst. -+ */ -+ if (time_is_before_jiffies(bfqd->last_ins_in_burst + -+ bfqd->bfq_burst_interval) || -+ bfqq->entity.parent != bfqd->burst_parent_entity) { -+ bfqd->large_burst = false; -+ bfq_reset_burst_list(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "handle_burst: late activation or different group"); -+ goto end; -+ } -+ -+ /* -+ * If we get here, then bfqq is being activated shortly after the -+ * last queue. So, if the current burst is also large, we can mark -+ * bfqq as belonging to this large burst immediately. -+ */ -+ if (bfqd->large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ goto end; -+ } -+ -+ /* -+ * If we get here, then a large-burst state has not yet been -+ * reached, but bfqq is being activated shortly after the last -+ * queue. Then we add bfqq to the burst. -+ */ -+ bfq_add_to_burst(bfqd, bfqq); -+end: -+ /* -+ * At this point, bfqq either has been added to the current -+ * burst or has caused the current burst to terminate and a -+ * possible new burst to start. In particular, in the second -+ * case, bfqq has become the first queue in the possible new -+ * burst. In both cases last_ins_in_burst needs to be moved -+ * forward. -+ */ -+ bfqd->last_ins_in_burst = jiffies; -+ -+} -+ -+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ return entity->budget - entity->service; -+} -+ -+/* -+ * If enough samples have been computed, return the current max budget -+ * stored in bfqd, which is dynamically updated according to the -+ * estimated disk peak rate; otherwise return the default max budget -+ */ -+static int bfq_max_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget; -+ else -+ return bfqd->bfq_max_budget; -+} -+ -+/* -+ * Return min budget, which is a fraction of the current or default -+ * max budget (trying with 1/32) -+ */ -+static int bfq_min_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget / 32; -+ else -+ return bfqd->bfq_max_budget / 32; -+} -+ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason); -+ -+/* -+ * The next function, invoked after the input queue bfqq switches from -+ * idle to busy, updates the budget of bfqq. The function also tells -+ * whether the in-service queue should be expired, by returning -+ * true. 
The purpose of expiring the in-service queue is to give bfqq -+ * the chance to possibly preempt the in-service queue, and the reason -+ * for preempting the in-service queue is to achieve one of the two -+ * goals below. -+ * -+ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has -+ * expired because it has remained idle. In particular, bfqq may have -+ * expired for one of the following two reasons: -+ * -+ * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and -+ * did not make it to issue a new request before its last request -+ * was served; -+ * -+ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue -+ * a new request before the expiration of the idling-time. -+ * -+ * Even if bfqq has expired for one of the above reasons, the process -+ * associated with the queue may be however issuing requests greedily, -+ * and thus be sensitive to the bandwidth it receives (bfqq may have -+ * remained idle for other reasons: CPU high load, bfqq not enjoying -+ * idling, I/O throttling somewhere in the path from the process to -+ * the I/O scheduler, ...). But if, after every expiration for one of -+ * the above two reasons, bfqq has to wait for the service of at least -+ * one full budget of another queue before being served again, then -+ * bfqq is likely to get a much lower bandwidth or resource time than -+ * its reserved ones. To address this issue, two countermeasures need -+ * to be taken. -+ * -+ * First, the budget and the timestamps of bfqq need to be updated in -+ * a special way on bfqq reactivation: they need to be updated as if -+ * bfqq did not remain idle and did not expire. In fact, if they are -+ * computed as if bfqq expired and remained idle until reactivation, -+ * then the process associated with bfqq is treated as if, instead of -+ * being greedy, it stopped issuing requests when bfqq remained idle, -+ * and restarts issuing requests only on this reactivation. In other -+ * words, the scheduler does not help the process recover the "service -+ * hole" between bfqq expiration and reactivation. As a consequence, -+ * the process receives a lower bandwidth than its reserved one. In -+ * contrast, to recover this hole, the budget must be updated as if -+ * bfqq was not expired at all before this reactivation, i.e., it must -+ * be set to the value of the remaining budget when bfqq was -+ * expired. Along the same line, timestamps need to be assigned the -+ * value they had the last time bfqq was selected for service, i.e., -+ * before last expiration. Thus timestamps need to be back-shifted -+ * with respect to their normal computation (see [1] for more details -+ * on this tricky aspect). -+ * -+ * Secondly, to allow the process to recover the hole, the in-service -+ * queue must be expired too, to give bfqq the chance to preempt it -+ * immediately. In fact, if bfqq has to wait for a full budget of the -+ * in-service queue to be completed, then it may become impossible to -+ * let the process recover the hole, even if the back-shifted -+ * timestamps of bfqq are lower than those of the in-service queue. If -+ * this happens for most or all of the holes, then the process may not -+ * receive its reserved bandwidth. In this respect, it is worth noting -+ * that, being the service of outstanding requests unpreemptible, a -+ * little fraction of the holes may however be unrecoverable, thereby -+ * causing a little loss of bandwidth. -+ * -+ * The last important point is detecting whether bfqq does need this -+ * bandwidth recovery. 
In this respect, the next function deems the -+ * process associated with bfqq greedy, and thus allows it to recover -+ * the hole, if: 1) the process is waiting for the arrival of a new -+ * request (which implies that bfqq expired for one of the above two -+ * reasons), and 2) such a request has arrived soon. The first -+ * condition is controlled through the flag non_blocking_wait_rq, -+ * while the second through the flag arrived_in_time. If both -+ * conditions hold, then the function computes the budget in the -+ * above-described special way, and signals that the in-service queue -+ * should be expired. Timestamp back-shifting is done later in -+ * __bfq_activate_entity. -+ * -+ * 2. Reduce latency. Even if timestamps are not backshifted to let -+ * the process associated with bfqq recover a service hole, bfqq may -+ * however happen to have, after being (re)activated, a lower finish -+ * timestamp than the in-service queue. That is, the next budget of -+ * bfqq may have to be completed before the one of the in-service -+ * queue. If this is the case, then preempting the in-service queue -+ * allows this goal to be achieved, apart from the unpreemptible, -+ * outstanding requests mentioned above. -+ * -+ * Unfortunately, regardless of which of the above two goals one wants -+ * to achieve, service trees need first to be updated to know whether -+ * the in-service queue must be preempted. To have service trees -+ * correctly updated, the in-service queue must be expired and -+ * rescheduled, and bfqq must be scheduled too. This is one of the -+ * most costly operations (in future versions, the scheduling -+ * mechanism may be re-designed in such a way to make it possible to -+ * know whether preemption is needed without needing to update service -+ * trees). In addition, queue preemptions almost always cause random -+ * I/O, and thus loss of throughput. Because of these facts, the next -+ * function adopts the following simple scheme to avoid both costly -+ * operations and too frequent preemptions: it requests the expiration -+ * of the in-service queue (unconditionally) only for queues that need -+ * to recover a hole, or that either are weight-raised or deserve to -+ * be weight-raised. -+ */ -+static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool arrived_in_time, -+ bool wr_or_deserves_wr) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { -+ /* -+ * We do not clear the flag non_blocking_wait_rq here, as -+ * the latter is used in bfq_activate_bfqq to signal -+ * that timestamps need to be back-shifted (and is -+ * cleared right after). -+ */ -+ -+ /* -+ * In next assignment we rely on that either -+ * entity->service or entity->budget are not updated -+ * on expiration if bfqq is empty (see -+ * __bfq_bfqq_recalc_budget). Thus both quantities -+ * remain unchanged after such an expiration, and the -+ * following statement therefore assigns to -+ * entity->budget the remaining budget on such an -+ * expiration. For clarity, entity->service is not -+ * updated on expiration in any case, and, in normal -+ * operation, is reset only when bfqq is selected for -+ * service (see bfq_get_next_queue). 
-+ */ -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = min_t(unsigned long, -+ bfq_bfqq_budget_left(bfqq), -+ bfqq->max_budget); -+ -+ BUG_ON(entity->budget < 0); -+ return true; -+ } -+ -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(bfqq->next_rq, bfqq)); -+ BUG_ON(entity->budget < 0); -+ -+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); -+ return wr_or_deserves_wr; -+} -+ -+static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ unsigned int old_wr_coeff, -+ bool wr_or_deserves_wr, -+ bool interactive, -+ bool in_burst, -+ bool soft_rt) -+{ -+ if (old_wr_coeff == 1 && wr_or_deserves_wr) { -+ /* start a weight-raising period */ -+ if (interactive) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else { -+ bfqq->wr_start_at_switch_to_srt = jiffies; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ } -+ /* -+ * If needed, further reduce budget to make sure it is -+ * close to bfqq's backlog, so as to reduce the -+ * scheduling-error component due to a too large -+ * budget. Do not care about throughput consequences, -+ * but only about latency. Finally, do not assign a -+ * too small budget either, to avoid increasing -+ * latency by causing too frequent expirations. -+ */ -+ bfqq->entity.budget = min_t(unsigned long, -+ bfqq->entity.budget, -+ 2 * bfq_min_budget(bfqd)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais starting at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } else if (old_wr_coeff > 1) { -+ if (interactive) { /* update wr coeff and duration */ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else if (in_burst) { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais ending at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq-> -+ wr_cur_max_time)); -+ } else if (soft_rt) { -+ /* -+ * The application is now or still meeting the -+ * requirements for being deemed soft rt. We -+ * can then correctly and safely (re)charge -+ * the weight-raising duration for the -+ * application with the weight-raising -+ * duration for soft rt applications. -+ * -+ * In particular, doing this recharge now, i.e., -+ * before the weight-raising period for the -+ * application finishes, reduces the probability -+ * of the following negative scenario: -+ * 1) the weight of a soft rt application is -+ * raised at startup (as for any newly -+ * created application), -+ * 2) since the application is not interactive, -+ * at a certain time weight-raising is -+ * stopped for the application, -+ * 3) at that time the application happens to -+ * still have pending requests, and hence -+ * is destined to not have a chance to be -+ * deemed soft rt before these requests are -+ * completed (see the comments to the -+ * function bfq_bfqq_softrt_next_start() -+ * for details on soft rt detection), -+ * 4) these pending requests experience a high -+ * latency because the application is not -+ * weight-raised while they are pending. 
-+ */ -+ if (bfqq->wr_cur_max_time != -+ bfqd->bfq_wr_rt_max_time) { -+ bfqq->wr_start_at_switch_to_srt = -+ bfqq->last_wr_start_finish; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfq_log_bfqq(bfqd, bfqq, -+ "switching to soft_rt wr"); -+ } else -+ bfq_log_bfqq(bfqd, bfqq, -+ "moving forward soft_rt wr duration"); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+} -+ -+static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ return bfqq->dispatched == 0 && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ bfqd->bfq_wr_min_idle_time); -+} -+ -+static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ int old_wr_coeff, -+ struct request *rq, -+ bool *interactive) -+{ -+ bool soft_rt, in_burst, wr_or_deserves_wr, -+ bfqq_wants_to_preempt, -+ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), -+ /* -+ * See the comments on -+ * bfq_bfqq_update_budg_for_activation for -+ * details on the usage of the next variable. -+ */ -+ arrived_in_time = ktime_get_ns() <= -+ RQ_BIC(rq)->ttime.last_end_request + -+ bfqd->bfq_slice_idle * 3; -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request non-busy: " -+ "jiffies %lu, in_time %d, idle_long %d busyw %d " -+ "wr_coeff %u", -+ jiffies, arrived_in_time, -+ idle_for_long_time, -+ bfq_bfqq_non_blocking_wait_rq(bfqq), -+ old_wr_coeff); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); -+ -+ /* -+ * bfqq deserves to be weight-raised if: -+ * - it is sync, -+ * - it does not belong to a large burst, -+ * - it has been idle for enough time or is soft real-time, -+ * - is linked to a bfq_io_cq (it is not shared in any sense) -+ */ -+ in_burst = bfq_bfqq_in_large_burst(bfqq); -+ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && -+ !in_burst && -+ time_is_before_jiffies(bfqq->soft_rt_next_start); -+ *interactive = -+ !in_burst && -+ idle_for_long_time; -+ wr_or_deserves_wr = bfqd->low_latency && -+ (bfqq->wr_coeff > 1 || -+ (bfq_bfqq_sync(bfqq) && -+ bfqq->bic && (*interactive || soft_rt))); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request: " -+ "in_burst %d, " -+ "soft_rt %d (next %lu), inter %d, bic %p", -+ bfq_bfqq_in_large_burst(bfqq), soft_rt, -+ bfqq->soft_rt_next_start, -+ *interactive, -+ bfqq->bic); -+ -+ /* -+ * Using the last flag, update budget and check whether bfqq -+ * may want to preempt the in-service queue. -+ */ -+ bfqq_wants_to_preempt = -+ bfq_bfqq_update_budg_for_activation(bfqd, bfqq, -+ arrived_in_time, -+ wr_or_deserves_wr); -+ -+ /* -+ * If bfqq happened to be activated in a burst, but has been -+ * idle for much more than an interactive queue, then we -+ * assume that, in the overall I/O initiated in the burst, the -+ * I/O associated with bfqq is finished. So bfqq does not need -+ * to be treated as a queue belonging to a burst -+ * anymore. Accordingly, we reset bfqq's in_large_burst flag -+ * if set, and remove bfqq from the burst list if it's -+ * there. We do not decrement burst_size, because the fact -+ * that bfqq does not need to belong to the burst list any -+ * more does not invalidate the fact that bfqq was created in -+ * a burst. 
-+ */ -+ if (likely(!bfq_bfqq_just_created(bfqq)) && -+ idle_for_long_time && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ msecs_to_jiffies(10000))) { -+ hlist_del_init(&bfqq->burst_list_node); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ } -+ -+ bfq_clear_bfqq_just_created(bfqq); -+ -+ if (!bfq_bfqq_IO_bound(bfqq)) { -+ if (arrived_in_time) { -+ bfqq->requests_within_timer++; -+ if (bfqq->requests_within_timer >= -+ bfqd->bfq_requests_within_timer) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ } else -+ bfqq->requests_within_timer = 0; -+ bfq_log_bfqq(bfqd, bfqq, "requests in time %d", -+ bfqq->requests_within_timer); -+ } -+ -+ if (bfqd->low_latency) { -+ if (unlikely(time_is_after_jiffies(bfqq->split_time))) -+ /* wraparound */ -+ bfqq->split_time = -+ jiffies - bfqd->bfq_wr_min_idle_time - 1; -+ -+ if (time_is_before_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) { -+ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, -+ old_wr_coeff, -+ wr_or_deserves_wr, -+ *interactive, -+ in_burst, -+ soft_rt); -+ -+ if (old_wr_coeff != bfqq->wr_coeff) -+ bfqq->entity.prio_changed = 1; -+ } -+ } -+ -+ bfqq->last_idle_bklogged = jiffies; -+ bfqq->service_from_backlogged = 0; -+ bfq_clear_bfqq_softrt_update(bfqq); -+ -+ bfq_add_bfqq_busy(bfqd, bfqq); -+ -+ /* -+ * Expire in-service queue only if preemption may be needed -+ * for guarantees. In this respect, the function -+ * next_queue_may_preempt just checks a simple, necessary -+ * condition, and not a sufficient condition based on -+ * timestamps. In fact, for the latter condition to be -+ * evaluated, timestamps would need first to be updated, and -+ * this operation is quite costly (see the comments on the -+ * function bfq_bfqq_update_budg_for_activation). -+ */ -+ if (bfqd->in_service_queue && bfqq_wants_to_preempt && -+ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && -+ next_queue_may_preempt(bfqd)) { -+ struct bfq_queue *in_serv = -+ bfqd->in_service_queue; -+ BUG_ON(in_serv == bfqq); -+ -+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, -+ false, BFQ_BFQQ_PREEMPTED); -+ } -+} -+ -+static void bfq_add_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *next_rq, *prev; -+ unsigned int old_wr_coeff = bfqq->wr_coeff; -+ bool interactive = false; -+ -+ bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", -+ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); -+ -+ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ bfqq->queued[rq_is_sync(rq)]++; -+ bfqd->queued++; -+ -+ elv_rb_add(&bfqq->sort_list, rq); -+ -+ /* -+ * Check if this request is a better next-to-serve candidate. -+ */ -+ prev = bfqq->next_rq; -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); -+ BUG_ON(!next_rq); -+ bfqq->next_rq = next_rq; -+ -+ /* -+ * Adjust priority tree position, if next_rq changes. -+ */ -+ if (prev != bfqq->next_rq) -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ -+ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... 
*/ -+ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, -+ rq, &interactive); -+ else { -+ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && -+ time_is_before_jiffies( -+ bfqq->last_wr_start_finish + -+ bfqd->bfq_wr_min_inter_arr_async)) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "non-idle wrais starting, " -+ "wr_max_time %u wr_busy %d", -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqd->wr_busy_queues); -+ } -+ if (prev != bfqq->next_rq) -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ /* -+ * Assign jiffies to last_wr_start_finish in the following -+ * cases: -+ * -+ * . if bfqq is not going to be weight-raised, because, for -+ * non weight-raised queues, last_wr_start_finish stores the -+ * arrival time of the last request; as of now, this piece -+ * of information is used only for deciding whether to -+ * weight-raise async queues -+ * -+ * . if bfqq is not weight-raised, because, if bfqq is now -+ * switching to weight-raised, then last_wr_start_finish -+ * stores the time when weight-raising starts -+ * -+ * . if bfqq is interactive, because, regardless of whether -+ * bfqq is currently weight-raised, the weight-raising -+ * period must start or restart (this case is considered -+ * separately because it is not detected by the above -+ * conditions, if bfqq is already weight-raised) -+ * -+ * last_wr_start_finish has to be updated also if bfqq is soft -+ * real-time, because the weight-raising period is constantly -+ * restarted on idle-to-busy transitions for these queues, but -+ * this is already done in bfq_bfqq_handle_idle_busy_switch if -+ * needed. 
-+ */ -+ if (bfqd->low_latency && -+ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) -+ bfqq->last_wr_start_finish = jiffies; -+} -+ -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, -+ struct bio *bio) -+{ -+ struct task_struct *tsk = current; -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq; -+ -+ bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ if (!bic) -+ return NULL; -+ -+ bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); -+ if (bfqq) -+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); -+ -+ return NULL; -+} -+ -+static sector_t get_sdist(sector_t last_pos, struct request *rq) -+{ -+ sector_t sdist = 0; -+ -+ if (last_pos) { -+ if (last_pos < blk_rq_pos(rq)) -+ sdist = blk_rq_pos(rq) - last_pos; -+ else -+ sdist = last_pos - blk_rq_pos(rq); -+ } -+ -+ return sdist; -+} -+ -+static void bfq_activate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ bfqd->rq_in_driver++; -+} -+ -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ -+ BUG_ON(bfqd->rq_in_driver == 0); -+ bfqd->rq_in_driver--; -+} -+ -+static void bfq_remove_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ const int sync = rq_is_sync(rq); -+ -+ BUG_ON(bfqq->entity.service > bfqq->entity.budget && -+ bfqq == bfqd->in_service_queue); -+ -+ if (bfqq->next_rq == rq) { -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ if (rq->queuelist.prev != &rq->queuelist) -+ list_del_init(&rq->queuelist); -+ BUG_ON(bfqq->queued[sync] == 0); -+ bfqq->queued[sync]--; -+ bfqd->queued--; -+ elv_rb_del(&bfqq->sort_list, rq); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ bfqq->next_rq = NULL; -+ -+ BUG_ON(bfqq->entity.budget < 0); -+ -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { -+ BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */ -+ bfq_del_bfqq_busy(bfqd, bfqq, false); -+ /* -+ * bfqq emptied. In normal operation, when -+ * bfqq is empty, bfqq->entity.service and -+ * bfqq->entity.budget must contain, -+ * respectively, the service received and the -+ * budget used last time bfqq emptied. These -+ * facts do not hold in this case, as at least -+ * this last removal occurred while bfqq is -+ * not in service. To avoid inconsistencies, -+ * reset both bfqq->entity.service and -+ * bfqq->entity.budget, if bfqq has still a -+ * process that may issue I/O requests to it. -+ */ -+ bfqq->entity.budget = bfqq->entity.service = 0; -+ } -+ -+ /* -+ * Remove queue from request-position tree as it is empty. 
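
The get_sdist() helper above is worth a second look: sector_t is unsigned, so the seek distance has to be computed branch-wise rather than with abs(), or a backward seek would wrap around to a huge value. A minimal stand-alone rendition, with uint64_t standing in for sector_t:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/*
 * Distance between the last head position and a new request.
 * Branching keeps the unsigned subtraction from wrapping around.
 */
static sector_t sdist(sector_t last_pos, sector_t new_pos)
{
	if (!last_pos)
		return 0;	/* no history yet */
	return new_pos > last_pos ? new_pos - last_pos
				  : last_pos - new_pos;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)sdist(1000, 8));	/* 992 */
	printf("%llu\n", (unsigned long long)sdist(8, 1000));	/* 992 */
	return 0;
}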
-+ */ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ } -+ -+ if (rq->cmd_flags & REQ_META) { -+ BUG_ON(bfqq->meta_pending == 0); -+ bfqq->meta_pending--; -+ } -+ bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); -+} -+ -+static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct request *__rq; -+ -+ __rq = bfq_find_rq_fmerge(bfqd, bio); -+ if (__rq && elv_bio_merge_ok(__rq, bio)) { -+ *req = __rq; -+ return ELEVATOR_FRONT_MERGE; -+ } -+ -+ return ELEVATOR_NO_MERGE; -+} -+ -+static void bfq_merged_request(struct request_queue *q, struct request *req, -+ enum elv_merge type) -+{ -+ if (type == ELEVATOR_FRONT_MERGE && -+ rb_prev(&req->rb_node) && -+ blk_rq_pos(req) < -+ blk_rq_pos(container_of(rb_prev(&req->rb_node), -+ struct request, rb_node))) { -+ struct bfq_queue *bfqq = RQ_BFQQ(req); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *prev, *next_rq; -+ -+ /* Reposition request in its sort_list */ -+ elv_rb_del(&bfqq->sort_list, req); -+ elv_rb_add(&bfqq->sort_list, req); -+ /* Choose next request to be served for bfqq */ -+ prev = bfqq->next_rq; -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, -+ bfqd->last_position); -+ BUG_ON(!next_rq); -+ bfqq->next_rq = next_rq; -+ /* -+ * If next_rq changes, update both the queue's budget to -+ * fit the new request and the queue's position in its -+ * rq_pos_tree. -+ */ -+ if (prev != bfqq->next_rq) { -+ bfq_updated_next_req(bfqd, bfqq); -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } -+ } -+} -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static void bfq_bio_merged(struct request_queue *q, struct request *req, -+ struct bio *bio) -+{ -+ bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); -+} -+#endif -+ -+static void bfq_merged_requests(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); -+ -+ /* -+ * If next and rq belong to the same bfq_queue and next is older -+ * than rq, then reposition rq in the fifo (by substituting next -+ * with rq). Otherwise, if next and rq belong to different -+ * bfq_queues, never reposition rq: in fact, we would have to -+ * reposition it with respect to next's position in its own fifo, -+ * which would most certainly be too expensive with respect to -+ * the benefits. -+ */ -+ if (bfqq == next_bfqq && -+ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && -+ next->fifo_time < rq->fifo_time) { -+ list_del_init(&rq->queuelist); -+ list_replace_init(&next->queuelist, &rq->queuelist); -+ rq->fifo_time = next->fifo_time; -+ } -+ -+ if (bfqq->next_rq == next) -+ bfqq->next_rq = rq; -+ -+ bfq_remove_request(next); -+ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -+} -+ -+/* Must be called with bfqq != NULL */ -+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) -+{ -+ BUG_ON(!bfqq); -+ -+ if (bfq_bfqq_busy(bfqq)) { -+ bfqq->bfqd->wr_busy_queues--; -+ BUG_ON(bfqq->bfqd->wr_busy_queues < 0); -+ } -+ bfqq->wr_coeff = 1; -+ bfqq->wr_cur_max_time = 0; -+ bfqq->last_wr_start_finish = jiffies; -+ /* -+ * Trigger a weight change on the next invocation of -+ * __bfq_entity_update_weight_prio. 
-+ */ -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "end_wr: wrais ending at %lu, rais_max_time %u", -+ bfqq->last_wr_start_finish, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", -+ bfqq->bfqd->wr_busy_queues); -+} -+ -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ if (bfqg->async_bfqq[i][j]) -+ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); -+ if (bfqg->async_idle_bfqq) -+ bfq_bfqq_end_wr(bfqg->async_idle_bfqq); -+} -+ -+static void bfq_end_wr(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ -+ spin_lock_irq(bfqd->queue->queue_lock); -+ -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ bfq_end_wr_async(bfqd); -+ -+ spin_unlock_irq(bfqd->queue->queue_lock); -+} -+ -+static sector_t bfq_io_struct_pos(void *io_struct, bool request) -+{ -+ if (request) -+ return blk_rq_pos(io_struct); -+ else -+ return ((struct bio *)io_struct)->bi_iter.bi_sector; -+} -+ -+static int bfq_rq_close_to_sector(void *io_struct, bool request, -+ sector_t sector) -+{ -+ return abs(bfq_io_struct_pos(io_struct, request) - sector) <= -+ BFQQ_CLOSE_THR; -+} -+ -+static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ sector_t sector) -+{ -+ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ struct rb_node *parent, *node; -+ struct bfq_queue *__bfqq; -+ -+ if (RB_EMPTY_ROOT(root)) -+ return NULL; -+ -+ /* -+ * First, if we find a request starting at the end of the last -+ * request, choose it. -+ */ -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); -+ if (__bfqq) -+ return __bfqq; -+ -+ /* -+ * If the exact sector wasn't found, the parent of the NULL leaf -+ * will contain the closest sector (rq_pos_tree sorted by -+ * next_request position). -+ */ -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ if (blk_rq_pos(__bfqq->next_rq) < sector) -+ node = rb_next(&__bfqq->pos_node); -+ else -+ node = rb_prev(&__bfqq->pos_node); -+ if (!node) -+ return NULL; -+ -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ return NULL; -+} -+ -+static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, -+ struct bfq_queue *cur_bfqq, -+ sector_t sector) -+{ -+ struct bfq_queue *bfqq; -+ -+ /* -+ * We shall notice if some of the queues are cooperating, -+ * e.g., working closely on the same area of the device. In -+ * that case, we can group them together and: 1) don't waste -+ * time idling, and 2) serve the union of their requests in -+ * the best possible order for throughput. -+ */ -+ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); -+ if (!bfqq || bfqq == cur_bfqq) -+ return NULL; -+ -+ return bfqq; -+} -+ -+static struct bfq_queue * -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ int process_refs, new_process_refs; -+ struct bfq_queue *__bfqq; -+ -+ /* -+ * If there are no process references on the new_bfqq, then it is -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain -+ * may have dropped their last reference (not just their last process -+ * reference). 
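
bfq_end_wr() above has to sweep every place a queue can live: the active list, the idle list, and the per-group async matrix indexed by sync class and ioprio level. A toy version of the same sweep, with the lists reduced to plain arrays (IOPRIO_LEVELS and the struct are illustrative):

#include <stdio.h>

#define IOPRIO_LEVELS 8		/* stands in for IOPRIO_BE_NR */

struct queue { unsigned int wr_coeff; };

static void end_wr(struct queue *q)
{
	if (q)
		q->wr_coeff = 1;	/* back to the plain weight */
}

int main(void)
{
	struct queue async[2][IOPRIO_LEVELS] = { { { 0 } } };
	struct queue idle_q = { 30 };	/* a still-raised queue */
	int i, j;

	async[1][3].wr_coeff = 30;	/* one raised async queue */

	/* async queues live in a sync/class matrix, as above */
	for (i = 0; i < 2; i++)
		for (j = 0; j < IOPRIO_LEVELS; j++)
			end_wr(&async[i][j]);
	end_wr(&idle_q);

	printf("async[1][3] %u, idle %u\n",
	       async[1][3].wr_coeff, idle_q.wr_coeff);
	return 0;
}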
-+ */ -+ if (!bfqq_process_refs(new_bfqq)) -+ return NULL; -+ -+ /* Avoid a circular list and skip interim queue merges. */ -+ while ((__bfqq = new_bfqq->new_bfqq)) { -+ if (__bfqq == bfqq) -+ return NULL; -+ new_bfqq = __bfqq; -+ } -+ -+ process_refs = bfqq_process_refs(bfqq); -+ new_process_refs = bfqq_process_refs(new_bfqq); -+ /* -+ * If the process for the bfqq has gone away, there is no -+ * sense in merging the queues. -+ */ -+ if (process_refs == 0 || new_process_refs == 0) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", -+ new_bfqq->pid); -+ -+ /* -+ * Merging is just a redirection: the requests of the process -+ * owning one of the two queues are redirected to the other queue. -+ * The latter queue, in its turn, is set as shared if this is the -+ * first time that the requests of some process are redirected to -+ * it. -+ * -+ * We redirect bfqq to new_bfqq and not the opposite, because we -+ * are in the context of the process owning bfqq, hence we have -+ * the io_cq of this process. So we can immediately configure this -+ * io_cq to redirect the requests of the process to new_bfqq. -+ * -+ * NOTE, even if new_bfqq coincides with the in-service queue, the -+ * io_cq of new_bfqq is not available, because, if the in-service -+ * queue is shared, bfqd->in_service_bic may not point to the -+ * io_cq of the in-service queue. -+ * Redirecting the requests of the process owning bfqq to the -+ * currently in-service queue is in any case the best option, as -+ * we feed the in-service queue with new requests close to the -+ * last request served and, by doing so, hopefully increase the -+ * throughput. -+ */ -+ bfqq->new_bfqq = new_bfqq; -+ new_bfqq->ref += process_refs; -+ return new_bfqq; -+} -+ -+static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, -+ struct bfq_queue *new_bfqq) -+{ -+ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || -+ (bfqq->ioprio_class != new_bfqq->ioprio_class)) -+ return false; -+ -+ /* -+ * If either of the queues has already been detected as seeky, -+ * then merging it with the other queue is unlikely to lead to -+ * sequential I/O. -+ */ -+ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) -+ return false; -+ -+ /* -+ * Interleaved I/O is known to be done by (some) applications -+ * only for reads, so it does not make sense to merge async -+ * queues. -+ */ -+ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) -+ return false; -+ -+ return true; -+} -+ -+/* -+ * If this function returns true, then bfqq cannot be merged. The idea -+ * is that true cooperation happens very early after processes start -+ * to do I/O. Usually, late cooperations are just accidental false -+ * positives. In case bfqq is weight-raised, such false positives -+ * would evidently degrade latency guarantees for bfqq. -+ */ -+static bool wr_from_too_long(struct bfq_queue *bfqq) -+{ -+ return bfqq->wr_coeff > 1 && -+ time_is_before_jiffies(bfqq->last_wr_start_finish + -+ msecs_to_jiffies(100)); -+} -+ -+/* -+ * Attempt to schedule a merge of bfqq with the currently in-service -+ * queue or with a close queue among the scheduled queues. Return -+ * NULL if no merge was scheduled, a pointer to the shared bfq_queue -+ * structure otherwise. 
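
The chain walk in bfq_setup_merge() is the subtle part: a queue may already be scheduled to merge into a queue that is itself merging, and the walk must land on the final target without following a loop. A compact user-space sketch of that walk, under the simplifying assumption that a single refs counter models the process references of the real code:

#include <stddef.h>
#include <stdio.h>

struct queue {
	int refs;		/* stand-in for process references */
	struct queue *new_q;	/* set once a merge has been scheduled */
};

/*
 * Follow the chain of already-scheduled merges to the final target,
 * refusing circular chains and targets with no references left.
 */
static struct queue *setup_merge(struct queue *q, struct queue *new_q)
{
	struct queue *next;

	if (!new_q->refs)
		return NULL;		/* target may be going away */

	while ((next = new_q->new_q)) {
		if (next == q)
			return NULL;	/* circular chain */
		new_q = next;
	}

	if (!q->refs)
		return NULL;

	q->new_q = new_q;		/* redirect q into the target */
	new_q->refs += q->refs;
	return new_q;
}

int main(void)
{
	struct queue a = { 1, NULL }, b = { 1, NULL }, c = { 1, NULL };

	b.new_q = &c;			/* b is already merging into c */
	printf("a merges into c? %s\n",
	       setup_merge(&a, &b) == &c ? "yes" : "no");
	return 0;
}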
-+ *
-+ * The OOM queue is not allowed to participate in cooperation: in fact, since
-+ * the requests temporarily redirected to the OOM queue could be redirected
-+ * again to dedicated queues at any time, the state needed to correctly
-+ * handle merging with the OOM queue would be quite complex and expensive
-+ * to maintain. Besides, in such a critical condition as an out of memory,
-+ * the benefits of queue merging may be of little relevance, or even negligible.
-+ *
-+ * Weight-raised queues can be merged only if their weight-raising
-+ * period has just started. In fact cooperating processes are usually
-+ * started together. Thus, with this filter we avoid false positives
-+ * that would jeopardize low-latency guarantees.
-+ *
-+ * WARNING: queue merging may impair fairness among non-weight raised
-+ * queues, for at least two reasons: 1) the original weight of a
-+ * merged queue may change during the merged state, 2) even if the
-+ * weight stays the same, a merged queue may be bloated with many more
-+ * requests than the ones produced by its originally-associated
-+ * process.
-+ */
-+static struct bfq_queue *
-+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-+ void *io_struct, bool request)
-+{
-+ struct bfq_queue *in_service_bfqq, *new_bfqq;
-+
-+ if (bfqq->new_bfqq)
-+ return bfqq->new_bfqq;
-+
-+ if (io_struct && wr_from_too_long(bfqq) &&
-+ likely(bfqq != &bfqd->oom_bfqq))
-+ bfq_log_bfqq(bfqd, bfqq,
-+ "would have looked for coop, but bfq%d wr",
-+ bfqq->pid);
-+
-+ if (!io_struct ||
-+ wr_from_too_long(bfqq) ||
-+ unlikely(bfqq == &bfqd->oom_bfqq))
-+ return NULL;
-+
-+ /* If there is only one backlogged queue, don't search. */
-+ if (bfqd->busy_queues == 1)
-+ return NULL;
-+
-+ in_service_bfqq = bfqd->in_service_queue;
-+
-+ if (in_service_bfqq && in_service_bfqq != bfqq &&
-+ bfqd->in_service_bic && wr_from_too_long(in_service_bfqq)
-+ && likely(in_service_bfqq == &bfqd->oom_bfqq))
-+ bfq_log_bfqq(bfqd, bfqq,
-+ "would have tried merge with in-service-queue, but wr");
-+
-+ if (!in_service_bfqq || in_service_bfqq == bfqq ||
-+ !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) ||
-+ unlikely(in_service_bfqq == &bfqd->oom_bfqq))
-+ goto check_scheduled;
-+
-+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
-+ bfqq->entity.parent == in_service_bfqq->entity.parent &&
-+ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
-+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
-+ if (new_bfqq)
-+ return new_bfqq;
-+ }
-+ /*
-+ * Check whether there is a cooperator among currently scheduled
-+ * queues. The only thing we need is that the bio/request is not
-+ * NULL, as we need it to establish whether a cooperator exists.
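
The wr_from_too_long() filter above is effectively a 100 ms merge window at the start of each weight-raising period. A stand-alone approximation, using plain millisecond timestamps instead of jiffies (the real code compares against last_wr_start_finish):

#include <stdbool.h>
#include <stdio.h>

struct queue {
	unsigned int wr_coeff;		/* 1 = not weight-raised */
	unsigned long wr_start_ms;	/* start of the current wr period */
};

/*
 * True if the queue has been weight-raised for more than 100 ms:
 * cooperation that shows up this late is assumed to be accidental,
 * so the queue is no longer allowed to merge.
 */
static bool wr_too_long(const struct queue *q, unsigned long now_ms)
{
	return q->wr_coeff > 1 && now_ms - q->wr_start_ms > 100;
}

int main(void)
{
	struct queue q = { 30, 1000 };

	printf("t=1050: %d (still mergeable)\n", wr_too_long(&q, 1050));
	printf("t=2000: %d (window closed)\n", wr_too_long(&q, 2000));
	return 0;
}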
-+ */ -+check_scheduled: -+ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, -+ bfq_io_struct_pos(io_struct, request)); -+ -+ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); -+ -+ if (new_bfqq && wr_from_too_long(new_bfqq) && -+ likely(new_bfqq != &bfqd->oom_bfqq) && -+ bfq_may_be_close_cooperator(bfqq, new_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have merged with bfq%d, but wr", -+ new_bfqq->pid); -+ -+ if (new_bfqq && !wr_from_too_long(new_bfqq) && -+ likely(new_bfqq != &bfqd->oom_bfqq) && -+ bfq_may_be_close_cooperator(bfqq, new_bfqq)) -+ return bfq_setup_merge(bfqq, new_bfqq); -+ -+ return NULL; -+} -+ -+static void bfq_bfqq_save_state(struct bfq_queue *bfqq) -+{ -+ struct bfq_io_cq *bic = bfqq->bic; -+ -+ /* -+ * If !bfqq->bic, the queue is already shared or its requests -+ * have already been redirected to a shared queue; both idle window -+ * and weight raising state have already been saved. Do nothing. -+ */ -+ if (!bic) -+ return; -+ -+ bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); -+ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); -+ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -+ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -+ bic->saved_wr_coeff = bfqq->wr_coeff; -+ bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; -+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+} -+ -+static void bfq_get_bic_reference(struct bfq_queue *bfqq) -+{ -+ /* -+ * If bfqq->bic has a non-NULL value, the bic to which it belongs -+ * is about to begin using a shared bfq_queue. -+ */ -+ if (bfqq->bic) -+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount); -+} -+ -+static void -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", -+ (unsigned long) new_bfqq->pid); -+ /* Save weight raising and idle window of the merged queues */ -+ bfq_bfqq_save_state(bfqq); -+ bfq_bfqq_save_state(new_bfqq); -+ if (bfq_bfqq_IO_bound(bfqq)) -+ bfq_mark_bfqq_IO_bound(new_bfqq); -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ /* -+ * If bfqq is weight-raised, then let new_bfqq inherit -+ * weight-raising. To reduce false positives, neglect the case -+ * where bfqq has just been created, but has not yet made it -+ * to be weight-raised (which may happen because EQM may merge -+ * bfqq even before bfq_add_request is executed for the first -+ * time for bfqq). Handling this case would however be very -+ * easy, thanks to the flag just_created. 
-+ */ -+ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { -+ new_bfqq->wr_coeff = bfqq->wr_coeff; -+ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; -+ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; -+ new_bfqq->wr_start_at_switch_to_srt = -+ bfqq->wr_start_at_switch_to_srt; -+ if (bfq_bfqq_busy(new_bfqq)) { -+ bfqd->wr_busy_queues++; -+ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); -+ } -+ -+ new_bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, new_bfqq, -+ "wr start after merge with %d, rais_max_time %u", -+ bfqq->pid, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ -+ bfqq->wr_coeff = 1; -+ bfqq->entity.prio_changed = 1; -+ if (bfq_bfqq_busy(bfqq)) { -+ bfqd->wr_busy_queues--; -+ BUG_ON(bfqd->wr_busy_queues < 0); -+ } -+ -+ } -+ -+ bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", -+ bfqd->wr_busy_queues); -+ -+ /* -+ * Grab a reference to the bic, to prevent it from being destroyed -+ * before being possibly touched by a bfq_split_bfqq(). -+ */ -+ bfq_get_bic_reference(bfqq); -+ bfq_get_bic_reference(new_bfqq); -+ /* -+ * Merge queues (that is, let bic redirect its requests to new_bfqq) -+ */ -+ bic_set_bfqq(bic, new_bfqq, 1); -+ bfq_mark_bfqq_coop(new_bfqq); -+ /* -+ * new_bfqq now belongs to at least two bics (it is a shared queue): -+ * set new_bfqq->bic to NULL. bfqq either: -+ * - does not belong to any bic any more, and hence bfqq->bic must -+ * be set to NULL, or -+ * - is a queue whose owning bics have already been redirected to a -+ * different queue, hence the queue is destined to not belong to -+ * any bic soon and bfqq->bic is already NULL (therefore the next -+ * assignment causes no harm). -+ */ -+ new_bfqq->bic = NULL; -+ bfqq->bic = NULL; -+ /* release process reference to bfqq */ -+ bfq_put_queue(bfqq); -+} -+ -+static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ bool is_sync = op_is_sync(bio->bi_opf); -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq, *new_bfqq; -+ -+ /* -+ * Disallow merge of a sync bio into an async request. -+ */ -+ if (is_sync && !rq_is_sync(rq)) -+ return false; -+ -+ /* -+ * Lookup the bfqq that this bio will be queued with. Allow -+ * merge only if rq is queued there. -+ * Queue lock is held here. -+ */ -+ bic = bfq_bic_lookup(bfqd, current->io_context); -+ if (!bic) -+ return false; -+ -+ bfqq = bic_to_bfqq(bic, is_sync); -+ /* -+ * We take advantage of this function to perform an early merge -+ * of the queues of possible cooperating processes. -+ */ -+ if (bfqq) { -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -+ if (new_bfqq) { -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); -+ /* -+ * If we get here, the bio will be queued in the -+ * shared queue, i.e., new_bfqq, so use new_bfqq -+ * to decide whether bio and rq can be merged. -+ */ -+ bfqq = new_bfqq; -+ } -+ } -+ -+ return bfqq == RQ_BFQQ(rq); -+} -+ -+static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ return RQ_BFQQ(rq) == RQ_BFQQ(next); -+} -+ -+/* -+ * Set the maximum time for the in-service queue to consume its -+ * budget. This prevents seeky processes from lowering the throughput. -+ * In practice, a time-slice service scheme is used with seeky -+ * processes. 
-+ */ -+static void bfq_set_budget_timeout(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ unsigned int timeout_coeff; -+ -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) -+ timeout_coeff = 1; -+ else -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; -+ -+ bfqd->last_budget_start = ktime_get(); -+ -+ bfqq->budget_timeout = jiffies + -+ bfqd->bfq_timeout * timeout_coeff; -+ -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", -+ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); -+} -+ -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ if (bfqq) { -+ bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); -+ bfq_mark_bfqq_must_alloc(bfqq); -+ bfq_clear_bfqq_fifo_expire(bfqq); -+ -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (time_is_before_jiffies(bfqq->last_wr_start_finish) && -+ bfqq->wr_coeff > 1 && -+ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_before_jiffies(bfqq->budget_timeout)) { -+ /* -+ * For soft real-time queues, move the start -+ * of the weight-raising period forward by the -+ * time the queue has not received any -+ * service. Otherwise, a relatively long -+ * service delay is likely to cause the -+ * weight-raising period of the queue to end, -+ * because of the short duration of the -+ * weight-raising period of a soft real-time -+ * queue. It is worth noting that this move -+ * is not so dangerous for the other queues, -+ * because soft real-time queues are not -+ * greedy. -+ * -+ * To not add a further variable, we use the -+ * overloaded field budget_timeout to -+ * determine for how long the queue has not -+ * received service, i.e., how much time has -+ * elapsed since the queue expired. However, -+ * this is a little imprecise, because -+ * budget_timeout is set to jiffies if bfqq -+ * not only expires, but also remains with no -+ * request. -+ */ -+ if (time_after(bfqq->budget_timeout, -+ bfqq->last_wr_start_finish)) -+ bfqq->last_wr_start_finish += -+ jiffies - bfqq->budget_timeout; -+ else -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { -+ pr_crit( -+ "BFQ WARNING:last %lu budget %lu jiffies %lu", -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout, -+ jiffies); -+ pr_crit("diff %lu", jiffies - -+ max_t(unsigned long, -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout)); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+ -+ bfq_set_budget_timeout(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_in_service_queue, cur-budget = %d", -+ bfqq->entity.budget); -+ } else -+ bfq_log(bfqd, "set_in_service_queue: NULL"); -+ -+ bfqd->in_service_queue = bfqq; -+} -+ -+/* -+ * Get and set a new queue for service. -+ */ -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); -+ -+ __bfq_set_in_service_queue(bfqd, bfqq); -+ return bfqq; -+} -+ -+static void bfq_arm_slice_timer(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfqd->in_service_queue; -+ struct bfq_io_cq *bic; -+ u32 sl; -+ -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ /* Processes have exited, don't wait. 
*/
-+ bic = bfqd->in_service_bic;
-+ if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
-+ return;
-+
-+ bfq_mark_bfqq_wait_request(bfqq);
-+
-+ /*
-+ * We don't want to idle for seeks, but we do want to allow
-+ * fair distribution of slice time for a process doing back-to-back
-+ * seeks. So allow a little bit of time for it to submit a new rq.
-+ *
-+ * To prevent processes with (partly) seeky workloads from
-+ * being too ill-treated, grant them a small fraction of the
-+ * assigned budget before reducing the waiting time to
-+ * BFQ_MIN_TT. This happened to help reduce latency.
-+ */
-+ sl = bfqd->bfq_slice_idle;
-+ /*
-+ * Unless the queue is being weight-raised or the scenario is
-+ * asymmetric, grant only minimum idle time if the queue
-+ * is seeky. A long idling is preserved for a weight-raised
-+ * queue, or, more in general, in an asymmetric scenario,
-+ * because a long idling is needed for guaranteeing to a queue
-+ * its reserved share of the throughput (in particular, it is
-+ * needed if the queue has a higher weight than some other
-+ * queue).
-+ */
-+ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
-+ bfq_symmetric_scenario(bfqd))
-+ sl = min_t(u32, sl, BFQ_MIN_TT);
-+
-+ bfqd->last_idling_start = ktime_get();
-+ hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
-+ HRTIMER_MODE_REL);
-+ bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
-+ bfq_log(bfqd, "arm idle: %ld/%ld ms",
-+ sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC);
-+}
-+
-+/*
-+ * In autotuning mode, max_budget is dynamically recomputed as the
-+ * amount of sectors transferred in timeout at the estimated peak
-+ * rate. This enables BFQ to utilize a full timeslice with a full
-+ * budget, even if the in-service queue is served at peak rate. And
-+ * this maximises throughput with sequential workloads.
-+ */
-+static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
-+{
-+ return (u64)bfqd->peak_rate * USEC_PER_MSEC *
-+ jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
-+}
-+
-+/*
-+ * Update parameters related to throughput and responsiveness, as a
-+ * function of the estimated peak rate. See comments on
-+ * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
-+ */
-+static void update_thr_responsiveness_params(struct bfq_data *bfqd)
-+{
-+ int dev_type = blk_queue_nonrot(bfqd->queue);
-+
-+ if (bfqd->bfq_user_max_budget == 0) {
-+ bfqd->bfq_max_budget =
-+ bfq_calc_max_budget(bfqd);
-+ BUG_ON(bfqd->bfq_max_budget < 0);
-+ bfq_log(bfqd, "new max_budget = %d",
-+ bfqd->bfq_max_budget);
-+ }
-+
-+ if (bfqd->device_speed == BFQ_BFQD_FAST &&
-+ bfqd->peak_rate < device_speed_thresh[dev_type]) {
-+ bfqd->device_speed = BFQ_BFQD_SLOW;
-+ bfqd->RT_prod = R_slow[dev_type] *
-+ T_slow[dev_type];
-+ } else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
-+ bfqd->peak_rate > device_speed_thresh[dev_type]) {
-+ bfqd->device_speed = BFQ_BFQD_FAST;
-+ bfqd->RT_prod = R_fast[dev_type] *
-+ T_fast[dev_type];
-+ }
-+
-+ bfq_log(bfqd,
-+"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu sects/sec",
-+ dev_type == 0 ? "ROT" : "NONROT",
-+ bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
-+ bfqd->device_speed == BFQ_BFQD_FAST ?
-+ (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
-+ (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
-+ (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
-+ BFQ_RATE_SHIFT);
-+}
-+
-+static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq)
-+{
-+ if (rq != NULL) { /* new rq dispatch now, reset accordingly */
-+ bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
-+ bfqd->peak_rate_samples = 1;
-+ bfqd->sequential_samples = 0;
-+ bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
-+ blk_rq_sectors(rq);
-+ } else /* no new rq dispatched, just reset the number of samples */
-+ bfqd->peak_rate_samples = 0; /* full re-init on next disp. */
-+
-+ bfq_log(bfqd,
-+ "reset_rate_computation at end, sample %u/%u tot_sects %llu",
-+ bfqd->peak_rate_samples, bfqd->sequential_samples,
-+ bfqd->tot_sectors_dispatched);
-+}
-+
-+static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
-+{
-+ u32 rate, weight, divisor;
-+
-+ /*
-+ * For the convergence property to hold (see comments on
-+ * bfq_update_peak_rate()) and for the assessment to be
-+ * reliable, a minimum number of samples must be present, and
-+ * a minimum amount of time must have elapsed. If not so, do
-+ * not compute new rate. Just reset parameters, to get ready
-+ * for a new evaluation attempt.
-+ */
-+ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
-+ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) {
-+ bfq_log(bfqd,
-+ "update_rate_reset: only resetting, delta_first %lluus samples %d",
-+ bfqd->delta_from_first>>10, bfqd->peak_rate_samples);
-+ goto reset_computation;
-+ }
-+
-+ /*
-+ * If a new request completion has occurred after last
-+ * dispatch, then, to approximate the rate at which requests
-+ * have been served by the device, it is more precise to
-+ * extend the observation interval to the last completion.
-+ */
-+ bfqd->delta_from_first =
-+ max_t(u64, bfqd->delta_from_first,
-+ bfqd->last_completion - bfqd->first_dispatch);
-+
-+ BUG_ON(bfqd->delta_from_first == 0);
-+ /*
-+ * Rate computed in sects/usec, and not sects/nsec, for
-+ * precision issues.
-+ */
-+ rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
-+ div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
-+
-+ bfq_log(bfqd,
-+"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)",
-+ bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10,
-+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
-+ rate > 20<<BFQ_RATE_SHIFT);
-+
-+ /*
-+ * Peak rate not updated if:
-+ * - the percentage of sequential dispatches is below 3/4 of the
-+ * total, and rate is below the current estimated peak rate
-+ * - rate is unreasonably high (> 20M sectors/sec)
-+ */
-+ if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
-+ rate <= bfqd->peak_rate) ||
-+ rate > 20<<BFQ_RATE_SHIFT) {
-+ bfq_log(bfqd,
-+ "update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu",
-+ bfqd->peak_rate_samples, bfqd->sequential_samples,
-+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
-+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
-+ goto reset_computation;
-+ } else {
-+ bfq_log(bfqd,
-+ "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu",
-+ bfqd->peak_rate_samples, bfqd->sequential_samples,
-+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
-+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
-+ }
-+
-+ /*
-+ * We have to update the peak rate, at last! To this purpose,
-+ * we use a low-pass filter. We compute the smoothing constant
-+ * of the filter as a function of the 'weight' of the new
-+ * measured rate.
-+ *
-+ * As can be seen in next formulas, we define this weight as a
-+ * quantity proportional to how sequential the workload is,
-+ * and to how long the observation time interval is.
-+ *
-+ * The weight runs from 0 to 8. The maximum value of the
-+ * weight, 8, yields the minimum value for the smoothing
-+ * constant. At this minimum value for the smoothing constant,
-+ * the measured rate contributes for half of the next value of
-+ * the estimated peak rate.
-+ *
-+ * So, the first step is to compute the weight as a function
-+ * of how sequential the workload is. Note that the weight
-+ * cannot reach 9, because bfqd->sequential_samples cannot
-+ * become equal to bfqd->peak_rate_samples, which, in its
-+ * turn, holds true because bfqd->sequential_samples is not
-+ * incremented for the first sample.
-+ */
-+ weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
-+
-+ /*
-+ * Second step: further refine the weight as a function of the
-+ * duration of the observation interval.
-+ */
-+ weight = min_t(u32, 8,
-+ div_u64(weight * bfqd->delta_from_first,
-+ BFQ_RATE_REF_INTERVAL));
-+
-+ /*
-+ * Divisor ranging from 10, for minimum weight, to 2, for
-+ * maximum weight.
-+ */
-+ divisor = 10 - weight;
-+ BUG_ON(divisor == 0);
-+
-+ /*
-+ * Finally, update peak rate:
-+ *
-+ * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor
-+ */
-+ bfqd->peak_rate *= divisor-1;
-+ bfqd->peak_rate /= divisor;
-+ rate /= divisor; /* smoothing constant alpha = 1/divisor */
-+
-+ bfq_log(bfqd,
-+ "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u",
-+ divisor,
-+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT),
-+ (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT));
-+
-+ BUG_ON(bfqd->peak_rate == 0);
-+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
-+
-+ bfqd->peak_rate += rate;
-+ update_thr_responsiveness_params(bfqd);
-+ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
-+
-+reset_computation:
-+ bfq_reset_rate_computation(bfqd, rq);
-+}
-+
-+/*
-+ * Update the read/write peak rate (the main quantity used for
-+ * auto-tuning, see update_thr_responsiveness_params()).
-+ *
-+ * Rate samples are taken over observation intervals made of
-+ * consecutive dispatches; an interval is closed, and the peak rate
-+ * possibly updated, when the device turns out to have been idle or
-+ * the interval has lasted long enough (see the conditions below).
-+ */
-+static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
-+{
-+ u64 now_ns = ktime_get_ns();
-+
-+ if (bfqd->peak_rate_samples == 0) { /* first dispatch */
-+ bfq_log(bfqd,
-+ "update_peak_rate: goto reset, samples %d",
-+ bfqd->peak_rate_samples);
-+ bfq_reset_rate_computation(bfqd, rq);
-+ goto update_last_values; /* will add one sample */
-+ }
-+
-+ /*
-+ * Device idle for very long: the observation interval lasting
-+ * up to this dispatch cannot be a valid observation interval
-+ * for computing a new peak rate (similarly to the late-
-+ * completion event in bfq_completed_request()). Go to
-+ * update_rate_and_reset to have the following three steps
-+ * taken:
-+ * - close the observation interval at the last (previous)
-+ * request dispatch or completion
-+ * - compute rate, if possible, for that observation interval
-+ * - start a new observation interval with this dispatch
-+ */
-+ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
-+ bfqd->rq_in_driver == 0) {
-+ bfq_log(bfqd,
-+"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d",
-+ (now_ns - bfqd->last_dispatch)>>10,
-+ bfqd->peak_rate_samples);
-+ goto update_rate_and_reset;
-+ }
-+
-+ /* Update sampling information */
-+ bfqd->peak_rate_samples++;
-+
-+ if ((bfqd->rq_in_driver > 0 ||
-+ now_ns - bfqd->last_completion < BFQ_MIN_TT)
-+ && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
-+ bfqd->sequential_samples++;
-+
-+ bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
-+
-+ /* Reset max observed rq size every 32 dispatches */
-+ if (likely(bfqd->peak_rate_samples % 32))
-+ bfqd->last_rq_max_size =
-+ max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
-+ else
-+ bfqd->last_rq_max_size = blk_rq_sectors(rq);
-+
-+ bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
-+
-+ bfq_log(bfqd,
-+ "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus",
-+ bfqd->peak_rate_samples, bfqd->sequential_samples,
-+ bfqd->tot_sectors_dispatched,
-+ bfqd->delta_from_first>>10);
-+
-+ /* Target observation interval not yet reached, go on sampling */
-+ if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
-+ goto update_last_values;
-+
-+update_rate_and_reset:
-+ bfq_update_rate_reset(bfqd, rq);
-+update_last_values:
-+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
-+ bfqd->last_dispatch = now_ns;
-+
-+ bfq_log(bfqd,
-+ "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu",
-+ (now_ns - bfqd->first_dispatch)>>10,
-+ (unsigned long long) bfqd->last_position,
-+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
-+ bfq_log(bfqd,
-+ "update_peak_rate: samples at end %d", bfqd->peak_rate_samples);
-+}
-+
-+/*
-+ * Move request from internal lists to the dispatch list of the request queue
-+ */
-+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
-+{
-+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
-+
-+ /*
-+ * For consistency, the next instruction should have been executed
-+ * after removing the request from the queue and dispatching it.
-+ * We execute instead this instruction before bfq_remove_request()
-+ * (and hence introduce a temporary inconsistency), for efficiency.
-+ * In fact, in a forced_dispatch, this prevents the two counters
-+ * related to bfqq->dispatched from being uselessly decremented if
-+ * bfqq is not in service, and then incremented again after
-+ * incrementing bfqq->dispatched.
-+ */
-+ bfqq->dispatched++;
-+ bfq_update_peak_rate(q->elevator->elevator_data, rq);
-+
-+ bfq_remove_request(rq);
-+ elv_dispatch_sort(q, rq);
-+}
-+
-+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-+{
-+ BUG_ON(bfqq != bfqd->in_service_queue);
-+
-+ /*
-+ * If this bfqq is shared between multiple processes, check
-+ * to make sure that those processes are still issuing I/Os
-+ * within the mean seek distance. If not, it may be time to
-+ * break the queues apart again.
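
The low-pass filter in bfq_update_rate_reset() is the heart of the autotuning above: divisor = 10 - weight, so a sample taken over a long, mostly sequential interval can contribute up to half of the new estimate, while a short or seeky one contributes only a tenth. A user-space rendition of one filtering step (the names and the plain rate units are simplifications; BFQ keeps rates in shifted sectors/usec):

#include <stdint.h>
#include <stdio.h>

/*
 * One filtering step: the divisor runs from 10 (unreliable sample)
 * down to 2 (long, fully sequential interval), so a new sample
 * contributes between one tenth and one half of the next estimate.
 */
static uint64_t filter_peak_rate(uint64_t peak, uint64_t sample,
				 unsigned int seq_samples,
				 unsigned int tot_samples,
				 uint64_t interval_us,
				 uint64_t ref_interval_us)
{
	uint64_t weight, divisor;

	/* weight in 0..8, proportional to how sequential the I/O is... */
	weight = (9ULL * seq_samples) / tot_samples;
	/* ...then scaled down for short observation intervals */
	weight = weight * interval_us / ref_interval_us;
	if (weight > 8)
		weight = 8;

	divisor = 10 - weight;
	/* peak = peak * (divisor-1)/divisor + sample/divisor */
	return peak * (divisor - 1) / divisor + sample / divisor;
}

int main(void)
{
	uint64_t peak = 1000;	/* arbitrary rate units */

	/* long, 99% sequential interval at rate 2000: strong pull */
	peak = filter_peak_rate(peak, 2000, 99, 100, 300000, 300000);
	printf("after sequential interval: %llu\n",
	       (unsigned long long)peak);	/* 1500 */

	/* short, seeky interval at rate 4000: only a 1/10 nudge */
	peak = filter_peak_rate(peak, 4000, 10, 100, 30000, 300000);
	printf("after seeky interval:      %llu\n",
	       (unsigned long long)peak);	/* 1750 */
	return 0;
}

The design choice mirrors the comment above: an unreliable sample should never be able to drag the estimate far, but a trustworthy one must converge quickly, or max_budget would lag behind the device for a long time after start-up.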
-+ */
-+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
-+ bfq_mark_bfqq_split_coop(bfqq);
-+
-+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
-+ if (bfqq->dispatched == 0)
-+ /*
-+ * Overloading budget_timeout field to store
-+ * the time at which the queue remains with no
-+ * backlog and no outstanding request; used by
-+ * the weight-raising mechanism.
-+ */
-+ bfqq->budget_timeout = jiffies;
-+
-+ bfq_del_bfqq_busy(bfqd, bfqq, true);
-+ } else {
-+ bfq_requeue_bfqq(bfqd, bfqq);
-+ /*
-+ * Resort priority tree of potential close cooperators.
-+ */
-+ bfq_pos_tree_add_move(bfqd, bfqq);
-+ }
-+
-+ /*
-+ * All in-service entities must have been properly deactivated
-+ * or requeued before executing the next function, which
-+ * resets all in-service entities as no more in service.
-+ */
-+ __bfq_bfqd_reset_in_service(bfqd);
-+}
-+
-+/**
-+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
-+ * @bfqd: device data.
-+ * @bfqq: queue to update.
-+ * @reason: reason for expiration.
-+ *
-+ * Handle the feedback on @bfqq budget at queue expiration.
-+ * See the body for detailed comments.
-+ */
-+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
-+ struct bfq_queue *bfqq,
-+ enum bfqq_expiration reason)
-+{
-+ struct request *next_rq;
-+ int budget, min_budget;
-+
-+ BUG_ON(bfqq != bfqd->in_service_queue);
-+
-+ min_budget = bfq_min_budget(bfqd);
-+
-+ if (bfqq->wr_coeff == 1)
-+ budget = bfqq->max_budget;
-+ else /*
-+ * Use a constant, low budget for weight-raised queues,
-+ * to help achieve a low latency. Keep it slightly higher
-+ * than the minimum possible budget, to cause a little
-+ * bit fewer expirations.
-+ */
-+ budget = 2 * min_budget;
-+
-+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
-+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
-+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
-+ budget, bfq_min_budget(bfqd));
-+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
-+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
-+
-+ if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {
-+ switch (reason) {
-+ /*
-+ * Caveat: in all the following cases we trade latency
-+ * for throughput.
-+ */
-+ case BFQ_BFQQ_TOO_IDLE:
-+ /*
-+ * This is the only case where we may reduce
-+ * the budget: if there is no request of the
-+ * process still waiting for completion, then
-+ * we assume (tentatively) that the timer has
-+ * expired because the batch of requests of
-+ * the process could have been served with a
-+ * smaller budget. Hence, betting that the
-+ * process will behave in the same way when it
-+ * becomes backlogged again, we reduce its
-+ * next budget. As long as we guess right,
-+ * this budget cut reduces the latency
-+ * experienced by the process.
-+ *
-+ * However, if there are still outstanding
-+ * requests, then the process may have not yet
-+ * issued its next request just because it is
-+ * still waiting for the completion of some of
-+ * the still outstanding ones. So in this
-+ * subcase we do not reduce its budget, on the
-+ * contrary we increase it to possibly boost
-+ * the throughput, as discussed in the
-+ * comments to the BUDGET_TIMEOUT case.
-+ */ -+ if (bfqq->dispatched > 0) /* still outstanding reqs */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ else { -+ if (budget > 5 * min_budget) -+ budget -= 4 * min_budget; -+ else -+ budget = min_budget; -+ } -+ break; -+ case BFQ_BFQQ_BUDGET_TIMEOUT: -+ /* -+ * We double the budget here because it gives -+ * the chance to boost the throughput if this -+ * is not a seeky process (and has bumped into -+ * this timeout because of, e.g., ZBR). -+ */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_BUDGET_EXHAUSTED: -+ /* -+ * The process still has backlog, and did not -+ * let either the budget timeout or the disk -+ * idling timeout expire. Hence it is not -+ * seeky, has a short thinktime and may be -+ * happy with a higher budget too. So -+ * definitely increase the budget of this good -+ * candidate to boost the disk throughput. -+ */ -+ budget = min(budget * 4, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_NO_MORE_REQUESTS: -+ /* -+ * For queues that expire for this reason, it -+ * is particularly important to keep the -+ * budget close to the actual service they -+ * need. Doing so reduces the timestamp -+ * misalignment problem described in the -+ * comments in the body of -+ * __bfq_activate_entity. In fact, suppose -+ * that a queue systematically expires for -+ * BFQ_BFQQ_NO_MORE_REQUESTS and presents a -+ * new request in time to enjoy timestamp -+ * back-shifting. The larger the budget of the -+ * queue is with respect to the service the -+ * queue actually requests in each service -+ * slot, the more times the queue can be -+ * reactivated with the same virtual finish -+ * time. It follows that, even if this finish -+ * time is pushed to the system virtual time -+ * to reduce the consequent timestamp -+ * misalignment, the queue unjustly enjoys for -+ * many re-activations a lower finish time -+ * than all newly activated queues. -+ * -+ * The service needed by bfqq is measured -+ * quite precisely by bfqq->entity.service. -+ * Since bfqq does not enjoy device idling, -+ * bfqq->entity.service is equal to the number -+ * of sectors that the process associated with -+ * bfqq requested to read/write before waiting -+ * for request completions, or blocking for -+ * other reasons. -+ */ -+ budget = max_t(int, bfqq->entity.service, min_budget); -+ break; -+ default: -+ return; -+ } -+ } else if (!bfq_bfqq_sync(bfqq)) -+ /* -+ * Async queues get always the maximum possible -+ * budget, as for them we do not care about latency -+ * (in addition, their ability to dispatch is limited -+ * by the charging factor). -+ */ -+ budget = bfqd->bfq_max_budget; -+ -+ bfqq->max_budget = budget; -+ -+ if (bfqd->budgets_assigned >= bfq_stats_min_budgets && -+ !bfqd->bfq_user_max_budget) -+ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); -+ -+ /* -+ * If there is still backlog, then assign a new budget, making -+ * sure that it is large enough for the next request. Since -+ * the finish time of bfqq must be kept in sync with the -+ * budget, be sure to call __bfq_bfqq_expire() *after* this -+ * update. -+ * -+ * If there is no backlog, then no need to update the budget; -+ * it will be updated on the arrival of a new request. 
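
The switch above is a per-reason multiplicative feedback loop on the budget. The sketch below condenses it; the caps and the min/max arguments are illustrative, and the non-sync and weight-raised special cases are left out:

#include <stdio.h>

enum reason { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED,
	      NO_MORE_REQUESTS };

static long next_budget(enum reason why, long budget, long service,
			long outstanding, long min_b, long max_b)
{
	switch (why) {
	case TOO_IDLE:
		if (outstanding > 0)	/* completions still pending */
			budget *= 2;
		else if (budget > 5 * min_b)	/* bet the batch was small */
			budget -= 4 * min_b;
		else
			budget = min_b;
		break;
	case BUDGET_TIMEOUT:	/* maybe just a slow disk zone (ZBR) */
		budget *= 2;
		break;
	case BUDGET_EXHAUSTED:	/* greedy and quick: reward it */
		budget *= 4;
		break;
	case NO_MORE_REQUESTS:	/* track the service actually needed */
		budget = service > min_b ? service : min_b;
		break;
	}
	return budget < max_b ? budget : max_b;
}

int main(void)
{
	printf("%ld\n", next_budget(BUDGET_EXHAUSTED, 2048, 2048, 0,
				    256, 16384));	/* 8192 */
	printf("%ld\n", next_budget(TOO_IDLE, 2048, 300, 0,
				    256, 16384));	/* 1024 */
	return 0;
}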
-+ */ -+ next_rq = bfqq->next_rq; -+ if (next_rq) { -+ BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || -+ reason == BFQ_BFQQ_NO_MORE_REQUESTS); -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ BUG_ON(!bfq_bfqq_busy(bfqq)); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", -+ next_rq ? blk_rq_sectors(next_rq) : 0, -+ bfqq->entity.budget); -+} -+ -+/* -+ * Return true if the process associated with bfqq is "slow". The slow -+ * flag is used, in addition to the budget timeout, to reduce the -+ * amount of service provided to seeky processes, and thus reduce -+ * their chances to lower the throughput. More details in the comments -+ * on the function bfq_bfqq_expire(). -+ * -+ * An important observation is in order: as discussed in the comments -+ * on the function bfq_update_peak_rate(), with devices with internal -+ * queues, it is hard if ever possible to know when and for how long -+ * an I/O request is processed by the device (apart from the trivial -+ * I/O pattern where a new request is dispatched only after the -+ * previous one has been completed). This makes it hard to evaluate -+ * the real rate at which the I/O requests of each bfq_queue are -+ * served. In fact, for an I/O scheduler like BFQ, serving a -+ * bfq_queue means just dispatching its requests during its service -+ * slot (i.e., until the budget of the queue is exhausted, or the -+ * queue remains idle, or, finally, a timeout fires). But, during the -+ * service slot of a bfq_queue, around 100 ms at most, the device may -+ * be even still processing requests of bfq_queues served in previous -+ * service slots. On the opposite end, the requests of the in-service -+ * bfq_queue may be completed after the service slot of the queue -+ * finishes. -+ * -+ * Anyway, unless more sophisticated solutions are used -+ * (where possible), the sum of the sizes of the requests dispatched -+ * during the service slot of a bfq_queue is probably the only -+ * approximation available for the service received by the bfq_queue -+ * during its service slot. And this sum is the quantity used in this -+ * function to evaluate the I/O speed of a process. -+ */ -+static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool compensate, enum bfqq_expiration reason, -+ unsigned long *delta_ms) -+{ -+ ktime_t delta_ktime; -+ u32 delta_usecs; -+ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ -+ -+ if (!bfq_bfqq_sync(bfqq)) -+ return false; -+ -+ if (compensate) -+ delta_ktime = bfqd->last_idling_start; -+ else -+ delta_ktime = ktime_get(); -+ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); -+ delta_usecs = ktime_to_us(delta_ktime); -+ -+ /* don't use too short time intervals */ -+ if (delta_usecs < 1000) { -+ if (blk_queue_nonrot(bfqd->queue)) -+ /* -+ * give same worst-case guarantees as idling -+ * for seeky -+ */ -+ *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; -+ else /* charge at least one seek */ -+ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; -+ -+ bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs); -+ -+ return slow; -+ } -+ -+ *delta_ms = delta_usecs / USEC_PER_MSEC; -+ -+ /* -+ * Use only long (> 20ms) intervals to filter out excessive -+ * spikes in service rate estimation. -+ */ -+ if (delta_usecs > 20000) { -+ /* -+ * Caveat for rotational devices: processes doing I/O -+ * in the slower disk zones tend to be slow(er) even -+ * if not seeky. 
In this respect, the estimated peak -+ * rate is likely to be an average over the disk -+ * surface. Accordingly, to not be too harsh with -+ * unlucky processes, a process is deemed slow only if -+ * its rate has been lower than half of the estimated -+ * peak rate. -+ */ -+ slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; -+ bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", -+ bfqq->entity.service, bfqd->bfq_max_budget); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); -+ -+ return slow; -+} -+ -+/* -+ * To be deemed as soft real-time, an application must meet two -+ * requirements. First, the application must not require an average -+ * bandwidth higher than the approximate bandwidth required to playback or -+ * record a compressed high-definition video. -+ * The next function is invoked on the completion of the last request of a -+ * batch, to compute the next-start time instant, soft_rt_next_start, such -+ * that, if the next request of the application does not arrive before -+ * soft_rt_next_start, then the above requirement on the bandwidth is met. -+ * -+ * The second requirement is that the request pattern of the application is -+ * isochronous, i.e., that, after issuing a request or a batch of requests, -+ * the application stops issuing new requests until all its pending requests -+ * have been completed. After that, the application may issue a new batch, -+ * and so on. -+ * For this reason the next function is invoked to compute -+ * soft_rt_next_start only for applications that meet this requirement, -+ * whereas soft_rt_next_start is set to infinity for applications that do -+ * not. -+ * -+ * Unfortunately, even a greedy application may happen to behave in an -+ * isochronous way if the CPU load is high. In fact, the application may -+ * stop issuing requests while the CPUs are busy serving other processes, -+ * then restart, then stop again for a while, and so on. In addition, if -+ * the disk achieves a low enough throughput with the request pattern -+ * issued by the application (e.g., because the request pattern is random -+ * and/or the device is slow), then the application may meet the above -+ * bandwidth requirement too. To prevent such a greedy application to be -+ * deemed as soft real-time, a further rule is used in the computation of -+ * soft_rt_next_start: soft_rt_next_start must be higher than the current -+ * time plus the maximum time for which the arrival of a request is waited -+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. -+ * This filters out greedy applications, as the latter issue instead their -+ * next request as soon as possible after the last one has been completed -+ * (in contrast, when a batch of requests is completed, a soft real-time -+ * application spends some time processing data). -+ * -+ * Unfortunately, the last filter may easily generate false positives if -+ * only bfqd->bfq_slice_idle is used as a reference time interval and one -+ * or both the following cases occur: -+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher -+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with -+ * HZ=100. -+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing -+ * for a while, then suddenly 'jump' by several units to recover the lost -+ * increments. This seems to happen, e.g., inside virtual machines. 
-+ * To address this issue, we do not use as a reference time interval just -+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In -+ * particular we add the minimum number of jiffies for which the filter -+ * seems to be quite precise also in embedded systems and KVM/QEMU virtual -+ * machines. -+ */ -+static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, -+"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", -+ bfqq->service_from_backlogged, -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies_to_msecs(HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate)); -+ -+ return max(bfqq->last_idle_bklogged + -+ HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); -+} -+ -+/* -+ * Return the farthest future time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_greatest_from_now(void) -+{ -+ return jiffies + MAX_JIFFY_OFFSET; -+} -+ -+/* -+ * Return the farthest past time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_smallest_from_now(void) -+{ -+ return jiffies - MAX_JIFFY_OFFSET; -+} -+ -+/** -+ * bfq_bfqq_expire - expire a queue. -+ * @bfqd: device owning the queue. -+ * @bfqq: the queue to expire. -+ * @compensate: if true, compensate for the time spent idling. -+ * @reason: the reason causing the expiration. -+ * -+ * If the process associated with bfqq does slow I/O (e.g., because it -+ * issues random requests), we charge bfqq with the time it has been -+ * in service instead of the service it has received (see -+ * bfq_bfqq_charge_time for details on how this goal is achieved). As -+ * a consequence, bfqq will typically get higher timestamps upon -+ * reactivation, and hence it will be rescheduled as if it had -+ * received more service than what it has actually received. In the -+ * end, bfqq receives less service in proportion to how slowly its -+ * associated process consumes its budgets (and hence how seriously it -+ * tends to lower the throughput). In addition, this time-charging -+ * strategy guarantees time fairness among slow processes. In -+ * contrast, if the process associated with bfqq is not slow, we -+ * charge bfqq exactly with the service it has received. -+ * -+ * Charging time to the first type of queues and the exact service to -+ * the other has the effect of using the WF2Q+ policy to schedule the -+ * former on a timeslice basis, without violating service domain -+ * guarantees among the latter. -+ */ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason) -+{ -+ bool slow; -+ unsigned long delta = 0; -+ struct bfq_entity *entity = &bfqq->entity; -+ int ref; -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ /* -+ * Check whether the process is slow (see bfq_bfqq_is_slow). -+ */ -+ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); -+ -+ /* -+ * Increase service_from_backlogged before next statement, -+ * because the possible next invocation of -+ * bfq_bfqq_charge_time would likely inflate -+ * entity->service. In contrast, service_from_backlogged must -+ * contain real service, to enable the soft real-time -+ * heuristic to correctly compute the bandwidth consumed by -+ * bfqq. 
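
bfq_bfqq_softrt_next_start() above reduces to the max of two instants: the tick at which the bandwidth consumed since the queue last became backlogged falls back under the soft-rt ceiling, and a greediness guard of one idle window plus a few jiffies. A stand-alone version with plain tick counters (HZ and the argument names are illustrative):

#include <stdio.h>

#define HZ 250UL	/* example tick rate, not a BFQ value */

static unsigned long softrt_next_start(unsigned long now,
				       unsigned long last_idle_backlogged,
				       unsigned long service_sectors,
				       unsigned long max_rate_sectps,
				       unsigned long idle_ticks)
{
	/*
	 * Earliest tick at which the bandwidth consumed since the
	 * queue last became backlogged drops under the soft-rt rate.
	 */
	unsigned long bw_ok = last_idle_backlogged +
			      HZ * service_sectors / max_rate_sectps;
	/* greediness guard: one idle window plus a few ticks of slack */
	unsigned long not_greedy = now + idle_ticks + 4;

	return bw_ok > not_greedy ? bw_ok : not_greedy;
}

int main(void)
{
	/* 1400 sectors against a 700 sectors/s ceiling: 500 ticks */
	printf("%lu\n", softrt_next_start(1000, 1000, 1400, 700, 2));
	/* prints 1500: the bandwidth condition dominates here */
	return 0;
}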
-+ */ -+ bfqq->service_from_backlogged += entity->service; -+ -+ /* -+ * As above explained, charge slow (typically seeky) and -+ * timed-out queues with the time and not the service -+ * received, to favor sequential workloads. -+ * -+ * Processes doing I/O in the slower disk zones will tend to -+ * be slow(er) even if not seeky. Therefore, since the -+ * estimated peak rate is actually an average over the disk -+ * surface, these processes may timeout just for bad luck. To -+ * avoid punishing them, do not charge time to processes that -+ * succeeded in consuming at least 2/3 of their budget. This -+ * allows BFQ to preserve enough elasticity to still perform -+ * bandwidth, and not time, distribution with little unlucky -+ * or quasi-sequential processes. -+ */ -+ if (bfqq->wr_coeff == 1 && -+ (slow || -+ (reason == BFQ_BFQQ_BUDGET_TIMEOUT && -+ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) -+ bfq_bfqq_charge_time(bfqd, bfqq, delta); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ if (reason == BFQ_BFQQ_TOO_IDLE && -+ entity->service <= 2 * entity->budget / 10) -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ if (bfqd->low_latency && bfqq->wr_coeff == 1) -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && -+ RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ /* -+ * If we get here, and there are no outstanding -+ * requests, then the request pattern is isochronous -+ * (see the comments on the function -+ * bfq_bfqq_softrt_next_start()). Thus we can compute -+ * soft_rt_next_start. If, instead, the queue still -+ * has outstanding requests, then we have to wait for -+ * the completion of all the outstanding requests to -+ * discover whether the request pattern is actually -+ * isochronous. -+ */ -+ BUG_ON(bfqd->busy_queues < 1); -+ if (bfqq->dispatched == 0) { -+ bfqq->soft_rt_next_start = -+ bfq_bfqq_softrt_next_start(bfqd, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", -+ bfqq->soft_rt_next_start); -+ } else { -+ /* -+ * The application is still waiting for the -+ * completion of one or more requests: -+ * prevent it from possibly being incorrectly -+ * deemed as soft real-time by setting its -+ * soft_rt_next_start to infinity. In fact, -+ * without this assignment, the application -+ * would be incorrectly deemed as soft -+ * real-time if: -+ * 1) it issued a new request before the -+ * completion of all its in-flight -+ * requests, and -+ * 2) at that time, its soft_rt_next_start -+ * happened to be in the past. -+ */ -+ bfqq->soft_rt_next_start = -+ bfq_greatest_from_now(); -+ /* -+ * Schedule an update of soft_rt_next_start to when -+ * the task may be discovered to be isochronous. -+ */ -+ bfq_mark_bfqq_softrt_update(bfqq); -+ } -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)", -+ reason, slow, bfqq->dispatched, -+ bfq_bfqq_has_short_ttime(bfqq), entity->weight); -+ -+ /* -+ * Increase, decrease or leave budget unchanged according to -+ * reason. 
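The guard around the bfq_bfqq_charge_time() call above reduces to a small decision function. A standalone restatement, with names and types assumed for illustration:

	#include <stdbool.h>

	/* Charge in the time domain instead of the service domain:
	 * weight-raised queues never do; slow queues always do; queues
	 * that hit the budget timeout do so only if they consumed less
	 * than 2/3 of their budget (i.e. at least 1/3 is left). */
	static bool charge_in_time_domain(int wr_coeff, bool slow,
					  bool budget_timeout,
					  long budget_left, long budget)
	{
		if (wr_coeff != 1)
			return false;
		return slow || (budget_timeout && budget_left >= budget / 3);
	}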
-+ */ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); -+ BUG_ON(bfqq->next_rq == NULL && -+ bfqq->entity.budget < bfqq->entity.service); -+ ref = bfqq->ref; -+ __bfq_bfqq_expire(bfqd, bfqq); -+ -+ BUG_ON(ref > 1 && -+ !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && -+ !bfq_class_idle(bfqq)); -+ -+ /* mark bfqq as waiting a request only if a bic still points to it */ -+ if (ref > 1 && !bfq_bfqq_busy(bfqq) && -+ reason != BFQ_BFQQ_BUDGET_TIMEOUT && -+ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) -+ bfq_mark_bfqq_non_blocking_wait_rq(bfqq); -+} -+ -+/* -+ * Budget timeout is not implemented through a dedicated timer, but -+ * just checked on request arrivals and completions, as well as on -+ * idle timer expirations. -+ */ -+static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) -+{ -+ return time_is_before_eq_jiffies(bfqq->budget_timeout); -+} -+ -+/* -+ * If we expire a queue that is actively waiting (i.e., with the -+ * device idled) for the arrival of a new request, then we may incur -+ * the timestamp misalignment problem described in the body of the -+ * function __bfq_activate_entity. Hence we return true only if this -+ * condition does not hold, or if the queue is slow enough to deserve -+ * only to be kicked off for preserving a high throughput. -+ */ -+static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "may_budget_timeout: wait_request %d left %d timeout %d", -+ bfq_bfqq_wait_request(bfqq), -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, -+ bfq_bfqq_budget_timeout(bfqq)); -+ -+ return (!bfq_bfqq_wait_request(bfqq) || -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) -+ && -+ bfq_bfqq_budget_timeout(bfqq); -+} -+ -+/* -+ * For a queue that becomes empty, device idling is allowed only if -+ * this function returns true for that queue. As a consequence, since -+ * device idling plays a critical role for both throughput boosting -+ * and service guarantees, the return value of this function plays a -+ * critical role as well. -+ * -+ * In a nutshell, this function returns true only if idling is -+ * beneficial for throughput or, even if detrimental for throughput, -+ * idling is however necessary to preserve service guarantees (low -+ * latency, desired throughput distribution, ...). In particular, on -+ * NCQ-capable devices, this function tries to return false, so as to -+ * help keep the drives' internal queues full, whenever this helps the -+ * device boost the throughput without causing any service-guarantee -+ * issue. -+ * -+ * In more detail, the return value of this function is obtained by, -+ * first, computing a number of boolean variables that take into -+ * account throughput and service-guarantee issues, and, then, -+ * combining these variables in a logical expression. Most of the -+ * issues taken into account are not trivial. We discuss these issues -+ * while introducing the variables. -+ */ -+static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) -+{ -+ struct bfq_data *bfqd = bfqq->bfqd; -+ bool rot_without_queueing = -+ !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, -+ bfqq_sequential_and_IO_bound, -+ idling_boosts_thr, idling_boosts_thr_without_issues, -+ idling_needed_for_service_guarantees, -+ asymmetric_scenario; -+ -+ if (bfqd->strict_guarantees) -+ return true; -+ -+ /* -+ * Idling is performed only if slice_idle > 0. 
In addition, we
-+	 * do not idle if
-+	 * (a) bfqq is async
-+	 * (b) bfqq is in the idle io prio class: in this case we do
-+	 *     not idle because we want to minimize the bandwidth that
-+	 *     queues in this class can steal from higher-priority queues
-+	 */
-+	if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
-+	    bfq_class_idle(bfqq))
-+		return false;
-+
-+	bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) &&
-+		bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq);
-+	/*
-+	 * The next variable takes into account the cases where idling
-+	 * boosts the throughput.
-+	 *
-+	 * The value of the variable is computed considering, first, that
-+	 * idling is virtually always beneficial for the throughput if:
-+	 * (a) the device is not NCQ-capable and rotational, or
-+	 * (b) regardless of the presence of NCQ, the device is rotational and
-+	 *     the request pattern for bfqq is I/O-bound and sequential, or
-+	 * (c) regardless of whether it is rotational, the device is
-+	 *     not NCQ-capable and the request pattern for bfqq is
-+	 *     I/O-bound and sequential.
-+	 *
-+	 * Secondly, and in contrast to the above item (b), idling an
-+	 * NCQ-capable flash-based device would not boost the
-+	 * throughput even with sequential I/O; rather it would lower
-+	 * the throughput in proportion to how fast the device
-+	 * is. Accordingly, the next variable is true if any of the
-+	 * above conditions (a), (b) or (c) is true, and, in
-+	 * particular, happens to be false if bfqd is an NCQ-capable
-+	 * flash-based device.
-+	 */
-+	idling_boosts_thr = rot_without_queueing ||
-+		((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) &&
-+		 bfqq_sequential_and_IO_bound);
-+
-+	/*
-+	 * The value of the next variable,
-+	 * idling_boosts_thr_without_issues, is equal to that of
-+	 * idling_boosts_thr, unless a special case holds. In this
-+	 * special case, described below, idling may cause problems for
-+	 * weight-raised queues.
-+	 *
-+	 * When the request pool is saturated (e.g., in the presence
-+	 * of write hogs), if the processes associated with
-+	 * non-weight-raised queues ask for requests at a lower rate,
-+	 * then processes associated with weight-raised queues are more
-+	 * likely to get a request from the pool
-+	 * immediately (or at least soon) when they need one. Thus
-+	 * they are also more likely to actually get a fraction
-+	 * of the device throughput proportional to their high
-+	 * weight. This is especially true with NCQ-capable drives,
-+	 * which enqueue several requests in advance, and further
-+	 * reorder internally-queued requests.
-+	 *
-+	 * For this reason, we force the value of
-+	 * idling_boosts_thr_without_issues to false if there are
-+	 * weight-raised busy queues. In this case, and if bfqq is not
-+	 * weight-raised, this guarantees that the device is not idled
-+	 * for bfqq (if, instead, bfqq is weight-raised, then idling
-+	 * will be guaranteed by another variable, see below). Combined
-+	 * with the timestamping rules of BFQ (see [1] for details),
-+	 * this behavior causes bfqq, and hence any sync
-+	 * non-weight-raised queue, to get a lower number of requests
-+	 * served, and thus to ask for a lower number of requests from
-+	 * the request pool, before the busy weight-raised queues get
-+	 * served again. This often mitigates starvation problems in
-+	 * the presence of heavy write workloads and NCQ, thereby
-+	 * guaranteeing a higher application and system responsiveness
-+	 * in these hostile scenarios.
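Conditions (a)-(c) above collapse into a two-term boolean; a standalone sketch of the combination, with parameter names assumed:

	#include <stdbool.h>

	/* Idling boosts throughput on a rotational device without
	 * queueing, and on any device that is either non-NCQ or
	 * rotational when the queue's I/O is sequential and I/O-bound;
	 * the result is false for NCQ-capable flash. */
	static bool idling_boosts_throughput(bool nonrot, bool ncq,
					     bool seq_and_io_bound)
	{
		bool rot_without_queueing = !nonrot && !ncq;

		return rot_without_queueing ||
		       ((!nonrot || !ncq) && seq_and_io_bound);
	}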
-+ */ -+ idling_boosts_thr_without_issues = idling_boosts_thr && -+ bfqd->wr_busy_queues == 0; -+ -+ /* -+ * There is then a case where idling must be performed not -+ * for throughput concerns, but to preserve service -+ * guarantees. -+ * -+ * To introduce this case, we can note that allowing the drive -+ * to enqueue more than one request at a time, and hence -+ * delegating de facto final scheduling decisions to the -+ * drive's internal scheduler, entails loss of control on the -+ * actual request service order. In particular, the critical -+ * situation is when requests from different processes happen -+ * to be present, at the same time, in the internal queue(s) -+ * of the drive. In such a situation, the drive, by deciding -+ * the service order of the internally-queued requests, does -+ * determine also the actual throughput distribution among -+ * these processes. But the drive typically has no notion or -+ * concern about per-process throughput distribution, and -+ * makes its decisions only on a per-request basis. Therefore, -+ * the service distribution enforced by the drive's internal -+ * scheduler is likely to coincide with the desired -+ * device-throughput distribution only in a completely -+ * symmetric scenario where: -+ * (i) each of these processes must get the same throughput as -+ * the others; -+ * (ii) all these processes have the same I/O pattern -+ * (either sequential or random). -+ * In fact, in such a scenario, the drive will tend to treat -+ * the requests of each of these processes in about the same -+ * way as the requests of the others, and thus to provide -+ * each of these processes with about the same throughput -+ * (which is exactly the desired throughput distribution). In -+ * contrast, in any asymmetric scenario, device idling is -+ * certainly needed to guarantee that bfqq receives its -+ * assigned fraction of the device throughput (see [1] for -+ * details). -+ * -+ * We address this issue by controlling, actually, only the -+ * symmetry sub-condition (i), i.e., provided that -+ * sub-condition (i) holds, idling is not performed, -+ * regardless of whether sub-condition (ii) holds. In other -+ * words, only if sub-condition (i) holds, then idling is -+ * allowed, and the device tends to be prevented from queueing -+ * many requests, possibly of several processes. The reason -+ * for not controlling also sub-condition (ii) is that we -+ * exploit preemption to preserve guarantees in case of -+ * symmetric scenarios, even if (ii) does not hold, as -+ * explained in the next two paragraphs. -+ * -+ * Even if a queue, say Q, is expired when it remains idle, Q -+ * can still preempt the new in-service queue if the next -+ * request of Q arrives soon (see the comments on -+ * bfq_bfqq_update_budg_for_activation). If all queues and -+ * groups have the same weight, this form of preemption, -+ * combined with the hole-recovery heuristic described in the -+ * comments on function bfq_bfqq_update_budg_for_activation, -+ * are enough to preserve a correct bandwidth distribution in -+ * the mid term, even without idling. In fact, even if not -+ * idling allows the internal queues of the device to contain -+ * many requests, and thus to reorder requests, we can rather -+ * safely assume that the internal scheduler still preserves a -+ * minimum of mid-term fairness. The motivation for using -+ * preemption instead of idling is that, by not idling, -+ * service guarantees are preserved without minimally -+ * sacrificing throughput. 
In other words, both a high
-+	 * throughput and its desired distribution are obtained.
-+	 *
-+	 * More precisely, this preemption-based, idleless approach
-+	 * provides fairness in terms of IOPS, and not sectors per
-+	 * second. This can be seen with a simple example. Suppose
-+	 * that there are two queues with the same weight, but that
-+	 * the first queue receives requests of 8 sectors, while the
-+	 * second queue receives requests of 1024 sectors. In
-+	 * addition, suppose that each of the two queues contains at
-+	 * most one request at a time, which implies that each queue
-+	 * always remains idle after it is served. Finally, after
-+	 * remaining idle, each queue very quickly receives a new
-+	 * request. It follows that the two queues are served
-+	 * alternately, preempting each other if needed. This
-+	 * implies that, although both queues have the same weight,
-+	 * the queue with large requests receives a service that is
-+	 * 1024/8 times as high as the service received by the other
-+	 * queue.
-+	 *
-+	 * On the other hand, device idling is performed, and thus
-+	 * pure sector-domain guarantees are provided, for the
-+	 * following queues, which are likely to need stronger
-+	 * throughput guarantees: weight-raised queues, and queues
-+	 * with a higher weight than other queues. When such queues
-+	 * are active, sub-condition (i) is false, which triggers
-+	 * device idling.
-+	 *
-+	 * According to the above considerations, the next variable is
-+	 * true (only) if sub-condition (i) holds. To compute the
-+	 * value of this variable, we not only use the return value of
-+	 * the function bfq_symmetric_scenario(), but also check
-+	 * whether bfqq is being weight-raised, because
-+	 * bfq_symmetric_scenario() does not also take weight-raised
-+	 * queues into account (see comments on
-+	 * bfq_weights_tree_add()).
-+	 *
-+	 * As a side note, it is worth considering that the above
-+	 * device-idling countermeasures may however fail in the
-+	 * following unlucky scenario: if idling is (correctly)
-+	 * disabled in a time period during which all symmetry
-+	 * sub-conditions hold, and hence the device is allowed to
-+	 * enqueue many requests, but at some later point in time some
-+	 * sub-condition ceases to hold, then it may become impossible
-+	 * to let requests be served in the desired order until all
-+	 * the requests already queued in the device have been served.
-+	 */
-+	asymmetric_scenario = bfqq->wr_coeff > 1 ||
-+		!bfq_symmetric_scenario(bfqd);
-+
-+	/*
-+	 * Finally, there is a case where maximizing throughput is the
-+	 * best choice even if it may cause unfairness toward
-+	 * bfqq. Such a case is when bfqq became active in a burst of
-+	 * queue activations. Queues that became active during a large
-+	 * burst benefit only from throughput, as discussed in the
-+	 * comments on bfq_handle_burst. Thus, if bfqq became active
-+	 * in a burst and not idling the device maximizes throughput,
-+	 * then the device must not be idled, because not idling the
-+	 * device provides bfqq and all other queues in the burst with
-+	 * maximum benefit. Combining this and the above case, we can
-+	 * now establish when idling is actually needed to preserve
-+	 * service guarantees.
-+	 */
-+	idling_needed_for_service_guarantees =
-+		asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
-+
-+	/*
-+	 * We now have all the components we need to compute the
-+	 * return value of the function, which is true only if idling
-+	 * either boosts the throughput (without issues), or is
-+	 * necessary to preserve service guarantees.
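In the 8-sector vs 1024-sector example above, the large-request queue thus gets 1024/8 = 128 times the sector-domain service of the other queue despite equal weights. The return value assembled below combines the booleans discussed so far; a standalone restatement, with names assumed:

	#include <stdbool.h>

	/* Sketch of the decision bfq_bfqq_may_idle() converges to. */
	static bool may_idle(bool strict_guarantees,
			     bool idling_boosts_thr_without_issues,
			     bool asymmetric_scenario,	/* wr or asymmetry */
			     bool in_large_burst)
	{
		if (strict_guarantees)
			return true;

		return idling_boosts_thr_without_issues ||
		       (asymmetric_scenario && !in_large_burst);
	}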
-+ */ -+ bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", -+ bfq_bfqq_sync(bfqq), idling_boosts_thr); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", -+ bfqd->wr_busy_queues, -+ idling_boosts_thr_without_issues, -+ bfq_bfqq_IO_bound(bfqq), -+ idling_needed_for_service_guarantees); -+ -+ return idling_boosts_thr_without_issues || -+ idling_needed_for_service_guarantees; -+} -+ -+/* -+ * If the in-service queue is empty but the function bfq_bfqq_may_idle -+ * returns true, then: -+ * 1) the queue must remain in service and cannot be expired, and -+ * 2) the device must be idled to wait for the possible arrival of a new -+ * request for the queue. -+ * See the comments on the function bfq_bfqq_may_idle for the reasons -+ * why performing device idling is the best choice to boost the throughput -+ * and preserve service guarantees when bfq_bfqq_may_idle itself -+ * returns true. -+ */ -+static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) -+{ -+ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq); -+} -+ -+/* -+ * Select a queue for service. If we have a current queue in service, -+ * check whether to continue servicing it, or retrieve and set a new one. -+ */ -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ struct request *next_rq; -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ -+ bfqq = bfqd->in_service_queue; -+ if (!bfqq) -+ goto new_queue; -+ -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); -+ -+ if (bfq_may_expire_for_budg_timeout(bfqq) && -+ !hrtimer_active(&bfqd->idle_slice_timer) && -+ !bfq_bfqq_must_idle(bfqq)) -+ goto expire; -+ -+check_queue: -+ /* -+ * This loop is rarely executed more than once. Even when it -+ * happens, it is much more convenient to re-execute this loop -+ * than to return NULL and trigger a new dispatch to get a -+ * request served. -+ */ -+ next_rq = bfqq->next_rq; -+ /* -+ * If bfqq has requests queued and it has enough budget left to -+ * serve them, keep the queue, otherwise expire it. -+ */ -+ if (next_rq) { -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (bfq_serv_to_charge(next_rq, bfqq) > -+ bfq_bfqq_budget_left(bfqq)) { -+ /* -+ * Expire the queue for budget exhaustion, -+ * which makes sure that the next budget is -+ * enough to serve the next request, even if -+ * it comes from the fifo expired path. -+ */ -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; -+ goto expire; -+ } else { -+ /* -+ * The idle timer may be pending because we may -+ * not disable disk idling even when a new request -+ * arrives. -+ */ -+ if (bfq_bfqq_wait_request(bfqq)) { -+ BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); -+ /* -+ * If we get here: 1) at least a new request -+ * has arrived but we have not disabled the -+ * timer because the request was too small, -+ * 2) then the block layer has unplugged -+ * the device, causing the dispatch to be -+ * invoked. -+ * -+ * Since the device is unplugged, now the -+ * requests are probably large enough to -+ * provide a reasonable throughput. -+ * So we disable idling. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ } -+ goto keep_queue; -+ } -+ } -+ -+ /* -+ * No requests pending. However, if the in-service queue is idling -+ * for a new request, or has requests waiting for a completion and -+ * may idle after their completion, then keep it anyway. 
-+ */ -+ if (hrtimer_active(&bfqd->idle_slice_timer) || -+ (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { -+ bfqq = NULL; -+ goto keep_queue; -+ } -+ -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS; -+expire: -+ bfq_bfqq_expire(bfqd, bfqq, false, reason); -+new_queue: -+ bfqq = bfq_set_in_service_queue(bfqd); -+ if (bfqq) { -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); -+ goto check_queue; -+ } -+keep_queue: -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); -+ else -+ bfq_log(bfqd, "select_queue: no queue returned"); -+ -+ return bfqq; -+} -+ -+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ -+ BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != -+ entity->orig_weight * bfqq->wr_coeff); -+ if (entity->prio_changed) -+ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); -+ -+ /* -+ * If the queue was activated in a burst, or too much -+ * time has elapsed from the beginning of this -+ * weight-raising period, then end weight raising. -+ */ -+ if (bfq_bfqq_in_large_burst(bfqq)) -+ bfq_bfqq_end_wr(bfqq); -+ else if (time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time)) { -+ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || -+ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) -+ bfq_bfqq_end_wr(bfqq); -+ else { -+ /* switch back to interactive wr */ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ bfqq->last_wr_start_finish = -+ bfqq->wr_start_at_switch_to_srt; -+ BUG_ON(time_is_after_jiffies( -+ bfqq->last_wr_start_finish)); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "back to interactive wr"); -+ } -+ } -+ } -+ /* -+ * To improve latency (for this or other queues), immediately -+ * update weight both if it must be raised and if it must be -+ * lowered. Since, entity may be on some active tree here, and -+ * might have a pending change of its ioprio class, invoke -+ * next function with the last parameter unset (see the -+ * comments on the function). -+ */ -+ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) -+ __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity), -+ entity, false); -+} -+ -+/* -+ * Dispatch one request from bfqq, moving it to the request queue -+ * dispatch list. 
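The cascade of checks in bfq_update_wr_data() above yields one of three outcomes for a weight-raised queue. A standalone restatement, with enum and parameter names assumed and jiffies wrap-around ignored for brevity:

	enum wr_outcome { WR_KEEP, WR_END, WR_BACK_TO_INTERACTIVE };

	/* End weight-raising on large bursts; keep it while the current
	 * period lasts; when a soft-rt period expires, fall back to
	 * interactive weight-raising if that period is still running. */
	static enum wr_outcome update_wr(int in_large_burst,
					 unsigned long now,
					 unsigned long last_wr_start_finish,
					 unsigned long wr_cur_max_time,
					 int in_soft_rt_period,
					 unsigned long start_at_switch_to_srt,
					 unsigned long interactive_wr_dur)
	{
		if (in_large_burst)
			return WR_END;
		if (now - last_wr_start_finish <= wr_cur_max_time)
			return WR_KEEP;
		if (in_soft_rt_period &&
		    now - start_at_switch_to_srt < interactive_wr_dur)
			return WR_BACK_TO_INTERACTIVE;
		return WR_END;
	}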
-+ */ -+static int bfq_dispatch_request(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ int dispatched = 0; -+ struct request *rq = bfqq->next_rq; -+ unsigned long service_to_charge; -+ -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ BUG_ON(!rq); -+ service_to_charge = bfq_serv_to_charge(rq, bfqq); -+ -+ BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq)); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ bfq_bfqq_served(bfqq, service_to_charge); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ bfq_dispatch_insert(bfqd->queue, rq); -+ -+ /* -+ * If weight raising has to terminate for bfqq, then next -+ * function causes an immediate update of bfqq's weight, -+ * without waiting for next activation. As a consequence, on -+ * expiration, bfqq will be timestamped as if has never been -+ * weight-raised during this service slot, even if it has -+ * received part or even most of the service as a -+ * weight-raised queue. This inflates bfqq's timestamps, which -+ * is beneficial, as bfqq is then more willing to leave the -+ * device immediately to possible other weight-raised queues. -+ */ -+ bfq_update_wr_data(bfqd, bfqq); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "dispatched %u sec req (%llu), budg left %d", -+ blk_rq_sectors(rq), -+ (unsigned long long) blk_rq_pos(rq), -+ bfq_bfqq_budget_left(bfqq)); -+ -+ dispatched++; -+ -+ if (!bfqd->in_service_bic) { -+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); -+ bfqd->in_service_bic = RQ_BIC(rq); -+ } -+ -+ if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) -+ goto expire; -+ -+ return dispatched; -+ -+expire: -+ bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); -+ return dispatched; -+} -+ -+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) -+{ -+ int dispatched = 0; -+ -+ while (bfqq->next_rq) { -+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); -+ dispatched++; -+ } -+ -+ BUG_ON(!list_empty(&bfqq->fifo)); -+ return dispatched; -+} -+ -+/* -+ * Drain our current requests. -+ * Used for barriers and when switching io schedulers on-the-fly. -+ */ -+static int bfq_forced_dispatch(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq, *n; -+ struct bfq_service_tree *st; -+ int dispatched = 0; -+ -+ bfqq = bfqd->in_service_queue; -+ if (bfqq) -+ __bfq_bfqq_expire(bfqd, bfqq); -+ -+ /* -+ * Loop through classes, and be careful to leave the scheduler -+ * in a consistent state, as feedback mechanisms and vtime -+ * updates cannot be disabled during the process. -+ */ -+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { -+ st = bfq_entity_service_tree(&bfqq->entity); -+ -+ dispatched += __bfq_forced_dispatch_bfqq(bfqq); -+ -+ bfqq->max_budget = bfq_max_budget(bfqd); -+ bfq_forget_idle(st); -+ } -+ -+ BUG_ON(bfqd->busy_queues != 0); -+ -+ return dispatched; -+} -+ -+static int bfq_dispatch_requests(struct request_queue *q, int force) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq; -+ -+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); -+ -+ if (bfqd->busy_queues == 0) -+ return 0; -+ -+ if (unlikely(force)) -+ return bfq_forced_dispatch(bfqd); -+ -+ /* -+ * Force device to serve one request at a time if -+ * strict_guarantees is true. Forcing this service scheme is -+ * currently the ONLY way to guarantee that the request -+ * service order enforced by the scheduler is respected by a -+ * queueing device. Otherwise the device is free even to make -+ * some unlucky request wait for as long as the device -+ * wishes. 
-+	 *
-+	 * Of course, serving one request at a time may cause loss of
-+	 * throughput.
-+	 */
-+	if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
-+		return 0;
-+
-+	bfqq = bfq_select_queue(bfqd);
-+	if (!bfqq)
-+		return 0;
-+
-+	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
-+
-+	BUG_ON(bfq_bfqq_wait_request(bfqq));
-+
-+	if (!bfq_dispatch_request(bfqd, bfqq))
-+		return 0;
-+
-+	bfq_log_bfqq(bfqd, bfqq, "dispatched %s request",
-+			bfq_bfqq_sync(bfqq) ? "sync" : "async");
-+
-+	BUG_ON(bfqq->next_rq == NULL &&
-+	       bfqq->entity.budget < bfqq->entity.service);
-+	return 1;
-+}
-+
-+/*
-+ * Task holds one reference to the queue, dropped when task exits. Each rq
-+ * in-flight on this queue also holds a reference, dropped when rq is freed.
-+ *
-+ * Queue lock must be held here. Recall not to use bfqq after calling
-+ * this function on it.
-+ */
-+static void bfq_put_queue(struct bfq_queue *bfqq)
-+{
-+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED
-+	struct bfq_group *bfqg = bfqq_group(bfqq);
-+#endif
-+
-+	BUG_ON(bfqq->ref <= 0);
-+
-+	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref);
-+	bfqq->ref--;
-+	if (bfqq->ref)
-+		return;
-+
-+	BUG_ON(rb_first(&bfqq->sort_list));
-+	BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
-+	BUG_ON(bfqq->entity.tree);
-+	BUG_ON(bfq_bfqq_busy(bfqq));
-+
-+	if (bfq_bfqq_sync(bfqq))
-+		/*
-+		 * The fact that this queue is being destroyed does not
-+		 * invalidate the fact that this queue may have been
-+		 * activated during the current burst. As a consequence,
-+		 * although the queue does not exist anymore, and hence
-+		 * needs to be removed from the burst list if there,
-+		 * the burst size must not be decremented.
-+		 */
-+		hlist_del_init(&bfqq->burst_list_node);
-+
-+	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
-+
-+	kmem_cache_free(bfq_pool, bfqq);
-+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED
-+	bfqg_put(bfqg);
-+#endif
-+}
-+
-+static void bfq_put_cooperator(struct bfq_queue *bfqq)
-+{
-+	struct bfq_queue *__bfqq, *next;
-+
-+	/*
-+	 * If this queue was scheduled to merge with another queue, be
-+	 * sure to drop the reference taken on that queue (and others in
-+	 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
-+	 */
-+	__bfqq = bfqq->new_bfqq;
-+	while (__bfqq) {
-+		if (__bfqq == bfqq)
-+			break;
-+		next = __bfqq->new_bfqq;
-+		bfq_put_queue(__bfqq);
-+		__bfqq = next;
-+	}
-+}
-+
-+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-+{
-+	if (bfqq == bfqd->in_service_queue) {
-+		__bfq_bfqq_expire(bfqd, bfqq);
-+		bfq_schedule_dispatch(bfqd);
-+	}
-+
-+	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
-+
-+	bfq_put_cooperator(bfqq);
-+
-+	bfq_put_queue(bfqq); /* release process reference */
-+}
-+
-+static void bfq_init_icq(struct io_cq *icq)
-+{
-+	icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32);
-+}
-+
-+static void bfq_exit_icq(struct io_cq *icq)
-+{
-+	struct bfq_io_cq *bic = icq_to_bic(icq);
-+	struct bfq_data *bfqd = bic_to_bfqd(bic);
-+
-+	if (bic_to_bfqq(bic, false)) {
-+		bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false));
-+		bic_set_bfqq(bic, NULL, false);
-+	}
-+
-+	if (bic_to_bfqq(bic, true)) {
-+		/*
-+		 * If the bic is using a shared queue, put the reference
-+		 * taken on the io_context when the bic started using a
-+		 * shared bfq_queue.
-+ */ -+ if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) -+ put_io_context(icq->ioc); -+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); -+ bic_set_bfqq(bic, NULL, true); -+ } -+} -+ -+/* -+ * Update the entity prio values; note that the new values will not -+ * be used until the next (re)activation. -+ */ -+static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ struct task_struct *tsk = current; -+ int ioprio_class; -+ -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ switch (ioprio_class) { -+ default: -+ dev_err(bfqq->bfqd->queue->backing_dev_info->dev, -+ "bfq: bad prio class %d\n", ioprio_class); -+ case IOPRIO_CLASS_NONE: -+ /* -+ * No prio set, inherit CPU scheduling settings. -+ */ -+ bfqq->new_ioprio = task_nice_ioprio(tsk); -+ bfqq->new_ioprio_class = task_nice_ioclass(tsk); -+ break; -+ case IOPRIO_CLASS_RT: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_RT; -+ break; -+ case IOPRIO_CLASS_BE: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_BE; -+ break; -+ case IOPRIO_CLASS_IDLE: -+ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; -+ bfqq->new_ioprio = 7; -+ break; -+ } -+ -+ if (bfqq->new_ioprio >= IOPRIO_BE_NR) { -+ pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", -+ bfqq->new_ioprio); -+ BUG(); -+ } -+ -+ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "set_next_ioprio_data: bic_class %d prio %d class %d", -+ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); -+} -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) -+{ -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct bfq_queue *bfqq; -+ unsigned long uninitialized_var(flags); -+ int ioprio = bic->icq.ioc->ioprio; -+ -+ /* -+ * This condition may trigger on a newly created bic, be sure to -+ * drop the lock before returning. 
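The switch above maps the bic's ioprio class onto the queue's next prio values; restated as a standalone helper, with enum values and names assumed for the example:

	enum cls { CLS_NONE, CLS_RT, CLS_BE, CLS_IDLE };

	struct next_prio { int prio; enum cls cls; };

	/* Sketch of the class mapping in bfq_set_next_ioprio_data():
	 * RT and BE keep the bic's prio data, IDLE is pinned to 7, and
	 * an unset class inherits the CPU-scheduling (nice) values. */
	static struct next_prio map_ioprio(enum cls c, int data,
					   int nice_prio, enum cls nice_cls)
	{
		switch (c) {
		case CLS_RT:
			return (struct next_prio){ data, CLS_RT };
		case CLS_BE:
			return (struct next_prio){ data, CLS_BE };
		case CLS_IDLE:
			return (struct next_prio){ 7, CLS_IDLE };
		default:
			return (struct next_prio){ nice_prio, nice_cls };
		}
	}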
-+ */ -+ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) -+ return; -+ -+ bic->ioprio = ioprio; -+ -+ bfqq = bic_to_bfqq(bic, false); -+ if (bfqq) { -+ /* release process reference on this queue */ -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); -+ bic_set_bfqq(bic, bfqq, false); -+ bfq_log_bfqq(bfqd, bfqq, -+ "check_ioprio_change: bfqq %p %d", -+ bfqq, bfqq->ref); -+ } -+ -+ bfqq = bic_to_bfqq(bic, true); -+ if (bfqq) -+ bfq_set_next_ioprio_data(bfqq, bic); -+} -+ -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic, pid_t pid, int is_sync) -+{ -+ RB_CLEAR_NODE(&bfqq->entity.rb_node); -+ INIT_LIST_HEAD(&bfqq->fifo); -+ INIT_HLIST_NODE(&bfqq->burst_list_node); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ -+ bfqq->ref = 0; -+ bfqq->bfqd = bfqd; -+ -+ if (bic) -+ bfq_set_next_ioprio_data(bfqq, bic); -+ -+ if (is_sync) { -+ /* -+ * No need to mark as has_short_ttime if in -+ * idle_class, because no device idling is performed -+ * for queues in idle class -+ */ -+ if (!bfq_class_idle(bfqq)) -+ /* tentatively mark as has_short_ttime */ -+ bfq_mark_bfqq_has_short_ttime(bfqq); -+ bfq_mark_bfqq_sync(bfqq); -+ bfq_mark_bfqq_just_created(bfqq); -+ } else -+ bfq_clear_bfqq_sync(bfqq); -+ bfq_mark_bfqq_IO_bound(bfqq); -+ -+ /* Tentative initial value to trade off between thr and lat */ -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; -+ bfqq->pid = pid; -+ -+ bfqq->wr_coeff = 1; -+ bfqq->last_wr_start_finish = jiffies; -+ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); -+ bfqq->budget_timeout = bfq_smallest_from_now(); -+ bfqq->split_time = bfq_smallest_from_now(); -+ -+ /* -+ * Set to the value for which bfqq will not be deemed as -+ * soft rt when it becomes backlogged. -+ */ -+ bfqq->soft_rt_next_start = bfq_greatest_from_now(); -+ -+ /* first request is almost certainly seeky */ -+ bfqq->seek_history = 1; -+} -+ -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, -+ struct bfq_group *bfqg, -+ int ioprio_class, int ioprio) -+{ -+ switch (ioprio_class) { -+ case IOPRIO_CLASS_RT: -+ return &bfqg->async_bfqq[0][ioprio]; -+ case IOPRIO_CLASS_NONE: -+ ioprio = IOPRIO_NORM; -+ /* fall through */ -+ case IOPRIO_CLASS_BE: -+ return &bfqg->async_bfqq[1][ioprio]; -+ case IOPRIO_CLASS_IDLE: -+ return &bfqg->async_idle_bfqq; -+ default: -+ BUG(); -+ } -+} -+ -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, bool is_sync, -+ struct bfq_io_cq *bic) -+{ -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ struct bfq_queue **async_bfqq = NULL; -+ struct bfq_queue *bfqq; -+ struct bfq_group *bfqg; -+ -+ rcu_read_lock(); -+ -+ bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); -+ if (!bfqg) { -+ bfqq = &bfqd->oom_bfqq; -+ goto out; -+ } -+ -+ if (!is_sync) { -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, -+ ioprio); -+ bfqq = *async_bfqq; -+ if (bfqq) -+ goto out; -+ } -+ -+ bfqq = kmem_cache_alloc_node(bfq_pool, -+ GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, -+ bfqd->queue->node); -+ -+ if (bfqq) { -+ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, -+ is_sync); -+ bfq_init_entity(&bfqq->entity, bfqg); -+ bfq_log_bfqq(bfqd, bfqq, "allocated"); -+ } else { -+ bfqq = &bfqd->oom_bfqq; -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); -+ goto out; -+ } -+ -+ /* -+ * Pin the queue now that it's allocated, scheduler exit will -+ * prune it. 
-+ */ -+ if (async_bfqq) { -+ bfqq->ref++; /* -+ * Extra group reference, w.r.t. sync -+ * queue. This extra reference is removed -+ * only if bfqq->bfqg disappears, to -+ * guarantee that this queue is not freed -+ * until its group goes away. -+ */ -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", -+ bfqq, bfqq->ref); -+ *async_bfqq = bfqq; -+ } -+ -+out: -+ bfqq->ref++; /* get a process reference to this queue */ -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); -+ rcu_read_unlock(); -+ return bfqq; -+} -+ -+static void bfq_update_io_thinktime(struct bfq_data *bfqd, -+ struct bfq_io_cq *bic) -+{ -+ struct bfq_ttime *ttime = &bic->ttime; -+ u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; -+ -+ elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); -+ -+ ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; -+ ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); -+ ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, -+ ttime->ttime_samples); -+} -+ -+static void -+bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ bfqq->seek_history <<= 1; -+ bfqq->seek_history |= -+ get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR && -+ (!blk_queue_nonrot(bfqd->queue) || -+ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); -+} -+ -+static void bfq_update_has_short_ttime(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ bool has_short_ttime = true; -+ -+ /* -+ * No need to update has_short_ttime if bfqq is async or in -+ * idle io prio class, or if bfq_slice_idle is zero, because -+ * no device idling is performed for bfqq in this case. -+ */ -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || -+ bfqd->bfq_slice_idle == 0) -+ return; -+ -+ /* Idle window just restored, statistics are meaningless. */ -+ if (time_is_after_eq_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) -+ return; -+ -+ /* Think time is infinite if no process is linked to -+ * bfqq. Otherwise check average think time to -+ * decide whether to mark as has_short_ttime -+ */ -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || -+ (bfq_sample_valid(bic->ttime.ttime_samples) && -+ bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) -+ has_short_ttime = false; -+ -+ bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", -+ has_short_ttime); -+ -+ if (has_short_ttime) -+ bfq_mark_bfqq_has_short_ttime(bfqq); -+ else -+ bfq_clear_bfqq_has_short_ttime(bfqq); -+} -+ -+/* -+ * Called when a new fs request (rq) is added to bfqq. Check if there's -+ * something we should do about it. -+ */ -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ struct bfq_io_cq *bic = RQ_BIC(rq); -+ -+ if (rq->cmd_flags & REQ_META) -+ bfqq->meta_pending++; -+ -+ bfq_update_io_thinktime(bfqd, bic); -+ bfq_update_has_short_ttime(bfqd, bfqq, bic); -+ bfq_update_io_seektime(bfqd, bfqq, rq); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "rq_enqueued: has_short_ttime=%d (seeky %d)", -+ bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); -+ -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ -+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { -+ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && -+ blk_rq_sectors(rq) < 32; -+ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); -+ -+ /* -+ * There is just this request queued: if the request -+ * is small and the queue is not to be expired, then -+ * just exit. 
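bfq_update_io_thinktime() above keeps a decaying think-time average in fixed point: each update retains 7/8 of the history and folds the new sample in at a 256x scale, so the sample count converges to 256 and is non-zero from the very first update. A standalone sketch with userspace types, all names assumed:

	#include <stdint.h>

	struct ttime { uint64_t samples, total, mean; };

	static void update_ttime(struct ttime *t, uint64_t elapsed_ns,
				 uint64_t slice_idle_ns)
	{
		/* clamp outliers to twice the idle slice */
		if (elapsed_ns > 2 * slice_idle_ns)
			elapsed_ns = 2 * slice_idle_ns;

		t->samples = (7 * t->samples + 256) / 8;
		t->total = (7 * t->total + 256 * elapsed_ns) / 8;
		t->mean = (t->total + 128) / t->samples;	/* rounded */
	}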
-+ * -+ * In this way, if the device is being idled to wait -+ * for a new request from the in-service queue, we -+ * avoid unplugging the device and committing the -+ * device to serve just a small request. On the -+ * contrary, we wait for the block layer to decide -+ * when to unplug the device: hopefully, new requests -+ * will be merged to this one quickly, then the device -+ * will be unplugged and larger requests will be -+ * dispatched. -+ */ -+ if (small_req && !budget_timeout) -+ return; -+ -+ /* -+ * A large enough request arrived, or the queue is to -+ * be expired: in both cases disk idling is to be -+ * stopped, so clear wait_request flag and reset -+ * timer. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ -+ /* -+ * The queue is not empty, because a new request just -+ * arrived. Hence we can safely expire the queue, in -+ * case of budget timeout, without risking that the -+ * timestamps of the queue are not updated correctly. -+ * See [1] for more details. -+ */ -+ if (budget_timeout) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ -+ /* -+ * Let the request rip immediately, or let a new queue be -+ * selected if bfqq has just been expired. -+ */ -+ __blk_run_queue(bfqd->queue); -+ } -+} -+ -+static void bfq_insert_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; -+ -+ assert_spin_locked(bfqd->queue->queue_lock); -+ -+ /* -+ * An unplug may trigger a requeue of a request from the device -+ * driver: make sure we are in process context while trying to -+ * merge two bfq_queues. -+ */ -+ if (!in_interrupt()) { -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); -+ if (new_bfqq) { -+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) -+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); -+ /* -+ * Release the request's reference to the old bfqq -+ * and make sure one is taken to the shared queue. -+ */ -+ new_bfqq->allocated[rq_data_dir(rq)]++; -+ bfqq->allocated[rq_data_dir(rq)]--; -+ new_bfqq->ref++; -+ bfq_clear_bfqq_just_created(bfqq); -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), -+ bfqq, new_bfqq); -+ /* -+ * rq is about to be enqueued into new_bfqq, -+ * release rq reference on bfqq -+ */ -+ bfq_put_queue(bfqq); -+ rq->elv.priv[1] = new_bfqq; -+ bfqq = new_bfqq; -+ } -+ } -+ -+ bfq_add_request(rq); -+ -+ rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; -+ list_add_tail(&rq->queuelist, &bfqq->fifo); -+ -+ bfq_rq_enqueued(bfqd, bfqq, rq); -+} -+ -+static void bfq_update_hw_tag(struct bfq_data *bfqd) -+{ -+ bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, -+ bfqd->rq_in_driver); -+ -+ if (bfqd->hw_tag == 1) -+ return; -+ -+ /* -+ * This sample is valid if the number of outstanding requests -+ * is large enough to allow a queueing behavior. Note that the -+ * sum is not exact, as it's not taking into account deactivated -+ * requests. 
-+	 */
-+	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
-+		return;
-+
-+	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
-+		return;
-+
-+	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
-+	bfqd->max_rq_in_driver = 0;
-+	bfqd->hw_tag_samples = 0;
-+}
-+
-+static void bfq_completed_request(struct request_queue *q, struct request *rq)
-+{
-+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
-+	struct bfq_data *bfqd = bfqq->bfqd;
-+	u64 now_ns;
-+	u32 delta_us;
-+
-+	bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left",
-+		     blk_rq_sectors(rq));
-+
-+	assert_spin_locked(bfqd->queue->queue_lock);
-+	bfq_update_hw_tag(bfqd);
-+
-+	BUG_ON(!bfqd->rq_in_driver);
-+	BUG_ON(!bfqq->dispatched);
-+	bfqd->rq_in_driver--;
-+	bfqq->dispatched--;
-+	bfqg_stats_update_completion(bfqq_group(bfqq),
-+				     rq_start_time_ns(rq),
-+				     rq_io_start_time_ns(rq),
-+				     rq->cmd_flags);
-+
-+	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
-+		BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
-+		/*
-+		 * Set budget_timeout (which we overload to store the
-+		 * time at which the queue remains with no backlog and
-+		 * no outstanding request; used by the weight-raising
-+		 * mechanism).
-+		 */
-+		bfqq->budget_timeout = jiffies;
-+
-+		bfq_weights_tree_remove(bfqd, &bfqq->entity,
-+					&bfqd->queue_weights_tree);
-+	}
-+
-+	now_ns = ktime_get_ns();
-+
-+	RQ_BIC(rq)->ttime.last_end_request = now_ns;
-+
-+	/*
-+	 * Using us instead of ns, to get a reasonable precision in
-+	 * computing rate in next check.
-+	 */
-+	delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
-+
-+	bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu",
-+		delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size,
-+		(USEC_PER_SEC*
-+		(u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us))
-+			>>BFQ_RATE_SHIFT,
-+		(USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT);
-+
-+	/*
-+	 * If the request took rather long to complete, and, according
-+	 * to the maximum request size recorded, this completion latency
-+	 * implies that the request was certainly served at a very low
-+	 * rate (less than 1M sectors/sec), then the whole observation
-+	 * interval that lasts up to this time instant cannot be a
-+	 * valid time interval for computing a new peak rate. Invoke
-+	 * bfq_update_rate_reset to have the following three steps
-+	 * taken:
-+	 * - close the observation interval at the last (previous)
-+	 *   request dispatch or completion
-+	 * - compute rate, if possible, for that observation interval
-+	 * - reset to zero samples, which will trigger a proper
-+	 *   re-initialization of the observation interval on next
-+	 *   dispatch
-+	 */
-+	if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
-+	   (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
-+			1UL<<(BFQ_RATE_SHIFT - 10))
-+		bfq_update_rate_reset(bfqd, NULL);
-+	bfqd->last_completion = now_ns;
-+
-+	/*
-+	 * If we are waiting to discover whether the request pattern
-+	 * of the task associated with the queue is actually
-+	 * isochronous, and both requisites for this condition to hold
-+	 * are now satisfied, then compute soft_rt_next_start (see the
-+	 * comments on the function bfq_bfqq_softrt_next_start()). We
-+	 * schedule this delayed check when bfqq expires, if it still
-+	 * has in-flight requests.
-+	 */
-+	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
-+	    RB_EMPTY_ROOT(&bfqq->sort_list))
-+		bfqq->soft_rt_next_start =
-+			bfq_bfqq_softrt_next_start(bfqd, bfqq);
-+
-+	/*
-+	 * If this is the in-service queue, check if it needs to be expired,
-+	 * or if we want to idle in case it has no pending requests.
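The low-rate test above compares a fixed-point rate (sectors << BFQ_RATE_SHIFT per microsecond) against 1 << (BFQ_RATE_SHIFT - 10). Assuming BFQ_RATE_SHIFT is 16, as in mainline BFQ (an assumption, since its definition is not in this hunk), a standalone restatement:

	#include <stdbool.h>
	#include <stdint.h>

	#define RATE_SHIFT 16	/* assumed value of BFQ_RATE_SHIFT */

	/* Discard the rate-observation interval when the completion
	 * took longer than the think-time floor and the implied rate
	 * is below 1/1024 sector per microsecond. delta_us must be
	 * nonzero, as in the caller above. */
	static bool must_reset_rate_sampling(uint32_t delta_us,
					     uint32_t min_tt_us,
					     uint32_t last_rq_max_size)
	{
		uint64_t rate = ((uint64_t)last_rq_max_size << RATE_SHIFT)
				/ delta_us;

		return delta_us > min_tt_us &&
		       rate < (1ULL << (RATE_SHIFT - 10));
	}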
-+ */ -+ if (bfqd->in_service_queue == bfqq) { -+ if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { -+ bfq_arm_slice_timer(bfqd); -+ goto out; -+ } else if (bfq_may_expire_for_budg_timeout(bfqq)) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && -+ (bfqq->dispatched == 0 || -+ !bfq_bfqq_may_idle(bfqq))) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_NO_MORE_REQUESTS); -+ } -+ -+ if (!bfqd->rq_in_driver) -+ bfq_schedule_dispatch(bfqd); -+ -+out: -+ return; -+} -+ -+static int __bfq_may_queue(struct bfq_queue *bfqq) -+{ -+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { -+ bfq_clear_bfqq_must_alloc(bfqq); -+ return ELV_MQUEUE_MUST; -+ } -+ -+ return ELV_MQUEUE_MAY; -+} -+ -+static int bfq_may_queue(struct request_queue *q, unsigned int op) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct task_struct *tsk = current; -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq; -+ -+ /* -+ * Don't force setup of a queue from here, as a call to may_queue -+ * does not necessarily imply that a request actually will be -+ * queued. So just lookup a possibly existing queue, or return -+ * 'may queue' if that fails. -+ */ -+ bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ if (!bic) -+ return ELV_MQUEUE_MAY; -+ -+ bfqq = bic_to_bfqq(bic, op_is_sync(op)); -+ if (bfqq) -+ return __bfq_may_queue(bfqq); -+ -+ return ELV_MQUEUE_MAY; -+} -+ -+/* -+ * Queue lock held here. -+ */ -+static void bfq_put_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ if (bfqq) { -+ const int rw = rq_data_dir(rq); -+ -+ BUG_ON(!bfqq->allocated[rw]); -+ bfqq->allocated[rw]--; -+ -+ rq->elv.priv[0] = NULL; -+ rq->elv.priv[1] = NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", -+ bfqq, bfqq->ref); -+ bfq_put_queue(bfqq); -+ } -+} -+ -+/* -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this -+ * was the last process referring to that bfqq. -+ */ -+static struct bfq_queue * -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); -+ -+ put_io_context(bic->icq.ioc); -+ -+ if (bfqq_process_refs(bfqq) == 1) { -+ bfqq->pid = current->pid; -+ bfq_clear_bfqq_coop(bfqq); -+ bfq_clear_bfqq_split_coop(bfqq); -+ return bfqq; -+ } -+ -+ bic_set_bfqq(bic, NULL, 1); -+ -+ bfq_put_cooperator(bfqq); -+ -+ bfq_put_queue(bfqq); -+ return NULL; -+} -+ -+/* -+ * Allocate bfq data structures associated with this request. 
-+ */ -+static int bfq_set_request(struct request_queue *q, struct request *rq, -+ struct bio *bio, gfp_t gfp_mask) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); -+ const int rw = rq_data_dir(rq); -+ const int is_sync = rq_is_sync(rq); -+ struct bfq_queue *bfqq; -+ unsigned long flags; -+ bool bfqq_already_existing = false, split = false; -+ -+ spin_lock_irqsave(q->queue_lock, flags); -+ -+ if (!bic) -+ goto queue_fail; -+ -+ bfq_check_ioprio_change(bic, bio); -+ -+ bfq_bic_update_cgroup(bic, bio); -+ -+new_queue: -+ bfqq = bic_to_bfqq(bic, is_sync); -+ if (!bfqq || bfqq == &bfqd->oom_bfqq) { -+ if (bfqq) -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ -+ bic_set_bfqq(bic, bfqq, is_sync); -+ if (split && is_sync) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_request: was_in_list %d " -+ "was_in_large_burst %d " -+ "large burst in progress %d", -+ bic->was_in_burst_list, -+ bic->saved_in_large_burst, -+ bfqd->large_burst); -+ -+ if ((bic->was_in_burst_list && bfqd->large_burst) || -+ bic->saved_in_large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_request: marking in " -+ "large burst"); -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ } else { -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_request: clearing in " -+ "large burst"); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ if (bic->was_in_burst_list) -+ hlist_add_head(&bfqq->burst_list_node, -+ &bfqd->burst_list); -+ } -+ bfqq->split_time = jiffies; -+ } -+ } else { -+ /* If the queue was seeky for too long, break it apart. */ -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); -+ -+ /* Update bic before losing reference to bfqq */ -+ if (bfq_bfqq_in_large_burst(bfqq)) -+ bic->saved_in_large_burst = true; -+ -+ bfqq = bfq_split_bfqq(bic, bfqq); -+ split = true; -+ if (!bfqq) -+ goto new_queue; -+ else -+ bfqq_already_existing = true; -+ } -+ } -+ -+ bfqq->allocated[rw]++; -+ bfqq->ref++; -+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); -+ -+ rq->elv.priv[0] = bic; -+ rq->elv.priv[1] = bfqq; -+ -+ /* -+ * If a bfq_queue has only one process reference, it is owned -+ * by only one bfq_io_cq: we can set the bic field of the -+ * bfq_queue to the address of that structure. Also, if the -+ * queue has just been split, mark a flag so that the -+ * information is available to the other scheduler hooks. -+ */ -+ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { -+ bfqq->bic = bic; -+ if (split) { -+ /* -+ * If the queue has just been split from a shared -+ * queue, restore the idle window and the possible -+ * weight raising period. -+ */ -+ bfq_bfqq_resume_state(bfqq, bfqd, bic, -+ bfqq_already_existing); -+ } -+ } -+ -+ if (unlikely(bfq_bfqq_just_created(bfqq))) -+ bfq_handle_burst(bfqd, bfqq); -+ -+ spin_unlock_irqrestore(q->queue_lock, flags); -+ -+ return 0; -+ -+queue_fail: -+ bfq_schedule_dispatch(bfqd); -+ spin_unlock_irqrestore(q->queue_lock, flags); -+ -+ return 1; -+} -+ -+static void bfq_kick_queue(struct work_struct *work) -+{ -+ struct bfq_data *bfqd = -+ container_of(work, struct bfq_data, unplug_work); -+ struct request_queue *q = bfqd->queue; -+ -+ spin_lock_irq(q->queue_lock); -+ __blk_run_queue(q); -+ spin_unlock_irq(q->queue_lock); -+} -+ -+/* -+ * Handler of the expiration of the timer running if the in-service queue -+ * is idling inside its time slice. 
-+ */ -+static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) -+{ -+ struct bfq_data *bfqd = container_of(timer, struct bfq_data, -+ idle_slice_timer); -+ struct bfq_queue *bfqq; -+ unsigned long flags; -+ enum bfqq_expiration reason; -+ -+ spin_lock_irqsave(bfqd->queue->queue_lock, flags); -+ -+ bfqq = bfqd->in_service_queue; -+ /* -+ * Theoretical race here: the in-service queue can be NULL or -+ * different from the queue that was idling if the timer handler -+ * spins on the queue_lock and a new request arrives for the -+ * current queue and there is a full dispatch cycle that changes -+ * the in-service queue. This can hardly happen, but in the worst -+ * case we just expire a queue too early. -+ */ -+ if (bfqq) { -+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); -+ bfq_clear_bfqq_wait_request(bfqq); -+ -+ if (bfq_bfqq_budget_timeout(bfqq)) -+ /* -+ * Also here the queue can be safely expired -+ * for budget timeout without wasting -+ * guarantees -+ */ -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) -+ /* -+ * The queue may not be empty upon timer expiration, -+ * because we may not disable the timer when the -+ * first request of the in-service queue arrives -+ * during disk idling. -+ */ -+ reason = BFQ_BFQQ_TOO_IDLE; -+ else -+ goto schedule_dispatch; -+ -+ bfq_bfqq_expire(bfqd, bfqq, true, reason); -+ } -+ -+schedule_dispatch: -+ bfq_schedule_dispatch(bfqd); -+ -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); -+ return HRTIMER_NORESTART; -+} -+ -+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) -+{ -+ hrtimer_cancel(&bfqd->idle_slice_timer); -+ cancel_work_sync(&bfqd->unplug_work); -+} -+ -+static void __bfq_put_async_bfqq(struct bfq_data *bfqd, -+ struct bfq_queue **bfqq_ptr) -+{ -+ struct bfq_group *root_group = bfqd->root_group; -+ struct bfq_queue *bfqq = *bfqq_ptr; -+ -+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq); -+ if (bfqq) { -+ bfq_bfqq_move(bfqd, bfqq, root_group); -+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", -+ bfqq, bfqq->ref); -+ bfq_put_queue(bfqq); -+ *bfqq_ptr = NULL; -+ } -+} -+ -+/* -+ * Release all the bfqg references to its async queues. If we are -+ * deallocating the group these queues may still contain requests, so -+ * we reparent them to the root cgroup (i.e., the only one that will -+ * exist for sure until all the requests on a device are gone). 
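When the idle-slice timer fires, the handler above either expires the in-service queue with a specific reason or merely reschedules a dispatch. A standalone restatement of that choice, with enum names shortened and assumed:

	enum timer_action { EXP_BUDGET_TIMEOUT, EXP_TOO_IDLE, DISPATCH_ONLY };

	/* Sketch of the reason selection in bfq_idle_slice_timer(). */
	static enum timer_action on_idle_timer(int budget_timed_out,
					       int queued_async,
					       int queued_sync)
	{
		if (budget_timed_out)
			return EXP_BUDGET_TIMEOUT;
		if (queued_async == 0 && queued_sync == 0)
			return EXP_TOO_IDLE;	/* idled in vain */
		return DISPATCH_ONLY;
	}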
-+ */ -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); -+ -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); -+} -+ -+static void bfq_exit_queue(struct elevator_queue *e) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ struct request_queue *q = bfqd->queue; -+ struct bfq_queue *bfqq, *n; -+ -+ bfq_shutdown_timer_wq(bfqd); -+ -+ spin_lock_irq(q->queue_lock); -+ -+ BUG_ON(bfqd->in_service_queue); -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) -+ bfq_deactivate_bfqq(bfqd, bfqq, false, false); -+ -+ spin_unlock_irq(q->queue_lock); -+ -+ bfq_shutdown_timer_wq(bfqd); -+ -+ BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ blkcg_deactivate_policy(q, &blkcg_policy_bfq); -+#else -+ bfq_put_async_queues(bfqd, bfqd->root_group); -+ kfree(bfqd->root_group); -+#endif -+ -+ kfree(bfqd); -+} -+ -+static void bfq_init_root_group(struct bfq_group *root_group, -+ struct bfq_data *bfqd) -+{ -+ int i; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ root_group->entity.parent = NULL; -+ root_group->my_entity = NULL; -+ root_group->bfqd = bfqd; -+#endif -+ root_group->rq_pos_tree = RB_ROOT; -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) -+ root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -+ root_group->sched_data.bfq_class_idle_last_service = jiffies; -+} -+ -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) -+{ -+ struct bfq_data *bfqd; -+ struct elevator_queue *eq; -+ -+ eq = elevator_alloc(q, e); -+ if (!eq) -+ return -ENOMEM; -+ -+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); -+ if (!bfqd) { -+ kobject_put(&eq->kobj); -+ return -ENOMEM; -+ } -+ eq->elevator_data = bfqd; -+ -+ /* -+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. -+ * Grab a permanent reference to it, so that the normal code flow -+ * will not attempt to free it. -+ */ -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); -+ bfqd->oom_bfqq.ref++; -+ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; -+ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; -+ bfqd->oom_bfqq.entity.new_weight = -+ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); -+ -+ /* oom_bfqq does not participate to bursts */ -+ bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); -+ /* -+ * Trigger weight initialization, according to ioprio, at the -+ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio -+ * class won't be changed any more. 
-+ */ -+ bfqd->oom_bfqq.entity.prio_changed = 1; -+ -+ bfqd->queue = q; -+ -+ spin_lock_irq(q->queue_lock); -+ q->elevator = eq; -+ spin_unlock_irq(q->queue_lock); -+ -+ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); -+ if (!bfqd->root_group) -+ goto out_free; -+ bfq_init_root_group(bfqd->root_group, bfqd); -+ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); -+ -+ hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, -+ HRTIMER_MODE_REL); -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; -+ -+ bfqd->queue_weights_tree = RB_ROOT; -+ bfqd->group_weights_tree = RB_ROOT; -+ -+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); -+ -+ INIT_LIST_HEAD(&bfqd->active_list); -+ INIT_LIST_HEAD(&bfqd->idle_list); -+ INIT_HLIST_HEAD(&bfqd->burst_list); -+ -+ bfqd->hw_tag = -1; -+ -+ bfqd->bfq_max_budget = bfq_default_max_budget; -+ -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; -+ bfqd->bfq_back_max = bfq_back_max; -+ bfqd->bfq_back_penalty = bfq_back_penalty; -+ bfqd->bfq_slice_idle = bfq_slice_idle; -+ bfqd->bfq_timeout = bfq_timeout; -+ -+ bfqd->bfq_requests_within_timer = 120; -+ -+ bfqd->bfq_large_burst_thresh = 8; -+ bfqd->bfq_burst_interval = msecs_to_jiffies(180); -+ -+ bfqd->low_latency = true; -+ -+ /* -+ * Trade-off between responsiveness and fairness. -+ */ -+ bfqd->bfq_wr_coeff = 30; -+ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); -+ bfqd->bfq_wr_max_time = 0; -+ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); -+ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); -+ bfqd->bfq_wr_max_softrt_rate = 7000; /* -+ * Approximate rate required -+ * to playback or record a -+ * high-definition compressed -+ * video. -+ */ -+ bfqd->wr_busy_queues = 0; -+ -+ /* -+ * Begin by assuming, optimistically, that the device is a -+ * high-speed one, and that its peak rate is equal to 2/3 of -+ * the highest reference rate. -+ */ -+ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * -+ T_fast[blk_queue_nonrot(bfqd->queue)]; -+ bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; -+ bfqd->device_speed = BFQ_BFQD_FAST; -+ -+ return 0; -+ -+out_free: -+ kfree(bfqd); -+ kobject_put(&eq->kobj); -+ return -ENOMEM; -+} -+ -+static void bfq_slab_kill(void) -+{ -+ kmem_cache_destroy(bfq_pool); -+} -+ -+static int __init bfq_slab_setup(void) -+{ -+ bfq_pool = KMEM_CACHE(bfq_queue, 0); -+ if (!bfq_pool) -+ return -ENOMEM; -+ return 0; -+} -+ -+static ssize_t bfq_var_show(unsigned int var, char *page) -+{ -+ return sprintf(page, "%u\n", var); -+} -+ -+static ssize_t bfq_var_store(unsigned long *var, const char *page, -+ size_t count) -+{ -+ unsigned long new_val; -+ int ret = kstrtoul(page, 10, &new_val); -+ -+ if (ret == 0) -+ *var = new_val; -+ -+ return count; -+} -+ -+static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ -+ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? 
-+ jiffies_to_msecs(bfqd->bfq_wr_max_time) : -+ jiffies_to_msecs(bfq_wr_duration(bfqd))); -+} -+ -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) -+{ -+ struct bfq_queue *bfqq; -+ struct bfq_data *bfqd = e->elevator_data; -+ ssize_t num_char = 0; -+ -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", -+ bfqd->queued); -+ -+ spin_lock_irq(bfqd->queue->queue_lock); -+ -+ num_char += sprintf(page + num_char, "Active:\n"); -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { -+ num_char += sprintf(page + num_char, -+ "pid%d: weight %hu, nr_queued %d %d, ", -+ bfqq->pid, -+ bfqq->entity.weight, -+ bfqq->queued[0], -+ bfqq->queued[1]); -+ num_char += sprintf(page + num_char, -+ "dur %d/%u\n", -+ jiffies_to_msecs( -+ jiffies - -+ bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ num_char += sprintf(page + num_char, "Idle:\n"); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { -+ num_char += sprintf(page + num_char, -+ "pid%d: weight %hu, dur %d/%u\n", -+ bfqq->pid, -+ bfqq->entity.weight, -+ jiffies_to_msecs(jiffies - -+ bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ spin_unlock_irq(bfqd->queue->queue_lock); -+ -+ return num_char; -+} -+ -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ u64 __data = __VAR; \ -+ if (__CONV == 1) \ -+ __data = jiffies_to_msecs(__data); \ -+ else if (__CONV == 2) \ -+ __data = div_u64(__data, NSEC_PER_MSEC); \ -+ return bfq_var_show(__data, (page)); \ -+} -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); -+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); -+SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); -+SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); -+SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); -+SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); -+SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, -+ 1); -+SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); -+#undef SHOW_FUNCTION -+ -+#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ u64 __data = __VAR; \ -+ __data = div_u64(__data, NSEC_PER_USEC); \ -+ return bfq_var_show(__data, (page)); \ -+} -+USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); -+#undef USEC_SHOW_FUNCTION -+ -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -+static ssize_t \ -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ if (__CONV == 1) \ -+ *(__PTR) = msecs_to_jiffies(__data); \ -+ else if (__CONV == 2) \ -+ *(__PTR) = (u64)__data 
* NSEC_PER_MSEC; \ -+ else \ -+ *(__PTR) = __data; \ -+ return ret; \ -+} -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, -+ INT_MAX, 2); -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, -+ INT_MAX, 2); -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, -+ INT_MAX, 0); -+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); -+STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); -+STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, -+ 1); -+STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, -+ INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, -+ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, -+ INT_MAX, 0); -+#undef STORE_FUNCTION -+ -+#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ -+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ -+ return ret; \ -+} -+USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, -+ UINT_MAX); -+#undef USEC_STORE_FUNCTION -+ -+/* do nothing for the moment */ -+static ssize_t bfq_weights_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ return count; -+} -+ -+static ssize_t bfq_max_budget_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data == 0) -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); -+ else { -+ if (__data > INT_MAX) -+ __data = INT_MAX; -+ bfqd->bfq_max_budget = __data; -+ } -+ -+ bfqd->bfq_user_max_budget = __data; -+ -+ return ret; -+} -+ -+/* -+ * Leaving this name to preserve name compatibility with cfq -+ * parameters, but this timeout is used for both sync and async. 
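The SHOW_FUNCTION/STORE_FUNCTION blocks above stamp out one sysfs accessor per tunable from a single template, with the __CONV argument selecting the unit conversion (jiffies, milliseconds, or raw). The same stamping technique in a self-contained userspace form; the struct and field names here are illustrative only:

    #include <stdio.h>

    struct tunables {
        unsigned long slice_idle_ns;
        unsigned long back_penalty;
    };

    /* Generate a getter per field; conv == 2 converts ns to ms, mirroring
     * the div_u64(__data, NSEC_PER_MSEC) case in the patch. */
    #define SHOW_FUNCTION(name, field, conv)                    \
    static unsigned long name(const struct tunables *t)         \
    {                                                           \
        unsigned long v = t->field;                             \
        if ((conv) == 2)                                        \
            v /= 1000000UL; /* NSEC_PER_MSEC */                 \
        return v;                                               \
    }
    SHOW_FUNCTION(show_slice_idle_ms, slice_idle_ns, 2)
    SHOW_FUNCTION(show_back_penalty,  back_penalty,  0)
    #undef SHOW_FUNCTION

    int main(void)
    {
        struct tunables t = { .slice_idle_ns = 8000000UL, .back_penalty = 2 };
        printf("slice_idle = %lu ms\n", show_slice_idle_ms(&t)); /* 8 */
        printf("back_penalty = %lu\n",  show_back_penalty(&t));  /* 2 */
        return 0;
    }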
-+ */ -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data < 1) -+ __data = 1; -+ else if (__data > INT_MAX) -+ __data = INT_MAX; -+ -+ bfqd->bfq_timeout = msecs_to_jiffies(__data); -+ if (bfqd->bfq_user_max_budget == 0) -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); -+ -+ return ret; -+} -+ -+static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (!bfqd->strict_guarantees && __data == 1 -+ && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) -+ bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; -+ -+ bfqd->strict_guarantees = __data; -+ -+ return ret; -+} -+ -+static ssize_t bfq_low_latency_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (__data == 0 && bfqd->low_latency != 0) -+ bfq_end_wr(bfqd); -+ bfqd->low_latency = __data; -+ -+ return ret; -+} -+ -+#define BFQ_ATTR(name) \ -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) -+ -+static struct elv_fs_entry bfq_attrs[] = { -+ BFQ_ATTR(fifo_expire_sync), -+ BFQ_ATTR(fifo_expire_async), -+ BFQ_ATTR(back_seek_max), -+ BFQ_ATTR(back_seek_penalty), -+ BFQ_ATTR(slice_idle), -+ BFQ_ATTR(slice_idle_us), -+ BFQ_ATTR(max_budget), -+ BFQ_ATTR(timeout_sync), -+ BFQ_ATTR(strict_guarantees), -+ BFQ_ATTR(low_latency), -+ BFQ_ATTR(wr_coeff), -+ BFQ_ATTR(wr_max_time), -+ BFQ_ATTR(wr_rt_max_time), -+ BFQ_ATTR(wr_min_idle_time), -+ BFQ_ATTR(wr_min_inter_arr_async), -+ BFQ_ATTR(wr_max_softrt_rate), -+ BFQ_ATTR(weights), -+ __ATTR_NULL -+}; -+ -+static struct elevator_type iosched_bfq = { -+ .ops.sq = { -+ .elevator_merge_fn = bfq_merge, -+ .elevator_merged_fn = bfq_merged_request, -+ .elevator_merge_req_fn = bfq_merged_requests, -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ .elevator_bio_merged_fn = bfq_bio_merged, -+#endif -+ .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, -+ .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, -+ .elevator_dispatch_fn = bfq_dispatch_requests, -+ .elevator_add_req_fn = bfq_insert_request, -+ .elevator_activate_req_fn = bfq_activate_request, -+ .elevator_deactivate_req_fn = bfq_deactivate_request, -+ .elevator_completed_req_fn = bfq_completed_request, -+ .elevator_former_req_fn = elv_rb_former_request, -+ .elevator_latter_req_fn = elv_rb_latter_request, -+ .elevator_init_icq_fn = bfq_init_icq, -+ .elevator_exit_icq_fn = bfq_exit_icq, -+ .elevator_set_req_fn = bfq_set_request, -+ .elevator_put_req_fn = bfq_put_request, -+ .elevator_may_queue_fn = bfq_may_queue, -+ .elevator_init_fn = bfq_init_queue, -+ .elevator_exit_fn = bfq_exit_queue, -+ }, -+ .icq_size = sizeof(struct bfq_io_cq), -+ .icq_align = __alignof__(struct bfq_io_cq), -+ .elevator_attrs = bfq_attrs, -+ .elevator_name = "bfq-sq", -+ .elevator_owner = THIS_MODULE, -+}; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+static struct blkcg_policy blkcg_policy_bfq = { -+ .dfl_cftypes = bfq_blkg_files, -+ .legacy_cftypes = bfq_blkcg_legacy_files, -+ -+ .cpd_alloc_fn = bfq_cpd_alloc, -+ .cpd_init_fn = bfq_cpd_init, -+ 
.cpd_bind_fn = bfq_cpd_init, -+ .cpd_free_fn = bfq_cpd_free, -+ -+ .pd_alloc_fn = bfq_pd_alloc, -+ .pd_init_fn = bfq_pd_init, -+ .pd_offline_fn = bfq_pd_offline, -+ .pd_free_fn = bfq_pd_free, -+ .pd_reset_stats_fn = bfq_pd_reset_stats, -+}; -+#endif -+ -+static int __init bfq_init(void) -+{ -+ int ret; -+ char msg[60] = "BFQ I/O-scheduler: v8r12"; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ ret = blkcg_policy_register(&blkcg_policy_bfq); -+ if (ret) -+ return ret; -+#endif -+ -+ ret = -ENOMEM; -+ if (bfq_slab_setup()) -+ goto err_pol_unreg; -+ -+ /* -+ * Times to load large popular applications for the typical -+ * systems installed on the reference devices (see the -+ * comments before the definitions of the next two -+ * arrays). Actually, we use slightly slower values, as the -+ * estimated peak rate tends to be smaller than the actual -+ * peak rate. The reason for this last fact is that estimates -+ * are computed over much shorter time intervals than the long -+ * intervals typically used for benchmarking. Why? First, to -+ * adapt more quickly to variations. Second, because an I/O -+ * scheduler cannot rely on a peak-rate-evaluation workload to -+ * be run for a long time. -+ */ -+ T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */ -+ T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */ -+ T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */ -+ T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */ -+ -+ /* -+ * Thresholds that determine the switch between speed classes -+ * (see the comments before the definition of the array -+ * device_speed_thresh). These thresholds are biased towards -+ * transitions to the fast class. This is safer than the -+ * opposite bias. In fact, a wrong transition to the slow -+ * class results in short weight-raising periods, because the -+ * speed of the device then tends to be higher that the -+ * reference peak rate. On the opposite end, a wrong -+ * transition to the fast class tends to increase -+ * weight-raising periods, because of the opposite reason. 
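The thresholds are then set to 4/3 of the slow-class reference rates just below, which is what biases transitions toward the fast class. A toy version of the resulting classification, using made-up reference rates since the real R_slow[]/R_fast[] constants are defined elsewhere in this patch:

    #include <stdio.h>

    int main(void)
    {
        /* Illustrative reference rates; indices follow the patch's
         * convention: [0] = rotational, [1] = non-rotational. */
        unsigned long r_slow[2] = { 1000, 10700 };
        unsigned long thresh[2];
        int i;

        for (i = 0; i < 2; i++)
            thresh[i] = (4 * r_slow[i]) / 3;   /* biased toward "fast" */

        unsigned long measured_peak = 1500;    /* hypothetical estimate */
        printf("rotational threshold = %lu\n", thresh[0]);
        printf("device classified as %s\n",
               measured_peak > thresh[0] ? "FAST" : "SLOW");
        return 0;
    }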
-+ */ -+ device_speed_thresh[0] = (4 * R_slow[0]) / 3; -+ device_speed_thresh[1] = (4 * R_slow[1]) / 3; -+ -+ ret = elv_register(&iosched_bfq); -+ if (ret) -+ goto err_pol_unreg; -+ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ strcat(msg, " (with cgroups support)"); -+#endif -+ pr_info("%s", msg); -+ -+ return 0; -+ -+err_pol_unreg: -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ return ret; -+} -+ -+static void __exit bfq_exit(void) -+{ -+ elv_unregister(&iosched_bfq); -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ bfq_slab_kill(); -+} -+ -+module_init(bfq_init); -+module_exit(bfq_exit); -+ -+MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); -+MODULE_LICENSE("GPL"); - -From e24d2e6461479dbd13d58be2dc44b23b5e24487c Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Mon, 19 Dec 2016 17:13:39 +0100 -Subject: [PATCH 07/51] Add config and build bits for bfq-mq-iosched - -Signed-off-by: Paolo Valente ---- - block/Kconfig.iosched | 10 +++++++++ - block/Makefile | 1 + - block/bfq-cgroup-included.c | 4 ++-- - block/bfq-mq-iosched.c | 25 ++++++++++++----------- - block/bfq-sched.c | 50 ++++++++++++++++++++++----------------------- - block/bfq-sq-iosched.c | 24 +++++++++++----------- - block/bfq.h | 36 +++++++++++++++++++++----------- - 8 files changed, 88 insertions(+), 64 deletions(-) - -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched -index 9e3f4c2f7390..2d94af3d8b0a 100644 ---- a/block/Kconfig.iosched -+++ b/block/Kconfig.iosched -@@ -96,6 +96,16 @@ config DEFAULT_IOSCHED - default "bfq-sq" if DEFAULT_BFQ_SQ - default "noop" if DEFAULT_NOOP - -+config MQ_IOSCHED_BFQ -+ tristate "BFQ-MQ I/O Scheduler" -+ default y -+ ---help--- -+ BFQ I/O scheduler for BLK-MQ. BFQ-MQ distributes bandwidth -+ among all processes according to their weights, regardless of -+ the device parameters and with any workload. It also -+ guarantees a low latency to interactive and soft real-time -+ applications. Details in Documentation/block/bfq-iosched.txt -+ - config MQ_IOSCHED_DEADLINE - tristate "MQ deadline I/O scheduler" - default y -diff --git a/block/Makefile b/block/Makefile -index 59026b425791..a571329c23f0 100644 ---- a/block/Makefile -+++ b/block/Makefile -@@ -25,6 +25,7 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o - bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o - obj-$(CONFIG_IOSCHED_BFQ) += bfq.o - obj-$(CONFIG_IOSCHED_BFQ_SQ) += bfq-sq-iosched.o -+obj-$(CONFIG_MQ_IOSCHED_BFQ) += bfq-mq-iosched.o - - obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o - obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index af7c216a3540..9c483b658179 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -15,7 +15,7 @@ - * file. 
- */ - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - - /* bfqg stats flags */ - enum bfqg_stats_flags { -@@ -1116,7 +1116,7 @@ static struct cftype bfq_blkg_files[] = { - {} /* terminate */ - }; - --#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ - - static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, - struct bfq_queue *bfqq, unsigned int op) { } -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 30d019fc67e0..e88e00f1e0a7 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -82,6 +82,7 @@ - #include - #include - #include "blk.h" -+#undef CONFIG_BFQ_GROUP_IOSCHED /* cgroups support not yet functional */ - #include "bfq.h" - - /* Expiration time of sync (0) and async (1) requests, in ns. */ -@@ -387,7 +388,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) - return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && - (bfqd->queue_weights_tree.rb_node->rb_left || - bfqd->queue_weights_tree.rb_node->rb_right) --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - ) || - (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && - (bfqd->group_weights_tree.rb_node->rb_left || -@@ -1672,7 +1673,7 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, - } - } - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static void bfq_bio_merged(struct request_queue *q, struct request *req, - struct bio *bio) - { -@@ -3879,7 +3880,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) - */ - static void bfq_put_queue(struct bfq_queue *bfqq) - { --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - struct bfq_group *bfqg = bfqq_group(bfqq); - #endif - -@@ -3909,7 +3910,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); - - kmem_cache_free(bfq_pool, bfqq); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - bfqg_put(bfqg); - #endif - } -@@ -4835,7 +4836,7 @@ static void bfq_exit_queue(struct elevator_queue *e) - - BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_deactivate_policy(q, &blkcg_policy_bfq); - #else - bfq_put_async_queues(bfqd, bfqd->root_group); -@@ -4850,7 +4851,7 @@ static void bfq_init_root_group(struct bfq_group *root_group, - { - int i; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - root_group->entity.parent = NULL; - root_group->my_entity = NULL; - root_group->bfqd = bfqd; -@@ -5265,7 +5266,7 @@ static struct elevator_type iosched_bfq = { - .elevator_merge_fn = bfq_merge, - .elevator_merged_fn = bfq_merged_request, - .elevator_merge_req_fn = bfq_merged_requests, --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - .elevator_bio_merged_fn = bfq_bio_merged, - #endif - .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, -@@ -5292,7 +5293,7 @@ static struct elevator_type iosched_bfq = { - .elevator_owner = THIS_MODULE, - }; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static struct blkcg_policy blkcg_policy_bfq = { - .dfl_cftypes = bfq_blkg_files, - .legacy_cftypes = bfq_blkcg_legacy_files, -@@ -5315,7 +5316,7 @@ static int __init bfq_init(void) - int ret; - char msg[60] = "BFQ I/O-scheduler: v8r12"; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - ret = blkcg_policy_register(&blkcg_policy_bfq); - if 
(ret) - return ret; -@@ -5362,7 +5363,7 @@ static int __init bfq_init(void) - if (ret) - goto err_pol_unreg; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - strcat(msg, " (with cgroups support)"); - #endif - pr_info("%s", msg); -@@ -5370,7 +5371,7 @@ static int __init bfq_init(void) - return 0; - - err_pol_unreg: --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); - #endif - return ret; -@@ -5379,7 +5380,7 @@ static int __init bfq_init(void) - static void __exit bfq_exit(void) - { - elv_unregister(&iosched_bfq); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); - #endif - bfq_slab_kill(); -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 5c0f9290a79c..b54a638186e3 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -136,7 +136,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "update_next_in_service: chosen this queue"); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = - container_of(next_in_service, -@@ -149,7 +149,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - return parent_sched_may_change; - } - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - /* both next loops stop at one of the child entities of the root group */ - #define for_each_entity(entity) \ - for (; entity ; entity = entity->parent) -@@ -243,7 +243,7 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) - return false; - } - --#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ - #define for_each_entity(entity) \ - for (; entity ; entity = NULL) - -@@ -260,7 +260,7 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) - return true; - } - --#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ - - /* - * Shift for timestamp calculations. 
This actually limits the maximum -@@ -323,7 +323,7 @@ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_finish: start %llu, finish %llu, delta %llu", - start, finish, delta); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -473,7 +473,7 @@ static void bfq_update_active_node(struct rb_node *node) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "update_active_node: new min_start %llu", - ((entity->min_start>>10)*1000)>>12); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -540,7 +540,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, - { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct rb_node *node = &entity->rb_node; --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - struct bfq_sched_data *sd = NULL; - struct bfq_group *bfqg = NULL; - struct bfq_data *bfqd = NULL; -@@ -555,7 +555,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, - - bfq_update_active_tree(node); - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - sd = entity->sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - BUG_ON(!bfqg); -@@ -563,7 +563,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, - #endif - if (bfqq) - list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { /* bfq_group */ - BUG_ON(!bfqd); - bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); -@@ -652,7 +652,7 @@ static void bfq_active_extract(struct bfq_service_tree *st, - { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - struct rb_node *node; --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - struct bfq_sched_data *sd = NULL; - struct bfq_group *bfqg = NULL; - struct bfq_data *bfqd = NULL; -@@ -664,7 +664,7 @@ static void bfq_active_extract(struct bfq_service_tree *st, - if (node) - bfq_update_active_tree(node); - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - sd = entity->sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - BUG_ON(!bfqg); -@@ -672,7 +672,7 @@ static void bfq_active_extract(struct bfq_service_tree *st, - #endif - if (bfqq) - list_del(&bfqq->bfqq_list); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { /* bfq_group */ - BUG_ON(!bfqd); - bfq_weights_tree_remove(bfqd, entity, -@@ -809,14 +809,14 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - unsigned int prev_weight, new_weight; - struct bfq_data *bfqd = NULL; - struct rb_root *root; --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - struct bfq_sched_data *sd; - struct bfq_group *bfqg; - #endif - - if (bfqq) - bfqd = bfqq->bfqd; --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - sd = entity->my_sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); -@@ -907,7 +907,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - return new_st; - } - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); - #endif - -@@ -936,7 +936,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) - st->vtime += 
bfq_delta(served, st->wsum); - bfq_forget_idle(st); - } --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); - #endif - st = bfq_entity_service_tree(&bfqq->entity); -@@ -1060,7 +1060,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - bfq_log_bfqq(bfqq->bfqd, bfqq, - "__activate_entity: new queue finish %llu", - ((entity->finish>>10)*1000)>>12); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -1078,7 +1078,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - bfq_log_bfqq(bfqq->bfqd, bfqq, - "__activate_entity: queue %seligible in st %p", - entity->start <= st->vtime ? "" : "non ", st); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -1153,7 +1153,7 @@ static void __bfq_activate_entity(struct bfq_entity *entity, - - BUG_ON(entity->on_st && bfqq); - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - if (entity->on_st && !bfqq) { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, -@@ -1485,7 +1485,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "invoking udpdate_next for this queue"); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = - container_of(entity, -@@ -1525,7 +1525,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_vtime_jump: new value %llu", - root_entity->min_start); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = - container_of(root_entity, struct bfq_group, -@@ -1661,7 +1661,7 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service - "__lookup_next: start %llu vtime %llu st %p", - ((entity->start>>10)*1000)>>12, - ((new_vtime>>10)*1000)>>12, st); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -1735,7 +1735,7 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d", - st + class_idx, class_idx); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -1777,7 +1777,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - */ - sd = &bfqd->root_group->sched_data; - for (; sd ; sd = entity->my_sched_data) { --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - if (entity) { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -1867,7 +1867,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - bfq_log_bfqq(bfqd, bfqq, - "get_next_queue: this queue, finish %llu", - (((entity->finish>>10)*1000)>>10)>>2); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 30d019fc67e0..25da0d1c0622 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -387,7 +387,7 @@ static bool 
bfq_differentiated_weights(struct bfq_data *bfqd) - return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && - (bfqd->queue_weights_tree.rb_node->rb_left || - bfqd->queue_weights_tree.rb_node->rb_right) --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - ) || - (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && - (bfqd->group_weights_tree.rb_node->rb_left || -@@ -1672,7 +1672,7 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, - } - } - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static void bfq_bio_merged(struct request_queue *q, struct request *req, - struct bio *bio) - { -@@ -3879,7 +3879,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) - */ - static void bfq_put_queue(struct bfq_queue *bfqq) - { --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - struct bfq_group *bfqg = bfqq_group(bfqq); - #endif - -@@ -3909,7 +3909,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); - - kmem_cache_free(bfq_pool, bfqq); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - bfqg_put(bfqg); - #endif - } -@@ -4835,7 +4835,7 @@ static void bfq_exit_queue(struct elevator_queue *e) - - BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_deactivate_policy(q, &blkcg_policy_bfq); - #else - bfq_put_async_queues(bfqd, bfqd->root_group); -@@ -4850,7 +4850,7 @@ static void bfq_init_root_group(struct bfq_group *root_group, - { - int i; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - root_group->entity.parent = NULL; - root_group->my_entity = NULL; - root_group->bfqd = bfqd; -@@ -5265,7 +5265,7 @@ static struct elevator_type iosched_bfq = { - .elevator_merge_fn = bfq_merge, - .elevator_merged_fn = bfq_merged_request, - .elevator_merge_req_fn = bfq_merged_requests, --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - .elevator_bio_merged_fn = bfq_bio_merged, - #endif - .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, -@@ -5292,7 +5292,7 @@ static struct elevator_type iosched_bfq = { - .elevator_owner = THIS_MODULE, - }; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static struct blkcg_policy blkcg_policy_bfq = { - .dfl_cftypes = bfq_blkg_files, - .legacy_cftypes = bfq_blkcg_legacy_files, -@@ -5315,7 +5315,7 @@ static int __init bfq_init(void) - int ret; - char msg[60] = "BFQ I/O-scheduler: v8r12"; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - ret = blkcg_policy_register(&blkcg_policy_bfq); - if (ret) - return ret; -@@ -5362,7 +5362,7 @@ static int __init bfq_init(void) - if (ret) - goto err_pol_unreg; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - strcat(msg, " (with cgroups support)"); - #endif - pr_info("%s", msg); -@@ -5370,7 +5370,7 @@ static int __init bfq_init(void) - return 0; - - err_pol_unreg: --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); - #endif - return ret; -@@ -5379,7 +5379,7 @@ static int __init bfq_init(void) - static void __exit bfq_exit(void) - { - elv_unregister(&iosched_bfq); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); - #endif - bfq_slab_kill(); -diff --git a/block/bfq.h b/block/bfq.h -index 34fc4697fd89..53954d1b87f8 100644 ---- 
a/block/bfq.h -+++ b/block/bfq.h -@@ -19,6 +19,18 @@ - #include - #include - -+/* -+ * Define an alternative macro to compile cgroups support. This is one -+ * of the steps needed to let bfq-mq share the files bfq-sched.c and -+ * bfq-cgroup.c with bfq-sq. For bfq-mq, the macro -+ * BFQ_GROUP_IOSCHED_ENABLED will be defined as a function of whether -+ * the configuration option CONFIG_BFQ_MQ_GROUP_IOSCHED, and not -+ * CONFIG_BFQ_GROUP_IOSCHED, is defined. -+ */ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#define BFQ_GROUP_IOSCHED_ENABLED -+#endif -+ - #define BFQ_IOPRIO_CLASSES 3 - #define BFQ_CL_IDLE_TIMEOUT (HZ/5) - -@@ -344,7 +356,7 @@ struct bfq_io_cq { - struct bfq_ttime ttime; - /* per (request_queue, blkcg) ioprio */ - int ioprio; --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - uint64_t blkcg_serial_nr; /* the current blkcg serial */ - #endif - -@@ -671,7 +683,7 @@ static const char *checked_dev_name(const struct device *dev) - return nodev; - } - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - -@@ -696,7 +708,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - __pbuf, ##args); \ - } while (0) - --#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ - pr_crit("%s bfq%d%c " fmt "\n", \ -@@ -705,7 +717,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - ##args) - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) - --#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log(bfqd, fmt, args...) \ - pr_crit("%s bfq " fmt "\n", \ -@@ -713,7 +725,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - ##args) - - #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - -@@ -735,7 +747,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ - } while (0) - --#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ -@@ -743,7 +755,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - ##args) - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) - --#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log(bfqd, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -@@ -763,7 +775,7 @@ enum bfqq_expiration { - - - struct bfqg_stats { --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - /* number of ios merged */ - struct blkg_rwstat merged; - /* total time spent on device in ns, may not be accurate w/ queueing */ -@@ -794,7 +806,7 @@ struct bfqg_stats { - #endif - }; - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - /* - * struct bfq_group_data - per-blkcg storage for the blkio subsystem. 
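The BFQ_GROUP_IOSCHED_ENABLED alias added to bfq.h above is the key move of this patch: shared code in bfq-sched.c and bfq-cgroup-included.c stops testing a specific Kconfig symbol and tests one internal switch instead, so either the SQ or (later) the MQ option can turn cgroup support on. A compilable sketch of the indirection, with option names shortened for illustration:

    #include <stdio.h>

    /* Each flavour maps its own Kconfig symbol onto one internal switch,
     * so shared code only ever tests GROUP_ENABLED. Build with either
     * -DCONFIG_SQ_GROUPS or -DCONFIG_MQ_GROUPS to toggle it. */
    #if defined(CONFIG_SQ_GROUPS) || defined(CONFIG_MQ_GROUPS)
    #define GROUP_ENABLED
    #endif

    int main(void)
    {
    #ifdef GROUP_ENABLED
        printf("built with cgroups support\n");
    #else
        printf("built without cgroups support\n");
    #endif
        return 0;
    }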
- * -@@ -895,7 +907,7 @@ bfq_entity_service_tree(struct bfq_entity *entity) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "entity_service_tree %p %d", - sched_data->service_tree + idx, idx); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); -@@ -924,7 +936,7 @@ static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) - return bic->icq.q->elevator->elevator_data; - } - --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - - static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) - { -@@ -953,7 +965,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bfq_io_cq *bic); - static void bfq_end_wr_async_queues(struct bfq_data *bfqd, - struct bfq_group *bfqg); --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); - #endif - static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); - -From add91dbd756cf8ca3aa3add9a19eef742d5fca6b Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Fri, 20 Jan 2017 09:18:25 +0100 -Subject: [PATCH 08/51] Increase max policies for io controller - -To let bfq-mq policy be plugged too (however cgroups -suppport is not yet functional in bfq-mq). - -Signed-off-by: Paolo Valente ---- - include/linux/blkdev.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index bf000c58644b..10f892ca585d 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -54,7 +54,7 @@ struct blk_stat_callback; - * Maximum number of blkcg policies allowed to be registered concurrently. - * Defined here to simplify include dependency. - */ --#define BLKCG_MAX_POLS 4 -+#define BLKCG_MAX_POLS 5 - - typedef void (rq_end_io_fn)(struct request *, blk_status_t); - - -From 2c39a1d9ab4516d44e01e96f19f578b927e7f2e9 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Mon, 19 Dec 2016 18:11:33 +0100 -Subject: [PATCH 09/51] Copy header file bfq.h as bfq-mq.h - -This commit introduces the header file bfq-mq.h, that will play -for bfq-mq-iosched.c the same role that bfq.h plays for bfq-iosched.c. - -For the moment, the file bfq-mq.h is just a copy of bfq.h. - -Signed-off-by: Paolo Valente ---- - block/bfq-mq-iosched.c | 2 +- - block/bfq-mq.h | 973 +++++++++++++++++++++++++++++++++++++++++++++++++ - 2 files changed, 974 insertions(+), 1 deletion(-) - create mode 100644 block/bfq-mq.h - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index e88e00f1e0a7..d1125aee658c 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -83,7 +83,7 @@ - #include - #include "blk.h" - #undef CONFIG_BFQ_GROUP_IOSCHED /* cgroups support not yet functional */ --#include "bfq.h" -+#include "bfq-mq.h" - - /* Expiration time of sync (0) and async (1) requests, in ns. */ - static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -new file mode 100644 -index 000000000000..53954d1b87f8 ---- /dev/null -+++ b/block/bfq-mq.h -@@ -0,0 +1,973 @@ -+/* -+ * BFQ v8r12 for 4.11.0: data structures and common functions prototypes. 
-+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe -+ * -+ * Copyright (C) 2008 Fabio Checconi -+ * Paolo Valente -+ * -+ * Copyright (C) 2015 Paolo Valente -+ * -+ * Copyright (C) 2017 Paolo Valente -+ */ -+ -+#ifndef _BFQ_H -+#define _BFQ_H -+ -+#include -+#include -+#include -+ -+/* -+ * Define an alternative macro to compile cgroups support. This is one -+ * of the steps needed to let bfq-mq share the files bfq-sched.c and -+ * bfq-cgroup.c with bfq-sq. For bfq-mq, the macro -+ * BFQ_GROUP_IOSCHED_ENABLED will be defined as a function of whether -+ * the configuration option CONFIG_BFQ_MQ_GROUP_IOSCHED, and not -+ * CONFIG_BFQ_GROUP_IOSCHED, is defined. -+ */ -+#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+#define BFQ_GROUP_IOSCHED_ENABLED -+#endif -+ -+#define BFQ_IOPRIO_CLASSES 3 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5) -+ -+#define BFQ_MIN_WEIGHT 1 -+#define BFQ_MAX_WEIGHT 1000 -+#define BFQ_WEIGHT_CONVERSION_COEFF 10 -+ -+#define BFQ_DEFAULT_QUEUE_IOPRIO 4 -+ -+#define BFQ_WEIGHT_LEGACY_DFL 100 -+#define BFQ_DEFAULT_GRP_IOPRIO 0 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE -+ -+/* -+ * Soft real-time applications are extremely more latency sensitive -+ * than interactive ones. Over-raise the weight of the former to -+ * privilege them against the latter. -+ */ -+#define BFQ_SOFTRT_WEIGHT_FACTOR 100 -+ -+struct bfq_entity; -+ -+/** -+ * struct bfq_service_tree - per ioprio_class service tree. -+ * -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each -+ * ioprio_class has its own independent scheduler, and so its own -+ * bfq_service_tree. All the fields are protected by the queue lock -+ * of the containing bfqd. -+ */ -+struct bfq_service_tree { -+ /* tree for active entities (i.e., those backlogged) */ -+ struct rb_root active; -+ /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ -+ struct rb_root idle; -+ -+ struct bfq_entity *first_idle; /* idle entity with minimum F_i */ -+ struct bfq_entity *last_idle; /* idle entity with maximum F_i */ -+ -+ u64 vtime; /* scheduler virtual time */ -+ /* scheduler weight sum; active and idle entities contribute to it */ -+ unsigned long wsum; -+}; -+ -+/** -+ * struct bfq_sched_data - multi-class scheduler. -+ * -+ * bfq_sched_data is the basic scheduler queue. It supports three -+ * ioprio_classes, and can be used either as a toplevel queue or as an -+ * intermediate queue in a hierarchical setup. -+ * -+ * The supported ioprio_classes are the same as in CFQ, in descending -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. -+ * Requests from higher priority queues are served before all the -+ * requests from lower priority queues; among requests of the same -+ * queue requests are served according to B-WF2Q+. -+ * -+ * The schedule is implemented by the service trees, plus the field -+ * @next_in_service, which points to the entity on the active trees -+ * that will be served next, if 1) no changes in the schedule occurs -+ * before the current in-service entity is expired, 2) the in-service -+ * queue becomes idle when it expires, and 3) if the entity pointed by -+ * in_service_entity is not a queue, then the in-service child entity -+ * of the entity pointed by in_service_entity becomes idle on -+ * expiration. 
This peculiar definition allows for the following -+ * optimization, not yet exploited: while a given entity is still in -+ * service, we already know which is the best candidate for next -+ * service among the other active entitities in the same parent -+ * entity. We can then quickly compare the timestamps of the -+ * in-service entity with those of such best candidate. -+ * -+ * All the fields are protected by the queue lock of the containing -+ * bfqd. -+ */ -+struct bfq_sched_data { -+ struct bfq_entity *in_service_entity; /* entity in service */ -+ /* head-of-the-line entity in the scheduler (see comments above) */ -+ struct bfq_entity *next_in_service; -+ /* array of service trees, one per ioprio_class */ -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; -+ /* last time CLASS_IDLE was served */ -+ unsigned long bfq_class_idle_last_service; -+ -+}; -+ -+/** -+ * struct bfq_weight_counter - counter of the number of all active entities -+ * with a given weight. -+ */ -+struct bfq_weight_counter { -+ unsigned int weight; /* weight of the entities this counter refers to */ -+ unsigned int num_active; /* nr of active entities with this weight */ -+ /* -+ * Weights tree member (see bfq_data's @queue_weights_tree and -+ * @group_weights_tree) -+ */ -+ struct rb_node weights_node; -+}; -+ -+/** -+ * struct bfq_entity - schedulable entity. -+ * -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each -+ * entity belongs to the sched_data of the parent group in the cgroup -+ * hierarchy. Non-leaf entities have also their own sched_data, stored -+ * in @my_sched_data. -+ * -+ * Each entity stores independently its priority values; this would -+ * allow different weights on different devices, but this -+ * functionality is not exported to userspace by now. Priorities and -+ * weights are updated lazily, first storing the new values into the -+ * new_* fields, then setting the @prio_changed flag. As soon as -+ * there is a transition in the entity state that allows the priority -+ * update to take place the effective and the requested priority -+ * values are synchronized. -+ * -+ * Unless cgroups are used, the weight value is calculated from the -+ * ioprio to export the same interface as CFQ. When dealing with -+ * ``well-behaved'' queues (i.e., queues that do not spend too much -+ * time to consume their budget and have true sequential behavior, and -+ * when there are no external factors breaking anticipation) the -+ * relative weights at each level of the cgroups hierarchy should be -+ * guaranteed. All the fields are protected by the queue lock of the -+ * containing bfqd. -+ */ -+struct bfq_entity { -+ struct rb_node rb_node; /* service_tree member */ -+ /* pointer to the weight counter associated with this entity */ -+ struct bfq_weight_counter *weight_counter; -+ -+ /* -+ * Flag, true if the entity is on a tree (either the active or -+ * the idle one of its service_tree) or is in service. 
-+ */ -+ bool on_st; -+ -+ u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ -+ u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ -+ -+ /* tree the entity is enqueued into; %NULL if not on a tree */ -+ struct rb_root *tree; -+ -+ /* -+ * minimum start time of the (active) subtree rooted at this -+ * entity; used for O(log N) lookups into active trees -+ */ -+ u64 min_start; -+ -+ /* amount of service received during the last service slot */ -+ int service; -+ -+ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ -+ int budget; -+ -+ unsigned int weight; /* weight of the queue */ -+ unsigned int new_weight; /* next weight if a change is in progress */ -+ -+ /* original weight, used to implement weight boosting */ -+ unsigned int orig_weight; -+ -+ /* parent entity, for hierarchical scheduling */ -+ struct bfq_entity *parent; -+ -+ /* -+ * For non-leaf nodes in the hierarchy, the associated -+ * scheduler queue, %NULL on leaf nodes. -+ */ -+ struct bfq_sched_data *my_sched_data; -+ /* the scheduler queue this entity belongs to */ -+ struct bfq_sched_data *sched_data; -+ -+ /* flag, set to request a weight, ioprio or ioprio_class change */ -+ int prio_changed; -+}; -+ -+struct bfq_group; -+ -+/** -+ * struct bfq_queue - leaf schedulable entity. -+ * -+ * A bfq_queue is a leaf request queue; it can be associated with an -+ * io_context or more, if it is async or shared between cooperating -+ * processes. @cgroup holds a reference to the cgroup, to be sure that it -+ * does not disappear while a bfqq still references it (mostly to avoid -+ * races between request issuing and task migration followed by cgroup -+ * destruction). -+ * All the fields are protected by the queue lock of the containing bfqd. -+ */ -+struct bfq_queue { -+ /* reference counter */ -+ int ref; -+ /* parent bfq_data */ -+ struct bfq_data *bfqd; -+ -+ /* current ioprio and ioprio class */ -+ unsigned short ioprio, ioprio_class; -+ /* next ioprio and ioprio class if a change is in progress */ -+ unsigned short new_ioprio, new_ioprio_class; -+ -+ /* -+ * Shared bfq_queue if queue is cooperating with one or more -+ * other queues. 
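The finish-timestamp rule quoted in the field comments above (F_i = S_i + budget/weight) is the heart of B-WF2Q+ ordering: for the same amount of service, a heavier entity gets an earlier virtual finish time and is therefore selected more often. A minimal numeric sketch; the real bfq_calc_finish() additionally scales by a fixed-point shift, omitted here for clarity:

    #include <stdio.h>

    /* F_i = S_i + service / weight. */
    static unsigned long long calc_finish(unsigned long long start,
                                          unsigned long service,
                                          unsigned int weight)
    {
        return start + service / weight;
    }

    int main(void)
    {
        /* Same 8000-unit slice, weights 100 vs 400: the heavier entity
         * finishes "sooner" in virtual time. */
        printf("w=100: F = %llu\n", calc_finish(0, 8000, 100)); /* 80 */
        printf("w=400: F = %llu\n", calc_finish(0, 8000, 400)); /* 20 */
        return 0;
    }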
-+ */ -+ struct bfq_queue *new_bfqq; -+ /* request-position tree member (see bfq_group's @rq_pos_tree) */ -+ struct rb_node pos_node; -+ /* request-position tree root (see bfq_group's @rq_pos_tree) */ -+ struct rb_root *pos_root; -+ -+ /* sorted list of pending requests */ -+ struct rb_root sort_list; -+ /* if fifo isn't expired, next request to serve */ -+ struct request *next_rq; -+ /* number of sync and async requests queued */ -+ int queued[2]; -+ /* number of sync and async requests currently allocated */ -+ int allocated[2]; -+ /* number of pending metadata requests */ -+ int meta_pending; -+ /* fifo list of requests in sort_list */ -+ struct list_head fifo; -+ -+ /* entity representing this queue in the scheduler */ -+ struct bfq_entity entity; -+ -+ /* maximum budget allowed from the feedback mechanism */ -+ int max_budget; -+ /* budget expiration (in jiffies) */ -+ unsigned long budget_timeout; -+ -+ /* number of requests on the dispatch list or inside driver */ -+ int dispatched; -+ -+ unsigned int flags; /* status flags.*/ -+ -+ /* node for active/idle bfqq list inside parent bfqd */ -+ struct list_head bfqq_list; -+ -+ /* bit vector: a 1 for each seeky requests in history */ -+ u32 seek_history; -+ -+ /* node for the device's burst list */ -+ struct hlist_node burst_list_node; -+ -+ /* position of the last request enqueued */ -+ sector_t last_request_pos; -+ -+ /* Number of consecutive pairs of request completion and -+ * arrival, such that the queue becomes idle after the -+ * completion, but the next request arrives within an idle -+ * time slice; used only if the queue's IO_bound flag has been -+ * cleared. -+ */ -+ unsigned int requests_within_timer; -+ -+ /* pid of the process owning the queue, used for logging purposes */ -+ pid_t pid; -+ -+ /* -+ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL -+ * if the queue is shared. -+ */ -+ struct bfq_io_cq *bic; -+ -+ /* current maximum weight-raising time for this queue */ -+ unsigned long wr_cur_max_time; -+ /* -+ * Minimum time instant such that, only if a new request is -+ * enqueued after this time instant in an idle @bfq_queue with -+ * no outstanding requests, then the task associated with the -+ * queue it is deemed as soft real-time (see the comments on -+ * the function bfq_bfqq_softrt_next_start()) -+ */ -+ unsigned long soft_rt_next_start; -+ /* -+ * Start time of the current weight-raising period if -+ * the @bfq-queue is being weight-raised, otherwise -+ * finish time of the last weight-raising period. -+ */ -+ unsigned long last_wr_start_finish; -+ /* factor by which the weight of this queue is multiplied */ -+ unsigned int wr_coeff; -+ /* -+ * Time of the last transition of the @bfq_queue from idle to -+ * backlogged. -+ */ -+ unsigned long last_idle_bklogged; -+ /* -+ * Cumulative service received from the @bfq_queue since the -+ * last transition from idle to backlogged. -+ */ -+ unsigned long service_from_backlogged; -+ /* -+ * Value of wr start time when switching to soft rt -+ */ -+ unsigned long wr_start_at_switch_to_srt; -+ -+ unsigned long split_time; /* time of last split */ -+}; -+ -+/** -+ * struct bfq_ttime - per process thinktime stats. -+ */ -+struct bfq_ttime { -+ u64 last_end_request; /* completion time of last request */ -+ -+ u64 ttime_total; /* total process thinktime */ -+ unsigned long ttime_samples; /* number of thinktime samples */ -+ u64 ttime_mean; /* average process thinktime */ -+ -+}; -+ -+/** -+ * struct bfq_io_cq - per (request_queue, io_context) structure. 
-+ */ -+struct bfq_io_cq { -+ /* associated io_cq structure */ -+ struct io_cq icq; /* must be the first member */ -+ /* array of two process queues, the sync and the async */ -+ struct bfq_queue *bfqq[2]; -+ /* associated @bfq_ttime struct */ -+ struct bfq_ttime ttime; -+ /* per (request_queue, blkcg) ioprio */ -+ int ioprio; -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ uint64_t blkcg_serial_nr; /* the current blkcg serial */ -+#endif -+ -+ /* -+ * Snapshot of the has_short_time flag before merging; taken -+ * to remember its value while the queue is merged, so as to -+ * be able to restore it in case of split. -+ */ -+ bool saved_has_short_ttime; -+ /* -+ * Same purpose as the previous two fields for the I/O bound -+ * classification of a queue. -+ */ -+ bool saved_IO_bound; -+ -+ /* -+ * Same purpose as the previous fields for the value of the -+ * field keeping the queue's belonging to a large burst -+ */ -+ bool saved_in_large_burst; -+ /* -+ * True if the queue belonged to a burst list before its merge -+ * with another cooperating queue. -+ */ -+ bool was_in_burst_list; -+ -+ /* -+ * Similar to previous fields: save wr information. -+ */ -+ unsigned long saved_wr_coeff; -+ unsigned long saved_last_wr_start_finish; -+ unsigned long saved_wr_start_at_switch_to_srt; -+ unsigned int saved_wr_cur_max_time; -+}; -+ -+enum bfq_device_speed { -+ BFQ_BFQD_FAST, -+ BFQ_BFQD_SLOW, -+}; -+ -+/** -+ * struct bfq_data - per-device data structure. -+ * -+ * All the fields are protected by the @queue lock. -+ */ -+struct bfq_data { -+ /* request queue for the device */ -+ struct request_queue *queue; -+ -+ /* root bfq_group for the device */ -+ struct bfq_group *root_group; -+ -+ /* -+ * rbtree of weight counters of @bfq_queues, sorted by -+ * weight. Used to keep track of whether all @bfq_queues have -+ * the same weight. The tree contains one counter for each -+ * distinct weight associated to some active and not -+ * weight-raised @bfq_queue (see the comments to the functions -+ * bfq_weights_tree_[add|remove] for further details). -+ */ -+ struct rb_root queue_weights_tree; -+ /* -+ * rbtree of non-queue @bfq_entity weight counters, sorted by -+ * weight. Used to keep track of whether all @bfq_groups have -+ * the same weight. The tree contains one counter for each -+ * distinct weight associated to some active @bfq_group (see -+ * the comments to the functions bfq_weights_tree_[add|remove] -+ * for further details). -+ */ -+ struct rb_root group_weights_tree; -+ -+ /* -+ * Number of bfq_queues containing requests (including the -+ * queue in service, even if it is idling). -+ */ -+ int busy_queues; -+ /* number of weight-raised busy @bfq_queues */ -+ int wr_busy_queues; -+ /* number of queued requests */ -+ int queued; -+ /* number of requests dispatched and waiting for completion */ -+ int rq_in_driver; -+ -+ /* -+ * Maximum number of requests in driver in the last -+ * @hw_tag_samples completed requests. -+ */ -+ int max_rq_in_driver; -+ /* number of samples used to calculate hw_tag */ -+ int hw_tag_samples; -+ /* flag set to one if the driver is showing a queueing behavior */ -+ int hw_tag; -+ -+ /* number of budgets assigned */ -+ int budgets_assigned; -+ -+ /* -+ * Timer set when idling (waiting) for the next request from -+ * the queue in service. 
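The saved_* fields above implement a snapshot/restore protocol: per-queue state is copied into the per-process bfq_io_cq when queues merge, so the original view can be reinstated if the shared queue later splits. A compact sketch of the idea, with the field set reduced and names chosen for illustration:

    #include <stdbool.h>
    #include <stdio.h>

    struct queue_state { bool io_bound; unsigned long wr_coeff; };

    struct io_ctx {                 /* per-process, survives the merge */
        struct queue_state saved;
    };

    static void on_merge(struct io_ctx *ic, const struct queue_state *qs)
    {
        ic->saved = *qs;            /* snapshot before state is shared */
    }

    static void on_split(const struct io_ctx *ic, struct queue_state *qs)
    {
        *qs = ic->saved;            /* restore the pre-merge view */
    }

    int main(void)
    {
        struct queue_state qs = { .io_bound = true, .wr_coeff = 30 };
        struct io_ctx ic;

        on_merge(&ic, &qs);
        qs.io_bound = false;        /* the shared queue evolves differently */
        on_split(&ic, &qs);
        printf("restored io_bound=%d wr_coeff=%lu\n", qs.io_bound, qs.wr_coeff);
        return 0;
    }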
-+ */ -+ struct hrtimer idle_slice_timer; -+ /* delayed work to restart dispatching on the request queue */ -+ struct work_struct unplug_work; -+ -+ /* bfq_queue in service */ -+ struct bfq_queue *in_service_queue; -+ /* bfq_io_cq (bic) associated with the @in_service_queue */ -+ struct bfq_io_cq *in_service_bic; -+ -+ /* on-disk position of the last served request */ -+ sector_t last_position; -+ -+ /* time of last request completion (ns) */ -+ u64 last_completion; -+ -+ /* time of first rq dispatch in current observation interval (ns) */ -+ u64 first_dispatch; -+ /* time of last rq dispatch in current observation interval (ns) */ -+ u64 last_dispatch; -+ -+ /* beginning of the last budget */ -+ ktime_t last_budget_start; -+ /* beginning of the last idle slice */ -+ ktime_t last_idling_start; -+ -+ /* number of samples in current observation interval */ -+ int peak_rate_samples; -+ /* num of samples of seq dispatches in current observation interval */ -+ u32 sequential_samples; -+ /* total num of sectors transferred in current observation interval */ -+ u64 tot_sectors_dispatched; -+ /* max rq size seen during current observation interval (sectors) */ -+ u32 last_rq_max_size; -+ /* time elapsed from first dispatch in current observ. interval (us) */ -+ u64 delta_from_first; -+ /* current estimate of device peak rate */ -+ u32 peak_rate; -+ -+ /* maximum budget allotted to a bfq_queue before rescheduling */ -+ int bfq_max_budget; -+ -+ /* list of all the bfq_queues active on the device */ -+ struct list_head active_list; -+ /* list of all the bfq_queues idle on the device */ -+ struct list_head idle_list; -+ -+ /* -+ * Timeout for async/sync requests; when it fires, requests -+ * are served in fifo order. -+ */ -+ u64 bfq_fifo_expire[2]; -+ /* weight of backward seeks wrt forward ones */ -+ unsigned int bfq_back_penalty; -+ /* maximum allowed backward seek */ -+ unsigned int bfq_back_max; -+ /* maximum idling time */ -+ u32 bfq_slice_idle; -+ -+ /* user-configured max budget value (0 for auto-tuning) */ -+ int bfq_user_max_budget; -+ /* -+ * Timeout for bfq_queues to consume their budget; used to -+ * prevent seeky queues from imposing long latencies to -+ * sequential or quasi-sequential ones (this also implies that -+ * seeky queues cannot receive guarantees in the service -+ * domain; after a timeout they are charged for the time they -+ * have been in service, to preserve fairness among them, but -+ * without service-domain guarantees). -+ */ -+ unsigned int bfq_timeout; -+ -+ /* -+ * Number of consecutive requests that must be issued within -+ * the idle time slice to set again idling to a queue which -+ * was marked as non-I/O-bound (see the definition of the -+ * IO_bound flag for further details). -+ */ -+ unsigned int bfq_requests_within_timer; -+ -+ /* -+ * Force device idling whenever needed to provide accurate -+ * service guarantees, without caring about throughput -+ * issues. CAVEAT: this may even increase latencies, in case -+ * of useless idling for processes that did stop doing I/O. -+ */ -+ bool strict_guarantees; -+ -+ /* -+ * Last time at which a queue entered the current burst of -+ * queues being activated shortly after each other; for more -+ * details about this and the following parameters related to -+ * a burst of activations, see the comments on the function -+ * bfq_handle_burst. -+ */ -+ unsigned long last_ins_in_burst; -+ /* -+ * Reference time interval used to decide whether a queue has -+ * been activated shortly after @last_ins_in_burst. 
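last_ins_in_burst plus bfq_burst_interval define a sliding window: an activation inside the window extends the current burst, one outside it starts a new one. A sketch of that check with plain integers standing in for jiffies; the real logic lives in bfq_handle_burst(), elsewhere in this patch:

    #include <stdio.h>

    int main(void)
    {
        unsigned long last_ins = 1000;  /* time of the previous activation */
        unsigned long interval = 180;   /* msecs_to_jiffies(180) in the patch */

        unsigned long t1 = 1100, t2 = 1400;

        printf("t1: %s\n",
               (t1 - last_ins) <= interval ? "extends burst" : "new burst");
        last_ins = t1;                  /* each activation refreshes the window */

        printf("t2: %s\n",
               (t2 - last_ins) <= interval ? "extends burst" : "new burst");
        return 0;
    }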
-+ */ -+ unsigned long bfq_burst_interval; -+ /* number of queues in the current burst of queue activations */ -+ int burst_size; -+ -+ /* common parent entity for the queues in the burst */ -+ struct bfq_entity *burst_parent_entity; -+ /* Maximum burst size above which the current queue-activation -+ * burst is deemed as 'large'. -+ */ -+ unsigned long bfq_large_burst_thresh; -+ /* true if a large queue-activation burst is in progress */ -+ bool large_burst; -+ /* -+ * Head of the burst list (as for the above fields, more -+ * details in the comments on the function bfq_handle_burst). -+ */ -+ struct hlist_head burst_list; -+ -+ /* if set to true, low-latency heuristics are enabled */ -+ bool low_latency; -+ /* -+ * Maximum factor by which the weight of a weight-raised queue -+ * is multiplied. -+ */ -+ unsigned int bfq_wr_coeff; -+ /* maximum duration of a weight-raising period (jiffies) */ -+ unsigned int bfq_wr_max_time; -+ -+ /* Maximum weight-raising duration for soft real-time processes */ -+ unsigned int bfq_wr_rt_max_time; -+ /* -+ * Minimum idle period after which weight-raising may be -+ * reactivated for a queue (in jiffies). -+ */ -+ unsigned int bfq_wr_min_idle_time; -+ /* -+ * Minimum period between request arrivals after which -+ * weight-raising may be reactivated for an already busy async -+ * queue (in jiffies). -+ */ -+ unsigned long bfq_wr_min_inter_arr_async; -+ -+ /* Max service-rate for a soft real-time queue, in sectors/sec */ -+ unsigned int bfq_wr_max_softrt_rate; -+ /* -+ * Cached value of the product R*T, used for computing the -+ * maximum duration of weight raising automatically. -+ */ -+ u64 RT_prod; -+ /* device-speed class for the low-latency heuristic */ -+ enum bfq_device_speed device_speed; -+ -+ /* fallback dummy bfqq for extreme OOM conditions */ -+ struct bfq_queue oom_bfqq; -+}; -+ -+enum bfqq_state_flags { -+ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ -+ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ -+ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* -+ * waiting for a request -+ * without idling the device -+ */ -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ -+ BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ -+ BFQ_BFQQ_FLAG_IO_bound, /* -+ * bfqq has timed-out at least once -+ * having consumed at most 2/10 of -+ * its budget -+ */ -+ BFQ_BFQQ_FLAG_in_large_burst, /* -+ * bfqq activated in a large burst, -+ * see comments to bfq_handle_burst. 
-+ */ -+ BFQ_BFQQ_FLAG_softrt_update, /* -+ * may need softrt-next-start -+ * update -+ */ -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ -+ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ -+}; -+ -+#define BFQ_BFQQ_FNS(name) \ -+static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ -+{ \ -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ -+} -+ -+BFQ_BFQQ_FNS(just_created); -+BFQ_BFQQ_FNS(busy); -+BFQ_BFQQ_FNS(wait_request); -+BFQ_BFQQ_FNS(non_blocking_wait_rq); -+BFQ_BFQQ_FNS(must_alloc); -+BFQ_BFQQ_FNS(fifo_expire); -+BFQ_BFQQ_FNS(has_short_ttime); -+BFQ_BFQQ_FNS(sync); -+BFQ_BFQQ_FNS(IO_bound); -+BFQ_BFQQ_FNS(in_large_burst); -+BFQ_BFQQ_FNS(coop); -+BFQ_BFQQ_FNS(split_coop); -+BFQ_BFQQ_FNS(softrt_update); -+#undef BFQ_BFQQ_FNS -+ -+/* Logging facilities. */ -+#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE -+ -+static const char *checked_dev_name(const struct device *dev) -+{ -+ static const char nodev[] = "nodev"; -+ -+ if (dev) -+ return dev_name(dev); -+ -+ return nodev; -+} -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ assert_spin_locked((bfqd)->queue->queue_lock); \ -+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -+ pr_crit("%s bfq%d%c %s " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -+ pr_crit("%s %s " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ pr_crit("%s bfq%d%c " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log(bfqd, fmt, args...) \ -+ pr_crit("%s bfq " fmt "\n", \ -+ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -+ ##args) -+ -+#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ assert_spin_locked((bfqd)->queue->queue_lock); \ -+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -+ blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ -+} while (0) -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) 
\ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED */ -+ -+#define bfq_log(bfqd, fmt, args...) \ -+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+ -+/* Expiration reasons. */ -+enum bfqq_expiration { -+ BFQ_BFQQ_TOO_IDLE = 0, /* -+ * queue has been idling for -+ * too long -+ */ -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ -+ BFQ_BFQQ_PREEMPTED /* preemption in progress */ -+}; -+ -+ -+struct bfqg_stats { -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ /* number of ios merged */ -+ struct blkg_rwstat merged; -+ /* total time spent on device in ns, may not be accurate w/ queueing */ -+ struct blkg_rwstat service_time; -+ /* total time spent waiting in scheduler queue in ns */ -+ struct blkg_rwstat wait_time; -+ /* number of IOs queued up */ -+ struct blkg_rwstat queued; -+ /* total disk time and nr sectors dispatched by this group */ -+ struct blkg_stat time; -+ /* sum of number of ios queued across all samples */ -+ struct blkg_stat avg_queue_size_sum; -+ /* count of samples taken for average */ -+ struct blkg_stat avg_queue_size_samples; -+ /* how many times this group has been removed from service tree */ -+ struct blkg_stat dequeue; -+ /* total time spent waiting for it to be assigned a timeslice. */ -+ struct blkg_stat group_wait_time; -+ /* time spent idling for this blkcg_gq */ -+ struct blkg_stat idle_time; -+ /* total time with empty current active q with other requests queued */ -+ struct blkg_stat empty_time; -+ /* fields after this shouldn't be cleared on stat reset */ -+ uint64_t start_group_wait_time; -+ uint64_t start_idle_time; -+ uint64_t start_empty_time; -+ uint16_t flags; -+#endif -+}; -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+/* -+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem. -+ * -+ * @ps: @blkcg_policy_storage that this structure inherits -+ * @weight: weight of the bfq_group -+ */ -+struct bfq_group_data { -+ /* must be the first member */ -+ struct blkcg_policy_data pd; -+ -+ unsigned int weight; -+}; -+ -+/** -+ * struct bfq_group - per (device, cgroup) data structure. -+ * @entity: schedulable entity to insert into the parent group sched_data. -+ * @sched_data: own sched_data, to contain child entities (they may be -+ * both bfq_queues and bfq_groups). -+ * @bfqd: the bfq_data for the device this group acts upon. -+ * @async_bfqq: array of async queues for all the tasks belonging to -+ * the group, one queue per ioprio value per ioprio_class, -+ * except for the idle class that has only one queue. -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used -+ * to avoid too many special cases during group creation/ -+ * migration. -+ * @active_entities: number of active entities belonging to the group; -+ * unused for the root group. Used to know whether there -+ * are groups with more than one active @bfq_entity -+ * (see the comments to the function -+ * bfq_bfqq_may_idle()). -+ * @rq_pos_tree: rbtree sorted by next_request position, used when -+ * determining if two or more queues have interleaving -+ * requests (see bfq_find_close_cooperator()). 
-+ * -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup -+ * there is a set of bfq_groups, each one collecting the lower-level -+ * entities belonging to the group that are acting on the same device. -+ * -+ * Locking works as follows: -+ * o @bfqd is protected by the queue lock, RCU is used to access it -+ * from the readers. -+ * o All the other fields are protected by the @bfqd queue lock. -+ */ -+struct bfq_group { -+ /* must be the first member */ -+ struct blkg_policy_data pd; -+ -+ struct bfq_entity entity; -+ struct bfq_sched_data sched_data; -+ -+ void *bfqd; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct bfq_entity *my_entity; -+ -+ int active_entities; -+ -+ struct rb_root rq_pos_tree; -+ -+ struct bfqg_stats stats; -+}; -+ -+#else -+struct bfq_group { -+ struct bfq_sched_data sched_data; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct rb_root rq_pos_tree; -+}; -+#endif -+ -+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); -+ -+static unsigned int bfq_class_idx(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ return bfqq ? bfqq->ioprio_class - 1 : -+ BFQ_DEFAULT_GRP_CLASS - 1; -+} -+ -+static struct bfq_service_tree * -+bfq_entity_service_tree(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sched_data = entity->sched_data; -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned int idx = bfq_class_idx(entity); -+ -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); -+ BUG_ON(sched_data == NULL); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "entity_service_tree %p %d", -+ sched_data->service_tree + idx, idx); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "entity_service_tree %p %d", -+ sched_data->service_tree + idx, idx); -+ } -+#endif -+ return sched_data->service_tree + idx; -+} -+ -+static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) -+{ -+ return bic->bfqq[is_sync]; -+} -+ -+static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, -+ bool is_sync) -+{ -+ bic->bfqq[is_sync] = bfqq; -+} -+ -+static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) -+{ -+ return bic->icq.q->elevator->elevator_data; -+} -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *group_entity = bfqq->entity.parent; -+ -+ if (!group_entity) -+ group_entity = &bfqq->bfqd->root_group->entity; -+ -+ return container_of(group_entity, struct bfq_group, entity); -+} -+ -+#else -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ return bfqq->bfqd->root_group; -+} -+ -+#endif -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); -+static void bfq_put_queue(struct bfq_queue *bfqq); -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, bool is_sync, -+ struct bfq_io_cq *bic); -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -+#endif -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); -+ -+#endif /* _BFQ_H 
*/ - -From 0bd96428e086fd28800efdf5f0a5f62869af6e30 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Sat, 21 Jan 2017 12:41:14 +0100 -Subject: [PATCH 10/51] Move thinktime from bic to bfqq - -Prep change to make it possible to protect this field with a -scheduler lock. - -Signed-off-by: Paolo Valente ---- - block/bfq-mq-iosched.c | 28 ++++++++++++++-------------- - block/bfq-mq.h | 30 ++++++++++++++++-------------- - 2 files changed, 30 insertions(+), 28 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index d1125aee658c..65f5dfb79417 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -698,6 +698,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - if (unlikely(busy)) - old_wr_coeff = bfqq->wr_coeff; - -+ bfqq->ttime = bic->saved_ttime; - bfqq->wr_coeff = bic->saved_wr_coeff; - bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; - BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); -@@ -1287,7 +1288,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, - * details on the usage of the next variable. - */ - arrived_in_time = ktime_get_ns() <= -- RQ_BIC(rq)->ttime.last_end_request + -+ bfqq->ttime.last_end_request + - bfqd->bfq_slice_idle * 3; - - bfq_log_bfqq(bfqd, bfqq, -@@ -2048,6 +2049,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - if (!bic) - return; - -+ bic->saved_ttime = bfqq->ttime; - bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); - bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -@@ -3948,11 +3950,6 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_put_queue(bfqq); /* release process reference */ - } - --static void bfq_init_icq(struct io_cq *icq) --{ -- icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); --} -- - static void bfq_exit_icq(struct io_cq *icq) - { - struct bfq_io_cq *bic = icq_to_bic(icq); -@@ -4084,6 +4081,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfq_mark_bfqq_just_created(bfqq); - } else - bfq_clear_bfqq_sync(bfqq); -+ -+ bfqq->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); -+ - bfq_mark_bfqq_IO_bound(bfqq); - - /* Tentative initial value to trade off between thr and lat */ -@@ -4191,14 +4191,14 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - } - - static void bfq_update_io_thinktime(struct bfq_data *bfqd, -- struct bfq_io_cq *bic) -+ struct bfq_queue *bfqq) - { -- struct bfq_ttime *ttime = &bic->ttime; -- u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; -+ struct bfq_ttime *ttime = &bfqq->ttime; -+ u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; - - elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); - -- ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; -+ ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8; - ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); - ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, - ttime->ttime_samples); -@@ -4240,8 +4240,8 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, - * decide whether to mark as has_short_ttime - */ - if (atomic_read(&bic->icq.ioc->active_ref) == 0 || -- (bfq_sample_valid(bic->ttime.ttime_samples) && -- bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) -+ (bfq_sample_valid(bfqq->ttime.ttime_samples) && -+ bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) - has_short_ttime = false; - - 
bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d",
-@@ -4265,7 +4265,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- if (rq->cmd_flags & REQ_META)
- bfqq->meta_pending++;
- 
-- bfq_update_io_thinktime(bfqd, bic);
-+ bfq_update_io_thinktime(bfqd, bfqq);
- bfq_update_has_short_ttime(bfqd, bfqq, bic);
- bfq_update_io_seektime(bfqd, bfqq, rq);
- 
-@@ -4436,7 +4436,7 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
- 
- now_ns = ktime_get_ns();
- 
-- RQ_BIC(rq)->ttime.last_end_request = now_ns;
-+ bfqq->ttime.last_end_request = now_ns;
- 
- /*
- * Using us instead of ns, to get a reasonable precision in
-diff --git a/block/bfq-mq.h b/block/bfq-mq.h
-index 53954d1b87f8..0f51f270469c 100644
---- a/block/bfq-mq.h
-+++ b/block/bfq-mq.h
-@@ -210,6 +210,18 @@ struct bfq_entity {
- struct bfq_group;
- 
- /**
-+ * struct bfq_ttime - per process thinktime stats.
-+ */
-+struct bfq_ttime {
-+ u64 last_end_request; /* completion time of last request */
-+
-+ u64 ttime_total; /* total process thinktime */
-+ unsigned long ttime_samples; /* number of thinktime samples */
-+ u64 ttime_mean; /* average process thinktime */
-+
-+};
-+
-+/**
- * struct bfq_queue - leaf schedulable entity.
- *
- * A bfq_queue is a leaf request queue; it can be associated with an
-@@ -270,6 +282,9 @@ struct bfq_queue {
- /* node for active/idle bfqq list inside parent bfqd */
- struct list_head bfqq_list;
- 
-+ /* associated @bfq_ttime struct */
-+ struct bfq_ttime ttime;
-+
- /* bit vector: a 1 for each seeky requests in history */
- u32 seek_history;
- 
-@@ -333,18 +348,6 @@ struct bfq_queue {
- };
- 
- /**
-- * struct bfq_ttime - per process thinktime stats.
-- */
--struct bfq_ttime {
-- u64 last_end_request; /* completion time of last request */
--
-- u64 ttime_total; /* total process thinktime */
-- unsigned long ttime_samples; /* number of thinktime samples */
-- u64 ttime_mean; /* average process thinktime */
--
--};
--
--/**
- * struct bfq_io_cq - per (request_queue, io_context) structure.
- */
- struct bfq_io_cq {
-@@ -352,8 +355,6 @@ struct bfq_io_cq {
- struct io_cq icq; /* must be the first member */
- /* array of two process queues, the sync and the async */
- struct bfq_queue *bfqq[2];
-- /* associated @bfq_ttime struct */
-- struct bfq_ttime ttime;
- /* per (request_queue, blkcg) ioprio */
- int ioprio;
- #ifdef BFQ_GROUP_IOSCHED_ENABLED
-@@ -390,6 +391,7 @@ struct bfq_io_cq {
- unsigned long saved_last_wr_start_finish;
- unsigned long saved_wr_start_at_switch_to_srt;
- unsigned int saved_wr_cur_max_time;
-+ struct bfq_ttime saved_ttime;
- };
- 
- enum bfq_device_speed {
- 
-From 351a9aea7c0c9c30edacdbf2a3c0d089470de1e8 Mon Sep 17 00:00:00 2001
-From: Paolo Valente
-Date: Wed, 18 Jan 2017 11:42:22 +0100
-Subject: [PATCH 11/51] Embed bfq-ioc.c and add locking on request queue
-
-The version of bfq-ioc.c for bfq-iosched.c is not correct any more for
-bfq-mq, because, in bfq-mq, the request queue lock is not being held
-when bfq_bic_lookup is invoked. That function must then take that lock
-on its own. This commit removes the inclusion of bfq-ioc.c, copies the
-content of bfq-ioc.c into bfq-mq-iosched.c, and adds the grabbing of
-the lock.
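
In short: under blk-mq the caller of bfq_bic_lookup() no longer holds the
request queue lock, so the lookup must take and release it internally. The
pattern, distilled from the bfq_bic_lookup() hunk below for readability
(icq_to_bic() is defined in the same hunk; this is a restatement, not new
code):

static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
					struct io_context *ioc,
					struct request_queue *q)
{
	if (ioc) {
		struct bfq_io_cq *icq;

		/* the caller no longer holds the queue lock: grab it here */
		spin_lock_irq(q->queue_lock);
		icq = icq_to_bic(ioc_lookup_icq(ioc, q));
		spin_unlock_irq(q->queue_lock);

		return icq;
	}

	return NULL;
}

Note that [PATCH 12/51] further down converts this lock/unlock pair to
spin_lock_irqsave()/spin_unlock_irqrestore().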
- -Signed-off-by: Paolo Valente ---- - block/bfq-mq-iosched.c | 39 ++++++++++++++++++++++++++++++++++++--- - 1 file changed, 36 insertions(+), 3 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 65f5dfb79417..756a618d5902 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -195,7 +195,39 @@ static int device_speed_thresh[2]; - - static void bfq_schedule_dispatch(struct bfq_data *bfqd); - --#include "bfq-ioc.c" -+/** -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. -+ * @icq: the iocontext queue. -+ */ -+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) -+{ -+ /* bic->icq is the first member, %NULL will convert to %NULL */ -+ return container_of(icq, struct bfq_io_cq, icq); -+} -+ -+/** -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. -+ * @bfqd: the lookup key. -+ * @ioc: the io_context of the process doing I/O. -+ * @q: the request queue. -+ */ -+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, -+ struct io_context *ioc, -+ struct request_queue *q) -+{ -+ if (ioc) { -+ struct bfq_io_cq *icq; -+ -+ spin_lock_irq(q->queue_lock); -+ icq = icq_to_bic(ioc_lookup_icq(ioc, q)); -+ spin_unlock_irq(q->queue_lock); -+ -+ return icq; -+ } -+ -+ return NULL; -+} -+ - #include "bfq-sched.c" - #include "bfq-cgroup-included.c" - -@@ -1520,13 +1552,14 @@ static void bfq_add_request(struct request *rq) - } - - static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, -- struct bio *bio) -+ struct bio *bio, -+ struct request_queue *q) - { - struct task_struct *tsk = current; - struct bfq_io_cq *bic; - struct bfq_queue *bfqq; - -- bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ bic = bfq_bic_lookup(bfqd, tsk->io_context, q); - if (!bic) - return NULL; - - -From ed0d64e27b2308813a2a846139e405e0479f0849 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Tue, 20 Dec 2016 09:07:19 +0100 -Subject: [PATCH 12/51] Modify interface and operation to comply with - blk-mq-sched - -As for modifications of the operation, the major changes are the introduction -of a scheduler lock, and the moving to deferred work of the body of the hook -exit_icq. The latter change has been made to avoid deadlocks caused by the -combination of the following facts: 1) such a body takes the scheduler lock, -and, if not deferred, 2) it does so from inside the exit_icq hook, which is -invoked with the queue lock held, and 3) there is at least one code path, -namely that starting from bfq_bio_merge, which takes these locks in the -opposite order. 
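
Distilled from the hunks below: the exit_icq hook now only queues a work
item, and the body, which needs the scheduler lock, runs later from process
context. This restatement mirrors the bfq_init_icq()/bfq_exit_icq()/
bfq_exit_icq_body() additions in this patch; bfq_exit_icq_bfqq(), also added
here, is the helper that actually takes bfqd->lock:

static void bfq_exit_icq_body(struct work_struct *work)
{
	struct bfq_io_cq *bic =
		container_of(work, struct bfq_io_cq, exit_icq_work);

	/* runs in a workqueue, so taking the scheduler lock is safe */
	bfq_exit_icq_bfqq(bic, true);	/* sync queue */
	bfq_exit_icq_bfqq(bic, false);	/* async queue */
}

static void bfq_init_icq(struct io_cq *icq)
{
	struct bfq_io_cq *bic = icq_to_bic(icq);

	INIT_WORK(&bic->exit_icq_work, bfq_exit_icq_body);
}

static void bfq_exit_icq(struct io_cq *icq)
{
	/* may be called with the queue lock held: defer the real work */
	kblockd_schedule_work(&icq_to_bic(icq)->exit_icq_work);
}

With the body deferred, the scheduler lock is never taken while the queue
lock is held on this path, so it can no longer order the two locks opposite
to the bfq_bio_merge path mentioned above.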
- -Signed-off-by: Paolo Valente ---- - block/bfq-cgroup-included.c | 4 - - block/bfq-mq-iosched.c | 695 ++++++++++++++++++++++++-------------------- - block/bfq-mq.h | 35 +-- - 3 files changed, 394 insertions(+), 340 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index 9c483b658179..8a73de76f32b 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -472,8 +472,6 @@ static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, - struct bfq_group *bfqg, *parent; - struct bfq_entity *entity; - -- assert_spin_locked(bfqd->queue->queue_lock); -- - bfqg = bfq_lookup_bfqg(bfqd, blkcg); - - if (unlikely(!bfqg)) -@@ -602,8 +600,6 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - struct bfq_group *bfqg; - struct bfq_entity *entity; - -- lockdep_assert_held(bfqd->queue->queue_lock); -- - bfqg = bfq_find_set_group(bfqd, blkcg); - - if (unlikely(!bfqg)) -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 756a618d5902..c963d92a32c2 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -81,7 +81,13 @@ - #include - #include - #include -+#include -+#include -+ - #include "blk.h" -+#include "blk-mq.h" -+#include "blk-mq-tag.h" -+#include "blk-mq-sched.h" - #undef CONFIG_BFQ_GROUP_IOSCHED /* cgroups support not yet functional */ - #include "bfq-mq.h" - -@@ -193,8 +199,6 @@ static int device_speed_thresh[2]; - #define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) - #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) - --static void bfq_schedule_dispatch(struct bfq_data *bfqd); -- - /** - * icq_to_bic - convert iocontext queue structure to bfq_io_cq. - * @icq: the iocontext queue. -@@ -216,11 +220,12 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - struct request_queue *q) - { - if (ioc) { -+ unsigned long flags; - struct bfq_io_cq *icq; - -- spin_lock_irq(q->queue_lock); -+ spin_lock_irqsave(q->queue_lock, flags); - icq = icq_to_bic(ioc_lookup_icq(ioc, q)); -- spin_unlock_irq(q->queue_lock); -+ spin_unlock_irqrestore(q->queue_lock, flags); - - return icq; - } -@@ -244,7 +249,7 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd) - { - if (bfqd->queued != 0) { - bfq_log(bfqd, "schedule dispatch"); -- kblockd_schedule_work(&bfqd->unplug_work); -+ blk_mq_run_hw_queues(bfqd->queue, true); - } - } - -@@ -768,9 +773,7 @@ static int bfqq_process_refs(struct bfq_queue *bfqq) - { - int process_refs, io_refs; - -- lockdep_assert_held(bfqq->bfqd->queue->queue_lock); -- -- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; -+ io_refs = bfqq->allocated; - process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; - BUG_ON(process_refs < 0); - return process_refs; -@@ -1584,6 +1587,7 @@ static sector_t get_sdist(sector_t last_pos, struct request *rq) - return sdist; - } - -+#if 0 /* Still not clear if we can do without next two functions */ - static void bfq_activate_request(struct request_queue *q, struct request *rq) - { - struct bfq_data *bfqd = q->elevator->elevator_data; -@@ -1597,8 +1601,10 @@ static void bfq_deactivate_request(struct request_queue *q, struct request *rq) - BUG_ON(bfqd->rq_in_driver == 0); - bfqd->rq_in_driver--; - } -+#endif - --static void bfq_remove_request(struct request *rq) -+static void bfq_remove_request(struct request_queue *q, -+ struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; -@@ -1619,6 +1625,10 @@ static void bfq_remove_request(struct request *rq) - bfqd->queued--; - 
elv_rb_del(&bfqq->sort_list, rq); - -+ elv_rqhash_del(q, rq); -+ if (q->last_merge == rq) -+ q->last_merge = NULL; -+ - if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - bfqq->next_rq = NULL; - -@@ -1659,13 +1669,36 @@ static void bfq_remove_request(struct request *rq) - bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); - } - --static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, -- struct bio *bio) -+static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) -+{ -+ struct request_queue *q = hctx->queue; -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct request *free = NULL; -+ bool ret; -+ -+ spin_lock_irq(&bfqd->lock); -+ ret = blk_mq_sched_try_merge(q, bio, &free); -+ -+ /* -+ * XXX Not yet freeing without lock held, to avoid an -+ * inconsistency with respect to the lock-protected invocation -+ * of blk_mq_sched_try_insert_merge in bfq_bio_merge. Waiting -+ * for clarifications from Jens. -+ */ -+ if (free) -+ blk_mq_free_request(free); -+ spin_unlock_irq(&bfqd->lock); -+ -+ return ret; -+} -+ -+static int bfq_request_merge(struct request_queue *q, struct request **req, -+ struct bio *bio) - { - struct bfq_data *bfqd = q->elevator->elevator_data; - struct request *__rq; - -- __rq = bfq_find_rq_fmerge(bfqd, bio); -+ __rq = bfq_find_rq_fmerge(bfqd, bio, q); - if (__rq && elv_bio_merge_ok(__rq, bio)) { - *req = __rq; - return ELEVATOR_FRONT_MERGE; -@@ -1674,7 +1707,7 @@ static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, - return ELEVATOR_NO_MERGE; - } - --static void bfq_merged_request(struct request_queue *q, struct request *req, -+static void bfq_request_merged(struct request_queue *q, struct request *req, - enum elv_merge type) - { - if (type == ELEVATOR_FRONT_MERGE && -@@ -1689,6 +1722,8 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, - /* Reposition request in its sort_list */ - elv_rb_del(&bfqq->sort_list, req); - elv_rb_add(&bfqq->sort_list, req); -+ -+ spin_lock_irq(&bfqd->lock); - /* Choose next request to be served for bfqq */ - prev = bfqq->next_rq; - next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, -@@ -1704,22 +1739,19 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, - bfq_updated_next_req(bfqd, bfqq); - bfq_pos_tree_add_move(bfqd, bfqq); - } -+ spin_unlock_irq(&bfqd->lock); - } - } - --#ifdef BFQ_GROUP_IOSCHED_ENABLED --static void bfq_bio_merged(struct request_queue *q, struct request *req, -- struct bio *bio) --{ -- bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); --} --#endif -- --static void bfq_merged_requests(struct request_queue *q, struct request *rq, -+static void bfq_requests_merged(struct request_queue *q, struct request *rq, - struct request *next) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); - -+ if (!RB_EMPTY_NODE(&rq->rb_node)) -+ goto end; -+ spin_lock_irq(&bfqq->bfqd->lock); -+ - /* - * If next and rq belong to the same bfq_queue and next is older - * than rq, then reposition rq in the fifo (by substituting next -@@ -1740,7 +1772,10 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq, - if (bfqq->next_rq == next) - bfqq->next_rq = rq; - -- bfq_remove_request(next); -+ bfq_remove_request(q, next); -+ -+ spin_unlock_irq(&bfqq->bfqd->lock); -+end: - bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); - } - -@@ -1786,7 +1821,7 @@ static void bfq_end_wr(struct bfq_data *bfqd) - { - struct bfq_queue *bfqq; - -- 
spin_lock_irq(bfqd->queue->queue_lock); -+ spin_lock_irq(&bfqd->lock); - - list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) - bfq_bfqq_end_wr(bfqq); -@@ -1794,7 +1829,7 @@ static void bfq_end_wr(struct bfq_data *bfqd) - bfq_bfqq_end_wr(bfqq); - bfq_end_wr_async(bfqd); - -- spin_unlock_irq(bfqd->queue->queue_lock); -+ spin_unlock_irq(&bfqd->lock); - } - - static sector_t bfq_io_struct_pos(void *io_struct, bool request) -@@ -2184,8 +2219,8 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - bfq_put_queue(bfqq); - } - --static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, -- struct bio *bio) -+static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, -+ struct bio *bio) - { - struct bfq_data *bfqd = q->elevator->elevator_data; - bool is_sync = op_is_sync(bio->bi_opf); -@@ -2203,7 +2238,7 @@ static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - * merge only if rq is queued there. - * Queue lock is held here. - */ -- bic = bfq_bic_lookup(bfqd, current->io_context); -+ bic = bfq_bic_lookup(bfqd, current->io_context, q); - if (!bic) - return false; - -@@ -2228,12 +2263,6 @@ static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - return bfqq == RQ_BFQQ(rq); - } - --static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, -- struct request *next) --{ -- return RQ_BFQQ(rq) == RQ_BFQQ(next); --} -- - /* - * Set the maximum time for the in-service queue to consume its - * budget. This prevents seeky processes from lowering the throughput. -@@ -2264,7 +2293,6 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, - { - if (bfqq) { - bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); -- bfq_mark_bfqq_must_alloc(bfqq); - bfq_clear_bfqq_fifo_expire(bfqq); - - bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; -@@ -2703,27 +2731,28 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - } - - /* -- * Move request from internal lists to the dispatch list of the request queue -+ * Remove request from internal lists. - */ --static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) -+static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq); - - /* -- * For consistency, the next instruction should have been executed -- * after removing the request from the queue and dispatching it. -- * We execute instead this instruction before bfq_remove_request() -- * (and hence introduce a temporary inconsistency), for efficiency. -- * In fact, in a forced_dispatch, this prevents two counters related -- * to bfqq->dispatched to risk to be uselessly decremented if bfqq -- * is not in service, and then to be incremented again after -- * incrementing bfqq->dispatched. -+ * For consistency, the next instruction should have been -+ * executed after removing the request from the queue and -+ * dispatching it. We execute instead this instruction before -+ * bfq_remove_request() (and hence introduce a temporary -+ * inconsistency), for efficiency. In fact, should this -+ * dispatch occur for a non in-service bfqq, this anticipated -+ * increment prevents two counters related to bfqq->dispatched -+ * from risking to be, first, uselessly decremented, and then -+ * incremented again when the (new) value of bfqq->dispatched -+ * happens to be taken into account. 
- */ - bfqq->dispatched++; - bfq_update_peak_rate(q->elevator->elevator_data, rq); - -- bfq_remove_request(rq); -- elv_dispatch_sort(q, rq); -+ bfq_remove_request(q, rq); - } - - static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) -@@ -3605,7 +3634,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); - - if (bfq_may_expire_for_budg_timeout(bfqq) && -- !hrtimer_active(&bfqd->idle_slice_timer) && -+ !bfq_bfqq_wait_request(bfqq) && - !bfq_bfqq_must_idle(bfqq)) - goto expire; - -@@ -3641,7 +3670,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - * arrives. - */ - if (bfq_bfqq_wait_request(bfqq)) { -- BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); - /* - * If we get here: 1) at least a new request - * has arrived but we have not disabled the -@@ -3668,7 +3696,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - * for a new request, or has requests waiting for a completion and - * may idle after their completion, then keep it anyway. - */ -- if (hrtimer_active(&bfqd->idle_slice_timer) || -+ if (bfq_bfqq_wait_request(bfqq) || - (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { - bfqq = NULL; - goto keep_queue; -@@ -3753,13 +3781,11 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - } - - /* -- * Dispatch one request from bfqq, moving it to the request queue -- * dispatch list. -+ * Dispatch next request from bfqq. - */ --static int bfq_dispatch_request(struct bfq_data *bfqd, -- struct bfq_queue *bfqq) -+static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) - { -- int dispatched = 0; - struct request *rq = bfqq->next_rq; - unsigned long service_to_charge; - -@@ -3775,7 +3801,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, - - BUG_ON(bfqq->entity.budget < bfqq->entity.service); - -- bfq_dispatch_insert(bfqd->queue, rq); -+ bfq_dispatch_remove(bfqd->queue, rq); - - /* - * If weight raising has to terminate for bfqq, then next -@@ -3791,86 +3817,61 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, - bfq_update_wr_data(bfqd, bfqq); - - bfq_log_bfqq(bfqd, bfqq, -- "dispatched %u sec req (%llu), budg left %d", -+ "dispatched %u sec req (%llu), budg left %d, new disp_nr %d", - blk_rq_sectors(rq), - (unsigned long long) blk_rq_pos(rq), -- bfq_bfqq_budget_left(bfqq)); -- -- dispatched++; -+ bfq_bfqq_budget_left(bfqq), -+ bfqq->dispatched); - - if (!bfqd->in_service_bic) { - atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); - bfqd->in_service_bic = RQ_BIC(rq); - } - -+ /* -+ * Expire bfqq, pretending that its budget expired, if bfqq -+ * belongs to CLASS_IDLE and other queues are waiting for -+ * service. -+ */ - if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) - goto expire; - -- return dispatched; -+ return rq; - - expire: - bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); -- return dispatched; --} -- --static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) --{ -- int dispatched = 0; -- -- while (bfqq->next_rq) { -- bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); -- dispatched++; -- } -- -- BUG_ON(!list_empty(&bfqq->fifo)); -- return dispatched; -+ return rq; - } - --/* -- * Drain our current requests. -- * Used for barriers and when switching io schedulers on-the-fly. 
-- */ --static int bfq_forced_dispatch(struct bfq_data *bfqd) -+static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) - { -- struct bfq_queue *bfqq, *n; -- struct bfq_service_tree *st; -- int dispatched = 0; -- -- bfqq = bfqd->in_service_queue; -- if (bfqq) -- __bfq_bfqq_expire(bfqd, bfqq); -+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - - /* -- * Loop through classes, and be careful to leave the scheduler -- * in a consistent state, as feedback mechanisms and vtime -- * updates cannot be disabled during the process. -+ * Avoiding lock: a race on bfqd->busy_queues should cause at -+ * most a call to dispatch for nothing - */ -- list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { -- st = bfq_entity_service_tree(&bfqq->entity); -- -- dispatched += __bfq_forced_dispatch_bfqq(bfqq); -- -- bfqq->max_budget = bfq_max_budget(bfqd); -- bfq_forget_idle(st); -- } -- -- BUG_ON(bfqd->busy_queues != 0); -- -- return dispatched; -+ return !list_empty_careful(&bfqd->dispatch) || -+ bfqd->busy_queues > 0; - } - --static int bfq_dispatch_requests(struct request_queue *q, int force) -+static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - { -- struct bfq_data *bfqd = q->elevator->elevator_data; -- struct bfq_queue *bfqq; -+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -+ struct request *rq = NULL; -+ struct bfq_queue *bfqq = NULL; -+ -+ if (!list_empty(&bfqd->dispatch)) { -+ rq = list_first_entry(&bfqd->dispatch, struct request, -+ queuelist); -+ list_del_init(&rq->queuelist); -+ goto exit; -+ } - - bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); - - if (bfqd->busy_queues == 0) -- return 0; -- -- if (unlikely(force)) -- return bfq_forced_dispatch(bfqd); -+ goto exit; - - /* - * Force device to serve one request at a time if -@@ -3885,25 +3886,39 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) - * throughput. - */ - if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) -- return 0; -+ goto exit; - - bfqq = bfq_select_queue(bfqd); - if (!bfqq) -- return 0; -+ goto exit; - - BUG_ON(bfqq->entity.budget < bfqq->entity.service); - - BUG_ON(bfq_bfqq_wait_request(bfqq)); - -- if (!bfq_dispatch_request(bfqd, bfqq)) -- return 0; -- -- bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", -- bfq_bfqq_sync(bfqq) ? 
"sync" : "async"); -+ rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq); - - BUG_ON(bfqq->next_rq == NULL && - bfqq->entity.budget < bfqq->entity.service); -- return 1; -+exit: -+ if (rq) { -+ rq->rq_flags |= RQF_STARTED; -+ bfqd->rq_in_driver++; -+ } -+ -+ return rq; -+} -+ -+static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) -+{ -+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -+ struct request *rq; -+ -+ spin_lock_irq(&bfqd->lock); -+ rq = __bfq_dispatch_request(hctx); -+ spin_unlock_irq(&bfqd->lock); -+ -+ return rq; - } - - /* -@@ -3921,13 +3936,14 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - - BUG_ON(bfqq->ref <= 0); - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); -+ if (bfqq->bfqd) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); -+ - bfqq->ref--; - if (bfqq->ref) - return; - - BUG_ON(rb_first(&bfqq->sort_list)); -- BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); - BUG_ON(bfqq->entity.tree); - BUG_ON(bfq_bfqq_busy(bfqq)); - -@@ -3942,7 +3958,8 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - */ - hlist_del_init(&bfqq->burst_list_node); - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); -+ if (bfqq->bfqd) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); - - kmem_cache_free(bfq_pool, bfqq); - #ifdef BFQ_GROUP_IOSCHED_ENABLED -@@ -3983,29 +4000,52 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_put_queue(bfqq); /* release process reference */ - } - --static void bfq_exit_icq(struct io_cq *icq) -+static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) - { -- struct bfq_io_cq *bic = icq_to_bic(icq); -- struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); -+ struct bfq_data *bfqd; - -- if (bic_to_bfqq(bic, false)) { -- bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); -- bic_set_bfqq(bic, NULL, false); -- } -+ if (bfqq) -+ bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ - -- if (bic_to_bfqq(bic, true)) { -+ if (bfqq && bfqd) { -+ spin_lock_irq(&bfqd->lock); - /* - * If the bic is using a shared queue, put the reference - * taken on the io_context when the bic started using a - * shared bfq_queue. - */ -- if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) -- put_io_context(icq->ioc); -- bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); -- bic_set_bfqq(bic, NULL, true); -+ if (is_sync && bfq_bfqq_coop(bfqq)) -+ put_io_context(bic->icq.ioc); -+ bfq_exit_bfqq(bfqd, bfqq); -+ bic_set_bfqq(bic, NULL, is_sync); -+ spin_unlock_irq(&bfqd->lock); - } - } - -+static void bfq_exit_icq_body(struct work_struct *work) -+{ -+ struct bfq_io_cq *bic = -+ container_of(work, struct bfq_io_cq, exit_icq_work); -+ -+ bfq_exit_icq_bfqq(bic, true); -+ bfq_exit_icq_bfqq(bic, false); -+} -+ -+static void bfq_init_icq(struct io_cq *icq) -+{ -+ struct bfq_io_cq *bic = icq_to_bic(icq); -+ -+ INIT_WORK(&bic->exit_icq_work, bfq_exit_icq_body); -+} -+ -+static void bfq_exit_icq(struct io_cq *icq) -+{ -+ struct bfq_io_cq *bic = icq_to_bic(icq); -+ -+ kblockd_schedule_work(&bic->exit_icq_work); -+} -+ - /* - * Update the entity prio values; note that the new values will not - * be used until the next (re)activation. 
-@@ -4015,6 +4055,10 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, - { - struct task_struct *tsk = current; - int ioprio_class; -+ struct bfq_data *bfqd = bfqq->bfqd; -+ -+ if (!bfqd) -+ return; - - ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); - switch (ioprio_class) { -@@ -4095,6 +4139,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - INIT_HLIST_NODE(&bfqq->burst_list_node); - BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); - -+ spin_lock_init(&bfqq->lock); -+ - bfqq->ref = 0; - bfqq->bfqd = bfqd; - -@@ -4351,22 +4397,13 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - if (budget_timeout) - bfq_bfqq_expire(bfqd, bfqq, false, - BFQ_BFQQ_BUDGET_TIMEOUT); -- -- /* -- * Let the request rip immediately, or let a new queue be -- * selected if bfqq has just been expired. -- */ -- __blk_run_queue(bfqd->queue); - } - } - --static void bfq_insert_request(struct request_queue *q, struct request *rq) -+static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - { -- struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; - -- assert_spin_locked(bfqd->queue->queue_lock); -- - /* - * An unplug may trigger a requeue of a request from the device - * driver: make sure we are in process context while trying to -@@ -4381,8 +4418,8 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) - * Release the request's reference to the old bfqq - * and make sure one is taken to the shared queue. - */ -- new_bfqq->allocated[rq_data_dir(rq)]++; -- bfqq->allocated[rq_data_dir(rq)]--; -+ new_bfqq->allocated++; -+ bfqq->allocated--; - new_bfqq->ref++; - bfq_clear_bfqq_just_created(bfqq); - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) -@@ -4406,6 +4443,55 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) - bfq_rq_enqueued(bfqd, bfqq, rq); - } - -+static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, -+ bool at_head) -+{ -+ struct request_queue *q = hctx->queue; -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ -+ spin_lock_irq(&bfqd->lock); -+ if (blk_mq_sched_try_insert_merge(q, rq)) -+ goto done; -+ spin_unlock_irq(&bfqd->lock); -+ -+ blk_mq_sched_request_inserted(rq); -+ -+ spin_lock_irq(&bfqd->lock); -+ if (at_head || blk_rq_is_passthrough(rq)) { -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ if (at_head) -+ list_add(&rq->queuelist, &bfqd->dispatch); -+ else -+ list_add_tail(&rq->queuelist, &bfqd->dispatch); -+ -+ if (bfqq) -+ bfqq->dispatched++; -+ } else { -+ __bfq_insert_request(bfqd, rq); -+ -+ if (rq_mergeable(rq)) { -+ elv_rqhash_add(q, rq); -+ if (!q->last_merge) -+ q->last_merge = rq; -+ } -+ } -+done: -+ spin_unlock_irq(&bfqd->lock); -+} -+ -+static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, -+ struct list_head *list, bool at_head) -+{ -+ while (!list_empty(list)) { -+ struct request *rq; -+ -+ rq = list_first_entry(list, struct request, queuelist); -+ list_del_init(&rq->queuelist); -+ bfq_insert_request(hctx, rq, at_head); -+ } -+} -+ - static void bfq_update_hw_tag(struct bfq_data *bfqd) - { - bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, -@@ -4431,27 +4517,17 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) - bfqd->hw_tag_samples = 0; - } - --static void bfq_completed_request(struct request_queue *q, struct request *rq) -+static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - { -- struct bfq_queue *bfqq = RQ_BFQQ(rq); -- 
struct bfq_data *bfqd = bfqq->bfqd; - u64 now_ns; - u32 delta_us; - -- bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", -- blk_rq_sectors(rq)); -- -- assert_spin_locked(bfqd->queue->queue_lock); - bfq_update_hw_tag(bfqd); - - BUG_ON(!bfqd->rq_in_driver); - BUG_ON(!bfqq->dispatched); - bfqd->rq_in_driver--; - bfqq->dispatched--; -- bfqg_stats_update_completion(bfqq_group(bfqq), -- rq_start_time_ns(rq), -- rq_io_start_time_ns(rq), -- rq->cmd_flags); - - if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -@@ -4477,7 +4553,8 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) - */ - delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); - -- bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", -+ bfq_log_bfqq(bfqd, bfqq, -+ "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", - delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, - (USEC_PER_SEC* - (u64)((bfqd->last_rq_max_size<in_service_queue == bfqq) { - if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { - bfq_arm_slice_timer(bfqd); -- goto out; -+ return; - } else if (bfq_may_expire_for_budg_timeout(bfqq)) - bfq_bfqq_expire(bfqd, bfqq, false, - BFQ_BFQQ_BUDGET_TIMEOUT); -@@ -4537,68 +4614,55 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) - bfq_bfqq_expire(bfqd, bfqq, false, - BFQ_BFQQ_NO_MORE_REQUESTS); - } -- -- if (!bfqd->rq_in_driver) -- bfq_schedule_dispatch(bfqd); -- --out: -- return; - } - --static int __bfq_may_queue(struct bfq_queue *bfqq) -+static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) - { -- if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { -- bfq_clear_bfqq_must_alloc(bfqq); -- return ELV_MQUEUE_MUST; -- } -+ bfqq->allocated--; - -- return ELV_MQUEUE_MAY; -+ bfq_put_queue(bfqq); - } - --static int bfq_may_queue(struct request_queue *q, unsigned int op) -+static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - { -- struct bfq_data *bfqd = q->elevator->elevator_data; -- struct task_struct *tsk = current; -- struct bfq_io_cq *bic; -- struct bfq_queue *bfqq; -- -- /* -- * Don't force setup of a queue from here, as a call to may_queue -- * does not necessarily imply that a request actually will be -- * queued. So just lookup a possibly existing queue, or return -- * 'may queue' if that fails. -- */ -- bic = bfq_bic_lookup(bfqd, tsk->io_context); -- if (!bic) -- return ELV_MQUEUE_MAY; -- -- bfqq = bic_to_bfqq(bic, op_is_sync(op)); -- if (bfqq) -- return __bfq_may_queue(bfqq); -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; - -- return ELV_MQUEUE_MAY; --} -+ if (rq->rq_flags & RQF_STARTED) -+ bfqg_stats_update_completion(bfqq_group(bfqq), -+ rq_start_time_ns(rq), -+ rq_io_start_time_ns(rq), -+ rq->cmd_flags); - --/* -- * Queue lock held here. -- */ --static void bfq_put_request(struct request *rq) --{ -- struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ if (likely(rq->rq_flags & RQF_STARTED)) { -+ unsigned long flags; - -- if (bfqq) { -- const int rw = rq_data_dir(rq); -+ spin_lock_irqsave(&bfqd->lock, flags); - -- BUG_ON(!bfqq->allocated[rw]); -- bfqq->allocated[rw]--; -+ bfq_completed_request(bfqq, bfqd); -+ bfq_put_rq_priv_body(bfqq); - -- rq->elv.priv[0] = NULL; -- rq->elv.priv[1] = NULL; -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ } else { -+ /* -+ * Request rq may be still/already in the scheduler, -+ * in which case we need to remove it. 
And we cannot -+ * defer such a check and removal, to avoid -+ * inconsistencies in the time interval from the end -+ * of this function to the start of the deferred work. -+ * Fortunately, this situation occurs only in process -+ * context, so taking the scheduler lock does not -+ * cause any deadlock, even if other locks are already -+ * (correctly) held by this process. -+ */ - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", -- bfqq, bfqq->ref); -- bfq_put_queue(bfqq); -+ if (!RB_EMPTY_NODE(&rq->rb_node)) -+ bfq_remove_request(q, rq); -+ bfq_put_rq_priv_body(bfqq); - } -+ -+ rq->elv.priv[0] = NULL; -+ rq->elv.priv[1] = NULL; - } - - /* -@@ -4630,18 +4694,16 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) - /* - * Allocate bfq data structures associated with this request. - */ --static int bfq_set_request(struct request_queue *q, struct request *rq, -- struct bio *bio, gfp_t gfp_mask) -+static int bfq_get_rq_private(struct request_queue *q, struct request *rq, -+ struct bio *bio) - { - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); -- const int rw = rq_data_dir(rq); - const int is_sync = rq_is_sync(rq); - struct bfq_queue *bfqq; -- unsigned long flags; - bool bfqq_already_existing = false, split = false; - -- spin_lock_irqsave(q->queue_lock, flags); -+ spin_lock_irq(&bfqd->lock); - - if (!bic) - goto queue_fail; -@@ -4661,7 +4723,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - bic_set_bfqq(bic, bfqq, is_sync); - if (split && is_sync) { - bfq_log_bfqq(bfqd, bfqq, -- "set_request: was_in_list %d " -+ "get_request: was_in_list %d " - "was_in_large_burst %d " - "large burst in progress %d", - bic->was_in_burst_list, -@@ -4671,12 +4733,12 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - if ((bic->was_in_burst_list && bfqd->large_burst) || - bic->saved_in_large_burst) { - bfq_log_bfqq(bfqd, bfqq, -- "set_request: marking in " -+ "get_request: marking in " - "large burst"); - bfq_mark_bfqq_in_large_burst(bfqq); - } else { - bfq_log_bfqq(bfqd, bfqq, -- "set_request: clearing in " -+ "get_request: clearing in " - "large burst"); - bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) -@@ -4703,9 +4765,12 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - } - } - -- bfqq->allocated[rw]++; -+ bfqq->allocated++; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "get_request: new allocated %d", bfqq->allocated); -+ - bfqq->ref++; -- bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "get_request: bfqq %p, %d", bfqq, bfqq->ref); - - rq->elv.priv[0] = bic; - rq->elv.priv[1] = bfqq; -@@ -4733,26 +4798,53 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - if (unlikely(bfq_bfqq_just_created(bfqq))) - bfq_handle_burst(bfqd, bfqq); - -- spin_unlock_irqrestore(q->queue_lock, flags); -+ spin_unlock_irq(&bfqd->lock); - - return 0; - - queue_fail: -- bfq_schedule_dispatch(bfqd); -- spin_unlock_irqrestore(q->queue_lock, flags); -+ spin_unlock_irq(&bfqd->lock); - - return 1; - } - --static void bfq_kick_queue(struct work_struct *work) -+static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) - { -- struct bfq_data *bfqd = -- container_of(work, struct bfq_data, unplug_work); -- struct request_queue *q = bfqd->queue; -+ struct bfq_data *bfqd = bfqq->bfqd; -+ enum bfqq_expiration reason; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&bfqd->lock, flags); -+ 
bfq_clear_bfqq_wait_request(bfqq); - -- spin_lock_irq(q->queue_lock); -- __blk_run_queue(q); -- spin_unlock_irq(q->queue_lock); -+ if (bfqq != bfqd->in_service_queue) { -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ return; -+ } -+ -+ if (bfq_bfqq_budget_timeout(bfqq)) -+ /* -+ * Also here the queue can be safely expired -+ * for budget timeout without wasting -+ * guarantees -+ */ -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) -+ /* -+ * The queue may not be empty upon timer expiration, -+ * because we may not disable the timer when the -+ * first request of the in-service queue arrives -+ * during disk idling. -+ */ -+ reason = BFQ_BFQQ_TOO_IDLE; -+ else -+ goto schedule_dispatch; -+ -+ bfq_bfqq_expire(bfqd, bfqq, true, reason); -+ -+schedule_dispatch: -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ bfq_schedule_dispatch(bfqd); - } - - /* -@@ -4763,59 +4855,22 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) - { - struct bfq_data *bfqd = container_of(timer, struct bfq_data, - idle_slice_timer); -- struct bfq_queue *bfqq; -- unsigned long flags; -- enum bfqq_expiration reason; -- -- spin_lock_irqsave(bfqd->queue->queue_lock, flags); -+ struct bfq_queue *bfqq = bfqd->in_service_queue; - -- bfqq = bfqd->in_service_queue; - /* - * Theoretical race here: the in-service queue can be NULL or -- * different from the queue that was idling if the timer handler -- * spins on the queue_lock and a new request arrives for the -- * current queue and there is a full dispatch cycle that changes -- * the in-service queue. This can hardly happen, but in the worst -- * case we just expire a queue too early. -+ * different from the queue that was idling if a new request -+ * arrives for the current queue and there is a full dispatch -+ * cycle that changes the in-service queue. This can hardly -+ * happen, but in the worst case we just expire a queue too -+ * early. - */ -- if (bfqq) { -- bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); -- bfq_clear_bfqq_wait_request(bfqq); -- -- if (bfq_bfqq_budget_timeout(bfqq)) -- /* -- * Also here the queue can be safely expired -- * for budget timeout without wasting -- * guarantees -- */ -- reason = BFQ_BFQQ_BUDGET_TIMEOUT; -- else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) -- /* -- * The queue may not be empty upon timer expiration, -- * because we may not disable the timer when the -- * first request of the in-service queue arrives -- * during disk idling. 
-- */ -- reason = BFQ_BFQQ_TOO_IDLE; -- else -- goto schedule_dispatch; -- -- bfq_bfqq_expire(bfqd, bfqq, true, reason); -- } -- --schedule_dispatch: -- bfq_schedule_dispatch(bfqd); -+ if (bfqq) -+ bfq_idle_slice_timer_body(bfqq); - -- spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); - return HRTIMER_NORESTART; - } - --static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) --{ -- hrtimer_cancel(&bfqd->idle_slice_timer); -- cancel_work_sync(&bfqd->unplug_work); --} -- - static void __bfq_put_async_bfqq(struct bfq_data *bfqd, - struct bfq_queue **bfqq_ptr) - { -@@ -4852,28 +4907,40 @@ static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) - static void bfq_exit_queue(struct elevator_queue *e) - { - struct bfq_data *bfqd = e->elevator_data; -- struct request_queue *q = bfqd->queue; - struct bfq_queue *bfqq, *n; - -- bfq_shutdown_timer_wq(bfqd); -- -- spin_lock_irq(q->queue_lock); -+ hrtimer_cancel(&bfqd->idle_slice_timer); - - BUG_ON(bfqd->in_service_queue); -- list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) -- bfq_deactivate_bfqq(bfqd, bfqq, false, false); - -- spin_unlock_irq(q->queue_lock); -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { -+ if (bfqq->bic) /* bfqqs without bic are handled below */ -+ cancel_work_sync(&bfqq->bic->exit_icq_work); -+ } -+ -+ spin_lock_irq(&bfqd->lock); -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { -+ bfq_deactivate_bfqq(bfqd, bfqq, false, false); -+ /* -+ * Make sure that deferred exit_icq_work completes -+ * without errors for bfq_queues without bic -+ */ -+ if (!bfqq->bic) -+ bfqq->bfqd = NULL; -+ } -+ spin_unlock_irq(&bfqd->lock); - -- bfq_shutdown_timer_wq(bfqd); -+ hrtimer_cancel(&bfqd->idle_slice_timer); - - BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - - #ifdef BFQ_GROUP_IOSCHED_ENABLED -- blkcg_deactivate_policy(q, &blkcg_policy_bfq); -+ blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); - #else -+ spin_lock_irq(&bfqd->lock); - bfq_put_async_queues(bfqd, bfqd->root_group); - kfree(bfqd->root_group); -+ spin_unlock_irq(&bfqd->lock); - #endif - - kfree(bfqd); -@@ -4934,10 +5001,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - - bfqd->queue = q; - -- spin_lock_irq(q->queue_lock); -- q->elevator = eq; -- spin_unlock_irq(q->queue_lock); -- - bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); - if (!bfqd->root_group) - goto out_free; -@@ -4951,8 +5014,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - bfqd->queue_weights_tree = RB_ROOT; - bfqd->group_weights_tree = RB_ROOT; - -- INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); -- - INIT_LIST_HEAD(&bfqd->active_list); - INIT_LIST_HEAD(&bfqd->idle_list); - INIT_HLIST_HEAD(&bfqd->burst_list); -@@ -5001,6 +5062,11 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; - bfqd->device_speed = BFQ_BFQD_FAST; - -+ spin_lock_init(&bfqd->lock); -+ INIT_LIST_HEAD(&bfqd->dispatch); -+ -+ q->elevator = eq; -+ - return 0; - - out_free: -@@ -5057,7 +5123,7 @@ static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) - num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", - bfqd->queued); - -- spin_lock_irq(bfqd->queue->queue_lock); -+ spin_lock_irq(&bfqd->lock); - - num_char += sprintf(page + num_char, "Active:\n"); - list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { -@@ -5086,7 +5152,7 @@ static ssize_t bfq_weights_show(struct 
elevator_queue *e, char *page) - jiffies_to_msecs(bfqq->wr_cur_max_time)); - } - -- spin_unlock_irq(bfqd->queue->queue_lock); -+ spin_unlock_irq(&bfqd->lock); - - return num_char; - } -@@ -5294,35 +5360,31 @@ static struct elv_fs_entry bfq_attrs[] = { - __ATTR_NULL - }; - --static struct elevator_type iosched_bfq = { -- .ops.sq = { -- .elevator_merge_fn = bfq_merge, -- .elevator_merged_fn = bfq_merged_request, -- .elevator_merge_req_fn = bfq_merged_requests, --#ifdef BFQ_GROUP_IOSCHED_ENABLED -- .elevator_bio_merged_fn = bfq_bio_merged, --#endif -- .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, -- .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, -- .elevator_dispatch_fn = bfq_dispatch_requests, -- .elevator_add_req_fn = bfq_insert_request, -- .elevator_activate_req_fn = bfq_activate_request, -- .elevator_deactivate_req_fn = bfq_deactivate_request, -- .elevator_completed_req_fn = bfq_completed_request, -- .elevator_former_req_fn = elv_rb_former_request, -- .elevator_latter_req_fn = elv_rb_latter_request, -- .elevator_init_icq_fn = bfq_init_icq, -- .elevator_exit_icq_fn = bfq_exit_icq, -- .elevator_set_req_fn = bfq_set_request, -- .elevator_put_req_fn = bfq_put_request, -- .elevator_may_queue_fn = bfq_may_queue, -- .elevator_init_fn = bfq_init_queue, -- .elevator_exit_fn = bfq_exit_queue, -+static struct elevator_type iosched_bfq_mq = { -+ .ops.mq = { -+ .get_rq_priv = bfq_get_rq_private, -+ .put_rq_priv = bfq_put_rq_private, -+ .init_icq = bfq_init_icq, -+ .exit_icq = bfq_exit_icq, -+ .insert_requests = bfq_insert_requests, -+ .dispatch_request = bfq_dispatch_request, -+ .next_request = elv_rb_latter_request, -+ .former_request = elv_rb_former_request, -+ .allow_merge = bfq_allow_bio_merge, -+ .bio_merge = bfq_bio_merge, -+ .request_merge = bfq_request_merge, -+ .requests_merged = bfq_requests_merged, -+ .request_merged = bfq_request_merged, -+ .has_work = bfq_has_work, -+ .init_sched = bfq_init_queue, -+ .exit_sched = bfq_exit_queue, - }, -+ -+ .uses_mq = true, - .icq_size = sizeof(struct bfq_io_cq), - .icq_align = __alignof__(struct bfq_io_cq), - .elevator_attrs = bfq_attrs, -- .elevator_name = "bfq-sq", -+ .elevator_name = "bfq-mq", - .elevator_owner = THIS_MODULE, - }; - -@@ -5392,7 +5454,7 @@ static int __init bfq_init(void) - device_speed_thresh[0] = (4 * R_slow[0]) / 3; - device_speed_thresh[1] = (4 * R_slow[1]) / 3; - -- ret = elv_register(&iosched_bfq); -+ ret = elv_register(&iosched_bfq_mq); - if (ret) - goto err_pol_unreg; - -@@ -5412,8 +5474,8 @@ static int __init bfq_init(void) - - static void __exit bfq_exit(void) - { -- elv_unregister(&iosched_bfq); --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ elv_unregister(&iosched_bfq_mq); -+#ifdef CONFIG_BFQ_GROUP_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); - #endif - bfq_slab_kill(); -@@ -5422,5 +5484,6 @@ static void __exit bfq_exit(void) - module_init(bfq_init); - module_exit(bfq_exit); - --MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); -+MODULE_AUTHOR("Paolo Valente"); - MODULE_LICENSE("GPL"); -+MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler"); -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 0f51f270469c..c3fcd5ebd735 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -19,15 +19,8 @@ - #include - #include - --/* -- * Define an alternative macro to compile cgroups support. This is one -- * of the steps needed to let bfq-mq share the files bfq-sched.c and -- * bfq-cgroup.c with bfq-sq. 
For bfq-mq, the macro -- * BFQ_GROUP_IOSCHED_ENABLED will be defined as a function of whether -- * the configuration option CONFIG_BFQ_MQ_GROUP_IOSCHED, and not -- * CONFIG_BFQ_GROUP_IOSCHED, is defined. -- */ --#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED -+/* see comments on CONFIG_BFQ_GROUP_IOSCHED in bfq.h */ -+#ifdef CONFIG_BFQ_MQ_GROUP_IOSCHED - #define BFQ_GROUP_IOSCHED_ENABLED - #endif - -@@ -259,8 +252,8 @@ struct bfq_queue { - struct request *next_rq; - /* number of sync and async requests queued */ - int queued[2]; -- /* number of sync and async requests currently allocated */ -- int allocated[2]; -+ /* number of requests currently allocated */ -+ int allocated; - /* number of pending metadata requests */ - int meta_pending; - /* fifo list of requests in sort_list */ -@@ -345,6 +338,8 @@ struct bfq_queue { - unsigned long wr_start_at_switch_to_srt; - - unsigned long split_time; /* time of last split */ -+ -+ spinlock_t lock; - }; - - /** -@@ -361,6 +356,9 @@ struct bfq_io_cq { - uint64_t blkcg_serial_nr; /* the current blkcg serial */ - #endif - -+ /* delayed work to exec the body of the the exit_icq handler */ -+ struct work_struct exit_icq_work; -+ - /* - * Snapshot of the has_short_time flag before merging; taken - * to remember its value while the queue is merged, so as to -@@ -402,11 +400,13 @@ enum bfq_device_speed { - /** - * struct bfq_data - per-device data structure. - * -- * All the fields are protected by the @queue lock. -+ * All the fields are protected by @lock. - */ - struct bfq_data { -- /* request queue for the device */ -+ /* device request queue */ - struct request_queue *queue; -+ /* dispatch queue */ -+ struct list_head dispatch; - - /* root bfq_group for the device */ - struct bfq_group *root_group; -@@ -460,8 +460,6 @@ struct bfq_data { - * the queue in service. - */ - struct hrtimer idle_slice_timer; -- /* delayed work to restart dispatching on the request queue */ -- struct work_struct unplug_work; - - /* bfq_queue in service */ - struct bfq_queue *in_service_queue; -@@ -612,6 +610,8 @@ struct bfq_data { - - /* fallback dummy bfqq for extreme OOM conditions */ - struct bfq_queue oom_bfqq; -+ -+ spinlock_t lock; - }; - - enum bfqq_state_flags { -@@ -622,7 +622,6 @@ enum bfqq_state_flags { - * waiting for a request - * without idling the device - */ -- BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ - BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ - BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ - BFQ_BFQQ_FLAG_sync, /* synchronous queue */ -@@ -661,7 +660,6 @@ BFQ_BFQQ_FNS(just_created); - BFQ_BFQQ_FNS(busy); - BFQ_BFQQ_FNS(wait_request); - BFQ_BFQQ_FNS(non_blocking_wait_rq); --BFQ_BFQQ_FNS(must_alloc); - BFQ_BFQQ_FNS(fifo_expire); - BFQ_BFQQ_FNS(has_short_ttime); - BFQ_BFQQ_FNS(sync); -@@ -692,7 +690,6 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ - char __pbuf[128]; \ - \ -- assert_spin_locked((bfqd)->queue->queue_lock); \ - blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ - pr_crit("%s bfq%d%c %s " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -@@ -734,7 +731,6 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) 
do { \ - char __pbuf[128]; \ - \ -- assert_spin_locked((bfqd)->queue->queue_lock); \ - blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ - (bfqq)->pid, \ -@@ -961,7 +957,6 @@ static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) - - static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); - static void bfq_put_queue(struct bfq_queue *bfqq); --static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); - static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - struct bio *bio, bool is_sync, - struct bfq_io_cq *bic); - -From bde5235de2241502c1c00337bd51c96d9b60b6df Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Fri, 3 Mar 2017 08:52:40 +0100 -Subject: [PATCH 13/51] Add checks and extra log messages - Part I - -Signed-off-by: Paolo Valente ---- - block/bfq-mq-iosched.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++-- - 1 file changed, 109 insertions(+), 3 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index c963d92a32c2..40eadb3f7073 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -773,6 +773,8 @@ static int bfqq_process_refs(struct bfq_queue *bfqq) - { - int process_refs, io_refs; - -+ lockdep_assert_held(&bfqq->bfqd->lock); -+ - io_refs = bfqq->allocated; - process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; - BUG_ON(process_refs < 0); -@@ -1483,6 +1485,8 @@ static void bfq_add_request(struct request *rq) - bfqq->queued[rq_is_sync(rq)]++; - bfqd->queued++; - -+ BUG_ON(!RQ_BFQQ(rq)); -+ BUG_ON(RQ_BFQQ(rq) != bfqq); - elv_rb_add(&bfqq->sort_list, rq); - - /* -@@ -1491,6 +1495,8 @@ static void bfq_add_request(struct request *rq) - prev = bfqq->next_rq; - next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); - BUG_ON(!next_rq); -+ BUG_ON(!RQ_BFQQ(next_rq)); -+ BUG_ON(RQ_BFQQ(next_rq) != bfqq); - bfqq->next_rq = next_rq; - - /* -@@ -1615,6 +1621,19 @@ static void bfq_remove_request(struct request_queue *q, - - if (bfqq->next_rq == rq) { - bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); -+ if (bfqq->next_rq && !RQ_BFQQ(bfqq->next_rq)) { -+ pr_crit("no bfqq! for next rq %p bfqq %p\n", -+ bfqq->next_rq, bfqq); -+ } -+ -+ BUG_ON(bfqq->next_rq && !RQ_BFQQ(bfqq->next_rq)); -+ if (bfqq->next_rq && RQ_BFQQ(bfqq->next_rq) != bfqq) { -+ pr_crit( -+ "wrong bfqq! 
for next rq %p, rq_bfqq %p bfqq %p\n", -+ bfqq->next_rq, RQ_BFQQ(bfqq->next_rq), bfqq); -+ } -+ BUG_ON(bfqq->next_rq && RQ_BFQQ(bfqq->next_rq) != bfqq); -+ - bfq_updated_next_req(bfqd, bfqq); - } - -@@ -1701,6 +1720,8 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, - __rq = bfq_find_rq_fmerge(bfqd, bio, q); - if (__rq && elv_bio_merge_ok(__rq, bio)) { - *req = __rq; -+ bfq_log(bfqd, "request_merge: req %p", __rq); -+ - return ELEVATOR_FRONT_MERGE; - } - -@@ -1721,6 +1742,8 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - - /* Reposition request in its sort_list */ - elv_rb_del(&bfqq->sort_list, req); -+ BUG_ON(!RQ_BFQQ(req)); -+ BUG_ON(RQ_BFQQ(req) != bfqq); - elv_rb_add(&bfqq->sort_list, req); - - spin_lock_irq(&bfqd->lock); -@@ -1729,7 +1752,13 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, - bfqd->last_position); - BUG_ON(!next_rq); -+ - bfqq->next_rq = next_rq; -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "requests_merged: req %p prev %p next_rq %p bfqq %p", -+ req, prev, next_rq, bfqq); -+ - /* - * If next_rq changes, update both the queue's budget to - * fit the new request and the queue's position in its -@@ -1748,8 +1777,16 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, - { - struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); - -+ BUG_ON(!RQ_BFQQ(rq)); -+ BUG_ON(!RQ_BFQQ(next)); -+ - if (!RB_EMPTY_NODE(&rq->rb_node)) - goto end; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "requests_merged: rq %p next %p bfqq %p next_bfqq %p", -+ rq, next, bfqq, next_bfqq); -+ - spin_lock_irq(&bfqq->bfqd->lock); - - /* -@@ -3847,6 +3884,9 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) - { - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - -+ bfq_log(bfqd, "has_work, dispatch_non_empty %d busy_queues %d", -+ !list_empty_careful(&bfqd->dispatch), bfqd->busy_queues > 0); -+ - /* - * Avoiding lock: a race on bfqd->busy_queues should cause at - * most a call to dispatch for nothing -@@ -3865,6 +3905,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - rq = list_first_entry(&bfqd->dispatch, struct request, - queuelist); - list_del_init(&rq->queuelist); -+ bfq_log(bfqd, -+ "dispatch requests: picked %p from dispatch list", rq); - goto exit; - } - -@@ -3904,7 +3946,20 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - if (rq) { - rq->rq_flags |= RQF_STARTED; - bfqd->rq_in_driver++; -- } -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, -+ "dispatched %s request %p, rq_in_driver %d", -+ bfq_bfqq_sync(bfqq) ? 
"sync" : "async", -+ rq, -+ bfqd->rq_in_driver); -+ else -+ bfq_log(bfqd, -+ "dispatched request %p from dispatch list, rq_in_driver %d", -+ rq, bfqd->rq_in_driver); -+ } else -+ bfq_log(bfqd, -+ "returned NULL request, rq_in_driver %d", -+ bfqd->rq_in_driver); - - return rq; - } -@@ -3944,6 +3999,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - return; - - BUG_ON(rb_first(&bfqq->sort_list)); -+ BUG_ON(bfqq->allocated != 0); - BUG_ON(bfqq->entity.tree); - BUG_ON(bfq_bfqq_busy(bfqq)); - -@@ -4043,6 +4099,7 @@ static void bfq_exit_icq(struct io_cq *icq) - { - struct bfq_io_cq *bic = icq_to_bic(icq); - -+ BUG_ON(!bic); - kblockd_schedule_work(&bic->exit_icq_work); - } - -@@ -4057,6 +4114,7 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, - int ioprio_class; - struct bfq_data *bfqd = bfqq->bfqd; - -+ WARN_ON(!bfqd); - if (!bfqd) - return; - -@@ -4404,6 +4462,10 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; - -+ assert_spin_locked(&bfqd->lock); -+ -+ bfq_log_bfqq(bfqd, bfqq, "__insert_req: rq %p bfqq %p", rq, bfqq); -+ - /* - * An unplug may trigger a requeue of a request from the device - * driver: make sure we are in process context while trying to -@@ -4420,6 +4482,12 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - */ - new_bfqq->allocated++; - bfqq->allocated--; -+ bfq_log_bfqq(bfqd, bfqq, -+ "insert_request: new allocated %d", bfqq->allocated); -+ bfq_log_bfqq(bfqd, new_bfqq, -+ "insert_request: new_bfqq new allocated %d", -+ bfqq->allocated); -+ - new_bfqq->ref++; - bfq_clear_bfqq_just_created(bfqq); - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) -@@ -4529,6 +4597,10 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - bfqd->rq_in_driver--; - bfqq->dispatched--; - -+ bfq_log_bfqq(bfqd, bfqq, -+ "completed_requests: new disp %d, new rq_in_driver %d", -+ bfqq->dispatched, bfqd->rq_in_driver); -+ - if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - /* -@@ -4618,6 +4690,9 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - - static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) - { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "put_request_body: allocated %d", bfqq->allocated); -+ BUG_ON(!bfqq->allocated); - bfqq->allocated--; - - bfq_put_queue(bfqq); -@@ -4625,8 +4700,27 @@ static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) - - static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - { -- struct bfq_queue *bfqq = RQ_BFQQ(rq); -- struct bfq_data *bfqd = bfqq->bfqd; -+ struct bfq_queue *bfqq; -+ struct bfq_data *bfqd; -+ struct bfq_io_cq *bic; -+ -+ BUG_ON(!rq); -+ bfqq = RQ_BFQQ(rq); -+ BUG_ON(!bfqq); -+ -+ bic = RQ_BIC(rq); -+ BUG_ON(!bic); -+ -+ bfqd = bfqq->bfqd; -+ BUG_ON(!bfqd); -+ -+ BUG_ON(rq->rq_flags & RQF_QUEUED); -+ BUG_ON(!(rq->rq_flags & RQF_ELVPRIV)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "putting rq %p with %u sects left, STARTED %d", -+ rq, blk_rq_sectors(rq), -+ rq->rq_flags & RQF_STARTED); - - if (rq->rq_flags & RQF_STARTED) - bfqg_stats_update_completion(bfqq_group(bfqq), -@@ -4634,6 +4728,8 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - rq_io_start_time_ns(rq), - rq->cmd_flags); - -+ BUG_ON(blk_rq_sectors(rq) == 0 && !(rq->rq_flags & RQF_STARTED)); -+ - if (likely(rq->rq_flags & RQF_STARTED)) { - unsigned long flags; - -@@ -4655,7 +4751,9 @@ static void 
bfq_put_rq_private(struct request_queue *q, struct request *rq) - * cause any deadlock, even if other locks are already - * (correctly) held by this process. - */ -+ BUG_ON(in_interrupt()); -+ -+ assert_spin_locked(&bfqd->lock); - if (!RB_EMPTY_NODE(&rq->rb_node)) - bfq_remove_request(q, rq); - bfq_put_rq_priv_body(bfqq); -@@ -4814,7 +4912,9 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) - enum bfqq_expiration reason; - unsigned long flags; - -+ BUG_ON(!bfqd); - spin_lock_irqsave(&bfqd->lock, flags); -+ bfq_log_bfqq(bfqd, bfqq, "handling slice_timer expiration"); - bfq_clear_bfqq_wait_request(bfqq); - - if (bfqq != bfqd->in_service_queue) { -@@ -4857,6 +4957,8 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) - idle_slice_timer); - struct bfq_queue *bfqq = bfqd->in_service_queue; - -+ bfq_log(bfqd, "slice_timer expired"); -+ - /* - * Theoretical race here: the in-service queue can be NULL or - * different from the queue that was idling if a new request - * arrives for the current queue and there is a full dispatch - * cycle that changes the in-service queue. This can hardly - * happen, but in the worst case we just expire a queue too - * early. -@@ -4909,9 +5011,12 @@ static void bfq_exit_queue(struct elevator_queue *e) - struct bfq_data *bfqd = e->elevator_data; - struct bfq_queue *bfqq, *n; - -+ bfq_log(bfqd, "exit_queue: starting ..."); -+ - hrtimer_cancel(&bfqd->idle_slice_timer); - - BUG_ON(bfqd->in_service_queue); -+ BUG_ON(!list_empty(&bfqd->active_list)); - - list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { - if (bfqq->bic) /* bfqqs without bic are handled below */ -@@ -4943,6 +5048,7 @@ static void bfq_exit_queue(struct elevator_queue *e) - spin_unlock_irq(&bfqd->lock); - #endif - -+ bfq_log(bfqd, "exit_queue: finished ..."); - kfree(bfqd); - } - - -From 7f59486861e368d25f59d4136cf8e51a75b7edf9 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Thu, 9 Feb 2017 10:36:27 +0100 -Subject: [PATCH 14/51] Add lock check in bfq_allow_bio_merge - -Signed-off-by: Paolo Valente ---- - block/bfq-mq-iosched.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 40eadb3f7073..21b876aeba16 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -2279,6 +2279,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - if (!bic) - return false; - -+ assert_spin_locked(&bfqd->lock); - bfqq = bic_to_bfqq(bic, is_sync); - /* - * We take advantage of this function to perform an early merge - -From a2dd19a4d95cf401268c144c79ce549c7fc4bbca Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Tue, 7 Feb 2017 15:14:29 +0100 -Subject: [PATCH 15/51] bfq-mq: execute exit_icq operations immediately - -Exploiting Omar's patch that removes the taking of the queue lock in -put_io_context_active, this patch moves the operation of the bfq_exit_icq -hook back from a deferred work item to the body of the function.
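-
-For illustration, the two shapes of the hook, condensed from the hunks
-below (a sketch only; error checking and the surrounding BFQ_MQ glue
-are elided):
-
-  /* Old shape: defer the exit body to a kblockd work item. */
-  static void bfq_exit_icq(struct io_cq *icq)
-  {
-          struct bfq_io_cq *bic = icq_to_bic(icq);
-
-          kblockd_schedule_work(&bic->exit_icq_work);
-  }
-
-  /* New shape: run the body inline; safe once put_io_context_active
-   * no longer takes the queue lock. */
-  static void bfq_exit_icq(struct io_cq *icq)
-  {
-          struct bfq_io_cq *bic = icq_to_bic(icq);
-
-          bfq_exit_icq_bfqq(bic, true);   /* sync bfq_queue */
-          bfq_exit_icq_bfqq(bic, false);  /* async bfq_queue */
-  }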
- -Signed-off-by: Paolo Valente ---- - block/bfq-mq-iosched.c | 34 +++------------------------------- - block/bfq-mq.h | 3 --- - 2 files changed, 3 insertions(+), 34 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 21b876aeba16..1deb79a47181 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4080,28 +4080,13 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) - } - } - --static void bfq_exit_icq_body(struct work_struct *work) --{ -- struct bfq_io_cq *bic = -- container_of(work, struct bfq_io_cq, exit_icq_work); -- -- bfq_exit_icq_bfqq(bic, true); -- bfq_exit_icq_bfqq(bic, false); --} -- --static void bfq_init_icq(struct io_cq *icq) --{ -- struct bfq_io_cq *bic = icq_to_bic(icq); -- -- INIT_WORK(&bic->exit_icq_work, bfq_exit_icq_body); --} -- - static void bfq_exit_icq(struct io_cq *icq) - { - struct bfq_io_cq *bic = icq_to_bic(icq); - - BUG_ON(!bic); -- kblockd_schedule_work(&bic->exit_icq_work); -+ bfq_exit_icq_bfqq(bic, true); -+ bfq_exit_icq_bfqq(bic, false); - } - - /* -@@ -5019,21 +5004,9 @@ static void bfq_exit_queue(struct elevator_queue *e) - BUG_ON(bfqd->in_service_queue); - BUG_ON(!list_empty(&bfqd->active_list)); - -- list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { -- if (bfqq->bic) /* bfqqs without bic are handled below */ -- cancel_work_sync(&bfqq->bic->exit_icq_work); -- } -- - spin_lock_irq(&bfqd->lock); -- list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) - bfq_deactivate_bfqq(bfqd, bfqq, false, false); -- /* -- * Make sure that deferred exit_icq_work completes -- * without errors for bfq_queues without bic -- */ -- if (!bfqq->bic) -- bfqq->bfqd = NULL; -- } - spin_unlock_irq(&bfqd->lock); - - hrtimer_cancel(&bfqd->idle_slice_timer); -@@ -5471,7 +5444,6 @@ static struct elevator_type iosched_bfq_mq = { - .ops.mq = { - .get_rq_priv = bfq_get_rq_private, - .put_rq_priv = bfq_put_rq_private, -- .init_icq = bfq_init_icq, - .exit_icq = bfq_exit_icq, - .insert_requests = bfq_insert_requests, - .dispatch_request = bfq_dispatch_request, -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index c3fcd5ebd735..23744b246db6 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -356,9 +356,6 @@ struct bfq_io_cq { - uint64_t blkcg_serial_nr; /* the current blkcg serial */ - #endif - -- /* delayed work to exec the body of the the exit_icq handler */ -- struct work_struct exit_icq_work; -- - /* - * Snapshot of the has_short_time flag before merging; taken - * to remember its value while the queue is merged, so as to - -From ab7e78a0ff095101de74e700f8743295a500bb20 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Tue, 21 Feb 2017 10:26:22 +0100 -Subject: [PATCH 16/51] Unnest request-queue and ioc locks from scheduler locks - -In some bio-merging functions, the request-queue lock needs to be -taken, to lookup for the bic associated with the process that issued -the bio that may need to be merged. In addition, put_io_context must -be invoked in some other functions, and put_io_context may cause the -lock of the involved ioc to be taken. In both cases, these extra -request-queue or ioc locks are taken, or might be taken, while the -scheduler lock is being held. In this respect, there are other code -paths, in part external to bfq-mq, in which the same locks are taken -(nested) in the opposite order, i.e., it is the scheduler lock to be -taken while the request-queue or the ioc lock is being held. 
This -leads to circular deadlocks. - -This commit addresses this issue by modifying the logic of the above -functions, so as to let the lookup and put_io_context be performed, -and thus the extra locks be taken, outside the critical sections -protected by the scheduler lock. - -Signed-off-by: Paolo Valente ---- - block/bfq-cgroup-included.c | 9 ++ - block/bfq-mq-iosched.c | 264 ++++++++++++++++++++++++++++---------------- - block/bfq-mq.h | 25 ++++- - block/bfq-sched.c | 11 ++ - 4 files changed, 213 insertions(+), 96 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index 8a73de76f32b..cf59eeb7f08e 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -716,6 +716,9 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - struct bfq_group *bfqg; - struct bfq_data *bfqd; - struct bfq_entity *entity; -+#ifdef BFQ_MQ -+ unsigned long flags; -+#endif - int i; - - BUG_ON(!pd); -@@ -729,6 +732,9 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - if (!entity) /* root group */ - return; - -+#ifdef BFQ_MQ -+ spin_lock_irqsave(&bfqd->lock, flags); -+#endif - /* - * Empty all service_trees belonging to this group before - * deactivating the group itself. -@@ -766,6 +772,9 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - __bfq_deactivate_entity(entity, false); - bfq_put_async_queues(bfqd, bfqg); - -+#ifdef BFQ_MQ -+ bfq_unlock_put_ioc_restore(bfqd, flags); -+#endif - /* - * @blkg is going offline and will be ignored by - * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 1deb79a47181..69ef3761c95d 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -233,6 +233,7 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - return NULL; - } - -+#define BFQ_MQ - #include "bfq-sched.c" - #include "bfq-cgroup-included.c" - -@@ -1564,15 +1565,9 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, - struct bio *bio, - struct request_queue *q) - { -- struct task_struct *tsk = current; -- struct bfq_io_cq *bic; -- struct bfq_queue *bfqq; -+ struct bfq_queue *bfqq = bfqd->bio_bfqq; - -- bic = bfq_bic_lookup(bfqd, tsk->io_context, q); -- if (!bic) -- return NULL; - -- bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); - if (bfqq) - return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); - -@@ -1693,9 +1688,26 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) - struct request_queue *q = hctx->queue; - struct bfq_data *bfqd = q->elevator->elevator_data; - struct request *free = NULL; -+ /* -+ * bfq_bic_lookup grabs the queue_lock: invoke it now and -+ * store its return value for later use, to avoid nesting -+ * queue_lock inside the bfqd->lock. We assume that the bic -+ * returned by bfq_bic_lookup does not go away before -+ * bfqd->lock is taken. 
-+ */ -+ struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q); - bool ret; - - spin_lock_irq(&bfqd->lock); -+ -+ if (bic) -+ bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); -+ else -+ bfqd->bio_bfqq = NULL; -+ bfqd->bio_bic = bic; -+ /* Set next flag just for testing purposes */ -+ bfqd->bio_bfqq_set = true; -+ - ret = blk_mq_sched_try_merge(q, bio, &free); - - /* -@@ -1706,6 +1718,7 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) - */ - if (free) - blk_mq_free_request(free); -+ bfqd->bio_bfqq_set = false; - spin_unlock_irq(&bfqd->lock); - - return ret; -@@ -2261,8 +2274,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - { - struct bfq_data *bfqd = q->elevator->elevator_data; - bool is_sync = op_is_sync(bio->bi_opf); -- struct bfq_io_cq *bic; -- struct bfq_queue *bfqq, *new_bfqq; -+ struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq; - - /* - * Disallow merge of a sync bio into an async request. -@@ -2273,31 +2285,40 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - /* - * Lookup the bfqq that this bio will be queued with. Allow - * merge only if rq is queued there. -- * Queue lock is held here. - */ -- bic = bfq_bic_lookup(bfqd, current->io_context, q); -- if (!bic) -+ if (!bfqq) - return false; - -- assert_spin_locked(&bfqd->lock); -- bfqq = bic_to_bfqq(bic, is_sync); - /* - * We take advantage of this function to perform an early merge - * of the queues of possible cooperating processes. - */ -- if (bfqq) { -- new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -- if (new_bfqq) { -- bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); -- /* -- * If we get here, the bio will be queued in the -- * shared queue, i.e., new_bfqq, so use new_bfqq -- * to decide whether bio and rq can be merged. -- */ -- bfqq = new_bfqq; -- } -- } -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -+ if (new_bfqq) { -+ /* -+ * bic still points to bfqq, then it has not yet been -+ * redirected to some other bfq_queue, and a queue -+ * merge beween bfqq and new_bfqq can be safely -+ * fulfillled, i.e., bic can be redirected to new_bfqq -+ * and bfqq can be put. -+ */ -+ bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq, -+ new_bfqq); -+ /* -+ * If we get here, bio will be queued into new_queue, -+ * so use new_bfqq to decide whether bio and rq can be -+ * merged. -+ */ -+ bfqq = new_bfqq; - -+ /* -+ * Change also bqfd->bio_bfqq, as -+ * bfqd->bio_bic now points to new_bfqq, and -+ * this function may be invoked again (and then may -+ * use again bqfd->bio_bfqq). -+ */ -+ bfqd->bio_bfqq = bfqq; -+ } - return bfqq == RQ_BFQQ(rq); - } - -@@ -3965,14 +3986,43 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - return rq; - } - -+/* -+ * Next two functions release bfqd->lock and put the io context -+ * pointed by bfqd->ioc_to_put. This delayed put is used to not risk -+ * to take an ioc->lock while the scheduler lock is being held. 
-+ */ -+static void bfq_unlock_put_ioc(struct bfq_data *bfqd) -+{ -+ struct io_context *ioc_to_put = bfqd->ioc_to_put; -+ -+ bfqd->ioc_to_put = NULL; -+ spin_unlock_irq(&bfqd->lock); -+ -+ if (ioc_to_put) -+ put_io_context(ioc_to_put); -+} -+ -+static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd, -+ unsigned long flags) -+{ -+ struct io_context *ioc_to_put = bfqd->ioc_to_put; -+ -+ bfqd->ioc_to_put = NULL; -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ -+ if (ioc_to_put) -+ put_io_context(ioc_to_put); -+} -+ - static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - { - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - struct request *rq; - - spin_lock_irq(&bfqd->lock); -+ - rq = __bfq_dispatch_request(hctx); -- spin_unlock_irq(&bfqd->lock); -+ bfq_unlock_put_ioc(bfqd); - - return rq; - } -@@ -3981,7 +4031,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - * Task holds one reference to the queue, dropped when task exits. Each rq - * in-flight on this queue also holds a reference, dropped when rq is freed. - * -- * Queue lock must be held here. Recall not to use bfqq after calling -+ * Scheduler lock must be held here. Recall not to use bfqq after calling - * this function on it. - */ - static void bfq_put_queue(struct bfq_queue *bfqq) -@@ -4066,17 +4116,23 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) - bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ - - if (bfqq && bfqd) { -- spin_lock_irq(&bfqd->lock); -+ unsigned long flags; -+ -+ spin_lock_irqsave(&bfqd->lock, flags); - /* -- * If the bic is using a shared queue, put the reference -- * taken on the io_context when the bic started using a -- * shared bfq_queue. -+ * If the bic is using a shared queue, put the -+ * reference taken on the io_context when the bic -+ * started using a shared bfq_queue. This put cannot -+ * make ioc->ref_count reach 0, then no ioc->lock -+ * risks to be taken (leading to possible deadlock -+ * scenarios). - */ - if (is_sync && bfq_bfqq_coop(bfqq)) - put_io_context(bic->icq.ioc); -+ - bfq_exit_bfqq(bfqd, bfqq); - bic_set_bfqq(bic, NULL, is_sync); -- spin_unlock_irq(&bfqd->lock); -+ bfq_unlock_put_ioc_restore(bfqd, flags); - } - } - -@@ -4183,8 +4239,6 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - INIT_HLIST_NODE(&bfqq->burst_list_node); - BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); - -- spin_lock_init(&bfqq->lock); -- - bfqq->ref = 0; - bfqq->bfqd = bfqd; - -@@ -4476,6 +4530,14 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - - new_bfqq->ref++; - bfq_clear_bfqq_just_created(bfqq); -+ /* -+ * If the bic associated with the process -+ * issuing this request still points to bfqq -+ * (and thus has not been already redirected -+ * to new_bfqq or even some other bfq_queue), -+ * then complete the merge and redirect it to -+ * new_bfqq. 
-+ */ - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) - bfq_merge_bfqqs(bfqd, RQ_BIC(rq), - bfqq, new_bfqq); -@@ -4498,14 +4560,17 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - } - - static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, -- bool at_head) -+ bool at_head) - { - struct request_queue *q = hctx->queue; - struct bfq_data *bfqd = q->elevator->elevator_data; - - spin_lock_irq(&bfqd->lock); -- if (blk_mq_sched_try_insert_merge(q, rq)) -- goto done; -+ if (blk_mq_sched_try_insert_merge(q, rq)) { -+ spin_unlock_irq(&bfqd->lock); -+ return; -+ } -+ - spin_unlock_irq(&bfqd->lock); - - blk_mq_sched_request_inserted(rq); -@@ -4530,8 +4595,8 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - q->last_merge = rq; - } - } --done: -- spin_unlock_irq(&bfqd->lock); -+ -+ bfq_unlock_put_ioc(bfqd); - } - - static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, -@@ -4724,7 +4789,7 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - bfq_completed_request(bfqq, bfqd); - bfq_put_rq_priv_body(bfqq); - -- spin_unlock_irqrestore(&bfqd->lock, flags); -+ bfq_unlock_put_ioc_restore(bfqd, flags); - } else { - /* - * Request rq may be still/already in the scheduler, -@@ -4732,10 +4797,10 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - * defer such a check and removal, to avoid - * inconsistencies in the time interval from the end - * of this function to the start of the deferred work. -- * Fortunately, this situation occurs only in process -- * context, so taking the scheduler lock does not -- * cause any deadlock, even if other locks are already -- * (correctly) held by this process. -+ * This situation seems to occur only in process -+ * context, as a consequence of a merge. In the -+ * current version of the code, this implies that the -+ * lock is held. - */ - BUG_ON(in_interrupt()); - -@@ -4758,8 +4823,6 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); - -- put_io_context(bic->icq.ioc); -- - if (bfqq_process_refs(bfqq) == 1) { - bfqq->pid = current->pid; - bfq_clear_bfqq_coop(bfqq); -@@ -4775,6 +4838,41 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) - return NULL; - } - -+static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, -+ struct bfq_io_cq *bic, -+ struct bio *bio, -+ bool split, bool is_sync, -+ bool *new_queue) -+{ -+ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); -+ -+ if (likely(bfqq && bfqq != &bfqd->oom_bfqq)) -+ return bfqq; -+ -+ if (new_queue) -+ *new_queue = true; -+ -+ if (bfqq) -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); -+ -+ bic_set_bfqq(bic, bfqq, is_sync); -+ if (split && is_sync) { -+ if ((bic->was_in_burst_list && bfqd->large_burst) || -+ bic->saved_in_large_burst) -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ else { -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ if (bic->was_in_burst_list) -+ hlist_add_head(&bfqq->burst_list_node, -+ &bfqd->burst_list); -+ } -+ bfqq->split_time = jiffies; -+ } -+ -+ return bfqq; -+} -+ - /* - * Allocate bfq data structures associated with this request. 
- */ -@@ -4786,6 +4884,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - const int is_sync = rq_is_sync(rq); - struct bfq_queue *bfqq; - bool bfqq_already_existing = false, split = false; -+ bool new_queue = false; - - spin_lock_irq(&bfqd->lock); - -@@ -4796,42 +4895,10 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - - bfq_bic_update_cgroup(bic, bio); - --new_queue: -- bfqq = bic_to_bfqq(bic, is_sync); -- if (!bfqq || bfqq == &bfqd->oom_bfqq) { -- if (bfqq) -- bfq_put_queue(bfqq); -- bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); -- BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, -+ &new_queue); - -- bic_set_bfqq(bic, bfqq, is_sync); -- if (split && is_sync) { -- bfq_log_bfqq(bfqd, bfqq, -- "get_request: was_in_list %d " -- "was_in_large_burst %d " -- "large burst in progress %d", -- bic->was_in_burst_list, -- bic->saved_in_large_burst, -- bfqd->large_burst); -- -- if ((bic->was_in_burst_list && bfqd->large_burst) || -- bic->saved_in_large_burst) { -- bfq_log_bfqq(bfqd, bfqq, -- "get_request: marking in " -- "large burst"); -- bfq_mark_bfqq_in_large_burst(bfqq); -- } else { -- bfq_log_bfqq(bfqd, bfqq, -- "get_request: clearing in " -- "large burst"); -- bfq_clear_bfqq_in_large_burst(bfqq); -- if (bic->was_in_burst_list) -- hlist_add_head(&bfqq->burst_list_node, -- &bfqd->burst_list); -- } -- bfqq->split_time = jiffies; -- } -- } else { -+ if (unlikely(!new_queue)) { - /* If the queue was seeky for too long, break it apart. */ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); -@@ -4841,9 +4908,19 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - bic->saved_in_large_burst = true; - - bfqq = bfq_split_bfqq(bic, bfqq); -- split = true; -+ /* -+ * A reference to bic->icq.ioc needs to be -+ * released after a queue split. Do not do it -+ * immediately, to not risk to possibly take -+ * an ioc->lock while holding the scheduler -+ * lock. -+ */ -+ bfqd->ioc_to_put = bic->icq.ioc; -+ - if (!bfqq) -- goto new_queue; -+ bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, -+ true, is_sync, -+ NULL); - else - bfqq_already_existing = true; - } -@@ -4861,18 +4938,17 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - - /* - * If a bfq_queue has only one process reference, it is owned -- * by only one bfq_io_cq: we can set the bic field of the -- * bfq_queue to the address of that structure. Also, if the -- * queue has just been split, mark a flag so that the -- * information is available to the other scheduler hooks. -+ * by only this bic: we can then set bfqq->bic = bic. in -+ * addition, if the queue has also just been split, we have to -+ * resume its state. - */ - if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { - bfqq->bic = bic; -- if (split) { -+ if (bfqd->ioc_to_put) { /* if true, then there has been a split */ - /* -- * If the queue has just been split from a shared -- * queue, restore the idle window and the possible -- * weight raising period. -+ * The queue has just been split from a shared -+ * queue: restore the idle window and the -+ * possible weight raising period. 
- */ - bfq_bfqq_resume_state(bfqq, bfqd, bic, - bfqq_already_existing); -@@ -4882,7 +4958,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - if (unlikely(bfq_bfqq_just_created(bfqq))) - bfq_handle_burst(bfqd, bfqq); - -- spin_unlock_irq(&bfqd->lock); -+ bfq_unlock_put_ioc(bfqd); - - return 0; - -@@ -4929,7 +5005,7 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) - bfq_bfqq_expire(bfqd, bfqq, true, reason); - - schedule_dispatch: -- spin_unlock_irqrestore(&bfqd->lock, flags); -+ bfq_unlock_put_ioc_restore(bfqd, flags); - bfq_schedule_dispatch(bfqd); - } - -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 23744b246db6..bd83f1c02573 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -338,8 +338,6 @@ struct bfq_queue { - unsigned long wr_start_at_switch_to_srt; - - unsigned long split_time; /* time of last split */ -- -- spinlock_t lock; - }; - - /** -@@ -609,6 +607,29 @@ struct bfq_data { - struct bfq_queue oom_bfqq; - - spinlock_t lock; -+ -+ /* -+ * bic associated with the task issuing current bio for -+ * merging. This and the next field are used as a support to -+ * be able to perform the bic lookup, needed by bio-merge -+ * functions, before the scheduler lock is taken, and thus -+ * avoid taking the request-queue lock while the scheduler -+ * lock is being held. -+ */ -+ struct bfq_io_cq *bio_bic; -+ /* bfqq associated with the task issuing current bio for merging */ -+ struct bfq_queue *bio_bfqq; -+ /* Extra flag used only for TESTING */ -+ bool bio_bfqq_set; -+ -+ /* -+ * io context to put right after bfqd->lock is released. This -+ * filed is used to perform put_io_context, when needed, to -+ * after the scheduler lock has been released, and thus -+ * prevent an ioc->lock from being possibly taken while the -+ * scheduler lock is being held. -+ */ -+ struct io_context *ioc_to_put; - }; - - enum bfqq_state_flags { -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index b54a638186e3..a5c8b4acd33c 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -1905,7 +1905,18 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) - struct bfq_entity *entity = in_serv_entity; - - if (bfqd->in_service_bic) { -+#ifdef BFQ_MQ -+ /* -+ * Schedule the release of a reference to -+ * bfqd->in_service_bic->icq.ioc to right after the -+ * scheduler lock is released. This ioc is not -+ * released immediately, to not risk to possibly take -+ * an ioc->lock while holding the scheduler lock. 
-+ */ -+ bfqd->ioc_to_put = bfqd->in_service_bic->icq.ioc; -+#else - put_io_context(bfqd->in_service_bic->icq.ioc); -+#endif - bfqd->in_service_bic = NULL; - } - - -From 84cc7140cb4f0574710625f51abbb076a1dd2920 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Fri, 3 Mar 2017 09:31:14 +0100 -Subject: [PATCH 17/51] Add checks and extra log messages - Part II - -Signed-off-by: Paolo Valente ---- - block/bfq-mq-iosched.c | 42 ++++++++++++++++++++++++++++++++++++++++-- - block/bfq-sched.c | 1 + - 2 files changed, 41 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 69ef3761c95d..5707d42b160d 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1567,6 +1567,7 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, - { - struct bfq_queue *bfqq = bfqd->bio_bfqq; - -+ BUG_ON(!bfqd->bio_bfqq_set); - - if (bfqq) - return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); -@@ -1719,6 +1720,7 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) - if (free) - blk_mq_free_request(free); - bfqd->bio_bfqq_set = false; -+ BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - - return ret; -@@ -1781,6 +1783,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - bfq_updated_next_req(bfqd, bfqq); - bfq_pos_tree_add_move(bfqd, bfqq); - } -+ BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - } - } -@@ -1824,6 +1827,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, - - bfq_remove_request(q, next); - -+ BUG_ON(bfqq->bfqd->ioc_to_put); - spin_unlock_irq(&bfqq->bfqd->lock); - end: - bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -@@ -2195,9 +2199,11 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - { - bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", - (unsigned long) new_bfqq->pid); -+ BUG_ON(bfqq->bic && bfqq->bic == new_bfqq->bic); - /* Save weight raising and idle window of the merged queues */ - bfq_bfqq_save_state(bfqq); - bfq_bfqq_save_state(new_bfqq); -+ - if (bfq_bfqq_IO_bound(bfqq)) - bfq_mark_bfqq_IO_bound(new_bfqq); - bfq_clear_bfqq_IO_bound(bfqq); -@@ -2276,6 +2282,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - bool is_sync = op_is_sync(bio->bi_opf); - struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq; - -+ assert_spin_locked(&bfqd->lock); - /* - * Disallow merge of a sync bio into an async request. - */ -@@ -2286,6 +2293,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - * Lookup the bfqq that this bio will be queued with. Allow - * merge only if rq is queued there. - */ -+ BUG_ON(!bfqd->bio_bfqq_set); - if (!bfqq) - return false; - -@@ -2294,6 +2302,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - * of the queues of possible cooperating processes. 
- */ - new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -+ BUG_ON(new_bfqq == bfqq); - if (new_bfqq) { - /* - * bic still points to bfqq, then it has not yet been -@@ -4040,6 +4049,8 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - struct bfq_group *bfqg = bfqq_group(bfqq); - #endif - -+ assert_spin_locked(&bfqq->bfqd->lock); -+ - BUG_ON(bfqq->ref <= 0); - - if (bfqq->bfqd) -@@ -4119,6 +4130,7 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); -+ BUG_ON(bfqd->ioc_to_put); - /* - * If the bic is using a shared queue, put the - * reference taken on the io_context when the bic -@@ -4567,10 +4579,12 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - - spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { -+ BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - return; - } - -+ BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - - blk_mq_sched_request_inserted(rq); -@@ -4785,6 +4799,7 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); -+ BUG_ON(bfqd->ioc_to_put); - - bfq_completed_request(bfqq, bfqd); - bfq_put_rq_priv_body(bfqq); -@@ -4855,13 +4870,28 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, - if (bfqq) - bfq_put_queue(bfqq); - bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); - - bic_set_bfqq(bic, bfqq, is_sync); - if (split && is_sync) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "get_request: was_in_list %d " -+ "was_in_large_burst %d " -+ "large burst in progress %d", -+ bic->was_in_burst_list, -+ bic->saved_in_large_burst, -+ bfqd->large_burst); -+ - if ((bic->was_in_burst_list && bfqd->large_burst) || -- bic->saved_in_large_burst) -+ bic->saved_in_large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "get_request: marking in " -+ "large burst"); - bfq_mark_bfqq_in_large_burst(bfqq); -- else { -+ } else { -+ bfq_log_bfqq(bfqd, bfqq, -+ "get_request: clearing in " -+ "large burst"); - bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) - hlist_add_head(&bfqq->burst_list_node, -@@ -4897,10 +4927,12 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, - &new_queue); -+ BUG_ON(bfqd->ioc_to_put); - - if (unlikely(!new_queue)) { - /* If the queue was seeky for too long, break it apart. 
*/ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { -+ BUG_ON(!is_sync); - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); - - /* Update bic before losing reference to bfqq */ -@@ -4923,6 +4955,9 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - NULL); - else - bfqq_already_existing = true; -+ -+ BUG_ON(!bfqq); -+ BUG_ON(bfqq == &bfqd->oom_bfqq); - } - } - -@@ -4976,6 +5011,8 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) - - BUG_ON(!bfqd); - spin_lock_irqsave(&bfqd->lock, flags); -+ BUG_ON(bfqd->ioc_to_put); -+ - bfq_log_bfqq(bfqd, bfqq, "handling slice_timer expiration"); - bfq_clear_bfqq_wait_request(bfqq); - -@@ -5083,6 +5120,7 @@ static void bfq_exit_queue(struct elevator_queue *e) - spin_lock_irq(&bfqd->lock); - list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) - bfq_deactivate_bfqq(bfqd, bfqq, false, false); -+ BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - - hrtimer_cancel(&bfqd->idle_slice_timer); -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index a5c8b4acd33c..85e59eeb3569 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -1906,6 +1906,7 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) - - if (bfqd->in_service_bic) { - #ifdef BFQ_MQ -+ BUG_ON(bfqd->ioc_to_put); - /* - * Schedule the release of a reference to - * bfqd->in_service_bic->icq.ioc to right after the - -From 3d54cb804f1db2e08ce4a6cc335868538542f587 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Wed, 22 Feb 2017 11:30:01 +0100 -Subject: [PATCH 18/51] Fix unbalanced increment of rq_in_driver - -Signed-off-by: Paolo Valente ---- - block/bfq-mq-iosched.c | 52 +++++++++++++++++++++++++++++++++++++++++--------- - 1 file changed, 43 insertions(+), 9 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 5707d42b160d..9cbcb8d43d81 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -3936,9 +3936,45 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - rq = list_first_entry(&bfqd->dispatch, struct request, - queuelist); - list_del_init(&rq->queuelist); -+ - bfq_log(bfqd, - "dispatch requests: picked %p from dispatch list", rq); -- goto exit; -+ bfqq = RQ_BFQQ(rq); -+ -+ if (bfqq) { -+ /* -+ * Increment counters here, because this -+ * dispatch does not follow the standard -+ * dispatch flow (where counters are -+ * incremented) -+ */ -+ bfqq->dispatched++; -+ -+ goto inc_in_driver_start_rq; -+ } -+ -+ /* -+ * We exploit the put_rq_private hook to decrement -+ * rq_in_driver, but put_rq_private will not be -+ * invoked on this request. So, to avoid unbalance, -+ * just start this request, without incrementing -+ * rq_in_driver. As a negative consequence, -+ * rq_in_driver is deceptively lower than it should be -+ * while this request is in service. This may cause -+ * bfq_schedule_dispatch to be invoked uselessly. -+ * -+ * As for implementing an exact solution, the -+ * put_request hook, if defined, is probably invoked -+ * also on this request. So, by exploiting this hook, -+ * we could 1) increment rq_in_driver here, and 2) -+ * decrement it in put_request. Such a solution would -+ * let the value of the counter be always accurate, -+ * but it would entail using an extra interface -+ * function. This cost seems higher than the benefit, -+ * being the frequency of non-elevator-private -+ * requests very low. 
-+ */ -+ goto start_rq; - } - - bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); -@@ -3973,10 +4009,12 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - - BUG_ON(bfqq->next_rq == NULL && - bfqq->entity.budget < bfqq->entity.service); --exit: -+ - if (rq) { -- rq->rq_flags |= RQF_STARTED; -+ inc_in_driver_start_rq: - bfqd->rq_in_driver++; -+ start_rq: -+ rq->rq_flags |= RQF_STARTED; - if (bfqq) - bfq_log_bfqq(bfqd, bfqq, - "dispatched %s request %p, rq_in_driver %d", -@@ -3992,6 +4030,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - "returned NULL request, rq_in_driver %d", - bfqd->rq_in_driver); - -+exit: - return rq; - } - -@@ -4591,15 +4630,10 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - - spin_lock_irq(&bfqd->lock); - if (at_head || blk_rq_is_passthrough(rq)) { -- struct bfq_queue *bfqq = RQ_BFQQ(rq); -- - if (at_head) - list_add(&rq->queuelist, &bfqd->dispatch); - else - list_add_tail(&rq->queuelist, &bfqd->dispatch); -- -- if (bfqq) -- bfqq->dispatched++; - } else { - __bfq_insert_request(bfqd, rq); - -@@ -4966,7 +5000,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - "get_request: new allocated %d", bfqq->allocated); - - bfqq->ref++; -- bfq_log_bfqq(bfqd, bfqq, "get_request: bfqq %p, %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", rq, bfqq, bfqq->ref); - - rq->elv.priv[0] = bic; - rq->elv.priv[1] = bfqq; - -From 7ba977d696b239569b4cd233aebc99e136ecf487 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Fri, 3 Mar 2017 09:39:35 +0100 -Subject: [PATCH 19/51] Add checks and extra log messages - Part III - -Signed-off-by: Paolo Valente ---- - block/bfq-mq-iosched.c | 11 +++++++++++ - 1 file changed, 11 insertions(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 9cbcb8d43d81..24b529a2edc7 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4630,10 +4630,21 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - - spin_lock_irq(&bfqd->lock); - if (at_head || blk_rq_is_passthrough(rq)) { -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ - if (at_head) - list_add(&rq->queuelist, &bfqd->dispatch); - else - list_add_tail(&rq->queuelist, &bfqd->dispatch); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, -+ "insert_request %p in disp: at_head %d", -+ rq, at_head); -+ else -+ bfq_log(bfqd, -+ "insert_request %p in disp: at_head %d", -+ rq, at_head); - } else { - __bfq_insert_request(bfqd, rq); - - -From c94e47b2908600b8ba89f84b0ac7febddd313141 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Fri, 17 Feb 2017 14:28:02 +0100 -Subject: [PATCH 20/51] TESTING: Check wrong invocation of merge and - put_rq_priv functions - -Check that merge functions are not invoked on requests queued in the -dispatch queue, and that neither put_rq_private is invoked on these -requests if, in addition, they have not passed through get_rq_private. 
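-
-For reference, the checking protocol, condensed from the hunks below
-(a debug-only sketch; RQF_DISP_LIST is the request flag this patch
-adds to blkdev.h):
-
-  /* bfq_insert_request: tag requests placed in bfqd->dispatch. */
-  rq->rq_flags |= RQF_DISP_LIST;
-
-  /* Merge hooks: must never see a dispatch-list request. */
-  BUG_ON(rq->rq_flags & RQF_DISP_LIST);
-
-  /* __bfq_dispatch_request and get_rq_private: clear the tag, so
-   * that only requests that bypassed get_rq_private keep it. */
-  rq->rq_flags &= ~RQF_DISP_LIST;
-
-  /* put_rq_private: a still-tagged request is a protocol violation. */
-  if (rq->rq_flags & RQF_DISP_LIST) {
-          pr_crit("putting disp rq %p for %d", rq, bfqq->pid);
-          BUG();
-  }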
- -Signed-off-by: Paolo Valente ---- - block/bfq-mq-iosched.c | 22 ++++++++++++++++++++++ - include/linux/blkdev.h | 2 ++ - 2 files changed, 24 insertions(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 24b529a2edc7..b4d40bb712d2 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1746,6 +1746,8 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, - static void bfq_request_merged(struct request_queue *q, struct request *req, - enum elv_merge type) - { -+ BUG_ON(req->rq_flags & RQF_DISP_LIST); -+ - if (type == ELEVATOR_FRONT_MERGE && - rb_prev(&req->rb_node) && - blk_rq_pos(req) < -@@ -1795,6 +1797,8 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, - - BUG_ON(!RQ_BFQQ(rq)); - BUG_ON(!RQ_BFQQ(next)); -+ BUG_ON(rq->rq_flags & RQF_DISP_LIST); -+ BUG_ON(next->rq_flags & RQF_DISP_LIST); - - if (!RB_EMPTY_NODE(&rq->rb_node)) - goto end; -@@ -3936,6 +3940,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - rq = list_first_entry(&bfqd->dispatch, struct request, - queuelist); - list_del_init(&rq->queuelist); -+ rq->rq_flags &= ~RQF_DISP_LIST; - - bfq_log(bfqd, - "dispatch requests: picked %p from dispatch list", rq); -@@ -3950,6 +3955,17 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - */ - bfqq->dispatched++; - -+ /* -+ * TESTING: reset DISP_LIST flag, because: 1) -+ * this rq this request has passed through -+ * get_rq_private, 2) then it will have -+ * put_rq_private invoked on it, and 3) in -+ * put_rq_private we use this flag to check -+ * that put_rq_private is not invoked on -+ * requests for which get_rq_private has been -+ * invoked. -+ */ -+ rq->rq_flags &= ~RQF_DISP_LIST; - goto inc_in_driver_start_rq; - } - -@@ -4637,6 +4653,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - else - list_add_tail(&rq->queuelist, &bfqd->dispatch); - -+ rq->rq_flags |= RQF_DISP_LIST; - if (bfqq) - bfq_log_bfqq(bfqd, bfqq, - "insert_request %p in disp: at_head %d", -@@ -4824,6 +4841,10 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - bfqd = bfqq->bfqd; - BUG_ON(!bfqd); - -+ if (rq->rq_flags & RQF_DISP_LIST) { -+ pr_crit("putting disp rq %p for %d", rq, bfqq->pid); -+ BUG(); -+ } - BUG_ON(rq->rq_flags & RQF_QUEUED); - BUG_ON(!(rq->rq_flags & RQF_ELVPRIV)); - -@@ -5015,6 +5036,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - - rq->elv.priv[0] = bic; - rq->elv.priv[1] = bfqq; -+ rq->rq_flags &= ~RQF_DISP_LIST; - - /* - * If a bfq_queue has only one process reference, it is owned -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 10f892ca585d..0048e59e6d07 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -121,6 +121,8 @@ typedef __u32 __bitwise req_flags_t; - /* Look at ->special_vec for the actual data payload instead of the - bio chain. */ - #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) -+/* DEBUG: rq in bfq-mq dispatch list */ -+#define RQF_DISP_LIST ((__force req_flags_t)(1 << 19)) - - /* flags that prevent us from merging requests: */ - #define RQF_NOMERGE_FLAGS \ - -From 49206f9052d13c96d49dbc36c612bed41b2d6552 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Sat, 25 Feb 2017 17:38:05 +0100 -Subject: [PATCH 21/51] Complete support for cgroups - -This commit completes cgroups support for bfq-mq. 
In particular, it deals with -a sort of circular dependency introduced in blk-mq: the function -blkcg_activate_policy, invoked during scheduler initialization, triggers the -invocation of the has_work scheduler hook (before the init function is -finished). To address this issue, this commit moves the invocation of -blkcg_activate_policy to after the initialization of all the fields that could be -initialized before invoking blkcg_activate_policy itself. This enables has_work -to correctly return false, and thus to prevent the blk-mq stack from invoking -further scheduler hooks before the init function is finished. - -Signed-off-by: Paolo Valente ---- - block/Kconfig.iosched | 9 +++++ - block/bfq-mq-iosched.c | 108 ++++++++++++++++++++++++++++--------------------- - block/bfq-mq.h | 2 +- - 3 files changed, 72 insertions(+), 47 deletions(-) - -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched -index 2d94af3d8b0a..299a6861fb90 100644 ---- a/block/Kconfig.iosched -+++ b/block/Kconfig.iosched -@@ -106,6 +106,15 @@ config MQ_IOSCHED_BFQ - guarantees a low latency to interactive and soft real-time - applications. Details in Documentation/block/bfq-iosched.txt - -+config MQ_BFQ_GROUP_IOSCHED -+ bool "BFQ-MQ hierarchical scheduling support" -+ depends on MQ_IOSCHED_BFQ && BLK_CGROUP -+ default n -+ ---help--- -+ -+ Enable hierarchical scheduling in BFQ-MQ, using the blkio -+ (cgroups-v1) or io (cgroups-v2) controller. -+ - config MQ_IOSCHED_DEADLINE - tristate "MQ deadline I/O scheduler" - default y -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index b4d40bb712d2..02a1e7fd0ea4 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -88,7 +88,6 @@ - #include "blk-mq.h" - #include "blk-mq-tag.h" - #include "blk-mq-sched.h" --#undef CONFIG_BFQ_GROUP_IOSCHED /* cgroups support not yet functional */ - #include "bfq-mq.h" - - /* Expiration time of sync (0) and async (1) requests, in ns. */ -@@ -233,15 +232,6 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - return NULL; - } - --#define BFQ_MQ --#include "bfq-sched.c" --#include "bfq-cgroup-included.c" -- --#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) --#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) -- --#define bfq_sample_valid(samples) ((samples) > 80) -- - /* - * Scheduler run of queue, if there are requests pending and no one in the - * driver that will restart queueing. -@@ -255,6 +245,43 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd) - } - - /* -+ * Next two functions release bfqd->lock and put the io context -+ * pointed by bfqd->ioc_to_put. This delayed put is used to not risk -+ * to take an ioc->lock while the scheduler lock is being held.
-+ */ -+static void bfq_unlock_put_ioc(struct bfq_data *bfqd) -+{ -+ struct io_context *ioc_to_put = bfqd->ioc_to_put; -+ -+ bfqd->ioc_to_put = NULL; -+ spin_unlock_irq(&bfqd->lock); -+ -+ if (ioc_to_put) -+ put_io_context(ioc_to_put); -+} -+ -+static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd, -+ unsigned long flags) -+{ -+ struct io_context *ioc_to_put = bfqd->ioc_to_put; -+ -+ bfqd->ioc_to_put = NULL; -+ spin_unlock_irqrestore(&bfqd->lock, flags); -+ -+ if (ioc_to_put) -+ put_io_context(ioc_to_put); -+} -+ -+#define BFQ_MQ -+#include "bfq-sched.c" -+#include "bfq-cgroup-included.c" -+ -+#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -+#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) -+ -+#define bfq_sample_valid(samples) ((samples) > 80) -+ -+/* - * Lifted from AS - choose which of rq1 and rq2 that is best served now. - * We choose the request that is closesr to the head right now. Distance - * behind the head is penalized and only allowed to a certain extent. -@@ -4050,34 +4077,6 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - return rq; - } - --/* -- * Next two functions release bfqd->lock and put the io context -- * pointed by bfqd->ioc_to_put. This delayed put is used to not risk -- * to take an ioc->lock while the scheduler lock is being held. -- */ --static void bfq_unlock_put_ioc(struct bfq_data *bfqd) --{ -- struct io_context *ioc_to_put = bfqd->ioc_to_put; -- -- bfqd->ioc_to_put = NULL; -- spin_unlock_irq(&bfqd->lock); -- -- if (ioc_to_put) -- put_io_context(ioc_to_put); --} -- --static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd, -- unsigned long flags) --{ -- struct io_context *ioc_to_put = bfqd->ioc_to_put; -- -- bfqd->ioc_to_put = NULL; -- spin_unlock_irqrestore(&bfqd->lock, flags); -- -- if (ioc_to_put) -- put_io_context(ioc_to_put); --} -- - static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - { - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -@@ -5239,6 +5238,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - } - eq->elevator_data = bfqd; - -+ spin_lock_irq(q->queue_lock); -+ q->elevator = eq; -+ spin_unlock_irq(q->queue_lock); -+ - /* - * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. - * Grab a permanent reference to it, so that the normal code flow -@@ -5261,12 +5264,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - bfqd->oom_bfqq.entity.prio_changed = 1; - - bfqd->queue = q; -- -- bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); -- if (!bfqd->root_group) -- goto out_free; -- bfq_init_root_group(bfqd->root_group, bfqd); -- bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); -+ INIT_LIST_HEAD(&bfqd->dispatch); - - hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); -@@ -5324,9 +5322,27 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - bfqd->device_speed = BFQ_BFQD_FAST; - - spin_lock_init(&bfqd->lock); -- INIT_LIST_HEAD(&bfqd->dispatch); - -- q->elevator = eq; -+ /* -+ * The invocation of the next bfq_create_group_hierarchy -+ * function is the head of a chain of function calls -+ * (bfq_create_group_hierarchy->blkcg_activate_policy-> -+ * blk_mq_freeze_queue) that may lead to the invocation of the -+ * has_work hook function. 
For this reason, -+ * bfq_create_group_hierarchy is invoked only after all -+ * scheduler data has been initialized, apart from the fields -+ * that can be initialized only after invoking -+ * bfq_create_group_hierarchy. This, in particular, enables -+ * has_work to correctly return false. Of course, to avoid -+ * other inconsistencies, the blk-mq stack must then refrain -+ * from invoking further scheduler hooks before this init -+ * function is finished. -+ */ -+ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); -+ if (!bfqd->root_group) -+ goto out_free; -+ bfq_init_root_group(bfqd->root_group, bfqd); -+ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); - - return 0; - -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index bd83f1c02573..2c81c02bccc4 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -20,7 +20,7 @@ - #include - - /* see comments on CONFIG_BFQ_GROUP_IOSCHED in bfq.h */ --#ifdef CONFIG_BFQ_MQ_GROUP_IOSCHED -+#ifdef CONFIG_MQ_BFQ_GROUP_IOSCHED - #define BFQ_GROUP_IOSCHED_ENABLED - #endif - - -From 62d12db23ce14d2716b5cff7d2635fbc817b96d0 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Fri, 17 Mar 2017 06:15:18 +0100 -Subject: [PATCH 22/51] Remove all get and put of I/O contexts - -When a bfq queue is set in service and when it is merged, a reference -to the I/O context associated with the queue is taken. This reference -is then released when the queue is deselected from service or -split. More precisely, the release of the reference is postponed to -when the scheduler lock is released, to avoid nesting between the -scheduler and the I/O-context lock. In fact, such nesting would lead -to deadlocks, because of other code paths that take the same locks in -the opposite order. This postponing of I/O-context releases does -complicate code. - -This commit addresses this issue by modifying involved operations in -such a way to not need to get the above I/O-context references any -more. Then it also removes any get and release of these references. - -Signed-off-by: Paolo Valente ---- - block/bfq-cgroup-included.c | 2 +- - block/bfq-mq-iosched.c | 127 ++++++++------------------------------------ - block/bfq-mq.h | 11 ---- - block/bfq-sched.c | 17 ------ - 4 files changed, 22 insertions(+), 135 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index cf59eeb7f08e..dfacca799b5e 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -773,7 +773,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - bfq_put_async_queues(bfqd, bfqg); - - #ifdef BFQ_MQ -- bfq_unlock_put_ioc_restore(bfqd, flags); -+ spin_unlock_irqrestore(&bfqd->lock, flags); - #endif - /* - * @blkg is going offline and will be ignored by -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 02a1e7fd0ea4..8e7589d3280f 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -244,34 +244,6 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd) - } - } - --/* -- * Next two functions release bfqd->lock and put the io context -- * pointed by bfqd->ioc_to_put. This delayed put is used to not risk -- * to take an ioc->lock while the scheduler lock is being held. 
-- */ --static void bfq_unlock_put_ioc(struct bfq_data *bfqd) --{ -- struct io_context *ioc_to_put = bfqd->ioc_to_put; -- -- bfqd->ioc_to_put = NULL; -- spin_unlock_irq(&bfqd->lock); -- -- if (ioc_to_put) -- put_io_context(ioc_to_put); --} -- --static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd, -- unsigned long flags) --{ -- struct io_context *ioc_to_put = bfqd->ioc_to_put; -- -- bfqd->ioc_to_put = NULL; -- spin_unlock_irqrestore(&bfqd->lock, flags); -- -- if (ioc_to_put) -- put_io_context(ioc_to_put); --} -- - #define BFQ_MQ - #include "bfq-sched.c" - #include "bfq-cgroup-included.c" -@@ -1747,7 +1719,6 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) - if (free) - blk_mq_free_request(free); - bfqd->bio_bfqq_set = false; -- BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - - return ret; -@@ -1812,7 +1783,6 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - bfq_updated_next_req(bfqd, bfqq); - bfq_pos_tree_add_move(bfqd, bfqq); - } -- BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - } - } -@@ -1858,7 +1828,6 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, - - bfq_remove_request(q, next); - -- BUG_ON(bfqq->bfqd->ioc_to_put); - spin_unlock_irq(&bfqq->bfqd->lock); - end: - bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -@@ -2035,20 +2004,18 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) - * first time that the requests of some process are redirected to - * it. - * -- * We redirect bfqq to new_bfqq and not the opposite, because we -- * are in the context of the process owning bfqq, hence we have -- * the io_cq of this process. So we can immediately configure this -- * io_cq to redirect the requests of the process to new_bfqq. -+ * We redirect bfqq to new_bfqq and not the opposite, because -+ * we are in the context of the process owning bfqq, thus we -+ * have the io_cq of this process. So we can immediately -+ * configure this io_cq to redirect the requests of the -+ * process to new_bfqq. In contrast, the io_cq of new_bfqq is -+ * not available any more (new_bfqq->bic == NULL). - * -- * NOTE, even if new_bfqq coincides with the in-service queue, the -- * io_cq of new_bfqq is not available, because, if the in-service -- * queue is shared, bfqd->in_service_bic may not point to the -- * io_cq of the in-service queue. -- * Redirecting the requests of the process owning bfqq to the -- * currently in-service queue is in any case the best option, as -- * we feed the in-service queue with new requests close to the -- * last request served and, by doing so, hopefully increase the -- * throughput. -+ * Anyway, even in case new_bfqq coincides with the in-service -+ * queue, redirecting requests the in-service queue is the -+ * best option, as we feed the in-service queue with new -+ * requests close to the last request served and, by doing so, -+ * are likely to increase the throughput. 
- */ - bfqq->new_bfqq = new_bfqq; - new_bfqq->ref += process_refs; -@@ -2147,13 +2114,13 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - in_service_bfqq = bfqd->in_service_queue; - - if (in_service_bfqq && in_service_bfqq != bfqq && -- bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) -+ wr_from_too_long(in_service_bfqq) - && likely(in_service_bfqq == &bfqd->oom_bfqq)) - bfq_log_bfqq(bfqd, bfqq, - "would have tried merge with in-service-queue, but wr"); - -- if (!in_service_bfqq || in_service_bfqq == bfqq || -- !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || -+ if (!in_service_bfqq || in_service_bfqq == bfqq -+ || wr_from_too_long(in_service_bfqq) || - unlikely(in_service_bfqq == &bfqd->oom_bfqq)) - goto check_scheduled; - -@@ -2214,16 +2181,6 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - } - --static void bfq_get_bic_reference(struct bfq_queue *bfqq) --{ -- /* -- * If bfqq->bic has a non-NULL value, the bic to which it belongs -- * is about to begin using a shared bfq_queue. -- */ -- if (bfqq->bic) -- atomic_long_inc(&bfqq->bic->icq.ioc->refcount); --} -- - static void - bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -@@ -2280,12 +2237,6 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - bfqd->wr_busy_queues); - - /* -- * Grab a reference to the bic, to prevent it from being destroyed -- * before being possibly touched by a bfq_split_bfqq(). -- */ -- bfq_get_bic_reference(bfqq); -- bfq_get_bic_reference(new_bfqq); -- /* - * Merge queues (that is, let bic redirect its requests to new_bfqq) - */ - bic_set_bfqq(bic, new_bfqq, 1); -@@ -2472,16 +2423,10 @@ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) - static void bfq_arm_slice_timer(struct bfq_data *bfqd) - { - struct bfq_queue *bfqq = bfqd->in_service_queue; -- struct bfq_io_cq *bic; - u32 sl; - - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - -- /* Processes have exited, don't wait. */ -- bic = bfqd->in_service_bic; -- if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) -- return; -- - bfq_mark_bfqq_wait_request(bfqq); - - /* -@@ -3922,11 +3867,6 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, - bfq_bfqq_budget_left(bfqq), - bfqq->dispatched); - -- if (!bfqd->in_service_bic) { -- atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); -- bfqd->in_service_bic = RQ_BIC(rq); -- } -- - /* - * Expire bfqq, pretending that its budget expired, if bfqq - * belongs to CLASS_IDLE and other queues are waiting for -@@ -4085,7 +4025,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - spin_lock_irq(&bfqd->lock); - - rq = __bfq_dispatch_request(hctx); -- bfq_unlock_put_ioc(bfqd); -+ spin_unlock_irq(&bfqd->lock); - - return rq; - } -@@ -4184,21 +4124,10 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); -- BUG_ON(bfqd->ioc_to_put); -- /* -- * If the bic is using a shared queue, put the -- * reference taken on the io_context when the bic -- * started using a shared bfq_queue. This put cannot -- * make ioc->ref_count reach 0, then no ioc->lock -- * risks to be taken (leading to possible deadlock -- * scenarios). 
-- */ -- if (is_sync && bfq_bfqq_coop(bfqq)) -- put_io_context(bic->icq.ioc); - - bfq_exit_bfqq(bfqd, bfqq); - bic_set_bfqq(bic, NULL, is_sync); -- bfq_unlock_put_ioc_restore(bfqd, flags); -+ spin_unlock_irqrestore(&bfqd->lock, flags); - } - } - -@@ -4633,12 +4562,10 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - - spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { -- BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - return; - } - -- BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - - blk_mq_sched_request_inserted(rq); -@@ -4671,7 +4598,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - } - } - -- bfq_unlock_put_ioc(bfqd); -+ spin_unlock_irq(&bfqd->lock); - } - - static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, -@@ -4864,12 +4791,11 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); -- BUG_ON(bfqd->ioc_to_put); - - bfq_completed_request(bfqq, bfqd); - bfq_put_rq_priv_body(bfqq); - -- bfq_unlock_put_ioc_restore(bfqd, flags); -+ spin_unlock_irqrestore(&bfqd->lock, flags); - } else { - /* - * Request rq may be still/already in the scheduler, -@@ -4992,7 +4918,6 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, - &new_queue); -- BUG_ON(bfqd->ioc_to_put); - - if (unlikely(!new_queue)) { - /* If the queue was seeky for too long, break it apart. */ -@@ -5005,14 +4930,6 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - bic->saved_in_large_burst = true; - - bfqq = bfq_split_bfqq(bic, bfqq); -- /* -- * A reference to bic->icq.ioc needs to be -- * released after a queue split. Do not do it -- * immediately, to not risk to possibly take -- * an ioc->lock while holding the scheduler -- * lock. 
-- */ -- bfqd->ioc_to_put = bic->icq.ioc; - - if (!bfqq) - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, -@@ -5045,7 +4962,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - */ - if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { - bfqq->bic = bic; -- if (bfqd->ioc_to_put) { /* if true, then there has been a split */ -+ if (split) { - /* - * The queue has just been split from a shared - * queue: restore the idle window and the -@@ -5059,7 +4976,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - if (unlikely(bfq_bfqq_just_created(bfqq))) - bfq_handle_burst(bfqd, bfqq); - -- bfq_unlock_put_ioc(bfqd); -+ spin_unlock_irq(&bfqd->lock); - - return 0; - -@@ -5077,7 +4994,6 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) - - BUG_ON(!bfqd); - spin_lock_irqsave(&bfqd->lock, flags); -- BUG_ON(bfqd->ioc_to_put); - - bfq_log_bfqq(bfqd, bfqq, "handling slice_timer expiration"); - bfq_clear_bfqq_wait_request(bfqq); -@@ -5108,7 +5024,7 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) - bfq_bfqq_expire(bfqd, bfqq, true, reason); - - schedule_dispatch: -- bfq_unlock_put_ioc_restore(bfqd, flags); -+ spin_unlock_irqrestore(&bfqd->lock, flags); - bfq_schedule_dispatch(bfqd); - } - -@@ -5186,7 +5102,6 @@ static void bfq_exit_queue(struct elevator_queue *e) - spin_lock_irq(&bfqd->lock); - list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) - bfq_deactivate_bfqq(bfqd, bfqq, false, false); -- BUG_ON(bfqd->ioc_to_put); - spin_unlock_irq(&bfqd->lock); - - hrtimer_cancel(&bfqd->idle_slice_timer); -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 2c81c02bccc4..36ee24a87dda 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -458,8 +458,6 @@ struct bfq_data { - - /* bfq_queue in service */ - struct bfq_queue *in_service_queue; -- /* bfq_io_cq (bic) associated with the @in_service_queue */ -- struct bfq_io_cq *in_service_bic; - - /* on-disk position of the last served request */ - sector_t last_position; -@@ -621,15 +619,6 @@ struct bfq_data { - struct bfq_queue *bio_bfqq; - /* Extra flag used only for TESTING */ - bool bio_bfqq_set; -- -- /* -- * io context to put right after bfqd->lock is released. This -- * filed is used to perform put_io_context, when needed, to -- * after the scheduler lock has been released, and thus -- * prevent an ioc->lock from being possibly taken while the -- * scheduler lock is being held. -- */ -- struct io_context *ioc_to_put; - }; - - enum bfqq_state_flags { -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 85e59eeb3569..9c4e6797d8c9 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -1904,23 +1904,6 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) - struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; - struct bfq_entity *entity = in_serv_entity; - -- if (bfqd->in_service_bic) { --#ifdef BFQ_MQ -- BUG_ON(bfqd->ioc_to_put); -- /* -- * Schedule the release of a reference to -- * bfqd->in_service_bic->icq.ioc to right after the -- * scheduler lock is released. This ioc is not -- * released immediately, to not risk to possibly take -- * an ioc->lock while holding the scheduler lock. 
-- */ -- bfqd->ioc_to_put = bfqd->in_service_bic->icq.ioc; --#else -- put_io_context(bfqd->in_service_bic->icq.ioc); --#endif -- bfqd->in_service_bic = NULL; -- } -- - bfq_clear_bfqq_wait_request(in_serv_bfqq); - hrtimer_try_to_cancel(&bfqd->idle_slice_timer); - bfqd->in_service_queue = NULL; - -From 1521ad11f8684cf0a1b7249249cd406fee50da6d Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Wed, 29 Mar 2017 18:41:46 +0200 -Subject: [PATCH 23/51] BUGFIX: Remove unneeded and deadlock-causing lock in - request_merged - -Signed-off-by: Paolo Valente ---- - block/bfq-mq-iosched.c | 2 -- - 1 file changed, 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 8e7589d3280f..bb046335ff4f 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1761,7 +1761,6 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - BUG_ON(RQ_BFQQ(req) != bfqq); - elv_rb_add(&bfqq->sort_list, req); - -- spin_lock_irq(&bfqd->lock); - /* Choose next request to be served for bfqq */ - prev = bfqq->next_rq; - next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, -@@ -1783,7 +1782,6 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - bfq_updated_next_req(bfqd, bfqq); - bfq_pos_tree_add_move(bfqd, bfqq); - } -- spin_unlock_irq(&bfqd->lock); - } - } - - -From 9136b4c953918ea937254c57cfb787b55b5bc2c6 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Wed, 29 Mar 2017 18:55:30 +0200 -Subject: [PATCH 24/51] Fix wrong unlikely - -Signed-off-by: Paolo Valente ---- - block/bfq-mq-iosched.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index bb046335ff4f..3ae9bd424b3f 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4917,7 +4917,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, - &new_queue); - -- if (unlikely(!new_queue)) { -+ if (likely(!new_queue)) { - /* If the queue was seeky for too long, break it apart. 
*/ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { - BUG_ON(!is_sync); - -From 8e05f722f19645f2278f6962368ca3b7c2a81c9c Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Fri, 12 May 2017 09:51:18 +0200 -Subject: [PATCH 25/51] Change cgroup params prefix to bfq-mq for bfq-mq - -Signed-off-by: Paolo Valente ---- - block/bfq-cgroup-included.c | 54 ++++++++++++++++++++++++++------------------- - 1 file changed, 31 insertions(+), 23 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index dfacca799b5e..9e9b0a09e26f 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -995,9 +995,15 @@ bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) - return blkg_to_bfqg(bfqd->queue->root_blkg); - } - -+#ifdef BFQ_MQ -+#define BFQ_CGROUP_FNAME(param) "bfq-mq."#param -+#else -+#define BFQ_CGROUP_FNAME(param) "bfq."#param -+#endif -+ - static struct cftype bfq_blkcg_legacy_files[] = { - { -- .name = "bfq.weight", -+ .name = BFQ_CGROUP_FNAME(weight), - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = bfq_io_show_weight, - .write_u64 = bfq_io_set_weight_legacy, -@@ -1005,106 +1011,106 @@ static struct cftype bfq_blkcg_legacy_files[] = { - - /* statistics, covers only the tasks in the bfqg */ - { -- .name = "bfq.time", -+ .name = BFQ_CGROUP_FNAME(time), - .private = offsetof(struct bfq_group, stats.time), - .seq_show = bfqg_print_stat, - }, - { -- .name = "bfq.sectors", -+ .name = BFQ_CGROUP_FNAME(sectors), - .seq_show = bfqg_print_stat_sectors, - }, - { -- .name = "bfq.io_service_bytes", -+ .name = BFQ_CGROUP_FNAME(io_service_bytes), - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes, - }, - { -- .name = "bfq.io_serviced", -+ .name = BFQ_CGROUP_FNAME(io_serviced), - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios, - }, - { -- .name = "bfq.io_service_time", -+ .name = BFQ_CGROUP_FNAME(io_service_time), - .private = offsetof(struct bfq_group, stats.service_time), - .seq_show = bfqg_print_rwstat, - }, - { -- .name = "bfq.io_wait_time", -+ .name = BFQ_CGROUP_FNAME(io_wait_time), - .private = offsetof(struct bfq_group, stats.wait_time), - .seq_show = bfqg_print_rwstat, - }, - { -- .name = "bfq.io_merged", -+ .name = BFQ_CGROUP_FNAME(io_merged), - .private = offsetof(struct bfq_group, stats.merged), - .seq_show = bfqg_print_rwstat, - }, - { -- .name = "bfq.io_queued", -+ .name = BFQ_CGROUP_FNAME(io_queued), - .private = offsetof(struct bfq_group, stats.queued), - .seq_show = bfqg_print_rwstat, - }, - - /* the same statictics which cover the bfqg and its descendants */ - { -- .name = "bfq.time_recursive", -+ .name = BFQ_CGROUP_FNAME(time_recursive), - .private = offsetof(struct bfq_group, stats.time), - .seq_show = bfqg_print_stat_recursive, - }, - { -- .name = "bfq.sectors_recursive", -+ .name = BFQ_CGROUP_FNAME(sectors_recursive), - .seq_show = bfqg_print_stat_sectors_recursive, - }, - { -- .name = "bfq.io_service_bytes_recursive", -+ .name = BFQ_CGROUP_FNAME(io_service_bytes_recursive), - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes_recursive, - }, - { -- .name = "bfq.io_serviced_recursive", -+ .name = BFQ_CGROUP_FNAME(io_serviced_recursive), - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios_recursive, - }, - { -- .name = "bfq.io_service_time_recursive", -+ .name = BFQ_CGROUP_FNAME(io_service_time_recursive), - .private = offsetof(struct bfq_group, stats.service_time), - .seq_show = 
bfqg_print_rwstat_recursive, - }, - { -- .name = "bfq.io_wait_time_recursive", -+ .name = BFQ_CGROUP_FNAME(io_wait_time_recursive), - .private = offsetof(struct bfq_group, stats.wait_time), - .seq_show = bfqg_print_rwstat_recursive, - }, - { -- .name = "bfq.io_merged_recursive", -+ .name = BFQ_CGROUP_FNAME(io_merged_recursive), - .private = offsetof(struct bfq_group, stats.merged), - .seq_show = bfqg_print_rwstat_recursive, - }, - { -- .name = "bfq.io_queued_recursive", -+ .name = BFQ_CGROUP_FNAME(io_queued_recursive), - .private = offsetof(struct bfq_group, stats.queued), - .seq_show = bfqg_print_rwstat_recursive, - }, - { -- .name = "bfq.avg_queue_size", -+ .name = BFQ_CGROUP_FNAME(avg_queue_size), - .seq_show = bfqg_print_avg_queue_size, - }, - { -- .name = "bfq.group_wait_time", -+ .name = BFQ_CGROUP_FNAME(group_wait_time), - .private = offsetof(struct bfq_group, stats.group_wait_time), - .seq_show = bfqg_print_stat, - }, - { -- .name = "bfq.idle_time", -+ .name = BFQ_CGROUP_FNAME(idle_time), - .private = offsetof(struct bfq_group, stats.idle_time), - .seq_show = bfqg_print_stat, - }, - { -- .name = "bfq.empty_time", -+ .name = BFQ_CGROUP_FNAME(empty_time), - .private = offsetof(struct bfq_group, stats.empty_time), - .seq_show = bfqg_print_stat, - }, - { -- .name = "bfq.dequeue", -+ .name = BFQ_CGROUP_FNAME(dequeue), - .private = offsetof(struct bfq_group, stats.dequeue), - .seq_show = bfqg_print_stat, - }, -@@ -1113,7 +1119,7 @@ static struct cftype bfq_blkcg_legacy_files[] = { - - static struct cftype bfq_blkg_files[] = { - { -- .name = "bfq.weight", -+ .name = BFQ_CGROUP_FNAME(weight), - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = bfq_io_show_weight, - .write = bfq_io_set_weight, -@@ -1121,6 +1127,8 @@ static struct cftype bfq_blkg_files[] = { - {} /* terminate */ - }; - -+#undef BFQ_CGROUP_FNAME -+ - #else /* BFQ_GROUP_IOSCHED_ENABLED */ - - static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, - -From abdf7565dadbb00e78be5f4fb2cc9b157649840e Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Fri, 12 May 2017 11:56:13 +0200 -Subject: [PATCH 26/51] Add tentative extra tests on groups, reqs and queues - -Signed-off-by: Paolo Valente ---- - block/bfq-cgroup-included.c | 1 + - block/bfq-mq-iosched.c | 5 +++++ - include/linux/blkdev.h | 2 ++ - 3 files changed, 8 insertions(+) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index 9e9b0a09e26f..72107ad12220 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -412,6 +412,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd) - BUG_ON(!blkg); - bfqg = blkg_to_bfqg(blkg); - bfqd = blkg->q->elevator->elevator_data; -+ BUG_ON(bfqg == bfqd->root_group); - entity = &bfqg->entity; - d = blkcg_to_bfqgd(blkg->blkcg); - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 3ae9bd424b3f..a9e3406fef06 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4494,6 +4494,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; -+ BUG_ON(!bfqq); - - assert_spin_locked(&bfqd->lock); - -@@ -4587,6 +4588,9 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - "insert_request %p in disp: at_head %d", - rq, at_head); - } else { -+ BUG_ON(!(rq->rq_flags & RQF_GOT)); -+ rq->rq_flags &= ~RQF_GOT; -+ - __bfq_insert_request(bfqd, rq); - - if (rq_mergeable(rq)) { -@@ -4974,6 
+4978,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - if (unlikely(bfq_bfqq_just_created(bfqq))) - bfq_handle_burst(bfqd, bfqq); - -+ rq->rq_flags |= RQF_GOT; - spin_unlock_irq(&bfqd->lock); - - return 0; -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 0048e59e6d07..9ae814743095 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -123,6 +123,8 @@ typedef __u32 __bitwise req_flags_t; - #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) - /* DEBUG: rq in bfq-mq dispatch list */ - #define RQF_DISP_LIST ((__force req_flags_t)(1 << 19)) -+/* DEBUG: rq had get_rq_private executed on it */ -+#define RQF_GOT ((__force req_flags_t)(1 << 20)) - - /* flags that prevent us from merging requests: */ - #define RQF_NOMERGE_FLAGS \ - -From 9e1c1514bc947c4e04502331372b1cc58459d8d1 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Mon, 15 May 2017 22:25:03 +0200 -Subject: [PATCH 27/51] block, bfq-mq: access and cache blkg data only when - safe - -In blk-cgroup, operations on blkg objects are protected with the -request_queue lock. This is no more the lock that protects -I/O-scheduler operations in blk-mq. In fact, the latter are now -protected with a finer-grained per-scheduler-instance lock. As a -consequence, although blkg lookups are also rcu-protected, blk-mq I/O -schedulers may see inconsistent data when they access blkg and -blkg-related objects. BFQ does access these objects, and does incur -this problem, in the following case. - -The blkg_lookup performed in bfq_get_queue, being protected (only) -through rcu, may happen to return the address of a copy of the -original blkg. If this is the case, then the blkg_get performed in -bfq_get_queue, to pin down the blkg, is useless: it does not prevent -blk-cgroup code from destroying both the original blkg and all objects -directly or indirectly referred by the copy of the blkg. BFQ accesses -these objects, which typically causes a crash for NULL-pointer -dereference or memory-protection violation. - -Some additional protection mechanism should be added to blk-cgroup to -address this issue. In the meantime, this commit provides a quick -temporary fix for BFQ: cache (when safe) blkg data that might -disappear right after a blkg_lookup. - -In particular, this commit exploits the following facts to achieve its -goal without introducing further locks. Destroy operations on a blkg -invoke, as a first step, hooks of the scheduler associated with the -blkg. And these hooks are executed with bfqd->lock held for BFQ. As a -consequence, for any blkg associated with the request queue an -instance of BFQ is attached to, we are guaranteed that such a blkg is -not destroyed, and that all the pointers it contains are consistent, -while that instance is holding its bfqd->lock. A blkg_lookup performed -with bfqd->lock held then returns a fully consistent blkg, which -remains consistent while this lock is held. In more detail, this holds -even if the returned blkg is a copy of the original one. - -Finally, also the object describing a group inside BFQ needs to be -protected from destruction on the blkg_free of the original blkg -(which invokes bfq_pd_free). This commit adds private refcounting for -this object, to let it disappear only after no bfq_queue refers to it -any longer. - -This commit also removes or updates some stale comments on locking -issues related to blk-cgroup operations.
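-
-For illustration, a minimal sketch of the private-refcounting pattern
-just described; names and types here are simplified stand-ins, while
-the actual code ties the counter to struct bfq_group, takes it in
-bfqg_get and drops it in bfqg_put:
-
-	#include <linux/slab.h>
-
-	struct group_sketch {
-		int ref;		/* private refcount, under bfqd->lock */
-		char blkg_path[128];	/* cached data, usable after the blkg is gone */
-	};
-
-	static struct group_sketch *group_sketch_alloc(void)
-	{
-		struct group_sketch *g = kzalloc(sizeof(*g), GFP_KERNEL);
-
-		if (g)
-			g->ref = 1;	/* reference held by the allocating path */
-		return g;
-	}
-
-	static void group_sketch_get(struct group_sketch *g)
-	{
-		g->ref++;		/* caller holds the scheduler lock */
-	}
-
-	static void group_sketch_put(struct group_sketch *g)
-	{
-		if (--g->ref == 0)
-			kfree(g);	/* group may outlive the blkg that created it */
-	}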
- -Reported-by: Tomas Konir -Reported-by: Lee Tibbert -Reported-by: Marco Piazza -Signed-off-by: Paolo Valente -Tested-by: Tomas Konir -Tested-by: Lee Tibbert -Tested-by: Marco Piazza ---- - block/bfq-cgroup-included.c | 149 ++++++++++++++++++++++++++++++++++++++++---- - block/bfq-mq-iosched.c | 2 +- - block/bfq-mq.h | 26 +++----- - 3 files changed, 148 insertions(+), 29 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index 72107ad12220..d903393ee78a 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -43,7 +43,11 @@ BFQG_FLAG_FNS(idling) - BFQG_FLAG_FNS(empty) - #undef BFQG_FLAG_FNS - -+#ifdef BFQ_MQ -+/* This should be called with the scheduler lock held. */ -+#else - /* This should be called with the queue_lock held. */ -+#endif - static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) - { - unsigned long long now; -@@ -58,7 +62,11 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) - bfqg_stats_clear_waiting(stats); - } - -+#ifdef BFQ_MQ -+/* This should be called with the scheduler lock held. */ -+#else - /* This should be called with the queue_lock held. */ -+#endif - static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, - struct bfq_group *curr_bfqg) - { -@@ -72,7 +80,11 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, - bfqg_stats_mark_waiting(stats); - } - -+#ifdef BFQ_MQ -+/* This should be called with the scheduler lock held. */ -+#else - /* This should be called with the queue_lock held. */ -+#endif - static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) - { - unsigned long long now; -@@ -198,14 +210,43 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) - - static void bfqg_get(struct bfq_group *bfqg) - { -- return blkg_get(bfqg_to_blkg(bfqg)); -+#ifdef BFQ_MQ -+ bfqg->ref++; -+#else -+ blkg_get(bfqg_to_blkg(bfqg)); -+#endif - } - - static void bfqg_put(struct bfq_group *bfqg) - { -- return blkg_put(bfqg_to_blkg(bfqg)); -+#ifdef BFQ_MQ -+ bfqg->ref--; -+ -+ BUG_ON(bfqg->ref < 0); -+ if (bfqg->ref == 0) -+ kfree(bfqg); -+#else -+ blkg_put(bfqg_to_blkg(bfqg)); -+#endif -+} -+ -+#ifdef BFQ_MQ -+static void bfqg_and_blkg_get(struct bfq_group *bfqg) -+{ -+ /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ -+ bfqg_get(bfqg); -+ -+ blkg_get(bfqg_to_blkg(bfqg)); - } - -+static void bfqg_and_blkg_put(struct bfq_group *bfqg) -+{ -+ bfqg_put(bfqg); -+ -+ blkg_put(bfqg_to_blkg(bfqg)); -+} -+#endif -+ - static void bfqg_stats_update_io_add(struct bfq_group *bfqg, - struct bfq_queue *bfqq, - unsigned int op) -@@ -310,7 +351,15 @@ static void bfq_init_entity(struct bfq_entity *entity, - if (bfqq) { - bfqq->ioprio = bfqq->new_ioprio; - bfqq->ioprio_class = bfqq->new_ioprio_class; -+#ifdef BFQ_MQ -+ /* -+ * Make sure that bfqg and its associated blkg do not -+ * disappear before entity. 
-+ */ -+ bfqg_and_blkg_get(bfqg); -+#else - bfqg_get(bfqg); -+#endif - } - entity->parent = bfqg->my_entity; /* NULL for root group */ - entity->sched_data = &bfqg->sched_data; -@@ -397,6 +446,10 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) - return NULL; - } - -+#ifdef BFQ_MQ -+ /* see comments in bfq_bic_update_cgroup for why refcounting */ -+ bfqg_get(bfqg); -+#endif - return &bfqg->pd; - } - -@@ -432,7 +485,11 @@ static void bfq_pd_free(struct blkg_policy_data *pd) - struct bfq_group *bfqg = pd_to_bfqg(pd); - - bfqg_stats_exit(&bfqg->stats); -- return kfree(bfqg); -+#ifdef BFQ_MQ -+ bfqg_put(bfqg); -+#else -+ kfree(bfqg); -+#endif - } - - static void bfq_pd_reset_stats(struct blkg_policy_data *pd) -@@ -516,9 +573,16 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - * Move @bfqq to @bfqg, deactivating it from its old group and reactivating - * it on the new one. Avoid putting the entity on the old group idle tree. - * -+#ifdef BFQ_MQ -+ * Must be called under the scheduler lock, to make sure that the blkg -+ * owning @bfqg does not disappear (see comments in -+ * bfq_bic_update_cgroup on guaranteeing the consistency of blkg -+ * objects). -+#else - * Must be called under the queue lock; the cgroup owning @bfqg must - * not disappear (by now this just means that we are called under - * rcu_read_lock()). -+#endif - */ - static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_group *bfqg) -@@ -555,16 +619,20 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - entity->tree); - bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); - } -+#ifdef BFQ_MQ -+ bfqg_and_blkg_put(bfqq_group(bfqq)); -+#else - bfqg_put(bfqq_group(bfqq)); -+#endif - -- /* -- * Here we use a reference to bfqg. We don't need a refcounter -- * as the cgroup reference will not be dropped, so that its -- * destroy() callback will not be invoked. -- */ - entity->parent = bfqg->my_entity; - entity->sched_data = &bfqg->sched_data; -+#ifdef BFQ_MQ -+ /* pin down bfqg and its associated blkg */ -+ bfqg_and_blkg_get(bfqg); -+#else - bfqg_get(bfqg); -+#endif - - BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); - if (bfq_bfqq_busy(bfqq)) { -@@ -585,8 +653,14 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * @bic: the bic to move. - * @blkcg: the blk-cgroup to move to. - * -+#ifdef BFQ_MQ -+ * Move bic to blkcg, assuming that bfqd->lock is held; which makes -+ * sure that the reference to cgroup is valid across the call (see -+ * comments in bfq_bic_update_cgroup on this issue) -+#else - * Move bic to blkcg, assuming that bfqd->queue is locked; the caller - * has to make sure that the reference to cgroup is valid across the call. -+#endif - * - * NOTE: an alternative approach might have been to store the current - * cgroup in bfqq and getting a reference to it, reducing the lookup -@@ -645,6 +719,59 @@ static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) - goto out; - - bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); -+#ifdef BFQ_MQ -+ /* -+ * Update blkg_path for bfq_log_* functions. We cache this -+ * path, and update it here, for the following -+ * reasons. Operations on blkg objects in blk-cgroup are -+ * protected with the request_queue lock, and not with the -+ * lock that protects the instances of this scheduler -+ * (bfqd->lock). This exposes BFQ to the following sort of -+ * race. 
-+ * -+ * The blkg_lookup performed in bfq_get_queue, protected -+ * through rcu, may happen to return the address of a copy of -+ * the original blkg. If this is the case, then the -+ * bfqg_and_blkg_get performed in bfq_get_queue, to pin down -+ * the blkg, is useless: it does not prevent blk-cgroup code -+ * from destroying both the original blkg and all objects -+ * directly or indirectly referred by the copy of the -+ * blkg. -+ * -+ * On the bright side, destroy operations on a blkg invoke, as -+ * a first step, hooks of the scheduler associated with the -+ * blkg. And these hooks are executed with bfqd->lock held for -+ * BFQ. As a consequence, for any blkg associated with the -+ * request queue this instance of the scheduler is attached -+ * to, we are guaranteed that such a blkg is not destroyed, and -+ * that all the pointers it contains are consistent, while we -+ * are holding bfqd->lock. A blkg_lookup performed with -+ * bfqd->lock held then returns a fully consistent blkg, which -+ * remains consistent until this lock is held. -+ * -+ * Thanks to the last fact, and to the fact that: (1) bfqg has -+ * been obtained through a blkg_lookup in the above -+ * assignment, and (2) bfqd->lock is being held, here we can -+ * safely use the policy data for the involved blkg (i.e., the -+ * field bfqg->pd) to get to the blkg associated with bfqg, -+ * and then we can safely use any field of blkg. After we -+ * release bfqd->lock, even just getting blkg through this -+ * bfqg may cause dangling references to be traversed, as -+ * bfqg->pd may not exist any more. -+ * -+ * In view of the above facts, here we cache, in the bfqg, any -+ * blkg data we may need for this bic, and for its associated -+ * bfq_queue. As of now, we need to cache only the path of the -+ * blkg, which is used in the bfq_log_* functions. -+ * -+ * Finally, note that bfqg itself needs to be protected from -+ * destruction on the blkg_free of the original blkg (which -+ * invokes bfq_pd_free). We use an additional private -+ * refcounter for bfqg, to let it disappear only after no -+ * bfq_queue refers to it any longer. -+ */ -+ blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path)); -+#endif - bic->blkcg_serial_nr = serial_nr; - out: - rcu_read_unlock(); -@@ -682,8 +809,6 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, - * @bfqd: the device data structure with the root group. - * @bfqg: the group to move from. - * @st: the service tree with the entities. -- * -- * Needs queue_lock to be taken and reference to be valid over the call. - */ - static void bfq_reparent_active_entities(struct bfq_data *bfqd, - struct bfq_group *bfqg, -@@ -736,6 +861,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - #ifdef BFQ_MQ - spin_lock_irqsave(&bfqd->lock, flags); - #endif -+ - /* - * Empty all service_trees belonging to this group before - * deactivating the group itself. -@@ -746,8 +872,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - /* - * The idle tree may still contain bfq_queues belonging - * to exited task because they never migrated to a different -- * cgroup from the one being destroyed now. No one else -- * can access them so it's safe to act without any lock. -+ * cgroup from the one being destroyed now. 
- */ - bfq_flush_idle_tree(st); - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index a9e3406fef06..4eb668eeacdc 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4073,7 +4073,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - - kmem_cache_free(bfq_pool, bfqq); - #ifdef BFQ_GROUP_IOSCHED_ENABLED -- bfqg_put(bfqg); -+ bfqg_and_blkg_put(bfqg); - #endif - } - -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 36ee24a87dda..77ab0f22ed22 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -695,23 +695,17 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -- char __pbuf[128]; \ -- \ -- blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ - pr_crit("%s bfq%d%c %s " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- __pbuf, ##args); \ -+ bfqq_group(bfqq)->blkg_path, ##args); \ - } while (0) - - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -- char __pbuf[128]; \ -- \ -- blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ - pr_crit("%s %s " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -- __pbuf, ##args); \ -+ bfqg->blkg_path, ##args); \ - } while (0) - - #else /* BFQ_GROUP_IOSCHED_ENABLED */ -@@ -736,20 +730,14 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -- char __pbuf[128]; \ -- \ -- blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- __pbuf, ##args); \ -+ bfqq_group(bfqq)->blkg_path, ##args); \ - } while (0) - - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -- char __pbuf[128]; \ -- \ -- blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -- blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ -+ blk_add_trace_msg((bfqd)->queue, "%s " fmt, bfqg->blkg_path, ##args);\ - } while (0) - - #else /* BFQ_GROUP_IOSCHED_ENABLED */ -@@ -860,6 +848,12 @@ struct bfq_group { - /* must be the first member */ - struct blkg_policy_data pd; - -+ /* cached path for this blkg (see comments in bfq_bic_update_cgroup) */ -+ char blkg_path[128]; -+ -+ /* reference counter (see comments in bfq_bic_update_cgroup) */ -+ int ref; -+ - struct bfq_entity entity; - struct bfq_sched_data sched_data; - - -From c9137b749aceef6c2dde88e99b2fc978d5952e76 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Sat, 17 Jun 2017 11:18:11 +0200 -Subject: [PATCH 28/51] bfq-mq: fix macro name in conditional invocation of - policy_unregister - -This commit fixes the name of the macro in the conditional group that -invokes blkcg_policy_unregister in bfq_exit for bfq-mq. Because of -this error, blkcg_policy_unregister was never invoked. 
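-
-As a sketch of the failure mode (the identifiers below are made up for
-illustration), an #ifdef on a macro name that is never defined compiles
-the guarded block away silently, with no diagnostic at all:
-
-	/* the name the code actually defines */
-	#define BFQ_GROUP_IOSCHED_ENABLED 1
-
-	static int policy_registered = 1;	/* illustrative state */
-
-	static void exit_sketch(void)
-	{
-	#ifdef CONFIG_BFQ_GROUP_ENABLED		/* misspelled: never defined */
-		policy_registered = 0;		/* silently compiled out */
-	#endif
-	}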
- -Signed-off-by: Paolo Valente ---- - block/bfq-mq-iosched.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 4eb668eeacdc..bc1de3f70ea8 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -5669,7 +5669,7 @@ static int __init bfq_init(void) - static void __exit bfq_exit(void) - { - elv_unregister(&iosched_bfq_mq); --#ifdef CONFIG_BFQ_GROUP_ENABLED -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); - #endif - bfq_slab_kill(); - -From c7ceb37496f63b2dba4d06946ab85ec97b87bfb5 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Wed, 5 Jul 2017 11:48:17 +0200 -Subject: [PATCH 29/51] Port of "blk-mq-sched: unify request finished methods" - -No need to have two different callouts of bfq vs kyber. - -Signed-off-by: Christoph Hellwig -Signed-off-by: Jens Axboe ---- - block/bfq-mq-iosched.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index bc1de3f70ea8..2598602a0b10 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4753,7 +4753,7 @@ static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) - bfq_put_queue(bfqq); - } - --static void bfq_put_rq_private(struct request_queue *q, struct request *rq) -+static void bfq_finish_request(struct request *rq) - { - struct bfq_queue *bfqq; - struct bfq_data *bfqd; -@@ -4814,7 +4814,7 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) - - assert_spin_locked(&bfqd->lock); - if (!RB_EMPTY_NODE(&rq->rb_node)) -- bfq_remove_request(q, rq); -+ bfq_remove_request(rq->q, rq); - bfq_put_rq_priv_body(bfqq); - } - -@@ -5558,7 +5558,7 @@ static struct elv_fs_entry bfq_attrs[] = { - static struct elevator_type iosched_bfq_mq = { - .ops.mq = { - .get_rq_priv = bfq_get_rq_private, -- .put_rq_priv = bfq_put_rq_private, -+ .finish_request = bfq_finish_request, - .exit_icq = bfq_exit_icq, - .insert_requests = bfq_insert_requests, - .dispatch_request = bfq_dispatch_request, - -From 12bef026fe114ab5e2e284772ddc52a8be83fdbc Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Wed, 5 Jul 2017 11:54:57 +0200 -Subject: [PATCH 30/51] Port of "bfq-iosched: fix NULL ioc check in - bfq_get_rq_private" - -icq_to_bic is a container_of operation, so we need to check for NULL -before it. Also move the check outside the spinlock while we're at -it. 
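-
-The order matters because container_of is plain pointer arithmetic. A
-hedged sketch below (simplified macro and structure layout, not the
-kernel's own definitions) shows why a NULL check placed after the
-conversion could never fire:
-
-	#include <stddef.h>
-
-	#define container_of_sketch(ptr, type, member) \
-		((type *)((char *)(ptr) - offsetof(type, member)))
-
-	struct bic_sketch {
-		int flags;			/* icq is not the first member */
-		struct icq_sketch { int n; } icq;
-	};
-
-	static struct bic_sketch *icq_to_bic_sketch(struct icq_sketch *icq)
-	{
-		/* with icq == NULL this yields (char *)0 - offsetof(...):
-		 * a bogus but non-NULL pointer, so check icq first */
-		return container_of_sketch(icq, struct bic_sketch, icq);
-	}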
- -Signed-off-by: Christoph Hellwig -Signed-off-by: Jens Axboe ---- - block/bfq-mq-iosched.c | 15 +++++---------- - 1 file changed, 5 insertions(+), 10 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 2598602a0b10..c57774a60911 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4903,16 +4903,17 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - struct bio *bio) - { - struct bfq_data *bfqd = q->elevator->elevator_data; -- struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); -+ struct bfq_io_cq *bic; - const int is_sync = rq_is_sync(rq); - struct bfq_queue *bfqq; - bool bfqq_already_existing = false, split = false; - bool new_queue = false; - -- spin_lock_irq(&bfqd->lock); -+ if (!rq->elv.icq) -+ return 1; -+ bic = icq_to_bic(rq->elv.icq); - -- if (!bic) -- goto queue_fail; -+ spin_lock_irq(&bfqd->lock); - - bfq_check_ioprio_change(bic, bio); - -@@ -4980,13 +4981,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - - rq->rq_flags |= RQF_GOT; - spin_unlock_irq(&bfqd->lock); -- - return 0; -- --queue_fail: -- spin_unlock_irq(&bfqd->lock); -- -- return 1; - } - - static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) - -From 633e5711347df1bf4ca935fd0aa9118a0054f75d Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Wed, 5 Jul 2017 12:02:16 +0200 -Subject: [PATCH 31/51] Port of "blk-mq-sched: unify request prepare methods" - -This patch makes sure we always allocate requests in the core blk-mq -code and use a common prepare_request method to initialize them for -both mq I/O schedulers. For Kyber an additional limit_depth method -is added that is called before allocating the request. - -Also because none of the initializations can really fail the new method -does not return an error - instead the bfq finish method is hardened -to deal with the no-IOC case. - -Last but not least this removes the abuse of RQF_QUEUE by the blk-mq -scheduling code as RQF_ELVPRIV is all that is needed now. - -Signed-off-by: Christoph Hellwig -Signed-off-by: Jens Axboe ---- - block/bfq-mq-iosched.c | 13 ++++++++----- - 1 file changed, 8 insertions(+), 5 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index c57774a60911..49ffca1ad6e7 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4760,6 +4760,10 @@ static void bfq_finish_request(struct request *rq) - struct bfq_io_cq *bic; - - BUG_ON(!rq); -+ -+ if (!rq->elv.icq) -+ return; -+ - bfqq = RQ_BFQQ(rq); - BUG_ON(!bfqq); - -@@ -4899,9 +4903,9 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, - /* - * Allocate bfq data structures associated with this request.
- */ --static int bfq_get_rq_private(struct request_queue *q, struct request *rq, -- struct bio *bio) -+static void bfq_prepare_request(struct request *rq, struct bio *bio) - { -+ struct request_queue *q = rq->q; - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_io_cq *bic; - const int is_sync = rq_is_sync(rq); -@@ -4910,7 +4914,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - bool new_queue = false; - - if (!rq->elv.icq) -- return 1; -+ return; - bic = icq_to_bic(rq->elv.icq); - - spin_lock_irq(&bfqd->lock); -@@ -4981,7 +4985,6 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - - rq->rq_flags |= RQF_GOT; - spin_unlock_irq(&bfqd->lock); -- return 0; - } - - static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) -@@ -5552,7 +5555,7 @@ static struct elv_fs_entry bfq_attrs[] = { - - static struct elevator_type iosched_bfq_mq = { - .ops.mq = { -- .get_rq_priv = bfq_get_rq_private, -+ .prepare_request = bfq_prepare_request, - .finish_request = bfq_finish_request, - .exit_icq = bfq_exit_icq, - .insert_requests = bfq_insert_requests, - -From 5a321acfce282c3e58ac63582faf6f928ad17f27 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Wed, 5 Jul 2017 12:43:22 +0200 -Subject: [PATCH 32/51] Add list of bfq instances to documentation - -Signed-off-by: Paolo Valente ---- - Documentation/block/bfq-iosched.txt | 11 ++++++++++- - 1 file changed, 10 insertions(+), 1 deletion(-) - -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index 3d6951d63489..8ce6b9a9bacd 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -11,6 +11,15 @@ controllers), BFQ's main features are: - groups (switching back to time distribution when needed to keep - throughput high). - -+If bfq-mq patches have been applied, then the following three -+instances of BFQ are available (otherwise only the first instance): -+- bfq: mainline version of BFQ, for blk-mq -+- bfq-mq: development version of BFQ for blk-mq; this version contains -+ also all latest features not yet landed in mainline, plus many -+ safety checks -+- bfq: BFQ for legacy blk; also this version contains both latest -+ features and safety checks -+ - In its default configuration, BFQ privileges latency over - throughput. So, when needed for achieving a lower latency, BFQ builds - schedules that may lead to a lower throughput. If your main or only -@@ -27,7 +36,7 @@ sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and - to 120-200 MB/s with 4KB random I/O. BFQ is currently being tested on - multi-queue devices too. - --The table of contents follow. Impatients can just jump to Section 3. -+The table of contents follows. Impatients can just jump to Section 3. 
- - CONTENTS - - -From 9f2e5b27227fd9254cc258572dc2d4531838c30b Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Wed, 5 Jul 2017 16:28:00 +0200 -Subject: [PATCH 33/51] bfq-sq: fix prefix of names of cgroups parameters - -Signed-off-by: Paolo Valente ---- - Documentation/block/bfq-iosched.txt | 12 +++++++----- - block/bfq-cgroup-included.c | 2 +- - 2 files changed, 8 insertions(+), 6 deletions(-) - -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index 8ce6b9a9bacd..965d82f94db9 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -503,10 +503,12 @@ To get proportional sharing of bandwidth with BFQ for a given device, - BFQ must of course be the active scheduler for that device. - - Within each group directory, the names of the files associated with --BFQ-specific cgroup parameters and stats begin with the "bfq." --prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for --BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group --parameter to set the weight of a group with BFQ is blkio.bfq.weight -+BFQ-specific cgroup parameters and stats begin with the "bfq.", -+"bfq-sq." or "bfq-mq." prefix, depending on which instance of bfq you -+want to use. So, with cgroups-v1 or cgroups-v2, the full prefix for -+BFQ-specific files is "blkio.bfqX." or "io.bfqX.", where X can be "" -+(i.e., null string), "-sq" or "-mq". For example, the group parameter -+to set the weight of a group with the mainline BFQ is blkio.bfq.weight - or io.bfq.weight. - - Parameters to set -@@ -514,7 +516,7 @@ Parameters to set - - For each group, there is only the following parameter to set. - --weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the -+weight (namely blkio.bfqX.weight or io.bfqX.weight): the weight of the - group inside its parent. Available values: 1..10000 (default 100). 
The - linear mapping between ioprio and weights, described at the beginning - of the tunable section, is still valid, but all weights higher than -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index d903393ee78a..631e53d9150d 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -1124,7 +1124,7 @@ bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) - #ifdef BFQ_MQ - #define BFQ_CGROUP_FNAME(param) "bfq-mq."#param - #else --#define BFQ_CGROUP_FNAME(param) "bfq."#param -+#define BFQ_CGROUP_FNAME(param) "bfq-sq."#param - #endif - - static struct cftype bfq_blkcg_legacy_files[] = { - -From 92b42df8166939ccf26aa450125b5b575cf6d505 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Wed, 5 Jul 2017 21:08:32 +0200 -Subject: [PATCH 34/51] Add to documentation that bfq-mq and bfq-sq contain - last fixes too - -Signed-off-by: Paolo Valente ---- - Documentation/block/bfq-iosched.txt | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index 965d82f94db9..0e59f1c9d30e 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -15,10 +15,10 @@ If bfq-mq patches have been applied, then the following three - instances of BFQ are available (otherwise only the first instance): - - bfq: mainline version of BFQ, for blk-mq - - bfq-mq: development version of BFQ for blk-mq; this version contains -- also all latest features not yet landed in mainline, plus many -+ also all latest features and fixes not yet landed in mainline, plus many - safety checks --- bfq: BFQ for legacy blk; also this version contains both latest -- features and safety checks -+- bfq: BFQ for legacy blk; also this version contains latest features -+ and fixes, as well as safety checks - - In its default configuration, BFQ privileges latency over - throughput. So, when needed for achieving a lower latency, BFQ builds - -From 7f9bdd433b848d4f53c167258bf4d0b3f1ae1923 Mon Sep 17 00:00:00 2001 -From: Lee Tibbert -Date: Wed, 19 Jul 2017 10:28:32 -0400 -Subject: [PATCH 35/51] Improve most frequently used no-logging path - -This patch originated as a fix for compiler unused-variable warnings -issued when compiling bfq-mq with logging disabled (both -CONFIG_BLK_DEV_IO_TRACE and CONFIG_BFQ_REDIRECT_TO_CONSOLE -undefined). - -It turns out to have benefits for the bfq-sq path as well. - -In most performance sensitive production builds blktrace_api logging -will probably be turned off, so it is worth making the no-logging path -compile without warnings. Any performance benefit is a bonus. - -Thank you to T. B. on the bfq-iosched@googlegroups.com list -for ((void) (bfqq)) simplification/suggestion/improvement. All bugs -and unclear descriptions are my own doing. - -The discussion below is based on the gcc compiler with optimization -level of at least -O2. Lower optimization levels are unlikely to -remove no-op instruction equivalents. - -Provide three improvements in this likely case. - - 1) Fix multiple occurrences of an unused-variable warning - issued when compiling bfq-mq with no logging. The warning - occurred each time the bfq_log_bfqg macro was expanded inside - a code block such as the following snippet from - block/bfq-sched.c, line 139 and a few following, lightly edited for - indentation in order to pass checkpatch.pl maximum line lengths.
-else { - struct bfq_group *bfqg = - container_of(next_in_service, - struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "update_next_in_service: chosen this entity"); - } - - Previously bfq-mq.h expanded bfq_log_bfqg to blk_add_trace_msg. - When both bfq console logging and blktrace_api logging are - disabled, include/linux/blktrace_api expands to - do { } while (0), leaving the code block local variable unused. - - bfq_log_bfqq() had similar behavior but is never called with - a potentially unused variable. This patch fixes that macro for - consistency. - - bfq-sq.h (single queue) with blktrace_api enabled, and the bfq - console logging macros have code paths which do not trigger this - warning. - - kernel.org (4.12 & 4.13) bfq (bfq-iosched.h) could trigger - the warning but no code does so now. This patch fixes - bfq-iosched.h for consistency. - - The style above enables a software engineering approach where - complex expressions are moved to a local variable before the - bfq_log* call. This makes it easier to read the expression and - use breakpoints to verify it. bfq-mq uses this approach in - several places. - - New bfq_log* macros are provided for the no-logging case. - I touch only the second argument, because current code never - uses the local variable approach with the first or other - arguments. I tried to balance consistency with simplicity. - - 2) For bfq-sq, reduce to zero the number of instructions executed - when no logging is configured. No sense marshaling arguments - which are never going to be used. - - On a trial V8R11 build, this reduced the size of bfq-iosched.o - by 14.3 KiB. The size went from 70304 to 55664 bytes. - - bfq-mq and kernel.org bfq code size does not change because - existing macros already optimize to zero bytes when not logging. - The current change maintains consistency with the bfq-sq path - and makes the bfq-mq & bfq no-logging paths resistant to future - logging path macro changes which might otherwise generate code. - - 3) Slightly reduce compile time of all bfq variants by including - blktrace_api.h only when it will be used. - -Signed-off-by: Lee Tibbert ---- - block/bfq-mq.h | 18 +++++++++++++++++- - block/bfq.h | 18 +++++++++++++++++- - 2 files changed, 34 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 77ab0f22ed22..7ed2cc29be57 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -15,7 +15,6 @@ - #ifndef _BFQ_H - #define _BFQ_H - --#include - #include - #include - -@@ -725,6 +724,21 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - ##args) - - #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+ -+#if !defined(CONFIG_BLK_DEV_IO_TRACE) -+ -+/* Avoid possible "unused-variable" warning. See commit message. */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) ((void) (bfqq)) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) ((void) (bfqg)) -+ -+#define bfq_log(bfqd, fmt, args...) do {} while (0) -+ -+#else /* CONFIG_BLK_DEV_IO_TRACE */ -+ -+#include -+ - #ifdef BFQ_GROUP_IOSCHED_ENABLED - static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -@@ -752,6 +766,8 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - - #define bfq_log(bfqd, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+ -+#endif /* CONFIG_BLK_DEV_IO_TRACE */ - #endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ - - /* Expiration reasons.
*/ -diff --git a/block/bfq.h b/block/bfq.h -index 53954d1b87f8..15d326f466b7 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -15,7 +15,6 @@ - #ifndef _BFQ_H - #define _BFQ_H - --#include - #include - #include - -@@ -725,6 +724,21 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - ##args) - - #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+ -+#if !defined(CONFIG_BLK_DEV_IO_TRACE) -+ -+/* Avoid possible "unused-variable" warning. See commit message. */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) ((void) (bfqq)) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) ((void) (bfqg)) -+ -+#define bfq_log(bfqd, fmt, args...) do {} while (0) -+ -+#else /* CONFIG_BLK_DEV_IO_TRACE */ -+ -+#include -+ - #ifdef BFQ_GROUP_IOSCHED_ENABLED - static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -@@ -759,6 +773,8 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - - #define bfq_log(bfqd, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+ -+#endif /* CONFIG_BLK_DEV_IO_TRACE */ - #endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ - - /* Expiration reasons. */ - -From f11a0e751e741bf94c6a48234824d50b3c0100ad Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Wed, 9 Aug 2017 16:40:39 +0200 -Subject: [PATCH 36/51] bfq-sq: fix commit "Remove all get and put of I/O - contexts" in branch bfq-mq - -The commit "Remove all get and put of I/O contexts" erroneously removed -the reset of the field in_service_bic for bfq-sq. This commit re-adds -that missing reset. - -Signed-off-by: Paolo Valente ---- - block/bfq-sched.c | 7 +++++++ - block/bfq-sq-iosched.c | 1 + - 2 files changed, 8 insertions(+) - -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 9c4e6797d8c9..7425824c26b8 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -1904,6 +1904,13 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) - struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; - struct bfq_entity *entity = in_serv_entity; - -+#ifndef BFQ_MQ -+ if (bfqd->in_service_bic) { -+ put_io_context(bfqd->in_service_bic->icq.ioc); -+ bfqd->in_service_bic = NULL; -+ } -+#endif -+ - bfq_clear_bfqq_wait_request(in_serv_bfqq); - hrtimer_try_to_cancel(&bfqd->idle_slice_timer); - bfqd->in_service_queue = NULL; -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 25da0d1c0622..e1960bf149d8 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -3765,6 +3765,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, - if (!bfqd->in_service_bic) { - atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); - bfqd->in_service_bic = RQ_BIC(rq); -+ BUG_ON(!bfqd->in_service_bic); - } - - if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) - -From eceae5457530df8598557767d7be258ca9384de4 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Wed, 9 Aug 2017 22:29:01 +0200 -Subject: [PATCH 37/51] bfq-sq-mq: make lookup_next_entity push up vtime on - expirations - -To provide a very smooth service, bfq starts to serve a bfq_queue -only if the queue is 'eligible', i.e., if the same queue would -have started to be served in the ideal, perfectly fair system that -bfq simulates internally. This is obtained by associating each -queue with a virtual start time, and by computing a special system -virtual time quantity: a queue is eligible only if the system -virtual time has reached the virtual start time of the -queue. 
Finally, bfq guarantees that, when a new queue must be set
-in service, there is always at least one eligible entity for each
-active parent entity in the scheduler. To provide this guarantee,
-the function __bfq_lookup_next_entity pushes up, for each parent
-entity on which it is invoked, the system virtual time to the
-minimum among the virtual start times of the entities in the
-active tree for the parent entity (more precisely, the push up
-occurs if the system virtual time happens to be lower than all
-such virtual start times).
-
-There is however a circumstance in which __bfq_lookup_next_entity
-cannot push up the system virtual time for a parent entity, even
-if the system virtual time is lower than the virtual start times
-of all the child entities in the active tree. It happens if one of
-the child entities is in service. In fact, in such a case, there
-is already an eligible entity, the in-service one, even if it may
-not be present in the active tree (because in-service entities
-may be removed from the active tree).
-
-Unfortunately, in the last re-design of the
-hierarchical-scheduling engine, the reset of the pointer to the
-in-service entity for a given parent entity--reset to be done as a
-consequence of the expiration of the in-service entity--always
-happens after the function __bfq_lookup_next_entity has been
-invoked. This causes the function to think that there is still an
-entity in service for the parent entity, and then that the system
-virtual time cannot be pushed up, even if actually such a
-no-more-in-service entity has already been properly reinserted
-into the active tree (or in some other tree if no more
-active). Yet, the system virtual time *had* to be pushed up, to be
-ready to correctly choose the next queue to serve. Because of the
-lack of this push up, bfq may wrongly set in service a queue that
-had been speculatively pre-computed as the possible
-next-in-service queue, but that would no longer be the one to serve
-after the expiration and the reinsertion into the active trees of
-the previously in-service entities.
-
-This commit addresses this issue by making
-__bfq_lookup_next_entity properly push up the system virtual time
-if an expiration is occurring.
-
-Signed-off-by: Paolo Valente
----
- block/bfq-mq-iosched.c | 4 +--
- block/bfq-sched.c | 77 ++++++++++++++++++++++++++++++++------------------
- block/bfq-sq-iosched.c | 4 +--
- 3 files changed, 53 insertions(+), 32 deletions(-)
-
-diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c
-index 49ffca1ad6e7..b5c848650375 100644
---- a/block/bfq-mq-iosched.c
-+++ b/block/bfq-mq-iosched.c
-@@ -682,7 +682,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
- entity->budget = new_budget;
- bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
- new_budget);
-- bfq_requeue_bfqq(bfqd, bfqq);
-+ bfq_requeue_bfqq(bfqd, bfqq, false);
- }
- }
-
-@@ -2822,7 +2822,7 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-
- bfq_del_bfqq_busy(bfqd, bfqq, true);
- } else {
-- bfq_requeue_bfqq(bfqd, bfqq);
-+ bfq_requeue_bfqq(bfqd, bfqq, true);
- /*
- * Resort priority tree of potential close cooperators.
- */ -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 7425824c26b8..f3001af37256 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -33,7 +33,8 @@ static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree) - return rb_entry(node, struct bfq_entity, rb_node); - } - --static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd); -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, -+ bool expiration); - - static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); - -@@ -43,6 +44,8 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); - * @new_entity: if not NULL, pointer to the entity whose activation, - * requeueing or repositionig triggered the invocation of - * this function. -+ * @expiration: id true, this function is being invoked after the -+ * expiration of the in-service entity - * - * This function is called to update sd->next_in_service, which, in - * its turn, may change as a consequence of the insertion or -@@ -61,7 +64,8 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); - * entity. - */ - static bool bfq_update_next_in_service(struct bfq_sched_data *sd, -- struct bfq_entity *new_entity) -+ struct bfq_entity *new_entity, -+ bool expiration) - { - struct bfq_entity *next_in_service = sd->next_in_service; - struct bfq_queue *bfqq; -@@ -120,7 +124,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - if (replace_next) - next_in_service = new_entity; - } else /* invoked because of a deactivation: lookup needed */ -- next_in_service = bfq_lookup_next_entity(sd); -+ next_in_service = bfq_lookup_next_entity(sd, expiration); - - if (next_in_service) { - parent_sched_may_change = !sd->next_in_service || -@@ -1291,10 +1295,12 @@ static void __bfq_activate_requeue_entity(struct bfq_entity *entity, - * @requeue: true if this is a requeue, which implies that bfqq is - * being expired; thus ALL its ancestors stop being served and must - * therefore be requeued -+ * @expiration: true if this function is being invoked in the expiration path -+ * of the in-service queue - */ - static void bfq_activate_requeue_entity(struct bfq_entity *entity, - bool non_blocking_wait_rq, -- bool requeue) -+ bool requeue, bool expiration) - { - struct bfq_sched_data *sd; - -@@ -1307,7 +1313,8 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity, - RB_EMPTY_ROOT(&(sd->service_tree+1)->active) && - RB_EMPTY_ROOT(&(sd->service_tree+2)->active)); - -- if (!bfq_update_next_in_service(sd, entity) && !requeue) { -+ if (!bfq_update_next_in_service(sd, entity, expiration) && -+ !requeue) { - BUG_ON(!sd->next_in_service); - break; - } -@@ -1373,6 +1380,8 @@ static bool __bfq_deactivate_entity(struct bfq_entity *entity, - * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. - * @entity: the entity to deactivate. - * @ins_into_idle_tree: true if the entity can be put into the idle tree -+ * @expiration: true if this function is being invoked in the expiration path -+ * of the in-service queue - */ - static void bfq_deactivate_entity(struct bfq_entity *entity, - bool ins_into_idle_tree, -@@ -1417,7 +1426,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, - * then, since entity has just been - * deactivated, a new one must be found. 
- */ -- bfq_update_next_in_service(sd, NULL); -+ bfq_update_next_in_service(sd, NULL, expiration); - - if (sd->next_in_service || sd->in_service_entity) { - /* -@@ -1495,7 +1504,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, - "invoking udpdate_next for this entity"); - } - #endif -- if (!bfq_update_next_in_service(sd, entity) && -+ if (!bfq_update_next_in_service(sd, entity, expiration) && - !expiration) - /* - * next_in_service unchanged or not causing -@@ -1524,7 +1533,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_vtime_jump: new value %llu", -- root_entity->min_start); -+ ((root_entity->min_start>>10)*1000)>>12); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = -@@ -1533,7 +1542,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "calc_vtime_jump: new value %llu", -- root_entity->min_start); -+ ((root_entity->min_start>>10)*1000)>>12); - } - #endif - return root_entity->min_start; -@@ -1615,17 +1624,9 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, - * 3) is idle. - */ - static struct bfq_entity * --__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service --#if 0 -- , bool force --#endif -- ) -+__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service) - { -- struct bfq_entity *entity --#if 0 -- , *new_next_in_service = NULL --#endif -- ; -+ struct bfq_entity *entity; - u64 new_vtime; - struct bfq_queue *bfqq; - -@@ -1667,8 +1668,9 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "__lookup_next: start %llu vtime %llu st %p", -+ "__lookup_next: start %llu vtime %llu (%llu) st %p", - ((entity->start>>10)*1000)>>12, -+ ((st->vtime>>10)*1000)>>12, - ((new_vtime>>10)*1000)>>12, st); - } - #endif -@@ -1681,12 +1683,14 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service - /** - * bfq_lookup_next_entity - return the first eligible entity in @sd. - * @sd: the sched_data. -+ * @expiration: true if we are on the expiration path of the in-service queue - * - * This function is invoked when there has been a change in the trees -- * for sd, and we need know what is the new next entity after this -- * change. -+ * for sd, and we need to know what is the new next entity to serve -+ * after this change. - */ --static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, -+ bool expiration) - { - struct bfq_service_tree *st = sd->service_tree; - struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); -@@ -1716,8 +1720,24 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) - * class, unless the idle class needs to be served. - */ - for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { -+ /* -+ * If expiration is true, then bfq_lookup_next_entity -+ * is being invoked as a part of the expiration path -+ * of the in-service queue. In this case, even if -+ * sd->in_service_entity is not NULL, -+ * sd->in_service_entiy at this point is actually not -+ * in service any more, and, if needed, has already -+ * been properly queued or requeued into the right -+ * tree. 
The reason why sd->in_service_entity is still
-+ * not NULL here, even if expiration is true, is that
-+ * sd->in_service_entity is reset as a last step in the
-+ * expiration path. So, if expiration is true, tell
-+ * __bfq_lookup_next_entity that there is no
-+ * sd->in_service_entity.
-+ */
- entity = __bfq_lookup_next_entity(st + class_idx,
-- sd->in_service_entity);
-+ sd->in_service_entity &&
-+ !expiration);
-
- if (entity)
- break;
-@@ -1891,7 +1911,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
- for_each_entity(entity) {
- struct bfq_sched_data *sd = entity->sched_data;
-
-- if(!bfq_update_next_in_service(sd, NULL))
-+ if (!bfq_update_next_in_service(sd, NULL, false))
- break;
- }
-
-@@ -1951,16 +1971,17 @@ static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
- entity->on_st);
-
- bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
-- false);
-+ false, false);
- bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
- }
-
--static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-+static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-+ bool expiration)
- {
- struct bfq_entity *entity = &bfqq->entity;
-
- bfq_activate_requeue_entity(entity, false,
-- bfqq == bfqd->in_service_queue);
-+ bfqq == bfqd->in_service_queue, expiration);
- }
-
- static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
-diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c
-index e1960bf149d8..42393ab889a9 100644
---- a/block/bfq-sq-iosched.c
-+++ b/block/bfq-sq-iosched.c
-@@ -644,7 +644,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
- entity->budget = new_budget;
- bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
- new_budget);
-- bfq_requeue_bfqq(bfqd, bfqq);
-+ bfq_requeue_bfqq(bfqd, bfqq, false);
- }
- }
-
-@@ -2715,7 +2715,7 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
-
- bfq_del_bfqq_busy(bfqd, bfqq, true);
- } else {
-- bfq_requeue_bfqq(bfqd, bfqq);
-+ bfq_requeue_bfqq(bfqd, bfqq, true);
- /*
- * Resort priority tree of potential close cooperators.
- */
-
-From ee9f95b24e1d88ffba4845981c2a4684aefd0245 Mon Sep 17 00:00:00 2001
-From: Paolo Valente
-Date: Wed, 9 Aug 2017 22:53:00 +0200
-Subject: [PATCH 38/51] bfq-sq-mq: remove direct switch to an entity in higher
- class
-
-If the function bfq_update_next_in_service is invoked as a consequence
-of the activation or requeueing of an entity, say E, and finds out
-that E belongs to a higher-priority class than that of the current
-next-in-service entity, then it sets next_in_service directly to
-E. But this may lead to anomalous schedules, because E may happen not
-to be eligible for service, because its virtual start time is higher than
-the system virtual time for its service tree.
-
-This commit addresses this issue by simply removing this direct
-switch.
-
-Signed-off-by: Paolo Valente
----
- block/bfq-sched.c | 19 +++++-------------
- 1 file changed, 5 insertions(+), 14 deletions(-)
-
-diff --git a/block/bfq-sched.c b/block/bfq-sched.c
-index f3001af37256..b1a59088db88 100644
---- a/block/bfq-sched.c
-+++ b/block/bfq-sched.c
-@@ -76,9 +76,8 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
- * or repositioning of an entity that does not coincide with
- * sd->next_in_service, then a full lookup in the active tree
- * can be avoided.
In fact, it is enough to check whether the -- * just-modified entity has a higher priority than -- * sd->next_in_service, or, even if it has the same priority -- * as sd->next_in_service, is eligible and has a lower virtual -+ * just-modified entity has the same priority as -+ * sd->next_in_service, is eligible and has a lower virtual - * finish time than sd->next_in_service. If this compound - * condition holds, then the new entity becomes the new - * next_in_service. Otherwise no change is needed. -@@ -94,9 +93,8 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - - /* - * If there is already a next_in_service candidate -- * entity, then compare class priorities or timestamps -- * to decide whether to replace sd->service_tree with -- * new_entity. -+ * entity, then compare timestamps to decide whether -+ * to replace sd->service_tree with new_entity. - */ - if (next_in_service) { - unsigned int new_entity_class_idx = -@@ -104,10 +102,6 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - struct bfq_service_tree *st = - sd->service_tree + new_entity_class_idx; - -- /* -- * For efficiency, evaluate the most likely -- * sub-condition first. -- */ - replace_next = - (new_entity_class_idx == - bfq_class_idx(next_in_service) -@@ -115,10 +109,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - !bfq_gt(new_entity->start, st->vtime) - && - bfq_gt(next_in_service->finish, -- new_entity->finish)) -- || -- new_entity_class_idx < -- bfq_class_idx(next_in_service); -+ new_entity->finish)); - } - - if (replace_next) - -From a3fdc5af40537355b68c1f0d3997c5a5fb54b9ce Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Thu, 10 Aug 2017 08:15:50 +0200 -Subject: [PATCH 39/51] bfq-sq-mq: guarantee update_next_in_service always - returns an eligible entity - -If the function bfq_update_next_in_service is invoked as a consequence -of the activation or requeueing of an entity, say E, then it doesn't -invoke bfq_lookup_next_entity to get the next-in-service entity. In -contrast, it follows a shorter path: if E happens to be eligible (see -commit "bfq-sq-mq: make lookup_next_entity push up vtime on -expirations" for details on eligibility) and to have a lower virtual -finish time than the current candidate as next-in-service entity, then -E directly becomes the next-in-service entity. Unfortunately, there is -a corner case for which this shorter path makes -bfq_update_next_in_service choose a non eligible entity: it occurs if -both E and the current next-in-service entity happen to be non -eligible when bfq_update_next_in_service is invoked. In this case, E -is not set as next-in-service, and, since bfq_lookup_next_entity is -not invoked, the state of the parent entity is not updated so as to -end up with an eligible entity as the proper next-in-service entity. - -In this respect, next-in-service is actually allowed to be non -eligible while some queue is in service: since no system-virtual-time -push-up can be performed in that case (see again commit "bfq-sq-mq: -make lookup_next_entity push up vtime on expirations" for details), -next-in-service is chosen, speculatively, as a function of the -possible value that the system virtual time may get after a push -up. But the correctness of the schedule breaks if next-in-service is -still a non eligible entity when it is time to set in service the next -entity. Unfortunately, this may happen in the above corner case. 
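(An illustrative aside, not part of the patch series: the eligibility rule that patches 37-39 revolve around fits in a few lines of code. The sketch below is a minimal user-space rendition of it, with simplified types and invented names rather than the kernel's structures: an entity may be served only once the service tree's virtual time has reached the entity's virtual start time, and among eligible entities the one with the smallest virtual finish time is picked.)

    #include <stdint.h>
    #include <stdio.h>

    /* Simplified stand-ins for bfq_entity fields (illustrative only). */
    struct entity {
            uint64_t start;   /* virtual start time */
            uint64_t finish;  /* virtual finish time */
    };

    /* An entity is eligible once the system virtual time has
     * reached its virtual start time. */
    static int is_eligible(uint64_t st_vtime, const struct entity *e)
    {
            return e->start <= st_vtime;
    }

    /* Pick the eligible entity with the smallest virtual finish time. */
    static const struct entity *next_to_serve(uint64_t st_vtime,
                                              const struct entity *v, int n)
    {
            const struct entity *best = NULL;

            for (int i = 0; i < n; i++)
                    if (is_eligible(st_vtime, &v[i]) &&
                        (!best || v[i].finish < best->finish))
                            best = &v[i];
            return best;
    }

    int main(void)
    {
            struct entity v[] = { { 10, 40 }, { 25, 30 }, { 5, 50 } };
            const struct entity *e = next_to_serve(20, v, 3);

            /* v[1] has the smallest finish time, but at vtime 20 it is
             * not yet eligible, so v[0] (finish 40) is chosen. */
            printf("chosen finish = %llu\n",
                   (unsigned long long)(e ? e->finish : 0));
            return 0;
    }

Pushing the virtual time up, as these patches do on expiration, simply enlarges the set of eligible entities, so that a correct pick is always possible.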
- -This commit fixes this problem by making bfq_update_next_in_service -invoke bfq_lookup_next_entity not only if the above shorter path -cannot be taken, but also if the shorter path is taken but fails to -yield an eligible next-in-service entity. - -Signed-off-by: Paolo Valente ---- - block/bfq-sched.c | 38 ++++++++++++++++++++++++++++---------- - 1 file changed, 28 insertions(+), 10 deletions(-) - -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index b1a59088db88..e4a2553a2d2c 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -70,6 +70,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - struct bfq_entity *next_in_service = sd->next_in_service; - struct bfq_queue *bfqq; - bool parent_sched_may_change = false; -+ bool change_without_lookup = false; - - /* - * If this update is triggered by the activation, requeueing -@@ -89,7 +90,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - * set to true, and left as true if - * sd->next_in_service is NULL. - */ -- bool replace_next = true; -+ change_without_lookup = true; - - /* - * If there is already a next_in_service candidate -@@ -102,7 +103,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - struct bfq_service_tree *st = - sd->service_tree + new_entity_class_idx; - -- replace_next = -+ change_without_lookup = - (new_entity_class_idx == - bfq_class_idx(next_in_service) - && -@@ -112,15 +113,32 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - new_entity->finish)); - } - -- if (replace_next) -+ if (change_without_lookup) { - next_in_service = new_entity; -- } else /* invoked because of a deactivation: lookup needed */ -+ bfqq = bfq_entity_to_bfqq(next_in_service); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "update_next_in_service: chose without lookup"); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ else { -+ struct bfq_group *bfqg = -+ container_of(next_in_service, -+ struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data*)bfqg->bfqd, bfqg, -+ "update_next_in_service: chose without lookup"); -+ } -+#endif -+ } -+ } -+ -+ if (!change_without_lookup) /* lookup needed */ - next_in_service = bfq_lookup_next_entity(sd, expiration); - -- if (next_in_service) { -+ if (next_in_service) - parent_sched_may_change = !sd->next_in_service || - bfq_update_parent_budget(next_in_service); -- } - - sd->next_in_service = next_in_service; - -@@ -1053,7 +1071,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "__activate_entity: new queue finish %llu", -+ "update_fin_time_enqueue: new queue finish %llu", - ((entity->finish>>10)*1000)>>12); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { -@@ -1061,7 +1079,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "__activate_entity: new group finish %llu", -+ "update_fin_time_enqueue: new group finish %llu", - ((entity->finish>>10)*1000)>>12); - #endif - } -@@ -1071,7 +1089,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "__activate_entity: queue %seligible in st %p", -+ "update_fin_time_enqueue: queue %seligible in st %p", - entity->start <= st->vtime ? 
"" : "non ", st); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { -@@ -1079,7 +1097,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "__activate_entity: group %seligible in st %p", -+ "update_fin_time_enqueue: group %seligible in st %p", - entity->start <= st->vtime ? "" : "non ", st); - #endif - } - -From 6565e4d1aac029b6f0a5d86a4c6ef38608838eac Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Thu, 31 Aug 2017 19:24:26 +0200 -Subject: [PATCH 40/51] doc, block, bfq: fix some typos and stale sentences - -Signed-off-by: Paolo Valente -Reviewed-by: Jeremy Hickman -Reviewed-by: Laurentiu Nicola ---- - Documentation/block/bfq-iosched.txt | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index 0e59f1c9d30e..dcfe15523da3 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -17,7 +17,7 @@ instances of BFQ are available (otherwise only the first instance): - - bfq-mq: development version of BFQ for blk-mq; this version contains - also all latest features and fixes not yet landed in mainline, plus many - safety checks --- bfq: BFQ for legacy blk; also this version contains latest features -+- bfq-sq: BFQ for legacy blk; also this version contains latest features - and fixes, as well as safety checks - - In its default configuration, BFQ privileges latency over - -From 261ee8cc9f43e03d790a07184f0bcaa504ee6737 Mon Sep 17 00:00:00 2001 -From: Luca Miccio -Date: Wed, 13 Sep 2017 12:03:56 +0200 -Subject: [PATCH 41/51] bfq-mq, bfq-sq: Disable writeback throttling - -Similarly to CFQ, BFQ has its write-throttling heuristics, and it -is better not to combine them with further write-throttling -heuristics of a different nature. -So this commit disables write-back throttling for a device if BFQ -is used as I/O scheduler for that device. - -Signed-off-by: Luca Miccio -Signed-off-by: Paolo Valente -Tested-by: Oleksandr Natalenko ---- - block/bfq-mq-iosched.c | 2 ++ - block/bfq-sq-iosched.c | 7 +++++++ - 2 files changed, 9 insertions(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index b5c848650375..7d27d5b3befb 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -89,6 +89,7 @@ - #include "blk-mq-tag.h" - #include "blk-mq-sched.h" - #include "bfq-mq.h" -+#include "blk-wbt.h" - - /* Expiration time of sync (0) and async (1) requests, in ns. */ - static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; -@@ -5260,6 +5261,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - bfq_init_root_group(bfqd->root_group, bfqd); - bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); - -+ wbt_disable_default(q); - return 0; - - out_free: -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 42393ab889a9..6fdc3b1d5bb8 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -83,6 +83,7 @@ - #include - #include "blk.h" - #include "bfq.h" -+#include "blk-wbt.h" - - /* Expiration time of sync (0) and async (1) requests, in ns. 
*/ - static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; -@@ -4976,6 +4977,11 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - return -ENOMEM; - } - -+static void bfq_registered_queue(struct request_queue *q) -+{ -+ wbt_disable_default(q); -+} -+ - static void bfq_slab_kill(void) - { - kmem_cache_destroy(bfq_pool); -@@ -5285,6 +5291,7 @@ static struct elevator_type iosched_bfq = { - .elevator_may_queue_fn = bfq_may_queue, - .elevator_init_fn = bfq_init_queue, - .elevator_exit_fn = bfq_exit_queue, -+ .elevator_registered_fn = bfq_registered_queue, - }, - .icq_size = sizeof(struct bfq_io_cq), - .icq_align = __alignof__(struct bfq_io_cq), - -From 40ea0aed088791da27fcfa51f3b64d1f96b0d06e Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Tue, 12 Sep 2017 16:45:53 +0200 -Subject: [PATCH 42/51] bfq-mq, bfq-sq: fix wrong init of saved start time for - weight raising - -This commit fixes a bug that causes bfq to fail to guarantee a high -responsiveness on some drives, if there is heavy random read+write I/O -in the background. More precisely, such a failure allowed this bug to -be found [1], but the bug may well cause other yet unreported -anomalies. - -BFQ raises the weight of the bfq_queues associated with soft real-time -applications, to privilege the I/O, and thus reduce latency, for these -applications. This mechanism is named soft-real-time weight raising in -BFQ. A soft real-time period may happen to be nested into an -interactive weight raising period, i.e., it may happen that, when a -bfq_queue switches to a soft real-time weight-raised state, the -bfq_queue is already being weight-raised because deemed interactive -too. In this case, BFQ saves in a special variable -wr_start_at_switch_to_srt, the time instant when the interactive -weight-raising period started for the bfq_queue, i.e., the time -instant when BFQ started to deem the bfq_queue interactive. This value -is then used to check whether the interactive weight-raising period -would still be in progress when the soft real-time weight-raising -period ends. If so, interactive weight raising is restored for the -bfq_queue. This restore is useful, in particular, because it prevents -bfq_queues from losing their interactive weight raising prematurely, -as a consequence of spurious, short-lived soft real-time -weight-raising periods caused by wrong detections as soft real-time. - -If, instead, a bfq_queue switches to soft-real-time weight raising -while it *is not* already in an interactive weight-raising period, -then the variable wr_start_at_switch_to_srt has no meaning during the -following soft real-time weight-raising period. Unfortunately the -handling of this case is wrong in BFQ: not only the variable is not -flagged somehow as meaningless, but it is also set to the time when -the switch to soft real-time weight-raising occurs. This may cause an -interactive weight-raising period to be considered mistakenly as still -in progress, and thus a spurious interactive weight-raising period to -start for the bfq_queue, at the end of the soft-real-time -weight-raising period. In particular the spurious interactive -weight-raising period will be considered as still in progress, if the -soft-real-time weight-raising period does not last very long. The -bfq_queue will then be wrongly privileged and, if I/O bound, will -unjustly steal bandwidth to truly interactive or soft real-time -bfq_queues, harming responsiveness and low latency. 
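(An illustrative aside, not part of the patch: the fix below relies on the kernel's wraparound-safe jiffies comparisons, which are what make "jiffies - MAX_JIFFY_OFFSET" behave as a minus infinity. The user-space sketch below mirrors the time_after() idiom and the MAX_JIFFY_OFFSET definition from include/linux/jiffies.h; everything else is invented for the example.)

    #include <limits.h>
    #include <stdio.h>

    /* Wraparound-safe comparison, same idea as the kernel's time_after():
     * a is after b iff the signed difference b - a is negative. */
    #define TIME_AFTER(a, b)  ((long)((b) - (a)) < 0)

    #define MAX_JIFFY_OFFSET  ((LONG_MAX >> 1) - 1)

    int main(void)
    {
            unsigned long jiffies = 123456UL;  /* pretend current time */
            unsigned long minus_inf = jiffies - MAX_JIFFY_OFFSET;
            unsigned long wr_duration = 10000UL;

            /* A weight-raising period that "started" at minus_inf has
             * ended in the far past for any plausible duration, so a
             * nested interactive period can never be mistaken for
             * still being in progress. */
            printf("interactive wr still in progress? %s\n",
                   TIME_AFTER(minus_inf + wr_duration, jiffies) ?
                   "yes" : "no");
            return 0;
    }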
- -This commit fixes this issue by just setting wr_start_at_switch_to_srt -to minus infinity (farthest past time instant according to jiffies -macros): when the soft-real-time weight-raising period ends, certainly -no interactive weight-raising period will be considered as still in -progress. - -[1] Background I/O Type: Random - Background I/O mix: Reads and writes -- Application to start: LibreOffice Writer in -http://www.phoronix.com/scan.php?page=news_item&px=Linux-4.13-IO-Laptop - -Signed-off-by: Paolo Valente -Signed-off-by: Angelo Ruocco -Tested-by: Oleksandr Natalenko -Tested-by: Lee Tibbert -Tested-by: Mirko Montanari ---- - block/bfq-mq-iosched.c | 50 +++++++++++++++++++++++++++++++------------------- - block/bfq-sq-iosched.c | 50 +++++++++++++++++++++++++++++++------------------- - 2 files changed, 62 insertions(+), 38 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 7d27d5b3befb..f378519b6d33 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1204,6 +1204,24 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, - return wr_or_deserves_wr; - } - -+/* -+ * Return the farthest future time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_greatest_from_now(void) -+{ -+ return jiffies + MAX_JIFFY_OFFSET; -+} -+ -+/* -+ * Return the farthest past time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_smallest_from_now(void) -+{ -+ return jiffies - MAX_JIFFY_OFFSET; -+} -+ - static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - unsigned int old_wr_coeff, -@@ -1218,7 +1236,19 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - } else { -- bfqq->wr_start_at_switch_to_srt = jiffies; -+ /* -+ * No interactive weight raising in progress -+ * here: assign minus infinity to -+ * wr_start_at_switch_to_srt, to make sure -+ * that, at the end of the soft-real-time -+ * weight raising periods that is starting -+ * now, no interactive weight-raising period -+ * may be wrongly considered as still in -+ * progress (and thus actually started by -+ * mistake). -+ */ -+ bfqq->wr_start_at_switch_to_srt = -+ bfq_smallest_from_now(); - bfqq->wr_coeff = bfqd->bfq_wr_coeff * - BFQ_SOFTRT_WEIGHT_FACTOR; - bfqq->wr_cur_max_time = -@@ -3174,24 +3204,6 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); - } - --/* -- * Return the farthest future time instant according to jiffies -- * macros. -- */ --static unsigned long bfq_greatest_from_now(void) --{ -- return jiffies + MAX_JIFFY_OFFSET; --} -- --/* -- * Return the farthest past time instant according to jiffies -- * macros. -- */ --static unsigned long bfq_smallest_from_now(void) --{ -- return jiffies - MAX_JIFFY_OFFSET; --} -- - /** - * bfq_bfqq_expire - expire a queue. - * @bfqd: device owning the queue. -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 6fdc3b1d5bb8..f4654436cd55 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -1165,6 +1165,24 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, - return wr_or_deserves_wr; - } - -+/* -+ * Return the farthest future time instant according to jiffies -+ * macros. 
-+ */ -+static unsigned long bfq_greatest_from_now(void) -+{ -+ return jiffies + MAX_JIFFY_OFFSET; -+} -+ -+/* -+ * Return the farthest past time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_smallest_from_now(void) -+{ -+ return jiffies - MAX_JIFFY_OFFSET; -+} -+ - static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - unsigned int old_wr_coeff, -@@ -1179,7 +1197,19 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - } else { -- bfqq->wr_start_at_switch_to_srt = jiffies; -+ /* -+ * No interactive weight raising in progress -+ * here: assign minus infinity to -+ * wr_start_at_switch_to_srt, to make sure -+ * that, at the end of the soft-real-time -+ * weight raising periods that is starting -+ * now, no interactive weight-raising period -+ * may be wrongly considered as still in -+ * progress (and thus actually started by -+ * mistake). -+ */ -+ bfqq->wr_start_at_switch_to_srt = -+ bfq_smallest_from_now(); - bfqq->wr_coeff = bfqd->bfq_wr_coeff * - BFQ_SOFTRT_WEIGHT_FACTOR; - bfqq->wr_cur_max_time = -@@ -3067,24 +3097,6 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); - } - --/* -- * Return the farthest future time instant according to jiffies -- * macros. -- */ --static unsigned long bfq_greatest_from_now(void) --{ -- return jiffies + MAX_JIFFY_OFFSET; --} -- --/* -- * Return the farthest past time instant according to jiffies -- * macros. -- */ --static unsigned long bfq_smallest_from_now(void) --{ -- return jiffies - MAX_JIFFY_OFFSET; --} -- - /** - * bfq_bfqq_expire - expire a queue. - * @bfqd: device owning the queue. - -From 9dbea44b6f721baeff35b9fdf628ec55fe00e09d Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Thu, 14 Sep 2017 05:12:58 -0400 -Subject: [PATCH 43/51] Fix commit "Unnest request-queue and ioc locks from - scheduler locks" - -The commit "Unnest request-queue and ioc locks from scheduler locks" -mistakenly removed the setting of the split flag in function -bfq_prepare_request. This commit puts this missing instruction back in -its place. 
- -Signed-off-by: Paolo Valente ---- - block/bfq-mq-iosched.c | 12 ++++++++++++ - 1 file changed, 12 insertions(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index f378519b6d33..288078e68a2a 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -744,6 +744,12 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", -+ __func__, -+ bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, -+ bfqq->wr_cur_max_time); -+ - if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || - time_is_before_jiffies(bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time))) { -@@ -2208,6 +2214,11 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; - bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", -+ __func__, -+ bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, -+ bfqq->wr_cur_max_time); - } - - static void -@@ -4950,6 +4961,7 @@ static void bfq_prepare_request(struct request *rq, struct bio *bio) - bic->saved_in_large_burst = true; - - bfqq = bfq_split_bfqq(bic, bfqq); -+ split = true; - - if (!bfqq) - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, - -From d4ebb2a66a23dc183792088c521f2be2193b56db Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Fri, 15 Sep 2017 01:53:51 -0400 -Subject: [PATCH 44/51] bfq-sq, bfq-mq: check and switch back to interactive wr - also on queue split - -As already explained in the message of commit "bfq-mq, bfq-sq: fix -wrong init of saved start time for weight raising", if a soft -real-time weight-raising period happens to be nested in a larger -interactive weight-raising period, then BFQ restores the interactive -weight raising at the end of the soft real-time weight raising. In -particular, BFQ checks whether the latter has ended only on request -dispatches. - -Unfortunately, the above scheme fails to restore interactive weight -raising in the following corner case: if a bfq_queue, say Q, -1) Is merged with another bfq_queue while it is in a nested soft -real-time weight-raising period. The weight-raising state of Q is -then saved, and not considered any longer until a split occurs. -2) Is split from the other bfq_queue(s) at a time instant when its -soft real-time weight raising is already finished. -On the split, while resuming the previous, soft real-time -weight-raised state of the bfq_queue Q, BFQ checks whether the -current soft real-time weight-raising period is actually over. If so, -BFQ switches weight raising off for Q, *without* checking whether the -soft real-time period was actually nested in a non-yet-finished -interactive weight-raising period. - -This commit addresses this issue by adding the above missing check in -bfq_queue splits, and restoring interactive weight raising if needed. 
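(Illustrative aside, not part of the patch: the check added on queue splits is the same one already performed on request dispatches, and it reduces to a one-line predicate. A simplified user-space sketch follows, with invented names standing in for the bfq_queue fields and for time_is_after_eq_jiffies().)

    #include <stdio.h>

    /* a is after or equal to b, wraparound-safe, as in the kernel's
     * time_after_eq(). */
    #define TIME_AFTER_EQ(a, b)  ((long)((a) - (b)) >= 0)

    /*
     * When a soft real-time weight-raising period ends, or a queue is
     * split, interactive weight raising is restored only if the
     * interactive period it interrupted would still be in progress now.
     */
    static int restore_interactive_wr(unsigned long wr_start_at_switch_to_srt,
                                      unsigned long interactive_wr_duration,
                                      unsigned long now)
    {
            return TIME_AFTER_EQ(wr_start_at_switch_to_srt +
                                 interactive_wr_duration, now);
    }

    int main(void)
    {
            /* Interactive wr started at t=1000 and lasts 1000 ticks. */
            printf("restore at t=1500? %s\n",
                   restore_interactive_wr(1000UL, 1000UL, 1500UL) ?
                   "yes" : "no");
            printf("restore at t=2500? %s\n",
                   restore_interactive_wr(1000UL, 1000UL, 2500UL) ?
                   "yes" : "no");
            return 0;
    }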
- -Signed-off-by: Paolo Valente -Tested-by: Angelo Ruocco -Tested-by: Mirko Montanari ---- - block/bfq-mq-iosched.c | 29 +++++++++++++++++++++-------- - block/bfq-sq-iosched.c | 35 +++++++++++++++++++++++++++-------- - 2 files changed, 48 insertions(+), 16 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 288078e68a2a..6130a95c6497 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -716,6 +716,15 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) - return dur; - } - -+/* switch back from soft real-time to interactive weight raising */ -+static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, -+ struct bfq_data *bfqd) -+{ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; -+} -+ - static void - bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - struct bfq_io_cq *bic, bool bfq_already_existing) -@@ -753,12 +762,20 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || - time_is_before_jiffies(bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time))) { -- bfq_log_bfqq(bfqq->bfqd, bfqq, -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ !bfq_bfqq_in_large_burst(bfqq) && -+ time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) { -+ switch_back_to_interactive_wr(bfqq, bfqd); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "resume state: switching back to interactive"); -+ } else { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, - "resume state: switching off wr (%lu + %lu < %lu)", - bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, - jiffies); -- -- bfqq->wr_coeff = 1; -+ } - } - - /* make sure weight will be updated, however we got here */ -@@ -3820,11 +3837,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_wr_duration(bfqd))) - bfq_bfqq_end_wr(bfqq); - else { -- /* switch back to interactive wr */ -- bfqq->wr_coeff = bfqd->bfq_wr_coeff; -- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -- bfqq->last_wr_start_finish = -- bfqq->wr_start_at_switch_to_srt; -+ switch_back_to_interactive_wr(bfqq, bfqd); - BUG_ON(time_is_after_jiffies( - bfqq->last_wr_start_finish)); - bfqq->entity.prio_changed = 1; -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index f4654436cd55..e07d5d1c0d40 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -678,6 +678,15 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) - return dur; - } - -+/* switch back from soft real-time to interactive weight raising */ -+static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, -+ struct bfq_data *bfqd) -+{ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; -+} -+ - static void - bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - struct bfq_io_cq *bic, bool bfq_already_existing) -@@ -705,15 +714,29 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", -+ __func__, -+ bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, -+ bfqq->wr_cur_max_time); -+ - if (bfqq->wr_coeff > 1 && 
(bfq_bfqq_in_large_burst(bfqq) || - time_is_before_jiffies(bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time))) { -- bfq_log_bfqq(bfqq->bfqd, bfqq, -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ !bfq_bfqq_in_large_burst(bfqq) && -+ time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) { -+ switch_back_to_interactive_wr(bfqq, bfqd); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "resume state: switching back to interactive"); -+ } else { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, - "resume state: switching off wr (%lu + %lu < %lu)", - bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, - jiffies); -- -- bfqq->wr_coeff = 1; -+ } - } - - /* make sure weight will be updated, however we got here */ -@@ -3703,11 +3726,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_wr_duration(bfqd))) - bfq_bfqq_end_wr(bfqq); - else { -- /* switch back to interactive wr */ -- bfqq->wr_coeff = bfqd->bfq_wr_coeff; -- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -- bfqq->last_wr_start_finish = -- bfqq->wr_start_at_switch_to_srt; -+ switch_back_to_interactive_wr(bfqq, bfqd); - BUG_ON(time_is_after_jiffies( - bfqq->last_wr_start_finish)); - bfqq->entity.prio_changed = 1; - -From 9eaec0c3a2d675763b09da81c9117a9c43bce942 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Fri, 15 Sep 2017 04:58:33 -0400 -Subject: [PATCH 45/51] bfq-sq, bfq-mq: let early-merged queues be - weight-raised on split too - -A just-created bfq_queue, say Q, may happen to be merged with another -bfq_queue on the very first invocation of the function -__bfq_insert_request. In such a case, even if Q would clearly deserve -interactive weight raising (as it has just been created), the function -bfq_add_request does not make it to be invoked for Q, and thus to -activate weight raising for Q. As a consequence, when the state of Q -is saved for a possible future restore, after a split of Q from the -other bfq_queue(s), such a state happens to be (unjustly) -non-weight-raised. Then the bfq_queue will not enjoy any weight -raising on the split, even if should still be in an interactive -weight-raising period when the split occurs. - -This commit solves this problem as follows, for a just-created -bfq_queue that is being early-merged: it stores directly, in the saved -state of the bfq_queue, the weight-raising state that would have been -assigned to the bfq_queue if not early-merged. 
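(Illustrative aside, not part of the patch: the fix boils down to a branch in the state-saving path. The sketch below condenses the bfq_bfqq_save_state() change from the diff that follows into simplified user-space C, with an invented struct and parameter names.)

    #include <stdio.h>

    struct saved_state { unsigned wr_coeff; unsigned long wr_start; };

    struct queue {
            int just_created;     /* merged before weight raising started */
            int in_large_burst;
            unsigned wr_coeff;    /* current weight-raising coefficient */
            unsigned long last_wr_start_finish;
    };

    static void save_state(const struct queue *q, struct saved_state *s,
                           unsigned interactive_wr_coeff, unsigned long now)
    {
            if (q->just_created && !q->in_large_burst) {
                    /* Early merge: the queue never received the
                     * interactive weight raising it deserved, so save
                     * the state it would have been given. */
                    s->wr_coeff = interactive_wr_coeff;
                    s->wr_start = now;
            } else {
                    s->wr_coeff = q->wr_coeff;
                    s->wr_start = q->last_wr_start_finish;
            }
    }

    int main(void)
    {
            struct queue q = { 1, 0, 1, 0 };
            struct saved_state s;

            save_state(&q, &s, 30 /* hypothetical coefficient */, 5000UL);
            printf("saved wr_coeff = %u\n", s.wr_coeff); /* 30, not 1 */
            return 0;
    }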
-
-Signed-off-by: Paolo Valente
-Tested-by: Angelo Ruocco
-Tested-by: Mirko Montanari
----
- block/bfq-mq-iosched.c | 28 +++++++++++++++++++++++-----
- block/bfq-sq-iosched.c | 28 +++++++++++++++++++++++-----
- 2 files changed, 46 insertions(+), 10 deletions(-)
-
-diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c
-index 6130a95c6497..af84e506e897 100644
---- a/block/bfq-mq-iosched.c
-+++ b/block/bfq-mq-iosched.c
-@@ -2226,10 +2226,27 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
- bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
- bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
- bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
-- bic->saved_wr_coeff = bfqq->wr_coeff;
-- bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
-- bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
-- bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
-+ if (unlikely(bfq_bfqq_just_created(bfqq) &&
-+ !bfq_bfqq_in_large_burst(bfqq))) {
-+ /*
-+ * bfqq being merged right after being created: bfqq
-+ * would have deserved interactive weight raising, but
-+ * did not make it to be set in a weight-raised state,
-+ * because of this early merge. Store directly the
-+ * weight-raising state that would have been assigned
-+ * to bfqq, so that to avoid that bfqq unjustly fails
-+ * to enjoy weight raising if split soon.
-+ */
-+ bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff;
-+ bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd);
-+ bic->saved_last_wr_start_finish = jiffies;
-+ } else {
-+ bic->saved_wr_coeff = bfqq->wr_coeff;
-+ bic->saved_wr_start_at_switch_to_srt =
-+ bfqq->wr_start_at_switch_to_srt;
-+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
-+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
-+ }
- BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
- bfq_log_bfqq(bfqq->bfqd, bfqq,
- "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu",
-@@ -4560,7 +4577,6 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
- bfqq->allocated);
-
- new_bfqq->ref++;
-- bfq_clear_bfqq_just_created(bfqq);
- /*
- * If the bic associated with the process
- * issuing this request still points to bfqq
- * (and thus has not been already redirected
- * to new_bfqq or even some other bfq_queue),
- * then complete the merge and redirect it to
- * new_bfqq.
- */
- if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
- bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
- bfqq, new_bfqq);
-+
-+ bfq_clear_bfqq_just_created(bfqq);
- /*
- * rq is about to be enqueued into new_bfqq,
- * release rq reference on bfqq
-diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c
-index e07d5d1c0d40..0c48f527fe3f 100644
---- a/block/bfq-sq-iosched.c
-+++ b/block/bfq-sq-iosched.c
-@@ -2105,10 +2105,27 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
- bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
- bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
- bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
-- bic->saved_wr_coeff = bfqq->wr_coeff;
-- bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
-- bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
-- bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
-+ if (unlikely(bfq_bfqq_just_created(bfqq) &&
-+ !bfq_bfqq_in_large_burst(bfqq))) {
-+ /*
-+ * bfqq being merged right after being created: bfqq
-+ * would have deserved interactive weight raising, but
-+ * did not make it to be set in a weight-raised state,
-+ * because of this early merge.
Store directly the -+ * weight-raising state that would have been assigned -+ * to bfqq, so that to avoid that bfqq unjustly fails -+ * to enjoy weight raising if split soon. -+ */ -+ bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; -+ bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); -+ bic->saved_last_wr_start_finish = jiffies; -+ } else { -+ bic->saved_wr_coeff = bfqq->wr_coeff; -+ bic->saved_wr_start_at_switch_to_srt = -+ bfqq->wr_start_at_switch_to_srt; -+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ } - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - } - -@@ -4383,10 +4400,11 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) - new_bfqq->allocated[rq_data_dir(rq)]++; - bfqq->allocated[rq_data_dir(rq)]--; - new_bfqq->ref++; -- bfq_clear_bfqq_just_created(bfqq); - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) - bfq_merge_bfqqs(bfqd, RQ_BIC(rq), - bfqq, new_bfqq); -+ -+ bfq_clear_bfqq_just_created(bfqq); - /* - * rq is about to be enqueued into new_bfqq, - * release rq reference on bfqq - -From cb05150675095cb97ab22e4955eb82e4fe2e9dbe Mon Sep 17 00:00:00 2001 -From: omcira -Date: Mon, 18 Sep 2017 10:49:48 +0200 -Subject: [PATCH 46/51] bfq-sq, bfq-mq: decrease burst size when queues in - burst exit - -If many queues belonging to the same group happen to be created -shortly after each other, then the concurrent processes associated -with these queues have typically a common goal, and they get it done -as soon as possible if not hampered by device idling. Examples are -processes spawned by git grep, or by systemd during boot. As for -device idling, this mechanism is currently necessary for weight -raising to succeed in its goal: privileging I/O. In view of these -facts, BFQ does not provide the above queues with either weight -raising or device idling. - -On the other hand, a burst of queue creations may be caused also by -the start-up of a complex application. In this case, these queues need -usually to be served one after the other, and as quickly as possible, -to maximise responsiveness. Therefore, in this case the best strategy -is to weight-raise all the queues created during the burst, i.e., the -exact opposite of the strategy for the above case. - -To distinguish between the two cases, BFQ uses an empirical burst-size -threshold, found through extensive tests and monitoring of daily -usage. Only large bursts, i.e., burst with a size above this -threshold, are considered as generated by a high number of parallel -processes. In this respect, upstart-based boot proved to be rather -hard to detect as generating a large burst of queue creations, because -with upstart most of the queues created in a burst exit *before* the -next queues in the same burst are created. To address this issue, I -changed the burst-detection mechanism so as to not decrease the size -of the current burst even if one of the queues in the burst is -eliminated. - -Unfortunately, this missing decrease causes false positives on very -fast systems: on the start-up of a complex application, such as -libreoffice writer, so many queues are created, served and exited -shortly after each other, that a large burst of queue creations is -wrongly detected as occurring. These false positives just disappear if -the size of a burst is decreased when one of the queues in the burst -exits. 
This commit restores the missing burst-size decrease, relying
-on the fact that upstart is apparently unlikely to be used on systems
-running this and future versions of the kernel.
-
-Signed-off-by: Paolo Valente
-Signed-off-by: Mauro Andreolini
-Signed-off-by: Angelo Ruocco
-Tested-by: Mirko Montanari
----
- block/bfq-mq-iosched.c | 12 +++---------
- block/bfq-sq-iosched.c | 12 +++---------
- 2 files changed, 6 insertions(+), 18 deletions(-)
-
-diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c
-index af84e506e897..6e413d7236ce 100644
---- a/block/bfq-mq-iosched.c
-+++ b/block/bfq-mq-iosched.c
-@@ -4111,16 +4111,10 @@ static void bfq_put_queue(struct bfq_queue *bfqq)
- BUG_ON(bfqq->entity.tree);
- BUG_ON(bfq_bfqq_busy(bfqq));
-
-- if (bfq_bfqq_sync(bfqq))
-- /*
-- * The fact that this queue is being destroyed does not
-- * invalidate the fact that this queue may have been
-- * activated during the current burst. As a consequence,
-- * although the queue does not exist anymore, and hence
-- * needs to be removed from the burst list if there,
-- * the burst size has not to be decremented.
-- */
-+ if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) {
- hlist_del_init(&bfqq->burst_list_node);
-+ bfqq->bfqd->burst_size--;
-+ }
-
- if (bfqq->bfqd)
- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
-diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c
-index 0c48f527fe3f..93034dd7b801 100644
---- a/block/bfq-sq-iosched.c
-+++ b/block/bfq-sq-iosched.c
-@@ -3945,16 +3945,10 @@ static void bfq_put_queue(struct bfq_queue *bfqq)
- BUG_ON(bfqq->entity.tree);
- BUG_ON(bfq_bfqq_busy(bfqq));
-
-- if (bfq_bfqq_sync(bfqq))
-- /*
-- * The fact that this queue is being destroyed does not
-- * invalidate the fact that this queue may have been
-- * activated during the current burst. As a consequence,
-- * although the queue does not exist anymore, and hence
-- * needs to be removed from the burst list if there,
-- * the burst size has not to be decremented.
-- */
-+ if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) {
- hlist_del_init(&bfqq->burst_list_node);
-+ bfqq->bfqd->burst_size--;
-+ }
-
- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
-
-
-From 60de7307d5e3ed7f272f12c900f631bdfe114db2 Mon Sep 17 00:00:00 2001
-From: Paolo Valente
-Date: Fri, 6 Oct 2017 19:35:38 +0200
-Subject: [PATCH 47/51] bfq-sq, bfq-mq: fix unbalanced decrements of burst size
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-The commit "bfq-sq, bfq-mq: decrease burst size when queues in burst
-exit" introduced the decrement of burst_size on the removal of a
-bfq_queue from the burst list. Unfortunately, this decrement can
-happen to be performed even when burst size is already equal to 0,
-because of unbalanced decrements. A description follows of the cause
-of these unbalanced decrements, namely a wrong assumption, and of the
-way this wrong assumption leads to unbalanced decrements.
-
-The wrong assumption is that a bfq_queue can exit only if the process
-associated with the bfq_queue has exited. This is false, because a
-bfq_queue, say Q, may exit also as a consequence of a merge with
-another bfq_queue. In this case, Q exits because the I/O of its
-associated process has been redirected to another bfq_queue.
-
-The decrement unbalance occurs because Q may then be re-created after
-a split, and added back to the current burst list, *without*
-incrementing burst_size.
burst_size is not incremented because Q is
-not a new bfq_queue added to the burst list, but a bfq_queue only
-temporarily removed from the list, and, before the commit "bfq-sq,
-bfq-mq: decrease burst size when queues in burst exit", burst_size was
-not decremented when Q was removed.
-
-This commit addresses this issue by just checking whether the exiting
-bfq_queue is a merged bfq_queue, and, in that case, not decrementing
-burst_size. Unfortunately, this still leaves room for unbalanced
-decrements, in the following rarer case: on a split, the bfq_queue
-happens to be inserted into a different burst list than the one it was
-removed from when merged. If this happens, the number of elements in
-the new burst list becomes higher than burst_size (by one). When the
-bfq_queue then exits, it is of course not in a merged state any
-longer, thus burst_size is decremented, which results in an unbalanced
-decrement. To handle this sporadic, unlucky case in a simple way,
-this commit also checks that burst_size is larger than 0 before
-decrementing it.
-
-Finally, this commit removes a useless, extra check: the check that
-the bfq_queue is sync, performed before checking whether the bfq_queue
-is in the burst list. This extra check is redundant, because only sync
-bfq_queues can be inserted into the burst list.
-
-Reported-by: Philip Müller
-Signed-off-by: Paolo Valente
-Signed-off-by: Angelo Ruocco
-Tested-by: Philip Müller
-Tested-by: Oleksandr Natalenko
-Tested-by: Lee Tibbert
----
- block/bfq-mq-iosched.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++--
- block/bfq-sq-iosched.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++--
- 2 files changed, 114 insertions(+), 4 deletions(-)
-
-diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c
-index 6e413d7236ce..816bac6cdd3d 100644
---- a/block/bfq-mq-iosched.c
-+++ b/block/bfq-mq-iosched.c
-@@ -4111,9 +4111,36 @@ static void bfq_put_queue(struct bfq_queue *bfqq)
- BUG_ON(bfqq->entity.tree);
- BUG_ON(bfq_bfqq_busy(bfqq));
-
-- if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) {
-+ if (!hlist_unhashed(&bfqq->burst_list_node)) {
- hlist_del_init(&bfqq->burst_list_node);
-- bfqq->bfqd->burst_size--;
-+ /*
-+ * Decrement also burst size after the removal, if the
-+ * process associated with bfqq is exiting, and thus
-+ * does not contribute to the burst any longer. This
-+ * decrement helps filter out false positives of large
-+ * bursts, when some short-lived process (often due to
-+ * the execution of commands by some service) happens
-+ * to start and exit while a complex application is
-+ * starting, and thus spawning several processes that
-+ * do I/O (and that *must not* be treated as a large
-+ * burst, see comments on bfq_handle_burst).
-+ *
-+ * In particular, the decrement is performed only if:
-+ * 1) bfqq is not a merged queue, because, if it is,
-+ * then this free of bfqq is not triggered by the exit
-+ * of the process bfqq is associated with, but exactly
-+ * by the fact that bfqq has just been merged.
-+ * 2) burst_size is greater than 0, to handle
-+ * unbalanced decrements. Unbalanced decrements may
-+ * happen in the following case: bfqq is inserted into
-+ * the current burst list--without incrementing
-+ * burst_size--because of a split, but the current
-+ * burst list is not the burst list bfqq belonged to
-+ * (see comments on the case of a split in
-+ * bfq_set_request).
-+ */ -+ if (bfqq->bic && bfqq->bfqd->burst_size > 0) -+ bfqq->bfqd->burst_size--; - } - - if (bfqq->bfqd) -@@ -4940,6 +4967,34 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, - "large burst"); - bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) -+ /* -+ * If bfqq was in the current -+ * burst list before being -+ * merged, then we have to add -+ * it back. And we do not need -+ * to increase burst_size, as -+ * we did not decrement -+ * burst_size when we removed -+ * bfqq from the burst list as -+ * a consequence of a merge -+ * (see comments in -+ * bfq_put_queue). In this -+ * respect, it would be rather -+ * costly to know whether the -+ * current burst list is still -+ * the same burst list from -+ * which bfqq was removed on -+ * the merge. To avoid this -+ * cost, if bfqq was in a -+ * burst list, then we add -+ * bfqq to the current burst -+ * list without any further -+ * check. This can cause -+ * inappropriate insertions, -+ * but rarely enough to not -+ * harm the detection of large -+ * bursts significantly. -+ */ - hlist_add_head(&bfqq->burst_list_node, - &bfqd->burst_list); - } -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 93034dd7b801..4bbd7f4c0154 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -3945,9 +3945,36 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - BUG_ON(bfqq->entity.tree); - BUG_ON(bfq_bfqq_busy(bfqq)); - -- if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) { -+ if (!hlist_unhashed(&bfqq->burst_list_node)) { - hlist_del_init(&bfqq->burst_list_node); -- bfqq->bfqd->burst_size--; -+ /* -+ * Decrement also burst size after the removal, if the -+ * process associated with bfqq is exiting, and thus -+ * does not contribute to the burst any longer. This -+ * decrement helps filter out false positives of large -+ * bursts, when some short-lived process (often due to -+ * the execution of commands by some service) happens -+ * to start and exit while a complex application is -+ * starting, and thus spawning several processes that -+ * do I/O (and that *must not* be treated as a large -+ * burst, see comments on bfq_handle_burst). -+ * -+ * In particular, the decrement is performed only if: -+ * 1) bfqq is not a merged queue, because, if it is, -+ * then this free of bfqq is not triggered by the exit -+ * of the process bfqq is associated with, but exactly -+ * by the fact that bfqq has just been merged. -+ * 2) burst_size is greater than 0, to handle -+ * unbalanced decrements. Unbalanced decrements may -+ * happen in te following case: bfqq is inserted into -+ * the current burst list--without incrementing -+ * bust_size--because of a split, but the current -+ * burst list is not the burst list bfqq belonged to -+ * (see comments on the case of a split in -+ * bfq_set_request). -+ */ -+ if (bfqq->bic && bfqq->bfqd->burst_size > 0) -+ bfqq->bfqd->burst_size--; - } - - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); -@@ -4691,6 +4718,34 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - "large burst"); - bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) -+ /* -+ * If bfqq was in the current -+ * burst list before being -+ * merged, then we have to add -+ * it back. And we do not need -+ * to increase burst_size, as -+ * we did not decrement -+ * burst_size when we removed -+ * bfqq from the burst list as -+ * a consequence of a merge -+ * (see comments in -+ * bfq_put_queue). 
In this -+ * respect, it would be rather -+ * costly to know whether the -+ * current burst list is still -+ * the same burst list from -+ * which bfqq was removed on -+ * the merge. To avoid this -+ * cost, if bfqq was in a -+ * burst list, then we add -+ * bfqq to the current burst -+ * list without any further -+ * check. This can cause -+ * inappropriate insertions, -+ * but rarely enough to not -+ * harm the detection of large -+ * bursts significantly. -+ */ - hlist_add_head(&bfqq->burst_list_node, - &bfqd->burst_list); - } - -From 09adbd0f46f4ba395964b35bf611b7cc3dd84b4d Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Mon, 30 Oct 2017 16:50:50 +0100 -Subject: [PATCH 48/51] doc, block, bfq-mq: update max IOPS sustainable with - BFQ - -We have investigated more deeply the performance of BFQ, in terms of -number of IOPS that can be processed by the CPU when BFQ is used as -I/O scheduler. In more detail, using the script [1], we have measured -the number of IOPS reached on top of a null block device configured -with zero latency, as a function of the workload (sequential read, -sequential write, random read, random write) and of the system (we -considered desktops, laptops and embedded systems). - -Basing on the resulting figures, with this commit we update the -current, conservative IOPS range reported in BFQ documentation. In -particular, the documentation now reports, for each of three different -systems, the lowest number of IOPS obtained for that system with the -above test (namely, the value obtained with the workload leading to -the lowest IOPS). - -[1] https://github.com/Algodev-github/IOSpeed - -Signed-off-by: Paolo Valente -Signed-off-by: Luca Miccio ---- - Documentation/block/bfq-iosched.txt | 19 +++++++++++++------ - 1 file changed, 13 insertions(+), 6 deletions(-) - -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index dcfe15523da3..595ff7a5ff34 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -29,12 +29,19 @@ for that device, by setting low_latency to 0. See Section 3 for - details on how to configure BFQ for the desired tradeoff between - latency and throughput, or on how to maximize throughput. - --On average CPUs, the current version of BFQ can handle devices --performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a --reference, 30-50 KIOPS correspond to very high bandwidths with --sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and --to 120-200 MB/s with 4KB random I/O. BFQ is currently being tested on --multi-queue devices too. -+BFQ has a non-null overhead, which limits the maximum IOPS that the -+CPU can process for a device scheduled with BFQ. To give an idea of -+the limits on slow or average CPUs, here are BFQ limits for three -+different CPUs, on, respectively, an average laptop, an old desktop, -+and a cheap embedded system, in case full hierarchical support is -+enabled (i.e., CONFIG_BFQ_SQ_GROUP_IOSCHED is set for bfq-sq, or -+CONFIG_MQ_BFQ_GROUP_IOSCHED is set for bfq-mq, or, finally, -+CONFIG_BFQ_GROUP_IOSCHED is set for bfq): -+- Intel i7-4850HQ: 250 KIOPS -+- AMD A8-3850: 170 KIOPS -+- ARM CortexTM-A53 Octa-core: 45 KIOPS -+ -+BFQ works for multi-queue devices too (bfq and bfq-mq instances). - - The table of contents follows. Impatients can just jump to Section 3. 
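As an aside on the figures in the documentation hunk above (an illustrative arithmetic check only, not part of the patch): at 256 KB per request, 30 KIOPS x 256 KB ~= 7.7 GB/s and 50 KIOPS x 256 KB = 12.8 GB/s, which is the quoted 8-12 GB/s sequential range; at 4 KB per request, 30 KIOPS and 50 KIOPS give 120 MB/s and 200 MB/s respectively, matching the quoted random-I/O range.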
- - -From be94f97b577dc587593185224a7718aa59ac43f7 Mon Sep 17 00:00:00 2001 -From: Luca Miccio -Date: Tue, 31 Oct 2017 09:50:11 +0100 -Subject: [PATCH 49/51] block, bfq-mq: add missing invocations of - bfqg_stats_update_io_add/remove - -bfqg_stats_update_io_add and bfqg_stats_update_io_remove are to be -invoked, respectively, when an I/O request enters and when an I/O -request exits the scheduler. Unfortunately, bfq-mq does not fully comply -with this scheme, because it does not invoke these functions for -requests that are inserted into or extracted from its priority -dispatch list. This commit fixes this mistake. - -Signed-off-by: Paolo Valente -Signed-off-by: Luca Miccio ---- - block/bfq-mq-iosched.c | 24 +++++++++++++++++++----- - 1 file changed, 19 insertions(+), 5 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 816bac6cdd3d..fbf28804c220 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1394,7 +1394,6 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, - BUG_ON(bfqq->entity.budget < bfqq->entity.service); - - BUG_ON(bfqq == bfqd->in_service_queue); -- bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); - - /* - * bfqq deserves to be weight-raised if: -@@ -1734,7 +1733,6 @@ static void bfq_remove_request(struct request_queue *q, - BUG_ON(bfqq->meta_pending == 0); - bfqq->meta_pending--; - } -- bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); - } - - static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) -@@ -1879,6 +1877,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, - bfqq->next_rq = rq; - - bfq_remove_request(q, next); -+ bfqg_stats_update_io_remove(bfqq_group(bfqq), next->cmd_flags); - - spin_unlock_irq(&bfqq->bfqd->lock); - end: -@@ -4077,6 +4076,10 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - spin_lock_irq(&bfqd->lock); - - rq = __bfq_dispatch_request(hctx); -+ if (rq && RQ_BFQQ(rq)) -+ bfqg_stats_update_io_remove(bfqq_group(RQ_BFQQ(rq)), -+ rq->cmd_flags); -+ - spin_unlock_irq(&bfqd->lock); - - return rq; -@@ -4634,6 +4637,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - { - struct request_queue *q = hctx->queue; - struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); - - spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { -@@ -4647,8 +4651,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - - spin_lock_irq(&bfqd->lock); - if (at_head || blk_rq_is_passthrough(rq)) { -- struct bfq_queue *bfqq = RQ_BFQQ(rq); -- - if (at_head) - list_add(&rq->queuelist, &bfqd->dispatch); - else -@@ -4668,6 +4670,12 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - rq->rq_flags &= ~RQF_GOT; - - __bfq_insert_request(bfqd, rq); -+ /* -+ * Update bfqq, because, if a queue merge has occurred -+ * in __bfq_insert_request, then rq has been -+ * redirected into a new queue. 
-+ */ -+ bfqq = RQ_BFQQ(rq); - - if (rq_mergeable(rq)) { - elv_rqhash_add(q, rq); -@@ -4676,6 +4684,9 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - } - } - -+ if (bfqq) -+ bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, rq->cmd_flags); -+ - spin_unlock_irq(&bfqd->lock); - } - -@@ -4893,8 +4904,11 @@ static void bfq_finish_request(struct request *rq) - BUG_ON(in_interrupt()); - - assert_spin_locked(&bfqd->lock); -- if (!RB_EMPTY_NODE(&rq->rb_node)) -+ if (!RB_EMPTY_NODE(&rq->rb_node)) { - bfq_remove_request(rq->q, rq); -+ bfqg_stats_update_io_remove(bfqq_group(bfqq), -+ rq->cmd_flags); -+ } - bfq_put_rq_priv_body(bfqq); - } - - -From 8659a1549d2bf241129a0f7c90429bddd9c2bc53 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Wed, 8 Nov 2017 19:07:40 +0100 -Subject: [PATCH 50/51] block, bfq-mq: update blkio stats outside the scheduler - lock - -bfq-mq invokes various blkg_*stats_* functions to update the statistics -contained in the special files blkio.bfq-mq.* in the blkio controller -groups, i.e., the I/O accounting related to the proportional-share -policy provided by bfq-mq. The execution of these functions takes a -considerable percentage, about 40%, of the total per-request execution -time of bfq-mq (i.e., of the sum of the execution time of all the bfq-mq -functions that have to be executed to process an I/O request from its -creation to its destruction). This reduces the request-processing -rate sustainable by bfq-mq noticeably, even on a multicore CPU. In fact, -the bfq-mq functions that invoke blkg_*stats_* functions cannot be -executed in parallel with the rest of the code of bfq-mq, because -both are executed under the same same per-device scheduler lock. - -To reduce this slowdown, this commit moves, wherever possible, the -invocation of these functions (more precisely, of the bfq-mq functions -that invoke blkg_*stats_* functions) outside the critical sections -protected by the scheduler lock. - -With this change, and with all blkio.bfq-mq.* statistics enabled, the -throughput grows, e.g., from 250 to 310 KIOPS (+25%) on an Intel -i7-4850HQ, in case of 8 threads doing random I/O in parallel on -null_blk, with the latter configured with 0 latency. We obtained the -same or higher throughput boosts, up to +30%, with other processors -(some figures are reported in the documentation). For our tests, we -used the script [1], with which our results can be easily reproduced. - -NOTE. This commit still protects the invocation of blkg_*stats_* -functions with the request_queue lock, because the group these -functions are invoked on may otherwise disappear before or while these -functions are executed. Fortunately, tests without even this lock -show, by difference, that the serialization caused by this lock has a -little impact (at most ~5% of throughput reduction). - -[1] https://github.com/Algodev-github/IOSpeed - -Signed-off-by: Paolo Valente -Signed-off-by: Luca Miccio ---- - Documentation/block/bfq-iosched.txt | 18 ++++-- - block/bfq-mq-iosched.c | 112 +++++++++++++++++++++++++++++++----- - block/bfq-sched.c | 2 + - 3 files changed, 112 insertions(+), 20 deletions(-) - -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index 595ff7a5ff34..c816c595082d 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -31,16 +31,22 @@ latency and throughput, or on how to maximize throughput. 
- - BFQ has a non-null overhead, which limits the maximum IOPS that the - CPU can process for a device scheduled with BFQ. To give an idea of --the limits on slow or average CPUs, here are BFQ limits for three --different CPUs, on, respectively, an average laptop, an old desktop, --and a cheap embedded system, in case full hierarchical support is --enabled (i.e., CONFIG_BFQ_SQ_GROUP_IOSCHED is set for bfq-sq, or --CONFIG_MQ_BFQ_GROUP_IOSCHED is set for bfq-mq, or, finally, --CONFIG_BFQ_GROUP_IOSCHED is set for bfq): -+the limits on slow or average CPUs, here are, first, the limits of -+bfq-sq for three different CPUs, on, respectively, an average laptop, -+an old desktop, and a cheap embedded system, in case full hierarchical -+support is enabled (i.e., CONFIG_BFQ_SQ_GROUP_IOSCHED is set): - - Intel i7-4850HQ: 250 KIOPS - - AMD A8-3850: 170 KIOPS - - ARM CortexTM-A53 Octa-core: 45 KIOPS - -+bfq-mq and bfq instances reach, instead, a higher sustainable -+throughput. Their limits, on the same systems as above, are, with full -+hierarchical support enabled (i.e., CONFIG_MQ_BFQ_GROUP_IOSCHED set -+for bfq-mq, or CONFIG_BFQ_GROUP_IOSCHED set for bfq): -+- Intel i7-4850HQ: 310 KIOPS -+- AMD A8-3850: 200 KIOPS -+- ARM CortexTM-A53 Octa-core: 56 KIOPS -+ - BFQ works for multi-queue devices too (bfq and bfq-mq instances). - - The table of contents follows. Impatients can just jump to Section 3. -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index fbf28804c220..ab3b83d612c2 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1822,7 +1822,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - bfqq->next_rq = next_rq; - - bfq_log_bfqq(bfqd, bfqq, -- "requests_merged: req %p prev %p next_rq %p bfqq %p", -+ "request_merged: req %p prev %p next_rq %p bfqq %p", - req, prev, next_rq, bfqq); - - /* -@@ -2415,7 +2415,6 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, - struct bfq_queue *bfqq) - { - if (bfqq) { -- bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); - bfq_clear_bfqq_fifo_expire(bfqq); - - bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; -@@ -3784,7 +3783,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - */ - bfq_clear_bfqq_wait_request(bfqq); - hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -- bfqg_stats_update_idle_time(bfqq_group(bfqq)); - } - goto keep_queue; - } -@@ -4072,16 +4070,67 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - { - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - struct request *rq; -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ struct bfq_queue *in_serv_queue, *bfqq; -+ bool waiting_rq, idle_timer_disabled; -+#endif - - spin_lock_irq(&bfqd->lock); - -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ in_serv_queue = bfqd->in_service_queue; -+ waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); -+ - rq = __bfq_dispatch_request(hctx); -- if (rq && RQ_BFQQ(rq)) -- bfqg_stats_update_io_remove(bfqq_group(RQ_BFQQ(rq)), -- rq->cmd_flags); - -+ idle_timer_disabled = -+ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); -+ -+#else -+ rq = __bfq_dispatch_request(hctx); -+#endif - spin_unlock_irq(&bfqd->lock); - -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ bfqq = rq ? RQ_BFQQ(rq) : NULL; -+ if (!idle_timer_disabled && !bfqq) -+ return rq; -+ -+ /* -+ * rq and bfqq are guaranteed to exist until this function -+ * ends, for the following reasons. 
First, rq can be -+ * dispatched to the device, and then can be completed and -+ * freed, only after this function ends. Second, rq cannot be -+ * merged (and thus freed because of a merge) any longer, -+ * because it has already started. Thus rq cannot be freed -+ * before this function ends, and, since rq has a reference to -+ * bfqq, the same guarantee holds for bfqq too. -+ * -+ * In addition, the following queue lock guarantees that -+ * bfqq_group(bfqq) exists as well. -+ */ -+ spin_lock_irq(hctx->queue->queue_lock); -+ if (idle_timer_disabled) -+ /* -+ * Since the idle timer has been disabled, -+ * in_serv_queue contained some request when -+ * __bfq_dispatch_request was invoked above, which -+ * implies that rq was picked exactly from -+ * in_serv_queue. Thus in_serv_queue == bfqq, and is -+ * therefore guaranteed to exist because of the above -+ * arguments. -+ */ -+ bfqg_stats_update_idle_time(bfqq_group(in_serv_queue)); -+ if (bfqq) { -+ struct bfq_group *bfqg = bfqq_group(bfqq); -+ -+ bfqg_stats_update_avg_queue_size(bfqg); -+ bfqg_stats_set_start_empty_time(bfqg); -+ bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); -+ } -+ spin_unlock_irq(hctx->queue->queue_lock); -+#endif -+ - return rq; - } - -@@ -4200,7 +4249,6 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); -- - bfq_exit_bfqq(bfqd, bfqq); - bic_set_bfqq(bic, NULL, is_sync); - spin_unlock_irqrestore(&bfqd->lock, flags); -@@ -4554,7 +4602,6 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - */ - bfq_clear_bfqq_wait_request(bfqq); - hrtimer_try_to_cancel(&bfqd->idle_slice_timer); -- bfqg_stats_update_idle_time(bfqq_group(bfqq)); - - /* - * The queue is not empty, because a new request just -@@ -4569,9 +4616,11 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - } - } - --static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) -+/* returns true if it causes the idle timer to be disabled */ -+static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; -+ bool waiting, idle_timer_disabled = false; - BUG_ON(!bfqq); - - assert_spin_locked(&bfqd->lock); -@@ -4624,12 +4673,16 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - } - } - -+ waiting = bfqq && bfq_bfqq_wait_request(bfqq); - bfq_add_request(rq); -+ idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); - - rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; - list_add_tail(&rq->queuelist, &bfqq->fifo); - - bfq_rq_enqueued(bfqd, bfqq, rq); -+ -+ return idle_timer_disabled; - } - - static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, -@@ -4638,6 +4691,10 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - struct request_queue *q = hctx->queue; - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ bool idle_timer_disabled = false; -+ unsigned int cmd_flags; -+#endif - - spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { -@@ -4669,13 +4726,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - BUG_ON(!(rq->rq_flags & RQF_GOT)); - rq->rq_flags &= ~RQF_GOT; - -- __bfq_insert_request(bfqd, rq); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ idle_timer_disabled = __bfq_insert_request(bfqd, rq); - /* - * Update 
bfqq, because, if a queue merge has occurred - * in __bfq_insert_request, then rq has been - * redirected into a new queue. - */ - bfqq = RQ_BFQQ(rq); -+#else -+ __bfq_insert_request(bfqd, rq); -+#endif - - if (rq_mergeable(rq)) { - elv_rqhash_add(q, rq); -@@ -4683,11 +4744,34 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - q->last_merge = rq; - } - } -- -- if (bfqq) -- bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, rq->cmd_flags); -- -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ /* -+ * Cache cmd_flags before releasing scheduler lock, because rq -+ * may disappear afterwards (for example, because of a request -+ * merge). -+ */ -+ cmd_flags = rq->cmd_flags; -+#endif - spin_unlock_irq(&bfqd->lock); -+#ifdef BFQ_GROUP_IOSCHED_ENABLED -+ if (!bfqq) -+ return; -+ /* -+ * bfqq still exists, because it can disappear only after -+ * either it is merged with another queue, or the process it -+ * is associated with exits. But both actions must be taken by -+ * the same process currently executing this flow of -+ * instruction. -+ * -+ * In addition, the following queue lock guarantees that -+ * bfqq_group(bfqq) exists as well. -+ */ -+ spin_lock_irq(q->queue_lock); -+ bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); -+ if (idle_timer_disabled) -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ spin_unlock_irq(q->queue_lock); -+#endif - } - - static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index e4a2553a2d2c..616c0692335a 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -949,9 +949,11 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) - st->vtime += bfq_delta(served, st->wsum); - bfq_forget_idle(st); - } -+#ifndef BFQ_MQ - #ifdef BFQ_GROUP_IOSCHED_ENABLED - bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); - #endif -+#endif - st = bfq_entity_service_tree(&bfqq->entity); - bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", - served, ((st->vtime>>10)*1000)>>12, st); - -From abdfb33a3325df55ec0261fd824ca61ddac13575 Mon Sep 17 00:00:00 2001 -From: Luca Miccio -Date: Wed, 8 Nov 2017 19:07:41 +0100 -Subject: [PATCH 51/51] block, bfq-sq, bfq-mq: move debug blkio stats behind - CONFIG_DEBUG_BLK_CGROUP - -BFQ (both bfq-mq and bfq-sq) currently creates, and updates, its own -instance of the whole set of blkio statistics that cfq creates. Yet, -from the comments of Tejun Heo in [1], it turned out that most of -these statistics are meant/useful only for debugging. This commit -makes BFQ create the latter, debugging statistics only if the option -CONFIG_DEBUG_BLK_CGROUP is set. - -By doing so, this commit also enables BFQ to enjoy a high perfomance -boost. The reason is that, if CONFIG_DEBUG_BLK_CGROUP is not set, then -BFQ has to update far fewer statistics, and, in particular, not the -heaviest to update. To give an idea of the benefits, if -CONFIG_DEBUG_BLK_CGROUP is not set, then, on an Intel i7-4850HQ, and -with 8 threads doing random I/O in parallel on null_blk (configured -with 0 latency), the throughput of bfq-mq grows from 310 to 400 KIOPS -(+30%). We have measured similar or even much higher boosts with other -CPUs: e.g., +45% with an ARM CortexTM-A53 Octa-core. Our results have -been obtained and can be reproduced very easily with the script in [1]. 
- -[1] https://www.spinics.net/lists/linux-block/msg18943.html - -Reported-by: Tejun Heo -Signed-off-by: Luca Miccio -Signed-off-by: Paolo Valente ---- - Documentation/block/bfq-iosched.txt | 59 ++++++++++--- - block/bfq-cgroup-included.c | 163 ++++++++++++++++++++---------------- - block/bfq-mq-iosched.c | 14 ++-- - block/bfq-mq.h | 4 +- - block/bfq.h | 4 +- - 5 files changed, 147 insertions(+), 97 deletions(-) - -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -index c816c595082d..30ef2dba85ad 100644 ---- a/Documentation/block/bfq-iosched.txt -+++ b/Documentation/block/bfq-iosched.txt -@@ -29,24 +29,41 @@ for that device, by setting low_latency to 0. See Section 3 for - details on how to configure BFQ for the desired tradeoff between - latency and throughput, or on how to maximize throughput. - --BFQ has a non-null overhead, which limits the maximum IOPS that the --CPU can process for a device scheduled with BFQ. To give an idea of --the limits on slow or average CPUs, here are, first, the limits of --bfq-sq for three different CPUs, on, respectively, an average laptop, -+BFQ has a non-null overhead, which limits the maximum IOPS that a CPU -+can process for a device scheduled with BFQ. To give an idea of the -+limits on slow or average CPUs, here are, first, the limits of bfq-mq -+and bfq for three different CPUs, on, respectively, an average laptop, - an old desktop, and a cheap embedded system, in case full hierarchical --support is enabled (i.e., CONFIG_BFQ_SQ_GROUP_IOSCHED is set): --- Intel i7-4850HQ: 250 KIOPS --- AMD A8-3850: 170 KIOPS --- ARM CortexTM-A53 Octa-core: 45 KIOPS -- --bfq-mq and bfq instances reach, instead, a higher sustainable --throughput. Their limits, on the same systems as above, are, with full --hierarchical support enabled (i.e., CONFIG_MQ_BFQ_GROUP_IOSCHED set --for bfq-mq, or CONFIG_BFQ_GROUP_IOSCHED set for bfq): -+support is enabled (i.e., CONFIG_MQ_BFQ_GROUP_IOSCHED is set for -+bfq-mq, or CONFIG_BFQ_GROUP_IOSCHED is set for bfq), but -+CONFIG_DEBUG_BLK_CGROUP is not set (Section 4-2): -+- Intel i7-4850HQ: 400 KIOPS -+- AMD A8-3850: 250 KIOPS -+- ARM CortexTM-A53 Octa-core: 80 KIOPS -+ -+As for bfq-sq, it cannot reach the above IOPS, because of the -+inherent, lower parallelism of legacy blk and of the components within -+it (including bfq-sq itself). In particular, results with -+CONFIG_DEBUG_BLK_CGROUP unset are rather fluctuating. The limits -+reported below for the case CONFIG_DEBUG_BLK_CGROUP set will however -+provide a lower bound to the limits of bfq-sq. -+ -+Turning back to bfq-mq and bfq, If CONFIG_DEBUG_BLK_CGROUP is set (and -+of course full hierarchical support is enabled), then the sustainable -+throughput with bfq-mq and bfq decreases, because all blkio.bfq* -+statistics are created and updated (Section 4-2). For bfq-mq and bfq, -+this leads to the following maximum sustainable throughputs, on the -+same systems as above: - - Intel i7-4850HQ: 310 KIOPS - - AMD A8-3850: 200 KIOPS - - ARM CortexTM-A53 Octa-core: 56 KIOPS - -+Finally, if CONFIG_DEBUG_BLK_CGROUP is set (and full hierarchical -+support is enabled), then bfq-sq exhibits the following limits: -+- Intel i7-4850HQ: 250 KIOPS -+- AMD A8-3850: 170 KIOPS -+- ARM CortexTM-A53 Octa-core: 45 KIOPS -+ - BFQ works for multi-queue devices too (bfq and bfq-mq instances). - - The table of contents follows. Impatients can just jump to Section 3. -@@ -524,6 +541,22 @@ BFQ-specific files is "blkio.bfqX." 
or "io.bfqX.", where X can be "" - to set the weight of a group with the mainline BFQ is blkio.bfq.weight - or io.bfq.weight. - -+As for cgroups-v1 (blkio controller), the exact set of stat files -+created, and kept up-to-date by bfq*, depends on whether -+CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq* creates all -+the stat files documented in -+Documentation/cgroup-v1/blkio-controller.txt. If, instead, -+CONFIG_DEBUG_BLK_CGROUP is not set, then bfq* creates only the files -+blkio.bfq*.io_service_bytes -+blkio.bfq*.io_service_bytes_recursive -+blkio.bfq*.io_serviced -+blkio.bfq*.io_serviced_recursive -+ -+The value of CONFIG_DEBUG_BLK_CGROUP greatly influences the maximum -+throughput sustainable with bfq*, because updating the blkio.bfq* -+stats is rather costly, especially for some of the stats enabled by -+CONFIG_DEBUG_BLK_CGROUP. -+ - Parameters to set - ----------------- - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index 631e53d9150d..562b0ce581a7 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -15,7 +15,7 @@ - * file. - */ - --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - - /* bfqg stats flags */ - enum bfqg_stats_flags { -@@ -155,6 +155,63 @@ static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) - bfqg_stats_update_group_wait_time(stats); - } - -+static void bfqg_stats_update_io_add(struct bfq_group *bfqg, -+ struct bfq_queue *bfqq, unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.queued, op, 1); -+ bfqg_stats_end_empty_time(&bfqg->stats); -+ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) -+ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); -+} -+ -+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.queued, op, -1); -+} -+ -+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) -+{ -+ blkg_rwstat_add(&bfqg->stats.merged, op, 1); -+} -+ -+static void bfqg_stats_update_completion(struct bfq_group *bfqg, -+ uint64_t start_time, uint64_t io_start_time, unsigned int op) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ unsigned long long now = sched_clock(); -+ -+ if (time_after64(now, io_start_time)) -+ blkg_rwstat_add(&stats->service_time, op, -+ now - io_start_time); -+ if (time_after64(io_start_time, start_time)) -+ blkg_rwstat_add(&stats->wait_time, op, -+ io_start_time - start_time); -+} -+ -+#else /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ -+ -+static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, -+ struct bfq_queue *bfqq, unsigned int op) { } -+static inline void -+bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } -+static inline void -+bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } -+static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, -+ uint64_t start_time, uint64_t io_start_time, -+ unsigned int op) { } -+static inline void -+bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, -+ struct bfq_group *curr_bfqg) { } -+static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } -+static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { 
} -+static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } -+ -+#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ -+ -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - static struct blkcg_policy blkcg_policy_bfq; - - /* -@@ -247,44 +304,10 @@ static void bfqg_and_blkg_put(struct bfq_group *bfqg) - } - #endif - --static void bfqg_stats_update_io_add(struct bfq_group *bfqg, -- struct bfq_queue *bfqq, -- unsigned int op) --{ -- blkg_rwstat_add(&bfqg->stats.queued, op, 1); -- bfqg_stats_end_empty_time(&bfqg->stats); -- if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) -- bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); --} -- --static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) --{ -- blkg_rwstat_add(&bfqg->stats.queued, op, -1); --} -- --static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) --{ -- blkg_rwstat_add(&bfqg->stats.merged, op, 1); --} -- --static void bfqg_stats_update_completion(struct bfq_group *bfqg, -- uint64_t start_time, uint64_t io_start_time, -- unsigned int op) --{ -- struct bfqg_stats *stats = &bfqg->stats; -- unsigned long long now = sched_clock(); -- -- if (time_after64(now, io_start_time)) -- blkg_rwstat_add(&stats->service_time, op, -- now - io_start_time); -- if (time_after64(io_start_time, start_time)) -- blkg_rwstat_add(&stats->wait_time, op, -- io_start_time - start_time); --} -- - /* @stats = 0 */ - static void bfqg_stats_reset(struct bfqg_stats *stats) - { -+#ifdef CONFIG_DEBUG_BLK_CGROUP - /* queued stats shouldn't be cleared */ - blkg_rwstat_reset(&stats->merged); - blkg_rwstat_reset(&stats->service_time); -@@ -296,6 +319,7 @@ static void bfqg_stats_reset(struct bfqg_stats *stats) - blkg_stat_reset(&stats->group_wait_time); - blkg_stat_reset(&stats->idle_time); - blkg_stat_reset(&stats->empty_time); -+#endif - } - - /* @to += @from */ -@@ -304,6 +328,7 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) - if (!to || !from) - return; - -+#ifdef CONFIG_DEBUG_BLK_CGROUP - /* queued stats shouldn't be cleared */ - blkg_rwstat_add_aux(&to->merged, &from->merged); - blkg_rwstat_add_aux(&to->service_time, &from->service_time); -@@ -316,6 +341,7 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) - blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); - blkg_stat_add_aux(&to->idle_time, &from->idle_time); - blkg_stat_add_aux(&to->empty_time, &from->empty_time); -+#endif - } - - /* -@@ -367,6 +393,7 @@ static void bfq_init_entity(struct bfq_entity *entity, - - static void bfqg_stats_exit(struct bfqg_stats *stats) - { -+#ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_rwstat_exit(&stats->merged); - blkg_rwstat_exit(&stats->service_time); - blkg_rwstat_exit(&stats->wait_time); -@@ -378,10 +405,12 @@ static void bfqg_stats_exit(struct bfqg_stats *stats) - blkg_stat_exit(&stats->group_wait_time); - blkg_stat_exit(&stats->idle_time); - blkg_stat_exit(&stats->empty_time); -+#endif - } - - static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) - { -+#ifdef CONFIG_DEBUG_BLK_CGROUP - if (blkg_rwstat_init(&stats->merged, gfp) || - blkg_rwstat_init(&stats->service_time, gfp) || - blkg_rwstat_init(&stats->wait_time, gfp) || -@@ -396,6 +425,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) - bfqg_stats_exit(stats); - return -ENOMEM; - } -+#endif - - return 0; - } -@@ -1003,6 +1033,7 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, - return 
bfq_io_set_weight_legacy(of_css(of), NULL, weight); - } - -+#ifdef CONFIG_DEBUG_BLK_CGROUP - static int bfqg_print_stat(struct seq_file *sf, void *v) - { - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, -@@ -1108,6 +1139,7 @@ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) - 0, false); - return 0; - } -+#endif /* CONFIG_DEBUG_BLK_CGROUP */ - - static struct bfq_group * - bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -@@ -1137,15 +1169,6 @@ static struct cftype bfq_blkcg_legacy_files[] = { - - /* statistics, covers only the tasks in the bfqg */ - { -- .name = BFQ_CGROUP_FNAME(time), -- .private = offsetof(struct bfq_group, stats.time), -- .seq_show = bfqg_print_stat, -- }, -- { -- .name = BFQ_CGROUP_FNAME(sectors), -- .seq_show = bfqg_print_stat_sectors, -- }, -- { - .name = BFQ_CGROUP_FNAME(io_service_bytes), - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes, -@@ -1155,6 +1178,16 @@ static struct cftype bfq_blkcg_legacy_files[] = { - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios, - }, -+#ifdef CONFIG_DEBUG_BLK_CGROUP -+ { -+ .name = BFQ_CGROUP_FNAME(time), -+ .private = offsetof(struct bfq_group, stats.time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(sectors), -+ .seq_show = bfqg_print_stat_sectors, -+ }, - { - .name = BFQ_CGROUP_FNAME(io_service_time), - .private = offsetof(struct bfq_group, stats.service_time), -@@ -1175,18 +1208,10 @@ static struct cftype bfq_blkcg_legacy_files[] = { - .private = offsetof(struct bfq_group, stats.queued), - .seq_show = bfqg_print_rwstat, - }, -+#endif /* CONFIG_DEBUG_BLK_CGROUP */ - - /* the same statictics which cover the bfqg and its descendants */ - { -- .name = BFQ_CGROUP_FNAME(time_recursive), -- .private = offsetof(struct bfq_group, stats.time), -- .seq_show = bfqg_print_stat_recursive, -- }, -- { -- .name = BFQ_CGROUP_FNAME(sectors_recursive), -- .seq_show = bfqg_print_stat_sectors_recursive, -- }, -- { - .name = BFQ_CGROUP_FNAME(io_service_bytes_recursive), - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_bytes_recursive, -@@ -1196,6 +1221,16 @@ static struct cftype bfq_blkcg_legacy_files[] = { - .private = (unsigned long)&blkcg_policy_bfq, - .seq_show = blkg_print_stat_ios_recursive, - }, -+#ifdef CONFIG_DEBUG_BLK_CGROUP -+ { -+ .name = BFQ_CGROUP_FNAME(time_recursive), -+ .private = offsetof(struct bfq_group, stats.time), -+ .seq_show = bfqg_print_stat_recursive, -+ }, -+ { -+ .name = BFQ_CGROUP_FNAME(sectors_recursive), -+ .seq_show = bfqg_print_stat_sectors_recursive, -+ }, - { - .name = BFQ_CGROUP_FNAME(io_service_time_recursive), - .private = offsetof(struct bfq_group, stats.service_time), -@@ -1240,6 +1275,7 @@ static struct cftype bfq_blkcg_legacy_files[] = { - .private = offsetof(struct bfq_group, stats.dequeue), - .seq_show = bfqg_print_stat, - }, -+#endif /* CONFIG_DEBUG_BLK_CGROUP */ - { } /* terminate */ - }; - -@@ -1257,25 +1293,6 @@ static struct cftype bfq_blkg_files[] = { - - #else /* BFQ_GROUP_IOSCHED_ENABLED */ - --static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, -- struct bfq_queue *bfqq, unsigned int op) { } --static inline void --bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } --static inline void --bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } --static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, -- uint64_t start_time, uint64_t io_start_time, -- unsigned int 
op) { } --static inline void --bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, -- struct bfq_group *curr_bfqg) { } --static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } --static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } --static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } --static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } --static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } --static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } -- - static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_group *bfqg) {} - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index ab3b83d612c2..0c09609a6099 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4070,14 +4070,14 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - { - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - struct request *rq; --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - struct bfq_queue *in_serv_queue, *bfqq; - bool waiting_rq, idle_timer_disabled; - #endif - - spin_lock_irq(&bfqd->lock); - --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - in_serv_queue = bfqd->in_service_queue; - waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); - -@@ -4091,7 +4091,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - #endif - spin_unlock_irq(&bfqd->lock); - --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - bfqq = rq ? 
RQ_BFQQ(rq) : NULL; - if (!idle_timer_disabled && !bfqq) - return rq; -@@ -4691,7 +4691,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - struct request_queue *q = hctx->queue; - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - bool idle_timer_disabled = false; - unsigned int cmd_flags; - #endif -@@ -4726,7 +4726,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - BUG_ON(!(rq->rq_flags & RQF_GOT)); - rq->rq_flags &= ~RQF_GOT; - --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - idle_timer_disabled = __bfq_insert_request(bfqd, rq); - /* - * Update bfqq, because, if a queue merge has occurred -@@ -4744,7 +4744,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - q->last_merge = rq; - } - } --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - /* - * Cache cmd_flags before releasing scheduler lock, because rq - * may disappear afterwards (for example, because of a request -@@ -4753,7 +4753,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - cmd_flags = rq->cmd_flags; - #endif - spin_unlock_irq(&bfqd->lock); --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - if (!bfqq) - return; - /* -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 7ed2cc29be57..1cb05bb853d2 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -784,7 +784,7 @@ enum bfqq_expiration { - - - struct bfqg_stats { --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - /* number of ios merged */ - struct blkg_rwstat merged; - /* total time spent on device in ns, may not be accurate w/ queueing */ -@@ -812,7 +812,7 @@ struct bfqg_stats { - uint64_t start_idle_time; - uint64_t start_empty_time; - uint16_t flags; --#endif -+#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ - }; - - #ifdef BFQ_GROUP_IOSCHED_ENABLED -diff --git a/block/bfq.h b/block/bfq.h -index 15d326f466b7..47cd4d5a8c32 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -791,7 +791,7 @@ enum bfqq_expiration { - - - struct bfqg_stats { --#ifdef BFQ_GROUP_IOSCHED_ENABLED -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - /* number of ios merged */ - struct blkg_rwstat merged; - /* total time spent on device in ns, may not be accurate w/ queueing */ -@@ -819,7 +819,7 @@ struct bfqg_stats { - uint64_t start_idle_time; - uint64_t start_empty_time; - uint16_t flags; --#endif -+#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ - }; - - #ifdef BFQ_GROUP_IOSCHED_ENABLED diff --git a/sys-kernel/linux-sources-redcore-lts/files/0001-MuQSS-version-0.162-CPU-scheduler-linux-hardened.patch b/sys-kernel/linux-sources-redcore-lts/files/0001-MuQSS-version-0.162-CPU-scheduler-linux-hardened.patch deleted file mode 100644 index a81dbeac..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/0001-MuQSS-version-0.162-CPU-scheduler-linux-hardened.patch +++ /dev/null @@ -1,9560 +0,0 @@ -diff -Nur a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c ---- a/arch/powerpc/platforms/cell/spufs/sched.c 2018-12-21 13:13:19.000000000 +0000 -+++ 
b/arch/powerpc/platforms/cell/spufs/sched.c 2019-01-05 20:22:51.089998199 +0000 -@@ -65,11 +65,6 @@ - static struct timer_list spuloadavg_timer; - - /* -- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). -- */ --#define NORMAL_PRIO 120 -- --/* - * Frequency of the spu scheduler tick. By default we do one SPU scheduler - * tick for every 10 CPU scheduler ticks. - */ -diff -Nur a/arch/x86/Kconfig b/arch/x86/Kconfig ---- a/arch/x86/Kconfig 2019-01-05 20:17:13.829237906 +0000 -+++ b/arch/x86/Kconfig 2019-01-05 20:30:14.244135060 +0000 -@@ -957,6 +957,20 @@ - config SCHED_SMT - def_bool y if SMP - -+config SMT_NICE -+ bool "SMT (Hyperthreading) aware nice priority and policy support" -+ depends on SCHED_MUQSS && SCHED_SMT -+ default y -+ ---help--- -+ Enabling Hyperthreading on Intel CPUs decreases the effectiveness -+ of the use of 'nice' levels and different scheduling policies -+ (e.g. realtime) due to sharing of CPU power between hyperthreads. -+ SMT nice support makes each logical CPU aware of what is running on -+ its hyperthread siblings, maintaining appropriate distribution of -+ CPU according to nice levels and scheduling policies at the expense -+ of slightly increased overhead. -+ If unsure say Y here. -+ - config SCHED_MC - def_bool y - prompt "Multi-core scheduler support" -diff -Nur a/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt ---- a/Documentation/scheduler/sched-BFS.txt 1970-01-01 01:00:00.000000000 +0100 -+++ b/Documentation/scheduler/sched-BFS.txt 2019-01-05 20:22:51.089998199 +0000 -@@ -0,0 +1,351 @@ -+BFS - The Brain Fuck Scheduler by Con Kolivas. -+ -+Goals. -+ -+The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to -+completely do away with the complex designs of the past for the cpu process -+scheduler and instead implement one that is very simple in basic design. -+The main focus of BFS is to achieve excellent desktop interactivity and -+responsiveness without heuristics and tuning knobs that are difficult to -+understand, impossible to model and predict the effect of, and when tuned to -+one workload cause massive detriment to another. -+ -+ -+Design summary. -+ -+BFS is best described as a single runqueue, O(n) lookup, earliest effective -+virtual deadline first design, loosely based on EEVDF (earliest eligible virtual -+deadline first) and my previous Staircase Deadline scheduler. Each component -+shall be described in order to understand the significance of, and reasoning for -+it. The codebase when the first stable version was released was approximately -+9000 lines less code than the existing mainline linux kernel scheduler (in -+2.6.31). This does not even take into account the removal of documentation and -+the cgroups code that is not used. -+ -+Design reasoning. -+ -+The single runqueue refers to the queued but not running processes for the -+entire system, regardless of the number of CPUs. The reason for going back to -+a single runqueue design is that once multiple runqueues are introduced, -+per-CPU or otherwise, there will be complex interactions as each runqueue will -+be responsible for the scheduling latency and fairness of the tasks only on its -+own runqueue, and to achieve fairness and low latency across multiple CPUs, any -+advantage in throughput of having CPU local tasks causes other disadvantages. 
-+This is due to requiring a very complex balancing system that, at best,
-+achieves some semblance of fairness across CPUs and can only maintain
-+relatively low latency for tasks bound to the same CPUs, not across them. To
-+improve said fairness and latency across CPUs, the advantage of local runqueue
-+locking, which makes for better scalability, is lost due to having to grab
-+multiple locks.
-+
-+A significant feature of BFS is that all accounting is done purely based on CPU
-+used and nowhere is sleep time used in any way to determine entitlement or
-+interactivity. Interactivity "estimators" that use some kind of sleep/run
-+algorithm are doomed to fail to detect all interactive tasks, and to falsely tag
-+tasks that aren't interactive as being so. The reason for this is that it is
-+close to impossible to determine, when a task is sleeping, whether it is
-+doing so voluntarily, as in a userspace application waiting for input in the
-+form of a mouse click or otherwise, or involuntarily, because it is waiting for
-+another thread, process, I/O, kernel activity or whatever. Thus, such an
-+estimator will introduce corner cases, and more heuristics will be required to
-+cope with those corner cases, introducing more corner cases and failed
-+interactivity detection and so on. Interactivity in BFS is built into the design
-+by virtue of the fact that tasks that are waking up have not used up their quota
-+of CPU time, and have earlier effective deadlines, thereby making it very likely
-+they will preempt any CPU bound task of equivalent nice level. See below for
-+more information on the virtual deadline mechanism. Even if they do not preempt
-+a running task, because the rr interval is guaranteed to have a bounded upper
-+limit on how long a task will wait, it will be scheduled within a timeframe
-+that will not cause visible interface jitter.
-+
-+
-+Design details.
-+
-+Task insertion.
-+
-+BFS inserts tasks into each relevant queue as an O(1) insertion into a doubly
-+linked list. On insertion, *every* running queue is checked to see if the newly
-+queued task can run on any idle queue, or preempt the lowest running task on the
-+system. This is how the cross-CPU scheduling of BFS achieves significantly lower
-+latency per extra CPU the system has. In this case the lookup is, in the worst
-+case scenario, O(n) where n is the number of CPUs on the system.
-+
-+Data protection.
-+
-+BFS has one single lock protecting the process local data of every task in the
-+global queue. Thus every insertion, removal and modification of task data in the
-+global runqueue needs to grab the global lock. However, once a task is taken by
-+a CPU, the CPU has its own local data copy of the running process' accounting
-+information which only that CPU accesses and modifies (such as during a
-+timer tick), thus allowing the accounting data to be updated locklessly. Once a
-+CPU has taken a task to run, it removes it from the global queue. Thus the
-+global queue only ever has, at most,
-+
-+ (number of tasks requesting cpu time) - (number of logical CPUs) + 1
-+
-+tasks in the global queue. This value is relevant for the time taken to look up
-+tasks during scheduling. This will increase if many tasks with CPU affinity set
-+in their policy to limit which CPUs they're allowed to run on outnumber the
-+number of CPUs. The +1 is because when rescheduling a task, the CPU's
-+currently running task is put back on the queue. Lookup will be described after
-+the virtual deadline mechanism is explained.
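As a worked instance of the expression above (the counts are invented purely for illustration): on a machine with 16 logical CPUs and 40 tasks requesting CPU time, the global queue holds at most 40 - 16 + 1 = 25 tasks at any instant, so a worst-case lookup scans 25 queued tasks, however the 16 running tasks happen to be spread across CPUs.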
-+ -+Virtual deadline. -+ -+The key to achieving low latency, scheduling fairness, and "nice level" -+distribution in BFS is entirely in the virtual deadline mechanism. The one -+tunable in BFS is the rr_interval, or "round robin interval". This is the -+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) -+tasks of the same nice level will be running for, or looking at it the other -+way around, the longest duration two tasks of the same nice level will be -+delayed for. When a task requests cpu time, it is given a quota (time_slice) -+equal to the rr_interval and a virtual deadline. The virtual deadline is -+offset from the current time in jiffies by this equation: -+ -+ jiffies + (prio_ratio * rr_interval) -+ -+The prio_ratio is determined as a ratio compared to the baseline of nice -20 -+and increases by 10% per nice level. The deadline is a virtual one only in that -+no guarantee is placed that a task will actually be scheduled by this time, but -+it is used to compare which task should go next. There are three components to -+how a task is next chosen. First is time_slice expiration. If a task runs out -+of its time_slice, it is descheduled, the time_slice is refilled, and the -+deadline reset to that formula above. Second is sleep, where a task no longer -+is requesting CPU for whatever reason. The time_slice and deadline are _not_ -+adjusted in this case and are just carried over for when the task is next -+scheduled. Third is preemption, and that is when a newly waking task is deemed -+higher priority than a currently running task on any cpu by virtue of the fact -+that it has an earlier virtual deadline than the currently running task. The -+earlier deadline is the key to which task is next chosen for the first and -+second cases. Once a task is descheduled, it is put back on the queue, and an -+O(n) lookup of all queued-but-not-running tasks is done to determine which has -+the earliest deadline and that task is chosen to receive CPU next. -+ -+The CPU proportion of different nice tasks works out to be approximately the -+ -+ (prio_ratio difference)^2 -+ -+The reason it is squared is that a task's deadline does not change while it is -+running unless it runs out of time_slice. Thus, even if the time actually -+passes the deadline of another task that is queued, it will not get CPU time -+unless the current running task deschedules, and the time "base" (jiffies) is -+constantly moving. -+ -+Task lookup. -+ -+BFS has 103 priority queues. 100 of these are dedicated to the static priority -+of realtime tasks, and the remaining 3 are, in order of best to worst priority, -+SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority -+scheduling). When a task of these priorities is queued, a bitmap of running -+priorities is set showing which of these priorities has tasks waiting for CPU -+time. When a CPU is made to reschedule, the lookup for the next task to get -+CPU time is performed in the following way: -+ -+First the bitmap is checked to see what static priority tasks are queued. If -+any realtime priorities are found, the corresponding queue is checked and the -+first task listed there is taken (provided CPU affinity is suitable) and lookup -+is complete. If the priority corresponds to a SCHED_ISO task, they are also -+taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds -+to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). 
At this
-+stage, every task in the runlist that corresponds to that priority is checked
-+to see which has the earliest set deadline, and (provided it has suitable CPU
-+affinity) it is taken off the runqueue and given the CPU. If a task has an
-+expired deadline, it is taken and the rest of the lookup aborted (as they are
-+chosen in FIFO order).
-+
-+Thus, the lookup is O(n) in the worst case only, where n is as described
-+earlier, as tasks may be chosen before the whole task list is looked over.
-+
-+
-+Scalability.
-+
-+The major limitation of BFS will be that of scalability, as the separate
-+runqueue designs will have less lock contention as the number of CPUs rises.
-+However, they do not scale linearly even with separate runqueues, as multiple
-+runqueues will need to be locked concurrently on such designs to be able to
-+achieve fair CPU balancing, to try and achieve some sort of nice-level fairness
-+across CPUs, and to achieve low enough latency for tasks on a busy CPU when
-+other CPUs would be more suited. BFS has the advantage that it requires no
-+balancing algorithm whatsoever, as balancing occurs by proxy simply because
-+all CPUs draw off the global runqueue, in priority and deadline order. Despite
-+the fact that scalability is _not_ the prime concern of BFS, it both shows very
-+good scalability to smaller numbers of CPUs and is likely a more scalable design
-+at these numbers of CPUs.
-+
-+It also has some very low overhead scalability features built into the design
-+when it has been deemed their overhead is so marginal that they're worth adding.
-+The first is the local copy of the running process' data to the CPU it's running
-+on to allow that data to be updated locklessly where possible. Then there is
-+deference paid to the last CPU a task was running on, by trying that CPU first
-+when looking for an idle CPU to use the next time it's scheduled. Finally there
-+is the notion of cache locality beyond the last running CPU. The sched_domains
-+information is used to determine the relative virtual "cache distance" that
-+other CPUs have from the last CPU a task was running on. CPUs with shared
-+caches, such as SMT siblings, or multicore CPUs with shared caches, are treated
-+as cache local. CPUs without shared caches are treated as not cache local, and
-+CPUs on different NUMA nodes are treated as very distant. This "relative cache
-+distance" is used by modifying the virtual deadline value when doing lookups.
-+Effectively, the deadline is unaltered between "cache local" CPUs, doubled for
-+"cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning
-+behind the doubling of deadlines is as follows. The real cost of migrating a
-+task from one CPU to another is entirely dependent on the cache footprint of
-+the task, how cache intensive the task is, how long it's been running on that
-+CPU to take up the bulk of its cache, how big the CPU cache is, how fast and
-+how layered the CPU cache is, how fast a context switch is... and so on. In
-+other words, it's close to random in the real world where we do more than just
-+one sole workload. The only thing we can be sure of is that it's not free. So
-+BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs
-+is more important than cache locality, and cache locality only plays a part
-+after that.
-+
-+When choosing an idle CPU for a waking task, the cache locality is determined
-+according to where the task last ran and then idle CPUs are ranked from best
-+to worst to choose the most suitable idle CPU based on cache locality, NUMA
-+node locality and hyperthread sibling busyness. They are chosen in the
-+following preference (if idle):
-+
-+* Same core, idle or busy cache, idle threads.
-+* Other core, same cache, idle or busy cache, idle threads.
-+* Same node, other CPU, idle cache, idle threads.
-+* Same node, other CPU, busy cache, idle threads.
-+* Same core, busy threads.
-+* Other core, same cache, busy threads.
-+* Same node, other CPU, busy threads.
-+* Other node, other CPU, idle cache, idle threads.
-+* Other node, other CPU, busy cache, idle threads.
-+* Other node, other CPU, busy threads.
-+
-+This shows the SMT or "hyperthread" awareness in the design as well, which will
-+choose a real idle core first before a logical SMT sibling which already has
-+tasks on the physical CPU.
-+
-+Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark.
-+However, this benchmarking was performed on an earlier design that was far less
-+scalable than the current one, so it's hard to know how scalable it is in terms
-+of both CPUs (due to the global runqueue) and heavily loaded machines (due to
-+O(n) lookup) at this stage. Note that in terms of scalability, the number of
-+_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x)
-+quad core (4x) hyperthreaded (2x) machine is effectively a 16x. Newer benchmark
-+results are very promising indeed, without needing to tweak any knobs, features
-+or options. Benchmark contributions are most welcome.
-+
-+
-+Features
-+
-+As the initial prime target audience for BFS was the average desktop user, it
-+was designed to not need tweaking, tuning or have features set to obtain benefit
-+from it. Thus the number of knobs and features has been kept to an absolute
-+minimum and should not require extra user input for the vast majority of cases.
-+There are precisely 2 tunables and 2 extra scheduling policies: the rr_interval
-+and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition
-+to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is
-+support for CGROUPS. The average user should neither need to know what these
-+are, nor should they need to be using them to have good desktop behaviour.
-+
-+rr_interval
-+
-+There is only one "scheduler" tunable, the round robin interval. This can be
-+accessed in
-+
-+	/proc/sys/kernel/rr_interval
-+
-+The value is in milliseconds, and the default value is set to 6 on a
-+uniprocessor machine, and automatically set to a progressively higher value on
-+multiprocessor machines. The reasoning behind increasing the value on more CPUs
-+is that the effective latency is decreased by virtue of there being more CPUs on
-+BFS (for reasons explained above), and increasing the value allows for less
-+cache contention and more throughput. Valid values are from 1 to 1000.
-+Decreasing the value will decrease latencies at the cost of decreasing
-+throughput, while increasing it will improve throughput, but at the cost of
-+worsening latencies. The accuracy of the rr interval is limited by HZ resolution
-+of the kernel configuration. Thus, the worst case latencies are usually slightly
-+higher than this actual value. The default value of 6 is not an arbitrary one.
-+It is based on the fact that humans can detect jitter at approximately 7ms, so
-+aiming for much lower latencies is pointless under most circumstances. It is
-+worth noting this fact when comparing the latency performance of BFS to other
-+schedulers. Worst case latencies being higher than 7ms are far worse than
-+average latencies not being in the microsecond range.
-+
-+Isochronous scheduling.
-+
-+Isochronous scheduling is a unique scheduling policy designed to provide
-+near-real-time performance to unprivileged (ie non-root) users without the
-+ability to starve the machine indefinitely. Isochronous tasks (which means
-+"same time") are set using, for example, the schedtool application like so:
-+
-+	schedtool -I -e amarok
-+
-+This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works
-+is that it has a priority level between true realtime tasks and SCHED_NORMAL
-+which allows them to preempt all normal tasks, in a SCHED_RR fashion (ie,
-+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
-+rate). However, if ISO tasks run for more than a tunable finite amount of time,
-+they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
-+time is the percentage of _total CPU_ available across the machine, configurable
-+as a percentage in the following "resource handling" tunable (as opposed to a
-+scheduler tunable):
-+
-+	/proc/sys/kernel/iso_cpu
-+
-+and is set to 70% by default. It is calculated over a rolling 5 second average.
-+Because it is the total CPU available, it means that on a multi CPU machine, it
-+is possible to have an ISO task running as realtime scheduling indefinitely on
-+just one CPU, as the other CPUs will be available. Setting this to 100 is the
-+equivalent of giving all users SCHED_RR access and setting it to 0 removes the
-+ability to run any pseudo-realtime tasks.
-+
-+A feature of BFS is that it detects when an application tries to obtain a
-+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
-+appropriate privileges to use those policies. When it detects this, it will
-+give the task SCHED_ISO policy instead. Thus it is transparent to the user.
-+Because some applications constantly set their policy as well as their nice
-+level, there is potential for them to undo the override specified by the user
-+on the command line of setting the policy to SCHED_ISO. To counter this, once
-+a task has been set to SCHED_ISO policy, it needs superuser privileges to set
-+it back to SCHED_NORMAL. This will ensure the task remains ISO and all child
-+processes and threads will also inherit the ISO policy.
-+
-+Idleprio scheduling.
-+
-+Idleprio scheduling is a scheduling policy designed to give out CPU to a task
-+_only_ when the CPU would be otherwise idle. The idea behind this is to allow
-+ultra low priority tasks to be run in the background that have virtually no
-+effect on the foreground tasks. This is ideally suited to distributed computing
-+clients (like setiathome, folding, mprime etc) but can also be used to start
-+a video encode or so on without any slowdown of other tasks. To prevent this
-+policy from grabbing shared resources and holding them indefinitely, if it
-+detects a state where the task is waiting on I/O, the machine is about to
-+suspend to ram and so on, it will transiently schedule the task as SCHED_NORMAL.
-+As per the Isochronous task management, once a task has been scheduled as
-+IDLEPRIO, it cannot be put back to SCHED_NORMAL without superuser privileges.
-+Tasks can be set to start as SCHED_IDLEPRIO with the schedtool command like so:
-+
-+	schedtool -D -e ./mprime
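-+
-+Besides schedtool, both policies can be requested directly with the standard
-+sched_setscheduler(2) call. SCHED_ISO is policy 4 and SCHED_IDLEPRIO reuses
-+SCHED_IDLE's value of 5 (see the include/uapi/linux/sched.h hunk later in
-+this patch); glibc headers do not define SCHED_ISO, so a minimal sketch has
-+to define it itself:
-+
-+	#include <sched.h>
-+	#include <stdio.h>
-+
-+	#ifndef SCHED_ISO
-+	#define SCHED_ISO 4	/* implemented on BFS/MuQSS only */
-+	#endif
-+
-+	int main(void)
-+	{
-+		/* Non-realtime policies take a priority of 0. */
-+		struct sched_param sp = { .sched_priority = 0 };
-+
-+		/* On a kernel without SCHED_ISO this fails with EINVAL. */
-+		if (sched_setscheduler(0, SCHED_ISO, &sp) == -1) {
-+			perror("sched_setscheduler");
-+			return 1;
-+		}
-+		/* ... latency sensitive work runs pseudo-realtime here ... */
-+		return 0;
-+	}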
-+
-+Subtick accounting.
-+
-+It is surprisingly difficult to get accurate CPU accounting, and in many cases,
-+the accounting is done by simply determining what is happening at the precise
-+moment a timer tick fires off. This becomes increasingly inaccurate as the
-+timer tick frequency (HZ) is lowered. It is possible to create an application
-+which uses almost 100% CPU, yet by being descheduled at the right time, records
-+zero CPU usage. While the main problem with this is that there are possible
-+security implications, it is also difficult to determine how much CPU a task
-+really does use. BFS tries to use the sub-tick accounting from the TSC clock,
-+where possible, to determine real CPU usage. This is not entirely reliable, but
-+is far more likely to produce accurate CPU usage data than the existing designs
-+and will not show tasks as consuming no CPU usage when they actually are. Thus,
-+the amount of CPU reported as being used by BFS will more accurately represent
-+how much CPU the task itself is using (as is shown for example by the 'time'
-+application), so the reported values may be quite different to other schedulers.
-+Values reported as the 'load' are more prone to problems with this design, but
-+per process values are closer to real usage. When comparing throughput of BFS
-+to other designs, it is important to compare the actual completed work in terms
-+of total wall clock time taken and total work done, rather than the reported
-+"cpu usage".
-+
-+
-+Con Kolivas Fri Aug 27 2010
-diff -Nur a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt
---- a/Documentation/scheduler/sched-MuQSS.txt	1970-01-01 01:00:00.000000000 +0100
-+++ b/Documentation/scheduler/sched-MuQSS.txt	2019-01-05 20:22:51.089998199 +0000
-@@ -0,0 +1,347 @@
-+MuQSS - The Multiple Queue Skiplist Scheduler by Con Kolivas.
-+
-+MuQSS is a per-cpu runqueue variant of the original BFS scheduler with
-+one 8 level skiplist per runqueue, and fine grained locking for much more
-+scalability.
-+
-+
-+Goals.
-+
-+The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from
-+here on (pronounced mux) is to completely do away with the complex designs of
-+the past for the cpu process scheduler and instead implement one that is very
-+simple in basic design. The main focus of MuQSS is to achieve excellent desktop
-+interactivity and responsiveness without heuristics and tuning knobs that are
-+difficult to understand, impossible to model and predict the effect of, and
-+which, when tuned to one workload, cause massive detriment to another, while
-+still being scalable to many CPUs and processes.
-+
-+
-+Design summary.
-+
-+MuQSS is best described as a per-cpu multiple runqueue, O(log n) insertion, O(1)
-+lookup, earliest effective virtual deadline first tickless design, loosely based
-+on EEVDF (earliest eligible virtual deadline first) and my previous Staircase
-+Deadline scheduler, and evolved from the single runqueue O(n) BFS scheduler.
-+Each component shall be described in order to understand the significance of,
-+and reasoning for it.
-+
-+
-+Design reasoning.
-+
-+In BFS, the use of a single runqueue across all CPUs meant that each CPU would
-+need to scan the entire runqueue looking for the process with the earliest
-+deadline and schedule that next, regardless of which CPU it originally came
-+from. This made BFS deterministic with respect to latency and provided
-+guaranteed latencies dependent on number of processes and CPUs. The single
-+runqueue, however, meant that all CPUs would compete for the single lock
-+protecting it, which would lead to increasing lock contention as the number of
-+CPUs rose and appeared to limit scalability of common workloads beyond 16
-+logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously
-+increased overhead proportionate to the number of queued processes and led to
-+cache thrashing while iterating over the linked list.
-+
-+MuQSS is an evolution of BFS, designed to maintain the same scheduling
-+decision mechanism and be virtually deterministic without relying on the
-+constrained design of the single runqueue by splitting out the single runqueue
-+to be per-CPU and using skiplists instead of linked lists.
-+
-+The original reason for going back to a single runqueue design for BFS was that
-+once multiple runqueues are introduced, per-CPU or otherwise, there will be
-+complex interactions as each runqueue will be responsible for the scheduling
-+latency and fairness of the tasks only on its own runqueue, and to achieve
-+fairness and low latency across multiple CPUs, any advantage in throughput of
-+having CPU local tasks causes other disadvantages. This is due to requiring a
-+very complex balancing system to at best achieve some semblance of fairness
-+across CPUs and can only maintain relatively low latency for tasks bound to the
-+same CPUs, not across them. To increase said fairness and latency across CPUs,
-+the advantage of local runqueue locking, which makes for better scalability, is
-+lost due to having to grab multiple locks.
-+
-+MuQSS works around the problems inherent in multiple runqueue designs by
-+making its skip lists priority ordered and through novel use of lockless
-+examination of each other runqueue it can decide if it should take the earliest
-+deadline task from another runqueue for latency reasons, or for CPU balancing
-+reasons. It still does not have a balancing system, relying on the next task
-+scheduling decision and task wakeup CPU choice to allow balancing to happen by
-+virtue of its choices.
-+
-+
-+Design details.
-+
-+Custom skip list implementation:
-+
-+To avoid the overhead of building up and tearing down skip list structures,
-+the variant used by MuQSS has a number of optimisations making it specific for
-+its use case in the scheduler. It uses static arrays of 8 'levels' instead of
-+building up and tearing down structures dynamically. This makes each runqueue
-+only scale O(log N) up to 64k tasks. However, as there is one runqueue per CPU
-+it means that it scales O(log N) up to 64k x number of logical CPUs which is
-+far beyond the realistic task limits each CPU could handle. By being 8 levels
-+it also makes the array exactly one cacheline in size. Additionally, each
-+skip list node is bidirectional making insertion and removal amortised O(1),
-+being O(k) where k is 1-8. Uniquely, we are only ever interested in the very
-+first entry in each list at all times with MuQSS, so there is never a need to
-+do a search and thus lookup is always O(1). In interactive mode, the queues
-+will be searched beyond their first entry if the first task is not suitable
-+for affinity or SMT nice reasons.
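-+
-+In terms of the skip list API this patch adds in include/linux/skip_list.h,
-+a runqueue's queue and dequeue operations reduce to something like the sketch
-+below. The key encoding and seed are illustrative only (task_sl_key() is a
-+made-up helper); the real key folds static priority and virtual deadline
-+together so that insertion order equals scheduling order:
-+
-+	/* Uses the API from include/linux/skip_list.h added by this patch. */
-+	skiplist_node sl_header;			/* sentinel node */
-+	skiplist *sl = new_skiplist(&sl_header);	/* one per runqueue */
-+
-+	/* Enqueue: an O(log N) walk down the 8 static levels. */
-+	skiplist_insert(sl, &p->node, task_sl_key(p), p, random_seed);
-+
-+	/* Dequeue: the best entry is always first at level 0, so O(1). */
-+	if (sl->entries) {
-+		struct task_struct *next = sl->header->next[0]->value;
-+
-+		skiplist_delete(sl, &next->node);
-+	}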
-+
-+Task insertion:
-+
-+MuQSS inserts tasks into a per CPU runqueue as an O(log N) insertion into
-+a custom skip list as described above (based on the original design by William
-+Pugh). Insertion is ordered in such a way that there is never a need to do a
-+search by ordering tasks according to static priority primarily, and then
-+virtual deadline at the time of insertion.
-+
-+Niffies:
-+
-+Niffies are a monotonic forward moving timer not unlike the "jiffies" but are
-+of nanosecond resolution. Niffies are calculated per-runqueue from the high
-+resolution TSC timers, and in order to maintain fairness are synchronised
-+between CPUs whenever both runqueues are locked concurrently.
-+
-+Virtual deadline:
-+
-+The key to achieving low latency, scheduling fairness, and "nice level"
-+distribution in MuQSS is entirely in the virtual deadline mechanism. The one
-+tunable in MuQSS is the rr_interval, or "round robin interval". This is the
-+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy)
-+tasks of the same nice level will be running for, or looking at it the other
-+way around, the longest duration two tasks of the same nice level will be
-+delayed for. When a task requests cpu time, it is given a quota (time_slice)
-+equal to the rr_interval and a virtual deadline. The virtual deadline is
-+offset from the current time in niffies by this equation:
-+
-+	niffies + (prio_ratio * rr_interval)
-+
-+The prio_ratio is determined as a ratio compared to the baseline of nice -20
-+and increases by 10% per nice level. The deadline is a virtual one only in that
-+no guarantee is placed that a task will actually be scheduled by this time, but
-+it is used to compare which task should go next. There are three components to
-+how a task is next chosen. First is time_slice expiration. If a task runs out
-+of its time_slice, it is descheduled, the time_slice is refilled, and the
-+deadline reset to that formula above. Second is sleep, where a task no longer
-+is requesting CPU for whatever reason. The time_slice and deadline are _not_
-+adjusted in this case and are just carried over for when the task is next
-+scheduled. Third is preemption, and that is when a newly waking task is deemed
-+higher priority than a currently running task on any cpu by virtue of the fact
-+that it has an earlier virtual deadline than the currently running task. The
-+earlier deadline is the key to which task is next chosen for the first and
-+second cases.
-+
-+The CPU proportion of different nice tasks works out to be approximately the
-+
-+	(prio_ratio difference)^2
-+
-+The reason it is squared is that a task's deadline does not change while it is
-+running unless it runs out of time_slice. Thus, even if the time actually
-+passes the deadline of another task that is queued, it will not get CPU time
-+unless the current running task deschedules, and the time "base" (niffies) is
-+constantly moving.
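-+
-+Numerically the deadline calculation can be sketched as below, assuming a
-+baseline prio_ratio of 128 at nice -20 (the actual baseline constant is an
-+implementation detail) and the power-of-two approximation of a millisecond
-+that MuQSS.c itself uses:
-+
-+	typedef unsigned long long u64;	/* kernel-style type for the sketch */
-+
-+	static int prio_ratios[40];	/* one entry per nice level */
-+	int rr_interval = 6;		/* ms */
-+
-+	void init_prio_ratios(void)
-+	{
-+		int i;
-+
-+		prio_ratios[0] = 128;		/* nice -20 baseline */
-+		for (i = 1; i < 40; i++)	/* +10% per nice level */
-+			prio_ratios[i] = (prio_ratios[i - 1] * 11) / 10;
-+	}
-+
-+	u64 virtual_deadline(u64 niffies, int nice)
-+	{
-+		/* niffies are in ns; 1ms is approximated as 1 << 20 ns. */
-+		return niffies + ((u64)prio_ratios[nice + 20] *
-+				  ((u64)rr_interval << 20)) / 128;
-+	}
-+
-+Ten nice levels apart, the ratios differ by a factor of about 2.6 (1.1^10),
-+which squares to roughly the 6.7:1 CPU distribution described above.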
-+
-+Task lookup:
-+
-+As tasks are already pre-ordered according to anticipated scheduling order in
-+the skip lists, lookup for the next suitable task per-runqueue is always a
-+matter of simply selecting the first task in the 0th level skip list entry.
-+In order to maintain optimal latency and fairness across CPUs, MuQSS does a
-+novel examination of every other runqueue in cache locality order, choosing the
-+best task across all runqueues. This provides near-determinism of how long any
-+task across the entire system may wait before receiving CPU time. The other
-+runqueues are first examined locklessly and then trylocked to minimise the
-+potential lock contention if they are likely to have a suitable better task.
-+Each other runqueue lock is only held for as long as it takes to examine the
-+entry for suitability. In "interactive" mode, the default setting, MuQSS will
-+look for the best deadline task across all CPUs, while in !interactive mode,
-+it will only select a better deadline task from another CPU if it is more
-+heavily laden than the current one.
-+
-+Lookup is therefore O(k) where k is the number of CPUs.
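-+
-+The lockless-examine-then-trylock pattern is what keeps this cheap, and a
-+userspace analogue of it looks like the sketch below (the field names are
-+illustrative; the real code peeks at skip list heads in cache locality
-+order):
-+
-+	#include <pthread.h>
-+	#include <stdint.h>
-+
-+	struct rq {
-+		pthread_mutex_t lock;
-+		_Atomic uint64_t best_deadline;	/* earliest queued deadline */
-+	};
-+
-+	/* Peek at each queue without its lock; only trylock when the racy
-+	 * peek suggests a better candidate, and never block on contention.
-+	 * A stale peek costs a wasted trylock or a missed candidate, not
-+	 * correctness, because everything is re-checked under the lock. */
-+	int best_rq(struct rq *rqs, int nr, uint64_t mine)
-+	{
-+		int i, best = -1;
-+
-+		for (i = 0; i < nr; i++) {
-+			if (rqs[i].best_deadline >= mine)
-+				continue;		/* lockless peek */
-+			if (pthread_mutex_trylock(&rqs[i].lock))
-+				continue;		/* contended: skip */
-+			if (rqs[i].best_deadline < mine) {
-+				mine = rqs[i].best_deadline;
-+				best = i;
-+			}
-+			pthread_mutex_unlock(&rqs[i].lock);
-+		}
-+		return best;
-+	}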
-+
-+
-+Latency.
-+
-+Through the use of virtual deadlines to govern the scheduling order of normal
-+tasks, queue-to-activation latency per runqueue is guaranteed to be bound by
-+the rr_interval tunable which is set to 6ms by default. This means that the
-+longest a CPU bound task will wait for more CPU is proportional to the number
-+of running tasks and in the common case of 0-2 running tasks per CPU, will be
-+under the 7ms threshold for human perception of jitter. Additionally, as newly
-+woken tasks will have an early deadline from their previous runtime, the very
-+tasks that are usually latency sensitive will have the shortest interval for
-+activation, usually preempting any existing CPU bound tasks.
-+
-+Tickless expiry:
-+
-+A feature of MuQSS is that it is not tied to the resolution of the chosen tick
-+rate in Hz, instead depending entirely on the high resolution timers where
-+possible for sub-millisecond accuracy on timeouts regardless of the underlying
-+tick rate. This allows MuQSS to be run with the low overhead of low Hz rates
-+such as 100 by default, benefiting from the improved throughput and lower
-+power usage it provides. Another advantage of this approach is that in
-+combination with the Full No HZ option, which disables ticks on running task
-+CPUs instead of just idle CPUs, the tick can be disabled at all times
-+regardless of how many tasks are running instead of being limited to just one
-+running task. Note that this option is NOT recommended for regular desktop
-+users.
-+
-+
-+Scalability and balancing.
-+
-+Unlike traditional approaches where balancing is a combination of CPU selection
-+at task wakeup and intermittent balancing based on a vast array of rules set
-+according to architecture, busyness calculations and special case management,
-+MuQSS indirectly balances on the fly at task wakeup and next task selection.
-+During initialisation, MuQSS creates a cache coherency ordered list of CPUs for
-+each logical CPU and uses this to aid task/CPU selection when CPUs are busy.
-+Additionally, it selects any idle CPUs, if they are available, at any time over
-+busy CPUs according to the following preference:
-+
-+ * Same thread, idle or busy cache, idle or busy threads.
-+ * Other core, same cache, idle or busy cache, idle threads.
-+ * Same node, other CPU, idle cache, idle threads.
-+ * Same node, other CPU, busy cache, idle threads.
-+ * Other core, same cache, busy threads.
-+ * Same node, other CPU, busy threads.
-+ * Other node, other CPU, idle cache, idle threads.
-+ * Other node, other CPU, busy cache, idle threads.
-+ * Other node, other CPU, busy threads.
-+
-+Mux is therefore SMT, MC and NUMA aware without the need for extra
-+intermittent balancing to keep CPUs busy and make the most of cache
-+coherency.
-+
-+
-+Features
-+
-+As the initial prime target audience for MuQSS was the average desktop user, it
-+was designed to not need tweaking, tuning or have features set to obtain benefit
-+from it. Thus the number of knobs and features has been kept to an absolute
-+minimum and should not require extra user input for the vast majority of cases.
-+There are 3 optional tunables and 2 extra scheduling policies: the rr_interval,
-+interactive, and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO
-+policies. In addition to this, MuQSS also uses sub-tick accounting. What MuQSS
-+does _not_ now feature is support for CGROUPS. The average user should neither
-+need to know what these are, nor should they need to be using them to have good
-+desktop behaviour. However, since some applications refuse to work without
-+cgroups, one can enable them with MuQSS as a stub, and the filesystem will be
-+created, which allows such applications to work.
-+
-+rr_interval:
-+
-+	/proc/sys/kernel/rr_interval
-+
-+The value is in milliseconds, and the default value is set to 6. Valid values
-+are from 1 to 1000. Decreasing the value will decrease latencies at the cost of
-+decreasing throughput, while increasing it will improve throughput, but at the
-+cost of worsening latencies. It is based on the fact that humans can detect
-+jitter at approximately 7ms, so aiming for much lower latencies is pointless
-+under most circumstances. It is worth noting this fact when comparing the
-+latency performance of MuQSS to other schedulers. Worst case latencies being
-+higher than 7ms are far worse than average latencies not being in the
-+microsecond range.
-+
-+interactive:
-+
-+	/proc/sys/kernel/interactive
-+
-+The value is a simple boolean of 1 for on and 0 for off and is set to on by
-+default. Disabling this will disable the near-determinism of MuQSS when
-+selecting the next task by not examining all CPUs for the earliest deadline
-+task, or which CPU to wake to, instead prioritising CPU balancing for improved
-+throughput. Latency will still be bound by rr_interval, but on a per-CPU basis
-+instead of across the whole system.
-+
-+Isochronous scheduling:
-+
-+Isochronous scheduling is a unique scheduling policy designed to provide
-+near-real-time performance to unprivileged (ie non-root) users without the
-+ability to starve the machine indefinitely. Isochronous tasks (which means
-+"same time") are set using, for example, the schedtool application like so:
-+
-+	schedtool -I -e amarok
-+
-+This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works
-+is that it has a priority level between true realtime tasks and SCHED_NORMAL
-+which allows them to preempt all normal tasks, in a SCHED_RR fashion (ie,
-+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
-+rate). However, if ISO tasks run for more than a tunable finite amount of time,
-+they are then demoted back to SCHED_NORMAL scheduling.
-+This finite amount of time is the percentage of CPU available per CPU,
-+configurable as a percentage in the following "resource handling" tunable
-+(as opposed to a scheduler tunable):
-+
-+iso_cpu:
-+
-+	/proc/sys/kernel/iso_cpu
-+
-+and is set to 70% by default. It is calculated over a rolling 5 second average.
-+Because it is the total CPU available, it means that on a multi CPU machine, it
-+is possible to have an ISO task running as realtime scheduling indefinitely on
-+just one CPU, as the other CPUs will be available. Setting this to 100 is the
-+equivalent of giving all users SCHED_RR access and setting it to 0 removes the
-+ability to run any pseudo-realtime tasks.
-+
-+A feature of MuQSS is that it detects when an application tries to obtain a
-+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
-+appropriate privileges to use those policies. When it detects this, it will
-+give the task SCHED_ISO policy instead. Thus it is transparent to the user.
-+
-+
-+Idleprio scheduling:
-+
-+Idleprio scheduling is a scheduling policy designed to give out CPU to a task
-+_only_ when the CPU would be otherwise idle. The idea behind this is to allow
-+ultra low priority tasks to be run in the background that have virtually no
-+effect on the foreground tasks. This is ideally suited to distributed computing
-+clients (like setiathome, folding, mprime etc) but can also be used to start a
-+video encode or so on without any slowdown of other tasks. To prevent this
-+policy from grabbing shared resources and holding them indefinitely, if it
-+detects a state where the task is waiting on I/O, the machine is about to
-+suspend to ram and so on, it will transiently schedule the task as
-+SCHED_NORMAL. Once a task has been scheduled as IDLEPRIO, it cannot be put back
-+to SCHED_NORMAL without superuser privileges since it is effectively a lower
-+scheduling policy. Tasks can be set to start as SCHED_IDLEPRIO with the
-+schedtool command like so:
-+
-+	schedtool -D -e ./mprime
-+
-+Subtick accounting:
-+
-+It is surprisingly difficult to get accurate CPU accounting, and in many cases,
-+the accounting is done by simply determining what is happening at the precise
-+moment a timer tick fires off. This becomes increasingly inaccurate as the timer
-+tick frequency (HZ) is lowered. It is possible to create an application which
-+uses almost 100% CPU, yet by being descheduled at the right time, records zero
-+CPU usage. While the main problem with this is that there are possible security
-+implications, it is also difficult to determine how much CPU a task really does
-+use. Mux uses sub-tick accounting from the TSC clock to determine real CPU
-+usage. Thus, the amount of CPU reported as being used by MuQSS will more
-+accurately represent how much CPU the task itself is using (as is shown for
-+example by the 'time' application), so the reported values may be quite
-+different to other schedulers. When comparing throughput of MuQSS to other
-+designs, it is important to compare the actual completed work in terms of total
-+wall clock time taken and total work done, rather than the reported "cpu usage".
-+
-+Symmetric MultiThreading (SMT) aware nice:
-+
-+SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs. While the
-+logical CPU count rises by adding thread units to each CPU core, allowing more
-+than one task to be run simultaneously on the same core, the disadvantage of it
-+is that the CPU power is shared between the tasks, not summing to the power
-+of two CPUs.
The practical upshot of this is that two tasks running on -+separate threads of the same core run significantly slower than if they had one -+core each to run on. While smart CPU selection allows each task to have a core -+to itself whenever available (as is done on MuQSS), it cannot offset the -+slowdown that occurs when the cores are all loaded and only a thread is left. -+Most of the time this is harmless as the CPU is effectively overloaded at this -+point and the extra thread is of benefit. However when running a niced task in -+the presence of an un-niced task (say nice 19 v nice 0), the nice task gets -+precisely the same amount of CPU power as the unniced one. MuQSS has an -+optional configuration feature known as SMT-NICE which selectively idles the -+secondary niced thread for a period proportional to the nice difference, -+allowing CPU distribution according to nice level to be maintained, at the -+expense of a small amount of extra overhead. If this is configured in on a -+machine without SMT threads, the overhead is minimal. -+ -+ -+Con Kolivas Sat, 29th October 2016 -diff -Nur a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt ---- a/Documentation/sysctl/kernel.txt 2019-01-05 20:17:13.829237906 +0000 -+++ b/Documentation/sysctl/kernel.txt 2019-01-05 20:22:51.089998199 +0000 -@@ -39,6 +39,7 @@ - - hung_task_timeout_secs - - hung_task_warnings - - kexec_load_disabled -+- iso_cpu - - kptr_restrict - - l2cr [ PPC only ] - - modprobe ==> Documentation/debugging-modules.txt -@@ -73,6 +74,7 @@ - - randomize_va_space - - real-root-dev ==> Documentation/admin-guide/initrd.rst - - reboot-cmd [ SPARC only ] -+- rr_interval - - rtsig-max - - rtsig-nr - - seccomp/ ==> Documentation/userspace-api/seccomp_filter.rst -@@ -95,6 +97,7 @@ - - unknown_nmi_panic - - watchdog - - watchdog_thresh -+- yield_type - - version - - ============================================================== -@@ -397,6 +400,16 @@ - - ============================================================== - -+iso_cpu: (MuQSS CPU scheduler only). -+ -+This sets the percentage cpu that the unprivileged SCHED_ISO tasks can -+run effectively at realtime priority, averaged over a rolling five -+seconds over the -whole- system, meaning all cpus. -+ -+Set to 70 (percent) by default. -+ -+============================================================== -+ - l2cr: (PPC only) - - This flag controls the L2 cache of G3 processor boards. If -@@ -823,6 +836,20 @@ - - ============================================================== - -+rr_interval: (MuQSS CPU scheduler only) -+ -+This is the smallest duration that any cpu process scheduling unit -+will run for. Increasing this value can increase throughput of cpu -+bound tasks substantially but at the expense of increased latencies -+overall. Conversely decreasing it will decrease average and maximum -+latencies but at the expense of throughput. This value is in -+milliseconds and the default value chosen depends on the number of -+cpus available at scheduler initialisation with a minimum of 6. -+ -+Valid values are from 1-1000. -+ -+============================================================== -+ - rtsig-max & rtsig-nr: - - The file rtsig-max can be used to tune the maximum number -@@ -1081,3 +1108,13 @@ - tunable to zero will disable lockup detection altogether. - - ============================================================== -+ -+yield_type: (MuQSS CPU scheduler only) -+ -+This determines what type of yield calls to sched_yield will perform. -+ -+ 0: No yield. 
-+ 1: Yield only to better priority/deadline tasks. (default) -+ 2: Expire timeslice and recalculate deadline. -+ -+============================================================== -diff -Nur a/fs/proc/base.c b/fs/proc/base.c ---- a/fs/proc/base.c 2018-12-21 13:13:19.000000000 +0000 -+++ b/fs/proc/base.c 2019-01-05 20:22:51.089998199 +0000 -@@ -481,7 +481,7 @@ - seq_printf(m, "0 0 0\n"); - else - seq_printf(m, "%llu %llu %lu\n", -- (unsigned long long)task->se.sum_exec_runtime, -+ (unsigned long long)tsk_seruntime(task), - (unsigned long long)task->sched_info.run_delay, - task->sched_info.pcount); - -diff -Nur a/include/linux/init_task.h b/include/linux/init_task.h ---- a/include/linux/init_task.h 2018-12-21 13:13:19.000000000 +0000 -+++ b/include/linux/init_task.h 2019-01-05 20:22:51.089998199 +0000 -@@ -172,8 +172,6 @@ - # define INIT_VTIME(tsk) - #endif - --#define INIT_TASK_COMM "swapper" -- - #ifdef CONFIG_RT_MUTEXES - # define INIT_RT_MUTEXES(tsk) \ - .pi_waiters = RB_ROOT_CACHED, \ -@@ -223,6 +221,80 @@ - * INIT_TASK is used to set up the first task table, touch at - * your own risk!. Base=0, limit=0x1fffff (=2MB) - */ -+#ifdef CONFIG_SCHED_MUQSS -+#define INIT_TASK_COMM "MuQSS" -+#define INIT_TASK(tsk) \ -+{ \ -+ INIT_TASK_TI(tsk) \ -+ .state = 0, \ -+ .stack = init_stack, \ -+ .usage = ATOMIC_INIT(2), \ -+ .flags = PF_KTHREAD, \ -+ .prio = NORMAL_PRIO, \ -+ .static_prio = MAX_PRIO-20, \ -+ .normal_prio = NORMAL_PRIO, \ -+ .deadline = 0, \ -+ .policy = SCHED_NORMAL, \ -+ .cpus_allowed = CPU_MASK_ALL, \ -+ .mm = NULL, \ -+ .active_mm = &init_mm, \ -+ .restart_block = { \ -+ .fn = do_no_restart_syscall, \ -+ }, \ -+ .time_slice = 1000000, \ -+ .tasks = LIST_HEAD_INIT(tsk.tasks), \ -+ INIT_PUSHABLE_TASKS(tsk) \ -+ .ptraced = LIST_HEAD_INIT(tsk.ptraced), \ -+ .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \ -+ .real_parent = &tsk, \ -+ .parent = &tsk, \ -+ .children = LIST_HEAD_INIT(tsk.children), \ -+ .sibling = LIST_HEAD_INIT(tsk.sibling), \ -+ .group_leader = &tsk, \ -+ RCU_POINTER_INITIALIZER(real_cred, &init_cred), \ -+ RCU_POINTER_INITIALIZER(cred, &init_cred), \ -+ .comm = INIT_TASK_COMM, \ -+ .thread = INIT_THREAD, \ -+ .fs = &init_fs, \ -+ .files = &init_files, \ -+ .signal = &init_signals, \ -+ .sighand = &init_sighand, \ -+ .nsproxy = &init_nsproxy, \ -+ .pending = { \ -+ .list = LIST_HEAD_INIT(tsk.pending.list), \ -+ .signal = {{0}}}, \ -+ .blocked = {{0}}, \ -+ .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ -+ .journal_info = NULL, \ -+ INIT_CPU_TIMERS(tsk) \ -+ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ -+ .timer_slack_ns = 50000, /* 50 usec default slack */ \ -+ .pids = { \ -+ [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ -+ [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ -+ [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \ -+ }, \ -+ .thread_group = LIST_HEAD_INIT(tsk.thread_group), \ -+ .thread_node = LIST_HEAD_INIT(init_signals.thread_head), \ -+ INIT_IDS \ -+ INIT_PERF_EVENTS(tsk) \ -+ INIT_TRACE_IRQFLAGS \ -+ INIT_LOCKDEP \ -+ INIT_FTRACE_GRAPH \ -+ INIT_TRACE_RECURSION \ -+ INIT_TASK_RCU_PREEMPT(tsk) \ -+ INIT_TASK_RCU_TASKS(tsk) \ -+ INIT_CPUSET_SEQ(tsk) \ -+ INIT_RT_MUTEXES(tsk) \ -+ INIT_PREV_CPUTIME(tsk) \ -+ INIT_VTIME(tsk) \ -+ INIT_NUMA_BALANCING(tsk) \ -+ INIT_KASAN(tsk) \ -+ INIT_LIVEPATCH(tsk) \ -+ INIT_TASK_SECURITY \ -+} -+#else /* CONFIG_SCHED_MUQSS */ -+#define INIT_TASK_COMM "swapper" - #define INIT_TASK(tsk) \ - { \ - INIT_TASK_TI(tsk) \ -@@ -300,7 +372,7 @@ - INIT_LIVEPATCH(tsk) \ - INIT_TASK_SECURITY \ - } -- -+#endif /* 
CONFIG_SCHED_MUQSS */ - - /* Attach to the init_task data structure for proper alignment */ - #define __init_task_data __attribute__((__section__(".data..init_task"))) -diff -Nur a/include/linux/ioprio.h b/include/linux/ioprio.h ---- a/include/linux/ioprio.h 2018-12-21 13:13:19.000000000 +0000 -+++ b/include/linux/ioprio.h 2019-01-05 20:22:51.089998199 +0000 -@@ -52,6 +52,8 @@ - */ - static inline int task_nice_ioprio(struct task_struct *task) - { -+ if (iso_task(task)) -+ return 0; - return (task_nice(task) + 20) / 5; - } - -diff -Nur a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h ---- a/include/linux/sched/nohz.h 2018-12-21 13:13:19.000000000 +0000 -+++ b/include/linux/sched/nohz.h 2019-01-05 20:22:51.089998199 +0000 -@@ -6,7 +6,7 @@ - * This is the interface between the scheduler and nohz/dynticks: - */ - --#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) -+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS) - extern void cpu_load_update_nohz_start(void); - extern void cpu_load_update_nohz_stop(void); - #else -@@ -23,7 +23,7 @@ - static inline void set_cpu_sd_state_idle(void) { } - #endif - --#ifdef CONFIG_NO_HZ_COMMON -+#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS) - void calc_load_nohz_start(void); - void calc_load_nohz_stop(void); - #else -diff -Nur a/include/linux/sched/prio.h b/include/linux/sched/prio.h ---- a/include/linux/sched/prio.h 2018-12-21 13:13:19.000000000 +0000 -+++ b/include/linux/sched/prio.h 2019-01-05 20:22:51.089998199 +0000 -@@ -20,8 +20,20 @@ - */ - - #define MAX_USER_RT_PRIO 100 -+ -+#ifdef CONFIG_SCHED_MUQSS -+/* Note different MAX_RT_PRIO */ -+#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1) -+ -+#define ISO_PRIO (MAX_RT_PRIO) -+#define NORMAL_PRIO (MAX_RT_PRIO + 1) -+#define IDLE_PRIO (MAX_RT_PRIO + 2) -+#define PRIO_LIMIT ((IDLE_PRIO) + 1) -+#else /* CONFIG_SCHED_MUQSS */ - #define MAX_RT_PRIO MAX_USER_RT_PRIO - -+#endif /* CONFIG_SCHED_MUQSS */ -+ - #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) - #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) - -diff -Nur a/include/linux/sched/task.h b/include/linux/sched/task.h ---- a/include/linux/sched/task.h 2018-12-21 13:13:19.000000000 +0000 -+++ b/include/linux/sched/task.h 2019-01-05 20:22:51.089998199 +0000 -@@ -80,7 +80,7 @@ - extern void free_task(struct task_struct *tsk); - - /* sched_exec is called by processes performing an exec */ --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_MUQSS) - extern void sched_exec(void); - #else - #define sched_exec() {} -diff -Nur a/include/linux/sched.h b/include/linux/sched.h ---- a/include/linux/sched.h 2018-12-21 13:13:19.000000000 +0000 -+++ b/include/linux/sched.h 2019-01-05 20:22:51.089998199 +0000 -@@ -27,6 +27,9 @@ - #include - #include - #include -+#ifdef CONFIG_SCHED_MUQSS -+#include -+#endif - - /* task_struct member predeclarations (sorted alphabetically): */ - struct audit_context; -@@ -579,9 +582,11 @@ - unsigned int flags; - unsigned int ptrace; - -+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_MUQSS) -+ int on_cpu; -+#endif - #ifdef CONFIG_SMP - struct llist_node wake_entry; -- int on_cpu; - #ifdef CONFIG_THREAD_INFO_IN_TASK - /* Current CPU: */ - unsigned int cpu; -@@ -598,10 +603,25 @@ - int static_prio; - int normal_prio; - unsigned int rt_priority; -+#ifdef CONFIG_SCHED_MUQSS -+ int time_slice; -+ u64 deadline; -+ skiplist_node node; /* Skip list node */ -+ u64 last_ran; -+ u64 sched_time; /* sched_clock time spent running */ -+#ifdef CONFIG_SMT_NICE -+ int 
smt_bias; /* Policy/nice level bias across smt siblings */ -+#endif -+#ifdef CONFIG_HOTPLUG_CPU -+ bool zerobound; /* Bound to CPU0 for hotplug */ -+#endif -+ unsigned long rt_timeout; -+#else /* CONFIG_SCHED_MUQSS */ - - const struct sched_class *sched_class; - struct sched_entity se; - struct sched_rt_entity rt; -+#endif - #ifdef CONFIG_CGROUP_SCHED - struct task_group *sched_task_group; - #endif -@@ -751,6 +771,10 @@ - u64 utimescaled; - u64 stimescaled; - #endif -+#ifdef CONFIG_SCHED_MUQSS -+ /* Unbanked cpu time */ -+ unsigned long utime_ns, stime_ns; -+#endif - u64 gtime; - struct prev_cputime prev_cputime; - #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -@@ -1155,6 +1179,40 @@ - */ - }; - -+#ifdef CONFIG_SCHED_MUQSS -+#define tsk_seruntime(t) ((t)->sched_time) -+#define tsk_rttimeout(t) ((t)->rt_timeout) -+ -+static inline void tsk_cpus_current(struct task_struct *p) -+{ -+} -+ -+void print_scheduler_version(void); -+ -+static inline bool iso_task(struct task_struct *p) -+{ -+ return (p->policy == SCHED_ISO); -+} -+#else /* CFS */ -+#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) -+#define tsk_rttimeout(t) ((t)->rt.timeout) -+ -+static inline void tsk_cpus_current(struct task_struct *p) -+{ -+ p->nr_cpus_allowed = current->nr_cpus_allowed; -+} -+ -+static inline void print_scheduler_version(void) -+{ -+ printk(KERN_INFO "CFS CPU scheduler.\n"); -+} -+ -+static inline bool iso_task(struct task_struct *p) -+{ -+ return false; -+} -+#endif /* CONFIG_SCHED_MUQSS */ -+ - static inline struct pid *task_pid(struct task_struct *task) - { - return task->pids[PIDTYPE_PID].pid; -diff -Nur a/include/linux/skip_list.h b/include/linux/skip_list.h ---- a/include/linux/skip_list.h 1970-01-01 01:00:00.000000000 +0100 -+++ b/include/linux/skip_list.h 2019-01-05 20:22:51.089998199 +0000 -@@ -0,0 +1,33 @@ -+#ifndef _LINUX_SKIP_LISTS_H -+#define _LINUX_SKIP_LISTS_H -+typedef u64 keyType; -+typedef void *valueType; -+ -+typedef struct nodeStructure skiplist_node; -+ -+struct nodeStructure { -+ int level; /* Levels in this structure */ -+ keyType key; -+ valueType value; -+ skiplist_node *next[8]; -+ skiplist_node *prev[8]; -+}; -+ -+typedef struct listStructure { -+ int entries; -+ int level; /* Maximum level of the list -+ (1 more than the number of levels in the list) */ -+ skiplist_node *header; /* pointer to header */ -+} skiplist; -+ -+void skiplist_init(skiplist_node *slnode); -+skiplist *new_skiplist(skiplist_node *slnode); -+void free_skiplist(skiplist *l); -+void skiplist_node_init(skiplist_node *node); -+void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed); -+void skiplist_delete(skiplist *l, skiplist_node *node); -+ -+static inline bool skiplist_node_empty(skiplist_node *node) { -+ return (!node->next[0]); -+} -+#endif /* _LINUX_SKIP_LISTS_H */ -diff -Nur a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h ---- a/include/uapi/linux/sched.h 2018-12-21 13:13:19.000000000 +0000 -+++ b/include/uapi/linux/sched.h 2019-01-05 20:22:51.089998199 +0000 -@@ -37,9 +37,16 @@ - #define SCHED_FIFO 1 - #define SCHED_RR 2 - #define SCHED_BATCH 3 --/* SCHED_ISO: reserved but not implemented yet */ -+/* SCHED_ISO: Implemented on MuQSS only */ - #define SCHED_IDLE 5 -+#ifdef CONFIG_SCHED_MUQSS -+#define SCHED_ISO 4 -+#define SCHED_IDLEPRIO SCHED_IDLE -+#define SCHED_MAX (SCHED_IDLEPRIO) -+#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) -+#else /* CONFIG_SCHED_MUQSS */ - #define SCHED_DEADLINE 6 -+#endif /* CONFIG_SCHED_MUQSS */ - - /* Can be ORed in to 
make sure the process is reverted back to SCHED_NORMAL on fork */ - #define SCHED_RESET_ON_FORK 0x40000000 -diff -Nur a/init/Kconfig b/init/Kconfig ---- a/init/Kconfig 2019-01-05 20:17:13.849238543 +0000 -+++ b/init/Kconfig 2019-01-05 20:22:51.089998199 +0000 -@@ -38,6 +38,18 @@ - - menu "General setup" - -+config SCHED_MUQSS -+ bool "MuQSS cpu scheduler" -+ select HIGH_RES_TIMERS -+ ---help--- -+ The Multiple Queue Skiplist Scheduler for excellent interactivity and -+ responsiveness on the desktop and highly scalable deterministic -+ low latency on any hardware. -+ -+ Say Y here. -+ default y -+ -+ - config BROKEN - bool - -@@ -621,6 +633,7 @@ - depends on ARCH_SUPPORTS_NUMA_BALANCING - depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION -+ depends on !SCHED_MUQSS - help - This option adds support for automatic NUMA aware memory/task placement. - The mechanism is quite primitive and is based on migrating memory when -@@ -723,9 +736,13 @@ - help - This feature lets CPU scheduler recognize task groups and control CPU - bandwidth allocation to such task groups. It uses cgroups to group -- tasks. -+ tasks. In combination with MuQSS this is purely a STUB to create the -+ files associated with the CPU controller cgroup but most of the -+ controls do nothing. This is useful for working in environments and -+ with applications that will only work if this control group is -+ present. - --if CGROUP_SCHED -+if CGROUP_SCHED && !SCHED_MUQSS - config FAIR_GROUP_SCHED - bool "Group scheduling for SCHED_OTHER" - depends on CGROUP_SCHED -@@ -832,6 +849,7 @@ - - config CGROUP_CPUACCT - bool "Simple CPU accounting controller" -+ depends on !SCHED_MUQSS - help - Provides a simple controller for monitoring the - total CPU consumed by the tasks in a cgroup. 
-@@ -950,6 +968,7 @@ - - config SCHED_AUTOGROUP - bool "Automatic process group scheduling" -+ depends on !SCHED_MUQSS - select CGROUPS - select CGROUP_SCHED - select FAIR_GROUP_SCHED -diff -Nur a/init/main.c b/init/main.c ---- a/init/main.c 2018-12-21 13:13:19.000000000 +0000 -+++ b/init/main.c 2019-01-05 20:22:51.089998199 +0000 -@@ -841,7 +841,6 @@ - return ret; - } - -- - extern initcall_t __initcall_start[]; - extern initcall_t __initcall0_start[]; - extern initcall_t __initcall1_start[]; -@@ -1008,6 +1007,8 @@ - - rcu_end_inkernel_boot(); - -+ print_scheduler_version(); -+ - if (ramdisk_execute_command) { - ret = run_init_process(ramdisk_execute_command); - if (!ret) -diff -Nur a/kernel/delayacct.c b/kernel/delayacct.c ---- a/kernel/delayacct.c 2018-12-21 13:13:19.000000000 +0000 -+++ b/kernel/delayacct.c 2019-01-05 20:22:51.089998199 +0000 -@@ -115,7 +115,7 @@ - */ - t1 = tsk->sched_info.pcount; - t2 = tsk->sched_info.run_delay; -- t3 = tsk->se.sum_exec_runtime; -+ t3 = tsk_seruntime(tsk); - - d->cpu_count += t1; - -diff -Nur a/kernel/exit.c b/kernel/exit.c ---- a/kernel/exit.c 2018-12-21 13:13:19.000000000 +0000 -+++ b/kernel/exit.c 2019-01-05 20:22:51.089998199 +0000 -@@ -129,7 +129,7 @@ - sig->curr_target = next_thread(tsk); - } - -- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, -+ add_device_randomness((const void*) &tsk_seruntime(tsk), - sizeof(unsigned long long)); - - /* -@@ -150,7 +150,7 @@ - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); -- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; -+ sig->sum_sched_runtime += tsk_seruntime(tsk); - sig->nr_threads--; - __unhash_process(tsk, group_dead); - write_sequnlock(&sig->stats_lock); -diff -Nur a/kernel/kthread.c b/kernel/kthread.c ---- a/kernel/kthread.c 2018-12-21 13:13:19.000000000 +0000 -+++ b/kernel/kthread.c 2019-01-05 20:22:51.099998516 +0000 -@@ -410,6 +410,34 @@ - } - EXPORT_SYMBOL(kthread_bind); - -+#if defined(CONFIG_SCHED_MUQSS) && defined(CONFIG_SMP) -+extern void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); -+ -+/* -+ * new_kthread_bind is a special variant of __kthread_bind_mask. -+ * For new threads to work on muqss we want to call do_set_cpus_allowed -+ * without the task_cpu being set and the task rescheduled until they're -+ * rescheduled on their own so we call __do_set_cpus_allowed directly which -+ * only changes the cpumask. This is particularly important for smpboot threads -+ * to work. -+ */ -+static void new_kthread_bind(struct task_struct *p, unsigned int cpu) -+{ -+ unsigned long flags; -+ -+ if (WARN_ON(!wait_task_inactive(p, TASK_UNINTERRUPTIBLE))) -+ return; -+ -+ /* It's safe because the task is inactive. */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ __do_set_cpus_allowed(p, cpumask_of(cpu)); -+ p->flags |= PF_NO_SETAFFINITY; -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+#else -+#define new_kthread_bind(p, cpu) kthread_bind(p, cpu) -+#endif -+ - /** - * kthread_create_on_cpu - Create a cpu bound kthread - * @threadfn: the function to run until signal_pending(current). -@@ -431,7 +459,7 @@ - cpu); - if (IS_ERR(p)) - return p; -- kthread_bind(p, cpu); -+ new_kthread_bind(p, cpu); - /* CPU hotplug need to bind once again when unparking the thread. 
*/
- 	set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
- 	to_kthread(p)->cpu = cpu;
-diff -Nur a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
---- a/kernel/livepatch/transition.c	2018-12-21 13:13:19.000000000 +0000
-+++ b/kernel/livepatch/transition.c	2019-01-05 20:22:51.099998516 +0000
-@@ -277,6 +277,12 @@
- 	return 0;
- }
- 
-+#ifdef CONFIG_SCHED_MUQSS
-+typedef unsigned long rq_flags_t;
-+#else
-+typedef struct rq_flags rq_flags_t;
-+#endif
-+
- /*
-  * Try to safely switch a task to the target patch state. If it's currently
-  * running, or it's sleeping on a to-be-patched or to-be-unpatched function, or
-@@ -285,7 +291,7 @@
- static bool klp_try_switch_task(struct task_struct *task)
- {
- 	struct rq *rq;
--	struct rq_flags flags;
-+	rq_flags_t flags;
- 	int ret;
- 	bool success = false;
- 	char err_buf[STACK_ERR_BUF_SIZE];
-diff -Nur a/kernel/Makefile b/kernel/Makefile
---- a/kernel/Makefile	2018-12-21 13:13:19.000000000 +0000
-+++ b/kernel/Makefile	2019-01-05 20:22:51.099998516 +0000
-@@ -10,7 +10,7 @@
- 	    extable.o params.o \
- 	    kthread.o sys_ni.o nsproxy.o \
- 	    notifier.o ksysfs.o cred.o reboot.o \
--	    async.o range.o smpboot.o ucount.o
-+	    async.o range.o smpboot.o ucount.o skip_list.o
- 
- obj-$(CONFIG_MODULES) += kmod.o
- obj-$(CONFIG_MULTIUSER) += groups.o
-diff -Nur a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
---- a/kernel/rcu/Kconfig	2018-12-21 13:13:19.000000000 +0000
-+++ b/kernel/rcu/Kconfig	2019-01-05 20:22:51.099998516 +0000
-@@ -93,7 +93,7 @@
- config CONTEXT_TRACKING_FORCE
- 	bool "Force context tracking"
- 	depends on CONTEXT_TRACKING
--	default y if !NO_HZ_FULL
-+	default y if !NO_HZ_FULL && !SCHED_MUQSS
- 	help
- 	  The major pre-requirement for full dynticks to work is to
- 	  support the context tracking subsystem. But there are also
-diff -Nur a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
---- a/kernel/sched/cpufreq_schedutil.c	2018-12-21 13:13:19.000000000 +0000
-+++ b/kernel/sched/cpufreq_schedutil.c	2019-01-05 20:22:51.099998516 +0000
-@@ -176,6 +176,17 @@
- 	return cpufreq_driver_resolve_freq(policy, freq);
- }
- 
-+#ifdef CONFIG_SCHED_MUQSS
-+static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu)
-+{
-+	struct rq *rq = cpu_rq(cpu);
-+
-+	*util = rq->load_avg;
-+	if (*util > SCHED_CAPACITY_SCALE)
-+		*util = SCHED_CAPACITY_SCALE;
-+	*max = SCHED_CAPACITY_SCALE;
-+}
-+#else /* CONFIG_SCHED_MUQSS */
- static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu)
- {
- 	struct rq *rq = cpu_rq(cpu);
-@@ -186,6 +197,7 @@
- 	*util = min(rq->cfs.avg.util_avg, cfs_max);
- 	*max = cfs_max;
- }
-+#endif /* CONFIG_SCHED_MUQSS */
- 
- static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
- 				   unsigned int flags)
-diff -Nur a/kernel/sched/cputime.c b/kernel/sched/cputime.c
---- a/kernel/sched/cputime.c	2018-12-21 13:13:19.000000000 +0000
-+++ b/kernel/sched/cputime.c	2019-01-05 20:22:51.099998516 +0000
-@@ -270,26 +270,6 @@
- 	return accounted;
- }
- 
--#ifdef CONFIG_64BIT
--static inline u64 read_sum_exec_runtime(struct task_struct *t)
--{
--	return t->se.sum_exec_runtime;
--}
--#else
--static u64 read_sum_exec_runtime(struct task_struct *t)
--{
--	u64 ns;
--	struct rq_flags rf;
--	struct rq *rq;
--
--	rq = task_rq_lock(t, &rf);
--	ns = t->se.sum_exec_runtime;
--	task_rq_unlock(rq, t, &rf);
--
--	return ns;
--}
--#endif
--
- /*
-  * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
-  * tasks (sum on group iteration) belonging to @tsk's group.
-@@ -661,7 +641,7 @@ - void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) - { - struct task_cputime cputime = { -- .sum_exec_runtime = p->se.sum_exec_runtime, -+ .sum_exec_runtime = tsk_seruntime(p), - }; - - task_cputime(p, &cputime.utime, &cputime.stime); -diff -Nur a/kernel/sched/idle.c b/kernel/sched/idle.c ---- a/kernel/sched/idle.c 2018-12-21 13:13:19.000000000 +0000 -+++ b/kernel/sched/idle.c 2019-01-05 20:22:51.099998516 +0000 -@@ -209,6 +209,9 @@ - */ - static void do_idle(void) - { -+ int cpu = smp_processor_id(); -+ bool pending = false; -+ - /* - * If the arch has a polling bit, we maintain an invariant: - * -@@ -220,13 +223,16 @@ - - __current_set_polling(); - quiet_vmstat(); -- tick_nohz_idle_enter(); -+ if (unlikely(softirq_pending(cpu))) -+ pending = true; -+ else -+ tick_nohz_idle_enter(); - - while (!need_resched()) { - check_pgt_cache(); - rmb(); - -- if (cpu_is_offline(smp_processor_id())) { -+ if (cpu_is_offline(cpu)) { - cpuhp_report_idle_dead(); - arch_cpu_idle_dead(); - } -@@ -255,7 +261,8 @@ - * an IPI to fold the state for us. - */ - preempt_set_need_resched(); -- tick_nohz_idle_exit(); -+ if (!pending) -+ tick_nohz_idle_exit(); - __current_clr_polling(); - - /* -diff -Nur a/kernel/sched/Makefile b/kernel/sched/Makefile ---- a/kernel/sched/Makefile 2018-12-21 13:13:19.000000000 +0000 -+++ b/kernel/sched/Makefile 2019-01-05 20:22:51.099998516 +0000 -@@ -16,14 +16,20 @@ - CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer - endif - --obj-y += core.o loadavg.o clock.o cputime.o -+ifdef CONFIG_SCHED_MUQSS -+obj-y += MuQSS.o clock.o -+else -+obj-y += core.o loadavg.o clock.o - obj-y += idle_task.o fair.o rt.o deadline.o --obj-y += wait.o wait_bit.o swait.o completion.o idle.o --obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o -+obj-$(CONFIG_SMP) += cpudeadline.o stop_task.o - obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o --obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_SCHED_DEBUG) += debug.o - obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o -+endif -+obj-y += cputime.o -+obj-y += wait.o wait_bit.o swait.o completion.o idle.o -+obj-$(CONFIG_SMP) += cpupri.o topology.o -+obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_CPU_FREQ) += cpufreq.o - obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o - obj-$(CONFIG_MEMBARRIER) += membarrier.o -diff -Nur a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c ---- a/kernel/sched/MuQSS.c 1970-01-01 01:00:00.000000000 +0100 -+++ b/kernel/sched/MuQSS.c 2019-01-05 20:22:51.099998516 +0000 -@@ -0,0 +1,6923 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * kernel/sched/MuQSS.c, was kernel/sched.c -+ * -+ * Kernel scheduler and related syscalls -+ * -+ * Copyright (C) 1991-2002 Linus Torvalds -+ * -+ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and -+ * make semaphores SMP safe -+ * 1998-11-19 Implemented schedule_timeout() and related stuff -+ * by Andrea Arcangeli -+ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: -+ * hybrid priority-list and round-robin design with -+ * an array-switch method of distributing timeslices -+ * and per-CPU runqueues. Cleanups and useful suggestions -+ * by Davide Libenzi, preemptible kernel bits by Robert Love. -+ * 2003-09-03 Interactivity tuning by Con Kolivas. -+ * 2004-04-02 Scheduler domains code by Nick Piggin -+ * 2007-04-15 Work begun on replacing all interactivity tuning with a -+ * fair scheduling design by Con Kolivas. 
-+ * 2007-05-05 Load balancing (smp-nice) and other improvements -+ * by Peter Williams -+ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith -+ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri -+ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, -+ * Thomas Gleixner, Mike Kravetz -+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes -+ * a whole lot of those previous things. -+ * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS -+ * scheduler by Con Kolivas. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#ifdef CONFIG_PARAVIRT -+#include -+#endif -+ -+#include "../workqueue_internal.h" -+#include "../smpboot.h" -+ -+#define CREATE_TRACE_POINTS -+#include -+ -+#include "MuQSS.h" -+ -+#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) -+#define rt_task(p) rt_prio((p)->prio) -+#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) -+#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ -+ (policy) == SCHED_RR) -+#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) -+ -+#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) -+#define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) -+#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) -+ -+#define is_iso_policy(policy) ((policy) == SCHED_ISO) -+#define iso_task(p) unlikely(is_iso_policy((p)->policy)) -+#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) -+ -+#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) -+ -+#define ISO_PERIOD (5 * HZ) -+ -+#define STOP_PRIO (MAX_RT_PRIO - 1) -+ -+/* -+ * Some helpers for converting to/from various scales. Use shifts to get -+ * approximate multiples of ten for less overhead. -+ */ -+#define JIFFIES_TO_NS(TIME) ((TIME) * (1073741824 / HZ)) -+#define JIFFY_NS (1073741824 / HZ) -+#define JIFFY_US (1048576 / HZ) -+#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) -+#define HALF_JIFFY_NS (1073741824 / HZ / 2) -+#define HALF_JIFFY_US (1048576 / HZ / 2) -+#define MS_TO_NS(TIME) ((TIME) << 20) -+#define MS_TO_US(TIME) ((TIME) << 10) -+#define NS_TO_MS(TIME) ((TIME) >> 20) -+#define NS_TO_US(TIME) ((TIME) >> 10) -+#define US_TO_NS(TIME) ((TIME) << 10) -+ -+#define RESCHED_US (100) /* Reschedule if less than this many μs left */ -+ -+void print_scheduler_version(void) -+{ -+ printk(KERN_INFO "MuQSS CPU scheduler v0.162 by Con Kolivas.\n"); -+} -+ -+/* -+ * This is the time all tasks within the same priority round robin. -+ * Value is in ms and set to a minimum of 6ms. -+ * Tunable via /proc interface. -+ */ -+int rr_interval __read_mostly = 6; -+ -+/* -+ * Tunable to choose whether to prioritise latency or throughput, simple -+ * binary yes or no -+ */ -+int sched_interactive __read_mostly = 1; -+ -+/* -+ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks -+ * are allowed to run five seconds as real time tasks. This is the total over -+ * all online cpus. -+ */ -+int sched_iso_cpu __read_mostly = 70; -+ -+/* -+ * sched_yield_type - Choose what sort of yield sched_yield will perform. -+ * 0: No yield. -+ * 1: Yield only to better priority/deadline tasks. (default) -+ * 2: Expire timeslice and recalculate deadline. 
-+ */ -+int sched_yield_type __read_mostly = 1; -+ -+/* -+ * The relative length of deadline for each priority(nice) level. -+ */ -+static int prio_ratios[NICE_WIDTH] __read_mostly; -+ -+/* -+ * The quota handed out to tasks of all priority levels when refilling their -+ * time_slice. -+ */ -+static inline int timeslice(void) -+{ -+ return MS_TO_US(rr_interval); -+} -+ -+#ifdef CONFIG_SMP -+static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp; -+#endif -+ -+/* CPUs with isolated domains */ -+cpumask_var_t cpu_isolated_map; -+ -+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+#ifdef CONFIG_SMP -+struct rq *cpu_rq(int cpu) -+{ -+ return &per_cpu(runqueues, (cpu)); -+} -+#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -+ -+/* -+ * For asym packing, by default the lower numbered cpu has higher priority. -+ */ -+int __weak arch_asym_cpu_priority(int cpu) -+{ -+ return -cpu; -+} -+ -+int __weak arch_sd_sibling_asym_packing(void) -+{ -+ return 0*SD_ASYM_PACKING; -+} -+#else -+struct rq *uprq; -+#endif /* CONFIG_SMP */ -+ -+#include "stats.h" -+ -+#ifndef prepare_arch_switch -+# define prepare_arch_switch(next) do { } while (0) -+#endif -+#ifndef finish_arch_switch -+# define finish_arch_switch(prev) do { } while (0) -+#endif -+#ifndef finish_arch_post_lock_switch -+# define finish_arch_post_lock_switch() do { } while (0) -+#endif -+ -+/* -+ * All common locking functions performed on rq->lock. rq->clock is local to -+ * the CPU accessing it so it can be modified just with interrupts disabled -+ * when we're not updating niffies. -+ * Looking up task_rq must be done under rq->lock to be safe. -+ */ -+ -+/* -+ * RQ-clock updating methods: -+ */ -+ -+static void update_rq_clock_task(struct rq *rq, s64 delta) -+{ -+/* -+ * In theory, the compile should just see 0 here, and optimize out the call -+ * to sched_rt_avg_update. But I don't trust it... -+ */ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; -+ -+ /* -+ * Since irq_time is only updated on {soft,}irq_exit, we might run into -+ * this case when a previous update_rq_clock() happened inside a -+ * {soft,}irq region. -+ * -+ * When this happens, we stop ->clock_task and only update the -+ * prev_irq_time stamp to account for the part that fit, so that a next -+ * update will consume the rest. This ensures ->clock_task is -+ * monotonic. -+ * -+ * It does however cause some slight miss-attribution of {soft,}irq -+ * time, a more accurate solution would be to update the irq_time using -+ * the current rq->clock timestamp, except that would require using -+ * atomic ops. -+ */ -+ if (irq_delta > delta) -+ irq_delta = delta; -+ -+ rq->prev_irq_time += irq_delta; -+ delta -= irq_delta; -+#endif -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ if (static_key_false((¶virt_steal_rq_enabled))) { -+ s64 steal = paravirt_steal_clock(cpu_of(rq)); -+ -+ steal -= rq->prev_steal_time_rq; -+ -+ if (unlikely(steal > delta)) -+ steal = delta; -+ -+ rq->prev_steal_time_rq += steal; -+ -+ delta -= steal; -+ } -+#endif -+ rq->clock_task += delta; -+} -+ -+static inline void update_rq_clock(struct rq *rq) -+{ -+ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; -+ -+ if (unlikely(delta < 0)) -+ return; -+ rq->clock += delta; -+ update_rq_clock_task(rq, delta); -+} -+ -+/* -+ * Niffies are a globally increasing nanosecond counter. They're only used by -+ * update_load_avg and time_slice_expired, however deadlines are based on them -+ * across CPUs. 
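The key property of update_rq_clock_task() above is the clamp: whatever part of the elapsed window was consumed by irq (or steal) time is subtracted from the task clock, but never more than the window itself, so clock_task stays monotonic and the excess is consumed by the next update. A user-space sketch of just that clamping, with local stand-in names:

/* Monotonic task-clock accounting with the irq-time clamp above. */
#include <stdint.h>
#include <stdio.h>

struct fake_rq { int64_t clock_task, prev_irq_time; };

static void update_clock_task(struct fake_rq *rq, int64_t delta,
                              int64_t irq_time_now)
{
    int64_t irq_delta = irq_time_now - rq->prev_irq_time;

    if (irq_delta > delta)          /* clamp: account the rest later */
        irq_delta = delta;
    rq->prev_irq_time += irq_delta;
    delta -= irq_delta;
    rq->clock_task += delta;        /* never decreases */
}

int main(void)
{
    struct fake_rq rq = { 0, 0 };

    update_clock_task(&rq, 1000, 300);  /* 300 ns of irq time     */
    update_clock_task(&rq, 500, 1200);  /* 900 ns irq > 500 window */
    printf("clock_task=%lld prev_irq=%lld\n",
           (long long)rq.clock_task, (long long)rq.prev_irq_time);
    return 0;
}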
Update them whenever we will call one of those functions, and -+ * synchronise them across CPUs whenever we hold both runqueue locks. -+ */ -+static inline void update_clocks(struct rq *rq) -+{ -+ s64 ndiff, minndiff; -+ long jdiff; -+ -+ update_rq_clock(rq); -+ ndiff = rq->clock - rq->old_clock; -+ rq->old_clock = rq->clock; -+ jdiff = jiffies - rq->last_jiffy; -+ -+ /* Subtract any niffies added by balancing with other rqs */ -+ ndiff -= rq->niffies - rq->last_niffy; -+ minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies; -+ if (minndiff < 0) -+ minndiff = 0; -+ ndiff = max(ndiff, minndiff); -+ rq->niffies += ndiff; -+ rq->last_niffy = rq->niffies; -+ if (jdiff) { -+ rq->last_jiffy += jdiff; -+ rq->last_jiffy_niffies = rq->niffies; -+ } -+} -+ -+static inline int task_on_rq_queued(struct task_struct *p) -+{ -+ return p->on_rq == TASK_ON_RQ_QUEUED; -+} -+ -+static inline int task_on_rq_migrating(struct task_struct *p) -+{ -+ return p->on_rq == TASK_ON_RQ_MIGRATING; -+} -+ -+static inline int rq_trylock(struct rq *rq) -+ __acquires(rq->lock) -+{ -+ return raw_spin_trylock(&rq->lock); -+} -+ -+/* -+ * Any time we have two runqueues locked we use that as an opportunity to -+ * synchronise niffies to the highest value as idle ticks may have artificially -+ * kept niffies low on one CPU and the truth can only be later. -+ */ -+static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2) -+{ -+ if (rq1->niffies > rq2->niffies) -+ rq2->niffies = rq1->niffies; -+ else -+ rq1->niffies = rq2->niffies; -+} -+ -+/* -+ * double_rq_lock - safely lock two runqueues -+ * -+ * Note this does not disable interrupts like task_rq_lock, -+ * you need to do so manually before calling. -+ */ -+ -+/* For when we know rq1 != rq2 */ -+static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2) -+ __acquires(rq1->lock) -+ __acquires(rq2->lock) -+{ -+ if (rq1 < rq2) { -+ raw_spin_lock(&rq1->lock); -+ raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); -+ } else { -+ raw_spin_lock(&rq2->lock); -+ raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); -+ } -+} -+ -+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) -+ __acquires(rq1->lock) -+ __acquires(rq2->lock) -+{ -+ BUG_ON(!irqs_disabled()); -+ if (rq1 == rq2) { -+ raw_spin_lock(&rq1->lock); -+ __acquire(rq2->lock); /* Fake it out ;) */ -+ } else -+ __double_rq_lock(rq1, rq2); -+ synchronise_niffies(rq1, rq2); -+} -+ -+/* -+ * double_rq_unlock - safely unlock two runqueues -+ * -+ * Note this does not restore interrupts like task_rq_unlock, -+ * you need to do so manually after calling. 
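__double_rq_lock() above uses the classic deadlock-avoidance ordering: whichever runqueue has the lower address is locked first, so two CPUs locking the same pair in opposite argument order still acquire in the same global order. A pthread sketch of the same discipline (address comparison mirrors the kernel code; link with -lpthread):

/* Address-ordered locking of two runqueues, as in __double_rq_lock(). */
#include <pthread.h>
#include <stdio.h>

struct rq { pthread_mutex_t lock; };

static void double_rq_lock(struct rq *rq1, struct rq *rq2)
{
    if (rq1 == rq2) {
        pthread_mutex_lock(&rq1->lock);
    } else if (rq1 < rq2) {             /* lower address first */
        pthread_mutex_lock(&rq1->lock);
        pthread_mutex_lock(&rq2->lock);
    } else {
        pthread_mutex_lock(&rq2->lock);
        pthread_mutex_lock(&rq1->lock);
    }
}

static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
{
    pthread_mutex_unlock(&rq1->lock);
    if (rq1 != rq2)
        pthread_mutex_unlock(&rq2->lock);
}

int main(void)
{
    struct rq a = { PTHREAD_MUTEX_INITIALIZER };
    struct rq b = { PTHREAD_MUTEX_INITIALIZER };

    double_rq_lock(&a, &b);     /* same acquisition order as (&b, &a) */
    puts("both runqueues locked");
    double_rq_unlock(&a, &b);
    return 0;
}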
-+ */ -+static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) -+ __releases(rq1->lock) -+ __releases(rq2->lock) -+{ -+ raw_spin_unlock(&rq1->lock); -+ if (rq1 != rq2) -+ raw_spin_unlock(&rq2->lock); -+ else -+ __release(rq2->lock); -+} -+ -+static inline void lock_all_rqs(void) -+{ -+ int cpu; -+ -+ preempt_disable(); -+ for_each_possible_cpu(cpu) { -+ struct rq *rq = cpu_rq(cpu); -+ -+ do_raw_spin_lock(&rq->lock); -+ } -+} -+ -+static inline void unlock_all_rqs(void) -+{ -+ int cpu; -+ -+ for_each_possible_cpu(cpu) { -+ struct rq *rq = cpu_rq(cpu); -+ -+ do_raw_spin_unlock(&rq->lock); -+ } -+ preempt_enable(); -+} -+ -+/* Specially nest trylock an rq */ -+static inline bool trylock_rq(struct rq *this_rq, struct rq *rq) -+{ -+ if (unlikely(!do_raw_spin_trylock(&rq->lock))) -+ return false; -+ spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); -+ synchronise_niffies(this_rq, rq); -+ return true; -+} -+ -+/* Unlock a specially nested trylocked rq */ -+static inline void unlock_rq(struct rq *rq) -+{ -+ spin_release(&rq->lock.dep_map, 1, _RET_IP_); -+ do_raw_spin_unlock(&rq->lock); -+} -+ -+/* -+ * cmpxchg based fetch_or, macro so it works for different integer types -+ */ -+#define fetch_or(ptr, mask) \ -+ ({ \ -+ typeof(ptr) _ptr = (ptr); \ -+ typeof(mask) _mask = (mask); \ -+ typeof(*_ptr) _old, _val = *_ptr; \ -+ \ -+ for (;;) { \ -+ _old = cmpxchg(_ptr, _val, _val | _mask); \ -+ if (_old == _val) \ -+ break; \ -+ _val = _old; \ -+ } \ -+ _old; \ -+}) -+ -+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) -+/* -+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, -+ * this avoids any races wrt polling state changes and thereby avoids -+ * spurious IPIs. -+ */ -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); -+} -+ -+/* -+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. -+ * -+ * If this returns true, then the idle task promises to call -+ * sched_ttwu_pending() and reschedule soon. -+ */ -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ typeof(ti->flags) old, val = READ_ONCE(ti->flags); -+ -+ for (;;) { -+ if (!(val & _TIF_POLLING_NRFLAG)) -+ return false; -+ if (val & _TIF_NEED_RESCHED) -+ return true; -+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); -+ if (old == val) -+ break; -+ val = old; -+ } -+ return true; -+} -+ -+#else -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ return true; -+} -+ -+#ifdef CONFIG_SMP -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ return false; -+} -+#endif -+#endif -+ -+void wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ struct wake_q_node *node = &task->wake_q; -+ -+ /* -+ * Atomically grab the task, if ->wake_q is !nil already it means -+ * its already queued (either by us or someone else) and will get the -+ * wakeup due to that. -+ * -+ * This cmpxchg() implies a full barrier, which pairs with the write -+ * barrier implied by the wakeup in wake_up_q(). -+ */ -+ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) -+ return; -+ -+ get_task_struct(task); -+ -+ /* -+ * The head is context local, there can be no concurrency. 
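set_nr_and_not_polling() above relies on fetch_or() being a single atomic read-modify-write: one operation both sets TIF_NEED_RESCHED and reports whether the target CPU was polling, so the IPI can be skipped without any race window. The kernel open-codes fetch_or with a cmpxchg loop for type-genericity; C11 has it directly, so a stand-alone sketch is short:

/* C11 sketch of set_nr_and_not_polling() above. */
#include <stdatomic.h>
#include <stdio.h>

#define _TIF_NEED_RESCHED   (1u << 0)
#define _TIF_POLLING_NRFLAG (1u << 1)

static _Bool set_nr_and_not_polling(atomic_uint *flags)
{
    unsigned int old = atomic_fetch_or(flags, _TIF_NEED_RESCHED);

    return !(old & _TIF_POLLING_NRFLAG);    /* true -> must send IPI */
}

int main(void)
{
    atomic_uint busy = 0;                       /* not polling      */
    atomic_uint idle = _TIF_POLLING_NRFLAG;     /* polling in idle  */

    printf("busy cpu needs IPI: %d\n", set_nr_and_not_polling(&busy));
    printf("polling cpu needs IPI: %d\n", set_nr_and_not_polling(&idle));
    return 0;
}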
-+ */ -+ *head->lastp = node; -+ head->lastp = &node->next; -+} -+ -+void wake_up_q(struct wake_q_head *head) -+{ -+ struct wake_q_node *node = head->first; -+ -+ while (node != WAKE_Q_TAIL) { -+ struct task_struct *task; -+ -+ task = container_of(node, struct task_struct, wake_q); -+ BUG_ON(!task); -+ /* Task can safely be re-inserted now */ -+ node = node->next; -+ task->wake_q.next = NULL; -+ -+ /* -+ * wake_up_process() implies a wmb() to pair with the queueing -+ * in wake_q_add() so as not to miss wakeups. -+ */ -+ wake_up_process(task); -+ put_task_struct(task); -+ } -+} -+ -+static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -+{ -+ next->on_cpu = 1; -+} -+ -+static inline void smp_sched_reschedule(int cpu) -+{ -+ if (likely(cpu_online(cpu))) -+ smp_send_reschedule(cpu); -+} -+ -+/* -+ * resched_task - mark a task 'to be rescheduled now'. -+ * -+ * On UP this means the setting of the need_resched flag, on SMP it -+ * might also involve a cross-CPU call to trigger the scheduler on -+ * the target CPU. -+ */ -+void resched_task(struct task_struct *p) -+{ -+ int cpu; -+#ifdef CONFIG_LOCKDEP -+ /* Kernel threads call this when creating workqueues while still -+ * inactive from __kthread_bind_mask, holding only the pi_lock */ -+ if (!(p->flags & PF_KTHREAD)) { -+ struct rq *rq = task_rq(p); -+ -+ lockdep_assert_held(&rq->lock); -+ } -+#endif -+ if (test_tsk_need_resched(p)) -+ return; -+ -+ cpu = task_cpu(p); -+ if (cpu == smp_processor_id()) { -+ set_tsk_need_resched(p); -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ if (set_nr_and_not_polling(p)) -+ smp_sched_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+/* -+ * A task that is not running or queued will not have a node set. -+ * A task that is queued but not running will have a node set. -+ * A task that is currently running will have ->on_cpu set but no node set. -+ */ -+static inline bool task_queued(struct task_struct *p) -+{ -+ return !skiplist_node_empty(&p->node); -+} -+ -+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags); -+static inline void resched_if_idle(struct rq *rq); -+ -+/* Dodgy workaround till we figure out where the softirqs are going */ -+static inline void do_pending_softirq(struct rq *rq, struct task_struct *next) -+{ -+ if (unlikely(next == rq->idle && local_softirq_pending() && !in_interrupt())) -+ do_softirq_own_stack(); -+} -+ -+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * After ->on_cpu is cleared, the task can be moved to a different CPU. -+ * We must ensure this doesn't happen until the switch is completely -+ * finished. -+ * -+ * In particular, the load of prev->state in finish_task_switch() must -+ * happen before this. -+ * -+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). -+ */ -+ smp_store_release(&prev->on_cpu, 0); -+#endif -+#ifdef CONFIG_DEBUG_SPINLOCK -+ /* this is a valid case when another task releases the spinlock */ -+ rq->lock.owner = current; -+#endif -+ /* -+ * If we are tracking spinlock dependencies then we have to -+ * fix up the runqueue lock - which gets 'carried over' from -+ * prev into current: -+ */ -+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); -+ -+#ifdef CONFIG_SMP -+ /* -+ * If prev was marked as migrating to another CPU in return_task, drop -+ * the local runqueue lock but leave interrupts disabled and grab the -+ * remote lock we're migrating it to before enabling them. 
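The wake_q machinery above de-duplicates wakeups by claiming a task's embedded node: cmpxchg()ing ->next from NULL to WAKE_Q_TAIL succeeds for exactly one caller, so a task is queued at most once no matter how many waiters race to wake it. A C11 sketch of only the claim step (single-threaded demo; the list threading and reference counting are omitted):

/* wake_q de-duplication via cmpxchg on the node's next pointer. */
#include <stdatomic.h>
#include <stdio.h>

struct wake_q_node { _Atomic(struct wake_q_node *) next; };

#define WAKE_Q_TAIL ((struct wake_q_node *)0x1)

static _Bool wake_q_claim(struct wake_q_node *node)
{
    struct wake_q_node *expected = NULL;

    /* Fails if anyone already queued this task for wakeup */
    return atomic_compare_exchange_strong(&node->next, &expected,
                                          WAKE_Q_TAIL);
}

int main(void)
{
    struct wake_q_node n = { NULL };

    printf("first add: %d\n", wake_q_claim(&n));    /* 1: queued      */
    printf("second add: %d\n", wake_q_claim(&n));   /* 0: already in  */
    return 0;
}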
-+ */ -+ if (unlikely(task_on_rq_migrating(prev))) { -+ sched_info_dequeued(rq, prev); -+ /* -+ * We move the ownership of prev to the new cpu now. ttwu can't -+ * activate prev to the wrong cpu since it has to grab this -+ * runqueue in ttwu_remote. -+ */ -+#ifdef CONFIG_THREAD_INFO_IN_TASK -+ prev->cpu = prev->wake_cpu; -+#else -+ task_thread_info(prev)->cpu = prev->wake_cpu; -+#endif -+ raw_spin_unlock(&rq->lock); -+ -+ raw_spin_lock(&prev->pi_lock); -+ rq = __task_rq_lock(prev); -+ /* Check that someone else hasn't already queued prev */ -+ if (likely(!task_queued(prev))) { -+ enqueue_task(rq, prev, 0); -+ prev->on_rq = TASK_ON_RQ_QUEUED; -+ /* Wake up the CPU if it's not already running */ -+ resched_if_idle(rq); -+ } -+ raw_spin_unlock(&prev->pi_lock); -+ } -+#endif -+ /* Accurately set nr_running here for load average calculations */ -+ rq->nr_running = rq->sl->entries + !rq_idle(rq); -+ rq_unlock(rq); -+ -+ do_pending_softirq(rq, current); -+ -+ local_irq_enable(); -+} -+ -+static inline bool deadline_before(u64 deadline, u64 time) -+{ -+ return (deadline < time); -+} -+ -+/* -+ * Deadline is "now" in niffies + (offset by priority). Setting the deadline -+ * is the key to everything. It distributes cpu fairly amongst tasks of the -+ * same nice value, it proportions cpu according to nice level, it means the -+ * task that last woke up the longest ago has the earliest deadline, thus -+ * ensuring that interactive tasks get low latency on wake up. The CPU -+ * proportion works out to the square of the virtual deadline difference, so -+ * this equation will give nice 19 3% CPU compared to nice 0. -+ */ -+static inline u64 prio_deadline_diff(int user_prio) -+{ -+ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); -+} -+ -+static inline u64 task_deadline_diff(struct task_struct *p) -+{ -+ return prio_deadline_diff(TASK_USER_PRIO(p)); -+} -+ -+static inline u64 static_deadline_diff(int static_prio) -+{ -+ return prio_deadline_diff(USER_PRIO(static_prio)); -+} -+ -+static inline int longest_deadline_diff(void) -+{ -+ return prio_deadline_diff(39); -+} -+ -+static inline int ms_longest_deadline_diff(void) -+{ -+ return NS_TO_MS(longest_deadline_diff()); -+} -+ -+static inline bool rq_local(struct rq *rq); -+ -+#ifndef SCHED_CAPACITY_SCALE -+#define SCHED_CAPACITY_SCALE 1024 -+#endif -+ -+static inline int rq_load(struct rq *rq) -+{ -+ return rq->nr_running; -+} -+ -+/* -+ * Update the load average for feeding into cpu frequency governors. Use a -+ * rough estimate of a rolling average with ~ time constant of 32ms. -+ * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 -+ * Make sure a call to update_clocks has been made before calling this to get -+ * an updated rq->niffies. -+ */ -+static void update_load_avg(struct rq *rq, unsigned int flags) -+{ -+ unsigned long us_interval, curload; -+ long load; -+ -+ if (unlikely(rq->niffies <= rq->load_update)) -+ return; -+ -+ us_interval = NS_TO_US(rq->niffies - rq->load_update); -+ curload = rq_load(rq); -+ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); -+ if (unlikely(load < 0)) -+ load = 0; -+ load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; -+ rq->load_avg = load; -+ -+ rq->load_update = rq->niffies; -+ if (likely(rq_local(rq))) -+ cpufreq_trigger(rq, flags); -+} -+ -+/* -+ * Removing from the runqueue. Enter with rq locked. Deleting a task -+ * from the skip list is done via the stored node reference in the task struct -+ * and does not require a full look up. 
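The constants in update_load_avg() above are worth checking numerically: decaying by us_interval * 5 / 262144 gives a 62.5% drop over one ~32 ms window, matching the "80/128 ~ 0.63" time-constant comment. Splitting the same interval into many updates decays less, because each subtraction is proportional to the already-reduced load:

/* Numeric check of the update_load_avg() decay constants. */
#include <stdio.h>

int main(void)
{
    long long load = 1000000;
    long long us_interval = 32768;      /* one ~32 ms window */

    load -= load * us_interval * 5 / 262144;
    printf("after one ~32ms window: %lld (62.5%% decayed)\n", load);

    long long load2 = 1000000;
    for (int i = 0; i < 32; i++)        /* 32 x ~1 ms windows */
        load2 -= load2 * 1024 * 5 / 262144;
    printf("after 32 x ~1ms windows: %lld\n", load2);
    return 0;
}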
Thus it occurs in O(k) time where k -+ * is the "level" of the list the task was stored at - usually < 4, max 8. -+ */ -+static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) -+{ -+ skiplist_delete(rq->sl, &p->node); -+ rq->best_key = rq->node.next[0]->key; -+ update_clocks(rq); -+ -+ if (!(flags & DEQUEUE_SAVE)) -+ sched_info_dequeued(task_rq(p), p); -+ update_load_avg(rq, flags); -+} -+ -+#ifdef CONFIG_PREEMPT_RCU -+static bool rcu_read_critical(struct task_struct *p) -+{ -+ return p->rcu_read_unlock_special.b.blocked; -+} -+#else /* CONFIG_PREEMPT_RCU */ -+#define rcu_read_critical(p) (false) -+#endif /* CONFIG_PREEMPT_RCU */ -+ -+/* -+ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as -+ * an idle task, we ensure none of the following conditions are met. -+ */ -+static bool idleprio_suitable(struct task_struct *p) -+{ -+ return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) && -+ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); -+} -+ -+/* -+ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check -+ * that the iso_refractory flag is not set. -+ */ -+static inline bool isoprio_suitable(struct rq *rq) -+{ -+ return !rq->iso_refractory; -+} -+ -+/* -+ * Adding to the runqueue. Enter with rq locked. -+ */ -+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) -+{ -+ unsigned int randseed, cflags = 0; -+ u64 sl_id; -+ -+ if (!rt_task(p)) { -+ /* Check it hasn't gotten rt from PI */ -+ if ((idleprio_task(p) && idleprio_suitable(p)) || -+ (iso_task(p) && isoprio_suitable(rq))) -+ p->prio = p->normal_prio; -+ else -+ p->prio = NORMAL_PRIO; -+ } -+ /* -+ * The sl_id key passed to the skiplist generates a sorted list. -+ * Realtime and sched iso tasks run FIFO so they only need be sorted -+ * according to priority. The skiplist will put tasks of the same -+ * key inserted later in FIFO order. Tasks of sched normal, batch -+ * and idleprio are sorted according to their deadlines. Idleprio -+ * tasks are offset by an impossibly large deadline value ensuring -+ * they get sorted into last positions, but still according to their -+ * own deadlines. This creates a "landscape" of skiplists running -+ * from priority 0 realtime in first place to the lowest priority -+ * idleprio tasks last. Skiplist insertion is an O(log n) process. -+ */ -+ if (p->prio <= ISO_PRIO) { -+ sl_id = p->prio; -+ cflags = SCHED_CPUFREQ_RT; -+ } else { -+ sl_id = p->deadline; -+ if (idleprio_task(p)) { -+ if (p->prio == IDLE_PRIO) -+ sl_id |= 0xF000000000000000; -+ else -+ sl_id += longest_deadline_diff(); -+ } -+ } -+ /* -+ * Some architectures don't have better than microsecond resolution -+ * so mask out ~microseconds as the random seed for skiplist insertion. -+ */ -+ update_clocks(rq); -+ if (!(flags & ENQUEUE_RESTORE)) -+ sched_info_queued(rq, p); -+ randseed = (rq->niffies >> 10) & 0xFFFFFFFF; -+ skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); -+ rq->best_key = rq->node.next[0]->key; -+ if (p->in_iowait) -+ cflags |= SCHED_CPUFREQ_IOWAIT; -+ update_load_avg(rq, cflags); -+} -+ -+/* -+ * Returns the relative length of deadline all compared to the shortest -+ * deadline which is that of nice -20. -+ */ -+static inline int task_prio_ratio(struct task_struct *p) -+{ -+ return prio_ratios[TASK_USER_PRIO(p)]; -+} -+ -+/* -+ * task_timeslice - all tasks of all priorities get the exact same timeslice -+ * length. 
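The "landscape" that enqueue_task() above builds comes entirely from how the skiplist key is composed: RT and ISO tasks sort by bare priority (FIFO within a level), normal tasks by deadline, and idleprio tasks by deadline pushed into an unreachable upper band. A stand-alone sketch of the key function; ISO_PRIO/IDLE_PRIO values are assumptions here (they are defined elsewhere in this patch, in MuQSS.h), and the not-yet-demoted idleprio case (sl_id += longest_deadline_diff()) is omitted for brevity:

/* Skiplist key composition, as in enqueue_task() above. */
#include <stdint.h>
#include <stdio.h>

#define MAX_RT_PRIO 100
#define ISO_PRIO    MAX_RT_PRIO         /* assumed */
#define IDLE_PRIO   (MAX_RT_PRIO + 2)   /* assumed */

static uint64_t sl_key(int prio, uint64_t deadline)
{
    if (prio <= ISO_PRIO)
        return prio;                    /* FIFO within priority      */
    if (prio == IDLE_PRIO)
        return deadline | 0xF000000000000000ULL;
    return deadline;                    /* earliest deadline first   */
}

int main(void)
{
    printf("RT prio 10:        key=%llu\n",
           (unsigned long long)sl_key(10, 123456));
    printf("normal, dl=5000:   key=%llu\n",
           (unsigned long long)sl_key(101, 5000));
    printf("idleprio, dl=5000: key=%#llx\n",
           (unsigned long long)sl_key(IDLE_PRIO, 5000));
    return 0;
}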
CPU distribution is handled by giving different deadlines to -+ * tasks of different priorities. Use 128 as the base value for fast shifts. -+ */ -+static inline int task_timeslice(struct task_struct *p) -+{ -+ return (rr_interval * task_prio_ratio(p) / 128); -+} -+ -+#ifdef CONFIG_SMP -+/* Entered with rq locked */ -+static inline void resched_if_idle(struct rq *rq) -+{ -+ if (rq_idle(rq)) -+ resched_task(rq->curr); -+} -+ -+static inline bool rq_local(struct rq *rq) -+{ -+ return (rq->cpu == smp_processor_id()); -+} -+#ifdef CONFIG_SMT_NICE -+static const cpumask_t *thread_cpumask(int cpu); -+ -+/* Find the best real time priority running on any SMT siblings of cpu and if -+ * none are running, the static priority of the best deadline task running. -+ * The lookups to the other runqueues is done lockless as the occasional wrong -+ * value would be harmless. */ -+static int best_smt_bias(struct rq *this_rq) -+{ -+ int other_cpu, best_bias = 0; -+ -+ for_each_cpu(other_cpu, &this_rq->thread_mask) { -+ struct rq *rq = cpu_rq(other_cpu); -+ -+ if (rq_idle(rq)) -+ continue; -+ if (unlikely(!rq->online)) -+ continue; -+ if (!rq->rq_mm) -+ continue; -+ if (likely(rq->rq_smt_bias > best_bias)) -+ best_bias = rq->rq_smt_bias; -+ } -+ return best_bias; -+} -+ -+static int task_prio_bias(struct task_struct *p) -+{ -+ if (rt_task(p)) -+ return 1 << 30; -+ else if (task_running_iso(p)) -+ return 1 << 29; -+ else if (task_running_idle(p)) -+ return 0; -+ return MAX_PRIO - p->static_prio; -+} -+ -+static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq) -+{ -+ return true; -+} -+ -+static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule; -+ -+/* We've already decided p can run on CPU, now test if it shouldn't for SMT -+ * nice reasons. */ -+static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) -+{ -+ int best_bias, task_bias; -+ -+ /* Kernel threads always run */ -+ if (unlikely(!p->mm)) -+ return true; -+ if (rt_task(p)) -+ return true; -+ if (!idleprio_suitable(p)) -+ return true; -+ best_bias = best_smt_bias(this_rq); -+ /* The smt siblings are all idle or running IDLEPRIO */ -+ if (best_bias < 1) -+ return true; -+ task_bias = task_prio_bias(p); -+ if (task_bias < 1) -+ return false; -+ if (task_bias >= best_bias) -+ return true; -+ /* Dither 25% cpu of normal tasks regardless of nice difference */ -+ if (best_bias % 4 == 1) -+ return true; -+ /* Sorry, you lose */ -+ return false; -+} -+#else /* CONFIG_SMT_NICE */ -+#define smt_schedule(p, this_rq) (true) -+#endif /* CONFIG_SMT_NICE */ -+ -+static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) -+{ -+ set_bit(cpu, (volatile unsigned long *)cpumask); -+} -+ -+/* -+ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to -+ * allow easy lookup of whether any suitable idle CPUs are available. -+ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the -+ * idle_cpus variable than to do a full bitmask check when we are busy. The -+ * bits are set atomically but read locklessly as occasional false positive / -+ * negative is harmless. 
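Reduced to its arithmetic, the SMT-nice decision above is: run on a sibling-busy core only if the task's bias competes with the best bias among siblings, with a 25% dither so plain nice differences only throttle, never starve. A stand-alone sketch; the MAX_PRIO/static_prio values used to form the bias are assumptions (standard kernel values, not shown in this hunk):

/* The smt_should_schedule() decision above, in isolation. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_PRIO 140    /* assumed: 100 RT levels + 40 nice levels */

static bool smt_should_schedule(int task_bias, int best_bias)
{
    if (best_bias < 1)          /* siblings idle or running idleprio */
        return true;
    if (task_bias < 1)
        return false;
    if (task_bias >= best_bias)
        return true;
    return best_bias % 4 == 1;  /* dither 25% of the time */
}

int main(void)
{
    int nice0  = MAX_PRIO - 120;    /* static_prio 120 -> bias 20 */
    int nice19 = MAX_PRIO - 139;    /* static_prio 139 -> bias 1  */

    printf("nice 19 vs sibling nice 0: %d\n",
           smt_should_schedule(nice19, nice0));
    printf("nice 0 vs sibling nice 19: %d\n",
           smt_should_schedule(nice0, nice19));
    return 0;
}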
-+ */ -+static inline void set_cpuidle_map(int cpu) -+{ -+ if (likely(cpu_online(cpu))) -+ atomic_set_cpu(cpu, &cpu_idle_map); -+} -+ -+static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) -+{ -+ clear_bit(cpu, (volatile unsigned long *)cpumask); -+} -+ -+static inline void clear_cpuidle_map(int cpu) -+{ -+ atomic_clear_cpu(cpu, &cpu_idle_map); -+} -+ -+static bool suitable_idle_cpus(struct task_struct *p) -+{ -+ return (cpumask_intersects(&p->cpus_allowed, &cpu_idle_map)); -+} -+ -+/* -+ * Resched current on rq. We don't know if rq is local to this CPU nor if it -+ * is locked so we do not use an intermediate variable for the task to avoid -+ * having it dereferenced. -+ */ -+static void resched_curr(struct rq *rq) -+{ -+ int cpu; -+ -+ if (test_tsk_need_resched(rq->curr)) -+ return; -+ -+ rq->preempt = rq->curr; -+ cpu = rq->cpu; -+ -+ /* We're doing this without holding the rq lock if it's not task_rq */ -+ -+ if (cpu == smp_processor_id()) { -+ set_tsk_need_resched(rq->curr); -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ if (set_nr_and_not_polling(rq->curr)) -+ smp_sched_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+#define CPUIDLE_DIFF_THREAD (1) -+#define CPUIDLE_DIFF_CORE (2) -+#define CPUIDLE_CACHE_BUSY (4) -+#define CPUIDLE_DIFF_CPU (8) -+#define CPUIDLE_THREAD_BUSY (16) -+#define CPUIDLE_DIFF_NODE (32) -+ -+/* -+ * The best idle CPU is chosen according to the CPUIDLE ranking above where the -+ * lowest value would give the most suitable CPU to schedule p onto next. The -+ * order works out to be the following: -+ * -+ * Same thread, idle or busy cache, idle or busy threads -+ * Other core, same cache, idle or busy cache, idle threads. -+ * Same node, other CPU, idle cache, idle threads. -+ * Same node, other CPU, busy cache, idle threads. -+ * Other core, same cache, busy threads. -+ * Same node, other CPU, busy threads. -+ * Other node, other CPU, idle cache, idle threads. -+ * Other node, other CPU, busy cache, idle threads. -+ * Other node, other CPU, busy threads. 
-+ */ -+static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) -+{ -+ int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | -+ CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | -+ CPUIDLE_DIFF_THREAD; -+ int cpu_tmp; -+ -+ if (cpumask_test_cpu(best_cpu, tmpmask)) -+ goto out; -+ -+ for_each_cpu(cpu_tmp, tmpmask) { -+ int ranking, locality; -+ struct rq *tmp_rq; -+ -+ ranking = 0; -+ tmp_rq = cpu_rq(cpu_tmp); -+ -+ locality = rq->cpu_locality[cpu_tmp]; -+#ifdef CONFIG_NUMA -+ if (locality > 3) -+ ranking |= CPUIDLE_DIFF_NODE; -+ else -+#endif -+ if (locality > 2) -+ ranking |= CPUIDLE_DIFF_CPU; -+#ifdef CONFIG_SCHED_MC -+ else if (locality == 2) -+ ranking |= CPUIDLE_DIFF_CORE; -+ else if (!(tmp_rq->cache_idle(tmp_rq))) -+ ranking |= CPUIDLE_CACHE_BUSY; -+#endif -+#ifdef CONFIG_SCHED_SMT -+ if (locality == 1) -+ ranking |= CPUIDLE_DIFF_THREAD; -+ if (!(tmp_rq->siblings_idle(tmp_rq))) -+ ranking |= CPUIDLE_THREAD_BUSY; -+#endif -+ if (ranking < best_ranking) { -+ best_cpu = cpu_tmp; -+ best_ranking = ranking; -+ } -+ } -+out: -+ return best_cpu; -+} -+ -+bool cpus_share_cache(int this_cpu, int that_cpu) -+{ -+ struct rq *this_rq = cpu_rq(this_cpu); -+ -+ return (this_rq->cpu_locality[that_cpu] < 3); -+} -+ -+/* As per resched_curr but only will resched idle task */ -+static inline void resched_idle(struct rq *rq) -+{ -+ if (test_tsk_need_resched(rq->idle)) -+ return; -+ -+ rq->preempt = rq->idle; -+ -+ set_tsk_need_resched(rq->idle); -+ -+ if (rq_local(rq)) { -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ smp_sched_reschedule(rq->cpu); -+} -+ -+static struct rq *resched_best_idle(struct task_struct *p, int cpu) -+{ -+ cpumask_t tmpmask; -+ struct rq *rq; -+ int best_cpu; -+ -+ cpumask_and(&tmpmask, &p->cpus_allowed, &cpu_idle_map); -+ best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask); -+ rq = cpu_rq(best_cpu); -+ if (!smt_schedule(p, rq)) -+ return NULL; -+ rq->preempt = p; -+ resched_idle(rq); -+ return rq; -+} -+ -+static inline void resched_suitable_idle(struct task_struct *p) -+{ -+ if (suitable_idle_cpus(p)) -+ resched_best_idle(p, task_cpu(p)); -+} -+ -+static inline struct rq *rq_order(struct rq *rq, int cpu) -+{ -+ return rq->rq_order[cpu]; -+} -+#else /* CONFIG_SMP */ -+static inline void set_cpuidle_map(int cpu) -+{ -+} -+ -+static inline void clear_cpuidle_map(int cpu) -+{ -+} -+ -+static inline bool suitable_idle_cpus(struct task_struct *p) -+{ -+ return uprq->curr == uprq->idle; -+} -+ -+static inline void resched_suitable_idle(struct task_struct *p) -+{ -+} -+ -+static inline void resched_curr(struct rq *rq) -+{ -+ resched_task(rq->curr); -+} -+ -+static inline void resched_if_idle(struct rq *rq) -+{ -+} -+ -+static inline bool rq_local(struct rq *rq) -+{ -+ return true; -+} -+ -+static inline struct rq *rq_order(struct rq *rq, int cpu) -+{ -+ return rq; -+} -+ -+static inline bool smt_schedule(struct task_struct *p, struct rq *rq) -+{ -+ return true; -+} -+#endif /* CONFIG_SMP */ -+ -+static inline int normal_prio(struct task_struct *p) -+{ -+ if (has_rt_policy(p)) -+ return MAX_RT_PRIO - 1 - p->rt_priority; -+ if (idleprio_task(p)) -+ return IDLE_PRIO; -+ if (iso_task(p)) -+ return ISO_PRIO; -+ return NORMAL_PRIO; -+} -+ -+/* -+ * Calculate the current priority, i.e. the priority -+ * taken into account by the scheduler. This value might -+ * be boosted by RT tasks as it will be RT if the task got -+ * RT-boosted. If not then it returns p->normal_prio. 
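The ranking in best_mask_cpu() above is just the bitwise OR of penalty flags, chosen so that ORing them can never reorder the tiers: a busy-cache core (4) still beats any other CPU on the node (8), which beats anything off-node (32). A table of the locality-only portion (cache/thread-busy bits omitted for brevity; the 0..4 locality encoding follows this patch's cpu_locality comments):

/* Locality-driven portion of the best_mask_cpu() ranking above. */
#include <stdio.h>

#define CPUIDLE_DIFF_THREAD (1)
#define CPUIDLE_DIFF_CORE   (2)
#define CPUIDLE_DIFF_CPU    (8)
#define CPUIDLE_DIFF_NODE   (32)

static int ranking(int locality)
{
    int r = 0;

    if (locality > 3)
        r |= CPUIDLE_DIFF_NODE;
    else if (locality > 2)
        r |= CPUIDLE_DIFF_CPU;
    else if (locality == 2)
        r |= CPUIDLE_DIFF_CORE;
    if (locality == 1)
        r |= CPUIDLE_DIFF_THREAD;
    return r;               /* lower score wins */
}

int main(void)
{
    const char *name[] = { "same thread", "SMT sibling", "same cache",
                           "same node", "other node" };

    for (int l = 0; l <= 4; l++)
        printf("%-12s -> ranking %d\n", name[l], ranking(l));
    return 0;
}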
-+ */ -+static int effective_prio(struct task_struct *p) -+{ -+ p->normal_prio = normal_prio(p); -+ /* -+ * If we are RT tasks or we were boosted to RT priority, -+ * keep the priority unchanged. Otherwise, update priority -+ * to the normal priority: -+ */ -+ if (!rt_prio(p->prio)) -+ return p->normal_prio; -+ return p->prio; -+} -+ -+/* -+ * activate_task - move a task to the runqueue. Enter with rq locked. -+ */ -+static void activate_task(struct task_struct *p, struct rq *rq) -+{ -+ resched_if_idle(rq); -+ -+ /* -+ * Sleep time is in units of nanosecs, so shift by 20 to get a -+ * milliseconds-range estimation of the amount of time that the task -+ * spent sleeping: -+ */ -+ if (unlikely(prof_on == SLEEP_PROFILING)) { -+ if (p->state == TASK_UNINTERRUPTIBLE) -+ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), -+ (rq->niffies - p->last_ran) >> 20); -+ } -+ -+ p->prio = effective_prio(p); -+ if (task_contributes_to_load(p)) -+ rq->nr_uninterruptible--; -+ -+ enqueue_task(rq, p, 0); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+} -+ -+/* -+ * deactivate_task - If it's running, it's not on the runqueue and we can just -+ * decrement the nr_running. Enter with rq locked. -+ */ -+static inline void deactivate_task(struct task_struct *p, struct rq *rq) -+{ -+ if (task_contributes_to_load(p)) -+ rq->nr_uninterruptible++; -+ -+ p->on_rq = 0; -+ sched_info_dequeued(rq, p); -+} -+ -+#ifdef CONFIG_SMP -+void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -+{ -+ struct rq *rq; -+ -+ if (task_cpu(p) == new_cpu) -+ return; -+ -+ /* Do NOT call set_task_cpu on a currently queued task as we will not -+ * be reliably holding the rq lock after changing CPU. */ -+ BUG_ON(task_queued(p)); -+ rq = task_rq(p); -+ -+#ifdef CONFIG_LOCKDEP -+ /* -+ * The caller should hold either p->pi_lock or rq->lock, when changing -+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. -+ * -+ * Furthermore, all task_rq users should acquire both locks, see -+ * task_rq_lock(). -+ */ -+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || -+ lockdep_is_held(&rq->lock))); -+#endif -+ -+ trace_sched_migrate_task(p, new_cpu); -+ perf_event_task_migrate(p); -+ -+ /* -+ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be -+ * successfully executed on another CPU. We must ensure that updates of -+ * per-task data have been completed by this moment. -+ */ -+ smp_wmb(); -+ -+ p->wake_cpu = new_cpu; -+ -+ if (task_running(rq, p)) { -+ /* -+ * We should only be calling this on a running task if we're -+ * holding rq lock. -+ */ -+ lockdep_assert_held(&rq->lock); -+ -+ /* -+ * We can't change the task_thread_info CPU on a running task -+ * as p will still be protected by the rq lock of the CPU it -+ * is still running on so we only set the wake_cpu for it to be -+ * lazily updated once off the CPU. -+ */ -+ return; -+ } -+ -+#ifdef CONFIG_THREAD_INFO_IN_TASK -+ p->cpu = new_cpu; -+#else -+ task_thread_info(p)->cpu = new_cpu; -+#endif -+ /* We're no longer protecting p after this point since we're holding -+ * the wrong runqueue lock. */ -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * Move a task off the runqueue and take it to a cpu for it will -+ * become the running task. 
-+ */ -+static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) -+{ -+ struct rq *p_rq = task_rq(p); -+ -+ dequeue_task(p_rq, p, DEQUEUE_SAVE); -+ if (p_rq != rq) { -+ sched_info_dequeued(p_rq, p); -+ sched_info_queued(rq, p); -+ } -+ set_task_cpu(p, cpu); -+} -+ -+/* -+ * Returns a descheduling task to the runqueue unless it is being -+ * deactivated. -+ */ -+static inline void return_task(struct task_struct *p, struct rq *rq, -+ int cpu, bool deactivate) -+{ -+ if (deactivate) -+ deactivate_task(p, rq); -+ else { -+#ifdef CONFIG_SMP -+ /* -+ * set_task_cpu was called on the running task that doesn't -+ * want to deactivate so it has to be enqueued to a different -+ * CPU and we need its lock. Tag it to be moved with as the -+ * lock is dropped in finish_lock_switch. -+ */ -+ if (unlikely(p->wake_cpu != cpu)) -+ p->on_rq = TASK_ON_RQ_MIGRATING; -+ else -+#endif -+ enqueue_task(rq, p, ENQUEUE_RESTORE); -+ } -+} -+ -+/* Enter with rq lock held. We know p is on the local cpu */ -+static inline void __set_tsk_resched(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ set_preempt_need_resched(); -+} -+ -+/** -+ * task_curr - is this task currently executing on a CPU? -+ * @p: the task in question. -+ * -+ * Return: 1 if the task is currently executing. 0 otherwise. -+ */ -+inline int task_curr(const struct task_struct *p) -+{ -+ return cpu_curr(task_cpu(p)) == p; -+} -+ -+#ifdef CONFIG_SMP -+/* -+ * wait_task_inactive - wait for a thread to unschedule. -+ * -+ * If @match_state is nonzero, it's the @p->state value just checked and -+ * not expected to change. If it changes, i.e. @p might have woken up, -+ * then return zero. When we succeed in waiting for @p to be off its CPU, -+ * we return a positive number (its total switch count). If a second call -+ * a short while later returns the same number, the caller can be sure that -+ * @p has remained unscheduled the whole time. -+ * -+ * The caller must ensure that the task *will* unschedule sometime soon, -+ * else this function might spin for a *long* time. This function can't -+ * be called with interrupts off, or it may introduce deadlock with -+ * smp_call_function() if an IPI is sent by the same process we are -+ * waiting to become inactive. -+ */ -+unsigned long wait_task_inactive(struct task_struct *p, long match_state) -+{ -+ int running, queued; -+ unsigned long flags; -+ unsigned long ncsw; -+ struct rq *rq; -+ -+ for (;;) { -+ rq = task_rq(p); -+ -+ /* -+ * If the task is actively running on another CPU -+ * still, just relax and busy-wait without holding -+ * any locks. -+ * -+ * NOTE! Since we don't hold any locks, it's not -+ * even sure that "rq" stays as the right runqueue! -+ * But we don't care, since this will return false -+ * if the runqueue has changed and p is actually now -+ * running somewhere else! -+ */ -+ while (task_running(rq, p)) { -+ if (match_state && unlikely(p->state != match_state)) -+ return 0; -+ cpu_relax(); -+ } -+ -+ /* -+ * Ok, time to look more closely! We need the rq -+ * lock now, to be *sure*. If we're wrong, we'll -+ * just go back and repeat. -+ */ -+ rq = task_rq_lock(p, &flags); -+ trace_sched_wait_task(p); -+ running = task_running(rq, p); -+ queued = task_on_rq_queued(p); -+ ncsw = 0; -+ if (!match_state || p->state == match_state) -+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ -+ task_rq_unlock(rq, p, &flags); -+ -+ /* -+ * If it changed from the expected state, bail out now. 
-+ */ -+ if (unlikely(!ncsw)) -+ break; -+ -+ /* -+ * Was it really running after all now that we -+ * checked with the proper locks actually held? -+ * -+ * Oops. Go back and try again.. -+ */ -+ if (unlikely(running)) { -+ cpu_relax(); -+ continue; -+ } -+ -+ /* -+ * It's not enough that it's not actively running, -+ * it must be off the runqueue _entirely_, and not -+ * preempted! -+ * -+ * So if it was still runnable (but just not actively -+ * running right now), it's preempted, and we should -+ * yield - it could be a while. -+ */ -+ if (unlikely(queued)) { -+ ktime_t to = NSEC_PER_SEC / HZ; -+ -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ schedule_hrtimeout(&to, HRTIMER_MODE_REL); -+ continue; -+ } -+ -+ /* -+ * Ahh, all good. It wasn't running, and it wasn't -+ * runnable, which means that it will never become -+ * running in the future either. We're all done! -+ */ -+ break; -+ } -+ -+ return ncsw; -+} -+ -+/*** -+ * kick_process - kick a running thread to enter/exit the kernel -+ * @p: the to-be-kicked thread -+ * -+ * Cause a process which is running on another CPU to enter -+ * kernel-mode, without any delay. (to get signals handled.) -+ * -+ * NOTE: this function doesn't have to take the runqueue lock, -+ * because all it wants to ensure is that the remote task enters -+ * the kernel. If the IPI races and the task has been migrated -+ * to another CPU then no harm is done and the purpose has been -+ * achieved as well. -+ */ -+void kick_process(struct task_struct *p) -+{ -+ int cpu; -+ -+ preempt_disable(); -+ cpu = task_cpu(p); -+ if ((cpu != smp_processor_id()) && task_curr(p)) -+ smp_sched_reschedule(cpu); -+ preempt_enable(); -+} -+EXPORT_SYMBOL_GPL(kick_process); -+#endif -+ -+/* -+ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the -+ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or -+ * between themselves, they cooperatively multitask. An idle rq scores as -+ * prio PRIO_LIMIT so it is always preempted. -+ */ -+static inline bool -+can_preempt(struct task_struct *p, int prio, u64 deadline) -+{ -+ /* Better static priority RT task or better policy preemption */ -+ if (p->prio < prio) -+ return true; -+ if (p->prio > prio) -+ return false; -+ if (p->policy == SCHED_BATCH) -+ return false; -+ /* SCHED_NORMAL and ISO will preempt based on deadline */ -+ if (!deadline_before(p->deadline, deadline)) -+ return false; -+ return true; -+} -+ -+#ifdef CONFIG_SMP -+/* -+ * Check to see if p can run on cpu, and if not, whether there are any online -+ * CPUs it can run on instead. 
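can_preempt() above encodes the whole preemption policy in four comparisons: priority decides first, SCHED_BATCH is gated out at equal priority, and equal-priority normal/ISO tasks fall through to an earlier-deadline test. A stand-alone sketch; the policy constants are an assumed subset, not the kernel's numeric values:

/* can_preempt() above, in isolation. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum { SCHED_NORMAL, SCHED_BATCH };     /* assumed subset */

static bool can_preempt(int p_prio, int p_policy, uint64_t p_deadline,
                        int rq_prio, uint64_t rq_deadline)
{
    if (p_prio < rq_prio)
        return true;
    if (p_prio > rq_prio)
        return false;
    if (p_policy == SCHED_BATCH)
        return false;
    return p_deadline < rq_deadline;    /* deadline_before() */
}

int main(void)
{
    /* same prio: earlier deadline wins */
    printf("%d\n", can_preempt(101, SCHED_NORMAL, 100, 101, 200)); /* 1 */
    printf("%d\n", can_preempt(101, SCHED_NORMAL, 300, 101, 200)); /* 0 */
    /* batch never preempts at equal prio */
    printf("%d\n", can_preempt(101, SCHED_BATCH, 100, 101, 200));  /* 0 */
    return 0;
}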
-+ */ -+static inline bool needs_other_cpu(struct task_struct *p, int cpu) -+{ -+ if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed))) -+ return true; -+ return false; -+} -+#define cpu_online_map (*(cpumask_t *)cpu_online_mask) -+ -+static void try_preempt(struct task_struct *p, struct rq *this_rq) -+{ -+ int i, this_entries = rq_load(this_rq); -+ cpumask_t tmp; -+ -+ if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p))) -+ return; -+ -+ /* IDLEPRIO tasks never preempt anything but idle */ -+ if (p->policy == SCHED_IDLEPRIO) -+ return; -+ -+ cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed); -+ -+ for (i = 0; i < num_possible_cpus(); i++) { -+ struct rq *rq = this_rq->rq_order[i]; -+ -+ if (!cpumask_test_cpu(rq->cpu, &tmp)) -+ continue; -+ -+ if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries) -+ continue; -+ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { -+ /* We set rq->preempting lockless, it's a hint only */ -+ rq->preempting = p; -+ resched_curr(rq); -+ return; -+ } -+ } -+} -+ -+static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check); -+#else /* CONFIG_SMP */ -+static inline bool needs_other_cpu(struct task_struct *p, int cpu) -+{ -+ return false; -+} -+ -+static void try_preempt(struct task_struct *p, struct rq *this_rq) -+{ -+ if (p->policy == SCHED_IDLEPRIO) -+ return; -+ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) -+ resched_curr(uprq); -+} -+ -+static inline int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ return set_cpus_allowed_ptr(p, new_mask); -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * wake flags -+ */ -+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -+#define WF_FORK 0x02 /* child wakeup after fork */ -+#define WF_MIGRATED 0x04 /* internal use, task got migrated */ -+ -+static void -+ttwu_stat(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq; -+ -+ if (!schedstat_enabled()) -+ return; -+ -+ rq = this_rq(); -+ -+#ifdef CONFIG_SMP -+ if (cpu == rq->cpu) -+ schedstat_inc(rq->ttwu_local); -+ else { -+ struct sched_domain *sd; -+ -+ rcu_read_lock(); -+ for_each_domain(rq->cpu, sd) { -+ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { -+ schedstat_inc(sd->ttwu_wake_remote); -+ break; -+ } -+ } -+ rcu_read_unlock(); -+ } -+ -+#endif /* CONFIG_SMP */ -+ -+ schedstat_inc(rq->ttwu_count); -+} -+ -+static inline void ttwu_activate(struct rq *rq, struct task_struct *p) -+{ -+ activate_task(p, rq); -+ -+ /* if a worker is waking up, notify the workqueue */ -+ if (p->flags & PF_WQ_WORKER) -+ wq_worker_waking_up(p, cpu_of(rq)); -+} -+ -+/* -+ * Mark the task runnable and perform wakeup-preemption. -+ */ -+static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ /* -+ * Sync wakeups (i.e. those types of wakeups where the waker -+ * has indicated that it will leave the CPU in short order) -+ * don't trigger a preemption if there are no idle cpus, -+ * instead waiting for current to deschedule. 
-+ */ -+ if (wake_flags & WF_SYNC) -+ resched_suitable_idle(p); -+ else -+ try_preempt(p, rq); -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+} -+ -+static void -+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+#ifdef CONFIG_SMP -+ if (p->sched_contributes_to_load) -+ rq->nr_uninterruptible--; -+#endif -+ -+ ttwu_activate(rq, p); -+ ttwu_do_wakeup(rq, p, wake_flags); -+} -+ -+/* -+ * Called in case the task @p isn't fully descheduled from its runqueue, -+ * in this case we must do a remote wakeup. Its a 'light' wakeup though, -+ * since all we need to do is flip p->state to TASK_RUNNING, since -+ * the task is still ->on_rq. -+ */ -+static int ttwu_remote(struct task_struct *p, int wake_flags) -+{ -+ struct rq *rq; -+ int ret = 0; -+ -+ rq = __task_rq_lock(p); -+ if (likely(task_on_rq_queued(p))) { -+ ttwu_do_wakeup(rq, p, wake_flags); -+ ret = 1; -+ } -+ __task_rq_unlock(rq); -+ -+ return ret; -+} -+ -+#ifdef CONFIG_SMP -+void sched_ttwu_pending(void) -+{ -+ struct rq *rq = this_rq(); -+ struct llist_node *llist = llist_del_all(&rq->wake_list); -+ struct task_struct *p, *t; -+ unsigned long flags; -+ -+ if (!llist) -+ return; -+ -+ rq_lock_irqsave(rq, &flags); -+ -+ llist_for_each_entry_safe(p, t, llist, wake_entry) -+ ttwu_do_activate(rq, p, 0); -+ -+ rq_unlock_irqrestore(rq, &flags); -+} -+ -+void scheduler_ipi(void) -+{ -+ /* -+ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting -+ * TIF_NEED_RESCHED remotely (for the first time) will also send -+ * this IPI. -+ */ -+ preempt_fold_need_resched(); -+ -+ if (llist_empty(&this_rq()->wake_list) && (!idle_cpu(smp_processor_id()) || need_resched())) -+ return; -+ -+ /* -+ * Not all reschedule IPI handlers call irq_enter/irq_exit, since -+ * traditionally all their work was done from the interrupt return -+ * path. Now that we actually do some work, we need to make sure -+ * we do call them. -+ * -+ * Some archs already do call them, luckily irq_enter/exit nest -+ * properly. -+ * -+ * Arguably we should visit all archs and update all handlers, -+ * however a fair share of IPIs are still resched only so this would -+ * somewhat pessimize the simple resched case. 
-+ */ -+ irq_enter(); -+ sched_ttwu_pending(); -+ irq_exit(); -+} -+ -+static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { -+ if (!set_nr_if_polling(rq->idle)) -+ smp_sched_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+ } -+} -+ -+void wake_up_if_idle(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ rcu_read_lock(); -+ -+ if (!is_idle_task(rcu_dereference(rq->curr))) -+ goto out; -+ -+ if (set_nr_if_polling(rq->idle)) { -+ trace_sched_wake_idle_without_ipi(cpu); -+ } else { -+ rq_lock_irqsave(rq, &flags); -+ if (likely(is_idle_task(rq->curr))) -+ smp_sched_reschedule(cpu); -+ /* Else cpu is not in idle, do nothing here */ -+ rq_unlock_irqrestore(rq, &flags); -+ } -+ -+out: -+ rcu_read_unlock(); -+} -+ -+static int valid_task_cpu(struct task_struct *p) -+{ -+ cpumask_t valid_mask; -+ -+ if (p->flags & PF_KTHREAD) -+ cpumask_and(&valid_mask, &p->cpus_allowed, cpu_online_mask); -+ else -+ cpumask_and(&valid_mask, &p->cpus_allowed, cpu_active_mask); -+ -+ if (unlikely(!cpumask_weight(&valid_mask))) { -+ /* Hotplug boot threads do this before the CPU is up */ -+ printk(KERN_INFO "SCHED: No cpumask for %s/%d weight %d\n", p->comm, p->pid, cpumask_weight(&p->cpus_allowed)); -+ return cpumask_any(&p->cpus_allowed); -+ } -+ return cpumask_any(&valid_mask); -+} -+ -+/* -+ * For a task that's just being woken up we have a valuable balancing -+ * opportunity so choose the nearest cache most lightly loaded runqueue. -+ * Entered with rq locked and returns with the chosen runqueue locked. -+ */ -+static inline int select_best_cpu(struct task_struct *p) -+{ -+ unsigned int idlest = ~0U; -+ struct rq *rq = NULL; -+ int i; -+ -+ if (suitable_idle_cpus(p)) { -+ int cpu = task_cpu(p); -+ -+ if (unlikely(needs_other_cpu(p, cpu))) -+ cpu = valid_task_cpu(p); -+ rq = resched_best_idle(p, cpu); -+ if (likely(rq)) -+ return rq->cpu; -+ } -+ -+ for (i = 0; i < num_possible_cpus(); i++) { -+ struct rq *other_rq = task_rq(p)->rq_order[i]; -+ int entries; -+ -+ if (!other_rq->online) -+ continue; -+ if (needs_other_cpu(p, other_rq->cpu)) -+ continue; -+ entries = rq_load(other_rq); -+ if (entries >= idlest) -+ continue; -+ idlest = entries; -+ rq = other_rq; -+ } -+ if (unlikely(!rq)) -+ return task_cpu(p); -+ return rq->cpu; -+} -+#else /* CONFIG_SMP */ -+static int valid_task_cpu(struct task_struct *p) -+{ -+ return 0; -+} -+ -+static inline int select_best_cpu(struct task_struct *p) -+{ -+ return 0; -+} -+ -+static struct rq *resched_best_idle(struct task_struct *p, int cpu) -+{ -+ return NULL; -+} -+#endif /* CONFIG_SMP */ -+ -+static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+#if defined(CONFIG_SMP) -+ if (!cpus_share_cache(smp_processor_id(), cpu)) { -+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ -+ ttwu_queue_remote(p, cpu, wake_flags); -+ return; -+ } -+#endif -+ rq_lock(rq); -+ ttwu_do_activate(rq, p, wake_flags); -+ rq_unlock(rq); -+} -+ -+/*** -+ * try_to_wake_up - wake up a thread -+ * @p: the thread to be awakened -+ * @state: the mask of task states that can be woken -+ * @wake_flags: wake modifier flags (WF_*) -+ * -+ * Put it on the run-queue if it's not already there. 
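When no suitable idle CPU exists, select_best_cpu() above falls back to a linear scan: walk the runqueues in locality order and keep the least loaded one the task is allowed on, skipping anything whose load already matches or exceeds the best seen. The same scan over plain arrays (rq_order's locality sort is replaced by index order here):

/* Least-loaded fallback scan of select_best_cpu() above. */
#include <stdio.h>

#define NCPUS 4

int main(void)
{
    int load[NCPUS]    = { 3, 1, 2, 1 };    /* rq_load() per CPU  */
    int allowed[NCPUS] = { 1, 0, 1, 1 };    /* cpus_allowed mask  */
    unsigned int idlest = ~0U;
    int best = -1;

    for (int i = 0; i < NCPUS; i++) {
        if (!allowed[i] || (unsigned int)load[i] >= idlest)
            continue;                       /* mirrors entries >= idlest */
        idlest = load[i];
        best = i;
    }
    /* CPU 1 is idler but not allowed; the later tie on load 1 is
     * ignored because only strictly lower load replaces the best. */
    printf("chose cpu %d (load %u)\n", best, idlest);   /* cpu 3 */
    return 0;
}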
The "current" -+ * thread is always on the run-queue (except when the actual -+ * re-schedule is in progress), and as such you're allowed to do -+ * the simpler "current->state = TASK_RUNNING" to mark yourself -+ * runnable without the overhead of this. -+ * -+ * Return: %true if @p was woken up, %false if it was already running. -+ * or @state didn't match @p's state. -+ */ -+static int -+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) -+{ -+ unsigned long flags; -+ int cpu, success = 0; -+ -+ /* -+ * If we are going to wake up a thread waiting for CONDITION we -+ * need to ensure that CONDITION=1 done by the caller can not be -+ * reordered with p->state check below. This pairs with mb() in -+ * set_current_state() the waiting thread does. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ smp_mb__after_spinlock(); -+ /* state is a volatile long, どうして、分からない */ -+ if (!((unsigned int)p->state & state)) -+ goto out; -+ -+ trace_sched_waking(p); -+ -+ /* We're going to change ->state: */ -+ success = 1; -+ cpu = task_cpu(p); -+ -+ /* -+ * Ensure we load p->on_rq _after_ p->state, otherwise it would -+ * be possible to, falsely, observe p->on_rq == 0 and get stuck -+ * in smp_cond_load_acquire() below. -+ * -+ * sched_ttwu_pending() try_to_wake_up() -+ * [S] p->on_rq = 1; [L] P->state -+ * UNLOCK rq->lock -----. -+ * \ -+ * +--- RMB -+ * schedule() / -+ * LOCK rq->lock -----' -+ * UNLOCK rq->lock -+ * -+ * [task p] -+ * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq -+ * -+ * Pairs with the UNLOCK+LOCK on rq->lock from the -+ * last wakeup of our task and the schedule that got our task -+ * current. -+ */ -+ smp_rmb(); -+ if (p->on_rq && ttwu_remote(p, wake_flags)) -+ goto stat; -+ -+#ifdef CONFIG_SMP -+ /* -+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be -+ * possible to, falsely, observe p->on_cpu == 0. -+ * -+ * One must be running (->on_cpu == 1) in order to remove oneself -+ * from the runqueue. -+ * -+ * [S] ->on_cpu = 1; [L] ->on_rq -+ * UNLOCK rq->lock -+ * RMB -+ * LOCK rq->lock -+ * [S] ->on_rq = 0; [L] ->on_cpu -+ * -+ * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock -+ * from the consecutive calls to schedule(); the first switching to our -+ * task, the second putting it to sleep. -+ */ -+ smp_rmb(); -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, wait until its done referencing the task. -+ * -+ * Pairs with the smp_store_release() in finish_lock_switch(). -+ * -+ * This ensures that tasks getting woken will be fully ordered against -+ * their previous state and preserve Program Order. -+ */ -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ p->sched_contributes_to_load = !!task_contributes_to_load(p); -+ p->state = TASK_WAKING; -+ -+ if (p->in_iowait) { -+ delayacct_blkio_end(); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+ -+ cpu = select_best_cpu(p); -+ if (task_cpu(p) != cpu) -+ set_task_cpu(p, cpu); -+ -+#else /* CONFIG_SMP */ -+ -+ if (p->in_iowait) { -+ delayacct_blkio_end(); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+ -+#endif /* CONFIG_SMP */ -+ -+ ttwu_queue(p, cpu, wake_flags); -+stat: -+ ttwu_stat(p, cpu, wake_flags); -+out: -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ return success; -+} -+ -+/** -+ * try_to_wake_up_local - try to wake up a local task with rq lock held -+ * @p: the thread to be awakened -+ * -+ * Put @p on the run-queue if it's not already there. 
The caller must -+ * ensure that rq is locked and, @p is not the current task. -+ * rq stays locked over invocation. -+ */ -+static void try_to_wake_up_local(struct task_struct *p) -+{ -+ struct rq *rq = task_rq(p); -+ -+ if (WARN_ON_ONCE(rq != this_rq()) || -+ WARN_ON_ONCE(p == current)) -+ return; -+ -+ lockdep_assert_held(&rq->lock); -+ -+ if (!raw_spin_trylock(&p->pi_lock)) { -+ /* -+ * This is OK, because current is on_cpu, which avoids it being -+ * picked for load-balance and preemption/IRQs are still -+ * disabled avoiding further scheduler activity on it and we've -+ * not yet picked a replacement task. -+ */ -+ rq_unlock(rq); -+ raw_spin_lock(&p->pi_lock); -+ rq_lock(rq); -+ } -+ -+ if (!(p->state & TASK_NORMAL)) -+ goto out; -+ -+ trace_sched_waking(p); -+ -+ if (!task_on_rq_queued(p)) { -+ if (p->in_iowait) { -+ delayacct_blkio_end(); -+ atomic_dec(&rq->nr_iowait); -+ } -+ ttwu_activate(rq, p); -+ } -+ -+ ttwu_do_wakeup(rq, p, 0); -+ ttwu_stat(p, smp_processor_id(), 0); -+out: -+ raw_spin_unlock(&p->pi_lock); -+} -+ -+/** -+ * wake_up_process - Wake up a specific process -+ * @p: The process to be woken up. -+ * -+ * Attempt to wake up the nominated process and move it to the set of runnable -+ * processes. -+ * -+ * Return: 1 if the process was woken up, 0 if it was already running. -+ * -+ * It may be assumed that this function implies a write memory barrier before -+ * changing the task state if and only if any tasks are woken up. -+ */ -+int wake_up_process(struct task_struct *p) -+{ -+ return try_to_wake_up(p, TASK_NORMAL, 0); -+} -+EXPORT_SYMBOL(wake_up_process); -+ -+int wake_up_state(struct task_struct *p, unsigned int state) -+{ -+ return try_to_wake_up(p, state, 0); -+} -+ -+static void time_slice_expired(struct task_struct *p, struct rq *rq); -+ -+/* -+ * Perform scheduler related setup for a newly forked process p. -+ * p is forked by current. -+ */ -+int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) -+{ -+ unsigned long flags; -+ int cpu = get_cpu(); -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ INIT_HLIST_HEAD(&p->preempt_notifiers); -+#endif -+ /* -+ * We mark the process as NEW here. This guarantees that -+ * nobody will actually run it, and a signal or other external -+ * event cannot wake it up and insert it on the runqueue either. -+ */ -+ p->state = TASK_NEW; -+ -+ /* -+ * The process state is set to the same value of the process executing -+ * do_fork() code. That is running. This guarantees that nobody will -+ * actually run it, and a signal or other external event cannot wake -+ * it up and insert it on the runqueue either. -+ */ -+ -+ /* Should be reset in fork.c but done here for ease of MuQSS patching */ -+ p->on_cpu = -+ p->on_rq = -+ p->utime = -+ p->stime = -+ p->sched_time = -+ p->stime_ns = -+ p->utime_ns = 0; -+ skiplist_node_init(&p->node); -+ -+ /* -+ * Revert to default priority/policy on fork if requested. -+ */ -+ if (unlikely(p->sched_reset_on_fork)) { -+ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { -+ p->policy = SCHED_NORMAL; -+ p->normal_prio = normal_prio(p); -+ } -+ -+ if (PRIO_TO_NICE(p->static_prio) < 0) { -+ p->static_prio = NICE_TO_PRIO(0); -+ p->normal_prio = p->static_prio; -+ } -+ -+ /* -+ * We don't need the reset flag anymore after the fork. It has -+ * fulfilled its duty: -+ */ -+ p->sched_reset_on_fork = 0; -+ } -+ -+ /* -+ * Silence PROVE_RCU. 
-+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ set_task_cpu(p, cpu); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+#ifdef CONFIG_SCHED_INFO -+ if (unlikely(sched_info_on())) -+ memset(&p->sched_info, 0, sizeof(p->sched_info)); -+#endif -+ init_task_preempt_count(p); -+ -+ put_cpu(); -+ return 0; -+} -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+DEFINE_STATIC_KEY_FALSE(sched_schedstats); -+static bool __initdata __sched_schedstats = false; -+ -+static void set_schedstats(bool enabled) -+{ -+ if (enabled) -+ static_branch_enable(&sched_schedstats); -+ else -+ static_branch_disable(&sched_schedstats); -+} -+ -+void force_schedstat_enabled(void) -+{ -+ if (!schedstat_enabled()) { -+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); -+ static_branch_enable(&sched_schedstats); -+ } -+} -+ -+static int __init setup_schedstats(char *str) -+{ -+ int ret = 0; -+ if (!str) -+ goto out; -+ -+ /* -+ * This code is called before jump labels have been set up, so we can't -+ * change the static branch directly just yet. Instead set a temporary -+ * variable so init_schedstats() can do it later. -+ */ -+ if (!strcmp(str, "enable")) { -+ __sched_schedstats = true; -+ ret = 1; -+ } else if (!strcmp(str, "disable")) { -+ __sched_schedstats = false; -+ ret = 1; -+ } -+out: -+ if (!ret) -+ pr_warn("Unable to parse schedstats=\n"); -+ -+ return ret; -+} -+__setup("schedstats=", setup_schedstats); -+ -+static void __init init_schedstats(void) -+{ -+ set_schedstats(__sched_schedstats); -+} -+ -+#ifdef CONFIG_PROC_SYSCTL -+int sysctl_schedstats(struct ctl_table *table, int write, -+ void __user *buffer, size_t *lenp, loff_t *ppos) -+{ -+ struct ctl_table t; -+ int err; -+ int state = static_branch_likely(&sched_schedstats); -+ -+ if (write && !capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ t = *table; -+ t.data = &state; -+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); -+ if (err < 0) -+ return err; -+ if (write) -+ set_schedstats(state); -+ return err; -+} -+#endif /* CONFIG_PROC_SYSCTL */ -+#else /* !CONFIG_SCHEDSTATS */ -+static inline void init_schedstats(void) {} -+#endif /* CONFIG_SCHEDSTATS */ -+ -+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p); -+ -+static void account_task_cpu(struct rq *rq, struct task_struct *p) -+{ -+ update_clocks(rq); -+ /* This isn't really a context switch but accounting is the same */ -+ update_cpu_clock_switch(rq, p); -+ p->last_ran = rq->niffies; -+} -+ -+bool sched_smp_initialized __read_mostly; -+ -+static inline int hrexpiry_enabled(struct rq *rq) -+{ -+ if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized)) -+ return 0; -+ return hrtimer_is_hres_active(&rq->hrexpiry_timer); -+} -+ -+/* -+ * Use HR-timers to deliver accurate preemption points. -+ */ -+static inline void hrexpiry_clear(struct rq *rq) -+{ -+ if (!hrexpiry_enabled(rq)) -+ return; -+ if (hrtimer_active(&rq->hrexpiry_timer)) -+ hrtimer_cancel(&rq->hrexpiry_timer); -+} -+ -+/* -+ * High-resolution time_slice expiry. -+ * Runs from hardirq context with interrupts disabled. -+ */ -+static enum hrtimer_restart hrexpiry(struct hrtimer *timer) -+{ -+ struct rq *rq = container_of(timer, struct rq, hrexpiry_timer); -+ struct task_struct *p; -+ -+ /* This can happen during CPU hotplug / resume */ -+ if (unlikely(cpu_of(rq) != smp_processor_id())) -+ goto out; -+ -+ /* -+ * We're doing this without the runqueue lock but this should always -+ * be run on the local CPU. 
Time slice should run out in __schedule -+ * but we set it to zero here in case niffies is slightly less. -+ */ -+ p = rq->curr; -+ p->time_slice = 0; -+ __set_tsk_resched(p); -+out: -+ return HRTIMER_NORESTART; -+} -+ -+/* -+ * Called to set the hrexpiry timer state. -+ * -+ * called with irqs disabled from the local CPU only -+ */ -+static void hrexpiry_start(struct rq *rq, u64 delay) -+{ -+ if (!hrexpiry_enabled(rq)) -+ return; -+ -+ hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay), -+ HRTIMER_MODE_REL_PINNED); -+} -+ -+static void init_rq_hrexpiry(struct rq *rq) -+{ -+ hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); -+ rq->hrexpiry_timer.function = hrexpiry; -+} -+ -+static inline int rq_dither(struct rq *rq) -+{ -+ if (!hrexpiry_enabled(rq)) -+ return HALF_JIFFY_US; -+ return 0; -+} -+ -+/* -+ * wake_up_new_task - wake up a newly created task for the first time. -+ * -+ * This function will do some initial scheduler statistics housekeeping -+ * that must be done for every newly created context, then puts the task -+ * on the runqueue and wakes it. -+ */ -+void wake_up_new_task(struct task_struct *p) -+{ -+ struct task_struct *parent, *rq_curr; -+ struct rq *rq, *new_rq; -+ unsigned long flags; -+ -+ parent = p->parent; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ p->state = TASK_RUNNING; -+ /* Task_rq can't change yet on a new task */ -+ new_rq = rq = task_rq(p); -+ if (unlikely(needs_other_cpu(p, task_cpu(p)))) { -+ set_task_cpu(p, valid_task_cpu(p)); -+ new_rq = task_rq(p); -+ } -+ -+ double_rq_lock(rq, new_rq); -+ rq_curr = rq->curr; -+ -+ /* -+ * Make sure we do not leak PI boosting priority to the child. -+ */ -+ p->prio = rq_curr->normal_prio; -+ -+ trace_sched_wakeup_new(p); -+ -+ /* -+ * Share the timeslice between parent and child, thus the -+ * total amount of pending timeslices in the system doesn't change, -+ * resulting in more scheduling fairness. If it's negative, it won't -+ * matter since that's the same as being 0. rq->rq_deadline is only -+ * modified within schedule() so it is always equal to -+ * current->deadline. -+ */ -+ account_task_cpu(rq, rq_curr); -+ p->last_ran = rq_curr->last_ran; -+ if (likely(rq_curr->policy != SCHED_FIFO)) { -+ rq_curr->time_slice /= 2; -+ if (rq_curr->time_slice < RESCHED_US) { -+ /* -+ * Forking task has run out of timeslice. Reschedule it and -+ * start its child with a new time slice and deadline. The -+ * child will end up running first because its deadline will -+ * be slightly earlier. -+ */ -+ __set_tsk_resched(rq_curr); -+ time_slice_expired(p, new_rq); -+ if (suitable_idle_cpus(p)) -+ resched_best_idle(p, task_cpu(p)); -+ else if (unlikely(rq != new_rq)) -+ try_preempt(p, new_rq); -+ } else { -+ p->time_slice = rq_curr->time_slice; -+ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { -+ /* -+ * The VM isn't cloned, so we're in a good position to -+ * do child-runs-first in anticipation of an exec. This -+ * usually avoids a lot of COW overhead. -+ */ -+ __set_tsk_resched(rq_curr); -+ } else { -+ /* -+ * Adjust the hrexpiry since rq_curr will keep -+ * running and its timeslice has been shortened. 
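The timeslice split in wake_up_new_task() above is plain integer halving: after `rq_curr->time_slice /= 2` both parent and child hold half of what the parent had, so the system-wide total of pending timeslices is unchanged, and a remainder below RESCHED_US forces the parent to reschedule while the child starts fresh. A small userspace sketch of the arithmetic; RESCHED_US is taken as 100 microseconds here, an assumed value for the demo:

    #include <stdio.h>

    #define RESCHED_US 100      /* assumed for the demo */

    int main(void)
    {
        int parent_us = 5000;   /* parent has 5 ms of slice left at fork */

        parent_us /= 2;         /* rq_curr->time_slice /= 2 */
        if (parent_us < RESCHED_US)
            printf("parent rescheduled; child starts a fresh slice\n");
        else
            printf("parent keeps %d us, child inherits %d us\n",
                   parent_us, parent_us);
        return 0;
    }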
-+ */ -+ hrexpiry_start(rq, US_TO_NS(rq_curr->time_slice)); -+ try_preempt(p, new_rq); -+ } -+ } -+ } else { -+ time_slice_expired(p, new_rq); -+ try_preempt(p, new_rq); -+ } -+ activate_task(p, new_rq); -+ double_rq_unlock(rq, new_rq); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ -+static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; -+ -+void preempt_notifier_inc(void) -+{ -+ static_key_slow_inc(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_inc); -+ -+void preempt_notifier_dec(void) -+{ -+ static_key_slow_dec(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_dec); -+ -+/** -+ * preempt_notifier_register - tell me when current is being preempted & rescheduled -+ * @notifier: notifier struct to register -+ */ -+void preempt_notifier_register(struct preempt_notifier *notifier) -+{ -+ if (!static_key_false(&preempt_notifier_key)) -+ WARN(1, "registering preempt_notifier while notifiers disabled\n"); -+ -+ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_register); -+ -+/** -+ * preempt_notifier_unregister - no longer interested in preemption notifications -+ * @notifier: notifier struct to unregister -+ * -+ * This is *not* safe to call from within a preemption notifier. -+ */ -+void preempt_notifier_unregister(struct preempt_notifier *notifier) -+{ -+ hlist_del(¬ifier->link); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_unregister); -+ -+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_in(notifier, raw_smp_processor_id()); -+} -+ -+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ if (static_key_false(&preempt_notifier_key)) -+ __fire_sched_in_preempt_notifiers(curr); -+} -+ -+static void -+__fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_out(notifier, next); -+} -+ -+static __always_inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ if (static_key_false(&preempt_notifier_key)) -+ __fire_sched_out_preempt_notifiers(curr, next); -+} -+ -+#else /* !CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+} -+ -+static inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+} -+ -+#endif /* CONFIG_PREEMPT_NOTIFIERS */ -+ -+/** -+ * prepare_task_switch - prepare to switch tasks -+ * @rq: the runqueue preparing to switch -+ * @next: the task we are going to switch to. -+ * -+ * This is called with the rq lock held and interrupts off. It must -+ * be paired with a subsequent finish_task_switch after the context -+ * switch. -+ * -+ * prepare_task_switch sets up locking and calls architecture specific -+ * hooks. 
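The preempt-notifier API above is consumed by registering an ops table on behalf of the current task (mainline KVM is the classic user). A minimal kernel-side sketch with illustrative `my_*` names that are not part of this patch:

    static void my_sched_in(struct preempt_notifier *pn, int cpu)
    {
        /* current has just been (re)scheduled on @cpu */
    }

    static void my_sched_out(struct preempt_notifier *pn,
                             struct task_struct *next)
    {
        /* current is being descheduled in favour of @next */
    }

    static struct preempt_notifier_ops my_ops = {
        .sched_in  = my_sched_in,
        .sched_out = my_sched_out,
    };

    static struct preempt_notifier my_pn;

    static void my_attach(void)            /* run as the task of interest */
    {
        preempt_notifier_inc();            /* enable the static key first */
        preempt_notifier_init(&my_pn, &my_ops);
        preempt_notifier_register(&my_pn);
    }

    static void my_detach(void)
    {
        preempt_notifier_unregister(&my_pn); /* never from a callback */
        preempt_notifier_dec();
    }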
-+ */ -+static inline void -+prepare_task_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ sched_info_switch(rq, prev, next); -+ perf_event_task_sched_out(prev, next); -+ fire_sched_out_preempt_notifiers(prev, next); -+ prepare_lock_switch(rq, next); -+ prepare_arch_switch(next); -+} -+ -+/** -+ * finish_task_switch - clean up after a task-switch -+ * @rq: runqueue associated with task-switch -+ * @prev: the thread we just switched away from. -+ * -+ * finish_task_switch must be called after the context switch, paired -+ * with a prepare_task_switch call before the context switch. -+ * finish_task_switch will reconcile locking set up by prepare_task_switch, -+ * and do any other architecture-specific cleanup actions. -+ * -+ * Note that we may have delayed dropping an mm in context_switch(). If -+ * so, we finish that here outside of the runqueue lock. (Doing it -+ * with the lock held can cause deadlocks; see schedule() for -+ * details.) -+ * -+ * The context switch have flipped the stack from under us and restored the -+ * local variables which were saved when this task called schedule() in the -+ * past. prev == current is still correct but we need to recalculate this_rq -+ * because prev may have moved to another CPU. -+ */ -+static void finish_task_switch(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq = this_rq(); -+ struct mm_struct *mm = rq->prev_mm; -+ long prev_state; -+ -+ /* -+ * The previous task will have left us with a preempt_count of 2 -+ * because it left us after: -+ * -+ * schedule() -+ * preempt_disable(); // 1 -+ * __schedule() -+ * raw_spin_lock_irq(&rq->lock) // 2 -+ * -+ * Also, see FORK_PREEMPT_COUNT. -+ */ -+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, -+ "corrupted preempt_count: %s/%d/0x%x\n", -+ current->comm, current->pid, preempt_count())) -+ preempt_count_set(FORK_PREEMPT_COUNT); -+ -+ rq->prev_mm = NULL; -+ -+ /* -+ * A task struct has one reference for the use as "current". -+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls -+ * schedule one last time. The schedule call will never return, and -+ * the scheduled task must drop that reference. -+ * -+ * We must observe prev->state before clearing prev->on_cpu (in -+ * finish_lock_switch), otherwise a concurrent wakeup can get prev -+ * running on another CPU and we could rave with its RUNNING -> DEAD -+ * transition, resulting in a double drop. -+ */ -+ prev_state = prev->state; -+ vtime_task_switch(prev); -+ perf_event_task_sched_in(prev, current); -+ /* -+ * The membarrier system call requires a full memory barrier -+ * after storing to rq->curr, before going back to user-space. -+ * -+ * TODO: This smp_mb__after_unlock_lock can go away if PPC end -+ * up adding a full barrier to switch_mm(), or we should figure -+ * out if a smp_mb__after_unlock_lock is really the proper API -+ * to use. -+ */ -+ smp_mb__after_unlock_lock(); -+ finish_lock_switch(rq, prev); -+ finish_arch_post_lock_switch(); -+ -+ fire_sched_in_preempt_notifiers(current); -+ if (mm) -+ mmdrop(mm); -+ if (unlikely(prev_state == TASK_DEAD)) { -+ /* -+ * Remove function-return probe instances associated with this -+ * task and put them back on the free list. -+ */ -+ kprobe_flush_task(prev); -+ -+ /* Task is done with its stack. */ -+ put_task_stack(prev); -+ -+ put_task_struct(prev); -+ } -+} -+ -+/** -+ * schedule_tail - first thing a freshly forked thread must call. -+ * @prev: the thread we just switched away from. 
-+ */ -+asmlinkage __visible void schedule_tail(struct task_struct *prev) -+{ -+ /* -+ * New tasks start with FORK_PREEMPT_COUNT, see there and -+ * finish_task_switch() for details. -+ * -+ * finish_task_switch() will drop rq->lock() and lower preempt_count -+ * and the preempt_enable() will end up enabling preemption (on -+ * PREEMPT_COUNT kernels). -+ */ -+ -+ finish_task_switch(prev); -+ preempt_enable(); -+ -+ if (current->set_child_tid) -+ put_user(task_pid_vnr(current), current->set_child_tid); -+} -+ -+/* -+ * context_switch - switch to the new MM and the new thread's register state. -+ */ -+static __always_inline void -+context_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ struct mm_struct *mm, *oldmm; -+ -+ prepare_task_switch(rq, prev, next); -+ -+ mm = next->mm; -+ oldmm = prev->active_mm; -+ /* -+ * For paravirt, this is coupled with an exit in switch_to to -+ * combine the page table reload and the switch backend into -+ * one hypercall. -+ */ -+ arch_start_context_switch(prev); -+ -+ if (!mm) { -+ next->active_mm = oldmm; -+ mmgrab(oldmm); -+ enter_lazy_tlb(oldmm, next); -+ } else -+ switch_mm_irqs_off(oldmm, mm, next); -+ -+ if (!prev->mm) { -+ prev->active_mm = NULL; -+ rq->prev_mm = oldmm; -+ } -+ /* -+ * Since the runqueue lock will be released by the next -+ * task (which is an invalid locking op but in the case -+ * of the scheduler it's an obvious special-case), so we -+ * do an early lockdep release here: -+ */ -+ spin_release(&rq->lock.dep_map, 1, _THIS_IP_); -+ -+ /* Here we just switch the register state and the stack. */ -+ switch_to(prev, next, prev); -+ barrier(); -+ -+ finish_task_switch(prev); -+} -+ -+/* -+ * nr_running, nr_uninterruptible and nr_context_switches: -+ * -+ * externally visible scheduler statistics: current number of runnable -+ * threads, total number of context switches performed since bootup. -+ */ -+unsigned long nr_running(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_online_cpu(i) -+ sum += cpu_rq(i)->nr_running; -+ -+ return sum; -+} -+ -+static unsigned long nr_uninterruptible(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_online_cpu(i) -+ sum += cpu_rq(i)->nr_uninterruptible; -+ -+ return sum; -+} -+ -+/* -+ * Check if only the current task is running on the CPU. -+ * -+ * Caution: this function does not check that the caller has disabled -+ * preemption, thus the result might have a time-of-check-to-time-of-use -+ * race. The caller is responsible to use it correctly, for example: -+ * -+ * - from a non-preemptable section (of course) -+ * -+ * - from a thread that is bound to a single CPU -+ * -+ * - in a loop with very short iterations (e.g. a polling loop) -+ */ -+bool single_task_running(void) -+{ -+ struct rq *rq = cpu_rq(smp_processor_id()); -+ -+ if (rq_load(rq) == 1) -+ return true; -+ else -+ return false; -+} -+EXPORT_SYMBOL(single_task_running); -+ -+unsigned long long nr_context_switches(void) -+{ -+ int i; -+ unsigned long long sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += cpu_rq(i)->nr_switches; -+ -+ return sum; -+} -+ -+/* -+ * IO-wait accounting, and how its mostly bollocks (on SMP). -+ * -+ * The idea behind IO-wait account is to account the idle time that we could -+ * have spend running if it were not for IO. That is, if we were to improve the -+ * storage performance, we'd have a proportional reduction in IO-wait time. 
-+ * -+ * This all works nicely on UP, where, when a task blocks on IO, we account -+ * idle time as IO-wait, because if the storage were faster, it could've been -+ * running and we'd not be idle. -+ * -+ * This has been extended to SMP, by doing the same for each CPU. This however -+ * is broken. -+ * -+ * Imagine for instance the case where two tasks block on one CPU, only the one -+ * CPU will have IO-wait accounted, while the other has regular idle. Even -+ * though, if the storage were faster, both could've ran at the same time, -+ * utilising both CPUs. -+ * -+ * This means, that when looking globally, the current IO-wait accounting on -+ * SMP is a lower bound, by reason of under accounting. -+ * -+ * Worse, since the numbers are provided per CPU, they are sometimes -+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly -+ * associated with any one particular CPU, it can wake to another CPU than it -+ * blocked on. This means the per CPU IO-wait number is meaningless. -+ * -+ * Task CPU affinities can make all that even more 'interesting'. -+ */ -+ -+unsigned long nr_iowait(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += atomic_read(&cpu_rq(i)->nr_iowait); -+ -+ return sum; -+} -+ -+/* -+ * Consumers of these two interfaces, like for example the cpufreq menu -+ * governor are using nonsensical data. Boosting frequency for a CPU that has -+ * IO-wait which might not even end up running the task when it does become -+ * runnable. -+ */ -+ -+unsigned long nr_iowait_cpu(int cpu) -+{ -+ struct rq *this = cpu_rq(cpu); -+ return atomic_read(&this->nr_iowait); -+} -+ -+unsigned long nr_active(void) -+{ -+ return nr_running() + nr_uninterruptible(); -+} -+ -+/* -+ * I/O wait is the number of running or queued tasks with their ->rq pointer -+ * set to this cpu as being the CPU they're more likely to run on. -+ */ -+void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) -+{ -+ struct rq *rq = this_rq(); -+ -+ *nr_waiters = atomic_read(&rq->nr_iowait); -+ *load = rq_load(rq); -+} -+ -+/* Variables and functions for calc_load */ -+static unsigned long calc_load_update; -+unsigned long avenrun[3]; -+EXPORT_SYMBOL(avenrun); -+ -+/** -+ * get_avenrun - get the load average array -+ * @loads: pointer to dest load array -+ * @offset: offset to add -+ * @shift: shift count to shift the result left -+ * -+ * These values are estimates at best, so no need for locking. -+ */ -+void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -+{ -+ loads[0] = (avenrun[0] + offset) << shift; -+ loads[1] = (avenrun[1] + offset) << shift; -+ loads[2] = (avenrun[2] + offset) << shift; -+} -+ -+static unsigned long -+calc_load(unsigned long load, unsigned long exp, unsigned long active) -+{ -+ unsigned long newload; -+ -+ newload = load * exp + active * (FIXED_1 - exp); -+ if (active >= load) -+ newload += FIXED_1-1; -+ -+ return newload / FIXED_1; -+} -+ -+/* -+ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. 
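calc_load() above is an exponentially weighted moving average in 11-bit fixed point. A self-contained userspace replica, using the usual kernel constants (FSHIFT = 11, EXP_1 = 1884, assumed from mainline's loadavg.h since they are not shown in this hunk), run for two simulated minutes with two runnable tasks:

    #include <stdio.h>

    #define FSHIFT  11
    #define FIXED_1 (1UL << FSHIFT)
    #define EXP_1   1884            /* 1/exp(5s/1min) in fixed point */

    static unsigned long calc_load(unsigned long load, unsigned long exp,
                                   unsigned long active)
    {
        unsigned long newload = load * exp + active * (FIXED_1 - exp);

        if (active >= load)
            newload += FIXED_1 - 1; /* round up while load is rising */
        return newload / FIXED_1;
    }

    int main(void)
    {
        unsigned long avg = 0, active = 2 * FIXED_1; /* 2 runnable tasks */

        for (int i = 0; i < 24; i++)                 /* 24 * 5s = 2 min */
            avg = calc_load(avg, EXP_1, active);
        printf("1-min avg ~ %lu.%02lu\n",
               avg >> FSHIFT, (avg & (FIXED_1 - 1)) * 100 >> FSHIFT);
        return 0;
    }

After two minutes the average has climbed to roughly 1.73, still converging toward 2.0, which is the expected behaviour of the 1-minute decay constant.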
-+ */ -+void calc_global_load(unsigned long ticks) -+{ -+ long active; -+ -+ if (time_before(jiffies, READ_ONCE(calc_load_update))) -+ return; -+ active = nr_active() * FIXED_1; -+ -+ avenrun[0] = calc_load(avenrun[0], EXP_1, active); -+ avenrun[1] = calc_load(avenrun[1], EXP_5, active); -+ avenrun[2] = calc_load(avenrun[2], EXP_15, active); -+ -+ calc_load_update = jiffies + LOAD_FREQ; -+} -+ -+DEFINE_PER_CPU(struct kernel_stat, kstat); -+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); -+ -+EXPORT_PER_CPU_SYMBOL(kstat); -+EXPORT_PER_CPU_SYMBOL(kernel_cpustat); -+ -+#ifdef CONFIG_PARAVIRT -+static inline u64 steal_ticks(u64 steal) -+{ -+ if (unlikely(steal > NSEC_PER_SEC)) -+ return div_u64(steal, TICK_NSEC); -+ -+ return __iter_div_u64_rem(steal, TICK_NSEC, &steal); -+} -+#endif -+ -+#ifndef nsecs_to_cputime -+# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) -+#endif -+ -+/* -+ * On each tick, add the number of nanoseconds to the unbanked variables and -+ * once one tick's worth has accumulated, account it allowing for accurate -+ * sub-tick accounting and totals. -+ */ -+static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns) -+{ -+ u64 *cpustat = kcpustat_this_cpu->cpustat; -+ unsigned long ticks; -+ -+ if (atomic_read(&rq->nr_iowait) > 0) { -+ rq->iowait_ns += ns; -+ if (rq->iowait_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->iowait_ns); -+ cpustat[CPUTIME_IOWAIT] += (__force u64)TICK_NSEC * ticks; -+ rq->iowait_ns %= JIFFY_NS; -+ } -+ } else { -+ rq->idle_ns += ns; -+ if (rq->idle_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->idle_ns); -+ cpustat[CPUTIME_IDLE] += (__force u64)TICK_NSEC * ticks; -+ rq->idle_ns %= JIFFY_NS; -+ } -+ } -+ acct_update_integrals(idle); -+} -+ -+static void pc_system_time(struct rq *rq, struct task_struct *p, -+ int hardirq_offset, unsigned long ns) -+{ -+ u64 *cpustat = kcpustat_this_cpu->cpustat; -+ unsigned long ticks; -+ -+ p->stime_ns += ns; -+ if (p->stime_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(p->stime_ns); -+ p->stime_ns %= JIFFY_NS; -+ p->stime += (__force u64)TICK_NSEC * ticks; -+ account_group_system_time(p, TICK_NSEC * ticks); -+ } -+ p->sched_time += ns; -+ account_group_exec_runtime(p, ns); -+ -+ if (hardirq_count() - hardirq_offset) { -+ rq->irq_ns += ns; -+ if (rq->irq_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->irq_ns); -+ cpustat[CPUTIME_IRQ] += (__force u64)TICK_NSEC * ticks; -+ rq->irq_ns %= JIFFY_NS; -+ } -+ } else if (in_serving_softirq()) { -+ rq->softirq_ns += ns; -+ if (rq->softirq_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->softirq_ns); -+ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_NSEC * ticks; -+ rq->softirq_ns %= JIFFY_NS; -+ } -+ } else { -+ rq->system_ns += ns; -+ if (rq->system_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->system_ns); -+ cpustat[CPUTIME_SYSTEM] += (__force u64)TICK_NSEC * ticks; -+ rq->system_ns %= JIFFY_NS; -+ } -+ } -+ acct_update_integrals(p); -+} -+ -+static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns) -+{ -+ u64 *cpustat = kcpustat_this_cpu->cpustat; -+ unsigned long ticks; -+ -+ p->utime_ns += ns; -+ if (p->utime_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(p->utime_ns); -+ p->utime_ns %= JIFFY_NS; -+ p->utime += (__force u64)TICK_NSEC * ticks; -+ account_group_user_time(p, TICK_NSEC * ticks); -+ } -+ p->sched_time += ns; -+ account_group_exec_runtime(p, ns); -+ -+ if (this_cpu_ksoftirqd() == p) { -+ /* -+ * ksoftirqd time do not get accounted in cpu_softirq_time. -+ * So, we have to handle it separately here. 
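The accounting helpers above all use the same pattern: nanoseconds are accumulated in a per-field bank and only whole ticks are transferred into cpustat, so sub-tick slices are never lost to rounding. A userspace sketch of the pattern; HZ = 100 is an assumption for the demo:

    #include <stdio.h>

    #define HZ       100
    #define JIFFY_NS (1000000000ULL / HZ)

    int main(void)
    {
        unsigned long long banked_ns = 0, ticks_accounted = 0;

        /* Simulate 250 slices of 4 ms each: 1 s of sub-tick CPU time. */
        for (int i = 0; i < 250; i++) {
            banked_ns += 4000000ULL;            /* 4 ms */
            if (banked_ns >= JIFFY_NS) {
                ticks_accounted += banked_ns / JIFFY_NS;
                banked_ns %= JIFFY_NS;          /* keep the remainder */
            }
        }
        printf("%llu ticks accounted, %llu ns still banked\n",
               ticks_accounted, banked_ns);     /* 100 ticks, 0 ns */
        return 0;
    }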
-+ */ -+ rq->softirq_ns += ns; -+ if (rq->softirq_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->softirq_ns); -+ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_NSEC * ticks; -+ rq->softirq_ns %= JIFFY_NS; -+ } -+ } -+ -+ if (task_nice(p) > 0 || idleprio_task(p)) { -+ rq->nice_ns += ns; -+ if (rq->nice_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->nice_ns); -+ cpustat[CPUTIME_NICE] += (__force u64)TICK_NSEC * ticks; -+ rq->nice_ns %= JIFFY_NS; -+ } -+ } else { -+ rq->user_ns += ns; -+ if (rq->user_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->user_ns); -+ cpustat[CPUTIME_USER] += (__force u64)TICK_NSEC * ticks; -+ rq->user_ns %= JIFFY_NS; -+ } -+ } -+ acct_update_integrals(p); -+} -+ -+/* -+ * This is called on clock ticks. -+ * Bank in p->sched_time the ns elapsed since the last tick or switch. -+ * CPU scheduler quota accounting is also performed here in microseconds. -+ */ -+static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p) -+{ -+ s64 account_ns = rq->niffies - p->last_ran; -+ struct task_struct *idle = rq->idle; -+ -+ /* Accurate tick timekeeping */ -+ if (user_mode(get_irq_regs())) -+ pc_user_time(rq, p, account_ns); -+ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) { -+ pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns); -+ } else -+ pc_idle_time(rq, idle, account_ns); -+ -+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ -+ if (p->policy != SCHED_FIFO && p != idle) -+ p->time_slice -= NS_TO_US(account_ns); -+ -+ p->last_ran = rq->niffies; -+} -+ -+/* -+ * This is called on context switches. -+ * Bank in p->sched_time the ns elapsed since the last tick or switch. -+ * CPU scheduler quota accounting is also performed here in microseconds. -+ */ -+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p) -+{ -+ s64 account_ns = rq->niffies - p->last_ran; -+ struct task_struct *idle = rq->idle; -+ -+ /* Accurate subtick timekeeping */ -+ if (p != idle) -+ pc_user_time(rq, p, account_ns); -+ else -+ pc_idle_time(rq, idle, account_ns); -+ -+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ -+ if (p->policy != SCHED_FIFO && p != idle) -+ p->time_slice -= NS_TO_US(account_ns); -+} -+ -+/* -+ * Return any ns on the sched_clock that have not yet been accounted in -+ * @p in case that task is currently running. -+ * -+ * Called with task_rq_lock(p) held. -+ */ -+static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) -+{ -+ u64 ns = 0; -+ -+ /* -+ * Must be ->curr _and_ ->on_rq. If dequeued, we would -+ * project cycles that may never be accounted to this -+ * thread, breaking clock_gettime(). -+ */ -+ if (p == rq->curr && task_on_rq_queued(p)) { -+ update_clocks(rq); -+ ns = rq->niffies - p->last_ran; -+ } -+ -+ return ns; -+} -+ -+/* -+ * Return accounted runtime for the task. -+ * Return separately the current's pending runtime that have not been -+ * accounted yet. -+ * -+ */ -+unsigned long long task_sched_runtime(struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ u64 ns; -+ -+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) -+ /* -+ * 64-bit doesn't need locks to atomically read a 64bit value. -+ * So we have a optimization chance when the task's delta_exec is 0. -+ * Reading ->on_cpu is racy, but this is ok. -+ * -+ * If we race with it leaving CPU, we'll take a lock. So we're correct. -+ * If we race with it entering CPU, unaccounted time is 0. This is -+ * indistinguishable from the read occurring a few cycles earlier. 
-+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has -+ * been accounted, so we're correct here as well. -+ */ -+ if (!p->on_cpu || !task_on_rq_queued(p)) -+ return tsk_seruntime(p); -+#endif -+ -+ rq = task_rq_lock(p, &flags); -+ ns = p->sched_time + do_task_delta_exec(p, rq); -+ task_rq_unlock(rq, p, &flags); -+ -+ return ns; -+} -+ -+/* -+ * Functions to test for when SCHED_ISO tasks have used their allocated -+ * quota as real time scheduling and convert them back to SCHED_NORMAL. All -+ * data is modified only by the local runqueue during scheduler_tick with -+ * interrupts disabled. -+ */ -+ -+/* -+ * Test if SCHED_ISO tasks have run longer than their alloted period as RT -+ * tasks and set the refractory flag if necessary. There is 10% hysteresis -+ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a -+ * slow division. -+ */ -+static inline void iso_tick(struct rq *rq) -+{ -+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; -+ rq->iso_ticks += 100; -+ if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) { -+ rq->iso_refractory = true; -+ if (unlikely(rq->iso_ticks > ISO_PERIOD * 100)) -+ rq->iso_ticks = ISO_PERIOD * 100; -+ } -+} -+ -+/* No SCHED_ISO task was running so decrease rq->iso_ticks */ -+static inline void no_iso_tick(struct rq *rq, int ticks) -+{ -+ if (rq->iso_ticks > 0 || rq->iso_refractory) { -+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD; -+ if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) { -+ rq->iso_refractory = false; -+ if (unlikely(rq->iso_ticks < 0)) -+ rq->iso_ticks = 0; -+ } -+ } -+} -+ -+/* This manages tasks that have run out of timeslice during a scheduler_tick */ -+static void task_running_tick(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ /* -+ * If a SCHED_ISO task is running we increment the iso_ticks. In -+ * order to prevent SCHED_ISO tasks from causing starvation in the -+ * presence of true RT tasks we account those as iso_ticks as well. -+ */ -+ if (rt_task(p) || task_running_iso(p)) -+ iso_tick(rq); -+ else -+ no_iso_tick(rq, 1); -+ -+ /* SCHED_FIFO tasks never run out of timeslice. */ -+ if (p->policy == SCHED_FIFO) -+ return; -+ -+ if (iso_task(p)) { -+ if (task_running_iso(p)) { -+ if (rq->iso_refractory) { -+ /* -+ * SCHED_ISO task is running as RT and limit -+ * has been hit. Force it to reschedule as -+ * SCHED_NORMAL by zeroing its time_slice -+ */ -+ p->time_slice = 0; -+ } -+ } else if (!rq->iso_refractory) { -+ /* Can now run again ISO. Reschedule to pick up prio */ -+ goto out_resched; -+ } -+ } -+ -+ /* -+ * Tasks that were scheduled in the first half of a tick are not -+ * allowed to run into the 2nd half of the next tick if they will -+ * run out of time slice in the interim. Otherwise, if they have -+ * less than RESCHED_US μs of time slice left they will be rescheduled. -+ * Dither is used as a backup for when hrexpiry is disabled or high res -+ * timers not configured in. -+ */ -+ if (p->time_slice - rq->dither >= RESCHED_US) -+ return; -+out_resched: -+ rq_lock(rq); -+ __set_tsk_resched(p); -+ rq_unlock(rq); -+} -+ -+#ifdef CONFIG_NO_HZ_FULL -+/* -+ * We can stop the timer tick any time highres timers are active since -+ * we rely entirely on highres timeouts for task expiry rescheduling. 
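The 115/128 factor in no_iso_tick() above implements the ~10% hysteresis cheaply: it is approximately 90/100 but costs only a multiply and a shift rather than a division by 100. A worked check of the two thresholds, expressed in percent terms, with 70 assumed as the sched_iso_cpu default (the traditional BFS/MuQSS value):

    #include <stdio.h>

    int main(void)
    {
        int sched_iso_cpu = 70;                 /* assumed default (%) */
        int trip  = sched_iso_cpu;              /* refractory trips above this */
        int clear = sched_iso_cpu * 115 / 128;  /* ...and clears below this */

        printf("trip at %d%%, clear at %d%% (115/128 = %.4f)\n",
               trip, clear, 115.0 / 128.0);     /* 70%, 62%, 0.8984 */
        return 0;
    }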
-+ */ -+static void sched_stop_tick(struct rq *rq, int cpu) -+{ -+ if (!hrexpiry_enabled(rq)) -+ return; -+ if (!tick_nohz_full_enabled()) -+ return; -+ if (!tick_nohz_full_cpu(cpu)) -+ return; -+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+ -+static inline void sched_start_tick(struct rq *rq, int cpu) -+{ -+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+ -+/** -+ * scheduler_tick_max_deferment -+ * -+ * Keep at least one tick per second when a single -+ * active task is running. -+ * -+ * This makes sure that uptime continues to move forward, even -+ * with a very low granularity. -+ * -+ * Return: Maximum deferment in nanoseconds. -+ */ -+u64 scheduler_tick_max_deferment(void) -+{ -+ struct rq *rq = this_rq(); -+ unsigned long next, now = READ_ONCE(jiffies); -+ -+ next = rq->last_jiffy + HZ; -+ -+ if (time_before_eq(next, now)) -+ return 0; -+ -+ return jiffies_to_nsecs(next - now); -+} -+#else -+static inline void sched_stop_tick(struct rq *rq, int cpu) -+{ -+} -+ -+static inline void sched_start_tick(struct rq *rq, int cpu) -+{ -+} -+#endif -+ -+/* -+ * This function gets called by the timer code, with HZ frequency. -+ * We call it with interrupts disabled. -+ */ -+void scheduler_tick(void) -+{ -+ int cpu __maybe_unused = smp_processor_id(); -+ struct rq *rq = cpu_rq(cpu); -+ -+ sched_clock_tick(); -+ update_clocks(rq); -+ update_load_avg(rq, 0); -+ update_cpu_clock_tick(rq, rq->curr); -+ if (!rq_idle(rq)) -+ task_running_tick(rq); -+ else if (rq->last_jiffy > rq->last_scheduler_tick) -+ no_iso_tick(rq, rq->last_jiffy - rq->last_scheduler_tick); -+ rq->last_scheduler_tick = rq->last_jiffy; -+ rq->last_tick = rq->clock; -+ perf_event_task_tick(); -+ sched_stop_tick(rq, cpu); -+} -+ -+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ -+ defined(CONFIG_PREEMPT_TRACER)) -+/* -+ * If the value passed in is equal to the current preempt count -+ * then we just disabled preemption. Start timing the latency. -+ */ -+static inline void preempt_latency_start(int val) -+{ -+ if (preempt_count() == val) { -+ unsigned long ip = get_lock_parent_ip(); -+#ifdef CONFIG_DEBUG_PREEMPT -+ current->preempt_disable_ip = ip; -+#endif -+ trace_preempt_off(CALLER_ADDR0, ip); -+ } -+} -+ -+void preempt_count_add(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) -+ return; -+#endif -+ __preempt_count_add(val); -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Spinlock count overflowing soon? -+ */ -+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= -+ PREEMPT_MASK - 10); -+#endif -+ preempt_latency_start(val); -+} -+EXPORT_SYMBOL(preempt_count_add); -+NOKPROBE_SYMBOL(preempt_count_add); -+ -+/* -+ * If the value passed in equals to the current preempt count -+ * then we just enabled preemption. Stop timing the latency. -+ */ -+static inline void preempt_latency_stop(int val) -+{ -+ if (preempt_count() == val) -+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); -+} -+ -+void preempt_count_sub(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) -+ return; -+ /* -+ * Is the spinlock portion underflowing? 
-+ */ -+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && -+ !(preempt_count() & PREEMPT_MASK))) -+ return; -+#endif -+ -+ preempt_latency_stop(val); -+ __preempt_count_sub(val); -+} -+EXPORT_SYMBOL(preempt_count_sub); -+NOKPROBE_SYMBOL(preempt_count_sub); -+ -+#else -+static inline void preempt_latency_start(int val) { } -+static inline void preempt_latency_stop(int val) { } -+#endif -+ -+static inline unsigned long get_preempt_disable_ip(struct task_struct *p) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ return p->preempt_disable_ip; -+#else -+ return 0; -+#endif -+} -+ -+/* -+ * The time_slice is only refilled when it is empty and that is when we set a -+ * new deadline. Make sure update_clocks has been called recently to update -+ * rq->niffies. -+ */ -+static void time_slice_expired(struct task_struct *p, struct rq *rq) -+{ -+ p->time_slice = timeslice(); -+ p->deadline = rq->niffies + task_deadline_diff(p); -+#ifdef CONFIG_SMT_NICE -+ if (!p->mm) -+ p->smt_bias = 0; -+ else if (rt_task(p)) -+ p->smt_bias = 1 << 30; -+ else if (task_running_iso(p)) -+ p->smt_bias = 1 << 29; -+ else if (idleprio_task(p)) { -+ if (task_running_idle(p)) -+ p->smt_bias = 0; -+ else -+ p->smt_bias = 1; -+ } else if (--p->smt_bias < 1) -+ p->smt_bias = MAX_PRIO - p->static_prio; -+#endif -+} -+ -+/* -+ * Timeslices below RESCHED_US are considered as good as expired as there's no -+ * point rescheduling when there's so little time left. SCHED_BATCH tasks -+ * have been flagged be not latency sensitive and likely to be fully CPU -+ * bound so every time they're rescheduled they have their time_slice -+ * refilled, but get a new later deadline to have little effect on -+ * SCHED_NORMAL tasks. -+ -+ */ -+static inline void check_deadline(struct task_struct *p, struct rq *rq) -+{ -+ if (p->time_slice < RESCHED_US || batch_task(p)) -+ time_slice_expired(p, rq); -+} -+ -+/* -+ * Task selection with skiplists is a simple matter of picking off the first -+ * task in the sorted list, an O(1) operation. The lookup is amortised O(1) -+ * being bound to the number of processors. -+ * -+ * Runqueues are selectively locked based on their unlocked data and then -+ * unlocked if not needed. At most 3 locks will be held at any time and are -+ * released as soon as they're no longer needed. All balancing between CPUs -+ * is thus done here in an extremely simple first come best fit manner. -+ * -+ * This iterates over runqueues in cache locality order. In interactive mode -+ * it iterates over all CPUs and finds the task with the best key/deadline. -+ * In non-interactive mode it will only take a task if it's from the current -+ * runqueue or a runqueue with more tasks than the current one with a better -+ * key/deadline. -+ */ -+#ifdef CONFIG_SMP -+static inline struct task_struct -+*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) -+{ -+ struct rq *locked = NULL, *chosen = NULL; -+ struct task_struct *edt = idle; -+ int i, best_entries = 0; -+ u64 best_key = ~0ULL; -+ -+ for (i = 0; i < num_possible_cpus(); i++) { -+ struct rq *other_rq = rq_order(rq, i); -+ int entries = other_rq->sl->entries; -+ skiplist_node *next; -+ -+ /* -+ * Check for queued entres lockless first. The local runqueue -+ * is locked so entries will always be accurate. -+ */ -+ if (!sched_interactive) { -+ /* -+ * Don't reschedule balance across nodes unless the CPU -+ * is idle. 
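The O(1) claim above follows from the skiplist invariant: entries stay sorted by key (the virtual deadline), so the best candidate is always the first bottom-level node after the header, exactly what the UP variant's `rq->node.next[0]->value` reads. A hypothetical minimal structure for illustration, not the patch's actual skiplist types:

    struct sl_node {
        unsigned long long key;        /* virtual deadline */
        void *value;                   /* the task */
        struct sl_node *next[8];       /* per-level forward pointers */
    };

    /* The list is circular through the header, so next[0] == head
     * means "empty"; otherwise it is the earliest-deadline entry. */
    static inline void *sl_first(struct sl_node *head)
    {
        struct sl_node *n = head->next[0];

        return (n == head) ? NULL : n->value;
    }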
-+ */ -+ if (edt != idle && rq->cpu_locality[other_rq->cpu] > 3) -+ break; -+ if (entries <= best_entries) -+ continue; -+ } else if (!entries) -+ continue; -+ -+ /* if (i) implies other_rq != rq */ -+ if (i) { -+ /* Check for best id queued lockless first */ -+ if (other_rq->best_key >= best_key) -+ continue; -+ -+ if (unlikely(!trylock_rq(rq, other_rq))) -+ continue; -+ -+ /* Need to reevaluate entries after locking */ -+ entries = other_rq->sl->entries; -+ if (unlikely(!entries)) { -+ unlock_rq(other_rq); -+ continue; -+ } -+ } -+ -+ next = &other_rq->node; -+ /* -+ * In interactive mode we check beyond the best entry on other -+ * runqueues if we can't get the best for smt or affinity -+ * reasons. -+ */ -+ while ((next = next->next[0]) != &other_rq->node) { -+ struct task_struct *p; -+ u64 key = next->key; -+ -+ /* Reevaluate key after locking */ -+ if (key >= best_key) -+ break; -+ -+ p = next->value; -+ if (!smt_schedule(p, rq)) { -+ if (i && !sched_interactive) -+ break; -+ continue; -+ } -+ -+ /* Make sure affinity is ok */ -+ if (i) { -+ if (needs_other_cpu(p, cpu)) { -+ if (sched_interactive) -+ continue; -+ break; -+ } -+ /* From this point on p is the best so far */ -+ if (locked) -+ unlock_rq(locked); -+ chosen = locked = other_rq; -+ } -+ best_entries = entries; -+ best_key = key; -+ edt = p; -+ break; -+ } -+ /* rq->preempting is a hint only as the state may have changed -+ * since it was set with the resched call but if we have met -+ * the condition we can break out here. */ -+ if (edt == rq->preempting) -+ break; -+ if (i && other_rq != chosen) -+ unlock_rq(other_rq); -+ } -+ -+ if (likely(edt != idle)) -+ take_task(rq, cpu, edt); -+ -+ if (locked) -+ unlock_rq(locked); -+ -+ rq->preempting = NULL; -+ -+ return edt; -+} -+#else /* CONFIG_SMP */ -+static inline struct task_struct -+*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) -+{ -+ struct task_struct *edt; -+ -+ if (unlikely(!rq->sl->entries)) -+ return idle; -+ edt = rq->node.next[0]->value; -+ take_task(rq, cpu, edt); -+ return edt; -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * Print scheduling while atomic bug: -+ */ -+static noinline void __schedule_bug(struct task_struct *prev) -+{ -+ /* Save this before calling printk(), since that will clobber it */ -+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ if (oops_in_progress) -+ return; -+ -+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", -+ prev->comm, prev->pid, preempt_count()); -+ -+ debug_show_held_locks(prev); -+ print_modules(); -+ if (irqs_disabled()) -+ print_irqtrace_events(prev); -+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) -+ && in_atomic_preempt_off()) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(preempt_disable_ip); -+ pr_cont("\n"); -+ } -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+ -+/* -+ * Various schedule()-time debugging checks and statistics: -+ */ -+static inline void schedule_debug(struct task_struct *prev) -+{ -+#ifdef CONFIG_SCHED_STACK_END_CHECK -+ if (task_stack_end_corrupted(prev)) -+ panic("corrupted stack end detected inside scheduler\n"); -+#endif -+ -+ if (unlikely(in_atomic_preempt_off())) { -+ __schedule_bug(prev); -+ preempt_count_set(PREEMPT_DISABLED); -+ } -+ rcu_sleep_check(); -+ -+ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -+ -+ schedstat_inc(this_rq()->sched_count); -+} -+ -+/* -+ * The currently running task's information is all stored in rq local data -+ * which is only modified by the local CPU. 
-+ */ -+static inline void set_rq_task(struct rq *rq, struct task_struct *p) -+{ -+ if (p == rq->idle || p->policy == SCHED_FIFO) -+ hrexpiry_clear(rq); -+ else -+ hrexpiry_start(rq, US_TO_NS(p->time_slice)); -+ if (rq->clock - rq->last_tick > HALF_JIFFY_NS) -+ rq->dither = 0; -+ else -+ rq->dither = rq_dither(rq); -+ -+ rq->rq_deadline = p->deadline; -+ rq->rq_prio = p->prio; -+#ifdef CONFIG_SMT_NICE -+ rq->rq_mm = p->mm; -+ rq->rq_smt_bias = p->smt_bias; -+#endif -+} -+ -+#ifdef CONFIG_SMT_NICE -+static void check_no_siblings(struct rq __maybe_unused *this_rq) {} -+static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} -+static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings; -+static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings; -+ -+/* Iterate over smt siblings when we've scheduled a process on cpu and decide -+ * whether they should continue running or be descheduled. */ -+static void check_smt_siblings(struct rq *this_rq) -+{ -+ int other_cpu; -+ -+ for_each_cpu(other_cpu, &this_rq->thread_mask) { -+ struct task_struct *p; -+ struct rq *rq; -+ -+ rq = cpu_rq(other_cpu); -+ if (rq_idle(rq)) -+ continue; -+ p = rq->curr; -+ if (!smt_schedule(p, this_rq)) -+ resched_curr(rq); -+ } -+} -+ -+static void wake_smt_siblings(struct rq *this_rq) -+{ -+ int other_cpu; -+ -+ for_each_cpu(other_cpu, &this_rq->thread_mask) { -+ struct rq *rq; -+ -+ rq = cpu_rq(other_cpu); -+ if (rq_idle(rq)) -+ resched_idle(rq); -+ } -+} -+#else -+static void check_siblings(struct rq __maybe_unused *this_rq) {} -+static void wake_siblings(struct rq __maybe_unused *this_rq) {} -+#endif -+ -+/* -+ * schedule() is the main scheduler function. -+ * -+ * The main means of driving the scheduler and thus entering this function are: -+ * -+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. -+ * -+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return -+ * paths. For example, see arch/x86/entry_64.S. -+ * -+ * To drive preemption between tasks, the scheduler sets the flag in timer -+ * interrupt handler scheduler_tick(). -+ * -+ * 3. Wakeups don't really cause entry into schedule(). They add a -+ * task to the run-queue and that's it. -+ * -+ * Now, if the new task added to the run-queue preempts the current -+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets -+ * called on the nearest possible occasion: -+ * -+ * - If the kernel is preemptible (CONFIG_PREEMPT=y): -+ * -+ * - in syscall or exception context, at the next outmost -+ * preempt_enable(). (this might be as soon as the wake_up()'s -+ * spin_unlock()!) -+ * -+ * - in IRQ context, return from interrupt-handler to -+ * preemptible context -+ * -+ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) -+ * then at the next: -+ * -+ * - cond_resched() call -+ * - explicit schedule() call -+ * - return from syscall or exception to user-space -+ * - return from interrupt-handler to user-space -+ * -+ * WARNING: must be called with preemption disabled! 
-+ */ -+static void __sched notrace __schedule(bool preempt) -+{ -+ struct task_struct *prev, *next, *idle; -+ unsigned long *switch_count; -+ bool deactivate = false; -+ struct rq *rq; -+ u64 niffies; -+ int cpu; -+ -+ cpu = smp_processor_id(); -+ rq = cpu_rq(cpu); -+ prev = rq->curr; -+ idle = rq->idle; -+ -+ schedule_debug(prev); -+ -+ local_irq_disable(); -+ rcu_note_context_switch(preempt); -+ -+ /* -+ * Make sure that signal_pending_state()->signal_pending() below -+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) -+ * done by the caller to avoid the race with signal_wake_up(). -+ */ -+ rq_lock(rq); -+ smp_mb__after_spinlock(); -+#ifdef CONFIG_SMP -+ if (rq->preempt) { -+ /* -+ * Make sure resched_curr hasn't triggered a preemption -+ * locklessly on a task that has since scheduled away. Spurious -+ * wakeup of idle is okay though. -+ */ -+ if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) { -+ rq->preempt = NULL; -+ clear_preempt_need_resched(); -+ rq_unlock_irq(rq); -+ return; -+ } -+ rq->preempt = NULL; -+ } -+#endif -+ -+ switch_count = &prev->nivcsw; -+ if (!preempt && prev->state) { -+ if (unlikely(signal_pending_state(prev->state, prev))) { -+ prev->state = TASK_RUNNING; -+ } else { -+ deactivate = true; -+ prev->on_rq = 0; -+ -+ if (prev->in_iowait) { -+ atomic_inc(&rq->nr_iowait); -+ delayacct_blkio_start(); -+ } -+ -+ /* -+ * If a worker is going to sleep, notify and -+ * ask workqueue whether it wants to wake up a -+ * task to maintain concurrency. If so, wake -+ * up the task. -+ */ -+ if (prev->flags & PF_WQ_WORKER) { -+ struct task_struct *to_wakeup; -+ -+ to_wakeup = wq_worker_sleeping(prev); -+ if (to_wakeup) -+ try_to_wake_up_local(to_wakeup); -+ } -+ } -+ switch_count = &prev->nvcsw; -+ } -+ -+ /* -+ * Store the niffy value here for use by the next task's last_ran -+ * below to avoid losing niffies due to update_clocks being called -+ * again after this point. -+ */ -+ update_clocks(rq); -+ niffies = rq->niffies; -+ update_cpu_clock_switch(rq, prev); -+ -+ clear_tsk_need_resched(prev); -+ clear_preempt_need_resched(); -+ -+ if (idle != prev) { -+ check_deadline(prev, rq); -+ return_task(prev, rq, cpu, deactivate); -+ } -+ -+ next = earliest_deadline_task(rq, cpu, idle); -+ if (likely(next->prio != PRIO_LIMIT)) -+ clear_cpuidle_map(cpu); -+ else { -+ set_cpuidle_map(cpu); -+ update_load_avg(rq, 0); -+ } -+ -+ set_rq_task(rq, next); -+ next->last_ran = niffies; -+ -+ if (likely(prev != next)) { -+ /* -+ * Don't reschedule an idle task or deactivated tasks -+ */ -+ if (prev != idle && !deactivate) -+ resched_suitable_idle(prev); -+ if (next != idle) -+ check_siblings(rq); -+ else -+ wake_siblings(rq); -+ rq->nr_switches++; -+ rq->curr = next; -+ /* -+ * The membarrier system call requires each architecture -+ * to have a full memory barrier after updating -+ * rq->curr, before returning to user-space. For TSO -+ * (e.g. x86), the architecture must provide its own -+ * barrier in switch_mm(). For weakly ordered machines -+ * for which spin_unlock() acts as a full memory -+ * barrier, finish_lock_switch() in common code takes -+ * care of this barrier. For weakly ordered machines for -+ * which spin_unlock() acts as a RELEASE barrier (only -+ * arm64 and PowerPC), arm64 has a full barrier in -+ * switch_to(), and PowerPC has -+ * smp_mb__after_unlock_lock() before -+ * finish_lock_switch(). 
-+ */ -+ ++*switch_count; -+ -+ trace_sched_switch(preempt, prev, next); -+ context_switch(rq, prev, next); /* unlocks the rq */ -+ } else { -+ check_siblings(rq); -+ rq_unlock(rq); -+ do_pending_softirq(rq, next); -+ local_irq_enable(); -+ } -+} -+ -+void __noreturn do_task_dead(void) -+{ -+ /* -+ * The setting of TASK_RUNNING by try_to_wake_up() may be delayed -+ * when the following two conditions become true. -+ * - There is race condition of mmap_sem (It is acquired by -+ * exit_mm()), and -+ * - SMI occurs before setting TASK_RUNINNG. -+ * (or hypervisor of virtual machine switches to other guest) -+ * As a result, we may become TASK_RUNNING after becoming TASK_DEAD -+ * -+ * To avoid it, we have to wait for releasing tsk->pi_lock which -+ * is held by try_to_wake_up() -+ */ -+ raw_spin_lock_irq(¤t->pi_lock); -+ raw_spin_unlock_irq(¤t->pi_lock); -+ -+ /* Causes final put_task_struct in finish_task_switch(). */ -+ __set_current_state(TASK_DEAD); -+ -+ /* Tell freezer to ignore us: */ -+ current->flags |= PF_NOFREEZE; -+ __schedule(false); -+ BUG(); -+ -+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ -+ for (;;) -+ cpu_relax(); -+} -+ -+static inline void sched_submit_work(struct task_struct *tsk) -+{ -+ if (!tsk->state || tsk_is_pi_blocked(tsk) || -+ preempt_count() || -+ signal_pending_state(tsk->state, tsk)) -+ return; -+ -+ /* -+ * If we are going to sleep and we have plugged IO queued, -+ * make sure to submit it to avoid deadlocks. -+ */ -+ if (blk_needs_flush_plug(tsk)) -+ blk_schedule_flush_plug(tsk); -+} -+ -+asmlinkage __visible void __sched schedule(void) -+{ -+ struct task_struct *tsk = current; -+ -+ sched_submit_work(tsk); -+ do { -+ preempt_disable(); -+ __schedule(false); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+} -+ -+EXPORT_SYMBOL(schedule); -+ -+/* -+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted -+ * state (have scheduled out non-voluntarily) by making sure that all -+ * tasks have either left the run queue or have gone into user space. -+ * As idle tasks do not do either, they must not ever be preempted -+ * (schedule out non-voluntarily). -+ * -+ * schedule_idle() is similar to schedule_preempt_disable() except that it -+ * never enables preemption because it does not call sched_submit_work(). -+ */ -+void __sched schedule_idle(void) -+{ -+ /* -+ * As this skips calling sched_submit_work(), which the idle task does -+ * regardless because that function is a nop when the task is in a -+ * TASK_RUNNING state, make sure this isn't used someplace that the -+ * current task can be in any other state. Note, idle is always in the -+ * TASK_RUNNING state. -+ */ -+ WARN_ON_ONCE(current->state); -+ do { -+ __schedule(false); -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_CONTEXT_TRACKING -+asmlinkage __visible void __sched schedule_user(void) -+{ -+ /* -+ * If we come here after a random call to set_need_resched(), -+ * or we have been woken up remotely but the IPI has not yet arrived, -+ * we haven't yet exited the RCU idle mode. Do it here manually until -+ * we find a better solution. -+ * -+ * NB: There are buggy callers of this function. Ideally we -+ * should warn if prev_state != IN_USER, but that will trigger -+ * too frequently to make sense yet. 
-+ */ -+ enum ctx_state prev_state = exception_enter(); -+ schedule(); -+ exception_exit(prev_state); -+} -+#endif -+ -+/** -+ * schedule_preempt_disabled - called with preemption disabled -+ * -+ * Returns with preemption disabled. Note: preempt_count must be 1 -+ */ -+void __sched schedule_preempt_disabled(void) -+{ -+ sched_preempt_enable_no_resched(); -+ schedule(); -+ preempt_disable(); -+} -+ -+static void __sched notrace preempt_schedule_common(void) -+{ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ __schedule(true); -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ -+ /* -+ * Check again in case we missed a preemption opportunity -+ * between schedule and now. -+ */ -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_PREEMPT -+/* -+ * this is the entry point to schedule() from in-kernel preemption -+ * off of preempt_enable. Kernel preemptions off return from interrupt -+ * occur there and call schedule directly. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule(void) -+{ -+ /* -+ * If there is a non-zero preempt_count or interrupts are disabled, -+ * we do not want to preempt the current task. Just return.. -+ */ -+ if (likely(!preemptible())) -+ return; -+ -+ preempt_schedule_common(); -+} -+NOKPROBE_SYMBOL(preempt_schedule); -+EXPORT_SYMBOL(preempt_schedule); -+ -+/** -+ * preempt_schedule_notrace - preempt_schedule called by tracing -+ * -+ * The tracing infrastructure uses preempt_enable_notrace to prevent -+ * recursion and tracing preempt enabling caused by the tracing -+ * infrastructure itself. But as tracing can happen in areas coming -+ * from userspace or just about to enter userspace, a preempt enable -+ * can occur before user_exit() is called. This will cause the scheduler -+ * to be called when the system is still in usermode. -+ * -+ * To prevent this, the preempt_enable_notrace will use this function -+ * instead of preempt_schedule() to exit user context if needed before -+ * calling the scheduler. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) -+{ -+ enum ctx_state prev_ctx; -+ -+ if (likely(!preemptible())) -+ return; -+ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. 
-+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ /* -+ * Needs preempt disabled in case user_exit() is traced -+ * and the tracer calls preempt_enable_notrace() causing -+ * an infinite recursion. -+ */ -+ prev_ctx = exception_enter(); -+ __schedule(true); -+ exception_exit(prev_ctx); -+ -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ } while (need_resched()); -+} -+EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -+ -+#endif /* CONFIG_PREEMPT */ -+ -+/* -+ * this is the entry point to schedule() from kernel preemption -+ * off of irq context. -+ * Note, that this is called and return with irqs disabled. This will -+ * protect us against recursive calling from irq. -+ */ -+asmlinkage __visible void __sched preempt_schedule_irq(void) -+{ -+ enum ctx_state prev_state; -+ -+ /* Catch callers which need to be fixed */ -+ BUG_ON(preempt_count() || !irqs_disabled()); -+ -+ prev_state = exception_enter(); -+ -+ do { -+ preempt_disable(); -+ local_irq_enable(); -+ __schedule(true); -+ local_irq_disable(); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ -+ exception_exit(prev_state); -+} -+ -+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, -+ void *key) -+{ -+ return try_to_wake_up(curr->private, mode, wake_flags); -+} -+EXPORT_SYMBOL(default_wake_function); -+ -+#ifdef CONFIG_RT_MUTEXES -+ -+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) -+{ -+ if (pi_task) -+ prio = min(prio, pi_task->prio); -+ -+ return prio; -+} -+ -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ struct task_struct *pi_task = rt_mutex_get_top_task(p); -+ -+ return __rt_effective_prio(pi_task, prio); -+} -+ -+/* -+ * rt_mutex_setprio - set the current priority of a task -+ * @p: task to boost -+ * @pi_task: donor task -+ * -+ * This function changes the 'effective' priority of a task. It does -+ * not touch ->normal_prio like __setscheduler(). -+ * -+ * Used by the rt_mutex code to implement priority inheritance -+ * logic. Call site only calls if the priority of the task changed. -+ */ -+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) -+{ -+ int prio, oldprio; -+ struct rq *rq; -+ -+ /* XXX used to be waiter->prio, not waiter->task->prio */ -+ prio = __rt_effective_prio(pi_task, p->normal_prio); -+ -+ /* -+ * If nothing changed; bail early. -+ */ -+ if (p->pi_top_task == pi_task && prio == p->prio) -+ return; -+ -+ rq = __task_rq_lock(p); -+ update_rq_clock(rq); -+ /* -+ * Set under pi_lock && rq->lock, such that the value can be used under -+ * either lock. -+ * -+ * Note that there is loads of tricky to make this pointer cache work -+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to -+ * ensure a task is de-boosted (pi_task is set to NULL) before the -+ * task is allowed to run again (and can exit). This ensures the pointer -+ * points to a blocked task -- which guaratees the task is present. -+ */ -+ p->pi_top_task = pi_task; -+ -+ /* -+ * For FIFO/RR we only need to set prio, if that matches we're done. -+ */ -+ if (prio == p->prio) -+ goto out_unlock; -+ -+ /* -+ * Idle task boosting is a nono in general. There is one -+ * exception, when PREEMPT_RT and NOHZ is active: -+ * -+ * The idle task calls get_next_timer_interrupt() and holds -+ * the timer wheel base->lock on the CPU and another CPU wants -+ * to access the timer (probably to cancel it). 
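Since kernel priorities are ordered with lower values meaning more important, the PI boost in __rt_effective_prio() above reduces to a min(). A worked userspace check; the numbers follow the usual mainline scale (an RT priority of 50 maps to kernel prio 49, nice 0 to 120), which is assumed rather than shown in this hunk:

    #include <stdio.h>

    static int rt_effective_prio(int pi_task_prio, int prio)
    {
        return pi_task_prio < prio ? pi_task_prio : prio;
    }

    int main(void)
    {
        /* A nice-0 task (prio 120) holds a lock that an RT-50 waiter
         * (kernel prio 49) blocks on: the owner runs boosted at 49. */
        printf("boosted prio = %d\n", rt_effective_prio(49, 120));
        return 0;
    }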
We can safely -+ * ignore the boosting request, as the idle CPU runs this code -+ * with interrupts disabled and will complete the lock -+ * protected section without being interrupted. So there is no -+ * real need to boost. -+ */ -+ if (unlikely(p == rq->idle)) { -+ WARN_ON(p != rq->curr); -+ WARN_ON(p->pi_blocked_on); -+ goto out_unlock; -+ } -+ -+ trace_sched_pi_setprio(p, pi_task); -+ oldprio = p->prio; -+ p->prio = prio; -+ if (task_running(rq, p)){ -+ if (prio > oldprio) -+ resched_task(p); -+ } else if (task_queued(p)) { -+ dequeue_task(rq, p, DEQUEUE_SAVE); -+ enqueue_task(rq, p, ENQUEUE_RESTORE); -+ if (prio < oldprio) -+ try_preempt(p, rq); -+ } -+out_unlock: -+ __task_rq_unlock(rq); -+} -+#else -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ return prio; -+} -+#endif -+ -+/* -+ * Adjust the deadline for when the priority is to change, before it's -+ * changed. -+ */ -+static inline void adjust_deadline(struct task_struct *p, int new_prio) -+{ -+ p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); -+} -+ -+void set_user_nice(struct task_struct *p, long nice) -+{ -+ int new_static, old_static; -+ unsigned long flags; -+ struct rq *rq; -+ -+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) -+ return; -+ new_static = NICE_TO_PRIO(nice); -+ /* -+ * We have to be careful, if called from sys_setpriority(), -+ * the task might be in the middle of scheduling on another CPU. -+ */ -+ rq = task_rq_lock(p, &flags); -+ update_rq_clock(rq); -+ -+ /* -+ * The RT priorities are set via sched_setscheduler(), but we still -+ * allow the 'normal' nice value to be set - but as expected -+ * it wont have any effect on scheduling until the task is -+ * not SCHED_NORMAL/SCHED_BATCH: -+ */ -+ if (has_rt_policy(p)) { -+ p->static_prio = new_static; -+ goto out_unlock; -+ } -+ -+ adjust_deadline(p, new_static); -+ old_static = p->static_prio; -+ p->static_prio = new_static; -+ p->prio = effective_prio(p); -+ -+ if (task_queued(p)) { -+ dequeue_task(rq, p, DEQUEUE_SAVE); -+ enqueue_task(rq, p, ENQUEUE_RESTORE); -+ if (new_static < old_static) -+ try_preempt(p, rq); -+ } else if (task_running(rq, p)) { -+ set_rq_task(rq, p); -+ if (old_static < new_static) -+ resched_task(p); -+ } -+out_unlock: -+ task_rq_unlock(rq, p, &flags); -+} -+EXPORT_SYMBOL(set_user_nice); -+ -+/* -+ * can_nice - check if a task can reduce its nice value -+ * @p: task -+ * @nice: nice value -+ */ -+int can_nice(const struct task_struct *p, const int nice) -+{ -+ /* Convert nice value [19,-20] to rlimit style value [1,40] */ -+ int nice_rlim = nice_to_rlimit(nice); -+ -+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || -+ capable(CAP_SYS_NICE)); -+} -+ -+#ifdef __ARCH_WANT_SYS_NICE -+ -+/* -+ * sys_nice - change the priority of the current process. -+ * @increment: priority increment -+ * -+ * sys_setpriority is a more generic, but much slower function that -+ * does similar things. -+ */ -+SYSCALL_DEFINE1(nice, int, increment) -+{ -+ long nice, retval; -+ -+ /* -+ * Setpriority might change our priority at the same moment. -+ * We don't have to worry. Conceptually one call occurs first -+ * and we have a single winner. 
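can_nice() above leans on nice_to_rlimit(), which maps the nice range [19, -20] onto the RLIMIT_NICE range [1, 40] so the permission test is a single comparison against the rlimit. A userspace replica; the conversion formula (MAX_NICE - nice + 1) is assumed from mainline:

    #include <stdio.h>

    #define MAX_NICE 19

    static long nice_to_rlimit(long nice)
    {
        return MAX_NICE - nice + 1;   /* 19 -> 1, 0 -> 20, -20 -> 40 */
    }

    int main(void)
    {
        long rlim_cur = 25;           /* pretend RLIMIT_NICE soft limit */
        long want = -5;               /* desired nice value */

        printf("nice %ld needs rlimit %ld: %s\n",
               want, nice_to_rlimit(want),
               nice_to_rlimit(want) <= rlim_cur ? "allowed" : "denied");
        return 0;
    }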
-+ */ -+ -+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); -+ nice = task_nice(current) + increment; -+ -+ nice = clamp_val(nice, MIN_NICE, MAX_NICE); -+ if (increment < 0 && !can_nice(current, nice)) -+ return -EPERM; -+ -+ retval = security_task_setnice(current, nice); -+ if (retval) -+ return retval; -+ -+ set_user_nice(current, nice); -+ return 0; -+} -+ -+#endif -+ -+/** -+ * task_prio - return the priority value of a given task. -+ * @p: the task in question. -+ * -+ * Return: The priority value as seen by users in /proc. -+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes -+ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). -+ */ -+int task_prio(const struct task_struct *p) -+{ -+ int delta, prio = p->prio - MAX_RT_PRIO; -+ -+ /* rt tasks and iso tasks */ -+ if (prio <= 0) -+ goto out; -+ -+ /* Convert to ms to avoid overflows */ -+ delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); -+ if (unlikely(delta < 0)) -+ delta = 0; -+ delta = delta * 40 / ms_longest_deadline_diff(); -+ if (delta <= 80) -+ prio += delta; -+ if (idleprio_task(p)) -+ prio += 40; -+out: -+ return prio; -+} -+ -+/** -+ * idle_cpu - is a given CPU idle currently? -+ * @cpu: the processor in question. -+ * -+ * Return: 1 if the CPU is currently idle. 0 otherwise. -+ */ -+int idle_cpu(int cpu) -+{ -+ return cpu_curr(cpu) == cpu_rq(cpu)->idle; -+} -+ -+/** -+ * idle_task - return the idle task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * Return: The idle task for the CPU @cpu. -+ */ -+struct task_struct *idle_task(int cpu) -+{ -+ return cpu_rq(cpu)->idle; -+} -+ -+/** -+ * find_process_by_pid - find a process with a matching PID value. -+ * @pid: the pid in question. -+ * -+ * The task of @pid, if found. %NULL otherwise. -+ */ -+static inline struct task_struct *find_process_by_pid(pid_t pid) -+{ -+ return pid ? find_task_by_vpid(pid) : current; -+} -+ -+/* Actually do priority change: must hold rq lock. */ -+static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, -+ int prio, bool keep_boost) -+{ -+ int oldrtprio, oldprio; -+ -+ p->policy = policy; -+ oldrtprio = p->rt_priority; -+ p->rt_priority = prio; -+ p->normal_prio = normal_prio(p); -+ oldprio = p->prio; -+ /* -+ * Keep a potential priority boosting if called from -+ * sched_setscheduler(). 
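task_prio() above spreads a normal task across a 40-point band according to how far away its deadline is: the delta is clamped at zero, scaled by 40 / ms_longest_deadline_diff(), and applied only while it stays within the band. A toy computation with ms_longest_deadline_diff() stubbed to an assumed 644 ms; the real value depends on rr_interval and the prio ratios, so treat the stub as illustrative only:

    #include <stdio.h>

    /* Stub: assumed round figure, not the patch's computed value. */
    static int ms_longest_deadline_diff(void) { return 644; }

    int main(void)
    {
        int delta_ms = 322;     /* deadline is half the maximum away */
        int step = delta_ms * 40 / ms_longest_deadline_diff();

        printf("adds +%d within the 40-point band\n", step);   /* +20 */
        return 0;
    }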
-+ */ -+ p->prio = normal_prio(p); -+ if (keep_boost) -+ p->prio = rt_effective_prio(p, p->prio); -+ -+ if (task_running(rq, p)) { -+ set_rq_task(rq, p); -+ resched_task(p); -+ } else if (task_queued(p)) { -+ dequeue_task(rq, p, DEQUEUE_SAVE); -+ enqueue_task(rq, p, ENQUEUE_RESTORE); -+ if (p->prio < oldprio || p->rt_priority > oldrtprio) -+ try_preempt(p, rq); -+ } -+} -+ -+/* -+ * Check the target process has a UID that matches the current process's -+ */ -+static bool check_same_owner(struct task_struct *p) -+{ -+ const struct cred *cred = current_cred(), *pcred; -+ bool match; -+ -+ rcu_read_lock(); -+ pcred = __task_cred(p); -+ match = (uid_eq(cred->euid, pcred->euid) || -+ uid_eq(cred->euid, pcred->uid)); -+ rcu_read_unlock(); -+ return match; -+} -+ -+static int -+__sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param, bool user, bool pi) -+{ -+ struct sched_param zero_param = { .sched_priority = 0 }; -+ unsigned long flags, rlim_rtprio = 0; -+ int retval, oldpolicy = -1; -+ int reset_on_fork; -+ struct rq *rq; -+ -+ /* The pi code expects interrupts enabled */ -+ BUG_ON(pi && in_interrupt()); -+ -+ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { -+ unsigned long lflags; -+ -+ if (!lock_task_sighand(p, &lflags)) -+ return -ESRCH; -+ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); -+ unlock_task_sighand(p, &lflags); -+ if (rlim_rtprio) -+ goto recheck; -+ /* -+ * If the caller requested an RT policy without having the -+ * necessary rights, we downgrade the policy to SCHED_ISO. -+ * We also set the parameter to zero to pass the checks. -+ */ -+ policy = SCHED_ISO; -+ param = &zero_param; -+ } -+recheck: -+ /* Double check policy once rq lock held */ -+ if (policy < 0) { -+ reset_on_fork = p->sched_reset_on_fork; -+ policy = oldpolicy = p->policy; -+ } else { -+ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); -+ policy &= ~SCHED_RESET_ON_FORK; -+ -+ if (!SCHED_RANGE(policy)) -+ return -EINVAL; -+ } -+ -+ /* -+ * Valid priorities for SCHED_FIFO and SCHED_RR are -+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and -+ * SCHED_BATCH is 0. 
-+ */ -+ if (param->sched_priority < 0 || -+ (p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) || -+ (!p->mm && param->sched_priority > MAX_RT_PRIO - 1)) -+ return -EINVAL; -+ if (is_rt_policy(policy) != (param->sched_priority != 0)) -+ return -EINVAL; -+ -+ /* -+ * Allow unprivileged RT tasks to decrease priority: -+ */ -+ if (user && !capable(CAP_SYS_NICE)) { -+ if (is_rt_policy(policy)) { -+ unsigned long rlim_rtprio = -+ task_rlimit(p, RLIMIT_RTPRIO); -+ -+ /* Can't set/change the rt policy */ -+ if (policy != p->policy && !rlim_rtprio) -+ return -EPERM; -+ -+ /* Can't increase priority */ -+ if (param->sched_priority > p->rt_priority && -+ param->sched_priority > rlim_rtprio) -+ return -EPERM; -+ } else { -+ switch (p->policy) { -+ /* -+ * Can only downgrade policies but not back to -+ * SCHED_NORMAL -+ */ -+ case SCHED_ISO: -+ if (policy == SCHED_ISO) -+ goto out; -+ if (policy != SCHED_NORMAL) -+ return -EPERM; -+ break; -+ case SCHED_BATCH: -+ if (policy == SCHED_BATCH) -+ goto out; -+ if (policy != SCHED_IDLEPRIO) -+ return -EPERM; -+ break; -+ case SCHED_IDLEPRIO: -+ if (policy == SCHED_IDLEPRIO) -+ goto out; -+ return -EPERM; -+ default: -+ break; -+ } -+ } -+ -+ /* Can't change other user's priorities */ -+ if (!check_same_owner(p)) -+ return -EPERM; -+ -+ /* Normal users shall not reset the sched_reset_on_fork flag: */ -+ if (p->sched_reset_on_fork && !reset_on_fork) -+ return -EPERM; -+ } -+ -+ if (user) { -+ retval = security_task_setscheduler(p); -+ if (retval) -+ return retval; -+ } -+ -+ /* -+ * Make sure no PI-waiters arrive (or leave) while we are -+ * changing the priority of the task: -+ * -+ * To be able to change p->policy safely, the runqueue lock must be -+ * held. -+ */ -+ rq = task_rq_lock(p, &flags); -+ update_rq_clock(rq); -+ -+ /* -+ * Changing the policy of the stop threads its a very bad idea: -+ */ -+ if (p == rq->stop) { -+ task_rq_unlock(rq, p, &flags); -+ return -EINVAL; -+ } -+ -+ /* -+ * If not changing anything there's no need to proceed further: -+ */ -+ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || -+ param->sched_priority == p->rt_priority))) { -+ task_rq_unlock(rq, p, &flags); -+ return 0; -+ } -+ -+ /* Re-check policy now with rq lock held */ -+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { -+ policy = oldpolicy = -1; -+ task_rq_unlock(rq, p, &flags); -+ goto recheck; -+ } -+ p->sched_reset_on_fork = reset_on_fork; -+ -+ __setscheduler(p, rq, policy, param->sched_priority, pi); -+ task_rq_unlock(rq, p, &flags); -+ -+ if (pi) -+ rt_mutex_adjust_pi(p); -+out: -+ return 0; -+} -+ -+/** -+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * -+ * NOTE that the task may be already dead. -+ */ -+int sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return __sched_setscheduler(p, policy, param, true, true); -+} -+ -+EXPORT_SYMBOL_GPL(sched_setscheduler); -+ -+int sched_setattr(struct task_struct *p, const struct sched_attr *attr) -+{ -+ const struct sched_param param = { .sched_priority = attr->sched_priority }; -+ int policy = attr->sched_policy; -+ -+ return __sched_setscheduler(p, policy, ¶m, true, true); -+} -+EXPORT_SYMBOL_GPL(sched_setattr); -+ -+/** -+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 
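The switch above gives unprivileged callers a one-way ladder: an RT request without RLIMIT_RTPRIO was already downgraded to SCHED_ISO earlier in the function, and from there a task can only fall through SCHED_NORMAL and SCHED_BATCH toward SCHED_IDLEPRIO, never climb back. A userspace sketch; the SCHED_ISO/SCHED_IDLEPRIO values are assumptions taken from the patched uapi headers, not glibc:

#include <sched.h>
#include <stdio.h>

#ifndef SCHED_ISO
#define SCHED_ISO	4	/* assumed value from the MuQSS headers */
#endif
#ifndef SCHED_IDLEPRIO
#define SCHED_IDLEPRIO	5	/* ditto; aliases SCHED_IDLE */
#endif

int main(void)
{
	struct sched_param sp = { .sched_priority = 0 };

	/* Allowed without CAP_SYS_NICE: background ourselves. */
	if (sched_setscheduler(0, SCHED_IDLEPRIO, &sp))
		perror("to SCHED_IDLEPRIO");

	/* EPERM: SCHED_IDLEPRIO is terminal for unprivileged tasks. */
	if (sched_setscheduler(0, SCHED_OTHER, &sp))
		perror("back to SCHED_OTHER");
	return 0;
}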
-+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Just like sched_setscheduler, only don't bother checking if the -+ * current context has permission. For example, this is needed in -+ * stop_machine(): we create temporary high priority worker threads, -+ * but our caller might not have that capability. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+int sched_setscheduler_nocheck(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return __sched_setscheduler(p, policy, param, false, true); -+} -+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); -+ -+static int -+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -+{ -+ struct sched_param lparam; -+ struct task_struct *p; -+ int retval; -+ -+ if (!param || pid < 0) -+ return -EINVAL; -+ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) -+ return -EFAULT; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (p != NULL) -+ retval = sched_setscheduler(p, policy, &lparam); -+ rcu_read_unlock(); -+ -+ return retval; -+} -+ -+/* -+ * Mimics kernel/events/core.c perf_copy_attr(). -+ */ -+static int sched_copy_attr(struct sched_attr __user *uattr, -+ struct sched_attr *attr) -+{ -+ u32 size; -+ int ret; -+ -+ if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) -+ return -EFAULT; -+ -+ /* Zero the full structure, so that a short copy will be nice: */ -+ memset(attr, 0, sizeof(*attr)); -+ -+ ret = get_user(size, &uattr->size); -+ if (ret) -+ return ret; -+ -+ /* Bail out on silly large: */ -+ if (size > PAGE_SIZE) -+ goto err_size; -+ -+ /* ABI compatibility quirk: */ -+ if (!size) -+ size = SCHED_ATTR_SIZE_VER0; -+ -+ if (size < SCHED_ATTR_SIZE_VER0) -+ goto err_size; -+ -+ /* -+ * If we're handed a bigger struct than we know of, -+ * ensure all the unknown bits are 0 - i.e. new -+ * user-space does not rely on any kernel feature -+ * extensions we dont know about yet. -+ */ -+ if (size > sizeof(*attr)) { -+ unsigned char __user *addr; -+ unsigned char __user *end; -+ unsigned char val; -+ -+ addr = (void __user *)uattr + sizeof(*attr); -+ end = (void __user *)uattr + size; -+ -+ for (; addr < end; addr++) { -+ ret = get_user(val, addr); -+ if (ret) -+ return ret; -+ if (val) -+ goto err_size; -+ } -+ size = sizeof(*attr); -+ } -+ -+ ret = copy_from_user(attr, uattr, size); -+ if (ret) -+ return -EFAULT; -+ -+ /* -+ * XXX: Do we want to be lenient like existing syscalls; or do we want -+ * to be strict and return an error on out-of-bounds values? -+ */ -+ attr->sched_nice = clamp(attr->sched_nice, -20, 19); -+ -+ /* sched/core.c uses zero here but we already know ret is zero */ -+ return 0; -+ -+err_size: -+ put_user(sizeof(*attr), &uattr->size); -+ return -E2BIG; -+} -+ -+/* -+ * sched_setparam() passes in -1 for its policy, to let the functions -+ * it calls know not to change it. -+ */ -+#define SETPARAM_POLICY -1 -+ -+/** -+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority -+ * @pid: the pid in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. 
-+ */
-+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
-+{
-+	if (policy < 0)
-+		return -EINVAL;
-+
-+	return do_sched_setscheduler(pid, policy, param);
-+}
-+
-+/**
-+ * sys_sched_setparam - set/change the RT priority of a thread
-+ * @pid: the pid in question.
-+ * @param: structure containing the new RT priority.
-+ *
-+ * Return: 0 on success. An error code otherwise.
-+ */
-+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
-+{
-+	return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
-+}
-+
-+/**
-+ * sys_sched_setattr - same as above, but with extended sched_attr
-+ * @pid: the pid in question.
-+ * @uattr: structure containing the extended parameters.
-+ * @flags: for future extension.
-+ */
-+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
-+			       unsigned int, flags)
-+{
-+	struct sched_attr attr;
-+	struct task_struct *p;
-+	int retval;
-+
-+	if (!uattr || pid < 0 || flags)
-+		return -EINVAL;
-+
-+	retval = sched_copy_attr(uattr, &attr);
-+	if (retval)
-+		return retval;
-+
-+	if ((int)attr.sched_policy < 0)
-+		return -EINVAL;
-+
-+	rcu_read_lock();
-+	retval = -ESRCH;
-+	p = find_process_by_pid(pid);
-+	if (p != NULL)
-+		retval = sched_setattr(p, &attr);
-+	rcu_read_unlock();
-+
-+	return retval;
-+}
-+
-+/**
-+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
-+ * @pid: the pid in question.
-+ *
-+ * Return: On success, the policy of the thread. Otherwise, a negative error
-+ * code.
-+ */
-+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
-+{
-+	struct task_struct *p;
-+	int retval = -EINVAL;
-+
-+	if (pid < 0)
-+		goto out_nounlock;
-+
-+	retval = -ESRCH;
-+	rcu_read_lock();
-+	p = find_process_by_pid(pid);
-+	if (p) {
-+		retval = security_task_getscheduler(p);
-+		if (!retval)
-+			retval = p->policy;
-+	}
-+	rcu_read_unlock();
-+
-+out_nounlock:
-+	return retval;
-+}
-+
-+/**
-+ * sys_sched_getparam - get the RT priority of a thread
-+ * @pid: the pid in question.
-+ * @param: structure containing the RT priority.
-+ *
-+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
-+ * code.
-+ */
-+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
-+{
-+	struct sched_param lp = { .sched_priority = 0 };
-+	struct task_struct *p;
-+	int retval = -EINVAL;
-+
-+	if (!param || pid < 0)
-+		goto out_nounlock;
-+
-+	rcu_read_lock();
-+	p = find_process_by_pid(pid);
-+	retval = -ESRCH;
-+	if (!p)
-+		goto out_unlock;
-+
-+	retval = security_task_getscheduler(p);
-+	if (retval)
-+		goto out_unlock;
-+
-+	if (has_rt_policy(p))
-+		lp.sched_priority = p->rt_priority;
-+	rcu_read_unlock();
-+
-+	/*
-+	 * This one might sleep, we cannot do it with a spinlock held ...
-+	 */
-+	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
-+
-+out_nounlock:
-+	return retval;
-+
-+out_unlock:
-+	rcu_read_unlock();
-+	return retval;
-+}
-+
-+static int sched_read_attr(struct sched_attr __user *uattr,
-+			   struct sched_attr *attr,
-+			   unsigned int usize)
-+{
-+	int ret;
-+
-+	if (!access_ok(VERIFY_WRITE, uattr, usize))
-+		return -EFAULT;
-+
-+	/*
-+	 * If we're handed a smaller struct than we know of,
-+	 * ensure all the unknown bits are 0 - i.e. old
-+	 * user-space does not get incomplete information.
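glibc never grew wrappers for these two syscalls, so userspace of this era drives sched_setattr()/sched_getattr() through syscall(2). A minimal sketch against the SCHED_ATTR_SIZE_VER0 layout (struct trimmed to the fields this scheduler consumes; names follow the uapi header):

#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

struct sched_attr {			/* SCHED_ATTR_SIZE_VER0 layout */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;		/* what MuQSS reads for normal tasks */
	uint32_t sched_priority;	/* what it reads for RT tasks */
	uint64_t sched_runtime, sched_deadline, sched_period; /* unused here */
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = 0;		/* SCHED_NORMAL */
	attr.sched_nice = 10;

	/*
	 * sched_copy_attr() tolerates both older (smaller) and newer
	 * (larger) callers: short structs are zero-extended, and any
	 * non-zero byte past what the kernel knows yields E2BIG.
	 */
	return syscall(SYS_sched_setattr, 0, &attr, 0);
}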
-+ */ -+ if (usize < sizeof(*attr)) { -+ unsigned char *addr; -+ unsigned char *end; -+ -+ addr = (void *)attr + usize; -+ end = (void *)attr + sizeof(*attr); -+ -+ for (; addr < end; addr++) { -+ if (*addr) -+ return -EFBIG; -+ } -+ -+ attr->size = usize; -+ } -+ -+ ret = copy_to_user(uattr, attr, attr->size); -+ if (ret) -+ return -EFAULT; -+ -+ /* sched/core.c uses zero here but we already know ret is zero */ -+ return ret; -+} -+ -+/** -+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. -+ * @size: sizeof(attr) for fwd/bwd comp. -+ * @flags: for future extension. -+ */ -+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, size, unsigned int, flags) -+{ -+ struct sched_attr attr = { -+ .size = sizeof(struct sched_attr), -+ }; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || size > PAGE_SIZE || -+ size < SCHED_ATTR_SIZE_VER0 || flags) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ attr.sched_policy = p->policy; -+ if (rt_task(p)) -+ attr.sched_priority = p->rt_priority; -+ else -+ attr.sched_nice = task_nice(p); -+ -+ rcu_read_unlock(); -+ -+ retval = sched_read_attr(uattr, &attr, size); -+ return retval; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -+{ -+ cpumask_var_t cpus_allowed, new_mask; -+ struct task_struct *p; -+ int retval; -+ -+ rcu_read_lock(); -+ -+ p = find_process_by_pid(pid); -+ if (!p) { -+ rcu_read_unlock(); -+ return -ESRCH; -+ } -+ -+ /* Prevent p going away */ -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (p->flags & PF_NO_SETAFFINITY) { -+ retval = -EINVAL; -+ goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ goto out_free_cpus_allowed; -+ } -+ retval = -EPERM; -+ if (!check_same_owner(p)) { -+ rcu_read_lock(); -+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { -+ rcu_read_unlock(); -+ goto out_unlock; -+ } -+ rcu_read_unlock(); -+ } -+ -+ retval = security_task_setscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ cpuset_cpus_allowed(p, cpus_allowed); -+ cpumask_and(new_mask, in_mask, cpus_allowed); -+again: -+ retval = __set_cpus_allowed_ptr(p, new_mask, true); -+ -+ if (!retval) { -+ cpuset_cpus_allowed(p, cpus_allowed); -+ if (!cpumask_subset(new_mask, cpus_allowed)) { -+ /* -+ * We must have raced with a concurrent cpuset -+ * update. Just reset the cpus_allowed to the -+ * cpuset's cpus_allowed -+ */ -+ cpumask_copy(new_mask, cpus_allowed); -+ goto again; -+ } -+ } -+out_unlock: -+ free_cpumask_var(new_mask); -+out_free_cpus_allowed: -+ free_cpumask_var(cpus_allowed); -+out_put_task: -+ put_task_struct(p); -+ return retval; -+} -+ -+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, -+ cpumask_t *new_mask) -+{ -+ if (len < cpumask_size()) -+ cpumask_clear(new_mask); -+ else if (len > cpumask_size()) -+ len = cpumask_size(); -+ -+ return copy_from_user(new_mask, user_mask_ptr, len) ? 
-EFAULT : 0; -+} -+ -+ -+/** -+ * sys_sched_setaffinity - set the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to the new CPU mask -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ cpumask_var_t new_mask; -+ int retval; -+ -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); -+ if (retval == 0) -+ retval = sched_setaffinity(pid, new_mask); -+ free_cpumask_var(new_mask); -+ return retval; -+} -+ -+long sched_getaffinity(pid_t pid, cpumask_t *mask) -+{ -+ struct task_struct *p; -+ unsigned long flags; -+ int retval; -+ -+ get_online_cpus(); -+ rcu_read_lock(); -+ -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+out_unlock: -+ rcu_read_unlock(); -+ put_online_cpus(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getaffinity - get the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to hold the current CPU mask -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ int ret; -+ cpumask_var_t mask; -+ -+ if ((len * BITS_PER_BYTE) < nr_cpu_ids) -+ return -EINVAL; -+ if (len & (sizeof(unsigned long)-1)) -+ return -EINVAL; -+ -+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ ret = sched_getaffinity(pid, mask); -+ if (ret == 0) { -+ size_t retlen = min_t(size_t, len, cpumask_size()); -+ -+ if (copy_to_user(user_mask_ptr, mask, retlen)) -+ ret = -EFAULT; -+ else -+ ret = retlen; -+ } -+ free_cpumask_var(mask); -+ -+ return ret; -+} -+ -+/** -+ * sys_sched_yield - yield the current processor to other threads. -+ * -+ * This function yields the current CPU to other tasks. It does this by -+ * scheduling away the current task. If it still has the earliest deadline -+ * it will be scheduled again as the next task. -+ * -+ * Return: 0. -+ */ -+SYSCALL_DEFINE0(sched_yield) -+{ -+ struct rq *rq; -+ -+ if (!sched_yield_type) -+ goto out; -+ -+ local_irq_disable(); -+ rq = this_rq(); -+ rq_lock(rq); -+ -+ if (sched_yield_type > 1) -+ time_slice_expired(current, rq); -+ schedstat_inc(rq->yld_count); -+ -+ /* -+ * Since we are going to call schedule() anyway, there's -+ * no need to preempt or enable interrupts: -+ */ -+ preempt_disable(); -+ rq_unlock(rq); -+ sched_preempt_enable_no_resched(); -+ -+ schedule(); -+out: -+ return 0; -+} -+ -+#ifndef CONFIG_PREEMPT -+int __sched _cond_resched(void) -+{ -+ if (should_resched(0)) { -+ preempt_schedule_common(); -+ return 1; -+ } -+ return 0; -+} -+EXPORT_SYMBOL(_cond_resched); -+#endif -+ -+/* -+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, -+ * call schedule, and on return reacquire the lock. -+ * -+ * This works OK both with and without CONFIG_PREEMPT. 
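One wrinkle worth noting in sys_sched_getaffinity() above: on success the raw syscall returns retlen, the number of mask bytes copied, and it is the glibc wrapper that flattens this to 0. Typical userspace usage through the wrappers:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);
	CPU_SET(2, &set);		/* pin to CPUs 0 and 2 */
	if (sched_setaffinity(0, sizeof(set), &set))
		perror("sched_setaffinity");

	if (sched_getaffinity(0, sizeof(set), &set) == 0)
		printf("%d CPUs in affinity mask\n", CPU_COUNT(&set));
	return 0;
}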
We do strange low-level -+ * operations here to prevent schedule() from being called twice (once via -+ * spin_unlock(), once by hand). -+ */ -+int __cond_resched_lock(spinlock_t *lock) -+{ -+ int resched = should_resched(PREEMPT_LOCK_OFFSET); -+ int ret = 0; -+ -+ lockdep_assert_held(lock); -+ -+ if (spin_needbreak(lock) || resched) { -+ spin_unlock(lock); -+ if (resched) -+ preempt_schedule_common(); -+ else -+ cpu_relax(); -+ ret = 1; -+ spin_lock(lock); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(__cond_resched_lock); -+ -+int __sched __cond_resched_softirq(void) -+{ -+ BUG_ON(!in_softirq()); -+ -+ if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { -+ local_bh_enable(); -+ preempt_schedule_common(); -+ local_bh_disable(); -+ return 1; -+ } -+ return 0; -+} -+EXPORT_SYMBOL(__cond_resched_softirq); -+ -+/** -+ * yield - yield the current processor to other threads. -+ * -+ * Do not ever use this function, there's a 99% chance you're doing it wrong. -+ * -+ * The scheduler is at all times free to pick the calling task as the most -+ * eligible task to run, if removing the yield() call from your code breaks -+ * it, its already broken. -+ * -+ * Typical broken usage is: -+ * -+ * while (!event) -+ * yield(); -+ * -+ * where one assumes that yield() will let 'the other' process run that will -+ * make event true. If the current task is a SCHED_FIFO task that will never -+ * happen. Never use yield() as a progress guarantee!! -+ * -+ * If you want to use yield() to wait for something, use wait_event(). -+ * If you want to use yield() to be 'nice' for others, use cond_resched(). -+ * If you still want to use yield(), do not! -+ */ -+void __sched yield(void) -+{ -+ set_current_state(TASK_RUNNING); -+ sys_sched_yield(); -+} -+EXPORT_SYMBOL(yield); -+ -+/** -+ * yield_to - yield the current processor to another thread in -+ * your thread group, or accelerate that thread toward the -+ * processor it's on. -+ * @p: target task -+ * @preempt: whether task preemption is allowed or not -+ * -+ * It's the caller's job to ensure that the target task struct -+ * can't go away on us before we can do any checks. -+ * -+ * Return: -+ * true (>0) if we indeed boosted the target task. -+ * false (0) if we failed to boost the target. -+ * -ESRCH if there's no task to yield to. -+ */ -+int __sched yield_to(struct task_struct *p, bool preempt) -+{ -+ struct task_struct *rq_p; -+ struct rq *rq, *p_rq; -+ unsigned long flags; -+ int yielded = 0; -+ -+ local_irq_save(flags); -+ rq = this_rq(); -+ -+again: -+ p_rq = task_rq(p); -+ /* -+ * If we're the only runnable task on the rq and target rq also -+ * has only one task, there's absolutely no point in yielding. 
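The "typical broken usage" called out in the yield() comment above pairs naturally with its fix. A sketch using a waitqueue; the names are hypothetical, the APIs are the stock linux/wait.h ones:

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(ev_wq);	/* hypothetical example */
static int ev_flag;

static void consumer_broken(void)
{
	while (!ev_flag)	/* spins forever if we outrank the producer */
		yield();
}

static void consumer_fixed(void)
{
	wait_event(ev_wq, ev_flag);	/* sleep until actually woken */
}

static void producer(void)
{
	ev_flag = 1;
	wake_up(&ev_wq);
}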
-+ */ -+ if (task_running(p_rq, p) || p->state) { -+ yielded = -ESRCH; -+ goto out_irq; -+ } -+ -+ double_rq_lock(rq, p_rq); -+ if (unlikely(task_rq(p) != p_rq)) { -+ double_rq_unlock(rq, p_rq); -+ goto again; -+ } -+ -+ yielded = 1; -+ schedstat_inc(rq->yld_count); -+ rq_p = rq->curr; -+ if (p->deadline > rq_p->deadline) -+ p->deadline = rq_p->deadline; -+ p->time_slice += rq_p->time_slice; -+ if (p->time_slice > timeslice()) -+ p->time_slice = timeslice(); -+ time_slice_expired(rq_p, rq); -+ if (preempt && rq != p_rq) -+ resched_task(p_rq->curr); -+ double_rq_unlock(rq, p_rq); -+out_irq: -+ local_irq_restore(flags); -+ -+ if (yielded > 0) -+ schedule(); -+ return yielded; -+} -+EXPORT_SYMBOL_GPL(yield_to); -+ -+int io_schedule_prepare(void) -+{ -+ int old_iowait = current->in_iowait; -+ -+ current->in_iowait = 1; -+ blk_schedule_flush_plug(current); -+ -+ return old_iowait; -+} -+ -+void io_schedule_finish(int token) -+{ -+ current->in_iowait = token; -+} -+ -+/* -+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so -+ * that process accounting knows that this is a task in IO wait state. -+ * -+ * But don't do that if it is a deliberate, throttling IO wait (this task -+ * has set its backing_dev_info: the queue against which it should throttle) -+ */ -+ -+long __sched io_schedule_timeout(long timeout) -+{ -+ int token; -+ long ret; -+ -+ token = io_schedule_prepare(); -+ ret = schedule_timeout(timeout); -+ io_schedule_finish(token); -+ -+ return ret; -+} -+EXPORT_SYMBOL(io_schedule_timeout); -+ -+void io_schedule(void) -+{ -+ int token; -+ -+ token = io_schedule_prepare(); -+ schedule(); -+ io_schedule_finish(token); -+} -+EXPORT_SYMBOL(io_schedule); -+ -+/** -+ * sys_sched_get_priority_max - return maximum RT priority. -+ * @policy: scheduling class. -+ * -+ * Return: On success, this syscall returns the maximum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = MAX_USER_RT_PRIO-1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_ISO: -+ case SCHED_IDLEPRIO: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+/** -+ * sys_sched_get_priority_min - return minimum RT priority. -+ * @policy: scheduling class. -+ * -+ * Return: On success, this syscall returns the minimum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = 1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_ISO: -+ case SCHED_IDLEPRIO: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+/** -+ * sys_sched_rr_get_interval - return the default timeslice of a process. -+ * @pid: pid of the process. -+ * @interval: userspace pointer to the timeslice value. -+ * -+ * -+ * Return: On success, 0 and the timeslice is in @interval. Otherwise, -+ * an error code. 
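The io_schedule_prepare()/io_schedule_finish() pair above exists so that other blocking primitives can account their sleeps as iowait; mainline's mutex_lock_io() is built exactly this way. A sketch of the pattern (the wrapper name is hypothetical):

static void my_lock_io(struct mutex *lock)
{
	int token;

	token = io_schedule_prepare();	/* set in_iowait, flush block plug */
	mutex_lock(lock);		/* any sleep here counts as iowait */
	io_schedule_finish(token);	/* restore previous in_iowait state */
}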
-+ */ -+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, -+ struct timespec __user *, interval) -+{ -+ struct task_struct *p; -+ unsigned int time_slice; -+ unsigned long flags; -+ struct timespec t; -+ struct rq *rq; -+ int retval; -+ -+ if (pid < 0) -+ return -EINVAL; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ rq = task_rq_lock(p, &flags); -+ time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); -+ task_rq_unlock(rq, p, &flags); -+ -+ rcu_read_unlock(); -+ t = ns_to_timespec(time_slice); -+ retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; -+ return retval; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+void sched_show_task(struct task_struct *p) -+{ -+ unsigned long free = 0; -+ int ppid; -+ -+ if (!try_get_task_stack(p)) -+ return; -+ -+ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); -+ -+ if (p->state == TASK_RUNNING) -+ printk(KERN_CONT " running task "); -+#ifdef CONFIG_DEBUG_STACK_USAGE -+ free = stack_not_used(p); -+#endif -+ ppid = 0; -+ rcu_read_lock(); -+ if (pid_alive(p)) -+ ppid = task_pid_nr(rcu_dereference(p->real_parent)); -+ rcu_read_unlock(); -+ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, -+ task_pid_nr(p), ppid, -+ (unsigned long)task_thread_info(p)->flags); -+ -+ print_worker_info(KERN_INFO, p); -+ show_stack(p, NULL); -+ put_task_stack(p); -+} -+ -+static inline bool -+state_filter_match(unsigned long state_filter, struct task_struct *p) -+{ -+ /* no filter, everything matches */ -+ if (!state_filter) -+ return true; -+ -+ /* filter, but doesn't match */ -+ if (!(p->state & state_filter)) -+ return false; -+ -+ /* -+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows -+ * TASK_KILLABLE). -+ */ -+ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) -+ return false; -+ -+ return true; -+} -+ -+void show_state_filter(unsigned long state_filter) -+{ -+ struct task_struct *g, *p; -+ -+#if BITS_PER_LONG == 32 -+ printk(KERN_INFO -+ " task PC stack pid father\n"); -+#else -+ printk(KERN_INFO -+ " task PC stack pid father\n"); -+#endif -+ rcu_read_lock(); -+ for_each_process_thread(g, p) { -+ /* -+ * reset the NMI-timeout, listing all files on a slow -+ * console might take a lot of time: -+ * Also, reset softlockup watchdogs on all CPUs, because -+ * another CPU might be blocked waiting for us to process -+ * an IPI. -+ */ -+ touch_nmi_watchdog(); -+ touch_all_softlockup_watchdogs(); -+ if (state_filter_match(state_filter, p)) -+ sched_show_task(p); -+ } -+ -+ rcu_read_unlock(); -+ /* -+ * Only show locks if all tasks are dumped: -+ */ -+ if (!state_filter) -+ debug_show_all_locks(); -+} -+ -+void dump_cpu_task(int cpu) -+{ -+ pr_info("Task dump for CPU %d:\n", cpu); -+ sched_show_task(cpu_curr(cpu)); -+} -+ -+#ifdef CONFIG_SMP -+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ cpumask_copy(&p->cpus_allowed, new_mask); -+ p->nr_cpus_allowed = cpumask_weight(new_mask); -+} -+ -+void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ struct rq *rq = task_rq(p); -+ -+ lockdep_assert_held(&p->pi_lock); -+ -+ cpumask_copy(&p->cpus_allowed, new_mask); -+ -+ if (task_queued(p)) { -+ /* -+ * Because __kthread_bind() calls this on blocked tasks without -+ * holding rq->lock. 
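The __kthread_bind() case mentioned above is the common consumer of this path: it fixes a kthread's affinity while the thread is still blocked, before it has ever run. A typical creation sequence (worker name hypothetical):

#include <linux/kthread.h>

static struct task_struct *start_pinned_worker(int (*fn)(void *), int cpu)
{
	struct task_struct *t;

	t = kthread_create(fn, NULL, "my_worker/%d", cpu);
	if (!IS_ERR(t)) {
		kthread_bind(t, cpu);	/* reaches __do_set_cpus_allowed()
					 * on a blocked, never-run task */
		wake_up_process(t);
	}
	return t;
}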
-+ */ -+ lockdep_assert_held(&rq->lock); -+ } -+} -+ -+/* -+ * Calling do_set_cpus_allowed from outside the scheduler code should not be -+ * called on a running or queued task. We should be holding pi_lock. -+ */ -+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ __do_set_cpus_allowed(p, new_mask); -+ if (needs_other_cpu(p, task_cpu(p))) { -+ struct rq *rq; -+ -+ rq = __task_rq_lock(p); -+ set_task_cpu(p, valid_task_cpu(p)); -+ resched_task(p); -+ __task_rq_unlock(rq); -+ } -+} -+#endif -+ -+/** -+ * init_idle - set up an idle thread for a given CPU -+ * @idle: task in question -+ * @cpu: cpu the idle task belongs to -+ * -+ * NOTE: this function does not set the idle thread's NEED_RESCHED -+ * flag, to make booting more robust. -+ */ -+void init_idle(struct task_struct *idle, int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&idle->pi_lock, flags); -+ raw_spin_lock(&rq->lock); -+ idle->last_ran = rq->niffies; -+ time_slice_expired(idle, rq); -+ idle->state = TASK_RUNNING; -+ /* Setting prio to illegal value shouldn't matter when never queued */ -+ idle->prio = PRIO_LIMIT; -+ -+ kasan_unpoison_task_stack(idle); -+ -+#ifdef CONFIG_SMP -+ /* -+ * It's possible that init_idle() gets called multiple times on a task, -+ * in that case do_set_cpus_allowed() will not do the right thing. -+ * -+ * And since this is boot we can forgo the serialisation. -+ */ -+ set_cpus_allowed_common(idle, cpumask_of(cpu)); -+#ifdef CONFIG_SMT_NICE -+ idle->smt_bias = 0; -+#endif -+#endif -+ set_rq_task(rq, idle); -+ -+ /* Silence PROVE_RCU */ -+ rcu_read_lock(); -+ set_task_cpu(idle, cpu); -+ rcu_read_unlock(); -+ -+ rq->curr = rq->idle = idle; -+ idle->on_rq = TASK_ON_RQ_QUEUED; -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); -+ -+ /* Set the preempt count _outside_ the spinlocks! */ -+ init_idle_preempt_count(idle, cpu); -+ -+ ftrace_graph_init_idle_task(idle, cpu); -+ vtime_init_idle(idle, cpu); -+#ifdef CONFIG_SMP -+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -+#endif -+} -+ -+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, -+ const struct cpumask __maybe_unused *trial) -+{ -+ return 1; -+} -+ -+int task_can_attach(struct task_struct *p, -+ const struct cpumask *cs_cpus_allowed) -+{ -+ int ret = 0; -+ -+ /* -+ * Kthreads which disallow setaffinity shouldn't be moved -+ * to a new cpuset; we don't want to change their CPU -+ * affinity and isolating such threads by their set of -+ * allowed nodes is unnecessary. Thus, cpusets are not -+ * applicable for such threads. This prevents checking for -+ * success of set_cpus_allowed_ptr() on all attached tasks -+ * before cpus_allowed may be changed. -+ */ -+ if (p->flags & PF_NO_SETAFFINITY) -+ ret = -EINVAL; -+ -+ return ret; -+} -+ -+void resched_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ rq_lock_irqsave(rq, &flags); -+ resched_task(cpu_curr(cpu)); -+ rq_unlock_irqrestore(rq, &flags); -+} -+ -+#ifdef CONFIG_SMP -+#ifdef CONFIG_NO_HZ_COMMON -+void nohz_balance_enter_idle(int cpu) -+{ -+} -+ -+void select_nohz_load_balancer(int stop_tick) -+{ -+} -+ -+void set_cpu_sd_state_idle(void) {} -+ -+/* -+ * In the semi idle case, use the nearest busy CPU for migrating timers -+ * from an idle CPU. This is good for power-savings. 
-+ *
-+ * We don't do similar optimization for completely idle system, as
-+ * selecting an idle CPU will add more delays to the timers than intended
-+ * (as that CPU's timer base may not be up to date wrt jiffies etc).
-+ */
-+int get_nohz_timer_target(void)
-+{
-+	int i, cpu = smp_processor_id();
-+	struct sched_domain *sd;
-+
-+	if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
-+		return cpu;
-+
-+	rcu_read_lock();
-+	for_each_domain(cpu, sd) {
-+		for_each_cpu(i, sched_domain_span(sd)) {
-+			if (cpu == i)
-+				continue;
-+
-+			if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
-+				cpu = i;
-+				goto unlock;
-+			}
-+		}
-+	}
-+
-+	if (!is_housekeeping_cpu(cpu))
-+		cpu = housekeeping_any_cpu();
-+unlock:
-+	rcu_read_unlock();
-+	return cpu;
-+}
-+
-+/*
-+ * When add_timer_on() enqueues a timer into the timer wheel of an
-+ * idle CPU then this timer might expire before the next timer event
-+ * which is scheduled to wake up that CPU. In case of a completely
-+ * idle system the next event might even be infinite time into the
-+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
-+ * leaves the inner idle loop so the newly added timer is taken into
-+ * account when the CPU goes back to idle and evaluates the timer
-+ * wheel for the next timer event.
-+ */
-+void wake_up_idle_cpu(int cpu)
-+{
-+	if (cpu == smp_processor_id())
-+		return;
-+
-+	if (set_nr_and_not_polling(cpu_rq(cpu)->idle))
-+		smp_sched_reschedule(cpu);
-+	else
-+		trace_sched_wake_idle_without_ipi(cpu);
-+}
-+
-+static bool wake_up_full_nohz_cpu(int cpu)
-+{
-+	/*
-+	 * We just need the target to call irq_exit() and re-evaluate
-+	 * the next tick. The nohz full kick at least implies that.
-+	 * If needed we can still optimize that later with an
-+	 * empty IRQ.
-+	 */
-+	if (cpu_is_offline(cpu))
-+		return true;	/* Don't try to wake offline CPUs. */
-+	if (tick_nohz_full_cpu(cpu)) {
-+		if (cpu != smp_processor_id() ||
-+		    tick_nohz_tick_stopped())
-+			tick_nohz_full_kick_cpu(cpu);
-+		return true;
-+	}
-+
-+	return false;
-+}
-+
-+/*
-+ * Wake up the specified CPU. If the CPU is going offline, it is the
-+ * caller's responsibility to deal with the lost wakeup, for example,
-+ * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
-+ */
-+void wake_up_nohz_cpu(int cpu)
-+{
-+	if (!wake_up_full_nohz_cpu(cpu))
-+		wake_up_idle_cpu(cpu);
-+}
-+#endif /* CONFIG_NO_HZ_COMMON */
-+
-+/*
-+ * Change a given task's CPU affinity. Migrate the thread to a
-+ * proper CPU and schedule it away if the CPU it's executing on
-+ * is removed from the allowed bitmask.
-+ *
-+ * NOTE: the caller must have a valid reference to the task, the
-+ * task must not exit() & deallocate itself prematurely. The
-+ * call is not atomic; no spinlocks may be held.
-+ */
-+static int __set_cpus_allowed_ptr(struct task_struct *p,
-+				  const struct cpumask *new_mask, bool check)
-+{
-+	const struct cpumask *cpu_valid_mask = cpu_active_mask;
-+	bool queued = false, running_wrong = false, kthread;
-+	struct cpumask old_mask;
-+	unsigned long flags;
-+	struct rq *rq;
-+	int ret = 0;
-+
-+	rq = task_rq_lock(p, &flags);
-+	update_rq_clock(rq);
-+
-+	kthread = !!(p->flags & PF_KTHREAD);
-+	if (kthread) {
-+		/*
-+		 * Kernel threads are allowed on online && !active CPUs
-+		 */
-+		cpu_valid_mask = cpu_online_mask;
-+	}
-+
-+	/*
-+	 * Must re-check here, to close a race against __kthread_bind(),
-+	 * sched_setaffinity() is not guaranteed to observe the flag.
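wake_up_nohz_cpu() above is what keeps cross-CPU timers honest, and the canonical trigger is add_timer_on(). A sketch (timer names hypothetical; timer_setup() is the 4.14-era API):

#include <linux/timer.h>

static struct timer_list my_timer;	/* hypothetical */

static void my_timer_fn(struct timer_list *t)
{
	pr_info("fired on CPU %d\n", smp_processor_id());
}

static void arm_on_cpu(int cpu)
{
	timer_setup(&my_timer, my_timer_fn, 0);
	my_timer.expires = jiffies + HZ;
	add_timer_on(&my_timer, cpu);	/* if @cpu idles with its tick
					 * stopped, the wake_up_*_cpu()
					 * paths above must kick it or
					 * the timer fires late */
}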
-+ */ -+ if (check && (p->flags & PF_NO_SETAFFINITY)) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ cpumask_copy(&old_mask, &p->cpus_allowed); -+ if (cpumask_equal(&old_mask, new_mask)) -+ goto out; -+ -+ if (!cpumask_intersects(new_mask, cpu_valid_mask)) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ queued = task_queued(p); -+ __do_set_cpus_allowed(p, new_mask); -+ -+ if (kthread) { -+ /* -+ * For kernel threads that do indeed end up on online && -+ * !active we want to ensure they are strict per-CPU threads. -+ */ -+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && -+ !cpumask_intersects(new_mask, cpu_active_mask) && -+ p->nr_cpus_allowed != 1); -+ } -+ -+ /* Can the task run on the task's current CPU? If so, we're done */ -+ if (cpumask_test_cpu(task_cpu(p), new_mask)) -+ goto out; -+ -+ if (task_running(rq, p)) { -+ /* Task is running on the wrong cpu now, reschedule it. */ -+ if (rq == this_rq()) { -+ set_tsk_need_resched(p); -+ running_wrong = true; -+ } else -+ resched_task(p); -+ } else { -+ int cpu = cpumask_any_and(cpu_valid_mask, new_mask); -+ -+ if (queued) { -+ /* -+ * Switch runqueue locks after dequeueing the task -+ * here while still holding the pi_lock to be holding -+ * the correct lock for enqueueing. -+ */ -+ dequeue_task(rq, p, 0); -+ rq_unlock(rq); -+ -+ rq = cpu_rq(cpu); -+ rq_lock(rq); -+ } -+ set_task_cpu(p, cpu); -+ if (queued) -+ enqueue_task(rq, p, 0); -+ } -+ if (queued) -+ try_preempt(p, rq); -+ if (running_wrong) -+ preempt_disable(); -+out: -+ task_rq_unlock(rq, p, &flags); -+ -+ if (running_wrong) { -+ __schedule(true); -+ preempt_enable(); -+ } -+ -+ return ret; -+} -+ -+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ return __set_cpus_allowed_ptr(p, new_mask, false); -+} -+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -+ -+#ifdef CONFIG_HOTPLUG_CPU -+/* -+ * Run through task list and find tasks affined to the dead cpu, then remove -+ * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold -+ * cpu 0 and src_cpu's runqueue locks. -+ */ -+static void bind_zero(int src_cpu) -+{ -+ struct task_struct *p, *t; -+ struct rq *rq0; -+ int bound = 0; -+ -+ if (src_cpu == 0) -+ return; -+ -+ rq0 = cpu_rq(0); -+ -+ do_each_thread(t, p) { -+ if (cpumask_test_cpu(src_cpu, &p->cpus_allowed)) { -+ bool local = (task_cpu(p) == src_cpu); -+ struct rq *rq = task_rq(p); -+ -+ /* task_running is the cpu stopper thread */ -+ if (local && task_running(rq, p)) -+ continue; -+ atomic_clear_cpu(src_cpu, &p->cpus_allowed); -+ atomic_set_cpu(0, &p->cpus_allowed); -+ p->zerobound = true; -+ bound++; -+ if (local) { -+ bool queued = task_queued(p); -+ -+ if (queued) -+ dequeue_task(rq, p, 0); -+ set_task_cpu(p, 0); -+ if (queued) -+ enqueue_task(rq0, p, 0); -+ } -+ } -+ } while_each_thread(t, p); -+ -+ if (bound) { -+ printk(KERN_INFO "Removed affinity for %d processes to cpu %d\n", -+ bound, src_cpu); -+ } -+} -+ -+/* Find processes with the zerobound flag and reenable their affinity for the -+ * CPU coming alive. 
*/ -+static void unbind_zero(int src_cpu) -+{ -+ int unbound = 0, zerobound = 0; -+ struct task_struct *p, *t; -+ -+ if (src_cpu == 0) -+ return; -+ -+ do_each_thread(t, p) { -+ if (!p->mm) -+ p->zerobound = false; -+ if (p->zerobound) { -+ unbound++; -+ cpumask_set_cpu(src_cpu, &p->cpus_allowed); -+ /* Once every CPU affinity has been re-enabled, remove -+ * the zerobound flag */ -+ if (cpumask_subset(cpu_possible_mask, &p->cpus_allowed)) { -+ p->zerobound = false; -+ zerobound++; -+ } -+ } -+ } while_each_thread(t, p); -+ -+ if (unbound) { -+ printk(KERN_INFO "Added affinity for %d processes to cpu %d\n", -+ unbound, src_cpu); -+ } -+ if (zerobound) { -+ printk(KERN_INFO "Released forced binding to cpu0 for %d processes\n", -+ zerobound); -+ } -+} -+ -+/* -+ * Ensure that the idle task is using init_mm right before its cpu goes -+ * offline. -+ */ -+void idle_task_exit(void) -+{ -+ struct mm_struct *mm = current->active_mm; -+ -+ BUG_ON(cpu_online(smp_processor_id())); -+ -+ if (mm != &init_mm) { -+ switch_mm(mm, &init_mm, current); -+ finish_arch_post_lock_switch(); -+ } -+ mmdrop(mm); -+} -+#else /* CONFIG_HOTPLUG_CPU */ -+static void unbind_zero(int src_cpu) {} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+void sched_set_stop_task(int cpu, struct task_struct *stop) -+{ -+ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; -+ struct sched_param start_param = { .sched_priority = 0 }; -+ struct task_struct *old_stop = cpu_rq(cpu)->stop; -+ -+ if (stop) { -+ /* -+ * Make it appear like a SCHED_FIFO task, its something -+ * userspace knows about and won't get confused about. -+ * -+ * Also, it will make PI more or less work without too -+ * much confusion -- but then, stop work should not -+ * rely on PI working anyway. -+ */ -+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); -+ } -+ -+ cpu_rq(cpu)->stop = stop; -+ -+ if (old_stop) { -+ /* -+ * Reset it back to a normal scheduling policy so that -+ * it can die in pieces. -+ */ -+ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); -+ } -+} -+ -+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -+ -+static struct ctl_table sd_ctl_dir[] = { -+ { -+ .procname = "sched_domain", -+ .mode = 0555, -+ }, -+ {} -+}; -+ -+static struct ctl_table sd_ctl_root[] = { -+ { -+ .procname = "kernel", -+ .mode = 0555, -+ .child = sd_ctl_dir, -+ }, -+ {} -+}; -+ -+static struct ctl_table *sd_alloc_ctl_entry(int n) -+{ -+ struct ctl_table *entry = -+ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); -+ -+ return entry; -+} -+ -+static void sd_free_ctl_entry(struct ctl_table **tablep) -+{ -+ struct ctl_table *entry; -+ -+ /* -+ * In the intermediate directories, both the child directory and -+ * procname are dynamically allocated and could fail but the mode -+ * will always be set. In the lowest directory the names are -+ * static strings and all have proc handlers. 
-+ */ -+ for (entry = *tablep; entry->mode; entry++) { -+ if (entry->child) -+ sd_free_ctl_entry(&entry->child); -+ if (entry->proc_handler == NULL) -+ kfree(entry->procname); -+ } -+ -+ kfree(*tablep); -+ *tablep = NULL; -+} -+ -+#define CPU_LOAD_IDX_MAX 5 -+static int min_load_idx = 0; -+static int max_load_idx = CPU_LOAD_IDX_MAX-1; -+ -+static void -+set_table_entry(struct ctl_table *entry, -+ const char *procname, void *data, int maxlen, -+ umode_t mode, proc_handler *proc_handler, -+ bool load_idx) -+{ -+ entry->procname = procname; -+ entry->data = data; -+ entry->maxlen = maxlen; -+ entry->mode = mode; -+ entry->proc_handler = proc_handler; -+ -+ if (load_idx) { -+ entry->extra1 = &min_load_idx; -+ entry->extra2 = &max_load_idx; -+ } -+} -+ -+static struct ctl_table * -+sd_alloc_ctl_domain_table(struct sched_domain *sd) -+{ -+ struct ctl_table *table = sd_alloc_ctl_entry(14); -+ -+ if (table == NULL) -+ return NULL; -+ -+ set_table_entry(&table[0], "min_interval", &sd->min_interval, -+ sizeof(long), 0644, proc_doulongvec_minmax, false); -+ set_table_entry(&table[1], "max_interval", &sd->max_interval, -+ sizeof(long), 0644, proc_doulongvec_minmax, false); -+ set_table_entry(&table[2], "busy_idx", &sd->busy_idx, -+ sizeof(int), 0644, proc_dointvec_minmax, true); -+ set_table_entry(&table[3], "idle_idx", &sd->idle_idx, -+ sizeof(int), 0644, proc_dointvec_minmax, true); -+ set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, -+ sizeof(int), 0644, proc_dointvec_minmax, true); -+ set_table_entry(&table[5], "wake_idx", &sd->wake_idx, -+ sizeof(int), 0644, proc_dointvec_minmax, true); -+ set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, -+ sizeof(int), 0644, proc_dointvec_minmax, true); -+ set_table_entry(&table[7], "busy_factor", &sd->busy_factor, -+ sizeof(int), 0644, proc_dointvec_minmax, false); -+ set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, -+ sizeof(int), 0644, proc_dointvec_minmax, false); -+ set_table_entry(&table[9], "cache_nice_tries", -+ &sd->cache_nice_tries, -+ sizeof(int), 0644, proc_dointvec_minmax, false); -+ set_table_entry(&table[10], "flags", &sd->flags, -+ sizeof(int), 0644, proc_dointvec_minmax, false); -+ set_table_entry(&table[11], "max_newidle_lb_cost", -+ &sd->max_newidle_lb_cost, -+ sizeof(long), 0644, proc_doulongvec_minmax, false); -+ set_table_entry(&table[12], "name", sd->name, -+ CORENAME_MAX_SIZE, 0444, proc_dostring, false); -+ /* &table[13] is terminator */ -+ -+ return table; -+} -+ -+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) -+{ -+ struct ctl_table *entry, *table; -+ struct sched_domain *sd; -+ int domain_num = 0, i; -+ char buf[32]; -+ -+ for_each_domain(cpu, sd) -+ domain_num++; -+ entry = table = sd_alloc_ctl_entry(domain_num + 1); -+ if (table == NULL) -+ return NULL; -+ -+ i = 0; -+ for_each_domain(cpu, sd) { -+ snprintf(buf, 32, "domain%d", i); -+ entry->procname = kstrdup(buf, GFP_KERNEL); -+ entry->mode = 0555; -+ entry->child = sd_alloc_ctl_domain_table(sd); -+ entry++; -+ i++; -+ } -+ return table; -+} -+ -+static cpumask_var_t sd_sysctl_cpus; -+static struct ctl_table_header *sd_sysctl_header; -+ -+void register_sched_domain_sysctl(void) -+{ -+ static struct ctl_table *cpu_entries; -+ static struct ctl_table **cpu_idx; -+ char buf[32]; -+ int i; -+ -+ if (!cpu_entries) { -+ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); -+ if (!cpu_entries) -+ return; -+ -+ WARN_ON(sd_ctl_dir[0].child); -+ sd_ctl_dir[0].child = cpu_entries; -+ } -+ -+ if (!cpu_idx) { -+ struct ctl_table *e = 
cpu_entries; -+ -+ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); -+ if (!cpu_idx) -+ return; -+ -+ /* deal with sparse possible map */ -+ for_each_possible_cpu(i) { -+ cpu_idx[i] = e; -+ e++; -+ } -+ } -+ -+ if (!cpumask_available(sd_sysctl_cpus)) { -+ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) -+ return; -+ -+ /* init to possible to not have holes in @cpu_entries */ -+ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); -+ } -+ -+ for_each_cpu(i, sd_sysctl_cpus) { -+ struct ctl_table *e = cpu_idx[i]; -+ -+ if (e->child) -+ sd_free_ctl_entry(&e->child); -+ -+ if (!e->procname) { -+ snprintf(buf, 32, "cpu%d", i); -+ e->procname = kstrdup(buf, GFP_KERNEL); -+ } -+ e->mode = 0555; -+ e->child = sd_alloc_ctl_cpu_table(i); -+ -+ __cpumask_clear_cpu(i, sd_sysctl_cpus); -+ } -+ -+ WARN_ON(sd_sysctl_header); -+ sd_sysctl_header = register_sysctl_table(sd_ctl_root); -+} -+ -+void dirty_sched_domain_sysctl(int cpu) -+{ -+ if (cpumask_available(sd_sysctl_cpus)) -+ __cpumask_set_cpu(cpu, sd_sysctl_cpus); -+} -+ -+/* may be called multiple times per register */ -+void unregister_sched_domain_sysctl(void) -+{ -+ unregister_sysctl_table(sd_sysctl_header); -+ sd_sysctl_header = NULL; -+} -+#endif /* CONFIG_SYSCTL */ -+ -+void set_rq_online(struct rq *rq) -+{ -+ if (!rq->online) { -+ cpumask_set_cpu(cpu_of(rq), rq->rd->online); -+ rq->online = true; -+ } -+} -+ -+void set_rq_offline(struct rq *rq) -+{ -+ if (rq->online) { -+ int cpu = cpu_of(rq); -+ -+ cpumask_clear_cpu(cpu, rq->rd->online); -+ rq->online = false; -+ clear_cpuidle_map(cpu); -+ } -+} -+ -+/* -+ * used to mark begin/end of suspend/resume: -+ */ -+static int num_cpus_frozen; -+ -+/* -+ * Update cpusets according to cpu_active mask. If cpusets are -+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper -+ * around partition_sched_domains(). -+ * -+ * If we come here as part of a suspend/resume, don't touch cpusets because we -+ * want to restore it back to its original state upon resume anyway. -+ */ -+static void cpuset_cpu_active(void) -+{ -+ if (cpuhp_tasks_frozen) { -+ /* -+ * num_cpus_frozen tracks how many CPUs are involved in suspend -+ * resume sequence. As long as this is not the last online -+ * operation in the resume sequence, just build a single sched -+ * domain, ignoring cpusets. -+ */ -+ partition_sched_domains(1, NULL, NULL); -+ if (--num_cpus_frozen) -+ return; -+ /* -+ * This is the last CPU online operation. So fall through and -+ * restore the original sched domains by considering the -+ * cpuset configurations. -+ */ -+ cpuset_force_rebuild(); -+ } -+ -+ cpuset_update_active_cpus(); -+} -+ -+static int cpuset_cpu_inactive(unsigned int cpu) -+{ -+ if (!cpuhp_tasks_frozen) { -+ cpuset_update_active_cpus(); -+ } else { -+ num_cpus_frozen++; -+ partition_sched_domains(1, NULL, NULL); -+ } -+ return 0; -+} -+ -+int sched_cpu_activate(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ set_cpu_active(cpu, true); -+ -+ if (sched_smp_initialized) { -+ sched_domains_numa_masks_set(cpu); -+ cpuset_cpu_active(); -+ } -+ -+ /* -+ * Put the rq online, if not already. This happens: -+ * -+ * 1) In the early boot process, because we build the real domains -+ * after all CPUs have been brought up. -+ * -+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the -+ * domains. 
-+ */ -+ rq_lock_irqsave(rq, &flags); -+ if (rq->rd) { -+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); -+ set_rq_online(rq); -+ } -+ unbind_zero(cpu); -+ rq_unlock_irqrestore(rq, &flags); -+ -+ return 0; -+} -+ -+int sched_cpu_deactivate(unsigned int cpu) -+{ -+ int ret; -+ -+ set_cpu_active(cpu, false); -+ /* -+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU -+ * users of this state to go away such that all new such users will -+ * observe it. -+ * -+ * Do sync before park smpboot threads to take care the rcu boost case. -+ */ -+ synchronize_rcu_mult(call_rcu, call_rcu_sched); -+ -+ if (!sched_smp_initialized) -+ return 0; -+ -+ ret = cpuset_cpu_inactive(cpu); -+ if (ret) { -+ set_cpu_active(cpu, true); -+ return ret; -+ } -+ sched_domains_numa_masks_clear(cpu); -+ return 0; -+} -+ -+int sched_cpu_starting(unsigned int __maybe_unused cpu) -+{ -+ return 0; -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+int sched_cpu_dying(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ local_irq_save(flags); -+ double_rq_lock(rq, cpu_rq(0)); -+ if (rq->rd) { -+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); -+ set_rq_offline(rq); -+ } -+ bind_zero(cpu); -+ double_rq_unlock(rq, cpu_rq(0)); -+ sched_start_tick(rq, cpu); -+ hrexpiry_clear(rq); -+ local_irq_restore(flags); -+ -+ return 0; -+} -+#endif -+ -+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) -+/* -+ * Cheaper version of the below functions in case support for SMT and MC is -+ * compiled in but CPUs have no siblings. -+ */ -+static bool sole_cpu_idle(struct rq *rq) -+{ -+ return rq_idle(rq); -+} -+#endif -+#ifdef CONFIG_SCHED_SMT -+static const cpumask_t *thread_cpumask(int cpu) -+{ -+ return topology_sibling_cpumask(cpu); -+} -+/* All this CPU's SMT siblings are idle */ -+static bool siblings_cpu_idle(struct rq *rq) -+{ -+ return cpumask_subset(&rq->thread_mask, &cpu_idle_map); -+} -+#endif -+#ifdef CONFIG_SCHED_MC -+static const cpumask_t *core_cpumask(int cpu) -+{ -+ return topology_core_cpumask(cpu); -+} -+/* All this CPU's shared cache siblings are idle */ -+static bool cache_cpu_idle(struct rq *rq) -+{ -+ return cpumask_subset(&rq->core_mask, &cpu_idle_map); -+} -+#endif -+ -+enum sched_domain_level { -+ SD_LV_NONE = 0, -+ SD_LV_SIBLING, -+ SD_LV_MC, -+ SD_LV_BOOK, -+ SD_LV_CPU, -+ SD_LV_NODE, -+ SD_LV_ALLNODES, -+ SD_LV_MAX -+}; -+ -+void __init sched_init_smp(void) -+{ -+ struct sched_domain *sd; -+ int cpu, other_cpu; -+#ifdef CONFIG_SCHED_SMT -+ bool smt_threads = false; -+#endif -+ cpumask_var_t non_isolated_cpus; -+ struct rq *rq; -+ -+ alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); -+ -+ sched_init_numa(); -+ -+ /* -+ * There's no userspace yet to cause hotplug operations; hence all the -+ * cpu masks are stable and all blatant races in the below code cannot -+ * happen. -+ */ -+ mutex_lock(&sched_domains_mutex); -+ sched_init_domains(cpu_active_mask); -+ cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); -+ if (cpumask_empty(non_isolated_cpus)) -+ cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); -+ mutex_unlock(&sched_domains_mutex); -+ -+ /* Move init over to a non-isolated CPU */ -+ if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) -+ BUG(); -+ free_cpumask_var(non_isolated_cpus); -+ -+ mutex_lock(&sched_domains_mutex); -+ local_irq_disable(); -+ lock_all_rqs(); -+ /* -+ * Set up the relative cache distance of each online cpu from each -+ * other in a simple array for quick lookup. 
Locality is determined -+ * by the closest sched_domain that CPUs are separated by. CPUs with -+ * shared cache in SMT and MC are treated as local. Separate CPUs -+ * (within the same package or physically) within the same node are -+ * treated as not local. CPUs not even in the same domain (different -+ * nodes) are treated as very distant. -+ */ -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ -+ /* First check if this cpu is in the same node */ -+ for_each_domain(cpu, sd) { -+ if (sd->level > SD_LV_MC) -+ continue; -+ /* Set locality to local node if not already found lower */ -+ for_each_cpu(other_cpu, sched_domain_span(sd)) { -+ if (rq->cpu_locality[other_cpu] > 3) -+ rq->cpu_locality[other_cpu] = 3; -+ } -+ } -+ -+ /* -+ * Each runqueue has its own function in case it doesn't have -+ * siblings of its own allowing mixed topologies. -+ */ -+#ifdef CONFIG_SCHED_MC -+ for_each_cpu(other_cpu, core_cpumask(cpu)) { -+ if (rq->cpu_locality[other_cpu] > 2) -+ rq->cpu_locality[other_cpu] = 2; -+ } -+ if (cpumask_weight(core_cpumask(cpu)) > 1) { -+ cpumask_copy(&rq->core_mask, core_cpumask(cpu)); -+ cpumask_clear_cpu(cpu, &rq->core_mask); -+ rq->cache_idle = cache_cpu_idle; -+ } -+#endif -+#ifdef CONFIG_SCHED_SMT -+ if (cpumask_weight(thread_cpumask(cpu)) > 1) { -+ cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); -+ cpumask_clear_cpu(cpu, &rq->thread_mask); -+ for_each_cpu(other_cpu, thread_cpumask(cpu)) -+ rq->cpu_locality[other_cpu] = 1; -+ rq->siblings_idle = siblings_cpu_idle; -+ smt_threads = true; -+ } -+#endif -+ } -+ for_each_possible_cpu(cpu) { -+ int total_cpus = 1, locality; -+ -+ rq = cpu_rq(cpu); -+ for (locality = 1; locality <= 4; locality++) { -+ for_each_possible_cpu(other_cpu) { -+ if (rq->cpu_locality[other_cpu] == locality) -+ rq->rq_order[total_cpus++] = cpu_rq(other_cpu); -+ } -+ } -+ } -+#ifdef CONFIG_SMT_NICE -+ if (smt_threads) { -+ check_siblings = &check_smt_siblings; -+ wake_siblings = &wake_smt_siblings; -+ smt_schedule = &smt_should_schedule; -+ } -+#endif -+ unlock_all_rqs(); -+ local_irq_enable(); -+ mutex_unlock(&sched_domains_mutex); -+ -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ -+ for_each_online_cpu(other_cpu) { -+ if (other_cpu <= cpu) -+ continue; -+ printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); -+ } -+ } -+ -+ sched_smp_initialized = true; -+} -+#else -+void __init sched_init_smp(void) -+{ -+ sched_smp_initialized = true; -+} -+#endif /* CONFIG_SMP */ -+ -+int in_sched_functions(unsigned long addr) -+{ -+ return in_lock_functions(addr) || -+ (addr >= (unsigned long)__sched_text_start -+ && addr < (unsigned long)__sched_text_end); -+} -+ -+#ifdef CONFIG_CGROUP_SCHED -+/* task group related information */ -+struct task_group { -+ struct cgroup_subsys_state css; -+ -+ struct rcu_head rcu; -+ struct list_head list; -+ -+ struct task_group *parent; -+ struct list_head siblings; -+ struct list_head children; -+}; -+ -+/* -+ * Default task group. -+ * Every task in system belongs to this group at bootup. 
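The locality values assigned in the loops above form a per-runqueue distance matrix. Worked out for a hypothetical single-socket, two-core, SMT-2 box (CPUs 0-3; sibling pairs 0/1 and 2/3):

/*
 *   rq->cpu_locality[]   to: 0  1  2  3
 *          from CPU 0:       0  1  2  2
 *          from CPU 1:       1  0  2  2
 *          from CPU 2:       2  2  0  1
 *          from CPU 3:       2  2  1  0
 *
 * 0 = self, 1 = SMT sibling, 2 = shared-cache core, 3 = same node,
 * 4 = different node (absent on this box). rq_order[] then enumerates
 * runqueues nearest-first, which is the order task placement walks them.
 */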
-+ */ -+struct task_group root_task_group; -+LIST_HEAD(task_groups); -+ -+/* Cacheline aligned slab cache for task_group */ -+static struct kmem_cache *task_group_cache __read_mostly; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+void __init sched_init(void) -+{ -+#ifdef CONFIG_SMP -+ int cpu_ids; -+#endif -+ int i; -+ struct rq *rq; -+ -+ sched_clock_init(); -+ -+ wait_bit_init(); -+ -+ prio_ratios[0] = 128; -+ for (i = 1 ; i < NICE_WIDTH ; i++) -+ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; -+ -+ skiplist_node_init(&init_task.node); -+ -+#ifdef CONFIG_SMP -+ init_defrootdomain(); -+ cpumask_clear(&cpu_idle_map); -+#else -+ uprq = &per_cpu(runqueues, 0); -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+ task_group_cache = KMEM_CACHE(task_group, 0); -+ -+ list_add(&root_task_group.list, &task_groups); -+ INIT_LIST_HEAD(&root_task_group.children); -+ INIT_LIST_HEAD(&root_task_group.siblings); -+#endif /* CONFIG_CGROUP_SCHED */ -+ for_each_possible_cpu(i) { -+ rq = cpu_rq(i); -+ skiplist_init(&rq->node); -+ rq->sl = new_skiplist(&rq->node); -+ raw_spin_lock_init(&rq->lock); -+ rq->nr_running = 0; -+ rq->nr_uninterruptible = 0; -+ rq->nr_switches = 0; -+ rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0; -+ rq->last_jiffy = jiffies; -+ rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns = -+ rq->iowait_ns = rq->idle_ns = 0; -+ rq->dither = 0; -+ set_rq_task(rq, &init_task); -+ rq->iso_ticks = 0; -+ rq->iso_refractory = false; -+#ifdef CONFIG_SMP -+ rq->sd = NULL; -+ rq->rd = NULL; -+ rq->online = false; -+ rq->cpu = i; -+ rq_attach_root(rq, &def_root_domain); -+#endif -+ init_rq_hrexpiry(rq); -+ atomic_set(&rq->nr_iowait, 0); -+ } -+ -+#ifdef CONFIG_SMP -+ cpu_ids = i; -+ /* -+ * Set the base locality for cpu cache distance calculation to -+ * "distant" (3). Make sure the distance from a CPU to itself is 0. -+ */ -+ for_each_possible_cpu(i) { -+ int j; -+ -+ rq = cpu_rq(i); -+#ifdef CONFIG_SCHED_SMT -+ rq->siblings_idle = sole_cpu_idle; -+#endif -+#ifdef CONFIG_SCHED_MC -+ rq->cache_idle = sole_cpu_idle; -+#endif -+ rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC); -+ for_each_possible_cpu(j) { -+ if (i == j) -+ rq->cpu_locality[j] = 0; -+ else -+ rq->cpu_locality[j] = 4; -+ } -+ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); -+ rq->rq_order[0] = rq; -+ for (j = 1; j < cpu_ids; j++) -+ rq->rq_order[j] = cpu_rq(j); -+ } -+#endif -+ -+ /* -+ * The boot idle thread does lazy MMU switching as well: -+ */ -+ mmgrab(&init_mm); -+ enter_lazy_tlb(&init_mm, current); -+ -+ /* -+ * Make us the idle thread. Technically, schedule() should not be -+ * called from this thread, however somewhere below it might be, -+ * but because we are the idle thread, we just pick up running again -+ * when this runqueue becomes "idle". 
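The prio_ratios[] seeding at the top of sched_init() above compounds 10% per nice level with integer truncation. A tiny userspace reproduction shows the resulting spread:

#include <stdio.h>

int main(void)
{
	int ratios[40] = { 128 };	/* NICE_WIDTH entries */

	for (int i = 1; i < 40; i++)	/* same arithmetic as sched_init() */
		ratios[i] = ratios[i - 1] * 11 / 10;
	printf("nice -20: %d, nice 0: %d, nice +19: %d\n",
	       ratios[0], ratios[20], ratios[39]);
	/* prints 128, 836, 5089: roughly a 40:1 deadline spread overall */
	return 0;
}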
-+ */ -+ init_idle(current, smp_processor_id()); -+ -+#ifdef CONFIG_SMP -+ /* May be allocated at isolcpus cmdline parse time */ -+ if (cpu_isolated_map == NULL) -+ zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); -+ idle_thread_set_boot_cpu(); -+#endif /* SMP */ -+ -+ init_schedstats(); -+} -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+static inline int preempt_count_equals(int preempt_offset) -+{ -+ int nested = preempt_count() + rcu_preempt_depth(); -+ -+ return (nested == preempt_offset); -+} -+ -+void __might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* -+ * Blocking primitives will set (and therefore destroy) current->state, -+ * since we will exit with TASK_RUNNING make sure we enter with it, -+ * otherwise we will destroy state. -+ */ -+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, -+ "do not call blocking ops when !TASK_RUNNING; " -+ "state=%lx set at [<%p>] %pS\n", -+ current->state, -+ (void *)current->task_state_change, -+ (void *)current->task_state_change); -+ -+ ___might_sleep(file, line, preempt_offset); -+} -+EXPORT_SYMBOL(__might_sleep); -+ -+void ___might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* Ratelimiting timestamp: */ -+ static unsigned long prev_jiffy; -+ -+ unsigned long preempt_disable_ip; -+ -+ /* WARN_ON_ONCE() by default, no rate limit required: */ -+ rcu_sleep_check(); -+ -+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && -+ !is_idle_task(current)) || -+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || -+ oops_in_progress) -+ return; -+ -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ /* Save this before calling printk(), since that will clobber it: */ -+ preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ printk(KERN_ERR -+ "BUG: sleeping function called from invalid context at %s:%d\n", -+ file, line); -+ printk(KERN_ERR -+ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), -+ current->pid, current->comm); -+ -+ if (task_stack_end_corrupted(current)) -+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); -+ -+ debug_show_held_locks(current); -+ if (irqs_disabled()) -+ print_irqtrace_events(current); -+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) -+ && !preempt_count_equals(preempt_offset)) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(preempt_disable_ip); -+ pr_cont("\n"); -+ } -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL(___might_sleep); -+#endif -+ -+#ifdef CONFIG_MAGIC_SYSRQ -+static inline void normalise_rt_tasks(void) -+{ -+ struct task_struct *g, *p; -+ unsigned long flags; -+ struct rq *rq; -+ -+ read_lock(&tasklist_lock); -+ for_each_process_thread(g, p) { -+ /* -+ * Only normalize user tasks: -+ */ -+ if (p->flags & PF_KTHREAD) -+ continue; -+ -+ if (!rt_task(p) && !iso_task(p)) -+ continue; -+ -+ rq = task_rq_lock(p, &flags); -+ __setscheduler(p, rq, SCHED_NORMAL, 0, false); -+ task_rq_unlock(rq, p, &flags); -+ } -+ read_unlock(&tasklist_lock); -+} -+ -+void normalize_rt_tasks(void) -+{ -+ normalise_rt_tasks(); -+} -+#endif /* CONFIG_MAGIC_SYSRQ */ -+ -+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) -+/* -+ * These functions are only useful for the IA64 MCA handling, or kdb. -+ * -+ * They can only be called when the whole system has been -+ * stopped - every CPU needs to be quiescent, and no scheduling -+ * activity can take place. 
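The ___might_sleep() path above rate-limits its reports to one per second with the prev_jiffy/HZ comparison. A small userspace model of just that throttle (jiffies are simulated and HZ of 100 is assumed):

/* Models the one-report-per-HZ-jiffies throttle in ___might_sleep();
 * userspace stand-in, jiffies are simulated. */
#include <stdio.h>

#define HZ 100
/* wrap-safe "a is before b", as in the kernel's time_before() */
#define time_before(a, b) ((long)((a) - (b)) < 0)

static unsigned long jiffies;

static void report_once_per_second(void)
{
	static unsigned long prev_jiffy;	/* ratelimiting timestamp */

	if (prev_jiffy && time_before(jiffies, prev_jiffy + HZ))
		return;				/* suppressed */
	prev_jiffy = jiffies;
	printf("report emitted at jiffy %lu\n", jiffies);
}

int main(void)
{
	for (jiffies = 1; jiffies < 350; jiffies += 30)
		report_once_per_second();
	return 0;
}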
Using them for anything else would -+ * be a serious bug, and as a result, they aren't even visible -+ * under any other configuration. -+ */ -+ -+/** -+ * curr_task - return the current task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! -+ * -+ * Return: The current task for @cpu. -+ */ -+struct task_struct *curr_task(int cpu) -+{ -+ return cpu_curr(cpu); -+} -+ -+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ -+ -+#ifdef CONFIG_IA64 -+/** -+ * set_curr_task - set the current task for a given CPU. -+ * @cpu: the processor in question. -+ * @p: the task pointer to set. -+ * -+ * Description: This function must only be used when non-maskable interrupts -+ * are serviced on a separate stack. It allows the architecture to switch the -+ * notion of the current task on a CPU in a non-blocking manner. This function -+ * must be called with all CPU's synchronised, and interrupts disabled, the -+ * and caller must save the original value of the current task (see -+ * curr_task() above) and restore that value before reenabling interrupts and -+ * re-starting the system. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! -+ */ -+void ia64_set_curr_task(int cpu, struct task_struct *p) -+{ -+ cpu_curr(cpu) = p; -+} -+ -+#endif -+ -+void init_idle_bootup_task(struct task_struct *idle) -+{} -+ -+#ifdef CONFIG_SCHED_DEBUG -+__read_mostly bool sched_debug_enabled; -+ -+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, -+ struct seq_file *m) -+{} -+ -+void proc_sched_set_task(struct task_struct *p) -+{} -+#endif -+ -+#ifdef CONFIG_SMP -+#define SCHED_LOAD_SHIFT (10) -+#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) -+ -+unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) -+{ -+ return SCHED_LOAD_SCALE; -+} -+ -+unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) -+{ -+ unsigned long weight = cpumask_weight(sched_domain_span(sd)); -+ unsigned long smt_gain = sd->smt_gain; -+ -+ smt_gain /= weight; -+ -+ return smt_gain; -+} -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+static void sched_free_group(struct task_group *tg) -+{ -+ kmem_cache_free(task_group_cache, tg); -+} -+ -+/* allocate runqueue etc for a new task group */ -+struct task_group *sched_create_group(struct task_group *parent) -+{ -+ struct task_group *tg; -+ -+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); -+ if (!tg) -+ return ERR_PTR(-ENOMEM); -+ -+ return tg; -+} -+ -+void sched_online_group(struct task_group *tg, struct task_group *parent) -+{ -+} -+ -+/* rcu callback to free various structures associated with a task group */ -+static void sched_free_group_rcu(struct rcu_head *rhp) -+{ -+ /* Now it should be safe to free those cfs_rqs */ -+ sched_free_group(container_of(rhp, struct task_group, rcu)); -+} -+ -+void sched_destroy_group(struct task_group *tg) -+{ -+ /* Wait for possible concurrent references to cfs_rqs complete */ -+ call_rcu(&tg->rcu, sched_free_group_rcu); -+} -+ -+void sched_offline_group(struct task_group *tg) -+{ -+} -+ -+static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -+{ -+ return css ? 
container_of(css, struct task_group, css) : NULL; -+} -+ -+static struct cgroup_subsys_state * -+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) -+{ -+ struct task_group *parent = css_tg(parent_css); -+ struct task_group *tg; -+ -+ if (!parent) { -+ /* This is early initialization for the top cgroup */ -+ return &root_task_group.css; -+ } -+ -+ tg = sched_create_group(parent); -+ if (IS_ERR(tg)) -+ return ERR_PTR(-ENOMEM); -+ return &tg->css; -+} -+ -+/* Expose task group only after completing cgroup initialization */ -+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ struct task_group *parent = css_tg(css->parent); -+ -+ if (parent) -+ sched_online_group(tg, parent); -+ return 0; -+} -+ -+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ sched_offline_group(tg); -+} -+ -+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ /* -+ * Relies on the RCU grace period between css_released() and this. -+ */ -+ sched_free_group(tg); -+} -+ -+static void cpu_cgroup_fork(struct task_struct *task) -+{ -+} -+ -+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) -+{ -+ return 0; -+} -+ -+static void cpu_cgroup_attach(struct cgroup_taskset *tset) -+{ -+} -+ -+static struct cftype cpu_files[] = { -+ { } /* Terminate */ -+}; -+ -+struct cgroup_subsys cpu_cgrp_subsys = { -+ .css_alloc = cpu_cgroup_css_alloc, -+ .css_online = cpu_cgroup_css_online, -+ .css_released = cpu_cgroup_css_released, -+ .css_free = cpu_cgroup_css_free, -+ .fork = cpu_cgroup_fork, -+ .can_attach = cpu_cgroup_can_attach, -+ .attach = cpu_cgroup_attach, -+ .legacy_cftypes = cpu_files, -+ .early_init = true, -+}; -+#endif /* CONFIG_CGROUP_SCHED */ -diff -Nur a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h ---- a/kernel/sched/MuQSS.h 1970-01-01 01:00:00.000000000 +0100 -+++ b/kernel/sched/MuQSS.h 2019-01-05 20:22:51.099998516 +0000 -@@ -0,0 +1,725 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+ -+#ifdef CONFIG_PARAVIRT -+#include -+#endif -+ -+#include "cpuacct.h" -+ -+#ifndef MUQSS_SCHED_H -+#define MUQSS_SCHED_H -+ -+#ifdef CONFIG_SCHED_DEBUG -+# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) -+#else -+# define SCHED_WARN_ON(x) ((void)(x)) -+#endif -+ -+/* task_struct::on_rq states: */ -+#define TASK_ON_RQ_QUEUED 1 -+#define TASK_ON_RQ_MIGRATING 2 -+ -+struct rq; -+ -+#ifdef CONFIG_SMP -+ -+static inline bool sched_asym_prefer(int a, int b) -+{ -+ return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); -+} -+ -+/* -+ * We add the notion of a root-domain which will be used to define per-domain -+ * variables. Each exclusive cpuset essentially defines an island domain by -+ * fully partitioning the member cpus from any other cpuset. Whenever a new -+ * exclusive cpuset is created, we also create and attach a new root-domain -+ * object. 
-+ * -+ */ -+struct root_domain { -+ atomic_t refcount; -+ atomic_t rto_count; -+ struct rcu_head rcu; -+ cpumask_var_t span; -+ cpumask_var_t online; -+ -+ /* Indicate more than one runnable task for any CPU */ -+ bool overload; -+ -+ /* -+ * The bit corresponding to a CPU gets set here if such CPU has more -+ * than one runnable -deadline task (as it is below for RT tasks). -+ */ -+ cpumask_var_t dlo_mask; -+ atomic_t dlo_count; -+ /* Replace unused CFS structures with void */ -+ //struct dl_bw dl_bw; -+ //struct cpudl cpudl; -+ void *dl_bw; -+ void *cpudl; -+ -+ /* -+ * The "RT overload" flag: it gets set if a CPU has more than -+ * one runnable RT task. -+ */ -+ cpumask_var_t rto_mask; -+ //struct cpupri cpupri; -+ void *cpupri; -+ -+ unsigned long max_cpu_capacity; -+}; -+ -+extern struct root_domain def_root_domain; -+extern struct mutex sched_domains_mutex; -+ -+extern void init_defrootdomain(void); -+extern int sched_init_domains(const struct cpumask *cpu_map); -+extern void rq_attach_root(struct rq *rq, struct root_domain *rd); -+ -+static inline void cpupri_cleanup(void __maybe_unused *cpupri) -+{ -+} -+ -+static inline void cpudl_cleanup(void __maybe_unused *cpudl) -+{ -+} -+ -+static inline void init_dl_bw(void __maybe_unused *dl_bw) -+{ -+} -+ -+static inline int cpudl_init(void __maybe_unused *dl_bw) -+{ -+ return 0; -+} -+ -+static inline int cpupri_init(void __maybe_unused *cpupri) -+{ -+ return 0; -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * This is the main, per-CPU runqueue data structure. -+ * This data should only be modified by the local cpu. -+ */ -+struct rq { -+ raw_spinlock_t lock; -+ -+ struct task_struct *curr, *idle, *stop; -+ struct mm_struct *prev_mm; -+ -+ unsigned int nr_running; -+ /* -+ * This is part of a global counter where only the total sum -+ * over all CPUs matters. A task can increase this counter on -+ * one CPU and if it got migrated afterwards it may decrease -+ * it on another CPU. 
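The cpupri/cpudl/dl_bw members above are deliberately demoted to void pointers, with the matching init/cleanup helpers reduced to no-ops, so code shared with the stock scheduler keeps compiling without the CFS/deadline machinery. A sketch of that stub pattern in isolation (the structure is trimmed down, not the real root_domain):

/* Illustration of the void-pointer stub pattern used for the unused
 * CFS fields; trimmed-down structure for the sketch only. */
#include <stdio.h>

struct root_domain {
	void *cpupri;	/* placeholder only, never dereferenced */
};

static int cpupri_init(void *cpupri)
{
	(void)cpupri;
	return 0;	/* nothing to set up */
}

static void cpupri_cleanup(void *cpupri)
{
	(void)cpupri;	/* nothing to tear down */
}

int main(void)
{
	struct root_domain rd = { 0 };

	if (cpupri_init(rd.cpupri) == 0)
		printf("root_domain usable without CFS cpupri\n");
	cpupri_cleanup(rd.cpupri);
	return 0;
}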
Always updated under the runqueue lock: -+ */ -+ unsigned long nr_uninterruptible; -+ u64 nr_switches; -+ -+ /* Stored data about rq->curr to work outside rq lock */ -+ u64 rq_deadline; -+ int rq_prio; -+ -+ /* Best queued id for use outside lock */ -+ u64 best_key; -+ -+ unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */ -+ unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ -+ u64 niffies; /* Last time this RQ updated rq clock */ -+ u64 last_niffy; /* Last niffies as updated by local clock */ -+ u64 last_jiffy_niffies; /* Niffies @ last_jiffy */ -+ -+ u64 load_update; /* When we last updated load */ -+ unsigned long load_avg; /* Rolling load average */ -+#ifdef CONFIG_SMT_NICE -+ struct mm_struct *rq_mm; -+ int rq_smt_bias; /* Policy/nice level bias across smt siblings */ -+#endif -+ /* Accurate timekeeping data */ -+ unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns, -+ iowait_ns, idle_ns; -+ atomic_t nr_iowait; -+ -+ skiplist_node node; -+ skiplist *sl; -+#ifdef CONFIG_SMP -+ struct task_struct *preempt; /* Preempt triggered on this task */ -+ struct task_struct *preempting; /* Hint only, what task is preempting */ -+ -+ int cpu; /* cpu of this runqueue */ -+ bool online; -+ -+ struct root_domain *rd; -+ struct sched_domain *sd; -+ -+ unsigned long cpu_capacity_orig; -+ -+ int *cpu_locality; /* CPU relative cache distance */ -+ struct rq **rq_order; /* RQs ordered by relative cache distance */ -+ -+#ifdef CONFIG_SCHED_SMT -+ cpumask_t thread_mask; -+ bool (*siblings_idle)(struct rq *rq); -+ /* See if all smt siblings are idle */ -+#endif /* CONFIG_SCHED_SMT */ -+#ifdef CONFIG_SCHED_MC -+ cpumask_t core_mask; -+ bool (*cache_idle)(struct rq *rq); -+ /* See if all cache siblings are idle */ -+#endif /* CONFIG_SCHED_MC */ -+#endif /* CONFIG_SMP */ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ u64 prev_irq_time; -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+#ifdef CONFIG_PARAVIRT -+ u64 prev_steal_time; -+#endif /* CONFIG_PARAVIRT */ -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ u64 prev_steal_time_rq; -+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ -+ -+ u64 clock, old_clock, last_tick; -+ u64 clock_task; -+ int dither; -+ -+ int iso_ticks; -+ bool iso_refractory; -+ -+#ifdef CONFIG_HIGH_RES_TIMERS -+ struct hrtimer hrexpiry_timer; -+#endif -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+ /* latency stats */ -+ struct sched_info rq_sched_info; -+ unsigned long long rq_cpu_time; -+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
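The comment on nr_uninterruptible above deserves a concrete illustration: a task can raise the counter on one runqueue and, after being migrated while blocked, lower it on another, so an individual per-CPU value can even go negative and only the global sum is meaningful. A toy model with two simulated CPUs:

/* Toy model of the "only the sum matters" property of
 * nr_uninterruptible. */
#include <stdio.h>

int main(void)
{
	long nr_uninterruptible[2] = { 0, 0 };

	nr_uninterruptible[0]++;	/* task blocks on CPU 0 */
	/* ... task is migrated to CPU 1 while still blocked ... */
	nr_uninterruptible[1]--;	/* wakeup is accounted on CPU 1 */

	printf("cpu0=%ld cpu1=%ld sum=%ld\n",
	       nr_uninterruptible[0], nr_uninterruptible[1],
	       nr_uninterruptible[0] + nr_uninterruptible[1]);
	return 0;
}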
*/ -+ -+ /* sys_sched_yield() stats */ -+ unsigned int yld_count; -+ -+ /* schedule() stats */ -+ unsigned int sched_switch; -+ unsigned int sched_count; -+ unsigned int sched_goidle; -+ -+ /* try_to_wake_up() stats */ -+ unsigned int ttwu_count; -+ unsigned int ttwu_local; -+#endif /* CONFIG_SCHEDSTATS */ -+ -+#ifdef CONFIG_SMP -+ struct llist_head wake_list; -+#endif -+ -+#ifdef CONFIG_CPU_IDLE -+ /* Must be inspected within a rcu lock section */ -+ struct cpuidle_state *idle_state; -+#endif -+}; -+ -+#ifdef CONFIG_SMP -+struct rq *cpu_rq(int cpu); -+#endif -+ -+#ifndef CONFIG_SMP -+extern struct rq *uprq; -+#define cpu_rq(cpu) (uprq) -+#define this_rq() (uprq) -+#define raw_rq() (uprq) -+#define task_rq(p) (uprq) -+#define cpu_curr(cpu) ((uprq)->curr) -+#else /* CONFIG_SMP */ -+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+#define this_rq() this_cpu_ptr(&runqueues) -+#define raw_rq() raw_cpu_ptr(&runqueues) -+#define task_rq(p) cpu_rq(task_cpu(p)) -+#endif /* CONFIG_SMP */ -+ -+static inline int task_current(struct rq *rq, struct task_struct *p) -+{ -+ return rq->curr == p; -+} -+ -+static inline int task_running(struct rq *rq, struct task_struct *p) -+{ -+#ifdef CONFIG_SMP -+ return p->on_cpu; -+#else -+ return task_current(rq, p); -+#endif -+} -+ -+static inline void rq_lock(struct rq *rq) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock(&rq->lock); -+} -+ -+static inline void rq_unlock(struct rq *rq) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock(&rq->lock); -+} -+ -+static inline void rq_lock_irq(struct rq *rq) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock_irq(&rq->lock); -+} -+ -+static inline void rq_unlock_irq(struct rq *rq) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+static inline void rq_lock_irqsave(struct rq *rq, unsigned long *flags) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock_irqsave(&rq->lock, *flags); -+} -+ -+static inline void rq_unlock_irqrestore(struct rq *rq, unsigned long *flags) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irqrestore(&rq->lock, *flags); -+} -+ -+static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ while (42) { -+ raw_spin_lock_irqsave(&p->pi_lock, *flags); -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ if (likely(rq == task_rq(p))) -+ break; -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); -+ } -+ return rq; -+} -+ -+static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) -+ __releases(rq->lock) -+ __releases(p->pi_lock) -+{ -+ rq_unlock(rq); -+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); -+} -+ -+static inline struct rq *__task_rq_lock(struct task_struct *p) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ lockdep_assert_held(&p->pi_lock); -+ -+ while (42) { -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ if (likely(rq == task_rq(p))) -+ break; -+ raw_spin_unlock(&rq->lock); -+ } -+ return rq; -+} -+ -+static inline void __task_rq_unlock(struct rq *rq) -+{ -+ rq_unlock(rq); -+} -+ -+/* -+ * {de,en}queue flags: Most not used on MuQSS. -+ * -+ * DEQUEUE_SLEEP - task is no longer runnable -+ * ENQUEUE_WAKEUP - task just became runnable -+ * -+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks -+ * are in a known state which allows modification. Such pairs -+ * should preserve as much state as possible. 
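task_rq_lock() above uses the classic lock-then-recheck idiom: read task_rq(p), take that runqueue's lock, then verify the task was not migrated in between; otherwise drop the lock and retry (the kernel spells the infinite loop as while (42)). A single-threaded userspace model that forces one retry via a stubbed "migration":

/* Models the lock-then-recheck loop of task_rq_lock(); the fake
 * lock_rq() migrates the task once to force a retry. */
#include <stdio.h>

struct rq { int dummy; };

static struct rq rqs[2];
static struct rq *task_rq = &rqs[0];
static int migrations_pending = 1;

static void lock_rq(struct rq *rq)
{
	(void)rq;
	/* pretend another CPU migrated the task while we were
	 * spinning on this runqueue's lock */
	if (migrations_pending) {
		migrations_pending = 0;
		task_rq = &rqs[1];
	}
}

int main(void)
{
	struct rq *rq;
	int retries = 0;

	for (;;) {			/* kernel: while (42) */
		rq = task_rq;		/* read the task's runqueue */
		lock_rq(rq);		/* "acquire" its lock */
		if (rq == task_rq)
			break;		/* still the right rq: done */
		retries++;		/* migrated meanwhile: unlock, retry */
	}
	printf("locked runqueue %ld after %d retries\n",
	       (long)(rq - rqs), retries);
	return 0;
}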
-+ * -+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location -+ * in the runqueue. -+ * -+ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) -+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) -+ * ENQUEUE_MIGRATED - the task was migrated during wakeup -+ * -+ */ -+ -+#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ -+ -+#define ENQUEUE_RESTORE 0x02 -+ -+static inline u64 __rq_clock_broken(struct rq *rq) -+{ -+ return READ_ONCE(rq->clock); -+} -+ -+static inline u64 rq_clock(struct rq *rq) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ return rq->clock; -+} -+ -+static inline u64 rq_clock_task(struct rq *rq) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ return rq->clock_task; -+} -+ -+#ifdef CONFIG_NUMA -+enum numa_topology_type { -+ NUMA_DIRECT, -+ NUMA_GLUELESS_MESH, -+ NUMA_BACKPLANE, -+}; -+extern enum numa_topology_type sched_numa_topology_type; -+extern int sched_max_numa_distance; -+extern bool find_numa_distance(int distance); -+ -+extern void sched_init_numa(void); -+extern void sched_domains_numa_masks_set(unsigned int cpu); -+extern void sched_domains_numa_masks_clear(unsigned int cpu); -+#else -+static inline void sched_init_numa(void) { } -+static inline void sched_domains_numa_masks_set(unsigned int cpu) { } -+static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } -+#endif -+ -+extern struct mutex sched_domains_mutex; -+extern struct static_key_false sched_schedstats; -+ -+#define rcu_dereference_check_sched_domain(p) \ -+ rcu_dereference_check((p), \ -+ lockdep_is_held(&sched_domains_mutex)) -+ -+#ifdef CONFIG_SMP -+ -+/* -+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition. -+ * See detach_destroy_domains: synchronize_sched for details. -+ * -+ * The domain tree of any CPU may only be accessed from within -+ * preempt-disabled sections. -+ */ -+#define for_each_domain(cpu, __sd) \ -+ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ -+ __sd; __sd = __sd->parent) -+ -+#define for_each_lower_domain(sd) for (; sd; sd = sd->child) -+ -+/** -+ * highest_flag_domain - Return highest sched_domain containing flag. -+ * @cpu: The cpu whose highest level of sched domain is to -+ * be returned. -+ * @flag: The flag to check for the highest sched_domain -+ * for the given cpu. -+ * -+ * Returns the highest sched_domain of a cpu which contains the given flag. -+ */ -+static inline struct sched_domain *highest_flag_domain(int cpu, int flag) -+{ -+ struct sched_domain *sd, *hsd = NULL; -+ -+ for_each_domain(cpu, sd) { -+ if (!(sd->flags & flag)) -+ break; -+ hsd = sd; -+ } -+ -+ return hsd; -+} -+ -+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) -+{ -+ struct sched_domain *sd; -+ -+ for_each_domain(cpu, sd) { -+ if (sd->flags & flag) -+ break; -+ } -+ -+ return sd; -+} -+ -+DECLARE_PER_CPU(struct sched_domain *, sd_llc); -+DECLARE_PER_CPU(int, sd_llc_size); -+DECLARE_PER_CPU(int, sd_llc_id); -+DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -+DECLARE_PER_CPU(struct sched_domain *, sd_numa); -+DECLARE_PER_CPU(struct sched_domain *, sd_asym); -+ -+struct sched_group_capacity { -+ atomic_t ref; -+ /* -+ * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity -+ * for a single CPU. 
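highest_flag_domain() in this hunk walks the domain hierarchy from the bottom up, remembering the last level that still carries the flag and stopping at the first that does not. A sketch with a hypothetical SMT < MC < NUMA hierarchy (the flag value is made up for the example):

/* Models the highest_flag_domain() walk over a 3-level hierarchy. */
#include <stdio.h>

struct sched_domain {
	const char *name;
	int flags;
	struct sched_domain *parent;
};

#define SD_SHARE_PKG_RESOURCES 0x1	/* value assumed for this sketch */

int main(void)
{
	struct sched_domain numa = { "NUMA", 0, NULL };
	struct sched_domain mc   = { "MC",  SD_SHARE_PKG_RESOURCES, &numa };
	struct sched_domain smt  = { "SMT", SD_SHARE_PKG_RESOURCES, &mc };
	struct sched_domain *sd, *hsd = NULL;

	for (sd = &smt; sd; sd = sd->parent) {
		if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
			break;		/* first level without the flag */
		hsd = sd;
	}
	printf("highest domain with flag: %s\n", hsd ? hsd->name : "none");
	return 0;
}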
-+ */ -+ unsigned long capacity; -+ unsigned long min_capacity; /* Min per-CPU capacity in group */ -+ unsigned long next_update; -+ int imbalance; /* XXX unrelated to capacity but shared group state */ -+ -+#ifdef CONFIG_SCHED_DEBUG -+ int id; -+#endif -+ -+ unsigned long cpumask[0]; /* balance mask */ -+}; -+ -+struct sched_group { -+ struct sched_group *next; /* Must be a circular list */ -+ atomic_t ref; -+ -+ unsigned int group_weight; -+ struct sched_group_capacity *sgc; -+ int asym_prefer_cpu; /* cpu of highest priority in group */ -+ -+ /* -+ * The CPUs this group covers. -+ * -+ * NOTE: this field is variable length. (Allocated dynamically -+ * by attaching extra space to the end of the structure, -+ * depending on how many CPUs the kernel has booted up with) -+ */ -+ unsigned long cpumask[0]; -+}; -+ -+static inline struct cpumask *sched_group_span(struct sched_group *sg) -+{ -+ return to_cpumask(sg->cpumask); -+} -+ -+/* -+ * See build_balance_mask(). -+ */ -+static inline struct cpumask *group_balance_mask(struct sched_group *sg) -+{ -+ return to_cpumask(sg->sgc->cpumask); -+} -+ -+/** -+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. -+ * @group: The group whose first cpu is to be returned. -+ */ -+static inline unsigned int group_first_cpu(struct sched_group *group) -+{ -+ return cpumask_first(sched_group_span(group)); -+} -+ -+ -+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -+void register_sched_domain_sysctl(void); -+void dirty_sched_domain_sysctl(int cpu); -+void unregister_sched_domain_sysctl(void); -+#else -+static inline void register_sched_domain_sysctl(void) -+{ -+} -+static inline void dirty_sched_domain_sysctl(int cpu) -+{ -+} -+static inline void unregister_sched_domain_sysctl(void) -+{ -+} -+#endif -+ -+extern void sched_ttwu_pending(void); -+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); -+extern void set_rq_online (struct rq *rq); -+extern void set_rq_offline(struct rq *rq); -+extern bool sched_smp_initialized; -+ -+static inline void update_group_capacity(struct sched_domain *sd, int cpu) -+{ -+} -+ -+static inline void trigger_load_balance(struct rq *rq) -+{ -+} -+ -+#define sched_feat(x) 0 -+ -+#else /* CONFIG_SMP */ -+ -+static inline void sched_ttwu_pending(void) { } -+ -+#endif /* CONFIG_SMP */ -+ -+#ifdef CONFIG_CPU_IDLE -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+ rq->idle_state = idle_state; -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ SCHED_WARN_ON(!rcu_read_lock_held()); -+ return rq->idle_state; -+} -+#else -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ return NULL; -+} -+#endif -+ -+#ifdef CONFIG_SCHED_DEBUG -+extern bool sched_debug_enabled; -+#endif -+ -+extern void schedule_idle(void); -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+struct irqtime { -+ u64 total; -+ u64 tick_delta; -+ u64 irq_start_time; -+ struct u64_stats_sync sync; -+}; -+ -+DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -+ -+/* -+ * Returns the irqtime minus the softirq time computed by ksoftirqd. -+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime -+ * and never move forward. 
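The irqtime comment above introduces irq_time_read(), which uses the u64_stats sequence-counter idiom: keep re-reading until the sequence number is even and unchanged, so a 64-bit total can be read without tearing on 32-bit hosts. A userspace model of that read loop (the writer side is simulated in-line):

/* Models the __u64_stats_fetch_begin()/_retry() read loop used by
 * irq_time_read(); single-threaded stand-in. */
#include <stdio.h>

static unsigned int seq;		/* even: no writer, odd: write in flight */
static unsigned long long total_ns;

static void writer_add(unsigned long long delta)
{
	seq++;				/* begin write: seq goes odd */
	total_ns += delta;
	seq++;				/* end write: seq goes even again */
}

static unsigned long long irq_time_read(void)
{
	unsigned int start;
	unsigned long long val;

	do {
		start = seq;
		val = total_ns;
	} while ((start & 1) || start != seq);	/* torn read: retry */

	return val;
}

int main(void)
{
	writer_add(1000);
	printf("irq time: %llu ns\n", irq_time_read());
	return 0;
}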
-+ */ -+static inline u64 irq_time_read(int cpu) -+{ -+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); -+ unsigned int seq; -+ u64 total; -+ -+ do { -+ seq = __u64_stats_fetch_begin(&irqtime->sync); -+ total = irqtime->total; -+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); -+ -+ return total; -+} -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+ -+#ifdef CONFIG_SMP -+static inline int cpu_of(struct rq *rq) -+{ -+ return rq->cpu; -+} -+#else /* CONFIG_SMP */ -+static inline int cpu_of(struct rq *rq) -+{ -+ return 0; -+} -+#endif -+ -+#ifdef CONFIG_CPU_FREQ -+DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); -+ -+static inline void cpufreq_trigger(struct rq *rq, unsigned int flags) -+{ -+ struct update_util_data *data; -+ -+ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, -+ cpu_of(rq))); -+ -+ if (data) -+ data->func(data, rq->niffies, flags); -+} -+#else -+static inline void cpufreq_trigger(struct rq *rq, unsigned int flag) -+{ -+} -+#endif /* CONFIG_CPU_FREQ */ -+ -+#ifdef arch_scale_freq_capacity -+#ifndef arch_scale_freq_invariant -+#define arch_scale_freq_invariant() (true) -+#endif -+#else /* arch_scale_freq_capacity */ -+#define arch_scale_freq_invariant() (false) -+#endif -+ -+/* -+ * This should only be called when current == rq->idle. Dodgy workaround for -+ * when softirqs are pending and we are in the idle loop. Setting current to -+ * resched will kick us out of the idle loop and the softirqs will be serviced -+ * on our next pass through schedule(). -+ */ -+static inline bool softirq_pending(int cpu) -+{ -+ if (likely(!local_softirq_pending())) -+ return false; -+ set_tsk_need_resched(current); -+ return true; -+} -+ -+#ifdef CONFIG_64BIT -+static inline u64 read_sum_exec_runtime(struct task_struct *t) -+{ -+ return tsk_seruntime(t); -+} -+#else -+struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags); -+void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags); -+ -+static inline u64 read_sum_exec_runtime(struct task_struct *t) -+{ -+ unsigned long flags; -+ u64 ns; -+ struct rq *rq; -+ -+ rq = task_rq_lock(t, &flags); -+ ns = tsk_seruntime(t); -+ task_rq_unlock(rq, t, &flags); -+ -+ return ns; -+} -+#endif -+ -+#endif /* MUQSS_SCHED_H */ -diff -Nur a/kernel/sched/sched.h b/kernel/sched/sched.h ---- a/kernel/sched/sched.h 2018-12-21 13:13:19.000000000 +0000 -+++ b/kernel/sched/sched.h 2019-01-05 20:22:51.099998516 +0000 -@@ -1,5 +1,8 @@ - /* SPDX-License-Identifier: GPL-2.0 */ - -+#ifdef CONFIG_SCHED_MUQSS -+#include "MuQSS.h" -+#else /* CONFIG_SCHED_MUQSS */ - #include - #include - #include -@@ -2103,3 +2106,29 @@ - #else /* arch_scale_freq_capacity */ - #define arch_scale_freq_invariant() (false) - #endif -+ -+static inline bool softirq_pending(int cpu) -+{ -+ return false; -+} -+ -+#ifdef CONFIG_64BIT -+static inline u64 read_sum_exec_runtime(struct task_struct *t) -+{ -+ return t->se.sum_exec_runtime; -+} -+#else -+static inline u64 read_sum_exec_runtime(struct task_struct *t) -+{ -+ u64 ns; -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ rq = task_rq_lock(t, &rf); -+ ns = t->se.sum_exec_runtime; -+ task_rq_unlock(rq, t, &rf); -+ -+ return ns; -+} -+#endif -+#endif /* CONFIG_SCHED_MUQSS */ -diff -Nur a/kernel/skip_list.c b/kernel/skip_list.c ---- a/kernel/skip_list.c 1970-01-01 01:00:00.000000000 +0100 -+++ b/kernel/skip_list.c 2019-01-05 20:22:51.099998516 +0000 -@@ -0,0 +1,148 @@ -+/* -+ Copyright (C) 2011,2016 Con Kolivas. -+ -+ Code based on example originally by William Pugh. 
-+ -+Skip Lists are a probabilistic alternative to balanced trees, as -+described in the June 1990 issue of CACM and were invented by -+William Pugh in 1987. -+ -+A couple of comments about this implementation: -+The routine randomLevel has been hard-coded to generate random -+levels using p=0.25. It can be easily changed. -+ -+The insertion routine has been implemented so as to use the -+dirty hack described in the CACM paper: if a random level is -+generated that is more than the current maximum level, the -+current maximum level plus one is used instead. -+ -+Levels start at zero and go up to MaxLevel (which is equal to -+MaxNumberOfLevels-1). -+ -+The routines defined in this file are: -+ -+init: defines slnode -+ -+new_skiplist: returns a new, empty list -+ -+randomLevel: Returns a random level based on a u64 random seed passed to it. -+In MuQSS, the "niffy" time is used for this purpose. -+ -+insert(l,key, value): inserts the binding (key, value) into l. This operation -+occurs in O(log n) time. -+ -+delnode(slnode, l, node): deletes any binding of key from the l based on the -+actual node value. This operation occurs in O(k) time where k is the -+number of levels of the node in question (max 8). The original delete -+function occurred in O(log n) time and involved a search. -+ -+MuQSS Notes: In this implementation of skiplists, there are bidirectional -+next/prev pointers and the insert function returns a pointer to the actual -+node the value is stored. The key here is chosen by the scheduler so as to -+sort tasks according to the priority list requirements and is no longer used -+by the scheduler after insertion. The scheduler lookup, however, occurs in -+O(1) time because it is always the first item in the level 0 linked list. -+Since the task struct stores a copy of the node pointer upon skiplist_insert, -+it can also remove it much faster than the original implementation with the -+aid of prev<->next pointer manipulation and no searching. 
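The header above says randomLevel is hard-coded to p = 0.25; the implementation further down achieves this with find_first_bit(&randseed, MaxLevel) / 2, since each extra level requires two more low bits of the seed to be clear. A userspace sketch of the resulting distribution (find_first_bit is re-implemented by hand here, and seeds come from ordinary rand() rather than MuQSS niffies):

/* Distribution check for the randomLevel() scheme: level =
 * find_first_bit(seed, 7) / 2, i.e. p = 0.25 per extra level. */
#include <stdio.h>
#include <stdlib.h>

#define MaxLevel 7

static unsigned int random_level(unsigned long randseed)
{
	int bit;

	/* index of the lowest set bit among the first MaxLevel bits,
	 * or MaxLevel if they are all clear */
	for (bit = 0; bit < MaxLevel; bit++)
		if (randseed & (1UL << bit))
			break;
	return bit / 2;
}

int main(void)
{
	long counts[MaxLevel / 2 + 1] = { 0 };

	srand(1);
	for (long i = 0; i < 1000000; i++)
		counts[random_level((unsigned long)rand())]++;

	for (int lvl = 0; lvl <= MaxLevel / 2; lvl++)
		printf("level %d: %ld nodes\n", lvl, counts[lvl]);
	return 0;
}

Roughly three quarters of all nodes stay at level 0, and each further level is about four times rarer, which keeps node height small and deletion O(k) as described above.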
-+ -+*/ -+ -+#include -+#include -+ -+#define MaxNumberOfLevels 8 -+#define MaxLevel (MaxNumberOfLevels - 1) -+ -+void skiplist_init(skiplist_node *slnode) -+{ -+ int i; -+ -+ slnode->key = 0xFFFFFFFFFFFFFFFF; -+ slnode->level = 0; -+ slnode->value = NULL; -+ for (i = 0; i < MaxNumberOfLevels; i++) -+ slnode->next[i] = slnode->prev[i] = slnode; -+} -+ -+skiplist *new_skiplist(skiplist_node *slnode) -+{ -+ skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC); -+ -+ BUG_ON(!l); -+ l->header = slnode; -+ return l; -+} -+ -+void free_skiplist(skiplist *l) -+{ -+ skiplist_node *p, *q; -+ -+ p = l->header; -+ do { -+ q = p->next[0]; -+ p->next[0]->prev[0] = q->prev[0]; -+ skiplist_node_init(p); -+ p = q; -+ } while (p != l->header); -+ kfree(l); -+} -+ -+void skiplist_node_init(skiplist_node *node) -+{ -+ memset(node, 0, sizeof(skiplist_node)); -+} -+ -+static inline unsigned int randomLevel(const long unsigned int randseed) -+{ -+ return find_first_bit(&randseed, MaxLevel) / 2; -+} -+ -+void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed) -+{ -+ skiplist_node *update[MaxNumberOfLevels]; -+ skiplist_node *p, *q; -+ int k = l->level; -+ -+ p = l->header; -+ do { -+ while (q = p->next[k], q->key <= key) -+ p = q; -+ update[k] = p; -+ } while (--k >= 0); -+ -+ ++l->entries; -+ k = randomLevel(randseed); -+ if (k > l->level) { -+ k = ++l->level; -+ update[k] = l->header; -+ } -+ -+ node->level = k; -+ node->key = key; -+ node->value = value; -+ do { -+ p = update[k]; -+ node->next[k] = p->next[k]; -+ p->next[k] = node; -+ node->prev[k] = p; -+ node->next[k]->prev[k] = node; -+ } while (--k >= 0); -+} -+ -+void skiplist_delete(skiplist *l, skiplist_node *node) -+{ -+ int k, m = node->level; -+ -+ for (k = 0; k <= m; k++) { -+ node->prev[k]->next[k] = node->next[k]; -+ node->next[k]->prev[k] = node->prev[k]; -+ } -+ skiplist_node_init(node); -+ if (m == l->level) { -+ while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0) -+ m--; -+ l->level = m; -+ } -+ l->entries--; -+} -diff -Nur a/kernel/sysctl.c b/kernel/sysctl.c ---- a/kernel/sysctl.c 2019-01-05 20:17:13.859238862 +0000 -+++ b/kernel/sysctl.c 2019-01-05 20:22:51.099998516 +0000 -@@ -135,6 +135,12 @@ - static unsigned long one_ul __read_only = 1; - static int one_hundred __read_only = 100; - static int one_thousand __read_only = 1000; -+#ifdef CONFIG_SCHED_MUQSS -+extern int rr_interval; -+extern int sched_interactive; -+extern int sched_iso_cpu; -+extern int sched_yield_type; -+#endif - #ifdef CONFIG_PRINTK - static int ten_thousand __read_only = 10000; - #endif -@@ -296,7 +302,7 @@ - { } - }; - --#ifdef CONFIG_SCHED_DEBUG -+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS) - static int min_sched_granularity_ns __read_only = 100000; /* 100 usecs */ - static int max_sched_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */ - static int min_wakeup_granularity_ns __read_only; /* 0 usecs */ -@@ -313,6 +319,7 @@ - #endif - - static struct ctl_table kern_table[] = { -+#ifndef CONFIG_SCHED_MUQSS - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, -@@ -475,6 +482,7 @@ - .extra1 = &one, - }, - #endif -+#endif /* !CONFIG_SCHED_MUQSS */ - #ifdef CONFIG_PROVE_LOCKING - { - .procname = "prove_locking", -@@ -1073,6 +1081,44 @@ - .proc_handler = proc_dointvec, - }, - #endif -+#ifdef CONFIG_SCHED_MUQSS -+ { -+ .procname = "rr_interval", -+ .data = &rr_interval, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = 
&proc_dointvec_minmax, -+ .extra1 = &one, -+ .extra2 = &one_thousand, -+ }, -+ { -+ .procname = "interactive", -+ .data = &sched_interactive, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &zero, -+ .extra2 = &one, -+ }, -+ { -+ .procname = "iso_cpu", -+ .data = &sched_iso_cpu, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &zero, -+ .extra2 = &one_hundred, -+ }, -+ { -+ .procname = "yield_type", -+ .data = &sched_yield_type, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &zero, -+ .extra2 = &two, -+ }, -+#endif - #if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", -diff -Nur a/kernel/time/clockevents.c b/kernel/time/clockevents.c ---- a/kernel/time/clockevents.c 2018-12-21 13:13:19.000000000 +0000 -+++ b/kernel/time/clockevents.c 2019-01-05 20:22:51.099998516 +0000 -@@ -198,8 +198,13 @@ - - #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST - -+#ifdef CONFIG_SCHED_MUQSS -+/* Limit min_delta to 100us */ -+#define MIN_DELTA_LIMIT (NSEC_PER_SEC / 10000) -+#else - /* Limit min_delta to a jiffie */ - #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) -+#endif - - /** - * clockevents_increase_min_delta - raise minimum delta of a clock event device -diff -Nur a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c ---- a/kernel/time/posix-cpu-timers.c 2018-12-21 13:13:19.000000000 +0000 -+++ b/kernel/time/posix-cpu-timers.c 2019-01-05 20:22:51.109998835 +0000 -@@ -818,7 +818,7 @@ - tsk_expires->virt_exp = expires; - - tsk_expires->sched_exp = check_timers_list(++timers, firing, -- tsk->se.sum_exec_runtime); -+ tsk_seruntime(tsk)); - - /* - * Check for the special case thread timers. -@@ -828,7 +828,7 @@ - unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); - - if (hard != RLIM_INFINITY && -- tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { -+ tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { - /* - * At the hard limit, we just die. - * No need to calculate anything else now. -@@ -840,7 +840,7 @@ - __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); - return; - } -- if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { -+ if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { - /* - * At the soft limit, send a SIGXCPU every second. - */ -@@ -1081,7 +1081,7 @@ - struct task_cputime task_sample; - - task_cputime(tsk, &task_sample.utime, &task_sample.stime); -- task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime; -+ task_sample.sum_exec_runtime = tsk_seruntime(tsk); - if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) - return 1; - } -diff -Nur a/kernel/time/timer.c b/kernel/time/timer.c ---- a/kernel/time/timer.c 2019-01-05 20:17:13.859238862 +0000 -+++ b/kernel/time/timer.c 2019-01-05 20:22:51.109998835 +0000 -@@ -1434,7 +1434,7 @@ - * Check, if the next hrtimer event is before the next timer wheel - * event: - */ --static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) -+static u64 cmp_next_hrtimer_event(struct timer_base *base, u64 basem, u64 expires) - { - u64 nextevt = hrtimer_get_next_event(); - -@@ -1452,6 +1452,9 @@ - if (nextevt <= basem) - return basem; - -+ if (nextevt < expires && nextevt - basem <= TICK_NSEC) -+ base->is_idle = false; -+ - /* - * Round up to the next jiffie. 
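The clockevents change above is easy to quantify: with the stock definition the minimum clockevent delta is clamped to one jiffy, which at the HZ=100 default that MuQSS reinstates means 10 ms, while the MuQSS definition fixes the floor at 100 us so hrtimer-based timeouts keep their resolution. A two-line check of that arithmetic:

/* Arithmetic behind the MIN_DELTA_LIMIT change; HZ=100 assumed, as
 * MuQSS defaults to it. */
#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

int main(void)
{
	long hz = 100;

	printf("stock : %ld ns (one jiffy at HZ=%ld)\n", NSEC_PER_SEC / hz, hz);
	printf("muqss : %ld ns (100 us floor)\n", NSEC_PER_SEC / 10000);
	return 0;
}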
High resolution timers are - * off, so the hrtimers are expired in the tick and we need to -@@ -1521,7 +1524,7 @@ - } - raw_spin_unlock(&base->lock); - -- return cmp_next_hrtimer_event(basem, expires); -+ return cmp_next_hrtimer_event(base, basem, expires); - } - - /** -diff -Nur a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c ---- a/kernel/trace/trace_selftest.c 2018-12-21 13:13:19.000000000 +0000 -+++ b/kernel/trace/trace_selftest.c 2019-01-05 20:22:51.109998835 +0000 -@@ -1041,10 +1041,15 @@ - { - /* Make this a -deadline thread */ - static const struct sched_attr attr = { -+#ifdef CONFIG_SCHED_MUQSS -+ /* No deadline on MuQSS, use RR */ -+ .sched_policy = SCHED_RR, -+#else - .sched_policy = SCHED_DEADLINE, - .sched_runtime = 100000ULL, - .sched_deadline = 10000000ULL, - .sched_period = 10000000ULL -+#endif - }; - struct wakeup_test_data *x = data; - diff --git a/sys-kernel/linux-sources-redcore-lts/files/0002-BFQ-v8r12-20180404.patch b/sys-kernel/linux-sources-redcore-lts/files/0002-BFQ-v8r12-20180404.patch deleted file mode 100644 index 104325d6..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/0002-BFQ-v8r12-20180404.patch +++ /dev/null @@ -1,4611 +0,0 @@ -From 7bd365a925748767d7ed807e5498f90bae0ebc25 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Tue, 14 Nov 2017 08:28:45 +0100 -Subject: [PATCH 01/23] block, bfq-mq: turn BUG_ON on request-size into WARN_ON - -BFQ has many checks of internal and external consistency. One of them -checks that an I/O request has still sectors to serve, if it happens -to be retired without being served. If the request has no sector to -serve, a BUG_ON signals the failure and causes the kernel to -terminate. Yet, from a crash report by a user [1], this condition may -happen to hold, in apparently correct functioning, for I/O with a -CD/DVD. - -To address this issue, this commit turns the above BUG_ON into a -WARN_ON. This commit also adds a companion WARN_ON on request -insertion into the scheduler. - -[1] https://groups.google.com/d/msg/bfq-iosched/DDOTJBroBa4/VyU1zUFtCgAJ - -Reported-by: Alexandre Frade -Signed-off-by: Paolo Valente ---- - block/bfq-mq-iosched.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 0c09609a6099..0fc757ae7a42 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1540,6 +1540,8 @@ static void bfq_add_request(struct request *rq) - - BUG_ON(!RQ_BFQQ(rq)); - BUG_ON(RQ_BFQQ(rq) != bfqq); -+ WARN_ON(blk_rq_sectors(rq) == 0); -+ - elv_rb_add(&bfqq->sort_list, rq); - - /* -@@ -4962,7 +4964,7 @@ static void bfq_finish_request(struct request *rq) - rq_io_start_time_ns(rq), - rq->cmd_flags); - -- BUG_ON(blk_rq_sectors(rq) == 0 && !(rq->rq_flags & RQF_STARTED)); -+ WARN_ON(blk_rq_sectors(rq) == 0 && !(rq->rq_flags & RQF_STARTED)); - - if (likely(rq->rq_flags & RQF_STARTED)) { - unsigned long flags; - -From 1097d368a20456c88acd75b3184c68df38e8f7b8 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Sun, 12 Nov 2017 22:43:46 +0100 -Subject: [PATCH 02/23] block, bfq-sq, bfq-mq: consider also past I/O in soft - real-time detection - -BFQ privileges the I/O of soft real-time applications, such as video -players, to guarantee to these application a high bandwidth and a low -latency. In this respect, it is not easy to correctly detect when an -application is soft real-time. 
A particularly nasty false positive is -that of an I/O-bound application that occasionally happens to meet all -requirements to be deemed as soft real-time. After being detected as -soft real-time, such an application monopolizes the device. Fortunately, -BFQ will realize soon that the application is actually not soft -real-time and suspend every privilege. Yet, the application may happen -again to be wrongly detected as soft real-time, and so on. - -As highlighted by our tests, this problem causes BFQ to occasionally -fail to guarantee a high responsiveness, in the presence of heavy -background I/O workloads. The reason is that the background workload -happens to be detected as soft real-time, more or less frequently, -during the execution of the interactive task under test. To give an -idea, because of this problem, Libreoffice Writer occasionally takes 8 -seconds, instead of 3, to start up, if there are sequential reads and -writes in the background, on a Kingston SSDNow V300. - -This commit addresses this issue by leveraging the following facts. - -The reason why some applications are detected as soft real-time despite -all BFQ checks to avoid false positives, is simply that, during high -CPU or storage-device load, I/O-bound applications may happen to do -I/O slowly enough to meet all soft real-time requirements, and pass -all BFQ extra checks. Yet, this happens only for limited time periods: -slow-speed time intervals are usually interspersed between other time -intervals during which these applications do I/O at a very high speed. -To exploit these facts, this commit introduces a little change, in the -detection of soft real-time behavior, to systematically consider also -the recent past: the higher the speed was in the recent past, the -later next I/O should arrive for the application to be considered as -soft real-time. At the beginning of a slow-speed interval, the minimum -arrival time allowed for the next I/O usually happens to still be so -high, to fall *after* the end of the slow-speed period itself. As a -consequence, the application does not risk to be deemed as soft -real-time during the slow-speed interval. Then, during the next -high-speed interval, the application cannot, evidently, be deemed as -soft real-time (exactly because of its speed), and so on. - -This extra filtering proved to be rather effective: in the above test, -the frequency of false positives became so low that the start-up time -was 3 seconds in all iterations (apart from occasional outliers, -caused by page-cache-management issues, which are out of the scope of -this commit, and cannot be solved by an I/O scheduler). - -Signed-off-by: Paolo Valente -Signed-off-by: Angelo Ruocco ---- - block/bfq-mq-iosched.c | 115 ++++++++++++++++++++++++++++++++++--------------- - block/bfq-sq-iosched.c | 115 ++++++++++++++++++++++++++++++++++--------------- - 2 files changed, 162 insertions(+), 68 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 0fc757ae7a42..4d06d900f45e 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -3201,37 +3201,78 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * whereas soft_rt_next_start is set to infinity for applications that do - * not. - * -- * Unfortunately, even a greedy application may happen to behave in an -- * isochronous way if the CPU load is high. 
In fact, the application may -- * stop issuing requests while the CPUs are busy serving other processes, -- * then restart, then stop again for a while, and so on. In addition, if -- * the disk achieves a low enough throughput with the request pattern -- * issued by the application (e.g., because the request pattern is random -- * and/or the device is slow), then the application may meet the above -- * bandwidth requirement too. To prevent such a greedy application to be -- * deemed as soft real-time, a further rule is used in the computation of -- * soft_rt_next_start: soft_rt_next_start must be higher than the current -- * time plus the maximum time for which the arrival of a request is waited -- * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. -- * This filters out greedy applications, as the latter issue instead their -- * next request as soon as possible after the last one has been completed -- * (in contrast, when a batch of requests is completed, a soft real-time -- * application spends some time processing data). -+ * Unfortunately, even a greedy (i.e., I/O-bound) application may -+ * happen to meet, occasionally or systematically, both the above -+ * bandwidth and isochrony requirements. This may happen at least in -+ * the following circumstances. First, if the CPU load is high. The -+ * application may stop issuing requests while the CPUs are busy -+ * serving other processes, then restart, then stop again for a while, -+ * and so on. The other circumstances are related to the storage -+ * device: the storage device is highly loaded or reaches a low-enough -+ * throughput with the I/O of the application (e.g., because the I/O -+ * is random and/or the device is slow). In all these cases, the -+ * I/O of the application may be simply slowed down enough to meet -+ * the bandwidth and isochrony requirements. To reduce the probability -+ * that greedy applications are deemed as soft real-time in these -+ * corner cases, a further rule is used in the computation of -+ * soft_rt_next_start: the return value of this function is forced to -+ * be higher than the maximum between the following two quantities. - * -- * Unfortunately, the last filter may easily generate false positives if -- * only bfqd->bfq_slice_idle is used as a reference time interval and one -- * or both the following cases occur: -- * 1) HZ is so low that the duration of a jiffy is comparable to or higher -- * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with -- * HZ=100. -+ * (a) Current time plus: (1) the maximum time for which the arrival -+ * of a request is waited for when a sync queue becomes idle, -+ * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We -+ * postpone for a moment the reason for adding a few extra -+ * jiffies; we get back to it after next item (b). Lower-bounding -+ * the return value of this function with the current time plus -+ * bfqd->bfq_slice_idle tends to filter out greedy applications, -+ * because the latter issue their next request as soon as possible -+ * after the last one has been completed. In contrast, a soft -+ * real-time application spends some time processing data, after a -+ * batch of its requests has been completed. -+ * -+ * (b) Current value of bfqq->soft_rt_next_start. As pointed out -+ * above, greedy applications may happen to meet both the -+ * bandwidth and isochrony requirements under heavy CPU or -+ * storage-device load. 
In more detail, in these scenarios, these -+ * applications happen, only for limited time periods, to do I/O -+ * slowly enough to meet all the requirements described so far, -+ * including the filtering in above item (a). These slow-speed -+ * time intervals are usually interspersed between other time -+ * intervals during which these applications do I/O at a very high -+ * speed. Fortunately, exactly because of the high speed of the -+ * I/O in the high-speed intervals, the values returned by this -+ * function happen to be so high, near the end of any such -+ * high-speed interval, to be likely to fall *after* the end of -+ * the low-speed time interval that follows. These high values are -+ * stored in bfqq->soft_rt_next_start after each invocation of -+ * this function. As a consequence, if the last value of -+ * bfqq->soft_rt_next_start is constantly used to lower-bound the -+ * next value that this function may return, then, from the very -+ * beginning of a low-speed interval, bfqq->soft_rt_next_start is -+ * likely to be constantly kept so high that any I/O request -+ * issued during the low-speed interval is considered as arriving -+ * to soon for the application to be deemed as soft -+ * real-time. Then, in the high-speed interval that follows, the -+ * application will not be deemed as soft real-time, just because -+ * it will do I/O at a high speed. And so on. -+ * -+ * Getting back to the filtering in item (a), in the following two -+ * cases this filtering might be easily passed by a greedy -+ * application, if the reference quantity was just -+ * bfqd->bfq_slice_idle: -+ * 1) HZ is so low that the duration of a jiffy is comparable to or -+ * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow -+ * devices with HZ=100. The time granularity may be so coarse -+ * that the approximation, in jiffies, of bfqd->bfq_slice_idle -+ * is rather lower than the exact value. - * 2) jiffies, instead of increasing at a constant rate, may stop increasing - * for a while, then suddenly 'jump' by several units to recover the lost - * increments. This seems to happen, e.g., inside virtual machines. -- * To address this issue, we do not use as a reference time interval just -- * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In -- * particular we add the minimum number of jiffies for which the filter -- * seems to be quite precise also in embedded systems and KVM/QEMU virtual -- * machines. -+ * To address this issue, in the filtering in (a) we do not use as a -+ * reference time interval just bfqd->bfq_slice_idle, but -+ * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the -+ * minimum number of jiffies for which the filter seems to be quite -+ * precise also in embedded systems and KVM/QEMU virtual machines. 
- */ - static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -@@ -3243,10 +3284,11 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - jiffies_to_msecs(HZ * bfqq->service_from_backlogged / - bfqd->bfq_wr_max_softrt_rate)); - -- return max(bfqq->last_idle_bklogged + -- HZ * bfqq->service_from_backlogged / -- bfqd->bfq_wr_max_softrt_rate, -- jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); -+ return max3(bfqq->soft_rt_next_start, -+ bfqq->last_idle_bklogged + -+ HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); - } - - /** -@@ -4395,10 +4437,15 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfqq->split_time = bfq_smallest_from_now(); - - /* -- * Set to the value for which bfqq will not be deemed as -- * soft rt when it becomes backlogged. -+ * To not forget the possibly high bandwidth consumed by a -+ * process/queue in the recent past, -+ * bfq_bfqq_softrt_next_start() returns a value at least equal -+ * to the current value of bfqq->soft_rt_next_start (see -+ * comments on bfq_bfqq_softrt_next_start). Set -+ * soft_rt_next_start to now, to mean that bfqq has consumed -+ * no bandwidth so far. - */ -- bfqq->soft_rt_next_start = bfq_greatest_from_now(); -+ bfqq->soft_rt_next_start = jiffies; - - /* first request is almost certainly seeky */ - bfqq->seek_history = 1; -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 4bbd7f4c0154..987dc255c82c 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -3089,37 +3089,78 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * whereas soft_rt_next_start is set to infinity for applications that do - * not. - * -- * Unfortunately, even a greedy application may happen to behave in an -- * isochronous way if the CPU load is high. In fact, the application may -- * stop issuing requests while the CPUs are busy serving other processes, -- * then restart, then stop again for a while, and so on. In addition, if -- * the disk achieves a low enough throughput with the request pattern -- * issued by the application (e.g., because the request pattern is random -- * and/or the device is slow), then the application may meet the above -- * bandwidth requirement too. To prevent such a greedy application to be -- * deemed as soft real-time, a further rule is used in the computation of -- * soft_rt_next_start: soft_rt_next_start must be higher than the current -- * time plus the maximum time for which the arrival of a request is waited -- * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. -- * This filters out greedy applications, as the latter issue instead their -- * next request as soon as possible after the last one has been completed -- * (in contrast, when a batch of requests is completed, a soft real-time -- * application spends some time processing data). -+ * Unfortunately, even a greedy (i.e., I/O-bound) application may -+ * happen to meet, occasionally or systematically, both the above -+ * bandwidth and isochrony requirements. This may happen at least in -+ * the following circumstances. First, if the CPU load is high. The -+ * application may stop issuing requests while the CPUs are busy -+ * serving other processes, then restart, then stop again for a while, -+ * and so on. 
The other circumstances are related to the storage -+ * device: the storage device is highly loaded or reaches a low-enough -+ * throughput with the I/O of the application (e.g., because the I/O -+ * is random and/or the device is slow). In all these cases, the -+ * I/O of the application may be simply slowed down enough to meet -+ * the bandwidth and isochrony requirements. To reduce the probability -+ * that greedy applications are deemed as soft real-time in these -+ * corner cases, a further rule is used in the computation of -+ * soft_rt_next_start: the return value of this function is forced to -+ * be higher than the maximum between the following two quantities. - * -- * Unfortunately, the last filter may easily generate false positives if -- * only bfqd->bfq_slice_idle is used as a reference time interval and one -- * or both the following cases occur: -- * 1) HZ is so low that the duration of a jiffy is comparable to or higher -- * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with -- * HZ=100. -+ * (a) Current time plus: (1) the maximum time for which the arrival -+ * of a request is waited for when a sync queue becomes idle, -+ * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We -+ * postpone for a moment the reason for adding a few extra -+ * jiffies; we get back to it after next item (b). Lower-bounding -+ * the return value of this function with the current time plus -+ * bfqd->bfq_slice_idle tends to filter out greedy applications, -+ * because the latter issue their next request as soon as possible -+ * after the last one has been completed. In contrast, a soft -+ * real-time application spends some time processing data, after a -+ * batch of its requests has been completed. -+ * -+ * (b) Current value of bfqq->soft_rt_next_start. As pointed out -+ * above, greedy applications may happen to meet both the -+ * bandwidth and isochrony requirements under heavy CPU or -+ * storage-device load. In more detail, in these scenarios, these -+ * applications happen, only for limited time periods, to do I/O -+ * slowly enough to meet all the requirements described so far, -+ * including the filtering in above item (a). These slow-speed -+ * time intervals are usually interspersed between other time -+ * intervals during which these applications do I/O at a very high -+ * speed. Fortunately, exactly because of the high speed of the -+ * I/O in the high-speed intervals, the values returned by this -+ * function happen to be so high, near the end of any such -+ * high-speed interval, to be likely to fall *after* the end of -+ * the low-speed time interval that follows. These high values are -+ * stored in bfqq->soft_rt_next_start after each invocation of -+ * this function. As a consequence, if the last value of -+ * bfqq->soft_rt_next_start is constantly used to lower-bound the -+ * next value that this function may return, then, from the very -+ * beginning of a low-speed interval, bfqq->soft_rt_next_start is -+ * likely to be constantly kept so high that any I/O request -+ * issued during the low-speed interval is considered as arriving -+ * to soon for the application to be deemed as soft -+ * real-time. Then, in the high-speed interval that follows, the -+ * application will not be deemed as soft real-time, just because -+ * it will do I/O at a high speed. And so on. 
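The lower-bounding described in items (a) and (b) above is what the max3() in both hunks implements: the next time at which I/O may arrive and still look soft real-time is the maximum of the previous threshold, the bandwidth-based estimate, and now plus the idle window plus a few jiffies of slack. A sketch with invented numbers, showing how a high threshold left over from a fast phase dominates during the slow phase that follows:

/* Models the max3() lower bound on soft_rt_next_start; every number
 * below is invented for illustration. */
#include <stdio.h>

#define max3(a, b, c) ((a) > (b) ? ((a) > (c) ? (a) : (c)) \
			         : ((b) > (c) ? (b) : (c)))

int main(void)
{
	unsigned long jiffies = 1000;		/* "now" */
	unsigned long slice_idle_jiffies = 2;	/* idle-wait window */
	unsigned long prev_threshold = 1500;	/* left over from a fast phase */
	unsigned long bw_estimate = 1100;	/* service/backlog based value */

	unsigned long next = max3(prev_threshold, bw_estimate,
				  jiffies + slice_idle_jiffies + 4);

	printf("soft_rt_next_start = %lu (now = %lu); earlier arrivals\n"
	       "are not treated as soft real-time\n", next, jiffies);
	return 0;
}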
-+ * -+ * Getting back to the filtering in item (a), in the following two -+ * cases this filtering might be easily passed by a greedy -+ * application, if the reference quantity was just -+ * bfqd->bfq_slice_idle: -+ * 1) HZ is so low that the duration of a jiffy is comparable to or -+ * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow -+ * devices with HZ=100. The time granularity may be so coarse -+ * that the approximation, in jiffies, of bfqd->bfq_slice_idle -+ * is rather lower than the exact value. - * 2) jiffies, instead of increasing at a constant rate, may stop increasing - * for a while, then suddenly 'jump' by several units to recover the lost - * increments. This seems to happen, e.g., inside virtual machines. -- * To address this issue, we do not use as a reference time interval just -- * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In -- * particular we add the minimum number of jiffies for which the filter -- * seems to be quite precise also in embedded systems and KVM/QEMU virtual -- * machines. -+ * To address this issue, in the filtering in (a) we do not use as a -+ * reference time interval just bfqd->bfq_slice_idle, but -+ * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the -+ * minimum number of jiffies for which the filter seems to be quite -+ * precise also in embedded systems and KVM/QEMU virtual machines. - */ - static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -@@ -3131,10 +3172,11 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - jiffies_to_msecs(HZ * bfqq->service_from_backlogged / - bfqd->bfq_wr_max_softrt_rate)); - -- return max(bfqq->last_idle_bklogged + -- HZ * bfqq->service_from_backlogged / -- bfqd->bfq_wr_max_softrt_rate, -- jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); -+ return max3(bfqq->soft_rt_next_start, -+ bfqq->last_idle_bklogged + -+ HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); - } - - /** -@@ -4167,10 +4209,15 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfqq->split_time = bfq_smallest_from_now(); - - /* -- * Set to the value for which bfqq will not be deemed as -- * soft rt when it becomes backlogged. -+ * To not forget the possibly high bandwidth consumed by a -+ * process/queue in the recent past, -+ * bfq_bfqq_softrt_next_start() returns a value at least equal -+ * to the current value of bfqq->soft_rt_next_start (see -+ * comments on bfq_bfqq_softrt_next_start). Set -+ * soft_rt_next_start to now, to mean that bfqq has consumed -+ * no bandwidth so far. - */ -- bfqq->soft_rt_next_start = bfq_greatest_from_now(); -+ bfqq->soft_rt_next_start = jiffies; - - /* first request is almost certainly seeky */ - bfqq->seek_history = 1; - -From 2a09b505660c81dbb80a5d68c9bc558c326d041f Mon Sep 17 00:00:00 2001 -From: Chiara Bruschi -Date: Thu, 7 Dec 2017 09:57:19 +0100 -Subject: [PATCH 03/23] block, bfq-mq: fix occurrences of request - prepare/finish methods' old names - -Commits 'b01f1fa3bb19' (Port of "blk-mq-sched: unify request prepare -methods") and 'cc10d2d7d2c1' (Port of "blk-mq-sched: unify request -finished methods") changed the old names of current bfq_prepare_request -and bfq_finish_request methods, but left them unchanged elsewhere in -the code (related comments, part of function name bfq_put_rq_priv_body). 
- -This commit fixes every occurrence of the old names of these methods -by changing them into the current names. - -Fixes: b01f1fa3bb19 (Port of "blk-mq-sched: unify request prepare methods") -Fixes: cc10d2d7d2c1 (Port of "blk-mq-sched: unify request finished methods") -Reviewed-by: Paolo Valente -Signed-off-by: Federico Motta -Signed-off-by: Chiara Bruschi ---- - block/bfq-mq-iosched.c | 38 +++++++++++++++++++------------------- - 1 file changed, 19 insertions(+), 19 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 4d06d900f45e..8f8d5eccb016 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4018,20 +4018,20 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - /* - * TESTING: reset DISP_LIST flag, because: 1) - * this rq this request has passed through -- * get_rq_private, 2) then it will have -- * put_rq_private invoked on it, and 3) in -- * put_rq_private we use this flag to check -- * that put_rq_private is not invoked on -- * requests for which get_rq_private has been -- * invoked. -+ * bfq_prepare_request, 2) then it will have -+ * bfq_finish_request invoked on it, and 3) in -+ * bfq_finish_request we use this flag to check -+ * that bfq_finish_request is not invoked on -+ * requests for which bfq_prepare_request has -+ * been invoked. - */ - rq->rq_flags &= ~RQF_DISP_LIST; - goto inc_in_driver_start_rq; - } - - /* -- * We exploit the put_rq_private hook to decrement -- * rq_in_driver, but put_rq_private will not be -+ * We exploit the bfq_finish_request hook to decrement -+ * rq_in_driver, but bfq_finish_request will not be - * invoked on this request. So, to avoid unbalance, - * just start this request, without incrementing - * rq_in_driver. As a negative consequence, -@@ -4040,14 +4040,14 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - * bfq_schedule_dispatch to be invoked uselessly. - * - * As for implementing an exact solution, the -- * put_request hook, if defined, is probably invoked -- * also on this request. So, by exploiting this hook, -- * we could 1) increment rq_in_driver here, and 2) -- * decrement it in put_request. Such a solution would -- * let the value of the counter be always accurate, -- * but it would entail using an extra interface -- * function. This cost seems higher than the benefit, -- * being the frequency of non-elevator-private -+ * bfq_finish_request hook, if defined, is probably -+ * invoked also on this request. So, by exploiting -+ * this hook, we could 1) increment rq_in_driver here, -+ * and 2) decrement it in bfq_finish_request. Such a -+ * solution would let the value of the counter be -+ * always accurate, but it would entail using an extra -+ * interface function. This cost seems higher than the -+ * benefit, being the frequency of non-elevator-private - * requests very low. 
- */ - goto start_rq; -@@ -4963,7 +4963,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - } - } - --static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) -+static void bfq_finish_request_body(struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "put_request_body: allocated %d", bfqq->allocated); -@@ -5019,7 +5019,7 @@ static void bfq_finish_request(struct request *rq) - spin_lock_irqsave(&bfqd->lock, flags); - - bfq_completed_request(bfqq, bfqd); -- bfq_put_rq_priv_body(bfqq); -+ bfq_finish_request_body(bfqq); - - spin_unlock_irqrestore(&bfqd->lock, flags); - } else { -@@ -5042,7 +5042,7 @@ static void bfq_finish_request(struct request *rq) - bfqg_stats_update_io_remove(bfqq_group(bfqq), - rq->cmd_flags); - } -- bfq_put_rq_priv_body(bfqq); -+ bfq_finish_request_body(bfqq); - } - - rq->elv.priv[0] = NULL; - -From 4df19943c3a767df453abea3d2ac3433c3326ce0 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Thu, 16 Nov 2017 18:38:13 +0100 -Subject: [PATCH 04/23] block, bfq-sq, bfq-mq: add missing rq_pos_tree update - on rq removal - -If two processes do I/O close to each other, then BFQ merges the -bfq_queues associated with these processes, to get a more sequential -I/O, and thus a higher throughput. In this respect, to detect whether -two processes are doing I/O close to each other, BFQ keeps a list of -the head-of-line I/O requests of all active bfq_queues. The list is -ordered by initial sectors, and implemented through a red-black tree -(rq_pos_tree). - -Unfortunately, the update of the rq_pos_tree was incomplete, because -the tree was not updated on the removal of the head-of-line I/O -request of a bfq_queue, in case the queue did not remain empty. This -commit adds the missing update. - -Signed-off-by: Paolo Valente -Signed-off-by: Angelo Ruocco ---- - block/bfq-mq-iosched.c | 3 +++ - block/bfq-sq-iosched.c | 3 +++ - 2 files changed, 6 insertions(+) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 8f8d5eccb016..603191c9008f 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -1729,6 +1729,9 @@ static void bfq_remove_request(struct request_queue *q, - rb_erase(&bfqq->pos_node, bfqq->pos_root); - bfqq->pos_root = NULL; - } -+ } else { -+ BUG_ON(!bfqq->next_rq); -+ bfq_pos_tree_add_move(bfqd, bfqq); - } - - if (rq->cmd_flags & REQ_META) { -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 987dc255c82c..ea90ace79e49 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -1669,6 +1669,9 @@ static void bfq_remove_request(struct request *rq) - rb_erase(&bfqq->pos_node, bfqq->pos_root); - bfqq->pos_root = NULL; - } -+ } else { -+ BUG_ON(!bfqq->next_rq); -+ bfq_pos_tree_add_move(bfqd, bfqq); - } - - if (rq->cmd_flags & REQ_META) { - -From b844e345140aaea957d84a21d2aa67588b020cd5 Mon Sep 17 00:00:00 2001 -From: Angelo Ruocco -Date: Mon, 18 Dec 2017 08:28:08 +0100 -Subject: [PATCH 05/23] block, bfq-sq, bfq-mq: check low_latency flag in - bfq_bfqq_save_state() - -A just-created bfq_queue will certainly be deemed as interactive on -the arrival of its first I/O request, if the low_latency flag is -set. Yet, if the queue is merged with another queue on the arrival of -its first I/O request, it will not have the chance to be flagged as -interactive. Nevertheless, if the queue is then split soon enough, it -has to be flagged as interactive after the split. 
- 
-To handle this early-merge scenario correctly, BFQ saves the state
-of the queue, on the merge, as if the latter had already been deemed
-interactive. So, if the queue is split soon, it will get
-weight-raised, because the previous state of the queue is resumed on
-the split.
-
-Unfortunately, in the act of saving the state of the newly-created
-queue, BFQ doesn't check whether the low_latency flag is set, and this
-causes early-merged queues to be then weight-raised, on queue splits,
-even if low_latency is off. This commit addresses this problem by
-adding the missing check.
-
-Signed-off-by: Angelo Ruocco
-Signed-off-by: Paolo Valente
----
- block/bfq-mq-iosched.c | 3 ++-
- block/bfq-sq-iosched.c | 3 ++-
- 2 files changed, 4 insertions(+), 2 deletions(-)
-
-diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c
-index 603191c9008f..ff9776c8836a 100644
---- a/block/bfq-mq-iosched.c
-+++ b/block/bfq-mq-iosched.c
-@@ -2231,7 +2231,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
- bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
- bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
- if (unlikely(bfq_bfqq_just_created(bfqq) &&
-- !bfq_bfqq_in_large_burst(bfqq))) {
-+ !bfq_bfqq_in_large_burst(bfqq) &&
-+ bfqq->bfqd->low_latency)) {
- /*
- * bfqq being merged right after being created: bfqq
- * would have deserved interactive weight raising, but
-diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c
-index ea90ace79e49..3a2d764e760c 100644
---- a/block/bfq-sq-iosched.c
-+++ b/block/bfq-sq-iosched.c
-@@ -2109,7 +2109,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
- bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
- bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
- if (unlikely(bfq_bfqq_just_created(bfqq) &&
-- !bfq_bfqq_in_large_burst(bfqq))) {
-+ !bfq_bfqq_in_large_burst(bfqq) &&
-+ bfqq->bfqd->low_latency)) {
- /*
- * bfqq being merged right after being created: bfqq
- * would have deserved interactive weight raising, but
-
-From 4cc6896fe1de2e0b4de151a6e70658f10b9ec2fa Mon Sep 17 00:00:00 2001
-From: Paolo Valente
-Date: Fri, 27 Oct 2017 11:12:14 +0200
-Subject: [PATCH 06/23] block, bfq-sq, bfq-mq: let a queue be merged only
- shortly after starting I/O
-
-In BFQ and CFQ, two processes are said to be cooperating if they do
-I/O in such a way that the union of their I/O requests yields a
-sequential I/O pattern. To get such a sequential I/O pattern out of
-the non-sequential pattern of each cooperating process, BFQ and CFQ
-merge the queues associated with these processes. In more detail,
-cooperating processes, and thus their associated queues, usually
-start, or restart, to do I/O shortly after each other. This is the
-case, e.g., for the I/O threads of KVM/QEMU and of the dump
-utility. Basing on this assumption, this commit allows a bfq_queue to
-be merged only during a short time interval (100ms) after it starts,
-or re-starts, to do I/O. This filtering provides two important
-benefits.
-
-First, it greatly reduces the probability that two non-cooperating
-processes have their queues merged by mistake, if they just happen to
-do I/O close to each other for a short time interval. These spurious
-merges cause loss of service guarantees. A low-weight bfq_queue may
-unjustly get more than its expected share of the throughput: if such a
-low-weight queue is merged with a high-weight queue, then the I/O for
-the low-weight queue is served as if the queue had a high weight.
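The check that implements this 100 ms window, bfq_too_late_for_merging() in the hunks below, is small enough to model in a few lines of plain user-space C. In this sketch the values are invented and time_is_before_jiffies() is replaced by a plain comparison, so jiffies wraparound is ignored:

#include <stdbool.h>
#include <stdio.h>

#define HZ 250                                           /* assumed tick rate */

static const unsigned long merge_time_limit = HZ / 10;   /* 100 ms, as in the patch */

static unsigned long jiffies;                 /* stand-in for kernel jiffies */

struct queue_sketch {
	unsigned long service_from_backlogged;    /* > 0 once the queue was served */
	unsigned long first_IO_time;
};

/* Model of bfq_too_late_for_merging(): a queue whose first I/O is
 * older than merge_time_limit may no longer be merged. */
static bool too_late_for_merging(const struct queue_sketch *q)
{
	return q->service_from_backlogged > 0 &&
	       jiffies > q->first_IO_time + merge_time_limit;
}

int main(void)
{
	struct queue_sketch q = { 1, 1000 };

	jiffies = 1020;  /* 80 ms after the first I/O: still mergeable */
	printf("too late? %d\n", too_late_for_merging(&q));

	jiffies = 1060;  /* 240 ms after the first I/O: no longer mergeable */
	printf("too late? %d\n", too_late_for_merging(&q));
	return 0;
}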
This -may damage other high-weight queues unexpectedly. For instance, -because of this issue, lxterminal occasionally took 7.5 seconds to -start, instead of 6.5 seconds, when some sequential readers and -writers did I/O in the background on a FUJITSU MHX2300BT HDD. The -reason is that the bfq_queues associated with some of the readers or -the writers were merged with the high-weight queues of some processes -that had to do some urgent but little I/O. The readers then exploited -the inherited high weight for all or most of their I/O, during the -start-up of terminal. The filtering introduced by this commit -eliminated any outlier caused by spurious queue merges in our start-up -time tests. - -This filtering also provides a little boost of the throughput -sustainable by BFQ: 3-4%, depending on the CPU. The reason is that, -once a bfq_queue cannot be merged any longer, this commit makes BFQ -stop updating the data needed to handle merging for the queue. - -Signed-off-by: Paolo Valente -Signed-off-by: Angelo Ruocco ---- - block/bfq-mq-iosched.c | 64 +++++++++++++++++++++++++++++++++++++++++--------- - block/bfq-mq.h | 1 + - block/bfq-sched.c | 4 ++++ - block/bfq-sq-iosched.c | 64 +++++++++++++++++++++++++++++++++++++++++--------- - block/bfq.h | 2 ++ - 5 files changed, 113 insertions(+), 22 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index ff9776c8836a..8b17b25a3c30 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -119,6 +119,20 @@ static const int bfq_async_charge_factor = 10; - /* Default timeout values, in jiffies, approximating CFQ defaults. */ - static const int bfq_timeout = (HZ / 8); - -+/* -+ * Time limit for merging (see comments in bfq_setup_cooperator). Set -+ * to the slowest value that, in our tests, proved to be effective in -+ * removing false positives, while not causing true positives to miss -+ * queue merging. -+ * -+ * As can be deduced from the low time limit below, queue merging, if -+ * successful, happens at the very beggining of the I/O of the involved -+ * cooperating processes, as a consequence of the arrival of the very -+ * first requests from each cooperator. After that, there is very -+ * little chance to find cooperators. -+ */ -+static const unsigned long bfq_merge_time_limit = HZ/10; -+ - static struct kmem_cache *bfq_pool; - - /* Below this threshold (in ns), we consider thinktime immediate. */ -@@ -389,6 +403,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, - return bfqq; - } - -+static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) -+{ -+ return bfqq->service_from_backlogged > 0 && -+ time_is_before_jiffies(bfqq->first_IO_time + -+ bfq_merge_time_limit); -+} -+ - static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) - { - struct rb_node **p, *parent; -@@ -399,6 +420,14 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfqq->pos_root = NULL; - } - -+ /* -+ * bfqq cannot be merged any longer (see comments in -+ * bfq_setup_cooperator): no point in adding bfqq into the -+ * position tree. 
-+ */ -+ if (bfq_too_late_for_merging(bfqq)) -+ return; -+ - if (bfq_class_idle(bfqq)) - return; - if (!bfqq->next_rq) -@@ -2081,6 +2110,13 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) - static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - struct bfq_queue *new_bfqq) - { -+ if (bfq_too_late_for_merging(new_bfqq)) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "[%s] too late for bfq%d to be merged", -+ __func__, new_bfqq->pid); -+ return false; -+ } -+ - if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || - (bfqq->ioprio_class != new_bfqq->ioprio_class)) - return false; -@@ -2149,6 +2185,23 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - { - struct bfq_queue *in_service_bfqq, *new_bfqq; - -+ /* -+ * Prevent bfqq from being merged if it has been created too -+ * long ago. The idea is that true cooperating processes, and -+ * thus their associated bfq_queues, are supposed to be -+ * created shortly after each other. This is the case, e.g., -+ * for KVM/QEMU and dump I/O threads. Basing on this -+ * assumption, the following filtering greatly reduces the -+ * probability that two non-cooperating processes, which just -+ * happen to do close I/O for some short time interval, have -+ * their queues merged by mistake. -+ */ -+ if (bfq_too_late_for_merging(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have looked for coop, but too late"); -+ return NULL; -+ } -+ - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - -@@ -3338,17 +3391,6 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - */ - slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); - -- /* -- * Increase service_from_backlogged before next statement, -- * because the possible next invocation of -- * bfq_bfqq_charge_time would likely inflate -- * entity->service. In contrast, service_from_backlogged must -- * contain real service, to enable the soft real-time -- * heuristic to correctly compute the bandwidth consumed by -- * bfqq. -- */ -- bfqq->service_from_backlogged += entity->service; -- - /* - * As above explained, charge slow (typically seeky) and - * timed-out queues with the time and not the service -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 1cb05bb853d2..a5947b203ef2 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -337,6 +337,7 @@ struct bfq_queue { - unsigned long wr_start_at_switch_to_srt; - - unsigned long split_time; /* time of last split */ -+ unsigned long first_IO_time; /* time of first I/O for this queue */ - }; - - /** -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 616c0692335a..9d261dd428e4 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -939,6 +939,10 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) - struct bfq_entity *entity = &bfqq->entity; - struct bfq_service_tree *st; - -+ if (!bfqq->service_from_backlogged) -+ bfqq->first_IO_time = jiffies; -+ -+ bfqq->service_from_backlogged += served; - for_each_entity(entity) { - st = bfq_entity_service_tree(entity); - -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 3a2d764e760c..cd00a41ca35d 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -113,6 +113,20 @@ static const int bfq_async_charge_factor = 10; - /* Default timeout values, in jiffies, approximating CFQ defaults. */ - static const int bfq_timeout = (HZ / 8); - -+/* -+ * Time limit for merging (see comments in bfq_setup_cooperator). 
Set -+ * to the slowest value that, in our tests, proved to be effective in -+ * removing false positives, while not causing true positives to miss -+ * queue merging. -+ * -+ * As can be deduced from the low time limit below, queue merging, if -+ * successful, happens at the very beggining of the I/O of the involved -+ * cooperating processes, as a consequence of the arrival of the very -+ * first requests from each cooperator. After that, there is very -+ * little chance to find cooperators. -+ */ -+static const unsigned long bfq_merge_time_limit = HZ/10; -+ - static struct kmem_cache *bfq_pool; - - /* Below this threshold (in ns), we consider thinktime immediate. */ -@@ -351,6 +365,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, - return bfqq; - } - -+static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) -+{ -+ return bfqq->service_from_backlogged > 0 && -+ time_is_before_jiffies(bfqq->first_IO_time + -+ bfq_merge_time_limit); -+} -+ - static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) - { - struct rb_node **p, *parent; -@@ -361,6 +382,14 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfqq->pos_root = NULL; - } - -+ /* -+ * bfqq cannot be merged any longer (see comments in -+ * bfq_setup_cooperator): no point in adding bfqq into the -+ * position tree. -+ */ -+ if (bfq_too_late_for_merging(bfqq)) -+ return; -+ - if (bfq_class_idle(bfqq)) - return; - if (!bfqq->next_rq) -@@ -1960,6 +1989,13 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) - static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - struct bfq_queue *new_bfqq) - { -+ if (bfq_too_late_for_merging(new_bfqq)) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "[%s] too late for bfq%d to be merged", -+ __func__, new_bfqq->pid); -+ return false; -+ } -+ - if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || - (bfqq->ioprio_class != new_bfqq->ioprio_class)) - return false; -@@ -2028,6 +2064,23 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - { - struct bfq_queue *in_service_bfqq, *new_bfqq; - -+ /* -+ * Prevent bfqq from being merged if it has been created too -+ * long ago. The idea is that true cooperating processes, and -+ * thus their associated bfq_queues, are supposed to be -+ * created shortly after each other. This is the case, e.g., -+ * for KVM/QEMU and dump I/O threads. Basing on this -+ * assumption, the following filtering greatly reduces the -+ * probability that two non-cooperating processes, which just -+ * happen to do close I/O for some short time interval, have -+ * their queues merged by mistake. -+ */ -+ if (bfq_too_late_for_merging(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have looked for coop, but too late"); -+ return NULL; -+ } -+ - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - -@@ -3226,17 +3279,6 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - */ - slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); - -- /* -- * Increase service_from_backlogged before next statement, -- * because the possible next invocation of -- * bfq_bfqq_charge_time would likely inflate -- * entity->service. In contrast, service_from_backlogged must -- * contain real service, to enable the soft real-time -- * heuristic to correctly compute the bandwidth consumed by -- * bfqq. 
-- */ -- bfqq->service_from_backlogged += entity->service; -- - /* - * As above explained, charge slow (typically seeky) and - * timed-out queues with the time and not the service -diff --git a/block/bfq.h b/block/bfq.h -index 47cd4d5a8c32..59539adc00a5 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -329,6 +329,8 @@ struct bfq_queue { - unsigned long wr_start_at_switch_to_srt; - - unsigned long split_time; /* time of last split */ -+ -+ unsigned long first_IO_time; /* time of first I/O for this queue */ - }; - - /** - -From 157f39c43ab182280634cd4f6335d0187b3741a0 Mon Sep 17 00:00:00 2001 -From: Angelo Ruocco -Date: Mon, 11 Dec 2017 14:19:54 +0100 -Subject: [PATCH 07/23] block, bfq-sq, bfq-mq: remove superfluous check in - queue-merging setup - -When two or more processes do I/O in a way that the their requests are -sequential in respect to one another, BFQ merges the bfq_queues associated -with the processes. This way the overall I/O pattern becomes sequential, -and thus there is a boost in througput. -These cooperating processes usually start or restart to do I/O shortly -after each other. So, in order to avoid merging non-cooperating processes, -BFQ ensures that none of these queues has been in weight raising for too -long. - -In this respect, from commit "block, bfq-sq, bfq-mq: let a queue be merged -only shortly after being created", BFQ checks whether any queue (and not -only weight-raised ones) is doing I/O continuously from too long to be -merged. - -This new additional check makes the first one useless: a queue doing -I/O from long enough, if being weight-raised, is also a queue in -weight raising for too long to be merged. Accordingly, this commit -removes the first check. - -Signed-off-by: Angelo Ruocco -Signed-off-by: Paolo Valente ---- - block/bfq-mq-iosched.c | 53 ++++---------------------------------------------- - block/bfq-sq-iosched.c | 53 ++++---------------------------------------------- - 2 files changed, 8 insertions(+), 98 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 8b17b25a3c30..f5db8613a70f 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -2140,20 +2140,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - return true; - } - --/* -- * If this function returns true, then bfqq cannot be merged. The idea -- * is that true cooperation happens very early after processes start -- * to do I/O. Usually, late cooperations are just accidental false -- * positives. In case bfqq is weight-raised, such false positives -- * would evidently degrade latency guarantees for bfqq. -- */ --static bool wr_from_too_long(struct bfq_queue *bfqq) --{ -- return bfqq->wr_coeff > 1 && -- time_is_before_jiffies(bfqq->last_wr_start_finish + -- msecs_to_jiffies(100)); --} -- - /* - * Attempt to schedule a merge of bfqq with the currently in-service - * queue or with a close queue among the scheduled queues. Return -@@ -2167,11 +2153,6 @@ static bool wr_from_too_long(struct bfq_queue *bfqq) - * to maintain. Besides, in such a critical condition as an out of memory, - * the benefits of queue merging may be little relevant, or even negligible. - * -- * Weight-raised queues can be merged only if their weight-raising -- * period has just started. In fact cooperating processes are usually -- * started together. Thus, with this filter we avoid false positives -- * that would jeopardize low-latency guarantees. 
-- * - * WARNING: queue merging may impair fairness among non-weight raised - * queues, for at least two reasons: 1) the original weight of a - * merged queue may change during the merged state, 2) even being the -@@ -2205,15 +2186,7 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - -- if (io_struct && wr_from_too_long(bfqq) && -- likely(bfqq != &bfqd->oom_bfqq)) -- bfq_log_bfqq(bfqd, bfqq, -- "would have looked for coop, but bfq%d wr", -- bfqq->pid); -- -- if (!io_struct || -- wr_from_too_long(bfqq) || -- unlikely(bfqq == &bfqd->oom_bfqq)) -+ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) - return NULL; - - /* If there is only one backlogged queue, don't search. */ -@@ -2223,17 +2196,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - in_service_bfqq = bfqd->in_service_queue; - - if (in_service_bfqq && in_service_bfqq != bfqq && -- wr_from_too_long(in_service_bfqq) -- && likely(in_service_bfqq == &bfqd->oom_bfqq)) -- bfq_log_bfqq(bfqd, bfqq, -- "would have tried merge with in-service-queue, but wr"); -- -- if (!in_service_bfqq || in_service_bfqq == bfqq -- || wr_from_too_long(in_service_bfqq) || -- unlikely(in_service_bfqq == &bfqd->oom_bfqq)) -- goto check_scheduled; -- -- if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && -+ likely(in_service_bfqq != &bfqd->oom_bfqq) && -+ bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && - bfqq->entity.parent == in_service_bfqq->entity.parent && - bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { - new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); -@@ -2245,21 +2209,12 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * queues. The only thing we need is that the bio/request is not - * NULL, as we need it to establish whether a cooperator exists. - */ --check_scheduled: - new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, - bfq_io_struct_pos(io_struct, request)); - - BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); - -- if (new_bfqq && wr_from_too_long(new_bfqq) && -- likely(new_bfqq != &bfqd->oom_bfqq) && -- bfq_may_be_close_cooperator(bfqq, new_bfqq)) -- bfq_log_bfqq(bfqd, bfqq, -- "would have merged with bfq%d, but wr", -- new_bfqq->pid); -- -- if (new_bfqq && !wr_from_too_long(new_bfqq) && -- likely(new_bfqq != &bfqd->oom_bfqq) && -+ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && - bfq_may_be_close_cooperator(bfqq, new_bfqq)) - return bfq_setup_merge(bfqq, new_bfqq); - -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index cd00a41ca35d..d8a358e5e284 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -2019,20 +2019,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - return true; - } - --/* -- * If this function returns true, then bfqq cannot be merged. The idea -- * is that true cooperation happens very early after processes start -- * to do I/O. Usually, late cooperations are just accidental false -- * positives. In case bfqq is weight-raised, such false positives -- * would evidently degrade latency guarantees for bfqq. -- */ --static bool wr_from_too_long(struct bfq_queue *bfqq) --{ -- return bfqq->wr_coeff > 1 && -- time_is_before_jiffies(bfqq->last_wr_start_finish + -- msecs_to_jiffies(100)); --} -- - /* - * Attempt to schedule a merge of bfqq with the currently in-service - * queue or with a close queue among the scheduled queues. 
Return -@@ -2046,11 +2032,6 @@ static bool wr_from_too_long(struct bfq_queue *bfqq) - * to maintain. Besides, in such a critical condition as an out of memory, - * the benefits of queue merging may be little relevant, or even negligible. - * -- * Weight-raised queues can be merged only if their weight-raising -- * period has just started. In fact cooperating processes are usually -- * started together. Thus, with this filter we avoid false positives -- * that would jeopardize low-latency guarantees. -- * - * WARNING: queue merging may impair fairness among non-weight raised - * queues, for at least two reasons: 1) the original weight of a - * merged queue may change during the merged state, 2) even being the -@@ -2084,15 +2065,7 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - -- if (io_struct && wr_from_too_long(bfqq) && -- likely(bfqq != &bfqd->oom_bfqq)) -- bfq_log_bfqq(bfqd, bfqq, -- "would have looked for coop, but bfq%d wr", -- bfqq->pid); -- -- if (!io_struct || -- wr_from_too_long(bfqq) || -- unlikely(bfqq == &bfqd->oom_bfqq)) -+ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) - return NULL; - - /* If there is only one backlogged queue, don't search. */ -@@ -2102,17 +2075,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - in_service_bfqq = bfqd->in_service_queue; - - if (in_service_bfqq && in_service_bfqq != bfqq && -- bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) -- && likely(in_service_bfqq == &bfqd->oom_bfqq)) -- bfq_log_bfqq(bfqd, bfqq, -- "would have tried merge with in-service-queue, but wr"); -- -- if (!in_service_bfqq || in_service_bfqq == bfqq || -- !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || -- unlikely(in_service_bfqq == &bfqd->oom_bfqq)) -- goto check_scheduled; -- -- if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && -+ likely(in_service_bfqq != &bfqd->oom_bfqq) && -+ bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && - bfqq->entity.parent == in_service_bfqq->entity.parent && - bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { - new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); -@@ -2124,21 +2088,12 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * queues. The only thing we need is that the bio/request is not - * NULL, as we need it to establish whether a cooperator exists. 
- */ --check_scheduled: - new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, - bfq_io_struct_pos(io_struct, request)); - - BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); - -- if (new_bfqq && wr_from_too_long(new_bfqq) && -- likely(new_bfqq != &bfqd->oom_bfqq) && -- bfq_may_be_close_cooperator(bfqq, new_bfqq)) -- bfq_log_bfqq(bfqd, bfqq, -- "would have merged with bfq%d, but wr", -- new_bfqq->pid); -- -- if (new_bfqq && !wr_from_too_long(new_bfqq) && -- likely(new_bfqq != &bfqd->oom_bfqq) && -+ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && - bfq_may_be_close_cooperator(bfqq, new_bfqq)) - return bfq_setup_merge(bfqq, new_bfqq); - - -From b82eb91d87f172aba7eb5eb98e8d5e2a621adf51 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Thu, 30 Nov 2017 17:48:28 +0100 -Subject: [PATCH 08/23] block, bfq-sq, bfq-mq: increase threshold to deem I/O - as random - -If two processes do I/O close to each other, i.e., are cooperating -processes in BFQ (and CFQ'S) nomenclature, then BFQ merges their -associated bfq_queues, so as to get sequential I/O from the union of -the I/O requests of the processes, and thus reach a higher -throughput. A merged queue is then split if its I/O stops being -sequential. In this respect, BFQ deems the I/O of a bfq_queue as -(mostly) sequential only if less than 4 I/O requests are random, out -of the last 32 requests inserted into the queue. - -Unfortunately, extensive testing (with the interleaved_io benchmark of -the S suite [1], and with real applications spawning cooperating -processes) has clearly shown that, with such a low threshold, only a -rather low I/O throughput may be reached when several cooperating -processes do I/O. In particular, the outcome of each test run was -bimodal: if queue merging occurred and was stable during the test, -then the throughput was close to the peak rate of the storage device, -otherwise the throughput was arbitrarily low (usually around 1/10 of -the peak rate with a rotational device). The probability to get the -unlucky outcomes grew with the number of cooperating processes: it was -already significant with 5 processes, and close to one with 7 or more -processes. - -The cause of the low throughput in the unlucky runs was that the -merged queues containing the I/O of these cooperating processes were -soon split, because they contained more random I/O requests than those -tolerated by the 4/32 threshold, but -- that I/O would have however allowed the storage device to reach - peak throughput or almost peak throughput; -- in contrast, the I/O of these processes, if served individually - (from separate queues) yielded a rather low throughput. - -So we repeated our tests with increasing values of the threshold, -until we found the minimum value (19) for which we obtained maximum -throughput, reliably, with at least up to 9 cooperating -processes. Then we checked that the use of that higher threshold value -did not cause any regression for any other benchmark in the suite [1]. -This commit raises the threshold to such a higher value. 
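The threshold change itself is a one-liner on a 32-bit per-queue history, where each bit records whether one of the last 32 requests was random. The following user-space C sketch reproduces the test with an invented history value, to show how the old 4/32 threshold and the new threshold of 19 classify the same queue differently:

#include <stdint.h>
#include <stdio.h>

/* Popcount stand-in for the kernel's hweight32(). */
static int hweight32(uint32_t w)
{
	int n = 0;

	while (w) {
		n += w & 1;
		w >>= 1;
	}
	return n;
}

/* Sketch of BFQQ_SEEKY with the raised threshold. */
static int bfqq_seeky(uint32_t seek_history)
{
	return hweight32(seek_history) > 19;
}

int main(void)
{
	uint32_t history = 0x00ff00ff;  /* 16 random requests out of the last 32 */

	printf("seeky, new threshold (19): %d\n", bfqq_seeky(history));
	printf("seeky, old threshold (4):  %d\n", hweight32(history) > 32/8);
	return 0;
}

With 16 random requests out of 32, the queue stays merged under the new threshold but would have been split under the old one.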
- -[1] https://github.com/Algodev-github/S - -Signed-off-by: Angelo Ruocco -Signed-off-by: Paolo Valente ---- - block/bfq-mq-iosched.c | 2 +- - block/bfq-sq-iosched.c | 2 +- - 2 files changed, 2 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index f5db8613a70f..cb5f49ddecb6 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -145,7 +145,7 @@ static struct kmem_cache *bfq_pool; - #define BFQQ_SEEK_THR (sector_t)(8 * 100) - #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) - #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) --#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) -+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) - - /* Min number of samples required to perform peak-rate update */ - #define BFQ_RATE_MIN_SAMPLES 32 -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index d8a358e5e284..e1c6dc651be1 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -139,7 +139,7 @@ static struct kmem_cache *bfq_pool; - #define BFQQ_SEEK_THR (sector_t)(8 * 100) - #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) - #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) --#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) -+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) - - /* Min number of samples required to perform peak-rate update */ - #define BFQ_RATE_MIN_SAMPLES 32 - -From b739dda4e4b3a1cbbc905f86f9fbb0860b068ce7 Mon Sep 17 00:00:00 2001 -From: Chiara Bruschi -Date: Mon, 11 Dec 2017 18:55:26 +0100 -Subject: [PATCH 09/23] block, bfq-sq, bfq-mq: specify usage condition of - delta_us in bfq_log_bfqq call - -Inside the function bfq_completed_request the value of a variable -called delta_us is computed as current request completion time. -delta_us is used inside a call to the function bfq_log_bfqq as divisor -in a division operation to compute a rate value, but no check makes -sure that delta_us has non-zero value. A divisor with value 0 leads -to a division error that could result in a kernel oops (therefore -unstable/unreliable system state) and consequently cause kernel panic -if resources are unavailable after the system fault. - -This commit fixes this call to bfq_log_bfqq specifying the condition -that allows delta_us to be safely used as divisor. - -Signed-off-by: Paolo Valente -Signed-off-by: Chiara Bruschi ---- - block/bfq-mq-iosched.c | 5 ++++- - block/bfq-sq-iosched.c | 5 ++++- - 2 files changed, 8 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index cb5f49ddecb6..6ce2c0789046 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4904,9 +4904,12 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - bfq_log_bfqq(bfqd, bfqq, - "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", - delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, -+ delta_us > 0 ? 
- (USEC_PER_SEC*
- (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us))
-- >>BFQ_RATE_SHIFT,
-+ >>BFQ_RATE_SHIFT :
-+ (USEC_PER_SEC*
-+ (u64)(bfqd->last_rq_max_size<<BFQ_RATE_SHIFT))>>BFQ_RATE_SHIFT,
- (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT);
- 
- /*
-diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c
-index e1c6dc651be1..eff4c4edf5a0 100644
---- a/block/bfq-sq-iosched.c
-+++ b/block/bfq-sq-iosched.c
-@@ -4565,9 +4565,12 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
- 
- bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu",
- delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size,
-+ delta_us > 0 ?
- (USEC_PER_SEC*
- (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us))
-- >>BFQ_RATE_SHIFT,
-+ >>BFQ_RATE_SHIFT :
-+ (USEC_PER_SEC*
-+ (u64)(bfqd->last_rq_max_size<<BFQ_RATE_SHIFT))>>BFQ_RATE_SHIFT,
- (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT);
- 
- /*
-
-From ae4310c13eca762644734d53074d8456c85e2dec Mon Sep 17 00:00:00 2001
-From: Paolo Valente
-Date: Tue, 19 Dec 2017 12:07:12 +0100
-Subject: [PATCH 10/23] block, bfq-mq: limit tags for writes and async I/O
-
-Asynchronous I/O can easily starve synchronous I/O (both sync reads
-and sync writes), by consuming all request tags. Similarly, storms of
-synchronous writes, such as those that sync(2) may trigger, can starve
-synchronous reads. In their turn, these two problems may also cause
-BFQ to lose control on latency for interactive and soft real-time
-applications. For example, on a PLEXTOR PX-256M5S SSD, LibreOffice
-Writer takes 0.6 seconds to start if the device is idle, but it takes
-more than 45 seconds (!) if there are sequential writes in the
-background.
-
-This commit addresses this issue by limiting the maximum percentage of
-tags that asynchronous I/O requests and synchronous write requests can
-consume. In particular, this commit grants a higher threshold to
-synchronous writes, to prevent the latter from being starved by
-asynchronous I/O.
-
-According to the above test, LibreOffice Writer now starts in about
-1.2 seconds on average, regardless of the background workload, and
-apart from some rare outlier. To check this improvement, run, e.g.,
-sudo ./comm_startup_lat.sh bfq-mq 5 5 seq 10 "lowriter --terminate_after_init"
-for the comm_startup_lat benchmark in the S suite [1].
-
-[1] https://github.com/Algodev-github/S
-
-Signed-off-by: Paolo Valente
----
- block/bfq-mq-iosched.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++
- block/bfq-mq.h | 12 ++++++++
- 2 files changed, 89 insertions(+)
-
-diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c
-index 6ce2c0789046..f384f5566672 100644
---- a/block/bfq-mq-iosched.c
-+++ b/block/bfq-mq-iosched.c
-@@ -362,6 +362,82 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
- }
- }
- 
-+/*
-+ * See the comments on bfq_limit_depth for the purpose of
-+ * the depths set in the function.
-+ */
-+static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
-+{
-+ bfqd->sb_shift = bt->sb.shift;
-+
-+ /*
-+ * In-word depths if no bfq_queue is being weight-raised:
-+ * leaving 25% of tags only for sync reads.
-+ *
-+ * In next formulas, right-shift the value
-+ * (1U<<bfqd->sb_shift), instead of computing directly
-+ * (1U<<(bfqd->sb_shift - something)), to be robust against
-+ * any possible value of bfqd->sb_shift, without having to
-+ * limit 'something'.
-+ */
-+ /* no more than 50% of tags for async I/O */
-+ bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U);
-+ /*
-+ * no more than 75% of tags for sync writes (25% extra tags
-+ * w.r.t. async I/O, to prevent async I/O from starving sync
-+ * writes)
-+ */
-+ bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U);
-+
-+ /*
-+ * In-word depths in case some bfq_queue is being weight-
-+ * raised: leaving ~63% of tags for sync reads. This is the
-+ * highest percentage for which, in our tests, application
-+ * start-up times didn't suffer from any regression due to tag
-+ * shortage.
-+ */
-+ /* no more than ~18% of tags for async I/O */
-+ bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U);
-+ /* no more than ~37% of tags for sync writes (~20% extra tags) */
-+ bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U);
-+}
-+
-+/*
-+ * Async I/O can easily starve sync I/O (both sync reads and sync
-+ * writes), by consuming all tags. Similarly, storms of sync writes,
-+ * such as those that sync(2) may trigger, can starve sync reads.
-+ * Limit depths of async I/O and sync writes so as to counter both
-+ * problems.
-+ */
-+static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
-+{
-+ struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
-+ struct bfq_data *bfqd = data->q->elevator->elevator_data;
-+ struct sbitmap_queue *bt;
-+
-+ if (op_is_sync(op) && !op_is_write(op))
-+ return;
-+
-+ if (data->flags & BLK_MQ_REQ_RESERVED) {
-+ if (unlikely(!tags->nr_reserved_tags)) {
-+ WARN_ON_ONCE(1);
-+ return;
-+ }
-+ bt = &tags->breserved_tags;
-+ } else
-+ bt = &tags->bitmap_tags;
-+
-+ if (unlikely(bfqd->sb_shift != bt->sb.shift))
-+ bfq_update_depths(bfqd, bt);
-+
-+ data->shallow_depth =
-+ bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
-+
-+ bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
-+ __func__, bfqd->wr_busy_queues, op_is_sync(op),
-+ data->shallow_depth);
-+}
-+
- static struct bfq_queue *
- bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
- sector_t sector, struct rb_node **ret_parent,
-@@ -5812,6 +5888,7 @@ static struct elv_fs_entry bfq_attrs[] = {
- 
- static struct elevator_type iosched_bfq_mq = {
- .ops.mq = {
-+ .limit_depth = bfq_limit_depth,
- .prepare_request = bfq_prepare_request,
- .finish_request = bfq_finish_request,
- .exit_icq = bfq_exit_icq,
-diff --git a/block/bfq-mq.h b/block/bfq-mq.h
-index a5947b203ef2..458099ee0308 100644
---- a/block/bfq-mq.h
-+++ b/block/bfq-mq.h
-@@ -619,6 +619,18 @@ struct bfq_data {
- struct bfq_queue *bio_bfqq;
- /* Extra flag used only for TESTING */
- bool bio_bfqq_set;
-+
-+ /*
-+ * Cached sbitmap shift, used to compute depth limits in
-+ * bfq_update_depths.
-+ */
-+ unsigned int sb_shift;
-+
-+ /*
-+ * Depth limits used in bfq_limit_depth (see comments on the
-+ * function)
-+ */
-+ unsigned int word_depths[2][2];
- };
- 
- enum bfqq_state_flags {
-
-From 402e5f6b59662d290ab2b3c10b0016207a63ad21 Mon Sep 17 00:00:00 2001
-From: Paolo Valente
-Date: Thu, 21 Dec 2017 15:51:39 +0100
-Subject: [PATCH 11/23] bfq-sq, bfq-mq: limit sectors served with interactive
- weight raising
-
-To maximise responsiveness, BFQ raises the weight, and performs device
-idling, for bfq_queues associated with processes deemed as
-interactive. In particular, weight raising has a maximum duration,
-equal to the time needed to start a large application. If a
-weight-raised process goes on doing I/O beyond this maximum duration,
-it loses weight-raising.
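In addition to this time bound, the patch below cuts weight raising short once a raised queue has received a given amount of service. A compact user-space C sketch of that cutoff follows; the constant matches the max_service_from_wr introduced further down, while the struct and the numbers are invented, and the real check also requires that the queue be in interactive rather than soft real-time weight raising:

#include <stdbool.h>
#include <stdio.h>

/* Constant from the patch below: slightly more than the ~110K sectors
 * each process transfers during the reference application start-up. */
static const unsigned long max_service_from_wr = 120000;

struct wr_sketch {
	unsigned int wr_coeff;          /* > 1 while weight-raised */
	unsigned long service_from_wr;  /* sectors served since raising began */
};

/* Simplified version of the early-end condition added by this patch. */
static bool wr_ends_early(const struct wr_sketch *q)
{
	return q->wr_coeff > 1 && q->service_from_wr > max_service_from_wr;
}

int main(void)
{
	struct wr_sketch file_copy = { 30, 500000 }; /* greedy sequential reader */
	struct wr_sketch app_start = { 30, 40000 };  /* genuine start-up */

	printf("end wr early for copy:  %d\n", wr_ends_early(&file_copy));
	printf("end wr early for start: %d\n", wr_ends_early(&app_start));
	return 0;
}

A high-throughput copy crosses the threshold long before the time-based duration elapses, while a genuinely starting application does not.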
- -This mechanism is evidently vulnerable to the following false -positives: I/O-bound applications that will go on doing I/O for much -longer than the duration of weight-raising. These applications have -basically no benefit from being weight-raised at the beginning of -their I/O. On the opposite end, while being weight-raised, these -applications -a) unjustly steal throughput to applications that may truly need -low latency; -b) make BFQ uselessly perform device idling; device idling results -in loss of device throughput with most flash-based storage, and may -increase latencies when used purposelessly. - -This commit adds a countermeasure to reduce both the above -problems. To introduce this countermeasure, we provide the following -extra piece of information (full details in the comments added by this -commit). During the start-up of the large application used as a -reference to set the duration of weight-raising, involved processes -transfer at most ~110K sectors each. Accordingly, a process initially -deemed as interactive has no right to be weight-raised any longer, -once transferred 110K sectors or more. - -Basing on this consideration, this commit early-ends weight-raising -for a bfq_queue if the latter happens to have received an amount of -service at least equal to 110K sectors (actually, a little bit more, -to keep a safety margin). I/O-bound applications that reach a high -throughput, such as file copy, get to this threshold much before the -allowed weight-raising period finishes. Thus this early ending of -weight-raising reduces the amount of time during which these -applications cause the problems described above. - -Signed-off-by: Paolo Valente ---- - block/bfq-mq-iosched.c | 84 ++++++++++++++++++++++++++++++++++++++++++++------ - block/bfq-mq.h | 5 +++ - block/bfq-sched.c | 3 ++ - block/bfq-sq-iosched.c | 84 ++++++++++++++++++++++++++++++++++++++++++++------ - block/bfq.h | 5 +++ - 5 files changed, 163 insertions(+), 18 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index f384f5566672..63fdd16dec3c 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -162,15 +162,17 @@ static struct kmem_cache *bfq_pool; - * interactive applications automatically, using the following formula: - * duration = (R / r) * T, where r is the peak rate of the device, and - * R and T are two reference parameters. -- * In particular, R is the peak rate of the reference device (see below), -- * and T is a reference time: given the systems that are likely to be -- * installed on the reference device according to its speed class, T is -- * about the maximum time needed, under BFQ and while reading two files in -- * parallel, to load typical large applications on these systems. -- * In practice, the slower/faster the device at hand is, the more/less it -- * takes to load applications with respect to the reference device. -- * Accordingly, the longer/shorter BFQ grants weight raising to interactive -- * applications. -+ * In particular, R is the peak rate of the reference device (see -+ * below), and T is a reference time: given the systems that are -+ * likely to be installed on the reference device according to its -+ * speed class, T is about the maximum time needed, under BFQ and -+ * while reading two files in parallel, to load typical large -+ * applications on these systems (see the comments on -+ * max_service_from_wr below, for more details on how T is obtained). 
-+ * In practice, the slower/faster the device at hand is, the more/less -+ * it takes to load applications with respect to the reference device. -+ * Accordingly, the longer/shorter BFQ grants weight raising to -+ * interactive applications. - * - * BFQ uses four different reference pairs (R, T), depending on: - * . whether the device is rotational or non-rotational; -@@ -207,6 +209,60 @@ static int T_slow[2]; - static int T_fast[2]; - static int device_speed_thresh[2]; - -+/* -+ * BFQ uses the above-detailed, time-based weight-raising mechanism to -+ * privilege interactive tasks. This mechanism is vulnerable to the -+ * following false positives: I/O-bound applications that will go on -+ * doing I/O for much longer than the duration of weight -+ * raising. These applications have basically no benefit from being -+ * weight-raised at the beginning of their I/O. On the opposite end, -+ * while being weight-raised, these applications -+ * a) unjustly steal throughput to applications that may actually need -+ * low latency; -+ * b) make BFQ uselessly perform device idling; device idling results -+ * in loss of device throughput with most flash-based storage, and may -+ * increase latencies when used purposelessly. -+ * -+ * BFQ tries to reduce these problems, by adopting the following -+ * countermeasure. To introduce this countermeasure, we need first to -+ * finish explaining how the duration of weight-raising for -+ * interactive tasks is computed. -+ * -+ * For a bfq_queue deemed as interactive, the duration of weight -+ * raising is dynamically adjusted, as a function of the estimated -+ * peak rate of the device, so as to be equal to the time needed to -+ * execute the 'largest' interactive task we benchmarked so far. By -+ * largest task, we mean the task for which each involved process has -+ * to do more I/O than for any of the other tasks we benchmarked. This -+ * reference interactive task is the start-up of LibreOffice Writer, -+ * and in this task each process/bfq_queue needs to have at most ~110K -+ * sectors transferred. -+ * -+ * This last piece of information enables BFQ to reduce the actual -+ * duration of weight-raising for at least one class of I/O-bound -+ * applications: those doing sequential or quasi-sequential I/O. An -+ * example is file copy. In fact, once started, the main I/O-bound -+ * processes of these applications usually consume the above 110K -+ * sectors in much less time than the processes of an application that -+ * is starting, because these I/O-bound processes will greedily devote -+ * almost all their CPU cycles only to their target, -+ * throughput-friendly I/O operations. This is even more true if BFQ -+ * happens to be underestimating the device peak rate, and thus -+ * overestimating the duration of weight raising. But, according to -+ * our measurements, once transferred 110K sectors, these processes -+ * have no right to be weight-raised any longer. -+ * -+ * Basing on the last consideration, BFQ ends weight-raising for a -+ * bfq_queue if the latter happens to have received an amount of -+ * service at least equal to the following constant. The constant is -+ * set to slightly more than 110K, to have a minimum safety margin. -+ * -+ * This early ending of weight-raising reduces the amount of time -+ * during which interactive false positives cause the two problems -+ * described at the beginning of these comments. 
-+ */ -+static const unsigned long max_service_from_wr = 120000; -+ - #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ - { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) - -@@ -1361,6 +1417,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, - if (old_wr_coeff == 1 && wr_or_deserves_wr) { - /* start a weight-raising period */ - if (interactive) { -+ bfqq->service_from_wr = 0; - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - } else { -@@ -3980,6 +4037,15 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - "back to interactive wr"); - } - } -+ if (bfqq->wr_coeff > 1 && -+ bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time && -+ bfqq->service_from_wr > max_service_from_wr) { -+ /* see comments on max_service_from_wr */ -+ bfq_bfqq_end_wr(bfqq); -+ bfq_log_bfqq(bfqd, bfqq, -+ "[%s] too much service", -+ __func__); -+ } - } - /* - * To improve latency (for this or other queues), immediately -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 458099ee0308..9a5ce1168ff5 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -331,6 +331,11 @@ struct bfq_queue { - * last transition from idle to backlogged. - */ - unsigned long service_from_backlogged; -+ /* -+ * Cumulative service received from the @bfq_queue since its -+ * last transition to weight-raised state. -+ */ -+ unsigned long service_from_wr; - /* - * Value of wr start time when switching to soft rt - */ -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 9d261dd428e4..4e6c5232e2fb 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -942,6 +942,9 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) - if (!bfqq->service_from_backlogged) - bfqq->first_IO_time = jiffies; - -+ if (bfqq->wr_coeff > 1) -+ bfqq->service_from_wr += served; -+ - bfqq->service_from_backlogged += served; - for_each_entity(entity) { - st = bfq_entity_service_tree(entity); -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index eff4c4edf5a0..486493aafaf8 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -156,15 +156,17 @@ static struct kmem_cache *bfq_pool; - * interactive applications automatically, using the following formula: - * duration = (R / r) * T, where r is the peak rate of the device, and - * R and T are two reference parameters. -- * In particular, R is the peak rate of the reference device (see below), -- * and T is a reference time: given the systems that are likely to be -- * installed on the reference device according to its speed class, T is -- * about the maximum time needed, under BFQ and while reading two files in -- * parallel, to load typical large applications on these systems. -- * In practice, the slower/faster the device at hand is, the more/less it -- * takes to load applications with respect to the reference device. -- * Accordingly, the longer/shorter BFQ grants weight raising to interactive -- * applications. -+ * In particular, R is the peak rate of the reference device (see -+ * below), and T is a reference time: given the systems that are -+ * likely to be installed on the reference device according to its -+ * speed class, T is about the maximum time needed, under BFQ and -+ * while reading two files in parallel, to load typical large -+ * applications on these systems (see the comments on -+ * max_service_from_wr below, for more details on how T is obtained). 
-+ * In practice, the slower/faster the device at hand is, the more/less -+ * it takes to load applications with respect to the reference device. -+ * Accordingly, the longer/shorter BFQ grants weight raising to -+ * interactive applications. - * - * BFQ uses four different reference pairs (R, T), depending on: - * . whether the device is rotational or non-rotational; -@@ -201,6 +203,60 @@ static int T_slow[2]; - static int T_fast[2]; - static int device_speed_thresh[2]; - -+/* -+ * BFQ uses the above-detailed, time-based weight-raising mechanism to -+ * privilege interactive tasks. This mechanism is vulnerable to the -+ * following false positives: I/O-bound applications that will go on -+ * doing I/O for much longer than the duration of weight -+ * raising. These applications have basically no benefit from being -+ * weight-raised at the beginning of their I/O. On the opposite end, -+ * while being weight-raised, these applications -+ * a) unjustly steal throughput to applications that may actually need -+ * low latency; -+ * b) make BFQ uselessly perform device idling; device idling results -+ * in loss of device throughput with most flash-based storage, and may -+ * increase latencies when used purposelessly. -+ * -+ * BFQ tries to reduce these problems, by adopting the following -+ * countermeasure. To introduce this countermeasure, we need first to -+ * finish explaining how the duration of weight-raising for -+ * interactive tasks is computed. -+ * -+ * For a bfq_queue deemed as interactive, the duration of weight -+ * raising is dynamically adjusted, as a function of the estimated -+ * peak rate of the device, so as to be equal to the time needed to -+ * execute the 'largest' interactive task we benchmarked so far. By -+ * largest task, we mean the task for which each involved process has -+ * to do more I/O than for any of the other tasks we benchmarked. This -+ * reference interactive task is the start-up of LibreOffice Writer, -+ * and in this task each process/bfq_queue needs to have at most ~110K -+ * sectors transfered. -+ * -+ * This last piece of information enables BFQ to reduce the actual -+ * duration of weight-raising for at least one class of I/O-bound -+ * applications: those doing sequential or quasi-sequential I/O. An -+ * example is file copy. In fact, once started, the main I/O-bound -+ * processes of these applications usually consume the above 110K -+ * sectors in much less time than the processes of an application that -+ * is starting, because these I/O-bound processes will greedily devote -+ * almost all their CPU cycles only to their target, -+ * throughput-friendly I/O operations. This is even more true if BFQ -+ * happens to be underestimating the device peak rate, and thus -+ * overestimating the duration of weight raising. But, according to -+ * our measurements, once transferred 110K sectors, these processes -+ * have no right to be weight-raised any longer. -+ * -+ * Basing on the last consideration, BFQ ends weight-raising for a -+ * bfq_queue if the latter happens to have received an amount of -+ * service at least equal to the following constant. The constant is -+ * set to slightly more than 110K, to have a minimum safety margin. -+ * -+ * This early ending of weight-raising reduces the amount of time -+ * during which interactive false positives cause the two problems -+ * described at the beginning of these comments. 
-+ */
-+static const unsigned long max_service_from_wr = 120000;
-+
- #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
- { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
-
-@@ -1246,6 +1302,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
- if (old_wr_coeff == 1 && wr_or_deserves_wr) {
- /* start a weight-raising period */
- if (interactive) {
-+ bfqq->service_from_wr = 0;
- bfqq->wr_coeff = bfqd->bfq_wr_coeff;
- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
- } else {
-@@ -3794,6 +3851,15 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
- "back to interactive wr");
- }
- }
-+ if (bfqq->wr_coeff > 1 &&
-+ bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time &&
-+ bfqq->service_from_wr > max_service_from_wr) {
-+ /* see comments on max_service_from_wr */
-+ bfq_bfqq_end_wr(bfqq);
-+ bfq_log_bfqq(bfqd, bfqq,
-+ "[%s] too much service",
-+ __func__);
-+ }
- }
- /*
- * To improve latency (for this or other queues), immediately
-diff --git a/block/bfq.h b/block/bfq.h
-index 59539adc00a5..0cd7a3f251a7 100644
---- a/block/bfq.h
-+++ b/block/bfq.h
-@@ -323,6 +323,11 @@ struct bfq_queue {
- * last transition from idle to backlogged.
- */
- unsigned long service_from_backlogged;
-+ /*
-+ * Cumulative service received from the @bfq_queue since its
-+ * last transition to weight-raised state.
-+ */
-+ unsigned long service_from_wr;
- /*
- * Value of wr start time when switching to soft rt
- */
-
-From 59efebb94b2f9bac653faf62dadb45b83bd27fa7 Mon Sep 17 00:00:00 2001
-From: Paolo Valente
-Date: Thu, 4 Jan 2018 16:29:58 +0100
-Subject: [PATCH 12/23] bfq-sq, bfq-mq: put async queues for root bfq groups
- too
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-For each pair [device for which bfq is selected as I/O scheduler,
-group in blkio/io], bfq maintains a corresponding bfq group. Each such
-bfq group contains a set of async queues, with each async queue
-created on demand, i.e., when some I/O request arrives for it. On
-creation, an async queue gets an extra reference, to make sure that
-the queue is not freed as long as its bfq group exists. Accordingly,
-to allow the queue to be freed after the group has exited, this extra
-reference must be released on group exit.
-
-The above holds also for a bfq root group, i.e., for the bfq group
-corresponding to the root blkio/io group for a given device. Yet, by
-mistake, the references to the existing async queues of a root group
-are not released when the latter exits. This causes a memory leak when
-the instance of bfq for a given device exits. In a similar vein,
-bfqg_stats_xfer_dead is not executed for a root group.
-
-This commit fixes bfq_pd_offline so that the latter executes the above
-missing operations for a root group too.
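In miniature, the control flow of the fixed function reduces to the following stand-alone C sketch. All the stub types and names (pd_offline, put_async_queues_stub, and so on) are invented for illustration; only the goto pattern mirrors the actual patch below.

  /* Stand-ins for the real BFQ types; illustration only. */
  struct entity { int in_service; };
  struct group  { struct entity *my_entity; int async_refs; };

  static void deactivate_entity_stub(struct entity *e) { e->in_service = 0; }
  static void put_async_queues_stub(struct group *g)   { g->async_refs = 0; }

  /* Sketch of bfq_pd_offline() after the fix: the root group no longer
   * returns early, but jumps straight to the async-queue release, so
   * the extra references of its async queues are dropped too. */
  static void pd_offline(struct group *g)
  {
          if (!g->my_entity)              /* root group */
                  goto put_async_queues;  /* was: return */

          deactivate_entity_stub(g->my_entity);

  put_async_queues:
          put_async_queues_stub(g);
  }

  int main(void)
  {
          struct group root = { 0, 1 };   /* my_entity == NULL */
          pd_offline(&root);              /* async_refs now 0  */
          return 0;
  }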
-
-Reported-by: Holger Hoffstätte
-Reported-by: Guoqing Jiang
-Signed-off-by: Davide Ferrari
-Signed-off-by: Paolo Valente
----
- block/bfq-cgroup-included.c | 8 +++++---
- 1 file changed, 5 insertions(+), 3 deletions(-)
-
-diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c
-index 562b0ce581a7..45fefb2e2d57 100644
---- a/block/bfq-cgroup-included.c
-+++ b/block/bfq-cgroup-included.c
-@@ -885,13 +885,13 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
-
- entity = bfqg->my_entity;
-
-- if (!entity) /* root group */
-- return;
--
- #ifdef BFQ_MQ
- spin_lock_irqsave(&bfqd->lock, flags);
- #endif
-
-+ if (!entity) /* root group */
-+ goto put_async_queues;
-+
- /*
- * Empty all service_trees belonging to this group before
- * deactivating the group itself.
-@@ -926,6 +926,8 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
- BUG_ON(bfqg->sched_data.in_service_entity);
-
- __bfq_deactivate_entity(entity, false);
-+
-+put_async_queues:
- bfq_put_async_queues(bfqd, bfqg);
-
- #ifdef BFQ_MQ
-
-From 2dfbaaaf95054e2da3ededc0deb1ba5a4f589e53 Mon Sep 17 00:00:00 2001
-From: Paolo Valente
-Date: Mon, 8 Jan 2018 19:38:45 +0100
-Subject: [PATCH 13/23] bfq-sq, bfq-mq: release oom-queue ref to root group on
- exit
-
-On scheduler init, a reference to the root group and a reference to
-its corresponding blkg are taken for the oom queue. Yet these
-references are not released on scheduler exit, which prevents these
-objects from being freed. This commit adds the missing reference
-releases.
-
-Reported-by: Davide Ferrari
-Signed-off-by: Paolo Valente
----
- block/bfq-mq-iosched.c | 3 +++
- block/bfq-sq-iosched.c | 3 +++
- 2 files changed, 6 insertions(+)
-
-diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c
-index 63fdd16dec3c..b82c52fabf91 100644
---- a/block/bfq-mq-iosched.c
-+++ b/block/bfq-mq-iosched.c
-@@ -5507,6 +5507,9 @@ static void bfq_exit_queue(struct elevator_queue *e)
-
- BUG_ON(hrtimer_active(&bfqd->idle_slice_timer));
-
-+ /* release oom-queue reference to root group */
-+ bfqg_and_blkg_put(bfqd->root_group);
-+
- #ifdef BFQ_GROUP_IOSCHED_ENABLED
- blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
- #else
-diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c
-index 486493aafaf8..851af055664d 100644
---- a/block/bfq-sq-iosched.c
-+++ b/block/bfq-sq-iosched.c
-@@ -5052,6 +5052,9 @@ static void bfq_exit_queue(struct elevator_queue *e)
-
- BUG_ON(hrtimer_active(&bfqd->idle_slice_timer));
-
-+ /* release oom-queue reference to root group */
-+ bfqg_put(bfqd->root_group);
-+
- #ifdef BFQ_GROUP_IOSCHED_ENABLED
- blkcg_deactivate_policy(q, &blkcg_policy_bfq);
- #else
-
-From 13efe00c8292d78d223e1090a7f36426e360eb38 Mon Sep 17 00:00:00 2001
-From: Paolo Valente
-Date: Mon, 8 Jan 2018 19:40:38 +0100
-Subject: [PATCH 14/23] block, bfq-sq, bfq-mq: trace get and put of bfq groups
-
-Signed-off-by: Paolo Valente
----
- block/bfq-cgroup-included.c | 15 +++++++++++++++
- block/bfq-mq-iosched.c | 3 ++-
- 2 files changed, 17 insertions(+), 1 deletion(-)
-
-diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c
-index 45fefb2e2d57..f94743fb2e7d 100644
---- a/block/bfq-cgroup-included.c
-+++ b/block/bfq-cgroup-included.c
-@@ -267,6 +267,8 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
-
- static void bfqg_get(struct bfq_group *bfqg)
- {
-+ trace_printk("bfqg %p\n", bfqg);
-+
- #ifdef BFQ_MQ
- bfqg->ref++;
- #else
-@@ -280,6 +282,9 @@ static void bfqg_put(struct bfq_group *bfqg)
- bfqg->ref--;
-
- BUG_ON(bfqg->ref < 0);
-+ trace_printk("putting bfqg %p %s\n", bfqg,
-+ bfqg->ref == 0 ? "and freeing it" : "");
-+
- if (bfqg->ref == 0)
- kfree(bfqg);
- #else
-@@ -293,6 +298,7 @@ static void bfqg_and_blkg_get(struct bfq_group *bfqg)
- /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */
- bfqg_get(bfqg);
-
-+ trace_printk("getting blkg for bfqg %p\n", bfqg);
- blkg_get(bfqg_to_blkg(bfqg));
- }
-
-@@ -300,6 +306,7 @@ static void bfqg_and_blkg_put(struct bfq_group *bfqg)
- {
- bfqg_put(bfqg);
-
-+ trace_printk("putting blkg for bfqg %p\n", bfqg);
- blkg_put(bfqg_to_blkg(bfqg));
- }
- #endif
-@@ -382,6 +389,8 @@ static void bfq_init_entity(struct bfq_entity *entity,
- * Make sure that bfqg and its associated blkg do not
- * disappear before entity.
- */
-+ bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] getting bfqg %p and blkg\n", __func__, bfqg);
-+
- bfqg_and_blkg_get(bfqg);
- #else
- bfqg_get(bfqg);
-@@ -475,6 +484,7 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
- kfree(bfqg);
- return NULL;
- }
-+ trace_printk("bfqg %p\n", bfqg);
-
- #ifdef BFQ_MQ
- /* see comments in bfq_bic_update_cgroup for why refcounting */
-@@ -513,6 +523,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd)
- static void bfq_pd_free(struct blkg_policy_data *pd)
- {
- struct bfq_group *bfqg = pd_to_bfqg(pd);
-+ trace_printk("bfqg %p\n", bfqg);
-
- bfqg_stats_exit(&bfqg->stats);
- #ifdef BFQ_MQ
-@@ -650,6 +661,8 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
- }
- #ifdef BFQ_MQ
-+ bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] putting blkg and bfqg %p\n", __func__, bfqg);
-+
- bfqg_and_blkg_put(bfqq_group(bfqq));
- #else
- bfqg_put(bfqq_group(bfqq));
-@@ -658,6 +671,8 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- entity->parent = bfqg->my_entity;
- entity->sched_data = &bfqg->sched_data;
- #ifdef BFQ_MQ
-+ bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] getting blkg and bfqg %p\n", __func__, bfqg);
-+
- /* pin down bfqg and its associated blkg */
- bfqg_and_blkg_get(bfqg);
- #else
-diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c
-index b82c52fabf91..d5b7a6b985d7 100644
---- a/block/bfq-mq-iosched.c
-+++ b/block/bfq-mq-iosched.c
-@@ -4385,10 +4385,11 @@ static void bfq_put_queue(struct bfq_queue *bfqq)
- if (bfqq->bfqd)
- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
-
-- kmem_cache_free(bfq_pool, bfqq);
- #ifdef BFQ_GROUP_IOSCHED_ENABLED
-+ bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] putting blkg and bfqg %p\n", __func__, bfqg);
- bfqg_and_blkg_put(bfqg);
- #endif
-+ kmem_cache_free(bfq_pool, bfqq);
- }
-
- static void bfq_put_cooperator(struct bfq_queue *bfqq)
-
-From 816b77fba966171974eb5ee25d81bc4e19eaf1b4 Mon Sep 17 00:00:00 2001
-From: Paolo Valente
-Date: Wed, 10 Jan 2018 09:08:22 +0100
-Subject: [PATCH 15/23] bfq-sq, bfq-mq: compile group put for oom queue only if
- BFQ_GROUP_IOSCHED is set
-
-Commit ("bfq-sq, bfq-mq: release oom-queue ref to root group on exit")
-added a missing put of the root bfq group for the oom queue. That put
-has to be, and can be, performed only if CONFIG_BFQ_GROUP_IOSCHED is
-defined: the function doing the put is not even defined if
-CONFIG_BFQ_GROUP_IOSCHED is not defined. But that commit makes that
-put be invoked regardless of whether CONFIG_BFQ_GROUP_IOSCHED is
-defined. This commit fixes this mistake by making that invocation be
-compiled only if CONFIG_BFQ_GROUP_IOSCHED is actually defined.
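The whole fix is a preprocessor-guard move. Here is a tiny, compilable C illustration of the pattern; group_put() and the struct are stand-in names invented for this sketch, with only CONFIG_BFQ_GROUP_IOSCHED taken from the patch itself.

  #include <stdio.h>

  struct group { int refs; };

  #ifdef CONFIG_BFQ_GROUP_IOSCHED
  /* Like bfqg_put()/bfqg_and_blkg_put(), this helper only exists
   * when group scheduling is compiled in. */
  static void group_put(struct group *g) { g->refs--; }
  #endif

  static void exit_queue(struct group *root_group)
  {
  #ifdef CONFIG_BFQ_GROUP_IOSCHED
          /* release oom-queue reference to root group: moving the call
           * inside the guard keeps it out of builds in which group_put()
           * does not exist, which is exactly the compile error avoided */
          group_put(root_group);
  #endif
          printf("refs left: %d\n", root_group->refs);
  }

  int main(void)
  {
          struct group root = { 1 };
          exit_queue(&root);
          return 0;
  }

Built with -DCONFIG_BFQ_GROUP_IOSCHED the put runs; built without it, the call and the helper both vanish together.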
- -Fixes ("block, bfq: release oom-queue ref to root group on exit") -Reported-by: Jan Alexander Steffens -Signed-off-by: Paolo Valente ---- - block/bfq-mq-iosched.c | 2 +- - block/bfq-sq-iosched.c | 2 +- - 2 files changed, 2 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index d5b7a6b985d7..2581fe0f6f2f 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -5508,10 +5508,10 @@ static void bfq_exit_queue(struct elevator_queue *e) - - BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - /* release oom-queue reference to root group */ - bfqg_and_blkg_put(bfqd->root_group); - --#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); - #else - spin_lock_irq(&bfqd->lock); -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index 851af055664d..c4df156b1fb4 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -5052,10 +5052,10 @@ static void bfq_exit_queue(struct elevator_queue *e) - - BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - -+#ifdef BFQ_GROUP_IOSCHED_ENABLED - /* release oom-queue reference to root group */ - bfqg_put(bfqd->root_group); - --#ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_deactivate_policy(q, &blkcg_policy_bfq); - #else - bfq_put_async_queues(bfqd, bfqd->root_group); - -From 643a89c659172b2c9ae16adfe03af4e3e88e1326 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Sat, 13 Jan 2018 18:48:41 +0100 -Subject: [PATCH 16/23] block, bfq-sq, bfq-mq: remove trace_printks - -Commit ("block, bfq-sq, bfq-mq: trace get and put of bfq groups") -unwisely added some invocations of the function trace_printk, which -is inappropriate in production kernels. This commit removes those -invocations. - -Signed-off-by: Paolo Valente ---- - block/bfq-cgroup-included.c | 10 ---------- - 1 file changed, 10 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index f94743fb2e7d..a4f8a03edfc9 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -267,8 +267,6 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) - - static void bfqg_get(struct bfq_group *bfqg) - { -- trace_printk("bfqg %p\n", bfqg); -- - #ifdef BFQ_MQ - bfqg->ref++; - #else -@@ -282,9 +280,6 @@ static void bfqg_put(struct bfq_group *bfqg) - bfqg->ref--; - - BUG_ON(bfqg->ref < 0); -- trace_printk("putting bfqg %p %s\n", bfqg, -- bfqg->ref == 0 ? 
"and freeing it" : ""); -- - if (bfqg->ref == 0) - kfree(bfqg); - #else -@@ -298,7 +293,6 @@ static void bfqg_and_blkg_get(struct bfq_group *bfqg) - /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ - bfqg_get(bfqg); - -- trace_printk("getting blkg for bfqg %p\n", bfqg); - blkg_get(bfqg_to_blkg(bfqg)); - } - -@@ -306,7 +300,6 @@ static void bfqg_and_blkg_put(struct bfq_group *bfqg) - { - bfqg_put(bfqg); - -- trace_printk("putting blkg for bfqg %p\n", bfqg); - blkg_put(bfqg_to_blkg(bfqg)); - } - #endif -@@ -484,8 +477,6 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) - kfree(bfqg); - return NULL; - } -- trace_printk("bfqg %p\n", bfqg); -- - #ifdef BFQ_MQ - /* see comments in bfq_bic_update_cgroup for why refcounting */ - bfqg_get(bfqg); -@@ -523,7 +514,6 @@ static void bfq_pd_init(struct blkg_policy_data *pd) - static void bfq_pd_free(struct blkg_policy_data *pd) - { - struct bfq_group *bfqg = pd_to_bfqg(pd); -- trace_printk("bfqg %p\n", bfqg); - - bfqg_stats_exit(&bfqg->stats); - #ifdef BFQ_MQ - -From ce050275e24fecec800f346c09d9494563e9fc8a Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Mon, 15 Jan 2018 15:07:05 +0100 -Subject: [PATCH 17/23] block, bfq-mq: add requeue-request hook - -Commit 'a6a252e64914 ("blk-mq-sched: decide how to handle flush rq via -RQF_FLUSH_SEQ")' makes all non-flush re-prepared requests for a device -be re-inserted into the active I/O scheduler for that device. As a -consequence, I/O schedulers may get the same request inserted again, -even several times, without a finish_request invoked on that request -before each re-insertion. - -This fact is the cause of the failure reported in [1]. For an I/O -scheduler, every re-insertion of the same re-prepared request is -equivalent to the insertion of a new request. For schedulers like -mq-deadline or kyber, this fact causes no harm. In contrast, it -confuses a stateful scheduler like BFQ, which keeps state for an I/O -request, until the finish_request hook is invoked on the request. In -particular, BFQ may get stuck, waiting forever for the number of -request dispatches, of the same request, to be balanced by an equal -number of request completions (while there will be one completion for -that request). In this state, BFQ may refuse to serve I/O requests -from other bfq_queues. The hang reported in [1] then follows. - -However, the above re-prepared requests undergo a requeue, thus the -requeue_request hook of the active elevator is invoked for these -requests, if set. This commit then addresses the above issue by -properly implementing the hook requeue_request in BFQ. 
- -[1] https://marc.info/?l=linux-block&m=151211117608676 - -Reported-by: Ivan Kozik -Reported-by: Alban Browaeys -Signed-off-by: Paolo Valente -Signed-off-by: Serena Ziviani ---- - block/bfq-mq-iosched.c | 90 ++++++++++++++++++++++++++++++++++++++++---------- - 1 file changed, 73 insertions(+), 17 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 2581fe0f6f2f..bb7ccc2f1165 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4162,9 +4162,9 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - * TESTING: reset DISP_LIST flag, because: 1) - * this rq this request has passed through - * bfq_prepare_request, 2) then it will have -- * bfq_finish_request invoked on it, and 3) in -- * bfq_finish_request we use this flag to check -- * that bfq_finish_request is not invoked on -+ * bfq_finish_requeue_request invoked on it, and 3) in -+ * bfq_finish_requeue_request we use this flag to check -+ * that bfq_finish_requeue_request is not invoked on - * requests for which bfq_prepare_request has - * been invoked. - */ -@@ -4173,8 +4173,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - } - - /* -- * We exploit the bfq_finish_request hook to decrement -- * rq_in_driver, but bfq_finish_request will not be -+ * We exploit the bfq_finish_requeue_request hook to decrement -+ * rq_in_driver, but bfq_finish_requeue_request will not be - * invoked on this request. So, to avoid unbalance, - * just start this request, without incrementing - * rq_in_driver. As a negative consequence, -@@ -4183,10 +4183,10 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - * bfq_schedule_dispatch to be invoked uselessly. - * - * As for implementing an exact solution, the -- * bfq_finish_request hook, if defined, is probably -+ * bfq_finish_requeue_request hook, if defined, is probably - * invoked also on this request. So, by exploiting - * this hook, we could 1) increment rq_in_driver here, -- * and 2) decrement it in bfq_finish_request. Such a -+ * and 2) decrement it in bfq_finish_requeue_request. Such a - * solution would let the value of the counter be - * always accurate, but it would entail using an extra - * interface function. This cost seems higher than the -@@ -4878,6 +4878,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - return idle_timer_disabled; - } - -+static void bfq_prepare_request(struct request *rq, struct bio *bio); -+ - static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool at_head) - { -@@ -4919,6 +4921,20 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - BUG_ON(!(rq->rq_flags & RQF_GOT)); - rq->rq_flags &= ~RQF_GOT; - -+ if (!bfqq) { -+ /* -+ * This should never happen. Most likely rq is -+ * a requeued regular request, being -+ * re-inserted without being first -+ * re-prepared. Do a prepare, to avoid -+ * failure. 
-+ */ -+ pr_warn("Regular request associated with no queue"); -+ WARN_ON(1); -+ bfq_prepare_request(rq, rq->bio); -+ bfqq = RQ_BFQQ(rq); -+ } -+ - #if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - idle_timer_disabled = __bfq_insert_request(bfqd, rq); - /* -@@ -5110,7 +5126,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - } - } - --static void bfq_finish_request_body(struct bfq_queue *bfqq) -+static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "put_request_body: allocated %d", bfqq->allocated); -@@ -5120,7 +5136,13 @@ static void bfq_finish_request_body(struct bfq_queue *bfqq) - bfq_put_queue(bfqq); - } - --static void bfq_finish_request(struct request *rq) -+/* -+ * Handle either a requeue or a finish for rq. The things to do are -+ * the same in both cases: all references to rq are to be dropped. In -+ * particular, rq is considered completed from the point of view of -+ * the scheduler. -+ */ -+static void bfq_finish_requeue_request(struct request *rq) - { - struct bfq_queue *bfqq; - struct bfq_data *bfqd; -@@ -5128,11 +5150,27 @@ static void bfq_finish_request(struct request *rq) - - BUG_ON(!rq); - -- if (!rq->elv.icq) -+ bfqq = RQ_BFQQ(rq); -+ -+ /* -+ * Requeue and finish hooks are invoked in blk-mq without -+ * checking whether the involved request is actually still -+ * referenced in the scheduler. To handle this fact, the -+ * following two checks make this function exit in case of -+ * spurious invocations, for which there is nothing to do. -+ * -+ * First, check whether rq has nothing to do with an elevator. -+ */ -+ if (unlikely(!(rq->rq_flags & RQF_ELVPRIV))) - return; - -- bfqq = RQ_BFQQ(rq); -- BUG_ON(!bfqq); -+ /* -+ * rq either is not associated with any icq, or is an already -+ * requeued request that has not (yet) been re-inserted into -+ * a bfq_queue. -+ */ -+ if (!rq->elv.icq || !bfqq) -+ return; - - bic = RQ_BIC(rq); - BUG_ON(!bic); -@@ -5145,7 +5183,6 @@ static void bfq_finish_request(struct request *rq) - BUG(); - } - BUG_ON(rq->rq_flags & RQF_QUEUED); -- BUG_ON(!(rq->rq_flags & RQF_ELVPRIV)); - - bfq_log_bfqq(bfqd, bfqq, - "putting rq %p with %u sects left, STARTED %d", -@@ -5166,13 +5203,14 @@ static void bfq_finish_request(struct request *rq) - spin_lock_irqsave(&bfqd->lock, flags); - - bfq_completed_request(bfqq, bfqd); -- bfq_finish_request_body(bfqq); -+ bfq_finish_requeue_request_body(bfqq); - - spin_unlock_irqrestore(&bfqd->lock, flags); - } else { - /* - * Request rq may be still/already in the scheduler, -- * in which case we need to remove it. And we cannot -+ * in which case we need to remove it (this should -+ * never happen in case of requeue). And we cannot - * defer such a check and removal, to avoid - * inconsistencies in the time interval from the end - * of this function to the start of the deferred work. -@@ -5189,9 +5227,26 @@ static void bfq_finish_request(struct request *rq) - bfqg_stats_update_io_remove(bfqq_group(bfqq), - rq->cmd_flags); - } -- bfq_finish_request_body(bfqq); -+ bfq_finish_requeue_request_body(bfqq); - } - -+ /* -+ * Reset private fields. In case of a requeue, this allows -+ * this function to correctly do nothing if it is spuriously -+ * invoked again on this same request (see the check at the -+ * beginning of the function). 
Probably, a better general -+ * design would be to prevent blk-mq from invoking the requeue -+ * or finish hooks of an elevator, for a request that is not -+ * referred by that elevator. -+ * -+ * Resetting the following fields would break the -+ * request-insertion logic if rq is re-inserted into a bfq -+ * internal queue, without a re-preparation. Here we assume -+ * that re-insertions of requeued requests, without -+ * re-preparation, can happen only for pass_through or at_head -+ * requests (which are not re-inserted into bfq internal -+ * queues). -+ */ - rq->elv.priv[0] = NULL; - rq->elv.priv[1] = NULL; - } -@@ -5960,7 +6015,8 @@ static struct elevator_type iosched_bfq_mq = { - .ops.mq = { - .limit_depth = bfq_limit_depth, - .prepare_request = bfq_prepare_request, -- .finish_request = bfq_finish_request, -+ .requeue_request = bfq_finish_requeue_request, -+ .finish_request = bfq_finish_requeue_request, - .exit_icq = bfq_exit_icq, - .insert_requests = bfq_insert_requests, - .dispatch_request = bfq_dispatch_request, - -From 3e4f292191cc62b3844316b9741534c3f1b36f0a Mon Sep 17 00:00:00 2001 -From: Davide Paganelli -Date: Thu, 8 Feb 2018 12:19:24 +0100 -Subject: [PATCH 18/23] block, bfq-mq, bfq-sq: make log functions print names - of calling functions - -Add the macro __func__ as a parameter to the invocations of the functions -pr_crit, blk_add_trace_msg and blk_add_cgroup_trace_msg in bfq_log* -functions, in order to include automatically in the log messages -the names of the functions that call the log functions. -The programmer can then avoid doing it. - -Signed-off-by: Davide Paganelli -Signed-off-by: Paolo Valente ---- - block/bfq-cgroup-included.c | 9 +-- - block/bfq-mq-iosched.c | 167 ++++++++++++++++++++++---------------------- - block/bfq-mq.h | 33 ++++----- - block/bfq-sched.c | 54 +++++++------- - block/bfq-sq-iosched.c | 134 +++++++++++++++++------------------ - block/bfq.h | 33 ++++----- - 6 files changed, 214 insertions(+), 216 deletions(-) - -diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c -index a4f8a03edfc9..613f154e9da5 100644 ---- a/block/bfq-cgroup-included.c -+++ b/block/bfq-cgroup-included.c -@@ -382,7 +382,8 @@ static void bfq_init_entity(struct bfq_entity *entity, - * Make sure that bfqg and its associated blkg do not - * disappear before entity. 
- */ -- bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] getting bfqg %p and blkg\n", __func__, bfqg); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "getting bfqg %p and blkg\n", -+ bfqg); - - bfqg_and_blkg_get(bfqg); - #else -@@ -651,7 +652,7 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); - } - #ifdef BFQ_MQ -- bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] putting blkg and bfqg %p\n", __func__, bfqg); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "putting blkg and bfqg %p\n", bfqg); - - bfqg_and_blkg_put(bfqq_group(bfqq)); - #else -@@ -661,7 +662,7 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - entity->parent = bfqg->my_entity; - entity->sched_data = &bfqg->sched_data; - #ifdef BFQ_MQ -- bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] getting blkg and bfqg %p\n", __func__, bfqg); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "getting blkg and bfqg %p\n", bfqg); - - /* pin down bfqg and its associated blkg */ - bfqg_and_blkg_get(bfqg); -@@ -721,7 +722,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - if (entity->sched_data != &bfqg->sched_data) { - bic_set_bfqq(bic, NULL, 0); - bfq_log_bfqq(bfqd, async_bfqq, -- "bic_change_group: %p %d", -+ "%p %d", - async_bfqq, - async_bfqq->ref); - bfq_put_queue(async_bfqq); -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index bb7ccc2f1165..edc93b6af186 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -310,7 +310,7 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - static void bfq_schedule_dispatch(struct bfq_data *bfqd) - { - if (bfqd->queued != 0) { -- bfq_log(bfqd, "schedule dispatch"); -+ bfq_log(bfqd, ""); - blk_mq_run_hw_queues(bfqd->queue, true); - } - } -@@ -489,8 +489,8 @@ static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) - data->shallow_depth = - bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; - -- bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", -- __func__, bfqd->wr_busy_queues, op_is_sync(op), -+ bfq_log(bfqd, "wr_busy %d sync %d depth %u", -+ bfqd->wr_busy_queues, op_is_sync(op), - data->shallow_depth); - } - -@@ -528,7 +528,7 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, - if (rb_link) - *rb_link = p; - -- bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", -+ bfq_log(bfqd, "%llu: returning %d", - (unsigned long long) sector, - bfqq ? 
bfqq->pid : 0); - -@@ -749,7 +749,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq, - if (rq == last || ktime_get_ns() < rq->fifo_time) - return NULL; - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "returned %p", rq); - BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); - return rq; - } -@@ -842,7 +842,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, - bfq_serv_to_charge(next_rq, bfqq)); - if (entity->budget != new_budget) { - entity->budget = new_budget; -- bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", -+ bfq_log_bfqq(bfqd, bfqq, "new budget %lu", - new_budget); - bfq_requeue_bfqq(bfqd, bfqq, false); - } -@@ -915,8 +915,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", -- __func__, -+ "bic %p wr_coeff %d start_finish %lu max_time %lu", - bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, - bfqq->wr_cur_max_time); - -@@ -929,11 +928,11 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - bfq_wr_duration(bfqd))) { - switch_back_to_interactive_wr(bfqq, bfqd); - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "resume state: switching back to interactive"); -+ "switching back to interactive"); - } else { - bfqq->wr_coeff = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "resume state: switching off wr (%lu + %lu < %lu)", -+ "switching off wr (%lu + %lu < %lu)", - bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, - jiffies); - } -@@ -985,7 +984,7 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - /* Increment burst size to take into account also bfqq */ - bfqd->burst_size++; - -- bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); -+ bfq_log_bfqq(bfqd, bfqq, "%d", bfqd->burst_size); - - BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); - -@@ -998,7 +997,7 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - * other to consider this burst as large. - */ - bfqd->large_burst = true; -- bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); -+ bfq_log_bfqq(bfqd, bfqq, "large burst started"); - - /* - * We can now mark all queues in the burst list as -@@ -1170,7 +1169,7 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfqd->large_burst = false; - bfq_reset_burst_list(bfqd, bfqq); - bfq_log_bfqq(bfqd, bfqq, -- "handle_burst: late activation or different group"); -+ "late activation or different group"); - goto end; - } - -@@ -1180,7 +1179,7 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - * bfqq as belonging to this large burst immediately. - */ - if (bfqd->large_burst) { -- bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); -+ bfq_log_bfqq(bfqd, bfqq, "marked in burst"); - bfq_mark_bfqq_in_large_burst(bfqq); - goto end; - } -@@ -1686,7 +1685,7 @@ static void bfq_add_request(struct request *rq) - unsigned int old_wr_coeff = bfqq->wr_coeff; - bool interactive = false; - -- bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", -+ bfq_log_bfqq(bfqd, bfqq, "size %u %s", - blk_rq_sectors(rq), rq_is_sync(rq) ? 
"S" : "A"); - - if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ -@@ -1952,7 +1951,7 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, - __rq = bfq_find_rq_fmerge(bfqd, bio, q); - if (__rq && elv_bio_merge_ok(__rq, bio)) { - *req = __rq; -- bfq_log(bfqd, "request_merge: req %p", __rq); -+ bfq_log(bfqd, "req %p", __rq); - - return ELEVATOR_FRONT_MERGE; - } -@@ -1989,7 +1988,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, - bfqq->next_rq = next_rq; - - bfq_log_bfqq(bfqd, bfqq, -- "request_merged: req %p prev %p next_rq %p bfqq %p", -+ "req %p prev %p next_rq %p bfqq %p", - req, prev, next_rq, bfqq); - - /* -@@ -2018,7 +2017,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, - goto end; - - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "requests_merged: rq %p next %p bfqq %p next_bfqq %p", -+ "rq %p next %p bfqq %p next_bfqq %p", - rq, next, bfqq, next_bfqq); - - spin_lock_irq(&bfqq->bfqd->lock); -@@ -2069,10 +2068,10 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) - */ - bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "end_wr: wrais ending at %lu, rais_max_time %u", -+ "wrais ending at %lu, rais_max_time %u", - bfqq->last_wr_start_finish, - jiffies_to_msecs(bfqq->wr_cur_max_time)); -- bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "wr_busy %d", - bfqq->bfqd->wr_busy_queues); - } - -@@ -2245,8 +2244,8 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - { - if (bfq_too_late_for_merging(new_bfqq)) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "[%s] too late for bfq%d to be merged", -- __func__, new_bfqq->pid); -+ "too late for bfq%d to be merged", -+ new_bfqq->pid); - return false; - } - -@@ -2395,8 +2394,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - } - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", -- __func__, -+ "bic %p wr_coeff %d start_finish %lu max_time %lu", - bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, - bfqq->wr_cur_max_time); - } -@@ -2453,7 +2451,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - - } - -- bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", -+ bfq_log_bfqq(bfqd, new_bfqq, "wr_busy %d", - bfqd->wr_busy_queues); - - /* -@@ -2554,7 +2552,7 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd, - bfqq->budget_timeout = jiffies + - bfqd->bfq_timeout * timeout_coeff; - -- bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", -+ bfq_log_bfqq(bfqd, bfqq, "%u", - jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); - } - -@@ -2620,10 +2618,10 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, - - bfq_set_budget_timeout(bfqd, bfqq); - bfq_log_bfqq(bfqd, bfqq, -- "set_in_service_queue, cur-budget = %d", -+ "cur-budget = %d", - bfqq->entity.budget); - } else -- bfq_log(bfqd, "set_in_service_queue: NULL"); -+ bfq_log(bfqd, "NULL"); - - bfqd->in_service_queue = bfqq; - } -@@ -2746,7 +2744,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq - bfqd->peak_rate_samples = 0; /* full re-init on next disp. 
*/ - - bfq_log(bfqd, -- "reset_rate_computation at end, sample %u/%u tot_sects %llu", -+ "at end, sample %u/%u tot_sects %llu", - bfqd->peak_rate_samples, bfqd->sequential_samples, - bfqd->tot_sectors_dispatched); - } -@@ -2766,7 +2764,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || - bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { - bfq_log(bfqd, -- "update_rate_reset: only resetting, delta_first %lluus samples %d", -+ "only resetting, delta_first %lluus samples %d", - bfqd->delta_from_first>>10, bfqd->peak_rate_samples); - goto reset_computation; - } -@@ -2790,7 +2788,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - div_u64(bfqd->delta_from_first, NSEC_PER_USEC)); - - bfq_log(bfqd, --"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", -+"tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", - bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - rate > 20<peak_rate) || - rate > 20<peak_rate_samples, bfqd->sequential_samples, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); - goto reset_computation; - } else { - bfq_log(bfqd, -- "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu", -+ "do update, samples %u/%u rate/peak %llu/%llu", - bfqd->peak_rate_samples, bfqd->sequential_samples, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -@@ -2868,7 +2866,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - rate /= divisor; /* smoothing constant alpha = 1/divisor */ - - bfq_log(bfqd, -- "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u", -+ "divisor %d tmp_peak_rate %llu tmp_rate %u", - divisor, - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), - (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); -@@ -2922,7 +2920,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - - if (bfqd->peak_rate_samples == 0) { /* first dispatch */ - bfq_log(bfqd, -- "update_peak_rate: goto reset, samples %d", -+ "goto reset, samples %d", - bfqd->peak_rate_samples) ; - bfq_reset_rate_computation(bfqd, rq); - goto update_last_values; /* will add one sample */ -@@ -2943,7 +2941,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && - bfqd->rq_in_driver == 0) { - bfq_log(bfqd, --"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d", -+"jumping to updating&resetting delta_last %lluus samples %d", - (now_ns - bfqd->last_dispatch)>>10, - bfqd->peak_rate_samples) ; - goto update_rate_and_reset; -@@ -2969,7 +2967,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - bfqd->delta_from_first = now_ns - bfqd->first_dispatch; - - bfq_log(bfqd, -- "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus", -+ "added samples %u/%u tot_sects %llu delta_first %lluus", - bfqd->peak_rate_samples, bfqd->sequential_samples, - bfqd->tot_sectors_dispatched, - bfqd->delta_from_first>>10); -@@ -2985,12 +2983,12 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - bfqd->last_dispatch = now_ns; - - bfq_log(bfqd, -- "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu", -+ "delta_first %lluus last_pos %llu peak_rate %llu", - (now_ns - 
bfqd->first_dispatch)>>10, - (unsigned long long) bfqd->last_position, - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); - bfq_log(bfqd, -- "update_peak_rate: samples at end %d", bfqd->peak_rate_samples); -+ "samples at end %d", bfqd->peak_rate_samples); - } - - /* -@@ -3088,11 +3086,11 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - */ - budget = 2 * min_budget; - -- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", -+ bfq_log_bfqq(bfqd, bfqq, "last budg %d, budg left %d", - bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); -- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", -+ bfq_log_bfqq(bfqd, bfqq, "last max_budg %d, min budg %d", - budget, bfq_min_budget(bfqd)); -- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", -+ bfq_log_bfqq(bfqd, bfqq, "sync %d, seeky %d", - bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); - - if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { -@@ -3294,7 +3292,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - else /* charge at least one seek */ - *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; - -- bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs); -+ bfq_log(bfqd, "too short %u", delta_usecs); - - return slow; - } -@@ -3317,11 +3315,11 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * peak rate. - */ - slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; -- bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", -+ bfq_log(bfqd, "relative rate %d/%d", - bfqq->entity.service, bfqd->bfq_max_budget); - } - -- bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); -+ bfq_log_bfqq(bfqd, bfqq, "slow %d", slow); - - return slow; - } -@@ -3423,7 +3421,7 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqd, bfqq, --"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", -+"service_blkg %lu soft_rate %u sects/sec interval %u", - bfqq->service_from_backlogged, - bfqd->bfq_wr_max_softrt_rate, - jiffies_to_msecs(HZ * bfqq->service_from_backlogged / -@@ -3602,7 +3600,7 @@ static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) - static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "may_budget_timeout: wait_request %d left %d timeout %d", -+ "wait_request %d left %d timeout %d", - bfq_bfqq_wait_request(bfqq), - bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, - bfq_bfqq_budget_timeout(bfqq)); -@@ -3863,11 +3861,11 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - * either boosts the throughput (without issues), or is - * necessary to preserve service guarantees. 
- */ -- bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", -+ bfq_log_bfqq(bfqd, bfqq, "sync %d idling_boosts_thr %d", - bfq_bfqq_sync(bfqq), idling_boosts_thr); - - bfq_log_bfqq(bfqd, bfqq, -- "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", -+ "wr_busy %d boosts %d IO-bound %d guar %d", - bfqd->wr_busy_queues, - idling_boosts_thr_without_issues, - bfq_bfqq_IO_bound(bfqq), -@@ -3907,7 +3905,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - if (!bfqq) - goto new_queue; - -- bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); -+ bfq_log_bfqq(bfqd, bfqq, "already in-service queue"); - - if (bfq_may_expire_for_budg_timeout(bfqq) && - !bfq_bfqq_wait_request(bfqq) && -@@ -3983,14 +3981,14 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - new_queue: - bfqq = bfq_set_in_service_queue(bfqd); - if (bfqq) { -- bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); -+ bfq_log_bfqq(bfqd, bfqq, "checking new queue"); - goto check_queue; - } - keep_queue: - if (bfqq) -- bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); -+ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); - else -- bfq_log(bfqd, "select_queue: no queue returned"); -+ bfq_log(bfqd, "no queue returned"); - - return bfqq; - } -@@ -4043,8 +4041,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - /* see comments on max_service_from_wr */ - bfq_bfqq_end_wr(bfqq); - bfq_log_bfqq(bfqd, bfqq, -- "[%s] too much service", -- __func__); -+ "too much service"); - } - } - /* -@@ -4122,7 +4119,7 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) - { - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - -- bfq_log(bfqd, "has_work, dispatch_non_empty %d busy_queues %d", -+ bfq_log(bfqd, "dispatch_non_empty %d busy_queues %d", - !list_empty_careful(&bfqd->dispatch), bfqd->busy_queues > 0); - - /* -@@ -4146,7 +4143,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - rq->rq_flags &= ~RQF_DISP_LIST; - - bfq_log(bfqd, -- "dispatch requests: picked %p from dispatch list", rq); -+ "picked %p from dispatch list", rq); - bfqq = RQ_BFQQ(rq); - - if (bfqq) { -@@ -4196,7 +4193,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - goto start_rq; - } - -- bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); -+ bfq_log(bfqd, "%d busy queues", bfqd->busy_queues); - - if (bfqd->busy_queues == 0) - goto exit; -@@ -4236,13 +4233,13 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - rq->rq_flags |= RQF_STARTED; - if (bfqq) - bfq_log_bfqq(bfqd, bfqq, -- "dispatched %s request %p, rq_in_driver %d", -+ "%s request %p, rq_in_driver %d", - bfq_bfqq_sync(bfqq) ? 
"sync" : "async", - rq, - bfqd->rq_in_driver); - else - bfq_log(bfqd, -- "dispatched request %p from dispatch list, rq_in_driver %d", -+ "request %p from dispatch list, rq_in_driver %d", - rq, bfqd->rq_in_driver); - } else - bfq_log(bfqd, -@@ -4339,7 +4336,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - BUG_ON(bfqq->ref <= 0); - - if (bfqq->bfqd) -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", bfqq, bfqq->ref); - - bfqq->ref--; - if (bfqq->ref) -@@ -4383,10 +4380,10 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - } - - if (bfqq->bfqd) -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p freed", bfqq); - - #ifdef BFQ_GROUP_IOSCHED_ENABLED -- bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] putting blkg and bfqg %p\n", __func__, bfqg); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "putting blkg and bfqg %p\n", bfqg); - bfqg_and_blkg_put(bfqg); - #endif - kmem_cache_free(bfq_pool, bfqq); -@@ -4418,7 +4415,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_schedule_dispatch(bfqd); - } - -- bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "%p, %d", bfqq, bfqq->ref); - - bfq_put_cooperator(bfqq); - -@@ -4502,7 +4499,7 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, - bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); - bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "set_next_ioprio_data: bic_class %d prio %d class %d", -+ "bic_class %d prio %d class %d", - ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); - } - -@@ -4529,7 +4526,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) - bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); - bic_set_bfqq(bic, bfqq, false); - bfq_log_bfqq(bfqd, bfqq, -- "check_ioprio_change: bfqq %p %d", -+ "bfqq %p %d", - bfqq, bfqq->ref); - } - -@@ -4667,14 +4664,14 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - * guarantee that this queue is not freed - * until its group goes away. 
- */ -- bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", -+ bfq_log_bfqq(bfqd, bfqq, "bfqq not in async: %p, %d", - bfqq, bfqq->ref); - *async_bfqq = bfqq; - } - - out: - bfqq->ref++; /* get a process reference to this queue */ -- bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "at end: %p, %d", bfqq, bfqq->ref); - rcu_read_unlock(); - return bfqq; - } -@@ -4733,7 +4730,7 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, - bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) - has_short_ttime = false; - -- bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", -+ bfq_log_bfqq(bfqd, bfqq, "has_short_ttime %d", - has_short_ttime); - - if (has_short_ttime) -@@ -4759,7 +4756,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfq_update_io_seektime(bfqd, bfqq, rq); - - bfq_log_bfqq(bfqd, bfqq, -- "rq_enqueued: has_short_ttime=%d (seeky %d)", -+ "has_short_ttime=%d (seeky %d)", - bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); - - bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); -@@ -4818,7 +4815,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - - assert_spin_locked(&bfqd->lock); - -- bfq_log_bfqq(bfqd, bfqq, "__insert_req: rq %p bfqq %p", rq, bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "rq %p bfqq %p", rq, bfqq); - - /* - * An unplug may trigger a requeue of a request from the device -@@ -4837,9 +4834,9 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - new_bfqq->allocated++; - bfqq->allocated--; - bfq_log_bfqq(bfqd, bfqq, -- "insert_request: new allocated %d", bfqq->allocated); -+ "new allocated %d", bfqq->allocated); - bfq_log_bfqq(bfqd, new_bfqq, -- "insert_request: new_bfqq new allocated %d", -+ "new_bfqq new allocated %d", - bfqq->allocated); - - new_bfqq->ref++; -@@ -4911,11 +4908,11 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - rq->rq_flags |= RQF_DISP_LIST; - if (bfqq) - bfq_log_bfqq(bfqd, bfqq, -- "insert_request %p in disp: at_head %d", -+ "%p in disp: at_head %d", - rq, at_head); - else - bfq_log(bfqd, -- "insert_request %p in disp: at_head %d", -+ "%p in disp: at_head %d", - rq, at_head); - } else { - BUG_ON(!(rq->rq_flags & RQF_GOT)); -@@ -5033,7 +5030,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - bfqq->dispatched--; - - bfq_log_bfqq(bfqd, bfqq, -- "completed_requests: new disp %d, new rq_in_driver %d", -+ "new disp %d, new rq_in_driver %d", - bfqq->dispatched, bfqd->rq_in_driver); - - if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { -@@ -5061,7 +5058,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); - - bfq_log_bfqq(bfqd, bfqq, -- "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", -+ "delta %uus/%luus max_size %u rate %llu/%llu", - delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, - delta_us > 0 ? 
- (USEC_PER_SEC* -@@ -5129,7 +5126,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) - static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "put_request_body: allocated %d", bfqq->allocated); -+ "allocated %d", bfqq->allocated); - BUG_ON(!bfqq->allocated); - bfqq->allocated--; - -@@ -5406,10 +5403,10 @@ static void bfq_prepare_request(struct request *rq, struct bio *bio) - - bfqq->allocated++; - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "get_request: new allocated %d", bfqq->allocated); -+ "new allocated %d", bfqq->allocated); - - bfqq->ref++; -- bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", rq, bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "%p: bfqq %p, %d", rq, bfqq, bfqq->ref); - - rq->elv.priv[0] = bic; - rq->elv.priv[1] = bfqq; -@@ -5493,7 +5490,7 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) - idle_slice_timer); - struct bfq_queue *bfqq = bfqd->in_service_queue; - -- bfq_log(bfqd, "slice_timer expired"); -+ bfq_log(bfqd, "expired"); - - /* - * Theoretical race here: the in-service queue can be NULL or -@@ -5515,10 +5512,10 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, - struct bfq_group *root_group = bfqd->root_group; - struct bfq_queue *bfqq = *bfqq_ptr; - -- bfq_log(bfqd, "put_async_bfqq: %p", bfqq); -+ bfq_log(bfqd, "%p", bfqq); - if (bfqq) { - bfq_bfqq_move(bfqd, bfqq, root_group); -- bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", -+ bfq_log_bfqq(bfqd, bfqq, "putting %p, %d", - bfqq, bfqq->ref); - bfq_put_queue(bfqq); - *bfqq_ptr = NULL; -@@ -5547,7 +5544,7 @@ static void bfq_exit_queue(struct elevator_queue *e) - struct bfq_data *bfqd = e->elevator_data; - struct bfq_queue *bfqq, *n; - -- bfq_log(bfqd, "exit_queue: starting ..."); -+ bfq_log(bfqd, "starting ..."); - - hrtimer_cancel(&bfqd->idle_slice_timer); - -@@ -5575,7 +5572,7 @@ static void bfq_exit_queue(struct elevator_queue *e) - spin_unlock_irq(&bfqd->lock); - #endif - -- bfq_log(bfqd, "exit_queue: finished ..."); -+ bfq_log(bfqd, "finished ..."); - kfree(bfqd); - } - -diff --git a/block/bfq-mq.h b/block/bfq-mq.h -index 9a5ce1168ff5..e2ae11bf8f76 100644 ---- a/block/bfq-mq.h -+++ b/block/bfq-mq.h -@@ -712,34 +712,34 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -- pr_crit("%s bfq%d%c %s " fmt "\n", \ -+ pr_crit("%s bfq%d%c %s [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- bfqq_group(bfqq)->blkg_path, ##args); \ -+ bfqq_group(bfqq)->blkg_path, __func__, ##args); \ - } while (0) - - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -- pr_crit("%s %s " fmt "\n", \ -+ pr_crit("%s %s [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -- bfqg->blkg_path, ##args); \ -+ bfqg->blkg_path, __func__, ##args); \ - } while (0) - - #else /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -- pr_crit("%s bfq%d%c " fmt "\n", \ -+ pr_crit("%s bfq%d%c [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ - (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- ##args) -+ __func__, ##args) - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) - - #endif /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log(bfqd, fmt, args...) 
\ -- pr_crit("%s bfq " fmt "\n", \ -+ pr_crit("%s bfq [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -- ##args) -+ __func__, ##args) - - #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ - -@@ -762,28 +762,29 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); - static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -- blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s [%s] " fmt, \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- bfqq_group(bfqq)->blkg_path, ##args); \ -+ bfqq_group(bfqq)->blkg_path, __func__, ##args); \ - } while (0) - - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -- blk_add_trace_msg((bfqd)->queue, "%s " fmt, bfqg->blkg_path, ##args);\ -+ blk_add_trace_msg((bfqd)->queue, "%s [%s] " fmt, bfqg->blkg_path, \ -+ __func__, ##args);\ - } while (0) - - #else /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -- blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c [%s] " fmt, (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- ##args) -+ __func__, ##args) - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) - - #endif /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log(bfqd, fmt, args...) \ -- blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+ blk_add_trace_msg((bfqd)->queue, "bfq [%s] " fmt, __func__, ##args) - - #endif /* CONFIG_BLK_DEV_IO_TRACE */ - #endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -@@ -938,7 +939,7 @@ bfq_entity_service_tree(struct bfq_entity *entity) - - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "entity_service_tree %p %d", -+ "%p %d", - sched_data->service_tree + idx, idx); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - else { -@@ -946,7 +947,7 @@ bfq_entity_service_tree(struct bfq_entity *entity) - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "entity_service_tree %p %d", -+ "%p %d", - sched_data->service_tree + idx, idx); - } - #endif -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 4e6c5232e2fb..ead34c30a7c2 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -119,7 +119,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "update_next_in_service: chose without lookup"); -+ "chose without lookup"); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = -@@ -127,7 +127,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data*)bfqg->bfqd, bfqg, -- "update_next_in_service: chose without lookup"); -+ "chose without lookup"); - } - #endif - } -@@ -148,7 +148,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - bfqq = bfq_entity_to_bfqq(next_in_service); - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "update_next_in_service: chosen this queue"); -+ "chosen this queue"); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - else { - struct bfq_group *bfqg = -@@ -156,7 +156,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "update_next_in_service: chosen this entity"); -+ "chosen this entity"); - } - #endif - return parent_sched_may_change; -@@ -331,10 +331,10 @@ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) - - if 
(bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "calc_finish: serv %lu, w %d", -+ "serv %lu, w %d", - service, entity->weight); - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "calc_finish: start %llu, finish %llu, delta %llu", -+ "start %llu, finish %llu, delta %llu", - start, finish, delta); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { -@@ -342,10 +342,10 @@ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "calc_finish group: serv %lu, w %d", -+ "group: serv %lu, w %d", - service, entity->weight); - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "calc_finish group: start %llu, finish %llu, delta %llu", -+ "group: start %llu, finish %llu, delta %llu", - start, finish, delta); - #endif - } -@@ -484,7 +484,7 @@ static void bfq_update_active_node(struct rb_node *node) - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "update_active_node: new min_start %llu", -+ "new min_start %llu", - ((entity->min_start>>10)*1000)>>12); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { -@@ -492,7 +492,7 @@ static void bfq_update_active_node(struct rb_node *node) - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "update_active_node: new min_start %llu", -+ "new min_start %llu", - ((entity->min_start>>10)*1000)>>12); - #endif - } -@@ -620,7 +620,7 @@ static void bfq_get_entity(struct bfq_entity *entity) - - if (bfqq) { - bfqq->ref++; -- bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", - bfqq, bfqq->ref); - } - } -@@ -748,7 +748,7 @@ static void bfq_forget_entity(struct bfq_service_tree *st, - entity->on_st = false; - st->wsum -= entity->weight; - if (bfqq && !is_in_service) { -- bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity (before): %p %d", -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "(before): %p %d", - bfqq, bfqq->ref); - bfq_put_queue(bfqq); - } -@@ -1008,7 +1008,7 @@ static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, - tot_serv_to_charge = entity->service; - - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "charge_time: %lu/%u ms, %d/%d/%d sectors", -+ "%lu/%u ms, %d/%d/%d sectors", - time_ms, timeout_ms, entity->service, - tot_serv_to_charge, entity->budget); - -@@ -1080,7 +1080,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "update_fin_time_enqueue: new queue finish %llu", -+ "new queue finish %llu", - ((entity->finish>>10)*1000)>>12); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { -@@ -1088,7 +1088,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "update_fin_time_enqueue: new group finish %llu", -+ "new group finish %llu", - ((entity->finish>>10)*1000)>>12); - #endif - } -@@ -1098,7 +1098,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "update_fin_time_enqueue: queue %seligible in st %p", -+ "queue %seligible in st %p", - entity->start <= st->vtime ? 
"" : "non ", st); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - } else { -@@ -1106,7 +1106,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "update_fin_time_enqueue: group %seligible in st %p", -+ "group %seligible in st %p", - entity->start <= st->vtime ? "" : "non ", st); - #endif - } -@@ -1550,7 +1550,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) - - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "calc_vtime_jump: new value %llu", -+ "new value %llu", - ((root_entity->min_start>>10)*1000)>>12); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - else { -@@ -1559,7 +1559,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) - entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "calc_vtime_jump: new value %llu", -+ "new value %llu", - ((root_entity->min_start>>10)*1000)>>12); - } - #endif -@@ -1677,7 +1677,7 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service) - bfqq = bfq_entity_to_bfqq(entity); - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "__lookup_next: start %llu vtime %llu st %p", -+ "start %llu vtime %llu st %p", - ((entity->start>>10)*1000)>>12, - ((new_vtime>>10)*1000)>>12, st); - #ifdef BFQ_GROUP_IOSCHED_ENABLED -@@ -1686,7 +1686,7 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service) - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "__lookup_next: start %llu vtime %llu (%llu) st %p", -+ "start %llu vtime %llu (%llu) st %p", - ((entity->start>>10)*1000)>>12, - ((st->vtime>>10)*1000)>>12, - ((new_vtime>>10)*1000)>>12, st); -@@ -1821,14 +1821,14 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg(bfqd, bfqg, -- "get_next_queue: lookup in this group"); -+ "lookup in this group"); - if (!sd->next_in_service) -- pr_crit("get_next_queue: lookup in this group"); -+ pr_crit("lookup in this group"); - } else { - bfq_log_bfqg(bfqd, bfqd->root_group, -- "get_next_queue: lookup in root group"); -+ "lookup in root group"); - if (!sd->next_in_service) -- pr_crit("get_next_queue: lookup in root group"); -+ pr_crit("lookup in root group"); - } - #endif - -@@ -1903,7 +1903,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - bfqq = bfq_entity_to_bfqq(entity); - if (bfqq) - bfq_log_bfqq(bfqd, bfqq, -- "get_next_queue: this queue, finish %llu", -+ "this queue, finish %llu", - (((entity->finish>>10)*1000)>>10)>>2); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - else { -@@ -1911,7 +1911,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg(bfqd, bfqg, -- "get_next_queue: this entity, finish %llu", -+ "this entity, finish %llu", - (((entity->finish>>10)*1000)>>10)>>2); - } - #endif -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index c4df156b1fb4..e49e8ac882b3 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -281,7 +281,7 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd); - static void bfq_schedule_dispatch(struct bfq_data *bfqd) - { - if (bfqd->queued != 0) { -- bfq_log(bfqd, "schedule dispatch"); -+ bfq_log(bfqd, ""); - kblockd_schedule_work(&bfqd->unplug_work); - } - } -@@ -414,7 +414,7 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, - if (rb_link) - *rb_link = p; - -- bfq_log(bfqd, "rq_pos_tree_lookup 
%llu: returning %d", -+ bfq_log(bfqd, "%llu: returning %d", - (unsigned long long) sector, - bfqq ? bfqq->pid : 0); - -@@ -635,7 +635,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq, - if (rq == last || ktime_get_ns() < rq->fifo_time) - return NULL; - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "returned %p", rq); - BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); - return rq; - } -@@ -728,7 +728,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, - bfq_serv_to_charge(next_rq, bfqq)); - if (entity->budget != new_budget) { - entity->budget = new_budget; -- bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", -+ bfq_log_bfqq(bfqd, bfqq, "new budget %lu", - new_budget); - bfq_requeue_bfqq(bfqd, bfqq, false); - } -@@ -800,8 +800,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", -- __func__, -+ "bic %p wr_coeff %d start_finish %lu max_time %lu", - bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, - bfqq->wr_cur_max_time); - -@@ -814,11 +813,11 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, - bfq_wr_duration(bfqd))) { - switch_back_to_interactive_wr(bfqq, bfqd); - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "resume state: switching back to interactive"); -+ "switching back to interactive"); - } else { - bfqq->wr_coeff = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "resume state: switching off wr (%lu + %lu < %lu)", -+ "switching off wr (%lu + %lu < %lu)", - bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, - jiffies); - } -@@ -870,7 +869,7 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - /* Increment burst size to take into account also bfqq */ - bfqd->burst_size++; - -- bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); -+ bfq_log_bfqq(bfqd, bfqq, "%d", bfqd->burst_size); - - BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); - -@@ -883,7 +882,7 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - * other to consider this burst as large. - */ - bfqd->large_burst = true; -- bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); -+ bfq_log_bfqq(bfqd, bfqq, "large burst started"); - - /* - * We can now mark all queues in the burst list as -@@ -1055,7 +1054,7 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfqd->large_burst = false; - bfq_reset_burst_list(bfqd, bfqq); - bfq_log_bfqq(bfqd, bfqq, -- "handle_burst: late activation or different group"); -+ "late activation or different group"); - goto end; - } - -@@ -1065,7 +1064,7 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - * bfqq as belonging to this large burst immediately. - */ - if (bfqd->large_burst) { -- bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); -+ bfq_log_bfqq(bfqd, bfqq, "marked in burst"); - bfq_mark_bfqq_in_large_burst(bfqq); - goto end; - } -@@ -1572,7 +1571,7 @@ static void bfq_add_request(struct request *rq) - unsigned int old_wr_coeff = bfqq->wr_coeff; - bool interactive = false; - -- bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", -+ bfq_log_bfqq(bfqd, bfqq, "size %u %s", - blk_rq_sectors(rq), rq_is_sync(rq) ? 
"S" : "A"); - - if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ -@@ -1870,10 +1869,10 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) - */ - bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "end_wr: wrais ending at %lu, rais_max_time %u", -+ "wrais ending at %lu, rais_max_time %u", - bfqq->last_wr_start_finish, - jiffies_to_msecs(bfqq->wr_cur_max_time)); -- bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "wr_busy %d", - bfqq->bfqd->wr_busy_queues); - } - -@@ -2048,8 +2047,8 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - { - if (bfq_too_late_for_merging(new_bfqq)) { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "[%s] too late for bfq%d to be merged", -- __func__, new_bfqq->pid); -+ "too late for bfq%d to be merged", -+ new_bfqq->pid); - return false; - } - -@@ -2258,7 +2257,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - - } - -- bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", -+ bfq_log_bfqq(bfqd, new_bfqq, "wr_busy %d", - bfqd->wr_busy_queues); - - /* -@@ -2359,7 +2358,7 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd, - bfqq->budget_timeout = jiffies + - bfqd->bfq_timeout * timeout_coeff; - -- bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", -+ bfq_log_bfqq(bfqd, bfqq, "%u", - jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); - } - -@@ -2427,10 +2426,10 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, - - bfq_set_budget_timeout(bfqd, bfqq); - bfq_log_bfqq(bfqd, bfqq, -- "set_in_service_queue, cur-budget = %d", -+ "cur-budget = %d", - bfqq->entity.budget); - } else -- bfq_log(bfqd, "set_in_service_queue: NULL"); -+ bfq_log(bfqd, "NULL"); - - bfqd->in_service_queue = bfqq; - } -@@ -2559,7 +2558,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq - bfqd->peak_rate_samples = 0; /* full re-init on next disp. 
*/ - - bfq_log(bfqd, -- "reset_rate_computation at end, sample %u/%u tot_sects %llu", -+ "at end, sample %u/%u tot_sects %llu", - bfqd->peak_rate_samples, bfqd->sequential_samples, - bfqd->tot_sectors_dispatched); - } - -@@ -2579,7 +2578,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || - bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { - bfq_log(bfqd, -- "update_rate_reset: only resetting, delta_first %lluus samples %d", -+ "only resetting, delta_first %lluus samples %d", - bfqd->delta_from_first>>10, bfqd->peak_rate_samples); - goto reset_computation; - } -@@ -2603,7 +2602,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - div_u64(bfqd->delta_from_first, NSEC_PER_USEC)); - - bfq_log(bfqd, --"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", -+"tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", - bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - rate > 20<<BFQ_RATE_SHIFT); -@@ -2618,14 +2617,14 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - rate <= bfqd->peak_rate) || - rate > 20<<BFQ_RATE_SHIFT) { - bfq_log(bfqd, -- "update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu", -+ "goto reset, samples %u/%u rate/peak %llu/%llu", - bfqd->peak_rate_samples, bfqd->sequential_samples, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); - goto reset_computation; - } else { - bfq_log(bfqd, -- "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu", -+ "do update, samples %u/%u rate/peak %llu/%llu", - bfqd->peak_rate_samples, bfqd->sequential_samples, - ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -@@ -2681,7 +2680,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - rate /= divisor; /* smoothing constant alpha = 1/divisor */ - - bfq_log(bfqd, -- "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u", -+ "divisor %d tmp_peak_rate %llu tmp_rate %u", - divisor, - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), - (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); -@@ -2735,7 +2734,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - - if (bfqd->peak_rate_samples == 0) { /* first dispatch */ - bfq_log(bfqd, -- "update_peak_rate: goto reset, samples %d", -+ "goto reset, samples %d", - bfqd->peak_rate_samples) ; - bfq_reset_rate_computation(bfqd, rq); - goto update_last_values; /* will add one sample */ -@@ -2756,7 +2755,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && - bfqd->rq_in_driver == 0) { - bfq_log(bfqd, --"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d", -+"jumping to updating&resetting delta_last %lluus samples %d", - (now_ns - bfqd->last_dispatch)>>10, - bfqd->peak_rate_samples) ; - goto update_rate_and_reset; -@@ -2782,7 +2781,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - bfqd->delta_from_first = now_ns - bfqd->first_dispatch; - - bfq_log(bfqd, -- "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus", -+ "added samples %u/%u tot_sects %llu delta_first %lluus", - bfqd->peak_rate_samples, bfqd->sequential_samples, - bfqd->tot_sectors_dispatched, - bfqd->delta_from_first>>10); -@@ -2798,12 +2797,12 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) - bfqd->last_dispatch = now_ns; - - bfq_log(bfqd, -- "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu", -+ "delta_first %lluus last_pos %llu peak_rate %llu", - (now_ns -
bfqd->first_dispatch)>>10, - (unsigned long long) bfqd->last_position, - ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); - bfq_log(bfqd, -- "update_peak_rate: samples at end %d", bfqd->peak_rate_samples); -+ "samples at end %d", bfqd->peak_rate_samples); - } - - /* -@@ -2900,11 +2899,11 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - */ - budget = 2 * min_budget; - -- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", -+ bfq_log_bfqq(bfqd, bfqq, "last budg %d, budg left %d", - bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); -- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", -+ bfq_log_bfqq(bfqd, bfqq, "last max_budg %d, min budg %d", - budget, bfq_min_budget(bfqd)); -- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", -+ bfq_log_bfqq(bfqd, bfqq, "sync %d, seeky %d", - bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); - - if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { -@@ -3106,7 +3105,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - else /* charge at least one seek */ - *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; - -- bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs); -+ bfq_log(bfqd, "too short %u", delta_usecs); - - return slow; - } -@@ -3129,11 +3128,11 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * peak rate. - */ - slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; -- bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", -+ bfq_log(bfqd, "relative rate %d/%d", - bfqq->entity.service, bfqd->bfq_max_budget); - } - -- bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); -+ bfq_log_bfqq(bfqd, bfqq, "slow %d", slow); - - return slow; - } -@@ -3235,7 +3234,7 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqd, bfqq, --"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", -+"service_blkg %lu soft_rate %u sects/sec interval %u", - bfqq->service_from_backlogged, - bfqd->bfq_wr_max_softrt_rate, - jiffies_to_msecs(HZ * bfqq->service_from_backlogged / -@@ -3414,7 +3413,7 @@ static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) - static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "may_budget_timeout: wait_request %d left %d timeout %d", -+ "wait_request %d left %d timeout %d", - bfq_bfqq_wait_request(bfqq), - bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, - bfq_bfqq_budget_timeout(bfqq)); -@@ -3675,11 +3674,11 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - * either boosts the throughput (without issues), or is - * necessary to preserve service guarantees. 
- */ -- bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", -+ bfq_log_bfqq(bfqd, bfqq, "sync %d idling_boosts_thr %d", - bfq_bfqq_sync(bfqq), idling_boosts_thr); - - bfq_log_bfqq(bfqd, bfqq, -- "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", -+ "wr_busy %d boosts %d IO-bound %d guar %d", - bfqd->wr_busy_queues, - idling_boosts_thr_without_issues, - bfq_bfqq_IO_bound(bfqq), -@@ -3719,7 +3718,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - if (!bfqq) - goto new_queue; - -- bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); -+ bfq_log_bfqq(bfqd, bfqq, "already in-service queue"); - - if (bfq_may_expire_for_budg_timeout(bfqq) && - !hrtimer_active(&bfqd->idle_slice_timer) && -@@ -3797,14 +3796,14 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - new_queue: - bfqq = bfq_set_in_service_queue(bfqd); - if (bfqq) { -- bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); -+ bfq_log_bfqq(bfqd, bfqq, "checking new queue"); - goto check_queue; - } - keep_queue: - if (bfqq) -- bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); -+ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); - else -- bfq_log(bfqd, "select_queue: no queue returned"); -+ bfq_log(bfqd, "no queue returned"); - - return bfqq; - } -@@ -3857,8 +3856,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - /* see comments on max_service_from_wr */ - bfq_bfqq_end_wr(bfqq); - bfq_log_bfqq(bfqd, bfqq, -- "[%s] too much service", -- __func__); -+ "too much service"); - } - } - /* -@@ -3987,7 +3985,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq; - -- bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); -+ bfq_log(bfqd, "%d busy queues", bfqd->busy_queues); - - if (bfqd->busy_queues == 0) - return 0; -@@ -4021,7 +4019,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) - if (!bfq_dispatch_request(bfqd, bfqq)) - return 0; - -- bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", -+ bfq_log_bfqq(bfqd, bfqq, "%s request", - bfq_bfqq_sync(bfqq) ? 
"sync" : "async"); - - BUG_ON(bfqq->next_rq == NULL && -@@ -4044,7 +4042,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - - BUG_ON(bfqq->ref <= 0); - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", bfqq, bfqq->ref); - bfqq->ref--; - if (bfqq->ref) - return; -@@ -4086,7 +4084,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - bfqq->bfqd->burst_size--; - } - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p freed", bfqq); - - kmem_cache_free(bfq_pool, bfqq); - #ifdef BFQ_GROUP_IOSCHED_ENABLED -@@ -4120,7 +4118,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_schedule_dispatch(bfqd); - } - -- bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "%p, %d", bfqq, bfqq->ref); - - bfq_put_cooperator(bfqq); - -@@ -4200,7 +4198,7 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, - bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); - bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "set_next_ioprio_data: bic_class %d prio %d class %d", -+ "bic_class %d prio %d class %d", - ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); - } - -@@ -4227,7 +4225,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) - bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); - bic_set_bfqq(bic, bfqq, false); - bfq_log_bfqq(bfqd, bfqq, -- "check_ioprio_change: bfqq %p %d", -+ "bfqq %p %d", - bfqq, bfqq->ref); - } - -@@ -4362,14 +4360,14 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, - * guarantee that this queue is not freed - * until its group goes away. - */ -- bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", -+ bfq_log_bfqq(bfqd, bfqq, "bfqq not in async: %p, %d", - bfqq, bfqq->ref); - *async_bfqq = bfqq; - } - - out: - bfqq->ref++; /* get a process reference to this queue */ -- bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "at end: %p, %d", bfqq, bfqq->ref); - rcu_read_unlock(); - return bfqq; - } -@@ -4428,7 +4426,7 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, - bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) - has_short_ttime = false; - -- bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", -+ bfq_log_bfqq(bfqd, bfqq, "has_short_ttime %d", - has_short_ttime); - - if (has_short_ttime) -@@ -4454,7 +4452,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfq_update_io_seektime(bfqd, bfqq, rq); - - bfq_log_bfqq(bfqd, bfqq, -- "rq_enqueued: has_short_ttime=%d (seeky %d)", -+ "has_short_ttime=%d (seeky %d)", - bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); - - bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); -@@ -4629,7 +4627,7 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) - */ - delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); - -- bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", -+ bfq_log(bfqd, "delta %uus/%luus max_size %u rate %llu/%llu", - delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, - delta_us > 0 ? 
- (USEC_PER_SEC* -@@ -4750,7 +4748,7 @@ static void bfq_put_request(struct request *rq) - rq->elv.priv[0] = NULL; - rq->elv.priv[1] = NULL; - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p, %d", - bfqq, bfqq->ref); - bfq_put_queue(bfqq); - } -@@ -4816,7 +4814,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - bic_set_bfqq(bic, bfqq, is_sync); - if (split && is_sync) { - bfq_log_bfqq(bfqd, bfqq, -- "set_request: was_in_list %d " -+ "was_in_list %d " - "was_in_large_burst %d " - "large burst in progress %d", - bic->was_in_burst_list, -@@ -4826,12 +4824,12 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - if ((bic->was_in_burst_list && bfqd->large_burst) || - bic->saved_in_large_burst) { - bfq_log_bfqq(bfqd, bfqq, -- "set_request: marking in " -+ "marking in " - "large burst"); - bfq_mark_bfqq_in_large_burst(bfqq); - } else { - bfq_log_bfqq(bfqd, bfqq, -- "set_request: clearing in " -+ "clearing in " - "large burst"); - bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) -@@ -4888,7 +4886,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - - bfqq->allocated[rw]++; - bfqq->ref++; -- bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "bfqq %p, %d", bfqq, bfqq->ref); - - rq->elv.priv[0] = bic; - rq->elv.priv[1] = bfqq; -@@ -4962,7 +4960,7 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) - * case we just expire a queue too early. - */ - if (bfqq) { -- bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); -+ bfq_log_bfqq(bfqd, bfqq, "expired"); - bfq_clear_bfqq_wait_request(bfqq); - - if (bfq_bfqq_budget_timeout(bfqq)) -@@ -5005,10 +5003,10 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, - struct bfq_group *root_group = bfqd->root_group; - struct bfq_queue *bfqq = *bfqq_ptr; - -- bfq_log(bfqd, "put_async_bfqq: %p", bfqq); -+ bfq_log(bfqd, "%p", bfqq); - if (bfqq) { - bfq_bfqq_move(bfqd, bfqq, root_group); -- bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", -+ bfq_log_bfqq(bfqd, bfqq, "putting %p, %d", - bfqq, bfqq->ref); - bfq_put_queue(bfqq); - *bfqq_ptr = NULL; -diff --git a/block/bfq.h b/block/bfq.h -index 0cd7a3f251a7..4d2fe7f77af1 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -698,37 +698,37 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - \ - assert_spin_locked((bfqd)->queue->queue_lock); \ - blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -- pr_crit("%s bfq%d%c %s " fmt "\n", \ -+ pr_crit("%s bfq%d%c %s [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- __pbuf, ##args); \ -+ __pbuf, __func__, ##args); \ - } while (0) - - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -- pr_crit("%s %s " fmt "\n", \ -+ pr_crit("%s %s [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -- __pbuf, ##args); \ -+ __pbuf, __func__, ##args); \ - } while (0) - - #else /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -- pr_crit("%s bfq%d%c " fmt "\n", \ -+ pr_crit("%s bfq%d%c [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ - (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- ##args) -+ __func__, ##args) - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) 
do {} while (0) - - #endif /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log(bfqd, fmt, args...) \ -- pr_crit("%s bfq " fmt "\n", \ -+ pr_crit("%s bfq [%s] " fmt "\n", \ - checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ -- ##args) -+ __func__, ##args) - - #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ - -@@ -755,31 +755,32 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); - \ - assert_spin_locked((bfqd)->queue->queue_lock); \ - blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -- blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s [%s] " fmt, \ - (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- __pbuf, ##args); \ -+ __pbuf, __func__, ##args); \ - } while (0) - - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -- blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ -+ blk_add_trace_msg((bfqd)->queue, "%s [%s] " fmt, __pbuf, \ -+ __func__, ##args); \ - } while (0) - - #else /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -- blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c [%s] " fmt, (bfqq)->pid, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -- ##args) -+ __func__, ##args) - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) - - #endif /* BFQ_GROUP_IOSCHED_ENABLED */ - - #define bfq_log(bfqd, fmt, args...) \ -- blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+ blk_add_trace_msg((bfqd)->queue, "bfq [%s] " fmt, __func__, ##args) - - #endif /* CONFIG_BLK_DEV_IO_TRACE */ - #endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -@@ -928,7 +929,7 @@ bfq_entity_service_tree(struct bfq_entity *entity) - - if (bfqq) - bfq_log_bfqq(bfqq->bfqd, bfqq, -- "entity_service_tree %p %d", -+ "%p %d", - sched_data->service_tree + idx, idx); - #ifdef BFQ_GROUP_IOSCHED_ENABLED - else { -@@ -936,7 +937,7 @@ bfq_entity_service_tree(struct bfq_entity *entity) - container_of(entity, struct bfq_group, entity); - - bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -- "entity_service_tree %p %d", -+ "%p %d", - sched_data->service_tree + idx, idx); - } - #endif - -From 673a457e8a54d1c4b66e61b1a50956ba0b8c6a60 Mon Sep 17 00:00:00 2001 -From: Davide Paganelli -Date: Thu, 8 Feb 2018 11:49:58 +0100 -Subject: [PATCH 19/23] block, bfq-mq, bfq-sq: make bfq_bfqq_expire print - expiration reason - -Improve readability of the log messages related to the expiration -reasons of the function bfq_bfqq_expire. -Change the printing of the number that represents the reason for -expiration with an actual textual description of the reason. - -Signed-off-by: Davide Paganelli -Signed-off-by: Paolo Valente ---- - block/bfq-mq-iosched.c | 10 ++++++++-- - block/bfq-sq-iosched.c | 10 ++++++++-- - 2 files changed, 16 insertions(+), 4 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index edc93b6af186..9268dd47a4e5 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -133,6 +133,12 @@ static const int bfq_timeout = (HZ / 8); - */ - static const unsigned long bfq_merge_time_limit = HZ/10; - -+#define MAX_LENGTH_REASON_NAME 25 -+ -+static const char reason_name[][MAX_LENGTH_REASON_NAME] = {"TOO_IDLE", -+"BUDGET_TIMEOUT", "BUDGET_EXHAUSTED", "NO_MORE_REQUESTS", -+"PREEMPTED"}; -+ - static struct kmem_cache *bfq_pool; - - /* Below this threshold (in ns), we consider thinktime immediate. 
*/ -@@ -3553,8 +3559,8 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - } - - bfq_log_bfqq(bfqd, bfqq, -- "expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)", -- reason, slow, bfqq->dispatched, -+ "expire (%s, slow %d, num_disp %d, short_ttime %d, weight %d)", -+ reason_name[reason], slow, bfqq->dispatched, - bfq_bfqq_has_short_ttime(bfqq), entity->weight); - - /* -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index e49e8ac882b3..f95deaab49a1 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -127,6 +127,12 @@ static const int bfq_timeout = (HZ / 8); - */ - static const unsigned long bfq_merge_time_limit = HZ/10; - -+#define MAX_LENGTH_REASON_NAME 25 -+ -+static const char reason_name[][MAX_LENGTH_REASON_NAME] = {"TOO_IDLE", -+"BUDGET_TIMEOUT", "BUDGET_EXHAUSTED", "NO_MORE_REQUESTS", -+"PREEMPTED"}; -+ - static struct kmem_cache *bfq_pool; - - /* Below this threshold (in ns), we consider thinktime immediate. */ -@@ -3366,8 +3372,8 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - } - - bfq_log_bfqq(bfqd, bfqq, -- "expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)", -- reason, slow, bfqq->dispatched, -+ "expire (%s, slow %d, num_disp %d, short_ttime %d, weight %d)", -+ reason_name[reason], slow, bfqq->dispatched, - bfq_bfqq_has_short_ttime(bfqq), entity->weight); - - /* - -From 62e80623fbb58367c3f667dab22fea0804001f3b Mon Sep 17 00:00:00 2001 -From: Melzani Alessandro -Date: Mon, 26 Feb 2018 22:21:59 +0100 -Subject: [PATCH 20/23] bfq-mq: port of "block, bfq: remove batches of - confusing ifdefs" - -Commit a33801e8b473 ("block, bfq: move debug blkio stats behind -CONFIG_DEBUG_BLK_CGROUP") introduced two batches of confusing ifdefs: -one reported in [1], plus a similar one in another function. This -commit removes both batches, in the way suggested in [1]. - -[1] https://www.spinics.net/lists/linux-block/msg20043.html - -Fixes: a33801e8b473 ("block, bfq: move debug blkio stats behind CONFIG_DEBUG_BLK_CGROUP") - -Signed-off-by: Alessandro Melzani ---- - block/bfq-mq-iosched.c | 128 ++++++++++++++++++++++++++++--------------------- - 1 file changed, 73 insertions(+), 55 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 9268dd47a4e5..5a211620f316 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -4256,35 +4256,17 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - return rq; - } - --static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) --{ -- struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -- struct request *rq; --#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -- struct bfq_queue *in_serv_queue, *bfqq; -- bool waiting_rq, idle_timer_disabled; --#endif - -- spin_lock_irq(&bfqd->lock); -- --#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -- in_serv_queue = bfqd->in_service_queue; -- waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); -- -- rq = __bfq_dispatch_request(hctx); -- -- idle_timer_disabled = -- waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); -- --#else -- rq = __bfq_dispatch_request(hctx); --#endif -- spin_unlock_irq(&bfqd->lock); -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -+static void bfq_update_dispatch_stats(struct request_queue *q, -+ struct request *rq, -+ struct bfq_queue *in_serv_queue, -+ bool idle_timer_disabled) -+{ -+ struct bfq_queue *bfqq = rq ? 
RQ_BFQQ(rq) : NULL; - --#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -- bfqq = rq ? RQ_BFQQ(rq) : NULL; - if (!idle_timer_disabled && !bfqq) -- return rq; -+ return; - - /* - * rq and bfqq are guaranteed to exist until this function -@@ -4299,7 +4281,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - * In addition, the following queue lock guarantees that - * bfqq_group(bfqq) exists as well. - */ -- spin_lock_irq(hctx->queue->queue_lock); -+ spin_lock_irq(q->queue_lock); - if (idle_timer_disabled) - /* - * Since the idle timer has been disabled, -@@ -4318,8 +4300,35 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - bfqg_stats_set_start_empty_time(bfqg); - bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); - } -- spin_unlock_irq(hctx->queue->queue_lock); -+ spin_unlock_irq(q->queue_lock); -+} -+#else -+static inline void bfq_update_dispatch_stats(struct request_queue *q, -+ struct request *rq, -+ struct bfq_queue *in_serv_queue, -+ bool idle_timer_disabled) {} - #endif -+static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) -+{ -+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -+ struct request *rq; -+ struct bfq_queue *in_serv_queue; -+ bool waiting_rq, idle_timer_disabled; -+ -+ spin_lock_irq(&bfqd->lock); -+ -+ in_serv_queue = bfqd->in_service_queue; -+ waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); -+ -+ rq = __bfq_dispatch_request(hctx); -+ -+ idle_timer_disabled = -+ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); -+ -+ spin_unlock_irq(&bfqd->lock); -+ -+ bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue, -+ idle_timer_disabled); - - return rq; - } -@@ -4881,6 +4890,38 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) - return idle_timer_disabled; - } - -+#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -+static void bfq_update_insert_stats(struct request_queue *q, -+ struct bfq_queue *bfqq, -+ bool idle_timer_disabled, -+ unsigned int cmd_flags) -+{ -+ if (!bfqq) -+ return; -+ -+ /* -+ * bfqq still exists, because it can disappear only after -+ * either it is merged with another queue, or the process it -+ * is associated with exits. But both actions must be taken by -+ * the same process currently executing this flow of -+ * instructions. -+ * -+ * In addition, the following queue lock guarantees that -+ * bfqq_group(bfqq) exists as well. 
-+ */ -+ spin_lock_irq(q->queue_lock); -+ bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); -+ if (idle_timer_disabled) -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+ spin_unlock_irq(q->queue_lock); -+} -+#else -+static inline void bfq_update_insert_stats(struct request_queue *q, -+ struct bfq_queue *bfqq, -+ bool idle_timer_disabled, -+ unsigned int cmd_flags) {} -+#endif -+ - static void bfq_prepare_request(struct request *rq, struct bio *bio); - - static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, -@@ -4889,10 +4930,8 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - struct request_queue *q = hctx->queue; - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq = RQ_BFQQ(rq); --#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - bool idle_timer_disabled = false; - unsigned int cmd_flags; --#endif - - spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { -@@ -4938,7 +4977,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - bfqq = RQ_BFQQ(rq); - } - --#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) - idle_timer_disabled = __bfq_insert_request(bfqd, rq); - /* - * Update bfqq, because, if a queue merge has occurred -@@ -4946,9 +4984,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - * redirected into a new queue. - */ - bfqq = RQ_BFQQ(rq); --#else -- __bfq_insert_request(bfqd, rq); --#endif - - if (rq_mergeable(rq)) { - elv_rqhash_add(q, rq); -@@ -4956,34 +4991,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - q->last_merge = rq; - } - } --#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -+ - /* - * Cache cmd_flags before releasing scheduler lock, because rq - * may disappear afterwards (for example, because of a request - * merge). - */ - cmd_flags = rq->cmd_flags; --#endif -+ - spin_unlock_irq(&bfqd->lock); --#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) -- if (!bfqq) -- return; -- /* -- * bfqq still exists, because it can disappear only after -- * either it is merged with another queue, or the process it -- * is associated with exits. But both actions must be taken by -- * the same process currently executing this flow of -- * instruction. -- * -- * In addition, the following queue lock guarantees that -- * bfqq_group(bfqq) exists as well. -- */ -- spin_lock_irq(q->queue_lock); -- bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); -- if (idle_timer_disabled) -- bfqg_stats_update_idle_time(bfqq_group(bfqq)); -- spin_unlock_irq(q->queue_lock); --#endif -+ bfq_update_insert_stats(q, bfqq, idle_timer_disabled, -+ cmd_flags); - } - - static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, - -From 0d0d05632872b226f4fae5e56af8736a4c24bf57 Mon Sep 17 00:00:00 2001 -From: Melzani Alessandro -Date: Mon, 26 Feb 2018 22:43:30 +0100 -Subject: [PATCH 21/23] bfq-sq, bfq-mq: port of "bfq: Use icq_to_bic() - consistently" - -Some code uses icq_to_bic() to convert an io_cq pointer to a -bfq_io_cq pointer while other code uses a direct cast. Convert -the code that uses a direct cast such that it uses icq_to_bic(). 
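-
-As a quick aside, the difference can be sketched outside the kernel
-(a stand-alone model, not the BFQ sources; the names and fields are
-simplified here): the direct cast is only correct because the io_cq
-happens to be the first member of bfq_io_cq, whereas container_of()
-encodes the layout explicitly and keeps working if the member moves:
-
-    #include <stddef.h>
-    #include <stdio.h>
-
-    #define container_of(ptr, type, member) \
-        ((type *)((char *)(ptr) - offsetof(type, member)))
-
-    struct io_cq { int flags; };
-
-    struct bfq_io_cq {
-        struct io_cq icq;   /* the cast trick relies on this being first */
-        int ttime;
-    };
-
-    static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
-    {
-        return container_of(icq, struct bfq_io_cq, icq);
-    }
-
-    int main(void)
-    {
-        struct bfq_io_cq bic = { .icq = { .flags = 1 }, .ttime = 42 };
-
-        /* both yield the same address, but only one is layout-proof */
-        printf("%d\n", icq_to_bic(&bic.icq)->ttime);
-        return 0;
-    }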
- -Signed-off-by: Alessandro Melzani ---- - block/bfq-mq-iosched.c | 2 +- - block/bfq-sq-iosched.c | 2 +- - 2 files changed, 2 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 5a211620f316..7b1269558c47 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -272,7 +272,7 @@ static const unsigned long max_service_from_wr = 120000; - #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ - { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) - --#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) -+#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) - #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) - - /** -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index f95deaab49a1..c4aff8d55fc4 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -266,7 +266,7 @@ static const unsigned long max_service_from_wr = 120000; - #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ - { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) - --#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) -+#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) - #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) - - static void bfq_schedule_dispatch(struct bfq_data *bfqd); - -From 4cb5de6add7d6ad0d25d73cb95dc871305db1522 Mon Sep 17 00:00:00 2001 -From: Melzani Alessandro -Date: Mon, 26 Feb 2018 22:59:30 +0100 -Subject: [PATCH 22/23] bfq-sq, bfq-mq: port of "block, bfq: fix error handle - in bfq_init" - -If elv_register() fails, bfq_pool should be freed. - -Signed-off-by: Alessandro Melzani ---- - block/bfq-mq-iosched.c | 4 +++- - block/bfq-sq-iosched.c | 4 +++- - 2 files changed, 6 insertions(+), 2 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 7b1269558c47..964e88c2ce59 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -6129,7 +6129,7 @@ static int __init bfq_init(void) - - ret = elv_register(&iosched_bfq_mq); - if (ret) -- goto err_pol_unreg; -+ goto slab_kill; - - #ifdef BFQ_GROUP_IOSCHED_ENABLED - strcat(msg, " (with cgroups support)"); -@@ -6138,6 +6138,8 @@ static int __init bfq_init(void) - - return 0; - -+slab_kill: -+ bfq_slab_kill(); - err_pol_unreg: - #ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); -diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c -index c4aff8d55fc4..7f0cf1f01ffc 100644 ---- a/block/bfq-sq-iosched.c -+++ b/block/bfq-sq-iosched.c -@@ -5590,7 +5590,7 @@ static int __init bfq_init(void) - - ret = elv_register(&iosched_bfq); - if (ret) -- goto err_pol_unreg; -+ goto slab_kill; - - #ifdef BFQ_GROUP_IOSCHED_ENABLED - strcat(msg, " (with cgroups support)"); -@@ -5599,6 +5599,8 @@ static int __init bfq_init(void) - - return 0; - -+slab_kill: -+ bfq_slab_kill(); - err_pol_unreg: - #ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_policy_unregister(&blkcg_policy_bfq); - -From 1f77c173aaa87ffb22c9f062a6449245d14311e4 Mon Sep 17 00:00:00 2001 -From: Paolo Valente <paolo.valente@linaro.org> -Date: Wed, 4 Apr 2018 11:28:16 +0200 -Subject: [PATCH 23/23] block, bfq-sq, bfq-mq: lower-bound the estimated peak - rate to 1 - -If a storage device handled by BFQ happens to be slower than 7.5 KB/s -for a certain amount of time (in the order of a second), then the -estimated peak rate of the device, maintained in BFQ, becomes equal to -0. The reason is the limited precision with which the rate is -represented (details on the range of representable values in the -comments introduced by this commit). This leads to a division-by-zero -error where the estimated peak rate is used as divisor. Such a type of -failure has been reported in [1].
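-
-For intuition, here is a stand-alone sketch of the failing arithmetic
-(user-space C, not the BFQ sources; only BFQ_RATE_SHIFT = 16 is taken
-from the patch below, the other numbers are illustrative):
-
-    #include <stdint.h>
-    #include <stdio.h>
-
-    #define BFQ_RATE_SHIFT 16
-
-    int main(void)
-    {
-        /* 7.5 KB/s = 15 sectors/s = 15e-6 sectors/usec; left-shifted
-         * by 16 bits that is ~0.98, which truncates to 0 in a u32 */
-        uint32_t peak_rate =
-            (uint32_t)((15ULL << BFQ_RATE_SHIFT) / 1000000);
-
-        printf("stored rate: %u\n", peak_rate);  /* prints 0 */
-
-        /* the fix below: lower-bound the estimate to 1 */
-        if (peak_rate < 1)
-            peak_rate = 1;
-
-        /* any later use of peak_rate as a divisor is now safe */
-        printf("scaled budget: %llu\n", (unsigned long long)
-               ((1000ULL << BFQ_RATE_SHIFT) / peak_rate));
-        return 0;
-    }
-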
-This commit addresses this issue by: -1. Lower-bounding the estimated peak rate to 1 -2. Adding and improving comments on the range of rates representable - -[1] https://www.spinics.net/lists/kernel/msg2739205.html - -Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru> -Signed-off-by: Paolo Valente <paolo.valente@linaro.org> ---- - block/bfq-mq-iosched.c | 25 ++++++++++++++++++++++++- - block/bfq-mq.h | 7 ++++++- - block/bfq-sq-iosched.c | 25 ++++++++++++++++++++++++- - block/bfq.h | 7 ++++++- - 4 files changed, 60 insertions(+), 4 deletions(-) - -diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c -index 964e88c2ce59..03efd90c5d20 100644 ---- a/block/bfq-mq-iosched.c -+++ b/block/bfq-mq-iosched.c -@@ -160,7 +160,20 @@ static struct kmem_cache *bfq_pool; - /* Target observation time interval for a peak-rate update (ns) */ - #define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC - --/* Shift used for peak rate fixed precision calculations. */ -+/* -+ * Shift used for peak-rate fixed precision calculations. -+ * With -+ * - the current shift: 16 positions -+ * - the current type used to store rate: u32 -+ * - the current unit of measure for rate: [sectors/usec], or, more precisely, -+ * [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift, -+ * the range of rates that can be stored is -+ * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec = -+ * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec = -+ * [15, 65G] sectors/sec -+ * Which, assuming a sector size of 512B, corresponds to a range of -+ * [7.5K, 33T] B/sec -+ */ - #define BFQ_RATE_SHIFT 16 - - /* -@@ -2881,6 +2894,16 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); - bfqd->peak_rate += rate; -+ -+ /* -+ * For a very slow device, bfqd->peak_rate can reach 0 (see -+ * the minimum representable values reported in the comments -+ * on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid -+ * divisions by zero where bfqd->peak_rate is used as a -+ * divisor. -+ */ -+ bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate); -+ - update_thr_responsiveness_params(bfqd); - BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); - BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); - bfqd->peak_rate += rate; -+ -+ /* -+ * For a very slow device, bfqd->peak_rate can reach 0 (see -+ * the minimum representable values reported in the comments -+ * on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid -+ * divisions by zero where bfqd->peak_rate is used as a -+ * divisor. -+ */ -+ bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate); -+ - update_thr_responsiveness_params(bfqd); - BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); -From: Con Kolivas <kernel@kolivas.org> -Date: Sat, 29 Oct 2016 11:20:37 +1100 -Subject: [PATCH 02/16] Make preemptible kernel default. - -Make full preempt default on all arches.
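-
-A rough user-space model of the policy difference being flipped here
-(illustrative only; the real machinery lives in the kernel's preempt
-and scheduler code): with PREEMPT_VOLUNTARY a long-running kernel
-path yields only at explicit cond_resched() points, while with full
-PREEMPT any preempt_enable() that drops the count to zero becomes a
-potential reschedule point:
-
-    #include <stdio.h>
-
-    static int preempt_count;
-    static int need_resched = 1;
-
-    static void schedule(void)
-    {
-        puts("resched");
-        need_resched = 0;
-    }
-
-    /* voluntary model: yield only where the code offers to */
-    static void cond_resched(void)
-    {
-        if (need_resched)
-            schedule();
-    }
-
-    /* full-preempt model: leaving a critical section may reschedule */
-    static void preempt_disable(void) { preempt_count++; }
-
-    static void preempt_enable(void)
-    {
-        if (--preempt_count == 0 && need_resched)
-            schedule();
-    }
-
-    int main(void)
-    {
-        preempt_disable();
-        /* no preemption in here under either model */
-        preempt_enable();   /* CONFIG_PREEMPT: may reschedule here */
-        cond_resched();     /* CONFIG_PREEMPT_VOLUNTARY: only here */
-        return 0;
-    }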
---- - arch/arc/configs/tb10x_defconfig | 2 +- - arch/arm/configs/bcm2835_defconfig | 2 +- - arch/arm/configs/imx_v6_v7_defconfig | 2 +- - arch/arm/configs/mps2_defconfig | 2 +- - arch/arm/configs/mxs_defconfig | 2 +- - arch/blackfin/configs/BF518F-EZBRD_defconfig | 2 +- - arch/blackfin/configs/BF526-EZBRD_defconfig | 2 +- - arch/blackfin/configs/BF527-EZKIT-V2_defconfig | 2 +- - arch/blackfin/configs/BF527-EZKIT_defconfig | 2 +- - arch/blackfin/configs/BF527-TLL6527M_defconfig | 2 +- - arch/blackfin/configs/BF533-EZKIT_defconfig | 2 +- - arch/blackfin/configs/BF533-STAMP_defconfig | 2 +- - arch/blackfin/configs/BF537-STAMP_defconfig | 2 +- - arch/blackfin/configs/BF538-EZKIT_defconfig | 2 +- - arch/blackfin/configs/BF548-EZKIT_defconfig | 2 +- - arch/blackfin/configs/BF561-ACVILON_defconfig | 2 +- - arch/blackfin/configs/BF561-EZKIT-SMP_defconfig | 2 +- - arch/blackfin/configs/BF561-EZKIT_defconfig | 2 +- - arch/blackfin/configs/BF609-EZKIT_defconfig | 2 +- - arch/blackfin/configs/BlackStamp_defconfig | 2 +- - arch/blackfin/configs/CM-BF527_defconfig | 2 +- - arch/blackfin/configs/PNAV-10_defconfig | 2 +- - arch/blackfin/configs/SRV1_defconfig | 2 +- - arch/blackfin/configs/TCM-BF518_defconfig | 2 +- - arch/mips/configs/fuloong2e_defconfig | 3 ++- - arch/mips/configs/gpr_defconfig | 3 ++- - arch/mips/configs/ip22_defconfig | 3 ++- - arch/mips/configs/ip28_defconfig | 3 ++- - arch/mips/configs/jazz_defconfig | 3 ++- - arch/mips/configs/mtx1_defconfig | 3 ++- - arch/mips/configs/nlm_xlr_defconfig | 2 +- - arch/mips/configs/pic32mzda_defconfig | 2 +- - arch/mips/configs/pistachio_defconfig | 2 +- - arch/mips/configs/pnx8335_stb225_defconfig | 2 +- - arch/mips/configs/rm200_defconfig | 3 ++- - arch/parisc/configs/712_defconfig | 2 +- - arch/parisc/configs/c3000_defconfig | 2 +- - arch/parisc/configs/default_defconfig | 2 +- - arch/powerpc/configs/c2k_defconfig | 2 +- - arch/powerpc/configs/ppc6xx_defconfig | 2 +- - arch/score/configs/spct6600_defconfig | 2 +- - arch/sh/configs/se7712_defconfig | 2 +- - arch/sh/configs/se7721_defconfig | 2 +- - arch/sh/configs/titan_defconfig | 2 +- - arch/sparc/configs/sparc64_defconfig | 2 +- - arch/tile/configs/tilegx_defconfig | 2 +- - arch/tile/configs/tilepro_defconfig | 2 +- - arch/x86/configs/i386_defconfig | 2 +- - arch/x86/configs/x86_64_defconfig | 2 +- - kernel/Kconfig.preempt | 7 ++++--- - 50 files changed, 60 insertions(+), 52 deletions(-) - -diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig -index f30182549395..42910f628869 100644 ---- a/arch/arc/configs/tb10x_defconfig -+++ b/arch/arc/configs/tb10x_defconfig -@@ -28,7 +28,7 @@ CONFIG_ARC_PLAT_TB10X=y - CONFIG_ARC_CACHE_LINE_SHIFT=5 - CONFIG_HZ=250 - CONFIG_ARC_BUILTIN_DTB_NAME="abilis_tb100_dvk" --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - # CONFIG_COMPACTION is not set - CONFIG_NET=y - CONFIG_PACKET=y -diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig -index 43dab4890ad3..44a52166ca5e 100644 ---- a/arch/arm/configs/bcm2835_defconfig -+++ b/arch/arm/configs/bcm2835_defconfig -@@ -29,7 +29,7 @@ CONFIG_MODULE_UNLOAD=y - CONFIG_ARCH_MULTI_V6=y - CONFIG_ARCH_BCM=y - CONFIG_ARCH_BCM2835=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_AEABI=y - CONFIG_KSM=y - CONFIG_CLEANCACHE=y -diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig -index 32acac9ab81a..1482bb312987 100644 ---- a/arch/arm/configs/imx_v6_v7_defconfig -+++ b/arch/arm/configs/imx_v6_v7_defconfig -@@ -47,7 +47,7 @@ 
CONFIG_PCI_MSI=y - CONFIG_PCI_IMX6=y - CONFIG_SMP=y - CONFIG_ARM_PSCI=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_AEABI=y - CONFIG_HIGHMEM=y - CONFIG_CMA=y -diff --git a/arch/arm/configs/mps2_defconfig b/arch/arm/configs/mps2_defconfig -index 0bcdec7cc169..10ceaefa51e0 100644 ---- a/arch/arm/configs/mps2_defconfig -+++ b/arch/arm/configs/mps2_defconfig -@@ -18,7 +18,7 @@ CONFIG_ARCH_MPS2=y - CONFIG_SET_MEM_PARAM=y - CONFIG_DRAM_BASE=0x21000000 - CONFIG_DRAM_SIZE=0x1000000 --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - # CONFIG_ATAGS is not set - CONFIG_ZBOOT_ROM_TEXT=0x0 - CONFIG_ZBOOT_ROM_BSS=0x0 -diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig -index e5822ab01b7d..3e77e02f678f 100644 ---- a/arch/arm/configs/mxs_defconfig -+++ b/arch/arm/configs/mxs_defconfig -@@ -27,7 +27,7 @@ CONFIG_BLK_DEV_INTEGRITY=y - # CONFIG_ARCH_MULTI_V7 is not set - CONFIG_ARCH_MXS=y - # CONFIG_ARM_THUMB is not set --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_AEABI=y - CONFIG_NET=y - CONFIG_PACKET=y -diff --git a/arch/blackfin/configs/BF518F-EZBRD_defconfig b/arch/blackfin/configs/BF518F-EZBRD_defconfig -index 99c00d835f47..39b91dfa55b5 100644 ---- a/arch/blackfin/configs/BF518F-EZBRD_defconfig -+++ b/arch/blackfin/configs/BF518F-EZBRD_defconfig -@@ -19,7 +19,7 @@ CONFIG_MODULE_UNLOAD=y - # CONFIG_BLK_DEV_BSG is not set - # CONFIG_IOSCHED_DEADLINE is not set - # CONFIG_IOSCHED_CFQ is not set --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BF518=y - CONFIG_IRQ_TIMER0=12 - # CONFIG_CYCLES_CLOCKSOURCE is not set -diff --git a/arch/blackfin/configs/BF526-EZBRD_defconfig b/arch/blackfin/configs/BF526-EZBRD_defconfig -index e66ba31ef84d..675cadb3a0c4 100644 ---- a/arch/blackfin/configs/BF526-EZBRD_defconfig -+++ b/arch/blackfin/configs/BF526-EZBRD_defconfig -@@ -19,7 +19,7 @@ CONFIG_MODULE_UNLOAD=y - # CONFIG_BLK_DEV_BSG is not set - # CONFIG_IOSCHED_DEADLINE is not set - # CONFIG_IOSCHED_CFQ is not set --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BF526=y - CONFIG_IRQ_TIMER0=12 - CONFIG_BFIN526_EZBRD=y -diff --git a/arch/blackfin/configs/BF527-EZKIT-V2_defconfig b/arch/blackfin/configs/BF527-EZKIT-V2_defconfig -index 0207c588c19f..4c517c443af5 100644 ---- a/arch/blackfin/configs/BF527-EZKIT-V2_defconfig -+++ b/arch/blackfin/configs/BF527-EZKIT-V2_defconfig -@@ -19,7 +19,7 @@ CONFIG_MODULE_UNLOAD=y - # CONFIG_BLK_DEV_BSG is not set - # CONFIG_IOSCHED_DEADLINE is not set - # CONFIG_IOSCHED_CFQ is not set --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BF527=y - CONFIG_BF_REV_0_2=y - CONFIG_BFIN527_EZKIT_V2=y -diff --git a/arch/blackfin/configs/BF527-EZKIT_defconfig b/arch/blackfin/configs/BF527-EZKIT_defconfig -index 99c131ba7d90..bf8df3e6cf02 100644 ---- a/arch/blackfin/configs/BF527-EZKIT_defconfig -+++ b/arch/blackfin/configs/BF527-EZKIT_defconfig -@@ -19,7 +19,7 @@ CONFIG_MODULE_UNLOAD=y - # CONFIG_BLK_DEV_BSG is not set - # CONFIG_IOSCHED_DEADLINE is not set - # CONFIG_IOSCHED_CFQ is not set --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BF527=y - CONFIG_BF_REV_0_1=y - CONFIG_IRQ_USB_INT0=11 -diff --git a/arch/blackfin/configs/BF527-TLL6527M_defconfig b/arch/blackfin/configs/BF527-TLL6527M_defconfig -index cdeb51856f26..0220b3b15c53 100644 ---- a/arch/blackfin/configs/BF527-TLL6527M_defconfig -+++ b/arch/blackfin/configs/BF527-TLL6527M_defconfig -@@ -21,7 +21,7 @@ CONFIG_MODULE_UNLOAD=y - # CONFIG_LBDAF is not set - # CONFIG_BLK_DEV_BSG is not set - # CONFIG_IOSCHED_DEADLINE is not set --CONFIG_PREEMPT_VOLUNTARY=y 
-+CONFIG_PREEMPT=y - CONFIG_BF527=y - CONFIG_BF_REV_0_2=y - CONFIG_BFIN527_TLL6527M=y -diff --git a/arch/blackfin/configs/BF533-EZKIT_defconfig b/arch/blackfin/configs/BF533-EZKIT_defconfig -index ed7d2c096739..6023e3fd2c48 100644 ---- a/arch/blackfin/configs/BF533-EZKIT_defconfig -+++ b/arch/blackfin/configs/BF533-EZKIT_defconfig -@@ -19,7 +19,7 @@ CONFIG_MODULE_UNLOAD=y - # CONFIG_BLK_DEV_BSG is not set - # CONFIG_IOSCHED_DEADLINE is not set - # CONFIG_IOSCHED_CFQ is not set --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BFIN533_EZKIT=y - CONFIG_TIMER0=11 - CONFIG_CLKIN_HZ=27000000 -diff --git a/arch/blackfin/configs/BF533-STAMP_defconfig b/arch/blackfin/configs/BF533-STAMP_defconfig -index 0c241f4d28d7..f5cd0f18b711 100644 ---- a/arch/blackfin/configs/BF533-STAMP_defconfig -+++ b/arch/blackfin/configs/BF533-STAMP_defconfig -@@ -19,7 +19,7 @@ CONFIG_MODULE_UNLOAD=y - # CONFIG_BLK_DEV_BSG is not set - # CONFIG_IOSCHED_DEADLINE is not set - # CONFIG_IOSCHED_CFQ is not set --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_TIMER0=11 - CONFIG_HIGH_RES_TIMERS=y - CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 -diff --git a/arch/blackfin/configs/BF537-STAMP_defconfig b/arch/blackfin/configs/BF537-STAMP_defconfig -index e5360b30e39a..48085fde7f9e 100644 ---- a/arch/blackfin/configs/BF537-STAMP_defconfig -+++ b/arch/blackfin/configs/BF537-STAMP_defconfig -@@ -19,7 +19,7 @@ CONFIG_MODULE_UNLOAD=y - # CONFIG_BLK_DEV_BSG is not set - # CONFIG_IOSCHED_DEADLINE is not set - # CONFIG_IOSCHED_CFQ is not set --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BF537=y - CONFIG_HIGH_RES_TIMERS=y - CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 -diff --git a/arch/blackfin/configs/BF538-EZKIT_defconfig b/arch/blackfin/configs/BF538-EZKIT_defconfig -index 60f6fb86125c..12deeaaef3cb 100644 ---- a/arch/blackfin/configs/BF538-EZKIT_defconfig -+++ b/arch/blackfin/configs/BF538-EZKIT_defconfig -@@ -21,7 +21,7 @@ CONFIG_MODULE_UNLOAD=y - # CONFIG_BLK_DEV_BSG is not set - # CONFIG_IOSCHED_DEADLINE is not set - # CONFIG_IOSCHED_CFQ is not set --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BF538=y - CONFIG_IRQ_TIMER0=12 - CONFIG_IRQ_TIMER1=12 -diff --git a/arch/blackfin/configs/BF548-EZKIT_defconfig b/arch/blackfin/configs/BF548-EZKIT_defconfig -index 38cb17d218d4..6a68ffc55b5a 100644 ---- a/arch/blackfin/configs/BF548-EZKIT_defconfig -+++ b/arch/blackfin/configs/BF548-EZKIT_defconfig -@@ -19,7 +19,7 @@ CONFIG_MODULE_UNLOAD=y - # CONFIG_BLK_DEV_BSG is not set - # CONFIG_IOSCHED_DEADLINE is not set - # CONFIG_IOSCHED_CFQ is not set --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BF548_std=y - CONFIG_IRQ_TIMER0=11 - # CONFIG_CYCLES_CLOCKSOURCE is not set -diff --git a/arch/blackfin/configs/BF561-ACVILON_defconfig b/arch/blackfin/configs/BF561-ACVILON_defconfig -index 78f6bc79f910..e9f3ba783a4e 100644 ---- a/arch/blackfin/configs/BF561-ACVILON_defconfig -+++ b/arch/blackfin/configs/BF561-ACVILON_defconfig -@@ -20,7 +20,7 @@ CONFIG_MODULE_UNLOAD=y - # CONFIG_LBDAF is not set - # CONFIG_BLK_DEV_BSG is not set - # CONFIG_IOSCHED_DEADLINE is not set --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BF561=y - CONFIG_BF_REV_0_5=y - CONFIG_IRQ_TIMER0=10 -diff --git a/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig b/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig -index fac8bb578249..89b75a6c3fab 100644 ---- a/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig -+++ b/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig -@@ -19,7 +19,7 @@ CONFIG_MODULE_UNLOAD=y - # CONFIG_BLK_DEV_BSG is not set - # 
CONFIG_IOSCHED_DEADLINE is not set - # CONFIG_IOSCHED_CFQ is not set --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BF561=y - CONFIG_SMP=y - CONFIG_IRQ_TIMER0=10 -diff --git a/arch/blackfin/configs/BF561-EZKIT_defconfig b/arch/blackfin/configs/BF561-EZKIT_defconfig -index 2a2e4d0cebc1..67b3d2f419ba 100644 ---- a/arch/blackfin/configs/BF561-EZKIT_defconfig -+++ b/arch/blackfin/configs/BF561-EZKIT_defconfig -@@ -19,7 +19,7 @@ CONFIG_MODULE_UNLOAD=y - # CONFIG_BLK_DEV_BSG is not set - # CONFIG_IOSCHED_DEADLINE is not set - # CONFIG_IOSCHED_CFQ is not set --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BF561=y - CONFIG_IRQ_TIMER0=10 - CONFIG_CLKIN_HZ=30000000 -diff --git a/arch/blackfin/configs/BF609-EZKIT_defconfig b/arch/blackfin/configs/BF609-EZKIT_defconfig -index 3ce77f07208a..8cc75d4218fb 100644 ---- a/arch/blackfin/configs/BF609-EZKIT_defconfig -+++ b/arch/blackfin/configs/BF609-EZKIT_defconfig -@@ -20,7 +20,7 @@ CONFIG_MODULE_UNLOAD=y - # CONFIG_BLK_DEV_BSG is not set - # CONFIG_IOSCHED_DEADLINE is not set - # CONFIG_IOSCHED_CFQ is not set --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BF609=y - CONFIG_PINT1_ASSIGN=0x01010000 - CONFIG_PINT2_ASSIGN=0x07000101 -diff --git a/arch/blackfin/configs/BlackStamp_defconfig b/arch/blackfin/configs/BlackStamp_defconfig -index f4a9200e1ab1..9faf0ec7007f 100644 ---- a/arch/blackfin/configs/BlackStamp_defconfig -+++ b/arch/blackfin/configs/BlackStamp_defconfig -@@ -17,7 +17,7 @@ CONFIG_MODULE_UNLOAD=y - CONFIG_MODULE_FORCE_UNLOAD=y - # CONFIG_BLK_DEV_BSG is not set - # CONFIG_IOSCHED_DEADLINE is not set --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BF532=y - CONFIG_BF_REV_0_5=y - CONFIG_BLACKSTAMP=y -diff --git a/arch/blackfin/configs/CM-BF527_defconfig b/arch/blackfin/configs/CM-BF527_defconfig -index 1902bb05d086..4a1ad4fd7bb2 100644 ---- a/arch/blackfin/configs/CM-BF527_defconfig -+++ b/arch/blackfin/configs/CM-BF527_defconfig -@@ -19,7 +19,7 @@ CONFIG_MODULES=y - CONFIG_MODULE_UNLOAD=y - # CONFIG_BLK_DEV_BSG is not set - # CONFIG_IOSCHED_DEADLINE is not set --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BF527=y - CONFIG_BF_REV_0_1=y - CONFIG_IRQ_TIMER0=12 -diff --git a/arch/blackfin/configs/PNAV-10_defconfig b/arch/blackfin/configs/PNAV-10_defconfig -index c7926812971c..9d787e28bbe8 100644 ---- a/arch/blackfin/configs/PNAV-10_defconfig -+++ b/arch/blackfin/configs/PNAV-10_defconfig -@@ -15,7 +15,7 @@ CONFIG_MODULE_UNLOAD=y - # CONFIG_BLK_DEV_BSG is not set - # CONFIG_IOSCHED_DEADLINE is not set - # CONFIG_IOSCHED_CFQ is not set --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BF537=y - CONFIG_IRQ_TIMER0=12 - CONFIG_PNAV10=y -diff --git a/arch/blackfin/configs/SRV1_defconfig b/arch/blackfin/configs/SRV1_defconfig -index 23fdc57d657a..225df32dc9a8 100644 ---- a/arch/blackfin/configs/SRV1_defconfig -+++ b/arch/blackfin/configs/SRV1_defconfig -@@ -13,7 +13,7 @@ CONFIG_MMAP_ALLOW_UNINITIALIZED=y - CONFIG_MODULES=y - CONFIG_MODULE_UNLOAD=y - # CONFIG_IOSCHED_DEADLINE is not set --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BF537=y - CONFIG_IRQ_TIMER0=12 - CONFIG_BOOT_LOAD=0x400000 -diff --git a/arch/blackfin/configs/TCM-BF518_defconfig b/arch/blackfin/configs/TCM-BF518_defconfig -index e28959479fe0..425c24e43c34 100644 ---- a/arch/blackfin/configs/TCM-BF518_defconfig -+++ b/arch/blackfin/configs/TCM-BF518_defconfig -@@ -23,7 +23,7 @@ CONFIG_MODULE_UNLOAD=y - # CONFIG_BLK_DEV_BSG is not set - # CONFIG_IOSCHED_DEADLINE is not set - # CONFIG_IOSCHED_CFQ is not set 
--CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BF518=y - CONFIG_BF_REV_0_1=y - CONFIG_BFIN518F_TCM=y -diff --git a/arch/mips/configs/fuloong2e_defconfig b/arch/mips/configs/fuloong2e_defconfig -index 499f51498ecb..f7cb39b0662c 100644 ---- a/arch/mips/configs/fuloong2e_defconfig -+++ b/arch/mips/configs/fuloong2e_defconfig -@@ -2,7 +2,8 @@ CONFIG_MACH_LOONGSON64=y - CONFIG_64BIT=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y -+CONFIG_EXPERIMENTAL=y - CONFIG_LOCALVERSION="-fuloong2e" - # CONFIG_LOCALVERSION_AUTO is not set - CONFIG_SYSVIPC=y -diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig -index 55438fc9991e..db03ef4f737d 100644 ---- a/arch/mips/configs/gpr_defconfig -+++ b/arch/mips/configs/gpr_defconfig -@@ -1,7 +1,8 @@ - CONFIG_MIPS_ALCHEMY=y - CONFIG_MIPS_GPR=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y -+CONFIG_EXPERIMENTAL=y - # CONFIG_LOCALVERSION_AUTO is not set - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y -diff --git a/arch/mips/configs/ip22_defconfig b/arch/mips/configs/ip22_defconfig -index 83e8fe2064aa..93e7b167433b 100644 ---- a/arch/mips/configs/ip22_defconfig -+++ b/arch/mips/configs/ip22_defconfig -@@ -3,7 +3,8 @@ CONFIG_CPU_R5000=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y - CONFIG_HZ_1000=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y -+CONFIG_EXPERIMENTAL=y - CONFIG_SYSVIPC=y - CONFIG_IKCONFIG=y - CONFIG_IKCONFIG_PROC=y -diff --git a/arch/mips/configs/ip28_defconfig b/arch/mips/configs/ip28_defconfig -index d0a4c2cfacf8..6f0600e99c25 100644 ---- a/arch/mips/configs/ip28_defconfig -+++ b/arch/mips/configs/ip28_defconfig -@@ -1,6 +1,7 @@ - CONFIG_SGI_IP28=y - CONFIG_ARC_CONSOLE=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y -+CONFIG_EXPERIMENTAL=y - CONFIG_SYSVIPC=y - CONFIG_IKCONFIG=y - CONFIG_IKCONFIG_PROC=y -diff --git a/arch/mips/configs/jazz_defconfig b/arch/mips/configs/jazz_defconfig -index 9ad1c94376c8..1d62ce7ff5dc 100644 ---- a/arch/mips/configs/jazz_defconfig -+++ b/arch/mips/configs/jazz_defconfig -@@ -1,6 +1,7 @@ - CONFIG_MACH_JAZZ=y - CONFIG_OLIVETTI_M700=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y -+CONFIG_EXPERIMENTAL=y - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_BSD_PROCESS_ACCT=y -diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig -index c3d0d0a6e044..aa3426d5f7d7 100644 ---- a/arch/mips/configs/mtx1_defconfig -+++ b/arch/mips/configs/mtx1_defconfig -@@ -1,6 +1,7 @@ - CONFIG_MIPS_ALCHEMY=y - CONFIG_MIPS_MTX1=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y -+CONFIG_EXPERIMENTAL=y - # CONFIG_LOCALVERSION_AUTO is not set - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y -diff --git a/arch/mips/configs/nlm_xlr_defconfig b/arch/mips/configs/nlm_xlr_defconfig -index 1e18fd7de209..b514e91e5426 100644 ---- a/arch/mips/configs/nlm_xlr_defconfig -+++ b/arch/mips/configs/nlm_xlr_defconfig -@@ -5,7 +5,7 @@ CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 - CONFIG_SMP=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_KEXEC=y - CONFIG_CROSS_COMPILE="" - # CONFIG_LOCALVERSION_AUTO is not set -diff --git a/arch/mips/configs/pic32mzda_defconfig b/arch/mips/configs/pic32mzda_defconfig -index 52192c632ae8..96b087498dab 100644 ---- a/arch/mips/configs/pic32mzda_defconfig -+++ b/arch/mips/configs/pic32mzda_defconfig -@@ -1,7 +1,7 @@ - CONFIG_MACH_PIC32=y - CONFIG_DTB_PIC32_MZDA_SK=y - CONFIG_HZ_100=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - # CONFIG_SECCOMP is not set 
- CONFIG_SYSVIPC=y - CONFIG_NO_HZ=y -diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig -index b22a3cf149b6..cfffca3d37f4 100644 ---- a/arch/mips/configs/pistachio_defconfig -+++ b/arch/mips/configs/pistachio_defconfig -@@ -5,7 +5,7 @@ CONFIG_MIPS_CPS=y - CONFIG_DEFAULT_MMAP_MIN_ADDR=32768 - CONFIG_ZSMALLOC=y - CONFIG_NR_CPUS=4 --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - # CONFIG_LOCALVERSION_AUTO is not set - CONFIG_DEFAULT_HOSTNAME="localhost" - CONFIG_SYSVIPC=y -diff --git a/arch/mips/configs/pnx8335_stb225_defconfig b/arch/mips/configs/pnx8335_stb225_defconfig -index 81b5eb89446c..19f8cea849a1 100644 ---- a/arch/mips/configs/pnx8335_stb225_defconfig -+++ b/arch/mips/configs/pnx8335_stb225_defconfig -@@ -3,7 +3,7 @@ CONFIG_CPU_LITTLE_ENDIAN=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y - CONFIG_HZ_128=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - # CONFIG_SECCOMP is not set - # CONFIG_LOCALVERSION_AUTO is not set - # CONFIG_SWAP is not set -diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig -index 99679e514042..2ced507a8ba7 100644 ---- a/arch/mips/configs/rm200_defconfig -+++ b/arch/mips/configs/rm200_defconfig -@@ -2,7 +2,8 @@ CONFIG_SNI_RM=y - CONFIG_CPU_LITTLE_ENDIAN=y - CONFIG_ARC_CONSOLE=y - CONFIG_HZ_1000=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y -+CONFIG_EXPERIMENTAL=y - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_BSD_PROCESS_ACCT=y -diff --git a/arch/parisc/configs/712_defconfig b/arch/parisc/configs/712_defconfig -index ccc109761f44..a6a5b0b7a9c9 100644 ---- a/arch/parisc/configs/712_defconfig -+++ b/arch/parisc/configs/712_defconfig -@@ -13,7 +13,7 @@ CONFIG_MODULES=y - CONFIG_MODULE_UNLOAD=y - CONFIG_MODULE_FORCE_UNLOAD=y - CONFIG_PA7100LC=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_GSC_LASI=y - # CONFIG_PDC_CHASSIS is not set - CONFIG_BINFMT_MISC=m -diff --git a/arch/parisc/configs/c3000_defconfig b/arch/parisc/configs/c3000_defconfig -index 8d41a73bd71b..b8e0a6662ff9 100644 ---- a/arch/parisc/configs/c3000_defconfig -+++ b/arch/parisc/configs/c3000_defconfig -@@ -13,7 +13,7 @@ CONFIG_MODULES=y - CONFIG_MODULE_UNLOAD=y - CONFIG_MODULE_FORCE_UNLOAD=y - CONFIG_PA8X00=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - # CONFIG_GSC is not set - CONFIG_PCI=y - CONFIG_PCI_LBA=y -diff --git a/arch/parisc/configs/default_defconfig b/arch/parisc/configs/default_defconfig -index 52c9050a7c5c..8d86d2e989f4 100644 ---- a/arch/parisc/configs/default_defconfig -+++ b/arch/parisc/configs/default_defconfig -@@ -14,7 +14,7 @@ CONFIG_MODULE_UNLOAD=y - CONFIG_MODULE_FORCE_UNLOAD=y - # CONFIG_BLK_DEV_BSG is not set - CONFIG_PA7100LC=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_IOMMU_CCIO=y - CONFIG_GSC_LASI=y - CONFIG_GSC_WAX=y -diff --git a/arch/powerpc/configs/c2k_defconfig b/arch/powerpc/configs/c2k_defconfig -index f1552af9eecc..f8505e6ec7b3 100644 ---- a/arch/powerpc/configs/c2k_defconfig -+++ b/arch/powerpc/configs/c2k_defconfig -@@ -29,7 +29,7 @@ CONFIG_CPU_FREQ_GOV_POWERSAVE=m - CONFIG_CPU_FREQ_GOV_ONDEMAND=m - CONFIG_GEN_RTC=y - CONFIG_HIGHMEM=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BINFMT_MISC=y - CONFIG_PM=y - CONFIG_PCI_MSI=y -diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig -index da0e8d535eb8..c016af41ab4f 100644 ---- a/arch/powerpc/configs/ppc6xx_defconfig -+++ b/arch/powerpc/configs/ppc6xx_defconfig -@@ -74,7 +74,7 @@ CONFIG_QE_GPIO=y - CONFIG_MCU_MPC8349EMITX=y - CONFIG_HIGHMEM=y - 
CONFIG_HZ_1000=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BINFMT_MISC=y - CONFIG_HIBERNATION=y - CONFIG_PM_DEBUG=y -diff --git a/arch/score/configs/spct6600_defconfig b/arch/score/configs/spct6600_defconfig -index b2d8802f43b4..46434ca1fa10 100644 ---- a/arch/score/configs/spct6600_defconfig -+++ b/arch/score/configs/spct6600_defconfig -@@ -1,5 +1,5 @@ - CONFIG_HZ_100=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_EXPERIMENTAL=y - # CONFIG_LOCALVERSION_AUTO is not set - CONFIG_SYSVIPC=y -diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig -index 5a1097641247..eb5fbf554e7f 100644 ---- a/arch/sh/configs/se7712_defconfig -+++ b/arch/sh/configs/se7712_defconfig -@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y - CONFIG_SH_SOLUTION_ENGINE=y - CONFIG_SH_PCLK_FREQ=66666666 - CONFIG_HEARTBEAT=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_CMDLINE_OVERWRITE=y - CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1" - CONFIG_NET=y -diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig -index 9c0ef13bee10..cbaa65c8bf9e 100644 ---- a/arch/sh/configs/se7721_defconfig -+++ b/arch/sh/configs/se7721_defconfig -@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y - CONFIG_SH_7721_SOLUTION_ENGINE=y - CONFIG_SH_PCLK_FREQ=33333333 - CONFIG_HEARTBEAT=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_CMDLINE_OVERWRITE=y - CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda2" - CONFIG_NET=y -diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig -index ceb48e9b70f4..1a69eda6610c 100644 ---- a/arch/sh/configs/titan_defconfig -+++ b/arch/sh/configs/titan_defconfig -@@ -20,7 +20,7 @@ CONFIG_SH_TITAN=y - CONFIG_SH_PCLK_FREQ=30000000 - CONFIG_SH_DMA=y - CONFIG_SH_DMA_API=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_CMDLINE_OVERWRITE=y - CONFIG_CMDLINE="console=ttySC1,38400N81 root=/dev/nfs ip=:::::eth1:autoconf rw" - CONFIG_PCI=y -diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig -index 4d4e1cc6402f..04bea1d28ba7 100644 ---- a/arch/sparc/configs/sparc64_defconfig -+++ b/arch/sparc/configs/sparc64_defconfig -@@ -22,7 +22,7 @@ CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y - CONFIG_NUMA=y - CONFIG_DEFAULT_MMAP_MIN_ADDR=8192 --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_SUN_LDOMS=y - CONFIG_PCI=y - CONFIG_PCI_MSI=y -diff --git a/arch/tile/configs/tilegx_defconfig b/arch/tile/configs/tilegx_defconfig -index 9f94435cc44f..aa78ee6cd5eb 100644 ---- a/arch/tile/configs/tilegx_defconfig -+++ b/arch/tile/configs/tilegx_defconfig -@@ -47,7 +47,7 @@ CONFIG_CFQ_GROUP_IOSCHED=y - CONFIG_NR_CPUS=100 - CONFIG_HZ_100=y - # CONFIG_COMPACTION is not set --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_TILE_PCI_IO=y - CONFIG_PCI_DEBUG=y - # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -diff --git a/arch/tile/configs/tilepro_defconfig b/arch/tile/configs/tilepro_defconfig -index 1c5bd4f8ffca..38005862062c 100644 ---- a/arch/tile/configs/tilepro_defconfig -+++ b/arch/tile/configs/tilepro_defconfig -@@ -44,7 +44,7 @@ CONFIG_KARMA_PARTITION=y - CONFIG_CFQ_GROUP_IOSCHED=y - CONFIG_HZ_100=y - # CONFIG_COMPACTION is not set --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_PCI_DEBUG=y - # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set - CONFIG_BINFMT_MISC=y -diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig -index 0eb9f92f3717..e5890ae917e5 100644 ---- a/arch/x86/configs/i386_defconfig -+++ b/arch/x86/configs/i386_defconfig -@@ 
-41,7 +41,7 @@ CONFIG_SMP=y - CONFIG_X86_GENERIC=y - CONFIG_HPET_TIMER=y - CONFIG_SCHED_SMT=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y - CONFIG_X86_MCE=y - CONFIG_X86_REBOOTFIXUPS=y -diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig -index 4a4b16e56d35..7452dcadda74 100644 ---- a/arch/x86/configs/x86_64_defconfig -+++ b/arch/x86/configs/x86_64_defconfig -@@ -40,7 +40,7 @@ CONFIG_SMP=y - CONFIG_CALGARY_IOMMU=y - CONFIG_NR_CPUS=64 - CONFIG_SCHED_SMT=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y - CONFIG_X86_MCE=y - CONFIG_MICROCODE=y -diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt -index 3f9c97419f02..1dc79ec7ad09 100644 ---- a/kernel/Kconfig.preempt -+++ b/kernel/Kconfig.preempt -@@ -1,7 +1,7 @@ - - choice - prompt "Preemption Model" -- default PREEMPT_NONE -+ default PREEMPT - - config PREEMPT_NONE - bool "No Forced Preemption (Server)" -@@ -17,7 +17,7 @@ config PREEMPT_NONE - latencies. - - config PREEMPT_VOLUNTARY -- bool "Voluntary Kernel Preemption (Desktop)" -+ bool "Voluntary Kernel Preemption (Nothing)" - help - This option reduces the latency of the kernel by adding more - "explicit preemption points" to the kernel code. These new -@@ -31,7 +31,8 @@ config PREEMPT_VOLUNTARY - applications to run more 'smoothly' even when the system is - under load. - -- Select this if you are building a kernel for a desktop system. -+ Select this for no system in particular (choose Preemptible -+ instead on a desktop if you know what's good for you). - - config PREEMPT - bool "Preemptible Kernel (Low-Latency Desktop)" --- -2.11.0 - diff --git a/sys-kernel/linux-sources-redcore-lts/files/0003-Expose-vmsplit-for-our-poor-32-bit-users.patch b/sys-kernel/linux-sources-redcore-lts/files/0003-Expose-vmsplit-for-our-poor-32-bit-users.patch deleted file mode 100644 index b7897dbe..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/0003-Expose-vmsplit-for-our-poor-32-bit-users.patch +++ /dev/null @@ -1,48 +0,0 @@ -From 44fc740a3ff85d378c28a416a076cc7e019d7b8c Mon Sep 17 00:00:00 2001 -From: Con Kolivas -Date: Fri, 12 May 2017 13:07:37 +1000 -Subject: [PATCH 03/16] Expose vmsplit for our poor 32 bit users. - ---- - arch/x86/Kconfig | 12 ++++++------ - 1 file changed, 6 insertions(+), 6 deletions(-) - -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index e06a7b4e1dc4..931aba4fc567 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -1361,7 +1361,7 @@ config HIGHMEM64G - endchoice - - choice -- prompt "Memory split" if EXPERT -+ prompt "Memory split" - default VMSPLIT_3G - depends on X86_32 - ---help--- -@@ -1381,17 +1381,17 @@ choice - option alone! 
- - config VMSPLIT_3G -- bool "3G/1G user/kernel split" -+ bool "Default 896MB lowmem (3G/1G user/kernel split)" - config VMSPLIT_3G_OPT - depends on !X86_PAE -- bool "3G/1G user/kernel split (for full 1G low memory)" -+ bool "1GB lowmem (3G/1G user/kernel split)" - config VMSPLIT_2G -- bool "2G/2G user/kernel split" -+ bool "2GB lowmem (2G/2G user/kernel split)" - config VMSPLIT_2G_OPT - depends on !X86_PAE -- bool "2G/2G user/kernel split (for full 2G low memory)" -+ bool "2GB lowmem (2G/2G user/kernel split)" - config VMSPLIT_1G -- bool "1G/3G user/kernel split" -+ bool "3GB lowmem (1G/3G user/kernel split)" - endchoice - - config PAGE_OFFSET --- -2.11.0 - diff --git a/sys-kernel/linux-sources-redcore-lts/files/0004-Create-highres-timeout-variants-of-schedule_timeout-.patch b/sys-kernel/linux-sources-redcore-lts/files/0004-Create-highres-timeout-variants-of-schedule_timeout-.patch deleted file mode 100644 index 3c182fbe..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/0004-Create-highres-timeout-variants-of-schedule_timeout-.patch +++ /dev/null @@ -1,153 +0,0 @@ -From d27b58b0707ac311be5a51594fc6f22ed1d109e5 Mon Sep 17 00:00:00 2001 -From: Con Kolivas -Date: Sat, 12 Aug 2017 11:53:39 +1000 -Subject: [PATCH 04/16] Create highres timeout variants of schedule_timeout - functions. - ---- - include/linux/freezer.h | 1 + - include/linux/sched.h | 31 +++++++++++++++++++-- - kernel/time/hrtimer.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++ - 3 files changed, 101 insertions(+), 2 deletions(-) - -diff --git a/include/linux/freezer.h b/include/linux/freezer.h -index 3995df1d068f..f8645e8f2444 100644 ---- a/include/linux/freezer.h -+++ b/include/linux/freezer.h -@@ -297,6 +297,7 @@ static inline void set_freezable(void) {} - #define wait_event_freezekillable_unsafe(wq, condition) \ - wait_event_killable(wq, condition) - -+#define pm_freezing (false) - #endif /* !CONFIG_FREEZER */ - - #endif /* FREEZER_H_INCLUDED */ -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 35dc91a0e2ed..38852ebfa864 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -173,13 +173,40 @@ extern cpumask_var_t cpu_isolated_map; - - extern void scheduler_tick(void); - --#define MAX_SCHEDULE_TIMEOUT LONG_MAX -- -+#define MAX_SCHEDULE_TIMEOUT LONG_MAX - extern long schedule_timeout(long timeout); - extern long schedule_timeout_interruptible(long timeout); - extern long schedule_timeout_killable(long timeout); - extern long schedule_timeout_uninterruptible(long timeout); - extern long schedule_timeout_idle(long timeout); -+ -+#ifdef CONFIG_HIGH_RES_TIMERS -+extern long schedule_msec_hrtimeout(long timeout); -+extern long schedule_min_hrtimeout(void); -+extern long schedule_msec_hrtimeout_interruptible(long timeout); -+extern long schedule_msec_hrtimeout_uninterruptible(long timeout); -+#else -+static inline long schedule_msec_hrtimeout(long timeout) -+{ -+ return schedule_timeout(msecs_to_jiffies(timeout)); -+} -+ -+static inline long schedule_min_hrtimeout(void) -+{ -+ return schedule_timeout(1); -+} -+ -+static inline long schedule_msec_hrtimeout_interruptible(long timeout) -+{ -+ return schedule_timeout_interruptible(msecs_to_jiffies(timeout)); -+} -+ -+static inline long schedule_msec_hrtimeout_uninterruptible(long timeout) -+{ -+ return schedule_timeout_uninterruptible(msecs_to_jiffies(timeout)); -+} -+#endif -+ - asmlinkage void schedule(void); - extern void schedule_preempt_disabled(void); - -diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c -index 
88f75f92ef36..13227cf2814c 100644 ---- a/kernel/time/hrtimer.c -+++ b/kernel/time/hrtimer.c -@@ -1787,3 +1787,74 @@ int __sched schedule_hrtimeout(ktime_t *expires, - return schedule_hrtimeout_range(expires, 0, mode); - } - EXPORT_SYMBOL_GPL(schedule_hrtimeout); -+ -+/* -+ * As per schedule_hrtimeout but taskes a millisecond value and returns how -+ * many milliseconds are left. -+ */ -+long __sched schedule_msec_hrtimeout(long timeout) -+{ -+ struct hrtimer_sleeper t; -+ int delta, secs, jiffs; -+ ktime_t expires; -+ -+ if (!timeout) { -+ __set_current_state(TASK_RUNNING); -+ return 0; -+ } -+ -+ jiffs = msecs_to_jiffies(timeout); -+ /* -+ * If regular timer resolution is adequate or hrtimer resolution is not -+ * (yet) better than Hz, as would occur during startup, use regular -+ * timers. -+ */ -+ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ) -+ return schedule_timeout(jiffs); -+ -+ secs = timeout / 1000; -+ delta = (timeout % 1000) * NSEC_PER_MSEC; -+ expires = ktime_set(secs, delta); -+ -+ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); -+ hrtimer_set_expires_range_ns(&t.timer, expires, delta); -+ -+ hrtimer_init_sleeper(&t, current); -+ -+ hrtimer_start_expires(&t.timer, HRTIMER_MODE_REL); -+ -+ if (likely(t.task)) -+ schedule(); -+ -+ hrtimer_cancel(&t.timer); -+ destroy_hrtimer_on_stack(&t.timer); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ expires = hrtimer_expires_remaining(&t.timer); -+ timeout = ktime_to_ms(expires); -+ return timeout < 0 ? 0 : timeout; -+} -+ -+EXPORT_SYMBOL(schedule_msec_hrtimeout); -+ -+long __sched schedule_min_hrtimeout(void) -+{ -+ return schedule_msec_hrtimeout(1); -+} -+ -+EXPORT_SYMBOL(schedule_min_hrtimeout); -+ -+long __sched schedule_msec_hrtimeout_interruptible(long timeout) -+{ -+ __set_current_state(TASK_INTERRUPTIBLE); -+ return schedule_msec_hrtimeout(timeout); -+} -+EXPORT_SYMBOL(schedule_msec_hrtimeout_interruptible); -+ -+long __sched schedule_msec_hrtimeout_uninterruptible(long timeout) -+{ -+ __set_current_state(TASK_UNINTERRUPTIBLE); -+ return schedule_msec_hrtimeout(timeout); -+} -+EXPORT_SYMBOL(schedule_msec_hrtimeout_uninterruptible); --- -2.11.0 - diff --git a/sys-kernel/linux-sources-redcore-lts/files/0005-Special-case-calls-of-schedule_timeout-1-to-use-the-.patch b/sys-kernel/linux-sources-redcore-lts/files/0005-Special-case-calls-of-schedule_timeout-1-to-use-the-.patch deleted file mode 100644 index 3c889719..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/0005-Special-case-calls-of-schedule_timeout-1-to-use-the-.patch +++ /dev/null @@ -1,50 +0,0 @@ -From 5da7d1778b96c514394334c92de9b3d8d71f4a29 Mon Sep 17 00:00:00 2001 -From: Con Kolivas -Date: Sat, 5 Nov 2016 09:27:36 +1100 -Subject: [PATCH 05/16] Special case calls of schedule_timeout(1) to use the - min hrtimeout of 1ms, working around low Hz resolutions. - ---- - kernel/time/timer.c | 17 +++++++++++++++-- - 1 file changed, 15 insertions(+), 2 deletions(-) - -diff --git a/kernel/time/timer.c b/kernel/time/timer.c -index 9c18e16059a3..dd4d1b193286 100644 ---- a/kernel/time/timer.c -+++ b/kernel/time/timer.c -@@ -1741,6 +1741,19 @@ signed long __sched schedule_timeout(signed long timeout) - - expire = timeout + jiffies; - -+#ifdef CONFIG_HIGH_RES_TIMERS -+ if (timeout == 1 && hrtimer_resolution < NSEC_PER_SEC / HZ) { -+ /* -+ * Special case 1 as being a request for the minimum timeout -+ * and use highres timers to timeout after 1ms to workaround -+ * the granularity of low Hz tick timers. 
-+ */ -+ if (!schedule_min_hrtimeout()) -+ return 0; -+ goto out_timeout; -+ } -+#endif -+ - setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire, false); - schedule(); -@@ -1748,10 +1761,10 @@ signed long __sched schedule_timeout(signed long timeout) - - /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer); -- -+out_timeout: - timeout = expire - jiffies; - -- out: -+out: - return timeout < 0 ? 0 : timeout; - } - EXPORT_SYMBOL(schedule_timeout); --- -2.11.0 - diff --git a/sys-kernel/linux-sources-redcore-lts/files/0006-Convert-msleep-to-use-hrtimers-when-active.patch b/sys-kernel/linux-sources-redcore-lts/files/0006-Convert-msleep-to-use-hrtimers-when-active.patch deleted file mode 100644 index 2f065652..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/0006-Convert-msleep-to-use-hrtimers-when-active.patch +++ /dev/null @@ -1,54 +0,0 @@ -From 9df803c28bb8ccb2588c0ccaf857b9e673175fed Mon Sep 17 00:00:00 2001 -From: Con Kolivas -Date: Fri, 4 Nov 2016 09:25:54 +1100 -Subject: [PATCH 06/16] Convert msleep to use hrtimers when active. - ---- - kernel/time/timer.c | 24 ++++++++++++++++++++++-- - 1 file changed, 22 insertions(+), 2 deletions(-) - -diff --git a/kernel/time/timer.c b/kernel/time/timer.c -index dd4d1b193286..c68cb9307f64 100644 ---- a/kernel/time/timer.c -+++ b/kernel/time/timer.c -@@ -1884,7 +1884,19 @@ void __init init_timers(void) - */ - void msleep(unsigned int msecs) - { -- unsigned long timeout = msecs_to_jiffies(msecs) + 1; -+ int jiffs = msecs_to_jiffies(msecs); -+ unsigned long timeout; -+ -+ /* -+ * Use high resolution timers where the resolution of tick based -+ * timers is inadequate. -+ */ -+ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ) { -+ while (msecs) -+ msecs = schedule_msec_hrtimeout_uninterruptible(msecs); -+ return; -+ } -+ timeout = msecs_to_jiffies(msecs) + 1; - - while (timeout) - timeout = schedule_timeout_uninterruptible(timeout); -@@ -1898,7 +1910,15 @@ EXPORT_SYMBOL(msleep); - */ - unsigned long msleep_interruptible(unsigned int msecs) - { -- unsigned long timeout = msecs_to_jiffies(msecs) + 1; -+ int jiffs = msecs_to_jiffies(msecs); -+ unsigned long timeout; -+ -+ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ) { -+ while (msecs && !signal_pending(current)) -+ msecs = schedule_msec_hrtimeout_interruptible(msecs); -+ return msecs; -+ } -+ timeout = msecs_to_jiffies(msecs) + 1; - - while (timeout && !signal_pending(current)) - timeout = schedule_timeout_interruptible(timeout); --- -2.11.0 - diff --git a/sys-kernel/linux-sources-redcore-lts/files/0007-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch b/sys-kernel/linux-sources-redcore-lts/files/0007-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch deleted file mode 100644 index ff071da8..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/0007-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch +++ /dev/null @@ -1,529 +0,0 @@ -diff -Nur a/drivers/block/swim.c b/drivers/block/swim.c ---- a/drivers/block/swim.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/block/swim.c 2018-11-03 16:30:39.471807304 +0000 -@@ -332,7 +332,7 @@ - if (swim_readbit(base, MOTOR_ON)) - break; - current->state = TASK_INTERRUPTIBLE; -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } - } else if (action == OFF) { - swim_action(base, MOTOR_OFF); -@@ -351,7 +351,7 @@ - if (!swim_readbit(base, DISK_IN)) - break; - current->state = TASK_INTERRUPTIBLE; -- schedule_timeout(1); -+ 
schedule_min_hrtimeout(); - } - swim_select(base, RELAX); - } -@@ -375,7 +375,7 @@ - for (wait = 0; wait < HZ; wait++) { - - current->state = TASK_INTERRUPTIBLE; -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - - swim_select(base, RELAX); - if (!swim_readbit(base, STEP)) -diff -Nur a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c ---- a/drivers/bluetooth/hci_qca.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/bluetooth/hci_qca.c 2018-11-03 16:31:56.065260061 +0000 -@@ -880,7 +880,7 @@ - * then host can communicate with new baudrate to controller - */ - set_current_state(TASK_UNINTERRUPTIBLE); -- schedule_timeout(msecs_to_jiffies(BAUDRATE_SETTLE_TIMEOUT_MS)); -+ schedule_msec_hrtimeout((BAUDRATE_SETTLE_TIMEOUT_MS)); - set_current_state(TASK_RUNNING); - - return 0; -diff -Nur a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c ---- a/drivers/char/ipmi/ipmi_msghandler.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/char/ipmi/ipmi_msghandler.c 2018-11-03 16:30:39.473807368 +0000 -@@ -2953,7 +2953,7 @@ - /* Current message first, to preserve order */ - while (intf->curr_msg && !list_empty(&intf->waiting_rcv_msgs)) { - /* Wait for the message to clear out. */ -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } - - /* No need for locks, the interface is down. */ -diff -Nur a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c ---- a/drivers/char/ipmi/ipmi_ssif.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/char/ipmi/ipmi_ssif.c 2018-11-03 16:30:39.473807368 +0000 -@@ -1200,7 +1200,7 @@ - - /* make sure the driver is not looking for flags any more. */ - while (ssif_info->ssif_state != SSIF_NORMAL) -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - - ssif_info->stopping = true; - del_timer_sync(&ssif_info->retry_timer); -diff -Nur a/drivers/char/snsc.c b/drivers/char/snsc.c ---- a/drivers/char/snsc.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/char/snsc.c 2018-11-03 16:30:39.474807400 +0000 -@@ -198,7 +198,7 @@ - add_wait_queue(&sd->sd_rq, &wait); - spin_unlock_irqrestore(&sd->sd_rlock, flags); - -- schedule_timeout(msecs_to_jiffies(SCDRV_TIMEOUT)); -+ schedule_msec_hrtimeout((SCDRV_TIMEOUT)); - - remove_wait_queue(&sd->sd_rq, &wait); - if (signal_pending(current)) { -@@ -294,7 +294,7 @@ - add_wait_queue(&sd->sd_wq, &wait); - spin_unlock_irqrestore(&sd->sd_wlock, flags); - -- schedule_timeout(msecs_to_jiffies(SCDRV_TIMEOUT)); -+ schedule_msec_hrtimeout((SCDRV_TIMEOUT)); - - remove_wait_queue(&sd->sd_wq, &wait); - if (signal_pending(current)) { -diff -Nur a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c ---- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c 2018-11-03 16:30:39.474807400 +0000 -@@ -235,7 +235,7 @@ - DRM_ERROR("SVGA device lockup.\n"); - break; - } -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - if (interruptible && signal_pending(current)) { - ret = -ERESTARTSYS; - break; -diff -Nur a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c ---- a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c 2018-11-03 16:30:39.474807400 +0000 -@@ -202,7 +202,7 @@ - break; - } - if (lazy) -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - else if ((++count & 0x0F) == 0) { - /** - * FIXME: Use schedule_hr_timeout here for -diff -Nur a/drivers/media/pci/ivtv/ivtv-ioctl.c b/drivers/media/pci/ivtv/ivtv-ioctl.c ---- 
a/drivers/media/pci/ivtv/ivtv-ioctl.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/media/pci/ivtv/ivtv-ioctl.c 2018-11-03 16:30:39.475807432 +0000 -@@ -1154,7 +1154,7 @@ - TASK_UNINTERRUPTIBLE); - if ((read_reg(IVTV_REG_DEC_LINE_FIELD) >> 16) < 100) - break; -- schedule_timeout(msecs_to_jiffies(25)); -+ schedule_msec_hrtimeout((25)); - } - finish_wait(&itv->vsync_waitq, &wait); - mutex_lock(&itv->serialize_lock); -diff -Nur a/drivers/media/pci/ivtv/ivtv-streams.c b/drivers/media/pci/ivtv/ivtv-streams.c ---- a/drivers/media/pci/ivtv/ivtv-streams.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/media/pci/ivtv/ivtv-streams.c 2018-11-03 16:30:39.475807432 +0000 -@@ -834,7 +834,7 @@ - while (!test_bit(IVTV_F_I_EOS, &itv->i_flags) && - time_before(jiffies, - then + msecs_to_jiffies(2000))) { -- schedule_timeout(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout((10)); - } - - /* To convert jiffies to ms, we must multiply by 1000 -diff -Nur a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c ---- a/drivers/mfd/ucb1x00-core.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/mfd/ucb1x00-core.c 2018-11-03 16:30:39.476807464 +0000 -@@ -253,7 +253,7 @@ - break; - /* yield to other processes */ - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } - - return UCB_ADC_DAT(val); -diff -Nur a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c ---- a/drivers/misc/sgi-xp/xpc_channel.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/misc/sgi-xp/xpc_channel.c 2018-11-03 16:30:39.476807464 +0000 -@@ -837,7 +837,7 @@ - - atomic_inc(&ch->n_on_msg_allocate_wq); - prepare_to_wait(&ch->msg_allocate_wq, &wait, TASK_INTERRUPTIBLE); -- ret = schedule_timeout(1); -+ ret = schedule_min_hrtimeout(); - finish_wait(&ch->msg_allocate_wq, &wait); - atomic_dec(&ch->n_on_msg_allocate_wq); - -diff -Nur a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c ---- a/drivers/net/caif/caif_hsi.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/net/caif/caif_hsi.c 2018-11-03 16:30:39.477807497 +0000 -@@ -940,7 +940,7 @@ - break; - - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - retry--; - } - -diff -Nur a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c ---- a/drivers/net/can/usb/peak_usb/pcan_usb.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/net/can/usb/peak_usb/pcan_usb.c 2018-11-03 16:30:39.477807497 +0000 -@@ -250,7 +250,7 @@ - } else { - /* the PCAN-USB needs time to init */ - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(msecs_to_jiffies(PCAN_USB_STARTUP_TIMEOUT)); -+ schedule_msec_hrtimeout((PCAN_USB_STARTUP_TIMEOUT)); - } - - return err; -diff -Nur a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c ---- a/drivers/net/usb/lan78xx.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/net/usb/lan78xx.c 2018-11-03 16:30:39.478807529 +0000 -@@ -2567,7 +2567,7 @@ - while (!skb_queue_empty(&dev->rxq) && - !skb_queue_empty(&dev->txq) && - !skb_queue_empty(&dev->done)) { -- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); -+ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); - set_current_state(TASK_UNINTERRUPTIBLE); - netif_dbg(dev, ifdown, dev->net, - "waited for %d urb completions\n", temp); -diff -Nur a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c ---- a/drivers/net/usb/usbnet.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/net/usb/usbnet.c 2018-11-03 16:30:39.479807561 +0000 -@@ -772,7 +772,7 @@ - spin_lock_irqsave(&q->lock, 
flags); - while (!skb_queue_empty(q)) { - spin_unlock_irqrestore(&q->lock, flags); -- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); -+ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); - set_current_state(TASK_UNINTERRUPTIBLE); - spin_lock_irqsave(&q->lock, flags); - } -diff -Nur a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c ---- a/drivers/ntb/test/ntb_perf.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/ntb/test/ntb_perf.c 2018-11-03 16:30:39.479807561 +0000 -@@ -310,7 +310,7 @@ - if (unlikely((jiffies - last_sleep) > 5 * HZ)) { - last_sleep = jiffies; - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } - - if (unlikely(kthread_should_stop())) -diff -Nur a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c ---- a/drivers/scsi/fnic/fnic_scsi.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/scsi/fnic/fnic_scsi.c 2018-11-03 16:30:39.480807592 +0000 -@@ -217,7 +217,7 @@ - - /* wait for io cmpl */ - while (atomic_read(&fnic->in_flight)) -- schedule_timeout(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout((1)); - - spin_lock_irqsave(&fnic->wq_copy_lock[0], flags); - -@@ -2255,7 +2255,7 @@ - } - } - -- schedule_timeout(msecs_to_jiffies(2 * fnic->config.ed_tov)); -+ schedule_msec_hrtimeout((2 * fnic->config.ed_tov)); - - /* walk again to check, if IOs are still pending in fw */ - if (fnic_is_abts_pending(fnic, lr_sc)) -diff -Nur a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c ---- a/drivers/scsi/snic/snic_scsi.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/scsi/snic/snic_scsi.c 2018-11-03 16:30:39.481807625 +0000 -@@ -2354,7 +2354,7 @@ - - /* Wait for all the IOs that are entered in Qcmd */ - while (atomic_read(&snic->ios_inflight)) -- schedule_timeout(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout((1)); - - ret = snic_issue_hba_reset(snic, sc); - if (ret) { -diff -Nur a/drivers/staging/comedi/drivers/ni_mio_common.c b/drivers/staging/comedi/drivers/ni_mio_common.c ---- a/drivers/staging/comedi/drivers/ni_mio_common.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/staging/comedi/drivers/ni_mio_common.c 2018-11-03 16:30:39.483807688 +0000 -@@ -4657,7 +4657,7 @@ - if ((status & NI67XX_CAL_STATUS_BUSY) == 0) - break; - set_current_state(TASK_INTERRUPTIBLE); -- if (schedule_timeout(1)) -+ if (schedule_min_hrtimeout()) - return -EIO; - } - if (i == timeout) { -diff -Nur a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c ---- a/drivers/staging/lustre/lnet/lnet/lib-eq.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/staging/lustre/lnet/lnet/lib-eq.c 2018-11-03 16:30:39.483807688 +0000 -@@ -329,7 +329,7 @@ - schedule(); - } else { - now = jiffies; -- schedule_timeout(msecs_to_jiffies(tms)); -+ schedule_msec_hrtimeout((tms)); - tms -= jiffies_to_msecs(jiffies - now); - if (tms < 0) /* no more wait but may have new event */ - tms = 0; -diff -Nur a/drivers/staging/rts5208/rtsx.c b/drivers/staging/rts5208/rtsx.c ---- a/drivers/staging/rts5208/rtsx.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/staging/rts5208/rtsx.c 2018-11-03 16:30:39.483807688 +0000 -@@ -524,7 +524,7 @@ - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(msecs_to_jiffies(POLLING_INTERVAL)); -+ schedule_msec_hrtimeout((POLLING_INTERVAL)); - - /* lock the device pointers */ - mutex_lock(&dev->dev_mutex); -diff -Nur a/drivers/staging/speakup/speakup_acntpc.c b/drivers/staging/speakup/speakup_acntpc.c ---- a/drivers/staging/speakup/speakup_acntpc.c 2018-10-10 
07:54:28.000000000 +0100 -+++ b/drivers/staging/speakup/speakup_acntpc.c 2018-11-03 16:30:39.484807721 +0000 -@@ -206,7 +206,7 @@ - full_time_val = full_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if (synth_full()) { -- schedule_timeout(msecs_to_jiffies(full_time_val)); -+ schedule_msec_hrtimeout((full_time_val)); - continue; - } - set_current_state(TASK_RUNNING); -@@ -234,7 +234,7 @@ - jiffy_delta_val = jiffy_delta->u.n.value; - delay_time_val = delay_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - jiff_max = jiffies + jiffy_delta_val; - } - } -diff -Nur a/drivers/staging/speakup/speakup_apollo.c b/drivers/staging/speakup/speakup_apollo.c ---- a/drivers/staging/speakup/speakup_apollo.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/staging/speakup/speakup_apollo.c 2018-11-03 16:30:39.484807721 +0000 -@@ -174,7 +174,7 @@ - if (!synth->io_ops->synth_out(synth, ch)) { - synth->io_ops->tiocmset(0, UART_MCR_RTS); - synth->io_ops->tiocmset(UART_MCR_RTS, 0); -- schedule_timeout(msecs_to_jiffies(full_time_val)); -+ schedule_msec_hrtimeout(full_time_val); - continue; - } - if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { -diff -Nur a/drivers/staging/speakup/speakup_decext.c b/drivers/staging/speakup/speakup_decext.c ---- a/drivers/staging/speakup/speakup_decext.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/staging/speakup/speakup_decext.c 2018-11-03 16:30:39.484807721 +0000 -@@ -185,7 +185,7 @@ - if (ch == '\n') - ch = 0x0D; - if (synth_full() || !synth->io_ops->synth_out(synth, ch)) { -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - continue; - } - set_current_state(TASK_RUNNING); -diff -Nur a/drivers/staging/speakup/speakup_decpc.c b/drivers/staging/speakup/speakup_decpc.c ---- a/drivers/staging/speakup/speakup_decpc.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/staging/speakup/speakup_decpc.c 2018-11-03 16:30:39.484807721 +0000 -@@ -403,7 +403,7 @@ - if (ch == '\n') - ch = 0x0D; - if (dt_sendchar(ch)) { -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout((delay_time_val)); - continue; - } - set_current_state(TASK_RUNNING); -diff -Nur a/drivers/staging/speakup/speakup_dectlk.c b/drivers/staging/speakup/speakup_dectlk.c ---- a/drivers/staging/speakup/speakup_dectlk.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/staging/speakup/speakup_dectlk.c 2018-11-03 16:30:39.485807753 +0000 -@@ -253,7 +253,7 @@ - if (ch == '\n') - ch = 0x0D; - if (synth_full_val || !synth->io_ops->synth_out(synth, ch)) { -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - continue; - } - set_current_state(TASK_RUNNING); -diff -Nur a/drivers/staging/speakup/speakup_dtlk.c b/drivers/staging/speakup/speakup_dtlk.c ---- a/drivers/staging/speakup/speakup_dtlk.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/staging/speakup/speakup_dtlk.c 2018-11-03 16:30:39.485807753 +0000 -@@ -220,7 +220,7 @@ - delay_time_val = delay_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if (synth_full()) { -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout((delay_time_val)); - continue; - } - set_current_state(TASK_RUNNING); -@@ -236,7 +236,7 @@ - delay_time_val = delay_time->u.n.value; - jiffy_delta_val = jiffy_delta->u.n.value; - 
spin_unlock_irqrestore(&speakup_info.spinlock, flags); -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout((delay_time_val)); - jiff_max = jiffies + jiffy_delta_val; - } - } -diff -Nur a/drivers/staging/speakup/speakup_keypc.c b/drivers/staging/speakup/speakup_keypc.c ---- a/drivers/staging/speakup/speakup_keypc.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/staging/speakup/speakup_keypc.c 2018-11-03 16:30:39.485807753 +0000 -@@ -208,7 +208,7 @@ - full_time_val = full_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if (synth_full()) { -- schedule_timeout(msecs_to_jiffies(full_time_val)); -+ schedule_msec_hrtimeout((full_time_val)); - continue; - } - set_current_state(TASK_RUNNING); -@@ -241,7 +241,7 @@ - jiffy_delta_val = jiffy_delta->u.n.value; - delay_time_val = delay_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout((delay_time_val)); - jiff_max = jiffies+jiffy_delta_val; - } - } -diff -Nur a/drivers/staging/speakup/synth.c b/drivers/staging/speakup/synth.c ---- a/drivers/staging/speakup/synth.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/staging/speakup/synth.c 2018-11-03 16:30:39.486807785 +0000 -@@ -92,7 +92,7 @@ - if (ch == '\n') - ch = synth->procspeech; - if (!synth->io_ops->synth_out(synth, ch)) { -- schedule_timeout(msecs_to_jiffies(full_time_val)); -+ schedule_msec_hrtimeout(full_time_val); - continue; - } - if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { -diff -Nur a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c ---- a/drivers/staging/unisys/visornic/visornic_main.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/staging/unisys/visornic/visornic_main.c 2018-11-03 16:30:39.486807785 +0000 -@@ -556,7 +556,7 @@ - } - set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irqrestore(&devdata->priv_lock, flags); -- wait += schedule_timeout(msecs_to_jiffies(10)); -+ wait += schedule_msec_hrtimeout((10)); - spin_lock_irqsave(&devdata->priv_lock, flags); - } - -@@ -567,7 +567,7 @@ - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irqrestore(&devdata->priv_lock, flags); -- schedule_timeout(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout((10)); - spin_lock_irqsave(&devdata->priv_lock, flags); - if (atomic_read(&devdata->usage)) - break; -@@ -721,7 +721,7 @@ - } - set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irqrestore(&devdata->priv_lock, flags); -- wait += schedule_timeout(msecs_to_jiffies(10)); -+ wait += schedule_msec_hrtimeout((10)); - spin_lock_irqsave(&devdata->priv_lock, flags); - } - -diff -Nur a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c ---- a/drivers/target/target_core_user.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/target/target_core_user.c 2018-11-03 16:30:39.487807817 +0000 -@@ -808,10 +808,9 @@ - pr_debug("sleeping for ring space\n"); - mutex_unlock(&udev->cmdr_lock); - if (udev->cmd_time_out) -- ret = schedule_timeout( -- msecs_to_jiffies(udev->cmd_time_out)); -+ ret = schedule_msec_hrtimeout(udev->cmd_time_out); - else -- ret = schedule_timeout(msecs_to_jiffies(TCMU_TIME_OUT)); -+ ret = schedule_msec_hrtimeout(TCMU_TIME_OUT); - finish_wait(&udev->wait_cmdr, &__wait); - if (!ret) { - pr_warn("tcmu: command timed out\n"); -diff -Nur a/drivers/video/fbdev/omap/hwa742.c b/drivers/video/fbdev/omap/hwa742.c ---- a/drivers/video/fbdev/omap/hwa742.c 2018-10-10 07:54:28.000000000 
+0100 -+++ b/drivers/video/fbdev/omap/hwa742.c 2018-11-03 16:30:39.487807817 +0000 -@@ -926,7 +926,7 @@ - if (hwa742_read_reg(HWA742_PLL_DIV_REG) & (1 << 7)) - break; - set_current_state(TASK_UNINTERRUPTIBLE); -- schedule_timeout(msecs_to_jiffies(5)); -+ schedule_msec_hrtimeout((5)); - } - hwa742_set_update_mode(hwa742.update_mode_before_suspend); - } -diff -Nur a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c ---- a/drivers/video/fbdev/pxafb.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/drivers/video/fbdev/pxafb.c 2018-11-03 16:30:39.488807849 +0000 -@@ -1286,7 +1286,7 @@ - mutex_unlock(&fbi->ctrlr_lock); - - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(msecs_to_jiffies(30)); -+ schedule_msec_hrtimeout((30)); - } - - pr_debug("%s(): task ending\n", __func__); -diff -Nur a/fs/afs/vlocation.c b/fs/afs/vlocation.c ---- a/fs/afs/vlocation.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/fs/afs/vlocation.c 2018-11-03 16:30:39.488807849 +0000 -@@ -129,7 +129,7 @@ - if (vl->upd_busy_cnt > 1) { - /* second+ BUSY - sleep a little bit */ - set_current_state(TASK_UNINTERRUPTIBLE); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } - continue; - } -diff -Nur a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c ---- a/fs/btrfs/extent-tree.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/fs/btrfs/extent-tree.c 2018-11-03 16:30:39.491807945 +0000 -@@ -6106,7 +6106,7 @@ - - if (flush != BTRFS_RESERVE_NO_FLUSH && - btrfs_transaction_in_commit(fs_info)) -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - - if (delalloc_lock) - mutex_lock(&inode->delalloc_mutex); -diff -Nur a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c ---- a/fs/btrfs/inode-map.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/fs/btrfs/inode-map.c 2018-11-03 16:30:39.492807977 +0000 -@@ -89,7 +89,7 @@ - btrfs_release_path(path); - root->ino_cache_progress = last; - up_read(&fs_info->commit_root_sem); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - goto again; - } else - continue; -diff -Nur a/sound/usb/line6/pcm.c b/sound/usb/line6/pcm.c ---- a/sound/usb/line6/pcm.c 2018-10-10 07:54:28.000000000 +0100 -+++ b/sound/usb/line6/pcm.c 2018-11-03 16:30:39.492807977 +0000 -@@ -131,7 +131,7 @@ - if (!alive) - break; - set_current_state(TASK_UNINTERRUPTIBLE); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } while (--timeout > 0); - if (alive) - dev_err(line6pcm->line6->ifcdev, diff --git a/sys-kernel/linux-sources-redcore-lts/files/0008-Replace-all-calls-to-schedule_timeout_interruptible-.patch b/sys-kernel/linux-sources-redcore-lts/files/0008-Replace-all-calls-to-schedule_timeout_interruptible-.patch deleted file mode 100644 index f9f274ce..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/0008-Replace-all-calls-to-schedule_timeout_interruptible-.patch +++ /dev/null @@ -1,311 +0,0 @@ -From 3ef5df78c2f425115b87f0f2f59fd189c0f1bbe3 Mon Sep 17 00:00:00 2001 -From: Con Kolivas -Date: Mon, 20 Feb 2017 13:30:07 +1100 -Subject: [PATCH 08/16] Replace all calls to schedule_timeout_interruptible of - potentially under 50ms to use schedule_msec_hrtimeout_interruptible. 
- ---- - drivers/hwmon/fam15h_power.c | 2 +- - drivers/iio/light/tsl2563.c | 6 +----- - drivers/media/i2c/msp3400-driver.c | 4 ++-- - drivers/media/pci/ivtv/ivtv-gpio.c | 6 +++--- - drivers/media/radio/radio-mr800.c | 2 +- - drivers/media/radio/radio-tea5777.c | 2 +- - drivers/media/radio/tea575x.c | 2 +- - drivers/parport/ieee1284.c | 2 +- - drivers/parport/ieee1284_ops.c | 2 +- - drivers/platform/x86/intel_ips.c | 8 ++++---- - net/core/pktgen.c | 2 +- - sound/soc/codecs/wm8350.c | 12 ++++++------ - sound/soc/codecs/wm8900.c | 2 +- - sound/soc/codecs/wm9713.c | 4 ++-- - 14 files changed, 26 insertions(+), 30 deletions(-) - -diff --git a/drivers/hwmon/fam15h_power.c b/drivers/hwmon/fam15h_power.c -index 9545a346044f..c24cf1302ec7 100644 ---- a/drivers/hwmon/fam15h_power.c -+++ b/drivers/hwmon/fam15h_power.c -@@ -237,7 +237,7 @@ static ssize_t power1_average_show(struct device *dev, - prev_ptsc[cu] = data->cpu_sw_pwr_ptsc[cu]; - } - -- leftover = schedule_timeout_interruptible(msecs_to_jiffies(data->power_period)); -+ leftover = schedule_msec_hrtimeout_interruptible((data->power_period)); - if (leftover) - return 0; - -diff --git a/drivers/iio/light/tsl2563.c b/drivers/iio/light/tsl2563.c -index 7599693f7fe9..452090739138 100644 ---- a/drivers/iio/light/tsl2563.c -+++ b/drivers/iio/light/tsl2563.c -@@ -282,11 +282,7 @@ static void tsl2563_wait_adc(struct tsl2563_chip *chip) - default: - delay = 402; - } -- /* -- * TODO: Make sure that we wait at least required delay but why we -- * have to extend it one tick more? -- */ -- schedule_timeout_interruptible(msecs_to_jiffies(delay) + 2); -+ schedule_msec_hrtimeout_interruptible(delay + 1); - } - - static int tsl2563_adjust_gainlevel(struct tsl2563_chip *chip, u16 adc) -diff --git a/drivers/media/i2c/msp3400-driver.c b/drivers/media/i2c/msp3400-driver.c -index 3db966db83eb..f0fab7676f72 100644 ---- a/drivers/media/i2c/msp3400-driver.c -+++ b/drivers/media/i2c/msp3400-driver.c -@@ -179,7 +179,7 @@ static int msp_read(struct i2c_client *client, int dev, int addr) - break; - dev_warn(&client->dev, "I/O error #%d (read 0x%02x/0x%02x)\n", err, - dev, addr); -- schedule_timeout_interruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_interruptible((10)); - } - if (err == 3) { - dev_warn(&client->dev, "resetting chip, sound will go off.\n"); -@@ -220,7 +220,7 @@ static int msp_write(struct i2c_client *client, int dev, int addr, int val) - break; - dev_warn(&client->dev, "I/O error #%d (write 0x%02x/0x%02x)\n", err, - dev, addr); -- schedule_timeout_interruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_interruptible((10)); - } - if (err == 3) { - dev_warn(&client->dev, "resetting chip, sound will go off.\n"); -diff --git a/drivers/media/pci/ivtv/ivtv-gpio.c b/drivers/media/pci/ivtv/ivtv-gpio.c -index f752f3993687..23372af61ebf 100644 ---- a/drivers/media/pci/ivtv/ivtv-gpio.c -+++ b/drivers/media/pci/ivtv/ivtv-gpio.c -@@ -117,7 +117,7 @@ void ivtv_reset_ir_gpio(struct ivtv *itv) - curout = (curout & ~0xF) | 1; - write_reg(curout, IVTV_REG_GPIO_OUT); - /* We could use something else for smaller time */ -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible((1)); - curout |= 2; - write_reg(curout, IVTV_REG_GPIO_OUT); - curdir &= ~0x80; -@@ -137,11 +137,11 @@ int ivtv_reset_tuner_gpio(void *dev, int component, int cmd, int value) - curout = read_reg(IVTV_REG_GPIO_OUT); - curout &= ~(1 << itv->card->xceive_pin); - write_reg(curout, IVTV_REG_GPIO_OUT); -- 
schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible((1)); - - curout |= 1 << itv->card->xceive_pin; - write_reg(curout, IVTV_REG_GPIO_OUT); -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible((1)); - return 0; - } - -diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c -index c9f59129af79..cb6f8394a5c2 100644 ---- a/drivers/media/radio/radio-mr800.c -+++ b/drivers/media/radio/radio-mr800.c -@@ -378,7 +378,7 @@ static int vidioc_s_hw_freq_seek(struct file *file, void *priv, - retval = -ENODATA; - break; - } -- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { -+ if (schedule_msec_hrtimeout_interruptible((10))) { - retval = -ERESTARTSYS; - break; - } -diff --git a/drivers/media/radio/radio-tea5777.c b/drivers/media/radio/radio-tea5777.c -index 04ed1a5d1177..d593d28dc286 100644 ---- a/drivers/media/radio/radio-tea5777.c -+++ b/drivers/media/radio/radio-tea5777.c -@@ -245,7 +245,7 @@ static int radio_tea5777_update_read_reg(struct radio_tea5777 *tea, int wait) - } - - if (wait) { -- if (schedule_timeout_interruptible(msecs_to_jiffies(wait))) -+ if (schedule_msec_hrtimeout_interruptible((wait))) - return -ERESTARTSYS; - } - -diff --git a/drivers/media/radio/tea575x.c b/drivers/media/radio/tea575x.c -index 4dc2067bce14..29f4416fb9ae 100644 ---- a/drivers/media/radio/tea575x.c -+++ b/drivers/media/radio/tea575x.c -@@ -416,7 +416,7 @@ int snd_tea575x_s_hw_freq_seek(struct file *file, struct snd_tea575x *tea, - for (;;) { - if (time_after(jiffies, timeout)) - break; -- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { -+ if (schedule_msec_hrtimeout_interruptible((10))) { - /* some signal arrived, stop search */ - tea->val &= ~TEA575X_BIT_SEARCH; - snd_tea575x_set_freq(tea); -diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c -index 74cc6dd982d2..c22c4d5f08d0 100644 ---- a/drivers/parport/ieee1284.c -+++ b/drivers/parport/ieee1284.c -@@ -215,7 +215,7 @@ int parport_wait_peripheral(struct parport *port, - /* parport_wait_event didn't time out, but the - * peripheral wasn't actually ready either. - * Wait for another 10ms. */ -- schedule_timeout_interruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_interruptible((10)); - } - } - -diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c -index 5d41dda6da4e..34705f6b423f 100644 ---- a/drivers/parport/ieee1284_ops.c -+++ b/drivers/parport/ieee1284_ops.c -@@ -537,7 +537,7 @@ size_t parport_ieee1284_ecp_read_data (struct parport *port, - /* Yield the port for a while. 
*/ - if (count && dev->port->irq != PARPORT_IRQ_NONE) { - parport_release (dev); -- schedule_timeout_interruptible(msecs_to_jiffies(40)); -+ schedule_msec_hrtimeout_interruptible((40)); - parport_claim_or_block (dev); - } - else -diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c -index 58dcee562d64..b661b7c071bb 100644 ---- a/drivers/platform/x86/intel_ips.c -+++ b/drivers/platform/x86/intel_ips.c -@@ -813,7 +813,7 @@ static int ips_adjust(void *data) - ips_gpu_lower(ips); - - sleep: -- schedule_timeout_interruptible(msecs_to_jiffies(IPS_ADJUST_PERIOD)); -+ schedule_msec_hrtimeout_interruptible((IPS_ADJUST_PERIOD)); - } while (!kthread_should_stop()); - - dev_dbg(&ips->dev->dev, "ips-adjust thread stopped\n"); -@@ -992,7 +992,7 @@ static int ips_monitor(void *data) - seqno_timestamp = get_jiffies_64(); - - old_cpu_power = thm_readl(THM_CEC); -- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); -+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); - - /* Collect an initial average */ - for (i = 0; i < IPS_SAMPLE_COUNT; i++) { -@@ -1019,7 +1019,7 @@ static int ips_monitor(void *data) - mchp_samples[i] = mchp; - } - -- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); -+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); - if (kthread_should_stop()) - break; - } -@@ -1046,7 +1046,7 @@ static int ips_monitor(void *data) - * us to reduce the sample frequency if the CPU and GPU are idle. - */ - old_cpu_power = thm_readl(THM_CEC); -- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); -+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); - last_sample_period = IPS_SAMPLE_PERIOD; - - setup_deferrable_timer_on_stack(&timer, monitor_timeout, -diff --git a/net/core/pktgen.c b/net/core/pktgen.c -index 6e1e10ff433a..be5d6f7142e4 100644 ---- a/net/core/pktgen.c -+++ b/net/core/pktgen.c -@@ -1992,7 +1992,7 @@ static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname) - mutex_unlock(&pktgen_thread_lock); - pr_debug("%s: waiting for %s to disappear....\n", - __func__, ifname); -- schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try)); -+ schedule_msec_hrtimeout_interruptible((msec_per_try)); - mutex_lock(&pktgen_thread_lock); - - if (++i >= max_tries) { -diff --git a/sound/soc/codecs/wm8350.c b/sound/soc/codecs/wm8350.c -index 2efc5b41ad0f..3e3248c48c6b 100644 ---- a/sound/soc/codecs/wm8350.c -+++ b/sound/soc/codecs/wm8350.c -@@ -236,10 +236,10 @@ static void wm8350_pga_work(struct work_struct *work) - out2->ramp == WM8350_RAMP_UP) { - /* delay is longer over 0dB as increases are larger */ - if (i >= WM8350_OUTn_0dB) -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (2)); - else -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (1)); - } else - udelay(50); /* doesn't matter if we delay longer */ -@@ -1123,7 +1123,7 @@ static int wm8350_set_bias_level(struct snd_soc_codec *codec, - (platform->dis_out4 << 6)); - - /* wait for discharge */ -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (platform-> - cap_discharge_msecs)); - -@@ -1139,7 +1139,7 @@ static int wm8350_set_bias_level(struct snd_soc_codec *codec, - WM8350_VBUFEN); - - /* wait for vmid */ -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (platform-> - vmid_charge_msecs)); - -@@ -1190,7 +1190,7 @@ static int 
wm8350_set_bias_level(struct snd_soc_codec *codec, - wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1, pm1); - - /* wait */ -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (platform-> - vmid_discharge_msecs)); - -@@ -1208,7 +1208,7 @@ static int wm8350_set_bias_level(struct snd_soc_codec *codec, - pm1 | WM8350_OUTPUT_DRAIN_EN); - - /* wait */ -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (platform->drain_msecs)); - - pm1 &= ~WM8350_BIASEN; -diff --git a/sound/soc/codecs/wm8900.c b/sound/soc/codecs/wm8900.c -index c77b49a29311..fc50456e90a9 100644 ---- a/sound/soc/codecs/wm8900.c -+++ b/sound/soc/codecs/wm8900.c -@@ -1112,7 +1112,7 @@ static int wm8900_set_bias_level(struct snd_soc_codec *codec, - /* Need to let things settle before stopping the clock - * to ensure that restart works, see "Stopping the - * master clock" in the datasheet. */ -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible((1)); - snd_soc_write(codec, WM8900_REG_POWER2, - WM8900_REG_POWER2_SYSCLK_ENA); - break; -diff --git a/sound/soc/codecs/wm9713.c b/sound/soc/codecs/wm9713.c -index 7e4822185feb..0c85a207446a 100644 ---- a/sound/soc/codecs/wm9713.c -+++ b/sound/soc/codecs/wm9713.c -@@ -199,7 +199,7 @@ static int wm9713_voice_shutdown(struct snd_soc_dapm_widget *w, - - /* Gracefully shut down the voice interface. */ - snd_soc_update_bits(codec, AC97_HANDSET_RATE, 0x0f00, 0x0200); -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible((1)); - snd_soc_update_bits(codec, AC97_HANDSET_RATE, 0x0f00, 0x0f00); - snd_soc_update_bits(codec, AC97_EXTENDED_MID, 0x1000, 0x1000); - -@@ -868,7 +868,7 @@ static int wm9713_set_pll(struct snd_soc_codec *codec, - wm9713->pll_in = freq_in; - - /* wait 10ms AC97 link frames for the link to stabilise */ -- schedule_timeout_interruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_interruptible((10)); - return 0; - } - --- -2.11.0 - diff --git a/sys-kernel/linux-sources-redcore-lts/files/0009-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch b/sys-kernel/linux-sources-redcore-lts/files/0009-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch deleted file mode 100644 index c910f3df..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/0009-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch +++ /dev/null @@ -1,160 +0,0 @@ -From 6044370cf4bbc5e05f5d78f5772c1d88e3153603 Mon Sep 17 00:00:00 2001 -From: Con Kolivas -Date: Mon, 20 Feb 2017 13:30:32 +1100 -Subject: [PATCH 09/16] Replace all calls to schedule_timeout_uninterruptible - of potentially under 50ms to use schedule_msec_hrtimeout_uninterruptible - ---- - drivers/media/pci/cx18/cx18-gpio.c | 4 ++-- - drivers/net/wireless/intel/ipw2x00/ipw2100.c | 4 ++-- - drivers/rtc/rtc-wm8350.c | 6 +++--- - drivers/scsi/lpfc/lpfc_scsi.c | 2 +- - sound/pci/maestro3.c | 4 ++-- - sound/soc/codecs/rt5631.c | 4 ++-- - sound/soc/soc-dapm.c | 2 +- - 7 files changed, 13 insertions(+), 13 deletions(-) - -diff --git a/drivers/media/pci/cx18/cx18-gpio.c b/drivers/media/pci/cx18/cx18-gpio.c -index 012859e6dc7b..206bd08265a5 100644 ---- a/drivers/media/pci/cx18/cx18-gpio.c -+++ b/drivers/media/pci/cx18/cx18-gpio.c -@@ -90,11 +90,11 @@ static void gpio_reset_seq(struct cx18 *cx, u32 active_lo, u32 active_hi, - - /* Assert */ - gpio_update(cx, mask, ~active_lo); -- schedule_timeout_uninterruptible(msecs_to_jiffies(assert_msecs)); -+ 
schedule_msec_hrtimeout_uninterruptible((assert_msecs)); - - /* Deassert */ - gpio_update(cx, mask, ~active_hi); -- schedule_timeout_uninterruptible(msecs_to_jiffies(recovery_msecs)); -+ schedule_msec_hrtimeout_uninterruptible((recovery_msecs)); - } - - /* -diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c -index 19c442cb93e4..448f41782060 100644 ---- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c -+++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c -@@ -830,7 +830,7 @@ static int ipw2100_hw_send_command(struct ipw2100_priv *priv, - * doesn't seem to have as many firmware restart cycles... - * - * As a test, we're sticking in a 1/100s delay here */ -- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_uninterruptible((10)); - - return 0; - -@@ -1281,7 +1281,7 @@ static int ipw2100_start_adapter(struct ipw2100_priv *priv) - IPW_DEBUG_FW("Waiting for f/w initialization to complete...\n"); - i = 5000; - do { -- schedule_timeout_uninterruptible(msecs_to_jiffies(40)); -+ schedule_msec_hrtimeout_uninterruptible((40)); - /* Todo... wait for sync command ... */ - - read_register(priv->net_dev, IPW_REG_INTA, &inta); -diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c -index 483c7993516b..fddbaa475066 100644 ---- a/drivers/rtc/rtc-wm8350.c -+++ b/drivers/rtc/rtc-wm8350.c -@@ -119,7 +119,7 @@ static int wm8350_rtc_settime(struct device *dev, struct rtc_time *tm) - /* Wait until confirmation of stopping */ - do { - rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); -- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_uninterruptible((1)); - } while (--retries && !(rtc_ctrl & WM8350_RTC_STS)); - - if (!retries) { -@@ -202,7 +202,7 @@ static int wm8350_rtc_stop_alarm(struct wm8350 *wm8350) - /* Wait until confirmation of stopping */ - do { - rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); -- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_uninterruptible((1)); - } while (retries-- && !(rtc_ctrl & WM8350_RTC_ALMSTS)); - - if (!(rtc_ctrl & WM8350_RTC_ALMSTS)) -@@ -225,7 +225,7 @@ static int wm8350_rtc_start_alarm(struct wm8350 *wm8350) - /* Wait until confirmation */ - do { - rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); -- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_uninterruptible((1)); - } while (retries-- && rtc_ctrl & WM8350_RTC_ALMSTS); - - if (rtc_ctrl & WM8350_RTC_ALMSTS) -diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c -index 1a6f122bb25d..c0db66302a3e 100644 ---- a/drivers/scsi/lpfc/lpfc_scsi.c -+++ b/drivers/scsi/lpfc/lpfc_scsi.c -@@ -5131,7 +5131,7 @@ lpfc_reset_flush_io_context(struct lpfc_vport *vport, uint16_t tgt_id, - tgt_id, lun_id, context); - later = msecs_to_jiffies(2 * vport->cfg_devloss_tmo * 1000) + jiffies; - while (time_after(later, jiffies) && cnt) { -- schedule_timeout_uninterruptible(msecs_to_jiffies(20)); -+ schedule_msec_hrtimeout_uninterruptible((20)); - cnt = lpfc_sli_sum_iocb(vport, tgt_id, lun_id, context); - } - if (cnt) { -diff --git a/sound/pci/maestro3.c b/sound/pci/maestro3.c -index 8f20dec97843..944ce63431b0 100644 ---- a/sound/pci/maestro3.c -+++ b/sound/pci/maestro3.c -@@ -2016,7 +2016,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) - outw(0, io + GPIO_DATA); - outw(dir | GPO_PRIMARY_AC97, io + GPIO_DIRECTION); - -- schedule_timeout_uninterruptible(msecs_to_jiffies(delay1)); -+ 
schedule_msec_hrtimeout_uninterruptible((delay1)); - - outw(GPO_PRIMARY_AC97, io + GPIO_DATA); - udelay(5); -@@ -2024,7 +2024,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) - outw(IO_SRAM_ENABLE | SERIAL_AC_LINK_ENABLE, io + RING_BUS_CTRL_A); - outw(~0, io + GPIO_MASK); - -- schedule_timeout_uninterruptible(msecs_to_jiffies(delay2)); -+ schedule_msec_hrtimeout_uninterruptible((delay2)); - - if (! snd_m3_try_read_vendor(chip)) - break; -diff --git a/sound/soc/codecs/rt5631.c b/sound/soc/codecs/rt5631.c -index 55b04c55fb4b..2ed02ad6ac41 100644 ---- a/sound/soc/codecs/rt5631.c -+++ b/sound/soc/codecs/rt5631.c -@@ -419,7 +419,7 @@ static void onebit_depop_mute_stage(struct snd_soc_codec *codec, int enable) - hp_zc = snd_soc_read(codec, RT5631_INT_ST_IRQ_CTRL_2); - snd_soc_write(codec, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); - if (enable) { -- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_uninterruptible((10)); - /* config one-bit depop parameter */ - rt5631_write_index(codec, RT5631_SPK_INTL_CTRL, 0x307f); - snd_soc_update_bits(codec, RT5631_HP_OUT_VOL, -@@ -529,7 +529,7 @@ static void depop_seq_mute_stage(struct snd_soc_codec *codec, int enable) - hp_zc = snd_soc_read(codec, RT5631_INT_ST_IRQ_CTRL_2); - snd_soc_write(codec, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); - if (enable) { -- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_uninterruptible((10)); - - /* config depop sequence parameter */ - rt5631_write_index(codec, RT5631_SPK_INTL_CTRL, 0x302f); -diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c -index dcef67a9bd48..11c2bb48c8f2 100644 ---- a/sound/soc/soc-dapm.c -+++ b/sound/soc/soc-dapm.c -@@ -134,7 +134,7 @@ static void dapm_assert_locked(struct snd_soc_dapm_context *dapm) - static void pop_wait(u32 pop_time) - { - if (pop_time) -- schedule_timeout_uninterruptible(msecs_to_jiffies(pop_time)); -+ schedule_msec_hrtimeout_uninterruptible((pop_time)); - } - - static void pop_dbg(struct device *dev, u32 pop_time, const char *fmt, ...) --- -2.11.0 - diff --git a/sys-kernel/linux-sources-redcore-lts/files/0010-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch b/sys-kernel/linux-sources-redcore-lts/files/0010-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch deleted file mode 100644 index 260bb98d..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/0010-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch +++ /dev/null @@ -1,69 +0,0 @@ -From 071486de633698dcdd163295173ce4663ec9158c Mon Sep 17 00:00:00 2001 -From: Con Kolivas -Date: Mon, 20 Feb 2017 13:32:58 +1100 -Subject: [PATCH 10/16] Don't use hrtimer overlay when pm_freezing since some - drivers still don't correctly use freezable timeouts. - ---- - kernel/time/hrtimer.c | 2 +- - kernel/time/timer.c | 9 +++++---- - 2 files changed, 6 insertions(+), 5 deletions(-) - -diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c -index 13227cf2814c..66456c72bace 100644 ---- a/kernel/time/hrtimer.c -+++ b/kernel/time/hrtimer.c -@@ -1809,7 +1809,7 @@ long __sched schedule_msec_hrtimeout(long timeout) - * (yet) better than Hz, as would occur during startup, use regular - * timers. 
- */ -- if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ) -+ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ || pm_freezing) - return schedule_timeout(jiffs); - - secs = timeout / 1000; -diff --git a/kernel/time/timer.c b/kernel/time/timer.c -index c68cb9307f64..2f2c96b03efe 100644 ---- a/kernel/time/timer.c -+++ b/kernel/time/timer.c -@@ -44,6 +44,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -1891,12 +1892,12 @@ void msleep(unsigned int msecs) - * Use high resolution timers where the resolution of tick based - * timers is inadequate. - */ -- if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ) { -+ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { - while (msecs) - msecs = schedule_msec_hrtimeout_uninterruptible(msecs); - return; - } -- timeout = msecs_to_jiffies(msecs) + 1; -+ timeout = jiffs + 1; - - while (timeout) - timeout = schedule_timeout_uninterruptible(timeout); -@@ -1913,12 +1914,12 @@ unsigned long msleep_interruptible(unsigned int msecs) - int jiffs = msecs_to_jiffies(msecs); - unsigned long timeout; - -- if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ) { -+ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { - while (msecs && !signal_pending(current)) - msecs = schedule_msec_hrtimeout_interruptible(msecs); - return msecs; - } -- timeout = msecs_to_jiffies(msecs) + 1; -+ timeout = jiffs + 1; - - while (timeout && !signal_pending(current)) - timeout = schedule_timeout_interruptible(timeout); --- -2.11.0 - diff --git a/sys-kernel/linux-sources-redcore-lts/files/0011-Make-hrtimer-granularity-and-minimum-hrtimeout-confi.patch b/sys-kernel/linux-sources-redcore-lts/files/0011-Make-hrtimer-granularity-and-minimum-hrtimeout-confi.patch deleted file mode 100644 index 5ac20300..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/0011-Make-hrtimer-granularity-and-minimum-hrtimeout-confi.patch +++ /dev/null @@ -1,136 +0,0 @@ -diff -Nur a/kernel/sysctl.c b/kernel/sysctl.c ---- a/kernel/sysctl.c 2018-11-03 17:03:07.433069521 +0000 -+++ b/kernel/sysctl.c 2018-11-03 17:02:11.020267246 +0000 -@@ -141,7 +141,9 @@ - extern int sched_iso_cpu; - extern int sched_yield_type; - #endif --#ifdef CONFIG_PRINTK -+extern int hrtimer_granularity_us; -+extern int hrtimeout_min_us; -+#if defined(CONFIG_PRINTK) || defined(CONFIG_SCHED_MUQSS) - static int ten_thousand __read_only = 10000; - #endif - #ifdef CONFIG_PERF_EVENTS -@@ -1119,6 +1121,24 @@ - .extra2 = &two, - }, - #endif -+ { -+ .procname = "hrtimer_granularity_us", -+ .data = &hrtimer_granularity_us, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &one, -+ .extra2 = &ten_thousand, -+ }, -+ { -+ .procname = "hrtimeout_min_us", -+ .data = &hrtimeout_min_us, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &one, -+ .extra2 = &ten_thousand, -+ }, - #if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", -diff -Nur a/kernel/time/clockevents.c b/kernel/time/clockevents.c ---- a/kernel/time/clockevents.c 2018-11-03 17:03:07.433069521 +0000 -+++ b/kernel/time/clockevents.c 2018-11-03 16:58:17.283800909 +0000 -@@ -198,13 +198,9 @@ - - #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST - --#ifdef CONFIG_SCHED_MUQSS -+int __read_mostly hrtimer_granularity_us = 100; - /* Limit min_delta to 100us */ --#define MIN_DELTA_LIMIT (NSEC_PER_SEC / 10000) --#else --/* Limit min_delta to a jiffie */ --#define MIN_DELTA_LIMIT (NSEC_PER_SEC / 
HZ) --#endif -+#define MIN_DELTA_LIMIT (hrtimer_granularity_us * NSEC_PER_USEC) - - /** - * clockevents_increase_min_delta - raise minimum delta of a clock event device -diff -Nur a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c ---- a/kernel/time/hrtimer.c 2018-11-03 17:04:16.448274547 +0000 -+++ b/kernel/time/hrtimer.c 2018-11-03 16:58:17.283800909 +0000 -@@ -1803,7 +1803,7 @@ - long __sched schedule_msec_hrtimeout(long timeout) - { - struct hrtimer_sleeper t; -- int delta, secs, jiffs; -+ int delta, jiffs; - ktime_t expires; - - if (!timeout) { -@@ -1820,9 +1820,8 @@ - if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ || pm_freezing) - return schedule_timeout(jiffs); - -- secs = timeout / 1000; - delta = (timeout % 1000) * NSEC_PER_MSEC; -- expires = ktime_set(secs, delta); -+ expires = ktime_set(0, delta); - - hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer_set_expires_range_ns(&t.timer, expires, delta); -@@ -1846,9 +1845,53 @@ - - EXPORT_SYMBOL(schedule_msec_hrtimeout); - -+#define USECS_PER_SEC 1000000 -+extern int hrtimer_granularity_us; -+ -+static inline long schedule_usec_hrtimeout(long timeout) -+{ -+ struct hrtimer_sleeper t; -+ ktime_t expires; -+ int delta; -+ -+ if (!timeout) { -+ __set_current_state(TASK_RUNNING); -+ return 0; -+ } -+ -+ if (hrtimer_resolution >= NSEC_PER_SEC / HZ) -+ return schedule_timeout(usecs_to_jiffies(timeout)); -+ -+ if (timeout < hrtimer_granularity_us) -+ timeout = hrtimer_granularity_us; -+ delta = (timeout % USECS_PER_SEC) * NSEC_PER_USEC; -+ expires = ktime_set(0, delta); -+ -+ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); -+ hrtimer_set_expires_range_ns(&t.timer, expires, delta); -+ -+ hrtimer_init_sleeper(&t, current); -+ -+ hrtimer_start_expires(&t.timer, HRTIMER_MODE_REL); -+ -+ if (likely(t.task)) -+ schedule(); -+ -+ hrtimer_cancel(&t.timer); -+ destroy_hrtimer_on_stack(&t.timer); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ expires = hrtimer_expires_remaining(&t.timer); -+ timeout = ktime_to_us(expires); -+ return timeout < 0 ? 0 : timeout; -+} -+ -+int __read_mostly hrtimeout_min_us = 1000; -+ - long __sched schedule_min_hrtimeout(void) - { -- return schedule_msec_hrtimeout(1); -+ return usecs_to_jiffies(schedule_usec_hrtimeout(hrtimeout_min_us)); - } - - EXPORT_SYMBOL(schedule_min_hrtimeout); diff --git a/sys-kernel/linux-sources-redcore-lts/files/0012-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch b/sys-kernel/linux-sources-redcore-lts/files/0012-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch deleted file mode 100644 index 99b28d65..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/0012-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch +++ /dev/null @@ -1,81 +0,0 @@ -From 9e47a80f690080c12ce607158b96c305707543b8 Mon Sep 17 00:00:00 2001 -From: Con Kolivas -Date: Wed, 7 Dec 2016 21:23:01 +1100 -Subject: [PATCH 12/16] Reinstate default Hz of 100 in combination with MuQSS - and -ck patches. - ---- - kernel/Kconfig.hz | 25 ++++++++++++++++++------- - 1 file changed, 18 insertions(+), 7 deletions(-) - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1806fcac8f14 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,8 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_100 if SCHED_MUQSS -+ default HZ_250_NODEF if !SCHED_MUQSS - help - Allows the configuration of the timer frequency. 
It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -19,11 +20,18 @@ choice - config HZ_100 - bool "100 HZ" - help -+ 100 Hz is a suitable choice in combination with MuQSS which does -+ not rely on ticks for rescheduling interrupts, and is not Hz limited -+ for timeouts and sleeps from both the kernel and userspace. -+ This allows us to benefit from the lower overhead and higher -+ throughput of fewer timer ticks. -+ -+ Non-MuQSS kernels: - 100 Hz is a typical choice for servers, SMP and NUMA systems - with lots of processors that may show reduced performance if - too many timer interrupts are occurring. - -- config HZ_250 -+ config HZ_250_NODEF - bool "250 HZ" - help - 250 Hz is a good compromise choice allowing server performance -@@ -31,7 +39,10 @@ choice - on SMP and NUMA systems. If you are going to be using NTSC video - or multimedia, selected 300Hz instead. - -- config HZ_300 -+ 250 Hz is the default choice for the mainline scheduler but not -+ advantageous in combination with MuQSS. -+ -+ config HZ_300_NODEF - bool "300 HZ" - help - 300 Hz is a good compromise choice allowing server performance -@@ -39,7 +50,7 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. - -- config HZ_1000 -+ config HZ_1000_NODEF - bool "1000 HZ" - help - 1000 Hz is the preferred choice for desktop systems and other -@@ -50,9 +61,9 @@ endchoice - config HZ - int - default 100 if HZ_100 -- default 250 if HZ_250 -- default 300 if HZ_300 -- default 1000 if HZ_1000 -+ default 250 if HZ_250_NODEF -+ default 300 if HZ_300_NODEF -+ default 1000 if HZ_1000_NODEF - - config SCHED_HRTICK - def_bool HIGH_RES_TIMERS --- -2.11.0 - diff --git a/sys-kernel/linux-sources-redcore-lts/files/0013-Make-threaded-IRQs-optionally-the-default-which-can-.patch b/sys-kernel/linux-sources-redcore-lts/files/0013-Make-threaded-IRQs-optionally-the-default-which-can-.patch deleted file mode 100644 index 63ec9fdf..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/0013-Make-threaded-IRQs-optionally-the-default-which-can-.patch +++ /dev/null @@ -1,61 +0,0 @@ -From 5902b315d4061ebbe73a62c52e6d3b618066cebc Mon Sep 17 00:00:00 2001 -From: Con Kolivas -Date: Wed, 7 Dec 2016 21:13:16 +1100 -Subject: [PATCH 13/16] Make threaded IRQs optionally the default which can be - disabled. - ---- - kernel/irq/Kconfig | 14 ++++++++++++++ - kernel/irq/manage.c | 10 ++++++++++ - 2 files changed, 24 insertions(+) - -diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig -index a117adf7084b..0984c54fd4e9 100644 ---- a/kernel/irq/Kconfig -+++ b/kernel/irq/Kconfig -@@ -111,6 +111,20 @@ config IRQ_DOMAIN_DEBUG - config IRQ_FORCED_THREADING - bool - -+config FORCE_IRQ_THREADING -+ bool "Make IRQ threading compulsory" -+ depends on IRQ_FORCED_THREADING -+ default y -+ ---help--- -+ -+ Make IRQ threading mandatory for any IRQ handlers that support it -+ instead of being optional and requiring the threadirqs kernel -+ parameter. Instead they can be optionally disabled with the -+ nothreadirqs kernel parameter. -+ -+ Enable if you are building for a desktop or low latency system, -+ otherwise say N. 
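
The help text above describes a behavioural default only: forced threading hooks into request_irq() itself, so existing drivers need no source changes. A minimal sketch of a hypothetical handler (all names invented) that would run in an irq kernel thread under this option, unless it opted out with IRQF_NO_THREAD:

    #include <linux/interrupt.h>

    /* Hypothetical device interrupt: with CONFIG_FORCE_IRQ_THREADING=y
     * (or the threadirqs parameter) the irq core runs this handler in
     * a kernel thread; booting with the new nothreadirqs parameter
     * keeps it in hard interrupt context instead. */
    static irqreturn_t mydev_irq(int irq, void *dev_id)
    {
            /* acknowledge the device, then do the heavy work here */
            return IRQ_HANDLED;
    }

    static int mydev_setup_irq(int irq, void *dev)
    {
            return request_irq(irq, mydev_irq, 0, "mydev", dev);
    }
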
-+ - config SPARSE_IRQ - bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ - ---help--- -diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c -index 4bff6a10ae8e..5a6df0dd23c4 100644 ---- a/kernel/irq/manage.c -+++ b/kernel/irq/manage.c -@@ -24,7 +24,17 @@ - #include "internals.h" - - #ifdef CONFIG_IRQ_FORCED_THREADING -+#ifdef CONFIG_FORCE_IRQ_THREADING -+__read_mostly bool force_irqthreads = true; -+#else - __read_mostly bool force_irqthreads; -+#endif -+static int __init setup_noforced_irqthreads(char *arg) -+{ -+ force_irqthreads = false; -+ return 0; -+} -+early_param("nothreadirqs", setup_noforced_irqthreads); - - static int __init setup_forced_irqthreads(char *arg) - { --- -2.11.0 - diff --git a/sys-kernel/linux-sources-redcore-lts/files/0014-Swap-sucks.patch b/sys-kernel/linux-sources-redcore-lts/files/0014-Swap-sucks.patch deleted file mode 100644 index 6bf5bcda..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/0014-Swap-sucks.patch +++ /dev/null @@ -1,25 +0,0 @@ -From ed0ab4c80fcb6fa4abb4f2f897e591df6eaa2d0e Mon Sep 17 00:00:00 2001 -From: Con Kolivas -Date: Sat, 12 Aug 2017 12:02:04 +1000 -Subject: [PATCH 14/16] Swap sucks. - ---- - mm/vmscan.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index eb2f0315b8c0..67d03efab288 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -149,7 +149,7 @@ struct scan_control { - /* - * From 0 .. 100. Higher means more swappy. - */ --int vm_swappiness = 60; -+int vm_swappiness = 33; - /* - * The total number of pages which are beyond the high watermark within all - * zones. --- -2.11.0 - diff --git a/sys-kernel/linux-sources-redcore-lts/files/0015-MuQSS.c-needs-irq_regs.h-to-use-get_irq_regs.patch b/sys-kernel/linux-sources-redcore-lts/files/0015-MuQSS.c-needs-irq_regs.h-to-use-get_irq_regs.patch deleted file mode 100644 index bfa509a5..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/0015-MuQSS.c-needs-irq_regs.h-to-use-get_irq_regs.patch +++ /dev/null @@ -1,19 +0,0 @@ -diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c -index e84d700709ff6..16364915cff53 100644 ---- a/kernel/sched/MuQSS.c -+++ b/kernel/sched/MuQSS.c -@@ -70,6 +70,14 @@ - - #include "MuQSS.h" - -+/* needing to include irq_regs.h, "because reasons"... -+ * implicit declaration of function ‘get_irq_regs’; -+ * did you mean ‘get_ibs_caps’? 
-+ * [-Werror=implicit-function-declaration] -+ * ^ this is because autodetect is not flawless -+ */ -+#include -+ - #define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) - #define rt_task(p) rt_prio((p)->prio) - #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) diff --git a/sys-kernel/linux-sources-redcore-lts/files/0016-unfuck-MuQSS-on-linux-4_14_15+.patch b/sys-kernel/linux-sources-redcore-lts/files/0016-unfuck-MuQSS-on-linux-4_14_15+.patch deleted file mode 100644 index f7dc1d1c..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/0016-unfuck-MuQSS-on-linux-4_14_15+.patch +++ /dev/null @@ -1,48 +0,0 @@ -diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c -index e84d700709ff6..b0be7fcfe41f9 100644 ---- a/kernel/sched/MuQSS.c -+++ b/kernel/sched/MuQSS.c -@@ -55,6 +55,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -1959,7 +1960,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) - p->state = TASK_WAKING; - - if (p->in_iowait) { -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 15) - delayacct_blkio_end(); -+#else -+ delayacct_blkio_end(p); -+#endif - atomic_dec(&task_rq(p)->nr_iowait); - } - -@@ -1970,7 +1975,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) - #else /* CONFIG_SMP */ - - if (p->in_iowait) { -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 15) - delayacct_blkio_end(); -+#else -+ delayacct_blkio_end(p); -+#endif - atomic_dec(&task_rq(p)->nr_iowait); - } - -@@ -2022,7 +2031,11 @@ static void try_to_wake_up_local(struct task_struct *p) - - if (!task_on_rq_queued(p)) { - if (p->in_iowait) { -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 15) - delayacct_blkio_end(); -+#else -+ delayacct_blkio_end(p); -+#endif - atomic_dec(&rq->nr_iowait); - } - ttwu_activate(rq, p); diff --git a/sys-kernel/linux-sources-redcore-lts/files/0017-unfuck-MuQSS-on-linux-4_14_75+.patch b/sys-kernel/linux-sources-redcore-lts/files/0017-unfuck-MuQSS-on-linux-4_14_75+.patch deleted file mode 100644 index 1a1717bf..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/0017-unfuck-MuQSS-on-linux-4_14_75+.patch +++ /dev/null @@ -1,14 +0,0 @@ -diff -Nur a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c ---- a/kernel/sched/MuQSS.c 2019-01-05 22:51:24.547448624 +0000 -+++ b/kernel/sched/MuQSS.c 2019-01-05 22:58:29.821451056 +0000 -@@ -1021,6 +1021,10 @@ - #define CPUIDLE_THREAD_BUSY (16) - #define CPUIDLE_DIFF_NODE (32) - -+#ifdef CONFIG_SCHED_SMT -+DEFINE_STATIC_KEY_FALSE(sched_smt_present); -+#endif -+ - /* - * The best idle CPU is chosen according to the CPUIDLE ranking above where the - * lowest value would give the most suitable CPU to schedule p onto next. The diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-0001-BFQ-v8r12-20171108.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-0001-BFQ-v8r12-20171108.patch new file mode 100644 index 00000000..db7d064b --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-0001-BFQ-v8r12-20171108.patch @@ -0,0 +1,25199 @@ +From c21f53f17430230dab50df29b8ea1b71f99d09d6 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Tue, 7 Apr 2015 13:39:12 +0200 +Subject: [PATCH 01/51] Add BFQ-v8r12 + +This commit is the result of the following operations. + +1. The squash of all the commits between "block: cgroups, kconfig, +build bits for BFQ-v7r11-4.5.0" and BFQ-v8r12 in the branch +bfq-mq-v8-v4.11 + +2. 
The renaming of two files (block/bfq-cgroup.c -> +block/bfq-cgroup-included.c and block/bfq-iosched.c -> +block/bfq-sq-iosched.c) and of one option (CONFIG_BFQ_GROUP_IOSCHED -> +CONFIG_BFQ_SQ_GROUP_IOSCHED), to avoid name clashes. These name +clashes are due to the presence of bfq in mainline from 4.12. + +3. The modification of block/Makefile and block/Kconfig.iosched to +comply with the above renaming. + +Signed-off-by: Mauro Andreolini +Signed-off-by: Arianna Avanzini +Signed-off-by: Linus Walleij +Signed-off-by: Paolo Valente +--- + Makefile | 2 +- + block/Kconfig.iosched | 31 + + block/bfq-cgroup-included.c | 1190 ++++++++++ + block/bfq-ioc.c | 36 + + block/bfq-sched.c | 2002 ++++++++++++++++ + block/bfq-sq-iosched.c | 5379 +++++++++++++++++++++++++++++++++++++++++++ + block/bfq.h | 948 ++++++++ + include/linux/blkdev.h | 2 +- + 9 files changed, 9589 insertions(+), 2 deletions(-) + create mode 100644 block/bfq-cgroup-included.c + create mode 100644 block/bfq-ioc.c + create mode 100644 block/bfq-sched.c + create mode 100644 block/bfq-sq-iosched.c + create mode 100644 block/bfq.h + +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched +index a4a8914bf7a4..9e3f4c2f7390 100644 +--- a/block/Kconfig.iosched ++++ b/block/Kconfig.iosched +@@ -40,6 +40,26 @@ config CFQ_GROUP_IOSCHED + ---help--- + Enable group IO scheduling in CFQ. + ++config IOSCHED_BFQ_SQ ++ tristate "BFQ-SQ I/O scheduler" ++ default n ++ ---help--- ++ The BFQ-SQ I/O scheduler (for legacy blk: SQ stands for ++ SingleQueue) distributes bandwidth among all processes ++ according to their weights, regardless of the device ++ parameters and with any workload. It also guarantees a low ++ latency to interactive and soft real-time applications. ++ Details in Documentation/block/bfq-iosched.txt ++ ++config BFQ_SQ_GROUP_IOSCHED ++ bool "BFQ-SQ hierarchical scheduling support" ++ depends on IOSCHED_BFQ_SQ && BLK_CGROUP ++ default n ++ ---help--- ++ ++ Enable hierarchical scheduling in BFQ-SQ, using the blkio ++ (cgroups-v1) or io (cgroups-v2) controller. ++ + choice + + prompt "Default I/O scheduler" +@@ -54,6 +74,16 @@ choice + config DEFAULT_CFQ + bool "CFQ" if IOSCHED_CFQ=y + ++ config DEFAULT_BFQ_SQ ++ bool "BFQ-SQ" if IOSCHED_BFQ_SQ=y ++ help ++ Selects BFQ-SQ as the default I/O scheduler which will be ++ used by default for all block devices. ++ The BFQ-SQ I/O scheduler aims at distributing the bandwidth ++ as desired, independently of the disk parameters and with ++ any workload. It also tries to guarantee low latency to ++ interactive and soft real-time applications. 
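
Whichever default is selected here at build time, a legacy block device can still be switched to bfq-sq at runtime (provided IOSCHED_BFQ_SQ was built) by writing to its sysfs scheduler attribute. A minimal userspace sketch, with sda as an illustrative device, equivalent to echo bfq-sq > /sys/block/sda/queue/scheduler:

    #include <stdio.h>

    int main(void)
    {
            /* sda is illustrative; substitute the target device */
            FILE *f = fopen("/sys/block/sda/queue/scheduler", "w");

            if (!f) {
                    perror("scheduler");
                    return 1;
            }
            fputs("bfq-sq\n", f);
            return fclose(f) == 0 ? 0 : 1;
    }
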
++ + config DEFAULT_NOOP + bool "No-op" + +@@ -63,6 +93,7 @@ config DEFAULT_IOSCHED + string + default "deadline" if DEFAULT_DEADLINE + default "cfq" if DEFAULT_CFQ ++ default "bfq-sq" if DEFAULT_BFQ_SQ + default "noop" if DEFAULT_NOOP + + config MQ_IOSCHED_DEADLINE +diff --git a/block/Makefile b/block/Makefile +index 6a56303b9925..59026b425791 100644 +--- a/block/Makefile ++++ b/block/Makefile +@@ -24,6 +24,7 @@ obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o + obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o + bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o + obj-$(CONFIG_IOSCHED_BFQ) += bfq.o ++obj-$(CONFIG_IOSCHED_BFQ_SQ) += bfq-sq-iosched.o + + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o + obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o +diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c +new file mode 100644 +index 000000000000..af7c216a3540 +--- /dev/null ++++ b/block/bfq-cgroup-included.c +@@ -0,0 +1,1190 @@ ++/* ++ * BFQ: CGROUPS support. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2015 Paolo Valente ++ * ++ * Copyright (C) 2016 Paolo Valente ++ * ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ ++ * file. ++ */ ++ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ ++/* bfqg stats flags */ ++enum bfqg_stats_flags { ++ BFQG_stats_waiting = 0, ++ BFQG_stats_idling, ++ BFQG_stats_empty, ++}; ++ ++#define BFQG_FLAG_FNS(name) \ ++static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ ++{ \ ++ stats->flags |= (1 << BFQG_stats_##name); \ ++} \ ++static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ ++{ \ ++ stats->flags &= ~(1 << BFQG_stats_##name); \ ++} \ ++static int bfqg_stats_##name(struct bfqg_stats *stats) \ ++{ \ ++ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ ++} \ ++ ++BFQG_FLAG_FNS(waiting) ++BFQG_FLAG_FNS(idling) ++BFQG_FLAG_FNS(empty) ++#undef BFQG_FLAG_FNS ++ ++/* This should be called with the queue_lock held. */ ++static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) ++{ ++ unsigned long long now; ++ ++ if (!bfqg_stats_waiting(stats)) ++ return; ++ ++ now = sched_clock(); ++ if (time_after64(now, stats->start_group_wait_time)) ++ blkg_stat_add(&stats->group_wait_time, ++ now - stats->start_group_wait_time); ++ bfqg_stats_clear_waiting(stats); ++} ++ ++/* This should be called with the queue_lock held. */ ++static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, ++ struct bfq_group *curr_bfqg) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ ++ if (bfqg_stats_waiting(stats)) ++ return; ++ if (bfqg == curr_bfqg) ++ return; ++ stats->start_group_wait_time = sched_clock(); ++ bfqg_stats_mark_waiting(stats); ++} ++ ++/* This should be called with the queue_lock held. */ ++static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) ++{ ++ unsigned long long now; ++ ++ if (!bfqg_stats_empty(stats)) ++ return; ++ ++ now = sched_clock(); ++ if (time_after64(now, stats->start_empty_time)) ++ blkg_stat_add(&stats->empty_time, ++ now - stats->start_empty_time); ++ bfqg_stats_clear_empty(stats); ++} ++ ++static void bfqg_stats_update_dequeue(struct bfq_group *bfqg) ++{ ++ blkg_stat_add(&bfqg->stats.dequeue, 1); ++} ++ ++static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ ++ if (blkg_rwstat_total(&stats->queued)) ++ return; ++ ++ /* ++ * group is already marked empty. 
This can happen if bfqq got new ++ * request in parent group and moved to this group while being added ++ * to service tree. Just ignore the event and move on. ++ */ ++ if (bfqg_stats_empty(stats)) ++ return; ++ ++ stats->start_empty_time = sched_clock(); ++ bfqg_stats_mark_empty(stats); ++} ++ ++static void bfqg_stats_update_idle_time(struct bfq_group *bfqg) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ ++ if (bfqg_stats_idling(stats)) { ++ unsigned long long now = sched_clock(); ++ ++ if (time_after64(now, stats->start_idle_time)) ++ blkg_stat_add(&stats->idle_time, ++ now - stats->start_idle_time); ++ bfqg_stats_clear_idling(stats); ++ } ++} ++ ++static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ ++ stats->start_idle_time = sched_clock(); ++ bfqg_stats_mark_idling(stats); ++} ++ ++static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ ++ blkg_stat_add(&stats->avg_queue_size_sum, ++ blkg_rwstat_total(&stats->queued)); ++ blkg_stat_add(&stats->avg_queue_size_samples, 1); ++ bfqg_stats_update_group_wait_time(stats); ++} ++ ++static struct blkcg_policy blkcg_policy_bfq; ++ ++/* ++ * blk-cgroup policy-related handlers ++ * The following functions help in converting between blk-cgroup ++ * internal structures and BFQ-specific structures. ++ */ ++ ++static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) ++{ ++ return pd ? container_of(pd, struct bfq_group, pd) : NULL; ++} ++ ++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) ++{ ++ return pd_to_blkg(&bfqg->pd); ++} ++ ++static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) ++{ ++ struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); ++ ++ return pd_to_bfqg(pd); ++} ++ ++/* ++ * bfq_group handlers ++ * The following functions help in navigating the bfq_group hierarchy ++ * by allowing to find the parent of a bfq_group or the bfq_group ++ * associated to a bfq_queue. ++ */ ++ ++static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) ++{ ++ struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; ++ ++ return pblkg ? blkg_to_bfqg(pblkg) : NULL; ++} ++ ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *group_entity = bfqq->entity.parent; ++ ++ return group_entity ? container_of(group_entity, struct bfq_group, ++ entity) : ++ bfqq->bfqd->root_group; ++} ++ ++/* ++ * The following two functions handle get and put of a bfq_group by ++ * wrapping the related blk-cgroup hooks. 
++ */ ++ ++static void bfqg_get(struct bfq_group *bfqg) ++{ ++ return blkg_get(bfqg_to_blkg(bfqg)); ++} ++ ++static void bfqg_put(struct bfq_group *bfqg) ++{ ++ return blkg_put(bfqg_to_blkg(bfqg)); ++} ++ ++static void bfqg_stats_update_io_add(struct bfq_group *bfqg, ++ struct bfq_queue *bfqq, ++ unsigned int op) ++{ ++ blkg_rwstat_add(&bfqg->stats.queued, op, 1); ++ bfqg_stats_end_empty_time(&bfqg->stats); ++ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) ++ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); ++} ++ ++static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) ++{ ++ blkg_rwstat_add(&bfqg->stats.queued, op, -1); ++} ++ ++static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) ++{ ++ blkg_rwstat_add(&bfqg->stats.merged, op, 1); ++} ++ ++static void bfqg_stats_update_completion(struct bfq_group *bfqg, ++ uint64_t start_time, uint64_t io_start_time, ++ unsigned int op) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ unsigned long long now = sched_clock(); ++ ++ if (time_after64(now, io_start_time)) ++ blkg_rwstat_add(&stats->service_time, op, ++ now - io_start_time); ++ if (time_after64(io_start_time, start_time)) ++ blkg_rwstat_add(&stats->wait_time, op, ++ io_start_time - start_time); ++} ++ ++/* @stats = 0 */ ++static void bfqg_stats_reset(struct bfqg_stats *stats) ++{ ++ /* queued stats shouldn't be cleared */ ++ blkg_rwstat_reset(&stats->merged); ++ blkg_rwstat_reset(&stats->service_time); ++ blkg_rwstat_reset(&stats->wait_time); ++ blkg_stat_reset(&stats->time); ++ blkg_stat_reset(&stats->avg_queue_size_sum); ++ blkg_stat_reset(&stats->avg_queue_size_samples); ++ blkg_stat_reset(&stats->dequeue); ++ blkg_stat_reset(&stats->group_wait_time); ++ blkg_stat_reset(&stats->idle_time); ++ blkg_stat_reset(&stats->empty_time); ++} ++ ++/* @to += @from */ ++static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) ++{ ++ if (!to || !from) ++ return; ++ ++ /* queued stats shouldn't be cleared */ ++ blkg_rwstat_add_aux(&to->merged, &from->merged); ++ blkg_rwstat_add_aux(&to->service_time, &from->service_time); ++ blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); ++ blkg_stat_add_aux(&from->time, &from->time); ++ blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); ++ blkg_stat_add_aux(&to->avg_queue_size_samples, ++ &from->avg_queue_size_samples); ++ blkg_stat_add_aux(&to->dequeue, &from->dequeue); ++ blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); ++ blkg_stat_add_aux(&to->idle_time, &from->idle_time); ++ blkg_stat_add_aux(&to->empty_time, &from->empty_time); ++} ++ ++/* ++ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors' ++ * recursive stats can still account for the amount used by this bfqg after ++ * it's gone. 
++ */ ++static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) ++{ ++ struct bfq_group *parent; ++ ++ if (!bfqg) /* root_group */ ++ return; ++ ++ parent = bfqg_parent(bfqg); ++ ++ lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); ++ ++ if (unlikely(!parent)) ++ return; ++ ++ bfqg_stats_add_aux(&parent->stats, &bfqg->stats); ++ bfqg_stats_reset(&bfqg->stats); ++} ++ ++static void bfq_init_entity(struct bfq_entity *entity, ++ struct bfq_group *bfqg) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ entity->weight = entity->new_weight; ++ entity->orig_weight = entity->new_weight; ++ if (bfqq) { ++ bfqq->ioprio = bfqq->new_ioprio; ++ bfqq->ioprio_class = bfqq->new_ioprio_class; ++ bfqg_get(bfqg); ++ } ++ entity->parent = bfqg->my_entity; /* NULL for root group */ ++ entity->sched_data = &bfqg->sched_data; ++} ++ ++static void bfqg_stats_exit(struct bfqg_stats *stats) ++{ ++ blkg_rwstat_exit(&stats->merged); ++ blkg_rwstat_exit(&stats->service_time); ++ blkg_rwstat_exit(&stats->wait_time); ++ blkg_rwstat_exit(&stats->queued); ++ blkg_stat_exit(&stats->time); ++ blkg_stat_exit(&stats->avg_queue_size_sum); ++ blkg_stat_exit(&stats->avg_queue_size_samples); ++ blkg_stat_exit(&stats->dequeue); ++ blkg_stat_exit(&stats->group_wait_time); ++ blkg_stat_exit(&stats->idle_time); ++ blkg_stat_exit(&stats->empty_time); ++} ++ ++static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) ++{ ++ if (blkg_rwstat_init(&stats->merged, gfp) || ++ blkg_rwstat_init(&stats->service_time, gfp) || ++ blkg_rwstat_init(&stats->wait_time, gfp) || ++ blkg_rwstat_init(&stats->queued, gfp) || ++ blkg_stat_init(&stats->time, gfp) || ++ blkg_stat_init(&stats->avg_queue_size_sum, gfp) || ++ blkg_stat_init(&stats->avg_queue_size_samples, gfp) || ++ blkg_stat_init(&stats->dequeue, gfp) || ++ blkg_stat_init(&stats->group_wait_time, gfp) || ++ blkg_stat_init(&stats->idle_time, gfp) || ++ blkg_stat_init(&stats->empty_time, gfp)) { ++ bfqg_stats_exit(stats); ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) ++{ ++ return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; ++} ++ ++static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) ++{ ++ return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); ++} ++ ++static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) ++{ ++ struct bfq_group_data *bgd; ++ ++ bgd = kzalloc(sizeof(*bgd), gfp); ++ if (!bgd) ++ return NULL; ++ return &bgd->pd; ++} ++ ++static void bfq_cpd_init(struct blkcg_policy_data *cpd) ++{ ++ struct bfq_group_data *d = cpd_to_bfqgd(cpd); ++ ++ d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? 
++ CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; ++} ++ ++static void bfq_cpd_free(struct blkcg_policy_data *cpd) ++{ ++ kfree(cpd_to_bfqgd(cpd)); ++} ++ ++static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) ++{ ++ struct bfq_group *bfqg; ++ ++ bfqg = kzalloc_node(sizeof(*bfqg), gfp, node); ++ if (!bfqg) ++ return NULL; ++ ++ if (bfqg_stats_init(&bfqg->stats, gfp)) { ++ kfree(bfqg); ++ return NULL; ++ } ++ ++ return &bfqg->pd; ++} ++ ++static void bfq_pd_init(struct blkg_policy_data *pd) ++{ ++ struct blkcg_gq *blkg; ++ struct bfq_group *bfqg; ++ struct bfq_data *bfqd; ++ struct bfq_entity *entity; ++ struct bfq_group_data *d; ++ ++ blkg = pd_to_blkg(pd); ++ BUG_ON(!blkg); ++ bfqg = blkg_to_bfqg(blkg); ++ bfqd = blkg->q->elevator->elevator_data; ++ entity = &bfqg->entity; ++ d = blkcg_to_bfqgd(blkg->blkcg); ++ ++ entity->orig_weight = entity->weight = entity->new_weight = d->weight; ++ entity->my_sched_data = &bfqg->sched_data; ++ bfqg->my_entity = entity; /* ++ * the root_group's will be set to NULL ++ * in bfq_init_queue() ++ */ ++ bfqg->bfqd = bfqd; ++ bfqg->active_entities = 0; ++ bfqg->rq_pos_tree = RB_ROOT; ++} ++ ++static void bfq_pd_free(struct blkg_policy_data *pd) ++{ ++ struct bfq_group *bfqg = pd_to_bfqg(pd); ++ ++ bfqg_stats_exit(&bfqg->stats); ++ return kfree(bfqg); ++} ++ ++static void bfq_pd_reset_stats(struct blkg_policy_data *pd) ++{ ++ struct bfq_group *bfqg = pd_to_bfqg(pd); ++ ++ bfqg_stats_reset(&bfqg->stats); ++} ++ ++static void bfq_group_set_parent(struct bfq_group *bfqg, ++ struct bfq_group *parent) ++{ ++ struct bfq_entity *entity; ++ ++ BUG_ON(!parent); ++ BUG_ON(!bfqg); ++ BUG_ON(bfqg == parent); ++ ++ entity = &bfqg->entity; ++ entity->parent = parent->my_entity; ++ entity->sched_data = &parent->sched_data; ++} ++ ++static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, ++ struct blkcg *blkcg) ++{ ++ struct blkcg_gq *blkg; ++ ++ blkg = blkg_lookup(blkcg, bfqd->queue); ++ if (likely(blkg)) ++ return blkg_to_bfqg(blkg); ++ return NULL; ++} ++ ++static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, ++ struct blkcg *blkcg) ++{ ++ struct bfq_group *bfqg, *parent; ++ struct bfq_entity *entity; ++ ++ assert_spin_locked(bfqd->queue->queue_lock); ++ ++ bfqg = bfq_lookup_bfqg(bfqd, blkcg); ++ ++ if (unlikely(!bfqg)) ++ return NULL; ++ ++ /* ++ * Update chain of bfq_groups as we might be handling a leaf group ++ * which, along with some of its relatives, has not been hooked yet ++ * to the private hierarchy of BFQ. ++ */ ++ entity = &bfqg->entity; ++ for_each_entity(entity) { ++ bfqg = container_of(entity, struct bfq_group, entity); ++ BUG_ON(!bfqg); ++ if (bfqg != bfqd->root_group) { ++ parent = bfqg_parent(bfqg); ++ if (!parent) ++ parent = bfqd->root_group; ++ BUG_ON(!parent); ++ bfq_group_set_parent(bfqg, parent); ++ } ++ } ++ ++ return bfqg; ++} ++ ++static void bfq_pos_tree_add_move(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq); ++ ++static void bfq_bfqq_expire(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool compensate, ++ enum bfqq_expiration reason); ++ ++/** ++ * bfq_bfqq_move - migrate @bfqq to @bfqg. ++ * @bfqd: queue descriptor. ++ * @bfqq: the queue to move. ++ * @bfqg: the group to move to. ++ * ++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating ++ * it on the new one. Avoid putting the entity on the old group idle tree. ++ * ++ * Must be called under the queue lock; the cgroup owning @bfqg must ++ * not disappear (by now this just means that we are called under ++ * rcu_read_lock()). 
++ */ ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct bfq_group *bfqg) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list)); ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st); ++ BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) ++ && entity->on_st && ++ bfqq != bfqd->in_service_queue); ++ BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue); ++ ++ /* If bfqq is empty, then bfq_bfqq_expire also invokes ++ * bfq_del_bfqq_busy, thereby removing bfqq and its entity ++ * from data structures related to current group. Otherwise we ++ * need to remove bfqq explicitly with bfq_deactivate_bfqq, as ++ * we do below. ++ */ ++ if (bfqq == bfqd->in_service_queue) ++ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, ++ false, BFQ_BFQQ_PREEMPTED); ++ ++ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) ++ && &bfq_entity_service_tree(entity)->idle != ++ entity->tree); ++ ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); ++ ++ if (bfq_bfqq_busy(bfqq)) ++ bfq_deactivate_bfqq(bfqd, bfqq, false, false); ++ else if (entity->on_st) { ++ BUG_ON(&bfq_entity_service_tree(entity)->idle != ++ entity->tree); ++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); ++ } ++ bfqg_put(bfqq_group(bfqq)); ++ ++ /* ++ * Here we use a reference to bfqg. We don't need a refcounter ++ * as the cgroup reference will not be dropped, so that its ++ * destroy() callback will not be invoked. ++ */ ++ entity->parent = bfqg->my_entity; ++ entity->sched_data = &bfqg->sched_data; ++ bfqg_get(bfqg); ++ ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); ++ if (bfq_bfqq_busy(bfqq)) { ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ bfq_activate_bfqq(bfqd, bfqq); ++ } ++ ++ if (!bfqd->in_service_queue && !bfqd->rq_in_driver) ++ bfq_schedule_dispatch(bfqd); ++ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) ++ && &bfq_entity_service_tree(entity)->idle != ++ entity->tree); ++} ++ ++/** ++ * __bfq_bic_change_cgroup - move @bic to @cgroup. ++ * @bfqd: the queue descriptor. ++ * @bic: the bic to move. ++ * @blkcg: the blk-cgroup to move to. ++ * ++ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller ++ * has to make sure that the reference to cgroup is valid across the call. ++ * ++ * NOTE: an alternative approach might have been to store the current ++ * cgroup in bfqq and getting a reference to it, reducing the lookup ++ * time here, at the price of slightly more complex code. 
++ */ ++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, ++ struct bfq_io_cq *bic, ++ struct blkcg *blkcg) ++{ ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); ++ struct bfq_group *bfqg; ++ struct bfq_entity *entity; ++ ++ lockdep_assert_held(bfqd->queue->queue_lock); ++ ++ bfqg = bfq_find_set_group(bfqd, blkcg); ++ ++ if (unlikely(!bfqg)) ++ bfqg = bfqd->root_group; ++ ++ if (async_bfqq) { ++ entity = &async_bfqq->entity; ++ ++ if (entity->sched_data != &bfqg->sched_data) { ++ bic_set_bfqq(bic, NULL, 0); ++ bfq_log_bfqq(bfqd, async_bfqq, ++ "bic_change_group: %p %d", ++ async_bfqq, ++ async_bfqq->ref); ++ bfq_put_queue(async_bfqq); ++ } ++ } ++ ++ if (sync_bfqq) { ++ entity = &sync_bfqq->entity; ++ if (entity->sched_data != &bfqg->sched_data) ++ bfq_bfqq_move(bfqd, sync_bfqq, bfqg); ++ } ++ ++ return bfqg; ++} ++ ++static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) ++{ ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ struct bfq_group *bfqg = NULL; ++ uint64_t serial_nr; ++ ++ rcu_read_lock(); ++ serial_nr = bio_blkcg(bio)->css.serial_nr; ++ ++ /* ++ * Check whether blkcg has changed. The condition may trigger ++ * spuriously on a newly created cic but there's no harm. ++ */ ++ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) ++ goto out; ++ ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); ++ bic->blkcg_serial_nr = serial_nr; ++out: ++ rcu_read_unlock(); ++} ++ ++/** ++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. ++ * @st: the service tree being flushed. ++ */ ++static void bfq_flush_idle_tree(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *entity = st->first_idle; ++ ++ for (; entity ; entity = st->first_idle) ++ __bfq_deactivate_entity(entity, false); ++} ++ ++/** ++ * bfq_reparent_leaf_entity - move leaf entity to the root_group. ++ * @bfqd: the device data structure with the root group. ++ * @entity: the entity to move. ++ */ ++static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ BUG_ON(!bfqq); ++ bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); ++} ++ ++/** ++ * bfq_reparent_active_entities - move to the root group all active ++ * entities. ++ * @bfqd: the device data structure with the root group. ++ * @bfqg: the group to move from. ++ * @st: the service tree with the entities. ++ * ++ * Needs queue_lock to be taken and reference to be valid over the call. ++ */ ++static void bfq_reparent_active_entities(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ struct bfq_service_tree *st) ++{ ++ struct rb_root *active = &st->active; ++ struct bfq_entity *entity = NULL; ++ ++ if (!RB_EMPTY_ROOT(&st->active)) ++ entity = bfq_entity_of(rb_first(active)); ++ ++ for (; entity ; entity = bfq_entity_of(rb_first(active))) ++ bfq_reparent_leaf_entity(bfqd, entity); ++ ++ if (bfqg->sched_data.in_service_entity) ++ bfq_reparent_leaf_entity(bfqd, ++ bfqg->sched_data.in_service_entity); ++} ++ ++/** ++ * bfq_pd_offline - deactivate the entity associated with @pd, ++ * and reparent its children entities. ++ * @pd: descriptor of the policy going offline. 
++ * ++ * blkio already grabs the queue_lock for us, so no need to use ++ * RCU-based magic ++ */ ++static void bfq_pd_offline(struct blkg_policy_data *pd) ++{ ++ struct bfq_service_tree *st; ++ struct bfq_group *bfqg; ++ struct bfq_data *bfqd; ++ struct bfq_entity *entity; ++ int i; ++ ++ BUG_ON(!pd); ++ bfqg = pd_to_bfqg(pd); ++ BUG_ON(!bfqg); ++ bfqd = bfqg->bfqd; ++ BUG_ON(bfqd && !bfqd->root_group); ++ ++ entity = bfqg->my_entity; ++ ++ if (!entity) /* root group */ ++ return; ++ ++ /* ++ * Empty all service_trees belonging to this group before ++ * deactivating the group itself. ++ */ ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { ++ BUG_ON(!bfqg->sched_data.service_tree); ++ st = bfqg->sched_data.service_tree + i; ++ /* ++ * The idle tree may still contain bfq_queues belonging ++ * to exited task because they never migrated to a different ++ * cgroup from the one being destroyed now. No one else ++ * can access them so it's safe to act without any lock. ++ */ ++ bfq_flush_idle_tree(st); ++ ++ /* ++ * It may happen that some queues are still active ++ * (busy) upon group destruction (if the corresponding ++ * processes have been forced to terminate). We move ++ * all the leaf entities corresponding to these queues ++ * to the root_group. ++ * Also, it may happen that the group has an entity ++ * in service, which is disconnected from the active ++ * tree: it must be moved, too. ++ * There is no need to put the sync queues, as the ++ * scheduler has taken no reference. ++ */ ++ bfq_reparent_active_entities(bfqd, bfqg, st); ++ BUG_ON(!RB_EMPTY_ROOT(&st->active)); ++ BUG_ON(!RB_EMPTY_ROOT(&st->idle)); ++ } ++ BUG_ON(bfqg->sched_data.next_in_service); ++ BUG_ON(bfqg->sched_data.in_service_entity); ++ ++ __bfq_deactivate_entity(entity, false); ++ bfq_put_async_queues(bfqd, bfqg); ++ ++ /* ++ * @blkg is going offline and will be ignored by ++ * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so ++ * that they don't get lost. If IOs complete after this point, the ++ * stats for them will be lost. Oh well... ++ */ ++ bfqg_stats_xfer_dead(bfqg); ++} ++ ++static void bfq_end_wr_async(struct bfq_data *bfqd) ++{ ++ struct blkcg_gq *blkg; ++ ++ list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { ++ struct bfq_group *bfqg = blkg_to_bfqg(blkg); ++ BUG_ON(!bfqg); ++ ++ bfq_end_wr_async_queues(bfqd, bfqg); ++ } ++ bfq_end_wr_async_queues(bfqd, bfqd->root_group); ++} ++ ++static int bfq_io_show_weight(struct seq_file *sf, void *v) ++{ ++ struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); ++ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); ++ unsigned int val = 0; ++ ++ if (bfqgd) ++ val = bfqgd->weight; ++ ++ seq_printf(sf, "%u\n", val); ++ ++ return 0; ++} ++ ++static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, ++ struct cftype *cftype, ++ u64 val) ++{ ++ struct blkcg *blkcg = css_to_blkcg(css); ++ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); ++ struct blkcg_gq *blkg; ++ int ret = -ERANGE; ++ ++ if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) ++ return ret; ++ ++ ret = 0; ++ spin_lock_irq(&blkcg->lock); ++ bfqgd->weight = (unsigned short)val; ++ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { ++ struct bfq_group *bfqg = blkg_to_bfqg(blkg); ++ ++ if (!bfqg) ++ continue; ++ /* ++ * Setting the prio_changed flag of the entity ++ * to 1 with new_weight == weight would re-set ++ * the value of the weight to its ioprio mapping. ++ * Set the flag only if necessary. 
++ */ ++ if ((unsigned short)val != bfqg->entity.new_weight) { ++ bfqg->entity.new_weight = (unsigned short)val; ++ /* ++ * Make sure that the above new value has been ++ * stored in bfqg->entity.new_weight before ++ * setting the prio_changed flag. In fact, ++ * this flag may be read asynchronously (in ++ * critical sections protected by a different ++ * lock than that held here), and finding this ++ * flag set may cause the execution of the code ++ * for updating parameters whose value may ++ * depend also on bfqg->entity.new_weight (in ++ * __bfq_entity_update_weight_prio). ++ * This barrier makes sure that the new value ++ * of bfqg->entity.new_weight is correctly ++ * seen in that code. ++ */ ++ smp_wmb(); ++ bfqg->entity.prio_changed = 1; ++ } ++ } ++ spin_unlock_irq(&blkcg->lock); ++ ++ return ret; ++} ++ ++static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, ++ char *buf, size_t nbytes, ++ loff_t off) ++{ ++ u64 weight; ++ /* First unsigned long found in the file is used */ ++ int ret = kstrtoull(strim(buf), 0, &weight); ++ ++ if (ret) ++ return ret; ++ ++ return bfq_io_set_weight_legacy(of_css(of), NULL, weight); ++} ++ ++static int bfqg_print_stat(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, ++ &blkcg_policy_bfq, seq_cft(sf)->private, false); ++ return 0; ++} ++ ++static int bfqg_print_rwstat(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, ++ &blkcg_policy_bfq, seq_cft(sf)->private, true); ++ return 0; ++} ++ ++static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, ++ struct blkg_policy_data *pd, int off) ++{ ++ u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), ++ &blkcg_policy_bfq, off); ++ return __blkg_prfill_u64(sf, pd, sum); ++} ++ ++static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, ++ struct blkg_policy_data *pd, int off) ++{ ++ struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), ++ &blkcg_policy_bfq, ++ off); ++ return __blkg_prfill_rwstat(sf, pd, &sum); ++} ++ ++static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), ++ bfqg_prfill_stat_recursive, &blkcg_policy_bfq, ++ seq_cft(sf)->private, false); ++ return 0; ++} ++ ++static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), ++ bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, ++ seq_cft(sf)->private, true); ++ return 0; ++} ++ ++static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, ++ int off) ++{ ++ u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); ++ ++ return __blkg_prfill_u64(sf, pd, sum >> 9); ++} ++ ++static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), ++ bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); ++ return 0; ++} ++ ++static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, ++ struct blkg_policy_data *pd, int off) ++{ ++ struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, ++ offsetof(struct blkcg_gq, stat_bytes)); ++ u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + ++ atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); ++ ++ return __blkg_prfill_u64(sf, pd, sum >> 9); ++} ++ ++static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), ++ bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, ++ false); ++ return 0; ++} ++ ++ 
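++/*
++ * Illustrative sketch (editorial, not part of the original patch):
++ * the prfill/print helpers above and below back the per-group files
++ * declared in bfq_blkcg_legacy_files, so with BFQ_SQ_GROUP_IOSCHED
++ * enabled a cgroups-v1 session might look like:
++ *
++ *   mkdir /sys/fs/cgroup/blkio/slow
++ *   echo 100 > /sys/fs/cgroup/blkio/slow/blkio.bfq.weight
++ *   cat /sys/fs/cgroup/blkio/slow/blkio.bfq.io_service_bytes
++ *
++ * Weights outside [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT] are rejected with
++ * -ERANGE by bfq_io_set_weight_legacy() above.
++ */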
++static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, ++ struct blkg_policy_data *pd, int off) ++{ ++ struct bfq_group *bfqg = pd_to_bfqg(pd); ++ u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); ++ u64 v = 0; ++ ++ if (samples) { ++ v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); ++ v = div64_u64(v, samples); ++ } ++ __blkg_prfill_u64(sf, pd, v); ++ return 0; ++} ++ ++/* print avg_queue_size */ ++static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), ++ bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, ++ 0, false); ++ return 0; ++} ++ ++static struct bfq_group * ++bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) ++{ ++ int ret; ++ ++ ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); ++ if (ret) ++ return NULL; ++ ++ return blkg_to_bfqg(bfqd->queue->root_blkg); ++} ++ ++static struct cftype bfq_blkcg_legacy_files[] = { ++ { ++ .name = "bfq.weight", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .seq_show = bfq_io_show_weight, ++ .write_u64 = bfq_io_set_weight_legacy, ++ }, ++ ++ /* statistics, covers only the tasks in the bfqg */ ++ { ++ .name = "bfq.time", ++ .private = offsetof(struct bfq_group, stats.time), ++ .seq_show = bfqg_print_stat, ++ }, ++ { ++ .name = "bfq.sectors", ++ .seq_show = bfqg_print_stat_sectors, ++ }, ++ { ++ .name = "bfq.io_service_bytes", ++ .private = (unsigned long)&blkcg_policy_bfq, ++ .seq_show = blkg_print_stat_bytes, ++ }, ++ { ++ .name = "bfq.io_serviced", ++ .private = (unsigned long)&blkcg_policy_bfq, ++ .seq_show = blkg_print_stat_ios, ++ }, ++ { ++ .name = "bfq.io_service_time", ++ .private = offsetof(struct bfq_group, stats.service_time), ++ .seq_show = bfqg_print_rwstat, ++ }, ++ { ++ .name = "bfq.io_wait_time", ++ .private = offsetof(struct bfq_group, stats.wait_time), ++ .seq_show = bfqg_print_rwstat, ++ }, ++ { ++ .name = "bfq.io_merged", ++ .private = offsetof(struct bfq_group, stats.merged), ++ .seq_show = bfqg_print_rwstat, ++ }, ++ { ++ .name = "bfq.io_queued", ++ .private = offsetof(struct bfq_group, stats.queued), ++ .seq_show = bfqg_print_rwstat, ++ }, ++ ++ /* the same statictics which cover the bfqg and its descendants */ ++ { ++ .name = "bfq.time_recursive", ++ .private = offsetof(struct bfq_group, stats.time), ++ .seq_show = bfqg_print_stat_recursive, ++ }, ++ { ++ .name = "bfq.sectors_recursive", ++ .seq_show = bfqg_print_stat_sectors_recursive, ++ }, ++ { ++ .name = "bfq.io_service_bytes_recursive", ++ .private = (unsigned long)&blkcg_policy_bfq, ++ .seq_show = blkg_print_stat_bytes_recursive, ++ }, ++ { ++ .name = "bfq.io_serviced_recursive", ++ .private = (unsigned long)&blkcg_policy_bfq, ++ .seq_show = blkg_print_stat_ios_recursive, ++ }, ++ { ++ .name = "bfq.io_service_time_recursive", ++ .private = offsetof(struct bfq_group, stats.service_time), ++ .seq_show = bfqg_print_rwstat_recursive, ++ }, ++ { ++ .name = "bfq.io_wait_time_recursive", ++ .private = offsetof(struct bfq_group, stats.wait_time), ++ .seq_show = bfqg_print_rwstat_recursive, ++ }, ++ { ++ .name = "bfq.io_merged_recursive", ++ .private = offsetof(struct bfq_group, stats.merged), ++ .seq_show = bfqg_print_rwstat_recursive, ++ }, ++ { ++ .name = "bfq.io_queued_recursive", ++ .private = offsetof(struct bfq_group, stats.queued), ++ .seq_show = bfqg_print_rwstat_recursive, ++ }, ++ { ++ .name = "bfq.avg_queue_size", ++ .seq_show = bfqg_print_avg_queue_size, ++ }, ++ { ++ .name = "bfq.group_wait_time", ++ .private = offsetof(struct bfq_group, stats.group_wait_time), ++ 
.seq_show = bfqg_print_stat, ++ }, ++ { ++ .name = "bfq.idle_time", ++ .private = offsetof(struct bfq_group, stats.idle_time), ++ .seq_show = bfqg_print_stat, ++ }, ++ { ++ .name = "bfq.empty_time", ++ .private = offsetof(struct bfq_group, stats.empty_time), ++ .seq_show = bfqg_print_stat, ++ }, ++ { ++ .name = "bfq.dequeue", ++ .private = offsetof(struct bfq_group, stats.dequeue), ++ .seq_show = bfqg_print_stat, ++ }, ++ { } /* terminate */ ++}; ++ ++static struct cftype bfq_blkg_files[] = { ++ { ++ .name = "bfq.weight", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .seq_show = bfq_io_show_weight, ++ .write = bfq_io_set_weight, ++ }, ++ {} /* terminate */ ++}; ++ ++#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ ++ ++static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, ++ struct bfq_queue *bfqq, unsigned int op) { } ++static inline void ++bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } ++static inline void ++bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } ++static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, ++ uint64_t start_time, uint64_t io_start_time, ++ unsigned int op) { } ++static inline void ++bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, ++ struct bfq_group *curr_bfqg) { } ++static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } ++static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } ++static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } ++static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } ++static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } ++static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } ++ ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct bfq_group *bfqg) {} ++ ++static void bfq_init_entity(struct bfq_entity *entity, ++ struct bfq_group *bfqg) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ entity->weight = entity->new_weight; ++ entity->orig_weight = entity->new_weight; ++ if (bfqq) { ++ bfqq->ioprio = bfqq->new_ioprio; ++ bfqq->ioprio_class = bfqq->new_ioprio_class; ++ } ++ entity->sched_data = &bfqg->sched_data; ++} ++ ++static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} ++ ++static void bfq_end_wr_async(struct bfq_data *bfqd) ++{ ++ bfq_end_wr_async_queues(bfqd, bfqd->root_group); ++} ++ ++static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, ++ struct blkcg *blkcg) ++{ ++ return bfqd->root_group; ++} ++ ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) ++{ ++ return bfqq->bfqd->root_group; ++} ++ ++static struct bfq_group * ++bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) ++{ ++ struct bfq_group *bfqg; ++ int i; ++ ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); ++ if (!bfqg) ++ return NULL; ++ ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; ++ ++ return bfqg; ++} ++#endif +diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c +new file mode 100644 +index 000000000000..fb7bb8f08b75 +--- /dev/null ++++ b/block/bfq-ioc.c +@@ -0,0 +1,36 @@ ++/* ++ * BFQ: I/O context handling. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2010 Paolo Valente ++ */ ++ ++/** ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. 
++ * @icq: the iocontext queue.
++ */
++static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
++{
++	/* bic->icq is the first member, %NULL will convert to %NULL */
++	return container_of(icq, struct bfq_io_cq, icq);
++}
++
++/**
++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
++ * @bfqd: the lookup key.
++ * @ioc: the io_context of the process doing I/O.
++ *
++ * Queue lock must be held.
++ */
++static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
++					struct io_context *ioc)
++{
++	if (ioc)
++		return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
++	return NULL;
++}
+diff --git a/block/bfq-sched.c b/block/bfq-sched.c
+new file mode 100644
+index 000000000000..ac8991bca9fa
+--- /dev/null
++++ b/block/bfq-sched.c
+@@ -0,0 +1,2002 @@
++/*
++ * BFQ: Hierarchical B-WF2Q+ scheduler.
++ *
++ * Based on ideas and code from CFQ:
++ * Copyright (C) 2003 Jens Axboe
++ *
++ * Copyright (C) 2008 Fabio Checconi
++ *		      Paolo Valente
++ *
++ * Copyright (C) 2015 Paolo Valente
++ *
++ * Copyright (C) 2016 Paolo Valente
++ */
++
++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
++
++/**
++ * bfq_gt - compare two timestamps.
++ * @a: first ts.
++ * @b: second ts.
++ *
++ * Return @a > @b, dealing with wrapping correctly.
++ */
++static int bfq_gt(u64 a, u64 b)
++{
++	return (s64)(a - b) > 0;
++}
++
++static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
++{
++	struct rb_node *node = tree->rb_node;
++
++	return rb_entry(node, struct bfq_entity, rb_node);
++}
++
++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);
++
++static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
++
++/**
++ * bfq_update_next_in_service - update sd->next_in_service
++ * @sd: sched_data for which to perform the update.
++ * @new_entity: if not NULL, pointer to the entity whose activation,
++ *		requeueing or repositioning triggered the invocation of
++ *		this function.
++ *
++ * This function is called to update sd->next_in_service, which, in
++ * its turn, may change as a consequence of the insertion or
++ * extraction of an entity into/from one of the active trees of
++ * sd. These insertions/extractions occur as a consequence of
++ * activations/deactivations of entities, with some activations being
++ * 'true' activations, and other activations being requeueings (i.e.,
++ * implementing the second, requeueing phase of the mechanism used to
++ * reposition an entity in its active tree; see comments on
++ * __bfq_activate_entity and __bfq_requeue_entity for details). In
++ * both the last two activation sub-cases, new_entity points to the
++ * just activated or requeued entity.
++ *
++ * Returns true if sd->next_in_service changes in such a way that
++ * entity->parent may become the next_in_service for its parent
++ * entity.
++ */
++static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
++				       struct bfq_entity *new_entity)
++{
++	struct bfq_entity *next_in_service = sd->next_in_service;
++	struct bfq_queue *bfqq;
++	bool parent_sched_may_change = false;
++
++	/*
++	 * If this update is triggered by the activation, requeueing
++	 * or repositioning of an entity that does not coincide with
++	 * sd->next_in_service, then a full lookup in the active tree
++	 * can be avoided.
In fact, it is enough to check whether the ++ * just-modified entity has a higher priority than ++ * sd->next_in_service, or, even if it has the same priority ++ * as sd->next_in_service, is eligible and has a lower virtual ++ * finish time than sd->next_in_service. If this compound ++ * condition holds, then the new entity becomes the new ++ * next_in_service. Otherwise no change is needed. ++ */ ++ if (new_entity && new_entity != sd->next_in_service) { ++ /* ++ * Flag used to decide whether to replace ++ * sd->next_in_service with new_entity. Tentatively ++ * set to true, and left as true if ++ * sd->next_in_service is NULL. ++ */ ++ bool replace_next = true; ++ ++ /* ++ * If there is already a next_in_service candidate ++ * entity, then compare class priorities or timestamps ++ * to decide whether to replace sd->service_tree with ++ * new_entity. ++ */ ++ if (next_in_service) { ++ unsigned int new_entity_class_idx = ++ bfq_class_idx(new_entity); ++ struct bfq_service_tree *st = ++ sd->service_tree + new_entity_class_idx; ++ ++ /* ++ * For efficiency, evaluate the most likely ++ * sub-condition first. ++ */ ++ replace_next = ++ (new_entity_class_idx == ++ bfq_class_idx(next_in_service) ++ && ++ !bfq_gt(new_entity->start, st->vtime) ++ && ++ bfq_gt(next_in_service->finish, ++ new_entity->finish)) ++ || ++ new_entity_class_idx < ++ bfq_class_idx(next_in_service); ++ } ++ ++ if (replace_next) ++ next_in_service = new_entity; ++ } else /* invoked because of a deactivation: lookup needed */ ++ next_in_service = bfq_lookup_next_entity(sd); ++ ++ if (next_in_service) { ++ parent_sched_may_change = !sd->next_in_service || ++ bfq_update_parent_budget(next_in_service); ++ } ++ ++ sd->next_in_service = next_in_service; ++ ++ if (!next_in_service) ++ return parent_sched_may_change; ++ ++ bfqq = bfq_entity_to_bfqq(next_in_service); ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "update_next_in_service: chosen this queue"); ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ else { ++ struct bfq_group *bfqg = ++ container_of(next_in_service, ++ struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "update_next_in_service: chosen this entity"); ++ } ++#endif ++ return parent_sched_may_change; ++} ++ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++/* both next loops stop at one of the child entities of the root group */ ++#define for_each_entity(entity) \ ++ for (; entity ; entity = entity->parent) ++ ++/* ++ * For each iteration, compute parent in advance, so as to be safe if ++ * entity is deallocated during the iteration. Such a deallocation may ++ * happen as a consequence of a bfq_put_queue that frees the bfq_queue ++ * containing entity. ++ */ ++#define for_each_entity_safe(entity, parent) \ ++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) ++ ++/* ++ * Returns true if this budget changes may let next_in_service->parent ++ * become the next_in_service entity for its parent entity. ++ */ ++static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) ++{ ++ struct bfq_entity *bfqg_entity; ++ struct bfq_group *bfqg; ++ struct bfq_sched_data *group_sd; ++ bool ret = false; ++ ++ BUG_ON(!next_in_service); ++ ++ group_sd = next_in_service->sched_data; ++ ++ bfqg = container_of(group_sd, struct bfq_group, sched_data); ++ /* ++ * bfq_group's my_entity field is not NULL only if the group ++ * is not the root group. We must not touch the root entity ++ * as it must never become an in-service entity. 
++ */
++	bfqg_entity = bfqg->my_entity;
++	if (bfqg_entity) {
++		if (bfqg_entity->budget > next_in_service->budget)
++			ret = true;
++		bfqg_entity->budget = next_in_service->budget;
++	}
++
++	return ret;
++}
++
++/*
++ * This function tells whether entity stops being a candidate for next
++ * service, according to the following logic.
++ *
++ * This function is invoked for an entity that is about to be set in
++ * service. If such an entity is a queue, then the entity is no longer
++ * a candidate for next service (i.e., a candidate entity to serve
++ * after the in-service entity is expired). The function then returns
++ * true.
++ *
++ * In contrast, the entity could still be a candidate for next service
++ * if it is not a queue, and has more than one child. In fact, even if
++ * one of its children is about to be set in service, other children
++ * may still be the next to serve. As a consequence, a non-queue
++ * entity is not a candidate for next-service only if it has only one
++ * child. And only if this condition holds, then the function returns
++ * true for a non-queue entity.
++ */
++static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
++{
++	struct bfq_group *bfqg;
++
++	if (bfq_entity_to_bfqq(entity))
++		return true;
++
++	bfqg = container_of(entity, struct bfq_group, entity);
++
++	BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group);
++	BUG_ON(bfqg->active_entities == 0);
++	if (bfqg->active_entities == 1)
++		return true;
++
++	return false;
++}
++
++#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */
++#define for_each_entity(entity)	\
++	for (; entity ; entity = NULL)
++
++#define for_each_entity_safe(entity, parent) \
++	for (parent = NULL; entity ; entity = parent)
++
++static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
++{
++	return false;
++}
++
++static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
++{
++	return true;
++}
++
++#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */
++
++/*
++ * Shift for timestamp calculations. This actually limits the maximum
++ * service allowed in one timestamp delta (small shift values increase it),
++ * the maximum total weight that can be used for the queues in the system
++ * (big shift values increase it), and the period of virtual time
++ * wraparounds.
++ */
++#define WFQ_SERVICE_SHIFT	22
++
++static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
++{
++	struct bfq_queue *bfqq = NULL;
++
++	BUG_ON(!entity);
++
++	if (!entity->my_sched_data)
++		bfqq = container_of(entity, struct bfq_queue, entity);
++
++	return bfqq;
++}
++
++
++/**
++ * bfq_delta - map service into the virtual time domain.
++ * @service: amount of service.
++ * @weight: scale factor (weight of an entity or weight sum).
++ */
++static u64 bfq_delta(unsigned long service, unsigned long weight)
++{
++	u64 d = (u64)service << WFQ_SERVICE_SHIFT;
++
++	do_div(d, weight);
++	return d;
++}
++
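bfq_delta() above, together with bfq_gt() defined earlier in this file, is all the arithmetic behind the B-WF2Q+ timestamps. The stand-alone sketch below (plain C, illustrative only, not kernel code) replays both: it shows that, for the same amount of service, doubling the weight halves the virtual-time delta, and that the signed-difference trick in bfq_gt() still orders timestamps correctly after a u64 wraparound.

/* vtime.c: build with cc -std=c11 vtime.c */
#include <stdint.h>
#include <stdio.h>

#define WFQ_SERVICE_SHIFT	22

/* same arithmetic as bfq_delta() above, without do_div() */
static uint64_t delta(unsigned long service, unsigned long weight)
{
	return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
}

/* same wraparound-safe comparison as bfq_gt() */
static int gt(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) > 0;
}

int main(void)
{
	/* a weight-2 entity pays half the virtual time of a
	 * weight-1 entity for the same 4096 units of service */
	printf("w=1: %llu\n", (unsigned long long)delta(4096, 1));
	printf("w=2: %llu\n", (unsigned long long)delta(4096, 2));

	/* a timestamp that wrapped around is still 'greater' */
	uint64_t old_ts = UINT64_MAX - 10;
	uint64_t new_ts = old_ts + 20;	/* wraps around to 9 */
	printf("new > old after wrap: %d\n", gt(new_ts, old_ts));
	return 0;
}

++/**
++ * bfq_calc_finish - assign the finish time to an entity.
++ * @entity: the entity to act upon.
++ * @service: the service to be charged to the entity.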
++ */ ++static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ unsigned long long start, finish, delta; ++ ++ BUG_ON(entity->weight == 0); ++ ++ entity->finish = entity->start + ++ bfq_delta(service, entity->weight); ++ ++ start = ((entity->start>>10)*1000)>>12; ++ finish = ((entity->finish>>10)*1000)>>12; ++ delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12; ++ ++ if (bfqq) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "calc_finish: serv %lu, w %d", ++ service, entity->weight); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "calc_finish: start %llu, finish %llu, delta %llu", ++ start, finish, delta); ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ } else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "calc_finish group: serv %lu, w %d", ++ service, entity->weight); ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "calc_finish group: start %llu, finish %llu, delta %llu", ++ start, finish, delta); ++#endif ++ } ++} ++ ++/** ++ * bfq_entity_of - get an entity from a node. ++ * @node: the node field of the entity. ++ * ++ * Convert a node pointer to the relative entity. This is used only ++ * to simplify the logic of some functions and not as the generic ++ * conversion mechanism because, e.g., in the tree walking functions, ++ * the check for a %NULL value would be redundant. ++ */ ++static struct bfq_entity *bfq_entity_of(struct rb_node *node) ++{ ++ struct bfq_entity *entity = NULL; ++ ++ if (node) ++ entity = rb_entry(node, struct bfq_entity, rb_node); ++ ++ return entity; ++} ++ ++/** ++ * bfq_extract - remove an entity from a tree. ++ * @root: the tree root. ++ * @entity: the entity to remove. ++ */ ++static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) ++{ ++ BUG_ON(entity->tree != root); ++ ++ entity->tree = NULL; ++ rb_erase(&entity->rb_node, root); ++} ++ ++/** ++ * bfq_idle_extract - extract an entity from the idle tree. ++ * @st: the service tree of the owning @entity. ++ * @entity: the entity being removed. ++ */ ++static void bfq_idle_extract(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *next; ++ ++ BUG_ON(entity->tree != &st->idle); ++ ++ if (entity == st->first_idle) { ++ next = rb_next(&entity->rb_node); ++ st->first_idle = bfq_entity_of(next); ++ } ++ ++ if (entity == st->last_idle) { ++ next = rb_prev(&entity->rb_node); ++ st->last_idle = bfq_entity_of(next); ++ } ++ ++ bfq_extract(&st->idle, entity); ++ ++ if (bfqq) ++ list_del(&bfqq->bfqq_list); ++} ++ ++/** ++ * bfq_insert - generic tree insertion. ++ * @root: tree root. ++ * @entity: entity to insert. ++ * ++ * This is used for the idle and the active tree, since they are both ++ * ordered by finish time. ++ */ ++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) ++{ ++ struct bfq_entity *entry; ++ struct rb_node **node = &root->rb_node; ++ struct rb_node *parent = NULL; ++ ++ BUG_ON(entity->tree); ++ ++ while (*node) { ++ parent = *node; ++ entry = rb_entry(parent, struct bfq_entity, rb_node); ++ ++ if (bfq_gt(entry->finish, entity->finish)) ++ node = &parent->rb_left; ++ else ++ node = &parent->rb_right; ++ } ++ ++ rb_link_node(&entity->rb_node, parent, node); ++ rb_insert_color(&entity->rb_node, root); ++ ++ entity->tree = root; ++} ++ ++/** ++ * bfq_update_min - update the min_start field of a entity. 
++ * @entity: the entity to update. ++ * @node: one of its children. ++ * ++ * This function is called when @entity may store an invalid value for ++ * min_start due to updates to the active tree. The function assumes ++ * that the subtree rooted at @node (which may be its left or its right ++ * child) has a valid min_start value. ++ */ ++static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) ++{ ++ struct bfq_entity *child; ++ ++ if (node) { ++ child = rb_entry(node, struct bfq_entity, rb_node); ++ if (bfq_gt(entity->min_start, child->min_start)) ++ entity->min_start = child->min_start; ++ } ++} ++ ++/** ++ * bfq_update_active_node - recalculate min_start. ++ * @node: the node to update. ++ * ++ * @node may have changed position or one of its children may have moved, ++ * this function updates its min_start value. The left and right subtrees ++ * are assumed to hold a correct min_start value. ++ */ ++static void bfq_update_active_node(struct rb_node *node) ++{ ++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ entity->min_start = entity->start; ++ bfq_update_min(entity, node->rb_right); ++ bfq_update_min(entity, node->rb_left); ++ ++ if (bfqq) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "update_active_node: new min_start %llu", ++ ((entity->min_start>>10)*1000)>>12); ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ } else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "update_active_node: new min_start %llu", ++ ((entity->min_start>>10)*1000)>>12); ++#endif ++ } ++} ++ ++/** ++ * bfq_update_active_tree - update min_start for the whole active tree. ++ * @node: the starting node. ++ * ++ * @node must be the deepest modified node after an update. This function ++ * updates its min_start using the values held by its children, assuming ++ * that they did not change, and then updates all the nodes that may have ++ * changed in the path to the root. The only nodes that may have changed ++ * are the ones in the path or their siblings. ++ */ ++static void bfq_update_active_tree(struct rb_node *node) ++{ ++ struct rb_node *parent; ++ ++up: ++ bfq_update_active_node(node); ++ ++ parent = rb_parent(node); ++ if (!parent) ++ return; ++ ++ if (node == parent->rb_left && parent->rb_right) ++ bfq_update_active_node(parent->rb_right); ++ else if (parent->rb_left) ++ bfq_update_active_node(parent->rb_left); ++ ++ node = parent; ++ goto up; ++} ++ ++static void bfq_weights_tree_add(struct bfq_data *bfqd, ++ struct bfq_entity *entity, ++ struct rb_root *root); ++ ++static void bfq_weights_tree_remove(struct bfq_data *bfqd, ++ struct bfq_entity *entity, ++ struct rb_root *root); ++ ++ ++/** ++ * bfq_active_insert - insert an entity in the active tree of its ++ * group/device. ++ * @st: the service tree of the entity. ++ * @entity: the entity being inserted. ++ * ++ * The active tree is ordered by finish time, but an extra key is kept ++ * per each node, containing the minimum value for the start times of ++ * its children (and the node itself), so it's possible to search for ++ * the eligible node with the lowest finish time in logarithmic time. 
++ */ ++static void bfq_active_insert(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *node = &entity->rb_node; ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ struct bfq_sched_data *sd = NULL; ++ struct bfq_group *bfqg = NULL; ++ struct bfq_data *bfqd = NULL; ++#endif ++ ++ bfq_insert(&st->active, entity); ++ ++ if (node->rb_left) ++ node = node->rb_left; ++ else if (node->rb_right) ++ node = node->rb_right; ++ ++ bfq_update_active_tree(node); ++ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ sd = entity->sched_data; ++ bfqg = container_of(sd, struct bfq_group, sched_data); ++ BUG_ON(!bfqg); ++ bfqd = (struct bfq_data *)bfqg->bfqd; ++#endif ++ if (bfqq) ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ else { /* bfq_group */ ++ BUG_ON(!bfqd); ++ bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); ++ } ++ if (bfqg != bfqd->root_group) { ++ BUG_ON(!bfqg); ++ BUG_ON(!bfqd); ++ bfqg->active_entities++; ++ } ++#endif ++} ++ ++/** ++ * bfq_ioprio_to_weight - calc a weight from an ioprio. ++ * @ioprio: the ioprio value to convert. ++ */ ++static unsigned short bfq_ioprio_to_weight(int ioprio) ++{ ++ BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); ++ return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; ++} ++ ++/** ++ * bfq_weight_to_ioprio - calc an ioprio from a weight. ++ * @weight: the weight value to convert. ++ * ++ * To preserve as much as possible the old only-ioprio user interface, ++ * 0 is used as an escape ioprio value for weights (numerically) equal or ++ * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. ++ */ ++static unsigned short bfq_weight_to_ioprio(int weight) ++{ ++ BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); ++ return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ? ++ 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight; ++} ++ ++static void bfq_get_entity(struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ if (bfqq) { ++ bfqq->ref++; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", ++ bfqq, bfqq->ref); ++ } ++} ++ ++/** ++ * bfq_find_deepest - find the deepest node that an extraction can modify. ++ * @node: the node being removed. ++ * ++ * Do the first step of an extraction in an rb tree, looking for the ++ * node that will replace @node, and returning the deepest node that ++ * the following modifications to the tree can touch. If @node is the ++ * last node in the tree return %NULL. ++ */ ++static struct rb_node *bfq_find_deepest(struct rb_node *node) ++{ ++ struct rb_node *deepest; ++ ++ if (!node->rb_right && !node->rb_left) ++ deepest = rb_parent(node); ++ else if (!node->rb_right) ++ deepest = node->rb_left; ++ else if (!node->rb_left) ++ deepest = node->rb_right; ++ else { ++ deepest = rb_next(node); ++ if (deepest->rb_right) ++ deepest = deepest->rb_right; ++ else if (rb_parent(deepest) != node) ++ deepest = rb_parent(deepest); ++ } ++ ++ return deepest; ++} ++ ++/** ++ * bfq_active_extract - remove an entity from the active tree. ++ * @st: the service_tree containing the tree. ++ * @entity: the entity being removed. 
++ */ ++static void bfq_active_extract(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *node; ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ struct bfq_sched_data *sd = NULL; ++ struct bfq_group *bfqg = NULL; ++ struct bfq_data *bfqd = NULL; ++#endif ++ ++ node = bfq_find_deepest(&entity->rb_node); ++ bfq_extract(&st->active, entity); ++ ++ if (node) ++ bfq_update_active_tree(node); ++ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ sd = entity->sched_data; ++ bfqg = container_of(sd, struct bfq_group, sched_data); ++ BUG_ON(!bfqg); ++ bfqd = (struct bfq_data *)bfqg->bfqd; ++#endif ++ if (bfqq) ++ list_del(&bfqq->bfqq_list); ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ else { /* bfq_group */ ++ BUG_ON(!bfqd); ++ bfq_weights_tree_remove(bfqd, entity, ++ &bfqd->group_weights_tree); ++ } ++ if (bfqg != bfqd->root_group) { ++ BUG_ON(!bfqg); ++ BUG_ON(!bfqd); ++ BUG_ON(!bfqg->active_entities); ++ bfqg->active_entities--; ++ } ++#endif ++} ++ ++/** ++ * bfq_idle_insert - insert an entity into the idle tree. ++ * @st: the service tree containing the tree. ++ * @entity: the entity to insert. ++ */ ++static void bfq_idle_insert(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct bfq_entity *first_idle = st->first_idle; ++ struct bfq_entity *last_idle = st->last_idle; ++ ++ if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) ++ st->first_idle = entity; ++ if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) ++ st->last_idle = entity; ++ ++ bfq_insert(&st->idle, entity); ++ ++ if (bfqq) ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); ++} ++ ++/** ++ * bfq_forget_entity - do not consider entity any longer for scheduling ++ * @st: the service tree. ++ * @entity: the entity being removed. ++ * @is_in_service: true if entity is currently the in-service entity. ++ * ++ * Forget everything about @entity. In addition, if entity represents ++ * a queue, and the latter is not in service, then release the service ++ * reference to the queue (the one taken through bfq_get_entity). In ++ * fact, in this case, there is really no more service reference to ++ * the queue, as the latter is also outside any service tree. If, ++ * instead, the queue is in service, then __bfq_bfqd_reset_in_service ++ * will take care of putting the reference when the queue finally ++ * stops being served. ++ */ ++static void bfq_forget_entity(struct bfq_service_tree *st, ++ struct bfq_entity *entity, ++ bool is_in_service) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ BUG_ON(!entity->on_st); ++ ++ entity->on_st = false; ++ st->wsum -= entity->weight; ++ if (bfqq && !is_in_service) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity (before): %p %d", ++ bfqq, bfqq->ref); ++ bfq_put_queue(bfqq); ++ } ++} ++ ++/** ++ * bfq_put_idle_entity - release the idle tree ref of an entity. ++ * @st: service tree for the entity. ++ * @entity: the entity being released. ++ */ ++static void bfq_put_idle_entity(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ bfq_idle_extract(st, entity); ++ bfq_forget_entity(st, entity, ++ entity == entity->sched_data->in_service_entity); ++} ++ ++/** ++ * bfq_forget_idle - update the idle tree if necessary. ++ * @st: the service tree to act upon. ++ * ++ * To preserve the global O(log N) complexity we only remove one entry here; ++ * as the idle tree will not grow indefinitely this can be done safely. 
++ */ ++static void bfq_forget_idle(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *first_idle = st->first_idle; ++ struct bfq_entity *last_idle = st->last_idle; ++ ++ if (RB_EMPTY_ROOT(&st->active) && last_idle && ++ !bfq_gt(last_idle->finish, st->vtime)) { ++ /* ++ * Forget the whole idle tree, increasing the vtime past ++ * the last finish time of idle entities. ++ */ ++ st->vtime = last_idle->finish; ++ } ++ ++ if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) ++ bfq_put_idle_entity(st, first_idle); ++} ++ ++/* ++ * Update weight and priority of entity. If update_class_too is true, ++ * then update the ioprio_class of entity too. ++ * ++ * The reason why the update of ioprio_class is controlled through the ++ * last parameter is as follows. Changing the ioprio class of an ++ * entity implies changing the destination service trees for that ++ * entity. If such a change occurred when the entity is already on one ++ * of the service trees for its previous class, then the state of the ++ * entity would become more complex: none of the new possible service ++ * trees for the entity, according to bfq_entity_service_tree(), would ++ * match any of the possible service trees on which the entity ++ * is. Complex operations involving these trees, such as entity ++ * activations and deactivations, should take into account this ++ * additional complexity. To avoid this issue, this function is ++ * invoked with update_class_too unset in the points in the code where ++ * entity may happen to be on some tree. ++ */ ++static struct bfq_service_tree * ++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, ++ struct bfq_entity *entity, ++ bool update_class_too) ++{ ++ struct bfq_service_tree *new_st = old_st; ++ ++ if (entity->prio_changed) { ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ unsigned int prev_weight, new_weight; ++ struct bfq_data *bfqd = NULL; ++ struct rb_root *root; ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ struct bfq_sched_data *sd; ++ struct bfq_group *bfqg; ++#endif ++ ++ if (bfqq) ++ bfqd = bfqq->bfqd; ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ else { ++ sd = entity->my_sched_data; ++ bfqg = container_of(sd, struct bfq_group, sched_data); ++ BUG_ON(!bfqg); ++ bfqd = (struct bfq_data *)bfqg->bfqd; ++ BUG_ON(!bfqd); ++ } ++#endif ++ ++ BUG_ON(old_st->wsum < entity->weight); ++ old_st->wsum -= entity->weight; ++ ++ if (entity->new_weight != entity->orig_weight) { ++ if (entity->new_weight < BFQ_MIN_WEIGHT || ++ entity->new_weight > BFQ_MAX_WEIGHT) { ++ pr_crit("update_weight_prio: new_weight %d\n", ++ entity->new_weight); ++ if (entity->new_weight < BFQ_MIN_WEIGHT) ++ entity->new_weight = BFQ_MIN_WEIGHT; ++ else ++ entity->new_weight = BFQ_MAX_WEIGHT; ++ } ++ entity->orig_weight = entity->new_weight; ++ if (bfqq) ++ bfqq->ioprio = ++ bfq_weight_to_ioprio(entity->orig_weight); ++ } ++ ++ if (bfqq && update_class_too) ++ bfqq->ioprio_class = bfqq->new_ioprio_class; ++ ++ /* ++ * Reset prio_changed only if the ioprio_class change ++ * is not pending any longer. ++ */ ++ if (!bfqq || bfqq->ioprio_class == bfqq->new_ioprio_class) ++ entity->prio_changed = 0; ++ ++ /* ++ * NOTE: here we may be changing the weight too early, ++ * this will cause unfairness. The correct approach ++ * would have required additional complexity to defer ++ * weight changes to the proper time instants (i.e., ++ * when entity->finish <= old_st->vtime). 
++ */ ++ new_st = bfq_entity_service_tree(entity); ++ ++ prev_weight = entity->weight; ++ new_weight = entity->orig_weight * ++ (bfqq ? bfqq->wr_coeff : 1); ++ /* ++ * If the weight of the entity changes, remove the entity ++ * from its old weight counter (if there is a counter ++ * associated with the entity), and add it to the counter ++ * associated with its new weight. ++ */ ++ if (prev_weight != new_weight) { ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "weight changed %d %d(%d %d)", ++ prev_weight, new_weight, ++ entity->orig_weight, ++ bfqq->wr_coeff); ++ ++ root = bfqq ? &bfqd->queue_weights_tree : ++ &bfqd->group_weights_tree; ++ bfq_weights_tree_remove(bfqd, entity, root); ++ } ++ entity->weight = new_weight; ++ /* ++ * Add the entity to its weights tree only if it is ++ * not associated with a weight-raised queue. ++ */ ++ if (prev_weight != new_weight && ++ (bfqq ? bfqq->wr_coeff == 1 : 1)) ++ /* If we get here, root has been initialized. */ ++ bfq_weights_tree_add(bfqd, entity, root); ++ ++ new_st->wsum += entity->weight; ++ ++ if (new_st != old_st) ++ entity->start = new_st->vtime; ++ } ++ ++ return new_st; ++} ++ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); ++#endif ++ ++/** ++ * bfq_bfqq_served - update the scheduler status after selection for ++ * service. ++ * @bfqq: the queue being served. ++ * @served: bytes to transfer. ++ * ++ * NOTE: this can be optimized, as the timestamps of upper level entities ++ * are synchronized every time a new bfqq is selected for service. By now, ++ * we keep it to better check consistency. ++ */ ++static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_service_tree *st; ++ ++ for_each_entity(entity) { ++ st = bfq_entity_service_tree(entity); ++ ++ entity->service += served; ++ ++ BUG_ON(st->wsum == 0); ++ ++ st->vtime += bfq_delta(served, st->wsum); ++ bfq_forget_idle(st); ++ } ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); ++#endif ++ st = bfq_entity_service_tree(&bfqq->entity); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", ++ served, ((st->vtime>>10)*1000)>>12, st); ++} ++ ++/** ++ * bfq_bfqq_charge_time - charge an amount of service equivalent to the length ++ * of the time interval during which bfqq has been in ++ * service. ++ * @bfqd: the device ++ * @bfqq: the queue that needs a service update. ++ * @time_ms: the amount of time during which the queue has received service ++ * ++ * If a queue does not consume its budget fast enough, then providing ++ * the queue with service fairness may impair throughput, more or less ++ * severely. For this reason, queues that consume their budget slowly ++ * are provided with time fairness instead of service fairness. This ++ * goal is achieved through the BFQ scheduling engine, even if such an ++ * engine works in the service, and not in the time domain. The trick ++ * is charging these queues with an inflated amount of service, equal ++ * to the amount of service that they would have received during their ++ * service slot if they had been fast, i.e., if their requests had ++ * been dispatched at a rate equal to the estimated peak rate. ++ * ++ * It is worth noting that time fairness can cause important ++ * distortions in terms of bandwidth distribution, on devices with ++ * internal queueing. 
The reason is that I/O requests dispatched ++ * during the service slot of a queue may be served after that service ++ * slot is finished, and may have a total processing time loosely ++ * correlated with the duration of the service slot. This is ++ * especially true for short service slots. ++ */ ++static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ unsigned long time_ms) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ int tot_serv_to_charge = entity->service; ++ unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout); ++ ++ if (time_ms > 0 && time_ms < timeout_ms) ++ tot_serv_to_charge = ++ (bfqd->bfq_max_budget * time_ms) / timeout_ms; ++ ++ if (tot_serv_to_charge < entity->service) ++ tot_serv_to_charge = entity->service; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "charge_time: %lu/%u ms, %d/%d/%d sectors", ++ time_ms, timeout_ms, entity->service, ++ tot_serv_to_charge, entity->budget); ++ ++ /* Increase budget to avoid inconsistencies */ ++ if (tot_serv_to_charge > entity->budget) ++ entity->budget = tot_serv_to_charge; ++ ++ bfq_bfqq_served(bfqq, ++ max_t(int, 0, tot_serv_to_charge - entity->service)); ++} ++ ++static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, ++ struct bfq_service_tree *st, ++ bool backshifted) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct bfq_sched_data *sd = entity->sched_data; ++ ++ /* ++ * When this function is invoked, entity is not in any service ++ * tree, then it is safe to invoke next function with the last ++ * parameter set (see the comments on the function). ++ */ ++ st = __bfq_entity_update_weight_prio(st, entity, true); ++ bfq_calc_finish(entity, entity->budget); ++ ++ /* ++ * If some queues enjoy backshifting for a while, then their ++ * (virtual) finish timestamps may happen to become lower and ++ * lower than the system virtual time. In particular, if ++ * these queues often happen to be idle for short time ++ * periods, and during such time periods other queues with ++ * higher timestamps happen to be busy, then the backshifted ++ * timestamps of the former queues can become much lower than ++ * the system virtual time. In fact, to serve the queues with ++ * higher timestamps while the ones with lower timestamps are ++ * idle, the system virtual time may be pushed-up to much ++ * higher values than the finish timestamps of the idle ++ * queues. As a consequence, the finish timestamps of all new ++ * or newly activated queues may end up being much larger than ++ * those of lucky queues with backshifted timestamps. The ++ * latter queues may then monopolize the device for a lot of ++ * time. This would simply break service guarantees. ++ * ++ * To reduce this problem, push up a little bit the ++ * backshifted timestamps of the queue associated with this ++ * entity (only a queue can happen to have the backshifted ++ * flag set): just enough to let the finish timestamp of the ++ * queue be equal to the current value of the system virtual ++ * time. This may introduce a little unfairness among queues ++ * with backshifted timestamps, but it does not break ++ * worst-case fairness guarantees. ++ * ++ * As a special case, if bfqq is weight-raised, push up ++ * timestamps much less, to keep very low the probability that ++ * this push up causes the backshifted finish timestamps of ++ * weight-raised queues to become higher than the backshifted ++ * finish timestamps of non weight-raised queues. 
++ */
++	if (backshifted && bfq_gt(st->vtime, entity->finish)) {
++		unsigned long delta = st->vtime - entity->finish;
++
++		if (bfqq)
++			delta /= bfqq->wr_coeff;
++
++		entity->start += delta;
++		entity->finish += delta;
++
++		if (bfqq) {
++			bfq_log_bfqq(bfqq->bfqd, bfqq,
++				     "__activate_entity: new queue finish %llu",
++				     ((entity->finish>>10)*1000)>>12);
++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED
++		} else {
++			struct bfq_group *bfqg =
++				container_of(entity, struct bfq_group, entity);
++
++			bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
++				     "__activate_entity: new group finish %llu",
++				     ((entity->finish>>10)*1000)>>12);
++#endif
++		}
++	}
++
++	bfq_active_insert(st, entity);
++
++	if (bfqq) {
++		bfq_log_bfqq(bfqq->bfqd, bfqq,
++			     "__activate_entity: queue %seligible in st %p",
++			     entity->start <= st->vtime ? "" : "non ", st);
++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED
++	} else {
++		struct bfq_group *bfqg =
++			container_of(entity, struct bfq_group, entity);
++
++		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
++			     "__activate_entity: group %seligible in st %p",
++			     entity->start <= st->vtime ? "" : "non ", st);
++#endif
++	}
++	BUG_ON(RB_EMPTY_ROOT(&st->active));
++	BUG_ON(&st->active != &sd->service_tree->active &&
++	       &st->active != &(sd->service_tree+1)->active &&
++	       &st->active != &(sd->service_tree+2)->active);
++}
++
++/**
++ * __bfq_activate_entity - handle activation of entity.
++ * @entity: the entity being activated.
++ * @non_blocking_wait_rq: true if entity was waiting for a request
++ *
++ * Called for a 'true' activation, i.e., if entity is not active and
++ * one of its children receives a new request.
++ *
++ * Basically, this function updates the timestamps of entity and
++ * inserts entity into its active tree, after possibly extracting it
++ * from its idle tree.
++ */
++static void __bfq_activate_entity(struct bfq_entity *entity,
++				  bool non_blocking_wait_rq)
++{
++	struct bfq_sched_data *sd = entity->sched_data;
++	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
++	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
++	bool backshifted = false;
++	unsigned long long min_vstart;
++
++	BUG_ON(!sd);
++	BUG_ON(!st);
++
++	/* See comments on bfq_bfqq_update_budg_for_activation */
++	if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
++		backshifted = true;
++		min_vstart = entity->finish;
++	} else
++		min_vstart = st->vtime;
++
++	if (entity->tree == &st->idle) {
++		/*
++		 * Must be on the idle tree, bfq_idle_extract() will
++		 * check for that.
++		 */
++		bfq_idle_extract(st, entity);
++		entity->start = bfq_gt(min_vstart, entity->finish) ?
++			min_vstart : entity->finish;
++	} else {
++		/*
++		 * The finish time of the entity may be invalid, and
++		 * it is in the past for sure, otherwise the queue
++		 * would have been on the idle tree.
++		 */
++		entity->start = min_vstart;
++		st->wsum += entity->weight;
++		/*
++		 * entity is about to be inserted into a service tree,
++		 * and then set in service: get a reference to make
++		 * sure entity does not disappear until it is no
++		 * longer in service or scheduled for service.
++ */
++		bfq_get_entity(entity);
++
++		BUG_ON(entity->on_st && bfqq);
++
++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED
++		if (entity->on_st && !bfqq) {
++			struct bfq_group *bfqg =
++				container_of(entity, struct bfq_group,
++					     entity);
++
++			bfq_log_bfqg((struct bfq_data *)bfqg->bfqd,
++				     bfqg,
++				     "activate bug, class %d in_service %p",
++				     bfq_class_idx(entity), sd->in_service_entity);
++		}
++#endif
++		BUG_ON(entity->on_st && !bfqq);
++		entity->on_st = true;
++	}
++
++	bfq_update_fin_time_enqueue(entity, st, backshifted);
++}
++
++/**
++ * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
++ * @entity: the entity being requeued or repositioned.
++ *
++ * Requeueing is needed if this entity stops being served, which
++ * happens if a leaf descendant entity has expired. On the other hand,
++ * repositioning is needed if the next_in_service entity for the child
++ * entity has changed. See the comments inside the function for
++ * details.
++ *
++ * Basically, this function: 1) removes entity from its active tree if
++ * present there, 2) updates the timestamps of entity and 3) inserts
++ * entity back into its active tree (in the new, right position for
++ * the new values of the timestamps).
++ */
++static void __bfq_requeue_entity(struct bfq_entity *entity)
++{
++	struct bfq_sched_data *sd = entity->sched_data;
++	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
++
++	BUG_ON(!sd);
++	BUG_ON(!st);
++
++	BUG_ON(entity != sd->in_service_entity &&
++	       entity->tree != &st->active);
++
++	if (entity == sd->in_service_entity) {
++		/*
++		 * We are requeueing the current in-service entity,
++		 * which may have to be done for one of the following
++		 * reasons:
++		 * - entity represents the in-service queue, and the
++		 *   in-service queue is being requeued after an
++		 *   expiration;
++		 * - entity represents a group, and its budget has
++		 *   changed because one of its child entities has
++		 *   just been either activated or requeued for some
++		 *   reason; the timestamps of the entity need then to
++		 *   be updated, and the entity needs to be enqueued
++		 *   or repositioned accordingly.
++		 *
++		 * In particular, before requeueing, the start time of
++		 * the entity must be moved forward to account for the
++		 * service that the entity has received while in
++		 * service. This is done by the next instructions. The
++		 * finish time will then be updated according to this
++		 * new value of the start time, and to the budget of
++		 * the entity.
++		 */
++		bfq_calc_finish(entity, entity->service);
++		entity->start = entity->finish;
++		BUG_ON(entity->tree && entity->tree != &st->active);
++		/*
++		 * In addition, if the entity had more than one child
++		 * when set in service, then it was not extracted from
++		 * the active tree. This implies that the position of
++		 * the entity in the active tree may need to be
++		 * changed now, because we have just updated the start
++		 * time of the entity, and we will update its finish
++		 * time in a moment (the requeueing is then, more
++		 * precisely, a repositioning in this case). To
++		 * implement this repositioning, we: 1) dequeue the
++		 * entity here, 2) update the finish time and
++		 * requeue the entity according to the new
++		 * timestamps below.
++ */ ++ if (entity->tree) ++ bfq_active_extract(st, entity); ++ } else { /* The entity is already active, and not in service */ ++ /* ++ * In this case, this function gets called only if the ++ * next_in_service entity below this entity has ++ * changed, and this change has caused the budget of ++ * this entity to change, which, finally implies that ++ * the finish time of this entity must be ++ * updated. Such an update may cause the scheduling, ++ * i.e., the position in the active tree, of this ++ * entity to change. We handle this change by: 1) ++ * dequeueing the entity here, 2) updating the finish ++ * time and requeueing the entity according to the new ++ * timestamps below. This is the same approach as the ++ * non-extracted-entity sub-case above. ++ */ ++ bfq_active_extract(st, entity); ++ } ++ ++ bfq_update_fin_time_enqueue(entity, st, false); ++} ++ ++static void __bfq_activate_requeue_entity(struct bfq_entity *entity, ++ struct bfq_sched_data *sd, ++ bool non_blocking_wait_rq) ++{ ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ ++ if (sd->in_service_entity == entity || entity->tree == &st->active) ++ /* ++ * in service or already queued on the active tree, ++ * requeue or reposition ++ */ ++ __bfq_requeue_entity(entity); ++ else ++ /* ++ * Not in service and not queued on its active tree: ++ * the activity is idle and this is a true activation. ++ */ ++ __bfq_activate_entity(entity, non_blocking_wait_rq); ++} ++ ++ ++/** ++ * bfq_activate_entity - activate or requeue an entity representing a bfq_queue, ++ * and activate, requeue or reposition all ancestors ++ * for which such an update becomes necessary. ++ * @entity: the entity to activate. ++ * @non_blocking_wait_rq: true if this entity was waiting for a request ++ * @requeue: true if this is a requeue, which implies that bfqq is ++ * being expired; thus ALL its ancestors stop being served and must ++ * therefore be requeued ++ */ ++static void bfq_activate_requeue_entity(struct bfq_entity *entity, ++ bool non_blocking_wait_rq, ++ bool requeue) ++{ ++ struct bfq_sched_data *sd; ++ ++ for_each_entity(entity) { ++ BUG_ON(!entity); ++ sd = entity->sched_data; ++ __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); ++ ++ BUG_ON(RB_EMPTY_ROOT(&sd->service_tree->active) && ++ RB_EMPTY_ROOT(&(sd->service_tree+1)->active) && ++ RB_EMPTY_ROOT(&(sd->service_tree+2)->active)); ++ ++ if (!bfq_update_next_in_service(sd, entity) && !requeue) { ++ BUG_ON(!sd->next_in_service); ++ break; ++ } ++ BUG_ON(!sd->next_in_service); ++ } ++} ++ ++/** ++ * __bfq_deactivate_entity - deactivate an entity from its service tree. ++ * @entity: the entity to deactivate. ++ * @ins_into_idle_tree: if false, the entity will not be put into the ++ * idle tree. ++ * ++ * Deactivates an entity, independently from its previous state. Must ++ * be invoked only if entity is on a service tree. Extracts the entity ++ * from that tree, and if necessary and allowed, puts it on the idle ++ * tree. ++ */ ++static bool __bfq_deactivate_entity(struct bfq_entity *entity, ++ bool ins_into_idle_tree) ++{ ++ struct bfq_sched_data *sd = entity->sched_data; ++ struct bfq_service_tree *st; ++ bool is_in_service; ++ ++ if (!entity->on_st) { /* entity never activated, or already inactive */ ++ BUG_ON(sd && entity == sd->in_service_entity); ++ return false; ++ } ++ ++ /* ++ * If we get here, then entity is active, which implies that ++ * bfq_group_set_parent has already been invoked for the group ++ * represented by entity. 
Therefore, the field ++ * entity->sched_data has been set, and we can safely use it. ++ */ ++ st = bfq_entity_service_tree(entity); ++ is_in_service = entity == sd->in_service_entity; ++ ++ BUG_ON(is_in_service && entity->tree && entity->tree != &st->active); ++ ++ if (is_in_service) ++ bfq_calc_finish(entity, entity->service); ++ ++ if (entity->tree == &st->active) ++ bfq_active_extract(st, entity); ++ else if (!is_in_service && entity->tree == &st->idle) ++ bfq_idle_extract(st, entity); ++ else if (entity->tree) ++ BUG(); ++ ++ if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime)) ++ bfq_forget_entity(st, entity, is_in_service); ++ else ++ bfq_idle_insert(st, entity); ++ ++ return true; ++} ++ ++/** ++ * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. ++ * @entity: the entity to deactivate. ++ * @ins_into_idle_tree: true if the entity can be put on the idle tree ++ */ ++static void bfq_deactivate_entity(struct bfq_entity *entity, ++ bool ins_into_idle_tree, ++ bool expiration) ++{ ++ struct bfq_sched_data *sd; ++ struct bfq_entity *parent = NULL; ++ ++ for_each_entity_safe(entity, parent) { ++ sd = entity->sched_data; ++ ++ BUG_ON(sd == NULL); /* ++ * It would mean that this is the ++ * root group. ++ */ ++ ++ BUG_ON(expiration && entity != sd->in_service_entity); ++ ++ BUG_ON(entity != sd->in_service_entity && ++ entity->tree == ++ &bfq_entity_service_tree(entity)->active && ++ !sd->next_in_service); ++ ++ if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) { ++ /* ++ * entity is not in any tree any more, so ++ * this deactivation is a no-op, and there is ++ * nothing to change for upper-level entities ++ * (in case of expiration, this can never ++ * happen). ++ */ ++ BUG_ON(expiration); /* ++ * entity cannot be already out of ++ * any tree ++ */ ++ return; ++ } ++ ++ if (sd->next_in_service == entity) ++ /* ++ * entity was the next_in_service entity, ++ * then, since entity has just been ++ * deactivated, a new one must be found. ++ */ ++ bfq_update_next_in_service(sd, NULL); ++ ++ if (sd->next_in_service) { ++ /* ++ * The parent entity is still backlogged, ++ * because next_in_service is not NULL. So, no ++ * further upwards deactivation must be ++ * performed. Yet, next_in_service has ++ * changed. Then the schedule does need to be ++ * updated upwards. ++ */ ++ BUG_ON(sd->next_in_service == entity); ++ break; ++ } ++ ++ /* ++ * If we get here, then the parent is no more ++ * backlogged and we need to propagate the ++ * deactivation upwards. Thus let the loop go on. ++ */ ++ ++ /* ++ * Also let parent be queued into the idle tree on ++ * deactivation, to preserve service guarantees, and ++ * assuming that who invoked this function does not ++ * need parent entities too to be removed completely. ++ */ ++ ins_into_idle_tree = true; ++ } ++ ++ /* ++ * If the deactivation loop is fully executed, then there are ++ * no more entities to touch and next loop is not executed at ++ * all. Otherwise, requeue remaining entities if they are ++ * about to stop receiving service, or reposition them if this ++ * is not the case. 
++ */
++	entity = parent;
++	for_each_entity(entity) {
++		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
++
++		/*
++		 * Invoke __bfq_requeue_entity on entity, even if
++		 * already active, to requeue/reposition it in the
++		 * active tree (because sd->next_in_service has
++		 * changed)
++		 */
++		__bfq_requeue_entity(entity);
++
++		sd = entity->sched_data;
++		BUG_ON(expiration && sd->in_service_entity != entity);
++
++		if (bfqq)
++			bfq_log_bfqq(bfqq->bfqd, bfqq,
++				     "invoking update_next for this queue");
++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED
++		else {
++			struct bfq_group *bfqg =
++				container_of(entity,
++					     struct bfq_group, entity);
++
++			bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
++				     "invoking update_next for this entity");
++		}
++#endif
++		if (!bfq_update_next_in_service(sd, entity) &&
++		    !expiration)
++			/*
++			 * next_in_service unchanged or not causing
++			 * any change in entity->parent->sd, and no
++			 * requeueing needed for expiration: stop
++			 * here.
++			 */
++			break;
++	}
++}
++
++/**
++ * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
++ *			 if needed, to have at least one entity eligible.
++ * @st: the service tree to act upon.
++ *
++ * Assumes that st is not empty.
++ */
++static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
++{
++	struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);
++
++	if (bfq_gt(root_entity->min_start, st->vtime)) {
++		struct bfq_queue *bfqq = bfq_entity_to_bfqq(root_entity);
++
++		if (bfqq)
++			bfq_log_bfqq(bfqq->bfqd, bfqq,
++				     "calc_vtime_jump: new value %llu",
++				     root_entity->min_start);
++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED
++		else {
++			struct bfq_group *bfqg =
++				container_of(root_entity, struct bfq_group,
++					     entity);
++
++			bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
++				     "calc_vtime_jump: new value %llu",
++				     root_entity->min_start);
++		}
++#endif
++		return root_entity->min_start;
++	}
++	return st->vtime;
++}
++
++static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
++{
++	if (new_value > st->vtime) {
++		st->vtime = new_value;
++		bfq_forget_idle(st);
++	}
++}
++
++/**
++ * bfq_first_active_entity - find the eligible entity with
++ *			     the smallest finish time
++ * @st: the service tree to select from.
++ * @vtime: the system virtual time to use as a reference for eligibility
++ *
++ * This function searches the first schedulable entity, starting from the
++ * root of the tree and going on the left every time on this side there is
++ * a subtree with at least one eligible (start <= vtime) entity. The path on
++ * the right is followed only if a) the left subtree contains no eligible
++ * entities and b) no eligible entity has been found yet.
++ */
++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
++						  u64 vtime)
++{
++	struct bfq_entity *entry, *first = NULL;
++	struct rb_node *node = st->active.rb_node;
++
++	while (node) {
++		entry = rb_entry(node, struct bfq_entity, rb_node);
++left:
++		if (!bfq_gt(entry->start, vtime))
++			first = entry;
++
++		BUG_ON(bfq_gt(entry->min_start, vtime));
++
++		if (node->rb_left) {
++			entry = rb_entry(node->rb_left,
++					 struct bfq_entity, rb_node);
++			if (!bfq_gt(entry->min_start, vtime)) {
++				node = node->rb_left;
++				goto left;
++			}
++		}
++		if (first)
++			break;
++		node = node->rb_right;
++	}
++
++	BUG_ON(!first && !RB_EMPTY_ROOT(&st->active));
++	return first;
++}
++
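The cached min_start is what lets this search prune whole subtrees instead of visiting every node. The stand-alone sketch below mirrors the walk in bfq_first_active_entity() with plain pointers instead of the kernel rbtree (illustrative only): remember the current node if it is eligible, descend left whenever the left child's min_start proves an eligible entity exists there, and go right only while nothing eligible has been found.

/* eligible.c: build with cc -std=c11 eligible.c */
#include <stdint.h>
#include <stdio.h>

struct node {
	uint64_t start, finish, min_start;
	struct node *left, *right;
};

static struct node *first_eligible(struct node *n, uint64_t vtime)
{
	struct node *first = NULL;

	while (n) {
		if (n->start <= vtime)	/* eligible, and leftmost so far, */
			first = n;	/* hence smallest finish so far */
		if (n->left && n->left->min_start <= vtime) {
			n = n->left;	/* an eligible node with a smaller
					 * finish time exists on the left */
			continue;
		}
		if (first)
			break;
		n = n->right;
	}
	return first;
}

int main(void)
{
	/* a tiny tree ordered by finish time; min_start caches the
	 * minimum start time of each subtree */
	struct node b = { .start = 0, .finish = 4,  .min_start = 0 };
	struct node a = { .start = 5, .finish = 10, .min_start = 5 };
	struct node root = { .start = 8, .finish = 7, .min_start = 0,
			     .left = &b, .right = &a };
	struct node *e = first_eligible(&root, 3);

	/* the root is not eligible (start 8 > vtime 3), but its left
	 * subtree is: the walk returns the node with finish time 4 */
	printf("eligible finish: %llu\n",
	       e ? (unsigned long long)e->finish : 0ULL);
	return 0;
}

++/**
++ * __bfq_lookup_next_entity - return the first eligible entity in @st.
++ * @st: the service tree.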
++ *
++ * If there is no in-service entity for the sched_data st belongs to,
++ * then return the entity that will be set in service if:
++ * 1) the parent entity this st belongs to is set in service;
++ * 2) no entity belonging to such parent entity undergoes a state change
++ * that would influence the timestamps of the entity (e.g., becomes idle,
++ * becomes backlogged, changes its budget, ...).
++ *
++ * In this first case, update the virtual time in @st too (see the
++ * comments on this update inside the function).
++ *
++ * In contrast, if there is an in-service entity, then return the
++ * entity that would be set in service if not only the above
++ * conditions, but also the next one held true: the currently
++ * in-service entity, on expiration,
++ * 1) gets a finish time equal to the current one, or
++ * 2) is not eligible any more, or
++ * 3) is idle.
++ */
++static struct bfq_entity *
++__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service
++#if 0
++			 , bool force
++#endif
++	)
++{
++	struct bfq_entity *entity
++#if 0
++		, *new_next_in_service = NULL
++#endif
++		;
++	u64 new_vtime;
++	struct bfq_queue *bfqq;
++
++	if (RB_EMPTY_ROOT(&st->active))
++		return NULL;
++
++	/*
++	 * Get the value of the system virtual time for which at
++	 * least one entity is eligible.
++	 */
++	new_vtime = bfq_calc_vtime_jump(st);
++
++	/*
++	 * If there is no in-service entity for the sched_data this
++	 * active tree belongs to, then push the system virtual time
++	 * up to the value that guarantees that at least one entity is
++	 * eligible. If, instead, there is an in-service entity, then
++	 * do not make any such update, because there is already an
++	 * eligible entity, namely the in-service one (even if the
++	 * entity is not on st, because it was extracted when set in
++	 * service).
++	 */
++	if (!in_service)
++		bfq_update_vtime(st, new_vtime);
++
++	entity = bfq_first_active_entity(st, new_vtime);
++	BUG_ON(bfq_gt(entity->start, new_vtime));
++
++	/* Log some information */
++	bfqq = bfq_entity_to_bfqq(entity);
++	if (bfqq)
++		bfq_log_bfqq(bfqq->bfqd, bfqq,
++			     "__lookup_next: start %llu vtime %llu st %p",
++			     ((entity->start>>10)*1000)>>12,
++			     ((new_vtime>>10)*1000)>>12, st);
++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED
++	else {
++		struct bfq_group *bfqg =
++			container_of(entity, struct bfq_group, entity);
++
++		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
++			     "__lookup_next: start %llu vtime %llu st %p",
++			     ((entity->start>>10)*1000)>>12,
++			     ((new_vtime>>10)*1000)>>12, st);
++	}
++#endif
++
++	BUG_ON(!entity);
++
++	return entity;
++}
++
++/**
++ * bfq_lookup_next_entity - return the first eligible entity in @sd.
++ * @sd: the sched_data.
++ *
++ * This function is invoked when there has been a change in the trees
++ * for sd, and we need to know what is the new next entity after this
++ * change.
++ */
++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
++{
++	struct bfq_service_tree *st = sd->service_tree;
++	struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
++	struct bfq_entity *entity = NULL;
++	struct bfq_queue *bfqq;
++	int class_idx = 0;
++
++	BUG_ON(!sd);
++	BUG_ON(!st);
++	/*
++	 * Choose from idle class, if needed to guarantee a minimum
++	 * bandwidth to this class (and if there is some active entity
++	 * in idle class). This should also mitigate
++	 * priority-inversion problems in case a low priority task is
++	 * holding file system resources.
++ */ ++ if (time_is_before_jiffies(sd->bfq_class_idle_last_service + ++ BFQ_CL_IDLE_TIMEOUT)) { ++ if (!RB_EMPTY_ROOT(&idle_class_st->active)) ++ class_idx = BFQ_IOPRIO_CLASSES - 1; ++ /* About to be served if backlogged, or not yet backlogged */ ++ sd->bfq_class_idle_last_service = jiffies; ++ } ++ ++ /* ++ * Find the next entity to serve for the highest-priority ++ * class, unless the idle class needs to be served. ++ */ ++ for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { ++ entity = __bfq_lookup_next_entity(st + class_idx, ++ sd->in_service_entity); ++ ++ if (entity) ++ break; ++ } ++ ++ BUG_ON(!entity && ++ (!RB_EMPTY_ROOT(&st->active) || !RB_EMPTY_ROOT(&(st+1)->active) || ++ !RB_EMPTY_ROOT(&(st+2)->active))); ++ ++ if (!entity) ++ return NULL; ++ ++ /* Log some information */ ++ bfqq = bfq_entity_to_bfqq(entity); ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d", ++ st + class_idx, class_idx); ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "chosen from st %p %d", ++ st + class_idx, class_idx); ++ } ++#endif ++ ++ return entity; ++} ++ ++static bool next_queue_may_preempt(struct bfq_data *bfqd) ++{ ++ struct bfq_sched_data *sd = &bfqd->root_group->sched_data; ++ ++ return sd->next_in_service != sd->in_service_entity; ++} ++ ++/* ++ * Get next queue for service. ++ */ ++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_entity *entity = NULL; ++ struct bfq_sched_data *sd; ++ struct bfq_queue *bfqq; ++ ++ BUG_ON(bfqd->in_service_queue); ++ ++ if (bfqd->busy_queues == 0) ++ return NULL; ++ ++ /* ++ * Traverse the path from the root to the leaf entity to ++ * serve. Set in service all the entities visited along the ++ * way. ++ */ ++ sd = &bfqd->root_group->sched_data; ++ for (; sd ; sd = entity->my_sched_data) { ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ if (entity) { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg(bfqd, bfqg, ++ "get_next_queue: lookup in this group"); ++ if (!sd->next_in_service) ++ pr_crit("get_next_queue: lookup in this group"); ++ } else { ++ bfq_log_bfqg(bfqd, bfqd->root_group, ++ "get_next_queue: lookup in root group"); ++ if (!sd->next_in_service) ++ pr_crit("get_next_queue: lookup in root group"); ++ } ++#endif ++ ++ BUG_ON(!sd->next_in_service); ++ ++ /* ++ * WARNING. We are about to set the in-service entity ++ * to sd->next_in_service, i.e., to the (cached) value ++ * returned by bfq_lookup_next_entity(sd) the last ++ * time it was invoked, i.e., the last time when the ++ * service order in sd changed as a consequence of the ++ * activation or deactivation of an entity. In this ++ * respect, if we execute bfq_lookup_next_entity(sd) ++ * in this very moment, it may, although with low ++ * probability, yield a different entity than that ++ * pointed to by sd->next_in_service. This rare event ++ * happens in case there was no CLASS_IDLE entity to ++ * serve for sd when bfq_lookup_next_entity(sd) was ++ * invoked for the last time, while there is now one ++ * such entity. ++ * ++ * If the above event happens, then the scheduling of ++ * such entity in CLASS_IDLE is postponed until the ++ * service of the sd->next_in_service entity ++ * finishes. In fact, when the latter is expired, ++ * bfq_lookup_next_entity(sd) gets called again, ++ * exactly to update sd->next_in_service. 
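The class-selection policy just shown (scan from the highest-priority service tree, but force the idle class once it has waited longer than BFQ_CL_IDLE_TIMEOUT) can be condensed into a small standalone model. The sketch below uses invented toy_* names and constants; it is not the patch's code.

#include <stdbool.h>
#include <stdio.h>

#define TOY_IOPRIO_CLASSES	3	/* RT, BE, IDLE */
#define TOY_CL_IDLE_TIMEOUT	10	/* ticks between forced IDLE service */

struct toy_st { bool has_active; };

static int toy_pick_class(struct toy_st st[TOY_IOPRIO_CLASSES],
			  unsigned long now, unsigned long *idle_last)
{
	int first = 0;

	/* Anti-starvation: serve IDLE if it waited longer than the timeout. */
	if (now - *idle_last >= TOY_CL_IDLE_TIMEOUT) {
		if (st[TOY_IOPRIO_CLASSES - 1].has_active)
			first = TOY_IOPRIO_CLASSES - 1;
		*idle_last = now;
	}

	/* Otherwise scan from the highest-priority class down. */
	for (int c = first; c < TOY_IOPRIO_CLASSES; c++)
		if (st[c].has_active)
			return c;
	return -1;
}

int main(void)
{
	struct toy_st st[TOY_IOPRIO_CLASSES] = { {true}, {false}, {true} };
	unsigned long idle_last = 0;

	printf("t=5: class %d\n", toy_pick_class(st, 5, &idle_last));	/* 0: RT wins */
	printf("t=20: class %d\n", toy_pick_class(st, 20, &idle_last));	/* 2: IDLE forced */
	return 0;
}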
++ */ ++ ++ /* Make next_in_service entity become in_service_entity */ ++ entity = sd->next_in_service; ++ sd->in_service_entity = entity; ++ ++ /* ++ * Reset the accumulator of the amount of service that ++ * the entity is about to receive. ++ */ ++ entity->service = 0; ++ ++ /* ++ * If entity is no longer a candidate for next ++ * service, then we extract it from its active tree, ++ * for the following reason. To further boost the ++ * throughput in some special cases, BFQ needs to know ++ * which is the next candidate entity to serve, while ++ * there is already an entity in service. In this ++ * respect, to make it easy to compute/update the next ++ * candidate entity to serve after the current ++ * candidate has been set in service, there is a case ++ * where it is necessary to extract the current ++ * candidate from its service tree. Such a case is ++ * when the entity just set in service cannot also be ++ * a candidate for next service. Details about when ++ * this condition holds are reported in the comments ++ * on the function bfq_no_longer_next_in_service() ++ * invoked below. ++ */ ++ if (bfq_no_longer_next_in_service(entity)) ++ bfq_active_extract(bfq_entity_service_tree(entity), ++ entity); ++ ++ /* ++ * For the same reason why we may have just extracted ++ * entity from its active tree, we may need to update ++ * next_in_service for the sched_data of entity too, ++ * regardless of whether entity has been extracted. ++ * In fact, even if entity has not been extracted, a ++ * descendant entity may get extracted. Such an event ++ * would cause a change in next_in_service for the ++ * level of the descendant entity, and thus possibly ++ * back to upper levels. ++ * ++ * We cannot perform the resulting needed update ++ * before the end of this loop, because, to know which ++ * is the correct next-to-serve candidate entity for ++ * each level, we need first to find the leaf entity ++ * to set in service. In fact, only after we know ++ * which is the next-to-serve leaf entity, we can ++ * discover whether the parent entity of the leaf ++ * entity becomes the next-to-serve, and so on. ++ */ ++ ++ /* Log some information */ ++ bfqq = bfq_entity_to_bfqq(entity); ++ if (bfqq) ++ bfq_log_bfqq(bfqd, bfqq, ++ "get_next_queue: this queue, finish %llu", ++ (((entity->finish>>10)*1000)>>10)>>2); ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg(bfqd, bfqg, ++ "get_next_queue: this entity, finish %llu", ++ (((entity->finish>>10)*1000)>>10)>>2); ++ } ++#endif ++ ++ } ++ ++ BUG_ON(!entity); ++ bfqq = bfq_entity_to_bfqq(entity); ++ BUG_ON(!bfqq); ++ ++ /* ++ * We can finally update all next-to-serve entities along the ++ * path from the leaf entity just set in service to the root.
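The traversal described above, together with the bottom-up refresh that follows in the next hunk, amounts to a two-phase walk: descend along the cached next_in_service pointers marking each level in service, then climb back towards the root refreshing next_in_service and stopping at the first level that reports no change. A self-contained sketch, assuming a hypothetical two-level hierarchy and toy_* stand-ins for the kernel structures:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_sd;

struct toy_entity {
	const char *name;
	struct toy_sd *sched_data;	/* level the entity is queued in */
	struct toy_sd *my_sd;		/* owned level; NULL for a leaf  */
	struct toy_entity *parent;
};

struct toy_sd {
	struct toy_entity *in_service;
	struct toy_entity *next_in_service;
};

/* Stand-in for bfq_update_next_in_service(); pretend nothing changes. */
static bool toy_update_next(struct toy_sd *sd) { (void)sd; return false; }

static struct toy_entity *toy_get_next_queue(struct toy_sd *root)
{
	struct toy_entity *entity = NULL;
	struct toy_sd *sd;

	/* Phase 1: descend along the cached next_in_service pointers. */
	for (sd = root; sd; sd = entity->my_sd) {
		entity = sd->next_in_service;
		sd->in_service = entity;
		printf("set %s in service\n", entity->name);
	}

	/* Phase 2: climb back up, refreshing next_in_service per level. */
	for (struct toy_entity *e = entity; e; e = e->parent)
		if (!toy_update_next(e->sched_data))
			break;

	return entity;
}

int main(void)
{
	struct toy_sd root = {0}, group_sd = {0};
	struct toy_entity group = { "group0", &root, &group_sd, NULL };
	struct toy_entity queue = { "queue0", &group_sd, NULL, &group };

	root.next_in_service = &group;
	group_sd.next_in_service = &queue;
	printf("leaf: %s\n", toy_get_next_queue(&root)->name);
	return 0;
}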
++ */ ++ for_each_entity(entity) { ++ struct bfq_sched_data *sd = entity->sched_data; ++ ++ if (!bfq_update_next_in_service(sd, NULL)) ++ break; ++ } ++ ++ return bfqq; ++} ++ ++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; ++ struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; ++ struct bfq_entity *entity = in_serv_entity; ++ ++ if (bfqd->in_service_bic) { ++ put_io_context(bfqd->in_service_bic->icq.ioc); ++ bfqd->in_service_bic = NULL; ++ } ++ ++ bfq_clear_bfqq_wait_request(in_serv_bfqq); ++ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); ++ bfqd->in_service_queue = NULL; ++ ++ /* ++ * When this function is called, all in-service entities have ++ * been properly deactivated or requeued, so we can safely ++ * execute the final step: reset in_service_entity along the ++ * path from entity to the root. ++ */ ++ for_each_entity(entity) ++ entity->sched_data->in_service_entity = NULL; ++ ++ /* ++ * in_serv_entity is no longer in service, so, if it is in no ++ * service tree either, then release the service reference to ++ * the queue it represents (taken with bfq_get_entity). ++ */ ++ if (!in_serv_entity->on_st) ++ bfq_put_queue(in_serv_bfqq); ++} ++ ++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ bool ins_into_idle_tree, bool expiration) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ bfq_deactivate_entity(entity, ins_into_idle_tree, expiration); ++} ++ ++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ ++ BUG_ON(bfqq == bfqd->in_service_queue); ++ BUG_ON(entity->tree != &st->active && entity->tree != &st->idle && ++ entity->on_st); ++ ++ bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), ++ false); ++ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); ++} ++ ++static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ bfq_activate_requeue_entity(entity, false, ++ bfqq == bfqd->in_service_queue); ++} ++ ++static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); ++ ++/* ++ * Called when the bfqq no longer has requests pending; remove it from ++ * the service tree. As a special case, it can be invoked during an ++ * expiration. ++ */ ++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ bool expiration) ++{ ++ BUG_ON(!bfq_bfqq_busy(bfqq)); ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ bfq_log_bfqq(bfqd, bfqq, "del from busy"); ++ ++ bfq_clear_bfqq_busy(bfqq); ++ ++ BUG_ON(bfqd->busy_queues == 0); ++ bfqd->busy_queues--; ++ ++ if (!bfqq->dispatched) ++ bfq_weights_tree_remove(bfqd, &bfqq->entity, ++ &bfqd->queue_weights_tree); ++ ++ if (bfqq->wr_coeff > 1) { ++ bfqd->wr_busy_queues--; ++ BUG_ON(bfqd->wr_busy_queues < 0); ++ } ++ ++ bfqg_stats_update_dequeue(bfqq_group(bfqq)); ++ ++ BUG_ON(bfqq->entity.budget < 0); ++ ++ bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); ++} ++ ++/* ++ * Called when an inactive queue receives a new request.
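bfq_del_bfqq_busy() above and bfq_add_bfqq_busy() in the next hunk keep two counters consistent: busy_queues and its subset wr_busy_queues of weight-raised queues. A toy model of that bookkeeping, with asserts standing in for the BUG_ON checks; all names are illustrative, not the patch's:

#include <assert.h>
#include <stdbool.h>

struct toy_bfqd { int busy_queues; int wr_busy_queues; };
struct toy_bfqq { bool busy; int wr_coeff; };

static void toy_add_busy(struct toy_bfqd *d, struct toy_bfqq *q)
{
	assert(!q->busy);
	q->busy = true;
	d->busy_queues++;
	if (q->wr_coeff > 1)		/* weight-raised queues counted twice */
		d->wr_busy_queues++;
	assert(d->wr_busy_queues <= d->busy_queues);
}

static void toy_del_busy(struct toy_bfqd *d, struct toy_bfqq *q)
{
	assert(q->busy && d->busy_queues > 0);
	q->busy = false;
	d->busy_queues--;
	if (q->wr_coeff > 1)
		d->wr_busy_queues--;
	assert(d->wr_busy_queues >= 0);
}

int main(void)
{
	struct toy_bfqd d = { 0, 0 };
	struct toy_bfqq q = { false, 30 };	/* a weight-raised queue */

	toy_add_busy(&d, &q);
	toy_del_busy(&d, &q);
	assert(d.busy_queues == 0 && d.wr_busy_queues == 0);
	return 0;
}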
++ */ ++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ BUG_ON(bfq_bfqq_busy(bfqq)); ++ BUG_ON(bfqq == bfqd->in_service_queue); ++ ++ bfq_log_bfqq(bfqd, bfqq, "add to busy"); ++ ++ bfq_activate_bfqq(bfqd, bfqq); ++ ++ bfq_mark_bfqq_busy(bfqq); ++ bfqd->busy_queues++; ++ ++ if (!bfqq->dispatched) ++ if (bfqq->wr_coeff == 1) ++ bfq_weights_tree_add(bfqd, &bfqq->entity, ++ &bfqd->queue_weights_tree); ++ ++ if (bfqq->wr_coeff > 1) { ++ bfqd->wr_busy_queues++; ++ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); ++ } ++ ++} +diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c +new file mode 100644 +index 000000000000..65e7c7e77f3c +--- /dev/null ++++ b/block/bfq-sq-iosched.c +@@ -0,0 +1,5379 @@ ++/* ++ * Budget Fair Queueing (BFQ) I/O scheduler. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2015 Paolo Valente ++ * ++ * Copyright (C) 2017 Paolo Valente ++ * ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ ++ * file. ++ * ++ * BFQ is a proportional-share I/O scheduler, with some extra ++ * low-latency capabilities. BFQ also supports full hierarchical ++ * scheduling through cgroups. Next paragraphs provide an introduction ++ * on BFQ inner workings. Details on BFQ benefits and usage can be ++ * found in Documentation/block/bfq-iosched.txt. ++ * ++ * BFQ is a proportional-share storage-I/O scheduling algorithm based ++ * on the slice-by-slice service scheme of CFQ. But BFQ assigns ++ * budgets, measured in number of sectors, to processes instead of ++ * time slices. The device is not granted to the in-service process ++ * for a given time slice, but until it has exhausted its assigned ++ * budget. This change from the time to the service domain enables BFQ ++ * to distribute the device throughput among processes as desired, ++ * without any distortion due to throughput fluctuations, or to device ++ * internal queueing. BFQ uses an ad hoc internal scheduler, called ++ * B-WF2Q+, to schedule processes according to their budgets. More ++ * precisely, BFQ schedules queues associated with processes. Thanks to ++ * the accurate policy of B-WF2Q+, BFQ can afford to assign high ++ * budgets to I/O-bound processes issuing sequential requests (to ++ * boost the throughput), and yet guarantee a low latency to ++ * interactive and soft real-time applications. ++ * ++ * NOTE: if the main or only goal, with a given device, is to achieve ++ * the maximum-possible throughput at all times, then do switch off ++ * all low-latency heuristics for that device, by setting low_latency ++ * to 0. ++ * ++ * BFQ is described in [1], where also a reference to the initial, more ++ * theoretical paper on BFQ can be found. The interested reader can find ++ * in the latter paper full details on the main algorithm, as well as ++ * formulas of the guarantees and formal proofs of all the properties. ++ * With respect to the version of BFQ presented in these papers, this ++ * implementation adds a few more heuristics, such as the one that ++ * guarantees a low latency to soft real-time applications, and a ++ * hierarchical extension based on H-WF2Q+. ++ * ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) ++ * complexity derives from the one introduced with EEVDF in [3]. ++ * ++ * [1] P. Valente, A. 
Avanzini, "Evolution of the BFQ Storage I/O ++ * Scheduler", Proceedings of the First Workshop on Mobile System ++ * Technologies (MST-2015), May 2015. ++ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf ++ * ++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf ++ * ++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing ++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, ++ * Oct 1997. ++ * ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz ++ * ++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline ++ * First: A Flexible and Accurate Mechanism for Proportional Share ++ * Resource Allocation,'' technical report. ++ * ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "blk.h" ++#include "bfq.h" ++ ++/* Expiration time of sync (0) and async (1) requests, in ns. */ ++static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; ++ ++/* Maximum backwards seek, in KiB. */ ++static const int bfq_back_max = (16 * 1024); ++ ++/* Penalty of a backwards seek, in number of sectors. */ ++static const int bfq_back_penalty = 2; ++ ++/* Idling period duration, in ns. */ ++static u32 bfq_slice_idle = (NSEC_PER_SEC / 125); ++ ++/* Minimum number of assigned budgets for which stats are safe to compute. */ ++static const int bfq_stats_min_budgets = 194; ++ ++/* Default maximum budget values, in sectors and number of requests. */ ++static const int bfq_default_max_budget = (16 * 1024); ++ ++/* ++ * Async to sync throughput distribution is controlled as follows: ++ * when an async request is served, the entity is charged the number ++ * of sectors of the request, multiplied by the factor below ++ */ ++static const int bfq_async_charge_factor = 10; ++ ++/* Default timeout values, in jiffies, approximating CFQ defaults. */ ++static const int bfq_timeout = (HZ / 8); ++ ++static struct kmem_cache *bfq_pool; ++ ++/* Below this threshold (in ns), we consider thinktime immediate. */ ++#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) ++ ++/* hw_tag detection: parallel requests threshold and min samples needed. */ ++#define BFQ_HW_QUEUE_THRESHOLD 4 ++#define BFQ_HW_QUEUE_SAMPLES 32 ++ ++#define BFQQ_SEEK_THR (sector_t)(8 * 100) ++#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) ++#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) ++#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) ++ ++/* Min number of samples required to perform peak-rate update */ ++#define BFQ_RATE_MIN_SAMPLES 32 ++/* Min observation time interval required to perform a peak-rate update (ns) */ ++#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) ++/* Target observation time interval for a peak-rate update (ns) */ ++#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC ++ ++/* Shift used for peak rate fixed precision calculations. */ ++#define BFQ_RATE_SHIFT 16 ++ ++/* ++ * By default, BFQ computes the duration of the weight raising for ++ * interactive applications automatically, using the following formula: ++ * duration = (R / r) * T, where r is the peak rate of the device, and ++ * R and T are two reference parameters. 
++ * In particular, R is the peak rate of the reference device (see below), ++ * and T is a reference time: given the systems that are likely to be ++ * installed on the reference device according to its speed class, T is ++ * about the maximum time needed, under BFQ and while reading two files in ++ * parallel, to load typical large applications on these systems. ++ * In practice, the slower/faster the device at hand is, the more/less it ++ * takes to load applications with respect to the reference device. ++ * Accordingly, the longer/shorter BFQ grants weight raising to interactive ++ * applications. ++ * ++ * BFQ uses four different reference pairs (R, T), depending on: ++ * . whether the device is rotational or non-rotational; ++ * . whether the device is slow, such as old or portable HDDs, as well as ++ * SD cards, or fast, such as newer HDDs and SSDs. ++ * ++ * The device's speed class is dynamically (re)detected in ++ * bfq_update_peak_rate() every time the estimated peak rate is updated. ++ * ++ * In the following definitions, R_slow[0]/R_fast[0] and ++ * T_slow[0]/T_fast[0] are the reference values for a slow/fast ++ * rotational device, whereas R_slow[1]/R_fast[1] and ++ * T_slow[1]/T_fast[1] are the reference values for a slow/fast ++ * non-rotational device. Finally, device_speed_thresh are the ++ * thresholds used to switch between speed classes. The reference ++ * rates are not the actual peak rates of the devices used as a ++ * reference, but slightly lower values. The reason for using these ++ * slightly lower values is that the peak-rate estimator tends to ++ * yield slightly lower values than the actual peak rate (it can yield ++ * the actual peak rate only if there is only one process doing I/O, ++ * and the process does sequential I/O). ++ * ++ * Both the reference peak rates and the thresholds are measured in ++ * sectors/usec, left-shifted by BFQ_RATE_SHIFT. ++ */ ++static int R_slow[2] = {1000, 10700}; ++static int R_fast[2] = {14000, 33000}; ++/* ++ * To improve readability, a conversion function is used to initialize the ++ * following arrays, which entails that they can be initialized only in a ++ * function. ++ */ ++static int T_slow[2]; ++static int T_fast[2]; ++static int device_speed_thresh[2]; ++ ++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ ++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) ++ ++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) ++#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) ++ ++static void bfq_schedule_dispatch(struct bfq_data *bfqd); ++ ++#include "bfq-ioc.c" ++#include "bfq-sched.c" ++#include "bfq-cgroup-included.c" ++ ++#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) ++#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) ++ ++#define bfq_sample_valid(samples) ((samples) > 80) ++ ++/* ++ * Scheduler run of queue, if there are requests pending and no one in the ++ * driver that will restart queueing. ++ */ ++static void bfq_schedule_dispatch(struct bfq_data *bfqd) ++{ ++ if (bfqd->queued != 0) { ++ bfq_log(bfqd, "schedule dispatch"); ++ kblockd_schedule_work(&bfqd->unplug_work); ++ } ++} ++ ++/* ++ * Lifted from AS - choose which of rq1 and rq2 is best served now. ++ * We choose the request that is closest to the head right now. Distance ++ * behind the head is penalized and only allowed to a certain extent.
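The comment above describes the distance metric that bfq_choose_req() applies in the next hunk. Reduced to a single request, it looks like the sketch below; the toy_* names are invented, and the constants mirror the bfq_back_max/bfq_back_penalty defaults quoted earlier (back_max is given in KiB, and 1 KiB is 2 sectors).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_BACK_MAX_KB		(16 * 1024)
#define TOY_BACK_PENALTY	2

/* Returns false when the request is too far behind the head ("wraps"). */
static bool toy_seek_dist(uint64_t head, uint64_t pos, uint64_t *dist)
{
	uint64_t back_max = TOY_BACK_MAX_KB * 2;	/* sectors */

	if (pos >= head) {				/* forward seek */
		*dist = pos - head;
		return true;
	}
	if (pos + back_max >= head) {			/* short back seek */
		*dist = (head - pos) * TOY_BACK_PENALTY;
		return true;
	}
	return false;
}

int main(void)
{
	uint64_t d;

	if (toy_seek_dist(100000, 100800, &d))	/* 800 sectors ahead -> 800 */
		printf("forward: weighted dist %llu\n", (unsigned long long)d);
	if (toy_seek_dist(100000, 99600, &d))	/* 400 behind -> counts as 800 */
		printf("backward: weighted dist %llu\n", (unsigned long long)d);
	if (!toy_seek_dist(100000, 100, &d))
		printf("too far behind: request wraps\n");
	return 0;
}

The two printed distances come out equal, which is the point of the penalty: a backward seek of half the length costs as much as the forward one.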
++ */ ++static struct request *bfq_choose_req(struct bfq_data *bfqd, ++ struct request *rq1, ++ struct request *rq2, ++ sector_t last) ++{ ++ sector_t s1, s2, d1 = 0, d2 = 0; ++ unsigned long back_max; ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ ++ unsigned int wrap = 0; /* bit mask: requests behind the disk head? */ ++ ++ if (!rq1 || rq1 == rq2) ++ return rq2; ++ if (!rq2) ++ return rq1; ++ ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) ++ return rq1; ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) ++ return rq2; ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) ++ return rq1; ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) ++ return rq2; ++ ++ s1 = blk_rq_pos(rq1); ++ s2 = blk_rq_pos(rq2); ++ ++ /* ++ * By definition, 1KiB is 2 sectors. ++ */ ++ back_max = bfqd->bfq_back_max * 2; ++ ++ /* ++ * Strict one way elevator _except_ in the case where we allow ++ * short backward seeks which are biased as twice the cost of a ++ * similar forward seek. ++ */ ++ if (s1 >= last) ++ d1 = s1 - last; ++ else if (s1 + back_max >= last) ++ d1 = (last - s1) * bfqd->bfq_back_penalty; ++ else ++ wrap |= BFQ_RQ1_WRAP; ++ ++ if (s2 >= last) ++ d2 = s2 - last; ++ else if (s2 + back_max >= last) ++ d2 = (last - s2) * bfqd->bfq_back_penalty; ++ else ++ wrap |= BFQ_RQ2_WRAP; ++ ++ /* Found required data */ ++ ++ /* ++ * By doing switch() on the bit mask "wrap" we avoid having to ++ * check two variables for all permutations: --> faster! ++ */ ++ switch (wrap) { ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ ++ if (d1 < d2) ++ return rq1; ++ else if (d2 < d1) ++ return rq2; ++ ++ if (s1 >= s2) ++ return rq1; ++ else ++ return rq2; ++ ++ case BFQ_RQ2_WRAP: ++ return rq1; ++ case BFQ_RQ1_WRAP: ++ return rq2; ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ ++ default: ++ /* ++ * Since both rqs are wrapped, ++ * start with the one that's further behind head ++ * (--> only *one* back seek required), ++ * since back seek takes more time than forward. ++ */ ++ if (s1 <= s2) ++ return rq1; ++ else ++ return rq2; ++ } ++} ++ ++static struct bfq_queue * ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, ++ sector_t sector, struct rb_node **ret_parent, ++ struct rb_node ***rb_link) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *bfqq = NULL; ++ ++ parent = NULL; ++ p = &root->rb_node; ++ while (*p) { ++ struct rb_node **n; ++ ++ parent = *p; ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ ++ /* ++ * Sort strictly based on sector. Smallest to the left, ++ * largest to the right. ++ */ ++ if (sector > blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_right; ++ else if (sector < blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_left; ++ else ++ break; ++ p = n; ++ bfqq = NULL; ++ } ++ ++ *ret_parent = parent; ++ if (rb_link) ++ *rb_link = p; ++ ++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", ++ (unsigned long long) sector, ++ bfqq ? 
bfqq->pid : 0); ++ ++ return bfqq; ++} ++ ++static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *__bfqq; ++ ++ if (bfqq->pos_root) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ ++ if (bfq_class_idle(bfqq)) ++ return; ++ if (!bfqq->next_rq) ++ return; ++ ++ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, ++ blk_rq_pos(bfqq->next_rq), &parent, &p); ++ if (!__bfqq) { ++ rb_link_node(&bfqq->pos_node, parent, p); ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); ++ } else ++ bfqq->pos_root = NULL; ++} ++ ++/* ++ * Tell whether there are active queues or groups with differentiated weights. ++ */ ++static bool bfq_differentiated_weights(struct bfq_data *bfqd) ++{ ++ /* ++ * For weights to differ, at least one of the trees must contain ++ * at least two nodes. ++ */ ++ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && ++ (bfqd->queue_weights_tree.rb_node->rb_left || ++ bfqd->queue_weights_tree.rb_node->rb_right) ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ ) || ++ (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && ++ (bfqd->group_weights_tree.rb_node->rb_left || ++ bfqd->group_weights_tree.rb_node->rb_right) ++#endif ++ ); ++} ++ ++/* ++ * The following function returns true if every queue must receive the ++ * same share of the throughput (this condition is used when deciding ++ * whether idling may be disabled, see the comments in the function ++ * bfq_bfqq_may_idle()). ++ * ++ * Such a scenario occurs when: ++ * 1) all active queues have the same weight, ++ * 2) all active groups at the same level in the groups tree have the same ++ * weight, ++ * 3) all active groups at the same level in the groups tree have the same ++ * number of children. ++ * ++ * Unfortunately, keeping the necessary state for evaluating exactly the ++ * above symmetry conditions would be quite complex and time-consuming. ++ * Therefore this function evaluates, instead, the following stronger ++ * sub-conditions, for which it is much easier to maintain the needed ++ * state: ++ * 1) all active queues have the same weight, ++ * 2) all active groups have the same weight, ++ * 3) all active groups have at most one active child each. ++ * In particular, the last two conditions are always true if hierarchical ++ * support and the cgroups interface are not enabled, thus no state needs ++ * to be maintained in this case. ++ */ ++static bool bfq_symmetric_scenario(struct bfq_data *bfqd) ++{ ++ return !bfq_differentiated_weights(bfqd); ++} ++ ++/* ++ * If the weight-counter tree passed as input contains no counter for ++ * the weight of the input entity, then add that counter; otherwise just ++ * increment the existing counter. ++ * ++ * Note that weight-counter trees contain few nodes in mostly symmetric ++ * scenarios. For example, if all queues have the same weight, then the ++ * weight-counter tree for the queues may contain at most one node. ++ * This holds even if low_latency is on, because weight-raised queues ++ * are not inserted in the tree. ++ * In most scenarios, the rate at which nodes are created/destroyed ++ * should be low too. 
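The weight-counter scheme implemented right below can be modeled with a small array instead of an rbtree; "asymmetry" then reduces to more than one distinct active weight, which is what bfq_differentiated_weights() checks. A minimal sketch with invented toy_* names:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define TOY_MAX_WEIGHTS 8

struct toy_wcounter { int weight; int num_active; };

static struct toy_wcounter tree[TOY_MAX_WEIGHTS];	/* zeroed at start */

static void toy_weights_add(int weight)
{
	for (int i = 0; i < TOY_MAX_WEIGHTS; i++)
		if (tree[i].num_active && tree[i].weight == weight) {
			tree[i].num_active++;	/* existing counter */
			return;
		}
	for (int i = 0; i < TOY_MAX_WEIGHTS; i++)
		if (!tree[i].num_active) {	/* allocate a new counter */
			tree[i].weight = weight;
			tree[i].num_active = 1;
			return;
		}
	assert(0 && "out of counters");
}

static void toy_weights_remove(int weight)
{
	for (int i = 0; i < TOY_MAX_WEIGHTS; i++)
		if (tree[i].num_active && tree[i].weight == weight) {
			tree[i].num_active--;	/* "node" freed at zero */
			return;
		}
}

/* True when at least two distinct weights are active. */
static bool toy_differentiated(void)
{
	int distinct = 0;

	for (int i = 0; i < TOY_MAX_WEIGHTS; i++)
		if (tree[i].num_active)
			distinct++;
	return distinct > 1;
}

int main(void)
{
	toy_weights_add(100);
	toy_weights_add(100);
	printf("differentiated: %d\n", toy_differentiated());	/* 0 */
	toy_weights_add(300);
	printf("differentiated: %d\n", toy_differentiated());	/* 1 */
	toy_weights_remove(300);
	printf("differentiated: %d\n", toy_differentiated());	/* 0 */
	return 0;
}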
++ */ ++static void bfq_weights_tree_add(struct bfq_data *bfqd, ++ struct bfq_entity *entity, ++ struct rb_root *root) ++{ ++ struct rb_node **new = &(root->rb_node), *parent = NULL; ++ ++ /* ++ * Do not insert if the entity is already associated with a ++ * counter, which happens if: ++ * 1) the entity is associated with a queue, ++ * 2) a request arrival has caused the queue to become both ++ * non-weight-raised, and hence change its weight, and ++ * backlogged; in this respect, each of the two events ++ * causes an invocation of this function, ++ * 3) this is the invocation of this function caused by the ++ * second event. This second invocation is actually useless, ++ * and we handle this fact by exiting immediately. More ++ * efficient or clearer solutions might possibly be adopted. ++ */ ++ if (entity->weight_counter) ++ return; ++ ++ while (*new) { ++ struct bfq_weight_counter *__counter = container_of(*new, ++ struct bfq_weight_counter, ++ weights_node); ++ parent = *new; ++ ++ if (entity->weight == __counter->weight) { ++ entity->weight_counter = __counter; ++ goto inc_counter; ++ } ++ if (entity->weight < __counter->weight) ++ new = &((*new)->rb_left); ++ else ++ new = &((*new)->rb_right); ++ } ++ ++ entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), ++ GFP_ATOMIC); ++ ++ /* ++ * In the unlucky event of an allocation failure, we just ++ * exit. This will cause the weight of entity to not be ++ * considered in bfq_differentiated_weights, which, in its ++ * turn, causes the scenario to be deemed wrongly symmetric in ++ * case entity's weight would have been the only weight making ++ * the scenario asymmetric. On the bright side, no unbalance ++ * will however occur when entity becomes inactive again (the ++ * invocation of this function is triggered by an activation ++ * of entity). In fact, bfq_weights_tree_remove does nothing ++ * if !entity->weight_counter. ++ */ ++ if (unlikely(!entity->weight_counter)) ++ return; ++ ++ entity->weight_counter->weight = entity->weight; ++ rb_link_node(&entity->weight_counter->weights_node, parent, new); ++ rb_insert_color(&entity->weight_counter->weights_node, root); ++ ++inc_counter: ++ entity->weight_counter->num_active++; ++} ++ ++/* ++ * Decrement the weight counter associated with the entity, and, if the ++ * counter reaches 0, remove the counter from the tree. ++ * See the comments to the function bfq_weights_tree_add() for considerations ++ * about overhead. ++ */ ++static void bfq_weights_tree_remove(struct bfq_data *bfqd, ++ struct bfq_entity *entity, ++ struct rb_root *root) ++{ ++ if (!entity->weight_counter) ++ return; ++ ++ BUG_ON(RB_EMPTY_ROOT(root)); ++ BUG_ON(entity->weight_counter->weight != entity->weight); ++ ++ BUG_ON(!entity->weight_counter->num_active); ++ entity->weight_counter->num_active--; ++ if (entity->weight_counter->num_active > 0) ++ goto reset_entity_pointer; ++ ++ rb_erase(&entity->weight_counter->weights_node, root); ++ kfree(entity->weight_counter); ++ ++reset_entity_pointer: ++ entity->weight_counter = NULL; ++} ++ ++/* ++ * Return expired entry, or NULL to just start from scratch in rbtree. 
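The FIFO check defined in the next hunk boils down to: serve in arrival order once the head request's deadline has passed, otherwise keep serving in sector order. A standalone sketch, with illustrative toy_* names and times:

#include <stdint.h>
#include <stdio.h>

struct toy_rq { uint64_t fifo_time; uint64_t sector; };

/*
 * Pick the FIFO head if its deadline has passed, else the request that
 * is best positioned with respect to the disk head.
 */
static const struct toy_rq *toy_next_rq(const struct toy_rq *fifo_head,
					const struct toy_rq *closest,
					uint64_t now)
{
	if (fifo_head->fifo_time <= now)
		return fifo_head;	/* expired: serve in arrival order */
	return closest;			/* fresh: serve in sector order */
}

int main(void)
{
	struct toy_rq old = { 100, 5000 }, near = { 900, 120 };

	printf("t=50: sector %llu\n",	/* deadline not reached: near wins */
	       (unsigned long long)toy_next_rq(&old, &near, 50)->sector);
	printf("t=200: sector %llu\n",	/* deadline passed: old wins */
	       (unsigned long long)toy_next_rq(&old, &near, 200)->sector);
	return 0;
}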
++ */ ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq, ++ struct request *last) ++{ ++ struct request *rq; ++ ++ if (bfq_bfqq_fifo_expire(bfqq)) ++ return NULL; ++ ++ bfq_mark_bfqq_fifo_expire(bfqq); ++ ++ rq = rq_entry_fifo(bfqq->fifo.next); ++ ++ if (rq == last || ktime_get_ns() < rq->fifo_time) ++ return NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); ++ BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); ++ return rq; ++} ++ ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct request *last) ++{ ++ struct rb_node *rbnext = rb_next(&last->rb_node); ++ struct rb_node *rbprev = rb_prev(&last->rb_node); ++ struct request *next, *prev = NULL; ++ ++ BUG_ON(list_empty(&bfqq->fifo)); ++ ++ /* Follow expired path, else get first next available. */ ++ next = bfq_check_fifo(bfqq, last); ++ if (next) { ++ BUG_ON(next == last); ++ return next; ++ } ++ ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); ++ ++ if (rbprev) ++ prev = rb_entry_rq(rbprev); ++ ++ if (rbnext) ++ next = rb_entry_rq(rbnext); ++ else { ++ rbnext = rb_first(&bfqq->sort_list); ++ if (rbnext && rbnext != &last->rb_node) ++ next = rb_entry_rq(rbnext); ++ } ++ ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); ++} ++ ++/* see the definition of bfq_async_charge_factor for details */ ++static unsigned long bfq_serv_to_charge(struct request *rq, ++ struct bfq_queue *bfqq) ++{ ++ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) ++ return blk_rq_sectors(rq); ++ ++ /* ++ * If there are no weight-raised queues, then amplify service ++ * by just the async charge factor; otherwise amplify service ++ * by twice the async charge factor, to further reduce latency ++ * for weight-raised queues. ++ */ ++ if (bfqq->bfqd->wr_busy_queues == 0) ++ return blk_rq_sectors(rq) * bfq_async_charge_factor; ++ ++ return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; ++} ++ ++/** ++ * bfq_updated_next_req - update the queue after a new next_rq selection. ++ * @bfqd: the device data the queue belongs to. ++ * @bfqq: the queue to update. ++ * ++ * If the first request of a queue changes we make sure that the queue ++ * has enough budget to serve at least its first request (if the ++ * request has grown). We do this because if the queue has not enough ++ * budget for its first request, it has to go through two dispatch ++ * rounds to actually get it dispatched. ++ */ ++static void bfq_updated_next_req(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ struct request *next_rq = bfqq->next_rq; ++ unsigned long new_budget; ++ ++ if (!next_rq) ++ return; ++ ++ if (bfqq == bfqd->in_service_queue) ++ /* ++ * In order not to break guarantees, budgets cannot be ++ * changed after an entity has been selected. ++ */ ++ return; ++ ++ BUG_ON(entity->tree != &st->active); ++ BUG_ON(entity == entity->sched_data->in_service_entity); ++ ++ new_budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)); ++ if (entity->budget != new_budget) { ++ entity->budget = new_budget; ++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", ++ new_budget); ++ bfq_requeue_bfqq(bfqd, bfqq); ++ } ++} ++ ++static unsigned int bfq_wr_duration(struct bfq_data *bfqd) ++{ ++ u64 dur; ++ ++ if (bfqd->bfq_wr_max_time > 0) ++ return bfqd->bfq_wr_max_time; ++ ++ dur = bfqd->RT_prod; ++ do_div(dur, bfqd->peak_rate); ++ ++ /* ++ * Limit duration between 3 and 13 seconds. 
Tests show that ++ * higher values than 13 seconds often yield the opposite of ++ * the desired result, i.e., worsen responsiveness by letting ++ * non-interactive and non-soft-real-time applications ++ * preserve weight raising for a too long time interval. ++ * ++ * On the other end, lower values than 3 seconds make it ++ * difficult for most interactive tasks to complete their jobs ++ * before weight-raising finishes. ++ */ ++ if (dur > msecs_to_jiffies(13000)) ++ dur = msecs_to_jiffies(13000); ++ else if (dur < msecs_to_jiffies(3000)) ++ dur = msecs_to_jiffies(3000); ++ ++ return dur; ++} ++ ++static void ++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, ++ struct bfq_io_cq *bic, bool bfq_already_existing) ++{ ++ unsigned int old_wr_coeff; ++ bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); ++ ++ if (bic->saved_idle_window) ++ bfq_mark_bfqq_idle_window(bfqq); ++ else ++ bfq_clear_bfqq_idle_window(bfqq); ++ ++ if (bic->saved_IO_bound) ++ bfq_mark_bfqq_IO_bound(bfqq); ++ else ++ bfq_clear_bfqq_IO_bound(bfqq); ++ ++ if (unlikely(busy)) ++ old_wr_coeff = bfqq->wr_coeff; ++ ++ bfqq->wr_coeff = bic->saved_wr_coeff; ++ bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; ++ BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); ++ bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; ++ bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; ++ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); ++ ++ if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || ++ time_is_before_jiffies(bfqq->last_wr_start_finish + ++ bfqq->wr_cur_max_time))) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "resume state: switching off wr (%lu + %lu < %lu)", ++ bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, ++ jiffies); ++ ++ bfqq->wr_coeff = 1; ++ } ++ ++ /* make sure weight will be updated, however we got here */ ++ bfqq->entity.prio_changed = 1; ++ ++ if (likely(!busy)) ++ return; ++ ++ if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) { ++ bfqd->wr_busy_queues++; ++ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); ++ } else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) { ++ bfqd->wr_busy_queues--; ++ BUG_ON(bfqd->wr_busy_queues < 0); ++ } ++} ++ ++static int bfqq_process_refs(struct bfq_queue *bfqq) ++{ ++ int process_refs, io_refs; ++ ++ lockdep_assert_held(bfqq->bfqd->queue->queue_lock); ++ ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; ++ process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; ++ BUG_ON(process_refs < 0); ++ return process_refs; ++} ++ ++/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ ++static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_queue *item; ++ struct hlist_node *n; ++ ++ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) ++ hlist_del_init(&item->burst_list_node); ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); ++ bfqd->burst_size = 1; ++ bfqd->burst_parent_entity = bfqq->entity.parent; ++} ++ ++/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ ++static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ /* Increment burst size to take into account also bfqq */ ++ bfqd->burst_size++; ++ ++ bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); ++ ++ BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); ++ ++ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { ++ struct bfq_queue *pos, *bfqq_item; ++ struct hlist_node *n; ++ ++ /* ++ * 
Enough queues have been activated shortly after each ++ * other to consider this burst as large. ++ */ ++ bfqd->large_burst = true; ++ bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); ++ ++ /* ++ * We can now mark all queues in the burst list as ++ * belonging to a large burst. ++ */ ++ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, ++ burst_list_node) { ++ bfq_mark_bfqq_in_large_burst(bfqq_item); ++ bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); ++ } ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); ++ ++ /* ++ * From now on, and until the current burst finishes, any ++ * new queue being activated shortly after the last queue ++ * was inserted in the burst can be immediately marked as ++ * belonging to a large burst. So the burst list is not ++ * needed any more. Remove it. ++ */ ++ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, ++ burst_list_node) ++ hlist_del_init(&pos->burst_list_node); ++ } else /* ++ * Burst not yet large: add bfqq to the burst list. Do ++ * not increment the ref counter for bfqq, because bfqq ++ * is removed from the burst list before freeing bfqq ++ * in put_queue. ++ */ ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); ++} ++ ++/* ++ * If many queues belonging to the same group happen to be created ++ * shortly after each other, then the processes associated with these ++ * queues have typically a common goal. In particular, bursts of queue ++ * creations are usually caused by services or applications that spawn ++ * many parallel threads/processes. Examples are systemd during boot, ++ * or git grep. To help these processes get their job done as soon as ++ * possible, it is usually better to not grant either weight-raising ++ * or device idling to their queues. ++ * ++ * In this comment we describe, firstly, the reasons why this fact ++ * holds, and, secondly, the next function, which implements the main ++ * steps needed to properly mark these queues so that they can then be ++ * treated in a different way. ++ * ++ * The above services or applications benefit mostly from a high ++ * throughput: the quicker the requests of the activated queues are ++ * cumulatively served, the sooner the target job of these queues gets ++ * completed. As a consequence, weight-raising any of these queues, ++ * which also implies idling the device for it, is almost always ++ * counterproductive. In most cases it just lowers throughput. ++ * ++ * On the other hand, a burst of queue creations may be caused also by ++ * the start of an application that does not consist of a lot of ++ * parallel I/O-bound threads. In fact, with a complex application, ++ * several short processes may need to be executed to start-up the ++ * application. In this respect, to start an application as quickly as ++ * possible, the best thing to do is in any case to privilege the I/O ++ * related to the application with respect to all other ++ * I/O. Therefore, the best strategy to start as quickly as possible ++ * an application that causes a burst of queue creations is to ++ * weight-raise all the queues created during the burst. This is the ++ * exact opposite of the best strategy for the other type of bursts. ++ * ++ * In the end, to take the best action for each of the two cases, the ++ * two types of bursts need to be distinguished. Fortunately, this ++ * seems relatively easy, by looking at the sizes of the bursts. 
In ++ * particular, we found a threshold such that only bursts with a ++ * larger size than that threshold are apparently caused by ++ * services or commands such as systemd or git grep. For brevity, ++ * hereafter we call just 'large' these bursts. BFQ *does not* ++ * weight-raise queues whose creation occurs in a large burst. In ++ * addition, for each of these queues BFQ performs or does not perform ++ * idling depending on which choice boosts the throughput more. The ++ * exact choice depends on the device and request pattern at ++ * hand. ++ * ++ * Unfortunately, false positives may occur while an interactive task ++ * is starting (e.g., an application is being started). The ++ * consequence is that the queues associated with the task do not ++ * enjoy weight raising as expected. Fortunately these false positives ++ * are very rare. They typically occur if some service happens to ++ * start doing I/O exactly when the interactive task starts. ++ * ++ * Turning back to the next function, it implements all the steps ++ * needed to detect the occurrence of a large burst and to properly ++ * mark all the queues belonging to it (so that they can then be ++ * treated in a different way). This goal is achieved by maintaining a ++ * "burst list" that holds, temporarily, the queues that belong to the ++ * burst in progress. The list is then used to mark these queues as ++ * belonging to a large burst if the burst does become large. The main ++ * steps are the following. ++ * ++ * . when the very first queue is created, the queue is inserted into the ++ * list (as it could be the first queue in a possible burst) ++ * ++ * . if the current burst has not yet become large, and a queue Q that does ++ * not yet belong to the burst is activated shortly after the last time ++ * at which a new queue entered the burst list, then the function appends ++ * Q to the burst list ++ * ++ * . if, as a consequence of the previous step, the burst size reaches ++ * the large-burst threshold, then ++ * ++ * . all the queues in the burst list are marked as belonging to a ++ * large burst ++ * ++ * . the burst list is deleted; in fact, the burst list already served ++ * its purpose (keeping temporarily track of the queues in a burst, ++ * so as to be able to mark them as belonging to a large burst in the ++ * previous sub-step), and now is not needed any more ++ * ++ * . the device enters a large-burst mode ++ * ++ * . if a queue Q that does not belong to the burst is created while ++ * the device is in large-burst mode and shortly after the last time ++ * at which a queue either entered the burst list or was marked as ++ * belonging to the current large burst, then Q is immediately marked ++ * as belonging to a large burst. ++ * ++ * . if a queue Q that does not belong to the burst is created a while ++ * later, i.e., not shortly after, than the last time at which a queue ++ * either entered the burst list or was marked as belonging to the ++ * current large burst, then the current burst is deemed as finished and: ++ * ++ * . the large-burst mode is reset if set ++ * ++ * . the burst list is emptied ++ * ++ * . Q is inserted in the burst list, as Q may be the first queue ++ * in a possible new burst (then the burst list contains just Q ++ * after this step). ++ */ ++static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ /* ++ * If bfqq is already in the burst list or is part of a large ++ * burst, or finally has just been split, then there is ++ * nothing else to do. 
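The burst-tracking steps listed above fit a small state machine: queue creations closer together than the burst interval grow the current burst, and crossing the large-burst threshold flags the burst (and every later queue in it) as large, while a late creation starts a fresh burst. A sketch with invented constants, not the patch's code:

#include <stdbool.h>
#include <stdio.h>

#define TOY_BURST_INTERVAL	2	/* ticks */
#define TOY_LARGE_THRESH	3	/* queues */

struct toy_burst {
	unsigned long last_ins;
	int size;
	bool large;
};

/* Returns true if the queue created at @now belongs to a large burst. */
static bool toy_handle_burst(struct toy_burst *b, unsigned long now)
{
	if (now - b->last_ins > TOY_BURST_INTERVAL) {
		b->size = 1;		/* burst finished; start a new one */
		b->large = false;
	} else if (!b->large && ++b->size >= TOY_LARGE_THRESH) {
		b->large = true;	/* mark every queue in the burst */
	}
	b->last_ins = now;
	return b->large;
}

int main(void)
{
	struct toy_burst b = { 0, 0, false };
	unsigned long times[] = { 1, 2, 3, 4, 10 };

	for (int i = 0; i < 5; i++)	/* prints 0 0 1 1 0 */
		printf("t=%lu large=%d\n", times[i],
		       toy_handle_burst(&b, times[i]));
	return 0;
}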
++ */ ++ if (!hlist_unhashed(&bfqq->burst_list_node) || ++ bfq_bfqq_in_large_burst(bfqq) || ++ time_is_after_eq_jiffies(bfqq->split_time + ++ msecs_to_jiffies(10))) ++ return; ++ ++ /* ++ * If bfqq's creation happens late enough, or bfqq belongs to ++ * a different group than the burst group, then the current ++ * burst is finished, and related data structures must be ++ * reset. ++ * ++ * In this respect, consider the special case where bfqq is ++ * the very first queue created after BFQ is selected for this ++ * device. In this case, last_ins_in_burst and ++ * burst_parent_entity are not yet significant when we get ++ * here. But it is easy to verify that, whether or not the ++ * following condition is true, bfqq will end up being ++ * inserted into the burst list. In particular the list will ++ * happen to contain only bfqq. And this is exactly what has ++ * to happen, as bfqq may be the first queue of the first ++ * burst. ++ */ ++ if (time_is_before_jiffies(bfqd->last_ins_in_burst + ++ bfqd->bfq_burst_interval) || ++ bfqq->entity.parent != bfqd->burst_parent_entity) { ++ bfqd->large_burst = false; ++ bfq_reset_burst_list(bfqd, bfqq); ++ bfq_log_bfqq(bfqd, bfqq, ++ "handle_burst: late activation or different group"); ++ goto end; ++ } ++ ++ /* ++ * If we get here, then bfqq is being activated shortly after the ++ * last queue. So, if the current burst is also large, we can mark ++ * bfqq as belonging to this large burst immediately. ++ */ ++ if (bfqd->large_burst) { ++ bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ goto end; ++ } ++ ++ /* ++ * If we get here, then a large-burst state has not yet been ++ * reached, but bfqq is being activated shortly after the last ++ * queue. Then we add bfqq to the burst. ++ */ ++ bfq_add_to_burst(bfqd, bfqq); ++end: ++ /* ++ * At this point, bfqq either has been added to the current ++ * burst or has caused the current burst to terminate and a ++ * possible new burst to start. In particular, in the second ++ * case, bfqq has become the first queue in the possible new ++ * burst. In both cases last_ins_in_burst needs to be moved ++ * forward. ++ */ ++ bfqd->last_ins_in_burst = jiffies; ++ ++} ++ ++static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ return entity->budget - entity->service; ++} ++ ++/* ++ * If enough samples have been computed, return the current max budget ++ * stored in bfqd, which is dynamically updated according to the ++ * estimated disk peak rate; otherwise return the default max budget ++ */ ++static int bfq_max_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < bfq_stats_min_budgets) ++ return bfq_default_max_budget; ++ else ++ return bfqd->bfq_max_budget; ++} ++ ++/* ++ * Return min budget, which is a fraction of the current or default ++ * max budget (trying with 1/32) ++ */ ++static int bfq_min_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < bfq_stats_min_budgets) ++ return bfq_default_max_budget / 32; ++ else ++ return bfqd->bfq_max_budget / 32; ++} ++ ++static void bfq_bfqq_expire(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool compensate, ++ enum bfqq_expiration reason); ++ ++/* ++ * The next function, invoked after the input queue bfqq switches from ++ * idle to busy, updates the budget of bfqq. The function also tells ++ * whether the in-service queue should be expired, by returning ++ * true. 
The purpose of expiring the in-service queue is to give bfqq ++ * the chance to possibly preempt the in-service queue, and the reason ++ * for preempting the in-service queue is to achieve one of the two ++ * goals below. ++ * ++ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has ++ * expired because it has remained idle. In particular, bfqq may have ++ * expired for one of the following two reasons: ++ * ++ * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and ++ * did not make it to issue a new request before its last request ++ * was served; ++ * ++ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue ++ * a new request before the expiration of the idling-time. ++ * ++ * Even if bfqq has expired for one of the above reasons, the process ++ * associated with the queue may be however issuing requests greedily, ++ * and thus be sensitive to the bandwidth it receives (bfqq may have ++ * remained idle for other reasons: CPU high load, bfqq not enjoying ++ * idling, I/O throttling somewhere in the path from the process to ++ * the I/O scheduler, ...). But if, after every expiration for one of ++ * the above two reasons, bfqq has to wait for the service of at least ++ * one full budget of another queue before being served again, then ++ * bfqq is likely to get a much lower bandwidth or resource time than ++ * its reserved ones. To address this issue, two countermeasures need ++ * to be taken. ++ * ++ * First, the budget and the timestamps of bfqq need to be updated in ++ * a special way on bfqq reactivation: they need to be updated as if ++ * bfqq did not remain idle and did not expire. In fact, if they are ++ * computed as if bfqq expired and remained idle until reactivation, ++ * then the process associated with bfqq is treated as if, instead of ++ * being greedy, it stopped issuing requests when bfqq remained idle, ++ * and restarts issuing requests only on this reactivation. In other ++ * words, the scheduler does not help the process recover the "service ++ * hole" between bfqq expiration and reactivation. As a consequence, ++ * the process receives a lower bandwidth than its reserved one. In ++ * contrast, to recover this hole, the budget must be updated as if ++ * bfqq was not expired at all before this reactivation, i.e., it must ++ * be set to the value of the remaining budget when bfqq was ++ * expired. Along the same line, timestamps need to be assigned the ++ * value they had the last time bfqq was selected for service, i.e., ++ * before last expiration. Thus timestamps need to be back-shifted ++ * with respect to their normal computation (see [1] for more details ++ * on this tricky aspect). ++ * ++ * Secondly, to allow the process to recover the hole, the in-service ++ * queue must be expired too, to give bfqq the chance to preempt it ++ * immediately. In fact, if bfqq has to wait for a full budget of the ++ * in-service queue to be completed, then it may become impossible to ++ * let the process recover the hole, even if the back-shifted ++ * timestamps of bfqq are lower than those of the in-service queue. If ++ * this happens for most or all of the holes, then the process may not ++ * receive its reserved bandwidth. In this respect, it is worth noting ++ * that, being the service of outstanding requests unpreemptible, a ++ * little fraction of the holes may however be unrecoverable, thereby ++ * causing a little loss of bandwidth. ++ * ++ * The last important point is detecting whether bfqq does need this ++ * bandwidth recovery. 
In this respect, the next function deems the ++ * process associated with bfqq greedy, and thus allows it to recover ++ * the hole, if: 1) the process is waiting for the arrival of a new ++ * request (which implies that bfqq expired for one of the above two ++ * reasons), and 2) such a request has arrived soon. The first ++ * condition is controlled through the flag non_blocking_wait_rq, ++ * while the second through the flag arrived_in_time. If both ++ * conditions hold, then the function computes the budget in the ++ * above-described special way, and signals that the in-service queue ++ * should be expired. Timestamp back-shifting is done later in ++ * __bfq_activate_entity. ++ * ++ * 2. Reduce latency. Even if timestamps are not backshifted to let ++ * the process associated with bfqq recover a service hole, bfqq may ++ * however happen to have, after being (re)activated, a lower finish ++ * timestamp than the in-service queue. That is, the next budget of ++ * bfqq may have to be completed before the one of the in-service ++ * queue. If this is the case, then preempting the in-service queue ++ * allows this goal to be achieved, apart from the unpreemptible, ++ * outstanding requests mentioned above. ++ * ++ * Unfortunately, regardless of which of the above two goals one wants ++ * to achieve, service trees need first to be updated to know whether ++ * the in-service queue must be preempted. To have service trees ++ * correctly updated, the in-service queue must be expired and ++ * rescheduled, and bfqq must be scheduled too. This is one of the ++ * most costly operations (in future versions, the scheduling ++ * mechanism may be re-designed in such a way to make it possible to ++ * know whether preemption is needed without needing to update service ++ * trees). In addition, queue preemptions almost always cause random ++ * I/O, and thus loss of throughput. Because of these facts, the next ++ * function adopts the following simple scheme to avoid both costly ++ * operations and too frequent preemptions: it requests the expiration ++ * of the in-service queue (unconditionally) only for queues that need ++ * to recover a hole, or that either are weight-raised or deserve to ++ * be weight-raised. ++ */ ++static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool arrived_in_time, ++ bool wr_or_deserves_wr) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { ++ /* ++ * We do not clear the flag non_blocking_wait_rq here, as ++ * the latter is used in bfq_activate_bfqq to signal ++ * that timestamps need to be back-shifted (and is ++ * cleared right after). ++ */ ++ ++ /* ++ * In next assignment we rely on that either ++ * entity->service or entity->budget are not updated ++ * on expiration if bfqq is empty (see ++ * __bfq_bfqq_recalc_budget). Thus both quantities ++ * remain unchanged after such an expiration, and the ++ * following statement therefore assigns to ++ * entity->budget the remaining budget on such an ++ * expiration. For clarity, entity->service is not ++ * updated on expiration in any case, and, in normal ++ * operation, is reset only when bfqq is selected for ++ * service (see bfq_get_next_queue). 
++ */ ++ BUG_ON(bfqq->max_budget < 0); ++ entity->budget = min_t(unsigned long, ++ bfq_bfqq_budget_left(bfqq), ++ bfqq->max_budget); ++ ++ BUG_ON(entity->budget < 0); ++ return true; ++ } ++ ++ BUG_ON(bfqq->max_budget < 0); ++ entity->budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(bfqq->next_rq, bfqq)); ++ BUG_ON(entity->budget < 0); ++ ++ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); ++ return wr_or_deserves_wr; ++} ++ ++static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ unsigned int old_wr_coeff, ++ bool wr_or_deserves_wr, ++ bool interactive, ++ bool in_burst, ++ bool soft_rt) ++{ ++ if (old_wr_coeff == 1 && wr_or_deserves_wr) { ++ /* start a weight-raising period */ ++ if (interactive) { ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ } else { ++ bfqq->wr_start_at_switch_to_srt = jiffies; ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff * ++ BFQ_SOFTRT_WEIGHT_FACTOR; ++ bfqq->wr_cur_max_time = ++ bfqd->bfq_wr_rt_max_time; ++ } ++ /* ++ * If needed, further reduce budget to make sure it is ++ * close to bfqq's backlog, so as to reduce the ++ * scheduling-error component due to a too large ++ * budget. Do not care about throughput consequences, ++ * but only about latency. Finally, do not assign a ++ * too small budget either, to avoid increasing ++ * latency by causing too frequent expirations. ++ */ ++ bfqq->entity.budget = min_t(unsigned long, ++ bfqq->entity.budget, ++ 2 * bfq_min_budget(bfqd)); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais starting at %lu, rais_max_time %u", ++ jiffies, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } else if (old_wr_coeff > 1) { ++ if (interactive) { /* update wr coeff and duration */ ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ } else if (in_burst) { ++ bfqq->wr_coeff = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais ending at %lu, rais_max_time %u", ++ jiffies, ++ jiffies_to_msecs(bfqq-> ++ wr_cur_max_time)); ++ } else if (soft_rt) { ++ /* ++ * The application is now or still meeting the ++ * requirements for being deemed soft rt. We ++ * can then correctly and safely (re)charge ++ * the weight-raising duration for the ++ * application with the weight-raising ++ * duration for soft rt applications. ++ * ++ * In particular, doing this recharge now, i.e., ++ * before the weight-raising period for the ++ * application finishes, reduces the probability ++ * of the following negative scenario: ++ * 1) the weight of a soft rt application is ++ * raised at startup (as for any newly ++ * created application), ++ * 2) since the application is not interactive, ++ * at a certain time weight-raising is ++ * stopped for the application, ++ * 3) at that time the application happens to ++ * still have pending requests, and hence ++ * is destined to not have a chance to be ++ * deemed soft rt before these requests are ++ * completed (see the comments to the ++ * function bfq_bfqq_softrt_next_start() ++ * for details on soft rt detection), ++ * 4) these pending requests experience a high ++ * latency because the application is not ++ * weight-raised while they are pending. 
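The branch just below chooses between the two weight-raising modes discussed above: plain interactive raising for the device-dependent duration, or the stronger but shorter soft real-time raising. Stripped of the kernel types, the decision is roughly the following; the coefficients and durations are illustrative stand-ins for bfqd's tunables, not values taken from the patch:

#include <stdio.h>

#define TOY_WR_COEFF		30
#define TOY_SOFTRT_FACTOR	100
#define TOY_WR_DURATION		8000	/* ms, interactive (device-dependent) */
#define TOY_WR_RT_MAX_TIME	300	/* ms, soft real-time */

struct toy_wr { unsigned int coeff; unsigned int max_time; };

static struct toy_wr toy_start_wr(int interactive)
{
	struct toy_wr wr;

	if (interactive) {
		wr.coeff = TOY_WR_COEFF;
		wr.max_time = TOY_WR_DURATION;
	} else {			/* soft real-time */
		wr.coeff = TOY_WR_COEFF * TOY_SOFTRT_FACTOR;
		wr.max_time = TOY_WR_RT_MAX_TIME;
	}
	return wr;
}

int main(void)
{
	struct toy_wr i = toy_start_wr(1), s = toy_start_wr(0);

	printf("interactive: coeff %u for %u ms\n", i.coeff, i.max_time);
	printf("soft rt:     coeff %u for %u ms\n", s.coeff, s.max_time);
	return 0;
}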
++ */ ++ if (bfqq->wr_cur_max_time != ++ bfqd->bfq_wr_rt_max_time) { ++ bfqq->wr_start_at_switch_to_srt = ++ bfqq->last_wr_start_finish; ++ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); ++ ++ bfqq->wr_cur_max_time = ++ bfqd->bfq_wr_rt_max_time; ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff * ++ BFQ_SOFTRT_WEIGHT_FACTOR; ++ bfq_log_bfqq(bfqd, bfqq, ++ "switching to soft_rt wr"); ++ } else ++ bfq_log_bfqq(bfqd, bfqq, ++ "moving forward soft_rt wr duration"); ++ bfqq->last_wr_start_finish = jiffies; ++ } ++ } ++} ++ ++static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ return bfqq->dispatched == 0 && ++ time_is_before_jiffies( ++ bfqq->budget_timeout + ++ bfqd->bfq_wr_min_idle_time); ++} ++ ++static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ int old_wr_coeff, ++ struct request *rq, ++ bool *interactive) ++{ ++ bool soft_rt, in_burst, wr_or_deserves_wr, ++ bfqq_wants_to_preempt, ++ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), ++ /* ++ * See the comments on ++ * bfq_bfqq_update_budg_for_activation for ++ * details on the usage of the next variable. ++ */ ++ arrived_in_time = ktime_get_ns() <= ++ RQ_BIC(rq)->ttime.last_end_request + ++ bfqd->bfq_slice_idle * 3; ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "bfq_add_request non-busy: " ++ "jiffies %lu, in_time %d, idle_long %d busyw %d " ++ "wr_coeff %u", ++ jiffies, arrived_in_time, ++ idle_for_long_time, ++ bfq_bfqq_non_blocking_wait_rq(bfqq), ++ old_wr_coeff); ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ BUG_ON(bfqq == bfqd->in_service_queue); ++ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); ++ ++ /* ++ * bfqq deserves to be weight-raised if: ++ * - it is sync, ++ * - it does not belong to a large burst, ++ * - it has been idle for enough time or is soft real-time, ++ * - is linked to a bfq_io_cq (it is not shared in any sense) ++ */ ++ in_burst = bfq_bfqq_in_large_burst(bfqq); ++ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && ++ !in_burst && ++ time_is_before_jiffies(bfqq->soft_rt_next_start); ++ *interactive = ++ !in_burst && ++ idle_for_long_time; ++ wr_or_deserves_wr = bfqd->low_latency && ++ (bfqq->wr_coeff > 1 || ++ (bfq_bfqq_sync(bfqq) && ++ bfqq->bic && (*interactive || soft_rt))); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "bfq_add_request: " ++ "in_burst %d, " ++ "soft_rt %d (next %lu), inter %d, bic %p", ++ bfq_bfqq_in_large_burst(bfqq), soft_rt, ++ bfqq->soft_rt_next_start, ++ *interactive, ++ bfqq->bic); ++ ++ /* ++ * Using the last flag, update budget and check whether bfqq ++ * may want to preempt the in-service queue. ++ */ ++ bfqq_wants_to_preempt = ++ bfq_bfqq_update_budg_for_activation(bfqd, bfqq, ++ arrived_in_time, ++ wr_or_deserves_wr); ++ ++ /* ++ * If bfqq happened to be activated in a burst, but has been ++ * idle for much more than an interactive queue, then we ++ * assume that, in the overall I/O initiated in the burst, the ++ * I/O associated with bfqq is finished. So bfqq does not need ++ * to be treated as a queue belonging to a burst ++ * anymore. Accordingly, we reset bfqq's in_large_burst flag ++ * if set, and remove bfqq from the burst list if it's ++ * there. We do not decrement burst_size, because the fact ++ * that bfqq does not need to belong to the burst list any ++ * more does not invalidate the fact that bfqq was created in ++ * a burst. 
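One detail implemented in the hunk below is the IO-bound heuristic: a queue is marked IO-bound only after enough consecutive timely request arrivals, and a single late arrival resets the count. A minimal model, with an invented threshold standing in for bfqd->bfq_requests_within_timer:

#include <stdbool.h>
#include <stdio.h>

#define TOY_REQUESTS_WITHIN_TIMER 120	/* illustrative threshold */

struct toy_q { int timely; bool io_bound; };

static void toy_note_arrival(struct toy_q *q, bool arrived_in_time)
{
	if (q->io_bound)
		return;				/* already classified */
	if (arrived_in_time) {
		if (++q->timely >= TOY_REQUESTS_WITHIN_TIMER)
			q->io_bound = true;	/* steady stream detected */
	} else {
		q->timely = 0;			/* stream broken: restart */
	}
}

int main(void)
{
	struct toy_q q = { 0, false };

	for (int i = 0; i < TOY_REQUESTS_WITHIN_TIMER; i++)
		toy_note_arrival(&q, true);
	printf("io_bound: %d\n", q.io_bound);	/* 1 */
	return 0;
}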
++ */ ++ if (likely(!bfq_bfqq_just_created(bfqq)) && ++ idle_for_long_time && ++ time_is_before_jiffies( ++ bfqq->budget_timeout + ++ msecs_to_jiffies(10000))) { ++ hlist_del_init(&bfqq->burst_list_node); ++ bfq_clear_bfqq_in_large_burst(bfqq); ++ } ++ ++ bfq_clear_bfqq_just_created(bfqq); ++ ++ if (!bfq_bfqq_IO_bound(bfqq)) { ++ if (arrived_in_time) { ++ bfqq->requests_within_timer++; ++ if (bfqq->requests_within_timer >= ++ bfqd->bfq_requests_within_timer) ++ bfq_mark_bfqq_IO_bound(bfqq); ++ } else ++ bfqq->requests_within_timer = 0; ++ bfq_log_bfqq(bfqd, bfqq, "requests in time %d", ++ bfqq->requests_within_timer); ++ } ++ ++ if (bfqd->low_latency) { ++ if (unlikely(time_is_after_jiffies(bfqq->split_time))) ++ /* wraparound */ ++ bfqq->split_time = ++ jiffies - bfqd->bfq_wr_min_idle_time - 1; ++ ++ if (time_is_before_jiffies(bfqq->split_time + ++ bfqd->bfq_wr_min_idle_time)) { ++ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, ++ old_wr_coeff, ++ wr_or_deserves_wr, ++ *interactive, ++ in_burst, ++ soft_rt); ++ ++ if (old_wr_coeff != bfqq->wr_coeff) ++ bfqq->entity.prio_changed = 1; ++ } ++ } ++ ++ bfqq->last_idle_bklogged = jiffies; ++ bfqq->service_from_backlogged = 0; ++ bfq_clear_bfqq_softrt_update(bfqq); ++ ++ bfq_add_bfqq_busy(bfqd, bfqq); ++ ++ /* ++ * Expire in-service queue only if preemption may be needed ++ * for guarantees. In this respect, the function ++ * next_queue_may_preempt just checks a simple, necessary ++ * condition, and not a sufficient condition based on ++ * timestamps. In fact, for the latter condition to be ++ * evaluated, timestamps would need first to be updated, and ++ * this operation is quite costly (see the comments on the ++ * function bfq_bfqq_update_budg_for_activation). ++ */ ++ if (bfqd->in_service_queue && bfqq_wants_to_preempt && ++ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && ++ next_queue_may_preempt(bfqd)) { ++ struct bfq_queue *in_serv = ++ bfqd->in_service_queue; ++ BUG_ON(in_serv == bfqq); ++ ++ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, ++ false, BFQ_BFQQ_PREEMPTED); ++ } ++} ++ ++static void bfq_add_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ struct request *next_rq, *prev; ++ unsigned int old_wr_coeff = bfqq->wr_coeff; ++ bool interactive = false; ++ ++ bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", ++ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); ++ ++ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ ++ bfq_log_bfqq(bfqd, bfqq, ++ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", ++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time), ++ bfqq->wr_coeff, ++ bfqq->entity.weight, bfqq->entity.orig_weight); ++ ++ bfqq->queued[rq_is_sync(rq)]++; ++ bfqd->queued++; ++ ++ elv_rb_add(&bfqq->sort_list, rq); ++ ++ /* ++ * Check if this request is a better next-to-serve candidate. ++ */ ++ prev = bfqq->next_rq; ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); ++ BUG_ON(!next_rq); ++ bfqq->next_rq = next_rq; ++ ++ /* ++ * Adjust priority tree position, if next_rq changes. ++ */ ++ if (prev != bfqq->next_rq) ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ ++ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... 
*/ ++ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, ++ rq, &interactive); ++ else { ++ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && ++ time_is_before_jiffies( ++ bfqq->last_wr_start_finish + ++ bfqd->bfq_wr_min_inter_arr_async)) { ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ ++ bfqd->wr_busy_queues++; ++ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "non-idle wrais starting, " ++ "wr_max_time %u wr_busy %d", ++ jiffies_to_msecs(bfqq->wr_cur_max_time), ++ bfqd->wr_busy_queues); ++ } ++ if (prev != bfqq->next_rq) ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ /* ++ * Assign jiffies to last_wr_start_finish in the following ++ * cases: ++ * ++ * . if bfqq is not going to be weight-raised, because, for ++ * non weight-raised queues, last_wr_start_finish stores the ++ * arrival time of the last request; as of now, this piece ++ * of information is used only for deciding whether to ++ * weight-raise async queues ++ * ++ * . if bfqq is not weight-raised, because, if bfqq is now ++ * switching to weight-raised, then last_wr_start_finish ++ * stores the time when weight-raising starts ++ * ++ * . if bfqq is interactive, because, regardless of whether ++ * bfqq is currently weight-raised, the weight-raising ++ * period must start or restart (this case is considered ++ * separately because it is not detected by the above ++ * conditions, if bfqq is already weight-raised) ++ * ++ * last_wr_start_finish has to be updated also if bfqq is soft ++ * real-time, because the weight-raising period is constantly ++ * restarted on idle-to-busy transitions for these queues, but ++ * this is already done in bfq_bfqq_handle_idle_busy_switch if ++ * needed. 
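++	 *
++	 * As a reading aid for the condition below:
++	 * bfqq->wr_coeff == 1 covers the first case above (bfqq ends
++	 * up not weight-raised), old_wr_coeff == 1 covers the second
++	 * (bfqq was not weight-raised and may be switching right
++	 * now), and interactive covers the third. The soft real-time
++	 * combination is deliberately left out, as explained above.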
++ */ ++ if (bfqd->low_latency && ++ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) ++ bfqq->last_wr_start_finish = jiffies; ++} ++ ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, ++ struct bio *bio) ++{ ++ struct task_struct *tsk = current; ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq; ++ ++ bic = bfq_bic_lookup(bfqd, tsk->io_context); ++ if (!bic) ++ return NULL; ++ ++ bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); ++ if (bfqq) ++ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); ++ ++ return NULL; ++} ++ ++static sector_t get_sdist(sector_t last_pos, struct request *rq) ++{ ++ sector_t sdist = 0; ++ ++ if (last_pos) { ++ if (last_pos < blk_rq_pos(rq)) ++ sdist = blk_rq_pos(rq) - last_pos; ++ else ++ sdist = last_pos - blk_rq_pos(rq); ++ } ++ ++ return sdist; ++} ++ ++static void bfq_activate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ bfqd->rq_in_driver++; ++} ++ ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ ++ BUG_ON(bfqd->rq_in_driver == 0); ++ bfqd->rq_in_driver--; ++} ++ ++static void bfq_remove_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ const int sync = rq_is_sync(rq); ++ ++ BUG_ON(bfqq->entity.service > bfqq->entity.budget && ++ bfqq == bfqd->in_service_queue); ++ ++ if (bfqq->next_rq == rq) { ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ if (rq->queuelist.prev != &rq->queuelist) ++ list_del_init(&rq->queuelist); ++ BUG_ON(bfqq->queued[sync] == 0); ++ bfqq->queued[sync]--; ++ bfqd->queued--; ++ elv_rb_del(&bfqq->sort_list, rq); ++ ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ bfqq->next_rq = NULL; ++ ++ BUG_ON(bfqq->entity.budget < 0); ++ ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { ++ BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */ ++ bfq_del_bfqq_busy(bfqd, bfqq, false); ++ /* ++ * bfqq emptied. In normal operation, when ++ * bfqq is empty, bfqq->entity.service and ++ * bfqq->entity.budget must contain, ++ * respectively, the service received and the ++ * budget used last time bfqq emptied. These ++ * facts do not hold in this case, as at least ++ * this last removal occurred while bfqq is ++ * not in service. To avoid inconsistencies, ++ * reset both bfqq->entity.service and ++ * bfqq->entity.budget, if bfqq has still a ++ * process that may issue I/O requests to it. ++ */ ++ bfqq->entity.budget = bfqq->entity.service = 0; ++ } ++ ++ /* ++ * Remove queue from request-position tree as it is empty. 
++ */ ++ if (bfqq->pos_root) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ } ++ ++ if (rq->cmd_flags & REQ_META) { ++ BUG_ON(bfqq->meta_pending == 0); ++ bfqq->meta_pending--; ++ } ++ bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); ++} ++ ++static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, ++ struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct request *__rq; ++ ++ __rq = bfq_find_rq_fmerge(bfqd, bio); ++ if (__rq && elv_bio_merge_ok(__rq, bio)) { ++ *req = __rq; ++ return ELEVATOR_FRONT_MERGE; ++ } ++ ++ return ELEVATOR_NO_MERGE; ++} ++ ++static void bfq_merged_request(struct request_queue *q, struct request *req, ++ enum elv_merge type) ++{ ++ if (type == ELEVATOR_FRONT_MERGE && ++ rb_prev(&req->rb_node) && ++ blk_rq_pos(req) < ++ blk_rq_pos(container_of(rb_prev(&req->rb_node), ++ struct request, rb_node))) { ++ struct bfq_queue *bfqq = RQ_BFQQ(req); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ struct request *prev, *next_rq; ++ ++ /* Reposition request in its sort_list */ ++ elv_rb_del(&bfqq->sort_list, req); ++ elv_rb_add(&bfqq->sort_list, req); ++ /* Choose next request to be served for bfqq */ ++ prev = bfqq->next_rq; ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, ++ bfqd->last_position); ++ BUG_ON(!next_rq); ++ bfqq->next_rq = next_rq; ++ /* ++ * If next_rq changes, update both the queue's budget to ++ * fit the new request and the queue's position in its ++ * rq_pos_tree. ++ */ ++ if (prev != bfqq->next_rq) { ++ bfq_updated_next_req(bfqd, bfqq); ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ } ++ } ++} ++ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++static void bfq_bio_merged(struct request_queue *q, struct request *req, ++ struct bio *bio) ++{ ++ bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); ++} ++#endif ++ ++static void bfq_merged_requests(struct request_queue *q, struct request *rq, ++ struct request *next) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); ++ ++ /* ++ * If next and rq belong to the same bfq_queue and next is older ++ * than rq, then reposition rq in the fifo (by substituting next ++ * with rq). Otherwise, if next and rq belong to different ++ * bfq_queues, never reposition rq: in fact, we would have to ++ * reposition it with respect to next's position in its own fifo, ++ * which would most certainly be too expensive with respect to ++ * the benefits. ++ */ ++ if (bfqq == next_bfqq && ++ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && ++ next->fifo_time < rq->fifo_time) { ++ list_del_init(&rq->queuelist); ++ list_replace_init(&next->queuelist, &rq->queuelist); ++ rq->fifo_time = next->fifo_time; ++ } ++ ++ if (bfqq->next_rq == next) ++ bfqq->next_rq = rq; ++ ++ bfq_remove_request(next); ++ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); ++} ++ ++/* Must be called with bfqq != NULL */ ++static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) ++{ ++ BUG_ON(!bfqq); ++ ++ if (bfq_bfqq_busy(bfqq)) { ++ bfqq->bfqd->wr_busy_queues--; ++ BUG_ON(bfqq->bfqd->wr_busy_queues < 0); ++ } ++ bfqq->wr_coeff = 1; ++ bfqq->wr_cur_max_time = 0; ++ bfqq->last_wr_start_finish = jiffies; ++ /* ++ * Trigger a weight change on the next invocation of ++ * __bfq_entity_update_weight_prio. 
++ */ ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "end_wr: wrais ending at %lu, rais_max_time %u", ++ bfqq->last_wr_start_finish, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", ++ bfqq->bfqd->wr_busy_queues); ++} ++ ++static void bfq_end_wr_async_queues(struct bfq_data *bfqd, ++ struct bfq_group *bfqg) ++{ ++ int i, j; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_BE_NR; j++) ++ if (bfqg->async_bfqq[i][j]) ++ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); ++ if (bfqg->async_idle_bfqq) ++ bfq_bfqq_end_wr(bfqg->async_idle_bfqq); ++} ++ ++static void bfq_end_wr(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq; ++ ++ spin_lock_irq(bfqd->queue->queue_lock); ++ ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) ++ bfq_bfqq_end_wr(bfqq); ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) ++ bfq_bfqq_end_wr(bfqq); ++ bfq_end_wr_async(bfqd); ++ ++ spin_unlock_irq(bfqd->queue->queue_lock); ++} ++ ++static sector_t bfq_io_struct_pos(void *io_struct, bool request) ++{ ++ if (request) ++ return blk_rq_pos(io_struct); ++ else ++ return ((struct bio *)io_struct)->bi_iter.bi_sector; ++} ++ ++static int bfq_rq_close_to_sector(void *io_struct, bool request, ++ sector_t sector) ++{ ++ return abs(bfq_io_struct_pos(io_struct, request) - sector) <= ++ BFQQ_CLOSE_THR; ++} ++ ++static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ sector_t sector) ++{ ++ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; ++ struct rb_node *parent, *node; ++ struct bfq_queue *__bfqq; ++ ++ if (RB_EMPTY_ROOT(root)) ++ return NULL; ++ ++ /* ++ * First, if we find a request starting at the end of the last ++ * request, choose it. ++ */ ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); ++ if (__bfqq) ++ return __bfqq; ++ ++ /* ++ * If the exact sector wasn't found, the parent of the NULL leaf ++ * will contain the closest sector (rq_pos_tree sorted by ++ * next_request position). ++ */ ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) ++ return __bfqq; ++ ++ if (blk_rq_pos(__bfqq->next_rq) < sector) ++ node = rb_next(&__bfqq->pos_node); ++ else ++ node = rb_prev(&__bfqq->pos_node); ++ if (!node) ++ return NULL; ++ ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node); ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) ++ return __bfqq; ++ ++ return NULL; ++} ++ ++static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, ++ struct bfq_queue *cur_bfqq, ++ sector_t sector) ++{ ++ struct bfq_queue *bfqq; ++ ++ /* ++ * We shall notice if some of the queues are cooperating, ++ * e.g., working closely on the same area of the device. In ++ * that case, we can group them together and: 1) don't waste ++ * time idling, and 2) serve the union of their requests in ++ * the best possible order for throughput. ++ */ ++ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); ++ if (!bfqq || bfqq == cur_bfqq) ++ return NULL; ++ ++ return bfqq; ++} ++ ++static struct bfq_queue * ++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) ++{ ++ int process_refs, new_process_refs; ++ struct bfq_queue *__bfqq; ++ ++ /* ++ * If there are no process references on the new_bfqq, then it is ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain ++ * may have dropped their last reference (not just their last process ++ * reference). 
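++	 *
++	 * As a concrete illustration (queue names invented for this
++	 * note): if Q1->new_bfqq == Q2 and Q2->new_bfqq == Q1, the
++	 * walk below would never terminate, so it gives up as soon
++	 * as it gets back to the starting queue. Likewise, if all
++	 * process references to Q2 have been dropped while some
++	 * ->new_bfqq pointer still leads to it, merging into Q2
++	 * would be unsafe, hence the process-reference checks.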
++ */ ++ if (!bfqq_process_refs(new_bfqq)) ++ return NULL; ++ ++ /* Avoid a circular list and skip interim queue merges. */ ++ while ((__bfqq = new_bfqq->new_bfqq)) { ++ if (__bfqq == bfqq) ++ return NULL; ++ new_bfqq = __bfqq; ++ } ++ ++ process_refs = bfqq_process_refs(bfqq); ++ new_process_refs = bfqq_process_refs(new_bfqq); ++ /* ++ * If the process for the bfqq has gone away, there is no ++ * sense in merging the queues. ++ */ ++ if (process_refs == 0 || new_process_refs == 0) ++ return NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", ++ new_bfqq->pid); ++ ++ /* ++ * Merging is just a redirection: the requests of the process ++ * owning one of the two queues are redirected to the other queue. ++ * The latter queue, in its turn, is set as shared if this is the ++ * first time that the requests of some process are redirected to ++ * it. ++ * ++ * We redirect bfqq to new_bfqq and not the opposite, because we ++ * are in the context of the process owning bfqq, hence we have ++ * the io_cq of this process. So we can immediately configure this ++ * io_cq to redirect the requests of the process to new_bfqq. ++ * ++ * NOTE, even if new_bfqq coincides with the in-service queue, the ++ * io_cq of new_bfqq is not available, because, if the in-service ++ * queue is shared, bfqd->in_service_bic may not point to the ++ * io_cq of the in-service queue. ++ * Redirecting the requests of the process owning bfqq to the ++ * currently in-service queue is in any case the best option, as ++ * we feed the in-service queue with new requests close to the ++ * last request served and, by doing so, hopefully increase the ++ * throughput. ++ */ ++ bfqq->new_bfqq = new_bfqq; ++ new_bfqq->ref += process_refs; ++ return new_bfqq; ++} ++ ++static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, ++ struct bfq_queue *new_bfqq) ++{ ++ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || ++ (bfqq->ioprio_class != new_bfqq->ioprio_class)) ++ return false; ++ ++ /* ++ * If either of the queues has already been detected as seeky, ++ * then merging it with the other queue is unlikely to lead to ++ * sequential I/O. ++ */ ++ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) ++ return false; ++ ++ /* ++ * Interleaved I/O is known to be done by (some) applications ++ * only for reads, so it does not make sense to merge async ++ * queues. ++ */ ++ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * If this function returns true, then bfqq cannot be merged. The idea ++ * is that true cooperation happens very early after processes start ++ * to do I/O. Usually, late cooperations are just accidental false ++ * positives. In case bfqq is weight-raised, such false positives ++ * would evidently degrade latency guarantees for bfqq. ++ */ ++static bool wr_from_too_long(struct bfq_queue *bfqq) ++{ ++ return bfqq->wr_coeff > 1 && ++ time_is_before_jiffies(bfqq->last_wr_start_finish + ++ msecs_to_jiffies(100)); ++} ++ ++/* ++ * Attempt to schedule a merge of bfqq with the currently in-service ++ * queue or with a close queue among the scheduled queues. Return ++ * NULL if no merge was scheduled, a pointer to the shared bfq_queue ++ * structure otherwise. 
++ * ++ * The OOM queue is not allowed to participate to cooperation: in fact, since ++ * the requests temporarily redirected to the OOM queue could be redirected ++ * again to dedicated queues at any time, the state needed to correctly ++ * handle merging with the OOM queue would be quite complex and expensive ++ * to maintain. Besides, in such a critical condition as an out of memory, ++ * the benefits of queue merging may be little relevant, or even negligible. ++ * ++ * Weight-raised queues can be merged only if their weight-raising ++ * period has just started. In fact cooperating processes are usually ++ * started together. Thus, with this filter we avoid false positives ++ * that would jeopardize low-latency guarantees. ++ * ++ * WARNING: queue merging may impair fairness among non-weight raised ++ * queues, for at least two reasons: 1) the original weight of a ++ * merged queue may change during the merged state, 2) even being the ++ * weight the same, a merged queue may be bloated with many more ++ * requests than the ones produced by its originally-associated ++ * process. ++ */ ++static struct bfq_queue * ++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ void *io_struct, bool request) ++{ ++ struct bfq_queue *in_service_bfqq, *new_bfqq; ++ ++ if (bfqq->new_bfqq) ++ return bfqq->new_bfqq; ++ ++ if (io_struct && wr_from_too_long(bfqq) && ++ likely(bfqq != &bfqd->oom_bfqq)) ++ bfq_log_bfqq(bfqd, bfqq, ++ "would have looked for coop, but bfq%d wr", ++ bfqq->pid); ++ ++ if (!io_struct || ++ wr_from_too_long(bfqq) || ++ unlikely(bfqq == &bfqd->oom_bfqq)) ++ return NULL; ++ ++ /* If there is only one backlogged queue, don't search. */ ++ if (bfqd->busy_queues == 1) ++ return NULL; ++ ++ in_service_bfqq = bfqd->in_service_queue; ++ ++ if (in_service_bfqq && in_service_bfqq != bfqq && ++ bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) ++ && likely(in_service_bfqq == &bfqd->oom_bfqq)) ++ bfq_log_bfqq(bfqd, bfqq, ++ "would have tried merge with in-service-queue, but wr"); ++ ++ if (!in_service_bfqq || in_service_bfqq == bfqq || ++ !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || ++ unlikely(in_service_bfqq == &bfqd->oom_bfqq)) ++ goto check_scheduled; ++ ++ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && ++ bfqq->entity.parent == in_service_bfqq->entity.parent && ++ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { ++ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); ++ if (new_bfqq) ++ return new_bfqq; ++ } ++ /* ++ * Check whether there is a cooperator among currently scheduled ++ * queues. The only thing we need is that the bio/request is not ++ * NULL, as we need it to establish whether a cooperator exists. 
++ */ ++check_scheduled: ++ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, ++ bfq_io_struct_pos(io_struct, request)); ++ ++ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); ++ ++ if (new_bfqq && wr_from_too_long(new_bfqq) && ++ likely(new_bfqq != &bfqd->oom_bfqq) && ++ bfq_may_be_close_cooperator(bfqq, new_bfqq)) ++ bfq_log_bfqq(bfqd, bfqq, ++ "would have merged with bfq%d, but wr", ++ new_bfqq->pid); ++ ++ if (new_bfqq && !wr_from_too_long(new_bfqq) && ++ likely(new_bfqq != &bfqd->oom_bfqq) && ++ bfq_may_be_close_cooperator(bfqq, new_bfqq)) ++ return bfq_setup_merge(bfqq, new_bfqq); ++ ++ return NULL; ++} ++ ++static void bfq_bfqq_save_state(struct bfq_queue *bfqq) ++{ ++ struct bfq_io_cq *bic = bfqq->bic; ++ ++ /* ++ * If !bfqq->bic, the queue is already shared or its requests ++ * have already been redirected to a shared queue; both idle window ++ * and weight raising state have already been saved. Do nothing. ++ */ ++ if (!bic) ++ return; ++ ++ bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); ++ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); ++ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); ++ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); ++ bic->saved_wr_coeff = bfqq->wr_coeff; ++ bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; ++ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; ++ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; ++ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); ++} ++ ++static void bfq_get_bic_reference(struct bfq_queue *bfqq) ++{ ++ /* ++ * If bfqq->bic has a non-NULL value, the bic to which it belongs ++ * is about to begin using a shared bfq_queue. ++ */ ++ if (bfqq->bic) ++ atomic_long_inc(&bfqq->bic->icq.ioc->refcount); ++} ++ ++static void ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, ++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) ++{ ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", ++ (unsigned long) new_bfqq->pid); ++ /* Save weight raising and idle window of the merged queues */ ++ bfq_bfqq_save_state(bfqq); ++ bfq_bfqq_save_state(new_bfqq); ++ if (bfq_bfqq_IO_bound(bfqq)) ++ bfq_mark_bfqq_IO_bound(new_bfqq); ++ bfq_clear_bfqq_IO_bound(bfqq); ++ ++ /* ++ * If bfqq is weight-raised, then let new_bfqq inherit ++ * weight-raising. To reduce false positives, neglect the case ++ * where bfqq has just been created, but has not yet made it ++ * to be weight-raised (which may happen because EQM may merge ++ * bfqq even before bfq_add_request is executed for the first ++ * time for bfqq). Handling this case would however be very ++ * easy, thanks to the flag just_created. 
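++	 *
++	 * As an example of the hand-over performed below (figures
++	 * purely illustrative): if bfqq is weight-raised with
++	 * coefficient 30 and half of its raising period left, and
++	 * new_bfqq is not raised, then new_bfqq inherits the
++	 * coefficient, the residual duration and the start times,
++	 * while bfqq falls back to coefficient 1; wr_busy_queues is
++	 * adjusted on both sides, so that it keeps counting exactly
++	 * the busy weight-raised queues.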
++ */ ++ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { ++ new_bfqq->wr_coeff = bfqq->wr_coeff; ++ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; ++ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; ++ new_bfqq->wr_start_at_switch_to_srt = ++ bfqq->wr_start_at_switch_to_srt; ++ if (bfq_bfqq_busy(new_bfqq)) { ++ bfqd->wr_busy_queues++; ++ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); ++ } ++ ++ new_bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqd, new_bfqq, ++ "wr start after merge with %d, rais_max_time %u", ++ bfqq->pid, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ ++ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ ++ bfqq->wr_coeff = 1; ++ bfqq->entity.prio_changed = 1; ++ if (bfq_bfqq_busy(bfqq)) { ++ bfqd->wr_busy_queues--; ++ BUG_ON(bfqd->wr_busy_queues < 0); ++ } ++ ++ } ++ ++ bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", ++ bfqd->wr_busy_queues); ++ ++ /* ++ * Grab a reference to the bic, to prevent it from being destroyed ++ * before being possibly touched by a bfq_split_bfqq(). ++ */ ++ bfq_get_bic_reference(bfqq); ++ bfq_get_bic_reference(new_bfqq); ++ /* ++ * Merge queues (that is, let bic redirect its requests to new_bfqq) ++ */ ++ bic_set_bfqq(bic, new_bfqq, 1); ++ bfq_mark_bfqq_coop(new_bfqq); ++ /* ++ * new_bfqq now belongs to at least two bics (it is a shared queue): ++ * set new_bfqq->bic to NULL. bfqq either: ++ * - does not belong to any bic any more, and hence bfqq->bic must ++ * be set to NULL, or ++ * - is a queue whose owning bics have already been redirected to a ++ * different queue, hence the queue is destined to not belong to ++ * any bic soon and bfqq->bic is already NULL (therefore the next ++ * assignment causes no harm). ++ */ ++ new_bfqq->bic = NULL; ++ bfqq->bic = NULL; ++ /* release process reference to bfqq */ ++ bfq_put_queue(bfqq); ++} ++ ++static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, ++ struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ bool is_sync = op_is_sync(bio->bi_opf); ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq, *new_bfqq; ++ ++ /* ++ * Disallow merge of a sync bio into an async request. ++ */ ++ if (is_sync && !rq_is_sync(rq)) ++ return false; ++ ++ /* ++ * Lookup the bfqq that this bio will be queued with. Allow ++ * merge only if rq is queued there. ++ * Queue lock is held here. ++ */ ++ bic = bfq_bic_lookup(bfqd, current->io_context); ++ if (!bic) ++ return false; ++ ++ bfqq = bic_to_bfqq(bic, is_sync); ++ /* ++ * We take advantage of this function to perform an early merge ++ * of the queues of possible cooperating processes. ++ */ ++ if (bfqq) { ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); ++ if (new_bfqq) { ++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); ++ /* ++ * If we get here, the bio will be queued in the ++ * shared queue, i.e., new_bfqq, so use new_bfqq ++ * to decide whether bio and rq can be merged. ++ */ ++ bfqq = new_bfqq; ++ } ++ } ++ ++ return bfqq == RQ_BFQQ(rq); ++} ++ ++static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, ++ struct request *next) ++{ ++ return RQ_BFQQ(rq) == RQ_BFQQ(next); ++} ++ ++/* ++ * Set the maximum time for the in-service queue to consume its ++ * budget. This prevents seeky processes from lowering the throughput. ++ * In practice, a time-slice service scheme is used with seeky ++ * processes. 
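++ *
++ * As a rough worked example (numbers purely illustrative): with a
++ * base bfq_timeout of 125 ms, a non-weight-raised queue must use up
++ * its budget within 125 ms, while an interactive weight-raised
++ * queue, whose entity.weight is wr_coeff times its orig_weight,
++ * gets wr_coeff * 125 ms from the weight-proportional coefficient
++ * computed below; soft real-time queues keep the plain timeout
++ * (coefficient 1).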
++ */ ++static void bfq_set_budget_timeout(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ unsigned int timeout_coeff; ++ ++ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) ++ timeout_coeff = 1; ++ else ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; ++ ++ bfqd->last_budget_start = ktime_get(); ++ ++ bfqq->budget_timeout = jiffies + ++ bfqd->bfq_timeout * timeout_coeff; ++ ++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", ++ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); ++} ++ ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ if (bfqq) { ++ bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); ++ bfq_mark_bfqq_must_alloc(bfqq); ++ bfq_clear_bfqq_fifo_expire(bfqq); ++ ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; ++ ++ BUG_ON(bfqq == bfqd->in_service_queue); ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ if (time_is_before_jiffies(bfqq->last_wr_start_finish) && ++ bfqq->wr_coeff > 1 && ++ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && ++ time_is_before_jiffies(bfqq->budget_timeout)) { ++ /* ++ * For soft real-time queues, move the start ++ * of the weight-raising period forward by the ++ * time the queue has not received any ++ * service. Otherwise, a relatively long ++ * service delay is likely to cause the ++ * weight-raising period of the queue to end, ++ * because of the short duration of the ++ * weight-raising period of a soft real-time ++ * queue. It is worth noting that this move ++ * is not so dangerous for the other queues, ++ * because soft real-time queues are not ++ * greedy. ++ * ++ * To not add a further variable, we use the ++ * overloaded field budget_timeout to ++ * determine for how long the queue has not ++ * received service, i.e., how much time has ++ * elapsed since the queue expired. However, ++ * this is a little imprecise, because ++ * budget_timeout is set to jiffies if bfqq ++ * not only expires, but also remains with no ++ * request. ++ */ ++ if (time_after(bfqq->budget_timeout, ++ bfqq->last_wr_start_finish)) ++ bfqq->last_wr_start_finish += ++ jiffies - bfqq->budget_timeout; ++ else ++ bfqq->last_wr_start_finish = jiffies; ++ ++ if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { ++ pr_crit( ++ "BFQ WARNING:last %lu budget %lu jiffies %lu", ++ bfqq->last_wr_start_finish, ++ bfqq->budget_timeout, ++ jiffies); ++ pr_crit("diff %lu", jiffies - ++ max_t(unsigned long, ++ bfqq->last_wr_start_finish, ++ bfqq->budget_timeout)); ++ bfqq->last_wr_start_finish = jiffies; ++ } ++ } ++ ++ bfq_set_budget_timeout(bfqd, bfqq); ++ bfq_log_bfqq(bfqd, bfqq, ++ "set_in_service_queue, cur-budget = %d", ++ bfqq->entity.budget); ++ } else ++ bfq_log(bfqd, "set_in_service_queue: NULL"); ++ ++ bfqd->in_service_queue = bfqq; ++} ++ ++/* ++ * Get and set a new queue for service. ++ */ ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); ++ ++ __bfq_set_in_service_queue(bfqd, bfqq); ++ return bfqq; ++} ++ ++static void bfq_arm_slice_timer(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfqd->in_service_queue; ++ struct bfq_io_cq *bic; ++ u32 sl; ++ ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ /* Processes have exited, don't wait. 
*/
++	bic = bfqd->in_service_bic;
++	if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
++		return;
++
++	bfq_mark_bfqq_wait_request(bfqq);
++
++	/*
++	 * We don't want to idle for seeks, but we do want to allow
++	 * fair distribution of slice time for a process doing back-to-back
++	 * seeks. So allow a little bit of time for it to submit a new rq.
++	 *
++	 * To prevent processes with (partly) seeky workloads from
++	 * being too ill-treated, grant them a small fraction of the
++	 * assigned budget before reducing the waiting time to
++	 * BFQ_MIN_TT. This happened to help reduce latency.
++	 */
++	sl = bfqd->bfq_slice_idle;
++	/*
++	 * Unless the queue is being weight-raised or the scenario is
++	 * asymmetric, grant only minimum idle time if the queue
++	 * is seeky. A long idling is preserved for a weight-raised
++	 * queue, or, more in general, in an asymmetric scenario,
++	 * because a long idling is needed for guaranteeing to a queue
++	 * its reserved share of the throughput (in particular, it is
++	 * needed if the queue has a higher weight than some other
++	 * queue).
++	 */
++	if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
++	    bfq_symmetric_scenario(bfqd))
++		sl = min_t(u32, sl, BFQ_MIN_TT);
++
++	bfqd->last_idling_start = ktime_get();
++	hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
++		      HRTIMER_MODE_REL);
++	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
++	bfq_log(bfqd, "arm idle: %ld/%ld ms",
++		sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC);
++}
++
++/*
++ * In autotuning mode, max_budget is dynamically recomputed as the
++ * amount of sectors transferred in timeout at the estimated peak
++ * rate. This enables BFQ to utilize a full timeslice with a full
++ * budget, even if the in-service queue is served at peak rate. And
++ * this maximises throughput with sequential workloads.
++ */
++static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
++{
++	return (u64)bfqd->peak_rate * USEC_PER_MSEC *
++		jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
++}
++
++/*
++ * Update parameters related to throughput and responsiveness, as a
++ * function of the estimated peak rate. See comments on
++ * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
++ */
++static void update_thr_responsiveness_params(struct bfq_data *bfqd)
++{
++	int dev_type = blk_queue_nonrot(bfqd->queue);
++
++	if (bfqd->bfq_user_max_budget == 0) {
++		bfqd->bfq_max_budget =
++			bfq_calc_max_budget(bfqd);
++		BUG_ON(bfqd->bfq_max_budget < 0);
++		bfq_log(bfqd, "new max_budget = %d",
++			bfqd->bfq_max_budget);
++	}
++
++	if (bfqd->device_speed == BFQ_BFQD_FAST &&
++	    bfqd->peak_rate < device_speed_thresh[dev_type]) {
++		bfqd->device_speed = BFQ_BFQD_SLOW;
++		bfqd->RT_prod = R_slow[dev_type] *
++			T_slow[dev_type];
++	} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
++		   bfqd->peak_rate > device_speed_thresh[dev_type]) {
++		bfqd->device_speed = BFQ_BFQD_FAST;
++		bfqd->RT_prod = R_fast[dev_type] *
++			T_fast[dev_type];
++	}
++
++	bfq_log(bfqd,
++"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu sects/sec",
++		dev_type == 0 ? "ROT" : "NONROT",
++		bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
++		bfqd->device_speed == BFQ_BFQD_FAST ?
++		(USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
++		(USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
++		(USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
++			BFQ_RATE_SHIFT);
++}
++
++static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq)
++{
++	if (rq != NULL) { /* new rq dispatch now, reset accordingly */
++		bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
++		bfqd->peak_rate_samples = 1;
++		bfqd->sequential_samples = 0;
++		bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
++			blk_rq_sectors(rq);
++	} else /* no new rq dispatched, just reset the number of samples */
++		bfqd->peak_rate_samples = 0; /* full re-init on next disp. */
++
++	bfq_log(bfqd,
++		"reset_rate_computation at end, sample %u/%u tot_sects %llu",
++		bfqd->peak_rate_samples, bfqd->sequential_samples,
++		bfqd->tot_sectors_dispatched);
++}
++
++static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
++{
++	u32 rate, weight, divisor;
++
++	/*
++	 * For the convergence property to hold (see comments on
++	 * bfq_update_peak_rate()) and for the assessment to be
++	 * reliable, a minimum number of samples must be present, and
++	 * a minimum amount of time must have elapsed. If not so, do
++	 * not compute new rate. Just reset parameters, to get ready
++	 * for a new evaluation attempt.
++	 */
++	if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
++	    bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) {
++		bfq_log(bfqd,
++	"update_rate_reset: only resetting, delta_first %lluus samples %d",
++			bfqd->delta_from_first>>10, bfqd->peak_rate_samples);
++		goto reset_computation;
++	}
++
++	/*
++	 * If a new request completion has occurred after last
++	 * dispatch, then, to approximate the rate at which requests
++	 * have been served by the device, it is more precise to
++	 * extend the observation interval to the last completion.
++	 */
++	bfqd->delta_from_first =
++		max_t(u64, bfqd->delta_from_first,
++		      bfqd->last_completion - bfqd->first_dispatch);
++
++	BUG_ON(bfqd->delta_from_first == 0);
++	/*
++	 * Rate computed in sects/usec, and not sects/nsec, for
++	 * precision issues.
++	 */
++	rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
++			div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
++
++	bfq_log(bfqd,
++"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)",
++		bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10,
++		((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
++		rate > 20<<BFQ_RATE_SHIFT);
++
++	/*
++	 * Peak rate not updated if:
++	 * - the percentage of sequential dispatches is below 3/4 of the
++	 *   total, and rate is below the current estimated peak rate
++	 * - rate is unreasonably high (> 20M sectors/sec)
++	 */
++	if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
++	     rate <= bfqd->peak_rate) ||
++	    rate > 20<<BFQ_RATE_SHIFT) {
++		bfq_log(bfqd,
++	"update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu",
++			bfqd->peak_rate_samples, bfqd->sequential_samples,
++			((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
++			((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
++		goto reset_computation;
++	} else {
++		bfq_log(bfqd,
++	"update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu",
++			bfqd->peak_rate_samples, bfqd->sequential_samples,
++			((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
++			((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
++	}
++
++	/*
++	 * We have to update the peak rate, at last! To this purpose,
++	 * we use a low-pass filter. We compute the smoothing constant
++	 * of the filter as a function of the 'weight' of the new
++	 * measured rate.
++	 *
++	 * As can be seen in next formulas, we define this weight as a
++	 * quantity proportional to how sequential the workload is,
++	 * and to how long the observation time interval is.
++	 *
++	 * The weight runs from 0 to 8. The maximum value of the
++	 * weight, 8, yields the minimum value for the smoothing
++	 * constant. At this minimum value for the smoothing constant,
++	 * the measured rate contributes for half of the next value of
++	 * the estimated peak rate.
++	 *
++	 * So, the first step is to compute the weight as a function
++	 * of how sequential the workload is. Note that the weight
++	 * cannot reach 9, because bfqd->sequential_samples cannot
++	 * become equal to bfqd->peak_rate_samples, which, in its
++	 * turn, holds true because bfqd->sequential_samples is not
++	 * incremented for the first sample.
++	 */
++	weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
++
++	/*
++	 * Second step: further refine the weight as a function of the
++	 * duration of the observation interval.
++	 */
++	weight = min_t(u32, 8,
++		       div_u64(weight * bfqd->delta_from_first,
++			       BFQ_RATE_REF_INTERVAL));
++
++	/*
++	 * Divisor ranging from 10, for minimum weight, to 2, for
++	 * maximum weight.
++	 */
++	divisor = 10 - weight;
++	BUG_ON(divisor == 0);
++
++	/*
++	 * Finally, update peak rate:
++	 *
++	 * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor
++	 */
++	bfqd->peak_rate *= divisor-1;
++	bfqd->peak_rate /= divisor;
++	rate /= divisor; /* smoothing constant alpha = 1/divisor */
++
++	bfq_log(bfqd,
++		"update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u",
++		divisor,
++		((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT),
++		(u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT));
++
++	BUG_ON(bfqd->peak_rate == 0);
++	BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
++
++	bfqd->peak_rate += rate;
++	update_thr_responsiveness_params(bfqd);
++	BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
++
++reset_computation:
++	bfq_reset_rate_computation(bfqd, rq);
++}
++
++/*
++ * Update the read/write peak rate (the main quantity used for
++ * auto-tuning, see update_thr_responsiveness_params()).
++ *
++ * It is not trivial to estimate the peak rate (correctly): because of
++ * the presence of sw and hw queues between the scheduler and the
++ * device components that finally serve I/O requests, it is hard to
++ * say exactly when a given dispatched request is served inside the
++ * device, and for how long. As a consequence, it is hard to know
++ * precisely at what rate a given set of requests is actually served
++ * by the device.
++ *
++ * On the opposite end, the dispatch time of any request is trivially
++ * available, and, from this piece of information, the "dispatch rate"
++ * of requests can be immediately computed. So, the idea in the next
++ * function is to use what is known, namely request dispatch times
++ * (plus, when useful, request completion times), to estimate what is
++ * unknown, namely in-device request service rate.
++ *
++ * The main issue is that, because of the above facts, the rate at
++ * which a certain set of requests is dispatched over a certain time
++ * interval can vary greatly with respect to the rate at which the
++ * same requests are then served. But, since the size of any
++ * intermediate queue is limited, and the service scheme is lossless
++ * (no request is silently dropped), the following obvious convergence
++ * property holds: the number of requests dispatched MUST become
++ * closer and closer to the number of requests completed as the
++ * observation interval grows. This is the key property used in
++ * the next function to estimate the peak service rate as a function
++ * of the observed dispatch rate. The function assumes to be invoked
++ * on every request dispatch.
++ */
++static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
++{
++	u64 now_ns = ktime_get_ns();
++
++	if (bfqd->peak_rate_samples == 0) { /* first dispatch */
++		bfq_log(bfqd,
++			"update_peak_rate: goto reset, samples %d",
++			bfqd->peak_rate_samples);
++		bfq_reset_rate_computation(bfqd, rq);
++		goto update_last_values; /* will add one sample */
++	}
++
++	/*
++	 * Device idle for very long: the observation interval lasting
++	 * up to this dispatch cannot be a valid observation interval
++	 * for computing a new peak rate (similarly to the late-
++	 * completion event in bfq_completed_request()). Go to
++	 * update_rate_and_reset to have the following three steps
++	 * taken:
++	 * - close the observation interval at the last (previous)
++	 *   request dispatch or completion
++	 * - compute rate, if possible, for that observation interval
++	 * - start a new observation interval with this dispatch
++	 */
++	if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
++	    bfqd->rq_in_driver == 0) {
++		bfq_log(bfqd,
++"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d",
++			(now_ns - bfqd->last_dispatch)>>10,
++			bfqd->peak_rate_samples);
++		goto update_rate_and_reset;
++	}
++
++	/* Update sampling information */
++	bfqd->peak_rate_samples++;
++
++	if ((bfqd->rq_in_driver > 0 ||
++	     now_ns - bfqd->last_completion < BFQ_MIN_TT)
++	    && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
++		bfqd->sequential_samples++;
++
++	bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
++
++	/* Reset max observed rq size every 32 dispatches */
++	if (likely(bfqd->peak_rate_samples % 32))
++		bfqd->last_rq_max_size =
++			max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
++	else
++		bfqd->last_rq_max_size = blk_rq_sectors(rq);
++
++	bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
++
++	bfq_log(bfqd,
++	"update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus",
++		bfqd->peak_rate_samples, bfqd->sequential_samples,
++		bfqd->tot_sectors_dispatched,
++		bfqd->delta_from_first>>10);
++
++	/* Target observation interval not yet reached, go on sampling */
++	if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
++		goto update_last_values;
++
++update_rate_and_reset:
++	bfq_update_rate_reset(bfqd, rq);
++update_last_values:
++	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
++	bfqd->last_dispatch = now_ns;
++
++	bfq_log(bfqd,
++	"update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu",
++		(now_ns - bfqd->first_dispatch)>>10,
++		(unsigned long long) bfqd->last_position,
++		((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
++	bfq_log(bfqd,
++		"update_peak_rate: samples at end %d", bfqd->peak_rate_samples);
++}
++
++/*
++ * Move request from internal lists to the dispatch list of the request queue
++ */
++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
++{
++	struct bfq_queue *bfqq = RQ_BFQQ(rq);
++
++	/*
++	 * For consistency, the next instruction should have been executed
++	 * after removing the request from the queue and dispatching it.
++	 * We execute instead this instruction before bfq_remove_request()
++	 * (and hence introduce a temporary inconsistency), for efficiency.
++	 * In fact, in a forced_dispatch, this prevents two counters related
++	 * to bfqq->dispatched from being uselessly decremented if bfqq
++	 * is not in service, and then incremented again after
++	 * incrementing bfqq->dispatched.
++	 */
++	bfqq->dispatched++;
++	bfq_update_peak_rate(q->elevator->elevator_data, rq);
++
++	bfq_remove_request(rq);
++	elv_dispatch_sort(q, rq);
++}
++
++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
++{
++	BUG_ON(bfqq != bfqd->in_service_queue);
++
++	/*
++	 * If this bfqq is shared between multiple processes, check
++	 * to make sure that those processes are still issuing I/Os
++	 * within the mean seek distance. If not, it may be time to
++	 * break the queues apart again.
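++	 *
++	 * For instance, two processes that cooperated on a joint
++	 * sequential scan may later drift apart; once the shared
++	 * queue starts looking seeky, the split_coop flag set below
++	 * allows the queue to be split back into per-process queues.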
++	 */
++	if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
++		bfq_mark_bfqq_split_coop(bfqq);
++
++	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
++		if (bfqq->dispatched == 0)
++			/*
++			 * Overloading budget_timeout field to store
++			 * the time at which the queue remains with no
++			 * backlog and no outstanding request; used by
++			 * the weight-raising mechanism.
++			 */
++			bfqq->budget_timeout = jiffies;
++
++		bfq_del_bfqq_busy(bfqd, bfqq, true);
++	} else {
++		bfq_requeue_bfqq(bfqd, bfqq);
++		/*
++		 * Resort priority tree of potential close cooperators.
++		 */
++		bfq_pos_tree_add_move(bfqd, bfqq);
++	}
++
++	/*
++	 * All in-service entities must have been properly deactivated
++	 * or requeued before executing the next function, which
++	 * resets all in-service entities as no more in service.
++	 */
++	__bfq_bfqd_reset_in_service(bfqd);
++}
++
++/**
++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
++ * @bfqd: device data.
++ * @bfqq: queue to update.
++ * @reason: reason for expiration.
++ *
++ * Handle the feedback on @bfqq budget at queue expiration.
++ * See the body for detailed comments.
++ */
++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
++				     struct bfq_queue *bfqq,
++				     enum bfqq_expiration reason)
++{
++	struct request *next_rq;
++	int budget, min_budget;
++
++	BUG_ON(bfqq != bfqd->in_service_queue);
++
++	min_budget = bfq_min_budget(bfqd);
++
++	if (bfqq->wr_coeff == 1)
++		budget = bfqq->max_budget;
++	else /*
++	      * Use a constant, low budget for weight-raised queues,
++	      * to help achieve a low latency. Keep it slightly higher
++	      * than the minimum possible budget, to cause a little
++	      * bit fewer expirations.
++	      */
++		budget = 2 * min_budget;
++
++	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
++		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
++	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
++		budget, bfq_min_budget(bfqd));
++	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
++		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
++
++	if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {
++		switch (reason) {
++		/*
++		 * Caveat: in all the following cases we trade latency
++		 * for throughput.
++		 */
++		case BFQ_BFQQ_TOO_IDLE:
++			/*
++			 * This is the only case where we may reduce
++			 * the budget: if there is no request of the
++			 * process still waiting for completion, then
++			 * we assume (tentatively) that the timer has
++			 * expired because the batch of requests of
++			 * the process could have been served with a
++			 * smaller budget. Hence, betting that the
++			 * process will behave in the same way when it
++			 * becomes backlogged again, we reduce its
++			 * next budget. As long as we guess right,
++			 * this budget cut reduces the latency
++			 * experienced by the process.
++			 *
++			 * However, if there are still outstanding
++			 * requests, then the process may not yet have
++			 * issued its next request just because it is
++			 * still waiting for the completion of some of
++			 * the still outstanding ones. So in this
++			 * subcase we do not reduce its budget, on the
++			 * contrary we increase it to possibly boost
++			 * the throughput, as discussed in the
++			 * comments to the BUDGET_TIMEOUT case.
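++			 *
++			 * Numerically (values purely illustrative):
++			 * with min_budget = 64 sectors, a queue
++			 * expiring TOO_IDLE with no outstanding
++			 * requests and budget 512 is cut to
++			 * 512 - 4*64 = 256, whereas one with budget
++			 * 256 (not above 5*64 = 320) falls straight
++			 * to min_budget; with requests still
++			 * outstanding, the budget is instead doubled,
++			 * capped at bfq_max_budget.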
++			 */
++			if (bfqq->dispatched > 0) /* still outstanding reqs */
++				budget = min(budget * 2, bfqd->bfq_max_budget);
++			else {
++				if (budget > 5 * min_budget)
++					budget -= 4 * min_budget;
++				else
++					budget = min_budget;
++			}
++			break;
++		case BFQ_BFQQ_BUDGET_TIMEOUT:
++			/*
++			 * We double the budget here because it gives
++			 * the chance to boost the throughput if this
++			 * is not a seeky process (and has bumped into
++			 * this timeout because of, e.g., ZBR).
++			 */
++			budget = min(budget * 2, bfqd->bfq_max_budget);
++			break;
++		case BFQ_BFQQ_BUDGET_EXHAUSTED:
++			/*
++			 * The process still has backlog, and did not
++			 * let either the budget timeout or the disk
++			 * idling timeout expire. Hence it is not
++			 * seeky, has a short thinktime and may be
++			 * happy with a higher budget too. So
++			 * definitely increase the budget of this good
++			 * candidate to boost the disk throughput.
++			 */
++			budget = min(budget * 4, bfqd->bfq_max_budget);
++			break;
++		case BFQ_BFQQ_NO_MORE_REQUESTS:
++			/*
++			 * For queues that expire for this reason, it
++			 * is particularly important to keep the
++			 * budget close to the actual service they
++			 * need. Doing so reduces the timestamp
++			 * misalignment problem described in the
++			 * comments in the body of
++			 * __bfq_activate_entity. In fact, suppose
++			 * that a queue systematically expires for
++			 * BFQ_BFQQ_NO_MORE_REQUESTS and presents a
++			 * new request in time to enjoy timestamp
++			 * back-shifting. The larger the budget of the
++			 * queue is with respect to the service the
++			 * queue actually requests in each service
++			 * slot, the more times the queue can be
++			 * reactivated with the same virtual finish
++			 * time. It follows that, even if this finish
++			 * time is pushed to the system virtual time
++			 * to reduce the consequent timestamp
++			 * misalignment, the queue unjustly enjoys for
++			 * many re-activations a lower finish time
++			 * than all newly activated queues.
++			 *
++			 * The service needed by bfqq is measured
++			 * quite precisely by bfqq->entity.service.
++			 * Since bfqq does not enjoy device idling,
++			 * bfqq->entity.service is equal to the number
++			 * of sectors that the process associated with
++			 * bfqq requested to read/write before waiting
++			 * for request completions, or blocking for
++			 * other reasons.
++			 */
++			budget = max_t(int, bfqq->entity.service, min_budget);
++			break;
++		default:
++			return;
++		}
++	} else if (!bfq_bfqq_sync(bfqq))
++		/*
++		 * Async queues always get the maximum possible
++		 * budget, as for them we do not care about latency
++		 * (in addition, their ability to dispatch is limited
++		 * by the charging factor).
++		 */
++		budget = bfqd->bfq_max_budget;
++
++	bfqq->max_budget = budget;
++
++	if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
++	    !bfqd->bfq_user_max_budget)
++		bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
++
++	/*
++	 * If there is still backlog, then assign a new budget, making
++	 * sure that it is large enough for the next request. Since
++	 * the finish time of bfqq must be kept in sync with the
++	 * budget, be sure to call __bfq_bfqq_expire() *after* this
++	 * update.
++	 *
++	 * If there is no backlog, then no need to update the budget;
++	 * it will be updated on the arrival of a new request.
++	 */
++	next_rq = bfqq->next_rq;
++	if (next_rq) {
++		BUG_ON(reason == BFQ_BFQQ_TOO_IDLE ||
++		       reason == BFQ_BFQQ_NO_MORE_REQUESTS);
++		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
++					    bfq_serv_to_charge(next_rq, bfqq));
++		BUG_ON(!bfq_bfqq_busy(bfqq));
++		BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
++	}
++
++	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
++			next_rq ? blk_rq_sectors(next_rq) : 0,
++			bfqq->entity.budget);
++}
++
++/*
++ * Return true if the process associated with bfqq is "slow". The slow
++ * flag is used, in addition to the budget timeout, to reduce the
++ * amount of service provided to seeky processes, and thus reduce
++ * their chances to lower the throughput. More details in the comments
++ * on the function bfq_bfqq_expire().
++ *
++ * An important observation is in order: as discussed in the comments
++ * on the function bfq_update_peak_rate(), with devices with internal
++ * queues, it is hard, if possible at all, to know when and for how
++ * long an I/O request is processed by the device (apart from the
++ * trivial I/O pattern where a new request is dispatched only after
++ * the previous one has been completed). This makes it hard to
++ * evaluate the real rate at which the I/O requests of each bfq_queue
++ * are served. In fact, for an I/O scheduler like BFQ, serving a
++ * bfq_queue means just dispatching its requests during its service
++ * slot (i.e., until the budget of the queue is exhausted, or the
++ * queue remains idle, or, finally, a timeout fires). But, during the
++ * service slot of a bfq_queue, around 100 ms at most, the device may
++ * even still be processing requests of bfq_queues served in previous
++ * service slots. On the opposite end, the requests of the in-service
++ * bfq_queue may be completed after the service slot of the queue
++ * finishes.
++ *
++ * Anyway, unless more sophisticated solutions are used
++ * (where possible), the sum of the sizes of the requests dispatched
++ * during the service slot of a bfq_queue is probably the only
++ * approximation available for the service received by the bfq_queue
++ * during its service slot. And this sum is the quantity used in this
++ * function to evaluate the I/O speed of a process.
++ */
++static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
++			     bool compensate, enum bfqq_expiration reason,
++			     unsigned long *delta_ms)
++{
++	ktime_t delta_ktime;
++	u32 delta_usecs;
++	bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekiness */
++
++	if (!bfq_bfqq_sync(bfqq))
++		return false;
++
++	if (compensate)
++		delta_ktime = bfqd->last_idling_start;
++	else
++		delta_ktime = ktime_get();
++	delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
++	delta_usecs = ktime_to_us(delta_ktime);
++
++	/* don't use too short time intervals */
++	if (delta_usecs < 1000) {
++		if (blk_queue_nonrot(bfqd->queue))
++			/*
++			 * give same worst-case guarantees as idling
++			 * for seeky
++			 */
++			*delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
++		else /* charge at least one seek */
++			*delta_ms = bfq_slice_idle / NSEC_PER_MSEC;
++
++		bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs);
++
++		return slow;
++	}
++
++	*delta_ms = delta_usecs / USEC_PER_MSEC;
++
++	/*
++	 * Use only long (> 20ms) intervals to filter out excessive
++	 * spikes in service rate estimation.
++	 */
++	if (delta_usecs > 20000) {
++		/*
++		 * Caveat for rotational devices: processes doing I/O
++		 * in the slower disk zones tend to be slow(er) even
++		 * if not seeky. In this respect, the estimated peak
++		 * rate is likely to be an average over the disk
++		 * surface. Accordingly, to not be too harsh with
++		 * unlucky processes, a process is deemed slow only if
++		 * its rate has been lower than half of the estimated
++		 * peak rate.
++		 */
++		slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
++		bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d",
++			bfqq->entity.service, bfqd->bfq_max_budget);
++	}
++
++	bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);
++
++	return slow;
++}
++
++/*
++ * To be deemed as soft real-time, an application must meet two
++ * requirements. First, the application must not require an average
++ * bandwidth higher than the approximate bandwidth required to play
++ * back or record a compressed high-definition video.
++ * The next function is invoked on the completion of the last request of a
++ * batch, to compute the next-start time instant, soft_rt_next_start, such
++ * that, if the next request of the application does not arrive before
++ * soft_rt_next_start, then the above requirement on the bandwidth is met.
++ *
++ * The second requirement is that the request pattern of the application is
++ * isochronous, i.e., that, after issuing a request or a batch of requests,
++ * the application stops issuing new requests until all its pending requests
++ * have been completed. After that, the application may issue a new batch,
++ * and so on.
++ * For this reason the next function is invoked to compute
++ * soft_rt_next_start only for applications that meet this requirement,
++ * whereas soft_rt_next_start is set to infinity for applications that do
++ * not.
++ *
++ * Unfortunately, even a greedy application may happen to behave in an
++ * isochronous way if the CPU load is high. In fact, the application may
++ * stop issuing requests while the CPUs are busy serving other processes,
++ * then restart, then stop again for a while, and so on. In addition, if
++ * the disk achieves a low enough throughput with the request pattern
++ * issued by the application (e.g., because the request pattern is random
++ * and/or the device is slow), then the application may meet the above
++ * bandwidth requirement too. To prevent such a greedy application from
++ * being deemed as soft real-time, a further rule is used in the
++ * computation of soft_rt_next_start: soft_rt_next_start must be higher
++ * than the current time plus the maximum time for which the arrival of a
++ * request is waited for when a sync queue becomes idle, namely
++ * bfqd->bfq_slice_idle.
++ * This filters out greedy applications, as the latter issue instead their
++ * next request as soon as possible after the last one has been completed
++ * (in contrast, when a batch of requests is completed, a soft real-time
++ * application spends some time processing data).
++ *
++ * Unfortunately, the last filter may easily generate false positives if
++ * only bfqd->bfq_slice_idle is used as a reference time interval and one
++ * or both of the following cases occur:
++ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
++ *    than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
++ *    HZ=100.
++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
++ *    for a while, then suddenly 'jump' by several units to recover the lost
++ *    increments. This seems to happen, e.g., inside virtual machines.
++ * To address this issue, we do not use as a reference time interval just ++ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In ++ * particular we add the minimum number of jiffies for which the filter ++ * seems to be quite precise also in embedded systems and KVM/QEMU virtual ++ * machines. ++ */ ++static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqd, bfqq, ++"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", ++ bfqq->service_from_backlogged, ++ bfqd->bfq_wr_max_softrt_rate, ++ jiffies_to_msecs(HZ * bfqq->service_from_backlogged / ++ bfqd->bfq_wr_max_softrt_rate)); ++ ++ return max(bfqq->last_idle_bklogged + ++ HZ * bfqq->service_from_backlogged / ++ bfqd->bfq_wr_max_softrt_rate, ++ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); ++} ++ ++/* ++ * Return the farthest future time instant according to jiffies ++ * macros. ++ */ ++static unsigned long bfq_greatest_from_now(void) ++{ ++ return jiffies + MAX_JIFFY_OFFSET; ++} ++ ++/* ++ * Return the farthest past time instant according to jiffies ++ * macros. ++ */ ++static unsigned long bfq_smallest_from_now(void) ++{ ++ return jiffies - MAX_JIFFY_OFFSET; ++} ++ ++/** ++ * bfq_bfqq_expire - expire a queue. ++ * @bfqd: device owning the queue. ++ * @bfqq: the queue to expire. ++ * @compensate: if true, compensate for the time spent idling. ++ * @reason: the reason causing the expiration. ++ * ++ * If the process associated with bfqq does slow I/O (e.g., because it ++ * issues random requests), we charge bfqq with the time it has been ++ * in service instead of the service it has received (see ++ * bfq_bfqq_charge_time for details on how this goal is achieved). As ++ * a consequence, bfqq will typically get higher timestamps upon ++ * reactivation, and hence it will be rescheduled as if it had ++ * received more service than what it has actually received. In the ++ * end, bfqq receives less service in proportion to how slowly its ++ * associated process consumes its budgets (and hence how seriously it ++ * tends to lower the throughput). In addition, this time-charging ++ * strategy guarantees time fairness among slow processes. In ++ * contrast, if the process associated with bfqq is not slow, we ++ * charge bfqq exactly with the service it has received. ++ * ++ * Charging time to the first type of queues and the exact service to ++ * the other has the effect of using the WF2Q+ policy to schedule the ++ * former on a timeslice basis, without violating service domain ++ * guarantees among the latter. ++ */ ++static void bfq_bfqq_expire(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool compensate, ++ enum bfqq_expiration reason) ++{ ++ bool slow; ++ unsigned long delta = 0; ++ struct bfq_entity *entity = &bfqq->entity; ++ int ref; ++ ++ BUG_ON(bfqq != bfqd->in_service_queue); ++ ++ /* ++ * Check whether the process is slow (see bfq_bfqq_is_slow). ++ */ ++ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); ++ ++ /* ++ * Increase service_from_backlogged before next statement, ++ * because the possible next invocation of ++ * bfq_bfqq_charge_time would likely inflate ++ * entity->service. In contrast, service_from_backlogged must ++ * contain real service, to enable the soft real-time ++ * heuristic to correctly compute the bandwidth consumed by ++ * bfqq. 
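++	 *
++	 * As a sketch of why this order matters (figures purely
++	 * illustrative): if a seeky queue got only 30 sectors of
++	 * service before expiring, accumulating those 30 sectors
++	 * into service_from_backlogged first keeps the soft
++	 * real-time bandwidth estimate based on real service, even
++	 * if bfq_bfqq_charge_time later inflates entity->service for
++	 * scheduling purposes.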
++ */ ++ bfqq->service_from_backlogged += entity->service; ++ ++ /* ++ * As above explained, charge slow (typically seeky) and ++ * timed-out queues with the time and not the service ++ * received, to favor sequential workloads. ++ * ++ * Processes doing I/O in the slower disk zones will tend to ++ * be slow(er) even if not seeky. Therefore, since the ++ * estimated peak rate is actually an average over the disk ++ * surface, these processes may timeout just for bad luck. To ++ * avoid punishing them, do not charge time to processes that ++ * succeeded in consuming at least 2/3 of their budget. This ++ * allows BFQ to preserve enough elasticity to still perform ++ * bandwidth, and not time, distribution with little unlucky ++ * or quasi-sequential processes. ++ */ ++ if (bfqq->wr_coeff == 1 && ++ (slow || ++ (reason == BFQ_BFQQ_BUDGET_TIMEOUT && ++ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) ++ bfq_bfqq_charge_time(bfqd, bfqq, delta); ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ if (reason == BFQ_BFQQ_TOO_IDLE && ++ entity->service <= 2 * entity->budget / 10) ++ bfq_clear_bfqq_IO_bound(bfqq); ++ ++ if (bfqd->low_latency && bfqq->wr_coeff == 1) ++ bfqq->last_wr_start_finish = jiffies; ++ ++ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && ++ RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ /* ++ * If we get here, and there are no outstanding ++ * requests, then the request pattern is isochronous ++ * (see the comments on the function ++ * bfq_bfqq_softrt_next_start()). Thus we can compute ++ * soft_rt_next_start. If, instead, the queue still ++ * has outstanding requests, then we have to wait for ++ * the completion of all the outstanding requests to ++ * discover whether the request pattern is actually ++ * isochronous. ++ */ ++ BUG_ON(bfqd->busy_queues < 1); ++ if (bfqq->dispatched == 0) { ++ bfqq->soft_rt_next_start = ++ bfq_bfqq_softrt_next_start(bfqd, bfqq); ++ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", ++ bfqq->soft_rt_next_start); ++ } else { ++ /* ++ * The application is still waiting for the ++ * completion of one or more requests: ++ * prevent it from possibly being incorrectly ++ * deemed as soft real-time by setting its ++ * soft_rt_next_start to infinity. In fact, ++ * without this assignment, the application ++ * would be incorrectly deemed as soft ++ * real-time if: ++ * 1) it issued a new request before the ++ * completion of all its in-flight ++ * requests, and ++ * 2) at that time, its soft_rt_next_start ++ * happened to be in the past. ++ */ ++ bfqq->soft_rt_next_start = ++ bfq_greatest_from_now(); ++ /* ++ * Schedule an update of soft_rt_next_start to when ++ * the task may be discovered to be isochronous. ++ */ ++ bfq_mark_bfqq_softrt_update(bfqq); ++ } ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)", ++ reason, slow, bfqq->dispatched, ++ bfq_bfqq_idle_window(bfqq), entity->weight); ++ ++ /* ++ * Increase, decrease or leave budget unchanged according to ++ * reason. 
++ */ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); ++ BUG_ON(bfqq->next_rq == NULL && ++ bfqq->entity.budget < bfqq->entity.service); ++ ref = bfqq->ref; ++ __bfq_bfqq_expire(bfqd, bfqq); ++ ++ BUG_ON(ref > 1 && ++ !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && ++ !bfq_class_idle(bfqq)); ++ ++ /* mark bfqq as waiting a request only if a bic still points to it */ ++ if (ref > 1 && !bfq_bfqq_busy(bfqq) && ++ reason != BFQ_BFQQ_BUDGET_TIMEOUT && ++ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) ++ bfq_mark_bfqq_non_blocking_wait_rq(bfqq); ++} ++ ++/* ++ * Budget timeout is not implemented through a dedicated timer, but ++ * just checked on request arrivals and completions, as well as on ++ * idle timer expirations. ++ */ ++static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) ++{ ++ return time_is_before_eq_jiffies(bfqq->budget_timeout); ++} ++ ++/* ++ * If we expire a queue that is actively waiting (i.e., with the ++ * device idled) for the arrival of a new request, then we may incur ++ * the timestamp misalignment problem described in the body of the ++ * function __bfq_activate_entity. Hence we return true only if this ++ * condition does not hold, or if the queue is slow enough to deserve ++ * only to be kicked off for preserving a high throughput. ++ */ ++static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "may_budget_timeout: wait_request %d left %d timeout %d", ++ bfq_bfqq_wait_request(bfqq), ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, ++ bfq_bfqq_budget_timeout(bfqq)); ++ ++ return (!bfq_bfqq_wait_request(bfqq) || ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) ++ && ++ bfq_bfqq_budget_timeout(bfqq); ++} ++ ++/* ++ * For a queue that becomes empty, device idling is allowed only if ++ * this function returns true for that queue. As a consequence, since ++ * device idling plays a critical role for both throughput boosting ++ * and service guarantees, the return value of this function plays a ++ * critical role as well. ++ * ++ * In a nutshell, this function returns true only if idling is ++ * beneficial for throughput or, even if detrimental for throughput, ++ * idling is however necessary to preserve service guarantees (low ++ * latency, desired throughput distribution, ...). In particular, on ++ * NCQ-capable devices, this function tries to return false, so as to ++ * help keep the drives' internal queues full, whenever this helps the ++ * device boost the throughput without causing any service-guarantee ++ * issue. ++ * ++ * In more detail, the return value of this function is obtained by, ++ * first, computing a number of boolean variables that take into ++ * account throughput and service-guarantee issues, and, then, ++ * combining these variables in a logical expression. Most of the ++ * issues taken into account are not trivial. We discuss these issues ++ * while introducing the variables. ++ */ ++static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) ++{ ++ struct bfq_data *bfqd = bfqq->bfqd; ++ bool idling_boosts_thr, idling_boosts_thr_without_issues, ++ idling_needed_for_service_guarantees, ++ asymmetric_scenario; ++ ++ if (bfqd->strict_guarantees) ++ return true; ++ ++ /* ++ * The next variable takes into account the cases where idling ++ * boosts the throughput. 
++ * ++ * The value of the variable is computed considering, first, that ++ * idling is virtually always beneficial for the throughput if: ++ * (a) the device is not NCQ-capable, or ++ * (b) regardless of the presence of NCQ, the device is rotational ++ * and the request pattern for bfqq is I/O-bound and sequential. ++ * ++ * Secondly, and in contrast to the above item (b), idling an ++ * NCQ-capable flash-based device would not boost the ++ * throughput even with sequential I/O; rather it would lower ++ * the throughput in proportion to how fast the device ++ * is. Accordingly, the next variable is true if any of the ++ * above conditions (a) and (b) is true, and, in particular, ++ * happens to be false if bfqd is an NCQ-capable flash-based ++ * device. ++ */ ++ idling_boosts_thr = !bfqd->hw_tag || ++ (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && ++ bfq_bfqq_idle_window(bfqq)); ++ ++ /* ++ * The value of the next variable, ++ * idling_boosts_thr_without_issues, is equal to that of ++ * idling_boosts_thr, unless a special case holds. In this ++ * special case, described below, idling may cause problems to ++ * weight-raised queues. ++ * ++ * When the request pool is saturated (e.g., in the presence ++ * of write hogs), if the processes associated with ++ * non-weight-raised queues ask for requests at a lower rate, ++ * then processes associated with weight-raised queues have a ++ * higher probability to get a request from the pool ++ * immediately (or at least soon) when they need one. Thus ++ * they have a higher probability to actually get a fraction ++ * of the device throughput proportional to their high ++ * weight. This is especially true with NCQ-capable drives, ++ * which enqueue several requests in advance, and further ++ * reorder internally-queued requests. ++ * ++ * For this reason, we force to false the value of ++ * idling_boosts_thr_without_issues if there are weight-raised ++ * busy queues. In this case, and if bfqq is not weight-raised, ++ * this guarantees that the device is not idled for bfqq (if, ++ * instead, bfqq is weight-raised, then idling will be ++ * guaranteed by another variable, see below). Combined with ++ * the timestamping rules of BFQ (see [1] for details), this ++ * behavior causes bfqq, and hence any sync non-weight-raised ++ * queue, to get a lower number of requests served, and thus ++ * to ask for a lower number of requests from the request ++ * pool, before the busy weight-raised queues get served ++ * again. This often mitigates starvation problems in the ++ * presence of heavy write workloads and NCQ, thereby ++ * guaranteeing a higher application and system responsiveness ++ * in these hostile scenarios. ++ */ ++ idling_boosts_thr_without_issues = idling_boosts_thr && ++ bfqd->wr_busy_queues == 0; ++ ++ /* ++ * There is then a case where idling must be performed not ++ * for throughput concerns, but to preserve service ++ * guarantees. ++ * ++ * To introduce this case, we can note that allowing the drive ++ * to enqueue more than one request at a time, and hence ++ * delegating de facto final scheduling decisions to the ++ * drive's internal scheduler, entails loss of control on the ++ * actual request service order. In particular, the critical ++ * situation is when requests from different processes happen ++ * to be present, at the same time, in the internal queue(s) ++ * of the drive. 
In such a situation, the drive, by deciding ++ * the service order of the internally-queued requests, does ++ * determine also the actual throughput distribution among ++ * these processes. But the drive typically has no notion or ++ * concern about per-process throughput distribution, and ++ * makes its decisions only on a per-request basis. Therefore, ++ * the service distribution enforced by the drive's internal ++ * scheduler is likely to coincide with the desired ++ * device-throughput distribution only in a completely ++ * symmetric scenario where: ++ * (i) each of these processes must get the same throughput as ++ * the others; ++ * (ii) all these processes have the same I/O pattern ++ * (either sequential or random). ++ * In fact, in such a scenario, the drive will tend to treat ++ * the requests of each of these processes in about the same ++ * way as the requests of the others, and thus to provide ++ * each of these processes with about the same throughput ++ * (which is exactly the desired throughput distribution). In ++ * contrast, in any asymmetric scenario, device idling is ++ * certainly needed to guarantee that bfqq receives its ++ * assigned fraction of the device throughput (see [1] for ++ * details). ++ * ++ * We address this issue by controlling, actually, only the ++ * symmetry sub-condition (i), i.e., provided that ++ * sub-condition (i) holds, idling is not performed, ++ * regardless of whether sub-condition (ii) holds. In other ++ * words, only if sub-condition (i) holds, then idling is ++ * allowed, and the device tends to be prevented from queueing ++ * many requests, possibly of several processes. The reason ++ * for not controlling also sub-condition (ii) is that we ++ * exploit preemption to preserve guarantees in case of ++ * symmetric scenarios, even if (ii) does not hold, as ++ * explained in the next two paragraphs. ++ * ++ * Even if a queue, say Q, is expired when it remains idle, Q ++ * can still preempt the new in-service queue if the next ++ * request of Q arrives soon (see the comments on ++ * bfq_bfqq_update_budg_for_activation). If all queues and ++ * groups have the same weight, this form of preemption, ++ * combined with the hole-recovery heuristic described in the ++ * comments on function bfq_bfqq_update_budg_for_activation, ++ * are enough to preserve a correct bandwidth distribution in ++ * the mid term, even without idling. In fact, even if not ++ * idling allows the internal queues of the device to contain ++ * many requests, and thus to reorder requests, we can rather ++ * safely assume that the internal scheduler still preserves a ++ * minimum of mid-term fairness. The motivation for using ++ * preemption instead of idling is that, by not idling, ++ * service guarantees are preserved without minimally ++ * sacrificing throughput. In other words, both a high ++ * throughput and its desired distribution are obtained. ++ * ++ * More precisely, this preemption-based, idleless approach ++ * provides fairness in terms of IOPS, and not sectors per ++ * second. This can be seen with a simple example. Suppose ++ * that there are two queues with the same weight, but that ++ * the first queue receives requests of 8 sectors, while the ++ * second queue receives requests of 1024 sectors. In ++ * addition, suppose that each of the two queues contains at ++ * most one request at a time, which implies that each queue ++ * always remains idle after it is served. Finally, after ++ * remaining idle, each queue receives very quickly a new ++ * request. 
It follows that the two queues are served
++	 * alternately, preempting each other if needed. This
++	 * implies that, although both queues have the same weight,
++	 * the queue with large requests receives a service that is
++	 * 1024/8 times as high as the service received by the other
++	 * queue.
++	 *
++	 * On the other hand, device idling is performed, and thus
++	 * pure sector-domain guarantees are provided, for the
++	 * following queues, which are likely to need stronger
++	 * throughput guarantees: weight-raised queues, and queues
++	 * with a higher weight than other queues. When such queues
++	 * are active, sub-condition (i) is false, which triggers
++	 * device idling.
++	 *
++	 * According to the above considerations, the next variable is
++	 * true (only) if sub-condition (i) holds. To compute the
++	 * value of this variable, we not only use the return value of
++	 * the function bfq_symmetric_scenario(), but also check
++	 * whether bfqq is being weight-raised, because
++	 * bfq_symmetric_scenario() does not take weight-raised queues
++	 * into account (see the comments on bfq_weights_tree_add()).
++	 *
++	 * As a side note, it is worth considering that the above
++	 * device-idling countermeasures may however fail in the
++	 * following unlucky scenario: if idling is (correctly)
++	 * disabled in a time period during which all symmetry
++	 * sub-conditions hold, and hence the device is allowed to
++	 * enqueue many requests, but at some later point in time some
++	 * sub-condition stops holding, then it may become impossible
++	 * to let requests be served in the desired order until all
++	 * the requests already queued in the device have been served.
++	 */
++	asymmetric_scenario = bfqq->wr_coeff > 1 ||
++		!bfq_symmetric_scenario(bfqd);
++
++	/*
++	 * Finally, there is a case where maximizing throughput is the
++	 * best choice even if it may cause unfairness toward
++	 * bfqq. Such a case is when bfqq became active in a burst of
++	 * queue activations. Queues that became active during a large
++	 * burst benefit only from throughput, as discussed in the
++	 * comments on bfq_handle_burst. Thus, if bfqq became active
++	 * in a burst and not idling the device maximizes throughput,
++	 * then the device must not be idled, because not idling the
++	 * device provides bfqq and all other queues in the burst with
++	 * maximum benefit. Combining this and the above case, we can
++	 * now establish when idling is actually needed to preserve
++	 * service guarantees.
++	 */
++	idling_needed_for_service_guarantees =
++		asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
++
++	/*
++	 * We now have all the components we need to compute the return
++	 * value of the function, which is true only if both the following
++	 * conditions hold:
++	 * 1) bfqq is sync, because idling makes sense only for sync queues;
++	 * 2) idling either boosts the throughput (without issues), or
++	 *    is necessary to preserve service guarantees.
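++	 * (For example, on an NCQ-capable SSD with no weight-raised
++	 * queues and a symmetric weight distribution, both components
++	 * of the disjunction below evaluate to false, so the function
++	 * returns false and the drive is left free to fill its
++	 * internal queue.)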
++ */ ++ bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", ++ bfq_bfqq_sync(bfqq), idling_boosts_thr); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", ++ bfqd->wr_busy_queues, ++ idling_boosts_thr_without_issues, ++ bfq_bfqq_IO_bound(bfqq), ++ idling_needed_for_service_guarantees); ++ ++ return bfq_bfqq_sync(bfqq) && ++ (idling_boosts_thr_without_issues || ++ idling_needed_for_service_guarantees); ++} ++ ++/* ++ * If the in-service queue is empty but the function bfq_bfqq_may_idle ++ * returns true, then: ++ * 1) the queue must remain in service and cannot be expired, and ++ * 2) the device must be idled to wait for the possible arrival of a new ++ * request for the queue. ++ * See the comments on the function bfq_bfqq_may_idle for the reasons ++ * why performing device idling is the best choice to boost the throughput ++ * and preserve service guarantees when bfq_bfqq_may_idle itself ++ * returns true. ++ */ ++static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) ++{ ++ struct bfq_data *bfqd = bfqq->bfqd; ++ ++ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && ++ bfq_bfqq_may_idle(bfqq); ++} ++ ++/* ++ * Select a queue for service. If we have a current queue in service, ++ * check whether to continue servicing it, or retrieve and set a new one. ++ */ ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq; ++ struct request *next_rq; ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ ++ bfqq = bfqd->in_service_queue; ++ if (!bfqq) ++ goto new_queue; ++ ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); ++ ++ if (bfq_may_expire_for_budg_timeout(bfqq) && ++ !hrtimer_active(&bfqd->idle_slice_timer) && ++ !bfq_bfqq_must_idle(bfqq)) ++ goto expire; ++ ++check_queue: ++ /* ++ * This loop is rarely executed more than once. Even when it ++ * happens, it is much more convenient to re-execute this loop ++ * than to return NULL and trigger a new dispatch to get a ++ * request served. ++ */ ++ next_rq = bfqq->next_rq; ++ /* ++ * If bfqq has requests queued and it has enough budget left to ++ * serve them, keep the queue, otherwise expire it. ++ */ ++ if (next_rq) { ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ if (bfq_serv_to_charge(next_rq, bfqq) > ++ bfq_bfqq_budget_left(bfqq)) { ++ /* ++ * Expire the queue for budget exhaustion, ++ * which makes sure that the next budget is ++ * enough to serve the next request, even if ++ * it comes from the fifo expired path. ++ */ ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; ++ goto expire; ++ } else { ++ /* ++ * The idle timer may be pending because we may ++ * not disable disk idling even when a new request ++ * arrives. ++ */ ++ if (bfq_bfqq_wait_request(bfqq)) { ++ BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); ++ /* ++ * If we get here: 1) at least a new request ++ * has arrived but we have not disabled the ++ * timer because the request was too small, ++ * 2) then the block layer has unplugged ++ * the device, causing the dispatch to be ++ * invoked. ++ * ++ * Since the device is unplugged, now the ++ * requests are probably large enough to ++ * provide a reasonable throughput. ++ * So we disable idling. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); ++ bfqg_stats_update_idle_time(bfqq_group(bfqq)); ++ } ++ goto keep_queue; ++ } ++ } ++ ++ /* ++ * No requests pending. 
However, if the in-service queue is idling ++ * for a new request, or has requests waiting for a completion and ++ * may idle after their completion, then keep it anyway. ++ */ ++ if (hrtimer_active(&bfqd->idle_slice_timer) || ++ (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { ++ bfqq = NULL; ++ goto keep_queue; ++ } ++ ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS; ++expire: ++ bfq_bfqq_expire(bfqd, bfqq, false, reason); ++new_queue: ++ bfqq = bfq_set_in_service_queue(bfqd); ++ if (bfqq) { ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); ++ goto check_queue; ++ } ++keep_queue: ++ if (bfqq) ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); ++ else ++ bfq_log(bfqd, "select_queue: no queue returned"); ++ ++ return bfqq; ++} ++ ++static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ ++ BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && ++ time_is_after_jiffies(bfqq->last_wr_start_finish)); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", ++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time), ++ bfqq->wr_coeff, ++ bfqq->entity.weight, bfqq->entity.orig_weight); ++ ++ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != ++ entity->orig_weight * bfqq->wr_coeff); ++ if (entity->prio_changed) ++ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); ++ ++ /* ++ * If the queue was activated in a burst, or too much ++ * time has elapsed from the beginning of this ++ * weight-raising period, then end weight raising. ++ */ ++ if (bfq_bfqq_in_large_burst(bfqq)) ++ bfq_bfqq_end_wr(bfqq); ++ else if (time_is_before_jiffies(bfqq->last_wr_start_finish + ++ bfqq->wr_cur_max_time)) { ++ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || ++ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + ++ bfq_wr_duration(bfqd))) ++ bfq_bfqq_end_wr(bfqq); ++ else { ++ /* switch back to interactive wr */ ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ bfqq->last_wr_start_finish = ++ bfqq->wr_start_at_switch_to_srt; ++ BUG_ON(time_is_after_jiffies( ++ bfqq->last_wr_start_finish)); ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "back to interactive wr"); ++ } ++ } ++ } ++ /* ++ * To improve latency (for this or other queues), immediately ++ * update weight both if it must be raised and if it must be ++ * lowered. Since, entity may be on some active tree here, and ++ * might have a pending change of its ioprio class, invoke ++ * next function with the last parameter unset (see the ++ * comments on the function). ++ */ ++ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) ++ __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity), ++ entity, false); ++} ++ ++/* ++ * Dispatch one request from bfqq, moving it to the request queue ++ * dispatch list. 
++ */
++static int bfq_dispatch_request(struct bfq_data *bfqd,
++				struct bfq_queue *bfqq)
++{
++	int dispatched = 0;
++	struct request *rq = bfqq->next_rq;
++	unsigned long service_to_charge;
++
++	BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
++	BUG_ON(!rq);
++	service_to_charge = bfq_serv_to_charge(rq, bfqq);
++
++	BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq));
++
++	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
++
++	bfq_bfqq_served(bfqq, service_to_charge);
++
++	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
++
++	bfq_dispatch_insert(bfqd->queue, rq);
++
++	/*
++	 * If weight raising has to terminate for bfqq, then the next
++	 * function causes an immediate update of bfqq's weight,
++	 * without waiting for the next activation. As a consequence,
++	 * on expiration, bfqq will be timestamped as if it had never
++	 * been weight-raised during this service slot, even if it has
++	 * received part or even most of the service as a
++	 * weight-raised queue. This inflates bfqq's timestamps, which
++	 * is beneficial, as bfqq is then more willing to leave the
++	 * device immediately to possible other weight-raised queues.
++	 */
++	bfq_update_wr_data(bfqd, bfqq);
++
++	bfq_log_bfqq(bfqd, bfqq,
++			"dispatched %u sec req (%llu), budg left %d",
++			blk_rq_sectors(rq),
++			(unsigned long long) blk_rq_pos(rq),
++			bfq_bfqq_budget_left(bfqq));
++
++	dispatched++;
++
++	if (!bfqd->in_service_bic) {
++		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
++		bfqd->in_service_bic = RQ_BIC(rq);
++	}
++
++	if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
++		goto expire;
++
++	return dispatched;
++
++expire:
++	bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED);
++	return dispatched;
++}
++
++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
++{
++	int dispatched = 0;
++
++	while (bfqq->next_rq) {
++		bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
++		dispatched++;
++	}
++
++	BUG_ON(!list_empty(&bfqq->fifo));
++	return dispatched;
++}
++
++/*
++ * Drain our current requests.
++ * Used for barriers and when switching io schedulers on-the-fly.
++ */
++static int bfq_forced_dispatch(struct bfq_data *bfqd)
++{
++	struct bfq_queue *bfqq, *n;
++	struct bfq_service_tree *st;
++	int dispatched = 0;
++
++	bfqq = bfqd->in_service_queue;
++	if (bfqq)
++		__bfq_bfqq_expire(bfqd, bfqq);
++
++	/*
++	 * Loop through classes, and be careful to leave the scheduler
++	 * in a consistent state, as feedback mechanisms and vtime
++	 * updates cannot be disabled during the process.
++	 */
++	list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
++		st = bfq_entity_service_tree(&bfqq->entity);
++
++		dispatched += __bfq_forced_dispatch_bfqq(bfqq);
++
++		bfqq->max_budget = bfq_max_budget(bfqd);
++		bfq_forget_idle(st);
++	}
++
++	BUG_ON(bfqd->busy_queues != 0);
++
++	return dispatched;
++}
++
++static int bfq_dispatch_requests(struct request_queue *q, int force)
++{
++	struct bfq_data *bfqd = q->elevator->elevator_data;
++	struct bfq_queue *bfqq;
++
++	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
++
++	if (bfqd->busy_queues == 0)
++		return 0;
++
++	if (unlikely(force))
++		return bfq_forced_dispatch(bfqd);
++
++	/*
++	 * Force device to serve one request at a time if
++	 * strict_guarantees is true. Forcing this service scheme is
++	 * currently the ONLY way to guarantee that the request
++	 * service order enforced by the scheduler is respected by a
++	 * queueing device. Otherwise the device is free even to make
++	 * some unlucky request wait for as long as the device
++	 * wishes.
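++	 * (E.g., an NCQ drive with a queue depth of 32 may internally
++	 * reorder up to 32 dispatched requests, regardless of the
++	 * order in which this function handed them over.)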
++	 *
++	 * Of course, serving one request at a time may cause loss of
++	 * throughput.
++	 */
++	if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
++		return 0;
++
++	bfqq = bfq_select_queue(bfqd);
++	if (!bfqq)
++		return 0;
++
++	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
++
++	BUG_ON(bfq_bfqq_wait_request(bfqq));
++
++	if (!bfq_dispatch_request(bfqd, bfqq))
++		return 0;
++
++	bfq_log_bfqq(bfqd, bfqq, "dispatched %s request",
++			bfq_bfqq_sync(bfqq) ? "sync" : "async");
++
++	BUG_ON(bfqq->next_rq == NULL &&
++	       bfqq->entity.budget < bfqq->entity.service);
++	return 1;
++}
++
++/*
++ * Task holds one reference to the queue, dropped when task exits. Each rq
++ * in-flight on this queue also holds a reference, dropped when rq is freed.
++ *
++ * Queue lock must be held here. Recall not to use bfqq after calling
++ * this function on it.
++ */
++static void bfq_put_queue(struct bfq_queue *bfqq)
++{
++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED
++	struct bfq_group *bfqg = bfqq_group(bfqq);
++#endif
++
++	BUG_ON(bfqq->ref <= 0);
++
++	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref);
++	bfqq->ref--;
++	if (bfqq->ref)
++		return;
++
++	BUG_ON(rb_first(&bfqq->sort_list));
++	BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
++	BUG_ON(bfqq->entity.tree);
++	BUG_ON(bfq_bfqq_busy(bfqq));
++
++	if (bfq_bfqq_sync(bfqq))
++		/*
++		 * The fact that this queue is being destroyed does not
++		 * invalidate the fact that this queue may have been
++		 * activated during the current burst. As a consequence,
++		 * although the queue does not exist anymore, and hence
++		 * needs to be removed from the burst list if there,
++		 * the burst size need not be decremented.
++		 */
++		hlist_del_init(&bfqq->burst_list_node);
++
++	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
++
++	kmem_cache_free(bfq_pool, bfqq);
++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED
++	bfqg_put(bfqg);
++#endif
++}
++
++static void bfq_put_cooperator(struct bfq_queue *bfqq)
++{
++	struct bfq_queue *__bfqq, *next;
++
++	/*
++	 * If this queue was scheduled to merge with another queue, be
++	 * sure to drop the reference taken on that queue (and others in
++	 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
++	 */
++	__bfqq = bfqq->new_bfqq;
++	while (__bfqq) {
++		if (__bfqq == bfqq)
++			break;
++		next = __bfqq->new_bfqq;
++		bfq_put_queue(__bfqq);
++		__bfqq = next;
++	}
++}
++
++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
++{
++	if (bfqq == bfqd->in_service_queue) {
++		__bfq_bfqq_expire(bfqd, bfqq);
++		bfq_schedule_dispatch(bfqd);
++	}
++
++	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
++
++	bfq_put_cooperator(bfqq);
++
++	bfq_put_queue(bfqq); /* release process reference */
++}
++
++static void bfq_init_icq(struct io_cq *icq)
++{
++	icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32);
++}
++
++static void bfq_exit_icq(struct io_cq *icq)
++{
++	struct bfq_io_cq *bic = icq_to_bic(icq);
++	struct bfq_data *bfqd = bic_to_bfqd(bic);
++
++	if (bic_to_bfqq(bic, false)) {
++		bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false));
++		bic_set_bfqq(bic, NULL, false);
++	}
++
++	if (bic_to_bfqq(bic, true)) {
++		/*
++		 * If the bic is using a shared queue, put the reference
++		 * taken on the io_context when the bic started using a
++		 * shared bfq_queue.
++ */ ++ if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) ++ put_io_context(icq->ioc); ++ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); ++ bic_set_bfqq(bic, NULL, true); ++ } ++} ++ ++/* ++ * Update the entity prio values; note that the new values will not ++ * be used until the next (re)activation. ++ */ ++static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic) ++{ ++ struct task_struct *tsk = current; ++ int ioprio_class; ++ ++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); ++ switch (ioprio_class) { ++ default: ++ dev_err(bfqq->bfqd->queue->backing_dev_info->dev, ++ "bfq: bad prio class %d\n", ioprio_class); ++ case IOPRIO_CLASS_NONE: ++ /* ++ * No prio set, inherit CPU scheduling settings. ++ */ ++ bfqq->new_ioprio = task_nice_ioprio(tsk); ++ bfqq->new_ioprio_class = task_nice_ioclass(tsk); ++ break; ++ case IOPRIO_CLASS_RT: ++ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ bfqq->new_ioprio_class = IOPRIO_CLASS_RT; ++ break; ++ case IOPRIO_CLASS_BE: ++ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ bfqq->new_ioprio_class = IOPRIO_CLASS_BE; ++ break; ++ case IOPRIO_CLASS_IDLE: ++ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; ++ bfqq->new_ioprio = 7; ++ bfq_clear_bfqq_idle_window(bfqq); ++ break; ++ } ++ ++ if (bfqq->new_ioprio >= IOPRIO_BE_NR) { ++ pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", ++ bfqq->new_ioprio); ++ BUG(); ++ } ++ ++ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "set_next_ioprio_data: bic_class %d prio %d class %d", ++ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); ++} ++ ++static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) ++{ ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ struct bfq_queue *bfqq; ++ unsigned long uninitialized_var(flags); ++ int ioprio = bic->icq.ioc->ioprio; ++ ++ /* ++ * This condition may trigger on a newly created bic, be sure to ++ * drop the lock before returning. 
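++	 * (bic->ioprio caches the io_context value, so the body below
++	 * runs only when the task has actually changed its I/O
++	 * priority, e.g., via ionice.)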
++ */ ++ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) ++ return; ++ ++ bic->ioprio = ioprio; ++ ++ bfqq = bic_to_bfqq(bic, false); ++ if (bfqq) { ++ /* release process reference on this queue */ ++ bfq_put_queue(bfqq); ++ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); ++ bic_set_bfqq(bic, bfqq, false); ++ bfq_log_bfqq(bfqd, bfqq, ++ "check_ioprio_change: bfqq %p %d", ++ bfqq, bfqq->ref); ++ } ++ ++ bfqq = bic_to_bfqq(bic, true); ++ if (bfqq) ++ bfq_set_next_ioprio_data(bfqq, bic); ++} ++ ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic, pid_t pid, int is_sync) ++{ ++ RB_CLEAR_NODE(&bfqq->entity.rb_node); ++ INIT_LIST_HEAD(&bfqq->fifo); ++ INIT_HLIST_NODE(&bfqq->burst_list_node); ++ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); ++ ++ bfqq->ref = 0; ++ bfqq->bfqd = bfqd; ++ ++ if (bic) ++ bfq_set_next_ioprio_data(bfqq, bic); ++ ++ if (is_sync) { ++ if (!bfq_class_idle(bfqq)) ++ bfq_mark_bfqq_idle_window(bfqq); ++ bfq_mark_bfqq_sync(bfqq); ++ bfq_mark_bfqq_just_created(bfqq); ++ } else ++ bfq_clear_bfqq_sync(bfqq); ++ bfq_mark_bfqq_IO_bound(bfqq); ++ ++ /* Tentative initial value to trade off between thr and lat */ ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; ++ bfqq->pid = pid; ++ ++ bfqq->wr_coeff = 1; ++ bfqq->last_wr_start_finish = jiffies; ++ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); ++ bfqq->budget_timeout = bfq_smallest_from_now(); ++ bfqq->split_time = bfq_smallest_from_now(); ++ ++ /* ++ * Set to the value for which bfqq will not be deemed as ++ * soft rt when it becomes backlogged. ++ */ ++ bfqq->soft_rt_next_start = bfq_greatest_from_now(); ++ ++ /* first request is almost certainly seeky */ ++ bfqq->seek_history = 1; ++} ++ ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ int ioprio_class, int ioprio) ++{ ++ switch (ioprio_class) { ++ case IOPRIO_CLASS_RT: ++ return &bfqg->async_bfqq[0][ioprio]; ++ case IOPRIO_CLASS_NONE: ++ ioprio = IOPRIO_NORM; ++ /* fall through */ ++ case IOPRIO_CLASS_BE: ++ return &bfqg->async_bfqq[1][ioprio]; ++ case IOPRIO_CLASS_IDLE: ++ return &bfqg->async_idle_bfqq; ++ default: ++ BUG(); ++ } ++} ++ ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, ++ struct bio *bio, bool is_sync, ++ struct bfq_io_cq *bic) ++{ ++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); ++ struct bfq_queue **async_bfqq = NULL; ++ struct bfq_queue *bfqq; ++ struct bfq_group *bfqg; ++ ++ rcu_read_lock(); ++ ++ bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); ++ if (!bfqg) { ++ bfqq = &bfqd->oom_bfqq; ++ goto out; ++ } ++ ++ if (!is_sync) { ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ++ ioprio); ++ bfqq = *async_bfqq; ++ if (bfqq) ++ goto out; ++ } ++ ++ bfqq = kmem_cache_alloc_node(bfq_pool, ++ GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, ++ bfqd->queue->node); ++ ++ if (bfqq) { ++ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, ++ is_sync); ++ bfq_init_entity(&bfqq->entity, bfqg); ++ bfq_log_bfqq(bfqd, bfqq, "allocated"); ++ } else { ++ bfqq = &bfqd->oom_bfqq; ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); ++ goto out; ++ } ++ ++ /* ++ * Pin the queue now that it's allocated, scheduler exit will ++ * prune it. ++ */ ++ if (async_bfqq) { ++ bfqq->ref++; /* ++ * Extra group reference, w.r.t. sync ++ * queue. This extra reference is removed ++ * only if bfqq->bfqg disappears, to ++ * guarantee that this queue is not freed ++ * until its group goes away. 
++ */ ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", ++ bfqq, bfqq->ref); ++ *async_bfqq = bfqq; ++ } ++ ++out: ++ bfqq->ref++; /* get a process reference to this queue */ ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); ++ rcu_read_unlock(); ++ return bfqq; ++} ++ ++static void bfq_update_io_thinktime(struct bfq_data *bfqd, ++ struct bfq_io_cq *bic) ++{ ++ struct bfq_ttime *ttime = &bic->ttime; ++ u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; ++ ++ elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); ++ ++ ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; ++ ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); ++ ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, ++ ttime->ttime_samples); ++} ++ ++static void ++bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ bfqq->seek_history <<= 1; ++ bfqq->seek_history |= ++ get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR && ++ (!blk_queue_nonrot(bfqd->queue) || ++ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); ++} ++ ++/* ++ * Disable idle window if the process thinks too long or seeks so much that ++ * it doesn't matter. ++ */ ++static void bfq_update_idle_window(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic) ++{ ++ int enable_idle; ++ ++ /* Don't idle for async or idle io prio class. */ ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) ++ return; ++ ++ /* Idle window just restored, statistics are meaningless. */ ++ if (time_is_after_eq_jiffies(bfqq->split_time + ++ bfqd->bfq_wr_min_idle_time)) ++ return; ++ ++ enable_idle = bfq_bfqq_idle_window(bfqq); ++ ++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || ++ bfqd->bfq_slice_idle == 0 || ++ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && ++ bfqq->wr_coeff == 1)) ++ enable_idle = 0; ++ else if (bfq_sample_valid(bic->ttime.ttime_samples)) { ++ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && ++ bfqq->wr_coeff == 1) ++ enable_idle = 0; ++ else ++ enable_idle = 1; ++ } ++ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", ++ enable_idle); ++ ++ if (enable_idle) ++ bfq_mark_bfqq_idle_window(bfqq); ++ else ++ bfq_clear_bfqq_idle_window(bfqq); ++} ++ ++/* ++ * Called when a new fs request (rq) is added to bfqq. Check if there's ++ * something we should do about it. ++ */ ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ struct bfq_io_cq *bic = RQ_BIC(rq); ++ ++ if (rq->cmd_flags & REQ_META) ++ bfqq->meta_pending++; ++ ++ bfq_update_io_thinktime(bfqd, bic); ++ bfq_update_io_seektime(bfqd, bfqq, rq); ++ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || ++ !BFQQ_SEEKY(bfqq)) ++ bfq_update_idle_window(bfqd, bfqq, bic); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "rq_enqueued: idle_window=%d (seeky %d)", ++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); ++ ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ ++ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { ++ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && ++ blk_rq_sectors(rq) < 32; ++ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); ++ ++ /* ++ * There is just this request queued: if the request ++ * is small and the queue is not to be expired, then ++ * just exit. ++ * ++ * In this way, if the device is being idled to wait ++ * for a new request from the in-service queue, we ++ * avoid unplugging the device and committing the ++ * device to serve just a small request. 
On the ++ * contrary, we wait for the block layer to decide ++ * when to unplug the device: hopefully, new requests ++ * will be merged to this one quickly, then the device ++ * will be unplugged and larger requests will be ++ * dispatched. ++ */ ++ if (small_req && !budget_timeout) ++ return; ++ ++ /* ++ * A large enough request arrived, or the queue is to ++ * be expired: in both cases disk idling is to be ++ * stopped, so clear wait_request flag and reset ++ * timer. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); ++ bfqg_stats_update_idle_time(bfqq_group(bfqq)); ++ ++ /* ++ * The queue is not empty, because a new request just ++ * arrived. Hence we can safely expire the queue, in ++ * case of budget timeout, without risking that the ++ * timestamps of the queue are not updated correctly. ++ * See [1] for more details. ++ */ ++ if (budget_timeout) ++ bfq_bfqq_expire(bfqd, bfqq, false, ++ BFQ_BFQQ_BUDGET_TIMEOUT); ++ ++ /* ++ * Let the request rip immediately, or let a new queue be ++ * selected if bfqq has just been expired. ++ */ ++ __blk_run_queue(bfqd->queue); ++ } ++} ++ ++static void bfq_insert_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; ++ ++ assert_spin_locked(bfqd->queue->queue_lock); ++ ++ /* ++ * An unplug may trigger a requeue of a request from the device ++ * driver: make sure we are in process context while trying to ++ * merge two bfq_queues. ++ */ ++ if (!in_interrupt()) { ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); ++ if (new_bfqq) { ++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) ++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); ++ /* ++ * Release the request's reference to the old bfqq ++ * and make sure one is taken to the shared queue. ++ */ ++ new_bfqq->allocated[rq_data_dir(rq)]++; ++ bfqq->allocated[rq_data_dir(rq)]--; ++ new_bfqq->ref++; ++ bfq_clear_bfqq_just_created(bfqq); ++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) ++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), ++ bfqq, new_bfqq); ++ /* ++ * rq is about to be enqueued into new_bfqq, ++ * release rq reference on bfqq ++ */ ++ bfq_put_queue(bfqq); ++ rq->elv.priv[1] = new_bfqq; ++ bfqq = new_bfqq; ++ } ++ } ++ ++ bfq_add_request(rq); ++ ++ rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; ++ list_add_tail(&rq->queuelist, &bfqq->fifo); ++ ++ bfq_rq_enqueued(bfqd, bfqq, rq); ++} ++ ++static void bfq_update_hw_tag(struct bfq_data *bfqd) ++{ ++ bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, ++ bfqd->rq_in_driver); ++ ++ if (bfqd->hw_tag == 1) ++ return; ++ ++ /* ++ * This sample is valid if the number of outstanding requests ++ * is large enough to allow a queueing behavior. Note that the ++ * sum is not exact, as it's not taking into account deactivated ++ * requests. 
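++	 * (hw_tag is therefore set only after BFQ_HW_QUEUE_SAMPLES
++	 * valid samples, and only if more than BFQ_HW_QUEUE_THRESHOLD
++	 * requests have been observed in flight at the same time at
++	 * least once.)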
++	 */
++	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
++		return;
++
++	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
++		return;
++
++	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
++	bfqd->max_rq_in_driver = 0;
++	bfqd->hw_tag_samples = 0;
++}
++
++static void bfq_completed_request(struct request_queue *q, struct request *rq)
++{
++	struct bfq_queue *bfqq = RQ_BFQQ(rq);
++	struct bfq_data *bfqd = bfqq->bfqd;
++	u64 now_ns;
++	u32 delta_us;
++
++	bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left",
++		     blk_rq_sectors(rq));
++
++	assert_spin_locked(bfqd->queue->queue_lock);
++	bfq_update_hw_tag(bfqd);
++
++	BUG_ON(!bfqd->rq_in_driver);
++	BUG_ON(!bfqq->dispatched);
++	bfqd->rq_in_driver--;
++	bfqq->dispatched--;
++	bfqg_stats_update_completion(bfqq_group(bfqq),
++				     rq_start_time_ns(rq),
++				     rq_io_start_time_ns(rq),
++				     rq->cmd_flags);
++
++	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
++		BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
++		/*
++		 * Set budget_timeout (which we overload to store the
++		 * time at which the queue remains with no backlog and
++		 * no outstanding request; used by the weight-raising
++		 * mechanism).
++		 */
++		bfqq->budget_timeout = jiffies;
++
++		bfq_weights_tree_remove(bfqd, &bfqq->entity,
++					&bfqd->queue_weights_tree);
++	}
++
++	now_ns = ktime_get_ns();
++
++	RQ_BIC(rq)->ttime.last_end_request = now_ns;
++
++	/*
++	 * Using us instead of ns, to get a reasonable precision in
++	 * computing rate in next check.
++	 */
++	delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
++
++	bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu",
++		delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size,
++		(USEC_PER_SEC*
++		(u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us))
++			>>BFQ_RATE_SHIFT,
++		(USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT);
++
++	/*
++	 * If the request took rather long to complete, and, according
++	 * to the maximum request size recorded, this completion latency
++	 * implies that the request was certainly served at a very low
++	 * rate (less than 1M sectors/sec), then the whole observation
++	 * interval that lasts up to this time instant cannot be a
++	 * valid time interval for computing a new peak rate. Invoke
++	 * bfq_update_rate_reset to have the following three steps
++	 * taken:
++	 * - close the observation interval at the last (previous)
++	 *   request dispatch or completion
++	 * - compute rate, if possible, for that observation interval
++	 * - reset to zero samples, which will trigger a proper
++	 *   re-initialization of the observation interval on next
++	 *   dispatch
++	 */
++	if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
++	    (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
++			1UL<<(BFQ_RATE_SHIFT - 10))
++		bfq_update_rate_reset(bfqd, NULL);
++
++	bfqd->last_completion = now_ns;
++
++	/*
++	 * If we are waiting to discover whether the request pattern
++	 * of the task associated with the queue is actually
++	 * isochronous, and both requisites for this condition to hold
++	 * are now satisfied, then compute soft_rt_next_start (see the
++	 * comments on the function bfq_bfqq_softrt_next_start()). We
++	 * schedule this delayed check when bfqq expires, if it still
++	 * has in-flight requests.
++	 */
++	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
++	    RB_EMPTY_ROOT(&bfqq->sort_list))
++		bfqq->soft_rt_next_start =
++			bfq_bfqq_softrt_next_start(bfqd, bfqq);
++
++	/*
++	 * If this is the in-service queue, check if it needs to be expired,
++	 * or if we want to idle in case it has no pending requests.
++ */ ++ if (bfqd->in_service_queue == bfqq) { ++ if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { ++ bfq_arm_slice_timer(bfqd); ++ goto out; ++ } else if (bfq_may_expire_for_budg_timeout(bfqq)) ++ bfq_bfqq_expire(bfqd, bfqq, false, ++ BFQ_BFQQ_BUDGET_TIMEOUT); ++ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && ++ (bfqq->dispatched == 0 || ++ !bfq_bfqq_may_idle(bfqq))) ++ bfq_bfqq_expire(bfqd, bfqq, false, ++ BFQ_BFQQ_NO_MORE_REQUESTS); ++ } ++ ++ if (!bfqd->rq_in_driver) ++ bfq_schedule_dispatch(bfqd); ++ ++out: ++ return; ++} ++ ++static int __bfq_may_queue(struct bfq_queue *bfqq) ++{ ++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { ++ bfq_clear_bfqq_must_alloc(bfqq); ++ return ELV_MQUEUE_MUST; ++ } ++ ++ return ELV_MQUEUE_MAY; ++} ++ ++static int bfq_may_queue(struct request_queue *q, unsigned int op) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct task_struct *tsk = current; ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq; ++ ++ /* ++ * Don't force setup of a queue from here, as a call to may_queue ++ * does not necessarily imply that a request actually will be ++ * queued. So just lookup a possibly existing queue, or return ++ * 'may queue' if that fails. ++ */ ++ bic = bfq_bic_lookup(bfqd, tsk->io_context); ++ if (!bic) ++ return ELV_MQUEUE_MAY; ++ ++ bfqq = bic_to_bfqq(bic, op_is_sync(op)); ++ if (bfqq) ++ return __bfq_may_queue(bfqq); ++ ++ return ELV_MQUEUE_MAY; ++} ++ ++/* ++ * Queue lock held here. ++ */ ++static void bfq_put_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ if (bfqq) { ++ const int rw = rq_data_dir(rq); ++ ++ BUG_ON(!bfqq->allocated[rw]); ++ bfqq->allocated[rw]--; ++ ++ rq->elv.priv[0] = NULL; ++ rq->elv.priv[1] = NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", ++ bfqq, bfqq->ref); ++ bfq_put_queue(bfqq); ++ } ++} ++ ++/* ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this ++ * was the last process referring to that bfqq. ++ */ ++static struct bfq_queue * ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); ++ ++ put_io_context(bic->icq.ioc); ++ ++ if (bfqq_process_refs(bfqq) == 1) { ++ bfqq->pid = current->pid; ++ bfq_clear_bfqq_coop(bfqq); ++ bfq_clear_bfqq_split_coop(bfqq); ++ return bfqq; ++ } ++ ++ bic_set_bfqq(bic, NULL, 1); ++ ++ bfq_put_cooperator(bfqq); ++ ++ bfq_put_queue(bfqq); ++ return NULL; ++} ++ ++/* ++ * Allocate bfq data structures associated with this request. 
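++ * The chosen bic and bfq_queue are stored in rq->elv.priv[0] and
++ * rq->elv.priv[1], respectively, so that the other elevator hooks can
++ * retrieve them through RQ_BIC() and RQ_BFQQ().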
++ */ ++static int bfq_set_request(struct request_queue *q, struct request *rq, ++ struct bio *bio, gfp_t gfp_mask) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); ++ const int rw = rq_data_dir(rq); ++ const int is_sync = rq_is_sync(rq); ++ struct bfq_queue *bfqq; ++ unsigned long flags; ++ bool bfqq_already_existing = false, split = false; ++ ++ spin_lock_irqsave(q->queue_lock, flags); ++ ++ if (!bic) ++ goto queue_fail; ++ ++ bfq_check_ioprio_change(bic, bio); ++ ++ bfq_bic_update_cgroup(bic, bio); ++ ++new_queue: ++ bfqq = bic_to_bfqq(bic, is_sync); ++ if (!bfqq || bfqq == &bfqd->oom_bfqq) { ++ if (bfqq) ++ bfq_put_queue(bfqq); ++ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); ++ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); ++ ++ bic_set_bfqq(bic, bfqq, is_sync); ++ if (split && is_sync) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "set_request: was_in_list %d " ++ "was_in_large_burst %d " ++ "large burst in progress %d", ++ bic->was_in_burst_list, ++ bic->saved_in_large_burst, ++ bfqd->large_burst); ++ ++ if ((bic->was_in_burst_list && bfqd->large_burst) || ++ bic->saved_in_large_burst) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "set_request: marking in " ++ "large burst"); ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ } else { ++ bfq_log_bfqq(bfqd, bfqq, ++ "set_request: clearing in " ++ "large burst"); ++ bfq_clear_bfqq_in_large_burst(bfqq); ++ if (bic->was_in_burst_list) ++ hlist_add_head(&bfqq->burst_list_node, ++ &bfqd->burst_list); ++ } ++ bfqq->split_time = jiffies; ++ } ++ } else { ++ /* If the queue was seeky for too long, break it apart. */ ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); ++ ++ /* Update bic before losing reference to bfqq */ ++ if (bfq_bfqq_in_large_burst(bfqq)) ++ bic->saved_in_large_burst = true; ++ ++ bfqq = bfq_split_bfqq(bic, bfqq); ++ split = true; ++ if (!bfqq) ++ goto new_queue; ++ else ++ bfqq_already_existing = true; ++ } ++ } ++ ++ bfqq->allocated[rw]++; ++ bfqq->ref++; ++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); ++ ++ rq->elv.priv[0] = bic; ++ rq->elv.priv[1] = bfqq; ++ ++ /* ++ * If a bfq_queue has only one process reference, it is owned ++ * by only one bfq_io_cq: we can set the bic field of the ++ * bfq_queue to the address of that structure. Also, if the ++ * queue has just been split, mark a flag so that the ++ * information is available to the other scheduler hooks. ++ */ ++ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { ++ bfqq->bic = bic; ++ if (split) { ++ /* ++ * If the queue has just been split from a shared ++ * queue, restore the idle window and the possible ++ * weight raising period. ++ */ ++ bfq_bfqq_resume_state(bfqq, bfqd, bic, ++ bfqq_already_existing); ++ } ++ } ++ ++ if (unlikely(bfq_bfqq_just_created(bfqq))) ++ bfq_handle_burst(bfqd, bfqq); ++ ++ spin_unlock_irqrestore(q->queue_lock, flags); ++ ++ return 0; ++ ++queue_fail: ++ bfq_schedule_dispatch(bfqd); ++ spin_unlock_irqrestore(q->queue_lock, flags); ++ ++ return 1; ++} ++ ++static void bfq_kick_queue(struct work_struct *work) ++{ ++ struct bfq_data *bfqd = ++ container_of(work, struct bfq_data, unplug_work); ++ struct request_queue *q = bfqd->queue; ++ ++ spin_lock_irq(q->queue_lock); ++ __blk_run_queue(q); ++ spin_unlock_irq(q->queue_lock); ++} ++ ++/* ++ * Handler of the expiration of the timer running if the in-service queue ++ * is idling inside its time slice. 
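++ * The handler returns HRTIMER_NORESTART: the timer is not re-armed
++ * here, but explicitly by bfq_arm_slice_timer() whenever idling is
++ * needed again.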
++ */ ++static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) ++{ ++ struct bfq_data *bfqd = container_of(timer, struct bfq_data, ++ idle_slice_timer); ++ struct bfq_queue *bfqq; ++ unsigned long flags; ++ enum bfqq_expiration reason; ++ ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags); ++ ++ bfqq = bfqd->in_service_queue; ++ /* ++ * Theoretical race here: the in-service queue can be NULL or ++ * different from the queue that was idling if the timer handler ++ * spins on the queue_lock and a new request arrives for the ++ * current queue and there is a full dispatch cycle that changes ++ * the in-service queue. This can hardly happen, but in the worst ++ * case we just expire a queue too early. ++ */ ++ if (bfqq) { ++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); ++ bfq_clear_bfqq_wait_request(bfqq); ++ ++ if (bfq_bfqq_budget_timeout(bfqq)) ++ /* ++ * Also here the queue can be safely expired ++ * for budget timeout without wasting ++ * guarantees ++ */ ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) ++ /* ++ * The queue may not be empty upon timer expiration, ++ * because we may not disable the timer when the ++ * first request of the in-service queue arrives ++ * during disk idling. ++ */ ++ reason = BFQ_BFQQ_TOO_IDLE; ++ else ++ goto schedule_dispatch; ++ ++ bfq_bfqq_expire(bfqd, bfqq, true, reason); ++ } ++ ++schedule_dispatch: ++ bfq_schedule_dispatch(bfqd); ++ ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); ++ return HRTIMER_NORESTART; ++} ++ ++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) ++{ ++ hrtimer_cancel(&bfqd->idle_slice_timer); ++ cancel_work_sync(&bfqd->unplug_work); ++} ++ ++static void __bfq_put_async_bfqq(struct bfq_data *bfqd, ++ struct bfq_queue **bfqq_ptr) ++{ ++ struct bfq_group *root_group = bfqd->root_group; ++ struct bfq_queue *bfqq = *bfqq_ptr; ++ ++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq); ++ if (bfqq) { ++ bfq_bfqq_move(bfqd, bfqq, root_group); ++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", ++ bfqq, bfqq->ref); ++ bfq_put_queue(bfqq); ++ *bfqq_ptr = NULL; ++ } ++} ++ ++/* ++ * Release all the bfqg references to its async queues. If we are ++ * deallocating the group these queues may still contain requests, so ++ * we reparent them to the root cgroup (i.e., the only one that will ++ * exist for sure until all the requests on a device are gone). 
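++ * (Without this reparenting, a request still pending in one of these
++ * queues could end up referencing a bfq_group about to be freed.)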
++ */ ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) ++{ ++ int i, j; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_BE_NR; j++) ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); ++ ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); ++} ++ ++static void bfq_exit_queue(struct elevator_queue *e) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ struct request_queue *q = bfqd->queue; ++ struct bfq_queue *bfqq, *n; ++ ++ bfq_shutdown_timer_wq(bfqd); ++ ++ spin_lock_irq(q->queue_lock); ++ ++ BUG_ON(bfqd->in_service_queue); ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) ++ bfq_deactivate_bfqq(bfqd, bfqq, false, false); ++ ++ spin_unlock_irq(q->queue_lock); ++ ++ bfq_shutdown_timer_wq(bfqd); ++ ++ BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); ++ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ blkcg_deactivate_policy(q, &blkcg_policy_bfq); ++#else ++ bfq_put_async_queues(bfqd, bfqd->root_group); ++ kfree(bfqd->root_group); ++#endif ++ ++ kfree(bfqd); ++} ++ ++static void bfq_init_root_group(struct bfq_group *root_group, ++ struct bfq_data *bfqd) ++{ ++ int i; ++ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ root_group->entity.parent = NULL; ++ root_group->my_entity = NULL; ++ root_group->bfqd = bfqd; ++#endif ++ root_group->rq_pos_tree = RB_ROOT; ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) ++ root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; ++ root_group->sched_data.bfq_class_idle_last_service = jiffies; ++} ++ ++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) ++{ ++ struct bfq_data *bfqd; ++ struct elevator_queue *eq; ++ ++ eq = elevator_alloc(q, e); ++ if (!eq) ++ return -ENOMEM; ++ ++ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); ++ if (!bfqd) { ++ kobject_put(&eq->kobj); ++ return -ENOMEM; ++ } ++ eq->elevator_data = bfqd; ++ ++ /* ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. ++ * Grab a permanent reference to it, so that the normal code flow ++ * will not attempt to free it. ++ */ ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); ++ bfqd->oom_bfqq.ref++; ++ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; ++ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; ++ bfqd->oom_bfqq.entity.new_weight = ++ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); ++ ++ /* oom_bfqq does not participate to bursts */ ++ bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); ++ /* ++ * Trigger weight initialization, according to ioprio, at the ++ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio ++ * class won't be changed any more. 
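++	 * (bfq_init_bfqq() above was called with a NULL bic, so no
++	 * ioprio data was set at that point; the prio_changed flag
++	 * makes the first activation pick up the values just
++	 * assigned.)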
++ */ ++ bfqd->oom_bfqq.entity.prio_changed = 1; ++ ++ bfqd->queue = q; ++ ++ spin_lock_irq(q->queue_lock); ++ q->elevator = eq; ++ spin_unlock_irq(q->queue_lock); ++ ++ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); ++ if (!bfqd->root_group) ++ goto out_free; ++ bfq_init_root_group(bfqd->root_group, bfqd); ++ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); ++ ++ hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, ++ HRTIMER_MODE_REL); ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; ++ ++ bfqd->queue_weights_tree = RB_ROOT; ++ bfqd->group_weights_tree = RB_ROOT; ++ ++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); ++ ++ INIT_LIST_HEAD(&bfqd->active_list); ++ INIT_LIST_HEAD(&bfqd->idle_list); ++ INIT_HLIST_HEAD(&bfqd->burst_list); ++ ++ bfqd->hw_tag = -1; ++ ++ bfqd->bfq_max_budget = bfq_default_max_budget; ++ ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; ++ bfqd->bfq_back_max = bfq_back_max; ++ bfqd->bfq_back_penalty = bfq_back_penalty; ++ bfqd->bfq_slice_idle = bfq_slice_idle; ++ bfqd->bfq_timeout = bfq_timeout; ++ ++ bfqd->bfq_requests_within_timer = 120; ++ ++ bfqd->bfq_large_burst_thresh = 8; ++ bfqd->bfq_burst_interval = msecs_to_jiffies(180); ++ ++ bfqd->low_latency = true; ++ ++ /* ++ * Trade-off between responsiveness and fairness. ++ */ ++ bfqd->bfq_wr_coeff = 30; ++ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); ++ bfqd->bfq_wr_max_time = 0; ++ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); ++ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); ++ bfqd->bfq_wr_max_softrt_rate = 7000; /* ++ * Approximate rate required ++ * to playback or record a ++ * high-definition compressed ++ * video. ++ */ ++ bfqd->wr_busy_queues = 0; ++ ++ /* ++ * Begin by assuming, optimistically, that the device is a ++ * high-speed one, and that its peak rate is equal to 2/3 of ++ * the highest reference rate. ++ */ ++ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * ++ T_fast[blk_queue_nonrot(bfqd->queue)]; ++ bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; ++ bfqd->device_speed = BFQ_BFQD_FAST; ++ ++ return 0; ++ ++out_free: ++ kfree(bfqd); ++ kobject_put(&eq->kobj); ++ return -ENOMEM; ++} ++ ++static void bfq_slab_kill(void) ++{ ++ kmem_cache_destroy(bfq_pool); ++} ++ ++static int __init bfq_slab_setup(void) ++{ ++ bfq_pool = KMEM_CACHE(bfq_queue, 0); ++ if (!bfq_pool) ++ return -ENOMEM; ++ return 0; ++} ++ ++static ssize_t bfq_var_show(unsigned int var, char *page) ++{ ++ return sprintf(page, "%u\n", var); ++} ++ ++static ssize_t bfq_var_store(unsigned long *var, const char *page, ++ size_t count) ++{ ++ unsigned long new_val; ++ int ret = kstrtoul(page, 10, &new_val); ++ ++ if (ret == 0) ++ *var = new_val; ++ ++ return count; ++} ++ ++static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ ++ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? 
++ jiffies_to_msecs(bfqd->bfq_wr_max_time) : ++ jiffies_to_msecs(bfq_wr_duration(bfqd))); ++} ++ ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_queue *bfqq; ++ struct bfq_data *bfqd = e->elevator_data; ++ ssize_t num_char = 0; ++ ++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", ++ bfqd->queued); ++ ++ spin_lock_irq(bfqd->queue->queue_lock); ++ ++ num_char += sprintf(page + num_char, "Active:\n"); ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, nr_queued %d %d, ", ++ bfqq->pid, ++ bfqq->entity.weight, ++ bfqq->queued[0], ++ bfqq->queued[1]); ++ num_char += sprintf(page + num_char, ++ "dur %d/%u\n", ++ jiffies_to_msecs( ++ jiffies - ++ bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ ++ num_char += sprintf(page + num_char, "Idle:\n"); ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, dur %d/%u\n", ++ bfqq->pid, ++ bfqq->entity.weight, ++ jiffies_to_msecs(jiffies - ++ bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ ++ spin_unlock_irq(bfqd->queue->queue_lock); ++ ++ return num_char; ++} ++ ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ u64 __data = __VAR; \ ++ if (__CONV == 1) \ ++ __data = jiffies_to_msecs(__data); \ ++ else if (__CONV == 2) \ ++ __data = div_u64(__data, NSEC_PER_MSEC); \ ++ return bfq_var_show(__data, (page)); \ ++} ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); ++SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); ++SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); ++SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); ++SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); ++SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, ++ 1); ++SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); ++#undef SHOW_FUNCTION ++ ++#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ u64 __data = __VAR; \ ++ __data = div_u64(__data, NSEC_PER_USEC); \ ++ return bfq_var_show(__data, (page)); \ ++} ++USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); ++#undef USEC_SHOW_FUNCTION ++ ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ ++static ssize_t \ ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned long uninitialized_var(__data); \ ++ int ret = bfq_var_store(&__data, (page), count); \ ++ if (__data < (MIN)) \ ++ __data = (MIN); \ ++ else if (__data > (MAX)) \ ++ __data = (MAX); \ ++ if (__CONV == 1) \ ++ *(__PTR) = msecs_to_jiffies(__data); \ ++ else if (__CONV == 2) \ ++ *(__PTR) = (u64)__data 
* NSEC_PER_MSEC; \ ++ else \ ++ *(__PTR) = __data; \ ++ return ret; \ ++} ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, ++ INT_MAX, 2); ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, ++ INT_MAX, 2); ++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, ++ INT_MAX, 0); ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); ++STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); ++STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, ++ 1); ++STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, ++ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, ++ INT_MAX, 0); ++#undef STORE_FUNCTION ++ ++#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ ++static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned long uninitialized_var(__data); \ ++ int ret = bfq_var_store(&__data, (page), count); \ ++ if (__data < (MIN)) \ ++ __data = (MIN); \ ++ else if (__data > (MAX)) \ ++ __data = (MAX); \ ++ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ ++ return ret; \ ++} ++USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, ++ UINT_MAX); ++#undef USEC_STORE_FUNCTION ++ ++/* do nothing for the moment */ ++static ssize_t bfq_weights_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ return count; ++} ++ ++static ssize_t bfq_max_budget_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data == 0) ++ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); ++ else { ++ if (__data > INT_MAX) ++ __data = INT_MAX; ++ bfqd->bfq_max_budget = __data; ++ } ++ ++ bfqd->bfq_user_max_budget = __data; ++ ++ return ret; ++} ++ ++/* ++ * Leaving this name to preserve name compatibility with cfq ++ * parameters, but this timeout is used for both sync and async. 
++ */ ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data < 1) ++ __data = 1; ++ else if (__data > INT_MAX) ++ __data = INT_MAX; ++ ++ bfqd->bfq_timeout = msecs_to_jiffies(__data); ++ if (bfqd->bfq_user_max_budget == 0) ++ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); ++ ++ return ret; ++} ++ ++static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data > 1) ++ __data = 1; ++ if (!bfqd->strict_guarantees && __data == 1 ++ && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) ++ bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; ++ ++ bfqd->strict_guarantees = __data; ++ ++ return ret; ++} ++ ++static ssize_t bfq_low_latency_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data > 1) ++ __data = 1; ++ if (__data == 0 && bfqd->low_latency != 0) ++ bfq_end_wr(bfqd); ++ bfqd->low_latency = __data; ++ ++ return ret; ++} ++ ++#define BFQ_ATTR(name) \ ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) ++ ++static struct elv_fs_entry bfq_attrs[] = { ++ BFQ_ATTR(fifo_expire_sync), ++ BFQ_ATTR(fifo_expire_async), ++ BFQ_ATTR(back_seek_max), ++ BFQ_ATTR(back_seek_penalty), ++ BFQ_ATTR(slice_idle), ++ BFQ_ATTR(slice_idle_us), ++ BFQ_ATTR(max_budget), ++ BFQ_ATTR(timeout_sync), ++ BFQ_ATTR(strict_guarantees), ++ BFQ_ATTR(low_latency), ++ BFQ_ATTR(wr_coeff), ++ BFQ_ATTR(wr_max_time), ++ BFQ_ATTR(wr_rt_max_time), ++ BFQ_ATTR(wr_min_idle_time), ++ BFQ_ATTR(wr_min_inter_arr_async), ++ BFQ_ATTR(wr_max_softrt_rate), ++ BFQ_ATTR(weights), ++ __ATTR_NULL ++}; ++ ++static struct elevator_type iosched_bfq = { ++ .ops.sq = { ++ .elevator_merge_fn = bfq_merge, ++ .elevator_merged_fn = bfq_merged_request, ++ .elevator_merge_req_fn = bfq_merged_requests, ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ .elevator_bio_merged_fn = bfq_bio_merged, ++#endif ++ .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, ++ .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, ++ .elevator_dispatch_fn = bfq_dispatch_requests, ++ .elevator_add_req_fn = bfq_insert_request, ++ .elevator_activate_req_fn = bfq_activate_request, ++ .elevator_deactivate_req_fn = bfq_deactivate_request, ++ .elevator_completed_req_fn = bfq_completed_request, ++ .elevator_former_req_fn = elv_rb_former_request, ++ .elevator_latter_req_fn = elv_rb_latter_request, ++ .elevator_init_icq_fn = bfq_init_icq, ++ .elevator_exit_icq_fn = bfq_exit_icq, ++ .elevator_set_req_fn = bfq_set_request, ++ .elevator_put_req_fn = bfq_put_request, ++ .elevator_may_queue_fn = bfq_may_queue, ++ .elevator_init_fn = bfq_init_queue, ++ .elevator_exit_fn = bfq_exit_queue, ++ }, ++ .icq_size = sizeof(struct bfq_io_cq), ++ .icq_align = __alignof__(struct bfq_io_cq), ++ .elevator_attrs = bfq_attrs, ++ .elevator_name = "bfq-sq", ++ .elevator_owner = THIS_MODULE, ++}; ++ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++static struct blkcg_policy blkcg_policy_bfq = { ++ .dfl_cftypes = bfq_blkg_files, ++ .legacy_cftypes = bfq_blkcg_legacy_files, ++ ++ .cpd_alloc_fn = bfq_cpd_alloc, ++ .cpd_init_fn = bfq_cpd_init, ++ 
.cpd_bind_fn = bfq_cpd_init, ++ .cpd_free_fn = bfq_cpd_free, ++ ++ .pd_alloc_fn = bfq_pd_alloc, ++ .pd_init_fn = bfq_pd_init, ++ .pd_offline_fn = bfq_pd_offline, ++ .pd_free_fn = bfq_pd_free, ++ .pd_reset_stats_fn = bfq_pd_reset_stats, ++}; ++#endif ++ ++static int __init bfq_init(void) ++{ ++ int ret; ++ char msg[60] = "BFQ I/O-scheduler: v8r12"; ++ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ ret = blkcg_policy_register(&blkcg_policy_bfq); ++ if (ret) ++ return ret; ++#endif ++ ++ ret = -ENOMEM; ++ if (bfq_slab_setup()) ++ goto err_pol_unreg; ++ ++ /* ++ * Times to load large popular applications for the typical ++ * systems installed on the reference devices (see the ++ * comments before the definitions of the next two ++ * arrays). Actually, we use slightly slower values, as the ++ * estimated peak rate tends to be smaller than the actual ++ * peak rate. The reason for this last fact is that estimates ++ * are computed over much shorter time intervals than the long ++ * intervals typically used for benchmarking. Why? First, to ++ * adapt more quickly to variations. Second, because an I/O ++ * scheduler cannot rely on a peak-rate-evaluation workload to ++ * be run for a long time. ++ */ ++ T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */ ++ T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */ ++ T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */ ++ T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */ ++ ++ /* ++ * Thresholds that determine the switch between speed classes ++ * (see the comments before the definition of the array ++ * device_speed_thresh). These thresholds are biased towards ++ * transitions to the fast class. This is safer than the ++ * opposite bias. In fact, a wrong transition to the slow ++ * class results in short weight-raising periods, because the ++ * speed of the device then tends to be higher that the ++ * reference peak rate. On the opposite end, a wrong ++ * transition to the fast class tends to increase ++ * weight-raising periods, because of the opposite reason. ++ */ ++ device_speed_thresh[0] = (4 * R_slow[0]) / 3; ++ device_speed_thresh[1] = (4 * R_slow[1]) / 3; ++ ++ ret = elv_register(&iosched_bfq); ++ if (ret) ++ goto err_pol_unreg; ++ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ strcat(msg, " (with cgroups support)"); ++#endif ++ pr_info("%s", msg); ++ ++ return 0; ++ ++err_pol_unreg: ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ blkcg_policy_unregister(&blkcg_policy_bfq); ++#endif ++ return ret; ++} ++ ++static void __exit bfq_exit(void) ++{ ++ elv_unregister(&iosched_bfq); ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ blkcg_policy_unregister(&blkcg_policy_bfq); ++#endif ++ bfq_slab_kill(); ++} ++ ++module_init(bfq_init); ++module_exit(bfq_exit); ++ ++MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); ++MODULE_LICENSE("GPL"); +diff --git a/block/bfq.h b/block/bfq.h +new file mode 100644 +index 000000000000..f5751ea59d98 +--- /dev/null ++++ b/block/bfq.h +@@ -0,0 +1,948 @@ ++/* ++ * BFQ v8r12 for 4.11.0: data structures and common functions prototypes. 
++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2015 Paolo Valente ++ * ++ * Copyright (C) 2017 Paolo Valente ++ */ ++ ++#ifndef _BFQ_H ++#define _BFQ_H ++ ++#include <linux/blktrace_api.h> ++#include <linux/hrtimer.h> ++#include <linux/blk-cgroup.h> ++ ++#define BFQ_IOPRIO_CLASSES 3 ++#define BFQ_CL_IDLE_TIMEOUT (HZ/5) ++ ++#define BFQ_MIN_WEIGHT 1 ++#define BFQ_MAX_WEIGHT 1000 ++#define BFQ_WEIGHT_CONVERSION_COEFF 10 ++ ++#define BFQ_DEFAULT_QUEUE_IOPRIO 4 ++ ++#define BFQ_WEIGHT_LEGACY_DFL 100 ++#define BFQ_DEFAULT_GRP_IOPRIO 0 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE ++ ++/* ++ * Soft real-time applications are extremely more latency sensitive ++ * than interactive ones. Over-raise the weight of the former to ++ * privilege them against the latter. ++ */ ++#define BFQ_SOFTRT_WEIGHT_FACTOR 100 ++ ++struct bfq_entity; ++ ++/** ++ * struct bfq_service_tree - per ioprio_class service tree. ++ * ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each ++ * ioprio_class has its own independent scheduler, and so its own ++ * bfq_service_tree. All the fields are protected by the queue lock ++ * of the containing bfqd. ++ */ ++struct bfq_service_tree { ++ /* tree for active entities (i.e., those backlogged) */ ++ struct rb_root active; ++ /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ ++ struct rb_root idle; ++ ++ struct bfq_entity *first_idle; /* idle entity with minimum F_i */ ++ struct bfq_entity *last_idle; /* idle entity with maximum F_i */ ++ ++ u64 vtime; /* scheduler virtual time */ ++ /* scheduler weight sum; active and idle entities contribute to it */ ++ unsigned long wsum; ++}; ++ ++/** ++ * struct bfq_sched_data - multi-class scheduler. ++ * ++ * bfq_sched_data is the basic scheduler queue. It supports three ++ * ioprio_classes, and can be used either as a toplevel queue or as an ++ * intermediate queue on a hierarchical setup. @next_in_service ++ * points to the active entity of the sched_data service trees that ++ * will be scheduled next. It is used to reduce the number of steps ++ * needed for each hierarchical-schedule update. ++ * ++ * The supported ioprio_classes are the same as in CFQ, in descending ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. ++ * Requests from higher priority queues are served before all the ++ * requests from lower priority queues; among requests of the same ++ * queue requests are served according to B-WF2Q+. ++ * All the fields are protected by the queue lock of the containing bfqd. ++ */ ++struct bfq_sched_data { ++ struct bfq_entity *in_service_entity; /* entity in service */ ++ /* head-of-the-line entity in the scheduler (see comments above) */ ++ struct bfq_entity *next_in_service; ++ /* array of service trees, one per ioprio_class */ ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; ++ /* last time CLASS_IDLE was served */ ++ unsigned long bfq_class_idle_last_service; ++ ++}; ++ ++/** ++ * struct bfq_weight_counter - counter of the number of all active entities ++ * with a given weight. ++ */ ++struct bfq_weight_counter { ++ unsigned int weight; /* weight of the entities this counter refers to */ ++ unsigned int num_active; /* nr of active entities with this weight */ ++ /* ++ * Weights tree member (see bfq_data's @queue_weights_tree and ++ * @group_weights_tree) ++ */ ++ struct rb_node weights_node; ++}; ++ ++/** ++ * struct bfq_entity - schedulable entity.
++ * ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each ++ * entity belongs to the sched_data of the parent group in the cgroup ++ * hierarchy. Non-leaf entities have also their own sched_data, stored ++ * in @my_sched_data. ++ * ++ * Each entity stores independently its priority values; this would ++ * allow different weights on different devices, but this ++ * functionality is not exported to userspace by now. Priorities and ++ * weights are updated lazily, first storing the new values into the ++ * new_* fields, then setting the @prio_changed flag. As soon as ++ * there is a transition in the entity state that allows the priority ++ * update to take place the effective and the requested priority ++ * values are synchronized. ++ * ++ * Unless cgroups are used, the weight value is calculated from the ++ * ioprio to export the same interface as CFQ. When dealing with ++ * ``well-behaved'' queues (i.e., queues that do not spend too much ++ * time to consume their budget and have true sequential behavior, and ++ * when there are no external factors breaking anticipation) the ++ * relative weights at each level of the cgroups hierarchy should be ++ * guaranteed. All the fields are protected by the queue lock of the ++ * containing bfqd. ++ */ ++struct bfq_entity { ++ struct rb_node rb_node; /* service_tree member */ ++ /* pointer to the weight counter associated with this entity */ ++ struct bfq_weight_counter *weight_counter; ++ ++ /* ++ * Flag, true if the entity is on a tree (either the active or ++ * the idle one of its service_tree) or is in service. ++ */ ++ bool on_st; ++ ++ u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ ++ u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ ++ ++ /* tree the entity is enqueued into; %NULL if not on a tree */ ++ struct rb_root *tree; ++ ++ /* ++ * minimum start time of the (active) subtree rooted at this ++ * entity; used for O(log N) lookups into active trees ++ */ ++ u64 min_start; ++ ++ /* amount of service received during the last service slot */ ++ int service; ++ ++ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ ++ int budget; ++ ++ unsigned int weight; /* weight of the queue */ ++ unsigned int new_weight; /* next weight if a change is in progress */ ++ ++ /* original weight, used to implement weight boosting */ ++ unsigned int orig_weight; ++ ++ /* parent entity, for hierarchical scheduling */ ++ struct bfq_entity *parent; ++ ++ /* ++ * For non-leaf nodes in the hierarchy, the associated ++ * scheduler queue, %NULL on leaf nodes. ++ */ ++ struct bfq_sched_data *my_sched_data; ++ /* the scheduler queue this entity belongs to */ ++ struct bfq_sched_data *sched_data; ++ ++ /* flag, set to request a weight, ioprio or ioprio_class change */ ++ int prio_changed; ++}; ++ ++struct bfq_group; ++ ++/** ++ * struct bfq_queue - leaf schedulable entity. ++ * ++ * A bfq_queue is a leaf request queue; it can be associated with an ++ * io_context or more, if it is async or shared between cooperating ++ * processes. @cgroup holds a reference to the cgroup, to be sure that it ++ * does not disappear while a bfqq still references it (mostly to avoid ++ * races between request issuing and task migration followed by cgroup ++ * destruction). ++ * All the fields are protected by the queue lock of the containing bfqd. 
++ */ ++struct bfq_queue { ++ /* reference counter */ ++ int ref; ++ /* parent bfq_data */ ++ struct bfq_data *bfqd; ++ ++ /* current ioprio and ioprio class */ ++ unsigned short ioprio, ioprio_class; ++ /* next ioprio and ioprio class if a change is in progress */ ++ unsigned short new_ioprio, new_ioprio_class; ++ ++ /* ++ * Shared bfq_queue if queue is cooperating with one or more ++ * other queues. ++ */ ++ struct bfq_queue *new_bfqq; ++ /* request-position tree member (see bfq_group's @rq_pos_tree) */ ++ struct rb_node pos_node; ++ /* request-position tree root (see bfq_group's @rq_pos_tree) */ ++ struct rb_root *pos_root; ++ ++ /* sorted list of pending requests */ ++ struct rb_root sort_list; ++ /* if fifo isn't expired, next request to serve */ ++ struct request *next_rq; ++ /* number of sync and async requests queued */ ++ int queued[2]; ++ /* number of sync and async requests currently allocated */ ++ int allocated[2]; ++ /* number of pending metadata requests */ ++ int meta_pending; ++ /* fifo list of requests in sort_list */ ++ struct list_head fifo; ++ ++ /* entity representing this queue in the scheduler */ ++ struct bfq_entity entity; ++ ++ /* maximum budget allowed from the feedback mechanism */ ++ int max_budget; ++ /* budget expiration (in jiffies) */ ++ unsigned long budget_timeout; ++ ++ /* number of requests on the dispatch list or inside driver */ ++ int dispatched; ++ ++ unsigned int flags; /* status flags.*/ ++ ++ /* node for active/idle bfqq list inside parent bfqd */ ++ struct list_head bfqq_list; ++ ++ /* bit vector: a 1 for each seeky requests in history */ ++ u32 seek_history; ++ ++ /* node for the device's burst list */ ++ struct hlist_node burst_list_node; ++ ++ /* position of the last request enqueued */ ++ sector_t last_request_pos; ++ ++ /* Number of consecutive pairs of request completion and ++ * arrival, such that the queue becomes idle after the ++ * completion, but the next request arrives within an idle ++ * time slice; used only if the queue's IO_bound flag has been ++ * cleared. ++ */ ++ unsigned int requests_within_timer; ++ ++ /* pid of the process owning the queue, used for logging purposes */ ++ pid_t pid; ++ ++ /* ++ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL ++ * if the queue is shared. ++ */ ++ struct bfq_io_cq *bic; ++ ++ /* current maximum weight-raising time for this queue */ ++ unsigned long wr_cur_max_time; ++ /* ++ * Minimum time instant such that, only if a new request is ++ * enqueued after this time instant in an idle @bfq_queue with ++ * no outstanding requests, then the task associated with the ++ * queue it is deemed as soft real-time (see the comments on ++ * the function bfq_bfqq_softrt_next_start()) ++ */ ++ unsigned long soft_rt_next_start; ++ /* ++ * Start time of the current weight-raising period if ++ * the @bfq-queue is being weight-raised, otherwise ++ * finish time of the last weight-raising period. ++ */ ++ unsigned long last_wr_start_finish; ++ /* factor by which the weight of this queue is multiplied */ ++ unsigned int wr_coeff; ++ /* ++ * Time of the last transition of the @bfq_queue from idle to ++ * backlogged. ++ */ ++ unsigned long last_idle_bklogged; ++ /* ++ * Cumulative service received from the @bfq_queue since the ++ * last transition from idle to backlogged. 
++ */ ++ unsigned long service_from_backlogged; ++ /* ++ * Value of wr start time when switching to soft rt ++ */ ++ unsigned long wr_start_at_switch_to_srt; ++ ++ unsigned long split_time; /* time of last split */ ++}; ++ ++/** ++ * struct bfq_ttime - per process thinktime stats. ++ */ ++struct bfq_ttime { ++ u64 last_end_request; /* completion time of last request */ ++ ++ u64 ttime_total; /* total process thinktime */ ++ unsigned long ttime_samples; /* number of thinktime samples */ ++ u64 ttime_mean; /* average process thinktime */ ++ ++}; ++ ++/** ++ * struct bfq_io_cq - per (request_queue, io_context) structure. ++ */ ++struct bfq_io_cq { ++ /* associated io_cq structure */ ++ struct io_cq icq; /* must be the first member */ ++ /* array of two process queues, the sync and the async */ ++ struct bfq_queue *bfqq[2]; ++ /* associated @bfq_ttime struct */ ++ struct bfq_ttime ttime; ++ /* per (request_queue, blkcg) ioprio */ ++ int ioprio; ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ uint64_t blkcg_serial_nr; /* the current blkcg serial */ ++#endif ++ ++ /* ++ * Snapshot of the idle window before merging; taken to ++ * remember this value while the queue is merged, so as to be ++ * able to restore it in case of split. ++ */ ++ bool saved_idle_window; ++ /* ++ * Same purpose as the previous two fields for the I/O bound ++ * classification of a queue. ++ */ ++ bool saved_IO_bound; ++ ++ /* ++ * Same purpose as the previous fields for the value of the ++ * field keeping the queue's belonging to a large burst ++ */ ++ bool saved_in_large_burst; ++ /* ++ * True if the queue belonged to a burst list before its merge ++ * with another cooperating queue. ++ */ ++ bool was_in_burst_list; ++ ++ /* ++ * Similar to previous fields: save wr information. ++ */ ++ unsigned long saved_wr_coeff; ++ unsigned long saved_last_wr_start_finish; ++ unsigned long saved_wr_start_at_switch_to_srt; ++ unsigned int saved_wr_cur_max_time; ++}; ++ ++enum bfq_device_speed { ++ BFQ_BFQD_FAST, ++ BFQ_BFQD_SLOW, ++}; ++ ++/** ++ * struct bfq_data - per-device data structure. ++ * ++ * All the fields are protected by the @queue lock. ++ */ ++struct bfq_data { ++ /* request queue for the device */ ++ struct request_queue *queue; ++ ++ /* root bfq_group for the device */ ++ struct bfq_group *root_group; ++ ++ /* ++ * rbtree of weight counters of @bfq_queues, sorted by ++ * weight. Used to keep track of whether all @bfq_queues have ++ * the same weight. The tree contains one counter for each ++ * distinct weight associated to some active and not ++ * weight-raised @bfq_queue (see the comments to the functions ++ * bfq_weights_tree_[add|remove] for further details). ++ */ ++ struct rb_root queue_weights_tree; ++ /* ++ * rbtree of non-queue @bfq_entity weight counters, sorted by ++ * weight. Used to keep track of whether all @bfq_groups have ++ * the same weight. The tree contains one counter for each ++ * distinct weight associated to some active @bfq_group (see ++ * the comments to the functions bfq_weights_tree_[add|remove] ++ * for further details). ++ */ ++ struct rb_root group_weights_tree; ++ ++ /* ++ * Number of bfq_queues containing requests (including the ++ * queue in service, even if it is idling). 
++ */ ++ int busy_queues; ++ /* number of weight-raised busy @bfq_queues */ ++ int wr_busy_queues; ++ /* number of queued requests */ ++ int queued; ++ /* number of requests dispatched and waiting for completion */ ++ int rq_in_driver; ++ ++ /* ++ * Maximum number of requests in driver in the last ++ * @hw_tag_samples completed requests. ++ */ ++ int max_rq_in_driver; ++ /* number of samples used to calculate hw_tag */ ++ int hw_tag_samples; ++ /* flag set to one if the driver is showing a queueing behavior */ ++ int hw_tag; ++ ++ /* number of budgets assigned */ ++ int budgets_assigned; ++ ++ /* ++ * Timer set when idling (waiting) for the next request from ++ * the queue in service. ++ */ ++ struct hrtimer idle_slice_timer; ++ /* delayed work to restart dispatching on the request queue */ ++ struct work_struct unplug_work; ++ ++ /* bfq_queue in service */ ++ struct bfq_queue *in_service_queue; ++ /* bfq_io_cq (bic) associated with the @in_service_queue */ ++ struct bfq_io_cq *in_service_bic; ++ ++ /* on-disk position of the last served request */ ++ sector_t last_position; ++ ++ /* time of last request completion (ns) */ ++ u64 last_completion; ++ ++ /* time of first rq dispatch in current observation interval (ns) */ ++ u64 first_dispatch; ++ /* time of last rq dispatch in current observation interval (ns) */ ++ u64 last_dispatch; ++ ++ /* beginning of the last budget */ ++ ktime_t last_budget_start; ++ /* beginning of the last idle slice */ ++ ktime_t last_idling_start; ++ ++ /* number of samples in current observation interval */ ++ int peak_rate_samples; ++ /* num of samples of seq dispatches in current observation interval */ ++ u32 sequential_samples; ++ /* total num of sectors transferred in current observation interval */ ++ u64 tot_sectors_dispatched; ++ /* max rq size seen during current observation interval (sectors) */ ++ u32 last_rq_max_size; ++ /* time elapsed from first dispatch in current observ. interval (us) */ ++ u64 delta_from_first; ++ /* current estimate of device peak rate */ ++ u32 peak_rate; ++ ++ /* maximum budget allotted to a bfq_queue before rescheduling */ ++ int bfq_max_budget; ++ ++ /* list of all the bfq_queues active on the device */ ++ struct list_head active_list; ++ /* list of all the bfq_queues idle on the device */ ++ struct list_head idle_list; ++ ++ /* ++ * Timeout for async/sync requests; when it fires, requests ++ * are served in fifo order. ++ */ ++ u64 bfq_fifo_expire[2]; ++ /* weight of backward seeks wrt forward ones */ ++ unsigned int bfq_back_penalty; ++ /* maximum allowed backward seek */ ++ unsigned int bfq_back_max; ++ /* maximum idling time */ ++ u32 bfq_slice_idle; ++ ++ /* user-configured max budget value (0 for auto-tuning) */ ++ int bfq_user_max_budget; ++ /* ++ * Timeout for bfq_queues to consume their budget; used to ++ * prevent seeky queues from imposing long latencies to ++ * sequential or quasi-sequential ones (this also implies that ++ * seeky queues cannot receive guarantees in the service ++ * domain; after a timeout they are charged for the time they ++ * have been in service, to preserve fairness among them, but ++ * without service-domain guarantees). ++ */ ++ unsigned int bfq_timeout; ++ ++ /* ++ * Number of consecutive requests that must be issued within ++ * the idle time slice to set again idling to a queue which ++ * was marked as non-I/O-bound (see the definition of the ++ * IO_bound flag for further details). 
++ */ ++ unsigned int bfq_requests_within_timer; ++ ++ /* ++ * Force device idling whenever needed to provide accurate ++ * service guarantees, without caring about throughput ++ * issues. CAVEAT: this may even increase latencies, in case ++ * of useless idling for processes that did stop doing I/O. ++ */ ++ bool strict_guarantees; ++ ++ /* ++ * Last time at which a queue entered the current burst of ++ * queues being activated shortly after each other; for more ++ * details about this and the following parameters related to ++ * a burst of activations, see the comments on the function ++ * bfq_handle_burst. ++ */ ++ unsigned long last_ins_in_burst; ++ /* ++ * Reference time interval used to decide whether a queue has ++ * been activated shortly after @last_ins_in_burst. ++ */ ++ unsigned long bfq_burst_interval; ++ /* number of queues in the current burst of queue activations */ ++ int burst_size; ++ ++ /* common parent entity for the queues in the burst */ ++ struct bfq_entity *burst_parent_entity; ++ /* Maximum burst size above which the current queue-activation ++ * burst is deemed as 'large'. ++ */ ++ unsigned long bfq_large_burst_thresh; ++ /* true if a large queue-activation burst is in progress */ ++ bool large_burst; ++ /* ++ * Head of the burst list (as for the above fields, more ++ * details in the comments on the function bfq_handle_burst). ++ */ ++ struct hlist_head burst_list; ++ ++ /* if set to true, low-latency heuristics are enabled */ ++ bool low_latency; ++ /* ++ * Maximum factor by which the weight of a weight-raised queue ++ * is multiplied. ++ */ ++ unsigned int bfq_wr_coeff; ++ /* maximum duration of a weight-raising period (jiffies) */ ++ unsigned int bfq_wr_max_time; ++ ++ /* Maximum weight-raising duration for soft real-time processes */ ++ unsigned int bfq_wr_rt_max_time; ++ /* ++ * Minimum idle period after which weight-raising may be ++ * reactivated for a queue (in jiffies). ++ */ ++ unsigned int bfq_wr_min_idle_time; ++ /* ++ * Minimum period between request arrivals after which ++ * weight-raising may be reactivated for an already busy async ++ * queue (in jiffies). ++ */ ++ unsigned long bfq_wr_min_inter_arr_async; ++ ++ /* Max service-rate for a soft real-time queue, in sectors/sec */ ++ unsigned int bfq_wr_max_softrt_rate; ++ /* ++ * Cached value of the product R*T, used for computing the ++ * maximum duration of weight raising automatically. ++ */ ++ u64 RT_prod; ++ /* device-speed class for the low-latency heuristic */ ++ enum bfq_device_speed device_speed; ++ ++ /* fallback dummy bfqq for extreme OOM conditions */ ++ struct bfq_queue oom_bfqq; ++}; ++ ++enum bfqq_state_flags { ++ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ ++ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ ++ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* ++ * waiting for a request ++ * without idling the device ++ */ ++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ ++ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ ++ BFQ_BFQQ_FLAG_IO_bound, /* ++ * bfqq has timed-out at least once ++ * having consumed at most 2/10 of ++ * its budget ++ */ ++ BFQ_BFQQ_FLAG_in_large_burst, /* ++ * bfqq activated in a large burst, ++ * see comments to bfq_handle_burst. 
++ */ ++ BFQ_BFQQ_FLAG_softrt_update, /* ++ * may need softrt-next-start ++ * update ++ */ ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ ++ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ ++}; ++ ++#define BFQ_BFQQ_FNS(name) \ ++static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ ++{ \ ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ ++} \ ++static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ ++{ \ ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ ++} \ ++static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ ++{ \ ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ ++} ++ ++BFQ_BFQQ_FNS(just_created); ++BFQ_BFQQ_FNS(busy); ++BFQ_BFQQ_FNS(wait_request); ++BFQ_BFQQ_FNS(non_blocking_wait_rq); ++BFQ_BFQQ_FNS(must_alloc); ++BFQ_BFQQ_FNS(fifo_expire); ++BFQ_BFQQ_FNS(idle_window); ++BFQ_BFQQ_FNS(sync); ++BFQ_BFQQ_FNS(IO_bound); ++BFQ_BFQQ_FNS(in_large_burst); ++BFQ_BFQQ_FNS(coop); ++BFQ_BFQQ_FNS(split_coop); ++BFQ_BFQQ_FNS(softrt_update); ++#undef BFQ_BFQQ_FNS ++ ++/* Logging facilities. */ ++#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE ++ ++static const char *checked_dev_name(const struct device *dev) ++{ ++ static const char nodev[] = "nodev"; ++ ++ if (dev) ++ return dev_name(dev); ++ ++ return nodev; ++} ++ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); ++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ ++ char __pbuf[128]; \ ++ \ ++ assert_spin_locked((bfqd)->queue->queue_lock); \ ++ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ ++ pr_crit("%s bfq%d%c %s " fmt "\n", \ ++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ ++ (bfqq)->pid, \ ++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ __pbuf, ##args); \ ++} while (0) ++ ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ ++ char __pbuf[128]; \ ++ \ ++ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ ++ pr_crit("%s %s " fmt "\n", \ ++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ ++ __pbuf, ##args); \ ++} while (0) ++ ++#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ ++ pr_crit("%s bfq%d%c " fmt "\n", \ ++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ ++ (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ ##args) ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) ++ ++#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ ++ ++#define bfq_log(bfqd, fmt, args...) \ ++ pr_crit("%s bfq " fmt "\n", \ ++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ ++ ##args) ++ ++#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); ++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ ++ char __pbuf[128]; \ ++ \ ++ assert_spin_locked((bfqd)->queue->queue_lock); \ ++ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ ++ (bfqq)->pid, \ ++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ __pbuf, ##args); \ ++} while (0) ++ ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ ++ char __pbuf[128]; \ ++ \ ++ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ ++ blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ ++} while (0) ++ ++#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) 
\ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ ++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ ##args) ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) ++ ++#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ ++ ++#define bfq_log(bfqd, fmt, args...) \ ++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) ++#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ ++ ++/* Expiration reasons. */ ++enum bfqq_expiration { ++ BFQ_BFQQ_TOO_IDLE = 0, /* ++ * queue has been idling for ++ * too long ++ */ ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ ++ BFQ_BFQQ_PREEMPTED /* preemption in progress */ ++}; ++ ++ ++struct bfqg_stats { ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ /* number of ios merged */ ++ struct blkg_rwstat merged; ++ /* total time spent on device in ns, may not be accurate w/ queueing */ ++ struct blkg_rwstat service_time; ++ /* total time spent waiting in scheduler queue in ns */ ++ struct blkg_rwstat wait_time; ++ /* number of IOs queued up */ ++ struct blkg_rwstat queued; ++ /* total disk time and nr sectors dispatched by this group */ ++ struct blkg_stat time; ++ /* sum of number of ios queued across all samples */ ++ struct blkg_stat avg_queue_size_sum; ++ /* count of samples taken for average */ ++ struct blkg_stat avg_queue_size_samples; ++ /* how many times this group has been removed from service tree */ ++ struct blkg_stat dequeue; ++ /* total time spent waiting for it to be assigned a timeslice. */ ++ struct blkg_stat group_wait_time; ++ /* time spent idling for this blkcg_gq */ ++ struct blkg_stat idle_time; ++ /* total time with empty current active q with other requests queued */ ++ struct blkg_stat empty_time; ++ /* fields after this shouldn't be cleared on stat reset */ ++ uint64_t start_group_wait_time; ++ uint64_t start_idle_time; ++ uint64_t start_empty_time; ++ uint16_t flags; ++#endif ++}; ++ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++/* ++ * struct bfq_group_data - per-blkcg storage for the blkio subsystem. ++ * ++ * @ps: @blkcg_policy_storage that this structure inherits ++ * @weight: weight of the bfq_group ++ */ ++struct bfq_group_data { ++ /* must be the first member */ ++ struct blkcg_policy_data pd; ++ ++ unsigned int weight; ++}; ++ ++/** ++ * struct bfq_group - per (device, cgroup) data structure. ++ * @entity: schedulable entity to insert into the parent group sched_data. ++ * @sched_data: own sched_data, to contain child entities (they may be ++ * both bfq_queues and bfq_groups). ++ * @bfqd: the bfq_data for the device this group acts upon. ++ * @async_bfqq: array of async queues for all the tasks belonging to ++ * the group, one queue per ioprio value per ioprio_class, ++ * except for the idle class that has only one queue. ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used ++ * to avoid too many special cases during group creation/ ++ * migration. ++ * @active_entities: number of active entities belonging to the group; ++ * unused for the root group. Used to know whether there ++ * are groups with more than one active @bfq_entity ++ * (see the comments to the function ++ * bfq_bfqq_may_idle()). ++ * @rq_pos_tree: rbtree sorted by next_request position, used when ++ * determining if two or more queues have interleaving ++ * requests (see bfq_find_close_cooperator()). 
++ * ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup ++ * there is a set of bfq_groups, each one collecting the lower-level ++ * entities belonging to the group that are acting on the same device. ++ * ++ * Locking works as follows: ++ * o @bfqd is protected by the queue lock, RCU is used to access it ++ * from the readers. ++ * o All the other fields are protected by the @bfqd queue lock. ++ */ ++struct bfq_group { ++ /* must be the first member */ ++ struct blkg_policy_data pd; ++ ++ struct bfq_entity entity; ++ struct bfq_sched_data sched_data; ++ ++ void *bfqd; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++ ++ struct bfq_entity *my_entity; ++ ++ int active_entities; ++ ++ struct rb_root rq_pos_tree; ++ ++ struct bfqg_stats stats; ++}; ++ ++#else ++struct bfq_group { ++ struct bfq_sched_data sched_data; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++ ++ struct rb_root rq_pos_tree; ++}; ++#endif ++ ++static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); ++ ++static unsigned int bfq_class_idx(struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ return bfqq ? bfqq->ioprio_class - 1 : ++ BFQ_DEFAULT_GRP_CLASS - 1; ++} ++ ++static struct bfq_service_tree * ++bfq_entity_service_tree(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sched_data = entity->sched_data; ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ unsigned int idx = bfq_class_idx(entity); ++ ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); ++ BUG_ON(sched_data == NULL); ++ ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "entity_service_tree %p %d", ++ sched_data->service_tree + idx, idx); ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "entity_service_tree %p %d", ++ sched_data->service_tree + idx, idx); ++ } ++#endif ++ return sched_data->service_tree + idx; ++} ++ ++static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) ++{ ++ return bic->bfqq[is_sync]; ++} ++ ++static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, ++ bool is_sync) ++{ ++ bic->bfqq[is_sync] = bfqq; ++} ++ ++static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) ++{ ++ return bic->icq.q->elevator->elevator_data; ++} ++ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ ++static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *group_entity = bfqq->entity.parent; ++ ++ if (!group_entity) ++ group_entity = &bfqq->bfqd->root_group->entity; ++ ++ return container_of(group_entity, struct bfq_group, entity); ++} ++ ++#else ++ ++static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) ++{ ++ return bfqq->bfqd->root_group; ++} ++ ++#endif ++ ++static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); ++static void bfq_put_queue(struct bfq_queue *bfqq); ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, ++ struct bio *bio, bool is_sync, ++ struct bfq_io_cq *bic); ++static void bfq_end_wr_async_queues(struct bfq_data *bfqd, ++ struct bfq_group *bfqg); ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); ++#endif ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); ++ ++#endif /* 
_BFQ_H */ +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 8da66379f7ea..bf000c58644b 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -54,7 +54,7 @@ struct blk_stat_callback; + * Maximum number of blkcg policies allowed to be registered concurrently. + * Defined here to simplify include dependency. + */ +-#define BLKCG_MAX_POLS 3 ++#define BLKCG_MAX_POLS 4 + + typedef void (rq_end_io_fn)(struct request *, blk_status_t); + + +From 9916fed6c89c61a2b26053be04501784570bbec8 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Thu, 20 Jul 2017 10:46:39 +0200 +Subject: [PATCH 02/51] Add extra checks related to entity scheduling + +- extra checks related to ioprio-class changes +- specific check on st->idle in __bfq_requeue_entity + +Signed-off-by: Paolo Valente +--- + block/bfq-sched.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/block/bfq-sched.c b/block/bfq-sched.c +index ac8991bca9fa..5ddf9af4261e 100644 +--- a/block/bfq-sched.c ++++ b/block/bfq-sched.c +@@ -812,6 +812,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, + } + #endif + ++ BUG_ON(entity->tree && update_class_too); + BUG_ON(old_st->wsum < entity->weight); + old_st->wsum -= entity->weight; + +@@ -883,8 +884,10 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, + + new_st->wsum += entity->weight; + +- if (new_st != old_st) ++ if (new_st != old_st) { ++ BUG_ON(!update_class_too); + entity->start = new_st->vtime; ++ } + } + + return new_st; +@@ -993,6 +996,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, + * tree, then it is safe to invoke next function with the last + * parameter set (see the comments on the function). + */ ++ BUG_ON(entity->tree); + st = __bfq_entity_update_weight_prio(st, entity, true); + bfq_calc_finish(entity, entity->budget); + +@@ -1113,9 +1117,11 @@ static void __bfq_activate_entity(struct bfq_entity *entity, + * check for that. + */ + bfq_idle_extract(st, entity); ++ BUG_ON(entity->tree); + entity->start = bfq_gt(min_vstart, entity->finish) ? + min_vstart : entity->finish; + } else { ++ BUG_ON(entity->tree); + /* + * The finish time of the entity may be invalid, and + * it is in the past for sure, otherwise the queue +@@ -1203,6 +1209,7 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) + */ + bfq_calc_finish(entity, entity->service); + entity->start = entity->finish; ++ BUG_ON(entity->tree && entity->tree == &st->idle); + BUG_ON(entity->tree && entity->tree != &st->active); + /* + * In addition, if the entity had more than one child + +From 8f5b2c25dcbe31dda524e85b921b3aa1fe11d111 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Fri, 21 Jul 2017 12:08:57 +0200 +Subject: [PATCH 03/51] block, bfq: reset in_service_entity if it becomes idle + +BFQ implements hierarchical scheduling by representing each group of +queues with a generic parent entity. For each parent entity, BFQ +maintains an in_service_entity pointer: if one of the child entities +happens to be in service, in_service_entity points to it. The +resetting of these pointers happens only on queue expirations: when +the in-service queue is expired, i.e., stops being the queue in +service, BFQ resets all in_service_entity pointers along the +parent-entity path from this queue to the root entity.
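+
+As a minimal sketch of this resetting scheme (variable names here are
+illustrative, not the literal code; the actual reset runs in the
+queue-expiration path), the walk conceptually executed when the
+in-service queue expires is:
+
+	/* clear in_service_entity at each level, leaf to root */
+	struct bfq_entity *entity = &expired_bfqq->entity;
+
+	for_each_entity(entity)
+		entity->sched_data->in_service_entity = NULL;
+
+where for_each_entity() follows the entity->parent pointers of the
+hierarchy declared in bfq.h above.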
+ +Functions handling the scheduling of entities assume, naturally, that +in-service entities are active, i.e., have pending I/O requests (or, +as a special case, even if they have no pending requests, they are +expected to receive a new request very soon, with the scheduler idling +the storage device while waiting for such an event). Unfortunately, +the above resetting scheme of the in_service_entity pointers may cause +this assumption to be violated. For example, the in-service queue may +happen to remain without requests because of a request merge. In this +case the queue does become idle, and all related data structures are +updated accordingly. But in_service_entity still points to the queue +in the parent entity. This inconsistency may even propagate to +higher-level parent entities, if they happen to become idle as well, +as a consequence of the leaf queue becoming idle. For this queue and +parent entities, scheduling functions have an undefined behaviour, +and, as reported, may easily lead to kernel crashes or hangs. + +This commit addresses this issue by simply resetting the +in_service_entity field also when it is detected to point to an entity +becoming idle (regardless of why the entity becomes idle). + +Reported-by: Laurentiu Nicola +Signed-off-by: Paolo Valente +Tested-by: Laurentiu Nicola +--- + block/bfq-sched.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/block/bfq-sched.c b/block/bfq-sched.c +index 5ddf9af4261e..a07a06eb5c72 100644 +--- a/block/bfq-sched.c ++++ b/block/bfq-sched.c +@@ -1336,8 +1336,10 @@ static bool __bfq_deactivate_entity(struct bfq_entity *entity, + + BUG_ON(is_in_service && entity->tree && entity->tree != &st->active); + +- if (is_in_service) ++ if (is_in_service) { + bfq_calc_finish(entity, entity->service); ++ sd->in_service_entity = NULL; ++ } + + if (entity->tree == &st->active) + bfq_active_extract(st, entity); + +From 600ea668e2d340c95724bcf981d88812d6900342 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Fri, 28 Jul 2017 21:09:51 +0200 +Subject: [PATCH 04/51] block, bfq: consider also in_service_entity to state + whether an entity is active + +Groups of BFQ queues are represented by generic entities in BFQ. When +a queue belonging to a parent entity is deactivated, the parent entity +may need to be deactivated too, in case the deactivated queue was the +only active queue for the parent entity. This deactivation may need to +be propagated upwards if the entity belongs, in its turn, to a further +higher-level entity, and so on. In particular, the upward propagation +of deactivation stops at the first parent entity that remains active +even if one of its child entities has been deactivated. + +To decide whether this non-deactivation condition holds for a +parent entity, BFQ checks whether the field next_in_service is still +not NULL for the parent entity, after the deactivation of one of its +child entities. If it is not NULL, then there are certainly other active +entities in the parent entity, and deactivations can stop. + +Unfortunately, this check misses a corner case: if in_service_entity +is not NULL, then next_in_service may happen to be NULL, although the +parent entity is evidently active. This happens if: 1) the entity +pointed by in_service_entity is the only active entity in the parent +entity, and 2) according to the definition of next_in_service, the +in_service_entity cannot be considered as next_in_service. See the +comments on the definition of next_in_service for details on this +second point.
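+
+In code terms (slightly simplified; the exact hunk, with its BUG_ON
+checks, is in bfq_deactivate_entity below, where sd is the sched_data
+of the parent entity), the upward propagation of deactivations must
+now stop as soon as
+
+	if (sd->next_in_service || sd->in_service_entity) {
+		/* parent entity still active: stop deactivating */
+		break;
+	}
+
+instead of testing sd->next_in_service alone.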
+ +Hitting the above corner case causes crashes. + +To address this issue, this commit: +1) Extends the above check, from controlling only next_in_service, to +controlling both next_in_service and in_service_entity (if any of them +is not NULL, then no further deactivation is performed) +2) Improves the (important) comments on how next_in_service is defined +and updated; in particular it fixes a few rather obscure paragraphs + +Reported-by: Eric Wheeler +Reported-by: Rick Yiu +Reported-by: Tom X Nguyen +Signed-off-by: Paolo Valente +Tested-by: Eric Wheeler +Tested-by: Rick Yiu +Tested-by: Laurentiu Nicola +Tested-by: Tom X Nguyen +--- + block/bfq-sched.c | 140 ++++++++++++++++++++++++++++++------------------------ + block/bfq.h | 23 +++++++-- + 2 files changed, 95 insertions(+), 68 deletions(-) + +diff --git a/block/bfq-sched.c b/block/bfq-sched.c +index a07a06eb5c72..5c0f9290a79c 100644 +--- a/block/bfq-sched.c ++++ b/block/bfq-sched.c +@@ -196,21 +196,23 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) + + /* + * This function tells whether entity stops being a candidate for next +- * service, according to the following logic. ++ * service, according to the restrictive definition of the field ++ * next_in_service. In particular, this function is invoked for an ++ * entity that is about to be set in service. + * +- * This function is invoked for an entity that is about to be set in +- * service. If such an entity is a queue, then the entity is no longer +- * a candidate for next service (i.e, a candidate entity to serve +- * after the in-service entity is expired). The function then returns +- * true. ++ * If entity is a queue, then the entity is no longer a candidate for ++ * next service according to the that definition, because entity is ++ * about to become the in-service queue. This function then returns ++ * true if entity is a queue. + * +- * In contrast, the entity could stil be a candidate for next service +- * if it is not a queue, and has more than one child. In fact, even if +- * one of its children is about to be set in service, other children +- * may still be the next to serve. As a consequence, a non-queue +- * entity is not a candidate for next-service only if it has only one +- * child. And only if this condition holds, then the function returns +- * true for a non-queue entity. ++ * In contrast, entity could still be a candidate for next service if ++ * it is not a queue, and has more than one active child. In fact, ++ * even if one of its children is about to be set in service, other ++ * active children may still be the next to serve, for the parent ++ * entity, even according to the above definition. As a consequence, a ++ * non-queue entity is not a candidate for next-service only if it has ++ * only one active child. And only if this condition holds, then this ++ * function returns true for a non-queue entity. + */ + static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) + { +@@ -223,6 +225,18 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) + + BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group); + BUG_ON(bfqg->active_entities == 0); ++ /* ++ * The field active_entities does not always contain the ++ * actual number of active children entities: it happens to ++ * not account for the in-service entity in case the latter is ++ * removed from its active tree (which may get done after ++ * invoking the function bfq_no_longer_next_in_service in ++ * bfq_get_next_queue).
Fortunately, here, i.e., while ++ * bfq_no_longer_next_in_service is not yet completed in ++ * bfq_get_next_queue, bfq_active_extract has not yet been ++ * invoked, and thus active_entities still coincides with the ++ * actual number of active entities. ++ */ + if (bfqg->active_entities == 1) + return true; + +@@ -1089,7 +1103,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, + * one of its children receives a new request. + * + * Basically, this function updates the timestamps of entity and +- * inserts entity into its active tree, ater possible extracting it ++ * inserts entity into its active tree, ater possibly extracting it + * from its idle tree. + */ + static void __bfq_activate_entity(struct bfq_entity *entity, +@@ -1213,7 +1227,7 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) + BUG_ON(entity->tree && entity->tree != &st->active); + /* + * In addition, if the entity had more than one child +- * when set in service, then was not extracted from ++ * when set in service, then it was not extracted from + * the active tree. This implies that the position of + * the entity in the active tree may need to be + * changed now, because we have just updated the start +@@ -1221,9 +1235,8 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) + * time in a moment (the requeueing is then, more + * precisely, a repositioning in this case). To + * implement this repositioning, we: 1) dequeue the +- * entity here, 2) update the finish time and +- * requeue the entity according to the new +- * timestamps below. ++ * entity here, 2) update the finish time and requeue ++ * the entity according to the new timestamps below. + */ + if (entity->tree) + bfq_active_extract(st, entity); +@@ -1270,9 +1283,9 @@ static void __bfq_activate_requeue_entity(struct bfq_entity *entity, + + + /** +- * bfq_activate_entity - activate or requeue an entity representing a bfq_queue, +- * and activate, requeue or reposition all ancestors +- * for which such an update becomes necessary. ++ * bfq_activate_requeue_entity - activate or requeue an entity representing a bfq_queue, ++ * and activate, requeue or reposition all ancestors ++ * for which such an update becomes necessary. + * @entity: the entity to activate. + * @non_blocking_wait_rq: true if this entity was waiting for a request + * @requeue: true if this is a requeue, which implies that bfqq is +@@ -1308,9 +1321,9 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity, + * @ins_into_idle_tree: if false, the entity will not be put into the + * idle tree. + * +- * Deactivates an entity, independently from its previous state. Must ++ * Deactivates an entity, independently of its previous state. Must + * be invoked only if entity is on a service tree. Extracts the entity +- * from that tree, and if necessary and allowed, puts it on the idle ++ * from that tree, and if necessary and allowed, puts it into the idle + * tree. + */ + static bool __bfq_deactivate_entity(struct bfq_entity *entity, +@@ -1359,7 +1372,7 @@ static bool __bfq_deactivate_entity(struct bfq_entity *entity, + /** + * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. + * @entity: the entity to deactivate. 
+- * @ins_into_idle_tree: true if the entity can be put on the idle tree ++ * @ins_into_idle_tree: true if the entity can be put into the idle tree + */ + static void bfq_deactivate_entity(struct bfq_entity *entity, + bool ins_into_idle_tree, +@@ -1406,16 +1419,29 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, + */ + bfq_update_next_in_service(sd, NULL); + +- if (sd->next_in_service) { ++ if (sd->next_in_service || sd->in_service_entity) { + /* +- * The parent entity is still backlogged, +- * because next_in_service is not NULL. So, no +- * further upwards deactivation must be +- * performed. Yet, next_in_service has +- * changed. Then the schedule does need to be +- * updated upwards. ++ * The parent entity is still active, because ++ * either next_in_service or in_service_entity ++ * is not NULL. So, no further upwards ++ * deactivation must be performed. Yet, ++ * next_in_service has changed. Then the ++ * schedule does need to be updated upwards. ++ * ++ * NOTE If in_service_entity is not NULL, then ++ * next_in_service may happen to be NULL, ++ * although the parent entity is evidently ++ * active. This happens if 1) the entity ++ * pointed by in_service_entity is the only ++ * active entity in the parent entity, and 2) ++ * according to the definition of ++ * next_in_service, the in_service_entity ++ * cannot be considered as ++ * next_in_service. See the comments on the ++ * definition of next_in_service for details. + */ + BUG_ON(sd->next_in_service == entity); ++ BUG_ON(sd->in_service_entity == entity); + break; + } + +@@ -1806,45 +1832,33 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) + + /* + * If entity is no longer a candidate for next +- * service, then we extract it from its active tree, +- * for the following reason. To further boost the +- * throughput in some special case, BFQ needs to know +- * which is the next candidate entity to serve, while +- * there is already an entity in service. In this +- * respect, to make it easy to compute/update the next +- * candidate entity to serve after the current +- * candidate has been set in service, there is a case +- * where it is necessary to extract the current +- * candidate from its service tree. Such a case is +- * when the entity just set in service cannot be also +- * a candidate for next service. Details about when +- * this conditions holds are reported in the comments +- * on the function bfq_no_longer_next_in_service() +- * invoked below. ++ * service, then it must be extracted from its active ++ * tree, so as to make sure that it won't be ++ * considered when computing next_in_service. See the ++ * comments on the function ++ * bfq_no_longer_next_in_service() for details. + */ + if (bfq_no_longer_next_in_service(entity)) + bfq_active_extract(bfq_entity_service_tree(entity), + entity); + + /* +- * For the same reason why we may have just extracted +- * entity from its active tree, we may need to update +- * next_in_service for the sched_data of entity too, +- * regardless of whether entity has been extracted. +- * In fact, even if entity has not been extracted, a +- * descendant entity may get extracted. Such an event +- * would cause a change in next_in_service for the +- * level of the descendant entity, and thus possibly +- * back to upper levels. ++ * Even if entity is not to be extracted according to ++ * the above check, a descendant entity may get ++ * extracted in one of the next iterations of this ++ * loop. 
Such an event could cause a change in
++ * next_in_service for the level of the descendant
++ * entity, and thus possibly back to this level.
+ *
+- * We cannot perform the resulting needed update
+- * before the end of this loop, because, to know which
+- * is the correct next-to-serve candidate entity for
+- * each level, we need first to find the leaf entity
+- * to set in service. In fact, only after we know
+- * which is the next-to-serve leaf entity, we can
+- * discover whether the parent entity of the leaf
+- * entity becomes the next-to-serve, and so on.
++ * However, we cannot perform the resulting needed
++ * update of next_in_service for this level before the
++ * end of the whole loop, because, to know which is
++ * the correct next-to-serve candidate entity for each
++ * level, we need first to find the leaf entity to set
++ * in service. In fact, only after we know which is
++ * the next-to-serve leaf entity, we can discover
++ * whether the parent entity of the leaf entity
++ * becomes the next-to-serve, and so on.
+ */
+
+ /* Log some information */
+diff --git a/block/bfq.h b/block/bfq.h
+index f5751ea59d98..ebd9688b9f61 100644
+--- a/block/bfq.h
++++ b/block/bfq.h
+@@ -68,17 +68,30 @@ struct bfq_service_tree {
+ *
+ * bfq_sched_data is the basic scheduler queue. It supports three
+ * ioprio_classes, and can be used either as a toplevel queue or as an
+- * intermediate queue on a hierarchical setup. @next_in_service
+- * points to the active entity of the sched_data service trees that
+- * will be scheduled next. It is used to reduce the number of steps
+- * needed for each hierarchical-schedule update.
++ * intermediate queue in a hierarchical setup.
+ *
+ * The supported ioprio_classes are the same as in CFQ, in descending
+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
+ * Requests from higher priority queues are served before all the
+ * requests from lower priority queues; among requests of the same
+ * queue requests are served according to B-WF2Q+.
+- * All the fields are protected by the queue lock of the containing bfqd.
++ *
++ * The schedule is implemented by the service trees, plus the field
++ * @next_in_service, which points to the entity on the active trees
++ * that will be served next, if 1) no changes in the schedule occur
++ * before the current in-service entity is expired, 2) the in-service
++ * queue becomes idle when it expires, and 3) if the entity pointed by
++ * in_service_entity is not a queue, then the in-service child entity
++ * of the entity pointed by in_service_entity becomes idle on
++ * expiration. This peculiar definition allows for the following
++ * optimization, not yet exploited: while a given entity is still in
++ * service, we already know which is the best candidate for next
++ * service among the other active entities in the same parent
++ * entity. We can then quickly compare the timestamps of the
++ * in-service entity with those of such best candidate.
++ *
++ * All the fields are protected by the queue lock of the containing
++ * bfqd.
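++ *
++ * As a concrete illustration of the definition of @next_in_service
++ * above (with hypothetical queues): if queues A and B are both
++ * active under the same parent entity and A is in service,
++ * next_in_service may already point to B; when A expires and goes
++ * idle, B can be set in service after a quick timestamp comparison,
++ * without recomputing the whole schedule.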
+ */ + struct bfq_sched_data { + struct bfq_entity *in_service_entity; /* entity in service */ + +From 6b5effd10bc6711a862e7cbd7cd2dd0146defa01 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Thu, 4 May 2017 10:53:43 +0200 +Subject: [PATCH 05/51] block, bfq: improve and refactor throughput-boosting + logic + +When a queue associated with a process remains empty, there are cases +where throughput gets boosted if the device is idled to await the +arrival of a new I/O request for that queue. Currently, BFQ assumes +that one of these cases is when the device has no internal queueing +(regardless of the properties of the I/O being served). Unfortunately, +this condition has proved to be too general. So, this commit refines it +as "the device has no internal queueing and is rotational". + +This refinement provides a significant throughput boost with random +I/O, on flash-based storage without internal queueing. For example, on +a HiKey board, throughput increases by up to 125%, growing, e.g., from +6.9MB/s to 15.6MB/s with two or three random readers in parallel. + +This commit also refactors the code related to device idling, for the +following reason. Finding the change that provides the above large +improvement has been slightly more difficult than it had to be, +because the logic that decides whether to idle the device is still +scattered across three functions. Almost all of the logic is in the +function bfq_bfqq_may_idle, but (1) part of the decision is made in +bfq_update_idle_window, and (2) the function bfq_bfqq_must_idle may +switch off idling regardless of the output of bfq_bfqq_may_idle. In +addition, both bfq_update_idle_window and bfq_bfqq_must_idle make +their decisions as a function of parameters that are used, for similar +purposes, also in bfq_bfqq_may_idle. This commit addresses this issue +by moving all the logic into bfq_bfqq_may_idle. 
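+
+In short, the consolidated throughput condition can be sketched as
+follows (an illustrative excerpt only; names as in the hunks below):
+
+    bool rot_without_queueing =
+        !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag;
+
+    idling_boosts_thr = rot_without_queueing ||
+        ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) &&
+         bfqq_sequential_and_IO_bound);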
+
+Signed-off-by: Paolo Valente
+Signed-off-by: Luca Miccio
+---
+ block/bfq-sq-iosched.c | 141 +++++++++++++++++++++++++++----------------------
+ block/bfq.h | 12 ++---
+ 2 files changed, 83 insertions(+), 70 deletions(-)
+
+diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c
+index 65e7c7e77f3c..30d019fc67e0 100644
+--- a/block/bfq-sq-iosched.c
++++ b/block/bfq-sq-iosched.c
+@@ -684,10 +684,10 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
+ unsigned int old_wr_coeff;
+ bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq);
+
+- if (bic->saved_idle_window)
+- bfq_mark_bfqq_idle_window(bfqq);
++ if (bic->saved_has_short_ttime)
++ bfq_mark_bfqq_has_short_ttime(bfqq);
+ else
+- bfq_clear_bfqq_idle_window(bfqq);
++ bfq_clear_bfqq_has_short_ttime(bfqq);
+
+ if (bic->saved_IO_bound)
+ bfq_mark_bfqq_IO_bound(bfqq);
+@@ -2047,7 +2047,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
+ if (!bic)
+ return;
+
+- bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
++ bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq);
+ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
+ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
+ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
+@@ -3214,9 +3214,9 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
+ }
+
+ bfq_log_bfqq(bfqd, bfqq,
+- "expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)",
++ "expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)",
+ reason, slow, bfqq->dispatched,
+- bfq_bfqq_idle_window(bfqq), entity->weight);
++ bfq_bfqq_has_short_ttime(bfqq), entity->weight);
+
+ /*
+ * Increase, decrease or leave budget unchanged according to
+@@ -3298,7 +3298,10 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
+ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
+ {
+ struct bfq_data *bfqd = bfqq->bfqd;
+- bool idling_boosts_thr, idling_boosts_thr_without_issues,
++ bool rot_without_queueing =
++ !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag,
++ bfqq_sequential_and_IO_bound,
++ idling_boosts_thr, idling_boosts_thr_without_issues,
+ idling_needed_for_service_guarantees,
+ asymmetric_scenario;
+
+@@ -3306,27 +3309,44 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
+ return true;
+
+ /*
++ * Idling is performed only if slice_idle > 0. In addition, we
++ * do not idle if
++ * (a) bfqq is async
++ * (b) bfqq is in the idle io prio class: in this case we do
++ * not idle because we want to minimize the bandwidth that
++ * queues in this class can steal from higher-priority queues
++ */
++ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
++ bfq_class_idle(bfqq))
++ return false;
++
++ bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) &&
++ bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq);
++ /*
+ * The next variable takes into account the cases where idling
+ * boosts the throughput.
+ *
+ * The value of the variable is computed considering, first, that
+ * idling is virtually always beneficial for the throughput if:
+- * (a) the device is not NCQ-capable, or
+- * (b) regardless of the presence of NCQ, the device is rotational
+- * and the request pattern for bfqq is I/O-bound and sequential.
++ * (a) the device is not NCQ-capable and rotational, or ++ * (b) regardless of the presence of NCQ, the device is rotational and ++ * the request pattern for bfqq is I/O-bound and sequential, or ++ * (c) regardless of whether it is rotational, the device is ++ * not NCQ-capable and the request pattern for bfqq is ++ * I/O-bound and sequential. + * + * Secondly, and in contrast to the above item (b), idling an + * NCQ-capable flash-based device would not boost the + * throughput even with sequential I/O; rather it would lower + * the throughput in proportion to how fast the device + * is. Accordingly, the next variable is true if any of the +- * above conditions (a) and (b) is true, and, in particular, +- * happens to be false if bfqd is an NCQ-capable flash-based +- * device. ++ * above conditions (a), (b) or (c) is true, and, in ++ * particular, happens to be false if bfqd is an NCQ-capable ++ * flash-based device. + */ +- idling_boosts_thr = !bfqd->hw_tag || +- (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && +- bfq_bfqq_idle_window(bfqq)); ++ idling_boosts_thr = rot_without_queueing || ++ ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && ++ bfqq_sequential_and_IO_bound); + + /* + * The value of the next variable, +@@ -3497,12 +3517,10 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) + asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); + + /* +- * We have now all the components we need to compute the return +- * value of the function, which is true only if both the following +- * conditions hold: +- * 1) bfqq is sync, because idling make sense only for sync queues; +- * 2) idling either boosts the throughput (without issues), or +- * is necessary to preserve service guarantees. ++ * We have now all the components we need to compute the ++ * return value of the function, which is true only if idling ++ * either boosts the throughput (without issues), or is ++ * necessary to preserve service guarantees. 
+ */ + bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", + bfq_bfqq_sync(bfqq), idling_boosts_thr); +@@ -3514,9 +3532,8 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) + bfq_bfqq_IO_bound(bfqq), + idling_needed_for_service_guarantees); + +- return bfq_bfqq_sync(bfqq) && +- (idling_boosts_thr_without_issues || +- idling_needed_for_service_guarantees); ++ return idling_boosts_thr_without_issues || ++ idling_needed_for_service_guarantees; + } + + /* +@@ -3532,10 +3549,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) + */ + static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) + { +- struct bfq_data *bfqd = bfqq->bfqd; +- +- return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && +- bfq_bfqq_may_idle(bfqq); ++ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq); + } + + /* +@@ -3994,7 +4008,6 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, + case IOPRIO_CLASS_IDLE: + bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; + bfqq->new_ioprio = 7; +- bfq_clear_bfqq_idle_window(bfqq); + break; + } + +@@ -4058,8 +4071,14 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bfq_set_next_ioprio_data(bfqq, bic); + + if (is_sync) { ++ /* ++ * No need to mark as has_short_ttime if in ++ * idle_class, because no device idling is performed ++ * for queues in idle class ++ */ + if (!bfq_class_idle(bfqq)) +- bfq_mark_bfqq_idle_window(bfqq); ++ /* tentatively mark as has_short_ttime */ ++ bfq_mark_bfqq_has_short_ttime(bfqq); + bfq_mark_bfqq_sync(bfqq); + bfq_mark_bfqq_just_created(bfqq); + } else +@@ -4195,18 +4214,19 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, + blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); + } + +-/* +- * Disable idle window if the process thinks too long or seeks so much that +- * it doesn't matter. +- */ +-static void bfq_update_idle_window(struct bfq_data *bfqd, +- struct bfq_queue *bfqq, +- struct bfq_io_cq *bic) ++static void bfq_update_has_short_ttime(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic) + { +- int enable_idle; ++ bool has_short_ttime = true; + +- /* Don't idle for async or idle io prio class. */ +- if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) ++ /* ++ * No need to update has_short_ttime if bfqq is async or in ++ * idle io prio class, or if bfq_slice_idle is zero, because ++ * no device idling is performed for bfqq in this case. ++ */ ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || ++ bfqd->bfq_slice_idle == 0) + return; + + /* Idle window just restored, statistics are meaningless. */ +@@ -4214,27 +4234,22 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, + bfqd->bfq_wr_min_idle_time)) + return; + +- enable_idle = bfq_bfqq_idle_window(bfqq); +- ++ /* Think time is infinite if no process is linked to ++ * bfqq. 
Otherwise check average think time to
++ * decide whether to mark as has_short_ttime
++ */
+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
+- bfqd->bfq_slice_idle == 0 ||
+- (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
+- bfqq->wr_coeff == 1))
+- enable_idle = 0;
+- else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
+- if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
+- bfqq->wr_coeff == 1)
+- enable_idle = 0;
+- else
+- enable_idle = 1;
+- }
+- bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
+- enable_idle);
++ (bfq_sample_valid(bic->ttime.ttime_samples) &&
++ bic->ttime.ttime_mean > bfqd->bfq_slice_idle))
++ has_short_ttime = false;
++
++ bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d",
++ has_short_ttime);
+
+- if (enable_idle)
+- bfq_mark_bfqq_idle_window(bfqq);
++ if (has_short_ttime)
++ bfq_mark_bfqq_has_short_ttime(bfqq);
+ else
+- bfq_clear_bfqq_idle_window(bfqq);
++ bfq_clear_bfqq_has_short_ttime(bfqq);
+ }
+
+ /*
+@@ -4250,14 +4265,12 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ bfqq->meta_pending++;
+
+ bfq_update_io_thinktime(bfqd, bic);
++ bfq_update_has_short_ttime(bfqd, bfqq, bic);
+ bfq_update_io_seektime(bfqd, bfqq, rq);
+- if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
+- !BFQQ_SEEKY(bfqq))
+- bfq_update_idle_window(bfqd, bfqq, bic);
+
+ bfq_log_bfqq(bfqd, bfqq,
+- "rq_enqueued: idle_window=%d (seeky %d)",
+- bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));
++ "rq_enqueued: has_short_ttime=%d (seeky %d)",
++ bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq));
+
+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
+
+diff --git a/block/bfq.h b/block/bfq.h
+index ebd9688b9f61..34fc4697fd89 100644
+--- a/block/bfq.h
++++ b/block/bfq.h
+@@ -349,11 +349,11 @@ struct bfq_io_cq {
+ #endif
+
+ /*
+- * Snapshot of the idle window before merging; taken to
+- * remember this value while the queue is merged, so as to be
+- * able to restore it in case of split.
++ * Snapshot of the has_short_ttime flag before merging; taken
++ * to remember its value while the queue is merged, so as to
++ * be able to restore it in case of split.
+ */
+- bool saved_idle_window;
++ bool saved_has_short_ttime;
+ /*
+ * Same purpose as the previous two fields for the I/O bound
+ * classification of a queue.
+@@ -610,7 +610,7 @@ enum bfqq_state_flags {
+ */
+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
+- BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
++ BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */
+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
+ BFQ_BFQQ_FLAG_IO_bound, /*
+ * bfqq has timed-out at least once
+@@ -649,7 +649,7 @@ BFQ_BFQQ_FNS(wait_request);
+ BFQ_BFQQ_FNS(non_blocking_wait_rq);
+ BFQ_BFQQ_FNS(must_alloc);
+ BFQ_BFQQ_FNS(fifo_expire);
+-BFQ_BFQQ_FNS(idle_window);
++BFQ_BFQQ_FNS(has_short_ttime);
+ BFQ_BFQQ_FNS(sync);
+ BFQ_BFQQ_FNS(IO_bound);
+ BFQ_BFQQ_FNS(in_large_burst);
+
+From b5e746fa99d961a5642cffb27c19a77e8b638007 Mon Sep 17 00:00:00 2001
+From: Paolo Valente
+Date: Mon, 19 Dec 2016 16:59:33 +0100
+Subject: [PATCH 06/51] FIRST BFQ-MQ COMMIT: Copy bfq-sq-iosched.c as
+ bfq-mq-iosched.c
+
+This commit introduces bfq-mq-iosched.c, the main source file that
+will contain the code of bfq for blk-mq. I tentatively name this
+version of bfq bfq-mq.
+
+For the moment, the file bfq-mq-iosched.c is just a copy of
+bfq-sq-iosched.c, i.e., of the main source file of bfq for blk.
+ +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 5392 ++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 5392 insertions(+) + create mode 100644 block/bfq-mq-iosched.c + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +new file mode 100644 +index 000000000000..30d019fc67e0 +--- /dev/null ++++ b/block/bfq-mq-iosched.c +@@ -0,0 +1,5392 @@ ++/* ++ * Budget Fair Queueing (BFQ) I/O scheduler. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2015 Paolo Valente ++ * ++ * Copyright (C) 2017 Paolo Valente ++ * ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ ++ * file. ++ * ++ * BFQ is a proportional-share I/O scheduler, with some extra ++ * low-latency capabilities. BFQ also supports full hierarchical ++ * scheduling through cgroups. Next paragraphs provide an introduction ++ * on BFQ inner workings. Details on BFQ benefits and usage can be ++ * found in Documentation/block/bfq-iosched.txt. ++ * ++ * BFQ is a proportional-share storage-I/O scheduling algorithm based ++ * on the slice-by-slice service scheme of CFQ. But BFQ assigns ++ * budgets, measured in number of sectors, to processes instead of ++ * time slices. The device is not granted to the in-service process ++ * for a given time slice, but until it has exhausted its assigned ++ * budget. This change from the time to the service domain enables BFQ ++ * to distribute the device throughput among processes as desired, ++ * without any distortion due to throughput fluctuations, or to device ++ * internal queueing. BFQ uses an ad hoc internal scheduler, called ++ * B-WF2Q+, to schedule processes according to their budgets. More ++ * precisely, BFQ schedules queues associated with processes. Thanks to ++ * the accurate policy of B-WF2Q+, BFQ can afford to assign high ++ * budgets to I/O-bound processes issuing sequential requests (to ++ * boost the throughput), and yet guarantee a low latency to ++ * interactive and soft real-time applications. ++ * ++ * NOTE: if the main or only goal, with a given device, is to achieve ++ * the maximum-possible throughput at all times, then do switch off ++ * all low-latency heuristics for that device, by setting low_latency ++ * to 0. ++ * ++ * BFQ is described in [1], where also a reference to the initial, more ++ * theoretical paper on BFQ can be found. The interested reader can find ++ * in the latter paper full details on the main algorithm, as well as ++ * formulas of the guarantees and formal proofs of all the properties. ++ * With respect to the version of BFQ presented in these papers, this ++ * implementation adds a few more heuristics, such as the one that ++ * guarantees a low latency to soft real-time applications, and a ++ * hierarchical extension based on H-WF2Q+. ++ * ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) ++ * complexity derives from the one introduced with EEVDF in [3]. ++ * ++ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O ++ * Scheduler", Proceedings of the First Workshop on Mobile System ++ * Technologies (MST-2015), May 2015. ++ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf ++ * ++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf ++ * ++ * [2] Jon C.R. Bennett and H. 
Zhang, ``Hierarchical Packet Fair Queueing
++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
++ * Oct 1997.
++ *
++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
++ *
++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
++ * First: A Flexible and Accurate Mechanism for Proportional Share
++ * Resource Allocation,'' technical report.
++ *
++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
++ */
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/blkdev.h>
++#include <linux/cgroup.h>
++#include <linux/elevator.h>
++#include <linux/jiffies.h>
++#include <linux/rbtree.h>
++#include <linux/ioprio.h>
++#include "blk.h"
++#include "bfq.h"
++
++/* Expiration time of sync (0) and async (1) requests, in ns. */
++static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
++
++/* Maximum backwards seek, in KiB. */
++static const int bfq_back_max = (16 * 1024);
++
++/* Penalty of a backwards seek, in number of sectors. */
++static const int bfq_back_penalty = 2;
++
++/* Idling period duration, in ns. */
++static u32 bfq_slice_idle = (NSEC_PER_SEC / 125);
++
++/* Minimum number of assigned budgets for which stats are safe to compute. */
++static const int bfq_stats_min_budgets = 194;
++
++/* Default maximum budget values, in sectors and number of requests. */
++static const int bfq_default_max_budget = (16 * 1024);
++
++/*
++ * Async to sync throughput distribution is controlled as follows:
++ * when an async request is served, the entity is charged the number
++ * of sectors of the request, multiplied by the factor below
++ */
++static const int bfq_async_charge_factor = 10;
++
++/* Default timeout values, in jiffies, approximating CFQ defaults. */
++static const int bfq_timeout = (HZ / 8);
++
++static struct kmem_cache *bfq_pool;
++
++/* Below this threshold (in ns), we consider thinktime immediate. */
++#define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
++
++/* hw_tag detection: parallel requests threshold and min samples needed. */
++#define BFQ_HW_QUEUE_THRESHOLD 4
++#define BFQ_HW_QUEUE_SAMPLES 32
++
++#define BFQQ_SEEK_THR (sector_t)(8 * 100)
++#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
++#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
++#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
++
++/* Min number of samples required to perform peak-rate update */
++#define BFQ_RATE_MIN_SAMPLES 32
++/* Min observation time interval required to perform a peak-rate update (ns) */
++#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC)
++/* Target observation time interval for a peak-rate update (ns) */
++#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC
++
++/* Shift used for peak rate fixed precision calculations. */
++#define BFQ_RATE_SHIFT 16
++
++/*
++ * By default, BFQ computes the duration of the weight raising for
++ * interactive applications automatically, using the following formula:
++ * duration = (R / r) * T, where r is the peak rate of the device, and
++ * R and T are two reference parameters.
++ * In particular, R is the peak rate of the reference device (see below),
++ * and T is a reference time: given the systems that are likely to be
++ * installed on the reference device according to its speed class, T is
++ * about the maximum time needed, under BFQ and while reading two files in
++ * parallel, to load typical large applications on these systems.
++ * In practice, the slower/faster the device at hand is, the more/less it
++ * takes to load applications with respect to the reference device.
++ * Accordingly, the longer/shorter BFQ grants weight raising to interactive
++ * applications.
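++ *
++ * As a purely numerical illustration of the formula above: a device
++ * whose estimated peak rate r is twice the reference rate R of its
++ * speed class gets duration = (R / r) * T = T / 2, while a device
++ * half as fast as the reference one gets 2 * T.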
++ *
++ * BFQ uses four different reference pairs (R, T), depending on:
++ * . whether the device is rotational or non-rotational;
++ * . whether the device is slow, such as old or portable HDDs, as well as
++ * SD cards, or fast, such as newer HDDs and SSDs.
++ *
++ * The device's speed class is dynamically (re)detected in
++ * bfq_update_peak_rate() every time the estimated peak rate is updated.
++ *
++ * In the following definitions, R_slow[0]/R_fast[0] and
++ * T_slow[0]/T_fast[0] are the reference values for a slow/fast
++ * rotational device, whereas R_slow[1]/R_fast[1] and
++ * T_slow[1]/T_fast[1] are the reference values for a slow/fast
++ * non-rotational device. Finally, device_speed_thresh are the
++ * thresholds used to switch between speed classes. The reference
++ * rates are not the actual peak rates of the devices used as a
++ * reference, but slightly lower values. The reason for using these
++ * slightly lower values is that the peak-rate estimator tends to
++ * yield slightly lower values than the actual peak rate (it can yield
++ * the actual peak rate only if there is only one process doing I/O,
++ * and the process does sequential I/O).
++ *
++ * Both the reference peak rates and the thresholds are measured in
++ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
++ */
++static int R_slow[2] = {1000, 10700};
++static int R_fast[2] = {14000, 33000};
++/*
++ * To improve readability, a conversion function is used to initialize the
++ * following arrays, which entails that they can be initialized only in a
++ * function.
++ */
++static int T_slow[2];
++static int T_fast[2];
++static int device_speed_thresh[2];
++
++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
++
++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
++#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
++
++static void bfq_schedule_dispatch(struct bfq_data *bfqd);
++
++#include "bfq-ioc.c"
++#include "bfq-sched.c"
++#include "bfq-cgroup-included.c"
++
++#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
++#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
++
++#define bfq_sample_valid(samples) ((samples) > 80)
++
++/*
++ * Scheduler run of queue, if there are requests pending and no one in the
++ * driver that will restart queueing.
++ */
++static void bfq_schedule_dispatch(struct bfq_data *bfqd)
++{
++ if (bfqd->queued != 0) {
++ bfq_log(bfqd, "schedule dispatch");
++ kblockd_schedule_work(&bfqd->unplug_work);
++ }
++}
++
++/*
++ * Lifted from AS - choose which of rq1 and rq2 that is best served now.
++ * We choose the request that is closest to the head right now. Distance
++ * behind the head is penalized and only allowed to a certain extent.
++ */
++static struct request *bfq_choose_req(struct bfq_data *bfqd,
++ struct request *rq1,
++ struct request *rq2,
++ sector_t last)
++{
++ sector_t s1, s2, d1 = 0, d2 = 0;
++ unsigned long back_max;
++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
++ unsigned int wrap = 0; /* bit mask: requests behind the disk head?
*/ ++ ++ if (!rq1 || rq1 == rq2) ++ return rq2; ++ if (!rq2) ++ return rq1; ++ ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) ++ return rq1; ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) ++ return rq2; ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) ++ return rq1; ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) ++ return rq2; ++ ++ s1 = blk_rq_pos(rq1); ++ s2 = blk_rq_pos(rq2); ++ ++ /* ++ * By definition, 1KiB is 2 sectors. ++ */ ++ back_max = bfqd->bfq_back_max * 2; ++ ++ /* ++ * Strict one way elevator _except_ in the case where we allow ++ * short backward seeks which are biased as twice the cost of a ++ * similar forward seek. ++ */ ++ if (s1 >= last) ++ d1 = s1 - last; ++ else if (s1 + back_max >= last) ++ d1 = (last - s1) * bfqd->bfq_back_penalty; ++ else ++ wrap |= BFQ_RQ1_WRAP; ++ ++ if (s2 >= last) ++ d2 = s2 - last; ++ else if (s2 + back_max >= last) ++ d2 = (last - s2) * bfqd->bfq_back_penalty; ++ else ++ wrap |= BFQ_RQ2_WRAP; ++ ++ /* Found required data */ ++ ++ /* ++ * By doing switch() on the bit mask "wrap" we avoid having to ++ * check two variables for all permutations: --> faster! ++ */ ++ switch (wrap) { ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ ++ if (d1 < d2) ++ return rq1; ++ else if (d2 < d1) ++ return rq2; ++ ++ if (s1 >= s2) ++ return rq1; ++ else ++ return rq2; ++ ++ case BFQ_RQ2_WRAP: ++ return rq1; ++ case BFQ_RQ1_WRAP: ++ return rq2; ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ ++ default: ++ /* ++ * Since both rqs are wrapped, ++ * start with the one that's further behind head ++ * (--> only *one* back seek required), ++ * since back seek takes more time than forward. ++ */ ++ if (s1 <= s2) ++ return rq1; ++ else ++ return rq2; ++ } ++} ++ ++static struct bfq_queue * ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, ++ sector_t sector, struct rb_node **ret_parent, ++ struct rb_node ***rb_link) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *bfqq = NULL; ++ ++ parent = NULL; ++ p = &root->rb_node; ++ while (*p) { ++ struct rb_node **n; ++ ++ parent = *p; ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ ++ /* ++ * Sort strictly based on sector. Smallest to the left, ++ * largest to the right. ++ */ ++ if (sector > blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_right; ++ else if (sector < blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_left; ++ else ++ break; ++ p = n; ++ bfqq = NULL; ++ } ++ ++ *ret_parent = parent; ++ if (rb_link) ++ *rb_link = p; ++ ++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", ++ (unsigned long long) sector, ++ bfqq ? bfqq->pid : 0); ++ ++ return bfqq; ++} ++ ++static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *__bfqq; ++ ++ if (bfqq->pos_root) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ ++ if (bfq_class_idle(bfqq)) ++ return; ++ if (!bfqq->next_rq) ++ return; ++ ++ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, ++ blk_rq_pos(bfqq->next_rq), &parent, &p); ++ if (!__bfqq) { ++ rb_link_node(&bfqq->pos_node, parent, p); ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); ++ } else ++ bfqq->pos_root = NULL; ++} ++ ++/* ++ * Tell whether there are active queues or groups with differentiated weights. 
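++ *
++ * For example (hypothetical weights): two active queues with
++ * weights 100 and 200 make queue_weights_tree contain two nodes,
++ * which the check below reports as a differentiated-weights
++ * scenario.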
++ */ ++static bool bfq_differentiated_weights(struct bfq_data *bfqd) ++{ ++ /* ++ * For weights to differ, at least one of the trees must contain ++ * at least two nodes. ++ */ ++ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && ++ (bfqd->queue_weights_tree.rb_node->rb_left || ++ bfqd->queue_weights_tree.rb_node->rb_right) ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ ) || ++ (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && ++ (bfqd->group_weights_tree.rb_node->rb_left || ++ bfqd->group_weights_tree.rb_node->rb_right) ++#endif ++ ); ++} ++ ++/* ++ * The following function returns true if every queue must receive the ++ * same share of the throughput (this condition is used when deciding ++ * whether idling may be disabled, see the comments in the function ++ * bfq_bfqq_may_idle()). ++ * ++ * Such a scenario occurs when: ++ * 1) all active queues have the same weight, ++ * 2) all active groups at the same level in the groups tree have the same ++ * weight, ++ * 3) all active groups at the same level in the groups tree have the same ++ * number of children. ++ * ++ * Unfortunately, keeping the necessary state for evaluating exactly the ++ * above symmetry conditions would be quite complex and time-consuming. ++ * Therefore this function evaluates, instead, the following stronger ++ * sub-conditions, for which it is much easier to maintain the needed ++ * state: ++ * 1) all active queues have the same weight, ++ * 2) all active groups have the same weight, ++ * 3) all active groups have at most one active child each. ++ * In particular, the last two conditions are always true if hierarchical ++ * support and the cgroups interface are not enabled, thus no state needs ++ * to be maintained in this case. ++ */ ++static bool bfq_symmetric_scenario(struct bfq_data *bfqd) ++{ ++ return !bfq_differentiated_weights(bfqd); ++} ++ ++/* ++ * If the weight-counter tree passed as input contains no counter for ++ * the weight of the input entity, then add that counter; otherwise just ++ * increment the existing counter. ++ * ++ * Note that weight-counter trees contain few nodes in mostly symmetric ++ * scenarios. For example, if all queues have the same weight, then the ++ * weight-counter tree for the queues may contain at most one node. ++ * This holds even if low_latency is on, because weight-raised queues ++ * are not inserted in the tree. ++ * In most scenarios, the rate at which nodes are created/destroyed ++ * should be low too. ++ */ ++static void bfq_weights_tree_add(struct bfq_data *bfqd, ++ struct bfq_entity *entity, ++ struct rb_root *root) ++{ ++ struct rb_node **new = &(root->rb_node), *parent = NULL; ++ ++ /* ++ * Do not insert if the entity is already associated with a ++ * counter, which happens if: ++ * 1) the entity is associated with a queue, ++ * 2) a request arrival has caused the queue to become both ++ * non-weight-raised, and hence change its weight, and ++ * backlogged; in this respect, each of the two events ++ * causes an invocation of this function, ++ * 3) this is the invocation of this function caused by the ++ * second event. This second invocation is actually useless, ++ * and we handle this fact by exiting immediately. More ++ * efficient or clearer solutions might possibly be adopted. 
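++ *
++ * As an illustration of case 2 above: a queue whose weight raising
++ * just ended both changes its weight and, upon the triggering
++ * request arrival, becomes backlogged; the second of the two
++ * resulting invocations then finds entity->weight_counter already
++ * set and returns immediately.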
++ */ ++ if (entity->weight_counter) ++ return; ++ ++ while (*new) { ++ struct bfq_weight_counter *__counter = container_of(*new, ++ struct bfq_weight_counter, ++ weights_node); ++ parent = *new; ++ ++ if (entity->weight == __counter->weight) { ++ entity->weight_counter = __counter; ++ goto inc_counter; ++ } ++ if (entity->weight < __counter->weight) ++ new = &((*new)->rb_left); ++ else ++ new = &((*new)->rb_right); ++ } ++ ++ entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), ++ GFP_ATOMIC); ++ ++ /* ++ * In the unlucky event of an allocation failure, we just ++ * exit. This will cause the weight of entity to not be ++ * considered in bfq_differentiated_weights, which, in its ++ * turn, causes the scenario to be deemed wrongly symmetric in ++ * case entity's weight would have been the only weight making ++ * the scenario asymmetric. On the bright side, no unbalance ++ * will however occur when entity becomes inactive again (the ++ * invocation of this function is triggered by an activation ++ * of entity). In fact, bfq_weights_tree_remove does nothing ++ * if !entity->weight_counter. ++ */ ++ if (unlikely(!entity->weight_counter)) ++ return; ++ ++ entity->weight_counter->weight = entity->weight; ++ rb_link_node(&entity->weight_counter->weights_node, parent, new); ++ rb_insert_color(&entity->weight_counter->weights_node, root); ++ ++inc_counter: ++ entity->weight_counter->num_active++; ++} ++ ++/* ++ * Decrement the weight counter associated with the entity, and, if the ++ * counter reaches 0, remove the counter from the tree. ++ * See the comments to the function bfq_weights_tree_add() for considerations ++ * about overhead. ++ */ ++static void bfq_weights_tree_remove(struct bfq_data *bfqd, ++ struct bfq_entity *entity, ++ struct rb_root *root) ++{ ++ if (!entity->weight_counter) ++ return; ++ ++ BUG_ON(RB_EMPTY_ROOT(root)); ++ BUG_ON(entity->weight_counter->weight != entity->weight); ++ ++ BUG_ON(!entity->weight_counter->num_active); ++ entity->weight_counter->num_active--; ++ if (entity->weight_counter->num_active > 0) ++ goto reset_entity_pointer; ++ ++ rb_erase(&entity->weight_counter->weights_node, root); ++ kfree(entity->weight_counter); ++ ++reset_entity_pointer: ++ entity->weight_counter = NULL; ++} ++ ++/* ++ * Return expired entry, or NULL to just start from scratch in rbtree. ++ */ ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq, ++ struct request *last) ++{ ++ struct request *rq; ++ ++ if (bfq_bfqq_fifo_expire(bfqq)) ++ return NULL; ++ ++ bfq_mark_bfqq_fifo_expire(bfqq); ++ ++ rq = rq_entry_fifo(bfqq->fifo.next); ++ ++ if (rq == last || ktime_get_ns() < rq->fifo_time) ++ return NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); ++ BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); ++ return rq; ++} ++ ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct request *last) ++{ ++ struct rb_node *rbnext = rb_next(&last->rb_node); ++ struct rb_node *rbprev = rb_prev(&last->rb_node); ++ struct request *next, *prev = NULL; ++ ++ BUG_ON(list_empty(&bfqq->fifo)); ++ ++ /* Follow expired path, else get first next available. 
*/
++ next = bfq_check_fifo(bfqq, last);
++ if (next) {
++ BUG_ON(next == last);
++ return next;
++ }
++
++ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
++
++ if (rbprev)
++ prev = rb_entry_rq(rbprev);
++
++ if (rbnext)
++ next = rb_entry_rq(rbnext);
++ else {
++ rbnext = rb_first(&bfqq->sort_list);
++ if (rbnext && rbnext != &last->rb_node)
++ next = rb_entry_rq(rbnext);
++ }
++
++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
++}
++
++/* see the definition of bfq_async_charge_factor for details */
++static unsigned long bfq_serv_to_charge(struct request *rq,
++ struct bfq_queue *bfqq)
++{
++ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
++ return blk_rq_sectors(rq);
++
++ /*
++ * If there are no weight-raised queues, then amplify service
++ * by just the async charge factor; otherwise amplify service
++ * by twice the async charge factor, to further reduce latency
++ * for weight-raised queues.
++ */
++ if (bfqq->bfqd->wr_busy_queues == 0)
++ return blk_rq_sectors(rq) * bfq_async_charge_factor;
++
++ return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor;
++}
++
++/**
++ * bfq_updated_next_req - update the queue after a new next_rq selection.
++ * @bfqd: the device data the queue belongs to.
++ * @bfqq: the queue to update.
++ *
++ * If the first request of a queue changes we make sure that the queue
++ * has enough budget to serve at least its first request (if the
++ * request has grown). We do this because if the queue does not have
++ * enough budget for its first request, it has to go through two
++ * dispatch rounds to actually get it dispatched.
++ */
++static void bfq_updated_next_req(struct bfq_data *bfqd,
++ struct bfq_queue *bfqq)
++{
++ struct bfq_entity *entity = &bfqq->entity;
++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
++ struct request *next_rq = bfqq->next_rq;
++ unsigned long new_budget;
++
++ if (!next_rq)
++ return;
++
++ if (bfqq == bfqd->in_service_queue)
++ /*
++ * In order not to break guarantees, budgets cannot be
++ * changed after an entity has been selected.
++ */
++ return;
++
++ BUG_ON(entity->tree != &st->active);
++ BUG_ON(entity == entity->sched_data->in_service_entity);
++
++ new_budget = max_t(unsigned long, bfqq->max_budget,
++ bfq_serv_to_charge(next_rq, bfqq));
++ if (entity->budget != new_budget) {
++ entity->budget = new_budget;
++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
++ new_budget);
++ bfq_requeue_bfqq(bfqd, bfqq);
++ }
++}
++
++static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
++{
++ u64 dur;
++
++ if (bfqd->bfq_wr_max_time > 0)
++ return bfqd->bfq_wr_max_time;
++
++ dur = bfqd->RT_prod;
++ do_div(dur, bfqd->peak_rate);
++
++ /*
++ * Limit duration between 3 and 13 seconds. Tests show that
++ * higher values than 13 seconds often yield the opposite of
++ * the desired result, i.e., worsen responsiveness by letting
++ * non-interactive and non-soft-real-time applications
++ * preserve weight raising for too long a time interval.
++ *
++ * On the other hand, lower values than 3 seconds make it
++ * difficult for most interactive tasks to complete their jobs
++ * before weight-raising finishes.
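++ *
++ * As a numerical illustration (made-up values): with R/r = 1/4 and
++ * T = 8 s for the device's speed class, dur evaluates to 2 s and is
++ * therefore raised to the 3 s minimum below.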
++ */ ++ if (dur > msecs_to_jiffies(13000)) ++ dur = msecs_to_jiffies(13000); ++ else if (dur < msecs_to_jiffies(3000)) ++ dur = msecs_to_jiffies(3000); ++ ++ return dur; ++} ++ ++static void ++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, ++ struct bfq_io_cq *bic, bool bfq_already_existing) ++{ ++ unsigned int old_wr_coeff; ++ bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); ++ ++ if (bic->saved_has_short_ttime) ++ bfq_mark_bfqq_has_short_ttime(bfqq); ++ else ++ bfq_clear_bfqq_has_short_ttime(bfqq); ++ ++ if (bic->saved_IO_bound) ++ bfq_mark_bfqq_IO_bound(bfqq); ++ else ++ bfq_clear_bfqq_IO_bound(bfqq); ++ ++ if (unlikely(busy)) ++ old_wr_coeff = bfqq->wr_coeff; ++ ++ bfqq->wr_coeff = bic->saved_wr_coeff; ++ bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; ++ BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); ++ bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; ++ bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; ++ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); ++ ++ if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || ++ time_is_before_jiffies(bfqq->last_wr_start_finish + ++ bfqq->wr_cur_max_time))) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "resume state: switching off wr (%lu + %lu < %lu)", ++ bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, ++ jiffies); ++ ++ bfqq->wr_coeff = 1; ++ } ++ ++ /* make sure weight will be updated, however we got here */ ++ bfqq->entity.prio_changed = 1; ++ ++ if (likely(!busy)) ++ return; ++ ++ if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) { ++ bfqd->wr_busy_queues++; ++ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); ++ } else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) { ++ bfqd->wr_busy_queues--; ++ BUG_ON(bfqd->wr_busy_queues < 0); ++ } ++} ++ ++static int bfqq_process_refs(struct bfq_queue *bfqq) ++{ ++ int process_refs, io_refs; ++ ++ lockdep_assert_held(bfqq->bfqd->queue->queue_lock); ++ ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; ++ process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; ++ BUG_ON(process_refs < 0); ++ return process_refs; ++} ++ ++/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ ++static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_queue *item; ++ struct hlist_node *n; ++ ++ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) ++ hlist_del_init(&item->burst_list_node); ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); ++ bfqd->burst_size = 1; ++ bfqd->burst_parent_entity = bfqq->entity.parent; ++} ++ ++/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ ++static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ /* Increment burst size to take into account also bfqq */ ++ bfqd->burst_size++; ++ ++ bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); ++ ++ BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); ++ ++ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { ++ struct bfq_queue *pos, *bfqq_item; ++ struct hlist_node *n; ++ ++ /* ++ * Enough queues have been activated shortly after each ++ * other to consider this burst as large. ++ */ ++ bfqd->large_burst = true; ++ bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); ++ ++ /* ++ * We can now mark all queues in the burst list as ++ * belonging to a large burst. 
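++ *
++ * For instance, the queues of the many services spawned in
++ * parallel by systemd at boot typically reach this threshold
++ * together, and all of them get marked here in one go.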
++ */ ++ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, ++ burst_list_node) { ++ bfq_mark_bfqq_in_large_burst(bfqq_item); ++ bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); ++ } ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); ++ ++ /* ++ * From now on, and until the current burst finishes, any ++ * new queue being activated shortly after the last queue ++ * was inserted in the burst can be immediately marked as ++ * belonging to a large burst. So the burst list is not ++ * needed any more. Remove it. ++ */ ++ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, ++ burst_list_node) ++ hlist_del_init(&pos->burst_list_node); ++ } else /* ++ * Burst not yet large: add bfqq to the burst list. Do ++ * not increment the ref counter for bfqq, because bfqq ++ * is removed from the burst list before freeing bfqq ++ * in put_queue. ++ */ ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); ++} ++ ++/* ++ * If many queues belonging to the same group happen to be created ++ * shortly after each other, then the processes associated with these ++ * queues have typically a common goal. In particular, bursts of queue ++ * creations are usually caused by services or applications that spawn ++ * many parallel threads/processes. Examples are systemd during boot, ++ * or git grep. To help these processes get their job done as soon as ++ * possible, it is usually better to not grant either weight-raising ++ * or device idling to their queues. ++ * ++ * In this comment we describe, firstly, the reasons why this fact ++ * holds, and, secondly, the next function, which implements the main ++ * steps needed to properly mark these queues so that they can then be ++ * treated in a different way. ++ * ++ * The above services or applications benefit mostly from a high ++ * throughput: the quicker the requests of the activated queues are ++ * cumulatively served, the sooner the target job of these queues gets ++ * completed. As a consequence, weight-raising any of these queues, ++ * which also implies idling the device for it, is almost always ++ * counterproductive. In most cases it just lowers throughput. ++ * ++ * On the other hand, a burst of queue creations may be caused also by ++ * the start of an application that does not consist of a lot of ++ * parallel I/O-bound threads. In fact, with a complex application, ++ * several short processes may need to be executed to start-up the ++ * application. In this respect, to start an application as quickly as ++ * possible, the best thing to do is in any case to privilege the I/O ++ * related to the application with respect to all other ++ * I/O. Therefore, the best strategy to start as quickly as possible ++ * an application that causes a burst of queue creations is to ++ * weight-raise all the queues created during the burst. This is the ++ * exact opposite of the best strategy for the other type of bursts. ++ * ++ * In the end, to take the best action for each of the two cases, the ++ * two types of bursts need to be distinguished. Fortunately, this ++ * seems relatively easy, by looking at the sizes of the bursts. In ++ * particular, we found a threshold such that only bursts with a ++ * larger size than that threshold are apparently caused by ++ * services or commands such as systemd or git grep. For brevity, ++ * hereafter we call just 'large' these bursts. BFQ *does not* ++ * weight-raise queues whose creation occurs in a large burst. 
In ++ * addition, for each of these queues BFQ performs or does not perform ++ * idling depending on which choice boosts the throughput more. The ++ * exact choice depends on the device and request pattern at ++ * hand. ++ * ++ * Unfortunately, false positives may occur while an interactive task ++ * is starting (e.g., an application is being started). The ++ * consequence is that the queues associated with the task do not ++ * enjoy weight raising as expected. Fortunately these false positives ++ * are very rare. They typically occur if some service happens to ++ * start doing I/O exactly when the interactive task starts. ++ * ++ * Turning back to the next function, it implements all the steps ++ * needed to detect the occurrence of a large burst and to properly ++ * mark all the queues belonging to it (so that they can then be ++ * treated in a different way). This goal is achieved by maintaining a ++ * "burst list" that holds, temporarily, the queues that belong to the ++ * burst in progress. The list is then used to mark these queues as ++ * belonging to a large burst if the burst does become large. The main ++ * steps are the following. ++ * ++ * . when the very first queue is created, the queue is inserted into the ++ * list (as it could be the first queue in a possible burst) ++ * ++ * . if the current burst has not yet become large, and a queue Q that does ++ * not yet belong to the burst is activated shortly after the last time ++ * at which a new queue entered the burst list, then the function appends ++ * Q to the burst list ++ * ++ * . if, as a consequence of the previous step, the burst size reaches ++ * the large-burst threshold, then ++ * ++ * . all the queues in the burst list are marked as belonging to a ++ * large burst ++ * ++ * . the burst list is deleted; in fact, the burst list already served ++ * its purpose (keeping temporarily track of the queues in a burst, ++ * so as to be able to mark them as belonging to a large burst in the ++ * previous sub-step), and now is not needed any more ++ * ++ * . the device enters a large-burst mode ++ * ++ * . if a queue Q that does not belong to the burst is created while ++ * the device is in large-burst mode and shortly after the last time ++ * at which a queue either entered the burst list or was marked as ++ * belonging to the current large burst, then Q is immediately marked ++ * as belonging to a large burst. ++ * ++ * . if a queue Q that does not belong to the burst is created a while ++ * later, i.e., not shortly after, than the last time at which a queue ++ * either entered the burst list or was marked as belonging to the ++ * current large burst, then the current burst is deemed as finished and: ++ * ++ * . the large-burst mode is reset if set ++ * ++ * . the burst list is emptied ++ * ++ * . Q is inserted in the burst list, as Q may be the first queue ++ * in a possible new burst (then the burst list contains just Q ++ * after this step). ++ */ ++static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ /* ++ * If bfqq is already in the burst list or is part of a large ++ * burst, or finally has just been split, then there is ++ * nothing else to do. 
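++ *
++ * (The "just split" case is detected through bfqq->split_time: a
++ * queue split less than 10 ms ago, see the msecs_to_jiffies(10)
++ * check below, is left alone, so that burst handling does not
++ * interfere with queue splitting.)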
++ */ ++ if (!hlist_unhashed(&bfqq->burst_list_node) || ++ bfq_bfqq_in_large_burst(bfqq) || ++ time_is_after_eq_jiffies(bfqq->split_time + ++ msecs_to_jiffies(10))) ++ return; ++ ++ /* ++ * If bfqq's creation happens late enough, or bfqq belongs to ++ * a different group than the burst group, then the current ++ * burst is finished, and related data structures must be ++ * reset. ++ * ++ * In this respect, consider the special case where bfqq is ++ * the very first queue created after BFQ is selected for this ++ * device. In this case, last_ins_in_burst and ++ * burst_parent_entity are not yet significant when we get ++ * here. But it is easy to verify that, whether or not the ++ * following condition is true, bfqq will end up being ++ * inserted into the burst list. In particular the list will ++ * happen to contain only bfqq. And this is exactly what has ++ * to happen, as bfqq may be the first queue of the first ++ * burst. ++ */ ++ if (time_is_before_jiffies(bfqd->last_ins_in_burst + ++ bfqd->bfq_burst_interval) || ++ bfqq->entity.parent != bfqd->burst_parent_entity) { ++ bfqd->large_burst = false; ++ bfq_reset_burst_list(bfqd, bfqq); ++ bfq_log_bfqq(bfqd, bfqq, ++ "handle_burst: late activation or different group"); ++ goto end; ++ } ++ ++ /* ++ * If we get here, then bfqq is being activated shortly after the ++ * last queue. So, if the current burst is also large, we can mark ++ * bfqq as belonging to this large burst immediately. ++ */ ++ if (bfqd->large_burst) { ++ bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ goto end; ++ } ++ ++ /* ++ * If we get here, then a large-burst state has not yet been ++ * reached, but bfqq is being activated shortly after the last ++ * queue. Then we add bfqq to the burst. ++ */ ++ bfq_add_to_burst(bfqd, bfqq); ++end: ++ /* ++ * At this point, bfqq either has been added to the current ++ * burst or has caused the current burst to terminate and a ++ * possible new burst to start. In particular, in the second ++ * case, bfqq has become the first queue in the possible new ++ * burst. In both cases last_ins_in_burst needs to be moved ++ * forward. ++ */ ++ bfqd->last_ins_in_burst = jiffies; ++ ++} ++ ++static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ return entity->budget - entity->service; ++} ++ ++/* ++ * If enough samples have been computed, return the current max budget ++ * stored in bfqd, which is dynamically updated according to the ++ * estimated disk peak rate; otherwise return the default max budget ++ */ ++static int bfq_max_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < bfq_stats_min_budgets) ++ return bfq_default_max_budget; ++ else ++ return bfqd->bfq_max_budget; ++} ++ ++/* ++ * Return min budget, which is a fraction of the current or default ++ * max budget (trying with 1/32) ++ */ ++static int bfq_min_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < bfq_stats_min_budgets) ++ return bfq_default_max_budget / 32; ++ else ++ return bfqd->bfq_max_budget / 32; ++} ++ ++static void bfq_bfqq_expire(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool compensate, ++ enum bfqq_expiration reason); ++ ++/* ++ * The next function, invoked after the input queue bfqq switches from ++ * idle to busy, updates the budget of bfqq. The function also tells ++ * whether the in-service queue should be expired, by returning ++ * true. 
The purpose of expiring the in-service queue is to give bfqq ++ * the chance to possibly preempt the in-service queue, and the reason ++ * for preempting the in-service queue is to achieve one of the two ++ * goals below. ++ * ++ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has ++ * expired because it has remained idle. In particular, bfqq may have ++ * expired for one of the following two reasons: ++ * ++ * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and ++ * did not make it to issue a new request before its last request ++ * was served; ++ * ++ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue ++ * a new request before the expiration of the idling-time. ++ * ++ * Even if bfqq has expired for one of the above reasons, the process ++ * associated with the queue may be however issuing requests greedily, ++ * and thus be sensitive to the bandwidth it receives (bfqq may have ++ * remained idle for other reasons: CPU high load, bfqq not enjoying ++ * idling, I/O throttling somewhere in the path from the process to ++ * the I/O scheduler, ...). But if, after every expiration for one of ++ * the above two reasons, bfqq has to wait for the service of at least ++ * one full budget of another queue before being served again, then ++ * bfqq is likely to get a much lower bandwidth or resource time than ++ * its reserved ones. To address this issue, two countermeasures need ++ * to be taken. ++ * ++ * First, the budget and the timestamps of bfqq need to be updated in ++ * a special way on bfqq reactivation: they need to be updated as if ++ * bfqq did not remain idle and did not expire. In fact, if they are ++ * computed as if bfqq expired and remained idle until reactivation, ++ * then the process associated with bfqq is treated as if, instead of ++ * being greedy, it stopped issuing requests when bfqq remained idle, ++ * and restarts issuing requests only on this reactivation. In other ++ * words, the scheduler does not help the process recover the "service ++ * hole" between bfqq expiration and reactivation. As a consequence, ++ * the process receives a lower bandwidth than its reserved one. In ++ * contrast, to recover this hole, the budget must be updated as if ++ * bfqq was not expired at all before this reactivation, i.e., it must ++ * be set to the value of the remaining budget when bfqq was ++ * expired. Along the same line, timestamps need to be assigned the ++ * value they had the last time bfqq was selected for service, i.e., ++ * before last expiration. Thus timestamps need to be back-shifted ++ * with respect to their normal computation (see [1] for more details ++ * on this tricky aspect). ++ * ++ * Secondly, to allow the process to recover the hole, the in-service ++ * queue must be expired too, to give bfqq the chance to preempt it ++ * immediately. In fact, if bfqq has to wait for a full budget of the ++ * in-service queue to be completed, then it may become impossible to ++ * let the process recover the hole, even if the back-shifted ++ * timestamps of bfqq are lower than those of the in-service queue. If ++ * this happens for most or all of the holes, then the process may not ++ * receive its reserved bandwidth. In this respect, it is worth noting ++ * that, being the service of outstanding requests unpreemptible, a ++ * little fraction of the holes may however be unrecoverable, thereby ++ * causing a little loss of bandwidth. ++ * ++ * The last important point is detecting whether bfqq does need this ++ * bandwidth recovery. 
++ * In this respect, the next function deems the
++ * process associated with bfqq greedy, and thus allows it to recover
++ * the hole, if: 1) the process is waiting for the arrival of a new
++ * request (which implies that bfqq expired for one of the above two
++ * reasons), and 2) such a request has arrived soon. The first
++ * condition is controlled through the flag non_blocking_wait_rq,
++ * while the second through the flag arrived_in_time. If both
++ * conditions hold, then the function computes the budget in the
++ * above-described special way, and signals that the in-service queue
++ * should be expired. Timestamp back-shifting is done later in
++ * __bfq_activate_entity.
++ *
++ * 2. Reduce latency. Even if timestamps are not back-shifted to let
++ * the process associated with bfqq recover a service hole, bfqq may
++ * however happen to have, after being (re)activated, a lower finish
++ * timestamp than the in-service queue. That is, the next budget of
++ * bfqq may have to be completed before the one of the in-service
++ * queue. If this is the case, then preempting the in-service queue
++ * allows this goal to be achieved, apart from the unpreemptible,
++ * outstanding requests mentioned above.
++ *
++ * Unfortunately, regardless of which of the above two goals one wants
++ * to achieve, service trees need first to be updated to know whether
++ * the in-service queue must be preempted. To have service trees
++ * correctly updated, the in-service queue must be expired and
++ * rescheduled, and bfqq must be scheduled too. This is one of the
++ * most costly operations (in future versions, the scheduling
++ * mechanism may be re-designed in such a way to make it possible to
++ * know whether preemption is needed without needing to update service
++ * trees). In addition, queue preemptions almost always cause random
++ * I/O, and thus loss of throughput. Because of these facts, the next
++ * function adopts the following simple scheme to avoid both costly
++ * operations and too frequent preemptions: it requests the expiration
++ * of the in-service queue (unconditionally) only for queues that need
++ * to recover a hole, or that either are weight-raised or deserve to
++ * be weight-raised.
++ */
++static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
++						struct bfq_queue *bfqq,
++						bool arrived_in_time,
++						bool wr_or_deserves_wr)
++{
++	struct bfq_entity *entity = &bfqq->entity;
++
++	if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
++		/*
++		 * We do not clear the flag non_blocking_wait_rq here, as
++		 * the latter is used in bfq_activate_bfqq to signal
++		 * that timestamps need to be back-shifted (and is
++		 * cleared right after).
++		 */
++
++		/*
++		 * In the next assignment we rely on the fact that
++		 * neither entity->service nor entity->budget is
++		 * updated on expiration if bfqq is empty (see
++		 * __bfq_bfqq_recalc_budget). Thus both quantities
++		 * remain unchanged after such an expiration, and the
++		 * following statement therefore assigns to
++		 * entity->budget the remaining budget on such an
++		 * expiration. For clarity, entity->service is not
++		 * updated on expiration in any case, and, in normal
++		 * operation, is reset only when bfqq is selected for
++		 * service (see bfq_get_next_queue).
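++		 *
++		 * (Editorial numeric sketch, not part of the original
++		 * comment, with assumed values: suppose bfqq expired
++		 * while idle with entity->budget = 8192 sectors and
++		 * entity->service = 2048 sectors, and that
++		 * bfqq->max_budget = 16384. The min_t() below then
++		 * assigns min(8192 - 2048, 16384) = 6144 sectors to
++		 * entity->budget, i.e., exactly the budget bfqq had
++		 * left, rather than a fresh full budget.)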
++ */ ++ BUG_ON(bfqq->max_budget < 0); ++ entity->budget = min_t(unsigned long, ++ bfq_bfqq_budget_left(bfqq), ++ bfqq->max_budget); ++ ++ BUG_ON(entity->budget < 0); ++ return true; ++ } ++ ++ BUG_ON(bfqq->max_budget < 0); ++ entity->budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(bfqq->next_rq, bfqq)); ++ BUG_ON(entity->budget < 0); ++ ++ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); ++ return wr_or_deserves_wr; ++} ++ ++static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ unsigned int old_wr_coeff, ++ bool wr_or_deserves_wr, ++ bool interactive, ++ bool in_burst, ++ bool soft_rt) ++{ ++ if (old_wr_coeff == 1 && wr_or_deserves_wr) { ++ /* start a weight-raising period */ ++ if (interactive) { ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ } else { ++ bfqq->wr_start_at_switch_to_srt = jiffies; ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff * ++ BFQ_SOFTRT_WEIGHT_FACTOR; ++ bfqq->wr_cur_max_time = ++ bfqd->bfq_wr_rt_max_time; ++ } ++ /* ++ * If needed, further reduce budget to make sure it is ++ * close to bfqq's backlog, so as to reduce the ++ * scheduling-error component due to a too large ++ * budget. Do not care about throughput consequences, ++ * but only about latency. Finally, do not assign a ++ * too small budget either, to avoid increasing ++ * latency by causing too frequent expirations. ++ */ ++ bfqq->entity.budget = min_t(unsigned long, ++ bfqq->entity.budget, ++ 2 * bfq_min_budget(bfqd)); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais starting at %lu, rais_max_time %u", ++ jiffies, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } else if (old_wr_coeff > 1) { ++ if (interactive) { /* update wr coeff and duration */ ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ } else if (in_burst) { ++ bfqq->wr_coeff = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais ending at %lu, rais_max_time %u", ++ jiffies, ++ jiffies_to_msecs(bfqq-> ++ wr_cur_max_time)); ++ } else if (soft_rt) { ++ /* ++ * The application is now or still meeting the ++ * requirements for being deemed soft rt. We ++ * can then correctly and safely (re)charge ++ * the weight-raising duration for the ++ * application with the weight-raising ++ * duration for soft rt applications. ++ * ++ * In particular, doing this recharge now, i.e., ++ * before the weight-raising period for the ++ * application finishes, reduces the probability ++ * of the following negative scenario: ++ * 1) the weight of a soft rt application is ++ * raised at startup (as for any newly ++ * created application), ++ * 2) since the application is not interactive, ++ * at a certain time weight-raising is ++ * stopped for the application, ++ * 3) at that time the application happens to ++ * still have pending requests, and hence ++ * is destined to not have a chance to be ++ * deemed soft rt before these requests are ++ * completed (see the comments to the ++ * function bfq_bfqq_softrt_next_start() ++ * for details on soft rt detection), ++ * 4) these pending requests experience a high ++ * latency because the application is not ++ * weight-raised while they are pending. 
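++			 *
++			 * (Editorial timeline sketch with assumed
++			 * values: if bfq_wr_rt_max_time is 300 ms and
++			 * the recharge below happens 250 ms into the
++			 * current raising period, the soft-rt
++			 * weight-raising now extends to 300 ms past
++			 * the recharge instant, instead of expiring
++			 * 50 ms later with requests still pending.)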
++ */ ++ if (bfqq->wr_cur_max_time != ++ bfqd->bfq_wr_rt_max_time) { ++ bfqq->wr_start_at_switch_to_srt = ++ bfqq->last_wr_start_finish; ++ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); ++ ++ bfqq->wr_cur_max_time = ++ bfqd->bfq_wr_rt_max_time; ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff * ++ BFQ_SOFTRT_WEIGHT_FACTOR; ++ bfq_log_bfqq(bfqd, bfqq, ++ "switching to soft_rt wr"); ++ } else ++ bfq_log_bfqq(bfqd, bfqq, ++ "moving forward soft_rt wr duration"); ++ bfqq->last_wr_start_finish = jiffies; ++ } ++ } ++} ++ ++static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ return bfqq->dispatched == 0 && ++ time_is_before_jiffies( ++ bfqq->budget_timeout + ++ bfqd->bfq_wr_min_idle_time); ++} ++ ++static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ int old_wr_coeff, ++ struct request *rq, ++ bool *interactive) ++{ ++ bool soft_rt, in_burst, wr_or_deserves_wr, ++ bfqq_wants_to_preempt, ++ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), ++ /* ++ * See the comments on ++ * bfq_bfqq_update_budg_for_activation for ++ * details on the usage of the next variable. ++ */ ++ arrived_in_time = ktime_get_ns() <= ++ RQ_BIC(rq)->ttime.last_end_request + ++ bfqd->bfq_slice_idle * 3; ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "bfq_add_request non-busy: " ++ "jiffies %lu, in_time %d, idle_long %d busyw %d " ++ "wr_coeff %u", ++ jiffies, arrived_in_time, ++ idle_for_long_time, ++ bfq_bfqq_non_blocking_wait_rq(bfqq), ++ old_wr_coeff); ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ BUG_ON(bfqq == bfqd->in_service_queue); ++ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); ++ ++ /* ++ * bfqq deserves to be weight-raised if: ++ * - it is sync, ++ * - it does not belong to a large burst, ++ * - it has been idle for enough time or is soft real-time, ++ * - is linked to a bfq_io_cq (it is not shared in any sense) ++ */ ++ in_burst = bfq_bfqq_in_large_burst(bfqq); ++ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && ++ !in_burst && ++ time_is_before_jiffies(bfqq->soft_rt_next_start); ++ *interactive = ++ !in_burst && ++ idle_for_long_time; ++ wr_or_deserves_wr = bfqd->low_latency && ++ (bfqq->wr_coeff > 1 || ++ (bfq_bfqq_sync(bfqq) && ++ bfqq->bic && (*interactive || soft_rt))); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "bfq_add_request: " ++ "in_burst %d, " ++ "soft_rt %d (next %lu), inter %d, bic %p", ++ bfq_bfqq_in_large_burst(bfqq), soft_rt, ++ bfqq->soft_rt_next_start, ++ *interactive, ++ bfqq->bic); ++ ++ /* ++ * Using the last flag, update budget and check whether bfqq ++ * may want to preempt the in-service queue. ++ */ ++ bfqq_wants_to_preempt = ++ bfq_bfqq_update_budg_for_activation(bfqd, bfqq, ++ arrived_in_time, ++ wr_or_deserves_wr); ++ ++ /* ++ * If bfqq happened to be activated in a burst, but has been ++ * idle for much more than an interactive queue, then we ++ * assume that, in the overall I/O initiated in the burst, the ++ * I/O associated with bfqq is finished. So bfqq does not need ++ * to be treated as a queue belonging to a burst ++ * anymore. Accordingly, we reset bfqq's in_large_burst flag ++ * if set, and remove bfqq from the burst list if it's ++ * there. We do not decrement burst_size, because the fact ++ * that bfqq does not need to belong to the burst list any ++ * more does not invalidate the fact that bfqq was created in ++ * a burst. 
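++	 *
++	 * (Editorial numeric sketch: assuming a default-like value of
++	 * 2000 ms for bfqd->bfq_wr_min_idle_time, a queue created in
++	 * a burst and then left without backlog for, say, 15 seconds
++	 * passes both the idle_for_long_time check and the 10000 ms
++	 * budget_timeout check below, and is thus removed from the
++	 * burst list.)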
++ */ ++ if (likely(!bfq_bfqq_just_created(bfqq)) && ++ idle_for_long_time && ++ time_is_before_jiffies( ++ bfqq->budget_timeout + ++ msecs_to_jiffies(10000))) { ++ hlist_del_init(&bfqq->burst_list_node); ++ bfq_clear_bfqq_in_large_burst(bfqq); ++ } ++ ++ bfq_clear_bfqq_just_created(bfqq); ++ ++ if (!bfq_bfqq_IO_bound(bfqq)) { ++ if (arrived_in_time) { ++ bfqq->requests_within_timer++; ++ if (bfqq->requests_within_timer >= ++ bfqd->bfq_requests_within_timer) ++ bfq_mark_bfqq_IO_bound(bfqq); ++ } else ++ bfqq->requests_within_timer = 0; ++ bfq_log_bfqq(bfqd, bfqq, "requests in time %d", ++ bfqq->requests_within_timer); ++ } ++ ++ if (bfqd->low_latency) { ++ if (unlikely(time_is_after_jiffies(bfqq->split_time))) ++ /* wraparound */ ++ bfqq->split_time = ++ jiffies - bfqd->bfq_wr_min_idle_time - 1; ++ ++ if (time_is_before_jiffies(bfqq->split_time + ++ bfqd->bfq_wr_min_idle_time)) { ++ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, ++ old_wr_coeff, ++ wr_or_deserves_wr, ++ *interactive, ++ in_burst, ++ soft_rt); ++ ++ if (old_wr_coeff != bfqq->wr_coeff) ++ bfqq->entity.prio_changed = 1; ++ } ++ } ++ ++ bfqq->last_idle_bklogged = jiffies; ++ bfqq->service_from_backlogged = 0; ++ bfq_clear_bfqq_softrt_update(bfqq); ++ ++ bfq_add_bfqq_busy(bfqd, bfqq); ++ ++ /* ++ * Expire in-service queue only if preemption may be needed ++ * for guarantees. In this respect, the function ++ * next_queue_may_preempt just checks a simple, necessary ++ * condition, and not a sufficient condition based on ++ * timestamps. In fact, for the latter condition to be ++ * evaluated, timestamps would need first to be updated, and ++ * this operation is quite costly (see the comments on the ++ * function bfq_bfqq_update_budg_for_activation). ++ */ ++ if (bfqd->in_service_queue && bfqq_wants_to_preempt && ++ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && ++ next_queue_may_preempt(bfqd)) { ++ struct bfq_queue *in_serv = ++ bfqd->in_service_queue; ++ BUG_ON(in_serv == bfqq); ++ ++ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, ++ false, BFQ_BFQQ_PREEMPTED); ++ } ++} ++ ++static void bfq_add_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ struct request *next_rq, *prev; ++ unsigned int old_wr_coeff = bfqq->wr_coeff; ++ bool interactive = false; ++ ++ bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", ++ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); ++ ++ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ ++ bfq_log_bfqq(bfqd, bfqq, ++ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", ++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time), ++ bfqq->wr_coeff, ++ bfqq->entity.weight, bfqq->entity.orig_weight); ++ ++ bfqq->queued[rq_is_sync(rq)]++; ++ bfqd->queued++; ++ ++ elv_rb_add(&bfqq->sort_list, rq); ++ ++ /* ++ * Check if this request is a better next-to-serve candidate. ++ */ ++ prev = bfqq->next_rq; ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); ++ BUG_ON(!next_rq); ++ bfqq->next_rq = next_rq; ++ ++ /* ++ * Adjust priority tree position, if next_rq changes. ++ */ ++ if (prev != bfqq->next_rq) ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ ++ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... 
*/ ++ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, ++ rq, &interactive); ++ else { ++ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && ++ time_is_before_jiffies( ++ bfqq->last_wr_start_finish + ++ bfqd->bfq_wr_min_inter_arr_async)) { ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ ++ bfqd->wr_busy_queues++; ++ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "non-idle wrais starting, " ++ "wr_max_time %u wr_busy %d", ++ jiffies_to_msecs(bfqq->wr_cur_max_time), ++ bfqd->wr_busy_queues); ++ } ++ if (prev != bfqq->next_rq) ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ /* ++ * Assign jiffies to last_wr_start_finish in the following ++ * cases: ++ * ++ * . if bfqq is not going to be weight-raised, because, for ++ * non weight-raised queues, last_wr_start_finish stores the ++ * arrival time of the last request; as of now, this piece ++ * of information is used only for deciding whether to ++ * weight-raise async queues ++ * ++ * . if bfqq is not weight-raised, because, if bfqq is now ++ * switching to weight-raised, then last_wr_start_finish ++ * stores the time when weight-raising starts ++ * ++ * . if bfqq is interactive, because, regardless of whether ++ * bfqq is currently weight-raised, the weight-raising ++ * period must start or restart (this case is considered ++ * separately because it is not detected by the above ++ * conditions, if bfqq is already weight-raised) ++ * ++ * last_wr_start_finish has to be updated also if bfqq is soft ++ * real-time, because the weight-raising period is constantly ++ * restarted on idle-to-busy transitions for these queues, but ++ * this is already done in bfq_bfqq_handle_idle_busy_switch if ++ * needed. 
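++	 *
++	 * (Editorial mapping of the cases above onto the condition
++	 * below: "not going to be weight-raised" corresponds to
++	 * bfqq->wr_coeff == 1 after the possible switch, "not
++	 * weight-raised" before the switch corresponds to
++	 * old_wr_coeff == 1, and the third case is caught by the
++	 * explicit "interactive" term.)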
++ */ ++ if (bfqd->low_latency && ++ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) ++ bfqq->last_wr_start_finish = jiffies; ++} ++ ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, ++ struct bio *bio) ++{ ++ struct task_struct *tsk = current; ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq; ++ ++ bic = bfq_bic_lookup(bfqd, tsk->io_context); ++ if (!bic) ++ return NULL; ++ ++ bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); ++ if (bfqq) ++ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); ++ ++ return NULL; ++} ++ ++static sector_t get_sdist(sector_t last_pos, struct request *rq) ++{ ++ sector_t sdist = 0; ++ ++ if (last_pos) { ++ if (last_pos < blk_rq_pos(rq)) ++ sdist = blk_rq_pos(rq) - last_pos; ++ else ++ sdist = last_pos - blk_rq_pos(rq); ++ } ++ ++ return sdist; ++} ++ ++static void bfq_activate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ bfqd->rq_in_driver++; ++} ++ ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ ++ BUG_ON(bfqd->rq_in_driver == 0); ++ bfqd->rq_in_driver--; ++} ++ ++static void bfq_remove_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ const int sync = rq_is_sync(rq); ++ ++ BUG_ON(bfqq->entity.service > bfqq->entity.budget && ++ bfqq == bfqd->in_service_queue); ++ ++ if (bfqq->next_rq == rq) { ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ if (rq->queuelist.prev != &rq->queuelist) ++ list_del_init(&rq->queuelist); ++ BUG_ON(bfqq->queued[sync] == 0); ++ bfqq->queued[sync]--; ++ bfqd->queued--; ++ elv_rb_del(&bfqq->sort_list, rq); ++ ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ bfqq->next_rq = NULL; ++ ++ BUG_ON(bfqq->entity.budget < 0); ++ ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { ++ BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */ ++ bfq_del_bfqq_busy(bfqd, bfqq, false); ++ /* ++ * bfqq emptied. In normal operation, when ++ * bfqq is empty, bfqq->entity.service and ++ * bfqq->entity.budget must contain, ++ * respectively, the service received and the ++ * budget used last time bfqq emptied. These ++ * facts do not hold in this case, as at least ++ * this last removal occurred while bfqq is ++ * not in service. To avoid inconsistencies, ++ * reset both bfqq->entity.service and ++ * bfqq->entity.budget, if bfqq has still a ++ * process that may issue I/O requests to it. ++ */ ++ bfqq->entity.budget = bfqq->entity.service = 0; ++ } ++ ++ /* ++ * Remove queue from request-position tree as it is empty. 
++ */ ++ if (bfqq->pos_root) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ } ++ ++ if (rq->cmd_flags & REQ_META) { ++ BUG_ON(bfqq->meta_pending == 0); ++ bfqq->meta_pending--; ++ } ++ bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); ++} ++ ++static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, ++ struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct request *__rq; ++ ++ __rq = bfq_find_rq_fmerge(bfqd, bio); ++ if (__rq && elv_bio_merge_ok(__rq, bio)) { ++ *req = __rq; ++ return ELEVATOR_FRONT_MERGE; ++ } ++ ++ return ELEVATOR_NO_MERGE; ++} ++ ++static void bfq_merged_request(struct request_queue *q, struct request *req, ++ enum elv_merge type) ++{ ++ if (type == ELEVATOR_FRONT_MERGE && ++ rb_prev(&req->rb_node) && ++ blk_rq_pos(req) < ++ blk_rq_pos(container_of(rb_prev(&req->rb_node), ++ struct request, rb_node))) { ++ struct bfq_queue *bfqq = RQ_BFQQ(req); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ struct request *prev, *next_rq; ++ ++ /* Reposition request in its sort_list */ ++ elv_rb_del(&bfqq->sort_list, req); ++ elv_rb_add(&bfqq->sort_list, req); ++ /* Choose next request to be served for bfqq */ ++ prev = bfqq->next_rq; ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, ++ bfqd->last_position); ++ BUG_ON(!next_rq); ++ bfqq->next_rq = next_rq; ++ /* ++ * If next_rq changes, update both the queue's budget to ++ * fit the new request and the queue's position in its ++ * rq_pos_tree. ++ */ ++ if (prev != bfqq->next_rq) { ++ bfq_updated_next_req(bfqd, bfqq); ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ } ++ } ++} ++ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++static void bfq_bio_merged(struct request_queue *q, struct request *req, ++ struct bio *bio) ++{ ++ bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); ++} ++#endif ++ ++static void bfq_merged_requests(struct request_queue *q, struct request *rq, ++ struct request *next) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); ++ ++ /* ++ * If next and rq belong to the same bfq_queue and next is older ++ * than rq, then reposition rq in the fifo (by substituting next ++ * with rq). Otherwise, if next and rq belong to different ++ * bfq_queues, never reposition rq: in fact, we would have to ++ * reposition it with respect to next's position in its own fifo, ++ * which would most certainly be too expensive with respect to ++ * the benefits. ++ */ ++ if (bfqq == next_bfqq && ++ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && ++ next->fifo_time < rq->fifo_time) { ++ list_del_init(&rq->queuelist); ++ list_replace_init(&next->queuelist, &rq->queuelist); ++ rq->fifo_time = next->fifo_time; ++ } ++ ++ if (bfqq->next_rq == next) ++ bfqq->next_rq = rq; ++ ++ bfq_remove_request(next); ++ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); ++} ++ ++/* Must be called with bfqq != NULL */ ++static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) ++{ ++ BUG_ON(!bfqq); ++ ++ if (bfq_bfqq_busy(bfqq)) { ++ bfqq->bfqd->wr_busy_queues--; ++ BUG_ON(bfqq->bfqd->wr_busy_queues < 0); ++ } ++ bfqq->wr_coeff = 1; ++ bfqq->wr_cur_max_time = 0; ++ bfqq->last_wr_start_finish = jiffies; ++ /* ++ * Trigger a weight change on the next invocation of ++ * __bfq_entity_update_weight_prio. 
++ */ ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "end_wr: wrais ending at %lu, rais_max_time %u", ++ bfqq->last_wr_start_finish, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", ++ bfqq->bfqd->wr_busy_queues); ++} ++ ++static void bfq_end_wr_async_queues(struct bfq_data *bfqd, ++ struct bfq_group *bfqg) ++{ ++ int i, j; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_BE_NR; j++) ++ if (bfqg->async_bfqq[i][j]) ++ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); ++ if (bfqg->async_idle_bfqq) ++ bfq_bfqq_end_wr(bfqg->async_idle_bfqq); ++} ++ ++static void bfq_end_wr(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq; ++ ++ spin_lock_irq(bfqd->queue->queue_lock); ++ ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) ++ bfq_bfqq_end_wr(bfqq); ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) ++ bfq_bfqq_end_wr(bfqq); ++ bfq_end_wr_async(bfqd); ++ ++ spin_unlock_irq(bfqd->queue->queue_lock); ++} ++ ++static sector_t bfq_io_struct_pos(void *io_struct, bool request) ++{ ++ if (request) ++ return blk_rq_pos(io_struct); ++ else ++ return ((struct bio *)io_struct)->bi_iter.bi_sector; ++} ++ ++static int bfq_rq_close_to_sector(void *io_struct, bool request, ++ sector_t sector) ++{ ++ return abs(bfq_io_struct_pos(io_struct, request) - sector) <= ++ BFQQ_CLOSE_THR; ++} ++ ++static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ sector_t sector) ++{ ++ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; ++ struct rb_node *parent, *node; ++ struct bfq_queue *__bfqq; ++ ++ if (RB_EMPTY_ROOT(root)) ++ return NULL; ++ ++ /* ++ * First, if we find a request starting at the end of the last ++ * request, choose it. ++ */ ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); ++ if (__bfqq) ++ return __bfqq; ++ ++ /* ++ * If the exact sector wasn't found, the parent of the NULL leaf ++ * will contain the closest sector (rq_pos_tree sorted by ++ * next_request position). ++ */ ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) ++ return __bfqq; ++ ++ if (blk_rq_pos(__bfqq->next_rq) < sector) ++ node = rb_next(&__bfqq->pos_node); ++ else ++ node = rb_prev(&__bfqq->pos_node); ++ if (!node) ++ return NULL; ++ ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node); ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) ++ return __bfqq; ++ ++ return NULL; ++} ++ ++static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, ++ struct bfq_queue *cur_bfqq, ++ sector_t sector) ++{ ++ struct bfq_queue *bfqq; ++ ++ /* ++ * We shall notice if some of the queues are cooperating, ++ * e.g., working closely on the same area of the device. In ++ * that case, we can group them together and: 1) don't waste ++ * time idling, and 2) serve the union of their requests in ++ * the best possible order for throughput. ++ */ ++ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); ++ if (!bfqq || bfqq == cur_bfqq) ++ return NULL; ++ ++ return bfqq; ++} ++ ++static struct bfq_queue * ++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) ++{ ++ int process_refs, new_process_refs; ++ struct bfq_queue *__bfqq; ++ ++ /* ++ * If there are no process references on the new_bfqq, then it is ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain ++ * may have dropped their last reference (not just their last process ++ * reference). 
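++	 *
++	 * (Editorial note expanding the point above: a bfq_queue can
++	 * stay allocated with ref > 0 merely because requests still
++	 * in flight reference it, even after its last process
++	 * reference is gone; following the ->new_bfqq chain of such a
++	 * queue would extend a merge chain that no process will ever
++	 * use, hence the bfqq_process_refs() check below.)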
++ */ ++ if (!bfqq_process_refs(new_bfqq)) ++ return NULL; ++ ++ /* Avoid a circular list and skip interim queue merges. */ ++ while ((__bfqq = new_bfqq->new_bfqq)) { ++ if (__bfqq == bfqq) ++ return NULL; ++ new_bfqq = __bfqq; ++ } ++ ++ process_refs = bfqq_process_refs(bfqq); ++ new_process_refs = bfqq_process_refs(new_bfqq); ++ /* ++ * If the process for the bfqq has gone away, there is no ++ * sense in merging the queues. ++ */ ++ if (process_refs == 0 || new_process_refs == 0) ++ return NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", ++ new_bfqq->pid); ++ ++ /* ++ * Merging is just a redirection: the requests of the process ++ * owning one of the two queues are redirected to the other queue. ++ * The latter queue, in its turn, is set as shared if this is the ++ * first time that the requests of some process are redirected to ++ * it. ++ * ++ * We redirect bfqq to new_bfqq and not the opposite, because we ++ * are in the context of the process owning bfqq, hence we have ++ * the io_cq of this process. So we can immediately configure this ++ * io_cq to redirect the requests of the process to new_bfqq. ++ * ++ * NOTE, even if new_bfqq coincides with the in-service queue, the ++ * io_cq of new_bfqq is not available, because, if the in-service ++ * queue is shared, bfqd->in_service_bic may not point to the ++ * io_cq of the in-service queue. ++ * Redirecting the requests of the process owning bfqq to the ++ * currently in-service queue is in any case the best option, as ++ * we feed the in-service queue with new requests close to the ++ * last request served and, by doing so, hopefully increase the ++ * throughput. ++ */ ++ bfqq->new_bfqq = new_bfqq; ++ new_bfqq->ref += process_refs; ++ return new_bfqq; ++} ++ ++static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, ++ struct bfq_queue *new_bfqq) ++{ ++ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || ++ (bfqq->ioprio_class != new_bfqq->ioprio_class)) ++ return false; ++ ++ /* ++ * If either of the queues has already been detected as seeky, ++ * then merging it with the other queue is unlikely to lead to ++ * sequential I/O. ++ */ ++ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) ++ return false; ++ ++ /* ++ * Interleaved I/O is known to be done by (some) applications ++ * only for reads, so it does not make sense to merge async ++ * queues. ++ */ ++ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * If this function returns true, then bfqq cannot be merged. The idea ++ * is that true cooperation happens very early after processes start ++ * to do I/O. Usually, late cooperations are just accidental false ++ * positives. In case bfqq is weight-raised, such false positives ++ * would evidently degrade latency guarantees for bfqq. ++ */ ++static bool wr_from_too_long(struct bfq_queue *bfqq) ++{ ++ return bfqq->wr_coeff > 1 && ++ time_is_before_jiffies(bfqq->last_wr_start_finish + ++ msecs_to_jiffies(100)); ++} ++ ++/* ++ * Attempt to schedule a merge of bfqq with the currently in-service ++ * queue or with a close queue among the scheduled queues. Return ++ * NULL if no merge was scheduled, a pointer to the shared bfq_queue ++ * structure otherwise. 
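++ *
++ * (Editorial illustration of the merge chains involved: if
++ * Q1->new_bfqq == Q2 and Q2->new_bfqq == Q3, a new merge scheduled
++ * for Q1 is redirected to the end of the chain, Q3, by the while
++ * loop in bfq_setup_merge(); the same loop returns NULL if the walk
++ * runs back into the starting queue, which would create a cycle.)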
++ *
++ * The OOM queue is not allowed to participate in cooperation: in fact, since
++ * the requests temporarily redirected to the OOM queue could be redirected
++ * again to dedicated queues at any time, the state needed to correctly
++ * handle merging with the OOM queue would be quite complex and expensive
++ * to maintain. Besides, in such a critical condition as an out of memory,
++ * the benefits of queue merging may be of little relevance, or even
++ * negligible.
++ *
++ * Weight-raised queues can be merged only if their weight-raising
++ * period has just started. In fact, cooperating processes are usually
++ * started together. Thus, with this filter we avoid false positives
++ * that would jeopardize low-latency guarantees.
++ *
++ * WARNING: queue merging may impair fairness among non-weight raised
++ * queues, for at least two reasons: 1) the original weight of a
++ * merged queue may change during the merged state, 2) even if the
++ * weight stays the same, a merged queue may be bloated with many more
++ * requests than the ones produced by its originally-associated
++ * process.
++ */
++static struct bfq_queue *
++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
++		     void *io_struct, bool request)
++{
++	struct bfq_queue *in_service_bfqq, *new_bfqq;
++
++	if (bfqq->new_bfqq)
++		return bfqq->new_bfqq;
++
++	if (io_struct && wr_from_too_long(bfqq) &&
++	    likely(bfqq != &bfqd->oom_bfqq))
++		bfq_log_bfqq(bfqd, bfqq,
++			     "would have looked for coop, but bfq%d wr",
++			     bfqq->pid);
++
++	if (!io_struct ||
++	    wr_from_too_long(bfqq) ||
++	    unlikely(bfqq == &bfqd->oom_bfqq))
++		return NULL;
++
++	/* If there is only one backlogged queue, don't search. */
++	if (bfqd->busy_queues == 1)
++		return NULL;
++
++	in_service_bfqq = bfqd->in_service_queue;
++
++	if (in_service_bfqq && in_service_bfqq != bfqq &&
++	    bfqd->in_service_bic && wr_from_too_long(in_service_bfqq)
++	    && likely(in_service_bfqq == &bfqd->oom_bfqq))
++		bfq_log_bfqq(bfqd, bfqq,
++		"would have tried merge with in-service-queue, but wr");
++
++	if (!in_service_bfqq || in_service_bfqq == bfqq ||
++	    !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) ||
++	    unlikely(in_service_bfqq == &bfqd->oom_bfqq))
++		goto check_scheduled;
++
++	if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
++	    bfqq->entity.parent == in_service_bfqq->entity.parent &&
++	    bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
++		new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
++		if (new_bfqq)
++			return new_bfqq;
++	}
++	/*
++	 * Check whether there is a cooperator among currently scheduled
++	 * queues. The only thing we need is that the bio/request is not
++	 * NULL, as we need it to establish whether a cooperator exists.
++ */ ++check_scheduled: ++ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, ++ bfq_io_struct_pos(io_struct, request)); ++ ++ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); ++ ++ if (new_bfqq && wr_from_too_long(new_bfqq) && ++ likely(new_bfqq != &bfqd->oom_bfqq) && ++ bfq_may_be_close_cooperator(bfqq, new_bfqq)) ++ bfq_log_bfqq(bfqd, bfqq, ++ "would have merged with bfq%d, but wr", ++ new_bfqq->pid); ++ ++ if (new_bfqq && !wr_from_too_long(new_bfqq) && ++ likely(new_bfqq != &bfqd->oom_bfqq) && ++ bfq_may_be_close_cooperator(bfqq, new_bfqq)) ++ return bfq_setup_merge(bfqq, new_bfqq); ++ ++ return NULL; ++} ++ ++static void bfq_bfqq_save_state(struct bfq_queue *bfqq) ++{ ++ struct bfq_io_cq *bic = bfqq->bic; ++ ++ /* ++ * If !bfqq->bic, the queue is already shared or its requests ++ * have already been redirected to a shared queue; both idle window ++ * and weight raising state have already been saved. Do nothing. ++ */ ++ if (!bic) ++ return; ++ ++ bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); ++ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); ++ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); ++ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); ++ bic->saved_wr_coeff = bfqq->wr_coeff; ++ bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; ++ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; ++ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; ++ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); ++} ++ ++static void bfq_get_bic_reference(struct bfq_queue *bfqq) ++{ ++ /* ++ * If bfqq->bic has a non-NULL value, the bic to which it belongs ++ * is about to begin using a shared bfq_queue. ++ */ ++ if (bfqq->bic) ++ atomic_long_inc(&bfqq->bic->icq.ioc->refcount); ++} ++ ++static void ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, ++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) ++{ ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", ++ (unsigned long) new_bfqq->pid); ++ /* Save weight raising and idle window of the merged queues */ ++ bfq_bfqq_save_state(bfqq); ++ bfq_bfqq_save_state(new_bfqq); ++ if (bfq_bfqq_IO_bound(bfqq)) ++ bfq_mark_bfqq_IO_bound(new_bfqq); ++ bfq_clear_bfqq_IO_bound(bfqq); ++ ++ /* ++ * If bfqq is weight-raised, then let new_bfqq inherit ++ * weight-raising. To reduce false positives, neglect the case ++ * where bfqq has just been created, but has not yet made it ++ * to be weight-raised (which may happen because EQM may merge ++ * bfqq even before bfq_add_request is executed for the first ++ * time for bfqq). Handling this case would however be very ++ * easy, thanks to the flag just_created. 
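++	 *
++	 * (Editorial sketch with assumed values: if bfqq is being
++	 * weight-raised with wr_coeff = 30 and part of its raising
++	 * period still to run, the branch below copies coefficient
++	 * and residual period to new_bfqq, and the branch after it
++	 * resets bfqq to wr_coeff = 1: the weight-raising privilege
++	 * is moved to the shared queue, not duplicated.)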
++ */ ++ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { ++ new_bfqq->wr_coeff = bfqq->wr_coeff; ++ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; ++ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; ++ new_bfqq->wr_start_at_switch_to_srt = ++ bfqq->wr_start_at_switch_to_srt; ++ if (bfq_bfqq_busy(new_bfqq)) { ++ bfqd->wr_busy_queues++; ++ BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); ++ } ++ ++ new_bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqd, new_bfqq, ++ "wr start after merge with %d, rais_max_time %u", ++ bfqq->pid, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ ++ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ ++ bfqq->wr_coeff = 1; ++ bfqq->entity.prio_changed = 1; ++ if (bfq_bfqq_busy(bfqq)) { ++ bfqd->wr_busy_queues--; ++ BUG_ON(bfqd->wr_busy_queues < 0); ++ } ++ ++ } ++ ++ bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", ++ bfqd->wr_busy_queues); ++ ++ /* ++ * Grab a reference to the bic, to prevent it from being destroyed ++ * before being possibly touched by a bfq_split_bfqq(). ++ */ ++ bfq_get_bic_reference(bfqq); ++ bfq_get_bic_reference(new_bfqq); ++ /* ++ * Merge queues (that is, let bic redirect its requests to new_bfqq) ++ */ ++ bic_set_bfqq(bic, new_bfqq, 1); ++ bfq_mark_bfqq_coop(new_bfqq); ++ /* ++ * new_bfqq now belongs to at least two bics (it is a shared queue): ++ * set new_bfqq->bic to NULL. bfqq either: ++ * - does not belong to any bic any more, and hence bfqq->bic must ++ * be set to NULL, or ++ * - is a queue whose owning bics have already been redirected to a ++ * different queue, hence the queue is destined to not belong to ++ * any bic soon and bfqq->bic is already NULL (therefore the next ++ * assignment causes no harm). ++ */ ++ new_bfqq->bic = NULL; ++ bfqq->bic = NULL; ++ /* release process reference to bfqq */ ++ bfq_put_queue(bfqq); ++} ++ ++static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, ++ struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ bool is_sync = op_is_sync(bio->bi_opf); ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq, *new_bfqq; ++ ++ /* ++ * Disallow merge of a sync bio into an async request. ++ */ ++ if (is_sync && !rq_is_sync(rq)) ++ return false; ++ ++ /* ++ * Lookup the bfqq that this bio will be queued with. Allow ++ * merge only if rq is queued there. ++ * Queue lock is held here. ++ */ ++ bic = bfq_bic_lookup(bfqd, current->io_context); ++ if (!bic) ++ return false; ++ ++ bfqq = bic_to_bfqq(bic, is_sync); ++ /* ++ * We take advantage of this function to perform an early merge ++ * of the queues of possible cooperating processes. ++ */ ++ if (bfqq) { ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); ++ if (new_bfqq) { ++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); ++ /* ++ * If we get here, the bio will be queued in the ++ * shared queue, i.e., new_bfqq, so use new_bfqq ++ * to decide whether bio and rq can be merged. ++ */ ++ bfqq = new_bfqq; ++ } ++ } ++ ++ return bfqq == RQ_BFQQ(rq); ++} ++ ++static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, ++ struct request *next) ++{ ++ return RQ_BFQQ(rq) == RQ_BFQQ(next); ++} ++ ++/* ++ * Set the maximum time for the in-service queue to consume its ++ * budget. This prevents seeky processes from lowering the throughput. ++ * In practice, a time-slice service scheme is used with seeky ++ * processes. 
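++ *
++ * (Editorial numeric sketch: for a non-soft-rt queue whose weight
++ * has been raised from orig_weight = 100 to weight = 300,
++ * timeout_coeff below evaluates to 300 / 100 = 3, so the queue may
++ * keep the device for up to three times bfqd->bfq_timeout before
++ * this limit expires it; the values are assumed, not defaults.)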
++ */ ++static void bfq_set_budget_timeout(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ unsigned int timeout_coeff; ++ ++ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) ++ timeout_coeff = 1; ++ else ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; ++ ++ bfqd->last_budget_start = ktime_get(); ++ ++ bfqq->budget_timeout = jiffies + ++ bfqd->bfq_timeout * timeout_coeff; ++ ++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", ++ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); ++} ++ ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ if (bfqq) { ++ bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); ++ bfq_mark_bfqq_must_alloc(bfqq); ++ bfq_clear_bfqq_fifo_expire(bfqq); ++ ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; ++ ++ BUG_ON(bfqq == bfqd->in_service_queue); ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ if (time_is_before_jiffies(bfqq->last_wr_start_finish) && ++ bfqq->wr_coeff > 1 && ++ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && ++ time_is_before_jiffies(bfqq->budget_timeout)) { ++ /* ++ * For soft real-time queues, move the start ++ * of the weight-raising period forward by the ++ * time the queue has not received any ++ * service. Otherwise, a relatively long ++ * service delay is likely to cause the ++ * weight-raising period of the queue to end, ++ * because of the short duration of the ++ * weight-raising period of a soft real-time ++ * queue. It is worth noting that this move ++ * is not so dangerous for the other queues, ++ * because soft real-time queues are not ++ * greedy. ++ * ++ * To not add a further variable, we use the ++ * overloaded field budget_timeout to ++ * determine for how long the queue has not ++ * received service, i.e., how much time has ++ * elapsed since the queue expired. However, ++ * this is a little imprecise, because ++ * budget_timeout is set to jiffies if bfqq ++ * not only expires, but also remains with no ++ * request. ++ */ ++ if (time_after(bfqq->budget_timeout, ++ bfqq->last_wr_start_finish)) ++ bfqq->last_wr_start_finish += ++ jiffies - bfqq->budget_timeout; ++ else ++ bfqq->last_wr_start_finish = jiffies; ++ ++ if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { ++ pr_crit( ++ "BFQ WARNING:last %lu budget %lu jiffies %lu", ++ bfqq->last_wr_start_finish, ++ bfqq->budget_timeout, ++ jiffies); ++ pr_crit("diff %lu", jiffies - ++ max_t(unsigned long, ++ bfqq->last_wr_start_finish, ++ bfqq->budget_timeout)); ++ bfqq->last_wr_start_finish = jiffies; ++ } ++ } ++ ++ bfq_set_budget_timeout(bfqd, bfqq); ++ bfq_log_bfqq(bfqd, bfqq, ++ "set_in_service_queue, cur-budget = %d", ++ bfqq->entity.budget); ++ } else ++ bfq_log(bfqd, "set_in_service_queue: NULL"); ++ ++ bfqd->in_service_queue = bfqq; ++} ++ ++/* ++ * Get and set a new queue for service. ++ */ ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); ++ ++ __bfq_set_in_service_queue(bfqd, bfqq); ++ return bfqq; ++} ++ ++static void bfq_arm_slice_timer(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfqd->in_service_queue; ++ struct bfq_io_cq *bic; ++ u32 sl; ++ ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ /* Processes have exited, don't wait. 
++ */
++	bic = bfqd->in_service_bic;
++	if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
++		return;
++
++	bfq_mark_bfqq_wait_request(bfqq);
++
++	/*
++	 * We don't want to idle for seeks, but we do want to allow
++	 * fair distribution of slice time for a process doing back-to-back
++	 * seeks. So allow a little bit of time for it to submit a new rq.
++	 *
++	 * To prevent processes with (partly) seeky workloads from
++	 * being too ill-treated, grant them a small fraction of the
++	 * assigned budget before reducing the waiting time to
++	 * BFQ_MIN_TT. This happened to help reduce latency.
++	 */
++	sl = bfqd->bfq_slice_idle;
++	/*
++	 * Unless the queue is being weight-raised or the scenario is
++	 * asymmetric, grant only minimum idle time if the queue
++	 * is seeky. A long idling is preserved for a weight-raised
++	 * queue, or, more in general, in an asymmetric scenario,
++	 * because a long idling is needed for guaranteeing to a queue
++	 * its reserved share of the throughput (in particular, it is
++	 * needed if the queue has a higher weight than some other
++	 * queue).
++	 */
++	if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
++	    bfq_symmetric_scenario(bfqd))
++		sl = min_t(u32, sl, BFQ_MIN_TT);
++
++	bfqd->last_idling_start = ktime_get();
++	hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
++		      HRTIMER_MODE_REL);
++	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
++	bfq_log(bfqd, "arm idle: %ld/%ld ms",
++		sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC);
++}
++
++/*
++ * In autotuning mode, max_budget is dynamically recomputed as the
++ * amount of sectors transferred in timeout at the estimated peak
++ * rate. This enables BFQ to utilize a full timeslice with a full
++ * budget, even if the in-service queue is served at peak rate. And
++ * this maximises throughput with sequential workloads.
++ */
++static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
++{
++	return (u64)bfqd->peak_rate * USEC_PER_MSEC *
++		jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
++}
++
++/*
++ * Update parameters related to throughput and responsiveness, as a
++ * function of the estimated peak rate. See comments on
++ * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
++ */
++static void update_thr_responsiveness_params(struct bfq_data *bfqd)
++{
++	int dev_type = blk_queue_nonrot(bfqd->queue);
++
++	if (bfqd->bfq_user_max_budget == 0) {
++		bfqd->bfq_max_budget =
++			bfq_calc_max_budget(bfqd);
++		BUG_ON(bfqd->bfq_max_budget < 0);
++		bfq_log(bfqd, "new max_budget = %d",
++			bfqd->bfq_max_budget);
++	}
++
++	if (bfqd->device_speed == BFQ_BFQD_FAST &&
++	    bfqd->peak_rate < device_speed_thresh[dev_type]) {
++		bfqd->device_speed = BFQ_BFQD_SLOW;
++		bfqd->RT_prod = R_slow[dev_type] *
++			T_slow[dev_type];
++	} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
++		   bfqd->peak_rate > device_speed_thresh[dev_type]) {
++		bfqd->device_speed = BFQ_BFQD_FAST;
++		bfqd->RT_prod = R_fast[dev_type] *
++			T_fast[dev_type];
++	}
++
++	bfq_log(bfqd,
++"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu sects/sec",
++		dev_type == 0 ? "ROT" : "NONROT",
++		bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
++		bfqd->device_speed == BFQ_BFQD_FAST ?
++		(USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
++		(USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
++		(USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
++		BFQ_RATE_SHIFT);
++}
++
++static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq)
++{
++	if (rq != NULL) { /* new rq dispatch now, reset accordingly */
++		bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
++		bfqd->peak_rate_samples = 1;
++		bfqd->sequential_samples = 0;
++		bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
++			blk_rq_sectors(rq);
++	} else /* no new rq dispatched, just reset the number of samples */
++		bfqd->peak_rate_samples = 0; /* full re-init on next disp. */
++
++	bfq_log(bfqd,
++		"reset_rate_computation at end, sample %u/%u tot_sects %llu",
++		bfqd->peak_rate_samples, bfqd->sequential_samples,
++		bfqd->tot_sectors_dispatched);
++}
++
++static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
++{
++	u32 rate, weight, divisor;
++
++	/*
++	 * For the convergence property to hold (see comments on
++	 * bfq_update_peak_rate()) and for the assessment to be
++	 * reliable, a minimum number of samples must be present, and
++	 * a minimum amount of time must have elapsed. If not so, do
++	 * not compute new rate. Just reset parameters, to get ready
++	 * for a new evaluation attempt.
++	 */
++	if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
++	    bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) {
++		bfq_log(bfqd,
++	"update_rate_reset: only resetting, delta_first %lluus samples %d",
++			bfqd->delta_from_first>>10, bfqd->peak_rate_samples);
++		goto reset_computation;
++	}
++
++	/*
++	 * If a new request completion has occurred after last
++	 * dispatch, then, to approximate the rate at which requests
++	 * have been served by the device, it is more precise to
++	 * extend the observation interval to the last completion.
++	 */
++	bfqd->delta_from_first =
++		max_t(u64, bfqd->delta_from_first,
++		      bfqd->last_completion - bfqd->first_dispatch);
++
++	BUG_ON(bfqd->delta_from_first == 0);
++	/*
++	 * Rate computed in sects/usec, and not sects/nsec, for
++	 * precision issues.
++	 */
++	rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
++			div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
++
++	bfq_log(bfqd,
++"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)",
++		bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10,
++		((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
++		rate > 20<<BFQ_RATE_SHIFT);
++
++	/*
++	 * Peak rate not updated if:
++	 * - the percentage of sequential dispatches is below 3/4 of the
++	 *   total, and rate is below the current estimated peak rate
++	 * - rate is unreasonably high (> 20M sectors/sec)
++	 */
++	if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
++	     rate <= bfqd->peak_rate) ||
++	    rate > 20<<BFQ_RATE_SHIFT) {
++		bfq_log(bfqd,
++	"update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu",
++			bfqd->peak_rate_samples, bfqd->sequential_samples,
++			((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
++			((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
++		goto reset_computation;
++	} else {
++		bfq_log(bfqd,
++	"update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu",
++			bfqd->peak_rate_samples, bfqd->sequential_samples,
++			((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
++			((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
++	}
++
++	/*
++	 * We have to update the peak rate, at last! To this purpose,
++	 * we use a low-pass filter. We compute the smoothing constant
++	 * of the filter as a function of the 'weight' of the new
++	 * measured rate.
++	 *
++	 * As can be seen in next formulas, we define this weight as a
++	 * quantity proportional to how sequential the workload is,
++	 * and to how long the observation time interval is.
++	 *
++	 * The weight runs from 0 to 8.
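++	 * (Editorial worked example with assumed sample counts: with
++	 * sequential_samples = 6 out of peak_rate_samples = 8, the
++	 * first step below yields weight = (9 * 6) / 8 = 6; an
++	 * observation interval of exactly BFQ_RATE_REF_INTERVAL
++	 * leaves it at 6, so divisor = 10 - 6 = 4 and the new rate
++	 * sample contributes for 1/4 of the updated peak rate.)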
++	 * The maximum value of the
++	 * weight, 8, yields the minimum value for the smoothing
++	 * constant. At this minimum value for the smoothing constant,
++	 * the measured rate contributes for half of the next value of
++	 * the estimated peak rate.
++	 *
++	 * So, the first step is to compute the weight as a function
++	 * of how sequential the workload is. Note that the weight
++	 * cannot reach 9, because bfqd->sequential_samples cannot
++	 * become equal to bfqd->peak_rate_samples, which, in its
++	 * turn, holds true because bfqd->sequential_samples is not
++	 * incremented for the first sample.
++	 */
++	weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
++
++	/*
++	 * Second step: further refine the weight as a function of the
++	 * duration of the observation interval.
++	 */
++	weight = min_t(u32, 8,
++		       div_u64(weight * bfqd->delta_from_first,
++			       BFQ_RATE_REF_INTERVAL));
++
++	/*
++	 * Divisor ranging from 10, for minimum weight, to 2, for
++	 * maximum weight.
++	 */
++	divisor = 10 - weight;
++	BUG_ON(divisor == 0);
++
++	/*
++	 * Finally, update peak rate:
++	 *
++	 * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor
++	 */
++	bfqd->peak_rate *= divisor-1;
++	bfqd->peak_rate /= divisor;
++	rate /= divisor; /* smoothing constant alpha = 1/divisor */
++
++	bfq_log(bfqd,
++		"update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u",
++		divisor,
++		((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT),
++		(u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT));
++
++	BUG_ON(bfqd->peak_rate == 0);
++	BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
++
++	bfqd->peak_rate += rate;
++	update_thr_responsiveness_params(bfqd);
++	BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
++
++reset_computation:
++	bfq_reset_rate_computation(bfqd, rq);
++}
++
++/*
++ * Update the read/write peak rate (the main quantity used for
++ * auto-tuning, see update_thr_responsiveness_params()).
++ *
++ * It is not trivial to estimate the peak rate (correctly): because of
++ * the presence of sw and hw queues between the scheduler and the
++ * device components that finally serve I/O requests, it is hard to
++ * say exactly when a given dispatched request is served inside the
++ * device, and for how long. As a consequence, it is hard to know
++ * precisely at what rate a given set of requests is actually served
++ * by the device.
++ *
++ * The dispatch time of any request is instead trivially available,
++ * and, from this piece of information, the "dispatch rate" of
++ * requests can be immediately computed. So, the idea in the next
++ * function is to use what is known, namely request dispatch times
++ * (plus, when useful, request completion times), to estimate what is
++ * unknown, namely the in-device request service rate.
++ *
++ * The main issue is that the rate at which a set of requests is
++ * dispatched over a time interval can differ greatly from the rate
++ * at which the same requests are then served. Yet, since the size of
++ * any intermediate queue is limited, and the service scheme is
++ * lossless (no request is silently dropped), the following obvious
++ * convergence property holds: the number of requests dispatched MUST
++ * become closer and closer to the number of requests completed as
++ * the observation interval grows. This is the key property used in
++ * the next function to estimate the peak service rate as a function
++ * of the observed dispatch rate. The function assumes to be invoked
++ * on every request dispatch.
++ */
++static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
++{
++	u64 now_ns = ktime_get_ns();
++
++	if (bfqd->peak_rate_samples == 0) { /* first dispatch */
++		bfq_log(bfqd,
++			"update_peak_rate: goto reset, samples %d",
++			bfqd->peak_rate_samples);
++		bfq_reset_rate_computation(bfqd, rq);
++		goto update_last_values; /* will add one sample */
++	}
++
++	/*
++	 * Device idle for very long: the observation interval lasting
++	 * up to this dispatch cannot be a valid observation interval
++	 * for computing a new peak rate (similarly to the late-
++	 * completion event in bfq_completed_request()).
++	 * Go to
++	 * update_rate_and_reset to have the following three steps
++	 * taken:
++	 * - close the observation interval at the last (previous)
++	 *   request dispatch or completion
++	 * - compute rate, if possible, for that observation interval
++	 * - start a new observation interval with this dispatch
++	 */
++	if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
++	    bfqd->rq_in_driver == 0) {
++		bfq_log(bfqd,
++"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d",
++			(now_ns - bfqd->last_dispatch)>>10,
++			bfqd->peak_rate_samples);
++		goto update_rate_and_reset;
++	}
++
++	/* Update sampling information */
++	bfqd->peak_rate_samples++;
++
++	if ((bfqd->rq_in_driver > 0 ||
++	     now_ns - bfqd->last_completion < BFQ_MIN_TT)
++	    && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
++		bfqd->sequential_samples++;
++
++	bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
++
++	/* Reset max observed rq size every 32 dispatches */
++	if (likely(bfqd->peak_rate_samples % 32))
++		bfqd->last_rq_max_size =
++			max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
++	else
++		bfqd->last_rq_max_size = blk_rq_sectors(rq);
++
++	bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
++
++	bfq_log(bfqd,
++	"update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus",
++		bfqd->peak_rate_samples, bfqd->sequential_samples,
++		bfqd->tot_sectors_dispatched,
++		bfqd->delta_from_first>>10);
++
++	/* Target observation interval not yet reached, go on sampling */
++	if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
++		goto update_last_values;
++
++update_rate_and_reset:
++	bfq_update_rate_reset(bfqd, rq);
++update_last_values:
++	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
++	bfqd->last_dispatch = now_ns;
++
++	bfq_log(bfqd,
++	"update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu",
++		(now_ns - bfqd->first_dispatch)>>10,
++		(unsigned long long) bfqd->last_position,
++		((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
++	bfq_log(bfqd,
++		"update_peak_rate: samples at end %d", bfqd->peak_rate_samples);
++}
++
++/*
++ * Move request from internal lists to the dispatch list of the request queue
++ */
++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
++{
++	struct bfq_queue *bfqq = RQ_BFQQ(rq);
++
++	/*
++	 * For consistency, the next instruction should have been executed
++	 * after removing the request from the queue and dispatching it.
++	 * We execute instead this instruction before bfq_remove_request()
++	 * (and hence introduce a temporary inconsistency), for efficiency.
++	 * In fact, in a forced_dispatch, this prevents the two counters
++	 * related to bfqq->dispatched from being uselessly decremented if
++	 * bfqq is not in service, and then incremented again after
++	 * incrementing bfqq->dispatched.
++	 */
++	bfqq->dispatched++;
++	bfq_update_peak_rate(q->elevator->elevator_data, rq);
++
++	bfq_remove_request(rq);
++	elv_dispatch_sort(q, rq);
++}
++
++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
++{
++	BUG_ON(bfqq != bfqd->in_service_queue);
++
++	/*
++	 * If this bfqq is shared between multiple processes, check
++	 * to make sure that those processes are still issuing I/Os
++	 * within the mean seek distance. If not, it may be time to
++	 * break the queues apart again.
++	 */
++	if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
++		bfq_mark_bfqq_split_coop(bfqq);
++
++	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
++		if (bfqq->dispatched == 0)
++			/*
++			 * Overloading budget_timeout field to store
++			 * the time at which the queue remains with no
++			 * backlog and no outstanding request; used by
++			 * the weight-raising mechanism.
++			 */
++			bfqq->budget_timeout = jiffies;
++
++		bfq_del_bfqq_busy(bfqd, bfqq, true);
++	} else {
++		bfq_requeue_bfqq(bfqd, bfqq);
++		/*
++		 * Resort priority tree of potential close cooperators.
++		 */
++		bfq_pos_tree_add_move(bfqd, bfqq);
++	}
++
++	/*
++	 * All in-service entities must have been properly deactivated
++	 * or requeued before executing the next function, which
++	 * resets all in-service entities as no more in service.
++	 */
++	__bfq_bfqd_reset_in_service(bfqd);
++}
++
++/**
++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
++ * @bfqd: device data.
++ * @bfqq: queue to update.
++ * @reason: reason for expiration.
++ *
++ * Handle the feedback on @bfqq budget at queue expiration.
++ * See the body for detailed comments.
++ */
++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
++				     struct bfq_queue *bfqq,
++				     enum bfqq_expiration reason)
++{
++	struct request *next_rq;
++	int budget, min_budget;
++
++	BUG_ON(bfqq != bfqd->in_service_queue);
++
++	min_budget = bfq_min_budget(bfqd);
++
++	if (bfqq->wr_coeff == 1)
++		budget = bfqq->max_budget;
++	else /*
++	      * Use a constant, low budget for weight-raised queues,
++	      * to help achieve a low latency. Keep it slightly higher
++	      * than the minimum possible budget, to cause a little
++	      * bit fewer expirations.
++	      */
++		budget = 2 * min_budget;
++
++	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
++		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
++	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
++		budget, bfq_min_budget(bfqd));
++	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
++		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
++
++	if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {
++		switch (reason) {
++		/*
++		 * Caveat: in all the following cases we trade latency
++		 * for throughput.
++		 */
++		case BFQ_BFQQ_TOO_IDLE:
++			/*
++			 * This is the only case where we may reduce
++			 * the budget: if there is no request of the
++			 * process still waiting for completion, then
++			 * we assume (tentatively) that the timer has
++			 * expired because the batch of requests of
++			 * the process could have been served with a
++			 * smaller budget. Hence, betting that the
++			 * process will behave in the same way when it
++			 * becomes backlogged again, we reduce its
++			 * next budget. As long as we guess right,
++			 * this budget cut reduces the latency
++			 * experienced by the process.
++			 *
++			 * However, if there are still outstanding
++			 * requests, then the process may have not yet
++			 * issued its next request just because it is
++			 * still waiting for the completion of some of
++			 * the still outstanding ones. So in this
++			 * subcase we do not reduce its budget, on the
++			 * contrary we increase it to possibly boost
++			 * the throughput, as discussed in the
++			 * comments to the BUDGET_TIMEOUT case.
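++			 *
++			 * (Editorial numeric sketch with assumed
++			 * values: with budget = 8192 and min_budget =
++			 * 512, an expiration with no outstanding
++			 * requests takes the "budget > 5 * min_budget"
++			 * branch below and shrinks the next budget to
++			 * 8192 - 4 * 512 = 6144, while with requests
++			 * still outstanding the budget is doubled, up
++			 * to bfq_max_budget.)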
++			 */
++			if (bfqq->dispatched > 0) /* still outstanding reqs */
++				budget = min(budget * 2, bfqd->bfq_max_budget);
++			else {
++				if (budget > 5 * min_budget)
++					budget -= 4 * min_budget;
++				else
++					budget = min_budget;
++			}
++			break;
++		case BFQ_BFQQ_BUDGET_TIMEOUT:
++			/*
++			 * We double the budget here because it gives
++			 * the chance to boost the throughput if this
++			 * is not a seeky process (and has bumped into
++			 * this timeout because of, e.g., ZBR).
++			 */
++			budget = min(budget * 2, bfqd->bfq_max_budget);
++			break;
++		case BFQ_BFQQ_BUDGET_EXHAUSTED:
++			/*
++			 * The process still has backlog, and did not
++			 * let either the budget timeout or the disk
++			 * idling timeout expire. Hence it is not
++			 * seeky, has a short thinktime and may be
++			 * happy with a higher budget too. So
++			 * definitely increase the budget of this good
++			 * candidate to boost the disk throughput.
++			 */
++			budget = min(budget * 4, bfqd->bfq_max_budget);
++			break;
++		case BFQ_BFQQ_NO_MORE_REQUESTS:
++			/*
++			 * For queues that expire for this reason, it
++			 * is particularly important to keep the
++			 * budget close to the actual service they
++			 * need. Doing so reduces the timestamp
++			 * misalignment problem described in the
++			 * comments in the body of
++			 * __bfq_activate_entity. In fact, suppose
++			 * that a queue systematically expires for
++			 * BFQ_BFQQ_NO_MORE_REQUESTS and presents a
++			 * new request in time to enjoy timestamp
++			 * back-shifting. The larger the budget of the
++			 * queue is with respect to the service the
++			 * queue actually requests in each service
++			 * slot, the more times the queue can be
++			 * reactivated with the same virtual finish
++			 * time. It follows that, even if this finish
++			 * time is pushed to the system virtual time
++			 * to reduce the consequent timestamp
++			 * misalignment, the queue unjustly enjoys for
++			 * many re-activations a lower finish time
++			 * than all newly activated queues.
++			 *
++			 * The service needed by bfqq is measured
++			 * quite precisely by bfqq->entity.service.
++			 * Since bfqq does not enjoy device idling,
++			 * bfqq->entity.service is equal to the number
++			 * of sectors that the process associated with
++			 * bfqq requested to read/write before waiting
++			 * for request completions, or blocking for
++			 * other reasons.
++			 */
++			budget = max_t(int, bfqq->entity.service, min_budget);
++			break;
++		default:
++			return;
++		}
++	} else if (!bfq_bfqq_sync(bfqq))
++		/*
++		 * Async queues always get the maximum possible
++		 * budget, as for them we do not care about latency
++		 * (in addition, their ability to dispatch is limited
++		 * by the charging factor).
++		 */
++		budget = bfqd->bfq_max_budget;
++
++	bfqq->max_budget = budget;
++
++	if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
++	    !bfqd->bfq_user_max_budget)
++		bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
++
++	/*
++	 * If there is still backlog, then assign a new budget, making
++	 * sure that it is large enough for the next request. Since
++	 * the finish time of bfqq must be kept in sync with the
++	 * budget, be sure to call __bfq_bfqq_expire() *after* this
++	 * update.
++	 *
++	 * If there is no backlog, then no need to update the budget;
++	 * it will be updated on the arrival of a new request.
++ */ ++ next_rq = bfqq->next_rq; ++ if (next_rq) { ++ BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || ++ reason == BFQ_BFQQ_NO_MORE_REQUESTS); ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)); ++ BUG_ON(!bfq_bfqq_busy(bfqq)); ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", ++ next_rq ? blk_rq_sectors(next_rq) : 0, ++ bfqq->entity.budget); ++} ++ ++/* ++ * Return true if the process associated with bfqq is "slow". The slow ++ * flag is used, in addition to the budget timeout, to reduce the ++ * amount of service provided to seeky processes, and thus reduce ++ * their chances to lower the throughput. More details in the comments ++ * on the function bfq_bfqq_expire(). ++ * ++ * An important observation is in order: as discussed in the comments ++ * on the function bfq_update_peak_rate(), with devices with internal ++ * queues, it is hard if ever possible to know when and for how long ++ * an I/O request is processed by the device (apart from the trivial ++ * I/O pattern where a new request is dispatched only after the ++ * previous one has been completed). This makes it hard to evaluate ++ * the real rate at which the I/O requests of each bfq_queue are ++ * served. In fact, for an I/O scheduler like BFQ, serving a ++ * bfq_queue means just dispatching its requests during its service ++ * slot (i.e., until the budget of the queue is exhausted, or the ++ * queue remains idle, or, finally, a timeout fires). But, during the ++ * service slot of a bfq_queue, around 100 ms at most, the device may ++ * be even still processing requests of bfq_queues served in previous ++ * service slots. On the opposite end, the requests of the in-service ++ * bfq_queue may be completed after the service slot of the queue ++ * finishes. ++ * ++ * Anyway, unless more sophisticated solutions are used ++ * (where possible), the sum of the sizes of the requests dispatched ++ * during the service slot of a bfq_queue is probably the only ++ * approximation available for the service received by the bfq_queue ++ * during its service slot. And this sum is the quantity used in this ++ * function to evaluate the I/O speed of a process. ++ */ ++static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ bool compensate, enum bfqq_expiration reason, ++ unsigned long *delta_ms) ++{ ++ ktime_t delta_ktime; ++ u32 delta_usecs; ++ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ ++ ++ if (!bfq_bfqq_sync(bfqq)) ++ return false; ++ ++ if (compensate) ++ delta_ktime = bfqd->last_idling_start; ++ else ++ delta_ktime = ktime_get(); ++ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); ++ delta_usecs = ktime_to_us(delta_ktime); ++ ++ /* don't use too short time intervals */ ++ if (delta_usecs < 1000) { ++ if (blk_queue_nonrot(bfqd->queue)) ++ /* ++ * give same worst-case guarantees as idling ++ * for seeky ++ */ ++ *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; ++ else /* charge at least one seek */ ++ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; ++ ++ bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs); ++ ++ return slow; ++ } ++ ++ *delta_ms = delta_usecs / USEC_PER_MSEC; ++ ++ /* ++ * Use only long (> 20ms) intervals to filter out excessive ++ * spikes in service rate estimation. ++ */ ++ if (delta_usecs > 20000) { ++ /* ++ * Caveat for rotational devices: processes doing I/O ++ * in the slower disk zones tend to be slow(er) even ++ * if not seeky. 
In this respect, the estimated peak ++ * rate is likely to be an average over the disk ++ * surface. Accordingly, to not be too harsh with ++ * unlucky processes, a process is deemed slow only if ++ * its rate has been lower than half of the estimated ++ * peak rate. ++ */ ++ slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; ++ bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", ++ bfqq->entity.service, bfqd->bfq_max_budget); ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); ++ ++ return slow; ++} ++ ++/* ++ * To be deemed as soft real-time, an application must meet two ++ * requirements. First, the application must not require an average ++ * bandwidth higher than the approximate bandwidth required to playback or ++ * record a compressed high-definition video. ++ * The next function is invoked on the completion of the last request of a ++ * batch, to compute the next-start time instant, soft_rt_next_start, such ++ * that, if the next request of the application does not arrive before ++ * soft_rt_next_start, then the above requirement on the bandwidth is met. ++ * ++ * The second requirement is that the request pattern of the application is ++ * isochronous, i.e., that, after issuing a request or a batch of requests, ++ * the application stops issuing new requests until all its pending requests ++ * have been completed. After that, the application may issue a new batch, ++ * and so on. ++ * For this reason the next function is invoked to compute ++ * soft_rt_next_start only for applications that meet this requirement, ++ * whereas soft_rt_next_start is set to infinity for applications that do ++ * not. ++ * ++ * Unfortunately, even a greedy application may happen to behave in an ++ * isochronous way if the CPU load is high. In fact, the application may ++ * stop issuing requests while the CPUs are busy serving other processes, ++ * then restart, then stop again for a while, and so on. In addition, if ++ * the disk achieves a low enough throughput with the request pattern ++ * issued by the application (e.g., because the request pattern is random ++ * and/or the device is slow), then the application may meet the above ++ * bandwidth requirement too. To prevent such a greedy application to be ++ * deemed as soft real-time, a further rule is used in the computation of ++ * soft_rt_next_start: soft_rt_next_start must be higher than the current ++ * time plus the maximum time for which the arrival of a request is waited ++ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. ++ * This filters out greedy applications, as the latter issue instead their ++ * next request as soon as possible after the last one has been completed ++ * (in contrast, when a batch of requests is completed, a soft real-time ++ * application spends some time processing data). ++ * ++ * Unfortunately, the last filter may easily generate false positives if ++ * only bfqd->bfq_slice_idle is used as a reference time interval and one ++ * or both the following cases occur: ++ * 1) HZ is so low that the duration of a jiffy is comparable to or higher ++ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with ++ * HZ=100. ++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing ++ * for a while, then suddenly 'jump' by several units to recover the lost ++ * increments. This seems to happen, e.g., inside virtual machines. 
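++ *
++ * [Editor's note -- worked example, not part of the original
++ * patch: assuming the usual 8 ms default for bfq_slice_idle and
++ * HZ=100, the guard computed below is roughly
++ * jiffies + nsecs_to_jiffies(8 ms) + 4 = jiffies + 0 + 4 ticks,
++ * i.e. about 40 ms, comfortably longer than one 10 ms jiffy; so
++ * neither a single coarse tick (case 1) nor a small jiffies jump
++ * (case 2) can, by itself, let a greedy reader pass the filter.]
++ *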
++ * To address this issue, we do not use as a reference time interval just ++ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In ++ * particular we add the minimum number of jiffies for which the filter ++ * seems to be quite precise also in embedded systems and KVM/QEMU virtual ++ * machines. ++ */ ++static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqd, bfqq, ++"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", ++ bfqq->service_from_backlogged, ++ bfqd->bfq_wr_max_softrt_rate, ++ jiffies_to_msecs(HZ * bfqq->service_from_backlogged / ++ bfqd->bfq_wr_max_softrt_rate)); ++ ++ return max(bfqq->last_idle_bklogged + ++ HZ * bfqq->service_from_backlogged / ++ bfqd->bfq_wr_max_softrt_rate, ++ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); ++} ++ ++/* ++ * Return the farthest future time instant according to jiffies ++ * macros. ++ */ ++static unsigned long bfq_greatest_from_now(void) ++{ ++ return jiffies + MAX_JIFFY_OFFSET; ++} ++ ++/* ++ * Return the farthest past time instant according to jiffies ++ * macros. ++ */ ++static unsigned long bfq_smallest_from_now(void) ++{ ++ return jiffies - MAX_JIFFY_OFFSET; ++} ++ ++/** ++ * bfq_bfqq_expire - expire a queue. ++ * @bfqd: device owning the queue. ++ * @bfqq: the queue to expire. ++ * @compensate: if true, compensate for the time spent idling. ++ * @reason: the reason causing the expiration. ++ * ++ * If the process associated with bfqq does slow I/O (e.g., because it ++ * issues random requests), we charge bfqq with the time it has been ++ * in service instead of the service it has received (see ++ * bfq_bfqq_charge_time for details on how this goal is achieved). As ++ * a consequence, bfqq will typically get higher timestamps upon ++ * reactivation, and hence it will be rescheduled as if it had ++ * received more service than what it has actually received. In the ++ * end, bfqq receives less service in proportion to how slowly its ++ * associated process consumes its budgets (and hence how seriously it ++ * tends to lower the throughput). In addition, this time-charging ++ * strategy guarantees time fairness among slow processes. In ++ * contrast, if the process associated with bfqq is not slow, we ++ * charge bfqq exactly with the service it has received. ++ * ++ * Charging time to the first type of queues and the exact service to ++ * the other has the effect of using the WF2Q+ policy to schedule the ++ * former on a timeslice basis, without violating service domain ++ * guarantees among the latter. ++ */ ++static void bfq_bfqq_expire(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool compensate, ++ enum bfqq_expiration reason) ++{ ++ bool slow; ++ unsigned long delta = 0; ++ struct bfq_entity *entity = &bfqq->entity; ++ int ref; ++ ++ BUG_ON(bfqq != bfqd->in_service_queue); ++ ++ /* ++ * Check whether the process is slow (see bfq_bfqq_is_slow). ++ */ ++ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); ++ ++ /* ++ * Increase service_from_backlogged before next statement, ++ * because the possible next invocation of ++ * bfq_bfqq_charge_time would likely inflate ++ * entity->service. In contrast, service_from_backlogged must ++ * contain real service, to enable the soft real-time ++ * heuristic to correctly compute the bandwidth consumed by ++ * bfqq. 
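++ *
++ * [Editor's note -- worked example, not part of the original
++ * patch: assuming the customary bfq_wr_max_softrt_rate default of
++ * 7000 sectors/sec, a queue that moved 700 sectors since it last
++ * became backlogged qualifies as soft real-time only if its next
++ * batch arrives at least HZ * 700 / 7000 = HZ/10 ticks (100 ms)
++ * after the last one completed; inflating the service recorded
++ * here would stretch that interval and misclassify the queue.]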
++ */ ++ bfqq->service_from_backlogged += entity->service; ++ ++ /* ++ * As above explained, charge slow (typically seeky) and ++ * timed-out queues with the time and not the service ++ * received, to favor sequential workloads. ++ * ++ * Processes doing I/O in the slower disk zones will tend to ++ * be slow(er) even if not seeky. Therefore, since the ++ * estimated peak rate is actually an average over the disk ++ * surface, these processes may timeout just for bad luck. To ++ * avoid punishing them, do not charge time to processes that ++ * succeeded in consuming at least 2/3 of their budget. This ++ * allows BFQ to preserve enough elasticity to still perform ++ * bandwidth, and not time, distribution with little unlucky ++ * or quasi-sequential processes. ++ */ ++ if (bfqq->wr_coeff == 1 && ++ (slow || ++ (reason == BFQ_BFQQ_BUDGET_TIMEOUT && ++ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) ++ bfq_bfqq_charge_time(bfqd, bfqq, delta); ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ if (reason == BFQ_BFQQ_TOO_IDLE && ++ entity->service <= 2 * entity->budget / 10) ++ bfq_clear_bfqq_IO_bound(bfqq); ++ ++ if (bfqd->low_latency && bfqq->wr_coeff == 1) ++ bfqq->last_wr_start_finish = jiffies; ++ ++ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && ++ RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ /* ++ * If we get here, and there are no outstanding ++ * requests, then the request pattern is isochronous ++ * (see the comments on the function ++ * bfq_bfqq_softrt_next_start()). Thus we can compute ++ * soft_rt_next_start. If, instead, the queue still ++ * has outstanding requests, then we have to wait for ++ * the completion of all the outstanding requests to ++ * discover whether the request pattern is actually ++ * isochronous. ++ */ ++ BUG_ON(bfqd->busy_queues < 1); ++ if (bfqq->dispatched == 0) { ++ bfqq->soft_rt_next_start = ++ bfq_bfqq_softrt_next_start(bfqd, bfqq); ++ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", ++ bfqq->soft_rt_next_start); ++ } else { ++ /* ++ * The application is still waiting for the ++ * completion of one or more requests: ++ * prevent it from possibly being incorrectly ++ * deemed as soft real-time by setting its ++ * soft_rt_next_start to infinity. In fact, ++ * without this assignment, the application ++ * would be incorrectly deemed as soft ++ * real-time if: ++ * 1) it issued a new request before the ++ * completion of all its in-flight ++ * requests, and ++ * 2) at that time, its soft_rt_next_start ++ * happened to be in the past. ++ */ ++ bfqq->soft_rt_next_start = ++ bfq_greatest_from_now(); ++ /* ++ * Schedule an update of soft_rt_next_start to when ++ * the task may be discovered to be isochronous. ++ */ ++ bfq_mark_bfqq_softrt_update(bfqq); ++ } ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)", ++ reason, slow, bfqq->dispatched, ++ bfq_bfqq_has_short_ttime(bfqq), entity->weight); ++ ++ /* ++ * Increase, decrease or leave budget unchanged according to ++ * reason. 
++ */ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); ++ BUG_ON(bfqq->next_rq == NULL && ++ bfqq->entity.budget < bfqq->entity.service); ++ ref = bfqq->ref; ++ __bfq_bfqq_expire(bfqd, bfqq); ++ ++ BUG_ON(ref > 1 && ++ !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && ++ !bfq_class_idle(bfqq)); ++ ++ /* mark bfqq as waiting a request only if a bic still points to it */ ++ if (ref > 1 && !bfq_bfqq_busy(bfqq) && ++ reason != BFQ_BFQQ_BUDGET_TIMEOUT && ++ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) ++ bfq_mark_bfqq_non_blocking_wait_rq(bfqq); ++} ++ ++/* ++ * Budget timeout is not implemented through a dedicated timer, but ++ * just checked on request arrivals and completions, as well as on ++ * idle timer expirations. ++ */ ++static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) ++{ ++ return time_is_before_eq_jiffies(bfqq->budget_timeout); ++} ++ ++/* ++ * If we expire a queue that is actively waiting (i.e., with the ++ * device idled) for the arrival of a new request, then we may incur ++ * the timestamp misalignment problem described in the body of the ++ * function __bfq_activate_entity. Hence we return true only if this ++ * condition does not hold, or if the queue is slow enough to deserve ++ * only to be kicked off for preserving a high throughput. ++ */ ++static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "may_budget_timeout: wait_request %d left %d timeout %d", ++ bfq_bfqq_wait_request(bfqq), ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, ++ bfq_bfqq_budget_timeout(bfqq)); ++ ++ return (!bfq_bfqq_wait_request(bfqq) || ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) ++ && ++ bfq_bfqq_budget_timeout(bfqq); ++} ++ ++/* ++ * For a queue that becomes empty, device idling is allowed only if ++ * this function returns true for that queue. As a consequence, since ++ * device idling plays a critical role for both throughput boosting ++ * and service guarantees, the return value of this function plays a ++ * critical role as well. ++ * ++ * In a nutshell, this function returns true only if idling is ++ * beneficial for throughput or, even if detrimental for throughput, ++ * idling is however necessary to preserve service guarantees (low ++ * latency, desired throughput distribution, ...). In particular, on ++ * NCQ-capable devices, this function tries to return false, so as to ++ * help keep the drives' internal queues full, whenever this helps the ++ * device boost the throughput without causing any service-guarantee ++ * issue. ++ * ++ * In more detail, the return value of this function is obtained by, ++ * first, computing a number of boolean variables that take into ++ * account throughput and service-guarantee issues, and, then, ++ * combining these variables in a logical expression. Most of the ++ * issues taken into account are not trivial. We discuss these issues ++ * while introducing the variables. ++ */ ++static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) ++{ ++ struct bfq_data *bfqd = bfqq->bfqd; ++ bool rot_without_queueing = ++ !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, ++ bfqq_sequential_and_IO_bound, ++ idling_boosts_thr, idling_boosts_thr_without_issues, ++ idling_needed_for_service_guarantees, ++ asymmetric_scenario; ++ ++ if (bfqd->strict_guarantees) ++ return true; ++ ++ /* ++ * Idling is performed only if slice_idle > 0. 
In addition, we ++ * do not idle if ++ * (a) bfqq is async ++ * (b) bfqq is in the idle io prio class: in this case we do ++ * not idle because we want to minimize the bandwidth that ++ * queues in this class can steal to higher-priority queues ++ */ ++ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || ++ bfq_class_idle(bfqq)) ++ return false; ++ ++ bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && ++ bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); ++ /* ++ * The next variable takes into account the cases where idling ++ * boosts the throughput. ++ * ++ * The value of the variable is computed considering, first, that ++ * idling is virtually always beneficial for the throughput if: ++ * (a) the device is not NCQ-capable and rotational, or ++ * (b) regardless of the presence of NCQ, the device is rotational and ++ * the request pattern for bfqq is I/O-bound and sequential, or ++ * (c) regardless of whether it is rotational, the device is ++ * not NCQ-capable and the request pattern for bfqq is ++ * I/O-bound and sequential. ++ * ++ * Secondly, and in contrast to the above item (b), idling an ++ * NCQ-capable flash-based device would not boost the ++ * throughput even with sequential I/O; rather it would lower ++ * the throughput in proportion to how fast the device ++ * is. Accordingly, the next variable is true if any of the ++ * above conditions (a), (b) or (c) is true, and, in ++ * particular, happens to be false if bfqd is an NCQ-capable ++ * flash-based device. ++ */ ++ idling_boosts_thr = rot_without_queueing || ++ ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && ++ bfqq_sequential_and_IO_bound); ++ ++ /* ++ * The value of the next variable, ++ * idling_boosts_thr_without_issues, is equal to that of ++ * idling_boosts_thr, unless a special case holds. In this ++ * special case, described below, idling may cause problems to ++ * weight-raised queues. ++ * ++ * When the request pool is saturated (e.g., in the presence ++ * of write hogs), if the processes associated with ++ * non-weight-raised queues ask for requests at a lower rate, ++ * then processes associated with weight-raised queues have a ++ * higher probability to get a request from the pool ++ * immediately (or at least soon) when they need one. Thus ++ * they have a higher probability to actually get a fraction ++ * of the device throughput proportional to their high ++ * weight. This is especially true with NCQ-capable drives, ++ * which enqueue several requests in advance, and further ++ * reorder internally-queued requests. ++ * ++ * For this reason, we force to false the value of ++ * idling_boosts_thr_without_issues if there are weight-raised ++ * busy queues. In this case, and if bfqq is not weight-raised, ++ * this guarantees that the device is not idled for bfqq (if, ++ * instead, bfqq is weight-raised, then idling will be ++ * guaranteed by another variable, see below). Combined with ++ * the timestamping rules of BFQ (see [1] for details), this ++ * behavior causes bfqq, and hence any sync non-weight-raised ++ * queue, to get a lower number of requests served, and thus ++ * to ask for a lower number of requests from the request ++ * pool, before the busy weight-raised queues get served ++ * again. This often mitigates starvation problems in the ++ * presence of heavy write workloads and NCQ, thereby ++ * guaranteeing a higher application and system responsiveness ++ * in these hostile scenarios. 
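++ *
++ * [Editor's summary, not part of the original patch -- the
++ * throughput leg of the decision, by device type:
++ *
++ *	rotational, no NCQ   idle (always boosts throughput)
++ *	rotational, NCQ      idle only for sequential IO-bound queues
++ *	flash, no NCQ        idle only for sequential IO-bound queues
++ *	flash, NCQ           never idle for throughput
++ *
++ * the result is then vetoed, per the paragraph above, whenever
++ * some weight-raised queue is busy.]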
++ */ ++ idling_boosts_thr_without_issues = idling_boosts_thr && ++ bfqd->wr_busy_queues == 0; ++ ++ /* ++ * There is then a case where idling must be performed not ++ * for throughput concerns, but to preserve service ++ * guarantees. ++ * ++ * To introduce this case, we can note that allowing the drive ++ * to enqueue more than one request at a time, and hence ++ * delegating de facto final scheduling decisions to the ++ * drive's internal scheduler, entails loss of control on the ++ * actual request service order. In particular, the critical ++ * situation is when requests from different processes happen ++ * to be present, at the same time, in the internal queue(s) ++ * of the drive. In such a situation, the drive, by deciding ++ * the service order of the internally-queued requests, does ++ * determine also the actual throughput distribution among ++ * these processes. But the drive typically has no notion or ++ * concern about per-process throughput distribution, and ++ * makes its decisions only on a per-request basis. Therefore, ++ * the service distribution enforced by the drive's internal ++ * scheduler is likely to coincide with the desired ++ * device-throughput distribution only in a completely ++ * symmetric scenario where: ++ * (i) each of these processes must get the same throughput as ++ * the others; ++ * (ii) all these processes have the same I/O pattern ++ * (either sequential or random). ++ * In fact, in such a scenario, the drive will tend to treat ++ * the requests of each of these processes in about the same ++ * way as the requests of the others, and thus to provide ++ * each of these processes with about the same throughput ++ * (which is exactly the desired throughput distribution). In ++ * contrast, in any asymmetric scenario, device idling is ++ * certainly needed to guarantee that bfqq receives its ++ * assigned fraction of the device throughput (see [1] for ++ * details). ++ * ++ * We address this issue by controlling, actually, only the ++ * symmetry sub-condition (i), i.e., provided that ++ * sub-condition (i) holds, idling is not performed, ++ * regardless of whether sub-condition (ii) holds. In other ++ * words, only if sub-condition (i) holds, then idling is ++ * allowed, and the device tends to be prevented from queueing ++ * many requests, possibly of several processes. The reason ++ * for not controlling also sub-condition (ii) is that we ++ * exploit preemption to preserve guarantees in case of ++ * symmetric scenarios, even if (ii) does not hold, as ++ * explained in the next two paragraphs. ++ * ++ * Even if a queue, say Q, is expired when it remains idle, Q ++ * can still preempt the new in-service queue if the next ++ * request of Q arrives soon (see the comments on ++ * bfq_bfqq_update_budg_for_activation). If all queues and ++ * groups have the same weight, this form of preemption, ++ * combined with the hole-recovery heuristic described in the ++ * comments on function bfq_bfqq_update_budg_for_activation, ++ * are enough to preserve a correct bandwidth distribution in ++ * the mid term, even without idling. In fact, even if not ++ * idling allows the internal queues of the device to contain ++ * many requests, and thus to reorder requests, we can rather ++ * safely assume that the internal scheduler still preserves a ++ * minimum of mid-term fairness. The motivation for using ++ * preemption instead of idling is that, by not idling, ++ * service guarantees are preserved without minimally ++ * sacrificing throughput. 
In other words, both a high ++ * throughput and its desired distribution are obtained. ++ * ++ * More precisely, this preemption-based, idleless approach ++ * provides fairness in terms of IOPS, and not sectors per ++ * second. This can be seen with a simple example. Suppose ++ * that there are two queues with the same weight, but that ++ * the first queue receives requests of 8 sectors, while the ++ * second queue receives requests of 1024 sectors. In ++ * addition, suppose that each of the two queues contains at ++ * most one request at a time, which implies that each queue ++ * always remains idle after it is served. Finally, after ++ * remaining idle, each queue receives very quickly a new ++ * request. It follows that the two queues are served ++ * alternatively, preempting each other if needed. This ++ * implies that, although both queues have the same weight, ++ * the queue with large requests receives a service that is ++ * 1024/8 times as high as the service received by the other ++ * queue. ++ * ++ * On the other hand, device idling is performed, and thus ++ * pure sector-domain guarantees are provided, for the ++ * following queues, which are likely to need stronger ++ * throughput guarantees: weight-raised queues, and queues ++ * with a higher weight than other queues. When such queues ++ * are active, sub-condition (i) is false, which triggers ++ * device idling. ++ * ++ * According to the above considerations, the next variable is ++ * true (only) if sub-condition (i) holds. To compute the ++ * value of this variable, we not only use the return value of ++ * the function bfq_symmetric_scenario(), but also check ++ * whether bfqq is being weight-raised, because ++ * bfq_symmetric_scenario() does not take into account also ++ * weight-raised queues (see comments on ++ * bfq_weights_tree_add()). ++ * ++ * As a side note, it is worth considering that the above ++ * device-idling countermeasures may however fail in the ++ * following unlucky scenario: if idling is (correctly) ++ * disabled in a time period during which all symmetry ++ * sub-conditions hold, and hence the device is allowed to ++ * enqueue many requests, but at some later point in time some ++ * sub-condition stops holding, then it may become impossible ++ * to let requests be served in the desired order until all ++ * the requests already queued in the device have been served. ++ */ ++ asymmetric_scenario = bfqq->wr_coeff > 1 || ++ !bfq_symmetric_scenario(bfqd); ++ ++ /* ++ * Finally, there is a case where maximizing throughput is the ++ * best choice even if it may cause unfairness toward ++ * bfqq. Such a case is when bfqq became active in a burst of ++ * queue activations. Queues that became active during a large ++ * burst benefit only from throughput, as discussed in the ++ * comments on bfq_handle_burst. Thus, if bfqq became active ++ * in a burst and not idling the device maximizes throughput, ++ * then the device must not be idled, because not idling the ++ * device provides bfqq and all other queues in the burst with ++ * maximum benefit. Combining this and the above case, we can ++ * now establish when idling is actually needed to preserve ++ * service guarantees. ++ */ ++ idling_needed_for_service_guarantees = ++ asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); ++ ++ /* ++ * We now have all the components we need to compute the ++ * return value of the function, which is true only if idling ++ * either boosts the throughput (without issues), or is ++ * necessary to preserve service guarantees.
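++ *
++ * [Editor's recap, not part of the original patch: the value
++ * returned below is simply
++ *
++ *	idling_boosts_thr_without_issues ||
++ *		idling_needed_for_service_guarantees
++ *
++ * so, e.g., an NCQ-capable SSD idles only in the asymmetric or
++ * weight-raised case, while a plain rotational disk also idles
++ * for throughput alone.]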
++ */ ++ bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", ++ bfq_bfqq_sync(bfqq), idling_boosts_thr); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", ++ bfqd->wr_busy_queues, ++ idling_boosts_thr_without_issues, ++ bfq_bfqq_IO_bound(bfqq), ++ idling_needed_for_service_guarantees); ++ ++ return idling_boosts_thr_without_issues || ++ idling_needed_for_service_guarantees; ++} ++ ++/* ++ * If the in-service queue is empty but the function bfq_bfqq_may_idle ++ * returns true, then: ++ * 1) the queue must remain in service and cannot be expired, and ++ * 2) the device must be idled to wait for the possible arrival of a new ++ * request for the queue. ++ * See the comments on the function bfq_bfqq_may_idle for the reasons ++ * why performing device idling is the best choice to boost the throughput ++ * and preserve service guarantees when bfq_bfqq_may_idle itself ++ * returns true. ++ */ ++static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) ++{ ++ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq); ++} ++ ++/* ++ * Select a queue for service. If we have a current queue in service, ++ * check whether to continue servicing it, or retrieve and set a new one. ++ */ ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq; ++ struct request *next_rq; ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ ++ bfqq = bfqd->in_service_queue; ++ if (!bfqq) ++ goto new_queue; ++ ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); ++ ++ if (bfq_may_expire_for_budg_timeout(bfqq) && ++ !hrtimer_active(&bfqd->idle_slice_timer) && ++ !bfq_bfqq_must_idle(bfqq)) ++ goto expire; ++ ++check_queue: ++ /* ++ * This loop is rarely executed more than once. Even when it ++ * happens, it is much more convenient to re-execute this loop ++ * than to return NULL and trigger a new dispatch to get a ++ * request served. ++ */ ++ next_rq = bfqq->next_rq; ++ /* ++ * If bfqq has requests queued and it has enough budget left to ++ * serve them, keep the queue, otherwise expire it. ++ */ ++ if (next_rq) { ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ if (bfq_serv_to_charge(next_rq, bfqq) > ++ bfq_bfqq_budget_left(bfqq)) { ++ /* ++ * Expire the queue for budget exhaustion, ++ * which makes sure that the next budget is ++ * enough to serve the next request, even if ++ * it comes from the fifo expired path. ++ */ ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; ++ goto expire; ++ } else { ++ /* ++ * The idle timer may be pending because we may ++ * not disable disk idling even when a new request ++ * arrives. ++ */ ++ if (bfq_bfqq_wait_request(bfqq)) { ++ BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); ++ /* ++ * If we get here: 1) at least a new request ++ * has arrived but we have not disabled the ++ * timer because the request was too small, ++ * 2) then the block layer has unplugged ++ * the device, causing the dispatch to be ++ * invoked. ++ * ++ * Since the device is unplugged, now the ++ * requests are probably large enough to ++ * provide a reasonable throughput. ++ * So we disable idling. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); ++ bfqg_stats_update_idle_time(bfqq_group(bfqq)); ++ } ++ goto keep_queue; ++ } ++ } ++ ++ /* ++ * No requests pending. However, if the in-service queue is idling ++ * for a new request, or has requests waiting for a completion and ++ * may idle after their completion, then keep it anyway. 
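++ *
++ * [Editor's summary of the decision below, not part of the
++ * original patch:
++ *
++ *	budget timeout, not idling       expire, pick a new queue
++ *	next_rq fits residual budget     keep (cancel idle timer)
++ *	next_rq exceeds residual budget  expire, BUDGET_EXHAUSTED
++ *	empty, idle timer armed, or may
++ *	idle once in-flight reqs finish  keep idling (return NULL)
++ *	empty, no reason to wait         expire, NO_MORE_REQUESTS]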
++ */ ++ if (hrtimer_active(&bfqd->idle_slice_timer) || ++ (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { ++ bfqq = NULL; ++ goto keep_queue; ++ } ++ ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS; ++expire: ++ bfq_bfqq_expire(bfqd, bfqq, false, reason); ++new_queue: ++ bfqq = bfq_set_in_service_queue(bfqd); ++ if (bfqq) { ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); ++ goto check_queue; ++ } ++keep_queue: ++ if (bfqq) ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); ++ else ++ bfq_log(bfqd, "select_queue: no queue returned"); ++ ++ return bfqq; ++} ++ ++static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ ++ BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && ++ time_is_after_jiffies(bfqq->last_wr_start_finish)); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", ++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time), ++ bfqq->wr_coeff, ++ bfqq->entity.weight, bfqq->entity.orig_weight); ++ ++ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != ++ entity->orig_weight * bfqq->wr_coeff); ++ if (entity->prio_changed) ++ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); ++ ++ /* ++ * If the queue was activated in a burst, or too much ++ * time has elapsed from the beginning of this ++ * weight-raising period, then end weight raising. ++ */ ++ if (bfq_bfqq_in_large_burst(bfqq)) ++ bfq_bfqq_end_wr(bfqq); ++ else if (time_is_before_jiffies(bfqq->last_wr_start_finish + ++ bfqq->wr_cur_max_time)) { ++ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || ++ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + ++ bfq_wr_duration(bfqd))) ++ bfq_bfqq_end_wr(bfqq); ++ else { ++ /* switch back to interactive wr */ ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ bfqq->last_wr_start_finish = ++ bfqq->wr_start_at_switch_to_srt; ++ BUG_ON(time_is_after_jiffies( ++ bfqq->last_wr_start_finish)); ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "back to interactive wr"); ++ } ++ } ++ } ++ /* ++ * To improve latency (for this or other queues), immediately ++ * update weight both if it must be raised and if it must be ++ * lowered. Since, entity may be on some active tree here, and ++ * might have a pending change of its ioprio class, invoke ++ * next function with the last parameter unset (see the ++ * comments on the function). ++ */ ++ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) ++ __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity), ++ entity, false); ++} ++ ++/* ++ * Dispatch one request from bfqq, moving it to the request queue ++ * dispatch list. 
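++ *
++ * [Editor's note, not part of the original patch: stripped of
++ * accounting, the path below boils down to
++ *
++ *	rq = bfqq->next_rq;
++ *	bfq_bfqq_served(bfqq, bfq_serv_to_charge(rq, bfqq));
++ *	bfq_dispatch_insert(bfqd->queue, rq);
++ *
++ * where bfq_serv_to_charge() may return more than
++ * blk_rq_sectors(rq) for async queues (the charging factor
++ * mentioned in the budget comments above), so their budgets
++ * drain faster.]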
++ */ ++ static int bfq_dispatch_request(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ int dispatched = 0; ++ struct request *rq = bfqq->next_rq; ++ unsigned long service_to_charge; ++ ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ BUG_ON(!rq); ++ service_to_charge = bfq_serv_to_charge(rq, bfqq); ++ ++ BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq)); ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ bfq_bfqq_served(bfqq, service_to_charge); ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ bfq_dispatch_insert(bfqd->queue, rq); ++ ++ /* ++ * If weight raising has to terminate for bfqq, then the next ++ * function causes an immediate update of bfqq's weight, ++ * without waiting for next activation. As a consequence, on ++ * expiration, bfqq will be timestamped as if it had never been ++ * weight-raised during this service slot, even if it has ++ * received part or even most of the service as a ++ * weight-raised queue. This inflates bfqq's timestamps, which ++ * is beneficial, as bfqq is then more willing to leave the ++ * device immediately to possible other weight-raised queues. ++ */ ++ bfq_update_wr_data(bfqd, bfqq); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "dispatched %u sec req (%llu), budg left %d", ++ blk_rq_sectors(rq), ++ (unsigned long long) blk_rq_pos(rq), ++ bfq_bfqq_budget_left(bfqq)); ++ ++ dispatched++; ++ ++ if (!bfqd->in_service_bic) { ++ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); ++ bfqd->in_service_bic = RQ_BIC(rq); ++ } ++ ++ if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) ++ goto expire; ++ ++ return dispatched; ++ ++expire: ++ bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); ++ return dispatched; ++} ++ ++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) ++{ ++ int dispatched = 0; ++ ++ while (bfqq->next_rq) { ++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); ++ dispatched++; ++ } ++ ++ BUG_ON(!list_empty(&bfqq->fifo)); ++ return dispatched; ++} ++ ++/* ++ * Drain our current requests. ++ * Used for barriers and when switching io schedulers on-the-fly. ++ */ ++static int bfq_forced_dispatch(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq, *n; ++ struct bfq_service_tree *st; ++ int dispatched = 0; ++ ++ bfqq = bfqd->in_service_queue; ++ if (bfqq) ++ __bfq_bfqq_expire(bfqd, bfqq); ++ ++ /* ++ * Loop through classes, and be careful to leave the scheduler ++ * in a consistent state, as feedback mechanisms and vtime ++ * updates cannot be disabled during the process. ++ */ ++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { ++ st = bfq_entity_service_tree(&bfqq->entity); ++ ++ dispatched += __bfq_forced_dispatch_bfqq(bfqq); ++ ++ bfqq->max_budget = bfq_max_budget(bfqd); ++ bfq_forget_idle(st); ++ } ++ ++ BUG_ON(bfqd->busy_queues != 0); ++ ++ return dispatched; ++} ++ ++static int bfq_dispatch_requests(struct request_queue *q, int force) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq; ++ ++ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); ++ ++ if (bfqd->busy_queues == 0) ++ return 0; ++ ++ if (unlikely(force)) ++ return bfq_forced_dispatch(bfqd); ++ ++ /* ++ * Force device to serve one request at a time if ++ * strict_guarantees is true. Forcing this service scheme is ++ * currently the ONLY way to guarantee that the request ++ * service order enforced by the scheduler is respected by a ++ * queueing device. Otherwise the device is free even to make ++ * some unlucky request wait for as long as the device ++ * wishes.
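++ *
++ * [Editor's note, not part of the original patch: the tunable is
++ * exposed through the elevator's sysfs directory, e.g. (path
++ * assumed)
++ *
++ *	echo 1 > /sys/block/sda/queue/iosched/strict_guarantees
++ *
++ * trading throughput for strict ordering as described above.]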
++ * ++ * Of course, serving one request at a time may cause loss of ++ * throughput. ++ */ ++ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) ++ return 0; ++ ++ bfqq = bfq_select_queue(bfqd); ++ if (!bfqq) ++ return 0; ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ BUG_ON(bfq_bfqq_wait_request(bfqq)); ++ ++ if (!bfq_dispatch_request(bfqd, bfqq)) ++ return 0; ++ ++ bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", ++ bfq_bfqq_sync(bfqq) ? "sync" : "async"); ++ ++ BUG_ON(bfqq->next_rq == NULL && ++ bfqq->entity.budget < bfqq->entity.service); ++ return 1; ++} ++ ++/* ++ * Task holds one reference to the queue, dropped when task exits. Each rq ++ * in-flight on this queue also holds a reference, dropped when rq is freed. ++ * ++ * Queue lock must be held here. Recall not to use bfqq after calling ++ * this function on it. ++ */ ++static void bfq_put_queue(struct bfq_queue *bfqq) ++{ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ struct bfq_group *bfqg = bfqq_group(bfqq); ++#endif ++ ++ BUG_ON(bfqq->ref <= 0); ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); ++ bfqq->ref--; ++ if (bfqq->ref) ++ return; ++ ++ BUG_ON(rb_first(&bfqq->sort_list)); ++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); ++ BUG_ON(bfqq->entity.tree); ++ BUG_ON(bfq_bfqq_busy(bfqq)); ++ ++ if (bfq_bfqq_sync(bfqq)) ++ /* ++ * The fact that this queue is being destroyed does not ++ * invalidate the fact that this queue may have been ++ * activated during the current burst. As a consequence, ++ * although the queue does not exist anymore, and hence ++ * needs to be removed from the burst list if there, ++ * the burst size must not be decremented. ++ */ ++ hlist_del_init(&bfqq->burst_list_node); ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); ++ ++ kmem_cache_free(bfq_pool, bfqq); ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ bfqg_put(bfqg); ++#endif ++} ++ ++static void bfq_put_cooperator(struct bfq_queue *bfqq) ++{ ++ struct bfq_queue *__bfqq, *next; ++ ++ /* ++ * If this queue was scheduled to merge with another queue, be ++ * sure to drop the reference taken on that queue (and others in ++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. ++ */ ++ __bfqq = bfqq->new_bfqq; ++ while (__bfqq) { ++ if (__bfqq == bfqq) ++ break; ++ next = __bfqq->new_bfqq; ++ bfq_put_queue(__bfqq); ++ __bfqq = next; ++ } ++} ++ ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ if (bfqq == bfqd->in_service_queue) { ++ __bfq_bfqq_expire(bfqd, bfqq); ++ bfq_schedule_dispatch(bfqd); ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); ++ ++ bfq_put_cooperator(bfqq); ++ ++ bfq_put_queue(bfqq); /* release process reference */ ++} ++ ++static void bfq_init_icq(struct io_cq *icq) ++{ ++ icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); ++} ++ ++static void bfq_exit_icq(struct io_cq *icq) ++{ ++ struct bfq_io_cq *bic = icq_to_bic(icq); ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ ++ if (bic_to_bfqq(bic, false)) { ++ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); ++ bic_set_bfqq(bic, NULL, false); ++ } ++ ++ if (bic_to_bfqq(bic, true)) { ++ /* ++ * If the bic is using a shared queue, put the reference ++ * taken on the io_context when the bic started using a ++ * shared bfq_queue.
++ */ ++ if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) ++ put_io_context(icq->ioc); ++ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); ++ bic_set_bfqq(bic, NULL, true); ++ } ++} ++ ++/* ++ * Update the entity prio values; note that the new values will not ++ * be used until the next (re)activation. ++ */ ++static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic) ++{ ++ struct task_struct *tsk = current; ++ int ioprio_class; ++ ++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); ++ switch (ioprio_class) { ++ default: ++ dev_err(bfqq->bfqd->queue->backing_dev_info->dev, ++ "bfq: bad prio class %d\n", ioprio_class); ++ case IOPRIO_CLASS_NONE: ++ /* ++ * No prio set, inherit CPU scheduling settings. ++ */ ++ bfqq->new_ioprio = task_nice_ioprio(tsk); ++ bfqq->new_ioprio_class = task_nice_ioclass(tsk); ++ break; ++ case IOPRIO_CLASS_RT: ++ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ bfqq->new_ioprio_class = IOPRIO_CLASS_RT; ++ break; ++ case IOPRIO_CLASS_BE: ++ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ bfqq->new_ioprio_class = IOPRIO_CLASS_BE; ++ break; ++ case IOPRIO_CLASS_IDLE: ++ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; ++ bfqq->new_ioprio = 7; ++ break; ++ } ++ ++ if (bfqq->new_ioprio >= IOPRIO_BE_NR) { ++ pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", ++ bfqq->new_ioprio); ++ BUG(); ++ } ++ ++ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "set_next_ioprio_data: bic_class %d prio %d class %d", ++ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); ++} ++ ++static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) ++{ ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ struct bfq_queue *bfqq; ++ unsigned long uninitialized_var(flags); ++ int ioprio = bic->icq.ioc->ioprio; ++ ++ /* ++ * This condition may trigger on a newly created bic, be sure to ++ * drop the lock before returning. 
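++ *
++ * [Editor's background note, not part of the original patch:
++ * bic->ioprio packs class and level into one int,
++ *
++ *	class = IOPRIO_PRIO_CLASS(ioprio);
++ *	data  = IOPRIO_PRIO_DATA(ioprio);	/* 0..7 for RT/BE */
++ *
++ * so e.g. "ionice -c 2 -n 4" yields IOPRIO_CLASS_BE, level 4,
++ * which bfq_set_next_ioprio_data() above turns into a weight via
++ * bfq_ioprio_to_weight().]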
++ */ ++ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) ++ return; ++ ++ bic->ioprio = ioprio; ++ ++ bfqq = bic_to_bfqq(bic, false); ++ if (bfqq) { ++ /* release process reference on this queue */ ++ bfq_put_queue(bfqq); ++ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); ++ bic_set_bfqq(bic, bfqq, false); ++ bfq_log_bfqq(bfqd, bfqq, ++ "check_ioprio_change: bfqq %p %d", ++ bfqq, bfqq->ref); ++ } ++ ++ bfqq = bic_to_bfqq(bic, true); ++ if (bfqq) ++ bfq_set_next_ioprio_data(bfqq, bic); ++} ++ ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic, pid_t pid, int is_sync) ++{ ++ RB_CLEAR_NODE(&bfqq->entity.rb_node); ++ INIT_LIST_HEAD(&bfqq->fifo); ++ INIT_HLIST_NODE(&bfqq->burst_list_node); ++ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); ++ ++ bfqq->ref = 0; ++ bfqq->bfqd = bfqd; ++ ++ if (bic) ++ bfq_set_next_ioprio_data(bfqq, bic); ++ ++ if (is_sync) { ++ /* ++ * No need to mark as has_short_ttime if in ++ * idle_class, because no device idling is performed ++ * for queues in idle class ++ */ ++ if (!bfq_class_idle(bfqq)) ++ /* tentatively mark as has_short_ttime */ ++ bfq_mark_bfqq_has_short_ttime(bfqq); ++ bfq_mark_bfqq_sync(bfqq); ++ bfq_mark_bfqq_just_created(bfqq); ++ } else ++ bfq_clear_bfqq_sync(bfqq); ++ bfq_mark_bfqq_IO_bound(bfqq); ++ ++ /* Tentative initial value to trade off between thr and lat */ ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; ++ bfqq->pid = pid; ++ ++ bfqq->wr_coeff = 1; ++ bfqq->last_wr_start_finish = jiffies; ++ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); ++ bfqq->budget_timeout = bfq_smallest_from_now(); ++ bfqq->split_time = bfq_smallest_from_now(); ++ ++ /* ++ * Set to the value for which bfqq will not be deemed as ++ * soft rt when it becomes backlogged. ++ */ ++ bfqq->soft_rt_next_start = bfq_greatest_from_now(); ++ ++ /* first request is almost certainly seeky */ ++ bfqq->seek_history = 1; ++} ++ ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ int ioprio_class, int ioprio) ++{ ++ switch (ioprio_class) { ++ case IOPRIO_CLASS_RT: ++ return &bfqg->async_bfqq[0][ioprio]; ++ case IOPRIO_CLASS_NONE: ++ ioprio = IOPRIO_NORM; ++ /* fall through */ ++ case IOPRIO_CLASS_BE: ++ return &bfqg->async_bfqq[1][ioprio]; ++ case IOPRIO_CLASS_IDLE: ++ return &bfqg->async_idle_bfqq; ++ default: ++ BUG(); ++ } ++} ++ ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, ++ struct bio *bio, bool is_sync, ++ struct bfq_io_cq *bic) ++{ ++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); ++ struct bfq_queue **async_bfqq = NULL; ++ struct bfq_queue *bfqq; ++ struct bfq_group *bfqg; ++ ++ rcu_read_lock(); ++ ++ bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); ++ if (!bfqg) { ++ bfqq = &bfqd->oom_bfqq; ++ goto out; ++ } ++ ++ if (!is_sync) { ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ++ ioprio); ++ bfqq = *async_bfqq; ++ if (bfqq) ++ goto out; ++ } ++ ++ bfqq = kmem_cache_alloc_node(bfq_pool, ++ GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, ++ bfqd->queue->node); ++ ++ if (bfqq) { ++ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, ++ is_sync); ++ bfq_init_entity(&bfqq->entity, bfqg); ++ bfq_log_bfqq(bfqd, bfqq, "allocated"); ++ } else { ++ bfqq = &bfqd->oom_bfqq; ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); ++ goto out; ++ } ++ ++ /* ++ * Pin the queue now that it's allocated, scheduler exit will ++ * prune it. 
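++ *
++ * [Editor's recap of the reference counting, not part of the
++ * original patch: a bfq_queue stays alive while it holds (a) one
++ * ref per process pointing at it through a bic, (b) one ref per
++ * request in flight, and, for async queues, (c) the extra group
++ * ref taken just below; bfq_put_queue() frees it when the count
++ * reaches zero.]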
++ */ ++ if (async_bfqq) { ++ bfqq->ref++; /* ++ * Extra group reference, w.r.t. sync ++ * queue. This extra reference is removed ++ * only if bfqq->bfqg disappears, to ++ * guarantee that this queue is not freed ++ * until its group goes away. ++ */ ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", ++ bfqq, bfqq->ref); ++ *async_bfqq = bfqq; ++ } ++ ++out: ++ bfqq->ref++; /* get a process reference to this queue */ ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); ++ rcu_read_unlock(); ++ return bfqq; ++} ++ ++static void bfq_update_io_thinktime(struct bfq_data *bfqd, ++ struct bfq_io_cq *bic) ++{ ++ struct bfq_ttime *ttime = &bic->ttime; ++ u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; ++ ++ elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); ++ ++ ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; ++ ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); ++ ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, ++ ttime->ttime_samples); ++} ++ ++static void ++bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ bfqq->seek_history <<= 1; ++ bfqq->seek_history |= ++ get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR && ++ (!blk_queue_nonrot(bfqd->queue) || ++ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); ++} ++ ++static void bfq_update_has_short_ttime(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic) ++{ ++ bool has_short_ttime = true; ++ ++ /* ++ * No need to update has_short_ttime if bfqq is async or in ++ * idle io prio class, or if bfq_slice_idle is zero, because ++ * no device idling is performed for bfqq in this case. ++ */ ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || ++ bfqd->bfq_slice_idle == 0) ++ return; ++ ++ /* Idle window just restored, statistics are meaningless. */ ++ if (time_is_after_eq_jiffies(bfqq->split_time + ++ bfqd->bfq_wr_min_idle_time)) ++ return; ++ ++ /* Think time is infinite if no process is linked to ++ * bfqq. Otherwise check average think time to ++ * decide whether to mark as has_short_ttime ++ */ ++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || ++ (bfq_sample_valid(bic->ttime.ttime_samples) && ++ bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) ++ has_short_ttime = false; ++ ++ bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", ++ has_short_ttime); ++ ++ if (has_short_ttime) ++ bfq_mark_bfqq_has_short_ttime(bfqq); ++ else ++ bfq_clear_bfqq_has_short_ttime(bfqq); ++} ++ ++/* ++ * Called when a new fs request (rq) is added to bfqq. Check if there's ++ * something we should do about it. ++ */ ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ struct bfq_io_cq *bic = RQ_BIC(rq); ++ ++ if (rq->cmd_flags & REQ_META) ++ bfqq->meta_pending++; ++ ++ bfq_update_io_thinktime(bfqd, bic); ++ bfq_update_has_short_ttime(bfqd, bfqq, bic); ++ bfq_update_io_seektime(bfqd, bfqq, rq); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "rq_enqueued: has_short_ttime=%d (seeky %d)", ++ bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); ++ ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ ++ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { ++ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && ++ blk_rq_sectors(rq) < 32; ++ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); ++ ++ /* ++ * There is just this request queued: if the request ++ * is small and the queue is not to be expired, then ++ * just exit. 
++ * ++ * In this way, if the device is being idled to wait ++ * for a new request from the in-service queue, we ++ * avoid unplugging the device and committing the ++ * device to serve just a small request. On the ++ * contrary, we wait for the block layer to decide ++ * when to unplug the device: hopefully, new requests ++ * will be merged to this one quickly, then the device ++ * will be unplugged and larger requests will be ++ * dispatched. ++ */ ++ if (small_req && !budget_timeout) ++ return; ++ ++ /* ++ * A large enough request arrived, or the queue is to ++ * be expired: in both cases disk idling is to be ++ * stopped, so clear wait_request flag and reset ++ * timer. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); ++ bfqg_stats_update_idle_time(bfqq_group(bfqq)); ++ ++ /* ++ * The queue is not empty, because a new request just ++ * arrived. Hence we can safely expire the queue, in ++ * case of budget timeout, without risking that the ++ * timestamps of the queue are not updated correctly. ++ * See [1] for more details. ++ */ ++ if (budget_timeout) ++ bfq_bfqq_expire(bfqd, bfqq, false, ++ BFQ_BFQQ_BUDGET_TIMEOUT); ++ ++ /* ++ * Let the request rip immediately, or let a new queue be ++ * selected if bfqq has just been expired. ++ */ ++ __blk_run_queue(bfqd->queue); ++ } ++} ++ ++static void bfq_insert_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; ++ ++ assert_spin_locked(bfqd->queue->queue_lock); ++ ++ /* ++ * An unplug may trigger a requeue of a request from the device ++ * driver: make sure we are in process context while trying to ++ * merge two bfq_queues. ++ */ ++ if (!in_interrupt()) { ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); ++ if (new_bfqq) { ++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) ++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); ++ /* ++ * Release the request's reference to the old bfqq ++ * and make sure one is taken to the shared queue. ++ */ ++ new_bfqq->allocated[rq_data_dir(rq)]++; ++ bfqq->allocated[rq_data_dir(rq)]--; ++ new_bfqq->ref++; ++ bfq_clear_bfqq_just_created(bfqq); ++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) ++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), ++ bfqq, new_bfqq); ++ /* ++ * rq is about to be enqueued into new_bfqq, ++ * release rq reference on bfqq ++ */ ++ bfq_put_queue(bfqq); ++ rq->elv.priv[1] = new_bfqq; ++ bfqq = new_bfqq; ++ } ++ } ++ ++ bfq_add_request(rq); ++ ++ rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; ++ list_add_tail(&rq->queuelist, &bfqq->fifo); ++ ++ bfq_rq_enqueued(bfqd, bfqq, rq); ++} ++ ++static void bfq_update_hw_tag(struct bfq_data *bfqd) ++{ ++ bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, ++ bfqd->rq_in_driver); ++ ++ if (bfqd->hw_tag == 1) ++ return; ++ ++ /* ++ * This sample is valid if the number of outstanding requests ++ * is large enough to allow a queueing behavior. Note that the ++ * sum is not exact, as it's not taking into account deactivated ++ * requests. 
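++ *
++ * [Editor's note, not part of the original patch: with the
++ * constants defined elsewhere in this patch (conventionally
++ * BFQ_HW_QUEUE_THRESHOLD = 4 and BFQ_HW_QUEUE_SAMPLES = 32),
++ * hw_tag is set only after 32 samples taken with at least 4
++ * requests outstanding, and only if the driver ever held more
++ * than 4 requests at once.]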
++ */ ++ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) ++ return; ++ ++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) ++ return; ++ ++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; ++ bfqd->max_rq_in_driver = 0; ++ bfqd->hw_tag_samples = 0; ++} ++ ++static void bfq_completed_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ u64 now_ns; ++ u32 delta_us; ++ ++ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", ++ blk_rq_sectors(rq)); ++ ++ assert_spin_locked(bfqd->queue->queue_lock); ++ bfq_update_hw_tag(bfqd); ++ ++ BUG_ON(!bfqd->rq_in_driver); ++ BUG_ON(!bfqq->dispatched); ++ bfqd->rq_in_driver--; ++ bfqq->dispatched--; ++ bfqg_stats_update_completion(bfqq_group(bfqq), ++ rq_start_time_ns(rq), ++ rq_io_start_time_ns(rq), ++ rq->cmd_flags); ++ ++ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ /* ++ * Set budget_timeout (which we overload to store the ++ * time at which the queue remains with no backlog and ++ * no outstanding request; used by the weight-raising ++ * mechanism). ++ */ ++ bfqq->budget_timeout = jiffies; ++ ++ bfq_weights_tree_remove(bfqd, &bfqq->entity, ++ &bfqd->queue_weights_tree); ++ } ++ ++ now_ns = ktime_get_ns(); ++ ++ RQ_BIC(rq)->ttime.last_end_request = now_ns; ++ ++ /* ++ * Using us instead of ns, to get a reasonable precision in ++ * computing rate in next check. ++ */ ++ delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); ++ ++ bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", ++ delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, ++ (USEC_PER_SEC* ++ (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us))>>BFQ_RATE_SHIFT, ++ (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); ++ ++ /* ++ * If the request took rather long to complete, and, according ++ * to the maximum request size recorded, this completion latency ++ * implies that the request was certainly served at a very low ++ * rate (less than 1M sectors/sec), then the whole observation ++ * interval that lasts up to this time instant cannot be a ++ * valid time interval for computing a new peak rate. Invoke ++ * bfq_update_rate_reset to have the following three steps ++ * taken: ++ * - close the observation interval at the last (previous) ++ * request dispatch or completion ++ * - compute rate, if possible, for that observation interval ++ * - reset to zero samples, which will trigger a proper ++ * re-initialization of the observation interval on next ++ * dispatch ++ */ ++ if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && ++ (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us < ++ 1UL<<(BFQ_RATE_SHIFT - 10)) ++ bfq_update_rate_reset(bfqd, NULL); ++ bfqd->last_completion = now_ns; ++ ++ /* ++ * If we are waiting to discover whether the request pattern ++ * of the task associated with the queue is actually ++ * isochronous, and both requisites for this condition to hold ++ * are now satisfied, then compute soft_rt_next_start (see the ++ * comments on the function bfq_bfqq_softrt_next_start()). We ++ * schedule this delayed check when bfqq expires, if it still ++ * has in-flight requests. ++ */ ++ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && ++ RB_EMPTY_ROOT(&bfqq->sort_list)) ++ bfqq->soft_rt_next_start = ++ bfq_bfqq_softrt_next_start(bfqd, bfqq); ++ ++ /* ++ * If this is the in-service queue, check if it needs to be expired, ++ * or if we want to idle in case it has no pending requests.
++ */ ++ if (bfqd->in_service_queue == bfqq) { ++ if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { ++ bfq_arm_slice_timer(bfqd); ++ goto out; ++ } else if (bfq_may_expire_for_budg_timeout(bfqq)) ++ bfq_bfqq_expire(bfqd, bfqq, false, ++ BFQ_BFQQ_BUDGET_TIMEOUT); ++ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && ++ (bfqq->dispatched == 0 || ++ !bfq_bfqq_may_idle(bfqq))) ++ bfq_bfqq_expire(bfqd, bfqq, false, ++ BFQ_BFQQ_NO_MORE_REQUESTS); ++ } ++ ++ if (!bfqd->rq_in_driver) ++ bfq_schedule_dispatch(bfqd); ++ ++out: ++ return; ++} ++ ++static int __bfq_may_queue(struct bfq_queue *bfqq) ++{ ++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { ++ bfq_clear_bfqq_must_alloc(bfqq); ++ return ELV_MQUEUE_MUST; ++ } ++ ++ return ELV_MQUEUE_MAY; ++} ++ ++static int bfq_may_queue(struct request_queue *q, unsigned int op) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct task_struct *tsk = current; ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq; ++ ++ /* ++ * Don't force setup of a queue from here, as a call to may_queue ++ * does not necessarily imply that a request actually will be ++ * queued. So just lookup a possibly existing queue, or return ++ * 'may queue' if that fails. ++ */ ++ bic = bfq_bic_lookup(bfqd, tsk->io_context); ++ if (!bic) ++ return ELV_MQUEUE_MAY; ++ ++ bfqq = bic_to_bfqq(bic, op_is_sync(op)); ++ if (bfqq) ++ return __bfq_may_queue(bfqq); ++ ++ return ELV_MQUEUE_MAY; ++} ++ ++/* ++ * Queue lock held here. ++ */ ++static void bfq_put_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ if (bfqq) { ++ const int rw = rq_data_dir(rq); ++ ++ BUG_ON(!bfqq->allocated[rw]); ++ bfqq->allocated[rw]--; ++ ++ rq->elv.priv[0] = NULL; ++ rq->elv.priv[1] = NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", ++ bfqq, bfqq->ref); ++ bfq_put_queue(bfqq); ++ } ++} ++ ++/* ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this ++ * was the last process referring to that bfqq. ++ */ ++static struct bfq_queue * ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); ++ ++ put_io_context(bic->icq.ioc); ++ ++ if (bfqq_process_refs(bfqq) == 1) { ++ bfqq->pid = current->pid; ++ bfq_clear_bfqq_coop(bfqq); ++ bfq_clear_bfqq_split_coop(bfqq); ++ return bfqq; ++ } ++ ++ bic_set_bfqq(bic, NULL, 1); ++ ++ bfq_put_cooperator(bfqq); ++ ++ bfq_put_queue(bfqq); ++ return NULL; ++} ++ ++/* ++ * Allocate bfq data structures associated with this request. 
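++ *
++ * [Editor's roadmap, not part of the original patch -- per
++ * request, the legacy elevator hooks fire roughly as:
++ *
++ *	bfq_set_request()        pin bic + bfqq in rq->elv.priv[]
++ *	bfq_insert_request()     queue rq, maybe merge bfq_queues
++ *	bfq_dispatch_requests()  charge budget, hand rq to driver
++ *	bfq_completed_request()  update stats, idle or expire
++ *	bfq_put_request()        drop the per-request reference
++ *
++ * with bfq_set_request() below also handling queue splits.]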
++ */ ++static int bfq_set_request(struct request_queue *q, struct request *rq, ++ struct bio *bio, gfp_t gfp_mask) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); ++ const int rw = rq_data_dir(rq); ++ const int is_sync = rq_is_sync(rq); ++ struct bfq_queue *bfqq; ++ unsigned long flags; ++ bool bfqq_already_existing = false, split = false; ++ ++ spin_lock_irqsave(q->queue_lock, flags); ++ ++ if (!bic) ++ goto queue_fail; ++ ++ bfq_check_ioprio_change(bic, bio); ++ ++ bfq_bic_update_cgroup(bic, bio); ++ ++new_queue: ++ bfqq = bic_to_bfqq(bic, is_sync); ++ if (!bfqq || bfqq == &bfqd->oom_bfqq) { ++ if (bfqq) ++ bfq_put_queue(bfqq); ++ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); ++ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); ++ ++ bic_set_bfqq(bic, bfqq, is_sync); ++ if (split && is_sync) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "set_request: was_in_list %d " ++ "was_in_large_burst %d " ++ "large burst in progress %d", ++ bic->was_in_burst_list, ++ bic->saved_in_large_burst, ++ bfqd->large_burst); ++ ++ if ((bic->was_in_burst_list && bfqd->large_burst) || ++ bic->saved_in_large_burst) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "set_request: marking in " ++ "large burst"); ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ } else { ++ bfq_log_bfqq(bfqd, bfqq, ++ "set_request: clearing in " ++ "large burst"); ++ bfq_clear_bfqq_in_large_burst(bfqq); ++ if (bic->was_in_burst_list) ++ hlist_add_head(&bfqq->burst_list_node, ++ &bfqd->burst_list); ++ } ++ bfqq->split_time = jiffies; ++ } ++ } else { ++ /* If the queue was seeky for too long, break it apart. */ ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); ++ ++ /* Update bic before losing reference to bfqq */ ++ if (bfq_bfqq_in_large_burst(bfqq)) ++ bic->saved_in_large_burst = true; ++ ++ bfqq = bfq_split_bfqq(bic, bfqq); ++ split = true; ++ if (!bfqq) ++ goto new_queue; ++ else ++ bfqq_already_existing = true; ++ } ++ } ++ ++ bfqq->allocated[rw]++; ++ bfqq->ref++; ++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); ++ ++ rq->elv.priv[0] = bic; ++ rq->elv.priv[1] = bfqq; ++ ++ /* ++ * If a bfq_queue has only one process reference, it is owned ++ * by only one bfq_io_cq: we can set the bic field of the ++ * bfq_queue to the address of that structure. Also, if the ++ * queue has just been split, mark a flag so that the ++ * information is available to the other scheduler hooks. ++ */ ++ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { ++ bfqq->bic = bic; ++ if (split) { ++ /* ++ * If the queue has just been split from a shared ++ * queue, restore the idle window and the possible ++ * weight raising period. ++ */ ++ bfq_bfqq_resume_state(bfqq, bfqd, bic, ++ bfqq_already_existing); ++ } ++ } ++ ++ if (unlikely(bfq_bfqq_just_created(bfqq))) ++ bfq_handle_burst(bfqd, bfqq); ++ ++ spin_unlock_irqrestore(q->queue_lock, flags); ++ ++ return 0; ++ ++queue_fail: ++ bfq_schedule_dispatch(bfqd); ++ spin_unlock_irqrestore(q->queue_lock, flags); ++ ++ return 1; ++} ++ ++static void bfq_kick_queue(struct work_struct *work) ++{ ++ struct bfq_data *bfqd = ++ container_of(work, struct bfq_data, unplug_work); ++ struct request_queue *q = bfqd->queue; ++ ++ spin_lock_irq(q->queue_lock); ++ __blk_run_queue(q); ++ spin_unlock_irq(q->queue_lock); ++} ++ ++/* ++ * Handler of the expiration of the timer running if the in-service queue ++ * is idling inside its time slice. 
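++ * The handler takes the device queue lock itself and always
++ * returns HRTIMER_NORESTART; the timer is re-armed, when needed,
++ * by bfq_arm_slice_timer().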
++ */ ++static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) ++{ ++ struct bfq_data *bfqd = container_of(timer, struct bfq_data, ++ idle_slice_timer); ++ struct bfq_queue *bfqq; ++ unsigned long flags; ++ enum bfqq_expiration reason; ++ ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags); ++ ++ bfqq = bfqd->in_service_queue; ++ /* ++ * Theoretical race here: the in-service queue can be NULL or ++ * different from the queue that was idling if the timer handler ++ * spins on the queue_lock and a new request arrives for the ++ * current queue and there is a full dispatch cycle that changes ++ * the in-service queue. This can hardly happen, but in the worst ++ * case we just expire a queue too early. ++ */ ++ if (bfqq) { ++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); ++ bfq_clear_bfqq_wait_request(bfqq); ++ ++ if (bfq_bfqq_budget_timeout(bfqq)) ++ /* ++ * Also here the queue can be safely expired ++ * for budget timeout without wasting ++ * guarantees ++ */ ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) ++ /* ++ * The queue may not be empty upon timer expiration, ++ * because we may not disable the timer when the ++ * first request of the in-service queue arrives ++ * during disk idling. ++ */ ++ reason = BFQ_BFQQ_TOO_IDLE; ++ else ++ goto schedule_dispatch; ++ ++ bfq_bfqq_expire(bfqd, bfqq, true, reason); ++ } ++ ++schedule_dispatch: ++ bfq_schedule_dispatch(bfqd); ++ ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); ++ return HRTIMER_NORESTART; ++} ++ ++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) ++{ ++ hrtimer_cancel(&bfqd->idle_slice_timer); ++ cancel_work_sync(&bfqd->unplug_work); ++} ++ ++static void __bfq_put_async_bfqq(struct bfq_data *bfqd, ++ struct bfq_queue **bfqq_ptr) ++{ ++ struct bfq_group *root_group = bfqd->root_group; ++ struct bfq_queue *bfqq = *bfqq_ptr; ++ ++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq); ++ if (bfqq) { ++ bfq_bfqq_move(bfqd, bfqq, root_group); ++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", ++ bfqq, bfqq->ref); ++ bfq_put_queue(bfqq); ++ *bfqq_ptr = NULL; ++ } ++} ++ ++/* ++ * Release all the bfqg references to its async queues. If we are ++ * deallocating the group these queues may still contain requests, so ++ * we reparent them to the root cgroup (i.e., the only one that will ++ * exist for sure until all the requests on a device are gone). 
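++ * The loop below walks the per-ioprio-level async queues of the
++ * RT and BE classes, plus the single shared async queue of the
++ * IDLE class.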
++ */ ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) ++{ ++ int i, j; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_BE_NR; j++) ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); ++ ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); ++} ++ ++static void bfq_exit_queue(struct elevator_queue *e) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ struct request_queue *q = bfqd->queue; ++ struct bfq_queue *bfqq, *n; ++ ++ bfq_shutdown_timer_wq(bfqd); ++ ++ spin_lock_irq(q->queue_lock); ++ ++ BUG_ON(bfqd->in_service_queue); ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) ++ bfq_deactivate_bfqq(bfqd, bfqq, false, false); ++ ++ spin_unlock_irq(q->queue_lock); ++ ++ bfq_shutdown_timer_wq(bfqd); ++ ++ BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); ++ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ blkcg_deactivate_policy(q, &blkcg_policy_bfq); ++#else ++ bfq_put_async_queues(bfqd, bfqd->root_group); ++ kfree(bfqd->root_group); ++#endif ++ ++ kfree(bfqd); ++} ++ ++static void bfq_init_root_group(struct bfq_group *root_group, ++ struct bfq_data *bfqd) ++{ ++ int i; ++ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ root_group->entity.parent = NULL; ++ root_group->my_entity = NULL; ++ root_group->bfqd = bfqd; ++#endif ++ root_group->rq_pos_tree = RB_ROOT; ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) ++ root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; ++ root_group->sched_data.bfq_class_idle_last_service = jiffies; ++} ++ ++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) ++{ ++ struct bfq_data *bfqd; ++ struct elevator_queue *eq; ++ ++ eq = elevator_alloc(q, e); ++ if (!eq) ++ return -ENOMEM; ++ ++ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); ++ if (!bfqd) { ++ kobject_put(&eq->kobj); ++ return -ENOMEM; ++ } ++ eq->elevator_data = bfqd; ++ ++ /* ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. ++ * Grab a permanent reference to it, so that the normal code flow ++ * will not attempt to free it. ++ */ ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); ++ bfqd->oom_bfqq.ref++; ++ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; ++ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; ++ bfqd->oom_bfqq.entity.new_weight = ++ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); ++ ++ /* oom_bfqq does not participate to bursts */ ++ bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); ++ /* ++ * Trigger weight initialization, according to ioprio, at the ++ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio ++ * class won't be changed any more. 
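++ * (The weight itself is then derived from
++ * BFQ_DEFAULT_QUEUE_IOPRIO, through bfq_ioprio_to_weight(), as
++ * set just above.)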
++ */ ++ bfqd->oom_bfqq.entity.prio_changed = 1; ++ ++ bfqd->queue = q; ++ ++ spin_lock_irq(q->queue_lock); ++ q->elevator = eq; ++ spin_unlock_irq(q->queue_lock); ++ ++ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); ++ if (!bfqd->root_group) ++ goto out_free; ++ bfq_init_root_group(bfqd->root_group, bfqd); ++ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); ++ ++ hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, ++ HRTIMER_MODE_REL); ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; ++ ++ bfqd->queue_weights_tree = RB_ROOT; ++ bfqd->group_weights_tree = RB_ROOT; ++ ++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); ++ ++ INIT_LIST_HEAD(&bfqd->active_list); ++ INIT_LIST_HEAD(&bfqd->idle_list); ++ INIT_HLIST_HEAD(&bfqd->burst_list); ++ ++ bfqd->hw_tag = -1; ++ ++ bfqd->bfq_max_budget = bfq_default_max_budget; ++ ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; ++ bfqd->bfq_back_max = bfq_back_max; ++ bfqd->bfq_back_penalty = bfq_back_penalty; ++ bfqd->bfq_slice_idle = bfq_slice_idle; ++ bfqd->bfq_timeout = bfq_timeout; ++ ++ bfqd->bfq_requests_within_timer = 120; ++ ++ bfqd->bfq_large_burst_thresh = 8; ++ bfqd->bfq_burst_interval = msecs_to_jiffies(180); ++ ++ bfqd->low_latency = true; ++ ++ /* ++ * Trade-off between responsiveness and fairness. ++ */ ++ bfqd->bfq_wr_coeff = 30; ++ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); ++ bfqd->bfq_wr_max_time = 0; ++ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); ++ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); ++ bfqd->bfq_wr_max_softrt_rate = 7000; /* ++ * Approximate rate required ++ * to playback or record a ++ * high-definition compressed ++ * video. ++ */ ++ bfqd->wr_busy_queues = 0; ++ ++ /* ++ * Begin by assuming, optimistically, that the device is a ++ * high-speed one, and that its peak rate is equal to 2/3 of ++ * the highest reference rate. ++ */ ++ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * ++ T_fast[blk_queue_nonrot(bfqd->queue)]; ++ bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; ++ bfqd->device_speed = BFQ_BFQD_FAST; ++ ++ return 0; ++ ++out_free: ++ kfree(bfqd); ++ kobject_put(&eq->kobj); ++ return -ENOMEM; ++} ++ ++static void bfq_slab_kill(void) ++{ ++ kmem_cache_destroy(bfq_pool); ++} ++ ++static int __init bfq_slab_setup(void) ++{ ++ bfq_pool = KMEM_CACHE(bfq_queue, 0); ++ if (!bfq_pool) ++ return -ENOMEM; ++ return 0; ++} ++ ++static ssize_t bfq_var_show(unsigned int var, char *page) ++{ ++ return sprintf(page, "%u\n", var); ++} ++ ++static ssize_t bfq_var_store(unsigned long *var, const char *page, ++ size_t count) ++{ ++ unsigned long new_val; ++ int ret = kstrtoul(page, 10, &new_val); ++ ++ if (ret == 0) ++ *var = new_val; ++ ++ return count; ++} ++ ++static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ ++ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? 
++ jiffies_to_msecs(bfqd->bfq_wr_max_time) : ++ jiffies_to_msecs(bfq_wr_duration(bfqd))); ++} ++ ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_queue *bfqq; ++ struct bfq_data *bfqd = e->elevator_data; ++ ssize_t num_char = 0; ++ ++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", ++ bfqd->queued); ++ ++ spin_lock_irq(bfqd->queue->queue_lock); ++ ++ num_char += sprintf(page + num_char, "Active:\n"); ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, nr_queued %d %d, ", ++ bfqq->pid, ++ bfqq->entity.weight, ++ bfqq->queued[0], ++ bfqq->queued[1]); ++ num_char += sprintf(page + num_char, ++ "dur %d/%u\n", ++ jiffies_to_msecs( ++ jiffies - ++ bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ ++ num_char += sprintf(page + num_char, "Idle:\n"); ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, dur %d/%u\n", ++ bfqq->pid, ++ bfqq->entity.weight, ++ jiffies_to_msecs(jiffies - ++ bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ ++ spin_unlock_irq(bfqd->queue->queue_lock); ++ ++ return num_char; ++} ++ ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ u64 __data = __VAR; \ ++ if (__CONV == 1) \ ++ __data = jiffies_to_msecs(__data); \ ++ else if (__CONV == 2) \ ++ __data = div_u64(__data, NSEC_PER_MSEC); \ ++ return bfq_var_show(__data, (page)); \ ++} ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); ++SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); ++SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); ++SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); ++SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); ++SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, ++ 1); ++SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); ++#undef SHOW_FUNCTION ++ ++#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ u64 __data = __VAR; \ ++ __data = div_u64(__data, NSEC_PER_USEC); \ ++ return bfq_var_show(__data, (page)); \ ++} ++USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); ++#undef USEC_SHOW_FUNCTION ++ ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ ++static ssize_t \ ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned long uninitialized_var(__data); \ ++ int ret = bfq_var_store(&__data, (page), count); \ ++ if (__data < (MIN)) \ ++ __data = (MIN); \ ++ else if (__data > (MAX)) \ ++ __data = (MAX); \ ++ if (__CONV == 1) \ ++ *(__PTR) = msecs_to_jiffies(__data); \ ++ else if (__CONV == 2) \ ++ *(__PTR) = (u64)__data 
* NSEC_PER_MSEC; \ ++ else \ ++ *(__PTR) = __data; \ ++ return ret; \ ++} ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, ++ INT_MAX, 2); ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, ++ INT_MAX, 2); ++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, ++ INT_MAX, 0); ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); ++STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); ++STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, ++ 1); ++STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, ++ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, ++ INT_MAX, 0); ++#undef STORE_FUNCTION ++ ++#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ ++static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned long uninitialized_var(__data); \ ++ int ret = bfq_var_store(&__data, (page), count); \ ++ if (__data < (MIN)) \ ++ __data = (MIN); \ ++ else if (__data > (MAX)) \ ++ __data = (MAX); \ ++ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ ++ return ret; \ ++} ++USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, ++ UINT_MAX); ++#undef USEC_STORE_FUNCTION ++ ++/* do nothing for the moment */ ++static ssize_t bfq_weights_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ return count; ++} ++ ++static ssize_t bfq_max_budget_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data == 0) ++ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); ++ else { ++ if (__data > INT_MAX) ++ __data = INT_MAX; ++ bfqd->bfq_max_budget = __data; ++ } ++ ++ bfqd->bfq_user_max_budget = __data; ++ ++ return ret; ++} ++ ++/* ++ * Leaving this name to preserve name compatibility with cfq ++ * parameters, but this timeout is used for both sync and async. 
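++ * The value is taken in milliseconds and stored in jiffies; if
++ * the user has not fixed a max budget (bfq_user_max_budget ==
++ * 0), the maximum budget is recomputed to match the new timeout.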
++ */ ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data < 1) ++ __data = 1; ++ else if (__data > INT_MAX) ++ __data = INT_MAX; ++ ++ bfqd->bfq_timeout = msecs_to_jiffies(__data); ++ if (bfqd->bfq_user_max_budget == 0) ++ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); ++ ++ return ret; ++} ++ ++static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data > 1) ++ __data = 1; ++ if (!bfqd->strict_guarantees && __data == 1 ++ && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) ++ bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; ++ ++ bfqd->strict_guarantees = __data; ++ ++ return ret; ++} ++ ++static ssize_t bfq_low_latency_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data > 1) ++ __data = 1; ++ if (__data == 0 && bfqd->low_latency != 0) ++ bfq_end_wr(bfqd); ++ bfqd->low_latency = __data; ++ ++ return ret; ++} ++ ++#define BFQ_ATTR(name) \ ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) ++ ++static struct elv_fs_entry bfq_attrs[] = { ++ BFQ_ATTR(fifo_expire_sync), ++ BFQ_ATTR(fifo_expire_async), ++ BFQ_ATTR(back_seek_max), ++ BFQ_ATTR(back_seek_penalty), ++ BFQ_ATTR(slice_idle), ++ BFQ_ATTR(slice_idle_us), ++ BFQ_ATTR(max_budget), ++ BFQ_ATTR(timeout_sync), ++ BFQ_ATTR(strict_guarantees), ++ BFQ_ATTR(low_latency), ++ BFQ_ATTR(wr_coeff), ++ BFQ_ATTR(wr_max_time), ++ BFQ_ATTR(wr_rt_max_time), ++ BFQ_ATTR(wr_min_idle_time), ++ BFQ_ATTR(wr_min_inter_arr_async), ++ BFQ_ATTR(wr_max_softrt_rate), ++ BFQ_ATTR(weights), ++ __ATTR_NULL ++}; ++ ++static struct elevator_type iosched_bfq = { ++ .ops.sq = { ++ .elevator_merge_fn = bfq_merge, ++ .elevator_merged_fn = bfq_merged_request, ++ .elevator_merge_req_fn = bfq_merged_requests, ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ .elevator_bio_merged_fn = bfq_bio_merged, ++#endif ++ .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, ++ .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, ++ .elevator_dispatch_fn = bfq_dispatch_requests, ++ .elevator_add_req_fn = bfq_insert_request, ++ .elevator_activate_req_fn = bfq_activate_request, ++ .elevator_deactivate_req_fn = bfq_deactivate_request, ++ .elevator_completed_req_fn = bfq_completed_request, ++ .elevator_former_req_fn = elv_rb_former_request, ++ .elevator_latter_req_fn = elv_rb_latter_request, ++ .elevator_init_icq_fn = bfq_init_icq, ++ .elevator_exit_icq_fn = bfq_exit_icq, ++ .elevator_set_req_fn = bfq_set_request, ++ .elevator_put_req_fn = bfq_put_request, ++ .elevator_may_queue_fn = bfq_may_queue, ++ .elevator_init_fn = bfq_init_queue, ++ .elevator_exit_fn = bfq_exit_queue, ++ }, ++ .icq_size = sizeof(struct bfq_io_cq), ++ .icq_align = __alignof__(struct bfq_io_cq), ++ .elevator_attrs = bfq_attrs, ++ .elevator_name = "bfq-sq", ++ .elevator_owner = THIS_MODULE, ++}; ++ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++static struct blkcg_policy blkcg_policy_bfq = { ++ .dfl_cftypes = bfq_blkg_files, ++ .legacy_cftypes = bfq_blkcg_legacy_files, ++ ++ .cpd_alloc_fn = bfq_cpd_alloc, ++ .cpd_init_fn = bfq_cpd_init, ++ 
.cpd_bind_fn = bfq_cpd_init, ++ .cpd_free_fn = bfq_cpd_free, ++ ++ .pd_alloc_fn = bfq_pd_alloc, ++ .pd_init_fn = bfq_pd_init, ++ .pd_offline_fn = bfq_pd_offline, ++ .pd_free_fn = bfq_pd_free, ++ .pd_reset_stats_fn = bfq_pd_reset_stats, ++}; ++#endif ++ ++static int __init bfq_init(void) ++{ ++ int ret; ++ char msg[60] = "BFQ I/O-scheduler: v8r12"; ++ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ ret = blkcg_policy_register(&blkcg_policy_bfq); ++ if (ret) ++ return ret; ++#endif ++ ++ ret = -ENOMEM; ++ if (bfq_slab_setup()) ++ goto err_pol_unreg; ++ ++ /* ++ * Times to load large popular applications for the typical ++ * systems installed on the reference devices (see the ++ * comments before the definitions of the next two ++ * arrays). Actually, we use slightly slower values, as the ++ * estimated peak rate tends to be smaller than the actual ++ * peak rate. The reason for this last fact is that estimates ++ * are computed over much shorter time intervals than the long ++ * intervals typically used for benchmarking. Why? First, to ++ * adapt more quickly to variations. Second, because an I/O ++ * scheduler cannot rely on a peak-rate-evaluation workload to ++ * be run for a long time. ++ */ ++ T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */ ++ T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */ ++ T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */ ++ T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */ ++ ++ /* ++ * Thresholds that determine the switch between speed classes ++ * (see the comments before the definition of the array ++ * device_speed_thresh). These thresholds are biased towards ++ * transitions to the fast class. This is safer than the ++ * opposite bias. In fact, a wrong transition to the slow ++ * class results in short weight-raising periods, because the ++ * speed of the device then tends to be higher that the ++ * reference peak rate. On the opposite end, a wrong ++ * transition to the fast class tends to increase ++ * weight-raising periods, because of the opposite reason. 
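++ * E.g., with the definitions below a device is promoted to the
++ * fast class only if its estimated peak rate exceeds the
++ * slow-class reference rate by at least one third:
++ * device_speed_thresh = (4 * R_slow) / 3.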
++ */ ++ device_speed_thresh[0] = (4 * R_slow[0]) / 3; ++ device_speed_thresh[1] = (4 * R_slow[1]) / 3; ++ ++ ret = elv_register(&iosched_bfq); ++ if (ret) ++ goto err_pol_unreg; ++ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ strcat(msg, " (with cgroups support)"); ++#endif ++ pr_info("%s", msg); ++ ++ return 0; ++ ++err_pol_unreg: ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ blkcg_policy_unregister(&blkcg_policy_bfq); ++#endif ++ return ret; ++} ++ ++static void __exit bfq_exit(void) ++{ ++ elv_unregister(&iosched_bfq); ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++ blkcg_policy_unregister(&blkcg_policy_bfq); ++#endif ++ bfq_slab_kill(); ++} ++ ++module_init(bfq_init); ++module_exit(bfq_exit); ++ ++MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); ++MODULE_LICENSE("GPL"); + +From e24d2e6461479dbd13d58be2dc44b23b5e24487c Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Mon, 19 Dec 2016 17:13:39 +0100 +Subject: [PATCH 07/51] Add config and build bits for bfq-mq-iosched + +Signed-off-by: Paolo Valente +--- + block/Kconfig.iosched | 10 +++++++++ + block/Makefile | 1 + + block/bfq-cgroup-included.c | 4 ++-- + block/bfq-mq-iosched.c | 25 ++++++++++++----------- + block/bfq-sched.c | 50 ++++++++++++++++++++++----------------------- + block/bfq-sq-iosched.c | 24 +++++++++++----------- + block/bfq.h | 36 +++++++++++++++++++++----------- + 8 files changed, 88 insertions(+), 64 deletions(-) + +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched +index 9e3f4c2f7390..2d94af3d8b0a 100644 +--- a/block/Kconfig.iosched ++++ b/block/Kconfig.iosched +@@ -96,6 +96,16 @@ config DEFAULT_IOSCHED + default "bfq-sq" if DEFAULT_BFQ_SQ + default "noop" if DEFAULT_NOOP + ++config MQ_IOSCHED_BFQ ++ tristate "BFQ-MQ I/O Scheduler" ++ default y ++ ---help--- ++ BFQ I/O scheduler for BLK-MQ. BFQ-MQ distributes bandwidth ++ among all processes according to their weights, regardless of ++ the device parameters and with any workload. It also ++ guarantees a low latency to interactive and soft real-time ++ applications. Details in Documentation/block/bfq-iosched.txt ++ + config MQ_IOSCHED_DEADLINE + tristate "MQ deadline I/O scheduler" + default y +diff --git a/block/Makefile b/block/Makefile +index 59026b425791..a571329c23f0 100644 +--- a/block/Makefile ++++ b/block/Makefile +@@ -25,6 +25,7 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o + bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o + obj-$(CONFIG_IOSCHED_BFQ) += bfq.o + obj-$(CONFIG_IOSCHED_BFQ_SQ) += bfq-sq-iosched.o ++obj-$(CONFIG_MQ_IOSCHED_BFQ) += bfq-mq-iosched.o + + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o + obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o +diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c +index af7c216a3540..9c483b658179 100644 +--- a/block/bfq-cgroup-included.c ++++ b/block/bfq-cgroup-included.c +@@ -15,7 +15,7 @@ + * file. 
+ */ + +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + + /* bfqg stats flags */ + enum bfqg_stats_flags { +@@ -1116,7 +1116,7 @@ static struct cftype bfq_blkg_files[] = { + {} /* terminate */ + }; + +-#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ ++#else /* BFQ_GROUP_IOSCHED_ENABLED */ + + static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, + struct bfq_queue *bfqq, unsigned int op) { } +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 30d019fc67e0..e88e00f1e0a7 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -82,6 +82,7 @@ + #include + #include + #include "blk.h" ++#undef CONFIG_BFQ_GROUP_IOSCHED /* cgroups support not yet functional */ + #include "bfq.h" + + /* Expiration time of sync (0) and async (1) requests, in ns. */ +@@ -387,7 +388,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) + return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && + (bfqd->queue_weights_tree.rb_node->rb_left || + bfqd->queue_weights_tree.rb_node->rb_right) +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + ) || + (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && + (bfqd->group_weights_tree.rb_node->rb_left || +@@ -1672,7 +1673,7 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, + } + } + +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + static void bfq_bio_merged(struct request_queue *q, struct request *req, + struct bio *bio) + { +@@ -3879,7 +3880,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) + */ + static void bfq_put_queue(struct bfq_queue *bfqq) + { +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + struct bfq_group *bfqg = bfqq_group(bfqq); + #endif + +@@ -3909,7 +3910,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); + + kmem_cache_free(bfq_pool, bfqq); +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + bfqg_put(bfqg); + #endif + } +@@ -4835,7 +4836,7 @@ static void bfq_exit_queue(struct elevator_queue *e) + + BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); + +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + blkcg_deactivate_policy(q, &blkcg_policy_bfq); + #else + bfq_put_async_queues(bfqd, bfqd->root_group); +@@ -4850,7 +4851,7 @@ static void bfq_init_root_group(struct bfq_group *root_group, + { + int i; + +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + root_group->entity.parent = NULL; + root_group->my_entity = NULL; + root_group->bfqd = bfqd; +@@ -5265,7 +5266,7 @@ static struct elevator_type iosched_bfq = { + .elevator_merge_fn = bfq_merge, + .elevator_merged_fn = bfq_merged_request, + .elevator_merge_req_fn = bfq_merged_requests, +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + .elevator_bio_merged_fn = bfq_bio_merged, + #endif + .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, +@@ -5292,7 +5293,7 @@ static struct elevator_type iosched_bfq = { + .elevator_owner = THIS_MODULE, + }; + +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + static struct blkcg_policy blkcg_policy_bfq = { + .dfl_cftypes = bfq_blkg_files, + .legacy_cftypes = bfq_blkcg_legacy_files, +@@ -5315,7 +5316,7 @@ static int __init bfq_init(void) + int ret; + char msg[60] = "BFQ I/O-scheduler: v8r12"; + +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + ret = blkcg_policy_register(&blkcg_policy_bfq); + if 
(ret) + return ret; +@@ -5362,7 +5363,7 @@ static int __init bfq_init(void) + if (ret) + goto err_pol_unreg; + +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + strcat(msg, " (with cgroups support)"); + #endif + pr_info("%s", msg); +@@ -5370,7 +5371,7 @@ static int __init bfq_init(void) + return 0; + + err_pol_unreg: +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + blkcg_policy_unregister(&blkcg_policy_bfq); + #endif + return ret; +@@ -5379,7 +5380,7 @@ static int __init bfq_init(void) + static void __exit bfq_exit(void) + { + elv_unregister(&iosched_bfq); +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + blkcg_policy_unregister(&blkcg_policy_bfq); + #endif + bfq_slab_kill(); +diff --git a/block/bfq-sched.c b/block/bfq-sched.c +index 5c0f9290a79c..b54a638186e3 100644 +--- a/block/bfq-sched.c ++++ b/block/bfq-sched.c +@@ -136,7 +136,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "update_next_in_service: chosen this queue"); +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + else { + struct bfq_group *bfqg = + container_of(next_in_service, +@@ -149,7 +149,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, + return parent_sched_may_change; + } + +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + /* both next loops stop at one of the child entities of the root group */ + #define for_each_entity(entity) \ + for (; entity ; entity = entity->parent) +@@ -243,7 +243,7 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) + return false; + } + +-#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ ++#else /* BFQ_GROUP_IOSCHED_ENABLED */ + #define for_each_entity(entity) \ + for (; entity ; entity = NULL) + +@@ -260,7 +260,7 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) + return true; + } + +-#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ ++#endif /* BFQ_GROUP_IOSCHED_ENABLED */ + + /* + * Shift for timestamp calculations. 
This actually limits the maximum +@@ -323,7 +323,7 @@ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_finish: start %llu, finish %llu, delta %llu", + start, finish, delta); +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + } else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); +@@ -473,7 +473,7 @@ static void bfq_update_active_node(struct rb_node *node) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "update_active_node: new min_start %llu", + ((entity->min_start>>10)*1000)>>12); +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + } else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); +@@ -540,7 +540,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, + { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node = &entity->rb_node; +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + struct bfq_sched_data *sd = NULL; + struct bfq_group *bfqg = NULL; + struct bfq_data *bfqd = NULL; +@@ -555,7 +555,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, + + bfq_update_active_tree(node); + +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + sd = entity->sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + BUG_ON(!bfqg); +@@ -563,7 +563,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, + #endif + if (bfqq) + list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + else { /* bfq_group */ + BUG_ON(!bfqd); + bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); +@@ -652,7 +652,7 @@ static void bfq_active_extract(struct bfq_service_tree *st, + { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node; +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + struct bfq_sched_data *sd = NULL; + struct bfq_group *bfqg = NULL; + struct bfq_data *bfqd = NULL; +@@ -664,7 +664,7 @@ static void bfq_active_extract(struct bfq_service_tree *st, + if (node) + bfq_update_active_tree(node); + +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + sd = entity->sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + BUG_ON(!bfqg); +@@ -672,7 +672,7 @@ static void bfq_active_extract(struct bfq_service_tree *st, + #endif + if (bfqq) + list_del(&bfqq->bfqq_list); +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + else { /* bfq_group */ + BUG_ON(!bfqd); + bfq_weights_tree_remove(bfqd, entity, +@@ -809,14 +809,14 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, + unsigned int prev_weight, new_weight; + struct bfq_data *bfqd = NULL; + struct rb_root *root; +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + struct bfq_sched_data *sd; + struct bfq_group *bfqg; + #endif + + if (bfqq) + bfqd = bfqq->bfqd; +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + else { + sd = entity->my_sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); +@@ -907,7 +907,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, + return new_st; + } + +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); + #endif + +@@ -936,7 +936,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) + st->vtime += 
bfq_delta(served, st->wsum); + bfq_forget_idle(st); + } +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); + #endif + st = bfq_entity_service_tree(&bfqq->entity); +@@ -1060,7 +1060,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, + bfq_log_bfqq(bfqq->bfqd, bfqq, + "__activate_entity: new queue finish %llu", + ((entity->finish>>10)*1000)>>12); +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + } else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); +@@ -1078,7 +1078,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, + bfq_log_bfqq(bfqq->bfqd, bfqq, + "__activate_entity: queue %seligible in st %p", + entity->start <= st->vtime ? "" : "non ", st); +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + } else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); +@@ -1153,7 +1153,7 @@ static void __bfq_activate_entity(struct bfq_entity *entity, + + BUG_ON(entity->on_st && bfqq); + +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + if (entity->on_st && !bfqq) { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, +@@ -1485,7 +1485,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "invoking udpdate_next for this queue"); +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + else { + struct bfq_group *bfqg = + container_of(entity, +@@ -1525,7 +1525,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_vtime_jump: new value %llu", + root_entity->min_start); +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + else { + struct bfq_group *bfqg = + container_of(root_entity, struct bfq_group, +@@ -1661,7 +1661,7 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service + "__lookup_next: start %llu vtime %llu st %p", + ((entity->start>>10)*1000)>>12, + ((new_vtime>>10)*1000)>>12, st); +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); +@@ -1735,7 +1735,7 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d", + st + class_idx, class_idx); +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); +@@ -1777,7 +1777,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) + */ + sd = &bfqd->root_group->sched_data; + for (; sd ; sd = entity->my_sched_data) { +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + if (entity) { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); +@@ -1867,7 +1867,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) + bfq_log_bfqq(bfqd, bfqq, + "get_next_queue: this queue, finish %llu", + (((entity->finish>>10)*1000)>>10)>>2); +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); +diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c +index 30d019fc67e0..25da0d1c0622 100644 +--- a/block/bfq-sq-iosched.c ++++ b/block/bfq-sq-iosched.c +@@ -387,7 +387,7 @@ static bool 
bfq_differentiated_weights(struct bfq_data *bfqd) + return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && + (bfqd->queue_weights_tree.rb_node->rb_left || + bfqd->queue_weights_tree.rb_node->rb_right) +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + ) || + (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && + (bfqd->group_weights_tree.rb_node->rb_left || +@@ -1672,7 +1672,7 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, + } + } + +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + static void bfq_bio_merged(struct request_queue *q, struct request *req, + struct bio *bio) + { +@@ -3879,7 +3879,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) + */ + static void bfq_put_queue(struct bfq_queue *bfqq) + { +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + struct bfq_group *bfqg = bfqq_group(bfqq); + #endif + +@@ -3909,7 +3909,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); + + kmem_cache_free(bfq_pool, bfqq); +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + bfqg_put(bfqg); + #endif + } +@@ -4835,7 +4835,7 @@ static void bfq_exit_queue(struct elevator_queue *e) + + BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); + +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + blkcg_deactivate_policy(q, &blkcg_policy_bfq); + #else + bfq_put_async_queues(bfqd, bfqd->root_group); +@@ -4850,7 +4850,7 @@ static void bfq_init_root_group(struct bfq_group *root_group, + { + int i; + +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + root_group->entity.parent = NULL; + root_group->my_entity = NULL; + root_group->bfqd = bfqd; +@@ -5265,7 +5265,7 @@ static struct elevator_type iosched_bfq = { + .elevator_merge_fn = bfq_merge, + .elevator_merged_fn = bfq_merged_request, + .elevator_merge_req_fn = bfq_merged_requests, +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + .elevator_bio_merged_fn = bfq_bio_merged, + #endif + .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, +@@ -5292,7 +5292,7 @@ static struct elevator_type iosched_bfq = { + .elevator_owner = THIS_MODULE, + }; + +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + static struct blkcg_policy blkcg_policy_bfq = { + .dfl_cftypes = bfq_blkg_files, + .legacy_cftypes = bfq_blkcg_legacy_files, +@@ -5315,7 +5315,7 @@ static int __init bfq_init(void) + int ret; + char msg[60] = "BFQ I/O-scheduler: v8r12"; + +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + ret = blkcg_policy_register(&blkcg_policy_bfq); + if (ret) + return ret; +@@ -5362,7 +5362,7 @@ static int __init bfq_init(void) + if (ret) + goto err_pol_unreg; + +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + strcat(msg, " (with cgroups support)"); + #endif + pr_info("%s", msg); +@@ -5370,7 +5370,7 @@ static int __init bfq_init(void) + return 0; + + err_pol_unreg: +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + blkcg_policy_unregister(&blkcg_policy_bfq); + #endif + return ret; +@@ -5379,7 +5379,7 @@ static int __init bfq_init(void) + static void __exit bfq_exit(void) + { + elv_unregister(&iosched_bfq); +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + blkcg_policy_unregister(&blkcg_policy_bfq); + #endif + bfq_slab_kill(); +diff --git a/block/bfq.h b/block/bfq.h +index 34fc4697fd89..53954d1b87f8 100644 +--- 
a/block/bfq.h ++++ b/block/bfq.h +@@ -19,6 +19,18 @@ + #include + #include + ++/* ++ * Define an alternative macro to compile cgroups support. This is one ++ * of the steps needed to let bfq-mq share the files bfq-sched.c and ++ * bfq-cgroup.c with bfq-sq. For bfq-mq, the macro ++ * BFQ_GROUP_IOSCHED_ENABLED will be defined as a function of whether ++ * the configuration option CONFIG_BFQ_MQ_GROUP_IOSCHED, and not ++ * CONFIG_BFQ_GROUP_IOSCHED, is defined. ++ */ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#define BFQ_GROUP_IOSCHED_ENABLED ++#endif ++ + #define BFQ_IOPRIO_CLASSES 3 + #define BFQ_CL_IDLE_TIMEOUT (HZ/5) + +@@ -344,7 +356,7 @@ struct bfq_io_cq { + struct bfq_ttime ttime; + /* per (request_queue, blkcg) ioprio */ + int ioprio; +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + uint64_t blkcg_serial_nr; /* the current blkcg serial */ + #endif + +@@ -671,7 +683,7 @@ static const char *checked_dev_name(const struct device *dev) + return nodev; + } + +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); + static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + +@@ -696,7 +708,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + __pbuf, ##args); \ + } while (0) + +-#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ ++#else /* BFQ_GROUP_IOSCHED_ENABLED */ + + #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ + pr_crit("%s bfq%d%c " fmt "\n", \ +@@ -705,7 +717,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + ##args) + #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) + +-#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ ++#endif /* BFQ_GROUP_IOSCHED_ENABLED */ + + #define bfq_log(bfqd, fmt, args...) \ + pr_crit("%s bfq " fmt "\n", \ +@@ -713,7 +725,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + ##args) + + #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); + static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + +@@ -735,7 +747,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ + } while (0) + +-#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ ++#else /* BFQ_GROUP_IOSCHED_ENABLED */ + + #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ +@@ -743,7 +755,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + ##args) + #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) + +-#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ ++#endif /* BFQ_GROUP_IOSCHED_ENABLED */ + + #define bfq_log(bfqd, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) +@@ -763,7 +775,7 @@ enum bfqq_expiration { + + + struct bfqg_stats { +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + /* number of ios merged */ + struct blkg_rwstat merged; + /* total time spent on device in ns, may not be accurate w/ queueing */ +@@ -794,7 +806,7 @@ struct bfqg_stats { + #endif + }; + +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + /* + * struct bfq_group_data - per-blkcg storage for the blkio subsystem. 
+ * +@@ -895,7 +907,7 @@ bfq_entity_service_tree(struct bfq_entity *entity) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "entity_service_tree %p %d", + sched_data->service_tree + idx, idx); +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); +@@ -924,7 +936,7 @@ static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) + return bic->icq.q->elevator->elevator_data; + } + +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + + static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) + { +@@ -953,7 +965,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bfq_io_cq *bic); + static void bfq_end_wr_async_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg); +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); + #endif + static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); + +From add91dbd756cf8ca3aa3add9a19eef742d5fca6b Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Fri, 20 Jan 2017 09:18:25 +0100 +Subject: [PATCH 08/51] Increase max policies for io controller + +To let bfq-mq policy be plugged too (however cgroups +suppport is not yet functional in bfq-mq). + +Signed-off-by: Paolo Valente +--- + include/linux/blkdev.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index bf000c58644b..10f892ca585d 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -54,7 +54,7 @@ struct blk_stat_callback; + * Maximum number of blkcg policies allowed to be registered concurrently. + * Defined here to simplify include dependency. + */ +-#define BLKCG_MAX_POLS 4 ++#define BLKCG_MAX_POLS 5 + + typedef void (rq_end_io_fn)(struct request *, blk_status_t); + + +From 2c39a1d9ab4516d44e01e96f19f578b927e7f2e9 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Mon, 19 Dec 2016 18:11:33 +0100 +Subject: [PATCH 09/51] Copy header file bfq.h as bfq-mq.h + +This commit introduces the header file bfq-mq.h, that will play +for bfq-mq-iosched.c the same role that bfq.h plays for bfq-iosched.c. + +For the moment, the file bfq-mq.h is just a copy of bfq.h. + +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 2 +- + block/bfq-mq.h | 973 +++++++++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 974 insertions(+), 1 deletion(-) + create mode 100644 block/bfq-mq.h + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index e88e00f1e0a7..d1125aee658c 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -83,7 +83,7 @@ + #include + #include "blk.h" + #undef CONFIG_BFQ_GROUP_IOSCHED /* cgroups support not yet functional */ +-#include "bfq.h" ++#include "bfq-mq.h" + + /* Expiration time of sync (0) and async (1) requests, in ns. */ + static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; +diff --git a/block/bfq-mq.h b/block/bfq-mq.h +new file mode 100644 +index 000000000000..53954d1b87f8 +--- /dev/null ++++ b/block/bfq-mq.h +@@ -0,0 +1,973 @@ ++/* ++ * BFQ v8r12 for 4.11.0: data structures and common functions prototypes. 
++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2015 Paolo Valente ++ * ++ * Copyright (C) 2017 Paolo Valente ++ */ ++ ++#ifndef _BFQ_H ++#define _BFQ_H ++ ++#include ++#include ++#include ++ ++/* ++ * Define an alternative macro to compile cgroups support. This is one ++ * of the steps needed to let bfq-mq share the files bfq-sched.c and ++ * bfq-cgroup.c with bfq-sq. For bfq-mq, the macro ++ * BFQ_GROUP_IOSCHED_ENABLED will be defined as a function of whether ++ * the configuration option CONFIG_BFQ_MQ_GROUP_IOSCHED, and not ++ * CONFIG_BFQ_GROUP_IOSCHED, is defined. ++ */ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#define BFQ_GROUP_IOSCHED_ENABLED ++#endif ++ ++#define BFQ_IOPRIO_CLASSES 3 ++#define BFQ_CL_IDLE_TIMEOUT (HZ/5) ++ ++#define BFQ_MIN_WEIGHT 1 ++#define BFQ_MAX_WEIGHT 1000 ++#define BFQ_WEIGHT_CONVERSION_COEFF 10 ++ ++#define BFQ_DEFAULT_QUEUE_IOPRIO 4 ++ ++#define BFQ_WEIGHT_LEGACY_DFL 100 ++#define BFQ_DEFAULT_GRP_IOPRIO 0 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE ++ ++/* ++ * Soft real-time applications are extremely more latency sensitive ++ * than interactive ones. Over-raise the weight of the former to ++ * privilege them against the latter. ++ */ ++#define BFQ_SOFTRT_WEIGHT_FACTOR 100 ++ ++struct bfq_entity; ++ ++/** ++ * struct bfq_service_tree - per ioprio_class service tree. ++ * ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each ++ * ioprio_class has its own independent scheduler, and so its own ++ * bfq_service_tree. All the fields are protected by the queue lock ++ * of the containing bfqd. ++ */ ++struct bfq_service_tree { ++ /* tree for active entities (i.e., those backlogged) */ ++ struct rb_root active; ++ /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ ++ struct rb_root idle; ++ ++ struct bfq_entity *first_idle; /* idle entity with minimum F_i */ ++ struct bfq_entity *last_idle; /* idle entity with maximum F_i */ ++ ++ u64 vtime; /* scheduler virtual time */ ++ /* scheduler weight sum; active and idle entities contribute to it */ ++ unsigned long wsum; ++}; ++ ++/** ++ * struct bfq_sched_data - multi-class scheduler. ++ * ++ * bfq_sched_data is the basic scheduler queue. It supports three ++ * ioprio_classes, and can be used either as a toplevel queue or as an ++ * intermediate queue in a hierarchical setup. ++ * ++ * The supported ioprio_classes are the same as in CFQ, in descending ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. ++ * Requests from higher priority queues are served before all the ++ * requests from lower priority queues; among requests of the same ++ * queue requests are served according to B-WF2Q+. ++ * ++ * The schedule is implemented by the service trees, plus the field ++ * @next_in_service, which points to the entity on the active trees ++ * that will be served next, if 1) no changes in the schedule occurs ++ * before the current in-service entity is expired, 2) the in-service ++ * queue becomes idle when it expires, and 3) if the entity pointed by ++ * in_service_entity is not a queue, then the in-service child entity ++ * of the entity pointed by in_service_entity becomes idle on ++ * expiration. 
This peculiar definition allows for the following ++ * optimization, not yet exploited: while a given entity is still in ++ * service, we already know which is the best candidate for next ++ * service among the other active entitities in the same parent ++ * entity. We can then quickly compare the timestamps of the ++ * in-service entity with those of such best candidate. ++ * ++ * All the fields are protected by the queue lock of the containing ++ * bfqd. ++ */ ++struct bfq_sched_data { ++ struct bfq_entity *in_service_entity; /* entity in service */ ++ /* head-of-the-line entity in the scheduler (see comments above) */ ++ struct bfq_entity *next_in_service; ++ /* array of service trees, one per ioprio_class */ ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; ++ /* last time CLASS_IDLE was served */ ++ unsigned long bfq_class_idle_last_service; ++ ++}; ++ ++/** ++ * struct bfq_weight_counter - counter of the number of all active entities ++ * with a given weight. ++ */ ++struct bfq_weight_counter { ++ unsigned int weight; /* weight of the entities this counter refers to */ ++ unsigned int num_active; /* nr of active entities with this weight */ ++ /* ++ * Weights tree member (see bfq_data's @queue_weights_tree and ++ * @group_weights_tree) ++ */ ++ struct rb_node weights_node; ++}; ++ ++/** ++ * struct bfq_entity - schedulable entity. ++ * ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each ++ * entity belongs to the sched_data of the parent group in the cgroup ++ * hierarchy. Non-leaf entities have also their own sched_data, stored ++ * in @my_sched_data. ++ * ++ * Each entity stores independently its priority values; this would ++ * allow different weights on different devices, but this ++ * functionality is not exported to userspace by now. Priorities and ++ * weights are updated lazily, first storing the new values into the ++ * new_* fields, then setting the @prio_changed flag. As soon as ++ * there is a transition in the entity state that allows the priority ++ * update to take place the effective and the requested priority ++ * values are synchronized. ++ * ++ * Unless cgroups are used, the weight value is calculated from the ++ * ioprio to export the same interface as CFQ. When dealing with ++ * ``well-behaved'' queues (i.e., queues that do not spend too much ++ * time to consume their budget and have true sequential behavior, and ++ * when there are no external factors breaking anticipation) the ++ * relative weights at each level of the cgroups hierarchy should be ++ * guaranteed. All the fields are protected by the queue lock of the ++ * containing bfqd. ++ */ ++struct bfq_entity { ++ struct rb_node rb_node; /* service_tree member */ ++ /* pointer to the weight counter associated with this entity */ ++ struct bfq_weight_counter *weight_counter; ++ ++ /* ++ * Flag, true if the entity is on a tree (either the active or ++ * the idle one of its service_tree) or is in service. 
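++ * (Checked, e.g., by __bfq_activate_entity, to catch a queue
++ * being activated while still on a tree.)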
++ */ ++ bool on_st; ++ ++ u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ ++ u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ ++ ++ /* tree the entity is enqueued into; %NULL if not on a tree */ ++ struct rb_root *tree; ++ ++ /* ++ * minimum start time of the (active) subtree rooted at this ++ * entity; used for O(log N) lookups into active trees ++ */ ++ u64 min_start; ++ ++ /* amount of service received during the last service slot */ ++ int service; ++ ++ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ ++ int budget; ++ ++ unsigned int weight; /* weight of the queue */ ++ unsigned int new_weight; /* next weight if a change is in progress */ ++ ++ /* original weight, used to implement weight boosting */ ++ unsigned int orig_weight; ++ ++ /* parent entity, for hierarchical scheduling */ ++ struct bfq_entity *parent; ++ ++ /* ++ * For non-leaf nodes in the hierarchy, the associated ++ * scheduler queue, %NULL on leaf nodes. ++ */ ++ struct bfq_sched_data *my_sched_data; ++ /* the scheduler queue this entity belongs to */ ++ struct bfq_sched_data *sched_data; ++ ++ /* flag, set to request a weight, ioprio or ioprio_class change */ ++ int prio_changed; ++}; ++ ++struct bfq_group; ++ ++/** ++ * struct bfq_queue - leaf schedulable entity. ++ * ++ * A bfq_queue is a leaf request queue; it can be associated with an ++ * io_context or more, if it is async or shared between cooperating ++ * processes. @cgroup holds a reference to the cgroup, to be sure that it ++ * does not disappear while a bfqq still references it (mostly to avoid ++ * races between request issuing and task migration followed by cgroup ++ * destruction). ++ * All the fields are protected by the queue lock of the containing bfqd. ++ */ ++struct bfq_queue { ++ /* reference counter */ ++ int ref; ++ /* parent bfq_data */ ++ struct bfq_data *bfqd; ++ ++ /* current ioprio and ioprio class */ ++ unsigned short ioprio, ioprio_class; ++ /* next ioprio and ioprio class if a change is in progress */ ++ unsigned short new_ioprio, new_ioprio_class; ++ ++ /* ++ * Shared bfq_queue if queue is cooperating with one or more ++ * other queues. 
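++ * (Set by queue merging; the I/O of this queue is then
++ * redirected to new_bfqq.)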
++ */ ++ struct bfq_queue *new_bfqq; ++ /* request-position tree member (see bfq_group's @rq_pos_tree) */ ++ struct rb_node pos_node; ++ /* request-position tree root (see bfq_group's @rq_pos_tree) */ ++ struct rb_root *pos_root; ++ ++ /* sorted list of pending requests */ ++ struct rb_root sort_list; ++ /* if fifo isn't expired, next request to serve */ ++ struct request *next_rq; ++ /* number of sync and async requests queued */ ++ int queued[2]; ++ /* number of sync and async requests currently allocated */ ++ int allocated[2]; ++ /* number of pending metadata requests */ ++ int meta_pending; ++ /* fifo list of requests in sort_list */ ++ struct list_head fifo; ++ ++ /* entity representing this queue in the scheduler */ ++ struct bfq_entity entity; ++ ++ /* maximum budget allowed from the feedback mechanism */ ++ int max_budget; ++ /* budget expiration (in jiffies) */ ++ unsigned long budget_timeout; ++ ++ /* number of requests on the dispatch list or inside driver */ ++ int dispatched; ++ ++ unsigned int flags; /* status flags */ ++ ++ /* node for active/idle bfqq list inside parent bfqd */ ++ struct list_head bfqq_list; ++ ++ /* bit vector: a 1 for each seeky request in history */ ++ u32 seek_history; ++ ++ /* node for the device's burst list */ ++ struct hlist_node burst_list_node; ++ ++ /* position of the last request enqueued */ ++ sector_t last_request_pos; ++ ++ /* Number of consecutive pairs of request completion and ++ * arrival, such that the queue becomes idle after the ++ * completion, but the next request arrives within an idle ++ * time slice; used only if the queue's IO_bound flag has been ++ * cleared. ++ */ ++ unsigned int requests_within_timer; ++ ++ /* pid of the process owning the queue, used for logging purposes */ ++ pid_t pid; ++ ++ /* ++ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL ++ * if the queue is shared. ++ */ ++ struct bfq_io_cq *bic; ++ ++ /* current maximum weight-raising time for this queue */ ++ unsigned long wr_cur_max_time; ++ /* ++ * Minimum time instant such that, only if a new request is ++ * enqueued after this time instant in an idle @bfq_queue with ++ * no outstanding requests, then the task associated with the ++ * queue is deemed as soft real-time (see the comments on ++ * the function bfq_bfqq_softrt_next_start()) ++ */ ++ unsigned long soft_rt_next_start; ++ /* ++ * Start time of the current weight-raising period if ++ * the @bfq_queue is being weight-raised, otherwise ++ * finish time of the last weight-raising period. ++ */ ++ unsigned long last_wr_start_finish; ++ /* factor by which the weight of this queue is multiplied */ ++ unsigned int wr_coeff; ++ /* ++ * Time of the last transition of the @bfq_queue from idle to ++ * backlogged. ++ */ ++ unsigned long last_idle_bklogged; ++ /* ++ * Cumulative service received from the @bfq_queue since the ++ * last transition from idle to backlogged. ++ */ ++ unsigned long service_from_backlogged; ++ /* ++ * Value of wr start time when switching to soft rt ++ */ ++ unsigned long wr_start_at_switch_to_srt; ++ ++ unsigned long split_time; /* time of last split */ ++}; ++ ++/** ++ * struct bfq_ttime - per process thinktime stats. ++ */ ++struct bfq_ttime { ++ u64 last_end_request; /* completion time of last request */ ++ ++ u64 ttime_total; /* total process thinktime */ ++ unsigned long ttime_samples; /* number of thinktime samples */ ++ u64 ttime_mean; /* average process thinktime */ ++ ++}; ++ ++/** ++ * struct bfq_io_cq - per (request_queue, io_context) structure.
++ */ ++struct bfq_io_cq { ++ /* associated io_cq structure */ ++ struct io_cq icq; /* must be the first member */ ++ /* array of two process queues, the sync and the async */ ++ struct bfq_queue *bfqq[2]; ++ /* associated @bfq_ttime struct */ ++ struct bfq_ttime ttime; ++ /* per (request_queue, blkcg) ioprio */ ++ int ioprio; ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ uint64_t blkcg_serial_nr; /* the current blkcg serial */ ++#endif ++ ++ /* ++ * Snapshot of the has_short_ttime flag before merging; taken ++ * to remember its value while the queue is merged, so as to ++ * be able to restore it in case of split. ++ */ ++ bool saved_has_short_ttime; ++ /* ++ * Same purpose as the previous two fields for the I/O bound ++ * classification of a queue. ++ */ ++ bool saved_IO_bound; ++ ++ /* ++ * Same purpose as the previous fields, for the flag recording ++ * whether the queue belongs to a large burst ++ */ ++ bool saved_in_large_burst; ++ /* ++ * True if the queue belonged to a burst list before its merge ++ * with another cooperating queue. ++ */ ++ bool was_in_burst_list; ++ ++ /* ++ * Similar to the previous fields: saved weight-raising (wr) information. ++ */ ++ unsigned long saved_wr_coeff; ++ unsigned long saved_last_wr_start_finish; ++ unsigned long saved_wr_start_at_switch_to_srt; ++ unsigned int saved_wr_cur_max_time; ++}; ++ ++enum bfq_device_speed { ++ BFQ_BFQD_FAST, ++ BFQ_BFQD_SLOW, ++}; ++ ++/** ++ * struct bfq_data - per-device data structure. ++ * ++ * All the fields are protected by the @queue lock. ++ */ ++struct bfq_data { ++ /* request queue for the device */ ++ struct request_queue *queue; ++ ++ /* root bfq_group for the device */ ++ struct bfq_group *root_group; ++ ++ /* ++ * rbtree of weight counters of @bfq_queues, sorted by ++ * weight. Used to keep track of whether all @bfq_queues have ++ * the same weight. The tree contains one counter for each ++ * distinct weight associated to some active and not ++ * weight-raised @bfq_queue (see the comments to the functions ++ * bfq_weights_tree_[add|remove] for further details). ++ */ ++ struct rb_root queue_weights_tree; ++ /* ++ * rbtree of non-queue @bfq_entity weight counters, sorted by ++ * weight. Used to keep track of whether all @bfq_groups have ++ * the same weight. The tree contains one counter for each ++ * distinct weight associated to some active @bfq_group (see ++ * the comments to the functions bfq_weights_tree_[add|remove] ++ * for further details). ++ */ ++ struct rb_root group_weights_tree; ++ ++ /* ++ * Number of bfq_queues containing requests (including the ++ * queue in service, even if it is idling). ++ */ ++ int busy_queues; ++ /* number of weight-raised busy @bfq_queues */ ++ int wr_busy_queues; ++ /* number of queued requests */ ++ int queued; ++ /* number of requests dispatched and waiting for completion */ ++ int rq_in_driver; ++ ++ /* ++ * Maximum number of requests in driver in the last ++ * @hw_tag_samples completed requests. ++ */ ++ int max_rq_in_driver; ++ /* number of samples used to calculate hw_tag */ ++ int hw_tag_samples; ++ /* flag set to one if the driver is showing queueing behavior */ ++ int hw_tag; ++ ++ /* number of budgets assigned */ ++ int budgets_assigned; ++ ++ /* ++ * Timer set when idling (waiting) for the next request from ++ * the queue in service.
++ */ ++ struct hrtimer idle_slice_timer; ++ /* delayed work to restart dispatching on the request queue */ ++ struct work_struct unplug_work; ++ ++ /* bfq_queue in service */ ++ struct bfq_queue *in_service_queue; ++ /* bfq_io_cq (bic) associated with the @in_service_queue */ ++ struct bfq_io_cq *in_service_bic; ++ ++ /* on-disk position of the last served request */ ++ sector_t last_position; ++ ++ /* time of last request completion (ns) */ ++ u64 last_completion; ++ ++ /* time of first rq dispatch in current observation interval (ns) */ ++ u64 first_dispatch; ++ /* time of last rq dispatch in current observation interval (ns) */ ++ u64 last_dispatch; ++ ++ /* beginning of the last budget */ ++ ktime_t last_budget_start; ++ /* beginning of the last idle slice */ ++ ktime_t last_idling_start; ++ ++ /* number of samples in current observation interval */ ++ int peak_rate_samples; ++ /* num of samples of seq dispatches in current observation interval */ ++ u32 sequential_samples; ++ /* total num of sectors transferred in current observation interval */ ++ u64 tot_sectors_dispatched; ++ /* max rq size seen during current observation interval (sectors) */ ++ u32 last_rq_max_size; ++ /* time elapsed from first dispatch in current observ. interval (us) */ ++ u64 delta_from_first; ++ /* current estimate of device peak rate */ ++ u32 peak_rate; ++ ++ /* maximum budget allotted to a bfq_queue before rescheduling */ ++ int bfq_max_budget; ++ ++ /* list of all the bfq_queues active on the device */ ++ struct list_head active_list; ++ /* list of all the bfq_queues idle on the device */ ++ struct list_head idle_list; ++ ++ /* ++ * Timeout for async/sync requests; when it fires, requests ++ * are served in fifo order. ++ */ ++ u64 bfq_fifo_expire[2]; ++ /* weight of backward seeks wrt forward ones */ ++ unsigned int bfq_back_penalty; ++ /* maximum allowed backward seek */ ++ unsigned int bfq_back_max; ++ /* maximum idling time */ ++ u32 bfq_slice_idle; ++ ++ /* user-configured max budget value (0 for auto-tuning) */ ++ int bfq_user_max_budget; ++ /* ++ * Timeout for bfq_queues to consume their budget; used to ++ * prevent seeky queues from imposing long latencies to ++ * sequential or quasi-sequential ones (this also implies that ++ * seeky queues cannot receive guarantees in the service ++ * domain; after a timeout they are charged for the time they ++ * have been in service, to preserve fairness among them, but ++ * without service-domain guarantees). ++ */ ++ unsigned int bfq_timeout; ++ ++ /* ++ * Number of consecutive requests that must be issued within ++ * the idle time slice to set again idling to a queue which ++ * was marked as non-I/O-bound (see the definition of the ++ * IO_bound flag for further details). ++ */ ++ unsigned int bfq_requests_within_timer; ++ ++ /* ++ * Force device idling whenever needed to provide accurate ++ * service guarantees, without caring about throughput ++ * issues. CAVEAT: this may even increase latencies, in case ++ * of useless idling for processes that did stop doing I/O. ++ */ ++ bool strict_guarantees; ++ ++ /* ++ * Last time at which a queue entered the current burst of ++ * queues being activated shortly after each other; for more ++ * details about this and the following parameters related to ++ * a burst of activations, see the comments on the function ++ * bfq_handle_burst. ++ */ ++ unsigned long last_ins_in_burst; ++ /* ++ * Reference time interval used to decide whether a queue has ++ * been activated shortly after @last_ins_in_burst. 
++ */ ++ unsigned long bfq_burst_interval; ++ /* number of queues in the current burst of queue activations */ ++ int burst_size; ++ ++ /* common parent entity for the queues in the burst */ ++ struct bfq_entity *burst_parent_entity; ++ /* Maximum burst size above which the current queue-activation ++ * burst is deemed as 'large'. ++ */ ++ unsigned long bfq_large_burst_thresh; ++ /* true if a large queue-activation burst is in progress */ ++ bool large_burst; ++ /* ++ * Head of the burst list (as for the above fields, more ++ * details in the comments on the function bfq_handle_burst). ++ */ ++ struct hlist_head burst_list; ++ ++ /* if set to true, low-latency heuristics are enabled */ ++ bool low_latency; ++ /* ++ * Maximum factor by which the weight of a weight-raised queue ++ * is multiplied. ++ */ ++ unsigned int bfq_wr_coeff; ++ /* maximum duration of a weight-raising period (jiffies) */ ++ unsigned int bfq_wr_max_time; ++ ++ /* Maximum weight-raising duration for soft real-time processes */ ++ unsigned int bfq_wr_rt_max_time; ++ /* ++ * Minimum idle period after which weight-raising may be ++ * reactivated for a queue (in jiffies). ++ */ ++ unsigned int bfq_wr_min_idle_time; ++ /* ++ * Minimum period between request arrivals after which ++ * weight-raising may be reactivated for an already busy async ++ * queue (in jiffies). ++ */ ++ unsigned long bfq_wr_min_inter_arr_async; ++ ++ /* Max service-rate for a soft real-time queue, in sectors/sec */ ++ unsigned int bfq_wr_max_softrt_rate; ++ /* ++ * Cached value of the product R*T, used for computing the ++ * maximum duration of weight raising automatically. ++ */ ++ u64 RT_prod; ++ /* device-speed class for the low-latency heuristic */ ++ enum bfq_device_speed device_speed; ++ ++ /* fallback dummy bfqq for extreme OOM conditions */ ++ struct bfq_queue oom_bfqq; ++}; ++ ++enum bfqq_state_flags { ++ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ ++ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ ++ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* ++ * waiting for a request ++ * without idling the device ++ */ ++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ ++ BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ ++ BFQ_BFQQ_FLAG_IO_bound, /* ++ * bfqq has timed-out at least once ++ * having consumed at most 2/10 of ++ * its budget ++ */ ++ BFQ_BFQQ_FLAG_in_large_burst, /* ++ * bfqq activated in a large burst, ++ * see comments to bfq_handle_burst. 
++ */ ++ BFQ_BFQQ_FLAG_softrt_update, /* ++ * may need softrt-next-start ++ * update ++ */ ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ ++ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ ++}; ++ ++#define BFQ_BFQQ_FNS(name) \ ++static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ ++{ \ ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ ++} \ ++static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ ++{ \ ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ ++} \ ++static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ ++{ \ ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ ++} ++ ++BFQ_BFQQ_FNS(just_created); ++BFQ_BFQQ_FNS(busy); ++BFQ_BFQQ_FNS(wait_request); ++BFQ_BFQQ_FNS(non_blocking_wait_rq); ++BFQ_BFQQ_FNS(must_alloc); ++BFQ_BFQQ_FNS(fifo_expire); ++BFQ_BFQQ_FNS(has_short_ttime); ++BFQ_BFQQ_FNS(sync); ++BFQ_BFQQ_FNS(IO_bound); ++BFQ_BFQQ_FNS(in_large_burst); ++BFQ_BFQQ_FNS(coop); ++BFQ_BFQQ_FNS(split_coop); ++BFQ_BFQQ_FNS(softrt_update); ++#undef BFQ_BFQQ_FNS ++ ++/* Logging facilities. */ ++#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE ++ ++static const char *checked_dev_name(const struct device *dev) ++{ ++ static const char nodev[] = "nodev"; ++ ++ if (dev) ++ return dev_name(dev); ++ ++ return nodev; ++} ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); ++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ ++ char __pbuf[128]; \ ++ \ ++ assert_spin_locked((bfqd)->queue->queue_lock); \ ++ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ ++ pr_crit("%s bfq%d%c %s " fmt "\n", \ ++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ ++ (bfqq)->pid, \ ++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ __pbuf, ##args); \ ++} while (0) ++ ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ ++ char __pbuf[128]; \ ++ \ ++ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ ++ pr_crit("%s %s " fmt "\n", \ ++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ ++ __pbuf, ##args); \ ++} while (0) ++ ++#else /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ ++ pr_crit("%s bfq%d%c " fmt "\n", \ ++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ ++ (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ ##args) ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) ++ ++#endif /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++#define bfq_log(bfqd, fmt, args...) \ ++ pr_crit("%s bfq " fmt "\n", \ ++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ ++ ##args) ++ ++#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); ++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ ++ char __pbuf[128]; \ ++ \ ++ assert_spin_locked((bfqd)->queue->queue_lock); \ ++ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ ++ (bfqq)->pid, \ ++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ __pbuf, ##args); \ ++} while (0) ++ ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ ++ char __pbuf[128]; \ ++ \ ++ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ ++ blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ ++} while (0) ++ ++#else /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) 
\ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ ++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ ##args) ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) ++ ++#endif /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++#define bfq_log(bfqd, fmt, args...) \ ++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) ++#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ ++ ++/* Expiration reasons. */ ++enum bfqq_expiration { ++ BFQ_BFQQ_TOO_IDLE = 0, /* ++ * queue has been idling for ++ * too long ++ */ ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ ++ BFQ_BFQQ_PREEMPTED /* preemption in progress */ ++}; ++ ++ ++struct bfqg_stats { ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ /* number of ios merged */ ++ struct blkg_rwstat merged; ++ /* total time spent on device in ns, may not be accurate w/ queueing */ ++ struct blkg_rwstat service_time; ++ /* total time spent waiting in scheduler queue in ns */ ++ struct blkg_rwstat wait_time; ++ /* number of IOs queued up */ ++ struct blkg_rwstat queued; ++ /* total disk time and nr sectors dispatched by this group */ ++ struct blkg_stat time; ++ /* sum of number of ios queued across all samples */ ++ struct blkg_stat avg_queue_size_sum; ++ /* count of samples taken for average */ ++ struct blkg_stat avg_queue_size_samples; ++ /* how many times this group has been removed from service tree */ ++ struct blkg_stat dequeue; ++ /* total time spent waiting for it to be assigned a timeslice. */ ++ struct blkg_stat group_wait_time; ++ /* time spent idling for this blkcg_gq */ ++ struct blkg_stat idle_time; ++ /* total time with empty current active q with other requests queued */ ++ struct blkg_stat empty_time; ++ /* fields after this shouldn't be cleared on stat reset */ ++ uint64_t start_group_wait_time; ++ uint64_t start_idle_time; ++ uint64_t start_empty_time; ++ uint16_t flags; ++#endif ++}; ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++/* ++ * struct bfq_group_data - per-blkcg storage for the blkio subsystem. ++ * ++ * @ps: @blkcg_policy_storage that this structure inherits ++ * @weight: weight of the bfq_group ++ */ ++struct bfq_group_data { ++ /* must be the first member */ ++ struct blkcg_policy_data pd; ++ ++ unsigned int weight; ++}; ++ ++/** ++ * struct bfq_group - per (device, cgroup) data structure. ++ * @entity: schedulable entity to insert into the parent group sched_data. ++ * @sched_data: own sched_data, to contain child entities (they may be ++ * both bfq_queues and bfq_groups). ++ * @bfqd: the bfq_data for the device this group acts upon. ++ * @async_bfqq: array of async queues for all the tasks belonging to ++ * the group, one queue per ioprio value per ioprio_class, ++ * except for the idle class that has only one queue. ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used ++ * to avoid too many special cases during group creation/ ++ * migration. ++ * @active_entities: number of active entities belonging to the group; ++ * unused for the root group. Used to know whether there ++ * are groups with more than one active @bfq_entity ++ * (see the comments to the function ++ * bfq_bfqq_may_idle()). ++ * @rq_pos_tree: rbtree sorted by next_request position, used when ++ * determining if two or more queues have interleaving ++ * requests (see bfq_find_close_cooperator()). 
++ * ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup ++ * there is a set of bfq_groups, each one collecting the lower-level ++ * entities belonging to the group that are acting on the same device. ++ * ++ * Locking works as follows: ++ * o @bfqd is protected by the queue lock, RCU is used to access it ++ * from the readers. ++ * o All the other fields are protected by the @bfqd queue lock. ++ */ ++struct bfq_group { ++ /* must be the first member */ ++ struct blkg_policy_data pd; ++ ++ struct bfq_entity entity; ++ struct bfq_sched_data sched_data; ++ ++ void *bfqd; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++ ++ struct bfq_entity *my_entity; ++ ++ int active_entities; ++ ++ struct rb_root rq_pos_tree; ++ ++ struct bfqg_stats stats; ++}; ++ ++#else ++struct bfq_group { ++ struct bfq_sched_data sched_data; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++ ++ struct rb_root rq_pos_tree; ++}; ++#endif ++ ++static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); ++ ++static unsigned int bfq_class_idx(struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ return bfqq ? bfqq->ioprio_class - 1 : ++ BFQ_DEFAULT_GRP_CLASS - 1; ++} ++ ++static struct bfq_service_tree * ++bfq_entity_service_tree(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sched_data = entity->sched_data; ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ unsigned int idx = bfq_class_idx(entity); ++ ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); ++ BUG_ON(sched_data == NULL); ++ ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "entity_service_tree %p %d", ++ sched_data->service_tree + idx, idx); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "entity_service_tree %p %d", ++ sched_data->service_tree + idx, idx); ++ } ++#endif ++ return sched_data->service_tree + idx; ++} ++ ++static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) ++{ ++ return bic->bfqq[is_sync]; ++} ++ ++static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, ++ bool is_sync) ++{ ++ bic->bfqq[is_sync] = bfqq; ++} ++ ++static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) ++{ ++ return bic->icq.q->elevator->elevator_data; ++} ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ ++static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *group_entity = bfqq->entity.parent; ++ ++ if (!group_entity) ++ group_entity = &bfqq->bfqd->root_group->entity; ++ ++ return container_of(group_entity, struct bfq_group, entity); ++} ++ ++#else ++ ++static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) ++{ ++ return bfqq->bfqd->root_group; ++} ++ ++#endif ++ ++static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); ++static void bfq_put_queue(struct bfq_queue *bfqq); ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, ++ struct bio *bio, bool is_sync, ++ struct bfq_io_cq *bic); ++static void bfq_end_wr_async_queues(struct bfq_data *bfqd, ++ struct bfq_group *bfqg); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); ++#endif ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); ++ ++#endif /* _BFQ_H 
*/ + +From 0bd96428e086fd28800efdf5f0a5f62869af6e30 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Sat, 21 Jan 2017 12:41:14 +0100 +Subject: [PATCH 10/51] Move thinktime from bic to bfqq + +Prep change to make it possible to protect this field with a +scheduler lock. + +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 28 ++++++++++++++-------------- + block/bfq-mq.h | 30 ++++++++++++++++-------------- + 2 files changed, 30 insertions(+), 28 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index d1125aee658c..65f5dfb79417 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -698,6 +698,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, + if (unlikely(busy)) + old_wr_coeff = bfqq->wr_coeff; + ++ bfqq->ttime = bic->saved_ttime; + bfqq->wr_coeff = bic->saved_wr_coeff; + bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; + BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); +@@ -1287,7 +1288,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, + * details on the usage of the next variable. + */ + arrived_in_time = ktime_get_ns() <= +- RQ_BIC(rq)->ttime.last_end_request + ++ bfqq->ttime.last_end_request + + bfqd->bfq_slice_idle * 3; + + bfq_log_bfqq(bfqd, bfqq, +@@ -2048,6 +2049,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) + if (!bic) + return; + ++ bic->saved_ttime = bfqq->ttime; + bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); + bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); +@@ -3948,11 +3950,6 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) + bfq_put_queue(bfqq); /* release process reference */ + } + +-static void bfq_init_icq(struct io_cq *icq) +-{ +- icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); +-} +- + static void bfq_exit_icq(struct io_cq *icq) + { + struct bfq_io_cq *bic = icq_to_bic(icq); +@@ -4084,6 +4081,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bfq_mark_bfqq_just_created(bfqq); + } else + bfq_clear_bfqq_sync(bfqq); ++ ++ bfqq->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); ++ + bfq_mark_bfqq_IO_bound(bfqq); + + /* Tentative initial value to trade off between thr and lat */ +@@ -4191,14 +4191,14 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + } + + static void bfq_update_io_thinktime(struct bfq_data *bfqd, +- struct bfq_io_cq *bic) ++ struct bfq_queue *bfqq) + { +- struct bfq_ttime *ttime = &bic->ttime; +- u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; ++ struct bfq_ttime *ttime = &bfqq->ttime; ++ u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; + + elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); + +- ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; ++ ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8; + ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); + ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, + ttime->ttime_samples); +@@ -4240,8 +4240,8 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, + * decide whether to mark as has_short_ttime + */ + if (atomic_read(&bic->icq.ioc->active_ref) == 0 || +- (bfq_sample_valid(bic->ttime.ttime_samples) && +- bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) ++ (bfq_sample_valid(bfqq->ttime.ttime_samples) && ++ bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) + has_short_ttime = false; + + 
bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", +@@ -4265,7 +4265,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + if (rq->cmd_flags & REQ_META) + bfqq->meta_pending++; + +- bfq_update_io_thinktime(bfqd, bic); ++ bfq_update_io_thinktime(bfqd, bfqq); + bfq_update_has_short_ttime(bfqd, bfqq, bic); + bfq_update_io_seektime(bfqd, bfqq, rq); + +@@ -4436,7 +4436,7 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) + + now_ns = ktime_get_ns(); + +- RQ_BIC(rq)->ttime.last_end_request = now_ns; ++ bfqq->ttime.last_end_request = now_ns; + + /* + * Using us instead of ns, to get a reasonable precision in +diff --git a/block/bfq-mq.h b/block/bfq-mq.h +index 53954d1b87f8..0f51f270469c 100644 +--- a/block/bfq-mq.h ++++ b/block/bfq-mq.h +@@ -210,6 +210,18 @@ struct bfq_entity { + struct bfq_group; + + /** ++ * struct bfq_ttime - per process thinktime stats. ++ */ ++struct bfq_ttime { ++ u64 last_end_request; /* completion time of last request */ ++ ++ u64 ttime_total; /* total process thinktime */ ++ unsigned long ttime_samples; /* number of thinktime samples */ ++ u64 ttime_mean; /* average process thinktime */ ++ ++}; ++ ++/** + * struct bfq_queue - leaf schedulable entity. + * + * A bfq_queue is a leaf request queue; it can be associated with an +@@ -270,6 +282,9 @@ struct bfq_queue { + /* node for active/idle bfqq list inside parent bfqd */ + struct list_head bfqq_list; + ++ /* associated @bfq_ttime struct */ ++ struct bfq_ttime ttime; ++ + /* bit vector: a 1 for each seeky requests in history */ + u32 seek_history; + +@@ -333,18 +348,6 @@ struct bfq_queue { + }; + + /** +- * struct bfq_ttime - per process thinktime stats. +- */ +-struct bfq_ttime { +- u64 last_end_request; /* completion time of last request */ +- +- u64 ttime_total; /* total process thinktime */ +- unsigned long ttime_samples; /* number of thinktime samples */ +- u64 ttime_mean; /* average process thinktime */ +- +-}; +- +-/** + * struct bfq_io_cq - per (request_queue, io_context) structure. + */ + struct bfq_io_cq { +@@ -352,8 +355,6 @@ struct bfq_io_cq { + struct io_cq icq; /* must be the first member */ + /* array of two process queues, the sync and the async */ + struct bfq_queue *bfqq[2]; +- /* associated @bfq_ttime struct */ +- struct bfq_ttime ttime; + /* per (request_queue, blkcg) ioprio */ + int ioprio; + #ifdef BFQ_GROUP_IOSCHED_ENABLED +@@ -390,6 +391,7 @@ struct bfq_io_cq { + unsigned long saved_last_wr_start_finish; + unsigned long saved_wr_start_at_switch_to_srt; + unsigned int saved_wr_cur_max_time; ++ struct bfq_ttime saved_ttime; + }; + + enum bfq_device_speed { + +From 351a9aea7c0c9c30edacdbf2a3c0d089470de1e8 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Wed, 18 Jan 2017 11:42:22 +0100 +Subject: [PATCH 11/51] Embed bfq-ioc.c and add locking on request queue + +The version of bfq-ioc.c for bfq-iosched.c is no longer correct for +bfq-mq, because, in bfq-mq, the request queue lock is not being held +when bfq_bic_lookup is invoked. That function must then take that lock +on its own. This commit removes the inclusion of bfq-ioc.c, copies the +content of bfq-ioc.c into bfq-mq-iosched.c, and makes the lookup grab +the lock itself.
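+
+In short, with the caller no longer holding the request queue lock, the
+lookup takes that lock around ioc_lookup_icq() itself. A minimal sketch
+of the resulting helper, matching the hunk below:
+
+    static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
+                                            struct io_context *ioc,
+                                            struct request_queue *q)
+    {
+            if (ioc) {
+                    struct bfq_io_cq *icq;
+
+                    /* the caller does not hold q->queue_lock: take it here */
+                    spin_lock_irq(q->queue_lock);
+                    icq = icq_to_bic(ioc_lookup_icq(ioc, q));
+                    spin_unlock_irq(q->queue_lock);
+
+                    return icq;
+            }
+
+            return NULL;
+    }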
+ +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 39 ++++++++++++++++++++++++++++++++++++--- + 1 file changed, 36 insertions(+), 3 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 65f5dfb79417..756a618d5902 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -195,7 +195,39 @@ static int device_speed_thresh[2]; + + static void bfq_schedule_dispatch(struct bfq_data *bfqd); + +-#include "bfq-ioc.c" ++/** ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. ++ * @icq: the iocontext queue. ++ */ ++static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) ++{ ++ /* bic->icq is the first member, %NULL will convert to %NULL */ ++ return container_of(icq, struct bfq_io_cq, icq); ++} ++ ++/** ++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. ++ * @bfqd: the lookup key. ++ * @ioc: the io_context of the process doing I/O. ++ * @q: the request queue. ++ */ ++static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, ++ struct io_context *ioc, ++ struct request_queue *q) ++{ ++ if (ioc) { ++ struct bfq_io_cq *icq; ++ ++ spin_lock_irq(q->queue_lock); ++ icq = icq_to_bic(ioc_lookup_icq(ioc, q)); ++ spin_unlock_irq(q->queue_lock); ++ ++ return icq; ++ } ++ ++ return NULL; ++} ++ + #include "bfq-sched.c" + #include "bfq-cgroup-included.c" + +@@ -1520,13 +1552,14 @@ static void bfq_add_request(struct request *rq) + } + + static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, +- struct bio *bio) ++ struct bio *bio, ++ struct request_queue *q) + { + struct task_struct *tsk = current; + struct bfq_io_cq *bic; + struct bfq_queue *bfqq; + +- bic = bfq_bic_lookup(bfqd, tsk->io_context); ++ bic = bfq_bic_lookup(bfqd, tsk->io_context, q); + if (!bic) + return NULL; + + +From ed0d64e27b2308813a2a846139e405e0479f0849 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Tue, 20 Dec 2016 09:07:19 +0100 +Subject: [PATCH 12/51] Modify interface and operation to comply with + blk-mq-sched + +As for the modifications to the operation, the major changes are the +introduction of a scheduler lock, and the deferral of the body of the +exit_icq hook to a work item. The latter change avoids deadlocks caused +by the combination of the following facts: 1) that body takes the +scheduler lock, and, if not deferred, 2) it does so from inside the +exit_icq hook, which is invoked with the queue lock held, and 3) there +is at least one code path, namely the one starting from bfq_bio_merge, +which takes these locks in the opposite order.
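+
+Concretely, the deferral amounts to initializing a work item when the
+icq is created, and merely scheduling that work item from exit_icq, so
+that the scheduler lock is taken only from the work handler, in process
+context. A minimal sketch, matching the hunks below:
+
+    static void bfq_exit_icq_body(struct work_struct *work)
+    {
+            struct bfq_io_cq *bic =
+                    container_of(work, struct bfq_io_cq, exit_icq_work);
+
+            /* safe to take the scheduler lock from here */
+            bfq_exit_icq_bfqq(bic, true);
+            bfq_exit_icq_bfqq(bic, false);
+    }
+
+    static void bfq_init_icq(struct io_cq *icq)
+    {
+            INIT_WORK(&icq_to_bic(icq)->exit_icq_work, bfq_exit_icq_body);
+    }
+
+    static void bfq_exit_icq(struct io_cq *icq)
+    {
+            /* invoked with the queue lock held: defer the real work */
+            kblockd_schedule_work(&icq_to_bic(icq)->exit_icq_work);
+    }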
+ +Signed-off-by: Paolo Valente +--- + block/bfq-cgroup-included.c | 4 - + block/bfq-mq-iosched.c | 695 ++++++++++++++++++++++++-------------------- + block/bfq-mq.h | 35 +-- + 3 files changed, 394 insertions(+), 340 deletions(-) + +diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c +index 9c483b658179..8a73de76f32b 100644 +--- a/block/bfq-cgroup-included.c ++++ b/block/bfq-cgroup-included.c +@@ -472,8 +472,6 @@ static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, + struct bfq_group *bfqg, *parent; + struct bfq_entity *entity; + +- assert_spin_locked(bfqd->queue->queue_lock); +- + bfqg = bfq_lookup_bfqg(bfqd, blkcg); + + if (unlikely(!bfqg)) +@@ -602,8 +600,6 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, + struct bfq_group *bfqg; + struct bfq_entity *entity; + +- lockdep_assert_held(bfqd->queue->queue_lock); +- + bfqg = bfq_find_set_group(bfqd, blkcg); + + if (unlikely(!bfqg)) +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 756a618d5902..c963d92a32c2 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -81,7 +81,13 @@ + #include + #include + #include ++#include ++#include ++ + #include "blk.h" ++#include "blk-mq.h" ++#include "blk-mq-tag.h" ++#include "blk-mq-sched.h" + #undef CONFIG_BFQ_GROUP_IOSCHED /* cgroups support not yet functional */ + #include "bfq-mq.h" + +@@ -193,8 +199,6 @@ static int device_speed_thresh[2]; + #define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) + #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) + +-static void bfq_schedule_dispatch(struct bfq_data *bfqd); +- + /** + * icq_to_bic - convert iocontext queue structure to bfq_io_cq. + * @icq: the iocontext queue. +@@ -216,11 +220,12 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, + struct request_queue *q) + { + if (ioc) { ++ unsigned long flags; + struct bfq_io_cq *icq; + +- spin_lock_irq(q->queue_lock); ++ spin_lock_irqsave(q->queue_lock, flags); + icq = icq_to_bic(ioc_lookup_icq(ioc, q)); +- spin_unlock_irq(q->queue_lock); ++ spin_unlock_irqrestore(q->queue_lock, flags); + + return icq; + } +@@ -244,7 +249,7 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd) + { + if (bfqd->queued != 0) { + bfq_log(bfqd, "schedule dispatch"); +- kblockd_schedule_work(&bfqd->unplug_work); ++ blk_mq_run_hw_queues(bfqd->queue, true); + } + } + +@@ -768,9 +773,7 @@ static int bfqq_process_refs(struct bfq_queue *bfqq) + { + int process_refs, io_refs; + +- lockdep_assert_held(bfqq->bfqd->queue->queue_lock); +- +- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; ++ io_refs = bfqq->allocated; + process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; + BUG_ON(process_refs < 0); + return process_refs; +@@ -1584,6 +1587,7 @@ static sector_t get_sdist(sector_t last_pos, struct request *rq) + return sdist; + } + ++#if 0 /* Still not clear if we can do without next two functions */ + static void bfq_activate_request(struct request_queue *q, struct request *rq) + { + struct bfq_data *bfqd = q->elevator->elevator_data; +@@ -1597,8 +1601,10 @@ static void bfq_deactivate_request(struct request_queue *q, struct request *rq) + BUG_ON(bfqd->rq_in_driver == 0); + bfqd->rq_in_driver--; + } ++#endif + +-static void bfq_remove_request(struct request *rq) ++static void bfq_remove_request(struct request_queue *q, ++ struct request *rq) + { + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; +@@ -1619,6 +1625,10 @@ static void bfq_remove_request(struct request *rq) + bfqd->queued--; + 
elv_rb_del(&bfqq->sort_list, rq); + ++ elv_rqhash_del(q, rq); ++ if (q->last_merge == rq) ++ q->last_merge = NULL; ++ + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + bfqq->next_rq = NULL; + +@@ -1659,13 +1669,36 @@ static void bfq_remove_request(struct request *rq) + bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); + } + +-static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, +- struct bio *bio) ++static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) ++{ ++ struct request_queue *q = hctx->queue; ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct request *free = NULL; ++ bool ret; ++ ++ spin_lock_irq(&bfqd->lock); ++ ret = blk_mq_sched_try_merge(q, bio, &free); ++ ++ /* ++ * XXX Not yet freeing without lock held, to avoid an ++ * inconsistency with respect to the lock-protected invocation ++ * of blk_mq_sched_try_insert_merge in bfq_bio_merge. Waiting ++ * for clarifications from Jens. ++ */ ++ if (free) ++ blk_mq_free_request(free); ++ spin_unlock_irq(&bfqd->lock); ++ ++ return ret; ++} ++ ++static int bfq_request_merge(struct request_queue *q, struct request **req, ++ struct bio *bio) + { + struct bfq_data *bfqd = q->elevator->elevator_data; + struct request *__rq; + +- __rq = bfq_find_rq_fmerge(bfqd, bio); ++ __rq = bfq_find_rq_fmerge(bfqd, bio, q); + if (__rq && elv_bio_merge_ok(__rq, bio)) { + *req = __rq; + return ELEVATOR_FRONT_MERGE; +@@ -1674,7 +1707,7 @@ static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, + return ELEVATOR_NO_MERGE; + } + +-static void bfq_merged_request(struct request_queue *q, struct request *req, ++static void bfq_request_merged(struct request_queue *q, struct request *req, + enum elv_merge type) + { + if (type == ELEVATOR_FRONT_MERGE && +@@ -1689,6 +1722,8 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, + /* Reposition request in its sort_list */ + elv_rb_del(&bfqq->sort_list, req); + elv_rb_add(&bfqq->sort_list, req); ++ ++ spin_lock_irq(&bfqd->lock); + /* Choose next request to be served for bfqq */ + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, +@@ -1704,22 +1739,19 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, + bfq_updated_next_req(bfqd, bfqq); + bfq_pos_tree_add_move(bfqd, bfqq); + } ++ spin_unlock_irq(&bfqd->lock); + } + } + +-#ifdef BFQ_GROUP_IOSCHED_ENABLED +-static void bfq_bio_merged(struct request_queue *q, struct request *req, +- struct bio *bio) +-{ +- bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); +-} +-#endif +- +-static void bfq_merged_requests(struct request_queue *q, struct request *rq, ++static void bfq_requests_merged(struct request_queue *q, struct request *rq, + struct request *next) + { + struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); + ++ if (!RB_EMPTY_NODE(&rq->rb_node)) ++ goto end; ++ spin_lock_irq(&bfqq->bfqd->lock); ++ + /* + * If next and rq belong to the same bfq_queue and next is older + * than rq, then reposition rq in the fifo (by substituting next +@@ -1740,7 +1772,10 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq, + if (bfqq->next_rq == next) + bfqq->next_rq = rq; + +- bfq_remove_request(next); ++ bfq_remove_request(q, next); ++ ++ spin_unlock_irq(&bfqq->bfqd->lock); ++end: + bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); + } + +@@ -1786,7 +1821,7 @@ static void bfq_end_wr(struct bfq_data *bfqd) + { + struct bfq_queue *bfqq; + +- 
spin_lock_irq(bfqd->queue->queue_lock); ++ spin_lock_irq(&bfqd->lock); + + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) + bfq_bfqq_end_wr(bfqq); +@@ -1794,7 +1829,7 @@ static void bfq_end_wr(struct bfq_data *bfqd) + bfq_bfqq_end_wr(bfqq); + bfq_end_wr_async(bfqd); + +- spin_unlock_irq(bfqd->queue->queue_lock); ++ spin_unlock_irq(&bfqd->lock); + } + + static sector_t bfq_io_struct_pos(void *io_struct, bool request) +@@ -2184,8 +2219,8 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, + bfq_put_queue(bfqq); + } + +-static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, +- struct bio *bio) ++static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, ++ struct bio *bio) + { + struct bfq_data *bfqd = q->elevator->elevator_data; + bool is_sync = op_is_sync(bio->bi_opf); +@@ -2203,7 +2238,7 @@ static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, + * merge only if rq is queued there. + * Queue lock is held here. + */ +- bic = bfq_bic_lookup(bfqd, current->io_context); ++ bic = bfq_bic_lookup(bfqd, current->io_context, q); + if (!bic) + return false; + +@@ -2228,12 +2263,6 @@ static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, + return bfqq == RQ_BFQQ(rq); + } + +-static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, +- struct request *next) +-{ +- return RQ_BFQQ(rq) == RQ_BFQQ(next); +-} +- + /* + * Set the maximum time for the in-service queue to consume its + * budget. This prevents seeky processes from lowering the throughput. +@@ -2264,7 +2293,6 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, + { + if (bfqq) { + bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); +- bfq_mark_bfqq_must_alloc(bfqq); + bfq_clear_bfqq_fifo_expire(bfqq); + + bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; +@@ -2703,27 +2731,28 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) + } + + /* +- * Move request from internal lists to the dispatch list of the request queue ++ * Remove request from internal lists. + */ +-static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) ++static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) + { + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + /* +- * For consistency, the next instruction should have been executed +- * after removing the request from the queue and dispatching it. +- * We execute instead this instruction before bfq_remove_request() +- * (and hence introduce a temporary inconsistency), for efficiency. +- * In fact, in a forced_dispatch, this prevents two counters related +- * to bfqq->dispatched to risk to be uselessly decremented if bfqq +- * is not in service, and then to be incremented again after +- * incrementing bfqq->dispatched. ++ * For consistency, the next instruction should have been ++ * executed after removing the request from the queue and ++ * dispatching it. We execute instead this instruction before ++ * bfq_remove_request() (and hence introduce a temporary ++ * inconsistency), for efficiency. In fact, should this ++ * dispatch occur for a non in-service bfqq, this anticipated ++ * increment prevents two counters related to bfqq->dispatched ++ * from risking to be, first, uselessly decremented, and then ++ * incremented again when the (new) value of bfqq->dispatched ++ * happens to be taken into account. 
+ */ + bfqq->dispatched++; + bfq_update_peak_rate(q->elevator->elevator_data, rq); + +- bfq_remove_request(rq); +- elv_dispatch_sort(q, rq); ++ bfq_remove_request(q, rq); + } + + static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) +@@ -3605,7 +3634,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) + bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); + + if (bfq_may_expire_for_budg_timeout(bfqq) && +- !hrtimer_active(&bfqd->idle_slice_timer) && ++ !bfq_bfqq_wait_request(bfqq) && + !bfq_bfqq_must_idle(bfqq)) + goto expire; + +@@ -3641,7 +3670,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) + * arrives. + */ + if (bfq_bfqq_wait_request(bfqq)) { +- BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); + /* + * If we get here: 1) at least a new request + * has arrived but we have not disabled the +@@ -3668,7 +3696,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) + * for a new request, or has requests waiting for a completion and + * may idle after their completion, then keep it anyway. + */ +- if (hrtimer_active(&bfqd->idle_slice_timer) || ++ if (bfq_bfqq_wait_request(bfqq) || + (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { + bfqq = NULL; + goto keep_queue; +@@ -3753,13 +3781,11 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) + } + + /* +- * Dispatch one request from bfqq, moving it to the request queue +- * dispatch list. ++ * Dispatch next request from bfqq. + */ +-static int bfq_dispatch_request(struct bfq_data *bfqd, +- struct bfq_queue *bfqq) ++static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) + { +- int dispatched = 0; + struct request *rq = bfqq->next_rq; + unsigned long service_to_charge; + +@@ -3775,7 +3801,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, + + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + +- bfq_dispatch_insert(bfqd->queue, rq); ++ bfq_dispatch_remove(bfqd->queue, rq); + + /* + * If weight raising has to terminate for bfqq, then next +@@ -3791,86 +3817,61 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, + bfq_update_wr_data(bfqd, bfqq); + + bfq_log_bfqq(bfqd, bfqq, +- "dispatched %u sec req (%llu), budg left %d", ++ "dispatched %u sec req (%llu), budg left %d, new disp_nr %d", + blk_rq_sectors(rq), + (unsigned long long) blk_rq_pos(rq), +- bfq_bfqq_budget_left(bfqq)); +- +- dispatched++; ++ bfq_bfqq_budget_left(bfqq), ++ bfqq->dispatched); + + if (!bfqd->in_service_bic) { + atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); + bfqd->in_service_bic = RQ_BIC(rq); + } + ++ /* ++ * Expire bfqq, pretending that its budget expired, if bfqq ++ * belongs to CLASS_IDLE and other queues are waiting for ++ * service. ++ */ + if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) + goto expire; + +- return dispatched; ++ return rq; + + expire: + bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); +- return dispatched; +-} +- +-static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) +-{ +- int dispatched = 0; +- +- while (bfqq->next_rq) { +- bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); +- dispatched++; +- } +- +- BUG_ON(!list_empty(&bfqq->fifo)); +- return dispatched; ++ return rq; + } + +-/* +- * Drain our current requests. +- * Used for barriers and when switching io schedulers on-the-fly. 
+- */ +-static int bfq_forced_dispatch(struct bfq_data *bfqd) ++static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) + { +- struct bfq_queue *bfqq, *n; +- struct bfq_service_tree *st; +- int dispatched = 0; +- +- bfqq = bfqd->in_service_queue; +- if (bfqq) +- __bfq_bfqq_expire(bfqd, bfqq); ++ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + + /* +- * Loop through classes, and be careful to leave the scheduler +- * in a consistent state, as feedback mechanisms and vtime +- * updates cannot be disabled during the process. ++ * Avoiding lock: a race on bfqd->busy_queues should cause at ++ * most a call to dispatch for nothing + */ +- list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { +- st = bfq_entity_service_tree(&bfqq->entity); +- +- dispatched += __bfq_forced_dispatch_bfqq(bfqq); +- +- bfqq->max_budget = bfq_max_budget(bfqd); +- bfq_forget_idle(st); +- } +- +- BUG_ON(bfqd->busy_queues != 0); +- +- return dispatched; ++ return !list_empty_careful(&bfqd->dispatch) || ++ bfqd->busy_queues > 0; + } + +-static int bfq_dispatch_requests(struct request_queue *q, int force) ++static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + { +- struct bfq_data *bfqd = q->elevator->elevator_data; +- struct bfq_queue *bfqq; ++ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; ++ struct request *rq = NULL; ++ struct bfq_queue *bfqq = NULL; ++ ++ if (!list_empty(&bfqd->dispatch)) { ++ rq = list_first_entry(&bfqd->dispatch, struct request, ++ queuelist); ++ list_del_init(&rq->queuelist); ++ goto exit; ++ } + + bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); + + if (bfqd->busy_queues == 0) +- return 0; +- +- if (unlikely(force)) +- return bfq_forced_dispatch(bfqd); ++ goto exit; + + /* + * Force device to serve one request at a time if +@@ -3885,25 +3886,39 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) + * throughput. + */ + if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) +- return 0; ++ goto exit; + + bfqq = bfq_select_queue(bfqd); + if (!bfqq) +- return 0; ++ goto exit; + + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + + BUG_ON(bfq_bfqq_wait_request(bfqq)); + +- if (!bfq_dispatch_request(bfqd, bfqq)) +- return 0; +- +- bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", +- bfq_bfqq_sync(bfqq) ? 
"sync" : "async"); ++ rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq); + + BUG_ON(bfqq->next_rq == NULL && + bfqq->entity.budget < bfqq->entity.service); +- return 1; ++exit: ++ if (rq) { ++ rq->rq_flags |= RQF_STARTED; ++ bfqd->rq_in_driver++; ++ } ++ ++ return rq; ++} ++ ++static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) ++{ ++ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; ++ struct request *rq; ++ ++ spin_lock_irq(&bfqd->lock); ++ rq = __bfq_dispatch_request(hctx); ++ spin_unlock_irq(&bfqd->lock); ++ ++ return rq; + } + + /* +@@ -3921,13 +3936,14 @@ static void bfq_put_queue(struct bfq_queue *bfqq) + + BUG_ON(bfqq->ref <= 0); + +- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); ++ if (bfqq->bfqd) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); ++ + bfqq->ref--; + if (bfqq->ref) + return; + + BUG_ON(rb_first(&bfqq->sort_list)); +- BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); + BUG_ON(bfqq->entity.tree); + BUG_ON(bfq_bfqq_busy(bfqq)); + +@@ -3942,7 +3958,8 @@ static void bfq_put_queue(struct bfq_queue *bfqq) + */ + hlist_del_init(&bfqq->burst_list_node); + +- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); ++ if (bfqq->bfqd) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); + + kmem_cache_free(bfq_pool, bfqq); + #ifdef BFQ_GROUP_IOSCHED_ENABLED +@@ -3983,29 +4000,52 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) + bfq_put_queue(bfqq); /* release process reference */ + } + +-static void bfq_exit_icq(struct io_cq *icq) ++static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) + { +- struct bfq_io_cq *bic = icq_to_bic(icq); +- struct bfq_data *bfqd = bic_to_bfqd(bic); ++ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); ++ struct bfq_data *bfqd; + +- if (bic_to_bfqq(bic, false)) { +- bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); +- bic_set_bfqq(bic, NULL, false); +- } ++ if (bfqq) ++ bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ + +- if (bic_to_bfqq(bic, true)) { ++ if (bfqq && bfqd) { ++ spin_lock_irq(&bfqd->lock); + /* + * If the bic is using a shared queue, put the reference + * taken on the io_context when the bic started using a + * shared bfq_queue. + */ +- if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) +- put_io_context(icq->ioc); +- bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); +- bic_set_bfqq(bic, NULL, true); ++ if (is_sync && bfq_bfqq_coop(bfqq)) ++ put_io_context(bic->icq.ioc); ++ bfq_exit_bfqq(bfqd, bfqq); ++ bic_set_bfqq(bic, NULL, is_sync); ++ spin_unlock_irq(&bfqd->lock); + } + } + ++static void bfq_exit_icq_body(struct work_struct *work) ++{ ++ struct bfq_io_cq *bic = ++ container_of(work, struct bfq_io_cq, exit_icq_work); ++ ++ bfq_exit_icq_bfqq(bic, true); ++ bfq_exit_icq_bfqq(bic, false); ++} ++ ++static void bfq_init_icq(struct io_cq *icq) ++{ ++ struct bfq_io_cq *bic = icq_to_bic(icq); ++ ++ INIT_WORK(&bic->exit_icq_work, bfq_exit_icq_body); ++} ++ ++static void bfq_exit_icq(struct io_cq *icq) ++{ ++ struct bfq_io_cq *bic = icq_to_bic(icq); ++ ++ kblockd_schedule_work(&bic->exit_icq_work); ++} ++ + /* + * Update the entity prio values; note that the new values will not + * be used until the next (re)activation. 
+@@ -4015,6 +4055,10 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, + { + struct task_struct *tsk = current; + int ioprio_class; ++ struct bfq_data *bfqd = bfqq->bfqd; ++ ++ if (!bfqd) ++ return; + + ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); + switch (ioprio_class) { +@@ -4095,6 +4139,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + INIT_HLIST_NODE(&bfqq->burst_list_node); + BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); + ++ spin_lock_init(&bfqq->lock); ++ + bfqq->ref = 0; + bfqq->bfqd = bfqd; + +@@ -4351,22 +4397,13 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + if (budget_timeout) + bfq_bfqq_expire(bfqd, bfqq, false, + BFQ_BFQQ_BUDGET_TIMEOUT); +- +- /* +- * Let the request rip immediately, or let a new queue be +- * selected if bfqq has just been expired. +- */ +- __blk_run_queue(bfqd->queue); + } + } + +-static void bfq_insert_request(struct request_queue *q, struct request *rq) ++static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) + { +- struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; + +- assert_spin_locked(bfqd->queue->queue_lock); +- + /* + * An unplug may trigger a requeue of a request from the device + * driver: make sure we are in process context while trying to +@@ -4381,8 +4418,8 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) + * Release the request's reference to the old bfqq + * and make sure one is taken to the shared queue. + */ +- new_bfqq->allocated[rq_data_dir(rq)]++; +- bfqq->allocated[rq_data_dir(rq)]--; ++ new_bfqq->allocated++; ++ bfqq->allocated--; + new_bfqq->ref++; + bfq_clear_bfqq_just_created(bfqq); + if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) +@@ -4406,6 +4443,55 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) + bfq_rq_enqueued(bfqd, bfqq, rq); + } + ++static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, ++ bool at_head) ++{ ++ struct request_queue *q = hctx->queue; ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ ++ spin_lock_irq(&bfqd->lock); ++ if (blk_mq_sched_try_insert_merge(q, rq)) ++ goto done; ++ spin_unlock_irq(&bfqd->lock); ++ ++ blk_mq_sched_request_inserted(rq); ++ ++ spin_lock_irq(&bfqd->lock); ++ if (at_head || blk_rq_is_passthrough(rq)) { ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ if (at_head) ++ list_add(&rq->queuelist, &bfqd->dispatch); ++ else ++ list_add_tail(&rq->queuelist, &bfqd->dispatch); ++ ++ if (bfqq) ++ bfqq->dispatched++; ++ } else { ++ __bfq_insert_request(bfqd, rq); ++ ++ if (rq_mergeable(rq)) { ++ elv_rqhash_add(q, rq); ++ if (!q->last_merge) ++ q->last_merge = rq; ++ } ++ } ++done: ++ spin_unlock_irq(&bfqd->lock); ++} ++ ++static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, ++ struct list_head *list, bool at_head) ++{ ++ while (!list_empty(list)) { ++ struct request *rq; ++ ++ rq = list_first_entry(list, struct request, queuelist); ++ list_del_init(&rq->queuelist); ++ bfq_insert_request(hctx, rq, at_head); ++ } ++} ++ + static void bfq_update_hw_tag(struct bfq_data *bfqd) + { + bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, +@@ -4431,27 +4517,17 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) + bfqd->hw_tag_samples = 0; + } + +-static void bfq_completed_request(struct request_queue *q, struct request *rq) ++static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) + { +- struct bfq_queue *bfqq = RQ_BFQQ(rq); +- 
struct bfq_data *bfqd = bfqq->bfqd; + u64 now_ns; + u32 delta_us; + +- bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", +- blk_rq_sectors(rq)); +- +- assert_spin_locked(bfqd->queue->queue_lock); + bfq_update_hw_tag(bfqd); + + BUG_ON(!bfqd->rq_in_driver); + BUG_ON(!bfqq->dispatched); + bfqd->rq_in_driver--; + bfqq->dispatched--; +- bfqg_stats_update_completion(bfqq_group(bfqq), +- rq_start_time_ns(rq), +- rq_io_start_time_ns(rq), +- rq->cmd_flags); + + if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); +@@ -4477,7 +4553,8 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) + */ + delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); + +- bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", ++ bfq_log_bfqq(bfqd, bfqq, ++ "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", + delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, + (USEC_PER_SEC* + (u64)((bfqd->last_rq_max_size<in_service_queue == bfqq) { + if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { + bfq_arm_slice_timer(bfqd); +- goto out; ++ return; + } else if (bfq_may_expire_for_budg_timeout(bfqq)) + bfq_bfqq_expire(bfqd, bfqq, false, + BFQ_BFQQ_BUDGET_TIMEOUT); +@@ -4537,68 +4614,55 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) + bfq_bfqq_expire(bfqd, bfqq, false, + BFQ_BFQQ_NO_MORE_REQUESTS); + } +- +- if (!bfqd->rq_in_driver) +- bfq_schedule_dispatch(bfqd); +- +-out: +- return; + } + +-static int __bfq_may_queue(struct bfq_queue *bfqq) ++static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) + { +- if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { +- bfq_clear_bfqq_must_alloc(bfqq); +- return ELV_MQUEUE_MUST; +- } ++ bfqq->allocated--; + +- return ELV_MQUEUE_MAY; ++ bfq_put_queue(bfqq); + } + +-static int bfq_may_queue(struct request_queue *q, unsigned int op) ++static void bfq_put_rq_private(struct request_queue *q, struct request *rq) + { +- struct bfq_data *bfqd = q->elevator->elevator_data; +- struct task_struct *tsk = current; +- struct bfq_io_cq *bic; +- struct bfq_queue *bfqq; +- +- /* +- * Don't force setup of a queue from here, as a call to may_queue +- * does not necessarily imply that a request actually will be +- * queued. So just lookup a possibly existing queue, or return +- * 'may queue' if that fails. +- */ +- bic = bfq_bic_lookup(bfqd, tsk->io_context); +- if (!bic) +- return ELV_MQUEUE_MAY; +- +- bfqq = bic_to_bfqq(bic, op_is_sync(op)); +- if (bfqq) +- return __bfq_may_queue(bfqq); ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; + +- return ELV_MQUEUE_MAY; +-} ++ if (rq->rq_flags & RQF_STARTED) ++ bfqg_stats_update_completion(bfqq_group(bfqq), ++ rq_start_time_ns(rq), ++ rq_io_start_time_ns(rq), ++ rq->cmd_flags); + +-/* +- * Queue lock held here. +- */ +-static void bfq_put_request(struct request *rq) +-{ +- struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ if (likely(rq->rq_flags & RQF_STARTED)) { ++ unsigned long flags; + +- if (bfqq) { +- const int rw = rq_data_dir(rq); ++ spin_lock_irqsave(&bfqd->lock, flags); + +- BUG_ON(!bfqq->allocated[rw]); +- bfqq->allocated[rw]--; ++ bfq_completed_request(bfqq, bfqd); ++ bfq_put_rq_priv_body(bfqq); + +- rq->elv.priv[0] = NULL; +- rq->elv.priv[1] = NULL; ++ spin_unlock_irqrestore(&bfqd->lock, flags); ++ } else { ++ /* ++ * Request rq may be still/already in the scheduler, ++ * in which case we need to remove it. 
And we cannot ++ * defer such a check and removal, to avoid ++ * inconsistencies in the time interval from the end ++ * of this function to the start of the deferred work. ++ * Fortunately, this situation occurs only in process ++ * context, so taking the scheduler lock does not ++ * cause any deadlock, even if other locks are already ++ * (correctly) held by this process. ++ */ + +- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", +- bfqq, bfqq->ref); +- bfq_put_queue(bfqq); ++ if (!RB_EMPTY_NODE(&rq->rb_node)) ++ bfq_remove_request(q, rq); ++ bfq_put_rq_priv_body(bfqq); + } ++ ++ rq->elv.priv[0] = NULL; ++ rq->elv.priv[1] = NULL; + } + + /* +@@ -4630,18 +4694,16 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) + /* + * Allocate bfq data structures associated with this request. + */ +-static int bfq_set_request(struct request_queue *q, struct request *rq, +- struct bio *bio, gfp_t gfp_mask) ++static int bfq_get_rq_private(struct request_queue *q, struct request *rq, ++ struct bio *bio) + { + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); +- const int rw = rq_data_dir(rq); + const int is_sync = rq_is_sync(rq); + struct bfq_queue *bfqq; +- unsigned long flags; + bool bfqq_already_existing = false, split = false; + +- spin_lock_irqsave(q->queue_lock, flags); ++ spin_lock_irq(&bfqd->lock); + + if (!bic) + goto queue_fail; +@@ -4661,7 +4723,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, + bic_set_bfqq(bic, bfqq, is_sync); + if (split && is_sync) { + bfq_log_bfqq(bfqd, bfqq, +- "set_request: was_in_list %d " ++ "get_request: was_in_list %d " + "was_in_large_burst %d " + "large burst in progress %d", + bic->was_in_burst_list, +@@ -4671,12 +4733,12 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, + if ((bic->was_in_burst_list && bfqd->large_burst) || + bic->saved_in_large_burst) { + bfq_log_bfqq(bfqd, bfqq, +- "set_request: marking in " ++ "get_request: marking in " + "large burst"); + bfq_mark_bfqq_in_large_burst(bfqq); + } else { + bfq_log_bfqq(bfqd, bfqq, +- "set_request: clearing in " ++ "get_request: clearing in " + "large burst"); + bfq_clear_bfqq_in_large_burst(bfqq); + if (bic->was_in_burst_list) +@@ -4703,9 +4765,12 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, + } + } + +- bfqq->allocated[rw]++; ++ bfqq->allocated++; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "get_request: new allocated %d", bfqq->allocated); ++ + bfqq->ref++; +- bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "get_request: bfqq %p, %d", bfqq, bfqq->ref); + + rq->elv.priv[0] = bic; + rq->elv.priv[1] = bfqq; +@@ -4733,26 +4798,53 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, + if (unlikely(bfq_bfqq_just_created(bfqq))) + bfq_handle_burst(bfqd, bfqq); + +- spin_unlock_irqrestore(q->queue_lock, flags); ++ spin_unlock_irq(&bfqd->lock); + + return 0; + + queue_fail: +- bfq_schedule_dispatch(bfqd); +- spin_unlock_irqrestore(q->queue_lock, flags); ++ spin_unlock_irq(&bfqd->lock); + + return 1; + } + +-static void bfq_kick_queue(struct work_struct *work) ++static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) + { +- struct bfq_data *bfqd = +- container_of(work, struct bfq_data, unplug_work); +- struct request_queue *q = bfqd->queue; ++ struct bfq_data *bfqd = bfqq->bfqd; ++ enum bfqq_expiration reason; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&bfqd->lock, flags); ++ 
bfq_clear_bfqq_wait_request(bfqq); + +- spin_lock_irq(q->queue_lock); +- __blk_run_queue(q); +- spin_unlock_irq(q->queue_lock); ++ if (bfqq != bfqd->in_service_queue) { ++ spin_unlock_irqrestore(&bfqd->lock, flags); ++ return; ++ } ++ ++ if (bfq_bfqq_budget_timeout(bfqq)) ++ /* ++ * Also here the queue can be safely expired ++ * for budget timeout without wasting ++ * guarantees ++ */ ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) ++ /* ++ * The queue may not be empty upon timer expiration, ++ * because we may not disable the timer when the ++ * first request of the in-service queue arrives ++ * during disk idling. ++ */ ++ reason = BFQ_BFQQ_TOO_IDLE; ++ else ++ goto schedule_dispatch; ++ ++ bfq_bfqq_expire(bfqd, bfqq, true, reason); ++ ++schedule_dispatch: ++ spin_unlock_irqrestore(&bfqd->lock, flags); ++ bfq_schedule_dispatch(bfqd); + } + + /* +@@ -4763,59 +4855,22 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) + { + struct bfq_data *bfqd = container_of(timer, struct bfq_data, + idle_slice_timer); +- struct bfq_queue *bfqq; +- unsigned long flags; +- enum bfqq_expiration reason; +- +- spin_lock_irqsave(bfqd->queue->queue_lock, flags); ++ struct bfq_queue *bfqq = bfqd->in_service_queue; + +- bfqq = bfqd->in_service_queue; + /* + * Theoretical race here: the in-service queue can be NULL or +- * different from the queue that was idling if the timer handler +- * spins on the queue_lock and a new request arrives for the +- * current queue and there is a full dispatch cycle that changes +- * the in-service queue. This can hardly happen, but in the worst +- * case we just expire a queue too early. ++ * different from the queue that was idling if a new request ++ * arrives for the current queue and there is a full dispatch ++ * cycle that changes the in-service queue. This can hardly ++ * happen, but in the worst case we just expire a queue too ++ * early. + */ +- if (bfqq) { +- bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); +- bfq_clear_bfqq_wait_request(bfqq); +- +- if (bfq_bfqq_budget_timeout(bfqq)) +- /* +- * Also here the queue can be safely expired +- * for budget timeout without wasting +- * guarantees +- */ +- reason = BFQ_BFQQ_BUDGET_TIMEOUT; +- else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) +- /* +- * The queue may not be empty upon timer expiration, +- * because we may not disable the timer when the +- * first request of the in-service queue arrives +- * during disk idling. 
+- */ +- reason = BFQ_BFQQ_TOO_IDLE; +- else +- goto schedule_dispatch; +- +- bfq_bfqq_expire(bfqd, bfqq, true, reason); +- } +- +-schedule_dispatch: +- bfq_schedule_dispatch(bfqd); ++ if (bfqq) ++ bfq_idle_slice_timer_body(bfqq); + +- spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); + return HRTIMER_NORESTART; + } + +-static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) +-{ +- hrtimer_cancel(&bfqd->idle_slice_timer); +- cancel_work_sync(&bfqd->unplug_work); +-} +- + static void __bfq_put_async_bfqq(struct bfq_data *bfqd, + struct bfq_queue **bfqq_ptr) + { +@@ -4852,28 +4907,40 @@ static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) + static void bfq_exit_queue(struct elevator_queue *e) + { + struct bfq_data *bfqd = e->elevator_data; +- struct request_queue *q = bfqd->queue; + struct bfq_queue *bfqq, *n; + +- bfq_shutdown_timer_wq(bfqd); +- +- spin_lock_irq(q->queue_lock); ++ hrtimer_cancel(&bfqd->idle_slice_timer); + + BUG_ON(bfqd->in_service_queue); +- list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) +- bfq_deactivate_bfqq(bfqd, bfqq, false, false); + +- spin_unlock_irq(q->queue_lock); ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { ++ if (bfqq->bic) /* bfqqs without bic are handled below */ ++ cancel_work_sync(&bfqq->bic->exit_icq_work); ++ } ++ ++ spin_lock_irq(&bfqd->lock); ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { ++ bfq_deactivate_bfqq(bfqd, bfqq, false, false); ++ /* ++ * Make sure that deferred exit_icq_work completes ++ * without errors for bfq_queues without bic ++ */ ++ if (!bfqq->bic) ++ bfqq->bfqd = NULL; ++ } ++ spin_unlock_irq(&bfqd->lock); + +- bfq_shutdown_timer_wq(bfqd); ++ hrtimer_cancel(&bfqd->idle_slice_timer); + + BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); + + #ifdef BFQ_GROUP_IOSCHED_ENABLED +- blkcg_deactivate_policy(q, &blkcg_policy_bfq); ++ blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); + #else ++ spin_lock_irq(&bfqd->lock); + bfq_put_async_queues(bfqd, bfqd->root_group); + kfree(bfqd->root_group); ++ spin_unlock_irq(&bfqd->lock); + #endif + + kfree(bfqd); +@@ -4934,10 +5001,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + + bfqd->queue = q; + +- spin_lock_irq(q->queue_lock); +- q->elevator = eq; +- spin_unlock_irq(q->queue_lock); +- + bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); + if (!bfqd->root_group) + goto out_free; +@@ -4951,8 +5014,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + bfqd->queue_weights_tree = RB_ROOT; + bfqd->group_weights_tree = RB_ROOT; + +- INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); +- + INIT_LIST_HEAD(&bfqd->active_list); + INIT_LIST_HEAD(&bfqd->idle_list); + INIT_HLIST_HEAD(&bfqd->burst_list); +@@ -5001,6 +5062,11 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; + bfqd->device_speed = BFQ_BFQD_FAST; + ++ spin_lock_init(&bfqd->lock); ++ INIT_LIST_HEAD(&bfqd->dispatch); ++ ++ q->elevator = eq; ++ + return 0; + + out_free: +@@ -5057,7 +5123,7 @@ static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) + num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", + bfqd->queued); + +- spin_lock_irq(bfqd->queue->queue_lock); ++ spin_lock_irq(&bfqd->lock); + + num_char += sprintf(page + num_char, "Active:\n"); + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { +@@ -5086,7 +5152,7 @@ static ssize_t bfq_weights_show(struct 
elevator_queue *e, char *page) + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + +- spin_unlock_irq(bfqd->queue->queue_lock); ++ spin_unlock_irq(&bfqd->lock); + + return num_char; + } +@@ -5294,35 +5360,31 @@ static struct elv_fs_entry bfq_attrs[] = { + __ATTR_NULL + }; + +-static struct elevator_type iosched_bfq = { +- .ops.sq = { +- .elevator_merge_fn = bfq_merge, +- .elevator_merged_fn = bfq_merged_request, +- .elevator_merge_req_fn = bfq_merged_requests, +-#ifdef BFQ_GROUP_IOSCHED_ENABLED +- .elevator_bio_merged_fn = bfq_bio_merged, +-#endif +- .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, +- .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, +- .elevator_dispatch_fn = bfq_dispatch_requests, +- .elevator_add_req_fn = bfq_insert_request, +- .elevator_activate_req_fn = bfq_activate_request, +- .elevator_deactivate_req_fn = bfq_deactivate_request, +- .elevator_completed_req_fn = bfq_completed_request, +- .elevator_former_req_fn = elv_rb_former_request, +- .elevator_latter_req_fn = elv_rb_latter_request, +- .elevator_init_icq_fn = bfq_init_icq, +- .elevator_exit_icq_fn = bfq_exit_icq, +- .elevator_set_req_fn = bfq_set_request, +- .elevator_put_req_fn = bfq_put_request, +- .elevator_may_queue_fn = bfq_may_queue, +- .elevator_init_fn = bfq_init_queue, +- .elevator_exit_fn = bfq_exit_queue, ++static struct elevator_type iosched_bfq_mq = { ++ .ops.mq = { ++ .get_rq_priv = bfq_get_rq_private, ++ .put_rq_priv = bfq_put_rq_private, ++ .init_icq = bfq_init_icq, ++ .exit_icq = bfq_exit_icq, ++ .insert_requests = bfq_insert_requests, ++ .dispatch_request = bfq_dispatch_request, ++ .next_request = elv_rb_latter_request, ++ .former_request = elv_rb_former_request, ++ .allow_merge = bfq_allow_bio_merge, ++ .bio_merge = bfq_bio_merge, ++ .request_merge = bfq_request_merge, ++ .requests_merged = bfq_requests_merged, ++ .request_merged = bfq_request_merged, ++ .has_work = bfq_has_work, ++ .init_sched = bfq_init_queue, ++ .exit_sched = bfq_exit_queue, + }, ++ ++ .uses_mq = true, + .icq_size = sizeof(struct bfq_io_cq), + .icq_align = __alignof__(struct bfq_io_cq), + .elevator_attrs = bfq_attrs, +- .elevator_name = "bfq-sq", ++ .elevator_name = "bfq-mq", + .elevator_owner = THIS_MODULE, + }; + +@@ -5392,7 +5454,7 @@ static int __init bfq_init(void) + device_speed_thresh[0] = (4 * R_slow[0]) / 3; + device_speed_thresh[1] = (4 * R_slow[1]) / 3; + +- ret = elv_register(&iosched_bfq); ++ ret = elv_register(&iosched_bfq_mq); + if (ret) + goto err_pol_unreg; + +@@ -5412,8 +5474,8 @@ static int __init bfq_init(void) + + static void __exit bfq_exit(void) + { +- elv_unregister(&iosched_bfq); +-#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ elv_unregister(&iosched_bfq_mq); ++#ifdef CONFIG_BFQ_GROUP_ENABLED + blkcg_policy_unregister(&blkcg_policy_bfq); + #endif + bfq_slab_kill(); +@@ -5422,5 +5484,6 @@ static void __exit bfq_exit(void) + module_init(bfq_init); + module_exit(bfq_exit); + +-MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); ++MODULE_AUTHOR("Paolo Valente"); + MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler"); +diff --git a/block/bfq-mq.h b/block/bfq-mq.h +index 0f51f270469c..c3fcd5ebd735 100644 +--- a/block/bfq-mq.h ++++ b/block/bfq-mq.h +@@ -19,15 +19,8 @@ + #include + #include + +-/* +- * Define an alternative macro to compile cgroups support. This is one +- * of the steps needed to let bfq-mq share the files bfq-sched.c and +- * bfq-cgroup.c with bfq-sq. 
For bfq-mq, the macro +- * BFQ_GROUP_IOSCHED_ENABLED will be defined as a function of whether +- * the configuration option CONFIG_BFQ_MQ_GROUP_IOSCHED, and not +- * CONFIG_BFQ_GROUP_IOSCHED, is defined. +- */ +-#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++/* see comments on CONFIG_BFQ_GROUP_IOSCHED in bfq.h */ ++#ifdef CONFIG_BFQ_MQ_GROUP_IOSCHED + #define BFQ_GROUP_IOSCHED_ENABLED + #endif + +@@ -259,8 +252,8 @@ struct bfq_queue { + struct request *next_rq; + /* number of sync and async requests queued */ + int queued[2]; +- /* number of sync and async requests currently allocated */ +- int allocated[2]; ++ /* number of requests currently allocated */ ++ int allocated; + /* number of pending metadata requests */ + int meta_pending; + /* fifo list of requests in sort_list */ +@@ -345,6 +338,8 @@ struct bfq_queue { + unsigned long wr_start_at_switch_to_srt; + + unsigned long split_time; /* time of last split */ ++ ++ spinlock_t lock; + }; + + /** +@@ -361,6 +356,9 @@ struct bfq_io_cq { + uint64_t blkcg_serial_nr; /* the current blkcg serial */ + #endif + ++ /* delayed work to exec the body of the exit_icq handler */ ++ struct work_struct exit_icq_work; ++ + /* + * Snapshot of the has_short_time flag before merging; taken + * to remember its value while the queue is merged, so as to +@@ -402,11 +400,13 @@ enum bfq_device_speed { + /** + * struct bfq_data - per-device data structure. + * +- * All the fields are protected by the @queue lock. ++ * All the fields are protected by @lock. + */ + struct bfq_data { +- /* request queue for the device */ ++ /* device request queue */ + struct request_queue *queue; ++ /* dispatch queue */ ++ struct list_head dispatch; + + /* root bfq_group for the device */ + struct bfq_group *root_group; +@@ -460,8 +460,6 @@ struct bfq_data { + * the queue in service. + */ + struct hrtimer idle_slice_timer; +- /* delayed work to restart dispatching on the request queue */ +- struct work_struct unplug_work; + + /* bfq_queue in service */ + struct bfq_queue *in_service_queue; +@@ -612,6 +610,8 @@ struct bfq_data { + + /* fallback dummy bfqq for extreme OOM conditions */ + struct bfq_queue oom_bfqq; ++ ++ spinlock_t lock; + }; + + enum bfqq_state_flags { +@@ -622,7 +622,6 @@ enum bfqq_state_flags { + * waiting for a request + * without idling the device + */ +- BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ + BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ + BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ + BFQ_BFQQ_FLAG_sync, /* synchronous queue */ +@@ -661,7 +660,6 @@ BFQ_BFQQ_FNS(just_created); + BFQ_BFQQ_FNS(busy); + BFQ_BFQQ_FNS(wait_request); + BFQ_BFQQ_FNS(non_blocking_wait_rq); +-BFQ_BFQQ_FNS(must_alloc); + BFQ_BFQQ_FNS(fifo_expire); + BFQ_BFQQ_FNS(has_short_ttime); + BFQ_BFQQ_FNS(sync); +@@ -692,7 +690,6 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ + char __pbuf[128]; \ + \ +- assert_spin_locked((bfqd)->queue->queue_lock); \ + blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ + pr_crit("%s bfq%d%c %s " fmt "\n", \ + checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ +@@ -734,7 +731,6 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + #define bfq_log_bfqq(bfqd, bfqq, fmt, args...)
do { \ + char __pbuf[128]; \ + \ +- assert_spin_locked((bfqd)->queue->queue_lock); \ + blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ + (bfqq)->pid, \ +@@ -961,7 +957,6 @@ static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) + + static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); + static void bfq_put_queue(struct bfq_queue *bfqq); +-static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); + static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bio *bio, bool is_sync, + struct bfq_io_cq *bic); + +From bde5235de2241502c1c00337bd51c96d9b60b6df Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Fri, 3 Mar 2017 08:52:40 +0100 +Subject: [PATCH 13/51] Add checks and extra log messages - Part I + +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 109 insertions(+), 3 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index c963d92a32c2..40eadb3f7073 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -773,6 +773,8 @@ static int bfqq_process_refs(struct bfq_queue *bfqq) + { + int process_refs, io_refs; + ++ lockdep_assert_held(&bfqq->bfqd->lock); ++ + io_refs = bfqq->allocated; + process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; + BUG_ON(process_refs < 0); +@@ -1483,6 +1485,8 @@ static void bfq_add_request(struct request *rq) + bfqq->queued[rq_is_sync(rq)]++; + bfqd->queued++; + ++ BUG_ON(!RQ_BFQQ(rq)); ++ BUG_ON(RQ_BFQQ(rq) != bfqq); + elv_rb_add(&bfqq->sort_list, rq); + + /* +@@ -1491,6 +1495,8 @@ static void bfq_add_request(struct request *rq) + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); + BUG_ON(!next_rq); ++ BUG_ON(!RQ_BFQQ(next_rq)); ++ BUG_ON(RQ_BFQQ(next_rq) != bfqq); + bfqq->next_rq = next_rq; + + /* +@@ -1615,6 +1621,19 @@ static void bfq_remove_request(struct request_queue *q, + + if (bfqq->next_rq == rq) { + bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); ++ if (bfqq->next_rq && !RQ_BFQQ(bfqq->next_rq)) { ++ pr_crit("no bfqq! for next rq %p bfqq %p\n", ++ bfqq->next_rq, bfqq); ++ } ++ ++ BUG_ON(bfqq->next_rq && !RQ_BFQQ(bfqq->next_rq)); ++ if (bfqq->next_rq && RQ_BFQQ(bfqq->next_rq) != bfqq) { ++ pr_crit( ++ "wrong bfqq! 
for next rq %p, rq_bfqq %p bfqq %p\n", ++ bfqq->next_rq, RQ_BFQQ(bfqq->next_rq), bfqq); ++ } ++ BUG_ON(bfqq->next_rq && RQ_BFQQ(bfqq->next_rq) != bfqq); ++ + bfq_updated_next_req(bfqd, bfqq); + } + +@@ -1701,6 +1720,8 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, + __rq = bfq_find_rq_fmerge(bfqd, bio, q); + if (__rq && elv_bio_merge_ok(__rq, bio)) { + *req = __rq; ++ bfq_log(bfqd, "request_merge: req %p", __rq); ++ + return ELEVATOR_FRONT_MERGE; + } + +@@ -1721,6 +1742,8 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, + + /* Reposition request in its sort_list */ + elv_rb_del(&bfqq->sort_list, req); ++ BUG_ON(!RQ_BFQQ(req)); ++ BUG_ON(RQ_BFQQ(req) != bfqq); + elv_rb_add(&bfqq->sort_list, req); + + spin_lock_irq(&bfqd->lock); +@@ -1729,7 +1752,13 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, + bfqd->last_position); + BUG_ON(!next_rq); ++ + bfqq->next_rq = next_rq; ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "requests_merged: req %p prev %p next_rq %p bfqq %p", ++ req, prev, next_rq, bfqq); ++ + /* + * If next_rq changes, update both the queue's budget to + * fit the new request and the queue's position in its +@@ -1748,8 +1777,16 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, + { + struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); + ++ BUG_ON(!RQ_BFQQ(rq)); ++ BUG_ON(!RQ_BFQQ(next)); ++ + if (!RB_EMPTY_NODE(&rq->rb_node)) + goto end; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "requests_merged: rq %p next %p bfqq %p next_bfqq %p", ++ rq, next, bfqq, next_bfqq); ++ + spin_lock_irq(&bfqq->bfqd->lock); + + /* +@@ -3847,6 +3884,9 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) + { + struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + ++ bfq_log(bfqd, "has_work, dispatch_non_empty %d busy_queues %d", ++ !list_empty_careful(&bfqd->dispatch), bfqd->busy_queues > 0); ++ + /* + * Avoiding lock: a race on bfqd->busy_queues should cause at + * most a call to dispatch for nothing +@@ -3865,6 +3905,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + rq = list_first_entry(&bfqd->dispatch, struct request, + queuelist); + list_del_init(&rq->queuelist); ++ bfq_log(bfqd, ++ "dispatch requests: picked %p from dispatch list", rq); + goto exit; + } + +@@ -3904,7 +3946,20 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + if (rq) { + rq->rq_flags |= RQF_STARTED; + bfqd->rq_in_driver++; +- } ++ if (bfqq) ++ bfq_log_bfqq(bfqd, bfqq, ++ "dispatched %s request %p, rq_in_driver %d", ++ bfq_bfqq_sync(bfqq) ? 
"sync" : "async", ++ rq, ++ bfqd->rq_in_driver); ++ else ++ bfq_log(bfqd, ++ "dispatched request %p from dispatch list, rq_in_driver %d", ++ rq, bfqd->rq_in_driver); ++ } else ++ bfq_log(bfqd, ++ "returned NULL request, rq_in_driver %d", ++ bfqd->rq_in_driver); + + return rq; + } +@@ -3944,6 +3999,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) + return; + + BUG_ON(rb_first(&bfqq->sort_list)); ++ BUG_ON(bfqq->allocated != 0); + BUG_ON(bfqq->entity.tree); + BUG_ON(bfq_bfqq_busy(bfqq)); + +@@ -4043,6 +4099,7 @@ static void bfq_exit_icq(struct io_cq *icq) + { + struct bfq_io_cq *bic = icq_to_bic(icq); + ++ BUG_ON(!bic); + kblockd_schedule_work(&bic->exit_icq_work); + } + +@@ -4057,6 +4114,7 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, + int ioprio_class; + struct bfq_data *bfqd = bfqq->bfqd; + ++ WARN_ON(!bfqd); + if (!bfqd) + return; + +@@ -4404,6 +4462,10 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) + { + struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; + ++ assert_spin_locked(&bfqd->lock); ++ ++ bfq_log_bfqq(bfqd, bfqq, "__insert_req: rq %p bfqq %p", rq, bfqq); ++ + /* + * An unplug may trigger a requeue of a request from the device + * driver: make sure we are in process context while trying to +@@ -4420,6 +4482,12 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) + */ + new_bfqq->allocated++; + bfqq->allocated--; ++ bfq_log_bfqq(bfqd, bfqq, ++ "insert_request: new allocated %d", bfqq->allocated); ++ bfq_log_bfqq(bfqd, new_bfqq, ++ "insert_request: new_bfqq new allocated %d", ++ bfqq->allocated); ++ + new_bfqq->ref++; + bfq_clear_bfqq_just_created(bfqq); + if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) +@@ -4529,6 +4597,10 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) + bfqd->rq_in_driver--; + bfqq->dispatched--; + ++ bfq_log_bfqq(bfqd, bfqq, ++ "completed_requests: new disp %d, new rq_in_driver %d", ++ bfqq->dispatched, bfqd->rq_in_driver); ++ + if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + /* +@@ -4618,6 +4690,9 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) + + static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) + { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "put_request_body: allocated %d", bfqq->allocated); ++ BUG_ON(!bfqq->allocated); + bfqq->allocated--; + + bfq_put_queue(bfqq); +@@ -4625,8 +4700,27 @@ static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) + + static void bfq_put_rq_private(struct request_queue *q, struct request *rq) + { +- struct bfq_queue *bfqq = RQ_BFQQ(rq); +- struct bfq_data *bfqd = bfqq->bfqd; ++ struct bfq_queue *bfqq; ++ struct bfq_data *bfqd; ++ struct bfq_io_cq *bic; ++ ++ BUG_ON(!rq); ++ bfqq = RQ_BFQQ(rq); ++ BUG_ON(!bfqq); ++ ++ bic = RQ_BIC(rq); ++ BUG_ON(!bic); ++ ++ bfqd = bfqq->bfqd; ++ BUG_ON(!bfqd); ++ ++ BUG_ON(rq->rq_flags & RQF_QUEUED); ++ BUG_ON(!(rq->rq_flags & RQF_ELVPRIV)); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "putting rq %p with %u sects left, STARTED %d", ++ rq, blk_rq_sectors(rq), ++ rq->rq_flags & RQF_STARTED); + + if (rq->rq_flags & RQF_STARTED) + bfqg_stats_update_completion(bfqq_group(bfqq), +@@ -4634,6 +4728,8 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) + rq_io_start_time_ns(rq), + rq->cmd_flags); + ++ BUG_ON(blk_rq_sectors(rq) == 0 && !(rq->rq_flags & RQF_STARTED)); ++ + if (likely(rq->rq_flags & RQF_STARTED)) { + unsigned long flags; + +@@ -4655,7 +4751,9 @@ static void 
bfq_put_rq_private(struct request_queue *q, struct request *rq) + * cause any deadlock, even if other locks are already + * (correctly) held by this process. + */ ++ BUG_ON(in_interrupt()); + ++ assert_spin_locked(&bfqd->lock); + if (!RB_EMPTY_NODE(&rq->rb_node)) + bfq_remove_request(q, rq); + bfq_put_rq_priv_body(bfqq); +@@ -4814,7 +4912,9 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) + enum bfqq_expiration reason; + unsigned long flags; + ++ BUG_ON(!bfqd); + spin_lock_irqsave(&bfqd->lock, flags); ++ bfq_log_bfqq(bfqd, bfqq, "handling slice_timer expiration"); + bfq_clear_bfqq_wait_request(bfqq); + + if (bfqq != bfqd->in_service_queue) { +@@ -4857,6 +4957,8 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) + idle_slice_timer); + struct bfq_queue *bfqq = bfqd->in_service_queue; + ++ bfq_log(bfqd, "slice_timer expired"); ++ + /* + * Theoretical race here: the in-service queue can be NULL or + * different from the queue that was idling if a new request +@@ -4909,9 +5011,12 @@ static void bfq_exit_queue(struct elevator_queue *e) + struct bfq_data *bfqd = e->elevator_data; + struct bfq_queue *bfqq, *n; + ++ bfq_log(bfqd, "exit_queue: starting ..."); ++ + hrtimer_cancel(&bfqd->idle_slice_timer); + + BUG_ON(bfqd->in_service_queue); ++ BUG_ON(!list_empty(&bfqd->active_list)); + + list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { + if (bfqq->bic) /* bfqqs without bic are handled below */ +@@ -4943,6 +5048,7 @@ static void bfq_exit_queue(struct elevator_queue *e) + spin_unlock_irq(&bfqd->lock); + #endif + ++ bfq_log(bfqd, "exit_queue: finished ..."); + kfree(bfqd); + } + + +From 7f59486861e368d25f59d4136cf8e51a75b7edf9 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Thu, 9 Feb 2017 10:36:27 +0100 +Subject: [PATCH 14/51] Add lock check in bfq_allow_bio_merge + +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 40eadb3f7073..21b876aeba16 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -2279,6 +2279,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, + if (!bic) + return false; + ++ assert_spin_locked(&bfqd->lock); + bfqq = bic_to_bfqq(bic, is_sync); + /* + * We take advantage of this function to perform an early merge + +From a2dd19a4d95cf401268c144c79ce549c7fc4bbca Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Tue, 7 Feb 2017 15:14:29 +0100 +Subject: [PATCH 15/51] bfq-mq: execute exit_icq operations immediately + +Exploiting Omar's patch that removes the taking of the queue lock in +put_io_context_active, this patch moves the operations of the bfq_exit_icq +hook back from a deferred work item to the body of the function.
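+
+For orientation, the change reduces to the following before/after
+shape (a condensed sketch of the diff that follows; the function and
+field names are the ones used there, all other details are elided):
+
+	/* before: the hook only defers the real teardown to kblockd */
+	static void bfq_exit_icq(struct io_cq *icq)
+	{
+		struct bfq_io_cq *bic = icq_to_bic(icq);
+
+		kblockd_schedule_work(&bic->exit_icq_work);
+	}
+
+	/*
+	 * after: no conflicting lock can be held by the caller anymore,
+	 * so the body runs inline, and exit_icq_work (together with its
+	 * INIT_WORK in bfq_init_icq) can be dropped
+	 */
+	static void bfq_exit_icq(struct io_cq *icq)
+	{
+		struct bfq_io_cq *bic = icq_to_bic(icq);
+
+		bfq_exit_icq_bfqq(bic, true);	/* sync bfq_queue */
+		bfq_exit_icq_bfqq(bic, false);	/* async bfq_queue */
+	}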
+ +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 34 +++------------------------------- + block/bfq-mq.h | 3 --- + 2 files changed, 3 insertions(+), 34 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 21b876aeba16..1deb79a47181 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -4080,28 +4080,13 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) + } + } + +-static void bfq_exit_icq_body(struct work_struct *work) +-{ +- struct bfq_io_cq *bic = +- container_of(work, struct bfq_io_cq, exit_icq_work); +- +- bfq_exit_icq_bfqq(bic, true); +- bfq_exit_icq_bfqq(bic, false); +-} +- +-static void bfq_init_icq(struct io_cq *icq) +-{ +- struct bfq_io_cq *bic = icq_to_bic(icq); +- +- INIT_WORK(&bic->exit_icq_work, bfq_exit_icq_body); +-} +- + static void bfq_exit_icq(struct io_cq *icq) + { + struct bfq_io_cq *bic = icq_to_bic(icq); + + BUG_ON(!bic); +- kblockd_schedule_work(&bic->exit_icq_work); ++ bfq_exit_icq_bfqq(bic, true); ++ bfq_exit_icq_bfqq(bic, false); + } + + /* +@@ -5019,21 +5004,9 @@ static void bfq_exit_queue(struct elevator_queue *e) + BUG_ON(bfqd->in_service_queue); + BUG_ON(!list_empty(&bfqd->active_list)); + +- list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { +- if (bfqq->bic) /* bfqqs without bic are handled below */ +- cancel_work_sync(&bfqq->bic->exit_icq_work); +- } +- + spin_lock_irq(&bfqd->lock); +- list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) + bfq_deactivate_bfqq(bfqd, bfqq, false, false); +- /* +- * Make sure that deferred exit_icq_work completes +- * without errors for bfq_queues without bic +- */ +- if (!bfqq->bic) +- bfqq->bfqd = NULL; +- } + spin_unlock_irq(&bfqd->lock); + + hrtimer_cancel(&bfqd->idle_slice_timer); +@@ -5471,7 +5444,6 @@ static struct elevator_type iosched_bfq_mq = { + .ops.mq = { + .get_rq_priv = bfq_get_rq_private, + .put_rq_priv = bfq_put_rq_private, +- .init_icq = bfq_init_icq, + .exit_icq = bfq_exit_icq, + .insert_requests = bfq_insert_requests, + .dispatch_request = bfq_dispatch_request, +diff --git a/block/bfq-mq.h b/block/bfq-mq.h +index c3fcd5ebd735..23744b246db6 100644 +--- a/block/bfq-mq.h ++++ b/block/bfq-mq.h +@@ -356,9 +356,6 @@ struct bfq_io_cq { + uint64_t blkcg_serial_nr; /* the current blkcg serial */ + #endif + +- /* delayed work to exec the body of the exit_icq handler */ +- struct work_struct exit_icq_work; +- + /* + * Snapshot of the has_short_time flag before merging; taken + * to remember its value while the queue is merged, so as to + +From ab7e78a0ff095101de74e700f8743295a500bb20 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Tue, 21 Feb 2017 10:26:22 +0100 +Subject: [PATCH 16/51] Unnest request-queue and ioc locks from scheduler locks + +In some bio-merging functions, the request-queue lock needs to be +taken, to look up the bic associated with the process that issued +the bio that may need to be merged. In addition, put_io_context must +be invoked in some other functions, and put_io_context may cause the +lock of the involved ioc to be taken. In both cases, these extra +request-queue or ioc locks are taken, or might be taken, while the +scheduler lock is being held. In this respect, there are other code +paths, in part external to bfq-mq, in which the same locks are taken +(nested) in the opposite order, i.e., the scheduler lock is taken +while the request-queue or the ioc lock is being held.
This +leads to circular deadlocks. + +This commit addresses this issue by modifying the logic of the above +functions, so as to let the lookup and put_io_context be performed, +and thus the extra locks be taken, outside the critical sections +protected by the scheduler lock. + +Signed-off-by: Paolo Valente +--- + block/bfq-cgroup-included.c | 9 ++ + block/bfq-mq-iosched.c | 264 ++++++++++++++++++++++++++++---------------- + block/bfq-mq.h | 25 ++++- + block/bfq-sched.c | 11 ++ + 4 files changed, 213 insertions(+), 96 deletions(-) + +diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c +index 8a73de76f32b..cf59eeb7f08e 100644 +--- a/block/bfq-cgroup-included.c ++++ b/block/bfq-cgroup-included.c +@@ -716,6 +716,9 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) + struct bfq_group *bfqg; + struct bfq_data *bfqd; + struct bfq_entity *entity; ++#ifdef BFQ_MQ ++ unsigned long flags; ++#endif + int i; + + BUG_ON(!pd); +@@ -729,6 +732,9 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) + if (!entity) /* root group */ + return; + ++#ifdef BFQ_MQ ++ spin_lock_irqsave(&bfqd->lock, flags); ++#endif + /* + * Empty all service_trees belonging to this group before + * deactivating the group itself. +@@ -766,6 +772,9 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) + __bfq_deactivate_entity(entity, false); + bfq_put_async_queues(bfqd, bfqg); + ++#ifdef BFQ_MQ ++ bfq_unlock_put_ioc_restore(bfqd, flags); ++#endif + /* + * @blkg is going offline and will be ignored by + * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 1deb79a47181..69ef3761c95d 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -233,6 +233,7 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, + return NULL; + } + ++#define BFQ_MQ + #include "bfq-sched.c" + #include "bfq-cgroup-included.c" + +@@ -1564,15 +1565,9 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, + struct bio *bio, + struct request_queue *q) + { +- struct task_struct *tsk = current; +- struct bfq_io_cq *bic; +- struct bfq_queue *bfqq; ++ struct bfq_queue *bfqq = bfqd->bio_bfqq; + +- bic = bfq_bic_lookup(bfqd, tsk->io_context, q); +- if (!bic) +- return NULL; + +- bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); + if (bfqq) + return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); + +@@ -1693,9 +1688,26 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) + struct request_queue *q = hctx->queue; + struct bfq_data *bfqd = q->elevator->elevator_data; + struct request *free = NULL; ++ /* ++ * bfq_bic_lookup grabs the queue_lock: invoke it now and ++ * store its return value for later use, to avoid nesting ++ * queue_lock inside the bfqd->lock. We assume that the bic ++ * returned by bfq_bic_lookup does not go away before ++ * bfqd->lock is taken. 
++ */ ++ struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q); + bool ret; + + spin_lock_irq(&bfqd->lock); ++ ++ if (bic) ++ bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); ++ else ++ bfqd->bio_bfqq = NULL; ++ bfqd->bio_bic = bic; ++ /* Set next flag just for testing purposes */ ++ bfqd->bio_bfqq_set = true; ++ + ret = blk_mq_sched_try_merge(q, bio, &free); + + /* +@@ -1706,6 +1718,7 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) + */ + if (free) + blk_mq_free_request(free); ++ bfqd->bio_bfqq_set = false; + spin_unlock_irq(&bfqd->lock); + + return ret; +@@ -2261,8 +2274,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, + { + struct bfq_data *bfqd = q->elevator->elevator_data; + bool is_sync = op_is_sync(bio->bi_opf); +- struct bfq_io_cq *bic; +- struct bfq_queue *bfqq, *new_bfqq; ++ struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq; + + /* + * Disallow merge of a sync bio into an async request. +@@ -2273,31 +2285,40 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, + /* + * Lookup the bfqq that this bio will be queued with. Allow + * merge only if rq is queued there. +- * Queue lock is held here. + */ +- bic = bfq_bic_lookup(bfqd, current->io_context, q); +- if (!bic) ++ if (!bfqq) + return false; + +- assert_spin_locked(&bfqd->lock); +- bfqq = bic_to_bfqq(bic, is_sync); + /* + * We take advantage of this function to perform an early merge + * of the queues of possible cooperating processes. + */ +- if (bfqq) { +- new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); +- if (new_bfqq) { +- bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); +- /* +- * If we get here, the bio will be queued in the +- * shared queue, i.e., new_bfqq, so use new_bfqq +- * to decide whether bio and rq can be merged. +- */ +- bfqq = new_bfqq; +- } +- } ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); ++ if (new_bfqq) { ++ /* ++ * bic still points to bfqq, then it has not yet been ++ * redirected to some other bfq_queue, and a queue ++ * merge between bfqq and new_bfqq can be safely ++ * fulfilled, i.e., bic can be redirected to new_bfqq ++ * and bfqq can be put. ++ */ ++ bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq, ++ new_bfqq); ++ /* ++ * If we get here, bio will be queued into new_bfqq, ++ * so use new_bfqq to decide whether bio and rq can be ++ * merged. ++ */ ++ bfqq = new_bfqq; + ++ /* ++ * Change also bfqd->bio_bfqq, as ++ * bfqd->bio_bic now points to new_bfqq, and ++ * this function may be invoked again (and then may ++ * use again bfqd->bio_bfqq). ++ */ ++ bfqd->bio_bfqq = bfqq; ++ } + return bfqq == RQ_BFQQ(rq); + } + +@@ -3965,14 +3986,43 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + return rq; + } + ++/* ++ * The next two functions release bfqd->lock and put the io context ++ * pointed to by bfqd->ioc_to_put. This delayed put is used so as not ++ * to risk taking an ioc->lock while the scheduler lock is being held.
++ */ ++static void bfq_unlock_put_ioc(struct bfq_data *bfqd) ++{ ++ struct io_context *ioc_to_put = bfqd->ioc_to_put; ++ ++ bfqd->ioc_to_put = NULL; ++ spin_unlock_irq(&bfqd->lock); ++ ++ if (ioc_to_put) ++ put_io_context(ioc_to_put); ++} ++ ++static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd, ++ unsigned long flags) ++{ ++ struct io_context *ioc_to_put = bfqd->ioc_to_put; ++ ++ bfqd->ioc_to_put = NULL; ++ spin_unlock_irqrestore(&bfqd->lock, flags); ++ ++ if (ioc_to_put) ++ put_io_context(ioc_to_put); ++} ++ + static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + { + struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + struct request *rq; + + spin_lock_irq(&bfqd->lock); ++ + rq = __bfq_dispatch_request(hctx); +- spin_unlock_irq(&bfqd->lock); ++ bfq_unlock_put_ioc(bfqd); + + return rq; + } +@@ -3981,7 +4031,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + * Task holds one reference to the queue, dropped when task exits. Each rq + * in-flight on this queue also holds a reference, dropped when rq is freed. + * +- * Queue lock must be held here. Recall not to use bfqq after calling ++ * Scheduler lock must be held here. Recall not to use bfqq after calling + * this function on it. + */ + static void bfq_put_queue(struct bfq_queue *bfqq) +@@ -4066,17 +4116,23 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) + bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ + + if (bfqq && bfqd) { +- spin_lock_irq(&bfqd->lock); ++ unsigned long flags; ++ ++ spin_lock_irqsave(&bfqd->lock, flags); + /* +- * If the bic is using a shared queue, put the reference +- * taken on the io_context when the bic started using a +- * shared bfq_queue. ++ * If the bic is using a shared queue, put the ++ * reference taken on the io_context when the bic ++ * started using a shared bfq_queue. This put cannot ++ * make ioc->ref_count reach 0, then no ioc->lock ++ * risks to be taken (leading to possible deadlock ++ * scenarios). + */ + if (is_sync && bfq_bfqq_coop(bfqq)) + put_io_context(bic->icq.ioc); ++ + bfq_exit_bfqq(bfqd, bfqq); + bic_set_bfqq(bic, NULL, is_sync); +- spin_unlock_irq(&bfqd->lock); ++ bfq_unlock_put_ioc_restore(bfqd, flags); + } + } + +@@ -4183,8 +4239,6 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + INIT_HLIST_NODE(&bfqq->burst_list_node); + BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); + +- spin_lock_init(&bfqq->lock); +- + bfqq->ref = 0; + bfqq->bfqd = bfqd; + +@@ -4476,6 +4530,14 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) + + new_bfqq->ref++; + bfq_clear_bfqq_just_created(bfqq); ++ /* ++ * If the bic associated with the process ++ * issuing this request still points to bfqq ++ * (and thus has not been already redirected ++ * to new_bfqq or even some other bfq_queue), ++ * then complete the merge and redirect it to ++ * new_bfqq. 
++ */ + if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) + bfq_merge_bfqqs(bfqd, RQ_BIC(rq), + bfqq, new_bfqq); +@@ -4498,14 +4560,17 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) + } + + static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, +- bool at_head) ++ bool at_head) + { + struct request_queue *q = hctx->queue; + struct bfq_data *bfqd = q->elevator->elevator_data; + + spin_lock_irq(&bfqd->lock); +- if (blk_mq_sched_try_insert_merge(q, rq)) +- goto done; ++ if (blk_mq_sched_try_insert_merge(q, rq)) { ++ spin_unlock_irq(&bfqd->lock); ++ return; ++ } ++ + spin_unlock_irq(&bfqd->lock); + + blk_mq_sched_request_inserted(rq); +@@ -4530,8 +4595,8 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + q->last_merge = rq; + } + } +-done: +- spin_unlock_irq(&bfqd->lock); ++ ++ bfq_unlock_put_ioc(bfqd); + } + + static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, +@@ -4724,7 +4789,7 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) + bfq_completed_request(bfqq, bfqd); + bfq_put_rq_priv_body(bfqq); + +- spin_unlock_irqrestore(&bfqd->lock, flags); ++ bfq_unlock_put_ioc_restore(bfqd, flags); + } else { + /* + * Request rq may be still/already in the scheduler, +@@ -4732,10 +4797,10 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) + * defer such a check and removal, to avoid + * inconsistencies in the time interval from the end + * of this function to the start of the deferred work. +- * Fortunately, this situation occurs only in process +- * context, so taking the scheduler lock does not +- * cause any deadlock, even if other locks are already +- * (correctly) held by this process. ++ * This situation seems to occur only in process ++ * context, as a consequence of a merge. In the ++ * current version of the code, this implies that the ++ * lock is held. + */ + BUG_ON(in_interrupt()); + +@@ -4758,8 +4823,6 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) + { + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); + +- put_io_context(bic->icq.ioc); +- + if (bfqq_process_refs(bfqq) == 1) { + bfqq->pid = current->pid; + bfq_clear_bfqq_coop(bfqq); +@@ -4775,6 +4838,41 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) + return NULL; + } + ++static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, ++ struct bfq_io_cq *bic, ++ struct bio *bio, ++ bool split, bool is_sync, ++ bool *new_queue) ++{ ++ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); ++ ++ if (likely(bfqq && bfqq != &bfqd->oom_bfqq)) ++ return bfqq; ++ ++ if (new_queue) ++ *new_queue = true; ++ ++ if (bfqq) ++ bfq_put_queue(bfqq); ++ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); ++ ++ bic_set_bfqq(bic, bfqq, is_sync); ++ if (split && is_sync) { ++ if ((bic->was_in_burst_list && bfqd->large_burst) || ++ bic->saved_in_large_burst) ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ else { ++ bfq_clear_bfqq_in_large_burst(bfqq); ++ if (bic->was_in_burst_list) ++ hlist_add_head(&bfqq->burst_list_node, ++ &bfqd->burst_list); ++ } ++ bfqq->split_time = jiffies; ++ } ++ ++ return bfqq; ++} ++ + /* + * Allocate bfq data structures associated with this request. 
+ */ +@@ -4786,6 +4884,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, + const int is_sync = rq_is_sync(rq); + struct bfq_queue *bfqq; + bool bfqq_already_existing = false, split = false; ++ bool new_queue = false; + + spin_lock_irq(&bfqd->lock); + +@@ -4796,42 +4895,10 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, + + bfq_bic_update_cgroup(bic, bio); + +-new_queue: +- bfqq = bic_to_bfqq(bic, is_sync); +- if (!bfqq || bfqq == &bfqd->oom_bfqq) { +- if (bfqq) +- bfq_put_queue(bfqq); +- bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); +- BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); ++ bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, ++ &new_queue); + +- bic_set_bfqq(bic, bfqq, is_sync); +- if (split && is_sync) { +- bfq_log_bfqq(bfqd, bfqq, +- "get_request: was_in_list %d " +- "was_in_large_burst %d " +- "large burst in progress %d", +- bic->was_in_burst_list, +- bic->saved_in_large_burst, +- bfqd->large_burst); +- +- if ((bic->was_in_burst_list && bfqd->large_burst) || +- bic->saved_in_large_burst) { +- bfq_log_bfqq(bfqd, bfqq, +- "get_request: marking in " +- "large burst"); +- bfq_mark_bfqq_in_large_burst(bfqq); +- } else { +- bfq_log_bfqq(bfqd, bfqq, +- "get_request: clearing in " +- "large burst"); +- bfq_clear_bfqq_in_large_burst(bfqq); +- if (bic->was_in_burst_list) +- hlist_add_head(&bfqq->burst_list_node, +- &bfqd->burst_list); +- } +- bfqq->split_time = jiffies; +- } +- } else { ++ if (unlikely(!new_queue)) { + /* If the queue was seeky for too long, break it apart. */ + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); +@@ -4841,9 +4908,19 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, + bic->saved_in_large_burst = true; + + bfqq = bfq_split_bfqq(bic, bfqq); +- split = true; ++ /* ++ * A reference to bic->icq.ioc needs to be ++ * released after a queue split. Do not do it ++ * immediately, so as not to risk taking ++ * an ioc->lock while holding the scheduler ++ * lock. ++ */ ++ bfqd->ioc_to_put = bic->icq.ioc; ++ + if (!bfqq) +- goto new_queue; ++ bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, ++ true, is_sync, ++ NULL); + else + bfqq_already_existing = true; + } + } + +@@ -4861,18 +4938,17 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, + + /* + * If a bfq_queue has only one process reference, it is owned +- * by only one bfq_io_cq: we can set the bic field of the +- * bfq_queue to the address of that structure. Also, if the +- * queue has just been split, mark a flag so that the +- * information is available to the other scheduler hooks. ++ * by only this bic: we can then set bfqq->bic = bic. In ++ * addition, if the queue has also just been split, we have to ++ * resume its state. + */ + if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { + bfqq->bic = bic; +- if (split) { ++ if (bfqd->ioc_to_put) { /* if true, then there has been a split */ + /* +- * If the queue has just been split from a shared +- * queue, restore the idle window and the possible +- * weight raising period. ++ * The queue has just been split from a shared ++ * queue: restore the idle window and the ++ * possible weight raising period.
+ */ + bfq_bfqq_resume_state(bfqq, bfqd, bic, + bfqq_already_existing); +@@ -4882,7 +4958,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, + if (unlikely(bfq_bfqq_just_created(bfqq))) + bfq_handle_burst(bfqd, bfqq); + +- spin_unlock_irq(&bfqd->lock); ++ bfq_unlock_put_ioc(bfqd); + + return 0; + +@@ -4929,7 +5005,7 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) + bfq_bfqq_expire(bfqd, bfqq, true, reason); + + schedule_dispatch: +- spin_unlock_irqrestore(&bfqd->lock, flags); ++ bfq_unlock_put_ioc_restore(bfqd, flags); + bfq_schedule_dispatch(bfqd); + } + +diff --git a/block/bfq-mq.h b/block/bfq-mq.h +index 23744b246db6..bd83f1c02573 100644 +--- a/block/bfq-mq.h ++++ b/block/bfq-mq.h +@@ -338,8 +338,6 @@ struct bfq_queue { + unsigned long wr_start_at_switch_to_srt; + + unsigned long split_time; /* time of last split */ +- +- spinlock_t lock; + }; + + /** +@@ -609,6 +607,29 @@ struct bfq_data { + struct bfq_queue oom_bfqq; + + spinlock_t lock; ++ ++ /* ++ * bic associated with the task issuing current bio for ++ * merging. This and the next field are used to support ++ * performing the bic lookup, needed by bio-merge functions, ++ * before the scheduler lock is taken, and thus to avoid ++ * taking the request-queue lock while the scheduler lock is ++ * being held. ++ */ ++ struct bfq_io_cq *bio_bic; ++ /* bfqq associated with the task issuing current bio for merging */ ++ struct bfq_queue *bio_bfqq; ++ /* Extra flag used only for TESTING */ ++ bool bio_bfqq_set; ++ ++ /* ++ * io context to put right after bfqd->lock is released. This ++ * field is used to perform put_io_context, when needed, ++ * after the scheduler lock has been released, and thus ++ * prevent an ioc->lock from being possibly taken while the ++ * scheduler lock is being held. ++ */ ++ struct io_context *ioc_to_put; + }; + + enum bfqq_state_flags { +diff --git a/block/bfq-sched.c b/block/bfq-sched.c +index b54a638186e3..a5c8b4acd33c 100644 +--- a/block/bfq-sched.c ++++ b/block/bfq-sched.c +@@ -1905,7 +1905,18 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) + struct bfq_entity *entity = in_serv_entity; + + if (bfqd->in_service_bic) { ++#ifdef BFQ_MQ ++ /* ++ * Schedule the release of a reference to ++ * bfqd->in_service_bic->icq.ioc to right after the ++ * scheduler lock is released. This ioc is not ++ * released immediately, so as not to risk taking ++ * an ioc->lock while holding the scheduler lock.
++ */ ++ bfqd->ioc_to_put = bfqd->in_service_bic->icq.ioc; ++#else + put_io_context(bfqd->in_service_bic->icq.ioc); ++#endif + bfqd->in_service_bic = NULL; + } + + +From 84cc7140cb4f0574710625f51abbb076a1dd2920 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Fri, 3 Mar 2017 09:31:14 +0100 +Subject: [PATCH 17/51] Add checks and extra log messages - Part II + +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 42 ++++++++++++++++++++++++++++++++++++++++-- + block/bfq-sched.c | 1 + + 2 files changed, 41 insertions(+), 2 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 69ef3761c95d..5707d42b160d 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -1567,6 +1567,7 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, + { + struct bfq_queue *bfqq = bfqd->bio_bfqq; + ++ BUG_ON(!bfqd->bio_bfqq_set); + + if (bfqq) + return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); +@@ -1719,6 +1720,7 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) + if (free) + blk_mq_free_request(free); + bfqd->bio_bfqq_set = false; ++ BUG_ON(bfqd->ioc_to_put); + spin_unlock_irq(&bfqd->lock); + + return ret; +@@ -1781,6 +1783,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, + bfq_updated_next_req(bfqd, bfqq); + bfq_pos_tree_add_move(bfqd, bfqq); + } ++ BUG_ON(bfqd->ioc_to_put); + spin_unlock_irq(&bfqd->lock); + } + } +@@ -1824,6 +1827,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, + + bfq_remove_request(q, next); + ++ BUG_ON(bfqq->bfqd->ioc_to_put); + spin_unlock_irq(&bfqq->bfqd->lock); + end: + bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); +@@ -2195,9 +2199,11 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, + { + bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", + (unsigned long) new_bfqq->pid); ++ BUG_ON(bfqq->bic && bfqq->bic == new_bfqq->bic); + /* Save weight raising and idle window of the merged queues */ + bfq_bfqq_save_state(bfqq); + bfq_bfqq_save_state(new_bfqq); ++ + if (bfq_bfqq_IO_bound(bfqq)) + bfq_mark_bfqq_IO_bound(new_bfqq); + bfq_clear_bfqq_IO_bound(bfqq); +@@ -2276,6 +2282,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, + bool is_sync = op_is_sync(bio->bi_opf); + struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq; + ++ assert_spin_locked(&bfqd->lock); + /* + * Disallow merge of a sync bio into an async request. + */ +@@ -2286,6 +2293,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, + * Lookup the bfqq that this bio will be queued with. Allow + * merge only if rq is queued there. + */ ++ BUG_ON(!bfqd->bio_bfqq_set); + if (!bfqq) + return false; + +@@ -2294,6 +2302,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, + * of the queues of possible cooperating processes. 
+ */ + new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); ++ BUG_ON(new_bfqq == bfqq); + if (new_bfqq) { + /* + * bic still points to bfqq, then it has not yet been +@@ -4040,6 +4049,8 @@ static void bfq_put_queue(struct bfq_queue *bfqq) + struct bfq_group *bfqg = bfqq_group(bfqq); + #endif + ++ assert_spin_locked(&bfqq->bfqd->lock); ++ + BUG_ON(bfqq->ref <= 0); + + if (bfqq->bfqd) +@@ -4119,6 +4130,7 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) + unsigned long flags; + + spin_lock_irqsave(&bfqd->lock, flags); ++ BUG_ON(bfqd->ioc_to_put); + /* + * If the bic is using a shared queue, put the + * reference taken on the io_context when the bic +@@ -4567,10 +4579,12 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + + spin_lock_irq(&bfqd->lock); + if (blk_mq_sched_try_insert_merge(q, rq)) { ++ BUG_ON(bfqd->ioc_to_put); + spin_unlock_irq(&bfqd->lock); + return; + } + ++ BUG_ON(bfqd->ioc_to_put); + spin_unlock_irq(&bfqd->lock); + + blk_mq_sched_request_inserted(rq); +@@ -4785,6 +4799,7 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) + unsigned long flags; + + spin_lock_irqsave(&bfqd->lock, flags); ++ BUG_ON(bfqd->ioc_to_put); + + bfq_completed_request(bfqq, bfqd); + bfq_put_rq_priv_body(bfqq); +@@ -4855,13 +4870,28 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, + if (bfqq) + bfq_put_queue(bfqq); + bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); ++ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); + + bic_set_bfqq(bic, bfqq, is_sync); + if (split && is_sync) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "get_request: was_in_list %d " ++ "was_in_large_burst %d " ++ "large burst in progress %d", ++ bic->was_in_burst_list, ++ bic->saved_in_large_burst, ++ bfqd->large_burst); ++ + if ((bic->was_in_burst_list && bfqd->large_burst) || +- bic->saved_in_large_burst) ++ bic->saved_in_large_burst) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "get_request: marking in " ++ "large burst"); + bfq_mark_bfqq_in_large_burst(bfqq); +- else { ++ } else { ++ bfq_log_bfqq(bfqd, bfqq, ++ "get_request: clearing in " ++ "large burst"); + bfq_clear_bfqq_in_large_burst(bfqq); + if (bic->was_in_burst_list) + hlist_add_head(&bfqq->burst_list_node, +@@ -4897,10 +4927,12 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, + + bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, + &new_queue); ++ BUG_ON(bfqd->ioc_to_put); + + if (unlikely(!new_queue)) { + /* If the queue was seeky for too long, break it apart. 
*/ + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { ++ BUG_ON(!is_sync); + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); + + /* Update bic before losing reference to bfqq */ +@@ -4923,6 +4955,9 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, + NULL); + else + bfqq_already_existing = true; ++ ++ BUG_ON(!bfqq); ++ BUG_ON(bfqq == &bfqd->oom_bfqq); + } + } + +@@ -4976,6 +5011,8 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) + + BUG_ON(!bfqd); + spin_lock_irqsave(&bfqd->lock, flags); ++ BUG_ON(bfqd->ioc_to_put); ++ + bfq_log_bfqq(bfqd, bfqq, "handling slice_timer expiration"); + bfq_clear_bfqq_wait_request(bfqq); + +@@ -5083,6 +5120,7 @@ static void bfq_exit_queue(struct elevator_queue *e) + spin_lock_irq(&bfqd->lock); + list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) + bfq_deactivate_bfqq(bfqd, bfqq, false, false); ++ BUG_ON(bfqd->ioc_to_put); + spin_unlock_irq(&bfqd->lock); + + hrtimer_cancel(&bfqd->idle_slice_timer); +diff --git a/block/bfq-sched.c b/block/bfq-sched.c +index a5c8b4acd33c..85e59eeb3569 100644 +--- a/block/bfq-sched.c ++++ b/block/bfq-sched.c +@@ -1906,6 +1906,7 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) + + if (bfqd->in_service_bic) { + #ifdef BFQ_MQ ++ BUG_ON(bfqd->ioc_to_put); + /* + * Schedule the release of a reference to + * bfqd->in_service_bic->icq.ioc to right after the + +From 3d54cb804f1db2e08ce4a6cc335868538542f587 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Wed, 22 Feb 2017 11:30:01 +0100 +Subject: [PATCH 18/51] Fix unbalanced increment of rq_in_driver + +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 52 +++++++++++++++++++++++++++++++++++++++++--------- + 1 file changed, 43 insertions(+), 9 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 5707d42b160d..9cbcb8d43d81 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -3936,9 +3936,45 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + rq = list_first_entry(&bfqd->dispatch, struct request, + queuelist); + list_del_init(&rq->queuelist); ++ + bfq_log(bfqd, + "dispatch requests: picked %p from dispatch list", rq); +- goto exit; ++ bfqq = RQ_BFQQ(rq); ++ ++ if (bfqq) { ++ /* ++ * Increment counters here, because this ++ * dispatch does not follow the standard ++ * dispatch flow (where counters are ++ * incremented) ++ */ ++ bfqq->dispatched++; ++ ++ goto inc_in_driver_start_rq; ++ } ++ ++ /* ++ * We exploit the put_rq_private hook to decrement ++ * rq_in_driver, but put_rq_private will not be ++ * invoked on this request. So, to avoid unbalance, ++ * just start this request, without incrementing ++ * rq_in_driver. As a negative consequence, ++ * rq_in_driver is deceptively lower than it should be ++ * while this request is in service. This may cause ++ * bfq_schedule_dispatch to be invoked uselessly. ++ * ++ * As for implementing an exact solution, the ++ * put_request hook, if defined, is probably invoked ++ * also on this request. So, by exploiting this hook, ++ * we could 1) increment rq_in_driver here, and 2) ++ * decrement it in put_request. Such a solution would ++ * let the value of the counter be always accurate, ++ * but it would entail using an extra interface ++ * function. This cost seems higher than the benefit, ++ * being the frequency of non-elevator-private ++ * requests very low. 
++ */ + goto start_rq; + } + + bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); +@@ -3973,10 +4009,12 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + + BUG_ON(bfqq->next_rq == NULL && + bfqq->entity.budget < bfqq->entity.service); +-exit: + + if (rq) { +- rq->rq_flags |= RQF_STARTED; ++ inc_in_driver_start_rq: + bfqd->rq_in_driver++; ++ start_rq: ++ rq->rq_flags |= RQF_STARTED; + if (bfqq) + bfq_log_bfqq(bfqd, bfqq, + "dispatched %s request %p, rq_in_driver %d", +@@ -3992,6 +4030,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + "returned NULL request, rq_in_driver %d", + bfqd->rq_in_driver); + ++exit: + return rq; + } + +@@ -4591,15 +4630,10 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + + spin_lock_irq(&bfqd->lock); + if (at_head || blk_rq_is_passthrough(rq)) { +- struct bfq_queue *bfqq = RQ_BFQQ(rq); +- + if (at_head) + list_add(&rq->queuelist, &bfqd->dispatch); + else + list_add_tail(&rq->queuelist, &bfqd->dispatch); +- +- if (bfqq) +- bfqq->dispatched++; + } else { + __bfq_insert_request(bfqd, rq); + +@@ -4966,7 +5000,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, + "get_request: new allocated %d", bfqq->allocated); + + bfqq->ref++; +- bfq_log_bfqq(bfqd, bfqq, "get_request: bfqq %p, %d", bfqq, bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", rq, bfqq, bfqq->ref); + + rq->elv.priv[0] = bic; + rq->elv.priv[1] = bfqq; + +From 7ba977d696b239569b4cd233aebc99e136ecf487 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Fri, 3 Mar 2017 09:39:35 +0100 +Subject: [PATCH 19/51] Add checks and extra log messages - Part III + +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 9cbcb8d43d81..24b529a2edc7 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -4630,10 +4630,21 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + + spin_lock_irq(&bfqd->lock); + if (at_head || blk_rq_is_passthrough(rq)) { ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ + if (at_head) + list_add(&rq->queuelist, &bfqd->dispatch); + else + list_add_tail(&rq->queuelist, &bfqd->dispatch); ++ ++ if (bfqq) ++ bfq_log_bfqq(bfqd, bfqq, ++ "insert_request %p in disp: at_head %d", ++ rq, at_head); ++ else ++ bfq_log(bfqd, ++ "insert_request %p in disp: at_head %d", ++ rq, at_head); + } else { + __bfq_insert_request(bfqd, rq); + + +From c94e47b2908600b8ba89f84b0ac7febddd313141 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Fri, 17 Feb 2017 14:28:02 +0100 +Subject: [PATCH 20/51] TESTING: Check wrong invocation of merge and + put_rq_priv functions + +Check that merge functions are not invoked on requests queued in the +dispatch queue, and that put_rq_private is not invoked on these +requests if, in addition, they have not passed through get_rq_private.
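+
+In outline, the mechanism is a debug-flag pattern; the sketch below is
+a condensed view of it, not literal patch code (the actual hunks spread
+these steps across bfq_insert_request, __bfq_dispatch_request and
+bfq_put_rq_private):
+
+	/* insertion: tag a request queued directly into the dispatch list */
+	rq->rq_flags |= RQF_DISP_LIST;
+
+	/* merge hooks: must never see a tagged request */
+	BUG_ON(rq->rq_flags & RQF_DISP_LIST);
+
+	/* dispatch: clear the tag once the request leaves the dispatch
+	 * list, so that only requests still tagged at completion time
+	 * trigger the check in put_rq_private */
+	rq->rq_flags &= ~RQF_DISP_LIST;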
+ +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 22 ++++++++++++++++++++++ + include/linux/blkdev.h | 2 ++ + 2 files changed, 24 insertions(+) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 24b529a2edc7..b4d40bb712d2 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -1746,6 +1746,8 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, + static void bfq_request_merged(struct request_queue *q, struct request *req, + enum elv_merge type) + { ++ BUG_ON(req->rq_flags & RQF_DISP_LIST); ++ + if (type == ELEVATOR_FRONT_MERGE && + rb_prev(&req->rb_node) && + blk_rq_pos(req) < +@@ -1795,6 +1797,8 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, + + BUG_ON(!RQ_BFQQ(rq)); + BUG_ON(!RQ_BFQQ(next)); ++ BUG_ON(rq->rq_flags & RQF_DISP_LIST); ++ BUG_ON(next->rq_flags & RQF_DISP_LIST); + + if (!RB_EMPTY_NODE(&rq->rb_node)) + goto end; +@@ -3936,6 +3940,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + rq = list_first_entry(&bfqd->dispatch, struct request, + queuelist); + list_del_init(&rq->queuelist); ++ rq->rq_flags &= ~RQF_DISP_LIST; + + bfq_log(bfqd, + "dispatch requests: picked %p from dispatch list", rq); +@@ -3950,6 +3955,17 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + */ + bfqq->dispatched++; + ++ /* ++ * TESTING: reset DISP_LIST flag, because: 1) ++ * this rq this request has passed through ++ * get_rq_private, 2) then it will have ++ * put_rq_private invoked on it, and 3) in ++ * put_rq_private we use this flag to check ++ * that put_rq_private is not invoked on ++ * requests for which get_rq_private has been ++ * invoked. ++ */ ++ rq->rq_flags &= ~RQF_DISP_LIST; + goto inc_in_driver_start_rq; + } + +@@ -4637,6 +4653,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + else + list_add_tail(&rq->queuelist, &bfqd->dispatch); + ++ rq->rq_flags |= RQF_DISP_LIST; + if (bfqq) + bfq_log_bfqq(bfqd, bfqq, + "insert_request %p in disp: at_head %d", +@@ -4824,6 +4841,10 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) + bfqd = bfqq->bfqd; + BUG_ON(!bfqd); + ++ if (rq->rq_flags & RQF_DISP_LIST) { ++ pr_crit("putting disp rq %p for %d", rq, bfqq->pid); ++ BUG(); ++ } + BUG_ON(rq->rq_flags & RQF_QUEUED); + BUG_ON(!(rq->rq_flags & RQF_ELVPRIV)); + +@@ -5015,6 +5036,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, + + rq->elv.priv[0] = bic; + rq->elv.priv[1] = bfqq; ++ rq->rq_flags &= ~RQF_DISP_LIST; + + /* + * If a bfq_queue has only one process reference, it is owned +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 10f892ca585d..0048e59e6d07 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -121,6 +121,8 @@ typedef __u32 __bitwise req_flags_t; + /* Look at ->special_vec for the actual data payload instead of the + bio chain. */ + #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) ++/* DEBUG: rq in bfq-mq dispatch list */ ++#define RQF_DISP_LIST ((__force req_flags_t)(1 << 19)) + + /* flags that prevent us from merging requests: */ + #define RQF_NOMERGE_FLAGS \ + +From 49206f9052d13c96d49dbc36c612bed41b2d6552 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Sat, 25 Feb 2017 17:38:05 +0100 +Subject: [PATCH 21/51] Complete support for cgroups + +This commit completes cgroups support for bfq-mq. 
In particular, it deals with +a sort of circular dependency introduced in blk-mq: the function +blkcg_activate_policy, invoked during scheduler initialization, triggers the +invocation of the has_work scheduler hook (before the init function is +finished). To address this issue, this commit moves the invocation of +blkcg_activate_policy after the initialization of all the fields that could be +initialized before invoking blkcg_activate_policy itself. This enables has_work +to correctly return false, and thus to prevent the blk-mq stack from invoking +further scheduler hooks before the init function is finished. + +Signed-off-by: Paolo Valente +--- + block/Kconfig.iosched | 9 +++++ + block/bfq-mq-iosched.c | 108 ++++++++++++++++++++++++++++--------------------- + block/bfq-mq.h | 2 +- + 3 files changed, 72 insertions(+), 47 deletions(-) + +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched +index 2d94af3d8b0a..299a6861fb90 100644 +--- a/block/Kconfig.iosched ++++ b/block/Kconfig.iosched +@@ -106,6 +106,15 @@ config MQ_IOSCHED_BFQ + guarantees a low latency to interactive and soft real-time + applications. Details in Documentation/block/bfq-iosched.txt + ++config MQ_BFQ_GROUP_IOSCHED ++ bool "BFQ-MQ hierarchical scheduling support" ++ depends on MQ_IOSCHED_BFQ && BLK_CGROUP ++ default n ++ ---help--- ++ ++ Enable hierarchical scheduling in BFQ-MQ, using the blkio ++ (cgroups-v1) or io (cgroups-v2) controller. ++ + config MQ_IOSCHED_DEADLINE + tristate "MQ deadline I/O scheduler" + default y +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index b4d40bb712d2..02a1e7fd0ea4 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -88,7 +88,6 @@ + #include "blk-mq.h" + #include "blk-mq-tag.h" + #include "blk-mq-sched.h" +-#undef CONFIG_BFQ_GROUP_IOSCHED /* cgroups support not yet functional */ + #include "bfq-mq.h" + + /* Expiration time of sync (0) and async (1) requests, in ns. */ +@@ -233,15 +232,6 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, + return NULL; + } + +-#define BFQ_MQ +-#include "bfq-sched.c" +-#include "bfq-cgroup-included.c" +- +-#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) +-#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) +- +-#define bfq_sample_valid(samples) ((samples) > 80) +- + /* + * Scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing. +@@ -255,6 +245,43 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd) + } + } + + /* ++ * Next two functions release bfqd->lock and put the io context ++ * pointed by bfqd->ioc_to_put. This delayed put is used to not risk ++ * to take an ioc->lock while the scheduler lock is being held.
++ */ ++static void bfq_unlock_put_ioc(struct bfq_data *bfqd) ++{ ++ struct io_context *ioc_to_put = bfqd->ioc_to_put; ++ ++ bfqd->ioc_to_put = NULL; ++ spin_unlock_irq(&bfqd->lock); ++ ++ if (ioc_to_put) ++ put_io_context(ioc_to_put); ++} ++ ++static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd, ++ unsigned long flags) ++{ ++ struct io_context *ioc_to_put = bfqd->ioc_to_put; ++ ++ bfqd->ioc_to_put = NULL; ++ spin_unlock_irqrestore(&bfqd->lock, flags); ++ ++ if (ioc_to_put) ++ put_io_context(ioc_to_put); ++} ++ ++#define BFQ_MQ ++#include "bfq-sched.c" ++#include "bfq-cgroup-included.c" ++ ++#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) ++#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) ++ ++#define bfq_sample_valid(samples) ((samples) > 80) ++ ++/* + * Lifted from AS - choose which of rq1 and rq2 that is best served now. + * We choose the request that is closesr to the head right now. Distance + * behind the head is penalized and only allowed to a certain extent. +@@ -4050,34 +4077,6 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + return rq; + } + +-/* +- * Next two functions release bfqd->lock and put the io context +- * pointed by bfqd->ioc_to_put. This delayed put is used to not risk +- * to take an ioc->lock while the scheduler lock is being held. +- */ +-static void bfq_unlock_put_ioc(struct bfq_data *bfqd) +-{ +- struct io_context *ioc_to_put = bfqd->ioc_to_put; +- +- bfqd->ioc_to_put = NULL; +- spin_unlock_irq(&bfqd->lock); +- +- if (ioc_to_put) +- put_io_context(ioc_to_put); +-} +- +-static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd, +- unsigned long flags) +-{ +- struct io_context *ioc_to_put = bfqd->ioc_to_put; +- +- bfqd->ioc_to_put = NULL; +- spin_unlock_irqrestore(&bfqd->lock, flags); +- +- if (ioc_to_put) +- put_io_context(ioc_to_put); +-} +- + static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + { + struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; +@@ -5239,6 +5238,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + } + eq->elevator_data = bfqd; + ++ spin_lock_irq(q->queue_lock); ++ q->elevator = eq; ++ spin_unlock_irq(q->queue_lock); ++ + /* + * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. + * Grab a permanent reference to it, so that the normal code flow +@@ -5261,12 +5264,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + bfqd->oom_bfqq.entity.prio_changed = 1; + + bfqd->queue = q; +- +- bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); +- if (!bfqd->root_group) +- goto out_free; +- bfq_init_root_group(bfqd->root_group, bfqd); +- bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); ++ INIT_LIST_HEAD(&bfqd->dispatch); + + hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); +@@ -5324,9 +5322,27 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + bfqd->device_speed = BFQ_BFQD_FAST; + + spin_lock_init(&bfqd->lock); +- INIT_LIST_HEAD(&bfqd->dispatch); + +- q->elevator = eq; ++ /* ++ * The invocation of the next bfq_create_group_hierarchy ++ * function is the head of a chain of function calls ++ * (bfq_create_group_hierarchy->blkcg_activate_policy-> ++ * blk_mq_freeze_queue) that may lead to the invocation of the ++ * has_work hook function. 
For this reason, ++ * bfq_create_group_hierarchy is invoked only after all ++ * scheduler data has been initialized, apart from the fields ++ * that can be initialized only after invoking ++ * bfq_create_group_hierarchy. This, in particular, enables ++ * has_work to correctly return false. Of course, to avoid ++ * other inconsistencies, the blk-mq stack must then refrain ++ * from invoking further scheduler hooks before this init ++ * function is finished. ++ */ ++ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); ++ if (!bfqd->root_group) ++ goto out_free; ++ bfq_init_root_group(bfqd->root_group, bfqd); ++ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); + + return 0; + +diff --git a/block/bfq-mq.h b/block/bfq-mq.h +index bd83f1c02573..2c81c02bccc4 100644 +--- a/block/bfq-mq.h ++++ b/block/bfq-mq.h +@@ -20,7 +20,7 @@ + #include + + /* see comments on CONFIG_BFQ_GROUP_IOSCHED in bfq.h */ +-#ifdef CONFIG_BFQ_MQ_GROUP_IOSCHED ++#ifdef CONFIG_MQ_BFQ_GROUP_IOSCHED + #define BFQ_GROUP_IOSCHED_ENABLED + #endif + + +From 62d12db23ce14d2716b5cff7d2635fbc817b96d0 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Fri, 17 Mar 2017 06:15:18 +0100 +Subject: [PATCH 22/51] Remove all get and put of I/O contexts + +When a bfq queue is set in service and when it is merged, a reference +to the I/O context associated with the queue is taken. This reference +is then released when the queue is deselected from service or +split. More precisely, the release of the reference is postponed to +when the scheduler lock is released, to avoid nesting between the +scheduler and the I/O-context lock. In fact, such nesting would lead +to deadlocks, because of other code paths that take the same locks in +the opposite order. This postponing of I/O-context releases does +complicate code. + +This commit addresses this issue by modifying involved operations in +such a way to not need to get the above I/O-context references any +more. Then it also removes any get and release of these references. + +Signed-off-by: Paolo Valente +--- + block/bfq-cgroup-included.c | 2 +- + block/bfq-mq-iosched.c | 127 ++++++++------------------------------------ + block/bfq-mq.h | 11 ---- + block/bfq-sched.c | 17 ------ + 4 files changed, 22 insertions(+), 135 deletions(-) + +diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c +index cf59eeb7f08e..dfacca799b5e 100644 +--- a/block/bfq-cgroup-included.c ++++ b/block/bfq-cgroup-included.c +@@ -773,7 +773,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) + bfq_put_async_queues(bfqd, bfqg); + + #ifdef BFQ_MQ +- bfq_unlock_put_ioc_restore(bfqd, flags); ++ spin_unlock_irqrestore(&bfqd->lock, flags); + #endif + /* + * @blkg is going offline and will be ignored by +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 02a1e7fd0ea4..8e7589d3280f 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -244,34 +244,6 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd) + } + } + +-/* +- * Next two functions release bfqd->lock and put the io context +- * pointed by bfqd->ioc_to_put. This delayed put is used to not risk +- * to take an ioc->lock while the scheduler lock is being held. 
+- */ +-static void bfq_unlock_put_ioc(struct bfq_data *bfqd) +-{ +- struct io_context *ioc_to_put = bfqd->ioc_to_put; +- +- bfqd->ioc_to_put = NULL; +- spin_unlock_irq(&bfqd->lock); +- +- if (ioc_to_put) +- put_io_context(ioc_to_put); +-} +- +-static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd, +- unsigned long flags) +-{ +- struct io_context *ioc_to_put = bfqd->ioc_to_put; +- +- bfqd->ioc_to_put = NULL; +- spin_unlock_irqrestore(&bfqd->lock, flags); +- +- if (ioc_to_put) +- put_io_context(ioc_to_put); +-} +- + #define BFQ_MQ + #include "bfq-sched.c" + #include "bfq-cgroup-included.c" +@@ -1747,7 +1719,6 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) + if (free) + blk_mq_free_request(free); + bfqd->bio_bfqq_set = false; +- BUG_ON(bfqd->ioc_to_put); + spin_unlock_irq(&bfqd->lock); + + return ret; +@@ -1812,7 +1783,6 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, + bfq_updated_next_req(bfqd, bfqq); + bfq_pos_tree_add_move(bfqd, bfqq); + } +- BUG_ON(bfqd->ioc_to_put); + spin_unlock_irq(&bfqd->lock); + } + } +@@ -1858,7 +1828,6 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, + + bfq_remove_request(q, next); + +- BUG_ON(bfqq->bfqd->ioc_to_put); + spin_unlock_irq(&bfqq->bfqd->lock); + end: + bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); +@@ -2035,20 +2004,18 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) + * first time that the requests of some process are redirected to + * it. + * +- * We redirect bfqq to new_bfqq and not the opposite, because we +- * are in the context of the process owning bfqq, hence we have +- * the io_cq of this process. So we can immediately configure this +- * io_cq to redirect the requests of the process to new_bfqq. ++ * We redirect bfqq to new_bfqq and not the opposite, because ++ * we are in the context of the process owning bfqq, thus we ++ * have the io_cq of this process. So we can immediately ++ * configure this io_cq to redirect the requests of the ++ * process to new_bfqq. In contrast, the io_cq of new_bfqq is ++ * not available any more (new_bfqq->bic == NULL). + * +- * NOTE, even if new_bfqq coincides with the in-service queue, the +- * io_cq of new_bfqq is not available, because, if the in-service +- * queue is shared, bfqd->in_service_bic may not point to the +- * io_cq of the in-service queue. +- * Redirecting the requests of the process owning bfqq to the +- * currently in-service queue is in any case the best option, as +- * we feed the in-service queue with new requests close to the +- * last request served and, by doing so, hopefully increase the +- * throughput. ++ * Anyway, even in case new_bfqq coincides with the in-service ++ * queue, redirecting requests the in-service queue is the ++ * best option, as we feed the in-service queue with new ++ * requests close to the last request served and, by doing so, ++ * are likely to increase the throughput. 
+ */ + bfqq->new_bfqq = new_bfqq; + new_bfqq->ref += process_refs; +@@ -2147,13 +2114,13 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + in_service_bfqq = bfqd->in_service_queue; + + if (in_service_bfqq && in_service_bfqq != bfqq && +- bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) ++ wr_from_too_long(in_service_bfqq) + && likely(in_service_bfqq == &bfqd->oom_bfqq)) + bfq_log_bfqq(bfqd, bfqq, + "would have tried merge with in-service-queue, but wr"); + +- if (!in_service_bfqq || in_service_bfqq == bfqq || +- !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || ++ if (!in_service_bfqq || in_service_bfqq == bfqq ++ || wr_from_too_long(in_service_bfqq) || + unlikely(in_service_bfqq == &bfqd->oom_bfqq)) + goto check_scheduled; + +@@ -2214,16 +2181,6 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); + } + +-static void bfq_get_bic_reference(struct bfq_queue *bfqq) +-{ +- /* +- * If bfqq->bic has a non-NULL value, the bic to which it belongs +- * is about to begin using a shared bfq_queue. +- */ +- if (bfqq->bic) +- atomic_long_inc(&bfqq->bic->icq.ioc->refcount); +-} +- + static void + bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, + struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +@@ -2280,12 +2237,6 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, + bfqd->wr_busy_queues); + + /* +- * Grab a reference to the bic, to prevent it from being destroyed +- * before being possibly touched by a bfq_split_bfqq(). +- */ +- bfq_get_bic_reference(bfqq); +- bfq_get_bic_reference(new_bfqq); +- /* + * Merge queues (that is, let bic redirect its requests to new_bfqq) + */ + bic_set_bfqq(bic, new_bfqq, 1); +@@ -2472,16 +2423,10 @@ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) + static void bfq_arm_slice_timer(struct bfq_data *bfqd) + { + struct bfq_queue *bfqq = bfqd->in_service_queue; +- struct bfq_io_cq *bic; + u32 sl; + + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + +- /* Processes have exited, don't wait. */ +- bic = bfqd->in_service_bic; +- if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) +- return; +- + bfq_mark_bfqq_wait_request(bfqq); + + /* +@@ -3922,11 +3867,6 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, + bfq_bfqq_budget_left(bfqq), + bfqq->dispatched); + +- if (!bfqd->in_service_bic) { +- atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); +- bfqd->in_service_bic = RQ_BIC(rq); +- } +- + /* + * Expire bfqq, pretending that its budget expired, if bfqq + * belongs to CLASS_IDLE and other queues are waiting for +@@ -4085,7 +4025,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + spin_lock_irq(&bfqd->lock); + + rq = __bfq_dispatch_request(hctx); +- bfq_unlock_put_ioc(bfqd); ++ spin_unlock_irq(&bfqd->lock); + + return rq; + } +@@ -4184,21 +4124,10 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) + unsigned long flags; + + spin_lock_irqsave(&bfqd->lock, flags); +- BUG_ON(bfqd->ioc_to_put); +- /* +- * If the bic is using a shared queue, put the +- * reference taken on the io_context when the bic +- * started using a shared bfq_queue. This put cannot +- * make ioc->ref_count reach 0, then no ioc->lock +- * risks to be taken (leading to possible deadlock +- * scenarios). 
+- */ +- if (is_sync && bfq_bfqq_coop(bfqq)) +- put_io_context(bic->icq.ioc); + + bfq_exit_bfqq(bfqd, bfqq); + bic_set_bfqq(bic, NULL, is_sync); +- bfq_unlock_put_ioc_restore(bfqd, flags); ++ spin_unlock_irqrestore(&bfqd->lock, flags); + } + } + +@@ -4633,12 +4562,10 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + + spin_lock_irq(&bfqd->lock); + if (blk_mq_sched_try_insert_merge(q, rq)) { +- BUG_ON(bfqd->ioc_to_put); + spin_unlock_irq(&bfqd->lock); + return; + } + +- BUG_ON(bfqd->ioc_to_put); + spin_unlock_irq(&bfqd->lock); + + blk_mq_sched_request_inserted(rq); +@@ -4671,7 +4598,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + } + } + +- bfq_unlock_put_ioc(bfqd); ++ spin_unlock_irq(&bfqd->lock); + } + + static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, +@@ -4864,12 +4791,11 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) + unsigned long flags; + + spin_lock_irqsave(&bfqd->lock, flags); +- BUG_ON(bfqd->ioc_to_put); + + bfq_completed_request(bfqq, bfqd); + bfq_put_rq_priv_body(bfqq); + +- bfq_unlock_put_ioc_restore(bfqd, flags); ++ spin_unlock_irqrestore(&bfqd->lock, flags); + } else { + /* + * Request rq may be still/already in the scheduler, +@@ -4992,7 +4918,6 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, + + bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, + &new_queue); +- BUG_ON(bfqd->ioc_to_put); + + if (unlikely(!new_queue)) { + /* If the queue was seeky for too long, break it apart. */ +@@ -5005,14 +4930,6 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, + bic->saved_in_large_burst = true; + + bfqq = bfq_split_bfqq(bic, bfqq); +- /* +- * A reference to bic->icq.ioc needs to be +- * released after a queue split. Do not do it +- * immediately, to not risk to possibly take +- * an ioc->lock while holding the scheduler +- * lock. 
+- */ +- bfqd->ioc_to_put = bic->icq.ioc; + + if (!bfqq) + bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, +@@ -5045,7 +4962,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, + */ + if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { + bfqq->bic = bic; +- if (bfqd->ioc_to_put) { /* if true, then there has been a split */ ++ if (split) { + /* + * The queue has just been split from a shared + * queue: restore the idle window and the +@@ -5059,7 +4976,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, + if (unlikely(bfq_bfqq_just_created(bfqq))) + bfq_handle_burst(bfqd, bfqq); + +- bfq_unlock_put_ioc(bfqd); ++ spin_unlock_irq(&bfqd->lock); + + return 0; + +@@ -5077,7 +4994,6 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) + + BUG_ON(!bfqd); + spin_lock_irqsave(&bfqd->lock, flags); +- BUG_ON(bfqd->ioc_to_put); + + bfq_log_bfqq(bfqd, bfqq, "handling slice_timer expiration"); + bfq_clear_bfqq_wait_request(bfqq); +@@ -5108,7 +5024,7 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) + bfq_bfqq_expire(bfqd, bfqq, true, reason); + + schedule_dispatch: +- bfq_unlock_put_ioc_restore(bfqd, flags); ++ spin_unlock_irqrestore(&bfqd->lock, flags); + bfq_schedule_dispatch(bfqd); + } + +@@ -5186,7 +5102,6 @@ static void bfq_exit_queue(struct elevator_queue *e) + spin_lock_irq(&bfqd->lock); + list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) + bfq_deactivate_bfqq(bfqd, bfqq, false, false); +- BUG_ON(bfqd->ioc_to_put); + spin_unlock_irq(&bfqd->lock); + + hrtimer_cancel(&bfqd->idle_slice_timer); +diff --git a/block/bfq-mq.h b/block/bfq-mq.h +index 2c81c02bccc4..36ee24a87dda 100644 +--- a/block/bfq-mq.h ++++ b/block/bfq-mq.h +@@ -458,8 +458,6 @@ struct bfq_data { + + /* bfq_queue in service */ + struct bfq_queue *in_service_queue; +- /* bfq_io_cq (bic) associated with the @in_service_queue */ +- struct bfq_io_cq *in_service_bic; + + /* on-disk position of the last served request */ + sector_t last_position; +@@ -621,15 +619,6 @@ struct bfq_data { + struct bfq_queue *bio_bfqq; + /* Extra flag used only for TESTING */ + bool bio_bfqq_set; +- +- /* +- * io context to put right after bfqd->lock is released. This +- * filed is used to perform put_io_context, when needed, to +- * after the scheduler lock has been released, and thus +- * prevent an ioc->lock from being possibly taken while the +- * scheduler lock is being held. +- */ +- struct io_context *ioc_to_put; + }; + + enum bfqq_state_flags { +diff --git a/block/bfq-sched.c b/block/bfq-sched.c +index 85e59eeb3569..9c4e6797d8c9 100644 +--- a/block/bfq-sched.c ++++ b/block/bfq-sched.c +@@ -1904,23 +1904,6 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) + struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; + struct bfq_entity *entity = in_serv_entity; + +- if (bfqd->in_service_bic) { +-#ifdef BFQ_MQ +- BUG_ON(bfqd->ioc_to_put); +- /* +- * Schedule the release of a reference to +- * bfqd->in_service_bic->icq.ioc to right after the +- * scheduler lock is released. This ioc is not +- * released immediately, to not risk to possibly take +- * an ioc->lock while holding the scheduler lock. 
+- */ +- bfqd->ioc_to_put = bfqd->in_service_bic->icq.ioc; +-#else +- put_io_context(bfqd->in_service_bic->icq.ioc); +-#endif +- bfqd->in_service_bic = NULL; +- } +- + bfq_clear_bfqq_wait_request(in_serv_bfqq); + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); + bfqd->in_service_queue = NULL; + +From 1521ad11f8684cf0a1b7249249cd406fee50da6d Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Wed, 29 Mar 2017 18:41:46 +0200 +Subject: [PATCH 23/51] BUGFIX: Remove unneeded and deadlock-causing lock in + request_merged + +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 8e7589d3280f..bb046335ff4f 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -1761,7 +1761,6 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, + BUG_ON(RQ_BFQQ(req) != bfqq); + elv_rb_add(&bfqq->sort_list, req); + +- spin_lock_irq(&bfqd->lock); + /* Choose next request to be served for bfqq */ + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, +@@ -1783,7 +1782,6 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, + bfq_updated_next_req(bfqd, bfqq); + bfq_pos_tree_add_move(bfqd, bfqq); + } +- spin_unlock_irq(&bfqd->lock); + } + } + + +From 9136b4c953918ea937254c57cfb787b55b5bc2c6 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Wed, 29 Mar 2017 18:55:30 +0200 +Subject: [PATCH 24/51] Fix wrong unlikely + +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index bb046335ff4f..3ae9bd424b3f 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -4917,7 +4917,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, + bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, + &new_queue); + +- if (unlikely(!new_queue)) { ++ if (likely(!new_queue)) { + /* If the queue was seeky for too long, break it apart. 
*/ + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { + BUG_ON(!is_sync); + +From 8e05f722f19645f2278f6962368ca3b7c2a81c9c Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Fri, 12 May 2017 09:51:18 +0200 +Subject: [PATCH 25/51] Change cgroup params prefix to bfq-mq for bfq-mq + +Signed-off-by: Paolo Valente +--- + block/bfq-cgroup-included.c | 54 ++++++++++++++++++++++++++------------------- + 1 file changed, 31 insertions(+), 23 deletions(-) + +diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c +index dfacca799b5e..9e9b0a09e26f 100644 +--- a/block/bfq-cgroup-included.c ++++ b/block/bfq-cgroup-included.c +@@ -995,9 +995,15 @@ bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) + return blkg_to_bfqg(bfqd->queue->root_blkg); + } + ++#ifdef BFQ_MQ ++#define BFQ_CGROUP_FNAME(param) "bfq-mq."#param ++#else ++#define BFQ_CGROUP_FNAME(param) "bfq."#param ++#endif ++ + static struct cftype bfq_blkcg_legacy_files[] = { + { +- .name = "bfq.weight", ++ .name = BFQ_CGROUP_FNAME(weight), + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = bfq_io_show_weight, + .write_u64 = bfq_io_set_weight_legacy, +@@ -1005,106 +1011,106 @@ static struct cftype bfq_blkcg_legacy_files[] = { + + /* statistics, covers only the tasks in the bfqg */ + { +- .name = "bfq.time", ++ .name = BFQ_CGROUP_FNAME(time), + .private = offsetof(struct bfq_group, stats.time), + .seq_show = bfqg_print_stat, + }, + { +- .name = "bfq.sectors", ++ .name = BFQ_CGROUP_FNAME(sectors), + .seq_show = bfqg_print_stat_sectors, + }, + { +- .name = "bfq.io_service_bytes", ++ .name = BFQ_CGROUP_FNAME(io_service_bytes), + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_bytes, + }, + { +- .name = "bfq.io_serviced", ++ .name = BFQ_CGROUP_FNAME(io_serviced), + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_ios, + }, + { +- .name = "bfq.io_service_time", ++ .name = BFQ_CGROUP_FNAME(io_service_time), + .private = offsetof(struct bfq_group, stats.service_time), + .seq_show = bfqg_print_rwstat, + }, + { +- .name = "bfq.io_wait_time", ++ .name = BFQ_CGROUP_FNAME(io_wait_time), + .private = offsetof(struct bfq_group, stats.wait_time), + .seq_show = bfqg_print_rwstat, + }, + { +- .name = "bfq.io_merged", ++ .name = BFQ_CGROUP_FNAME(io_merged), + .private = offsetof(struct bfq_group, stats.merged), + .seq_show = bfqg_print_rwstat, + }, + { +- .name = "bfq.io_queued", ++ .name = BFQ_CGROUP_FNAME(io_queued), + .private = offsetof(struct bfq_group, stats.queued), + .seq_show = bfqg_print_rwstat, + }, + + /* the same statictics which cover the bfqg and its descendants */ + { +- .name = "bfq.time_recursive", ++ .name = BFQ_CGROUP_FNAME(time_recursive), + .private = offsetof(struct bfq_group, stats.time), + .seq_show = bfqg_print_stat_recursive, + }, + { +- .name = "bfq.sectors_recursive", ++ .name = BFQ_CGROUP_FNAME(sectors_recursive), + .seq_show = bfqg_print_stat_sectors_recursive, + }, + { +- .name = "bfq.io_service_bytes_recursive", ++ .name = BFQ_CGROUP_FNAME(io_service_bytes_recursive), + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_bytes_recursive, + }, + { +- .name = "bfq.io_serviced_recursive", ++ .name = BFQ_CGROUP_FNAME(io_serviced_recursive), + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_ios_recursive, + }, + { +- .name = "bfq.io_service_time_recursive", ++ .name = BFQ_CGROUP_FNAME(io_service_time_recursive), + .private = offsetof(struct bfq_group, stats.service_time), + .seq_show = 
bfqg_print_rwstat_recursive, + }, + { +- .name = "bfq.io_wait_time_recursive", ++ .name = BFQ_CGROUP_FNAME(io_wait_time_recursive), + .private = offsetof(struct bfq_group, stats.wait_time), + .seq_show = bfqg_print_rwstat_recursive, + }, + { +- .name = "bfq.io_merged_recursive", ++ .name = BFQ_CGROUP_FNAME(io_merged_recursive), + .private = offsetof(struct bfq_group, stats.merged), + .seq_show = bfqg_print_rwstat_recursive, + }, + { +- .name = "bfq.io_queued_recursive", ++ .name = BFQ_CGROUP_FNAME(io_queued_recursive), + .private = offsetof(struct bfq_group, stats.queued), + .seq_show = bfqg_print_rwstat_recursive, + }, + { +- .name = "bfq.avg_queue_size", ++ .name = BFQ_CGROUP_FNAME(avg_queue_size), + .seq_show = bfqg_print_avg_queue_size, + }, + { +- .name = "bfq.group_wait_time", ++ .name = BFQ_CGROUP_FNAME(group_wait_time), + .private = offsetof(struct bfq_group, stats.group_wait_time), + .seq_show = bfqg_print_stat, + }, + { +- .name = "bfq.idle_time", ++ .name = BFQ_CGROUP_FNAME(idle_time), + .private = offsetof(struct bfq_group, stats.idle_time), + .seq_show = bfqg_print_stat, + }, + { +- .name = "bfq.empty_time", ++ .name = BFQ_CGROUP_FNAME(empty_time), + .private = offsetof(struct bfq_group, stats.empty_time), + .seq_show = bfqg_print_stat, + }, + { +- .name = "bfq.dequeue", ++ .name = BFQ_CGROUP_FNAME(dequeue), + .private = offsetof(struct bfq_group, stats.dequeue), + .seq_show = bfqg_print_stat, + }, +@@ -1113,7 +1119,7 @@ static struct cftype bfq_blkcg_legacy_files[] = { + + static struct cftype bfq_blkg_files[] = { + { +- .name = "bfq.weight", ++ .name = BFQ_CGROUP_FNAME(weight), + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = bfq_io_show_weight, + .write = bfq_io_set_weight, +@@ -1121,6 +1127,8 @@ static struct cftype bfq_blkg_files[] = { + {} /* terminate */ + }; + ++#undef BFQ_CGROUP_FNAME ++ + #else /* BFQ_GROUP_IOSCHED_ENABLED */ + + static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, + +From abdf7565dadbb00e78be5f4fb2cc9b157649840e Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Fri, 12 May 2017 11:56:13 +0200 +Subject: [PATCH 26/51] Add tentative extra tests on groups, reqs and queues + +Signed-off-by: Paolo Valente +--- + block/bfq-cgroup-included.c | 1 + + block/bfq-mq-iosched.c | 5 +++++ + include/linux/blkdev.h | 2 ++ + 3 files changed, 8 insertions(+) + +diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c +index 9e9b0a09e26f..72107ad12220 100644 +--- a/block/bfq-cgroup-included.c ++++ b/block/bfq-cgroup-included.c +@@ -412,6 +412,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd) + BUG_ON(!blkg); + bfqg = blkg_to_bfqg(blkg); + bfqd = blkg->q->elevator->elevator_data; ++ BUG_ON(bfqg == bfqd->root_group); + entity = &bfqg->entity; + d = blkcg_to_bfqgd(blkg->blkcg); + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 3ae9bd424b3f..a9e3406fef06 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -4494,6 +4494,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) + { + struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; ++ BUG_ON(!bfqq); + + assert_spin_locked(&bfqd->lock); + +@@ -4587,6 +4588,9 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + "insert_request %p in disp: at_head %d", + rq, at_head); + } else { ++ BUG_ON(!(rq->rq_flags & RQF_GOT)); ++ rq->rq_flags &= ~RQF_GOT; ++ + __bfq_insert_request(bfqd, rq); + + if (rq_mergeable(rq)) { +@@ -4974,6 
+4978,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, + if (unlikely(bfq_bfqq_just_created(bfqq))) + bfq_handle_burst(bfqd, bfqq); + ++ rq->rq_flags |= RQF_GOT; + spin_unlock_irq(&bfqd->lock); + + return 0; +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 0048e59e6d07..9ae814743095 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -123,6 +123,8 @@ typedef __u32 __bitwise req_flags_t; + #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) + /* DEBUG: rq in bfq-mq dispatch list */ + #define RQF_DISP_LIST ((__force req_flags_t)(1 << 19)) ++/* DEBUG: rq had get_rq_private executed on it */ ++#define RQF_GOT ((__force req_flags_t)(1 << 20)) + + /* flags that prevent us from merging requests: */ + #define RQF_NOMERGE_FLAGS \ + +From 9e1c1514bc947c4e04502331372b1cc58459d8d1 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Mon, 15 May 2017 22:25:03 +0200 +Subject: [PATCH 27/51] block, bfq-mq: access and cache blkg data only when + safe + +In blk-cgroup, operations on blkg objects are protected with the +request_queue lock. This is no longer the lock that protects +I/O-scheduler operations in blk-mq. In fact, the latter are now +protected with a finer-grained per-scheduler-instance lock. As a +consequence, although blkg lookups are also rcu-protected, blk-mq I/O +schedulers may see inconsistent data when they access blkg and +blkg-related objects. BFQ does access these objects, and does incur +this problem, in the following case. + +The blkg_lookup performed in bfq_get_queue, being protected (only) +through rcu, may happen to return the address of a copy of the +original blkg. If this is the case, then the blkg_get performed in +bfq_get_queue, to pin down the blkg, is useless: it does not prevent +blk-cgroup code from destroying both the original blkg and all objects +directly or indirectly referred by the copy of the blkg. BFQ accesses +these objects, which typically causes a crash for NULL-pointer +dereference or memory-protection violation. + +Some additional protection mechanism should be added to blk-cgroup to +address this issue. In the meantime, this commit provides a quick +temporary fix for BFQ: cache (when safe) blkg data that might +disappear right after a blkg_lookup. + +In particular, this commit exploits the following facts to achieve its +goal without introducing further locks. Destroy operations on a blkg +invoke, as a first step, hooks of the scheduler associated with the +blkg. And these hooks are executed with bfqd->lock held for BFQ. As a +consequence, for any blkg associated with the request queue an +instance of BFQ is attached to, we are guaranteed that such a blkg is +not destroyed, and that all the pointers it contains are consistent, +while that instance is holding its bfqd->lock. A blkg_lookup performed +with bfqd->lock held then returns a fully consistent blkg, which +remains consistent for as long as this lock is held. In more detail, +this holds even if the returned blkg is a copy of the original one. + +Finally, also the object describing a group inside BFQ needs to be +protected from destruction on the blkg_free of the original blkg +(which invokes bfq_pd_free). This commit adds private refcounting for +this object, to let it disappear only after no bfq_queue refers to it +any longer. + +This commit also removes or updates some stale comments on locking +issues related to blk-cgroup operations.
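+
+The private refcounting reduces to the following sketch (condensed
+from the bfq-cgroup-included.c hunks below, not literal patch code):
+
+	static void bfqg_get(struct bfq_group *bfqg)
+	{
+		bfqg->ref++;			/* private counter on the bfq_group */
+	}
+
+	static void bfqg_put(struct bfq_group *bfqg)
+	{
+		bfqg->ref--;
+		if (bfqg->ref == 0)
+			kfree(bfqg);		/* freed only when no bfq_queue refers to it */
+	}
+
+	static void bfqg_and_blkg_get(struct bfq_group *bfqg)
+	{
+		bfqg_get(bfqg);			/* pin the bfq_group itself ... */
+		blkg_get(bfqg_to_blkg(bfqg));	/* ... and its blkg */
+	}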
+ +Reported-by: Tomas Konir +Reported-by: Lee Tibbert +Reported-by: Marco Piazza +Signed-off-by: Paolo Valente +Tested-by: Tomas Konir +Tested-by: Lee Tibbert +Tested-by: Marco Piazza +--- + block/bfq-cgroup-included.c | 149 ++++++++++++++++++++++++++++++++++++++++---- + block/bfq-mq-iosched.c | 2 +- + block/bfq-mq.h | 26 +++----- + 3 files changed, 148 insertions(+), 29 deletions(-) + +diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c +index 72107ad12220..d903393ee78a 100644 +--- a/block/bfq-cgroup-included.c ++++ b/block/bfq-cgroup-included.c +@@ -43,7 +43,11 @@ BFQG_FLAG_FNS(idling) + BFQG_FLAG_FNS(empty) + #undef BFQG_FLAG_FNS + ++#ifdef BFQ_MQ ++/* This should be called with the scheduler lock held. */ ++#else + /* This should be called with the queue_lock held. */ ++#endif + static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) + { + unsigned long long now; +@@ -58,7 +62,11 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) + bfqg_stats_clear_waiting(stats); + } + ++#ifdef BFQ_MQ ++/* This should be called with the scheduler lock held. */ ++#else + /* This should be called with the queue_lock held. */ ++#endif + static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, + struct bfq_group *curr_bfqg) + { +@@ -72,7 +80,11 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, + bfqg_stats_mark_waiting(stats); + } + ++#ifdef BFQ_MQ ++/* This should be called with the scheduler lock held. */ ++#else + /* This should be called with the queue_lock held. */ ++#endif + static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) + { + unsigned long long now; +@@ -198,14 +210,43 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) + + static void bfqg_get(struct bfq_group *bfqg) + { +- return blkg_get(bfqg_to_blkg(bfqg)); ++#ifdef BFQ_MQ ++ bfqg->ref++; ++#else ++ blkg_get(bfqg_to_blkg(bfqg)); ++#endif + } + + static void bfqg_put(struct bfq_group *bfqg) + { +- return blkg_put(bfqg_to_blkg(bfqg)); ++#ifdef BFQ_MQ ++ bfqg->ref--; ++ ++ BUG_ON(bfqg->ref < 0); ++ if (bfqg->ref == 0) ++ kfree(bfqg); ++#else ++ blkg_put(bfqg_to_blkg(bfqg)); ++#endif ++} ++ ++#ifdef BFQ_MQ ++static void bfqg_and_blkg_get(struct bfq_group *bfqg) ++{ ++ /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ ++ bfqg_get(bfqg); ++ ++ blkg_get(bfqg_to_blkg(bfqg)); + } + ++static void bfqg_and_blkg_put(struct bfq_group *bfqg) ++{ ++ bfqg_put(bfqg); ++ ++ blkg_put(bfqg_to_blkg(bfqg)); ++} ++#endif ++ + static void bfqg_stats_update_io_add(struct bfq_group *bfqg, + struct bfq_queue *bfqq, + unsigned int op) +@@ -310,7 +351,15 @@ static void bfq_init_entity(struct bfq_entity *entity, + if (bfqq) { + bfqq->ioprio = bfqq->new_ioprio; + bfqq->ioprio_class = bfqq->new_ioprio_class; ++#ifdef BFQ_MQ ++ /* ++ * Make sure that bfqg and its associated blkg do not ++ * disappear before entity. 
++ */ ++ bfqg_and_blkg_get(bfqg); ++#else + bfqg_get(bfqg); ++#endif + } + entity->parent = bfqg->my_entity; /* NULL for root group */ + entity->sched_data = &bfqg->sched_data; +@@ -397,6 +446,10 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) + return NULL; + } + ++#ifdef BFQ_MQ ++ /* see comments in bfq_bic_update_cgroup for why refcounting */ ++ bfqg_get(bfqg); ++#endif + return &bfqg->pd; + } + +@@ -432,7 +485,11 @@ static void bfq_pd_free(struct blkg_policy_data *pd) + struct bfq_group *bfqg = pd_to_bfqg(pd); + + bfqg_stats_exit(&bfqg->stats); +- return kfree(bfqg); ++#ifdef BFQ_MQ ++ bfqg_put(bfqg); ++#else ++ kfree(bfqg); ++#endif + } + + static void bfq_pd_reset_stats(struct blkg_policy_data *pd) +@@ -516,9 +573,16 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, + * Move @bfqq to @bfqg, deactivating it from its old group and reactivating + * it on the new one. Avoid putting the entity on the old group idle tree. + * ++#ifdef BFQ_MQ ++ * Must be called under the scheduler lock, to make sure that the blkg ++ * owning @bfqg does not disappear (see comments in ++ * bfq_bic_update_cgroup on guaranteeing the consistency of blkg ++ * objects). ++#else + * Must be called under the queue lock; the cgroup owning @bfqg must + * not disappear (by now this just means that we are called under + * rcu_read_lock()). ++#endif + */ + static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_group *bfqg) +@@ -555,16 +619,20 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + entity->tree); + bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); + } ++#ifdef BFQ_MQ ++ bfqg_and_blkg_put(bfqq_group(bfqq)); ++#else + bfqg_put(bfqq_group(bfqq)); ++#endif + +- /* +- * Here we use a reference to bfqg. We don't need a refcounter +- * as the cgroup reference will not be dropped, so that its +- * destroy() callback will not be invoked. +- */ + entity->parent = bfqg->my_entity; + entity->sched_data = &bfqg->sched_data; ++#ifdef BFQ_MQ ++ /* pin down bfqg and its associated blkg */ ++ bfqg_and_blkg_get(bfqg); ++#else + bfqg_get(bfqg); ++#endif + + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); + if (bfq_bfqq_busy(bfqq)) { +@@ -585,8 +653,14 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + * @bic: the bic to move. + * @blkcg: the blk-cgroup to move to. + * ++#ifdef BFQ_MQ ++ * Move bic to blkcg, assuming that bfqd->lock is held; which makes ++ * sure that the reference to cgroup is valid across the call (see ++ * comments in bfq_bic_update_cgroup on this issue) ++#else + * Move bic to blkcg, assuming that bfqd->queue is locked; the caller + * has to make sure that the reference to cgroup is valid across the call. ++#endif + * + * NOTE: an alternative approach might have been to store the current + * cgroup in bfqq and getting a reference to it, reducing the lookup +@@ -645,6 +719,59 @@ static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) + goto out; + + bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); ++#ifdef BFQ_MQ ++ /* ++ * Update blkg_path for bfq_log_* functions. We cache this ++ * path, and update it here, for the following ++ * reasons. Operations on blkg objects in blk-cgroup are ++ * protected with the request_queue lock, and not with the ++ * lock that protects the instances of this scheduler ++ * (bfqd->lock). This exposes BFQ to the following sort of ++ * race. 
++ * ++ * The blkg_lookup performed in bfq_get_queue, protected ++ * through rcu, may happen to return the address of a copy of ++ * the original blkg. If this is the case, then the ++ * bfqg_and_blkg_get performed in bfq_get_queue, to pin down ++ * the blkg, is useless: it does not prevent blk-cgroup code ++ * from destroying both the original blkg and all objects ++ * directly or indirectly referred by the copy of the ++ * blkg. ++ * ++ * On the bright side, destroy operations on a blkg invoke, as ++ * a first step, hooks of the scheduler associated with the ++ * blkg. And these hooks are executed with bfqd->lock held for ++ * BFQ. As a consequence, for any blkg associated with the ++ * request queue this instance of the scheduler is attached ++ * to, we are guaranteed that such a blkg is not destroyed, and ++ * that all the pointers it contains are consistent, while we ++ * are holding bfqd->lock. A blkg_lookup performed with ++ * bfqd->lock held then returns a fully consistent blkg, which ++ * remains consistent until this lock is held. ++ * ++ * Thanks to the last fact, and to the fact that: (1) bfqg has ++ * been obtained through a blkg_lookup in the above ++ * assignment, and (2) bfqd->lock is being held, here we can ++ * safely use the policy data for the involved blkg (i.e., the ++ * field bfqg->pd) to get to the blkg associated with bfqg, ++ * and then we can safely use any field of blkg. After we ++ * release bfqd->lock, even just getting blkg through this ++ * bfqg may cause dangling references to be traversed, as ++ * bfqg->pd may not exist any more. ++ * ++ * In view of the above facts, here we cache, in the bfqg, any ++ * blkg data we may need for this bic, and for its associated ++ * bfq_queue. As of now, we need to cache only the path of the ++ * blkg, which is used in the bfq_log_* functions. ++ * ++ * Finally, note that bfqg itself needs to be protected from ++ * destruction on the blkg_free of the original blkg (which ++ * invokes bfq_pd_free). We use an additional private ++ * refcounter for bfqg, to let it disappear only after no ++ * bfq_queue refers to it any longer. ++ */ ++ blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path)); ++#endif + bic->blkcg_serial_nr = serial_nr; + out: + rcu_read_unlock(); +@@ -682,8 +809,6 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, + * @bfqd: the device data structure with the root group. + * @bfqg: the group to move from. + * @st: the service tree with the entities. +- * +- * Needs queue_lock to be taken and reference to be valid over the call. + */ + static void bfq_reparent_active_entities(struct bfq_data *bfqd, + struct bfq_group *bfqg, +@@ -736,6 +861,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) + #ifdef BFQ_MQ + spin_lock_irqsave(&bfqd->lock, flags); + #endif ++ + /* + * Empty all service_trees belonging to this group before + * deactivating the group itself. +@@ -746,8 +872,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) + /* + * The idle tree may still contain bfq_queues belonging + * to exited task because they never migrated to a different +- * cgroup from the one being destroyed now. No one else +- * can access them so it's safe to act without any lock. ++ * cgroup from the one being destroyed now. 
+ */ + bfq_flush_idle_tree(st); + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index a9e3406fef06..4eb668eeacdc 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -4073,7 +4073,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) + + kmem_cache_free(bfq_pool, bfqq); + #ifdef BFQ_GROUP_IOSCHED_ENABLED +- bfqg_put(bfqg); ++ bfqg_and_blkg_put(bfqg); + #endif + } + +diff --git a/block/bfq-mq.h b/block/bfq-mq.h +index 36ee24a87dda..77ab0f22ed22 100644 +--- a/block/bfq-mq.h ++++ b/block/bfq-mq.h +@@ -695,23 +695,17 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); + static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + + #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ +- char __pbuf[128]; \ +- \ +- blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ + pr_crit("%s bfq%d%c %s " fmt "\n", \ + checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ + (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ +- __pbuf, ##args); \ ++ bfqq_group(bfqq)->blkg_path, ##args); \ + } while (0) + + #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ +- char __pbuf[128]; \ +- \ +- blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ + pr_crit("%s %s " fmt "\n", \ + checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ +- __pbuf, ##args); \ ++ bfqg->blkg_path, ##args); \ + } while (0) + + #else /* BFQ_GROUP_IOSCHED_ENABLED */ +@@ -736,20 +730,14 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); + static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + + #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ +- char __pbuf[128]; \ +- \ +- blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ + (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ +- __pbuf, ##args); \ ++ bfqq_group(bfqq)->blkg_path, ##args); \ + } while (0) + + #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ +- char __pbuf[128]; \ +- \ +- blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ +- blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ ++ blk_add_trace_msg((bfqd)->queue, "%s " fmt, bfqg->blkg_path, ##args);\ + } while (0) + + #else /* BFQ_GROUP_IOSCHED_ENABLED */ +@@ -860,6 +848,12 @@ struct bfq_group { + /* must be the first member */ + struct blkg_policy_data pd; + ++ /* cached path for this blkg (see comments in bfq_bic_update_cgroup) */ ++ char blkg_path[128]; ++ ++ /* reference counter (see comments in bfq_bic_update_cgroup) */ ++ int ref; ++ + struct bfq_entity entity; + struct bfq_sched_data sched_data; + + +From c9137b749aceef6c2dde88e99b2fc978d5952e76 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Sat, 17 Jun 2017 11:18:11 +0200 +Subject: [PATCH 28/51] bfq-mq: fix macro name in conditional invocation of + policy_unregister + +This commit fixes the name of the macro in the conditional group that +invokes blkcg_policy_unregister in bfq_exit for bfq-mq. Because of +this error, blkcg_policy_unregister was never invoked. 
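+
+Condensed from the hunk below, the guard goes from
+
+	#ifdef CONFIG_BFQ_GROUP_ENABLED		/* wrong: never defined anywhere */
+		blkcg_policy_unregister(&blkcg_policy_bfq);
+	#endif
+
+to the intended
+
+	#ifdef BFQ_GROUP_IOSCHED_ENABLED	/* set in bfq-mq.h when CONFIG_MQ_BFQ_GROUP_IOSCHED is enabled */
+		blkcg_policy_unregister(&blkcg_policy_bfq);
+	#endif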
+ +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 4eb668eeacdc..bc1de3f70ea8 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -5669,7 +5669,7 @@ static int __init bfq_init(void) + static void __exit bfq_exit(void) + { + elv_unregister(&iosched_bfq_mq); +-#ifdef CONFIG_BFQ_GROUP_ENABLED ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + blkcg_policy_unregister(&blkcg_policy_bfq); + #endif + bfq_slab_kill(); + +From c7ceb37496f63b2dba4d06946ab85ec97b87bfb5 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Wed, 5 Jul 2017 11:48:17 +0200 +Subject: [PATCH 29/51] Port of "blk-mq-sched: unify request finished methods" + +No need to have two different callouts of bfq vs kyber. + +Signed-off-by: Christoph Hellwig +Signed-off-by: Jens Axboe +--- + block/bfq-mq-iosched.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index bc1de3f70ea8..2598602a0b10 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -4753,7 +4753,7 @@ static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) + bfq_put_queue(bfqq); + } + +-static void bfq_put_rq_private(struct request_queue *q, struct request *rq) ++static void bfq_finish_request(struct request *rq) + { + struct bfq_queue *bfqq; + struct bfq_data *bfqd; +@@ -4814,7 +4814,7 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) + + assert_spin_locked(&bfqd->lock); + if (!RB_EMPTY_NODE(&rq->rb_node)) +- bfq_remove_request(q, rq); ++ bfq_remove_request(rq->q, rq); + bfq_put_rq_priv_body(bfqq); + } + +@@ -5558,7 +5558,7 @@ static struct elv_fs_entry bfq_attrs[] = { + static struct elevator_type iosched_bfq_mq = { + .ops.mq = { + .get_rq_priv = bfq_get_rq_private, +- .put_rq_priv = bfq_put_rq_private, ++ .finish_request = bfq_finish_request, + .exit_icq = bfq_exit_icq, + .insert_requests = bfq_insert_requests, + .dispatch_request = bfq_dispatch_request, + +From 12bef026fe114ab5e2e284772ddc52a8be83fdbc Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Wed, 5 Jul 2017 11:54:57 +0200 +Subject: [PATCH 30/51] Port of "bfq-iosched: fix NULL ioc check in + bfq_get_rq_private" + +icq_to_bic is a container_of operation, so we need to check for NULL +before it. Also move the check outside the spinlock while we're at +it. 
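+
+A note on why the order matters (a sketch, not part of the patch):
+icq_to_bic expands to container_of, which subtracts the offset of the
+icq member, roughly
+
+	bic = (struct bfq_io_cq *)((char *)rq->elv.icq
+				   - offsetof(struct bfq_io_cq, icq));
+
+so unless icq happens to sit at offset 0, a NULL icq would yield a
+small bogus non-NULL pointer, and a later NULL test on bic could never
+fire. Testing rq->elv.icq itself, as the hunk below does, is always
+reliable.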
+ +Signed-off-by: Christoph Hellwig +Signed-off-by: Jens Axboe +--- + block/bfq-mq-iosched.c | 15 +++++---------- + 1 file changed, 5 insertions(+), 10 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 2598602a0b10..c57774a60911 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -4903,16 +4903,17 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, + struct bio *bio) + { + struct bfq_data *bfqd = q->elevator->elevator_data; +- struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); ++ struct bfq_io_cq *bic; + const int is_sync = rq_is_sync(rq); + struct bfq_queue *bfqq; + bool bfqq_already_existing = false, split = false; + bool new_queue = false; + +- spin_lock_irq(&bfqd->lock); ++ if (!rq->elv.icq) ++ return 1; ++ bic = icq_to_bic(rq->elv.icq); + +- if (!bic) +- goto queue_fail; ++ spin_lock_irq(&bfqd->lock); + + bfq_check_ioprio_change(bic, bio); + +@@ -4980,13 +4981,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, + + rq->rq_flags |= RQF_GOT; + spin_unlock_irq(&bfqd->lock); +- + return 0; +- +-queue_fail: +- spin_unlock_irq(&bfqd->lock); +- +- return 1; + } + + static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) + +From 633e5711347df1bf4ca935fd0aa9118a0054f75d Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Wed, 5 Jul 2017 12:02:16 +0200 +Subject: [PATCH 31/51] Port of "blk-mq-sched: unify request prepare methods" + +This patch makes sure we always allocate requests in the core blk-mq +code and use a common prepare_request method to initialize them for +both mq I/O schedulers. For Kyber an additional limit_depth method +is added that is called before allocating the request. + +Also, because none of the initializations can really fail, the new method +does not return an error - instead the bfq finish method is hardened +to deal with the no-IOC case. + +Last but not least this removes the abuse of RQF_QUEUED by the blk-mq +scheduling code as RQF_ELVPRIV is all that is needed now. + +Signed-off-by: Christoph Hellwig +Signed-off-by: Jens Axboe +--- + block/bfq-mq-iosched.c | 13 ++++++++----- + 1 file changed, 8 insertions(+), 5 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index c57774a60911..49ffca1ad6e7 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -4760,6 +4760,10 @@ static void bfq_finish_request(struct request *rq) + struct bfq_io_cq *bic; + + BUG_ON(!rq); ++ ++ if (!rq->elv.icq) ++ return; ++ + bfqq = RQ_BFQQ(rq); + BUG_ON(!bfqq); + +@@ -4899,9 +4903,9 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, + /* + * Allocate bfq data structures associated with this request.
+ */ +-static int bfq_get_rq_private(struct request_queue *q, struct request *rq, +- struct bio *bio) ++static void bfq_prepare_request(struct request *rq, struct bio *bio) + { ++ struct request_queue *q = rq->q; + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_io_cq *bic; + const int is_sync = rq_is_sync(rq); +@@ -4910,7 +4914,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, + bool new_queue = false; + + if (!rq->elv.icq) +- return 1; ++ return; + bic = icq_to_bic(rq->elv.icq); + + spin_lock_irq(&bfqd->lock); +@@ -4981,7 +4985,6 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, + + rq->rq_flags |= RQF_GOT; + spin_unlock_irq(&bfqd->lock); +- return 0; + } + + static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) +@@ -5552,7 +5555,7 @@ static struct elv_fs_entry bfq_attrs[] = { + + static struct elevator_type iosched_bfq_mq = { + .ops.mq = { +- .get_rq_priv = bfq_get_rq_private, ++ .prepare_request = bfq_prepare_request, + .finish_request = bfq_finish_request, + .exit_icq = bfq_exit_icq, + .insert_requests = bfq_insert_requests, + +From 5a321acfce282c3e58ac63582faf6f928ad17f27 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Wed, 5 Jul 2017 12:43:22 +0200 +Subject: [PATCH 32/51] Add list of bfq instances to documentation + +Signed-off-by: Paolo Valente +--- + Documentation/block/bfq-iosched.txt | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt +index 3d6951d63489..8ce6b9a9bacd 100644 +--- a/Documentation/block/bfq-iosched.txt ++++ b/Documentation/block/bfq-iosched.txt +@@ -11,6 +11,15 @@ controllers), BFQ's main features are: + groups (switching back to time distribution when needed to keep + throughput high). + ++If bfq-mq patches have been applied, then the following three ++instances of BFQ are available (otherwise only the first instance): ++- bfq: mainline version of BFQ, for blk-mq ++- bfq-mq: development version of BFQ for blk-mq; this version contains ++ also all latest features not yet landed in mainline, plus many ++ safety checks ++- bfq: BFQ for legacy blk; also this version contains both latest ++ features and safety checks ++ + In its default configuration, BFQ privileges latency over + throughput. So, when needed for achieving a lower latency, BFQ builds + schedules that may lead to a lower throughput. If your main or only +@@ -27,7 +36,7 @@ sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and + to 120-200 MB/s with 4KB random I/O. BFQ is currently being tested on + multi-queue devices too. + +-The table of contents follow. Impatients can just jump to Section 3. ++The table of contents follows. Impatients can just jump to Section 3. 
+ + CONTENTS + + +From 9f2e5b27227fd9254cc258572dc2d4531838c30b Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Wed, 5 Jul 2017 16:28:00 +0200 +Subject: [PATCH 33/51] bfq-sq: fix prefix of names of cgroups parameters + +Signed-off-by: Paolo Valente +--- + Documentation/block/bfq-iosched.txt | 12 +++++++----- + block/bfq-cgroup-included.c | 2 +- + 2 files changed, 8 insertions(+), 6 deletions(-) + +diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt +index 8ce6b9a9bacd..965d82f94db9 100644 +--- a/Documentation/block/bfq-iosched.txt ++++ b/Documentation/block/bfq-iosched.txt +@@ -503,10 +503,12 @@ To get proportional sharing of bandwidth with BFQ for a given device, + BFQ must of course be the active scheduler for that device. + + Within each group directory, the names of the files associated with +-BFQ-specific cgroup parameters and stats begin with the "bfq." +-prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for +-BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group +-parameter to set the weight of a group with BFQ is blkio.bfq.weight ++BFQ-specific cgroup parameters and stats begin with the "bfq.", ++"bfq-sq." or "bfq-mq." prefix, depending on which instance of bfq you ++want to use. So, with cgroups-v1 or cgroups-v2, the full prefix for ++BFQ-specific files is "blkio.bfqX." or "io.bfqX.", where X can be "" ++(i.e., null string), "-sq" or "-mq". For example, the group parameter ++to set the weight of a group with the mainline BFQ is blkio.bfq.weight + or io.bfq.weight. + + Parameters to set +@@ -514,7 +516,7 @@ Parameters to set + + For each group, there is only the following parameter to set. + +-weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the ++weight (namely blkio.bfqX.weight or io.bfqX.weight): the weight of the + group inside its parent. Available values: 1..10000 (default 100). 
The
+ linear mapping between ioprio and weights, described at the beginning
+ of the tunable section, is still valid, but all weights higher than
+diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c
+index d903393ee78a..631e53d9150d 100644
+--- a/block/bfq-cgroup-included.c
++++ b/block/bfq-cgroup-included.c
+@@ -1124,7 +1124,7 @@ bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
+ #ifdef BFQ_MQ
+ #define BFQ_CGROUP_FNAME(param) "bfq-mq."#param
+ #else
+-#define BFQ_CGROUP_FNAME(param) "bfq."#param
++#define BFQ_CGROUP_FNAME(param) "bfq-sq."#param
+ #endif
+
+ static struct cftype bfq_blkcg_legacy_files[] = {
+
+From 92b42df8166939ccf26aa450125b5b575cf6d505 Mon Sep 17 00:00:00 2001
+From: Paolo Valente
+Date: Wed, 5 Jul 2017 21:08:32 +0200
+Subject: [PATCH 34/51] Add to documentation that bfq-mq and bfq-sq contain
+ last fixes too
+
+Signed-off-by: Paolo Valente
+---
+ Documentation/block/bfq-iosched.txt | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
+index 965d82f94db9..0e59f1c9d30e 100644
+--- a/Documentation/block/bfq-iosched.txt
++++ b/Documentation/block/bfq-iosched.txt
+@@ -15,10 +15,10 @@ If bfq-mq patches have been applied, then the following three
+ instances of BFQ are available (otherwise only the first instance):
+ - bfq: mainline version of BFQ, for blk-mq
+ - bfq-mq: development version of BFQ for blk-mq; this version contains
+-  also all latest features not yet landed in mainline, plus many
++  also all latest features and fixes not yet landed in mainline, plus many
+   safety checks
+-- bfq: BFQ for legacy blk; also this version contains both latest
+-  features and safety checks
++- bfq: BFQ for legacy blk; also this version contains latest features
++  and fixes, as well as safety checks
+
+ In its default configuration, BFQ privileges latency over
+ throughput. So, when needed for achieving a lower latency, BFQ builds
+
+From 7f9bdd433b848d4f53c167258bf4d0b3f1ae1923 Mon Sep 17 00:00:00 2001
+From: Lee Tibbert
+Date: Wed, 19 Jul 2017 10:28:32 -0400
+Subject: [PATCH 35/51] Improve most frequently used no-logging path
+
+This patch originated as a fix for compiler unused-variable warnings
+issued when compiling bfq-mq with logging disabled (both
+CONFIG_BLK_DEV_IO_TRACE and CONFIG_BFQ_REDIRECT_TO_CONSOLE
+undefined).
+
+It turns out to have benefits for the bfq-sq path as well.
+
+In most performance-sensitive production builds blktrace_api logging
+will probably be turned off, so it is worth making the no-logging path
+compile without warnings. Any performance benefit is a bonus.
+
+Thank you to T. B. on the bfq-iosched@googlegroups.com list
+for ((void) (bfqq)) simplification/suggestion/improvement. All bugs
+and unclear descriptions are my own doing.
+
+The discussion below is based on the gcc compiler with optimization
+level of at least -O2. Lower optimization levels are unlikely to
+remove no-op instruction equivalents.
+
+Provide three improvements in this likely case.
+
+  1) Fix multiple occurrences of an unused-variable warning
+     issued when compiling bfq-mq with no logging. The warning
+     occurred each time the bfq_log_bfqg macro was expanded inside
+     a code block such as the following snippet from
+     block/bfq-sched.c, line 139 and a few following, lightly edited for
+     indentation in order to pass checkpatch.pl maximum line lengths.
+
+else {
+	struct bfq_group *bfqg =
+		container_of(next_in_service,
+			     struct bfq_group, entity);
+
+	bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+		     "update_next_in_service: chosen this entity");
+ }
+
+     Previously bfq-mq.h expanded bfq_log_bfqg to blk_add_trace_msg.
+     When both bfq console logging and blktrace_api logging are
+     disabled, the blk_add_trace_msg() macro in
+     include/linux/blktrace_api.h expands to
+     do { } while (0), leaving the code block local variable unused.
+
+     bfq_log_bfqq() had similar behavior but is never called with
+     a potentially unused variable. This patch fixes that macro for
+     consistency.
+
+     bfq-sq.h (single queue) with blktrace_api enabled, and the bfq
+     console logging macros, have code paths which do not trigger this
+     warning.
+
+     kernel.org (4.12 & 4.13) bfq (bfq-iosched.h) could trigger
+     the warning but no code does so now. This patch fixes
+     bfq-iosched.h for consistency.
+
+     The style above enables a software engineering approach where
+     complex expressions are moved to a local variable before the
+     bfq_log* call. This makes it easier to read the expression and
+     use breakpoints to verify it. bfq-mq uses this approach in
+     several places.
+
+     New bfq_log* macros are provided for the no-logging case.
+     I touch only the second argument, because current code never
+     uses the local variable approach with the first or other
+     arguments. I tried to balance consistency with simplicity.
+
+  2) For bfq-sq, reduce to zero the number of instructions executed
+     when no logging is configured. No sense marshaling arguments
+     which are never going to be used.
+
+     On a trial V8R11 build, this reduced the size of bfq-iosched.o
+     by 14.3 KiB. The size went from 70304 to 55664 bytes.
+
+     bfq-mq and kernel.org bfq code size does not change because
+     existing macros already optimize to zero bytes when not logging.
+     The current changes maintain consistency with the bfq-sq path
+     and make the bfq-mq & bfq no-logging paths resistant to future
+     logging-path macro changes which might otherwise cause code to
+     be generated.
+
+  3) Slightly reduce compile time of all bfq variants by including
+     blktrace_api.h only when it will be used.
+
+Signed-off-by: Lee Tibbert
+---
+ block/bfq-mq.h | 18 +++++++++++++++++-
+ block/bfq.h    | 18 +++++++++++++++++-
+ 2 files changed, 34 insertions(+), 2 deletions(-)
+
+diff --git a/block/bfq-mq.h b/block/bfq-mq.h
+index 77ab0f22ed22..7ed2cc29be57 100644
+--- a/block/bfq-mq.h
++++ b/block/bfq-mq.h
+@@ -15,7 +15,6 @@
+ #ifndef _BFQ_H
+ #define _BFQ_H
+
+-#include <linux/blktrace_api.h>
+ #include <linux/hrtimer.h>
+ #include <linux/blk-cgroup.h>
+
+@@ -725,6 +724,21 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
+ 		  ##args)
+
+ #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */
++
++#if !defined(CONFIG_BLK_DEV_IO_TRACE)
++
++/* Avoid possible "unused-variable" warning. See commit message. */
++
++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) ((void) (bfqq))
++
++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) ((void) (bfqg))
++
++#define bfq_log(bfqd, fmt, args...) do {} while (0)
++
++#else /* CONFIG_BLK_DEV_IO_TRACE */
++
++#include <linux/blktrace_api.h>
++
+ #ifdef BFQ_GROUP_IOSCHED_ENABLED
+ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
+@@ -752,6 +766,8 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
+
+ #define bfq_log(bfqd, fmt, args...) \
+ 	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
++
++#endif /* CONFIG_BLK_DEV_IO_TRACE */
+ #endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */
+
+ /* Expiration reasons.
*/
+diff --git a/block/bfq.h b/block/bfq.h
+index 53954d1b87f8..15d326f466b7 100644
+--- a/block/bfq.h
++++ b/block/bfq.h
+@@ -15,7 +15,6 @@
+ #ifndef _BFQ_H
+ #define _BFQ_H
+
+-#include <linux/blktrace_api.h>
+ #include <linux/hrtimer.h>
+ #include <linux/blk-cgroup.h>
+
+@@ -725,6 +724,21 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
+ 		  ##args)
+
+ #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */
++
++#if !defined(CONFIG_BLK_DEV_IO_TRACE)
++
++/* Avoid possible "unused-variable" warning. See commit message. */
++
++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) ((void) (bfqq))
++
++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) ((void) (bfqg))
++
++#define bfq_log(bfqd, fmt, args...) do {} while (0)
++
++#else /* CONFIG_BLK_DEV_IO_TRACE */
++
++#include <linux/blktrace_api.h>
++
+ #ifdef BFQ_GROUP_IOSCHED_ENABLED
+ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
+@@ -759,6 +773,8 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
+
+ #define bfq_log(bfqd, fmt, args...) \
+ 	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
++
++#endif /* CONFIG_BLK_DEV_IO_TRACE */
+ #endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */
+
+ /* Expiration reasons. */
+
+From f11a0e751e741bf94c6a48234824d50b3c0100ad Mon Sep 17 00:00:00 2001
+From: Paolo Valente
+Date: Wed, 9 Aug 2017 16:40:39 +0200
+Subject: [PATCH 36/51] bfq-sq: fix commit "Remove all get and put of I/O
+ contexts" in branch bfq-mq
+
+The commit "Remove all get and put of I/O contexts" erroneously removed
+the reset of the field in_service_bic for bfq-sq. This commit re-adds
+that missing reset.
+
+Signed-off-by: Paolo Valente
+---
+ block/bfq-sched.c      | 7 +++++++
+ block/bfq-sq-iosched.c | 1 +
+ 2 files changed, 8 insertions(+)
+
+diff --git a/block/bfq-sched.c b/block/bfq-sched.c
+index 9c4e6797d8c9..7425824c26b8 100644
+--- a/block/bfq-sched.c
++++ b/block/bfq-sched.c
+@@ -1904,6 +1904,13 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
+ 	struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
+ 	struct bfq_entity *entity = in_serv_entity;
+
++#ifndef BFQ_MQ
++	if (bfqd->in_service_bic) {
++		put_io_context(bfqd->in_service_bic->icq.ioc);
++		bfqd->in_service_bic = NULL;
++	}
++#endif
++
+ 	bfq_clear_bfqq_wait_request(in_serv_bfqq);
+ 	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
+ 	bfqd->in_service_queue = NULL;
+diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c
+index 25da0d1c0622..e1960bf149d8 100644
+--- a/block/bfq-sq-iosched.c
++++ b/block/bfq-sq-iosched.c
+@@ -3765,6 +3765,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd,
+ 	if (!bfqd->in_service_bic) {
+ 		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
+ 		bfqd->in_service_bic = RQ_BIC(rq);
++		BUG_ON(!bfqd->in_service_bic);
+ 	}
+
+ 	if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
+
+From eceae5457530df8598557767d7be258ca9384de4 Mon Sep 17 00:00:00 2001
+From: Paolo Valente
+Date: Wed, 9 Aug 2017 22:29:01 +0200
+Subject: [PATCH 37/51] bfq-sq-mq: make lookup_next_entity push up vtime on
+ expirations
+
+To provide a very smooth service, bfq starts to serve a bfq_queue
+only if the queue is 'eligible', i.e., if the same queue would
+have started to be served in the ideal, perfectly fair system that
+bfq simulates internally. This is obtained by associating each
+queue with a virtual start time, and by computing a special system
+virtual time quantity: a queue is eligible only if the system
+virtual time has reached the virtual start time of the
+queue.
Finally, bfq guarantees that, when a new queue must be set
+in service, there is always at least one eligible entity for each
+active parent entity in the scheduler. To provide this guarantee,
+the function __bfq_lookup_next_entity pushes up, for each parent
+entity on which it is invoked, the system virtual time to the
+minimum among the virtual start times of the entities in the
+active tree for the parent entity (more precisely, the push up
+occurs if the system virtual time happens to be lower than all
+such virtual start times).
+
+There is however a circumstance in which __bfq_lookup_next_entity
+cannot push up the system virtual time for a parent entity, even
+if the system virtual time is lower than the virtual start times
+of all the child entities in the active tree. It happens if one of
+the child entities is in service. In fact, in such a case, there
+is already an eligible entity, the in-service one, even if it may
+not be present in the active tree (because in-service entities
+may be removed from the active tree).
+
+Unfortunately, in the last re-design of the
+hierarchical-scheduling engine, the reset of the pointer to the
+in-service entity for a given parent entity--reset to be done as a
+consequence of the expiration of the in-service entity--always
+happens after the function __bfq_lookup_next_entity has been
+invoked. This causes the function to think that there is still an
+entity in service for the parent entity, and then that the system
+virtual time cannot be pushed up, even if actually such a
+no-more-in-service entity has already been properly reinserted
+into the active tree (or in some other tree if no more
+active). Yet, the system virtual time *had* to be pushed up, to be
+ready to correctly choose the next queue to serve. Because of the
+lack of this push up, bfq may wrongly set in service a queue that
+had been speculatively pre-computed as the possible
+next-in-service queue, but that would no longer be the one to serve
+after the expiration and the reinsertion into the active trees of
+the previously in-service entities.
+
+This commit addresses this issue by making
+__bfq_lookup_next_entity properly push up the system virtual time
+if an expiration is occurring.
+
+Signed-off-by: Paolo Valente
+---
+ block/bfq-mq-iosched.c |  4 +--
+ block/bfq-sched.c      | 77 ++++++++++++++++++++++++++++++++------------------
+ block/bfq-sq-iosched.c |  4 +--
+ 3 files changed, 53 insertions(+), 32 deletions(-)
+
+diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c
+index 49ffca1ad6e7..b5c848650375 100644
+--- a/block/bfq-mq-iosched.c
++++ b/block/bfq-mq-iosched.c
+@@ -682,7 +682,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
+ 		entity->budget = new_budget;
+ 		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
+ 			     new_budget);
+-		bfq_requeue_bfqq(bfqd, bfqq);
++		bfq_requeue_bfqq(bfqd, bfqq, false);
+ 	}
+ }
+
+@@ -2822,7 +2822,7 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+
+ 		bfq_del_bfqq_busy(bfqd, bfqq, true);
+ 	} else {
+-		bfq_requeue_bfqq(bfqd, bfqq);
++		bfq_requeue_bfqq(bfqd, bfqq, true);
+ 		/*
+ 		 * Resort priority tree of potential close cooperators.
+ 		 */
+diff --git a/block/bfq-sched.c b/block/bfq-sched.c
+index 7425824c26b8..f3001af37256 100644
+--- a/block/bfq-sched.c
++++ b/block/bfq-sched.c
+@@ -33,7 +33,8 @@ static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
+ 	return rb_entry(node, struct bfq_entity, rb_node);
+ }
+
+-static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);
++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
++						 bool expiration);
+
+ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
+
+@@ -43,6 +44,8 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
+  * @new_entity: if not NULL, pointer to the entity whose activation,
+  *		requeueing or repositionig triggered the invocation of
+  *		this function.
++ * @expiration: if true, this function is being invoked after the
++ *		expiration of the in-service entity
+  *
+  * This function is called to update sd->next_in_service, which, in
+  * its turn, may change as a consequence of the insertion or
+@@ -61,7 +64,8 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
+  * entity.
+  */
+ static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
+-				       struct bfq_entity *new_entity)
++				       struct bfq_entity *new_entity,
++				       bool expiration)
+ {
+ 	struct bfq_entity *next_in_service = sd->next_in_service;
+ 	struct bfq_queue *bfqq;
+@@ -120,7 +124,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
+ 		if (replace_next)
+ 			next_in_service = new_entity;
+ 	} else /* invoked because of a deactivation: lookup needed */
+-		next_in_service = bfq_lookup_next_entity(sd);
++		next_in_service = bfq_lookup_next_entity(sd, expiration);
+
+ 	if (next_in_service) {
+ 		parent_sched_may_change = !sd->next_in_service ||
+@@ -1291,10 +1295,12 @@ static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
+  * @requeue: true if this is a requeue, which implies that bfqq is
+  * being expired; thus ALL its ancestors stop being served and must
+  * therefore be requeued
++ * @expiration: true if this function is being invoked in the expiration path
++ *		of the in-service queue
+  */
+ static void bfq_activate_requeue_entity(struct bfq_entity *entity,
+ 					bool non_blocking_wait_rq,
+-					bool requeue)
++					bool requeue, bool expiration)
+ {
+ 	struct bfq_sched_data *sd;
+
+@@ -1307,7 +1313,8 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity,
+ 		RB_EMPTY_ROOT(&(sd->service_tree+1)->active) &&
+ 		RB_EMPTY_ROOT(&(sd->service_tree+2)->active));
+
+-		if (!bfq_update_next_in_service(sd, entity) && !requeue) {
++		if (!bfq_update_next_in_service(sd, entity, expiration) &&
++		    !requeue) {
+ 			BUG_ON(!sd->next_in_service);
+ 			break;
+ 		}
+@@ -1373,6 +1380,8 @@ static bool __bfq_deactivate_entity(struct bfq_entity *entity,
+  * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
+  * @entity: the entity to deactivate.
+  * @ins_into_idle_tree: true if the entity can be put into the idle tree
++ * @expiration: true if this function is being invoked in the expiration path
++ *		of the in-service queue
+  */
+ static void bfq_deactivate_entity(struct bfq_entity *entity,
+ 				  bool ins_into_idle_tree,
+@@ -1417,7 +1426,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity,
+ 			 * then, since entity has just been
+ 			 * deactivated, a new one must be found.
+ 			 */
+-			bfq_update_next_in_service(sd, NULL);
++			bfq_update_next_in_service(sd, NULL, expiration);
+
+ 		if (sd->next_in_service || sd->in_service_entity) {
+ 			/*
+@@ -1495,7 +1504,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity,
+ 				     "invoking udpdate_next for this entity");
+ 		}
+ #endif
+-		if (!bfq_update_next_in_service(sd, entity) &&
++		if (!bfq_update_next_in_service(sd, entity, expiration) &&
+ 		    !expiration)
+ 			/*
+ 			 * next_in_service unchanged or not causing
+@@ -1524,7 +1533,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
+ 		if (bfqq)
+ 			bfq_log_bfqq(bfqq->bfqd, bfqq,
+ 				     "calc_vtime_jump: new value %llu",
+-				     root_entity->min_start);
++				     ((root_entity->min_start>>10)*1000)>>12);
+ #ifdef BFQ_GROUP_IOSCHED_ENABLED
+ 		else {
+ 			struct bfq_group *bfqg =
+@@ -1533,7 +1542,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
+
+ 			bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+ 				     "calc_vtime_jump: new value %llu",
+-				     root_entity->min_start);
++				     ((root_entity->min_start>>10)*1000)>>12);
+ 		}
+ #endif
+ 	return root_entity->min_start;
+@@ -1615,17 +1624,9 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
+  * 3) is idle.
+  */
+ static struct bfq_entity *
+-__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service
+-#if 0
+-			 , bool force
+-#endif
+-			 )
++__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
+ {
+-	struct bfq_entity *entity
+-#if 0
+-		, *new_next_in_service = NULL
+-#endif
+-		;
++	struct bfq_entity *entity;
+ 	u64 new_vtime;
+ 	struct bfq_queue *bfqq;
+
+@@ -1667,8 +1668,9 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service
+ 				container_of(entity, struct bfq_group, entity);
+
+ 			bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+-				     "__lookup_next: start %llu vtime %llu st %p",
++				     "__lookup_next: start %llu vtime %llu (%llu) st %p",
+ 				     ((entity->start>>10)*1000)>>12,
++				     ((st->vtime>>10)*1000)>>12,
+ 				     ((new_vtime>>10)*1000)>>12, st);
+ 		}
+ #endif
+@@ -1681,12 +1683,14 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service
+ /**
+  * bfq_lookup_next_entity - return the first eligible entity in @sd.
+  * @sd: the sched_data.
++ * @expiration: true if we are on the expiration path of the in-service queue
+  *
+  * This function is invoked when there has been a change in the trees
+- * for sd, and we need know what is the new next entity after this
+- * change.
++ * for sd, and we need to know what is the new next entity to serve
++ * after this change.
+  */
+-static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
++						 bool expiration)
+ {
+ 	struct bfq_service_tree *st = sd->service_tree;
+ 	struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
+@@ -1716,8 +1720,24 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
+ 	 * class, unless the idle class needs to be served.
+ 	 */
+ 	for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
++		/*
++		 * If expiration is true, then bfq_lookup_next_entity
++		 * is being invoked as a part of the expiration path
++		 * of the in-service queue. In this case, even if
++		 * sd->in_service_entity is not NULL,
++		 * sd->in_service_entity at this point is actually not
++		 * in service any more, and, if needed, has already
++		 * been properly queued or requeued into the right
++		 * tree.
The reason why sd->in_service_entity is still
++		 * not NULL here, even if expiration is true, is that
++		 * sd->in_service_entity is reset as a last step in the
++		 * expiration path. So, if expiration is true, tell
++		 * __bfq_lookup_next_entity that there is no
++		 * sd->in_service_entity.
++		 */
+ 		entity = __bfq_lookup_next_entity(st + class_idx,
+-						  sd->in_service_entity);
++						  sd->in_service_entity &&
++						  !expiration);
+
+ 		if (entity)
+ 			break;
+@@ -1891,7 +1911,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
+ 	for_each_entity(entity) {
+ 		struct bfq_sched_data *sd = entity->sched_data;
+
+-		if(!bfq_update_next_in_service(sd, NULL))
++		if (!bfq_update_next_in_service(sd, NULL, false))
+ 			break;
+ 	}
+
+@@ -1951,16 +1971,17 @@ static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+ 		entity->on_st);
+
+ 	bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
+-				    false);
++				    false, false);
+ 	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
+ }
+
+-static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
++static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
++			     bool expiration)
+ {
+ 	struct bfq_entity *entity = &bfqq->entity;
+
+ 	bfq_activate_requeue_entity(entity, false,
+-				    bfqq == bfqd->in_service_queue);
++				    bfqq == bfqd->in_service_queue, expiration);
+ }
+
+ static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
+diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c
+index e1960bf149d8..42393ab889a9 100644
+--- a/block/bfq-sq-iosched.c
++++ b/block/bfq-sq-iosched.c
+@@ -644,7 +644,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
+ 		entity->budget = new_budget;
+ 		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
+ 			     new_budget);
+-		bfq_requeue_bfqq(bfqd, bfqq);
++		bfq_requeue_bfqq(bfqd, bfqq, false);
+ 	}
+ }
+
+@@ -2715,7 +2715,7 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+
+ 		bfq_del_bfqq_busy(bfqd, bfqq, true);
+ 	} else {
+-		bfq_requeue_bfqq(bfqd, bfqq);
++		bfq_requeue_bfqq(bfqd, bfqq, true);
+ 		/*
+ 		 * Resort priority tree of potential close cooperators.
+ 		 */
+
+From ee9f95b24e1d88ffba4845981c2a4684aefd0245 Mon Sep 17 00:00:00 2001
+From: Paolo Valente
+Date: Wed, 9 Aug 2017 22:53:00 +0200
+Subject: [PATCH 38/51] bfq-sq-mq: remove direct switch to an entity in higher
+ class
+
+If the function bfq_update_next_in_service is invoked as a consequence
+of the activation or requeueing of an entity, say E, and finds out
+that E belongs to a higher-priority class than that of the current
+next-in-service entity, then it sets next_in_service directly to
+E. But this may lead to anomalous schedules, because E may happen not
+to be eligible for service, because its virtual start time is higher
+than the system virtual time for its service tree.
+
+This commit addresses this issue by simply removing this direct
+switch.
+
+Signed-off-by: Paolo Valente
+---
+ block/bfq-sched.c | 19 +++++--------------
+ 1 file changed, 5 insertions(+), 14 deletions(-)
+
+diff --git a/block/bfq-sched.c b/block/bfq-sched.c
+index f3001af37256..b1a59088db88 100644
+--- a/block/bfq-sched.c
++++ b/block/bfq-sched.c
+@@ -76,9 +76,8 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
+ 	 * or repositiong of an entity that does not coincide with
+ 	 * sd->next_in_service, then a full lookup in the active tree
+ 	 * can be avoided.
In fact, it is enough to check whether the +- * just-modified entity has a higher priority than +- * sd->next_in_service, or, even if it has the same priority +- * as sd->next_in_service, is eligible and has a lower virtual ++ * just-modified entity has the same priority as ++ * sd->next_in_service, is eligible and has a lower virtual + * finish time than sd->next_in_service. If this compound + * condition holds, then the new entity becomes the new + * next_in_service. Otherwise no change is needed. +@@ -94,9 +93,8 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, + + /* + * If there is already a next_in_service candidate +- * entity, then compare class priorities or timestamps +- * to decide whether to replace sd->service_tree with +- * new_entity. ++ * entity, then compare timestamps to decide whether ++ * to replace sd->service_tree with new_entity. + */ + if (next_in_service) { + unsigned int new_entity_class_idx = +@@ -104,10 +102,6 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, + struct bfq_service_tree *st = + sd->service_tree + new_entity_class_idx; + +- /* +- * For efficiency, evaluate the most likely +- * sub-condition first. +- */ + replace_next = + (new_entity_class_idx == + bfq_class_idx(next_in_service) +@@ -115,10 +109,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, + !bfq_gt(new_entity->start, st->vtime) + && + bfq_gt(next_in_service->finish, +- new_entity->finish)) +- || +- new_entity_class_idx < +- bfq_class_idx(next_in_service); ++ new_entity->finish)); + } + + if (replace_next) + +From a3fdc5af40537355b68c1f0d3997c5a5fb54b9ce Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Thu, 10 Aug 2017 08:15:50 +0200 +Subject: [PATCH 39/51] bfq-sq-mq: guarantee update_next_in_service always + returns an eligible entity + +If the function bfq_update_next_in_service is invoked as a consequence +of the activation or requeueing of an entity, say E, then it doesn't +invoke bfq_lookup_next_entity to get the next-in-service entity. In +contrast, it follows a shorter path: if E happens to be eligible (see +commit "bfq-sq-mq: make lookup_next_entity push up vtime on +expirations" for details on eligibility) and to have a lower virtual +finish time than the current candidate as next-in-service entity, then +E directly becomes the next-in-service entity. Unfortunately, there is +a corner case for which this shorter path makes +bfq_update_next_in_service choose a non eligible entity: it occurs if +both E and the current next-in-service entity happen to be non +eligible when bfq_update_next_in_service is invoked. In this case, E +is not set as next-in-service, and, since bfq_lookup_next_entity is +not invoked, the state of the parent entity is not updated so as to +end up with an eligible entity as the proper next-in-service entity. + +In this respect, next-in-service is actually allowed to be non +eligible while some queue is in service: since no system-virtual-time +push-up can be performed in that case (see again commit "bfq-sq-mq: +make lookup_next_entity push up vtime on expirations" for details), +next-in-service is chosen, speculatively, as a function of the +possible value that the system virtual time may get after a push +up. But the correctness of the schedule breaks if next-in-service is +still a non eligible entity when it is time to set in service the next +entity. Unfortunately, this may happen in the above corner case. 
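Before the fix is described, a condensed userspace model of the decision
at stake may help; the identifiers are simplified stand-ins for those in
the hunks below, and the full tree lookup is stubbed out:

#include <stdbool.h>
#include <stddef.h>

struct entity { unsigned long long start, finish; int class_idx; };

static bool bfq_gt(unsigned long long a, unsigned long long b)
{
	return a > b;
}

/* Stand-in for bfq_lookup_next_entity(): the real function scans the
 * active trees and may push up the virtual time; elided here. */
static struct entity *lookup_next_entity(void)
{
	return NULL;
}

static struct entity *
update_next_in_service(struct entity *next_in_service,
		       struct entity *new_entity, /* NULL on deactivation */
		       unsigned long long vtime)
{
	bool change_without_lookup = false;

	if (new_entity && new_entity != next_in_service) {
		change_without_lookup = true;
		if (next_in_service)
			change_without_lookup =
				(new_entity->class_idx ==
				 next_in_service->class_idx &&
				 !bfq_gt(new_entity->start, vtime) &&
				 bfq_gt(next_in_service->finish,
					new_entity->finish));
		if (change_without_lookup)
			next_in_service = new_entity;
	}

	/* The point of the fix: fall back to a full lookup whenever the
	 * shorter path was not taken, i.e., also on activations and
	 * requeueings that fail the eligibility test above, not only on
	 * deactivations as before. */
	if (!change_without_lookup)
		next_in_service = lookup_next_entity();

	return next_in_service;
}

int main(void)
{
	struct entity cur = { 10, 50, 0 }, new = { 60, 70, 0 };

	/* new is not eligible at vtime 40: the shorter path is refused
	 * and a full lookup runs instead of keeping a stale candidate. */
	return update_next_in_service(&cur, &new, 40) == NULL ? 0 : 1;
}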
+ +This commit fixes this problem by making bfq_update_next_in_service +invoke bfq_lookup_next_entity not only if the above shorter path +cannot be taken, but also if the shorter path is taken but fails to +yield an eligible next-in-service entity. + +Signed-off-by: Paolo Valente +--- + block/bfq-sched.c | 38 ++++++++++++++++++++++++++++---------- + 1 file changed, 28 insertions(+), 10 deletions(-) + +diff --git a/block/bfq-sched.c b/block/bfq-sched.c +index b1a59088db88..e4a2553a2d2c 100644 +--- a/block/bfq-sched.c ++++ b/block/bfq-sched.c +@@ -70,6 +70,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *next_in_service = sd->next_in_service; + struct bfq_queue *bfqq; + bool parent_sched_may_change = false; ++ bool change_without_lookup = false; + + /* + * If this update is triggered by the activation, requeueing +@@ -89,7 +90,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, + * set to true, and left as true if + * sd->next_in_service is NULL. + */ +- bool replace_next = true; ++ change_without_lookup = true; + + /* + * If there is already a next_in_service candidate +@@ -102,7 +103,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, + struct bfq_service_tree *st = + sd->service_tree + new_entity_class_idx; + +- replace_next = ++ change_without_lookup = + (new_entity_class_idx == + bfq_class_idx(next_in_service) + && +@@ -112,15 +113,32 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, + new_entity->finish)); + } + +- if (replace_next) ++ if (change_without_lookup) { + next_in_service = new_entity; +- } else /* invoked because of a deactivation: lookup needed */ ++ bfqq = bfq_entity_to_bfqq(next_in_service); ++ ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "update_next_in_service: chose without lookup"); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ else { ++ struct bfq_group *bfqg = ++ container_of(next_in_service, ++ struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data*)bfqg->bfqd, bfqg, ++ "update_next_in_service: chose without lookup"); ++ } ++#endif ++ } ++ } ++ ++ if (!change_without_lookup) /* lookup needed */ + next_in_service = bfq_lookup_next_entity(sd, expiration); + +- if (next_in_service) { ++ if (next_in_service) + parent_sched_may_change = !sd->next_in_service || + bfq_update_parent_budget(next_in_service); +- } + + sd->next_in_service = next_in_service; + +@@ -1053,7 +1071,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, + + if (bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "__activate_entity: new queue finish %llu", ++ "update_fin_time_enqueue: new queue finish %llu", + ((entity->finish>>10)*1000)>>12); + #ifdef BFQ_GROUP_IOSCHED_ENABLED + } else { +@@ -1061,7 +1079,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, +- "__activate_entity: new group finish %llu", ++ "update_fin_time_enqueue: new group finish %llu", + ((entity->finish>>10)*1000)>>12); + #endif + } +@@ -1071,7 +1089,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, + + if (bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "__activate_entity: queue %seligible in st %p", ++ "update_fin_time_enqueue: queue %seligible in st %p", + entity->start <= st->vtime ? 
"" : "non ", st); + #ifdef BFQ_GROUP_IOSCHED_ENABLED + } else { +@@ -1079,7 +1097,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, +- "__activate_entity: group %seligible in st %p", ++ "update_fin_time_enqueue: group %seligible in st %p", + entity->start <= st->vtime ? "" : "non ", st); + #endif + } + +From 6565e4d1aac029b6f0a5d86a4c6ef38608838eac Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Thu, 31 Aug 2017 19:24:26 +0200 +Subject: [PATCH 40/51] doc, block, bfq: fix some typos and stale sentences + +Signed-off-by: Paolo Valente +Reviewed-by: Jeremy Hickman +Reviewed-by: Laurentiu Nicola +--- + Documentation/block/bfq-iosched.txt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt +index 0e59f1c9d30e..dcfe15523da3 100644 +--- a/Documentation/block/bfq-iosched.txt ++++ b/Documentation/block/bfq-iosched.txt +@@ -17,7 +17,7 @@ instances of BFQ are available (otherwise only the first instance): + - bfq-mq: development version of BFQ for blk-mq; this version contains + also all latest features and fixes not yet landed in mainline, plus many + safety checks +-- bfq: BFQ for legacy blk; also this version contains latest features ++- bfq-sq: BFQ for legacy blk; also this version contains latest features + and fixes, as well as safety checks + + In its default configuration, BFQ privileges latency over + +From 261ee8cc9f43e03d790a07184f0bcaa504ee6737 Mon Sep 17 00:00:00 2001 +From: Luca Miccio +Date: Wed, 13 Sep 2017 12:03:56 +0200 +Subject: [PATCH 41/51] bfq-mq, bfq-sq: Disable writeback throttling + +Similarly to CFQ, BFQ has its write-throttling heuristics, and it +is better not to combine them with further write-throttling +heuristics of a different nature. +So this commit disables write-back throttling for a device if BFQ +is used as I/O scheduler for that device. + +Signed-off-by: Luca Miccio +Signed-off-by: Paolo Valente +Tested-by: Oleksandr Natalenko +--- + block/bfq-mq-iosched.c | 2 ++ + block/bfq-sq-iosched.c | 7 +++++++ + 2 files changed, 9 insertions(+) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index b5c848650375..7d27d5b3befb 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -89,6 +89,7 @@ + #include "blk-mq-tag.h" + #include "blk-mq-sched.h" + #include "bfq-mq.h" ++#include "blk-wbt.h" + + /* Expiration time of sync (0) and async (1) requests, in ns. */ + static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; +@@ -5260,6 +5261,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + bfq_init_root_group(bfqd->root_group, bfqd); + bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); + ++ wbt_disable_default(q); + return 0; + + out_free: +diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c +index 42393ab889a9..6fdc3b1d5bb8 100644 +--- a/block/bfq-sq-iosched.c ++++ b/block/bfq-sq-iosched.c +@@ -83,6 +83,7 @@ + #include + #include "blk.h" + #include "bfq.h" ++#include "blk-wbt.h" + + /* Expiration time of sync (0) and async (1) requests, in ns. 
*/
+ static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
+@@ -4976,6 +4977,11 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
+ 	return -ENOMEM;
+ }
+
++static void bfq_registered_queue(struct request_queue *q)
++{
++	wbt_disable_default(q);
++}
++
+ static void bfq_slab_kill(void)
+ {
+ 	kmem_cache_destroy(bfq_pool);
+@@ -5285,6 +5291,7 @@ static struct elevator_type iosched_bfq = {
+ 		.elevator_may_queue_fn =	bfq_may_queue,
+ 		.elevator_init_fn =		bfq_init_queue,
+ 		.elevator_exit_fn =		bfq_exit_queue,
++		.elevator_registered_fn =	bfq_registered_queue,
+ 	},
+ 	.icq_size =		sizeof(struct bfq_io_cq),
+ 	.icq_align =		__alignof__(struct bfq_io_cq),
+
+From 40ea0aed088791da27fcfa51f3b64d1f96b0d06e Mon Sep 17 00:00:00 2001
+From: Paolo Valente
+Date: Tue, 12 Sep 2017 16:45:53 +0200
+Subject: [PATCH 42/51] bfq-mq, bfq-sq: fix wrong init of saved start time for
+ weight raising
+
+This commit fixes a bug that causes bfq to fail to guarantee a high
+responsiveness on some drives, if there is heavy random read+write I/O
+in the background. More precisely, such a failure allowed this bug to
+be found [1], but the bug may well cause other yet unreported
+anomalies.
+
+BFQ raises the weight of the bfq_queues associated with soft real-time
+applications, to privilege the I/O, and thus reduce latency, for these
+applications. This mechanism is named soft-real-time weight raising in
+BFQ. A soft real-time period may happen to be nested into an
+interactive weight raising period, i.e., it may happen that, when a
+bfq_queue switches to a soft real-time weight-raised state, the
+bfq_queue is already being weight-raised because deemed interactive
+too. In this case, BFQ saves, in a special variable
+wr_start_at_switch_to_srt, the time instant when the interactive
+weight-raising period started for the bfq_queue, i.e., the time
+instant when BFQ started to deem the bfq_queue interactive. This value
+is then used to check whether the interactive weight-raising period
+would still be in progress when the soft real-time weight-raising
+period ends. If so, interactive weight raising is restored for the
+bfq_queue. This restore is useful, in particular, because it prevents
+bfq_queues from losing their interactive weight raising prematurely,
+as a consequence of spurious, short-lived soft real-time
+weight-raising periods caused by wrong detections as soft real-time.
+
+If, instead, a bfq_queue switches to soft-real-time weight raising
+while it *is not* already in an interactive weight-raising period,
+then the variable wr_start_at_switch_to_srt has no meaning during the
+following soft real-time weight-raising period. Unfortunately the
+handling of this case is wrong in BFQ: not only is the variable not
+flagged somehow as meaningless, but it is also set to the time when
+the switch to soft real-time weight-raising occurs. This may cause an
+interactive weight-raising period to be considered mistakenly as still
+in progress, and thus a spurious interactive weight-raising period to
+start for the bfq_queue, at the end of the soft-real-time
+weight-raising period. In particular the spurious interactive
+weight-raising period will be considered as still in progress, if the
+soft-real-time weight-raising period does not last very long. The
+bfq_queue will then be wrongly privileged and, if I/O bound, will
+unjustly steal bandwidth from truly interactive or soft real-time
+bfq_queues, harming responsiveness and low latency.
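The fix described next relies on the kernel's wrapping time comparisons.
A self-contained userspace sketch of why jiffies - MAX_JIFFY_OFFSET acts
as a "minus infinity" sentinel (the macros are re-derived from
include/linux/jiffies.h; the numbers are invented):

#include <limits.h>
#include <stdio.h>

/* Userspace model of the kernel's wrapping jiffies comparisons;
 * "jiffies" is an ordinary variable here. */
#define MAX_JIFFY_OFFSET ((LONG_MAX >> 1) - 1)
#define time_after(a, b) ((long)((b) - (a)) < 0)

int main(void)
{
	unsigned long jiffies = 4000000000UL;	/* arbitrary "now" */

	/* Farthest representable past instant: "minus infinity". */
	unsigned long wr_start_at_switch_to_srt = jiffies - MAX_JIFFY_OFFSET;
	unsigned long wr_duration = 5000;	/* made-up duration */

	/* The check made when soft real-time weight raising ends: with
	 * the sentinel, the interactive period can never be considered
	 * still in progress. */
	if (time_after(wr_start_at_switch_to_srt + wr_duration, jiffies))
		printf("interactive wr wrongly considered in progress\n");
	else
		printf("no interactive wr in progress, as intended\n");
	return 0;
}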
+
+This commit fixes this issue by just setting wr_start_at_switch_to_srt
+to minus infinity (farthest past time instant according to jiffies
+macros): when the soft-real-time weight-raising period ends, certainly
+no interactive weight-raising period will be considered as still in
+progress.
+
+[1] Background I/O Type: Random - Background I/O mix: Reads and writes
+- Application to start: LibreOffice Writer in
+http://www.phoronix.com/scan.php?page=news_item&px=Linux-4.13-IO-Laptop
+
+Signed-off-by: Paolo Valente
+Signed-off-by: Angelo Ruocco
+Tested-by: Oleksandr Natalenko
+Tested-by: Lee Tibbert
+Tested-by: Mirko Montanari
+---
+ block/bfq-mq-iosched.c | 50 +++++++++++++++++++++++++++++++-------------------
+ block/bfq-sq-iosched.c | 50 +++++++++++++++++++++++++++++++-------------------
+ 2 files changed, 62 insertions(+), 38 deletions(-)
+
+diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c
+index 7d27d5b3befb..f378519b6d33 100644
+--- a/block/bfq-mq-iosched.c
++++ b/block/bfq-mq-iosched.c
+@@ -1204,6 +1204,24 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
+ 	return wr_or_deserves_wr;
+ }
+
++/*
++ * Return the farthest future time instant according to jiffies
++ * macros.
++ */
++static unsigned long bfq_greatest_from_now(void)
++{
++	return jiffies + MAX_JIFFY_OFFSET;
++}
++
++/*
++ * Return the farthest past time instant according to jiffies
++ * macros.
++ */
++static unsigned long bfq_smallest_from_now(void)
++{
++	return jiffies - MAX_JIFFY_OFFSET;
++}
++
+ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
+ 					     struct bfq_queue *bfqq,
+ 					     unsigned int old_wr_coeff,
+@@ -1218,7 +1236,19 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
+ 			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+ 			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+ 		} else {
+-			bfqq->wr_start_at_switch_to_srt = jiffies;
++			/*
++			 * No interactive weight raising in progress
++			 * here: assign minus infinity to
++			 * wr_start_at_switch_to_srt, to make sure
++			 * that, at the end of the soft-real-time
++			 * weight raising period that is starting
++			 * now, no interactive weight-raising period
++			 * may be wrongly considered as still in
++			 * progress (and thus actually started by
++			 * mistake).
++			 */
++			bfqq->wr_start_at_switch_to_srt =
++				bfq_smallest_from_now();
+ 			bfqq->wr_coeff = bfqd->bfq_wr_coeff *
+ 				BFQ_SOFTRT_WEIGHT_FACTOR;
+ 			bfqq->wr_cur_max_time =
+@@ -3174,24 +3204,6 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
+ 		    jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
+ }
+
+-/*
+- * Return the farthest future time instant according to jiffies
+- * macros.
+- */
+-static unsigned long bfq_greatest_from_now(void)
+-{
+-	return jiffies + MAX_JIFFY_OFFSET;
+-}
+-
+-/*
+- * Return the farthest past time instant according to jiffies
+- * macros.
+- */
+-static unsigned long bfq_smallest_from_now(void)
+-{
+-	return jiffies - MAX_JIFFY_OFFSET;
+-}
+-
+ /**
+  * bfq_bfqq_expire - expire a queue.
+  * @bfqd: device owning the queue.
+diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c
+index 6fdc3b1d5bb8..f4654436cd55 100644
+--- a/block/bfq-sq-iosched.c
++++ b/block/bfq-sq-iosched.c
+@@ -1165,6 +1165,24 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
+ 	return wr_or_deserves_wr;
+ }
+
++/*
++ * Return the farthest future time instant according to jiffies
++ * macros.
++ */
++static unsigned long bfq_greatest_from_now(void)
++{
++	return jiffies + MAX_JIFFY_OFFSET;
++}
++
++/*
++ * Return the farthest past time instant according to jiffies
++ * macros.
++ */
++static unsigned long bfq_smallest_from_now(void)
++{
++	return jiffies - MAX_JIFFY_OFFSET;
++}
++
+ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
+ 					     struct bfq_queue *bfqq,
+ 					     unsigned int old_wr_coeff,
+@@ -1179,7 +1197,19 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
+ 			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+ 			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+ 		} else {
+-			bfqq->wr_start_at_switch_to_srt = jiffies;
++			/*
++			 * No interactive weight raising in progress
++			 * here: assign minus infinity to
++			 * wr_start_at_switch_to_srt, to make sure
++			 * that, at the end of the soft-real-time
++			 * weight raising period that is starting
++			 * now, no interactive weight-raising period
++			 * may be wrongly considered as still in
++			 * progress (and thus actually started by
++			 * mistake).
++			 */
++			bfqq->wr_start_at_switch_to_srt =
++				bfq_smallest_from_now();
+ 			bfqq->wr_coeff = bfqd->bfq_wr_coeff *
+ 				BFQ_SOFTRT_WEIGHT_FACTOR;
+ 			bfqq->wr_cur_max_time =
+@@ -3067,24 +3097,6 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
+ 		    jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
+ }
+
+-/*
+- * Return the farthest future time instant according to jiffies
+- * macros.
+- */
+-static unsigned long bfq_greatest_from_now(void)
+-{
+-	return jiffies + MAX_JIFFY_OFFSET;
+-}
+-
+-/*
+- * Return the farthest past time instant according to jiffies
+- * macros.
+- */
+-static unsigned long bfq_smallest_from_now(void)
+-{
+-	return jiffies - MAX_JIFFY_OFFSET;
+-}
+-
+ /**
+  * bfq_bfqq_expire - expire a queue.
+  * @bfqd: device owning the queue.
+
+From 9dbea44b6f721baeff35b9fdf628ec55fe00e09d Mon Sep 17 00:00:00 2001
+From: Paolo Valente
+Date: Thu, 14 Sep 2017 05:12:58 -0400
+Subject: [PATCH 43/51] Fix commit "Unnest request-queue and ioc locks from
+ scheduler locks"
+
+The commit "Unnest request-queue and ioc locks from scheduler locks"
+mistakenly removed the setting of the split flag in function
+bfq_prepare_request. This commit puts this missing instruction back in
+its place.
+
+Signed-off-by: Paolo Valente
+---
+ block/bfq-mq-iosched.c | 12 ++++++++++++
+ 1 file changed, 12 insertions(+)
+
+diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c
+index f378519b6d33..288078e68a2a 100644
+--- a/block/bfq-mq-iosched.c
++++ b/block/bfq-mq-iosched.c
+@@ -744,6 +744,12 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
+ 		bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
+ 		BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
+
++		bfq_log_bfqq(bfqq->bfqd, bfqq,
++			     "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu",
++			     __func__,
++			     bic, bfqq->wr_coeff, bfqq->last_wr_start_finish,
++			     bfqq->wr_cur_max_time);
++
+ 		if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
+ 		    time_is_before_jiffies(bfqq->last_wr_start_finish +
+ 					   bfqq->wr_cur_max_time))) {
+@@ -2208,6 +2214,11 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
+ 	bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
+ 	bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
+ 	BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
++	bfq_log_bfqq(bfqq->bfqd, bfqq,
++		     "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu",
++		     __func__,
++		     bic, bfqq->wr_coeff, bfqq->last_wr_start_finish,
++		     bfqq->wr_cur_max_time);
+ }
+
+ static void
+@@ -4950,6 +4961,7 @@ static void bfq_prepare_request(struct request *rq, struct bio *bio)
+ 			bic->saved_in_large_burst = true;
+
+ 		bfqq = bfq_split_bfqq(bic, bfqq);
++		split = true;
+
+ 		if (!bfqq)
+ 			bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
+
+From d4ebb2a66a23dc183792088c521f2be2193b56db Mon Sep 17 00:00:00 2001
+From: Paolo Valente
+Date: Fri, 15 Sep 2017 01:53:51 -0400
+Subject: [PATCH 44/51] bfq-sq, bfq-mq: check and switch back to interactive wr
+ also on queue split
+
+As already explained in the message of commit "bfq-mq, bfq-sq: fix
+wrong init of saved start time for weight raising", if a soft
+real-time weight-raising period happens to be nested in a larger
+interactive weight-raising period, then BFQ restores the interactive
+weight raising at the end of the soft real-time weight raising. In
+particular, BFQ performs this check only on request dispatches.
+
+Unfortunately, the above scheme fails to restore interactive weight
+raising in the following corner case: if a bfq_queue, say Q,
+1) Is merged with another bfq_queue while it is in a nested soft
+real-time weight-raising period. The weight-raising state of Q is
+then saved, and not considered any longer until a split occurs.
+2) Is split from the other bfq_queue(s) at a time instant when its
+soft real-time weight raising is already finished.
+On the split, while resuming the previous soft real-time
+weight-raised state of the bfq_queue Q, BFQ checks whether the
+current soft real-time weight-raising period is actually over. If so,
+BFQ switches weight raising off for Q, *without* checking whether the
+soft real-time period was actually nested in a non-yet-finished
+interactive weight-raising period.
+
+This commit addresses this issue by adding the above missing check in
+bfq_queue splits, and restoring interactive weight raising if needed.
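A compact model of the restore logic this commit adds on queue splits;
field and helper names are abbreviated stand-ins, and jiffies wraparound
is deliberately ignored (the real code uses the time_is_* macros shown in
the hunks below):

#include <stdbool.h>

struct bfqq_state {
	unsigned int wr_coeff;
	unsigned long wr_cur_max_time;
	unsigned long last_wr_start_finish;
	unsigned long wr_start_at_switch_to_srt;
	bool in_large_burst;
};

static bool expired(unsigned long start, unsigned long len, unsigned long now)
{
	return start + len < now;	/* no wraparound handling here */
}

static void resume_wr_state(struct bfqq_state *q, unsigned long rt_max_time,
			    unsigned long interactive_duration,
			    unsigned int interactive_coeff, unsigned long now)
{
	if (q->wr_coeff <= 1)
		return;
	if (!q->in_large_burst &&
	    !expired(q->last_wr_start_finish, q->wr_cur_max_time, now))
		return;	/* restored weight-raising period still running */

	/* The previously missing check: a finished soft real-time period
	 * may be nested in a still-running interactive one; if so, switch
	 * back to interactive weight raising instead of ending it. */
	if (q->wr_cur_max_time == rt_max_time && !q->in_large_burst &&
	    !expired(q->wr_start_at_switch_to_srt, interactive_duration, now)) {
		q->wr_coeff = interactive_coeff;
		q->wr_cur_max_time = interactive_duration;
		q->last_wr_start_finish = q->wr_start_at_switch_to_srt;
	} else {
		q->wr_coeff = 1;	/* switch weight raising off */
	}
}

int main(void)
{
	struct bfqq_state q = { 30, 100, 900, 500, false };

	/* At now=1100 the soft real-time period (max time 100, started
	 * around 900) is over, but the interactive period that began at
	 * 500 with duration 2000 is not: weight raising survives. */
	resume_wr_state(&q, 100, 2000, 10, 1100);
	return q.wr_coeff == 10 ? 0 : 1;
}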
+ +Signed-off-by: Paolo Valente +Tested-by: Angelo Ruocco +Tested-by: Mirko Montanari +--- + block/bfq-mq-iosched.c | 29 +++++++++++++++++++++-------- + block/bfq-sq-iosched.c | 35 +++++++++++++++++++++++++++-------- + 2 files changed, 48 insertions(+), 16 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 288078e68a2a..6130a95c6497 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -716,6 +716,15 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) + return dur; + } + ++/* switch back from soft real-time to interactive weight raising */ ++static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, ++ struct bfq_data *bfqd) ++{ ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; ++} ++ + static void + bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, + struct bfq_io_cq *bic, bool bfq_already_existing) +@@ -753,12 +762,20 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, + if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || + time_is_before_jiffies(bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time))) { +- bfq_log_bfqq(bfqq->bfqd, bfqq, ++ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && ++ !bfq_bfqq_in_large_burst(bfqq) && ++ time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + ++ bfq_wr_duration(bfqd))) { ++ switch_back_to_interactive_wr(bfqq, bfqd); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "resume state: switching back to interactive"); ++ } else { ++ bfqq->wr_coeff = 1; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, + "resume state: switching off wr (%lu + %lu < %lu)", + bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, + jiffies); +- +- bfqq->wr_coeff = 1; ++ } + } + + /* make sure weight will be updated, however we got here */ +@@ -3820,11 +3837,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) + bfq_wr_duration(bfqd))) + bfq_bfqq_end_wr(bfqq); + else { +- /* switch back to interactive wr */ +- bfqq->wr_coeff = bfqd->bfq_wr_coeff; +- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); +- bfqq->last_wr_start_finish = +- bfqq->wr_start_at_switch_to_srt; ++ switch_back_to_interactive_wr(bfqq, bfqd); + BUG_ON(time_is_after_jiffies( + bfqq->last_wr_start_finish)); + bfqq->entity.prio_changed = 1; +diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c +index f4654436cd55..e07d5d1c0d40 100644 +--- a/block/bfq-sq-iosched.c ++++ b/block/bfq-sq-iosched.c +@@ -678,6 +678,15 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) + return dur; + } + ++/* switch back from soft real-time to interactive weight raising */ ++static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, ++ struct bfq_data *bfqd) ++{ ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; ++} ++ + static void + bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, + struct bfq_io_cq *bic, bool bfq_already_existing) +@@ -705,15 +714,29 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, + bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); + ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", ++ __func__, ++ bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, ++ bfqq->wr_cur_max_time); ++ + if (bfqq->wr_coeff > 1 && 
(bfq_bfqq_in_large_burst(bfqq) ||
+ 		    time_is_before_jiffies(bfqq->last_wr_start_finish +
+ 					   bfqq->wr_cur_max_time))) {
+-		bfq_log_bfqq(bfqq->bfqd, bfqq,
++		if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
++		    !bfq_bfqq_in_large_burst(bfqq) &&
++		    time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt +
++					     bfq_wr_duration(bfqd))) {
++			switch_back_to_interactive_wr(bfqq, bfqd);
++			bfq_log_bfqq(bfqq->bfqd, bfqq,
++				     "resume state: switching back to interactive");
++		} else {
++			bfqq->wr_coeff = 1;
++			bfq_log_bfqq(bfqq->bfqd, bfqq,
+ 			     "resume state: switching off wr (%lu + %lu < %lu)",
+ 			     bfqq->last_wr_start_finish, bfqq->wr_cur_max_time,
+ 			     jiffies);
+-
+-		bfqq->wr_coeff = 1;
++		}
+ 	}
+
+ 	/* make sure weight will be updated, however we got here */
+@@ -3703,11 +3726,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+ 					 bfq_wr_duration(bfqd)))
+ 				bfq_bfqq_end_wr(bfqq);
+ 			else {
+-				/* switch back to interactive wr */
+-				bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+-				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+-				bfqq->last_wr_start_finish =
+-					bfqq->wr_start_at_switch_to_srt;
++				switch_back_to_interactive_wr(bfqq, bfqd);
+ 				BUG_ON(time_is_after_jiffies(
+ 					       bfqq->last_wr_start_finish));
+ 				bfqq->entity.prio_changed = 1;
+
+From 9eaec0c3a2d675763b09da81c9117a9c43bce942 Mon Sep 17 00:00:00 2001
+From: Paolo Valente
+Date: Fri, 15 Sep 2017 04:58:33 -0400
+Subject: [PATCH 45/51] bfq-sq, bfq-mq: let early-merged queues be
+ weight-raised on split too
+
+A just-created bfq_queue, say Q, may happen to be merged with another
+bfq_queue on the very first invocation of the function
+__bfq_insert_request. In such a case, even if Q would clearly deserve
+interactive weight raising (as it has just been created), the function
+bfq_add_request never gets invoked for Q, and thus weight raising
+never gets activated for Q. As a consequence, when the state of Q
+is saved for a possible future restore, after a split of Q from the
+other bfq_queue(s), such a state happens to be (unjustly)
+non-weight-raised. Then the bfq_queue will not enjoy any weight
+raising on the split, even if it should still be in an interactive
+weight-raising period when the split occurs.
+
+This commit solves this problem as follows, for a just-created
+bfq_queue that is being early-merged: it stores directly, in the saved
+state of the bfq_queue, the weight-raising state that would have been
+assigned to the bfq_queue if not early-merged.
+
+Signed-off-by: Paolo Valente
+Tested-by: Angelo Ruocco
+Tested-by: Mirko Montanari
+---
+ block/bfq-mq-iosched.c | 28 +++++++++++++++++++++++-----
+ block/bfq-sq-iosched.c | 28 +++++++++++++++++++++++-----
+ 2 files changed, 46 insertions(+), 10 deletions(-)
+
+diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c
+index 6130a95c6497..af84e506e897 100644
+--- a/block/bfq-mq-iosched.c
++++ b/block/bfq-mq-iosched.c
+@@ -2226,10 +2226,27 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
+ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
+ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
+ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
+- bic->saved_wr_coeff = bfqq->wr_coeff;
+- bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
+- bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
+- bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
++ if (unlikely(bfq_bfqq_just_created(bfqq) &&
++ !bfq_bfqq_in_large_burst(bfqq))) {
++ /*
++ * bfqq being merged right after being created: bfqq
++ * would have deserved interactive weight raising, but
++ * did not make it to be set in a weight-raised state,
++ * because of this early merge. Store directly the
++ * weight-raising state that would have been assigned
++ * to bfqq, so that to avoid that bfqq unjustly fails
++ * to enjoy weight raising if split soon.
++ */
++ bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff;
++ bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd);
++ bic->saved_last_wr_start_finish = jiffies;
++ } else {
++ bic->saved_wr_coeff = bfqq->wr_coeff;
++ bic->saved_wr_start_at_switch_to_srt =
++ bfqq->wr_start_at_switch_to_srt;
++ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
++ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
++ }
+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu",
+@@ -4560,7 +4577,6 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
+ bfqq->allocated);
+
+ new_bfqq->ref++;
+- bfq_clear_bfqq_just_created(bfqq);
+ /*
+ * If the bic associated with the process
+ * issuing this request still points to bfqq
+@@ -4572,6 +4588,8 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
+ bfqq, new_bfqq);
++
++ bfq_clear_bfqq_just_created(bfqq);
+ /*
+ * rq is about to be enqueued into new_bfqq,
+ * release rq reference on bfqq
+diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c
+index e07d5d1c0d40..0c48f527fe3f 100644
+--- a/block/bfq-sq-iosched.c
++++ b/block/bfq-sq-iosched.c
+@@ -2105,10 +2105,27 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
+ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
+ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
+ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
+- bic->saved_wr_coeff = bfqq->wr_coeff;
+- bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
+- bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
+- bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
++ if (unlikely(bfq_bfqq_just_created(bfqq) &&
++ !bfq_bfqq_in_large_burst(bfqq))) {
++ /*
++ * bfqq being merged right after being created: bfqq
++ * would have deserved interactive weight raising, but
++ * did not make it to be set in a weight-raised state,
++ * because of this early merge.
Store directly the ++ * weight-raising state that would have been assigned ++ * to bfqq, so that to avoid that bfqq unjustly fails ++ * to enjoy weight raising if split soon. ++ */ ++ bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; ++ bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); ++ bic->saved_last_wr_start_finish = jiffies; ++ } else { ++ bic->saved_wr_coeff = bfqq->wr_coeff; ++ bic->saved_wr_start_at_switch_to_srt = ++ bfqq->wr_start_at_switch_to_srt; ++ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; ++ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; ++ } + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); + } + +@@ -4383,10 +4400,11 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) + new_bfqq->allocated[rq_data_dir(rq)]++; + bfqq->allocated[rq_data_dir(rq)]--; + new_bfqq->ref++; +- bfq_clear_bfqq_just_created(bfqq); + if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) + bfq_merge_bfqqs(bfqd, RQ_BIC(rq), + bfqq, new_bfqq); ++ ++ bfq_clear_bfqq_just_created(bfqq); + /* + * rq is about to be enqueued into new_bfqq, + * release rq reference on bfqq + +From cb05150675095cb97ab22e4955eb82e4fe2e9dbe Mon Sep 17 00:00:00 2001 +From: omcira +Date: Mon, 18 Sep 2017 10:49:48 +0200 +Subject: [PATCH 46/51] bfq-sq, bfq-mq: decrease burst size when queues in + burst exit + +If many queues belonging to the same group happen to be created +shortly after each other, then the concurrent processes associated +with these queues have typically a common goal, and they get it done +as soon as possible if not hampered by device idling. Examples are +processes spawned by git grep, or by systemd during boot. As for +device idling, this mechanism is currently necessary for weight +raising to succeed in its goal: privileging I/O. In view of these +facts, BFQ does not provide the above queues with either weight +raising or device idling. + +On the other hand, a burst of queue creations may be caused also by +the start-up of a complex application. In this case, these queues need +usually to be served one after the other, and as quickly as possible, +to maximise responsiveness. Therefore, in this case the best strategy +is to weight-raise all the queues created during the burst, i.e., the +exact opposite of the strategy for the above case. + +To distinguish between the two cases, BFQ uses an empirical burst-size +threshold, found through extensive tests and monitoring of daily +usage. Only large bursts, i.e., burst with a size above this +threshold, are considered as generated by a high number of parallel +processes. In this respect, upstart-based boot proved to be rather +hard to detect as generating a large burst of queue creations, because +with upstart most of the queues created in a burst exit *before* the +next queues in the same burst are created. To address this issue, I +changed the burst-detection mechanism so as to not decrease the size +of the current burst even if one of the queues in the burst is +eliminated. + +Unfortunately, this missing decrease causes false positives on very +fast systems: on the start-up of a complex application, such as +libreoffice writer, so many queues are created, served and exited +shortly after each other, that a large burst of queue creations is +wrongly detected as occurring. These false positives just disappear if +the size of a burst is decreased when one of the queues in the burst +exits. 
This commit restores the missing burst-size decrease, relying
+on the fact that upstart is apparently unlikely to be used on systems
+running this and future versions of the kernel.
+
+Signed-off-by: Paolo Valente
+Signed-off-by: Mauro Andreolini
+Signed-off-by: Angelo Ruocco
+Tested-by: Mirko Montanari
+---
+ block/bfq-mq-iosched.c | 12 +++---------
+ block/bfq-sq-iosched.c | 12 +++---------
+ 2 files changed, 6 insertions(+), 18 deletions(-)
+
+diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c
+index af84e506e897..6e413d7236ce 100644
+--- a/block/bfq-mq-iosched.c
++++ b/block/bfq-mq-iosched.c
+@@ -4111,16 +4111,10 @@ static void bfq_put_queue(struct bfq_queue *bfqq)
+ BUG_ON(bfqq->entity.tree);
+ BUG_ON(bfq_bfqq_busy(bfqq));
+
+- if (bfq_bfqq_sync(bfqq))
+- /*
+- * The fact that this queue is being destroyed does not
+- * invalidate the fact that this queue may have been
+- * activated during the current burst. As a consequence,
+- * although the queue does not exist anymore, and hence
+- * needs to be removed from the burst list if there,
+- * the burst size has not to be decremented.
+- */
++ if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) {
+ hlist_del_init(&bfqq->burst_list_node);
++ bfqq->bfqd->burst_size--;
++ }
+
+ if (bfqq->bfqd)
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
+diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c
+index 0c48f527fe3f..93034dd7b801 100644
+--- a/block/bfq-sq-iosched.c
++++ b/block/bfq-sq-iosched.c
+@@ -3945,16 +3945,10 @@ static void bfq_put_queue(struct bfq_queue *bfqq)
+ BUG_ON(bfqq->entity.tree);
+ BUG_ON(bfq_bfqq_busy(bfqq));
+
+- if (bfq_bfqq_sync(bfqq))
+- /*
+- * The fact that this queue is being destroyed does not
+- * invalidate the fact that this queue may have been
+- * activated during the current burst. As a consequence,
+- * although the queue does not exist anymore, and hence
+- * needs to be removed from the burst list if there,
+- * the burst size has not to be decremented.
+- */
++ if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) {
+ hlist_del_init(&bfqq->burst_list_node);
++ bfqq->bfqd->burst_size--;
++ }
+
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
+
+
+From 60de7307d5e3ed7f272f12c900f631bdfe114db2 Mon Sep 17 00:00:00 2001
+From: Paolo Valente
+Date: Fri, 6 Oct 2017 19:35:38 +0200
+Subject: [PATCH 47/51] bfq-sq, bfq-mq: fix unbalanced decrements of burst size
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The commit "bfq-sq, bfq-mq: decrease burst size when queues in burst
+exit" introduced the decrement of burst_size on the removal of a
+bfq_queue from the burst list. Unfortunately, this decrement can
+happen to be performed even when burst size is already equal to 0,
+because of unbalanced decrements. A description follows of the cause
+of these unbalanced decrements, namely a wrong assumption, and of the
+way this wrong assumption leads to unbalanced decrements.
+
+The wrong assumption is that a bfq_queue can exit only if the process
+associated with the bfq_queue has exited. This is false, because a
+bfq_queue, say Q, may exit also as a consequence of a merge with
+another bfq_queue. In this case, Q exits because the I/O of its
+associated process has been redirected to another bfq_queue.
+
+The decrement unbalance occurs because Q may then be re-created after
+a split, and added back to the current burst list, *without*
+incrementing burst_size.
burst_size is not incremented because Q is
+not a new bfq_queue added to the burst list, but a bfq_queue only
+temporarily removed from the list, and, before the commit "bfq-sq,
+bfq-mq: decrease burst size when queues in burst exit", burst_size was
+not decremented when Q was removed.
+
+This commit addresses this issue by just checking whether the exiting
+bfq_queue is a merged bfq_queue, and, in that case, not decrementing
+burst_size. Unfortunately, this still leaves room for unbalanced
+decrements, in the following rarer case: on a split, the bfq_queue
+happens to be inserted into a different burst list than the one it was
+removed from when merged. If this happens, the number of elements in
+the new burst list becomes higher than burst_size (by one). When the
+bfq_queue then exits, it is of course not in a merged state any
+longer, thus burst_size is decremented, which results in an unbalanced
+decrement. To handle this sporadic, unlucky case in a simple way,
+this commit also checks that burst_size is larger than 0 before
+decrementing it.
+
+Finally, this commit removes a useless, extra check: the check that
+the bfq_queue is sync, performed before checking whether the bfq_queue
+is in the burst list. This extra check is redundant, because only sync
+bfq_queues can be inserted into the burst list.
+
+Reported-by: Philip Müller
+Signed-off-by: Paolo Valente
+Signed-off-by: Angelo Ruocco
+Tested-by: Philip Müller
+Tested-by: Oleksandr Natalenko
+Tested-by: Lee Tibbert
+---
+ block/bfq-mq-iosched.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++--
+ block/bfq-sq-iosched.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++--
+ 2 files changed, 114 insertions(+), 4 deletions(-)
+
+diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c
+index 6e413d7236ce..816bac6cdd3d 100644
+--- a/block/bfq-mq-iosched.c
++++ b/block/bfq-mq-iosched.c
+@@ -4111,9 +4111,36 @@ static void bfq_put_queue(struct bfq_queue *bfqq)
+ BUG_ON(bfqq->entity.tree);
+ BUG_ON(bfq_bfqq_busy(bfqq));
+
+- if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) {
++ if (!hlist_unhashed(&bfqq->burst_list_node)) {
+ hlist_del_init(&bfqq->burst_list_node);
+- bfqq->bfqd->burst_size--;
++ /*
++ * Decrement also burst size after the removal, if the
++ * process associated with bfqq is exiting, and thus
++ * does not contribute to the burst any longer. This
++ * decrement helps filter out false positives of large
++ * bursts, when some short-lived process (often due to
++ * the execution of commands by some service) happens
++ * to start and exit while a complex application is
++ * starting, and thus spawning several processes that
++ * do I/O (and that *must not* be treated as a large
++ * burst, see comments on bfq_handle_burst).
++ *
++ * In particular, the decrement is performed only if:
++ * 1) bfqq is not a merged queue, because, if it is,
++ * then this free of bfqq is not triggered by the exit
++ * of the process bfqq is associated with, but exactly
++ * by the fact that bfqq has just been merged.
++ * 2) burst_size is greater than 0, to handle
++ * unbalanced decrements. Unbalanced decrements may
++ * happen in the following case: bfqq is inserted into
++ * the current burst list--without incrementing
++ * burst_size--because of a split, but the current
++ * burst list is not the burst list bfqq belonged to
++ * (see comments on the case of a split in
++ * bfq_set_request).
++ */
++ if (bfqq->bic && bfqq->bfqd->burst_size > 0)
++ bfqq->bfqd->burst_size--;
+ }
+
+ if (bfqq->bfqd)
+@@ -4940,6 +4967,34 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
+ "large burst");
+ bfq_clear_bfqq_in_large_burst(bfqq);
+ if (bic->was_in_burst_list)
++ /*
++ * If bfqq was in the current
++ * burst list before being
++ * merged, then we have to add
++ * it back. And we do not need
++ * to increase burst_size, as
++ * we did not decrement
++ * burst_size when we removed
++ * bfqq from the burst list as
++ * a consequence of a merge
++ * (see comments in
++ * bfq_put_queue). In this
++ * respect, it would be rather
++ * costly to know whether the
++ * current burst list is still
++ * the same burst list from
++ * which bfqq was removed on
++ * the merge. To avoid this
++ * cost, if bfqq was in a
++ * burst list, then we add
++ * bfqq to the current burst
++ * list without any further
++ * check. This can cause
++ * inappropriate insertions,
++ * but rarely enough to not
++ * harm the detection of large
++ * bursts significantly.
++ */
+ hlist_add_head(&bfqq->burst_list_node,
+ &bfqd->burst_list);
+ }
+diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c
+index 93034dd7b801..4bbd7f4c0154 100644
+--- a/block/bfq-sq-iosched.c
++++ b/block/bfq-sq-iosched.c
+@@ -3945,9 +3945,36 @@ static void bfq_put_queue(struct bfq_queue *bfqq)
+ BUG_ON(bfqq->entity.tree);
+ BUG_ON(bfq_bfqq_busy(bfqq));
+
+- if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) {
++ if (!hlist_unhashed(&bfqq->burst_list_node)) {
+ hlist_del_init(&bfqq->burst_list_node);
+- bfqq->bfqd->burst_size--;
++ /*
++ * Decrement also burst size after the removal, if the
++ * process associated with bfqq is exiting, and thus
++ * does not contribute to the burst any longer. This
++ * decrement helps filter out false positives of large
++ * bursts, when some short-lived process (often due to
++ * the execution of commands by some service) happens
++ * to start and exit while a complex application is
++ * starting, and thus spawning several processes that
++ * do I/O (and that *must not* be treated as a large
++ * burst, see comments on bfq_handle_burst).
++ *
++ * In particular, the decrement is performed only if:
++ * 1) bfqq is not a merged queue, because, if it is,
++ * then this free of bfqq is not triggered by the exit
++ * of the process bfqq is associated with, but exactly
++ * by the fact that bfqq has just been merged.
++ * 2) burst_size is greater than 0, to handle
++ * unbalanced decrements. Unbalanced decrements may
++ * happen in the following case: bfqq is inserted into
++ * the current burst list--without incrementing
++ * burst_size--because of a split, but the current
++ * burst list is not the burst list bfqq belonged to
++ * (see comments on the case of a split in
++ * bfq_set_request).
++ */
++ if (bfqq->bic && bfqq->bfqd->burst_size > 0)
++ bfqq->bfqd->burst_size--;
+ }
+
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
+@@ -4691,6 +4718,34 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
+ "large burst");
+ bfq_clear_bfqq_in_large_burst(bfqq);
+ if (bic->was_in_burst_list)
++ /*
++ * If bfqq was in the current
++ * burst list before being
++ * merged, then we have to add
++ * it back. And we do not need
++ * to increase burst_size, as
++ * we did not decrement
++ * burst_size when we removed
++ * bfqq from the burst list as
++ * a consequence of a merge
++ * (see comments in
++ * bfq_put_queue).
In this ++ * respect, it would be rather ++ * costly to know whether the ++ * current burst list is still ++ * the same burst list from ++ * which bfqq was removed on ++ * the merge. To avoid this ++ * cost, if bfqq was in a ++ * burst list, then we add ++ * bfqq to the current burst ++ * list without any further ++ * check. This can cause ++ * inappropriate insertions, ++ * but rarely enough to not ++ * harm the detection of large ++ * bursts significantly. ++ */ + hlist_add_head(&bfqq->burst_list_node, + &bfqd->burst_list); + } + +From 09adbd0f46f4ba395964b35bf611b7cc3dd84b4d Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Mon, 30 Oct 2017 16:50:50 +0100 +Subject: [PATCH 48/51] doc, block, bfq-mq: update max IOPS sustainable with + BFQ + +We have investigated more deeply the performance of BFQ, in terms of +number of IOPS that can be processed by the CPU when BFQ is used as +I/O scheduler. In more detail, using the script [1], we have measured +the number of IOPS reached on top of a null block device configured +with zero latency, as a function of the workload (sequential read, +sequential write, random read, random write) and of the system (we +considered desktops, laptops and embedded systems). + +Basing on the resulting figures, with this commit we update the +current, conservative IOPS range reported in BFQ documentation. In +particular, the documentation now reports, for each of three different +systems, the lowest number of IOPS obtained for that system with the +above test (namely, the value obtained with the workload leading to +the lowest IOPS). + +[1] https://github.com/Algodev-github/IOSpeed + +Signed-off-by: Paolo Valente +Signed-off-by: Luca Miccio +--- + Documentation/block/bfq-iosched.txt | 19 +++++++++++++------ + 1 file changed, 13 insertions(+), 6 deletions(-) + +diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt +index dcfe15523da3..595ff7a5ff34 100644 +--- a/Documentation/block/bfq-iosched.txt ++++ b/Documentation/block/bfq-iosched.txt +@@ -29,12 +29,19 @@ for that device, by setting low_latency to 0. See Section 3 for + details on how to configure BFQ for the desired tradeoff between + latency and throughput, or on how to maximize throughput. + +-On average CPUs, the current version of BFQ can handle devices +-performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a +-reference, 30-50 KIOPS correspond to very high bandwidths with +-sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and +-to 120-200 MB/s with 4KB random I/O. BFQ is currently being tested on +-multi-queue devices too. ++BFQ has a non-null overhead, which limits the maximum IOPS that the ++CPU can process for a device scheduled with BFQ. To give an idea of ++the limits on slow or average CPUs, here are BFQ limits for three ++different CPUs, on, respectively, an average laptop, an old desktop, ++and a cheap embedded system, in case full hierarchical support is ++enabled (i.e., CONFIG_BFQ_SQ_GROUP_IOSCHED is set for bfq-sq, or ++CONFIG_MQ_BFQ_GROUP_IOSCHED is set for bfq-mq, or, finally, ++CONFIG_BFQ_GROUP_IOSCHED is set for bfq): ++- Intel i7-4850HQ: 250 KIOPS ++- AMD A8-3850: 170 KIOPS ++- ARM CortexTM-A53 Octa-core: 45 KIOPS ++ ++BFQ works for multi-queue devices too (bfq and bfq-mq instances). + + The table of contents follows. Impatients can just jump to Section 3. 
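+
+For scale, applying the same 4 KB random-I/O conversion used in the
+replaced text above: 250 KIOPS corresponds to roughly 1 GB/s
+(250000 * 4 KB), and 45 KIOPS to roughly 180 MB/s. This conversion is
+merely illustrative; the measured limits depend on the workload and
+system, as the commit message explains.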
+ + +From be94f97b577dc587593185224a7718aa59ac43f7 Mon Sep 17 00:00:00 2001 +From: Luca Miccio +Date: Tue, 31 Oct 2017 09:50:11 +0100 +Subject: [PATCH 49/51] block, bfq-mq: add missing invocations of + bfqg_stats_update_io_add/remove + +bfqg_stats_update_io_add and bfqg_stats_update_io_remove are to be +invoked, respectively, when an I/O request enters and when an I/O +request exits the scheduler. Unfortunately, bfq-mq does not fully comply +with this scheme, because it does not invoke these functions for +requests that are inserted into or extracted from its priority +dispatch list. This commit fixes this mistake. + +Signed-off-by: Paolo Valente +Signed-off-by: Luca Miccio +--- + block/bfq-mq-iosched.c | 24 +++++++++++++++++++----- + 1 file changed, 19 insertions(+), 5 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 816bac6cdd3d..fbf28804c220 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -1394,7 +1394,6 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + + BUG_ON(bfqq == bfqd->in_service_queue); +- bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); + + /* + * bfqq deserves to be weight-raised if: +@@ -1734,7 +1733,6 @@ static void bfq_remove_request(struct request_queue *q, + BUG_ON(bfqq->meta_pending == 0); + bfqq->meta_pending--; + } +- bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); + } + + static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) +@@ -1879,6 +1877,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, + bfqq->next_rq = rq; + + bfq_remove_request(q, next); ++ bfqg_stats_update_io_remove(bfqq_group(bfqq), next->cmd_flags); + + spin_unlock_irq(&bfqq->bfqd->lock); + end: +@@ -4077,6 +4076,10 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + spin_lock_irq(&bfqd->lock); + + rq = __bfq_dispatch_request(hctx); ++ if (rq && RQ_BFQQ(rq)) ++ bfqg_stats_update_io_remove(bfqq_group(RQ_BFQQ(rq)), ++ rq->cmd_flags); ++ + spin_unlock_irq(&bfqd->lock); + + return rq; +@@ -4634,6 +4637,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + { + struct request_queue *q = hctx->queue; + struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); + + spin_lock_irq(&bfqd->lock); + if (blk_mq_sched_try_insert_merge(q, rq)) { +@@ -4647,8 +4651,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + + spin_lock_irq(&bfqd->lock); + if (at_head || blk_rq_is_passthrough(rq)) { +- struct bfq_queue *bfqq = RQ_BFQQ(rq); +- + if (at_head) + list_add(&rq->queuelist, &bfqd->dispatch); + else +@@ -4668,6 +4670,12 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + rq->rq_flags &= ~RQF_GOT; + + __bfq_insert_request(bfqd, rq); ++ /* ++ * Update bfqq, because, if a queue merge has occurred ++ * in __bfq_insert_request, then rq has been ++ * redirected into a new queue. 
++ */
++ bfqq = RQ_BFQQ(rq);
+
+ if (rq_mergeable(rq)) {
+ elv_rqhash_add(q, rq);
+@@ -4676,6 +4684,9 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+ }
+ }
+
++ if (bfqq)
++ bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, rq->cmd_flags);
++
+ spin_unlock_irq(&bfqd->lock);
+ }
+
+@@ -4893,8 +4904,11 @@ static void bfq_finish_request(struct request *rq)
+ BUG_ON(in_interrupt());
+
+ assert_spin_locked(&bfqd->lock);
+- if (!RB_EMPTY_NODE(&rq->rb_node))
++ if (!RB_EMPTY_NODE(&rq->rb_node)) {
+ bfq_remove_request(rq->q, rq);
++ bfqg_stats_update_io_remove(bfqq_group(bfqq),
++ rq->cmd_flags);
++ }
+ bfq_put_rq_priv_body(bfqq);
+ }
+
+
+From 8659a1549d2bf241129a0f7c90429bddd9c2bc53 Mon Sep 17 00:00:00 2001
+From: Paolo Valente
+Date: Wed, 8 Nov 2017 19:07:40 +0100
+Subject: [PATCH 50/51] block, bfq-mq: update blkio stats outside the scheduler
+ lock
+
+bfq-mq invokes various blkg_*stats_* functions to update the statistics
+contained in the special files blkio.bfq-mq.* in the blkio controller
+groups, i.e., the I/O accounting related to the proportional-share
+policy provided by bfq-mq. The execution of these functions takes a
+considerable percentage, about 40%, of the total per-request execution
+time of bfq-mq (i.e., of the sum of the execution time of all the bfq-mq
+functions that have to be executed to process an I/O request from its
+creation to its destruction). This reduces the request-processing
+rate sustainable by bfq-mq noticeably, even on a multicore CPU. In fact,
+the bfq-mq functions that invoke blkg_*stats_* functions cannot be
+executed in parallel with the rest of the code of bfq-mq, because
+both are executed under the same per-device scheduler lock.
+
+To reduce this slowdown, this commit moves, wherever possible, the
+invocation of these functions (more precisely, of the bfq-mq functions
+that invoke blkg_*stats_* functions) outside the critical sections
+protected by the scheduler lock.
+
+With this change, and with all blkio.bfq-mq.* statistics enabled, the
+throughput grows, e.g., from 250 to 310 KIOPS (+25%) on an Intel
+i7-4850HQ, in case of 8 threads doing random I/O in parallel on
+null_blk, with the latter configured with 0 latency. We obtained the
+same or higher throughput boosts, up to +30%, with other processors
+(some figures are reported in the documentation). For our tests, we
+used the script [1], with which our results can be easily reproduced.
+
+NOTE. This commit still protects the invocation of blkg_*stats_*
+functions with the request_queue lock, because the group these
+functions are invoked on may otherwise disappear before or while these
+functions are executed. Fortunately, tests without even this lock
+show, by difference, that the serialization caused by this lock has
+little impact (at most ~5% of throughput reduction).
+
+[1] https://github.com/Algodev-github/IOSpeed
+
+Signed-off-by: Paolo Valente
+Signed-off-by: Luca Miccio
+---
+ Documentation/block/bfq-iosched.txt | 18 ++++--
+ block/bfq-mq-iosched.c | 112 +++++++++++++++++++++++++++++++-----
+ block/bfq-sched.c | 2 +
+ 3 files changed, 112 insertions(+), 20 deletions(-)
+
+diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
+index 595ff7a5ff34..c816c595082d 100644
+--- a/Documentation/block/bfq-iosched.txt
++++ b/Documentation/block/bfq-iosched.txt
+@@ -31,16 +31,22 @@ latency and throughput, or on how to maximize throughput.
+ + BFQ has a non-null overhead, which limits the maximum IOPS that the + CPU can process for a device scheduled with BFQ. To give an idea of +-the limits on slow or average CPUs, here are BFQ limits for three +-different CPUs, on, respectively, an average laptop, an old desktop, +-and a cheap embedded system, in case full hierarchical support is +-enabled (i.e., CONFIG_BFQ_SQ_GROUP_IOSCHED is set for bfq-sq, or +-CONFIG_MQ_BFQ_GROUP_IOSCHED is set for bfq-mq, or, finally, +-CONFIG_BFQ_GROUP_IOSCHED is set for bfq): ++the limits on slow or average CPUs, here are, first, the limits of ++bfq-sq for three different CPUs, on, respectively, an average laptop, ++an old desktop, and a cheap embedded system, in case full hierarchical ++support is enabled (i.e., CONFIG_BFQ_SQ_GROUP_IOSCHED is set): + - Intel i7-4850HQ: 250 KIOPS + - AMD A8-3850: 170 KIOPS + - ARM CortexTM-A53 Octa-core: 45 KIOPS + ++bfq-mq and bfq instances reach, instead, a higher sustainable ++throughput. Their limits, on the same systems as above, are, with full ++hierarchical support enabled (i.e., CONFIG_MQ_BFQ_GROUP_IOSCHED set ++for bfq-mq, or CONFIG_BFQ_GROUP_IOSCHED set for bfq): ++- Intel i7-4850HQ: 310 KIOPS ++- AMD A8-3850: 200 KIOPS ++- ARM CortexTM-A53 Octa-core: 56 KIOPS ++ + BFQ works for multi-queue devices too (bfq and bfq-mq instances). + + The table of contents follows. Impatients can just jump to Section 3. +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index fbf28804c220..ab3b83d612c2 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -1822,7 +1822,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, + bfqq->next_rq = next_rq; + + bfq_log_bfqq(bfqd, bfqq, +- "requests_merged: req %p prev %p next_rq %p bfqq %p", ++ "request_merged: req %p prev %p next_rq %p bfqq %p", + req, prev, next_rq, bfqq); + + /* +@@ -2415,7 +2415,6 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, + struct bfq_queue *bfqq) + { + if (bfqq) { +- bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); + bfq_clear_bfqq_fifo_expire(bfqq); + + bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; +@@ -3784,7 +3783,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) + */ + bfq_clear_bfqq_wait_request(bfqq); + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); +- bfqg_stats_update_idle_time(bfqq_group(bfqq)); + } + goto keep_queue; + } +@@ -4072,16 +4070,67 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + { + struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + struct request *rq; ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ struct bfq_queue *in_serv_queue, *bfqq; ++ bool waiting_rq, idle_timer_disabled; ++#endif + + spin_lock_irq(&bfqd->lock); + ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ in_serv_queue = bfqd->in_service_queue; ++ waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); ++ + rq = __bfq_dispatch_request(hctx); +- if (rq && RQ_BFQQ(rq)) +- bfqg_stats_update_io_remove(bfqq_group(RQ_BFQQ(rq)), +- rq->cmd_flags); + ++ idle_timer_disabled = ++ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); ++ ++#else ++ rq = __bfq_dispatch_request(hctx); ++#endif + spin_unlock_irq(&bfqd->lock); + ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ bfqq = rq ? RQ_BFQQ(rq) : NULL; ++ if (!idle_timer_disabled && !bfqq) ++ return rq; ++ ++ /* ++ * rq and bfqq are guaranteed to exist until this function ++ * ends, for the following reasons. 
First, rq can be ++ * dispatched to the device, and then can be completed and ++ * freed, only after this function ends. Second, rq cannot be ++ * merged (and thus freed because of a merge) any longer, ++ * because it has already started. Thus rq cannot be freed ++ * before this function ends, and, since rq has a reference to ++ * bfqq, the same guarantee holds for bfqq too. ++ * ++ * In addition, the following queue lock guarantees that ++ * bfqq_group(bfqq) exists as well. ++ */ ++ spin_lock_irq(hctx->queue->queue_lock); ++ if (idle_timer_disabled) ++ /* ++ * Since the idle timer has been disabled, ++ * in_serv_queue contained some request when ++ * __bfq_dispatch_request was invoked above, which ++ * implies that rq was picked exactly from ++ * in_serv_queue. Thus in_serv_queue == bfqq, and is ++ * therefore guaranteed to exist because of the above ++ * arguments. ++ */ ++ bfqg_stats_update_idle_time(bfqq_group(in_serv_queue)); ++ if (bfqq) { ++ struct bfq_group *bfqg = bfqq_group(bfqq); ++ ++ bfqg_stats_update_avg_queue_size(bfqg); ++ bfqg_stats_set_start_empty_time(bfqg); ++ bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); ++ } ++ spin_unlock_irq(hctx->queue->queue_lock); ++#endif ++ + return rq; + } + +@@ -4200,7 +4249,6 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) + unsigned long flags; + + spin_lock_irqsave(&bfqd->lock, flags); +- + bfq_exit_bfqq(bfqd, bfqq); + bic_set_bfqq(bic, NULL, is_sync); + spin_unlock_irqrestore(&bfqd->lock, flags); +@@ -4554,7 +4602,6 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + */ + bfq_clear_bfqq_wait_request(bfqq); + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); +- bfqg_stats_update_idle_time(bfqq_group(bfqq)); + + /* + * The queue is not empty, because a new request just +@@ -4569,9 +4616,11 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + } + } + +-static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) ++/* returns true if it causes the idle timer to be disabled */ ++static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) + { + struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; ++ bool waiting, idle_timer_disabled = false; + BUG_ON(!bfqq); + + assert_spin_locked(&bfqd->lock); +@@ -4624,12 +4673,16 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) + } + } + ++ waiting = bfqq && bfq_bfqq_wait_request(bfqq); + bfq_add_request(rq); ++ idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); + + rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; + list_add_tail(&rq->queuelist, &bfqq->fifo); + + bfq_rq_enqueued(bfqd, bfqq, rq); ++ ++ return idle_timer_disabled; + } + + static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, +@@ -4638,6 +4691,10 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + struct request_queue *q = hctx->queue; + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq = RQ_BFQQ(rq); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ bool idle_timer_disabled = false; ++ unsigned int cmd_flags; ++#endif + + spin_lock_irq(&bfqd->lock); + if (blk_mq_sched_try_insert_merge(q, rq)) { +@@ -4669,13 +4726,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + BUG_ON(!(rq->rq_flags & RQF_GOT)); + rq->rq_flags &= ~RQF_GOT; + +- __bfq_insert_request(bfqd, rq); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ idle_timer_disabled = __bfq_insert_request(bfqd, rq); + /* + * Update 
bfqq, because, if a queue merge has occurred
+ * in __bfq_insert_request, then rq has been
+ * redirected into a new queue.
+ */
+ bfqq = RQ_BFQQ(rq);
++#else
++ __bfq_insert_request(bfqd, rq);
++#endif
+
+ if (rq_mergeable(rq)) {
+ elv_rqhash_add(q, rq);
+@@ -4683,11 +4744,34 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+ q->last_merge = rq;
+ }
+ }
+-
+- if (bfqq)
+- bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, rq->cmd_flags);
+-
++#ifdef BFQ_GROUP_IOSCHED_ENABLED
++ /*
++ * Cache cmd_flags before releasing scheduler lock, because rq
++ * may disappear afterwards (for example, because of a request
++ * merge).
++ */
++ cmd_flags = rq->cmd_flags;
++#endif
+ spin_unlock_irq(&bfqd->lock);
++#ifdef BFQ_GROUP_IOSCHED_ENABLED
++ if (!bfqq)
++ return;
++ /*
++ * bfqq still exists, because it can disappear only after
++ * either it is merged with another queue, or the process it
++ * is associated with exits. But both actions must be taken by
++ * the same process currently executing this flow of
++ * instruction.
++ *
++ * In addition, the following queue lock guarantees that
++ * bfqq_group(bfqq) exists as well.
++ */
++ spin_lock_irq(q->queue_lock);
++ bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
++ if (idle_timer_disabled)
++ bfqg_stats_update_idle_time(bfqq_group(bfqq));
++ spin_unlock_irq(q->queue_lock);
++#endif
+ }
+
+ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
+diff --git a/block/bfq-sched.c b/block/bfq-sched.c
+index e4a2553a2d2c..616c0692335a 100644
+--- a/block/bfq-sched.c
++++ b/block/bfq-sched.c
+@@ -949,9 +949,11 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
+ st->vtime += bfq_delta(served, st->wsum);
+ bfq_forget_idle(st);
+ }
++#ifndef BFQ_MQ
+ #ifdef BFQ_GROUP_IOSCHED_ENABLED
+ bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
+ #endif
++#endif
+ st = bfq_entity_service_tree(&bfqq->entity);
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p",
+ served, ((st->vtime>>10)*1000)>>12, st);
+
+From abdfb33a3325df55ec0261fd824ca61ddac13575 Mon Sep 17 00:00:00 2001
+From: Luca Miccio
+Date: Wed, 8 Nov 2017 19:07:41 +0100
+Subject: [PATCH 51/51] block, bfq-sq, bfq-mq: move debug blkio stats behind
+ CONFIG_DEBUG_BLK_CGROUP
+
+BFQ (both bfq-mq and bfq-sq) currently creates, and updates, its own
+instance of the whole set of blkio statistics that cfq creates. Yet,
+from the comments of Tejun Heo in [1], it turned out that most of
+these statistics are meant/useful only for debugging. This commit
+makes BFQ create the latter, debugging statistics only if the option
+CONFIG_DEBUG_BLK_CGROUP is set.
+
+By doing so, this commit also enables BFQ to enjoy a high performance
+boost. The reason is that, if CONFIG_DEBUG_BLK_CGROUP is not set, then
+BFQ has to update far fewer statistics, and, in particular, not the
+heaviest to update. To give an idea of the benefits, if
+CONFIG_DEBUG_BLK_CGROUP is not set, then, on an Intel i7-4850HQ, and
+with 8 threads doing random I/O in parallel on null_blk (configured
+with 0 latency), the throughput of bfq-mq grows from 310 to 400 KIOPS
+(+30%). We have measured similar or even much higher boosts with other
+CPUs: e.g., +45% with an ARM CortexTM-A53 Octa-core. Our results have
+been obtained and can be reproduced very easily with the script in [1].
+ +[1] https://www.spinics.net/lists/linux-block/msg18943.html + +Reported-by: Tejun Heo +Signed-off-by: Luca Miccio +Signed-off-by: Paolo Valente +--- + Documentation/block/bfq-iosched.txt | 59 ++++++++++--- + block/bfq-cgroup-included.c | 163 ++++++++++++++++++++---------------- + block/bfq-mq-iosched.c | 14 ++-- + block/bfq-mq.h | 4 +- + block/bfq.h | 4 +- + 5 files changed, 147 insertions(+), 97 deletions(-) + +diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt +index c816c595082d..30ef2dba85ad 100644 +--- a/Documentation/block/bfq-iosched.txt ++++ b/Documentation/block/bfq-iosched.txt +@@ -29,24 +29,41 @@ for that device, by setting low_latency to 0. See Section 3 for + details on how to configure BFQ for the desired tradeoff between + latency and throughput, or on how to maximize throughput. + +-BFQ has a non-null overhead, which limits the maximum IOPS that the +-CPU can process for a device scheduled with BFQ. To give an idea of +-the limits on slow or average CPUs, here are, first, the limits of +-bfq-sq for three different CPUs, on, respectively, an average laptop, ++BFQ has a non-null overhead, which limits the maximum IOPS that a CPU ++can process for a device scheduled with BFQ. To give an idea of the ++limits on slow or average CPUs, here are, first, the limits of bfq-mq ++and bfq for three different CPUs, on, respectively, an average laptop, + an old desktop, and a cheap embedded system, in case full hierarchical +-support is enabled (i.e., CONFIG_BFQ_SQ_GROUP_IOSCHED is set): +-- Intel i7-4850HQ: 250 KIOPS +-- AMD A8-3850: 170 KIOPS +-- ARM CortexTM-A53 Octa-core: 45 KIOPS +- +-bfq-mq and bfq instances reach, instead, a higher sustainable +-throughput. Their limits, on the same systems as above, are, with full +-hierarchical support enabled (i.e., CONFIG_MQ_BFQ_GROUP_IOSCHED set +-for bfq-mq, or CONFIG_BFQ_GROUP_IOSCHED set for bfq): ++support is enabled (i.e., CONFIG_MQ_BFQ_GROUP_IOSCHED is set for ++bfq-mq, or CONFIG_BFQ_GROUP_IOSCHED is set for bfq), but ++CONFIG_DEBUG_BLK_CGROUP is not set (Section 4-2): ++- Intel i7-4850HQ: 400 KIOPS ++- AMD A8-3850: 250 KIOPS ++- ARM CortexTM-A53 Octa-core: 80 KIOPS ++ ++As for bfq-sq, it cannot reach the above IOPS, because of the ++inherent, lower parallelism of legacy blk and of the components within ++it (including bfq-sq itself). In particular, results with ++CONFIG_DEBUG_BLK_CGROUP unset are rather fluctuating. The limits ++reported below for the case CONFIG_DEBUG_BLK_CGROUP set will however ++provide a lower bound to the limits of bfq-sq. ++ ++Turning back to bfq-mq and bfq, If CONFIG_DEBUG_BLK_CGROUP is set (and ++of course full hierarchical support is enabled), then the sustainable ++throughput with bfq-mq and bfq decreases, because all blkio.bfq* ++statistics are created and updated (Section 4-2). For bfq-mq and bfq, ++this leads to the following maximum sustainable throughputs, on the ++same systems as above: + - Intel i7-4850HQ: 310 KIOPS + - AMD A8-3850: 200 KIOPS + - ARM CortexTM-A53 Octa-core: 56 KIOPS + ++Finally, if CONFIG_DEBUG_BLK_CGROUP is set (and full hierarchical ++support is enabled), then bfq-sq exhibits the following limits: ++- Intel i7-4850HQ: 250 KIOPS ++- AMD A8-3850: 170 KIOPS ++- ARM CortexTM-A53 Octa-core: 45 KIOPS ++ + BFQ works for multi-queue devices too (bfq and bfq-mq instances). + + The table of contents follows. Impatients can just jump to Section 3. +@@ -524,6 +541,22 @@ BFQ-specific files is "blkio.bfqX." 
or "io.bfqX.", where X can be "" + to set the weight of a group with the mainline BFQ is blkio.bfq.weight + or io.bfq.weight. + ++As for cgroups-v1 (blkio controller), the exact set of stat files ++created, and kept up-to-date by bfq*, depends on whether ++CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq* creates all ++the stat files documented in ++Documentation/cgroup-v1/blkio-controller.txt. If, instead, ++CONFIG_DEBUG_BLK_CGROUP is not set, then bfq* creates only the files ++blkio.bfq*.io_service_bytes ++blkio.bfq*.io_service_bytes_recursive ++blkio.bfq*.io_serviced ++blkio.bfq*.io_serviced_recursive ++ ++The value of CONFIG_DEBUG_BLK_CGROUP greatly influences the maximum ++throughput sustainable with bfq*, because updating the blkio.bfq* ++stats is rather costly, especially for some of the stats enabled by ++CONFIG_DEBUG_BLK_CGROUP. ++ + Parameters to set + ----------------- + +diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c +index 631e53d9150d..562b0ce581a7 100644 +--- a/block/bfq-cgroup-included.c ++++ b/block/bfq-cgroup-included.c +@@ -15,7 +15,7 @@ + * file. + */ + +-#ifdef BFQ_GROUP_IOSCHED_ENABLED ++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) + + /* bfqg stats flags */ + enum bfqg_stats_flags { +@@ -155,6 +155,63 @@ static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) + bfqg_stats_update_group_wait_time(stats); + } + ++static void bfqg_stats_update_io_add(struct bfq_group *bfqg, ++ struct bfq_queue *bfqq, unsigned int op) ++{ ++ blkg_rwstat_add(&bfqg->stats.queued, op, 1); ++ bfqg_stats_end_empty_time(&bfqg->stats); ++ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) ++ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); ++} ++ ++static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) ++{ ++ blkg_rwstat_add(&bfqg->stats.queued, op, -1); ++} ++ ++static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) ++{ ++ blkg_rwstat_add(&bfqg->stats.merged, op, 1); ++} ++ ++static void bfqg_stats_update_completion(struct bfq_group *bfqg, ++ uint64_t start_time, uint64_t io_start_time, unsigned int op) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ unsigned long long now = sched_clock(); ++ ++ if (time_after64(now, io_start_time)) ++ blkg_rwstat_add(&stats->service_time, op, ++ now - io_start_time); ++ if (time_after64(io_start_time, start_time)) ++ blkg_rwstat_add(&stats->wait_time, op, ++ io_start_time - start_time); ++} ++ ++#else /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ ++ ++static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, ++ struct bfq_queue *bfqq, unsigned int op) { } ++static inline void ++bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } ++static inline void ++bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } ++static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, ++ uint64_t start_time, uint64_t io_start_time, ++ unsigned int op) { } ++static inline void ++bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, ++ struct bfq_group *curr_bfqg) { } ++static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } ++static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } ++static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } ++static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } ++static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { 
} ++static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } ++ ++#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + static struct blkcg_policy blkcg_policy_bfq; + + /* +@@ -247,44 +304,10 @@ static void bfqg_and_blkg_put(struct bfq_group *bfqg) + } + #endif + +-static void bfqg_stats_update_io_add(struct bfq_group *bfqg, +- struct bfq_queue *bfqq, +- unsigned int op) +-{ +- blkg_rwstat_add(&bfqg->stats.queued, op, 1); +- bfqg_stats_end_empty_time(&bfqg->stats); +- if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) +- bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); +-} +- +-static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) +-{ +- blkg_rwstat_add(&bfqg->stats.queued, op, -1); +-} +- +-static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) +-{ +- blkg_rwstat_add(&bfqg->stats.merged, op, 1); +-} +- +-static void bfqg_stats_update_completion(struct bfq_group *bfqg, +- uint64_t start_time, uint64_t io_start_time, +- unsigned int op) +-{ +- struct bfqg_stats *stats = &bfqg->stats; +- unsigned long long now = sched_clock(); +- +- if (time_after64(now, io_start_time)) +- blkg_rwstat_add(&stats->service_time, op, +- now - io_start_time); +- if (time_after64(io_start_time, start_time)) +- blkg_rwstat_add(&stats->wait_time, op, +- io_start_time - start_time); +-} +- + /* @stats = 0 */ + static void bfqg_stats_reset(struct bfqg_stats *stats) + { ++#ifdef CONFIG_DEBUG_BLK_CGROUP + /* queued stats shouldn't be cleared */ + blkg_rwstat_reset(&stats->merged); + blkg_rwstat_reset(&stats->service_time); +@@ -296,6 +319,7 @@ static void bfqg_stats_reset(struct bfqg_stats *stats) + blkg_stat_reset(&stats->group_wait_time); + blkg_stat_reset(&stats->idle_time); + blkg_stat_reset(&stats->empty_time); ++#endif + } + + /* @to += @from */ +@@ -304,6 +328,7 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) + if (!to || !from) + return; + ++#ifdef CONFIG_DEBUG_BLK_CGROUP + /* queued stats shouldn't be cleared */ + blkg_rwstat_add_aux(&to->merged, &from->merged); + blkg_rwstat_add_aux(&to->service_time, &from->service_time); +@@ -316,6 +341,7 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) + blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); + blkg_stat_add_aux(&to->idle_time, &from->idle_time); + blkg_stat_add_aux(&to->empty_time, &from->empty_time); ++#endif + } + + /* +@@ -367,6 +393,7 @@ static void bfq_init_entity(struct bfq_entity *entity, + + static void bfqg_stats_exit(struct bfqg_stats *stats) + { ++#ifdef CONFIG_DEBUG_BLK_CGROUP + blkg_rwstat_exit(&stats->merged); + blkg_rwstat_exit(&stats->service_time); + blkg_rwstat_exit(&stats->wait_time); +@@ -378,10 +405,12 @@ static void bfqg_stats_exit(struct bfqg_stats *stats) + blkg_stat_exit(&stats->group_wait_time); + blkg_stat_exit(&stats->idle_time); + blkg_stat_exit(&stats->empty_time); ++#endif + } + + static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) + { ++#ifdef CONFIG_DEBUG_BLK_CGROUP + if (blkg_rwstat_init(&stats->merged, gfp) || + blkg_rwstat_init(&stats->service_time, gfp) || + blkg_rwstat_init(&stats->wait_time, gfp) || +@@ -396,6 +425,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) + bfqg_stats_exit(stats); + return -ENOMEM; + } ++#endif + + return 0; + } +@@ -1003,6 +1033,7 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, + return 
bfq_io_set_weight_legacy(of_css(of), NULL, weight); + } + ++#ifdef CONFIG_DEBUG_BLK_CGROUP + static int bfqg_print_stat(struct seq_file *sf, void *v) + { + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, +@@ -1108,6 +1139,7 @@ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) + 0, false); + return 0; + } ++#endif /* CONFIG_DEBUG_BLK_CGROUP */ + + static struct bfq_group * + bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) +@@ -1137,15 +1169,6 @@ static struct cftype bfq_blkcg_legacy_files[] = { + + /* statistics, covers only the tasks in the bfqg */ + { +- .name = BFQ_CGROUP_FNAME(time), +- .private = offsetof(struct bfq_group, stats.time), +- .seq_show = bfqg_print_stat, +- }, +- { +- .name = BFQ_CGROUP_FNAME(sectors), +- .seq_show = bfqg_print_stat_sectors, +- }, +- { + .name = BFQ_CGROUP_FNAME(io_service_bytes), + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_bytes, +@@ -1155,6 +1178,16 @@ static struct cftype bfq_blkcg_legacy_files[] = { + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_ios, + }, ++#ifdef CONFIG_DEBUG_BLK_CGROUP ++ { ++ .name = BFQ_CGROUP_FNAME(time), ++ .private = offsetof(struct bfq_group, stats.time), ++ .seq_show = bfqg_print_stat, ++ }, ++ { ++ .name = BFQ_CGROUP_FNAME(sectors), ++ .seq_show = bfqg_print_stat_sectors, ++ }, + { + .name = BFQ_CGROUP_FNAME(io_service_time), + .private = offsetof(struct bfq_group, stats.service_time), +@@ -1175,18 +1208,10 @@ static struct cftype bfq_blkcg_legacy_files[] = { + .private = offsetof(struct bfq_group, stats.queued), + .seq_show = bfqg_print_rwstat, + }, ++#endif /* CONFIG_DEBUG_BLK_CGROUP */ + + /* the same statictics which cover the bfqg and its descendants */ + { +- .name = BFQ_CGROUP_FNAME(time_recursive), +- .private = offsetof(struct bfq_group, stats.time), +- .seq_show = bfqg_print_stat_recursive, +- }, +- { +- .name = BFQ_CGROUP_FNAME(sectors_recursive), +- .seq_show = bfqg_print_stat_sectors_recursive, +- }, +- { + .name = BFQ_CGROUP_FNAME(io_service_bytes_recursive), + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_bytes_recursive, +@@ -1196,6 +1221,16 @@ static struct cftype bfq_blkcg_legacy_files[] = { + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_ios_recursive, + }, ++#ifdef CONFIG_DEBUG_BLK_CGROUP ++ { ++ .name = BFQ_CGROUP_FNAME(time_recursive), ++ .private = offsetof(struct bfq_group, stats.time), ++ .seq_show = bfqg_print_stat_recursive, ++ }, ++ { ++ .name = BFQ_CGROUP_FNAME(sectors_recursive), ++ .seq_show = bfqg_print_stat_sectors_recursive, ++ }, + { + .name = BFQ_CGROUP_FNAME(io_service_time_recursive), + .private = offsetof(struct bfq_group, stats.service_time), +@@ -1240,6 +1275,7 @@ static struct cftype bfq_blkcg_legacy_files[] = { + .private = offsetof(struct bfq_group, stats.dequeue), + .seq_show = bfqg_print_stat, + }, ++#endif /* CONFIG_DEBUG_BLK_CGROUP */ + { } /* terminate */ + }; + +@@ -1257,25 +1293,6 @@ static struct cftype bfq_blkg_files[] = { + + #else /* BFQ_GROUP_IOSCHED_ENABLED */ + +-static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, +- struct bfq_queue *bfqq, unsigned int op) { } +-static inline void +-bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } +-static inline void +-bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } +-static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, +- uint64_t start_time, uint64_t io_start_time, +- unsigned int 
op) { } +-static inline void +-bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, +- struct bfq_group *curr_bfqg) { } +-static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } +-static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } +-static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } +-static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } +-static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } +-static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } +- + static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_group *bfqg) {} + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index ab3b83d612c2..0c09609a6099 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -4070,14 +4070,14 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + { + struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + struct request *rq; +-#ifdef BFQ_GROUP_IOSCHED_ENABLED ++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) + struct bfq_queue *in_serv_queue, *bfqq; + bool waiting_rq, idle_timer_disabled; + #endif + + spin_lock_irq(&bfqd->lock); + +-#ifdef BFQ_GROUP_IOSCHED_ENABLED ++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) + in_serv_queue = bfqd->in_service_queue; + waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); + +@@ -4091,7 +4091,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + #endif + spin_unlock_irq(&bfqd->lock); + +-#ifdef BFQ_GROUP_IOSCHED_ENABLED ++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) + bfqq = rq ? 
RQ_BFQQ(rq) : NULL; + if (!idle_timer_disabled && !bfqq) + return rq; +@@ -4691,7 +4691,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + struct request_queue *q = hctx->queue; + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq = RQ_BFQQ(rq); +-#ifdef BFQ_GROUP_IOSCHED_ENABLED ++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) + bool idle_timer_disabled = false; + unsigned int cmd_flags; + #endif +@@ -4726,7 +4726,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + BUG_ON(!(rq->rq_flags & RQF_GOT)); + rq->rq_flags &= ~RQF_GOT; + +-#ifdef BFQ_GROUP_IOSCHED_ENABLED ++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) + idle_timer_disabled = __bfq_insert_request(bfqd, rq); + /* + * Update bfqq, because, if a queue merge has occurred +@@ -4744,7 +4744,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + q->last_merge = rq; + } + } +-#ifdef BFQ_GROUP_IOSCHED_ENABLED ++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) + /* + * Cache cmd_flags before releasing scheduler lock, because rq + * may disappear afterwards (for example, because of a request +@@ -4753,7 +4753,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + cmd_flags = rq->cmd_flags; + #endif + spin_unlock_irq(&bfqd->lock); +-#ifdef BFQ_GROUP_IOSCHED_ENABLED ++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) + if (!bfqq) + return; + /* +diff --git a/block/bfq-mq.h b/block/bfq-mq.h +index 7ed2cc29be57..1cb05bb853d2 100644 +--- a/block/bfq-mq.h ++++ b/block/bfq-mq.h +@@ -784,7 +784,7 @@ enum bfqq_expiration { + + + struct bfqg_stats { +-#ifdef BFQ_GROUP_IOSCHED_ENABLED ++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) + /* number of ios merged */ + struct blkg_rwstat merged; + /* total time spent on device in ns, may not be accurate w/ queueing */ +@@ -812,7 +812,7 @@ struct bfqg_stats { + uint64_t start_idle_time; + uint64_t start_empty_time; + uint16_t flags; +-#endif ++#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ + }; + + #ifdef BFQ_GROUP_IOSCHED_ENABLED +diff --git a/block/bfq.h b/block/bfq.h +index 15d326f466b7..47cd4d5a8c32 100644 +--- a/block/bfq.h ++++ b/block/bfq.h +@@ -791,7 +791,7 @@ enum bfqq_expiration { + + + struct bfqg_stats { +-#ifdef BFQ_GROUP_IOSCHED_ENABLED ++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) + /* number of ios merged */ + struct blkg_rwstat merged; + /* total time spent on device in ns, may not be accurate w/ queueing */ +@@ -819,7 +819,7 @@ struct bfqg_stats { + uint64_t start_idle_time; + uint64_t start_empty_time; + uint16_t flags; +-#endif ++#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ + }; + + #ifdef BFQ_GROUP_IOSCHED_ENABLED diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-0001-MuQSS-version-0.162-CPU-scheduler-linux-hardened.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-0001-MuQSS-version-0.162-CPU-scheduler-linux-hardened.patch new file mode 100644 index 00000000..a81dbeac --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-0001-MuQSS-version-0.162-CPU-scheduler-linux-hardened.patch @@ -0,0 +1,9560 @@ +diff -Nur a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +--- a/arch/powerpc/platforms/cell/spufs/sched.c 2018-12-21 13:13:19.000000000 +0000 ++++ 
b/arch/powerpc/platforms/cell/spufs/sched.c 2019-01-05 20:22:51.089998199 +0000 +@@ -65,11 +65,6 @@ + static struct timer_list spuloadavg_timer; + + /* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- +-/* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. + */ +diff -Nur a/arch/x86/Kconfig b/arch/x86/Kconfig +--- a/arch/x86/Kconfig 2019-01-05 20:17:13.829237906 +0000 ++++ b/arch/x86/Kconfig 2019-01-05 20:30:14.244135060 +0000 +@@ -957,6 +957,20 @@ + config SCHED_SMT + def_bool y if SMP + ++config SMT_NICE ++ bool "SMT (Hyperthreading) aware nice priority and policy support" ++ depends on SCHED_MUQSS && SCHED_SMT ++ default y ++ ---help--- ++ Enabling Hyperthreading on Intel CPUs decreases the effectiveness ++ of the use of 'nice' levels and different scheduling policies ++ (e.g. realtime) due to sharing of CPU power between hyperthreads. ++ SMT nice support makes each logical CPU aware of what is running on ++ its hyperthread siblings, maintaining appropriate distribution of ++ CPU according to nice levels and scheduling policies at the expense ++ of slightly increased overhead. ++ If unsure say Y here. ++ + config SCHED_MC + def_bool y + prompt "Multi-core scheduler support" +diff -Nur a/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt +--- a/Documentation/scheduler/sched-BFS.txt 1970-01-01 01:00:00.000000000 +0100 ++++ b/Documentation/scheduler/sched-BFS.txt 2019-01-05 20:22:51.089998199 +0000 +@@ -0,0 +1,351 @@ ++BFS - The Brain Fuck Scheduler by Con Kolivas. ++ ++Goals. ++ ++The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to ++completely do away with the complex designs of the past for the cpu process ++scheduler and instead implement one that is very simple in basic design. ++The main focus of BFS is to achieve excellent desktop interactivity and ++responsiveness without heuristics and tuning knobs that are difficult to ++understand, impossible to model and predict the effect of, and when tuned to ++one workload cause massive detriment to another. ++ ++ ++Design summary. ++ ++BFS is best described as a single runqueue, O(n) lookup, earliest effective ++virtual deadline first design, loosely based on EEVDF (earliest eligible virtual ++deadline first) and my previous Staircase Deadline scheduler. Each component ++shall be described in order to understand the significance of, and reasoning for ++it. The codebase when the first stable version was released was approximately ++9000 lines less code than the existing mainline linux kernel scheduler (in ++2.6.31). This does not even take into account the removal of documentation and ++the cgroups code that is not used. ++ ++Design reasoning. ++ ++The single runqueue refers to the queued but not running processes for the ++entire system, regardless of the number of CPUs. The reason for going back to ++a single runqueue design is that once multiple runqueues are introduced, ++per-CPU or otherwise, there will be complex interactions as each runqueue will ++be responsible for the scheduling latency and fairness of the tasks only on its ++own runqueue, and to achieve fairness and low latency across multiple CPUs, any ++advantage in throughput of having CPU local tasks causes other disadvantages. 
++This is due to requiring a very complex balancing system to at best achieve some ++semblance of fairness across CPUs and can only maintain relatively low latency ++for tasks bound to the same CPUs, not across them. To increase said fairness ++and latency across CPUs, the advantage of local runqueue locking, which makes ++for better scalability, is lost due to having to grab multiple locks. ++ ++A significant feature of BFS is that all accounting is done purely based on CPU ++used and nowhere is sleep time used in any way to determine entitlement or ++interactivity. Interactivity "estimators" that use some kind of sleep/run ++algorithm are doomed to fail to detect all interactive tasks, and to falsely tag ++tasks that aren't interactive as being so. The reason for this is that it is ++close to impossible to determine that when a task is sleeping, whether it is ++doing it voluntarily, as in a userspace application waiting for input in the ++form of a mouse click or otherwise, or involuntarily, because it is waiting for ++another thread, process, I/O, kernel activity or whatever. Thus, such an ++estimator will introduce corner cases, and more heuristics will be required to ++cope with those corner cases, introducing more corner cases and failed ++interactivity detection and so on. Interactivity in BFS is built into the design ++by virtue of the fact that tasks that are waking up have not used up their quota ++of CPU time, and have earlier effective deadlines, thereby making it very likely ++they will preempt any CPU bound task of equivalent nice level. See below for ++more information on the virtual deadline mechanism. Even if they do not preempt ++a running task, because the rr interval is guaranteed to have a bound upper ++limit on how long a task will wait for, it will be scheduled within a timeframe ++that will not cause visible interface jitter. ++ ++ ++Design details. ++ ++Task insertion. ++ ++BFS inserts tasks into each relevant queue as an O(1) insertion into a double ++linked list. On insertion, *every* running queue is checked to see if the newly ++queued task can run on any idle queue, or preempt the lowest running task on the ++system. This is how the cross-CPU scheduling of BFS achieves significantly lower ++latency per extra CPU the system has. In this case the lookup is, in the worst ++case scenario, O(n) where n is the number of CPUs on the system. ++ ++Data protection. ++ ++BFS has one single lock protecting the process local data of every task in the ++global queue. Thus every insertion, removal and modification of task data in the ++global runqueue needs to grab the global lock. However, once a task is taken by ++a CPU, the CPU has its own local data copy of the running process' accounting ++information which only that CPU accesses and modifies (such as during a ++timer tick) thus allowing the accounting data to be updated lockless. Once a ++CPU has taken a task to run, it removes it from the global queue. Thus the ++global queue only ever has, at most, ++ ++ (number of tasks requesting cpu time) - (number of logical CPUs) + 1 ++ ++tasks in the global queue. This value is relevant for the time taken to look up ++tasks during scheduling. This will increase if many tasks with CPU affinity set ++in their policy to limit which CPUs they're allowed to run on if they outnumber ++the number of CPUs. The +1 is because when rescheduling a task, the CPU's ++currently running task is put back on the queue. Lookup will be described after ++the virtual deadline mechanism is explained. 
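For example, on a machine with 4 logical CPUs and 10 tasks requesting CPU time, 4 tasks will have been taken by CPUs and removed from the global queue, leaving 6 queued; while a CPU reschedules, its currently running task goes back on the queue before the next one is taken, so the queue momentarily holds 10 - 4 + 1 = 7 tasks. That figure is the n which bounds the O(n) lookup described below.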
++ ++Virtual deadline. ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in BFS is entirely in the virtual deadline mechanism. The one ++tunable in BFS is the rr_interval, or "round robin interval". This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in jiffies by this equation: ++ ++ jiffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. There are three components to ++how a task is next chosen. First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. Once a task is descheduled, it is put back on the queue, and an ++O(n) lookup of all queued-but-not-running tasks is done to determine which has ++the earliest deadline and that task is chosen to receive CPU next. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (jiffies) is ++constantly moving. ++ ++Task lookup. ++ ++BFS has 103 priority queues. 100 of these are dedicated to the static priority ++of realtime tasks, and the remaining 3 are, in order of best to worst priority, ++SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority ++scheduling). When a task of these priorities is queued, a bitmap of running ++priorities is set showing which of these priorities has tasks waiting for CPU ++time. When a CPU is made to reschedule, the lookup for the next task to get ++CPU time is performed in the following way: ++ ++First the bitmap is checked to see what static priority tasks are queued. If ++any realtime priorities are found, the corresponding queue is checked and the ++first task listed there is taken (provided CPU affinity is suitable) and lookup ++is complete. If the priority corresponds to a SCHED_ISO task, they are also ++taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds ++to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). 
At this ++stage, every task in the runlist that corresponds to that priority is checked ++to see which has the earliest set deadline, and (provided it has suitable CPU ++affinity) it is taken off the runqueue and given the CPU. If a task has an ++expired deadline, it is taken and the rest of the lookup aborted (as they are ++chosen in FIFO order). ++ ++Thus, the lookup is O(n) in the worst case only, where n is as described ++earlier, as tasks may be chosen before the whole task list is looked over. ++ ++ ++Scalability. ++ ++The major limitations of BFS will be that of scalability, as the separate ++runqueue designs will have less lock contention as the number of CPUs rises. ++However they do not scale linearly even with separate runqueues as multiple ++runqueues will need to be locked concurrently on such designs to be able to ++achieve fair CPU balancing, to try and achieve some sort of nice-level fairness ++across CPUs, and to achieve low enough latency for tasks on a busy CPU when ++other CPUs would be more suited. BFS has the advantage that it requires no ++balancing algorithm whatsoever, as balancing occurs by proxy simply because ++all CPUs draw off the global runqueue, in priority and deadline order. Despite ++the fact that scalability is _not_ the prime concern of BFS, it both shows very ++good scalability to smaller numbers of CPUs and is likely a more scalable design ++at these numbers of CPUs. ++ ++It also has some very low overhead scalability features built into the design ++when it has been deemed their overhead is so marginal that they're worth adding. ++The first is the local copy of the running process' data to the CPU it's running ++on to allow that data to be updated lockless where possible. Then there is ++deference paid to the last CPU a task was running on, by trying that CPU first ++when looking for an idle CPU to use the next time it's scheduled. Finally there ++is the notion of cache locality beyond the last running CPU. The sched_domains ++information is used to determine the relative virtual "cache distance" that ++other CPUs have from the last CPU a task was running on. CPUs with shared ++caches, such as SMT siblings, or multicore CPUs with shared caches, are treated ++as cache local. CPUs without shared caches are treated as not cache local, and ++CPUs on different NUMA nodes are treated as very distant. This "relative cache ++distance" is used by modifying the virtual deadline value when doing lookups. ++Effectively, the deadline is unaltered between "cache local" CPUs, doubled for ++"cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning ++behind the doubling of deadlines is as follows. The real cost of migrating a ++task from one CPU to another is entirely dependent on the cache footprint of ++the task, how cache intensive the task is, how long it's been running on that ++CPU to take up the bulk of its cache, how big the CPU cache is, how fast and ++how layered the CPU cache is, how fast a context switch is... and so on. In ++other words, it's close to random in the real world where we do more than just ++one sole workload. The only thing we can be sure of is that it's not free. So ++BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs ++is more important than cache locality, and cache locality only plays a part ++after that.
Doubling the effective deadline is based on the premise that the ++"cache local" CPUs will tend to work on the same tasks up to double the number ++of cache local CPUs, and once the workload is beyond that amount, it is likely ++that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA ++is a value I pulled out of my arse. ++ ++When choosing an idle CPU for a waking task, the cache locality is determined ++according to where the task last ran and then idle CPUs are ranked from best ++to worst to choose the most suitable idle CPU based on cache locality, NUMA ++node locality and hyperthread sibling business. They are chosen in the ++following preference (if idle): ++ ++* Same core, idle or busy cache, idle threads ++* Other core, same cache, idle or busy cache, idle threads. ++* Same node, other CPU, idle cache, idle threads. ++* Same node, other CPU, busy cache, idle threads. ++* Same core, busy threads. ++* Other core, same cache, busy threads. ++* Same node, other CPU, busy threads. ++* Other node, other CPU, idle cache, idle threads. ++* Other node, other CPU, busy cache, idle threads. ++* Other node, other CPU, busy threads. ++ ++This shows the SMT or "hyperthread" awareness in the design as well which will ++choose a real idle core first before a logical SMT sibling which already has ++tasks on the physical CPU. ++ ++Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark. ++However this benchmarking was performed on an earlier design that was far less ++scalable than the current one so it's hard to know how scalable it is in terms ++of both CPUs (due to the global runqueue) and heavily loaded machines (due to ++O(n) lookup) at this stage. Note that in terms of scalability, the number of ++_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x) ++quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark ++results are very promising indeed, without needing to tweak any knobs, features ++or options. Benchmark contributions are most welcome. ++ ++ ++Features ++ ++As the initial prime target audience for BFS was the average desktop user, it ++was designed to not need tweaking, tuning or have features set to obtain benefit ++from it. Thus the number of knobs and features has been kept to an absolute ++minimum and should not require extra user input for the vast majority of cases. ++There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval ++and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition ++to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is ++support for CGROUPS. The average user should neither need to know what these ++are, nor should they need to be using them to have good desktop behaviour. ++ ++rr_interval ++ ++There is only one "scheduler" tunable, the round robin interval. This can be ++accessed in ++ ++ /proc/sys/kernel/rr_interval ++ ++The value is in milliseconds, and the default value is set to 6 on a ++uniprocessor machine, and automatically set to a progressively higher value on ++multiprocessor machines. The reasoning behind increasing the value on more CPUs ++is that the effective latency is decreased by virtue of there being more CPUs on ++BFS (for reasons explained above), and increasing the value allows for less ++cache contention and more throughput. 
Valid values are from 1 to 1000. ++Decreasing the value will decrease latencies at the cost of decreasing ++throughput, while increasing it will improve throughput, but at the cost of ++worsening latencies. The accuracy of the rr interval is limited by HZ resolution ++of the kernel configuration. Thus, the worst case latencies are usually slightly ++higher than this actual value. The default value of 6 is not an arbitrary one. ++It is based on the fact that humans can detect jitter at approximately 7ms, so ++aiming for much lower latencies is pointless under most circumstances. It is ++worth noting this fact when comparing the latency performance of BFS to other ++schedulers. Worst case latencies being higher than 7ms are far worse than ++average latencies not being in the microsecond range. ++ ++Isochronous scheduling. ++ ++Isochronous scheduling is a unique scheduling policy designed to provide ++near-real-time performance to unprivileged (ie non-root) users without the ++ability to starve the machine indefinitely. Isochronous tasks (which means ++"same time") are set using, for example, the schedtool application like so: ++ ++ schedtool -I -e amarok ++ ++This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works ++is that it has a priority level between true realtime tasks and SCHED_NORMAL ++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, ++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval ++rate). However if ISO tasks run for more than a tunable finite amount of time, ++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of ++time is the percentage of _total CPU_ available across the machine, configurable ++as a percentage in the following "resource handling" tunable (as opposed to a ++scheduler tunable): ++ ++ /proc/sys/kernel/iso_cpu ++ ++and is set to 70% by default. It is calculated over a rolling 5 second average. ++Because it is the total CPU available, it means that on a multi CPU machine, it ++is possible to have an ISO task running as realtime scheduling indefinitely on ++just one CPU, as the other CPUs will be available. Setting this to 100 is the ++equivalent of giving all users SCHED_RR access and setting it to 0 removes the ++ability to run any pseudo-realtime tasks. ++ ++A feature of BFS is that it detects when an application tries to obtain a ++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the ++appropriate privileges to use those policies. When it detects this, it will ++give the task SCHED_ISO policy instead. Thus it is transparent to the user. ++Because some applications constantly set their policy as well as their nice ++level, there is potential for them to undo the override specified by the user ++on the command line of setting the policy to SCHED_ISO. To counter this, once ++a task has been set to SCHED_ISO policy, it needs superuser privileges to set ++it back to SCHED_NORMAL. This will ensure the task remains ISO and all child ++processes and threads will also inherit the ISO policy. ++ ++Idleprio scheduling. ++ ++Idleprio scheduling is a scheduling policy designed to give out CPU to a task ++_only_ when the CPU would be otherwise idle. The idea behind this is to allow ++ultra low priority tasks to be run in the background that have virtually no ++effect on the foreground tasks.
This is ideally suited to distributed computing ++clients (like setiathome, folding, mprime etc) but can also be used to start ++a video encode or so on without any slowdown of other tasks. To prevent this ++policy from grabbing shared resources and holding them indefinitely, if it ++detects a state where the task is waiting on I/O, the machine is about to ++suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As ++per the Isochronous task management, once a task has been scheduled as IDLEPRIO, ++it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can ++be set to start as SCHED_IDLEPRIO with the schedtool command like so: ++ ++ schedtool -D -e ./mprime ++ ++Subtick accounting. ++ ++It is surprisingly difficult to get accurate CPU accounting, and in many cases, ++the accounting is done by simply determining what is happening at the precise ++moment a timer tick fires off. This becomes increasingly inaccurate as the ++timer tick frequency (HZ) is lowered. It is possible to create an application ++which uses almost 100% CPU, yet by being descheduled at the right time, records ++zero CPU usage. While the main problem with this is that there are possible ++security implications, it is also difficult to determine how much CPU a task ++really does use. BFS tries to use the sub-tick accounting from the TSC clock, ++where possible, to determine real CPU usage. This is not entirely reliable, but ++is far more likely to produce accurate CPU usage data than the existing designs ++and will not show tasks as consuming no CPU usage when they actually are. Thus, ++the amount of CPU reported as being used by BFS will more accurately represent ++how much CPU the task itself is using (as is shown for example by the 'time' ++application), so the reported values may be quite different to other schedulers. ++Values reported as the 'load' are more prone to problems with this design, but ++per process values are closer to real usage. When comparing throughput of BFS ++to other designs, it is important to compare the actual completed work in terms ++of total wall clock time taken and total work done, rather than the reported ++"cpu usage". ++ ++ ++Con Kolivas Fri Aug 27 2010 +diff -Nur a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt +--- a/Documentation/scheduler/sched-MuQSS.txt 1970-01-01 01:00:00.000000000 +0100 ++++ b/Documentation/scheduler/sched-MuQSS.txt 2019-01-05 20:22:51.089998199 +0000 +@@ -0,0 +1,347 @@ ++MuQSS - The Multiple Queue Skiplist Scheduler by Con Kolivas. ++ ++MuQSS is a per-cpu runqueue variant of the original BFS scheduler with ++one 8 level skiplist per runqueue, and fine grained locking for much more ++scalability. ++ ++ ++Goals. ++ ++The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from ++here on (pronounced mux) is to completely do away with the complex designs of ++the past for the cpu process scheduler and instead implement one that is very ++simple in basic design. The main focus of MuQSS is to achieve excellent desktop ++interactivity and responsiveness without heuristics and tuning knobs that are ++difficult to understand, impossible to model and predict the effect of, and when ++tuned to one workload cause massive detriment to another, while still being ++scalable to many CPUs and processes. ++ ++ ++Design summary.
++ ++MuQSS is best described as per-cpu multiple runqueue, O(log n) insertion, O(1) ++lookup, earliest effective virtual deadline first tickless design, loosely based ++on EEVDF (earliest eligible virtual deadline first) and my previous Staircase ++Deadline scheduler, and evolved from the single runqueue O(n) BFS scheduler. ++Each component shall be described in order to understand the significance of, ++and reasoning for it. ++ ++ ++Design reasoning. ++ ++In BFS, the use of a single runqueue across all CPUs meant that each CPU would ++need to scan the entire runqueue looking for the process with the earliest ++deadline and schedule that next, regardless of which CPU it originally came ++from. This made BFS deterministic with respect to latency and provided ++guaranteed latencies dependent on number of processes and CPUs. The single ++runqueue, however, meant that all CPUs would compete for the single lock ++protecting it, which would lead to increasing lock contention as the number of ++CPUs rose and appeared to limit scalability of common workloads beyond 16 ++logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously ++increased overhead proportionate to the number of queued processes and led to ++cache thrashing while iterating over the linked list. ++ ++MuQSS is an evolution of BFS, designed to maintain the same scheduling ++decision mechanism and be virtually deterministic without relying on the ++constrained design of the single runqueue by splitting out the single runqueue ++to be per-CPU and use skiplists instead of linked lists. ++ ++The original reason for going back to a single runqueue design for BFS was that ++once multiple runqueues are introduced, per-CPU or otherwise, there will be ++complex interactions as each runqueue will be responsible for the scheduling ++latency and fairness of the tasks only on its own runqueue, and to achieve ++fairness and low latency across multiple CPUs, any advantage in throughput of ++having CPU local tasks causes other disadvantages. This is due to requiring a ++very complex balancing system to at best achieve some semblance of fairness ++across CPUs and can only maintain relatively low latency for tasks bound to the ++same CPUs, not across them. To increase said fairness and latency across CPUs, ++the advantage of local runqueue locking, which makes for better scalability, is ++lost due to having to grab multiple locks. ++ ++MuQSS works around the problems inherent in multiple runqueue designs by ++making its skip lists priority ordered and through novel use of lockless ++examination of each other runqueue it can decide if it should take the earliest ++deadline task from another runqueue for latency reasons, or for CPU balancing ++reasons. It still does not have a balancing system, choosing to allow the ++next task scheduling decision and task wakeup CPU choice to allow balancing to ++happen by virtue of its choices. ++ ++ ++Design details. ++ ++Custom skip list implementation: ++ ++To avoid the overhead of building up and tearing down skip list structures, ++the variant used by MuQSS has a number of optimisations making it specific for ++its use case in the scheduler. It uses static arrays of 8 'levels' instead of ++building up and tearing down structures dynamically. This makes each runqueue ++only scale O(log N) up to 64k tasks. However as there is one runqueue per CPU ++it means that it scales O(log N) up to 64k x number of logical CPUs which is ++far beyond the realistic task limits each CPU could handle.
By being 8 levels ++it also makes the array exactly one cacheline in size. Additionally, each ++skip list node is bidirectional making insertion and removal amortised O(1), ++being O(k) where k is 1-8. Uniquely, we are only ever interested in the very ++first entry in each list at all times with MuQSS, so there is never a need to ++do a search and thus look up is always O(1). In interactive mode, the queues ++will be searched beyond their first entry if the first task is not suitable ++for affinity or SMT nice reasons. ++ ++Task insertion: ++ ++MuQSS inserts tasks into a per CPU runqueue as an O(log N) insertion into ++a custom skip list as described above (based on the original design by William ++Pugh). Insertion is ordered in such a way that there is never a need to do a ++search by ordering tasks according to static priority primarily, and then ++virtual deadline at the time of insertion. ++ ++Niffies: ++ ++Niffies are a monotonic forward moving timer not unlike the "jiffies" but are ++of nanosecond resolution. Niffies are calculated per-runqueue from the high ++resolution TSC timers, and in order to maintain fairness are synchronised ++between CPUs whenever both runqueues are locked concurrently. ++ ++Virtual deadline: ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in MuQSS is entirely in the virtual deadline mechanism. The one ++tunable in MuQSS is the rr_interval, or "round robin interval". This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in niffies by this equation: ++ ++ niffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. There are three components to ++how a task is next chosen. First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (niffies) is ++constantly moving. 
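The deadline arithmetic above is small enough to sketch in C. The function below is an illustration only, not the MuQSS implementation (MuQSS precomputes a prio_ratio table at boot rather than looping, and these identifiers are invented for the example):

	static u64 example_virtual_deadline(u64 niffies, int nice, u64 rr_interval_ns)
	{
		u64 prio_ratio = 100;	/* percent; the baseline is nice -20 */
		int level;

		/* the ratio grows by 10% per nice level above -20 */
		for (level = -20; level < nice; level++)
			prio_ratio = prio_ratio * 11 / 10;

		/* deadline = niffies + (prio_ratio * rr_interval) */
		return niffies + prio_ratio * rr_interval_ns / 100;
	}

Working the numbers with the default 6ms rr_interval: a nice 0 task receives an offset of roughly 6ms x 1.1^20, about 40ms, while a nice 5 task receives roughly 6ms x 1.1^25, about 65ms. By the squared relationship above, those two tasks then share CPU at close to 1.1^10, or about 2.6 to 1.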
++ ++Task lookup: ++ ++As tasks are already pre-ordered according to anticipated scheduling order in ++the skip lists, lookup for the next suitable task per-runqueue is always a ++matter of simply selecting the first task in the 0th level skip list entry. ++In order to maintain optimal latency and fairness across CPUs, MuQSS does a ++novel examination of every other runqueue in cache locality order, choosing the ++best task across all runqueues. This provides near-determinism of how long any ++task across the entire system may wait before receiving CPU time. The other ++runqueues are first examined lockless and then trylocked to minimise the ++potential lock contention if they are likely to have a suitable better task. ++Each other runqueue lock is only held for as long as it takes to examine the ++entry for suitability. In "interactive" mode, the default setting, MuQSS will ++look for the best deadline task across all CPUs, while in !interactive mode, ++it will only select a better deadline task from another CPU if it is more ++heavily laden than the current one. ++ ++Lookup is therefore O(k) where k is number of CPUs. ++ ++ ++Latency. ++ ++Through the use of virtual deadlines to govern the scheduling order of normal ++tasks, queue-to-activation latency per runqueue is guaranteed to be bound by ++the rr_interval tunable which is set to 6ms by default. This means that the ++longest a CPU bound task will wait for more CPU is proportional to the number ++of running tasks and in the common case of 0-2 running tasks per CPU, will be ++under the 7ms threshold for human perception of jitter. Additionally, as newly ++woken tasks will have an early deadline from their previous runtime, the very ++tasks that are usually latency sensitive will have the shortest interval for ++activation, usually preempting any existing CPU bound tasks. ++ ++Tickless expiry: ++ ++A feature of MuQSS is that it is not tied to the resolution of the chosen tick ++rate in Hz, instead depending entirely on the high resolution timers where ++possible for sub-millisecond accuracy on timeouts regardless of the underlying ++tick rate. This allows MuQSS to be run with the low overhead of low Hz rates ++such as 100 by default, benefiting from the improved throughput and lower ++power usage it provides. Another advantage of this approach is that in ++combination with the Full No HZ option, which disables ticks on running task ++CPUs instead of just idle CPUs, the tick can be disabled at all times ++regardless of how many tasks are running instead of being limited to just one ++running task. Note that this option is NOT recommended for regular desktop ++users. ++ ++ ++Scalability and balancing. ++ ++Unlike traditional approaches where balancing is a combination of CPU selection ++at task wakeup and intermittent balancing based on a vast array of rules set ++according to architecture, busyness calculations and special case management, ++MuQSS indirectly balances on the fly at task wakeup and next task selection. ++During initialisation, MuQSS creates a cache coherency ordered list of CPUs for ++each logical CPU and uses this to aid task/CPU selection when CPUs are busy. ++Additionally it selects any idle CPUs, if they are available, at any time over ++busy CPUs according to the following preference: ++ ++ * Same thread, idle or busy cache, idle or busy threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads.
++ * Other core, same cache, busy threads. ++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. ++ ++Mux is therefore SMT, MC and Numa aware without the need for extra ++intermittent balancing to maintain CPUs busy and make the most of cache ++coherency. ++ ++ ++Features ++ ++As the initial prime target audience for MuQSS was the average desktop user, it ++was designed to not need tweaking, tuning or have features set to obtain benefit ++from it. Thus the number of knobs and features has been kept to an absolute ++minimum and should not require extra user input for the vast majority of cases. ++There are 3 optional tunables, and 2 extra scheduling policies. The rr_interval, ++interactive, and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO ++policies. In addition to this, MuQSS also uses sub-tick accounting. What MuQSS ++does _not_ now feature is support for CGROUPS. The average user should neither ++need to know what these are, nor should they need to be using them to have good ++desktop behaviour. However since some applications refuse to work without ++cgroups, one can enable them with MuQSS as a stub and the filesystem will be ++created which will allow the applications to work. ++ ++rr_interval: ++ ++ /proc/sys/kernel/rr_interval ++ ++The value is in milliseconds, and the default value is set to 6. Valid values ++are from 1 to 1000. Decreasing the value will decrease latencies at the cost of ++decreasing throughput, while increasing it will improve throughput, but at the ++cost of worsening latencies. It is based on the fact that humans can detect ++jitter at approximately 7ms, so aiming for much lower latencies is pointless ++under most circumstances. It is worth noting this fact when comparing the ++latency performance of MuQSS to other schedulers. Worst case latencies being ++higher than 7ms are far worse than average latencies not being in the ++microsecond range. ++ ++interactive: ++ ++ /proc/sys/kernel/interactive ++ ++The value is a simple boolean of 1 for on and 0 for off and is set to on by ++default. Disabling this will disable the near-determinism of MuQSS when ++selecting the next task by not examining all CPUs for the earliest deadline ++task, or which CPU to wake to, instead prioritising CPU balancing for improved ++throughput. Latency will still be bound by rr_interval, but on a per-CPU basis ++instead of across the whole system. ++ ++Isochronous scheduling: ++ ++Isochronous scheduling is a unique scheduling policy designed to provide ++near-real-time performance to unprivileged (ie non-root) users without the ++ability to starve the machine indefinitely. Isochronous tasks (which means ++"same time") are set using, for example, the schedtool application like so: ++ ++ schedtool -I -e amarok ++ ++This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works ++is that it has a priority level between true realtime tasks and SCHED_NORMAL ++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, ++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval ++rate). However if ISO tasks run for more than a tunable finite amount of time, ++they are then demoted back to SCHED_NORMAL scheduling.
This finite amount of ++time is the percentage of CPU available per CPU, configurable as a percentage in ++the following "resource handling" tunable (as opposed to a scheduler tunable): ++ ++iso_cpu: ++ ++ /proc/sys/kernel/iso_cpu ++ ++and is set to 70% by default. It is calculated over a rolling 5 second average. ++Because it is the total CPU available, it means that on a multi CPU machine, it ++is possible to have an ISO task running as realtime scheduling indefinitely on ++just one CPU, as the other CPUs will be available. Setting this to 100 is the ++equivalent of giving all users SCHED_RR access and setting it to 0 removes the ++ability to run any pseudo-realtime tasks. ++ ++A feature of MuQSS is that it detects when an application tries to obtain a ++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the ++appropriate privileges to use those policies. When it detects this, it will ++give the task SCHED_ISO policy instead. Thus it is transparent to the user. ++ ++ ++Idleprio scheduling: ++ ++Idleprio scheduling is a scheduling policy designed to give out CPU to a task ++_only_ when the CPU would be otherwise idle. The idea behind this is to allow ++ultra low priority tasks to be run in the background that have virtually no ++effect on the foreground tasks. This is ideally suited to distributed computing ++clients (like setiathome, folding, mprime etc) but can also be used to start a ++video encode or so on without any slowdown of other tasks. To prevent this policy ++from grabbing shared resources and holding them indefinitely, if it detects a ++state where the task is waiting on I/O, the machine is about to suspend to ram ++and so on, it will transiently schedule them as SCHED_NORMAL. Once a task has ++been scheduled as IDLEPRIO, it cannot be put back to SCHED_NORMAL without ++superuser privileges since it is effectively a lower scheduling policy. Tasks ++can be set to start as SCHED_IDLEPRIO with the schedtool command like so: ++ ++schedtool -D -e ./mprime ++ ++Subtick accounting: ++ ++It is surprisingly difficult to get accurate CPU accounting, and in many cases, ++the accounting is done by simply determining what is happening at the precise ++moment a timer tick fires off. This becomes increasingly inaccurate as the timer ++tick frequency (HZ) is lowered. It is possible to create an application which ++uses almost 100% CPU, yet by being descheduled at the right time, records zero ++CPU usage. While the main problem with this is that there are possible security ++implications, it is also difficult to determine how much CPU a task really does ++use. Mux uses sub-tick accounting from the TSC clock to determine real CPU ++usage. Thus, the amount of CPU reported as being used by MuQSS will more ++accurately represent how much CPU the task itself is using (as is shown for ++example by the 'time' application), so the reported values may be quite ++different to other schedulers. When comparing throughput of MuQSS to other ++designs, it is important to compare the actual completed work in terms of total ++wall clock time taken and total work done, rather than the reported "cpu usage". ++ ++Symmetric MultiThreading (SMT) aware nice: ++ ++SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs. While the ++logical CPU count rises by adding thread units to each CPU core, allowing more ++than one task to be run simultaneously on the same core, the disadvantage of it ++is that the CPU power is shared between the tasks, not summating to the power ++of two CPUs.
The practical upshot of this is that two tasks running on ++separate threads of the same core run significantly slower than if they had one ++core each to run on. While smart CPU selection allows each task to have a core ++to itself whenever available (as is done on MuQSS), it cannot offset the ++slowdown that occurs when the cores are all loaded and only a thread is left. ++Most of the time this is harmless as the CPU is effectively overloaded at this ++point and the extra thread is of benefit. However when running a niced task in ++the presence of an un-niced task (say nice 19 v nice 0), the nice task gets ++precisely the same amount of CPU power as the unniced one. MuQSS has an ++optional configuration feature known as SMT-NICE which selectively idles the ++secondary niced thread for a period proportional to the nice difference, ++allowing CPU distribution according to nice level to be maintained, at the ++expense of a small amount of extra overhead. If this is configured in on a ++machine without SMT threads, the overhead is minimal. ++ ++ ++Con Kolivas Sat, 29th October 2016 +diff -Nur a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt +--- a/Documentation/sysctl/kernel.txt 2019-01-05 20:17:13.829237906 +0000 ++++ b/Documentation/sysctl/kernel.txt 2019-01-05 20:22:51.089998199 +0000 +@@ -39,6 +39,7 @@ + - hung_task_timeout_secs + - hung_task_warnings + - kexec_load_disabled ++- iso_cpu + - kptr_restrict + - l2cr [ PPC only ] + - modprobe ==> Documentation/debugging-modules.txt +@@ -73,6 +74,7 @@ + - randomize_va_space + - real-root-dev ==> Documentation/admin-guide/initrd.rst + - reboot-cmd [ SPARC only ] ++- rr_interval + - rtsig-max + - rtsig-nr + - seccomp/ ==> Documentation/userspace-api/seccomp_filter.rst +@@ -95,6 +97,7 @@ + - unknown_nmi_panic + - watchdog + - watchdog_thresh ++- yield_type + - version + + ============================================================== +@@ -397,6 +400,16 @@ + + ============================================================== + ++iso_cpu: (MuQSS CPU scheduler only). ++ ++This sets the percentage cpu that the unprivileged SCHED_ISO tasks can ++run effectively at realtime priority, averaged over a rolling five ++seconds over the -whole- system, meaning all cpus. ++ ++Set to 70 (percent) by default. ++ ++============================================================== ++ + l2cr: (PPC only) + + This flag controls the L2 cache of G3 processor boards. If +@@ -823,6 +836,20 @@ + + ============================================================== + ++rr_interval: (MuQSS CPU scheduler only) ++ ++This is the smallest duration that any cpu process scheduling unit ++will run for. Increasing this value can increase throughput of cpu ++bound tasks substantially but at the expense of increased latencies ++overall. Conversely decreasing it will decrease average and maximum ++latencies but at the expense of throughput. This value is in ++milliseconds and the default value chosen depends on the number of ++cpus available at scheduler initialisation with a minimum of 6. ++ ++Valid values are from 1-1000. ++ ++============================================================== ++ + rtsig-max & rtsig-nr: + + The file rtsig-max can be used to tune the maximum number +@@ -1081,3 +1108,13 @@ + tunable to zero will disable lockup detection altogether. + + ============================================================== ++ ++yield_type: (MuQSS CPU scheduler only) ++ ++This determines what type of yield calls to sched_yield will perform. ++ ++ 0: No yield. 
++ 1: Yield only to better priority/deadline tasks. (default) ++ 2: Expire timeslice and recalculate deadline. ++ ++============================================================== +diff -Nur a/fs/proc/base.c b/fs/proc/base.c +--- a/fs/proc/base.c 2018-12-21 13:13:19.000000000 +0000 ++++ b/fs/proc/base.c 2019-01-05 20:22:51.089998199 +0000 +@@ -481,7 +481,7 @@ + seq_printf(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff -Nur a/include/linux/init_task.h b/include/linux/init_task.h +--- a/include/linux/init_task.h 2018-12-21 13:13:19.000000000 +0000 ++++ b/include/linux/init_task.h 2019-01-05 20:22:51.089998199 +0000 +@@ -172,8 +172,6 @@ + # define INIT_VTIME(tsk) + #endif + +-#define INIT_TASK_COMM "swapper" +- + #ifdef CONFIG_RT_MUTEXES + # define INIT_RT_MUTEXES(tsk) \ + .pi_waiters = RB_ROOT_CACHED, \ +@@ -223,6 +221,80 @@ + * INIT_TASK is used to set up the first task table, touch at + * your own risk!. Base=0, limit=0x1fffff (=2MB) + */ ++#ifdef CONFIG_SCHED_MUQSS ++#define INIT_TASK_COMM "MuQSS" ++#define INIT_TASK(tsk) \ ++{ \ ++ INIT_TASK_TI(tsk) \ ++ .state = 0, \ ++ .stack = init_stack, \ ++ .usage = ATOMIC_INIT(2), \ ++ .flags = PF_KTHREAD, \ ++ .prio = NORMAL_PRIO, \ ++ .static_prio = MAX_PRIO-20, \ ++ .normal_prio = NORMAL_PRIO, \ ++ .deadline = 0, \ ++ .policy = SCHED_NORMAL, \ ++ .cpus_allowed = CPU_MASK_ALL, \ ++ .mm = NULL, \ ++ .active_mm = &init_mm, \ ++ .restart_block = { \ ++ .fn = do_no_restart_syscall, \ ++ }, \ ++ .time_slice = 1000000, \ ++ .tasks = LIST_HEAD_INIT(tsk.tasks), \ ++ INIT_PUSHABLE_TASKS(tsk) \ ++ .ptraced = LIST_HEAD_INIT(tsk.ptraced), \ ++ .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \ ++ .real_parent = &tsk, \ ++ .parent = &tsk, \ ++ .children = LIST_HEAD_INIT(tsk.children), \ ++ .sibling = LIST_HEAD_INIT(tsk.sibling), \ ++ .group_leader = &tsk, \ ++ RCU_POINTER_INITIALIZER(real_cred, &init_cred), \ ++ RCU_POINTER_INITIALIZER(cred, &init_cred), \ ++ .comm = INIT_TASK_COMM, \ ++ .thread = INIT_THREAD, \ ++ .fs = &init_fs, \ ++ .files = &init_files, \ ++ .signal = &init_signals, \ ++ .sighand = &init_sighand, \ ++ .nsproxy = &init_nsproxy, \ ++ .pending = { \ ++ .list = LIST_HEAD_INIT(tsk.pending.list), \ ++ .signal = {{0}}}, \ ++ .blocked = {{0}}, \ ++ .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ ++ .journal_info = NULL, \ ++ INIT_CPU_TIMERS(tsk) \ ++ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ ++ .timer_slack_ns = 50000, /* 50 usec default slack */ \ ++ .pids = { \ ++ [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ ++ [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ ++ [PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \ ++ }, \ ++ .thread_group = LIST_HEAD_INIT(tsk.thread_group), \ ++ .thread_node = LIST_HEAD_INIT(init_signals.thread_head), \ ++ INIT_IDS \ ++ INIT_PERF_EVENTS(tsk) \ ++ INIT_TRACE_IRQFLAGS \ ++ INIT_LOCKDEP \ ++ INIT_FTRACE_GRAPH \ ++ INIT_TRACE_RECURSION \ ++ INIT_TASK_RCU_PREEMPT(tsk) \ ++ INIT_TASK_RCU_TASKS(tsk) \ ++ INIT_CPUSET_SEQ(tsk) \ ++ INIT_RT_MUTEXES(tsk) \ ++ INIT_PREV_CPUTIME(tsk) \ ++ INIT_VTIME(tsk) \ ++ INIT_NUMA_BALANCING(tsk) \ ++ INIT_KASAN(tsk) \ ++ INIT_LIVEPATCH(tsk) \ ++ INIT_TASK_SECURITY \ ++} ++#else /* CONFIG_SCHED_MUQSS */ ++#define INIT_TASK_COMM "swapper" + #define INIT_TASK(tsk) \ + { \ + INIT_TASK_TI(tsk) \ +@@ -300,7 +372,7 @@ + INIT_LIVEPATCH(tsk) \ + INIT_TASK_SECURITY \ + } +- ++#endif /* 
CONFIG_SCHED_MUQSS */ + + /* Attach to the init_task data structure for proper alignment */ + #define __init_task_data __attribute__((__section__(".data..init_task"))) +diff -Nur a/include/linux/ioprio.h b/include/linux/ioprio.h +--- a/include/linux/ioprio.h 2018-12-21 13:13:19.000000000 +0000 ++++ b/include/linux/ioprio.h 2019-01-05 20:22:51.089998199 +0000 +@@ -52,6 +52,8 @@ + */ + static inline int task_nice_ioprio(struct task_struct *task) + { ++ if (iso_task(task)) ++ return 0; + return (task_nice(task) + 20) / 5; + } + +diff -Nur a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h +--- a/include/linux/sched/nohz.h 2018-12-21 13:13:19.000000000 +0000 ++++ b/include/linux/sched/nohz.h 2019-01-05 20:22:51.089998199 +0000 +@@ -6,7 +6,7 @@ + * This is the interface between the scheduler and nohz/dynticks: + */ + +-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) ++#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS) + extern void cpu_load_update_nohz_start(void); + extern void cpu_load_update_nohz_stop(void); + #else +@@ -23,7 +23,7 @@ + static inline void set_cpu_sd_state_idle(void) { } + #endif + +-#ifdef CONFIG_NO_HZ_COMMON ++#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS) + void calc_load_nohz_start(void); + void calc_load_nohz_stop(void); + #else +diff -Nur a/include/linux/sched/prio.h b/include/linux/sched/prio.h +--- a/include/linux/sched/prio.h 2018-12-21 13:13:19.000000000 +0000 ++++ b/include/linux/sched/prio.h 2019-01-05 20:22:51.089998199 +0000 +@@ -20,8 +20,20 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ ++#ifdef CONFIG_SCHED_MUQSS ++/* Note different MAX_RT_PRIO */ ++#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1) ++ ++#define ISO_PRIO (MAX_RT_PRIO) ++#define NORMAL_PRIO (MAX_RT_PRIO + 1) ++#define IDLE_PRIO (MAX_RT_PRIO + 2) ++#define PRIO_LIMIT ((IDLE_PRIO) + 1) ++#else /* CONFIG_SCHED_MUQSS */ + #define MAX_RT_PRIO MAX_USER_RT_PRIO + ++#endif /* CONFIG_SCHED_MUQSS */ ++ + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) + +diff -Nur a/include/linux/sched/task.h b/include/linux/sched/task.h +--- a/include/linux/sched/task.h 2018-12-21 13:13:19.000000000 +0000 ++++ b/include/linux/sched/task.h 2019-01-05 20:22:51.089998199 +0000 +@@ -80,7 +80,7 @@ + extern void free_task(struct task_struct *tsk); + + /* sched_exec is called by processes performing an exec */ +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_MUQSS) + extern void sched_exec(void); + #else + #define sched_exec() {} +diff -Nur a/include/linux/sched.h b/include/linux/sched.h +--- a/include/linux/sched.h 2018-12-21 13:13:19.000000000 +0000 ++++ b/include/linux/sched.h 2019-01-05 20:22:51.089998199 +0000 +@@ -27,6 +27,9 @@ + #include + #include + #include ++#ifdef CONFIG_SCHED_MUQSS ++#include ++#endif + + /* task_struct member predeclarations (sorted alphabetically): */ + struct audit_context; +@@ -579,9 +582,11 @@ + unsigned int flags; + unsigned int ptrace; + ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_MUQSS) ++ int on_cpu; ++#endif + #ifdef CONFIG_SMP + struct llist_node wake_entry; +- int on_cpu; + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; +@@ -598,10 +603,25 @@ + int static_prio; + int normal_prio; + unsigned int rt_priority; ++#ifdef CONFIG_SCHED_MUQSS ++ int time_slice; ++ u64 deadline; ++ skiplist_node node; /* Skip list node */ ++ u64 last_ran; ++ u64 sched_time; /* sched_clock time spent running */ ++#ifdef CONFIG_SMT_NICE ++ int 
smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++#ifdef CONFIG_HOTPLUG_CPU ++ bool zerobound; /* Bound to CPU0 for hotplug */ ++#endif ++ unsigned long rt_timeout; ++#else /* CONFIG_SCHED_MUQSS */ + + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +@@ -751,6 +771,10 @@ + u64 utimescaled; + u64 stimescaled; + #endif ++#ifdef CONFIG_SCHED_MUQSS ++ /* Unbanked cpu time */ ++ unsigned long utime_ns, stime_ns; ++#endif + u64 gtime; + struct prev_cputime prev_cputime; + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +@@ -1155,6 +1179,40 @@ + */ + }; + ++#ifdef CONFIG_SCHED_MUQSS ++#define tsk_seruntime(t) ((t)->sched_time) ++#define tsk_rttimeout(t) ((t)->rt_timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++} ++ ++void print_scheduler_version(void); ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return (p->policy == SCHED_ISO); ++} ++#else /* CFS */ ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++ p->nr_cpus_allowed = current->nr_cpus_allowed; ++} ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "CFS CPU scheduler.\n"); ++} ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return false; ++} ++#endif /* CONFIG_SCHED_MUQSS */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->pids[PIDTYPE_PID].pid; +diff -Nur a/include/linux/skip_list.h b/include/linux/skip_list.h +--- a/include/linux/skip_list.h 1970-01-01 01:00:00.000000000 +0100 ++++ b/include/linux/skip_list.h 2019-01-05 20:22:51.089998199 +0000 +@@ -0,0 +1,33 @@ ++#ifndef _LINUX_SKIP_LISTS_H ++#define _LINUX_SKIP_LISTS_H ++typedef u64 keyType; ++typedef void *valueType; ++ ++typedef struct nodeStructure skiplist_node; ++ ++struct nodeStructure { ++ int level; /* Levels in this structure */ ++ keyType key; ++ valueType value; ++ skiplist_node *next[8]; ++ skiplist_node *prev[8]; ++}; ++ ++typedef struct listStructure { ++ int entries; ++ int level; /* Maximum level of the list ++ (1 more than the number of levels in the list) */ ++ skiplist_node *header; /* pointer to header */ ++} skiplist; ++ ++void skiplist_init(skiplist_node *slnode); ++skiplist *new_skiplist(skiplist_node *slnode); ++void free_skiplist(skiplist *l); ++void skiplist_node_init(skiplist_node *node); ++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed); ++void skiplist_delete(skiplist *l, skiplist_node *node); ++ ++static inline bool skiplist_node_empty(skiplist_node *node) { ++ return (!node->next[0]); ++} ++#endif /* _LINUX_SKIP_LISTS_H */ +diff -Nur a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h +--- a/include/uapi/linux/sched.h 2018-12-21 13:13:19.000000000 +0000 ++++ b/include/uapi/linux/sched.h 2019-01-05 20:22:51.089998199 +0000 +@@ -37,9 +37,16 @@ + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 +-/* SCHED_ISO: reserved but not implemented yet */ ++/* SCHED_ISO: Implemented on MuQSS only */ + #define SCHED_IDLE 5 ++#ifdef CONFIG_SCHED_MUQSS ++#define SCHED_ISO 4 ++#define SCHED_IDLEPRIO SCHED_IDLE ++#define SCHED_MAX (SCHED_IDLEPRIO) ++#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) ++#else /* CONFIG_SCHED_MUQSS */ + #define SCHED_DEADLINE 6 ++#endif /* CONFIG_SCHED_MUQSS */ + + /* Can be ORed in to 
make sure the process is reverted back to SCHED_NORMAL on fork */ + #define SCHED_RESET_ON_FORK 0x40000000 +diff -Nur a/init/Kconfig b/init/Kconfig +--- a/init/Kconfig 2019-01-05 20:17:13.849238543 +0000 ++++ b/init/Kconfig 2019-01-05 20:22:51.089998199 +0000 +@@ -38,6 +38,18 @@ + + menu "General setup" + ++config SCHED_MUQSS ++ bool "MuQSS cpu scheduler" ++ select HIGH_RES_TIMERS ++ ---help--- ++ The Multiple Queue Skiplist Scheduler for excellent interactivity and ++ responsiveness on the desktop and highly scalable deterministic ++ low latency on any hardware. ++ ++ Say Y here. ++ default y ++ ++ + config BROKEN + bool + +@@ -621,6 +633,7 @@ + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_MUQSS + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -723,9 +736,13 @@ + help + This feature lets CPU scheduler recognize task groups and control CPU + bandwidth allocation to such task groups. It uses cgroups to group +- tasks. ++ tasks. In combination with MuQSS this is purely a STUB to create the ++ files associated with the CPU controller cgroup but most of the ++ controls do nothing. This is useful for working in environments and ++ with applications that will only work if this control group is ++ present. + +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_MUQSS + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -832,6 +849,7 @@ + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" ++ depends on !SCHED_MUQSS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. 
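Pulling the Kconfig changes in this hunk together, a MuQSS configuration ends up looking roughly like the fragment below; a sketch for orientation rather than part of the patch, with the forced-off options following from the dependencies added above (SMT_NICE comes from the arch/x86/Kconfig hunk earlier in this patch):

	CONFIG_SCHED_MUQSS=y
	CONFIG_SMT_NICE=y
	# CONFIG_NUMA_BALANCING is not set (now depends on !SCHED_MUQSS)
	# CONFIG_FAIR_GROUP_SCHED is not set (gated behind !SCHED_MUQSS; CGROUP_SCHED becomes a stub)
	# CONFIG_CGROUP_CPUACCT is not set (now depends on !SCHED_MUQSS)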
+@@ -950,6 +968,7 @@ + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_MUQSS + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff -Nur a/init/main.c b/init/main.c +--- a/init/main.c 2018-12-21 13:13:19.000000000 +0000 ++++ b/init/main.c 2019-01-05 20:22:51.089998199 +0000 +@@ -841,7 +841,6 @@ + return ret; + } + +- + extern initcall_t __initcall_start[]; + extern initcall_t __initcall0_start[]; + extern initcall_t __initcall1_start[]; +@@ -1008,6 +1007,8 @@ + + rcu_end_inkernel_boot(); + ++ print_scheduler_version(); ++ + if (ramdisk_execute_command) { + ret = run_init_process(ramdisk_execute_command); + if (!ret) +diff -Nur a/kernel/delayacct.c b/kernel/delayacct.c +--- a/kernel/delayacct.c 2018-12-21 13:13:19.000000000 +0000 ++++ b/kernel/delayacct.c 2019-01-05 20:22:51.089998199 +0000 +@@ -115,7 +115,7 @@ + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff -Nur a/kernel/exit.c b/kernel/exit.c +--- a/kernel/exit.c 2018-12-21 13:13:19.000000000 +0000 ++++ b/kernel/exit.c 2019-01-05 20:22:51.089998199 +0000 +@@ -129,7 +129,7 @@ + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -150,7 +150,7 @@ + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff -Nur a/kernel/kthread.c b/kernel/kthread.c +--- a/kernel/kthread.c 2018-12-21 13:13:19.000000000 +0000 ++++ b/kernel/kthread.c 2019-01-05 20:22:51.099998516 +0000 +@@ -410,6 +410,34 @@ + } + EXPORT_SYMBOL(kthread_bind); + ++#if defined(CONFIG_SCHED_MUQSS) && defined(CONFIG_SMP) ++extern void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); ++ ++/* ++ * new_kthread_bind is a special variant of __kthread_bind_mask. ++ * For new threads to work on muqss we want to call do_set_cpus_allowed ++ * without the task_cpu being set and the task rescheduled until they're ++ * rescheduled on their own so we call __do_set_cpus_allowed directly which ++ * only changes the cpumask. This is particularly important for smpboot threads ++ * to work. ++ */ ++static void new_kthread_bind(struct task_struct *p, unsigned int cpu) ++{ ++ unsigned long flags; ++ ++ if (WARN_ON(!wait_task_inactive(p, TASK_UNINTERRUPTIBLE))) ++ return; ++ ++ /* It's safe because the task is inactive. */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ __do_set_cpus_allowed(p, cpumask_of(cpu)); ++ p->flags |= PF_NO_SETAFFINITY; ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++#else ++#define new_kthread_bind(p, cpu) kthread_bind(p, cpu) ++#endif ++ + /** + * kthread_create_on_cpu - Create a cpu bound kthread + * @threadfn: the function to run until signal_pending(current). +@@ -431,7 +459,7 @@ + cpu); + if (IS_ERR(p)) + return p; +- kthread_bind(p, cpu); ++ new_kthread_bind(p, cpu); + /* CPU hotplug need to bind once again when unparking the thread. 
*/
+ set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
+ to_kthread(p)->cpu = cpu;
+diff -Nur a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
+--- a/kernel/livepatch/transition.c 2018-12-21 13:13:19.000000000 +0000
++++ b/kernel/livepatch/transition.c 2019-01-05 20:22:51.099998516 +0000
+@@ -277,6 +277,12 @@
+ return 0;
+ }
+
++#ifdef CONFIG_SCHED_MUQSS
++typedef unsigned long rq_flags_t;
++#else
++typedef struct rq_flags rq_flags_t;
++#endif
++
+ /*
+ * Try to safely switch a task to the target patch state. If it's currently
+ * running, or it's sleeping on a to-be-patched or to-be-unpatched function, or
+@@ -285,7 +291,7 @@
+ static bool klp_try_switch_task(struct task_struct *task)
+ {
+ struct rq *rq;
+- struct rq_flags flags;
++ rq_flags_t flags;
+ int ret;
+ bool success = false;
+ char err_buf[STACK_ERR_BUF_SIZE];
+diff -Nur a/kernel/Makefile b/kernel/Makefile
+--- a/kernel/Makefile 2018-12-21 13:13:19.000000000 +0000
++++ b/kernel/Makefile 2019-01-05 20:22:51.099998516 +0000
+@@ -10,7 +10,7 @@
+ extable.o params.o \
+ kthread.o sys_ni.o nsproxy.o \
+ notifier.o ksysfs.o cred.o reboot.o \
+- async.o range.o smpboot.o ucount.o
++ async.o range.o smpboot.o ucount.o skip_list.o
+
+ obj-$(CONFIG_MODULES) += kmod.o
+ obj-$(CONFIG_MULTIUSER) += groups.o
+diff -Nur a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
+--- a/kernel/rcu/Kconfig 2018-12-21 13:13:19.000000000 +0000
++++ b/kernel/rcu/Kconfig 2019-01-05 20:22:51.099998516 +0000
+@@ -93,7 +93,7 @@
+ config CONTEXT_TRACKING_FORCE
+ bool "Force context tracking"
+ depends on CONTEXT_TRACKING
+- default y if !NO_HZ_FULL
++ default y if !NO_HZ_FULL && !SCHED_MUQSS
+ help
+ The major pre-requirement for full dynticks to work is to
+ support the context tracking subsystem. But there are also
+diff -Nur a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
+--- a/kernel/sched/cpufreq_schedutil.c 2018-12-21 13:13:19.000000000 +0000
++++ b/kernel/sched/cpufreq_schedutil.c 2019-01-05 20:22:51.099998516 +0000
+@@ -176,6 +176,17 @@
+ return cpufreq_driver_resolve_freq(policy, freq);
+ }
+
++#ifdef CONFIG_SCHED_MUQSS
++static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu)
++{
++ struct rq *rq = cpu_rq(cpu);
++
++ *util = rq->load_avg;
++ if (*util > SCHED_CAPACITY_SCALE)
++ *util = SCHED_CAPACITY_SCALE;
++ *max = SCHED_CAPACITY_SCALE;
++}
++#else /* CONFIG_SCHED_MUQSS */
+ static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu)
+ {
+ struct rq *rq = cpu_rq(cpu);
+@@ -186,6 +197,7 @@
+ *util = min(rq->cfs.avg.util_avg, cfs_max);
+ *max = cfs_max;
+ }
++#endif /* CONFIG_SCHED_MUQSS */
+
+ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned int flags)
+diff -Nur a/kernel/sched/cputime.c b/kernel/sched/cputime.c
+--- a/kernel/sched/cputime.c 2018-12-21 13:13:19.000000000 +0000
++++ b/kernel/sched/cputime.c 2019-01-05 20:22:51.099998516 +0000
+@@ -270,26 +270,6 @@
+ return accounted;
+ }
+
+-#ifdef CONFIG_64BIT
+-static inline u64 read_sum_exec_runtime(struct task_struct *t)
+-{
+- return t->se.sum_exec_runtime;
+-}
+-#else
+-static u64 read_sum_exec_runtime(struct task_struct *t)
+-{
+- u64 ns;
+- struct rq_flags rf;
+- struct rq *rq;
+-
+- rq = task_rq_lock(t, &rf);
+- ns = t->se.sum_exec_runtime;
+- task_rq_unlock(rq, t, &rf);
+-
+- return ns;
+-}
+-#endif
+-
+ /*
+ * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
+ * tasks (sum on group iteration) belonging to @tsk's group.
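In the schedutil hunk above, the MuQSS variant of sugov_get_util() reports the runqueue's decaying load_avg, clamped to SCHED_CAPACITY_SCALE, as the utilisation. A hedged sketch of what a schedutil-style governor then does with the (util, max) pair; this mirrors the governor's usual "add roughly 25% headroom" mapping and is an illustration, not code from this patch:

/* Illustration only: schedutil-style target frequency from (util, max). */
static unsigned long sketch_next_freq(unsigned long max_freq,
				      unsigned long util, unsigned long max)
{
	/* ~1.25 * max_freq, so the CPU is not run flat out at 100% util */
	unsigned long freq = max_freq + (max_freq >> 2);

	/* util == max (a fully loaded MuQSS runqueue) requests full speed */
	return freq * util / max;
}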
+@@ -661,7 +641,7 @@ + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff -Nur a/kernel/sched/idle.c b/kernel/sched/idle.c +--- a/kernel/sched/idle.c 2018-12-21 13:13:19.000000000 +0000 ++++ b/kernel/sched/idle.c 2019-01-05 20:22:51.099998516 +0000 +@@ -209,6 +209,9 @@ + */ + static void do_idle(void) + { ++ int cpu = smp_processor_id(); ++ bool pending = false; ++ + /* + * If the arch has a polling bit, we maintain an invariant: + * +@@ -220,13 +223,16 @@ + + __current_set_polling(); + quiet_vmstat(); +- tick_nohz_idle_enter(); ++ if (unlikely(softirq_pending(cpu))) ++ pending = true; ++ else ++ tick_nohz_idle_enter(); + + while (!need_resched()) { + check_pgt_cache(); + rmb(); + +- if (cpu_is_offline(smp_processor_id())) { ++ if (cpu_is_offline(cpu)) { + cpuhp_report_idle_dead(); + arch_cpu_idle_dead(); + } +@@ -255,7 +261,8 @@ + * an IPI to fold the state for us. + */ + preempt_set_need_resched(); +- tick_nohz_idle_exit(); ++ if (!pending) ++ tick_nohz_idle_exit(); + __current_clr_polling(); + + /* +diff -Nur a/kernel/sched/Makefile b/kernel/sched/Makefile +--- a/kernel/sched/Makefile 2018-12-21 13:13:19.000000000 +0000 ++++ b/kernel/sched/Makefile 2019-01-05 20:22:51.099998516 +0000 +@@ -16,14 +16,20 @@ + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + +-obj-y += core.o loadavg.o clock.o cputime.o ++ifdef CONFIG_SCHED_MUQSS ++obj-y += MuQSS.o clock.o ++else ++obj-y += core.o loadavg.o clock.o + obj-y += idle_task.o fair.o rt.o deadline.o +-obj-y += wait.o wait_bit.o swait.o completion.o idle.o +-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o ++obj-$(CONFIG_SMP) += cpudeadline.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o ++endif ++obj-y += cputime.o ++obj-y += wait.o wait_bit.o swait.o completion.o idle.o ++obj-$(CONFIG_SMP) += cpupri.o topology.o ++obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o +diff -Nur a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +--- a/kernel/sched/MuQSS.c 1970-01-01 01:00:00.000000000 +0100 ++++ b/kernel/sched/MuQSS.c 2019-01-05 20:22:51.099998516 +0000 +@@ -0,0 +1,6923 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * kernel/sched/MuQSS.c, was kernel/sched.c ++ * ++ * Kernel scheduler and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and ++ * make semaphores SMP safe ++ * 1998-11-19 Implemented schedule_timeout() and related stuff ++ * by Andrea Arcangeli ++ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: ++ * hybrid priority-list and round-robin design with ++ * an array-switch method of distributing timeslices ++ * and per-CPU runqueues. Cleanups and useful suggestions ++ * by Davide Libenzi, preemptible kernel bits by Robert Love. ++ * 2003-09-03 Interactivity tuning by Con Kolivas. ++ * 2004-04-02 Scheduler domains code by Nick Piggin ++ * 2007-04-15 Work begun on replacing all interactivity tuning with a ++ * fair scheduling design by Con Kolivas. 
++ * 2007-05-05 Load balancing (smp-nice) and other improvements ++ * by Peter Williams ++ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith ++ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri ++ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, ++ * Thomas Gleixner, Mike Kravetz ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS ++ * scheduler by Con Kolivas. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#ifdef CONFIG_PARAVIRT ++#include ++#endif ++ ++#include "../workqueue_internal.h" ++#include "../smpboot.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++#include "MuQSS.h" ++ ++#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) ++#define rt_task(p) rt_prio((p)->prio) ++#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) ++#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR) ++#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) ++ ++#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) ++#define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) ++#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) ++ ++#define is_iso_policy(policy) ((policy) == SCHED_ISO) ++#define iso_task(p) unlikely(is_iso_policy((p)->policy)) ++#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) ++ ++#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) ++ ++#define ISO_PERIOD (5 * HZ) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. ++ */ ++#define JIFFIES_TO_NS(TIME) ((TIME) * (1073741824 / HZ)) ++#define JIFFY_NS (1073741824 / HZ) ++#define JIFFY_US (1048576 / HZ) ++#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) ++#define HALF_JIFFY_NS (1073741824 / HZ / 2) ++#define HALF_JIFFY_US (1048576 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++#define US_TO_NS(TIME) ((TIME) << 10) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many μs left */ ++ ++void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "MuQSS CPU scheduler v0.162 by Con Kolivas.\n"); ++} ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 6ms. ++ * Tunable via /proc interface. ++ */ ++int rr_interval __read_mostly = 6; ++ ++/* ++ * Tunable to choose whether to prioritise latency or throughput, simple ++ * binary yes or no ++ */ ++int sched_interactive __read_mostly = 1; ++ ++/* ++ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks ++ * are allowed to run five seconds as real time tasks. This is the total over ++ * all online cpus. ++ */ ++int sched_iso_cpu __read_mostly = 70; ++ ++/* ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Yield only to better priority/deadline tasks. (default) ++ * 2: Expire timeslice and recalculate deadline. 
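++ *
++ * Usage example (the sysctl name follows the MuQSS documentation and is
++ * not visible in this hunk): "sysctl -w kernel.yield_type=2" switches to
++ * the expire-and-recalculate behaviour at runtime.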
++ */ ++int sched_yield_type __read_mostly = 1; ++ ++/* ++ * The relative length of deadline for each priority(nice) level. ++ */ ++static int prio_ratios[NICE_WIDTH] __read_mostly; ++ ++/* ++ * The quota handed out to tasks of all priority levels when refilling their ++ * time_slice. ++ */ ++static inline int timeslice(void) ++{ ++ return MS_TO_US(rr_interval); ++} ++ ++#ifdef CONFIG_SMP ++static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp; ++#endif ++ ++/* CPUs with isolated domains */ ++cpumask_var_t cpu_isolated_map; ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#ifdef CONFIG_SMP ++struct rq *cpu_rq(int cpu) ++{ ++ return &per_cpu(runqueues, (cpu)); ++} ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++ ++/* ++ * For asym packing, by default the lower numbered cpu has higher priority. ++ */ ++int __weak arch_asym_cpu_priority(int cpu) ++{ ++ return -cpu; ++} ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++#else ++struct rq *uprq; ++#endif /* CONFIG_SMP */ ++ ++#include "stats.h" ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_switch ++# define finish_arch_switch(prev) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++/* ++ * All common locking functions performed on rq->lock. rq->clock is local to ++ * the CPU accessing it so it can be modified just with interrupts disabled ++ * when we're not updating niffies. ++ * Looking up task_rq must be done under rq->lock to be safe. ++ */ ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. ++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_key_false((¶virt_steal_rq_enabled))) { ++ s64 steal = paravirt_steal_clock(cpu_of(rq)); ++ ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ rq->prev_steal_time_rq += steal; ++ ++ delta -= steal; ++ } ++#endif ++ rq->clock_task += delta; ++} ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ if (unlikely(delta < 0)) ++ return; ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++/* ++ * Niffies are a globally increasing nanosecond counter. They're only used by ++ * update_load_avg and time_slice_expired, however deadlines are based on them ++ * across CPUs. 
Update them whenever we will call one of those functions, and ++ * synchronise them across CPUs whenever we hold both runqueue locks. ++ */ ++static inline void update_clocks(struct rq *rq) ++{ ++ s64 ndiff, minndiff; ++ long jdiff; ++ ++ update_rq_clock(rq); ++ ndiff = rq->clock - rq->old_clock; ++ rq->old_clock = rq->clock; ++ jdiff = jiffies - rq->last_jiffy; ++ ++ /* Subtract any niffies added by balancing with other rqs */ ++ ndiff -= rq->niffies - rq->last_niffy; ++ minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies; ++ if (minndiff < 0) ++ minndiff = 0; ++ ndiff = max(ndiff, minndiff); ++ rq->niffies += ndiff; ++ rq->last_niffy = rq->niffies; ++ if (jdiff) { ++ rq->last_jiffy += jdiff; ++ rq->last_jiffy_niffies = rq->niffies; ++ } ++} ++ ++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_MIGRATING; ++} ++ ++static inline int rq_trylock(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ return raw_spin_trylock(&rq->lock); ++} ++ ++/* ++ * Any time we have two runqueues locked we use that as an opportunity to ++ * synchronise niffies to the highest value as idle ticks may have artificially ++ * kept niffies low on one CPU and the truth can only be later. ++ */ ++static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2) ++{ ++ if (rq1->niffies > rq2->niffies) ++ rq2->niffies = rq1->niffies; ++ else ++ rq1->niffies = rq2->niffies; ++} ++ ++/* ++ * double_rq_lock - safely lock two runqueues ++ * ++ * Note this does not disable interrupts like task_rq_lock, ++ * you need to do so manually before calling. ++ */ ++ ++/* For when we know rq1 != rq2 */ ++static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2) ++ __acquires(rq1->lock) ++ __acquires(rq2->lock) ++{ ++ if (rq1 < rq2) { ++ raw_spin_lock(&rq1->lock); ++ raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); ++ } else { ++ raw_spin_lock(&rq2->lock); ++ raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); ++ } ++} ++ ++static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) ++ __acquires(rq1->lock) ++ __acquires(rq2->lock) ++{ ++ BUG_ON(!irqs_disabled()); ++ if (rq1 == rq2) { ++ raw_spin_lock(&rq1->lock); ++ __acquire(rq2->lock); /* Fake it out ;) */ ++ } else ++ __double_rq_lock(rq1, rq2); ++ synchronise_niffies(rq1, rq2); ++} ++ ++/* ++ * double_rq_unlock - safely unlock two runqueues ++ * ++ * Note this does not restore interrupts like task_rq_unlock, ++ * you need to do so manually after calling. 
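++ *
++ * A minimal usage sketch implied by this note and the matching note on
++ * double_rq_lock above:
++ *
++ *	local_irq_disable();
++ *	double_rq_lock(rq1, rq2);
++ *	... work on both runqueues ...
++ *	double_rq_unlock(rq1, rq2);
++ *	local_irq_enable();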
++ */ ++static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) ++ __releases(rq1->lock) ++ __releases(rq2->lock) ++{ ++ raw_spin_unlock(&rq1->lock); ++ if (rq1 != rq2) ++ raw_spin_unlock(&rq2->lock); ++ else ++ __release(rq2->lock); ++} ++ ++static inline void lock_all_rqs(void) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_lock(&rq->lock); ++ } ++} ++ ++static inline void unlock_all_rqs(void) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_unlock(&rq->lock); ++ } ++ preempt_enable(); ++} ++ ++/* Specially nest trylock an rq */ ++static inline bool trylock_rq(struct rq *this_rq, struct rq *rq) ++{ ++ if (unlikely(!do_raw_spin_trylock(&rq->lock))) ++ return false; ++ spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ synchronise_niffies(this_rq, rq); ++ return true; ++} ++ ++/* Unlock a specially nested trylocked rq */ ++static inline void unlock_rq(struct rq *rq) ++{ ++ spin_release(&rq->lock.dep_map, 1, _RET_IP_); ++ do_raw_spin_unlock(&rq->lock); ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. ++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * This cmpxchg() implies a full barrier, which pairs with the write ++ * barrier implied by the wakeup in wake_up_q(). ++ */ ++ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) ++ return; ++ ++ get_task_struct(task); ++ ++ /* ++ * The head is context local, there can be no concurrency. 
++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* Task can safely be re-inserted now */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() implies a wmb() to pair with the queueing ++ * in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ next->on_cpu = 1; ++} ++ ++static inline void smp_sched_reschedule(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ smp_send_reschedule(cpu); ++} ++ ++/* ++ * resched_task - mark a task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_task(struct task_struct *p) ++{ ++ int cpu; ++#ifdef CONFIG_LOCKDEP ++ /* Kernel threads call this when creating workqueues while still ++ * inactive from __kthread_bind_mask, holding only the pi_lock */ ++ if (!(p->flags & PF_KTHREAD)) { ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(&rq->lock); ++ } ++#endif ++ if (test_tsk_need_resched(p)) ++ return; ++ ++ cpu = task_cpu(p); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(p)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++/* ++ * A task that is not running or queued will not have a node set. ++ * A task that is queued but not running will have a node set. ++ * A task that is currently running will have ->on_cpu set but no node set. ++ */ ++static inline bool task_queued(struct task_struct *p) ++{ ++ return !skiplist_node_empty(&p->node); ++} ++ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags); ++static inline void resched_if_idle(struct rq *rq); ++ ++/* Dodgy workaround till we figure out where the softirqs are going */ ++static inline void do_pending_softirq(struct rq *rq, struct task_struct *next) ++{ ++ if (unlikely(next == rq->idle && local_softirq_pending() && !in_interrupt())) ++ do_softirq_own_stack(); ++} ++ ++static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->on_cpu is cleared, the task can be moved to a different CPU. ++ * We must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). ++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#endif ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock.owner = current; ++#endif ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); ++ ++#ifdef CONFIG_SMP ++ /* ++ * If prev was marked as migrating to another CPU in return_task, drop ++ * the local runqueue lock but leave interrupts disabled and grab the ++ * remote lock we're migrating it to before enabling them. 
++ */ ++ if (unlikely(task_on_rq_migrating(prev))) { ++ sched_info_dequeued(rq, prev); ++ /* ++ * We move the ownership of prev to the new cpu now. ttwu can't ++ * activate prev to the wrong cpu since it has to grab this ++ * runqueue in ttwu_remote. ++ */ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ prev->cpu = prev->wake_cpu; ++#else ++ task_thread_info(prev)->cpu = prev->wake_cpu; ++#endif ++ raw_spin_unlock(&rq->lock); ++ ++ raw_spin_lock(&prev->pi_lock); ++ rq = __task_rq_lock(prev); ++ /* Check that someone else hasn't already queued prev */ ++ if (likely(!task_queued(prev))) { ++ enqueue_task(rq, prev, 0); ++ prev->on_rq = TASK_ON_RQ_QUEUED; ++ /* Wake up the CPU if it's not already running */ ++ resched_if_idle(rq); ++ } ++ raw_spin_unlock(&prev->pi_lock); ++ } ++#endif ++ /* Accurately set nr_running here for load average calculations */ ++ rq->nr_running = rq->sl->entries + !rq_idle(rq); ++ rq_unlock(rq); ++ ++ do_pending_softirq(rq, current); ++ ++ local_irq_enable(); ++} ++ ++static inline bool deadline_before(u64 deadline, u64 time) ++{ ++ return (deadline < time); ++} ++ ++/* ++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline ++ * is the key to everything. It distributes cpu fairly amongst tasks of the ++ * same nice value, it proportions cpu according to nice level, it means the ++ * task that last woke up the longest ago has the earliest deadline, thus ++ * ensuring that interactive tasks get low latency on wake up. The CPU ++ * proportion works out to the square of the virtual deadline difference, so ++ * this equation will give nice 19 3% CPU compared to nice 0. ++ */ ++static inline u64 prio_deadline_diff(int user_prio) ++{ ++ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); ++} ++ ++static inline u64 task_deadline_diff(struct task_struct *p) ++{ ++ return prio_deadline_diff(TASK_USER_PRIO(p)); ++} ++ ++static inline u64 static_deadline_diff(int static_prio) ++{ ++ return prio_deadline_diff(USER_PRIO(static_prio)); ++} ++ ++static inline int longest_deadline_diff(void) ++{ ++ return prio_deadline_diff(39); ++} ++ ++static inline int ms_longest_deadline_diff(void) ++{ ++ return NS_TO_MS(longest_deadline_diff()); ++} ++ ++static inline bool rq_local(struct rq *rq); ++ ++#ifndef SCHED_CAPACITY_SCALE ++#define SCHED_CAPACITY_SCALE 1024 ++#endif ++ ++static inline int rq_load(struct rq *rq) ++{ ++ return rq->nr_running; ++} ++ ++/* ++ * Update the load average for feeding into cpu frequency governors. Use a ++ * rough estimate of a rolling average with ~ time constant of 32ms. ++ * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 ++ * Make sure a call to update_clocks has been made before calling this to get ++ * an updated rq->niffies. ++ */ ++static void update_load_avg(struct rq *rq, unsigned int flags) ++{ ++ unsigned long us_interval, curload; ++ long load; ++ ++ if (unlikely(rq->niffies <= rq->load_update)) ++ return; ++ ++ us_interval = NS_TO_US(rq->niffies - rq->load_update); ++ curload = rq_load(rq); ++ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); ++ if (unlikely(load < 0)) ++ load = 0; ++ load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; ++ rq->load_avg = load; ++ ++ rq->load_update = rq->niffies; ++ if (likely(rq_local(rq))) ++ cpufreq_trigger(rq, flags); ++} ++ ++/* ++ * Removing from the runqueue. Enter with rq locked. Deleting a task ++ * from the skip list is done via the stored node reference in the task struct ++ * and does not require a full look up. 
Thus it occurs in O(k) time where k ++ * is the "level" of the list the task was stored at - usually < 4, max 8. ++ */ ++static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ skiplist_delete(rq->sl, &p->node); ++ rq->best_key = rq->node.next[0]->key; ++ update_clocks(rq); ++ ++ if (!(flags & DEQUEUE_SAVE)) ++ sched_info_dequeued(task_rq(p), p); ++ update_load_avg(rq, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_RCU ++static bool rcu_read_critical(struct task_struct *p) ++{ ++ return p->rcu_read_unlock_special.b.blocked; ++} ++#else /* CONFIG_PREEMPT_RCU */ ++#define rcu_read_critical(p) (false) ++#endif /* CONFIG_PREEMPT_RCU */ ++ ++/* ++ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as ++ * an idle task, we ensure none of the following conditions are met. ++ */ ++static bool idleprio_suitable(struct task_struct *p) ++{ ++ return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) && ++ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); ++} ++ ++/* ++ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check ++ * that the iso_refractory flag is not set. ++ */ ++static inline bool isoprio_suitable(struct rq *rq) ++{ ++ return !rq->iso_refractory; ++} ++ ++/* ++ * Adding to the runqueue. Enter with rq locked. ++ */ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ unsigned int randseed, cflags = 0; ++ u64 sl_id; ++ ++ if (!rt_task(p)) { ++ /* Check it hasn't gotten rt from PI */ ++ if ((idleprio_task(p) && idleprio_suitable(p)) || ++ (iso_task(p) && isoprio_suitable(rq))) ++ p->prio = p->normal_prio; ++ else ++ p->prio = NORMAL_PRIO; ++ } ++ /* ++ * The sl_id key passed to the skiplist generates a sorted list. ++ * Realtime and sched iso tasks run FIFO so they only need be sorted ++ * according to priority. The skiplist will put tasks of the same ++ * key inserted later in FIFO order. Tasks of sched normal, batch ++ * and idleprio are sorted according to their deadlines. Idleprio ++ * tasks are offset by an impossibly large deadline value ensuring ++ * they get sorted into last positions, but still according to their ++ * own deadlines. This creates a "landscape" of skiplists running ++ * from priority 0 realtime in first place to the lowest priority ++ * idleprio tasks last. Skiplist insertion is an O(log n) process. ++ */ ++ if (p->prio <= ISO_PRIO) { ++ sl_id = p->prio; ++ cflags = SCHED_CPUFREQ_RT; ++ } else { ++ sl_id = p->deadline; ++ if (idleprio_task(p)) { ++ if (p->prio == IDLE_PRIO) ++ sl_id |= 0xF000000000000000; ++ else ++ sl_id += longest_deadline_diff(); ++ } ++ } ++ /* ++ * Some architectures don't have better than microsecond resolution ++ * so mask out ~microseconds as the random seed for skiplist insertion. ++ */ ++ update_clocks(rq); ++ if (!(flags & ENQUEUE_RESTORE)) ++ sched_info_queued(rq, p); ++ randseed = (rq->niffies >> 10) & 0xFFFFFFFF; ++ skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); ++ rq->best_key = rq->node.next[0]->key; ++ if (p->in_iowait) ++ cflags |= SCHED_CPUFREQ_IOWAIT; ++ update_load_avg(rq, cflags); ++} ++ ++/* ++ * Returns the relative length of deadline all compared to the shortest ++ * deadline which is that of nice -20. ++ */ ++static inline int task_prio_ratio(struct task_struct *p) ++{ ++ return prio_ratios[TASK_USER_PRIO(p)]; ++} ++ ++/* ++ * task_timeslice - all tasks of all priorities get the exact same timeslice ++ * length. 
CPU distribution is handled by giving different deadlines to ++ * tasks of different priorities. Use 128 as the base value for fast shifts. ++ */ ++static inline int task_timeslice(struct task_struct *p) ++{ ++ return (rr_interval * task_prio_ratio(p) / 128); ++} ++ ++#ifdef CONFIG_SMP ++/* Entered with rq locked */ ++static inline void resched_if_idle(struct rq *rq) ++{ ++ if (rq_idle(rq)) ++ resched_task(rq->curr); ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return (rq->cpu == smp_processor_id()); ++} ++#ifdef CONFIG_SMT_NICE ++static const cpumask_t *thread_cpumask(int cpu); ++ ++/* Find the best real time priority running on any SMT siblings of cpu and if ++ * none are running, the static priority of the best deadline task running. ++ * The lookups to the other runqueues is done lockless as the occasional wrong ++ * value would be harmless. */ ++static int best_smt_bias(struct rq *this_rq) ++{ ++ int other_cpu, best_bias = 0; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq = cpu_rq(other_cpu); ++ ++ if (rq_idle(rq)) ++ continue; ++ if (unlikely(!rq->online)) ++ continue; ++ if (!rq->rq_mm) ++ continue; ++ if (likely(rq->rq_smt_bias > best_bias)) ++ best_bias = rq->rq_smt_bias; ++ } ++ return best_bias; ++} ++ ++static int task_prio_bias(struct task_struct *p) ++{ ++ if (rt_task(p)) ++ return 1 << 30; ++ else if (task_running_iso(p)) ++ return 1 << 29; ++ else if (task_running_idle(p)) ++ return 0; ++ return MAX_PRIO - p->static_prio; ++} ++ ++static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq) ++{ ++ return true; ++} ++ ++static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule; ++ ++/* We've already decided p can run on CPU, now test if it shouldn't for SMT ++ * nice reasons. */ ++static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) ++{ ++ int best_bias, task_bias; ++ ++ /* Kernel threads always run */ ++ if (unlikely(!p->mm)) ++ return true; ++ if (rt_task(p)) ++ return true; ++ if (!idleprio_suitable(p)) ++ return true; ++ best_bias = best_smt_bias(this_rq); ++ /* The smt siblings are all idle or running IDLEPRIO */ ++ if (best_bias < 1) ++ return true; ++ task_bias = task_prio_bias(p); ++ if (task_bias < 1) ++ return false; ++ if (task_bias >= best_bias) ++ return true; ++ /* Dither 25% cpu of normal tasks regardless of nice difference */ ++ if (best_bias % 4 == 1) ++ return true; ++ /* Sorry, you lose */ ++ return false; ++} ++#else /* CONFIG_SMT_NICE */ ++#define smt_schedule(p, this_rq) (true) ++#endif /* CONFIG_SMT_NICE */ ++ ++static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) ++{ ++ set_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++/* ++ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to ++ * allow easy lookup of whether any suitable idle CPUs are available. ++ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the ++ * idle_cpus variable than to do a full bitmask check when we are busy. The ++ * bits are set atomically but read locklessly as occasional false positive / ++ * negative is harmless. 
++ */ ++static inline void set_cpuidle_map(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ atomic_set_cpu(cpu, &cpu_idle_map); ++} ++ ++static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) ++{ ++ clear_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++ atomic_clear_cpu(cpu, &cpu_idle_map); ++} ++ ++static bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return (cpumask_intersects(&p->cpus_allowed, &cpu_idle_map)); ++} ++ ++/* ++ * Resched current on rq. We don't know if rq is local to this CPU nor if it ++ * is locked so we do not use an intermediate variable for the task to avoid ++ * having it dereferenced. ++ */ ++static void resched_curr(struct rq *rq) ++{ ++ int cpu; ++ ++ if (test_tsk_need_resched(rq->curr)) ++ return; ++ ++ rq->preempt = rq->curr; ++ cpu = rq->cpu; ++ ++ /* We're doing this without holding the rq lock if it's not task_rq */ ++ ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(rq->curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(rq->curr)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++#define CPUIDLE_DIFF_THREAD (1) ++#define CPUIDLE_DIFF_CORE (2) ++#define CPUIDLE_CACHE_BUSY (4) ++#define CPUIDLE_DIFF_CPU (8) ++#define CPUIDLE_THREAD_BUSY (16) ++#define CPUIDLE_DIFF_NODE (32) ++ ++/* ++ * The best idle CPU is chosen according to the CPUIDLE ranking above where the ++ * lowest value would give the most suitable CPU to schedule p onto next. The ++ * order works out to be the following: ++ * ++ * Same thread, idle or busy cache, idle or busy threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads. ++ * Other core, same cache, busy threads. ++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. 
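++ *
++ * Worked example from the values above: an idle SMT sibling whose partner
++ * thread is busy ranks CPUIDLE_DIFF_THREAD | CPUIDLE_THREAD_BUSY = 1 | 16 = 17,
++ * while an idle core on the same cache with idle siblings ranks only
++ * CPUIDLE_DIFF_CORE = 2. The lowest ranking wins, so the fully idle core is
++ * preferred over sharing a core with a busy thread.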
++ */ ++static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) ++{ ++ int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | ++ CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | ++ CPUIDLE_DIFF_THREAD; ++ int cpu_tmp; ++ ++ if (cpumask_test_cpu(best_cpu, tmpmask)) ++ goto out; ++ ++ for_each_cpu(cpu_tmp, tmpmask) { ++ int ranking, locality; ++ struct rq *tmp_rq; ++ ++ ranking = 0; ++ tmp_rq = cpu_rq(cpu_tmp); ++ ++ locality = rq->cpu_locality[cpu_tmp]; ++#ifdef CONFIG_NUMA ++ if (locality > 3) ++ ranking |= CPUIDLE_DIFF_NODE; ++ else ++#endif ++ if (locality > 2) ++ ranking |= CPUIDLE_DIFF_CPU; ++#ifdef CONFIG_SCHED_MC ++ else if (locality == 2) ++ ranking |= CPUIDLE_DIFF_CORE; ++ else if (!(tmp_rq->cache_idle(tmp_rq))) ++ ranking |= CPUIDLE_CACHE_BUSY; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ if (locality == 1) ++ ranking |= CPUIDLE_DIFF_THREAD; ++ if (!(tmp_rq->siblings_idle(tmp_rq))) ++ ranking |= CPUIDLE_THREAD_BUSY; ++#endif ++ if (ranking < best_ranking) { ++ best_cpu = cpu_tmp; ++ best_ranking = ranking; ++ } ++ } ++out: ++ return best_cpu; ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ struct rq *this_rq = cpu_rq(this_cpu); ++ ++ return (this_rq->cpu_locality[that_cpu] < 3); ++} ++ ++/* As per resched_curr but only will resched idle task */ ++static inline void resched_idle(struct rq *rq) ++{ ++ if (test_tsk_need_resched(rq->idle)) ++ return; ++ ++ rq->preempt = rq->idle; ++ ++ set_tsk_need_resched(rq->idle); ++ ++ if (rq_local(rq)) { ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ smp_sched_reschedule(rq->cpu); ++} ++ ++static struct rq *resched_best_idle(struct task_struct *p, int cpu) ++{ ++ cpumask_t tmpmask; ++ struct rq *rq; ++ int best_cpu; ++ ++ cpumask_and(&tmpmask, &p->cpus_allowed, &cpu_idle_map); ++ best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask); ++ rq = cpu_rq(best_cpu); ++ if (!smt_schedule(p, rq)) ++ return NULL; ++ rq->preempt = p; ++ resched_idle(rq); ++ return rq; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++} ++ ++static inline struct rq *rq_order(struct rq *rq, int cpu) ++{ ++ return rq->rq_order[cpu]; ++} ++#else /* CONFIG_SMP */ ++static inline void set_cpuidle_map(int cpu) ++{ ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++} ++ ++static inline bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return uprq->curr == uprq->idle; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++} ++ ++static inline void resched_curr(struct rq *rq) ++{ ++ resched_task(rq->curr); ++} ++ ++static inline void resched_if_idle(struct rq *rq) ++{ ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return true; ++} ++ ++static inline struct rq *rq_order(struct rq *rq, int cpu) ++{ ++ return rq; ++} ++ ++static inline bool smt_schedule(struct task_struct *p, struct rq *rq) ++{ ++ return true; ++} ++#endif /* CONFIG_SMP */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ if (has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ if (idleprio_task(p)) ++ return IDLE_PRIO; ++ if (iso_task(p)) ++ return ISO_PRIO; ++ return NORMAL_PRIO; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. 
++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. Enter with rq locked. ++ */ ++static void activate_task(struct task_struct *p, struct rq *rq) ++{ ++ resched_if_idle(rq); ++ ++ /* ++ * Sleep time is in units of nanosecs, so shift by 20 to get a ++ * milliseconds-range estimation of the amount of time that the task ++ * spent sleeping: ++ */ ++ if (unlikely(prof_on == SLEEP_PROFILING)) { ++ if (p->state == TASK_UNINTERRUPTIBLE) ++ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), ++ (rq->niffies - p->last_ran) >> 20); ++ } ++ ++ p->prio = effective_prio(p); ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; ++ ++ enqueue_task(rq, p, 0); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++} ++ ++/* ++ * deactivate_task - If it's running, it's not on the runqueue and we can just ++ * decrement the nr_running. Enter with rq locked. ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; ++ ++ p->on_rq = 0; ++ sched_info_dequeued(rq, p); ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++ struct rq *rq; ++ ++ if (task_cpu(p) == new_cpu) ++ return; ++ ++ /* Do NOT call set_task_cpu on a currently queued task as we will not ++ * be reliably holding the rq lock after changing CPU. */ ++ BUG_ON(task_queued(p)); ++ rq = task_rq(p); ++ ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * Furthermore, all task_rq users should acquire both locks, see ++ * task_rq_lock(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(&rq->lock))); ++#endif ++ ++ trace_sched_migrate_task(p, new_cpu); ++ perf_event_task_migrate(p); ++ ++ /* ++ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ ++ p->wake_cpu = new_cpu; ++ ++ if (task_running(rq, p)) { ++ /* ++ * We should only be calling this on a running task if we're ++ * holding rq lock. ++ */ ++ lockdep_assert_held(&rq->lock); ++ ++ /* ++ * We can't change the task_thread_info CPU on a running task ++ * as p will still be protected by the rq lock of the CPU it ++ * is still running on so we only set the wake_cpu for it to be ++ * lazily updated once off the CPU. ++ */ ++ return; ++ } ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ p->cpu = new_cpu; ++#else ++ task_thread_info(p)->cpu = new_cpu; ++#endif ++ /* We're no longer protecting p after this point since we're holding ++ * the wrong runqueue lock. */ ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Move a task off the runqueue and take it to a cpu for it will ++ * become the running task. 
++ */ ++static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) ++{ ++ struct rq *p_rq = task_rq(p); ++ ++ dequeue_task(p_rq, p, DEQUEUE_SAVE); ++ if (p_rq != rq) { ++ sched_info_dequeued(p_rq, p); ++ sched_info_queued(rq, p); ++ } ++ set_task_cpu(p, cpu); ++} ++ ++/* ++ * Returns a descheduling task to the runqueue unless it is being ++ * deactivated. ++ */ ++static inline void return_task(struct task_struct *p, struct rq *rq, ++ int cpu, bool deactivate) ++{ ++ if (deactivate) ++ deactivate_task(p, rq); ++ else { ++#ifdef CONFIG_SMP ++ /* ++ * set_task_cpu was called on the running task that doesn't ++ * want to deactivate so it has to be enqueued to a different ++ * CPU and we need its lock. Tag it to be moved with as the ++ * lock is dropped in finish_lock_switch. ++ */ ++ if (unlikely(p->wake_cpu != cpu)) ++ p->on_rq = TASK_ON_RQ_MIGRATING; ++ else ++#endif ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ } ++} ++ ++/* Enter with rq lock held. We know p is on the local cpu */ ++static inline void __set_tsk_resched(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ int running, queued; ++ unsigned long flags; ++ unsigned long ncsw; ++ struct rq *rq; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(rq, p)) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ rq = task_rq_lock(p, &flags); ++ trace_sched_wait_task(p); ++ running = task_running(rq, p); ++ queued = task_on_rq_queued(p); ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_rq_unlock(rq, p, &flags); ++ ++ /* ++ * If it changed from the expected state, bail out now. 
++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(queued)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_sched_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++#endif ++ ++/* ++ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the ++ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or ++ * between themselves, they cooperatively multitask. An idle rq scores as ++ * prio PRIO_LIMIT so it is always preempted. ++ */ ++static inline bool ++can_preempt(struct task_struct *p, int prio, u64 deadline) ++{ ++ /* Better static priority RT task or better policy preemption */ ++ if (p->prio < prio) ++ return true; ++ if (p->prio > prio) ++ return false; ++ if (p->policy == SCHED_BATCH) ++ return false; ++ /* SCHED_NORMAL and ISO will preempt based on deadline */ ++ if (!deadline_before(p->deadline, deadline)) ++ return false; ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * Check to see if p can run on cpu, and if not, whether there are any online ++ * CPUs it can run on instead. 
++ */ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed))) ++ return true; ++ return false; ++} ++#define cpu_online_map (*(cpumask_t *)cpu_online_mask) ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ int i, this_entries = rq_load(this_rq); ++ cpumask_t tmp; ++ ++ if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p))) ++ return; ++ ++ /* IDLEPRIO tasks never preempt anything but idle */ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ ++ cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed); ++ ++ for (i = 0; i < num_possible_cpus(); i++) { ++ struct rq *rq = this_rq->rq_order[i]; ++ ++ if (!cpumask_test_cpu(rq->cpu, &tmp)) ++ continue; ++ ++ if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries) ++ continue; ++ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { ++ /* We set rq->preempting lockless, it's a hint only */ ++ rq->preempting = p; ++ resched_curr(rq); ++ return; ++ } ++ } ++} ++ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check); ++#else /* CONFIG_SMP */ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ return false; ++} ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) ++ resched_curr(uprq); ++} ++ ++static inline int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq = this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) ++ schedstat_inc(rq->ttwu_local); ++ else { ++ struct sched_domain *sd; ++ ++ rcu_read_lock(); ++ for_each_domain(rq->cpu, sd) { ++ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { ++ schedstat_inc(sd->ttwu_wake_remote); ++ break; ++ } ++ } ++ rcu_read_unlock(); ++ } ++ ++#endif /* CONFIG_SMP */ ++ ++ schedstat_inc(rq->ttwu_count); ++} ++ ++static inline void ttwu_activate(struct rq *rq, struct task_struct *p) ++{ ++ activate_task(p, rq); ++ ++ /* if a worker is waking up, notify the workqueue */ ++ if (p->flags & PF_WQ_WORKER) ++ wq_worker_waking_up(p, cpu_of(rq)); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ /* ++ * Sync wakeups (i.e. those types of wakeups where the waker ++ * has indicated that it will leave the CPU in short order) ++ * don't trigger a preemption if there are no idle cpus, ++ * instead waiting for current to deschedule. 
++ */ ++ if (wake_flags & WF_SYNC) ++ resched_suitable_idle(p); ++ else ++ try_preempt(p, rq); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++#ifdef CONFIG_SMP ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++#endif ++ ++ ttwu_activate(rq, p); ++ ttwu_do_wakeup(rq, p, wake_flags); ++} ++ ++/* ++ * Called in case the task @p isn't fully descheduled from its runqueue, ++ * in this case we must do a remote wakeup. Its a 'light' wakeup though, ++ * since all we need to do is flip p->state to TASK_RUNNING, since ++ * the task is still ->on_rq. ++ */ ++static int ttwu_remote(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ int ret = 0; ++ ++ rq = __task_rq_lock(p); ++ if (likely(task_on_rq_queued(p))) { ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_rq_unlock(rq); ++ ++ return ret; ++} ++ ++#ifdef CONFIG_SMP ++void sched_ttwu_pending(void) ++{ ++ struct rq *rq = this_rq(); ++ struct llist_node *llist = llist_del_all(&rq->wake_list); ++ struct task_struct *p, *t; ++ unsigned long flags; ++ ++ if (!llist) ++ return; ++ ++ rq_lock_irqsave(rq, &flags); ++ ++ llist_for_each_entry_safe(p, t, llist, wake_entry) ++ ttwu_do_activate(rq, p, 0); ++ ++ rq_unlock_irqrestore(rq, &flags); ++} ++ ++void scheduler_ipi(void) ++{ ++ /* ++ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting ++ * TIF_NEED_RESCHED remotely (for the first time) will also send ++ * this IPI. ++ */ ++ preempt_fold_need_resched(); ++ ++ if (llist_empty(&this_rq()->wake_list) && (!idle_cpu(smp_processor_id()) || need_resched())) ++ return; ++ ++ /* ++ * Not all reschedule IPI handlers call irq_enter/irq_exit, since ++ * traditionally all their work was done from the interrupt return ++ * path. Now that we actually do some work, we need to make sure ++ * we do call them. ++ * ++ * Some archs already do call them, luckily irq_enter/exit nest ++ * properly. ++ * ++ * Arguably we should visit all archs and update all handlers, ++ * however a fair share of IPIs are still resched only so this would ++ * somewhat pessimize the simple resched case. 
++ */ ++ irq_enter(); ++ sched_ttwu_pending(); ++ irq_exit(); ++} ++ ++static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { ++ if (!set_nr_if_polling(rq->idle)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++ } ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ rq_lock_irqsave(rq, &flags); ++ if (likely(is_idle_task(rq->curr))) ++ smp_sched_reschedule(cpu); ++ /* Else cpu is not in idle, do nothing here */ ++ rq_unlock_irqrestore(rq, &flags); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++static int valid_task_cpu(struct task_struct *p) ++{ ++ cpumask_t valid_mask; ++ ++ if (p->flags & PF_KTHREAD) ++ cpumask_and(&valid_mask, &p->cpus_allowed, cpu_online_mask); ++ else ++ cpumask_and(&valid_mask, &p->cpus_allowed, cpu_active_mask); ++ ++ if (unlikely(!cpumask_weight(&valid_mask))) { ++ /* Hotplug boot threads do this before the CPU is up */ ++ printk(KERN_INFO "SCHED: No cpumask for %s/%d weight %d\n", p->comm, p->pid, cpumask_weight(&p->cpus_allowed)); ++ return cpumask_any(&p->cpus_allowed); ++ } ++ return cpumask_any(&valid_mask); ++} ++ ++/* ++ * For a task that's just being woken up we have a valuable balancing ++ * opportunity so choose the nearest cache most lightly loaded runqueue. ++ * Entered with rq locked and returns with the chosen runqueue locked. ++ */ ++static inline int select_best_cpu(struct task_struct *p) ++{ ++ unsigned int idlest = ~0U; ++ struct rq *rq = NULL; ++ int i; ++ ++ if (suitable_idle_cpus(p)) { ++ int cpu = task_cpu(p); ++ ++ if (unlikely(needs_other_cpu(p, cpu))) ++ cpu = valid_task_cpu(p); ++ rq = resched_best_idle(p, cpu); ++ if (likely(rq)) ++ return rq->cpu; ++ } ++ ++ for (i = 0; i < num_possible_cpus(); i++) { ++ struct rq *other_rq = task_rq(p)->rq_order[i]; ++ int entries; ++ ++ if (!other_rq->online) ++ continue; ++ if (needs_other_cpu(p, other_rq->cpu)) ++ continue; ++ entries = rq_load(other_rq); ++ if (entries >= idlest) ++ continue; ++ idlest = entries; ++ rq = other_rq; ++ } ++ if (unlikely(!rq)) ++ return task_cpu(p); ++ return rq->cpu; ++} ++#else /* CONFIG_SMP */ ++static int valid_task_cpu(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static inline int select_best_cpu(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static struct rq *resched_best_idle(struct task_struct *p, int cpu) ++{ ++ return NULL; ++} ++#endif /* CONFIG_SMP */ ++ ++static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++#if defined(CONFIG_SMP) ++ if (!cpus_share_cache(smp_processor_id(), cpu)) { ++ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ ++ ttwu_queue_remote(p, cpu, wake_flags); ++ return; ++ } ++#endif ++ rq_lock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ rq_unlock(rq); ++} ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. 
The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int ++try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ++{ ++ unsigned long flags; ++ int cpu, success = 0; ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with mb() in ++ * set_current_state() the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ /* state is a volatile long, どうして、分からない */ ++ if (!((unsigned int)p->state & state)) ++ goto out; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ cpu = task_cpu(p); ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * [S] p->on_rq = 1; [L] P->state ++ * UNLOCK rq->lock -----. ++ * \ ++ * +--- RMB ++ * schedule() / ++ * LOCK rq->lock -----' ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq ++ * ++ * Pairs with the UNLOCK+LOCK on rq->lock from the ++ * last wakeup of our task and the schedule that got our task ++ * current. ++ */ ++ smp_rmb(); ++ if (p->on_rq && ttwu_remote(p, wake_flags)) ++ goto stat; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. ++ * ++ * [S] ->on_cpu = 1; [L] ->on_rq ++ * UNLOCK rq->lock ++ * RMB ++ * LOCK rq->lock ++ * [S] ->on_rq = 0; [L] ->on_cpu ++ * ++ * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock ++ * from the consecutive calls to schedule(); the first switching to our ++ * task, the second putting it to sleep. ++ */ ++ smp_rmb(); ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_lock_switch(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. ++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ p->sched_contributes_to_load = !!task_contributes_to_load(p); ++ p->state = TASK_WAKING; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ cpu = select_best_cpu(p); ++ if (task_cpu(p) != cpu) ++ set_task_cpu(p, cpu); ++ ++#else /* CONFIG_SMP */ ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++#endif /* CONFIG_SMP */ ++ ++ ttwu_queue(p, cpu, wake_flags); ++stat: ++ ttwu_stat(p, cpu, wake_flags); ++out: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return success; ++} ++ ++/** ++ * try_to_wake_up_local - try to wake up a local task with rq lock held ++ * @p: the thread to be awakened ++ * ++ * Put @p on the run-queue if it's not already there. 
The caller must ++ * ensure that rq is locked and, @p is not the current task. ++ * rq stays locked over invocation. ++ */ ++static void try_to_wake_up_local(struct task_struct *p) ++{ ++ struct rq *rq = task_rq(p); ++ ++ if (WARN_ON_ONCE(rq != this_rq()) || ++ WARN_ON_ONCE(p == current)) ++ return; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ if (!raw_spin_trylock(&p->pi_lock)) { ++ /* ++ * This is OK, because current is on_cpu, which avoids it being ++ * picked for load-balance and preemption/IRQs are still ++ * disabled avoiding further scheduler activity on it and we've ++ * not yet picked a replacement task. ++ */ ++ rq_unlock(rq); ++ raw_spin_lock(&p->pi_lock); ++ rq_lock(rq); ++ } ++ ++ if (!(p->state & TASK_NORMAL)) ++ goto out; ++ ++ trace_sched_waking(p); ++ ++ if (!task_on_rq_queued(p)) { ++ if (p->in_iowait) { ++ delayacct_blkio_end(); ++ atomic_dec(&rq->nr_iowait); ++ } ++ ttwu_activate(rq, p); ++ } ++ ++ ttwu_do_wakeup(rq, p, 0); ++ ttwu_stat(p, smp_processor_id(), 0); ++out: ++ raw_spin_unlock(&p->pi_lock); ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * It may be assumed that this function implies a write memory barrier before ++ * changing the task state if and only if any tasks are woken up. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++static void time_slice_expired(struct task_struct *p, struct rq *rq); ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. ++ */ ++int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ int cpu = get_cpu(); ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * The process state is set to the same value of the process executing ++ * do_fork() code. That is running. This guarantees that nobody will ++ * actually run it, and a signal or other external event cannot wake ++ * it up and insert it on the runqueue either. ++ */ ++ ++ /* Should be reset in fork.c but done here for ease of MuQSS patching */ ++ p->on_cpu = ++ p->on_rq = ++ p->utime = ++ p->stime = ++ p->sched_time = ++ p->stime_ns = ++ p->utime_ns = 0; ++ skiplist_node_init(&p->node); ++ ++ /* ++ * Revert to default priority/policy on fork if requested. ++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { ++ p->policy = SCHED_NORMAL; ++ p->normal_prio = normal_prio(p); ++ } ++ ++ if (PRIO_TO_NICE(p->static_prio) < 0) { ++ p->static_prio = NICE_TO_PRIO(0); ++ p->normal_prio = p->static_prio; ++ } ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ /* ++ * Silence PROVE_RCU. 
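++	 * The child is not yet visible to wakeups at this point, so the
++	 * lock is presumably taken to satisfy the lockdep annotations
++	 * inside set_task_cpu() rather than for real mutual exclusion.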
++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ set_task_cpu(p, cpu); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ put_cpu(); ++ return 0; ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. ++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p); ++ ++static void account_task_cpu(struct rq *rq, struct task_struct *p) ++{ ++ update_clocks(rq); ++ /* This isn't really a context switch but accounting is the same */ ++ update_cpu_clock_switch(rq, p); ++ p->last_ran = rq->niffies; ++} ++ ++bool sched_smp_initialized __read_mostly; ++ ++static inline int hrexpiry_enabled(struct rq *rq) ++{ ++ if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized)) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrexpiry_timer); ++} ++ ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++static inline void hrexpiry_clear(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ if (hrtimer_active(&rq->hrexpiry_timer)) ++ hrtimer_cancel(&rq->hrexpiry_timer); ++} ++ ++/* ++ * High-resolution time_slice expiry. ++ * Runs from hardirq context with interrupts disabled. ++ */ ++static enum hrtimer_restart hrexpiry(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrexpiry_timer); ++ struct task_struct *p; ++ ++ /* This can happen during CPU hotplug / resume */ ++ if (unlikely(cpu_of(rq) != smp_processor_id())) ++ goto out; ++ ++ /* ++ * We're doing this without the runqueue lock but this should always ++ * be run on the local CPU. 
Time slice should run out in __schedule ++ * but we set it to zero here in case niffies is slightly less. ++ */ ++ p = rq->curr; ++ p->time_slice = 0; ++ __set_tsk_resched(p); ++out: ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Called to set the hrexpiry timer state. ++ * ++ * called with irqs disabled from the local CPU only ++ */ ++static void hrexpiry_start(struct rq *rq, u64 delay) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ ++ hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED); ++} ++ ++static void init_rq_hrexpiry(struct rq *rq) ++{ ++ hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ rq->hrexpiry_timer.function = hrexpiry; ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return HALF_JIFFY_US; ++ return 0; ++} ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. ++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ struct task_struct *parent, *rq_curr; ++ struct rq *rq, *new_rq; ++ unsigned long flags; ++ ++ parent = p->parent; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ p->state = TASK_RUNNING; ++ /* Task_rq can't change yet on a new task */ ++ new_rq = rq = task_rq(p); ++ if (unlikely(needs_other_cpu(p, task_cpu(p)))) { ++ set_task_cpu(p, valid_task_cpu(p)); ++ new_rq = task_rq(p); ++ } ++ ++ double_rq_lock(rq, new_rq); ++ rq_curr = rq->curr; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = rq_curr->normal_prio; ++ ++ trace_sched_wakeup_new(p); ++ ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. If it's negative, it won't ++ * matter since that's the same as being 0. rq->rq_deadline is only ++ * modified within schedule() so it is always equal to ++ * current->deadline. ++ */ ++ account_task_cpu(rq, rq_curr); ++ p->last_ran = rq_curr->last_ran; ++ if (likely(rq_curr->policy != SCHED_FIFO)) { ++ rq_curr->time_slice /= 2; ++ if (rq_curr->time_slice < RESCHED_US) { ++ /* ++ * Forking task has run out of timeslice. Reschedule it and ++ * start its child with a new time slice and deadline. The ++ * child will end up running first because its deadline will ++ * be slightly earlier. ++ */ ++ __set_tsk_resched(rq_curr); ++ time_slice_expired(p, new_rq); ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++ else if (unlikely(rq != new_rq)) ++ try_preempt(p, new_rq); ++ } else { ++ p->time_slice = rq_curr->time_slice; ++ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { ++ /* ++ * The VM isn't cloned, so we're in a good position to ++ * do child-runs-first in anticipation of an exec. This ++ * usually avoids a lot of COW overhead. ++ */ ++ __set_tsk_resched(rq_curr); ++ } else { ++ /* ++ * Adjust the hrexpiry since rq_curr will keep ++ * running and its timeslice has been shortened. 
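++				 *
++				 * Worked example of the split above: a
++				 * parent with 4000us left keeps 2000us,
++				 * the child starts with the same 2000us,
++				 * and the timer is re-armed here for
++				 * US_TO_NS(2000), so the total pending
++				 * timeslice is unchanged.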
++ */ ++ hrexpiry_start(rq, US_TO_NS(rq_curr->time_slice)); ++ try_preempt(p, new_rq); ++ } ++ } ++ } else { ++ time_slice_expired(p, new_rq); ++ try_preempt(p, new_rq); ++ } ++ activate_task(p, new_rq); ++ double_rq_unlock(rq, new_rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE; ++ ++void preempt_notifier_inc(void) ++{ ++ static_key_slow_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_key_slow_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_key_false(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. ++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_key_false(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_key_false(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. 
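++ *
++ * Schematically, the pairing over a context switch in this patch is:
++ *
++ *	__schedule()
++ *	  context_switch()
++ *	    prepare_task_switch(rq, prev, next);
++ *	    switch_to(prev, next, prev);
++ *	    finish_task_switch(prev);	<-- runs in the *next* task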
++ */
++static inline void
++prepare_task_switch(struct rq *rq, struct task_struct *prev,
++		    struct task_struct *next)
++{
++	sched_info_switch(rq, prev, next);
++	perf_event_task_sched_out(prev, next);
++	fire_sched_out_preempt_notifiers(prev, next);
++	prepare_lock_switch(rq, next);
++	prepare_arch_switch(next);
++}
++
++/**
++ * finish_task_switch - clean up after a task-switch
++ * @rq: runqueue associated with task-switch
++ * @prev: the thread we just switched away from.
++ *
++ * finish_task_switch must be called after the context switch, paired
++ * with a prepare_task_switch call before the context switch.
++ * finish_task_switch will reconcile locking set up by prepare_task_switch,
++ * and do any other architecture-specific cleanup actions.
++ *
++ * Note that we may have delayed dropping an mm in context_switch(). If
++ * so, we finish that here outside of the runqueue lock. (Doing it
++ * with the lock held can cause deadlocks; see schedule() for
++ * details.)
++ *
++ * The context switch has flipped the stack from under us and restored the
++ * local variables which were saved when this task called schedule() in the
++ * past. prev == current is still correct but we need to recalculate this_rq
++ * because prev may have moved to another CPU.
++ */
++static void finish_task_switch(struct task_struct *prev)
++	__releases(rq->lock)
++{
++	struct rq *rq = this_rq();
++	struct mm_struct *mm = rq->prev_mm;
++	long prev_state;
++
++	/*
++	 * The previous task will have left us with a preempt_count of 2
++	 * because it left us after:
++	 *
++	 *	schedule()
++	 *	  preempt_disable();			// 1
++	 *	  __schedule()
++	 *	    raw_spin_lock_irq(&rq->lock)	// 2
++	 *
++	 * Also, see FORK_PREEMPT_COUNT.
++	 */
++	if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
++		      "corrupted preempt_count: %s/%d/0x%x\n",
++		      current->comm, current->pid, preempt_count()))
++		preempt_count_set(FORK_PREEMPT_COUNT);
++
++	rq->prev_mm = NULL;
++
++	/*
++	 * A task struct has one reference for the use as "current".
++	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
++	 * schedule one last time. The schedule call will never return, and
++	 * the scheduled task must drop that reference.
++	 *
++	 * We must observe prev->state before clearing prev->on_cpu (in
++	 * finish_lock_switch), otherwise a concurrent wakeup can get prev
++	 * running on another CPU and we could race with its RUNNING -> DEAD
++	 * transition, resulting in a double drop.
++	 */
++	prev_state = prev->state;
++	vtime_task_switch(prev);
++	perf_event_task_sched_in(prev, current);
++	/*
++	 * The membarrier system call requires a full memory barrier
++	 * after storing to rq->curr, before going back to user-space.
++	 *
++	 * TODO: This smp_mb__after_unlock_lock can go away if PPC end
++	 * up adding a full barrier to switch_mm(), or we should figure
++	 * out if a smp_mb__after_unlock_lock is really the proper API
++	 * to use.
++	 */
++	smp_mb__after_unlock_lock();
++	finish_lock_switch(rq, prev);
++	finish_arch_post_lock_switch();
++
++	fire_sched_in_preempt_notifiers(current);
++	if (mm)
++		mmdrop(mm);
++	if (unlikely(prev_state == TASK_DEAD)) {
++		/*
++		 * Remove function-return probe instances associated with this
++		 * task and put them back on the free list.
++		 */
++		kprobe_flush_task(prev);
++
++		/* Task is done with its stack. */
++		put_task_stack(prev);
++
++		put_task_struct(prev);
++	}
++}
++
++/**
++ * schedule_tail - first thing a freshly forked thread must call.
++ * @prev: the thread we just switched away from.
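++ *
++ * (Context note: this is invoked from the architecture's ret_from_fork
++ * path the first time a newly forked task gets to run.)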
++ */
++asmlinkage __visible void schedule_tail(struct task_struct *prev)
++{
++	/*
++	 * New tasks start with FORK_PREEMPT_COUNT, see there and
++	 * finish_task_switch() for details.
++	 *
++	 * finish_task_switch() will drop rq->lock() and lower preempt_count
++	 * and the preempt_enable() will end up enabling preemption (on
++	 * PREEMPT_COUNT kernels).
++	 */
++
++	finish_task_switch(prev);
++	preempt_enable();
++
++	if (current->set_child_tid)
++		put_user(task_pid_vnr(current), current->set_child_tid);
++}
++
++/*
++ * context_switch - switch to the new MM and the new thread's register state.
++ */
++static __always_inline void
++context_switch(struct rq *rq, struct task_struct *prev,
++	       struct task_struct *next)
++{
++	struct mm_struct *mm, *oldmm;
++
++	prepare_task_switch(rq, prev, next);
++
++	mm = next->mm;
++	oldmm = prev->active_mm;
++	/*
++	 * For paravirt, this is coupled with an exit in switch_to to
++	 * combine the page table reload and the switch backend into
++	 * one hypercall.
++	 */
++	arch_start_context_switch(prev);
++
++	if (!mm) {
++		next->active_mm = oldmm;
++		mmgrab(oldmm);
++		enter_lazy_tlb(oldmm, next);
++	} else
++		switch_mm_irqs_off(oldmm, mm, next);
++
++	if (!prev->mm) {
++		prev->active_mm = NULL;
++		rq->prev_mm = oldmm;
++	}
++	/*
++	 * Since the runqueue lock will be released by the next
++	 * task (which is an invalid locking op but in the case
++	 * of the scheduler it's an obvious special-case), we
++	 * do an early lockdep release here:
++	 */
++	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
++
++	/* Here we just switch the register state and the stack. */
++	switch_to(prev, next, prev);
++	barrier();
++
++	finish_task_switch(prev);
++}
++
++/*
++ * nr_running, nr_uninterruptible and nr_context_switches:
++ *
++ * externally visible scheduler statistics: current number of runnable
++ * threads, total number of context switches performed since bootup.
++ */
++unsigned long nr_running(void)
++{
++	unsigned long i, sum = 0;
++
++	for_each_online_cpu(i)
++		sum += cpu_rq(i)->nr_running;
++
++	return sum;
++}
++
++static unsigned long nr_uninterruptible(void)
++{
++	unsigned long i, sum = 0;
++
++	for_each_online_cpu(i)
++		sum += cpu_rq(i)->nr_uninterruptible;
++
++	return sum;
++}
++
++/*
++ * Check if only the current task is running on the CPU.
++ *
++ * Caution: this function does not check that the caller has disabled
++ * preemption, thus the result might have a time-of-check-to-time-of-use
++ * race. The caller is responsible to use it correctly, for example:
++ *
++ * - from a non-preemptable section (of course)
++ *
++ * - from a thread that is bound to a single CPU
++ *
++ * - in a loop with very short iterations (e.g. a polling loop)
++ */
++bool single_task_running(void)
++{
++	struct rq *rq = cpu_rq(smp_processor_id());
++
++	if (rq_load(rq) == 1)
++		return true;
++	else
++		return false;
++}
++EXPORT_SYMBOL(single_task_running);
++
++unsigned long long nr_context_switches(void)
++{
++	int i;
++	unsigned long long sum = 0;
++
++	for_each_possible_cpu(i)
++		sum += cpu_rq(i)->nr_switches;
++
++	return sum;
++}
++
++/*
++ * IO-wait accounting, and how it's mostly bollocks (on SMP).
++ *
++ * The idea behind IO-wait accounting is to account the idle time that we could
++ * have spent running if it were not for IO. That is, if we were to improve the
++ * storage performance, we'd have a proportional reduction in IO-wait time.
++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. ++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += atomic_read(&cpu_rq(i)->nr_iowait); ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpufreq menu ++ * governor are using nonsensical data. Boosting frequency for a CPU that has ++ * IO-wait which might not even end up running the task when it does become ++ * runnable. ++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ struct rq *this = cpu_rq(cpu); ++ return atomic_read(&this->nr_iowait); ++} ++ ++unsigned long nr_active(void) ++{ ++ return nr_running() + nr_uninterruptible(); ++} ++ ++/* ++ * I/O wait is the number of running or queued tasks with their ->rq pointer ++ * set to this cpu as being the CPU they're more likely to run on. ++ */ ++void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) ++{ ++ struct rq *rq = this_rq(); ++ ++ *nr_waiters = atomic_read(&rq->nr_iowait); ++ *load = rq_load(rq); ++} ++ ++/* Variables and functions for calc_load */ ++static unsigned long calc_load_update; ++unsigned long avenrun[3]; ++EXPORT_SYMBOL(avenrun); ++ ++/** ++ * get_avenrun - get the load average array ++ * @loads: pointer to dest load array ++ * @offset: offset to add ++ * @shift: shift count to shift the result left ++ * ++ * These values are estimates at best, so no need for locking. ++ */ ++void get_avenrun(unsigned long *loads, unsigned long offset, int shift) ++{ ++ loads[0] = (avenrun[0] + offset) << shift; ++ loads[1] = (avenrun[1] + offset) << shift; ++ loads[2] = (avenrun[2] + offset) << shift; ++} ++ ++static unsigned long ++calc_load(unsigned long load, unsigned long exp, unsigned long active) ++{ ++ unsigned long newload; ++ ++ newload = load * exp + active * (FIXED_1 - exp); ++ if (active >= load) ++ newload += FIXED_1-1; ++ ++ return newload / FIXED_1; ++} ++ ++/* ++ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. 
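++ *
++ * Worked example of one update, using the constants from
++ * include/linux/sched/loadavg.h (FIXED_1 = 2048, EXP_1 = 1884):
++ * starting from avenrun[0] = 0 with one runnable task, active = 2048:
++ *
++ *	newload  = 0 * 1884 + 2048 * (2048 - 1884) = 335872
++ *	newload += FIXED_1 - 1;		(since active >= load)
++ *	newload / FIXED_1 = 164
++ *
++ * so the one-minute average reads 164/2048 ~= 0.08 after the first
++ * LOAD_FREQ (5 second) interval.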
++ */ ++void calc_global_load(unsigned long ticks) ++{ ++ long active; ++ ++ if (time_before(jiffies, READ_ONCE(calc_load_update))) ++ return; ++ active = nr_active() * FIXED_1; ++ ++ avenrun[0] = calc_load(avenrun[0], EXP_1, active); ++ avenrun[1] = calc_load(avenrun[1], EXP_5, active); ++ avenrun[2] = calc_load(avenrun[2], EXP_15, active); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++} ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++#ifdef CONFIG_PARAVIRT ++static inline u64 steal_ticks(u64 steal) ++{ ++ if (unlikely(steal > NSEC_PER_SEC)) ++ return div_u64(steal, TICK_NSEC); ++ ++ return __iter_div_u64_rem(steal, TICK_NSEC, &steal); ++} ++#endif ++ ++#ifndef nsecs_to_cputime ++# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) ++#endif ++ ++/* ++ * On each tick, add the number of nanoseconds to the unbanked variables and ++ * once one tick's worth has accumulated, account it allowing for accurate ++ * sub-tick accounting and totals. ++ */ ++static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ if (atomic_read(&rq->nr_iowait) > 0) { ++ rq->iowait_ns += ns; ++ if (rq->iowait_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->iowait_ns); ++ cpustat[CPUTIME_IOWAIT] += (__force u64)TICK_NSEC * ticks; ++ rq->iowait_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->idle_ns += ns; ++ if (rq->idle_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->idle_ns); ++ cpustat[CPUTIME_IDLE] += (__force u64)TICK_NSEC * ticks; ++ rq->idle_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(idle); ++} ++ ++static void pc_system_time(struct rq *rq, struct task_struct *p, ++ int hardirq_offset, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ p->stime_ns += ns; ++ if (p->stime_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(p->stime_ns); ++ p->stime_ns %= JIFFY_NS; ++ p->stime += (__force u64)TICK_NSEC * ticks; ++ account_group_system_time(p, TICK_NSEC * ticks); ++ } ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ if (hardirq_count() - hardirq_offset) { ++ rq->irq_ns += ns; ++ if (rq->irq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->irq_ns); ++ cpustat[CPUTIME_IRQ] += (__force u64)TICK_NSEC * ticks; ++ rq->irq_ns %= JIFFY_NS; ++ } ++ } else if (in_serving_softirq()) { ++ rq->softirq_ns += ns; ++ if (rq->softirq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->softirq_ns); ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_NSEC * ticks; ++ rq->softirq_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->system_ns += ns; ++ if (rq->system_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->system_ns); ++ cpustat[CPUTIME_SYSTEM] += (__force u64)TICK_NSEC * ticks; ++ rq->system_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(p); ++} ++ ++static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ p->utime_ns += ns; ++ if (p->utime_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(p->utime_ns); ++ p->utime_ns %= JIFFY_NS; ++ p->utime += (__force u64)TICK_NSEC * ticks; ++ account_group_user_time(p, TICK_NSEC * ticks); ++ } ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ if (this_cpu_ksoftirqd() == p) { ++ /* ++ * ksoftirqd time do not get accounted in cpu_softirq_time. ++ * So, we have to handle it separately here. 
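++	 *
++	 * The same nanosecond banking as the other buckets applies here
++	 * (illustrative numbers): at HZ=100, JIFFY_NS is 10ms, so four
++	 * ksoftirqd stints of 2.5ms accumulate 10ms in softirq_ns before
++	 * one full tick (TICK_NSEC) is charged to CPUTIME_SOFTIRQ, with
++	 * any remainder staying banked in softirq_ns.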
++ */ ++ rq->softirq_ns += ns; ++ if (rq->softirq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->softirq_ns); ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_NSEC * ticks; ++ rq->softirq_ns %= JIFFY_NS; ++ } ++ } ++ ++ if (task_nice(p) > 0 || idleprio_task(p)) { ++ rq->nice_ns += ns; ++ if (rq->nice_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->nice_ns); ++ cpustat[CPUTIME_NICE] += (__force u64)TICK_NSEC * ticks; ++ rq->nice_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->user_ns += ns; ++ if (rq->user_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->user_ns); ++ cpustat[CPUTIME_USER] += (__force u64)TICK_NSEC * ticks; ++ rq->user_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(p); ++} ++ ++/* ++ * This is called on clock ticks. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. ++ */ ++static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p) ++{ ++ s64 account_ns = rq->niffies - p->last_ran; ++ struct task_struct *idle = rq->idle; ++ ++ /* Accurate tick timekeeping */ ++ if (user_mode(get_irq_regs())) ++ pc_user_time(rq, p, account_ns); ++ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) { ++ pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns); ++ } else ++ pc_idle_time(rq, idle, account_ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p->policy != SCHED_FIFO && p != idle) ++ p->time_slice -= NS_TO_US(account_ns); ++ ++ p->last_ran = rq->niffies; ++} ++ ++/* ++ * This is called on context switches. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. ++ */ ++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p) ++{ ++ s64 account_ns = rq->niffies - p->last_ran; ++ struct task_struct *idle = rq->idle; ++ ++ /* Accurate subtick timekeeping */ ++ if (p != idle) ++ pc_user_time(rq, p, account_ns); ++ else ++ pc_idle_time(rq, idle, account_ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p->policy != SCHED_FIFO && p != idle) ++ p->time_slice -= NS_TO_US(account_ns); ++} ++ ++/* ++ * Return any ns on the sched_clock that have not yet been accounted in ++ * @p in case that task is currently running. ++ * ++ * Called with task_rq_lock(p) held. ++ */ ++static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) ++{ ++ u64 ns = 0; ++ ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_clocks(rq); ++ ns = rq->niffies - p->last_ran; ++ } ++ ++ return ns; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * Return separately the current's pending runtime that have not been ++ * accounted yet. ++ * ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64bit value. ++ * So we have a optimization chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. ++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. 
++	 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
++	 * been accounted, so we're correct here as well.
++	 */
++	if (!p->on_cpu || !task_on_rq_queued(p))
++		return tsk_seruntime(p);
++#endif
++
++	rq = task_rq_lock(p, &flags);
++	ns = p->sched_time + do_task_delta_exec(p, rq);
++	task_rq_unlock(rq, p, &flags);
++
++	return ns;
++}
++
++/*
++ * Functions to test for when SCHED_ISO tasks have used their allocated
++ * quota as real time scheduling and convert them back to SCHED_NORMAL. All
++ * data is modified only by the local runqueue during scheduler_tick with
++ * interrupts disabled.
++ */
++
++/*
++ * Test if SCHED_ISO tasks have run longer than their allotted period as RT
++ * tasks and set the refractory flag if necessary. There is 10% hysteresis
++ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a
++ * slow division.
++ */
++static inline void iso_tick(struct rq *rq)
++{
++	rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD;
++	rq->iso_ticks += 100;
++	if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) {
++		rq->iso_refractory = true;
++		if (unlikely(rq->iso_ticks > ISO_PERIOD * 100))
++			rq->iso_ticks = ISO_PERIOD * 100;
++	}
++}
++
++/* No SCHED_ISO task was running so decrease rq->iso_ticks */
++static inline void no_iso_tick(struct rq *rq, int ticks)
++{
++	if (rq->iso_ticks > 0 || rq->iso_refractory) {
++		rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD;
++		if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) {
++			rq->iso_refractory = false;
++			if (unlikely(rq->iso_ticks < 0))
++				rq->iso_ticks = 0;
++		}
++	}
++}
++
++/* This manages tasks that have run out of timeslice during a scheduler_tick */
++static void task_running_tick(struct rq *rq)
++{
++	struct task_struct *p = rq->curr;
++
++	/*
++	 * If a SCHED_ISO task is running we increment the iso_ticks. In
++	 * order to prevent SCHED_ISO tasks from causing starvation in the
++	 * presence of true RT tasks we account those as iso_ticks as well.
++	 */
++	if (rt_task(p) || task_running_iso(p))
++		iso_tick(rq);
++	else
++		no_iso_tick(rq, 1);
++
++	/* SCHED_FIFO tasks never run out of timeslice. */
++	if (p->policy == SCHED_FIFO)
++		return;
++
++	if (iso_task(p)) {
++		if (task_running_iso(p)) {
++			if (rq->iso_refractory) {
++				/*
++				 * SCHED_ISO task is running as RT and limit
++				 * has been hit. Force it to reschedule as
++				 * SCHED_NORMAL by zeroing its time_slice.
++				 */
++				p->time_slice = 0;
++			}
++		} else if (!rq->iso_refractory) {
++			/* Can now run as ISO again. Reschedule to pick up prio */
++			goto out_resched;
++		}
++	}
++
++	/*
++	 * Tasks that were scheduled in the first half of a tick are not
++	 * allowed to run into the 2nd half of the next tick if they will
++	 * run out of time slice in the interim. Otherwise, if they have
++	 * less than RESCHED_US μs of time slice left they will be rescheduled.
++	 * Dither is used as a backup for when hrexpiry is disabled or high
++	 * resolution timers are not configured in.
++	 */
++	if (p->time_slice - rq->dither >= RESCHED_US)
++		return;
++out_resched:
++	rq_lock(rq);
++	__set_tsk_resched(p);
++	rq_unlock(rq);
++}
++
++#ifdef CONFIG_NO_HZ_FULL
++/*
++ * We can stop the timer tick any time highres timers are active since
++ * we rely entirely on highres timeouts for task expiry rescheduling.
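++ *
++ * For illustration (an example, not from this patch): with
++ * CONFIG_NO_HZ_FULL and a command line such as nohz_full=2-3,
++ * tick_nohz_full_cpu() is true for CPUs 2-3 and the tick can be stopped
++ * there while the hrexpiry timer handles time slice expiry.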
++ */ ++static void sched_stop_tick(struct rq *rq, int cpu) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ if (!tick_nohz_full_enabled()) ++ return; ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++ ++static inline void sched_start_tick(struct rq *rq, int cpu) ++{ ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++ ++/** ++ * scheduler_tick_max_deferment ++ * ++ * Keep at least one tick per second when a single ++ * active task is running. ++ * ++ * This makes sure that uptime continues to move forward, even ++ * with a very low granularity. ++ * ++ * Return: Maximum deferment in nanoseconds. ++ */ ++u64 scheduler_tick_max_deferment(void) ++{ ++ struct rq *rq = this_rq(); ++ unsigned long next, now = READ_ONCE(jiffies); ++ ++ next = rq->last_jiffy + HZ; ++ ++ if (time_before_eq(next, now)) ++ return 0; ++ ++ return jiffies_to_nsecs(next - now); ++} ++#else ++static inline void sched_stop_tick(struct rq *rq, int cpu) ++{ ++} ++ ++static inline void sched_start_tick(struct rq *rq, int cpu) ++{ ++} ++#endif ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. ++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ sched_clock_tick(); ++ update_clocks(rq); ++ update_load_avg(rq, 0); ++ update_cpu_clock_tick(rq, rq->curr); ++ if (!rq_idle(rq)) ++ task_running_tick(rq); ++ else if (rq->last_jiffy > rq->last_scheduler_tick) ++ no_iso_tick(rq, rq->last_jiffy - rq->last_scheduler_tick); ++ rq->last_scheduler_tick = rq->last_jiffy; ++ rq->last_tick = rq->clock; ++ perf_event_task_tick(); ++ sched_stop_tick(rq, cpu); ++} ++ ++#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_PREEMPT_TRACER)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? 
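++	 * (preempt_count is a bitfield; bits 0-7, PREEMPT_MASK
++	 * 0x000000ff, hold the preempt/spinlock nesting depth, which is
++	 * the portion being range-checked here.)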
++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * The time_slice is only refilled when it is empty and that is when we set a ++ * new deadline. Make sure update_clocks has been called recently to update ++ * rq->niffies. ++ */ ++static void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ p->time_slice = timeslice(); ++ p->deadline = rq->niffies + task_deadline_diff(p); ++#ifdef CONFIG_SMT_NICE ++ if (!p->mm) ++ p->smt_bias = 0; ++ else if (rt_task(p)) ++ p->smt_bias = 1 << 30; ++ else if (task_running_iso(p)) ++ p->smt_bias = 1 << 29; ++ else if (idleprio_task(p)) { ++ if (task_running_idle(p)) ++ p->smt_bias = 0; ++ else ++ p->smt_bias = 1; ++ } else if (--p->smt_bias < 1) ++ p->smt_bias = MAX_PRIO - p->static_prio; ++#endif ++} ++ ++/* ++ * Timeslices below RESCHED_US are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. SCHED_BATCH tasks ++ * have been flagged be not latency sensitive and likely to be fully CPU ++ * bound so every time they're rescheduled they have their time_slice ++ * refilled, but get a new later deadline to have little effect on ++ * SCHED_NORMAL tasks. ++ ++ */ ++static inline void check_deadline(struct task_struct *p, struct rq *rq) ++{ ++ if (p->time_slice < RESCHED_US || batch_task(p)) ++ time_slice_expired(p, rq); ++} ++ ++/* ++ * Task selection with skiplists is a simple matter of picking off the first ++ * task in the sorted list, an O(1) operation. The lookup is amortised O(1) ++ * being bound to the number of processors. ++ * ++ * Runqueues are selectively locked based on their unlocked data and then ++ * unlocked if not needed. At most 3 locks will be held at any time and are ++ * released as soon as they're no longer needed. All balancing between CPUs ++ * is thus done here in an extremely simple first come best fit manner. ++ * ++ * This iterates over runqueues in cache locality order. In interactive mode ++ * it iterates over all CPUs and finds the task with the best key/deadline. ++ * In non-interactive mode it will only take a task if it's from the current ++ * runqueue or a runqueue with more tasks than the current one with a better ++ * key/deadline. ++ */ ++#ifdef CONFIG_SMP ++static inline struct task_struct ++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ struct rq *locked = NULL, *chosen = NULL; ++ struct task_struct *edt = idle; ++ int i, best_entries = 0; ++ u64 best_key = ~0ULL; ++ ++ for (i = 0; i < num_possible_cpus(); i++) { ++ struct rq *other_rq = rq_order(rq, i); ++ int entries = other_rq->sl->entries; ++ skiplist_node *next; ++ ++ /* ++ * Check for queued entres lockless first. The local runqueue ++ * is locked so entries will always be accurate. ++ */ ++ if (!sched_interactive) { ++ /* ++ * Don't reschedule balance across nodes unless the CPU ++ * is idle. 
++ */ ++ if (edt != idle && rq->cpu_locality[other_rq->cpu] > 3) ++ break; ++ if (entries <= best_entries) ++ continue; ++ } else if (!entries) ++ continue; ++ ++ /* if (i) implies other_rq != rq */ ++ if (i) { ++ /* Check for best id queued lockless first */ ++ if (other_rq->best_key >= best_key) ++ continue; ++ ++ if (unlikely(!trylock_rq(rq, other_rq))) ++ continue; ++ ++ /* Need to reevaluate entries after locking */ ++ entries = other_rq->sl->entries; ++ if (unlikely(!entries)) { ++ unlock_rq(other_rq); ++ continue; ++ } ++ } ++ ++ next = &other_rq->node; ++ /* ++ * In interactive mode we check beyond the best entry on other ++ * runqueues if we can't get the best for smt or affinity ++ * reasons. ++ */ ++ while ((next = next->next[0]) != &other_rq->node) { ++ struct task_struct *p; ++ u64 key = next->key; ++ ++ /* Reevaluate key after locking */ ++ if (key >= best_key) ++ break; ++ ++ p = next->value; ++ if (!smt_schedule(p, rq)) { ++ if (i && !sched_interactive) ++ break; ++ continue; ++ } ++ ++ /* Make sure affinity is ok */ ++ if (i) { ++ if (needs_other_cpu(p, cpu)) { ++ if (sched_interactive) ++ continue; ++ break; ++ } ++ /* From this point on p is the best so far */ ++ if (locked) ++ unlock_rq(locked); ++ chosen = locked = other_rq; ++ } ++ best_entries = entries; ++ best_key = key; ++ edt = p; ++ break; ++ } ++ /* rq->preempting is a hint only as the state may have changed ++ * since it was set with the resched call but if we have met ++ * the condition we can break out here. */ ++ if (edt == rq->preempting) ++ break; ++ if (i && other_rq != chosen) ++ unlock_rq(other_rq); ++ } ++ ++ if (likely(edt != idle)) ++ take_task(rq, cpu, edt); ++ ++ if (locked) ++ unlock_rq(locked); ++ ++ rq->preempting = NULL; ++ ++ return edt; ++} ++#else /* CONFIG_SMP */ ++static inline struct task_struct ++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ struct task_struct *edt; ++ ++ if (unlikely(!rq->sl->entries)) ++ return idle; ++ edt = rq->node.next[0]->value; ++ take_task(rq, cpu, edt); ++ return edt; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++/* ++ * The currently running task's information is all stored in rq local data ++ * which is only modified by the local CPU. 
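++ *
++ * For instance, rq->rq_prio and rq->rq_deadline set below mirror
++ * p->prio and p->deadline so that try_preempt() on other CPUs can
++ * inspect them locklessly, in line with the hint-only locking model
++ * used elsewhere in this patch.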
++ */ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ if (p == rq->idle || p->policy == SCHED_FIFO) ++ hrexpiry_clear(rq); ++ else ++ hrexpiry_start(rq, US_TO_NS(p->time_slice)); ++ if (rq->clock - rq->last_tick > HALF_JIFFY_NS) ++ rq->dither = 0; ++ else ++ rq->dither = rq_dither(rq); ++ ++ rq->rq_deadline = p->deadline; ++ rq->rq_prio = p->prio; ++#ifdef CONFIG_SMT_NICE ++ rq->rq_mm = p->mm; ++ rq->rq_smt_bias = p->smt_bias; ++#endif ++} ++ ++#ifdef CONFIG_SMT_NICE ++static void check_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings; ++static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings; ++ ++/* Iterate over smt siblings when we've scheduled a process on cpu and decide ++ * whether they should continue running or be descheduled. */ ++static void check_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct task_struct *p; ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ continue; ++ p = rq->curr; ++ if (!smt_schedule(p, this_rq)) ++ resched_curr(rq); ++ } ++} ++ ++static void wake_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ resched_idle(rq); ++ } ++} ++#else ++static void check_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_siblings(struct rq __maybe_unused *this_rq) {} ++#endif ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. ++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPT=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! 
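++ *
++ * A sketch of entry path 1 (explicit blocking) as a sleeper would
++ * typically write it ("condition" is a placeholder):
++ *
++ *	for (;;) {
++ *		set_current_state(TASK_INTERRUPTIBLE);
++ *		if (condition)
++ *			break;
++ *		schedule();
++ *	}
++ *	__set_current_state(TASK_RUNNING);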
++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next, *idle; ++ unsigned long *switch_count; ++ bool deactivate = false; ++ struct rq *rq; ++ u64 niffies; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ idle = rq->idle; ++ ++ schedule_debug(prev); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(). ++ */ ++ rq_lock(rq); ++ smp_mb__after_spinlock(); ++#ifdef CONFIG_SMP ++ if (rq->preempt) { ++ /* ++ * Make sure resched_curr hasn't triggered a preemption ++ * locklessly on a task that has since scheduled away. Spurious ++ * wakeup of idle is okay though. ++ */ ++ if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) { ++ rq->preempt = NULL; ++ clear_preempt_need_resched(); ++ rq_unlock_irq(rq); ++ return; ++ } ++ rq->preempt = NULL; ++ } ++#endif ++ ++ switch_count = &prev->nivcsw; ++ if (!preempt && prev->state) { ++ if (unlikely(signal_pending_state(prev->state, prev))) { ++ prev->state = TASK_RUNNING; ++ } else { ++ deactivate = true; ++ prev->on_rq = 0; ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ ++ /* ++ * If a worker is going to sleep, notify and ++ * ask workqueue whether it wants to wake up a ++ * task to maintain concurrency. If so, wake ++ * up the task. ++ */ ++ if (prev->flags & PF_WQ_WORKER) { ++ struct task_struct *to_wakeup; ++ ++ to_wakeup = wq_worker_sleeping(prev); ++ if (to_wakeup) ++ try_to_wake_up_local(to_wakeup); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ /* ++ * Store the niffy value here for use by the next task's last_ran ++ * below to avoid losing niffies due to update_clocks being called ++ * again after this point. ++ */ ++ update_clocks(rq); ++ niffies = rq->niffies; ++ update_cpu_clock_switch(rq, prev); ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ if (idle != prev) { ++ check_deadline(prev, rq); ++ return_task(prev, rq, cpu, deactivate); ++ } ++ ++ next = earliest_deadline_task(rq, cpu, idle); ++ if (likely(next->prio != PRIO_LIMIT)) ++ clear_cpuidle_map(cpu); ++ else { ++ set_cpuidle_map(cpu); ++ update_load_avg(rq, 0); ++ } ++ ++ set_rq_task(rq, next); ++ next->last_ran = niffies; ++ ++ if (likely(prev != next)) { ++ /* ++ * Don't reschedule an idle task or deactivated tasks ++ */ ++ if (prev != idle && !deactivate) ++ resched_suitable_idle(prev); ++ if (next != idle) ++ check_siblings(rq); ++ else ++ wake_siblings(rq); ++ rq->nr_switches++; ++ rq->curr = next; ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. For TSO ++ * (e.g. x86), the architecture must provide its own ++ * barrier in switch_mm(). For weakly ordered machines ++ * for which spin_unlock() acts as a full memory ++ * barrier, finish_lock_switch() in common code takes ++ * care of this barrier. For weakly ordered machines for ++ * which spin_unlock() acts as a RELEASE barrier (only ++ * arm64 and PowerPC), arm64 has a full barrier in ++ * switch_to(), and PowerPC has ++ * smp_mb__after_unlock_lock() before ++ * finish_lock_switch(). 
++ */
++ ++*switch_count;
++
++ trace_sched_switch(preempt, prev, next);
++ context_switch(rq, prev, next); /* unlocks the rq */
++ } else {
++ check_siblings(rq);
++ rq_unlock(rq);
++ do_pending_softirq(rq, next);
++ local_irq_enable();
++ }
++}
++
++void __noreturn do_task_dead(void)
++{
++ /*
++ * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
++ * when the following two conditions become true.
++ * - There is a race condition on mmap_sem (it is acquired by
++ * exit_mm()), and
++ * - SMI occurs before setting TASK_RUNNING.
++ * (or the hypervisor of a virtual machine switches to another guest)
++ * As a result, we may become TASK_RUNNING after becoming TASK_DEAD.
++ *
++ * To avoid it, we have to wait for releasing tsk->pi_lock which
++ * is held by try_to_wake_up()
++ */
++ raw_spin_lock_irq(&current->pi_lock);
++ raw_spin_unlock_irq(&current->pi_lock);
++
++ /* Causes final put_task_struct in finish_task_switch(). */
++ __set_current_state(TASK_DEAD);
++
++ /* Tell freezer to ignore us: */
++ current->flags |= PF_NOFREEZE;
++ __schedule(false);
++ BUG();
++
++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
++ for (;;)
++ cpu_relax();
++}
++
++static inline void sched_submit_work(struct task_struct *tsk)
++{
++ if (!tsk->state || tsk_is_pi_blocked(tsk) ||
++ preempt_count() ||
++ signal_pending_state(tsk->state, tsk))
++ return;
++
++ /*
++ * If we are going to sleep and we have plugged IO queued,
++ * make sure to submit it to avoid deadlocks.
++ */
++ if (blk_needs_flush_plug(tsk))
++ blk_schedule_flush_plug(tsk);
++}
++
++asmlinkage __visible void __sched schedule(void)
++{
++ struct task_struct *tsk = current;
++
++ sched_submit_work(tsk);
++ do {
++ preempt_disable();
++ __schedule(false);
++ sched_preempt_enable_no_resched();
++ } while (need_resched());
++}
++
++EXPORT_SYMBOL(schedule);
++
++/*
++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
++ * state (have scheduled out non-voluntarily) by making sure that all
++ * tasks have either left the run queue or have gone into user space.
++ * As idle tasks do not do either, they must not ever be preempted
++ * (schedule out non-voluntarily).
++ *
++ * schedule_idle() is similar to schedule_preempt_disabled() except that it
++ * never enables preemption because it does not call sched_submit_work().
++ */
++void __sched schedule_idle(void)
++{
++ /*
++ * As this skips calling sched_submit_work(), which the idle task does
++ * regardless because that function is a nop when the task is in a
++ * TASK_RUNNING state, make sure this isn't used someplace that the
++ * current task can be in any other state. Note, idle is always in the
++ * TASK_RUNNING state.
++ */
++ WARN_ON_ONCE(current->state);
++ do {
++ __schedule(false);
++ } while (need_resched());
++}
++
++#ifdef CONFIG_CONTEXT_TRACKING
++asmlinkage __visible void __sched schedule_user(void)
++{
++ /*
++ * If we come here after a random call to set_need_resched(),
++ * or we have been woken up remotely but the IPI has not yet arrived,
++ * we haven't yet exited the RCU idle mode. Do it here manually until
++ * we find a better solution.
++ *
++ * NB: There are buggy callers of this function. Ideally we
++ * should warn if prev_state != IN_USER, but that will trigger
++ * too frequently to make sense yet.
++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPT ++/* ++ * this is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. Kernel preemptions off return from interrupt ++ * occur there and call schedule directly. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. 
++ */
++ preempt_disable_notrace();
++ preempt_latency_start(1);
++ /*
++ * Needs preempt disabled in case user_exit() is traced
++ * and the tracer calls preempt_enable_notrace() causing
++ * an infinite recursion.
++ */
++ prev_ctx = exception_enter();
++ __schedule(true);
++ exception_exit(prev_ctx);
++
++ preempt_latency_stop(1);
++ preempt_enable_no_resched_notrace();
++ } while (need_resched());
++}
++EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
++
++#endif /* CONFIG_PREEMPT */
++
++/*
++ * this is the entry point to schedule() from kernel preemption
++ * off of irq context.
++ * Note that this is called and returns with irqs disabled. This will
++ * protect us against recursive calling from irq.
++ */
++asmlinkage __visible void __sched preempt_schedule_irq(void)
++{
++ enum ctx_state prev_state;
++
++ /* Catch callers which need to be fixed */
++ BUG_ON(preempt_count() || !irqs_disabled());
++
++ prev_state = exception_enter();
++
++ do {
++ preempt_disable();
++ local_irq_enable();
++ __schedule(true);
++ local_irq_disable();
++ sched_preempt_enable_no_resched();
++ } while (need_resched());
++
++ exception_exit(prev_state);
++}
++
++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
++ void *key)
++{
++ return try_to_wake_up(curr->private, mode, wake_flags);
++}
++EXPORT_SYMBOL(default_wake_function);
++
++#ifdef CONFIG_RT_MUTEXES
++
++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
++{
++ if (pi_task)
++ prio = min(prio, pi_task->prio);
++
++ return prio;
++}
++
++static inline int rt_effective_prio(struct task_struct *p, int prio)
++{
++ struct task_struct *pi_task = rt_mutex_get_top_task(p);
++
++ return __rt_effective_prio(pi_task, prio);
++}
++
++/*
++ * rt_mutex_setprio - set the current priority of a task
++ * @p: task to boost
++ * @pi_task: donor task
++ *
++ * This function changes the 'effective' priority of a task. It does
++ * not touch ->normal_prio like __setscheduler().
++ *
++ * Used by the rt_mutex code to implement priority inheritance
++ * logic. Call site only calls if the priority of the task changed.
++ */
++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
++{
++ int prio, oldprio;
++ struct rq *rq;
++
++ /* XXX used to be waiter->prio, not waiter->task->prio */
++ prio = __rt_effective_prio(pi_task, p->normal_prio);
++
++ /*
++ * If nothing changed; bail early.
++ */
++ if (p->pi_top_task == pi_task && prio == p->prio)
++ return;
++
++ rq = __task_rq_lock(p);
++ update_rq_clock(rq);
++ /*
++ * Set under pi_lock && rq->lock, such that the value can be used under
++ * either lock.
++ *
++ * Note that there are loads of tricky details to make this pointer
++ * cache work right. rt_mutex_slowunlock()+rt_mutex_postunlock() work
++ * together to ensure a task is de-boosted (pi_task is set to NULL)
++ * before the task is allowed to run again (and can exit). This ensures
++ * the pointer points to a blocked task -- which guarantees the task
++ * is present.
++ */
++ p->pi_top_task = pi_task;
++
++ /*
++ * For FIFO/RR we only need to set prio, if that matches we're done.
++ */
++ if (prio == p->prio)
++ goto out_unlock;
++
++ /*
++ * Idle task boosting is a no-no in general. There is one
++ * exception, when PREEMPT_RT and NOHZ is active:
++ *
++ * The idle task calls get_next_timer_interrupt() and holds
++ * the timer wheel base->lock on the CPU and another CPU wants
++ * to access the timer (probably to cancel it). We can safely
++ * ignore the boosting request, as the idle CPU runs this code
++ * with interrupts disabled and will complete the lock
++ * protected section without being interrupted. So there is no
++ * real need to boost.
++ */
++ if (unlikely(p == rq->idle)) {
++ WARN_ON(p != rq->curr);
++ WARN_ON(p->pi_blocked_on);
++ goto out_unlock;
++ }
++
++ trace_sched_pi_setprio(p, pi_task);
++ oldprio = p->prio;
++ p->prio = prio;
++ if (task_running(rq, p)) {
++ if (prio > oldprio)
++ resched_task(p);
++ } else if (task_queued(p)) {
++ dequeue_task(rq, p, DEQUEUE_SAVE);
++ enqueue_task(rq, p, ENQUEUE_RESTORE);
++ if (prio < oldprio)
++ try_preempt(p, rq);
++ }
++out_unlock:
++ __task_rq_unlock(rq);
++}
++#else
++static inline int rt_effective_prio(struct task_struct *p, int prio)
++{
++ return prio;
++}
++#endif
++
++/*
++ * Adjust the deadline for when the priority is to change, before it's
++ * changed.
++ */
++static inline void adjust_deadline(struct task_struct *p, int new_prio)
++{
++ p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p);
++}
++
++void set_user_nice(struct task_struct *p, long nice)
++{
++ int new_static, old_static;
++ unsigned long flags;
++ struct rq *rq;
++
++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
++ return;
++ new_static = NICE_TO_PRIO(nice);
++ /*
++ * We have to be careful, if called from sys_setpriority(),
++ * the task might be in the middle of scheduling on another CPU.
++ */
++ rq = task_rq_lock(p, &flags);
++ update_rq_clock(rq);
++
++ /*
++ * The RT priorities are set via sched_setscheduler(), but we still
++ * allow the 'normal' nice value to be set - but as expected
++ * it won't have any effect on scheduling while the task's policy
++ * is not SCHED_NORMAL/SCHED_BATCH:
++ */
++ if (has_rt_policy(p)) {
++ p->static_prio = new_static;
++ goto out_unlock;
++ }
++
++ adjust_deadline(p, new_static);
++ old_static = p->static_prio;
++ p->static_prio = new_static;
++ p->prio = effective_prio(p);
++
++ if (task_queued(p)) {
++ dequeue_task(rq, p, DEQUEUE_SAVE);
++ enqueue_task(rq, p, ENQUEUE_RESTORE);
++ if (new_static < old_static)
++ try_preempt(p, rq);
++ } else if (task_running(rq, p)) {
++ set_rq_task(rq, p);
++ if (old_static < new_static)
++ resched_task(p);
++ }
++out_unlock:
++ task_rq_unlock(rq, p, &flags);
++}
++EXPORT_SYMBOL(set_user_nice);
++
++/*
++ * can_nice - check if a task can reduce its nice value
++ * @p: task
++ * @nice: nice value
++ */
++int can_nice(const struct task_struct *p, const int nice)
++{
++ /* Convert nice value [19,-20] to rlimit style value [1,40] */
++ int nice_rlim = nice_to_rlimit(nice);
++
++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
++ capable(CAP_SYS_NICE));
++}
++
++#ifdef __ARCH_WANT_SYS_NICE
++
++/*
++ * sys_nice - change the priority of the current process.
++ * @increment: priority increment
++ *
++ * sys_setpriority is a more generic, but much slower function that
++ * does similar things.
++ */
++SYSCALL_DEFINE1(nice, int, increment)
++{
++ long nice, retval;
++
++ /*
++ * Setpriority might change our priority at the same moment.
++ * We don't have to worry. Conceptually one call occurs first
++ * and we have a single winner.
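++ *
++ * Worked example (illustrative): a task at nice 10 calling nice(15)
++ * keeps the increment as-is (it is within the clamped nice width),
++ * then 10 + 15 = 25 is clamped to MAX_NICE, so the task ends up at
++ * nice 19. Only negative increments need can_nice() to pass.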
++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ int delta, prio = p->prio - MAX_RT_PRIO; ++ ++ /* rt tasks and iso tasks */ ++ if (prio <= 0) ++ goto out; ++ ++ /* Convert to ms to avoid overflows */ ++ delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); ++ if (unlikely(delta < 0)) ++ delta = 0; ++ delta = delta * 40 / ms_longest_deadline_diff(); ++ if (delta <= 80) ++ prio += delta; ++ if (idleprio_task(p)) ++ prio += 40; ++out: ++ return prio; ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ return cpu_curr(cpu) == cpu_rq(cpu)->idle; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the CPU @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, ++ int prio, bool keep_boost) ++{ ++ int oldrtprio, oldprio; ++ ++ p->policy = policy; ++ oldrtprio = p->rt_priority; ++ p->rt_priority = prio; ++ p->normal_prio = normal_prio(p); ++ oldprio = p->prio; ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). 
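++ * For example (illustrative): a SCHED_NORMAL task currently PI-boosted
++ * by an rt_mutex waiter keeps the donor's priority across this policy
++ * change, because rt_effective_prio() below re-applies the boost.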
++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++ ++ if (task_running(rq, p)) { ++ set_rq_task(rq, p); ++ resched_task(p); ++ } else if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (p->prio < oldprio || p->rt_priority > oldrtprio) ++ try_preempt(p, rq); ++ } ++} ++ ++/* ++ * Check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int ++__sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool user, bool pi) ++{ ++ struct sched_param zero_param = { .sched_priority = 0 }; ++ unsigned long flags, rlim_rtprio = 0; ++ int retval, oldpolicy = -1; ++ int reset_on_fork; ++ struct rq *rq; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { ++ unsigned long lflags; ++ ++ if (!lock_task_sighand(p, &lflags)) ++ return -ESRCH; ++ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); ++ unlock_task_sighand(p, &lflags); ++ if (rlim_rtprio) ++ goto recheck; ++ /* ++ * If the caller requested an RT policy without having the ++ * necessary rights, we downgrade the policy to SCHED_ISO. ++ * We also set the parameter to zero to pass the checks. ++ */ ++ policy = SCHED_ISO; ++ param = &zero_param; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); ++ policy &= ~SCHED_RESET_ON_FORK; ++ ++ if (!SCHED_RANGE(policy)) ++ return -EINVAL; ++ } ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH is 0. 
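++ *
++ * Worked example (assuming the stock MAX_USER_RT_PRIO of 100): a user
++ * task may request 1..99 for SCHED_FIFO/SCHED_RR, while requesting,
++ * say, priority 50 together with SCHED_NORMAL fails the cross-check
++ * below with -EINVAL.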
++ */
++ if (param->sched_priority < 0 ||
++ (p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) ||
++ (!p->mm && param->sched_priority > MAX_RT_PRIO - 1))
++ return -EINVAL;
++ if (is_rt_policy(policy) != (param->sched_priority != 0))
++ return -EINVAL;
++
++ /*
++ * Allow unprivileged RT tasks to decrease priority:
++ */
++ if (user && !capable(CAP_SYS_NICE)) {
++ if (is_rt_policy(policy)) {
++ unsigned long rlim_rtprio =
++ task_rlimit(p, RLIMIT_RTPRIO);
++
++ /* Can't set/change the rt policy */
++ if (policy != p->policy && !rlim_rtprio)
++ return -EPERM;
++
++ /* Can't increase priority */
++ if (param->sched_priority > p->rt_priority &&
++ param->sched_priority > rlim_rtprio)
++ return -EPERM;
++ } else {
++ switch (p->policy) {
++ /*
++ * Can only downgrade policies but not back to
++ * SCHED_NORMAL
++ */
++ case SCHED_ISO:
++ if (policy == SCHED_ISO)
++ goto out;
++ if (policy != SCHED_NORMAL)
++ return -EPERM;
++ break;
++ case SCHED_BATCH:
++ if (policy == SCHED_BATCH)
++ goto out;
++ if (policy != SCHED_IDLEPRIO)
++ return -EPERM;
++ break;
++ case SCHED_IDLEPRIO:
++ if (policy == SCHED_IDLEPRIO)
++ goto out;
++ return -EPERM;
++ default:
++ break;
++ }
++ }
++
++ /* Can't change other user's priorities */
++ if (!check_same_owner(p))
++ return -EPERM;
++
++ /* Normal users shall not reset the sched_reset_on_fork flag: */
++ if (p->sched_reset_on_fork && !reset_on_fork)
++ return -EPERM;
++ }
++
++ if (user) {
++ retval = security_task_setscheduler(p);
++ if (retval)
++ return retval;
++ }
++
++ /*
++ * Make sure no PI-waiters arrive (or leave) while we are
++ * changing the priority of the task:
++ *
++ * To be able to change p->policy safely, the runqueue lock must be
++ * held.
++ */
++ rq = task_rq_lock(p, &flags);
++ update_rq_clock(rq);
++
++ /*
++ * Changing the policy of the stop threads is a very bad idea:
++ */
++ if (p == rq->stop) {
++ task_rq_unlock(rq, p, &flags);
++ return -EINVAL;
++ }
++
++ /*
++ * If not changing anything there's no need to proceed further:
++ */
++ if (unlikely(policy == p->policy && (!is_rt_policy(policy) ||
++ param->sched_priority == p->rt_priority))) {
++ task_rq_unlock(rq, p, &flags);
++ return 0;
++ }
++
++ /* Re-check policy now with rq lock held */
++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
++ policy = oldpolicy = -1;
++ task_rq_unlock(rq, p, &flags);
++ goto recheck;
++ }
++ p->sched_reset_on_fork = reset_on_fork;
++
++ __setscheduler(p, rq, policy, param->sched_priority, pi);
++ task_rq_unlock(rq, p, &flags);
++
++ if (pi)
++ rt_mutex_adjust_pi(p);
++out:
++ return 0;
++}
++
++/**
++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
++ * @p: the task in question.
++ * @policy: new policy.
++ * @param: structure containing the new RT priority.
++ *
++ * Return: 0 on success. An error code otherwise.
++ *
++ * NOTE that the task may be already dead.
++ */
++int sched_setscheduler(struct task_struct *p, int policy,
++ const struct sched_param *param)
++{
++ return __sched_setscheduler(p, policy, param, true, true);
++}
++
++EXPORT_SYMBOL_GPL(sched_setscheduler);
++
++int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
++{
++ const struct sched_param param = { .sched_priority = attr->sched_priority };
++ int policy = attr->sched_policy;
++
++ return __sched_setscheduler(p, policy, &param, true, true);
++}
++EXPORT_SYMBOL_GPL(sched_setattr);
++
++/**
++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
++ * @p: the task in question.
++ * @policy: new policy.
++ * @param: structure containing the new RT priority.
++ *
++ * Just like sched_setscheduler, only don't bother checking if the
++ * current context has permission. For example, this is needed in
++ * stop_machine(): we create temporary high priority worker threads,
++ * but our caller might not have that capability.
++ *
++ * Return: 0 on success. An error code otherwise.
++ */
++int sched_setscheduler_nocheck(struct task_struct *p, int policy,
++ const struct sched_param *param)
++{
++ return __sched_setscheduler(p, policy, param, false, true);
++}
++EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
++
++static int
++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
++{
++ struct sched_param lparam;
++ struct task_struct *p;
++ int retval;
++
++ if (!param || pid < 0)
++ return -EINVAL;
++ if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
++ return -EFAULT;
++
++ rcu_read_lock();
++ retval = -ESRCH;
++ p = find_process_by_pid(pid);
++ if (p != NULL)
++ retval = sched_setscheduler(p, policy, &lparam);
++ rcu_read_unlock();
++
++ return retval;
++}
++
++/*
++ * Mimics kernel/events/core.c perf_copy_attr().
++ */
++static int sched_copy_attr(struct sched_attr __user *uattr,
++ struct sched_attr *attr)
++{
++ u32 size;
++ int ret;
++
++ if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
++ return -EFAULT;
++
++ /* Zero the full structure, so that a short copy will be nice: */
++ memset(attr, 0, sizeof(*attr));
++
++ ret = get_user(size, &uattr->size);
++ if (ret)
++ return ret;
++
++ /* Bail out on silly large: */
++ if (size > PAGE_SIZE)
++ goto err_size;
++
++ /* ABI compatibility quirk: */
++ if (!size)
++ size = SCHED_ATTR_SIZE_VER0;
++
++ if (size < SCHED_ATTR_SIZE_VER0)
++ goto err_size;
++
++ /*
++ * If we're handed a bigger struct than we know of,
++ * ensure all the unknown bits are 0 - i.e. new
++ * user-space does not rely on any kernel feature
++ * extensions we don't know about yet.
++ */
++ if (size > sizeof(*attr)) {
++ unsigned char __user *addr;
++ unsigned char __user *end;
++ unsigned char val;
++
++ addr = (void __user *)uattr + sizeof(*attr);
++ end = (void __user *)uattr + size;
++
++ for (; addr < end; addr++) {
++ ret = get_user(val, addr);
++ if (ret)
++ return ret;
++ if (val)
++ goto err_size;
++ }
++ size = sizeof(*attr);
++ }
++
++ ret = copy_from_user(attr, uattr, size);
++ if (ret)
++ return -EFAULT;
++
++ /*
++ * XXX: Do we want to be lenient like existing syscalls; or do we want
++ * to be strict and return an error on out-of-bounds values?
++ */
++ attr->sched_nice = clamp(attr->sched_nice, -20, 19);
++
++ /* sched/core.c uses zero here but we already know ret is zero */
++ return 0;
++
++err_size:
++ put_user(sizeof(*attr), &uattr->size);
++ return -E2BIG;
++}
++
++/*
++ * sched_setparam() passes in -1 for its policy, to let the functions
++ * it calls know not to change it.
++ */
++#define SETPARAM_POLICY -1
++
++/**
++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
++ * @pid: the pid in question.
++ * @policy: new policy.
++ * @param: structure containing the new RT priority.
++ *
++ * Return: 0 on success. An error code otherwise.
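++ *
++ * Illustrative userspace call (a sketch using the glibc wrapper from
++ * <sched.h>):
++ *
++ *     struct sched_param sp = { .sched_priority = 50 };
++ *
++ *     if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1)
++ *             perror("sched_setscheduler");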
++ */
++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
++{
++ if (policy < 0)
++ return -EINVAL;
++
++ return do_sched_setscheduler(pid, policy, param);
++}
++
++/**
++ * sys_sched_setparam - set/change the RT priority of a thread
++ * @pid: the pid in question.
++ * @param: structure containing the new RT priority.
++ *
++ * Return: 0 on success. An error code otherwise.
++ */
++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
++{
++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
++}
++
++/**
++ * sys_sched_setattr - same as above, but with extended sched_attr
++ * @pid: the pid in question.
++ * @uattr: structure containing the extended parameters.
++ */
++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
++ unsigned int, flags)
++{
++ struct sched_attr attr;
++ struct task_struct *p;
++ int retval;
++
++ if (!uattr || pid < 0 || flags)
++ return -EINVAL;
++
++ retval = sched_copy_attr(uattr, &attr);
++ if (retval)
++ return retval;
++
++ if ((int)attr.sched_policy < 0)
++ return -EINVAL;
++
++ rcu_read_lock();
++ retval = -ESRCH;
++ p = find_process_by_pid(pid);
++ if (p != NULL)
++ retval = sched_setattr(p, &attr);
++ rcu_read_unlock();
++
++ return retval;
++}
++
++/**
++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
++ * @pid: the pid in question.
++ *
++ * Return: On success, the policy of the thread. Otherwise, a negative error
++ * code.
++ */
++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
++{
++ struct task_struct *p;
++ int retval = -EINVAL;
++
++ if (pid < 0)
++ goto out_nounlock;
++
++ retval = -ESRCH;
++ rcu_read_lock();
++ p = find_process_by_pid(pid);
++ if (p) {
++ retval = security_task_getscheduler(p);
++ if (!retval)
++ retval = p->policy;
++ }
++ rcu_read_unlock();
++
++out_nounlock:
++ return retval;
++}
++
++/**
++ * sys_sched_getparam - get the RT priority of a thread
++ * @pid: the pid in question.
++ * @param: structure containing the RT priority.
++ *
++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
++ * code.
++ */
++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
++{
++ struct sched_param lp = { .sched_priority = 0 };
++ struct task_struct *p;
++ int retval = -EINVAL;
++
++ if (!param || pid < 0)
++ goto out_nounlock;
++
++ rcu_read_lock();
++ p = find_process_by_pid(pid);
++ retval = -ESRCH;
++ if (!p)
++ goto out_unlock;
++
++ retval = security_task_getscheduler(p);
++ if (retval)
++ goto out_unlock;
++
++ if (has_rt_policy(p))
++ lp.sched_priority = p->rt_priority;
++ rcu_read_unlock();
++
++ /*
++ * This one might sleep, we cannot do it with a spinlock held ...
++ */
++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
++
++out_nounlock:
++ return retval;
++
++out_unlock:
++ rcu_read_unlock();
++ return retval;
++}
++
++static int sched_read_attr(struct sched_attr __user *uattr,
++ struct sched_attr *attr,
++ unsigned int usize)
++{
++ int ret;
++
++ if (!access_ok(VERIFY_WRITE, uattr, usize))
++ return -EFAULT;
++
++ /*
++ * If we're handed a smaller struct than we know of,
++ * ensure all the unknown bits are 0 - i.e. old
++ * user-space does not get incomplete information.
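++ *
++ * Worked example (sizes illustrative): if old user-space passes
++ * usize == 48 while the kernel's sched_attr is 56 bytes, the copy is
++ * truncated to 48 bytes provided the trailing 8 kernel bytes are
++ * zero; otherwise -EFBIG is returned below.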
++ */ ++ if (usize < sizeof(*attr)) { ++ unsigned char *addr; ++ unsigned char *end; ++ ++ addr = (void *)attr + usize; ++ end = (void *)attr + sizeof(*attr); ++ ++ for (; addr < end; addr++) { ++ if (*addr) ++ return -EFBIG; ++ } ++ ++ attr->size = usize; ++ } ++ ++ ret = copy_to_user(uattr, attr, attr->size); ++ if (ret) ++ return -EFAULT; ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return ret; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ * @size: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. ++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, size, unsigned int, flags) ++{ ++ struct sched_attr attr = { ++ .size = sizeof(struct sched_attr), ++ }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || size > PAGE_SIZE || ++ size < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ attr.sched_policy = p->policy; ++ if (rt_task(p)) ++ attr.sched_priority = p->rt_priority; ++ else ++ attr.sched_nice = task_nice(p); ++ ++ rcu_read_unlock(); ++ ++ retval = sched_read_attr(uattr, &attr, size); ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_allowed, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ cpumask_and(new_mask, in_mask, cpus_allowed); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_allowed); ++ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. Just reset the cpus_allowed to the ++ * cpuset's cpus_allowed ++ */ ++ cpumask_copy(new_mask, cpus_allowed); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_allowed); ++out_put_task: ++ put_task_struct(p); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ cpumask_t *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? 
-EFAULT : 0; ++} ++ ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ unsigned long flags; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ put_online_cpus(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ size_t retlen = min_t(size_t, len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. ++ */ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ struct rq *rq; ++ ++ if (!sched_yield_type) ++ goto out; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ rq_lock(rq); ++ ++ if (sched_yield_type > 1) ++ time_slice_expired(current, rq); ++ schedstat_inc(rq->yld_count); ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ rq_unlock(rq); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++out: ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPT ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPT. 
We do strange low-level
++ * operations here to prevent schedule() from being called twice (once via
++ * spin_unlock(), once by hand).
++ */
++int __cond_resched_lock(spinlock_t *lock)
++{
++ int resched = should_resched(PREEMPT_LOCK_OFFSET);
++ int ret = 0;
++
++ lockdep_assert_held(lock);
++
++ if (spin_needbreak(lock) || resched) {
++ spin_unlock(lock);
++ if (resched)
++ preempt_schedule_common();
++ else
++ cpu_relax();
++ ret = 1;
++ spin_lock(lock);
++ }
++ return ret;
++}
++EXPORT_SYMBOL(__cond_resched_lock);
++
++int __sched __cond_resched_softirq(void)
++{
++ BUG_ON(!in_softirq());
++
++ if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
++ local_bh_enable();
++ preempt_schedule_common();
++ local_bh_disable();
++ return 1;
++ }
++ return 0;
++}
++EXPORT_SYMBOL(__cond_resched_softirq);
++
++/**
++ * yield - yield the current processor to other threads.
++ *
++ * Do not ever use this function, there's a 99% chance you're doing it wrong.
++ *
++ * The scheduler is at all times free to pick the calling task as the most
++ * eligible task to run; if removing the yield() call from your code breaks
++ * it, it's already broken.
++ *
++ * Typical broken usage is:
++ *
++ * while (!event)
++ * yield();
++ *
++ * where one assumes that yield() will let 'the other' process run that will
++ * make event true. If the current task is a SCHED_FIFO task that will never
++ * happen. Never use yield() as a progress guarantee!!
++ *
++ * If you want to use yield() to wait for something, use wait_event().
++ * If you want to use yield() to be 'nice' for others, use cond_resched().
++ * If you still want to use yield(), do not!
++ */
++void __sched yield(void)
++{
++ set_current_state(TASK_RUNNING);
++ sys_sched_yield();
++}
++EXPORT_SYMBOL(yield);
++
++/**
++ * yield_to - yield the current processor to another thread in
++ * your thread group, or accelerate that thread toward the
++ * processor it's on.
++ * @p: target task
++ * @preempt: whether task preemption is allowed or not
++ *
++ * It's the caller's job to ensure that the target task struct
++ * can't go away on us before we can do any checks.
++ *
++ * Return:
++ * true (>0) if we indeed boosted the target task.
++ * false (0) if we failed to boost the target.
++ * -ESRCH if there's no task to yield to.
++ */
++int __sched yield_to(struct task_struct *p, bool preempt)
++{
++ struct task_struct *rq_p;
++ struct rq *rq, *p_rq;
++ unsigned long flags;
++ int yielded = 0;
++
++ local_irq_save(flags);
++ rq = this_rq();
++
++again:
++ p_rq = task_rq(p);
++ /*
++ * If we're the only runnable task on the rq and target rq also
++ * has only one task, there's absolutely no point in yielding.
++ */ ++ if (task_running(p_rq, p) || p->state) { ++ yielded = -ESRCH; ++ goto out_irq; ++ } ++ ++ double_rq_lock(rq, p_rq); ++ if (unlikely(task_rq(p) != p_rq)) { ++ double_rq_unlock(rq, p_rq); ++ goto again; ++ } ++ ++ yielded = 1; ++ schedstat_inc(rq->yld_count); ++ rq_p = rq->curr; ++ if (p->deadline > rq_p->deadline) ++ p->deadline = rq_p->deadline; ++ p->time_slice += rq_p->time_slice; ++ if (p->time_slice > timeslice()) ++ p->time_slice = timeslice(); ++ time_slice_expired(rq_p, rq); ++ if (preempt && rq != p_rq) ++ resched_task(p_rq->curr); ++ double_rq_unlock(rq, p_rq); ++out_irq: ++ local_irq_restore(flags); ++ ++ if (yielded > 0) ++ schedule(); ++ return yielded; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. 
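++ *
++ * Illustrative userspace usage (a sketch using the glibc wrapper
++ * from <sched.h>):
++ *
++ *     struct timespec ts;
++ *
++ *     if (sched_rr_get_interval(0, &ts) == 0)
++ *             printf("timeslice: %ld.%09ld s\n",
++ *                    (long)ts.tv_sec, ts.tv_nsec);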
++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct timespec __user *, interval) ++{ ++ struct task_struct *p; ++ unsigned int time_slice; ++ unsigned long flags; ++ struct timespec t; ++ struct rq *rq; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ rq = task_rq_lock(p, &flags); ++ time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); ++ task_rq_unlock(rq, p, &flags); ++ ++ rcu_read_unlock(); ++ t = ns_to_timespec(time_slice); ++ retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL); ++ put_task_stack(p); ++} ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). ++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. ++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++#ifdef CONFIG_SMP ++void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_allowed, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ cpumask_copy(&p->cpus_allowed, new_mask); ++ ++ if (task_queued(p)) { ++ /* ++ * Because __kthread_bind() calls this on blocked tasks without ++ * holding rq->lock. 
++ */ ++ lockdep_assert_held(&rq->lock); ++ } ++} ++ ++/* ++ * Calling do_set_cpus_allowed from outside the scheduler code should not be ++ * called on a running or queued task. We should be holding pi_lock. ++ */ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ __do_set_cpus_allowed(p, new_mask); ++ if (needs_other_cpu(p, task_cpu(p))) { ++ struct rq *rq; ++ ++ rq = __task_rq_lock(p); ++ set_task_cpu(p, valid_task_cpu(p)); ++ resched_task(p); ++ __task_rq_unlock(rq); ++ } ++} ++#endif ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: cpu the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(&rq->lock); ++ idle->last_ran = rq->niffies; ++ time_slice_expired(idle, rq); ++ idle->state = TASK_RUNNING; ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = PRIO_LIMIT; ++ ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#ifdef CONFIG_SMT_NICE ++ idle->smt_bias = 0; ++#endif ++#endif ++ set_rq_task(rq, idle); ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->curr = rq->idle = idle; ++ idle->on_rq = TASK_ON_RQ_QUEUED; ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! */ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_allowed may be changed. ++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ rq_lock_irqsave(rq, &flags); ++ resched_task(cpu_curr(cpu)); ++ rq_unlock_irqrestore(rq, &flags); ++} ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_NO_HZ_COMMON ++void nohz_balance_enter_idle(int cpu) ++{ ++} ++ ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. 
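++ * For example (illustrative): on a mostly idle 4-CPU system where only
++ * CPU2 is busy and is a housekeeping CPU, a timer armed from idle CPU0
++ * is placed on CPU2, letting CPU0 stay in a deep idle state.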
++ *
++ * We don't do a similar optimization for a completely idle system, as
++ * selecting an idle CPU will add more delays to the timers than intended
++ * (as that CPU's timer base may not be up to date wrt jiffies etc).
++ */
++int get_nohz_timer_target(void)
++{
++ int i, cpu = smp_processor_id();
++ struct sched_domain *sd;
++
++ if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
++ return cpu;
++
++ rcu_read_lock();
++ for_each_domain(cpu, sd) {
++ for_each_cpu(i, sched_domain_span(sd)) {
++ if (cpu == i)
++ continue;
++
++ if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
++ cpu = i;
++ goto unlock;
++ }
++ }
++ }
++
++ if (!is_housekeeping_cpu(cpu))
++ cpu = housekeeping_any_cpu();
++unlock:
++ rcu_read_unlock();
++ return cpu;
++}
++
++/*
++ * When add_timer_on() enqueues a timer into the timer wheel of an
++ * idle CPU then this timer might expire before the next timer event
++ * which is scheduled to wake up that CPU. In case of a completely
++ * idle system the next event might even be infinite time into the
++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
++ * leaves the inner idle loop so the newly added timer is taken into
++ * account when the CPU goes back to idle and evaluates the timer
++ * wheel for the next timer event.
++ */
++void wake_up_idle_cpu(int cpu)
++{
++ if (cpu == smp_processor_id())
++ return;
++
++ if (set_nr_and_not_polling(cpu_rq(cpu)->idle))
++ smp_sched_reschedule(cpu);
++ else
++ trace_sched_wake_idle_without_ipi(cpu);
++}
++
++static bool wake_up_full_nohz_cpu(int cpu)
++{
++ /*
++ * We just need the target to call irq_exit() and re-evaluate
++ * the next tick. The nohz full kick at least implies that.
++ * If needed we can still optimize that later with an
++ * empty IRQ.
++ */
++ if (cpu_is_offline(cpu))
++ return true; /* Don't try to wake offline CPUs. */
++ if (tick_nohz_full_cpu(cpu)) {
++ if (cpu != smp_processor_id() ||
++ tick_nohz_tick_stopped())
++ tick_nohz_full_kick_cpu(cpu);
++ return true;
++ }
++
++ return false;
++}
++
++/*
++ * Wake up the specified CPU. If the CPU is going offline, it is the
++ * caller's responsibility to deal with the lost wakeup, for example,
++ * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
++ */
++void wake_up_nohz_cpu(int cpu)
++{
++ if (!wake_up_full_nohz_cpu(cpu))
++ wake_up_idle_cpu(cpu);
++}
++#endif /* CONFIG_NO_HZ_COMMON */
++
++/*
++ * Change a given task's CPU affinity. Migrate the thread to a
++ * proper CPU and schedule it away if the CPU it's executing on
++ * is removed from the allowed bitmask.
++ *
++ * NOTE: the caller must have a valid reference to the task, the
++ * task must not exit() & deallocate itself prematurely. The
++ * call is not atomic; no spinlocks may be held.
++ */
++static int __set_cpus_allowed_ptr(struct task_struct *p,
++ const struct cpumask *new_mask, bool check)
++{
++ const struct cpumask *cpu_valid_mask = cpu_active_mask;
++ bool queued = false, running_wrong = false, kthread;
++ struct cpumask old_mask;
++ unsigned long flags;
++ struct rq *rq;
++ int ret = 0;
++
++ rq = task_rq_lock(p, &flags);
++ update_rq_clock(rq);
++
++ kthread = !!(p->flags & PF_KTHREAD);
++ if (kthread) {
++ /*
++ * Kernel threads are allowed on online && !active CPUs
++ */
++ cpu_valid_mask = cpu_online_mask;
++ }
++
++ /*
++ * Must re-check here, to close a race against __kthread_bind();
++ * sched_setaffinity() is not guaranteed to observe the flag.
++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ cpumask_copy(&old_mask, &p->cpus_allowed); ++ if (cpumask_equal(&old_mask, new_mask)) ++ goto out; ++ ++ if (!cpumask_intersects(new_mask, cpu_valid_mask)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ queued = task_queued(p); ++ __do_set_cpus_allowed(p, new_mask); ++ ++ if (kthread) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(rq, p)) { ++ /* Task is running on the wrong cpu now, reschedule it. */ ++ if (rq == this_rq()) { ++ set_tsk_need_resched(p); ++ running_wrong = true; ++ } else ++ resched_task(p); ++ } else { ++ int cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ ++ if (queued) { ++ /* ++ * Switch runqueue locks after dequeueing the task ++ * here while still holding the pi_lock to be holding ++ * the correct lock for enqueueing. ++ */ ++ dequeue_task(rq, p, 0); ++ rq_unlock(rq); ++ ++ rq = cpu_rq(cpu); ++ rq_lock(rq); ++ } ++ set_task_cpu(p, cpu); ++ if (queued) ++ enqueue_task(rq, p, 0); ++ } ++ if (queued) ++ try_preempt(p, rq); ++ if (running_wrong) ++ preempt_disable(); ++out: ++ task_rq_unlock(rq, p, &flags); ++ ++ if (running_wrong) { ++ __schedule(true); ++ preempt_enable(); ++ } ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Run through task list and find tasks affined to the dead cpu, then remove ++ * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold ++ * cpu 0 and src_cpu's runqueue locks. ++ */ ++static void bind_zero(int src_cpu) ++{ ++ struct task_struct *p, *t; ++ struct rq *rq0; ++ int bound = 0; ++ ++ if (src_cpu == 0) ++ return; ++ ++ rq0 = cpu_rq(0); ++ ++ do_each_thread(t, p) { ++ if (cpumask_test_cpu(src_cpu, &p->cpus_allowed)) { ++ bool local = (task_cpu(p) == src_cpu); ++ struct rq *rq = task_rq(p); ++ ++ /* task_running is the cpu stopper thread */ ++ if (local && task_running(rq, p)) ++ continue; ++ atomic_clear_cpu(src_cpu, &p->cpus_allowed); ++ atomic_set_cpu(0, &p->cpus_allowed); ++ p->zerobound = true; ++ bound++; ++ if (local) { ++ bool queued = task_queued(p); ++ ++ if (queued) ++ dequeue_task(rq, p, 0); ++ set_task_cpu(p, 0); ++ if (queued) ++ enqueue_task(rq0, p, 0); ++ } ++ } ++ } while_each_thread(t, p); ++ ++ if (bound) { ++ printk(KERN_INFO "Removed affinity for %d processes to cpu %d\n", ++ bound, src_cpu); ++ } ++} ++ ++/* Find processes with the zerobound flag and reenable their affinity for the ++ * CPU coming alive. 
*/
++static void unbind_zero(int src_cpu)
++{
++ int unbound = 0, zerobound = 0;
++ struct task_struct *p, *t;
++
++ if (src_cpu == 0)
++ return;
++
++ do_each_thread(t, p) {
++ if (!p->mm)
++ p->zerobound = false;
++ if (p->zerobound) {
++ unbound++;
++ cpumask_set_cpu(src_cpu, &p->cpus_allowed);
++ /* Once every CPU affinity has been re-enabled, remove
++ * the zerobound flag */
++ if (cpumask_subset(cpu_possible_mask, &p->cpus_allowed)) {
++ p->zerobound = false;
++ zerobound++;
++ }
++ }
++ } while_each_thread(t, p);
++
++ if (unbound) {
++ printk(KERN_INFO "Added affinity for %d processes to cpu %d\n",
++ unbound, src_cpu);
++ }
++ if (zerobound) {
++ printk(KERN_INFO "Released forced binding to cpu0 for %d processes\n",
++ zerobound);
++ }
++}
++
++/*
++ * Ensure that the idle task is using init_mm right before its cpu goes
++ * offline.
++ */
++void idle_task_exit(void)
++{
++ struct mm_struct *mm = current->active_mm;
++
++ BUG_ON(cpu_online(smp_processor_id()));
++
++ if (mm != &init_mm) {
++ switch_mm(mm, &init_mm, current);
++ finish_arch_post_lock_switch();
++ }
++ mmdrop(mm);
++}
++#else /* CONFIG_HOTPLUG_CPU */
++static void unbind_zero(int src_cpu) {}
++#endif /* CONFIG_HOTPLUG_CPU */
++
++void sched_set_stop_task(int cpu, struct task_struct *stop)
++{
++ struct sched_param stop_param = { .sched_priority = STOP_PRIO };
++ struct sched_param start_param = { .sched_priority = 0 };
++ struct task_struct *old_stop = cpu_rq(cpu)->stop;
++
++ if (stop) {
++ /*
++ * Make it appear like a SCHED_FIFO task; it's something
++ * userspace knows about and won't get confused about.
++ *
++ * Also, it will make PI more or less work without too
++ * much confusion -- but then, stop work should not
++ * rely on PI working anyway.
++ */
++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param);
++ }
++
++ cpu_rq(cpu)->stop = stop;
++
++ if (old_stop) {
++ /*
++ * Reset it back to a normal scheduling policy so that
++ * it can die in pieces.
++ */
++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param);
++ }
++}
++
++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
++
++static struct ctl_table sd_ctl_dir[] = {
++ {
++ .procname = "sched_domain",
++ .mode = 0555,
++ },
++ {}
++};
++
++static struct ctl_table sd_ctl_root[] = {
++ {
++ .procname = "kernel",
++ .mode = 0555,
++ .child = sd_ctl_dir,
++ },
++ {}
++};
++
++static struct ctl_table *sd_alloc_ctl_entry(int n)
++{
++ struct ctl_table *entry =
++ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
++
++ return entry;
++}
++
++static void sd_free_ctl_entry(struct ctl_table **tablep)
++{
++ struct ctl_table *entry;
++
++ /*
++ * In the intermediate directories, both the child directory and
++ * procname are dynamically allocated and could fail but the mode
++ * will always be set. In the lowest directory the names are
++ * static strings and all have proc handlers.
++ */ ++ for (entry = *tablep; entry->mode; entry++) { ++ if (entry->child) ++ sd_free_ctl_entry(&entry->child); ++ if (entry->proc_handler == NULL) ++ kfree(entry->procname); ++ } ++ ++ kfree(*tablep); ++ *tablep = NULL; ++} ++ ++#define CPU_LOAD_IDX_MAX 5 ++static int min_load_idx = 0; ++static int max_load_idx = CPU_LOAD_IDX_MAX-1; ++ ++static void ++set_table_entry(struct ctl_table *entry, ++ const char *procname, void *data, int maxlen, ++ umode_t mode, proc_handler *proc_handler, ++ bool load_idx) ++{ ++ entry->procname = procname; ++ entry->data = data; ++ entry->maxlen = maxlen; ++ entry->mode = mode; ++ entry->proc_handler = proc_handler; ++ ++ if (load_idx) { ++ entry->extra1 = &min_load_idx; ++ entry->extra2 = &max_load_idx; ++ } ++} ++ ++static struct ctl_table * ++sd_alloc_ctl_domain_table(struct sched_domain *sd) ++{ ++ struct ctl_table *table = sd_alloc_ctl_entry(14); ++ ++ if (table == NULL) ++ return NULL; ++ ++ set_table_entry(&table[0], "min_interval", &sd->min_interval, ++ sizeof(long), 0644, proc_doulongvec_minmax, false); ++ set_table_entry(&table[1], "max_interval", &sd->max_interval, ++ sizeof(long), 0644, proc_doulongvec_minmax, false); ++ set_table_entry(&table[2], "busy_idx", &sd->busy_idx, ++ sizeof(int), 0644, proc_dointvec_minmax, true); ++ set_table_entry(&table[3], "idle_idx", &sd->idle_idx, ++ sizeof(int), 0644, proc_dointvec_minmax, true); ++ set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, ++ sizeof(int), 0644, proc_dointvec_minmax, true); ++ set_table_entry(&table[5], "wake_idx", &sd->wake_idx, ++ sizeof(int), 0644, proc_dointvec_minmax, true); ++ set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, ++ sizeof(int), 0644, proc_dointvec_minmax, true); ++ set_table_entry(&table[7], "busy_factor", &sd->busy_factor, ++ sizeof(int), 0644, proc_dointvec_minmax, false); ++ set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, ++ sizeof(int), 0644, proc_dointvec_minmax, false); ++ set_table_entry(&table[9], "cache_nice_tries", ++ &sd->cache_nice_tries, ++ sizeof(int), 0644, proc_dointvec_minmax, false); ++ set_table_entry(&table[10], "flags", &sd->flags, ++ sizeof(int), 0644, proc_dointvec_minmax, false); ++ set_table_entry(&table[11], "max_newidle_lb_cost", ++ &sd->max_newidle_lb_cost, ++ sizeof(long), 0644, proc_doulongvec_minmax, false); ++ set_table_entry(&table[12], "name", sd->name, ++ CORENAME_MAX_SIZE, 0444, proc_dostring, false); ++ /* &table[13] is terminator */ ++ ++ return table; ++} ++ ++static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) ++{ ++ struct ctl_table *entry, *table; ++ struct sched_domain *sd; ++ int domain_num = 0, i; ++ char buf[32]; ++ ++ for_each_domain(cpu, sd) ++ domain_num++; ++ entry = table = sd_alloc_ctl_entry(domain_num + 1); ++ if (table == NULL) ++ return NULL; ++ ++ i = 0; ++ for_each_domain(cpu, sd) { ++ snprintf(buf, 32, "domain%d", i); ++ entry->procname = kstrdup(buf, GFP_KERNEL); ++ entry->mode = 0555; ++ entry->child = sd_alloc_ctl_domain_table(sd); ++ entry++; ++ i++; ++ } ++ return table; ++} ++ ++static cpumask_var_t sd_sysctl_cpus; ++static struct ctl_table_header *sd_sysctl_header; ++ ++void register_sched_domain_sysctl(void) ++{ ++ static struct ctl_table *cpu_entries; ++ static struct ctl_table **cpu_idx; ++ char buf[32]; ++ int i; ++ ++ if (!cpu_entries) { ++ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); ++ if (!cpu_entries) ++ return; ++ ++ WARN_ON(sd_ctl_dir[0].child); ++ sd_ctl_dir[0].child = cpu_entries; ++ } ++ ++ if (!cpu_idx) { ++ struct ctl_table *e = 
cpu_entries; ++ ++ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); ++ if (!cpu_idx) ++ return; ++ ++ /* deal with sparse possible map */ ++ for_each_possible_cpu(i) { ++ cpu_idx[i] = e; ++ e++; ++ } ++ } ++ ++ if (!cpumask_available(sd_sysctl_cpus)) { ++ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) ++ return; ++ ++ /* init to possible to not have holes in @cpu_entries */ ++ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); ++ } ++ ++ for_each_cpu(i, sd_sysctl_cpus) { ++ struct ctl_table *e = cpu_idx[i]; ++ ++ if (e->child) ++ sd_free_ctl_entry(&e->child); ++ ++ if (!e->procname) { ++ snprintf(buf, 32, "cpu%d", i); ++ e->procname = kstrdup(buf, GFP_KERNEL); ++ } ++ e->mode = 0555; ++ e->child = sd_alloc_ctl_cpu_table(i); ++ ++ __cpumask_clear_cpu(i, sd_sysctl_cpus); ++ } ++ ++ WARN_ON(sd_sysctl_header); ++ sd_sysctl_header = register_sysctl_table(sd_ctl_root); ++} ++ ++void dirty_sched_domain_sysctl(int cpu) ++{ ++ if (cpumask_available(sd_sysctl_cpus)) ++ __cpumask_set_cpu(cpu, sd_sysctl_cpus); ++} ++ ++/* may be called multiple times per register */ ++void unregister_sched_domain_sysctl(void) ++{ ++ unregister_sysctl_table(sd_sysctl_header); ++ sd_sysctl_header = NULL; ++} ++#endif /* CONFIG_SYSCTL */ ++ ++void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) { ++ cpumask_set_cpu(cpu_of(rq), rq->rd->online); ++ rq->online = true; ++ } ++} ++ ++void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) { ++ int cpu = cpu_of(rq); ++ ++ cpumask_clear_cpu(cpu, rq->rd->online); ++ rq->online = false; ++ clear_cpuidle_map(cpu); ++ } ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. ++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) { ++ sched_domains_numa_masks_set(cpu); ++ cpuset_cpu_active(); ++ } ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all CPUs have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. 
++ */ ++ rq_lock_irqsave(rq, &flags); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_online(rq); ++ } ++ unbind_zero(cpu); ++ rq_unlock_irqrestore(rq, &flags); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu_mult(call_rcu, call_rcu_sched); ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ sched_domains_numa_masks_clear(cpu); ++ return 0; ++} ++ ++int sched_cpu_starting(unsigned int __maybe_unused cpu) ++{ ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ double_rq_lock(rq, cpu_rq(0)); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_offline(rq); ++ } ++ bind_zero(cpu); ++ double_rq_unlock(rq, cpu_rq(0)); ++ sched_start_tick(rq, cpu); ++ hrexpiry_clear(rq); ++ local_irq_restore(flags); ++ ++ return 0; ++} ++#endif ++ ++#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) ++/* ++ * Cheaper version of the below functions in case support for SMT and MC is ++ * compiled in but CPUs have no siblings. ++ */ ++static bool sole_cpu_idle(struct rq *rq) ++{ ++ return rq_idle(rq); ++} ++#endif ++#ifdef CONFIG_SCHED_SMT ++static const cpumask_t *thread_cpumask(int cpu) ++{ ++ return topology_sibling_cpumask(cpu); ++} ++/* All this CPU's SMT siblings are idle */ ++static bool siblings_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->thread_mask, &cpu_idle_map); ++} ++#endif ++#ifdef CONFIG_SCHED_MC ++static const cpumask_t *core_cpumask(int cpu) ++{ ++ return topology_core_cpumask(cpu); ++} ++/* All this CPU's shared cache siblings are idle */ ++static bool cache_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->core_mask, &cpu_idle_map); ++} ++#endif ++ ++enum sched_domain_level { ++ SD_LV_NONE = 0, ++ SD_LV_SIBLING, ++ SD_LV_MC, ++ SD_LV_BOOK, ++ SD_LV_CPU, ++ SD_LV_NODE, ++ SD_LV_ALLNODES, ++ SD_LV_MAX ++}; ++ ++void __init sched_init_smp(void) ++{ ++ struct sched_domain *sd; ++ int cpu, other_cpu; ++#ifdef CONFIG_SCHED_SMT ++ bool smt_threads = false; ++#endif ++ cpumask_var_t non_isolated_cpus; ++ struct rq *rq; ++ ++ alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); ++ ++ sched_init_numa(); ++ ++ /* ++ * There's no userspace yet to cause hotplug operations; hence all the ++ * cpu masks are stable and all blatant races in the below code cannot ++ * happen. ++ */ ++ mutex_lock(&sched_domains_mutex); ++ sched_init_domains(cpu_active_mask); ++ cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); ++ if (cpumask_empty(non_isolated_cpus)) ++ cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); ++ mutex_unlock(&sched_domains_mutex); ++ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) ++ BUG(); ++ free_cpumask_var(non_isolated_cpus); ++ ++ mutex_lock(&sched_domains_mutex); ++ local_irq_disable(); ++ lock_all_rqs(); ++ /* ++ * Set up the relative cache distance of each online cpu from each ++ * other in a simple array for quick lookup. 
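In the code below this works
++ * out to: 0 = same CPU, 1 = SMT sibling, 2 = core sharing cache (MC),
++ * 3 = same NUMA node, 4 = a different node.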
Locality is determined ++ * by the closest sched_domain that CPUs are separated by. CPUs with ++ * shared cache in SMT and MC are treated as local. Separate CPUs ++ * (within the same package or physically) within the same node are ++ * treated as not local. CPUs not even in the same domain (different ++ * nodes) are treated as very distant. ++ */ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ ++ /* First check if this cpu is in the same node */ ++ for_each_domain(cpu, sd) { ++ if (sd->level > SD_LV_MC) ++ continue; ++ /* Set locality to local node if not already found lower */ ++ for_each_cpu(other_cpu, sched_domain_span(sd)) { ++ if (rq->cpu_locality[other_cpu] > 3) ++ rq->cpu_locality[other_cpu] = 3; ++ } ++ } ++ ++ /* ++ * Each runqueue has its own function in case it doesn't have ++ * siblings of its own allowing mixed topologies. ++ */ ++#ifdef CONFIG_SCHED_MC ++ for_each_cpu(other_cpu, core_cpumask(cpu)) { ++ if (rq->cpu_locality[other_cpu] > 2) ++ rq->cpu_locality[other_cpu] = 2; ++ } ++ if (cpumask_weight(core_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->core_mask, core_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->core_mask); ++ rq->cache_idle = cache_cpu_idle; ++ } ++#endif ++#ifdef CONFIG_SCHED_SMT ++ if (cpumask_weight(thread_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->thread_mask); ++ for_each_cpu(other_cpu, thread_cpumask(cpu)) ++ rq->cpu_locality[other_cpu] = 1; ++ rq->siblings_idle = siblings_cpu_idle; ++ smt_threads = true; ++ } ++#endif ++ } ++ for_each_possible_cpu(cpu) { ++ int total_cpus = 1, locality; ++ ++ rq = cpu_rq(cpu); ++ for (locality = 1; locality <= 4; locality++) { ++ for_each_possible_cpu(other_cpu) { ++ if (rq->cpu_locality[other_cpu] == locality) ++ rq->rq_order[total_cpus++] = cpu_rq(other_cpu); ++ } ++ } ++ } ++#ifdef CONFIG_SMT_NICE ++ if (smt_threads) { ++ check_siblings = &check_smt_siblings; ++ wake_siblings = &wake_smt_siblings; ++ smt_schedule = &smt_should_schedule; ++ } ++#endif ++ unlock_all_rqs(); ++ local_irq_enable(); ++ mutex_unlock(&sched_domains_mutex); ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ ++ for_each_online_cpu(other_cpu) { ++ if (other_cpu <= cpu) ++ continue; ++ printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); ++ } ++ } ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++ sched_smp_initialized = true; ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. 
++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++#ifdef CONFIG_SMP ++ int cpu_ids; ++#endif ++ int i; ++ struct rq *rq; ++ ++ sched_clock_init(); ++ ++ wait_bit_init(); ++ ++ prio_ratios[0] = 128; ++ for (i = 1 ; i < NICE_WIDTH ; i++) ++ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; ++ ++ skiplist_node_init(&init_task.node); ++ ++#ifdef CONFIG_SMP ++ init_defrootdomain(); ++ cpumask_clear(&cpu_idle_map); ++#else ++ uprq = &per_cpu(runqueues, 0); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ skiplist_init(&rq->node); ++ rq->sl = new_skiplist(&rq->node); ++ raw_spin_lock_init(&rq->lock); ++ rq->nr_running = 0; ++ rq->nr_uninterruptible = 0; ++ rq->nr_switches = 0; ++ rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0; ++ rq->last_jiffy = jiffies; ++ rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns = ++ rq->iowait_ns = rq->idle_ns = 0; ++ rq->dither = 0; ++ set_rq_task(rq, &init_task); ++ rq->iso_ticks = 0; ++ rq->iso_refractory = false; ++#ifdef CONFIG_SMP ++ rq->sd = NULL; ++ rq->rd = NULL; ++ rq->online = false; ++ rq->cpu = i; ++ rq_attach_root(rq, &def_root_domain); ++#endif ++ init_rq_hrexpiry(rq); ++ atomic_set(&rq->nr_iowait, 0); ++ } ++ ++#ifdef CONFIG_SMP ++ cpu_ids = i; ++ /* ++ * Set the base locality for cpu cache distance calculation to ++ * "distant" (3). Make sure the distance from a CPU to itself is 0. ++ */ ++ for_each_possible_cpu(i) { ++ int j; ++ ++ rq = cpu_rq(i); ++#ifdef CONFIG_SCHED_SMT ++ rq->siblings_idle = sole_cpu_idle; ++#endif ++#ifdef CONFIG_SCHED_MC ++ rq->cache_idle = sole_cpu_idle; ++#endif ++ rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC); ++ for_each_possible_cpu(j) { ++ if (i == j) ++ rq->cpu_locality[j] = 0; ++ else ++ rq->cpu_locality[j] = 4; ++ } ++ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); ++ rq->rq_order[0] = rq; ++ for (j = 1; j < cpu_ids; j++) ++ rq->rq_order[j] = cpu_rq(j); ++ } ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". 
++ */ ++ init_idle(current, smp_processor_id()); ++ ++#ifdef CONFIG_SMP ++ /* May be allocated at isolcpus cmdline parse time */ ++ if (cpu_isolated_map == NULL) ++ zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); ++ idle_thread_set_boot_cpu(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. ++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current)) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && !preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++static inline void normalise_rt_tasks(void) ++{ ++ struct task_struct *g, *p; ++ unsigned long flags; ++ struct rq *rq; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p) && !iso_task(p)) ++ continue; ++ ++ rq = task_rq_lock(p, &flags); ++ __setscheduler(p, rq, SCHED_NORMAL, 0, false); ++ task_rq_unlock(rq, p, &flags); ++ } ++ read_unlock(&tasklist_lock); ++} ++ ++void normalize_rt_tasks(void) ++{ ++ normalise_rt_tasks(); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. 
Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++void init_idle_bootup_task(struct task_struct *idle) ++{} ++ ++#ifdef CONFIG_SCHED_DEBUG ++__read_mostly bool sched_debug_enabled; ++ ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ ++#ifdef CONFIG_SMP ++#define SCHED_LOAD_SHIFT (10) ++#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) ++ ++unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) ++{ ++ return SCHED_LOAD_SCALE; ++} ++ ++unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) ++{ ++ unsigned long weight = cpumask_weight(sched_domain_span(sd)); ++ unsigned long smt_gain = sd->smt_gain; ++ ++ smt_gain /= weight; ++ ++ return smt_gain; ++} ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? 
container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. ++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_files[] = { ++ { } /* Terminate */ ++}; ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .early_init = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ +diff -Nur a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h +--- a/kernel/sched/MuQSS.h 1970-01-01 01:00:00.000000000 +0100 ++++ b/kernel/sched/MuQSS.h 2019-01-05 20:22:51.099998516 +0000 +@@ -0,0 +1,725 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_PARAVIRT ++#include ++#endif ++ ++#include "cpuacct.h" ++ ++#ifndef MUQSS_SCHED_H ++#define MUQSS_SCHED_H ++ ++#ifdef CONFIG_SCHED_DEBUG ++# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) ++#else ++# define SCHED_WARN_ON(x) ((void)(x)) ++#endif ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++struct rq; ++ ++#ifdef CONFIG_SMP ++ ++static inline bool sched_asym_prefer(int a, int b) ++{ ++ return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); ++} ++ ++/* ++ * We add the notion of a root-domain which will be used to define per-domain ++ * variables. Each exclusive cpuset essentially defines an island domain by ++ * fully partitioning the member cpus from any other cpuset. Whenever a new ++ * exclusive cpuset is created, we also create and attach a new root-domain ++ * object. 
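For example, partitioning an 8-CPU
++ * machine into two exclusive 4-CPU cpusets yields two root domains,
++ * each spanning only its own four CPUs.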
++ * ++ */ ++struct root_domain { ++ atomic_t refcount; ++ atomic_t rto_count; ++ struct rcu_head rcu; ++ cpumask_var_t span; ++ cpumask_var_t online; ++ ++ /* Indicate more than one runnable task for any CPU */ ++ bool overload; ++ ++ /* ++ * The bit corresponding to a CPU gets set here if such CPU has more ++ * than one runnable -deadline task (as it is below for RT tasks). ++ */ ++ cpumask_var_t dlo_mask; ++ atomic_t dlo_count; ++ /* Replace unused CFS structures with void */ ++ //struct dl_bw dl_bw; ++ //struct cpudl cpudl; ++ void *dl_bw; ++ void *cpudl; ++ ++ /* ++ * The "RT overload" flag: it gets set if a CPU has more than ++ * one runnable RT task. ++ */ ++ cpumask_var_t rto_mask; ++ //struct cpupri cpupri; ++ void *cpupri; ++ ++ unsigned long max_cpu_capacity; ++}; ++ ++extern struct root_domain def_root_domain; ++extern struct mutex sched_domains_mutex; ++ ++extern void init_defrootdomain(void); ++extern int sched_init_domains(const struct cpumask *cpu_map); ++extern void rq_attach_root(struct rq *rq, struct root_domain *rd); ++ ++static inline void cpupri_cleanup(void __maybe_unused *cpupri) ++{ ++} ++ ++static inline void cpudl_cleanup(void __maybe_unused *cpudl) ++{ ++} ++ ++static inline void init_dl_bw(void __maybe_unused *dl_bw) ++{ ++} ++ ++static inline int cpudl_init(void __maybe_unused *dl_bw) ++{ ++ return 0; ++} ++ ++static inline int cpupri_init(void __maybe_unused *cpupri) ++{ ++ return 0; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * This is the main, per-CPU runqueue data structure. ++ * This data should only be modified by the local cpu. ++ */ ++struct rq { ++ raw_spinlock_t lock; ++ ++ struct task_struct *curr, *idle, *stop; ++ struct mm_struct *prev_mm; ++ ++ unsigned int nr_running; ++ /* ++ * This is part of a global counter where only the total sum ++ * over all CPUs matters. A task can increase this counter on ++ * one CPU and if it got migrated afterwards it may decrease ++ * it on another CPU. 
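(For instance, a task that
++ * blocks uninterruptibly on CPU 0 raises CPU 0's count, while its later
++ * wakeup handled on CPU 1 lowers CPU 1's; only the global sum is
++ * meaningful.)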
Always updated under the runqueue lock: ++ */ ++ unsigned long nr_uninterruptible; ++ u64 nr_switches; ++ ++ /* Stored data about rq->curr to work outside rq lock */ ++ u64 rq_deadline; ++ int rq_prio; ++ ++ /* Best queued id for use outside lock */ ++ u64 best_key; ++ ++ unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */ ++ unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ ++ u64 niffies; /* Last time this RQ updated rq clock */ ++ u64 last_niffy; /* Last niffies as updated by local clock */ ++ u64 last_jiffy_niffies; /* Niffies @ last_jiffy */ ++ ++ u64 load_update; /* When we last updated load */ ++ unsigned long load_avg; /* Rolling load average */ ++#ifdef CONFIG_SMT_NICE ++ struct mm_struct *rq_mm; ++ int rq_smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++ /* Accurate timekeeping data */ ++ unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns, ++ iowait_ns, idle_ns; ++ atomic_t nr_iowait; ++ ++ skiplist_node node; ++ skiplist *sl; ++#ifdef CONFIG_SMP ++ struct task_struct *preempt; /* Preempt triggered on this task */ ++ struct task_struct *preempting; /* Hint only, what task is preempting */ ++ ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++ struct root_domain *rd; ++ struct sched_domain *sd; ++ ++ unsigned long cpu_capacity_orig; ++ ++ int *cpu_locality; /* CPU relative cache distance */ ++ struct rq **rq_order; /* RQs ordered by relative cache distance */ ++ ++#ifdef CONFIG_SCHED_SMT ++ cpumask_t thread_mask; ++ bool (*siblings_idle)(struct rq *rq); ++ /* See if all smt siblings are idle */ ++#endif /* CONFIG_SCHED_SMT */ ++#ifdef CONFIG_SCHED_MC ++ cpumask_t core_mask; ++ bool (*cache_idle)(struct rq *rq); ++ /* See if all cache siblings are idle */ ++#endif /* CONFIG_SCHED_MC */ ++#endif /* CONFIG_SMP */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ u64 clock, old_clock, last_tick; ++ u64 clock_task; ++ int dither; ++ ++ int iso_ticks; ++ bool iso_refractory; ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++ struct hrtimer hrexpiry_timer; ++#endif ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++ ++#ifdef CONFIG_SMP ++ struct llist_head wake_list; ++#endif ++ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++#ifdef CONFIG_SMP ++struct rq *cpu_rq(int cpu); ++#endif ++ ++#ifndef CONFIG_SMP ++extern struct rq *uprq; ++#define cpu_rq(cpu) (uprq) ++#define this_rq() (uprq) ++#define raw_rq() (uprq) ++#define task_rq(p) (uprq) ++#define cpu_curr(cpu) ((uprq)->curr) ++#else /* CONFIG_SMP */ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define this_rq() this_cpu_ptr(&runqueues) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#endif /* CONFIG_SMP */ ++ ++static inline int task_current(struct rq *rq, struct task_struct *p) ++{ ++ return rq->curr == p; ++} ++ ++static inline int task_running(struct rq *rq, struct task_struct *p) ++{ ++#ifdef CONFIG_SMP ++ return p->on_cpu; ++#else ++ return task_current(rq, p); ++#endif ++} ++ ++static inline void rq_lock(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock(&rq->lock); ++} ++ ++static inline void rq_unlock(struct rq *rq) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(&rq->lock); ++} ++ ++static inline void rq_lock_irq(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irq(&rq->lock); ++} ++ ++static inline void rq_unlock_irq(struct rq *rq) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++static inline void rq_lock_irqsave(struct rq *rq, unsigned long *flags) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irqsave(&rq->lock, *flags); ++} ++ ++static inline void rq_unlock_irqrestore(struct rq *rq, unsigned long *flags) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irqrestore(&rq->lock, *flags); ++} ++ ++static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ while (42) { ++ raw_spin_lock_irqsave(&p->pi_lock, *flags); ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ if (likely(rq == task_rq(p))) ++ break; ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); ++ } ++ return rq; ++} ++ ++static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ rq_unlock(rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); ++} ++ ++static inline struct rq *__task_rq_lock(struct task_struct *p) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ while (42) { ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ if (likely(rq == task_rq(p))) ++ break; ++ raw_spin_unlock(&rq->lock); ++ } ++ return rq; ++} ++ ++static inline void __task_rq_unlock(struct rq *rq) ++{ ++ rq_unlock(rq); ++} ++ ++/* ++ * {de,en}queue flags: Most not used on MuQSS. ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks ++ * are in a known state which allows modification. Such pairs ++ * should preserve as much state as possible. 
++ * ++ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location ++ * in the runqueue. ++ * ++ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) ++ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) ++ * ENQUEUE_MIGRATED - the task was migrated during wakeup ++ * ++ */ ++ ++#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ ++ ++#define ENQUEUE_RESTORE 0x02 ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ return rq->clock_task; ++} ++ ++#ifdef CONFIG_NUMA ++enum numa_topology_type { ++ NUMA_DIRECT, ++ NUMA_GLUELESS_MESH, ++ NUMA_BACKPLANE, ++}; ++extern enum numa_topology_type sched_numa_topology_type; ++extern int sched_max_numa_distance; ++extern bool find_numa_distance(int distance); ++ ++extern void sched_init_numa(void); ++extern void sched_domains_numa_masks_set(unsigned int cpu); ++extern void sched_domains_numa_masks_clear(unsigned int cpu); ++#else ++static inline void sched_init_numa(void) { } ++static inline void sched_domains_numa_masks_set(unsigned int cpu) { } ++static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } ++#endif ++ ++extern struct mutex sched_domains_mutex; ++extern struct static_key_false sched_schedstats; ++ ++#define rcu_dereference_check_sched_domain(p) \ ++ rcu_dereference_check((p), \ ++ lockdep_is_held(&sched_domains_mutex)) ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * The domain tree (rq->sd) is protected by RCU's quiescent state transition. ++ * See detach_destroy_domains: synchronize_sched for details. ++ * ++ * The domain tree of any CPU may only be accessed from within ++ * preempt-disabled sections. ++ */ ++#define for_each_domain(cpu, __sd) \ ++ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ ++ __sd; __sd = __sd->parent) ++ ++#define for_each_lower_domain(sd) for (; sd; sd = sd->child) ++ ++/** ++ * highest_flag_domain - Return highest sched_domain containing flag. ++ * @cpu: The cpu whose highest level of sched domain is to ++ * be returned. ++ * @flag: The flag to check for the highest sched_domain ++ * for the given cpu. ++ * ++ * Returns the highest sched_domain of a cpu which contains the given flag. ++ */ ++static inline struct sched_domain *highest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd, *hsd = NULL; ++ ++ for_each_domain(cpu, sd) { ++ if (!(sd->flags & flag)) ++ break; ++ hsd = sd; ++ } ++ ++ return hsd; ++} ++ ++static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd; ++ ++ for_each_domain(cpu, sd) { ++ if (sd->flags & flag) ++ break; ++ } ++ ++ return sd; ++} ++ ++DECLARE_PER_CPU(struct sched_domain *, sd_llc); ++DECLARE_PER_CPU(int, sd_llc_size); ++DECLARE_PER_CPU(int, sd_llc_id); ++DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); ++DECLARE_PER_CPU(struct sched_domain *, sd_numa); ++DECLARE_PER_CPU(struct sched_domain *, sd_asym); ++ ++struct sched_group_capacity { ++ atomic_t ref; ++ /* ++ * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity ++ * for a single CPU. 
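With SCHED_CAPACITY_SCALE at 1024, a
++ * group of four full-capacity CPUs would, for instance, report 4096
++ * here.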
++ */ ++ unsigned long capacity; ++ unsigned long min_capacity; /* Min per-CPU capacity in group */ ++ unsigned long next_update; ++ int imbalance; /* XXX unrelated to capacity but shared group state */ ++ ++#ifdef CONFIG_SCHED_DEBUG ++ int id; ++#endif ++ ++ unsigned long cpumask[0]; /* balance mask */ ++}; ++ ++struct sched_group { ++ struct sched_group *next; /* Must be a circular list */ ++ atomic_t ref; ++ ++ unsigned int group_weight; ++ struct sched_group_capacity *sgc; ++ int asym_prefer_cpu; /* cpu of highest priority in group */ ++ ++ /* ++ * The CPUs this group covers. ++ * ++ * NOTE: this field is variable length. (Allocated dynamically ++ * by attaching extra space to the end of the structure, ++ * depending on how many CPUs the kernel has booted up with) ++ */ ++ unsigned long cpumask[0]; ++}; ++ ++static inline struct cpumask *sched_group_span(struct sched_group *sg) ++{ ++ return to_cpumask(sg->cpumask); ++} ++ ++/* ++ * See build_balance_mask(). ++ */ ++static inline struct cpumask *group_balance_mask(struct sched_group *sg) ++{ ++ return to_cpumask(sg->sgc->cpumask); ++} ++ ++/** ++ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. ++ * @group: The group whose first cpu is to be returned. ++ */ ++static inline unsigned int group_first_cpu(struct sched_group *group) ++{ ++ return cpumask_first(sched_group_span(group)); ++} ++ ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void dirty_sched_domain_sysctl(int cpu); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void dirty_sched_domain_sysctl(int cpu) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++extern void sched_ttwu_pending(void); ++extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); ++extern void set_rq_online (struct rq *rq); ++extern void set_rq_offline(struct rq *rq); ++extern bool sched_smp_initialized; ++ ++static inline void update_group_capacity(struct sched_domain *sd, int cpu) ++{ ++} ++ ++static inline void trigger_load_balance(struct rq *rq) ++{ ++} ++ ++#define sched_feat(x) 0 ++ ++#else /* CONFIG_SMP */ ++ ++static inline void sched_ttwu_pending(void) { } ++ ++#endif /* CONFIG_SMP */ ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ SCHED_WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++#ifdef CONFIG_SCHED_DEBUG ++extern bool sched_debug_enabled; ++#endif ++ ++extern void schedule_idle(void); ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. 
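The __u64_stats_fetch_begin()/
++ * __u64_stats_fetch_retry() pair below makes the 64-bit read of
++ * irqtime->total consistent even on 32-bit architectures, where a u64
++ * load is not atomic.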
++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++#ifdef CONFIG_SMP ++static inline int cpu_of(struct rq *rq) ++{ ++ return rq->cpu; ++} ++#else /* CONFIG_SMP */ ++static inline int cpu_of(struct rq *rq) ++{ ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); ++ ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, ++ cpu_of(rq))); ++ ++ if (data) ++ data->func(data, rq->niffies, flags); ++} ++#else ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flag) ++{ ++} ++#endif /* CONFIG_CPU_FREQ */ ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++/* ++ * This should only be called when current == rq->idle. Dodgy workaround for ++ * when softirqs are pending and we are in the idle loop. Setting current to ++ * resched will kick us out of the idle loop and the softirqs will be serviced ++ * on our next pass through schedule(). ++ */ ++static inline bool softirq_pending(int cpu) ++{ ++ if (likely(!local_softirq_pending())) ++ return false; ++ set_tsk_need_resched(current); ++ return true; ++} ++ ++#ifdef CONFIG_64BIT ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ return tsk_seruntime(t); ++} ++#else ++struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags); ++void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags); ++ ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ unsigned long flags; ++ u64 ns; ++ struct rq *rq; ++ ++ rq = task_rq_lock(t, &flags); ++ ns = tsk_seruntime(t); ++ task_rq_unlock(rq, t, &flags); ++ ++ return ns; ++} ++#endif ++ ++#endif /* MUQSS_SCHED_H */ +diff -Nur a/kernel/sched/sched.h b/kernel/sched/sched.h +--- a/kernel/sched/sched.h 2018-12-21 13:13:19.000000000 +0000 ++++ b/kernel/sched/sched.h 2019-01-05 20:22:51.099998516 +0000 +@@ -1,5 +1,8 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + ++#ifdef CONFIG_SCHED_MUQSS ++#include "MuQSS.h" ++#else /* CONFIG_SCHED_MUQSS */ + #include + #include + #include +@@ -2103,3 +2106,29 @@ + #else /* arch_scale_freq_capacity */ + #define arch_scale_freq_invariant() (false) + #endif ++ ++static inline bool softirq_pending(int cpu) ++{ ++ return false; ++} ++ ++#ifdef CONFIG_64BIT ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ return t->se.sum_exec_runtime; ++} ++#else ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ u64 ns; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ rq = task_rq_lock(t, &rf); ++ ns = t->se.sum_exec_runtime; ++ task_rq_unlock(rq, t, &rf); ++ ++ return ns; ++} ++#endif ++#endif /* CONFIG_SCHED_MUQSS */ +diff -Nur a/kernel/skip_list.c b/kernel/skip_list.c +--- a/kernel/skip_list.c 1970-01-01 01:00:00.000000000 +0100 ++++ b/kernel/skip_list.c 2019-01-05 20:22:51.099998516 +0000 +@@ -0,0 +1,148 @@ ++/* ++ Copyright (C) 2011,2016 Con Kolivas. ++ ++ Code based on example originally by William Pugh. 
++ ++Skip Lists are a probabilistic alternative to balanced trees, as ++described in the June 1990 issue of CACM and were invented by ++William Pugh in 1987. ++ ++A couple of comments about this implementation: ++The routine randomLevel has been hard-coded to generate random ++levels using p=0.25. It can be easily changed. ++ ++The insertion routine has been implemented so as to use the ++dirty hack described in the CACM paper: if a random level is ++generated that is more than the current maximum level, the ++current maximum level plus one is used instead. ++ ++Levels start at zero and go up to MaxLevel (which is equal to ++MaxNumberOfLevels-1). ++ ++The routines defined in this file are: ++ ++init: defines slnode ++ ++new_skiplist: returns a new, empty list ++ ++randomLevel: Returns a random level based on a u64 random seed passed to it. ++In MuQSS, the "niffy" time is used for this purpose. ++ ++insert(l,key, value): inserts the binding (key, value) into l. This operation ++occurs in O(log n) time. ++ ++delnode(slnode, l, node): deletes any binding of key from the l based on the ++actual node value. This operation occurs in O(k) time where k is the ++number of levels of the node in question (max 8). The original delete ++function occurred in O(log n) time and involved a search. ++ ++MuQSS Notes: In this implementation of skiplists, there are bidirectional ++next/prev pointers and the insert function returns a pointer to the actual ++node the value is stored. The key here is chosen by the scheduler so as to ++sort tasks according to the priority list requirements and is no longer used ++by the scheduler after insertion. The scheduler lookup, however, occurs in ++O(1) time because it is always the first item in the level 0 linked list. ++Since the task struct stores a copy of the node pointer upon skiplist_insert, ++it can also remove it much faster than the original implementation with the ++aid of prev<->next pointer manipulation and no searching. 
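++
++A note on randomLevel as implemented below: it returns half the index of
++the lowest set bit of the random seed, so reaching level k or higher
++requires the low 2k bits to all be clear, which happens with probability
++(1/4)^k. This realises the hard-coded p = 0.25 above and caps the level
++at 3 under the 8-level maximum.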
++ ++*/ ++ ++#include ++#include ++ ++#define MaxNumberOfLevels 8 ++#define MaxLevel (MaxNumberOfLevels - 1) ++ ++void skiplist_init(skiplist_node *slnode) ++{ ++ int i; ++ ++ slnode->key = 0xFFFFFFFFFFFFFFFF; ++ slnode->level = 0; ++ slnode->value = NULL; ++ for (i = 0; i < MaxNumberOfLevels; i++) ++ slnode->next[i] = slnode->prev[i] = slnode; ++} ++ ++skiplist *new_skiplist(skiplist_node *slnode) ++{ ++ skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC); ++ ++ BUG_ON(!l); ++ l->header = slnode; ++ return l; ++} ++ ++void free_skiplist(skiplist *l) ++{ ++ skiplist_node *p, *q; ++ ++ p = l->header; ++ do { ++ q = p->next[0]; ++ p->next[0]->prev[0] = q->prev[0]; ++ skiplist_node_init(p); ++ p = q; ++ } while (p != l->header); ++ kfree(l); ++} ++ ++void skiplist_node_init(skiplist_node *node) ++{ ++ memset(node, 0, sizeof(skiplist_node)); ++} ++ ++static inline unsigned int randomLevel(const long unsigned int randseed) ++{ ++ return find_first_bit(&randseed, MaxLevel) / 2; ++} ++ ++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed) ++{ ++ skiplist_node *update[MaxNumberOfLevels]; ++ skiplist_node *p, *q; ++ int k = l->level; ++ ++ p = l->header; ++ do { ++ while (q = p->next[k], q->key <= key) ++ p = q; ++ update[k] = p; ++ } while (--k >= 0); ++ ++ ++l->entries; ++ k = randomLevel(randseed); ++ if (k > l->level) { ++ k = ++l->level; ++ update[k] = l->header; ++ } ++ ++ node->level = k; ++ node->key = key; ++ node->value = value; ++ do { ++ p = update[k]; ++ node->next[k] = p->next[k]; ++ p->next[k] = node; ++ node->prev[k] = p; ++ node->next[k]->prev[k] = node; ++ } while (--k >= 0); ++} ++ ++void skiplist_delete(skiplist *l, skiplist_node *node) ++{ ++ int k, m = node->level; ++ ++ for (k = 0; k <= m; k++) { ++ node->prev[k]->next[k] = node->next[k]; ++ node->next[k]->prev[k] = node->prev[k]; ++ } ++ skiplist_node_init(node); ++ if (m == l->level) { ++ while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0) ++ m--; ++ l->level = m; ++ } ++ l->entries--; ++} +diff -Nur a/kernel/sysctl.c b/kernel/sysctl.c +--- a/kernel/sysctl.c 2019-01-05 20:17:13.859238862 +0000 ++++ b/kernel/sysctl.c 2019-01-05 20:22:51.099998516 +0000 +@@ -135,6 +135,12 @@ + static unsigned long one_ul __read_only = 1; + static int one_hundred __read_only = 100; + static int one_thousand __read_only = 1000; ++#ifdef CONFIG_SCHED_MUQSS ++extern int rr_interval; ++extern int sched_interactive; ++extern int sched_iso_cpu; ++extern int sched_yield_type; ++#endif + #ifdef CONFIG_PRINTK + static int ten_thousand __read_only = 10000; + #endif +@@ -296,7 +302,7 @@ + { } + }; + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS) + static int min_sched_granularity_ns __read_only = 100000; /* 100 usecs */ + static int max_sched_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns __read_only; /* 0 usecs */ +@@ -313,6 +319,7 @@ + #endif + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_MUQSS + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -475,6 +482,7 @@ + .extra1 = &one, + }, + #endif ++#endif /* !CONFIG_SCHED_MUQSS */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -1073,6 +1081,44 @@ + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_MUQSS ++ { ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = 
&proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &one_thousand, ++ }, ++ { ++ .procname = "interactive", ++ .data = &sched_interactive, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, ++ { ++ .procname = "iso_cpu", ++ .data = &sched_iso_cpu, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one_hundred, ++ }, ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &two, ++ }, ++#endif + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff -Nur a/kernel/time/clockevents.c b/kernel/time/clockevents.c +--- a/kernel/time/clockevents.c 2018-12-21 13:13:19.000000000 +0000 ++++ b/kernel/time/clockevents.c 2019-01-05 20:22:51.099998516 +0000 +@@ -198,8 +198,13 @@ + + #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST + ++#ifdef CONFIG_SCHED_MUQSS ++/* Limit min_delta to 100us */ ++#define MIN_DELTA_LIMIT (NSEC_PER_SEC / 10000) ++#else + /* Limit min_delta to a jiffie */ + #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) ++#endif + + /** + * clockevents_increase_min_delta - raise minimum delta of a clock event device +diff -Nur a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +--- a/kernel/time/posix-cpu-timers.c 2018-12-21 13:13:19.000000000 +0000 ++++ b/kernel/time/posix-cpu-timers.c 2019-01-05 20:22:51.109998835 +0000 +@@ -818,7 +818,7 @@ + tsk_expires->virt_exp = expires; + + tsk_expires->sched_exp = check_timers_list(++timers, firing, +- tsk->se.sum_exec_runtime); ++ tsk_seruntime(tsk)); + + /* + * Check for the special case thread timers. +@@ -828,7 +828,7 @@ + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + if (hard != RLIM_INFINITY && +- tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { ++ tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { + /* + * At the hard limit, we just die. + * No need to calculate anything else now. +@@ -840,7 +840,7 @@ + __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + return; + } +- if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { ++ if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { + /* + * At the soft limit, send a SIGXCPU every second. + */ +@@ -1081,7 +1081,7 @@ + struct task_cputime task_sample; + + task_cputime(tsk, &task_sample.utime, &task_sample.stime); +- task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime; ++ task_sample.sum_exec_runtime = tsk_seruntime(tsk); + if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) + return 1; + } +diff -Nur a/kernel/time/timer.c b/kernel/time/timer.c +--- a/kernel/time/timer.c 2019-01-05 20:17:13.859238862 +0000 ++++ b/kernel/time/timer.c 2019-01-05 20:22:51.109998835 +0000 +@@ -1434,7 +1434,7 @@ + * Check, if the next hrtimer event is before the next timer wheel + * event: + */ +-static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) ++static u64 cmp_next_hrtimer_event(struct timer_base *base, u64 basem, u64 expires) + { + u64 nextevt = hrtimer_get_next_event(); + +@@ -1452,6 +1452,9 @@ + if (nextevt <= basem) + return basem; + ++ if (nextevt < expires && nextevt - basem <= TICK_NSEC) ++ base->is_idle = false; ++ + /* + * Round up to the next jiffie. 
High resolution timers are + * off, so the hrtimers are expired in the tick and we need to +@@ -1521,7 +1524,7 @@ + } + raw_spin_unlock(&base->lock); + +- return cmp_next_hrtimer_event(basem, expires); ++ return cmp_next_hrtimer_event(base, basem, expires); + } + + /** +diff -Nur a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +--- a/kernel/trace/trace_selftest.c 2018-12-21 13:13:19.000000000 +0000 ++++ b/kernel/trace/trace_selftest.c 2019-01-05 20:22:51.109998835 +0000 +@@ -1041,10 +1041,15 @@ + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_MUQSS ++ /* No deadline on MuQSS, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-0002-BFQ-v8r12-20180404.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-0002-BFQ-v8r12-20180404.patch new file mode 100644 index 00000000..104325d6 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-0002-BFQ-v8r12-20180404.patch @@ -0,0 +1,4611 @@ +From 7bd365a925748767d7ed807e5498f90bae0ebc25 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Tue, 14 Nov 2017 08:28:45 +0100 +Subject: [PATCH 01/23] block, bfq-mq: turn BUG_ON on request-size into WARN_ON + +BFQ has many checks of internal and external consistency. One of them +checks that an I/O request has still sectors to serve, if it happens +to be retired without being served. If the request has no sector to +serve, a BUG_ON signals the failure and causes the kernel to +terminate. Yet, from a crash report by a user [1], this condition may +happen to hold, in apparently correct functioning, for I/O with a +CD/DVD. + +To address this issue, this commit turns the above BUG_ON into a +WARN_ON. This commit also adds a companion WARN_ON on request +insertion into the scheduler. + +[1] https://groups.google.com/d/msg/bfq-iosched/DDOTJBroBa4/VyU1zUFtCgAJ + +Reported-by: Alexandre Frade +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 0c09609a6099..0fc757ae7a42 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -1540,6 +1540,8 @@ static void bfq_add_request(struct request *rq) + + BUG_ON(!RQ_BFQQ(rq)); + BUG_ON(RQ_BFQQ(rq) != bfqq); ++ WARN_ON(blk_rq_sectors(rq) == 0); ++ + elv_rb_add(&bfqq->sort_list, rq); + + /* +@@ -4962,7 +4964,7 @@ static void bfq_finish_request(struct request *rq) + rq_io_start_time_ns(rq), + rq->cmd_flags); + +- BUG_ON(blk_rq_sectors(rq) == 0 && !(rq->rq_flags & RQF_STARTED)); ++ WARN_ON(blk_rq_sectors(rq) == 0 && !(rq->rq_flags & RQF_STARTED)); + + if (likely(rq->rq_flags & RQF_STARTED)) { + unsigned long flags; + +From 1097d368a20456c88acd75b3184c68df38e8f7b8 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Sun, 12 Nov 2017 22:43:46 +0100 +Subject: [PATCH 02/23] block, bfq-sq, bfq-mq: consider also past I/O in soft + real-time detection + +BFQ privileges the I/O of soft real-time applications, such as video +players, to guarantee to these application a high bandwidth and a low +latency. In this respect, it is not easy to correctly detect when an +application is soft real-time. 
A particularly nasty false positive is +that of an I/O-bound application that occasionally happens to meet all +requirements to be deemed as soft real-time. After being detected as +soft real-time, such an application monopolizes the device. Fortunately, +BFQ will realize soon that the application is actually not soft +real-time and suspend every privilege. Yet, the application may happen +again to be wrongly detected as soft real-time, and so on. + +As highlighted by our tests, this problem causes BFQ to occasionally +fail to guarantee a high responsiveness, in the presence of heavy +background I/O workloads. The reason is that the background workload +happens to be detected as soft real-time, more or less frequently, +during the execution of the interactive task under test. To give an +idea, because of this problem, Libreoffice Writer occasionally takes 8 +seconds, instead of 3, to start up, if there are sequential reads and +writes in the background, on a Kingston SSDNow V300. + +This commit addresses this issue by leveraging the following facts. + +The reason why some applications are detected as soft real-time despite +all BFQ checks to avoid false positives, is simply that, during high +CPU or storage-device load, I/O-bound applications may happen to do +I/O slowly enough to meet all soft real-time requirements, and pass +all BFQ extra checks. Yet, this happens only for limited time periods: +slow-speed time intervals are usually interspersed between other time +intervals during which these applications do I/O at a very high speed. +To exploit these facts, this commit introduces a little change, in the +detection of soft real-time behavior, to systematically consider also +the recent past: the higher the speed was in the recent past, the +later next I/O should arrive for the application to be considered as +soft real-time. At the beginning of a slow-speed interval, the minimum +arrival time allowed for the next I/O usually happens to still be so +high, to fall *after* the end of the slow-speed period itself. As a +consequence, the application does not risk to be deemed as soft +real-time during the slow-speed interval. Then, during the next +high-speed interval, the application cannot, evidently, be deemed as +soft real-time (exactly because of its speed), and so on. + +This extra filtering proved to be rather effective: in the above test, +the frequency of false positives became so low that the start-up time +was 3 seconds in all iterations (apart from occasional outliers, +caused by page-cache-management issues, which are out of the scope of +this commit, and cannot be solved by an I/O scheduler). + +Signed-off-by: Paolo Valente +Signed-off-by: Angelo Ruocco +--- + block/bfq-mq-iosched.c | 115 ++++++++++++++++++++++++++++++++++--------------- + block/bfq-sq-iosched.c | 115 ++++++++++++++++++++++++++++++++++--------------- + 2 files changed, 162 insertions(+), 68 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 0fc757ae7a42..4d06d900f45e 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -3201,37 +3201,78 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, + * whereas soft_rt_next_start is set to infinity for applications that do + * not. + * +- * Unfortunately, even a greedy application may happen to behave in an +- * isochronous way if the CPU load is high. 
In fact, the application may +- * stop issuing requests while the CPUs are busy serving other processes, +- * then restart, then stop again for a while, and so on. In addition, if +- * the disk achieves a low enough throughput with the request pattern +- * issued by the application (e.g., because the request pattern is random +- * and/or the device is slow), then the application may meet the above +- * bandwidth requirement too. To prevent such a greedy application to be +- * deemed as soft real-time, a further rule is used in the computation of +- * soft_rt_next_start: soft_rt_next_start must be higher than the current +- * time plus the maximum time for which the arrival of a request is waited +- * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. +- * This filters out greedy applications, as the latter issue instead their +- * next request as soon as possible after the last one has been completed +- * (in contrast, when a batch of requests is completed, a soft real-time +- * application spends some time processing data). ++ * Unfortunately, even a greedy (i.e., I/O-bound) application may ++ * happen to meet, occasionally or systematically, both the above ++ * bandwidth and isochrony requirements. This may happen at least in ++ * the following circumstances. First, if the CPU load is high. The ++ * application may stop issuing requests while the CPUs are busy ++ * serving other processes, then restart, then stop again for a while, ++ * and so on. The other circumstances are related to the storage ++ * device: the storage device is highly loaded or reaches a low-enough ++ * throughput with the I/O of the application (e.g., because the I/O ++ * is random and/or the device is slow). In all these cases, the ++ * I/O of the application may be simply slowed down enough to meet ++ * the bandwidth and isochrony requirements. To reduce the probability ++ * that greedy applications are deemed as soft real-time in these ++ * corner cases, a further rule is used in the computation of ++ * soft_rt_next_start: the return value of this function is forced to ++ * be higher than the maximum between the following two quantities. + * +- * Unfortunately, the last filter may easily generate false positives if +- * only bfqd->bfq_slice_idle is used as a reference time interval and one +- * or both the following cases occur: +- * 1) HZ is so low that the duration of a jiffy is comparable to or higher +- * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with +- * HZ=100. ++ * (a) Current time plus: (1) the maximum time for which the arrival ++ * of a request is waited for when a sync queue becomes idle, ++ * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We ++ * postpone for a moment the reason for adding a few extra ++ * jiffies; we get back to it after next item (b). Lower-bounding ++ * the return value of this function with the current time plus ++ * bfqd->bfq_slice_idle tends to filter out greedy applications, ++ * because the latter issue their next request as soon as possible ++ * after the last one has been completed. In contrast, a soft ++ * real-time application spends some time processing data, after a ++ * batch of its requests has been completed. ++ * ++ * (b) Current value of bfqq->soft_rt_next_start. As pointed out ++ * above, greedy applications may happen to meet both the ++ * bandwidth and isochrony requirements under heavy CPU or ++ * storage-device load. 
In more detail, in these scenarios, these ++ * applications happen, only for limited time periods, to do I/O ++ * slowly enough to meet all the requirements described so far, ++ * including the filtering in above item (a). These slow-speed ++ * time intervals are usually interspersed between other time ++ * intervals during which these applications do I/O at a very high ++ * speed. Fortunately, exactly because of the high speed of the ++ * I/O in the high-speed intervals, the values returned by this ++ * function happen to be so high, near the end of any such ++ * high-speed interval, to be likely to fall *after* the end of ++ * the low-speed time interval that follows. These high values are ++ * stored in bfqq->soft_rt_next_start after each invocation of ++ * this function. As a consequence, if the last value of ++ * bfqq->soft_rt_next_start is constantly used to lower-bound the ++ * next value that this function may return, then, from the very ++ * beginning of a low-speed interval, bfqq->soft_rt_next_start is ++ * likely to be constantly kept so high that any I/O request ++ * issued during the low-speed interval is considered as arriving ++ * to soon for the application to be deemed as soft ++ * real-time. Then, in the high-speed interval that follows, the ++ * application will not be deemed as soft real-time, just because ++ * it will do I/O at a high speed. And so on. ++ * ++ * Getting back to the filtering in item (a), in the following two ++ * cases this filtering might be easily passed by a greedy ++ * application, if the reference quantity was just ++ * bfqd->bfq_slice_idle: ++ * 1) HZ is so low that the duration of a jiffy is comparable to or ++ * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow ++ * devices with HZ=100. The time granularity may be so coarse ++ * that the approximation, in jiffies, of bfqd->bfq_slice_idle ++ * is rather lower than the exact value. + * 2) jiffies, instead of increasing at a constant rate, may stop increasing + * for a while, then suddenly 'jump' by several units to recover the lost + * increments. This seems to happen, e.g., inside virtual machines. +- * To address this issue, we do not use as a reference time interval just +- * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In +- * particular we add the minimum number of jiffies for which the filter +- * seems to be quite precise also in embedded systems and KVM/QEMU virtual +- * machines. ++ * To address this issue, in the filtering in (a) we do not use as a ++ * reference time interval just bfqd->bfq_slice_idle, but ++ * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the ++ * minimum number of jiffies for which the filter seems to be quite ++ * precise also in embedded systems and KVM/QEMU virtual machines. 
+ */ + static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +@@ -3243,10 +3284,11 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, + jiffies_to_msecs(HZ * bfqq->service_from_backlogged / + bfqd->bfq_wr_max_softrt_rate)); + +- return max(bfqq->last_idle_bklogged + +- HZ * bfqq->service_from_backlogged / +- bfqd->bfq_wr_max_softrt_rate, +- jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); ++ return max3(bfqq->soft_rt_next_start, ++ bfqq->last_idle_bklogged + ++ HZ * bfqq->service_from_backlogged / ++ bfqd->bfq_wr_max_softrt_rate, ++ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); + } + + /** +@@ -4395,10 +4437,15 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bfqq->split_time = bfq_smallest_from_now(); + + /* +- * Set to the value for which bfqq will not be deemed as +- * soft rt when it becomes backlogged. ++ * To not forget the possibly high bandwidth consumed by a ++ * process/queue in the recent past, ++ * bfq_bfqq_softrt_next_start() returns a value at least equal ++ * to the current value of bfqq->soft_rt_next_start (see ++ * comments on bfq_bfqq_softrt_next_start). Set ++ * soft_rt_next_start to now, to mean that bfqq has consumed ++ * no bandwidth so far. + */ +- bfqq->soft_rt_next_start = bfq_greatest_from_now(); ++ bfqq->soft_rt_next_start = jiffies; + + /* first request is almost certainly seeky */ + bfqq->seek_history = 1; +diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c +index 4bbd7f4c0154..987dc255c82c 100644 +--- a/block/bfq-sq-iosched.c ++++ b/block/bfq-sq-iosched.c +@@ -3089,37 +3089,78 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, + * whereas soft_rt_next_start is set to infinity for applications that do + * not. + * +- * Unfortunately, even a greedy application may happen to behave in an +- * isochronous way if the CPU load is high. In fact, the application may +- * stop issuing requests while the CPUs are busy serving other processes, +- * then restart, then stop again for a while, and so on. In addition, if +- * the disk achieves a low enough throughput with the request pattern +- * issued by the application (e.g., because the request pattern is random +- * and/or the device is slow), then the application may meet the above +- * bandwidth requirement too. To prevent such a greedy application to be +- * deemed as soft real-time, a further rule is used in the computation of +- * soft_rt_next_start: soft_rt_next_start must be higher than the current +- * time plus the maximum time for which the arrival of a request is waited +- * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. +- * This filters out greedy applications, as the latter issue instead their +- * next request as soon as possible after the last one has been completed +- * (in contrast, when a batch of requests is completed, a soft real-time +- * application spends some time processing data). ++ * Unfortunately, even a greedy (i.e., I/O-bound) application may ++ * happen to meet, occasionally or systematically, both the above ++ * bandwidth and isochrony requirements. This may happen at least in ++ * the following circumstances. First, if the CPU load is high. The ++ * application may stop issuing requests while the CPUs are busy ++ * serving other processes, then restart, then stop again for a while, ++ * and so on. 
The other circumstances are related to the storage ++ * device: the storage device is highly loaded or reaches a low-enough ++ * throughput with the I/O of the application (e.g., because the I/O ++ * is random and/or the device is slow). In all these cases, the ++ * I/O of the application may be simply slowed down enough to meet ++ * the bandwidth and isochrony requirements. To reduce the probability ++ * that greedy applications are deemed as soft real-time in these ++ * corner cases, a further rule is used in the computation of ++ * soft_rt_next_start: the return value of this function is forced to ++ * be higher than the maximum between the following two quantities. + * +- * Unfortunately, the last filter may easily generate false positives if +- * only bfqd->bfq_slice_idle is used as a reference time interval and one +- * or both the following cases occur: +- * 1) HZ is so low that the duration of a jiffy is comparable to or higher +- * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with +- * HZ=100. ++ * (a) Current time plus: (1) the maximum time for which the arrival ++ * of a request is waited for when a sync queue becomes idle, ++ * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We ++ * postpone for a moment the reason for adding a few extra ++ * jiffies; we get back to it after next item (b). Lower-bounding ++ * the return value of this function with the current time plus ++ * bfqd->bfq_slice_idle tends to filter out greedy applications, ++ * because the latter issue their next request as soon as possible ++ * after the last one has been completed. In contrast, a soft ++ * real-time application spends some time processing data, after a ++ * batch of its requests has been completed. ++ * ++ * (b) Current value of bfqq->soft_rt_next_start. As pointed out ++ * above, greedy applications may happen to meet both the ++ * bandwidth and isochrony requirements under heavy CPU or ++ * storage-device load. In more detail, in these scenarios, these ++ * applications happen, only for limited time periods, to do I/O ++ * slowly enough to meet all the requirements described so far, ++ * including the filtering in above item (a). These slow-speed ++ * time intervals are usually interspersed between other time ++ * intervals during which these applications do I/O at a very high ++ * speed. Fortunately, exactly because of the high speed of the ++ * I/O in the high-speed intervals, the values returned by this ++ * function happen to be so high, near the end of any such ++ * high-speed interval, to be likely to fall *after* the end of ++ * the low-speed time interval that follows. These high values are ++ * stored in bfqq->soft_rt_next_start after each invocation of ++ * this function. As a consequence, if the last value of ++ * bfqq->soft_rt_next_start is constantly used to lower-bound the ++ * next value that this function may return, then, from the very ++ * beginning of a low-speed interval, bfqq->soft_rt_next_start is ++ * likely to be constantly kept so high that any I/O request ++ * issued during the low-speed interval is considered as arriving ++ * to soon for the application to be deemed as soft ++ * real-time. Then, in the high-speed interval that follows, the ++ * application will not be deemed as soft real-time, just because ++ * it will do I/O at a high speed. And so on. 
++ * ++ * Getting back to the filtering in item (a), in the following two ++ * cases this filtering might be easily passed by a greedy ++ * application, if the reference quantity was just ++ * bfqd->bfq_slice_idle: ++ * 1) HZ is so low that the duration of a jiffy is comparable to or ++ * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow ++ * devices with HZ=100. The time granularity may be so coarse ++ * that the approximation, in jiffies, of bfqd->bfq_slice_idle ++ * is rather lower than the exact value. + * 2) jiffies, instead of increasing at a constant rate, may stop increasing + * for a while, then suddenly 'jump' by several units to recover the lost + * increments. This seems to happen, e.g., inside virtual machines. +- * To address this issue, we do not use as a reference time interval just +- * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In +- * particular we add the minimum number of jiffies for which the filter +- * seems to be quite precise also in embedded systems and KVM/QEMU virtual +- * machines. ++ * To address this issue, in the filtering in (a) we do not use as a ++ * reference time interval just bfqd->bfq_slice_idle, but ++ * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the ++ * minimum number of jiffies for which the filter seems to be quite ++ * precise also in embedded systems and KVM/QEMU virtual machines. + */ + static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +@@ -3131,10 +3172,11 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, + jiffies_to_msecs(HZ * bfqq->service_from_backlogged / + bfqd->bfq_wr_max_softrt_rate)); + +- return max(bfqq->last_idle_bklogged + +- HZ * bfqq->service_from_backlogged / +- bfqd->bfq_wr_max_softrt_rate, +- jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); ++ return max3(bfqq->soft_rt_next_start, ++ bfqq->last_idle_bklogged + ++ HZ * bfqq->service_from_backlogged / ++ bfqd->bfq_wr_max_softrt_rate, ++ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); + } + + /** +@@ -4167,10 +4209,15 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bfqq->split_time = bfq_smallest_from_now(); + + /* +- * Set to the value for which bfqq will not be deemed as +- * soft rt when it becomes backlogged. ++ * To not forget the possibly high bandwidth consumed by a ++ * process/queue in the recent past, ++ * bfq_bfqq_softrt_next_start() returns a value at least equal ++ * to the current value of bfqq->soft_rt_next_start (see ++ * comments on bfq_bfqq_softrt_next_start). Set ++ * soft_rt_next_start to now, to mean that bfqq has consumed ++ * no bandwidth so far. + */ +- bfqq->soft_rt_next_start = bfq_greatest_from_now(); ++ bfqq->soft_rt_next_start = jiffies; + + /* first request is almost certainly seeky */ + bfqq->seek_history = 1; + +From 2a09b505660c81dbb80a5d68c9bc558c326d041f Mon Sep 17 00:00:00 2001 +From: Chiara Bruschi +Date: Thu, 7 Dec 2017 09:57:19 +0100 +Subject: [PATCH 03/23] block, bfq-mq: fix occurrences of request + prepare/finish methods' old names + +Commits 'b01f1fa3bb19' (Port of "blk-mq-sched: unify request prepare +methods") and 'cc10d2d7d2c1' (Port of "blk-mq-sched: unify request +finished methods") changed the old names of current bfq_prepare_request +and bfq_finish_request methods, but left them unchanged elsewhere in +the code (related comments, part of function name bfq_put_rq_priv_body). 
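To see the max3() lower bound of the soft real-time patch above at work, here is a minimal userspace sketch (plain C, not kernel code: HZ, the 4-jiffy margin and all numbers below are assumed examples mirroring bfq_bfqq_softrt_next_start()):

#include <stdio.h>

/* Assumed stand-ins for the kernel quantities. */
#define HZ 250UL

static unsigned long max3u(unsigned long a, unsigned long b, unsigned long c)
{
	unsigned long m = a > b ? a : b;
	return m > c ? m : c;
}

/* Mirrors the max3() in bfq_bfqq_softrt_next_start(): the returned
 * value can never move backwards, because the previous
 * soft_rt_next_start is one of the three candidates. */
static unsigned long softrt_next_start(unsigned long prev_next_start,
				       unsigned long last_idle_bklogged,
				       unsigned long service_sectors,
				       unsigned long max_softrt_rate,
				       unsigned long now_jiffies,
				       unsigned long slice_idle_jiffies)
{
	return max3u(prev_next_start,
		     last_idle_bklogged +
			HZ * service_sectors / max_softrt_rate,
		     now_jiffies + slice_idle_jiffies + 4);
}

int main(void)
{
	/* High-speed interval: lots of service in few jiffies pushes
	 * the bandwidth-based term far into the future. */
	unsigned long next = softrt_next_start(0, 1000, 200000, 100, 1010, 2);
	printf("after high-speed interval: %lu\n", next);

	/* Following slow-speed interval: the bandwidth-based and
	 * idle-based terms are small again, but the previous value
	 * still dominates, so the queue keeps being filtered out. */
	next = softrt_next_start(next, 1100, 500, 100, 1110, 2);
	printf("during slow-speed interval: %lu\n", next);
	return 0;
}

With these numbers, the high-speed interval pushes the bound to jiffy 501000, and the following slow-speed interval cannot pull it back down, so any I/O issued there arrives "too early" and the queue is not deemed soft real-time.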
+ +This commit fixes every occurrence of the old names of these methods +by changing them into the current names. + +Fixes: b01f1fa3bb19 (Port of "blk-mq-sched: unify request prepare methods") +Fixes: cc10d2d7d2c1 (Port of "blk-mq-sched: unify request finished methods") +Reviewed-by: Paolo Valente +Signed-off-by: Federico Motta +Signed-off-by: Chiara Bruschi +--- + block/bfq-mq-iosched.c | 38 +++++++++++++++++++------------------- + 1 file changed, 19 insertions(+), 19 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 4d06d900f45e..8f8d5eccb016 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -4018,20 +4018,20 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + /* + * TESTING: reset DISP_LIST flag, because: 1) + * this rq this request has passed through +- * get_rq_private, 2) then it will have +- * put_rq_private invoked on it, and 3) in +- * put_rq_private we use this flag to check +- * that put_rq_private is not invoked on +- * requests for which get_rq_private has been +- * invoked. ++ * bfq_prepare_request, 2) then it will have ++ * bfq_finish_request invoked on it, and 3) in ++ * bfq_finish_request we use this flag to check ++ * that bfq_finish_request is not invoked on ++ * requests for which bfq_prepare_request has ++ * been invoked. + */ + rq->rq_flags &= ~RQF_DISP_LIST; + goto inc_in_driver_start_rq; + } + + /* +- * We exploit the put_rq_private hook to decrement +- * rq_in_driver, but put_rq_private will not be ++ * We exploit the bfq_finish_request hook to decrement ++ * rq_in_driver, but bfq_finish_request will not be + * invoked on this request. So, to avoid unbalance, + * just start this request, without incrementing + * rq_in_driver. As a negative consequence, +@@ -4040,14 +4040,14 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + * bfq_schedule_dispatch to be invoked uselessly. + * + * As for implementing an exact solution, the +- * put_request hook, if defined, is probably invoked +- * also on this request. So, by exploiting this hook, +- * we could 1) increment rq_in_driver here, and 2) +- * decrement it in put_request. Such a solution would +- * let the value of the counter be always accurate, +- * but it would entail using an extra interface +- * function. This cost seems higher than the benefit, +- * being the frequency of non-elevator-private ++ * bfq_finish_request hook, if defined, is probably ++ * invoked also on this request. So, by exploiting ++ * this hook, we could 1) increment rq_in_driver here, ++ * and 2) decrement it in bfq_finish_request. Such a ++ * solution would let the value of the counter be ++ * always accurate, but it would entail using an extra ++ * interface function. This cost seems higher than the ++ * benefit, being the frequency of non-elevator-private + * requests very low. 
+ */ + goto start_rq; +@@ -4963,7 +4963,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) + } + } + +-static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) ++static void bfq_finish_request_body(struct bfq_queue *bfqq) + { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "put_request_body: allocated %d", bfqq->allocated); +@@ -5019,7 +5019,7 @@ static void bfq_finish_request(struct request *rq) + spin_lock_irqsave(&bfqd->lock, flags); + + bfq_completed_request(bfqq, bfqd); +- bfq_put_rq_priv_body(bfqq); ++ bfq_finish_request_body(bfqq); + + spin_unlock_irqrestore(&bfqd->lock, flags); + } else { +@@ -5042,7 +5042,7 @@ static void bfq_finish_request(struct request *rq) + bfqg_stats_update_io_remove(bfqq_group(bfqq), + rq->cmd_flags); + } +- bfq_put_rq_priv_body(bfqq); ++ bfq_finish_request_body(bfqq); + } + + rq->elv.priv[0] = NULL; + +From 4df19943c3a767df453abea3d2ac3433c3326ce0 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Thu, 16 Nov 2017 18:38:13 +0100 +Subject: [PATCH 04/23] block, bfq-sq, bfq-mq: add missing rq_pos_tree update + on rq removal + +If two processes do I/O close to each other, then BFQ merges the +bfq_queues associated with these processes, to get a more sequential +I/O, and thus a higher throughput. In this respect, to detect whether +two processes are doing I/O close to each other, BFQ keeps a list of +the head-of-line I/O requests of all active bfq_queues. The list is +ordered by initial sectors, and implemented through a red-black tree +(rq_pos_tree). + +Unfortunately, the update of the rq_pos_tree was incomplete, because +the tree was not updated on the removal of the head-of-line I/O +request of a bfq_queue, in case the queue did not remain empty. This +commit adds the missing update. + +Signed-off-by: Paolo Valente +Signed-off-by: Angelo Ruocco +--- + block/bfq-mq-iosched.c | 3 +++ + block/bfq-sq-iosched.c | 3 +++ + 2 files changed, 6 insertions(+) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 8f8d5eccb016..603191c9008f 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -1729,6 +1729,9 @@ static void bfq_remove_request(struct request_queue *q, + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } ++ } else { ++ BUG_ON(!bfqq->next_rq); ++ bfq_pos_tree_add_move(bfqd, bfqq); + } + + if (rq->cmd_flags & REQ_META) { +diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c +index 987dc255c82c..ea90ace79e49 100644 +--- a/block/bfq-sq-iosched.c ++++ b/block/bfq-sq-iosched.c +@@ -1669,6 +1669,9 @@ static void bfq_remove_request(struct request *rq) + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } ++ } else { ++ BUG_ON(!bfqq->next_rq); ++ bfq_pos_tree_add_move(bfqd, bfqq); + } + + if (rq->cmd_flags & REQ_META) { + +From b844e345140aaea957d84a21d2aa67588b020cd5 Mon Sep 17 00:00:00 2001 +From: Angelo Ruocco +Date: Mon, 18 Dec 2017 08:28:08 +0100 +Subject: [PATCH 05/23] block, bfq-sq, bfq-mq: check low_latency flag in + bfq_bfqq_save_state() + +A just-created bfq_queue will certainly be deemed as interactive on +the arrival of its first I/O request, if the low_latency flag is +set. Yet, if the queue is merged with another queue on the arrival of +its first I/O request, it will not have the chance to be flagged as +interactive. Nevertheless, if the queue is then split soon enough, it +has to be flagged as interactive after the split. 
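As detailed right below, the fix amounts to one extra conjunct in the condition under which the saved state records the queue as interactive; a condensed userspace sketch of that decision (the struct and helper are simplified stand-ins for the bfq flags involved, not the kernel code):

#include <stdio.h>
#include <stdbool.h>

/* Simplified stand-ins for the bfq flags involved in the fix. */
struct bfqq_sketch {
	bool just_created;	/* queue merged right after creation */
	bool in_large_burst;	/* part of a large burst of queues */
};

/* Returns true when the saved state must record the queue as
 * interactive (and thus weight-raised after a future split).
 * After the fix, low_latency being off vetoes the whole thing. */
static bool save_as_interactive(const struct bfqq_sketch *q, bool low_latency)
{
	return q->just_created && !q->in_large_burst && low_latency;
}

int main(void)
{
	struct bfqq_sketch q = { .just_created = true,
				 .in_large_burst = false };

	printf("low_latency on:  %d\n", save_as_interactive(&q, true));
	printf("low_latency off: %d\n", save_as_interactive(&q, false));
	return 0;
}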
+ +To handle this early-merge scenario correctly, BFQ saves the state of +the queue, on the merge, as if the latter had already been deemed +interactive. So, if the queue is split soon, it will get +weight-raised, because the previous state of the queue is resumed on +the split. + +Unfortunately, in the act of saving the state of the newly-created +queue, BFQ doesn't check whether the low_latency flag is set, and this +causes early-merged queues to then be weight-raised, on queue splits, +even if low_latency is off. This commit addresses this problem by +adding the missing check. + +Signed-off-by: Angelo Ruocco +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 3 ++- + block/bfq-sq-iosched.c | 3 ++- + 2 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 603191c9008f..ff9776c8836a 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -2231,7 +2231,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) + bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); + bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); + if (unlikely(bfq_bfqq_just_created(bfqq) && +- !bfq_bfqq_in_large_burst(bfqq))) { ++ !bfq_bfqq_in_large_burst(bfqq) && ++ bfqq->bfqd->low_latency)) { + /* + * bfqq being merged ritgh after being created: bfqq + * would have deserved interactive weight raising, but +diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c +index ea90ace79e49..3a2d764e760c 100644 +--- a/block/bfq-sq-iosched.c ++++ b/block/bfq-sq-iosched.c +@@ -2109,7 +2109,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) + bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); + bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); + if (unlikely(bfq_bfqq_just_created(bfqq) && +- !bfq_bfqq_in_large_burst(bfqq))) { ++ !bfq_bfqq_in_large_burst(bfqq) && ++ bfqq->bfqd->low_latency)) { + /* + * bfqq being merged ritgh after being created: bfqq + * would have deserved interactive weight raising, but + +From 4cc6896fe1de2e0b4de151a6e70658f10b9ec2fa Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Fri, 27 Oct 2017 11:12:14 +0200 +Subject: [PATCH 06/23] block, bfq-sq, bfq-mq: let a queue be merged only + shortly after starting I/O + +In BFQ and CFQ, two processes are said to be cooperating if they do +I/O in such a way that the union of their I/O requests yields a +sequential I/O pattern. To get such a sequential I/O pattern out of +the non-sequential pattern of each cooperating process, BFQ and CFQ +merge the queues associated with these processes. In more detail, +cooperating processes, and thus their associated queues, usually +start, or restart, to do I/O shortly after each other. This is the +case, e.g., for the I/O threads of KVM/QEMU and of the dump +utility. Based on this assumption, this commit allows a bfq_queue to +be merged only during a short time interval (100ms) after it starts, +or re-starts, to do I/O. This filtering provides two important +benefits. + +First, it greatly reduces the probability that two non-cooperating +processes have their queues merged by mistake, if they just happen to +do I/O close to each other for a short time interval. These spurious +merges cause loss of service guarantees. A low-weight bfq_queue may +unjustly get more than its expected share of the throughput: if such a +low-weight queue is merged with a high-weight queue, then the I/O for +the low-weight queue is served as if the queue had a high weight.
This +may damage other high-weight queues unexpectedly. For instance, +because of this issue, lxterminal occasionally took 7.5 seconds to +start, instead of 6.5 seconds, when some sequential readers and +writers did I/O in the background on a FUJITSU MHX2300BT HDD. The +reason is that the bfq_queues associated with some of the readers or +the writers were merged with the high-weight queues of some processes +that had to do some urgent but little I/O. The readers then exploited +the inherited high weight for all or most of their I/O, during the +start-up of terminal. The filtering introduced by this commit +eliminated any outlier caused by spurious queue merges in our start-up +time tests. + +This filtering also provides a little boost of the throughput +sustainable by BFQ: 3-4%, depending on the CPU. The reason is that, +once a bfq_queue cannot be merged any longer, this commit makes BFQ +stop updating the data needed to handle merging for the queue. + +Signed-off-by: Paolo Valente +Signed-off-by: Angelo Ruocco +--- + block/bfq-mq-iosched.c | 64 +++++++++++++++++++++++++++++++++++++++++--------- + block/bfq-mq.h | 1 + + block/bfq-sched.c | 4 ++++ + block/bfq-sq-iosched.c | 64 +++++++++++++++++++++++++++++++++++++++++--------- + block/bfq.h | 2 ++ + 5 files changed, 113 insertions(+), 22 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index ff9776c8836a..8b17b25a3c30 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -119,6 +119,20 @@ static const int bfq_async_charge_factor = 10; + /* Default timeout values, in jiffies, approximating CFQ defaults. */ + static const int bfq_timeout = (HZ / 8); + ++/* ++ * Time limit for merging (see comments in bfq_setup_cooperator). Set ++ * to the slowest value that, in our tests, proved to be effective in ++ * removing false positives, while not causing true positives to miss ++ * queue merging. ++ * ++ * As can be deduced from the low time limit below, queue merging, if ++ * successful, happens at the very beggining of the I/O of the involved ++ * cooperating processes, as a consequence of the arrival of the very ++ * first requests from each cooperator. After that, there is very ++ * little chance to find cooperators. ++ */ ++static const unsigned long bfq_merge_time_limit = HZ/10; ++ + static struct kmem_cache *bfq_pool; + + /* Below this threshold (in ns), we consider thinktime immediate. */ +@@ -389,6 +403,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, + return bfqq; + } + ++static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) ++{ ++ return bfqq->service_from_backlogged > 0 && ++ time_is_before_jiffies(bfqq->first_IO_time + ++ bfq_merge_time_limit); ++} ++ + static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) + { + struct rb_node **p, *parent; +@@ -399,6 +420,14 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) + bfqq->pos_root = NULL; + } + ++ /* ++ * bfqq cannot be merged any longer (see comments in ++ * bfq_setup_cooperator): no point in adding bfqq into the ++ * position tree. 
++ */ ++ if (bfq_too_late_for_merging(bfqq)) ++ return; ++ + if (bfq_class_idle(bfqq)) + return; + if (!bfqq->next_rq) +@@ -2081,6 +2110,13 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) + static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, + struct bfq_queue *new_bfqq) + { ++ if (bfq_too_late_for_merging(new_bfqq)) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "[%s] too late for bfq%d to be merged", ++ __func__, new_bfqq->pid); ++ return false; ++ } ++ + if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || + (bfqq->ioprio_class != new_bfqq->ioprio_class)) + return false; +@@ -2149,6 +2185,23 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + { + struct bfq_queue *in_service_bfqq, *new_bfqq; + ++ /* ++ * Prevent bfqq from being merged if it has been created too ++ * long ago. The idea is that true cooperating processes, and ++ * thus their associated bfq_queues, are supposed to be ++ * created shortly after each other. This is the case, e.g., ++ * for KVM/QEMU and dump I/O threads. Basing on this ++ * assumption, the following filtering greatly reduces the ++ * probability that two non-cooperating processes, which just ++ * happen to do close I/O for some short time interval, have ++ * their queues merged by mistake. ++ */ ++ if (bfq_too_late_for_merging(bfqq)) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "would have looked for coop, but too late"); ++ return NULL; ++ } ++ + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + +@@ -3338,17 +3391,6 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, + */ + slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); + +- /* +- * Increase service_from_backlogged before next statement, +- * because the possible next invocation of +- * bfq_bfqq_charge_time would likely inflate +- * entity->service. In contrast, service_from_backlogged must +- * contain real service, to enable the soft real-time +- * heuristic to correctly compute the bandwidth consumed by +- * bfqq. +- */ +- bfqq->service_from_backlogged += entity->service; +- + /* + * As above explained, charge slow (typically seeky) and + * timed-out queues with the time and not the service +diff --git a/block/bfq-mq.h b/block/bfq-mq.h +index 1cb05bb853d2..a5947b203ef2 100644 +--- a/block/bfq-mq.h ++++ b/block/bfq-mq.h +@@ -337,6 +337,7 @@ struct bfq_queue { + unsigned long wr_start_at_switch_to_srt; + + unsigned long split_time; /* time of last split */ ++ unsigned long first_IO_time; /* time of first I/O for this queue */ + }; + + /** +diff --git a/block/bfq-sched.c b/block/bfq-sched.c +index 616c0692335a..9d261dd428e4 100644 +--- a/block/bfq-sched.c ++++ b/block/bfq-sched.c +@@ -939,6 +939,10 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) + struct bfq_entity *entity = &bfqq->entity; + struct bfq_service_tree *st; + ++ if (!bfqq->service_from_backlogged) ++ bfqq->first_IO_time = jiffies; ++ ++ bfqq->service_from_backlogged += served; + for_each_entity(entity) { + st = bfq_entity_service_tree(entity); + +diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c +index 3a2d764e760c..cd00a41ca35d 100644 +--- a/block/bfq-sq-iosched.c ++++ b/block/bfq-sq-iosched.c +@@ -113,6 +113,20 @@ static const int bfq_async_charge_factor = 10; + /* Default timeout values, in jiffies, approximating CFQ defaults. */ + static const int bfq_timeout = (HZ / 8); + ++/* ++ * Time limit for merging (see comments in bfq_setup_cooperator). 
Set ++ * to the slowest value that, in our tests, proved to be effective in ++ * removing false positives, while not causing true positives to miss ++ * queue merging. ++ * ++ * As can be deduced from the low time limit below, queue merging, if ++ * successful, happens at the very beggining of the I/O of the involved ++ * cooperating processes, as a consequence of the arrival of the very ++ * first requests from each cooperator. After that, there is very ++ * little chance to find cooperators. ++ */ ++static const unsigned long bfq_merge_time_limit = HZ/10; ++ + static struct kmem_cache *bfq_pool; + + /* Below this threshold (in ns), we consider thinktime immediate. */ +@@ -351,6 +365,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, + return bfqq; + } + ++static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) ++{ ++ return bfqq->service_from_backlogged > 0 && ++ time_is_before_jiffies(bfqq->first_IO_time + ++ bfq_merge_time_limit); ++} ++ + static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) + { + struct rb_node **p, *parent; +@@ -361,6 +382,14 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) + bfqq->pos_root = NULL; + } + ++ /* ++ * bfqq cannot be merged any longer (see comments in ++ * bfq_setup_cooperator): no point in adding bfqq into the ++ * position tree. ++ */ ++ if (bfq_too_late_for_merging(bfqq)) ++ return; ++ + if (bfq_class_idle(bfqq)) + return; + if (!bfqq->next_rq) +@@ -1960,6 +1989,13 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) + static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, + struct bfq_queue *new_bfqq) + { ++ if (bfq_too_late_for_merging(new_bfqq)) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "[%s] too late for bfq%d to be merged", ++ __func__, new_bfqq->pid); ++ return false; ++ } ++ + if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || + (bfqq->ioprio_class != new_bfqq->ioprio_class)) + return false; +@@ -2028,6 +2064,23 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + { + struct bfq_queue *in_service_bfqq, *new_bfqq; + ++ /* ++ * Prevent bfqq from being merged if it has been created too ++ * long ago. The idea is that true cooperating processes, and ++ * thus their associated bfq_queues, are supposed to be ++ * created shortly after each other. This is the case, e.g., ++ * for KVM/QEMU and dump I/O threads. Basing on this ++ * assumption, the following filtering greatly reduces the ++ * probability that two non-cooperating processes, which just ++ * happen to do close I/O for some short time interval, have ++ * their queues merged by mistake. ++ */ ++ if (bfq_too_late_for_merging(bfqq)) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "would have looked for coop, but too late"); ++ return NULL; ++ } ++ + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + +@@ -3226,17 +3279,6 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, + */ + slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); + +- /* +- * Increase service_from_backlogged before next statement, +- * because the possible next invocation of +- * bfq_bfqq_charge_time would likely inflate +- * entity->service. In contrast, service_from_backlogged must +- * contain real service, to enable the soft real-time +- * heuristic to correctly compute the bandwidth consumed by +- * bfqq. 
+- */ +- bfqq->service_from_backlogged += entity->service; +- + /* + * As above explained, charge slow (typically seeky) and + * timed-out queues with the time and not the service +diff --git a/block/bfq.h b/block/bfq.h +index 47cd4d5a8c32..59539adc00a5 100644 +--- a/block/bfq.h ++++ b/block/bfq.h +@@ -329,6 +329,8 @@ struct bfq_queue { + unsigned long wr_start_at_switch_to_srt; + + unsigned long split_time; /* time of last split */ ++ ++ unsigned long first_IO_time; /* time of first I/O for this queue */ + }; + + /** + +From 157f39c43ab182280634cd4f6335d0187b3741a0 Mon Sep 17 00:00:00 2001 +From: Angelo Ruocco +Date: Mon, 11 Dec 2017 14:19:54 +0100 +Subject: [PATCH 07/23] block, bfq-sq, bfq-mq: remove superfluous check in + queue-merging setup + +When two or more processes do I/O in a way that their requests are +sequential with respect to one another, BFQ merges the bfq_queues +associated with the processes. This way the overall I/O pattern becomes +sequential, and thus there is a boost in throughput. +These cooperating processes usually start or restart to do I/O shortly +after each other. So, in order to avoid merging non-cooperating processes, +BFQ ensures that none of these queues has been in weight raising for too +long. + +In this respect, from commit "block, bfq-sq, bfq-mq: let a queue be merged +only shortly after being created", BFQ checks whether any queue (and not +only weight-raised ones) has been doing I/O continuously for too long to +be merged. + +This new additional check makes the first one useless: a queue doing +I/O for long enough, if being weight-raised, is also a queue in +weight raising for too long to be merged. Accordingly, this commit +removes the first check. + +Signed-off-by: Angelo Ruocco +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 53 ++++---------------------------------------------- + block/bfq-sq-iosched.c | 53 ++++---------------------------------------------- + 2 files changed, 8 insertions(+), 98 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 8b17b25a3c30..f5db8613a70f 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -2140,20 +2140,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, + return true; + } + +-/* +- * If this function returns true, then bfqq cannot be merged. The idea +- * is that true cooperation happens very early after processes start +- * to do I/O. Usually, late cooperations are just accidental false +- * positives. In case bfqq is weight-raised, such false positives +- * would evidently degrade latency guarantees for bfqq.
+- * + * WARNING: queue merging may impair fairness among non-weight raised + * queues, for at least two reasons: 1) the original weight of a + * merged queue may change during the merged state, 2) even being the +@@ -2205,15 +2186,7 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + +- if (io_struct && wr_from_too_long(bfqq) && +- likely(bfqq != &bfqd->oom_bfqq)) +- bfq_log_bfqq(bfqd, bfqq, +- "would have looked for coop, but bfq%d wr", +- bfqq->pid); +- +- if (!io_struct || +- wr_from_too_long(bfqq) || +- unlikely(bfqq == &bfqd->oom_bfqq)) ++ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) + return NULL; + + /* If there is only one backlogged queue, don't search. */ +@@ -2223,17 +2196,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + in_service_bfqq = bfqd->in_service_queue; + + if (in_service_bfqq && in_service_bfqq != bfqq && +- wr_from_too_long(in_service_bfqq) +- && likely(in_service_bfqq == &bfqd->oom_bfqq)) +- bfq_log_bfqq(bfqd, bfqq, +- "would have tried merge with in-service-queue, but wr"); +- +- if (!in_service_bfqq || in_service_bfqq == bfqq +- || wr_from_too_long(in_service_bfqq) || +- unlikely(in_service_bfqq == &bfqd->oom_bfqq)) +- goto check_scheduled; +- +- if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && ++ likely(in_service_bfqq != &bfqd->oom_bfqq) && ++ bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && + bfqq->entity.parent == in_service_bfqq->entity.parent && + bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { + new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); +@@ -2245,21 +2209,12 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + * queues. The only thing we need is that the bio/request is not + * NULL, as we need it to establish whether a cooperator exists. + */ +-check_scheduled: + new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, + bfq_io_struct_pos(io_struct, request)); + + BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); + +- if (new_bfqq && wr_from_too_long(new_bfqq) && +- likely(new_bfqq != &bfqd->oom_bfqq) && +- bfq_may_be_close_cooperator(bfqq, new_bfqq)) +- bfq_log_bfqq(bfqd, bfqq, +- "would have merged with bfq%d, but wr", +- new_bfqq->pid); +- +- if (new_bfqq && !wr_from_too_long(new_bfqq) && +- likely(new_bfqq != &bfqd->oom_bfqq) && ++ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && + bfq_may_be_close_cooperator(bfqq, new_bfqq)) + return bfq_setup_merge(bfqq, new_bfqq); + +diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c +index cd00a41ca35d..d8a358e5e284 100644 +--- a/block/bfq-sq-iosched.c ++++ b/block/bfq-sq-iosched.c +@@ -2019,20 +2019,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, + return true; + } + +-/* +- * If this function returns true, then bfqq cannot be merged. The idea +- * is that true cooperation happens very early after processes start +- * to do I/O. Usually, late cooperations are just accidental false +- * positives. In case bfqq is weight-raised, such false positives +- * would evidently degrade latency guarantees for bfqq. +- */ +-static bool wr_from_too_long(struct bfq_queue *bfqq) +-{ +- return bfqq->wr_coeff > 1 && +- time_is_before_jiffies(bfqq->last_wr_start_finish + +- msecs_to_jiffies(100)); +-} +- + /* + * Attempt to schedule a merge of bfqq with the currently in-service + * queue or with a close queue among the scheduled queues. 
Return +@@ -2046,11 +2032,6 @@ static bool wr_from_too_long(struct bfq_queue *bfqq) + * to maintain. Besides, in such a critical condition as an out of memory, + * the benefits of queue merging may be little relevant, or even negligible. + * +- * Weight-raised queues can be merged only if their weight-raising +- * period has just started. In fact cooperating processes are usually +- * started together. Thus, with this filter we avoid false positives +- * that would jeopardize low-latency guarantees. +- * + * WARNING: queue merging may impair fairness among non-weight raised + * queues, for at least two reasons: 1) the original weight of a + * merged queue may change during the merged state, 2) even being the +@@ -2084,15 +2065,7 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + +- if (io_struct && wr_from_too_long(bfqq) && +- likely(bfqq != &bfqd->oom_bfqq)) +- bfq_log_bfqq(bfqd, bfqq, +- "would have looked for coop, but bfq%d wr", +- bfqq->pid); +- +- if (!io_struct || +- wr_from_too_long(bfqq) || +- unlikely(bfqq == &bfqd->oom_bfqq)) ++ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) + return NULL; + + /* If there is only one backlogged queue, don't search. */ +@@ -2102,17 +2075,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + in_service_bfqq = bfqd->in_service_queue; + + if (in_service_bfqq && in_service_bfqq != bfqq && +- bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) +- && likely(in_service_bfqq == &bfqd->oom_bfqq)) +- bfq_log_bfqq(bfqd, bfqq, +- "would have tried merge with in-service-queue, but wr"); +- +- if (!in_service_bfqq || in_service_bfqq == bfqq || +- !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || +- unlikely(in_service_bfqq == &bfqd->oom_bfqq)) +- goto check_scheduled; +- +- if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && ++ likely(in_service_bfqq != &bfqd->oom_bfqq) && ++ bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && + bfqq->entity.parent == in_service_bfqq->entity.parent && + bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { + new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); +@@ -2124,21 +2088,12 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + * queues. The only thing we need is that the bio/request is not + * NULL, as we need it to establish whether a cooperator exists. 
+ */ +-check_scheduled: + new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, + bfq_io_struct_pos(io_struct, request)); + + BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); + +- if (new_bfqq && wr_from_too_long(new_bfqq) && +- likely(new_bfqq != &bfqd->oom_bfqq) && +- bfq_may_be_close_cooperator(bfqq, new_bfqq)) +- bfq_log_bfqq(bfqd, bfqq, +- "would have merged with bfq%d, but wr", +- new_bfqq->pid); +- +- if (new_bfqq && !wr_from_too_long(new_bfqq) && +- likely(new_bfqq != &bfqd->oom_bfqq) && ++ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && + bfq_may_be_close_cooperator(bfqq, new_bfqq)) + return bfq_setup_merge(bfqq, new_bfqq); + + +From b82eb91d87f172aba7eb5eb98e8d5e2a621adf51 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Thu, 30 Nov 2017 17:48:28 +0100 +Subject: [PATCH 08/23] block, bfq-sq, bfq-mq: increase threshold to deem I/O + as random + +If two processes do I/O close to each other, i.e., are cooperating +processes in BFQ (and CFQ's) nomenclature, then BFQ merges their +associated bfq_queues, so as to get sequential I/O from the union of +the I/O requests of the processes, and thus reach a higher +throughput. A merged queue is then split if its I/O stops being +sequential. In this respect, BFQ deems the I/O of a bfq_queue as +(mostly) sequential only if less than 4 I/O requests are random, out +of the last 32 requests inserted into the queue. + +Unfortunately, extensive testing (with the interleaved_io benchmark of +the S suite [1], and with real applications spawning cooperating +processes) has clearly shown that, with such a low threshold, only a +rather low I/O throughput may be reached when several cooperating +processes do I/O. In particular, the outcome of each test run was +bimodal: if queue merging occurred and was stable during the test, +then the throughput was close to the peak rate of the storage device, +otherwise the throughput was arbitrarily low (usually around 1/10 of +the peak rate with a rotational device). The probability of getting the +unlucky outcomes grew with the number of cooperating processes: it was +already significant with 5 processes, and close to one with 7 or more +processes. + +The cause of the low throughput in the unlucky runs was that the +merged queues containing the I/O of these cooperating processes were +soon split, because they contained more random I/O requests than those +tolerated by the 4/32 threshold, but +- that I/O would nevertheless have allowed the storage device to reach + peak throughput or almost peak throughput; +- in contrast, the I/O of these processes, if served individually + (from separate queues), yielded a rather low throughput. + +So we repeated our tests with increasing values of the threshold, +until we found the minimum value (19) for which we obtained maximum +throughput, reliably, with as many as 9 cooperating +processes. Then we checked that the use of that higher threshold value +did not cause any regression for any other benchmark in the suite [1]. +This commit raises the threshold to such a higher value.
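In code terms, the new test amounts to "more than 19 of the last 32 requests were seeky", a population count over a 32-bit window. A minimal userspace rendering (the one-bit-per-request window update below is a deliberate simplification of the kernel's seek-history bookkeeping, kept only to show the popcount threshold):

#include <stdio.h>
#include <stdint.h>

/* One bit per request: 1 = the request was seeky w.r.t. the previous
 * one. The kernel keeps this in bfqq->seek_history and, after this
 * patch, deems the queue seeky when more than 19 of the last 32 bits
 * are set (it was more than 32/8 = 4 before). */
static int bfqq_seeky(uint32_t seek_history)
{
	return __builtin_popcount(seek_history) > 19;
}

/* Shift a new verdict into the window (simplified update). */
static uint32_t push_verdict(uint32_t history, int seeky)
{
	return (history << 1) | (seeky ? 1 : 0);
}

int main(void)
{
	uint32_t h = 0;
	int i;

	/* 16 random requests out of the last 32: tolerated now,
	 * "seeky" with the old 4/32 threshold. */
	for (i = 0; i < 32; i++)
		h = push_verdict(h, i % 2);
	printf("16/32 random -> seeky=%d\n", bfqq_seeky(h));

	/* 24 random requests out of 32: beyond the new threshold too. */
	for (i = 0; i < 32; i++)
		h = push_verdict(h, i % 4 != 0);
	printf("24/32 random -> seeky=%d\n", bfqq_seeky(h));
	return 0;
}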
+ +[1] https://github.com/Algodev-github/S + +Signed-off-by: Angelo Ruocco +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 2 +- + block/bfq-sq-iosched.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index f5db8613a70f..cb5f49ddecb6 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -145,7 +145,7 @@ static struct kmem_cache *bfq_pool; + #define BFQQ_SEEK_THR (sector_t)(8 * 100) + #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) + #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) +-#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) ++#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) + + /* Min number of samples required to perform peak-rate update */ + #define BFQ_RATE_MIN_SAMPLES 32 +diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c +index d8a358e5e284..e1c6dc651be1 100644 +--- a/block/bfq-sq-iosched.c ++++ b/block/bfq-sq-iosched.c +@@ -139,7 +139,7 @@ static struct kmem_cache *bfq_pool; + #define BFQQ_SEEK_THR (sector_t)(8 * 100) + #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) + #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) +-#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) ++#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) + + /* Min number of samples required to perform peak-rate update */ + #define BFQ_RATE_MIN_SAMPLES 32 + +From b739dda4e4b3a1cbbc905f86f9fbb0860b068ce7 Mon Sep 17 00:00:00 2001 +From: Chiara Bruschi +Date: Mon, 11 Dec 2017 18:55:26 +0100 +Subject: [PATCH 09/23] block, bfq-sq, bfq-mq: specify usage condition of + delta_us in bfq_log_bfqq call + +Inside the function bfq_completed_request the value of a variable +called delta_us is computed as current request completion time. +delta_us is used inside a call to the function bfq_log_bfqq as divisor +in a division operation to compute a rate value, but no check makes +sure that delta_us has non-zero value. A divisor with value 0 leads +to a division error that could result in a kernel oops (therefore +unstable/unreliable system state) and consequently cause kernel panic +if resources are unavailable after the system fault. + +This commit fixes this call to bfq_log_bfqq specifying the condition +that allows delta_us to be safely used as divisor. + +Signed-off-by: Paolo Valente +Signed-off-by: Chiara Bruschi +--- + block/bfq-mq-iosched.c | 5 ++++- + block/bfq-sq-iosched.c | 5 ++++- + 2 files changed, 8 insertions(+), 2 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index cb5f49ddecb6..6ce2c0789046 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -4904,9 +4904,12 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) + bfq_log_bfqq(bfqd, bfqq, + "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", + delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, ++ delta_us > 0 ? 
+ (USEC_PER_SEC* + (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us)) +- >>BFQ_RATE_SHIFT, ++ >>BFQ_RATE_SHIFT : ++ (USEC_PER_SEC* ++ (u64)(bfqd->last_rq_max_size<<BFQ_RATE_SHIFT))>>BFQ_RATE_SHIFT, + (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); + + /* +diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c +index e1c6dc651be1..eff4c4edf5a0 100644 +--- a/block/bfq-sq-iosched.c ++++ b/block/bfq-sq-iosched.c +@@ -4565,9 +4565,12 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) + + bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", + delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, ++ delta_us > 0 ? + (USEC_PER_SEC* + (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us)) +- >>BFQ_RATE_SHIFT, ++ >>BFQ_RATE_SHIFT : ++ (USEC_PER_SEC* ++ (u64)(bfqd->last_rq_max_size<<BFQ_RATE_SHIFT))>>BFQ_RATE_SHIFT, + (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); + + /* + +From ae4310c13eca762644734d53074d8456c85e2dec Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Tue, 19 Dec 2017 12:07:12 +0100 +Subject: [PATCH 10/23] block, bfq-mq: limit tags for writes and async I/O + +Asynchronous I/O can easily starve synchronous I/O (both sync reads +and sync writes), by consuming all request tags. Similarly, storms of +synchronous writes, such as those that sync(2) may trigger, can starve +synchronous reads. In turn, these two problems may also cause +BFQ to lose control of latency for interactive and soft real-time +applications. For example, on a PLEXTOR PX-256M5S SSD, LibreOffice +Writer takes 0.6 seconds to start if the device is idle, but it takes +more than 45 seconds (!) if there are sequential writes in the +background. + +This commit addresses this issue by limiting the maximum percentage of +tags that asynchronous I/O requests and synchronous write requests can +consume. In particular, this commit grants a higher threshold to +synchronous writes, to prevent the latter from being starved by +asynchronous I/O. + +According to the above test, LibreOffice Writer now starts in about +1.2 seconds on average, regardless of the background workload, and +apart from some rare outliers. To check this improvement, run, e.g., +sudo ./comm_startup_lat.sh bfq-mq 5 5 seq 10 "lowriter --terminate_after_init" +for the comm_startup_lat benchmark in the S suite [1]. + +[1] https://github.com/Algodev-github/S + +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++ + block/bfq-mq.h | 12 ++++++++ + 2 files changed, 89 insertions(+) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 6ce2c0789046..f384f5566672 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -362,6 +362,82 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, + } + } + ++/* ++ * See the comments on bfq_limit_depth for the purpose of ++ * the depths set in the function. ++ */ ++static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) ++{ ++ bfqd->sb_shift = bt->sb.shift; ++ ++ /* ++ * In-word depths if no bfq_queue is being weight-raised: ++ * leaving 25% of tags only for sync reads. ++ * ++ * In next formulas, right-shift the value ++ * (1U<<bfqd->sb_shift), instead of computing directly ++ * (1U<<(bfqd->sb_shift - something)), to be robust against ++ * any possible value of bfqd->sb_shift, without having to ++ * limit 'something'.
++ */ ++ /* no more than 50% of tags for async I/O */ ++ bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U); ++ /* ++ * no more than 75% of tags for sync writes (25% extra tags ++ * w.r.t. async I/O, to prevent async I/O from starving sync ++ * writes) ++ */ ++ bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U); ++ ++ /* ++ * In-word depths in case some bfq_queue is being weight- ++ * raised: leaving ~63% of tags for sync reads. This is the ++ * highest percentage for which, in our tests, application ++ * start-up times didn't suffer from any regression due to tag ++ * shortage. ++ */ ++ /* no more than ~18% of tags for async I/O */ ++ bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U); ++ /* no more than ~37% of tags for sync writes (~20% extra tags) */ ++ bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U); ++} ++ ++/* ++ * Async I/O can easily starve sync I/O (both sync reads and sync ++ * writes), by consuming all tags. Similarly, storms of sync writes, ++ * such as those that sync(2) may trigger, can starve sync reads. ++ * Limit depths of async I/O and sync writes so as to counter both ++ * problems. ++ */ ++static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) ++{ ++ struct blk_mq_tags *tags = blk_mq_tags_from_data(data); ++ struct bfq_data *bfqd = data->q->elevator->elevator_data; ++ struct sbitmap_queue *bt; ++ ++ if (op_is_sync(op) && !op_is_write(op)) ++ return; ++ ++ if (data->flags & BLK_MQ_REQ_RESERVED) { ++ if (unlikely(!tags->nr_reserved_tags)) { ++ WARN_ON_ONCE(1); ++ return; ++ } ++ bt = &tags->breserved_tags; ++ } else ++ bt = &tags->bitmap_tags; ++ ++ if (unlikely(bfqd->sb_shift != bt->sb.shift)) ++ bfq_update_depths(bfqd, bt); ++ ++ data->shallow_depth = ++ bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; ++ ++ bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", ++ __func__, bfqd->wr_busy_queues, op_is_sync(op), ++ data->shallow_depth); ++} ++ + static struct bfq_queue * + bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, + sector_t sector, struct rb_node **ret_parent, +@@ -5812,6 +5888,7 @@ static struct elv_fs_entry bfq_attrs[] = { + + static struct elevator_type iosched_bfq_mq = { + .ops.mq = { ++ .limit_depth = bfq_limit_depth, + .prepare_request = bfq_prepare_request, + .finish_request = bfq_finish_request, + .exit_icq = bfq_exit_icq, +diff --git a/block/bfq-mq.h b/block/bfq-mq.h +index a5947b203ef2..458099ee0308 100644 +--- a/block/bfq-mq.h ++++ b/block/bfq-mq.h +@@ -619,6 +619,18 @@ struct bfq_data { + struct bfq_queue *bio_bfqq; + /* Extra flag used only for TESTING */ + bool bio_bfqq_set; ++ ++ /* ++ * Cached sbitmap shift, used to compute depth limits in ++ * bfq_update_depths. ++ */ ++ unsigned int sb_shift; ++ ++ /* ++ * Depth limits used in bfq_limit_depth (see comments on the ++ * function) ++ */ ++ unsigned int word_depths[2][2]; + }; + + enum bfqq_state_flags { + +From 402e5f6b59662d290ab2b3c10b0016207a63ad21 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Thu, 21 Dec 2017 15:51:39 +0100 +Subject: [PATCH 11/23] bfq-sq, bfq-mq: limit sectors served with interactive + weight raising + +To maximise responsiveness, BFQ raises the weight, and performs device +idling, for bfq_queues associated with processes deemed as +interactive. In particular, weight raising has a maximum duration, +equal to the time needed to start a large application. If a +weight-raised process goes on doing I/O beyond this maximum duration, +it loses weight-raising.
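For the tag-limiting patch above, the word_depths arithmetic can be checked in isolation; a userspace sketch of the same computation (the shift value 6, i.e. 64 tags per sbitmap word, is an assumed example, and the ternaries stand in for the kernel's max(x, 1U)):

#include <stdio.h>

/* Reproduces the arithmetic of bfq_update_depths() for a given
 * sbitmap word shift. d[wr][sync]:
 *   [0][0] async I/O, no weight-raised queue:  50% of tags
 *   [0][1] sync writes, no weight-raised queue: 75%
 *   [1][0] async I/O, weight-raising active:   ~18%
 *   [1][1] sync writes, weight-raising active: ~37%
 * Sync reads are never limited. */
static void compute_depths(unsigned int sb_shift, unsigned int d[2][2])
{
	unsigned int word = 1U << sb_shift;

	d[0][0] = word >> 1 ? word >> 1 : 1U;
	d[0][1] = (word * 3) >> 2 ? (word * 3) >> 2 : 1U;
	d[1][0] = (word * 3) >> 4 ? (word * 3) >> 4 : 1U;
	d[1][1] = (word * 6) >> 4 ? (word * 6) >> 4 : 1U;
}

int main(void)
{
	unsigned int d[2][2];

	compute_depths(6, d);	/* 64 tags per word: assumed example */
	printf("no wr: async %u, sync writes %u\n", d[0][0], d[0][1]);
	printf("wr:    async %u, sync writes %u\n", d[1][0], d[1][1]);
	return 0;
}

For 64 tags per word this prints 32/48 with no weight-raised queue and 12/24 with one, i.e. the 50%, 75%, ~18% and ~37% fractions stated in the comments above.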
+ +This mechanism is evidently vulnerable to the following false +positives: I/O-bound applications that will go on doing I/O for much +longer than the duration of weight-raising. These applications have +basically no benefit from being weight-raised at the beginning of +their I/O. On the opposite end, while being weight-raised, these +applications +a) unjustly steal throughput from applications that may truly need +low latency; +b) make BFQ uselessly perform device idling; device idling results +in loss of device throughput with most flash-based storage, and may +increase latencies when used purposelessly. + +This commit adds a countermeasure to reduce both the above +problems. To introduce this countermeasure, we provide the following +extra piece of information (full details in the comments added by this +commit). During the start-up of the large application used as a +reference to set the duration of weight-raising, involved processes +transfer at most ~110K sectors each. Accordingly, a process initially +deemed as interactive has no right to be weight-raised any longer, +once it has transferred 110K sectors or more. + +Based on this consideration, this commit ends weight-raising early +for a bfq_queue if the latter happens to have received an amount of +service at least equal to 110K sectors (actually, a little bit more, +to keep a safety margin). I/O-bound applications that reach a high +throughput, such as file copy, get to this threshold well before the +allowed weight-raising period finishes. Thus this early ending of +weight-raising reduces the amount of time during which these +applications cause the problems described above. + +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 84 ++++++++++++++++++++++++++++++++++++++++++++------ + block/bfq-mq.h | 5 +++ + block/bfq-sched.c | 3 ++ + block/bfq-sq-iosched.c | 84 ++++++++++++++++++++++++++++++++++++++++++++------ + block/bfq.h | 5 +++ + 5 files changed, 163 insertions(+), 18 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index f384f5566672..63fdd16dec3c 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -162,15 +162,17 @@ static struct kmem_cache *bfq_pool; + * interactive applications automatically, using the following formula: + * duration = (R / r) * T, where r is the peak rate of the device, and + * R and T are two reference parameters. +- * In particular, R is the peak rate of the reference device (see below), +- * and T is a reference time: given the systems that are likely to be +- * installed on the reference device according to its speed class, T is +- * about the maximum time needed, under BFQ and while reading two files in +- * parallel, to load typical large applications on these systems. +- * In practice, the slower/faster the device at hand is, the more/less it +- * takes to load applications with respect to the reference device. +- * Accordingly, the longer/shorter BFQ grants weight raising to interactive +- * applications. ++ * In particular, R is the peak rate of the reference device (see ++ * below), and T is a reference time: given the systems that are ++ * likely to be installed on the reference device according to its ++ * speed class, T is about the maximum time needed, under BFQ and ++ * while reading two files in parallel, to load typical large ++ * applications on these systems (see the comments on ++ * max_service_from_wr below, for more details on how T is obtained).
++ * In practice, the slower/faster the device at hand is, the more/less
++ * it takes to load applications with respect to the reference device.
++ * Accordingly, the longer/shorter BFQ grants weight raising to
++ * interactive applications.
+ *
+ * BFQ uses four different reference pairs (R, T), depending on:
+ * . whether the device is rotational or non-rotational;
+@@ -207,6 +209,60 @@ static int T_slow[2];
+ static int T_fast[2];
+ static int device_speed_thresh[2];
+
++/*
++ * BFQ uses the above-detailed, time-based weight-raising mechanism to
++ * privilege interactive tasks. This mechanism is vulnerable to the
++ * following false positives: I/O-bound applications that will go on
++ * doing I/O for much longer than the duration of weight
++ * raising. These applications have basically no benefit from being
++ * weight-raised at the beginning of their I/O. On the opposite end,
++ * while being weight-raised, these applications
++ * a) unjustly steal throughput from applications that may actually need
++ * low latency;
++ * b) make BFQ uselessly perform device idling; device idling results
++ * in loss of device throughput with most flash-based storage, and may
++ * increase latencies when used purposelessly.
++ *
++ * BFQ tries to reduce these problems, by adopting the following
++ * countermeasure. To introduce this countermeasure, we need first to
++ * finish explaining how the duration of weight-raising for
++ * interactive tasks is computed.
++ *
++ * For a bfq_queue deemed as interactive, the duration of weight
++ * raising is dynamically adjusted, as a function of the estimated
++ * peak rate of the device, so as to be equal to the time needed to
++ * execute the 'largest' interactive task we benchmarked so far. By
++ * largest task, we mean the task for which each involved process has
++ * to do more I/O than for any of the other tasks we benchmarked. This
++ * reference interactive task is the start-up of LibreOffice Writer,
++ * and in this task each process/bfq_queue needs to have at most ~110K
++ * sectors transferred.
++ *
++ * This last piece of information enables BFQ to reduce the actual
++ * duration of weight-raising for at least one class of I/O-bound
++ * applications: those doing sequential or quasi-sequential I/O. An
++ * example is file copy. In fact, once started, the main I/O-bound
++ * processes of these applications usually consume the above 110K
++ * sectors in much less time than the processes of an application that
++ * is starting, because these I/O-bound processes will greedily devote
++ * almost all their CPU cycles only to their target,
++ * throughput-friendly I/O operations. This is even more true if BFQ
++ * happens to be underestimating the device peak rate, and thus
++ * overestimating the duration of weight raising. But, according to
++ * our measurements, once they have transferred 110K sectors, these
++ * processes have no right to be weight-raised any longer.
++ *
++ * Based on the last consideration, BFQ ends weight-raising for a
++ * bfq_queue if the latter happens to have received an amount of
++ * service at least equal to the following constant. The constant is
++ * set to slightly more than 110K, to have a minimum safety margin.
++ *
++ * This early ending of weight-raising reduces the amount of time
++ * during which interactive false positives cause the two problems
++ * described at the beginning of these comments.
++ */ ++static const unsigned long max_service_from_wr = 120000; ++ + #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ + { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) + +@@ -1361,6 +1417,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, + if (old_wr_coeff == 1 && wr_or_deserves_wr) { + /* start a weight-raising period */ + if (interactive) { ++ bfqq->service_from_wr = 0; + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + } else { +@@ -3980,6 +4037,15 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) + "back to interactive wr"); + } + } ++ if (bfqq->wr_coeff > 1 && ++ bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time && ++ bfqq->service_from_wr > max_service_from_wr) { ++ /* see comments on max_service_from_wr */ ++ bfq_bfqq_end_wr(bfqq); ++ bfq_log_bfqq(bfqd, bfqq, ++ "[%s] too much service", ++ __func__); ++ } + } + /* + * To improve latency (for this or other queues), immediately +diff --git a/block/bfq-mq.h b/block/bfq-mq.h +index 458099ee0308..9a5ce1168ff5 100644 +--- a/block/bfq-mq.h ++++ b/block/bfq-mq.h +@@ -331,6 +331,11 @@ struct bfq_queue { + * last transition from idle to backlogged. + */ + unsigned long service_from_backlogged; ++ /* ++ * Cumulative service received from the @bfq_queue since its ++ * last transition to weight-raised state. ++ */ ++ unsigned long service_from_wr; + /* + * Value of wr start time when switching to soft rt + */ +diff --git a/block/bfq-sched.c b/block/bfq-sched.c +index 9d261dd428e4..4e6c5232e2fb 100644 +--- a/block/bfq-sched.c ++++ b/block/bfq-sched.c +@@ -942,6 +942,9 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) + if (!bfqq->service_from_backlogged) + bfqq->first_IO_time = jiffies; + ++ if (bfqq->wr_coeff > 1) ++ bfqq->service_from_wr += served; ++ + bfqq->service_from_backlogged += served; + for_each_entity(entity) { + st = bfq_entity_service_tree(entity); +diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c +index eff4c4edf5a0..486493aafaf8 100644 +--- a/block/bfq-sq-iosched.c ++++ b/block/bfq-sq-iosched.c +@@ -156,15 +156,17 @@ static struct kmem_cache *bfq_pool; + * interactive applications automatically, using the following formula: + * duration = (R / r) * T, where r is the peak rate of the device, and + * R and T are two reference parameters. +- * In particular, R is the peak rate of the reference device (see below), +- * and T is a reference time: given the systems that are likely to be +- * installed on the reference device according to its speed class, T is +- * about the maximum time needed, under BFQ and while reading two files in +- * parallel, to load typical large applications on these systems. +- * In practice, the slower/faster the device at hand is, the more/less it +- * takes to load applications with respect to the reference device. +- * Accordingly, the longer/shorter BFQ grants weight raising to interactive +- * applications. ++ * In particular, R is the peak rate of the reference device (see ++ * below), and T is a reference time: given the systems that are ++ * likely to be installed on the reference device according to its ++ * speed class, T is about the maximum time needed, under BFQ and ++ * while reading two files in parallel, to load typical large ++ * applications on these systems (see the comments on ++ * max_service_from_wr below, for more details on how T is obtained). 
++ * In practice, the slower/faster the device at hand is, the more/less
++ * it takes to load applications with respect to the reference device.
++ * Accordingly, the longer/shorter BFQ grants weight raising to
++ * interactive applications.
+ *
+ * BFQ uses four different reference pairs (R, T), depending on:
+ * . whether the device is rotational or non-rotational;
+@@ -201,6 +203,60 @@ static int T_slow[2];
+ static int T_fast[2];
+ static int device_speed_thresh[2];
+
++/*
++ * BFQ uses the above-detailed, time-based weight-raising mechanism to
++ * privilege interactive tasks. This mechanism is vulnerable to the
++ * following false positives: I/O-bound applications that will go on
++ * doing I/O for much longer than the duration of weight
++ * raising. These applications have basically no benefit from being
++ * weight-raised at the beginning of their I/O. On the opposite end,
++ * while being weight-raised, these applications
++ * a) unjustly steal throughput from applications that may actually need
++ * low latency;
++ * b) make BFQ uselessly perform device idling; device idling results
++ * in loss of device throughput with most flash-based storage, and may
++ * increase latencies when used purposelessly.
++ *
++ * BFQ tries to reduce these problems, by adopting the following
++ * countermeasure. To introduce this countermeasure, we need first to
++ * finish explaining how the duration of weight-raising for
++ * interactive tasks is computed.
++ *
++ * For a bfq_queue deemed as interactive, the duration of weight
++ * raising is dynamically adjusted, as a function of the estimated
++ * peak rate of the device, so as to be equal to the time needed to
++ * execute the 'largest' interactive task we benchmarked so far. By
++ * largest task, we mean the task for which each involved process has
++ * to do more I/O than for any of the other tasks we benchmarked. This
++ * reference interactive task is the start-up of LibreOffice Writer,
++ * and in this task each process/bfq_queue needs to have at most ~110K
++ * sectors transferred.
++ *
++ * This last piece of information enables BFQ to reduce the actual
++ * duration of weight-raising for at least one class of I/O-bound
++ * applications: those doing sequential or quasi-sequential I/O. An
++ * example is file copy. In fact, once started, the main I/O-bound
++ * processes of these applications usually consume the above 110K
++ * sectors in much less time than the processes of an application that
++ * is starting, because these I/O-bound processes will greedily devote
++ * almost all their CPU cycles only to their target,
++ * throughput-friendly I/O operations. This is even more true if BFQ
++ * happens to be underestimating the device peak rate, and thus
++ * overestimating the duration of weight raising. But, according to
++ * our measurements, once they have transferred 110K sectors, these
++ * processes have no right to be weight-raised any longer.
++ *
++ * Based on the last consideration, BFQ ends weight-raising for a
++ * bfq_queue if the latter happens to have received an amount of
++ * service at least equal to the following constant. The constant is
++ * set to slightly more than 110K, to have a minimum safety margin.
++ *
++ * This early ending of weight-raising reduces the amount of time
++ * during which interactive false positives cause the two problems
++ * described at the beginning of these comments.
++ */
++static const unsigned long max_service_from_wr = 120000;
++
+ #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
+
+@@ -1246,6 +1302,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
+ if (old_wr_coeff == 1 && wr_or_deserves_wr) {
+ /* start a weight-raising period */
+ if (interactive) {
++ bfqq->service_from_wr = 0;
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+ } else {
+@@ -3794,6 +3851,15 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+ "back to interactive wr");
+ }
+ }
++ if (bfqq->wr_coeff > 1 &&
++ bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time &&
++ bfqq->service_from_wr > max_service_from_wr) {
++ /* see comments on max_service_from_wr */
++ bfq_bfqq_end_wr(bfqq);
++ bfq_log_bfqq(bfqd, bfqq,
++ "[%s] too much service",
++ __func__);
++ }
+ }
+ /*
+ * To improve latency (for this or other queues), immediately
+diff --git a/block/bfq.h b/block/bfq.h
+index 59539adc00a5..0cd7a3f251a7 100644
+--- a/block/bfq.h
++++ b/block/bfq.h
+@@ -323,6 +323,11 @@ struct bfq_queue {
+ * last transition from idle to backlogged.
+ */
+ unsigned long service_from_backlogged;
++ /*
++ * Cumulative service received from the @bfq_queue since its
++ * last transition to weight-raised state.
++ */
++ unsigned long service_from_wr;
+ /*
+ * Value of wr start time when switching to soft rt
+ */
+
+From 59efebb94b2f9bac653faf62dadb45b83bd27fa7 Mon Sep 17 00:00:00 2001
+From: Paolo Valente
+Date: Thu, 4 Jan 2018 16:29:58 +0100
+Subject: [PATCH 12/23] bfq-sq, bfq-mq: put async queues for root bfq groups
+ too
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+For each pair [device for which bfq is selected as I/O scheduler,
+group in blkio/io], bfq maintains a corresponding bfq group. Each such
+bfq group contains a set of async queues, with each async queue
+created on demand, i.e., when some I/O request arrives for it. On
+creation, an async queue gets an extra reference, to make sure that
+the queue is not freed as long as its bfq group exists. Accordingly,
+to allow the queue to be freed after the group has exited, this extra
+reference must be released on group exit.
+
+The above holds also for a bfq root group, i.e., for the bfq group
+corresponding to the root blkio/io group for a given device. Yet, by
+mistake, the references to the existing async queues of a root group
+are not released when the latter exits. This causes a memory leak when
+the instance of bfq for a given device exits. In a similar vein,
+bfqg_stats_xfer_dead is not executed for a root group.
+
+This commit fixes bfq_pd_offline so that the latter executes the above
+missing operations for a root group too.
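+
+A compact user-space sketch of the leak pattern fixed here (hypothetical
+names, not the kernel code): the early return taken for the root group
+skips the put that balances the extra reference acquired at async-queue
+creation, so the queue never reaches a zero refcount and is never freed.
+
+	#include <stdio.h>
+
+	struct queue { int ref; };
+
+	static void put_queue(struct queue *q)
+	{
+		if (--q->ref == 0)
+			printf("queue freed\n");
+	}
+
+	static void group_offline(struct queue *async_queue, int is_root)
+	{
+		if (is_root)
+			return;			/* bug: creation-time ref never dropped */
+		put_queue(async_queue);		/* balanced release on group exit */
+	}
+
+	int main(void)
+	{
+		struct queue q = { .ref = 1 };	/* extra ref taken on creation */
+		group_offline(&q, 1);		/* root group: "queue freed" never prints */
+		return 0;
+	}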
+
+Reported-by: Holger Hoffstätte
+Reported-by: Guoqing Jiang
+Signed-off-by: Davide Ferrari
+Signed-off-by: Paolo Valente
+---
+ block/bfq-cgroup-included.c | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c
+index 562b0ce581a7..45fefb2e2d57 100644
+--- a/block/bfq-cgroup-included.c
++++ b/block/bfq-cgroup-included.c
+@@ -885,13 +885,13 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
+
+ entity = bfqg->my_entity;
+
+- if (!entity) /* root group */
+- return;
+-
+ #ifdef BFQ_MQ
+ spin_lock_irqsave(&bfqd->lock, flags);
+ #endif
+
++ if (!entity) /* root group */
++ goto put_async_queues;
++
+ /*
+ * Empty all service_trees belonging to this group before
+ * deactivating the group itself.
+@@ -926,6 +926,8 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
+ BUG_ON(bfqg->sched_data.in_service_entity);
+
+ __bfq_deactivate_entity(entity, false);
++
++put_async_queues:
+ bfq_put_async_queues(bfqd, bfqg);
+
+ #ifdef BFQ_MQ
+
+From 2dfbaaaf95054e2da3ededc0deb1ba5a4f589e53 Mon Sep 17 00:00:00 2001
+From: Paolo Valente
+Date: Mon, 8 Jan 2018 19:38:45 +0100
+Subject: [PATCH 13/23] bfq-sq, bfq-mq: release oom-queue ref to root group on
+ exit
+
+On scheduler init, a reference to the root group, and a reference to
+its corresponding blkg are taken for the oom queue. Yet these
+references are not released on scheduler exit, which prevents these
+objects from being freed. This commit adds the missing reference
+releases.
+
+Reported-by: Davide Ferrari
+Signed-off-by: Paolo Valente
+---
+ block/bfq-mq-iosched.c | 3 +++
+ block/bfq-sq-iosched.c | 3 +++
+ 2 files changed, 6 insertions(+)
+
+diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c
+index 63fdd16dec3c..b82c52fabf91 100644
+--- a/block/bfq-mq-iosched.c
++++ b/block/bfq-mq-iosched.c
+@@ -5507,6 +5507,9 @@ static void bfq_exit_queue(struct elevator_queue *e)
+
+ BUG_ON(hrtimer_active(&bfqd->idle_slice_timer));
+
++ /* release oom-queue reference to root group */
++ bfqg_and_blkg_put(bfqd->root_group);
++
+ #ifdef BFQ_GROUP_IOSCHED_ENABLED
+ blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
+ #else
+diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c
+index 486493aafaf8..851af055664d 100644
+--- a/block/bfq-sq-iosched.c
++++ b/block/bfq-sq-iosched.c
+@@ -5052,6 +5052,9 @@ static void bfq_exit_queue(struct elevator_queue *e)
+
+ BUG_ON(hrtimer_active(&bfqd->idle_slice_timer));
+
++ /* release oom-queue reference to root group */
++ bfqg_put(bfqd->root_group);
++
+ #ifdef BFQ_GROUP_IOSCHED_ENABLED
+ blkcg_deactivate_policy(q, &blkcg_policy_bfq);
+ #else
+ bfq_put_async_queues(bfqd, bfqd->root_group);
+
+From 13efe00c8292d78d223e1090a7f36426e360eb38 Mon Sep 17 00:00:00 2001
+From: Paolo Valente
+Date: Mon, 8 Jan 2018 19:40:38 +0100
+Subject: [PATCH 14/23] block, bfq-sq, bfq-mq: trace get and put of bfq groups
+
+Signed-off-by: Paolo Valente
+---
+ block/bfq-cgroup-included.c | 15 +++++++++++++++
+ block/bfq-mq-iosched.c | 3 ++-
+ 2 files changed, 17 insertions(+), 1 deletion(-)
+
+diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c
+index 45fefb2e2d57..f94743fb2e7d 100644
+--- a/block/bfq-cgroup-included.c
++++ b/block/bfq-cgroup-included.c
+@@ -267,6 +267,8 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
+
+ static void bfqg_get(struct bfq_group *bfqg)
+ {
++ trace_printk("bfqg %p\n", bfqg);
++
+ #ifdef BFQ_MQ
+ bfqg->ref++;
+ #else
+@@ -280,6 +282,9 @@ static void bfqg_put(struct bfq_group *bfqg)
+ bfqg->ref--;
+
+ BUG_ON(bfqg->ref < 0);
++ trace_printk("putting bfqg %p %s\n", bfqg, ++ bfqg->ref == 0 ? "and freeing it" : ""); ++ + if (bfqg->ref == 0) + kfree(bfqg); + #else +@@ -293,6 +298,7 @@ static void bfqg_and_blkg_get(struct bfq_group *bfqg) + /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ + bfqg_get(bfqg); + ++ trace_printk("getting blkg for bfqg %p\n", bfqg); + blkg_get(bfqg_to_blkg(bfqg)); + } + +@@ -300,6 +306,7 @@ static void bfqg_and_blkg_put(struct bfq_group *bfqg) + { + bfqg_put(bfqg); + ++ trace_printk("putting blkg for bfqg %p\n", bfqg); + blkg_put(bfqg_to_blkg(bfqg)); + } + #endif +@@ -382,6 +389,8 @@ static void bfq_init_entity(struct bfq_entity *entity, + * Make sure that bfqg and its associated blkg do not + * disappear before entity. + */ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] getting bfqg %p and blkg\n", __func__, bfqg); ++ + bfqg_and_blkg_get(bfqg); + #else + bfqg_get(bfqg); +@@ -475,6 +484,7 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) + kfree(bfqg); + return NULL; + } ++ trace_printk("bfqg %p\n", bfqg); + + #ifdef BFQ_MQ + /* see comments in bfq_bic_update_cgroup for why refcounting */ +@@ -513,6 +523,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd) + static void bfq_pd_free(struct blkg_policy_data *pd) + { + struct bfq_group *bfqg = pd_to_bfqg(pd); ++ trace_printk("bfqg %p\n", bfqg); + + bfqg_stats_exit(&bfqg->stats); + #ifdef BFQ_MQ +@@ -650,6 +661,8 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); + } + #ifdef BFQ_MQ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] putting blkg and bfqg %p\n", __func__, bfqg); ++ + bfqg_and_blkg_put(bfqq_group(bfqq)); + #else + bfqg_put(bfqq_group(bfqq)); +@@ -658,6 +671,8 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + entity->parent = bfqg->my_entity; + entity->sched_data = &bfqg->sched_data; + #ifdef BFQ_MQ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] getting blkg and bfqg %p\n", __func__, bfqg); ++ + /* pin down bfqg and its associated blkg */ + bfqg_and_blkg_get(bfqg); + #else +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index b82c52fabf91..d5b7a6b985d7 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -4385,10 +4385,11 @@ static void bfq_put_queue(struct bfq_queue *bfqq) + if (bfqq->bfqd) + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); + +- kmem_cache_free(bfq_pool, bfqq); + #ifdef BFQ_GROUP_IOSCHED_ENABLED ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] putting blkg and bfqg %p\n", __func__, bfqg); + bfqg_and_blkg_put(bfqg); + #endif ++ kmem_cache_free(bfq_pool, bfqq); + } + + static void bfq_put_cooperator(struct bfq_queue *bfqq) + +From 816b77fba966171974eb5ee25d81bc4e19eaf1b4 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Wed, 10 Jan 2018 09:08:22 +0100 +Subject: [PATCH 15/23] bfq-sq, bfq-mq: compile group put for oom queue only if + BFQ_GROUP_IOSCHED is set + +Commit ("bfq-sq, bfq-mq: release oom-queue ref to root group on exit") +added a missing put of the root bfq group for the oom queue. That put +has to be, and can be, performed only if CONFIG_BFQ_GROUP_IOSCHED is +defined: the function doing the put is even not defined at all if +CONFIG_BFQ_GROUP_IOSCHED is not defined. But that commit makes that +put be invoked regardless of whether CONFIG_BFQ_GROUP_IOSCHED is +defined. This commit fixes this mistake, by making that invocation be +compiled only if CONFIG_BFQ_GROUP_IOSCHED is actually defined. 
+ +Fixes ("block, bfq: release oom-queue ref to root group on exit") +Reported-by: Jan Alexander Steffens +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 2 +- + block/bfq-sq-iosched.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index d5b7a6b985d7..2581fe0f6f2f 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -5508,10 +5508,10 @@ static void bfq_exit_queue(struct elevator_queue *e) + + BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); + ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + /* release oom-queue reference to root group */ + bfqg_and_blkg_put(bfqd->root_group); + +-#ifdef BFQ_GROUP_IOSCHED_ENABLED + blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); + #else + spin_lock_irq(&bfqd->lock); +diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c +index 851af055664d..c4df156b1fb4 100644 +--- a/block/bfq-sq-iosched.c ++++ b/block/bfq-sq-iosched.c +@@ -5052,10 +5052,10 @@ static void bfq_exit_queue(struct elevator_queue *e) + + BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); + ++#ifdef BFQ_GROUP_IOSCHED_ENABLED + /* release oom-queue reference to root group */ + bfqg_put(bfqd->root_group); + +-#ifdef BFQ_GROUP_IOSCHED_ENABLED + blkcg_deactivate_policy(q, &blkcg_policy_bfq); + #else + bfq_put_async_queues(bfqd, bfqd->root_group); + +From 643a89c659172b2c9ae16adfe03af4e3e88e1326 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Sat, 13 Jan 2018 18:48:41 +0100 +Subject: [PATCH 16/23] block, bfq-sq, bfq-mq: remove trace_printks + +Commit ("block, bfq-sq, bfq-mq: trace get and put of bfq groups") +unwisely added some invocations of the function trace_printk, which +is inappropriate in production kernels. This commit removes those +invocations. + +Signed-off-by: Paolo Valente +--- + block/bfq-cgroup-included.c | 10 ---------- + 1 file changed, 10 deletions(-) + +diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c +index f94743fb2e7d..a4f8a03edfc9 100644 +--- a/block/bfq-cgroup-included.c ++++ b/block/bfq-cgroup-included.c +@@ -267,8 +267,6 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) + + static void bfqg_get(struct bfq_group *bfqg) + { +- trace_printk("bfqg %p\n", bfqg); +- + #ifdef BFQ_MQ + bfqg->ref++; + #else +@@ -282,9 +280,6 @@ static void bfqg_put(struct bfq_group *bfqg) + bfqg->ref--; + + BUG_ON(bfqg->ref < 0); +- trace_printk("putting bfqg %p %s\n", bfqg, +- bfqg->ref == 0 ? 
"and freeing it" : ""); +- + if (bfqg->ref == 0) + kfree(bfqg); + #else +@@ -298,7 +293,6 @@ static void bfqg_and_blkg_get(struct bfq_group *bfqg) + /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ + bfqg_get(bfqg); + +- trace_printk("getting blkg for bfqg %p\n", bfqg); + blkg_get(bfqg_to_blkg(bfqg)); + } + +@@ -306,7 +300,6 @@ static void bfqg_and_blkg_put(struct bfq_group *bfqg) + { + bfqg_put(bfqg); + +- trace_printk("putting blkg for bfqg %p\n", bfqg); + blkg_put(bfqg_to_blkg(bfqg)); + } + #endif +@@ -484,8 +477,6 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) + kfree(bfqg); + return NULL; + } +- trace_printk("bfqg %p\n", bfqg); +- + #ifdef BFQ_MQ + /* see comments in bfq_bic_update_cgroup for why refcounting */ + bfqg_get(bfqg); +@@ -523,7 +514,6 @@ static void bfq_pd_init(struct blkg_policy_data *pd) + static void bfq_pd_free(struct blkg_policy_data *pd) + { + struct bfq_group *bfqg = pd_to_bfqg(pd); +- trace_printk("bfqg %p\n", bfqg); + + bfqg_stats_exit(&bfqg->stats); + #ifdef BFQ_MQ + +From ce050275e24fecec800f346c09d9494563e9fc8a Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Mon, 15 Jan 2018 15:07:05 +0100 +Subject: [PATCH 17/23] block, bfq-mq: add requeue-request hook + +Commit 'a6a252e64914 ("blk-mq-sched: decide how to handle flush rq via +RQF_FLUSH_SEQ")' makes all non-flush re-prepared requests for a device +be re-inserted into the active I/O scheduler for that device. As a +consequence, I/O schedulers may get the same request inserted again, +even several times, without a finish_request invoked on that request +before each re-insertion. + +This fact is the cause of the failure reported in [1]. For an I/O +scheduler, every re-insertion of the same re-prepared request is +equivalent to the insertion of a new request. For schedulers like +mq-deadline or kyber, this fact causes no harm. In contrast, it +confuses a stateful scheduler like BFQ, which keeps state for an I/O +request, until the finish_request hook is invoked on the request. In +particular, BFQ may get stuck, waiting forever for the number of +request dispatches, of the same request, to be balanced by an equal +number of request completions (while there will be one completion for +that request). In this state, BFQ may refuse to serve I/O requests +from other bfq_queues. The hang reported in [1] then follows. + +However, the above re-prepared requests undergo a requeue, thus the +requeue_request hook of the active elevator is invoked for these +requests, if set. This commit then addresses the above issue by +properly implementing the hook requeue_request in BFQ. 
+ +[1] https://marc.info/?l=linux-block&m=151211117608676 + +Reported-by: Ivan Kozik +Reported-by: Alban Browaeys +Signed-off-by: Paolo Valente +Signed-off-by: Serena Ziviani +--- + block/bfq-mq-iosched.c | 90 ++++++++++++++++++++++++++++++++++++++++---------- + 1 file changed, 73 insertions(+), 17 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 2581fe0f6f2f..bb7ccc2f1165 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -4162,9 +4162,9 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + * TESTING: reset DISP_LIST flag, because: 1) + * this rq this request has passed through + * bfq_prepare_request, 2) then it will have +- * bfq_finish_request invoked on it, and 3) in +- * bfq_finish_request we use this flag to check +- * that bfq_finish_request is not invoked on ++ * bfq_finish_requeue_request invoked on it, and 3) in ++ * bfq_finish_requeue_request we use this flag to check ++ * that bfq_finish_requeue_request is not invoked on + * requests for which bfq_prepare_request has + * been invoked. + */ +@@ -4173,8 +4173,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + } + + /* +- * We exploit the bfq_finish_request hook to decrement +- * rq_in_driver, but bfq_finish_request will not be ++ * We exploit the bfq_finish_requeue_request hook to decrement ++ * rq_in_driver, but bfq_finish_requeue_request will not be + * invoked on this request. So, to avoid unbalance, + * just start this request, without incrementing + * rq_in_driver. As a negative consequence, +@@ -4183,10 +4183,10 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + * bfq_schedule_dispatch to be invoked uselessly. + * + * As for implementing an exact solution, the +- * bfq_finish_request hook, if defined, is probably ++ * bfq_finish_requeue_request hook, if defined, is probably + * invoked also on this request. So, by exploiting + * this hook, we could 1) increment rq_in_driver here, +- * and 2) decrement it in bfq_finish_request. Such a ++ * and 2) decrement it in bfq_finish_requeue_request. Such a + * solution would let the value of the counter be + * always accurate, but it would entail using an extra + * interface function. This cost seems higher than the +@@ -4878,6 +4878,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) + return idle_timer_disabled; + } + ++static void bfq_prepare_request(struct request *rq, struct bio *bio); ++ + static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool at_head) + { +@@ -4919,6 +4921,20 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + BUG_ON(!(rq->rq_flags & RQF_GOT)); + rq->rq_flags &= ~RQF_GOT; + ++ if (!bfqq) { ++ /* ++ * This should never happen. Most likely rq is ++ * a requeued regular request, being ++ * re-inserted without being first ++ * re-prepared. Do a prepare, to avoid ++ * failure. 
++ */ ++ pr_warn("Regular request associated with no queue"); ++ WARN_ON(1); ++ bfq_prepare_request(rq, rq->bio); ++ bfqq = RQ_BFQQ(rq); ++ } ++ + #if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) + idle_timer_disabled = __bfq_insert_request(bfqd, rq); + /* +@@ -5110,7 +5126,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) + } + } + +-static void bfq_finish_request_body(struct bfq_queue *bfqq) ++static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) + { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "put_request_body: allocated %d", bfqq->allocated); +@@ -5120,7 +5136,13 @@ static void bfq_finish_request_body(struct bfq_queue *bfqq) + bfq_put_queue(bfqq); + } + +-static void bfq_finish_request(struct request *rq) ++/* ++ * Handle either a requeue or a finish for rq. The things to do are ++ * the same in both cases: all references to rq are to be dropped. In ++ * particular, rq is considered completed from the point of view of ++ * the scheduler. ++ */ ++static void bfq_finish_requeue_request(struct request *rq) + { + struct bfq_queue *bfqq; + struct bfq_data *bfqd; +@@ -5128,11 +5150,27 @@ static void bfq_finish_request(struct request *rq) + + BUG_ON(!rq); + +- if (!rq->elv.icq) ++ bfqq = RQ_BFQQ(rq); ++ ++ /* ++ * Requeue and finish hooks are invoked in blk-mq without ++ * checking whether the involved request is actually still ++ * referenced in the scheduler. To handle this fact, the ++ * following two checks make this function exit in case of ++ * spurious invocations, for which there is nothing to do. ++ * ++ * First, check whether rq has nothing to do with an elevator. ++ */ ++ if (unlikely(!(rq->rq_flags & RQF_ELVPRIV))) + return; + +- bfqq = RQ_BFQQ(rq); +- BUG_ON(!bfqq); ++ /* ++ * rq either is not associated with any icq, or is an already ++ * requeued request that has not (yet) been re-inserted into ++ * a bfq_queue. ++ */ ++ if (!rq->elv.icq || !bfqq) ++ return; + + bic = RQ_BIC(rq); + BUG_ON(!bic); +@@ -5145,7 +5183,6 @@ static void bfq_finish_request(struct request *rq) + BUG(); + } + BUG_ON(rq->rq_flags & RQF_QUEUED); +- BUG_ON(!(rq->rq_flags & RQF_ELVPRIV)); + + bfq_log_bfqq(bfqd, bfqq, + "putting rq %p with %u sects left, STARTED %d", +@@ -5166,13 +5203,14 @@ static void bfq_finish_request(struct request *rq) + spin_lock_irqsave(&bfqd->lock, flags); + + bfq_completed_request(bfqq, bfqd); +- bfq_finish_request_body(bfqq); ++ bfq_finish_requeue_request_body(bfqq); + + spin_unlock_irqrestore(&bfqd->lock, flags); + } else { + /* + * Request rq may be still/already in the scheduler, +- * in which case we need to remove it. And we cannot ++ * in which case we need to remove it (this should ++ * never happen in case of requeue). And we cannot + * defer such a check and removal, to avoid + * inconsistencies in the time interval from the end + * of this function to the start of the deferred work. +@@ -5189,9 +5227,26 @@ static void bfq_finish_request(struct request *rq) + bfqg_stats_update_io_remove(bfqq_group(bfqq), + rq->cmd_flags); + } +- bfq_finish_request_body(bfqq); ++ bfq_finish_requeue_request_body(bfqq); + } + ++ /* ++ * Reset private fields. In case of a requeue, this allows ++ * this function to correctly do nothing if it is spuriously ++ * invoked again on this same request (see the check at the ++ * beginning of the function). 
Probably, a better general
++ * design would be to prevent blk-mq from invoking the requeue
++ * or finish hooks of an elevator, for a request that is not
++ * referred by that elevator.
++ *
++ * Resetting the following fields would break the
++ * request-insertion logic if rq is re-inserted into a bfq
++ * internal queue, without a re-preparation. Here we assume
++ * that re-insertions of requeued requests, without
++ * re-preparation, can happen only for pass_through or at_head
++ * requests (which are not re-inserted into bfq internal
++ * queues).
++ */
+ rq->elv.priv[0] = NULL;
+ rq->elv.priv[1] = NULL;
+ }
+@@ -5960,7 +6015,8 @@ static struct elevator_type iosched_bfq_mq = {
+ .ops.mq = {
+ .limit_depth = bfq_limit_depth,
+ .prepare_request = bfq_prepare_request,
+- .finish_request = bfq_finish_request,
++ .requeue_request = bfq_finish_requeue_request,
++ .finish_request = bfq_finish_requeue_request,
+ .exit_icq = bfq_exit_icq,
+ .insert_requests = bfq_insert_requests,
+ .dispatch_request = bfq_dispatch_request,
+
+From 3e4f292191cc62b3844316b9741534c3f1b36f0a Mon Sep 17 00:00:00 2001
+From: Davide Paganelli
+Date: Thu, 8 Feb 2018 12:19:24 +0100
+Subject: [PATCH 18/23] block, bfq-mq, bfq-sq: make log functions print names
+ of calling functions
+
+Add the macro __func__ as a parameter to the invocations of the functions
+pr_crit, blk_add_trace_msg and blk_add_cgroup_trace_msg in the bfq_log*
+functions, so that the name of the calling function is automatically
+included in each log message. Callers then no longer need to add it by
+hand.
+
+Signed-off-by: Davide Paganelli
+Signed-off-by: Paolo Valente
+---
+ block/bfq-cgroup-included.c | 9 +--
+ block/bfq-mq-iosched.c | 167 ++++++++++++++++++++++----------------------
+ block/bfq-mq.h | 33 ++++-----
+ block/bfq-sched.c | 54 +++++++-------
+ block/bfq-sq-iosched.c | 134 +++++++++++++++++------------------
+ block/bfq.h | 33 ++++-----
+ 6 files changed, 214 insertions(+), 216 deletions(-)
+
+diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c
+index a4f8a03edfc9..613f154e9da5 100644
+--- a/block/bfq-cgroup-included.c
++++ b/block/bfq-cgroup-included.c
+@@ -382,7 +382,8 @@ static void bfq_init_entity(struct bfq_entity *entity,
+ * Make sure that bfqg and its associated blkg do not
+ * disappear before entity.
+ */ +- bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] getting bfqg %p and blkg\n", __func__, bfqg); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "getting bfqg %p and blkg\n", ++ bfqg); + + bfqg_and_blkg_get(bfqg); + #else +@@ -651,7 +652,7 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); + } + #ifdef BFQ_MQ +- bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] putting blkg and bfqg %p\n", __func__, bfqg); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "putting blkg and bfqg %p\n", bfqg); + + bfqg_and_blkg_put(bfqq_group(bfqq)); + #else +@@ -661,7 +662,7 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + entity->parent = bfqg->my_entity; + entity->sched_data = &bfqg->sched_data; + #ifdef BFQ_MQ +- bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] getting blkg and bfqg %p\n", __func__, bfqg); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "getting blkg and bfqg %p\n", bfqg); + + /* pin down bfqg and its associated blkg */ + bfqg_and_blkg_get(bfqg); +@@ -721,7 +722,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, + if (entity->sched_data != &bfqg->sched_data) { + bic_set_bfqq(bic, NULL, 0); + bfq_log_bfqq(bfqd, async_bfqq, +- "bic_change_group: %p %d", ++ "%p %d", + async_bfqq, + async_bfqq->ref); + bfq_put_queue(async_bfqq); +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index bb7ccc2f1165..edc93b6af186 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -310,7 +310,7 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, + static void bfq_schedule_dispatch(struct bfq_data *bfqd) + { + if (bfqd->queued != 0) { +- bfq_log(bfqd, "schedule dispatch"); ++ bfq_log(bfqd, ""); + blk_mq_run_hw_queues(bfqd->queue, true); + } + } +@@ -489,8 +489,8 @@ static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) + data->shallow_depth = + bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; + +- bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", +- __func__, bfqd->wr_busy_queues, op_is_sync(op), ++ bfq_log(bfqd, "wr_busy %d sync %d depth %u", ++ bfqd->wr_busy_queues, op_is_sync(op), + data->shallow_depth); + } + +@@ -528,7 +528,7 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, + if (rb_link) + *rb_link = p; + +- bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", ++ bfq_log(bfqd, "%llu: returning %d", + (unsigned long long) sector, + bfqq ? 
bfqq->pid : 0); + +@@ -749,7 +749,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq, + if (rq == last || ktime_get_ns() < rq->fifo_time) + return NULL; + +- bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "returned %p", rq); + BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); + return rq; + } +@@ -842,7 +842,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, + bfq_serv_to_charge(next_rq, bfqq)); + if (entity->budget != new_budget) { + entity->budget = new_budget; +- bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", ++ bfq_log_bfqq(bfqd, bfqq, "new budget %lu", + new_budget); + bfq_requeue_bfqq(bfqd, bfqq, false); + } +@@ -915,8 +915,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); + + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", +- __func__, ++ "bic %p wr_coeff %d start_finish %lu max_time %lu", + bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, + bfqq->wr_cur_max_time); + +@@ -929,11 +928,11 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, + bfq_wr_duration(bfqd))) { + switch_back_to_interactive_wr(bfqq, bfqd); + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "resume state: switching back to interactive"); ++ "switching back to interactive"); + } else { + bfqq->wr_coeff = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "resume state: switching off wr (%lu + %lu < %lu)", ++ "switching off wr (%lu + %lu < %lu)", + bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, + jiffies); + } +@@ -985,7 +984,7 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) + /* Increment burst size to take into account also bfqq */ + bfqd->burst_size++; + +- bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); ++ bfq_log_bfqq(bfqd, bfqq, "%d", bfqd->burst_size); + + BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); + +@@ -998,7 +997,7 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) + * other to consider this burst as large. + */ + bfqd->large_burst = true; +- bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); ++ bfq_log_bfqq(bfqd, bfqq, "large burst started"); + + /* + * We can now mark all queues in the burst list as +@@ -1170,7 +1169,7 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) + bfqd->large_burst = false; + bfq_reset_burst_list(bfqd, bfqq); + bfq_log_bfqq(bfqd, bfqq, +- "handle_burst: late activation or different group"); ++ "late activation or different group"); + goto end; + } + +@@ -1180,7 +1179,7 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) + * bfqq as belonging to this large burst immediately. + */ + if (bfqd->large_burst) { +- bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); ++ bfq_log_bfqq(bfqd, bfqq, "marked in burst"); + bfq_mark_bfqq_in_large_burst(bfqq); + goto end; + } +@@ -1686,7 +1685,7 @@ static void bfq_add_request(struct request *rq) + unsigned int old_wr_coeff = bfqq->wr_coeff; + bool interactive = false; + +- bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", ++ bfq_log_bfqq(bfqd, bfqq, "size %u %s", + blk_rq_sectors(rq), rq_is_sync(rq) ? 
"S" : "A"); + + if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ +@@ -1952,7 +1951,7 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, + __rq = bfq_find_rq_fmerge(bfqd, bio, q); + if (__rq && elv_bio_merge_ok(__rq, bio)) { + *req = __rq; +- bfq_log(bfqd, "request_merge: req %p", __rq); ++ bfq_log(bfqd, "req %p", __rq); + + return ELEVATOR_FRONT_MERGE; + } +@@ -1989,7 +1988,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, + bfqq->next_rq = next_rq; + + bfq_log_bfqq(bfqd, bfqq, +- "request_merged: req %p prev %p next_rq %p bfqq %p", ++ "req %p prev %p next_rq %p bfqq %p", + req, prev, next_rq, bfqq); + + /* +@@ -2018,7 +2017,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, + goto end; + + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "requests_merged: rq %p next %p bfqq %p next_bfqq %p", ++ "rq %p next %p bfqq %p next_bfqq %p", + rq, next, bfqq, next_bfqq); + + spin_lock_irq(&bfqq->bfqd->lock); +@@ -2069,10 +2068,10 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) + */ + bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "end_wr: wrais ending at %lu, rais_max_time %u", ++ "wrais ending at %lu, rais_max_time %u", + bfqq->last_wr_start_finish, + jiffies_to_msecs(bfqq->wr_cur_max_time)); +- bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "wr_busy %d", + bfqq->bfqd->wr_busy_queues); + } + +@@ -2245,8 +2244,8 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, + { + if (bfq_too_late_for_merging(new_bfqq)) { + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "[%s] too late for bfq%d to be merged", +- __func__, new_bfqq->pid); ++ "too late for bfq%d to be merged", ++ new_bfqq->pid); + return false; + } + +@@ -2395,8 +2394,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) + } + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", +- __func__, ++ "bic %p wr_coeff %d start_finish %lu max_time %lu", + bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, + bfqq->wr_cur_max_time); + } +@@ -2453,7 +2451,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, + + } + +- bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", ++ bfq_log_bfqq(bfqd, new_bfqq, "wr_busy %d", + bfqd->wr_busy_queues); + + /* +@@ -2554,7 +2552,7 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd, + bfqq->budget_timeout = jiffies + + bfqd->bfq_timeout * timeout_coeff; + +- bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", ++ bfq_log_bfqq(bfqd, bfqq, "%u", + jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); + } + +@@ -2620,10 +2618,10 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, + + bfq_set_budget_timeout(bfqd, bfqq); + bfq_log_bfqq(bfqd, bfqq, +- "set_in_service_queue, cur-budget = %d", ++ "cur-budget = %d", + bfqq->entity.budget); + } else +- bfq_log(bfqd, "set_in_service_queue: NULL"); ++ bfq_log(bfqd, "NULL"); + + bfqd->in_service_queue = bfqq; + } +@@ -2746,7 +2744,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq + bfqd->peak_rate_samples = 0; /* full re-init on next disp. 
*/
+
+ bfq_log(bfqd,
+- "reset_rate_computation at end, sample %u/%u tot_sects %llu",
++ "at end, sample %u/%u tot_sects %llu",
+ bfqd->peak_rate_samples, bfqd->sequential_samples,
+ bfqd->tot_sectors_dispatched);
+ }
+@@ -2766,7 +2764,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
+ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
+ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) {
+ bfq_log(bfqd,
+- "update_rate_reset: only resetting, delta_first %lluus samples %d",
++ "only resetting, delta_first %lluus samples %d",
+ bfqd->delta_from_first>>10, bfqd->peak_rate_samples);
+ goto reset_computation;
+ }
+@@ -2790,7 +2788,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
+ div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
+
+ bfq_log(bfqd,
+-"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)",
++"tot_sects %llu delta_first %lluus rate %llu sects/s (%d)",
+ bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10,
+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
+ rate > 20<<BFQ_RATE_SHIFT);
+@@ -2809,14 +2807,14 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
+ rate <= bfqd->peak_rate) ||
+ rate > 20<<BFQ_RATE_SHIFT) {
+ bfq_log(bfqd,
+- "update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu",
++ "goto reset, samples %u/%u rate/peak %llu/%llu",
+ bfqd->peak_rate_samples, bfqd->sequential_samples,
+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
+ goto reset_computation;
+ } else {
+ bfq_log(bfqd,
+- "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu",
++ "do update, samples %u/%u rate/peak %llu/%llu",
+ bfqd->peak_rate_samples, bfqd->sequential_samples,
+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
+@@ -2868,7 +2866,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
+ rate /= divisor; /* smoothing constant alpha = 1/divisor */
+
+ bfq_log(bfqd,
+- "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u",
++ "divisor %d tmp_peak_rate %llu tmp_rate %u",
+ divisor,
+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT),
+ (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT));
+@@ -2922,7 +2920,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
+
+ if (bfqd->peak_rate_samples == 0) { /* first dispatch */
+ bfq_log(bfqd,
+- "update_peak_rate: goto reset, samples %d",
++ "goto reset, samples %d",
+ bfqd->peak_rate_samples) ;
+ bfq_reset_rate_computation(bfqd, rq);
+ goto update_last_values; /* will add one sample */
+@@ -2943,7 +2941,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
+ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
+ bfqd->rq_in_driver == 0) {
+ bfq_log(bfqd,
+-"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d",
++"jumping to updating&resetting delta_last %lluus samples %d",
+ (now_ns - bfqd->last_dispatch)>>10,
+ bfqd->peak_rate_samples) ;
+ goto update_rate_and_reset;
+@@ -2969,7 +2967,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
+ bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
+
+ bfq_log(bfqd,
+- "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus",
++ "added samples %u/%u tot_sects %llu delta_first %lluus",
+ bfqd->peak_rate_samples, bfqd->sequential_samples,
+ bfqd->tot_sectors_dispatched,
+ bfqd->delta_from_first>>10);
+@@ -2985,12 +2983,12 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
+ bfqd->last_dispatch = now_ns;
+
+ bfq_log(bfqd,
+- "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu",
++ "delta_first %lluus last_pos %llu peak_rate %llu",
+ (now_ns -
bfqd->first_dispatch)>>10, + (unsigned long long) bfqd->last_position, + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); + bfq_log(bfqd, +- "update_peak_rate: samples at end %d", bfqd->peak_rate_samples); ++ "samples at end %d", bfqd->peak_rate_samples); + } + + /* +@@ -3088,11 +3086,11 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, + */ + budget = 2 * min_budget; + +- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", ++ bfq_log_bfqq(bfqd, bfqq, "last budg %d, budg left %d", + bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); +- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", ++ bfq_log_bfqq(bfqd, bfqq, "last max_budg %d, min budg %d", + budget, bfq_min_budget(bfqd)); +- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", ++ bfq_log_bfqq(bfqd, bfqq, "sync %d, seeky %d", + bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); + + if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { +@@ -3294,7 +3292,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, + else /* charge at least one seek */ + *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; + +- bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs); ++ bfq_log(bfqd, "too short %u", delta_usecs); + + return slow; + } +@@ -3317,11 +3315,11 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, + * peak rate. + */ + slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; +- bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", ++ bfq_log(bfqd, "relative rate %d/%d", + bfqq->entity.service, bfqd->bfq_max_budget); + } + +- bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); ++ bfq_log_bfqq(bfqd, bfqq, "slow %d", slow); + + return slow; + } +@@ -3423,7 +3421,7 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, + struct bfq_queue *bfqq) + { + bfq_log_bfqq(bfqd, bfqq, +-"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", ++"service_blkg %lu soft_rate %u sects/sec interval %u", + bfqq->service_from_backlogged, + bfqd->bfq_wr_max_softrt_rate, + jiffies_to_msecs(HZ * bfqq->service_from_backlogged / +@@ -3602,7 +3600,7 @@ static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) + static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) + { + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "may_budget_timeout: wait_request %d left %d timeout %d", ++ "wait_request %d left %d timeout %d", + bfq_bfqq_wait_request(bfqq), + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, + bfq_bfqq_budget_timeout(bfqq)); +@@ -3863,11 +3861,11 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) + * either boosts the throughput (without issues), or is + * necessary to preserve service guarantees. 
+ */ +- bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", ++ bfq_log_bfqq(bfqd, bfqq, "sync %d idling_boosts_thr %d", + bfq_bfqq_sync(bfqq), idling_boosts_thr); + + bfq_log_bfqq(bfqd, bfqq, +- "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", ++ "wr_busy %d boosts %d IO-bound %d guar %d", + bfqd->wr_busy_queues, + idling_boosts_thr_without_issues, + bfq_bfqq_IO_bound(bfqq), +@@ -3907,7 +3905,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) + if (!bfqq) + goto new_queue; + +- bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); ++ bfq_log_bfqq(bfqd, bfqq, "already in-service queue"); + + if (bfq_may_expire_for_budg_timeout(bfqq) && + !bfq_bfqq_wait_request(bfqq) && +@@ -3983,14 +3981,14 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) + new_queue: + bfqq = bfq_set_in_service_queue(bfqd); + if (bfqq) { +- bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); ++ bfq_log_bfqq(bfqd, bfqq, "checking new queue"); + goto check_queue; + } + keep_queue: + if (bfqq) +- bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); ++ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); + else +- bfq_log(bfqd, "select_queue: no queue returned"); ++ bfq_log(bfqd, "no queue returned"); + + return bfqq; + } +@@ -4043,8 +4041,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) + /* see comments on max_service_from_wr */ + bfq_bfqq_end_wr(bfqq); + bfq_log_bfqq(bfqd, bfqq, +- "[%s] too much service", +- __func__); ++ "too much service"); + } + } + /* +@@ -4122,7 +4119,7 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) + { + struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + +- bfq_log(bfqd, "has_work, dispatch_non_empty %d busy_queues %d", ++ bfq_log(bfqd, "dispatch_non_empty %d busy_queues %d", + !list_empty_careful(&bfqd->dispatch), bfqd->busy_queues > 0); + + /* +@@ -4146,7 +4143,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + rq->rq_flags &= ~RQF_DISP_LIST; + + bfq_log(bfqd, +- "dispatch requests: picked %p from dispatch list", rq); ++ "picked %p from dispatch list", rq); + bfqq = RQ_BFQQ(rq); + + if (bfqq) { +@@ -4196,7 +4193,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + goto start_rq; + } + +- bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); ++ bfq_log(bfqd, "%d busy queues", bfqd->busy_queues); + + if (bfqd->busy_queues == 0) + goto exit; +@@ -4236,13 +4233,13 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + rq->rq_flags |= RQF_STARTED; + if (bfqq) + bfq_log_bfqq(bfqd, bfqq, +- "dispatched %s request %p, rq_in_driver %d", ++ "%s request %p, rq_in_driver %d", + bfq_bfqq_sync(bfqq) ? 
"sync" : "async", + rq, + bfqd->rq_in_driver); + else + bfq_log(bfqd, +- "dispatched request %p from dispatch list, rq_in_driver %d", ++ "request %p from dispatch list, rq_in_driver %d", + rq, bfqd->rq_in_driver); + } else + bfq_log(bfqd, +@@ -4339,7 +4336,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) + BUG_ON(bfqq->ref <= 0); + + if (bfqq->bfqd) +- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", bfqq, bfqq->ref); + + bfqq->ref--; + if (bfqq->ref) +@@ -4383,10 +4380,10 @@ static void bfq_put_queue(struct bfq_queue *bfqq) + } + + if (bfqq->bfqd) +- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p freed", bfqq); + + #ifdef BFQ_GROUP_IOSCHED_ENABLED +- bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] putting blkg and bfqg %p\n", __func__, bfqg); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "putting blkg and bfqg %p\n", bfqg); + bfqg_and_blkg_put(bfqg); + #endif + kmem_cache_free(bfq_pool, bfqq); +@@ -4418,7 +4415,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) + bfq_schedule_dispatch(bfqd); + } + +- bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "%p, %d", bfqq, bfqq->ref); + + bfq_put_cooperator(bfqq); + +@@ -4502,7 +4499,7 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, + bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); + bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "set_next_ioprio_data: bic_class %d prio %d class %d", ++ "bic_class %d prio %d class %d", + ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); + } + +@@ -4529,7 +4526,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) + bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); + bic_set_bfqq(bic, bfqq, false); + bfq_log_bfqq(bfqd, bfqq, +- "check_ioprio_change: bfqq %p %d", ++ "bfqq %p %d", + bfqq, bfqq->ref); + } + +@@ -4667,14 +4664,14 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + * guarantee that this queue is not freed + * until its group goes away. 
+ */ +- bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", ++ bfq_log_bfqq(bfqd, bfqq, "bfqq not in async: %p, %d", + bfqq, bfqq->ref); + *async_bfqq = bfqq; + } + + out: + bfqq->ref++; /* get a process reference to this queue */ +- bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "at end: %p, %d", bfqq, bfqq->ref); + rcu_read_unlock(); + return bfqq; + } +@@ -4733,7 +4730,7 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, + bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) + has_short_ttime = false; + +- bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", ++ bfq_log_bfqq(bfqd, bfqq, "has_short_ttime %d", + has_short_ttime); + + if (has_short_ttime) +@@ -4759,7 +4756,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bfq_update_io_seektime(bfqd, bfqq, rq); + + bfq_log_bfqq(bfqd, bfqq, +- "rq_enqueued: has_short_ttime=%d (seeky %d)", ++ "has_short_ttime=%d (seeky %d)", + bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); + + bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); +@@ -4818,7 +4815,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) + + assert_spin_locked(&bfqd->lock); + +- bfq_log_bfqq(bfqd, bfqq, "__insert_req: rq %p bfqq %p", rq, bfqq); ++ bfq_log_bfqq(bfqd, bfqq, "rq %p bfqq %p", rq, bfqq); + + /* + * An unplug may trigger a requeue of a request from the device +@@ -4837,9 +4834,9 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) + new_bfqq->allocated++; + bfqq->allocated--; + bfq_log_bfqq(bfqd, bfqq, +- "insert_request: new allocated %d", bfqq->allocated); ++ "new allocated %d", bfqq->allocated); + bfq_log_bfqq(bfqd, new_bfqq, +- "insert_request: new_bfqq new allocated %d", ++ "new_bfqq new allocated %d", + bfqq->allocated); + + new_bfqq->ref++; +@@ -4911,11 +4908,11 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + rq->rq_flags |= RQF_DISP_LIST; + if (bfqq) + bfq_log_bfqq(bfqd, bfqq, +- "insert_request %p in disp: at_head %d", ++ "%p in disp: at_head %d", + rq, at_head); + else + bfq_log(bfqd, +- "insert_request %p in disp: at_head %d", ++ "%p in disp: at_head %d", + rq, at_head); + } else { + BUG_ON(!(rq->rq_flags & RQF_GOT)); +@@ -5033,7 +5030,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) + bfqq->dispatched--; + + bfq_log_bfqq(bfqd, bfqq, +- "completed_requests: new disp %d, new rq_in_driver %d", ++ "new disp %d, new rq_in_driver %d", + bfqq->dispatched, bfqd->rq_in_driver); + + if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { +@@ -5061,7 +5058,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) + delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); + + bfq_log_bfqq(bfqd, bfqq, +- "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", ++ "delta %uus/%luus max_size %u rate %llu/%llu", + delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, + delta_us > 0 ? 
+ (USEC_PER_SEC* +@@ -5129,7 +5126,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) + static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) + { + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "put_request_body: allocated %d", bfqq->allocated); ++ "allocated %d", bfqq->allocated); + BUG_ON(!bfqq->allocated); + bfqq->allocated--; + +@@ -5406,10 +5403,10 @@ static void bfq_prepare_request(struct request *rq, struct bio *bio) + + bfqq->allocated++; + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "get_request: new allocated %d", bfqq->allocated); ++ "new allocated %d", bfqq->allocated); + + bfqq->ref++; +- bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", rq, bfqq, bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "%p: bfqq %p, %d", rq, bfqq, bfqq->ref); + + rq->elv.priv[0] = bic; + rq->elv.priv[1] = bfqq; +@@ -5493,7 +5490,7 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) + idle_slice_timer); + struct bfq_queue *bfqq = bfqd->in_service_queue; + +- bfq_log(bfqd, "slice_timer expired"); ++ bfq_log(bfqd, "expired"); + + /* + * Theoretical race here: the in-service queue can be NULL or +@@ -5515,10 +5512,10 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, + struct bfq_group *root_group = bfqd->root_group; + struct bfq_queue *bfqq = *bfqq_ptr; + +- bfq_log(bfqd, "put_async_bfqq: %p", bfqq); ++ bfq_log(bfqd, "%p", bfqq); + if (bfqq) { + bfq_bfqq_move(bfqd, bfqq, root_group); +- bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", ++ bfq_log_bfqq(bfqd, bfqq, "putting %p, %d", + bfqq, bfqq->ref); + bfq_put_queue(bfqq); + *bfqq_ptr = NULL; +@@ -5547,7 +5544,7 @@ static void bfq_exit_queue(struct elevator_queue *e) + struct bfq_data *bfqd = e->elevator_data; + struct bfq_queue *bfqq, *n; + +- bfq_log(bfqd, "exit_queue: starting ..."); ++ bfq_log(bfqd, "starting ..."); + + hrtimer_cancel(&bfqd->idle_slice_timer); + +@@ -5575,7 +5572,7 @@ static void bfq_exit_queue(struct elevator_queue *e) + spin_unlock_irq(&bfqd->lock); + #endif + +- bfq_log(bfqd, "exit_queue: finished ..."); ++ bfq_log(bfqd, "finished ..."); + kfree(bfqd); + } + +diff --git a/block/bfq-mq.h b/block/bfq-mq.h +index 9a5ce1168ff5..e2ae11bf8f76 100644 +--- a/block/bfq-mq.h ++++ b/block/bfq-mq.h +@@ -712,34 +712,34 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); + static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + + #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ +- pr_crit("%s bfq%d%c %s " fmt "\n", \ ++ pr_crit("%s bfq%d%c %s [%s] " fmt "\n", \ + checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ + (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ +- bfqq_group(bfqq)->blkg_path, ##args); \ ++ bfqq_group(bfqq)->blkg_path, __func__, ##args); \ + } while (0) + + #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ +- pr_crit("%s %s " fmt "\n", \ ++ pr_crit("%s %s [%s] " fmt "\n", \ + checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ +- bfqg->blkg_path, ##args); \ ++ bfqg->blkg_path, __func__, ##args); \ + } while (0) + + #else /* BFQ_GROUP_IOSCHED_ENABLED */ + + #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ +- pr_crit("%s bfq%d%c " fmt "\n", \ ++ pr_crit("%s bfq%d%c [%s] " fmt "\n", \ + checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ + (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ +- ##args) ++ __func__, ##args) + #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) + + #endif /* BFQ_GROUP_IOSCHED_ENABLED */ + + #define bfq_log(bfqd, fmt, args...) 
\ +- pr_crit("%s bfq " fmt "\n", \ ++ pr_crit("%s bfq [%s] " fmt "\n", \ + checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ +- ##args) ++ __func__, ##args) + + #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ + +@@ -762,28 +762,29 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); + static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + + #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ +- blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s [%s] " fmt, \ + (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ +- bfqq_group(bfqq)->blkg_path, ##args); \ ++ bfqq_group(bfqq)->blkg_path, __func__, ##args); \ + } while (0) + + #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ +- blk_add_trace_msg((bfqd)->queue, "%s " fmt, bfqg->blkg_path, ##args);\ ++ blk_add_trace_msg((bfqd)->queue, "%s [%s] " fmt, bfqg->blkg_path, \ ++ __func__, ##args);\ + } while (0) + + #else /* BFQ_GROUP_IOSCHED_ENABLED */ + + #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ +- blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d%c [%s] " fmt, (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ +- ##args) ++ __func__, ##args) + #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) + + #endif /* BFQ_GROUP_IOSCHED_ENABLED */ + + #define bfq_log(bfqd, fmt, args...) \ +- blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) ++ blk_add_trace_msg((bfqd)->queue, "bfq [%s] " fmt, __func__, ##args) + + #endif /* CONFIG_BLK_DEV_IO_TRACE */ + #endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ +@@ -938,7 +939,7 @@ bfq_entity_service_tree(struct bfq_entity *entity) + + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "entity_service_tree %p %d", ++ "%p %d", + sched_data->service_tree + idx, idx); + #ifdef BFQ_GROUP_IOSCHED_ENABLED + else { +@@ -946,7 +947,7 @@ bfq_entity_service_tree(struct bfq_entity *entity) + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, +- "entity_service_tree %p %d", ++ "%p %d", + sched_data->service_tree + idx, idx); + } + #endif +diff --git a/block/bfq-sched.c b/block/bfq-sched.c +index 4e6c5232e2fb..ead34c30a7c2 100644 +--- a/block/bfq-sched.c ++++ b/block/bfq-sched.c +@@ -119,7 +119,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, + + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "update_next_in_service: chose without lookup"); ++ "chose without lookup"); + #ifdef BFQ_GROUP_IOSCHED_ENABLED + else { + struct bfq_group *bfqg = +@@ -127,7 +127,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, + struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data*)bfqg->bfqd, bfqg, +- "update_next_in_service: chose without lookup"); ++ "chose without lookup"); + } + #endif + } +@@ -148,7 +148,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, + bfqq = bfq_entity_to_bfqq(next_in_service); + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "update_next_in_service: chosen this queue"); ++ "chosen this queue"); + #ifdef BFQ_GROUP_IOSCHED_ENABLED + else { + struct bfq_group *bfqg = +@@ -156,7 +156,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, + struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, +- "update_next_in_service: chosen this entity"); ++ "chosen this entity"); + } + #endif + return parent_sched_may_change; +@@ -331,10 +331,10 @@ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) + + if 
(bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "calc_finish: serv %lu, w %d", ++ "serv %lu, w %d", + service, entity->weight); + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "calc_finish: start %llu, finish %llu, delta %llu", ++ "start %llu, finish %llu, delta %llu", + start, finish, delta); + #ifdef BFQ_GROUP_IOSCHED_ENABLED + } else { +@@ -342,10 +342,10 @@ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, +- "calc_finish group: serv %lu, w %d", ++ "group: serv %lu, w %d", + service, entity->weight); + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, +- "calc_finish group: start %llu, finish %llu, delta %llu", ++ "group: start %llu, finish %llu, delta %llu", + start, finish, delta); + #endif + } +@@ -484,7 +484,7 @@ static void bfq_update_active_node(struct rb_node *node) + + if (bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "update_active_node: new min_start %llu", ++ "new min_start %llu", + ((entity->min_start>>10)*1000)>>12); + #ifdef BFQ_GROUP_IOSCHED_ENABLED + } else { +@@ -492,7 +492,7 @@ static void bfq_update_active_node(struct rb_node *node) + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, +- "update_active_node: new min_start %llu", ++ "new min_start %llu", + ((entity->min_start>>10)*1000)>>12); + #endif + } +@@ -620,7 +620,7 @@ static void bfq_get_entity(struct bfq_entity *entity) + + if (bfqq) { + bfqq->ref++; +- bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", + bfqq, bfqq->ref); + } + } +@@ -748,7 +748,7 @@ static void bfq_forget_entity(struct bfq_service_tree *st, + entity->on_st = false; + st->wsum -= entity->weight; + if (bfqq && !is_in_service) { +- bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity (before): %p %d", ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "(before): %p %d", + bfqq, bfqq->ref); + bfq_put_queue(bfqq); + } +@@ -1008,7 +1008,7 @@ static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, + tot_serv_to_charge = entity->service; + + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "charge_time: %lu/%u ms, %d/%d/%d sectors", ++ "%lu/%u ms, %d/%d/%d sectors", + time_ms, timeout_ms, entity->service, + tot_serv_to_charge, entity->budget); + +@@ -1080,7 +1080,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, + + if (bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "update_fin_time_enqueue: new queue finish %llu", ++ "new queue finish %llu", + ((entity->finish>>10)*1000)>>12); + #ifdef BFQ_GROUP_IOSCHED_ENABLED + } else { +@@ -1088,7 +1088,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, +- "update_fin_time_enqueue: new group finish %llu", ++ "new group finish %llu", + ((entity->finish>>10)*1000)>>12); + #endif + } +@@ -1098,7 +1098,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, + + if (bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "update_fin_time_enqueue: queue %seligible in st %p", ++ "queue %seligible in st %p", + entity->start <= st->vtime ? 
"" : "non ", st); + #ifdef BFQ_GROUP_IOSCHED_ENABLED + } else { +@@ -1106,7 +1106,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, +- "update_fin_time_enqueue: group %seligible in st %p", ++ "group %seligible in st %p", + entity->start <= st->vtime ? "" : "non ", st); + #endif + } +@@ -1550,7 +1550,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) + + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "calc_vtime_jump: new value %llu", ++ "new value %llu", + ((root_entity->min_start>>10)*1000)>>12); + #ifdef BFQ_GROUP_IOSCHED_ENABLED + else { +@@ -1559,7 +1559,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) + entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, +- "calc_vtime_jump: new value %llu", ++ "new value %llu", + ((root_entity->min_start>>10)*1000)>>12); + } + #endif +@@ -1677,7 +1677,7 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service) + bfqq = bfq_entity_to_bfqq(entity); + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "__lookup_next: start %llu vtime %llu st %p", ++ "start %llu vtime %llu st %p", + ((entity->start>>10)*1000)>>12, + ((new_vtime>>10)*1000)>>12, st); + #ifdef BFQ_GROUP_IOSCHED_ENABLED +@@ -1686,7 +1686,7 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service) + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, +- "__lookup_next: start %llu vtime %llu (%llu) st %p", ++ "start %llu vtime %llu (%llu) st %p", + ((entity->start>>10)*1000)>>12, + ((st->vtime>>10)*1000)>>12, + ((new_vtime>>10)*1000)>>12, st); +@@ -1821,14 +1821,14 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg(bfqd, bfqg, +- "get_next_queue: lookup in this group"); ++ "lookup in this group"); + if (!sd->next_in_service) +- pr_crit("get_next_queue: lookup in this group"); ++ pr_crit("lookup in this group"); + } else { + bfq_log_bfqg(bfqd, bfqd->root_group, +- "get_next_queue: lookup in root group"); ++ "lookup in root group"); + if (!sd->next_in_service) +- pr_crit("get_next_queue: lookup in root group"); ++ pr_crit("lookup in root group"); + } + #endif + +@@ -1903,7 +1903,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) + bfqq = bfq_entity_to_bfqq(entity); + if (bfqq) + bfq_log_bfqq(bfqd, bfqq, +- "get_next_queue: this queue, finish %llu", ++ "this queue, finish %llu", + (((entity->finish>>10)*1000)>>10)>>2); + #ifdef BFQ_GROUP_IOSCHED_ENABLED + else { +@@ -1911,7 +1911,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg(bfqd, bfqg, +- "get_next_queue: this entity, finish %llu", ++ "this entity, finish %llu", + (((entity->finish>>10)*1000)>>10)>>2); + } + #endif +diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c +index c4df156b1fb4..e49e8ac882b3 100644 +--- a/block/bfq-sq-iosched.c ++++ b/block/bfq-sq-iosched.c +@@ -281,7 +281,7 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd); + static void bfq_schedule_dispatch(struct bfq_data *bfqd) + { + if (bfqd->queued != 0) { +- bfq_log(bfqd, "schedule dispatch"); ++ bfq_log(bfqd, ""); + kblockd_schedule_work(&bfqd->unplug_work); + } + } +@@ -414,7 +414,7 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, + if (rb_link) + *rb_link = p; + +- bfq_log(bfqd, "rq_pos_tree_lookup 
%llu: returning %d", ++ bfq_log(bfqd, "%llu: returning %d", + (unsigned long long) sector, + bfqq ? bfqq->pid : 0); + +@@ -635,7 +635,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq, + if (rq == last || ktime_get_ns() < rq->fifo_time) + return NULL; + +- bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "returned %p", rq); + BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); + return rq; + } +@@ -728,7 +728,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, + bfq_serv_to_charge(next_rq, bfqq)); + if (entity->budget != new_budget) { + entity->budget = new_budget; +- bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", ++ bfq_log_bfqq(bfqd, bfqq, "new budget %lu", + new_budget); + bfq_requeue_bfqq(bfqd, bfqq, false); + } +@@ -800,8 +800,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); + + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", +- __func__, ++ "bic %p wr_coeff %d start_finish %lu max_time %lu", + bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, + bfqq->wr_cur_max_time); + +@@ -814,11 +813,11 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, + bfq_wr_duration(bfqd))) { + switch_back_to_interactive_wr(bfqq, bfqd); + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "resume state: switching back to interactive"); ++ "switching back to interactive"); + } else { + bfqq->wr_coeff = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "resume state: switching off wr (%lu + %lu < %lu)", ++ "switching off wr (%lu + %lu < %lu)", + bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, + jiffies); + } +@@ -870,7 +869,7 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) + /* Increment burst size to take into account also bfqq */ + bfqd->burst_size++; + +- bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); ++ bfq_log_bfqq(bfqd, bfqq, "%d", bfqd->burst_size); + + BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); + +@@ -883,7 +882,7 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) + * other to consider this burst as large. + */ + bfqd->large_burst = true; +- bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); ++ bfq_log_bfqq(bfqd, bfqq, "large burst started"); + + /* + * We can now mark all queues in the burst list as +@@ -1055,7 +1054,7 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) + bfqd->large_burst = false; + bfq_reset_burst_list(bfqd, bfqq); + bfq_log_bfqq(bfqd, bfqq, +- "handle_burst: late activation or different group"); ++ "late activation or different group"); + goto end; + } + +@@ -1065,7 +1064,7 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) + * bfqq as belonging to this large burst immediately. + */ + if (bfqd->large_burst) { +- bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); ++ bfq_log_bfqq(bfqd, bfqq, "marked in burst"); + bfq_mark_bfqq_in_large_burst(bfqq); + goto end; + } +@@ -1572,7 +1571,7 @@ static void bfq_add_request(struct request *rq) + unsigned int old_wr_coeff = bfqq->wr_coeff; + bool interactive = false; + +- bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", ++ bfq_log_bfqq(bfqd, bfqq, "size %u %s", + blk_rq_sectors(rq), rq_is_sync(rq) ? 
"S" : "A"); + + if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ +@@ -1870,10 +1869,10 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) + */ + bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "end_wr: wrais ending at %lu, rais_max_time %u", ++ "wrais ending at %lu, rais_max_time %u", + bfqq->last_wr_start_finish, + jiffies_to_msecs(bfqq->wr_cur_max_time)); +- bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "wr_busy %d", + bfqq->bfqd->wr_busy_queues); + } + +@@ -2048,8 +2047,8 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, + { + if (bfq_too_late_for_merging(new_bfqq)) { + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "[%s] too late for bfq%d to be merged", +- __func__, new_bfqq->pid); ++ "too late for bfq%d to be merged", ++ new_bfqq->pid); + return false; + } + +@@ -2258,7 +2257,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, + + } + +- bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", ++ bfq_log_bfqq(bfqd, new_bfqq, "wr_busy %d", + bfqd->wr_busy_queues); + + /* +@@ -2359,7 +2358,7 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd, + bfqq->budget_timeout = jiffies + + bfqd->bfq_timeout * timeout_coeff; + +- bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", ++ bfq_log_bfqq(bfqd, bfqq, "%u", + jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); + } + +@@ -2427,10 +2426,10 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, + + bfq_set_budget_timeout(bfqd, bfqq); + bfq_log_bfqq(bfqd, bfqq, +- "set_in_service_queue, cur-budget = %d", ++ "cur-budget = %d", + bfqq->entity.budget); + } else +- bfq_log(bfqd, "set_in_service_queue: NULL"); ++ bfq_log(bfqd, "NULL"); + + bfqd->in_service_queue = bfqq; + } +@@ -2559,7 +2558,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq + bfqd->peak_rate_samples = 0; /* full re-init on next disp. 
*/ + + bfq_log(bfqd, +- "reset_rate_computation at end, sample %u/%u tot_sects %llu", ++ "at end, sample %u/%u tot_sects %llu", + bfqd->peak_rate_samples, bfqd->sequential_samples, + bfqd->tot_sectors_dispatched); + } +@@ -2579,7 +2578,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) + if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || + bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { + bfq_log(bfqd, +- "update_rate_reset: only resetting, delta_first %lluus samples %d", ++ "only resetting, delta_first %lluus samples %d", + bfqd->delta_from_first>>10, bfqd->peak_rate_samples); + goto reset_computation; + } +@@ -2603,7 +2602,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) + div_u64(bfqd->delta_from_first, NSEC_PER_USEC)); + + bfq_log(bfqd, +-"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", ++"tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", + bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, + ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), + rate > 20<<BFQ_RATE_SHIFT); + if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && + rate <= bfqd->peak_rate) || + rate > 20<<BFQ_RATE_SHIFT) { + bfq_log(bfqd, +- "update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu", ++ "goto reset, samples %u/%u rate/peak %llu/%llu", + bfqd->peak_rate_samples, bfqd->sequential_samples, + ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); + goto reset_computation; + } else { + bfq_log(bfqd, +- "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu", ++ "do update, samples %u/%u rate/peak %llu/%llu", + bfqd->peak_rate_samples, bfqd->sequential_samples, + ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); +@@ -2681,7 +2680,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) + rate /= divisor; /* smoothing constant alpha = 1/divisor */ + + bfq_log(bfqd, +- "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u", ++ "divisor %d tmp_peak_rate %llu tmp_rate %u", + divisor, + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), + (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); +@@ -2735,7 +2734,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) + + if (bfqd->peak_rate_samples == 0) { /* first dispatch */ + bfq_log(bfqd, +- "update_peak_rate: goto reset, samples %d", ++ "goto reset, samples %d", + bfqd->peak_rate_samples) ; + bfq_reset_rate_computation(bfqd, rq); + goto update_last_values; /* will add one sample */ +@@ -2756,7 +2755,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) + if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && + bfqd->rq_in_driver == 0) { + bfq_log(bfqd, +-"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d", ++"jumping to updating&resetting delta_last %lluus samples %d", + (now_ns - bfqd->last_dispatch)>>10, + bfqd->peak_rate_samples) ; + goto update_rate_and_reset; +@@ -2782,7 +2781,7 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) + bfqd->delta_from_first = now_ns - bfqd->first_dispatch; + + bfq_log(bfqd, +- "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus", ++ "added samples %u/%u tot_sects %llu delta_first %lluus", + bfqd->peak_rate_samples, bfqd->sequential_samples, + bfqd->tot_sectors_dispatched, + bfqd->delta_from_first>>10); +@@ -2798,12 +2797,12 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) + bfqd->last_dispatch = now_ns; + + bfq_log(bfqd, +- "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu", ++ "delta_first %lluus last_pos %llu peak_rate %llu", + (now_ns -
bfqd->first_dispatch)>>10, + (unsigned long long) bfqd->last_position, + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); + bfq_log(bfqd, +- "update_peak_rate: samples at end %d", bfqd->peak_rate_samples); ++ "samples at end %d", bfqd->peak_rate_samples); + } + + /* +@@ -2900,11 +2899,11 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, + */ + budget = 2 * min_budget; + +- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", ++ bfq_log_bfqq(bfqd, bfqq, "last budg %d, budg left %d", + bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); +- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", ++ bfq_log_bfqq(bfqd, bfqq, "last max_budg %d, min budg %d", + budget, bfq_min_budget(bfqd)); +- bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", ++ bfq_log_bfqq(bfqd, bfqq, "sync %d, seeky %d", + bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); + + if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { +@@ -3106,7 +3105,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, + else /* charge at least one seek */ + *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; + +- bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs); ++ bfq_log(bfqd, "too short %u", delta_usecs); + + return slow; + } +@@ -3129,11 +3128,11 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, + * peak rate. + */ + slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; +- bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", ++ bfq_log(bfqd, "relative rate %d/%d", + bfqq->entity.service, bfqd->bfq_max_budget); + } + +- bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); ++ bfq_log_bfqq(bfqd, bfqq, "slow %d", slow); + + return slow; + } +@@ -3235,7 +3234,7 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, + struct bfq_queue *bfqq) + { + bfq_log_bfqq(bfqd, bfqq, +-"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", ++"service_blkg %lu soft_rate %u sects/sec interval %u", + bfqq->service_from_backlogged, + bfqd->bfq_wr_max_softrt_rate, + jiffies_to_msecs(HZ * bfqq->service_from_backlogged / +@@ -3414,7 +3413,7 @@ static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) + static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) + { + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "may_budget_timeout: wait_request %d left %d timeout %d", ++ "wait_request %d left %d timeout %d", + bfq_bfqq_wait_request(bfqq), + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, + bfq_bfqq_budget_timeout(bfqq)); +@@ -3675,11 +3674,11 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) + * either boosts the throughput (without issues), or is + * necessary to preserve service guarantees. 
+ */ +- bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", ++ bfq_log_bfqq(bfqd, bfqq, "sync %d idling_boosts_thr %d", + bfq_bfqq_sync(bfqq), idling_boosts_thr); + + bfq_log_bfqq(bfqd, bfqq, +- "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", ++ "wr_busy %d boosts %d IO-bound %d guar %d", + bfqd->wr_busy_queues, + idling_boosts_thr_without_issues, + bfq_bfqq_IO_bound(bfqq), +@@ -3719,7 +3718,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) + if (!bfqq) + goto new_queue; + +- bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); ++ bfq_log_bfqq(bfqd, bfqq, "already in-service queue"); + + if (bfq_may_expire_for_budg_timeout(bfqq) && + !hrtimer_active(&bfqd->idle_slice_timer) && +@@ -3797,14 +3796,14 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) + new_queue: + bfqq = bfq_set_in_service_queue(bfqd); + if (bfqq) { +- bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); ++ bfq_log_bfqq(bfqd, bfqq, "checking new queue"); + goto check_queue; + } + keep_queue: + if (bfqq) +- bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); ++ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); + else +- bfq_log(bfqd, "select_queue: no queue returned"); ++ bfq_log(bfqd, "no queue returned"); + + return bfqq; + } +@@ -3857,8 +3856,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) + /* see comments on max_service_from_wr */ + bfq_bfqq_end_wr(bfqq); + bfq_log_bfqq(bfqd, bfqq, +- "[%s] too much service", +- __func__); ++ "too much service"); + } + } + /* +@@ -3987,7 +3985,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq; + +- bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); ++ bfq_log(bfqd, "%d busy queues", bfqd->busy_queues); + + if (bfqd->busy_queues == 0) + return 0; +@@ -4021,7 +4019,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) + if (!bfq_dispatch_request(bfqd, bfqq)) + return 0; + +- bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", ++ bfq_log_bfqq(bfqd, bfqq, "%s request", + bfq_bfqq_sync(bfqq) ? 
"sync" : "async"); + + BUG_ON(bfqq->next_rq == NULL && +@@ -4044,7 +4042,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) + + BUG_ON(bfqq->ref <= 0); + +- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", bfqq, bfqq->ref); + bfqq->ref--; + if (bfqq->ref) + return; +@@ -4086,7 +4084,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) + bfqq->bfqd->burst_size--; + } + +- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p freed", bfqq); + + kmem_cache_free(bfq_pool, bfqq); + #ifdef BFQ_GROUP_IOSCHED_ENABLED +@@ -4120,7 +4118,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) + bfq_schedule_dispatch(bfqd); + } + +- bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "%p, %d", bfqq, bfqq->ref); + + bfq_put_cooperator(bfqq); + +@@ -4200,7 +4198,7 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, + bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); + bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "set_next_ioprio_data: bic_class %d prio %d class %d", ++ "bic_class %d prio %d class %d", + ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); + } + +@@ -4227,7 +4225,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) + bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); + bic_set_bfqq(bic, bfqq, false); + bfq_log_bfqq(bfqd, bfqq, +- "check_ioprio_change: bfqq %p %d", ++ "bfqq %p %d", + bfqq, bfqq->ref); + } + +@@ -4362,14 +4360,14 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + * guarantee that this queue is not freed + * until its group goes away. + */ +- bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", ++ bfq_log_bfqq(bfqd, bfqq, "bfqq not in async: %p, %d", + bfqq, bfqq->ref); + *async_bfqq = bfqq; + } + + out: + bfqq->ref++; /* get a process reference to this queue */ +- bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "at end: %p, %d", bfqq, bfqq->ref); + rcu_read_unlock(); + return bfqq; + } +@@ -4428,7 +4426,7 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, + bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) + has_short_ttime = false; + +- bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", ++ bfq_log_bfqq(bfqd, bfqq, "has_short_ttime %d", + has_short_ttime); + + if (has_short_ttime) +@@ -4454,7 +4452,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bfq_update_io_seektime(bfqd, bfqq, rq); + + bfq_log_bfqq(bfqd, bfqq, +- "rq_enqueued: has_short_ttime=%d (seeky %d)", ++ "has_short_ttime=%d (seeky %d)", + bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); + + bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); +@@ -4629,7 +4627,7 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) + */ + delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); + +- bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", ++ bfq_log(bfqd, "delta %uus/%luus max_size %u rate %llu/%llu", + delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, + delta_us > 0 ? 
+ (USEC_PER_SEC* +@@ -4750,7 +4748,7 @@ static void bfq_put_request(struct request *rq) + rq->elv.priv[0] = NULL; + rq->elv.priv[1] = NULL; + +- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p, %d", + bfqq, bfqq->ref); + bfq_put_queue(bfqq); + } +@@ -4816,7 +4814,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, + bic_set_bfqq(bic, bfqq, is_sync); + if (split && is_sync) { + bfq_log_bfqq(bfqd, bfqq, +- "set_request: was_in_list %d " ++ "was_in_list %d " + "was_in_large_burst %d " + "large burst in progress %d", + bic->was_in_burst_list, +@@ -4826,12 +4824,12 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, + if ((bic->was_in_burst_list && bfqd->large_burst) || + bic->saved_in_large_burst) { + bfq_log_bfqq(bfqd, bfqq, +- "set_request: marking in " ++ "marking in " + "large burst"); + bfq_mark_bfqq_in_large_burst(bfqq); + } else { + bfq_log_bfqq(bfqd, bfqq, +- "set_request: clearing in " ++ "clearing in " + "large burst"); + bfq_clear_bfqq_in_large_burst(bfqq); + if (bic->was_in_burst_list) +@@ -4888,7 +4886,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, + + bfqq->allocated[rw]++; + bfqq->ref++; +- bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "bfqq %p, %d", bfqq, bfqq->ref); + + rq->elv.priv[0] = bic; + rq->elv.priv[1] = bfqq; +@@ -4962,7 +4960,7 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) + * case we just expire a queue too early. + */ + if (bfqq) { +- bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); ++ bfq_log_bfqq(bfqd, bfqq, "expired"); + bfq_clear_bfqq_wait_request(bfqq); + + if (bfq_bfqq_budget_timeout(bfqq)) +@@ -5005,10 +5003,10 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, + struct bfq_group *root_group = bfqd->root_group; + struct bfq_queue *bfqq = *bfqq_ptr; + +- bfq_log(bfqd, "put_async_bfqq: %p", bfqq); ++ bfq_log(bfqd, "%p", bfqq); + if (bfqq) { + bfq_bfqq_move(bfqd, bfqq, root_group); +- bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", ++ bfq_log_bfqq(bfqd, bfqq, "putting %p, %d", + bfqq, bfqq->ref); + bfq_put_queue(bfqq); + *bfqq_ptr = NULL; +diff --git a/block/bfq.h b/block/bfq.h +index 0cd7a3f251a7..4d2fe7f77af1 100644 +--- a/block/bfq.h ++++ b/block/bfq.h +@@ -698,37 +698,37 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + \ + assert_spin_locked((bfqd)->queue->queue_lock); \ + blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ +- pr_crit("%s bfq%d%c %s " fmt "\n", \ ++ pr_crit("%s bfq%d%c %s [%s] " fmt "\n", \ + checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ + (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ +- __pbuf, ##args); \ ++ __pbuf, __func__, ##args); \ + } while (0) + + #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ + char __pbuf[128]; \ + \ + blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ +- pr_crit("%s %s " fmt "\n", \ ++ pr_crit("%s %s [%s] " fmt "\n", \ + checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ +- __pbuf, ##args); \ ++ __pbuf, __func__, ##args); \ + } while (0) + + #else /* BFQ_GROUP_IOSCHED_ENABLED */ + + #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ +- pr_crit("%s bfq%d%c " fmt "\n", \ ++ pr_crit("%s bfq%d%c [%s] " fmt "\n", \ + checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ + (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ +- ##args) ++ __func__, ##args) + #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) 
do {} while (0) + + #endif /* BFQ_GROUP_IOSCHED_ENABLED */ + + #define bfq_log(bfqd, fmt, args...) \ +- pr_crit("%s bfq " fmt "\n", \ ++ pr_crit("%s bfq [%s] " fmt "\n", \ + checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ +- ##args) ++ __func__, ##args) + + #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ + +@@ -755,31 +755,32 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + \ + assert_spin_locked((bfqd)->queue->queue_lock); \ + blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ +- blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s [%s] " fmt, \ + (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ +- __pbuf, ##args); \ ++ __pbuf, __func__, ##args); \ + } while (0) + + #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ + char __pbuf[128]; \ + \ + blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ +- blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ ++ blk_add_trace_msg((bfqd)->queue, "%s [%s] " fmt, __pbuf, \ ++ __func__, ##args); \ + } while (0) + + #else /* BFQ_GROUP_IOSCHED_ENABLED */ + + #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ +- blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d%c [%s] " fmt, (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ +- ##args) ++ __func__, ##args) + #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) + + #endif /* BFQ_GROUP_IOSCHED_ENABLED */ + + #define bfq_log(bfqd, fmt, args...) \ +- blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) ++ blk_add_trace_msg((bfqd)->queue, "bfq [%s] " fmt, __func__, ##args) + + #endif /* CONFIG_BLK_DEV_IO_TRACE */ + #endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ +@@ -928,7 +929,7 @@ bfq_entity_service_tree(struct bfq_entity *entity) + + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, +- "entity_service_tree %p %d", ++ "%p %d", + sched_data->service_tree + idx, idx); + #ifdef BFQ_GROUP_IOSCHED_ENABLED + else { +@@ -936,7 +937,7 @@ bfq_entity_service_tree(struct bfq_entity *entity) + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, +- "entity_service_tree %p %d", ++ "%p %d", + sched_data->service_tree + idx, idx); + } + #endif + +From 673a457e8a54d1c4b66e61b1a50956ba0b8c6a60 Mon Sep 17 00:00:00 2001 +From: Davide Paganelli +Date: Thu, 8 Feb 2018 11:49:58 +0100 +Subject: [PATCH 19/23] block, bfq-mq, bfq-sq: make bfq_bfqq_expire print + expiration reason + +Improve readability of the log messages related to the expiration +reasons of the function bfq_bfqq_expire. +Replace the printing of the number that represents the reason for +expiration with an actual textual description of the reason. + +Signed-off-by: Davide Paganelli +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 10 ++++++++-- + block/bfq-sq-iosched.c | 10 ++++++++-- + 2 files changed, 16 insertions(+), 4 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index edc93b6af186..9268dd47a4e5 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -133,6 +133,12 @@ static const int bfq_timeout = (HZ / 8); + */ + static const unsigned long bfq_merge_time_limit = HZ/10; + ++#define MAX_LENGTH_REASON_NAME 25 ++ ++static const char reason_name[][MAX_LENGTH_REASON_NAME] = {"TOO_IDLE", ++"BUDGET_TIMEOUT", "BUDGET_EXHAUSTED", "NO_MORE_REQUESTS", ++"PREEMPTED"}; ++ + static struct kmem_cache *bfq_pool; + + /* Below this threshold (in ns), we consider thinktime immediate.
*/ +@@ -3553,8 +3559,8 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, + } + + bfq_log_bfqq(bfqd, bfqq, +- "expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)", +- reason, slow, bfqq->dispatched, ++ "expire (%s, slow %d, num_disp %d, short_ttime %d, weight %d)", ++ reason_name[reason], slow, bfqq->dispatched, + bfq_bfqq_has_short_ttime(bfqq), entity->weight); + + /* +diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c +index e49e8ac882b3..f95deaab49a1 100644 +--- a/block/bfq-sq-iosched.c ++++ b/block/bfq-sq-iosched.c +@@ -127,6 +127,12 @@ static const int bfq_timeout = (HZ / 8); + */ + static const unsigned long bfq_merge_time_limit = HZ/10; + ++#define MAX_LENGTH_REASON_NAME 25 ++ ++static const char reason_name[][MAX_LENGTH_REASON_NAME] = {"TOO_IDLE", ++"BUDGET_TIMEOUT", "BUDGET_EXHAUSTED", "NO_MORE_REQUESTS", ++"PREEMPTED"}; ++ + static struct kmem_cache *bfq_pool; + + /* Below this threshold (in ns), we consider thinktime immediate. */ +@@ -3366,8 +3372,8 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, + } + + bfq_log_bfqq(bfqd, bfqq, +- "expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)", +- reason, slow, bfqq->dispatched, ++ "expire (%s, slow %d, num_disp %d, short_ttime %d, weight %d)", ++ reason_name[reason], slow, bfqq->dispatched, + bfq_bfqq_has_short_ttime(bfqq), entity->weight); + + /* + +From 62e80623fbb58367c3f667dab22fea0804001f3b Mon Sep 17 00:00:00 2001 +From: Melzani Alessandro +Date: Mon, 26 Feb 2018 22:21:59 +0100 +Subject: [PATCH 20/23] bfq-mq: port of "block, bfq: remove batches of + confusing ifdefs" + +Commit a33801e8b473 ("block, bfq: move debug blkio stats behind +CONFIG_DEBUG_BLK_CGROUP") introduced two batches of confusing ifdefs: +one reported in [1], plus a similar one in another function. This +commit removes both batches, in the way suggested in [1]. + +[1] https://www.spinics.net/lists/linux-block/msg20043.html + +Fixes: a33801e8b473 ("block, bfq: move debug blkio stats behind CONFIG_DEBUG_BLK_CGROUP") + +Signed-off-by: Alessandro Melzani +--- + block/bfq-mq-iosched.c | 128 ++++++++++++++++++++++++++++--------------------- + 1 file changed, 73 insertions(+), 55 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 9268dd47a4e5..5a211620f316 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -4256,35 +4256,17 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + return rq; + } + +-static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) +-{ +- struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; +- struct request *rq; +-#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) +- struct bfq_queue *in_serv_queue, *bfqq; +- bool waiting_rq, idle_timer_disabled; +-#endif + +- spin_lock_irq(&bfqd->lock); +- +-#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) +- in_serv_queue = bfqd->in_service_queue; +- waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); +- +- rq = __bfq_dispatch_request(hctx); +- +- idle_timer_disabled = +- waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); +- +-#else +- rq = __bfq_dispatch_request(hctx); +-#endif +- spin_unlock_irq(&bfqd->lock); ++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) ++static void bfq_update_dispatch_stats(struct request_queue *q, ++ struct request *rq, ++ struct bfq_queue *in_serv_queue, ++ bool idle_timer_disabled) ++{ ++ struct bfq_queue *bfqq = rq ? 
RQ_BFQQ(rq) : NULL; + +-#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) +- bfqq = rq ? RQ_BFQQ(rq) : NULL; + if (!idle_timer_disabled && !bfqq) +- return rq; ++ return; + + /* + * rq and bfqq are guaranteed to exist until this function +@@ -4299,7 +4281,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + * In addition, the following queue lock guarantees that + * bfqq_group(bfqq) exists as well. + */ +- spin_lock_irq(hctx->queue->queue_lock); ++ spin_lock_irq(q->queue_lock); + if (idle_timer_disabled) + /* + * Since the idle timer has been disabled, +@@ -4318,8 +4300,35 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + bfqg_stats_set_start_empty_time(bfqg); + bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); + } +- spin_unlock_irq(hctx->queue->queue_lock); ++ spin_unlock_irq(q->queue_lock); ++} ++#else ++static inline void bfq_update_dispatch_stats(struct request_queue *q, ++ struct request *rq, ++ struct bfq_queue *in_serv_queue, ++ bool idle_timer_disabled) {} + #endif ++static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) ++{ ++ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; ++ struct request *rq; ++ struct bfq_queue *in_serv_queue; ++ bool waiting_rq, idle_timer_disabled; ++ ++ spin_lock_irq(&bfqd->lock); ++ ++ in_serv_queue = bfqd->in_service_queue; ++ waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); ++ ++ rq = __bfq_dispatch_request(hctx); ++ ++ idle_timer_disabled = ++ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); ++ ++ spin_unlock_irq(&bfqd->lock); ++ ++ bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue, ++ idle_timer_disabled); + + return rq; + } +@@ -4881,6 +4890,38 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) + return idle_timer_disabled; + } + ++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) ++static void bfq_update_insert_stats(struct request_queue *q, ++ struct bfq_queue *bfqq, ++ bool idle_timer_disabled, ++ unsigned int cmd_flags) ++{ ++ if (!bfqq) ++ return; ++ ++ /* ++ * bfqq still exists, because it can disappear only after ++ * either it is merged with another queue, or the process it ++ * is associated with exits. But both actions must be taken by ++ * the same process currently executing this flow of ++ * instructions. ++ * ++ * In addition, the following queue lock guarantees that ++ * bfqq_group(bfqq) exists as well. 
++ */ ++ spin_lock_irq(q->queue_lock); ++ bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); ++ if (idle_timer_disabled) ++ bfqg_stats_update_idle_time(bfqq_group(bfqq)); ++ spin_unlock_irq(q->queue_lock); ++} ++#else ++static inline void bfq_update_insert_stats(struct request_queue *q, ++ struct bfq_queue *bfqq, ++ bool idle_timer_disabled, ++ unsigned int cmd_flags) {} ++#endif ++ + static void bfq_prepare_request(struct request *rq, struct bio *bio); + + static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, +@@ -4889,10 +4930,8 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + struct request_queue *q = hctx->queue; + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq = RQ_BFQQ(rq); +-#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) + bool idle_timer_disabled = false; + unsigned int cmd_flags; +-#endif + + spin_lock_irq(&bfqd->lock); + if (blk_mq_sched_try_insert_merge(q, rq)) { +@@ -4938,7 +4977,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + bfqq = RQ_BFQQ(rq); + } + +-#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) + idle_timer_disabled = __bfq_insert_request(bfqd, rq); + /* + * Update bfqq, because, if a queue merge has occurred +@@ -4946,9 +4984,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + * redirected into a new queue. + */ + bfqq = RQ_BFQQ(rq); +-#else +- __bfq_insert_request(bfqd, rq); +-#endif + + if (rq_mergeable(rq)) { + elv_rqhash_add(q, rq); +@@ -4956,34 +4991,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + q->last_merge = rq; + } + } +-#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) ++ + /* + * Cache cmd_flags before releasing scheduler lock, because rq + * may disappear afterwards (for example, because of a request + * merge). + */ + cmd_flags = rq->cmd_flags; +-#endif ++ + spin_unlock_irq(&bfqd->lock); +-#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) +- if (!bfqq) +- return; +- /* +- * bfqq still exists, because it can disappear only after +- * either it is merged with another queue, or the process it +- * is associated with exits. But both actions must be taken by +- * the same process currently executing this flow of +- * instruction. +- * +- * In addition, the following queue lock guarantees that +- * bfqq_group(bfqq) exists as well. +- */ +- spin_lock_irq(q->queue_lock); +- bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); +- if (idle_timer_disabled) +- bfqg_stats_update_idle_time(bfqq_group(bfqq)); +- spin_unlock_irq(q->queue_lock); +-#endif ++ bfq_update_insert_stats(q, bfqq, idle_timer_disabled, ++ cmd_flags); + } + + static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, + +From 0d0d05632872b226f4fae5e56af8736a4c24bf57 Mon Sep 17 00:00:00 2001 +From: Melzani Alessandro +Date: Mon, 26 Feb 2018 22:43:30 +0100 +Subject: [PATCH 21/23] bfq-sq, bfq-mq: port of "bfq: Use icq_to_bic() + consistently" + +Some code uses icq_to_bic() to convert an io_cq pointer to a +bfq_io_cq pointer while other code uses a direct cast. Convert +the code that uses a direct cast such that it uses icq_to_bic(). 
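For context, icq_to_bic() is essentially a container_of() conversion, which is what makes it preferable to the direct cast: it stays correct even if the embedded io_cq ever stops being the first member of bfq_io_cq. A minimal sketch of the idea, simplified from the bfq sources (not part of the patch itself):

	static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
	{
		/*
		 * Recover the enclosing bfq_io_cq from the address of its
		 * embedded 'icq' member. Unlike the direct cast
		 * (struct bfq_io_cq *)icq, container_of() makes no
		 * assumption that 'icq' sits at offset 0 of the structure.
		 */
		return container_of(icq, struct bfq_io_cq, icq);
	}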
+ +Signed-off-by: Alessandro Melzani +--- + block/bfq-mq-iosched.c | 2 +- + block/bfq-sq-iosched.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 5a211620f316..7b1269558c47 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -272,7 +272,7 @@ static const unsigned long max_service_from_wr = 120000; + #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ + { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) + +-#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) ++#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) + #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) + + /** +diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c +index f95deaab49a1..c4aff8d55fc4 100644 +--- a/block/bfq-sq-iosched.c ++++ b/block/bfq-sq-iosched.c +@@ -266,7 +266,7 @@ static const unsigned long max_service_from_wr = 120000; + #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ + { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) + +-#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) ++#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) + #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) + + static void bfq_schedule_dispatch(struct bfq_data *bfqd); + +From 4cb5de6add7d6ad0d25d73cb95dc871305db1522 Mon Sep 17 00:00:00 2001 +From: Melzani Alessandro +Date: Mon, 26 Feb 2018 22:59:30 +0100 +Subject: [PATCH 22/23] bfq-sq, bfq-mq: port of "block, bfq: fix error handle + in bfq_init" + +If elv_register() fails, bfq_pool should be freed. + +Signed-off-by: Alessandro Melzani +--- + block/bfq-mq-iosched.c | 4 +++- + block/bfq-sq-iosched.c | 4 +++- + 2 files changed, 6 insertions(+), 2 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 7b1269558c47..964e88c2ce59 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -6129,7 +6129,7 @@ static int __init bfq_init(void) + + ret = elv_register(&iosched_bfq_mq); + if (ret) +- goto err_pol_unreg; ++ goto slab_kill; + + #ifdef BFQ_GROUP_IOSCHED_ENABLED + strcat(msg, " (with cgroups support)"); +@@ -6138,6 +6138,8 @@ static int __init bfq_init(void) + + return 0; + ++slab_kill: ++ bfq_slab_kill(); + err_pol_unreg: + #ifdef BFQ_GROUP_IOSCHED_ENABLED + blkcg_policy_unregister(&blkcg_policy_bfq); +diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c +index c4aff8d55fc4..7f0cf1f01ffc 100644 +--- a/block/bfq-sq-iosched.c ++++ b/block/bfq-sq-iosched.c +@@ -5590,7 +5590,7 @@ static int __init bfq_init(void) + + ret = elv_register(&iosched_bfq); + if (ret) +- goto err_pol_unreg; ++ goto slab_kill; + + #ifdef BFQ_GROUP_IOSCHED_ENABLED + strcat(msg, " (with cgroups support)"); +@@ -5599,6 +5599,8 @@ static int __init bfq_init(void) + + return 0; + ++slab_kill: ++ bfq_slab_kill(); + err_pol_unreg: + #ifdef BFQ_GROUP_IOSCHED_ENABLED + blkcg_policy_unregister(&blkcg_policy_bfq); + +From 1f77c173aaa87ffb22c9f062a6449245d14311e4 Mon Sep 17 00:00:00 2001 +From: Paolo Valente +Date: Wed, 4 Apr 2018 11:28:16 +0200 +Subject: [PATCH 23/23] block, bfq-sq, bfq-mq: lower-bound the estimated peak + rate to 1 + +If a storage device handled by BFQ happens to be slower than 7.5 KB/s +for a certain amount of time (in the order of a second), then the +estimated peak rate of the device, maintained in BFQ, becomes equal to +0. The reason is the limited precision with which the rate is +represented (details on the range of representable values in the +comments introduced by this commit). This leads to a division-by-zero +error where the estimated peak rate is used as a divisor.
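As a quick, illustrative check of the 7.5 KB/s figure (standalone userspace arithmetic, not part of the patch): with BFQ_RATE_SHIFT = 16, the smallest nonzero stored rate is 2^-16 sectors/usec, about 15.3 sectors/sec, which the commit message rounds to 15 sectors/sec, i.e. the quoted 7.5 KB/s at 512 B per sector. Any device sustained below that truncates to a stored peak rate of 0:

	#include <stdio.h>

	#define BFQ_RATE_SHIFT 16

	int main(void)
	{
		/* 1 fixed-point unit = 2^-16 sectors/usec; convert to sectors/sec */
		double min_rate = 1e6 / (1 << BFQ_RATE_SHIFT);

		/* prints ~15.26 sectors/s and ~7812.5 B/s (~7.5 KB/s) */
		printf("%.2f sectors/s = %.1f B/s\n", min_rate, min_rate * 512.0);
		return 0;
	}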
Such a type of +failure has been reported in [1]. + +This commit addresses this issue by: +1. Lower-bounding the estimated peak rate to 1 +2. Adding and improving comments on the range of rates representable + +[1] https://www.spinics.net/lists/kernel/msg2739205.html + +Signed-off-by: Konstantin Khlebnikov +Signed-off-by: Paolo Valente +--- + block/bfq-mq-iosched.c | 25 ++++++++++++++++++++++++- + block/bfq-mq.h | 7 ++++++- + block/bfq-sq-iosched.c | 25 ++++++++++++++++++++++++- + block/bfq.h | 7 ++++++- + 4 files changed, 60 insertions(+), 4 deletions(-) + +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +index 964e88c2ce59..03efd90c5d20 100644 +--- a/block/bfq-mq-iosched.c ++++ b/block/bfq-mq-iosched.c +@@ -160,7 +160,20 @@ static struct kmem_cache *bfq_pool; + /* Target observation time interval for a peak-rate update (ns) */ + #define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC + +-/* Shift used for peak rate fixed precision calculations. */ ++/* ++ * Shift used for peak-rate fixed precision calculations. ++ * With ++ * - the current shift: 16 positions ++ * - the current type used to store rate: u32 ++ * - the current unit of measure for rate: [sectors/usec], or, more precisely, ++ * [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift, ++ * the range of rates that can be stored is ++ * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec = ++ * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec = ++ * [15, 65G] sectors/sec ++ * Which, assuming a sector size of 512B, corresponds to a range of ++ * [7.5K, 33T] B/sec ++ */ + #define BFQ_RATE_SHIFT 16 + + /* +@@ -2881,6 +2894,16 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) + BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); + bfqd->peak_rate += rate; ++ ++ /* ++ * For a very slow device, bfqd->peak_rate can reach 0 (see ++ * the minimum representable values reported in the comments ++ * on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid ++ * divisions by zero where bfqd->peak_rate is used as a ++ * divisor. ++ */ ++ bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate); ++ + update_thr_responsiveness_params(bfqd); + BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); + BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); + bfqd->peak_rate += rate; ++ ++ /* ++ * For a very slow device, bfqd->peak_rate can reach 0 (see ++ * the minimum representable values reported in the comments ++ * on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid ++ * divisions by zero where bfqd->peak_rate is used as a ++ * divisor. ++ */ ++ bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate); ++ + update_thr_responsiveness_params(bfqd); + BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); + +From: Con Kolivas +Date: Sat, 29 Oct 2016 11:20:37 +1100 +Subject: [PATCH 02/16] Make preemptible kernel default. + +Make full preempt default on all arches.
+--- + arch/arc/configs/tb10x_defconfig | 2 +- + arch/arm/configs/bcm2835_defconfig | 2 +- + arch/arm/configs/imx_v6_v7_defconfig | 2 +- + arch/arm/configs/mps2_defconfig | 2 +- + arch/arm/configs/mxs_defconfig | 2 +- + arch/blackfin/configs/BF518F-EZBRD_defconfig | 2 +- + arch/blackfin/configs/BF526-EZBRD_defconfig | 2 +- + arch/blackfin/configs/BF527-EZKIT-V2_defconfig | 2 +- + arch/blackfin/configs/BF527-EZKIT_defconfig | 2 +- + arch/blackfin/configs/BF527-TLL6527M_defconfig | 2 +- + arch/blackfin/configs/BF533-EZKIT_defconfig | 2 +- + arch/blackfin/configs/BF533-STAMP_defconfig | 2 +- + arch/blackfin/configs/BF537-STAMP_defconfig | 2 +- + arch/blackfin/configs/BF538-EZKIT_defconfig | 2 +- + arch/blackfin/configs/BF548-EZKIT_defconfig | 2 +- + arch/blackfin/configs/BF561-ACVILON_defconfig | 2 +- + arch/blackfin/configs/BF561-EZKIT-SMP_defconfig | 2 +- + arch/blackfin/configs/BF561-EZKIT_defconfig | 2 +- + arch/blackfin/configs/BF609-EZKIT_defconfig | 2 +- + arch/blackfin/configs/BlackStamp_defconfig | 2 +- + arch/blackfin/configs/CM-BF527_defconfig | 2 +- + arch/blackfin/configs/PNAV-10_defconfig | 2 +- + arch/blackfin/configs/SRV1_defconfig | 2 +- + arch/blackfin/configs/TCM-BF518_defconfig | 2 +- + arch/mips/configs/fuloong2e_defconfig | 3 ++- + arch/mips/configs/gpr_defconfig | 3 ++- + arch/mips/configs/ip22_defconfig | 3 ++- + arch/mips/configs/ip28_defconfig | 3 ++- + arch/mips/configs/jazz_defconfig | 3 ++- + arch/mips/configs/mtx1_defconfig | 3 ++- + arch/mips/configs/nlm_xlr_defconfig | 2 +- + arch/mips/configs/pic32mzda_defconfig | 2 +- + arch/mips/configs/pistachio_defconfig | 2 +- + arch/mips/configs/pnx8335_stb225_defconfig | 2 +- + arch/mips/configs/rm200_defconfig | 3 ++- + arch/parisc/configs/712_defconfig | 2 +- + arch/parisc/configs/c3000_defconfig | 2 +- + arch/parisc/configs/default_defconfig | 2 +- + arch/powerpc/configs/c2k_defconfig | 2 +- + arch/powerpc/configs/ppc6xx_defconfig | 2 +- + arch/score/configs/spct6600_defconfig | 2 +- + arch/sh/configs/se7712_defconfig | 2 +- + arch/sh/configs/se7721_defconfig | 2 +- + arch/sh/configs/titan_defconfig | 2 +- + arch/sparc/configs/sparc64_defconfig | 2 +- + arch/tile/configs/tilegx_defconfig | 2 +- + arch/tile/configs/tilepro_defconfig | 2 +- + arch/x86/configs/i386_defconfig | 2 +- + arch/x86/configs/x86_64_defconfig | 2 +- + kernel/Kconfig.preempt | 7 ++++--- + 50 files changed, 60 insertions(+), 52 deletions(-) + +diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig +index f30182549395..42910f628869 100644 +--- a/arch/arc/configs/tb10x_defconfig ++++ b/arch/arc/configs/tb10x_defconfig +@@ -28,7 +28,7 @@ CONFIG_ARC_PLAT_TB10X=y + CONFIG_ARC_CACHE_LINE_SHIFT=5 + CONFIG_HZ=250 + CONFIG_ARC_BUILTIN_DTB_NAME="abilis_tb100_dvk" +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_COMPACTION is not set + CONFIG_NET=y + CONFIG_PACKET=y +diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig +index 43dab4890ad3..44a52166ca5e 100644 +--- a/arch/arm/configs/bcm2835_defconfig ++++ b/arch/arm/configs/bcm2835_defconfig +@@ -29,7 +29,7 @@ CONFIG_MODULE_UNLOAD=y + CONFIG_ARCH_MULTI_V6=y + CONFIG_ARCH_BCM=y + CONFIG_ARCH_BCM2835=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_AEABI=y + CONFIG_KSM=y + CONFIG_CLEANCACHE=y +diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig +index 32acac9ab81a..1482bb312987 100644 +--- a/arch/arm/configs/imx_v6_v7_defconfig ++++ b/arch/arm/configs/imx_v6_v7_defconfig +@@ -47,7 +47,7 @@ 
CONFIG_PCI_MSI=y + CONFIG_PCI_IMX6=y + CONFIG_SMP=y + CONFIG_ARM_PSCI=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_AEABI=y + CONFIG_HIGHMEM=y + CONFIG_CMA=y +diff --git a/arch/arm/configs/mps2_defconfig b/arch/arm/configs/mps2_defconfig +index 0bcdec7cc169..10ceaefa51e0 100644 +--- a/arch/arm/configs/mps2_defconfig ++++ b/arch/arm/configs/mps2_defconfig +@@ -18,7 +18,7 @@ CONFIG_ARCH_MPS2=y + CONFIG_SET_MEM_PARAM=y + CONFIG_DRAM_BASE=0x21000000 + CONFIG_DRAM_SIZE=0x1000000 +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_ATAGS is not set + CONFIG_ZBOOT_ROM_TEXT=0x0 + CONFIG_ZBOOT_ROM_BSS=0x0 +diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig +index e5822ab01b7d..3e77e02f678f 100644 +--- a/arch/arm/configs/mxs_defconfig ++++ b/arch/arm/configs/mxs_defconfig +@@ -27,7 +27,7 @@ CONFIG_BLK_DEV_INTEGRITY=y + # CONFIG_ARCH_MULTI_V7 is not set + CONFIG_ARCH_MXS=y + # CONFIG_ARM_THUMB is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_AEABI=y + CONFIG_NET=y + CONFIG_PACKET=y +diff --git a/arch/blackfin/configs/BF518F-EZBRD_defconfig b/arch/blackfin/configs/BF518F-EZBRD_defconfig +index 99c00d835f47..39b91dfa55b5 100644 +--- a/arch/blackfin/configs/BF518F-EZBRD_defconfig ++++ b/arch/blackfin/configs/BF518F-EZBRD_defconfig +@@ -19,7 +19,7 @@ CONFIG_MODULE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + # CONFIG_IOSCHED_DEADLINE is not set + # CONFIG_IOSCHED_CFQ is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BF518=y + CONFIG_IRQ_TIMER0=12 + # CONFIG_CYCLES_CLOCKSOURCE is not set +diff --git a/arch/blackfin/configs/BF526-EZBRD_defconfig b/arch/blackfin/configs/BF526-EZBRD_defconfig +index e66ba31ef84d..675cadb3a0c4 100644 +--- a/arch/blackfin/configs/BF526-EZBRD_defconfig ++++ b/arch/blackfin/configs/BF526-EZBRD_defconfig +@@ -19,7 +19,7 @@ CONFIG_MODULE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + # CONFIG_IOSCHED_DEADLINE is not set + # CONFIG_IOSCHED_CFQ is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BF526=y + CONFIG_IRQ_TIMER0=12 + CONFIG_BFIN526_EZBRD=y +diff --git a/arch/blackfin/configs/BF527-EZKIT-V2_defconfig b/arch/blackfin/configs/BF527-EZKIT-V2_defconfig +index 0207c588c19f..4c517c443af5 100644 +--- a/arch/blackfin/configs/BF527-EZKIT-V2_defconfig ++++ b/arch/blackfin/configs/BF527-EZKIT-V2_defconfig +@@ -19,7 +19,7 @@ CONFIG_MODULE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + # CONFIG_IOSCHED_DEADLINE is not set + # CONFIG_IOSCHED_CFQ is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BF527=y + CONFIG_BF_REV_0_2=y + CONFIG_BFIN527_EZKIT_V2=y +diff --git a/arch/blackfin/configs/BF527-EZKIT_defconfig b/arch/blackfin/configs/BF527-EZKIT_defconfig +index 99c131ba7d90..bf8df3e6cf02 100644 +--- a/arch/blackfin/configs/BF527-EZKIT_defconfig ++++ b/arch/blackfin/configs/BF527-EZKIT_defconfig +@@ -19,7 +19,7 @@ CONFIG_MODULE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + # CONFIG_IOSCHED_DEADLINE is not set + # CONFIG_IOSCHED_CFQ is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BF527=y + CONFIG_BF_REV_0_1=y + CONFIG_IRQ_USB_INT0=11 +diff --git a/arch/blackfin/configs/BF527-TLL6527M_defconfig b/arch/blackfin/configs/BF527-TLL6527M_defconfig +index cdeb51856f26..0220b3b15c53 100644 +--- a/arch/blackfin/configs/BF527-TLL6527M_defconfig ++++ b/arch/blackfin/configs/BF527-TLL6527M_defconfig +@@ -21,7 +21,7 @@ CONFIG_MODULE_UNLOAD=y + # CONFIG_LBDAF is not set + # CONFIG_BLK_DEV_BSG is not set + # CONFIG_IOSCHED_DEADLINE is not set +-CONFIG_PREEMPT_VOLUNTARY=y 
++CONFIG_PREEMPT=y + CONFIG_BF527=y + CONFIG_BF_REV_0_2=y + CONFIG_BFIN527_TLL6527M=y +diff --git a/arch/blackfin/configs/BF533-EZKIT_defconfig b/arch/blackfin/configs/BF533-EZKIT_defconfig +index ed7d2c096739..6023e3fd2c48 100644 +--- a/arch/blackfin/configs/BF533-EZKIT_defconfig ++++ b/arch/blackfin/configs/BF533-EZKIT_defconfig +@@ -19,7 +19,7 @@ CONFIG_MODULE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + # CONFIG_IOSCHED_DEADLINE is not set + # CONFIG_IOSCHED_CFQ is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BFIN533_EZKIT=y + CONFIG_TIMER0=11 + CONFIG_CLKIN_HZ=27000000 +diff --git a/arch/blackfin/configs/BF533-STAMP_defconfig b/arch/blackfin/configs/BF533-STAMP_defconfig +index 0c241f4d28d7..f5cd0f18b711 100644 +--- a/arch/blackfin/configs/BF533-STAMP_defconfig ++++ b/arch/blackfin/configs/BF533-STAMP_defconfig +@@ -19,7 +19,7 @@ CONFIG_MODULE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + # CONFIG_IOSCHED_DEADLINE is not set + # CONFIG_IOSCHED_CFQ is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_TIMER0=11 + CONFIG_HIGH_RES_TIMERS=y + CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 +diff --git a/arch/blackfin/configs/BF537-STAMP_defconfig b/arch/blackfin/configs/BF537-STAMP_defconfig +index e5360b30e39a..48085fde7f9e 100644 +--- a/arch/blackfin/configs/BF537-STAMP_defconfig ++++ b/arch/blackfin/configs/BF537-STAMP_defconfig +@@ -19,7 +19,7 @@ CONFIG_MODULE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + # CONFIG_IOSCHED_DEADLINE is not set + # CONFIG_IOSCHED_CFQ is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BF537=y + CONFIG_HIGH_RES_TIMERS=y + CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 +diff --git a/arch/blackfin/configs/BF538-EZKIT_defconfig b/arch/blackfin/configs/BF538-EZKIT_defconfig +index 60f6fb86125c..12deeaaef3cb 100644 +--- a/arch/blackfin/configs/BF538-EZKIT_defconfig ++++ b/arch/blackfin/configs/BF538-EZKIT_defconfig +@@ -21,7 +21,7 @@ CONFIG_MODULE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + # CONFIG_IOSCHED_DEADLINE is not set + # CONFIG_IOSCHED_CFQ is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BF538=y + CONFIG_IRQ_TIMER0=12 + CONFIG_IRQ_TIMER1=12 +diff --git a/arch/blackfin/configs/BF548-EZKIT_defconfig b/arch/blackfin/configs/BF548-EZKIT_defconfig +index 38cb17d218d4..6a68ffc55b5a 100644 +--- a/arch/blackfin/configs/BF548-EZKIT_defconfig ++++ b/arch/blackfin/configs/BF548-EZKIT_defconfig +@@ -19,7 +19,7 @@ CONFIG_MODULE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + # CONFIG_IOSCHED_DEADLINE is not set + # CONFIG_IOSCHED_CFQ is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BF548_std=y + CONFIG_IRQ_TIMER0=11 + # CONFIG_CYCLES_CLOCKSOURCE is not set +diff --git a/arch/blackfin/configs/BF561-ACVILON_defconfig b/arch/blackfin/configs/BF561-ACVILON_defconfig +index 78f6bc79f910..e9f3ba783a4e 100644 +--- a/arch/blackfin/configs/BF561-ACVILON_defconfig ++++ b/arch/blackfin/configs/BF561-ACVILON_defconfig +@@ -20,7 +20,7 @@ CONFIG_MODULE_UNLOAD=y + # CONFIG_LBDAF is not set + # CONFIG_BLK_DEV_BSG is not set + # CONFIG_IOSCHED_DEADLINE is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BF561=y + CONFIG_BF_REV_0_5=y + CONFIG_IRQ_TIMER0=10 +diff --git a/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig b/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig +index fac8bb578249..89b75a6c3fab 100644 +--- a/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig ++++ b/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig +@@ -19,7 +19,7 @@ CONFIG_MODULE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + # 
CONFIG_IOSCHED_DEADLINE is not set + # CONFIG_IOSCHED_CFQ is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BF561=y + CONFIG_SMP=y + CONFIG_IRQ_TIMER0=10 +diff --git a/arch/blackfin/configs/BF561-EZKIT_defconfig b/arch/blackfin/configs/BF561-EZKIT_defconfig +index 2a2e4d0cebc1..67b3d2f419ba 100644 +--- a/arch/blackfin/configs/BF561-EZKIT_defconfig ++++ b/arch/blackfin/configs/BF561-EZKIT_defconfig +@@ -19,7 +19,7 @@ CONFIG_MODULE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + # CONFIG_IOSCHED_DEADLINE is not set + # CONFIG_IOSCHED_CFQ is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BF561=y + CONFIG_IRQ_TIMER0=10 + CONFIG_CLKIN_HZ=30000000 +diff --git a/arch/blackfin/configs/BF609-EZKIT_defconfig b/arch/blackfin/configs/BF609-EZKIT_defconfig +index 3ce77f07208a..8cc75d4218fb 100644 +--- a/arch/blackfin/configs/BF609-EZKIT_defconfig ++++ b/arch/blackfin/configs/BF609-EZKIT_defconfig +@@ -20,7 +20,7 @@ CONFIG_MODULE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + # CONFIG_IOSCHED_DEADLINE is not set + # CONFIG_IOSCHED_CFQ is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BF609=y + CONFIG_PINT1_ASSIGN=0x01010000 + CONFIG_PINT2_ASSIGN=0x07000101 +diff --git a/arch/blackfin/configs/BlackStamp_defconfig b/arch/blackfin/configs/BlackStamp_defconfig +index f4a9200e1ab1..9faf0ec7007f 100644 +--- a/arch/blackfin/configs/BlackStamp_defconfig ++++ b/arch/blackfin/configs/BlackStamp_defconfig +@@ -17,7 +17,7 @@ CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + # CONFIG_IOSCHED_DEADLINE is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BF532=y + CONFIG_BF_REV_0_5=y + CONFIG_BLACKSTAMP=y +diff --git a/arch/blackfin/configs/CM-BF527_defconfig b/arch/blackfin/configs/CM-BF527_defconfig +index 1902bb05d086..4a1ad4fd7bb2 100644 +--- a/arch/blackfin/configs/CM-BF527_defconfig ++++ b/arch/blackfin/configs/CM-BF527_defconfig +@@ -19,7 +19,7 @@ CONFIG_MODULES=y + CONFIG_MODULE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + # CONFIG_IOSCHED_DEADLINE is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BF527=y + CONFIG_BF_REV_0_1=y + CONFIG_IRQ_TIMER0=12 +diff --git a/arch/blackfin/configs/PNAV-10_defconfig b/arch/blackfin/configs/PNAV-10_defconfig +index c7926812971c..9d787e28bbe8 100644 +--- a/arch/blackfin/configs/PNAV-10_defconfig ++++ b/arch/blackfin/configs/PNAV-10_defconfig +@@ -15,7 +15,7 @@ CONFIG_MODULE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + # CONFIG_IOSCHED_DEADLINE is not set + # CONFIG_IOSCHED_CFQ is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BF537=y + CONFIG_IRQ_TIMER0=12 + CONFIG_PNAV10=y +diff --git a/arch/blackfin/configs/SRV1_defconfig b/arch/blackfin/configs/SRV1_defconfig +index 23fdc57d657a..225df32dc9a8 100644 +--- a/arch/blackfin/configs/SRV1_defconfig ++++ b/arch/blackfin/configs/SRV1_defconfig +@@ -13,7 +13,7 @@ CONFIG_MMAP_ALLOW_UNINITIALIZED=y + CONFIG_MODULES=y + CONFIG_MODULE_UNLOAD=y + # CONFIG_IOSCHED_DEADLINE is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BF537=y + CONFIG_IRQ_TIMER0=12 + CONFIG_BOOT_LOAD=0x400000 +diff --git a/arch/blackfin/configs/TCM-BF518_defconfig b/arch/blackfin/configs/TCM-BF518_defconfig +index e28959479fe0..425c24e43c34 100644 +--- a/arch/blackfin/configs/TCM-BF518_defconfig ++++ b/arch/blackfin/configs/TCM-BF518_defconfig +@@ -23,7 +23,7 @@ CONFIG_MODULE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + # CONFIG_IOSCHED_DEADLINE is not set + # CONFIG_IOSCHED_CFQ is not set 
+-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BF518=y + CONFIG_BF_REV_0_1=y + CONFIG_BFIN518F_TCM=y +diff --git a/arch/mips/configs/fuloong2e_defconfig b/arch/mips/configs/fuloong2e_defconfig +index 499f51498ecb..f7cb39b0662c 100644 +--- a/arch/mips/configs/fuloong2e_defconfig ++++ b/arch/mips/configs/fuloong2e_defconfig +@@ -2,7 +2,8 @@ CONFIG_MACH_LOONGSON64=y + CONFIG_64BIT=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y + CONFIG_LOCALVERSION="-fuloong2e" + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y +diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig +index 55438fc9991e..db03ef4f737d 100644 +--- a/arch/mips/configs/gpr_defconfig ++++ b/arch/mips/configs/gpr_defconfig +@@ -1,7 +1,8 @@ + CONFIG_MIPS_ALCHEMY=y + CONFIG_MIPS_GPR=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y +diff --git a/arch/mips/configs/ip22_defconfig b/arch/mips/configs/ip22_defconfig +index 83e8fe2064aa..93e7b167433b 100644 +--- a/arch/mips/configs/ip22_defconfig ++++ b/arch/mips/configs/ip22_defconfig +@@ -3,7 +3,8 @@ CONFIG_CPU_R5000=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y + CONFIG_HZ_1000=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y + CONFIG_SYSVIPC=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/mips/configs/ip28_defconfig b/arch/mips/configs/ip28_defconfig +index d0a4c2cfacf8..6f0600e99c25 100644 +--- a/arch/mips/configs/ip28_defconfig ++++ b/arch/mips/configs/ip28_defconfig +@@ -1,6 +1,7 @@ + CONFIG_SGI_IP28=y + CONFIG_ARC_CONSOLE=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y + CONFIG_SYSVIPC=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/mips/configs/jazz_defconfig b/arch/mips/configs/jazz_defconfig +index 9ad1c94376c8..1d62ce7ff5dc 100644 +--- a/arch/mips/configs/jazz_defconfig ++++ b/arch/mips/configs/jazz_defconfig +@@ -1,6 +1,7 @@ + CONFIG_MACH_JAZZ=y + CONFIG_OLIVETTI_M700=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_BSD_PROCESS_ACCT=y +diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig +index c3d0d0a6e044..aa3426d5f7d7 100644 +--- a/arch/mips/configs/mtx1_defconfig ++++ b/arch/mips/configs/mtx1_defconfig +@@ -1,6 +1,7 @@ + CONFIG_MIPS_ALCHEMY=y + CONFIG_MIPS_MTX1=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y +diff --git a/arch/mips/configs/nlm_xlr_defconfig b/arch/mips/configs/nlm_xlr_defconfig +index 1e18fd7de209..b514e91e5426 100644 +--- a/arch/mips/configs/nlm_xlr_defconfig ++++ b/arch/mips/configs/nlm_xlr_defconfig +@@ -5,7 +5,7 @@ CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 + CONFIG_SMP=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_KEXEC=y + CONFIG_CROSS_COMPILE="" + # CONFIG_LOCALVERSION_AUTO is not set +diff --git a/arch/mips/configs/pic32mzda_defconfig b/arch/mips/configs/pic32mzda_defconfig +index 52192c632ae8..96b087498dab 100644 +--- a/arch/mips/configs/pic32mzda_defconfig ++++ b/arch/mips/configs/pic32mzda_defconfig +@@ -1,7 +1,7 @@ + CONFIG_MACH_PIC32=y + CONFIG_DTB_PIC32_MZDA_SK=y + CONFIG_HZ_100=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_SECCOMP is not set 
+ CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y +diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig +index b22a3cf149b6..cfffca3d37f4 100644 +--- a/arch/mips/configs/pistachio_defconfig ++++ b/arch/mips/configs/pistachio_defconfig +@@ -5,7 +5,7 @@ CONFIG_MIPS_CPS=y + CONFIG_DEFAULT_MMAP_MIN_ADDR=32768 + CONFIG_ZSMALLOC=y + CONFIG_NR_CPUS=4 +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_DEFAULT_HOSTNAME="localhost" + CONFIG_SYSVIPC=y +diff --git a/arch/mips/configs/pnx8335_stb225_defconfig b/arch/mips/configs/pnx8335_stb225_defconfig +index 81b5eb89446c..19f8cea849a1 100644 +--- a/arch/mips/configs/pnx8335_stb225_defconfig ++++ b/arch/mips/configs/pnx8335_stb225_defconfig +@@ -3,7 +3,7 @@ CONFIG_CPU_LITTLE_ENDIAN=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y + CONFIG_HZ_128=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_SECCOMP is not set + # CONFIG_LOCALVERSION_AUTO is not set + # CONFIG_SWAP is not set +diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig +index 99679e514042..2ced507a8ba7 100644 +--- a/arch/mips/configs/rm200_defconfig ++++ b/arch/mips/configs/rm200_defconfig +@@ -2,7 +2,8 @@ CONFIG_SNI_RM=y + CONFIG_CPU_LITTLE_ENDIAN=y + CONFIG_ARC_CONSOLE=y + CONFIG_HZ_1000=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_BSD_PROCESS_ACCT=y +diff --git a/arch/parisc/configs/712_defconfig b/arch/parisc/configs/712_defconfig +index ccc109761f44..a6a5b0b7a9c9 100644 +--- a/arch/parisc/configs/712_defconfig ++++ b/arch/parisc/configs/712_defconfig +@@ -13,7 +13,7 @@ CONFIG_MODULES=y + CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + CONFIG_PA7100LC=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_GSC_LASI=y + # CONFIG_PDC_CHASSIS is not set + CONFIG_BINFMT_MISC=m +diff --git a/arch/parisc/configs/c3000_defconfig b/arch/parisc/configs/c3000_defconfig +index 8d41a73bd71b..b8e0a6662ff9 100644 +--- a/arch/parisc/configs/c3000_defconfig ++++ b/arch/parisc/configs/c3000_defconfig +@@ -13,7 +13,7 @@ CONFIG_MODULES=y + CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + CONFIG_PA8X00=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_GSC is not set + CONFIG_PCI=y + CONFIG_PCI_LBA=y +diff --git a/arch/parisc/configs/default_defconfig b/arch/parisc/configs/default_defconfig +index 52c9050a7c5c..8d86d2e989f4 100644 +--- a/arch/parisc/configs/default_defconfig ++++ b/arch/parisc/configs/default_defconfig +@@ -14,7 +14,7 @@ CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + CONFIG_PA7100LC=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_IOMMU_CCIO=y + CONFIG_GSC_LASI=y + CONFIG_GSC_WAX=y +diff --git a/arch/powerpc/configs/c2k_defconfig b/arch/powerpc/configs/c2k_defconfig +index f1552af9eecc..f8505e6ec7b3 100644 +--- a/arch/powerpc/configs/c2k_defconfig ++++ b/arch/powerpc/configs/c2k_defconfig +@@ -29,7 +29,7 @@ CONFIG_CPU_FREQ_GOV_POWERSAVE=m + CONFIG_CPU_FREQ_GOV_ONDEMAND=m + CONFIG_GEN_RTC=y + CONFIG_HIGHMEM=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BINFMT_MISC=y + CONFIG_PM=y + CONFIG_PCI_MSI=y +diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig +index da0e8d535eb8..c016af41ab4f 100644 +--- a/arch/powerpc/configs/ppc6xx_defconfig ++++ b/arch/powerpc/configs/ppc6xx_defconfig +@@ -74,7 +74,7 @@ CONFIG_QE_GPIO=y + CONFIG_MCU_MPC8349EMITX=y + CONFIG_HIGHMEM=y + 
CONFIG_HZ_1000=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BINFMT_MISC=y + CONFIG_HIBERNATION=y + CONFIG_PM_DEBUG=y +diff --git a/arch/score/configs/spct6600_defconfig b/arch/score/configs/spct6600_defconfig +index b2d8802f43b4..46434ca1fa10 100644 +--- a/arch/score/configs/spct6600_defconfig ++++ b/arch/score/configs/spct6600_defconfig +@@ -1,5 +1,5 @@ + CONFIG_HZ_100=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_EXPERIMENTAL=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y +diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig +index 5a1097641247..eb5fbf554e7f 100644 +--- a/arch/sh/configs/se7712_defconfig ++++ b/arch/sh/configs/se7712_defconfig +@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y + CONFIG_SH_SOLUTION_ENGINE=y + CONFIG_SH_PCLK_FREQ=66666666 + CONFIG_HEARTBEAT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1" + CONFIG_NET=y +diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig +index 9c0ef13bee10..cbaa65c8bf9e 100644 +--- a/arch/sh/configs/se7721_defconfig ++++ b/arch/sh/configs/se7721_defconfig +@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y + CONFIG_SH_7721_SOLUTION_ENGINE=y + CONFIG_SH_PCLK_FREQ=33333333 + CONFIG_HEARTBEAT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda2" + CONFIG_NET=y +diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig +index ceb48e9b70f4..1a69eda6610c 100644 +--- a/arch/sh/configs/titan_defconfig ++++ b/arch/sh/configs/titan_defconfig +@@ -20,7 +20,7 @@ CONFIG_SH_TITAN=y + CONFIG_SH_PCLK_FREQ=30000000 + CONFIG_SH_DMA=y + CONFIG_SH_DMA_API=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC1,38400N81 root=/dev/nfs ip=:::::eth1:autoconf rw" + CONFIG_PCI=y +diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig +index 4d4e1cc6402f..04bea1d28ba7 100644 +--- a/arch/sparc/configs/sparc64_defconfig ++++ b/arch/sparc/configs/sparc64_defconfig +@@ -22,7 +22,7 @@ CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y + CONFIG_NUMA=y + CONFIG_DEFAULT_MMAP_MIN_ADDR=8192 +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_SUN_LDOMS=y + CONFIG_PCI=y + CONFIG_PCI_MSI=y +diff --git a/arch/tile/configs/tilegx_defconfig b/arch/tile/configs/tilegx_defconfig +index 9f94435cc44f..aa78ee6cd5eb 100644 +--- a/arch/tile/configs/tilegx_defconfig ++++ b/arch/tile/configs/tilegx_defconfig +@@ -47,7 +47,7 @@ CONFIG_CFQ_GROUP_IOSCHED=y + CONFIG_NR_CPUS=100 + CONFIG_HZ_100=y + # CONFIG_COMPACTION is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_TILE_PCI_IO=y + CONFIG_PCI_DEBUG=y + # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +diff --git a/arch/tile/configs/tilepro_defconfig b/arch/tile/configs/tilepro_defconfig +index 1c5bd4f8ffca..38005862062c 100644 +--- a/arch/tile/configs/tilepro_defconfig ++++ b/arch/tile/configs/tilepro_defconfig +@@ -44,7 +44,7 @@ CONFIG_KARMA_PARTITION=y + CONFIG_CFQ_GROUP_IOSCHED=y + CONFIG_HZ_100=y + # CONFIG_COMPACTION is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_PCI_DEBUG=y + # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set + CONFIG_BINFMT_MISC=y +diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig +index 0eb9f92f3717..e5890ae917e5 100644 +--- a/arch/x86/configs/i386_defconfig ++++ b/arch/x86/configs/i386_defconfig +@@ 
-41,7 +41,7 @@ CONFIG_SMP=y + CONFIG_X86_GENERIC=y + CONFIG_HPET_TIMER=y + CONFIG_SCHED_SMT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y + CONFIG_X86_MCE=y + CONFIG_X86_REBOOTFIXUPS=y +diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig +index 4a4b16e56d35..7452dcadda74 100644 +--- a/arch/x86/configs/x86_64_defconfig ++++ b/arch/x86/configs/x86_64_defconfig +@@ -40,7 +40,7 @@ CONFIG_SMP=y + CONFIG_CALGARY_IOMMU=y + CONFIG_NR_CPUS=64 + CONFIG_SCHED_SMT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y + CONFIG_X86_MCE=y + CONFIG_MICROCODE=y +diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt +index 3f9c97419f02..1dc79ec7ad09 100644 +--- a/kernel/Kconfig.preempt ++++ b/kernel/Kconfig.preempt +@@ -1,7 +1,7 @@ + + choice + prompt "Preemption Model" +- default PREEMPT_NONE ++ default PREEMPT + + config PREEMPT_NONE + bool "No Forced Preemption (Server)" +@@ -17,7 +17,7 @@ config PREEMPT_NONE + latencies. + + config PREEMPT_VOLUNTARY +- bool "Voluntary Kernel Preemption (Desktop)" ++ bool "Voluntary Kernel Preemption (Nothing)" + help + This option reduces the latency of the kernel by adding more + "explicit preemption points" to the kernel code. These new +@@ -31,7 +31,8 @@ config PREEMPT_VOLUNTARY + applications to run more 'smoothly' even when the system is + under load. + +- Select this if you are building a kernel for a desktop system. ++ Select this for no system in particular (choose Preemptible ++ instead on a desktop if you know what's good for you). + + config PREEMPT + bool "Preemptible Kernel (Low-Latency Desktop)" +-- +2.11.0 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-0003-Expose-vmsplit-for-our-poor-32-bit-users.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-0003-Expose-vmsplit-for-our-poor-32-bit-users.patch new file mode 100644 index 00000000..b7897dbe --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-0003-Expose-vmsplit-for-our-poor-32-bit-users.patch @@ -0,0 +1,48 @@ +From 44fc740a3ff85d378c28a416a076cc7e019d7b8c Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Fri, 12 May 2017 13:07:37 +1000 +Subject: [PATCH 03/16] Expose vmsplit for our poor 32 bit users. + +--- + arch/x86/Kconfig | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index e06a7b4e1dc4..931aba4fc567 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1361,7 +1361,7 @@ config HIGHMEM64G + endchoice + + choice +- prompt "Memory split" if EXPERT ++ prompt "Memory split" + default VMSPLIT_3G + depends on X86_32 + ---help--- +@@ -1381,17 +1381,17 @@ choice + option alone! 
+ + config VMSPLIT_3G +- bool "3G/1G user/kernel split" ++ bool "Default 896MB lowmem (3G/1G user/kernel split)" + config VMSPLIT_3G_OPT + depends on !X86_PAE +- bool "3G/1G user/kernel split (for full 1G low memory)" ++ bool "1GB lowmem (3G/1G user/kernel split)" + config VMSPLIT_2G +- bool "2G/2G user/kernel split" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_2G_OPT + depends on !X86_PAE +- bool "2G/2G user/kernel split (for full 2G low memory)" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_1G +- bool "1G/3G user/kernel split" ++ bool "3GB lowmem (1G/3G user/kernel split)" + endchoice + + config PAGE_OFFSET +-- +2.11.0 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-0004-Create-highres-timeout-variants-of-schedule_timeout-.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-0004-Create-highres-timeout-variants-of-schedule_timeout-.patch new file mode 100644 index 00000000..3c182fbe --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-0004-Create-highres-timeout-variants-of-schedule_timeout-.patch @@ -0,0 +1,153 @@ +From d27b58b0707ac311be5a51594fc6f22ed1d109e5 Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Sat, 12 Aug 2017 11:53:39 +1000 +Subject: [PATCH 04/16] Create highres timeout variants of schedule_timeout + functions. + +--- + include/linux/freezer.h | 1 + + include/linux/sched.h | 31 +++++++++++++++++++-- + kernel/time/hrtimer.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 101 insertions(+), 2 deletions(-) + +diff --git a/include/linux/freezer.h b/include/linux/freezer.h +index 3995df1d068f..f8645e8f2444 100644 +--- a/include/linux/freezer.h ++++ b/include/linux/freezer.h +@@ -297,6 +297,7 @@ static inline void set_freezable(void) {} + #define wait_event_freezekillable_unsafe(wq, condition) \ + wait_event_killable(wq, condition) + ++#define pm_freezing (false) + #endif /* !CONFIG_FREEZER */ + + #endif /* FREEZER_H_INCLUDED */ +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 35dc91a0e2ed..38852ebfa864 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -173,13 +173,40 @@ extern cpumask_var_t cpu_isolated_map; + + extern void scheduler_tick(void); + +-#define MAX_SCHEDULE_TIMEOUT LONG_MAX +- ++#define MAX_SCHEDULE_TIMEOUT LONG_MAX + extern long schedule_timeout(long timeout); + extern long schedule_timeout_interruptible(long timeout); + extern long schedule_timeout_killable(long timeout); + extern long schedule_timeout_uninterruptible(long timeout); + extern long schedule_timeout_idle(long timeout); ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++extern long schedule_msec_hrtimeout(long timeout); ++extern long schedule_min_hrtimeout(void); ++extern long schedule_msec_hrtimeout_interruptible(long timeout); ++extern long schedule_msec_hrtimeout_uninterruptible(long timeout); ++#else ++static inline long schedule_msec_hrtimeout(long timeout) ++{ ++ return schedule_timeout(msecs_to_jiffies(timeout)); ++} ++ ++static inline long schedule_min_hrtimeout(void) ++{ ++ return schedule_timeout(1); ++} ++ ++static inline long schedule_msec_hrtimeout_interruptible(long timeout) ++{ ++ return schedule_timeout_interruptible(msecs_to_jiffies(timeout)); ++} ++ ++static inline long schedule_msec_hrtimeout_uninterruptible(long timeout) ++{ ++ return schedule_timeout_uninterruptible(msecs_to_jiffies(timeout)); ++} ++#endif ++ + asmlinkage void schedule(void); + extern void schedule_preempt_disabled(void); + +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index 
88f75f92ef36..13227cf2814c 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -1787,3 +1787,74 @@ int __sched schedule_hrtimeout(ktime_t *expires, + return schedule_hrtimeout_range(expires, 0, mode); + } + EXPORT_SYMBOL_GPL(schedule_hrtimeout); ++ ++/* ++ * As per schedule_hrtimeout but takes a millisecond value and returns how ++ * many milliseconds are left. ++ */ ++long __sched schedule_msec_hrtimeout(long timeout) ++{ ++ struct hrtimer_sleeper t; ++ int delta, secs, jiffs; ++ ktime_t expires; ++ ++ if (!timeout) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ jiffs = msecs_to_jiffies(timeout); ++ /* ++ * If regular timer resolution is adequate or hrtimer resolution is not ++ * (yet) better than Hz, as would occur during startup, use regular ++ * timers. ++ */ ++ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ) ++ return schedule_timeout(jiffs); ++ ++ secs = timeout / 1000; ++ delta = (timeout % 1000) * NSEC_PER_MSEC; ++ expires = ktime_set(secs, delta); ++ ++ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ hrtimer_set_expires_range_ns(&t.timer, expires, delta); ++ ++ hrtimer_init_sleeper(&t, current); ++ ++ hrtimer_start_expires(&t.timer, HRTIMER_MODE_REL); ++ ++ if (likely(t.task)) ++ schedule(); ++ ++ hrtimer_cancel(&t.timer); ++ destroy_hrtimer_on_stack(&t.timer); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ expires = hrtimer_expires_remaining(&t.timer); ++ timeout = ktime_to_ms(expires); ++ return timeout < 0 ? 0 : timeout; ++} ++ ++EXPORT_SYMBOL(schedule_msec_hrtimeout); ++ ++long __sched schedule_min_hrtimeout(void) ++{ ++ return schedule_msec_hrtimeout(1); ++} ++ ++EXPORT_SYMBOL(schedule_min_hrtimeout); ++ ++long __sched schedule_msec_hrtimeout_interruptible(long timeout) ++{ ++ __set_current_state(TASK_INTERRUPTIBLE); ++ return schedule_msec_hrtimeout(timeout); ++} ++EXPORT_SYMBOL(schedule_msec_hrtimeout_interruptible); ++ ++long __sched schedule_msec_hrtimeout_uninterruptible(long timeout) ++{ ++ __set_current_state(TASK_UNINTERRUPTIBLE); ++ return schedule_msec_hrtimeout(timeout); ++} ++EXPORT_SYMBOL(schedule_msec_hrtimeout_uninterruptible); +-- +2.11.0 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-0005-Special-case-calls-of-schedule_timeout-1-to-use-the-.patch new file mode 100644 index 00000000..3c889719 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-0005-Special-case-calls-of-schedule_timeout-1-to-use-the-.patch @@ -0,0 +1,50 @@ +From 5da7d1778b96c514394334c92de9b3d8d71f4a29 Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Sat, 5 Nov 2016 09:27:36 +1100 +Subject: [PATCH 05/16] Special case calls of schedule_timeout(1) to use the + min hrtimeout of 1ms, working around low Hz resolutions. + +--- + kernel/time/timer.c | 17 +++++++++++++++-- + 1 file changed, 15 insertions(+), 2 deletions(-) + +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index 9c18e16059a3..dd4d1b193286 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1741,6 +1741,19 @@ signed long __sched schedule_timeout(signed long timeout) + + expire = timeout + jiffies; + ++#ifdef CONFIG_HIGH_RES_TIMERS ++ if (timeout == 1 && hrtimer_resolution < NSEC_PER_SEC / HZ) { ++ /* ++ * Special case 1 as being a request for the minimum timeout ++ * and use highres timers to timeout after 1ms to work around ++ * the granularity of low Hz tick timers.
++ */ ++ if (!schedule_min_hrtimeout()) ++ return 0; ++ goto out_timeout; ++ } ++#endif ++ + setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); + __mod_timer(&timer, expire, false); + schedule(); +@@ -1748,10 +1761,10 @@ signed long __sched schedule_timeout(signed long timeout) + + /* Remove the timer from the object tracker */ + destroy_timer_on_stack(&timer); +- ++out_timeout: + timeout = expire - jiffies; + +- out: ++out: + return timeout < 0 ? 0 : timeout; + } + EXPORT_SYMBOL(schedule_timeout); +-- +2.11.0 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-0006-Convert-msleep-to-use-hrtimers-when-active.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-0006-Convert-msleep-to-use-hrtimers-when-active.patch new file mode 100644 index 00000000..2f065652 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-0006-Convert-msleep-to-use-hrtimers-when-active.patch @@ -0,0 +1,54 @@ +From 9df803c28bb8ccb2588c0ccaf857b9e673175fed Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Fri, 4 Nov 2016 09:25:54 +1100 +Subject: [PATCH 06/16] Convert msleep to use hrtimers when active. + +--- + kernel/time/timer.c | 24 ++++++++++++++++++++++-- + 1 file changed, 22 insertions(+), 2 deletions(-) + +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index dd4d1b193286..c68cb9307f64 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1884,7 +1884,19 @@ void __init init_timers(void) + */ + void msleep(unsigned int msecs) + { +- unsigned long timeout = msecs_to_jiffies(msecs) + 1; ++ int jiffs = msecs_to_jiffies(msecs); ++ unsigned long timeout; ++ ++ /* ++ * Use high resolution timers where the resolution of tick based ++ * timers is inadequate. ++ */ ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ) { ++ while (msecs) ++ msecs = schedule_msec_hrtimeout_uninterruptible(msecs); ++ return; ++ } ++ timeout = msecs_to_jiffies(msecs) + 1; + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +@@ -1898,7 +1910,15 @@ EXPORT_SYMBOL(msleep); + */ + unsigned long msleep_interruptible(unsigned int msecs) + { +- unsigned long timeout = msecs_to_jiffies(msecs) + 1; ++ int jiffs = msecs_to_jiffies(msecs); ++ unsigned long timeout; ++ ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ) { ++ while (msecs && !signal_pending(current)) ++ msecs = schedule_msec_hrtimeout_interruptible(msecs); ++ return msecs; ++ } ++ timeout = msecs_to_jiffies(msecs) + 1; + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); +-- +2.11.0 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-0007-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-0007-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch new file mode 100644 index 00000000..ff071da8 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-0007-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch @@ -0,0 +1,529 @@ +diff -Nur a/drivers/block/swim.c b/drivers/block/swim.c +--- a/drivers/block/swim.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/block/swim.c 2018-11-03 16:30:39.471807304 +0000 +@@ -332,7 +332,7 @@ + if (swim_readbit(base, MOTOR_ON)) + break; + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + } else if (action == OFF) { + swim_action(base, MOTOR_OFF); +@@ -351,7 +351,7 @@ + if (!swim_readbit(base, DISK_IN)) + break; + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); 
++ schedule_min_hrtimeout(); + } + swim_select(base, RELAX); + } +@@ -375,7 +375,7 @@ + for (wait = 0; wait < HZ; wait++) { + + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + + swim_select(base, RELAX); + if (!swim_readbit(base, STEP)) +diff -Nur a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c +--- a/drivers/bluetooth/hci_qca.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/bluetooth/hci_qca.c 2018-11-03 16:31:56.065260061 +0000 +@@ -880,7 +880,7 @@ + * then host can communicate with new baudrate to controller + */ + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(BAUDRATE_SETTLE_TIMEOUT_MS)); ++ schedule_msec_hrtimeout((BAUDRATE_SETTLE_TIMEOUT_MS)); + set_current_state(TASK_RUNNING); + + return 0; +diff -Nur a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c +--- a/drivers/char/ipmi/ipmi_msghandler.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/char/ipmi/ipmi_msghandler.c 2018-11-03 16:30:39.473807368 +0000 +@@ -2953,7 +2953,7 @@ + /* Current message first, to preserve order */ + while (intf->curr_msg && !list_empty(&intf->waiting_rcv_msgs)) { + /* Wait for the message to clear out. */ +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + + /* No need for locks, the interface is down. */ +diff -Nur a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c +--- a/drivers/char/ipmi/ipmi_ssif.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/char/ipmi/ipmi_ssif.c 2018-11-03 16:30:39.473807368 +0000 +@@ -1200,7 +1200,7 @@ + + /* make sure the driver is not looking for flags any more. */ + while (ssif_info->ssif_state != SSIF_NORMAL) +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + + ssif_info->stopping = true; + del_timer_sync(&ssif_info->retry_timer); +diff -Nur a/drivers/char/snsc.c b/drivers/char/snsc.c +--- a/drivers/char/snsc.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/char/snsc.c 2018-11-03 16:30:39.474807400 +0000 +@@ -198,7 +198,7 @@ + add_wait_queue(&sd->sd_rq, &wait); + spin_unlock_irqrestore(&sd->sd_rlock, flags); + +- schedule_timeout(msecs_to_jiffies(SCDRV_TIMEOUT)); ++ schedule_msec_hrtimeout((SCDRV_TIMEOUT)); + + remove_wait_queue(&sd->sd_rq, &wait); + if (signal_pending(current)) { +@@ -294,7 +294,7 @@ + add_wait_queue(&sd->sd_wq, &wait); + spin_unlock_irqrestore(&sd->sd_wlock, flags); + +- schedule_timeout(msecs_to_jiffies(SCDRV_TIMEOUT)); ++ schedule_msec_hrtimeout((SCDRV_TIMEOUT)); + + remove_wait_queue(&sd->sd_wq, &wait); + if (signal_pending(current)) { +diff -Nur a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c 2018-11-03 16:30:39.474807400 +0000 +@@ -235,7 +235,7 @@ + DRM_ERROR("SVGA device lockup.\n"); + break; + } +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + if (interruptible && signal_pending(current)) { + ret = -ERESTARTSYS; + break; +diff -Nur a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c 2018-11-03 16:30:39.474807400 +0000 +@@ -202,7 +202,7 @@ + break; + } + if (lazy) +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + else if ((++count & 0x0F) == 0) { + /** + * FIXME: Use schedule_hr_timeout here for +diff -Nur a/drivers/media/pci/ivtv/ivtv-ioctl.c b/drivers/media/pci/ivtv/ivtv-ioctl.c +--- 
a/drivers/media/pci/ivtv/ivtv-ioctl.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/media/pci/ivtv/ivtv-ioctl.c 2018-11-03 16:30:39.475807432 +0000 +@@ -1154,7 +1154,7 @@ + TASK_UNINTERRUPTIBLE); + if ((read_reg(IVTV_REG_DEC_LINE_FIELD) >> 16) < 100) + break; +- schedule_timeout(msecs_to_jiffies(25)); ++ schedule_msec_hrtimeout((25)); + } + finish_wait(&itv->vsync_waitq, &wait); + mutex_lock(&itv->serialize_lock); +diff -Nur a/drivers/media/pci/ivtv/ivtv-streams.c b/drivers/media/pci/ivtv/ivtv-streams.c +--- a/drivers/media/pci/ivtv/ivtv-streams.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/media/pci/ivtv/ivtv-streams.c 2018-11-03 16:30:39.475807432 +0000 +@@ -834,7 +834,7 @@ + while (!test_bit(IVTV_F_I_EOS, &itv->i_flags) && + time_before(jiffies, + then + msecs_to_jiffies(2000))) { +- schedule_timeout(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout((10)); + } + + /* To convert jiffies to ms, we must multiply by 1000 +diff -Nur a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c +--- a/drivers/mfd/ucb1x00-core.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/mfd/ucb1x00-core.c 2018-11-03 16:30:39.476807464 +0000 +@@ -253,7 +253,7 @@ + break; + /* yield to other processes */ + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + + return UCB_ADC_DAT(val); +diff -Nur a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c +--- a/drivers/misc/sgi-xp/xpc_channel.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/misc/sgi-xp/xpc_channel.c 2018-11-03 16:30:39.476807464 +0000 +@@ -837,7 +837,7 @@ + + atomic_inc(&ch->n_on_msg_allocate_wq); + prepare_to_wait(&ch->msg_allocate_wq, &wait, TASK_INTERRUPTIBLE); +- ret = schedule_timeout(1); ++ ret = schedule_min_hrtimeout(); + finish_wait(&ch->msg_allocate_wq, &wait); + atomic_dec(&ch->n_on_msg_allocate_wq); + +diff -Nur a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c +--- a/drivers/net/caif/caif_hsi.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/net/caif/caif_hsi.c 2018-11-03 16:30:39.477807497 +0000 +@@ -940,7 +940,7 @@ + break; + + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + retry--; + } + +diff -Nur a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c +--- a/drivers/net/can/usb/peak_usb/pcan_usb.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/net/can/usb/peak_usb/pcan_usb.c 2018-11-03 16:30:39.477807497 +0000 +@@ -250,7 +250,7 @@ + } else { + /* the PCAN-USB needs time to init */ + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(PCAN_USB_STARTUP_TIMEOUT)); ++ schedule_msec_hrtimeout((PCAN_USB_STARTUP_TIMEOUT)); + } + + return err; +diff -Nur a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c +--- a/drivers/net/usb/lan78xx.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/net/usb/lan78xx.c 2018-11-03 16:30:39.478807529 +0000 +@@ -2567,7 +2567,7 @@ + while (!skb_queue_empty(&dev->rxq) && + !skb_queue_empty(&dev->txq) && + !skb_queue_empty(&dev->done)) { +- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); ++ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); + set_current_state(TASK_UNINTERRUPTIBLE); + netif_dbg(dev, ifdown, dev->net, + "waited for %d urb completions\n", temp); +diff -Nur a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c +--- a/drivers/net/usb/usbnet.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/net/usb/usbnet.c 2018-11-03 16:30:39.479807561 +0000 +@@ -772,7 +772,7 @@ + spin_lock_irqsave(&q->lock, 
flags); + while (!skb_queue_empty(q)) { + spin_unlock_irqrestore(&q->lock, flags); +- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); ++ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); + set_current_state(TASK_UNINTERRUPTIBLE); + spin_lock_irqsave(&q->lock, flags); + } +diff -Nur a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c +--- a/drivers/ntb/test/ntb_perf.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/ntb/test/ntb_perf.c 2018-11-03 16:30:39.479807561 +0000 +@@ -310,7 +310,7 @@ + if (unlikely((jiffies - last_sleep) > 5 * HZ)) { + last_sleep = jiffies; + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + + if (unlikely(kthread_should_stop())) +diff -Nur a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c +--- a/drivers/scsi/fnic/fnic_scsi.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/scsi/fnic/fnic_scsi.c 2018-11-03 16:30:39.480807592 +0000 +@@ -217,7 +217,7 @@ + + /* wait for io cmpl */ + while (atomic_read(&fnic->in_flight)) +- schedule_timeout(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout((1)); + + spin_lock_irqsave(&fnic->wq_copy_lock[0], flags); + +@@ -2255,7 +2255,7 @@ + } + } + +- schedule_timeout(msecs_to_jiffies(2 * fnic->config.ed_tov)); ++ schedule_msec_hrtimeout((2 * fnic->config.ed_tov)); + + /* walk again to check, if IOs are still pending in fw */ + if (fnic_is_abts_pending(fnic, lr_sc)) +diff -Nur a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c +--- a/drivers/scsi/snic/snic_scsi.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/scsi/snic/snic_scsi.c 2018-11-03 16:30:39.481807625 +0000 +@@ -2354,7 +2354,7 @@ + + /* Wait for all the IOs that are entered in Qcmd */ + while (atomic_read(&snic->ios_inflight)) +- schedule_timeout(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout((1)); + + ret = snic_issue_hba_reset(snic, sc); + if (ret) { +diff -Nur a/drivers/staging/comedi/drivers/ni_mio_common.c b/drivers/staging/comedi/drivers/ni_mio_common.c +--- a/drivers/staging/comedi/drivers/ni_mio_common.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/staging/comedi/drivers/ni_mio_common.c 2018-11-03 16:30:39.483807688 +0000 +@@ -4657,7 +4657,7 @@ + if ((status & NI67XX_CAL_STATUS_BUSY) == 0) + break; + set_current_state(TASK_INTERRUPTIBLE); +- if (schedule_timeout(1)) ++ if (schedule_min_hrtimeout()) + return -EIO; + } + if (i == timeout) { +diff -Nur a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c +--- a/drivers/staging/lustre/lnet/lnet/lib-eq.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/staging/lustre/lnet/lnet/lib-eq.c 2018-11-03 16:30:39.483807688 +0000 +@@ -329,7 +329,7 @@ + schedule(); + } else { + now = jiffies; +- schedule_timeout(msecs_to_jiffies(tms)); ++ schedule_msec_hrtimeout((tms)); + tms -= jiffies_to_msecs(jiffies - now); + if (tms < 0) /* no more wait but may have new event */ + tms = 0; +diff -Nur a/drivers/staging/rts5208/rtsx.c b/drivers/staging/rts5208/rtsx.c +--- a/drivers/staging/rts5208/rtsx.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/staging/rts5208/rtsx.c 2018-11-03 16:30:39.483807688 +0000 +@@ -524,7 +524,7 @@ + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(POLLING_INTERVAL)); ++ schedule_msec_hrtimeout((POLLING_INTERVAL)); + + /* lock the device pointers */ + mutex_lock(&dev->dev_mutex); +diff -Nur a/drivers/staging/speakup/speakup_acntpc.c b/drivers/staging/speakup/speakup_acntpc.c +--- a/drivers/staging/speakup/speakup_acntpc.c 2018-10-10 
07:54:28.000000000 +0100 ++++ b/drivers/staging/speakup/speakup_acntpc.c 2018-11-03 16:30:39.484807721 +0000 +@@ -206,7 +206,7 @@ + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout((full_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -234,7 +234,7 @@ + jiffy_delta_val = jiffy_delta->u.n.value; + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff -Nur a/drivers/staging/speakup/speakup_apollo.c b/drivers/staging/speakup/speakup_apollo.c +--- a/drivers/staging/speakup/speakup_apollo.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/staging/speakup/speakup_apollo.c 2018-11-03 16:30:39.484807721 +0000 +@@ -174,7 +174,7 @@ + if (!synth->io_ops->synth_out(synth, ch)) { + synth->io_ops->tiocmset(0, UART_MCR_RTS); + synth->io_ops->tiocmset(UART_MCR_RTS, 0); +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout(full_time_val); + continue; + } + if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { +diff -Nur a/drivers/staging/speakup/speakup_decext.c b/drivers/staging/speakup/speakup_decext.c +--- a/drivers/staging/speakup/speakup_decext.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/staging/speakup/speakup_decext.c 2018-11-03 16:30:39.484807721 +0000 +@@ -185,7 +185,7 @@ + if (ch == '\n') + ch = 0x0D; + if (synth_full() || !synth->io_ops->synth_out(synth, ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + continue; + } + set_current_state(TASK_RUNNING); +diff -Nur a/drivers/staging/speakup/speakup_decpc.c b/drivers/staging/speakup/speakup_decpc.c +--- a/drivers/staging/speakup/speakup_decpc.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/staging/speakup/speakup_decpc.c 2018-11-03 16:30:39.484807721 +0000 +@@ -403,7 +403,7 @@ + if (ch == '\n') + ch = 0x0D; + if (dt_sendchar(ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +diff -Nur a/drivers/staging/speakup/speakup_dectlk.c b/drivers/staging/speakup/speakup_dectlk.c +--- a/drivers/staging/speakup/speakup_dectlk.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/staging/speakup/speakup_dectlk.c 2018-11-03 16:30:39.485807753 +0000 +@@ -253,7 +253,7 @@ + if (ch == '\n') + ch = 0x0D; + if (synth_full_val || !synth->io_ops->synth_out(synth, ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + continue; + } + set_current_state(TASK_RUNNING); +diff -Nur a/drivers/staging/speakup/speakup_dtlk.c b/drivers/staging/speakup/speakup_dtlk.c +--- a/drivers/staging/speakup/speakup_dtlk.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/staging/speakup/speakup_dtlk.c 2018-11-03 16:30:39.485807753 +0000 +@@ -220,7 +220,7 @@ + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -236,7 +236,7 @@ + delay_time_val = delay_time->u.n.value; + jiffy_delta_val = jiffy_delta->u.n.value; + 
spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff -Nur a/drivers/staging/speakup/speakup_keypc.c b/drivers/staging/speakup/speakup_keypc.c +--- a/drivers/staging/speakup/speakup_keypc.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/staging/speakup/speakup_keypc.c 2018-11-03 16:30:39.485807753 +0000 +@@ -208,7 +208,7 @@ + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout((full_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -241,7 +241,7 @@ + jiffy_delta_val = jiffy_delta->u.n.value; + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + jiff_max = jiffies+jiffy_delta_val; + } + } +diff -Nur a/drivers/staging/speakup/synth.c b/drivers/staging/speakup/synth.c +--- a/drivers/staging/speakup/synth.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/staging/speakup/synth.c 2018-11-03 16:30:39.486807785 +0000 +@@ -92,7 +92,7 @@ + if (ch == '\n') + ch = synth->procspeech; + if (!synth->io_ops->synth_out(synth, ch)) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout(full_time_val); + continue; + } + if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { +diff -Nur a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c +--- a/drivers/staging/unisys/visornic/visornic_main.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/staging/unisys/visornic/visornic_main.c 2018-11-03 16:30:39.486807785 +0000 +@@ -556,7 +556,7 @@ + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- wait += schedule_timeout(msecs_to_jiffies(10)); ++ wait += schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + } + +@@ -567,7 +567,7 @@ + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- schedule_timeout(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + if (atomic_read(&devdata->usage)) + break; +@@ -721,7 +721,7 @@ + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- wait += schedule_timeout(msecs_to_jiffies(10)); ++ wait += schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + } + +diff -Nur a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c +--- a/drivers/target/target_core_user.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/target/target_core_user.c 2018-11-03 16:30:39.487807817 +0000 +@@ -808,10 +808,9 @@ + pr_debug("sleeping for ring space\n"); + mutex_unlock(&udev->cmdr_lock); + if (udev->cmd_time_out) +- ret = schedule_timeout( +- msecs_to_jiffies(udev->cmd_time_out)); ++ ret = schedule_msec_hrtimeout(udev->cmd_time_out); + else +- ret = schedule_timeout(msecs_to_jiffies(TCMU_TIME_OUT)); ++ ret = schedule_msec_hrtimeout(TCMU_TIME_OUT); + finish_wait(&udev->wait_cmdr, &__wait); + if (!ret) { + pr_warn("tcmu: command timed out\n"); +diff -Nur a/drivers/video/fbdev/omap/hwa742.c b/drivers/video/fbdev/omap/hwa742.c +--- a/drivers/video/fbdev/omap/hwa742.c 2018-10-10 07:54:28.000000000 
+0100 ++++ b/drivers/video/fbdev/omap/hwa742.c 2018-11-03 16:30:39.487807817 +0000 +@@ -926,7 +926,7 @@ + if (hwa742_read_reg(HWA742_PLL_DIV_REG) & (1 << 7)) + break; + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(5)); ++ schedule_msec_hrtimeout((5)); + } + hwa742_set_update_mode(hwa742.update_mode_before_suspend); + } +diff -Nur a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c +--- a/drivers/video/fbdev/pxafb.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/drivers/video/fbdev/pxafb.c 2018-11-03 16:30:39.488807849 +0000 +@@ -1286,7 +1286,7 @@ + mutex_unlock(&fbi->ctrlr_lock); + + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(30)); ++ schedule_msec_hrtimeout((30)); + } + + pr_debug("%s(): task ending\n", __func__); +diff -Nur a/fs/afs/vlocation.c b/fs/afs/vlocation.c +--- a/fs/afs/vlocation.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/fs/afs/vlocation.c 2018-11-03 16:30:39.488807849 +0000 +@@ -129,7 +129,7 @@ + if (vl->upd_busy_cnt > 1) { + /* second+ BUSY - sleep a little bit */ + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + continue; + } +diff -Nur a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +--- a/fs/btrfs/extent-tree.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/fs/btrfs/extent-tree.c 2018-11-03 16:30:39.491807945 +0000 +@@ -6106,7 +6106,7 @@ + + if (flush != BTRFS_RESERVE_NO_FLUSH && + btrfs_transaction_in_commit(fs_info)) +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + + if (delalloc_lock) + mutex_lock(&inode->delalloc_mutex); +diff -Nur a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c +--- a/fs/btrfs/inode-map.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/fs/btrfs/inode-map.c 2018-11-03 16:30:39.492807977 +0000 +@@ -89,7 +89,7 @@ + btrfs_release_path(path); + root->ino_cache_progress = last; + up_read(&fs_info->commit_root_sem); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + goto again; + } else + continue; +diff -Nur a/sound/usb/line6/pcm.c b/sound/usb/line6/pcm.c +--- a/sound/usb/line6/pcm.c 2018-10-10 07:54:28.000000000 +0100 ++++ b/sound/usb/line6/pcm.c 2018-11-03 16:30:39.492807977 +0000 +@@ -131,7 +131,7 @@ + if (!alive) + break; + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } while (--timeout > 0); + if (alive) + dev_err(line6pcm->line6->ifcdev, diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-0008-Replace-all-calls-to-schedule_timeout_interruptible-.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-0008-Replace-all-calls-to-schedule_timeout_interruptible-.patch new file mode 100644 index 00000000..f9f274ce --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-0008-Replace-all-calls-to-schedule_timeout_interruptible-.patch @@ -0,0 +1,311 @@ +From 3ef5df78c2f425115b87f0f2f59fd189c0f1bbe3 Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Mon, 20 Feb 2017 13:30:07 +1100 +Subject: [PATCH 08/16] Replace all calls to schedule_timeout_interruptible of + potentially under 50ms to use schedule_msec_hrtimeout_interruptible. 
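Every hunk in this patch has the same mechanical shape: schedule_timeout_interruptible(msecs_to_jiffies(ms)) becomes schedule_msec_hrtimeout_interruptible(ms), which takes milliseconds directly and, per the earlier patches in this series, uses an hrtimer whenever the tick resolution is too coarse. The rounding being fixed can be sketched in plain userspace C; msecs_to_jiffies() below is a simplified stand-in for the kernel helper, assuming Hz=100, for illustration only:

#include <stdio.h>

#define HZ 100 /* the low tick rate this series is concerned with */

/* Simplified model of the kernel's msecs_to_jiffies(): round the
 * requested sleep up to whole ticks. */
static unsigned long msecs_to_jiffies(unsigned int ms)
{
	return (ms * (unsigned long)HZ + 999) / 1000;
}

int main(void)
{
	unsigned int ms;

	/*
	 * With tick-based timeouts every sub-50ms sleep is rounded up
	 * to a multiple of 10ms; schedule_msec_hrtimeout_interruptible(ms)
	 * would sleep roughly the requested ms instead.
	 */
	for (ms = 1; ms <= 46; ms += 9)
		printf("%2ums requested -> %lums with tick timers\n",
		       ms, msecs_to_jiffies(ms) * (1000 / HZ));
	return 0;
}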
+ +--- + drivers/hwmon/fam15h_power.c | 2 +- + drivers/iio/light/tsl2563.c | 6 +----- + drivers/media/i2c/msp3400-driver.c | 4 ++-- + drivers/media/pci/ivtv/ivtv-gpio.c | 6 +++--- + drivers/media/radio/radio-mr800.c | 2 +- + drivers/media/radio/radio-tea5777.c | 2 +- + drivers/media/radio/tea575x.c | 2 +- + drivers/parport/ieee1284.c | 2 +- + drivers/parport/ieee1284_ops.c | 2 +- + drivers/platform/x86/intel_ips.c | 8 ++++---- + net/core/pktgen.c | 2 +- + sound/soc/codecs/wm8350.c | 12 ++++++------ + sound/soc/codecs/wm8900.c | 2 +- + sound/soc/codecs/wm9713.c | 4 ++-- + 14 files changed, 26 insertions(+), 30 deletions(-) + +diff --git a/drivers/hwmon/fam15h_power.c b/drivers/hwmon/fam15h_power.c +index 9545a346044f..c24cf1302ec7 100644 +--- a/drivers/hwmon/fam15h_power.c ++++ b/drivers/hwmon/fam15h_power.c +@@ -237,7 +237,7 @@ static ssize_t power1_average_show(struct device *dev, + prev_ptsc[cu] = data->cpu_sw_pwr_ptsc[cu]; + } + +- leftover = schedule_timeout_interruptible(msecs_to_jiffies(data->power_period)); ++ leftover = schedule_msec_hrtimeout_interruptible((data->power_period)); + if (leftover) + return 0; + +diff --git a/drivers/iio/light/tsl2563.c b/drivers/iio/light/tsl2563.c +index 7599693f7fe9..452090739138 100644 +--- a/drivers/iio/light/tsl2563.c ++++ b/drivers/iio/light/tsl2563.c +@@ -282,11 +282,7 @@ static void tsl2563_wait_adc(struct tsl2563_chip *chip) + default: + delay = 402; + } +- /* +- * TODO: Make sure that we wait at least required delay but why we +- * have to extend it one tick more? +- */ +- schedule_timeout_interruptible(msecs_to_jiffies(delay) + 2); ++ schedule_msec_hrtimeout_interruptible(delay + 1); + } + + static int tsl2563_adjust_gainlevel(struct tsl2563_chip *chip, u16 adc) +diff --git a/drivers/media/i2c/msp3400-driver.c b/drivers/media/i2c/msp3400-driver.c +index 3db966db83eb..f0fab7676f72 100644 +--- a/drivers/media/i2c/msp3400-driver.c ++++ b/drivers/media/i2c/msp3400-driver.c +@@ -179,7 +179,7 @@ static int msp_read(struct i2c_client *client, int dev, int addr) + break; + dev_warn(&client->dev, "I/O error #%d (read 0x%02x/0x%02x)\n", err, + dev, addr); +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + if (err == 3) { + dev_warn(&client->dev, "resetting chip, sound will go off.\n"); +@@ -220,7 +220,7 @@ static int msp_write(struct i2c_client *client, int dev, int addr, int val) + break; + dev_warn(&client->dev, "I/O error #%d (write 0x%02x/0x%02x)\n", err, + dev, addr); +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + if (err == 3) { + dev_warn(&client->dev, "resetting chip, sound will go off.\n"); +diff --git a/drivers/media/pci/ivtv/ivtv-gpio.c b/drivers/media/pci/ivtv/ivtv-gpio.c +index f752f3993687..23372af61ebf 100644 +--- a/drivers/media/pci/ivtv/ivtv-gpio.c ++++ b/drivers/media/pci/ivtv/ivtv-gpio.c +@@ -117,7 +117,7 @@ void ivtv_reset_ir_gpio(struct ivtv *itv) + curout = (curout & ~0xF) | 1; + write_reg(curout, IVTV_REG_GPIO_OUT); + /* We could use something else for smaller time */ +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + curout |= 2; + write_reg(curout, IVTV_REG_GPIO_OUT); + curdir &= ~0x80; +@@ -137,11 +137,11 @@ int ivtv_reset_tuner_gpio(void *dev, int component, int cmd, int value) + curout = read_reg(IVTV_REG_GPIO_OUT); + curout &= ~(1 << itv->card->xceive_pin); + write_reg(curout, IVTV_REG_GPIO_OUT); +- 
schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + + curout |= 1 << itv->card->xceive_pin; + write_reg(curout, IVTV_REG_GPIO_OUT); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + return 0; + } + +diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c +index c9f59129af79..cb6f8394a5c2 100644 +--- a/drivers/media/radio/radio-mr800.c ++++ b/drivers/media/radio/radio-mr800.c +@@ -378,7 +378,7 @@ static int vidioc_s_hw_freq_seek(struct file *file, void *priv, + retval = -ENODATA; + break; + } +- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { ++ if (schedule_msec_hrtimeout_interruptible((10))) { + retval = -ERESTARTSYS; + break; + } +diff --git a/drivers/media/radio/radio-tea5777.c b/drivers/media/radio/radio-tea5777.c +index 04ed1a5d1177..d593d28dc286 100644 +--- a/drivers/media/radio/radio-tea5777.c ++++ b/drivers/media/radio/radio-tea5777.c +@@ -245,7 +245,7 @@ static int radio_tea5777_update_read_reg(struct radio_tea5777 *tea, int wait) + } + + if (wait) { +- if (schedule_timeout_interruptible(msecs_to_jiffies(wait))) ++ if (schedule_msec_hrtimeout_interruptible((wait))) + return -ERESTARTSYS; + } + +diff --git a/drivers/media/radio/tea575x.c b/drivers/media/radio/tea575x.c +index 4dc2067bce14..29f4416fb9ae 100644 +--- a/drivers/media/radio/tea575x.c ++++ b/drivers/media/radio/tea575x.c +@@ -416,7 +416,7 @@ int snd_tea575x_s_hw_freq_seek(struct file *file, struct snd_tea575x *tea, + for (;;) { + if (time_after(jiffies, timeout)) + break; +- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { ++ if (schedule_msec_hrtimeout_interruptible((10))) { + /* some signal arrived, stop search */ + tea->val &= ~TEA575X_BIT_SEARCH; + snd_tea575x_set_freq(tea); +diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c +index 74cc6dd982d2..c22c4d5f08d0 100644 +--- a/drivers/parport/ieee1284.c ++++ b/drivers/parport/ieee1284.c +@@ -215,7 +215,7 @@ int parport_wait_peripheral(struct parport *port, + /* parport_wait_event didn't time out, but the + * peripheral wasn't actually ready either. + * Wait for another 10ms. */ +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + } + +diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c +index 5d41dda6da4e..34705f6b423f 100644 +--- a/drivers/parport/ieee1284_ops.c ++++ b/drivers/parport/ieee1284_ops.c +@@ -537,7 +537,7 @@ size_t parport_ieee1284_ecp_read_data (struct parport *port, + /* Yield the port for a while. 
*/ + if (count && dev->port->irq != PARPORT_IRQ_NONE) { + parport_release (dev); +- schedule_timeout_interruptible(msecs_to_jiffies(40)); ++ schedule_msec_hrtimeout_interruptible((40)); + parport_claim_or_block (dev); + } + else +diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c +index 58dcee562d64..b661b7c071bb 100644 +--- a/drivers/platform/x86/intel_ips.c ++++ b/drivers/platform/x86/intel_ips.c +@@ -813,7 +813,7 @@ static int ips_adjust(void *data) + ips_gpu_lower(ips); + + sleep: +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_ADJUST_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_ADJUST_PERIOD)); + } while (!kthread_should_stop()); + + dev_dbg(&ips->dev->dev, "ips-adjust thread stopped\n"); +@@ -992,7 +992,7 @@ static int ips_monitor(void *data) + seqno_timestamp = get_jiffies_64(); + + old_cpu_power = thm_readl(THM_CEC); +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + + /* Collect an initial average */ + for (i = 0; i < IPS_SAMPLE_COUNT; i++) { +@@ -1019,7 +1019,7 @@ static int ips_monitor(void *data) + mchp_samples[i] = mchp; + } + +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + if (kthread_should_stop()) + break; + } +@@ -1046,7 +1046,7 @@ static int ips_monitor(void *data) + * us to reduce the sample frequency if the CPU and GPU are idle. + */ + old_cpu_power = thm_readl(THM_CEC); +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + last_sample_period = IPS_SAMPLE_PERIOD; + + setup_deferrable_timer_on_stack(&timer, monitor_timeout, +diff --git a/net/core/pktgen.c b/net/core/pktgen.c +index 6e1e10ff433a..be5d6f7142e4 100644 +--- a/net/core/pktgen.c ++++ b/net/core/pktgen.c +@@ -1992,7 +1992,7 @@ static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname) + mutex_unlock(&pktgen_thread_lock); + pr_debug("%s: waiting for %s to disappear....\n", + __func__, ifname); +- schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try)); ++ schedule_msec_hrtimeout_interruptible((msec_per_try)); + mutex_lock(&pktgen_thread_lock); + + if (++i >= max_tries) { +diff --git a/sound/soc/codecs/wm8350.c b/sound/soc/codecs/wm8350.c +index 2efc5b41ad0f..3e3248c48c6b 100644 +--- a/sound/soc/codecs/wm8350.c ++++ b/sound/soc/codecs/wm8350.c +@@ -236,10 +236,10 @@ static void wm8350_pga_work(struct work_struct *work) + out2->ramp == WM8350_RAMP_UP) { + /* delay is longer over 0dB as increases are larger */ + if (i >= WM8350_OUTn_0dB) +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (2)); + else +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (1)); + } else + udelay(50); /* doesn't matter if we delay longer */ +@@ -1123,7 +1123,7 @@ static int wm8350_set_bias_level(struct snd_soc_codec *codec, + (platform->dis_out4 << 6)); + + /* wait for discharge */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + cap_discharge_msecs)); + +@@ -1139,7 +1139,7 @@ static int wm8350_set_bias_level(struct snd_soc_codec *codec, + WM8350_VBUFEN); + + /* wait for vmid */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + vmid_charge_msecs)); + +@@ -1190,7 +1190,7 @@ static int 
wm8350_set_bias_level(struct snd_soc_codec *codec, + wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1, pm1); + + /* wait */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + vmid_discharge_msecs)); + +@@ -1208,7 +1208,7 @@ static int wm8350_set_bias_level(struct snd_soc_codec *codec, + pm1 | WM8350_OUTPUT_DRAIN_EN); + + /* wait */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform->drain_msecs)); + + pm1 &= ~WM8350_BIASEN; +diff --git a/sound/soc/codecs/wm8900.c b/sound/soc/codecs/wm8900.c +index c77b49a29311..fc50456e90a9 100644 +--- a/sound/soc/codecs/wm8900.c ++++ b/sound/soc/codecs/wm8900.c +@@ -1112,7 +1112,7 @@ static int wm8900_set_bias_level(struct snd_soc_codec *codec, + /* Need to let things settle before stopping the clock + * to ensure that restart works, see "Stopping the + * master clock" in the datasheet. */ +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + snd_soc_write(codec, WM8900_REG_POWER2, + WM8900_REG_POWER2_SYSCLK_ENA); + break; +diff --git a/sound/soc/codecs/wm9713.c b/sound/soc/codecs/wm9713.c +index 7e4822185feb..0c85a207446a 100644 +--- a/sound/soc/codecs/wm9713.c ++++ b/sound/soc/codecs/wm9713.c +@@ -199,7 +199,7 @@ static int wm9713_voice_shutdown(struct snd_soc_dapm_widget *w, + + /* Gracefully shut down the voice interface. */ + snd_soc_update_bits(codec, AC97_HANDSET_RATE, 0x0f00, 0x0200); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + snd_soc_update_bits(codec, AC97_HANDSET_RATE, 0x0f00, 0x0f00); + snd_soc_update_bits(codec, AC97_EXTENDED_MID, 0x1000, 0x1000); + +@@ -868,7 +868,7 @@ static int wm9713_set_pll(struct snd_soc_codec *codec, + wm9713->pll_in = freq_in; + + /* wait 10ms AC97 link frames for the link to stabilise */ +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + return 0; + } + +-- +2.11.0 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-0009-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-0009-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch new file mode 100644 index 00000000..c910f3df --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-0009-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch @@ -0,0 +1,160 @@ +From 6044370cf4bbc5e05f5d78f5772c1d88e3153603 Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Mon, 20 Feb 2017 13:30:32 +1100 +Subject: [PATCH 09/16] Replace all calls to schedule_timeout_uninterruptible + of potentially under 50ms to use schedule_msec_hrtimeout_uninterruptible + +--- + drivers/media/pci/cx18/cx18-gpio.c | 4 ++-- + drivers/net/wireless/intel/ipw2x00/ipw2100.c | 4 ++-- + drivers/rtc/rtc-wm8350.c | 6 +++--- + drivers/scsi/lpfc/lpfc_scsi.c | 2 +- + sound/pci/maestro3.c | 4 ++-- + sound/soc/codecs/rt5631.c | 4 ++-- + sound/soc/soc-dapm.c | 2 +- + 7 files changed, 13 insertions(+), 13 deletions(-) + +diff --git a/drivers/media/pci/cx18/cx18-gpio.c b/drivers/media/pci/cx18/cx18-gpio.c +index 012859e6dc7b..206bd08265a5 100644 +--- a/drivers/media/pci/cx18/cx18-gpio.c ++++ b/drivers/media/pci/cx18/cx18-gpio.c +@@ -90,11 +90,11 @@ static void gpio_reset_seq(struct cx18 *cx, u32 active_lo, u32 active_hi, + + /* Assert */ + gpio_update(cx, mask, ~active_lo); +- schedule_timeout_uninterruptible(msecs_to_jiffies(assert_msecs)); ++ 
schedule_msec_hrtimeout_uninterruptible((assert_msecs)); + + /* Deassert */ + gpio_update(cx, mask, ~active_hi); +- schedule_timeout_uninterruptible(msecs_to_jiffies(recovery_msecs)); ++ schedule_msec_hrtimeout_uninterruptible((recovery_msecs)); + } + + /* +diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c +index 19c442cb93e4..448f41782060 100644 +--- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c ++++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c +@@ -830,7 +830,7 @@ static int ipw2100_hw_send_command(struct ipw2100_priv *priv, + * doesn't seem to have as many firmware restart cycles... + * + * As a test, we're sticking in a 1/100s delay here */ +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + + return 0; + +@@ -1281,7 +1281,7 @@ static int ipw2100_start_adapter(struct ipw2100_priv *priv) + IPW_DEBUG_FW("Waiting for f/w initialization to complete...\n"); + i = 5000; + do { +- schedule_timeout_uninterruptible(msecs_to_jiffies(40)); ++ schedule_msec_hrtimeout_uninterruptible((40)); + /* Todo... wait for sync command ... */ + + read_register(priv->net_dev, IPW_REG_INTA, &inta); +diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c +index 483c7993516b..fddbaa475066 100644 +--- a/drivers/rtc/rtc-wm8350.c ++++ b/drivers/rtc/rtc-wm8350.c +@@ -119,7 +119,7 @@ static int wm8350_rtc_settime(struct device *dev, struct rtc_time *tm) + /* Wait until confirmation of stopping */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (--retries && !(rtc_ctrl & WM8350_RTC_STS)); + + if (!retries) { +@@ -202,7 +202,7 @@ static int wm8350_rtc_stop_alarm(struct wm8350 *wm8350) + /* Wait until confirmation of stopping */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (retries-- && !(rtc_ctrl & WM8350_RTC_ALMSTS)); + + if (!(rtc_ctrl & WM8350_RTC_ALMSTS)) +@@ -225,7 +225,7 @@ static int wm8350_rtc_start_alarm(struct wm8350 *wm8350) + /* Wait until confirmation */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (retries-- && rtc_ctrl & WM8350_RTC_ALMSTS); + + if (rtc_ctrl & WM8350_RTC_ALMSTS) +diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c +index 1a6f122bb25d..c0db66302a3e 100644 +--- a/drivers/scsi/lpfc/lpfc_scsi.c ++++ b/drivers/scsi/lpfc/lpfc_scsi.c +@@ -5131,7 +5131,7 @@ lpfc_reset_flush_io_context(struct lpfc_vport *vport, uint16_t tgt_id, + tgt_id, lun_id, context); + later = msecs_to_jiffies(2 * vport->cfg_devloss_tmo * 1000) + jiffies; + while (time_after(later, jiffies) && cnt) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(20)); ++ schedule_msec_hrtimeout_uninterruptible((20)); + cnt = lpfc_sli_sum_iocb(vport, tgt_id, lun_id, context); + } + if (cnt) { +diff --git a/sound/pci/maestro3.c b/sound/pci/maestro3.c +index 8f20dec97843..944ce63431b0 100644 +--- a/sound/pci/maestro3.c ++++ b/sound/pci/maestro3.c +@@ -2016,7 +2016,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) + outw(0, io + GPIO_DATA); + outw(dir | GPO_PRIMARY_AC97, io + GPIO_DIRECTION); + +- schedule_timeout_uninterruptible(msecs_to_jiffies(delay1)); ++ 
schedule_msec_hrtimeout_uninterruptible((delay1)); + + outw(GPO_PRIMARY_AC97, io + GPIO_DATA); + udelay(5); +@@ -2024,7 +2024,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) + outw(IO_SRAM_ENABLE | SERIAL_AC_LINK_ENABLE, io + RING_BUS_CTRL_A); + outw(~0, io + GPIO_MASK); + +- schedule_timeout_uninterruptible(msecs_to_jiffies(delay2)); ++ schedule_msec_hrtimeout_uninterruptible((delay2)); + + if (! snd_m3_try_read_vendor(chip)) + break; +diff --git a/sound/soc/codecs/rt5631.c b/sound/soc/codecs/rt5631.c +index 55b04c55fb4b..2ed02ad6ac41 100644 +--- a/sound/soc/codecs/rt5631.c ++++ b/sound/soc/codecs/rt5631.c +@@ -419,7 +419,7 @@ static void onebit_depop_mute_stage(struct snd_soc_codec *codec, int enable) + hp_zc = snd_soc_read(codec, RT5631_INT_ST_IRQ_CTRL_2); + snd_soc_write(codec, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); + if (enable) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + /* config one-bit depop parameter */ + rt5631_write_index(codec, RT5631_SPK_INTL_CTRL, 0x307f); + snd_soc_update_bits(codec, RT5631_HP_OUT_VOL, +@@ -529,7 +529,7 @@ static void depop_seq_mute_stage(struct snd_soc_codec *codec, int enable) + hp_zc = snd_soc_read(codec, RT5631_INT_ST_IRQ_CTRL_2); + snd_soc_write(codec, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); + if (enable) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + + /* config depop sequence parameter */ + rt5631_write_index(codec, RT5631_SPK_INTL_CTRL, 0x302f); +diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c +index dcef67a9bd48..11c2bb48c8f2 100644 +--- a/sound/soc/soc-dapm.c ++++ b/sound/soc/soc-dapm.c +@@ -134,7 +134,7 @@ static void dapm_assert_locked(struct snd_soc_dapm_context *dapm) + static void pop_wait(u32 pop_time) + { + if (pop_time) +- schedule_timeout_uninterruptible(msecs_to_jiffies(pop_time)); ++ schedule_msec_hrtimeout_uninterruptible((pop_time)); + } + + static void pop_dbg(struct device *dev, u32 pop_time, const char *fmt, ...) +-- +2.11.0 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-0010-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-0010-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch new file mode 100644 index 00000000..260bb98d --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-0010-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch @@ -0,0 +1,69 @@ +From 071486de633698dcdd163295173ce4663ec9158c Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Mon, 20 Feb 2017 13:32:58 +1100 +Subject: [PATCH 10/16] Don't use hrtimer overlay when pm_freezing since some + drivers still don't correctly use freezable timeouts. + +--- + kernel/time/hrtimer.c | 2 +- + kernel/time/timer.c | 9 +++++---- + 2 files changed, 6 insertions(+), 5 deletions(-) + +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index 13227cf2814c..66456c72bace 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -1809,7 +1809,7 @@ long __sched schedule_msec_hrtimeout(long timeout) + * (yet) better than Hz, as would occur during startup, use regular + * timers. 
+ */ +- if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ) ++ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ || pm_freezing) + return schedule_timeout(jiffs); + + secs = timeout / 1000; +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index c68cb9307f64..2f2c96b03efe 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -44,6 +44,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -1891,12 +1892,12 @@ void msleep(unsigned int msecs) + * Use high resolution timers where the resolution of tick based + * timers is inadequate. + */ +- if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ) { ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { + while (msecs) + msecs = schedule_msec_hrtimeout_uninterruptible(msecs); + return; + } +- timeout = msecs_to_jiffies(msecs) + 1; ++ timeout = jiffs + 1; + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +@@ -1913,12 +1914,12 @@ unsigned long msleep_interruptible(unsigned int msecs) + int jiffs = msecs_to_jiffies(msecs); + unsigned long timeout; + +- if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ) { ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { + while (msecs && !signal_pending(current)) + msecs = schedule_msec_hrtimeout_interruptible(msecs); + return msecs; + } +- timeout = msecs_to_jiffies(msecs) + 1; ++ timeout = jiffs + 1; + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); +-- +2.11.0 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-0011-Make-hrtimer-granularity-and-minimum-hrtimeout-confi.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-0011-Make-hrtimer-granularity-and-minimum-hrtimeout-confi.patch new file mode 100644 index 00000000..5ac20300 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-0011-Make-hrtimer-granularity-and-minimum-hrtimeout-confi.patch @@ -0,0 +1,136 @@ +diff -Nur a/kernel/sysctl.c b/kernel/sysctl.c +--- a/kernel/sysctl.c 2018-11-03 17:03:07.433069521 +0000 ++++ b/kernel/sysctl.c 2018-11-03 17:02:11.020267246 +0000 +@@ -141,7 +141,9 @@ + extern int sched_iso_cpu; + extern int sched_yield_type; + #endif +-#ifdef CONFIG_PRINTK ++extern int hrtimer_granularity_us; ++extern int hrtimeout_min_us; ++#if defined(CONFIG_PRINTK) || defined(CONFIG_SCHED_MUQSS) + static int ten_thousand __read_only = 10000; + #endif + #ifdef CONFIG_PERF_EVENTS +@@ -1119,6 +1121,24 @@ + .extra2 = &two, + }, + #endif ++ { ++ .procname = "hrtimer_granularity_us", ++ .data = &hrtimer_granularity_us, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &ten_thousand, ++ }, ++ { ++ .procname = "hrtimeout_min_us", ++ .data = &hrtimeout_min_us, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &ten_thousand, ++ }, + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff -Nur a/kernel/time/clockevents.c b/kernel/time/clockevents.c +--- a/kernel/time/clockevents.c 2018-11-03 17:03:07.433069521 +0000 ++++ b/kernel/time/clockevents.c 2018-11-03 16:58:17.283800909 +0000 +@@ -198,13 +198,9 @@ + + #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST + +-#ifdef CONFIG_SCHED_MUQSS ++int __read_mostly hrtimer_granularity_us = 100; + /* Limit min_delta to 100us */ +-#define MIN_DELTA_LIMIT (NSEC_PER_SEC / 10000) +-#else +-/* Limit min_delta to a jiffie */ +-#define MIN_DELTA_LIMIT 
(NSEC_PER_SEC / HZ) +-#endif ++#define MIN_DELTA_LIMIT (hrtimer_granularity_us * NSEC_PER_USEC) + + /** + * clockevents_increase_min_delta - raise minimum delta of a clock event device +diff -Nur a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +--- a/kernel/time/hrtimer.c 2018-11-03 17:04:16.448274547 +0000 ++++ b/kernel/time/hrtimer.c 2018-11-03 16:58:17.283800909 +0000 +@@ -1803,7 +1803,7 @@ + long __sched schedule_msec_hrtimeout(long timeout) + { + struct hrtimer_sleeper t; +- int delta, secs, jiffs; ++ int delta, jiffs; + ktime_t expires; + + if (!timeout) { +@@ -1820,9 +1820,8 @@ + if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ || pm_freezing) + return schedule_timeout(jiffs); + +- secs = timeout / 1000; + delta = (timeout % 1000) * NSEC_PER_MSEC; +- expires = ktime_set(secs, delta); ++ expires = ktime_set(0, delta); + + hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_set_expires_range_ns(&t.timer, expires, delta); +@@ -1846,9 +1845,53 @@ + + EXPORT_SYMBOL(schedule_msec_hrtimeout); + ++#define USECS_PER_SEC 1000000 ++extern int hrtimer_granularity_us; ++ ++static inline long schedule_usec_hrtimeout(long timeout) ++{ ++ struct hrtimer_sleeper t; ++ ktime_t expires; ++ int delta; ++ ++ if (!timeout) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ if (hrtimer_resolution >= NSEC_PER_SEC / HZ) ++ return schedule_timeout(usecs_to_jiffies(timeout)); ++ ++ if (timeout < hrtimer_granularity_us) ++ timeout = hrtimer_granularity_us; ++ delta = (timeout % USECS_PER_SEC) * NSEC_PER_USEC; ++ expires = ktime_set(0, delta); ++ ++ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ hrtimer_set_expires_range_ns(&t.timer, expires, delta); ++ ++ hrtimer_init_sleeper(&t, current); ++ ++ hrtimer_start_expires(&t.timer, HRTIMER_MODE_REL); ++ ++ if (likely(t.task)) ++ schedule(); ++ ++ hrtimer_cancel(&t.timer); ++ destroy_hrtimer_on_stack(&t.timer); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ expires = hrtimer_expires_remaining(&t.timer); ++ timeout = ktime_to_us(expires); ++ return timeout < 0 ? 0 : timeout; ++} ++ ++int __read_mostly hrtimeout_min_us = 1000; ++ + long __sched schedule_min_hrtimeout(void) + { +- return schedule_msec_hrtimeout(1); ++ return usecs_to_jiffies(schedule_usec_hrtimeout(hrtimeout_min_us)); + } + + EXPORT_SYMBOL(schedule_min_hrtimeout); diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-0012-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-0012-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch new file mode 100644 index 00000000..99b28d65 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-0012-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch @@ -0,0 +1,81 @@ +From 9e47a80f690080c12ce607158b96c305707543b8 Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Wed, 7 Dec 2016 21:23:01 +1100 +Subject: [PATCH 12/16] Reinstate default Hz of 100 in combination with MuQSS + and -ck patches. + +--- + kernel/Kconfig.hz | 25 ++++++++++++++++++------- + 1 file changed, 18 insertions(+), 7 deletions(-) + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1806fcac8f14 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,8 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_100 if SCHED_MUQSS ++ default HZ_250_NODEF if !SCHED_MUQSS + help + Allows the configuration of the timer frequency. 
It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -19,11 +20,18 @@ choice + config HZ_100 + bool "100 HZ" + help ++ 100 Hz is a suitable choice in combination with MuQSS which does ++ not rely on ticks for rescheduling interrupts, and is not Hz limited ++ for timeouts and sleeps from both the kernel and userspace. ++ This allows us to benefit from the lower overhead and higher ++ throughput of fewer timer ticks. ++ ++ Non-MuQSS kernels: + 100 Hz is a typical choice for servers, SMP and NUMA systems + with lots of processors that may show reduced performance if + too many timer interrupts are occurring. + +- config HZ_250 ++ config HZ_250_NODEF + bool "250 HZ" + help + 250 Hz is a good compromise choice allowing server performance +@@ -31,7 +39,10 @@ choice + on SMP and NUMA systems. If you are going to be using NTSC video + or multimedia, selected 300Hz instead. + +- config HZ_300 ++ 250 Hz is the default choice for the mainline scheduler but not ++ advantageous in combination with MuQSS. ++ ++ config HZ_300_NODEF + bool "300 HZ" + help + 300 Hz is a good compromise choice allowing server performance +@@ -39,7 +50,7 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + +- config HZ_1000 ++ config HZ_1000_NODEF + bool "1000 HZ" + help + 1000 Hz is the preferred choice for desktop systems and other +@@ -50,9 +61,9 @@ endchoice + config HZ + int + default 100 if HZ_100 +- default 250 if HZ_250 +- default 300 if HZ_300 +- default 1000 if HZ_1000 ++ default 250 if HZ_250_NODEF ++ default 300 if HZ_300_NODEF ++ default 1000 if HZ_1000_NODEF + + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS +-- +2.11.0 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-0013-Make-threaded-IRQs-optionally-the-default-which-can-.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-0013-Make-threaded-IRQs-optionally-the-default-which-can-.patch new file mode 100644 index 00000000..63ec9fdf --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-0013-Make-threaded-IRQs-optionally-the-default-which-can-.patch @@ -0,0 +1,61 @@ +From 5902b315d4061ebbe73a62c52e6d3b618066cebc Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Wed, 7 Dec 2016 21:13:16 +1100 +Subject: [PATCH 13/16] Make threaded IRQs optionally the default which can be + disabled. + +--- + kernel/irq/Kconfig | 14 ++++++++++++++ + kernel/irq/manage.c | 10 ++++++++++ + 2 files changed, 24 insertions(+) + +diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig +index a117adf7084b..0984c54fd4e9 100644 +--- a/kernel/irq/Kconfig ++++ b/kernel/irq/Kconfig +@@ -111,6 +111,20 @@ config IRQ_DOMAIN_DEBUG + config IRQ_FORCED_THREADING + bool + ++config FORCE_IRQ_THREADING ++ bool "Make IRQ threading compulsory" ++ depends on IRQ_FORCED_THREADING ++ default y ++ ---help--- ++ ++ Make IRQ threading mandatory for any IRQ handlers that support it ++ instead of being optional and requiring the threadirqs kernel ++ parameter. Instead they can be optionally disabled with the ++ nothreadirqs kernel parameter. ++ ++ Enable if you are building for a desktop or low latency system, ++ otherwise say N. 
++ + config SPARSE_IRQ + bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ + ---help--- +diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c +index 4bff6a10ae8e..5a6df0dd23c4 100644 +--- a/kernel/irq/manage.c ++++ b/kernel/irq/manage.c +@@ -24,7 +24,17 @@ + #include "internals.h" + + #ifdef CONFIG_IRQ_FORCED_THREADING ++#ifdef CONFIG_FORCE_IRQ_THREADING ++__read_mostly bool force_irqthreads = true; ++#else + __read_mostly bool force_irqthreads; ++#endif ++static int __init setup_noforced_irqthreads(char *arg) ++{ ++ force_irqthreads = false; ++ return 0; ++} ++early_param("nothreadirqs", setup_noforced_irqthreads); + + static int __init setup_forced_irqthreads(char *arg) + { +-- +2.11.0 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-0014-Swap-sucks.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-0014-Swap-sucks.patch new file mode 100644 index 00000000..6bf5bcda --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-0014-Swap-sucks.patch @@ -0,0 +1,25 @@ +From ed0ab4c80fcb6fa4abb4f2f897e591df6eaa2d0e Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Sat, 12 Aug 2017 12:02:04 +1000 +Subject: [PATCH 14/16] Swap sucks. + +--- + mm/vmscan.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index eb2f0315b8c0..67d03efab288 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -149,7 +149,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 33; + /* + * The total number of pages which are beyond the high watermark within all + * zones. +-- +2.11.0 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-0015-MuQSS.c-needs-irq_regs.h-to-use-get_irq_regs.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-0015-MuQSS.c-needs-irq_regs.h-to-use-get_irq_regs.patch new file mode 100644 index 00000000..bfa509a5 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-0015-MuQSS.c-needs-irq_regs.h-to-use-get_irq_regs.patch @@ -0,0 +1,19 @@ +diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +index e84d700709ff6..16364915cff53 100644 +--- a/kernel/sched/MuQSS.c ++++ b/kernel/sched/MuQSS.c +@@ -70,6 +70,14 @@ + + #include "MuQSS.h" + ++/* needing to include irq_regs.h, "because reasons"... ++ * implicit declaration of function ‘get_irq_regs’; ++ * did you mean ‘get_ibs_caps’? 
++ * [-Werror=implicit-function-declaration] ++ * ^ this is because autodetect is not flawless ++ */ ++#include ++ + #define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) + #define rt_task(p) rt_prio((p)->prio) + #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-0016-unfuck-MuQSS-on-linux-4_14_15+.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-0016-unfuck-MuQSS-on-linux-4_14_15+.patch new file mode 100644 index 00000000..f7dc1d1c --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-0016-unfuck-MuQSS-on-linux-4_14_15+.patch @@ -0,0 +1,48 @@ +diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +index e84d700709ff6..b0be7fcfe41f9 100644 +--- a/kernel/sched/MuQSS.c ++++ b/kernel/sched/MuQSS.c +@@ -55,6 +55,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -1959,7 +1960,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) + p->state = TASK_WAKING; + + if (p->in_iowait) { ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 15) + delayacct_blkio_end(); ++#else ++ delayacct_blkio_end(p); ++#endif + atomic_dec(&task_rq(p)->nr_iowait); + } + +@@ -1970,7 +1975,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) + #else /* CONFIG_SMP */ + + if (p->in_iowait) { ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 15) + delayacct_blkio_end(); ++#else ++ delayacct_blkio_end(p); ++#endif + atomic_dec(&task_rq(p)->nr_iowait); + } + +@@ -2022,7 +2031,11 @@ static void try_to_wake_up_local(struct task_struct *p) + + if (!task_on_rq_queued(p)) { + if (p->in_iowait) { ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 15) + delayacct_blkio_end(); ++#else ++ delayacct_blkio_end(p); ++#endif + atomic_dec(&rq->nr_iowait); + } + ttwu_activate(rq, p); diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-0017-unfuck-MuQSS-on-linux-4_14_75+.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-0017-unfuck-MuQSS-on-linux-4_14_75+.patch new file mode 100644 index 00000000..1a1717bf --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-0017-unfuck-MuQSS-on-linux-4_14_75+.patch @@ -0,0 +1,14 @@ +diff -Nur a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +--- a/kernel/sched/MuQSS.c 2019-01-05 22:51:24.547448624 +0000 ++++ b/kernel/sched/MuQSS.c 2019-01-05 22:58:29.821451056 +0000 +@@ -1021,6 +1021,10 @@ + #define CPUIDLE_THREAD_BUSY (16) + #define CPUIDLE_DIFF_NODE (32) + ++#ifdef CONFIG_SCHED_SMT ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++#endif ++ + /* + * The best idle CPU is chosen according to the CPUIDLE ranking above where the + * lowest value would give the most suitable CPU to schedule p onto next. 
The diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-Revert-ath10k-activate-user-space-firmware-loading.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-Revert-ath10k-activate-user-space-firmware-loading.patch new file mode 100644 index 00000000..28f9b2f6 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-Revert-ath10k-activate-user-space-firmware-loading.patch @@ -0,0 +1,13 @@ +diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c +index a4f635820..9b4c4facf 100644 +--- a/drivers/net/wireless/ath/ath10k/core.c ++++ b/drivers/net/wireless/ath/ath10k/core.c +@@ -519,7 +519,7 @@ static const struct firmware *ath10k_fetch_fw_file(struct ath10k *ar, + dir = "."; + + snprintf(filename, sizeof(filename), "%s/%s", dir, file); +- ret = request_firmware(&fw, filename, ar->dev); ++ ret = request_firmware_direct(&fw, filename, ar->dev); + ath10k_dbg(ar, ATH10K_DBG_BOOT, "boot fw request '%s': %d\n", + filename, ret); + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-introduce-NUMA-identity-node-sched-domain.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-introduce-NUMA-identity-node-sched-domain.patch new file mode 100644 index 00000000..2376edae --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-introduce-NUMA-identity-node-sched-domain.patch @@ -0,0 +1,46 @@ +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 808998fe1..18d3321ef 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -1339,6 +1339,10 @@ void sched_init_numa(void) + if (!sched_domains_numa_distance) + return; + ++ /* Includes NUMA identity node at level 0. */ ++ sched_domains_numa_distance[level++] = curr_distance; ++ sched_domains_numa_levels = level; ++ + /* + * O(nr_nodes^2) deduplicating selection sort -- in order to find the + * unique distances in the node_distance() table. +@@ -1386,8 +1390,7 @@ void sched_init_numa(void) + return; + + /* +- * 'level' contains the number of unique distances, excluding the +- * identity distance node_distance(i,i). ++ * 'level' contains the number of unique distances + * + * The sched_domains_numa_distance[] array includes the actual distance + * numbers. +@@ -1448,10 +1451,19 @@ void sched_init_numa(void) + for (i = 0; sched_domain_topology[i].mask; i++) + tl[i] = sched_domain_topology[i]; + ++ /* ++ * Add the NUMA identity distance, aka single NODE. ++ */ ++ tl[i++] = (struct sched_domain_topology_level){ ++ .mask = sd_numa_mask, ++ .numa_level = 0, ++ SD_INIT_NAME(NODE) ++ }; ++ + /* + * .. and append 'j' levels of NUMA goodness. 
+ */ +- for (j = 0; j < level; i++, j++) { ++ for (j = 1; j < level; i++, j++) { + tl[i] = (struct sched_domain_topology_level){ + .mask = sd_numa_mask, + .sd_flags = cpu_numa_flags, diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-k10temp-add-ZEN-support.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-k10temp-add-ZEN-support.patch new file mode 100644 index 00000000..b1e8a9b0 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-k10temp-add-ZEN-support.patch @@ -0,0 +1,177 @@ +diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c +index ce3b91f22..0721e1756 100644 +--- a/drivers/hwmon/k10temp.c ++++ b/drivers/hwmon/k10temp.c +@@ -36,6 +36,10 @@ MODULE_PARM_DESC(force, "force loading on processors with erratum 319"); + /* Provide lock for writing to NB_SMU_IND_ADDR */ + static DEFINE_MUTEX(nb_smu_ind_mutex); + ++#ifndef PCI_DEVICE_ID_AMD_17H_DF_F3 ++#define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463 ++#endif ++ + /* CPUID function 0x80000001, ebx */ + #define CPUID_PKGTYPE_MASK 0xf0000000 + #define CPUID_PKGTYPE_F 0x00000000 +@@ -61,31 +65,72 @@ static DEFINE_MUTEX(nb_smu_ind_mutex); + */ + #define F15H_M60H_REPORTED_TEMP_CTRL_OFFSET 0xd8200ca4 + +-static void amd_nb_smu_index_read(struct pci_dev *pdev, unsigned int devfn, +- int offset, u32 *val) ++/* F17h M01h Access througn SMN */ ++#define F17H_M01H_REPORTED_TEMP_CTRL_OFFSET 0x00059800 ++ ++struct k10temp_data { ++ struct pci_dev *pdev; ++ void (*read_tempreg)(struct pci_dev *pdev, u32 *regval); ++ int temp_offset; ++}; ++ ++struct tctl_offset { ++ u8 model; ++ char const *id; ++ int offset; ++}; ++ ++static const struct tctl_offset tctl_offset_table[] = { ++ { 0x17, "AMD Ryzen 5 1600X", 20000 }, ++ { 0x17, "AMD Ryzen 7 1700X", 20000 }, ++ { 0x17, "AMD Ryzen 7 1800X", 20000 }, ++ { 0x17, "AMD Ryzen Threadripper 1950X", 27000 }, ++ { 0x17, "AMD Ryzen Threadripper 1920X", 27000 }, ++ { 0x17, "AMD Ryzen Threadripper 1950", 10000 }, ++ { 0x17, "AMD Ryzen Threadripper 1920", 10000 }, ++ { 0x17, "AMD Ryzen Threadripper 1910", 10000 }, ++}; ++ ++static void read_tempreg_pci(struct pci_dev *pdev, u32 *regval) ++{ ++ pci_read_config_dword(pdev, REG_REPORTED_TEMPERATURE, regval); ++} ++ ++static void amd_nb_index_read(struct pci_dev *pdev, unsigned int devfn, ++ unsigned int base, int offset, u32 *val) + { + mutex_lock(&nb_smu_ind_mutex); + pci_bus_write_config_dword(pdev->bus, devfn, +- 0xb8, offset); ++ base, offset); + pci_bus_read_config_dword(pdev->bus, devfn, +- 0xbc, val); ++ base + 4, val); + mutex_unlock(&nb_smu_ind_mutex); + } + ++static void read_tempreg_nb_f15(struct pci_dev *pdev, u32 *regval) ++{ ++ amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0xb8, ++ F15H_M60H_REPORTED_TEMP_CTRL_OFFSET, regval); ++} ++ ++static void read_tempreg_nb_f17(struct pci_dev *pdev, u32 *regval) ++{ ++ amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0x60, ++ F17H_M01H_REPORTED_TEMP_CTRL_OFFSET, regval); ++} ++ + static ssize_t temp1_input_show(struct device *dev, + struct device_attribute *attr, char *buf) + { ++ struct k10temp_data *data = dev_get_drvdata(dev); + u32 regval; +- struct pci_dev *pdev = dev_get_drvdata(dev); +- +- if (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model == 0x60) { +- amd_nb_smu_index_read(pdev, PCI_DEVFN(0, 0), +- F15H_M60H_REPORTED_TEMP_CTRL_OFFSET, +- ®val); +- } else { +- pci_read_config_dword(pdev, REG_REPORTED_TEMPERATURE, ®val); +- } +- return sprintf(buf, "%u\n", (regval >> 21) * 125); ++ unsigned int temp; ++ ++ data->read_tempreg(data->pdev, ®val); ++ temp = (regval >> 21) * 125; ++ temp -= 
data->temp_offset; ++ ++ return sprintf(buf, "%u\n", temp); + } + + static ssize_t temp1_max_show(struct device *dev, +@@ -98,11 +143,12 @@ static ssize_t show_temp_crit(struct device *dev, + struct device_attribute *devattr, char *buf) + { + struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); ++ struct k10temp_data *data = dev_get_drvdata(dev); + int show_hyst = attr->index; + u32 regval; + int value; + +- pci_read_config_dword(dev_get_drvdata(dev), ++ pci_read_config_dword(data->pdev, + REG_HARDWARE_THERMAL_CONTROL, ®val); + value = ((regval >> 16) & 0x7f) * 500 + 52000; + if (show_hyst) +@@ -119,7 +165,8 @@ static umode_t k10temp_is_visible(struct kobject *kobj, + struct attribute *attr, int index) + { + struct device *dev = container_of(kobj, struct device, kobj); +- struct pci_dev *pdev = dev_get_drvdata(dev); ++ struct k10temp_data *data = dev_get_drvdata(dev); ++ struct pci_dev *pdev = data->pdev; + + if (index >= 2) { + u32 reg_caps, reg_htc; +@@ -187,7 +234,9 @@ static int k10temp_probe(struct pci_dev *pdev, + { + int unreliable = has_erratum_319(pdev); + struct device *dev = &pdev->dev; ++ struct k10temp_data *data; + struct device *hwmon_dev; ++ int i; + + if (unreliable) { + if (!force) { +@@ -199,7 +248,31 @@ static int k10temp_probe(struct pci_dev *pdev, + "unreliable CPU thermal sensor; check erratum 319\n"); + } + +- hwmon_dev = devm_hwmon_device_register_with_groups(dev, "k10temp", pdev, ++ data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); ++ if (!data) ++ return -ENOMEM; ++ ++ data->pdev = pdev; ++ ++ if (boot_cpu_data.x86 == 0x15 && (boot_cpu_data.x86_model == 0x60 || ++ boot_cpu_data.x86_model == 0x70)) ++ data->read_tempreg = read_tempreg_nb_f15; ++ else if (boot_cpu_data.x86 == 0x17) ++ data->read_tempreg = read_tempreg_nb_f17; ++ else ++ data->read_tempreg = read_tempreg_pci; ++ ++ for (i = 0; i < ARRAY_SIZE(tctl_offset_table); i++) { ++ const struct tctl_offset *entry = &tctl_offset_table[i]; ++ ++ if (boot_cpu_data.x86 == entry->model && ++ strstr(boot_cpu_data.x86_model_id, entry->id)) { ++ data->temp_offset = entry->offset; ++ break; ++ } ++ } ++ ++ hwmon_dev = devm_hwmon_device_register_with_groups(dev, "k10temp", data, + k10temp_groups); + return PTR_ERR_OR_ZERO(hwmon_dev); + } +@@ -214,6 +287,7 @@ static const struct pci_device_id k10temp_id_table[] = { + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, ++ { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, + {} + }; + MODULE_DEVICE_TABLE(pci, k10temp_id_table); diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-linux-hardened.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-linux-hardened.patch new file mode 100644 index 00000000..9280791e --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-linux-hardened.patch @@ -0,0 +1,2868 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 7d8b17ce8804..7e4f071c3bf2 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -490,16 +490,6 @@ + nosocket -- Disable socket memory accounting. + nokmem -- Disable kernel memory accounting. + +- checkreqprot [SELINUX] Set initial checkreqprot flag value. +- Format: { "0" | "1" } +- See security/selinux/Kconfig help text. +- 0 -- check protection applied by kernel (includes +- any implied execute protection). 
+- 1 -- check protection requested by application. +- Default value is set via a kernel config option. +- Value can be changed at runtime via +- /selinux/checkreqprot. +- + cio_ignore= [S390] + See Documentation/s390/CommonIO for details. + clk_ignore_unused +@@ -2984,6 +2974,11 @@ + the specified number of seconds. This is to be used if + your oopses keep scrolling off the screen. + ++ extra_latent_entropy ++ Enable a very simple form of latent entropy extraction ++ from the first 4GB of memory as the bootmem allocator ++ passes the memory pages to the buddy allocator. ++ + pcbit= [HW,ISDN] + + pcd. [PARIDE] +diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt +index 694968c7523c..002d86416ef8 100644 +--- a/Documentation/sysctl/kernel.txt ++++ b/Documentation/sysctl/kernel.txt +@@ -91,6 +91,7 @@ show up in /proc/sys/kernel: + - sysctl_writes_strict + - tainted + - threads-max ++- tiocsti_restrict + - unknown_nmi_panic + - watchdog + - watchdog_thresh +@@ -999,6 +1000,26 @@ available RAM pages threads-max is reduced accordingly. + + ============================================================== + ++tiocsti_restrict: ++ ++This toggle indicates whether unprivileged users are prevented ++from using the TIOCSTI ioctl to inject commands into other processes ++which share a tty session. ++ ++When tiocsti_restrict is set to (0) there are no restrictions(accept ++the default restriction of only being able to injection commands into ++one's own tty). When tiocsti_restrict is set to (1), users must ++have CAP_SYS_ADMIN to use the TIOCSTI ioctl. ++ ++When user namespaces are in use, the check for the capability ++CAP_SYS_ADMIN is done against the user namespace that originally ++opened the tty. ++ ++The kernel config option CONFIG_SECURITY_TIOCSTI_RESTRICT sets the ++default value of tiocsti_restrict. ++ ++============================================================== ++ + unknown_nmi_panic: + + The value in this file affects behavior of handling NMI. When the +diff --git a/Makefile b/Makefile +index 70cc37cb3e99..edc3de99b3cd 100644 +--- a/Makefile ++++ b/Makefile +@@ -714,6 +714,9 @@ endif + KBUILD_CFLAGS += $(stackp-flag) + + ifeq ($(cc-name),clang) ++ifdef CONFIG_LOCAL_INIT ++KBUILD_CFLAGS += -fsanitize=local-init ++endif + KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) + KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier) + KBUILD_CFLAGS += $(call cc-disable-warning, gnu) +diff --git a/arch/Kconfig b/arch/Kconfig +index 77b3e21c4844..3dff252446ac 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -446,6 +446,11 @@ config GCC_PLUGIN_LATENT_ENTROPY + is some slowdown of the boot process (about 0.5%) and fork and + irq processing. + ++ When extra_latent_entropy is passed on the kernel command line, ++ entropy will be extracted from up to the first 4GB of RAM while the ++ runtime memory allocator is being initialized. This costs even more ++ slowdown of the boot process. ++ + Note that entropy extracted this way is not cryptographically + secure! + +@@ -539,7 +544,7 @@ config CC_STACKPROTECTOR + choice + prompt "Stack Protector buffer overflow detection" + depends on HAVE_CC_STACKPROTECTOR +- default CC_STACKPROTECTOR_NONE ++ default CC_STACKPROTECTOR_STRONG + help + This option turns on the "stack-protector" GCC feature. 
This + feature puts, at the beginning of functions, a canary value on +@@ -741,7 +746,7 @@ config ARCH_MMAP_RND_BITS + int "Number of bits to use for ASLR of mmap base address" if EXPERT + range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX + default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT +- default ARCH_MMAP_RND_BITS_MIN ++ default ARCH_MMAP_RND_BITS_MAX + depends on HAVE_ARCH_MMAP_RND_BITS + help + This value can be used to select the number of bits to use to +@@ -775,7 +780,7 @@ config ARCH_MMAP_RND_COMPAT_BITS + int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT + range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX + default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT +- default ARCH_MMAP_RND_COMPAT_BITS_MIN ++ default ARCH_MMAP_RND_COMPAT_BITS_MAX + depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS + help + This value can be used to select the number of bits to use to +@@ -958,6 +963,7 @@ config ARCH_HAS_REFCOUNT + + config REFCOUNT_FULL + bool "Perform full reference count validation at the expense of speed" ++ default y + help + Enabling this switches the refcounting infrastructure from a fast + unchecked atomic_t implementation to a fully state checked +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index c30cd78b6918..ba32a283f027 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -926,6 +926,7 @@ endif + + config ARM64_SW_TTBR0_PAN + bool "Emulate Privileged Access Never using TTBR0_EL1 switching" ++ default y + help + Enabling this option prevents the kernel from accessing + user-space memory directly by pointing TTBR0_EL1 to a reserved +@@ -1052,6 +1053,7 @@ config RANDOMIZE_BASE + bool "Randomize the address of the kernel image" + select ARM64_MODULE_PLTS if MODULES + select RELOCATABLE ++ default y + help + Randomizes the virtual address at which the kernel image is + loaded, as a security feature that deters exploit attempts +diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug +index cc6bd559af85..01d5442d4722 100644 +--- a/arch/arm64/Kconfig.debug ++++ b/arch/arm64/Kconfig.debug +@@ -45,6 +45,7 @@ config ARM64_RANDOMIZE_TEXT_OFFSET + config DEBUG_WX + bool "Warn on W+X mappings at boot" + select ARM64_PTDUMP_CORE ++ default y + ---help--- + Generate a warning if any W+X mappings are found at boot. + +diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig +index b05796578e7a..8f6e2099717d 100644 +--- a/arch/arm64/configs/defconfig ++++ b/arch/arm64/configs/defconfig +@@ -1,4 +1,3 @@ +-CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_AUDIT=y + CONFIG_NO_HZ_IDLE=y +diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h +index 33be513ef24c..6f0c0e3ef0dd 100644 +--- a/arch/arm64/include/asm/elf.h ++++ b/arch/arm64/include/asm/elf.h +@@ -114,10 +114,10 @@ + + /* + * This is the base location for PIE (ET_DYN with INTERP) loads. On +- * 64-bit, this is above 4GB to leave the entire 32-bit address ++ * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * space open for things that want to use the area for 32-bit pointers. + */ +-#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) ++#define ELF_ET_DYN_BASE 0x100000000UL + + #ifndef __ASSEMBLY__ + +@@ -158,10 +158,10 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, + /* 1GB of VA */ + #ifdef CONFIG_COMPAT + #define STACK_RND_MASK (test_thread_flag(TIF_32BIT) ? 
\ +- 0x7ff >> (PAGE_SHIFT - 12) : \ +- 0x3ffff >> (PAGE_SHIFT - 12)) ++ ((1UL << mmap_rnd_compat_bits) - 1) >> (PAGE_SHIFT - 12) : \ ++ ((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) + #else +-#define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12)) ++#define STACK_RND_MASK (((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) + #endif + + #ifdef __AARCH64EB__ +diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c +index 9e773732520c..91359f45b5fc 100644 +--- a/arch/arm64/kernel/process.c ++++ b/arch/arm64/kernel/process.c +@@ -419,9 +419,9 @@ unsigned long arch_align_stack(unsigned long sp) + unsigned long arch_randomize_brk(struct mm_struct *mm) + { + if (is_compat_task()) +- return randomize_page(mm->brk, SZ_32M); ++ return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; + else +- return randomize_page(mm->brk, SZ_1G); ++ return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; + } + + /* +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 4f393eb9745f..1a31f8fc82ed 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1145,8 +1145,7 @@ config VM86 + default X86_LEGACY_VM86 + + config X86_16BIT +- bool "Enable support for 16-bit segments" if EXPERT +- default y ++ bool "Enable support for 16-bit segments" + depends on MODIFY_LDT_SYSCALL + ---help--- + This option is required by programs like Wine to run 16-bit +@@ -2220,7 +2219,7 @@ config COMPAT_VDSO + choice + prompt "vsyscall table for legacy applications" + depends on X86_64 +- default LEGACY_VSYSCALL_EMULATE ++ default LEGACY_VSYSCALL_NONE + help + Legacy user code that does not know how to find the vDSO expects + to be able to issue three syscalls by calling fixed addresses in +@@ -2310,8 +2309,7 @@ config CMDLINE_OVERRIDE + be set to 'N' under normal conditions. + + config MODIFY_LDT_SYSCALL +- bool "Enable the LDT (local descriptor table)" if EXPERT +- default y ++ bool "Enable the LDT (local descriptor table)" + ---help--- + Linux can allow user programs to install a per-process x86 + Local Descriptor Table (LDT) using the modify_ldt(2) system +diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug +index 6293a8768a91..add82e0f1df3 100644 +--- a/arch/x86/Kconfig.debug ++++ b/arch/x86/Kconfig.debug +@@ -101,6 +101,7 @@ config EFI_PGT_DUMP + config DEBUG_WX + bool "Warn on W+X mappings at boot" + select X86_PTDUMP_CORE ++ default y + ---help--- + Generate a warning if any W+X mappings are found at boot. + +diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig +index e32fc1f274d8..d08acc76502a 100644 +--- a/arch/x86/configs/x86_64_defconfig ++++ b/arch/x86/configs/x86_64_defconfig +@@ -1,5 +1,4 @@ + # CONFIG_LOCALVERSION_AUTO is not set +-CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_TASKSTATS=y +diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c +index 1911310959f8..bba8dbbc07a8 100644 +--- a/arch/x86/entry/vdso/vma.c ++++ b/arch/x86/entry/vdso/vma.c +@@ -203,55 +203,9 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) + } + + #ifdef CONFIG_X86_64 +-/* +- * Put the vdso above the (randomized) stack with another randomized +- * offset. This way there is no hole in the middle of address space. +- * To save memory make sure it is still in the same PTE as the stack +- * top. This doesn't give that many random bits. +- * +- * Note that this algorithm is imperfect: the distribution of the vdso +- * start address within a PMD is biased toward the end. +- * +- * Only used for the 64-bit and x32 vdsos. 
+- */ +-static unsigned long vdso_addr(unsigned long start, unsigned len) +-{ +- unsigned long addr, end; +- unsigned offset; +- +- /* +- * Round up the start address. It can start out unaligned as a result +- * of stack start randomization. +- */ +- start = PAGE_ALIGN(start); +- +- /* Round the lowest possible end address up to a PMD boundary. */ +- end = (start + len + PMD_SIZE - 1) & PMD_MASK; +- if (end >= TASK_SIZE_MAX) +- end = TASK_SIZE_MAX; +- end -= len; +- +- if (end > start) { +- offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); +- addr = start + (offset << PAGE_SHIFT); +- } else { +- addr = start; +- } +- +- /* +- * Forcibly align the final address in case we have a hardware +- * issue that requires alignment for performance reasons. +- */ +- addr = align_vdso_addr(addr); +- +- return addr; +-} +- + static int map_vdso_randomized(const struct vdso_image *image) + { +- unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start); +- +- return map_vdso(image, addr); ++ return map_vdso(image, 0); + } + #endif + +diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h +index 3a091cea36c5..0931c05a3348 100644 +--- a/arch/x86/include/asm/elf.h ++++ b/arch/x86/include/asm/elf.h +@@ -249,11 +249,11 @@ extern int force_personality32; + + /* + * This is the base location for PIE (ET_DYN with INTERP) loads. On +- * 64-bit, this is above 4GB to leave the entire 32-bit address ++ * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * space open for things that want to use the area for 32-bit pointers. + */ + #define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ +- (DEFAULT_MAP_WINDOW / 3 * 2)) ++ 0x100000000UL) + + /* This yields a mask that user programs can use to figure out what + instruction set this CPU supports. This could be done in user space, +@@ -312,8 +312,8 @@ extern unsigned long get_mmap_base(int is_legacy); + + #ifdef CONFIG_X86_32 + +-#define __STACK_RND_MASK(is32bit) (0x7ff) +-#define STACK_RND_MASK (0x7ff) ++#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) ++#define STACK_RND_MASK ((1UL << mmap_rnd_bits) - 1) + + #define ARCH_DLINFO ARCH_DLINFO_IA32 + +@@ -322,7 +322,11 @@ extern unsigned long get_mmap_base(int is_legacy); + #else /* CONFIG_X86_32 */ + + /* 1GB for 64bit, 8MB for 32bit */ +-#define __STACK_RND_MASK(is32bit) ((is32bit) ? 0x7ff : 0x3fffff) ++#ifdef CONFIG_COMPAT ++#define __STACK_RND_MASK(is32bit) ((is32bit) ? 
(1UL << mmap_rnd_compat_bits) - 1 : (1UL << mmap_rnd_bits) - 1) ++#else ++#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) ++#endif + #define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32()) + + #define ARCH_DLINFO \ +@@ -380,5 +384,4 @@ struct va_alignment { + } ____cacheline_aligned; + + extern struct va_alignment va_align; +-extern unsigned long align_vdso_addr(unsigned long); + #endif /* _ASM_X86_ELF_H */ +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index e31040333f0c..14f3f214c9d1 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -302,6 +302,7 @@ static inline void cr4_set_bits(unsigned long mask) + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + if ((cr4 | mask) != cr4) { + cr4 |= mask; + this_cpu_write(cpu_tlbstate.cr4, cr4); +@@ -315,6 +316,7 @@ static inline void cr4_clear_bits(unsigned long mask) + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + if ((cr4 & ~mask) != cr4) { + cr4 &= ~mask; + this_cpu_write(cpu_tlbstate.cr4, cr4); +@@ -327,6 +329,7 @@ static inline void cr4_toggle_bits(unsigned long mask) + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + cr4 ^= mask; + this_cpu_write(cpu_tlbstate.cr4, cr4); + __write_cr4(cr4); +@@ -435,6 +438,7 @@ static inline void __native_flush_tlb_global(void) + raw_local_irq_save(flags); + + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + /* toggle PGE */ + native_write_cr4(cr4 ^ X86_CR4_PGE); + /* write old PGE again and flush TLBs */ +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 51e49f6fe8e1..7ee813033624 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1669,7 +1669,6 @@ void cpu_init(void) + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); + +- x86_configure_nx(); + x2apic_setup(); + + /* +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index a98d1cdd6299..7426eb5d1c03 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -40,6 +40,8 @@ + #include + #include + #include ++#include ++#include + + #include "process.h" + +@@ -782,7 +784,10 @@ unsigned long arch_align_stack(unsigned long sp) + + unsigned long arch_randomize_brk(struct mm_struct *mm) + { +- return randomize_page(mm->brk, 0x02000000); ++ if (mmap_is_ia32()) ++ return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; ++ else ++ return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; + } + + /* +diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c +index a63fe77b3217..e1085e76043e 100644 +--- a/arch/x86/kernel/sys_x86_64.c ++++ b/arch/x86/kernel/sys_x86_64.c +@@ -54,13 +54,6 @@ static unsigned long get_align_bits(void) + return va_align.bits & get_align_mask(); + } + +-unsigned long align_vdso_addr(unsigned long addr) +-{ +- unsigned long align_mask = get_align_mask(); +- addr = (addr + align_mask) & ~align_mask; +- return addr | get_align_bits(); +-} +- + static int __init control_va_addr_alignment(char *str) + { + /* guard against enabling this on other CPU families */ +@@ -122,10 +115,7 @@ static void find_start_end(unsigned long addr, unsigned long flags, + } + + *begin = get_mmap_base(1); +- if (in_compat_syscall()) +- *end = task_size_32bit(); +- else +- *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW); ++ *end = get_mmap_base(0); + } + + unsigned long +@@ -206,7 +196,7 @@ 
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; +- info.low_limit = PAGE_SIZE; ++ info.low_limit = get_mmap_base(1); + info.high_limit = get_mmap_base(0); + + /* +diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c +index 3141e67ec24c..e93173193f60 100644 +--- a/arch/x86/mm/init_32.c ++++ b/arch/x86/mm/init_32.c +@@ -558,7 +558,7 @@ static void __init pagetable_init(void) + permanent_kmaps_init(pgd_base); + } + +-pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); ++pteval_t __supported_pte_mask __ro_after_init = ~(_PAGE_NX | _PAGE_GLOBAL); + EXPORT_SYMBOL_GPL(__supported_pte_mask); + + /* user-defined highmem size */ +@@ -865,7 +865,7 @@ int arch_remove_memory(u64 start, u64 size) + #endif + #endif + +-int kernel_set_to_readonly __read_mostly; ++int kernel_set_to_readonly __ro_after_init; + + void set_kernel_text_rw(void) + { +@@ -917,12 +917,11 @@ void mark_rodata_ro(void) + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; + ++ kernel_set_to_readonly = 1; + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel text: %luk\n", + size >> 10); + +- kernel_set_to_readonly = 1; +- + #ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", + start, start+size); +diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c +index 624edfbff02d..54bb0705dd53 100644 +--- a/arch/x86/mm/init_64.c ++++ b/arch/x86/mm/init_64.c +@@ -65,7 +65,7 @@ + * around without checking the pgd every time. + */ + +-pteval_t __supported_pte_mask __read_mostly = ~0; ++pteval_t __supported_pte_mask __ro_after_init = ~0; + EXPORT_SYMBOL_GPL(__supported_pte_mask); + + int force_personality32; +@@ -1179,7 +1179,7 @@ void __init mem_init(void) + mem_init_print_info(NULL); + } + +-int kernel_set_to_readonly; ++int kernel_set_to_readonly __ro_after_init; + + void set_kernel_text_rw(void) + { +@@ -1228,9 +1228,8 @@ void mark_rodata_ro(void) + + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", + (end - start) >> 10); +- set_memory_ro(start, (end - start) >> PAGE_SHIFT); +- + kernel_set_to_readonly = 1; ++ set_memory_ro(start, (end - start) >> PAGE_SHIFT); + + /* + * The rodata/data/bss/brk section (but not the kernel text!) +diff --git a/block/blk-softirq.c b/block/blk-softirq.c +index 01e2b353a2b9..9aeddca4a29f 100644 +--- a/block/blk-softirq.c ++++ b/block/blk-softirq.c +@@ -20,7 +20,7 @@ static DEFINE_PER_CPU(struct list_head, blk_cpu_done); + * Softirq action handler - move entries to local list and loop over them + * while passing them to the queue registered handler. 
+ */ +-static __latent_entropy void blk_done_softirq(struct softirq_action *h) ++static __latent_entropy void blk_done_softirq(void) + { + struct list_head *cpu_list, local_list; + +diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c +index 04f406d7e973..60d8c59fa824 100644 +--- a/drivers/ata/libata-core.c ++++ b/drivers/ata/libata-core.c +@@ -5148,7 +5148,7 @@ void ata_qc_free(struct ata_queued_cmd *qc) + struct ata_port *ap; + unsigned int tag; + +- WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ ++ BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ + ap = qc->ap; + + qc->flags = 0; +@@ -5165,7 +5165,7 @@ void __ata_qc_complete(struct ata_queued_cmd *qc) + struct ata_port *ap; + struct ata_link *link; + +- WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ ++ BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ + WARN_ON_ONCE(!(qc->flags & ATA_QCFLAG_ACTIVE)); + ap = qc->ap; + link = qc->dev->link; +diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig +index c28dca0c613d..d4813f0d25ca 100644 +--- a/drivers/char/Kconfig ++++ b/drivers/char/Kconfig +@@ -9,7 +9,6 @@ source "drivers/tty/Kconfig" + + config DEVMEM + bool "/dev/mem virtual device support" +- default y + help + Say Y here if you want to support the /dev/mem device. + The /dev/mem device is used to access areas of physical +@@ -568,7 +567,6 @@ config TELCLOCK + config DEVPORT + bool "/dev/port character device" + depends on ISA || PCI +- default y + help + Say Y here if you want to support the /dev/port device. The /dev/port + device is similar to /dev/mem, but for I/O ports. +diff --git a/drivers/media/dvb-frontends/cx24116.c b/drivers/media/dvb-frontends/cx24116.c +index e105532bfba8..e07d52bb9b62 100644 +--- a/drivers/media/dvb-frontends/cx24116.c ++++ b/drivers/media/dvb-frontends/cx24116.c +@@ -1462,7 +1462,7 @@ static int cx24116_tune(struct dvb_frontend *fe, bool re_tune, + return cx24116_read_status(fe, status); + } + +-static int cx24116_get_algo(struct dvb_frontend *fe) ++static enum dvbfe_algo cx24116_get_algo(struct dvb_frontend *fe) + { + return DVBFE_ALGO_HW; + } +diff --git a/drivers/media/dvb-frontends/cx24117.c b/drivers/media/dvb-frontends/cx24117.c +index d37cb7762bd6..97e0feff0ede 100644 +--- a/drivers/media/dvb-frontends/cx24117.c ++++ b/drivers/media/dvb-frontends/cx24117.c +@@ -1555,7 +1555,7 @@ static int cx24117_tune(struct dvb_frontend *fe, bool re_tune, + return cx24117_read_status(fe, status); + } + +-static int cx24117_get_algo(struct dvb_frontend *fe) ++static enum dvbfe_algo cx24117_get_algo(struct dvb_frontend *fe) + { + return DVBFE_ALGO_HW; + } +diff --git a/drivers/media/dvb-frontends/cx24120.c b/drivers/media/dvb-frontends/cx24120.c +index 7f11dcc94d85..01da670760ba 100644 +--- a/drivers/media/dvb-frontends/cx24120.c ++++ b/drivers/media/dvb-frontends/cx24120.c +@@ -1491,7 +1491,7 @@ static int cx24120_tune(struct dvb_frontend *fe, bool re_tune, + return cx24120_read_status(fe, status); + } + +-static int cx24120_get_algo(struct dvb_frontend *fe) ++static enum dvbfe_algo cx24120_get_algo(struct dvb_frontend *fe) + { + return DVBFE_ALGO_HW; + } +diff --git a/drivers/media/dvb-frontends/cx24123.c b/drivers/media/dvb-frontends/cx24123.c +index 1d59d1d3bd82..41cd0e9ea199 100644 +--- a/drivers/media/dvb-frontends/cx24123.c ++++ b/drivers/media/dvb-frontends/cx24123.c +@@ -1005,7 +1005,7 @@ static int cx24123_tune(struct dvb_frontend *fe, + return retval; + } + +-static int cx24123_get_algo(struct dvb_frontend *fe) ++static 
enum dvbfe_algo cx24123_get_algo(struct dvb_frontend *fe) + { + return DVBFE_ALGO_HW; + } +diff --git a/drivers/media/dvb-frontends/cxd2820r_core.c b/drivers/media/dvb-frontends/cxd2820r_core.c +index f6ebbb47b9b2..3e0d8cbd76da 100644 +--- a/drivers/media/dvb-frontends/cxd2820r_core.c ++++ b/drivers/media/dvb-frontends/cxd2820r_core.c +@@ -403,7 +403,7 @@ static enum dvbfe_search cxd2820r_search(struct dvb_frontend *fe) + return DVBFE_ALGO_SEARCH_ERROR; + } + +-static int cxd2820r_get_frontend_algo(struct dvb_frontend *fe) ++static enum dvbfe_algo cxd2820r_get_frontend_algo(struct dvb_frontend *fe) + { + return DVBFE_ALGO_CUSTOM; + } +diff --git a/drivers/media/dvb-frontends/mb86a20s.c b/drivers/media/dvb-frontends/mb86a20s.c +index e8ac8c3e2ec0..e0f4ba8302d1 100644 +--- a/drivers/media/dvb-frontends/mb86a20s.c ++++ b/drivers/media/dvb-frontends/mb86a20s.c +@@ -2055,7 +2055,7 @@ static void mb86a20s_release(struct dvb_frontend *fe) + kfree(state); + } + +-static int mb86a20s_get_frontend_algo(struct dvb_frontend *fe) ++static enum dvbfe_algo mb86a20s_get_frontend_algo(struct dvb_frontend *fe) + { + return DVBFE_ALGO_HW; + } +diff --git a/drivers/media/dvb-frontends/s921.c b/drivers/media/dvb-frontends/s921.c +index 274544a3ae0e..9ef9b9bc1bd2 100644 +--- a/drivers/media/dvb-frontends/s921.c ++++ b/drivers/media/dvb-frontends/s921.c +@@ -464,7 +464,7 @@ static int s921_tune(struct dvb_frontend *fe, + return rc; + } + +-static int s921_get_algo(struct dvb_frontend *fe) ++static enum dvbfe_algo s921_get_algo(struct dvb_frontend *fe) + { + return DVBFE_ALGO_HW; + } +diff --git a/drivers/media/pci/bt8xx/dst.c b/drivers/media/pci/bt8xx/dst.c +index 7166d2279465..fa682f9fdc4b 100644 +--- a/drivers/media/pci/bt8xx/dst.c ++++ b/drivers/media/pci/bt8xx/dst.c +@@ -1657,7 +1657,7 @@ static int dst_tune_frontend(struct dvb_frontend* fe, + return 0; + } + +-static int dst_get_tuning_algo(struct dvb_frontend *fe) ++static enum dvbfe_algo dst_get_tuning_algo(struct dvb_frontend *fe) + { + return dst_algo ? 
DVBFE_ALGO_HW : DVBFE_ALGO_SW; + } +diff --git a/drivers/media/pci/pt1/va1j5jf8007s.c b/drivers/media/pci/pt1/va1j5jf8007s.c +index f75f69556be7..d913a6050e8c 100644 +--- a/drivers/media/pci/pt1/va1j5jf8007s.c ++++ b/drivers/media/pci/pt1/va1j5jf8007s.c +@@ -98,7 +98,7 @@ static int va1j5jf8007s_read_snr(struct dvb_frontend *fe, u16 *snr) + return 0; + } + +-static int va1j5jf8007s_get_frontend_algo(struct dvb_frontend *fe) ++static enum dvbfe_algo va1j5jf8007s_get_frontend_algo(struct dvb_frontend *fe) + { + return DVBFE_ALGO_HW; + } +diff --git a/drivers/media/pci/pt1/va1j5jf8007t.c b/drivers/media/pci/pt1/va1j5jf8007t.c +index 63fda79a75c0..4115c3ccd4a8 100644 +--- a/drivers/media/pci/pt1/va1j5jf8007t.c ++++ b/drivers/media/pci/pt1/va1j5jf8007t.c +@@ -88,7 +88,7 @@ static int va1j5jf8007t_read_snr(struct dvb_frontend *fe, u16 *snr) + return 0; + } + +-static int va1j5jf8007t_get_frontend_algo(struct dvb_frontend *fe) ++static enum dvbfe_algo va1j5jf8007t_get_frontend_algo(struct dvb_frontend *fe) + { + return DVBFE_ALGO_HW; + } +diff --git a/drivers/misc/lkdtm_core.c b/drivers/misc/lkdtm_core.c +index 981b3ef71e47..9883da1da383 100644 +--- a/drivers/misc/lkdtm_core.c ++++ b/drivers/misc/lkdtm_core.c +@@ -78,7 +78,7 @@ static irqreturn_t jp_handle_irq_event(unsigned int irq, + return 0; + } + +-static void jp_tasklet_action(struct softirq_action *a) ++static void jp_tasklet_action(void) + { + lkdtm_handler(); + jprobe_return(); +diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig +index b811442c5ce6..4f62a63cbcb1 100644 +--- a/drivers/tty/Kconfig ++++ b/drivers/tty/Kconfig +@@ -122,7 +122,6 @@ config UNIX98_PTYS + + config LEGACY_PTYS + bool "Legacy (BSD) PTY support" +- default y + ---help--- + A pseudo terminal (PTY) is a software device consisting of two + halves: a master and a slave. 
The slave device behaves identical to +diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c +index 417b81c67fe9..4e9bb7851ab1 100644 +--- a/drivers/tty/tty_io.c ++++ b/drivers/tty/tty_io.c +@@ -171,6 +171,7 @@ static void free_tty_struct(struct tty_struct *tty) + put_device(tty->dev); + kfree(tty->write_buf); + tty->magic = 0xDEADDEAD; ++ put_user_ns(tty->owner_user_ns); + kfree(tty); + } + +@@ -2167,11 +2168,19 @@ static int tty_fasync(int fd, struct file *filp, int on) + * FIXME: may race normal receive processing + */ + ++int tiocsti_restrict = IS_ENABLED(CONFIG_SECURITY_TIOCSTI_RESTRICT); ++ + static int tiocsti(struct tty_struct *tty, char __user *p) + { + char ch, mbz = 0; + struct tty_ldisc *ld; + ++ if (tiocsti_restrict && ++ !ns_capable(tty->owner_user_ns, CAP_SYS_ADMIN)) { ++ dev_warn_ratelimited(tty->dev, ++ "Denied TIOCSTI ioctl for non-privileged process\n"); ++ return -EPERM; ++ } + if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + if (get_user(ch, p)) +@@ -2854,6 +2863,7 @@ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx) + tty->index = idx; + tty_line_name(driver, idx, tty->name); + tty->dev = tty_get_device(tty); ++ tty->owner_user_ns = get_user_ns(current_user_ns()); + + return tty; + } +diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c +index a073cb5be013..e9dfece7b7ce 100644 +--- a/drivers/usb/core/hub.c ++++ b/drivers/usb/core/hub.c +@@ -38,6 +38,8 @@ + #define USB_VENDOR_GENESYS_LOGIC 0x05e3 + #define HUB_QUIRK_CHECK_PORT_AUTOSUSPEND 0x01 + ++extern int deny_new_usb; ++ + /* Protect struct usb_device->state and ->children members + * Note: Both are also protected by ->dev.sem, except that ->state can + * change to USB_STATE_NOTATTACHED even when the semaphore isn't held. 
*/ +@@ -4818,6 +4820,12 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, + goto done; + return; + } ++ ++ if (deny_new_usb) { ++ dev_err(&port_dev->dev, "denied insert of USB device on port %d\n", port1); ++ goto done; ++ } ++ + if (hub_is_superspeed(hub->hdev)) + unit_load = 150; + else +diff --git a/fs/exec.c b/fs/exec.c +index 0da4d748b4e6..69fcee853363 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -62,6 +62,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -321,6 +322,8 @@ static int __bprm_mm_init(struct linux_binprm *bprm) + arch_bprm_mm_init(mm, vma); + up_write(&mm->mmap_sem); + bprm->p = vma->vm_end - sizeof(void *); ++ if (randomize_va_space) ++ bprm->p ^= get_random_int() & ~PAGE_MASK; + return 0; + err: + up_write(&mm->mmap_sem); +diff --git a/fs/namei.c b/fs/namei.c +index d1e467b7b9de..0d96ad71b700 100644 +--- a/fs/namei.c ++++ b/fs/namei.c +@@ -902,10 +902,10 @@ static inline void put_link(struct nameidata *nd) + path_put(&last->link); + } + +-int sysctl_protected_symlinks __read_mostly = 0; +-int sysctl_protected_hardlinks __read_mostly = 0; +-int sysctl_protected_fifos __read_mostly; +-int sysctl_protected_regular __read_mostly; ++int sysctl_protected_symlinks __read_mostly = 1; ++int sysctl_protected_hardlinks __read_mostly = 1; ++int sysctl_protected_fifos __read_mostly = 2; ++int sysctl_protected_regular __read_mostly = 2; + + /** + * may_follow_link - Check symlink following for unsafe situations +diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig +index 5f93cfacb3d1..cea0d7d3b23e 100644 +--- a/fs/nfs/Kconfig ++++ b/fs/nfs/Kconfig +@@ -195,4 +195,3 @@ config NFS_DEBUG + bool + depends on NFS_FS && SUNRPC_DEBUG + select CRC32 +- default y +diff --git a/fs/pipe.c b/fs/pipe.c +index 8ef7d7bef775..b82f305ec13d 100644 +--- a/fs/pipe.c ++++ b/fs/pipe.c +@@ -38,7 +38,7 @@ unsigned int pipe_max_size = 1048576; + /* + * Minimum pipe size, as required by POSIX + */ +-unsigned int pipe_min_size = PAGE_SIZE; ++unsigned int pipe_min_size __read_only = PAGE_SIZE; + + /* Maximum allocatable pages per user. Hard limit is unset by default, soft + * matches default values. +diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig +index 1ade1206bb89..60b0f76dec47 100644 +--- a/fs/proc/Kconfig ++++ b/fs/proc/Kconfig +@@ -39,7 +39,6 @@ config PROC_KCORE + config PROC_VMCORE + bool "/proc/vmcore support" + depends on PROC_FS && CRASH_DUMP +- default y + help + Exports the dump image of crashed kernel in ELF format. 
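The hub_port_connect() hunk above makes enumeration of newly attached USB devices conditional on the global deny_new_usb flag; the flag itself is defined in the kernel/sysctl.c hunk further down and exposed as the sysctl kernel.deny_new_usb (0 or 1, writable only by a CAP_SYS_ADMIN holder). A minimal userspace sketch of toggling it, assuming this patch is applied and CONFIG_USB is enabled; the file name and error handling are illustrative, not part of the patch:

    /* deny_new_usb_toggle.c - hypothetical helper, not part of this patch.
     * Writes 0 or 1 to kernel.deny_new_usb; requires CAP_SYS_ADMIN and a
     * kernel built with this patch and CONFIG_USB=y. */
    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char **argv)
    {
        const char *val = (argc > 1 && argv[1][0] == '0') ? "0" : "1";
        FILE *f = fopen("/proc/sys/kernel/deny_new_usb", "w");

        if (!f) {
            perror("kernel.deny_new_usb"); /* unpatched kernel or no privilege */
            return EXIT_FAILURE;
        }
        fprintf(f, "%s\n", val);
        return fclose(f) == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
    }

With the flag set, hub_port_connect() logs "denied insert of USB device" and bails out through the done label before the device is initialised; clearing it restores normal hot-plug behaviour.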
+ +diff --git a/fs/stat.c b/fs/stat.c +index 873785dae022..d3c2ada8b9c7 100644 +--- a/fs/stat.c ++++ b/fs/stat.c +@@ -40,8 +40,13 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) + stat->gid = inode->i_gid; + stat->rdev = inode->i_rdev; + stat->size = i_size_read(inode); +- stat->atime = inode->i_atime; +- stat->mtime = inode->i_mtime; ++ if (is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) { ++ stat->atime = inode->i_ctime; ++ stat->mtime = inode->i_ctime; ++ } else { ++ stat->atime = inode->i_atime; ++ stat->mtime = inode->i_mtime; ++ } + stat->ctime = inode->i_ctime; + stat->blksize = i_blocksize(inode); + stat->blocks = inode->i_blocks; +@@ -75,9 +80,14 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, + stat->result_mask |= STATX_BASIC_STATS; + request_mask &= STATX_ALL; + query_flags &= KSTAT_QUERY_FLAGS; +- if (inode->i_op->getattr) +- return inode->i_op->getattr(path, stat, request_mask, +- query_flags); ++ if (inode->i_op->getattr) { ++ int retval = inode->i_op->getattr(path, stat, request_mask, query_flags); ++ if (!retval && is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) { ++ stat->atime = stat->ctime; ++ stat->mtime = stat->ctime; ++ } ++ return retval; ++ } + + generic_fillattr(inode, stat); + return 0; +diff --git a/include/linux/cache.h b/include/linux/cache.h +index 750621e41d1c..e7157c18c62c 100644 +--- a/include/linux/cache.h ++++ b/include/linux/cache.h +@@ -31,6 +31,8 @@ + #define __ro_after_init __attribute__((__section__(".data..ro_after_init"))) + #endif + ++#define __read_only __ro_after_init ++ + #ifndef ____cacheline_aligned + #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) + #endif +diff --git a/include/linux/capability.h b/include/linux/capability.h +index f640dcbc880c..2b4f5d651f19 100644 +--- a/include/linux/capability.h ++++ b/include/linux/capability.h +@@ -207,6 +207,7 @@ extern bool has_capability_noaudit(struct task_struct *t, int cap); + extern bool has_ns_capability_noaudit(struct task_struct *t, + struct user_namespace *ns, int cap); + extern bool capable(int cap); ++extern bool capable_noaudit(int cap); + extern bool ns_capable(struct user_namespace *ns, int cap); + extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); + #else +@@ -232,6 +233,10 @@ static inline bool capable(int cap) + { + return true; + } ++static inline bool capable_noaudit(int cap) ++{ ++ return true; ++} + static inline bool ns_capable(struct user_namespace *ns, int cap) + { + return true; +diff --git a/include/linux/fs.h b/include/linux/fs.h +index f6a577edec67..fa3a6caeca6c 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -3383,4 +3383,15 @@ static inline bool dir_relax_shared(struct inode *inode) + extern bool path_noexec(const struct path *path); + extern void inode_nohighmem(struct inode *inode); + ++extern int device_sidechannel_restrict; ++ ++static inline bool is_sidechannel_device(const struct inode *inode) ++{ ++ umode_t mode; ++ if (!device_sidechannel_restrict) ++ return false; ++ mode = inode->i_mode; ++ return ((S_ISCHR(mode) || S_ISBLK(mode)) && (mode & (S_IROTH | S_IWOTH))); ++} ++ + #endif /* _LINUX_FS_H */ +diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h +index bdaf22582f6e..326ff15d4637 100644 +--- a/include/linux/fsnotify.h ++++ b/include/linux/fsnotify.h +@@ -181,6 +181,9 @@ static inline void fsnotify_access(struct file *file) + struct inode *inode = path->dentry->d_inode; + __u32 mask = FS_ACCESS; + ++ if 
(is_sidechannel_device(inode)) ++ return; ++ + if (S_ISDIR(inode->i_mode)) + mask |= FS_ISDIR; + +@@ -199,6 +202,9 @@ static inline void fsnotify_modify(struct file *file) + struct inode *inode = path->dentry->d_inode; + __u32 mask = FS_MODIFY; + ++ if (is_sidechannel_device(inode)) ++ return; ++ + if (S_ISDIR(inode->i_mode)) + mask |= FS_ISDIR; + +diff --git a/include/linux/gfp.h b/include/linux/gfp.h +index b041f94678de..fd8bb5a78b75 100644 +--- a/include/linux/gfp.h ++++ b/include/linux/gfp.h +@@ -518,9 +518,9 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, + extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); + extern unsigned long get_zeroed_page(gfp_t gfp_mask); + +-void *alloc_pages_exact(size_t size, gfp_t gfp_mask); ++void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __attribute__((alloc_size(1))); + void free_pages_exact(void *virt, size_t size); +-void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); ++void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __attribute__((alloc_size(2))); + + #define __get_free_page(gfp_mask) \ + __get_free_pages((gfp_mask), 0) +diff --git a/include/linux/highmem.h b/include/linux/highmem.h +index 776f90f3a1cd..3f5c47000059 100644 +--- a/include/linux/highmem.h ++++ b/include/linux/highmem.h +@@ -191,6 +191,13 @@ static inline void clear_highpage(struct page *page) + kunmap_atomic(kaddr); + } + ++static inline void verify_zero_highpage(struct page *page) ++{ ++ void *kaddr = kmap_atomic(page); ++ BUG_ON(memchr_inv(kaddr, 0, PAGE_SIZE)); ++ kunmap_atomic(kaddr); ++} ++ + static inline void zero_user_segments(struct page *page, + unsigned start1, unsigned end1, + unsigned start2, unsigned end2) +diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h +index 69c238210325..ee487ea4f48f 100644 +--- a/include/linux/interrupt.h ++++ b/include/linux/interrupt.h +@@ -485,7 +485,7 @@ extern const char * const softirq_to_name[NR_SOFTIRQS]; + + struct softirq_action + { +- void (*action)(struct softirq_action *); ++ void (*action)(void); + }; + + asmlinkage void do_softirq(void); +@@ -500,7 +500,7 @@ static inline void do_softirq_own_stack(void) + } + #endif + +-extern void open_softirq(int nr, void (*action)(struct softirq_action *)); ++extern void __init open_softirq(int nr, void (*action)(void)); + extern void softirq_init(void); + extern void __raise_softirq_irqoff(unsigned int nr); + +diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h +index df32d2508290..c992d130b94d 100644 +--- a/include/linux/kobject_ns.h ++++ b/include/linux/kobject_ns.h +@@ -46,7 +46,7 @@ struct kobj_ns_type_operations { + void (*drop_ns)(void *); + }; + +-int kobj_ns_type_register(const struct kobj_ns_type_operations *ops); ++int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops); + int kobj_ns_type_registered(enum kobj_ns_type type); + const struct kobj_ns_type_operations *kobj_child_ns_ops(struct kobject *parent); + const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj); +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 58f2263de4de..e90dc5d98c7f 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -525,7 +525,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) + } + #endif + +-extern void *kvmalloc_node(size_t size, gfp_t flags, int node); ++extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __attribute__((alloc_size(1))); + static inline void *kvmalloc(size_t size, gfp_t flags) 
+ { + return kvmalloc_node(size, flags, NUMA_NO_NODE); +diff --git a/include/linux/percpu.h b/include/linux/percpu.h +index 296bbe49d5d1..b26652c9a98d 100644 +--- a/include/linux/percpu.h ++++ b/include/linux/percpu.h +@@ -129,7 +129,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_populate_pte_fn_t populate_pte_fn); + #endif + +-extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align); ++extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __attribute__((alloc_size(1))); + extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr); + extern bool is_kernel_percpu_address(unsigned long addr); + +@@ -137,8 +137,8 @@ extern bool is_kernel_percpu_address(unsigned long addr); + extern void __init setup_per_cpu_areas(void); + #endif + +-extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); +-extern void __percpu *__alloc_percpu(size_t size, size_t align); ++extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __attribute__((alloc_size(1))); ++extern void __percpu *__alloc_percpu(size_t size, size_t align) __attribute__((alloc_size(1))); + extern void free_percpu(void __percpu *__pdata); + extern phys_addr_t per_cpu_ptr_to_phys(void *addr); + +diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h +index 8e22f24ded6a..b7fecdfa6de5 100644 +--- a/include/linux/perf_event.h ++++ b/include/linux/perf_event.h +@@ -1165,6 +1165,11 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, + int perf_event_max_stack_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + ++static inline bool perf_paranoid_any(void) ++{ ++ return sysctl_perf_event_paranoid > 2; ++} ++ + static inline bool perf_paranoid_tracepoint_raw(void) + { + return sysctl_perf_event_paranoid > -1; +diff --git a/include/linux/slab.h b/include/linux/slab.h +index ae5ed6492d54..fd0786124504 100644 +--- a/include/linux/slab.h ++++ b/include/linux/slab.h +@@ -146,8 +146,8 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *); + /* + * Common kmalloc functions provided by all allocators + */ +-void * __must_check __krealloc(const void *, size_t, gfp_t); +-void * __must_check krealloc(const void *, size_t, gfp_t); ++void * __must_check __krealloc(const void *, size_t, gfp_t) __attribute__((alloc_size(2))); ++void * __must_check krealloc(const void *, size_t, gfp_t) __attribute((alloc_size(2))); + void kfree(const void *); + void kzfree(const void *); + size_t ksize(const void *); +@@ -324,7 +324,7 @@ static __always_inline int kmalloc_index(size_t size) + } + #endif /* !CONFIG_SLOB */ + +-void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc; ++void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc __attribute__((alloc_size(1))); + void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc; + void kmem_cache_free(struct kmem_cache *, void *); + +@@ -348,7 +348,7 @@ static __always_inline void kfree_bulk(size_t size, void **p) + } + + #ifdef CONFIG_NUMA +-void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc; ++void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc __attribute__((alloc_size(1))); + void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc; + #else + static __always_inline void *__kmalloc_node(size_t size, 
gfp_t flags, int node) +@@ -473,7 +473,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) + * for general use, and so are not documented here. For a full list of + * potential flags, always refer to linux/gfp.h. + */ +-static __always_inline void *kmalloc(size_t size, gfp_t flags) ++static __always_inline __attribute__((alloc_size(1))) void *kmalloc(size_t size, gfp_t flags) + { + if (__builtin_constant_p(size)) { + if (size > KMALLOC_MAX_CACHE_SIZE) +@@ -513,7 +513,7 @@ static __always_inline int kmalloc_size(int n) + return 0; + } + +-static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) ++static __always_inline __attribute__((alloc_size(1))) void *kmalloc_node(size_t size, gfp_t flags, int node) + { + #ifndef CONFIG_SLOB + if (__builtin_constant_p(size) && +diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h +index f8ced87a2efe..cd61c8d2aa6e 100644 +--- a/include/linux/slub_def.h ++++ b/include/linux/slub_def.h +@@ -121,6 +121,11 @@ struct kmem_cache { + unsigned long random; + #endif + ++#ifdef CONFIG_SLAB_CANARY ++ unsigned long random_active; ++ unsigned long random_inactive; ++#endif ++ + #ifdef CONFIG_NUMA + /* + * Defragmentation by allocating from a remote node. +diff --git a/include/linux/string.h b/include/linux/string.h +index 96115bf561b4..f93d908c5bbc 100644 +--- a/include/linux/string.h ++++ b/include/linux/string.h +@@ -234,10 +234,16 @@ void __read_overflow2(void) __compiletime_error("detected read beyond size of ob + void __read_overflow3(void) __compiletime_error("detected read beyond size of object passed as 3rd parameter"); + void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter"); + ++#ifdef CONFIG_FORTIFY_SOURCE_STRICT_STRING ++#define __string_size(p) __builtin_object_size(p, 1) ++#else ++#define __string_size(p) __builtin_object_size(p, 0) ++#endif ++ + #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) + __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) + { +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + if (__builtin_constant_p(size) && p_size < size) + __write_overflow(); + if (p_size < size) +@@ -247,7 +253,7 @@ __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) + + __FORTIFY_INLINE char *strcat(char *p, const char *q) + { +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + if (p_size == (size_t)-1) + return __builtin_strcat(p, q); + if (strlcat(p, q, p_size) >= p_size) +@@ -258,7 +264,7 @@ __FORTIFY_INLINE char *strcat(char *p, const char *q) + __FORTIFY_INLINE __kernel_size_t strlen(const char *p) + { + __kernel_size_t ret; +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + + /* Work around gcc excess stack consumption issue */ + if (p_size == (size_t)-1 || +@@ -273,7 +279,7 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) + extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); + __FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen) + { +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? 
maxlen : p_size); + if (p_size <= ret && maxlen != ret) + fortify_panic(__func__); +@@ -285,8 +291,8 @@ extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); + __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) + { + size_t ret; +- size_t p_size = __builtin_object_size(p, 0); +- size_t q_size = __builtin_object_size(q, 0); ++ size_t p_size = __string_size(p); ++ size_t q_size = __string_size(q); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __real_strlcpy(p, q, size); + ret = strlen(q); +@@ -306,8 +312,8 @@ __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) + __FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) + { + size_t p_len, copy_len; +- size_t p_size = __builtin_object_size(p, 0); +- size_t q_size = __builtin_object_size(q, 0); ++ size_t p_size = __string_size(p); ++ size_t q_size = __string_size(q); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __builtin_strncat(p, q, count); + p_len = strlen(p); +@@ -420,8 +426,8 @@ __FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp) + /* defined after fortified strlen and memcpy to reuse them */ + __FORTIFY_INLINE char *strcpy(char *p, const char *q) + { +- size_t p_size = __builtin_object_size(p, 0); +- size_t q_size = __builtin_object_size(q, 0); ++ size_t p_size = __string_size(p); ++ size_t q_size = __string_size(q); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __builtin_strcpy(p, q); + memcpy(p, q, strlen(q) + 1); +diff --git a/include/linux/tty.h b/include/linux/tty.h +index 1dd587ba6d88..9a9a04fb641d 100644 +--- a/include/linux/tty.h ++++ b/include/linux/tty.h +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + + + /* +@@ -335,6 +336,7 @@ struct tty_struct { + /* If the tty has a pending do_SAK, queue it here - akpm */ + struct work_struct SAK_work; + struct tty_port *port; ++ struct user_namespace *owner_user_ns; + } __randomize_layout; + + /* Each of a tty's open files has private_data pointing to tty_file_private */ +@@ -344,6 +346,8 @@ struct tty_file_private { + struct list_head list; + }; + ++extern int tiocsti_restrict; ++ + /* tty magic number */ + #define TTY_MAGIC 0x5401 + +diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h +index 1e5d8c392f15..66d0e49c9987 100644 +--- a/include/linux/vmalloc.h ++++ b/include/linux/vmalloc.h +@@ -68,19 +68,19 @@ static inline void vmalloc_init(void) + } + #endif + +-extern void *vmalloc(unsigned long size); +-extern void *vzalloc(unsigned long size); +-extern void *vmalloc_user(unsigned long size); +-extern void *vmalloc_node(unsigned long size, int node); +-extern void *vzalloc_node(unsigned long size, int node); +-extern void *vmalloc_exec(unsigned long size); +-extern void *vmalloc_32(unsigned long size); +-extern void *vmalloc_32_user(unsigned long size); +-extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); ++extern void *vmalloc(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vzalloc(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_user(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_node(unsigned long size, int node) __attribute__((alloc_size(1))); ++extern void *vzalloc_node(unsigned long size, int node) __attribute__((alloc_size(1))); ++extern void *vmalloc_exec(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_32(unsigned long size) __attribute__((alloc_size(1))); ++extern void 
*vmalloc_32_user(unsigned long size) __attribute__((alloc_size(1))); ++extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) __attribute__((alloc_size(1))); + extern void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags, int node, +- const void *caller); ++ const void *caller) __attribute__((alloc_size(1))); + #ifndef CONFIG_MMU + extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); + static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, +diff --git a/init/Kconfig b/init/Kconfig +index 46075327c165..0c78750bc76d 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -309,6 +309,7 @@ config USELIB + config AUDIT + bool "Auditing support" + depends on NET ++ default y + help + Enable auditing infrastructure that can be used with another + kernel subsystem, such as SELinux (which requires this for +@@ -1052,6 +1053,12 @@ config CC_OPTIMIZE_FOR_SIZE + + endchoice + ++config LOCAL_INIT ++ bool "Zero uninitialized locals" ++ help ++ Zero-fill uninitialized local variables, other than variable-length ++ arrays. Requires compiler support. ++ + config SYSCTL + bool + +@@ -1361,8 +1368,7 @@ config SHMEM + which may be appropriate on small systems without swap. + + config AIO +- bool "Enable AIO support" if EXPERT +- default y ++ bool "Enable AIO support" + help + This option enables POSIX asynchronous I/O which may be used + by some high performance threaded applications. Disabling +@@ -1491,7 +1497,7 @@ config VM_EVENT_COUNTERS + + config SLUB_DEBUG + default y +- bool "Enable SLUB debugging support" if EXPERT ++ bool "Enable SLUB debugging support" + depends on SLUB && SYSFS + help + SLUB has extensive debug support features. Disabling these can +@@ -1515,7 +1521,6 @@ config SLUB_MEMCG_SYSFS_ON + + config COMPAT_BRK + bool "Disable heap randomization" +- default y + help + Randomizing heap placement makes heap exploits harder, but it + also breaks ancient binaries (including anything libc5 based). +@@ -1562,7 +1567,6 @@ endchoice + + config SLAB_MERGE_DEFAULT + bool "Allow slab caches to be merged" +- default y + help + For reduced kernel memory fragmentation, slab caches can be + merged when they share the same size and other characteristics. +@@ -1575,9 +1579,9 @@ config SLAB_MERGE_DEFAULT + command line. + + config SLAB_FREELIST_RANDOM +- default n + depends on SLAB || SLUB + bool "SLAB freelist randomization" ++ default y + help + Randomizes the freelist order used on creating new pages. This + security feature reduces the predictability of the kernel slab +@@ -1586,12 +1590,56 @@ config SLAB_FREELIST_RANDOM + config SLAB_FREELIST_HARDENED + bool "Harden slab freelist metadata" + depends on SLUB ++ default y + help + Many kernel heap attacks try to target slab cache metadata and + other infrastructure. This option makes minor performance + sacrifices to harden the kernel slab allocator against common + freelist exploit methods. + ++config SLAB_HARDENED ++ default y ++ depends on SLUB ++ bool "Hardened SLAB infrastructure" ++ help ++ Make minor performance sacrifices to harden the kernel slab ++ allocator. ++ ++config SLAB_CANARY ++ depends on SLUB ++ depends on !SLAB_MERGE_DEFAULT ++ bool "SLAB canaries" ++ default y ++ help ++ Place canaries at the end of kernel slab allocations, sacrificing ++ some performance and memory usage for security.
++ ++ Canaries can detect some forms of heap corruption when allocations ++ are freed and as part of the HARDENED_USERCOPY feature. It provides ++ basic use-after-free detection for HARDENED_USERCOPY. ++ ++ Canaries absorb small overflows (rendering them harmless), mitigate ++ non-NUL terminated C string overflows on 64-bit via a guaranteed zero ++ byte and provide basic double-free detection. ++ ++config SLAB_SANITIZE ++ bool "Sanitize SLAB allocations" ++ depends on SLUB ++ default y ++ help ++ Zero fill slab allocations on free, reducing the lifetime of ++ sensitive data and helping to mitigate use-after-free bugs. ++ ++ For slabs with debug poisoning enabled, this has no impact. ++ ++config SLAB_SANITIZE_VERIFY ++ depends on SLAB_SANITIZE && PAGE_SANITIZE ++ default y ++ bool "Verify sanitized SLAB allocations" ++ help ++ Verify that newly allocated slab allocations are zeroed to detect ++ write-after-free bugs. ++ + config SLUB_CPU_PARTIAL + default y + depends on SLUB && SMP +diff --git a/kernel/audit.c b/kernel/audit.c +index d301276bca58..d55a1e290cea 100644 +--- a/kernel/audit.c ++++ b/kernel/audit.c +@@ -1575,6 +1575,9 @@ static int __init audit_enable(char *str) + audit_default = !!simple_strtol(str, NULL, 0); + if (!audit_default) + audit_initialized = AUDIT_DISABLED; ++ else ++ audit_initialized = AUDIT_UNINITIALIZED; ++ + audit_enabled = audit_default; + audit_ever_enabled = !!audit_enabled; + +diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c +index d203a5d6b726..2a6c3e2c57a6 100644 +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -539,7 +539,7 @@ void __weak bpf_jit_free(struct bpf_prog *fp) + bpf_prog_unlock_free(fp); + } + +-int bpf_jit_harden __read_mostly; ++int bpf_jit_harden __read_mostly = 2; + + static int bpf_jit_blind_insn(const struct bpf_insn *from, + const struct bpf_insn *aux, +diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c +index 5c9deed4524e..6d90aabecfc7 100644 +--- a/kernel/bpf/syscall.c ++++ b/kernel/bpf/syscall.c +@@ -37,7 +37,7 @@ static DEFINE_SPINLOCK(prog_idr_lock); + static DEFINE_IDR(map_idr); + static DEFINE_SPINLOCK(map_idr_lock); + +-int sysctl_unprivileged_bpf_disabled __read_mostly; ++int sysctl_unprivileged_bpf_disabled __read_mostly = 1; + + static const struct bpf_map_ops * const bpf_map_types[] = { + #define BPF_PROG_TYPE(_id, _ops) +diff --git a/kernel/capability.c b/kernel/capability.c +index 1e1c0236f55b..452062fe45ce 100644 +--- a/kernel/capability.c ++++ b/kernel/capability.c +@@ -431,6 +431,12 @@ bool capable(int cap) + return ns_capable(&init_user_ns, cap); + } + EXPORT_SYMBOL(capable); ++ ++bool capable_noaudit(int cap) ++{ ++ return ns_capable_noaudit(&init_user_ns, cap); ++} ++EXPORT_SYMBOL(capable_noaudit); + #endif /* CONFIG_MULTIUSER */ + + /** +diff --git a/kernel/events/core.c b/kernel/events/core.c +index 991af683ef9e..66f66b648707 100644 +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -397,8 +397,13 @@ static cpumask_var_t perf_online_mask; + * 0 - disallow raw tracepoint access for unpriv + * 1 - disallow cpu events for unpriv + * 2 - disallow kernel profiling for unpriv ++ * 3 - disallow all unpriv perf event use + */ ++#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT ++int sysctl_perf_event_paranoid __read_mostly = 3; ++#else + int sysctl_perf_event_paranoid __read_mostly = 2; ++#endif + + /* Minimum for 512 kiB + 1 user control page */ + int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ +@@ -9984,6 +9989,9 @@ SYSCALL_DEFINE5(perf_event_open, + if
(flags & ~PERF_FLAG_ALL) + return -EINVAL; + ++ if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN)) ++ return -EACCES; ++ + err = perf_copy_attr(attr_uptr, &attr); + if (err) + return err; +diff --git a/kernel/fork.c b/kernel/fork.c +index 6d6ce2c3a364..951a76b3dc32 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -102,6 +102,11 @@ + + #define CREATE_TRACE_POINTS + #include ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#else ++#define unprivileged_userns_clone 0 ++#endif + + /* + * Minimum number of threads to boot the kernel +@@ -1555,6 +1560,10 @@ static __latent_entropy struct task_struct *copy_process( + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + ++ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) ++ if (!capable(CAP_SYS_ADMIN)) ++ return ERR_PTR(-EPERM); ++ + /* + * Thread groups must share signals as well, and detached threads + * can only be started up within the thread group. +@@ -2357,6 +2366,12 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; + ++ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { ++ err = -EPERM; ++ if (!capable(CAP_SYS_ADMIN)) ++ goto bad_unshare_out; ++ } ++ + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; +diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c +index 0972a8e09d08..00dde7aad47a 100644 +--- a/kernel/power/snapshot.c ++++ b/kernel/power/snapshot.c +@@ -1136,7 +1136,7 @@ void free_basic_memory_bitmaps(void) + + void clear_free_pages(void) + { +-#ifdef CONFIG_PAGE_POISONING_ZERO ++#if defined(CONFIG_PAGE_POISONING_ZERO) || defined(CONFIG_PAGE_SANITIZE) + struct memory_bitmap *bm = free_pages_map; + unsigned long pfn; + +@@ -1153,7 +1153,7 @@ void clear_free_pages(void) + } + memory_bm_position_reset(bm); + pr_info("PM: free pages cleared after restore\n"); +-#endif /* PAGE_POISONING_ZERO */ ++#endif /* PAGE_POISONING_ZERO || PAGE_SANITIZE */ + } + + /** +diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c +index a64eee0db39e..4d7de378fe4c 100644 +--- a/kernel/rcu/tiny.c ++++ b/kernel/rcu/tiny.c +@@ -164,7 +164,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) + } + } + +-static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) ++static __latent_entropy void rcu_process_callbacks(void) + { + __rcu_process_callbacks(&rcu_sched_ctrlblk); + __rcu_process_callbacks(&rcu_bh_ctrlblk); +diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c +index 710ce1d6b982..4013b634e820 100644 +--- a/kernel/rcu/tree.c ++++ b/kernel/rcu/tree.c +@@ -2927,7 +2927,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) + /* + * Do RCU core processing for the current CPU. + */ +-static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) ++static __latent_entropy void rcu_process_callbacks(void) + { + struct rcu_state *rsp; + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index f33b24080b1c..99c5e423906f 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8982,7 +8982,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } + * run_rebalance_domains is triggered when needed from the scheduler tick. + * Also triggered for nohz idle balancing (with nohz_balancing_kick set). 
+ */ +-static __latent_entropy void run_rebalance_domains(struct softirq_action *h) ++static __latent_entropy void run_rebalance_domains(void) + { + struct rq *this_rq = this_rq(); + enum cpu_idle_type idle = this_rq->idle_balance ? +diff --git a/kernel/softirq.c b/kernel/softirq.c +index a4c87cf27f9d..efb97a8dc568 100644 +--- a/kernel/softirq.c ++++ b/kernel/softirq.c +@@ -53,7 +53,7 @@ irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned; + EXPORT_SYMBOL(irq_stat); + #endif + +-static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; ++static struct softirq_action softirq_vec[NR_SOFTIRQS] __ro_after_init __aligned(PAGE_SIZE); + + DEFINE_PER_CPU(struct task_struct *, ksoftirqd); + +@@ -285,7 +285,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) + kstat_incr_softirqs_this_cpu(vec_nr); + + trace_softirq_entry(vec_nr); +- h->action(h); ++ h->action(); + trace_softirq_exit(vec_nr); + if (unlikely(prev_count != preempt_count())) { + pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n", +@@ -448,7 +448,7 @@ void __raise_softirq_irqoff(unsigned int nr) + or_softirq_pending(1UL << nr); + } + +-void open_softirq(int nr, void (*action)(struct softirq_action *)) ++void __init open_softirq(int nr, void (*action)(void)) + { + softirq_vec[nr].action = action; + } +@@ -490,7 +490,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) + } + EXPORT_SYMBOL(__tasklet_hi_schedule); + +-static __latent_entropy void tasklet_action(struct softirq_action *a) ++static __latent_entropy void tasklet_action(void) + { + struct tasklet_struct *list; + +@@ -526,7 +526,7 @@ static __latent_entropy void tasklet_action(struct softirq_action *a) + } + } + +-static __latent_entropy void tasklet_hi_action(struct softirq_action *a) ++static __latent_entropy void tasklet_hi_action(void) + { + struct tasklet_struct *list; + +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index d330b1ce3b94..050278b12928 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -66,6 +66,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -98,12 +99,19 @@ + #if defined(CONFIG_SYSCTL) + + /* External variables not in a header file. 
*/ ++#if IS_ENABLED(CONFIG_USB) ++int deny_new_usb __read_mostly = 0; ++EXPORT_SYMBOL(deny_new_usb); ++#endif + extern int suid_dumpable; + #ifdef CONFIG_COREDUMP + extern int core_uses_pid; + extern char core_pattern[]; + extern unsigned int core_pipe_limit; + #endif ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#endif + extern int pid_max; + extern int pid_max_min, pid_max_max; + extern int percpu_pagelist_fraction; +@@ -115,40 +123,43 @@ extern int sysctl_nr_trim_pages; + + /* Constants used for minimum and maximum */ + #ifdef CONFIG_LOCKUP_DETECTOR +-static int sixty = 60; ++static int sixty __read_only = 60; + #endif + +-static int __maybe_unused neg_one = -1; ++static int __maybe_unused neg_one __read_only = -1; + + static int zero; +-static int __maybe_unused one = 1; +-static int __maybe_unused two = 2; +-static int __maybe_unused four = 4; +-static unsigned long one_ul = 1; +-static int one_hundred = 100; +-static int one_thousand = 1000; ++static int __maybe_unused one __read_only = 1; ++static int __maybe_unused two __read_only = 2; ++static int __maybe_unused four __read_only = 4; ++static unsigned long one_ul __read_only = 1; ++static int one_hundred __read_only = 100; ++static int one_thousand __read_only = 1000; + #ifdef CONFIG_PRINTK +-static int ten_thousand = 10000; ++static int ten_thousand __read_only = 10000; + #endif + #ifdef CONFIG_PERF_EVENTS +-static int six_hundred_forty_kb = 640 * 1024; ++static int six_hundred_forty_kb __read_only = 640 * 1024; + #endif + + /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ +-static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; ++static unsigned long dirty_bytes_min __read_only = 2 * PAGE_SIZE; + + /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ +-static int maxolduid = 65535; +-static int minolduid; ++static int maxolduid __read_only = 65535; ++static int minolduid __read_only; + +-static int ngroups_max = NGROUPS_MAX; ++static int ngroups_max __read_only = NGROUPS_MAX; + static const int cap_last_cap = CAP_LAST_CAP; + + /*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */ + #ifdef CONFIG_DETECT_HUNG_TASK +-static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); ++static unsigned long hung_task_timeout_max __read_only = (LONG_MAX/HZ); + #endif + ++int device_sidechannel_restrict __read_mostly = 1; ++EXPORT_SYMBOL(device_sidechannel_restrict); ++ + #ifdef CONFIG_INOTIFY_USER + #include + #endif +@@ -286,19 +297,19 @@ static struct ctl_table sysctl_base_table[] = { + }; + + #ifdef CONFIG_SCHED_DEBUG +-static int min_sched_granularity_ns = 100000; /* 100 usecs */ +-static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ +-static int min_wakeup_granularity_ns; /* 0 usecs */ +-static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ ++static int min_sched_granularity_ns __read_only = 100000; /* 100 usecs */ ++static int max_sched_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */ ++static int min_wakeup_granularity_ns __read_only; /* 0 usecs */ ++static int max_wakeup_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */ + #ifdef CONFIG_SMP +-static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +-static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; ++static int min_sched_tunable_scaling __read_only = SCHED_TUNABLESCALING_NONE; ++static int max_sched_tunable_scaling __read_only = SCHED_TUNABLESCALING_END-1; + #endif /* CONFIG_SMP */ + #endif /* CONFIG_SCHED_DEBUG */ + + 
#ifdef CONFIG_COMPACTION +-static int min_extfrag_threshold; +-static int max_extfrag_threshold = 1000; ++static int min_extfrag_threshold __read_only; ++static int max_extfrag_threshold __read_only = 1000; + #endif + + static struct ctl_table kern_table[] = { +@@ -512,6 +523,15 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_USER_NS ++ { ++ .procname = "unprivileged_userns_clone", ++ .data = &unprivileged_userns_clone, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++#endif + #ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", +@@ -853,6 +873,37 @@ static struct ctl_table kern_table[] = { + .extra1 = &zero, + .extra2 = &two, + }, ++#endif ++#if defined CONFIG_TTY ++ { ++ .procname = "tiocsti_restrict", ++ .data = &tiocsti_restrict, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_minmax_sysadmin, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, ++#endif ++ { ++ .procname = "device_sidechannel_restrict", ++ .data = &device_sidechannel_restrict, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_minmax_sysadmin, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, ++#if IS_ENABLED(CONFIG_USB) ++ { ++ .procname = "deny_new_usb", ++ .data = &deny_new_usb, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_minmax_sysadmin, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, + #endif + { + .procname = "ngroups_max", +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index f17c76a1a05f..50f079d11488 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1640,7 +1640,7 @@ static inline void __run_timers(struct timer_base *base) + /* + * This function runs timers and the timer-tq in bottom half context. + */ +-static __latent_entropy void run_timer_softirq(struct softirq_action *h) ++static __latent_entropy void run_timer_softirq(void) + { + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index ed80a88980f0..ff6d27d06af0 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -24,6 +24,9 @@ + #include + #include + ++/* sysctl */ ++int unprivileged_userns_clone; ++ + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); + +diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug +index 62d0e25c054c..3953072277eb 100644 +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -937,6 +937,7 @@ endmenu # "Debug lockups and hangs" + + config PANIC_ON_OOPS + bool "Panic on Oops" ++ default y + help + Say Y here to enable the kernel to panic when it oopses. This + has the same effect as setting oops=panic on the kernel command +@@ -946,7 +947,7 @@ config PANIC_ON_OOPS + anything erroneous after an oops which could result in data + corruption or other issues. + +- Say N if unsure. ++ Say Y if unsure. + + config PANIC_ON_OOPS_VALUE + int +@@ -1319,6 +1320,7 @@ config DEBUG_BUGVERBOSE + config DEBUG_LIST + bool "Debug linked list manipulation" + depends on DEBUG_KERNEL || BUG_ON_DATA_CORRUPTION ++ default y + help + Enable this to turn on extended checks in the linked-list + walking routines. 
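The tiocsti() hunk in drivers/tty/tty_io.c earlier in this patch, together with the kernel.tiocsti_restrict sysctl registered above, denies the TIOCSTI ioctl (inject a byte into a terminal's input queue) to any caller lacking CAP_SYS_ADMIN in the tty's owning user namespace, recorded at alloc_tty_struct() time. A small sketch of what the check looks like from userspace, assuming a patched kernel with the sysctl set to 1; the test program itself is illustrative, not part of the patch:

    /* tiocsti_probe.c - illustrative test, not part of this patch.
     * With kernel.tiocsti_restrict=1 and no CAP_SYS_ADMIN, the patched
     * tiocsti() returns -EPERM instead of queueing the byte. */
    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    int main(void)
    {
        char c = 'x';

        if (ioctl(STDIN_FILENO, TIOCSTI, &c) == -1) {
            /* EPERM expected for unprivileged callers on a patched kernel */
            printf("TIOCSTI rejected: %s\n", strerror(errno));
            return 1;
        }
        printf("TIOCSTI accepted: 'x' queued as terminal input\n");
        return 0;
    }

Unlike the stock check, which only compares current->signal->tty, this restriction applies even to a process targeting its own controlling terminal, so sandbox escapes that replay commands into a parent shell are blocked by default (CONFIG_SECURITY_TIOCSTI_RESTRICT sets the initial value).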
+@@ -1932,6 +1934,7 @@ config MEMTEST + config BUG_ON_DATA_CORRUPTION + bool "Trigger a BUG when data corruption is detected" + select DEBUG_LIST ++ default y + help + Select this option if the kernel should BUG when it encounters + data corruption in kernel memory structures when they get checked +@@ -1952,7 +1955,7 @@ config STRICT_DEVMEM + bool "Filter access to /dev/mem" + depends on MMU && DEVMEM + depends on ARCH_HAS_DEVMEM_IS_ALLOWED +- default y if TILE || PPC ++ default y + ---help--- + If this option is disabled, you allow userspace (root) access to all + of memory, including kernel and userspace memory. Accidental +@@ -1971,6 +1974,7 @@ config STRICT_DEVMEM + config IO_STRICT_DEVMEM + bool "Filter I/O access to /dev/mem" + depends on STRICT_DEVMEM ++ default y + ---help--- + If this option is disabled, you allow userspace (root) access to all + io-memory regardless of whether a driver is actively using that +diff --git a/lib/irq_poll.c b/lib/irq_poll.c +index 86a709954f5a..6f15787fcb1b 100644 +--- a/lib/irq_poll.c ++++ b/lib/irq_poll.c +@@ -75,7 +75,7 @@ void irq_poll_complete(struct irq_poll *iop) + } + EXPORT_SYMBOL(irq_poll_complete); + +-static void __latent_entropy irq_poll_softirq(struct softirq_action *h) ++static void __latent_entropy irq_poll_softirq(void) + { + struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); + int rearm = 0, budget = irq_poll_budget; +diff --git a/lib/kobject.c b/lib/kobject.c +index bbbb067de8ec..fec2f780cf9b 100644 +--- a/lib/kobject.c ++++ b/lib/kobject.c +@@ -956,9 +956,9 @@ EXPORT_SYMBOL_GPL(kset_create_and_add); + + + static DEFINE_SPINLOCK(kobj_ns_type_lock); +-static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES]; ++static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES] __ro_after_init; + +-int kobj_ns_type_register(const struct kobj_ns_type_operations *ops) ++int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops) + { + enum kobj_ns_type type = ops->type; + int error; +diff --git a/lib/nlattr.c b/lib/nlattr.c +index 3d8295c85505..3fa3b3409d69 100644 +--- a/lib/nlattr.c ++++ b/lib/nlattr.c +@@ -341,6 +341,8 @@ int nla_memcpy(void *dest, const struct nlattr *src, int count) + { + int minlen = min_t(int, count, nla_len(src)); + ++ BUG_ON(minlen < 0); ++ + memcpy(dest, nla_data(src), minlen); + if (count > minlen) + memset(dest + minlen, 0, count - minlen); +diff --git a/lib/vsprintf.c b/lib/vsprintf.c +index 4a990f3fd345..3df8db5af0ba 100644 +--- a/lib/vsprintf.c ++++ b/lib/vsprintf.c +@@ -1588,7 +1588,7 @@ char *device_node_string(char *buf, char *end, struct device_node *dn, + return widen_string(buf, buf - buf_start, end, spec); + } + +-int kptr_restrict __read_mostly; ++int kptr_restrict __read_mostly = 2; + + /* + * Show a '%p' thing. A kernel extension is that the '%p' is followed +diff --git a/mm/Kconfig b/mm/Kconfig +index 59efbd3337e0..c070e14ec83d 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -319,7 +319,8 @@ config KSM + config DEFAULT_MMAP_MIN_ADDR + int "Low address space to protect from user allocation" + depends on MMU +- default 4096 ++ default 32768 if ARM || (ARM64 && COMPAT) ++ default 65536 + help + This is the portion of low virtual memory which should be protected + from userspace allocation. 
Keeping a user from writing to low pages +diff --git a/mm/mmap.c b/mm/mmap.c +index 2398776195d2..a8ffa2223ad1 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -220,6 +220,13 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) + + newbrk = PAGE_ALIGN(brk); + oldbrk = PAGE_ALIGN(mm->brk); ++ /* properly handle unaligned min_brk as an empty heap */ ++ if (min_brk & ~PAGE_MASK) { ++ if (brk == min_brk) ++ newbrk -= PAGE_SIZE; ++ if (mm->brk == min_brk) ++ oldbrk -= PAGE_SIZE; ++ } + if (oldbrk == newbrk) + goto set_brk; + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index a2f365f40433..5e726e59de20 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -67,6 +67,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -98,6 +99,15 @@ int _node_numa_mem_[MAX_NUMNODES]; + DEFINE_MUTEX(pcpu_drain_mutex); + DEFINE_PER_CPU(struct work_struct, pcpu_drain); + ++bool __meminitdata extra_latent_entropy; ++ ++static int __init setup_extra_latent_entropy(char *str) ++{ ++ extra_latent_entropy = true; ++ return 0; ++} ++early_param("extra_latent_entropy", setup_extra_latent_entropy); ++ + #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY + volatile unsigned long latent_entropy __latent_entropy; + EXPORT_SYMBOL(latent_entropy); +@@ -1063,6 +1073,13 @@ static __always_inline bool free_pages_prepare(struct page *page, + debug_check_no_obj_freed(page_address(page), + PAGE_SIZE << order); + } ++ ++ if (IS_ENABLED(CONFIG_PAGE_SANITIZE)) { ++ int i; ++ for (i = 0; i < (1 << order); i++) ++ clear_highpage(page + i); ++ } ++ + arch_free_page(page, order); + kernel_poison_pages(page, 1 << order, 0); + kernel_map_pages(page, 1 << order, 0); +@@ -1278,6 +1295,21 @@ static void __init __free_pages_boot_core(struct page *page, unsigned int order) + __ClearPageReserved(p); + set_page_count(p, 0); + ++ if (extra_latent_entropy && !PageHighMem(page) && page_to_pfn(page) < 0x100000) { ++ unsigned long hash = 0; ++ size_t index, end = PAGE_SIZE * nr_pages / sizeof hash; ++ const unsigned long *data = lowmem_page_address(page); ++ ++ for (index = 0; index < end; index++) ++ hash ^= hash + data[index]; ++#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY ++ latent_entropy ^= hash; ++ add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy)); ++#else ++ add_device_randomness((const void *)&hash, sizeof(hash)); ++#endif ++ } ++ + page_zone(page)->managed_pages += nr_pages; + set_page_refcounted(page); + __free_pages(page, order); +@@ -1718,8 +1750,8 @@ static inline int check_new_page(struct page *page) + + static inline bool free_pages_prezeroed(void) + { +- return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && +- page_poisoning_enabled(); ++ return IS_ENABLED(CONFIG_PAGE_SANITIZE) || ++ (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && page_poisoning_enabled()); + } + + #ifdef CONFIG_DEBUG_VM +@@ -1776,6 +1808,11 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags + + post_alloc_hook(page, order, gfp_flags); + ++ if (IS_ENABLED(CONFIG_PAGE_SANITIZE_VERIFY)) { ++ for (i = 0; i < (1 << order); i++) ++ verify_zero_highpage(page + i); ++ } ++ + if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO)) + for (i = 0; i < (1 << order); i++) + clear_highpage(page + i); +diff --git a/mm/slab.h b/mm/slab.h +index 485d9fbb8802..436461588804 100644 +--- a/mm/slab.h ++++ b/mm/slab.h +@@ -311,7 +311,11 @@ static inline bool is_root_cache(struct kmem_cache *s) + static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) + { ++#ifdef CONFIG_SLAB_HARDENED ++ return p == s; ++#else + 
return true; ++#endif + } + + static inline const char *cache_name(struct kmem_cache *s) +@@ -363,18 +367,26 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) + * to not do even the assignment. In that case, slab_equal_or_root + * will also be a constant. + */ +- if (!memcg_kmem_enabled() && ++ if (!IS_ENABLED(CONFIG_SLAB_HARDENED) && ++ !memcg_kmem_enabled() && + !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS)) + return s; + + page = virt_to_head_page(x); ++#ifdef CONFIG_SLAB_HARDENED ++ BUG_ON(!PageSlab(page)); ++#endif + cachep = page->slab_cache; + if (slab_equal_or_root(cachep, s)) + return cachep; + + pr_err("%s: Wrong slab cache. %s but object is from %s\n", + __func__, s->name, cachep->name); ++#ifdef CONFIG_BUG_ON_DATA_CORRUPTION ++ BUG_ON(1); ++#else + WARN_ON_ONCE(1); ++#endif + return s; + } + +@@ -399,7 +411,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) + * back there or track user information then we can + * only use the space before that information. + */ +- if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) ++ if ((s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) || IS_ENABLED(CONFIG_SLAB_CANARY)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation +diff --git a/mm/slab_common.c b/mm/slab_common.c +index f6764cf162b8..015c8e4df318 100644 +--- a/mm/slab_common.c ++++ b/mm/slab_common.c +@@ -26,10 +26,10 @@ + + #include "slab.h" + +-enum slab_state slab_state; ++enum slab_state slab_state __ro_after_init; + LIST_HEAD(slab_caches); + DEFINE_MUTEX(slab_mutex); +-struct kmem_cache *kmem_cache; ++struct kmem_cache *kmem_cache __ro_after_init; + + static LIST_HEAD(slab_caches_to_rcu_destroy); + static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work); +@@ -49,7 +49,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, + /* + * Merge control. If this is set then no merging of slab caches will occur. + */ +-static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); ++static bool slab_nomerge __ro_after_init = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); + + static int __init setup_slab_nomerge(char *str) + { +@@ -931,7 +931,7 @@ EXPORT_SYMBOL(kmalloc_dma_caches); + * of two cache sizes there. The size of larger slabs can be determined using + * fls. 
+ */ +-static s8 size_index[24] = { ++static s8 size_index[24] __ro_after_init = { + 3, /* 8 */ + 4, /* 16 */ + 5, /* 24 */ +diff --git a/mm/slub.c b/mm/slub.c +index 220d42e592ef..3decf87b1cf2 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -125,6 +125,16 @@ static inline int kmem_cache_debug(struct kmem_cache *s) + #endif + } + ++static inline bool has_sanitize(struct kmem_cache *s) ++{ ++ return IS_ENABLED(CONFIG_SLAB_SANITIZE) && !(s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)); ++} ++ ++static inline bool has_sanitize_verify(struct kmem_cache *s) ++{ ++ return IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && has_sanitize(s); ++} ++ + void *fixup_red_left(struct kmem_cache *s, void *p) + { + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) +@@ -297,6 +307,35 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) + *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr); + } + ++#ifdef CONFIG_SLAB_CANARY ++static inline unsigned long *get_canary(struct kmem_cache *s, void *object) ++{ ++ if (s->offset) ++ return object + s->offset + sizeof(void *); ++ return object + s->inuse; ++} ++ ++static inline unsigned long get_canary_value(const void *canary, unsigned long value) ++{ ++ return (value ^ (unsigned long)canary) & CANARY_MASK; ++} ++ ++static inline void set_canary(struct kmem_cache *s, void *object, unsigned long value) ++{ ++ unsigned long *canary = get_canary(s, object); ++ *canary = get_canary_value(canary, value); ++} ++ ++static inline void check_canary(struct kmem_cache *s, void *object, unsigned long value) ++{ ++ unsigned long *canary = get_canary(s, object); ++ BUG_ON(*canary != get_canary_value(canary, value)); ++} ++#else ++#define set_canary(s, object, value) ++#define check_canary(s, object, value) ++#endif ++ + /* Loop over all objects in a slab */ + #define for_each_object(__p, __s, __addr, __objects) \ + for (__p = fixup_red_left(__s, __addr); \ +@@ -484,13 +523,13 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p) + * Debug settings: + */ + #if defined(CONFIG_SLUB_DEBUG_ON) +-static int slub_debug = DEBUG_DEFAULT_FLAGS; ++static int slub_debug __ro_after_init = DEBUG_DEFAULT_FLAGS; + #else +-static int slub_debug; ++static int slub_debug __ro_after_init; + #endif + +-static char *slub_debug_slabs; +-static int disable_higher_order_debug; ++static char *slub_debug_slabs __ro_after_init; ++static int disable_higher_order_debug __ro_after_init; + + /* + * slub is about to manipulate internal object metadata. This memory lies +@@ -550,6 +589,9 @@ static struct track *get_track(struct kmem_cache *s, void *object, + else + p = object + s->inuse; + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ p = (void *)p + sizeof(void *); ++ + return p + alloc; + } + +@@ -688,6 +730,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) + else + off = s->inuse; + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ off += sizeof(void *); ++ + if (s->flags & SLAB_STORE_USER) + off += 2 * sizeof(struct track); + +@@ -817,6 +862,9 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) + /* Freepointer is placed after the object. 
*/ + off += sizeof(void *); + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ off += sizeof(void *); ++ + if (s->flags & SLAB_STORE_USER) + /* We also have user information there */ + off += 2 * sizeof(struct track); +@@ -1416,8 +1464,9 @@ static void setup_object(struct kmem_cache *s, struct page *page, + void *object) + { + setup_object_debug(s, page, object); ++ set_canary(s, object, s->random_inactive); + kasan_init_slab_obj(s, object); +- if (unlikely(s->ctor)) { ++ if (unlikely(s->ctor) && !has_sanitize_verify(s)) { + kasan_unpoison_object_data(s, object); + s->ctor(object); + kasan_poison_object_data(s, object); +@@ -2717,9 +2766,21 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, + stat(s, ALLOC_FASTPATH); + } + +- if (unlikely(gfpflags & __GFP_ZERO) && object) ++ if (has_sanitize_verify(s) && object) { ++ size_t offset = s->offset ? 0 : sizeof(void *); ++ BUG_ON(memchr_inv(object + offset, 0, s->object_size - offset)); ++ if (s->ctor) ++ s->ctor(object); ++ if (unlikely(gfpflags & __GFP_ZERO) && offset) ++ memset(object, 0, sizeof(void *)); ++ } else if (unlikely(gfpflags & __GFP_ZERO) && object) + memset(object, 0, s->object_size); + ++ if (object) { ++ check_canary(s, object, s->random_inactive); ++ set_canary(s, object, s->random_active); ++ } ++ + slab_post_alloc_hook(s, gfpflags, 1, &object); + + return object; +@@ -2926,6 +2987,27 @@ static __always_inline void do_slab_free(struct kmem_cache *s, + void *tail_obj = tail ? : head; + struct kmem_cache_cpu *c; + unsigned long tid; ++ bool sanitize = has_sanitize(s); ++ ++ if (IS_ENABLED(CONFIG_SLAB_CANARY) || sanitize) { ++ __maybe_unused int offset = s->offset ? 0 : sizeof(void *); ++ void *x = head; ++ ++ while (1) { ++ check_canary(s, x, s->random_active); ++ set_canary(s, x, s->random_inactive); ++ ++ if (sanitize) { ++ memset(x + offset, 0, s->object_size - offset); ++ if (!IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && s->ctor) ++ s->ctor(x); ++ } ++ if (x == tail_obj) ++ break; ++ x = get_freepointer(s, x); ++ } ++ } ++ + redo: + /* + * Determine the currently cpus per cpu slab. +@@ -3104,7 +3186,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) + { + struct kmem_cache_cpu *c; +- int i; ++ int i, k; + + /* memcg and kmem_cache debug support */ + s = slab_pre_alloc_hook(s, flags); +@@ -3141,13 +3223,29 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + local_irq_enable(); + + /* Clear memory outside IRQ disabled fastpath loop */ +- if (unlikely(flags & __GFP_ZERO)) { ++ if (has_sanitize_verify(s)) { ++ int j; ++ ++ for (j = 0; j < i; j++) { ++ size_t offset = s->offset ? 0 : sizeof(void *); ++ BUG_ON(memchr_inv(p[j] + offset, 0, s->object_size - offset)); ++ if (s->ctor) ++ s->ctor(p[j]); ++ if (unlikely(flags & __GFP_ZERO) && offset) ++ memset(p[j], 0, sizeof(void *)); ++ } ++ } else if (unlikely(flags & __GFP_ZERO)) { + int j; + + for (j = 0; j < i; j++) + memset(p[j], 0, s->object_size); + } + ++ for (k = 0; k < i; k++) { ++ check_canary(s, p[k], s->random_inactive); ++ set_canary(s, p[k], s->random_active); ++ } ++ + /* memcg and kmem_cache debug support */ + slab_post_alloc_hook(s, flags, size, p); + return i; +@@ -3179,9 +3277,9 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); + * and increases the number of allocations possible without having to + * take the list_lock. 
+ */ +-static int slub_min_order; +-static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +-static int slub_min_objects; ++static int slub_min_order __ro_after_init; ++static int slub_max_order __ro_after_init = PAGE_ALLOC_COSTLY_ORDER; ++static int slub_min_objects __ro_after_init; + + /* + * Calculate the order of allocation given an slab object size. +@@ -3351,6 +3449,7 @@ static void early_kmem_cache_node_alloc(int node) + init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); + init_tracking(kmem_cache_node, n); + #endif ++ set_canary(kmem_cache_node, n, kmem_cache_node->random_active); + kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), + GFP_KERNEL); + init_kmem_cache_node(n); +@@ -3507,6 +3606,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) + size += sizeof(void *); + } + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ size += sizeof(void *); ++ + #ifdef CONFIG_SLUB_DEBUG + if (flags & SLAB_STORE_USER) + /* +@@ -3577,6 +3679,10 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) + #ifdef CONFIG_SLAB_FREELIST_HARDENED + s->random = get_random_long(); + #endif ++#ifdef CONFIG_SLAB_CANARY ++ s->random_active = get_random_long(); ++ s->random_inactive = get_random_long(); ++#endif + + if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) + s->reserved = sizeof(struct rcu_head); +@@ -3841,6 +3947,8 @@ const char *__check_heap_object(const void *ptr, unsigned long n, + offset -= s->red_left_pad; + } + ++ check_canary(s, (void *)ptr - offset, s->random_active); ++ + /* Allow address range falling entirely within object size. */ + if (offset <= object_size && n <= object_size - offset) + return NULL; +@@ -3859,7 +3967,11 @@ static size_t __ksize(const void *object) + page = virt_to_head_page(object); + + if (unlikely(!PageSlab(page))) { ++#ifdef CONFIG_BUG_ON_DATA_CORRUPTION ++ BUG_ON(!PageCompound(page)); ++#else + WARN_ON(!PageCompound(page)); ++#endif + return PAGE_SIZE << compound_order(page); + } + +@@ -4724,7 +4836,7 @@ enum slab_stat_type { + #define SO_TOTAL (1 << SL_TOTAL) + + #ifdef CONFIG_MEMCG +-static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); ++static bool memcg_sysfs_enabled __ro_after_init = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); + + static int __init setup_slub_memcg_sysfs(char *str) + { +diff --git a/mm/swap.c b/mm/swap.c +index a77d68f2c1b6..d1f1d75f4d1f 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -92,6 +92,13 @@ static void __put_compound_page(struct page *page) + if (!PageHuge(page)) + __page_cache_release(page); + dtor = get_compound_page_dtor(page); ++ if (!PageHuge(page)) ++ BUG_ON(dtor != free_compound_page ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++ && dtor != free_transhuge_page ++#endif ++ ); ++ + (*dtor)(page); + } + +diff --git a/net/core/dev.c b/net/core/dev.c +index 4337450a5fdb..5a3c7d217719 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -4117,7 +4117,7 @@ int netif_rx_ni(struct sk_buff *skb) + } + EXPORT_SYMBOL(netif_rx_ni); + +-static __latent_entropy void net_tx_action(struct softirq_action *h) ++static __latent_entropy void net_tx_action(void) + { + struct softnet_data *sd = this_cpu_ptr(&softnet_data); + +@@ -5635,7 +5635,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) + return work; + } + +-static __latent_entropy void net_rx_action(struct softirq_action *h) ++static __latent_entropy void net_rx_action(void) + { + struct softnet_data *sd = this_cpu_ptr(&softnet_data); + unsigned long time_limit = jiffies + +diff --git a/net/ipv4/Kconfig 
b/net/ipv4/Kconfig +index f48fe6fc7e8c..d78c52835c08 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -261,6 +261,7 @@ config IP_PIMSM_V2 + + config SYN_COOKIES + bool "IP: TCP syncookie support" ++ default y + ---help--- + Normal TCP/IP networking is open to an attack known as "SYN + flooding". This denial-of-service attack prevents legitimate remote +diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c +index 18bc8738e989..d2866f6dd736 100644 +--- a/scripts/mod/modpost.c ++++ b/scripts/mod/modpost.c +@@ -37,6 +37,7 @@ static int vmlinux_section_warnings = 1; + static int warn_unresolved = 0; + /* How a symbol is exported */ + static int sec_mismatch_count = 0; ++static int writable_fptr_count = 0; + static int sec_mismatch_verbose = 1; + static int sec_mismatch_fatal = 0; + /* ignore missing files */ +@@ -965,6 +966,7 @@ enum mismatch { + ANY_EXIT_TO_ANY_INIT, + EXPORT_TO_INIT_EXIT, + EXTABLE_TO_NON_TEXT, ++ DATA_TO_TEXT + }; + + /** +@@ -1091,6 +1093,12 @@ static const struct sectioncheck sectioncheck[] = { + .good_tosec = {ALL_TEXT_SECTIONS , NULL}, + .mismatch = EXTABLE_TO_NON_TEXT, + .handler = extable_mismatch_handler, ++}, ++/* Do not reference code from writable data */ ++{ ++ .fromsec = { DATA_SECTIONS, NULL }, ++ .bad_tosec = { ALL_TEXT_SECTIONS, NULL }, ++ .mismatch = DATA_TO_TEXT + } + }; + +@@ -1240,10 +1248,10 @@ static Elf_Sym *find_elf_symbol(struct elf_info *elf, Elf64_Sword addr, + continue; + if (ELF_ST_TYPE(sym->st_info) == STT_SECTION) + continue; +- if (sym->st_value == addr) +- return sym; + /* Find a symbol nearby - addr are maybe negative */ + d = sym->st_value - addr; ++ if (d == 0) ++ return sym; + if (d < 0) + d = addr - sym->st_value; + if (d < distance) { +@@ -1402,7 +1410,11 @@ static void report_sec_mismatch(const char *modname, + char *prl_from; + char *prl_to; + +- sec_mismatch_count++; ++ if (mismatch->mismatch == DATA_TO_TEXT) ++ writable_fptr_count++; ++ else ++ sec_mismatch_count++; ++ + if (!sec_mismatch_verbose) + return; + +@@ -1526,6 +1538,14 @@ static void report_sec_mismatch(const char *modname, + fatal("There's a special handler for this mismatch type, " + "we should never get here."); + break; ++ case DATA_TO_TEXT: ++#if 0 ++ fprintf(stderr, ++ "The %s %s:%s references\n" ++ "the %s %s:%s%s\n", ++ from, fromsec, fromsym, to, tosec, tosym, to_p); ++#endif ++ break; + } + fprintf(stderr, "\n"); + } +@@ -2539,6 +2559,14 @@ int main(int argc, char **argv) + } + } + free(buf.p); ++ if (writable_fptr_count) { ++ if (!sec_mismatch_verbose) { ++ warn("modpost: Found %d writable function pointer(s).\n" ++ "To see full details build your kernel with:\n" ++ "'make CONFIG_DEBUG_SECTION_MISMATCH=y'\n", ++ writable_fptr_count); ++ } ++ } + + return err; + } +diff --git a/security/Kconfig b/security/Kconfig +index 87f2a6f842fd..7bdbb7edf5bf 100644 +--- a/security/Kconfig ++++ b/security/Kconfig +@@ -8,7 +8,7 @@ source security/keys/Kconfig + + config SECURITY_DMESG_RESTRICT + bool "Restrict unprivileged access to the kernel syslog" +- default n ++ default y + help + This enforces restrictions on unprivileged users reading the kernel + syslog via dmesg(8). +@@ -18,10 +18,34 @@ config SECURITY_DMESG_RESTRICT + + If you are unsure how to answer this question, answer N. 
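+With SECURITY_DMESG_RESTRICT defaulted to y above, unprivileged reads of the
+kernel ring buffer are refused unless the caller holds CAP_SYSLOG (the runtime
+knob is the kernel.dmesg_restrict sysctl). A minimal userspace probe of that
+behaviour might look like the following; this is an illustrative sketch, not
+part of the patch:
+
+	/* Hypothetical probe, not part of this commit: with
+	 * SECURITY_DMESG_RESTRICT=y (kernel.dmesg_restrict=1) an
+	 * unprivileged syslog(2) read should fail with EPERM. */
+	#include <errno.h>
+	#include <stdio.h>
+	#include <string.h>
+	#include <sys/klog.h>
+
+	int main(void)
+	{
+		char buf[4096];
+		/* 3 == SYSLOG_ACTION_READ_ALL: read the whole ring buffer */
+		int n = klogctl(3, buf, sizeof(buf));
+
+		if (n < 0 && errno == EPERM)
+			printf("dmesg restricted: need CAP_SYSLOG\n");
+		else if (n < 0)
+			printf("klogctl: %s\n", strerror(errno));
+		else
+			printf("read %d bytes of kernel log\n", n);
+		return 0;
+	}
+
+Run as a regular user on a kernel built with this option, the EPERM branch
+should be the outcome; root (or a CAP_SYSLOG holder) can still read the log.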
+ ++config SECURITY_PERF_EVENTS_RESTRICT ++ bool "Restrict unprivileged use of performance events" ++ depends on PERF_EVENTS ++ default y ++ help ++ If you say Y here, the kernel.perf_event_paranoid sysctl ++ will be set to 3 by default, and no unprivileged use of the ++ perf_event_open syscall will be permitted unless it is ++ changed. ++ ++config SECURITY_TIOCSTI_RESTRICT ++ bool "Restrict unprivileged use of tiocsti command injection" ++ default y ++ help ++ This enforces restrictions on unprivileged users injecting commands ++ into other processes which share a tty session using the TIOCSTI ++ ioctl. This option makes TIOCSTI use require CAP_SYS_ADMIN. ++ ++ If this option is not selected, no restrictions will be enforced ++ unless the tiocsti_restrict sysctl is explicitly set to (1). ++ ++ If you are unsure how to answer this question, answer N. ++ + config SECURITY + bool "Enable different security models" + depends on SYSFS + depends on MULTIUSER ++ default y + help + This allows you to choose different security modules to be + configured into your kernel. +@@ -48,6 +72,7 @@ config SECURITYFS + config SECURITY_NETWORK + bool "Socket and Networking Security Hooks" + depends on SECURITY ++ default y + help + This enables the socket and networking security hooks. + If enabled, a security module can use these hooks to +@@ -155,6 +180,7 @@ config HARDENED_USERCOPY + depends on HAVE_HARDENED_USERCOPY_ALLOCATOR + select BUG + imply STRICT_DEVMEM ++ default y + help + This option checks for obviously wrong memory regions when + copying memory to/from the kernel (via copy_to_user() and +@@ -178,10 +204,36 @@ config HARDENED_USERCOPY_PAGESPAN + config FORTIFY_SOURCE + bool "Harden common str/mem functions against buffer overflows" + depends on ARCH_HAS_FORTIFY_SOURCE ++ default y + help + Detect overflows of buffers in common string and memory functions + where the compiler can determine and validate the buffer sizes. + ++config FORTIFY_SOURCE_STRICT_STRING ++ bool "Harden common functions against buffer overflows" ++ depends on FORTIFY_SOURCE ++ depends on EXPERT ++ help ++ Perform stricter overflow checks catching overflows within objects ++ for common C string functions rather than only between objects. ++ ++ This is not yet intended for production use, only bug finding. ++ ++config PAGE_SANITIZE ++ bool "Sanitize pages" ++ default y ++ help ++ Zero fill page allocations on free, reducing the lifetime of ++ sensitive data and helping to mitigate use-after-free bugs. ++ ++config PAGE_SANITIZE_VERIFY ++ bool "Verify sanitized pages" ++ depends on PAGE_SANITIZE ++ default y ++ help ++ Verify that newly allocated pages are zeroed to detect ++ write-after-free bugs. ++ + config STATIC_USERMODEHELPER + bool "Force all usermode helper calls through a single binary" + help +diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig +index 8af7a690eb40..6539694b0fd3 100644 +--- a/security/selinux/Kconfig ++++ b/security/selinux/Kconfig +@@ -2,7 +2,7 @@ config SECURITY_SELINUX + bool "NSA SELinux Support" + depends on SECURITY_NETWORK && AUDIT && NET && INET + select NETWORK_SECMARK +- default n ++ default y + help + This selects NSA Security-Enhanced Linux (SELinux). + You will also need a policy configuration and a labeled filesystem. +@@ -79,23 +79,3 @@ config SECURITY_SELINUX_AVC_STATS + This option collects access vector cache statistics to + /selinux/avc/cache_stats, which may be monitored via + tools such as avcstat. 
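+The SECURITY_TIOCSTI_RESTRICT entry added above targets the classic tty
+command-injection trick: any process holding an fd to a tty can use the
+TIOCSTI ioctl to fake keyboard input on it. A hypothetical test program (not
+part of the patch set) could look like this; with the restriction active the
+ioctl should fail with EPERM unless the caller has CAP_SYS_ADMIN:
+
+	/* Hypothetical demo, not part of this commit: push one byte into
+	 * the controlling tty's input queue as if it had been typed. */
+	#include <errno.h>
+	#include <stdio.h>
+	#include <string.h>
+	#include <sys/ioctl.h>
+
+	int main(void)
+	{
+		char c = '#';
+
+		if (ioctl(0, TIOCSTI, &c) < 0)
+			printf("TIOCSTI rejected: %s\n", strerror(errno));
+		else
+			printf("injected '%c' into stdin's tty\n", c);
+		return 0;
+	}
+
+stdin must be a tty for the call to be meaningful; on an unrestricted kernel
+the injected byte shows up in the shell's input queue as if typed.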
+- +-config SECURITY_SELINUX_CHECKREQPROT_VALUE +- int "NSA SELinux checkreqprot default value" +- depends on SECURITY_SELINUX +- range 0 1 +- default 0 +- help +- This option sets the default value for the 'checkreqprot' flag +- that determines whether SELinux checks the protection requested +- by the application or the protection that will be applied by the +- kernel (including any implied execute for read-implies-exec) for +- mmap and mprotect calls. If this option is set to 0 (zero), +- SELinux will default to checking the protection that will be applied +- by the kernel. If this option is set to 1 (one), SELinux will +- default to checking the protection requested by the application. +- The checkreqprot flag may be changed from the default via the +- 'checkreqprot=' boot parameter. It may also be changed at runtime +- via /selinux/checkreqprot if authorized by policy. +- +- If you are unsure how to answer this question, answer 0. +diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h +index 1649cd18eb0b..067f35559aa7 100644 +--- a/security/selinux/include/objsec.h ++++ b/security/selinux/include/objsec.h +@@ -150,6 +150,6 @@ struct pkey_security_struct { + u32 sid; /* SID of pkey */ + }; + +-extern unsigned int selinux_checkreqprot; ++extern const unsigned int selinux_checkreqprot; + + #endif /* _SELINUX_OBJSEC_H_ */ +diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c +index 00eed842c491..8f7b8d7e6f91 100644 +--- a/security/selinux/selinuxfs.c ++++ b/security/selinux/selinuxfs.c +@@ -41,16 +41,7 @@ + #include "objsec.h" + #include "conditional.h" + +-unsigned int selinux_checkreqprot = CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE; +- +-static int __init checkreqprot_setup(char *str) +-{ +- unsigned long checkreqprot; +- if (!kstrtoul(str, 0, &checkreqprot)) +- selinux_checkreqprot = checkreqprot ? 1 : 0; +- return 1; +-} +-__setup("checkreqprot=", checkreqprot_setup); ++const unsigned int selinux_checkreqprot; + + static DEFINE_MUTEX(sel_mutex); + +@@ -610,10 +601,9 @@ static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, + return PTR_ERR(page); + + length = -EINVAL; +- if (sscanf(page, "%u", &new_value) != 1) ++ if (sscanf(page, "%u", &new_value) != 1 || new_value) + goto out; + +- selinux_checkreqprot = new_value ? 
1 : 0;
+ 	length = count;
+ out:
+ 	kfree(page);
+diff --git a/security/yama/Kconfig b/security/yama/Kconfig
+index 96b27405558a..485c1b85c325 100644
+--- a/security/yama/Kconfig
++++ b/security/yama/Kconfig
+@@ -1,7 +1,7 @@
+ config SECURITY_YAMA
+ 	bool "Yama support"
+ 	depends on SECURITY
+-	default n
++	default y
+ 	help
+ 	  This selects Yama, which extends DAC support with additional
+ 	  system-wide security settings beyond regular Linux discretionary
diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-mute-pps_state_mismatch.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-mute-pps_state_mismatch.patch
new file mode 100644
index 00000000..dc1d254b
--- /dev/null
+++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-mute-pps_state_mismatch.patch
@@ -0,0 +1,16 @@
+diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c
+index 09f274419..595bc5844 100644
+--- a/drivers/gpu/drm/i915/intel_dp.c
++++ b/drivers/gpu/drm/i915/intel_dp.c
+@@ -5249,7 +5249,10 @@ intel_pps_verify_state(struct drm_i915_private *dev_priv,
+ 
+ 	if (hw.t1_t3 != sw->t1_t3 || hw.t8 != sw->t8 || hw.t9 != sw->t9 ||
+ 	    hw.t10 != sw->t10 || hw.t11_t12 != sw->t11_t12) {
+-		DRM_ERROR("PPS state mismatch\n");
++		/* seems buggy on 4.14.x .. mute it for now,
++		 * even if this is not a real solution ..
++		 * DRM_ERROR("PPS state mismatch\n");
++		 */
+ 		intel_pps_dump_state("sw", sw);
+ 		intel_pps_dump_state("hw", &hw);
+ 	}
diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-redcore-lts-amd64.config b/sys-kernel/linux-sources-redcore-lts/files/4.14-redcore-lts-amd64.config
new file mode 100644
index 00000000..23e35863
--- /dev/null
+++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-redcore-lts-amd64.config
@@ -0,0 +1,9106 @@
+#
+# Automatically generated file; DO NOT EDIT.
+# Linux/x86 4.14.90-redcore-lts Kernel Configuration +# +CONFIG_64BIT=y +CONFIG_X86_64=y +CONFIG_X86=y +CONFIG_INSTRUCTION_DECODER=y +CONFIG_OUTPUT_FORMAT="elf64-x86-64" +CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig" +CONFIG_LOCKDEP_SUPPORT=y +CONFIG_STACKTRACE_SUPPORT=y +CONFIG_MMU=y +CONFIG_ARCH_MMAP_RND_BITS_MIN=28 +CONFIG_ARCH_MMAP_RND_BITS_MAX=32 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16 +CONFIG_NEED_DMA_MAP_STATE=y +CONFIG_NEED_SG_DMA_LENGTH=y +CONFIG_GENERIC_ISA_DMA=y +CONFIG_GENERIC_BUG=y +CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y +CONFIG_GENERIC_HWEIGHT=y +CONFIG_ARCH_MAY_HAVE_PC_FDC=y +CONFIG_RWSEM_XCHGADD_ALGORITHM=y +CONFIG_GENERIC_CALIBRATE_DELAY=y +CONFIG_ARCH_HAS_CPU_RELAX=y +CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y +CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y +CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y +CONFIG_ARCH_HIBERNATION_POSSIBLE=y +CONFIG_ARCH_SUSPEND_POSSIBLE=y +CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y +CONFIG_ARCH_WANT_GENERAL_HUGETLB=y +CONFIG_ZONE_DMA32=y +CONFIG_AUDIT_ARCH=y +CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y +CONFIG_HAVE_INTEL_TXT=y +CONFIG_X86_64_SMP=y +CONFIG_ARCH_SUPPORTS_UPROBES=y +CONFIG_FIX_EARLYCON_MEM=y +CONFIG_PGTABLE_LEVELS=4 +CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" +CONFIG_IRQ_WORK=y +CONFIG_BUILDTIME_EXTABLE_SORT=y +CONFIG_THREAD_INFO_IN_TASK=y + +# +# General setup +# +CONFIG_SCHED_MUQSS=y +CONFIG_INIT_ENV_ARG_LIMIT=32 +CONFIG_CROSS_COMPILE="" +# CONFIG_COMPILE_TEST is not set +CONFIG_LOCALVERSION="" +CONFIG_LOCALVERSION_AUTO=y +CONFIG_HAVE_KERNEL_GZIP=y +CONFIG_HAVE_KERNEL_BZIP2=y +CONFIG_HAVE_KERNEL_LZMA=y +CONFIG_HAVE_KERNEL_XZ=y +CONFIG_HAVE_KERNEL_LZO=y +CONFIG_HAVE_KERNEL_LZ4=y +# CONFIG_KERNEL_GZIP is not set +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set +# CONFIG_KERNEL_XZ is not set +# CONFIG_KERNEL_LZO is not set +CONFIG_KERNEL_LZ4=y +CONFIG_DEFAULT_HOSTNAME="(none)" +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_SYSVIPC_SYSCTL=y +CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y +CONFIG_CROSS_MEMORY_ATTACH=y +CONFIG_FHANDLE=y +# CONFIG_USELIB is not set +CONFIG_AUDIT=y +CONFIG_HAVE_ARCH_AUDITSYSCALL=y +CONFIG_AUDITSYSCALL=y +CONFIG_AUDIT_WATCH=y +CONFIG_AUDIT_TREE=y + +# +# IRQ subsystem +# +CONFIG_GENERIC_IRQ_PROBE=y +CONFIG_GENERIC_IRQ_SHOW=y +CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y +CONFIG_GENERIC_PENDING_IRQ=y +CONFIG_GENERIC_IRQ_MIGRATION=y +CONFIG_GENERIC_IRQ_CHIP=y +CONFIG_IRQ_DOMAIN=y +CONFIG_IRQ_SIM=y +CONFIG_IRQ_DOMAIN_HIERARCHY=y +CONFIG_GENERIC_MSI_IRQ=y +CONFIG_GENERIC_MSI_IRQ_DOMAIN=y +# CONFIG_IRQ_DOMAIN_DEBUG is not set +CONFIG_IRQ_FORCED_THREADING=y +CONFIG_FORCE_IRQ_THREADING=y +CONFIG_SPARSE_IRQ=y +# CONFIG_GENERIC_IRQ_DEBUGFS is not set +CONFIG_CLOCKSOURCE_WATCHDOG=y +CONFIG_ARCH_CLOCKSOURCE_DATA=y +CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y +CONFIG_GENERIC_TIME_VSYSCALL=y +CONFIG_GENERIC_CLOCKEVENTS=y +CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y +CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y +CONFIG_GENERIC_CMOS_UPDATE=y + +# +# Timers subsystem +# +CONFIG_TICK_ONESHOT=y +CONFIG_HZ_PERIODIC=y +# CONFIG_NO_HZ_IDLE is not set +# CONFIG_NO_HZ_FULL is not set +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y + +# +# CPU/Task time and stats accounting +# +CONFIG_VIRT_CPU_ACCOUNTING=y +# CONFIG_TICK_CPU_ACCOUNTING is not set +CONFIG_VIRT_CPU_ACCOUNTING_GEN=y +CONFIG_IRQ_TIME_ACCOUNTING=y +CONFIG_BSD_PROCESS_ACCT=y +# CONFIG_BSD_PROCESS_ACCT_V3 is not set +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y 
+CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y + +# +# RCU Subsystem +# +CONFIG_PREEMPT_RCU=y +# CONFIG_RCU_EXPERT is not set +CONFIG_SRCU=y +CONFIG_TREE_SRCU=y +CONFIG_TASKS_RCU=y +CONFIG_RCU_STALL_COMMON=y +CONFIG_RCU_NEED_SEGCBLIST=y +CONFIG_CONTEXT_TRACKING=y +# CONFIG_CONTEXT_TRACKING_FORCE is not set +CONFIG_BUILD_BIN2C=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_LOG_BUF_SHIFT=17 +CONFIG_LOG_CPU_MAX_BUF_SHIFT=13 +CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=13 +CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y +CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y +CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y +CONFIG_ARCH_SUPPORTS_INT128=y +CONFIG_CGROUPS=y +CONFIG_PAGE_COUNTER=y +CONFIG_MEMCG=y +CONFIG_MEMCG_SWAP=y +CONFIG_MEMCG_SWAP_ENABLED=y +CONFIG_BLK_CGROUP=y +# CONFIG_DEBUG_BLK_CGROUP is not set +CONFIG_CGROUP_WRITEBACK=y +CONFIG_CGROUP_SCHED=y +CONFIG_CGROUP_PIDS=y +# CONFIG_CGROUP_RDMA is not set +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_HUGETLB=y +CONFIG_CPUSETS=y +CONFIG_PROC_PID_CPUSET=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_PERF=y +CONFIG_CGROUP_BPF=y +# CONFIG_CGROUP_DEBUG is not set +CONFIG_SOCK_CGROUP_DATA=y +# CONFIG_CHECKPOINT_RESTORE is not set +CONFIG_NAMESPACES=y +CONFIG_UTS_NS=y +CONFIG_IPC_NS=y +CONFIG_USER_NS=y +CONFIG_PID_NS=y +CONFIG_NET_NS=y +# CONFIG_SYSFS_DEPRECATED is not set +CONFIG_RELAY=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +CONFIG_RD_BZIP2=y +CONFIG_RD_LZMA=y +CONFIG_RD_XZ=y +CONFIG_RD_LZO=y +CONFIG_RD_LZ4=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set +# CONFIG_LOCAL_INIT is not set +CONFIG_SYSCTL=y +CONFIG_ANON_INODES=y +CONFIG_HAVE_UID16=y +CONFIG_SYSCTL_EXCEPTION_TRACE=y +CONFIG_HAVE_PCSPKR_PLATFORM=y +CONFIG_BPF=y +# CONFIG_EXPERT is not set +CONFIG_UID16=y +CONFIG_MULTIUSER=y +CONFIG_SGETMASK_SYSCALL=y +CONFIG_SYSFS_SYSCALL=y +# CONFIG_SYSCTL_SYSCALL is not set +CONFIG_POSIX_TIMERS=y +CONFIG_KALLSYMS=y +# CONFIG_KALLSYMS_ALL is not set +CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y +CONFIG_KALLSYMS_BASE_RELATIVE=y +CONFIG_PRINTK=y +CONFIG_PRINTK_NMI=y +CONFIG_BUG=y +CONFIG_ELF_CORE=y +CONFIG_PCSPKR_PLATFORM=y +CONFIG_BASE_FULL=y +CONFIG_FUTEX=y +CONFIG_FUTEX_PI=y +CONFIG_EPOLL=y +CONFIG_SIGNALFD=y +CONFIG_TIMERFD=y +CONFIG_EVENTFD=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_SHMEM=y +CONFIG_AIO=y +CONFIG_ADVISE_SYSCALLS=y +CONFIG_USERFAULTFD=y +CONFIG_PCI_QUIRKS=y +CONFIG_MEMBARRIER=y +# CONFIG_EMBEDDED is not set +CONFIG_HAVE_PERF_EVENTS=y +# CONFIG_PC104 is not set + +# +# Kernel Performance Events And Counters +# +CONFIG_PERF_EVENTS=y +# CONFIG_DEBUG_PERF_USE_VMALLOC is not set +CONFIG_VM_EVENT_COUNTERS=y +CONFIG_SLUB_DEBUG=y +# CONFIG_SLUB_MEMCG_SYSFS_ON is not set +# CONFIG_COMPAT_BRK is not set +# CONFIG_SLAB is not set +CONFIG_SLUB=y +CONFIG_SLAB_MERGE_DEFAULT=y +CONFIG_SLAB_FREELIST_RANDOM=y +CONFIG_SLAB_FREELIST_HARDENED=y +CONFIG_SLAB_HARDENED=y +CONFIG_SLAB_SANITIZE=y +CONFIG_SLAB_SANITIZE_VERIFY=y +CONFIG_SLUB_CPU_PARTIAL=y +CONFIG_SYSTEM_DATA_VERIFICATION=y +CONFIG_PROFILING=y +CONFIG_TRACEPOINTS=y +CONFIG_CRASH_CORE=y +CONFIG_KEXEC_CORE=y +CONFIG_HOTPLUG_SMT=y +# CONFIG_OPROFILE is not set +CONFIG_HAVE_OPROFILE=y +CONFIG_OPROFILE_NMI_TIMER=y +CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y +# CONFIG_STATIC_KEYS_SELFTEST is not set +CONFIG_OPTPROBES=y +CONFIG_UPROBES=y +# CONFIG_HAVE_64BIT_ALIGNED_ACCESS is not set +CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y +CONFIG_ARCH_USE_BUILTIN_BSWAP=y +CONFIG_KRETPROBES=y +CONFIG_USER_RETURN_NOTIFIER=y +CONFIG_HAVE_IOREMAP_PROT=y +CONFIG_HAVE_KPROBES=y 
+CONFIG_HAVE_KRETPROBES=y +CONFIG_HAVE_OPTPROBES=y +CONFIG_HAVE_KPROBES_ON_FTRACE=y +CONFIG_HAVE_NMI=y +CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_HAVE_DMA_CONTIGUOUS=y +CONFIG_GENERIC_SMP_IDLE_THREAD=y +CONFIG_ARCH_HAS_FORTIFY_SOURCE=y +CONFIG_ARCH_HAS_SET_MEMORY=y +CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y +CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y +CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +CONFIG_HAVE_HW_BREAKPOINT=y +CONFIG_HAVE_MIXED_BREAKPOINTS_REGS=y +CONFIG_HAVE_USER_RETURN_NOTIFIER=y +CONFIG_HAVE_PERF_EVENTS_NMI=y +CONFIG_HAVE_HARDLOCKUP_DETECTOR_PERF=y +CONFIG_HAVE_PERF_REGS=y +CONFIG_HAVE_PERF_USER_STACK_DUMP=y +CONFIG_HAVE_ARCH_JUMP_LABEL=y +CONFIG_HAVE_RCU_TABLE_FREE=y +CONFIG_HAVE_RCU_TABLE_INVALIDATE=y +CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y +CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y +CONFIG_HAVE_CMPXCHG_LOCAL=y +CONFIG_HAVE_CMPXCHG_DOUBLE=y +CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION=y +CONFIG_ARCH_WANT_OLD_COMPAT_IPC=y +CONFIG_HAVE_ARCH_SECCOMP_FILTER=y +CONFIG_SECCOMP_FILTER=y +CONFIG_HAVE_GCC_PLUGINS=y +# CONFIG_GCC_PLUGINS is not set +CONFIG_HAVE_CC_STACKPROTECTOR=y +CONFIG_CC_STACKPROTECTOR=y +# CONFIG_CC_STACKPROTECTOR_NONE is not set +# CONFIG_CC_STACKPROTECTOR_REGULAR is not set +CONFIG_CC_STACKPROTECTOR_STRONG=y +CONFIG_THIN_ARCHIVES=y +CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES=y +CONFIG_HAVE_CONTEXT_TRACKING=y +CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y +CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=y +CONFIG_HAVE_ARCH_HUGE_VMAP=y +CONFIG_HAVE_ARCH_SOFT_DIRTY=y +CONFIG_HAVE_MOD_ARCH_SPECIFIC=y +CONFIG_MODULES_USE_ELF_RELA=y +CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK=y +CONFIG_ARCH_HAS_ELF_RANDOMIZE=y +CONFIG_HAVE_ARCH_MMAP_RND_BITS=y +CONFIG_HAVE_EXIT_THREAD=y +CONFIG_ARCH_MMAP_RND_BITS=32 +CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y +CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16 +CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES=y +CONFIG_HAVE_COPY_THREAD_TLS=y +CONFIG_HAVE_STACK_VALIDATION=y +# CONFIG_HAVE_ARCH_HASH is not set +# CONFIG_ISA_BUS_API is not set +CONFIG_OLD_SIGSUSPEND3=y +CONFIG_COMPAT_OLD_SIGACTION=y +# CONFIG_CPU_NO_EFFICIENT_FFS is not set +CONFIG_HAVE_ARCH_VMAP_STACK=y +CONFIG_VMAP_STACK=y +# CONFIG_ARCH_OPTIONAL_KERNEL_RWX is not set +# CONFIG_ARCH_OPTIONAL_KERNEL_RWX_DEFAULT is not set +CONFIG_ARCH_HAS_STRICT_KERNEL_RWX=y +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y +CONFIG_STRICT_MODULE_RWX=y +CONFIG_ARCH_HAS_REFCOUNT=y +CONFIG_REFCOUNT_FULL=y + +# +# GCOV-based kernel profiling +# +# CONFIG_GCOV_KERNEL is not set +CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y +# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set +CONFIG_SLABINFO=y +CONFIG_RT_MUTEXES=y +CONFIG_BASE_SMALL=0 +CONFIG_MODULES=y +CONFIG_MODULE_FORCE_LOAD=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_FORCE_UNLOAD=y +CONFIG_MODVERSIONS=y +CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_MODULE_SIG=y +# CONFIG_MODULE_SIG_FORCE is not set +CONFIG_MODULE_SIG_ALL=y +# CONFIG_MODULE_SIG_SHA1 is not set +# CONFIG_MODULE_SIG_SHA224 is not set +# CONFIG_MODULE_SIG_SHA256 is not set +# CONFIG_MODULE_SIG_SHA384 is not set +CONFIG_MODULE_SIG_SHA512=y +CONFIG_MODULE_SIG_HASH="sha512" +CONFIG_MODULE_COMPRESS=y +CONFIG_MODULE_COMPRESS_GZIP=y +# CONFIG_MODULE_COMPRESS_XZ is not set +# CONFIG_TRIM_UNUSED_KSYMS is not set +CONFIG_MODULES_TREE_LOOKUP=y +CONFIG_BLOCK=y +CONFIG_BLK_SCSI_REQUEST=y +CONFIG_BLK_DEV_BSG=y +CONFIG_BLK_DEV_BSGLIB=y +CONFIG_BLK_DEV_INTEGRITY=y +CONFIG_BLK_DEV_ZONED=y +CONFIG_BLK_DEV_THROTTLING=y +# CONFIG_BLK_DEV_THROTTLING_LOW is not set +CONFIG_BLK_CMDLINE_PARSER=y 
+CONFIG_BLK_WBT=y +CONFIG_BLK_WBT_SQ=y +CONFIG_BLK_WBT_MQ=y +CONFIG_BLK_DEBUG_FS=y +# CONFIG_BLK_SED_OPAL is not set + +# +# Partition Types +# +CONFIG_PARTITION_ADVANCED=y +# CONFIG_ACORN_PARTITION is not set +# CONFIG_AIX_PARTITION is not set +# CONFIG_OSF_PARTITION is not set +# CONFIG_AMIGA_PARTITION is not set +# CONFIG_ATARI_PARTITION is not set +# CONFIG_MAC_PARTITION is not set +CONFIG_MSDOS_PARTITION=y +# CONFIG_BSD_DISKLABEL is not set +# CONFIG_MINIX_SUBPARTITION is not set +# CONFIG_SOLARIS_X86_PARTITION is not set +# CONFIG_UNIXWARE_DISKLABEL is not set +CONFIG_LDM_PARTITION=y +CONFIG_LDM_DEBUG=y +# CONFIG_SGI_PARTITION is not set +# CONFIG_ULTRIX_PARTITION is not set +# CONFIG_SUN_PARTITION is not set +# CONFIG_KARMA_PARTITION is not set +CONFIG_EFI_PARTITION=y +# CONFIG_SYSV68_PARTITION is not set +CONFIG_CMDLINE_PARTITION=y +CONFIG_BLOCK_COMPAT=y +CONFIG_BLK_MQ_PCI=y +CONFIG_BLK_MQ_VIRTIO=y +CONFIG_BLK_MQ_RDMA=y + +# +# IO Schedulers +# +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_DEADLINE=y +CONFIG_IOSCHED_CFQ=y +CONFIG_CFQ_GROUP_IOSCHED=y +CONFIG_IOSCHED_BFQ_SQ=y +CONFIG_BFQ_SQ_GROUP_IOSCHED=y +# CONFIG_DEFAULT_DEADLINE is not set +# CONFIG_DEFAULT_CFQ is not set +CONFIG_DEFAULT_BFQ_SQ=y +# CONFIG_DEFAULT_NOOP is not set +CONFIG_DEFAULT_IOSCHED="bfq-sq" +CONFIG_MQ_IOSCHED_BFQ=y +CONFIG_MQ_BFQ_GROUP_IOSCHED=y +CONFIG_MQ_IOSCHED_DEADLINE=y +# CONFIG_MQ_IOSCHED_KYBER is not set +CONFIG_IOSCHED_BFQ=y +CONFIG_BFQ_GROUP_IOSCHED=y +CONFIG_PREEMPT_NOTIFIERS=y +CONFIG_PADATA=y +CONFIG_ASN1=y +CONFIG_UNINLINE_SPIN_UNLOCK=y +CONFIG_ARCH_SUPPORTS_ATOMIC_RMW=y +CONFIG_MUTEX_SPIN_ON_OWNER=y +CONFIG_RWSEM_SPIN_ON_OWNER=y +CONFIG_LOCK_SPIN_ON_OWNER=y +CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y +CONFIG_QUEUED_SPINLOCKS=y +CONFIG_ARCH_USE_QUEUED_RWLOCKS=y +CONFIG_QUEUED_RWLOCKS=y +CONFIG_FREEZER=y + +# +# Processor type and features +# +CONFIG_ZONE_DMA=y +CONFIG_SMP=y +CONFIG_X86_FEATURE_NAMES=y +CONFIG_X86_FAST_FEATURE_TESTS=y +CONFIG_X86_X2APIC=y +CONFIG_X86_MPPARSE=y +# CONFIG_GOLDFISH is not set +CONFIG_RETPOLINE=y +CONFIG_INTEL_RDT=y +# CONFIG_X86_EXTENDED_PLATFORM is not set +CONFIG_X86_INTEL_LPSS=y +CONFIG_X86_AMD_PLATFORM_DEVICE=y +CONFIG_IOSF_MBI=y +# CONFIG_IOSF_MBI_DEBUG is not set +CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y +CONFIG_SCHED_OMIT_FRAME_POINTER=y +CONFIG_HYPERVISOR_GUEST=y +CONFIG_PARAVIRT=y +# CONFIG_PARAVIRT_DEBUG is not set +# CONFIG_PARAVIRT_SPINLOCKS is not set +# CONFIG_XEN is not set +CONFIG_KVM_GUEST=y +# CONFIG_KVM_DEBUG_FS is not set +# CONFIG_PARAVIRT_TIME_ACCOUNTING is not set +CONFIG_PARAVIRT_CLOCK=y +CONFIG_NO_BOOTMEM=y +# CONFIG_MK8 is not set +# CONFIG_MPSC is not set +# CONFIG_MCORE2 is not set +# CONFIG_MATOM is not set +CONFIG_GENERIC_CPU=y +CONFIG_X86_INTERNODE_CACHE_SHIFT=6 +CONFIG_X86_L1_CACHE_SHIFT=6 +CONFIG_X86_TSC=y +CONFIG_X86_CMPXCHG64=y +CONFIG_X86_CMOV=y +CONFIG_X86_MINIMUM_CPU_FAMILY=64 +CONFIG_X86_DEBUGCTLMSR=y +CONFIG_CPU_SUP_INTEL=y +CONFIG_CPU_SUP_AMD=y +CONFIG_CPU_SUP_CENTAUR=y +CONFIG_HPET_TIMER=y +CONFIG_HPET_EMULATE_RTC=y +CONFIG_DMI=y +CONFIG_GART_IOMMU=y +CONFIG_CALGARY_IOMMU=y +CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y +CONFIG_SWIOTLB=y +CONFIG_IOMMU_HELPER=y +CONFIG_MAXSMP=y +CONFIG_NR_CPUS=8192 +CONFIG_SCHED_SMT=y +CONFIG_SMT_NICE=y +CONFIG_SCHED_MC=y +CONFIG_SCHED_MC_PRIO=y +# CONFIG_PREEMPT_NONE is not set +# CONFIG_PREEMPT_VOLUNTARY is not set +CONFIG_PREEMPT=y +CONFIG_PREEMPT_COUNT=y +CONFIG_X86_LOCAL_APIC=y +CONFIG_X86_IO_APIC=y +CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y +CONFIG_X86_MCE=y +# CONFIG_X86_MCELOG_LEGACY is not set 
+CONFIG_X86_MCE_INTEL=y +CONFIG_X86_MCE_AMD=y +CONFIG_X86_MCE_THRESHOLD=y +# CONFIG_X86_MCE_INJECT is not set +CONFIG_X86_THERMAL_VECTOR=y + +# +# Performance monitoring +# +CONFIG_PERF_EVENTS_INTEL_UNCORE=y +CONFIG_PERF_EVENTS_INTEL_RAPL=y +CONFIG_PERF_EVENTS_INTEL_CSTATE=y +CONFIG_PERF_EVENTS_AMD_POWER=m +# CONFIG_VM86 is not set +# CONFIG_X86_16BIT is not set +CONFIG_X86_VSYSCALL_EMULATION=y +CONFIG_I8K=m +CONFIG_MICROCODE=y +CONFIG_MICROCODE_INTEL=y +CONFIG_MICROCODE_AMD=y +CONFIG_MICROCODE_OLD_INTERFACE=y +CONFIG_X86_MSR=m +CONFIG_X86_CPUID=m +# CONFIG_X86_5LEVEL is not set +CONFIG_ARCH_PHYS_ADDR_T_64BIT=y +CONFIG_ARCH_DMA_ADDR_T_64BIT=y +CONFIG_X86_DIRECT_GBPAGES=y +CONFIG_ARCH_HAS_MEM_ENCRYPT=y +CONFIG_AMD_MEM_ENCRYPT=y +# CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT is not set +CONFIG_ARCH_USE_MEMREMAP_PROT=y +CONFIG_NUMA=y +CONFIG_AMD_NUMA=y +CONFIG_X86_64_ACPI_NUMA=y +CONFIG_NODES_SPAN_OTHER_NODES=y +# CONFIG_NUMA_EMU is not set +CONFIG_NODES_SHIFT=10 +CONFIG_ARCH_SPARSEMEM_ENABLE=y +CONFIG_ARCH_SPARSEMEM_DEFAULT=y +CONFIG_ARCH_SELECT_MEMORY_MODEL=y +CONFIG_ARCH_MEMORY_PROBE=y +CONFIG_ARCH_PROC_KCORE_TEXT=y +CONFIG_ILLEGAL_POINTER_VALUE=0xdead000000000000 +CONFIG_SELECT_MEMORY_MODEL=y +CONFIG_SPARSEMEM_MANUAL=y +CONFIG_SPARSEMEM=y +CONFIG_NEED_MULTIPLE_NODES=y +CONFIG_HAVE_MEMORY_PRESENT=y +CONFIG_SPARSEMEM_EXTREME=y +CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y +CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER=y +CONFIG_SPARSEMEM_VMEMMAP=y +CONFIG_HAVE_MEMBLOCK=y +CONFIG_HAVE_MEMBLOCK_NODE_MAP=y +CONFIG_HAVE_GENERIC_GUP=y +CONFIG_ARCH_DISCARD_MEMBLOCK=y +CONFIG_MEMORY_ISOLATION=y +CONFIG_HAVE_BOOTMEM_INFO_NODE=y +CONFIG_MEMORY_HOTPLUG=y +CONFIG_MEMORY_HOTPLUG_SPARSE=y +# CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE is not set +CONFIG_MEMORY_HOTREMOVE=y +CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y +CONFIG_MEMORY_BALLOON=y +CONFIG_BALLOON_COMPACTION=y +CONFIG_COMPACTION=y +CONFIG_MIGRATION=y +CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y +CONFIG_ARCH_ENABLE_THP_MIGRATION=y +CONFIG_PHYS_ADDR_T_64BIT=y +CONFIG_BOUNCE=y +CONFIG_VIRT_TO_BUS=y +CONFIG_MMU_NOTIFIER=y +CONFIG_KSM=y +CONFIG_UKSM=y +# CONFIG_KSM_LEGACY is not set +CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 +CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y +CONFIG_MEMORY_FAILURE=y +# CONFIG_HWPOISON_INJECT is not set +CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y +# CONFIG_TRANSPARENT_HUGEPAGE_MADVISE is not set +CONFIG_ARCH_WANTS_THP_SWAP=y +CONFIG_THP_SWAP=y +CONFIG_TRANSPARENT_HUGE_PAGECACHE=y +CONFIG_CLEANCACHE=y +CONFIG_FRONTSWAP=y +CONFIG_CMA=y +# CONFIG_CMA_DEBUG is not set +# CONFIG_CMA_DEBUGFS is not set +CONFIG_CMA_AREAS=7 +# CONFIG_ZSWAP is not set +CONFIG_ZPOOL=m +CONFIG_ZBUD=m +CONFIG_Z3FOLD=m +CONFIG_ZSMALLOC=y +# CONFIG_PGTABLE_MAPPING is not set +# CONFIG_ZSMALLOC_STAT is not set +CONFIG_GENERIC_EARLY_IOREMAP=y +CONFIG_ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT=y +# CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set +# CONFIG_IDLE_PAGE_TRACKING is not set +CONFIG_ARCH_HAS_ZONE_DEVICE=y +# CONFIG_ZONE_DEVICE is not set +CONFIG_FRAME_VECTOR=y +CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y +CONFIG_ARCH_HAS_PKEYS=y +# CONFIG_PERCPU_STATS is not set +CONFIG_X86_PMEM_LEGACY_DEVICE=y +CONFIG_X86_PMEM_LEGACY=m +CONFIG_X86_CHECK_BIOS_CORRUPTION=y +CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y +CONFIG_X86_RESERVE_LOW=64 +CONFIG_MTRR=y +CONFIG_MTRR_SANITIZER=y +CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=0 +CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=1 +CONFIG_X86_PAT=y +CONFIG_ARCH_USES_PG_UNCACHED=y +CONFIG_ARCH_RANDOM=y +CONFIG_X86_SMAP=y 
+CONFIG_X86_INTEL_MPX=y +CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y +CONFIG_EFI=y +CONFIG_EFI_STUB=y +CONFIG_EFI_MIXED=y +CONFIG_SECCOMP=y +CONFIG_HZ_100=y +# CONFIG_HZ_250_NODEF is not set +# CONFIG_HZ_300_NODEF is not set +# CONFIG_HZ_1000_NODEF is not set +CONFIG_HZ=100 +CONFIG_SCHED_HRTICK=y +CONFIG_KEXEC=y +# CONFIG_CRASH_DUMP is not set +# CONFIG_KEXEC_JUMP is not set +CONFIG_PHYSICAL_START=0x1000000 +CONFIG_RELOCATABLE=y +CONFIG_RANDOMIZE_BASE=y +CONFIG_X86_NEED_RELOCS=y +CONFIG_PHYSICAL_ALIGN=0x1000000 +CONFIG_RANDOMIZE_MEMORY=y +CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING=0xa +CONFIG_HOTPLUG_CPU=y +CONFIG_BOOTPARAM_HOTPLUG_CPU0=y +# CONFIG_DEBUG_HOTPLUG_CPU0 is not set +# CONFIG_COMPAT_VDSO is not set +CONFIG_LEGACY_VSYSCALL_NATIVE=y +# CONFIG_LEGACY_VSYSCALL_EMULATE is not set +# CONFIG_LEGACY_VSYSCALL_NONE is not set +# CONFIG_CMDLINE_BOOL is not set +CONFIG_MODIFY_LDT_SYSCALL=y +CONFIG_HAVE_LIVEPATCH=y +CONFIG_ARCH_HAS_ADD_PAGES=y +CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y +CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y +CONFIG_USE_PERCPU_NUMA_NODE_ID=y + +# +# Power management and ACPI options +# +CONFIG_ARCH_HIBERNATION_HEADER=y +CONFIG_SUSPEND=y +CONFIG_SUSPEND_FREEZER=y +CONFIG_HIBERNATE_CALLBACKS=y +CONFIG_HIBERNATION=y +CONFIG_PM_STD_PARTITION="" +CONFIG_PM_SLEEP=y +CONFIG_PM_SLEEP_SMP=y +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=100 +CONFIG_PM_WAKELOCKS_GC=y +CONFIG_PM=y +# CONFIG_PM_DEBUG is not set +CONFIG_PM_OPP=y +CONFIG_PM_CLK=y +CONFIG_PM_GENERIC_DOMAINS=y +# CONFIG_WQ_POWER_EFFICIENT_DEFAULT is not set +CONFIG_PM_GENERIC_DOMAINS_SLEEP=y +CONFIG_ACPI=y +CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y +CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y +CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y +# CONFIG_ACPI_DEBUGGER is not set +CONFIG_ACPI_SLEEP=y +# CONFIG_ACPI_PROCFS_POWER is not set +CONFIG_ACPI_REV_OVERRIDE_POSSIBLE=y +# CONFIG_ACPI_EC_DEBUGFS is not set +CONFIG_ACPI_AC=m +CONFIG_ACPI_BATTERY=m +CONFIG_ACPI_BUTTON=m +CONFIG_ACPI_VIDEO=m +CONFIG_ACPI_FAN=m +CONFIG_ACPI_DOCK=y +CONFIG_ACPI_CPU_FREQ_PSS=y +CONFIG_ACPI_PROCESSOR_CSTATE=y +CONFIG_ACPI_PROCESSOR_IDLE=y +CONFIG_ACPI_CPPC_LIB=y +CONFIG_ACPI_PROCESSOR=y +CONFIG_ACPI_IPMI=m +CONFIG_ACPI_HOTPLUG_CPU=y +CONFIG_ACPI_PROCESSOR_AGGREGATOR=m +CONFIG_ACPI_THERMAL=m +CONFIG_ACPI_NUMA=y +# CONFIG_ACPI_CUSTOM_DSDT is not set +CONFIG_ARCH_HAS_ACPI_TABLE_UPGRADE=y +CONFIG_ACPI_TABLE_UPGRADE=y +# CONFIG_ACPI_DEBUG is not set +CONFIG_ACPI_PCI_SLOT=y +CONFIG_X86_PM_TIMER=y +CONFIG_ACPI_CONTAINER=y +CONFIG_ACPI_HOTPLUG_MEMORY=y +CONFIG_ACPI_HOTPLUG_IOAPIC=y +CONFIG_ACPI_SBS=m +CONFIG_ACPI_HED=y +# CONFIG_ACPI_CUSTOM_METHOD is not set +CONFIG_ACPI_BGRT=y +# CONFIG_ACPI_REDUCED_HARDWARE_ONLY is not set +CONFIG_ACPI_NFIT=m +CONFIG_HAVE_ACPI_APEI=y +CONFIG_HAVE_ACPI_APEI_NMI=y +CONFIG_ACPI_APEI=y +CONFIG_ACPI_APEI_GHES=y +CONFIG_ACPI_APEI_PCIEAER=y +CONFIG_ACPI_APEI_MEMORY_FAILURE=y +# CONFIG_ACPI_APEI_EINJ is not set +# CONFIG_ACPI_APEI_ERST_DEBUG is not set +CONFIG_DPTF_POWER=m +CONFIG_ACPI_WATCHDOG=y +CONFIG_ACPI_EXTLOG=m +CONFIG_PMIC_OPREGION=y +# CONFIG_XPOWER_PMIC_OPREGION is not set +# CONFIG_BXT_WC_PMIC_OPREGION is not set +CONFIG_ACPI_CONFIGFS=m +CONFIG_SFI=y + +# +# CPU Frequency scaling +# +CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_GOV_ATTR_SET=y +CONFIG_CPU_FREQ_GOV_COMMON=y +CONFIG_CPU_FREQ_STAT=y +CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE 
is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set +CONFIG_CPU_FREQ_GOV_PERFORMANCE=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=y +CONFIG_CPU_FREQ_GOV_USERSPACE=y +CONFIG_CPU_FREQ_GOV_ONDEMAND=y +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y +CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y + +# +# CPU frequency scaling drivers +# +CONFIG_X86_INTEL_PSTATE=y +CONFIG_X86_PCC_CPUFREQ=m +CONFIG_X86_ACPI_CPUFREQ=m +CONFIG_X86_ACPI_CPUFREQ_CPB=y +CONFIG_X86_POWERNOW_K8=m +CONFIG_X86_AMD_FREQ_SENSITIVITY=m +# CONFIG_X86_SPEEDSTEP_CENTRINO is not set +# CONFIG_X86_P4_CLOCKMOD is not set + +# +# shared options +# +# CONFIG_X86_SPEEDSTEP_LIB is not set + +# +# CPU Idle +# +CONFIG_CPU_IDLE=y +CONFIG_CPU_IDLE_GOV_LADDER=y +CONFIG_CPU_IDLE_GOV_MENU=y +# CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED is not set +CONFIG_INTEL_IDLE=y + +# +# Bus options (PCI etc.) +# +CONFIG_PCI=y +CONFIG_PCI_DIRECT=y +CONFIG_PCI_MMCONFIG=y +CONFIG_PCI_DOMAINS=y +CONFIG_PCIEPORTBUS=y +CONFIG_HOTPLUG_PCI_PCIE=y +CONFIG_PCIEAER=y +CONFIG_PCIE_ECRC=y +CONFIG_PCIEAER_INJECT=m +CONFIG_PCIEASPM=y +# CONFIG_PCIEASPM_DEBUG is not set +CONFIG_PCIEASPM_DEFAULT=y +# CONFIG_PCIEASPM_POWERSAVE is not set +# CONFIG_PCIEASPM_POWER_SUPERSAVE is not set +# CONFIG_PCIEASPM_PERFORMANCE is not set +CONFIG_PCIE_PME=y +CONFIG_PCIE_DPC=y +CONFIG_PCIE_PTM=y +CONFIG_PCI_BUS_ADDR_T_64BIT=y +CONFIG_PCI_MSI=y +CONFIG_PCI_MSI_IRQ_DOMAIN=y +# CONFIG_PCI_DEBUG is not set +CONFIG_PCI_REALLOC_ENABLE_AUTO=y +CONFIG_PCI_STUB=m +CONFIG_HT_IRQ=y +CONFIG_PCI_ATS=y +CONFIG_PCI_LOCKLESS_CONFIG=y +CONFIG_PCI_IOV=y +CONFIG_PCI_PRI=y +CONFIG_PCI_PASID=y +CONFIG_PCI_LABEL=y +CONFIG_PCI_HYPERV=m +CONFIG_HOTPLUG_PCI=y +CONFIG_HOTPLUG_PCI_ACPI=y +CONFIG_HOTPLUG_PCI_ACPI_IBM=m +CONFIG_HOTPLUG_PCI_CPCI=y +CONFIG_HOTPLUG_PCI_CPCI_ZT5550=m +CONFIG_HOTPLUG_PCI_CPCI_GENERIC=m +CONFIG_HOTPLUG_PCI_SHPC=m + +# +# DesignWare PCI Core Support +# +# CONFIG_PCIE_DW_PLAT is not set + +# +# PCI host controller drivers +# +CONFIG_VMD=m + +# +# PCI Endpoint +# +# CONFIG_PCI_ENDPOINT is not set + +# +# PCI switch controller drivers +# +CONFIG_PCI_SW_SWITCHTEC=m +CONFIG_ISA_DMA_API=y +CONFIG_AMD_NB=y +CONFIG_PCCARD=m +CONFIG_PCMCIA=m +CONFIG_PCMCIA_LOAD_CIS=y +CONFIG_CARDBUS=y + +# +# PC-card bridges +# +CONFIG_YENTA=m +CONFIG_YENTA_O2=y +CONFIG_YENTA_RICOH=y +CONFIG_YENTA_TI=y +CONFIG_YENTA_ENE_TUNE=y +CONFIG_YENTA_TOSHIBA=y +CONFIG_PD6729=m +CONFIG_I82092=m +CONFIG_PCCARD_NONSTATIC=y +CONFIG_RAPIDIO=y +CONFIG_RAPIDIO_TSI721=y +CONFIG_RAPIDIO_DISC_TIMEOUT=30 +CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS=y +CONFIG_RAPIDIO_DMA_ENGINE=y +# CONFIG_RAPIDIO_DEBUG is not set +CONFIG_RAPIDIO_ENUM_BASIC=m +CONFIG_RAPIDIO_CHMAN=m +CONFIG_RAPIDIO_MPORT_CDEV=m + +# +# RapidIO Switch drivers +# +CONFIG_RAPIDIO_TSI57X=y +CONFIG_RAPIDIO_CPS_XX=y +CONFIG_RAPIDIO_TSI568=y +CONFIG_RAPIDIO_CPS_GEN2=y +CONFIG_RAPIDIO_RXS_GEN3=m +CONFIG_X86_SYSFB=y + +# +# Executable file formats / Emulations +# +CONFIG_BINFMT_ELF=y +CONFIG_COMPAT_BINFMT_ELF=y +CONFIG_ELFCORE=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y +CONFIG_BINFMT_SCRIPT=y +# CONFIG_HAVE_AOUT is not set +CONFIG_BINFMT_MISC=y +CONFIG_COREDUMP=y +CONFIG_IA32_EMULATION=y +CONFIG_IA32_AOUT=y +CONFIG_X86_X32=y +CONFIG_COMPAT_32=y +CONFIG_COMPAT=y +CONFIG_COMPAT_FOR_U64_ALIGNMENT=y +CONFIG_SYSVIPC_COMPAT=y +CONFIG_X86_DEV_DMA_OPS=y +CONFIG_NET=y +CONFIG_COMPAT_NETLINK_MESSAGES=y +CONFIG_NET_INGRESS=y +CONFIG_NET_EGRESS=y + +# +# Networking options +# +CONFIG_PACKET=m +CONFIG_PACKET_DIAG=m +CONFIG_UNIX=m +CONFIG_UNIX_DIAG=m +CONFIG_TLS=m +CONFIG_XFRM=y +CONFIG_XFRM_OFFLOAD=y +CONFIG_XFRM_ALGO=m 
+CONFIG_XFRM_USER=m +CONFIG_XFRM_SUB_POLICY=y +CONFIG_XFRM_MIGRATE=y +CONFIG_XFRM_STATISTICS=y +CONFIG_XFRM_IPCOMP=m +CONFIG_NET_KEY=m +CONFIG_NET_KEY_MIGRATE=y +CONFIG_SMC=m +CONFIG_SMC_DIAG=m +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_FIB_TRIE_STATS=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_IP_ROUTE_CLASSID=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y +CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE_DEMUX=m +CONFIG_NET_IP_TUNNEL=m +CONFIG_NET_IPGRE=m +CONFIG_NET_IPGRE_BROADCAST=y +CONFIG_IP_MROUTE=y +CONFIG_IP_MROUTE_MULTIPLE_TABLES=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_NET_IPVTI=m +CONFIG_NET_UDP_TUNNEL=m +CONFIG_NET_FOU=m +CONFIG_NET_FOU_IP_TUNNELS=y +CONFIG_INET_AH=m +CONFIG_INET_ESP=m +CONFIG_INET_ESP_OFFLOAD=m +CONFIG_INET_IPCOMP=m +CONFIG_INET_XFRM_TUNNEL=m +CONFIG_INET_TUNNEL=m +CONFIG_INET_XFRM_MODE_TRANSPORT=m +CONFIG_INET_XFRM_MODE_TUNNEL=m +CONFIG_INET_XFRM_MODE_BEET=m +CONFIG_INET_DIAG=m +CONFIG_INET_TCP_DIAG=m +CONFIG_INET_UDP_DIAG=m +# CONFIG_INET_RAW_DIAG is not set +# CONFIG_INET_DIAG_DESTROY is not set +CONFIG_TCP_CONG_ADVANCED=y +CONFIG_TCP_CONG_BIC=m +CONFIG_TCP_CONG_CUBIC=m +CONFIG_TCP_CONG_WESTWOOD=m +CONFIG_TCP_CONG_HTCP=m +CONFIG_TCP_CONG_HSTCP=m +CONFIG_TCP_CONG_HYBLA=m +CONFIG_TCP_CONG_VEGAS=m +CONFIG_TCP_CONG_NV=m +CONFIG_TCP_CONG_SCALABLE=m +CONFIG_TCP_CONG_LP=m +CONFIG_TCP_CONG_VENO=m +CONFIG_TCP_CONG_YEAH=m +CONFIG_TCP_CONG_ILLINOIS=m +CONFIG_TCP_CONG_DCTCP=m +# CONFIG_TCP_CONG_CDG is not set +CONFIG_TCP_CONG_BBR=m +CONFIG_DEFAULT_RENO=y +CONFIG_DEFAULT_TCP_CONG="reno" +CONFIG_TCP_MD5SIG=y +CONFIG_IPV6=m +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=m +CONFIG_INET6_ESP=m +CONFIG_INET6_ESP_OFFLOAD=m +CONFIG_INET6_IPCOMP=m +CONFIG_IPV6_MIP6=m +CONFIG_IPV6_ILA=m +CONFIG_INET6_XFRM_TUNNEL=m +CONFIG_INET6_TUNNEL=m +CONFIG_INET6_XFRM_MODE_TRANSPORT=m +CONFIG_INET6_XFRM_MODE_TUNNEL=m +CONFIG_INET6_XFRM_MODE_BEET=m +CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m +CONFIG_IPV6_VTI=m +CONFIG_IPV6_SIT=m +CONFIG_IPV6_SIT_6RD=y +CONFIG_IPV6_NDISC_NODETYPE=y +CONFIG_IPV6_TUNNEL=m +CONFIG_IPV6_GRE=m +CONFIG_IPV6_FOU=m +CONFIG_IPV6_FOU_TUNNEL=m +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_SUBTREES=y +CONFIG_IPV6_MROUTE=y +CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=y +CONFIG_IPV6_PIMSM_V2=y +CONFIG_IPV6_SEG6_LWTUNNEL=y +CONFIG_IPV6_SEG6_HMAC=y +# CONFIG_NETLABEL is not set +CONFIG_NETWORK_SECMARK=y +CONFIG_NET_PTP_CLASSIFY=y +CONFIG_NETWORK_PHY_TIMESTAMPING=y +CONFIG_NETFILTER=y +CONFIG_NETFILTER_ADVANCED=y +CONFIG_BRIDGE_NETFILTER=m + +# +# Core Netfilter Configuration +# +CONFIG_NETFILTER_INGRESS=y +CONFIG_NETFILTER_NETLINK=m +CONFIG_NETFILTER_NETLINK_ACCT=m +CONFIG_NETFILTER_NETLINK_QUEUE=m +CONFIG_NETFILTER_NETLINK_LOG=m +CONFIG_NF_CONNTRACK=m +CONFIG_NF_LOG_COMMON=m +CONFIG_NF_LOG_NETDEV=m +CONFIG_NF_CONNTRACK_MARK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_ZONES=y +CONFIG_NF_CONNTRACK_PROCFS=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_TIMEOUT=y +CONFIG_NF_CONNTRACK_TIMESTAMP=y +CONFIG_NF_CONNTRACK_LABELS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_GRE=m +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=m +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_H323=m +CONFIG_NF_CONNTRACK_IRC=m +CONFIG_NF_CONNTRACK_BROADCAST=m +CONFIG_NF_CONNTRACK_NETBIOS_NS=m +CONFIG_NF_CONNTRACK_SNMP=m +CONFIG_NF_CONNTRACK_PPTP=m 
+CONFIG_NF_CONNTRACK_SANE=m +CONFIG_NF_CONNTRACK_SIP=m +CONFIG_NF_CONNTRACK_TFTP=m +CONFIG_NF_CT_NETLINK=m +CONFIG_NF_CT_NETLINK_TIMEOUT=m +CONFIG_NF_CT_NETLINK_HELPER=m +CONFIG_NETFILTER_NETLINK_GLUE_CT=y +CONFIG_NF_NAT=m +CONFIG_NF_NAT_NEEDED=y +CONFIG_NF_NAT_PROTO_DCCP=y +CONFIG_NF_NAT_PROTO_UDPLITE=y +CONFIG_NF_NAT_PROTO_SCTP=y +CONFIG_NF_NAT_AMANDA=m +CONFIG_NF_NAT_FTP=m +CONFIG_NF_NAT_IRC=m +CONFIG_NF_NAT_SIP=m +CONFIG_NF_NAT_TFTP=m +CONFIG_NF_NAT_REDIRECT=m +CONFIG_NETFILTER_SYNPROXY=m +CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_INET=m +CONFIG_NF_TABLES_NETDEV=m +CONFIG_NFT_EXTHDR=m +CONFIG_NFT_META=m +CONFIG_NFT_RT=m +CONFIG_NFT_NUMGEN=m +CONFIG_NFT_CT=m +CONFIG_NFT_SET_RBTREE=m +CONFIG_NFT_SET_HASH=m +CONFIG_NFT_SET_BITMAP=m +CONFIG_NFT_COUNTER=m +CONFIG_NFT_LOG=m +CONFIG_NFT_LIMIT=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_REDIR=m +CONFIG_NFT_NAT=m +CONFIG_NFT_OBJREF=m +CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m +CONFIG_NFT_REJECT=m +CONFIG_NFT_REJECT_INET=m +CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m +CONFIG_NFT_FIB=m +CONFIG_NFT_FIB_INET=m +CONFIG_NF_DUP_NETDEV=m +CONFIG_NFT_DUP_NETDEV=m +CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m +CONFIG_NETFILTER_XTABLES=m + +# +# Xtables combined modules +# +CONFIG_NETFILTER_XT_MARK=m +CONFIG_NETFILTER_XT_CONNMARK=m +CONFIG_NETFILTER_XT_SET=m + +# +# Xtables targets +# +CONFIG_NETFILTER_XT_TARGET_AUDIT=m +CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m +CONFIG_NETFILTER_XT_TARGET_CONNMARK=m +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m +CONFIG_NETFILTER_XT_TARGET_CT=m +CONFIG_NETFILTER_XT_TARGET_DSCP=m +CONFIG_NETFILTER_XT_TARGET_HL=m +CONFIG_NETFILTER_XT_TARGET_HMARK=m +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m +CONFIG_NETFILTER_XT_TARGET_LED=m +CONFIG_NETFILTER_XT_TARGET_LOG=m +CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_NAT=m +CONFIG_NETFILTER_XT_TARGET_NETMAP=m +CONFIG_NETFILTER_XT_TARGET_NFLOG=m +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_NOTRACK=m +CONFIG_NETFILTER_XT_TARGET_RATEEST=m +CONFIG_NETFILTER_XT_TARGET_REDIRECT=m +CONFIG_NETFILTER_XT_TARGET_TEE=m +CONFIG_NETFILTER_XT_TARGET_TPROXY=m +CONFIG_NETFILTER_XT_TARGET_TRACE=m +CONFIG_NETFILTER_XT_TARGET_SECMARK=m +CONFIG_NETFILTER_XT_TARGET_TCPMSS=m +CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m + +# +# Xtables matches +# +CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m +CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_CGROUP=m +CONFIG_NETFILTER_XT_MATCH_CLUSTER=m +CONFIG_NETFILTER_XT_MATCH_COMMENT=m +CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m +CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m +CONFIG_NETFILTER_XT_MATCH_CONNMARK=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_CPU=m +CONFIG_NETFILTER_XT_MATCH_DCCP=m +CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m +CONFIG_NETFILTER_XT_MATCH_DSCP=m +CONFIG_NETFILTER_XT_MATCH_ECN=m +CONFIG_NETFILTER_XT_MATCH_ESP=m +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m +CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_HL=m +CONFIG_NETFILTER_XT_MATCH_IPCOMP=m +CONFIG_NETFILTER_XT_MATCH_IPRANGE=m +CONFIG_NETFILTER_XT_MATCH_IPVS=m +CONFIG_NETFILTER_XT_MATCH_L2TP=m +CONFIG_NETFILTER_XT_MATCH_LENGTH=m +CONFIG_NETFILTER_XT_MATCH_LIMIT=m +CONFIG_NETFILTER_XT_MATCH_MAC=m +CONFIG_NETFILTER_XT_MATCH_MARK=m +CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m +CONFIG_NETFILTER_XT_MATCH_NFACCT=m +CONFIG_NETFILTER_XT_MATCH_OSF=m +CONFIG_NETFILTER_XT_MATCH_OWNER=m +CONFIG_NETFILTER_XT_MATCH_POLICY=m +CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m +CONFIG_NETFILTER_XT_MATCH_QUOTA=m 
+CONFIG_NETFILTER_XT_MATCH_RATEEST=m +CONFIG_NETFILTER_XT_MATCH_REALM=m +CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_SCTP=m +CONFIG_NETFILTER_XT_MATCH_SOCKET=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STATISTIC=m +CONFIG_NETFILTER_XT_MATCH_STRING=m +CONFIG_NETFILTER_XT_MATCH_TCPMSS=m +CONFIG_NETFILTER_XT_MATCH_TIME=m +CONFIG_NETFILTER_XT_MATCH_U32=m +CONFIG_IP_SET=m +CONFIG_IP_SET_MAX=256 +CONFIG_IP_SET_BITMAP_IP=m +CONFIG_IP_SET_BITMAP_IPMAC=m +CONFIG_IP_SET_BITMAP_PORT=m +CONFIG_IP_SET_HASH_IP=m +CONFIG_IP_SET_HASH_IPMARK=m +CONFIG_IP_SET_HASH_IPPORT=m +CONFIG_IP_SET_HASH_IPPORTIP=m +CONFIG_IP_SET_HASH_IPPORTNET=m +CONFIG_IP_SET_HASH_IPMAC=m +CONFIG_IP_SET_HASH_MAC=m +CONFIG_IP_SET_HASH_NETPORTNET=m +CONFIG_IP_SET_HASH_NET=m +CONFIG_IP_SET_HASH_NETNET=m +CONFIG_IP_SET_HASH_NETPORT=m +CONFIG_IP_SET_HASH_NETIFACE=m +CONFIG_IP_SET_LIST_SET=m +CONFIG_IP_VS=m +CONFIG_IP_VS_IPV6=y +# CONFIG_IP_VS_DEBUG is not set +CONFIG_IP_VS_TAB_BITS=12 + +# +# IPVS transport protocol load balancing support +# +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_PROTO_UDP=y +CONFIG_IP_VS_PROTO_AH_ESP=y +CONFIG_IP_VS_PROTO_ESP=y +CONFIG_IP_VS_PROTO_AH=y +CONFIG_IP_VS_PROTO_SCTP=y + +# +# IPVS scheduler +# +CONFIG_IP_VS_RR=m +CONFIG_IP_VS_WRR=m +CONFIG_IP_VS_LC=m +CONFIG_IP_VS_WLC=m +CONFIG_IP_VS_FO=m +CONFIG_IP_VS_OVF=m +CONFIG_IP_VS_LBLC=m +CONFIG_IP_VS_LBLCR=m +CONFIG_IP_VS_DH=m +CONFIG_IP_VS_SH=m +CONFIG_IP_VS_SED=m +CONFIG_IP_VS_NQ=m + +# +# IPVS SH scheduler +# +CONFIG_IP_VS_SH_TAB_BITS=8 + +# +# IPVS application helper +# +CONFIG_IP_VS_FTP=m +CONFIG_IP_VS_NFCT=y +CONFIG_IP_VS_PE_SIP=m + +# +# IP: Netfilter Configuration +# +CONFIG_NF_DEFRAG_IPV4=m +CONFIG_NF_CONNTRACK_IPV4=m +CONFIG_NF_SOCKET_IPV4=m +CONFIG_NF_TABLES_IPV4=m +CONFIG_NFT_CHAIN_ROUTE_IPV4=m +CONFIG_NFT_REJECT_IPV4=m +CONFIG_NFT_DUP_IPV4=m +CONFIG_NFT_FIB_IPV4=m +CONFIG_NF_TABLES_ARP=m +CONFIG_NF_DUP_IPV4=m +CONFIG_NF_LOG_ARP=m +CONFIG_NF_LOG_IPV4=m +CONFIG_NF_REJECT_IPV4=m +CONFIG_NF_NAT_IPV4=m +CONFIG_NFT_CHAIN_NAT_IPV4=m +CONFIG_NF_NAT_MASQUERADE_IPV4=m +CONFIG_NFT_MASQ_IPV4=m +CONFIG_NFT_REDIR_IPV4=m +CONFIG_NF_NAT_SNMP_BASIC=m +CONFIG_NF_NAT_PROTO_GRE=m +CONFIG_NF_NAT_PPTP=m +CONFIG_NF_NAT_H323=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_AH=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_RPFILTER=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_TARGET_SYNPROXY=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_TARGET_NETMAP=m +CONFIG_IP_NF_TARGET_REDIRECT=m +CONFIG_IP_NF_MANGLE=m +CONFIG_IP_NF_TARGET_CLUSTERIP=m +CONFIG_IP_NF_TARGET_ECN=m +CONFIG_IP_NF_TARGET_TTL=m +CONFIG_IP_NF_RAW=m +# CONFIG_IP_NF_SECURITY is not set +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m +CONFIG_IP_NF_ARP_MANGLE=m + +# +# IPv6: Netfilter Configuration +# +CONFIG_NF_DEFRAG_IPV6=m +CONFIG_NF_CONNTRACK_IPV6=m +CONFIG_NF_SOCKET_IPV6=m +CONFIG_NF_TABLES_IPV6=m +CONFIG_NFT_CHAIN_ROUTE_IPV6=m +CONFIG_NFT_CHAIN_NAT_IPV6=m +CONFIG_NFT_MASQ_IPV6=m +CONFIG_NFT_REDIR_IPV6=m +CONFIG_NFT_REJECT_IPV6=m +CONFIG_NFT_DUP_IPV6=m +CONFIG_NFT_FIB_IPV6=m +CONFIG_NF_DUP_IPV6=m +CONFIG_NF_REJECT_IPV6=m +CONFIG_NF_LOG_IPV6=m +CONFIG_NF_NAT_IPV6=m +CONFIG_NF_NAT_MASQUERADE_IPV6=m +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_AH=m +CONFIG_IP6_NF_MATCH_EUI64=m +CONFIG_IP6_NF_MATCH_FRAG=m +CONFIG_IP6_NF_MATCH_OPTS=m +CONFIG_IP6_NF_MATCH_HL=m +CONFIG_IP6_NF_MATCH_IPV6HEADER=m +CONFIG_IP6_NF_MATCH_MH=m +CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IP6_NF_MATCH_RT=m +CONFIG_IP6_NF_TARGET_HL=m 
+CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_REJECT=m +CONFIG_IP6_NF_TARGET_SYNPROXY=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_RAW=m +# CONFIG_IP6_NF_SECURITY is not set +CONFIG_IP6_NF_NAT=m +CONFIG_IP6_NF_TARGET_MASQUERADE=m +CONFIG_IP6_NF_TARGET_NPT=m + +# +# DECnet: Netfilter Configuration +# +CONFIG_DECNET_NF_GRABULATOR=m +CONFIG_NF_TABLES_BRIDGE=m +CONFIG_NFT_BRIDGE_META=m +CONFIG_NFT_BRIDGE_REJECT=m +CONFIG_NF_LOG_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES=m +CONFIG_BRIDGE_EBT_BROUTE=m +CONFIG_BRIDGE_EBT_T_FILTER=m +CONFIG_BRIDGE_EBT_T_NAT=m +CONFIG_BRIDGE_EBT_802_3=m +CONFIG_BRIDGE_EBT_AMONG=m +CONFIG_BRIDGE_EBT_ARP=m +CONFIG_BRIDGE_EBT_IP=m +CONFIG_BRIDGE_EBT_IP6=m +CONFIG_BRIDGE_EBT_LIMIT=m +CONFIG_BRIDGE_EBT_MARK=m +CONFIG_BRIDGE_EBT_PKTTYPE=m +CONFIG_BRIDGE_EBT_STP=m +CONFIG_BRIDGE_EBT_VLAN=m +CONFIG_BRIDGE_EBT_ARPREPLY=m +CONFIG_BRIDGE_EBT_DNAT=m +CONFIG_BRIDGE_EBT_MARK_T=m +CONFIG_BRIDGE_EBT_REDIRECT=m +CONFIG_BRIDGE_EBT_SNAT=m +CONFIG_BRIDGE_EBT_LOG=m +CONFIG_BRIDGE_EBT_NFLOG=m +CONFIG_IP_DCCP=m +CONFIG_INET_DCCP_DIAG=m + +# +# DCCP CCIDs Configuration +# +# CONFIG_IP_DCCP_CCID2_DEBUG is not set +CONFIG_IP_DCCP_CCID3=y +# CONFIG_IP_DCCP_CCID3_DEBUG is not set +CONFIG_IP_DCCP_TFRC_LIB=y + +# +# DCCP Kernel Hacking +# +# CONFIG_IP_DCCP_DEBUG is not set +# CONFIG_NET_DCCPPROBE is not set +CONFIG_IP_SCTP=m +# CONFIG_NET_SCTPPROBE is not set +# CONFIG_SCTP_DBG_OBJCNT is not set +CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5=y +# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1 is not set +# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE is not set +CONFIG_SCTP_COOKIE_HMAC_MD5=y +CONFIG_SCTP_COOKIE_HMAC_SHA1=y +CONFIG_INET_SCTP_DIAG=m +CONFIG_RDS=m +CONFIG_RDS_RDMA=m +CONFIG_RDS_TCP=m +# CONFIG_RDS_DEBUG is not set +CONFIG_TIPC=m +CONFIG_TIPC_MEDIA_IB=y +CONFIG_TIPC_MEDIA_UDP=y +CONFIG_ATM=m +CONFIG_ATM_CLIP=m +# CONFIG_ATM_CLIP_NO_ICMP is not set +CONFIG_ATM_LANE=m +CONFIG_ATM_MPOA=m +CONFIG_ATM_BR2684=m +# CONFIG_ATM_BR2684_IPFILTER is not set +CONFIG_L2TP=m +# CONFIG_L2TP_DEBUGFS is not set +CONFIG_L2TP_V3=y +CONFIG_L2TP_IP=m +CONFIG_L2TP_ETH=m +CONFIG_STP=m +CONFIG_GARP=m +CONFIG_MRP=m +CONFIG_BRIDGE=m +CONFIG_BRIDGE_IGMP_SNOOPING=y +CONFIG_BRIDGE_VLAN_FILTERING=y +CONFIG_HAVE_NET_DSA=y +CONFIG_NET_DSA=m +CONFIG_NET_DSA_TAG_DSA=y +CONFIG_NET_DSA_TAG_EDSA=y +CONFIG_NET_DSA_TAG_KSZ=y +CONFIG_NET_DSA_TAG_LAN9303=y +CONFIG_NET_DSA_TAG_MTK=y +CONFIG_NET_DSA_TAG_TRAILER=y +CONFIG_NET_DSA_TAG_QCA=y +CONFIG_VLAN_8021Q=m +CONFIG_VLAN_8021Q_GVRP=y +CONFIG_VLAN_8021Q_MVRP=y +CONFIG_DECNET=m +CONFIG_DECNET_ROUTER=y +CONFIG_LLC=m +CONFIG_LLC2=m +CONFIG_IPX=m +CONFIG_IPX_INTERN=y +CONFIG_ATALK=m +CONFIG_DEV_APPLETALK=m +CONFIG_IPDDP=m +CONFIG_IPDDP_ENCAP=y +CONFIG_X25=m +CONFIG_LAPB=m +CONFIG_PHONET=m +CONFIG_6LOWPAN=m +# CONFIG_6LOWPAN_DEBUGFS is not set +CONFIG_6LOWPAN_NHC=m +CONFIG_6LOWPAN_NHC_DEST=m +CONFIG_6LOWPAN_NHC_FRAGMENT=m +CONFIG_6LOWPAN_NHC_HOP=m +CONFIG_6LOWPAN_NHC_IPV6=m +CONFIG_6LOWPAN_NHC_MOBILITY=m +CONFIG_6LOWPAN_NHC_ROUTING=m +CONFIG_6LOWPAN_NHC_UDP=m +CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m +CONFIG_6LOWPAN_GHC_UDP=m +CONFIG_6LOWPAN_GHC_ICMPV6=m +CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m +CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m +CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m +CONFIG_IEEE802154=m +CONFIG_IEEE802154_NL802154_EXPERIMENTAL=y +CONFIG_IEEE802154_SOCKET=m +CONFIG_IEEE802154_6LOWPAN=m +CONFIG_MAC802154=m +CONFIG_NET_SCHED=y + +# +# Queueing/Scheduling +# +CONFIG_NET_SCH_CBQ=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_HFSC=m +CONFIG_NET_SCH_ATM=m +CONFIG_NET_SCH_PRIO=m +CONFIG_NET_SCH_MULTIQ=m +CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFB=m 
+CONFIG_NET_SCH_SFQ=m +CONFIG_NET_SCH_TEQL=m +CONFIG_NET_SCH_TBF=m +CONFIG_NET_SCH_GRED=m +CONFIG_NET_SCH_DSMARK=m +CONFIG_NET_SCH_NETEM=m +CONFIG_NET_SCH_DRR=m +CONFIG_NET_SCH_MQPRIO=m +CONFIG_NET_SCH_CHOKE=m +CONFIG_NET_SCH_QFQ=m +CONFIG_NET_SCH_CODEL=m +CONFIG_NET_SCH_FQ_CODEL=m +CONFIG_NET_SCH_FQ=m +CONFIG_NET_SCH_HHF=m +CONFIG_NET_SCH_PIE=m +CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_SCH_PLUG=m +# CONFIG_NET_SCH_DEFAULT is not set + +# +# Classification +# +CONFIG_NET_CLS=y +CONFIG_NET_CLS_BASIC=m +CONFIG_NET_CLS_TCINDEX=m +CONFIG_NET_CLS_ROUTE4=m +CONFIG_NET_CLS_FW=m +CONFIG_NET_CLS_U32=m +CONFIG_CLS_U32_PERF=y +CONFIG_CLS_U32_MARK=y +CONFIG_NET_CLS_RSVP=m +CONFIG_NET_CLS_RSVP6=m +CONFIG_NET_CLS_FLOW=m +CONFIG_NET_CLS_CGROUP=m +CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_FLOWER=m +CONFIG_NET_CLS_MATCHALL=m +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_STACK=32 +CONFIG_NET_EMATCH_CMP=m +CONFIG_NET_EMATCH_NBYTE=m +CONFIG_NET_EMATCH_U32=m +CONFIG_NET_EMATCH_META=m +CONFIG_NET_EMATCH_TEXT=m +CONFIG_NET_EMATCH_CANID=m +CONFIG_NET_EMATCH_IPSET=m +CONFIG_NET_CLS_ACT=y +CONFIG_NET_ACT_POLICE=m +CONFIG_NET_ACT_GACT=m +CONFIG_GACT_PROB=y +CONFIG_NET_ACT_MIRRED=m +CONFIG_NET_ACT_SAMPLE=m +CONFIG_NET_ACT_IPT=m +CONFIG_NET_ACT_NAT=m +CONFIG_NET_ACT_PEDIT=m +# CONFIG_NET_ACT_SIMP is not set +CONFIG_NET_ACT_SKBEDIT=m +CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_VLAN=m +CONFIG_NET_ACT_BPF=m +CONFIG_NET_ACT_CONNMARK=m +CONFIG_NET_ACT_SKBMOD=m +CONFIG_NET_ACT_IFE=m +CONFIG_NET_ACT_TUNNEL_KEY=m +CONFIG_NET_IFE_SKBMARK=m +CONFIG_NET_IFE_SKBPRIO=m +CONFIG_NET_IFE_SKBTCINDEX=m +CONFIG_NET_CLS_IND=y +CONFIG_NET_SCH_FIFO=y +CONFIG_DCB=y +CONFIG_DNS_RESOLVER=y +CONFIG_BATMAN_ADV=m +# CONFIG_BATMAN_ADV_BATMAN_V is not set +CONFIG_BATMAN_ADV_BLA=y +CONFIG_BATMAN_ADV_DAT=y +CONFIG_BATMAN_ADV_NC=y +CONFIG_BATMAN_ADV_MCAST=y +CONFIG_BATMAN_ADV_DEBUGFS=y +# CONFIG_BATMAN_ADV_DEBUG is not set +CONFIG_OPENVSWITCH=m +CONFIG_OPENVSWITCH_GRE=m +CONFIG_OPENVSWITCH_VXLAN=m +CONFIG_OPENVSWITCH_GENEVE=m +CONFIG_VSOCKETS=m +CONFIG_VMWARE_VMCI_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS_COMMON=m +CONFIG_HYPERV_VSOCKETS=m +CONFIG_NETLINK_DIAG=m +CONFIG_MPLS=y +CONFIG_NET_MPLS_GSO=m +CONFIG_MPLS_ROUTING=m +CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m +CONFIG_HSR=m +CONFIG_NET_SWITCHDEV=y +CONFIG_NET_L3_MASTER_DEV=y +# CONFIG_NET_NCSI is not set +CONFIG_RPS=y +CONFIG_RFS_ACCEL=y +CONFIG_XPS=y +CONFIG_CGROUP_NET_PRIO=y +CONFIG_CGROUP_NET_CLASSID=y +CONFIG_NET_RX_BUSY_POLL=y +CONFIG_BQL=y +CONFIG_BPF_JIT=y +# CONFIG_BPF_STREAM_PARSER is not set +CONFIG_NET_FLOW_LIMIT=y + +# +# Network testing +# +# CONFIG_NET_PKTGEN is not set +# CONFIG_NET_TCPPROBE is not set +# CONFIG_NET_DROP_MONITOR is not set +CONFIG_HAMRADIO=y + +# +# Packet Radio protocols +# +CONFIG_AX25=m +CONFIG_AX25_DAMA_SLAVE=y +CONFIG_NETROM=m +CONFIG_ROSE=m + +# +# AX.25 network device drivers +# +CONFIG_MKISS=m +CONFIG_6PACK=m +CONFIG_BPQETHER=m +CONFIG_BAYCOM_SER_FDX=m +CONFIG_BAYCOM_SER_HDX=m +CONFIG_BAYCOM_PAR=m +CONFIG_YAM=m +CONFIG_CAN=m +CONFIG_CAN_RAW=m +CONFIG_CAN_BCM=m +CONFIG_CAN_GW=m + +# +# CAN Device Drivers +# +CONFIG_CAN_VCAN=m +CONFIG_CAN_VXCAN=m +CONFIG_CAN_SLCAN=m +CONFIG_CAN_DEV=m +CONFIG_CAN_CALC_BITTIMING=y +CONFIG_CAN_LEDS=y +CONFIG_CAN_JANZ_ICAN3=m +CONFIG_CAN_C_CAN=m +CONFIG_CAN_C_CAN_PLATFORM=m +CONFIG_CAN_C_CAN_PCI=m +CONFIG_CAN_CC770=m +CONFIG_CAN_CC770_ISA=m +CONFIG_CAN_CC770_PLATFORM=m +CONFIG_CAN_IFI_CANFD=m +CONFIG_CAN_M_CAN=m +CONFIG_CAN_PEAK_PCIEFD=m +CONFIG_CAN_SJA1000=m +CONFIG_CAN_SJA1000_ISA=m +CONFIG_CAN_SJA1000_PLATFORM=m 
+CONFIG_CAN_EMS_PCMCIA=m +CONFIG_CAN_EMS_PCI=m +CONFIG_CAN_PEAK_PCMCIA=m +CONFIG_CAN_PEAK_PCI=m +CONFIG_CAN_PEAK_PCIEC=y +CONFIG_CAN_KVASER_PCI=m +CONFIG_CAN_PLX_PCI=m +CONFIG_CAN_SOFTING=m +CONFIG_CAN_SOFTING_CS=m + +# +# CAN SPI interfaces +# +CONFIG_CAN_HI311X=m +CONFIG_CAN_MCP251X=m + +# +# CAN USB interfaces +# +CONFIG_CAN_EMS_USB=m +CONFIG_CAN_ESD_USB2=m +CONFIG_CAN_GS_USB=m +CONFIG_CAN_KVASER_USB=m +CONFIG_CAN_PEAK_USB=m +CONFIG_CAN_8DEV_USB=m +CONFIG_CAN_MCBA_USB=m +# CONFIG_CAN_DEBUG_DEVICES is not set +CONFIG_BT=m +CONFIG_BT_BREDR=y +CONFIG_BT_RFCOMM=m +CONFIG_BT_RFCOMM_TTY=y +CONFIG_BT_BNEP=m +CONFIG_BT_BNEP_MC_FILTER=y +CONFIG_BT_BNEP_PROTO_FILTER=y +CONFIG_BT_CMTP=m +CONFIG_BT_HIDP=m +CONFIG_BT_HS=y +CONFIG_BT_LE=y +CONFIG_BT_6LOWPAN=m +CONFIG_BT_LEDS=y +# CONFIG_BT_SELFTEST is not set +CONFIG_BT_DEBUGFS=y + +# +# Bluetooth device drivers +# +CONFIG_BT_INTEL=m +CONFIG_BT_BCM=m +CONFIG_BT_RTL=m +CONFIG_BT_QCA=m +CONFIG_BT_HCIBTUSB=m +CONFIG_BT_HCIBTUSB_BCM=y +CONFIG_BT_HCIBTUSB_RTL=y +CONFIG_BT_HCIBTSDIO=m +CONFIG_BT_HCIUART=m +CONFIG_BT_HCIUART_SERDEV=y +CONFIG_BT_HCIUART_H4=y +CONFIG_BT_HCIUART_NOKIA=m +CONFIG_BT_HCIUART_BCSP=y +CONFIG_BT_HCIUART_ATH3K=y +CONFIG_BT_HCIUART_LL=y +CONFIG_BT_HCIUART_3WIRE=y +CONFIG_BT_HCIUART_INTEL=y +CONFIG_BT_HCIUART_BCM=y +CONFIG_BT_HCIUART_QCA=y +CONFIG_BT_HCIUART_AG6XX=y +CONFIG_BT_HCIUART_MRVL=y +CONFIG_BT_HCIBCM203X=m +CONFIG_BT_HCIBPA10X=m +CONFIG_BT_HCIBFUSB=m +CONFIG_BT_HCIDTL1=m +CONFIG_BT_HCIBT3C=m +CONFIG_BT_HCIBLUECARD=m +CONFIG_BT_HCIBTUART=m +CONFIG_BT_HCIVHCI=m +CONFIG_BT_MRVL=m +CONFIG_BT_MRVL_SDIO=m +CONFIG_BT_ATH3K=m +CONFIG_BT_WILINK=m +CONFIG_AF_RXRPC=m +CONFIG_AF_RXRPC_IPV6=y +# CONFIG_AF_RXRPC_INJECT_LOSS is not set +# CONFIG_AF_RXRPC_DEBUG is not set +# CONFIG_RXKAD is not set +CONFIG_AF_KCM=m +CONFIG_STREAM_PARSER=m +CONFIG_FIB_RULES=y +CONFIG_WIRELESS=y +CONFIG_WIRELESS_EXT=y +CONFIG_WEXT_CORE=y +CONFIG_WEXT_PROC=y +CONFIG_WEXT_SPY=y +CONFIG_WEXT_PRIV=y +CONFIG_CFG80211=m +CONFIG_NL80211_TESTMODE=y +# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set +CONFIG_CFG80211_DEFAULT_PS=y +# CONFIG_CFG80211_DEBUGFS is not set +# CONFIG_CFG80211_INTERNAL_REGDB is not set +CONFIG_CFG80211_CRDA_SUPPORT=y +CONFIG_CFG80211_WEXT=y +CONFIG_CFG80211_WEXT_EXPORT=y +CONFIG_LIB80211=m +CONFIG_LIB80211_CRYPT_WEP=m +CONFIG_LIB80211_CRYPT_CCMP=m +CONFIG_LIB80211_CRYPT_TKIP=m +# CONFIG_LIB80211_DEBUG is not set +CONFIG_MAC80211=m +CONFIG_MAC80211_HAS_RC=y +CONFIG_MAC80211_RC_MINSTREL=y +CONFIG_MAC80211_RC_MINSTREL_HT=y +# CONFIG_MAC80211_RC_MINSTREL_VHT is not set +CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y +CONFIG_MAC80211_RC_DEFAULT="minstrel_ht" +CONFIG_MAC80211_MESH=y +CONFIG_MAC80211_LEDS=y +# CONFIG_MAC80211_DEBUGFS is not set +# CONFIG_MAC80211_MESSAGE_TRACING is not set +# CONFIG_MAC80211_DEBUG_MENU is not set +CONFIG_MAC80211_STA_HASH_MAX_SIZE=0 +CONFIG_WIMAX=m +CONFIG_WIMAX_DEBUG_LEVEL=8 +CONFIG_RFKILL=m +CONFIG_RFKILL_LEDS=y +CONFIG_RFKILL_INPUT=y +CONFIG_RFKILL_GPIO=m +CONFIG_NET_9P=m +CONFIG_NET_9P_VIRTIO=m +CONFIG_NET_9P_RDMA=m +# CONFIG_NET_9P_DEBUG is not set +CONFIG_CAIF=m +# CONFIG_CAIF_DEBUG is not set +CONFIG_CAIF_NETDEV=m +CONFIG_CAIF_USB=m +CONFIG_CEPH_LIB=m +# CONFIG_CEPH_LIB_PRETTYDEBUG is not set +CONFIG_CEPH_LIB_USE_DNS_RESOLVER=y +CONFIG_NFC=m +CONFIG_NFC_DIGITAL=m +CONFIG_NFC_NCI=m +CONFIG_NFC_NCI_SPI=m +CONFIG_NFC_NCI_UART=m +CONFIG_NFC_HCI=m +CONFIG_NFC_SHDLC=y + +# +# Near Field Communication (NFC) devices +# +CONFIG_NFC_TRF7970A=m +CONFIG_NFC_MEI_PHY=m +CONFIG_NFC_SIM=m +CONFIG_NFC_PORT100=m +CONFIG_NFC_FDP=m 
+CONFIG_NFC_FDP_I2C=m +CONFIG_NFC_PN544=m +CONFIG_NFC_PN544_I2C=m +CONFIG_NFC_PN544_MEI=m +CONFIG_NFC_PN533=m +CONFIG_NFC_PN533_USB=m +CONFIG_NFC_PN533_I2C=m +CONFIG_NFC_MICROREAD=m +CONFIG_NFC_MICROREAD_I2C=m +CONFIG_NFC_MICROREAD_MEI=m +CONFIG_NFC_MRVL=m +CONFIG_NFC_MRVL_USB=m +CONFIG_NFC_MRVL_UART=m +CONFIG_NFC_MRVL_I2C=m +CONFIG_NFC_MRVL_SPI=m +CONFIG_NFC_ST21NFCA=m +CONFIG_NFC_ST21NFCA_I2C=m +CONFIG_NFC_ST_NCI=m +CONFIG_NFC_ST_NCI_I2C=m +CONFIG_NFC_ST_NCI_SPI=m +CONFIG_NFC_NXP_NCI=m +CONFIG_NFC_NXP_NCI_I2C=m +CONFIG_NFC_S3FWRN5=m +CONFIG_NFC_S3FWRN5_I2C=m +CONFIG_NFC_ST95HF=m +CONFIG_PSAMPLE=m +CONFIG_NET_IFE=m +CONFIG_LWTUNNEL=y +CONFIG_LWTUNNEL_BPF=y +CONFIG_DST_CACHE=y +CONFIG_GRO_CELLS=y +CONFIG_NET_DEVLINK=m +CONFIG_MAY_USE_DEVLINK=m +CONFIG_HAVE_EBPF_JIT=y + +# +# Device Drivers +# + +# +# Generic Driver Options +# +# CONFIG_UEVENT_HELPER is not set +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_STANDALONE=y +CONFIG_PREVENT_FIRMWARE_BUILD=y +CONFIG_FW_LOADER=y +# CONFIG_FIRMWARE_IN_KERNEL is not set +CONFIG_EXTRA_FIRMWARE="" +CONFIG_FW_LOADER_USER_HELPER=y +# CONFIG_FW_LOADER_USER_HELPER_FALLBACK is not set +CONFIG_WANT_DEV_COREDUMP=y +CONFIG_ALLOW_DEV_COREDUMP=y +CONFIG_DEV_COREDUMP=y +# CONFIG_DEBUG_DRIVER is not set +# CONFIG_DEBUG_DEVRES is not set +# CONFIG_DEBUG_TEST_DRIVER_REMOVE is not set +CONFIG_TEST_ASYNC_DRIVER_PROBE=m +# CONFIG_SYS_HYPERVISOR is not set +# CONFIG_GENERIC_CPU_DEVICES is not set +CONFIG_GENERIC_CPU_AUTOPROBE=y +CONFIG_GENERIC_CPU_VULNERABILITIES=y +CONFIG_REGMAP=y +CONFIG_REGMAP_I2C=m +CONFIG_REGMAP_SPI=y +CONFIG_REGMAP_SPMI=m +CONFIG_REGMAP_W1=m +CONFIG_REGMAP_MMIO=y +CONFIG_REGMAP_IRQ=y +CONFIG_DMA_SHARED_BUFFER=y +# CONFIG_DMA_FENCE_TRACE is not set +# CONFIG_DMA_CMA is not set + +# +# Bus devices +# +CONFIG_CONNECTOR=m +CONFIG_MTD=m +CONFIG_MTD_TESTS=m +CONFIG_MTD_REDBOOT_PARTS=m +CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1 +CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED=y +CONFIG_MTD_REDBOOT_PARTS_READONLY=y +CONFIG_MTD_CMDLINE_PARTS=m +CONFIG_MTD_AR7_PARTS=m + +# +# Partition parsers +# + +# +# User Modules And Translation Layers +# +CONFIG_MTD_BLKDEVS=m +CONFIG_MTD_BLOCK=m +CONFIG_MTD_BLOCK_RO=m +CONFIG_FTL=m +CONFIG_NFTL=m +CONFIG_NFTL_RW=y +CONFIG_INFTL=m +CONFIG_RFD_FTL=m +CONFIG_SSFDC=m +CONFIG_SM_FTL=m +CONFIG_MTD_OOPS=m +CONFIG_MTD_SWAP=m +# CONFIG_MTD_PARTITIONED_MASTER is not set + +# +# RAM/ROM/Flash chip drivers +# +CONFIG_MTD_CFI=m +CONFIG_MTD_JEDECPROBE=m +CONFIG_MTD_GEN_PROBE=m +# CONFIG_MTD_CFI_ADV_OPTIONS is not set +CONFIG_MTD_MAP_BANK_WIDTH_1=y +CONFIG_MTD_MAP_BANK_WIDTH_2=y +CONFIG_MTD_MAP_BANK_WIDTH_4=y +# CONFIG_MTD_MAP_BANK_WIDTH_8 is not set +# CONFIG_MTD_MAP_BANK_WIDTH_16 is not set +# CONFIG_MTD_MAP_BANK_WIDTH_32 is not set +CONFIG_MTD_CFI_I1=y +CONFIG_MTD_CFI_I2=y +# CONFIG_MTD_CFI_I4 is not set +# CONFIG_MTD_CFI_I8 is not set +CONFIG_MTD_CFI_INTELEXT=m +CONFIG_MTD_CFI_AMDSTD=m +CONFIG_MTD_CFI_STAA=m +CONFIG_MTD_CFI_UTIL=m +CONFIG_MTD_RAM=m +CONFIG_MTD_ROM=m +CONFIG_MTD_ABSENT=m + +# +# Mapping drivers for chip access +# +CONFIG_MTD_COMPLEX_MAPPINGS=y +CONFIG_MTD_PHYSMAP=m +# CONFIG_MTD_PHYSMAP_COMPAT is not set +CONFIG_MTD_SBC_GXX=m +CONFIG_MTD_AMD76XROM=m +CONFIG_MTD_ICHXROM=m +CONFIG_MTD_ESB2ROM=m +CONFIG_MTD_CK804XROM=m +CONFIG_MTD_SCB2_FLASH=m +CONFIG_MTD_NETtel=m +CONFIG_MTD_L440GX=m +CONFIG_MTD_PCI=m +CONFIG_MTD_PCMCIA=m +# CONFIG_MTD_PCMCIA_ANONYMOUS is not set +CONFIG_MTD_GPIO_ADDR=m +CONFIG_MTD_INTEL_VR_NOR=m +CONFIG_MTD_PLATRAM=m +CONFIG_MTD_LATCH_ADDR=m + +# +# Self-contained MTD device drivers +# +CONFIG_MTD_PMC551=m 
+CONFIG_MTD_PMC551_BUGFIX=y +# CONFIG_MTD_PMC551_DEBUG is not set +CONFIG_MTD_DATAFLASH=m +CONFIG_MTD_DATAFLASH_WRITE_VERIFY=y +CONFIG_MTD_DATAFLASH_OTP=y +CONFIG_MTD_M25P80=m +CONFIG_MTD_MCHP23K256=m +CONFIG_MTD_SST25L=m +CONFIG_MTD_SLRAM=m +CONFIG_MTD_PHRAM=m +CONFIG_MTD_MTDRAM=m +CONFIG_MTDRAM_TOTAL_SIZE=4096 +CONFIG_MTDRAM_ERASE_SIZE=128 +CONFIG_MTD_BLOCK2MTD=m + +# +# Disk-On-Chip Device Drivers +# +CONFIG_MTD_DOCG3=m +CONFIG_BCH_CONST_M=14 +CONFIG_BCH_CONST_T=4 +CONFIG_MTD_NAND_ECC=m +CONFIG_MTD_NAND_ECC_SMC=y +CONFIG_MTD_NAND=m +CONFIG_MTD_NAND_BCH=m +CONFIG_MTD_NAND_ECC_BCH=y +CONFIG_MTD_SM_COMMON=m +CONFIG_MTD_NAND_DENALI=m +CONFIG_MTD_NAND_DENALI_PCI=m +CONFIG_MTD_NAND_GPIO=m +# CONFIG_MTD_NAND_OMAP_BCH_BUILD is not set +CONFIG_MTD_NAND_RICOH=m +CONFIG_MTD_NAND_DISKONCHIP=m +CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED=y +CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0x0 +CONFIG_MTD_NAND_DISKONCHIP_PROBE_HIGH=y +CONFIG_MTD_NAND_DISKONCHIP_BBTWRITE=y +CONFIG_MTD_NAND_DOCG4=m +CONFIG_MTD_NAND_CAFE=m +CONFIG_MTD_NAND_NANDSIM=m +CONFIG_MTD_NAND_PLATFORM=m +CONFIG_MTD_ONENAND=m +CONFIG_MTD_ONENAND_VERIFY_WRITE=y +CONFIG_MTD_ONENAND_GENERIC=m +CONFIG_MTD_ONENAND_OTP=y +CONFIG_MTD_ONENAND_2X_PROGRAM=y + +# +# LPDDR & LPDDR2 PCM memory drivers +# +CONFIG_MTD_LPDDR=m +CONFIG_MTD_QINFO_PROBE=m +CONFIG_MTD_SPI_NOR=m +CONFIG_MTD_MT81xx_NOR=m +CONFIG_MTD_SPI_NOR_USE_4K_SECTORS=y +CONFIG_MTD_UBI=m +CONFIG_MTD_UBI_WL_THRESHOLD=4096 +CONFIG_MTD_UBI_BEB_LIMIT=20 +CONFIG_MTD_UBI_FASTMAP=y +# CONFIG_MTD_UBI_GLUEBI is not set +CONFIG_MTD_UBI_BLOCK=y +# CONFIG_OF is not set +CONFIG_ARCH_MIGHT_HAVE_PC_PARPORT=y +CONFIG_PARPORT=m +CONFIG_PARPORT_PC=m +CONFIG_PARPORT_SERIAL=m +CONFIG_PARPORT_PC_FIFO=y +CONFIG_PARPORT_PC_SUPERIO=y +CONFIG_PARPORT_PC_PCMCIA=m +# CONFIG_PARPORT_GSC is not set +CONFIG_PARPORT_AX88796=m +CONFIG_PARPORT_1284=y +CONFIG_PARPORT_NOT_PC=y +CONFIG_PNP=y +# CONFIG_PNP_DEBUG_MESSAGES is not set + +# +# Protocols +# +CONFIG_PNPACPI=y +CONFIG_BLK_DEV=y +# CONFIG_BLK_DEV_NULL_BLK is not set +CONFIG_BLK_DEV_FD=m +CONFIG_PARIDE=m + +# +# Parallel IDE high-level drivers +# +CONFIG_PARIDE_PD=m +CONFIG_PARIDE_PCD=m +CONFIG_PARIDE_PF=m +CONFIG_PARIDE_PT=m +CONFIG_PARIDE_PG=m + +# +# Parallel IDE protocol modules +# +CONFIG_PARIDE_ATEN=m +CONFIG_PARIDE_BPCK=m +CONFIG_PARIDE_COMM=m +CONFIG_PARIDE_DSTR=m +CONFIG_PARIDE_FIT2=m +CONFIG_PARIDE_FIT3=m +CONFIG_PARIDE_EPAT=m +CONFIG_PARIDE_EPATC8=y +CONFIG_PARIDE_EPIA=m +CONFIG_PARIDE_FRIQ=m +CONFIG_PARIDE_FRPW=m +CONFIG_PARIDE_KBIC=m +CONFIG_PARIDE_KTTI=m +CONFIG_PARIDE_ON20=m +CONFIG_PARIDE_ON26=m +CONFIG_BLK_DEV_PCIESSD_MTIP32XX=m +CONFIG_ZRAM=m +CONFIG_ZRAM_WRITEBACK=y +CONFIG_BLK_DEV_DAC960=m +CONFIG_BLK_DEV_UMEM=m +# CONFIG_BLK_DEV_COW_COMMON is not set +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 +CONFIG_BLK_DEV_CRYPTOLOOP=m +CONFIG_BLK_DEV_DRBD=m +# CONFIG_DRBD_FAULT_INJECTION is not set +CONFIG_BLK_DEV_NBD=m +CONFIG_BLK_DEV_SKD=m +CONFIG_BLK_DEV_SX8=m +CONFIG_BLK_DEV_RAM=m +CONFIG_BLK_DEV_RAM_COUNT=16 +CONFIG_BLK_DEV_RAM_SIZE=16384 +CONFIG_BLK_DEV_RAM_DAX=y +CONFIG_CDROM_PKTCDVD=m +CONFIG_CDROM_PKTCDVD_BUFFERS=8 +CONFIG_CDROM_PKTCDVD_WCACHE=y +CONFIG_ATA_OVER_ETH=m +CONFIG_VIRTIO_BLK=m +# CONFIG_VIRTIO_BLK_SCSI is not set +CONFIG_BLK_DEV_RBD=m +CONFIG_BLK_DEV_RSXX=m +CONFIG_NVME_CORE=m +CONFIG_BLK_DEV_NVME=m +CONFIG_NVME_FABRICS=m +CONFIG_NVME_RDMA=m +CONFIG_NVME_FC=m +CONFIG_NVME_TARGET=m +CONFIG_NVME_TARGET_LOOP=m +CONFIG_NVME_TARGET_RDMA=m +CONFIG_NVME_TARGET_FC=m +CONFIG_NVME_TARGET_FCLOOP=m + +# +# Misc devices +# 
+CONFIG_SENSORS_LIS3LV02D=m +CONFIG_AD525X_DPOT=m +CONFIG_AD525X_DPOT_I2C=m +CONFIG_AD525X_DPOT_SPI=m +# CONFIG_DUMMY_IRQ is not set +CONFIG_IBM_ASM=m +CONFIG_PHANTOM=m +CONFIG_SGI_IOC4=m +CONFIG_TIFM_CORE=m +CONFIG_TIFM_7XX1=m +CONFIG_ICS932S401=m +CONFIG_ENCLOSURE_SERVICES=m +CONFIG_HP_ILO=m +CONFIG_APDS9802ALS=m +CONFIG_ISL29003=m +CONFIG_ISL29020=m +CONFIG_SENSORS_TSL2550=m +CONFIG_SENSORS_BH1770=m +CONFIG_SENSORS_APDS990X=m +CONFIG_HMC6352=m +CONFIG_DS1682=m +CONFIG_TI_DAC7512=m +CONFIG_VMWARE_BALLOON=m +CONFIG_USB_SWITCH_FSA9480=m +CONFIG_LATTICE_ECP3_CONFIG=m +CONFIG_SRAM=y +CONFIG_PCI_ENDPOINT_TEST=m +CONFIG_C2PORT=m +CONFIG_C2PORT_DURAMAR_2150=m + +# +# EEPROM support +# +CONFIG_EEPROM_AT24=m +CONFIG_EEPROM_AT25=m +CONFIG_EEPROM_LEGACY=m +CONFIG_EEPROM_MAX6875=m +CONFIG_EEPROM_93CX6=m +CONFIG_EEPROM_93XX46=m +CONFIG_EEPROM_IDT_89HPESX=m +CONFIG_CB710_CORE=m +# CONFIG_CB710_DEBUG is not set +CONFIG_CB710_DEBUG_ASSUMPTIONS=y + +# +# Texas Instruments shared transport line discipline +# +CONFIG_TI_ST=m +CONFIG_SENSORS_LIS3_I2C=m + +# +# Altera FPGA firmware download module +# +CONFIG_ALTERA_STAPL=m +CONFIG_INTEL_MEI=y +CONFIG_INTEL_MEI_ME=y +CONFIG_INTEL_MEI_TXE=m +CONFIG_VMWARE_VMCI=m + +# +# Intel MIC Bus Driver +# +CONFIG_INTEL_MIC_BUS=m + +# +# SCIF Bus Driver +# +CONFIG_SCIF_BUS=m + +# +# VOP Bus Driver +# +CONFIG_VOP_BUS=m + +# +# Intel MIC Host Driver +# +CONFIG_INTEL_MIC_HOST=m + +# +# Intel MIC Card Driver +# +CONFIG_INTEL_MIC_CARD=m + +# +# SCIF Driver +# +CONFIG_SCIF=m + +# +# Intel MIC Coprocessor State Management (COSM) Drivers +# +CONFIG_MIC_COSM=m + +# +# VOP Driver +# +CONFIG_VOP=m +CONFIG_VHOST_RING=m +CONFIG_GENWQE=m +CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY=0 +CONFIG_ECHO=m +# CONFIG_CXL_BASE is not set +# CONFIG_CXL_AFU_DRIVER_OPS is not set +# CONFIG_CXL_LIB is not set +CONFIG_HAVE_IDE=y +# CONFIG_IDE is not set + +# +# SCSI device support +# +CONFIG_SCSI_MOD=m +CONFIG_RAID_ATTRS=m +CONFIG_SCSI=m +CONFIG_SCSI_DMA=y +CONFIG_SCSI_NETLINK=y +# CONFIG_SCSI_MQ_DEFAULT is not set +CONFIG_SCSI_PROC_FS=y + +# +# SCSI support type (disk, tape, CD-ROM) +# +CONFIG_BLK_DEV_SD=m +CONFIG_CHR_DEV_ST=m +CONFIG_CHR_DEV_OSST=m +CONFIG_BLK_DEV_SR=m +CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_CHR_DEV_SG=m +CONFIG_CHR_DEV_SCH=m +CONFIG_SCSI_ENCLOSURE=m +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_LOGGING=y +CONFIG_SCSI_SCAN_ASYNC=y + +# +# SCSI Transports +# +CONFIG_SCSI_SPI_ATTRS=m +CONFIG_SCSI_FC_ATTRS=m +CONFIG_SCSI_ISCSI_ATTRS=m +CONFIG_SCSI_SAS_ATTRS=m +CONFIG_SCSI_SAS_LIBSAS=m +CONFIG_SCSI_SAS_ATA=y +CONFIG_SCSI_SAS_HOST_SMP=y +CONFIG_SCSI_SRP_ATTRS=m +CONFIG_SCSI_LOWLEVEL=y +CONFIG_ISCSI_TCP=m +CONFIG_ISCSI_BOOT_SYSFS=m +CONFIG_SCSI_CXGB3_ISCSI=m +CONFIG_SCSI_CXGB4_ISCSI=m +CONFIG_SCSI_BNX2_ISCSI=m +CONFIG_SCSI_BNX2X_FCOE=m +CONFIG_BE2ISCSI=m +CONFIG_BLK_DEV_3W_XXXX_RAID=m +CONFIG_SCSI_HPSA=m +CONFIG_SCSI_3W_9XXX=m +CONFIG_SCSI_3W_SAS=m +CONFIG_SCSI_ACARD=m +CONFIG_SCSI_AACRAID=m +CONFIG_SCSI_AIC7XXX=m +CONFIG_AIC7XXX_CMDS_PER_DEVICE=4 +CONFIG_AIC7XXX_RESET_DELAY_MS=15000 +# CONFIG_AIC7XXX_DEBUG_ENABLE is not set +CONFIG_AIC7XXX_DEBUG_MASK=0 +# CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set +CONFIG_SCSI_AIC79XX=m +CONFIG_AIC79XX_CMDS_PER_DEVICE=4 +CONFIG_AIC79XX_RESET_DELAY_MS=15000 +# CONFIG_AIC79XX_DEBUG_ENABLE is not set +CONFIG_AIC79XX_DEBUG_MASK=0 +# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set +CONFIG_SCSI_AIC94XX=m +# CONFIG_AIC94XX_DEBUG is not set +CONFIG_SCSI_MVSAS=m +# CONFIG_SCSI_MVSAS_DEBUG is not set +CONFIG_SCSI_MVSAS_TASKLET=y +CONFIG_SCSI_MVUMI=m +CONFIG_SCSI_DPT_I2O=m 
+CONFIG_SCSI_ADVANSYS=m +CONFIG_SCSI_ARCMSR=m +CONFIG_SCSI_ESAS2R=m +CONFIG_MEGARAID_NEWGEN=y +CONFIG_MEGARAID_MM=m +CONFIG_MEGARAID_MAILBOX=m +CONFIG_MEGARAID_LEGACY=m +CONFIG_MEGARAID_SAS=m +CONFIG_SCSI_MPT3SAS=m +CONFIG_SCSI_MPT2SAS_MAX_SGE=128 +CONFIG_SCSI_MPT3SAS_MAX_SGE=128 +CONFIG_SCSI_MPT2SAS=m +CONFIG_SCSI_SMARTPQI=m +CONFIG_SCSI_UFSHCD=m +CONFIG_SCSI_UFSHCD_PCI=m +CONFIG_SCSI_UFS_DWC_TC_PCI=m +CONFIG_SCSI_UFSHCD_PLATFORM=m +CONFIG_SCSI_UFS_DWC_TC_PLATFORM=m +CONFIG_SCSI_HPTIOP=m +CONFIG_SCSI_BUSLOGIC=m +CONFIG_SCSI_FLASHPOINT=y +CONFIG_VMWARE_PVSCSI=m +CONFIG_HYPERV_STORAGE=m +CONFIG_LIBFC=m +CONFIG_LIBFCOE=m +CONFIG_FCOE=m +CONFIG_FCOE_FNIC=m +CONFIG_SCSI_SNIC=m +# CONFIG_SCSI_SNIC_DEBUG_FS is not set +CONFIG_SCSI_DMX3191D=m +CONFIG_SCSI_EATA=m +# CONFIG_SCSI_EATA_TAGGED_QUEUE is not set +# CONFIG_SCSI_EATA_LINKED_COMMANDS is not set +CONFIG_SCSI_EATA_MAX_TAGS=16 +CONFIG_SCSI_FUTURE_DOMAIN=m +CONFIG_SCSI_GDTH=m +CONFIG_SCSI_ISCI=m +CONFIG_SCSI_IPS=m +CONFIG_SCSI_INITIO=m +CONFIG_SCSI_INIA100=m +CONFIG_SCSI_PPA=m +CONFIG_SCSI_IMM=m +# CONFIG_SCSI_IZIP_EPP16 is not set +# CONFIG_SCSI_IZIP_SLOW_CTR is not set +CONFIG_SCSI_STEX=m +CONFIG_SCSI_SYM53C8XX_2=m +CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 +CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 +CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 +CONFIG_SCSI_SYM53C8XX_MMIO=y +CONFIG_SCSI_IPR=m +# CONFIG_SCSI_IPR_TRACE is not set +# CONFIG_SCSI_IPR_DUMP is not set +CONFIG_SCSI_QLOGIC_1280=m +CONFIG_SCSI_QLA_FC=m +CONFIG_TCM_QLA2XXX=m +# CONFIG_TCM_QLA2XXX_DEBUG is not set +CONFIG_SCSI_QLA_ISCSI=m +CONFIG_QEDI=m +CONFIG_QEDF=m +CONFIG_SCSI_LPFC=m +# CONFIG_SCSI_LPFC_DEBUG_FS is not set +CONFIG_SCSI_DC395x=m +CONFIG_SCSI_AM53C974=m +CONFIG_SCSI_WD719X=m +# CONFIG_SCSI_DEBUG is not set +CONFIG_SCSI_PMCRAID=m +CONFIG_SCSI_PM8001=m +CONFIG_SCSI_BFA_FC=m +CONFIG_SCSI_VIRTIO=m +CONFIG_SCSI_CHELSIO_FCOE=m +CONFIG_SCSI_LOWLEVEL_PCMCIA=y +CONFIG_PCMCIA_AHA152X=m +CONFIG_PCMCIA_FDOMAIN=m +CONFIG_PCMCIA_QLOGIC=m +CONFIG_PCMCIA_SYM53C500=m +CONFIG_SCSI_DH=y +CONFIG_SCSI_DH_RDAC=m +CONFIG_SCSI_DH_HP_SW=m +CONFIG_SCSI_DH_EMC=m +CONFIG_SCSI_DH_ALUA=m +CONFIG_SCSI_OSD_INITIATOR=m +CONFIG_SCSI_OSD_ULD=m +CONFIG_SCSI_OSD_DPRINT_SENSE=1 +# CONFIG_SCSI_OSD_DEBUG is not set +CONFIG_ATA=m +# CONFIG_ATA_NONSTANDARD is not set +CONFIG_ATA_VERBOSE_ERROR=y +CONFIG_ATA_ACPI=y +CONFIG_SATA_ZPODD=y +CONFIG_SATA_PMP=y + +# +# Controllers with non-SFF native interface +# +CONFIG_SATA_AHCI=m +CONFIG_SATA_AHCI_PLATFORM=m +CONFIG_SATA_INIC162X=m +CONFIG_SATA_ACARD_AHCI=m +CONFIG_SATA_SIL24=m +CONFIG_ATA_SFF=y + +# +# SFF controllers with custom DMA interface +# +CONFIG_PDC_ADMA=m +CONFIG_SATA_QSTOR=m +CONFIG_SATA_SX4=m +CONFIG_ATA_BMDMA=y + +# +# SATA SFF controllers with BMDMA +# +CONFIG_ATA_PIIX=m +CONFIG_SATA_DWC=m +# CONFIG_SATA_DWC_OLD_DMA is not set +# CONFIG_SATA_DWC_DEBUG is not set +CONFIG_SATA_MV=m +CONFIG_SATA_NV=m +CONFIG_SATA_PROMISE=m +CONFIG_SATA_SIL=m +CONFIG_SATA_SIS=m +CONFIG_SATA_SVW=m +CONFIG_SATA_ULI=m +CONFIG_SATA_VIA=m +CONFIG_SATA_VITESSE=m + +# +# PATA SFF controllers with BMDMA +# +CONFIG_PATA_ALI=m +CONFIG_PATA_AMD=m +CONFIG_PATA_ARTOP=m +CONFIG_PATA_ATIIXP=m +CONFIG_PATA_ATP867X=m +CONFIG_PATA_CMD64X=m +CONFIG_PATA_CYPRESS=m +CONFIG_PATA_EFAR=m +CONFIG_PATA_HPT366=m +CONFIG_PATA_HPT37X=m +CONFIG_PATA_HPT3X2N=m +CONFIG_PATA_HPT3X3=m +CONFIG_PATA_HPT3X3_DMA=y +CONFIG_PATA_IT8213=m +CONFIG_PATA_IT821X=m +CONFIG_PATA_JMICRON=m +CONFIG_PATA_MARVELL=m +CONFIG_PATA_NETCELL=m +CONFIG_PATA_NINJA32=m +CONFIG_PATA_NS87415=m +CONFIG_PATA_OLDPIIX=m +CONFIG_PATA_OPTIDMA=m 
+CONFIG_PATA_PDC2027X=m +CONFIG_PATA_PDC_OLD=m +CONFIG_PATA_RADISYS=m +CONFIG_PATA_RDC=m +CONFIG_PATA_SCH=m +CONFIG_PATA_SERVERWORKS=m +CONFIG_PATA_SIL680=m +CONFIG_PATA_SIS=m +CONFIG_PATA_TOSHIBA=m +CONFIG_PATA_TRIFLEX=m +CONFIG_PATA_VIA=m +CONFIG_PATA_WINBOND=m + +# +# PIO-only SFF controllers +# +CONFIG_PATA_CMD640_PCI=m +CONFIG_PATA_MPIIX=m +CONFIG_PATA_NS87410=m +CONFIG_PATA_OPTI=m +CONFIG_PATA_PCMCIA=m +CONFIG_PATA_RZ1000=m + +# +# Generic fallback / legacy drivers +# +CONFIG_PATA_ACPI=m +CONFIG_ATA_GENERIC=m +CONFIG_PATA_LEGACY=m +CONFIG_MD=y +CONFIG_BLK_DEV_MD=m +CONFIG_MD_LINEAR=m +CONFIG_MD_RAID0=m +CONFIG_MD_RAID1=m +CONFIG_MD_RAID10=m +CONFIG_MD_RAID456=m +CONFIG_MD_MULTIPATH=m +CONFIG_MD_FAULTY=m +CONFIG_MD_CLUSTER=m +CONFIG_BCACHE=m +# CONFIG_BCACHE_DEBUG is not set +# CONFIG_BCACHE_CLOSURES_DEBUG is not set +CONFIG_BLK_DEV_DM_BUILTIN=y +CONFIG_BLK_DEV_DM=m +# CONFIG_DM_MQ_DEFAULT is not set +# CONFIG_DM_DEBUG is not set +CONFIG_DM_BUFIO=m +# CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING is not set +CONFIG_DM_BIO_PRISON=m +CONFIG_DM_PERSISTENT_DATA=m +CONFIG_DM_CRYPT=m +CONFIG_DM_SNAPSHOT=m +CONFIG_DM_THIN_PROVISIONING=m +CONFIG_DM_CACHE=m +CONFIG_DM_CACHE_SMQ=m +CONFIG_DM_ERA=m +CONFIG_DM_MIRROR=m +CONFIG_DM_LOG_USERSPACE=m +CONFIG_DM_RAID=m +CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m +CONFIG_DM_MULTIPATH_QL=m +CONFIG_DM_MULTIPATH_ST=m +CONFIG_DM_DELAY=m +CONFIG_DM_UEVENT=y +CONFIG_DM_FLAKEY=m +CONFIG_DM_VERITY=m +# CONFIG_DM_VERITY_FEC is not set +CONFIG_DM_SWITCH=m +CONFIG_DM_LOG_WRITES=m +CONFIG_DM_INTEGRITY=m +CONFIG_DM_ZONED=m +CONFIG_TARGET_CORE=m +CONFIG_TCM_IBLOCK=m +CONFIG_TCM_FILEIO=m +CONFIG_TCM_PSCSI=m +CONFIG_TCM_USER2=m +CONFIG_LOOPBACK_TARGET=m +CONFIG_TCM_FC=m +CONFIG_ISCSI_TARGET=m +CONFIG_ISCSI_TARGET_CXGB4=m +CONFIG_SBP_TARGET=m +CONFIG_FUSION=y +CONFIG_FUSION_SPI=m +CONFIG_FUSION_FC=m +CONFIG_FUSION_SAS=m +CONFIG_FUSION_MAX_SGE=128 +CONFIG_FUSION_CTL=m +CONFIG_FUSION_LAN=m +CONFIG_FUSION_LOGGING=y + +# +# IEEE 1394 (FireWire) support +# +CONFIG_FIREWIRE=m +CONFIG_FIREWIRE_OHCI=m +CONFIG_FIREWIRE_SBP2=m +CONFIG_FIREWIRE_NET=m +CONFIG_FIREWIRE_NOSY=m +CONFIG_MACINTOSH_DRIVERS=y +CONFIG_MAC_EMUMOUSEBTN=m +CONFIG_NETDEVICES=y +CONFIG_MII=m +CONFIG_NET_CORE=y +CONFIG_BONDING=m +CONFIG_DUMMY=m +CONFIG_EQUALIZER=m +CONFIG_NET_FC=y +CONFIG_IFB=m +CONFIG_NET_TEAM=m +CONFIG_NET_TEAM_MODE_BROADCAST=m +CONFIG_NET_TEAM_MODE_ROUNDROBIN=m +CONFIG_NET_TEAM_MODE_RANDOM=m +CONFIG_NET_TEAM_MODE_ACTIVEBACKUP=m +CONFIG_NET_TEAM_MODE_LOADBALANCE=m +CONFIG_MACVLAN=m +CONFIG_MACVTAP=m +CONFIG_IPVLAN=m +CONFIG_IPVTAP=m +CONFIG_VXLAN=m +CONFIG_GENEVE=m +CONFIG_GTP=m +CONFIG_MACSEC=m +CONFIG_NETCONSOLE=m +CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETPOLL=y +CONFIG_NET_POLL_CONTROLLER=y +# CONFIG_NTB_NETDEV is not set +CONFIG_RIONET=m +CONFIG_RIONET_TX_SIZE=128 +CONFIG_RIONET_RX_SIZE=128 +CONFIG_TUN=m +CONFIG_TAP=m +# CONFIG_TUN_VNET_CROSS_LE is not set +CONFIG_VETH=m +CONFIG_VIRTIO_NET=m +CONFIG_NLMON=m +CONFIG_NET_VRF=m +CONFIG_VSOCKMON=m +CONFIG_SUNGEM_PHY=m +CONFIG_ARCNET=m +CONFIG_ARCNET_1201=m +CONFIG_ARCNET_1051=m +CONFIG_ARCNET_RAW=m +CONFIG_ARCNET_CAP=m +CONFIG_ARCNET_COM90xx=m +CONFIG_ARCNET_COM90xxIO=m +CONFIG_ARCNET_RIM_I=m +CONFIG_ARCNET_COM20020=m +CONFIG_ARCNET_COM20020_PCI=m +CONFIG_ARCNET_COM20020_CS=m +CONFIG_ATM_DRIVERS=y +# CONFIG_ATM_DUMMY is not set +CONFIG_ATM_TCP=m +CONFIG_ATM_LANAI=m +CONFIG_ATM_ENI=m +# CONFIG_ATM_ENI_DEBUG is not set +# CONFIG_ATM_ENI_TUNE_BURST is not set +CONFIG_ATM_FIRESTREAM=m +CONFIG_ATM_ZATM=m +# CONFIG_ATM_ZATM_DEBUG is not set +CONFIG_ATM_NICSTAR=m 
+CONFIG_ATM_NICSTAR_USE_SUNI=y +CONFIG_ATM_NICSTAR_USE_IDT77105=y +CONFIG_ATM_IDT77252=m +# CONFIG_ATM_IDT77252_DEBUG is not set +# CONFIG_ATM_IDT77252_RCV_ALL is not set +CONFIG_ATM_IDT77252_USE_SUNI=y +CONFIG_ATM_AMBASSADOR=m +# CONFIG_ATM_AMBASSADOR_DEBUG is not set +CONFIG_ATM_HORIZON=m +# CONFIG_ATM_HORIZON_DEBUG is not set +CONFIG_ATM_IA=m +# CONFIG_ATM_IA_DEBUG is not set +CONFIG_ATM_FORE200E=m +CONFIG_ATM_FORE200E_USE_TASKLET=y +CONFIG_ATM_FORE200E_TX_RETRY=16 +CONFIG_ATM_FORE200E_DEBUG=0 +CONFIG_ATM_HE=m +CONFIG_ATM_HE_USE_SUNI=y +CONFIG_ATM_SOLOS=m + +# +# CAIF transport drivers +# +CONFIG_CAIF_TTY=m +CONFIG_CAIF_SPI_SLAVE=m +CONFIG_CAIF_SPI_SYNC=y +CONFIG_CAIF_HSI=m +CONFIG_CAIF_VIRTIO=m + +# +# Distributed Switch Architecture drivers +# +CONFIG_B53=m +CONFIG_B53_SPI_DRIVER=m +CONFIG_B53_MDIO_DRIVER=m +CONFIG_B53_MMAP_DRIVER=m +CONFIG_B53_SRAB_DRIVER=m +CONFIG_NET_DSA_LOOP=m +CONFIG_NET_DSA_MT7530=m +CONFIG_NET_DSA_MV88E6060=m +CONFIG_MICROCHIP_KSZ=m +CONFIG_MICROCHIP_KSZ_SPI_DRIVER=m +CONFIG_NET_DSA_MV88E6XXX=m +CONFIG_NET_DSA_MV88E6XXX_GLOBAL2=y +CONFIG_NET_DSA_QCA8K=m +CONFIG_NET_DSA_SMSC_LAN9303=m +CONFIG_NET_DSA_SMSC_LAN9303_I2C=m +CONFIG_NET_DSA_SMSC_LAN9303_MDIO=m +CONFIG_ETHERNET=y +CONFIG_MDIO=m +CONFIG_NET_VENDOR_3COM=y +CONFIG_PCMCIA_3C574=m +CONFIG_PCMCIA_3C589=m +CONFIG_VORTEX=m +CONFIG_TYPHOON=m +CONFIG_NET_VENDOR_ADAPTEC=y +CONFIG_ADAPTEC_STARFIRE=m +CONFIG_NET_VENDOR_AGERE=y +CONFIG_ET131X=m +CONFIG_NET_VENDOR_ALACRITECH=y +CONFIG_SLICOSS=m +CONFIG_NET_VENDOR_ALTEON=y +CONFIG_ACENIC=m +# CONFIG_ACENIC_OMIT_TIGON_I is not set +CONFIG_ALTERA_TSE=m +CONFIG_NET_VENDOR_AMAZON=y +CONFIG_ENA_ETHERNET=m +CONFIG_NET_VENDOR_AMD=y +CONFIG_AMD8111_ETH=m +CONFIG_PCNET32=m +CONFIG_PCMCIA_NMCLAN=m +CONFIG_AMD_XGBE=m +CONFIG_AMD_XGBE_DCB=y +CONFIG_AMD_XGBE_HAVE_ECC=y +CONFIG_NET_VENDOR_AQUANTIA=y +CONFIG_AQTION=m +CONFIG_NET_VENDOR_ARC=y +CONFIG_NET_VENDOR_ATHEROS=y +CONFIG_ATL2=m +CONFIG_ATL1=m +CONFIG_ATL1E=m +CONFIG_ATL1C=m +CONFIG_ALX=m +CONFIG_NET_VENDOR_AURORA=y +CONFIG_AURORA_NB8800=m +CONFIG_NET_CADENCE=y +CONFIG_MACB=m +CONFIG_MACB_USE_HWSTAMP=y +CONFIG_MACB_PCI=m +CONFIG_NET_VENDOR_BROADCOM=y +CONFIG_B44=m +CONFIG_B44_PCI_AUTOSELECT=y +CONFIG_B44_PCICORE_AUTOSELECT=y +CONFIG_B44_PCI=y +CONFIG_BNX2=m +CONFIG_CNIC=m +CONFIG_TIGON3=m +CONFIG_TIGON3_HWMON=y +CONFIG_BNX2X=m +CONFIG_BNX2X_SRIOV=y +CONFIG_BNXT=m +CONFIG_BNXT_SRIOV=y +CONFIG_BNXT_FLOWER_OFFLOAD=y +CONFIG_BNXT_DCB=y +CONFIG_NET_VENDOR_BROCADE=y +CONFIG_BNA=m +CONFIG_NET_VENDOR_CAVIUM=y +CONFIG_THUNDER_NIC_PF=m +CONFIG_THUNDER_NIC_VF=m +CONFIG_THUNDER_NIC_BGX=m +CONFIG_THUNDER_NIC_RGX=m +CONFIG_LIQUIDIO=m +CONFIG_LIQUIDIO_VF=m +CONFIG_NET_VENDOR_CHELSIO=y +CONFIG_CHELSIO_T1=m +CONFIG_CHELSIO_T1_1G=y +CONFIG_CHELSIO_T3=m +CONFIG_CHELSIO_T4=m +CONFIG_CHELSIO_T4_DCB=y +# CONFIG_CHELSIO_T4_FCOE is not set +CONFIG_CHELSIO_T4VF=m +CONFIG_CHELSIO_LIB=m +CONFIG_NET_VENDOR_CISCO=y +CONFIG_ENIC=m +CONFIG_CX_ECAT=m +CONFIG_DNET=m +CONFIG_NET_VENDOR_DEC=y +CONFIG_NET_TULIP=y +CONFIG_DE2104X=m +CONFIG_DE2104X_DSL=0 +CONFIG_TULIP=m +CONFIG_TULIP_MWI=y +CONFIG_TULIP_MMIO=y +CONFIG_TULIP_NAPI=y +CONFIG_TULIP_NAPI_HW_MITIGATION=y +CONFIG_DE4X5=m +CONFIG_WINBOND_840=m +CONFIG_DM9102=m +CONFIG_ULI526X=m +CONFIG_PCMCIA_XIRCOM=m +CONFIG_NET_VENDOR_DLINK=y +CONFIG_DL2K=m +CONFIG_SUNDANCE=m +# CONFIG_SUNDANCE_MMIO is not set +CONFIG_NET_VENDOR_EMULEX=y +CONFIG_BE2NET=m +CONFIG_BE2NET_HWMON=y +CONFIG_NET_VENDOR_EZCHIP=y +CONFIG_NET_VENDOR_EXAR=y +CONFIG_S2IO=m +CONFIG_VXGE=m +# CONFIG_VXGE_DEBUG_TRACE_ALL is not set 
+CONFIG_NET_VENDOR_FUJITSU=y +CONFIG_PCMCIA_FMVJ18X=m +CONFIG_NET_VENDOR_HP=y +CONFIG_HP100=m +CONFIG_NET_VENDOR_HUAWEI=y +CONFIG_HINIC=m +CONFIG_NET_VENDOR_INTEL=y +CONFIG_E100=m +CONFIG_E1000=m +CONFIG_E1000E=m +CONFIG_E1000E_HWTS=y +CONFIG_IGB=m +CONFIG_IGB_HWMON=y +CONFIG_IGB_DCA=y +CONFIG_IGBVF=m +CONFIG_IXGB=m +CONFIG_IXGBE=m +CONFIG_IXGBE_HWMON=y +CONFIG_IXGBE_DCA=y +CONFIG_IXGBE_DCB=y +CONFIG_IXGBEVF=m +CONFIG_I40E=m +CONFIG_I40E_DCB=y +CONFIG_I40EVF=m +CONFIG_FM10K=m +CONFIG_NET_VENDOR_I825XX=y +CONFIG_JME=m +CONFIG_NET_VENDOR_MARVELL=y +CONFIG_MVMDIO=m +CONFIG_SKGE=m +# CONFIG_SKGE_DEBUG is not set +CONFIG_SKGE_GENESIS=y +CONFIG_SKY2=m +# CONFIG_SKY2_DEBUG is not set +CONFIG_NET_VENDOR_MELLANOX=y +CONFIG_MLX4_EN=m +CONFIG_MLX4_EN_DCB=y +CONFIG_MLX4_CORE=m +CONFIG_MLX4_DEBUG=y +CONFIG_MLX5_CORE=m +CONFIG_MLX5_ACCEL=y +CONFIG_MLX5_FPGA=y +# CONFIG_MLX5_CORE_EN is not set +CONFIG_MLXSW_CORE=m +CONFIG_MLXSW_CORE_HWMON=y +CONFIG_MLXSW_CORE_THERMAL=y +CONFIG_MLXSW_PCI=m +CONFIG_MLXSW_I2C=m +CONFIG_MLXSW_SWITCHIB=m +CONFIG_MLXSW_SWITCHX2=m +CONFIG_MLXSW_SPECTRUM=m +CONFIG_MLXSW_SPECTRUM_DCB=y +CONFIG_MLXSW_MINIMAL=m +CONFIG_MLXFW=m +CONFIG_NET_VENDOR_MICREL=y +CONFIG_KS8842=m +CONFIG_KS8851=m +CONFIG_KS8851_MLL=m +CONFIG_KSZ884X_PCI=m +CONFIG_NET_VENDOR_MICROCHIP=y +CONFIG_ENC28J60=m +# CONFIG_ENC28J60_WRITEVERIFY is not set +CONFIG_ENCX24J600=m +CONFIG_NET_VENDOR_MYRI=y +CONFIG_MYRI10GE=m +CONFIG_MYRI10GE_DCA=y +CONFIG_FEALNX=m +CONFIG_NET_VENDOR_NATSEMI=y +CONFIG_NATSEMI=m +CONFIG_NS83820=m +CONFIG_NET_VENDOR_NETRONOME=y +CONFIG_NFP=m +# CONFIG_NFP_APP_FLOWER is not set +# CONFIG_NFP_DEBUG is not set +CONFIG_NET_VENDOR_8390=y +CONFIG_PCMCIA_AXNET=m +CONFIG_NE2K_PCI=m +CONFIG_PCMCIA_PCNET=m +CONFIG_NET_VENDOR_NVIDIA=y +CONFIG_FORCEDETH=m +CONFIG_NET_VENDOR_OKI=y +CONFIG_ETHOC=m +CONFIG_NET_PACKET_ENGINE=y +CONFIG_HAMACHI=m +CONFIG_YELLOWFIN=m +CONFIG_NET_VENDOR_QLOGIC=y +CONFIG_QLA3XXX=m +CONFIG_QLCNIC=m +CONFIG_QLCNIC_SRIOV=y +CONFIG_QLCNIC_DCB=y +CONFIG_QLCNIC_HWMON=y +CONFIG_QLGE=m +CONFIG_NETXEN_NIC=m +CONFIG_QED=m +CONFIG_QED_LL2=y +CONFIG_QED_SRIOV=y +CONFIG_QEDE=m +CONFIG_QED_RDMA=y +CONFIG_QED_ISCSI=y +CONFIG_QED_FCOE=y +CONFIG_NET_VENDOR_QUALCOMM=y +CONFIG_QCOM_EMAC=m +CONFIG_RMNET=m +CONFIG_NET_VENDOR_REALTEK=y +CONFIG_ATP=m +CONFIG_8139CP=m +CONFIG_8139TOO=m +# CONFIG_8139TOO_PIO is not set +CONFIG_8139TOO_TUNE_TWISTER=y +CONFIG_8139TOO_8129=y +# CONFIG_8139_OLD_RX_RESET is not set +CONFIG_R8169=m +CONFIG_NET_VENDOR_RENESAS=y +CONFIG_NET_VENDOR_RDC=y +CONFIG_R6040=m +CONFIG_NET_VENDOR_ROCKER=y +CONFIG_ROCKER=m +CONFIG_NET_VENDOR_SAMSUNG=y +CONFIG_SXGBE_ETH=m +CONFIG_NET_VENDOR_SEEQ=y +CONFIG_NET_VENDOR_SILAN=y +CONFIG_SC92031=m +CONFIG_NET_VENDOR_SIS=y +CONFIG_SIS900=m +CONFIG_SIS190=m +CONFIG_NET_VENDOR_SOLARFLARE=y +CONFIG_SFC=m +CONFIG_SFC_MTD=y +CONFIG_SFC_MCDI_MON=y +CONFIG_SFC_SRIOV=y +CONFIG_SFC_MCDI_LOGGING=y +CONFIG_SFC_FALCON=m +CONFIG_SFC_FALCON_MTD=y +CONFIG_NET_VENDOR_SMSC=y +CONFIG_PCMCIA_SMC91C92=m +CONFIG_EPIC100=m +CONFIG_SMSC911X=m +# CONFIG_SMSC911X_ARCH_HOOKS is not set +CONFIG_SMSC9420=m +CONFIG_NET_VENDOR_STMICRO=y +CONFIG_STMMAC_ETH=m +CONFIG_STMMAC_PLATFORM=m +CONFIG_DWMAC_GENERIC=m +CONFIG_STMMAC_PCI=m +CONFIG_NET_VENDOR_SUN=y +CONFIG_HAPPYMEAL=m +CONFIG_SUNGEM=m +CONFIG_CASSINI=m +CONFIG_NIU=m +CONFIG_NET_VENDOR_TEHUTI=y +CONFIG_TEHUTI=m +CONFIG_NET_VENDOR_TI=y +CONFIG_TI_CPSW_ALE=m +CONFIG_TLAN=m +CONFIG_NET_VENDOR_VIA=y +CONFIG_VIA_RHINE=m +CONFIG_VIA_RHINE_MMIO=y +CONFIG_VIA_VELOCITY=m +CONFIG_NET_VENDOR_WIZNET=y +CONFIG_WIZNET_W5100=m 
+CONFIG_WIZNET_W5300=m +# CONFIG_WIZNET_BUS_DIRECT is not set +# CONFIG_WIZNET_BUS_INDIRECT is not set +CONFIG_WIZNET_BUS_ANY=y +CONFIG_WIZNET_W5100_SPI=m +CONFIG_NET_VENDOR_XIRCOM=y +CONFIG_PCMCIA_XIRC2PS=m +CONFIG_NET_VENDOR_SYNOPSYS=y +CONFIG_DWC_XLGMAC=m +CONFIG_DWC_XLGMAC_PCI=m +CONFIG_FDDI=m +CONFIG_DEFXX=m +# CONFIG_DEFXX_MMIO is not set +CONFIG_SKFP=m +CONFIG_HIPPI=y +CONFIG_ROADRUNNER=m +CONFIG_ROADRUNNER_LARGE_RINGS=y +CONFIG_NET_SB1000=m +CONFIG_MDIO_DEVICE=m +CONFIG_MDIO_BUS=m +CONFIG_MDIO_BITBANG=m +CONFIG_MDIO_CAVIUM=m +CONFIG_MDIO_GPIO=m +CONFIG_MDIO_THUNDER=m +CONFIG_PHYLIB=m +CONFIG_SWPHY=y +CONFIG_LED_TRIGGER_PHY=y + +# +# MII PHY device drivers +# +CONFIG_AMD_PHY=m +CONFIG_AQUANTIA_PHY=m +CONFIG_AT803X_PHY=m +CONFIG_BCM7XXX_PHY=m +CONFIG_BCM87XX_PHY=m +CONFIG_BCM_NET_PHYLIB=m +CONFIG_BROADCOM_PHY=m +CONFIG_CICADA_PHY=m +CONFIG_CORTINA_PHY=m +CONFIG_DAVICOM_PHY=m +CONFIG_DP83848_PHY=m +CONFIG_DP83867_PHY=m +CONFIG_FIXED_PHY=m +CONFIG_ICPLUS_PHY=m +CONFIG_INTEL_XWAY_PHY=m +CONFIG_LSI_ET1011C_PHY=m +CONFIG_LXT_PHY=m +CONFIG_MARVELL_PHY=m +CONFIG_MARVELL_10G_PHY=m +CONFIG_MICREL_PHY=m +CONFIG_MICROCHIP_PHY=m +CONFIG_MICROSEMI_PHY=m +CONFIG_NATIONAL_PHY=m +CONFIG_QSEMI_PHY=m +CONFIG_REALTEK_PHY=m +CONFIG_ROCKCHIP_PHY=m +CONFIG_SMSC_PHY=m +CONFIG_STE10XP=m +CONFIG_TERANETICS_PHY=m +CONFIG_VITESSE_PHY=m +CONFIG_XILINX_GMII2RGMII=m +CONFIG_MICREL_KS8995MA=m +CONFIG_PLIP=m +CONFIG_PPP=m +CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_FILTER=y +CONFIG_PPP_MPPE=m +CONFIG_PPP_MULTILINK=y +CONFIG_PPPOATM=m +CONFIG_PPPOE=m +CONFIG_PPTP=m +CONFIG_PPPOL2TP=m +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m +CONFIG_SLIP=m +CONFIG_SLHC=m +CONFIG_SLIP_COMPRESSED=y +CONFIG_SLIP_SMART=y +CONFIG_SLIP_MODE_SLIP6=y + +# +# Host-side USB support is needed for USB Network Adapter support +# +CONFIG_USB_NET_DRIVERS=m +CONFIG_USB_CATC=m +CONFIG_USB_KAWETH=m +CONFIG_USB_PEGASUS=m +CONFIG_USB_RTL8150=m +CONFIG_USB_RTL8152=m +CONFIG_USB_LAN78XX=m +CONFIG_USB_USBNET=m +CONFIG_USB_NET_AX8817X=m +CONFIG_USB_NET_AX88179_178A=m +CONFIG_USB_NET_CDCETHER=m +CONFIG_USB_NET_CDC_EEM=m +CONFIG_USB_NET_CDC_NCM=m +CONFIG_USB_NET_HUAWEI_CDC_NCM=m +CONFIG_USB_NET_CDC_MBIM=m +CONFIG_USB_NET_DM9601=m +CONFIG_USB_NET_SR9700=m +CONFIG_USB_NET_SR9800=m +CONFIG_USB_NET_SMSC75XX=m +CONFIG_USB_NET_SMSC95XX=m +CONFIG_USB_NET_GL620A=m +CONFIG_USB_NET_NET1080=m +CONFIG_USB_NET_PLUSB=m +CONFIG_USB_NET_MCS7830=m +CONFIG_USB_NET_RNDIS_HOST=m +CONFIG_USB_NET_CDC_SUBSET_ENABLE=m +CONFIG_USB_NET_CDC_SUBSET=m +CONFIG_USB_ALI_M5632=y +CONFIG_USB_AN2720=y +CONFIG_USB_BELKIN=y +CONFIG_USB_ARMLINUX=y +CONFIG_USB_EPSON2888=y +CONFIG_USB_KC2190=y +CONFIG_USB_NET_ZAURUS=m +CONFIG_USB_NET_CX82310_ETH=m +CONFIG_USB_NET_KALMIA=m +CONFIG_USB_NET_QMI_WWAN=m +CONFIG_USB_HSO=m +CONFIG_USB_NET_INT51X1=m +CONFIG_USB_CDC_PHONET=m +CONFIG_USB_IPHETH=m +CONFIG_USB_SIERRA_NET=m +CONFIG_USB_VL600=m +CONFIG_USB_NET_CH9200=m +CONFIG_WLAN=y +CONFIG_WLAN_VENDOR_ADMTEK=y +CONFIG_ADM8211=m +CONFIG_ATH_COMMON=m +CONFIG_WLAN_VENDOR_ATH=y +# CONFIG_ATH_DEBUG is not set +CONFIG_ATH5K=m +# CONFIG_ATH5K_DEBUG is not set +# CONFIG_ATH5K_TRACER is not set +CONFIG_ATH5K_PCI=y +CONFIG_ATH9K_HW=m +CONFIG_ATH9K_COMMON=m +CONFIG_ATH9K_BTCOEX_SUPPORT=y +CONFIG_ATH9K=m +CONFIG_ATH9K_PCI=y +CONFIG_ATH9K_AHB=y +# CONFIG_ATH9K_DEBUGFS is not set +CONFIG_ATH9K_DYNACK=y +CONFIG_ATH9K_WOW=y +CONFIG_ATH9K_RFKILL=y +CONFIG_ATH9K_CHANNEL_CONTEXT=y +CONFIG_ATH9K_PCOEM=y +CONFIG_ATH9K_HTC=m +# CONFIG_ATH9K_HTC_DEBUGFS is not set +CONFIG_ATH9K_HWRNG=y +CONFIG_CARL9170=m 
+CONFIG_CARL9170_LEDS=y +CONFIG_CARL9170_WPC=y +# CONFIG_CARL9170_HWRNG is not set +CONFIG_ATH6KL=m +CONFIG_ATH6KL_SDIO=m +CONFIG_ATH6KL_USB=m +# CONFIG_ATH6KL_DEBUG is not set +# CONFIG_ATH6KL_TRACING is not set +CONFIG_AR5523=m +CONFIG_WIL6210=m +CONFIG_WIL6210_ISR_COR=y +CONFIG_WIL6210_TRACING=y +CONFIG_WIL6210_DEBUGFS=y +CONFIG_ATH10K=m +CONFIG_ATH10K_PCI=m +CONFIG_ATH10K_SDIO=m +CONFIG_ATH10K_USB=m +# CONFIG_ATH10K_DEBUG is not set +# CONFIG_ATH10K_DEBUGFS is not set +# CONFIG_ATH10K_TRACING is not set +CONFIG_WCN36XX=m +# CONFIG_WCN36XX_DEBUGFS is not set +CONFIG_WLAN_VENDOR_ATMEL=y +CONFIG_ATMEL=m +CONFIG_PCI_ATMEL=m +CONFIG_PCMCIA_ATMEL=m +CONFIG_AT76C50X_USB=m +CONFIG_WLAN_VENDOR_BROADCOM=y +CONFIG_B43=m +CONFIG_B43_BCMA=y +CONFIG_B43_SSB=y +CONFIG_B43_BUSES_BCMA_AND_SSB=y +# CONFIG_B43_BUSES_BCMA is not set +# CONFIG_B43_BUSES_SSB is not set +CONFIG_B43_PCI_AUTOSELECT=y +CONFIG_B43_PCICORE_AUTOSELECT=y +CONFIG_B43_SDIO=y +CONFIG_B43_BCMA_PIO=y +CONFIG_B43_PIO=y +CONFIG_B43_PHY_G=y +CONFIG_B43_PHY_N=y +CONFIG_B43_PHY_LP=y +CONFIG_B43_PHY_HT=y +CONFIG_B43_LEDS=y +CONFIG_B43_HWRNG=y +# CONFIG_B43_DEBUG is not set +CONFIG_B43LEGACY=m +CONFIG_B43LEGACY_PCI_AUTOSELECT=y +CONFIG_B43LEGACY_PCICORE_AUTOSELECT=y +CONFIG_B43LEGACY_LEDS=y +CONFIG_B43LEGACY_HWRNG=y +# CONFIG_B43LEGACY_DEBUG is not set +CONFIG_B43LEGACY_DMA=y +CONFIG_B43LEGACY_PIO=y +CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y +# CONFIG_B43LEGACY_DMA_MODE is not set +# CONFIG_B43LEGACY_PIO_MODE is not set +CONFIG_BRCMUTIL=m +CONFIG_BRCMSMAC=m +CONFIG_BRCMFMAC=m +CONFIG_BRCMFMAC_PROTO_BCDC=y +CONFIG_BRCMFMAC_PROTO_MSGBUF=y +CONFIG_BRCMFMAC_SDIO=y +CONFIG_BRCMFMAC_USB=y +CONFIG_BRCMFMAC_PCIE=y +CONFIG_BRCM_TRACING=y +# CONFIG_BRCMDBG is not set +CONFIG_WLAN_VENDOR_CISCO=y +CONFIG_AIRO=m +CONFIG_AIRO_CS=m +CONFIG_WLAN_VENDOR_INTEL=y +CONFIG_IPW2100=m +CONFIG_IPW2100_MONITOR=y +# CONFIG_IPW2100_DEBUG is not set +CONFIG_IPW2200=m +CONFIG_IPW2200_MONITOR=y +CONFIG_IPW2200_RADIOTAP=y +CONFIG_IPW2200_PROMISCUOUS=y +CONFIG_IPW2200_QOS=y +# CONFIG_IPW2200_DEBUG is not set +CONFIG_LIBIPW=m +# CONFIG_LIBIPW_DEBUG is not set +CONFIG_IWLEGACY=m +CONFIG_IWL4965=m +CONFIG_IWL3945=m + +# +# iwl3945 / iwl4965 Debugging Options +# +# CONFIG_IWLEGACY_DEBUG is not set +CONFIG_IWLWIFI=m +CONFIG_IWLWIFI_LEDS=y +CONFIG_IWLDVM=m +CONFIG_IWLMVM=m +CONFIG_IWLWIFI_OPMODE_MODULAR=y +CONFIG_IWLWIFI_BCAST_FILTERING=y + +# +# Debugging Options +# +# CONFIG_IWLWIFI_DEBUG is not set +CONFIG_IWLWIFI_DEVICE_TRACING=y +CONFIG_WLAN_VENDOR_INTERSIL=y +CONFIG_HOSTAP=m +CONFIG_HOSTAP_FIRMWARE=y +CONFIG_HOSTAP_FIRMWARE_NVRAM=y +CONFIG_HOSTAP_PLX=m +CONFIG_HOSTAP_PCI=m +CONFIG_HOSTAP_CS=m +CONFIG_HERMES=m +CONFIG_HERMES_PRISM=y +CONFIG_HERMES_CACHE_FW_ON_INIT=y +CONFIG_PLX_HERMES=m +CONFIG_TMD_HERMES=m +CONFIG_NORTEL_HERMES=m +CONFIG_PCI_HERMES=m +CONFIG_PCMCIA_HERMES=m +CONFIG_PCMCIA_SPECTRUM=m +CONFIG_ORINOCO_USB=m +CONFIG_P54_COMMON=m +CONFIG_P54_USB=m +CONFIG_P54_PCI=m +CONFIG_P54_SPI=m +CONFIG_P54_SPI_DEFAULT_EEPROM=y +CONFIG_P54_LEDS=y +CONFIG_PRISM54=m +CONFIG_WLAN_VENDOR_MARVELL=y +CONFIG_LIBERTAS=m +CONFIG_LIBERTAS_USB=m +CONFIG_LIBERTAS_CS=m +CONFIG_LIBERTAS_SDIO=m +CONFIG_LIBERTAS_SPI=m +# CONFIG_LIBERTAS_DEBUG is not set +CONFIG_LIBERTAS_MESH=y +CONFIG_LIBERTAS_THINFIRM=m +# CONFIG_LIBERTAS_THINFIRM_DEBUG is not set +CONFIG_LIBERTAS_THINFIRM_USB=m +CONFIG_MWIFIEX=m +CONFIG_MWIFIEX_SDIO=m +CONFIG_MWIFIEX_PCIE=m +CONFIG_MWIFIEX_USB=m +CONFIG_MWL8K=m +CONFIG_WLAN_VENDOR_MEDIATEK=y +CONFIG_MT7601U=m +CONFIG_WLAN_VENDOR_RALINK=y +CONFIG_RT2X00=m +CONFIG_RT2400PCI=m 
+CONFIG_RT2500PCI=m +CONFIG_RT61PCI=m +CONFIG_RT2800PCI=m +CONFIG_RT2800PCI_RT33XX=y +CONFIG_RT2800PCI_RT35XX=y +CONFIG_RT2800PCI_RT53XX=y +CONFIG_RT2800PCI_RT3290=y +CONFIG_RT2500USB=m +CONFIG_RT73USB=m +CONFIG_RT2800USB=m +CONFIG_RT2800USB_RT33XX=y +CONFIG_RT2800USB_RT35XX=y +CONFIG_RT2800USB_RT3573=y +CONFIG_RT2800USB_RT53XX=y +CONFIG_RT2800USB_RT55XX=y +CONFIG_RT2800USB_UNKNOWN=y +CONFIG_RT2800_LIB=m +CONFIG_RT2800_LIB_MMIO=m +CONFIG_RT2X00_LIB_MMIO=m +CONFIG_RT2X00_LIB_PCI=m +CONFIG_RT2X00_LIB_USB=m +CONFIG_RT2X00_LIB=m +CONFIG_RT2X00_LIB_FIRMWARE=y +CONFIG_RT2X00_LIB_CRYPTO=y +CONFIG_RT2X00_LIB_LEDS=y +# CONFIG_RT2X00_DEBUG is not set +CONFIG_WLAN_VENDOR_REALTEK=y +CONFIG_RTL8180=m +CONFIG_RTL8187=m +CONFIG_RTL8187_LEDS=y +CONFIG_RTL_CARDS=m +CONFIG_RTL8192CE=m +CONFIG_RTL8192SE=m +CONFIG_RTL8192DE=m +CONFIG_RTL8723AE=m +CONFIG_RTL8723BE=m +CONFIG_RTL8188EE=m +CONFIG_RTL8192EE=m +CONFIG_RTL8821AE=m +CONFIG_RTL8192CU=m +CONFIG_RTLWIFI=m +CONFIG_RTLWIFI_PCI=m +CONFIG_RTLWIFI_USB=m +# CONFIG_RTLWIFI_DEBUG is not set +CONFIG_RTL8192C_COMMON=m +CONFIG_RTL8723_COMMON=m +CONFIG_RTLBTCOEXIST=m +CONFIG_RTL8XXXU=m +CONFIG_RTL8XXXU_UNTESTED=y +CONFIG_WLAN_VENDOR_RSI=y +CONFIG_RSI_91X=m +# CONFIG_RSI_DEBUGFS is not set +CONFIG_RSI_SDIO=m +CONFIG_RSI_USB=m +CONFIG_WLAN_VENDOR_ST=y +CONFIG_CW1200=m +CONFIG_CW1200_WLAN_SDIO=m +CONFIG_CW1200_WLAN_SPI=m +CONFIG_WLAN_VENDOR_TI=y +CONFIG_WL1251=m +CONFIG_WL1251_SPI=m +CONFIG_WL1251_SDIO=m +CONFIG_WL12XX=m +CONFIG_WL18XX=m +CONFIG_WLCORE=m +CONFIG_WLCORE_SDIO=m +CONFIG_WILINK_PLATFORM_DATA=y +CONFIG_WLAN_VENDOR_ZYDAS=y +CONFIG_USB_ZD1201=m +CONFIG_ZD1211RW=m +# CONFIG_ZD1211RW_DEBUG is not set +CONFIG_WLAN_VENDOR_QUANTENNA=y +CONFIG_QTNFMAC=m +CONFIG_QTNFMAC_PEARL_PCIE=m +CONFIG_PCMCIA_RAYCS=m +CONFIG_PCMCIA_WL3501=m +# CONFIG_MAC80211_HWSIM is not set +CONFIG_USB_NET_RNDIS_WLAN=m + +# +# WiMAX Wireless Broadband devices +# +CONFIG_WIMAX_I2400M=m +CONFIG_WIMAX_I2400M_USB=m +CONFIG_WIMAX_I2400M_DEBUG_LEVEL=8 +CONFIG_WAN=y +CONFIG_LANMEDIA=m +CONFIG_HDLC=m +CONFIG_HDLC_RAW=m +CONFIG_HDLC_RAW_ETH=m +CONFIG_HDLC_CISCO=m +CONFIG_HDLC_FR=m +CONFIG_HDLC_PPP=m +CONFIG_HDLC_X25=m +CONFIG_PCI200SYN=m +CONFIG_WANXL=m +CONFIG_PC300TOO=m +CONFIG_FARSYNC=m +CONFIG_DSCC4=m +CONFIG_DSCC4_PCISYNC=y +CONFIG_DSCC4_PCI_RST=y +CONFIG_DLCI=m +CONFIG_DLCI_MAX=8 +CONFIG_LAPBETHER=m +CONFIG_X25_ASY=m +CONFIG_SBNI=m +CONFIG_SBNI_MULTILINE=y +CONFIG_IEEE802154_DRIVERS=m +CONFIG_IEEE802154_FAKELB=m +CONFIG_IEEE802154_AT86RF230=m +# CONFIG_IEEE802154_AT86RF230_DEBUGFS is not set +CONFIG_IEEE802154_MRF24J40=m +CONFIG_IEEE802154_CC2520=m +CONFIG_IEEE802154_ATUSB=m +CONFIG_IEEE802154_ADF7242=m +CONFIG_IEEE802154_CA8210=m +# CONFIG_IEEE802154_CA8210_DEBUGFS is not set +CONFIG_VMXNET3=m +CONFIG_FUJITSU_ES=m +CONFIG_HYPERV_NET=m +CONFIG_ISDN=y +CONFIG_ISDN_I4L=m +CONFIG_ISDN_PPP=y +CONFIG_ISDN_PPP_VJ=y +CONFIG_ISDN_MPP=y +CONFIG_IPPP_FILTER=y +CONFIG_ISDN_PPP_BSDCOMP=m +CONFIG_ISDN_AUDIO=y +CONFIG_ISDN_TTY_FAX=y +CONFIG_ISDN_X25=y + +# +# ISDN feature submodules +# +CONFIG_ISDN_DIVERSION=m + +# +# ISDN4Linux hardware drivers +# + +# +# Passive cards +# +CONFIG_ISDN_DRV_HISAX=m + +# +# D-channel protocol features +# +CONFIG_HISAX_EURO=y +CONFIG_DE_AOC=y +# CONFIG_HISAX_NO_SENDCOMPLETE is not set +# CONFIG_HISAX_NO_LLC is not set +# CONFIG_HISAX_NO_KEYPAD is not set +CONFIG_HISAX_1TR6=y +CONFIG_HISAX_NI1=y +CONFIG_HISAX_MAX_CARDS=8 + +# +# HiSax supported cards +# +CONFIG_HISAX_16_3=y +CONFIG_HISAX_TELESPCI=y +CONFIG_HISAX_S0BOX=y +CONFIG_HISAX_FRITZPCI=y +CONFIG_HISAX_AVM_A1_PCMCIA=y 
+CONFIG_HISAX_ELSA=y +CONFIG_HISAX_DIEHLDIVA=y +CONFIG_HISAX_SEDLBAUER=y +CONFIG_HISAX_NETJET=y +CONFIG_HISAX_NETJET_U=y +CONFIG_HISAX_NICCY=y +CONFIG_HISAX_BKM_A4T=y +CONFIG_HISAX_SCT_QUADRO=y +CONFIG_HISAX_GAZEL=y +CONFIG_HISAX_HFC_PCI=y +CONFIG_HISAX_W6692=y +CONFIG_HISAX_HFC_SX=y +CONFIG_HISAX_ENTERNOW_PCI=y +# CONFIG_HISAX_DEBUG is not set + +# +# HiSax PCMCIA card service modules +# +CONFIG_HISAX_SEDLBAUER_CS=m +CONFIG_HISAX_ELSA_CS=m +CONFIG_HISAX_AVM_A1_CS=m +CONFIG_HISAX_TELES_CS=m + +# +# HiSax sub driver modules +# +CONFIG_HISAX_ST5481=m +CONFIG_HISAX_HFCUSB=m +CONFIG_HISAX_HFC4S8S=m +CONFIG_HISAX_FRITZ_PCIPNP=m +CONFIG_ISDN_CAPI=m +CONFIG_CAPI_TRACE=y +CONFIG_ISDN_CAPI_CAPI20=m +CONFIG_ISDN_CAPI_MIDDLEWARE=y +CONFIG_ISDN_CAPI_CAPIDRV=m +# CONFIG_ISDN_CAPI_CAPIDRV_VERBOSE is not set + +# +# CAPI hardware drivers +# +CONFIG_CAPI_AVM=y +CONFIG_ISDN_DRV_AVMB1_B1PCI=m +CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y +CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m +CONFIG_ISDN_DRV_AVMB1_AVM_CS=m +CONFIG_ISDN_DRV_AVMB1_T1PCI=m +CONFIG_ISDN_DRV_AVMB1_C4=m +CONFIG_CAPI_EICON=y +CONFIG_ISDN_DIVAS=m +CONFIG_ISDN_DIVAS_BRIPCI=y +CONFIG_ISDN_DIVAS_PRIPCI=y +CONFIG_ISDN_DIVAS_DIVACAPI=m +CONFIG_ISDN_DIVAS_USERIDI=m +CONFIG_ISDN_DIVAS_MAINT=m +CONFIG_ISDN_DRV_GIGASET=m +CONFIG_GIGASET_CAPI=y +# CONFIG_GIGASET_I4L is not set +# CONFIG_GIGASET_DUMMYLL is not set +CONFIG_GIGASET_BASE=m +CONFIG_GIGASET_M105=m +CONFIG_GIGASET_M101=m +# CONFIG_GIGASET_DEBUG is not set +CONFIG_HYSDN=m +CONFIG_HYSDN_CAPI=y +CONFIG_MISDN=m +CONFIG_MISDN_DSP=m +CONFIG_MISDN_L1OIP=m + +# +# mISDN hardware drivers +# +CONFIG_MISDN_HFCPCI=m +CONFIG_MISDN_HFCMULTI=m +CONFIG_MISDN_HFCUSB=m +CONFIG_MISDN_AVMFRITZ=m +CONFIG_MISDN_SPEEDFAX=m +CONFIG_MISDN_INFINEON=m +CONFIG_MISDN_W6692=m +CONFIG_MISDN_NETJET=m +CONFIG_MISDN_IPAC=m +CONFIG_MISDN_ISAR=m +CONFIG_ISDN_HDLC=m +CONFIG_NVM=y +# CONFIG_NVM_DEBUG is not set +CONFIG_NVM_RRPC=m +CONFIG_NVM_PBLK=m + +# +# Input device support +# +CONFIG_INPUT=y +CONFIG_INPUT_LEDS=y +CONFIG_INPUT_FF_MEMLESS=m +CONFIG_INPUT_POLLDEV=m +CONFIG_INPUT_SPARSEKMAP=m +CONFIG_INPUT_MATRIXKMAP=m + +# +# Userland interfaces +# +CONFIG_INPUT_MOUSEDEV=y +CONFIG_INPUT_MOUSEDEV_PSAUX=y +CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 +CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 +CONFIG_INPUT_JOYDEV=m +CONFIG_INPUT_EVDEV=m +# CONFIG_INPUT_EVBUG is not set + +# +# Input Device Drivers +# +CONFIG_INPUT_KEYBOARD=y +CONFIG_KEYBOARD_ADC=m +CONFIG_KEYBOARD_ADP5588=m +CONFIG_KEYBOARD_ADP5589=m +CONFIG_KEYBOARD_ATKBD=y +CONFIG_KEYBOARD_QT1070=m +CONFIG_KEYBOARD_QT2160=m +CONFIG_KEYBOARD_DLINK_DIR685=m +CONFIG_KEYBOARD_LKKBD=m +CONFIG_KEYBOARD_GPIO=m +CONFIG_KEYBOARD_GPIO_POLLED=m +CONFIG_KEYBOARD_TCA6416=m +CONFIG_KEYBOARD_TCA8418=m +CONFIG_KEYBOARD_MATRIX=m +CONFIG_KEYBOARD_LM8323=m +CONFIG_KEYBOARD_LM8333=m +CONFIG_KEYBOARD_MAX7359=m +CONFIG_KEYBOARD_MCS=m +CONFIG_KEYBOARD_MPR121=m +CONFIG_KEYBOARD_NEWTON=m +CONFIG_KEYBOARD_OPENCORES=m +CONFIG_KEYBOARD_SAMSUNG=m +CONFIG_KEYBOARD_STOWAWAY=m +CONFIG_KEYBOARD_SUNKBD=m +CONFIG_KEYBOARD_TM2_TOUCHKEY=m +CONFIG_KEYBOARD_XTKBD=m +CONFIG_KEYBOARD_CROS_EC=m +CONFIG_INPUT_MOUSE=y +CONFIG_MOUSE_PS2=m +CONFIG_MOUSE_PS2_ALPS=y +CONFIG_MOUSE_PS2_BYD=y +CONFIG_MOUSE_PS2_LOGIPS2PP=y +CONFIG_MOUSE_PS2_SYNAPTICS=y +CONFIG_MOUSE_PS2_SYNAPTICS_SMBUS=y +CONFIG_MOUSE_PS2_CYPRESS=y +CONFIG_MOUSE_PS2_LIFEBOOK=y +CONFIG_MOUSE_PS2_TRACKPOINT=y +CONFIG_MOUSE_PS2_ELANTECH=y +CONFIG_MOUSE_PS2_SENTELIC=y +CONFIG_MOUSE_PS2_TOUCHKIT=y +CONFIG_MOUSE_PS2_FOCALTECH=y +# CONFIG_MOUSE_PS2_VMMOUSE is not set +CONFIG_MOUSE_PS2_SMBUS=y 
+CONFIG_MOUSE_SERIAL=m +CONFIG_MOUSE_APPLETOUCH=m +CONFIG_MOUSE_BCM5974=m +CONFIG_MOUSE_CYAPA=m +CONFIG_MOUSE_ELAN_I2C=m +CONFIG_MOUSE_ELAN_I2C_I2C=y +CONFIG_MOUSE_ELAN_I2C_SMBUS=y +CONFIG_MOUSE_VSXXXAA=m +CONFIG_MOUSE_GPIO=m +CONFIG_MOUSE_SYNAPTICS_I2C=m +CONFIG_MOUSE_SYNAPTICS_USB=m +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_ANALOG=m +CONFIG_JOYSTICK_A3D=m +CONFIG_JOYSTICK_ADI=m +CONFIG_JOYSTICK_COBRA=m +CONFIG_JOYSTICK_GF2K=m +CONFIG_JOYSTICK_GRIP=m +CONFIG_JOYSTICK_GRIP_MP=m +CONFIG_JOYSTICK_GUILLEMOT=m +CONFIG_JOYSTICK_INTERACT=m +CONFIG_JOYSTICK_SIDEWINDER=m +CONFIG_JOYSTICK_TMDC=m +CONFIG_JOYSTICK_IFORCE=m +CONFIG_JOYSTICK_IFORCE_USB=y +CONFIG_JOYSTICK_IFORCE_232=y +CONFIG_JOYSTICK_WARRIOR=m +CONFIG_JOYSTICK_MAGELLAN=m +CONFIG_JOYSTICK_SPACEORB=m +CONFIG_JOYSTICK_SPACEBALL=m +CONFIG_JOYSTICK_STINGER=m +CONFIG_JOYSTICK_TWIDJOY=m +CONFIG_JOYSTICK_ZHENHUA=m +CONFIG_JOYSTICK_DB9=m +CONFIG_JOYSTICK_GAMECON=m +CONFIG_JOYSTICK_TURBOGRAFX=m +CONFIG_JOYSTICK_AS5011=m +# CONFIG_JOYSTICK_JOYDUMP is not set +CONFIG_JOYSTICK_XPAD=m +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_JOYSTICK_WALKERA0701=m +CONFIG_JOYSTICK_PSXPAD_SPI=m +CONFIG_JOYSTICK_PSXPAD_SPI_FF=y +CONFIG_INPUT_TABLET=y +CONFIG_TABLET_USB_ACECAD=m +CONFIG_TABLET_USB_AIPTEK=m +CONFIG_TABLET_USB_GTCO=m +CONFIG_TABLET_USB_HANWANG=m +CONFIG_TABLET_USB_KBTAB=m +CONFIG_TABLET_USB_PEGASUS=m +CONFIG_TABLET_SERIAL_WACOM4=m +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_TOUCHSCREEN_PROPERTIES=y +CONFIG_TOUCHSCREEN_ADS7846=m +CONFIG_TOUCHSCREEN_AD7877=m +CONFIG_TOUCHSCREEN_AD7879=m +CONFIG_TOUCHSCREEN_AD7879_I2C=m +CONFIG_TOUCHSCREEN_AD7879_SPI=m +CONFIG_TOUCHSCREEN_ATMEL_MXT=m +# CONFIG_TOUCHSCREEN_ATMEL_MXT_T37 is not set +CONFIG_TOUCHSCREEN_AUO_PIXCIR=m +CONFIG_TOUCHSCREEN_BU21013=m +CONFIG_TOUCHSCREEN_CY8CTMG110=m +CONFIG_TOUCHSCREEN_CYTTSP_CORE=m +CONFIG_TOUCHSCREEN_CYTTSP_I2C=m +CONFIG_TOUCHSCREEN_CYTTSP_SPI=m +CONFIG_TOUCHSCREEN_CYTTSP4_CORE=m +CONFIG_TOUCHSCREEN_CYTTSP4_I2C=m +CONFIG_TOUCHSCREEN_CYTTSP4_SPI=m +CONFIG_TOUCHSCREEN_DA9052=m +CONFIG_TOUCHSCREEN_DYNAPRO=m +CONFIG_TOUCHSCREEN_HAMPSHIRE=m +CONFIG_TOUCHSCREEN_EETI=m +CONFIG_TOUCHSCREEN_EGALAX_SERIAL=m +CONFIG_TOUCHSCREEN_FUJITSU=m +CONFIG_TOUCHSCREEN_GOODIX=m +CONFIG_TOUCHSCREEN_ILI210X=m +CONFIG_TOUCHSCREEN_GUNZE=m +CONFIG_TOUCHSCREEN_EKTF2127=m +CONFIG_TOUCHSCREEN_ELAN=m +CONFIG_TOUCHSCREEN_ELO=m +CONFIG_TOUCHSCREEN_WACOM_W8001=m +CONFIG_TOUCHSCREEN_WACOM_I2C=m +CONFIG_TOUCHSCREEN_MAX11801=m +CONFIG_TOUCHSCREEN_MCS5000=m +CONFIG_TOUCHSCREEN_MMS114=m +CONFIG_TOUCHSCREEN_MELFAS_MIP4=m +CONFIG_TOUCHSCREEN_MTOUCH=m +CONFIG_TOUCHSCREEN_INEXIO=m +CONFIG_TOUCHSCREEN_MK712=m +CONFIG_TOUCHSCREEN_PENMOUNT=m +CONFIG_TOUCHSCREEN_EDT_FT5X06=m +CONFIG_TOUCHSCREEN_TOUCHRIGHT=m +CONFIG_TOUCHSCREEN_TOUCHWIN=m +CONFIG_TOUCHSCREEN_TI_AM335X_TSC=m +CONFIG_TOUCHSCREEN_UCB1400=m +CONFIG_TOUCHSCREEN_PIXCIR=m +CONFIG_TOUCHSCREEN_WDT87XX_I2C=m +CONFIG_TOUCHSCREEN_WM831X=m +CONFIG_TOUCHSCREEN_WM97XX=m +CONFIG_TOUCHSCREEN_WM9705=y +CONFIG_TOUCHSCREEN_WM9712=y +CONFIG_TOUCHSCREEN_WM9713=y +CONFIG_TOUCHSCREEN_USB_COMPOSITE=m +CONFIG_TOUCHSCREEN_MC13783=m +CONFIG_TOUCHSCREEN_USB_EGALAX=y +CONFIG_TOUCHSCREEN_USB_PANJIT=y +CONFIG_TOUCHSCREEN_USB_3M=y +CONFIG_TOUCHSCREEN_USB_ITM=y +CONFIG_TOUCHSCREEN_USB_ETURBO=y +CONFIG_TOUCHSCREEN_USB_GUNZE=y +CONFIG_TOUCHSCREEN_USB_DMC_TSC10=y +CONFIG_TOUCHSCREEN_USB_IRTOUCH=y +CONFIG_TOUCHSCREEN_USB_IDEALTEK=y +CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH=y +CONFIG_TOUCHSCREEN_USB_GOTOP=y +CONFIG_TOUCHSCREEN_USB_JASTEC=y +CONFIG_TOUCHSCREEN_USB_ELO=y 
+CONFIG_TOUCHSCREEN_USB_E2I=y +CONFIG_TOUCHSCREEN_USB_ZYTRONIC=y +CONFIG_TOUCHSCREEN_USB_ETT_TC45USB=y +CONFIG_TOUCHSCREEN_USB_NEXIO=y +CONFIG_TOUCHSCREEN_USB_EASYTOUCH=y +CONFIG_TOUCHSCREEN_TOUCHIT213=m +CONFIG_TOUCHSCREEN_TSC_SERIO=m +CONFIG_TOUCHSCREEN_TSC200X_CORE=m +CONFIG_TOUCHSCREEN_TSC2004=m +CONFIG_TOUCHSCREEN_TSC2005=m +CONFIG_TOUCHSCREEN_TSC2007=m +CONFIG_TOUCHSCREEN_TSC2007_IIO=y +CONFIG_TOUCHSCREEN_PCAP=m +CONFIG_TOUCHSCREEN_RM_TS=m +CONFIG_TOUCHSCREEN_SILEAD=m +CONFIG_TOUCHSCREEN_SIS_I2C=m +CONFIG_TOUCHSCREEN_ST1232=m +CONFIG_TOUCHSCREEN_STMFTS=m +CONFIG_TOUCHSCREEN_SUR40=m +CONFIG_TOUCHSCREEN_SURFACE3_SPI=m +CONFIG_TOUCHSCREEN_SX8654=m +CONFIG_TOUCHSCREEN_TPS6507X=m +CONFIG_TOUCHSCREEN_ZET6223=m +CONFIG_TOUCHSCREEN_ZFORCE=m +CONFIG_TOUCHSCREEN_ROHM_BU21023=m +CONFIG_INPUT_MISC=y +CONFIG_INPUT_88PM80X_ONKEY=m +CONFIG_INPUT_AD714X=m +CONFIG_INPUT_AD714X_I2C=m +CONFIG_INPUT_AD714X_SPI=m +CONFIG_INPUT_ARIZONA_HAPTICS=m +CONFIG_INPUT_BMA150=m +CONFIG_INPUT_E3X0_BUTTON=m +CONFIG_INPUT_PCSPKR=m +CONFIG_INPUT_MAX77693_HAPTIC=m +CONFIG_INPUT_MC13783_PWRBUTTON=m +CONFIG_INPUT_MMA8450=m +CONFIG_INPUT_APANEL=m +CONFIG_INPUT_GP2A=m +CONFIG_INPUT_GPIO_BEEPER=m +CONFIG_INPUT_GPIO_TILT_POLLED=m +CONFIG_INPUT_GPIO_DECODER=m +CONFIG_INPUT_ATLAS_BTNS=m +CONFIG_INPUT_ATI_REMOTE2=m +CONFIG_INPUT_KEYSPAN_REMOTE=m +CONFIG_INPUT_KXTJ9=m +CONFIG_INPUT_KXTJ9_POLLED_MODE=y +CONFIG_INPUT_POWERMATE=m +CONFIG_INPUT_YEALINK=m +CONFIG_INPUT_CM109=m +CONFIG_INPUT_REGULATOR_HAPTIC=m +CONFIG_INPUT_RETU_PWRBUTTON=m +CONFIG_INPUT_TPS65218_PWRBUTTON=m +CONFIG_INPUT_AXP20X_PEK=m +CONFIG_INPUT_UINPUT=m +CONFIG_INPUT_PCF50633_PMU=m +CONFIG_INPUT_PCF8574=m +CONFIG_INPUT_PWM_BEEPER=m +CONFIG_INPUT_PWM_VIBRA=m +CONFIG_INPUT_GPIO_ROTARY_ENCODER=m +CONFIG_INPUT_DA9052_ONKEY=m +CONFIG_INPUT_DA9063_ONKEY=m +CONFIG_INPUT_WM831X_ON=m +CONFIG_INPUT_PCAP=m +CONFIG_INPUT_ADXL34X=m +CONFIG_INPUT_ADXL34X_I2C=m +CONFIG_INPUT_ADXL34X_SPI=m +CONFIG_INPUT_IMS_PCU=m +CONFIG_INPUT_CMA3000=m +CONFIG_INPUT_CMA3000_I2C=m +CONFIG_INPUT_IDEAPAD_SLIDEBAR=m +CONFIG_INPUT_SOC_BUTTON_ARRAY=m +CONFIG_INPUT_DRV260X_HAPTICS=m +CONFIG_INPUT_DRV2665_HAPTICS=m +CONFIG_INPUT_DRV2667_HAPTICS=m +CONFIG_RMI4_CORE=m +CONFIG_RMI4_I2C=m +CONFIG_RMI4_SPI=m +CONFIG_RMI4_SMB=m +CONFIG_RMI4_F03=y +CONFIG_RMI4_F03_SERIO=m +CONFIG_RMI4_2D_SENSOR=y +CONFIG_RMI4_F11=y +CONFIG_RMI4_F12=y +CONFIG_RMI4_F30=y +CONFIG_RMI4_F34=y +CONFIG_RMI4_F54=y +CONFIG_RMI4_F55=y + +# +# Hardware I/O ports +# +CONFIG_SERIO=y +CONFIG_ARCH_MIGHT_HAVE_PC_SERIO=y +CONFIG_SERIO_I8042=y +CONFIG_SERIO_SERPORT=m +CONFIG_SERIO_CT82C710=m +CONFIG_SERIO_PARKBD=m +CONFIG_SERIO_PCIPS2=m +CONFIG_SERIO_LIBPS2=y +CONFIG_SERIO_RAW=m +CONFIG_SERIO_ALTERA_PS2=m +CONFIG_SERIO_PS2MULT=m +CONFIG_SERIO_ARC_PS2=m +CONFIG_HYPERV_KEYBOARD=m +CONFIG_SERIO_GPIO_PS2=m +CONFIG_USERIO=m +CONFIG_GAMEPORT=m +CONFIG_GAMEPORT_NS558=m +CONFIG_GAMEPORT_L4=m +CONFIG_GAMEPORT_EMU10K1=m +CONFIG_GAMEPORT_FM801=m + +# +# Character devices +# +CONFIG_TTY=y +CONFIG_VT=y +CONFIG_CONSOLE_TRANSLATIONS=y +CONFIG_VT_CONSOLE=y +CONFIG_VT_CONSOLE_SLEEP=y +CONFIG_HW_CONSOLE=y +CONFIG_VT_HW_CONSOLE_BINDING=y +CONFIG_UNIX98_PTYS=y +CONFIG_LEGACY_PTYS=y +CONFIG_LEGACY_PTY_COUNT=256 +CONFIG_SERIAL_NONSTANDARD=y +CONFIG_ROCKETPORT=m +CONFIG_CYCLADES=m +CONFIG_CYZ_INTR=y +CONFIG_MOXA_INTELLIO=m +CONFIG_MOXA_SMARTIO=m +CONFIG_SYNCLINK=m +CONFIG_SYNCLINKMP=m +CONFIG_SYNCLINK_GT=m +CONFIG_NOZOMI=m +CONFIG_ISI=m +CONFIG_N_HDLC=m +CONFIG_N_GSM=m +CONFIG_TRACE_ROUTER=m +CONFIG_TRACE_SINK=m +CONFIG_DEVMEM=y +# CONFIG_DEVKMEM is not set + +# +# 
Serial drivers +# +CONFIG_SERIAL_EARLYCON=y +CONFIG_SERIAL_8250=y +# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set +CONFIG_SERIAL_8250_PNP=y +CONFIG_SERIAL_8250_FINTEK=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_DMA=y +CONFIG_SERIAL_8250_PCI=y +CONFIG_SERIAL_8250_EXAR=y +CONFIG_SERIAL_8250_CS=m +CONFIG_SERIAL_8250_MEN_MCB=m +CONFIG_SERIAL_8250_NR_UARTS=4 +CONFIG_SERIAL_8250_RUNTIME_UARTS=4 +CONFIG_SERIAL_8250_EXTENDED=y +CONFIG_SERIAL_8250_MANY_PORTS=y +CONFIG_SERIAL_8250_SHARE_IRQ=y +CONFIG_SERIAL_8250_DETECT_IRQ=y +CONFIG_SERIAL_8250_RSA=y +# CONFIG_SERIAL_8250_FSL is not set +CONFIG_SERIAL_8250_DW=m +CONFIG_SERIAL_8250_RT288X=y +CONFIG_SERIAL_8250_LPSS=y +CONFIG_SERIAL_8250_MID=y +CONFIG_SERIAL_8250_MOXA=m + +# +# Non-8250 serial port support +# +CONFIG_SERIAL_MAX3100=m +CONFIG_SERIAL_MAX310X=y +CONFIG_SERIAL_UARTLITE=m +CONFIG_SERIAL_UARTLITE_NR_UARTS=1 +CONFIG_SERIAL_CORE=y +CONFIG_SERIAL_CORE_CONSOLE=y +CONFIG_SERIAL_JSM=m +CONFIG_SERIAL_SCCNXP=m +CONFIG_SERIAL_SC16IS7XX_CORE=m +CONFIG_SERIAL_SC16IS7XX=m +CONFIG_SERIAL_SC16IS7XX_I2C=y +CONFIG_SERIAL_SC16IS7XX_SPI=y +CONFIG_SERIAL_ALTERA_JTAGUART=m +CONFIG_SERIAL_ALTERA_UART=m +CONFIG_SERIAL_ALTERA_UART_MAXPORTS=4 +CONFIG_SERIAL_ALTERA_UART_BAUDRATE=115200 +CONFIG_SERIAL_IFX6X60=m +CONFIG_SERIAL_ARC=m +CONFIG_SERIAL_ARC_NR_PORTS=1 +CONFIG_SERIAL_RP2=m +CONFIG_SERIAL_RP2_NR_UARTS=32 +CONFIG_SERIAL_FSL_LPUART=m +CONFIG_SERIAL_MEN_Z135=m +CONFIG_SERIAL_DEV_BUS=m +CONFIG_PRINTER=m +CONFIG_LP_CONSOLE=y +CONFIG_PPDEV=m +CONFIG_HVC_DRIVER=y +CONFIG_VIRTIO_CONSOLE=m +CONFIG_IPMI_HANDLER=m +CONFIG_IPMI_DMI_DECODE=y +CONFIG_IPMI_PANIC_EVENT=y +CONFIG_IPMI_PANIC_STRING=y +CONFIG_IPMI_DEVICE_INTERFACE=m +CONFIG_IPMI_SI=m +CONFIG_IPMI_SSIF=m +CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m +CONFIG_HW_RANDOM=m +CONFIG_HW_RANDOM_TIMERIOMEM=m +CONFIG_HW_RANDOM_INTEL=m +CONFIG_HW_RANDOM_AMD=m +CONFIG_HW_RANDOM_VIA=m +CONFIG_HW_RANDOM_VIRTIO=m +CONFIG_HW_RANDOM_TPM=m +CONFIG_NVRAM=m +CONFIG_R3964=m +CONFIG_APPLICOM=m + +# +# PCMCIA character devices +# +CONFIG_SYNCLINK_CS=m +CONFIG_CARDMAN_4000=m +CONFIG_CARDMAN_4040=m +CONFIG_SCR24X=m +CONFIG_IPWIRELESS=m +CONFIG_MWAVE=m +CONFIG_RAW_DRIVER=m +CONFIG_MAX_RAW_DEVS=256 +CONFIG_HPET=y +CONFIG_HPET_MMAP=y +CONFIG_HPET_MMAP_DEFAULT=y +CONFIG_HANGCHECK_TIMER=m +CONFIG_TCG_TPM=m +CONFIG_TCG_TIS_CORE=m +CONFIG_TCG_TIS=m +CONFIG_TCG_TIS_SPI=m +CONFIG_TCG_TIS_I2C_ATMEL=m +CONFIG_TCG_TIS_I2C_INFINEON=m +CONFIG_TCG_TIS_I2C_NUVOTON=m +CONFIG_TCG_NSC=m +CONFIG_TCG_ATMEL=m +CONFIG_TCG_INFINEON=m +CONFIG_TCG_CRB=m +CONFIG_TCG_VTPM_PROXY=m +CONFIG_TCG_TIS_ST33ZP24=m +CONFIG_TCG_TIS_ST33ZP24_I2C=m +CONFIG_TCG_TIS_ST33ZP24_SPI=m +CONFIG_TELCLOCK=m +CONFIG_DEVPORT=y +CONFIG_XILLYBUS=m +CONFIG_XILLYBUS_PCIE=m + +# +# I2C support +# +CONFIG_I2C=m +CONFIG_I2C_BOARDINFO=y +CONFIG_I2C_COMPAT=y +CONFIG_I2C_CHARDEV=m +CONFIG_I2C_MUX=m + +# +# Multiplexer I2C Chip support +# +CONFIG_I2C_MUX_GPIO=m +CONFIG_I2C_MUX_LTC4306=m +CONFIG_I2C_MUX_PCA9541=m +CONFIG_I2C_MUX_PCA954x=m +CONFIG_I2C_MUX_REG=m +CONFIG_I2C_MUX_MLXCPLD=m +CONFIG_I2C_HELPER_AUTO=y +CONFIG_I2C_SMBUS=m +CONFIG_I2C_ALGOBIT=m +CONFIG_I2C_ALGOPCA=m + +# +# I2C Hardware Bus support +# + +# +# PC SMBus host controller drivers +# +CONFIG_I2C_ALI1535=m +CONFIG_I2C_ALI1563=m +CONFIG_I2C_ALI15X3=m +CONFIG_I2C_AMD756=m +CONFIG_I2C_AMD756_S4882=m +CONFIG_I2C_AMD8111=m +CONFIG_I2C_I801=m +CONFIG_I2C_ISCH=m +CONFIG_I2C_ISMT=m +CONFIG_I2C_PIIX4=m +CONFIG_I2C_NFORCE2=m +CONFIG_I2C_NFORCE2_S4985=m +CONFIG_I2C_SIS5595=m +CONFIG_I2C_SIS630=m +CONFIG_I2C_SIS96X=m 
+CONFIG_I2C_VIA=m +CONFIG_I2C_VIAPRO=m + +# +# ACPI drivers +# +CONFIG_I2C_SCMI=m + +# +# I2C system bus drivers (mostly embedded / system-on-chip) +# +CONFIG_I2C_CBUS_GPIO=m +CONFIG_I2C_DESIGNWARE_CORE=m +CONFIG_I2C_DESIGNWARE_PLATFORM=m +# CONFIG_I2C_DESIGNWARE_SLAVE is not set +CONFIG_I2C_DESIGNWARE_PCI=m +# CONFIG_I2C_DESIGNWARE_BAYTRAIL is not set +CONFIG_I2C_EMEV2=m +CONFIG_I2C_GPIO=m +CONFIG_I2C_KEMPLD=m +CONFIG_I2C_OCORES=m +CONFIG_I2C_PCA_PLATFORM=m +# CONFIG_I2C_PXA_PCI is not set +CONFIG_I2C_SIMTEC=m +CONFIG_I2C_XILINX=m + +# +# External I2C/SMBus adapter drivers +# +CONFIG_I2C_DIOLAN_U2C=m +CONFIG_I2C_DLN2=m +CONFIG_I2C_PARPORT=m +CONFIG_I2C_PARPORT_LIGHT=m +CONFIG_I2C_ROBOTFUZZ_OSIF=m +CONFIG_I2C_TAOS_EVM=m +CONFIG_I2C_TINY_USB=m +CONFIG_I2C_VIPERBOARD=m + +# +# Other I2C/SMBus bus drivers +# +CONFIG_I2C_MLXCPLD=m +CONFIG_I2C_CROS_EC_TUNNEL=m +# CONFIG_I2C_STUB is not set +CONFIG_I2C_SLAVE=y +CONFIG_I2C_SLAVE_EEPROM=m +# CONFIG_I2C_DEBUG_CORE is not set +# CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +CONFIG_SPI=y +# CONFIG_SPI_DEBUG is not set +CONFIG_SPI_MASTER=y + +# +# SPI Master Controller Drivers +# +CONFIG_SPI_ALTERA=m +CONFIG_SPI_AXI_SPI_ENGINE=m +CONFIG_SPI_BITBANG=m +CONFIG_SPI_BUTTERFLY=m +CONFIG_SPI_CADENCE=m +CONFIG_SPI_DESIGNWARE=m +CONFIG_SPI_DW_PCI=m +CONFIG_SPI_DW_MID_DMA=y +CONFIG_SPI_DW_MMIO=m +CONFIG_SPI_DLN2=m +CONFIG_SPI_GPIO=m +CONFIG_SPI_LM70_LLP=m +CONFIG_SPI_OC_TINY=m +CONFIG_SPI_PXA2XX=m +CONFIG_SPI_PXA2XX_PCI=m +CONFIG_SPI_ROCKCHIP=m +CONFIG_SPI_SC18IS602=m +CONFIG_SPI_XCOMM=m +CONFIG_SPI_XILINX=m +CONFIG_SPI_ZYNQMP_GQSPI=m + +# +# SPI Protocol Masters +# +CONFIG_SPI_SPIDEV=m +CONFIG_SPI_LOOPBACK_TEST=m +CONFIG_SPI_TLE62X0=m +CONFIG_SPI_SLAVE=y +CONFIG_SPI_SLAVE_TIME=m +CONFIG_SPI_SLAVE_SYSTEM_CONTROL=m +CONFIG_SPMI=m +CONFIG_HSI=m +CONFIG_HSI_BOARDINFO=y + +# +# HSI controllers +# + +# +# HSI clients +# +CONFIG_HSI_CHAR=m +CONFIG_PPS=m +# CONFIG_PPS_DEBUG is not set +# CONFIG_NTP_PPS is not set + +# +# PPS clients support +# +# CONFIG_PPS_CLIENT_KTIMER is not set +CONFIG_PPS_CLIENT_LDISC=m +CONFIG_PPS_CLIENT_PARPORT=m +CONFIG_PPS_CLIENT_GPIO=m + +# +# PPS generators support +# + +# +# PTP clock support +# +CONFIG_PTP_1588_CLOCK=m +CONFIG_DP83640_PHY=m +CONFIG_PTP_1588_CLOCK_KVM=m +CONFIG_PINCTRL=y + +# +# Pin controllers +# +CONFIG_PINMUX=y +CONFIG_PINCONF=y +CONFIG_GENERIC_PINCONF=y +# CONFIG_DEBUG_PINCTRL is not set +CONFIG_PINCTRL_AMD=m +CONFIG_PINCTRL_MCP23S08=m +CONFIG_PINCTRL_BAYTRAIL=y +CONFIG_PINCTRL_CHERRYVIEW=m +CONFIG_PINCTRL_INTEL=m +CONFIG_PINCTRL_BROXTON=m +CONFIG_PINCTRL_CANNONLAKE=m +CONFIG_PINCTRL_DENVERTON=m +CONFIG_PINCTRL_GEMINILAKE=m +CONFIG_PINCTRL_LEWISBURG=m +CONFIG_PINCTRL_SUNRISEPOINT=m +CONFIG_GPIOLIB=y +CONFIG_GPIO_ACPI=y +CONFIG_GPIOLIB_IRQCHIP=y +# CONFIG_DEBUG_GPIO is not set +CONFIG_GPIO_SYSFS=y +CONFIG_GPIO_GENERIC=m +CONFIG_GPIO_MAX730X=m + +# +# Memory mapped GPIO drivers +# +CONFIG_GPIO_AMDPT=m +CONFIG_GPIO_AXP209=m +CONFIG_GPIO_DWAPB=m +CONFIG_GPIO_EXAR=m +CONFIG_GPIO_GENERIC_PLATFORM=m +CONFIG_GPIO_ICH=m +CONFIG_GPIO_LYNXPOINT=y +CONFIG_GPIO_MENZ127=m +CONFIG_GPIO_MOCKUP=m +CONFIG_GPIO_VX855=m + +# +# Port-mapped I/O GPIO drivers +# +CONFIG_GPIO_F7188X=m +CONFIG_GPIO_IT87=m +CONFIG_GPIO_SCH=m +CONFIG_GPIO_SCH311X=m + +# +# I2C GPIO expanders +# +CONFIG_GPIO_ADP5588=m +CONFIG_GPIO_MAX7300=m +CONFIG_GPIO_MAX732X=m +CONFIG_GPIO_PCA953X=m +CONFIG_GPIO_PCF857X=m +CONFIG_GPIO_TPIC2810=m + +# +# MFD GPIO expanders +# +CONFIG_GPIO_ARIZONA=m +CONFIG_GPIO_BD9571MWV=m +CONFIG_GPIO_DA9052=m 
+CONFIG_GPIO_DLN2=m +CONFIG_GPIO_JANZ_TTL=m +CONFIG_GPIO_KEMPLD=m +CONFIG_GPIO_LP3943=m +CONFIG_GPIO_LP873X=m +CONFIG_GPIO_TPS65086=m +CONFIG_GPIO_TPS65218=m +CONFIG_GPIO_TPS65912=m +CONFIG_GPIO_UCB1400=m +CONFIG_GPIO_WHISKEY_COVE=m +CONFIG_GPIO_WM831X=m +CONFIG_GPIO_WM8994=m + +# +# PCI GPIO expanders +# +CONFIG_GPIO_AMD8111=m +CONFIG_GPIO_ML_IOH=m +CONFIG_GPIO_PCI_IDIO_16=m +CONFIG_GPIO_RDC321X=m + +# +# SPI GPIO expanders +# +CONFIG_GPIO_MAX7301=m +CONFIG_GPIO_MC33880=m +CONFIG_GPIO_PISOSR=m +CONFIG_GPIO_XRA1403=m + +# +# USB GPIO expanders +# +CONFIG_GPIO_VIPERBOARD=m +CONFIG_W1=m +CONFIG_W1_CON=y + +# +# 1-wire Bus Masters +# +CONFIG_W1_MASTER_MATROX=m +CONFIG_W1_MASTER_DS2490=m +CONFIG_W1_MASTER_DS2482=m +CONFIG_W1_MASTER_DS1WM=m +CONFIG_W1_MASTER_GPIO=m + +# +# 1-wire Slaves +# +CONFIG_W1_SLAVE_THERM=m +CONFIG_W1_SLAVE_SMEM=m +# CONFIG_W1_SLAVE_DS2405 is not set +CONFIG_W1_SLAVE_DS2408=m +# CONFIG_W1_SLAVE_DS2408_READBACK is not set +CONFIG_W1_SLAVE_DS2413=m +CONFIG_W1_SLAVE_DS2406=m +CONFIG_W1_SLAVE_DS2423=m +CONFIG_W1_SLAVE_DS2805=m +CONFIG_W1_SLAVE_DS2431=m +CONFIG_W1_SLAVE_DS2433=m +CONFIG_W1_SLAVE_DS2433_CRC=y +CONFIG_W1_SLAVE_DS2438=m +CONFIG_W1_SLAVE_DS2760=m +CONFIG_W1_SLAVE_DS2780=m +CONFIG_W1_SLAVE_DS2781=m +CONFIG_W1_SLAVE_DS28E04=m +CONFIG_POWER_AVS=y +CONFIG_POWER_RESET=y +# CONFIG_POWER_RESET_RESTART is not set +CONFIG_POWER_SUPPLY=y +# CONFIG_POWER_SUPPLY_DEBUG is not set +CONFIG_PDA_POWER=m +CONFIG_GENERIC_ADC_BATTERY=m +CONFIG_WM831X_BACKUP=m +CONFIG_WM831X_POWER=m +# CONFIG_TEST_POWER is not set +CONFIG_BATTERY_DS2760=m +CONFIG_BATTERY_DS2780=m +CONFIG_BATTERY_DS2781=m +CONFIG_BATTERY_DS2782=m +CONFIG_BATTERY_SBS=m +CONFIG_CHARGER_SBS=m +CONFIG_BATTERY_BQ27XXX=m +CONFIG_BATTERY_BQ27XXX_I2C=m +CONFIG_BATTERY_BQ27XXX_HDQ=m +# CONFIG_BATTERY_BQ27XXX_DT_UPDATES_NVM is not set +CONFIG_BATTERY_DA9052=m +CONFIG_CHARGER_DA9150=m +CONFIG_BATTERY_DA9150=m +CONFIG_CHARGER_AXP20X=m +CONFIG_BATTERY_AXP20X=m +CONFIG_AXP20X_POWER=m +CONFIG_AXP288_CHARGER=m +CONFIG_AXP288_FUEL_GAUGE=m +CONFIG_BATTERY_MAX17040=m +CONFIG_BATTERY_MAX17042=m +CONFIG_BATTERY_MAX1721X=m +CONFIG_CHARGER_PCF50633=m +CONFIG_CHARGER_ISP1704=m +CONFIG_CHARGER_MAX8903=m +CONFIG_CHARGER_LP8727=m +CONFIG_CHARGER_GPIO=m +CONFIG_CHARGER_MANAGER=y +CONFIG_CHARGER_LTC3651=m +CONFIG_CHARGER_MAX14577=m +CONFIG_CHARGER_MAX77693=m +CONFIG_CHARGER_BQ2415X=m +CONFIG_CHARGER_BQ24190=m +CONFIG_CHARGER_BQ24257=m +CONFIG_CHARGER_BQ24735=m +CONFIG_CHARGER_BQ25890=m +CONFIG_CHARGER_SMB347=m +CONFIG_CHARGER_TPS65217=m +CONFIG_BATTERY_GAUGE_LTC2941=m +CONFIG_BATTERY_RT5033=m +CONFIG_CHARGER_RT9455=m +CONFIG_HWMON=m +CONFIG_HWMON_VID=m +# CONFIG_HWMON_DEBUG_CHIP is not set + +# +# Native drivers +# +CONFIG_SENSORS_ABITUGURU=m +CONFIG_SENSORS_ABITUGURU3=m +CONFIG_SENSORS_AD7314=m +CONFIG_SENSORS_AD7414=m +CONFIG_SENSORS_AD7418=m +CONFIG_SENSORS_ADM1021=m +CONFIG_SENSORS_ADM1025=m +CONFIG_SENSORS_ADM1026=m +CONFIG_SENSORS_ADM1029=m +CONFIG_SENSORS_ADM1031=m +CONFIG_SENSORS_ADM9240=m +CONFIG_SENSORS_ADT7X10=m +CONFIG_SENSORS_ADT7310=m +CONFIG_SENSORS_ADT7410=m +CONFIG_SENSORS_ADT7411=m +CONFIG_SENSORS_ADT7462=m +CONFIG_SENSORS_ADT7470=m +CONFIG_SENSORS_ADT7475=m +CONFIG_SENSORS_ASC7621=m +CONFIG_SENSORS_K8TEMP=m +CONFIG_SENSORS_K10TEMP=m +CONFIG_SENSORS_FAM15H_POWER=m +CONFIG_SENSORS_APPLESMC=m +CONFIG_SENSORS_ASB100=m +CONFIG_SENSORS_ASPEED=m +CONFIG_SENSORS_ATXP1=m +CONFIG_SENSORS_DS620=m +CONFIG_SENSORS_DS1621=m +CONFIG_SENSORS_DELL_SMM=m +CONFIG_SENSORS_DA9052_ADC=m +CONFIG_SENSORS_I5K_AMB=m +CONFIG_SENSORS_F71805F=m 
+CONFIG_SENSORS_F71882FG=m +CONFIG_SENSORS_F75375S=m +CONFIG_SENSORS_MC13783_ADC=m +CONFIG_SENSORS_FSCHMD=m +CONFIG_SENSORS_FTSTEUTATES=m +CONFIG_SENSORS_GL518SM=m +CONFIG_SENSORS_GL520SM=m +CONFIG_SENSORS_G760A=m +CONFIG_SENSORS_G762=m +CONFIG_SENSORS_GPIO_FAN=m +CONFIG_SENSORS_HIH6130=m +CONFIG_SENSORS_IBMAEM=m +CONFIG_SENSORS_IBMPEX=m +CONFIG_SENSORS_IIO_HWMON=m +CONFIG_SENSORS_I5500=m +CONFIG_SENSORS_CORETEMP=m +CONFIG_SENSORS_IT87=m +CONFIG_SENSORS_JC42=m +CONFIG_SENSORS_POWR1220=m +CONFIG_SENSORS_LINEAGE=m +CONFIG_SENSORS_LTC2945=m +CONFIG_SENSORS_LTC2990=m +CONFIG_SENSORS_LTC4151=m +CONFIG_SENSORS_LTC4215=m +CONFIG_SENSORS_LTC4222=m +CONFIG_SENSORS_LTC4245=m +CONFIG_SENSORS_LTC4260=m +CONFIG_SENSORS_LTC4261=m +CONFIG_SENSORS_MAX1111=m +CONFIG_SENSORS_MAX16065=m +CONFIG_SENSORS_MAX1619=m +CONFIG_SENSORS_MAX1668=m +CONFIG_SENSORS_MAX197=m +CONFIG_SENSORS_MAX31722=m +CONFIG_SENSORS_MAX6639=m +CONFIG_SENSORS_MAX6642=m +CONFIG_SENSORS_MAX6650=m +CONFIG_SENSORS_MAX6697=m +CONFIG_SENSORS_MAX31790=m +CONFIG_SENSORS_MCP3021=m +CONFIG_SENSORS_TC654=m +CONFIG_SENSORS_MENF21BMC_HWMON=m +CONFIG_SENSORS_ADCXX=m +CONFIG_SENSORS_LM63=m +CONFIG_SENSORS_LM70=m +CONFIG_SENSORS_LM73=m +CONFIG_SENSORS_LM75=m +CONFIG_SENSORS_LM77=m +CONFIG_SENSORS_LM78=m +CONFIG_SENSORS_LM80=m +CONFIG_SENSORS_LM83=m +CONFIG_SENSORS_LM85=m +CONFIG_SENSORS_LM87=m +CONFIG_SENSORS_LM90=m +CONFIG_SENSORS_LM92=m +CONFIG_SENSORS_LM93=m +CONFIG_SENSORS_LM95234=m +CONFIG_SENSORS_LM95241=m +CONFIG_SENSORS_LM95245=m +CONFIG_SENSORS_PC87360=m +CONFIG_SENSORS_PC87427=m +CONFIG_SENSORS_NTC_THERMISTOR=m +CONFIG_SENSORS_NCT6683=m +CONFIG_SENSORS_NCT6775=m +CONFIG_SENSORS_NCT7802=m +CONFIG_SENSORS_NCT7904=m +CONFIG_SENSORS_PCF8591=m +CONFIG_PMBUS=m +CONFIG_SENSORS_PMBUS=m +CONFIG_SENSORS_ADM1275=m +CONFIG_SENSORS_IBM_CFFPS=m +CONFIG_SENSORS_IR35221=m +CONFIG_SENSORS_LM25066=m +CONFIG_SENSORS_LTC2978=m +CONFIG_SENSORS_LTC2978_REGULATOR=y +CONFIG_SENSORS_LTC3815=m +CONFIG_SENSORS_MAX16064=m +CONFIG_SENSORS_MAX20751=m +CONFIG_SENSORS_MAX34440=m +CONFIG_SENSORS_MAX8688=m +CONFIG_SENSORS_TPS40422=m +CONFIG_SENSORS_TPS53679=m +CONFIG_SENSORS_UCD9000=m +CONFIG_SENSORS_UCD9200=m +CONFIG_SENSORS_ZL6100=m +CONFIG_SENSORS_SHT15=m +CONFIG_SENSORS_SHT21=m +CONFIG_SENSORS_SHT3x=m +CONFIG_SENSORS_SHTC1=m +CONFIG_SENSORS_SIS5595=m +CONFIG_SENSORS_DME1737=m +CONFIG_SENSORS_EMC1403=m +CONFIG_SENSORS_EMC2103=m +CONFIG_SENSORS_EMC6W201=m +CONFIG_SENSORS_SMSC47M1=m +CONFIG_SENSORS_SMSC47M192=m +CONFIG_SENSORS_SMSC47B397=m +CONFIG_SENSORS_SCH56XX_COMMON=m +CONFIG_SENSORS_SCH5627=m +CONFIG_SENSORS_SCH5636=m +CONFIG_SENSORS_STTS751=m +CONFIG_SENSORS_SMM665=m +CONFIG_SENSORS_ADC128D818=m +CONFIG_SENSORS_ADS1015=m +CONFIG_SENSORS_ADS7828=m +CONFIG_SENSORS_ADS7871=m +CONFIG_SENSORS_AMC6821=m +CONFIG_SENSORS_INA209=m +CONFIG_SENSORS_INA2XX=m +CONFIG_SENSORS_INA3221=m +CONFIG_SENSORS_TC74=m +CONFIG_SENSORS_THMC50=m +CONFIG_SENSORS_TMP102=m +CONFIG_SENSORS_TMP103=m +CONFIG_SENSORS_TMP108=m +CONFIG_SENSORS_TMP401=m +CONFIG_SENSORS_TMP421=m +CONFIG_SENSORS_VIA_CPUTEMP=m +CONFIG_SENSORS_VIA686A=m +CONFIG_SENSORS_VT1211=m +CONFIG_SENSORS_VT8231=m +CONFIG_SENSORS_W83781D=m +CONFIG_SENSORS_W83791D=m +CONFIG_SENSORS_W83792D=m +CONFIG_SENSORS_W83793=m +CONFIG_SENSORS_W83795=m +# CONFIG_SENSORS_W83795_FANCTRL is not set +CONFIG_SENSORS_W83L785TS=m +CONFIG_SENSORS_W83L786NG=m +CONFIG_SENSORS_W83627HF=m +CONFIG_SENSORS_W83627EHF=m +CONFIG_SENSORS_WM831X=m +CONFIG_SENSORS_XGENE=m + +# +# ACPI drivers +# +CONFIG_SENSORS_ACPI_POWER=m +CONFIG_SENSORS_ATK0110=m +CONFIG_THERMAL=y 
+CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=0 +CONFIG_THERMAL_WRITABLE_TRIPS=y +CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y +# CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE is not set +# CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE is not set +# CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR is not set +CONFIG_THERMAL_GOV_FAIR_SHARE=y +CONFIG_THERMAL_GOV_STEP_WISE=y +CONFIG_THERMAL_GOV_BANG_BANG=y +CONFIG_THERMAL_GOV_USER_SPACE=y +# CONFIG_THERMAL_GOV_POWER_ALLOCATOR is not set +CONFIG_CLOCK_THERMAL=y +CONFIG_DEVFREQ_THERMAL=y +# CONFIG_THERMAL_EMULATION is not set +CONFIG_INTEL_POWERCLAMP=m +CONFIG_X86_PKG_TEMP_THERMAL=m +CONFIG_INTEL_SOC_DTS_IOSF_CORE=m +CONFIG_INTEL_SOC_DTS_THERMAL=m + +# +# ACPI INT340X thermal drivers +# +CONFIG_INT340X_THERMAL=m +CONFIG_ACPI_THERMAL_REL=m +CONFIG_INT3406_THERMAL=m +CONFIG_INTEL_BXT_PMIC_THERMAL=m +CONFIG_INTEL_PCH_THERMAL=m +CONFIG_GENERIC_ADC_THERMAL=m +CONFIG_WATCHDOG=y +CONFIG_WATCHDOG_CORE=y +# CONFIG_WATCHDOG_NOWAYOUT is not set +CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED=y +# CONFIG_WATCHDOG_SYSFS is not set + +# +# Watchdog Device Drivers +# +CONFIG_SOFT_WATCHDOG=m +# CONFIG_SOFT_WATCHDOG_PRETIMEOUT is not set +CONFIG_DA9052_WATCHDOG=m +CONFIG_DA9063_WATCHDOG=m +CONFIG_DA9062_WATCHDOG=m +CONFIG_MENF21BMC_WATCHDOG=m +CONFIG_WDAT_WDT=m +CONFIG_WM831X_WATCHDOG=m +CONFIG_XILINX_WATCHDOG=m +CONFIG_ZIIRAVE_WATCHDOG=m +CONFIG_CADENCE_WATCHDOG=m +CONFIG_DW_WATCHDOG=m +CONFIG_MAX63XX_WATCHDOG=m +CONFIG_RETU_WATCHDOG=m +CONFIG_ACQUIRE_WDT=m +CONFIG_ADVANTECH_WDT=m +CONFIG_ALIM1535_WDT=m +CONFIG_ALIM7101_WDT=m +CONFIG_F71808E_WDT=m +# CONFIG_SP5100_TCO is not set +CONFIG_SBC_FITPC2_WATCHDOG=m +CONFIG_EUROTECH_WDT=m +CONFIG_IB700_WDT=m +CONFIG_IBMASR=m +CONFIG_WAFER_WDT=m +CONFIG_I6300ESB_WDT=m +CONFIG_IE6XX_WDT=m +CONFIG_ITCO_WDT=m +CONFIG_ITCO_VENDOR_SUPPORT=y +CONFIG_IT8712F_WDT=m +CONFIG_IT87_WDT=m +CONFIG_HP_WATCHDOG=m +CONFIG_KEMPLD_WDT=m +CONFIG_HPWDT_NMI_DECODING=y +CONFIG_SC1200_WDT=m +CONFIG_PC87413_WDT=m +CONFIG_NV_TCO=m +CONFIG_60XX_WDT=m +CONFIG_CPU5_WDT=m +CONFIG_SMSC_SCH311X_WDT=m +CONFIG_SMSC37B787_WDT=m +CONFIG_VIA_WDT=m +CONFIG_W83627HF_WDT=m +CONFIG_W83877F_WDT=m +CONFIG_W83977F_WDT=m +CONFIG_MACHZ_WDT=m +CONFIG_SBC_EPX_C3_WATCHDOG=m +CONFIG_INTEL_MEI_WDT=m +CONFIG_NI903X_WDT=m +CONFIG_NIC7018_WDT=m +CONFIG_MEN_A21_WDT=m + +# +# PCI-based Watchdog Cards +# +CONFIG_PCIPCWATCHDOG=m +CONFIG_WDTPCI=m + +# +# USB-based Watchdog Cards +# +CONFIG_USBPCWATCHDOG=m + +# +# Watchdog Pretimeout Governors +# +CONFIG_WATCHDOG_PRETIMEOUT_GOV=y +# CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_NOOP is not set +CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC=y +CONFIG_WATCHDOG_PRETIMEOUT_GOV_NOOP=m +CONFIG_WATCHDOG_PRETIMEOUT_GOV_PANIC=y +CONFIG_SSB_POSSIBLE=y + +# +# Sonics Silicon Backplane +# +CONFIG_SSB=m +CONFIG_SSB_SPROM=y +CONFIG_SSB_BLOCKIO=y +CONFIG_SSB_PCIHOST_POSSIBLE=y +CONFIG_SSB_PCIHOST=y +CONFIG_SSB_B43_PCI_BRIDGE=y +CONFIG_SSB_PCMCIAHOST_POSSIBLE=y +CONFIG_SSB_PCMCIAHOST=y +CONFIG_SSB_SDIOHOST_POSSIBLE=y +CONFIG_SSB_SDIOHOST=y +# CONFIG_SSB_DEBUG is not set +CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y +CONFIG_SSB_DRIVER_PCICORE=y +CONFIG_SSB_DRIVER_GPIO=y +CONFIG_BCMA_POSSIBLE=y +CONFIG_BCMA=m +CONFIG_BCMA_BLOCKIO=y +CONFIG_BCMA_HOST_PCI_POSSIBLE=y +CONFIG_BCMA_HOST_PCI=y +CONFIG_BCMA_HOST_SOC=y +CONFIG_BCMA_DRIVER_PCI=y +CONFIG_BCMA_SFLASH=y +CONFIG_BCMA_DRIVER_GMAC_CMN=y +CONFIG_BCMA_DRIVER_GPIO=y +# CONFIG_BCMA_DEBUG is not set + +# +# Multifunction device drivers +# +CONFIG_MFD_CORE=y +CONFIG_MFD_BCM590XX=m +CONFIG_MFD_BD9571MWV=m +CONFIG_MFD_AXP20X=m +CONFIG_MFD_AXP20X_I2C=m 
+CONFIG_MFD_CROS_EC=m +CONFIG_MFD_CROS_EC_I2C=m +CONFIG_MFD_CROS_EC_SPI=m +CONFIG_PMIC_DA9052=y +CONFIG_MFD_DA9052_SPI=y +CONFIG_MFD_DA9062=m +CONFIG_MFD_DA9063=m +CONFIG_MFD_DA9150=m +CONFIG_MFD_DLN2=m +CONFIG_MFD_MC13XXX=m +CONFIG_MFD_MC13XXX_SPI=m +CONFIG_MFD_MC13XXX_I2C=m +CONFIG_HTC_PASIC3=m +CONFIG_MFD_INTEL_QUARK_I2C_GPIO=m +CONFIG_LPC_ICH=m +CONFIG_LPC_SCH=m +CONFIG_INTEL_SOC_PMIC_BXTWC=m +CONFIG_MFD_INTEL_LPSS=m +CONFIG_MFD_INTEL_LPSS_ACPI=m +CONFIG_MFD_INTEL_LPSS_PCI=m +CONFIG_MFD_JANZ_CMODIO=m +CONFIG_MFD_KEMPLD=m +CONFIG_MFD_88PM800=m +CONFIG_MFD_88PM805=m +CONFIG_MFD_MAX14577=m +CONFIG_MFD_MAX77693=m +CONFIG_MFD_MAX8907=m +CONFIG_MFD_MT6397=m +CONFIG_MFD_MENF21BMC=m +CONFIG_EZX_PCAP=y +CONFIG_MFD_VIPERBOARD=m +CONFIG_MFD_RETU=m +CONFIG_MFD_PCF50633=m +CONFIG_PCF50633_ADC=m +CONFIG_PCF50633_GPIO=m +CONFIG_UCB1400_CORE=m +CONFIG_MFD_RDC321X=m +CONFIG_MFD_RTSX_PCI=m +CONFIG_MFD_RT5033=m +CONFIG_MFD_RTSX_USB=m +CONFIG_MFD_SI476X_CORE=m +CONFIG_MFD_SM501=m +CONFIG_MFD_SM501_GPIO=y +CONFIG_MFD_SKY81452=m +CONFIG_ABX500_CORE=y +CONFIG_MFD_SYSCON=y +CONFIG_MFD_TI_AM335X_TSCADC=m +CONFIG_MFD_LP3943=m +CONFIG_MFD_TI_LMU=m +CONFIG_TPS6105X=m +CONFIG_TPS65010=m +CONFIG_TPS6507X=m +CONFIG_MFD_TPS65086=m +CONFIG_MFD_TPS65217=m +CONFIG_MFD_TI_LP873X=m +CONFIG_MFD_TPS65218=m +CONFIG_MFD_TPS65912=y +CONFIG_MFD_TPS65912_I2C=m +CONFIG_MFD_TPS65912_SPI=y +CONFIG_MFD_WL1273_CORE=m +CONFIG_MFD_LM3533=m +# CONFIG_MFD_TMIO is not set +CONFIG_MFD_VX855=m +CONFIG_MFD_ARIZONA=y +CONFIG_MFD_ARIZONA_I2C=m +CONFIG_MFD_ARIZONA_SPI=m +CONFIG_MFD_CS47L24=y +CONFIG_MFD_WM5102=y +CONFIG_MFD_WM5110=y +CONFIG_MFD_WM8997=y +CONFIG_MFD_WM8998=y +CONFIG_MFD_WM831X=y +CONFIG_MFD_WM831X_SPI=y +CONFIG_MFD_WM8994=m +CONFIG_REGULATOR=y +# CONFIG_REGULATOR_DEBUG is not set +CONFIG_REGULATOR_FIXED_VOLTAGE=m +CONFIG_REGULATOR_VIRTUAL_CONSUMER=m +CONFIG_REGULATOR_USERSPACE_CONSUMER=m +CONFIG_REGULATOR_88PM800=m +CONFIG_REGULATOR_ACT8865=m +CONFIG_REGULATOR_AD5398=m +CONFIG_REGULATOR_ANATOP=m +CONFIG_REGULATOR_ARIZONA_LDO1=m +CONFIG_REGULATOR_ARIZONA_MICSUPP=m +CONFIG_REGULATOR_AXP20X=m +CONFIG_REGULATOR_BCM590XX=m +CONFIG_REGULATOR_BD9571MWV=m +CONFIG_REGULATOR_DA9052=m +CONFIG_REGULATOR_DA9062=m +CONFIG_REGULATOR_DA9063=m +CONFIG_REGULATOR_DA9210=m +CONFIG_REGULATOR_DA9211=m +CONFIG_REGULATOR_FAN53555=m +CONFIG_REGULATOR_GPIO=m +CONFIG_REGULATOR_ISL9305=m +CONFIG_REGULATOR_ISL6271A=m +CONFIG_REGULATOR_LM363X=m +CONFIG_REGULATOR_LP3971=m +CONFIG_REGULATOR_LP3972=m +CONFIG_REGULATOR_LP872X=m +CONFIG_REGULATOR_LP8755=m +CONFIG_REGULATOR_LTC3589=m +CONFIG_REGULATOR_LTC3676=m +CONFIG_REGULATOR_MAX14577=m +CONFIG_REGULATOR_MAX1586=m +CONFIG_REGULATOR_MAX8649=m +CONFIG_REGULATOR_MAX8660=m +CONFIG_REGULATOR_MAX8907=m +CONFIG_REGULATOR_MAX8952=m +CONFIG_REGULATOR_MAX77693=m +CONFIG_REGULATOR_MC13XXX_CORE=m +CONFIG_REGULATOR_MC13783=m +CONFIG_REGULATOR_MC13892=m +CONFIG_REGULATOR_MT6311=m +CONFIG_REGULATOR_MT6323=m +CONFIG_REGULATOR_MT6397=m +CONFIG_REGULATOR_PCAP=m +CONFIG_REGULATOR_PCF50633=m +CONFIG_REGULATOR_PFUZE100=m +CONFIG_REGULATOR_PV88060=m +CONFIG_REGULATOR_PV88080=m +CONFIG_REGULATOR_PV88090=m +CONFIG_REGULATOR_PWM=m +CONFIG_REGULATOR_QCOM_SPMI=m +CONFIG_REGULATOR_RT5033=m +CONFIG_REGULATOR_SKY81452=m +CONFIG_REGULATOR_TPS51632=m +CONFIG_REGULATOR_TPS6105X=m +CONFIG_REGULATOR_TPS62360=m +CONFIG_REGULATOR_TPS65023=m +CONFIG_REGULATOR_TPS6507X=m +CONFIG_REGULATOR_TPS65086=m +CONFIG_REGULATOR_TPS65132=m +CONFIG_REGULATOR_TPS65217=m +CONFIG_REGULATOR_TPS6524X=m +CONFIG_REGULATOR_TPS65912=m +CONFIG_REGULATOR_WM831X=m 
+CONFIG_REGULATOR_WM8994=m +CONFIG_CEC_CORE=m +CONFIG_RC_CORE=m +CONFIG_RC_MAP=m +CONFIG_RC_DECODERS=y +CONFIG_LIRC=m +CONFIG_IR_LIRC_CODEC=m +CONFIG_IR_NEC_DECODER=m +CONFIG_IR_RC5_DECODER=m +CONFIG_IR_RC6_DECODER=m +CONFIG_IR_JVC_DECODER=m +CONFIG_IR_SONY_DECODER=m +CONFIG_IR_SANYO_DECODER=m +CONFIG_IR_SHARP_DECODER=m +CONFIG_IR_MCE_KBD_DECODER=m +CONFIG_IR_XMP_DECODER=m +CONFIG_RC_DEVICES=y +CONFIG_RC_ATI_REMOTE=m +CONFIG_IR_ENE=m +CONFIG_IR_HIX5HD2=m +CONFIG_IR_IMON=m +CONFIG_IR_MCEUSB=m +CONFIG_IR_ITE_CIR=m +CONFIG_IR_FINTEK=m +CONFIG_IR_NUVOTON=m +CONFIG_IR_REDRAT3=m +CONFIG_IR_SPI=m +CONFIG_IR_STREAMZAP=m +CONFIG_IR_WINBOND_CIR=m +CONFIG_IR_IGORPLUGUSB=m +CONFIG_IR_IGUANA=m +CONFIG_IR_TTUSBIR=m +CONFIG_RC_LOOPBACK=m +CONFIG_IR_GPIO_CIR=m +CONFIG_IR_GPIO_TX=m +CONFIG_IR_PWM_TX=m +CONFIG_IR_SERIAL=m +CONFIG_IR_SERIAL_TRANSMITTER=y +CONFIG_IR_SIR=m +CONFIG_MEDIA_SUPPORT=m + +# +# Multimedia core support +# +CONFIG_MEDIA_CAMERA_SUPPORT=y +CONFIG_MEDIA_ANALOG_TV_SUPPORT=y +CONFIG_MEDIA_DIGITAL_TV_SUPPORT=y +CONFIG_MEDIA_RADIO_SUPPORT=y +CONFIG_MEDIA_SDR_SUPPORT=y +CONFIG_MEDIA_CEC_SUPPORT=y +CONFIG_MEDIA_CEC_RC=y +CONFIG_MEDIA_CONTROLLER=y +CONFIG_MEDIA_CONTROLLER_DVB=y +CONFIG_VIDEO_DEV=m +CONFIG_VIDEO_V4L2_SUBDEV_API=y +CONFIG_VIDEO_V4L2=m +# CONFIG_VIDEO_ADV_DEBUG is not set +# CONFIG_VIDEO_FIXED_MINOR_RANGES is not set +CONFIG_VIDEO_PCI_SKELETON=m +CONFIG_VIDEO_TUNER=m +CONFIG_V4L2_MEM2MEM_DEV=m +CONFIG_V4L2_FLASH_LED_CLASS=m +CONFIG_V4L2_FWNODE=m +CONFIG_VIDEOBUF_GEN=m +CONFIG_VIDEOBUF_DMA_SG=m +CONFIG_VIDEOBUF_VMALLOC=m +CONFIG_VIDEOBUF_DVB=m +CONFIG_VIDEOBUF2_CORE=m +CONFIG_VIDEOBUF2_MEMOPS=m +CONFIG_VIDEOBUF2_DMA_CONTIG=m +CONFIG_VIDEOBUF2_VMALLOC=m +CONFIG_VIDEOBUF2_DMA_SG=m +CONFIG_VIDEOBUF2_DVB=m +CONFIG_DVB_CORE=m +CONFIG_DVB_NET=y +CONFIG_TTPCI_EEPROM=m +CONFIG_DVB_MAX_ADAPTERS=8 +CONFIG_DVB_DYNAMIC_MINORS=y +# CONFIG_DVB_DEMUX_SECTION_LOSS_LOG is not set + +# +# Media drivers +# +CONFIG_MEDIA_USB_SUPPORT=y + +# +# Webcam devices +# +CONFIG_USB_VIDEO_CLASS=m +CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y +CONFIG_USB_GSPCA=m +CONFIG_USB_M5602=m +CONFIG_USB_STV06XX=m +CONFIG_USB_GL860=m +CONFIG_USB_GSPCA_BENQ=m +CONFIG_USB_GSPCA_CONEX=m +CONFIG_USB_GSPCA_CPIA1=m +CONFIG_USB_GSPCA_DTCS033=m +CONFIG_USB_GSPCA_ETOMS=m +CONFIG_USB_GSPCA_FINEPIX=m +CONFIG_USB_GSPCA_JEILINJ=m +CONFIG_USB_GSPCA_JL2005BCD=m +CONFIG_USB_GSPCA_KINECT=m +CONFIG_USB_GSPCA_KONICA=m +CONFIG_USB_GSPCA_MARS=m +CONFIG_USB_GSPCA_MR97310A=m +CONFIG_USB_GSPCA_NW80X=m +CONFIG_USB_GSPCA_OV519=m +CONFIG_USB_GSPCA_OV534=m +CONFIG_USB_GSPCA_OV534_9=m +CONFIG_USB_GSPCA_PAC207=m +CONFIG_USB_GSPCA_PAC7302=m +CONFIG_USB_GSPCA_PAC7311=m +CONFIG_USB_GSPCA_SE401=m +CONFIG_USB_GSPCA_SN9C2028=m +CONFIG_USB_GSPCA_SN9C20X=m +CONFIG_USB_GSPCA_SONIXB=m +CONFIG_USB_GSPCA_SONIXJ=m +CONFIG_USB_GSPCA_SPCA500=m +CONFIG_USB_GSPCA_SPCA501=m +CONFIG_USB_GSPCA_SPCA505=m +CONFIG_USB_GSPCA_SPCA506=m +CONFIG_USB_GSPCA_SPCA508=m +CONFIG_USB_GSPCA_SPCA561=m +CONFIG_USB_GSPCA_SPCA1528=m +CONFIG_USB_GSPCA_SQ905=m +CONFIG_USB_GSPCA_SQ905C=m +CONFIG_USB_GSPCA_SQ930X=m +CONFIG_USB_GSPCA_STK014=m +CONFIG_USB_GSPCA_STK1135=m +CONFIG_USB_GSPCA_STV0680=m +CONFIG_USB_GSPCA_SUNPLUS=m +CONFIG_USB_GSPCA_T613=m +CONFIG_USB_GSPCA_TOPRO=m +CONFIG_USB_GSPCA_TOUPTEK=m +CONFIG_USB_GSPCA_TV8532=m +CONFIG_USB_GSPCA_VC032X=m +CONFIG_USB_GSPCA_VICAM=m +CONFIG_USB_GSPCA_XIRLINK_CIT=m +CONFIG_USB_GSPCA_ZC3XX=m +CONFIG_USB_PWC=m +# CONFIG_USB_PWC_DEBUG is not set +CONFIG_USB_PWC_INPUT_EVDEV=y +CONFIG_VIDEO_CPIA2=m +CONFIG_USB_ZR364XX=m +CONFIG_USB_STKWEBCAM=m +CONFIG_USB_S2255=m 
+CONFIG_VIDEO_USBTV=m + +# +# Analog TV USB devices +# +CONFIG_VIDEO_PVRUSB2=m +CONFIG_VIDEO_PVRUSB2_SYSFS=y +CONFIG_VIDEO_PVRUSB2_DVB=y +# CONFIG_VIDEO_PVRUSB2_DEBUGIFC is not set +CONFIG_VIDEO_HDPVR=m +CONFIG_VIDEO_USBVISION=m +CONFIG_VIDEO_STK1160_COMMON=m +CONFIG_VIDEO_STK1160=m +CONFIG_VIDEO_GO7007=m +CONFIG_VIDEO_GO7007_USB=m +CONFIG_VIDEO_GO7007_LOADER=m +CONFIG_VIDEO_GO7007_USB_S2250_BOARD=m + +# +# Analog/digital TV USB devices +# +CONFIG_VIDEO_AU0828=m +CONFIG_VIDEO_AU0828_V4L2=y +CONFIG_VIDEO_AU0828_RC=y +CONFIG_VIDEO_CX231XX=m +CONFIG_VIDEO_CX231XX_RC=y +CONFIG_VIDEO_CX231XX_ALSA=m +CONFIG_VIDEO_CX231XX_DVB=m +CONFIG_VIDEO_TM6000=m +CONFIG_VIDEO_TM6000_ALSA=m +CONFIG_VIDEO_TM6000_DVB=m + +# +# Digital TV USB devices +# +CONFIG_DVB_USB=m +# CONFIG_DVB_USB_DEBUG is not set +CONFIG_DVB_USB_DIB3000MC=m +CONFIG_DVB_USB_A800=m +CONFIG_DVB_USB_DIBUSB_MB=m +CONFIG_DVB_USB_DIBUSB_MB_FAULTY=y +CONFIG_DVB_USB_DIBUSB_MC=m +CONFIG_DVB_USB_DIB0700=m +CONFIG_DVB_USB_UMT_010=m +CONFIG_DVB_USB_CXUSB=m +CONFIG_DVB_USB_M920X=m +CONFIG_DVB_USB_DIGITV=m +CONFIG_DVB_USB_VP7045=m +CONFIG_DVB_USB_VP702X=m +CONFIG_DVB_USB_GP8PSK=m +CONFIG_DVB_USB_NOVA_T_USB2=m +CONFIG_DVB_USB_TTUSB2=m +CONFIG_DVB_USB_DTT200U=m +CONFIG_DVB_USB_OPERA1=m +CONFIG_DVB_USB_AF9005=m +CONFIG_DVB_USB_AF9005_REMOTE=m +CONFIG_DVB_USB_PCTV452E=m +CONFIG_DVB_USB_DW2102=m +CONFIG_DVB_USB_CINERGY_T2=m +CONFIG_DVB_USB_DTV5100=m +CONFIG_DVB_USB_FRIIO=m +CONFIG_DVB_USB_AZ6027=m +CONFIG_DVB_USB_TECHNISAT_USB2=m +CONFIG_DVB_USB_V2=m +CONFIG_DVB_USB_AF9015=m +CONFIG_DVB_USB_AF9035=m +CONFIG_DVB_USB_ANYSEE=m +CONFIG_DVB_USB_AU6610=m +CONFIG_DVB_USB_AZ6007=m +CONFIG_DVB_USB_CE6230=m +CONFIG_DVB_USB_EC168=m +CONFIG_DVB_USB_GL861=m +CONFIG_DVB_USB_LME2510=m +CONFIG_DVB_USB_MXL111SF=m +CONFIG_DVB_USB_RTL28XXU=m +CONFIG_DVB_USB_DVBSKY=m +CONFIG_DVB_USB_ZD1301=m +CONFIG_DVB_TTUSB_BUDGET=m +CONFIG_DVB_TTUSB_DEC=m +CONFIG_SMS_USB_DRV=m +CONFIG_DVB_B2C2_FLEXCOP_USB=m +# CONFIG_DVB_B2C2_FLEXCOP_USB_DEBUG is not set +CONFIG_DVB_AS102=m + +# +# Webcam, TV (analog/digital) USB devices +# +CONFIG_VIDEO_EM28XX=m +CONFIG_VIDEO_EM28XX_V4L2=m +CONFIG_VIDEO_EM28XX_ALSA=m +CONFIG_VIDEO_EM28XX_DVB=m +CONFIG_VIDEO_EM28XX_RC=m + +# +# Software defined radio USB devices +# +CONFIG_USB_AIRSPY=m +CONFIG_USB_HACKRF=m +CONFIG_USB_MSI2500=m + +# +# USB HDMI CEC adapters +# +CONFIG_USB_PULSE8_CEC=m +CONFIG_USB_RAINSHADOW_CEC=m +CONFIG_MEDIA_PCI_SUPPORT=y + +# +# Media capture support +# +CONFIG_VIDEO_MEYE=m +CONFIG_VIDEO_SOLO6X10=m +CONFIG_VIDEO_TW5864=m +CONFIG_VIDEO_TW68=m +CONFIG_VIDEO_TW686X=m +CONFIG_VIDEO_ZORAN=m +CONFIG_VIDEO_ZORAN_DC30=m +CONFIG_VIDEO_ZORAN_ZR36060=m +CONFIG_VIDEO_ZORAN_BUZ=m +CONFIG_VIDEO_ZORAN_DC10=m +CONFIG_VIDEO_ZORAN_LML33=m +CONFIG_VIDEO_ZORAN_LML33R10=m +CONFIG_VIDEO_ZORAN_AVS6EYES=m + +# +# Media capture/analog TV support +# +CONFIG_VIDEO_IVTV=m +# CONFIG_VIDEO_IVTV_DEPRECATED_IOCTLS is not set +CONFIG_VIDEO_IVTV_ALSA=m +CONFIG_VIDEO_FB_IVTV=m +CONFIG_VIDEO_HEXIUM_GEMINI=m +CONFIG_VIDEO_HEXIUM_ORION=m +CONFIG_VIDEO_MXB=m +CONFIG_VIDEO_DT3155=m + +# +# Media capture/analog/hybrid TV support +# +CONFIG_VIDEO_CX18=m +CONFIG_VIDEO_CX18_ALSA=m +CONFIG_VIDEO_CX23885=m +CONFIG_MEDIA_ALTERA_CI=m +CONFIG_VIDEO_CX25821=m +CONFIG_VIDEO_CX25821_ALSA=m +CONFIG_VIDEO_CX88=m +CONFIG_VIDEO_CX88_ALSA=m +CONFIG_VIDEO_CX88_BLACKBIRD=m +CONFIG_VIDEO_CX88_DVB=m +CONFIG_VIDEO_CX88_ENABLE_VP3054=y +CONFIG_VIDEO_CX88_VP3054=m +CONFIG_VIDEO_CX88_MPEG=m +CONFIG_VIDEO_BT848=m +CONFIG_DVB_BT8XX=m +CONFIG_VIDEO_SAA7134=m +CONFIG_VIDEO_SAA7134_ALSA=m 
+CONFIG_VIDEO_SAA7134_RC=y +CONFIG_VIDEO_SAA7134_DVB=m +CONFIG_VIDEO_SAA7134_GO7007=m +CONFIG_VIDEO_SAA7164=m +# CONFIG_VIDEO_COBALT is not set + +# +# Media digital TV PCI Adapters +# +CONFIG_DVB_AV7110_IR=y +CONFIG_DVB_AV7110=m +CONFIG_DVB_AV7110_OSD=y +CONFIG_DVB_BUDGET_CORE=m +CONFIG_DVB_BUDGET=m +CONFIG_DVB_BUDGET_CI=m +CONFIG_DVB_BUDGET_AV=m +CONFIG_DVB_BUDGET_PATCH=m +CONFIG_DVB_B2C2_FLEXCOP_PCI=m +# CONFIG_DVB_B2C2_FLEXCOP_PCI_DEBUG is not set +CONFIG_DVB_PLUTO2=m +CONFIG_DVB_DM1105=m +CONFIG_DVB_PT1=m +CONFIG_DVB_PT3=m +CONFIG_MANTIS_CORE=m +CONFIG_DVB_MANTIS=m +CONFIG_DVB_HOPPER=m +CONFIG_DVB_NGENE=m +CONFIG_DVB_DDBRIDGE=m +# CONFIG_DVB_DDBRIDGE_MSIENABLE is not set +CONFIG_DVB_SMIPCIE=m +CONFIG_DVB_NETUP_UNIDVB=m +CONFIG_V4L_PLATFORM_DRIVERS=y +CONFIG_VIDEO_CAFE_CCIC=m +CONFIG_SOC_CAMERA=m +CONFIG_SOC_CAMERA_PLATFORM=m +CONFIG_V4L_MEM2MEM_DRIVERS=y +CONFIG_VIDEO_MEM2MEM_DEINTERLACE=m +CONFIG_VIDEO_SH_VEU=m +# CONFIG_V4L_TEST_DRIVERS is not set +CONFIG_DVB_PLATFORM_DRIVERS=y +CONFIG_CEC_PLATFORM_DRIVERS=y +CONFIG_SDR_PLATFORM_DRIVERS=y + +# +# Supported MMC/SDIO adapters +# +CONFIG_SMS_SDIO_DRV=m +CONFIG_RADIO_ADAPTERS=y +CONFIG_RADIO_TEA575X=m +CONFIG_RADIO_SI470X=y +CONFIG_USB_SI470X=m +CONFIG_I2C_SI470X=m +CONFIG_RADIO_SI4713=m +CONFIG_USB_SI4713=m +CONFIG_PLATFORM_SI4713=m +CONFIG_I2C_SI4713=m +CONFIG_RADIO_SI476X=m +CONFIG_USB_MR800=m +CONFIG_USB_DSBR=m +CONFIG_RADIO_MAXIRADIO=m +CONFIG_RADIO_SHARK=m +CONFIG_RADIO_SHARK2=m +CONFIG_USB_KEENE=m +CONFIG_USB_RAREMONO=m +CONFIG_USB_MA901=m +CONFIG_RADIO_TEA5764=m +CONFIG_RADIO_SAA7706H=m +CONFIG_RADIO_TEF6862=m +CONFIG_RADIO_WL1273=m + +# +# Texas Instruments WL128x FM driver (ST based) +# +CONFIG_RADIO_WL128X=m + +# +# Supported FireWire (IEEE 1394) Adapters +# +CONFIG_DVB_FIREDTV=m +CONFIG_DVB_FIREDTV_INPUT=y +CONFIG_MEDIA_COMMON_OPTIONS=y + +# +# common driver options +# +CONFIG_VIDEO_CX2341X=m +CONFIG_VIDEO_TVEEPROM=m +CONFIG_CYPRESS_FIRMWARE=m +CONFIG_DVB_B2C2_FLEXCOP=m +CONFIG_VIDEO_SAA7146=m +CONFIG_VIDEO_SAA7146_VV=m +CONFIG_SMS_SIANO_MDTV=m +CONFIG_SMS_SIANO_RC=y +# CONFIG_SMS_SIANO_DEBUGFS is not set + +# +# Media ancillary drivers (tuners, sensors, i2c, spi, frontends) +# +CONFIG_MEDIA_SUBDRV_AUTOSELECT=y +CONFIG_MEDIA_ATTACH=y +CONFIG_VIDEO_IR_I2C=m + +# +# Audio decoders, processors and mixers +# +CONFIG_VIDEO_TVAUDIO=m +CONFIG_VIDEO_TDA7432=m +CONFIG_VIDEO_TDA9840=m +CONFIG_VIDEO_TEA6415C=m +CONFIG_VIDEO_TEA6420=m +CONFIG_VIDEO_MSP3400=m +CONFIG_VIDEO_CS3308=m +CONFIG_VIDEO_CS5345=m +CONFIG_VIDEO_CS53L32A=m +CONFIG_VIDEO_UDA1342=m +CONFIG_VIDEO_WM8775=m +CONFIG_VIDEO_WM8739=m +CONFIG_VIDEO_VP27SMPX=m +CONFIG_VIDEO_SONY_BTF_MPX=m + +# +# RDS decoders +# +CONFIG_VIDEO_SAA6588=m + +# +# Video decoders +# +CONFIG_VIDEO_BT819=m +CONFIG_VIDEO_BT856=m +CONFIG_VIDEO_BT866=m +CONFIG_VIDEO_KS0127=m +CONFIG_VIDEO_SAA7110=m +CONFIG_VIDEO_SAA711X=m +CONFIG_VIDEO_TVP5150=m +CONFIG_VIDEO_TW2804=m +CONFIG_VIDEO_TW9903=m +CONFIG_VIDEO_TW9906=m +CONFIG_VIDEO_VPX3220=m + +# +# Video and audio decoders +# +CONFIG_VIDEO_SAA717X=m +CONFIG_VIDEO_CX25840=m + +# +# Video encoders +# +CONFIG_VIDEO_SAA7127=m +CONFIG_VIDEO_SAA7185=m +CONFIG_VIDEO_ADV7170=m +CONFIG_VIDEO_ADV7175=m + +# +# Camera sensor devices +# +CONFIG_VIDEO_OV2640=m +CONFIG_VIDEO_OV7640=m +CONFIG_VIDEO_OV7670=m +CONFIG_VIDEO_MT9M111=m +CONFIG_VIDEO_MT9V011=m + +# +# Flash devices +# + +# +# Video improvement chips +# +CONFIG_VIDEO_UPD64031A=m +CONFIG_VIDEO_UPD64083=m + +# +# Audio/Video compression chips +# +CONFIG_VIDEO_SAA6752HS=m + +# +# SDR tuner chips +# + +# +# 
Miscellaneous helper chips +# +CONFIG_VIDEO_M52790=m + +# +# Sensors used on soc_camera driver +# + +# +# soc_camera sensor drivers +# +CONFIG_SOC_CAMERA_IMX074=m +CONFIG_SOC_CAMERA_MT9M001=m +CONFIG_SOC_CAMERA_MT9M111=m +CONFIG_SOC_CAMERA_MT9T031=m +CONFIG_SOC_CAMERA_MT9T112=m +CONFIG_SOC_CAMERA_MT9V022=m +CONFIG_SOC_CAMERA_OV5642=m +CONFIG_SOC_CAMERA_OV772X=m +CONFIG_SOC_CAMERA_OV9640=m +CONFIG_SOC_CAMERA_OV9740=m +CONFIG_SOC_CAMERA_RJ54N1=m +CONFIG_SOC_CAMERA_TW9910=m +CONFIG_MEDIA_TUNER=m +CONFIG_MEDIA_TUNER_SIMPLE=m +CONFIG_MEDIA_TUNER_TDA8290=m +CONFIG_MEDIA_TUNER_TDA827X=m +CONFIG_MEDIA_TUNER_TDA18271=m +CONFIG_MEDIA_TUNER_TDA9887=m +CONFIG_MEDIA_TUNER_TEA5761=m +CONFIG_MEDIA_TUNER_TEA5767=m +CONFIG_MEDIA_TUNER_MSI001=m +CONFIG_MEDIA_TUNER_MT20XX=m +CONFIG_MEDIA_TUNER_MT2060=m +CONFIG_MEDIA_TUNER_MT2063=m +CONFIG_MEDIA_TUNER_MT2266=m +CONFIG_MEDIA_TUNER_MT2131=m +CONFIG_MEDIA_TUNER_QT1010=m +CONFIG_MEDIA_TUNER_XC2028=m +CONFIG_MEDIA_TUNER_XC5000=m +CONFIG_MEDIA_TUNER_XC4000=m +CONFIG_MEDIA_TUNER_MXL5005S=m +CONFIG_MEDIA_TUNER_MXL5007T=m +CONFIG_MEDIA_TUNER_MC44S803=m +CONFIG_MEDIA_TUNER_MAX2165=m +CONFIG_MEDIA_TUNER_TDA18218=m +CONFIG_MEDIA_TUNER_FC0011=m +CONFIG_MEDIA_TUNER_FC0012=m +CONFIG_MEDIA_TUNER_FC0013=m +CONFIG_MEDIA_TUNER_TDA18212=m +CONFIG_MEDIA_TUNER_E4000=m +CONFIG_MEDIA_TUNER_FC2580=m +CONFIG_MEDIA_TUNER_M88RS6000T=m +CONFIG_MEDIA_TUNER_TUA9001=m +CONFIG_MEDIA_TUNER_SI2157=m +CONFIG_MEDIA_TUNER_IT913X=m +CONFIG_MEDIA_TUNER_R820T=m +CONFIG_MEDIA_TUNER_MXL301RF=m +CONFIG_MEDIA_TUNER_QM1D1C0042=m + +# +# Multistandard (satellite) frontends +# +CONFIG_DVB_STB0899=m +CONFIG_DVB_STB6100=m +CONFIG_DVB_STV090x=m +CONFIG_DVB_STV0910=m +CONFIG_DVB_STV6110x=m +CONFIG_DVB_STV6111=m +CONFIG_DVB_MXL5XX=m +CONFIG_DVB_M88DS3103=m + +# +# Multistandard (cable + terrestrial) frontends +# +CONFIG_DVB_DRXK=m +CONFIG_DVB_TDA18271C2DD=m +CONFIG_DVB_SI2165=m +CONFIG_DVB_MN88472=m +CONFIG_DVB_MN88473=m + +# +# DVB-S (satellite) frontends +# +CONFIG_DVB_CX24110=m +CONFIG_DVB_CX24123=m +CONFIG_DVB_MT312=m +CONFIG_DVB_ZL10036=m +CONFIG_DVB_ZL10039=m +CONFIG_DVB_S5H1420=m +CONFIG_DVB_STV0288=m +CONFIG_DVB_STB6000=m +CONFIG_DVB_STV0299=m +CONFIG_DVB_STV6110=m +CONFIG_DVB_STV0900=m +CONFIG_DVB_TDA8083=m +CONFIG_DVB_TDA10086=m +CONFIG_DVB_TDA8261=m +CONFIG_DVB_VES1X93=m +CONFIG_DVB_TUNER_ITD1000=m +CONFIG_DVB_TUNER_CX24113=m +CONFIG_DVB_TDA826X=m +CONFIG_DVB_TUA6100=m +CONFIG_DVB_CX24116=m +CONFIG_DVB_CX24117=m +CONFIG_DVB_CX24120=m +CONFIG_DVB_SI21XX=m +CONFIG_DVB_TS2020=m +CONFIG_DVB_DS3000=m +CONFIG_DVB_MB86A16=m +CONFIG_DVB_TDA10071=m + +# +# DVB-T (terrestrial) frontends +# +CONFIG_DVB_SP8870=m +CONFIG_DVB_SP887X=m +CONFIG_DVB_CX22700=m +CONFIG_DVB_CX22702=m +CONFIG_DVB_DRXD=m +CONFIG_DVB_L64781=m +CONFIG_DVB_TDA1004X=m +CONFIG_DVB_NXT6000=m +CONFIG_DVB_MT352=m +CONFIG_DVB_ZL10353=m +CONFIG_DVB_DIB3000MB=m +CONFIG_DVB_DIB3000MC=m +CONFIG_DVB_DIB7000M=m +CONFIG_DVB_DIB7000P=m +CONFIG_DVB_TDA10048=m +CONFIG_DVB_AF9013=m +CONFIG_DVB_EC100=m +CONFIG_DVB_STV0367=m +CONFIG_DVB_CXD2820R=m +CONFIG_DVB_CXD2841ER=m +CONFIG_DVB_RTL2830=m +CONFIG_DVB_RTL2832=m +CONFIG_DVB_RTL2832_SDR=m +CONFIG_DVB_SI2168=m +CONFIG_DVB_AS102_FE=m +CONFIG_DVB_ZD1301_DEMOD=m +CONFIG_DVB_GP8PSK_FE=m + +# +# DVB-C (cable) frontends +# +CONFIG_DVB_VES1820=m +CONFIG_DVB_TDA10021=m +CONFIG_DVB_TDA10023=m +CONFIG_DVB_STV0297=m + +# +# ATSC (North American/Korean Terrestrial/Cable DTV) frontends +# +CONFIG_DVB_NXT200X=m +CONFIG_DVB_OR51211=m +CONFIG_DVB_OR51132=m +CONFIG_DVB_BCM3510=m +CONFIG_DVB_LGDT330X=m +CONFIG_DVB_LGDT3305=m 
+CONFIG_DVB_LGDT3306A=m +CONFIG_DVB_LG2160=m +CONFIG_DVB_S5H1409=m +CONFIG_DVB_AU8522=m +CONFIG_DVB_AU8522_DTV=m +CONFIG_DVB_AU8522_V4L=m +CONFIG_DVB_S5H1411=m + +# +# ISDB-T (terrestrial) frontends +# +CONFIG_DVB_S921=m +CONFIG_DVB_DIB8000=m +CONFIG_DVB_MB86A20S=m + +# +# ISDB-S (satellite) & ISDB-T (terrestrial) frontends +# +CONFIG_DVB_TC90522=m + +# +# Digital terrestrial only tuners/PLL +# +CONFIG_DVB_PLL=m +CONFIG_DVB_TUNER_DIB0070=m +CONFIG_DVB_TUNER_DIB0090=m + +# +# SEC control devices for DVB-S +# +CONFIG_DVB_DRX39XYJ=m +CONFIG_DVB_LNBH25=m +CONFIG_DVB_LNBP21=m +CONFIG_DVB_LNBP22=m +CONFIG_DVB_ISL6405=m +CONFIG_DVB_ISL6421=m +CONFIG_DVB_ISL6423=m +CONFIG_DVB_A8293=m +CONFIG_DVB_SP2=m +CONFIG_DVB_LGS8GXX=m +CONFIG_DVB_ATBM8830=m +CONFIG_DVB_TDA665x=m +CONFIG_DVB_IX2505V=m +CONFIG_DVB_M88RS2000=m +CONFIG_DVB_AF9033=m +CONFIG_DVB_HORUS3A=m +CONFIG_DVB_ASCOT2E=m +CONFIG_DVB_HELENE=m + +# +# Tools to develop new frontends +# +# CONFIG_DVB_DUMMY_FE is not set + +# +# Graphics support +# +CONFIG_AGP=m +CONFIG_AGP_AMD64=m +CONFIG_AGP_INTEL=m +CONFIG_AGP_SIS=m +CONFIG_AGP_VIA=m +CONFIG_INTEL_GTT=m +CONFIG_VGA_ARB=y +CONFIG_VGA_ARB_MAX_GPUS=16 +CONFIG_VGA_SWITCHEROO=y +CONFIG_DRM=m +CONFIG_DRM_MIPI_DSI=y +CONFIG_DRM_DP_AUX_CHARDEV=y +# CONFIG_DRM_DEBUG_MM_SELFTEST is not set +CONFIG_DRM_KMS_HELPER=m +CONFIG_DRM_KMS_FB_HELPER=y +CONFIG_DRM_FBDEV_EMULATION=y +CONFIG_DRM_FBDEV_OVERALLOC=100 +# CONFIG_DRM_LOAD_EDID_FIRMWARE is not set +CONFIG_DRM_TTM=m +CONFIG_DRM_GEM_CMA_HELPER=y +CONFIG_DRM_KMS_CMA_HELPER=y +CONFIG_DRM_VM=y + +# +# I2C encoder or helper chips +# +CONFIG_DRM_I2C_CH7006=m +CONFIG_DRM_I2C_SIL164=m +CONFIG_DRM_I2C_NXP_TDA998X=m +CONFIG_DRM_RADEON=m +# CONFIG_DRM_RADEON_USERPTR is not set +CONFIG_DRM_AMDGPU=m +CONFIG_DRM_AMDGPU_SI=y +CONFIG_DRM_AMDGPU_CIK=y +CONFIG_DRM_AMDGPU_USERPTR=y +# CONFIG_DRM_AMDGPU_GART_DEBUGFS is not set + +# +# ACP (Audio CoProcessor) Configuration +# +CONFIG_DRM_AMD_ACP=y +CONFIG_DRM_NOUVEAU=m +CONFIG_NOUVEAU_DEBUG=5 +CONFIG_NOUVEAU_DEBUG_DEFAULT=3 +CONFIG_DRM_NOUVEAU_BACKLIGHT=y +CONFIG_DRM_I915=m +# CONFIG_DRM_I915_ALPHA_SUPPORT is not set +CONFIG_DRM_I915_CAPTURE_ERROR=y +CONFIG_DRM_I915_COMPRESS_ERROR=y +CONFIG_DRM_I915_USERPTR=y +CONFIG_DRM_I915_GVT=y +CONFIG_DRM_I915_GVT_KVMGT=m +CONFIG_DRM_VGEM=m +CONFIG_DRM_VMWGFX=m +CONFIG_DRM_VMWGFX_FBCON=y +CONFIG_DRM_GMA500=m +CONFIG_DRM_GMA600=y +CONFIG_DRM_GMA3600=y +CONFIG_DRM_UDL=m +CONFIG_DRM_AST=m +CONFIG_DRM_MGAG200=m +CONFIG_DRM_CIRRUS_QEMU=m +CONFIG_DRM_QXL=m +CONFIG_DRM_BOCHS=m +CONFIG_DRM_VIRTIO_GPU=m +CONFIG_DRM_PANEL=y + +# +# Display Panels +# +CONFIG_DRM_BRIDGE=y +CONFIG_DRM_PANEL_BRIDGE=y + +# +# Display Interface Bridges +# +CONFIG_DRM_ANALOGIX_ANX78XX=m +CONFIG_HSA_AMD=m +CONFIG_DRM_HISI_HIBMC=m +CONFIG_DRM_TINYDRM=m +CONFIG_TINYDRM_MIPI_DBI=m +CONFIG_TINYDRM_MI0283QT=m +CONFIG_TINYDRM_REPAPER=m +CONFIG_TINYDRM_ST7586=m +# CONFIG_DRM_LEGACY is not set +# CONFIG_DRM_LIB_RANDOM is not set + +# +# Frame buffer Devices +# +CONFIG_FB=y +CONFIG_FIRMWARE_EDID=y +CONFIG_FB_CMDLINE=y +CONFIG_FB_NOTIFY=y +# CONFIG_FB_DDC is not set +CONFIG_FB_BOOT_VESA_SUPPORT=y +CONFIG_FB_CFB_FILLRECT=y +CONFIG_FB_CFB_COPYAREA=y +CONFIG_FB_CFB_IMAGEBLIT=y +# CONFIG_FB_CFB_REV_PIXELS_IN_BYTE is not set +CONFIG_FB_SYS_FILLRECT=m +CONFIG_FB_SYS_COPYAREA=m +CONFIG_FB_SYS_IMAGEBLIT=m +# CONFIG_FB_PROVIDE_GET_FB_UNMAPPED_AREA is not set +CONFIG_FB_FOREIGN_ENDIAN=y +CONFIG_FB_BOTH_ENDIAN=y +# CONFIG_FB_BIG_ENDIAN is not set +# CONFIG_FB_LITTLE_ENDIAN is not set +CONFIG_FB_SYS_FOPS=m +CONFIG_FB_DEFERRED_IO=y +# 
CONFIG_FB_SVGALIB is not set +# CONFIG_FB_MACMODES is not set +CONFIG_FB_BACKLIGHT=y +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_TILEBLITTING=y + +# +# Frame buffer hardware drivers +# +# CONFIG_FB_CIRRUS is not set +# CONFIG_FB_PM2 is not set +# CONFIG_FB_CYBER2000 is not set +# CONFIG_FB_ARC is not set +# CONFIG_FB_ASILIANT is not set +# CONFIG_FB_IMSTT is not set +# CONFIG_FB_VGA16 is not set +# CONFIG_FB_UVESA is not set +CONFIG_FB_VESA=y +CONFIG_FB_EFI=y +# CONFIG_FB_N411 is not set +# CONFIG_FB_HGA is not set +# CONFIG_FB_OPENCORES is not set +# CONFIG_FB_S1D13XXX is not set +# CONFIG_FB_NVIDIA is not set +# CONFIG_FB_RIVA is not set +# CONFIG_FB_I740 is not set +# CONFIG_FB_LE80578 is not set +# CONFIG_FB_MATROX is not set +# CONFIG_FB_RADEON is not set +# CONFIG_FB_ATY128 is not set +# CONFIG_FB_ATY is not set +# CONFIG_FB_S3 is not set +# CONFIG_FB_SAVAGE is not set +# CONFIG_FB_SIS is not set +# CONFIG_FB_VIA is not set +# CONFIG_FB_NEOMAGIC is not set +# CONFIG_FB_KYRO is not set +# CONFIG_FB_3DFX is not set +# CONFIG_FB_VOODOO1 is not set +# CONFIG_FB_VT8623 is not set +# CONFIG_FB_TRIDENT is not set +# CONFIG_FB_ARK is not set +# CONFIG_FB_PM3 is not set +# CONFIG_FB_CARMINE is not set +# CONFIG_FB_SM501 is not set +# CONFIG_FB_SMSCUFX is not set +# CONFIG_FB_UDL is not set +# CONFIG_FB_IBM_GXT4500 is not set +# CONFIG_FB_VIRTUAL is not set +# CONFIG_FB_METRONOME is not set +# CONFIG_FB_MB862XX is not set +# CONFIG_FB_BROADSHEET is not set +# CONFIG_FB_AUO_K190X is not set +# CONFIG_FB_HYPERV is not set +CONFIG_FB_SIMPLE=y +# CONFIG_FB_SM712 is not set +CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_LCD_CLASS_DEVICE=m +CONFIG_LCD_L4F00242T03=m +CONFIG_LCD_LMS283GF05=m +CONFIG_LCD_LTV350QV=m +CONFIG_LCD_ILI922X=m +CONFIG_LCD_ILI9320=m +CONFIG_LCD_TDO24M=m +CONFIG_LCD_VGG2432A4=m +CONFIG_LCD_PLATFORM=m +CONFIG_LCD_S6E63M0=m +CONFIG_LCD_LD9040=m +CONFIG_LCD_AMS369FG06=m +CONFIG_LCD_LMS501KF03=m +CONFIG_LCD_HX8357=m +CONFIG_BACKLIGHT_CLASS_DEVICE=y +CONFIG_BACKLIGHT_GENERIC=m +CONFIG_BACKLIGHT_LM3533=m +CONFIG_BACKLIGHT_PWM=m +CONFIG_BACKLIGHT_DA9052=m +CONFIG_BACKLIGHT_APPLE=m +CONFIG_BACKLIGHT_PM8941_WLED=m +CONFIG_BACKLIGHT_SAHARA=m +CONFIG_BACKLIGHT_WM831X=m +CONFIG_BACKLIGHT_ADP8860=m +CONFIG_BACKLIGHT_ADP8870=m +CONFIG_BACKLIGHT_PCF50633=m +CONFIG_BACKLIGHT_LM3630A=m +CONFIG_BACKLIGHT_LM3639=m +CONFIG_BACKLIGHT_LP855X=m +CONFIG_BACKLIGHT_SKY81452=m +CONFIG_BACKLIGHT_TPS65217=m +CONFIG_BACKLIGHT_GPIO=m +CONFIG_BACKLIGHT_LV5207LP=m +CONFIG_BACKLIGHT_BD6107=m +CONFIG_BACKLIGHT_ARCXCNN=m +# CONFIG_VGASTATE is not set +CONFIG_HDMI=y + +# +# Console display driver support +# +CONFIG_VGA_CONSOLE=y +CONFIG_VGACON_SOFT_SCROLLBACK=y +CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 +# CONFIG_VGACON_SOFT_SCROLLBACK_PERSISTENT_ENABLE_BY_DEFAULT is not set +CONFIG_DUMMY_CONSOLE=y +CONFIG_DUMMY_CONSOLE_COLUMNS=80 +CONFIG_DUMMY_CONSOLE_ROWS=25 +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y +CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y +CONFIG_LOGO=y +CONFIG_LOGO_LINUX_MONO=y +CONFIG_LOGO_LINUX_VGA16=y +CONFIG_LOGO_LINUX_CLUT224=y +CONFIG_SOUND=m +CONFIG_SOUND_OSS_CORE=y +CONFIG_SOUND_OSS_CORE_PRECLAIM=y +CONFIG_SND=m +CONFIG_SND_TIMER=m +CONFIG_SND_PCM=m +CONFIG_SND_PCM_ELD=y +CONFIG_SND_PCM_IEC958=y +CONFIG_SND_DMAENGINE_PCM=m +CONFIG_SND_HWDEP=m +CONFIG_SND_SEQ_DEVICE=m +CONFIG_SND_RAWMIDI=m +CONFIG_SND_COMPRESS_OFFLOAD=m +CONFIG_SND_JACK=y +CONFIG_SND_JACK_INPUT_DEV=y +CONFIG_SND_OSSEMUL=y +CONFIG_SND_MIXER_OSS=m +CONFIG_SND_PCM_OSS=m +CONFIG_SND_PCM_OSS_PLUGINS=y +CONFIG_SND_PCM_TIMER=y 
+CONFIG_SND_HRTIMER=m +CONFIG_SND_DYNAMIC_MINORS=y +CONFIG_SND_MAX_CARDS=32 +CONFIG_SND_SUPPORT_OLD_API=y +CONFIG_SND_PROC_FS=y +CONFIG_SND_VERBOSE_PROCFS=y +# CONFIG_SND_VERBOSE_PRINTK is not set +# CONFIG_SND_DEBUG is not set +CONFIG_SND_VMASTER=y +CONFIG_SND_DMA_SGBUF=y +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_SEQUENCER_OSS=m +CONFIG_SND_SEQ_HRTIMER_DEFAULT=y +CONFIG_SND_SEQ_MIDI_EVENT=m +CONFIG_SND_SEQ_MIDI=m +CONFIG_SND_SEQ_MIDI_EMUL=m +CONFIG_SND_SEQ_VIRMIDI=m +CONFIG_SND_MPU401_UART=m +CONFIG_SND_OPL3_LIB=m +CONFIG_SND_OPL3_LIB_SEQ=m +# CONFIG_SND_OPL4_LIB_SEQ is not set +CONFIG_SND_VX_LIB=m +CONFIG_SND_AC97_CODEC=m +CONFIG_SND_DRIVERS=y +# CONFIG_SND_PCSP is not set +CONFIG_SND_DUMMY=m +CONFIG_SND_ALOOP=m +CONFIG_SND_VIRMIDI=m +CONFIG_SND_MTPAV=m +CONFIG_SND_MTS64=m +CONFIG_SND_SERIAL_U16550=m +CONFIG_SND_MPU401=m +CONFIG_SND_PORTMAN2X4=m +CONFIG_SND_AC97_POWER_SAVE=y +CONFIG_SND_AC97_POWER_SAVE_DEFAULT=0 +CONFIG_SND_SB_COMMON=m +CONFIG_SND_PCI=y +CONFIG_SND_AD1889=m +CONFIG_SND_ALS300=m +CONFIG_SND_ALS4000=m +CONFIG_SND_ALI5451=m +CONFIG_SND_ASIHPI=m +CONFIG_SND_ATIIXP=m +CONFIG_SND_ATIIXP_MODEM=m +CONFIG_SND_AU8810=m +CONFIG_SND_AU8820=m +CONFIG_SND_AU8830=m +CONFIG_SND_AW2=m +CONFIG_SND_AZT3328=m +CONFIG_SND_BT87X=m +CONFIG_SND_BT87X_OVERCLOCK=y +CONFIG_SND_CA0106=m +CONFIG_SND_CMIPCI=m +CONFIG_SND_OXYGEN_LIB=m +CONFIG_SND_OXYGEN=m +CONFIG_SND_CS4281=m +CONFIG_SND_CS46XX=m +CONFIG_SND_CS46XX_NEW_DSP=y +CONFIG_SND_CTXFI=m +CONFIG_SND_DARLA20=m +CONFIG_SND_GINA20=m +CONFIG_SND_LAYLA20=m +CONFIG_SND_DARLA24=m +CONFIG_SND_GINA24=m +CONFIG_SND_LAYLA24=m +CONFIG_SND_MONA=m +CONFIG_SND_MIA=m +CONFIG_SND_ECHO3G=m +CONFIG_SND_INDIGO=m +CONFIG_SND_INDIGOIO=m +CONFIG_SND_INDIGODJ=m +CONFIG_SND_INDIGOIOX=m +CONFIG_SND_INDIGODJX=m +CONFIG_SND_EMU10K1=m +CONFIG_SND_EMU10K1_SEQ=m +CONFIG_SND_EMU10K1X=m +CONFIG_SND_ENS1370=m +CONFIG_SND_ENS1371=m +CONFIG_SND_ES1938=m +CONFIG_SND_ES1968=m +CONFIG_SND_ES1968_INPUT=y +CONFIG_SND_ES1968_RADIO=y +CONFIG_SND_FM801=m +CONFIG_SND_FM801_TEA575X_BOOL=y +CONFIG_SND_HDSP=m +CONFIG_SND_HDSPM=m +CONFIG_SND_ICE1712=m +CONFIG_SND_ICE1724=m +CONFIG_SND_INTEL8X0=m +CONFIG_SND_INTEL8X0M=m +CONFIG_SND_KORG1212=m +CONFIG_SND_LOLA=m +CONFIG_SND_LX6464ES=m +CONFIG_SND_MAESTRO3=m +CONFIG_SND_MAESTRO3_INPUT=y +CONFIG_SND_MIXART=m +CONFIG_SND_NM256=m +CONFIG_SND_PCXHR=m +CONFIG_SND_RIPTIDE=m +CONFIG_SND_RME32=m +CONFIG_SND_RME96=m +CONFIG_SND_RME9652=m +CONFIG_SND_SONICVIBES=m +CONFIG_SND_TRIDENT=m +CONFIG_SND_VIA82XX=m +CONFIG_SND_VIA82XX_MODEM=m +CONFIG_SND_VIRTUOSO=m +CONFIG_SND_VX222=m +CONFIG_SND_YMFPCI=m + +# +# HD-Audio +# +CONFIG_SND_HDA=m +CONFIG_SND_HDA_INTEL=m +CONFIG_SND_HDA_HWDEP=y +CONFIG_SND_HDA_RECONFIG=y +CONFIG_SND_HDA_INPUT_BEEP=y +CONFIG_SND_HDA_INPUT_BEEP_MODE=1 +CONFIG_SND_HDA_PATCH_LOADER=y +CONFIG_SND_HDA_CODEC_REALTEK=m +CONFIG_SND_HDA_CODEC_ANALOG=m +CONFIG_SND_HDA_CODEC_SIGMATEL=m +CONFIG_SND_HDA_CODEC_VIA=m +CONFIG_SND_HDA_CODEC_HDMI=m +CONFIG_SND_HDA_CODEC_CIRRUS=m +CONFIG_SND_HDA_CODEC_CONEXANT=m +CONFIG_SND_HDA_CODEC_CA0110=m +CONFIG_SND_HDA_CODEC_CA0132=m +CONFIG_SND_HDA_CODEC_CA0132_DSP=y +CONFIG_SND_HDA_CODEC_CMEDIA=m +CONFIG_SND_HDA_CODEC_SI3054=m +CONFIG_SND_HDA_GENERIC=m +CONFIG_SND_HDA_POWER_SAVE_DEFAULT=0 +CONFIG_SND_HDA_CORE=m +CONFIG_SND_HDA_DSP_LOADER=y +CONFIG_SND_HDA_I915=y +CONFIG_SND_HDA_EXT_CORE=m +CONFIG_SND_HDA_PREALLOC_SIZE=64 +CONFIG_SND_SPI=y +CONFIG_SND_USB=y +CONFIG_SND_USB_AUDIO=m +CONFIG_SND_USB_UA101=m +CONFIG_SND_USB_USX2Y=m +CONFIG_SND_USB_CAIAQ=m +CONFIG_SND_USB_CAIAQ_INPUT=y 
+CONFIG_SND_USB_US122L=m +CONFIG_SND_USB_6FIRE=m +CONFIG_SND_USB_HIFACE=m +CONFIG_SND_BCD2000=m +CONFIG_SND_USB_LINE6=m +CONFIG_SND_USB_POD=m +CONFIG_SND_USB_PODHD=m +CONFIG_SND_USB_TONEPORT=m +CONFIG_SND_USB_VARIAX=m +CONFIG_SND_FIREWIRE=y +CONFIG_SND_FIREWIRE_LIB=m +CONFIG_SND_DICE=m +CONFIG_SND_OXFW=m +CONFIG_SND_ISIGHT=m +CONFIG_SND_FIREWORKS=m +CONFIG_SND_BEBOB=m +CONFIG_SND_FIREWIRE_DIGI00X=m +CONFIG_SND_FIREWIRE_TASCAM=m +CONFIG_SND_FIREWIRE_MOTU=m +CONFIG_SND_FIREFACE=m +CONFIG_SND_PCMCIA=y +CONFIG_SND_VXPOCKET=m +CONFIG_SND_PDAUDIOCF=m +CONFIG_SND_SOC=m +CONFIG_SND_SOC_GENERIC_DMAENGINE_PCM=y +CONFIG_SND_SOC_COMPRESS=y +CONFIG_SND_SOC_TOPOLOGY=y +CONFIG_SND_SOC_AMD_ACP=m +CONFIG_SND_ATMEL_SOC=m +CONFIG_SND_DESIGNWARE_I2S=m +CONFIG_SND_DESIGNWARE_PCM=y + +# +# SoC Audio for Freescale CPUs +# + +# +# Common SoC Audio options for Freescale CPUs: +# +CONFIG_SND_SOC_FSL_ASRC=m +CONFIG_SND_SOC_FSL_SAI=m +CONFIG_SND_SOC_FSL_SSI=m +CONFIG_SND_SOC_FSL_SPDIF=m +CONFIG_SND_SOC_FSL_ESAI=m +CONFIG_SND_SOC_IMX_AUDMUX=m +CONFIG_SND_I2S_HI6210_I2S=m +CONFIG_SND_SOC_IMG=y +CONFIG_SND_SOC_IMG_I2S_IN=m +CONFIG_SND_SOC_IMG_I2S_OUT=m +CONFIG_SND_SOC_IMG_PARALLEL_OUT=m +CONFIG_SND_SOC_IMG_SPDIF_IN=m +CONFIG_SND_SOC_IMG_SPDIF_OUT=m +CONFIG_SND_SOC_IMG_PISTACHIO_INTERNAL_DAC=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM=m +CONFIG_SND_SST_IPC=m +CONFIG_SND_SST_IPC_ACPI=m +CONFIG_SND_SOC_INTEL_SST=m +CONFIG_SND_SOC_INTEL_SST_FIRMWARE=m +CONFIG_SND_SOC_INTEL_SST_ACPI=m +CONFIG_SND_SOC_INTEL_SST_MATCH=m +CONFIG_SND_SOC_INTEL_HASWELL=m +CONFIG_SND_SOC_INTEL_HASWELL_MACH=m +CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_BXT_RT298_MACH=m +CONFIG_SND_SOC_INTEL_BDW_RT5677_MACH=m +CONFIG_SND_SOC_INTEL_BROADWELL_MACH=m +CONFIG_SND_SOC_INTEL_BYTCR_RT5640_MACH=m +CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_RT5672_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_RT5645_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_MAX98090_TI_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_DA7213_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_ES8316_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_NOCODEC_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5663_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5663_RT5514_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_SKYLAKE=m +CONFIG_SND_SOC_INTEL_SKL_RT286_MACH=m +CONFIG_SND_SOC_INTEL_SKL_NAU88L25_SSM4567_MACH=m +CONFIG_SND_SOC_INTEL_SKL_NAU88L25_MAX98357A_MACH=m + +# +# STMicroelectronics STM32 SOC audio support +# +CONFIG_SND_SOC_XTFPGA_I2S=m +CONFIG_ZX_TDM=m +CONFIG_SND_SOC_I2C_AND_SPI=m + +# +# CODEC drivers +# +# CONFIG_SND_SOC_AC97_CODEC is not set +CONFIG_SND_SOC_ADAU_UTILS=m +CONFIG_SND_SOC_ADAU1701=m +CONFIG_SND_SOC_ADAU17X1=m +CONFIG_SND_SOC_ADAU1761=m +CONFIG_SND_SOC_ADAU1761_I2C=m +CONFIG_SND_SOC_ADAU1761_SPI=m +CONFIG_SND_SOC_ADAU7002=m +CONFIG_SND_SOC_AK4104=m +CONFIG_SND_SOC_AK4554=m +CONFIG_SND_SOC_AK4613=m +CONFIG_SND_SOC_AK4642=m +CONFIG_SND_SOC_AK5386=m +CONFIG_SND_SOC_ALC5623=m +# CONFIG_SND_SOC_BT_SCO is not set +CONFIG_SND_SOC_CS35L32=m +CONFIG_SND_SOC_CS35L33=m +CONFIG_SND_SOC_CS35L34=m +CONFIG_SND_SOC_CS35L35=m +CONFIG_SND_SOC_CS42L42=m +CONFIG_SND_SOC_CS42L51=m +CONFIG_SND_SOC_CS42L51_I2C=m +CONFIG_SND_SOC_CS42L52=m +CONFIG_SND_SOC_CS42L56=m +CONFIG_SND_SOC_CS42L73=m +CONFIG_SND_SOC_CS4265=m +CONFIG_SND_SOC_CS4270=m +CONFIG_SND_SOC_CS4271=m +CONFIG_SND_SOC_CS4271_I2C=m +CONFIG_SND_SOC_CS4271_SPI=m +CONFIG_SND_SOC_CS42XX8=m +CONFIG_SND_SOC_CS42XX8_I2C=m +CONFIG_SND_SOC_CS43130=m +CONFIG_SND_SOC_CS4349=m +CONFIG_SND_SOC_CS53L30=m +CONFIG_SND_SOC_DA7213=m +CONFIG_SND_SOC_DA7219=m 
+CONFIG_SND_SOC_DIO2125=m +CONFIG_SND_SOC_DMIC=m +CONFIG_SND_SOC_HDMI_CODEC=m +CONFIG_SND_SOC_ES7134=m +CONFIG_SND_SOC_ES8316=m +CONFIG_SND_SOC_ES8328=m +CONFIG_SND_SOC_ES8328_I2C=m +CONFIG_SND_SOC_ES8328_SPI=m +CONFIG_SND_SOC_GTM601=m +CONFIG_SND_SOC_HDAC_HDMI=m +CONFIG_SND_SOC_INNO_RK3036=m +CONFIG_SND_SOC_MAX98090=m +CONFIG_SND_SOC_MAX98357A=m +CONFIG_SND_SOC_MAX98504=m +CONFIG_SND_SOC_MAX98927=m +CONFIG_SND_SOC_MAX9860=m +CONFIG_SND_SOC_MSM8916_WCD_ANALOG=m +CONFIG_SND_SOC_MSM8916_WCD_DIGITAL=m +CONFIG_SND_SOC_PCM1681=m +CONFIG_SND_SOC_PCM179X=m +CONFIG_SND_SOC_PCM179X_I2C=m +CONFIG_SND_SOC_PCM179X_SPI=m +CONFIG_SND_SOC_PCM3168A=m +CONFIG_SND_SOC_PCM3168A_I2C=m +CONFIG_SND_SOC_PCM3168A_SPI=m +CONFIG_SND_SOC_PCM512x=m +CONFIG_SND_SOC_PCM512x_I2C=m +CONFIG_SND_SOC_PCM512x_SPI=m +CONFIG_SND_SOC_RL6231=m +CONFIG_SND_SOC_RL6347A=m +CONFIG_SND_SOC_RT286=m +CONFIG_SND_SOC_RT298=m +CONFIG_SND_SOC_RT5514=m +CONFIG_SND_SOC_RT5514_SPI=m +CONFIG_SND_SOC_RT5616=m +CONFIG_SND_SOC_RT5631=m +CONFIG_SND_SOC_RT5640=m +CONFIG_SND_SOC_RT5645=m +CONFIG_SND_SOC_RT5651=m +CONFIG_SND_SOC_RT5663=m +CONFIG_SND_SOC_RT5670=m +CONFIG_SND_SOC_RT5677=m +CONFIG_SND_SOC_RT5677_SPI=m +CONFIG_SND_SOC_SGTL5000=m +CONFIG_SND_SOC_SI476X=m +CONFIG_SND_SOC_SIGMADSP=m +CONFIG_SND_SOC_SIGMADSP_I2C=m +CONFIG_SND_SOC_SIGMADSP_REGMAP=m +CONFIG_SND_SOC_SIRF_AUDIO_CODEC=m +CONFIG_SND_SOC_SPDIF=m +CONFIG_SND_SOC_SSM2602=m +CONFIG_SND_SOC_SSM2602_SPI=m +CONFIG_SND_SOC_SSM2602_I2C=m +CONFIG_SND_SOC_SSM4567=m +CONFIG_SND_SOC_STA32X=m +CONFIG_SND_SOC_STA350=m +CONFIG_SND_SOC_STI_SAS=m +CONFIG_SND_SOC_TAS2552=m +CONFIG_SND_SOC_TAS5086=m +CONFIG_SND_SOC_TAS571X=m +CONFIG_SND_SOC_TAS5720=m +CONFIG_SND_SOC_TFA9879=m +CONFIG_SND_SOC_TLV320AIC23=m +CONFIG_SND_SOC_TLV320AIC23_I2C=m +CONFIG_SND_SOC_TLV320AIC23_SPI=m +CONFIG_SND_SOC_TLV320AIC31XX=m +CONFIG_SND_SOC_TLV320AIC3X=m +CONFIG_SND_SOC_TS3A227E=m +CONFIG_SND_SOC_WM8510=m +CONFIG_SND_SOC_WM8523=m +CONFIG_SND_SOC_WM8524=m +CONFIG_SND_SOC_WM8580=m +CONFIG_SND_SOC_WM8711=m +CONFIG_SND_SOC_WM8728=m +CONFIG_SND_SOC_WM8731=m +CONFIG_SND_SOC_WM8737=m +CONFIG_SND_SOC_WM8741=m +CONFIG_SND_SOC_WM8750=m +CONFIG_SND_SOC_WM8753=m +CONFIG_SND_SOC_WM8770=m +CONFIG_SND_SOC_WM8776=m +CONFIG_SND_SOC_WM8804=m +CONFIG_SND_SOC_WM8804_I2C=m +CONFIG_SND_SOC_WM8804_SPI=m +CONFIG_SND_SOC_WM8903=m +CONFIG_SND_SOC_WM8960=m +CONFIG_SND_SOC_WM8962=m +CONFIG_SND_SOC_WM8974=m +CONFIG_SND_SOC_WM8978=m +CONFIG_SND_SOC_WM8985=m +CONFIG_SND_SOC_ZX_AUD96P22=m +CONFIG_SND_SOC_NAU8540=m +CONFIG_SND_SOC_NAU8810=m +CONFIG_SND_SOC_NAU8824=m +CONFIG_SND_SOC_NAU8825=m +CONFIG_SND_SOC_TPA6130A2=m +CONFIG_SND_SIMPLE_CARD_UTILS=m +CONFIG_SND_SIMPLE_CARD=m +CONFIG_SND_X86=y +CONFIG_HDMI_LPE_AUDIO=m +CONFIG_SND_SYNTH_EMUX=m +CONFIG_AC97_BUS=m + +# +# HID support +# +CONFIG_HID=m +CONFIG_HID_BATTERY_STRENGTH=y +CONFIG_HIDRAW=y +CONFIG_UHID=m +CONFIG_HID_GENERIC=m + +# +# Special HID drivers +# +CONFIG_HID_A4TECH=m +CONFIG_HID_ACCUTOUCH=m +CONFIG_HID_ACRUX=m +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=m +CONFIG_HID_APPLEIR=m +CONFIG_HID_ASUS=m +CONFIG_HID_AUREAL=m +CONFIG_HID_BELKIN=m +CONFIG_HID_BETOP_FF=m +CONFIG_HID_CHERRY=m +CONFIG_HID_CHICONY=m +CONFIG_HID_CORSAIR=m +CONFIG_HID_PRODIKEYS=m +CONFIG_HID_CMEDIA=m +CONFIG_HID_CP2112=m +CONFIG_HID_CYPRESS=m +CONFIG_HID_DRAGONRISE=m +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=m +CONFIG_HID_ELECOM=m +CONFIG_HID_ELO=m +CONFIG_HID_EZKEY=m +CONFIG_HID_GEMBIRD=m +CONFIG_HID_GFRM=m +CONFIG_HID_HOLTEK=m +CONFIG_HOLTEK_FF=y +CONFIG_HID_GT683R=m +CONFIG_HID_KEYTOUCH=m +CONFIG_HID_KYE=m 
+CONFIG_HID_UCLOGIC=m +CONFIG_HID_WALTOP=m +CONFIG_HID_GYRATION=m +CONFIG_HID_ICADE=m +CONFIG_HID_ITE=m +CONFIG_HID_TWINHAN=m +CONFIG_HID_KENSINGTON=m +CONFIG_HID_LCPOWER=m +CONFIG_HID_LED=m +CONFIG_HID_LENOVO=m +CONFIG_HID_LOGITECH=m +CONFIG_HID_LOGITECH_DJ=m +CONFIG_HID_LOGITECH_HIDPP=m +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_LOGIWHEELS_FF=y +CONFIG_HID_MAGICMOUSE=m +CONFIG_HID_MAYFLASH=m +CONFIG_HID_MICROSOFT=m +CONFIG_HID_MONTEREY=m +CONFIG_HID_MULTITOUCH=m +CONFIG_HID_NTI=m +CONFIG_HID_NTRIG=m +CONFIG_HID_ORTEK=m +CONFIG_HID_PANTHERLORD=m +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PENMOUNT=m +CONFIG_HID_PETALYNX=m +CONFIG_HID_PICOLCD=m +CONFIG_HID_PICOLCD_FB=y +CONFIG_HID_PICOLCD_BACKLIGHT=y +CONFIG_HID_PICOLCD_LCD=y +CONFIG_HID_PICOLCD_LEDS=y +CONFIG_HID_PICOLCD_CIR=y +CONFIG_HID_PLANTRONICS=m +CONFIG_HID_PRIMAX=m +CONFIG_HID_RETRODE=m +CONFIG_HID_ROCCAT=m +CONFIG_HID_SAITEK=m +CONFIG_HID_SAMSUNG=m +CONFIG_HID_SONY=m +CONFIG_SONY_FF=y +CONFIG_HID_SPEEDLINK=m +CONFIG_HID_STEELSERIES=m +CONFIG_HID_SUNPLUS=m +CONFIG_HID_RMI=m +CONFIG_HID_GREENASIA=m +CONFIG_GREENASIA_FF=y +CONFIG_HID_HYPERV_MOUSE=m +CONFIG_HID_SMARTJOYPLUS=m +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=m +CONFIG_HID_TOPSEED=m +CONFIG_HID_THINGM=m +CONFIG_HID_THRUSTMASTER=m +CONFIG_THRUSTMASTER_FF=y +CONFIG_HID_UDRAW_PS3=m +CONFIG_HID_WACOM=m +CONFIG_HID_WIIMOTE=m +CONFIG_HID_XINMO=m +CONFIG_HID_ZEROPLUS=m +CONFIG_ZEROPLUS_FF=y +CONFIG_HID_ZYDACRON=m +CONFIG_HID_SENSOR_HUB=m +CONFIG_HID_SENSOR_CUSTOM_SENSOR=m +CONFIG_HID_ALPS=m + +# +# USB HID support +# +CONFIG_USB_HID=m +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y + +# +# I2C HID support +# +CONFIG_I2C_HID=m + +# +# Intel ISH HID support +# +CONFIG_INTEL_ISH_HID=m +CONFIG_USB_OHCI_LITTLE_ENDIAN=y +CONFIG_USB_SUPPORT=y +CONFIG_USB_COMMON=y +CONFIG_USB_ARCH_HAS_HCD=y +CONFIG_USB=m +CONFIG_USB_PCI=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y + +# +# Miscellaneous USB options +# +CONFIG_USB_DEFAULT_PERSIST=y +CONFIG_USB_DYNAMIC_MINORS=y +CONFIG_USB_OTG=y +# CONFIG_USB_OTG_WHITELIST is not set +# CONFIG_USB_OTG_BLACKLIST_HUB is not set +CONFIG_USB_OTG_FSM=m +CONFIG_USB_LEDS_TRIGGER_USBPORT=m +CONFIG_USB_MON=m +CONFIG_USB_WUSB=m +CONFIG_USB_WUSB_CBAF=m +# CONFIG_USB_WUSB_CBAF_DEBUG is not set + +# +# USB Host Controller Drivers +# +CONFIG_USB_C67X00_HCD=m +CONFIG_USB_XHCI_HCD=m +CONFIG_USB_XHCI_PCI=m +CONFIG_USB_XHCI_PLATFORM=m +CONFIG_USB_EHCI_HCD=m +CONFIG_USB_EHCI_ROOT_HUB_TT=y +CONFIG_USB_EHCI_TT_NEWSCHED=y +CONFIG_USB_EHCI_PCI=m +CONFIG_USB_EHCI_HCD_PLATFORM=m +CONFIG_USB_OXU210HP_HCD=m +CONFIG_USB_ISP116X_HCD=m +CONFIG_USB_ISP1362_HCD=m +CONFIG_USB_FOTG210_HCD=m +CONFIG_USB_MAX3421_HCD=m +CONFIG_USB_OHCI_HCD=m +CONFIG_USB_OHCI_HCD_PCI=m +CONFIG_USB_OHCI_HCD_SSB=y +CONFIG_USB_OHCI_HCD_PLATFORM=m +CONFIG_USB_UHCI_HCD=m +CONFIG_USB_U132_HCD=m +CONFIG_USB_SL811_HCD=m +# CONFIG_USB_SL811_HCD_ISO is not set +CONFIG_USB_SL811_CS=m +CONFIG_USB_R8A66597_HCD=m +CONFIG_USB_WHCI_HCD=m +CONFIG_USB_HWA_HCD=m +CONFIG_USB_HCD_BCMA=m +CONFIG_USB_HCD_SSB=m +# CONFIG_USB_HCD_TEST_MODE is not set + +# +# USB Device Class drivers +# +CONFIG_USB_ACM=m +CONFIG_USB_PRINTER=m +CONFIG_USB_WDM=m +CONFIG_USB_TMC=m + +# +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may +# + +# +# also be needed; see USB_STORAGE Help for more info +# +CONFIG_USB_STORAGE=m +# CONFIG_USB_STORAGE_DEBUG is not set +CONFIG_USB_STORAGE_REALTEK=m +CONFIG_REALTEK_AUTOPM=y +CONFIG_USB_STORAGE_DATAFAB=m +CONFIG_USB_STORAGE_FREECOM=m +CONFIG_USB_STORAGE_ISD200=m +CONFIG_USB_STORAGE_USBAT=m 
+CONFIG_USB_STORAGE_SDDR09=m +CONFIG_USB_STORAGE_SDDR55=m +CONFIG_USB_STORAGE_JUMPSHOT=m +CONFIG_USB_STORAGE_ALAUDA=m +CONFIG_USB_STORAGE_ONETOUCH=m +CONFIG_USB_STORAGE_KARMA=m +CONFIG_USB_STORAGE_CYPRESS_ATACB=m +CONFIG_USB_STORAGE_ENE_UB6250=m +CONFIG_USB_UAS=m + +# +# USB Imaging devices +# +CONFIG_USB_MDC800=m +CONFIG_USB_MICROTEK=m +CONFIG_USBIP_CORE=m +CONFIG_USBIP_VHCI_HCD=m +CONFIG_USBIP_VHCI_HC_PORTS=8 +CONFIG_USBIP_VHCI_NR_HCS=1 +CONFIG_USBIP_HOST=m +CONFIG_USBIP_VUDC=m +# CONFIG_USBIP_DEBUG is not set +CONFIG_USB_MUSB_HDRC=m +# CONFIG_USB_MUSB_HOST is not set +# CONFIG_USB_MUSB_GADGET is not set +CONFIG_USB_MUSB_DUAL_ROLE=y + +# +# Platform Glue Layer +# + +# +# MUSB DMA mode +# +CONFIG_MUSB_PIO_ONLY=y +CONFIG_USB_DWC3=m +# CONFIG_USB_DWC3_ULPI is not set +# CONFIG_USB_DWC3_HOST is not set +# CONFIG_USB_DWC3_GADGET is not set +CONFIG_USB_DWC3_DUAL_ROLE=y + +# +# Platform Glue Driver Support +# +CONFIG_USB_DWC3_PCI=m +CONFIG_USB_DWC2=m +# CONFIG_USB_DWC2_HOST is not set + +# +# Gadget/Dual-role mode requires USB Gadget support to be enabled +# +# CONFIG_USB_DWC2_PERIPHERAL is not set +CONFIG_USB_DWC2_DUAL_ROLE=y +CONFIG_USB_DWC2_PCI=m +# CONFIG_USB_DWC2_DEBUG is not set +# CONFIG_USB_DWC2_TRACK_MISSED_SOFS is not set +CONFIG_USB_CHIPIDEA=m +CONFIG_USB_CHIPIDEA_PCI=m +CONFIG_USB_CHIPIDEA_UDC=y +CONFIG_USB_CHIPIDEA_HOST=y +# CONFIG_USB_CHIPIDEA_ULPI is not set +CONFIG_USB_ISP1760=m +CONFIG_USB_ISP1760_HCD=y +CONFIG_USB_ISP1761_UDC=y +# CONFIG_USB_ISP1760_HOST_ROLE is not set +# CONFIG_USB_ISP1760_GADGET_ROLE is not set +CONFIG_USB_ISP1760_DUAL_ROLE=y + +# +# USB port drivers +# +CONFIG_USB_USS720=m +CONFIG_USB_SERIAL=m +CONFIG_USB_SERIAL_GENERIC=y +CONFIG_USB_SERIAL_SIMPLE=m +CONFIG_USB_SERIAL_AIRCABLE=m +CONFIG_USB_SERIAL_ARK3116=m +CONFIG_USB_SERIAL_BELKIN=m +CONFIG_USB_SERIAL_CH341=m +CONFIG_USB_SERIAL_WHITEHEAT=m +CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m +CONFIG_USB_SERIAL_CP210X=m +CONFIG_USB_SERIAL_CYPRESS_M8=m +CONFIG_USB_SERIAL_EMPEG=m +CONFIG_USB_SERIAL_FTDI_SIO=m +CONFIG_USB_SERIAL_VISOR=m +CONFIG_USB_SERIAL_IPAQ=m +CONFIG_USB_SERIAL_IR=m +CONFIG_USB_SERIAL_EDGEPORT=m +CONFIG_USB_SERIAL_EDGEPORT_TI=m +CONFIG_USB_SERIAL_F81232=m +CONFIG_USB_SERIAL_F8153X=m +CONFIG_USB_SERIAL_GARMIN=m +CONFIG_USB_SERIAL_IPW=m +CONFIG_USB_SERIAL_IUU=m +CONFIG_USB_SERIAL_KEYSPAN_PDA=m +CONFIG_USB_SERIAL_KEYSPAN=m +CONFIG_USB_SERIAL_KLSI=m +CONFIG_USB_SERIAL_KOBIL_SCT=m +CONFIG_USB_SERIAL_MCT_U232=m +CONFIG_USB_SERIAL_METRO=m +CONFIG_USB_SERIAL_MOS7720=m +CONFIG_USB_SERIAL_MOS7715_PARPORT=y +CONFIG_USB_SERIAL_MOS7840=m +CONFIG_USB_SERIAL_MXUPORT=m +CONFIG_USB_SERIAL_NAVMAN=m +CONFIG_USB_SERIAL_PL2303=m +CONFIG_USB_SERIAL_OTI6858=m +CONFIG_USB_SERIAL_QCAUX=m +CONFIG_USB_SERIAL_QUALCOMM=m +CONFIG_USB_SERIAL_SPCP8X5=m +CONFIG_USB_SERIAL_SAFE=m +CONFIG_USB_SERIAL_SAFE_PADDED=y +CONFIG_USB_SERIAL_SIERRAWIRELESS=m +CONFIG_USB_SERIAL_SYMBOL=m +CONFIG_USB_SERIAL_TI=m +CONFIG_USB_SERIAL_CYBERJACK=m +CONFIG_USB_SERIAL_XIRCOM=m +CONFIG_USB_SERIAL_WWAN=m +CONFIG_USB_SERIAL_OPTION=m +CONFIG_USB_SERIAL_OMNINET=m +CONFIG_USB_SERIAL_OPTICON=m +CONFIG_USB_SERIAL_XSENS_MT=m +CONFIG_USB_SERIAL_WISHBONE=m +CONFIG_USB_SERIAL_SSU100=m +CONFIG_USB_SERIAL_QT2=m +# CONFIG_USB_SERIAL_UPD78F0730 is not set +# CONFIG_USB_SERIAL_DEBUG is not set + +# +# USB Miscellaneous drivers +# +CONFIG_USB_EMI62=m +CONFIG_USB_EMI26=m +CONFIG_USB_ADUTUX=m +CONFIG_USB_SEVSEG=m +CONFIG_USB_RIO500=m +CONFIG_USB_LEGOTOWER=m +CONFIG_USB_LCD=m +CONFIG_USB_CYPRESS_CY7C63=m +CONFIG_USB_CYTHERM=m +CONFIG_USB_IDMOUSE=m +CONFIG_USB_FTDI_ELAN=m 
+CONFIG_USB_APPLEDISPLAY=m +CONFIG_USB_SISUSBVGA=m +CONFIG_USB_SISUSBVGA_CON=y +CONFIG_USB_LD=m +CONFIG_USB_TRANCEVIBRATOR=m +CONFIG_USB_IOWARRIOR=m +CONFIG_USB_TEST=m +CONFIG_USB_EHSET_TEST_FIXTURE=m +CONFIG_USB_ISIGHTFW=m +CONFIG_USB_YUREX=m +CONFIG_USB_EZUSB_FX2=m +CONFIG_USB_HUB_USB251XB=m +CONFIG_USB_HSIC_USB3503=m +CONFIG_USB_HSIC_USB4604=m +CONFIG_USB_LINK_LAYER_TEST=m +CONFIG_USB_CHAOSKEY=m +CONFIG_USB_ATM=m +CONFIG_USB_SPEEDTOUCH=m +CONFIG_USB_CXACRU=m +CONFIG_USB_UEAGLEATM=m +CONFIG_USB_XUSBATM=m + +# +# USB Physical Layer drivers +# +CONFIG_USB_PHY=y +CONFIG_NOP_USB_XCEIV=m +CONFIG_USB_GPIO_VBUS=m +CONFIG_TAHVO_USB=m +CONFIG_TAHVO_USB_HOST_BY_DEFAULT=y +CONFIG_USB_ISP1301=m +CONFIG_USB_GADGET=m +# CONFIG_USB_GADGET_DEBUG is not set +# CONFIG_USB_GADGET_DEBUG_FILES is not set +# CONFIG_USB_GADGET_DEBUG_FS is not set +CONFIG_USB_GADGET_VBUS_DRAW=2 +CONFIG_USB_GADGET_STORAGE_NUM_BUFFERS=2 +CONFIG_U_SERIAL_CONSOLE=y + +# +# USB Peripheral Controller +# +CONFIG_USB_FOTG210_UDC=m +CONFIG_USB_GR_UDC=m +CONFIG_USB_R8A66597=m +CONFIG_USB_PXA27X=m +CONFIG_USB_MV_UDC=m +CONFIG_USB_MV_U3D=m +CONFIG_USB_SNP_CORE=m +CONFIG_USB_M66592=m +CONFIG_USB_BDC_UDC=m + +# +# Platform Support +# +CONFIG_USB_BDC_PCI=m +CONFIG_USB_AMD5536UDC=m +CONFIG_USB_NET2272=m +CONFIG_USB_NET2272_DMA=y +CONFIG_USB_NET2280=m +CONFIG_USB_GOKU=m +CONFIG_USB_EG20T=m +# CONFIG_USB_DUMMY_HCD is not set +CONFIG_USB_LIBCOMPOSITE=m +CONFIG_USB_F_ACM=m +CONFIG_USB_F_SS_LB=m +CONFIG_USB_U_SERIAL=m +CONFIG_USB_U_ETHER=m +CONFIG_USB_U_AUDIO=m +CONFIG_USB_F_SERIAL=m +CONFIG_USB_F_OBEX=m +CONFIG_USB_F_NCM=m +CONFIG_USB_F_ECM=m +CONFIG_USB_F_PHONET=m +CONFIG_USB_F_EEM=m +CONFIG_USB_F_SUBSET=m +CONFIG_USB_F_RNDIS=m +CONFIG_USB_F_MASS_STORAGE=m +CONFIG_USB_F_FS=m +CONFIG_USB_F_UAC1=m +CONFIG_USB_F_UAC2=m +CONFIG_USB_F_UVC=m +CONFIG_USB_F_MIDI=m +CONFIG_USB_F_HID=m +CONFIG_USB_F_PRINTER=m +CONFIG_USB_F_TCM=m +CONFIG_USB_CONFIGFS=m +CONFIG_USB_CONFIGFS_SERIAL=y +CONFIG_USB_CONFIGFS_ACM=y +CONFIG_USB_CONFIGFS_OBEX=y +CONFIG_USB_CONFIGFS_NCM=y +CONFIG_USB_CONFIGFS_ECM=y +CONFIG_USB_CONFIGFS_ECM_SUBSET=y +CONFIG_USB_CONFIGFS_RNDIS=y +CONFIG_USB_CONFIGFS_EEM=y +CONFIG_USB_CONFIGFS_PHONET=y +CONFIG_USB_CONFIGFS_MASS_STORAGE=y +CONFIG_USB_CONFIGFS_F_LB_SS=y +CONFIG_USB_CONFIGFS_F_FS=y +CONFIG_USB_CONFIGFS_F_UAC1=y +# CONFIG_USB_CONFIGFS_F_UAC1_LEGACY is not set +CONFIG_USB_CONFIGFS_F_UAC2=y +CONFIG_USB_CONFIGFS_F_MIDI=y +CONFIG_USB_CONFIGFS_F_HID=y +CONFIG_USB_CONFIGFS_F_UVC=y +CONFIG_USB_CONFIGFS_F_PRINTER=y +CONFIG_USB_CONFIGFS_F_TCM=y +CONFIG_USB_ZERO=m +CONFIG_USB_ZERO_HNPTEST=y +CONFIG_USB_AUDIO=m +CONFIG_GADGET_UAC1=y +# CONFIG_GADGET_UAC1_LEGACY is not set +CONFIG_USB_ETH=m +CONFIG_USB_ETH_RNDIS=y +CONFIG_USB_ETH_EEM=y +CONFIG_USB_G_NCM=m +CONFIG_USB_GADGETFS=m +CONFIG_USB_FUNCTIONFS=m +CONFIG_USB_FUNCTIONFS_ETH=y +CONFIG_USB_FUNCTIONFS_RNDIS=y +CONFIG_USB_FUNCTIONFS_GENERIC=y +CONFIG_USB_MASS_STORAGE=m +CONFIG_USB_GADGET_TARGET=m +CONFIG_USB_G_SERIAL=m +CONFIG_USB_MIDI_GADGET=m +CONFIG_USB_G_PRINTER=m +CONFIG_USB_CDC_COMPOSITE=m +CONFIG_USB_G_NOKIA=m +CONFIG_USB_G_ACM_MS=m +CONFIG_USB_G_MULTI=m +CONFIG_USB_G_MULTI_RNDIS=y +CONFIG_USB_G_MULTI_CDC=y +CONFIG_USB_G_HID=m +# CONFIG_USB_G_DBGP is not set +CONFIG_USB_G_WEBCAM=m + +# +# USB Power Delivery and Type-C drivers +# +CONFIG_TYPEC=m +CONFIG_TYPEC_UCSI=m +CONFIG_UCSI_ACPI=m +CONFIG_USB_LED_TRIG=y +CONFIG_USB_ULPI_BUS=m +CONFIG_UWB=m +CONFIG_UWB_HWA=m +CONFIG_UWB_WHCI=m +CONFIG_UWB_I1480U=m +CONFIG_MMC=m +CONFIG_MMC_BLOCK=m +CONFIG_MMC_BLOCK_MINORS=8 +CONFIG_SDIO_UART=m +# 
CONFIG_MMC_TEST is not set + +# +# MMC/SD/SDIO Host Controller Drivers +# +# CONFIG_MMC_DEBUG is not set +CONFIG_MMC_SDHCI=m +CONFIG_MMC_SDHCI_PCI=m +CONFIG_MMC_RICOH_MMC=y +CONFIG_MMC_SDHCI_ACPI=m +CONFIG_MMC_SDHCI_PLTFM=m +CONFIG_MMC_WBSD=m +CONFIG_MMC_TIFM_SD=m +CONFIG_MMC_SPI=m +CONFIG_MMC_SDRICOH_CS=m +CONFIG_MMC_CB710=m +CONFIG_MMC_VIA_SDMMC=m +CONFIG_MMC_VUB300=m +CONFIG_MMC_USHC=m +CONFIG_MMC_USDHI6ROL0=m +CONFIG_MMC_REALTEK_PCI=m +CONFIG_MMC_REALTEK_USB=m +CONFIG_MMC_TOSHIBA_PCI=m +CONFIG_MMC_MTK=m +CONFIG_MMC_SDHCI_XENON=m +CONFIG_MEMSTICK=m +# CONFIG_MEMSTICK_DEBUG is not set + +# +# MemoryStick drivers +# +# CONFIG_MEMSTICK_UNSAFE_RESUME is not set +CONFIG_MSPRO_BLOCK=m +CONFIG_MS_BLOCK=m + +# +# MemoryStick Host Controller Drivers +# +CONFIG_MEMSTICK_TIFM_MS=m +CONFIG_MEMSTICK_JMICRON_38X=m +CONFIG_MEMSTICK_R592=m +CONFIG_MEMSTICK_REALTEK_PCI=m +CONFIG_MEMSTICK_REALTEK_USB=m +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=y +CONFIG_LEDS_CLASS_FLASH=m +# CONFIG_LEDS_BRIGHTNESS_HW_CHANGED is not set + +# +# LED drivers +# +CONFIG_LEDS_AS3645A=m +CONFIG_LEDS_LM3530=m +CONFIG_LEDS_LM3533=m +CONFIG_LEDS_LM3642=m +CONFIG_LEDS_MT6323=m +CONFIG_LEDS_PCA9532=m +CONFIG_LEDS_PCA9532_GPIO=y +CONFIG_LEDS_GPIO=m +CONFIG_LEDS_LP3944=m +CONFIG_LEDS_LP3952=m +CONFIG_LEDS_LP55XX_COMMON=m +CONFIG_LEDS_LP5521=m +CONFIG_LEDS_LP5523=m +CONFIG_LEDS_LP5562=m +CONFIG_LEDS_LP8501=m +CONFIG_LEDS_LP8860=m +CONFIG_LEDS_CLEVO_MAIL=m +CONFIG_LEDS_PCA955X=m +# CONFIG_LEDS_PCA955X_GPIO is not set +CONFIG_LEDS_PCA963X=m +CONFIG_LEDS_WM831X_STATUS=m +CONFIG_LEDS_DA9052=m +CONFIG_LEDS_DAC124S085=m +CONFIG_LEDS_PWM=m +CONFIG_LEDS_REGULATOR=m +CONFIG_LEDS_BD2802=m +CONFIG_LEDS_INTEL_SS4200=m +CONFIG_LEDS_LT3593=m +CONFIG_LEDS_MC13783=m +CONFIG_LEDS_TCA6507=m +CONFIG_LEDS_TLC591XX=m +CONFIG_LEDS_LM355x=m +CONFIG_LEDS_MENF21BMC=m + +# +# LED driver for blink(1) USB RGB LED is under Special HID drivers (HID_THINGM) +# +CONFIG_LEDS_BLINKM=m +CONFIG_LEDS_MLXCPLD=m +CONFIG_LEDS_USER=m +CONFIG_LEDS_NIC78BX=m + +# +# LED Triggers +# +CONFIG_LEDS_TRIGGERS=y +CONFIG_LEDS_TRIGGER_TIMER=m +CONFIG_LEDS_TRIGGER_ONESHOT=m +CONFIG_LEDS_TRIGGER_DISK=y +# CONFIG_LEDS_TRIGGER_MTD is not set +CONFIG_LEDS_TRIGGER_HEARTBEAT=m +CONFIG_LEDS_TRIGGER_BACKLIGHT=m +CONFIG_LEDS_TRIGGER_CPU=y +CONFIG_LEDS_TRIGGER_GPIO=m +CONFIG_LEDS_TRIGGER_DEFAULT_ON=m + +# +# iptables trigger is under Netfilter config (LED target) +# +CONFIG_LEDS_TRIGGER_TRANSIENT=m +CONFIG_LEDS_TRIGGER_CAMERA=m +CONFIG_LEDS_TRIGGER_PANIC=y +CONFIG_ACCESSIBILITY=y +CONFIG_A11Y_BRAILLE_CONSOLE=y +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_USER_MAD=m +CONFIG_INFINIBAND_USER_ACCESS=m +# CONFIG_INFINIBAND_EXP_USER_ACCESS is not set +CONFIG_INFINIBAND_USER_MEM=y +CONFIG_INFINIBAND_ON_DEMAND_PAGING=y +CONFIG_INFINIBAND_ADDR_TRANS=y +CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS=y +CONFIG_INFINIBAND_MTHCA=m +CONFIG_INFINIBAND_MTHCA_DEBUG=y +CONFIG_INFINIBAND_QIB=m +CONFIG_INFINIBAND_QIB_DCA=y +CONFIG_INFINIBAND_CXGB3=m +# CONFIG_INFINIBAND_CXGB3_DEBUG is not set +CONFIG_INFINIBAND_CXGB4=m +CONFIG_INFINIBAND_I40IW=m +CONFIG_MLX4_INFINIBAND=m +CONFIG_MLX5_INFINIBAND=m +CONFIG_INFINIBAND_NES=m +# CONFIG_INFINIBAND_NES_DEBUG is not set +CONFIG_INFINIBAND_OCRDMA=m +CONFIG_INFINIBAND_VMWARE_PVRDMA=m +CONFIG_INFINIBAND_USNIC=m +CONFIG_INFINIBAND_IPOIB=m +CONFIG_INFINIBAND_IPOIB_CM=y +CONFIG_INFINIBAND_IPOIB_DEBUG=y +# CONFIG_INFINIBAND_IPOIB_DEBUG_DATA is not set +CONFIG_INFINIBAND_SRP=m +CONFIG_INFINIBAND_SRPT=m +CONFIG_INFINIBAND_ISER=m +CONFIG_INFINIBAND_ISERT=m +CONFIG_INFINIBAND_OPA_VNIC=m 
+CONFIG_INFINIBAND_RDMAVT=m +CONFIG_RDMA_RXE=m +CONFIG_INFINIBAND_HFI1=m +# CONFIG_HFI1_DEBUG_SDMA_ORDER is not set +# CONFIG_SDMA_VERBOSITY is not set +CONFIG_INFINIBAND_QEDR=m +CONFIG_INFINIBAND_BNXT_RE=m +CONFIG_EDAC_ATOMIC_SCRUB=y +CONFIG_EDAC_SUPPORT=y +CONFIG_EDAC=y +CONFIG_EDAC_LEGACY_SYSFS=y +# CONFIG_EDAC_DEBUG is not set +CONFIG_EDAC_DECODE_MCE=m +CONFIG_EDAC_GHES=y +CONFIG_EDAC_AMD64=m +CONFIG_EDAC_AMD64_ERROR_INJECTION=y +CONFIG_EDAC_E752X=m +CONFIG_EDAC_I82975X=m +CONFIG_EDAC_I3000=m +CONFIG_EDAC_I3200=m +CONFIG_EDAC_IE31200=m +CONFIG_EDAC_X38=m +CONFIG_EDAC_I5400=m +CONFIG_EDAC_I7CORE=m +CONFIG_EDAC_I5000=m +CONFIG_EDAC_I5100=m +CONFIG_EDAC_I7300=m +CONFIG_EDAC_SBRIDGE=m +CONFIG_EDAC_SKX=m +CONFIG_EDAC_PND2=m +CONFIG_RTC_LIB=y +CONFIG_RTC_MC146818_LIB=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_HCTOSYS=y +CONFIG_RTC_HCTOSYS_DEVICE="rtc0" +CONFIG_RTC_SYSTOHC=y +CONFIG_RTC_SYSTOHC_DEVICE="rtc0" +# CONFIG_RTC_DEBUG is not set +CONFIG_RTC_NVMEM=y + +# +# RTC interfaces +# +CONFIG_RTC_INTF_SYSFS=y +CONFIG_RTC_INTF_PROC=y +CONFIG_RTC_INTF_DEV=y +CONFIG_RTC_INTF_DEV_UIE_EMUL=y +# CONFIG_RTC_DRV_TEST is not set + +# +# I2C RTC drivers +# +CONFIG_RTC_DRV_88PM80X=m +CONFIG_RTC_DRV_ABB5ZES3=m +CONFIG_RTC_DRV_ABX80X=m +CONFIG_RTC_DRV_DS1307=m +CONFIG_RTC_DRV_DS1307_HWMON=y +# CONFIG_RTC_DRV_DS1307_CENTURY is not set +CONFIG_RTC_DRV_DS1374=m +CONFIG_RTC_DRV_DS1374_WDT=y +CONFIG_RTC_DRV_DS1672=m +CONFIG_RTC_DRV_MAX6900=m +CONFIG_RTC_DRV_MAX8907=m +CONFIG_RTC_DRV_RS5C372=m +CONFIG_RTC_DRV_ISL1208=m +CONFIG_RTC_DRV_ISL12022=m +CONFIG_RTC_DRV_X1205=m +CONFIG_RTC_DRV_PCF8523=m +CONFIG_RTC_DRV_PCF85063=m +CONFIG_RTC_DRV_PCF8563=m +CONFIG_RTC_DRV_PCF8583=m +CONFIG_RTC_DRV_M41T80=m +CONFIG_RTC_DRV_M41T80_WDT=y +CONFIG_RTC_DRV_BQ32K=m +CONFIG_RTC_DRV_S35390A=m +CONFIG_RTC_DRV_FM3130=m +CONFIG_RTC_DRV_RX8010=m +CONFIG_RTC_DRV_RX8581=m +CONFIG_RTC_DRV_RX8025=m +CONFIG_RTC_DRV_EM3027=m +CONFIG_RTC_DRV_RV8803=m + +# +# SPI RTC drivers +# +CONFIG_RTC_DRV_M41T93=m +CONFIG_RTC_DRV_M41T94=m +CONFIG_RTC_DRV_DS1302=m +CONFIG_RTC_DRV_DS1305=m +CONFIG_RTC_DRV_DS1343=m +CONFIG_RTC_DRV_DS1347=m +CONFIG_RTC_DRV_DS1390=m +CONFIG_RTC_DRV_MAX6916=m +CONFIG_RTC_DRV_R9701=m +CONFIG_RTC_DRV_RX4581=m +CONFIG_RTC_DRV_RX6110=m +CONFIG_RTC_DRV_RS5C348=m +CONFIG_RTC_DRV_MAX6902=m +CONFIG_RTC_DRV_PCF2123=m +CONFIG_RTC_DRV_MCP795=m +CONFIG_RTC_I2C_AND_SPI=m + +# +# SPI and I2C RTC drivers +# +CONFIG_RTC_DRV_DS3232=m +CONFIG_RTC_DRV_DS3232_HWMON=y +CONFIG_RTC_DRV_PCF2127=m +CONFIG_RTC_DRV_RV3029C2=m +CONFIG_RTC_DRV_RV3029_HWMON=y + +# +# Platform RTC drivers +# +CONFIG_RTC_DRV_CMOS=y +CONFIG_RTC_DRV_DS1286=m +CONFIG_RTC_DRV_DS1511=m +CONFIG_RTC_DRV_DS1553=m +CONFIG_RTC_DRV_DS1685_FAMILY=m +CONFIG_RTC_DRV_DS1685=y +# CONFIG_RTC_DRV_DS1689 is not set +# CONFIG_RTC_DRV_DS17285 is not set +# CONFIG_RTC_DRV_DS17485 is not set +# CONFIG_RTC_DRV_DS17885 is not set +CONFIG_RTC_DS1685_PROC_REGS=y +CONFIG_RTC_DS1685_SYSFS_REGS=y +CONFIG_RTC_DRV_DS1742=m +CONFIG_RTC_DRV_DS2404=m +CONFIG_RTC_DRV_DA9052=m +CONFIG_RTC_DRV_DA9063=m +CONFIG_RTC_DRV_STK17TA8=m +CONFIG_RTC_DRV_M48T86=m +CONFIG_RTC_DRV_M48T35=m +CONFIG_RTC_DRV_M48T59=m +CONFIG_RTC_DRV_MSM6242=m +CONFIG_RTC_DRV_BQ4802=m +CONFIG_RTC_DRV_RP5C01=m +CONFIG_RTC_DRV_V3020=m +CONFIG_RTC_DRV_WM831X=m +CONFIG_RTC_DRV_PCF50633=m + +# +# on-CPU RTC drivers +# +CONFIG_RTC_DRV_FTRTC010=m +CONFIG_RTC_DRV_PCAP=m +CONFIG_RTC_DRV_MC13XXX=m +CONFIG_RTC_DRV_MT6397=m + +# +# HID Sensor RTC drivers +# +CONFIG_RTC_DRV_HID_SENSOR_TIME=m +CONFIG_DMADEVICES=y +# CONFIG_DMADEVICES_DEBUG is not set + +# +# DMA 
Devices +# +CONFIG_DMA_ENGINE=y +CONFIG_DMA_VIRTUAL_CHANNELS=y +CONFIG_DMA_ACPI=y +CONFIG_ALTERA_MSGDMA=m +CONFIG_INTEL_IDMA64=m +CONFIG_INTEL_IOATDMA=m +CONFIG_INTEL_MIC_X100_DMA=m +CONFIG_QCOM_HIDMA_MGMT=m +CONFIG_QCOM_HIDMA=m +CONFIG_DW_DMAC_CORE=y +CONFIG_DW_DMAC=m +CONFIG_DW_DMAC_PCI=y +CONFIG_HSU_DMA=y + +# +# DMA Clients +# +CONFIG_ASYNC_TX_DMA=y +# CONFIG_DMATEST is not set +CONFIG_DMA_ENGINE_RAID=y + +# +# DMABUF options +# +CONFIG_SYNC_FILE=y +# CONFIG_SW_SYNC is not set +CONFIG_DCA=m +CONFIG_AUXDISPLAY=y +CONFIG_HD44780=m +CONFIG_KS0108=m +CONFIG_KS0108_PORT=0x378 +CONFIG_KS0108_DELAY=2 +CONFIG_CFAG12864B=m +CONFIG_CFAG12864B_RATE=20 +CONFIG_IMG_ASCII_LCD=m +CONFIG_PANEL=m +CONFIG_PANEL_PARPORT=0 +CONFIG_PANEL_PROFILE=5 +# CONFIG_PANEL_CHANGE_MESSAGE is not set +CONFIG_CHARLCD=m +CONFIG_UIO=m +CONFIG_UIO_CIF=m +CONFIG_UIO_PDRV_GENIRQ=m +CONFIG_UIO_DMEM_GENIRQ=m +CONFIG_UIO_AEC=m +CONFIG_UIO_SERCOS3=m +CONFIG_UIO_PCI_GENERIC=m +CONFIG_UIO_NETX=m +CONFIG_UIO_PRUSS=m +CONFIG_UIO_MF624=m +CONFIG_UIO_HV_GENERIC=m +CONFIG_VFIO_IOMMU_TYPE1=m +CONFIG_VFIO_VIRQFD=m +CONFIG_VFIO=m +# CONFIG_VFIO_NOIOMMU is not set +CONFIG_VFIO_PCI=m +CONFIG_VFIO_PCI_VGA=y +CONFIG_VFIO_PCI_MMAP=y +CONFIG_VFIO_PCI_INTX=y +CONFIG_VFIO_PCI_IGD=y +CONFIG_VFIO_MDEV=m +CONFIG_VFIO_MDEV_DEVICE=m +CONFIG_IRQ_BYPASS_MANAGER=m +CONFIG_VIRT_DRIVERS=y +CONFIG_VIRTIO=m + +# +# Virtio drivers +# +CONFIG_VIRTIO_PCI=m +CONFIG_VIRTIO_PCI_LEGACY=y +CONFIG_VIRTIO_BALLOON=m +CONFIG_VIRTIO_INPUT=m +CONFIG_VIRTIO_MMIO=m +CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y + +# +# Microsoft Hyper-V guest support +# +CONFIG_HYPERV=m +CONFIG_HYPERV_TSCPAGE=y +CONFIG_HYPERV_UTILS=m +CONFIG_HYPERV_BALLOON=m +CONFIG_STAGING=y +CONFIG_IRDA=m + +# +# IrDA protocols +# +CONFIG_IRLAN=m +CONFIG_IRNET=m +CONFIG_IRCOMM=m +CONFIG_IRDA_ULTRA=y + +# +# IrDA options +# +CONFIG_IRDA_CACHE_LAST_LSAP=y +CONFIG_IRDA_FAST_RR=y +# CONFIG_IRDA_DEBUG is not set + +# +# Infrared-port device drivers +# + +# +# SIR device drivers +# +CONFIG_IRTTY_SIR=m + +# +# Dongle support +# +CONFIG_DONGLE=y +CONFIG_ESI_DONGLE=m +CONFIG_ACTISYS_DONGLE=m +CONFIG_TEKRAM_DONGLE=m +CONFIG_TOIM3232_DONGLE=m +CONFIG_LITELINK_DONGLE=m +CONFIG_MA600_DONGLE=m +CONFIG_GIRBIL_DONGLE=m +CONFIG_MCP2120_DONGLE=m +CONFIG_OLD_BELKIN_DONGLE=m +CONFIG_ACT200L_DONGLE=m +CONFIG_KINGSUN_DONGLE=m +CONFIG_KSDAZZLE_DONGLE=m +CONFIG_KS959_DONGLE=m + +# +# FIR device drivers +# +CONFIG_USB_IRDA=m +CONFIG_SIGMATEL_FIR=m +CONFIG_NSC_FIR=m +CONFIG_WINBOND_FIR=m +CONFIG_SMC_IRCC_FIR=m +CONFIG_ALI_FIR=m +CONFIG_VLSI_FIR=m +CONFIG_VIA_FIR=m +CONFIG_MCS_FIR=m +CONFIG_PRISM2_USB=m +CONFIG_COMEDI=m +# CONFIG_COMEDI_DEBUG is not set +CONFIG_COMEDI_DEFAULT_BUF_SIZE_KB=2048 +CONFIG_COMEDI_DEFAULT_BUF_MAXSIZE_KB=20480 +CONFIG_COMEDI_MISC_DRIVERS=y +CONFIG_COMEDI_BOND=m +CONFIG_COMEDI_TEST=m +CONFIG_COMEDI_PARPORT=m +CONFIG_COMEDI_SERIAL2002=m +CONFIG_COMEDI_ISA_DRIVERS=y +CONFIG_COMEDI_PCL711=m +CONFIG_COMEDI_PCL724=m +CONFIG_COMEDI_PCL726=m +CONFIG_COMEDI_PCL730=m +CONFIG_COMEDI_PCL812=m +CONFIG_COMEDI_PCL816=m +CONFIG_COMEDI_PCL818=m +CONFIG_COMEDI_PCM3724=m +CONFIG_COMEDI_AMPLC_DIO200_ISA=m +CONFIG_COMEDI_AMPLC_PC236_ISA=m +CONFIG_COMEDI_AMPLC_PC263_ISA=m +CONFIG_COMEDI_RTI800=m +CONFIG_COMEDI_RTI802=m +CONFIG_COMEDI_DAC02=m +CONFIG_COMEDI_DAS16M1=m +CONFIG_COMEDI_DAS08_ISA=m +CONFIG_COMEDI_DAS16=m +CONFIG_COMEDI_DAS800=m +CONFIG_COMEDI_DAS1800=m +CONFIG_COMEDI_DAS6402=m +CONFIG_COMEDI_DT2801=m +CONFIG_COMEDI_DT2811=m +CONFIG_COMEDI_DT2814=m +CONFIG_COMEDI_DT2815=m +CONFIG_COMEDI_DT2817=m +CONFIG_COMEDI_DT282X=m 
+CONFIG_COMEDI_DMM32AT=m +CONFIG_COMEDI_FL512=m +CONFIG_COMEDI_AIO_AIO12_8=m +CONFIG_COMEDI_AIO_IIRO_16=m +CONFIG_COMEDI_II_PCI20KC=m +CONFIG_COMEDI_C6XDIGIO=m +CONFIG_COMEDI_MPC624=m +CONFIG_COMEDI_ADQ12B=m +CONFIG_COMEDI_NI_AT_A2150=m +CONFIG_COMEDI_NI_AT_AO=m +CONFIG_COMEDI_NI_ATMIO=m +CONFIG_COMEDI_NI_ATMIO16D=m +CONFIG_COMEDI_NI_LABPC_ISA=m +CONFIG_COMEDI_PCMAD=m +CONFIG_COMEDI_PCMDA12=m +CONFIG_COMEDI_PCMMIO=m +CONFIG_COMEDI_PCMUIO=m +CONFIG_COMEDI_MULTIQ3=m +CONFIG_COMEDI_S526=m +CONFIG_COMEDI_PCI_DRIVERS=m +CONFIG_COMEDI_8255_PCI=m +CONFIG_COMEDI_ADDI_WATCHDOG=m +CONFIG_COMEDI_ADDI_APCI_1032=m +CONFIG_COMEDI_ADDI_APCI_1500=m +CONFIG_COMEDI_ADDI_APCI_1516=m +CONFIG_COMEDI_ADDI_APCI_1564=m +CONFIG_COMEDI_ADDI_APCI_16XX=m +CONFIG_COMEDI_ADDI_APCI_2032=m +CONFIG_COMEDI_ADDI_APCI_2200=m +CONFIG_COMEDI_ADDI_APCI_3120=m +CONFIG_COMEDI_ADDI_APCI_3501=m +CONFIG_COMEDI_ADDI_APCI_3XXX=m +CONFIG_COMEDI_ADL_PCI6208=m +CONFIG_COMEDI_ADL_PCI7X3X=m +CONFIG_COMEDI_ADL_PCI8164=m +CONFIG_COMEDI_ADL_PCI9111=m +CONFIG_COMEDI_ADL_PCI9118=m +CONFIG_COMEDI_ADV_PCI1710=m +CONFIG_COMEDI_ADV_PCI1720=m +CONFIG_COMEDI_ADV_PCI1723=m +CONFIG_COMEDI_ADV_PCI1724=m +CONFIG_COMEDI_ADV_PCI1760=m +CONFIG_COMEDI_ADV_PCI_DIO=m +CONFIG_COMEDI_AMPLC_DIO200_PCI=m +CONFIG_COMEDI_AMPLC_PC236_PCI=m +CONFIG_COMEDI_AMPLC_PC263_PCI=m +CONFIG_COMEDI_AMPLC_PCI224=m +CONFIG_COMEDI_AMPLC_PCI230=m +CONFIG_COMEDI_CONTEC_PCI_DIO=m +CONFIG_COMEDI_DAS08_PCI=m +CONFIG_COMEDI_DT3000=m +CONFIG_COMEDI_DYNA_PCI10XX=m +CONFIG_COMEDI_GSC_HPDI=m +CONFIG_COMEDI_MF6X4=m +CONFIG_COMEDI_ICP_MULTI=m +CONFIG_COMEDI_DAQBOARD2000=m +CONFIG_COMEDI_JR3_PCI=m +CONFIG_COMEDI_KE_COUNTER=m +CONFIG_COMEDI_CB_PCIDAS64=m +CONFIG_COMEDI_CB_PCIDAS=m +CONFIG_COMEDI_CB_PCIDDA=m +CONFIG_COMEDI_CB_PCIMDAS=m +CONFIG_COMEDI_CB_PCIMDDA=m +CONFIG_COMEDI_ME4000=m +CONFIG_COMEDI_ME_DAQ=m +CONFIG_COMEDI_NI_6527=m +CONFIG_COMEDI_NI_65XX=m +CONFIG_COMEDI_NI_660X=m +CONFIG_COMEDI_NI_670X=m +CONFIG_COMEDI_NI_LABPC_PCI=m +CONFIG_COMEDI_NI_PCIDIO=m +CONFIG_COMEDI_NI_PCIMIO=m +CONFIG_COMEDI_RTD520=m +CONFIG_COMEDI_S626=m +CONFIG_COMEDI_MITE=m +CONFIG_COMEDI_NI_TIOCMD=m +CONFIG_COMEDI_PCMCIA_DRIVERS=m +CONFIG_COMEDI_CB_DAS16_CS=m +CONFIG_COMEDI_DAS08_CS=m +CONFIG_COMEDI_NI_DAQ_700_CS=m +CONFIG_COMEDI_NI_DAQ_DIO24_CS=m +CONFIG_COMEDI_NI_LABPC_CS=m +CONFIG_COMEDI_NI_MIO_CS=m +CONFIG_COMEDI_QUATECH_DAQP_CS=m +CONFIG_COMEDI_USB_DRIVERS=m +CONFIG_COMEDI_DT9812=m +CONFIG_COMEDI_NI_USB6501=m +CONFIG_COMEDI_USBDUX=m +CONFIG_COMEDI_USBDUXFAST=m +CONFIG_COMEDI_USBDUXSIGMA=m +CONFIG_COMEDI_VMK80XX=m +CONFIG_COMEDI_8254=m +CONFIG_COMEDI_8255=m +CONFIG_COMEDI_8255_SA=m +CONFIG_COMEDI_KCOMEDILIB=m +CONFIG_COMEDI_AMPLC_DIO200=m +CONFIG_COMEDI_AMPLC_PC236=m +CONFIG_COMEDI_DAS08=m +CONFIG_COMEDI_ISADMA=m +CONFIG_COMEDI_NI_LABPC=m +CONFIG_COMEDI_NI_LABPC_ISADMA=m +CONFIG_COMEDI_NI_TIO=m +CONFIG_RTL8192U=m +CONFIG_RTLLIB=m +CONFIG_RTLLIB_CRYPTO_CCMP=m +CONFIG_RTLLIB_CRYPTO_TKIP=m +CONFIG_RTLLIB_CRYPTO_WEP=m +CONFIG_RTL8192E=m +CONFIG_RTL8723BS=m +CONFIG_R8712U=m +CONFIG_R8188EU=m +CONFIG_88EU_AP_MODE=y +CONFIG_R8822BE=m +CONFIG_RTLHALMAC_ST=m +CONFIG_RTLPHYDM_ST=m +CONFIG_RTLWIFI_DEBUG_ST=y +CONFIG_RTS5208=m +CONFIG_VT6655=m +CONFIG_VT6656=m + +# +# IIO staging drivers +# + +# +# Accelerometers +# +CONFIG_ADIS16201=m +CONFIG_ADIS16203=m +CONFIG_ADIS16209=m +CONFIG_ADIS16240=m + +# +# Analog to digital converters +# +CONFIG_AD7606=m +CONFIG_AD7606_IFACE_PARALLEL=m +CONFIG_AD7606_IFACE_SPI=m +CONFIG_AD7780=m +CONFIG_AD7816=m +CONFIG_AD7192=m +CONFIG_AD7280=m + +# +# Analog digital bi-direction 
converters +# +CONFIG_ADT7316=m +CONFIG_ADT7316_SPI=m +CONFIG_ADT7316_I2C=m + +# +# Capacitance to digital converters +# +CONFIG_AD7150=m +CONFIG_AD7152=m +CONFIG_AD7746=m + +# +# Direct Digital Synthesis +# +CONFIG_AD9832=m +CONFIG_AD9834=m + +# +# Digital gyroscope sensors +# +CONFIG_ADIS16060=m + +# +# Network Analyzer, Impedance Converters +# +CONFIG_AD5933=m + +# +# Light sensors +# +CONFIG_TSL2x7x=m + +# +# Active energy metering IC +# +CONFIG_ADE7753=m +CONFIG_ADE7754=m +CONFIG_ADE7758=m +CONFIG_ADE7759=m +CONFIG_ADE7854=m +CONFIG_ADE7854_I2C=m +CONFIG_ADE7854_SPI=m + +# +# Resolver to digital converters +# +CONFIG_AD2S90=m +CONFIG_AD2S1200=m +CONFIG_AD2S1210=m + +# +# Triggers - standalone +# +CONFIG_FB_SM750=m +CONFIG_FB_XGI=m + +# +# Speakup console speech +# +CONFIG_SPEAKUP=m +CONFIG_SPEAKUP_SYNTH_ACNTSA=m +CONFIG_SPEAKUP_SYNTH_APOLLO=m +CONFIG_SPEAKUP_SYNTH_AUDPTR=m +CONFIG_SPEAKUP_SYNTH_BNS=m +CONFIG_SPEAKUP_SYNTH_DECTLK=m +CONFIG_SPEAKUP_SYNTH_DECEXT=m +CONFIG_SPEAKUP_SYNTH_LTLK=m +CONFIG_SPEAKUP_SYNTH_SOFT=m +CONFIG_SPEAKUP_SYNTH_SPKOUT=m +CONFIG_SPEAKUP_SYNTH_TXPRT=m +# CONFIG_SPEAKUP_SYNTH_DUMMY is not set +CONFIG_STAGING_MEDIA=y +CONFIG_INTEL_ATOMISP=y +CONFIG_VIDEO_ATOMISP=m +CONFIG_VIDEO_OV5693=m +CONFIG_VIDEO_IMX=m +CONFIG_VIDEO_OV2722=m +CONFIG_VIDEO_GC2235=m +CONFIG_VIDEO_OV8858=m +CONFIG_VIDEO_MSRLIST_HELPER=m +CONFIG_VIDEO_MT9M114=m +CONFIG_VIDEO_AP1302=m +CONFIG_VIDEO_GC0310=m +CONFIG_VIDEO_OV2680=m +CONFIG_VIDEO_LM3554=m +CONFIG_I2C_BCM2048=m +CONFIG_DVB_CXD2099=m +CONFIG_LIRC_STAGING=y +CONFIG_LIRC_ZILOG=m + +# +# Android +# +CONFIG_LTE_GDM724X=m +CONFIG_FIREWIRE_SERIAL=m +CONFIG_FWTTY_MAX_TOTAL_PORTS=64 +CONFIG_FWTTY_MAX_CARD_PORTS=32 +CONFIG_MTD_SPINAND_MT29F=m +CONFIG_MTD_SPINAND_ONDIEECC=y +CONFIG_LNET=m +CONFIG_LNET_MAX_PAYLOAD=1048576 +# CONFIG_LNET_SELFTEST is not set +CONFIG_LNET_XPRT_IB=m +CONFIG_LUSTRE_FS=m +# CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK is not set +CONFIG_DGNC=m +CONFIG_GS_FPGABOOT=m +CONFIG_CRYPTO_SKEIN=m +CONFIG_UNISYSSPAR=y +# CONFIG_UNISYS_VISORBUS is not set +CONFIG_FB_TFT=m +CONFIG_FB_TFT_AGM1264K_FL=m +CONFIG_FB_TFT_BD663474=m +CONFIG_FB_TFT_HX8340BN=m +CONFIG_FB_TFT_HX8347D=m +CONFIG_FB_TFT_HX8353D=m +# CONFIG_FB_TFT_HX8357D is not set +# CONFIG_FB_TFT_ILI9163 is not set +CONFIG_FB_TFT_ILI9320=m +CONFIG_FB_TFT_ILI9325=m +CONFIG_FB_TFT_ILI9340=m +CONFIG_FB_TFT_ILI9341=m +CONFIG_FB_TFT_ILI9481=m +CONFIG_FB_TFT_ILI9486=m +CONFIG_FB_TFT_PCD8544=m +CONFIG_FB_TFT_RA8875=m +CONFIG_FB_TFT_S6D02A1=m +CONFIG_FB_TFT_S6D1121=m +CONFIG_FB_TFT_SH1106=m +CONFIG_FB_TFT_SSD1289=m +CONFIG_FB_TFT_SSD1305=m +CONFIG_FB_TFT_SSD1306=m +CONFIG_FB_TFT_SSD1325=m +CONFIG_FB_TFT_SSD1331=m +CONFIG_FB_TFT_SSD1351=m +CONFIG_FB_TFT_ST7735R=m +CONFIG_FB_TFT_ST7789V=m +CONFIG_FB_TFT_TINYLCD=m +CONFIG_FB_TFT_TLS8204=m +CONFIG_FB_TFT_UC1611=m +CONFIG_FB_TFT_UC1701=m +CONFIG_FB_TFT_UPD161704=m +CONFIG_FB_TFT_WATTEROTT=m +CONFIG_FB_FLEX=m +CONFIG_FB_TFT_FBTFT_DEVICE=m +CONFIG_WILC1000=m +CONFIG_WILC1000_SDIO=m +CONFIG_WILC1000_SPI=m +# CONFIG_WILC1000_HW_OOB_INTR is not set +CONFIG_MOST=m +CONFIG_MOSTCORE=m +CONFIG_AIM_CDEV=m +CONFIG_AIM_NETWORK=m +CONFIG_AIM_SOUND=m +CONFIG_AIM_V4L2=m +CONFIG_HDM_DIM2=m +CONFIG_HDM_I2C=m +CONFIG_HDM_USB=m +CONFIG_KS7010=m +CONFIG_GREYBUS=m +CONFIG_GREYBUS_ES2=m +CONFIG_GREYBUS_AUDIO=m +CONFIG_GREYBUS_BOOTROM=m +CONFIG_GREYBUS_FIRMWARE=m +CONFIG_GREYBUS_HID=m +CONFIG_GREYBUS_LIGHT=m +CONFIG_GREYBUS_LOG=m +CONFIG_GREYBUS_LOOPBACK=m +CONFIG_GREYBUS_POWER=m +CONFIG_GREYBUS_RAW=m +CONFIG_GREYBUS_VIBRATOR=m +CONFIG_GREYBUS_BRIDGED_PHY=m 
+CONFIG_GREYBUS_GPIO=m +CONFIG_GREYBUS_I2C=m +CONFIG_GREYBUS_PWM=m +CONFIG_GREYBUS_SDIO=m +CONFIG_GREYBUS_SPI=m +CONFIG_GREYBUS_UART=m +CONFIG_GREYBUS_USB=m + +# +# USB Power Delivery and Type-C drivers +# +CONFIG_TYPEC_TCPM=m +CONFIG_TYPEC_TCPCI=m +CONFIG_TYPEC_FUSB302=m +CONFIG_DRM_VBOXVIDEO=m +CONFIG_PI433=m +CONFIG_X86_PLATFORM_DEVICES=y +CONFIG_ACER_WMI=m +CONFIG_ACERHDF=m +CONFIG_ALIENWARE_WMI=m +CONFIG_ASUS_LAPTOP=m +CONFIG_DELL_SMBIOS=m +CONFIG_DELL_LAPTOP=m +CONFIG_DELL_WMI=m +CONFIG_DELL_WMI_AIO=m +CONFIG_DELL_WMI_LED=m +CONFIG_DELL_SMO8800=m +CONFIG_DELL_RBTN=m +CONFIG_FUJITSU_LAPTOP=m +CONFIG_FUJITSU_TABLET=m +CONFIG_AMILO_RFKILL=m +CONFIG_HP_ACCEL=m +CONFIG_HP_WIRELESS=m +CONFIG_HP_WMI=m +CONFIG_MSI_LAPTOP=m +CONFIG_PANASONIC_LAPTOP=m +CONFIG_COMPAL_LAPTOP=m +CONFIG_SONY_LAPTOP=m +CONFIG_SONYPI_COMPAT=y +CONFIG_IDEAPAD_LAPTOP=m +CONFIG_SURFACE3_WMI=m +CONFIG_THINKPAD_ACPI=m +CONFIG_THINKPAD_ACPI_ALSA_SUPPORT=y +# CONFIG_THINKPAD_ACPI_DEBUGFACILITIES is not set +# CONFIG_THINKPAD_ACPI_DEBUG is not set +# CONFIG_THINKPAD_ACPI_UNSAFE_LEDS is not set +CONFIG_THINKPAD_ACPI_VIDEO=y +CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y +CONFIG_SENSORS_HDAPS=m +CONFIG_INTEL_MENLOW=m +CONFIG_EEEPC_LAPTOP=m +CONFIG_ASUS_WMI=m +CONFIG_ASUS_NB_WMI=m +CONFIG_EEEPC_WMI=m +CONFIG_ASUS_WIRELESS=m +CONFIG_ACPI_WMI=m +CONFIG_WMI_BMOF=m +CONFIG_MSI_WMI=m +CONFIG_PEAQ_WMI=m +CONFIG_TOPSTAR_LAPTOP=m +CONFIG_ACPI_TOSHIBA=m +CONFIG_TOSHIBA_BT_RFKILL=m +CONFIG_TOSHIBA_HAPS=m +CONFIG_TOSHIBA_WMI=m +CONFIG_ACPI_CMPC=m +CONFIG_INTEL_CHT_INT33FE=m +CONFIG_INTEL_INT0002_VGPIO=m +CONFIG_INTEL_HID_EVENT=m +CONFIG_INTEL_VBTN=m +CONFIG_INTEL_IPS=m +CONFIG_INTEL_PMC_CORE=y +CONFIG_IBM_RTL=m +CONFIG_SAMSUNG_LAPTOP=m +CONFIG_MXM_WMI=m +CONFIG_INTEL_OAKTRAIL=m +CONFIG_SAMSUNG_Q10=m +CONFIG_APPLE_GMUX=m +CONFIG_INTEL_RST=m +CONFIG_INTEL_SMARTCONNECT=m +CONFIG_PVPANIC=m +CONFIG_INTEL_PMC_IPC=m +CONFIG_INTEL_BXTWC_PMIC_TMU=m +CONFIG_SURFACE_PRO3_BUTTON=m +CONFIG_SURFACE_3_BUTTON=m +CONFIG_INTEL_PUNIT_IPC=m +CONFIG_INTEL_TELEMETRY=m +CONFIG_MLX_PLATFORM=m +CONFIG_MLX_CPLD_PLATFORM=m +# CONFIG_INTEL_TURBO_MAX_3 is not set +CONFIG_PMC_ATOM=y +CONFIG_CHROME_PLATFORMS=y +CONFIG_CHROMEOS_LAPTOP=m +CONFIG_CHROMEOS_PSTORE=m +CONFIG_CROS_EC_CHARDEV=m +CONFIG_CROS_EC_LPC=m +CONFIG_CROS_EC_LPC_MEC=y +CONFIG_CROS_EC_PROTO=y +CONFIG_CROS_KBD_LED_BACKLIGHT=m +CONFIG_CLKDEV_LOOKUP=y +CONFIG_HAVE_CLK_PREPARE=y +CONFIG_COMMON_CLK=y + +# +# Common Clock Framework +# +CONFIG_COMMON_CLK_WM831X=m +CONFIG_COMMON_CLK_SI5351=m +CONFIG_COMMON_CLK_CDCE706=m +CONFIG_COMMON_CLK_CS2000_CP=m +# CONFIG_COMMON_CLK_NXP is not set +CONFIG_COMMON_CLK_PWM=m +# CONFIG_COMMON_CLK_PXA is not set +# CONFIG_COMMON_CLK_PIC32 is not set +CONFIG_HWSPINLOCK=m + +# +# Clock Source drivers +# +CONFIG_CLKEVT_I8253=y +CONFIG_I8253_LOCK=y +CONFIG_CLKBLD_I8253=y +# CONFIG_ATMEL_PIT is not set +# CONFIG_SH_TIMER_CMT is not set +# CONFIG_SH_TIMER_MTU2 is not set +# CONFIG_SH_TIMER_TMU is not set +# CONFIG_EM_TIMER_STI is not set +CONFIG_MAILBOX=y +CONFIG_PCC=y +CONFIG_ALTERA_MBOX=m +CONFIG_IOMMU_API=y +CONFIG_IOMMU_SUPPORT=y + +# +# Generic IOMMU Pagetable Support +# +CONFIG_IOMMU_IOVA=y +CONFIG_AMD_IOMMU=y +CONFIG_AMD_IOMMU_V2=m +CONFIG_DMAR_TABLE=y +CONFIG_INTEL_IOMMU=y +CONFIG_INTEL_IOMMU_SVM=y +# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set +CONFIG_INTEL_IOMMU_FLOPPY_WA=y +CONFIG_IRQ_REMAP=y + +# +# Remoteproc drivers +# +CONFIG_REMOTEPROC=m + +# +# Rpmsg drivers +# +CONFIG_RPMSG=m +CONFIG_RPMSG_CHAR=m +CONFIG_RPMSG_QCOM_GLINK_NATIVE=m +CONFIG_RPMSG_QCOM_GLINK_RPM=m + +# +# SOC (System 
On Chip) specific Drivers +# + +# +# Amlogic SoC drivers +# + +# +# Broadcom SoC drivers +# + +# +# i.MX SoC drivers +# + +# +# Qualcomm SoC drivers +# +# CONFIG_SUNXI_SRAM is not set +CONFIG_SOC_TI=y +CONFIG_PM_DEVFREQ=y + +# +# DEVFREQ Governors +# +CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=y +CONFIG_DEVFREQ_GOV_PERFORMANCE=y +CONFIG_DEVFREQ_GOV_POWERSAVE=y +CONFIG_DEVFREQ_GOV_USERSPACE=y +CONFIG_DEVFREQ_GOV_PASSIVE=m + +# +# DEVFREQ Drivers +# +CONFIG_PM_DEVFREQ_EVENT=y +CONFIG_EXTCON=y + +# +# Extcon Device Drivers +# +CONFIG_EXTCON_ADC_JACK=m +CONFIG_EXTCON_ARIZONA=m +CONFIG_EXTCON_AXP288=m +CONFIG_EXTCON_GPIO=m +# CONFIG_EXTCON_INTEL_INT3496 is not set +CONFIG_EXTCON_MAX14577=m +CONFIG_EXTCON_MAX3355=m +CONFIG_EXTCON_MAX77693=m +CONFIG_EXTCON_RT8973A=m +CONFIG_EXTCON_SM5502=m +# CONFIG_EXTCON_USB_GPIO is not set +CONFIG_EXTCON_USBC_CROS_EC=m +CONFIG_MEMORY=y +CONFIG_IIO=m +CONFIG_IIO_BUFFER=y +CONFIG_IIO_BUFFER_CB=m +CONFIG_IIO_KFIFO_BUF=m +CONFIG_IIO_TRIGGERED_BUFFER=m +CONFIG_IIO_CONFIGFS=m +CONFIG_IIO_TRIGGER=y +CONFIG_IIO_CONSUMERS_PER_TRIGGER=2 +CONFIG_IIO_SW_DEVICE=m +CONFIG_IIO_SW_TRIGGER=m +CONFIG_IIO_TRIGGERED_EVENT=m + +# +# Accelerometers +# +CONFIG_BMA180=m +CONFIG_BMA220=m +CONFIG_BMC150_ACCEL=m +CONFIG_BMC150_ACCEL_I2C=m +CONFIG_BMC150_ACCEL_SPI=m +CONFIG_DA280=m +CONFIG_DA311=m +CONFIG_DMARD09=m +CONFIG_DMARD10=m +CONFIG_HID_SENSOR_ACCEL_3D=m +CONFIG_IIO_ST_ACCEL_3AXIS=m +CONFIG_IIO_ST_ACCEL_I2C_3AXIS=m +CONFIG_IIO_ST_ACCEL_SPI_3AXIS=m +CONFIG_KXSD9=m +CONFIG_KXSD9_SPI=m +CONFIG_KXSD9_I2C=m +CONFIG_KXCJK1013=m +CONFIG_MC3230=m +CONFIG_MMA7455=m +CONFIG_MMA7455_I2C=m +CONFIG_MMA7455_SPI=m +CONFIG_MMA7660=m +CONFIG_MMA8452=m +CONFIG_MMA9551_CORE=m +CONFIG_MMA9551=m +CONFIG_MMA9553=m +CONFIG_MXC4005=m +CONFIG_MXC6255=m +CONFIG_SCA3000=m +CONFIG_STK8312=m +CONFIG_STK8BA50=m + +# +# Analog to digital converters +# +CONFIG_AD_SIGMA_DELTA=m +CONFIG_AD7266=m +CONFIG_AD7291=m +CONFIG_AD7298=m +CONFIG_AD7476=m +CONFIG_AD7766=m +CONFIG_AD7791=m +CONFIG_AD7793=m +CONFIG_AD7887=m +CONFIG_AD7923=m +CONFIG_AD799X=m +CONFIG_AXP20X_ADC=m +CONFIG_AXP288_ADC=m +CONFIG_CC10001_ADC=m +CONFIG_DA9150_GPADC=m +CONFIG_DLN2_ADC=m +CONFIG_HI8435=m +CONFIG_HX711=m +CONFIG_INA2XX_ADC=m +CONFIG_LTC2471=m +CONFIG_LTC2485=m +CONFIG_LTC2497=m +CONFIG_MAX1027=m +CONFIG_MAX11100=m +CONFIG_MAX1118=m +CONFIG_MAX1363=m +CONFIG_MAX9611=m +CONFIG_MCP320X=m +CONFIG_MCP3422=m +CONFIG_MEN_Z188_ADC=m +CONFIG_NAU7802=m +CONFIG_QCOM_VADC_COMMON=m +CONFIG_QCOM_SPMI_IADC=m +CONFIG_QCOM_SPMI_VADC=m +CONFIG_TI_ADC081C=m +CONFIG_TI_ADC0832=m +CONFIG_TI_ADC084S021=m +CONFIG_TI_ADC12138=m +CONFIG_TI_ADC108S102=m +CONFIG_TI_ADC128S052=m +CONFIG_TI_ADC161S626=m +CONFIG_TI_ADS1015=m +CONFIG_TI_ADS7950=m +CONFIG_TI_AM335X_ADC=m +CONFIG_TI_TLC4541=m +CONFIG_VIPERBOARD_ADC=m + +# +# Amplifiers +# +CONFIG_AD8366=m + +# +# Chemical Sensors +# +CONFIG_ATLAS_PH_SENSOR=m +CONFIG_CCS811=m +CONFIG_IAQCORE=m +CONFIG_VZ89X=m +CONFIG_IIO_CROS_EC_SENSORS_CORE=m +CONFIG_IIO_CROS_EC_SENSORS=m + +# +# Hid Sensor IIO Common +# +CONFIG_HID_SENSOR_IIO_COMMON=m +CONFIG_HID_SENSOR_IIO_TRIGGER=m +CONFIG_IIO_MS_SENSORS_I2C=m + +# +# SSP Sensor Common +# +CONFIG_IIO_SSP_SENSORS_COMMONS=m +CONFIG_IIO_SSP_SENSORHUB=m +CONFIG_IIO_ST_SENSORS_I2C=m +CONFIG_IIO_ST_SENSORS_SPI=m +CONFIG_IIO_ST_SENSORS_CORE=m + +# +# Counters +# + +# +# Digital to analog converters +# +CONFIG_AD5064=m +CONFIG_AD5360=m +CONFIG_AD5380=m +CONFIG_AD5421=m +CONFIG_AD5446=m +CONFIG_AD5449=m +CONFIG_AD5592R_BASE=m +CONFIG_AD5592R=m +CONFIG_AD5593R=m +CONFIG_AD5504=m 
+CONFIG_AD5624R_SPI=m +CONFIG_LTC2632=m +CONFIG_AD5686=m +CONFIG_AD5755=m +CONFIG_AD5761=m +CONFIG_AD5764=m +CONFIG_AD5791=m +CONFIG_AD7303=m +CONFIG_AD8801=m +CONFIG_M62332=m +CONFIG_MAX517=m +CONFIG_MCP4725=m +CONFIG_MCP4922=m + +# +# IIO dummy driver +# +# CONFIG_IIO_SIMPLE_DUMMY is not set + +# +# Frequency Synthesizers DDS/PLL +# + +# +# Clock Generator/Distribution +# +CONFIG_AD9523=m + +# +# Phase-Locked Loop (PLL) frequency synthesizers +# +CONFIG_ADF4350=m + +# +# Digital gyroscope sensors +# +CONFIG_ADIS16080=m +CONFIG_ADIS16130=m +CONFIG_ADIS16136=m +CONFIG_ADIS16260=m +CONFIG_ADXRS450=m +CONFIG_BMG160=m +CONFIG_BMG160_I2C=m +CONFIG_BMG160_SPI=m +CONFIG_HID_SENSOR_GYRO_3D=m +CONFIG_MPU3050=m +CONFIG_MPU3050_I2C=m +CONFIG_IIO_ST_GYRO_3AXIS=m +CONFIG_IIO_ST_GYRO_I2C_3AXIS=m +CONFIG_IIO_ST_GYRO_SPI_3AXIS=m +CONFIG_ITG3200=m + +# +# Health Sensors +# + +# +# Heart Rate Monitors +# +CONFIG_AFE4403=m +CONFIG_AFE4404=m +CONFIG_MAX30100=m +CONFIG_MAX30102=m + +# +# Humidity sensors +# +CONFIG_AM2315=m +CONFIG_DHT11=m +CONFIG_HDC100X=m +CONFIG_HID_SENSOR_HUMIDITY=m +CONFIG_HTS221=m +CONFIG_HTS221_I2C=m +CONFIG_HTS221_SPI=m +CONFIG_HTU21=m +CONFIG_SI7005=m +CONFIG_SI7020=m + +# +# Inertial measurement units +# +CONFIG_ADIS16400=m +CONFIG_ADIS16480=m +CONFIG_BMI160=m +CONFIG_BMI160_I2C=m +CONFIG_BMI160_SPI=m +CONFIG_KMX61=m +CONFIG_INV_MPU6050_IIO=m +CONFIG_INV_MPU6050_I2C=m +CONFIG_INV_MPU6050_SPI=m +CONFIG_IIO_ST_LSM6DSX=m +CONFIG_IIO_ST_LSM6DSX_I2C=m +CONFIG_IIO_ST_LSM6DSX_SPI=m +CONFIG_IIO_ADIS_LIB=m +CONFIG_IIO_ADIS_LIB_BUFFER=y + +# +# Light sensors +# +# CONFIG_ACPI_ALS is not set +CONFIG_ADJD_S311=m +CONFIG_AL3320A=m +CONFIG_APDS9300=m +CONFIG_APDS9960=m +CONFIG_BH1750=m +CONFIG_BH1780=m +CONFIG_CM32181=m +CONFIG_CM3232=m +CONFIG_CM3323=m +CONFIG_CM36651=m +CONFIG_IIO_CROS_EC_LIGHT_PROX=m +CONFIG_GP2AP020A00F=m +CONFIG_SENSORS_ISL29018=m +CONFIG_SENSORS_ISL29028=m +CONFIG_ISL29125=m +CONFIG_HID_SENSOR_ALS=m +CONFIG_HID_SENSOR_PROX=m +CONFIG_JSA1212=m +CONFIG_RPR0521=m +CONFIG_SENSORS_LM3533=m +CONFIG_LTR501=m +CONFIG_MAX44000=m +CONFIG_OPT3001=m +CONFIG_PA12203001=m +CONFIG_SI1145=m +CONFIG_STK3310=m +CONFIG_TCS3414=m +CONFIG_TCS3472=m +CONFIG_SENSORS_TSL2563=m +CONFIG_TSL2583=m +CONFIG_TSL4531=m +CONFIG_US5182D=m +CONFIG_VCNL4000=m +CONFIG_VEML6070=m +CONFIG_VL6180=m + +# +# Magnetometer sensors +# +CONFIG_AK8975=m +CONFIG_AK09911=m +CONFIG_BMC150_MAGN=m +CONFIG_BMC150_MAGN_I2C=m +CONFIG_BMC150_MAGN_SPI=m +CONFIG_MAG3110=m +CONFIG_HID_SENSOR_MAGNETOMETER_3D=m +CONFIG_MMC35240=m +CONFIG_IIO_ST_MAGN_3AXIS=m +CONFIG_IIO_ST_MAGN_I2C_3AXIS=m +CONFIG_IIO_ST_MAGN_SPI_3AXIS=m +CONFIG_SENSORS_HMC5843=m +CONFIG_SENSORS_HMC5843_I2C=m +CONFIG_SENSORS_HMC5843_SPI=m + +# +# Multiplexers +# + +# +# Inclinometer sensors +# +CONFIG_HID_SENSOR_INCLINOMETER_3D=m +CONFIG_HID_SENSOR_DEVICE_ROTATION=m + +# +# Triggers - standalone +# +CONFIG_IIO_HRTIMER_TRIGGER=m +CONFIG_IIO_INTERRUPT_TRIGGER=m +CONFIG_IIO_TIGHTLOOP_TRIGGER=m +CONFIG_IIO_SYSFS_TRIGGER=m + +# +# Digital potentiometers +# +CONFIG_DS1803=m +CONFIG_MAX5481=m +CONFIG_MAX5487=m +CONFIG_MCP4131=m +CONFIG_MCP4531=m +CONFIG_TPL0102=m + +# +# Digital potentiostats +# +CONFIG_LMP91000=m + +# +# Pressure sensors +# +CONFIG_ABP060MG=m +CONFIG_BMP280=m +CONFIG_BMP280_I2C=m +CONFIG_BMP280_SPI=m +CONFIG_IIO_CROS_EC_BARO=m +CONFIG_HID_SENSOR_PRESS=m +CONFIG_HP03=m +CONFIG_MPL115=m +CONFIG_MPL115_I2C=m +CONFIG_MPL115_SPI=m +CONFIG_MPL3115=m +CONFIG_MS5611=m +CONFIG_MS5611_I2C=m +CONFIG_MS5611_SPI=m +CONFIG_MS5637=m +CONFIG_IIO_ST_PRESS=m 
+CONFIG_IIO_ST_PRESS_I2C=m +CONFIG_IIO_ST_PRESS_SPI=m +CONFIG_T5403=m +CONFIG_HP206C=m +CONFIG_ZPA2326=m +CONFIG_ZPA2326_I2C=m +CONFIG_ZPA2326_SPI=m + +# +# Lightning sensors +# +CONFIG_AS3935=m + +# +# Proximity and distance sensors +# +CONFIG_LIDAR_LITE_V2=m +CONFIG_SRF04=m +CONFIG_SX9500=m +CONFIG_SRF08=m + +# +# Temperature sensors +# +CONFIG_MAXIM_THERMOCOUPLE=m +CONFIG_HID_SENSOR_TEMP=m +CONFIG_MLX90614=m +CONFIG_TMP006=m +CONFIG_TMP007=m +CONFIG_TSYS01=m +CONFIG_TSYS02D=m +CONFIG_NTB=m +CONFIG_NTB_AMD=m +CONFIG_NTB_IDT=m +CONFIG_NTB_INTEL=m +CONFIG_NTB_PINGPONG=m +CONFIG_NTB_TOOL=m +CONFIG_NTB_PERF=m +CONFIG_NTB_TRANSPORT=m +CONFIG_VME_BUS=y + +# +# VME Bridge Drivers +# +CONFIG_VME_CA91CX42=m +CONFIG_VME_TSI148=m +CONFIG_VME_FAKE=m + +# +# VME Board Drivers +# +CONFIG_VMIVME_7805=m + +# +# VME Device Drivers +# +CONFIG_VME_USER=m +CONFIG_VME_PIO2=m +CONFIG_PWM=y +CONFIG_PWM_SYSFS=y +CONFIG_PWM_CROS_EC=m +CONFIG_PWM_LP3943=m +CONFIG_PWM_LPSS=m +CONFIG_PWM_LPSS_PCI=m +CONFIG_PWM_LPSS_PLATFORM=m +CONFIG_PWM_PCA9685=m +CONFIG_ARM_GIC_MAX_NR=1 +CONFIG_IPACK_BUS=m +CONFIG_BOARD_TPCI200=m +CONFIG_SERIAL_IPOCTAL=m +CONFIG_RESET_CONTROLLER=y +# CONFIG_RESET_ATH79 is not set +# CONFIG_RESET_BERLIN is not set +# CONFIG_RESET_IMX7 is not set +# CONFIG_RESET_LANTIQ is not set +# CONFIG_RESET_LPC18XX is not set +# CONFIG_RESET_MESON is not set +# CONFIG_RESET_PISTACHIO is not set +# CONFIG_RESET_SOCFPGA is not set +# CONFIG_RESET_STM32 is not set +# CONFIG_RESET_SUNXI is not set +CONFIG_RESET_TI_SYSCON=m +# CONFIG_RESET_ZYNQ is not set +# CONFIG_RESET_TEGRA_BPMP is not set +CONFIG_FMC=m +CONFIG_FMC_FAKEDEV=m +CONFIG_FMC_TRIVIAL=m +CONFIG_FMC_WRITE_EEPROM=m +CONFIG_FMC_CHARDEV=m + +# +# PHY Subsystem +# +CONFIG_GENERIC_PHY=y +CONFIG_BCM_KONA_USB2_PHY=m +CONFIG_PHY_PXA_28NM_HSIC=m +CONFIG_PHY_PXA_28NM_USB2=m +CONFIG_PHY_CPCAP_USB=m +CONFIG_PHY_QCOM_USB_HS=m +CONFIG_PHY_QCOM_USB_HSIC=m +CONFIG_PHY_SAMSUNG_USB2=m +# CONFIG_PHY_EXYNOS4210_USB2 is not set +# CONFIG_PHY_EXYNOS4X12_USB2 is not set +# CONFIG_PHY_EXYNOS5250_USB2 is not set +CONFIG_PHY_TUSB1210=m +CONFIG_POWERCAP=y +CONFIG_INTEL_RAPL=m +CONFIG_MCB=m +CONFIG_MCB_PCI=m +CONFIG_MCB_LPC=m + +# +# Performance monitor support +# +CONFIG_RAS=y +CONFIG_RAS_CEC=y +CONFIG_THUNDERBOLT=m + +# +# Android +# +# CONFIG_ANDROID is not set +CONFIG_LIBNVDIMM=m +CONFIG_BLK_DEV_PMEM=m +CONFIG_ND_BLK=m +CONFIG_ND_CLAIM=y +CONFIG_ND_BTT=m +CONFIG_BTT=y +CONFIG_DAX=y +CONFIG_DEV_DAX=m +CONFIG_NVMEM=y +CONFIG_STM=m +# CONFIG_STM_DUMMY is not set +CONFIG_STM_SOURCE_CONSOLE=m +CONFIG_STM_SOURCE_HEARTBEAT=m +CONFIG_INTEL_TH=m +CONFIG_INTEL_TH_PCI=m +CONFIG_INTEL_TH_GTH=m +CONFIG_INTEL_TH_STH=m +CONFIG_INTEL_TH_MSU=m +CONFIG_INTEL_TH_PTI=m +# CONFIG_INTEL_TH_DEBUG is not set +CONFIG_FPGA=m +CONFIG_FPGA_MGR_ALTERA_CVP=m +CONFIG_FPGA_MGR_ALTERA_PS_SPI=m +CONFIG_FPGA_MGR_XILINX_SPI=m +CONFIG_ALTERA_PR_IP_CORE=m + +# +# FSI support +# +CONFIG_FSI=m +CONFIG_FSI_MASTER_GPIO=m +CONFIG_FSI_MASTER_HUB=m +CONFIG_FSI_SCOM=m + +# +# Firmware Drivers +# +CONFIG_EDD=m +# CONFIG_EDD_OFF is not set +CONFIG_FIRMWARE_MEMMAP=y +CONFIG_DELL_RBU=m +CONFIG_DCDBAS=m +CONFIG_DMIID=y +CONFIG_DMI_SYSFS=m +CONFIG_DMI_SCAN_MACHINE_NON_EFI_FALLBACK=y +CONFIG_ISCSI_IBFT_FIND=y +CONFIG_ISCSI_IBFT=m +CONFIG_FW_CFG_SYSFS=m +# CONFIG_FW_CFG_SYSFS_CMDLINE is not set +CONFIG_GOOGLE_FIRMWARE=y +CONFIG_GOOGLE_SMI=m +CONFIG_GOOGLE_COREBOOT_TABLE=m +CONFIG_GOOGLE_COREBOOT_TABLE_ACPI=m +CONFIG_GOOGLE_MEMCONSOLE=m +CONFIG_GOOGLE_MEMCONSOLE_X86_LEGACY=m +CONFIG_GOOGLE_MEMCONSOLE_COREBOOT=m 
+CONFIG_GOOGLE_VPD=m + +# +# EFI (Extensible Firmware Interface) Support +# +CONFIG_EFI_VARS=m +CONFIG_EFI_ESRT=y +CONFIG_EFI_VARS_PSTORE=m +CONFIG_EFI_VARS_PSTORE_DEFAULT_DISABLE=y +CONFIG_EFI_RUNTIME_MAP=y +# CONFIG_EFI_FAKE_MEMMAP is not set +CONFIG_EFI_RUNTIME_WRAPPERS=y +CONFIG_EFI_BOOTLOADER_CONTROL=m +CONFIG_EFI_CAPSULE_LOADER=m +CONFIG_EFI_TEST=m +CONFIG_APPLE_PROPERTIES=y +CONFIG_RESET_ATTACK_MITIGATION=y +CONFIG_UEFI_CPER=y +CONFIG_EFI_DEV_PATH_PARSER=y + +# +# Tegra firmware driver +# + +# +# File systems +# +CONFIG_DCACHE_WORD_ACCESS=y +CONFIG_FS_IOMAP=y +CONFIG_EXT2_FS=m +CONFIG_EXT2_FS_XATTR=y +CONFIG_EXT2_FS_POSIX_ACL=y +CONFIG_EXT2_FS_SECURITY=y +CONFIG_EXT3_FS=m +CONFIG_EXT3_FS_POSIX_ACL=y +CONFIG_EXT3_FS_SECURITY=y +CONFIG_EXT4_FS=m +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_EXT4_ENCRYPTION=y +CONFIG_EXT4_FS_ENCRYPTION=y +# CONFIG_EXT4_DEBUG is not set +CONFIG_JBD2=m +# CONFIG_JBD2_DEBUG is not set +CONFIG_FS_MBCACHE=m +# CONFIG_REISERFS_FS is not set +# CONFIG_JFS_FS is not set +CONFIG_XFS_FS=m +CONFIG_XFS_QUOTA=y +CONFIG_XFS_POSIX_ACL=y +CONFIG_XFS_RT=y +# CONFIG_XFS_WARN is not set +# CONFIG_XFS_DEBUG is not set +# CONFIG_GFS2_FS is not set +# CONFIG_OCFS2_FS is not set +CONFIG_BTRFS_FS=m +CONFIG_BTRFS_FS_POSIX_ACL=y +# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set +# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set +# CONFIG_BTRFS_DEBUG is not set +# CONFIG_BTRFS_ASSERT is not set +CONFIG_NILFS2_FS=m +CONFIG_F2FS_FS=m +CONFIG_F2FS_STAT_FS=y +CONFIG_F2FS_FS_XATTR=y +CONFIG_F2FS_FS_POSIX_ACL=y +CONFIG_F2FS_FS_SECURITY=y +CONFIG_F2FS_CHECK_FS=y +CONFIG_F2FS_FS_ENCRYPTION=y +# CONFIG_F2FS_FAULT_INJECTION is not set +CONFIG_FS_DAX=y +CONFIG_FS_POSIX_ACL=y +CONFIG_EXPORTFS=y +CONFIG_EXPORTFS_BLOCK_OPS=y +CONFIG_FILE_LOCKING=y +CONFIG_MANDATORY_FILE_LOCKING=y +CONFIG_FS_ENCRYPTION=m +CONFIG_FSNOTIFY=y +CONFIG_DNOTIFY=y +CONFIG_INOTIFY_USER=y +CONFIG_FANOTIFY=y +# CONFIG_FANOTIFY_ACCESS_PERMISSIONS is not set +CONFIG_QUOTA=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +# CONFIG_PRINT_QUOTA_WARNING is not set +# CONFIG_QUOTA_DEBUG is not set +CONFIG_QUOTA_TREE=m +CONFIG_QFMT_V1=m +CONFIG_QFMT_V2=m +CONFIG_QUOTACTL=y +CONFIG_QUOTACTL_COMPAT=y +CONFIG_AUTOFS4_FS=m +CONFIG_FUSE_FS=m +CONFIG_CUSE=m +CONFIG_OVERLAY_FS=m +# CONFIG_OVERLAY_FS_REDIRECT_DIR is not set +CONFIG_OVERLAY_FS_INDEX=y + +# +# Caches +# +CONFIG_FSCACHE=m +CONFIG_FSCACHE_STATS=y +# CONFIG_FSCACHE_HISTOGRAM is not set +# CONFIG_FSCACHE_DEBUG is not set +# CONFIG_FSCACHE_OBJECT_LIST is not set +CONFIG_CACHEFILES=m +# CONFIG_CACHEFILES_DEBUG is not set +# CONFIG_CACHEFILES_HISTOGRAM is not set + +# +# CD-ROM/DVD Filesystems +# +CONFIG_ISO9660_FS=m +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_UDF_FS=m +CONFIG_UDF_NLS=y + +# +# DOS/FAT/NT Filesystems +# +CONFIG_FAT_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=437 +CONFIG_FAT_DEFAULT_IOCHARSET="utf8" +# CONFIG_FAT_DEFAULT_UTF8 is not set +CONFIG_NTFS_FS=m +# CONFIG_NTFS_DEBUG is not set +CONFIG_NTFS_RW=y + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +CONFIG_PROC_KCORE=y +CONFIG_PROC_SYSCTL=y +CONFIG_PROC_PAGE_MONITOR=y +# CONFIG_PROC_CHILDREN is not set +CONFIG_KERNFS=y +CONFIG_SYSFS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TMPFS_XATTR=y +CONFIG_HUGETLBFS=y +CONFIG_HUGETLB_PAGE=y +CONFIG_ARCH_HAS_GIGANTIC_PAGE=y +CONFIG_CONFIGFS_FS=m +CONFIG_EFIVAR_FS=m +CONFIG_MISC_FILESYSTEMS=y +CONFIG_ORANGEFS_FS=m +CONFIG_ADFS_FS=m +# CONFIG_ADFS_FS_RW is not set +CONFIG_AFFS_FS=m +CONFIG_ECRYPT_FS=m +CONFIG_ECRYPT_FS_MESSAGING=y 
+CONFIG_HFS_FS=m +CONFIG_HFSPLUS_FS=m +CONFIG_HFSPLUS_FS_POSIX_ACL=y +CONFIG_BEFS_FS=m +# CONFIG_BEFS_DEBUG is not set +CONFIG_BFS_FS=m +CONFIG_EFS_FS=m +CONFIG_JFFS2_FS=m +CONFIG_JFFS2_FS_DEBUG=0 +CONFIG_JFFS2_FS_WRITEBUFFER=y +CONFIG_JFFS2_FS_WBUF_VERIFY=y +CONFIG_JFFS2_SUMMARY=y +CONFIG_JFFS2_FS_XATTR=y +CONFIG_JFFS2_FS_POSIX_ACL=y +CONFIG_JFFS2_FS_SECURITY=y +CONFIG_JFFS2_COMPRESSION_OPTIONS=y +CONFIG_JFFS2_ZLIB=y +CONFIG_JFFS2_LZO=y +CONFIG_JFFS2_RTIME=y +CONFIG_JFFS2_RUBIN=y +# CONFIG_JFFS2_CMODE_NONE is not set +CONFIG_JFFS2_CMODE_PRIORITY=y +# CONFIG_JFFS2_CMODE_SIZE is not set +# CONFIG_JFFS2_CMODE_FAVOURLZO is not set +CONFIG_UBIFS_FS=m +CONFIG_UBIFS_FS_ADVANCED_COMPR=y +CONFIG_UBIFS_FS_LZO=y +CONFIG_UBIFS_FS_ZLIB=y +CONFIG_UBIFS_ATIME_SUPPORT=y +CONFIG_UBIFS_FS_ENCRYPTION=y +CONFIG_UBIFS_FS_SECURITY=y +CONFIG_CRAMFS=m +CONFIG_SQUASHFS=m +# CONFIG_SQUASHFS_FILE_CACHE is not set +CONFIG_SQUASHFS_FILE_DIRECT=y +# CONFIG_SQUASHFS_DECOMP_SINGLE is not set +# CONFIG_SQUASHFS_DECOMP_MULTI is not set +CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU=y +CONFIG_SQUASHFS_XATTR=y +CONFIG_SQUASHFS_ZLIB=y +CONFIG_SQUASHFS_LZ4=y +CONFIG_SQUASHFS_LZO=y +CONFIG_SQUASHFS_XZ=y +CONFIG_SQUASHFS_ZSTD=y +CONFIG_SQUASHFS_4K_DEVBLK_SIZE=y +CONFIG_SQUASHFS_EMBEDDED=y +CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3 +CONFIG_VXFS_FS=m +CONFIG_MINIX_FS=m +CONFIG_OMFS_FS=m +CONFIG_HPFS_FS=m +CONFIG_QNX4FS_FS=m +CONFIG_QNX6FS_FS=m +# CONFIG_QNX6FS_DEBUG is not set +CONFIG_ROMFS_FS=m +# CONFIG_ROMFS_BACKED_BY_BLOCK is not set +# CONFIG_ROMFS_BACKED_BY_MTD is not set +CONFIG_ROMFS_BACKED_BY_BOTH=y +CONFIG_ROMFS_ON_BLOCK=y +CONFIG_ROMFS_ON_MTD=y +CONFIG_PSTORE=y +# CONFIG_PSTORE_ZLIB_COMPRESS is not set +# CONFIG_PSTORE_LZO_COMPRESS is not set +CONFIG_PSTORE_LZ4_COMPRESS=y +# CONFIG_PSTORE_CONSOLE is not set +CONFIG_PSTORE_PMSG=y +CONFIG_PSTORE_RAM=m +CONFIG_SYSV_FS=m +CONFIG_UFS_FS=m +# CONFIG_UFS_FS_WRITE is not set +# CONFIG_UFS_DEBUG is not set +CONFIG_EXOFS_FS=m +# CONFIG_EXOFS_DEBUG is not set +CONFIG_ORE=m +CONFIG_NETWORK_FILESYSTEMS=y +CONFIG_NFS_FS=m +CONFIG_NFS_V2=m +CONFIG_NFS_V3=m +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=m +CONFIG_NFS_SWAP=y +CONFIG_NFS_V4_1=y +CONFIG_NFS_V4_2=y +CONFIG_PNFS_FILE_LAYOUT=m +CONFIG_PNFS_BLOCK=m +CONFIG_PNFS_FLEXFILE_LAYOUT=m +CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org" +CONFIG_NFS_V4_1_MIGRATION=y +CONFIG_NFS_V4_SECURITY_LABEL=y +CONFIG_NFS_FSCACHE=y +# CONFIG_NFS_USE_LEGACY_DNS is not set +CONFIG_NFS_USE_KERNEL_DNS=y +CONFIG_NFSD=m +CONFIG_NFSD_V2_ACL=y +CONFIG_NFSD_V3=y +CONFIG_NFSD_V3_ACL=y +CONFIG_NFSD_V4=y +CONFIG_NFSD_PNFS=y +CONFIG_NFSD_BLOCKLAYOUT=y +CONFIG_NFSD_SCSILAYOUT=y +CONFIG_NFSD_FLEXFILELAYOUT=y +# CONFIG_NFSD_V4_SECURITY_LABEL is not set +# CONFIG_NFSD_FAULT_INJECTION is not set +CONFIG_GRACE_PERIOD=m +CONFIG_LOCKD=m +CONFIG_LOCKD_V4=y +CONFIG_NFS_ACL_SUPPORT=m +CONFIG_NFS_COMMON=y +CONFIG_SUNRPC=m +CONFIG_SUNRPC_GSS=m +CONFIG_SUNRPC_BACKCHANNEL=y +CONFIG_SUNRPC_SWAP=y +CONFIG_RPCSEC_GSS_KRB5=m +# CONFIG_SUNRPC_DEBUG is not set +CONFIG_SUNRPC_XPRT_RDMA=m +CONFIG_CEPH_FS=m +CONFIG_CEPH_FSCACHE=y +CONFIG_CEPH_FS_POSIX_ACL=y +CONFIG_CIFS=m +CONFIG_CIFS_STATS=y +CONFIG_CIFS_STATS2=y +CONFIG_CIFS_WEAK_PW_HASH=y +CONFIG_CIFS_UPCALL=y +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_ACL=y +# CONFIG_CIFS_DEBUG is not set +CONFIG_CIFS_DFS_UPCALL=y +CONFIG_CIFS_SMB311=y +CONFIG_CIFS_FSCACHE=y +CONFIG_NCP_FS=m +CONFIG_NCPFS_PACKET_SIGNING=y +CONFIG_NCPFS_IOCTL_LOCKING=y +CONFIG_NCPFS_STRONG=y +CONFIG_NCPFS_NFS_NS=y +CONFIG_NCPFS_OS2_NS=y +CONFIG_NCPFS_SMALLDOS=y +CONFIG_NCPFS_NLS=y 
+CONFIG_NCPFS_EXTRAS=y +CONFIG_CODA_FS=m +CONFIG_AFS_FS=m +# CONFIG_AFS_DEBUG is not set +CONFIG_AFS_FSCACHE=y +CONFIG_9P_FS=m +CONFIG_9P_FSCACHE=y +CONFIG_9P_FS_POSIX_ACL=y +CONFIG_9P_FS_SECURITY=y +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_737=m +CONFIG_NLS_CODEPAGE_775=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_CODEPAGE_852=m +CONFIG_NLS_CODEPAGE_855=m +CONFIG_NLS_CODEPAGE_857=m +CONFIG_NLS_CODEPAGE_860=m +CONFIG_NLS_CODEPAGE_861=m +CONFIG_NLS_CODEPAGE_862=m +CONFIG_NLS_CODEPAGE_863=m +CONFIG_NLS_CODEPAGE_864=m +CONFIG_NLS_CODEPAGE_865=m +CONFIG_NLS_CODEPAGE_866=m +CONFIG_NLS_CODEPAGE_869=m +CONFIG_NLS_CODEPAGE_936=m +CONFIG_NLS_CODEPAGE_950=m +CONFIG_NLS_CODEPAGE_932=m +CONFIG_NLS_CODEPAGE_949=m +CONFIG_NLS_CODEPAGE_874=m +CONFIG_NLS_ISO8859_8=m +CONFIG_NLS_CODEPAGE_1250=m +CONFIG_NLS_CODEPAGE_1251=m +CONFIG_NLS_ASCII=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_ISO8859_2=m +CONFIG_NLS_ISO8859_3=m +CONFIG_NLS_ISO8859_4=m +CONFIG_NLS_ISO8859_5=m +CONFIG_NLS_ISO8859_6=m +CONFIG_NLS_ISO8859_7=m +CONFIG_NLS_ISO8859_9=m +CONFIG_NLS_ISO8859_13=m +CONFIG_NLS_ISO8859_14=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_KOI8_R=m +CONFIG_NLS_KOI8_U=m +CONFIG_NLS_MAC_ROMAN=m +CONFIG_NLS_MAC_CELTIC=m +CONFIG_NLS_MAC_CENTEURO=m +CONFIG_NLS_MAC_CROATIAN=m +CONFIG_NLS_MAC_CYRILLIC=m +CONFIG_NLS_MAC_GAELIC=m +CONFIG_NLS_MAC_GREEK=m +CONFIG_NLS_MAC_ICELAND=m +CONFIG_NLS_MAC_INUIT=m +CONFIG_NLS_MAC_ROMANIAN=m +CONFIG_NLS_MAC_TURKISH=m +CONFIG_NLS_UTF8=m +CONFIG_DLM=m +# CONFIG_DLM_DEBUG is not set + +# +# Kernel hacking +# +CONFIG_TRACE_IRQFLAGS_SUPPORT=y + +# +# printk and dmesg options +# +# CONFIG_PRINTK_TIME is not set +CONFIG_CONSOLE_LOGLEVEL_DEFAULT=1 +CONFIG_MESSAGE_LOGLEVEL_DEFAULT=1 +# CONFIG_BOOT_PRINTK_DELAY is not set +# CONFIG_DYNAMIC_DEBUG is not set + +# +# Compile-time checks and compiler options +# +# CONFIG_DEBUG_INFO is not set +# CONFIG_ENABLE_WARN_DEPRECATED is not set +# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_FRAME_WARN=0 +CONFIG_STRIP_ASM_SYMS=y +# CONFIG_READABLE_ASM is not set +# CONFIG_UNUSED_SYMBOLS is not set +# CONFIG_PAGE_OWNER is not set +CONFIG_DEBUG_FS=y +# CONFIG_HEADERS_CHECK is not set +# CONFIG_DEBUG_SECTION_MISMATCH is not set +CONFIG_SECTION_MISMATCH_WARN_ONLY=y +CONFIG_STACK_VALIDATION=y +# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set +CONFIG_MAGIC_SYSRQ=y +CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x1 +CONFIG_MAGIC_SYSRQ_SERIAL=y +CONFIG_DEBUG_KERNEL=y + +# +# Memory Debugging +# +# CONFIG_PAGE_EXTENSION is not set +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_PAGE_POISONING is not set +# CONFIG_DEBUG_PAGE_REF is not set +# CONFIG_DEBUG_RODATA_TEST is not set +# CONFIG_DEBUG_OBJECTS is not set +CONFIG_SLUB_DEBUG_ON=y +# CONFIG_SLUB_STATS is not set +CONFIG_HAVE_DEBUG_KMEMLEAK=y +# CONFIG_DEBUG_KMEMLEAK is not set +# CONFIG_DEBUG_STACK_USAGE is not set +# CONFIG_DEBUG_VM is not set +CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y +# CONFIG_DEBUG_VIRTUAL is not set +CONFIG_DEBUG_MEMORY_INIT=y +# CONFIG_DEBUG_PER_CPU_MAPS is not set +CONFIG_HAVE_DEBUG_STACKOVERFLOW=y +# CONFIG_DEBUG_STACKOVERFLOW is not set +CONFIG_HAVE_ARCH_KASAN=y +# CONFIG_KASAN is not set +CONFIG_ARCH_HAS_KCOV=y +# CONFIG_KCOV is not set +# CONFIG_DEBUG_SHIRQ is not set + +# +# Debug Lockups and Hangs +# +# CONFIG_SOFTLOCKUP_DETECTOR is not set +CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y +# CONFIG_HARDLOCKUP_DETECTOR is not set +# CONFIG_DETECT_HUNG_TASK is not set +# CONFIG_WQ_WATCHDOG is not set +# CONFIG_PANIC_ON_OOPS is not set +CONFIG_PANIC_ON_OOPS_VALUE=0 +CONFIG_PANIC_TIMEOUT=0 
+CONFIG_SCHED_DEBUG=y +CONFIG_SCHED_INFO=y +CONFIG_SCHEDSTATS=y +CONFIG_SCHED_STACK_END_CHECK=y +# CONFIG_DEBUG_TIMEKEEPING is not set +# CONFIG_DEBUG_PREEMPT is not set + +# +# Lock Debugging (spinlocks, mutexes, etc...) +# +# CONFIG_DEBUG_RT_MUTEXES is not set +# CONFIG_DEBUG_SPINLOCK is not set +# CONFIG_DEBUG_MUTEXES is not set +# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set +# CONFIG_DEBUG_LOCK_ALLOC is not set +# CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set +# CONFIG_DEBUG_ATOMIC_SLEEP is not set +# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +# CONFIG_LOCK_TORTURE_TEST is not set +# CONFIG_WW_MUTEX_SELFTEST is not set +CONFIG_STACKTRACE=y +# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set +# CONFIG_DEBUG_KOBJECT is not set +CONFIG_DEBUG_BUGVERBOSE=y +CONFIG_DEBUG_LIST=y +CONFIG_DEBUG_PI_LIST=y +CONFIG_DEBUG_SG=y +CONFIG_DEBUG_NOTIFIERS=y +CONFIG_DEBUG_CREDENTIALS=y + +# +# RCU Debugging +# +# CONFIG_PROVE_RCU is not set +CONFIG_TORTURE_TEST=m +CONFIG_RCU_PERF_TEST=m +# CONFIG_RCU_TORTURE_TEST is not set +CONFIG_RCU_CPU_STALL_TIMEOUT=60 +# CONFIG_RCU_TRACE is not set +# CONFIG_RCU_EQS_DEBUG is not set +# CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set +# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set +# CONFIG_CPU_HOTPLUG_STATE_CONTROL is not set +# CONFIG_NOTIFIER_ERROR_INJECTION is not set +# CONFIG_FAULT_INJECTION is not set +# CONFIG_LATENCYTOP is not set +CONFIG_USER_STACKTRACE_SUPPORT=y +CONFIG_NOP_TRACER=y +CONFIG_HAVE_FUNCTION_TRACER=y +CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y +CONFIG_HAVE_DYNAMIC_FTRACE=y +CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS=y +CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y +CONFIG_HAVE_FENTRY=y +CONFIG_HAVE_C_RECORDMCOUNT=y +CONFIG_TRACE_CLOCK=y +CONFIG_RING_BUFFER=y +CONFIG_EVENT_TRACING=y +CONFIG_CONTEXT_SWITCH_TRACER=y +CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y +CONFIG_FTRACE=y +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_PREEMPT_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_HWLAT_TRACER is not set +# CONFIG_ENABLE_DEFAULT_TRACERS is not set +# CONFIG_FTRACE_SYSCALLS is not set +# CONFIG_TRACER_SNAPSHOT is not set +CONFIG_BRANCH_PROFILE_NONE=y +# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +CONFIG_KPROBE_EVENTS=y +CONFIG_UPROBE_EVENTS=y +CONFIG_BPF_EVENTS=y +CONFIG_PROBE_EVENTS=y +# CONFIG_MMIOTRACE is not set +# CONFIG_HIST_TRIGGERS is not set +# CONFIG_TRACEPOINT_BENCHMARK is not set +# CONFIG_RING_BUFFER_BENCHMARK is not set +# CONFIG_RING_BUFFER_STARTUP_TEST is not set +# CONFIG_TRACE_EVAL_MAP_FILE is not set +# CONFIG_TRACING_EVENTS_GPIO is not set +# CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set +# CONFIG_DMA_API_DEBUG is not set + +# +# Runtime Testing +# +CONFIG_LKDTM=m +# CONFIG_TEST_LIST_SORT is not set +# CONFIG_TEST_SORT is not set +# CONFIG_KPROBES_SANITY_TEST is not set +# CONFIG_BACKTRACE_SELF_TEST is not set +# CONFIG_RBTREE_TEST is not set +# CONFIG_INTERVAL_TREE_TEST is not set +# CONFIG_PERCPU_TEST is not set +# CONFIG_ATOMIC64_SELFTEST is not set +# CONFIG_ASYNC_RAID6_TEST is not set +# CONFIG_TEST_HEXDUMP is not set +# CONFIG_TEST_STRING_HELPERS is not set +# CONFIG_TEST_KSTRTOX is not set +CONFIG_TEST_PRINTF=m +CONFIG_TEST_BITMAP=m +CONFIG_TEST_UUID=m +# CONFIG_TEST_RHASHTABLE is not set +CONFIG_TEST_HASH=m +# CONFIG_TEST_PARMAN is not set +CONFIG_TEST_LKM=m +# CONFIG_TEST_USER_COPY is not set +# CONFIG_TEST_BPF is not set +# CONFIG_TEST_FIRMWARE is not set +CONFIG_TEST_SYSCTL=m +# 
CONFIG_TEST_UDELAY is not set +CONFIG_TEST_STATIC_KEYS=m +CONFIG_TEST_KMOD=m +CONFIG_MEMTEST=y +# CONFIG_BUG_ON_DATA_CORRUPTION is not set +# CONFIG_SAMPLES is not set +CONFIG_HAVE_ARCH_KGDB=y +# CONFIG_KGDB is not set +CONFIG_ARCH_HAS_UBSAN_SANITIZE_ALL=y +# CONFIG_ARCH_WANTS_UBSAN_NO_NULL is not set +# CONFIG_UBSAN is not set +CONFIG_ARCH_HAS_DEVMEM_IS_ALLOWED=y +CONFIG_STRICT_DEVMEM=y +CONFIG_IO_STRICT_DEVMEM=y +CONFIG_EARLY_PRINTK_USB=y +CONFIG_X86_VERBOSE_BOOTUP=y +CONFIG_EARLY_PRINTK=y +# CONFIG_EARLY_PRINTK_DBGP is not set +# CONFIG_EARLY_PRINTK_EFI is not set +CONFIG_EARLY_PRINTK_USB_XDBC=y +CONFIG_X86_PTDUMP_CORE=y +# CONFIG_X86_PTDUMP is not set +# CONFIG_EFI_PGT_DUMP is not set +CONFIG_DEBUG_WX=y +CONFIG_DOUBLEFAULT=y +# CONFIG_DEBUG_TLBFLUSH is not set +# CONFIG_IOMMU_DEBUG is not set +# CONFIG_IOMMU_STRESS is not set +CONFIG_HAVE_MMIOTRACE_SUPPORT=y +# CONFIG_X86_DECODER_SELFTEST is not set +CONFIG_IO_DELAY_TYPE_0X80=0 +CONFIG_IO_DELAY_TYPE_0XED=1 +CONFIG_IO_DELAY_TYPE_UDELAY=2 +CONFIG_IO_DELAY_TYPE_NONE=3 +CONFIG_IO_DELAY_0X80=y +# CONFIG_IO_DELAY_0XED is not set +# CONFIG_IO_DELAY_UDELAY is not set +# CONFIG_IO_DELAY_NONE is not set +CONFIG_DEFAULT_IO_DELAY_TYPE=0 +# CONFIG_DEBUG_BOOT_PARAMS is not set +# CONFIG_CPA_DEBUG is not set +# CONFIG_OPTIMIZE_INLINING is not set +# CONFIG_DEBUG_ENTRY is not set +# CONFIG_DEBUG_NMI_SELFTEST is not set +CONFIG_X86_DEBUG_FPU=y +# CONFIG_PUNIT_ATOM_DEBUG is not set +CONFIG_UNWINDER_ORC=y +# CONFIG_UNWINDER_FRAME_POINTER is not set + +# +# Security options +# +CONFIG_KEYS=y +CONFIG_KEYS_COMPAT=y +CONFIG_PERSISTENT_KEYRINGS=y +# CONFIG_BIG_KEYS is not set +CONFIG_TRUSTED_KEYS=m +CONFIG_ENCRYPTED_KEYS=m +# CONFIG_KEY_DH_OPERATIONS is not set +CONFIG_SECURITY_DMESG_RESTRICT=y +CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y +CONFIG_SECURITY_TIOCSTI_RESTRICT=y +CONFIG_SECURITY=y +# CONFIG_SECURITY_WRITABLE_HOOKS is not set +CONFIG_SECURITYFS=y +CONFIG_SECURITY_NETWORK=y +CONFIG_PAGE_TABLE_ISOLATION=y +# CONFIG_SECURITY_INFINIBAND is not set +# CONFIG_SECURITY_NETWORK_XFRM is not set +CONFIG_SECURITY_PATH=y +CONFIG_INTEL_TXT=y +CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y +CONFIG_HARDENED_USERCOPY=y +CONFIG_FORTIFY_SOURCE=y +CONFIG_PAGE_SANITIZE=y +CONFIG_PAGE_SANITIZE_VERIFY=y +# CONFIG_STATIC_USERMODEHELPER is not set +# CONFIG_SECURITY_SELINUX is not set +# CONFIG_SECURITY_SMACK is not set +# CONFIG_SECURITY_TOMOYO is not set +CONFIG_SECURITY_APPARMOR=y +CONFIG_SECURITY_APPARMOR_BOOTPARAM_VALUE=1 +CONFIG_SECURITY_APPARMOR_HASH=y +CONFIG_SECURITY_APPARMOR_HASH_DEFAULT=y +# CONFIG_SECURITY_APPARMOR_DEBUG is not set +# CONFIG_SECURITY_LOADPIN is not set +CONFIG_SECURITY_YAMA=y +# CONFIG_INTEGRITY is not set +CONFIG_DEFAULT_SECURITY_APPARMOR=y +# CONFIG_DEFAULT_SECURITY_DAC is not set +CONFIG_DEFAULT_SECURITY="apparmor" +CONFIG_XOR_BLOCKS=m +CONFIG_ASYNC_CORE=m +CONFIG_ASYNC_MEMCPY=m +CONFIG_ASYNC_XOR=m +CONFIG_ASYNC_PQ=m +CONFIG_ASYNC_RAID6_RECOV=m +CONFIG_CRYPTO=y + +# +# Crypto core or helper +# +CONFIG_CRYPTO_ALGAPI=y +CONFIG_CRYPTO_ALGAPI2=y +CONFIG_CRYPTO_AEAD=m +CONFIG_CRYPTO_AEAD2=y +CONFIG_CRYPTO_BLKCIPHER=y +CONFIG_CRYPTO_BLKCIPHER2=y +CONFIG_CRYPTO_HASH=y +CONFIG_CRYPTO_HASH2=y +CONFIG_CRYPTO_RNG=m +CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_RNG_DEFAULT=m +CONFIG_CRYPTO_AKCIPHER2=y +CONFIG_CRYPTO_AKCIPHER=y +CONFIG_CRYPTO_KPP2=y +CONFIG_CRYPTO_KPP=m +CONFIG_CRYPTO_ACOMP2=y +CONFIG_CRYPTO_RSA=y +CONFIG_CRYPTO_DH=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_MANAGER=y +CONFIG_CRYPTO_MANAGER2=y +CONFIG_CRYPTO_USER=m +CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y 
+CONFIG_CRYPTO_GF128MUL=m +CONFIG_CRYPTO_NULL=m +CONFIG_CRYPTO_NULL2=y +CONFIG_CRYPTO_PCRYPT=m +CONFIG_CRYPTO_WORKQUEUE=y +CONFIG_CRYPTO_CRYPTD=m +CONFIG_CRYPTO_MCRYPTD=m +CONFIG_CRYPTO_AUTHENC=m +CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_ABLK_HELPER=m +CONFIG_CRYPTO_SIMD=m +CONFIG_CRYPTO_GLUE_HELPER_X86=m +CONFIG_CRYPTO_ENGINE=m + +# +# Authenticated Encryption with Associated Data +# +CONFIG_CRYPTO_CCM=m +CONFIG_CRYPTO_GCM=m +CONFIG_CRYPTO_CHACHA20POLY1305=m +CONFIG_CRYPTO_SEQIV=m +CONFIG_CRYPTO_ECHAINIV=m + +# +# Block modes +# +CONFIG_CRYPTO_CBC=m +CONFIG_CRYPTO_CTR=m +CONFIG_CRYPTO_CTS=m +CONFIG_CRYPTO_ECB=y +CONFIG_CRYPTO_LRW=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m +CONFIG_CRYPTO_KEYWRAP=m + +# +# Hash modes +# +CONFIG_CRYPTO_CMAC=m +CONFIG_CRYPTO_HMAC=m +CONFIG_CRYPTO_XCBC=m +CONFIG_CRYPTO_VMAC=m + +# +# Digest +# +CONFIG_CRYPTO_CRC32C=m +CONFIG_CRYPTO_CRC32C_INTEL=m +CONFIG_CRYPTO_CRC32=m +CONFIG_CRYPTO_CRC32_PCLMUL=m +CONFIG_CRYPTO_CRCT10DIF=y +CONFIG_CRYPTO_CRCT10DIF_PCLMUL=m +CONFIG_CRYPTO_GHASH=m +CONFIG_CRYPTO_POLY1305=m +CONFIG_CRYPTO_POLY1305_X86_64=m +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_RMD128=m +CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_RMD256=m +CONFIG_CRYPTO_RMD320=m +CONFIG_CRYPTO_SHA1=y +CONFIG_CRYPTO_SHA1_SSSE3=m +CONFIG_CRYPTO_SHA256_SSSE3=m +CONFIG_CRYPTO_SHA512_SSSE3=m +CONFIG_CRYPTO_SHA1_MB=m +CONFIG_CRYPTO_SHA256_MB=m +CONFIG_CRYPTO_SHA512_MB=m +CONFIG_CRYPTO_SHA256=m +CONFIG_CRYPTO_SHA512=y +CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_TGR192=m +CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=m + +# +# Ciphers +# +CONFIG_CRYPTO_AES=y +CONFIG_CRYPTO_AES_TI=m +CONFIG_CRYPTO_AES_X86_64=m +CONFIG_CRYPTO_AES_NI_INTEL=m +CONFIG_CRYPTO_ANUBIS=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_BLOWFISH=m +CONFIG_CRYPTO_BLOWFISH_COMMON=m +CONFIG_CRYPTO_BLOWFISH_X86_64=m +CONFIG_CRYPTO_CAMELLIA=m +CONFIG_CRYPTO_CAMELLIA_X86_64=m +CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64=m +CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m +CONFIG_CRYPTO_CAST_COMMON=m +CONFIG_CRYPTO_CAST5=m +CONFIG_CRYPTO_CAST5_AVX_X86_64=m +CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_CAST6_AVX_X86_64=m +CONFIG_CRYPTO_DES=m +CONFIG_CRYPTO_DES3_EDE_X86_64=m +CONFIG_CRYPTO_FCRYPT=m +CONFIG_CRYPTO_KHAZAD=m +CONFIG_CRYPTO_SALSA20=m +CONFIG_CRYPTO_CHACHA20=m +CONFIG_CRYPTO_CHACHA20_X86_64=m +CONFIG_CRYPTO_SEED=m +CONFIG_CRYPTO_SERPENT=m +CONFIG_CRYPTO_SERPENT_SSE2_X86_64=m +CONFIG_CRYPTO_SERPENT_AVX_X86_64=m +CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m +CONFIG_CRYPTO_TEA=m +CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_TWOFISH_COMMON=m +CONFIG_CRYPTO_TWOFISH_X86_64=m +CONFIG_CRYPTO_TWOFISH_X86_64_3WAY=m +CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m + +# +# Compression +# +CONFIG_CRYPTO_DEFLATE=m +CONFIG_CRYPTO_LZO=y +CONFIG_CRYPTO_842=m +CONFIG_CRYPTO_LZ4=m +CONFIG_CRYPTO_LZ4HC=m + +# +# Random Number Generation +# +CONFIG_CRYPTO_ANSI_CPRNG=m +CONFIG_CRYPTO_DRBG_MENU=m +CONFIG_CRYPTO_DRBG_HMAC=y +CONFIG_CRYPTO_DRBG_HASH=y +CONFIG_CRYPTO_DRBG_CTR=y +CONFIG_CRYPTO_DRBG=m +CONFIG_CRYPTO_JITTERENTROPY=m +CONFIG_CRYPTO_USER_API=m +CONFIG_CRYPTO_USER_API_HASH=m +CONFIG_CRYPTO_USER_API_SKCIPHER=m +CONFIG_CRYPTO_USER_API_RNG=m +CONFIG_CRYPTO_USER_API_AEAD=m +CONFIG_CRYPTO_HASH_INFO=y +CONFIG_CRYPTO_HW=y +CONFIG_CRYPTO_DEV_PADLOCK=m +CONFIG_CRYPTO_DEV_PADLOCK_AES=m +CONFIG_CRYPTO_DEV_PADLOCK_SHA=m +# CONFIG_CRYPTO_DEV_FSL_CAAM_CRYPTO_API_DESC is not set +CONFIG_CRYPTO_DEV_CCP=y +CONFIG_CRYPTO_DEV_CCP_DD=m +CONFIG_CRYPTO_DEV_SP_CCP=y +CONFIG_CRYPTO_DEV_CCP_CRYPTO=m +CONFIG_CRYPTO_DEV_QAT=m +CONFIG_CRYPTO_DEV_QAT_DH895xCC=m 
+CONFIG_CRYPTO_DEV_QAT_C3XXX=m +CONFIG_CRYPTO_DEV_QAT_C62X=m +CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m +CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m +CONFIG_CRYPTO_DEV_QAT_C62XVF=m +CONFIG_CRYPTO_DEV_NITROX=m +CONFIG_CRYPTO_DEV_NITROX_CNN55XX=m +CONFIG_CRYPTO_DEV_CHELSIO=m +CONFIG_CRYPTO_DEV_VIRTIO=m +CONFIG_ASYMMETRIC_KEY_TYPE=y +CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS7_MESSAGE_PARSER=y +# CONFIG_PKCS7_TEST_KEY is not set +# CONFIG_SIGNED_PE_FILE_VERIFICATION is not set + +# +# Certificates for signature checking +# +CONFIG_MODULE_SIG_KEY="certs/signing_key.pem" +CONFIG_SYSTEM_TRUSTED_KEYRING=y +CONFIG_SYSTEM_TRUSTED_KEYS="" +# CONFIG_SYSTEM_EXTRA_CERTIFICATE is not set +CONFIG_SECONDARY_TRUSTED_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_HASH_LIST="" +CONFIG_HAVE_KVM=y +CONFIG_HAVE_KVM_IRQCHIP=y +CONFIG_HAVE_KVM_IRQFD=y +CONFIG_HAVE_KVM_IRQ_ROUTING=y +CONFIG_HAVE_KVM_EVENTFD=y +CONFIG_KVM_MMIO=y +CONFIG_KVM_ASYNC_PF=y +CONFIG_HAVE_KVM_MSI=y +CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y +CONFIG_KVM_VFIO=y +CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y +CONFIG_KVM_COMPAT=y +CONFIG_HAVE_KVM_IRQ_BYPASS=y +CONFIG_VIRTUALIZATION=y +CONFIG_KVM=m +CONFIG_KVM_INTEL=m +CONFIG_KVM_AMD=m +# CONFIG_KVM_MMU_AUDIT is not set +CONFIG_VHOST_NET=m +CONFIG_VHOST_SCSI=m +CONFIG_VHOST_VSOCK=m +CONFIG_VHOST=m +# CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set +CONFIG_BINARY_PRINTF=y + +# +# Library routines +# +CONFIG_RAID6_PQ=m +CONFIG_BITREVERSE=y +# CONFIG_HAVE_ARCH_BITREVERSE is not set +CONFIG_RATIONAL=y +CONFIG_GENERIC_STRNCPY_FROM_USER=y +CONFIG_GENERIC_STRNLEN_USER=y +CONFIG_GENERIC_NET_UTILS=y +CONFIG_GENERIC_FIND_FIRST_BIT=y +CONFIG_GENERIC_PCI_IOMAP=y +CONFIG_GENERIC_IOMAP=y +CONFIG_GENERIC_IO=y +CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y +CONFIG_ARCH_HAS_FAST_MULTIPLIER=y +CONFIG_CRC_CCITT=m +CONFIG_CRC16=m +CONFIG_CRC_T10DIF=y +CONFIG_CRC_ITU_T=m +CONFIG_CRC32=y +# CONFIG_CRC32_SELFTEST is not set +CONFIG_CRC32_SLICEBY8=y +# CONFIG_CRC32_SLICEBY4 is not set +# CONFIG_CRC32_SARWATE is not set +# CONFIG_CRC32_BIT is not set +CONFIG_CRC4=m +CONFIG_CRC7=m +CONFIG_LIBCRC32C=m +CONFIG_CRC8=m +CONFIG_XXHASH=m +# CONFIG_AUDIT_ARCH_COMPAT_GENERIC is not set +# CONFIG_RANDOM32_SELFTEST is not set +CONFIG_842_COMPRESS=m +CONFIG_842_DECOMPRESS=m +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=y +CONFIG_LZO_COMPRESS=y +CONFIG_LZO_DECOMPRESS=y +CONFIG_LZ4_COMPRESS=y +CONFIG_LZ4HC_COMPRESS=m +CONFIG_LZ4_DECOMPRESS=y +CONFIG_ZSTD_COMPRESS=m +CONFIG_ZSTD_DECOMPRESS=m +CONFIG_XZ_DEC=y +CONFIG_XZ_DEC_X86=y +CONFIG_XZ_DEC_POWERPC=y +CONFIG_XZ_DEC_IA64=y +CONFIG_XZ_DEC_ARM=y +CONFIG_XZ_DEC_ARMTHUMB=y +CONFIG_XZ_DEC_SPARC=y +CONFIG_XZ_DEC_BCJ=y +# CONFIG_XZ_DEC_TEST is not set +CONFIG_DECOMPRESS_GZIP=y +CONFIG_DECOMPRESS_BZIP2=y +CONFIG_DECOMPRESS_LZMA=y +CONFIG_DECOMPRESS_XZ=y +CONFIG_DECOMPRESS_LZO=y +CONFIG_DECOMPRESS_LZ4=y +CONFIG_GENERIC_ALLOCATOR=y +CONFIG_REED_SOLOMON=m +CONFIG_REED_SOLOMON_ENC8=y +CONFIG_REED_SOLOMON_DEC8=y +CONFIG_REED_SOLOMON_DEC16=y +CONFIG_BCH=m +CONFIG_BCH_CONST_PARAMS=y +CONFIG_TEXTSEARCH=y +CONFIG_TEXTSEARCH_KMP=m +CONFIG_TEXTSEARCH_BM=m +CONFIG_TEXTSEARCH_FSM=m +CONFIG_BTREE=y +CONFIG_INTERVAL_TREE=y +CONFIG_RADIX_TREE_MULTIORDER=y +CONFIG_ASSOCIATIVE_ARRAY=y +CONFIG_HAS_IOMEM=y +CONFIG_HAS_IOPORT_MAP=y +CONFIG_HAS_DMA=y +# CONFIG_DMA_NOOP_OPS is not set +CONFIG_DMA_VIRT_OPS=y +CONFIG_CHECK_SIGNATURE=y +CONFIG_CPUMASK_OFFSTACK=y +CONFIG_CPU_RMAP=y +CONFIG_DQL=y +CONFIG_GLOB=y +# CONFIG_GLOB_SELFTEST is not set +CONFIG_NLATTR=y +CONFIG_LRU_CACHE=m 
+CONFIG_CLZ_TAB=y
+CONFIG_CORDIC=m
+CONFIG_DDR=y
+CONFIG_IRQ_POLL=y
+CONFIG_MPILIB=y
+CONFIG_OID_REGISTRY=y
+CONFIG_UCS2_STRING=y
+CONFIG_FONT_SUPPORT=y
+CONFIG_FONTS=y
+CONFIG_FONT_8x8=y
+CONFIG_FONT_8x16=y
+# CONFIG_FONT_6x11 is not set
+# CONFIG_FONT_7x14 is not set
+# CONFIG_FONT_PEARL_8x8 is not set
+# CONFIG_FONT_ACORN_8x8 is not set
+# CONFIG_FONT_MINI_4x6 is not set
+# CONFIG_FONT_6x10 is not set
+# CONFIG_FONT_10x18 is not set
+# CONFIG_FONT_SUN8x16 is not set
+# CONFIG_FONT_SUN12x22 is not set
+# CONFIG_SG_SPLIT is not set
+CONFIG_SG_POOL=y
+CONFIG_ARCH_HAS_SG_CHAIN=y
+CONFIG_ARCH_HAS_PMEM_API=y
+CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE=y
+CONFIG_SBITMAP=y
+CONFIG_PARMAN=m
+# CONFIG_STRING_SELFTEST is not set
diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-restore-SD_PREFER_SIBLING-on-MC-domains.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-restore-SD_PREFER_SIBLING-on-MC-domains.patch
new file mode 100644
index 00000000..b6be46cc
--- /dev/null
+++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-restore-SD_PREFER_SIBLING-on-MC-domains.patch
@@ -0,0 +1,12 @@
+diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
+index 093f2ceba..808998fe1 100644
+--- a/kernel/sched/topology.c
++++ b/kernel/sched/topology.c
+@@ -1164,6 +1164,7 @@ sd_init(struct sched_domain_topology_level *tl,
+ sd->smt_gain = 1178; /* ~15% */
+
+ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
++ sd->flags |= SD_PREFER_SIBLING;
+ sd->imbalance_pct = 117;
+ sd->cache_nice_tries = 1;
+ sd->busy_idx = 2;
diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.14-uksm-linux-hardened.patch b/sys-kernel/linux-sources-redcore-lts/files/4.14-uksm-linux-hardened.patch
new file mode 100644
index 00000000..f0596117
--- /dev/null
+++ b/sys-kernel/linux-sources-redcore-lts/files/4.14-uksm-linux-hardened.patch
@@ -0,0 +1,6919 @@
+diff -Nur a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
+--- a/Documentation/vm/00-INDEX	2018-05-25 15:18:02.000000000 +0100
++++ b/Documentation/vm/00-INDEX	2018-05-26 19:30:55.783140311 +0100
+@@ -20,6 +20,8 @@
+ - description of the idle page tracking feature.
+ ksm.txt
+ - how to use the Kernel Samepage Merging feature.
++uksm.txt
++ - Introduction to Ultra KSM
+ numa
+ - information about NUMA specific code in the Linux vm.
+ numa_memory_policy.txt
+diff -Nur a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt
+--- a/Documentation/vm/uksm.txt	1970-01-01 01:00:00.000000000 +0100
++++ b/Documentation/vm/uksm.txt	2018-05-26 19:30:55.783140311 +0100
+@@ -0,0 +1,61 @@
++The Ultra Kernel Samepage Merging feature
++----------------------------------------------
++/*
++ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
++ *
++ * This is an improvement upon KSM. Some basic data structures and routines
++ * are borrowed from ksm.c.
++ *
++ * Its new features:
++ * 1. Full system scan:
++ * It automatically scans all user processes' anonymous VMAs. Kernel-user
++ * interaction to submit a memory area to KSM is no longer needed.
++ *
++ * 2. Rich area detection:
++ * It automatically detects rich areas containing abundant duplicated
++ * pages. Rich areas are given a full scan speed. Poor areas are
++ * sampled at a reasonable speed with very low CPU consumption.
++ *
++ * 3. Ultra Per-page scan speed improvement:
++ * A new hash algorithm is proposed. As a result, on a machine with a
++ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
++ * can scan memory areas that do not contain duplicated pages at a speed of
++ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
++ * 477MB/sec ~ 923MB/sec.
++ *
++ * 4. Thrashing area avoidance:
++ * Thrashing areas (VMAs that have frequent KSM page break-outs) can be
++ * filtered out. My benchmark shows it's more efficient than KSM's per-page
++ * hash value based volatile page detection.
++ *
++ *
++ * 5. Misc changes upon KSM:
++ * * It has a fully x86-optimized memcmp dedicated for 4-byte-aligned page
++ * comparison. It's much faster than the default C version on x86.
++ * * rmap_item now has a struct *page member to loosely cache an
++ * address-->page mapping, which avoids many time-costly
++ * follow_page() calls.
++ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
++ * * try_to_merge_two_pages() can now revert a pte if it fails. No break_
++ * ksm is needed for this case.
++ *
++ * 6. Full Zero Page consideration (contributed by Figo Zhang)
++ * Now uksmd considers full zero pages as special pages and merges them into a
++ * special unswappable uksm zero page.
++ */
++
++ChangeLog:
++
++2012-05-05 The creation of this Doc
++2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up.
++2012-05-28 UKSM 0.1.1.2 bug fix release
++2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2
++2012-07-2 UKSM 0.1.2-beta2
++2012-07-10 UKSM 0.1.2-beta3
++2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization.
++2012-10-13 UKSM 0.1.2.1 Bug fixes.
++2012-12-31 UKSM 0.1.2.2 Minor bug fixes.
++2014-07-02 UKSM 0.1.2.3 Fix a " __this_cpu_read() in preemptible bug".
++2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger annoying warnings.
++2016-09-10 UKSM 0.1.2.5 Fix a bug in dedup ratio calculation.
++2017-02-26 UKSM 0.1.2.6 Fix a bug in hugetlbpage handling and a race bug with page migration.
+diff -Nur a/fs/exec.c b/fs/exec.c
+--- a/fs/exec.c	2018-05-26 19:24:34.831782903 +0100
++++ b/fs/exec.c	2018-05-26 19:31:18.404873956 +0100
+@@ -63,6 +63,7 @@
+ #include
+ #include
+ #include
++#include
+
+ #include
+ #include
+@@ -1377,6 +1378,7 @@
+ /* An exec changes our domain.
We are no longer part of the thread + group */ + current->self_exec_id++; ++ + flush_signal_handlers(current, 0); + } + EXPORT_SYMBOL(setup_new_exec); +diff -Nur a/fs/proc/meminfo.c b/fs/proc/meminfo.c +--- a/fs/proc/meminfo.c 2018-05-25 15:18:02.000000000 +0100 ++++ b/fs/proc/meminfo.c 2018-05-26 19:30:55.784140344 +0100 +@@ -118,6 +118,10 @@ + global_zone_page_state(NR_KERNEL_STACK_KB)); + show_val_kb(m, "PageTables: ", + global_zone_page_state(NR_PAGETABLE)); ++#ifdef CONFIG_UKSM ++ show_val_kb(m, "KsmZeroPages: ", ++ global_zone_page_state(NR_UKSM_ZERO_PAGES)); ++#endif + #ifdef CONFIG_QUICKLIST + show_val_kb(m, "Quicklists: ", quicklist_total_size()); + #endif +diff -Nur a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h +--- a/include/asm-generic/pgtable.h 2018-05-25 15:18:02.000000000 +0100 ++++ b/include/asm-generic/pgtable.h 2018-05-26 19:30:55.784140344 +0100 +@@ -789,12 +789,25 @@ + extern void untrack_pfn_moved(struct vm_area_struct *vma); + #endif + ++#ifdef CONFIG_UKSM ++static inline int is_uksm_zero_pfn(unsigned long pfn) ++{ ++ extern unsigned long uksm_zero_pfn; ++ return pfn == uksm_zero_pfn; ++} ++#else ++static inline int is_uksm_zero_pfn(unsigned long pfn) ++{ ++ return 0; ++} ++#endif ++ + #ifdef __HAVE_COLOR_ZERO_PAGE + static inline int is_zero_pfn(unsigned long pfn) + { + extern unsigned long zero_pfn; + unsigned long offset_from_zero_pfn = pfn - zero_pfn; +- return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); ++ return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn); + } + + #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) +@@ -803,7 +816,7 @@ + static inline int is_zero_pfn(unsigned long pfn) + { + extern unsigned long zero_pfn; +- return pfn == zero_pfn; ++ return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn)); + } + + static inline unsigned long my_zero_pfn(unsigned long addr) +diff -Nur a/include/linux/ksm.h b/include/linux/ksm.h +--- a/include/linux/ksm.h 2018-05-25 15:18:02.000000000 +0100 ++++ b/include/linux/ksm.h 2018-05-26 19:30:55.784140344 +0100 +@@ -21,21 +21,6 @@ + #ifdef CONFIG_KSM + int ksm_madvise(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int advice, unsigned long *vm_flags); +-int __ksm_enter(struct mm_struct *mm); +-void __ksm_exit(struct mm_struct *mm); +- +-static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +-{ +- if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) +- return __ksm_enter(mm); +- return 0; +-} +- +-static inline void ksm_exit(struct mm_struct *mm) +-{ +- if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) +- __ksm_exit(mm); +-} + + static inline struct stable_node *page_stable_node(struct page *page) + { +@@ -65,6 +50,33 @@ + void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); + void ksm_migrate_page(struct page *newpage, struct page *oldpage); + ++#ifdef CONFIG_KSM_LEGACY ++int __ksm_enter(struct mm_struct *mm); ++void __ksm_exit(struct mm_struct *mm); ++static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) ++{ ++ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) ++ return __ksm_enter(mm); ++ return 0; ++} ++ ++static inline void ksm_exit(struct mm_struct *mm) ++{ ++ if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) ++ __ksm_exit(mm); ++} ++ ++#elif defined(CONFIG_UKSM) ++static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) ++{ ++ return 0; ++} ++ ++static inline void ksm_exit(struct mm_struct *mm) ++{ ++} ++#endif /* !CONFIG_UKSM */ ++ + #else /* !CONFIG_KSM */ + + static inline 
int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +@@ -106,4 +118,6 @@ + #endif /* CONFIG_MMU */ + #endif /* !CONFIG_KSM */ + ++#include ++ + #endif /* __LINUX_KSM_H */ +diff -Nur a/include/linux/mm_types.h b/include/linux/mm_types.h +--- a/include/linux/mm_types.h 2018-05-25 15:18:02.000000000 +0100 ++++ b/include/linux/mm_types.h 2018-05-26 19:30:55.784140344 +0100 +@@ -337,6 +337,9 @@ + struct mempolicy *vm_policy; /* NUMA policy for the VMA */ + #endif + struct vm_userfaultfd_ctx vm_userfaultfd_ctx; ++#ifdef CONFIG_UKSM ++ struct vma_slot *uksm_vma_slot; ++#endif + } __randomize_layout; + + struct core_thread { +diff -Nur a/include/linux/mmzone.h b/include/linux/mmzone.h +--- a/include/linux/mmzone.h 2018-05-25 15:18:02.000000000 +0100 ++++ b/include/linux/mmzone.h 2018-05-26 19:30:55.785140376 +0100 +@@ -148,6 +148,9 @@ + NR_ZSPAGES, /* allocated in zsmalloc */ + #endif + NR_FREE_CMA_PAGES, ++#ifdef CONFIG_UKSM ++ NR_UKSM_ZERO_PAGES, ++#endif + NR_VM_ZONE_STAT_ITEMS }; + + enum node_stat_item { +@@ -872,7 +875,7 @@ + } + + /** +- * is_highmem - helper function to quickly check if a struct zone is a ++ * is_highmem - helper function to quickly check if a struct zone is a + * highmem zone or not. This is an attempt to keep references + * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. + * @zone - pointer to struct zone variable +diff -Nur a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h +--- a/include/linux/sradix-tree.h 1970-01-01 01:00:00.000000000 +0100 ++++ b/include/linux/sradix-tree.h 2018-05-26 19:30:55.785140376 +0100 +@@ -0,0 +1,77 @@ ++#ifndef _LINUX_SRADIX_TREE_H ++#define _LINUX_SRADIX_TREE_H ++ ++ ++#define INIT_SRADIX_TREE(root, mask) \ ++do { \ ++ (root)->height = 0; \ ++ (root)->gfp_mask = (mask); \ ++ (root)->rnode = NULL; \ ++} while (0) ++ ++#define ULONG_BITS (sizeof(unsigned long) * 8) ++#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) ++//#define SRADIX_TREE_MAP_SHIFT 6 ++//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT) ++//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1) ++ ++struct sradix_tree_node { ++ unsigned int height; /* Height from the bottom */ ++ unsigned int count; ++ unsigned int fulls; /* Number of full sublevel trees */ ++ struct sradix_tree_node *parent; ++ void *stores[0]; ++}; ++ ++/* A simple radix tree implementation */ ++struct sradix_tree_root { ++ unsigned int height; ++ struct sradix_tree_node *rnode; ++ ++ /* Where found to have available empty stores in its sublevels */ ++ struct sradix_tree_node *enter_node; ++ unsigned int shift; ++ unsigned int stores_size; ++ unsigned int mask; ++ unsigned long min; /* The first hole index */ ++ unsigned long num; ++ //unsigned long *height_to_maxindex; ++ ++ /* How the node is allocated and freed. 
*/ ++ struct sradix_tree_node *(*alloc)(void); ++ void (*free)(struct sradix_tree_node *node); ++ ++ /* When a new node is added and removed */ ++ void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child); ++ void (*assign)(struct sradix_tree_node *node, unsigned int index, void *item); ++ void (*rm)(struct sradix_tree_node *node, unsigned int offset); ++}; ++ ++struct sradix_tree_path { ++ struct sradix_tree_node *node; ++ int offset; ++}; ++ ++static inline ++void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift) ++{ ++ root->height = 0; ++ root->rnode = NULL; ++ root->shift = shift; ++ root->stores_size = 1UL << shift; ++ root->mask = root->stores_size - 1; ++} ++ ++ ++extern void *sradix_tree_next(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index, ++ int (*iter)(void *, unsigned long)); ++ ++extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num); ++ ++extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index); ++ ++extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index); ++ ++#endif /* _LINUX_SRADIX_TREE_H */ +diff -Nur a/include/linux/uksm.h b/include/linux/uksm.h +--- a/include/linux/uksm.h 1970-01-01 01:00:00.000000000 +0100 ++++ b/include/linux/uksm.h 2018-05-26 19:30:55.785140376 +0100 +@@ -0,0 +1,149 @@ ++#ifndef __LINUX_UKSM_H ++#define __LINUX_UKSM_H ++/* ++ * Memory merging support. ++ * ++ * This code enables dynamic sharing of identical pages found in different ++ * memory areas, even if they are not shared by fork(). ++ */ ++ ++/* if !CONFIG_UKSM this file should not be compiled at all. */ ++#ifdef CONFIG_UKSM ++ ++#include ++#include ++#include ++#include ++#include ++ ++extern unsigned long zero_pfn __read_mostly; ++extern unsigned long uksm_zero_pfn __read_mostly; ++extern struct page *empty_uksm_zero_page; ++ ++/* must be done before linked to mm */ ++extern void uksm_vma_add_new(struct vm_area_struct *vma); ++extern void uksm_remove_vma(struct vm_area_struct *vma); ++ ++#define UKSM_SLOT_NEED_SORT (1 << 0) ++#define UKSM_SLOT_NEED_RERAND (1 << 1) ++#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */ ++#define UKSM_SLOT_FUL_SCANNED (1 << 3) ++#define UKSM_SLOT_IN_UKSM (1 << 4) ++ ++struct vma_slot { ++ struct sradix_tree_node *snode; ++ unsigned long sindex; ++ ++ struct list_head slot_list; ++ unsigned long fully_scanned_round; ++ unsigned long dedup_num; ++ unsigned long pages_scanned; ++ unsigned long this_sampled; ++ unsigned long last_scanned; ++ unsigned long pages_to_scan; ++ struct scan_rung *rung; ++ struct page **rmap_list_pool; ++ unsigned int *pool_counts; ++ unsigned long pool_size; ++ struct vm_area_struct *vma; ++ struct mm_struct *mm; ++ unsigned long ctime_j; ++ unsigned long pages; ++ unsigned long flags; ++ unsigned long pages_cowed; /* pages cowed this round */ ++ unsigned long pages_merged; /* pages merged this round */ ++ unsigned long pages_bemerged; ++ ++ /* when it has page merged in this eval round */ ++ struct list_head dedup_list; ++}; ++ ++static inline void uksm_unmap_zero_page(pte_t pte) ++{ ++ if (pte_pfn(pte) == uksm_zero_pfn) ++ __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); ++} ++ ++static inline void uksm_map_zero_page(pte_t pte) ++{ ++ if (pte_pfn(pte) == uksm_zero_pfn) ++ __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); ++} ++ ++static inline void uksm_cow_page(struct vm_area_struct 
*vma, struct page *page) ++{ ++ if (vma->uksm_vma_slot && PageKsm(page)) ++ vma->uksm_vma_slot->pages_cowed++; ++} ++ ++static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) ++{ ++ if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn) ++ vma->uksm_vma_slot->pages_cowed++; ++} ++ ++static inline int uksm_flags_can_scan(unsigned long vm_flags) ++{ ++#ifdef VM_SAO ++ if (vm_flags & VM_SAO) ++ return 0; ++#endif ++ ++ return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND | ++ VM_HUGETLB | VM_MIXEDMAP | VM_SHARED ++ | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN)); ++} ++ ++static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) ++{ ++ if (uksm_flags_can_scan(*vm_flags_p)) ++ *vm_flags_p |= VM_MERGEABLE; ++} ++ ++/* ++ * Just a wrapper for BUG_ON for where ksm_zeropage must not be. TODO: it will ++ * be removed when uksm zero page patch is stable enough. ++ */ ++static inline void uksm_bugon_zeropage(pte_t pte) ++{ ++ BUG_ON(pte_pfn(pte) == uksm_zero_pfn); ++} ++#else ++static inline void uksm_vma_add_new(struct vm_area_struct *vma) ++{ ++} ++ ++static inline void uksm_remove_vma(struct vm_area_struct *vma) ++{ ++} ++ ++static inline void uksm_unmap_zero_page(pte_t pte) ++{ ++} ++ ++static inline void uksm_map_zero_page(pte_t pte) ++{ ++} ++ ++static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) ++{ ++} ++ ++static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) ++{ ++} ++ ++static inline int uksm_flags_can_scan(unsigned long vm_flags) ++{ ++ return 0; ++} ++ ++static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) ++{ ++} ++ ++static inline void uksm_bugon_zeropage(pte_t pte) ++{ ++} ++#endif /* !CONFIG_UKSM */ ++#endif /* __LINUX_UKSM_H */ +diff -Nur a/kernel/fork.c b/kernel/fork.c +--- a/kernel/fork.c 2018-05-26 19:24:34.840783196 +0100 ++++ b/kernel/fork.c 2018-05-26 19:30:55.785140376 +0100 +@@ -655,7 +655,7 @@ + goto fail_nomem; + charge = len; + } +- tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); ++ tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + if (!tmp) + goto fail_nomem; + *tmp = *mpnt; +@@ -714,7 +714,7 @@ + __vma_link_rb(mm, tmp, rb_link, rb_parent); + rb_link = &tmp->vm_rb.rb_right; + rb_parent = &tmp->vm_rb; +- ++ uksm_vma_add_new(tmp); + mm->map_count++; + if (!(tmp->vm_flags & VM_WIPEONFORK)) + retval = copy_page_range(mm, oldmm, mpnt); +diff -Nur a/lib/Makefile b/lib/Makefile +--- a/lib/Makefile 2018-05-25 15:18:02.000000000 +0100 ++++ b/lib/Makefile 2018-05-26 19:30:55.786140408 +0100 +@@ -18,7 +18,7 @@ + KCOV_INSTRUMENT_dynamic_debug.o := n + + lib-y := ctype.o string.o vsprintf.o cmdline.o \ +- rbtree.o radix-tree.o dump_stack.o timerqueue.o\ ++ rbtree.o radix-tree.o sradix-tree.o dump_stack.o timerqueue.o\ + idr.o int_sqrt.o extable.o \ + sha1.o chacha20.o irq_regs.o argv_split.o \ + flex_proportions.o ratelimit.o show_mem.o \ +diff -Nur a/lib/sradix-tree.c b/lib/sradix-tree.c +--- a/lib/sradix-tree.c 1970-01-01 01:00:00.000000000 +0100 ++++ b/lib/sradix-tree.c 2018-05-26 19:30:55.786140408 +0100 +@@ -0,0 +1,476 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node) ++{ ++ return node->fulls == root->stores_size || ++ (node->height == 1 && node->count == root->stores_size); ++} ++ ++/* ++ * Extend a sradix tree so it can store key @index. 
++ */ ++static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index) ++{ ++ struct sradix_tree_node *node; ++ unsigned int height; ++ ++ if (unlikely(root->rnode == NULL)) { ++ if (!(node = root->alloc())) ++ return -ENOMEM; ++ ++ node->height = 1; ++ root->rnode = node; ++ root->height = 1; ++ } ++ ++ /* Figure out what the height should be. */ ++ height = root->height; ++ index >>= root->shift * height; ++ ++ while (index) { ++ index >>= root->shift; ++ height++; ++ } ++ ++ while (height > root->height) { ++ unsigned int newheight; ++ ++ if (!(node = root->alloc())) ++ return -ENOMEM; ++ ++ /* Increase the height. */ ++ node->stores[0] = root->rnode; ++ root->rnode->parent = node; ++ if (root->extend) ++ root->extend(node, root->rnode); ++ ++ newheight = root->height + 1; ++ node->height = newheight; ++ node->count = 1; ++ if (sradix_node_full(root, root->rnode)) ++ node->fulls = 1; ++ ++ root->rnode = node; ++ root->height = newheight; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Search the next item from the current node, that is not NULL ++ * and can satify root->iter(). ++ */ ++void *sradix_tree_next(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index, ++ int (*iter)(void *item, unsigned long height)) ++{ ++ unsigned long offset; ++ void *item; ++ ++ if (unlikely(node == NULL)) { ++ node = root->rnode; ++ for (offset = 0; offset < root->stores_size; offset++) { ++ item = node->stores[offset]; ++ if (item && (!iter || iter(item, node->height))) ++ break; ++ } ++ ++ if (unlikely(offset >= root->stores_size)) ++ return NULL; ++ ++ if (node->height == 1) ++ return item; ++ else ++ goto go_down; ++ } ++ ++ while (node) { ++ offset = (index & root->mask) + 1; ++ for (; offset < root->stores_size; offset++) { ++ item = node->stores[offset]; ++ if (item && (!iter || iter(item, node->height))) ++ break; ++ } ++ ++ if (offset < root->stores_size) ++ break; ++ ++ node = node->parent; ++ index >>= root->shift; ++ } ++ ++ if (!node) ++ return NULL; ++ ++ while (node->height > 1) { ++go_down: ++ node = item; ++ for (offset = 0; offset < root->stores_size; offset++) { ++ item = node->stores[offset]; ++ if (item && (!iter || iter(item, node->height))) ++ break; ++ } ++ ++ if (unlikely(offset >= root->stores_size)) ++ return NULL; ++ } ++ ++ BUG_ON(offset > root->stores_size); ++ ++ return item; ++} ++ ++/* ++ * Blindly insert the item to the tree. Typically, we reuse the ++ * first empty store item. 
++ */ ++int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num) ++{ ++ unsigned long index; ++ unsigned int height; ++ struct sradix_tree_node *node, *tmp = NULL; ++ int offset, offset_saved; ++ void **store = NULL; ++ int error, i, j, shift; ++ ++go_on: ++ index = root->min; ++ ++ if (root->enter_node && !sradix_node_full(root, root->enter_node)) { ++ node = root->enter_node; ++ BUG_ON((index >> (root->shift * root->height))); ++ } else { ++ node = root->rnode; ++ if (node == NULL || (index >> (root->shift * root->height)) ++ || sradix_node_full(root, node)) { ++ error = sradix_tree_extend(root, index); ++ if (error) ++ return error; ++ ++ node = root->rnode; ++ } ++ } ++ ++ ++ height = node->height; ++ shift = (height - 1) * root->shift; ++ offset = (index >> shift) & root->mask; ++ while (shift > 0) { ++ offset_saved = offset; ++ for (; offset < root->stores_size; offset++) { ++ store = &node->stores[offset]; ++ tmp = *store; ++ ++ if (!tmp || !sradix_node_full(root, tmp)) ++ break; ++ } ++ BUG_ON(offset >= root->stores_size); ++ ++ if (offset != offset_saved) { ++ index += (offset - offset_saved) << shift; ++ index &= ~((1UL << shift) - 1); ++ } ++ ++ if (!tmp) { ++ if (!(tmp = root->alloc())) ++ return -ENOMEM; ++ ++ tmp->height = shift / root->shift; ++ *store = tmp; ++ tmp->parent = node; ++ node->count++; ++// if (root->extend) ++// root->extend(node, tmp); ++ } ++ ++ node = tmp; ++ shift -= root->shift; ++ offset = (index >> shift) & root->mask; ++ } ++ ++ BUG_ON(node->height != 1); ++ ++ ++ store = &node->stores[offset]; ++ for (i = 0, j = 0; ++ j < root->stores_size - node->count && ++ i < root->stores_size - offset && j < num; i++) { ++ if (!store[i]) { ++ store[i] = item[j]; ++ if (root->assign) ++ root->assign(node, index + i, item[j]); ++ j++; ++ } ++ } ++ ++ node->count += j; ++ root->num += j; ++ num -= j; ++ ++ while (sradix_node_full(root, node)) { ++ node = node->parent; ++ if (!node) ++ break; ++ ++ node->fulls++; ++ } ++ ++ if (unlikely(!node)) { ++ /* All nodes are full */ ++ root->min = 1 << (root->height * root->shift); ++ root->enter_node = NULL; ++ } else { ++ root->min = index + i - 1; ++ root->min |= (1UL << (node->height - 1)) - 1; ++ root->min++; ++ root->enter_node = node; ++ } ++ ++ if (num) { ++ item += j; ++ goto go_on; ++ } ++ ++ return 0; ++} ++ ++ ++/** ++ * sradix_tree_shrink - shrink height of a sradix tree to minimal ++ * @root sradix tree root ++ * ++ */ ++static inline void sradix_tree_shrink(struct sradix_tree_root *root) ++{ ++ /* try to shrink tree height */ ++ while (root->height > 1) { ++ struct sradix_tree_node *to_free = root->rnode; ++ ++ /* ++ * The candidate node has more than one child, or its child ++ * is not at the leftmost store, we cannot shrink. 
++ */ ++ if (to_free->count != 1 || !to_free->stores[0]) ++ break; ++ ++ root->rnode = to_free->stores[0]; ++ root->rnode->parent = NULL; ++ root->height--; ++ if (unlikely(root->enter_node == to_free)) ++ root->enter_node = NULL; ++ root->free(to_free); ++ } ++} ++ ++/* ++ * Del the item on the known leaf node and index ++ */ ++void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index) ++{ ++ unsigned int offset; ++ struct sradix_tree_node *start, *end; ++ ++ BUG_ON(node->height != 1); ++ ++ start = node; ++ while (node && !(--node->count)) ++ node = node->parent; ++ ++ end = node; ++ if (!node) { ++ root->rnode = NULL; ++ root->height = 0; ++ root->min = 0; ++ root->num = 0; ++ root->enter_node = NULL; ++ } else { ++ offset = (index >> (root->shift * (node->height - 1))) & root->mask; ++ if (root->rm) ++ root->rm(node, offset); ++ node->stores[offset] = NULL; ++ root->num--; ++ if (root->min > index) { ++ root->min = index; ++ root->enter_node = node; ++ } ++ } ++ ++ if (start != end) { ++ do { ++ node = start; ++ start = start->parent; ++ if (unlikely(root->enter_node == node)) ++ root->enter_node = end; ++ root->free(node); ++ } while (start != end); ++ ++ /* ++ * Note that shrink may free "end", so enter_node still need to ++ * be checked inside. ++ */ ++ sradix_tree_shrink(root); ++ } else if (node->count == root->stores_size - 1) { ++ /* It WAS a full leaf node. Update the ancestors */ ++ node = node->parent; ++ while (node) { ++ node->fulls--; ++ if (node->fulls != root->stores_size - 1) ++ break; ++ ++ node = node->parent; ++ } ++ } ++} ++ ++void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index) ++{ ++ unsigned int height, offset; ++ struct sradix_tree_node *node; ++ int shift; ++ ++ node = root->rnode; ++ if (node == NULL || (index >> (root->shift * root->height))) ++ return NULL; ++ ++ height = root->height; ++ shift = (height - 1) * root->shift; ++ ++ do { ++ offset = (index >> shift) & root->mask; ++ node = node->stores[offset]; ++ if (!node) ++ return NULL; ++ ++ shift -= root->shift; ++ } while (shift >= 0); ++ ++ return node; ++} ++ ++/* ++ * Return the item if it exists, otherwise create it in place ++ * and return the created item. ++ */ ++void *sradix_tree_lookup_create(struct sradix_tree_root *root, ++ unsigned long index, void *(*item_alloc)(void)) ++{ ++ unsigned int height, offset; ++ struct sradix_tree_node *node, *tmp; ++ void *item; ++ int shift, error; ++ ++ if (root->rnode == NULL || (index >> (root->shift * root->height))) { ++ if (item_alloc) { ++ error = sradix_tree_extend(root, index); ++ if (error) ++ return NULL; ++ } else { ++ return NULL; ++ } ++ } ++ ++ node = root->rnode; ++ height = root->height; ++ shift = (height - 1) * root->shift; ++ ++ do { ++ offset = (index >> shift) & root->mask; ++ if (!node->stores[offset]) { ++ if (!(tmp = root->alloc())) ++ return NULL; ++ ++ tmp->height = shift / root->shift; ++ node->stores[offset] = tmp; ++ tmp->parent = node; ++ node->count++; ++ node = tmp; ++ } else { ++ node = node->stores[offset]; ++ } ++ ++ shift -= root->shift; ++ } while (shift > 0); ++ ++ BUG_ON(node->height != 1); ++ offset = index & root->mask; ++ if (node->stores[offset]) { ++ return node->stores[offset]; ++ } else if (item_alloc) { ++ if (!(item = item_alloc())) ++ return NULL; ++ ++ node->stores[offset] = item; ++ ++ /* ++ * NOTE: we do NOT call root->assign here, since this item is ++ * newly created by us having no meaning. 
Caller can call this
++ * if it's necessary to do so.
++ */
++
++ node->count++;
++ root->num++;
++
++ while (sradix_node_full(root, node)) {
++ node = node->parent;
++ if (!node)
++ break;
++
++ node->fulls++;
++ }
++
++ if (unlikely(!node)) {
++ /* All nodes are full */
++ root->min = 1 << (root->height * root->shift);
++ } else {
++ if (root->min == index) {
++ root->min |= (1UL << (node->height - 1)) - 1;
++ root->min++;
++ root->enter_node = node;
++ }
++ }
++
++ return item;
++ } else {
++ return NULL;
++ }
++
++}
++
++int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index)
++{
++ unsigned int height, offset;
++ struct sradix_tree_node *node;
++ int shift;
++
++ node = root->rnode;
++ if (node == NULL || (index >> (root->shift * root->height)))
++ return -ENOENT;
++
++ height = root->height;
++ shift = (height - 1) * root->shift;
++
++ do {
++ offset = (index >> shift) & root->mask;
++ node = node->stores[offset];
++ if (!node)
++ return -ENOENT;
++
++ shift -= root->shift;
++ } while (shift > 0);
++
++ offset = index & root->mask;
++ if (!node->stores[offset])
++ return -ENOENT;
++
++ sradix_tree_delete_from_leaf(root, node, index);
++
++ return 0;
++}
+diff -Nur a/mm/Kconfig b/mm/Kconfig
+--- a/mm/Kconfig	2018-05-26 19:24:34.846783391 +0100
++++ b/mm/Kconfig	2018-05-26 19:30:55.786140408 +0100
+@@ -315,6 +315,32 @@
+ See Documentation/vm/ksm.txt for more information: KSM is inactive
+ until a program has madvised that an area is MADV_MERGEABLE, and
+ root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
++choice
++ prompt "Choose UKSM/KSM strategy"
++ default UKSM
++ depends on KSM
++ help
++ This option allows you to select a UKSM/KSM strategy.
++
++config UKSM
++ bool "Ultra-KSM for page merging"
++ depends on KSM
++ help
++ UKSM is inspired by the Linux kernel project KSM (Kernel Same
++ page Merging), but with a fundamentally rewritten core algorithm. With
++ an advanced algorithm, UKSM can now transparently scan all anonymously
++ mapped user space applications with significantly improved scan speed
++ and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from
++ UKSM. UKSM now has its first stable release and its first real-world enterprise user.
++ For more information, please go to its project page
++ (www.kerneldedup.org).
++
++config KSM_LEGACY
++ bool "Legacy KSM implementation"
++ depends on KSM
++ help
++ The legacy KSM implementation from Red Hat.
++endchoice + + config DEFAULT_MMAP_MIN_ADDR + int "Low address space to protect from user allocation" +diff -Nur a/mm/Makefile b/mm/Makefile +--- a/mm/Makefile 2018-05-25 15:18:02.000000000 +0100 ++++ b/mm/Makefile 2018-05-26 19:30:55.786140408 +0100 +@@ -65,7 +65,8 @@ + obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o + obj-$(CONFIG_SLOB) += slob.o + obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o +-obj-$(CONFIG_KSM) += ksm.o ++obj-$(CONFIG_KSM_LEGACY) += ksm.o ++obj-$(CONFIG_UKSM) += uksm.o + obj-$(CONFIG_PAGE_POISONING) += page_poison.o + obj-$(CONFIG_SLAB) += slab.o + obj-$(CONFIG_SLUB) += slub.o +diff -Nur a/mm/memory.c b/mm/memory.c +--- a/mm/memory.c 2018-05-25 15:18:02.000000000 +0100 ++++ b/mm/memory.c 2018-05-26 19:30:55.787140441 +0100 +@@ -129,6 +129,25 @@ + + unsigned long highest_memmap_pfn __read_mostly; + ++#ifdef CONFIG_UKSM ++unsigned long uksm_zero_pfn __read_mostly; ++EXPORT_SYMBOL_GPL(uksm_zero_pfn); ++struct page *empty_uksm_zero_page; ++ ++static int __init setup_uksm_zero_page(void) ++{ ++ empty_uksm_zero_page = alloc_pages(__GFP_ZERO & ~__GFP_MOVABLE, 0); ++ if (!empty_uksm_zero_page) ++ panic("Oh boy, that early out of memory?"); ++ ++ SetPageReserved(empty_uksm_zero_page); ++ uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page); ++ ++ return 0; ++} ++core_initcall(setup_uksm_zero_page); ++#endif ++ + /* + * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() + */ +@@ -140,6 +159,7 @@ + core_initcall(init_zero_pfn); + + ++ + #if defined(SPLIT_RSS_COUNTING) + + void sync_mm_rss(struct mm_struct *mm) +@@ -1035,6 +1055,9 @@ + get_page(page); + page_dup_rmap(page, false); + rss[mm_counter(page)]++; ++ ++ /* Should return NULL in vm_normal_page() */ ++ uksm_bugon_zeropage(pte); + } else if (pte_devmap(pte)) { + page = pte_page(pte); + +@@ -1048,6 +1071,8 @@ + page_dup_rmap(page, false); + rss[mm_counter(page)]++; + } ++ } else { ++ uksm_map_zero_page(pte); + } + + out_set_pte: +@@ -1317,8 +1342,10 @@ + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + tlb_remove_tlb_entry(tlb, pte, addr); +- if (unlikely(!page)) ++ if (unlikely(!page)) { ++ uksm_unmap_zero_page(ptent); + continue; ++ } + + if (!PageAnon(page)) { + if (pte_dirty(ptent)) { +@@ -2318,8 +2345,10 @@ + clear_page(kaddr); + kunmap_atomic(kaddr); + flush_dcache_page(dst); +- } else ++ } else { + copy_user_highpage(dst, src, va, vma); ++ uksm_cow_page(vma, src); ++ } + } + + static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) +@@ -2468,6 +2497,7 @@ + vmf->address); + if (!new_page) + goto oom; ++ uksm_cow_pte(vma, vmf->orig_pte); + } else { + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, + vmf->address); +@@ -2494,7 +2524,9 @@ + mm_counter_file(old_page)); + inc_mm_counter_fast(mm, MM_ANONPAGES); + } ++ uksm_bugon_zeropage(vmf->orig_pte); + } else { ++ uksm_unmap_zero_page(vmf->orig_pte); + inc_mm_counter_fast(mm, MM_ANONPAGES); + } + flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); +diff -Nur a/mm/mmap.c b/mm/mmap.c +--- a/mm/mmap.c 2018-05-26 19:24:34.847783423 +0100 ++++ b/mm/mmap.c 2018-05-26 19:30:55.788140473 +0100 +@@ -45,6 +45,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -173,6 +174,7 @@ + if (vma->vm_file) + fput(vma->vm_file); + mpol_put(vma_policy(vma)); ++ uksm_remove_vma(vma); + kmem_cache_free(vm_area_cachep, vma); + return next; + } +@@ -699,9 +701,16 @@ + long adjust_next = 0; + int remove_next = 0; + ++/* ++ * to avoid deadlock, ksm_remove_vma must be done before any spin_lock is ++ * acquired ++ */ ++ 
uksm_remove_vma(vma); ++ + if (next && !insert) { + struct vm_area_struct *exporter = NULL, *importer = NULL; + ++ uksm_remove_vma(next); + if (end >= next->vm_end) { + /* + * vma expands, overlapping all the next, and +@@ -834,6 +843,7 @@ + end_changed = true; + } + vma->vm_pgoff = pgoff; ++ + if (adjust_next) { + next->vm_start += adjust_next << PAGE_SHIFT; + next->vm_pgoff += adjust_next; +@@ -939,6 +949,7 @@ + if (remove_next == 2) { + remove_next = 1; + end = next->vm_end; ++ uksm_remove_vma(next); + goto again; + } + else if (next) +@@ -965,10 +976,14 @@ + */ + VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma)); + } ++ } else { ++ if (next && !insert) ++ uksm_vma_add_new(next); + } + if (insert && file) + uprobe_mmap(insert); + ++ uksm_vma_add_new(vma); + validate_mm(mm); + + return 0; +@@ -1385,6 +1400,9 @@ + vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | + mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + ++ /* If uksm is enabled, we add VM_MERGEABLE to new VMAs. */ ++ uksm_vm_flags_mod(&vm_flags); ++ + if (flags & MAP_LOCKED) + if (!can_do_mlock()) + return -EPERM; +@@ -1724,6 +1742,7 @@ + allow_write_access(file); + } + file = vma->vm_file; ++ uksm_vma_add_new(vma); + out: + perf_event_mmap(vma); + +@@ -1765,6 +1784,7 @@ + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); + free_vma: ++ uksm_remove_vma(vma); + kmem_cache_free(vm_area_cachep, vma); + unacct_error: + if (charged) +@@ -2589,6 +2609,8 @@ + else + err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + ++ uksm_vma_add_new(new); ++ + /* Success. */ + if (!err) + return 0; +@@ -2881,6 +2903,7 @@ + if ((flags & (~VM_EXEC)) != 0) + return -EINVAL; + flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; ++ uksm_vm_flags_mod(&flags); + + error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + if (offset_in_page(error)) +@@ -2938,6 +2961,7 @@ + vma->vm_flags = flags; + vma->vm_page_prot = vm_get_page_prot(flags); + vma_link(mm, vma, prev, rb_link, rb_parent); ++ uksm_vma_add_new(vma); + out: + perf_event_mmap(vma); + mm->total_vm += len >> PAGE_SHIFT; +@@ -3015,6 +3039,12 @@ + up_write(&mm->mmap_sem); + } + ++ /* ++ * Taking write lock on mmap_sem does not harm others, ++ * but it's crucial for uksm to avoid races. ++ */ ++ down_write(&mm->mmap_sem); ++ + if (mm->locked_vm) { + vma = mm->mmap; + while (vma) { +@@ -3049,6 +3079,11 @@ + vma = remove_vma(vma); + } + vm_unacct_memory(nr_accounted); ++ ++ mm->mmap = NULL; ++ mm->mm_rb = RB_ROOT; ++ vmacache_invalidate(mm); ++ up_write(&mm->mmap_sem); + } + + /* Insert vm structure into process list sorted by address +@@ -3158,6 +3193,7 @@ + new_vma->vm_ops->open(new_vma); + vma_link(mm, new_vma, prev, rb_link, rb_parent); + *need_rmap_locks = false; ++ uksm_vma_add_new(new_vma); + } + return new_vma; + +@@ -3308,6 +3344,7 @@ + vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); + + perf_event_mmap(vma); ++ uksm_vma_add_new(vma); + + return vma; + +diff -Nur a/mm/rmap.c b/mm/rmap.c +--- a/mm/rmap.c 2018-05-25 15:18:02.000000000 +0100 ++++ b/mm/rmap.c 2018-05-26 19:30:55.788140473 +0100 +@@ -1013,9 +1013,9 @@ + + /** + * __page_set_anon_rmap - set up new anonymous rmap +- * @page: Page to add to rmap ++ * @page: Page to add to rmap + * @vma: VM area to add page to. 
+- * @address: User virtual address of the mapping ++ * @address: User virtual address of the mapping + * @exclusive: the page is exclusively owned by the current process + */ + static void __page_set_anon_rmap(struct page *page, +diff -Nur a/mm/uksm.c b/mm/uksm.c +--- a/mm/uksm.c 1970-01-01 01:00:00.000000000 +0100 ++++ b/mm/uksm.c 2018-05-26 19:30:55.791140570 +0100 +@@ -0,0 +1,5584 @@ ++/* ++ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia ++ * ++ * This is an improvement upon KSM. Some basic data structures and routines ++ * are borrowed from ksm.c . ++ * ++ * Its new features: ++ * 1. Full system scan: ++ * It automatically scans all user processes' anonymous VMAs. Kernel-user ++ * interaction to submit a memory area to KSM is no longer needed. ++ * ++ * 2. Rich area detection: ++ * It automatically detects rich areas containing abundant duplicated ++ * pages based. Rich areas are given a full scan speed. Poor areas are ++ * sampled at a reasonable speed with very low CPU consumption. ++ * ++ * 3. Ultra Per-page scan speed improvement: ++ * A new hash algorithm is proposed. As a result, on a machine with ++ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it ++ * can scan memory areas that does not contain duplicated pages at speed of ++ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of ++ * 477MB/sec ~ 923MB/sec. ++ * ++ * 4. Thrashing area avoidance: ++ * Thrashing area(an VMA that has frequent Ksm page break-out) can be ++ * filtered out. My benchmark shows it's more efficient than KSM's per-page ++ * hash value based volatile page detection. ++ * ++ * ++ * 5. Misc changes upon KSM: ++ * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page ++ * comparison. It's much faster than default C version on x86. ++ * * rmap_item now has an struct *page member to loosely cache a ++ * address-->page mapping, which reduces too much time-costly ++ * follow_page(). ++ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. ++ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_ ++ * ksm is needed for this case. ++ * ++ * 6. Full Zero Page consideration(contributed by Figo Zhang) ++ * Now uksmd consider full zero pages as special pages and merge them to an ++ * special unswappable uksm zero page. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include "internal.h" ++ ++#ifdef CONFIG_X86 ++#undef memcmp ++ ++#ifdef CONFIG_X86_32 ++#define memcmp memcmpx86_32 ++/* ++ * Compare 4-byte-aligned address s1 and s2, with length n ++ */ ++int memcmpx86_32(void *s1, void *s2, size_t n) ++{ ++ size_t num = n / 4; ++ register int res; ++ ++ __asm__ __volatile__ ++ ( ++ "testl %3,%3\n\t" ++ "repe; cmpsd\n\t" ++ "je 1f\n\t" ++ "sbbl %0,%0\n\t" ++ "orl $1,%0\n" ++ "1:" ++ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) ++ : "0" (0) ++ : "cc"); ++ ++ return res; ++} ++ ++/* ++ * Check the page is all zero ? 
++ */ ++static int is_full_zero(const void *s1, size_t len) ++{ ++ unsigned char same; ++ ++ len /= 4; ++ ++ __asm__ __volatile__ ++ ("repe; scasl;" ++ "sete %0" ++ : "=qm" (same), "+D" (s1), "+c" (len) ++ : "a" (0) ++ : "cc"); ++ ++ return same; ++} ++ ++ ++#elif defined(CONFIG_X86_64) ++#define memcmp memcmpx86_64 ++/* ++ * Compare 8-byte-aligned address s1 and s2, with length n ++ */ ++int memcmpx86_64(void *s1, void *s2, size_t n) ++{ ++ size_t num = n / 8; ++ register int res; ++ ++ __asm__ __volatile__ ++ ( ++ "testq %q3,%q3\n\t" ++ "repe; cmpsq\n\t" ++ "je 1f\n\t" ++ "sbbq %q0,%q0\n\t" ++ "orq $1,%q0\n" ++ "1:" ++ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) ++ : "0" (0) ++ : "cc"); ++ ++ return res; ++} ++ ++static int is_full_zero(const void *s1, size_t len) ++{ ++ unsigned char same; ++ ++ len /= 8; ++ ++ __asm__ __volatile__ ++ ("repe; scasq;" ++ "sete %0" ++ : "=qm" (same), "+D" (s1), "+c" (len) ++ : "a" (0) ++ : "cc"); ++ ++ return same; ++} ++ ++#endif ++#else ++static int is_full_zero(const void *s1, size_t len) ++{ ++ unsigned long *src = s1; ++ int i; ++ ++ len /= sizeof(*src); ++ ++ for (i = 0; i < len; i++) { ++ if (src[i]) ++ return 0; ++ } ++ ++ return 1; ++} ++#endif ++ ++#define UKSM_RUNG_ROUND_FINISHED (1 << 0) ++#define TIME_RATIO_SCALE 10000 ++ ++#define SLOT_TREE_NODE_SHIFT 8 ++#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT) ++struct slot_tree_node { ++ unsigned long size; ++ struct sradix_tree_node snode; ++ void *stores[SLOT_TREE_NODE_STORE_SIZE]; ++}; ++ ++static struct kmem_cache *slot_tree_node_cachep; ++ ++static struct sradix_tree_node *slot_tree_node_alloc(void) ++{ ++ struct slot_tree_node *p; ++ ++ p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (!p) ++ return NULL; ++ ++ return &p->snode; ++} ++ ++static void slot_tree_node_free(struct sradix_tree_node *node) ++{ ++ struct slot_tree_node *p; ++ ++ p = container_of(node, struct slot_tree_node, snode); ++ kmem_cache_free(slot_tree_node_cachep, p); ++} ++ ++static void slot_tree_node_extend(struct sradix_tree_node *parent, ++ struct sradix_tree_node *child) ++{ ++ struct slot_tree_node *p, *c; ++ ++ p = container_of(parent, struct slot_tree_node, snode); ++ c = container_of(child, struct slot_tree_node, snode); ++ ++ p->size += c->size; ++} ++ ++void slot_tree_node_assign(struct sradix_tree_node *node, ++ unsigned int index, void *item) ++{ ++ struct vma_slot *slot = item; ++ struct slot_tree_node *cur; ++ ++ slot->snode = node; ++ slot->sindex = index; ++ ++ while (node) { ++ cur = container_of(node, struct slot_tree_node, snode); ++ cur->size += slot->pages; ++ node = node->parent; ++ } ++} ++ ++void slot_tree_node_rm(struct sradix_tree_node *node, unsigned int offset) ++{ ++ struct vma_slot *slot; ++ struct slot_tree_node *cur; ++ unsigned long pages; ++ ++ if (node->height == 1) { ++ slot = node->stores[offset]; ++ pages = slot->pages; ++ } else { ++ cur = container_of(node->stores[offset], ++ struct slot_tree_node, snode); ++ pages = cur->size; ++ } ++ ++ while (node) { ++ cur = container_of(node, struct slot_tree_node, snode); ++ cur->size -= pages; ++ node = node->parent; ++ } ++} ++ ++unsigned long slot_iter_index; ++int slot_iter(void *item, unsigned long height) ++{ ++ struct slot_tree_node *node; ++ struct vma_slot *slot; ++ ++ if (height == 1) { ++ slot = item; ++ if (slot_iter_index < slot->pages) { ++ /*in this one*/ ++ return 1; ++ } else { ++ slot_iter_index -= slot->pages; ++ return 0; ++ } ++ ++ } else { ++ node = 
container_of(item, struct slot_tree_node, snode); ++ if (slot_iter_index < node->size) { ++ /*in this one*/ ++ return 1; ++ } else { ++ slot_iter_index -= node->size; ++ return 0; ++ } ++ } ++} ++ ++ ++static inline void slot_tree_init_root(struct sradix_tree_root *root) ++{ ++ init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT); ++ root->alloc = slot_tree_node_alloc; ++ root->free = slot_tree_node_free; ++ root->extend = slot_tree_node_extend; ++ root->assign = slot_tree_node_assign; ++ root->rm = slot_tree_node_rm; ++} ++ ++void slot_tree_init(void) ++{ ++ slot_tree_node_cachep = kmem_cache_create("slot_tree_node", ++ sizeof(struct slot_tree_node), 0, ++ SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, ++ NULL); ++} ++ ++ ++/* Each rung of this ladder is a list of VMAs having a same scan ratio */ ++struct scan_rung { ++ //struct list_head scanned_list; ++ struct sradix_tree_root vma_root; ++ struct sradix_tree_root vma_root2; ++ ++ struct vma_slot *current_scan; ++ unsigned long current_offset; ++ ++ /* ++ * The initial value for current_offset, it should loop over ++ * [0~ step - 1] to let all slot have its chance to be scanned. ++ */ ++ unsigned long offset_init; ++ unsigned long step; /* dynamic step for current_offset */ ++ unsigned int flags; ++ unsigned long pages_to_scan; ++ //unsigned long fully_scanned_slots; ++ /* ++ * a little bit tricky - if cpu_time_ratio > 0, then the value is the ++ * the cpu time ratio it can spend in rung_i for every scan ++ * period. if < 0, then it is the cpu time ratio relative to the ++ * max cpu percentage user specified. Both in unit of ++ * 1/TIME_RATIO_SCALE ++ */ ++ int cpu_ratio; ++ ++ /* ++ * How long it will take for all slots in this rung to be fully ++ * scanned? If it's zero, we don't care about the cover time: ++ * it's fully scanned. ++ */ ++ unsigned int cover_msecs; ++ //unsigned long vma_num; ++ //unsigned long pages; /* Sum of all slot's pages in rung */ ++}; ++ ++/** ++ * node of either the stable or unstale rbtree ++ * ++ */ ++struct tree_node { ++ struct rb_node node; /* link in the main (un)stable rbtree */ ++ struct rb_root sub_root; /* rb_root for sublevel collision rbtree */ ++ u32 hash; ++ unsigned long count; /* TODO: merged with sub_root */ ++ struct list_head all_list; /* all tree nodes in stable/unstable tree */ ++}; ++ ++/** ++ * struct stable_node - node of the stable rbtree ++ * @node: rb node of this ksm page in the stable tree ++ * @hlist: hlist head of rmap_items using this ksm page ++ * @kpfn: page frame number of this ksm page ++ */ ++struct stable_node { ++ struct rb_node node; /* link in sub-rbtree */ ++ struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */ ++ struct hlist_head hlist; ++ unsigned long kpfn; ++ u32 hash_max; /* if ==0 then it's not been calculated yet */ ++ struct list_head all_list; /* in a list for all stable nodes */ ++}; ++ ++/** ++ * struct node_vma - group rmap_items linked in a same stable ++ * node together. 
++ */ ++struct node_vma { ++ union { ++ struct vma_slot *slot; ++ unsigned long key; /* slot is used as key sorted on hlist */ ++ }; ++ struct hlist_node hlist; ++ struct hlist_head rmap_hlist; ++ struct stable_node *head; ++}; ++ ++/** ++ * struct rmap_item - reverse mapping item for virtual addresses ++ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list ++ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree ++ * @mm: the memory structure this rmap_item is pointing into ++ * @address: the virtual address this rmap_item tracks (+ flags in low bits) ++ * @node: rb node of this rmap_item in the unstable tree ++ * @head: pointer to stable_node heading this list in the stable tree ++ * @hlist: link into hlist of rmap_items hanging off that stable_node ++ */ ++struct rmap_item { ++ struct vma_slot *slot; ++ struct page *page; ++ unsigned long address; /* + low bits used for flags below */ ++ unsigned long hash_round; ++ unsigned long entry_index; ++ union { ++ struct {/* when in unstable tree */ ++ struct rb_node node; ++ struct tree_node *tree_node; ++ u32 hash_max; ++ }; ++ struct { /* when in stable tree */ ++ struct node_vma *head; ++ struct hlist_node hlist; ++ struct anon_vma *anon_vma; ++ }; ++ }; ++} __aligned(4); ++ ++struct rmap_list_entry { ++ union { ++ struct rmap_item *item; ++ unsigned long addr; ++ }; ++ /* lowest bit is used for is_addr tag */ ++} __aligned(4); /* 4 aligned to fit in to pages*/ ++ ++ ++/* Basic data structure definition ends */ ++ ++ ++/* ++ * Flags for rmap_item to judge if it's listed in the stable/unstable tree. ++ * The flags use the low bits of rmap_item.address ++ */ ++#define UNSTABLE_FLAG 0x1 ++#define STABLE_FLAG 0x2 ++#define get_rmap_addr(x) ((x)->address & PAGE_MASK) ++ ++/* ++ * rmap_list_entry helpers ++ */ ++#define IS_ADDR_FLAG 1 ++#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG) ++#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG) ++#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG)) ++ ++ ++/* ++ * High speed caches for frequently allocated and freed structs ++ */ ++static struct kmem_cache *rmap_item_cache; ++static struct kmem_cache *stable_node_cache; ++static struct kmem_cache *node_vma_cache; ++static struct kmem_cache *vma_slot_cache; ++static struct kmem_cache *tree_node_cache; ++#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\ ++ sizeof(struct __struct), __alignof__(struct __struct),\ ++ (__flags), NULL) ++ ++/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */ ++#define SCAN_LADDER_SIZE 4 ++static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE]; ++ ++/* The evaluation rounds uksmd has finished */ ++static unsigned long long uksm_eval_round = 1; ++ ++/* ++ * we add 1 to this var when we consider we should rebuild the whole ++ * unstable tree. ++ */ ++static unsigned long uksm_hash_round = 1; ++ ++/* ++ * How many times the whole memory is scanned. 
++ */ ++static unsigned long long fully_scanned_round = 1; ++ ++/* The total number of virtual pages of all vma slots */ ++static u64 uksm_pages_total; ++ ++/* The number of pages has been scanned since the start up */ ++static u64 uksm_pages_scanned; ++ ++static u64 scanned_virtual_pages; ++ ++/* The number of pages has been scanned since last encode_benefit call */ ++static u64 uksm_pages_scanned_last; ++ ++/* If the scanned number is tooo large, we encode it here */ ++static u64 pages_scanned_stored; ++ ++static unsigned long pages_scanned_base; ++ ++/* The number of nodes in the stable tree */ ++static unsigned long uksm_pages_shared; ++ ++/* The number of page slots additionally sharing those nodes */ ++static unsigned long uksm_pages_sharing; ++ ++/* The number of nodes in the unstable tree */ ++static unsigned long uksm_pages_unshared; ++ ++/* ++ * Milliseconds ksmd should sleep between scans, ++ * >= 100ms to be consistent with ++ * scan_time_to_sleep_msec() ++ */ ++static unsigned int uksm_sleep_jiffies; ++ ++/* The real value for the uksmd next sleep */ ++static unsigned int uksm_sleep_real; ++ ++/* Saved value for user input uksm_sleep_jiffies when it's enlarged */ ++static unsigned int uksm_sleep_saved; ++ ++/* Max percentage of cpu utilization ksmd can take to scan in one batch */ ++static unsigned int uksm_max_cpu_percentage; ++ ++static int uksm_cpu_governor; ++ ++static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" }; ++ ++struct uksm_cpu_preset_s { ++ int cpu_ratio[SCAN_LADDER_SIZE]; ++ unsigned int cover_msecs[SCAN_LADDER_SIZE]; ++ unsigned int max_cpu; /* percentage */ ++}; ++ ++struct uksm_cpu_preset_s uksm_cpu_preset[4] = { ++ { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95}, ++ { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50}, ++ { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20}, ++ { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1}, ++}; ++ ++/* The default value for uksm_ema_page_time if it's not initialized */ ++#define UKSM_PAGE_TIME_DEFAULT 500 ++ ++/*cost to scan one page by expotional moving average in nsecs */ ++static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; ++ ++/* The expotional moving average alpha weight, in percentage. */ ++#define EMA_ALPHA 20 ++ ++/* ++ * The threshold used to filter out thrashing areas, ++ * If it == 0, filtering is disabled, otherwise it's the percentage up-bound ++ * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio ++ * will be considered as having a zero duplication ratio. ++ */ ++static unsigned int uksm_thrash_threshold = 50; ++ ++/* How much dedup ratio is considered to be abundant*/ ++static unsigned int uksm_abundant_threshold = 10; ++ ++/* All slots having merged pages in this eval round. */ ++struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup); ++ ++/* How many times the ksmd has slept since startup */ ++static unsigned long long uksm_sleep_times; ++ ++#define UKSM_RUN_STOP 0 ++#define UKSM_RUN_MERGE 1 ++static unsigned int uksm_run = 1; ++ ++static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait); ++static DEFINE_MUTEX(uksm_thread_mutex); ++ ++/* ++ * List vma_slot_new is for newly created vma_slot waiting to be added by ++ * ksmd. If one cannot be added(e.g. due to it's too small), it's moved to ++ * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding ++ * VMA has been removed/freed. 
++ */ ++struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new); ++struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd); ++struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del); ++static DEFINE_SPINLOCK(vma_slot_list_lock); ++ ++/* The unstable tree heads */ ++static struct rb_root root_unstable_tree = RB_ROOT; ++ ++/* ++ * All tree_nodes are in a list to be freed at once when unstable tree is ++ * freed after each scan round. ++ */ ++static struct list_head unstable_tree_node_list = ++ LIST_HEAD_INIT(unstable_tree_node_list); ++ ++/* List contains all stable nodes */ ++static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list); ++ ++/* ++ * When the hash strength is changed, the stable tree must be delta_hashed and ++ * re-structured. We use two set of below structs to speed up the ++ * re-structuring of stable tree. ++ */ ++static struct list_head ++stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]), ++ LIST_HEAD_INIT(stable_tree_node_list[1])}; ++ ++static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0]; ++static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT}; ++static struct rb_root *root_stable_treep = &root_stable_tree[0]; ++static unsigned long stable_tree_index; ++ ++/* The hash strength needed to hash a full page */ ++#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32)) ++ ++/* The hash strength needed for loop-back hashing */ ++#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10) ++ ++/* The random offsets in a page */ ++static u32 *random_nums; ++ ++/* The hash strength */ ++static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4; ++ ++/* The delta value each time the hash strength increases or decreases */ ++static unsigned long hash_strength_delta; ++#define HASH_STRENGTH_DELTA_MAX 5 ++ ++/* The time we have saved due to random_sample_hash */ ++static u64 rshash_pos; ++ ++/* The time we have wasted due to hash collision */ ++static u64 rshash_neg; ++ ++struct uksm_benefit { ++ u64 pos; ++ u64 neg; ++ u64 scanned; ++ unsigned long base; ++} benefit; ++ ++/* ++ * The relative cost of memcmp, compared to 1 time unit of random sample ++ * hash, this value is tested when ksm module is initialized ++ */ ++static unsigned long memcmp_cost; ++ ++static unsigned long rshash_neg_cont_zero; ++static unsigned long rshash_cont_obscure; ++ ++/* The possible states of hash strength adjustment heuristic */ ++enum rshash_states { ++ RSHASH_STILL, ++ RSHASH_TRYUP, ++ RSHASH_TRYDOWN, ++ RSHASH_NEW, ++ RSHASH_PRE_STILL, ++}; ++ ++/* The possible direction we are about to adjust hash strength */ ++enum rshash_direct { ++ GO_UP, ++ GO_DOWN, ++ OBSCURE, ++ STILL, ++}; ++ ++/* random sampling hash state machine */ ++static struct { ++ enum rshash_states state; ++ enum rshash_direct pre_direct; ++ u8 below_count; ++ /* Keep a lookup window of size 5, iff above_count/below_count > 3 ++ * in this window we stop trying. 
++ */ ++ u8 lookup_window_index; ++ u64 stable_benefit; ++ unsigned long turn_point_down; ++ unsigned long turn_benefit_down; ++ unsigned long turn_point_up; ++ unsigned long turn_benefit_up; ++ unsigned long stable_point; ++} rshash_state; ++ ++/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/ ++static u32 *zero_hash_table; ++ ++static inline struct node_vma *alloc_node_vma(void) ++{ ++ struct node_vma *node_vma; ++ ++ node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (node_vma) { ++ INIT_HLIST_HEAD(&node_vma->rmap_hlist); ++ INIT_HLIST_NODE(&node_vma->hlist); ++ } ++ return node_vma; ++} ++ ++static inline void free_node_vma(struct node_vma *node_vma) ++{ ++ kmem_cache_free(node_vma_cache, node_vma); ++} ++ ++ ++static inline struct vma_slot *alloc_vma_slot(void) ++{ ++ struct vma_slot *slot; ++ ++ /* ++ * In case ksm is not initialized by now. ++ * Oops, we need to consider the call site of uksm_init() in the future. ++ */ ++ if (!vma_slot_cache) ++ return NULL; ++ ++ slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (slot) { ++ INIT_LIST_HEAD(&slot->slot_list); ++ INIT_LIST_HEAD(&slot->dedup_list); ++ slot->flags |= UKSM_SLOT_NEED_RERAND; ++ } ++ return slot; ++} ++ ++static inline void free_vma_slot(struct vma_slot *vma_slot) ++{ ++ kmem_cache_free(vma_slot_cache, vma_slot); ++} ++ ++ ++ ++static inline struct rmap_item *alloc_rmap_item(void) ++{ ++ struct rmap_item *rmap_item; ++ ++ rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (rmap_item) { ++ /* bug on lowest bit is not clear for flag use */ ++ BUG_ON(is_addr(rmap_item)); ++ } ++ return rmap_item; ++} ++ ++static inline void free_rmap_item(struct rmap_item *rmap_item) ++{ ++ rmap_item->slot = NULL; /* debug safety */ ++ kmem_cache_free(rmap_item_cache, rmap_item); ++} ++ ++static inline struct stable_node *alloc_stable_node(void) ++{ ++ struct stable_node *node; ++ ++ node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (!node) ++ return NULL; ++ ++ INIT_HLIST_HEAD(&node->hlist); ++ list_add(&node->all_list, &stable_node_list); ++ return node; ++} ++ ++static inline void free_stable_node(struct stable_node *stable_node) ++{ ++ list_del(&stable_node->all_list); ++ kmem_cache_free(stable_node_cache, stable_node); ++} ++ ++static inline struct tree_node *alloc_tree_node(struct list_head *list) ++{ ++ struct tree_node *node; ++ ++ node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (!node) ++ return NULL; ++ ++ list_add(&node->all_list, list); ++ return node; ++} ++ ++static inline void free_tree_node(struct tree_node *node) ++{ ++ list_del(&node->all_list); ++ kmem_cache_free(tree_node_cache, node); ++} ++ ++static void uksm_drop_anon_vma(struct rmap_item *rmap_item) ++{ ++ struct anon_vma *anon_vma = rmap_item->anon_vma; ++ ++ put_anon_vma(anon_vma); ++} ++ ++ ++/** ++ * Remove a stable node from stable_tree, may unlink from its tree_node and ++ * may remove its parent tree_node if no other stable node is pending. ++ * ++ * @stable_node The node need to be removed ++ * @unlink_rb Will this node be unlinked from the rbtree? ++ * @remove_tree_ node Will its tree_node be removed if empty? 
++ */ ++static void remove_node_from_stable_tree(struct stable_node *stable_node, ++ int unlink_rb, int remove_tree_node) ++{ ++ struct node_vma *node_vma; ++ struct rmap_item *rmap_item; ++ struct hlist_node *n; ++ ++ if (!hlist_empty(&stable_node->hlist)) { ++ hlist_for_each_entry_safe(node_vma, n, ++ &stable_node->hlist, hlist) { ++ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { ++ uksm_pages_sharing--; ++ ++ uksm_drop_anon_vma(rmap_item); ++ rmap_item->address &= PAGE_MASK; ++ } ++ free_node_vma(node_vma); ++ cond_resched(); ++ } ++ ++ /* the last one is counted as shared */ ++ uksm_pages_shared--; ++ uksm_pages_sharing++; ++ } ++ ++ if (stable_node->tree_node && unlink_rb) { ++ rb_erase(&stable_node->node, ++ &stable_node->tree_node->sub_root); ++ ++ if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) && ++ remove_tree_node) { ++ rb_erase(&stable_node->tree_node->node, ++ root_stable_treep); ++ free_tree_node(stable_node->tree_node); ++ } else { ++ stable_node->tree_node->count--; ++ } ++ } ++ ++ free_stable_node(stable_node); ++} ++ ++ ++/* ++ * get_uksm_page: checks if the page indicated by the stable node ++ * is still its ksm page, despite having held no reference to it. ++ * In which case we can trust the content of the page, and it ++ * returns the gotten page; but if the page has now been zapped, ++ * remove the stale node from the stable tree and return NULL. ++ * ++ * You would expect the stable_node to hold a reference to the ksm page. ++ * But if it increments the page's count, swapping out has to wait for ++ * ksmd to come around again before it can free the page, which may take ++ * seconds or even minutes: much too unresponsive. So instead we use a ++ * "keyhole reference": access to the ksm page from the stable node peeps ++ * out through its keyhole to see if that page still holds the right key, ++ * pointing back to this stable node. This relies on freeing a PageAnon ++ * page to reset its page->mapping to NULL, and relies on no other use of ++ * a page to put something that might look like our key in page->mapping. ++ * ++ * include/linux/pagemap.h page_cache_get_speculative() is a good reference, ++ * but this is different - made simpler by uksm_thread_mutex being held, but ++ * interesting for assuming that no other use of the struct page could ever ++ * put our expected_mapping into page->mapping (or a field of the union which ++ * coincides with page->mapping). The RCU calls are not for KSM at all, but ++ * to keep the page_count protocol described with page_cache_get_speculative. ++ * ++ * Note: it is possible that get_uksm_page() will return NULL one moment, ++ * then page the next, if the page is in between page_freeze_refs() and ++ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page ++ * is on its way to being freed; but it is an anomaly to bear in mind. ++ * ++ * @unlink_rb: if the removal of this node will firstly unlink from ++ * its rbtree. stable_node_reinsert will prevent this when restructuring the ++ * node from its old tree. ++ * ++ * @remove_tree_node: if this is the last one of its tree_node, will the ++ * tree_node be freed ? If we are inserting stable node, this tree_node may ++ * be reused, so don't free it. 
++ */ ++static struct page *get_uksm_page(struct stable_node *stable_node, ++ int unlink_rb, int remove_tree_node) ++{ ++ struct page *page; ++ void *expected_mapping; ++ unsigned long kpfn; ++ ++ expected_mapping = (void *)((unsigned long)stable_node | ++ PAGE_MAPPING_KSM); ++again: ++ kpfn = READ_ONCE(stable_node->kpfn); ++ page = pfn_to_page(kpfn); ++ ++ /* ++ * page is computed from kpfn, so on most architectures reading ++ * page->mapping is naturally ordered after reading node->kpfn, ++ * but on Alpha we need to be more careful. ++ */ ++ smp_read_barrier_depends(); ++ ++ if (READ_ONCE(page->mapping) != expected_mapping) ++ goto stale; ++ ++ /* ++ * We cannot do anything with the page while its refcount is 0. ++ * Usually 0 means free, or tail of a higher-order page: in which ++ * case this node is no longer referenced, and should be freed; ++ * however, it might mean that the page is under page_freeze_refs(). ++ * The __remove_mapping() case is easy, again the node is now stale; ++ * but if page is swapcache in migrate_page_move_mapping(), it might ++ * still be our page, in which case it's essential to keep the node. ++ */ ++ while (!get_page_unless_zero(page)) { ++ /* ++ * Another check for page->mapping != expected_mapping would ++ * work here too. We have chosen the !PageSwapCache test to ++ * optimize the common case, when the page is or is about to ++ * be freed: PageSwapCache is cleared (under spin_lock_irq) ++ * in the freeze_refs section of __remove_mapping(); but Anon ++ * page->mapping reset to NULL later, in free_pages_prepare(). ++ */ ++ if (!PageSwapCache(page)) ++ goto stale; ++ cpu_relax(); ++ } ++ ++ if (READ_ONCE(page->mapping) != expected_mapping) { ++ put_page(page); ++ goto stale; ++ } ++ ++ lock_page(page); ++ if (READ_ONCE(page->mapping) != expected_mapping) { ++ unlock_page(page); ++ put_page(page); ++ goto stale; ++ } ++ unlock_page(page); ++ return page; ++stale: ++ /* ++ * We come here from above when page->mapping or !PageSwapCache ++ * suggests that the node is stale; but it might be under migration. ++ * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(), ++ * before checking whether node->kpfn has been changed. ++ */ ++ smp_rmb(); ++ if (stable_node->kpfn != kpfn) ++ goto again; ++ ++ remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node); ++ ++ return NULL; ++} ++ ++/* ++ * Removing rmap_item from stable or unstable tree. ++ * This function will clean the information from the stable/unstable tree. ++ */ ++static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item) ++{ ++ if (rmap_item->address & STABLE_FLAG) { ++ struct stable_node *stable_node; ++ struct node_vma *node_vma; ++ struct page *page; ++ ++ node_vma = rmap_item->head; ++ stable_node = node_vma->head; ++ page = get_uksm_page(stable_node, 1, 1); ++ if (!page) ++ goto out; ++ ++ /* ++ * page lock is needed because it's racing with ++ * try_to_unmap_ksm(), etc. ++ */ ++ lock_page(page); ++ hlist_del(&rmap_item->hlist); ++ ++ if (hlist_empty(&node_vma->rmap_hlist)) { ++ hlist_del(&node_vma->hlist); ++ free_node_vma(node_vma); ++ } ++ unlock_page(page); ++ ++ put_page(page); ++ if (hlist_empty(&stable_node->hlist)) { ++ /* do NOT call remove_node_from_stable_tree() here, ++ * it's possible for a forked rmap_item not in ++ * stable tree while the in-tree rmap_items were ++ * deleted. 
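++ * The emptied stable_node is left in place; a later call to
++ * get_uksm_page() will reap it once its page goes stale.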
++ */ ++ uksm_pages_shared--; ++ } else ++ uksm_pages_sharing--; ++ ++ ++ uksm_drop_anon_vma(rmap_item); ++ } else if (rmap_item->address & UNSTABLE_FLAG) { ++ if (rmap_item->hash_round == uksm_hash_round) { ++ ++ rb_erase(&rmap_item->node, ++ &rmap_item->tree_node->sub_root); ++ if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) { ++ rb_erase(&rmap_item->tree_node->node, ++ &root_unstable_tree); ++ ++ free_tree_node(rmap_item->tree_node); ++ } else ++ rmap_item->tree_node->count--; ++ } ++ uksm_pages_unshared--; ++ } ++ ++ rmap_item->address &= PAGE_MASK; ++ rmap_item->hash_max = 0; ++ ++out: ++ cond_resched(); /* we're called from many long loops */ ++} ++ ++static inline int slot_in_uksm(struct vma_slot *slot) ++{ ++ return list_empty(&slot->slot_list); ++} ++ ++/* ++ * Test if the mm is exiting ++ */ ++static inline bool uksm_test_exit(struct mm_struct *mm) ++{ ++ return atomic_read(&mm->mm_users) == 0; ++} ++ ++static inline unsigned long vma_pool_size(struct vma_slot *slot) ++{ ++ return round_up(sizeof(struct rmap_list_entry) * slot->pages, ++ PAGE_SIZE) >> PAGE_SHIFT; ++} ++ ++#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta)) ++ ++/* must be done with sem locked */ ++static int slot_pool_alloc(struct vma_slot *slot) ++{ ++ unsigned long pool_size; ++ ++ if (slot->rmap_list_pool) ++ return 0; ++ ++ pool_size = vma_pool_size(slot); ++ slot->rmap_list_pool = kcalloc(pool_size, sizeof(struct page *), ++ GFP_KERNEL); ++ if (!slot->rmap_list_pool) ++ return -ENOMEM; ++ ++ slot->pool_counts = kcalloc(pool_size, sizeof(unsigned int), ++ GFP_KERNEL); ++ if (!slot->pool_counts) { ++ kfree(slot->rmap_list_pool); ++ return -ENOMEM; ++ } ++ ++ slot->pool_size = pool_size; ++ BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages)); ++ slot->flags |= UKSM_SLOT_IN_UKSM; ++ uksm_pages_total += slot->pages; ++ ++ return 0; ++} ++ ++/* ++ * Called after vma is unlinked from its mm ++ */ ++void uksm_remove_vma(struct vm_area_struct *vma) ++{ ++ struct vma_slot *slot; ++ ++ if (!vma->uksm_vma_slot) ++ return; ++ ++ spin_lock(&vma_slot_list_lock); ++ slot = vma->uksm_vma_slot; ++ if (!slot) ++ goto out; ++ ++ if (slot_in_uksm(slot)) { ++ /** ++ * This slot has been added by ksmd, so move to the del list ++ * waiting ksmd to free it. ++ */ ++ list_add_tail(&slot->slot_list, &vma_slot_del); ++ } else { ++ /** ++ * It's still on new list. It's ok to free slot directly. ++ */ ++ list_del(&slot->slot_list); ++ free_vma_slot(slot); ++ } ++out: ++ vma->uksm_vma_slot = NULL; ++ spin_unlock(&vma_slot_list_lock); ++} ++ ++/** ++ * Need to do two things: ++ * 1. check if slot was moved to del list ++ * 2. make sure the mmap_sem is manipulated under valid vma. ++ * ++ * My concern here is that in some cases, this may make ++ * vma_slot_list_lock() waiters to serialized further by some ++ * sem->wait_lock, can this really be expensive? ++ * ++ * ++ * @return ++ * 0: if successfully locked mmap_sem ++ * -ENOENT: this slot was moved to del list ++ * -EBUSY: vma lock failed ++ */ ++static int try_down_read_slot_mmap_sem(struct vma_slot *slot) ++{ ++ struct vm_area_struct *vma; ++ struct mm_struct *mm; ++ struct rw_semaphore *sem; ++ ++ spin_lock(&vma_slot_list_lock); ++ ++ /* the slot_list was removed and inited from new list, when it enters ++ * uksm_list. 
If it is not empty now, it must have been moved to the del list
++ */
++ if (!slot_in_uksm(slot)) {
++ spin_unlock(&vma_slot_list_lock);
++ return -ENOENT;
++ }
++
++ BUG_ON(slot->pages != vma_pages(slot->vma));
++ /* Ok, vma still valid */
++ vma = slot->vma;
++ mm = vma->vm_mm;
++ sem = &mm->mmap_sem;
++
++ if (uksm_test_exit(mm)) {
++ spin_unlock(&vma_slot_list_lock);
++ return -ENOENT;
++ }
++
++ if (down_read_trylock(sem)) {
++ spin_unlock(&vma_slot_list_lock);
++ if (slot_pool_alloc(slot)) {
++ uksm_remove_vma(vma);
++ up_read(sem);
++ return -ENOENT;
++ }
++ return 0;
++ }
++
++ spin_unlock(&vma_slot_list_lock);
++ return -EBUSY;
++}
++
++static inline unsigned long
++vma_page_address(struct page *page, struct vm_area_struct *vma)
++{
++ pgoff_t pgoff = page->index;
++ unsigned long address;
++
++ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
++ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
++ /* page should be within @vma mapping range */
++ return -EFAULT;
++ }
++ return address;
++}
++
++
++/* return 0 on success with the item's mmap_sem locked */
++static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
++{
++ struct mm_struct *mm;
++ struct vma_slot *slot = item->slot;
++ int err = -EINVAL;
++
++ struct page *page;
++
++ /*
++ * try_down_read_slot_mmap_sem() returns non-zero if the slot
++ * has been removed by uksm_remove_vma().
++ */
++ if (try_down_read_slot_mmap_sem(slot))
++ return -EBUSY;
++
++ mm = slot->vma->vm_mm;
++
++ if (uksm_test_exit(mm))
++ goto failout_up;
++
++ page = item->page;
++ rcu_read_lock();
++ if (!get_page_unless_zero(page)) {
++ rcu_read_unlock();
++ goto failout_up;
++ }
++
++ /* No need to consider huge page here. */
++ if (item->slot->vma->anon_vma != page_anon_vma(page) ||
++ vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
++ /*
++ * TODO:
++ * should we release this item because of its stale page
++ * mapping?
++ */
++ put_page(page);
++ rcu_read_unlock();
++ goto failout_up;
++ }
++ rcu_read_unlock();
++ return 0;
++
++failout_up:
++ up_read(&mm->mmap_sem);
++ return err;
++}
++
++/*
++ * What kind of VMA is considered?
++ */
++static inline int vma_can_enter(struct vm_area_struct *vma)
++{
++ return uksm_flags_can_scan(vma->vm_flags);
++}
++
++/*
++ * Called whenever a fresh new vma is created. A new vma_slot
++ * is created and inserted into a global list. Must be called
++ * after the vma is inserted into its mm.
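++ *
++ * The new slot is only parked on the vma_slot_new list here; ksmd
++ * adopts it into its scan rungs later.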
++ */ ++void uksm_vma_add_new(struct vm_area_struct *vma) ++{ ++ struct vma_slot *slot; ++ ++ if (!vma_can_enter(vma)) { ++ vma->uksm_vma_slot = NULL; ++ return; ++ } ++ ++ slot = alloc_vma_slot(); ++ if (!slot) { ++ vma->uksm_vma_slot = NULL; ++ return; ++ } ++ ++ vma->uksm_vma_slot = slot; ++ vma->vm_flags |= VM_MERGEABLE; ++ slot->vma = vma; ++ slot->mm = vma->vm_mm; ++ slot->ctime_j = jiffies; ++ slot->pages = vma_pages(vma); ++ spin_lock(&vma_slot_list_lock); ++ list_add_tail(&slot->slot_list, &vma_slot_new); ++ spin_unlock(&vma_slot_list_lock); ++} ++ ++/* 32/3 < they < 32/2 */ ++#define shiftl 8 ++#define shiftr 12 ++ ++#define HASH_FROM_TO(from, to) \ ++for (index = from; index < to; index++) { \ ++ pos = random_nums[index]; \ ++ hash += key[pos]; \ ++ hash += (hash << shiftl); \ ++ hash ^= (hash >> shiftr); \ ++} ++ ++ ++#define HASH_FROM_DOWN_TO(from, to) \ ++for (index = from - 1; index >= to; index--) { \ ++ hash ^= (hash >> shiftr); \ ++ hash ^= (hash >> (shiftr*2)); \ ++ hash -= (hash << shiftl); \ ++ hash += (hash << (shiftl*2)); \ ++ pos = random_nums[index]; \ ++ hash -= key[pos]; \ ++} ++ ++/* ++ * The main random sample hash function. ++ */ ++static u32 random_sample_hash(void *addr, u32 hash_strength) ++{ ++ u32 hash = 0xdeadbeef; ++ int index, pos, loop = hash_strength; ++ u32 *key = (u32 *)addr; ++ ++ if (loop > HASH_STRENGTH_FULL) ++ loop = HASH_STRENGTH_FULL; ++ ++ HASH_FROM_TO(0, loop); ++ ++ if (hash_strength > HASH_STRENGTH_FULL) { ++ loop = hash_strength - HASH_STRENGTH_FULL; ++ HASH_FROM_TO(0, loop); ++ } ++ ++ return hash; ++} ++ ++ ++/** ++ * It's used when hash strength is adjusted ++ * ++ * @addr The page's virtual address ++ * @from The original hash strength ++ * @to The hash strength changed to ++ * @hash The hash value generated with "from" hash value ++ * ++ * return the hash value ++ */ ++static u32 delta_hash(void *addr, int from, int to, u32 hash) ++{ ++ u32 *key = (u32 *)addr; ++ int index, pos; /* make sure they are int type */ ++ ++ if (to > from) { ++ if (from >= HASH_STRENGTH_FULL) { ++ from -= HASH_STRENGTH_FULL; ++ to -= HASH_STRENGTH_FULL; ++ HASH_FROM_TO(from, to); ++ } else if (to <= HASH_STRENGTH_FULL) { ++ HASH_FROM_TO(from, to); ++ } else { ++ HASH_FROM_TO(from, HASH_STRENGTH_FULL); ++ HASH_FROM_TO(0, to - HASH_STRENGTH_FULL); ++ } ++ } else { ++ if (from <= HASH_STRENGTH_FULL) { ++ HASH_FROM_DOWN_TO(from, to); ++ } else if (to >= HASH_STRENGTH_FULL) { ++ from -= HASH_STRENGTH_FULL; ++ to -= HASH_STRENGTH_FULL; ++ HASH_FROM_DOWN_TO(from, to); ++ } else { ++ HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0); ++ HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to); ++ } ++ } ++ ++ return hash; ++} ++ ++/** ++ * ++ * Called when: rshash_pos or rshash_neg is about to overflow or a scan round ++ * has finished. ++ * ++ * return 0 if no page has been scanned since last call, 1 otherwise. 
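++ *
++ * benefit.pos, benefit.neg and benefit.scanned share one scale factor
++ * (benefit.base); when any of them is about to overflow, all three are
++ * halved and the base is bumped, which preserves their ratios.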
++ */ ++static inline int encode_benefit(void) ++{ ++ u64 scanned_delta, pos_delta, neg_delta; ++ unsigned long base = benefit.base; ++ ++ scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last; ++ ++ if (!scanned_delta) ++ return 0; ++ ++ scanned_delta >>= base; ++ pos_delta = rshash_pos >> base; ++ neg_delta = rshash_neg >> base; ++ ++ if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) || ++ CAN_OVERFLOW_U64(benefit.neg, neg_delta) || ++ CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) { ++ benefit.scanned >>= 1; ++ benefit.neg >>= 1; ++ benefit.pos >>= 1; ++ benefit.base++; ++ scanned_delta >>= 1; ++ pos_delta >>= 1; ++ neg_delta >>= 1; ++ } ++ ++ benefit.pos += pos_delta; ++ benefit.neg += neg_delta; ++ benefit.scanned += scanned_delta; ++ ++ BUG_ON(!benefit.scanned); ++ ++ rshash_pos = rshash_neg = 0; ++ uksm_pages_scanned_last = uksm_pages_scanned; ++ ++ return 1; ++} ++ ++static inline void reset_benefit(void) ++{ ++ benefit.pos = 0; ++ benefit.neg = 0; ++ benefit.base = 0; ++ benefit.scanned = 0; ++} ++ ++static inline void inc_rshash_pos(unsigned long delta) ++{ ++ if (CAN_OVERFLOW_U64(rshash_pos, delta)) ++ encode_benefit(); ++ ++ rshash_pos += delta; ++} ++ ++static inline void inc_rshash_neg(unsigned long delta) ++{ ++ if (CAN_OVERFLOW_U64(rshash_neg, delta)) ++ encode_benefit(); ++ ++ rshash_neg += delta; ++} ++ ++ ++static inline u32 page_hash(struct page *page, unsigned long hash_strength, ++ int cost_accounting) ++{ ++ u32 val; ++ unsigned long delta; ++ ++ void *addr = kmap_atomic(page); ++ ++ val = random_sample_hash(addr, hash_strength); ++ kunmap_atomic(addr); ++ ++ if (cost_accounting) { ++ if (hash_strength < HASH_STRENGTH_FULL) ++ delta = HASH_STRENGTH_FULL - hash_strength; ++ else ++ delta = 0; ++ ++ inc_rshash_pos(delta); ++ } ++ ++ return val; ++} ++ ++static int memcmp_pages(struct page *page1, struct page *page2, ++ int cost_accounting) ++{ ++ char *addr1, *addr2; ++ int ret; ++ ++ addr1 = kmap_atomic(page1); ++ addr2 = kmap_atomic(page2); ++ ret = memcmp(addr1, addr2, PAGE_SIZE); ++ kunmap_atomic(addr2); ++ kunmap_atomic(addr1); ++ ++ if (cost_accounting) ++ inc_rshash_neg(memcmp_cost); ++ ++ return ret; ++} ++ ++static inline int pages_identical(struct page *page1, struct page *page2) ++{ ++ return !memcmp_pages(page1, page2, 0); ++} ++ ++static inline int is_page_full_zero(struct page *page) ++{ ++ char *addr; ++ int ret; ++ ++ addr = kmap_atomic(page); ++ ret = is_full_zero(addr, PAGE_SIZE); ++ kunmap_atomic(addr); ++ ++ return ret; ++} ++ ++static int write_protect_page(struct vm_area_struct *vma, struct page *page, ++ pte_t *orig_pte, pte_t *old_pte) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ struct page_vma_mapped_walk pvmw = { ++ .page = page, ++ .vma = vma, ++ }; ++ int swapped; ++ int err = -EFAULT; ++ unsigned long mmun_start; /* For mmu_notifiers */ ++ unsigned long mmun_end; /* For mmu_notifiers */ ++ ++ pvmw.address = page_address_in_vma(page, vma); ++ if (pvmw.address == -EFAULT) ++ goto out; ++ ++ BUG_ON(PageTransCompound(page)); ++ ++ mmun_start = pvmw.address; ++ mmun_end = pvmw.address + PAGE_SIZE; ++ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ++ ++ if (!page_vma_mapped_walk(&pvmw)) ++ goto out_mn; ++ if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) ++ goto out_unlock; ++ ++ if (old_pte) ++ *old_pte = *pvmw.pte; ++ ++ if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || ++ (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || mm_tlb_flush_pending(mm)) { ++ pte_t entry; ++ ++ swapped = PageSwapCache(page); ++ 
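/*
++ * swapped is sampled before the pte is cleared below: a page in
++ * swapcache holds one extra reference that the refcount check must
++ * account for.
++ */
++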
flush_cache_page(vma, pvmw.address, page_to_pfn(page));
++ /*
++ * Ok this is tricky, when get_user_pages_fast() runs it doesn't
++ * take any lock, therefore the check that we are going to make
++ * with the pagecount against the mapcount is racy and
++ * O_DIRECT can happen right after the check.
++ * So we clear the pte and flush the tlb before the check;
++ * this assures us that no O_DIRECT can happen after the check
++ * or in the middle of the check.
++ */
++ entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
++ /*
++ * Check that no O_DIRECT or similar I/O is in progress on the
++ * page
++ */
++ if (page_mapcount(page) + 1 + swapped != page_count(page)) {
++ set_pte_at(mm, pvmw.address, pvmw.pte, entry);
++ goto out_unlock;
++ }
++ if (pte_dirty(entry))
++ set_page_dirty(page);
++
++ if (pte_protnone(entry))
++ entry = pte_mkclean(pte_clear_savedwrite(entry));
++ else
++ entry = pte_mkclean(pte_wrprotect(entry));
++
++ set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
++ }
++ *orig_pte = *pvmw.pte;
++ err = 0;
++
++out_unlock:
++ page_vma_mapped_walk_done(&pvmw);
++out_mn:
++ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
++out:
++ return err;
++}
++
++#define MERGE_ERR_PGERR 1 /* the page is invalid, cannot continue */
++#define MERGE_ERR_COLLI 2 /* there is a collision */
++#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */
++#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */
++
++
++/**
++ * replace_page - replace page in vma by new ksm page
++ * @vma: vma that holds the pte pointing to page
++ * @page: the page we are replacing by kpage
++ * @kpage: the ksm page we replace page by
++ * @orig_pte: the original value of the pte
++ *
++ * Returns 0 on success, MERGE_ERR_PGERR on failure.
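++ *
++ * Note that page contents are not compared here: the caller is expected
++ * to have checked pages_identical() (or is_page_full_zero() for the
++ * zero-page case) before replacing the pte.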
++ */ ++static int replace_page(struct vm_area_struct *vma, struct page *page, ++ struct page *kpage, pte_t orig_pte) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ pgd_t *pgd; ++ p4d_t *p4d; ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *ptep; ++ spinlock_t *ptl; ++ pte_t entry; ++ ++ unsigned long addr; ++ int err = MERGE_ERR_PGERR; ++ unsigned long mmun_start; /* For mmu_notifiers */ ++ unsigned long mmun_end; /* For mmu_notifiers */ ++ ++ addr = page_address_in_vma(page, vma); ++ if (addr == -EFAULT) ++ goto out; ++ ++ pgd = pgd_offset(mm, addr); ++ if (!pgd_present(*pgd)) ++ goto out; ++ ++ p4d = p4d_offset(pgd, addr); ++ pud = pud_offset(p4d, addr); ++ if (!pud_present(*pud)) ++ goto out; ++ ++ pmd = pmd_offset(pud, addr); ++ BUG_ON(pmd_trans_huge(*pmd)); ++ if (!pmd_present(*pmd)) ++ goto out; ++ ++ mmun_start = addr; ++ mmun_end = addr + PAGE_SIZE; ++ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ++ ++ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); ++ if (!pte_same(*ptep, orig_pte)) { ++ pte_unmap_unlock(ptep, ptl); ++ goto out_mn; ++ } ++ ++ flush_cache_page(vma, addr, pte_pfn(*ptep)); ++ ptep_clear_flush_notify(vma, addr, ptep); ++ entry = mk_pte(kpage, vma->vm_page_prot); ++ ++ /* special treatment is needed for zero_page */ ++ if ((page_to_pfn(kpage) == uksm_zero_pfn) || ++ (page_to_pfn(kpage) == zero_pfn)) { ++ entry = pte_mkspecial(entry); ++ dec_mm_counter(mm, MM_ANONPAGES); ++ inc_zone_page_state(page, NR_UKSM_ZERO_PAGES); ++ } else { ++ get_page(kpage); ++ page_add_anon_rmap(kpage, vma, addr, false); ++ } ++ ++ set_pte_at_notify(mm, addr, ptep, entry); ++ ++ page_remove_rmap(page, false); ++ if (!page_mapped(page)) ++ try_to_free_swap(page); ++ put_page(page); ++ ++ pte_unmap_unlock(ptep, ptl); ++ err = 0; ++out_mn: ++ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); ++out: ++ return err; ++} ++ ++ ++/** ++ * Fully hash a page with HASH_STRENGTH_MAX return a non-zero hash value. The ++ * zero hash value at HASH_STRENGTH_MAX is used to indicated that its ++ * hash_max member has not been calculated. ++ * ++ * @page The page needs to be hashed ++ * @hash_old The hash value calculated with current hash strength ++ * ++ * return the new hash value calculated at HASH_STRENGTH_MAX ++ */ ++static inline u32 page_hash_max(struct page *page, u32 hash_old) ++{ ++ u32 hash_max = 0; ++ void *addr; ++ ++ addr = kmap_atomic(page); ++ hash_max = delta_hash(addr, hash_strength, ++ HASH_STRENGTH_MAX, hash_old); ++ ++ kunmap_atomic(addr); ++ ++ if (!hash_max) ++ hash_max = 1; ++ ++ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); ++ return hash_max; ++} ++ ++/* ++ * We compare the hash again, to ensure that it is really a hash collision ++ * instead of being caused by page write. ++ */ ++static inline int check_collision(struct rmap_item *rmap_item, ++ u32 hash) ++{ ++ int err; ++ struct page *page = rmap_item->page; ++ ++ /* if this rmap_item has already been hash_maxed, then the collision ++ * must appears in the second-level rbtree search. In this case we check ++ * if its hash_max value has been changed. Otherwise, the collision ++ * happens in the first-level rbtree search, so we check against it's ++ * current hash value. 
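++ * Remember that hash_max == 0 still means "never computed":
++ * page_hash_max() maps a genuine zero result to 1.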
++ */ ++ if (rmap_item->hash_max) { ++ inc_rshash_neg(memcmp_cost); ++ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); ++ ++ if (rmap_item->hash_max == page_hash_max(page, hash)) ++ err = MERGE_ERR_COLLI; ++ else ++ err = MERGE_ERR_CHANGED; ++ } else { ++ inc_rshash_neg(memcmp_cost + hash_strength); ++ ++ if (page_hash(page, hash_strength, 0) == hash) ++ err = MERGE_ERR_COLLI; ++ else ++ err = MERGE_ERR_CHANGED; ++ } ++ ++ return err; ++} ++ ++/** ++ * Try to merge a rmap_item.page with a kpage in stable node. kpage must ++ * already be a ksm page. ++ * ++ * @return 0 if the pages were merged, -EFAULT otherwise. ++ */ ++static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item, ++ struct page *kpage, u32 hash) ++{ ++ struct vm_area_struct *vma = rmap_item->slot->vma; ++ struct mm_struct *mm = vma->vm_mm; ++ pte_t orig_pte = __pte(0); ++ int err = MERGE_ERR_PGERR; ++ struct page *page; ++ ++ if (uksm_test_exit(mm)) ++ goto out; ++ ++ page = rmap_item->page; ++ ++ if (page == kpage) { /* ksm page forked */ ++ err = 0; ++ goto out; ++ } ++ ++ /* ++ * We need the page lock to read a stable PageSwapCache in ++ * write_protect_page(). We use trylock_page() instead of ++ * lock_page() because we don't want to wait here - we ++ * prefer to continue scanning and merging different pages, ++ * then come back to this page when it is unlocked. ++ */ ++ if (!trylock_page(page)) ++ goto out; ++ ++ if (!PageAnon(page) || !PageKsm(kpage)) ++ goto out_unlock; ++ ++ if (PageTransCompound(page)) { ++ err = split_huge_page(page); ++ if (err) ++ goto out_unlock; ++ } ++ ++ /* ++ * If this anonymous page is mapped only here, its pte may need ++ * to be write-protected. If it's mapped elsewhere, all of its ++ * ptes are necessarily already write-protected. But in either ++ * case, we need to lock and check page_count is not raised. ++ */ ++ if (write_protect_page(vma, page, &orig_pte, NULL) == 0) { ++ if (pages_identical(page, kpage)) ++ err = replace_page(vma, page, kpage, orig_pte); ++ else ++ err = check_collision(rmap_item, hash); ++ } ++ ++ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { ++ munlock_vma_page(page); ++ if (!PageMlocked(kpage)) { ++ unlock_page(page); ++ lock_page(kpage); ++ mlock_vma_page(kpage); ++ page = kpage; /* for final unlock */ ++ } ++ } ++ ++out_unlock: ++ unlock_page(page); ++out: ++ return err; ++} ++ ++ ++ ++/** ++ * If two pages fail to merge in try_to_merge_two_pages, then we have a chance ++ * to restore a page mapping that has been changed in try_to_merge_two_pages. ++ * ++ * @return 0 on success. ++ */ ++static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr, ++ pte_t orig_pte, pte_t wprt_pte) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ pgd_t *pgd; ++ p4d_t *p4d; ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *ptep; ++ spinlock_t *ptl; ++ ++ int err = -EFAULT; ++ ++ pgd = pgd_offset(mm, addr); ++ if (!pgd_present(*pgd)) ++ goto out; ++ ++ p4d = p4d_offset(pgd, addr); ++ pud = pud_offset(p4d, addr); ++ if (!pud_present(*pud)) ++ goto out; ++ ++ pmd = pmd_offset(pud, addr); ++ if (!pmd_present(*pmd)) ++ goto out; ++ ++ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); ++ if (!pte_same(*ptep, wprt_pte)) { ++ /* already copied, let it be */ ++ pte_unmap_unlock(ptep, ptl); ++ goto out; ++ } ++ ++ /* ++ * Good boy, still here. When we still get the ksm page, it does not ++ * return to the free page pool, there is no way that a pte was changed ++ * to other page and gets back to this page. And remind that ksm page ++ * do not reuse in do_wp_page(). 
So it's safe to restore the original ++ * pte. ++ */ ++ flush_cache_page(vma, addr, pte_pfn(*ptep)); ++ ptep_clear_flush_notify(vma, addr, ptep); ++ set_pte_at_notify(mm, addr, ptep, orig_pte); ++ ++ pte_unmap_unlock(ptep, ptl); ++ err = 0; ++out: ++ return err; ++} ++ ++/** ++ * try_to_merge_two_pages() - take two identical pages and prepare ++ * them to be merged into one page(rmap_item->page) ++ * ++ * @return 0 if we successfully merged two identical pages into ++ * one ksm page. MERGE_ERR_COLLI if it's only a hash collision ++ * search in rbtree. MERGE_ERR_CHANGED if rmap_item has been ++ * changed since it's hashed. MERGE_ERR_PGERR otherwise. ++ * ++ */ ++static int try_to_merge_two_pages(struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ u32 hash) ++{ ++ pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0); ++ pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0); ++ struct vm_area_struct *vma1 = rmap_item->slot->vma; ++ struct vm_area_struct *vma2 = tree_rmap_item->slot->vma; ++ struct page *page = rmap_item->page; ++ struct page *tree_page = tree_rmap_item->page; ++ int err = MERGE_ERR_PGERR; ++ struct address_space *saved_mapping; ++ ++ ++ if (rmap_item->page == tree_rmap_item->page) ++ goto out; ++ ++ if (!trylock_page(page)) ++ goto out; ++ ++ if (!PageAnon(page)) ++ goto out_unlock; ++ ++ if (PageTransCompound(page)) { ++ err = split_huge_page(page); ++ if (err) ++ goto out_unlock; ++ } ++ ++ if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) { ++ unlock_page(page); ++ goto out; ++ } ++ ++ /* ++ * While we hold page lock, upgrade page from ++ * PageAnon+anon_vma to PageKsm+NULL stable_node: ++ * stable_tree_insert() will update stable_node. ++ */ ++ saved_mapping = page->mapping; ++ set_page_stable_node(page, NULL); ++ mark_page_accessed(page); ++ if (!PageDirty(page)) ++ SetPageDirty(page); ++ ++ unlock_page(page); ++ ++ if (!trylock_page(tree_page)) ++ goto restore_out; ++ ++ if (!PageAnon(tree_page)) { ++ unlock_page(tree_page); ++ goto restore_out; ++ } ++ ++ if (PageTransCompound(tree_page)) { ++ err = split_huge_page(tree_page); ++ if (err) { ++ unlock_page(tree_page); ++ goto restore_out; ++ } ++ } ++ ++ if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) { ++ unlock_page(tree_page); ++ goto restore_out; ++ } ++ ++ if (pages_identical(page, tree_page)) { ++ err = replace_page(vma2, tree_page, page, wprt_pte2); ++ if (err) { ++ unlock_page(tree_page); ++ goto restore_out; ++ } ++ ++ if ((vma2->vm_flags & VM_LOCKED)) { ++ munlock_vma_page(tree_page); ++ if (!PageMlocked(page)) { ++ unlock_page(tree_page); ++ lock_page(page); ++ mlock_vma_page(page); ++ tree_page = page; /* for final unlock */ ++ } ++ } ++ ++ unlock_page(tree_page); ++ ++ goto out; /* success */ ++ ++ } else { ++ if (tree_rmap_item->hash_max && ++ tree_rmap_item->hash_max == rmap_item->hash_max) { ++ err = MERGE_ERR_COLLI_MAX; ++ } else if (page_hash(page, hash_strength, 0) == ++ page_hash(tree_page, hash_strength, 0)) { ++ inc_rshash_neg(memcmp_cost + hash_strength * 2); ++ err = MERGE_ERR_COLLI; ++ } else { ++ err = MERGE_ERR_CHANGED; ++ } ++ ++ unlock_page(tree_page); ++ } ++ ++restore_out: ++ lock_page(page); ++ if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item), ++ orig_pte1, wprt_pte1)) ++ page->mapping = saved_mapping; ++ ++out_unlock: ++ unlock_page(page); ++out: ++ return err; ++} ++ ++static inline int hash_cmp(u32 new_val, u32 node_val) ++{ ++ if (new_val > node_val) ++ return 1; ++ else if (new_val < node_val) ++ return -1; ++ else ++ return 0; 
++} ++ ++static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash) ++{ ++ u32 hash_max = item->hash_max; ++ ++ if (!hash_max) { ++ hash_max = page_hash_max(item->page, hash); ++ ++ item->hash_max = hash_max; ++ } ++ ++ return hash_max; ++} ++ ++ ++ ++/** ++ * stable_tree_search() - search the stable tree for a page ++ * ++ * @item: the rmap_item we are comparing with ++ * @hash: the hash value of this item->page already calculated ++ * ++ * @return the page we have found, NULL otherwise. The page returned has ++ * been gotten. ++ */ ++static struct page *stable_tree_search(struct rmap_item *item, u32 hash) ++{ ++ struct rb_node *node = root_stable_treep->rb_node; ++ struct tree_node *tree_node; ++ unsigned long hash_max; ++ struct page *page = item->page; ++ struct stable_node *stable_node; ++ ++ stable_node = page_stable_node(page); ++ if (stable_node) { ++ /* ksm page forked, that is ++ * if (PageKsm(page) && !in_stable_tree(rmap_item)) ++ * it's actually gotten once outside. ++ */ ++ get_page(page); ++ return page; ++ } ++ ++ while (node) { ++ int cmp; ++ ++ tree_node = rb_entry(node, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) ++ node = node->rb_left; ++ else if (cmp > 0) ++ node = node->rb_right; ++ else ++ break; ++ } ++ ++ if (!node) ++ return NULL; ++ ++ if (tree_node->count == 1) { ++ stable_node = rb_entry(tree_node->sub_root.rb_node, ++ struct stable_node, node); ++ BUG_ON(!stable_node); ++ ++ goto get_page_out; ++ } ++ ++ /* ++ * ok, we have to search the second ++ * level subtree, hash the page to a ++ * full strength. ++ */ ++ node = tree_node->sub_root.rb_node; ++ BUG_ON(!node); ++ hash_max = rmap_item_hash_max(item, hash); ++ ++ while (node) { ++ int cmp; ++ ++ stable_node = rb_entry(node, struct stable_node, node); ++ ++ cmp = hash_cmp(hash_max, stable_node->hash_max); ++ ++ if (cmp < 0) ++ node = node->rb_left; ++ else if (cmp > 0) ++ node = node->rb_right; ++ else ++ goto get_page_out; ++ } ++ ++ return NULL; ++ ++get_page_out: ++ page = get_uksm_page(stable_node, 1, 1); ++ return page; ++} ++ ++static int try_merge_rmap_item(struct rmap_item *item, ++ struct page *kpage, ++ struct page *tree_page) ++{ ++ struct vm_area_struct *vma = item->slot->vma; ++ struct page_vma_mapped_walk pvmw = { ++ .page = kpage, ++ .vma = vma, ++ }; ++ ++ pvmw.address = get_rmap_addr(item); ++ if (!page_vma_mapped_walk(&pvmw)) ++ return 0; ++ ++ if (pte_write(*pvmw.pte)) { ++ /* has changed, abort! */ ++ page_vma_mapped_walk_done(&pvmw); ++ return 0; ++ } ++ ++ get_page(tree_page); ++ page_add_anon_rmap(tree_page, vma, pvmw.address, false); ++ ++ flush_cache_page(vma, pvmw.address, page_to_pfn(kpage)); ++ ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); ++ set_pte_at_notify(vma->vm_mm, pvmw.address, pvmw.pte, ++ mk_pte(tree_page, vma->vm_page_prot)); ++ ++ page_remove_rmap(kpage, false); ++ put_page(kpage); ++ ++ page_vma_mapped_walk_done(&pvmw); ++ ++ return 1; ++} ++ ++/** ++ * try_to_merge_with_stable_page() - when two rmap_items need to be inserted ++ * into stable tree, the page was found to be identical to a stable ksm page, ++ * this is the last chance we can merge them into one. ++ * ++ * @item1: the rmap_item holding the page which we wanted to insert ++ * into stable tree. 
++ * @item2: the other rmap_item we found during the unstable tree search
++ * @oldpage: the page currently mapped by the two rmap_items
++ * @tree_page: the page we found identical in stable tree node
++ * @success1: return whether item1 was successfully merged
++ * @success2: return whether item2 was successfully merged
++ */
++static void try_merge_with_stable(struct rmap_item *item1,
++ struct rmap_item *item2,
++ struct page **kpage,
++ struct page *tree_page,
++ int *success1, int *success2)
++{
++ struct vm_area_struct *vma1 = item1->slot->vma;
++ struct vm_area_struct *vma2 = item2->slot->vma;
++ *success1 = 0;
++ *success2 = 0;
++
++ if (unlikely(*kpage == tree_page)) {
++ /* I don't think this can really happen */
++ pr_warn("UKSM: unexpected condition detected in "
++ "%s -- *kpage == tree_page !\n", __func__);
++ *success1 = 1;
++ *success2 = 1;
++ return;
++ }
++
++ if (!PageAnon(*kpage) || !PageKsm(*kpage))
++ goto failed;
++
++ if (!trylock_page(tree_page))
++ goto failed;
++
++ /* If the oldpage is still ksm and still pointed
++ * to in the right place, and still write protected,
++ * we are confident it's not changed, no need to
++ * memcmp anymore.
++ * Beware: we cannot take nested pte locks,
++ * deadlock risk.
++ */
++ if (!try_merge_rmap_item(item1, *kpage, tree_page))
++ goto unlock_failed;
++
++ /* ok, then vma2; note that pte1 is already set */
++ if (!try_merge_rmap_item(item2, *kpage, tree_page))
++ goto success_1;
++
++ *success2 = 1;
++success_1:
++ *success1 = 1;
++
++
++ if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
++ (*success2 && vma2->vm_flags & VM_LOCKED)) {
++ munlock_vma_page(*kpage);
++ if (!PageMlocked(tree_page))
++ mlock_vma_page(tree_page);
++ }
++
++ /*
++ * We do not need oldpage any more in the caller, so we can break the
++ * lock now.
++ */
++ unlock_page(*kpage);
++ *kpage = tree_page; /* Get unlocked outside.
*/ ++ return; ++ ++unlock_failed: ++ unlock_page(tree_page); ++failed: ++ return; ++} ++ ++static inline void stable_node_hash_max(struct stable_node *node, ++ struct page *page, u32 hash) ++{ ++ u32 hash_max = node->hash_max; ++ ++ if (!hash_max) { ++ hash_max = page_hash_max(page, hash); ++ node->hash_max = hash_max; ++ } ++} ++ ++static inline ++struct stable_node *new_stable_node(struct tree_node *tree_node, ++ struct page *kpage, u32 hash_max) ++{ ++ struct stable_node *new_stable_node; ++ ++ new_stable_node = alloc_stable_node(); ++ if (!new_stable_node) ++ return NULL; ++ ++ new_stable_node->kpfn = page_to_pfn(kpage); ++ new_stable_node->hash_max = hash_max; ++ new_stable_node->tree_node = tree_node; ++ set_page_stable_node(kpage, new_stable_node); ++ ++ return new_stable_node; ++} ++ ++static inline ++struct stable_node *first_level_insert(struct tree_node *tree_node, ++ struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ struct page **kpage, u32 hash, ++ int *success1, int *success2) ++{ ++ int cmp; ++ struct page *tree_page; ++ u32 hash_max = 0; ++ struct stable_node *stable_node, *new_snode; ++ struct rb_node *parent = NULL, **new; ++ ++ /* this tree node contains no sub-tree yet */ ++ stable_node = rb_entry(tree_node->sub_root.rb_node, ++ struct stable_node, node); ++ ++ tree_page = get_uksm_page(stable_node, 1, 0); ++ if (tree_page) { ++ cmp = memcmp_pages(*kpage, tree_page, 1); ++ if (!cmp) { ++ try_merge_with_stable(rmap_item, tree_rmap_item, kpage, ++ tree_page, success1, success2); ++ put_page(tree_page); ++ if (!*success1 && !*success2) ++ goto failed; ++ ++ return stable_node; ++ ++ } else { ++ /* ++ * collision in first level try to create a subtree. ++ * A new node need to be created. ++ */ ++ put_page(tree_page); ++ ++ stable_node_hash_max(stable_node, tree_page, ++ tree_node->hash); ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ cmp = hash_cmp(hash_max, stable_node->hash_max); ++ ++ parent = &stable_node->node; ++ if (cmp < 0) ++ new = &parent->rb_left; ++ else if (cmp > 0) ++ new = &parent->rb_right; ++ else ++ goto failed; ++ } ++ ++ } else { ++ /* the only stable_node deleted, we reuse its tree_node. 
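++ * Its sub-root is empty now, so the new node will be linked at the
++ * root of the sub-tree.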
++ */ ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ ++ new_snode = new_stable_node(tree_node, *kpage, hash_max); ++ if (!new_snode) ++ goto failed; ++ ++ rb_link_node(&new_snode->node, parent, new); ++ rb_insert_color(&new_snode->node, &tree_node->sub_root); ++ tree_node->count++; ++ *success1 = *success2 = 1; ++ ++ return new_snode; ++ ++failed: ++ return NULL; ++} ++ ++static inline ++struct stable_node *stable_subtree_insert(struct tree_node *tree_node, ++ struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ struct page **kpage, u32 hash, ++ int *success1, int *success2) ++{ ++ struct page *tree_page; ++ u32 hash_max; ++ struct stable_node *stable_node, *new_snode; ++ struct rb_node *parent, **new; ++ ++research: ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ BUG_ON(!*new); ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ while (*new) { ++ int cmp; ++ ++ stable_node = rb_entry(*new, struct stable_node, node); ++ ++ cmp = hash_cmp(hash_max, stable_node->hash_max); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else { ++ tree_page = get_uksm_page(stable_node, 1, 0); ++ if (tree_page) { ++ cmp = memcmp_pages(*kpage, tree_page, 1); ++ if (!cmp) { ++ try_merge_with_stable(rmap_item, ++ tree_rmap_item, kpage, ++ tree_page, success1, success2); ++ ++ put_page(tree_page); ++ if (!*success1 && !*success2) ++ goto failed; ++ /* ++ * successfully merged with a stable ++ * node ++ */ ++ return stable_node; ++ } else { ++ put_page(tree_page); ++ goto failed; ++ } ++ } else { ++ /* ++ * stable node may be deleted, ++ * and subtree maybe ++ * restructed, cannot ++ * continue, research it. ++ */ ++ if (tree_node->count) { ++ goto research; ++ } else { ++ /* reuse the tree node*/ ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ } ++ } ++ } ++ ++ new_snode = new_stable_node(tree_node, *kpage, hash_max); ++ if (!new_snode) ++ goto failed; ++ ++ rb_link_node(&new_snode->node, parent, new); ++ rb_insert_color(&new_snode->node, &tree_node->sub_root); ++ tree_node->count++; ++ *success1 = *success2 = 1; ++ ++ return new_snode; ++ ++failed: ++ return NULL; ++} ++ ++ ++/** ++ * stable_tree_insert() - try to insert a merged page in unstable tree to ++ * the stable tree ++ * ++ * @kpage: the page need to be inserted ++ * @hash: the current hash of this page ++ * @rmap_item: the rmap_item being scanned ++ * @tree_rmap_item: the rmap_item found on unstable tree ++ * @success1: return if rmap_item is merged ++ * @success2: return if tree_rmap_item is merged ++ * ++ * @return the stable_node on stable tree if at least one ++ * rmap_item is inserted into stable tree, NULL ++ * otherwise. 
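++ *
++ * When no first-level tree_node matches the hash, a fresh tree_node and
++ * stable node are created; hash_max is left 0 (not yet computed) in
++ * that case.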
++ */ ++static struct stable_node * ++stable_tree_insert(struct page **kpage, u32 hash, ++ struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ int *success1, int *success2) ++{ ++ struct rb_node **new = &root_stable_treep->rb_node; ++ struct rb_node *parent = NULL; ++ struct stable_node *stable_node; ++ struct tree_node *tree_node; ++ u32 hash_max = 0; ++ ++ *success1 = *success2 = 0; ++ ++ while (*new) { ++ int cmp; ++ ++ tree_node = rb_entry(*new, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else ++ break; ++ } ++ ++ if (*new) { ++ if (tree_node->count == 1) { ++ stable_node = first_level_insert(tree_node, rmap_item, ++ tree_rmap_item, kpage, ++ hash, success1, success2); ++ } else { ++ stable_node = stable_subtree_insert(tree_node, ++ rmap_item, tree_rmap_item, kpage, ++ hash, success1, success2); ++ } ++ } else { ++ ++ /* no tree node found */ ++ tree_node = alloc_tree_node(stable_tree_node_listp); ++ if (!tree_node) { ++ stable_node = NULL; ++ goto out; ++ } ++ ++ stable_node = new_stable_node(tree_node, *kpage, hash_max); ++ if (!stable_node) { ++ free_tree_node(tree_node); ++ goto out; ++ } ++ ++ tree_node->hash = hash; ++ rb_link_node(&tree_node->node, parent, new); ++ rb_insert_color(&tree_node->node, root_stable_treep); ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ ++ rb_link_node(&stable_node->node, parent, new); ++ rb_insert_color(&stable_node->node, &tree_node->sub_root); ++ tree_node->count++; ++ *success1 = *success2 = 1; ++ } ++ ++out: ++ return stable_node; ++} ++ ++ ++/** ++ * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem ++ * ++ * @return 0 on success, -EBUSY if unable to lock the mmap_sem, ++ * -EINVAL if the page mapping has been changed. ++ */ ++static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item) ++{ ++ int err; ++ ++ err = get_mergeable_page_lock_mmap(tree_rmap_item); ++ ++ if (err == -EINVAL) { ++ /* its page map has been changed, remove it */ ++ remove_rmap_item_from_tree(tree_rmap_item); ++ } ++ ++ /* The page is gotten and mmap_sem is locked now. */ ++ return err; ++} ++ ++ ++/** ++ * unstable_tree_search_insert() - search an unstable tree rmap_item with the ++ * same hash value. 
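If no match is found, the new rmap_item is inserted and NULL is returned.
++ * On a match: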
Get its page and trylock the mmap_sem
++ */
++static inline
++struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
++ u32 hash)
++
++{
++ struct rb_node **new = &root_unstable_tree.rb_node;
++ struct rb_node *parent = NULL;
++ struct tree_node *tree_node;
++ u32 hash_max;
++ struct rmap_item *tree_rmap_item;
++
++ while (*new) {
++ int cmp;
++
++ tree_node = rb_entry(*new, struct tree_node, node);
++
++ cmp = hash_cmp(hash, tree_node->hash);
++
++ if (cmp < 0) {
++ parent = *new;
++ new = &parent->rb_left;
++ } else if (cmp > 0) {
++ parent = *new;
++ new = &parent->rb_right;
++ } else
++ break;
++ }
++
++ if (*new) {
++ /* got the tree_node */
++ if (tree_node->count == 1) {
++ tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
++ struct rmap_item, node);
++ BUG_ON(!tree_rmap_item);
++
++ goto get_page_out;
++ }
++
++ /* well, search the collision subtree */
++ new = &tree_node->sub_root.rb_node;
++ BUG_ON(!*new);
++ hash_max = rmap_item_hash_max(rmap_item, hash);
++
++ while (*new) {
++ int cmp;
++
++ tree_rmap_item = rb_entry(*new, struct rmap_item,
++ node);
++
++ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
++ parent = *new;
++ if (cmp < 0)
++ new = &parent->rb_left;
++ else if (cmp > 0)
++ new = &parent->rb_right;
++ else
++ goto get_page_out;
++ }
++ } else {
++ /* alloc a new tree_node */
++ tree_node = alloc_tree_node(&unstable_tree_node_list);
++ if (!tree_node)
++ return NULL;
++
++ tree_node->hash = hash;
++ rb_link_node(&tree_node->node, parent, new);
++ rb_insert_color(&tree_node->node, &root_unstable_tree);
++ parent = NULL;
++ new = &tree_node->sub_root.rb_node;
++ }
++
++ /* not found even in the sub-tree */
++ rmap_item->tree_node = tree_node;
++ rmap_item->address |= UNSTABLE_FLAG;
++ rmap_item->hash_round = uksm_hash_round;
++ rb_link_node(&rmap_item->node, parent, new);
++ rb_insert_color(&rmap_item->node, &tree_node->sub_root);
++
++ uksm_pages_unshared++;
++ return NULL;
++
++get_page_out:
++ if (tree_rmap_item->page == rmap_item->page)
++ return NULL;
++
++ if (get_tree_rmap_item_page(tree_rmap_item))
++ return NULL;
++
++ return tree_rmap_item;
++}
++
++static void hold_anon_vma(struct rmap_item *rmap_item,
++ struct anon_vma *anon_vma)
++{
++ rmap_item->anon_vma = anon_vma;
++ get_anon_vma(anon_vma);
++}
++
++
++/**
++ * stable_tree_append() - append a rmap_item to a stable node. Deduplication
++ * ratio statistics are collected in this function.
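++ * node_vmas on a stable node's hlist are kept sorted by their slot key,
++ * which lets the walk below stop early.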
++ * ++ */ ++static void stable_tree_append(struct rmap_item *rmap_item, ++ struct stable_node *stable_node, int logdedup) ++{ ++ struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL; ++ unsigned long key = (unsigned long)rmap_item->slot; ++ unsigned long factor = rmap_item->slot->rung->step; ++ ++ BUG_ON(!stable_node); ++ rmap_item->address |= STABLE_FLAG; ++ ++ if (hlist_empty(&stable_node->hlist)) { ++ uksm_pages_shared++; ++ goto node_vma_new; ++ } else { ++ uksm_pages_sharing++; ++ } ++ ++ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { ++ if (node_vma->key >= key) ++ break; ++ ++ if (logdedup) { ++ node_vma->slot->pages_bemerged += factor; ++ if (list_empty(&node_vma->slot->dedup_list)) ++ list_add(&node_vma->slot->dedup_list, ++ &vma_slot_dedup); ++ } ++ } ++ ++ if (node_vma) { ++ if (node_vma->key == key) { ++ node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist); ++ goto node_vma_ok; ++ } else if (node_vma->key > key) { ++ node_vma_cont = node_vma; ++ } ++ } ++ ++node_vma_new: ++ /* no same vma already in node, alloc a new node_vma */ ++ new_node_vma = alloc_node_vma(); ++ BUG_ON(!new_node_vma); ++ new_node_vma->head = stable_node; ++ new_node_vma->slot = rmap_item->slot; ++ ++ if (!node_vma) { ++ hlist_add_head(&new_node_vma->hlist, &stable_node->hlist); ++ } else if (node_vma->key != key) { ++ if (node_vma->key < key) ++ hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist); ++ else { ++ hlist_add_before(&new_node_vma->hlist, ++ &node_vma->hlist); ++ } ++ ++ } ++ node_vma = new_node_vma; ++ ++node_vma_ok: /* ok, ready to add to the list */ ++ rmap_item->head = node_vma; ++ hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist); ++ hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma); ++ if (logdedup) { ++ rmap_item->slot->pages_merged++; ++ if (node_vma_cont) { ++ node_vma = node_vma_cont; ++ hlist_for_each_entry_continue(node_vma, hlist) { ++ node_vma->slot->pages_bemerged += factor; ++ if (list_empty(&node_vma->slot->dedup_list)) ++ list_add(&node_vma->slot->dedup_list, ++ &vma_slot_dedup); ++ } ++ } ++ } ++} ++ ++/* ++ * We use break_ksm to break COW on a ksm page: it's a stripped down ++ * ++ * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1) ++ * put_page(page); ++ * ++ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, ++ * in case the application has unmapped and remapped mm,addr meanwhile. ++ * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP ++ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. ++ */ ++static int break_ksm(struct vm_area_struct *vma, unsigned long addr) ++{ ++ struct page *page; ++ int ret = 0; ++ ++ do { ++ cond_resched(); ++ page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); ++ if (IS_ERR_OR_NULL(page)) ++ break; ++ if (PageKsm(page)) { ++ ret = handle_mm_fault(vma, addr, ++ FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE); ++ } else ++ ret = VM_FAULT_WRITE; ++ put_page(page); ++ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); ++ /* ++ * We must loop because handle_mm_fault() may back out if there's ++ * any difficulty e.g. if pte accessed bit gets updated concurrently. ++ * ++ * VM_FAULT_WRITE is what we have been hoping for: it indicates that ++ * COW has been broken, even if the vma does not permit VM_WRITE; ++ * but note that a concurrent fault might break PageKsm for us. 
++ * ++ * VM_FAULT_SIGBUS could occur if we race with truncation of the ++ * backing file, which also invalidates anonymous pages: that's ++ * okay, that truncation will have unmapped the PageKsm for us. ++ * ++ * VM_FAULT_OOM: at the time of writing (late July 2009), setting ++ * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the ++ * current task has TIF_MEMDIE set, and will be OOM killed on return ++ * to user; and ksmd, having no mm, would never be chosen for that. ++ * ++ * But if the mm is in a limited mem_cgroup, then the fault may fail ++ * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and ++ * even ksmd can fail in this way - though it's usually breaking ksm ++ * just to undo a merge it made a moment before, so unlikely to oom. ++ * ++ * That's a pity: we might therefore have more kernel pages allocated ++ * than we're counting as nodes in the stable tree; but uksm_do_scan ++ * will retry to break_cow on each pass, so should recover the page ++ * in due course. The important thing is to not let VM_MERGEABLE ++ * be cleared while any such pages might remain in the area. ++ */ ++ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; ++} ++ ++static void break_cow(struct rmap_item *rmap_item) ++{ ++ struct vm_area_struct *vma = rmap_item->slot->vma; ++ struct mm_struct *mm = vma->vm_mm; ++ unsigned long addr = get_rmap_addr(rmap_item); ++ ++ if (uksm_test_exit(mm)) ++ goto out; ++ ++ break_ksm(vma, addr); ++out: ++ return; ++} ++ ++/* ++ * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather ++ * than check every pte of a given vma, the locking doesn't quite work for ++ * that - an rmap_item is assigned to the stable tree after inserting ksm ++ * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing ++ * rmap_items from parent to child at fork time (so as not to waste time ++ * if exit comes before the next scan reaches it). ++ * ++ * Similarly, although we'd like to remove rmap_items (so updating counts ++ * and freeing memory) when unmerging an area, it's easier to leave that ++ * to the next pass of ksmd - consider, for example, how ksmd might be ++ * in cmp_and_merge_page on one of the rmap_items we would be removing. 
++ */ ++inline int unmerge_uksm_pages(struct vm_area_struct *vma, ++ unsigned long start, unsigned long end) ++{ ++ unsigned long addr; ++ int err = 0; ++ ++ for (addr = start; addr < end && !err; addr += PAGE_SIZE) { ++ if (uksm_test_exit(vma->vm_mm)) ++ break; ++ if (signal_pending(current)) ++ err = -ERESTARTSYS; ++ else ++ err = break_ksm(vma, addr); ++ } ++ return err; ++} ++ ++static inline void inc_uksm_pages_scanned(void) ++{ ++ u64 delta; ++ ++ ++ if (uksm_pages_scanned == U64_MAX) { ++ encode_benefit(); ++ ++ delta = uksm_pages_scanned >> pages_scanned_base; ++ ++ if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) { ++ pages_scanned_stored >>= 1; ++ delta >>= 1; ++ pages_scanned_base++; ++ } ++ ++ pages_scanned_stored += delta; ++ ++ uksm_pages_scanned = uksm_pages_scanned_last = 0; ++ } ++ ++ uksm_pages_scanned++; ++} ++ ++static inline int find_zero_page_hash(int strength, u32 hash) ++{ ++ return (zero_hash_table[strength] == hash); ++} ++ ++static ++int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page) ++{ ++ struct page *zero_page = empty_uksm_zero_page; ++ struct mm_struct *mm = vma->vm_mm; ++ pte_t orig_pte = __pte(0); ++ int err = -EFAULT; ++ ++ if (uksm_test_exit(mm)) ++ goto out; ++ ++ if (!trylock_page(page)) ++ goto out; ++ ++ if (!PageAnon(page)) ++ goto out_unlock; ++ ++ if (PageTransCompound(page)) { ++ err = split_huge_page(page); ++ if (err) ++ goto out_unlock; ++ } ++ ++ if (write_protect_page(vma, page, &orig_pte, 0) == 0) { ++ if (is_page_full_zero(page)) ++ err = replace_page(vma, page, zero_page, orig_pte); ++ } ++ ++out_unlock: ++ unlock_page(page); ++out: ++ return err; ++} ++ ++/* ++ * cmp_and_merge_page() - first see if page can be merged into the stable ++ * tree; if not, compare hash to previous and if it's the same, see if page ++ * can be inserted into the unstable tree, or merged with a page already there ++ * and both transferred to the stable tree. ++ * ++ * @page: the page that we are searching identical page to. ++ * @rmap_item: the reverse mapping into the virtual address of this page ++ */ ++static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash) ++{ ++ struct rmap_item *tree_rmap_item; ++ struct page *page; ++ struct page *kpage = NULL; ++ u32 hash_max; ++ int err; ++ unsigned int success1, success2; ++ struct stable_node *snode; ++ int cmp; ++ struct rb_node *parent = NULL, **new; ++ ++ remove_rmap_item_from_tree(rmap_item); ++ page = rmap_item->page; ++ ++ /* We first start with searching the page inside the stable tree */ ++ kpage = stable_tree_search(rmap_item, hash); ++ if (kpage) { ++ err = try_to_merge_with_uksm_page(rmap_item, kpage, ++ hash); ++ if (!err) { ++ /* ++ * The page was successfully merged, add ++ * its rmap_item to the stable tree. ++ * page lock is needed because it's ++ * racing with try_to_unmap_ksm(), etc. ++ */ ++ lock_page(kpage); ++ snode = page_stable_node(kpage); ++ stable_tree_append(rmap_item, snode, 1); ++ unlock_page(kpage); ++ put_page(kpage); ++ return; /* success */ ++ } ++ put_page(kpage); ++ ++ /* ++ * if it's a collision and it has been search in sub-rbtree ++ * (hash_max != 0), we want to abort, because if it is ++ * successfully merged in unstable tree, the collision trends to ++ * happen again. 
++ */
++ if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
++ return;
++ }
++
++ tree_rmap_item =
++ unstable_tree_search_insert(rmap_item, hash);
++ if (tree_rmap_item) {
++ err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
++ /*
++ * As soon as we merge this page, we want to remove the
++ * rmap_item of the page we have merged with from the unstable
++ * tree, and insert it instead as new node in the stable tree.
++ */
++ if (!err) {
++ kpage = page;
++ remove_rmap_item_from_tree(tree_rmap_item);
++ lock_page(kpage);
++ snode = stable_tree_insert(&kpage, hash,
++ rmap_item, tree_rmap_item,
++ &success1, &success2);
++
++ /*
++ * Do not log dedup for tree item, it's not counted as
++ * scanned in this round.
++ */
++ if (success2)
++ stable_tree_append(tree_rmap_item, snode, 0);
++
++ /*
++ * The order of these two stable appends is important:
++ * we are scanning rmap_item.
++ */
++ if (success1)
++ stable_tree_append(rmap_item, snode, 1);
++
++ /*
++ * The original kpage may be unlocked inside
++ * stable_tree_insert() already. This page
++ * should be unlocked before doing
++ * break_cow().
++ */
++ unlock_page(kpage);
++
++ if (!success1)
++ break_cow(rmap_item);
++
++ if (!success2)
++ break_cow(tree_rmap_item);
++
++ } else if (err == MERGE_ERR_COLLI) {
++ BUG_ON(tree_rmap_item->tree_node->count > 1);
++
++ rmap_item_hash_max(tree_rmap_item,
++ tree_rmap_item->tree_node->hash);
++
++ hash_max = rmap_item_hash_max(rmap_item, hash);
++ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
++ parent = &tree_rmap_item->node;
++ if (cmp < 0)
++ new = &parent->rb_left;
++ else if (cmp > 0)
++ new = &parent->rb_right;
++ else
++ goto put_up_out;
++
++ rmap_item->tree_node = tree_rmap_item->tree_node;
++ rmap_item->address |= UNSTABLE_FLAG;
++ rmap_item->hash_round = uksm_hash_round;
++ rb_link_node(&rmap_item->node, parent, new);
++ rb_insert_color(&rmap_item->node,
++ &tree_rmap_item->tree_node->sub_root);
++ rmap_item->tree_node->count++;
++ } else {
++ /*
++ * either one of the pages has changed, or they collide
++ * at the max hash strength; we consider them ill items.
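++ * Drop the ill tree item so it cannot be matched again in this
++ * scan round.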
++ */ ++ remove_rmap_item_from_tree(tree_rmap_item); ++ } ++put_up_out: ++ put_page(tree_rmap_item->page); ++ up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem); ++ } ++} ++ ++ ++ ++ ++static inline unsigned long get_pool_index(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT; ++ if (pool_index >= slot->pool_size) ++ BUG(); ++ return pool_index; ++} ++ ++static inline unsigned long index_page_offset(unsigned long index) ++{ ++ return offset_in_page(sizeof(struct rmap_list_entry *) * index); ++} ++ ++static inline ++struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot, ++ unsigned long index, int need_alloc) ++{ ++ unsigned long pool_index; ++ struct page *page; ++ void *addr; ++ ++ ++ pool_index = get_pool_index(slot, index); ++ if (!slot->rmap_list_pool[pool_index]) { ++ if (!need_alloc) ++ return NULL; ++ ++ page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); ++ if (!page) ++ return NULL; ++ ++ slot->rmap_list_pool[pool_index] = page; ++ } ++ ++ addr = kmap(slot->rmap_list_pool[pool_index]); ++ addr += index_page_offset(index); ++ ++ return addr; ++} ++ ++static inline void put_rmap_list_entry(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ BUG_ON(!slot->rmap_list_pool[pool_index]); ++ kunmap(slot->rmap_list_pool[pool_index]); ++} ++ ++static inline int entry_is_new(struct rmap_list_entry *entry) ++{ ++ return !entry->item; ++} ++ ++static inline unsigned long get_index_orig_addr(struct vma_slot *slot, ++ unsigned long index) ++{ ++ return slot->vma->vm_start + (index << PAGE_SHIFT); ++} ++ ++static inline unsigned long get_entry_address(struct rmap_list_entry *entry) ++{ ++ unsigned long addr; ++ ++ if (is_addr(entry->addr)) ++ addr = get_clean_addr(entry->addr); ++ else if (entry->item) ++ addr = get_rmap_addr(entry->item); ++ else ++ BUG(); ++ ++ return addr; ++} ++ ++static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry) ++{ ++ if (is_addr(entry->addr)) ++ return NULL; ++ ++ return entry->item; ++} ++ ++static inline void inc_rmap_list_pool_count(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ BUG_ON(!slot->rmap_list_pool[pool_index]); ++ slot->pool_counts[pool_index]++; ++} ++ ++static inline void dec_rmap_list_pool_count(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ BUG_ON(!slot->rmap_list_pool[pool_index]); ++ BUG_ON(!slot->pool_counts[pool_index]); ++ slot->pool_counts[pool_index]--; ++} ++ ++static inline int entry_has_rmap(struct rmap_list_entry *entry) ++{ ++ return !is_addr(entry->addr) && entry->item; ++} ++ ++static inline void swap_entries(struct rmap_list_entry *entry1, ++ unsigned long index1, ++ struct rmap_list_entry *entry2, ++ unsigned long index2) ++{ ++ struct rmap_list_entry tmp; ++ ++ /* swapping two new entries is meaningless */ ++ BUG_ON(entry_is_new(entry1) && entry_is_new(entry2)); ++ ++ tmp = *entry1; ++ *entry1 = *entry2; ++ *entry2 = tmp; ++ ++ if (entry_has_rmap(entry1)) ++ entry1->item->entry_index = index1; ++ ++ if (entry_has_rmap(entry2)) ++ entry2->item->entry_index = index2; ++ ++ if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) { ++ inc_rmap_list_pool_count(entry1->item->slot, index1); ++ dec_rmap_list_pool_count(entry1->item->slot, index2); ++ } 
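/* symmetric case: the rmap_item moved from index1 to index2 */
++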
else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) { ++ inc_rmap_list_pool_count(entry2->item->slot, index2); ++ dec_rmap_list_pool_count(entry2->item->slot, index1); ++ } ++} ++ ++static inline void free_entry_item(struct rmap_list_entry *entry) ++{ ++ unsigned long index; ++ struct rmap_item *item; ++ ++ if (!is_addr(entry->addr)) { ++ BUG_ON(!entry->item); ++ item = entry->item; ++ entry->addr = get_rmap_addr(item); ++ set_is_addr(entry->addr); ++ index = item->entry_index; ++ remove_rmap_item_from_tree(item); ++ dec_rmap_list_pool_count(item->slot, index); ++ free_rmap_item(item); ++ } ++} ++ ++static inline int pool_entry_boundary(unsigned long index) ++{ ++ unsigned long linear_addr; ++ ++ linear_addr = sizeof(struct rmap_list_entry *) * index; ++ return index && !offset_in_page(linear_addr); ++} ++ ++static inline void try_free_last_pool(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ if (slot->rmap_list_pool[pool_index] && ++ !slot->pool_counts[pool_index]) { ++ __free_page(slot->rmap_list_pool[pool_index]); ++ slot->rmap_list_pool[pool_index] = NULL; ++ slot->flags |= UKSM_SLOT_NEED_SORT; ++ } ++ ++} ++ ++static inline unsigned long vma_item_index(struct vm_area_struct *vma, ++ struct rmap_item *item) ++{ ++ return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT; ++} ++ ++static int within_same_pool(struct vma_slot *slot, ++ unsigned long i, unsigned long j) ++{ ++ unsigned long pool_i, pool_j; ++ ++ pool_i = get_pool_index(slot, i); ++ pool_j = get_pool_index(slot, j); ++ ++ return (pool_i == pool_j); ++} ++ ++static void sort_rmap_entry_list(struct vma_slot *slot) ++{ ++ unsigned long i, j; ++ struct rmap_list_entry *entry, *swap_entry; ++ ++ entry = get_rmap_list_entry(slot, 0, 0); ++ for (i = 0; i < slot->pages; ) { ++ ++ if (!entry) ++ goto skip_whole_pool; ++ ++ if (entry_is_new(entry)) ++ goto next_entry; ++ ++ if (is_addr(entry->addr)) { ++ entry->addr = 0; ++ goto next_entry; ++ } ++ ++ j = vma_item_index(slot->vma, entry->item); ++ if (j == i) ++ goto next_entry; ++ ++ if (within_same_pool(slot, i, j)) ++ swap_entry = entry + j - i; ++ else ++ swap_entry = get_rmap_list_entry(slot, j, 1); ++ ++ swap_entries(entry, i, swap_entry, j); ++ if (!within_same_pool(slot, i, j)) ++ put_rmap_list_entry(slot, j); ++ continue; ++ ++skip_whole_pool: ++ i += PAGE_SIZE / sizeof(*entry); ++ if (i < slot->pages) ++ entry = get_rmap_list_entry(slot, i, 0); ++ continue; ++ ++next_entry: ++ if (i >= slot->pages - 1 || ++ !within_same_pool(slot, i, i + 1)) { ++ put_rmap_list_entry(slot, i); ++ if (i + 1 < slot->pages) ++ entry = get_rmap_list_entry(slot, i + 1, 0); ++ } else ++ entry++; ++ i++; ++ continue; ++ } ++ ++ /* Free empty pool pages which contain no rmap_item. ++ * This could be simplified to rely on pool_counts alone once the code ++ * is known to be bug-free.
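++ * (For scale: a pool page holds PAGE_SIZE / sizeof(struct rmap_list_entry *) ++ * entries, e.g. 512 entries on a 64-bit machine with 4 KiB pages -- ++ * illustrative sizes only -- so this sweep maps each pool page once.)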
*/ ++ for (i = 0; i < slot->pool_size; i++) { ++ unsigned char has_rmap; ++ void *addr; ++ ++ if (!slot->rmap_list_pool[i]) ++ continue; ++ ++ has_rmap = 0; ++ addr = kmap(slot->rmap_list_pool[i]); ++ BUG_ON(!addr); ++ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { ++ entry = (struct rmap_list_entry *)addr + j; ++ if (is_addr(entry->addr)) ++ continue; ++ if (!entry->item) ++ continue; ++ has_rmap = 1; ++ } ++ kunmap(slot->rmap_list_pool[i]); ++ if (!has_rmap) { ++ BUG_ON(slot->pool_counts[i]); ++ __free_page(slot->rmap_list_pool[i]); ++ slot->rmap_list_pool[i] = NULL; ++ } ++ } ++ ++ slot->flags &= ~UKSM_SLOT_NEED_SORT; ++} ++ ++/* ++ * vma_fully_scanned() - if all the pages in this slot have been scanned. ++ */ ++static inline int vma_fully_scanned(struct vma_slot *slot) ++{ ++ return slot->pages_scanned == slot->pages; ++} ++ ++/** ++ * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to ++ * its random permutation. This function is embedded with the random ++ * permutation index management code. ++ */ ++static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash) ++{ ++ unsigned long rand_range, addr, swap_index, scan_index; ++ struct rmap_item *item = NULL; ++ struct rmap_list_entry *scan_entry, *swap_entry = NULL; ++ struct page *page; ++ ++ scan_index = swap_index = slot->pages_scanned % slot->pages; ++ ++ if (pool_entry_boundary(scan_index)) ++ try_free_last_pool(slot, scan_index - 1); ++ ++ if (vma_fully_scanned(slot)) { ++ if (slot->flags & UKSM_SLOT_NEED_SORT) ++ slot->flags |= UKSM_SLOT_NEED_RERAND; ++ else ++ slot->flags &= ~UKSM_SLOT_NEED_RERAND; ++ if (slot->flags & UKSM_SLOT_NEED_SORT) ++ sort_rmap_entry_list(slot); ++ } ++ ++ scan_entry = get_rmap_list_entry(slot, scan_index, 1); ++ if (!scan_entry) ++ return NULL; ++ ++ if (entry_is_new(scan_entry)) { ++ scan_entry->addr = get_index_orig_addr(slot, scan_index); ++ set_is_addr(scan_entry->addr); ++ } ++ ++ if (slot->flags & UKSM_SLOT_NEED_RERAND) { ++ rand_range = slot->pages - scan_index; ++ BUG_ON(!rand_range); ++ swap_index = scan_index + (prandom_u32() % rand_range); ++ } ++ ++ if (swap_index != scan_index) { ++ swap_entry = get_rmap_list_entry(slot, swap_index, 1); ++ if (entry_is_new(swap_entry)) { ++ swap_entry->addr = get_index_orig_addr(slot, ++ swap_index); ++ set_is_addr(swap_entry->addr); ++ } ++ swap_entries(scan_entry, scan_index, swap_entry, swap_index); ++ } ++ ++ addr = get_entry_address(scan_entry); ++ item = get_entry_item(scan_entry); ++ BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start); ++ ++ page = follow_page(slot->vma, addr, FOLL_GET); ++ if (IS_ERR_OR_NULL(page)) ++ goto nopage; ++ ++ if (!PageAnon(page)) ++ goto putpage; ++ ++ /*check is zero_page pfn or uksm_zero_page*/ ++ if ((page_to_pfn(page) == zero_pfn) ++ || (page_to_pfn(page) == uksm_zero_pfn)) ++ goto putpage; ++ ++ flush_anon_page(slot->vma, page, addr); ++ flush_dcache_page(page); ++ ++ ++ *hash = page_hash(page, hash_strength, 1); ++ inc_uksm_pages_scanned(); ++ /*if the page content all zero, re-map to zero-page*/ ++ if (find_zero_page_hash(hash_strength, *hash)) { ++ if (!cmp_and_merge_zero_page(slot->vma, page)) { ++ slot->pages_merged++; ++ ++ /* For full-zero pages, no need to create rmap item */ ++ goto putpage; ++ } else { ++ inc_rshash_neg(memcmp_cost / 2); ++ } ++ } ++ ++ if (!item) { ++ item = alloc_rmap_item(); ++ if (item) { ++ /* It has already been zeroed */ ++ item->slot = slot; ++ item->address = addr; ++ item->entry_index = scan_index; ++ scan_entry->item = item; 
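++ /* ++ * Pin the pool page that now holds this rmap_item so that ++ * try_free_last_pool() cannot free it from under us. ++ */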
++ inc_rmap_list_pool_count(slot, scan_index); ++ } else ++ goto putpage; ++ } ++ ++ BUG_ON(item->slot != slot); ++ /* the page may have changed */ ++ item->page = page; ++ put_rmap_list_entry(slot, scan_index); ++ if (swap_entry) ++ put_rmap_list_entry(slot, swap_index); ++ return item; ++ ++putpage: ++ put_page(page); ++ page = NULL; ++nopage: ++ /* no page, store addr back and free rmap_item if possible */ ++ free_entry_item(scan_entry); ++ put_rmap_list_entry(slot, scan_index); ++ if (swap_entry) ++ put_rmap_list_entry(slot, swap_index); ++ return NULL; ++} ++ ++static inline int in_stable_tree(struct rmap_item *rmap_item) ++{ ++ return rmap_item->address & STABLE_FLAG; ++} ++ ++/** ++ * scan_vma_one_page() - scan the next page in a vma_slot. Called with ++ * mmap_sem locked. ++ */ ++static noinline void scan_vma_one_page(struct vma_slot *slot) ++{ ++ u32 hash; ++ struct mm_struct *mm; ++ struct rmap_item *rmap_item = NULL; ++ struct vm_area_struct *vma = slot->vma; ++ ++ mm = vma->vm_mm; ++ BUG_ON(!mm); ++ BUG_ON(!slot); ++ ++ rmap_item = get_next_rmap_item(slot, &hash); ++ if (!rmap_item) ++ goto out1; ++ ++ if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item)) ++ goto out2; ++ ++ cmp_and_merge_page(rmap_item, hash); ++out2: ++ put_page(rmap_item->page); ++out1: ++ slot->pages_scanned++; ++ slot->this_sampled++; ++ if (slot->fully_scanned_round != fully_scanned_round) ++ scanned_virtual_pages++; ++ ++ if (vma_fully_scanned(slot)) ++ slot->fully_scanned_round = fully_scanned_round; ++} ++ ++static inline unsigned long rung_get_pages(struct scan_rung *rung) ++{ ++ struct slot_tree_node *node; ++ ++ if (!rung->vma_root.rnode) ++ return 0; ++ ++ node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode); ++ ++ return node->size; ++} ++ ++#define RUNG_SAMPLED_MIN 3 ++ ++static inline ++void uksm_calc_rung_step(struct scan_rung *rung, ++ unsigned long page_time, unsigned long ratio) ++{ ++ unsigned long sampled, pages; ++ ++ /* will it be fully scanned? */ ++ if (!rung->cover_msecs) { ++ rung->step = 1; ++ return; ++ } ++ ++ sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE) ++ * ratio / page_time; ++ ++ /* ++ * Before we finish a scan round and its expensive per-round jobs, we ++ * need a chance to estimate the per-page time. So the sampled number ++ * cannot be too small.
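++ * ++ * Worked example (illustrative numbers, assuming TIME_RATIO_SCALE == 1000 ++ * as defined earlier in this file): with cover_msecs == 1000, ratio == 500 ++ * and page_time == 1000 ns, sampled == 1000 * 1000 * 500 / 1000 == 500000; ++ * a rung holding 1000000 pages then gets step == 2 below.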
++ */ ++ if (sampled < RUNG_SAMPLED_MIN) ++ sampled = RUNG_SAMPLED_MIN; ++ ++ pages = rung_get_pages(rung); ++ if (likely(pages > sampled)) ++ rung->step = pages / sampled; ++ else ++ rung->step = 1; ++} ++ ++static inline int step_need_recalc(struct scan_rung *rung) ++{ ++ unsigned long pages, stepmax; ++ ++ pages = rung_get_pages(rung); ++ stepmax = pages / RUNG_SAMPLED_MIN; ++ ++ return pages && (rung->step > pages || ++ (stepmax && rung->step > stepmax)); ++} ++ ++static inline ++void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc) ++{ ++ struct vma_slot *slot; ++ ++ if (finished) ++ rung->flags |= UKSM_RUNG_ROUND_FINISHED; ++ ++ if (step_recalc || step_need_recalc(rung)) { ++ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); ++ BUG_ON(step_need_recalc(rung)); ++ } ++ ++ slot_iter_index = prandom_u32() % rung->step; ++ BUG_ON(!rung->vma_root.rnode); ++ slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter); ++ BUG_ON(!slot); ++ ++ rung->current_scan = slot; ++ rung->current_offset = slot_iter_index; ++} ++ ++static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot) ++{ ++ return &slot->rung->vma_root; ++} ++ ++/* ++ * return if resetted. ++ */ ++static int advance_current_scan(struct scan_rung *rung) ++{ ++ unsigned short n; ++ struct vma_slot *slot, *next = NULL; ++ ++ BUG_ON(!rung->vma_root.num); ++ ++ slot = rung->current_scan; ++ n = (slot->pages - rung->current_offset) % rung->step; ++ slot_iter_index = rung->step - n; ++ next = sradix_tree_next(&rung->vma_root, slot->snode, ++ slot->sindex, slot_iter); ++ ++ if (next) { ++ rung->current_offset = slot_iter_index; ++ rung->current_scan = next; ++ return 0; ++ } else { ++ reset_current_scan(rung, 1, 0); ++ return 1; ++ } ++} ++ ++static inline void rung_rm_slot(struct vma_slot *slot) ++{ ++ struct scan_rung *rung = slot->rung; ++ struct sradix_tree_root *root; ++ ++ if (rung->current_scan == slot) ++ advance_current_scan(rung); ++ ++ root = slot_get_root(slot); ++ sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex); ++ slot->snode = NULL; ++ if (step_need_recalc(rung)) { ++ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); ++ BUG_ON(step_need_recalc(rung)); ++ } ++ ++ /* In case advance_current_scan loop back to this slot again */ ++ if (rung->vma_root.num && rung->current_scan == slot) ++ reset_current_scan(slot->rung, 1, 0); ++} ++ ++static inline void rung_add_new_slots(struct scan_rung *rung, ++ struct vma_slot **slots, unsigned long num) ++{ ++ int err; ++ struct vma_slot *slot; ++ unsigned long i; ++ struct sradix_tree_root *root = &rung->vma_root; ++ ++ err = sradix_tree_enter(root, (void **)slots, num); ++ BUG_ON(err); ++ ++ for (i = 0; i < num; i++) { ++ slot = slots[i]; ++ slot->rung = rung; ++ BUG_ON(vma_fully_scanned(slot)); ++ } ++ ++ if (rung->vma_root.num == num) ++ reset_current_scan(rung, 0, 1); ++} ++ ++static inline int rung_add_one_slot(struct scan_rung *rung, ++ struct vma_slot *slot) ++{ ++ int err; ++ ++ err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1); ++ if (err) ++ return err; ++ ++ slot->rung = rung; ++ if (rung->vma_root.num == 1) ++ reset_current_scan(rung, 0, 1); ++ ++ return 0; ++} ++ ++/* ++ * Return true if the slot is deleted from its rung. 
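++ * (Callers such as judge_slot() rely on this return value: when 1 is ++ * returned, the rung's scan cursor was already advanced by rung_rm_slot(), ++ * so they must not advance it again.)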
++ */ ++static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung) ++{ ++ struct scan_rung *old_rung = slot->rung; ++ int err; ++ ++ if (old_rung == rung) ++ return 0; ++ ++ rung_rm_slot(slot); ++ err = rung_add_one_slot(rung, slot); ++ if (err) { ++ err = rung_add_one_slot(old_rung, slot); ++ WARN_ON(err); /* OOPS, badly OOM, we lost this slot */ ++ } ++ ++ return 1; ++} ++ ++static inline int vma_rung_up(struct vma_slot *slot) ++{ ++ struct scan_rung *rung; ++ ++ rung = slot->rung; ++ if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1]) ++ rung++; ++ ++ return vma_rung_enter(slot, rung); ++} ++ ++static inline int vma_rung_down(struct vma_slot *slot) ++{ ++ struct scan_rung *rung; ++ ++ rung = slot->rung; ++ if (slot->rung != &uksm_scan_ladder[0]) ++ rung--; ++ ++ return vma_rung_enter(slot, rung); ++} ++ ++/** ++ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. ++ */ ++static unsigned long cal_dedup_ratio(struct vma_slot *slot) ++{ ++ unsigned long ret; ++ unsigned long pages; ++ ++ pages = slot->this_sampled; ++ if (!pages) ++ return 0; ++ ++ BUG_ON(slot->pages_scanned == slot->last_scanned); ++ ++ ret = slot->pages_merged; ++ ++ /* Thrashing area filtering */ ++ if (ret && uksm_thrash_threshold) { ++ if (slot->pages_cowed * 100 / slot->pages_merged ++ > uksm_thrash_threshold) { ++ ret = 0; ++ } else { ++ ret = slot->pages_merged - slot->pages_cowed; ++ } ++ } ++ ++ return ret * 100 / pages; ++} ++ ++/** ++ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. ++ */ ++static unsigned long cal_dedup_ratio_old(struct vma_slot *slot) ++{ ++ unsigned long ret; ++ unsigned long pages; ++ ++ pages = slot->pages; ++ if (!pages) ++ return 0; ++ ++ ret = slot->pages_bemerged; ++ ++ /* Thrashing area filtering */ ++ if (ret && uksm_thrash_threshold) { ++ if (slot->pages_cowed * 100 / slot->pages_bemerged ++ > uksm_thrash_threshold) { ++ ret = 0; ++ } else { ++ ret = slot->pages_bemerged - slot->pages_cowed; ++ } ++ } ++ ++ return ret * 100 / pages; ++} ++ ++/** ++ * stable_node_reinsert() - When the hash_strength has been adjusted, the ++ * stable tree need to be restructured, this is the function re-inserting the ++ * stable node. 
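++ * ++ * As the code below shows, the stable tree is two-level: an rbtree of ++ * tree_node keyed by the sampled page hash, each tree_node holding a ++ * sub-rbtree of stable_node keyed by hash_max to resolve first-level ++ * collisions.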
++ */ ++static inline void stable_node_reinsert(struct stable_node *new_node, ++ struct page *page, ++ struct rb_root *root_treep, ++ struct list_head *tree_node_listp, ++ u32 hash) ++{ ++ struct rb_node **new = &root_treep->rb_node; ++ struct rb_node *parent = NULL; ++ struct stable_node *stable_node; ++ struct tree_node *tree_node; ++ struct page *tree_page; ++ int cmp; ++ ++ while (*new) { ++ int cmp; ++ ++ tree_node = rb_entry(*new, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else ++ break; ++ } ++ ++ if (*new) { ++ /* find a stable tree node with same first level hash value */ ++ stable_node_hash_max(new_node, page, hash); ++ if (tree_node->count == 1) { ++ stable_node = rb_entry(tree_node->sub_root.rb_node, ++ struct stable_node, node); ++ tree_page = get_uksm_page(stable_node, 1, 0); ++ if (tree_page) { ++ stable_node_hash_max(stable_node, ++ tree_page, hash); ++ put_page(tree_page); ++ ++ /* prepare for stable node insertion */ ++ ++ cmp = hash_cmp(new_node->hash_max, ++ stable_node->hash_max); ++ parent = &stable_node->node; ++ if (cmp < 0) ++ new = &parent->rb_left; ++ else if (cmp > 0) ++ new = &parent->rb_right; ++ else ++ goto failed; ++ ++ goto add_node; ++ } else { ++ /* the only stable_node deleted, the tree node ++ * was not deleted. ++ */ ++ goto tree_node_reuse; ++ } ++ } ++ ++ /* well, search the collision subtree */ ++ new = &tree_node->sub_root.rb_node; ++ parent = NULL; ++ BUG_ON(!*new); ++ while (*new) { ++ int cmp; ++ ++ stable_node = rb_entry(*new, struct stable_node, node); ++ ++ cmp = hash_cmp(new_node->hash_max, ++ stable_node->hash_max); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else { ++ /* oh, no, still a collision */ ++ goto failed; ++ } ++ } ++ ++ goto add_node; ++ } ++ ++ /* no tree node found */ ++ tree_node = alloc_tree_node(tree_node_listp); ++ if (!tree_node) { ++ pr_err("UKSM: memory allocation error!\n"); ++ goto failed; ++ } else { ++ tree_node->hash = hash; ++ rb_link_node(&tree_node->node, parent, new); ++ rb_insert_color(&tree_node->node, root_treep); ++ ++tree_node_reuse: ++ /* prepare for stable node insertion */ ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ ++add_node: ++ rb_link_node(&new_node->node, parent, new); ++ rb_insert_color(&new_node->node, &tree_node->sub_root); ++ new_node->tree_node = tree_node; ++ tree_node->count++; ++ return; ++ ++failed: ++ /* This can only happen when two nodes have collided ++ * in two levels. ++ */ ++ new_node->tree_node = NULL; ++ return; ++} ++ ++static inline void free_all_tree_nodes(struct list_head *list) ++{ ++ struct tree_node *node, *tmp; ++ ++ list_for_each_entry_safe(node, tmp, list, all_list) { ++ free_tree_node(node); ++ } ++} ++ ++/** ++ * stable_tree_delta_hash() - Delta hash the stable tree from previous hash ++ * strength to the current hash_strength. It re-structures the hole tree. 
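++ * Two tree roots are kept and used alternately (stable_tree_index flips ++ * between 0 and 1 below), so the new tree can be populated first and the ++ * old tree_nodes freed in one pass afterwards.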
++ */ ++static inline void stable_tree_delta_hash(u32 prev_hash_strength) ++{ ++ struct stable_node *node, *tmp; ++ struct rb_root *root_new_treep; ++ struct list_head *new_tree_node_listp; ++ ++ stable_tree_index = (stable_tree_index + 1) % 2; ++ root_new_treep = &root_stable_tree[stable_tree_index]; ++ new_tree_node_listp = &stable_tree_node_list[stable_tree_index]; ++ *root_new_treep = RB_ROOT; ++ BUG_ON(!list_empty(new_tree_node_listp)); ++ ++ /* ++ * we need to be safe, the node could be removed by get_uksm_page() ++ */ ++ list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) { ++ void *addr; ++ struct page *node_page; ++ u32 hash; ++ ++ /* ++ * We are completely re-structuring the stable nodes into a new ++ * stable tree, so we do not bother unlinking them from the old ++ * tree; the old tree_nodes will all be freed at once. ++ */ ++ node_page = get_uksm_page(node, 0, 0); ++ if (!node_page) ++ continue; ++ ++ if (node->tree_node) { ++ hash = node->tree_node->hash; ++ ++ addr = kmap_atomic(node_page); ++ ++ hash = delta_hash(addr, prev_hash_strength, ++ hash_strength, hash); ++ kunmap_atomic(addr); ++ } else { ++ /* ++ * It was not inserted into the rbtree due to a ++ * collision in the last scan round. ++ */ ++ hash = page_hash(node_page, hash_strength, 0); ++ } ++ ++ stable_node_reinsert(node, node_page, root_new_treep, ++ new_tree_node_listp, hash); ++ put_page(node_page); ++ } ++ ++ root_stable_treep = root_new_treep; ++ free_all_tree_nodes(stable_tree_node_listp); ++ BUG_ON(!list_empty(stable_tree_node_listp)); ++ stable_tree_node_listp = new_tree_node_listp; ++} ++ ++static inline void inc_hash_strength(unsigned long delta) ++{ ++ hash_strength += 1 << delta; ++ if (hash_strength > HASH_STRENGTH_MAX) ++ hash_strength = HASH_STRENGTH_MAX; ++} ++ ++static inline void dec_hash_strength(unsigned long delta) ++{ ++ unsigned long change = 1 << delta; ++ ++ if (hash_strength <= change + 1) ++ hash_strength = 1; ++ else ++ hash_strength -= change; ++} ++ ++static inline void inc_hash_strength_delta(void) ++{ ++ hash_strength_delta++; ++ if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX) ++ hash_strength_delta = HASH_STRENGTH_DELTA_MAX; ++} ++ ++static inline unsigned long get_current_neg_ratio(void) ++{ ++ u64 pos = benefit.pos; ++ u64 neg = benefit.neg; ++ ++ if (!neg) ++ return 0; ++ ++ if (!pos || neg > pos) ++ return 100; ++ ++ if (neg > div64_u64(U64_MAX, 100)) ++ pos = div64_u64(pos, 100); ++ else ++ neg *= 100; ++ ++ return div64_u64(neg, pos); ++} ++ ++static inline unsigned long get_current_benefit(void) ++{ ++ u64 pos = benefit.pos; ++ u64 neg = benefit.neg; ++ u64 scanned = benefit.scanned; ++ ++ if (neg > pos) ++ return 0; ++ ++ return div64_u64((pos - neg), scanned); ++} ++ ++static inline int judge_rshash_direction(void) ++{ ++ u64 current_neg_ratio, stable_benefit; ++ u64 current_benefit, delta = 0; ++ int ret = STILL; ++ ++ /* ++ * Try to probe a value shortly after boot, and periodically again ++ * in case the system stays still for a long time.
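++ * ((fully_scanned_round & 0xFF) == 10 below fires once after boot and ++ * then roughly every 256 fully-scanned rounds thereafter.)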
++ */ ++ if ((fully_scanned_round & 0xFFULL) == 10) { ++ ret = OBSCURE; ++ goto out; ++ } ++ ++ current_neg_ratio = get_current_neg_ratio(); ++ ++ if (current_neg_ratio == 0) { ++ rshash_neg_cont_zero++; ++ if (rshash_neg_cont_zero > 2) ++ return GO_DOWN; ++ else ++ return STILL; ++ } ++ rshash_neg_cont_zero = 0; ++ ++ if (current_neg_ratio > 90) { ++ ret = GO_UP; ++ goto out; ++ } ++ ++ current_benefit = get_current_benefit(); ++ stable_benefit = rshash_state.stable_benefit; ++ ++ if (!stable_benefit) { ++ ret = OBSCURE; ++ goto out; ++ } ++ ++ if (current_benefit > stable_benefit) ++ delta = current_benefit - stable_benefit; ++ else if (current_benefit < stable_benefit) ++ delta = stable_benefit - current_benefit; ++ ++ delta = div64_u64(100 * delta, stable_benefit); ++ ++ if (delta > 50) { ++ rshash_cont_obscure++; ++ if (rshash_cont_obscure > 2) ++ return OBSCURE; ++ else ++ return STILL; ++ } ++ ++out: ++ rshash_cont_obscure = 0; ++ return ret; ++} ++ ++/** ++ * rshash_adjust() - The main function to control the random sampling state ++ * machine for hash strength adapting. ++ * ++ * return true if hash_strength has changed. ++ */ ++static inline int rshash_adjust(void) ++{ ++ unsigned long prev_hash_strength = hash_strength; ++ ++ if (!encode_benefit()) ++ return 0; ++ ++ switch (rshash_state.state) { ++ case RSHASH_STILL: ++ switch (judge_rshash_direction()) { ++ case GO_UP: ++ if (rshash_state.pre_direct == GO_DOWN) ++ hash_strength_delta = 0; ++ ++ inc_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ rshash_state.stable_benefit = get_current_benefit(); ++ rshash_state.pre_direct = GO_UP; ++ break; ++ ++ case GO_DOWN: ++ if (rshash_state.pre_direct == GO_UP) ++ hash_strength_delta = 0; ++ ++ dec_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ rshash_state.stable_benefit = get_current_benefit(); ++ rshash_state.pre_direct = GO_DOWN; ++ break; ++ ++ case OBSCURE: ++ rshash_state.stable_point = hash_strength; ++ rshash_state.turn_point_down = hash_strength; ++ rshash_state.turn_point_up = hash_strength; ++ rshash_state.turn_benefit_down = get_current_benefit(); ++ rshash_state.turn_benefit_up = get_current_benefit(); ++ rshash_state.lookup_window_index = 0; ++ rshash_state.state = RSHASH_TRYDOWN; ++ dec_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ break; ++ ++ case STILL: ++ break; ++ default: ++ BUG(); ++ } ++ break; ++ ++ case RSHASH_TRYDOWN: ++ if (rshash_state.lookup_window_index++ % 5 == 0) ++ rshash_state.below_count = 0; ++ ++ if (get_current_benefit() < rshash_state.stable_benefit) ++ rshash_state.below_count++; ++ else if (get_current_benefit() > ++ rshash_state.turn_benefit_down) { ++ rshash_state.turn_point_down = hash_strength; ++ rshash_state.turn_benefit_down = get_current_benefit(); ++ } ++ ++ if (rshash_state.below_count >= 3 || ++ judge_rshash_direction() == GO_UP || ++ hash_strength == 1) { ++ hash_strength = rshash_state.stable_point; ++ hash_strength_delta = 0; ++ inc_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ rshash_state.lookup_window_index = 0; ++ rshash_state.state = RSHASH_TRYUP; ++ hash_strength_delta = 0; ++ } else { ++ dec_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ } ++ break; ++ ++ case RSHASH_TRYUP: ++ if (rshash_state.lookup_window_index++ % 5 == 0) ++ rshash_state.below_count = 0; ++ ++ if (get_current_benefit() < rshash_state.turn_benefit_down) ++ rshash_state.below_count++; ++ else if (get_current_benefit() > 
rshash_state.turn_benefit_up) { ++ rshash_state.turn_point_up = hash_strength; ++ rshash_state.turn_benefit_up = get_current_benefit(); ++ } ++ ++ if (rshash_state.below_count >= 3 || ++ judge_rshash_direction() == GO_DOWN || ++ hash_strength == HASH_STRENGTH_MAX) { ++ hash_strength = rshash_state.turn_benefit_up > ++ rshash_state.turn_benefit_down ? ++ rshash_state.turn_point_up : ++ rshash_state.turn_point_down; ++ ++ rshash_state.state = RSHASH_PRE_STILL; ++ } else { ++ inc_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ } ++ ++ break; ++ ++ case RSHASH_NEW: ++ case RSHASH_PRE_STILL: ++ rshash_state.stable_benefit = get_current_benefit(); ++ rshash_state.state = RSHASH_STILL; ++ hash_strength_delta = 0; ++ break; ++ default: ++ BUG(); ++ } ++ ++ /* rshash_neg = rshash_pos = 0; */ ++ reset_benefit(); ++ ++ if (prev_hash_strength != hash_strength) ++ stable_tree_delta_hash(prev_hash_strength); ++ ++ return prev_hash_strength != hash_strength; ++} ++ ++/** ++ * round_update_ladder() - The main function to do update of all the ++ * adjustments whenever a scan round is finished. ++ */ ++static noinline void round_update_ladder(void) ++{ ++ int i; ++ unsigned long dedup; ++ struct vma_slot *slot, *tmp_slot; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) ++ uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED; ++ ++ list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) { ++ ++ /* slot may be rung_rm_slot() when mm exits */ ++ if (slot->snode) { ++ dedup = cal_dedup_ratio_old(slot); ++ if (dedup && dedup >= uksm_abundant_threshold) ++ vma_rung_up(slot); ++ } ++ ++ slot->pages_bemerged = 0; ++ slot->pages_cowed = 0; ++ ++ list_del_init(&slot->dedup_list); ++ } ++} ++ ++static void uksm_del_vma_slot(struct vma_slot *slot) ++{ ++ int i, j; ++ struct rmap_list_entry *entry; ++ ++ if (slot->snode) { ++ /* ++ * In case it just failed when entering the rung, it's not ++ * necessary. 
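++ * (slot->snode is set when the slot is linked into a rung's sradix ++ * tree, so this check also appears to cover slots that never made ++ * it into a rung.)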
++ */ ++ rung_rm_slot(slot); ++ } ++ ++ if (!list_empty(&slot->dedup_list)) ++ list_del(&slot->dedup_list); ++ ++ if (!slot->rmap_list_pool || !slot->pool_counts) { ++ /* In case it OOMed in uksm_vma_enter() */ ++ goto out; ++ } ++ ++ for (i = 0; i < slot->pool_size; i++) { ++ void *addr; ++ ++ if (!slot->rmap_list_pool[i]) ++ continue; ++ ++ addr = kmap(slot->rmap_list_pool[i]); ++ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { ++ entry = (struct rmap_list_entry *)addr + j; ++ if (is_addr(entry->addr)) ++ continue; ++ if (!entry->item) ++ continue; ++ ++ remove_rmap_item_from_tree(entry->item); ++ free_rmap_item(entry->item); ++ slot->pool_counts[i]--; ++ } ++ BUG_ON(slot->pool_counts[i]); ++ kunmap(slot->rmap_list_pool[i]); ++ __free_page(slot->rmap_list_pool[i]); ++ } ++ kfree(slot->rmap_list_pool); ++ kfree(slot->pool_counts); ++ ++out: ++ slot->rung = NULL; ++ if (slot->flags & UKSM_SLOT_IN_UKSM) { ++ BUG_ON(uksm_pages_total < slot->pages); ++ uksm_pages_total -= slot->pages; ++ } ++ ++ if (slot->fully_scanned_round == fully_scanned_round) ++ scanned_virtual_pages -= slot->pages; ++ else ++ scanned_virtual_pages -= slot->pages_scanned; ++ free_vma_slot(slot); ++} ++ ++ ++#define SPIN_LOCK_PERIOD 32 ++static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD]; ++static inline void cleanup_vma_slots(void) ++{ ++ struct vma_slot *slot; ++ int i; ++ ++ i = 0; ++ spin_lock(&vma_slot_list_lock); ++ while (!list_empty(&vma_slot_del)) { ++ slot = list_entry(vma_slot_del.next, ++ struct vma_slot, slot_list); ++ list_del(&slot->slot_list); ++ cleanup_slots[i++] = slot; ++ if (i == SPIN_LOCK_PERIOD) { ++ spin_unlock(&vma_slot_list_lock); ++ while (--i >= 0) ++ uksm_del_vma_slot(cleanup_slots[i]); ++ i = 0; ++ spin_lock(&vma_slot_list_lock); ++ } ++ } ++ spin_unlock(&vma_slot_list_lock); ++ ++ while (--i >= 0) ++ uksm_del_vma_slot(cleanup_slots[i]); ++} ++ ++/* ++ * Exponential moving average formula ++ */ ++static inline unsigned long ema(unsigned long curr, unsigned long last_ema) ++{ ++ /* ++ * For a very high burst, even the ema cannot work well: a falsely ++ * very high per-page time estimate feeds back into very high context ++ * switch and rung update overhead, which in turn inflates the ++ * per-page time again, so the estimate may never converge. ++ * ++ * Instead, we try to approach this value in a binary manner. ++ */ ++ if (curr > last_ema * 10) ++ return last_ema * 2; ++ ++ return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100; ++} ++ ++/* ++ * Convert a cpu ratio (in units of 1/TIME_RATIO_SCALE, as configured by ++ * the user) to nanoseconds, based on the current uksm_sleep_jiffies. ++ */ ++static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio) ++{ ++ return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) / ++ (TIME_RATIO_SCALE - ratio) * ratio; ++} ++ ++ ++static inline unsigned long rung_real_ratio(int cpu_time_ratio) ++{ ++ unsigned long ret; ++ ++ BUG_ON(!cpu_time_ratio); ++ ++ if (cpu_time_ratio > 0) ++ ret = cpu_time_ratio; ++ else ++ ret = (unsigned long)(-cpu_time_ratio) * ++ uksm_max_cpu_percentage / 100UL; ++ ++ return ret ?
ret : 1; ++} ++ ++static noinline void uksm_calc_scan_pages(void) ++{ ++ struct scan_rung *ladder = uksm_scan_ladder; ++ unsigned long sleep_usecs, nsecs; ++ unsigned long ratio; ++ int i; ++ unsigned long per_page; ++ ++ if (uksm_ema_page_time > 100000 || ++ (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL)) ++ uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; ++ ++ per_page = uksm_ema_page_time; ++ BUG_ON(!per_page); ++ ++ /* ++ * For every 8 eval round, we try to probe a uksm_sleep_jiffies value ++ * based on saved user input. ++ */ ++ if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL) ++ uksm_sleep_jiffies = uksm_sleep_saved; ++ ++ /* We require a rung scan at least 1 page in a period. */ ++ nsecs = per_page; ++ ratio = rung_real_ratio(ladder[0].cpu_ratio); ++ if (cpu_ratio_to_nsec(ratio) < nsecs) { ++ sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio ++ / NSEC_PER_USEC; ++ uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1; ++ } ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ ratio = rung_real_ratio(ladder[i].cpu_ratio); ++ ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) / ++ per_page; ++ BUG_ON(!ladder[i].pages_to_scan); ++ uksm_calc_rung_step(&ladder[i], per_page, ratio); ++ } ++} ++ ++/* ++ * From the scan time of this round (ns) to next expected min sleep time ++ * (ms), be careful of the possible overflows. ratio is taken from ++ * rung_real_ratio() ++ */ ++static inline ++unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio) ++{ ++ scan_time >>= 20; /* to msec level now */ ++ BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE)); ++ ++ return (unsigned int) ((unsigned long) scan_time * ++ (TIME_RATIO_SCALE - ratio) / ratio); ++} ++ ++#define __round_mask(x, y) ((__typeof__(x))((y)-1)) ++#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) ++ ++static void uksm_vma_enter(struct vma_slot **slots, unsigned long num) ++{ ++ struct scan_rung *rung; ++ ++ rung = &uksm_scan_ladder[0]; ++ rung_add_new_slots(rung, slots, num); ++} ++ ++static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE]; ++ ++static void uksm_enter_all_slots(void) ++{ ++ struct vma_slot *slot; ++ unsigned long index; ++ struct list_head empty_vma_list; ++ int i; ++ ++ i = 0; ++ index = 0; ++ INIT_LIST_HEAD(&empty_vma_list); ++ ++ spin_lock(&vma_slot_list_lock); ++ while (!list_empty(&vma_slot_new)) { ++ slot = list_entry(vma_slot_new.next, ++ struct vma_slot, slot_list); ++ ++ if (!slot->vma->anon_vma) { ++ list_move(&slot->slot_list, &empty_vma_list); ++ } else if (vma_can_enter(slot->vma)) { ++ batch_slots[index++] = slot; ++ list_del_init(&slot->slot_list); ++ } else { ++ list_move(&slot->slot_list, &vma_slot_noadd); ++ } ++ ++ if (++i == SPIN_LOCK_PERIOD || ++ (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) { ++ spin_unlock(&vma_slot_list_lock); ++ ++ if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) { ++ uksm_vma_enter(batch_slots, index); ++ index = 0; ++ } ++ i = 0; ++ cond_resched(); ++ spin_lock(&vma_slot_list_lock); ++ } ++ } ++ ++ list_splice(&empty_vma_list, &vma_slot_new); ++ ++ spin_unlock(&vma_slot_list_lock); ++ ++ if (index) ++ uksm_vma_enter(batch_slots, index); ++ ++} ++ ++static inline int rung_round_finished(struct scan_rung *rung) ++{ ++ return rung->flags & UKSM_RUNG_ROUND_FINISHED; ++} ++ ++static inline void judge_slot(struct vma_slot *slot) ++{ ++ struct scan_rung *rung = slot->rung; ++ unsigned long dedup; ++ int deleted; ++ ++ dedup = cal_dedup_ratio(slot); ++ if (vma_fully_scanned(slot) && uksm_thrash_threshold) ++ deleted = 
vma_rung_enter(slot, &uksm_scan_ladder[0]); ++ else if (dedup && dedup >= uksm_abundant_threshold) ++ deleted = vma_rung_up(slot); ++ else ++ deleted = vma_rung_down(slot); ++ ++ slot->pages_merged = 0; ++ slot->pages_cowed = 0; ++ slot->this_sampled = 0; ++ ++ if (vma_fully_scanned(slot)) ++ slot->pages_scanned = 0; ++ ++ slot->last_scanned = slot->pages_scanned; ++ ++ /* If its deleted in above, then rung was already advanced. */ ++ if (!deleted) ++ advance_current_scan(rung); ++} ++ ++ ++static inline int hash_round_finished(void) ++{ ++ if (scanned_virtual_pages > (uksm_pages_total >> 2)) { ++ scanned_virtual_pages = 0; ++ if (uksm_pages_scanned) ++ fully_scanned_round++; ++ ++ return 1; ++ } else { ++ return 0; ++ } ++} ++ ++#define UKSM_MMSEM_BATCH 5 ++#define BUSY_RETRY 100 ++ ++/** ++ * uksm_do_scan() - the main worker function. ++ */ ++static noinline void uksm_do_scan(void) ++{ ++ struct vma_slot *slot, *iter; ++ struct mm_struct *busy_mm; ++ unsigned char round_finished, all_rungs_emtpy; ++ int i, err, mmsem_batch; ++ unsigned long pcost; ++ long long delta_exec; ++ unsigned long vpages, max_cpu_ratio; ++ unsigned long long start_time, end_time, scan_time; ++ unsigned int expected_jiffies; ++ ++ might_sleep(); ++ ++ vpages = 0; ++ ++ start_time = task_sched_runtime(current); ++ max_cpu_ratio = 0; ++ mmsem_batch = 0; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE;) { ++ struct scan_rung *rung = &uksm_scan_ladder[i]; ++ unsigned long ratio; ++ int busy_retry; ++ ++ if (!rung->pages_to_scan) { ++ i++; ++ continue; ++ } ++ ++ if (!rung->vma_root.num) { ++ rung->pages_to_scan = 0; ++ i++; ++ continue; ++ } ++ ++ ratio = rung_real_ratio(rung->cpu_ratio); ++ if (ratio > max_cpu_ratio) ++ max_cpu_ratio = ratio; ++ ++ busy_retry = BUSY_RETRY; ++ /* ++ * Do not consider rung_round_finished() here, just used up the ++ * rung->pages_to_scan quota. 
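++ * (Each rung's pages_to_scan budget is set in uksm_calc_scan_pages() ++ * from its cpu-ratio share of the sleep interval.)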
++ */ ++ while (rung->pages_to_scan && rung->vma_root.num && ++ likely(!freezing(current))) { ++ int reset = 0; ++ ++ slot = rung->current_scan; ++ ++ BUG_ON(vma_fully_scanned(slot)); ++ ++ if (mmsem_batch) ++ err = 0; ++ else ++ err = try_down_read_slot_mmap_sem(slot); ++ ++ if (err == -ENOENT) { ++rm_slot: ++ rung_rm_slot(slot); ++ continue; ++ } ++ ++ busy_mm = slot->mm; ++ ++ if (err == -EBUSY) { ++ /* skip other vmas on the same mm */ ++ do { ++ reset = advance_current_scan(rung); ++ iter = rung->current_scan; ++ busy_retry--; ++ if (iter->vma->vm_mm != busy_mm || ++ !busy_retry || reset) ++ break; ++ } while (1); ++ ++ if (iter->vma->vm_mm != busy_mm) { ++ continue; ++ } else { ++ /* scan round finished */ ++ break; ++ } ++ } ++ ++ BUG_ON(!vma_can_enter(slot->vma)); ++ if (uksm_test_exit(slot->vma->vm_mm)) { ++ mmsem_batch = 0; ++ up_read(&slot->vma->vm_mm->mmap_sem); ++ goto rm_slot; ++ } ++ ++ if (mmsem_batch) ++ mmsem_batch--; ++ else ++ mmsem_batch = UKSM_MMSEM_BATCH; ++ ++ /* OK, we have taken the mmap_sem, ready to scan */ ++ scan_vma_one_page(slot); ++ rung->pages_to_scan--; ++ vpages++; ++ ++ if (rung->current_offset + rung->step > slot->pages - 1 ++ || vma_fully_scanned(slot)) { ++ up_read(&slot->vma->vm_mm->mmap_sem); ++ judge_slot(slot); ++ mmsem_batch = 0; ++ } else { ++ rung->current_offset += rung->step; ++ if (!mmsem_batch) ++ up_read(&slot->vma->vm_mm->mmap_sem); ++ } ++ ++ busy_retry = BUSY_RETRY; ++ cond_resched(); ++ } ++ ++ if (mmsem_batch) { ++ up_read(&slot->vma->vm_mm->mmap_sem); ++ mmsem_batch = 0; ++ } ++ ++ if (freezing(current)) ++ break; ++ ++ cond_resched(); ++ } ++ end_time = task_sched_runtime(current); ++ delta_exec = end_time - start_time; ++ ++ if (freezing(current)) ++ return; ++ ++ cleanup_vma_slots(); ++ uksm_enter_all_slots(); ++ ++ round_finished = 1; ++ all_rungs_emtpy = 1; ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ struct scan_rung *rung = &uksm_scan_ladder[i]; ++ ++ if (rung->vma_root.num) { ++ all_rungs_emtpy = 0; ++ if (!rung_round_finished(rung)) ++ round_finished = 0; ++ } ++ } ++ ++ if (all_rungs_emtpy) ++ round_finished = 0; ++ ++ if (round_finished) { ++ round_update_ladder(); ++ uksm_eval_round++; ++ ++ if (hash_round_finished() && rshash_adjust()) { ++ /* Reset the unstable root iff hash strength changed */ ++ uksm_hash_round++; ++ root_unstable_tree = RB_ROOT; ++ free_all_tree_nodes(&unstable_tree_node_list); ++ } ++ ++ /* ++ * A number of pages can hang around indefinitely on per-cpu ++ * pagevecs, raised page count preventing write_protect_page ++ * from merging them. Though it doesn't really matter much, ++ * it is puzzling to see some stuck in pages_volatile until ++ * other activity jostles them out, and they also prevented ++ * LTP's KSM test from succeeding deterministically; so drain ++ * them here (here rather than on entry to uksm_do_scan(), ++ * so we don't IPI too often when pages_to_scan is set low.
++ */ ++ lru_add_drain_all(); ++ } ++ ++ ++ if (vpages && delta_exec > 0) { ++ pcost = (unsigned long) delta_exec / vpages; ++ if (likely(uksm_ema_page_time)) ++ uksm_ema_page_time = ema(pcost, uksm_ema_page_time); ++ else ++ uksm_ema_page_time = pcost; ++ } ++ ++ uksm_calc_scan_pages(); ++ uksm_sleep_real = uksm_sleep_jiffies; ++ /* in case of radical cpu bursts, apply the upper bound */ ++ end_time = task_sched_runtime(current); ++ if (max_cpu_ratio && end_time > start_time) { ++ scan_time = end_time - start_time; ++ expected_jiffies = msecs_to_jiffies( ++ scan_time_to_sleep(scan_time, max_cpu_ratio)); ++ ++ if (expected_jiffies > uksm_sleep_real) ++ uksm_sleep_real = expected_jiffies; ++ ++ /* We have a 1 second up bound for responsiveness. */ ++ if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC) ++ uksm_sleep_real = msecs_to_jiffies(1000); ++ } ++ ++ return; ++} ++ ++static int ksmd_should_run(void) ++{ ++ return uksm_run & UKSM_RUN_MERGE; ++} ++ ++static int uksm_scan_thread(void *nothing) ++{ ++ set_freezable(); ++ set_user_nice(current, 5); ++ ++ while (!kthread_should_stop()) { ++ mutex_lock(&uksm_thread_mutex); ++ if (ksmd_should_run()) ++ uksm_do_scan(); ++ mutex_unlock(&uksm_thread_mutex); ++ ++ try_to_freeze(); ++ ++ if (ksmd_should_run()) { ++ schedule_timeout_interruptible(uksm_sleep_real); ++ uksm_sleep_times++; ++ } else { ++ wait_event_freezable(uksm_thread_wait, ++ ksmd_should_run() || kthread_should_stop()); ++ } ++ } ++ return 0; ++} ++ ++void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) ++{ ++ struct stable_node *stable_node; ++ struct node_vma *node_vma; ++ struct rmap_item *rmap_item; ++ int search_new_forks = 0; ++ unsigned long address; ++ ++ VM_BUG_ON_PAGE(!PageKsm(page), page); ++ VM_BUG_ON_PAGE(!PageLocked(page), page); ++ ++ stable_node = page_stable_node(page); ++ if (!stable_node) ++ return; ++again: ++ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { ++ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { ++ struct anon_vma *anon_vma = rmap_item->anon_vma; ++ struct anon_vma_chain *vmac; ++ struct vm_area_struct *vma; ++ ++ cond_resched(); ++ anon_vma_lock_read(anon_vma); ++ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, ++ 0, ULONG_MAX) { ++ cond_resched(); ++ vma = vmac->vma; ++ address = get_rmap_addr(rmap_item); ++ ++ if (address < vma->vm_start || ++ address >= vma->vm_end) ++ continue; ++ ++ if ((rmap_item->slot->vma == vma) == ++ search_new_forks) ++ continue; ++ ++ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) ++ continue; ++ ++ if (!rwc->rmap_one(page, vma, address, rwc->arg)) { ++ anon_vma_unlock_read(anon_vma); ++ return; ++ } ++ ++ if (rwc->done && rwc->done(page)) { ++ anon_vma_unlock_read(anon_vma); ++ return; ++ } ++ } ++ anon_vma_unlock_read(anon_vma); ++ } ++ } ++ if (!search_new_forks++) ++ goto again; ++} ++ ++#ifdef CONFIG_MIGRATION ++/* Common ksm interface but may be specific to uksm */ ++void ksm_migrate_page(struct page *newpage, struct page *oldpage) ++{ ++ struct stable_node *stable_node; ++ ++ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); ++ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); ++ VM_BUG_ON(newpage->mapping != oldpage->mapping); ++ ++ stable_node = page_stable_node(newpage); ++ if (stable_node) { ++ VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); ++ stable_node->kpfn = page_to_pfn(newpage); ++ /* ++ * newpage->mapping was set in advance; now we need smp_wmb() ++ * to make sure that the new stable_node->kpfn is visible ++ * to get_ksm_page() before it 
can see that oldpage->mapping ++ * has gone stale (or that PageSwapCache has been cleared). ++ */ ++ smp_wmb(); ++ set_page_stable_node(oldpage, NULL); ++ } ++} ++#endif /* CONFIG_MIGRATION */ ++ ++#ifdef CONFIG_MEMORY_HOTREMOVE ++static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn, ++ unsigned long end_pfn) ++{ ++ struct rb_node *node; ++ ++ for (node = rb_first(root_stable_treep); node; node = rb_next(node)) { ++ struct stable_node *stable_node; ++ ++ stable_node = rb_entry(node, struct stable_node, node); ++ if (stable_node->kpfn >= start_pfn && ++ stable_node->kpfn < end_pfn) ++ return stable_node; ++ } ++ return NULL; ++} ++ ++static int uksm_memory_callback(struct notifier_block *self, ++ unsigned long action, void *arg) ++{ ++ struct memory_notify *mn = arg; ++ struct stable_node *stable_node; ++ ++ switch (action) { ++ case MEM_GOING_OFFLINE: ++ /* ++ * Keep it very simple for now: just lock out ksmd and ++ * MADV_UNMERGEABLE while any memory is going offline. ++ * mutex_lock_nested() is necessary because lockdep was alarmed ++ * that here we take uksm_thread_mutex inside notifier chain ++ * mutex, and later take notifier chain mutex inside ++ * uksm_thread_mutex to unlock it. But that's safe because both ++ * are inside mem_hotplug_mutex. ++ */ ++ mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING); ++ break; ++ ++ case MEM_OFFLINE: ++ /* ++ * Most of the work is done by page migration; but there might ++ * be a few stable_nodes left over, still pointing to struct ++ * pages which have been offlined: prune those from the tree. ++ */ ++ while ((stable_node = uksm_check_stable_tree(mn->start_pfn, ++ mn->start_pfn + mn->nr_pages)) != NULL) ++ remove_node_from_stable_tree(stable_node, 1, 1); ++ /* fallthrough */ ++ ++ case MEM_CANCEL_OFFLINE: ++ mutex_unlock(&uksm_thread_mutex); ++ break; ++ } ++ return NOTIFY_OK; ++} ++#endif /* CONFIG_MEMORY_HOTREMOVE */ ++ ++#ifdef CONFIG_SYSFS ++/* ++ * This all compiles without CONFIG_SYSFS, but is a waste of space. 
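++ * All attributes below are registered on mm_kobj under the group name ++ * "uksm", i.e. they appear as /sys/kernel/mm/uksm/<name>.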
++ */ ++ ++#define UKSM_ATTR_RO(_name) \ ++ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) ++#define UKSM_ATTR(_name) \ ++ static struct kobj_attribute _name##_attr = \ ++ __ATTR(_name, 0644, _name##_show, _name##_store) ++ ++static ssize_t max_cpu_percentage_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_max_cpu_percentage); ++} ++ ++static ssize_t max_cpu_percentage_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ unsigned long max_cpu_percentage; ++ int err; ++ ++ err = kstrtoul(buf, 10, &max_cpu_percentage); ++ if (err || max_cpu_percentage > 100) ++ return -EINVAL; ++ ++ if (max_cpu_percentage == 100) ++ max_cpu_percentage = 99; ++ else if (max_cpu_percentage < 10) ++ max_cpu_percentage = 10; ++ ++ uksm_max_cpu_percentage = max_cpu_percentage; ++ ++ return count; ++} ++UKSM_ATTR(max_cpu_percentage); ++ ++static ssize_t sleep_millisecs_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies)); ++} ++ ++static ssize_t sleep_millisecs_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ unsigned long msecs; ++ int err; ++ ++ err = kstrtoul(buf, 10, &msecs); ++ if (err || msecs > MSEC_PER_SEC) ++ return -EINVAL; ++ ++ uksm_sleep_jiffies = msecs_to_jiffies(msecs); ++ uksm_sleep_saved = uksm_sleep_jiffies; ++ ++ return count; ++} ++UKSM_ATTR(sleep_millisecs); ++ ++ ++static ssize_t cpu_governor_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); ++ int i; ++ ++ buf[0] = '\0'; ++ for (i = 0; i < n ; i++) { ++ if (uksm_cpu_governor == i) ++ strcat(buf, "["); ++ ++ strcat(buf, uksm_cpu_governor_str[i]); ++ ++ if (uksm_cpu_governor == i) ++ strcat(buf, "]"); ++ ++ strcat(buf, " "); ++ } ++ strcat(buf, "\n"); ++ ++ return strlen(buf); ++} ++ ++static inline void init_performance_values(void) ++{ ++ int i; ++ struct scan_rung *rung; ++ struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor; ++ ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = uksm_scan_ladder + i; ++ rung->cpu_ratio = preset->cpu_ratio[i]; ++ rung->cover_msecs = preset->cover_msecs[i]; ++ } ++ ++ uksm_max_cpu_percentage = preset->max_cpu; ++} ++ ++static ssize_t cpu_governor_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); ++ ++ for (n--; n >= 0 ; n--) { ++ if (!strncmp(buf, uksm_cpu_governor_str[n], ++ strlen(uksm_cpu_governor_str[n]))) ++ break; ++ } ++ ++ if (n < 0) ++ return -EINVAL; ++ else ++ uksm_cpu_governor = n; ++ ++ init_performance_values(); ++ ++ return count; ++} ++UKSM_ATTR(cpu_governor); ++ ++static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_run); ++} ++ ++static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int err; ++ unsigned long flags; ++ ++ err = kstrtoul(buf, 10, &flags); ++ if (err || flags > UINT_MAX) ++ return -EINVAL; ++ if (flags > UKSM_RUN_MERGE) ++ return -EINVAL; ++ ++ mutex_lock(&uksm_thread_mutex); ++ if (uksm_run != flags) ++ uksm_run = flags; ++ mutex_unlock(&uksm_thread_mutex); ++ ++ if (flags & UKSM_RUN_MERGE) ++ wake_up_interruptible(&uksm_thread_wait); ++ ++ return count; ++} 
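++ /* ++ * Example usage (sysfs, assuming UKSM_RUN_MERGE is bit 0 as defined ++ * earlier in this file): "echo 1 > /sys/kernel/mm/uksm/run" starts the ++ * scanner and "echo 0 > /sys/kernel/mm/uksm/run" stops it. ++ */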
++UKSM_ATTR(run); ++ ++static ssize_t abundant_threshold_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_abundant_threshold); ++} ++ ++static ssize_t abundant_threshold_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int err; ++ unsigned long flags; ++ ++ err = kstrtoul(buf, 10, &flags); ++ if (err || flags > 99) ++ return -EINVAL; ++ ++ uksm_abundant_threshold = flags; ++ ++ return count; ++} ++UKSM_ATTR(abundant_threshold); ++ ++static ssize_t thrash_threshold_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_thrash_threshold); ++} ++ ++static ssize_t thrash_threshold_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int err; ++ unsigned long flags; ++ ++ err = kstrtoul(buf, 10, &flags); ++ if (err || flags > 99) ++ return -EINVAL; ++ ++ uksm_thrash_threshold = flags; ++ ++ return count; ++} ++UKSM_ATTR(thrash_threshold); ++ ++static ssize_t cpu_ratios_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int i, size; ++ struct scan_rung *rung; ++ char *p = buf; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = &uksm_scan_ladder[i]; ++ ++ if (rung->cpu_ratio > 0) ++ size = sprintf(p, "%d ", rung->cpu_ratio); ++ else ++ size = sprintf(p, "MAX/%d ", ++ TIME_RATIO_SCALE / -rung->cpu_ratio); ++ ++ p += size; ++ } ++ ++ *p++ = '\n'; ++ *p = '\0'; ++ ++ return p - buf; ++} ++ ++static ssize_t cpu_ratios_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int i, cpuratios[SCAN_LADDER_SIZE], err; ++ unsigned long value; ++ struct scan_rung *rung; ++ char *p, *mem, *end = NULL; ++ ssize_t ret = count; ++ ++ /* +1 so the zeroed copy is always NUL-terminated for kstrtoul */ ++ p = mem = kzalloc(count + 1, GFP_KERNEL); ++ if (!p) ++ return -ENOMEM; ++ ++ memcpy(p, buf, count); ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ if (i != SCAN_LADDER_SIZE - 1) { ++ end = strchr(p, ' '); ++ if (!end) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ *end = '\0'; ++ } ++ ++ if (strstr(p, "MAX/")) { ++ p = strchr(p, '/') + 1; ++ err = kstrtoul(p, 10, &value); ++ if (err || value > TIME_RATIO_SCALE || !value) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ cpuratios[i] = -(int) (TIME_RATIO_SCALE / value); ++ } else { ++ err = kstrtoul(p, 10, &value); ++ if (err || value > TIME_RATIO_SCALE || !value) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ cpuratios[i] = value; ++ } ++ ++ p = end + 1; ++ } ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = &uksm_scan_ladder[i]; ++ ++ rung->cpu_ratio = cpuratios[i]; ++ } ++ ++out: ++ /* p has been advanced past tokens; free via the saved pointer */ ++ kfree(mem); ++ return ret; ++} ++UKSM_ATTR(cpu_ratios); ++ ++static ssize_t eval_intervals_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int i, size; ++ struct scan_rung *rung; ++ char *p = buf; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = &uksm_scan_ladder[i]; ++ size = sprintf(p, "%u ", rung->cover_msecs); ++ p += size; ++ } ++ ++ *p++ = '\n'; ++ *p = '\0'; ++ ++ return p - buf; ++} ++ ++static ssize_t eval_intervals_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int i, err; ++ unsigned long values[SCAN_LADDER_SIZE]; ++ struct scan_rung *rung; ++ char *p, *mem, *end = NULL; ++ ssize_t ret = count; ++ ++ /* keep the original pointer; p advances past each token below */ ++ p = mem = kzalloc(count + 2, GFP_KERNEL); ++ if (!p) ++ return -ENOMEM; ++ ++ memcpy(p, buf, count); ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ if (i != SCAN_LADDER_SIZE - 1) { ++ end = strchr(p, ' '); ++ if (!end) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ *end = '\0'; ++ } ++ ++ err =
kstrtoul(p, 10, &values[i]); ++ if (err) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ p = end + 1; ++ } ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = &uksm_scan_ladder[i]; ++ ++ rung->cover_msecs = values[i]; ++ } ++ ++out: ++ kfree(mem); ++ return ret; ++} ++UKSM_ATTR(eval_intervals); ++ ++static ssize_t ema_per_page_time_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", uksm_ema_page_time); ++} ++UKSM_ATTR_RO(ema_per_page_time); ++ ++static ssize_t pages_shared_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", uksm_pages_shared); ++} ++UKSM_ATTR_RO(pages_shared); ++ ++static ssize_t pages_sharing_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", uksm_pages_sharing); ++} ++UKSM_ATTR_RO(pages_sharing); ++ ++static ssize_t pages_unshared_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", uksm_pages_unshared); ++} ++UKSM_ATTR_RO(pages_unshared); ++ ++static ssize_t full_scans_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%llu\n", fully_scanned_round); ++} ++UKSM_ATTR_RO(full_scans); ++ ++static ssize_t pages_scanned_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ unsigned long base = 0; ++ u64 delta, ret; ++ ++ if (pages_scanned_stored) { ++ base = pages_scanned_base; ++ ret = pages_scanned_stored; ++ delta = uksm_pages_scanned >> base; ++ if (CAN_OVERFLOW_U64(ret, delta)) { ++ ret >>= 1; ++ delta >>= 1; ++ base++; ++ ret += delta; ++ } ++ } else { ++ ret = uksm_pages_scanned; ++ } ++ ++ while (ret > ULONG_MAX) { ++ ret >>= 1; ++ base++; ++ } ++ ++ if (base) ++ return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base); ++ else ++ return sprintf(buf, "%lu\n", (unsigned long)ret); ++} ++UKSM_ATTR_RO(pages_scanned); ++ ++static ssize_t hash_strength_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", hash_strength); ++} ++UKSM_ATTR_RO(hash_strength); ++ ++static ssize_t sleep_times_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%llu\n", uksm_sleep_times); ++} ++UKSM_ATTR_RO(sleep_times); ++ ++ ++static struct attribute *uksm_attrs[] = { ++ &max_cpu_percentage_attr.attr, ++ &sleep_millisecs_attr.attr, ++ &cpu_governor_attr.attr, ++ &run_attr.attr, ++ &ema_per_page_time_attr.attr, ++ &pages_shared_attr.attr, ++ &pages_sharing_attr.attr, ++ &pages_unshared_attr.attr, ++ &full_scans_attr.attr, ++ &pages_scanned_attr.attr, ++ &hash_strength_attr.attr, ++ &sleep_times_attr.attr, ++ &thrash_threshold_attr.attr, ++ &abundant_threshold_attr.attr, ++ &cpu_ratios_attr.attr, ++ &eval_intervals_attr.attr, ++ NULL, ++}; ++ ++static struct attribute_group uksm_attr_group = { ++ .attrs = uksm_attrs, ++ .name = "uksm", ++}; ++#endif /* CONFIG_SYSFS */ ++ ++static inline void init_scan_ladder(void) ++{ ++ int i; ++ struct scan_rung *rung; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = uksm_scan_ladder + i; ++ slot_tree_init_root(&rung->vma_root); ++ } ++ ++ init_performance_values(); ++ uksm_calc_scan_pages(); ++} ++ ++static inline int cal_positive_negative_costs(void) ++{ ++ struct page *p1, *p2; ++ unsigned char *addr1, *addr2; ++ unsigned long i, time_start, hash_cost; ++ unsigned long loopnum = 0; ++ ++ /* IMPORTANT: volatile is needed to prevent over-optimization by gcc.
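++ * (What follows is a small self-calibration: hash one page repeatedly, ++ * then memcmp two pages that differ only in their last byte the same ++ * number of times, and express the relative cost as memcmp_cost in ++ * hash-strength units.)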
*/ ++ volatile u32 hash; ++ volatile int ret; ++ ++ p1 = alloc_page(GFP_KERNEL); ++ if (!p1) ++ return -ENOMEM; ++ ++ p2 = alloc_page(GFP_KERNEL); ++ if (!p2) ++ return -ENOMEM; ++ ++ addr1 = kmap_atomic(p1); ++ addr2 = kmap_atomic(p2); ++ memset(addr1, prandom_u32(), PAGE_SIZE); ++ memcpy(addr2, addr1, PAGE_SIZE); ++ ++ /* make sure that the two pages differ in last byte */ ++ addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1]; ++ kunmap_atomic(addr2); ++ kunmap_atomic(addr1); ++ ++ time_start = jiffies; ++ while (jiffies - time_start < 100) { ++ for (i = 0; i < 100; i++) ++ hash = page_hash(p1, HASH_STRENGTH_FULL, 0); ++ loopnum += 100; ++ } ++ hash_cost = (jiffies - time_start); ++ ++ time_start = jiffies; ++ for (i = 0; i < loopnum; i++) ++ ret = pages_identical(p1, p2); ++ memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start); ++ memcmp_cost /= hash_cost; ++ pr_info("UKSM: relative memcmp_cost = %lu " ++ "hash=%u cmp_ret=%d.\n", ++ memcmp_cost, hash, ret); ++ ++ __free_page(p1); ++ __free_page(p2); ++ return 0; ++} ++ ++static int init_zeropage_hash_table(void) ++{ ++ struct page *page; ++ char *addr; ++ int i; ++ ++ page = alloc_page(GFP_KERNEL); ++ if (!page) ++ return -ENOMEM; ++ ++ addr = kmap_atomic(page); ++ memset(addr, 0, PAGE_SIZE); ++ kunmap_atomic(addr); ++ ++ zero_hash_table = kmalloc_array(HASH_STRENGTH_MAX, sizeof(u32), ++ GFP_KERNEL); ++ if (!zero_hash_table) ++ return -ENOMEM; ++ ++ for (i = 0; i < HASH_STRENGTH_MAX; i++) ++ zero_hash_table[i] = page_hash(page, i, 0); ++ ++ __free_page(page); ++ ++ return 0; ++} ++ ++static inline int init_random_sampling(void) ++{ ++ unsigned long i; ++ ++ random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL); ++ if (!random_nums) ++ return -ENOMEM; ++ ++ for (i = 0; i < HASH_STRENGTH_FULL; i++) ++ random_nums[i] = i; ++ ++ for (i = 0; i < HASH_STRENGTH_FULL; i++) { ++ unsigned long rand_range, swap_index, tmp; ++ ++ rand_range = HASH_STRENGTH_FULL - i; ++ swap_index = i + prandom_u32() % rand_range; ++ tmp = random_nums[i]; ++ random_nums[i] = random_nums[swap_index]; ++ random_nums[swap_index] = tmp; ++ } ++ ++ rshash_state.state = RSHASH_NEW; ++ rshash_state.below_count = 0; ++ rshash_state.lookup_window_index = 0; ++ ++ return cal_positive_negative_costs(); ++} ++ ++static int __init uksm_slab_init(void) ++{ ++ rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0); ++ if (!rmap_item_cache) ++ goto out; ++ ++ stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0); ++ if (!stable_node_cache) ++ goto out_free1; ++ ++ node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0); ++ if (!node_vma_cache) ++ goto out_free2; ++ ++ vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0); ++ if (!vma_slot_cache) ++ goto out_free3; ++ ++ tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0); ++ if (!tree_node_cache) ++ goto out_free4; ++ ++ return 0; ++ ++out_free4: ++ kmem_cache_destroy(vma_slot_cache); ++out_free3: ++ kmem_cache_destroy(node_vma_cache); ++out_free2: ++ kmem_cache_destroy(stable_node_cache); ++out_free1: ++ kmem_cache_destroy(rmap_item_cache); ++out: ++ return -ENOMEM; ++} ++ ++static void __init uksm_slab_free(void) ++{ ++ kmem_cache_destroy(stable_node_cache); ++ kmem_cache_destroy(rmap_item_cache); ++ kmem_cache_destroy(node_vma_cache); ++ kmem_cache_destroy(vma_slot_cache); ++ kmem_cache_destroy(tree_node_cache); ++} ++ ++/* Common interface to ksm, different to it. 
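++ * MADV_MERGEABLE is deliberately ignored below: uksm scans all eligible ++ * anonymous VMAs without requiring madvise() opt-in, so only ++ * MADV_UNMERGEABLE needs real work here.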
*/ ++int ksm_madvise(struct vm_area_struct *vma, unsigned long start, ++ unsigned long end, int advice, unsigned long *vm_flags) ++{ ++ int err; ++ ++ switch (advice) { ++ case MADV_MERGEABLE: ++ return 0; /* just ignore the advice */ ++ ++ case MADV_UNMERGEABLE: ++ if (!(*vm_flags & VM_MERGEABLE) || !uksm_flags_can_scan(*vm_flags)) ++ return 0; /* just ignore the advice */ ++ ++ if (vma->anon_vma) { ++ err = unmerge_uksm_pages(vma, start, end); ++ if (err) ++ return err; ++ } ++ ++ uksm_remove_vma(vma); ++ *vm_flags &= ~VM_MERGEABLE; ++ break; ++ } ++ ++ return 0; ++} ++ ++/* Common interface to ksm, actually the same. */ ++struct page *ksm_might_need_to_copy(struct page *page, ++ struct vm_area_struct *vma, unsigned long address) ++{ ++ struct anon_vma *anon_vma = page_anon_vma(page); ++ struct page *new_page; ++ ++ if (PageKsm(page)) { ++ if (page_stable_node(page)) ++ return page; /* no need to copy it */ ++ } else if (!anon_vma) { ++ return page; /* no need to copy it */ ++ } else if (anon_vma->root == vma->anon_vma->root && ++ page->index == linear_page_index(vma, address)) { ++ return page; /* still no need to copy it */ ++ } ++ if (!PageUptodate(page)) ++ return page; /* let do_swap_page report the error */ ++ ++ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); ++ if (new_page) { ++ copy_user_highpage(new_page, page, address, vma); ++ ++ SetPageDirty(new_page); ++ __SetPageUptodate(new_page); ++ __SetPageLocked(new_page); ++ } ++ ++ return new_page; ++} ++ ++static int __init uksm_init(void) ++{ ++ struct task_struct *uksm_thread; ++ int err; ++ ++ uksm_sleep_jiffies = msecs_to_jiffies(100); ++ uksm_sleep_saved = uksm_sleep_jiffies; ++ ++ slot_tree_init(); ++ init_scan_ladder(); ++ ++ ++ err = init_random_sampling(); ++ if (err) ++ goto out_free2; ++ ++ err = uksm_slab_init(); ++ if (err) ++ goto out_free1; ++ ++ err = init_zeropage_hash_table(); ++ if (err) ++ goto out_free0; ++ ++ uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd"); ++ if (IS_ERR(uksm_thread)) { ++ pr_err("uksm: creating kthread failed\n"); ++ err = PTR_ERR(uksm_thread); ++ goto out_free; ++ } ++ ++#ifdef CONFIG_SYSFS ++ err = sysfs_create_group(mm_kobj, &uksm_attr_group); ++ if (err) { ++ pr_err("uksm: register sysfs failed\n"); ++ kthread_stop(uksm_thread); ++ goto out_free; ++ } ++#else ++ uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */ ++ ++#endif /* CONFIG_SYSFS */ ++ ++#ifdef CONFIG_MEMORY_HOTREMOVE ++ /* ++ * Choose a high priority since the callback takes uksm_thread_mutex: ++ * later callbacks could only be taking locks which nest within that. 
++ */ ++ hotplug_memory_notifier(uksm_memory_callback, 100); ++#endif ++ return 0; ++ ++out_free: ++ kfree(zero_hash_table); ++out_free0: ++ uksm_slab_free(); ++out_free1: ++ kfree(random_nums); ++out_free2: ++ kfree(uksm_scan_ladder); ++ return err; ++} ++ ++#ifdef MODULE ++subsys_initcall(ksm_init); ++#else ++late_initcall(uksm_init); ++#endif ++ +diff -Nur a/mm/vmstat.c b/mm/vmstat.c +--- a/mm/vmstat.c 2018-05-25 15:18:02.000000000 +0100 ++++ b/mm/vmstat.c 2018-05-26 19:30:55.791140570 +0100 +@@ -1091,6 +1091,9 @@ + "nr_dirtied", + "nr_written", + ++#ifdef CONFIG_UKSM ++ "nr_uksm_zero_pages", ++#endif + /* enum writeback_stat_item counters */ + "nr_dirty_threshold", + "nr_dirty_background_threshold", diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-0001-MultiQueue-Skiplist-Scheduler-version-v0.180-linux-hardened.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-0001-MultiQueue-Skiplist-Scheduler-version-v0.180-linux-hardened.patch new file mode 100644 index 00000000..ee298f6a --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-0001-MultiQueue-Skiplist-Scheduler-version-v0.180-linux-hardened.patch @@ -0,0 +1,10305 @@ +diff -Nur a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +--- a/arch/powerpc/platforms/cell/spufs/sched.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/arch/powerpc/platforms/cell/spufs/sched.c 2019-02-09 17:46:11.991297545 +0000 +@@ -65,11 +65,6 @@ + static struct timer_list spuloadavg_timer; + + /* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- +-/* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. + */ +diff -Nur a/arch/x86/Kconfig b/arch/x86/Kconfig +--- a/arch/x86/Kconfig 2019-02-09 17:20:30.461820549 +0000 ++++ b/arch/x86/Kconfig 2019-02-09 17:51:10.780941815 +0000 +@@ -1003,6 +1003,20 @@ + config SCHED_SMT + def_bool y if SMP + ++config SMT_NICE ++ bool "SMT (Hyperthreading) aware nice priority and policy support" ++ depends on SCHED_MUQSS && SCHED_SMT ++ default y ++ ---help--- ++ Enabling Hyperthreading on Intel CPUs decreases the effectiveness ++ of the use of 'nice' levels and different scheduling policies ++ (e.g. realtime) due to sharing of CPU power between hyperthreads. ++ SMT nice support makes each logical CPU aware of what is running on ++ its hyperthread siblings, maintaining appropriate distribution of ++ CPU according to nice levels and scheduling policies at the expense ++ of slightly increased overhead. ++ If unsure say Y here. ++ + config SCHED_MC + def_bool y + prompt "Multi-core scheduler support" +@@ -1033,6 +1047,80 @@ + + If unsure say Y here. + ++choice ++ prompt "CPU scheduler runqueue sharing" ++ default RQ_MC if SCHED_MUQSS ++ default RQ_NONE ++ ++config RQ_NONE ++ bool "No sharing" ++ help ++ This is the default behaviour where the CPU scheduler has one runqueue ++ per CPU, whether it is a physical or logical CPU (hyperthread). ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=none ++ ++ If unsure, say N. ++ ++config RQ_SMT ++ bool "SMT (hyperthread) siblings" ++ depends on SCHED_SMT && SCHED_MUQSS ++ ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by SMT (hyperthread) siblings. As these logical cores share ++ one physical core, sharing the runqueue resource can lead to decreased ++ overhead, lower latency and higher throughput. 
++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=smt ++ ++ If unsure, say N. ++ ++config RQ_MC ++ bool "Multicore siblings" ++ depends on SCHED_MC && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by multicore siblings in addition to any SMT siblings. ++ As these physical cores share caches, sharing the runqueue resource ++ will lead to lower latency, but its effects on overhead and throughput ++ are less predictable. As a general rule, 6 or fewer cores will likely ++ benefit from this, while larger CPUs will only derive a latency ++ benefit. If your workloads are primarily single threaded, this will ++ possibly worsen throughput. If you are only concerned about latency ++ then enable this regardless of how many cores you have. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=mc ++ ++ If unsure, say Y. ++ ++config RQ_SMP ++ bool "Symmetric Multi-Processing" ++ depends on SMP && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by all physical CPUs unless they are on separate NUMA nodes. ++ As physical CPUs usually do not share resources, sharing the runqueue ++ will normally worsen throughput but improve latency. If you only ++ care about latency enable this. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=smp ++ ++ If unsure, say N. ++endchoice ++ ++config SHARERQ ++ int ++ default 0 if RQ_NONE ++ default 1 if RQ_SMT ++ default 2 if RQ_MC ++ default 3 if RQ_SMP ++ ++ + config UP_LATE_INIT + def_bool y + depends on !SMP && X86_LOCAL_APIC +diff -Nur a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +--- a/Documentation/admin-guide/kernel-parameters.txt 2019-02-09 17:20:30.451820228 +0000 ++++ b/Documentation/admin-guide/kernel-parameters.txt 2019-02-09 17:46:11.981297222 +0000 +@@ -4003,6 +4003,14 @@ + Memory area to be used by remote processor image, + managed by CMA. + ++ rqshare= [X86] Select the MuQSS scheduler runqueue sharing type. ++ Format: ++ smt -- Share SMT (hyperthread) sibling runqueues ++ mc -- Share MC (multicore) sibling runqueues ++ smp -- Share SMP runqueues ++ none -- Do not share any runqueues ++ Default value is mc ++ + rw [KNL] Mount root device read-write on boot + + S [KNL] Run init in single mode +diff -Nur a/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt +--- a/Documentation/scheduler/sched-BFS.txt 1970-01-01 01:00:00.000000000 +0100 ++++ b/Documentation/scheduler/sched-BFS.txt 2019-02-09 17:46:11.981297222 +0000 +@@ -0,0 +1,351 @@ ++BFS - The Brain Fuck Scheduler by Con Kolivas. ++ ++Goals. ++ ++The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to ++completely do away with the complex designs of the past for the cpu process ++scheduler and instead implement one that is very simple in basic design. ++The main focus of BFS is to achieve excellent desktop interactivity and ++responsiveness without heuristics and tuning knobs that are difficult to ++understand, impossible to model and predict the effect of, and when tuned to ++one workload cause massive detriment to another. ++ ++ ++Design summary. ++ ++BFS is best described as a single runqueue, O(n) lookup, earliest effective ++virtual deadline first design, loosely based on EEVDF (earliest eligible virtual ++deadline first) and my previous Staircase Deadline scheduler.
Each component ++shall be described in order to understand the significance of, and reasoning for ++it. The codebase when the first stable version was released was approximately ++9000 lines less code than the existing mainline linux kernel scheduler (in ++2.6.31). This does not even take into account the removal of documentation and ++the cgroups code that is not used. ++ ++Design reasoning. ++ ++The single runqueue refers to the queued but not running processes for the ++entire system, regardless of the number of CPUs. The reason for going back to ++a single runqueue design is that once multiple runqueues are introduced, ++per-CPU or otherwise, there will be complex interactions as each runqueue will ++be responsible for the scheduling latency and fairness of the tasks only on its ++own runqueue, and to achieve fairness and low latency across multiple CPUs, any ++advantage in throughput of having CPU local tasks causes other disadvantages. ++This is due to requiring a very complex balancing system to at best achieve some ++semblance of fairness across CPUs and can only maintain relatively low latency ++for tasks bound to the same CPUs, not across them. To increase said fairness ++and latency across CPUs, the advantage of local runqueue locking, which makes ++for better scalability, is lost due to having to grab multiple locks. ++ ++A significant feature of BFS is that all accounting is done purely based on CPU ++used and nowhere is sleep time used in any way to determine entitlement or ++interactivity. Interactivity "estimators" that use some kind of sleep/run ++algorithm are doomed to fail to detect all interactive tasks, and to falsely tag ++tasks that aren't interactive as being so. The reason for this is that it is ++close to impossible to determine that when a task is sleeping, whether it is ++doing it voluntarily, as in a userspace application waiting for input in the ++form of a mouse click or otherwise, or involuntarily, because it is waiting for ++another thread, process, I/O, kernel activity or whatever. Thus, such an ++estimator will introduce corner cases, and more heuristics will be required to ++cope with those corner cases, introducing more corner cases and failed ++interactivity detection and so on. Interactivity in BFS is built into the design ++by virtue of the fact that tasks that are waking up have not used up their quota ++of CPU time, and have earlier effective deadlines, thereby making it very likely ++they will preempt any CPU bound task of equivalent nice level. See below for ++more information on the virtual deadline mechanism. Even if they do not preempt ++a running task, because the rr interval is guaranteed to have a bound upper ++limit on how long a task will wait for, it will be scheduled within a timeframe ++that will not cause visible interface jitter. ++ ++ ++Design details. ++ ++Task insertion. ++ ++BFS inserts tasks into each relevant queue as an O(1) insertion into a double ++linked list. On insertion, *every* running queue is checked to see if the newly ++queued task can run on any idle queue, or preempt the lowest running task on the ++system. This is how the cross-CPU scheduling of BFS achieves significantly lower ++latency per extra CPU the system has. In this case the lookup is, in the worst ++case scenario, O(n) where n is the number of CPUs on the system. ++ ++Data protection. ++ ++BFS has one single lock protecting the process local data of every task in the ++global queue. 
Thus every insertion, removal and modification of task data in the ++global runqueue needs to grab the global lock. However, once a task is taken by ++a CPU, the CPU has its own local data copy of the running process' accounting ++information which only that CPU accesses and modifies (such as during a ++timer tick) thus allowing the accounting data to be updated lockless. Once a ++CPU has taken a task to run, it removes it from the global queue. Thus the ++global queue only ever has, at most, ++ ++ (number of tasks requesting cpu time) - (number of logical CPUs) + 1 ++ ++tasks in the global queue. This value is relevant for the time taken to look up ++tasks during scheduling. This will increase if many tasks with CPU affinity set ++in their policy to limit which CPUs they're allowed to run on if they outnumber ++the number of CPUs. The +1 is because when rescheduling a task, the CPU's ++currently running task is put back on the queue. Lookup will be described after ++the virtual deadline mechanism is explained. ++ ++Virtual deadline. ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in BFS is entirely in the virtual deadline mechanism. The one ++tunable in BFS is the rr_interval, or "round robin interval". This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in jiffies by this equation: ++ ++ jiffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. There are three components to ++how a task is next chosen. First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. Once a task is descheduled, it is put back on the queue, and an ++O(n) lookup of all queued-but-not-running tasks is done to determine which has ++the earliest deadline and that task is chosen to receive CPU next. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (jiffies) is ++constantly moving. ++ ++Task lookup. ++ ++BFS has 103 priority queues. 
100 of these are dedicated to the static priority ++of realtime tasks, and the remaining 3 are, in order of best to worst priority, ++SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority ++scheduling). When a task of these priorities is queued, a bitmap of running ++priorities is set showing which of these priorities has tasks waiting for CPU ++time. When a CPU is made to reschedule, the lookup for the next task to get ++CPU time is performed in the following way: ++ ++First the bitmap is checked to see what static priority tasks are queued. If ++any realtime priorities are found, the corresponding queue is checked and the ++first task listed there is taken (provided CPU affinity is suitable) and lookup ++is complete. If the priority corresponds to a SCHED_ISO task, they are also ++taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds ++to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this ++stage, every task in the runlist that corresponds to that priority is checked ++to see which has the earliest set deadline, and (provided it has suitable CPU ++affinity) it is taken off the runqueue and given the CPU. If a task has an ++expired deadline, it is taken and the rest of the lookup aborted (as they are ++chosen in FIFO order). ++ ++Thus, the lookup is O(n) in the worst case only, where n is as described ++earlier, as tasks may be chosen before the whole task list is looked over. ++ ++ ++Scalability. ++ ++The major limitations of BFS will be that of scalability, as the separate ++runqueue designs will have less lock contention as the number of CPUs rises. ++However they do not scale linearly even with separate runqueues as multiple ++runqueues will need to be locked concurrently on such designs to be able to ++achieve fair CPU balancing, to try and achieve some sort of nice-level fairness ++across CPUs, and to achieve low enough latency for tasks on a busy CPU when ++other CPUs would be more suited. BFS has the advantage that it requires no ++balancing algorithm whatsoever, as balancing occurs by proxy simply because ++all CPUs draw off the global runqueue, in priority and deadline order. Despite ++the fact that scalability is _not_ the prime concern of BFS, it both shows very ++good scalability to smaller numbers of CPUs and is likely a more scalable design ++at these numbers of CPUs. ++ ++It also has some very low overhead scalability features built into the design ++when it has been deemed their overhead is so marginal that they're worth adding. ++The first is the local copy of the running process' data to the CPU it's running ++on to allow that data to be updated lockless where possible. Then there is ++deference paid to the last CPU a task was running on, by trying that CPU first ++when looking for an idle CPU to use the next time it's scheduled. Finally there ++is the notion of cache locality beyond the last running CPU. The sched_domains ++information is used to determine the relative virtual "cache distance" that ++other CPUs have from the last CPU a task was running on. CPUs with shared ++caches, such as SMT siblings, or multicore CPUs with shared caches, are treated ++as cache local. CPUs without shared caches are treated as not cache local, and ++CPUs on different NUMA nodes are treated as very distant. This "relative cache ++distance" is used by modifying the virtual deadline value when doing lookups. 
++Effectively, the deadline is unaltered between "cache local" CPUs, doubled for ++"cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning ++behind the doubling of deadlines is as follows. The real cost of migrating a ++task from one CPU to another is entirely dependent on the cache footprint of ++the task, how cache intensive the task is, how long it's been running on that ++CPU to take up the bulk of its cache, how big the CPU cache is, how fast and ++how layered the CPU cache is, how fast a context switch is... and so on. In ++other words, it's close to random in the real world where we do more than just ++one sole workload. The only thing we can be sure of is that it's not free. So ++BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs ++is more important than cache locality, and cache locality only plays a part ++after that. Doubling the effective deadline is based on the premise that the ++"cache local" CPUs will tend to work on the same tasks up to double the number ++of cache local CPUs, and once the workload is beyond that amount, it is likely ++that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA ++is a value I pulled out of my arse. ++ ++When choosing an idle CPU for a waking task, the cache locality is determined ++according to where the task last ran and then idle CPUs are ranked from best ++to worst to choose the most suitable idle CPU based on cache locality, NUMA ++node locality and hyperthread sibling busyness. They are chosen in the ++following preference (if idle): ++ ++* Same core, idle or busy cache, idle threads ++* Other core, same cache, idle or busy cache, idle threads. ++* Same node, other CPU, idle cache, idle threads. ++* Same node, other CPU, busy cache, idle threads. ++* Same core, busy threads. ++* Other core, same cache, busy threads. ++* Same node, other CPU, busy threads. ++* Other node, other CPU, idle cache, idle threads. ++* Other node, other CPU, busy cache, idle threads. ++* Other node, other CPU, busy threads. ++ ++This shows the SMT or "hyperthread" awareness in the design as well which will ++choose a real idle core first before a logical SMT sibling which already has ++tasks on the physical CPU. ++ ++Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark. ++However this benchmarking was performed on an earlier design that was far less ++scalable than the current one so it's hard to know how scalable it is in terms ++of both CPUs (due to the global runqueue) and heavily loaded machines (due to ++O(n) lookup) at this stage. Note that in terms of scalability, the number of ++_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x) ++quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark ++results are very promising indeed, without needing to tweak any knobs, features ++or options. Benchmark contributions are most welcome. ++ ++ ++Features ++ ++As the initial prime target audience for BFS was the average desktop user, it ++was designed to not need tweaking, tuning or have features set to obtain benefit ++from it. Thus the number of knobs and features has been kept to an absolute ++minimum and should not require extra user input for the vast majority of cases. ++There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval ++and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition ++to this, BFS also uses sub-tick accounting.
What BFS does _not_ now feature is ++support for CGROUPS. The average user should neither need to know what these ++are, nor should they need to be using them to have good desktop behaviour. ++ ++rr_interval ++ ++There is only one "scheduler" tunable, the round robin interval. This can be ++accessed in ++ ++ /proc/sys/kernel/rr_interval ++ ++The value is in milliseconds, and the default value is set to 6 on a ++uniprocessor machine, and automatically set to a progressively higher value on ++multiprocessor machines. The reasoning behind increasing the value on more CPUs ++is that the effective latency is decreased by virtue of there being more CPUs on ++BFS (for reasons explained above), and increasing the value allows for less ++cache contention and more throughput. Valid values are from 1 to 1000. ++Decreasing the value will decrease latencies at the cost of decreasing ++throughput, while increasing it will improve throughput, but at the cost of ++worsening latencies. The accuracy of the rr interval is limited by HZ resolution ++of the kernel configuration. Thus, the worst case latencies are usually slightly ++higher than this actual value. The default value of 6 is not an arbitrary one. ++It is based on the fact that humans can detect jitter at approximately 7ms, so ++aiming for much lower latencies is pointless under most circumstances. It is ++worth noting this fact when comparing the latency performance of BFS to other ++schedulers. Worst case latencies being higher than 7ms are far worse than ++average latencies not being in the microsecond range. ++ ++Isochronous scheduling. ++ ++Isochronous scheduling is a unique scheduling policy designed to provide ++near-real-time performance to unprivileged (ie non-root) users without the ++ability to starve the machine indefinitely. Isochronous tasks (which means ++"same time") are set using, for example, the schedtool application like so: ++ ++ schedtool -I -e amarok ++ ++This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works ++is that it has a priority level between true realtime tasks and SCHED_NORMAL ++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, ++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval ++rate). However if ISO tasks run for more than a tunable finite amount of time, ++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of ++time is the percentage of _total CPU_ available across the machine, configurable ++as a percentage in the following "resource handling" tunable (as opposed to a ++scheduler tunable): ++ ++ /proc/sys/kernel/iso_cpu ++ ++and is set to 70% by default. It is calculated over a rolling 5 second average. ++Because it is the total CPU available, it means that on a multi CPU machine, it ++is possible to have an ISO task running as realtime scheduling indefinitely on ++just one CPU, as the other CPUs will be available. Setting this to 100 is the ++equivalent of giving all users SCHED_RR access and setting it to 0 removes the ++ability to run any pseudo-realtime tasks. ++ ++A feature of BFS is that it detects when an application tries to obtain a ++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the ++appropriate privileges to use those policies. When it detects this, it will ++give the task SCHED_ISO policy instead. Thus it is transparent to the user.
++Because some applications constantly set their policy as well as their nice ++level, there is potential for them to undo the override specified by the user ++on the command line of setting the policy to SCHED_ISO. To counter this, once ++a task has been set to SCHED_ISO policy, it needs superuser privileges to set ++it back to SCHED_NORMAL. This will ensure the task remains ISO and all child ++processes and threads will also inherit the ISO policy. ++ ++Idleprio scheduling. ++ ++Idleprio scheduling is a scheduling policy designed to give out CPU to a task ++_only_ when the CPU would be otherwise idle. The idea behind this is to allow ++ultra low priority tasks to be run in the background that have virtually no ++effect on the foreground tasks. This is ideally suited to distributed computing ++clients (like setiathome, folding, mprime etc) but can also be used to start ++a video encode or so on without any slowdown of other tasks. To avoid this ++policy from grabbing shared resources and holding them indefinitely, if it ++detects a state where the task is waiting on I/O, the machine is about to ++suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As ++per the Isochronous task management, once a task has been scheduled as IDLEPRIO, ++it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can ++be set to start as SCHED_IDLEPRIO with the schedtool command like so: ++ ++ schedtool -D -e ./mprime ++ ++Subtick accounting. ++ ++It is surprisingly difficult to get accurate CPU accounting, and in many cases, ++the accounting is done by simply determining what is happening at the precise ++moment a timer tick fires off. This becomes increasingly inaccurate as the ++timer tick frequency (HZ) is lowered. It is possible to create an application ++which uses almost 100% CPU, yet by being descheduled at the right time, records ++zero CPU usage. While the main problem with this is that there are possible ++security implications, it is also difficult to determine how much CPU a task ++really does use. BFS tries to use the sub-tick accounting from the TSC clock, ++where possible, to determine real CPU usage. This is not entirely reliable, but ++is far more likely to produce accurate CPU usage data than the existing designs ++and will not show tasks as consuming no CPU usage when they actually are. Thus, ++the amount of CPU reported as being used by BFS will more accurately represent ++how much CPU the task itself is using (as is shown for example by the 'time' ++application), so the reported values may be quite different to other schedulers. ++Values reported as the 'load' are more prone to problems with this design, but ++per process values are closer to real usage. When comparing throughput of BFS ++to other designs, it is important to compare the actual completed work in terms ++of total wall clock time taken and total work done, rather than the reported ++"cpu usage". ++ ++ ++Con Kolivas Fri Aug 27 2010 +diff -Nur a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt +--- a/Documentation/scheduler/sched-MuQSS.txt 1970-01-01 01:00:00.000000000 +0100 ++++ b/Documentation/scheduler/sched-MuQSS.txt 2019-02-09 17:46:11.991297545 +0000 +@@ -0,0 +1,373 @@ ++MuQSS - The Multiple Queue Skiplist Scheduler by Con Kolivas. ++ ++MuQSS is a per-cpu runqueue variant of the original BFS scheduler with ++one 8 level skiplist per runqueue, and fine grained locking for much more ++scalability. ++ ++ ++Goals. 
++ ++The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from ++here on (pronounced mux) is to completely do away with the complex designs of ++the past for the cpu process scheduler and instead implement one that is very ++simple in basic design. The main focus of MuQSS is to achieve excellent desktop ++interactivity and responsiveness without heuristics and tuning knobs that are ++difficult to understand, impossible to model and predict the effect of, and when ++tuned to one workload cause massive detriment to another, while still being ++scalable to many CPUs and processes. ++ ++ ++Design summary. ++ ++MuQSS is best described as per-cpu multiple runqueue, O(log n) insertion, O(1) ++lookup, earliest effective virtual deadline first tickless design, loosely based ++on EEVDF (earliest eligible virtual deadline first) and my previous Staircase ++Deadline scheduler, and evolved from the single runqueue O(n) BFS scheduler. ++Each component shall be described in order to understand the significance of, ++and reasoning for it. ++ ++ ++Design reasoning. ++ ++In BFS, the use of a single runqueue across all CPUs meant that each CPU would ++need to scan the entire runqueue looking for the process with the earliest ++deadline and schedule that next, regardless of which CPU it originally came ++from. This made BFS deterministic with respect to latency and provided ++guaranteed latencies dependent on number of processes and CPUs. The single ++runqueue, however, meant that all CPUs would compete for the single lock ++protecting it, which would lead to increasing lock contention as the number of ++CPUs rose and appeared to limit scalability of common workloads beyond 16 ++logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously ++increased overhead proportionate to the number of queued processes and led to ++cache thrashing while iterating over the linked list. ++ ++MuQSS is an evolution of BFS, designed to maintain the same scheduling ++decision mechanism and be virtually deterministic without relying on the ++constrained design of the single runqueue by splitting out the single runqueue ++to be per-CPU and use skiplists instead of linked lists. ++ ++The original reason for going back to a single runqueue design for BFS was that ++once multiple runqueues are introduced, per-CPU or otherwise, there will be ++complex interactions as each runqueue will be responsible for the scheduling ++latency and fairness of the tasks only on its own runqueue, and to achieve ++fairness and low latency across multiple CPUs, any advantage in throughput of ++having CPU local tasks causes other disadvantages. This is due to requiring a ++very complex balancing system to at best achieve some semblance of fairness ++across CPUs and can only maintain relatively low latency for tasks bound to the ++same CPUs, not across them. To increase said fairness and latency across CPUs, ++the advantage of local runqueue locking, which makes for better scalability, is ++lost due to having to grab multiple locks. ++ ++MuQSS works around the problems inherent in multiple runqueue designs by ++making its skip lists priority ordered and through novel use of lockless ++examination of each other runqueue it can decide if it should take the earliest ++deadline task from another runqueue for latency reasons, or for CPU balancing ++reasons.
It still does not have a balancing system, choosing to allow the ++next task scheduling decision and task wakeup CPU choice to allow balancing to ++happen by virtue of its choices. ++ ++As a further evolution of the design, MuQSS normally configures sharing of ++runqueues in a logical fashion for when CPU resources are shared for improved ++latency and throughput. By default it shares runqueues and locks between ++multicore siblings. Optionally it can be configured to run with sharing of ++SMT siblings only, all SMP packages or no sharing at all. Additionally it can ++be selected at boot time. ++ ++ ++Design details. ++ ++Custom skip list implementation: ++ ++To avoid the overhead of building up and tearing down skip list structures, ++the variant used by MuQSS has a number of optimisations making it specific for ++its use case in the scheduler. It uses static arrays of 8 'levels' instead of ++building up and tearing down structures dynamically. This makes each runqueue ++only scale O(log N) up to 64k tasks. However as there is one runqueue per CPU ++it means that it scales O(log N) up to 64k x number of logical CPUs which is ++far beyond the realistic task limits each CPU could handle. By being 8 levels ++it also makes the array exactly one cacheline in size. Additionally, each ++skip list node is bidirectional making insertion and removal amortised O(1), ++being O(k) where k is 1-8. Uniquely, we are only ever interested in the very ++first entry in each list at all times with MuQSS, so there is never a need to ++do a search and thus look up is always O(1). In interactive mode, the queues ++will be searched beyond their first entry if the first task is not suitable ++for affinity or SMT nice reasons. ++ ++Task insertion: ++ ++MuQSS inserts tasks into a per CPU runqueue as an O(log N) insertion into ++a custom skip list as described above (based on the original design by William ++Pugh). Insertion is ordered in such a way that there is never a need to do a ++search by ordering tasks according to static priority primarily, and then ++virtual deadline at the time of insertion. ++ ++Niffies: ++ ++Niffies are a monotonic forward moving timer not unlike the "jiffies" but are ++of nanosecond resolution. Niffies are calculated per-runqueue from the high ++resolution TSC timers, and in order to maintain fairness are synchronised ++between CPUs whenever both runqueues are locked concurrently. ++ ++Virtual deadline: ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in MuQSS is entirely in the virtual deadline mechanism. The one ++tunable in MuQSS is the rr_interval, or "round robin interval". This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in niffies by this equation: ++ ++ niffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. There are three components to ++how a task is next chosen. 
First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (niffies) is ++constantly moving. ++ ++Task lookup: ++ ++As tasks are already pre-ordered according to anticipated scheduling order in ++the skip lists, lookup for the next suitable task per-runqueue is always a ++matter of simply selecting the first task in the 0th level skip list entry. ++In order to maintain optimal latency and fairness across CPUs, MuQSS does a ++novel examination of every other runqueue in cache locality order, choosing the ++best task across all runqueues. This provides near-determinism of how long any ++task across the entire system may wait before receiving CPU time. The other ++runqueues are first examined locklessly and then trylocked to minimise the ++potential lock contention if they are likely to have a suitable better task. ++Each other runqueue lock is only held for as long as it takes to examine the ++entry for suitability. In "interactive" mode, the default setting, MuQSS will ++look for the best deadline task across all CPUs, while in !interactive mode, ++it will only select a better deadline task from another CPU if it is more ++heavily laden than the current one. ++ ++Lookup is therefore O(k) where k is the number of CPUs. ++ ++ ++Latency. ++ ++Through the use of virtual deadlines to govern the scheduling order of normal ++tasks, queue-to-activation latency per runqueue is guaranteed to be bound by ++the rr_interval tunable which is set to 6ms by default. This means that the ++longest a CPU bound task will wait for more CPU is proportional to the number ++of running tasks and in the common case of 0-2 running tasks per CPU, will be ++under the 7ms threshold for human perception of jitter. Additionally, as newly ++woken tasks will have an early deadline from their previous runtime, the very ++tasks that are usually latency sensitive will have the shortest interval for ++activation, usually preempting any existing CPU bound tasks. ++ ++Tickless expiry: ++ ++A feature of MuQSS is that it is not tied to the resolution of the chosen tick ++rate in Hz, instead depending entirely on the high resolution timers where ++possible for sub-millisecond accuracy on timeouts regardless of the underlying ++tick rate. This allows MuQSS to be run with the low overhead of low Hz rates ++such as 100 by default, benefiting from the improved throughput and lower ++power usage it provides.
Another advantage of this approach is that in ++combination with the Full No HZ option, which disables ticks on running task ++CPUs instead of just idle CPUs, the tick can be disabled at all times ++regardless of how many tasks are running instead of being limited to just one ++running task. Note that this option is NOT recommended for regular desktop ++users. ++ ++ ++Scalability and balancing. ++ ++Unlike traditional approaches where balancing is a combination of CPU selection ++at task wakeup and intermittent balancing based on a vast array of rules set ++according to architecture, busyness calculations and special case management, ++MuQSS indirectly balances on the fly at task wakeup and next task selection. ++During initialisation, MuQSS creates a cache coherency ordered list of CPUs for ++each logical CPU and uses this to aid task/CPU selection when CPUs are busy. ++Additionally it selects any idle CPUs, if they are available, at any time over ++busy CPUs according to the following preference: ++ ++ * Same thread, idle or busy cache, idle or busy threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads. ++ * Other core, same cache, busy threads. ++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. ++ ++Mux is therefore SMT, MC and NUMA aware without the need for extra ++intermittent balancing to maintain CPUs busy and make the most of cache ++coherency. ++ ++ ++Features ++ ++As the initial prime target audience for MuQSS was the average desktop user, it ++was designed to not need tweaking, tuning or have features set to obtain benefit ++from it. Thus the number of knobs and features has been kept to an absolute ++minimum and should not require extra user input for the vast majority of cases. ++There are 3 optional tunables, and 2 extra scheduling policies. The rr_interval, ++interactive, and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO ++policies. In addition to this, MuQSS also uses sub-tick accounting. What MuQSS ++does _not_ now feature is support for CGROUPS. The average user should neither ++need to know what these are, nor should they need to be using them to have good ++desktop behaviour. However since some applications refuse to work without ++cgroups, one can enable them with MuQSS as a stub and the filesystem will be ++created which will allow the applications to work. ++ ++rr_interval: ++ ++ /proc/sys/kernel/rr_interval ++ ++The value is in milliseconds, and the default value is set to 6. Valid values ++are from 1 to 1000. Decreasing the value will decrease latencies at the cost of ++decreasing throughput, while increasing it will improve throughput, but at the ++cost of worsening latencies. It is based on the fact that humans can detect ++jitter at approximately 7ms, so aiming for much lower latencies is pointless ++under most circumstances. It is worth noting this fact when comparing the ++latency performance of MuQSS to other schedulers. Worst case latencies being ++higher than 7ms are far worse than average latencies not being in the ++microsecond range. ++ ++interactive: ++ ++ /proc/sys/kernel/interactive ++ ++The value is a simple boolean of 1 for on and 0 for off and is set to on by ++default.
Disabling this will disable the near-determinism of MuQSS when ++selecting the next task by not examining all CPUs for the earliest deadline ++task, or which CPU to wake to, instead prioritising CPU balancing for improved ++throughput. Latency will still be bound by rr_interval, but on a per-CPU basis ++instead of across the whole system. ++ ++Runqueue sharing. ++ ++By default MuQSS chooses to share runqueue resources (specifically the skip ++list and locking) between multicore siblings. It is configurable at build time ++to select between None, SMT, MC and SMP, corresponding to no sharing, sharing ++only between simultaneous multithreading siblings, multicore siblings, or ++symmetric multiprocessing physical packages. Additionally it can be set at ++boot time with the use of the rqshare parameter. The reason for configurability ++is that some architectures have CPUs with many multicore siblings (>= 16) ++where it may be detrimental to throughput to share runqueues and another ++sharing option may be desirable. Additionally, more sharing than usual can ++improve latency on a system-wide level at the expense of throughput if desired. ++ ++The options are: ++none, smt, mc, smp ++ ++eg: ++ rqshare=mc ++ ++Isochronous scheduling: ++ ++Isochronous scheduling is a unique scheduling policy designed to provide ++near-real-time performance to unprivileged (ie non-root) users without the ++ability to starve the machine indefinitely. Isochronous tasks (which means ++"same time") are set using, for example, the schedtool application like so: ++ ++ schedtool -I -e amarok ++ ++This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works ++is that it has a priority level between true realtime tasks and SCHED_NORMAL ++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, ++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval ++rate). However if ISO tasks run for more than a tunable finite amount of time, ++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of ++time is the percentage of CPU available per CPU, configurable as a percentage in ++the following "resource handling" tunable (as opposed to a scheduler tunable): ++ ++iso_cpu: ++ ++ /proc/sys/kernel/iso_cpu ++ ++and is set to 70% by default. It is calculated over a rolling 5 second average. ++Because it is the total CPU available, it means that on a multi CPU machine, it ++is possible to have an ISO task running as realtime scheduling indefinitely on ++just one CPU, as the other CPUs will be available. Setting this to 100 is the ++equivalent of giving all users SCHED_RR access and setting it to 0 removes the ++ability to run any pseudo-realtime tasks. ++ ++A feature of MuQSS is that it detects when an application tries to obtain a ++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the ++appropriate privileges to use those policies. When it detects this, it will ++give the task SCHED_ISO policy instead. Thus it is transparent to the user. ++ ++ ++Idleprio scheduling: ++ ++Idleprio scheduling is a scheduling policy designed to give out CPU to a task ++_only_ when the CPU would be otherwise idle. The idea behind this is to allow ++ultra low priority tasks to be run in the background that have virtually no ++effect on the foreground tasks. This is ideally suited to distributed computing ++clients (like setiathome, folding, mprime etc) but can also be used to start a ++video encode or so on without any slowdown of other tasks.
To avoid this policy ++from grabbing shared resources and holding them indefinitely, if it detects a ++state where the task is waiting on I/O, the machine is about to suspend to ram ++and so on, it will transiently schedule them as SCHED_NORMAL. Once a task has ++been scheduled as IDLEPRIO, it cannot be put back to SCHED_NORMAL without ++superuser privileges since it is effectively a lower scheduling policy. Tasks ++can be set to start as SCHED_IDLEPRIO with the schedtool command like so: ++ ++schedtool -D -e ./mprime ++ ++Subtick accounting: ++ ++It is surprisingly difficult to get accurate CPU accounting, and in many cases, ++the accounting is done by simply determining what is happening at the precise ++moment a timer tick fires off. This becomes increasingly inaccurate as the timer ++tick frequency (HZ) is lowered. It is possible to create an application which ++uses almost 100% CPU, yet by being descheduled at the right time, records zero ++CPU usage. While the main problem with this is that there are possible security ++implications, it is also difficult to determine how much CPU a task really does ++use. Mux uses sub-tick accounting from the TSC clock to determine real CPU ++usage. Thus, the amount of CPU reported as being used by MuQSS will more ++accurately represent how much CPU the task itself is using (as is shown for ++example by the 'time' application), so the reported values may be quite ++different to other schedulers. When comparing throughput of MuQSS to other ++designs, it is important to compare the actual completed work in terms of total ++wall clock time taken and total work done, rather than the reported "cpu usage". ++ ++Symmetric MultiThreading (SMT) aware nice: ++ ++SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs. While the ++logical CPU count rises by adding thread units to each CPU core, allowing more ++than one task to be run simultaneously on the same core, the disadvantage of it ++is that the CPU power is shared between the tasks, not summating to the power ++of two CPUs. The practical upshot of this is that two tasks running on ++separate threads of the same core run significantly slower than if they had one ++core each to run on. While smart CPU selection allows each task to have a core ++to itself whenever available (as is done on MuQSS), it cannot offset the ++slowdown that occurs when the cores are all loaded and only a thread is left. ++Most of the time this is harmless as the CPU is effectively overloaded at this ++point and the extra thread is of benefit. However when running a niced task in ++the presence of an un-niced task (say nice 19 v nice 0), the nice task gets ++precisely the same amount of CPU power as the unniced one. MuQSS has an ++optional configuration feature known as SMT-NICE which selectively idles the ++secondary niced thread for a period proportional to the nice difference, ++allowing CPU distribution according to nice level to be maintained, at the ++expense of a small amount of extra overhead. If this is configured in on a ++machine without SMT threads, the overhead is minimal. 
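++
++To make the virtual deadline arithmetic above concrete, the following C
++sketch is illustrative only: it is not code from the scheduler itself, and
++the names prio_ratio and virtual_deadline are hypothetical helpers. It shows
++how a nice level translates into the deadline described in the "Virtual
++deadline" section:
++
++	/* Illustrative sketch only -- not scheduler source. */
++	typedef unsigned long long u64;
++
++	/* Baseline ratio of 100 at nice -20, increasing 10% per nice level. */
++	static u64 prio_ratio(int nice)
++	{
++		u64 ratio = 100;
++		int i;
++
++		for (i = -20; i < nice; i++)
++			ratio = ratio * 11 / 10;
++		return ratio;
++	}
++
++	/* deadline = niffies + prio_ratio * rr_interval, as described above. */
++	static u64 virtual_deadline(u64 niffies, u64 rr_interval_ns, int nice)
++	{
++		return niffies + prio_ratio(nice) * rr_interval_ns / 100;
++	}
++
++A nice 0 task thus receives a deadline several times further in the future
++than a nice -20 task, which is what produces the approximately
++(prio_ratio difference)^2 CPU distribution between nice levels noted earlier.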
++ ++ ++Con Kolivas Sat, 29th October 2016 +diff -Nur a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt +--- a/Documentation/sysctl/kernel.txt 2019-02-09 17:20:30.451820228 +0000 ++++ b/Documentation/sysctl/kernel.txt 2019-02-09 17:46:11.991297545 +0000 +@@ -41,6 +41,7 @@ + - hung_task_check_interval_secs + - hung_task_warnings + - hyperv_record_panic_msg ++- iso_cpu + - kexec_load_disabled + - kptr_restrict + - l2cr [ PPC only ] +@@ -76,6 +77,7 @@ + - randomize_va_space + - real-root-dev ==> Documentation/admin-guide/initrd.rst + - reboot-cmd [ SPARC only ] ++- rr_interval + - rtsig-max + - rtsig-nr + - seccomp/ ==> Documentation/userspace-api/seccomp_filter.rst +@@ -98,6 +100,7 @@ + - unknown_nmi_panic + - watchdog + - watchdog_thresh ++- yield_type + - version + + ============================================================== +@@ -436,6 +439,16 @@ + + ============================================================== + ++iso_cpu: (MuQSS CPU scheduler only). ++ ++This sets the percentage cpu that the unprivileged SCHED_ISO tasks can ++run effectively at realtime priority, averaged over a rolling five ++seconds over the -whole- system, meaning all cpus. ++ ++Set to 70 (percent) by default. ++ ++============================================================== ++ + l2cr: (PPC only) + + This flag controls the L2 cache of G3 processor boards. If +@@ -863,6 +876,20 @@ + + ============================================================== + ++rr_interval: (MuQSS CPU scheduler only) ++ ++This is the smallest duration that any cpu process scheduling unit ++will run for. Increasing this value can increase throughput of cpu ++bound tasks substantially but at the expense of increased latencies ++overall. Conversely decreasing it will decrease average and maximum ++latencies but at the expense of throughput. This value is in ++milliseconds and the default value chosen depends on the number of ++cpus available at scheduler initialisation with a minimum of 6. ++ ++Valid values are from 1-1000. ++ ++============================================================== ++ + rtsig-max & rtsig-nr: + + The file rtsig-max can be used to tune the maximum number +@@ -1123,3 +1150,13 @@ + tunable to zero will disable lockup detection altogether. + + ============================================================== ++ ++yield_type: (MuQSS CPU scheduler only) ++ ++This determines what type of yield calls to sched_yield will perform. ++ ++ 0: No yield. ++ 1: Yield only to better priority/deadline tasks. (default) ++ 2: Expire timeslice and recalculate deadline. 
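++
++As a usage example (the value shown is arbitrary), this tunable can be read
++and set through procfs like the other MuQSS tunables documented above:
++
++	cat /proc/sys/kernel/yield_type
++	echo 1 > /proc/sys/kernel/yield_type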
++ ++============================================================== +diff -Nur a/fs/proc/base.c b/fs/proc/base.c +--- a/fs/proc/base.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/fs/proc/base.c 2019-02-09 17:46:11.991297545 +0000 +@@ -459,7 +459,7 @@ + seq_printf(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff -Nur a/include/linux/init_task.h b/include/linux/init_task.h +--- a/include/linux/init_task.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/linux/init_task.h 2019-02-09 17:46:11.991297545 +0000 +@@ -46,7 +46,11 @@ + #define INIT_CPU_TIMERS(s) + #endif + ++#ifdef CONFIG_SCHED_MUQSS ++#define INIT_TASK_COMM "MuQSS" ++#else + #define INIT_TASK_COMM "swapper" ++#endif + + /* Attach to the init_task data structure for proper alignment */ + #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK +diff -Nur a/include/linux/ioprio.h b/include/linux/ioprio.h +--- a/include/linux/ioprio.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/linux/ioprio.h 2019-02-09 17:46:11.991297545 +0000 +@@ -53,6 +53,8 @@ + */ + static inline int task_nice_ioprio(struct task_struct *task) + { ++ if (iso_task(task)) ++ return 0; + return (task_nice(task) + 20) / 5; + } + +diff -Nur a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h +--- a/include/linux/sched/nohz.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/linux/sched/nohz.h 2019-02-09 17:46:11.991297545 +0000 +@@ -6,7 +6,7 @@ + * This is the interface between the scheduler and nohz/dynticks: + */ + +-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) ++#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS) + extern void cpu_load_update_nohz_start(void); + extern void cpu_load_update_nohz_stop(void); + #else +@@ -21,7 +21,7 @@ + static inline void nohz_balance_enter_idle(int cpu) { } + #endif + +-#ifdef CONFIG_NO_HZ_COMMON ++#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS) + void calc_load_nohz_start(void); + void calc_load_nohz_stop(void); + #else +diff -Nur a/include/linux/sched/prio.h b/include/linux/sched/prio.h +--- a/include/linux/sched/prio.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/linux/sched/prio.h 2019-02-09 17:46:11.991297545 +0000 +@@ -20,8 +20,20 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ ++#ifdef CONFIG_SCHED_MUQSS ++/* Note different MAX_RT_PRIO */ ++#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1) ++ ++#define ISO_PRIO (MAX_RT_PRIO) ++#define NORMAL_PRIO (MAX_RT_PRIO + 1) ++#define IDLE_PRIO (MAX_RT_PRIO + 2) ++#define PRIO_LIMIT ((IDLE_PRIO) + 1) ++#else /* CONFIG_SCHED_MUQSS */ + #define MAX_RT_PRIO MAX_USER_RT_PRIO + ++#endif /* CONFIG_SCHED_MUQSS */ ++ + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) + +diff -Nur a/include/linux/sched/rt.h b/include/linux/sched/rt.h +--- a/include/linux/sched/rt.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/linux/sched/rt.h 2019-02-09 17:46:11.991297545 +0000 +@@ -24,8 +24,10 @@ + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_MUQSS + if (policy == SCHED_DEADLINE) + return true; ++#endif + return false; + } + +diff -Nur a/include/linux/sched/task.h b/include/linux/sched/task.h +--- a/include/linux/sched/task.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/linux/sched/task.h 2019-02-09 17:46:11.991297545 +0000 +@@ -80,7 +80,7 @@ + extern void 
free_task(struct task_struct *tsk); + + /* sched_exec is called by processes performing an exec */ +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_MUQSS) + extern void sched_exec(void); + #else + #define sched_exec() {} +diff -Nur a/include/linux/sched.h b/include/linux/sched.h +--- a/include/linux/sched.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/linux/sched.h 2019-02-09 17:46:11.991297545 +0000 +@@ -28,6 +28,9 @@ + #include + #include + #include ++#ifdef CONFIG_SCHED_MUQSS ++#include ++#endif + + /* task_struct member predeclarations (sorted alphabetically): */ + struct audit_context; +@@ -613,9 +616,11 @@ + unsigned int flags; + unsigned int ptrace; + ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_MUQSS) ++ int on_cpu; ++#endif + #ifdef CONFIG_SMP + struct llist_node wake_entry; +- int on_cpu; + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; +@@ -640,10 +645,25 @@ + int static_prio; + int normal_prio; + unsigned int rt_priority; ++#ifdef CONFIG_SCHED_MUQSS ++ int time_slice; ++ u64 deadline; ++ skiplist_node node; /* Skip list node */ ++ u64 last_ran; ++ u64 sched_time; /* sched_clock time spent running */ ++#ifdef CONFIG_SMT_NICE ++ int smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++#ifdef CONFIG_HOTPLUG_CPU ++ bool zerobound; /* Bound to CPU0 for hotplug */ ++#endif ++ unsigned long rt_timeout; ++#else /* CONFIG_SCHED_MUQSS */ + + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +@@ -798,6 +818,10 @@ + u64 utimescaled; + u64 stimescaled; + #endif ++#ifdef CONFIG_SCHED_MUQSS ++ /* Unbanked cpu time */ ++ unsigned long utime_ns, stime_ns; ++#endif + u64 gtime; + struct prev_cputime prev_cputime; + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +@@ -1210,6 +1234,40 @@ + */ + }; + ++#ifdef CONFIG_SCHED_MUQSS ++#define tsk_seruntime(t) ((t)->sched_time) ++#define tsk_rttimeout(t) ((t)->rt_timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++} ++ ++void print_scheduler_version(void); ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return (p->policy == SCHED_ISO); ++} ++#else /* CFS */ ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++ p->nr_cpus_allowed = current->nr_cpus_allowed; ++} ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "CFS CPU scheduler.\n"); ++} ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return false; ++} ++#endif /* CONFIG_SCHED_MUQSS */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff -Nur a/include/linux/skip_list.h b/include/linux/skip_list.h +--- a/include/linux/skip_list.h 1970-01-01 01:00:00.000000000 +0100 ++++ b/include/linux/skip_list.h 2019-02-09 17:46:11.991297545 +0000 +@@ -0,0 +1,33 @@ ++#ifndef _LINUX_SKIP_LISTS_H ++#define _LINUX_SKIP_LISTS_H ++typedef u64 keyType; ++typedef void *valueType; ++ ++typedef struct nodeStructure skiplist_node; ++ ++struct nodeStructure { ++ int level; /* Levels in this structure */ ++ keyType key; ++ valueType value; ++ skiplist_node *next[8]; ++ skiplist_node *prev[8]; ++}; ++ ++typedef struct listStructure { ++ int entries; ++ int level; /* Maximum level of the list ++ (1 more than the number of levels in the list) */ ++ skiplist_node *header; /* pointer to header 
*/
++} skiplist;
++
++void skiplist_init(skiplist_node *slnode);
++skiplist *new_skiplist(skiplist_node *slnode);
++void free_skiplist(skiplist *l);
++void skiplist_node_init(skiplist_node *node);
++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed);
++void skiplist_delete(skiplist *l, skiplist_node *node);
++
++static inline bool skiplist_node_empty(skiplist_node *node) {
++ return (!node->next[0]);
++}
++#endif /* _LINUX_SKIP_LISTS_H */
+diff -Nur a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
+--- a/include/uapi/linux/sched.h 2019-02-06 16:30:16.000000000 +0000
++++ b/include/uapi/linux/sched.h 2019-02-09 17:46:11.991297545 +0000
+@@ -37,9 +37,16 @@
+ #define SCHED_FIFO 1
+ #define SCHED_RR 2
+ #define SCHED_BATCH 3
+-/* SCHED_ISO: reserved but not implemented yet */
++/* SCHED_ISO: Implemented on MuQSS only */
+ #define SCHED_IDLE 5
++#ifdef CONFIG_SCHED_MUQSS
++#define SCHED_ISO 4
++#define SCHED_IDLEPRIO SCHED_IDLE
++#define SCHED_MAX (SCHED_IDLEPRIO)
++#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX)
++#else /* CONFIG_SCHED_MUQSS */
+ #define SCHED_DEADLINE 6
++#endif /* CONFIG_SCHED_MUQSS */
+
+ /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
+ #define SCHED_RESET_ON_FORK 0x40000000
+diff -Nur a/init/init_task.c b/init/init_task.c
+--- a/init/init_task.c 2019-02-06 16:30:16.000000000 +0000
++++ b/init/init_task.c 2019-02-09 17:46:11.991297545 +0000
+@@ -67,9 +67,17 @@
+ .stack = init_stack,
+ .usage = ATOMIC_INIT(2),
+ .flags = PF_KTHREAD,
++#ifdef CONFIG_SCHED_MUQSS
++ .prio = NORMAL_PRIO,
++ .static_prio = MAX_PRIO-20,
++ .normal_prio = NORMAL_PRIO,
++ .deadline = 0,
++ .time_slice = 1000000,
++#else
+ .prio = MAX_PRIO - 20,
+ .static_prio = MAX_PRIO - 20,
+ .normal_prio = MAX_PRIO - 20,
++#endif
+ .policy = SCHED_NORMAL,
+ .cpus_allowed = CPU_MASK_ALL,
+ .nr_cpus_allowed= NR_CPUS,
+@@ -78,6 +86,7 @@
+ .restart_block = {
+ .fn = do_no_restart_syscall,
+ },
++#ifndef CONFIG_SCHED_MUQSS
+ .se = {
+ .group_node = LIST_HEAD_INIT(init_task.se.group_node),
+ },
+@@ -85,6 +94,7 @@
+ .run_list = LIST_HEAD_INIT(init_task.rt.run_list),
+ .time_slice = RR_TIMESLICE,
+ },
++#endif
+ .tasks = LIST_HEAD_INIT(init_task.tasks),
+ #ifdef CONFIG_SMP
+ .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO),
+diff -Nur a/init/Kconfig b/init/Kconfig
+--- a/init/Kconfig 2019-02-09 17:20:30.481821193 +0000
++++ b/init/Kconfig 2019-02-09 17:46:11.991297545 +0000
+@@ -45,6 +45,18 @@
+
+ menu "General setup"
+
++config SCHED_MUQSS
++ bool "MuQSS cpu scheduler"
++ select HIGH_RES_TIMERS
++ default y
++ ---help---
++ The Multiple Queue Skiplist Scheduler for excellent interactivity and
++ responsiveness on the desktop and highly scalable deterministic
++ low latency on any hardware.
++
++ Say Y here.
++
++
+ config BROKEN
+ bool
+
+@@ -653,6 +665,7 @@
+ depends on ARCH_SUPPORTS_NUMA_BALANCING
+ depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY
+ depends on SMP && NUMA && MIGRATION
++ depends on !SCHED_MUQSS
+ help
+ This option adds support for automatic NUMA aware memory/task placement.
+ The mechanism is quite primitive and is based on migrating memory when
+@@ -760,9 +773,13 @@
+ help
+ This feature lets CPU scheduler recognize task groups and control CPU
+ bandwidth allocation to such task groups. It uses cgroups to group
+- tasks.
In combination with MuQSS this is purely a STUB to create the ++ files associated with the CPU controller cgroup but most of the ++ controls do nothing. This is useful for working in environments and ++ with applications that will only work if this control group is ++ present. + +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_MUQSS + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -869,6 +886,7 @@ + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" ++ depends on !SCHED_MUQSS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. +@@ -987,6 +1005,7 @@ + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_MUQSS + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff -Nur a/init/main.c b/init/main.c +--- a/init/main.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/init/main.c 2019-02-09 17:46:11.991297545 +0000 +@@ -1079,6 +1079,8 @@ + + rcu_end_inkernel_boot(); + ++ print_scheduler_version(); ++ + if (ramdisk_execute_command) { + ret = run_init_process(ramdisk_execute_command); + if (!ret) +diff -Nur a/kernel/delayacct.c b/kernel/delayacct.c +--- a/kernel/delayacct.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/delayacct.c 2019-02-09 17:46:11.991297545 +0000 +@@ -115,7 +115,7 @@ + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff -Nur a/kernel/exit.c b/kernel/exit.c +--- a/kernel/exit.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/exit.c 2019-02-09 17:46:11.991297545 +0000 +@@ -130,7 +130,7 @@ + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -151,7 +151,7 @@ + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff -Nur a/kernel/kthread.c b/kernel/kthread.c +--- a/kernel/kthread.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/kthread.c 2019-02-09 17:46:11.991297545 +0000 +@@ -424,6 +424,34 @@ + } + EXPORT_SYMBOL(kthread_bind); + ++#if defined(CONFIG_SCHED_MUQSS) && defined(CONFIG_SMP) ++extern void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); ++ ++/* ++ * new_kthread_bind is a special variant of __kthread_bind_mask. ++ * For new threads to work on muqss we want to call do_set_cpus_allowed ++ * without the task_cpu being set and the task rescheduled until they're ++ * rescheduled on their own so we call __do_set_cpus_allowed directly which ++ * only changes the cpumask. This is particularly important for smpboot threads ++ * to work. ++ */ ++static void new_kthread_bind(struct task_struct *p, unsigned int cpu) ++{ ++ unsigned long flags; ++ ++ if (WARN_ON(!wait_task_inactive(p, TASK_UNINTERRUPTIBLE))) ++ return; ++ ++ /* It's safe because the task is inactive. 
*/
++ raw_spin_lock_irqsave(&p->pi_lock, flags);
++ __do_set_cpus_allowed(p, cpumask_of(cpu));
++ p->flags |= PF_NO_SETAFFINITY;
++ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
++}
++#else
++#define new_kthread_bind(p, cpu) kthread_bind(p, cpu)
++#endif
++
+ /**
+ * kthread_create_on_cpu - Create a cpu bound kthread
+ * @threadfn: the function to run until signal_pending(current).
+@@ -445,7 +473,7 @@
+ cpu);
+ if (IS_ERR(p))
+ return p;
+- kthread_bind(p, cpu);
++ new_kthread_bind(p, cpu);
+ /* CPU hotplug need to bind once again when unparking the thread. */
+ set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
+ to_kthread(p)->cpu = cpu;
+diff -Nur a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
+--- a/kernel/livepatch/transition.c 2019-02-06 16:30:16.000000000 +0000
++++ b/kernel/livepatch/transition.c 2019-02-09 17:46:11.991297545 +0000
+@@ -290,6 +290,12 @@
+ return 0;
+ }
+
++#ifdef CONFIG_SCHED_MUQSS
++typedef unsigned long rq_flags_t;
++#else
++typedef struct rq_flags rq_flags_t;
++#endif
++
+ /*
+ * Try to safely switch a task to the target patch state. If it's currently
+ * running, or it's sleeping on a to-be-patched or to-be-unpatched function, or
+@@ -298,7 +304,7 @@
+ static bool klp_try_switch_task(struct task_struct *task)
+ {
+ struct rq *rq;
+- struct rq_flags flags;
++ rq_flags_t flags;
+ int ret;
+ bool success = false;
+ char err_buf[STACK_ERR_BUF_SIZE];
+diff -Nur a/kernel/Makefile b/kernel/Makefile
+--- a/kernel/Makefile 2019-02-06 16:30:16.000000000 +0000
++++ b/kernel/Makefile 2019-02-09 17:46:11.991297545 +0000
+@@ -10,7 +10,7 @@
+ extable.o params.o \
+ kthread.o sys_ni.o nsproxy.o \
+ notifier.o ksysfs.o cred.o reboot.o \
+- async.o range.o smpboot.o ucount.o
++ async.o range.o smpboot.o ucount.o skip_list.o
+
+ obj-$(CONFIG_MODULES) += kmod.o
+ obj-$(CONFIG_MULTIUSER) += groups.o
+diff -Nur a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
+--- a/kernel/rcu/Kconfig 2019-02-06 16:30:16.000000000 +0000
++++ b/kernel/rcu/Kconfig 2019-02-09 17:46:11.991297545 +0000
+@@ -93,7 +93,7 @@
+ config CONTEXT_TRACKING_FORCE
+ bool "Force context tracking"
+ depends on CONTEXT_TRACKING
+- default y if !NO_HZ_FULL
++ default y if !NO_HZ_FULL && !SCHED_MUQSS
+ help
+ The major pre-requirement for full dynticks to work is to
+ support the context tracking subsystem. But there are also
+diff -Nur a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
+--- a/kernel/sched/cpufreq_schedutil.c 2019-02-06 16:30:16.000000000 +0000
++++ b/kernel/sched/cpufreq_schedutil.c 2019-02-09 17:46:12.001297867 +0000
+@@ -177,6 +177,12 @@
+ return cpufreq_driver_resolve_freq(policy, freq);
+ }
+
++#ifdef CONFIG_SCHED_MUQSS
++#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(rq_rt)
++#else
++#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(&(rq_rt)->rt)
++#endif
++
+ /*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
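To make the linear relation concrete, here is a toy model of the mapping with invented numbers; the governor's real get_next_freq() additionally applies ~25% headroom (freq + (freq >> 2)) before resolving to a frequency the hardware supports.

#include <stdio.h>

static unsigned long next_freq(unsigned long util, unsigned long max,
                               unsigned long max_freq)
{
    unsigned long freq = max_freq + (max_freq >> 2); /* ~1.25 headroom */

    return freq * util / max; /* f = u * f_max, scaled */
}

int main(void)
{
    /* half utilisation on a 2.4 GHz cpu -> ~1.5 GHz with headroom */
    printf("%lu kHz\n", next_freq(512, 1024, 2400000));
    return 0;
}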
+@@ -205,7 +211,7 @@ + sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); + sg_cpu->bw_dl = cpu_bw_dl(rq); + +- if (rt_rq_is_runnable(&rq->rt)) ++ if (rt_rq_runnable(rq)) + return max; + + /* +@@ -626,7 +632,11 @@ + struct task_struct *thread; + struct sched_attr attr = { + .size = sizeof(struct sched_attr), ++#ifdef CONFIG_SCHED_MUQSS ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, ++#endif + .sched_flags = SCHED_FLAG_SUGOV, + .sched_nice = 0, + .sched_priority = 0, +diff -Nur a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h +--- a/kernel/sched/cpupri.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/sched/cpupri.h 2019-02-09 17:46:12.001297867 +0000 +@@ -17,9 +17,11 @@ + int *cpu_to_pri; + }; + ++#ifndef CONFIG_SCHED_MUQSS + #ifdef CONFIG_SMP + int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); + void cpupri_set(struct cpupri *cp, int cpu, int pri); + int cpupri_init(struct cpupri *cp); + void cpupri_cleanup(struct cpupri *cp); + #endif ++#endif +diff -Nur a/kernel/sched/cputime.c b/kernel/sched/cputime.c +--- a/kernel/sched/cputime.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/sched/cputime.c 2019-02-09 17:46:12.001297867 +0000 +@@ -265,26 +265,6 @@ + return accounted; + } + +-#ifdef CONFIG_64BIT +-static inline u64 read_sum_exec_runtime(struct task_struct *t) +-{ +- return t->se.sum_exec_runtime; +-} +-#else +-static u64 read_sum_exec_runtime(struct task_struct *t) +-{ +- u64 ns; +- struct rq_flags rf; +- struct rq *rq; +- +- rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; +- task_rq_unlock(rq, t, &rf); +- +- return ns; +-} +-#endif +- + /* + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live + * tasks (sum on group iteration) belonging to @tsk's group. +@@ -662,7 +642,7 @@ + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff -Nur a/kernel/sched/idle.c b/kernel/sched/idle.c +--- a/kernel/sched/idle.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/sched/idle.c 2019-02-09 17:46:12.001297867 +0000 +@@ -224,6 +224,8 @@ + static void do_idle(void) + { + int cpu = smp_processor_id(); ++ bool pending = false; ++ + /* + * If the arch has a polling bit, we maintain an invariant: + * +@@ -234,7 +236,10 @@ + */ + + __current_set_polling(); +- tick_nohz_idle_enter(); ++ if (unlikely(softirq_pending(cpu))) ++ pending = true; ++ else ++ tick_nohz_idle_enter(); + + while (!need_resched()) { + check_pgt_cache(); +@@ -272,7 +277,8 @@ + * an IPI to fold the state for us. + */ + preempt_set_need_resched(); +- tick_nohz_idle_exit(); ++ if (!pending) ++ tick_nohz_idle_exit(); + __current_clr_polling(); + + /* +@@ -368,6 +374,7 @@ + do_idle(); + } + ++#ifndef CONFIG_SCHED_MUQSS + /* + * idle-task scheduling class. 
+ */ +@@ -480,3 +487,4 @@ + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif /* CONFIG_SCHED_MUQSS */ +diff -Nur a/kernel/sched/Makefile b/kernel/sched/Makefile +--- a/kernel/sched/Makefile 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/sched/Makefile 2019-02-09 17:46:11.991297545 +0000 +@@ -16,6 +16,17 @@ + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + ++ifdef CONFIG_SCHED_MUQSS ++obj-y += MuQSS.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++ ++obj-$(CONFIG_SMP) += topology.o ++obj-$(CONFIG_SCHEDSTATS) += stats.o ++obj-$(CONFIG_CPU_FREQ) += cpufreq.o ++obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o ++obj-$(CONFIG_CPU_ISOLATION) += isolation.o ++else + obj-y += core.o loadavg.o clock.o cputime.o + obj-y += idle.o fair.o rt.o deadline.o + obj-y += wait.o wait_bit.o swait.o completion.o +@@ -29,3 +40,4 @@ + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o + obj-$(CONFIG_CPU_ISOLATION) += isolation.o ++endif +diff -Nur a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +--- a/kernel/sched/MuQSS.c 1970-01-01 01:00:00.000000000 +0100 ++++ b/kernel/sched/MuQSS.c 2019-02-09 17:46:12.001297867 +0000 +@@ -0,0 +1,7366 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * kernel/sched/MuQSS.c, was kernel/sched.c ++ * ++ * Kernel scheduler and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and ++ * make semaphores SMP safe ++ * 1998-11-19 Implemented schedule_timeout() and related stuff ++ * by Andrea Arcangeli ++ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: ++ * hybrid priority-list and round-robin design with ++ * an array-switch method of distributing timeslices ++ * and per-CPU runqueues. Cleanups and useful suggestions ++ * by Davide Libenzi, preemptible kernel bits by Robert Love. ++ * 2003-09-03 Interactivity tuning by Con Kolivas. ++ * 2004-04-02 Scheduler domains code by Nick Piggin ++ * 2007-04-15 Work begun on replacing all interactivity tuning with a ++ * fair scheduling design by Con Kolivas. ++ * 2007-05-05 Load balancing (smp-nice) and other improvements ++ * by Peter Williams ++ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith ++ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri ++ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, ++ * Thomas Gleixner, Mike Kravetz ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS ++ * scheduler by Con Kolivas. 
++ */ ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include "../workqueue_internal.h" ++#include "../smpboot.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++#include "MuQSS.h" ++ ++#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) ++#define rt_task(p) rt_prio((p)->prio) ++#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) ++#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR) ++#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) ++ ++#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) ++#define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) ++#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) ++ ++#define is_iso_policy(policy) ((policy) == SCHED_ISO) ++#define iso_task(p) unlikely(is_iso_policy((p)->policy)) ++#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) ++ ++#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) ++ ++#define ISO_PERIOD (5 * HZ) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. ++ */ ++#define APPROX_NS_PS (1073741824) /* Approximate ns per second */ ++#define JIFFIES_TO_NS(TIME) ((TIME) * (APPROX_NS_PS / HZ)) ++#define JIFFY_NS (APPROX_NS_PS / HZ) ++#define JIFFY_US (1048576 / HZ) ++#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) ++#define HALF_JIFFY_NS (APPROX_NS_PS / HZ / 2) ++#define HALF_JIFFY_US (1048576 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++#define US_TO_NS(TIME) ((TIME) << 10) ++#define TICK_APPROX_NS ((APPROX_NS_PS+HZ/2)/HZ) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many μs left */ ++ ++void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "MuQSS CPU scheduler v0.180 by Con Kolivas.\n"); ++} ++ ++#define RQSHARE_NONE 0 ++#define RQSHARE_SMT 1 ++#define RQSHARE_MC 2 ++#define RQSHARE_SMP 3 ++ ++/* ++ * This determines what level of runqueue sharing will be done and is ++ * configurable at boot time with the bootparam rqshare = ++ */ ++static int rqshare __read_mostly = CONFIG_SHARERQ; /* Default RQSHARE_MC */ ++ ++static int __init set_rqshare(char *str) ++{ ++ if (!strncmp(str, "none", 4)) { ++ rqshare = RQSHARE_NONE; ++ return 0; ++ } ++ if (!strncmp(str, "smt", 3)) { ++ rqshare = RQSHARE_SMT; ++ return 0; ++ } ++ if (!strncmp(str, "mc", 2)) { ++ rqshare = RQSHARE_MC; ++ return 0; ++ } ++ if (!strncmp(str, "smp", 2)) { ++ rqshare = RQSHARE_SMP; ++ return 0; ++ } ++ return 1; ++} ++__setup("rqshare=", set_rqshare); ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 6ms. ++ * Tunable via /proc interface. ++ */ ++int rr_interval __read_mostly = 6; ++ ++/* ++ * Tunable to choose whether to prioritise latency or throughput, simple ++ * binary yes or no ++ */ ++int sched_interactive __read_mostly = 1; ++ ++/* ++ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks ++ * are allowed to run five seconds as real time tasks. This is the total over ++ * all online cpus. 
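Back-of-envelope arithmetic for this tunable, as a standalone sketch: the numbers below only illustrate what a 70% budget over ISO_PERIOD means across 4 cpus. The patch's real bookkeeping is a decaying per-runqueue tick count that flips iso_refractory, not a windowed counter like this.

#include <stdbool.h>
#include <stdio.h>

#define HZ 100
#define ISO_PERIOD (5 * HZ)

/* May ISO tasks keep running, given ticks they consumed this period? */
static bool iso_budget_left(unsigned long iso_ticks, int iso_cpu, int ncpus)
{
    return iso_ticks < (unsigned long)ISO_PERIOD * iso_cpu * ncpus / 100;
}

int main(void)
{
    /* 4 cpus at HZ=100: 70% of 2000 ticks -> 1400 may be SCHED_ISO */
    printf("%d\n", iso_budget_left(1399, 70, 4)); /* 1: allowed */
    printf("%d\n", iso_budget_left(1400, 70, 4)); /* 0: refractory */
    return 0;
}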
++ */ ++int sched_iso_cpu __read_mostly = 70; ++ ++/* ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Yield only to better priority/deadline tasks. (default) ++ * 2: Expire timeslice and recalculate deadline. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++/* ++ * The relative length of deadline for each priority(nice) level. ++ */ ++static int prio_ratios[NICE_WIDTH] __read_mostly; ++ ++ ++/* ++ * The quota handed out to tasks of all priority levels when refilling their ++ * time_slice. ++ */ ++static inline int timeslice(void) ++{ ++ return MS_TO_US(rr_interval); ++} ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifdef CONFIG_SMP ++/* ++ * Total number of runqueues. Equals number of CPUs when there is no runqueue ++ * sharing but is usually less with SMT/MC sharing of runqueues. ++ */ ++static int total_runqueues __read_mostly = 1; ++ ++static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp; ++ ++struct rq *cpu_rq(int cpu) ++{ ++ return &per_cpu(runqueues, (cpu)); ++} ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++ ++/* ++ * For asym packing, by default the lower numbered cpu has higher priority. ++ */ ++int __weak arch_asym_cpu_priority(int cpu) ++{ ++ return -cpu; ++} ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++#else ++struct rq *uprq; ++#endif /* CONFIG_SMP */ ++ ++#include "stats.h" ++ ++/* ++ * All common locking functions performed on rq->lock. rq->clock is local to ++ * the CPU accessing it so it can be modified just with interrupts disabled ++ * when we're not updating niffies. ++ * Looking up task_rq must be done under rq->lock to be safe. ++ */ ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++static void update_irq_load_avg(struct rq *rq, long delta); ++#else ++static inline void update_irq_load_avg(struct rq *rq, long delta) {} ++#endif ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) ++ s64 steal = 0, irq_delta = 0; ++#endif ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. 
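The clamping just described, reduced to a standalone function with invented numbers: irq time is subtracted from the task-clock delta but never past zero, and whatever did not fit is carried in prev_irq_time for the next update, which is what keeps clock_task monotonic.

#include <stdio.h>

typedef long long s64;

static s64 clock_task_delta(s64 delta, s64 *prev_irq_time, s64 irq_time)
{
    s64 irq_delta = irq_time - *prev_irq_time;

    if (irq_delta > delta)        /* only account what fits ... */
        irq_delta = delta;
    *prev_irq_time += irq_delta;  /* ... and carry the rest forward */
    return delta - irq_delta;
}

int main(void)
{
    s64 prev = 0;

    printf("%lld\n", clock_task_delta(1000, &prev, 300));  /* 700 */
    printf("%lld\n", clock_task_delta(1000, &prev, 1600)); /* 0, 300 carried */
    return 0;
}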
*/
++ if (irq_delta > delta)
++ irq_delta = delta;
++
++ rq->prev_irq_time += irq_delta;
++ delta -= irq_delta;
++#endif
++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
++ if (static_key_false((&paravirt_steal_rq_enabled))) {
++ steal = paravirt_steal_clock(cpu_of(rq));
++ steal -= rq->prev_steal_time_rq;
++
++ if (unlikely(steal > delta))
++ steal = delta;
++
++ rq->prev_steal_time_rq += steal;
++ delta -= steal;
++ }
++#endif
++ rq->clock_task += delta;
++
++#ifdef HAVE_SCHED_AVG_IRQ
++ if (irq_delta + steal)
++ update_irq_load_avg(rq, irq_delta + steal);
++#endif
++}
++
++static inline void update_rq_clock(struct rq *rq)
++{
++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
++
++ if (unlikely(delta < 0))
++ return;
++ rq->clock += delta;
++ update_rq_clock_task(rq, delta);
++}
++
++/*
++ * Niffies are a globally increasing nanosecond counter. They're only used by
++ * update_load_avg and time_slice_expired, however deadlines are based on them
++ * across CPUs. Update them whenever we will call one of those functions, and
++ * synchronise them across CPUs whenever we hold both runqueue locks.
++ */
++static inline void update_clocks(struct rq *rq)
++{
++ s64 ndiff, minndiff;
++ long jdiff;
++
++ update_rq_clock(rq);
++ ndiff = rq->clock - rq->old_clock;
++ rq->old_clock = rq->clock;
++ jdiff = jiffies - rq->last_jiffy;
++
++ /* Subtract any niffies added by balancing with other rqs */
++ ndiff -= rq->niffies - rq->last_niffy;
++ minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies;
++ if (minndiff < 0)
++ minndiff = 0;
++ ndiff = max(ndiff, minndiff);
++ rq->niffies += ndiff;
++ rq->last_niffy = rq->niffies;
++ if (jdiff) {
++ rq->last_jiffy += jdiff;
++ rq->last_jiffy_niffies = rq->niffies;
++ }
++}
++
++static inline int task_on_rq_queued(struct task_struct *p)
++{
++ return p->on_rq == TASK_ON_RQ_QUEUED;
++}
++
++static inline int task_on_rq_migrating(struct task_struct *p)
++{
++ return p->on_rq == TASK_ON_RQ_MIGRATING;
++}
++
++/*
++ * Any time we have two runqueues locked we use that as an opportunity to
++ * synchronise niffies to the highest value as idle ticks may have artificially
++ * kept niffies low on one CPU and the truth can only be later.
++ */
++static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2)
++{
++ if (rq1->niffies > rq2->niffies)
++ rq2->niffies = rq1->niffies;
++ else
++ rq1->niffies = rq2->niffies;
++}
++
++/*
++ * double_rq_lock - safely lock two runqueues
++ *
++ * Note this does not disable interrupts like task_rq_lock,
++ * you need to do so manually before calling.
++ */
++
++/* For when we know rq1 != rq2 */
++static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2)
++ __acquires(rq1->lock)
++ __acquires(rq2->lock)
++{
++ if (rq1 < rq2) {
++ raw_spin_lock(rq1->lock);
++ raw_spin_lock_nested(rq2->lock, SINGLE_DEPTH_NESTING);
++ } else {
++ raw_spin_lock(rq2->lock);
++ raw_spin_lock_nested(rq1->lock, SINGLE_DEPTH_NESTING);
++ }
++}
++
++static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
++ __acquires(rq1->lock)
++ __acquires(rq2->lock)
++{
++ BUG_ON(!irqs_disabled());
++ if (rq1->lock == rq2->lock) {
++ raw_spin_lock(rq1->lock);
++ __acquire(rq2->lock); /* Fake it out ;) */
++ } else
++ __double_rq_lock(rq1, rq2);
++ synchronise_niffies(rq1, rq2);
++}
++
++/*
++ * double_rq_unlock - safely unlock two runqueues
++ *
++ * Note this does not restore interrupts like task_rq_unlock,
++ * you need to do so manually after calling.
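The address-ordering idiom used by __double_rq_lock above, shown in a self-contained userspace sketch with pthread mutexes standing in for runqueue locks: both call orders acquire in one global order, so an AB-BA deadlock cannot occur. (Relational pointer comparison across unrelated objects is technically unspecified in ISO C; this simply mirrors what the kernel does.)

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

/* Always lock the lower address first, as __double_rq_lock does */
static void double_lock(pthread_mutex_t *l1, pthread_mutex_t *l2)
{
    if (l1 == l2) {          /* shared lock: take it once */
        pthread_mutex_lock(l1);
        return;
    }
    if (l1 > l2) {
        pthread_mutex_t *t = l1;
        l1 = l2;
        l2 = t;
    }
    pthread_mutex_lock(l1);
    pthread_mutex_lock(l2);
}

int main(void)
{
    /* both argument orders acquire in the same global order */
    double_lock(&a, &b);
    pthread_mutex_unlock(&a);
    pthread_mutex_unlock(&b);
    double_lock(&b, &a);
    puts("ok");
    return 0;
}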
++ */ ++static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) ++ __releases(rq1->lock) ++ __releases(rq2->lock) ++{ ++ raw_spin_unlock(rq1->lock); ++ if (rq1->lock != rq2->lock) ++ raw_spin_unlock(rq2->lock); ++ else ++ __release(rq2->lock); ++} ++ ++static inline void lock_all_rqs(void) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_lock(rq->lock); ++ } ++} ++ ++static inline void unlock_all_rqs(void) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_unlock(rq->lock); ++ } ++ preempt_enable(); ++} ++ ++/* Specially nest trylock an rq */ ++static inline bool trylock_rq(struct rq *this_rq, struct rq *rq) ++{ ++ if (unlikely(!do_raw_spin_trylock(rq->lock))) ++ return false; ++ spin_acquire(rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ synchronise_niffies(this_rq, rq); ++ return true; ++} ++ ++/* Unlock a specially nested trylocked rq */ ++static inline void unlock_rq(struct rq *rq) ++{ ++ spin_release(rq->lock.dep_map, 1, _RET_IP_); ++ do_raw_spin_unlock(rq->lock); ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. ++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * This cmpxchg() executes a full barrier, which pairs with the full ++ * barrier executed by the wakeup in wake_up_q(). ++ */ ++ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) ++ return; ++ ++ get_task_struct(task); ++ ++ /* ++ * The head is context local, there can be no concurrency. 
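A userspace rendition of the fetch_or() macro above using C11 atomics, so the return-the-old-value contract is visible: the caller learns whether bits such as _TIF_POLLING_NRFLAG were already set when it ORed _TIF_NEED_RESCHED in. A sketch only; the kernel macro is type-generic where this function is not.

#include <stdatomic.h>
#include <stdio.h>

static unsigned long fetch_or(_Atomic unsigned long *p, unsigned long mask)
{
    unsigned long val = atomic_load(p);

    /* weak CAS in a loop: on failure, val is reloaded for us */
    while (!atomic_compare_exchange_weak(p, &val, val | mask))
        ;
    return val; /* the value seen before our OR, as the macro's _old */
}

int main(void)
{
    _Atomic unsigned long flags = 0x2; /* pretend a polling bit is set */
    unsigned long old = fetch_or(&flags, 0x1);

    printf("old=%#lx new=%#lx\n", old, atomic_load(&flags));
    return 0;
}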
++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* Task can safely be re-inserted now */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++static inline void smp_sched_reschedule(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ smp_send_reschedule(cpu); ++} ++ ++/* ++ * resched_task - mark a task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_task(struct task_struct *p) ++{ ++ int cpu; ++#ifdef CONFIG_LOCKDEP ++ /* Kernel threads call this when creating workqueues while still ++ * inactive from __kthread_bind_mask, holding only the pi_lock */ ++ if (!(p->flags & PF_KTHREAD)) { ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(rq->lock); ++ } ++#endif ++ if (test_tsk_need_resched(p)) ++ return; ++ ++ cpu = task_cpu(p); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(p)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++/* ++ * A task that is not running or queued will not have a node set. ++ * A task that is queued but not running will have a node set. ++ * A task that is currently running will have ->on_cpu set but no node set. ++ */ ++static inline bool task_queued(struct task_struct *p) ++{ ++ return !skiplist_node_empty(&p->node); ++} ++ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags); ++static inline void resched_if_idle(struct rq *rq); ++ ++/* Dodgy workaround till we figure out where the softirqs are going */ ++static inline void do_pending_softirq(struct rq *rq, struct task_struct *next) ++{ ++ if (unlikely(next == rq->idle && local_softirq_pending() && !in_interrupt())) ++ do_softirq_own_stack(); ++} ++ ++static inline bool deadline_before(u64 deadline, u64 time) ++{ ++ return (deadline < time); ++} ++ ++/* ++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline ++ * is the key to everything. It distributes cpu fairly amongst tasks of the ++ * same nice value, it proportions cpu according to nice level, it means the ++ * task that last woke up the longest ago has the earliest deadline, thus ++ * ensuring that interactive tasks get low latency on wake up. The CPU ++ * proportion works out to the square of the virtual deadline difference, so ++ * this equation will give nice 19 3% CPU compared to nice 0. 
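Plugging numbers into prio_deadline_diff() (which follows below) shows the scale of the offsets. The ratio values assume the roughly 10%-per-nice-level prio_ratios progression the patch installs at boot, starting from 128 at nice -20; they are approximations, with rr_interval at its default of 6.

#include <stdio.h>

#define MS_TO_NS(t) ((unsigned long long)(t) << 20)

/* deadline offset for one nice level, as in prio_deadline_diff() */
static unsigned long long ddiff(unsigned long long ratio, int rr_interval)
{
    return ratio * rr_interval * (MS_TO_NS(1) / 128);
}

int main(void)
{
    printf("nice -20: %llu ns (~6ms)\n", ddiff(128, 6));
    printf("nice   0: %llu ns (~41ms)\n", ddiff(836, 6));
    printf("nice  19: %llu ns (~250ms)\n", ddiff(5089, 6));
    return 0;
}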
++ */ ++static inline u64 prio_deadline_diff(int user_prio) ++{ ++ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); ++} ++ ++static inline u64 task_deadline_diff(struct task_struct *p) ++{ ++ return prio_deadline_diff(TASK_USER_PRIO(p)); ++} ++ ++static inline u64 static_deadline_diff(int static_prio) ++{ ++ return prio_deadline_diff(USER_PRIO(static_prio)); ++} ++ ++static inline int longest_deadline_diff(void) ++{ ++ return prio_deadline_diff(39); ++} ++ ++static inline int ms_longest_deadline_diff(void) ++{ ++ return NS_TO_MS(longest_deadline_diff()); ++} ++ ++static inline bool rq_local(struct rq *rq); ++ ++#ifndef SCHED_CAPACITY_SCALE ++#define SCHED_CAPACITY_SCALE 1024 ++#endif ++ ++static inline int rq_load(struct rq *rq) ++{ ++ return rq->nr_running; ++} ++ ++/* ++ * Update the load average for feeding into cpu frequency governors. Use a ++ * rough estimate of a rolling average with ~ time constant of 32ms. ++ * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 ++ * Make sure a call to update_clocks has been made before calling this to get ++ * an updated rq->niffies. ++ */ ++static void update_load_avg(struct rq *rq, unsigned int flags) ++{ ++ long us_interval, load; ++ unsigned long curload; ++ ++ us_interval = NS_TO_US(rq->niffies - rq->load_update); ++ if (unlikely(us_interval <= 0)) ++ return; ++ ++ curload = rq_load(rq); ++ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); ++ if (unlikely(load < 0)) ++ load = 0; ++ load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; ++ rq->load_avg = load; ++ ++ rq->load_update = rq->niffies; ++ update_irq_load_avg(rq, 0); ++ if (likely(rq_local(rq))) ++ cpufreq_trigger(rq, flags); ++} ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++/* ++ * IRQ variant of update_load_avg below. delta is actually time in nanoseconds ++ * here so we scale curload to how long it's been since the last update. ++ */ ++static void update_irq_load_avg(struct rq *rq, long delta) ++{ ++ long us_interval, load; ++ unsigned long curload; ++ ++ us_interval = NS_TO_US(rq->niffies - rq->irq_load_update); ++ if (unlikely(us_interval <= 0)) ++ return; ++ ++ curload = NS_TO_US(delta) / us_interval; ++ load = rq->irq_load_avg - (rq->irq_load_avg * us_interval * 5 / 262144); ++ if (unlikely(load < 0)) ++ load = 0; ++ load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; ++ rq->irq_load_avg = load; ++ ++ rq->irq_load_update = rq->niffies; ++} ++#endif ++ ++/* ++ * Removing from the runqueue. Enter with rq locked. Deleting a task ++ * from the skip list is done via the stored node reference in the task struct ++ * and does not require a full look up. Thus it occurs in O(k) time where k ++ * is the "level" of the list the task was stored at - usually < 4, max 8. ++ */ ++static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ skiplist_delete(rq->sl, &p->node); ++ rq->best_key = rq->node->next[0]->key; ++ update_clocks(rq); ++ ++ if (!(flags & DEQUEUE_SAVE)) ++ sched_info_dequeued(task_rq(p), p); ++ rq->nr_running--; ++ if (rt_task(p)) ++ rq->rt_nr_running--; ++ update_load_avg(rq, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_RCU ++static bool rcu_read_critical(struct task_struct *p) ++{ ++ return p->rcu_read_unlock_special.b.blocked; ++} ++#else /* CONFIG_PREEMPT_RCU */ ++#define rcu_read_critical(p) (false) ++#endif /* CONFIG_PREEMPT_RCU */ ++ ++/* ++ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as ++ * an idle task, we ensure none of the following conditions are met. 
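Checking the decay constant from update_load_avg above in isolation, with a made-up load figure: over one 32768us interval the old average keeps 1 - 32768*5/262144 = 0.375 of its value, close to exp(-1), which is what a ~32ms time constant means.

#include <stdio.h>

int main(void)
{
    long long load = 100000, us_interval = 32768;

    /* one full time constant: load * (1 - 32768*5/262144) */
    load -= load * us_interval * 5 / 262144;
    printf("%lld\n", load); /* 37500 */
    return 0;
}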
++ */ ++static bool idleprio_suitable(struct task_struct *p) ++{ ++ return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) && ++ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); ++} ++ ++/* ++ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check ++ * that the iso_refractory flag is not set. ++ */ ++static inline bool isoprio_suitable(struct rq *rq) ++{ ++ return !rq->iso_refractory; ++} ++ ++/* ++ * Adding to the runqueue. Enter with rq locked. ++ */ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ unsigned int randseed, cflags = 0; ++ u64 sl_id; ++ ++ if (!rt_task(p)) { ++ /* Check it hasn't gotten rt from PI */ ++ if ((idleprio_task(p) && idleprio_suitable(p)) || ++ (iso_task(p) && isoprio_suitable(rq))) ++ p->prio = p->normal_prio; ++ else ++ p->prio = NORMAL_PRIO; ++ } else ++ rq->rt_nr_running++; ++ /* ++ * The sl_id key passed to the skiplist generates a sorted list. ++ * Realtime and sched iso tasks run FIFO so they only need be sorted ++ * according to priority. The skiplist will put tasks of the same ++ * key inserted later in FIFO order. Tasks of sched normal, batch ++ * and idleprio are sorted according to their deadlines. Idleprio ++ * tasks are offset by an impossibly large deadline value ensuring ++ * they get sorted into last positions, but still according to their ++ * own deadlines. This creates a "landscape" of skiplists running ++ * from priority 0 realtime in first place to the lowest priority ++ * idleprio tasks last. Skiplist insertion is an O(log n) process. ++ */ ++ if (p->prio <= ISO_PRIO) { ++ sl_id = p->prio; ++ } else { ++ sl_id = p->deadline; ++ if (idleprio_task(p)) { ++ if (p->prio == IDLE_PRIO) ++ sl_id |= 0xF000000000000000; ++ else ++ sl_id += longest_deadline_diff(); ++ } ++ } ++ /* ++ * Some architectures don't have better than microsecond resolution ++ * so mask out ~microseconds as the random seed for skiplist insertion. ++ */ ++ update_clocks(rq); ++ if (!(flags & ENQUEUE_RESTORE)) ++ sched_info_queued(rq, p); ++ randseed = (rq->niffies >> 10) & 0xFFFFFFFF; ++ skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); ++ rq->best_key = rq->node->next[0]->key; ++ if (p->in_iowait) ++ cflags |= SCHED_CPUFREQ_IOWAIT; ++ rq->nr_running++; ++ update_load_avg(rq, cflags); ++} ++ ++/* ++ * Returns the relative length of deadline all compared to the shortest ++ * deadline which is that of nice -20. ++ */ ++static inline int task_prio_ratio(struct task_struct *p) ++{ ++ return prio_ratios[TASK_USER_PRIO(p)]; ++} ++ ++/* ++ * task_timeslice - all tasks of all priorities get the exact same timeslice ++ * length. CPU distribution is handled by giving different deadlines to ++ * tasks of different priorities. Use 128 as the base value for fast shifts. ++ */ ++static inline int task_timeslice(struct task_struct *p) ++{ ++ return (rr_interval * task_prio_ratio(p) / 128); ++} ++ ++#ifdef CONFIG_SMP ++/* Entered with rq locked */ ++static inline void resched_if_idle(struct rq *rq) ++{ ++ if (rq_idle(rq)) ++ resched_task(rq->curr); ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return (rq->cpu == smp_processor_id()); ++} ++#ifdef CONFIG_SMT_NICE ++static const cpumask_t *thread_cpumask(int cpu); ++ ++/* Find the best real time priority running on any SMT siblings of cpu and if ++ * none are running, the static priority of the best deadline task running. ++ * The lookups to the other runqueues is done lockless as the occasional wrong ++ * value would be harmless. 
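The key layout that enqueue_task() above feeds to the skiplist, distilled into a standalone helper: rt and iso tasks sort purely on priority, normal and batch tasks on deadline, and idleprio keys are pushed past normal deadlines (or into the top bits once running at IDLE_PRIO) so they always sort last. The priority constants are stand-ins for the values set up in sched/prio.h earlier in this patch.

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long long u64;

#define ISO_PRIO 101
#define IDLE_PRIO 103

static u64 sl_id(int prio, bool idleprio, u64 deadline, u64 longest_diff)
{
    if (prio <= ISO_PRIO)      /* rt and iso sort on priority, FIFO */
        return prio;
    if (idleprio)              /* idleprio sorts after everything else */
        return prio == IDLE_PRIO ? (deadline | 0xF000000000000000ULL)
                                 : deadline + longest_diff;
    return deadline;           /* normal/batch sort on deadline */
}

int main(void)
{
    printf("rt:   %llu\n", sl_id(50, false, 0, 0));            /* tiny key */
    printf("norm: %llu\n", sl_id(102, false, 123456789, 0));
    printf("idle: %llu\n", sl_id(IDLE_PRIO, true, 123456789, 0)); /* huge */
    return 0;
}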
*/ ++static int best_smt_bias(struct rq *this_rq) ++{ ++ int other_cpu, best_bias = 0; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq = cpu_rq(other_cpu); ++ ++ if (rq_idle(rq)) ++ continue; ++ if (unlikely(!rq->online)) ++ continue; ++ if (!rq->rq_mm) ++ continue; ++ if (likely(rq->rq_smt_bias > best_bias)) ++ best_bias = rq->rq_smt_bias; ++ } ++ return best_bias; ++} ++ ++static int task_prio_bias(struct task_struct *p) ++{ ++ if (rt_task(p)) ++ return 1 << 30; ++ else if (task_running_iso(p)) ++ return 1 << 29; ++ else if (task_running_idle(p)) ++ return 0; ++ return MAX_PRIO - p->static_prio; ++} ++ ++static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq) ++{ ++ return true; ++} ++ ++static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule; ++ ++/* We've already decided p can run on CPU, now test if it shouldn't for SMT ++ * nice reasons. */ ++static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) ++{ ++ int best_bias, task_bias; ++ ++ /* Kernel threads always run */ ++ if (unlikely(!p->mm)) ++ return true; ++ if (rt_task(p)) ++ return true; ++ if (!idleprio_suitable(p)) ++ return true; ++ best_bias = best_smt_bias(this_rq); ++ /* The smt siblings are all idle or running IDLEPRIO */ ++ if (best_bias < 1) ++ return true; ++ task_bias = task_prio_bias(p); ++ if (task_bias < 1) ++ return false; ++ if (task_bias >= best_bias) ++ return true; ++ /* Dither 25% cpu of normal tasks regardless of nice difference */ ++ if (best_bias % 4 == 1) ++ return true; ++ /* Sorry, you lose */ ++ return false; ++} ++#else /* CONFIG_SMT_NICE */ ++#define smt_schedule(p, this_rq) (true) ++#endif /* CONFIG_SMT_NICE */ ++ ++static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) ++{ ++ set_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++/* ++ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to ++ * allow easy lookup of whether any suitable idle CPUs are available. ++ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the ++ * idle_cpus variable than to do a full bitmask check when we are busy. The ++ * bits are set atomically but read locklessly as occasional false positive / ++ * negative is harmless. ++ */ ++static inline void set_cpuidle_map(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ atomic_set_cpu(cpu, &cpu_idle_map); ++} ++ ++static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) ++{ ++ clear_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++ atomic_clear_cpu(cpu, &cpu_idle_map); ++} ++ ++static bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return (cpumask_intersects(&p->cpus_allowed, &cpu_idle_map)); ++} ++ ++/* ++ * Resched current on rq. We don't know if rq is local to this CPU nor if it ++ * is locked so we do not use an intermediate variable for the task to avoid ++ * having it dereferenced. 
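The cpu_idle_map pattern just described, reduced to userspace: one bit per cpu, set and cleared atomically, read locklessly where a stale answer is harmless. The GCC/Clang __atomic builtins stand in for the kernel's set_bit/clear_bit.

#include <stdio.h>

static unsigned long cpu_idle_map;

static void set_cpuidle(int cpu)
{
    __atomic_fetch_or(&cpu_idle_map, 1UL << cpu, __ATOMIC_RELAXED);
}

static void clear_cpuidle(int cpu)
{
    __atomic_fetch_and(&cpu_idle_map, ~(1UL << cpu), __ATOMIC_RELAXED);
}

static int any_suitable_idle(unsigned long allowed)
{
    /* lockless read: false positives/negatives are tolerable */
    return (cpu_idle_map & allowed) != 0;
}

int main(void)
{
    set_cpuidle(2);
    printf("%d\n", any_suitable_idle(1UL << 2 | 1UL << 3)); /* 1 */
    clear_cpuidle(2);
    printf("%d\n", any_suitable_idle(1UL << 2 | 1UL << 3)); /* 0 */
    return 0;
}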
++ */ ++static void resched_curr(struct rq *rq) ++{ ++ int cpu; ++ ++ if (test_tsk_need_resched(rq->curr)) ++ return; ++ ++ rq->preempt = rq->curr; ++ cpu = rq->cpu; ++ ++ /* We're doing this without holding the rq lock if it's not task_rq */ ++ ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(rq->curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(rq->curr)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++#define CPUIDLE_DIFF_THREAD (1) ++#define CPUIDLE_DIFF_CORE (2) ++#define CPUIDLE_CACHE_BUSY (4) ++#define CPUIDLE_DIFF_CPU (8) ++#define CPUIDLE_THREAD_BUSY (16) ++#define CPUIDLE_DIFF_NODE (32) ++ ++/* ++ * The best idle CPU is chosen according to the CPUIDLE ranking above where the ++ * lowest value would give the most suitable CPU to schedule p onto next. The ++ * order works out to be the following: ++ * ++ * Same thread, idle or busy cache, idle or busy threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads. ++ * Other core, same cache, busy threads. ++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. ++ */ ++static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) ++{ ++ int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | ++ CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | ++ CPUIDLE_DIFF_THREAD; ++ int cpu_tmp; ++ ++ if (cpumask_test_cpu(best_cpu, tmpmask)) ++ goto out; ++ ++ for_each_cpu(cpu_tmp, tmpmask) { ++ int ranking, locality; ++ struct rq *tmp_rq; ++ ++ ranking = 0; ++ tmp_rq = cpu_rq(cpu_tmp); ++ ++ locality = rq->cpu_locality[cpu_tmp]; ++#ifdef CONFIG_NUMA ++ if (locality > 3) ++ ranking |= CPUIDLE_DIFF_NODE; ++ else ++#endif ++ if (locality > 2) ++ ranking |= CPUIDLE_DIFF_CPU; ++#ifdef CONFIG_SCHED_MC ++ else if (locality == 2) ++ ranking |= CPUIDLE_DIFF_CORE; ++ else if (!(tmp_rq->cache_idle(tmp_rq))) ++ ranking |= CPUIDLE_CACHE_BUSY; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ if (locality == 1) ++ ranking |= CPUIDLE_DIFF_THREAD; ++ if (!(tmp_rq->siblings_idle(tmp_rq))) ++ ranking |= CPUIDLE_THREAD_BUSY; ++#endif ++ if (ranking < best_ranking) { ++ best_cpu = cpu_tmp; ++ best_ranking = ranking; ++ } ++ } ++out: ++ return best_cpu; ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ struct rq *this_rq = cpu_rq(this_cpu); ++ ++ return (this_rq->cpu_locality[that_cpu] < 3); ++} ++ ++/* As per resched_curr but only will resched idle task */ ++static inline void resched_idle(struct rq *rq) ++{ ++ if (test_tsk_need_resched(rq->idle)) ++ return; ++ ++ rq->preempt = rq->idle; ++ ++ set_tsk_need_resched(rq->idle); ++ ++ if (rq_local(rq)) { ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ smp_sched_reschedule(rq->cpu); ++} ++ ++static struct rq *resched_best_idle(struct task_struct *p, int cpu) ++{ ++ cpumask_t tmpmask; ++ struct rq *rq; ++ int best_cpu; ++ ++ cpumask_and(&tmpmask, &p->cpus_allowed, &cpu_idle_map); ++ best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask); ++ rq = cpu_rq(best_cpu); ++ if (!smt_schedule(p, rq)) ++ return NULL; ++ rq->preempt = p; ++ resched_idle(rq); ++ return rq; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++} ++ ++static inline struct rq *rq_order(struct rq 
*rq, int cpu) ++{ ++ return rq->rq_order[cpu]; ++} ++#else /* CONFIG_SMP */ ++static inline void set_cpuidle_map(int cpu) ++{ ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++} ++ ++static inline bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return uprq->curr == uprq->idle; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++} ++ ++static inline void resched_curr(struct rq *rq) ++{ ++ resched_task(rq->curr); ++} ++ ++static inline void resched_if_idle(struct rq *rq) ++{ ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return true; ++} ++ ++static inline struct rq *rq_order(struct rq *rq, int cpu) ++{ ++ return rq; ++} ++ ++static inline bool smt_schedule(struct task_struct *p, struct rq *rq) ++{ ++ return true; ++} ++#endif /* CONFIG_SMP */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ if (has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ if (idleprio_task(p)) ++ return IDLE_PRIO; ++ if (iso_task(p)) ++ return ISO_PRIO; ++ return NORMAL_PRIO; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. Enter with rq locked. ++ */ ++static void activate_task(struct task_struct *p, struct rq *rq) ++{ ++ resched_if_idle(rq); ++ ++ /* ++ * Sleep time is in units of nanosecs, so shift by 20 to get a ++ * milliseconds-range estimation of the amount of time that the task ++ * spent sleeping: ++ */ ++ if (unlikely(prof_on == SLEEP_PROFILING)) { ++ if (p->state == TASK_UNINTERRUPTIBLE) ++ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), ++ (rq->niffies - p->last_ran) >> 20); ++ } ++ ++ p->prio = effective_prio(p); ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; ++ ++ enqueue_task(rq, p, 0); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++} ++ ++/* ++ * deactivate_task - If it's running, it's not on the runqueue and we can just ++ * decrement the nr_running. Enter with rq locked. ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; ++ ++ p->on_rq = 0; ++ sched_info_dequeued(rq, p); ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++ struct rq *rq; ++ ++ if (task_cpu(p) == new_cpu) ++ return; ++ ++ /* Do NOT call set_task_cpu on a currently queued task as we will not ++ * be reliably holding the rq lock after changing CPU. */ ++ BUG_ON(task_queued(p)); ++ rq = task_rq(p); ++ ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * Furthermore, all task_rq users should acquire both locks, see ++ * task_rq_lock(). 
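How the CPUIDLE_* penalty flags in best_mask_cpu() above combine, as a toy standalone function: each source of distance ORs in a bit, and the candidate with the smallest total wins, reproducing the ordering listed in that function's comment. The locality values (1 = SMT sibling through 4 = other node) and the idle booleans are invented inputs.

#include <stdio.h>

#define DIFF_THREAD  1
#define DIFF_CORE    2
#define CACHE_BUSY   4
#define DIFF_CPU     8
#define THREAD_BUSY 16
#define DIFF_NODE   32

static int ranking(int locality, int cache_idle, int threads_idle)
{
    int r = 0;

    if (locality > 3)
        r |= DIFF_NODE;
    else if (locality > 2)
        r |= DIFF_CPU;
    else if (locality == 2)
        r |= DIFF_CORE;
    else if (!cache_idle)
        r |= CACHE_BUSY;
    if (locality == 1)
        r |= DIFF_THREAD;
    if (!threads_idle)
        r |= THREAD_BUSY;
    return r;
}

int main(void)
{
    /* a busy SMT sibling (1|16=17) loses to an idle core on our cache (2) */
    printf("%d vs %d\n", ranking(1, 1, 0), ranking(2, 1, 1));
    return 0;
}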
++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(rq->lock))); ++#endif ++ ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ /* ++ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ ++ p->wake_cpu = new_cpu; ++ ++ if (task_running(rq, p)) { ++ /* ++ * We should only be calling this on a running task if we're ++ * holding rq lock. ++ */ ++ lockdep_assert_held(rq->lock); ++ ++ /* ++ * We can't change the task_thread_info CPU on a running task ++ * as p will still be protected by the rq lock of the CPU it ++ * is still running on so we only set the wake_cpu for it to be ++ * lazily updated once off the CPU. ++ */ ++ return; ++ } ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ p->cpu = new_cpu; ++#else ++ task_thread_info(p)->cpu = new_cpu; ++#endif ++ /* We're no longer protecting p after this point since we're holding ++ * the wrong runqueue lock. */ ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Move a task off the runqueue and take it to a cpu for it will ++ * become the running task. ++ */ ++static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) ++{ ++ struct rq *p_rq = task_rq(p); ++ ++ dequeue_task(p_rq, p, DEQUEUE_SAVE); ++ if (p_rq != rq) { ++ sched_info_dequeued(p_rq, p); ++ sched_info_queued(rq, p); ++ } ++ set_task_cpu(p, cpu); ++} ++ ++/* ++ * Returns a descheduling task to the runqueue unless it is being ++ * deactivated. ++ */ ++static inline void return_task(struct task_struct *p, struct rq *rq, ++ int cpu, bool deactivate) ++{ ++ if (deactivate) ++ deactivate_task(p, rq); ++ else { ++#ifdef CONFIG_SMP ++ /* ++ * set_task_cpu was called on the running task that doesn't ++ * want to deactivate so it has to be enqueued to a different ++ * CPU and we need its lock. Tag it to be moved with as the ++ * lock is dropped in finish_lock_switch. ++ */ ++ if (unlikely(p->wake_cpu != cpu)) ++ p->on_rq = TASK_ON_RQ_MIGRATING; ++ else ++#endif ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ } ++} ++ ++/* Enter with rq lock held. We know p is on the local cpu */ ++static inline void __set_tsk_resched(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. 
++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ int running, queued; ++ unsigned long flags; ++ unsigned long ncsw; ++ struct rq *rq; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(rq, p)) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ rq = task_rq_lock(p, &flags); ++ trace_sched_wait_task(p); ++ running = task_running(rq, p); ++ queued = task_on_rq_queued(p); ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_rq_unlock(rq, p, &flags); ++ ++ /* ++ * If it changed from the expected state, bail out now. ++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(queued)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_sched_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++#endif ++ ++/* ++ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the ++ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or ++ * between themselves, they cooperatively multitask. An idle rq scores as ++ * prio PRIO_LIMIT so it is always preempted. 
++ */ ++static inline bool ++can_preempt(struct task_struct *p, int prio, u64 deadline) ++{ ++ /* Better static priority RT task or better policy preemption */ ++ if (p->prio < prio) ++ return true; ++ if (p->prio > prio) ++ return false; ++ if (p->policy == SCHED_BATCH) ++ return false; ++ /* SCHED_NORMAL and ISO will preempt based on deadline */ ++ if (!deadline_before(p->deadline, deadline)) ++ return false; ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++ ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ if (!(p->flags & PF_KTHREAD)) ++ return false; ++ ++ if (p->nr_cpus_allowed != 1) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * Check to see if p can run on cpu, and if not, whether there are any online ++ * CPUs it can run on instead. This only happens with the hotplug threads that ++ * bring up the CPUs. ++ */ ++static inline bool sched_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (likely(cpumask_test_cpu(cpu, &p->cpus_allowed))) ++ return false; ++ if (p->nr_cpus_allowed == 1) { ++ cpumask_t valid_mask; ++ ++ cpumask_and(&valid_mask, &p->cpus_allowed, cpu_online_mask); ++ if (unlikely(cpumask_empty(&valid_mask))) ++ return false; ++ } ++ return true; ++} ++ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (cpumask_test_cpu(cpu, &p->cpus_allowed)) ++ return false; ++ return true; ++} ++ ++#define cpu_online_map (*(cpumask_t *)cpu_online_mask) ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ int i, this_entries = rq_load(this_rq); ++ cpumask_t tmp; ++ ++ if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p))) ++ return; ++ ++ /* IDLEPRIO tasks never preempt anything but idle */ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ ++ cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed); ++ ++ for (i = 0; i < num_possible_cpus(); i++) { ++ struct rq *rq = this_rq->cpu_order[i]; ++ ++ if (!cpumask_test_cpu(rq->cpu, &tmp)) ++ continue; ++ ++ if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries) ++ continue; ++ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { ++ /* We set rq->preempting lockless, it's a hint only */ ++ rq->preempting = p; ++ resched_curr(rq); ++ return; ++ } ++ } ++} ++ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check); ++#else /* CONFIG_SMP */ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ return false; ++} ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) ++ resched_curr(uprq); ++} ++ ++static inline int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ 
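++ /* Nothing to account unless schedstats has been enabled */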
++ if (!schedstat_enabled())
++ return;
++
++ rq = this_rq();
++
++#ifdef CONFIG_SMP
++ if (cpu == rq->cpu) {
++ __schedstat_inc(rq->ttwu_local);
++ } else {
++ struct sched_domain *sd;
++
++ rcu_read_lock();
++ for_each_domain(rq->cpu, sd) {
++ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
++ __schedstat_inc(sd->ttwu_wake_remote);
++ break;
++ }
++ }
++ rcu_read_unlock();
++ }
++
++#endif /* CONFIG_SMP */
++
++ __schedstat_inc(rq->ttwu_count);
++}
++
++static inline void ttwu_activate(struct rq *rq, struct task_struct *p)
++{
++ activate_task(p, rq);
++
++ /* if a worker is waking up, notify the workqueue */
++ if (p->flags & PF_WQ_WORKER)
++ wq_worker_waking_up(p, cpu_of(rq));
++}
++
++/*
++ * Mark the task runnable and perform wakeup-preemption.
++ */
++static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
++{
++ /*
++ * Sync wakeups (i.e. those types of wakeups where the waker
++ * has indicated that it will leave the CPU in short order)
++ * don't trigger a preemption if there are no idle cpus,
++ * instead waiting for current to deschedule.
++ */
++ if (wake_flags & WF_SYNC)
++ resched_suitable_idle(p);
++ else
++ try_preempt(p, rq);
++ p->state = TASK_RUNNING;
++ trace_sched_wakeup(p);
++}
++
++static void
++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
++{
++ lockdep_assert_held(rq->lock);
++
++#ifdef CONFIG_SMP
++ if (p->sched_contributes_to_load)
++ rq->nr_uninterruptible--;
++#endif
++
++ ttwu_activate(rq, p);
++ ttwu_do_wakeup(rq, p, wake_flags);
++}
++
++/*
++ * Called in case the task @p isn't fully descheduled from its runqueue;
++ * in this case we must do a remote wakeup. It's a 'light' wakeup though,
++ * since all we need to do is flip p->state to TASK_RUNNING, as
++ * the task is still ->on_rq.
++ */
++static int ttwu_remote(struct task_struct *p, int wake_flags)
++{
++ struct rq *rq;
++ int ret = 0;
++
++ rq = __task_rq_lock(p);
++ if (likely(task_on_rq_queued(p))) {
++ ttwu_do_wakeup(rq, p, wake_flags);
++ ret = 1;
++ }
++ __task_rq_unlock(rq);
++
++ return ret;
++}
++
++#ifdef CONFIG_SMP
++void sched_ttwu_pending(void)
++{
++ struct rq *rq = this_rq();
++ struct llist_node *llist = llist_del_all(&rq->wake_list);
++ struct task_struct *p, *t;
++ unsigned long flags;
++
++ if (!llist)
++ return;
++
++ rq_lock_irqsave(rq, &flags);
++
++ llist_for_each_entry_safe(p, t, llist, wake_entry)
++ ttwu_do_activate(rq, p, 0);
++
++ rq_unlock_irqrestore(rq, &flags);
++}
++
++void scheduler_ipi(void)
++{
++ /*
++ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
++ * TIF_NEED_RESCHED remotely (for the first time) will also send
++ * this IPI.
++ */
++ preempt_fold_need_resched();
++
++ if (llist_empty(&this_rq()->wake_list) && (!idle_cpu(smp_processor_id()) || need_resched()))
++ return;
++
++ /*
++ * Not all reschedule IPI handlers call irq_enter/irq_exit, since
++ * traditionally all their work was done from the interrupt return
++ * path. Now that we actually do some work, we need to make sure
++ * we do call them.
++ *
++ * Some archs already do call them, luckily irq_enter/exit nest
++ * properly.
++ *
++ * Arguably we should visit all archs and update all handlers,
++ * however a fair share of IPIs are still resched only so this would
++ * somewhat pessimize the simple resched case.
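++ *
++ * sched_ttwu_pending() below activates any tasks that remote CPUs
++ * have queued on this CPU's wake_list.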
++ */
++ irq_enter();
++ sched_ttwu_pending();
++ irq_exit();
++}
++
++static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
++{
++ struct rq *rq = cpu_rq(cpu);
++
++ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
++ if (!set_nr_if_polling(rq->idle))
++ smp_sched_reschedule(cpu);
++ else
++ trace_sched_wake_idle_without_ipi(cpu);
++ }
++}
++
++void wake_up_if_idle(int cpu)
++{
++ struct rq *rq = cpu_rq(cpu);
++ unsigned long flags;
++
++ rcu_read_lock();
++
++ if (!is_idle_task(rcu_dereference(rq->curr)))
++ goto out;
++
++ if (set_nr_if_polling(rq->idle)) {
++ trace_sched_wake_idle_without_ipi(cpu);
++ } else {
++ rq_lock_irqsave(rq, &flags);
++ if (likely(is_idle_task(rq->curr)))
++ smp_sched_reschedule(cpu);
++ /* Else cpu is not in idle, do nothing here */
++ rq_unlock_irqrestore(rq, &flags);
++ }
++
++out:
++ rcu_read_unlock();
++}
++
++static int valid_task_cpu(struct task_struct *p)
++{
++ cpumask_t valid_mask;
++
++ if (p->flags & PF_KTHREAD)
++ cpumask_and(&valid_mask, &p->cpus_allowed, cpu_all_mask);
++ else
++ cpumask_and(&valid_mask, &p->cpus_allowed, cpu_active_mask);
++
++ if (unlikely(!cpumask_weight(&valid_mask))) {
++ /* We shouldn't be hitting this any more */
++ printk(KERN_WARNING "SCHED: No cpumask for %s/%d weight %d\n", p->comm,
++ p->pid, cpumask_weight(&p->cpus_allowed));
++ return cpumask_any(&p->cpus_allowed);
++ }
++ return cpumask_any(&valid_mask);
++}
++
++/*
++ * For a task that's just being woken up we have a valuable balancing
++ * opportunity, so choose the nearest-cache, most lightly loaded runqueue
++ * and return its CPU.
++ */
++static inline int select_best_cpu(struct task_struct *p)
++{
++ unsigned int idlest = ~0U;
++ struct rq *rq = NULL;
++ int i;
++
++ if (suitable_idle_cpus(p)) {
++ int cpu = task_cpu(p);
++
++ if (unlikely(needs_other_cpu(p, cpu)))
++ cpu = valid_task_cpu(p);
++ rq = resched_best_idle(p, cpu);
++ if (likely(rq))
++ return rq->cpu;
++ }
++
++ for (i = 0; i < num_possible_cpus(); i++) {
++ struct rq *other_rq = task_rq(p)->cpu_order[i];
++ int entries;
++
++ if (!other_rq->online)
++ continue;
++ if (needs_other_cpu(p, other_rq->cpu))
++ continue;
++ entries = rq_load(other_rq);
++ if (entries >= idlest)
++ continue;
++ idlest = entries;
++ rq = other_rq;
++ }
++ if (unlikely(!rq))
++ return task_cpu(p);
++ return rq->cpu;
++}
++#else /* CONFIG_SMP */
++static int valid_task_cpu(struct task_struct *p)
++{
++ return 0;
++}
++
++static inline int select_best_cpu(struct task_struct *p)
++{
++ return 0;
++}
++
++static struct rq *resched_best_idle(struct task_struct *p, int cpu)
++{
++ return NULL;
++}
++#endif /* CONFIG_SMP */
++
++static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
++{
++ struct rq *rq = cpu_rq(cpu);
++
++#if defined(CONFIG_SMP)
++ if (!cpus_share_cache(smp_processor_id(), cpu)) {
++ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
++ ttwu_queue_remote(p, cpu, wake_flags);
++ return;
++ }
++#endif
++ rq_lock(rq);
++ ttwu_do_activate(rq, p, wake_flags);
++ rq_unlock(rq);
++}
++
++/***
++ * try_to_wake_up - wake up a thread
++ * @p: the thread to be awakened
++ * @state: the mask of task states that can be woken
++ * @wake_flags: wake modifier flags (WF_*)
++ *
++ * Put it on the run-queue if it's not already there.
The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int ++try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ++{ ++ unsigned long flags; ++ int cpu, success = 0; ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with mb() in ++ * set_current_state() the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ /* state is a volatile long, どうして、分からない */ ++ if (!((unsigned int)p->state & state)) ++ goto out; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ cpu = task_cpu(p); ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ if (p->on_rq && ttwu_remote(p, wake_flags)) ++ goto stat; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. ++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. 
++ */
++ smp_cond_load_acquire(&p->on_cpu, !VAL);
++
++ p->sched_contributes_to_load = !!task_contributes_to_load(p);
++ p->state = TASK_WAKING;
++
++ if (p->in_iowait) {
++ delayacct_blkio_end(p);
++ atomic_dec(&task_rq(p)->nr_iowait);
++ }
++
++ cpu = select_best_cpu(p);
++ if (task_cpu(p) != cpu)
++ set_task_cpu(p, cpu);
++
++#else /* CONFIG_SMP */
++
++ if (p->in_iowait) {
++ delayacct_blkio_end(p);
++ atomic_dec(&task_rq(p)->nr_iowait);
++ }
++
++#endif /* CONFIG_SMP */
++
++ ttwu_queue(p, cpu, wake_flags);
++stat:
++ ttwu_stat(p, cpu, wake_flags);
++out:
++ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
++
++ return success;
++}
++
++/**
++ * try_to_wake_up_local - try to wake up a local task with rq lock held
++ * @p: the thread to be awakened
++ *
++ * Put @p on the run-queue if it's not already there. The caller must
++ * ensure that rq is locked and @p is not the current task.
++ * rq stays locked over invocation.
++ */
++static void try_to_wake_up_local(struct task_struct *p)
++{
++ struct rq *rq = task_rq(p);
++
++ if (WARN_ON_ONCE(rq != this_rq()) ||
++ WARN_ON_ONCE(p == current))
++ return;
++
++ lockdep_assert_held(rq->lock);
++
++ if (!raw_spin_trylock(&p->pi_lock)) {
++ /*
++ * This is OK, because current is on_cpu, which avoids it being
++ * picked for load-balance and preemption/IRQs are still
++ * disabled avoiding further scheduler activity on it and we've
++ * not yet picked a replacement task.
++ */
++ rq_unlock(rq);
++ raw_spin_lock(&p->pi_lock);
++ rq_lock(rq);
++ }
++
++ if (!(p->state & TASK_NORMAL))
++ goto out;
++
++ trace_sched_waking(p);
++
++ if (!task_on_rq_queued(p)) {
++ if (p->in_iowait) {
++ delayacct_blkio_end(p);
++ atomic_dec(&rq->nr_iowait);
++ }
++ ttwu_activate(rq, p);
++ }
++
++ ttwu_do_wakeup(rq, p, 0);
++ ttwu_stat(p, smp_processor_id(), 0);
++out:
++ raw_spin_unlock(&p->pi_lock);
++}
++
++/**
++ * wake_up_process - Wake up a specific process
++ * @p: The process to be woken up.
++ *
++ * Attempt to wake up the nominated process and move it to the set of runnable
++ * processes.
++ *
++ * Return: 1 if the process was woken up, 0 if it was already running.
++ *
++ * This function executes a full memory barrier before accessing the task state.
++ */
++int wake_up_process(struct task_struct *p)
++{
++ return try_to_wake_up(p, TASK_NORMAL, 0);
++}
++EXPORT_SYMBOL(wake_up_process);
++
++int wake_up_state(struct task_struct *p, unsigned int state)
++{
++ return try_to_wake_up(p, state, 0);
++}
++
++static void time_slice_expired(struct task_struct *p, struct rq *rq);
++
++/*
++ * Perform scheduler related setup for a newly forked process p.
++ * p is forked by current.
++ */
++int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p)
++{
++ unsigned long flags;
++
++#ifdef CONFIG_PREEMPT_NOTIFIERS
++ INIT_HLIST_HEAD(&p->preempt_notifiers);
++#endif
++ /*
++ * We mark the process as NEW here. This guarantees that
++ * nobody will actually run it, and a signal or other external
++ * event cannot wake it up and insert it on the runqueue either.
++ */
++ p->state = TASK_NEW;
++
++ /*
++ * Note the child does not inherit the parent's running state; the
++ * TASK_NEW state above is what prevents it from being run or woken
++ * before scheduler setup has finished.
++ */ ++ ++ /* Should be reset in fork.c but done here for ease of MuQSS patching */ ++ p->on_cpu = ++ p->on_rq = ++ p->utime = ++ p->stime = ++ p->sched_time = ++ p->stime_ns = ++ p->utime_ns = 0; ++ skiplist_node_init(&p->node); ++ ++ /* ++ * Revert to default priority/policy on fork if requested. ++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { ++ p->policy = SCHED_NORMAL; ++ p->normal_prio = normal_prio(p); ++ } ++ ++ if (PRIO_TO_NICE(p->static_prio) < 0) { ++ p->static_prio = NICE_TO_PRIO(0); ++ p->normal_prio = p->static_prio; ++ } ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ /* ++ * Silence PROVE_RCU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ set_task_cpu(p, smp_processor_id()); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. 
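++ * Booting with schedstats=enable or schedstats=disable sets it.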
++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p); ++ ++static void account_task_cpu(struct rq *rq, struct task_struct *p) ++{ ++ update_clocks(rq); ++ /* This isn't really a context switch but accounting is the same */ ++ update_cpu_clock_switch(rq, p); ++ p->last_ran = rq->niffies; ++} ++ ++bool sched_smp_initialized __read_mostly; ++ ++static inline int hrexpiry_enabled(struct rq *rq) ++{ ++ if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized)) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrexpiry_timer); ++} ++ ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++static inline void hrexpiry_clear(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ if (hrtimer_active(&rq->hrexpiry_timer)) ++ hrtimer_cancel(&rq->hrexpiry_timer); ++} ++ ++/* ++ * High-resolution time_slice expiry. ++ * Runs from hardirq context with interrupts disabled. ++ */ ++static enum hrtimer_restart hrexpiry(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrexpiry_timer); ++ struct task_struct *p; ++ ++ /* This can happen during CPU hotplug / resume */ ++ if (unlikely(cpu_of(rq) != smp_processor_id())) ++ goto out; ++ ++ /* ++ * We're doing this without the runqueue lock but this should always ++ * be run on the local CPU. Time slice should run out in __schedule ++ * but we set it to zero here in case niffies is slightly less. ++ */ ++ p = rq->curr; ++ p->time_slice = 0; ++ __set_tsk_resched(p); ++out: ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Called to set the hrexpiry timer state. ++ * ++ * called with irqs disabled from the local CPU only ++ */ ++static void hrexpiry_start(struct rq *rq, u64 delay) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ ++ hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED); ++} ++ ++static void init_rq_hrexpiry(struct rq *rq) ++{ ++ hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ rq->hrexpiry_timer.function = hrexpiry; ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return HALF_JIFFY_US; ++ return 0; ++} ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. 
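++ *
++ * Both the parent's runqueue and the (possibly different) runqueue the
++ * child lands on are locked via double_rq_lock() while the parent's
++ * timeslice is split with the child.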
++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ struct task_struct *parent, *rq_curr; ++ struct rq *rq, *new_rq; ++ unsigned long flags; ++ ++ parent = p->parent; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ p->state = TASK_RUNNING; ++ /* Task_rq can't change yet on a new task */ ++ new_rq = rq = task_rq(p); ++ if (unlikely(needs_other_cpu(p, task_cpu(p)))) { ++ set_task_cpu(p, valid_task_cpu(p)); ++ new_rq = task_rq(p); ++ } ++ ++ double_rq_lock(rq, new_rq); ++ rq_curr = rq->curr; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = rq_curr->normal_prio; ++ ++ trace_sched_wakeup_new(p); ++ ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. If it's negative, it won't ++ * matter since that's the same as being 0. rq->rq_deadline is only ++ * modified within schedule() so it is always equal to ++ * current->deadline. ++ */ ++ account_task_cpu(rq, rq_curr); ++ p->last_ran = rq_curr->last_ran; ++ if (likely(rq_curr->policy != SCHED_FIFO)) { ++ rq_curr->time_slice /= 2; ++ if (rq_curr->time_slice < RESCHED_US) { ++ /* ++ * Forking task has run out of timeslice. Reschedule it and ++ * start its child with a new time slice and deadline. The ++ * child will end up running first because its deadline will ++ * be slightly earlier. ++ */ ++ __set_tsk_resched(rq_curr); ++ time_slice_expired(p, new_rq); ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++ else if (unlikely(rq != new_rq)) ++ try_preempt(p, new_rq); ++ } else { ++ p->time_slice = rq_curr->time_slice; ++ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { ++ /* ++ * The VM isn't cloned, so we're in a good position to ++ * do child-runs-first in anticipation of an exec. This ++ * usually avoids a lot of COW overhead. ++ */ ++ __set_tsk_resched(rq_curr); ++ } else { ++ /* ++ * Adjust the hrexpiry since rq_curr will keep ++ * running and its timeslice has been shortened. ++ */ ++ hrexpiry_start(rq, US_TO_NS(rq_curr->time_slice)); ++ try_preempt(p, new_rq); ++ } ++ } ++ } else { ++ time_slice_expired(p, new_rq); ++ try_preempt(p, new_rq); ++ } ++ activate_task(p, new_rq); ++ double_rq_unlock(rq, new_rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. 
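++ * (The notifier lists are walked locklessly when firing, so removing
++ * an entry mid-walk would corrupt the iteration.)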
++ */
++void preempt_notifier_unregister(struct preempt_notifier *notifier)
++{
++ hlist_del(&notifier->link);
++}
++EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
++
++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
++{
++ struct preempt_notifier *notifier;
++
++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
++ notifier->ops->sched_in(notifier, raw_smp_processor_id());
++}
++
++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
++{
++ if (static_branch_unlikely(&preempt_notifier_key))
++ __fire_sched_in_preempt_notifiers(curr);
++}
++
++static void
++__fire_sched_out_preempt_notifiers(struct task_struct *curr,
++ struct task_struct *next)
++{
++ struct preempt_notifier *notifier;
++
++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
++ notifier->ops->sched_out(notifier, next);
++}
++
++static __always_inline void
++fire_sched_out_preempt_notifiers(struct task_struct *curr,
++ struct task_struct *next)
++{
++ if (static_branch_unlikely(&preempt_notifier_key))
++ __fire_sched_out_preempt_notifiers(curr, next);
++}
++
++#else /* !CONFIG_PREEMPT_NOTIFIERS */
++
++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
++{
++}
++
++static inline void
++fire_sched_out_preempt_notifiers(struct task_struct *curr,
++ struct task_struct *next)
++{
++}
++
++#endif /* CONFIG_PREEMPT_NOTIFIERS */
++
++static inline void prepare_task(struct task_struct *next)
++{
++ /*
++ * Claim the task as running, we do this before switching to it
++ * such that any running task will have this set.
++ */
++ next->on_cpu = 1;
++}
++
++static inline void finish_task(struct task_struct *prev)
++{
++#ifdef CONFIG_SMP
++ /*
++ * After ->on_cpu is cleared, the task can be moved to a different CPU.
++ * We must ensure this doesn't happen until the switch is completely
++ * finished.
++ *
++ * In particular, the load of prev->state in finish_task_switch() must
++ * happen before this.
++ *
++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
++ */
++ smp_store_release(&prev->on_cpu, 0);
++#endif
++}
++
++static inline void
++prepare_lock_switch(struct rq *rq, struct task_struct *next)
++{
++ /*
++ * Since the runqueue lock will be released by the next
++ * task (which is an invalid locking op but in the case
++ * of the scheduler it's an obvious special-case), we
++ * do an early lockdep release here:
++ */
++ spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
++#ifdef CONFIG_DEBUG_SPINLOCK
++ /* this is a valid case when another task releases the spinlock */
++ rq->lock.owner = next;
++#endif
++}
++
++static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
++{
++ /*
++ * If we are tracking spinlock dependencies then we have to
++ * fix up the runqueue lock - which gets 'carried over' from
++ * prev into current:
++ */
++ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
++
++#ifdef CONFIG_SMP
++ /*
++ * If prev was marked as migrating to another CPU in return_task, drop
++ * the local runqueue lock but leave interrupts disabled and grab the
++ * remote lock we're migrating it to before enabling them.
++ */
++ if (unlikely(task_on_rq_migrating(prev))) {
++ sched_info_dequeued(rq, prev);
++ /*
++ * We move the ownership of prev to the new cpu now. ttwu can't
++ * activate prev to the wrong cpu since it has to grab this
++ * runqueue in ttwu_remote.
++ */
++#ifdef CONFIG_THREAD_INFO_IN_TASK
++ prev->cpu = prev->wake_cpu;
++#else
++ task_thread_info(prev)->cpu = prev->wake_cpu;
++#endif
++ raw_spin_unlock(rq->lock);
++
++ raw_spin_lock(&prev->pi_lock);
++ rq = __task_rq_lock(prev);
++ /* Check that someone else hasn't already queued prev */
++ if (likely(!task_queued(prev))) {
++ enqueue_task(rq, prev, 0);
++ prev->on_rq = TASK_ON_RQ_QUEUED;
++ /* Wake up the CPU if it's not already running */
++ resched_if_idle(rq);
++ }
++ raw_spin_unlock(&prev->pi_lock);
++ }
++#endif
++ rq_unlock(rq);
++
++ do_pending_softirq(rq, current);
++
++ local_irq_enable();
++}
++
++#ifndef prepare_arch_switch
++# define prepare_arch_switch(next) do { } while (0)
++#endif
++#ifndef finish_arch_switch
++# define finish_arch_switch(prev) do { } while (0)
++#endif
++#ifndef finish_arch_post_lock_switch
++# define finish_arch_post_lock_switch() do { } while (0)
++#endif
++
++/**
++ * prepare_task_switch - prepare to switch tasks
++ * @rq: the runqueue preparing to switch
++ * @prev: the current task that is being switched out.
++ * @next: the task we are going to switch to.
++ *
++ * This is called with the rq lock held and interrupts off. It must
++ * be paired with a subsequent finish_task_switch after the context
++ * switch.
++ *
++ * prepare_task_switch sets up locking and calls architecture-specific
++ * hooks.
++ */
++static inline void
++prepare_task_switch(struct rq *rq, struct task_struct *prev,
++ struct task_struct *next)
++{
++ kcov_prepare_switch(prev);
++ sched_info_switch(rq, prev, next);
++ perf_event_task_sched_out(prev, next);
++ rseq_preempt(prev);
++ fire_sched_out_preempt_notifiers(prev, next);
++ prepare_task(next);
++ prepare_arch_switch(next);
++}
++
++/**
++ * finish_task_switch - clean up after a task-switch
++ * @prev: the thread we just switched away from.
++ *
++ * finish_task_switch must be called after the context switch, paired
++ * with a prepare_task_switch call before the context switch.
++ * finish_task_switch will reconcile locking set up by prepare_task_switch,
++ * and do any other architecture-specific cleanup actions.
++ *
++ * Note that we may have delayed dropping an mm in context_switch(). If
++ * so, we finish that here outside of the runqueue lock. (Doing it
++ * with the lock held can cause deadlocks; see schedule() for
++ * details.)
++ *
++ * The context switch has flipped the stack from under us and restored the
++ * local variables which were saved when this task called schedule() in the
++ * past. prev == current is still correct but we need to recalculate this_rq
++ * because prev may have moved to another CPU.
++ */
++static void finish_task_switch(struct task_struct *prev)
++ __releases(rq->lock)
++{
++ struct rq *rq = this_rq();
++ struct mm_struct *mm = rq->prev_mm;
++ long prev_state;
++
++ /*
++ * The previous task will have left us with a preempt_count of 2
++ * because it left us after:
++ *
++ * schedule()
++ * preempt_disable(); // 1
++ * __schedule()
++ * raw_spin_lock_irq(rq->lock) // 2
++ *
++ * Also, see FORK_PREEMPT_COUNT.
++ */
++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
++ "corrupted preempt_count: %s/%d/0x%x\n",
++ current->comm, current->pid, preempt_count()))
++ preempt_count_set(FORK_PREEMPT_COUNT);
++
++ rq->prev_mm = NULL;
++
++ /*
++ * A task struct has one reference for the use as "current".
++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls
++ * schedule one last time.
The schedule call will never return, and
++ * the scheduled task must drop that reference.
++ *
++ * We must observe prev->state before clearing prev->on_cpu (in
++ * finish_task), otherwise a concurrent wakeup can get prev
++ * running on another CPU and we could race with its RUNNING -> DEAD
++ * transition, resulting in a double drop.
++ */
++ prev_state = prev->state;
++ vtime_task_switch(prev);
++ perf_event_task_sched_in(prev, current);
++ finish_task(prev);
++ finish_lock_switch(rq, prev);
++ finish_arch_post_lock_switch();
++ kcov_finish_switch(current);
++
++ fire_sched_in_preempt_notifiers(current);
++ /*
++ * When switching through a kernel thread, the loop in
++ * membarrier_{private,global}_expedited() may have observed that
++ * kernel thread and not issued an IPI. It is therefore possible to
++ * schedule between user->kernel->user threads without passing through
++ * switch_mm(). Membarrier requires a barrier after storing to
++ * rq->curr, before returning to userspace, so provide them here:
++ *
++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
++ * provided by mmdrop(),
++ * - a sync_core for SYNC_CORE.
++ */
++ if (mm) {
++ membarrier_mm_sync_core_before_usermode(mm);
++ mmdrop(mm);
++ }
++ if (unlikely(prev_state == TASK_DEAD)) {
++ /*
++ * Remove function-return probe instances associated with this
++ * task and put them back on the free list.
++ */
++ kprobe_flush_task(prev);
++
++ /* Task is done with its stack. */
++ put_task_stack(prev);
++
++ put_task_struct(prev);
++ }
++}
++
++/**
++ * schedule_tail - first thing a freshly forked thread must call.
++ * @prev: the thread we just switched away from.
++ */
++asmlinkage __visible void schedule_tail(struct task_struct *prev)
++{
++ /*
++ * New tasks start with FORK_PREEMPT_COUNT, see there and
++ * finish_task_switch() for details.
++ *
++ * finish_task_switch() will drop rq->lock() and lower preempt_count
++ * and the preempt_enable() will end up enabling preemption (on
++ * PREEMPT_COUNT kernels).
++ */
++
++ finish_task_switch(prev);
++ preempt_enable();
++
++ if (current->set_child_tid)
++ put_user(task_pid_vnr(current), current->set_child_tid);
++
++ calculate_sigpending();
++}
++
++/*
++ * context_switch - switch to the new MM and the new thread's register state.
++ */
++static __always_inline void
++context_switch(struct rq *rq, struct task_struct *prev,
++ struct task_struct *next)
++{
++ struct mm_struct *mm, *oldmm;
++
++ prepare_task_switch(rq, prev, next);
++
++ mm = next->mm;
++ oldmm = prev->active_mm;
++ /*
++ * For paravirt, this is coupled with an exit in switch_to to
++ * combine the page table reload and the switch backend into
++ * one hypercall.
++ */
++ arch_start_context_switch(prev);
++
++ /*
++ * If mm is non-NULL, we pass through switch_mm(). If mm is
++ * NULL, we will pass through mmdrop() in finish_task_switch().
++ * Both of these contain the full memory barrier required by
++ * membarrier after storing to rq->curr, before returning to
++ * user-space.
++ */
++ if (!mm) {
++ next->active_mm = oldmm;
++ mmgrab(oldmm);
++ enter_lazy_tlb(oldmm, next);
++ } else
++ switch_mm_irqs_off(oldmm, mm, next);
++
++ if (!prev->mm) {
++ prev->active_mm = NULL;
++ rq->prev_mm = oldmm;
++ }
++ prepare_lock_switch(rq, next);
++
++ /* Here we just switch the register state and the stack.
*/
++ switch_to(prev, next, prev);
++ barrier();
++
++ finish_task_switch(prev);
++}
++
++/*
++ * nr_running, nr_uninterruptible and nr_context_switches:
++ *
++ * externally visible scheduler statistics: current number of runnable
++ * threads, total number of context switches performed since bootup.
++ */
++unsigned long nr_running(void)
++{
++ unsigned long i, sum = 0;
++
++ for_each_online_cpu(i)
++ sum += cpu_rq(i)->nr_running;
++
++ return sum;
++}
++
++static unsigned long nr_uninterruptible(void)
++{
++ unsigned long i, sum = 0;
++
++ for_each_online_cpu(i)
++ sum += cpu_rq(i)->nr_uninterruptible;
++
++ return sum;
++}
++
++/*
++ * Check if only the current task is running on the CPU.
++ *
++ * Caution: this function does not check that the caller has disabled
++ * preemption, thus the result might have a time-of-check-to-time-of-use
++ * race. The caller is responsible for using it correctly, for example:
++ *
++ * - from a non-preemptable section (of course)
++ *
++ * - from a thread that is bound to a single CPU
++ *
++ * - in a loop with very short iterations (e.g. a polling loop)
++ */
++bool single_task_running(void)
++{
++ struct rq *rq = cpu_rq(smp_processor_id());
++
++ return rq_load(rq) == 1;
++}
++EXPORT_SYMBOL(single_task_running);
++
++unsigned long long nr_context_switches(void)
++{
++ int i;
++ unsigned long long sum = 0;
++
++ for_each_possible_cpu(i)
++ sum += cpu_rq(i)->nr_switches;
++
++ return sum;
++}
++
++/*
++ * IO-wait accounting, and how it's mostly bollocks (on SMP).
++ *
++ * The idea behind IO-wait accounting is to account the idle time that we
++ * could have spent running if it were not for IO. That is, if we were to
++ * improve the storage performance, we'd have a proportional reduction in
++ * IO-wait time.
++ *
++ * This all works nicely on UP, where, when a task blocks on IO, we account
++ * idle time as IO-wait, because if the storage were faster, it could've been
++ * running and we'd not be idle.
++ *
++ * This has been extended to SMP, by doing the same for each CPU. This however
++ * is broken.
++ *
++ * Imagine for instance the case where two tasks block on one CPU; only that
++ * one CPU will have IO-wait accounted, while the other has regular idle. Even
++ * though, if the storage were faster, both could've run at the same time,
++ * utilising both CPUs.
++ *
++ * This means that, when looking globally, the current IO-wait accounting on
++ * SMP is a lower bound, by reason of under accounting.
++ *
++ * Worse, since the numbers are provided per CPU, they are sometimes
++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
++ * associated with any one particular CPU; it can wake to another CPU than it
++ * blocked on. This means the per CPU IO-wait number is meaningless.
++ *
++ * Task CPU affinities can make all that even more 'interesting'.
++ */
++
++unsigned long nr_iowait(void)
++{
++ unsigned long i, sum = 0;
++
++ for_each_possible_cpu(i)
++ sum += atomic_read(&cpu_rq(i)->nr_iowait);
++
++ return sum;
++}
++
++/*
++ * Consumers of these two interfaces, like for example the cpufreq menu
++ * governor, are using nonsensical data: boosting frequency for a CPU that
++ * has IO-wait might not even end up running the task when it does become
++ * runnable.
++ */
++
++unsigned long nr_iowait_cpu(int cpu)
++{
++ struct rq *this = cpu_rq(cpu);
++ return atomic_read(&this->nr_iowait);
++}
++
++unsigned long nr_active(void)
++{
++ return nr_running() + nr_uninterruptible();
++}
++
++/*
++ * I/O wait is the number of running or queued tasks with their ->rq pointer
++ * set to this cpu, as that is the CPU they're more likely to run on.
++ */
++void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
++{
++ struct rq *rq = this_rq();
++
++ *nr_waiters = atomic_read(&rq->nr_iowait);
++ *load = rq_load(rq);
++}
++
++/* Variables and functions for calc_load */
++static unsigned long calc_load_update;
++unsigned long avenrun[3];
++EXPORT_SYMBOL(avenrun);
++
++/**
++ * get_avenrun - get the load average array
++ * @loads: pointer to dest load array
++ * @offset: offset to add
++ * @shift: shift count to shift the result left
++ *
++ * These values are estimates at best, so no need for locking.
++ */
++void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
++{
++ loads[0] = (avenrun[0] + offset) << shift;
++ loads[1] = (avenrun[1] + offset) << shift;
++ loads[2] = (avenrun[2] + offset) << shift;
++}
++
++static unsigned long
++calc_load(unsigned long load, unsigned long exp, unsigned long active)
++{
++ unsigned long newload;
++
++ newload = load * exp + active * (FIXED_1 - exp);
++ if (active >= load)
++ newload += FIXED_1-1;
++
++ return newload / FIXED_1;
++}
++
++/*
++ * calc_global_load - update the avenrun load estimates every LOAD_FREQ
++ * seconds.
++ */
++void calc_global_load(unsigned long ticks)
++{
++ long active;
++
++ if (time_before(jiffies, READ_ONCE(calc_load_update)))
++ return;
++ active = nr_active() * FIXED_1;
++
++ avenrun[0] = calc_load(avenrun[0], EXP_1, active);
++ avenrun[1] = calc_load(avenrun[1], EXP_5, active);
++ avenrun[2] = calc_load(avenrun[2], EXP_15, active);
++
++ calc_load_update = jiffies + LOAD_FREQ;
++}
++
++DEFINE_PER_CPU(struct kernel_stat, kstat);
++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
++
++EXPORT_PER_CPU_SYMBOL(kstat);
++EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
++
++#ifdef CONFIG_PARAVIRT
++static inline u64 steal_ticks(u64 steal)
++{
++ if (unlikely(steal > NSEC_PER_SEC))
++ return div_u64(steal, TICK_NSEC);
++
++ return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
++}
++#endif
++
++#ifndef nsecs_to_cputime
++# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
++#endif
++
++/*
++ * On each tick, add the number of nanoseconds to the unbanked variables and
++ * once one tick's worth has accumulated, account it allowing for accurate
++ * sub-tick accounting and totals. Use the TICK_APPROX_NS to match the way we
++ * deduct nanoseconds.
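++ *
++ * For example, at HZ 100 one tick's worth is JIFFY_NS = 10ms; anything
++ * short of that stays banked in the relevant _ns field until a whole
++ * tick accumulates.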
++ */
++static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns)
++{
++ u64 *cpustat = kcpustat_this_cpu->cpustat;
++ unsigned long ticks;
++
++ if (atomic_read(&rq->nr_iowait) > 0) {
++ rq->iowait_ns += ns;
++ if (rq->iowait_ns >= JIFFY_NS) {
++ ticks = NS_TO_JIFFIES(rq->iowait_ns);
++ cpustat[CPUTIME_IOWAIT] += (__force u64)TICK_APPROX_NS * ticks;
++ rq->iowait_ns %= JIFFY_NS;
++ }
++ } else {
++ rq->idle_ns += ns;
++ if (rq->idle_ns >= JIFFY_NS) {
++ ticks = NS_TO_JIFFIES(rq->idle_ns);
++ cpustat[CPUTIME_IDLE] += (__force u64)TICK_APPROX_NS * ticks;
++ rq->idle_ns %= JIFFY_NS;
++ }
++ }
++ acct_update_integrals(idle);
++}
++
++static void pc_system_time(struct rq *rq, struct task_struct *p,
++ int hardirq_offset, unsigned long ns)
++{
++ u64 *cpustat = kcpustat_this_cpu->cpustat;
++ unsigned long ticks;
++
++ p->stime_ns += ns;
++ if (p->stime_ns >= JIFFY_NS) {
++ ticks = NS_TO_JIFFIES(p->stime_ns);
++ p->stime_ns %= JIFFY_NS;
++ p->stime += (__force u64)TICK_APPROX_NS * ticks;
++ account_group_system_time(p, TICK_APPROX_NS * ticks);
++ }
++ p->sched_time += ns;
++ account_group_exec_runtime(p, ns);
++
++ if (hardirq_count() - hardirq_offset) {
++ rq->irq_ns += ns;
++ if (rq->irq_ns >= JIFFY_NS) {
++ ticks = NS_TO_JIFFIES(rq->irq_ns);
++ cpustat[CPUTIME_IRQ] += (__force u64)TICK_APPROX_NS * ticks;
++ rq->irq_ns %= JIFFY_NS;
++ }
++ } else if (in_serving_softirq()) {
++ rq->softirq_ns += ns;
++ if (rq->softirq_ns >= JIFFY_NS) {
++ ticks = NS_TO_JIFFIES(rq->softirq_ns);
++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks;
++ rq->softirq_ns %= JIFFY_NS;
++ }
++ } else {
++ rq->system_ns += ns;
++ if (rq->system_ns >= JIFFY_NS) {
++ ticks = NS_TO_JIFFIES(rq->system_ns);
++ cpustat[CPUTIME_SYSTEM] += (__force u64)TICK_APPROX_NS * ticks;
++ rq->system_ns %= JIFFY_NS;
++ }
++ }
++ acct_update_integrals(p);
++}
++
++static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns)
++{
++ u64 *cpustat = kcpustat_this_cpu->cpustat;
++ unsigned long ticks;
++
++ p->utime_ns += ns;
++ if (p->utime_ns >= JIFFY_NS) {
++ ticks = NS_TO_JIFFIES(p->utime_ns);
++ p->utime_ns %= JIFFY_NS;
++ p->utime += (__force u64)TICK_APPROX_NS * ticks;
++ account_group_user_time(p, TICK_APPROX_NS * ticks);
++ }
++ p->sched_time += ns;
++ account_group_exec_runtime(p, ns);
++
++ if (this_cpu_ksoftirqd() == p) {
++ /*
++ * ksoftirqd time does not get accounted in cpu_softirq_time.
++ * So, we have to handle it separately here.
++ */
++ rq->softirq_ns += ns;
++ if (rq->softirq_ns >= JIFFY_NS) {
++ ticks = NS_TO_JIFFIES(rq->softirq_ns);
++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks;
++ rq->softirq_ns %= JIFFY_NS;
++ }
++ }
++
++ if (task_nice(p) > 0 || idleprio_task(p)) {
++ rq->nice_ns += ns;
++ if (rq->nice_ns >= JIFFY_NS) {
++ ticks = NS_TO_JIFFIES(rq->nice_ns);
++ cpustat[CPUTIME_NICE] += (__force u64)TICK_APPROX_NS * ticks;
++ rq->nice_ns %= JIFFY_NS;
++ }
++ } else {
++ rq->user_ns += ns;
++ if (rq->user_ns >= JIFFY_NS) {
++ ticks = NS_TO_JIFFIES(rq->user_ns);
++ cpustat[CPUTIME_USER] += (__force u64)TICK_APPROX_NS * ticks;
++ rq->user_ns %= JIFFY_NS;
++ }
++ }
++ acct_update_integrals(p);
++}
++
++/*
++ * This is called on clock ticks.
++ * Bank in p->sched_time the ns elapsed since the last tick or switch.
++ * CPU scheduler quota accounting is also performed here in microseconds.
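++ * Note that niffies and last_ran are in nanoseconds while time_slice is
++ * in microseconds, hence the NS_TO_US conversion below.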
++ */
++static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p)
++{
++ s64 account_ns = rq->niffies - p->last_ran;
++ struct task_struct *idle = rq->idle;
++
++ /* Accurate tick timekeeping */
++ if (user_mode(get_irq_regs()))
++ pc_user_time(rq, p, account_ns);
++ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) {
++ pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns);
++ } else
++ pc_idle_time(rq, idle, account_ns);
++
++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */
++ if (p->policy != SCHED_FIFO && p != idle)
++ p->time_slice -= NS_TO_US(account_ns);
++
++ p->last_ran = rq->niffies;
++}
++
++/*
++ * This is called on context switches.
++ * Bank in p->sched_time the ns elapsed since the last tick or switch.
++ * CPU scheduler quota accounting is also performed here in microseconds.
++ */
++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p)
++{
++ s64 account_ns = rq->niffies - p->last_ran;
++ struct task_struct *idle = rq->idle;
++
++ /* Accurate subtick timekeeping */
++ if (p != idle)
++ pc_user_time(rq, p, account_ns);
++ else
++ pc_idle_time(rq, idle, account_ns);
++
++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */
++ if (p->policy != SCHED_FIFO && p != idle)
++ p->time_slice -= NS_TO_US(account_ns);
++}
++
++/*
++ * Return any ns on the sched_clock that have not yet been accounted in
++ * @p in case that task is currently running.
++ *
++ * Called with task_rq_lock(p) held.
++ */
++static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
++{
++ u64 ns = 0;
++
++ /*
++ * Must be ->curr _and_ ->on_rq. If dequeued, we would
++ * project cycles that may never be accounted to this
++ * thread, breaking clock_gettime().
++ */
++ if (p == rq->curr && task_on_rq_queued(p)) {
++ update_clocks(rq);
++ ns = rq->niffies - p->last_ran;
++ }
++
++ return ns;
++}
++
++/*
++ * Return accounted runtime for the task.
++ * Return separately the current's pending runtime that has not been
++ * accounted yet.
++ */
++unsigned long long task_sched_runtime(struct task_struct *p)
++{
++ unsigned long flags;
++ struct rq *rq;
++ u64 ns;
++
++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
++ /*
++ * 64-bit doesn't need locks to atomically read a 64-bit value.
++ * So we have an optimisation chance when the task's delta_exec is 0.
++ * Reading ->on_cpu is racy, but this is ok.
++ *
++ * If we race with it leaving CPU, we'll take a lock. So we're correct.
++ * If we race with it entering CPU, unaccounted time is 0. This is
++ * indistinguishable from the read occurring a few cycles earlier.
++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has
++ * been accounted, so we're correct here as well.
++ */
++ if (!p->on_cpu || !task_on_rq_queued(p))
++ return tsk_seruntime(p);
++#endif
++
++ rq = task_rq_lock(p, &flags);
++ ns = p->sched_time + do_task_delta_exec(p, rq);
++ task_rq_unlock(rq, p, &flags);
++
++ return ns;
++}
++
++/*
++ * Functions to test for when SCHED_ISO tasks have used their allocated
++ * quota as real time scheduling and convert them back to SCHED_NORMAL. All
++ * data is modified only by the local runqueue during scheduler_tick with
++ * interrupts disabled.
++ */
++
++/*
++ * Test if SCHED_ISO tasks have run longer than their allotted period as RT
++ * tasks and set the refractory flag if necessary. There is 10% hysteresis
++ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a
++ * slow division.
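++ *
++ * For example, with the default sched_iso_cpu of 70, the refractory flag
++ * is set above 70% CPU use over the period and cleared again below ~63%
++ * (70 * 115 / 128).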
++ */
++static inline void iso_tick(struct rq *rq)
++{
++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD;
++ rq->iso_ticks += 100;
++ if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) {
++ rq->iso_refractory = true;
++ if (unlikely(rq->iso_ticks > ISO_PERIOD * 100))
++ rq->iso_ticks = ISO_PERIOD * 100;
++ }
++}
++
++/* No SCHED_ISO task was running so decrease rq->iso_ticks */
++static inline void no_iso_tick(struct rq *rq, int ticks)
++{
++ if (rq->iso_ticks > 0 || rq->iso_refractory) {
++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD;
++ if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) {
++ rq->iso_refractory = false;
++ if (unlikely(rq->iso_ticks < 0))
++ rq->iso_ticks = 0;
++ }
++ }
++}
++
++/* This manages tasks that have run out of timeslice during a scheduler_tick */
++static void task_running_tick(struct rq *rq)
++{
++ struct task_struct *p = rq->curr;
++
++ /*
++ * If a SCHED_ISO task is running we increment the iso_ticks. In
++ * order to prevent SCHED_ISO tasks from causing starvation in the
++ * presence of true RT tasks we account those as iso_ticks as well.
++ */
++ if (rt_task(p) || task_running_iso(p))
++ iso_tick(rq);
++ else
++ no_iso_tick(rq, 1);
++
++ /* SCHED_FIFO tasks never run out of timeslice. */
++ if (p->policy == SCHED_FIFO)
++ return;
++
++ if (iso_task(p)) {
++ if (task_running_iso(p)) {
++ if (rq->iso_refractory) {
++ /*
++ * SCHED_ISO task is running as RT and limit
++ * has been hit. Force it to reschedule as
++ * SCHED_NORMAL by zeroing its time_slice
++ */
++ p->time_slice = 0;
++ }
++ } else if (!rq->iso_refractory) {
++ /* Can now run as ISO again. Reschedule to pick up prio */
++ goto out_resched;
++ }
++ }
++
++ /*
++ * Tasks that were scheduled in the first half of a tick are not
++ * allowed to run into the 2nd half of the next tick if they will
++ * run out of time slice in the interim. Otherwise, if they have
++ * less than RESCHED_US μs of time slice left they will be rescheduled.
++ * Dither is used as a backup for when hrexpiry is disabled or high res
++ * timers are not configured in.
++ */
++ if (p->time_slice - rq->dither >= RESCHED_US)
++ return;
++out_resched:
++ rq_lock(rq);
++ __set_tsk_resched(p);
++ rq_unlock(rq);
++}
++
++static inline void task_tick(struct rq *rq)
++{
++ if (!rq_idle(rq))
++ task_running_tick(rq);
++ else if (rq->last_jiffy > rq->last_scheduler_tick)
++ no_iso_tick(rq, rq->last_jiffy - rq->last_scheduler_tick);
++}
++
++#ifdef CONFIG_NO_HZ_FULL
++/*
++ * We can stop the timer tick any time highres timers are active since
++ * we rely entirely on highres timeouts for task expiry rescheduling.
++ */
++static void sched_stop_tick(struct rq *rq, int cpu)
++{
++ if (!hrexpiry_enabled(rq))
++ return;
++ if (!tick_nohz_full_enabled())
++ return;
++ if (!tick_nohz_full_cpu(cpu))
++ return;
++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
++}
++
++static inline void sched_start_tick(struct rq *rq, int cpu)
++{
++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
++}
++
++struct tick_work {
++ int cpu;
++ struct delayed_work work;
++};
++
++static struct tick_work __percpu *tick_work_cpu;
++
++static void sched_tick_remote(struct work_struct *work)
++{
++ struct delayed_work *dwork = to_delayed_work(work);
++ struct tick_work *twork = container_of(dwork, struct tick_work, work);
++ int cpu = twork->cpu;
++ struct rq *rq = cpu_rq(cpu);
++ struct task_struct *curr;
++ u64 delta;
++
++ /*
++ * Handle the tick only if it appears the remote CPU is running in full
++ * dynticks mode.
The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ rq_lock_irq(rq); ++ curr = rq->curr; ++ if (is_idle_task(curr)) ++ goto out_unlock; ++ ++ update_rq_clock(rq); ++ delta = rq_clock_task(rq) - curr->last_ran; ++ ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ task_tick(rq); ++ ++out_unlock: ++ rq_unlock_irq(rq); ++ ++out_requeue: ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. ++ */ ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ cancel_delayed_work_sync(&twork->work); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_stop_tick(struct rq *rq, int cpu) {} ++static inline void sched_start_tick(struct rq *rq, int cpu) {} ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. ++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ sched_clock_tick(); ++ update_clocks(rq); ++ update_load_avg(rq, 0); ++ update_cpu_clock_tick(rq, rq->curr); ++ task_tick(rq); ++ rq->last_scheduler_tick = rq->last_jiffy; ++ rq->last_tick = rq->clock; ++ perf_event_task_tick(); ++ sched_stop_tick(rq, cpu); ++} ++ ++#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? 
++ */
++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
++ PREEMPT_MASK - 10);
++#endif
++ preempt_latency_start(val);
++}
++EXPORT_SYMBOL(preempt_count_add);
++NOKPROBE_SYMBOL(preempt_count_add);
++
++/*
++ * If the value passed in equals the current preempt count
++ * then we just enabled preemption. Stop timing the latency.
++ */
++static inline void preempt_latency_stop(int val)
++{
++ if (preempt_count() == val)
++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
++}
++
++void preempt_count_sub(int val)
++{
++#ifdef CONFIG_DEBUG_PREEMPT
++ /*
++ * Underflow?
++ */
++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
++ return;
++ /*
++ * Is the spinlock portion underflowing?
++ */
++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
++ !(preempt_count() & PREEMPT_MASK)))
++ return;
++#endif
++
++ preempt_latency_stop(val);
++ __preempt_count_sub(val);
++}
++EXPORT_SYMBOL(preempt_count_sub);
++NOKPROBE_SYMBOL(preempt_count_sub);
++
++#else
++static inline void preempt_latency_start(int val) { }
++static inline void preempt_latency_stop(int val) { }
++#endif
++
++static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
++{
++#ifdef CONFIG_DEBUG_PREEMPT
++ return p->preempt_disable_ip;
++#else
++ return 0;
++#endif
++}
++
++/*
++ * The time_slice is only refilled when it is empty and that is when we set a
++ * new deadline. Make sure update_clocks has been called recently to update
++ * rq->niffies.
++ */
++static void time_slice_expired(struct task_struct *p, struct rq *rq)
++{
++ p->time_slice = timeslice();
++ p->deadline = rq->niffies + task_deadline_diff(p);
++#ifdef CONFIG_SMT_NICE
++ if (!p->mm)
++ p->smt_bias = 0;
++ else if (rt_task(p))
++ p->smt_bias = 1 << 30;
++ else if (task_running_iso(p))
++ p->smt_bias = 1 << 29;
++ else if (idleprio_task(p)) {
++ if (task_running_idle(p))
++ p->smt_bias = 0;
++ else
++ p->smt_bias = 1;
++ } else if (--p->smt_bias < 1)
++ p->smt_bias = MAX_PRIO - p->static_prio;
++#endif
++}
++
++/*
++ * Timeslices below RESCHED_US are considered as good as expired as there's no
++ * point rescheduling when there's so little time left. SCHED_BATCH tasks
++ * have been flagged as not latency sensitive and likely to be fully CPU
++ * bound so every time they're rescheduled they have their time_slice
++ * refilled, but get a new later deadline to have little effect on
++ * SCHED_NORMAL tasks.
++ */
++static inline void check_deadline(struct task_struct *p, struct rq *rq)
++{
++ if (p->time_slice < RESCHED_US || batch_task(p))
++ time_slice_expired(p, rq);
++}
++
++/*
++ * Task selection with skiplists is a simple matter of picking off the first
++ * task in the sorted list, an O(1) operation. The overall lookup is amortised
++ * O(1), being bound by the number of processors.
++ *
++ * Runqueues are selectively locked based on their unlocked data and then
++ * unlocked if not needed. At most 3 locks will be held at any time and are
++ * released as soon as they're no longer needed. All balancing between CPUs
++ * is thus done here in an extremely simple first-come, best-fit manner.
++ *
++ * This iterates over runqueues in cache locality order. In interactive mode
++ * it iterates over all CPUs and finds the task with the best key/deadline.
++ * In non-interactive mode it will only take a task if it's from the current
++ * runqueue or a runqueue with more tasks than the current one with a better
++ * key/deadline.
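++ *
++ * The chosen task is dequeued from its runqueue via take_task() before
++ * this returns, unless the idle task was chosen.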
++ */
++#ifdef CONFIG_SMP
++static inline struct task_struct
++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
++{
++	struct rq *locked = NULL, *chosen = NULL;
++	struct task_struct *edt = idle;
++	int i, best_entries = 0;
++	u64 best_key = ~0ULL;
++
++	for (i = 0; i < total_runqueues; i++) {
++		struct rq *other_rq = rq_order(rq, i);
++		skiplist_node *next;
++		int entries;
++
++		entries = other_rq->sl->entries;
++		/*
++		 * Check for queued entries lockless first. The local runqueue
++		 * is locked so entries will always be accurate.
++		 */
++		if (!sched_interactive) {
++			/*
++			 * Don't reschedule balance across nodes unless the CPU
++			 * is idle.
++			 */
++			if (edt != idle && rq->cpu_locality[other_rq->cpu] > 3)
++				break;
++			if (entries <= best_entries)
++				continue;
++		} else if (!entries)
++			continue;
++
++		/* if (i) implies other_rq != rq */
++		if (i) {
++			/* Check for best key queued lockless first */
++			if (other_rq->best_key >= best_key)
++				continue;
++
++			if (unlikely(!trylock_rq(rq, other_rq)))
++				continue;
++
++			/* Need to reevaluate entries after locking */
++			entries = other_rq->sl->entries;
++			if (unlikely(!entries)) {
++				unlock_rq(other_rq);
++				continue;
++			}
++		}
++
++		next = other_rq->node;
++		/*
++		 * In interactive mode we check beyond the best entry on other
++		 * runqueues if we can't get the best for smt or affinity
++		 * reasons.
++		 */
++		while ((next = next->next[0]) != other_rq->node) {
++			struct task_struct *p;
++			u64 key = next->key;
++
++			/* Reevaluate key after locking */
++			if (key >= best_key)
++				break;
++
++			p = next->value;
++			if (!smt_schedule(p, rq)) {
++				if (i && !sched_interactive)
++					break;
++				continue;
++			}
++
++			if (sched_other_cpu(p, cpu)) {
++				if (sched_interactive || !i)
++					continue;
++				break;
++			}
++			/* Make sure affinity is ok */
++			if (i) {
++				/* From this point on p is the best so far */
++				if (locked)
++					unlock_rq(locked);
++				chosen = locked = other_rq;
++			}
++			best_entries = entries;
++			best_key = key;
++			edt = p;
++			break;
++		}
++		/* rq->preempting is a hint only as the state may have changed
++		 * since it was set with the resched call but if we have met
++		 * the condition we can break out here.
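++		 * At worst a stale hint ends the scan early with the
++		 * current best-so-far pick, which is still valid.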
*/ ++ if (edt == rq->preempting) ++ break; ++ if (i && other_rq != chosen) ++ unlock_rq(other_rq); ++ } ++ ++ if (likely(edt != idle)) ++ take_task(rq, cpu, edt); ++ ++ if (locked) ++ unlock_rq(locked); ++ ++ rq->preempting = NULL; ++ ++ return edt; ++} ++#else /* CONFIG_SMP */ ++static inline struct task_struct ++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ struct task_struct *edt; ++ ++ if (unlikely(!rq->sl->entries)) ++ return idle; ++ edt = rq->node->next[0]->value; ++ take_task(rq, cpu, edt); ++ return edt; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++/* ++ * The currently running task's information is all stored in rq local data ++ * which is only modified by the local CPU. ++ */ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ if (p == rq->idle || p->policy == SCHED_FIFO) ++ hrexpiry_clear(rq); ++ else ++ hrexpiry_start(rq, US_TO_NS(p->time_slice)); ++ if (rq->clock - rq->last_tick > HALF_JIFFY_NS) ++ rq->dither = 0; ++ else ++ rq->dither = rq_dither(rq); ++ ++ rq->rq_deadline = p->deadline; ++ rq->rq_prio = p->prio; ++#ifdef CONFIG_SMT_NICE ++ rq->rq_mm = p->mm; ++ rq->rq_smt_bias = p->smt_bias; ++#endif ++} ++ ++#ifdef CONFIG_SMT_NICE ++static void check_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings; ++static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings; ++ ++/* Iterate over smt siblings when we've scheduled a process on cpu and decide ++ * whether they should continue running or be descheduled. 
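++ * A sibling whose current task fails smt_schedule() (e.g. it is out-biased
++ * by a realtime task on this core) is sent a resched to deschedule it.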
*/ ++static void check_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct task_struct *p; ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ continue; ++ p = rq->curr; ++ if (!smt_schedule(p, this_rq)) ++ resched_curr(rq); ++ } ++} ++ ++static void wake_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ resched_idle(rq); ++ } ++} ++#else ++static void check_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_siblings(struct rq __maybe_unused *this_rq) {} ++#endif ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. ++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPT=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! ++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next, *idle; ++ unsigned long *switch_count; ++ bool deactivate = false; ++ struct rq *rq; ++ u64 niffies; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ idle = rq->idle; ++ ++ schedule_debug(prev); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(). ++ * ++ * The membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ rq_lock(rq); ++ smp_mb__after_spinlock(); ++#ifdef CONFIG_SMP ++ if (rq->preempt) { ++ /* ++ * Make sure resched_curr hasn't triggered a preemption ++ * locklessly on a task that has since scheduled away. Spurious ++ * wakeup of idle is okay though. 
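++		 * In that case just clear the hint and bail out without
++		 * switching.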
++ */ ++ if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) { ++ rq->preempt = NULL; ++ clear_preempt_need_resched(); ++ rq_unlock_irq(rq); ++ return; ++ } ++ rq->preempt = NULL; ++ } ++#endif ++ ++ switch_count = &prev->nivcsw; ++ if (!preempt && prev->state) { ++ if (unlikely(signal_pending_state(prev->state, prev))) { ++ prev->state = TASK_RUNNING; ++ } else { ++ deactivate = true; ++ prev->on_rq = 0; ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ ++ /* ++ * If a worker is going to sleep, notify and ++ * ask workqueue whether it wants to wake up a ++ * task to maintain concurrency. If so, wake ++ * up the task. ++ */ ++ if (prev->flags & PF_WQ_WORKER) { ++ struct task_struct *to_wakeup; ++ ++ to_wakeup = wq_worker_sleeping(prev); ++ if (to_wakeup) ++ try_to_wake_up_local(to_wakeup); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ /* ++ * Store the niffy value here for use by the next task's last_ran ++ * below to avoid losing niffies due to update_clocks being called ++ * again after this point. ++ */ ++ update_clocks(rq); ++ niffies = rq->niffies; ++ update_cpu_clock_switch(rq, prev); ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ if (idle != prev) { ++ check_deadline(prev, rq); ++ return_task(prev, rq, cpu, deactivate); ++ } ++ ++ next = earliest_deadline_task(rq, cpu, idle); ++ if (likely(next->prio != PRIO_LIMIT)) ++ clear_cpuidle_map(cpu); ++ else { ++ set_cpuidle_map(cpu); ++ update_load_avg(rq, 0); ++ } ++ ++ set_rq_task(rq, next); ++ next->last_ran = niffies; ++ ++ if (likely(prev != next)) { ++ /* ++ * Don't reschedule an idle task or deactivated tasks ++ */ ++ if (prev == idle) { ++ rq->nr_running++; ++ if (rt_task(next)) ++ rq->rt_nr_running++; ++ } else if (!deactivate) ++ resched_suitable_idle(prev); ++ if (unlikely(next == idle)) { ++ rq->nr_running--; ++ if (rt_task(prev)) ++ rq->rt_nr_running--; ++ wake_siblings(rq); ++ } else ++ check_siblings(rq); ++ rq->nr_switches++; ++ rq->curr = next; ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. ++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ ++ trace_sched_switch(preempt, prev, next); ++ context_switch(rq, prev, next); /* unlocks the rq */ ++ } else { ++ check_siblings(rq); ++ rq_unlock(rq); ++ do_pending_softirq(rq, next); ++ local_irq_enable(); ++ } ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(). */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ __schedule(false); ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ if (!tsk->state || tsk_is_pi_blocked(tsk) || ++ preempt_count() || ++ signal_pending_state(tsk->state, tsk)) ++ return; ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. 
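++	 * (e.g. sleeping on a page lock that only our own, still plugged,
++	 * writeback would release).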
++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++} ++ ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. ++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. Do it here manually until ++ * we find a better solution. ++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != IN_USER, but that will trigger ++ * too frequently to make sense yet. ++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPT ++/* ++ * this is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. Kernel preemptions off return from interrupt ++ * occur there and call schedule directly. 
++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. ++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPT */ ++ ++/* ++ * this is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. 
++ */
++asmlinkage __visible void __sched preempt_schedule_irq(void)
++{
++	enum ctx_state prev_state;
++
++	/* Catch callers which need to be fixed */
++	BUG_ON(preempt_count() || !irqs_disabled());
++
++	prev_state = exception_enter();
++
++	do {
++		preempt_disable();
++		local_irq_enable();
++		__schedule(true);
++		local_irq_disable();
++		sched_preempt_enable_no_resched();
++	} while (need_resched());
++
++	exception_exit(prev_state);
++}
++
++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
++			  void *key)
++{
++	return try_to_wake_up(curr->private, mode, wake_flags);
++}
++EXPORT_SYMBOL(default_wake_function);
++
++#ifdef CONFIG_RT_MUTEXES
++
++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
++{
++	if (pi_task)
++		prio = min(prio, pi_task->prio);
++
++	return prio;
++}
++
++static inline int rt_effective_prio(struct task_struct *p, int prio)
++{
++	struct task_struct *pi_task = rt_mutex_get_top_task(p);
++
++	return __rt_effective_prio(pi_task, prio);
++}
++
++/*
++ * rt_mutex_setprio - set the current priority of a task
++ * @p: task to boost
++ * @pi_task: donor task
++ *
++ * This function changes the 'effective' priority of a task. It does
++ * not touch ->normal_prio like __setscheduler().
++ *
++ * Used by the rt_mutex code to implement priority inheritance
++ * logic. Call site only calls if the priority of the task changed.
++ */
++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
++{
++	int prio, oldprio;
++	struct rq *rq;
++
++	/* XXX used to be waiter->prio, not waiter->task->prio */
++	prio = __rt_effective_prio(pi_task, p->normal_prio);
++
++	/*
++	 * If nothing changed; bail early.
++	 */
++	if (p->pi_top_task == pi_task && prio == p->prio)
++		return;
++
++	rq = __task_rq_lock(p);
++	update_rq_clock(rq);
++	/*
++	 * Set under pi_lock && rq->lock, such that the value can be used under
++	 * either lock.
++	 *
++	 * Note that a lot of trickiness is needed to make this pointer cache
++	 * work right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together
++	 * to ensure a task is de-boosted (pi_task is set to NULL) before the
++	 * task is allowed to run again (and can exit). This ensures the pointer
++	 * points to a blocked task -- which guarantees the task is present.
++	 */
++	p->pi_top_task = pi_task;
++
++	/*
++	 * For FIFO/RR we only need to set prio, if that matches we're done.
++	 */
++	if (prio == p->prio)
++		goto out_unlock;
++
++	/*
++	 * Idle task boosting is a no-no in general. There is one
++	 * exception, when PREEMPT_RT and NOHZ is active:
++	 *
++	 * The idle task calls get_next_timer_interrupt() and holds
++	 * the timer wheel base->lock on the CPU and another CPU wants
++	 * to access the timer (probably to cancel it). We can safely
++	 * ignore the boosting request, as the idle CPU runs this code
++	 * with interrupts disabled and will complete the lock
++	 * protected section without being interrupted. So there is no
++	 * real need to boost.
++	 */
++	if (unlikely(p == rq->idle)) {
++		WARN_ON(p != rq->curr);
++		WARN_ON(p->pi_blocked_on);
++		goto out_unlock;
++	}
++
++	trace_sched_pi_setprio(p, pi_task);
++	oldprio = p->prio;
++	p->prio = prio;
++	if (task_running(rq, p)) {
++		if (prio > oldprio)
++			resched_task(p);
++	} else if (task_queued(p)) {
++		dequeue_task(rq, p, DEQUEUE_SAVE);
++		enqueue_task(rq, p, ENQUEUE_RESTORE);
++		if (prio < oldprio)
++			try_preempt(p, rq);
++	}
++out_unlock:
++	__task_rq_unlock(rq);
++}
++#else
++static inline int rt_effective_prio(struct task_struct *p, int prio)
++{
++	return prio;
++}
++#endif
++
++/*
++ * Adjust the deadline for when the priority is to change, before it's
++ * changed.
++ */
++static inline void adjust_deadline(struct task_struct *p, int new_prio)
++{
++	p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p);
++}
++
++void set_user_nice(struct task_struct *p, long nice)
++{
++	int new_static, old_static;
++	unsigned long flags;
++	struct rq *rq;
++
++	if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
++		return;
++	new_static = NICE_TO_PRIO(nice);
++	/*
++	 * We have to be careful, if called from sys_setpriority(),
++	 * the task might be in the middle of scheduling on another CPU.
++	 */
++	rq = task_rq_lock(p, &flags);
++	update_rq_clock(rq);
++
++	/*
++	 * The RT priorities are set via sched_setscheduler(), but we still
++	 * allow the 'normal' nice value to be set - but as expected
++	 * it won't have any effect on scheduling while the task is
++	 * not SCHED_NORMAL/SCHED_BATCH:
++	 */
++	if (has_rt_policy(p)) {
++		p->static_prio = new_static;
++		goto out_unlock;
++	}
++
++	adjust_deadline(p, new_static);
++	old_static = p->static_prio;
++	p->static_prio = new_static;
++	p->prio = effective_prio(p);
++
++	if (task_queued(p)) {
++		dequeue_task(rq, p, DEQUEUE_SAVE);
++		enqueue_task(rq, p, ENQUEUE_RESTORE);
++		if (new_static < old_static)
++			try_preempt(p, rq);
++	} else if (task_running(rq, p)) {
++		set_rq_task(rq, p);
++		if (old_static < new_static)
++			resched_task(p);
++	}
++out_unlock:
++	task_rq_unlock(rq, p, &flags);
++}
++EXPORT_SYMBOL(set_user_nice);
++
++/*
++ * can_nice - check if a task can reduce its nice value
++ * @p: task
++ * @nice: nice value
++ */
++int can_nice(const struct task_struct *p, const int nice)
++{
++	/* Convert nice value [19,-20] to rlimit style value [1,40] */
++	int nice_rlim = nice_to_rlimit(nice);
++
++	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
++		capable(CAP_SYS_NICE));
++}
++
++#ifdef __ARCH_WANT_SYS_NICE
++
++/*
++ * sys_nice - change the priority of the current process.
++ * @increment: priority increment
++ *
++ * sys_setpriority is a more generic, but much slower function that
++ * does similar things.
++ */
++SYSCALL_DEFINE1(nice, int, increment)
++{
++	long nice, retval;
++
++	/*
++	 * Setpriority might change our priority at the same moment.
++	 * We don't have to worry. Conceptually one call occurs first
++	 * and we have a single winner.
++	 */
++
++	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
++	nice = task_nice(current) + increment;
++
++	nice = clamp_val(nice, MIN_NICE, MAX_NICE);
++	if (increment < 0 && !can_nice(current, nice))
++		return -EPERM;
++
++	retval = security_task_setnice(current, nice);
++	if (retval)
++		return retval;
++
++	set_user_nice(current, nice);
++	return 0;
++}
++
++#endif
++
++/**
++ * task_prio - return the priority value of a given task.
++ * @p: the task in question.
++ *
++ * Return: The priority value as seen by users in /proc.
++ * RT tasks are offset by -100.
Normal tasks are centered around 1, value goes ++ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ int delta, prio = p->prio - MAX_RT_PRIO; ++ ++ /* rt tasks and iso tasks */ ++ if (prio <= 0) ++ goto out; ++ ++ /* Convert to ms to avoid overflows */ ++ delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); ++ if (unlikely(delta < 0)) ++ delta = 0; ++ delta = delta * 40 / ms_longest_deadline_diff(); ++ if (delta <= 80) ++ prio += delta; ++ if (idleprio_task(p)) ++ prio += 40; ++out: ++ return prio; ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ return cpu_curr(cpu) == cpu_rq(cpu)->idle; ++} ++ ++/** ++ * available_idle_cpu - is a given CPU idle for enqueuing work. ++ * @cpu: the CPU in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int available_idle_cpu(int cpu) ++{ ++ if (!idle_cpu(cpu)) ++ return 0; ++ ++ if (vcpu_is_preempted(cpu)) ++ return 0; ++ ++ return 1; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the CPU @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, ++ int prio, bool keep_boost) ++{ ++ int oldrtprio, oldprio; ++ ++ p->policy = policy; ++ oldrtprio = p->rt_priority; ++ p->rt_priority = prio; ++ p->normal_prio = normal_prio(p); ++ oldprio = p->prio; ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). 
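++	 * i.e. re-apply any rt_mutex PI boost on top of the new normal_prio.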
++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++ ++ if (task_running(rq, p)) { ++ set_rq_task(rq, p); ++ resched_task(p); ++ } else if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (p->prio < oldprio || p->rt_priority > oldrtprio) ++ try_preempt(p, rq); ++ } ++} ++ ++/* ++ * Check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int __sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, ++ bool user, bool pi) ++{ ++ int retval, policy = attr->sched_policy, oldpolicy = -1, priority = attr->sched_priority; ++ unsigned long flags, rlim_rtprio = 0; ++ int reset_on_fork; ++ struct rq *rq; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { ++ unsigned long lflags; ++ ++ if (!lock_task_sighand(p, &lflags)) ++ return -ESRCH; ++ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); ++ unlock_task_sighand(p, &lflags); ++ if (rlim_rtprio) ++ goto recheck; ++ /* ++ * If the caller requested an RT policy without having the ++ * necessary rights, we downgrade the policy to SCHED_ISO. ++ * We also set the parameter to zero to pass the checks. ++ */ ++ policy = SCHED_ISO; ++ priority = 0; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); ++ policy &= ~SCHED_RESET_ON_FORK; ++ ++ if (!SCHED_RANGE(policy)) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH is 0. 
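++	 * SCHED_ISO and SCHED_IDLEPRIO likewise only accept priority 0.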
++	 */
++	if (priority < 0 ||
++	    (p->mm && priority > MAX_USER_RT_PRIO - 1) ||
++	    (!p->mm && priority > MAX_RT_PRIO - 1))
++		return -EINVAL;
++	if (is_rt_policy(policy) != (priority != 0))
++		return -EINVAL;
++
++	/*
++	 * Allow unprivileged RT tasks to decrease priority:
++	 */
++	if (user && !capable(CAP_SYS_NICE)) {
++		if (is_rt_policy(policy)) {
++			unsigned long rlim_rtprio =
++				task_rlimit(p, RLIMIT_RTPRIO);
++
++			/* Can't set/change the rt policy */
++			if (policy != p->policy && !rlim_rtprio)
++				return -EPERM;
++
++			/* Can't increase priority */
++			if (priority > p->rt_priority &&
++			    priority > rlim_rtprio)
++				return -EPERM;
++		} else {
++			switch (p->policy) {
++			/*
++			 * Can only downgrade policies but not back to
++			 * SCHED_NORMAL
++			 */
++			case SCHED_ISO:
++				if (policy == SCHED_ISO)
++					goto out;
++				if (policy != SCHED_NORMAL)
++					return -EPERM;
++				break;
++			case SCHED_BATCH:
++				if (policy == SCHED_BATCH)
++					goto out;
++				if (policy != SCHED_IDLEPRIO)
++					return -EPERM;
++				break;
++			case SCHED_IDLEPRIO:
++				if (policy == SCHED_IDLEPRIO)
++					goto out;
++				return -EPERM;
++			default:
++				break;
++			}
++		}
++
++		/* Can't change other user's priorities */
++		if (!check_same_owner(p))
++			return -EPERM;
++
++		/* Normal users shall not reset the sched_reset_on_fork flag: */
++		if (p->sched_reset_on_fork && !reset_on_fork)
++			return -EPERM;
++	}
++
++	if (user) {
++		retval = security_task_setscheduler(p);
++		if (retval)
++			return retval;
++	}
++
++	/*
++	 * Make sure no PI-waiters arrive (or leave) while we are
++	 * changing the priority of the task:
++	 *
++	 * To be able to change p->policy safely, the runqueue lock must be
++	 * held.
++	 */
++	rq = task_rq_lock(p, &flags);
++	update_rq_clock(rq);
++
++	/*
++	 * Changing the policy of the stop threads is a very bad idea:
++	 */
++	if (p == rq->stop) {
++		task_rq_unlock(rq, p, &flags);
++		return -EINVAL;
++	}
++
++	/*
++	 * If not changing anything there's no need to proceed further:
++	 */
++	if (unlikely(policy == p->policy && (!is_rt_policy(policy) ||
++	    priority == p->rt_priority))) {
++		task_rq_unlock(rq, p, &flags);
++		return 0;
++	}
++
++	/* Re-check policy now with rq lock held */
++	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
++		policy = oldpolicy = -1;
++		task_rq_unlock(rq, p, &flags);
++		goto recheck;
++	}
++	p->sched_reset_on_fork = reset_on_fork;
++
++	__setscheduler(p, rq, policy, priority, pi);
++	task_rq_unlock(rq, p, &flags);
++
++	if (pi)
++		rt_mutex_adjust_pi(p);
++out:
++	return 0;
++}
++
++static int _sched_setscheduler(struct task_struct *p, int policy,
++			       const struct sched_param *param, bool check)
++{
++	struct sched_attr attr = {
++		.sched_policy   = policy,
++		.sched_priority = param->sched_priority,
++		.sched_nice     = PRIO_TO_NICE(p->static_prio),
++	};
++
++	return __sched_setscheduler(p, &attr, check, true);
++}
++/**
++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
++ * @p: the task in question.
++ * @policy: new policy.
++ * @param: structure containing the new RT priority.
++ *
++ * Return: 0 on success. An error code otherwise.
++ *
++ * NOTE that the task may be already dead.
++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++EXPORT_SYMBOL_GPL(sched_setscheduler); ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++EXPORT_SYMBOL_GPL(sched_setattr); ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (p != NULL) ++ retval = sched_setscheduler(p, policy, &lparam); ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). ++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, ++ struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) ++ return -EFAULT; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* Bail out on silly large: */ ++ if (size > PAGE_SIZE) ++ goto err_size; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0) ++ goto err_size; ++ ++ /* ++ * If we're handed a bigger struct than we know of, ++ * ensure all the unknown bits are 0 - i.e. new ++ * user-space does not rely on any kernel feature ++ * extensions we dont know about yet. ++ */ ++ if (size > sizeof(*attr)) { ++ unsigned char __user *addr; ++ unsigned char __user *end; ++ unsigned char val; ++ ++ addr = (void __user *)uattr + sizeof(*attr); ++ end = (void __user *)uattr + size; ++ ++ for (; addr < end; addr++) { ++ ret = get_user(val, addr); ++ if (ret) ++ return ret; ++ if (val) ++ goto err_size; ++ } ++ size = sizeof(*attr); ++ } ++ ++ ret = copy_from_user(attr, uattr, size); ++ if (ret) ++ return -EFAULT; ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? 
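++	 * For now we silently clamp the value, as setpriority() does.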
++	 */
++	attr->sched_nice = clamp(attr->sched_nice, -20, 19);
++
++	/* sched/core.c uses zero here but we already know ret is zero */
++	return 0;
++
++err_size:
++	put_user(sizeof(*attr), &uattr->size);
++	return -E2BIG;
++}
++
++/*
++ * sched_setparam() passes in -1 for its policy, to let the functions
++ * it calls know not to change it.
++ */
++#define SETPARAM_POLICY -1
++
++/**
++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
++ * @pid: the pid in question.
++ * @policy: new policy.
++ * @param: structure containing the new RT priority.
++ *
++ * Return: 0 on success. An error code otherwise.
++ */
++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
++{
++	if (policy < 0)
++		return -EINVAL;
++
++	return do_sched_setscheduler(pid, policy, param);
++}
++
++/**
++ * sys_sched_setparam - set/change the RT priority of a thread
++ * @pid: the pid in question.
++ * @param: structure containing the new RT priority.
++ *
++ * Return: 0 on success. An error code otherwise.
++ */
++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
++{
++	return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
++}
++
++/**
++ * sys_sched_setattr - same as above, but with extended sched_attr
++ * @pid: the pid in question.
++ * @uattr: structure containing the extended parameters.
++ * @flags: for future extension.
++ */
++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
++			       unsigned int, flags)
++{
++	struct sched_attr attr;
++	struct task_struct *p;
++	int retval;
++
++	if (!uattr || pid < 0 || flags)
++		return -EINVAL;
++
++	retval = sched_copy_attr(uattr, &attr);
++	if (retval)
++		return retval;
++
++	if ((int)attr.sched_policy < 0)
++		return -EINVAL;
++
++	rcu_read_lock();
++	retval = -ESRCH;
++	p = find_process_by_pid(pid);
++	if (p != NULL)
++		retval = sched_setattr(p, &attr);
++	rcu_read_unlock();
++
++	return retval;
++}
++
++/**
++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
++ * @pid: the pid in question.
++ *
++ * Return: On success, the policy of the thread. Otherwise, a negative error
++ * code.
++ */
++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
++{
++	struct task_struct *p;
++	int retval = -EINVAL;
++
++	if (pid < 0)
++		goto out_nounlock;
++
++	retval = -ESRCH;
++	rcu_read_lock();
++	p = find_process_by_pid(pid);
++	if (p) {
++		retval = security_task_getscheduler(p);
++		if (!retval)
++			retval = p->policy;
++	}
++	rcu_read_unlock();
++
++out_nounlock:
++	return retval;
++}
++
++/**
++ * sys_sched_getparam - get the RT priority of a thread
++ * @pid: the pid in question.
++ * @param: structure containing the RT priority.
++ *
++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
++ * code.
++ */
++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
++{
++	struct sched_param lp = { .sched_priority = 0 };
++	struct task_struct *p;
++	int retval = -EINVAL;
++
++	if (!param || pid < 0)
++		goto out_nounlock;
++
++	rcu_read_lock();
++	p = find_process_by_pid(pid);
++	retval = -ESRCH;
++	if (!p)
++		goto out_unlock;
++
++	retval = security_task_getscheduler(p);
++	if (retval)
++		goto out_unlock;
++
++	if (has_rt_policy(p))
++		lp.sched_priority = p->rt_priority;
++	rcu_read_unlock();
++
++	/*
++	 * This one might sleep, we cannot do it with a spinlock held ...
++	 */
++	retval = copy_to_user(param, &lp, sizeof(*param)) ?
-EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++static int sched_read_attr(struct sched_attr __user *uattr, ++ struct sched_attr *attr, ++ unsigned int usize) ++{ ++ int ret; ++ ++ if (!access_ok(VERIFY_WRITE, uattr, usize)) ++ return -EFAULT; ++ ++ /* ++ * If we're handed a smaller struct than we know of, ++ * ensure all the unknown bits are 0 - i.e. old ++ * user-space does not get uncomplete information. ++ */ ++ if (usize < sizeof(*attr)) { ++ unsigned char *addr; ++ unsigned char *end; ++ ++ addr = (void *)attr + usize; ++ end = (void *)attr + sizeof(*attr); ++ ++ for (; addr < end; addr++) { ++ if (*addr) ++ return -EFBIG; ++ } ++ ++ attr->size = usize; ++ } ++ ++ ret = copy_to_user(uattr, attr, attr->size); ++ if (ret) ++ return -EFAULT; ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return ret; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ * @size: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. ++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, size, unsigned int, flags) ++{ ++ struct sched_attr attr = { ++ .size = sizeof(struct sched_attr), ++ }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || size > PAGE_SIZE || ++ size < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ attr.sched_policy = p->policy; ++ if (rt_task(p)) ++ attr.sched_priority = p->rt_priority; ++ else ++ attr.sched_nice = task_nice(p); ++ ++ rcu_read_unlock(); ++ ++ retval = sched_read_attr(uattr, &attr, size); ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_allowed, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ cpumask_and(new_mask, in_mask, cpus_allowed); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_allowed); ++ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. 
Just reset the cpus_allowed to the ++ * cpuset's cpus_allowed ++ */ ++ cpumask_copy(new_mask, cpus_allowed); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_allowed); ++out_put_task: ++ put_task_struct(p); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ cpumask_t *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ unsigned long flags; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ put_online_cpus(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min(len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. 
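++ *
++ * The sched_yield_type tunable selects the behaviour: 0 ignores the
++ * call, 1 just reschedules, 2 also expires the current time slice.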
++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ ++ if (!sched_yield_type) ++ return; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ rq_lock(rq); ++ ++ if (sched_yield_type > 1) ++ time_slice_expired(current, rq); ++ schedstat_inc(rq->yld_count); ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ rq_unlock(rq); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPT ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPT. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). ++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ struct task_struct *rq_p; ++ struct rq *rq, *p_rq; ++ unsigned long flags; ++ int yielded = 0; ++ ++ local_irq_save(flags); ++ rq = this_rq(); ++ ++again: ++ p_rq = task_rq(p); ++ /* ++ * If we're the only runnable task on the rq and target rq also ++ * has only one task, there's absolutely no point in yielding. 
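++	 * We also bail out if the target is already running or not runnable.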
++ */ ++ if (task_running(p_rq, p) || p->state) { ++ yielded = -ESRCH; ++ goto out_irq; ++ } ++ ++ double_rq_lock(rq, p_rq); ++ if (unlikely(task_rq(p) != p_rq)) { ++ double_rq_unlock(rq, p_rq); ++ goto again; ++ } ++ ++ yielded = 1; ++ schedstat_inc(rq->yld_count); ++ rq_p = rq->curr; ++ if (p->deadline > rq_p->deadline) ++ p->deadline = rq_p->deadline; ++ p->time_slice += rq_p->time_slice; ++ if (p->time_slice > timeslice()) ++ p->time_slice = timeslice(); ++ time_slice_expired(rq_p, rq); ++ if (preempt && rq != p_rq) ++ resched_task(p_rq->curr); ++ double_rq_unlock(rq, p_rq); ++out_irq: ++ local_irq_restore(flags); ++ ++ if (yielded > 0) ++ schedule(); ++ return yielded; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ unsigned int time_slice; ++ unsigned long flags; ++ struct rq *rq; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ rq = task_rq_lock(p, &flags); ++ time_slice = p->policy == SCHED_FIFO ? 
0 : MS_TO_NS(task_timeslice(p)); ++ task_rq_unlock(rq, p, &flags); ++ ++ rcu_read_unlock(); ++ *t = ns_to_timespec64(time_slice); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * this syscall writes the default timeslice value of a given process ++ * into the user-space timespec buffer. A value of '0' means infinity. ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT ++COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, ++ compat_pid_t, pid, ++ struct compat_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = compat_put_timespec64(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). ++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. 
++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++#ifdef CONFIG_SMP ++void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_allowed, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ cpumask_copy(&p->cpus_allowed, new_mask); ++ ++ if (task_queued(p)) { ++ /* ++ * Because __kthread_bind() calls this on blocked tasks without ++ * holding rq->lock. ++ */ ++ lockdep_assert_held(rq->lock); ++ } ++} ++ ++/* ++ * Calling do_set_cpus_allowed from outside the scheduler code should not be ++ * called on a running or queued task. We should be holding pi_lock. ++ */ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ __do_set_cpus_allowed(p, new_mask); ++ if (needs_other_cpu(p, task_cpu(p))) { ++ struct rq *rq; ++ ++ rq = __task_rq_lock(p); ++ set_task_cpu(p, valid_task_cpu(p)); ++ resched_task(p); ++ __task_rq_unlock(rq); ++ } ++} ++#endif ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: cpu the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(rq->lock); ++ idle->last_ran = rq->niffies; ++ time_slice_expired(idle, rq); ++ idle->state = TASK_RUNNING; ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = PRIO_LIMIT; ++ ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#ifdef CONFIG_SMT_NICE ++ idle->smt_bias = 0; ++#endif ++#endif ++ set_rq_task(rq, idle); ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->curr = rq->idle = idle; ++ idle->on_rq = TASK_ON_RQ_QUEUED; ++ raw_spin_unlock(rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! */ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. 
Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_allowed may be changed. ++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ rq_lock_irqsave(rq, &flags); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(rq); ++ rq_unlock_irqrestore(rq, &flags); ++} ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_NO_HZ_COMMON ++void nohz_balance_enter_idle(int cpu) ++{ ++} ++ ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. ++ * ++ * We don't do a similar optimization for a completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be up to date wrt jiffies etc). ++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(); ++ struct sched_domain *sd; ++ ++ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ return cpu; ++ ++ rcu_read_lock(); ++ for_each_domain(cpu, sd) { ++ for_each_cpu(i, sched_domain_span(sd)) { ++ if (cpu == i) ++ continue; ++ ++ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { ++ cpu = i; ++ goto unlock; ++ } ++ } ++ } ++ ++ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++unlock: ++ rcu_read_unlock(); ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be an infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++void wake_up_idle_cpu(int cpu) ++{ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ if (set_nr_and_not_polling(cpu_rq(cpu)->idle)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++static bool wake_up_full_nohz_cpu(int cpu) ++{ ++ /* ++ * We just need the target to call irq_exit() and re-evaluate ++ * the next tick. The nohz full kick at least implies that. ++ * If needed we can still optimize that later with an ++ * empty IRQ. ++ */ ++ if (cpu_is_offline(cpu)) ++ return true; /* Don't try to wake offline CPUs. */ ++ if (tick_nohz_full_cpu(cpu)) { ++ if (cpu != smp_processor_id() || ++ tick_nohz_tick_stopped()) ++ tick_nohz_full_kick_cpu(cpu); ++ return true; ++ } ++ ++ return false; ++} ++ ++/* ++ * Wake up the specified CPU. If the CPU is going offline, it is the ++ * caller's responsibility to deal with the lost wakeup, for example, ++ * by hooking into the CPU_DEAD notifier like timers and hrtimers do. ++ */ ++void wake_up_nohz_cpu(int cpu) ++{ ++ if (!wake_up_full_nohz_cpu(cpu)) ++ wake_up_idle_cpu(cpu); ++} ++#endif /* CONFIG_NO_HZ_COMMON */ ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask.
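++ * ++ * Illustrative sketch only (not part of this patch): pinning a task to ++ * CPU 2 and later restoring full affinity comes down to: ++ * ++ * set_cpus_allowed_ptr(p, cpumask_of(2)); ++ * set_cpus_allowed_ptr(p, cpu_possible_mask);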
++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. ++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ bool queued = false, running_wrong = false, kthread; ++ struct cpumask old_mask; ++ unsigned long flags; ++ int cpu, ret = 0; ++ struct rq *rq; ++ ++ rq = task_rq_lock(p, &flags); ++ update_rq_clock(rq); ++ ++ kthread = !!(p->flags & PF_KTHREAD); ++ if (kthread) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. ++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ cpumask_copy(&old_mask, &p->cpus_allowed); ++ if (cpumask_equal(&old_mask, new_mask)) ++ goto out; ++ ++ if (!cpumask_intersects(new_mask, cpu_valid_mask)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ queued = task_queued(p); ++ __do_set_cpus_allowed(p, new_mask); ++ ++ if (kthread) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(rq, p)) { ++ /* Task is running on the wrong cpu now, reschedule it. */ ++ if (rq == this_rq()) { ++ cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ set_task_cpu(p, cpu); ++ set_tsk_need_resched(p); ++ running_wrong = true; ++ } else ++ resched_task(p); ++ } else { ++ cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ if (queued) { ++ /* ++ * Switch runqueue locks after dequeueing the task ++ * here while still holding the pi_lock to be holding ++ * the correct lock for enqueueing. ++ */ ++ dequeue_task(rq, p, 0); ++ rq_unlock(rq); ++ ++ rq = cpu_rq(cpu); ++ rq_lock(rq); ++ } ++ set_task_cpu(p, cpu); ++ if (queued) ++ enqueue_task(rq, p, 0); ++ } ++ if (queued) ++ try_preempt(p, rq); ++ if (running_wrong) ++ preempt_disable(); ++out: ++ task_rq_unlock(rq, p, &flags); ++ ++ if (running_wrong) { ++ __schedule(true); ++ preempt_enable(); ++ } ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Run through task list and find tasks affined to the dead cpu, then remove ++ * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold ++ * cpu 0 and src_cpu's runqueue locks. 
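++ * ++ * Worked example (illustrative): a task affined only to dying CPU 2 ++ * (cpus_allowed == 0x4) ends up with cpus_allowed == 0x1 and zerobound ++ * set; unbind_zero() then re-adds each CPU that comes back online and ++ * clears the flag once the mask covers cpu_possible_mask again.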
++ */ ++static void bind_zero(int src_cpu) ++{ ++ struct task_struct *p, *t; ++ struct rq *rq0; ++ int bound = 0; ++ ++ if (src_cpu == 0) ++ return; ++ ++ rq0 = cpu_rq(0); ++ ++ do_each_thread(t, p) { ++ if (cpumask_test_cpu(src_cpu, &p->cpus_allowed)) { ++ bool local = (task_cpu(p) == src_cpu); ++ struct rq *rq = task_rq(p); ++ ++ /* task_running is the cpu stopper thread */ ++ if (local && task_running(rq, p)) ++ continue; ++ atomic_clear_cpu(src_cpu, &p->cpus_allowed); ++ atomic_set_cpu(0, &p->cpus_allowed); ++ p->zerobound = true; ++ bound++; ++ if (local) { ++ bool queued = task_queued(p); ++ ++ if (queued) ++ dequeue_task(rq, p, 0); ++ set_task_cpu(p, 0); ++ if (queued) ++ enqueue_task(rq0, p, 0); ++ } ++ } ++ } while_each_thread(t, p); ++ ++ if (bound) { ++ printk(KERN_INFO "Removed affinity for %d processes to cpu %d\n", ++ bound, src_cpu); ++ } ++} ++ ++/* Find processes with the zerobound flag and reenable their affinity for the ++ * CPU coming alive. */ ++static void unbind_zero(int src_cpu) ++{ ++ int unbound = 0, zerobound = 0; ++ struct task_struct *p, *t; ++ ++ if (src_cpu == 0) ++ return; ++ ++ do_each_thread(t, p) { ++ if (!p->mm) ++ p->zerobound = false; ++ if (p->zerobound) { ++ unbound++; ++ cpumask_set_cpu(src_cpu, &p->cpus_allowed); ++ /* Once every CPU affinity has been re-enabled, remove ++ * the zerobound flag */ ++ if (cpumask_subset(cpu_possible_mask, &p->cpus_allowed)) { ++ p->zerobound = false; ++ zerobound++; ++ } ++ } ++ } while_each_thread(t, p); ++ ++ if (unbound) { ++ printk(KERN_INFO "Added affinity for %d processes to cpu %d\n", ++ unbound, src_cpu); ++ } ++ if (zerobound) { ++ printk(KERN_INFO "Released forced binding to cpu0 for %d processes\n", ++ zerobound); ++ } ++} ++ ++/* ++ * Ensure that the idle task is using init_mm right before its cpu goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(cpu_online(smp_processor_id())); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ current->active_mm = &init_mm; ++ finish_arch_post_lock_switch(); ++ } ++ mmdrop(mm); ++} ++#else /* CONFIG_HOTPLUG_CPU */ ++static void unbind_zero(int src_cpu) {} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task; it's something ++ * userspace knows about and won't get confused about. ++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces.
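++ * ++ * (Note that MuQSS handles the stopper through the ordinary ++ * setscheduler path seen here -- SCHED_FIFO at STOP_PRIO on the way ++ * in, SCHED_NORMAL on the way out -- rather than through a dedicated ++ * stop scheduling class as in mainline.)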
++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++ ++static struct ctl_table sd_ctl_dir[] = { ++ { ++ .procname = "sched_domain", ++ .mode = 0555, ++ }, ++ {} ++}; ++ ++static struct ctl_table sd_ctl_root[] = { ++ { ++ .procname = "kernel", ++ .mode = 0555, ++ .child = sd_ctl_dir, ++ }, ++ {} ++}; ++ ++static struct ctl_table *sd_alloc_ctl_entry(int n) ++{ ++ struct ctl_table *entry = ++ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); ++ ++ return entry; ++} ++ ++static void sd_free_ctl_entry(struct ctl_table **tablep) ++{ ++ struct ctl_table *entry; ++ ++ /* ++ * In the intermediate directories, both the child directory and ++ * procname are dynamically allocated and could fail but the mode ++ * will always be set. In the lowest directory the names are ++ * static strings and all have proc handlers. ++ */ ++ for (entry = *tablep; entry->mode; entry++) { ++ if (entry->child) ++ sd_free_ctl_entry(&entry->child); ++ if (entry->proc_handler == NULL) ++ kfree(entry->procname); ++ } ++ ++ kfree(*tablep); ++ *tablep = NULL; ++} ++ ++#define CPU_LOAD_IDX_MAX 5 ++static int min_load_idx = 0; ++static int max_load_idx = CPU_LOAD_IDX_MAX-1; ++ ++static void ++set_table_entry(struct ctl_table *entry, ++ const char *procname, void *data, int maxlen, ++ umode_t mode, proc_handler *proc_handler, ++ bool load_idx) ++{ ++ entry->procname = procname; ++ entry->data = data; ++ entry->maxlen = maxlen; ++ entry->mode = mode; ++ entry->proc_handler = proc_handler; ++ ++ if (load_idx) { ++ entry->extra1 = &min_load_idx; ++ entry->extra2 = &max_load_idx; ++ } ++} ++ ++static struct ctl_table * ++sd_alloc_ctl_domain_table(struct sched_domain *sd) ++{ ++ struct ctl_table *table = sd_alloc_ctl_entry(14); ++ ++ if (table == NULL) ++ return NULL; ++ ++ set_table_entry(&table[0], "min_interval", &sd->min_interval, ++ sizeof(long), 0644, proc_doulongvec_minmax, false); ++ set_table_entry(&table[1], "max_interval", &sd->max_interval, ++ sizeof(long), 0644, proc_doulongvec_minmax, false); ++ set_table_entry(&table[2], "busy_idx", &sd->busy_idx, ++ sizeof(int), 0644, proc_dointvec_minmax, true); ++ set_table_entry(&table[3], "idle_idx", &sd->idle_idx, ++ sizeof(int), 0644, proc_dointvec_minmax, true); ++ set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, ++ sizeof(int), 0644, proc_dointvec_minmax, true); ++ set_table_entry(&table[5], "wake_idx", &sd->wake_idx, ++ sizeof(int), 0644, proc_dointvec_minmax, true); ++ set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, ++ sizeof(int), 0644, proc_dointvec_minmax, true); ++ set_table_entry(&table[7], "busy_factor", &sd->busy_factor, ++ sizeof(int), 0644, proc_dointvec_minmax, false); ++ set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, ++ sizeof(int), 0644, proc_dointvec_minmax, false); ++ set_table_entry(&table[9], "cache_nice_tries", ++ &sd->cache_nice_tries, ++ sizeof(int), 0644, proc_dointvec_minmax, false); ++ set_table_entry(&table[10], "flags", &sd->flags, ++ sizeof(int), 0644, proc_dointvec_minmax, false); ++ set_table_entry(&table[11], "max_newidle_lb_cost", ++ &sd->max_newidle_lb_cost, ++ sizeof(long), 0644, proc_doulongvec_minmax, false); ++ set_table_entry(&table[12], "name", sd->name, ++ CORENAME_MAX_SIZE, 0444, proc_dostring, false); ++ /* &table[13] is terminator */ ++ ++ return table; ++} ++ ++static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) ++{ ++ struct ctl_table *entry, *table; ++ struct sched_domain *sd; ++ int 
domain_num = 0, i; ++ char buf[32]; ++ ++ for_each_domain(cpu, sd) ++ domain_num++; ++ entry = table = sd_alloc_ctl_entry(domain_num + 1); ++ if (table == NULL) ++ return NULL; ++ ++ i = 0; ++ for_each_domain(cpu, sd) { ++ snprintf(buf, 32, "domain%d", i); ++ entry->procname = kstrdup(buf, GFP_KERNEL); ++ entry->mode = 0555; ++ entry->child = sd_alloc_ctl_domain_table(sd); ++ entry++; ++ i++; ++ } ++ return table; ++} ++ ++static cpumask_var_t sd_sysctl_cpus; ++static struct ctl_table_header *sd_sysctl_header; ++ ++void register_sched_domain_sysctl(void) ++{ ++ static struct ctl_table *cpu_entries; ++ static struct ctl_table **cpu_idx; ++ char buf[32]; ++ int i; ++ ++ if (!cpu_entries) { ++ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); ++ if (!cpu_entries) ++ return; ++ ++ WARN_ON(sd_ctl_dir[0].child); ++ sd_ctl_dir[0].child = cpu_entries; ++ } ++ ++ if (!cpu_idx) { ++ struct ctl_table *e = cpu_entries; ++ ++ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); ++ if (!cpu_idx) ++ return; ++ ++ /* deal with sparse possible map */ ++ for_each_possible_cpu(i) { ++ cpu_idx[i] = e; ++ e++; ++ } ++ } ++ ++ if (!cpumask_available(sd_sysctl_cpus)) { ++ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) ++ return; ++ ++ /* init to possible to not have holes in @cpu_entries */ ++ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); ++ } ++ ++ for_each_cpu(i, sd_sysctl_cpus) { ++ struct ctl_table *e = cpu_idx[i]; ++ ++ if (e->child) ++ sd_free_ctl_entry(&e->child); ++ ++ if (!e->procname) { ++ snprintf(buf, 32, "cpu%d", i); ++ e->procname = kstrdup(buf, GFP_KERNEL); ++ } ++ e->mode = 0555; ++ e->child = sd_alloc_ctl_cpu_table(i); ++ ++ __cpumask_clear_cpu(i, sd_sysctl_cpus); ++ } ++ ++ WARN_ON(sd_sysctl_header); ++ sd_sysctl_header = register_sysctl_table(sd_ctl_root); ++} ++ ++void dirty_sched_domain_sysctl(int cpu) ++{ ++ if (cpumask_available(sd_sysctl_cpus)) ++ __cpumask_set_cpu(cpu, sd_sysctl_cpus); ++} ++ ++/* may be called multiple times per register */ ++void unregister_sched_domain_sysctl(void) ++{ ++ unregister_sysctl_table(sd_sysctl_header); ++ sd_sysctl_header = NULL; ++} ++#endif /* CONFIG_SYSCTL */ ++ ++void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) { ++ cpumask_set_cpu(cpu_of(rq), rq->rd->online); ++ rq->online = true; ++ } ++} ++ ++void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) { ++ int cpu = cpu_of(rq); ++ ++ cpumask_clear_cpu(cpu, rq->rd->online); ++ rq->online = false; ++ clear_cpuidle_map(cpu); ++ } ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. ++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. 
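++ * ++ * Worked example (illustrative): on a 4-CPU machine, suspend offlines ++ * CPUs 1-3 and counts num_cpus_frozen up to 3; the first two online ++ * operations on resume merely decrement it, and only the third falls ++ * through here to rebuild the cpuset domains.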
++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) { ++ sched_domains_numa_masks_set(cpu); ++ cpuset_cpu_active(); ++ } ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all CPUs have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ rq_lock_irqsave(rq, &flags); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_online(rq); ++ } ++ unbind_zero(cpu); ++ rq_unlock_irqrestore(rq, &flags); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu_mult(call_rcu, call_rcu_sched); ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ sched_domains_numa_masks_clear(cpu); ++ return 0; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ /* Handle pending wakeups and then migrate everything off */ ++ sched_ttwu_pending(); ++ sched_tick_stop(cpu); ++ ++ local_irq_save(flags); ++ double_rq_lock(rq, cpu_rq(0)); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_offline(rq); ++ } ++ bind_zero(cpu); ++ double_rq_unlock(rq, cpu_rq(0)); ++ sched_start_tick(rq, cpu); ++ hrexpiry_clear(rq); ++ local_irq_restore(flags); ++ ++ return 0; ++} ++#endif ++ ++#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) ++/* ++ * Cheaper version of the below functions in case support for SMT and MC is ++ * compiled in but CPUs have no siblings. 
++ */ ++static bool sole_cpu_idle(struct rq *rq) ++{ ++ return rq_idle(rq); ++} ++#endif ++#ifdef CONFIG_SCHED_SMT ++static const cpumask_t *thread_cpumask(int cpu) ++{ ++ return topology_sibling_cpumask(cpu); ++} ++/* All this CPU's SMT siblings are idle */ ++static bool siblings_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->thread_mask, &cpu_idle_map); ++} ++#endif ++#ifdef CONFIG_SCHED_MC ++static const cpumask_t *core_cpumask(int cpu) ++{ ++ return topology_core_cpumask(cpu); ++} ++/* All this CPU's shared cache siblings are idle */ ++static bool cache_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->core_mask, &cpu_idle_map); ++} ++#endif ++ ++enum sched_domain_level { ++ SD_LV_NONE = 0, ++ SD_LV_SIBLING, ++ SD_LV_MC, ++ SD_LV_BOOK, ++ SD_LV_CPU, ++ SD_LV_NODE, ++ SD_LV_ALLNODES, ++ SD_LV_MAX ++}; ++ ++void __init sched_init_smp(void) ++{ ++ struct rq *rq, *other_rq, *leader; ++ struct sched_domain *sd; ++ int cpu, other_cpu, i; ++#ifdef CONFIG_SCHED_SMT ++ bool smt_threads = false; ++#endif ++ sched_init_numa(); ++ ++ /* ++ * There's no userspace yet to cause hotplug operations; hence all the ++ * cpu masks are stable and all blatant races in the below code cannot ++ * happen. ++ */ ++ mutex_lock(&sched_domains_mutex); ++ sched_init_domains(cpu_active_mask); ++ mutex_unlock(&sched_domains_mutex); ++ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ mutex_lock(&sched_domains_mutex); ++ local_irq_disable(); ++ lock_all_rqs(); ++ /* ++ * Set up the relative cache distance of each online cpu from each ++ * other in a simple array for quick lookup. Locality is determined ++ * by the closest sched_domain that CPUs are separated by. CPUs with ++ * shared cache in SMT and MC are treated as local. Separate CPUs ++ * (within the same package or physically) within the same node are ++ * treated as not local. CPUs not even in the same domain (different ++ * nodes) are treated as very distant. ++ */ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ ++ /* First check if this cpu is in the same node */ ++ for_each_domain(cpu, sd) { ++ if (sd->level > SD_LV_MC) ++ continue; ++ leader = NULL; ++ /* Set locality to local node if not already found lower */ ++ for_each_cpu(other_cpu, sched_domain_span(sd)) { ++ if (rqshare == RQSHARE_SMP) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the smp_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ other_rq->smp_leader = leader; ++ } ++ ++ if (rq->cpu_locality[other_cpu] > 3) ++ rq->cpu_locality[other_cpu] = 3; ++ } ++ } ++ ++ /* ++ * Each runqueue has its own function in case it doesn't have ++ * siblings of its own allowing mixed topologies. 
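++ * ++ * The resulting cpu_locality scale, combining the defaults set in ++ * sched_init() with the loops above and below: 0 = self, 1 = SMT ++ * sibling, 2 = shared cache, 3 = same node, 4 = different node.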
++ */ ++#ifdef CONFIG_SCHED_MC ++ leader = NULL; ++ if (cpumask_weight(core_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->core_mask, core_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->core_mask); ++ for_each_cpu(other_cpu, core_cpumask(cpu)) { ++ if (rqshare == RQSHARE_MC) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the mc_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ other_rq->mc_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > 2) ++ rq->cpu_locality[other_cpu] = 2; ++ } ++ rq->cache_idle = cache_cpu_idle; ++ } ++#endif ++#ifdef CONFIG_SCHED_SMT ++ leader = NULL; ++ if (cpumask_weight(thread_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->thread_mask); ++ for_each_cpu(other_cpu, thread_cpumask(cpu)) { ++ if (rqshare == RQSHARE_SMT) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the smt_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ other_rq->smt_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > 1) ++ rq->cpu_locality[other_cpu] = 1; ++ } ++ rq->siblings_idle = siblings_cpu_idle; ++ smt_threads = true; ++ } ++#endif ++ } ++ ++#ifdef CONFIG_SMT_NICE ++ if (smt_threads) { ++ check_siblings = &check_smt_siblings; ++ wake_siblings = &wake_smt_siblings; ++ smt_schedule = &smt_should_schedule; ++ } ++#endif ++ unlock_all_rqs(); ++ local_irq_enable(); ++ mutex_unlock(&sched_domains_mutex); ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ ++ for_each_online_cpu(other_cpu) { ++ if (other_cpu <= cpu) ++ continue; ++ printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); ++ } ++ } ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->smp_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "Sharing SMP runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++ } else ++ rq_unlock(rq); ++ } ++ ++#ifdef CONFIG_SCHED_MC ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->mc_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "Sharing MC runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++ } else ++ rq_unlock(rq); ++ } ++#endif /* CONFIG_SCHED_MC */ ++ ++#ifdef CONFIG_SCHED_SMT ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ ++ leader = rq->smt_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "Sharing SMT runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++ } else ++ rq_unlock(rq); ++ } ++#endif /* CONFIG_SCHED_SMT */ ++ ++ total_runqueues = 0; ++ for_each_possible_cpu(cpu) { ++ int locality, total_rqs = 0, total_cpus = 0; ++ ++ rq = cpu_rq(cpu); ++ if ( ++#ifdef CONFIG_SCHED_MC ++ (rq->mc_leader == rq) && ++#endif ++#ifdef CONFIG_SCHED_SMT ++ (rq->smt_leader == rq) && ++#endif ++ (rq->smp_leader == rq)) ++ 
total_runqueues++; ++ ++ for (locality = 0; locality <= 4; locality++) { ++ int test_cpu; ++ ++ for_each_possible_cpu(test_cpu) { ++ /* Work from each CPU up instead of every rq ++ * starting at CPU 0 */ ++ other_cpu = test_cpu + cpu; ++ other_cpu %= num_possible_cpus(); ++ other_rq = cpu_rq(other_cpu); ++ ++ if (rq->cpu_locality[other_cpu] == locality) { ++ rq->cpu_order[total_cpus++] = other_rq; ++ if ( ++ ++#ifdef CONFIG_SCHED_MC ++ (other_rq->mc_leader == other_rq) && ++#endif ++#ifdef CONFIG_SCHED_SMT ++ (other_rq->smt_leader == other_rq) && ++#endif ++ (other_rq->smp_leader == other_rq)) ++ rq->rq_order[total_rqs++] = other_rq; ++ } ++ } ++ } ++ } ++ ++ for_each_possible_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for (i = 0; i < total_runqueues; i++) { ++ printk(KERN_DEBUG "CPU %d RQ order %d RQ %d\n", cpu, i, ++ rq->rq_order[i]->cpu); ++ } ++ } ++ for_each_possible_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for (i = 0; i < num_possible_cpus(); i++) { ++ printk(KERN_DEBUG "CPU %d CPU order %d RQ %d\n", cpu, i, ++ rq->cpu_order[i]->cpu); ++ } ++ } ++ switch (rqshare) { ++ case RQSHARE_SMP: ++ printk(KERN_INFO "MuQSS runqueue share type SMP total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_MC: ++ printk(KERN_INFO "MuQSS runqueue share type MC total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_SMT: ++ printk(KERN_INFO "MuQSS runqueue share type SMT total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_NONE: ++ printk(KERN_INFO "MuQSS runqueue share type none total runqueues: %d\n", ++ total_runqueues); ++ break; ++ } ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++ sched_smp_initialized = true; ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. 
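++ * ++ * Note that these task_group hooks are compatibility stubs under ++ * MuQSS: cgroups can still be created and tasks attached, but grouping ++ * has no effect on scheduling decisions.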
++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++#ifdef CONFIG_SMP ++ int cpu_ids; ++#endif ++ int i; ++ struct rq *rq; ++ ++ wait_bit_init(); ++ ++ prio_ratios[0] = 128; ++ for (i = 1 ; i < NICE_WIDTH ; i++) ++ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; ++ ++ skiplist_node_init(&init_task.node); ++ ++#ifdef CONFIG_SMP ++ init_defrootdomain(); ++ cpumask_clear(&cpu_idle_map); ++#else ++ uprq = &per_cpu(runqueues, 0); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ rq->node = kmalloc(sizeof(skiplist_node), GFP_ATOMIC); ++ skiplist_init(rq->node); ++ rq->sl = new_skiplist(rq->node); ++ rq->lock = kmalloc(sizeof(raw_spinlock_t), GFP_ATOMIC); ++ raw_spin_lock_init(rq->lock); ++ rq->nr_running = 0; ++ rq->nr_uninterruptible = 0; ++ rq->nr_switches = 0; ++ rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0; ++ rq->last_jiffy = jiffies; ++ rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns = ++ rq->iowait_ns = rq->idle_ns = 0; ++ rq->dither = 0; ++ set_rq_task(rq, &init_task); ++ rq->iso_ticks = 0; ++ rq->iso_refractory = false; ++#ifdef CONFIG_SMP ++ rq->smp_leader = rq; ++#ifdef CONFIG_SCHED_MC ++ rq->mc_leader = rq; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ rq->smt_leader = rq; ++#endif ++ rq->sd = NULL; ++ rq->rd = NULL; ++ rq->online = false; ++ rq->cpu = i; ++ rq_attach_root(rq, &def_root_domain); ++#endif ++ init_rq_hrexpiry(rq); ++ atomic_set(&rq->nr_iowait, 0); ++ } ++ ++#ifdef CONFIG_SMP ++ cpu_ids = i; ++ /* ++ * Set the base locality for cpu cache distance calculation to ++ * "distant" (3). Make sure the distance from a CPU to itself is 0. ++ */ ++ for_each_possible_cpu(i) { ++ int j; ++ ++ rq = cpu_rq(i); ++#ifdef CONFIG_SCHED_SMT ++ rq->siblings_idle = sole_cpu_idle; ++#endif ++#ifdef CONFIG_SCHED_MC ++ rq->cache_idle = sole_cpu_idle; ++#endif ++ rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC); ++ for_each_possible_cpu(j) { ++ if (i == j) ++ rq->cpu_locality[j] = 0; ++ else ++ rq->cpu_locality[j] = 4; ++ } ++ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); ++ rq->cpu_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); ++ rq->rq_order[0] = rq->cpu_order[0] = rq; ++ for (j = 1; j < cpu_ids; j++) ++ rq->rq_order[j] = rq->cpu_order[j] = cpu_rq(j); ++ } ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". 
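++ * ++ * (For reference, the prio_ratios[] table filled in above grows by a ++ * factor of 11/10 per entry -- 128, 140, 154, 169, ... -- so each nice ++ * level costs roughly 10% more CPU than the previous one.)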
++ */ ++ init_idle(current, smp_processor_id()); ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. ++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current)) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && !preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++static inline void normalise_rt_tasks(void) ++{ ++ struct task_struct *g, *p; ++ unsigned long flags; ++ struct rq *rq; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p) && !iso_task(p)) ++ continue; ++ ++ rq = task_rq_lock(p, &flags); ++ __setscheduler(p, rq, SCHED_NORMAL, 0, false); ++ task_rq_unlock(rq, p, &flags); ++ } ++ read_unlock(&tasklist_lock); ++} ++ ++void normalize_rt_tasks(void) ++{ ++ normalise_rt_tasks(); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. 
++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPUs synchronised and interrupts disabled, and the ++ * caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++void init_idle_bootup_task(struct task_struct *idle) ++{} ++ ++#ifdef CONFIG_SCHED_DEBUG ++__read_mostly bool sched_debug_enabled; ++ ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ ++#ifdef CONFIG_SMP ++#define SCHED_LOAD_SHIFT (10) ++#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) ++ ++unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) ++{ ++ return SCHED_LOAD_SCALE; ++} ++ ++unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) ++{ ++ unsigned long weight = cpumask_weight(sched_domain_span(sd)); ++ unsigned long smt_gain = sd->smt_gain; ++ ++ smt_gain /= weight; ++ ++ return smt_gain; ++} ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs to complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? 
container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. ++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_legacy_files[] = { ++ { } /* Terminate */ ++}; ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++#undef CREATE_TRACE_POINTS +diff -Nur a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h +--- a/kernel/sched/MuQSS.h 1970-01-01 01:00:00.000000000 +0100 ++++ b/kernel/sched/MuQSS.h 2019-02-09 17:46:12.001297867 +0000 +@@ -0,0 +1,881 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef MUQSS_SCHED_H ++#define MUQSS_SCHED_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_PARAVIRT ++#include ++#endif ++ ++#include "cpupri.h" ++ ++#ifdef CONFIG_SCHED_DEBUG ++# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) ++#else ++# define SCHED_WARN_ON(x) ((void)(x)) ++#endif ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++struct rq; ++ ++#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) ++#define HAVE_SCHED_AVG_IRQ ++#endif ++ ++#ifdef CONFIG_SMP ++ ++static inline bool sched_asym_prefer(int a, int b) ++{ ++ return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); ++} ++ ++/* ++ * We add the notion of a root-domain 
which will be used to define per-domain ++ * variables. Each exclusive cpuset essentially defines an island domain by ++ * fully partitioning the member cpus from any other cpuset. Whenever a new ++ * exclusive cpuset is created, we also create and attach a new root-domain ++ * object. ++ * ++ */ ++struct root_domain { ++ atomic_t refcount; ++ atomic_t rto_count; ++ struct rcu_head rcu; ++ cpumask_var_t span; ++ cpumask_var_t online; ++ ++ /* Indicate more than one runnable task for any CPU */ ++ bool overload; ++ ++ /* ++ * The bit corresponding to a CPU gets set here if such CPU has more ++ * than one runnable -deadline task (as it is below for RT tasks). ++ */ ++ cpumask_var_t dlo_mask; ++ atomic_t dlo_count; ++ /* Replace unused CFS structures with void */ ++ //struct dl_bw dl_bw; ++ //struct cpudl cpudl; ++ void *dl_bw; ++ void *cpudl; ++ ++ /* ++ * The "RT overload" flag: it gets set if a CPU has more than ++ * one runnable RT task. ++ */ ++ cpumask_var_t rto_mask; ++ //struct cpupri cpupri; ++ void *cpupri; ++ ++ unsigned long max_cpu_capacity; ++}; ++ ++extern struct root_domain def_root_domain; ++extern struct mutex sched_domains_mutex; ++ ++extern void init_defrootdomain(void); ++extern int sched_init_domains(const struct cpumask *cpu_map); ++extern void rq_attach_root(struct rq *rq, struct root_domain *rd); ++ ++static inline void cpupri_cleanup(void __maybe_unused *cpupri) ++{ ++} ++ ++static inline void cpudl_cleanup(void __maybe_unused *cpudl) ++{ ++} ++ ++static inline void init_dl_bw(void __maybe_unused *dl_bw) ++{ ++} ++ ++static inline int cpudl_init(void __maybe_unused *dl_bw) ++{ ++ return 0; ++} ++ ++static inline int cpupri_init(void __maybe_unused *cpupri) ++{ ++ return 0; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * This is the main, per-CPU runqueue data structure. ++ * This data should only be modified by the local cpu. ++ */ ++struct rq { ++ raw_spinlock_t *lock; ++ raw_spinlock_t *orig_lock; ++ ++ struct task_struct *curr, *idle, *stop; ++ struct mm_struct *prev_mm; ++ ++ unsigned int nr_running; ++ /* ++ * This is part of a global counter where only the total sum ++ * over all CPUs matters. A task can increase this counter on ++ * one CPU and if it got migrated afterwards it may decrease ++ * it on another CPU. 
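++ * Because only the global sum is meaningful, a single runqueue's ++ * counter may even go transiently negative.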
Always updated under the runqueue lock: ++ */ ++ unsigned long nr_uninterruptible; ++ u64 nr_switches; ++ ++ /* Stored data about rq->curr to work outside rq lock */ ++ u64 rq_deadline; ++ int rq_prio; ++ ++ /* Best queued id for use outside lock */ ++ u64 best_key; ++ ++ unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */ ++ unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ ++ u64 niffies; /* Last time this RQ updated rq clock */ ++ u64 last_niffy; /* Last niffies as updated by local clock */ ++ u64 last_jiffy_niffies; /* Niffies @ last_jiffy */ ++ ++ u64 load_update; /* When we last updated load */ ++ unsigned long load_avg; /* Rolling load average */ ++#ifdef HAVE_SCHED_AVG_IRQ ++ u64 irq_load_update; /* When we last updated IRQ load */ ++ unsigned long irq_load_avg; /* Rolling IRQ load average */ ++#endif ++#ifdef CONFIG_SMT_NICE ++ struct mm_struct *rq_mm; ++ int rq_smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++ /* Accurate timekeeping data */ ++ unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns, ++ iowait_ns, idle_ns; ++ atomic_t nr_iowait; ++ ++ skiplist_node *node; ++ skiplist *sl; ++#ifdef CONFIG_SMP ++ struct task_struct *preempt; /* Preempt triggered on this task */ ++ struct task_struct *preempting; /* Hint only, what task is preempting */ ++ ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++ struct root_domain *rd; ++ struct sched_domain *sd; ++ ++ unsigned long cpu_capacity_orig; ++ ++ int *cpu_locality; /* CPU relative cache distance */ ++ struct rq **rq_order; /* Shared RQs ordered by relative cache distance */ ++ struct rq **cpu_order; /* RQs of discrete CPUs ordered by distance */ ++ ++ struct rq *smp_leader; /* First physical CPU per node */ ++#ifdef CONFIG_SCHED_SMT ++ struct rq *smt_leader; /* First logical CPU in SMT siblings */ ++ cpumask_t thread_mask; ++ bool (*siblings_idle)(struct rq *rq); ++ /* See if all smt siblings are idle */ ++#endif /* CONFIG_SCHED_SMT */ ++#ifdef CONFIG_SCHED_MC ++ struct rq *mc_leader; /* First logical CPU in MC siblings */ ++ cpumask_t core_mask; ++ bool (*cache_idle)(struct rq *rq); ++ /* See if all cache siblings are idle */ ++#endif /* CONFIG_SCHED_MC */ ++#endif /* CONFIG_SMP */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ u64 clock, old_clock, last_tick; ++ u64 clock_task; ++ int dither; ++ ++ int iso_ticks; ++ bool iso_refractory; ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++ struct hrtimer hrexpiry_timer; ++#endif ++ ++ int rt_nr_running; /* Number real time tasks running */ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
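++ * (not applicable under MuQSS, which has no cfs_rq or rt_rq)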
*/ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++ ++#ifdef CONFIG_SMP ++ struct llist_head wake_list; ++#endif ++ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++#ifdef CONFIG_SMP ++struct rq *cpu_rq(int cpu); ++#endif ++ ++#ifndef CONFIG_SMP ++extern struct rq *uprq; ++#define cpu_rq(cpu) (uprq) ++#define this_rq() (uprq) ++#define raw_rq() (uprq) ++#define task_rq(p) (uprq) ++#define cpu_curr(cpu) ((uprq)->curr) ++#else /* CONFIG_SMP */ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define this_rq() this_cpu_ptr(&runqueues) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#endif /* CONFIG_SMP */ ++ ++static inline int task_current(struct rq *rq, struct task_struct *p) ++{ ++ return rq->curr == p; ++} ++ ++static inline int task_running(struct rq *rq, struct task_struct *p) ++{ ++#ifdef CONFIG_SMP ++ return p->on_cpu; ++#else ++ return task_current(rq, p); ++#endif ++} ++ ++static inline void rq_lock(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock(rq->lock); ++} ++ ++static inline void rq_unlock(struct rq *rq) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(rq->lock); ++} ++ ++static inline void rq_lock_irq(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irq(rq->lock); ++} ++ ++static inline void rq_unlock_irq(struct rq *rq) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(rq->lock); ++} ++ ++static inline void rq_lock_irqsave(struct rq *rq, unsigned long *flags) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irqsave(rq->lock, *flags); ++} ++ ++static inline void rq_unlock_irqrestore(struct rq *rq, unsigned long *flags) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irqrestore(rq->lock, *flags); ++} ++ ++static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ while (42) { ++ raw_spin_lock_irqsave(&p->pi_lock, *flags); ++ rq = task_rq(p); ++ raw_spin_lock(rq->lock); ++ if (likely(rq == task_rq(p))) ++ break; ++ raw_spin_unlock(rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); ++ } ++ return rq; ++} ++ ++static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ rq_unlock(rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); ++} ++ ++static inline struct rq *__task_rq_lock(struct task_struct *p) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ while (42) { ++ rq = task_rq(p); ++ raw_spin_lock(rq->lock); ++ if (likely(rq == task_rq(p))) ++ break; ++ raw_spin_unlock(rq->lock); ++ } ++ return rq; ++} ++ ++static inline void __task_rq_unlock(struct rq *rq) ++{ ++ rq_unlock(rq); ++} ++ ++/* ++ * {de,en}queue flags: Most not used on MuQSS. ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks ++ * are in a known state which allows modification. Such pairs ++ * should preserve as much state as possible. 
++ * ++ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location ++ * in the runqueue. ++ * ++ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) ++ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) ++ * ENQUEUE_MIGRATED - the task was migrated during wakeup ++ * ++ */ ++ ++#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ ++ ++#define ENQUEUE_RESTORE 0x02 ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ lockdep_assert_held(rq->lock); ++ ++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ lockdep_assert_held(rq->lock); ++ ++ return rq->clock_task; ++} ++ ++#ifdef CONFIG_NUMA ++enum numa_topology_type { ++ NUMA_DIRECT, ++ NUMA_GLUELESS_MESH, ++ NUMA_BACKPLANE, ++}; ++extern enum numa_topology_type sched_numa_topology_type; ++extern int sched_max_numa_distance; ++extern bool find_numa_distance(int distance); ++ ++extern void sched_init_numa(void); ++extern void sched_domains_numa_masks_set(unsigned int cpu); ++extern void sched_domains_numa_masks_clear(unsigned int cpu); ++#else ++static inline void sched_init_numa(void) { } ++static inline void sched_domains_numa_masks_set(unsigned int cpu) { } ++static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } ++#endif ++ ++extern struct mutex sched_domains_mutex; ++extern struct static_key_false sched_schedstats; ++ ++#define rcu_dereference_check_sched_domain(p) \ ++ rcu_dereference_check((p), \ ++ lockdep_is_held(&sched_domains_mutex)) ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * The domain tree (rq->sd) is protected by RCU's quiescent state transition. ++ * See detach_destroy_domains: synchronize_sched for details. ++ * ++ * The domain tree of any CPU may only be accessed from within ++ * preempt-disabled sections. ++ */ ++#define for_each_domain(cpu, __sd) \ ++ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ ++ __sd; __sd = __sd->parent) ++ ++#define for_each_lower_domain(sd) for (; sd; sd = sd->child) ++ ++/** ++ * highest_flag_domain - Return highest sched_domain containing flag. ++ * @cpu: The cpu whose highest level of sched domain is to ++ * be returned. ++ * @flag: The flag to check for the highest sched_domain ++ * for the given cpu. ++ * ++ * Returns the highest sched_domain of a cpu which contains the given flag. ++ */ ++static inline struct sched_domain *highest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd, *hsd = NULL; ++ ++ for_each_domain(cpu, sd) { ++ if (!(sd->flags & flag)) ++ break; ++ hsd = sd; ++ } ++ ++ return hsd; ++} ++ ++static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd; ++ ++ for_each_domain(cpu, sd) { ++ if (sd->flags & flag) ++ break; ++ } ++ ++ return sd; ++} ++ ++DECLARE_PER_CPU(struct sched_domain *, sd_llc); ++DECLARE_PER_CPU(int, sd_llc_size); ++DECLARE_PER_CPU(int, sd_llc_id); ++DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); ++DECLARE_PER_CPU(struct sched_domain *, sd_numa); ++DECLARE_PER_CPU(struct sched_domain *, sd_asym); ++ ++struct sched_group_capacity { ++ atomic_t ref; ++ /* ++ * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity ++ * for a single CPU. 
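++ * (SCHED_CAPACITY_SCALE is 1 << SCHED_CAPACITY_SHIFT, i.e. 1024.)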
++ */ ++ unsigned long capacity; ++ unsigned long min_capacity; /* Min per-CPU capacity in group */ ++ unsigned long next_update; ++ int imbalance; /* XXX unrelated to capacity but shared group state */ ++ ++#ifdef CONFIG_SCHED_DEBUG ++ int id; ++#endif ++ ++ unsigned long cpumask[0]; /* balance mask */ ++}; ++ ++struct sched_group { ++ struct sched_group *next; /* Must be a circular list */ ++ atomic_t ref; ++ ++ unsigned int group_weight; ++ struct sched_group_capacity *sgc; ++ int asym_prefer_cpu; /* cpu of highest priority in group */ ++ ++ /* ++ * The CPUs this group covers. ++ * ++ * NOTE: this field is variable length. (Allocated dynamically ++ * by attaching extra space to the end of the structure, ++ * depending on how many CPUs the kernel has booted up with) ++ */ ++ unsigned long cpumask[0]; ++}; ++ ++static inline struct cpumask *sched_group_span(struct sched_group *sg) ++{ ++ return to_cpumask(sg->cpumask); ++} ++ ++/* ++ * See build_balance_mask(). ++ */ ++static inline struct cpumask *group_balance_mask(struct sched_group *sg) ++{ ++ return to_cpumask(sg->sgc->cpumask); ++} ++ ++/** ++ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. ++ * @group: The group whose first cpu is to be returned. ++ */ ++static inline unsigned int group_first_cpu(struct sched_group *group) ++{ ++ return cpumask_first(sched_group_span(group)); ++} ++ ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void dirty_sched_domain_sysctl(int cpu); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void dirty_sched_domain_sysctl(int cpu) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++extern void sched_ttwu_pending(void); ++extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); ++extern void set_rq_online(struct rq *rq); ++extern void set_rq_offline(struct rq *rq); ++extern bool sched_smp_initialized; ++ ++static inline void update_group_capacity(struct sched_domain *sd, int cpu) ++{ ++} ++ ++static inline void trigger_load_balance(struct rq *rq) ++{ ++} ++ ++#define sched_feat(x) 0 ++ ++#else /* CONFIG_SMP */ ++ ++static inline void sched_ttwu_pending(void) { } ++ ++#endif /* CONFIG_SMP */ ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ SCHED_WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++#ifdef CONFIG_SCHED_DEBUG ++extern bool sched_debug_enabled; ++#endif ++ ++extern void schedule_idle(void); ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime would have its own runtime ++ * subtracted and would never move forward.
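++ * ++ * The __u64_stats_fetch_begin/retry pair below makes this 64-bit read ++ * tear-free on 32-bit SMP kernels; on 64-bit the seqcount compiles ++ * away and it is a plain load.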
++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++#ifdef CONFIG_SMP ++static inline int cpu_of(struct rq *rq) ++{ ++ return rq->cpu; ++} ++#else /* CONFIG_SMP */ ++static inline int cpu_of(struct rq *rq) ++{ ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); ++ ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, ++ cpu_of(rq))); ++ ++ if (data) ++ data->func(data, rq->niffies, flags); ++} ++#else ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flag) ++{ ++} ++#endif /* CONFIG_CPU_FREQ */ ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++/* ++ * This should only be called when current == rq->idle. Dodgy workaround for ++ * when softirqs are pending and we are in the idle loop. Setting current to ++ * resched will kick us out of the idle loop and the softirqs will be serviced ++ * on our next pass through schedule(). ++ */ ++static inline bool softirq_pending(int cpu) ++{ ++ if (likely(!local_softirq_pending())) ++ return false; ++ set_tsk_need_resched(current); ++ return true; ++} ++ ++#ifdef CONFIG_64BIT ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ return tsk_seruntime(t); ++} ++#else ++struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags); ++void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags); ++ ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ unsigned long flags; ++ u64 ns; ++ struct rq *rq; ++ ++ rq = task_rq_lock(t, &flags); ++ ns = tsk_seruntime(t); ++ task_rq_unlock(rq, t, &flags); ++ ++ return ns; ++} ++#endif ++ ++#ifndef arch_scale_freq_capacity ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern bool sched_can_stop_tick(struct rq *rq); ++extern int __init sched_tick_offload_init(void); ++ ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out of ++ * nohz mode if necessary. 
++ */
++static inline void sched_update_tick_dependency(struct rq *rq)
++{
++	int cpu;
++
++	if (!tick_nohz_full_enabled())
++		return;
++
++	cpu = cpu_of(rq);
++
++	if (!tick_nohz_full_cpu(cpu))
++		return;
++
++	if (sched_can_stop_tick(rq))
++		tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
++	else
++		tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
++}
++#else
++static inline int sched_tick_offload_init(void) { return 0; }
++static inline void sched_update_tick_dependency(struct rq *rq) { }
++#endif
++
++#ifdef CONFIG_SMP
++
++#ifndef arch_scale_cpu_capacity
++static __always_inline
++unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
++{
++	if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
++		return sd->smt_gain / sd->span_weight;
++
++	return SCHED_CAPACITY_SCALE;
++}
++#endif
++#else
++#ifndef arch_scale_cpu_capacity
++static __always_inline
++unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
++{
++	return SCHED_CAPACITY_SCALE;
++}
++#endif
++#endif
++
++#define SCHED_FLAG_SUGOV 0x10000000
++
++static inline bool rt_rq_is_runnable(struct rq *rt_rq)
++{
++	return rt_rq->rt_nr_running;
++}
++
++#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
++
++static inline unsigned long cpu_bw_dl(struct rq *rq)
++{
++	return 0;
++}
++
++static inline unsigned long cpu_util_dl(struct rq *rq)
++{
++	return 0;
++}
++
++static inline unsigned long cpu_util_cfs(struct rq *rq)
++{
++	unsigned long ret = READ_ONCE(rq->load_avg);
++
++	if (ret > SCHED_CAPACITY_SCALE)
++		ret = SCHED_CAPACITY_SCALE;
++	return ret;
++}
++
++static inline unsigned long cpu_util_rt(struct rq *rq)
++{
++	unsigned long ret = READ_ONCE(rq->rt_nr_running);
++
++	if (ret > SCHED_CAPACITY_SCALE)
++		ret = SCHED_CAPACITY_SCALE;
++	return ret;
++}
++
++#ifdef HAVE_SCHED_AVG_IRQ
++static inline unsigned long cpu_util_irq(struct rq *rq)
++{
++	unsigned long ret = READ_ONCE(rq->irq_load_avg);
++
++	if (ret > SCHED_CAPACITY_SCALE)
++		ret = SCHED_CAPACITY_SCALE;
++	return ret;
++}
++
++static inline
++unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
++{
++	util *= (max - irq);
++	util /= max;
++
++	return util;
++
++}
++#else
++static inline unsigned long cpu_util_irq(struct rq *rq)
++{
++	return 0;
++}
++
++static inline
++unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
++{
++	return util;
++}
++#endif
++#endif
++
++#endif /* MUQSS_SCHED_H */
+diff -Nur a/kernel/sched/sched.h b/kernel/sched/sched.h
+--- a/kernel/sched/sched.h	2019-02-06 16:30:16.000000000 +0000
++++ b/kernel/sched/sched.h	2019-02-09 17:46:12.001297867 +0000
+@@ -2,6 +2,19 @@
+ /*
+  * Scheduler internal types and methods:
+  */
++#ifdef CONFIG_SCHED_MUQSS
++#include "MuQSS.h"
++
++/* Begin compatibility wrappers for MuQSS/CFS differences */
++#define rq_rt_nr_running(rq) ((rq)->rt_nr_running)
++#define rq_h_nr_running(rq) ((rq)->nr_running)
++
++#else /* CONFIG_SCHED_MUQSS */
++
++#define rq_rt_nr_running(rq) ((rq)->rt.rt_nr_running)
++#define rq_h_nr_running(rq) ((rq)->cfs.h_nr_running)
++
++
+ #include <linux/sched.h>
+ 
+ #include <linux/sched/autogroup.h>
+@@ -2241,3 +2254,30 @@
+ 	return util;
+ }
+ #endif
++
++/* MuQSS compatibility functions */
++static inline bool softirq_pending(int cpu)
++{
++	return false;
++}
++
++#ifdef CONFIG_64BIT
++static inline u64 read_sum_exec_runtime(struct task_struct *t)
++{
++	return t->se.sum_exec_runtime;
++}
++#else
++static inline u64 read_sum_exec_runtime(struct task_struct *t)
++{
++	u64 ns;
++	struct rq_flags rf;
++	struct rq *rq;
++
++	rq = task_rq_lock(t, &rf);
++	ns = t->se.sum_exec_runtime;
++	task_rq_unlock(rq, t, &rf);
++
++	return ns;
++}
++#endif
++#endif /* CONFIG_SCHED_MUQSS */
+diff -Nur a/kernel/sched/topology.c b/kernel/sched/topology.c
+--- a/kernel/sched/topology.c	2019-02-06 16:30:16.000000000 +0000
++++ b/kernel/sched/topology.c	2019-02-09 17:46:12.001297867 +0000
+@@ -219,7 +219,11 @@
+ 	struct root_domain *old_rd = NULL;
+ 	unsigned long flags;
+ 
++#ifdef CONFIG_SCHED_MUQSS
++	raw_spin_lock_irqsave(rq->lock, flags);
++#else
+ 	raw_spin_lock_irqsave(&rq->lock, flags);
++#endif
+ 
+ 	if (rq->rd) {
+ 		old_rd = rq->rd;
+@@ -245,7 +249,11 @@
+ 	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
+ 		set_rq_online(rq);
+ 
++#ifdef CONFIG_SCHED_MUQSS
++	raw_spin_unlock_irqrestore(rq->lock, flags);
++#else
+ 	raw_spin_unlock_irqrestore(&rq->lock, flags);
++#endif
+ 
+ 	if (old_rd)
+ 		call_rcu_sched(&old_rd->rcu, free_rootdomain);
+diff -Nur a/kernel/skip_list.c b/kernel/skip_list.c
+--- a/kernel/skip_list.c	1970-01-01 01:00:00.000000000 +0100
++++ b/kernel/skip_list.c	2019-02-09 17:46:12.001297867 +0000
+@@ -0,0 +1,148 @@
++/*
++  Copyright (C) 2011,2016 Con Kolivas.
++
++  Code based on example originally by William Pugh.
++
++Skip Lists are a probabilistic alternative to balanced trees, as
++described in the June 1990 issue of CACM and were invented by
++William Pugh in 1987.
++
++A couple of comments about this implementation:
++The routine randomLevel has been hard-coded to generate random
++levels using p=0.25. It can be easily changed.
++
++The insertion routine has been implemented so as to use the
++dirty hack described in the CACM paper: if a random level is
++generated that is more than the current maximum level, the
++current maximum level plus one is used instead.
++
++Levels start at zero and go up to MaxLevel (which is equal to
++MaxNumberOfLevels-1).
++
++The routines defined in this file are:
++
++init: defines slnode
++
++new_skiplist: returns a new, empty list
++
++randomLevel: Returns a random level based on a u64 random seed passed to it.
++In MuQSS, the "niffy" time is used for this purpose.
++
++insert(l, key, value): inserts the binding (key, value) into l. This
++operation occurs in O(log n) time.
++
++delnode(slnode, l, node): deletes any binding of key from the list l based
++on the actual node value. This operation occurs in O(k) time where k is the
++number of levels of the node in question (max 8). The original delete
++function occurred in O(log n) time and involved a search.
++
++MuQSS Notes: In this implementation of skiplists, there are bidirectional
++next/prev pointers and the insert function returns a pointer to the actual
++node where the value is stored. The key here is chosen by the scheduler so
++as to sort tasks according to the priority list requirements and is no
++longer used by the scheduler after insertion. The scheduler lookup, however,
++occurs in O(1) time because it is always the first item in the level 0
++linked list. Since the task struct stores a copy of the node pointer upon
++skiplist_insert, it can also remove it much faster than the original
++implementation with the aid of prev<->next pointer manipulation and no
++searching.
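++
++A minimal usage sketch (an illustration, not part of the scheduler; "obj"
++and "my_seed" are hypothetical, standing in for the structure embedding the
++node and for whatever entropy the caller has, niffies in MuQSS's case):
++
++	static skiplist_node head;
++	skiplist *sl;
++
++	skiplist_init(&head);			/* sentinel, key = ~0 */
++	sl = new_skiplist(&head);
++	skiplist_insert(sl, &obj->node, key, obj, my_seed); /* O(log n) */
++	/* front entry is always reachable in O(1): sl->header->next[0] */
++	skiplist_delete(sl, &obj->node);	/* O(k), no search needed */
++
++Why randomLevel() honours p=0.25: for random seed bits,
++find_first_bit(&seed, MaxLevel) returns index i with probability 2^-(i+1),
++so level = i / 2 satisfies P(level >= k) = 2^-2k = (1/4)^k, the geometric
++distribution the CACM paper describes.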
++
++*/
++
++#include <linux/slab.h>
++#include <linux/skip_list.h>
++
++#define MaxNumberOfLevels 8
++#define MaxLevel (MaxNumberOfLevels - 1)
++
++void skiplist_init(skiplist_node *slnode)
++{
++	int i;
++
++	slnode->key = 0xFFFFFFFFFFFFFFFF;
++	slnode->level = 0;
++	slnode->value = NULL;
++	for (i = 0; i < MaxNumberOfLevels; i++)
++		slnode->next[i] = slnode->prev[i] = slnode;
++}
++
++skiplist *new_skiplist(skiplist_node *slnode)
++{
++	skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC);
++
++	BUG_ON(!l);
++	l->header = slnode;
++	return l;
++}
++
++void free_skiplist(skiplist *l)
++{
++	skiplist_node *p, *q;
++
++	p = l->header;
++	do {
++		q = p->next[0];
++		p->next[0]->prev[0] = q->prev[0];
++		skiplist_node_init(p);
++		p = q;
++	} while (p != l->header);
++	kfree(l);
++}
++
++void skiplist_node_init(skiplist_node *node)
++{
++	memset(node, 0, sizeof(skiplist_node));
++}
++
++static inline unsigned int randomLevel(const long unsigned int randseed)
++{
++	return find_first_bit(&randseed, MaxLevel) / 2;
++}
++
++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed)
++{
++	skiplist_node *update[MaxNumberOfLevels];
++	skiplist_node *p, *q;
++	int k = l->level;
++
++	p = l->header;
++	do {
++		while (q = p->next[k], q->key <= key)
++			p = q;
++		update[k] = p;
++	} while (--k >= 0);
++
++	++l->entries;
++	k = randomLevel(randseed);
++	if (k > l->level) {
++		k = ++l->level;
++		update[k] = l->header;
++	}
++
++	node->level = k;
++	node->key = key;
++	node->value = value;
++	do {
++		p = update[k];
++		node->next[k] = p->next[k];
++		p->next[k] = node;
++		node->prev[k] = p;
++		node->next[k]->prev[k] = node;
++	} while (--k >= 0);
++}
++
++void skiplist_delete(skiplist *l, skiplist_node *node)
++{
++	int k, m = node->level;
++
++	for (k = 0; k <= m; k++) {
++		node->prev[k]->next[k] = node->next[k];
++		node->next[k]->prev[k] = node->prev[k];
++	}
++	skiplist_node_init(node);
++	if (m == l->level) {
++		while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0)
++			m--;
++		l->level = m;
++	}
++	l->entries--;
++}
+diff -Nur a/kernel/sysctl.c b/kernel/sysctl.c
+--- a/kernel/sysctl.c	2019-02-09 17:20:30.491821512 +0000
++++ b/kernel/sysctl.c	2019-02-09 17:57:37.563468776 +0000
+@@ -136,6 +136,12 @@
+ static unsigned long one_ul __read_only = 1;
+ static int one_hundred __read_only = 100;
+ static int one_thousand __read_only = 1000;
++#ifdef CONFIG_SCHED_MUQSS
++extern int rr_interval;
++extern int sched_interactive;
++extern int sched_iso_cpu;
++extern int sched_yield_type;
++#endif
+ #ifdef CONFIG_PRINTK
+ static int ten_thousand __read_only = 10000;
+ #endif
+@@ -306,7 +312,7 @@
+ 	{ }
+ };
+ 
+-#ifdef CONFIG_SCHED_DEBUG
++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS)
+ static int min_sched_granularity_ns __read_only = 100000; /* 100 usecs */
+ static int max_sched_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */
+ static int min_wakeup_granularity_ns __read_only; /* 0 usecs */
+@@ -323,6 +329,7 @@
+ #endif
+ 
+ static struct ctl_table kern_table[] = {
++#ifndef CONFIG_SCHED_MUQSS
+ 	{
+ 		.procname	= "sched_child_runs_first",
+ 		.data		= &sysctl_sched_child_runs_first,
+@@ -477,6 +484,7 @@
+ 		.extra1		= &one,
+ 	},
+ #endif
++#endif /* !CONFIG_SCHED_MUQSS */
+ #ifdef CONFIG_PROVE_LOCKING
+ 	{
+ 		.procname	= "prove_locking",
+@@ -1082,6 +1090,44 @@
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
++#ifdef CONFIG_SCHED_MUQSS
++	{
++		.procname	= "rr_interval",
++		.data		= &rr_interval,
++		.maxlen		= sizeof (int),
++		.mode		= 0644,
++		.proc_handler	=
&proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &one_thousand, ++ }, ++ { ++ .procname = "interactive", ++ .data = &sched_interactive, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, ++ { ++ .procname = "iso_cpu", ++ .data = &sched_iso_cpu, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one_hundred, ++ }, ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &two, ++ }, ++#endif + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff -Nur a/kernel/time/clockevents.c b/kernel/time/clockevents.c +--- a/kernel/time/clockevents.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/time/clockevents.c 2019-02-09 17:46:12.001297867 +0000 +@@ -198,8 +198,13 @@ + + #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST + ++#ifdef CONFIG_SCHED_MUQSS ++/* Limit min_delta to 100us */ ++#define MIN_DELTA_LIMIT (NSEC_PER_SEC / 10000) ++#else + /* Limit min_delta to a jiffie */ + #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) ++#endif + + /** + * clockevents_increase_min_delta - raise minimum delta of a clock event device +diff -Nur a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +--- a/kernel/time/posix-cpu-timers.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/time/posix-cpu-timers.c 2019-02-09 17:46:12.001297867 +0000 +@@ -830,7 +830,7 @@ + tsk_expires->virt_exp = expires; + + tsk_expires->sched_exp = check_timers_list(++timers, firing, +- tsk->se.sum_exec_runtime); ++ tsk_seruntime(tsk)); + + /* + * Check for the special case thread timers. +@@ -840,7 +840,7 @@ + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + if (hard != RLIM_INFINITY && +- tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { ++ tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { + /* + * At the hard limit, we just die. + * No need to calculate anything else now. +@@ -852,7 +852,7 @@ + __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + return; + } +- if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { ++ if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { + /* + * At the soft limit, send a SIGXCPU every second. + */ +@@ -1095,7 +1095,7 @@ + struct task_cputime task_sample; + + task_cputime(tsk, &task_sample.utime, &task_sample.stime); +- task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime; ++ task_sample.sum_exec_runtime = tsk_seruntime(tsk); + if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) + return 1; + } +diff -Nur a/kernel/time/timer.c b/kernel/time/timer.c +--- a/kernel/time/timer.c 2019-02-09 17:20:30.491821512 +0000 ++++ b/kernel/time/timer.c 2019-02-09 17:46:12.001297867 +0000 +@@ -1479,7 +1479,7 @@ + * Check, if the next hrtimer event is before the next timer wheel + * event: + */ +-static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) ++static u64 cmp_next_hrtimer_event(struct timer_base *base, u64 basem, u64 expires) + { + u64 nextevt = hrtimer_get_next_event(); + +@@ -1497,6 +1497,9 @@ + if (nextevt <= basem) + return basem; + ++ if (nextevt < expires && nextevt - basem <= TICK_NSEC) ++ base->is_idle = false; ++ + /* + * Round up to the next jiffie. 
High resolution timers are + * off, so the hrtimers are expired in the tick and we need to +@@ -1566,7 +1569,7 @@ + } + raw_spin_unlock(&base->lock); + +- return cmp_next_hrtimer_event(basem, expires); ++ return cmp_next_hrtimer_event(base, basem, expires); + } + + /** +diff -Nur a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +--- a/kernel/trace/trace_selftest.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/trace/trace_selftest.c 2019-02-09 17:46:12.001297867 +0000 +@@ -1041,10 +1041,15 @@ + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_MUQSS ++ /* No deadline on MuQSS, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-0002-Fix-Werror-build-failure-in-tools.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-0002-Fix-Werror-build-failure-in-tools.patch new file mode 100644 index 00000000..35872156 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-0002-Fix-Werror-build-failure-in-tools.patch @@ -0,0 +1,25 @@ +From ba77544e4687e62fe9d8ca870ceb47ea87d1cbfe Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Sun, 18 Feb 2018 12:36:22 +1100 +Subject: [PATCH 02/16] Fix Werror build failure in tools. + +--- + tools/objtool/Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile +index c9d038f91af6..af41781c233a 100644 +--- a/tools/objtool/Makefile ++++ b/tools/objtool/Makefile +@@ -31,7 +31,7 @@ INCLUDES := -I$(srctree)/tools/include \ + -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \ + -I$(srctree)/tools/objtool/arch/$(ARCH)/include + WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed +-CFLAGS += -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) ++CFLAGS += $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) + LDFLAGS += -lelf $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS) + + # Allow old libelf to be used: +-- +2.17.1 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-0003-Make-preemptible-kernel-default.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-0003-Make-preemptible-kernel-default.patch new file mode 100644 index 00000000..176fcb54 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-0003-Make-preemptible-kernel-default.patch @@ -0,0 +1,4648 @@ +From 2432d1de7128e6ac986749bc52eb30c4c1c654d0 Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Sat, 29 Oct 2016 11:20:37 +1100 +Subject: [PATCH 03/16] Make preemptible kernel default. + +Make full preempt default on all arches. 
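+
+A note for readers: per the diffstat below, the functional change lives in
+kernel/Kconfig.preempt (9 lines touched); the mass of defconfig updates just
+tracks the new default. A sketch of the likely end state, assuming the stock
+4.19 preemption choice block (the symbol names are real; treating the flip
+of the "default" line as the whole Kconfig change is an assumption):
+
+	choice
+		prompt "Preemption Model"
+		default PREEMPT
+
+	config PREEMPT_NONE
+		bool "No Forced Preemption (Server)"
+
+	config PREEMPT_VOLUNTARY
+		bool "Voluntary Kernel Preemption (Desktop)"
+
+	config PREEMPT
+		bool "Preemptible Kernel (Low-Latency Desktop)"
+		select PREEMPT_COUNT
+
+	endchoice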
+--- + arch/arc/configs/tb10x_defconfig | 2 +- + arch/arm/configs/bcm2835_defconfig | 2 +- + arch/arm/configs/imx_v6_v7_defconfig | 2 +- + arch/arm/configs/mps2_defconfig | 2 +- + arch/arm/configs/mxs_defconfig | 2 +- + arch/blackfin/configs/BF518F-EZBRD_defconfig | 121 ++++ + arch/blackfin/configs/BF526-EZBRD_defconfig | 158 ++++++ + .../blackfin/configs/BF527-EZKIT-V2_defconfig | 188 +++++++ + arch/blackfin/configs/BF527-EZKIT_defconfig | 181 ++++++ + .../blackfin/configs/BF527-TLL6527M_defconfig | 178 ++++++ + arch/blackfin/configs/BF533-EZKIT_defconfig | 114 ++++ + arch/blackfin/configs/BF533-STAMP_defconfig | 124 +++++ + arch/blackfin/configs/BF537-STAMP_defconfig | 136 +++++ + arch/blackfin/configs/BF538-EZKIT_defconfig | 133 +++++ + arch/blackfin/configs/BF548-EZKIT_defconfig | 207 +++++++ + arch/blackfin/configs/BF561-ACVILON_defconfig | 149 +++++ + .../configs/BF561-EZKIT-SMP_defconfig | 112 ++++ + arch/blackfin/configs/BF561-EZKIT_defconfig | 114 ++++ + arch/blackfin/configs/BF609-EZKIT_defconfig | 154 +++++ + arch/blackfin/configs/BlackStamp_defconfig | 108 ++++ + arch/blackfin/configs/CM-BF527_defconfig | 129 +++++ + arch/blackfin/configs/PNAV-10_defconfig | 111 ++++ + arch/blackfin/configs/SRV1_defconfig | 88 +++ + arch/blackfin/configs/TCM-BF518_defconfig | 131 +++++ + arch/mips/configs/fuloong2e_defconfig | 3 +- + arch/mips/configs/gpr_defconfig | 3 +- + arch/mips/configs/ip22_defconfig | 3 +- + arch/mips/configs/ip28_defconfig | 3 +- + arch/mips/configs/jazz_defconfig | 3 +- + arch/mips/configs/mtx1_defconfig | 3 +- + arch/mips/configs/nlm_xlr_defconfig | 2 +- + arch/mips/configs/pic32mzda_defconfig | 2 +- + arch/mips/configs/pistachio_defconfig | 2 +- + arch/mips/configs/pnx8335_stb225_defconfig | 2 +- + arch/mips/configs/rm200_defconfig | 3 +- + arch/parisc/configs/712_defconfig | 2 +- + arch/parisc/configs/c3000_defconfig | 2 +- + arch/parisc/configs/default_defconfig | 2 +- + arch/powerpc/configs/c2k_defconfig | 389 +++++++++++++ + arch/powerpc/configs/ppc6xx_defconfig | 2 +- + arch/score/configs/spct6600_defconfig | 84 +++ + arch/sh/configs/se7712_defconfig | 2 +- + arch/sh/configs/se7721_defconfig | 2 +- + arch/sh/configs/titan_defconfig | 2 +- + arch/sparc/configs/sparc64_defconfig | 2 +- + arch/tile/configs/tilegx_defconfig | 411 ++++++++++++++ + arch/tile/configs/tilepro_defconfig | 524 ++++++++++++++++++ + arch/x86/configs/i386_defconfig | 2 +- + arch/x86/configs/x86_64_defconfig | 2 +- + kernel/Kconfig.preempt | 9 +- + 50 files changed, 4082 insertions(+), 30 deletions(-) + create mode 100644 arch/blackfin/configs/BF518F-EZBRD_defconfig + create mode 100644 arch/blackfin/configs/BF526-EZBRD_defconfig + create mode 100644 arch/blackfin/configs/BF527-EZKIT-V2_defconfig + create mode 100644 arch/blackfin/configs/BF527-EZKIT_defconfig + create mode 100644 arch/blackfin/configs/BF527-TLL6527M_defconfig + create mode 100644 arch/blackfin/configs/BF533-EZKIT_defconfig + create mode 100644 arch/blackfin/configs/BF533-STAMP_defconfig + create mode 100644 arch/blackfin/configs/BF537-STAMP_defconfig + create mode 100644 arch/blackfin/configs/BF538-EZKIT_defconfig + create mode 100644 arch/blackfin/configs/BF548-EZKIT_defconfig + create mode 100644 arch/blackfin/configs/BF561-ACVILON_defconfig + create mode 100644 arch/blackfin/configs/BF561-EZKIT-SMP_defconfig + create mode 100644 arch/blackfin/configs/BF561-EZKIT_defconfig + create mode 100644 arch/blackfin/configs/BF609-EZKIT_defconfig + create mode 100644 arch/blackfin/configs/BlackStamp_defconfig + create mode 100644 
arch/blackfin/configs/CM-BF527_defconfig + create mode 100644 arch/blackfin/configs/PNAV-10_defconfig + create mode 100644 arch/blackfin/configs/SRV1_defconfig + create mode 100644 arch/blackfin/configs/TCM-BF518_defconfig + create mode 100644 arch/powerpc/configs/c2k_defconfig + create mode 100644 arch/score/configs/spct6600_defconfig + create mode 100644 arch/tile/configs/tilegx_defconfig + create mode 100644 arch/tile/configs/tilepro_defconfig + +diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig +index a7f65313f84a..5233307bf903 100644 +--- a/arch/arc/configs/tb10x_defconfig ++++ b/arch/arc/configs/tb10x_defconfig +@@ -28,7 +28,7 @@ CONFIG_ARC_PLAT_TB10X=y + CONFIG_ARC_CACHE_LINE_SHIFT=5 + CONFIG_HZ=250 + CONFIG_ARC_BUILTIN_DTB_NAME="abilis_tb100_dvk" +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_COMPACTION is not set + CONFIG_NET=y + CONFIG_PACKET=y +diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig +index e9bc88937b1e..73cde48ad00f 100644 +--- a/arch/arm/configs/bcm2835_defconfig ++++ b/arch/arm/configs/bcm2835_defconfig +@@ -29,7 +29,7 @@ CONFIG_MODULE_UNLOAD=y + CONFIG_ARCH_MULTI_V6=y + CONFIG_ARCH_BCM=y + CONFIG_ARCH_BCM2835=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_AEABI=y + CONFIG_KSM=y + CONFIG_CLEANCACHE=y +diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig +index 7eca43ff69bb..689095192133 100644 +--- a/arch/arm/configs/imx_v6_v7_defconfig ++++ b/arch/arm/configs/imx_v6_v7_defconfig +@@ -48,7 +48,7 @@ CONFIG_PCI_MSI=y + CONFIG_PCI_IMX6=y + CONFIG_SMP=y + CONFIG_ARM_PSCI=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_HIGHMEM=y + CONFIG_FORCE_MAX_ZONEORDER=14 + CONFIG_CMDLINE="noinitrd console=ttymxc0,115200" +diff --git a/arch/arm/configs/mps2_defconfig b/arch/arm/configs/mps2_defconfig +index 0bcdec7cc169..10ceaefa51e0 100644 +--- a/arch/arm/configs/mps2_defconfig ++++ b/arch/arm/configs/mps2_defconfig +@@ -18,7 +18,7 @@ CONFIG_ARCH_MPS2=y + CONFIG_SET_MEM_PARAM=y + CONFIG_DRAM_BASE=0x21000000 + CONFIG_DRAM_SIZE=0x1000000 +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_ATAGS is not set + CONFIG_ZBOOT_ROM_TEXT=0x0 + CONFIG_ZBOOT_ROM_BSS=0x0 +diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig +index 7b8212857535..6c1b8a1d9d59 100644 +--- a/arch/arm/configs/mxs_defconfig ++++ b/arch/arm/configs/mxs_defconfig +@@ -26,7 +26,7 @@ CONFIG_BLK_DEV_INTEGRITY=y + # CONFIG_ARCH_MULTI_V7 is not set + CONFIG_ARCH_MXS=y + # CONFIG_ARM_THUMB is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_AEABI=y + CONFIG_NET=y + CONFIG_PACKET=y +diff --git a/arch/blackfin/configs/BF518F-EZBRD_defconfig b/arch/blackfin/configs/BF518F-EZBRD_defconfig +new file mode 100644 +index 000000000000..39b91dfa55b5 +--- /dev/null ++++ b/arch/blackfin/configs/BF518F-EZBRD_defconfig +@@ -0,0 +1,121 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF518=y ++CONFIG_IRQ_TIMER0=12 ++# 
CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_JEDECPROBE=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++CONFIG_NET_BFIN=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_USB_SUPPORT is not set ++CONFIG_MMC=y ++CONFIG_SDH_BFIN=y ++CONFIG_SDH_BFIN_MISSING_CMD_PULLUP_WORKAROUND=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=m ++# CONFIG_DNOTIFY is not set ++CONFIG_VFAT_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC_CCITT=m +diff --git a/arch/blackfin/configs/BF526-EZBRD_defconfig b/arch/blackfin/configs/BF526-EZBRD_defconfig +new file mode 100644 +index 000000000000..675cadb3a0c4 +--- /dev/null ++++ b/arch/blackfin/configs/BF526-EZBRD_defconfig +@@ -0,0 +1,158 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF526=y 
++CONFIG_IRQ_TIMER0=12 ++CONFIG_BFIN526_EZBRD=y ++CONFIG_IRQ_USB_INT0=11 ++CONFIG_IRQ_USB_INT1=11 ++CONFIG_IRQ_USB_INT2=11 ++CONFIG_IRQ_USB_DMA=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_PHYSMAP=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_NAND=m ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_SCSI=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++CONFIG_BLK_DEV_SR=m ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_BFIN=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT_FF_MEMLESS=m ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_HID_A4TECH=y ++CONFIG_HID_APPLE=y ++CONFIG_HID_BELKIN=y ++CONFIG_HID_CHERRY=y ++CONFIG_HID_CHICONY=y ++CONFIG_HID_CYPRESS=y ++CONFIG_HID_EZKEY=y ++CONFIG_HID_GYRATION=y ++CONFIG_HID_LOGITECH=y ++CONFIG_HID_MICROSOFT=y ++CONFIG_HID_MONTEREY=y ++CONFIG_HID_PANTHERLORD=y ++CONFIG_HID_PETALYNX=y ++CONFIG_HID_SAMSUNG=y ++CONFIG_HID_SONY=y ++CONFIG_HID_SUNPLUS=y ++CONFIG_USB=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_OTG_BLACKLIST_HUB=y ++CONFIG_USB_MON=y ++CONFIG_USB_STORAGE=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=m ++# CONFIG_DNOTIFY is not set ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_VFAT_FS=m ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# 
CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC_CCITT=m +diff --git a/arch/blackfin/configs/BF527-EZKIT-V2_defconfig b/arch/blackfin/configs/BF527-EZKIT-V2_defconfig +new file mode 100644 +index 000000000000..4c517c443af5 +--- /dev/null ++++ b/arch/blackfin/configs/BF527-EZKIT-V2_defconfig +@@ -0,0 +1,188 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF527=y ++CONFIG_BF_REV_0_2=y ++CONFIG_BFIN527_EZKIT_V2=y ++CONFIG_IRQ_USB_INT0=11 ++CONFIG_IRQ_USB_INT1=11 ++CONFIG_IRQ_USB_INT2=11 ++CONFIG_IRQ_USB_DMA=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++CONFIG_BFIN_SIR0=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_JEDECPROBE=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_NAND=m ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_SCSI=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++CONFIG_BLK_DEV_SR=m ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_BFIN=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT_FF_MEMLESS=m ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=y ++CONFIG_KEYBOARD_ADP5520=y ++# CONFIG_KEYBOARD_ATKBD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_TOUCHSCREEN=y ++CONFIG_TOUCHSCREEN_AD7879=y ++CONFIG_TOUCHSCREEN_AD7879_I2C=y ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_PMIC_ADP5520=y 
++CONFIG_FB=y ++CONFIG_FB_BFIN_LQ035Q1=y ++CONFIG_BACKLIGHT_LCD_SUPPORT=y ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++# CONFIG_LOGO_BLACKFIN_VGA16 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_SOC=y ++CONFIG_SND_BF5XX_I2S=y ++CONFIG_SND_BF5XX_SOC_SSM2602=y ++CONFIG_HID_A4TECH=y ++CONFIG_HID_APPLE=y ++CONFIG_HID_BELKIN=y ++CONFIG_HID_CHERRY=y ++CONFIG_HID_CHICONY=y ++CONFIG_HID_CYPRESS=y ++CONFIG_HID_EZKEY=y ++CONFIG_HID_GYRATION=y ++CONFIG_HID_LOGITECH=y ++CONFIG_HID_MICROSOFT=y ++CONFIG_HID_MONTEREY=y ++CONFIG_HID_PANTHERLORD=y ++CONFIG_HID_PETALYNX=y ++CONFIG_HID_SAMSUNG=y ++CONFIG_HID_SONY=y ++CONFIG_HID_SUNPLUS=y ++CONFIG_USB=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_OTG_BLACKLIST_HUB=y ++CONFIG_USB_MON=y ++CONFIG_USB_MUSB_HDRC=y ++CONFIG_USB_MUSB_BLACKFIN=y ++CONFIG_USB_STORAGE=y ++CONFIG_USB_GADGET=y ++CONFIG_NEW_LEDS=y ++CONFIG_LEDS_CLASS=y ++CONFIG_LEDS_ADP5520=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=m ++# CONFIG_DNOTIFY is not set ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_UDF_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF527-EZKIT_defconfig b/arch/blackfin/configs/BF527-EZKIT_defconfig +new file mode 100644 +index 000000000000..bf8df3e6cf02 +--- /dev/null ++++ b/arch/blackfin/configs/BF527-EZKIT_defconfig +@@ -0,0 +1,181 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF527=y ++CONFIG_BF_REV_0_1=y ++CONFIG_IRQ_USB_INT0=11 ++CONFIG_IRQ_USB_INT1=11 ++CONFIG_IRQ_USB_INT2=11 ++CONFIG_IRQ_USB_DMA=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++CONFIG_BFIN_SIR0=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y 
++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_JEDECPROBE=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_NAND=m ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_SCSI=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++CONFIG_BLK_DEV_SR=m ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_BFIN=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT_FF_MEMLESS=m ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_FB=y ++CONFIG_FB_BFIN_T350MCQB=y ++CONFIG_BACKLIGHT_LCD_SUPPORT=y ++CONFIG_LCD_LTV350QV=m ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++# CONFIG_LOGO_BLACKFIN_VGA16 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_SOC=y ++CONFIG_SND_BF5XX_I2S=y ++CONFIG_SND_BF5XX_SOC_SSM2602=y ++CONFIG_HID_A4TECH=y ++CONFIG_HID_APPLE=y ++CONFIG_HID_BELKIN=y ++CONFIG_HID_CHERRY=y ++CONFIG_HID_CHICONY=y ++CONFIG_HID_CYPRESS=y ++CONFIG_HID_EZKEY=y ++CONFIG_HID_GYRATION=y ++CONFIG_HID_LOGITECH=y ++CONFIG_HID_MICROSOFT=y ++CONFIG_HID_MONTEREY=y ++CONFIG_HID_PANTHERLORD=y ++CONFIG_HID_PETALYNX=y ++CONFIG_HID_SAMSUNG=y ++CONFIG_HID_SONY=y ++CONFIG_HID_SUNPLUS=y ++CONFIG_USB=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_OTG_BLACKLIST_HUB=y ++CONFIG_USB_MON=y ++CONFIG_USB_MUSB_HDRC=y ++CONFIG_MUSB_PIO_ONLY=y ++CONFIG_USB_MUSB_BLACKFIN=y ++CONFIG_MUSB_PIO_ONLY=y ++CONFIG_USB_STORAGE=y ++CONFIG_USB_GADGET=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=m ++# CONFIG_DNOTIFY is not set ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_UDF_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF527-TLL6527M_defconfig b/arch/blackfin/configs/BF527-TLL6527M_defconfig +new file mode 100644 +index 000000000000..0220b3b15c53 +--- /dev/null ++++ b/arch/blackfin/configs/BF527-TLL6527M_defconfig +@@ -0,0 +1,178 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_LOCALVERSION="DEV_0-1_pre2010" ++CONFIG_SYSVIPC=y 
++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++CONFIG_PREEMPT=y ++CONFIG_BF527=y ++CONFIG_BF_REV_0_2=y ++CONFIG_BFIN527_TLL6527M=y ++CONFIG_BF527_UART1_PORTG=y ++CONFIG_IRQ_USB_INT0=11 ++CONFIG_IRQ_USB_INT1=11 ++CONFIG_IRQ_USB_INT2=11 ++CONFIG_IRQ_USB_DMA=11 ++CONFIG_BOOT_LOAD=0x400000 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=y ++CONFIG_DMA_UNCACHED_2M=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_0=0xFFC2 ++CONFIG_BANK_1=0xFFC2 ++CONFIG_BANK_2=0xFFC2 ++CONFIG_BANK_3=0xFFC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++CONFIG_BFIN_SIR0=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_GPIO_ADDR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_SCSI=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++CONFIG_BLK_DEV_SR=m ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=y ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_TOUCHSCREEN=y ++CONFIG_TOUCHSCREEN_AD7879=m ++CONFIG_INPUT_MISC=y ++CONFIG_INPUT_AD714X=y ++CONFIG_INPUT_ADXL34X=y ++# CONFIG_SERIO is not set ++CONFIG_BFIN_PPI=m ++CONFIG_BFIN_SIMPLE_TIMER=m ++CONFIG_BFIN_SPORT=m ++# CONFIG_CONSOLE_TRANSLATIONS is not set ++# CONFIG_DEVKMEM is not set ++CONFIG_BFIN_JTAG_COMM=m ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_LEGACY_PTYS is not set ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C_CHARDEV=y ++# CONFIG_I2C_HELPER_AUTO is not set ++CONFIG_I2C_SMBUS=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_MEDIA_SUPPORT=y ++CONFIG_VIDEO_DEV=y ++# CONFIG_MEDIA_TUNER_CUSTOMISE is not set ++CONFIG_VIDEO_HELPER_CHIPS_AUTO=y ++CONFIG_VIDEO_BLACKFIN_CAM=m ++CONFIG_OV9655=y ++CONFIG_FB=y ++CONFIG_BACKLIGHT_LCD_SUPPORT=y ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_FONTS=y ++CONFIG_FONT_6x11=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++# 
CONFIG_LOGO_BLACKFIN_VGA16 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_MIXER_OSS=y ++CONFIG_SND_PCM_OSS=y ++CONFIG_SND_SOC=y ++CONFIG_SND_BF5XX_I2S=y ++CONFIG_SND_BF5XX_SOC_SSM2602=y ++# CONFIG_HID_SUPPORT is not set ++# CONFIG_USB_SUPPORT is not set ++CONFIG_MMC=m ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=y ++# CONFIG_DNOTIFY is not set ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_UDF_FS=m ++CONFIG_MSDOS_FS=y ++CONFIG_VFAT_FS=y ++CONFIG_JFFS2_FS=y ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++# CONFIG_RPCSEC_GSS_KRB5 is not set ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC7=m +diff --git a/arch/blackfin/configs/BF533-EZKIT_defconfig b/arch/blackfin/configs/BF533-EZKIT_defconfig +new file mode 100644 +index 000000000000..6023e3fd2c48 +--- /dev/null ++++ b/arch/blackfin/configs/BF533-EZKIT_defconfig +@@ -0,0 +1,114 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BFIN533_EZKIT=y ++CONFIG_TIMER0=11 ++CONFIG_CLKIN_HZ=27000000 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_JEDECPROBE=y ++CONFIG_MTD_CFI_AMDSTD=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_PHYSMAP=y ++CONFIG_MTD_PLATRAM=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++CONFIG_SMC91X=y ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT=m ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set 
++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_USB_SUPPORT is not set ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF533-STAMP_defconfig b/arch/blackfin/configs/BF533-STAMP_defconfig +new file mode 100644 +index 000000000000..f5cd0f18b711 +--- /dev/null ++++ b/arch/blackfin/configs/BF533-STAMP_defconfig +@@ -0,0 +1,124 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_TIMER0=11 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=m ++CONFIG_MTD_CFI_AMDSTD=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++CONFIG_SMC91X=y ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=m ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_GPIO=m ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y 
++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_FB=m ++CONFIG_FIRMWARE_EDID=y ++CONFIG_SOUND=m ++CONFIG_SND=m ++CONFIG_SND_MIXER_OSS=m ++CONFIG_SND_PCM_OSS=m ++CONFIG_SND_SOC=m ++CONFIG_SND_BF5XX_I2S=m ++CONFIG_SND_BF5XX_SOC_AD73311=m ++# CONFIG_USB_SUPPORT is not set ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF537-STAMP_defconfig b/arch/blackfin/configs/BF537-STAMP_defconfig +new file mode 100644 +index 000000000000..48085fde7f9e +--- /dev/null ++++ b/arch/blackfin/configs/BF537-STAMP_defconfig +@@ -0,0 +1,136 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF537=y ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_CAN=m ++CONFIG_CAN_RAW=m ++CONFIG_CAN_BCM=m ++CONFIG_CAN_BFIN=m ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++CONFIG_BFIN_SIR1=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=m ++CONFIG_MTD_CFI_AMDSTD=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_PHYSMAP=m ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++CONFIG_NET_BFIN=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set 
++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=m ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_BLACKFIN_TWI=m ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_FB=m ++CONFIG_FIRMWARE_EDID=y ++CONFIG_BACKLIGHT_LCD_SUPPORT=y ++CONFIG_SOUND=m ++CONFIG_SND=m ++CONFIG_SND_MIXER_OSS=m ++CONFIG_SND_PCM_OSS=m ++CONFIG_SND_SOC=m ++CONFIG_SND_BF5XX_I2S=m ++CONFIG_SND_BF5XX_SOC_AD73311=m ++# CONFIG_USB_SUPPORT is not set ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF538-EZKIT_defconfig b/arch/blackfin/configs/BF538-EZKIT_defconfig +new file mode 100644 +index 000000000000..12deeaaef3cb +--- /dev/null ++++ b/arch/blackfin/configs/BF538-EZKIT_defconfig +@@ -0,0 +1,133 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF538=y ++CONFIG_IRQ_TIMER0=12 ++CONFIG_IRQ_TIMER1=12 ++CONFIG_IRQ_TIMER2=12 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_PM=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_CAN=m ++CONFIG_CAN_RAW=m ++CONFIG_CAN_BCM=m ++CONFIG_CAN_DEV=m ++CONFIG_CAN_BFIN=m ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=m ++CONFIG_MTD_CFI_AMDSTD=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_PHYSMAP=m ++CONFIG_MTD_NAND=m ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++CONFIG_PHYLIB=y ++CONFIG_SMSC_PHY=y ++CONFIG_NET_ETHERNET=y ++CONFIG_SMC91X=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_TOUCHSCREEN=y ++CONFIG_TOUCHSCREEN_AD7879=y 
++CONFIG_TOUCHSCREEN_AD7879_SPI=y ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_DEVKMEM is not set ++CONFIG_BFIN_JTAG_COMM=m ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++CONFIG_SERIAL_BFIN_UART1=y ++CONFIG_SERIAL_BFIN_UART2=y ++# CONFIG_LEGACY_PTYS is not set ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=m ++CONFIG_I2C_BLACKFIN_TWI=m ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_FB=m ++CONFIG_FB_BFIN_LQ035Q1=m ++# CONFIG_USB_SUPPORT is not set ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_SMB_FS=m ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF548-EZKIT_defconfig b/arch/blackfin/configs/BF548-EZKIT_defconfig +new file mode 100644 +index 000000000000..6a68ffc55b5a +--- /dev/null ++++ b/arch/blackfin/configs/BF548-EZKIT_defconfig +@@ -0,0 +1,207 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF548_std=y ++CONFIG_IRQ_TIMER0=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_CACHELINE_ALIGNED_L1=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_DMA_UNCACHED_2M=y ++CONFIG_BFIN_EXTMEM_WRITETHROUGH=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_EBIU_MBSCTLVAL=0x0 ++CONFIG_EBIU_MODEVAL=0x1 ++CONFIG_EBIU_FCTLVAL=0x6 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_CAN=m ++CONFIG_CAN_RAW=m ++CONFIG_CAN_BCM=m ++CONFIG_CAN_BFIN=m ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++CONFIG_BFIN_SIR3=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++CONFIG_FW_LOADER=m ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_PHYSMAP=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_NAND=y ++CONFIG_MTD_NAND_BF5XX=y ++# CONFIG_MTD_NAND_BF5XX_HWECC is not set ++CONFIG_MTD_SPI_NOR=y 
++CONFIG_BLK_DEV_RAM=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++CONFIG_BLK_DEV_SR=m ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_ATA=y ++# CONFIG_SATA_PMP is not set ++CONFIG_PATA_BF54X=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++CONFIG_SMSC911X=y ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT_FF_MEMLESS=m ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++CONFIG_INPUT_EVBUG=m ++# CONFIG_KEYBOARD_ATKBD is not set ++CONFIG_KEYBOARD_BFIN=y ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_TOUCHSCREEN=y ++CONFIG_TOUCHSCREEN_AD7877=m ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_FB=y ++CONFIG_FIRMWARE_EDID=y ++CONFIG_FB_BF54X_LQ043=y ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_FONTS=y ++CONFIG_FONT_6x11=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++# CONFIG_LOGO_BLACKFIN_VGA16 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_MIXER_OSS=y ++CONFIG_SND_PCM_OSS=y ++CONFIG_SND_SOC=y ++CONFIG_SND_BF5XX_AC97=y ++CONFIG_SND_BF5XX_SOC_AD1980=y ++CONFIG_HID_A4TECH=y ++CONFIG_HID_APPLE=y ++CONFIG_HID_BELKIN=y ++CONFIG_HID_CHERRY=y ++CONFIG_HID_CHICONY=y ++CONFIG_HID_CYPRESS=y ++CONFIG_HID_EZKEY=y ++CONFIG_HID_GYRATION=y ++CONFIG_HID_LOGITECH=y ++CONFIG_HID_MICROSOFT=y ++CONFIG_HID_MONTEREY=y ++CONFIG_HID_PANTHERLORD=y ++CONFIG_HID_PETALYNX=y ++CONFIG_HID_SAMSUNG=y ++CONFIG_HID_SONY=y ++CONFIG_HID_SUNPLUS=y ++CONFIG_USB=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_OTG_BLACKLIST_HUB=y ++CONFIG_USB_MON=y ++CONFIG_USB_MUSB_HDRC=y ++CONFIG_USB_MUSB_BLACKFIN=y ++CONFIG_USB_STORAGE=y ++CONFIG_USB_GADGET=y ++CONFIG_MMC=y ++CONFIG_MMC_BLOCK=m ++CONFIG_SDH_BFIN=y ++CONFIG_SDH_BFIN_MISSING_CMD_PULLUP_WORKAROUND=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++# CONFIG_DNOTIFY is not set ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_ZISOFS=y ++CONFIG_MSDOS_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_NTFS_FS=m ++CONFIG_NTFS_RW=y ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_NFSD=m ++CONFIG_NFSD_V3=y ++CONFIG_CIFS=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF561-ACVILON_defconfig b/arch/blackfin/configs/BF561-ACVILON_defconfig +new file mode 100644 +index 000000000000..e9f3ba783a4e +--- 
/dev/null ++++ b/arch/blackfin/configs/BF561-ACVILON_defconfig +@@ -0,0 +1,149 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_SYSFS_DEPRECATED_V2=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++CONFIG_PREEMPT=y ++CONFIG_BF561=y ++CONFIG_BF_REV_0_5=y ++CONFIG_IRQ_TIMER0=10 ++CONFIG_BFIN561_ACVILON=y ++# CONFIG_BF561_COREB is not set ++CONFIG_CLKIN_HZ=12000000 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=y ++CONFIG_DMA_UNCACHED_4M=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_0=0x99b2 ++CONFIG_BANK_1=0x3350 ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++CONFIG_SYN_COOKIES=y ++# CONFIG_INET_LRO is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_PLATRAM=y ++CONFIG_MTD_PHRAM=y ++CONFIG_MTD_BLOCK2MTD=y ++CONFIG_MTD_NAND=y ++CONFIG_MTD_NAND_PLATFORM=y ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_COUNT=2 ++CONFIG_BLK_DEV_RAM_SIZE=16384 ++CONFIG_SCSI=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_SMSC911X=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_PIO=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_PCA_PLATFORM=y ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_SPI_SPIDEV=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++CONFIG_GPIO_PCF857X=y ++CONFIG_SENSORS_LM75=y ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_MIXER_OSS=y ++CONFIG_SND_PCM_OSS=y ++# CONFIG_SND_DRIVERS is not set ++# CONFIG_SND_USB is not set ++CONFIG_SND_SOC=y ++CONFIG_SND_BF5XX_I2S=y ++CONFIG_SND_BF5XX_SPORT_NUM=1 ++CONFIG_USB=y ++CONFIG_USB_ANNOUNCE_NEW_DEVICES=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_MON=y ++CONFIG_USB_STORAGE=y ++CONFIG_USB_SERIAL=y ++CONFIG_USB_SERIAL_FTDI_SIO=y ++CONFIG_USB_SERIAL_PL2303=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_DS1307=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++CONFIG_EXT2_FS_POSIX_ACL=y ++CONFIG_EXT2_FS_SECURITY=y ++# CONFIG_DNOTIFY is not set ++CONFIG_MSDOS_FS=y ++CONFIG_VFAT_FS=y ++CONFIG_FAT_DEFAULT_CODEPAGE=866 ++CONFIG_FAT_DEFAULT_IOCHARSET="cp1251" ++CONFIG_NTFS_FS=y ++CONFIG_CONFIGFS_FS=y ++CONFIG_JFFS2_FS=y ++CONFIG_JFFS2_COMPRESSION_OPTIONS=y ++# CONFIG_JFFS2_ZLIB is not set ++CONFIG_JFFS2_LZO=y ++# CONFIG_JFFS2_RTIME is not set ++CONFIG_JFFS2_CMODE_FAVOURLZO=y ++CONFIG_CRAMFS=y ++CONFIG_MINIX_FS=y ++CONFIG_NFS_FS=y ++CONFIG_NFS_V3=y ++CONFIG_ROOT_NFS=y ++CONFIG_NLS_DEFAULT="cp1251" ++CONFIG_NLS_CODEPAGE_866=y 
++CONFIG_NLS_CODEPAGE_1251=y ++CONFIG_NLS_KOI8_R=y ++CONFIG_NLS_UTF8=y ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++# CONFIG_DEBUG_BUGVERBOSE is not set ++CONFIG_DEBUG_INFO=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set ++CONFIG_CPLB_INFO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig b/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig +new file mode 100644 +index 000000000000..89b75a6c3fab +--- /dev/null ++++ b/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig +@@ -0,0 +1,112 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF561=y ++CONFIG_SMP=y ++CONFIG_IRQ_TIMER0=10 ++CONFIG_CLKIN_HZ=30000000 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_AMDSTD=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_PHYSMAP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++CONFIG_SMC91X=y ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT=m ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_USB_SUPPORT is not set ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y 
++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF561-EZKIT_defconfig b/arch/blackfin/configs/BF561-EZKIT_defconfig +new file mode 100644 +index 000000000000..67b3d2f419ba +--- /dev/null ++++ b/arch/blackfin/configs/BF561-EZKIT_defconfig +@@ -0,0 +1,114 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF561=y ++CONFIG_IRQ_TIMER0=10 ++CONFIG_CLKIN_HZ=30000000 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_BFIN_EXTMEM_WRITETHROUGH=y ++CONFIG_BFIN_L2_DCACHEABLE=y ++CONFIG_BFIN_L2_WRITETHROUGH=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_AMDSTD=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_PHYSMAP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++CONFIG_SMC91X=y ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT=m ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_USB_SUPPORT is not set ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF609-EZKIT_defconfig 
b/arch/blackfin/configs/BF609-EZKIT_defconfig +new file mode 100644 +index 000000000000..8cc75d4218fb +--- /dev/null ++++ b/arch/blackfin/configs/BF609-EZKIT_defconfig +@@ -0,0 +1,154 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF609=y ++CONFIG_PINT1_ASSIGN=0x01010000 ++CONFIG_PINT2_ASSIGN=0x07000101 ++CONFIG_PINT3_ASSIGN=0x02020303 ++CONFIG_IP_CHECKSUM_L1=y ++CONFIG_SYSCALL_TAB_L1=y ++CONFIG_CPLB_SWITCH_TAB_L1=y ++# CONFIG_APP_STACK_L1 is not set ++# CONFIG_BFIN_INS_LOWOVERHEAD is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_PM_BFIN_WAKE_PE12=y ++CONFIG_PM_BFIN_WAKE_PE12_POL=1 ++CONFIG_CPU_FREQ=y ++CONFIG_CPU_FREQ_GOV_POWERSAVE=y ++CONFIG_CPU_FREQ_GOV_ONDEMAND=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++CONFIG_IP_PNP_DHCP=y ++CONFIG_IP_PNP_BOOTP=y ++CONFIG_IP_PNP_RARP=y ++# CONFIG_IPV6 is not set ++CONFIG_NETFILTER=y ++CONFIG_CAN=y ++CONFIG_CAN_BFIN=y ++CONFIG_IRDA=y ++CONFIG_IRTTY_SIR=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++CONFIG_FW_LOADER=m ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_CFI_STAA=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_PHYSMAP=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_SPI_NOR=y ++CONFIG_MTD_UBI=m ++CONFIG_SCSI=y ++CONFIG_BLK_DEV_SD=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++CONFIG_STMMAC_ETH=y ++CONFIG_STMMAC_IEEE1588=y ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=y ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++CONFIG_INPUT_BFIN_ROTARY=y ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_SIMPLE_TIMER=m ++# CONFIG_BFIN_CRC is not set ++CONFIG_BFIN_LINKPORT=y ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_ADI_V3=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++CONFIG_PINCTRL_MCP23S08=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_SOUND=m ++CONFIG_SND=m ++CONFIG_SND_MIXER_OSS=m ++CONFIG_SND_PCM_OSS=m ++# CONFIG_SND_DRIVERS is not set ++# CONFIG_SND_SPI is not set ++# CONFIG_SND_USB is not set ++CONFIG_SND_SOC=m ++CONFIG_USB=y ++CONFIG_USB_MUSB_HDRC=y ++CONFIG_USB_MUSB_BLACKFIN=m ++CONFIG_USB_STORAGE=y ++CONFIG_USB_GADGET=y ++CONFIG_USB_GADGET_MUSB_HDRC=y ++CONFIG_USB_ZERO=y 
++CONFIG_MMC=y ++CONFIG_SDH_BFIN=y ++# CONFIG_IOMMU_SUPPORT is not set ++CONFIG_EXT2_FS=y ++# CONFIG_DNOTIFY is not set ++CONFIG_MSDOS_FS=y ++CONFIG_VFAT_FS=y ++CONFIG_JFFS2_FS=m ++CONFIG_UBIFS_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_ISO8859_1=y ++CONFIG_DEBUG_FS=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++CONFIG_FRAME_POINTER=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO_HMAC=m ++CONFIG_CRYPTO_MD4=m ++CONFIG_CRYPTO_MD5=m ++CONFIG_CRYPTO_ARC4=m ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRYPTO_DEV_BFIN_CRC=m +diff --git a/arch/blackfin/configs/BlackStamp_defconfig b/arch/blackfin/configs/BlackStamp_defconfig +new file mode 100644 +index 000000000000..9faf0ec7007f +--- /dev/null ++++ b/arch/blackfin/configs/BlackStamp_defconfig +@@ -0,0 +1,108 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_SYSFS_DEPRECATED_V2=y ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_MODULE_FORCE_UNLOAD=y ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++CONFIG_PREEMPT=y ++CONFIG_BF532=y ++CONFIG_BF_REV_0_5=y ++CONFIG_BLACKSTAMP=y ++CONFIG_TIMER0=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_ROMKERNEL=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_BINFMT_SHARED_FLAT=y ++CONFIG_PM=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_LRO is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=m ++CONFIG_MTD_CFI_AMDSTD=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_NBD=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_MISC_DEVICES=y ++CONFIG_EEPROM_AT25=y ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_SMC91X=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_HW_RANDOM=y ++CONFIG_I2C=m ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_GPIO=m ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_SPI_SPIDEV=m ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_USB_SUPPORT is not set ++CONFIG_MMC=y ++CONFIG_MMC_SPI=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_MSDOS_FS=y ++CONFIG_VFAT_FS=y ++CONFIG_JFFS2_FS=y ++CONFIG_NFS_FS=y ++CONFIG_NFS_V3=y ++CONFIG_NFS_V4=y ++CONFIG_SMB_FS=y ++CONFIG_CIFS=y ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_ASCII=y ++CONFIG_NLS_UTF8=y ++CONFIG_SYSCTL_SYSCALL_CHECK=y ++CONFIG_DEBUG_MMRS=y ++# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set 
++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_CRC_CCITT=m +diff --git a/arch/blackfin/configs/CM-BF527_defconfig b/arch/blackfin/configs/CM-BF527_defconfig +new file mode 100644 +index 000000000000..4a1ad4fd7bb2 +--- /dev/null ++++ b/arch/blackfin/configs/CM-BF527_defconfig +@@ -0,0 +1,129 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_KERNEL_LZMA=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_RD_GZIP is not set ++CONFIG_RD_LZMA=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++CONFIG_PREEMPT=y ++CONFIG_BF527=y ++CONFIG_BF_REV_0_1=y ++CONFIG_IRQ_TIMER0=12 ++CONFIG_BFIN527_BLUETECHNIX_CM=y ++CONFIG_IRQ_USB_INT0=11 ++CONFIG_IRQ_USB_INT1=11 ++CONFIG_IRQ_USB_INT2=11 ++CONFIG_IRQ_USB_DMA=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xFFC0 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_GPIO_ADDR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_SCSI=y ++CONFIG_BLK_DEV_SD=y ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_LEGACY_PTYS is not set ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_BLACKFIN_TWI=m ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_USB=m ++CONFIG_USB_ANNOUNCE_NEW_DEVICES=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_OTG_BLACKLIST_HUB=y ++CONFIG_USB_MON=m ++CONFIG_USB_MUSB_HDRC=m ++CONFIG_USB_MUSB_PERIPHERAL=y ++CONFIG_USB_GADGET_MUSB_HDRC=y ++CONFIG_MUSB_PIO_ONLY=y ++CONFIG_USB_STORAGE=m ++CONFIG_USB_GADGET=m ++CONFIG_USB_ETH=m ++CONFIG_USB_MASS_STORAGE=m ++CONFIG_USB_G_SERIAL=m ++CONFIG_USB_G_PRINTER=m ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_MSDOS_FS=y ++CONFIG_VFAT_FS=y ++CONFIG_JFFS2_FS=y ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_SMB_FS=m ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_ISO8859_1=y ++CONFIG_DEBUG_FS=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not 
set ++CONFIG_EARLY_PRINTK=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC_CCITT=m ++CONFIG_CRC_ITU_T=y ++CONFIG_CRC7=y +diff --git a/arch/blackfin/configs/PNAV-10_defconfig b/arch/blackfin/configs/PNAV-10_defconfig +new file mode 100644 +index 000000000000..9d787e28bbe8 +--- /dev/null ++++ b/arch/blackfin/configs/PNAV-10_defconfig +@@ -0,0 +1,111 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_LOG_BUF_SHIFT=14 ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF537=y ++CONFIG_IRQ_TIMER0=12 ++CONFIG_PNAV10=y ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++CONFIG_IP_CHECKSUM_L1=y ++CONFIG_SYSCALL_TAB_L1=y ++CONFIG_CPLB_SWITCH_TAB_L1=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_1=0x33B0 ++CONFIG_BANK_2=0x33B0 ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_UCLINUX=y ++CONFIG_MTD_NAND=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_BFIN_MAC=y ++# CONFIG_BFIN_MAC_USE_L1 is not set ++CONFIG_BFIN_TX_DESC_NUM=100 ++CONFIG_BFIN_RX_DESC_NUM=100 ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=y ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_TOUCHSCREEN=y ++CONFIG_TOUCHSCREEN_AD7877=y ++CONFIG_INPUT_MISC=y ++CONFIG_INPUT_UINPUT=y ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_HW_RANDOM=y ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_FB=y ++CONFIG_FIRMWARE_EDID=y ++CONFIG_BACKLIGHT_LCD_SUPPORT=y ++CONFIG_LCD_CLASS_DEVICE=y ++CONFIG_BACKLIGHT_CLASS_DEVICE=y ++CONFIG_SOUND=y ++CONFIG_SND=m ++# CONFIG_SND_SUPPORT_OLD_API is not set ++# CONFIG_SND_VERBOSE_PROCFS is not set ++CONFIG_SOUND_PRIME=y ++# CONFIG_HID is not set ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++# CONFIG_DNOTIFY is not set ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_SMB_FS=m ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_DEBUG_HUNT_FOR_ZERO is not set ++# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set ++# CONFIG_ACCESS_CHECK is not set ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC_CCITT=m +diff --git a/arch/blackfin/configs/SRV1_defconfig b/arch/blackfin/configs/SRV1_defconfig +new file mode 100644 +index 000000000000..225df32dc9a8 +--- 
/dev/null ++++ b/arch/blackfin/configs/SRV1_defconfig +@@ -0,0 +1,88 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++CONFIG_KALLSYMS_ALL=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_IOSCHED_DEADLINE is not set ++CONFIG_PREEMPT=y ++CONFIG_BF537=y ++CONFIG_IRQ_TIMER0=12 ++CONFIG_BOOT_LOAD=0x400000 ++CONFIG_CLKIN_HZ=22118400 ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_DMA_UNCACHED_2M=y ++CONFIG_C_CDPRIO=y ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_PM=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++# CONFIG_WIRELESS is not set ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_JEDECPROBE=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_UCLINUX=y ++CONFIG_MTD_NAND=m ++CONFIG_BLK_DEV_RAM=y ++CONFIG_MISC_DEVICES=y ++CONFIG_EEPROM_AT25=m ++CONFIG_NETDEVICES=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++CONFIG_INPUT_UINPUT=y ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_HWMON=m ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_HID is not set ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_SMB_FS=m ++CONFIG_DEBUG_KERNEL=y ++# CONFIG_DEBUG_BUGVERBOSE is not set ++CONFIG_DEBUG_INFO=y ++# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set ++CONFIG_CPLB_INFO=y +diff --git a/arch/blackfin/configs/TCM-BF518_defconfig b/arch/blackfin/configs/TCM-BF518_defconfig +new file mode 100644 +index 000000000000..425c24e43c34 +--- /dev/null ++++ b/arch/blackfin/configs/TCM-BF518_defconfig +@@ -0,0 +1,131 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_KERNEL_LZMA=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_RD_GZIP is not set ++CONFIG_RD_LZMA=y ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF518=y ++CONFIG_BF_REV_0_1=y ++CONFIG_BFIN518F_TCM=y ++CONFIG_IRQ_TIMER0=12 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y 
++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_ADV_OPTIONS=y ++CONFIG_MTD_CFI_GEOMETRY=y ++# CONFIG_MTD_MAP_BANK_WIDTH_1 is not set ++# CONFIG_MTD_MAP_BANK_WIDTH_4 is not set ++# CONFIG_MTD_CFI_I2 is not set ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_PHYSMAP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_DEVKMEM is not set ++CONFIG_BFIN_JTAG_COMM=m ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++# CONFIG_LEGACY_PTYS is not set ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_HID_SUPPORT is not set ++# CONFIG_USB_SUPPORT is not set ++CONFIG_MMC=y ++CONFIG_MMC_DEBUG=y ++CONFIG_MMC_SPI=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=y ++# CONFIG_DNOTIFY is not set ++CONFIG_VFAT_FS=m ++# CONFIG_MISC_FILESYSTEMS is not set ++CONFIG_NFS_FS=y ++CONFIG_NFS_V3=y ++CONFIG_ROOT_NFS=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC_CCITT=m +diff --git a/arch/mips/configs/fuloong2e_defconfig b/arch/mips/configs/fuloong2e_defconfig +index 499f51498ecb..f7cb39b0662c 100644 +--- a/arch/mips/configs/fuloong2e_defconfig ++++ b/arch/mips/configs/fuloong2e_defconfig +@@ -2,7 +2,8 @@ CONFIG_MACH_LOONGSON64=y + CONFIG_64BIT=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y + CONFIG_LOCALVERSION="-fuloong2e" + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y +diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig +index 55438fc9991e..db03ef4f737d 100644 +--- a/arch/mips/configs/gpr_defconfig ++++ b/arch/mips/configs/gpr_defconfig +@@ -1,7 +1,8 @@ + CONFIG_MIPS_ALCHEMY=y + CONFIG_MIPS_GPR=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y +diff --git a/arch/mips/configs/ip22_defconfig b/arch/mips/configs/ip22_defconfig +index 7ddfb4ef9479..93e439ad3fce 100644 +--- 
a/arch/mips/configs/ip22_defconfig ++++ b/arch/mips/configs/ip22_defconfig +@@ -4,7 +4,8 @@ CONFIG_CPU_R5000=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y + CONFIG_HZ_1000=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y + CONFIG_SYSVIPC=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/mips/configs/ip28_defconfig b/arch/mips/configs/ip28_defconfig +index d0a4c2cfacf8..6f0600e99c25 100644 +--- a/arch/mips/configs/ip28_defconfig ++++ b/arch/mips/configs/ip28_defconfig +@@ -1,6 +1,7 @@ + CONFIG_SGI_IP28=y + CONFIG_ARC_CONSOLE=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y + CONFIG_SYSVIPC=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/mips/configs/jazz_defconfig b/arch/mips/configs/jazz_defconfig +index 9ad1c94376c8..1d62ce7ff5dc 100644 +--- a/arch/mips/configs/jazz_defconfig ++++ b/arch/mips/configs/jazz_defconfig +@@ -1,6 +1,7 @@ + CONFIG_MACH_JAZZ=y + CONFIG_OLIVETTI_M700=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_BSD_PROCESS_ACCT=y +diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig +index c3d0d0a6e044..aa3426d5f7d7 100644 +--- a/arch/mips/configs/mtx1_defconfig ++++ b/arch/mips/configs/mtx1_defconfig +@@ -1,6 +1,7 @@ + CONFIG_MIPS_ALCHEMY=y + CONFIG_MIPS_MTX1=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y +diff --git a/arch/mips/configs/nlm_xlr_defconfig b/arch/mips/configs/nlm_xlr_defconfig +index c4477a4d40c1..95caf0af665f 100644 +--- a/arch/mips/configs/nlm_xlr_defconfig ++++ b/arch/mips/configs/nlm_xlr_defconfig +@@ -5,7 +5,7 @@ CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 + CONFIG_SMP=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_KEXEC=y + CONFIG_CROSS_COMPILE="" + # CONFIG_LOCALVERSION_AUTO is not set +diff --git a/arch/mips/configs/pic32mzda_defconfig b/arch/mips/configs/pic32mzda_defconfig +index 41190c2036e6..3728897ab2b2 100644 +--- a/arch/mips/configs/pic32mzda_defconfig ++++ b/arch/mips/configs/pic32mzda_defconfig +@@ -1,7 +1,7 @@ + CONFIG_MACH_PIC32=y + CONFIG_DTB_PIC32_MZDA_SK=y + CONFIG_HZ_100=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_SECCOMP is not set + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y +diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig +index b22a3cf149b6..cfffca3d37f4 100644 +--- a/arch/mips/configs/pistachio_defconfig ++++ b/arch/mips/configs/pistachio_defconfig +@@ -5,7 +5,7 @@ CONFIG_MIPS_CPS=y + CONFIG_DEFAULT_MMAP_MIN_ADDR=32768 + CONFIG_ZSMALLOC=y + CONFIG_NR_CPUS=4 +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_DEFAULT_HOSTNAME="localhost" + CONFIG_SYSVIPC=y +diff --git a/arch/mips/configs/pnx8335_stb225_defconfig b/arch/mips/configs/pnx8335_stb225_defconfig +index e73cdb08fc6e..dc62fa8d6065 100644 +--- a/arch/mips/configs/pnx8335_stb225_defconfig ++++ b/arch/mips/configs/pnx8335_stb225_defconfig +@@ -3,7 +3,7 @@ CONFIG_CPU_LITTLE_ENDIAN=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y + CONFIG_HZ_128=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_SECCOMP is not set + # CONFIG_LOCALVERSION_AUTO is not set + # CONFIG_SWAP is not set +diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig +index 5f71aa598b06..767f1999ead0 100644 +--- 
a/arch/mips/configs/rm200_defconfig ++++ b/arch/mips/configs/rm200_defconfig +@@ -2,7 +2,8 @@ CONFIG_SNI_RM=y + CONFIG_CPU_LITTLE_ENDIAN=y + CONFIG_ARC_CONSOLE=y + CONFIG_HZ_1000=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_BSD_PROCESS_ACCT=y +diff --git a/arch/parisc/configs/712_defconfig b/arch/parisc/configs/712_defconfig +index ccc109761f44..a6a5b0b7a9c9 100644 +--- a/arch/parisc/configs/712_defconfig ++++ b/arch/parisc/configs/712_defconfig +@@ -13,7 +13,7 @@ CONFIG_MODULES=y + CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + CONFIG_PA7100LC=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_GSC_LASI=y + # CONFIG_PDC_CHASSIS is not set + CONFIG_BINFMT_MISC=m +diff --git a/arch/parisc/configs/c3000_defconfig b/arch/parisc/configs/c3000_defconfig +index 8d41a73bd71b..b8e0a6662ff9 100644 +--- a/arch/parisc/configs/c3000_defconfig ++++ b/arch/parisc/configs/c3000_defconfig +@@ -13,7 +13,7 @@ CONFIG_MODULES=y + CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + CONFIG_PA8X00=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_GSC is not set + CONFIG_PCI=y + CONFIG_PCI_LBA=y +diff --git a/arch/parisc/configs/default_defconfig b/arch/parisc/configs/default_defconfig +index 52c9050a7c5c..8d86d2e989f4 100644 +--- a/arch/parisc/configs/default_defconfig ++++ b/arch/parisc/configs/default_defconfig +@@ -14,7 +14,7 @@ CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + CONFIG_PA7100LC=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_IOMMU_CCIO=y + CONFIG_GSC_LASI=y + CONFIG_GSC_WAX=y +diff --git a/arch/powerpc/configs/c2k_defconfig b/arch/powerpc/configs/c2k_defconfig +new file mode 100644 +index 000000000000..04fee07ea6c5 +--- /dev/null ++++ b/arch/powerpc/configs/c2k_defconfig +@@ -0,0 +1,389 @@ ++CONFIG_SYSVIPC=y ++CONFIG_POSIX_MQUEUE=y ++CONFIG_AUDIT=y ++CONFIG_BSD_PROCESS_ACCT=y ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_PROFILING=y ++CONFIG_OPROFILE=m ++CONFIG_KPROBES=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_MODVERSIONS=y ++CONFIG_PARTITION_ADVANCED=y ++CONFIG_OSF_PARTITION=y ++CONFIG_MAC_PARTITION=y ++CONFIG_BSD_DISKLABEL=y ++CONFIG_MINIX_SUBPARTITION=y ++CONFIG_SOLARIS_X86_PARTITION=y ++CONFIG_UNIXWARE_DISKLABEL=y ++CONFIG_SGI_PARTITION=y ++CONFIG_SUN_PARTITION=y ++# CONFIG_PPC_CHRP is not set ++# CONFIG_PPC_PMAC is not set ++CONFIG_EMBEDDED6xx=y ++CONFIG_PPC_C2K=y ++CONFIG_CPU_FREQ=y ++CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y ++CONFIG_CPU_FREQ_GOV_PERFORMANCE=y ++CONFIG_CPU_FREQ_GOV_POWERSAVE=m ++CONFIG_CPU_FREQ_GOV_ONDEMAND=m ++CONFIG_GEN_RTC=y ++CONFIG_HIGHMEM=y ++CONFIG_PREEMPT=y ++CONFIG_BINFMT_MISC=y ++CONFIG_PM=y ++CONFIG_PCI_MSI=y ++CONFIG_HOTPLUG_PCI=y ++CONFIG_HOTPLUG_PCI_SHPC=m ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_XFRM_USER=y ++CONFIG_NET_KEY=m ++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_IP_ADVANCED_ROUTER=y ++CONFIG_IP_MULTIPLE_TABLES=y ++CONFIG_IP_ROUTE_MULTIPATH=y ++CONFIG_IP_ROUTE_VERBOSE=y ++CONFIG_IP_PNP=y ++CONFIG_IP_PNP_DHCP=y ++CONFIG_NET_IPIP=m ++CONFIG_IP_MROUTE=y ++CONFIG_IP_PIMSM_V1=y ++CONFIG_IP_PIMSM_V2=y ++CONFIG_SYN_COOKIES=y ++CONFIG_INET_AH=m ++CONFIG_INET_ESP=m ++CONFIG_INET_IPCOMP=m ++CONFIG_INET6_AH=m ++CONFIG_INET6_ESP=m ++CONFIG_INET6_IPCOMP=m ++CONFIG_IPV6_TUNNEL=m ++CONFIG_NETFILTER=y ++# CONFIG_NETFILTER_XT_MATCH_SCTP is not set ++CONFIG_IP_NF_IPTABLES=m ++CONFIG_IP_NF_MATCH_ECN=m ++CONFIG_IP_NF_MATCH_TTL=m ++CONFIG_IP_NF_FILTER=m ++CONFIG_IP_NF_TARGET_REJECT=m 
++CONFIG_IP_NF_MANGLE=m ++CONFIG_IP_NF_TARGET_ECN=m ++CONFIG_IP_NF_RAW=m ++CONFIG_IP_NF_ARPTABLES=m ++CONFIG_IP_NF_ARPFILTER=m ++CONFIG_IP_NF_ARP_MANGLE=m ++CONFIG_IP6_NF_IPTABLES=m ++CONFIG_IP6_NF_MATCH_EUI64=m ++CONFIG_IP6_NF_MATCH_FRAG=m ++CONFIG_IP6_NF_MATCH_OPTS=m ++CONFIG_IP6_NF_MATCH_HL=m ++CONFIG_IP6_NF_MATCH_IPV6HEADER=m ++CONFIG_IP6_NF_MATCH_RT=m ++CONFIG_IP6_NF_FILTER=m ++CONFIG_IP6_NF_MANGLE=m ++CONFIG_IP6_NF_RAW=m ++CONFIG_BRIDGE_NF_EBTABLES=m ++CONFIG_BRIDGE_EBT_BROUTE=m ++CONFIG_BRIDGE_EBT_T_FILTER=m ++CONFIG_BRIDGE_EBT_T_NAT=m ++CONFIG_BRIDGE_EBT_802_3=m ++CONFIG_BRIDGE_EBT_AMONG=m ++CONFIG_BRIDGE_EBT_ARP=m ++CONFIG_BRIDGE_EBT_IP=m ++CONFIG_BRIDGE_EBT_LIMIT=m ++CONFIG_BRIDGE_EBT_MARK=m ++CONFIG_BRIDGE_EBT_PKTTYPE=m ++CONFIG_BRIDGE_EBT_STP=m ++CONFIG_BRIDGE_EBT_VLAN=m ++CONFIG_BRIDGE_EBT_ARPREPLY=m ++CONFIG_BRIDGE_EBT_DNAT=m ++CONFIG_BRIDGE_EBT_MARK_T=m ++CONFIG_BRIDGE_EBT_REDIRECT=m ++CONFIG_BRIDGE_EBT_SNAT=m ++CONFIG_BRIDGE_EBT_LOG=m ++CONFIG_IP_SCTP=m ++CONFIG_ATM=m ++CONFIG_ATM_CLIP=m ++CONFIG_ATM_LANE=m ++CONFIG_ATM_BR2684=m ++CONFIG_BRIDGE=m ++CONFIG_VLAN_8021Q=m ++CONFIG_NET_SCHED=y ++CONFIG_NET_SCH_CBQ=m ++CONFIG_NET_SCH_HTB=m ++CONFIG_NET_SCH_HFSC=m ++CONFIG_NET_SCH_ATM=m ++CONFIG_NET_SCH_PRIO=m ++CONFIG_NET_SCH_RED=m ++CONFIG_NET_SCH_SFQ=m ++CONFIG_NET_SCH_TEQL=m ++CONFIG_NET_SCH_TBF=m ++CONFIG_NET_SCH_GRED=m ++CONFIG_NET_SCH_DSMARK=m ++CONFIG_NET_SCH_NETEM=m ++CONFIG_NET_CLS_TCINDEX=m ++CONFIG_NET_CLS_ROUTE4=m ++CONFIG_NET_CLS_FW=m ++CONFIG_NET_CLS_U32=m ++CONFIG_CLS_U32_PERF=y ++CONFIG_NET_CLS_RSVP=m ++CONFIG_NET_CLS_RSVP6=m ++CONFIG_NET_CLS_IND=y ++CONFIG_BT=m ++CONFIG_BT_RFCOMM=m ++CONFIG_BT_RFCOMM_TTY=y ++CONFIG_BT_BNEP=m ++CONFIG_BT_BNEP_MC_FILTER=y ++CONFIG_BT_BNEP_PROTO_FILTER=y ++CONFIG_BT_HIDP=m ++CONFIG_BT_HCIUART=m ++CONFIG_BT_HCIUART_H4=y ++CONFIG_BT_HCIUART_BCSP=y ++CONFIG_BT_HCIBCM203X=m ++CONFIG_BT_HCIBFUSB=m ++CONFIG_BT_HCIVHCI=m ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_AMDSTD=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_PHYSMAP_OF=y ++CONFIG_BLK_DEV_LOOP=m ++CONFIG_BLK_DEV_CRYPTOLOOP=m ++CONFIG_BLK_DEV_NBD=m ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_SIZE=16384 ++CONFIG_SCSI=m ++CONFIG_BLK_DEV_SD=m ++CONFIG_CHR_DEV_ST=m ++CONFIG_CHR_DEV_OSST=m ++CONFIG_BLK_DEV_SR=m ++CONFIG_BLK_DEV_SR_VENDOR=y ++CONFIG_CHR_DEV_SG=m ++CONFIG_SCSI_CONSTANTS=y ++CONFIG_SCSI_LOGGING=y ++CONFIG_SCSI_ISCSI_ATTRS=m ++CONFIG_BLK_DEV_3W_XXXX_RAID=m ++CONFIG_SCSI_3W_9XXX=m ++CONFIG_SCSI_ACARD=m ++CONFIG_SCSI_AACRAID=m ++CONFIG_SCSI_AIC7XXX=m ++CONFIG_AIC7XXX_CMDS_PER_DEVICE=4 ++CONFIG_AIC7XXX_RESET_DELAY_MS=15000 ++# CONFIG_AIC7XXX_DEBUG_ENABLE is not set ++# CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set ++CONFIG_SCSI_AIC79XX=m ++CONFIG_AIC79XX_CMDS_PER_DEVICE=4 ++CONFIG_AIC79XX_RESET_DELAY_MS=15000 ++# CONFIG_AIC79XX_DEBUG_ENABLE is not set ++# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set ++CONFIG_SCSI_ARCMSR=m ++CONFIG_MEGARAID_NEWGEN=y ++CONFIG_MEGARAID_MM=m ++CONFIG_MEGARAID_MAILBOX=m ++CONFIG_MEGARAID_SAS=m ++CONFIG_SCSI_GDTH=m ++CONFIG_SCSI_IPS=m ++CONFIG_SCSI_INITIO=m ++CONFIG_SCSI_SYM53C8XX_2=m ++CONFIG_SCSI_QLOGIC_1280=m ++CONFIG_NETDEVICES=y ++CONFIG_BONDING=m ++CONFIG_DUMMY=m ++CONFIG_NETCONSOLE=m ++CONFIG_TUN=m ++# CONFIG_ATM_DRIVERS is not set ++CONFIG_MV643XX_ETH=y ++CONFIG_VITESSE_PHY=y ++CONFIG_INPUT_EVDEV=y ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++CONFIG_INPUT_UINPUT=m ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set 
++CONFIG_SERIAL_NONSTANDARD=y ++CONFIG_SERIAL_MPSC=y ++CONFIG_SERIAL_MPSC_CONSOLE=y ++CONFIG_NVRAM=m ++CONFIG_RAW_DRIVER=y ++CONFIG_MAX_RAW_DEVS=8192 ++CONFIG_I2C=m ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_MV64XXX=m ++CONFIG_HWMON=m ++CONFIG_SENSORS_ADM1021=m ++CONFIG_SENSORS_ADM1025=m ++CONFIG_SENSORS_ADM1026=m ++CONFIG_SENSORS_ADM1031=m ++CONFIG_SENSORS_DS1621=m ++CONFIG_SENSORS_GL518SM=m ++CONFIG_SENSORS_MAX1619=m ++CONFIG_SENSORS_LM75=m ++CONFIG_SENSORS_LM77=m ++CONFIG_SENSORS_LM78=m ++CONFIG_SENSORS_LM80=m ++CONFIG_SENSORS_LM83=m ++CONFIG_SENSORS_LM85=m ++CONFIG_SENSORS_LM87=m ++CONFIG_SENSORS_LM90=m ++CONFIG_SENSORS_PCF8591=m ++CONFIG_SENSORS_VIA686A=m ++CONFIG_SENSORS_W83781D=m ++CONFIG_SENSORS_W83L785TS=m ++CONFIG_WATCHDOG=y ++CONFIG_SOFT_WATCHDOG=m ++CONFIG_PCIPCWATCHDOG=m ++CONFIG_WDTPCI=m ++CONFIG_USBPCWATCHDOG=m ++# CONFIG_VGA_CONSOLE is not set ++CONFIG_USB=m ++CONFIG_USB_MON=m ++CONFIG_USB_EHCI_HCD=m ++CONFIG_USB_EHCI_ROOT_HUB_TT=y ++CONFIG_USB_OHCI_HCD=m ++CONFIG_USB_OHCI_HCD_PPC_OF_BE=y ++CONFIG_USB_UHCI_HCD=m ++CONFIG_USB_ACM=m ++CONFIG_USB_PRINTER=m ++CONFIG_USB_STORAGE=m ++CONFIG_USB_STORAGE_DATAFAB=m ++CONFIG_USB_STORAGE_FREECOM=m ++CONFIG_USB_STORAGE_ISD200=m ++CONFIG_USB_STORAGE_SDDR09=m ++CONFIG_USB_STORAGE_SDDR55=m ++CONFIG_USB_STORAGE_JUMPSHOT=m ++CONFIG_USB_MDC800=m ++CONFIG_USB_MICROTEK=m ++CONFIG_USB_SERIAL=m ++CONFIG_USB_SERIAL_GENERIC=y ++CONFIG_USB_SERIAL_BELKIN=m ++CONFIG_USB_SERIAL_WHITEHEAT=m ++CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m ++CONFIG_USB_SERIAL_EMPEG=m ++CONFIG_USB_SERIAL_FTDI_SIO=m ++CONFIG_USB_SERIAL_VISOR=m ++CONFIG_USB_SERIAL_IPAQ=m ++CONFIG_USB_SERIAL_IR=m ++CONFIG_USB_SERIAL_EDGEPORT=m ++CONFIG_USB_SERIAL_EDGEPORT_TI=m ++CONFIG_USB_SERIAL_KEYSPAN_PDA=m ++CONFIG_USB_SERIAL_KEYSPAN=m ++CONFIG_USB_SERIAL_KLSI=m ++CONFIG_USB_SERIAL_KOBIL_SCT=m ++CONFIG_USB_SERIAL_MCT_U232=m ++CONFIG_USB_SERIAL_PL2303=m ++CONFIG_USB_SERIAL_SAFE=m ++CONFIG_USB_SERIAL_SAFE_PADDED=y ++CONFIG_USB_SERIAL_CYBERJACK=m ++CONFIG_USB_SERIAL_XIRCOM=m ++CONFIG_USB_SERIAL_OMNINET=m ++CONFIG_USB_EMI62=m ++CONFIG_USB_RIO500=m ++CONFIG_USB_LEGOTOWER=m ++CONFIG_USB_LCD=m ++CONFIG_USB_LED=m ++CONFIG_USB_TEST=m ++CONFIG_USB_ATM=m ++CONFIG_USB_SPEEDTOUCH=m ++CONFIG_INFINIBAND=m ++CONFIG_INFINIBAND_USER_MAD=m ++CONFIG_INFINIBAND_USER_ACCESS=m ++CONFIG_INFINIBAND_MTHCA=m ++CONFIG_INFINIBAND_IPOIB=m ++CONFIG_INFINIBAND_IPOIB_CM=y ++CONFIG_INFINIBAND_SRP=m ++CONFIG_DMADEVICES=y ++CONFIG_EXT4_FS=m ++CONFIG_EXT4_FS_POSIX_ACL=y ++CONFIG_EXT4_FS_SECURITY=y ++CONFIG_QUOTA=y ++CONFIG_QFMT_V2=y ++CONFIG_AUTOFS4_FS=m ++CONFIG_UDF_FS=m ++CONFIG_MSDOS_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_FAT_DEFAULT_IOCHARSET="ascii" ++CONFIG_PROC_KCORE=y ++CONFIG_TMPFS=y ++CONFIG_HFS_FS=m ++CONFIG_HFSPLUS_FS=m ++CONFIG_JFFS2_FS=y ++CONFIG_CRAMFS=m ++CONFIG_VXFS_FS=m ++CONFIG_NFS_FS=y ++CONFIG_NFS_V3_ACL=y ++CONFIG_NFS_V4=y ++CONFIG_ROOT_NFS=y ++CONFIG_CIFS=m ++CONFIG_CIFS_XATTR=y ++CONFIG_CIFS_POSIX=y ++CONFIG_NLS=y ++CONFIG_NLS_DEFAULT="utf8" ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_CODEPAGE_737=m ++CONFIG_NLS_CODEPAGE_775=m ++CONFIG_NLS_CODEPAGE_850=m ++CONFIG_NLS_CODEPAGE_852=m ++CONFIG_NLS_CODEPAGE_855=m ++CONFIG_NLS_CODEPAGE_857=m ++CONFIG_NLS_CODEPAGE_860=m ++CONFIG_NLS_CODEPAGE_861=m ++CONFIG_NLS_CODEPAGE_862=m ++CONFIG_NLS_CODEPAGE_863=m ++CONFIG_NLS_CODEPAGE_864=m ++CONFIG_NLS_CODEPAGE_865=m ++CONFIG_NLS_CODEPAGE_866=m ++CONFIG_NLS_CODEPAGE_869=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_CODEPAGE_950=m ++CONFIG_NLS_CODEPAGE_932=m ++CONFIG_NLS_CODEPAGE_949=m ++CONFIG_NLS_CODEPAGE_874=m ++CONFIG_NLS_ISO8859_8=m 
++CONFIG_NLS_CODEPAGE_1250=m ++CONFIG_NLS_CODEPAGE_1251=m ++CONFIG_NLS_ASCII=y ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_ISO8859_2=m ++CONFIG_NLS_ISO8859_3=m ++CONFIG_NLS_ISO8859_4=m ++CONFIG_NLS_ISO8859_5=m ++CONFIG_NLS_ISO8859_6=m ++CONFIG_NLS_ISO8859_7=m ++CONFIG_NLS_ISO8859_9=m ++CONFIG_NLS_ISO8859_13=m ++CONFIG_NLS_ISO8859_14=m ++CONFIG_NLS_ISO8859_15=m ++CONFIG_NLS_KOI8_R=m ++CONFIG_NLS_KOI8_U=m ++CONFIG_CRC_CCITT=m ++CONFIG_CRC_T10DIF=m ++CONFIG_DEBUG_INFO=y ++CONFIG_MAGIC_SYSRQ=y ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_STACK_USAGE=y ++CONFIG_DEBUG_HIGHMEM=y ++CONFIG_DEBUG_STACKOVERFLOW=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_SPINLOCK=y ++CONFIG_BOOTX_TEXT=y ++CONFIG_PPC_EARLY_DEBUG=y ++CONFIG_SECURITY=y ++CONFIG_SECURITY_NETWORK=y ++CONFIG_SECURITY_SELINUX=y ++CONFIG_SECURITY_SELINUX_BOOTPARAM=y ++CONFIG_SECURITY_SELINUX_DISABLE=y ++CONFIG_CRYPTO_HMAC=y ++CONFIG_CRYPTO_MICHAEL_MIC=m ++CONFIG_CRYPTO_SHA1=y ++CONFIG_CRYPTO_SHA512=m ++CONFIG_CRYPTO_WP512=m ++CONFIG_CRYPTO_BLOWFISH=m ++CONFIG_CRYPTO_CAST6=m ++CONFIG_CRYPTO_KHAZAD=m ++CONFIG_CRYPTO_SERPENT=m ++CONFIG_CRYPTO_TEA=m ++CONFIG_CRYPTO_TWOFISH=m +diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig +index 7ee736f20774..8663c0043a56 100644 +--- a/arch/powerpc/configs/ppc6xx_defconfig ++++ b/arch/powerpc/configs/ppc6xx_defconfig +@@ -74,7 +74,7 @@ CONFIG_QE_GPIO=y + CONFIG_MCU_MPC8349EMITX=y + CONFIG_HIGHMEM=y + CONFIG_HZ_1000=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BINFMT_MISC=y + CONFIG_HIBERNATION=y + CONFIG_PM_DEBUG=y +diff --git a/arch/score/configs/spct6600_defconfig b/arch/score/configs/spct6600_defconfig +new file mode 100644 +index 000000000000..46434ca1fa10 +--- /dev/null ++++ b/arch/score/configs/spct6600_defconfig +@@ -0,0 +1,84 @@ ++CONFIG_HZ_100=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y ++# CONFIG_LOCALVERSION_AUTO is not set ++CONFIG_SYSVIPC=y ++CONFIG_POSIX_MQUEUE=y ++CONFIG_BSD_PROCESS_ACCT=y ++CONFIG_LOG_BUF_SHIFT=12 ++CONFIG_SYSFS_DEPRECATED_V2=y ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_KALLSYMS is not set ++# CONFIG_HOTPLUG is not set ++CONFIG_SLAB=y ++CONFIG_MODULES=y ++CONFIG_MODULE_FORCE_LOAD=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_MODULE_FORCE_UNLOAD=y ++# CONFIG_BLK_DEV_BSG is not set ++CONFIG_BINFMT_MISC=y ++CONFIG_NET=y ++CONFIG_UNIX=y ++CONFIG_NET_KEY=y ++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_ARPD=y ++# CONFIG_INET_LRO is not set ++# CONFIG_IPV6 is not set ++# CONFIG_STANDALONE is not set ++# CONFIG_PREVENT_FIRMWARE_BUILD is not set ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_CRYPTOLOOP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_COUNT=1 ++# CONFIG_MISC_DEVICES is not set ++CONFIG_NETDEVICES=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++CONFIG_SERIAL_NONSTANDARD=y ++CONFIG_STALDRV=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_RAW_DRIVER=y ++CONFIG_MAX_RAW_DEVS=8192 ++# CONFIG_HWMON is not set ++# CONFIG_VGA_CONSOLE is not set ++# CONFIG_HID_SUPPORT is not set ++# CONFIG_USB_SUPPORT is not set ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++CONFIG_EXT2_FS_POSIX_ACL=y ++CONFIG_EXT3_FS=y ++# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set ++CONFIG_EXT3_FS_POSIX_ACL=y ++CONFIG_AUTOFS_FS=y ++CONFIG_AUTOFS4_FS=y ++CONFIG_PROC_KCORE=y ++# CONFIG_PROC_PAGE_MONITOR is not set ++CONFIG_TMPFS=y ++CONFIG_TMPFS_POSIX_ACL=y ++CONFIG_NFS_FS=y 
++CONFIG_NFS_V3=y ++CONFIG_NFS_V3_ACL=y ++CONFIG_NFS_V4=y ++CONFIG_NFSD=y ++CONFIG_NFSD_V3_ACL=y ++CONFIG_NFSD_V4=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++CONFIG_SECURITY=y ++CONFIG_SECURITY_NETWORK=y ++CONFIG_CRYPTO_NULL=y ++CONFIG_CRYPTO_CRYPTD=y ++CONFIG_CRYPTO_SEQIV=y ++CONFIG_CRYPTO_MD4=y ++CONFIG_CRYPTO_MICHAEL_MIC=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++# CONFIG_CRYPTO_HW is not set ++CONFIG_CRC_CCITT=y ++CONFIG_CRC16=y ++CONFIG_LIBCRC32C=y +diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig +index 5a1097641247..eb5fbf554e7f 100644 +--- a/arch/sh/configs/se7712_defconfig ++++ b/arch/sh/configs/se7712_defconfig +@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y + CONFIG_SH_SOLUTION_ENGINE=y + CONFIG_SH_PCLK_FREQ=66666666 + CONFIG_HEARTBEAT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1" + CONFIG_NET=y +diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig +index 9c0ef13bee10..cbaa65c8bf9e 100644 +--- a/arch/sh/configs/se7721_defconfig ++++ b/arch/sh/configs/se7721_defconfig +@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y + CONFIG_SH_7721_SOLUTION_ENGINE=y + CONFIG_SH_PCLK_FREQ=33333333 + CONFIG_HEARTBEAT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda2" + CONFIG_NET=y +diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig +index ceb48e9b70f4..1a69eda6610c 100644 +--- a/arch/sh/configs/titan_defconfig ++++ b/arch/sh/configs/titan_defconfig +@@ -20,7 +20,7 @@ CONFIG_SH_TITAN=y + CONFIG_SH_PCLK_FREQ=30000000 + CONFIG_SH_DMA=y + CONFIG_SH_DMA_API=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC1,38400N81 root=/dev/nfs ip=:::::eth1:autoconf rw" + CONFIG_PCI=y +diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig +index 4d4e1cc6402f..04bea1d28ba7 100644 +--- a/arch/sparc/configs/sparc64_defconfig ++++ b/arch/sparc/configs/sparc64_defconfig +@@ -22,7 +22,7 @@ CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y + CONFIG_NUMA=y + CONFIG_DEFAULT_MMAP_MIN_ADDR=8192 +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_SUN_LDOMS=y + CONFIG_PCI=y + CONFIG_PCI_MSI=y +diff --git a/arch/tile/configs/tilegx_defconfig b/arch/tile/configs/tilegx_defconfig +new file mode 100644 +index 000000000000..939c63ba7e6e +--- /dev/null ++++ b/arch/tile/configs/tilegx_defconfig +@@ -0,0 +1,411 @@ ++CONFIG_TILEGX=y ++CONFIG_SYSVIPC=y ++CONFIG_POSIX_MQUEUE=y ++CONFIG_FHANDLE=y ++CONFIG_AUDIT=y ++CONFIG_NO_HZ=y ++CONFIG_BSD_PROCESS_ACCT=y ++CONFIG_BSD_PROCESS_ACCT_V3=y ++CONFIG_TASKSTATS=y ++CONFIG_TASK_DELAY_ACCT=y ++CONFIG_TASK_XACCT=y ++CONFIG_TASK_IO_ACCOUNTING=y ++CONFIG_LOG_BUF_SHIFT=19 ++CONFIG_CGROUPS=y ++CONFIG_CGROUP_DEBUG=y ++CONFIG_CGROUP_DEVICE=y ++CONFIG_CPUSETS=y ++CONFIG_CGROUP_CPUACCT=y ++CONFIG_CGROUP_SCHED=y ++CONFIG_RT_GROUP_SCHED=y ++CONFIG_BLK_CGROUP=y ++CONFIG_NAMESPACES=y ++CONFIG_RELAY=y ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_RD_XZ=y ++CONFIG_SYSCTL_SYSCALL=y ++CONFIG_EMBEDDED=y ++# CONFIG_COMPAT_BRK is not set ++CONFIG_PROFILING=y ++CONFIG_KPROBES=y ++CONFIG_MODULES=y ++CONFIG_MODULE_FORCE_LOAD=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_BLK_DEV_INTEGRITY=y ++CONFIG_PARTITION_ADVANCED=y ++CONFIG_OSF_PARTITION=y ++CONFIG_AMIGA_PARTITION=y ++CONFIG_MAC_PARTITION=y ++CONFIG_BSD_DISKLABEL=y ++CONFIG_MINIX_SUBPARTITION=y ++CONFIG_SOLARIS_X86_PARTITION=y 
++CONFIG_UNIXWARE_DISKLABEL=y ++CONFIG_SGI_PARTITION=y ++CONFIG_SUN_PARTITION=y ++CONFIG_KARMA_PARTITION=y ++CONFIG_CFQ_GROUP_IOSCHED=y ++CONFIG_NR_CPUS=100 ++CONFIG_HZ_100=y ++# CONFIG_COMPACTION is not set ++CONFIG_PREEMPT=y ++CONFIG_TILE_PCI_IO=y ++CONFIG_PCI_DEBUG=y ++# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set ++CONFIG_BINFMT_MISC=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_XFRM_USER=y ++CONFIG_XFRM_SUB_POLICY=y ++CONFIG_XFRM_STATISTICS=y ++CONFIG_NET_KEY=m ++CONFIG_NET_KEY_MIGRATE=y ++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_IP_ADVANCED_ROUTER=y ++CONFIG_IP_MULTIPLE_TABLES=y ++CONFIG_IP_ROUTE_MULTIPATH=y ++CONFIG_IP_ROUTE_VERBOSE=y ++CONFIG_NET_IPIP=m ++CONFIG_IP_MROUTE=y ++CONFIG_IP_PIMSM_V1=y ++CONFIG_IP_PIMSM_V2=y ++CONFIG_SYN_COOKIES=y ++CONFIG_INET_AH=m ++CONFIG_INET_ESP=m ++CONFIG_INET_IPCOMP=m ++CONFIG_INET_XFRM_MODE_TRANSPORT=m ++CONFIG_INET_XFRM_MODE_TUNNEL=m ++CONFIG_INET_XFRM_MODE_BEET=m ++CONFIG_INET_DIAG=m ++CONFIG_TCP_CONG_ADVANCED=y ++CONFIG_TCP_CONG_HSTCP=m ++CONFIG_TCP_CONG_HYBLA=m ++CONFIG_TCP_CONG_SCALABLE=m ++CONFIG_TCP_CONG_LP=m ++CONFIG_TCP_CONG_VENO=m ++CONFIG_TCP_CONG_YEAH=m ++CONFIG_TCP_CONG_ILLINOIS=m ++CONFIG_TCP_MD5SIG=y ++CONFIG_IPV6=y ++CONFIG_IPV6_ROUTER_PREF=y ++CONFIG_IPV6_ROUTE_INFO=y ++CONFIG_IPV6_OPTIMISTIC_DAD=y ++CONFIG_INET6_AH=m ++CONFIG_INET6_ESP=m ++CONFIG_INET6_IPCOMP=m ++CONFIG_IPV6_MIP6=m ++CONFIG_INET6_XFRM_MODE_TRANSPORT=m ++CONFIG_INET6_XFRM_MODE_TUNNEL=m ++CONFIG_INET6_XFRM_MODE_BEET=m ++CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m ++CONFIG_IPV6_SIT=m ++CONFIG_IPV6_TUNNEL=m ++CONFIG_IPV6_MULTIPLE_TABLES=y ++CONFIG_IPV6_MROUTE=y ++CONFIG_IPV6_PIMSM_V2=y ++CONFIG_NETLABEL=y ++CONFIG_RDS=m ++CONFIG_RDS_TCP=m ++CONFIG_BRIDGE=m ++CONFIG_VLAN_8021Q=m ++CONFIG_VLAN_8021Q_GVRP=y ++CONFIG_PHONET=m ++CONFIG_NET_SCHED=y ++CONFIG_NET_SCH_CBQ=m ++CONFIG_NET_SCH_HTB=m ++CONFIG_NET_SCH_HFSC=m ++CONFIG_NET_SCH_PRIO=m ++CONFIG_NET_SCH_MULTIQ=m ++CONFIG_NET_SCH_RED=m ++CONFIG_NET_SCH_SFQ=m ++CONFIG_NET_SCH_TEQL=m ++CONFIG_NET_SCH_TBF=m ++CONFIG_NET_SCH_GRED=m ++CONFIG_NET_SCH_DSMARK=m ++CONFIG_NET_SCH_NETEM=m ++CONFIG_NET_SCH_DRR=m ++CONFIG_NET_SCH_INGRESS=m ++CONFIG_NET_CLS_BASIC=m ++CONFIG_NET_CLS_TCINDEX=m ++CONFIG_NET_CLS_ROUTE4=m ++CONFIG_NET_CLS_FW=m ++CONFIG_NET_CLS_U32=m ++CONFIG_CLS_U32_PERF=y ++CONFIG_CLS_U32_MARK=y ++CONFIG_NET_CLS_RSVP=m ++CONFIG_NET_CLS_RSVP6=m ++CONFIG_NET_CLS_FLOW=m ++CONFIG_NET_CLS_CGROUP=y ++CONFIG_NET_EMATCH=y ++CONFIG_NET_EMATCH_CMP=m ++CONFIG_NET_EMATCH_NBYTE=m ++CONFIG_NET_EMATCH_U32=m ++CONFIG_NET_EMATCH_META=m ++CONFIG_NET_EMATCH_TEXT=m ++CONFIG_NET_CLS_ACT=y ++CONFIG_NET_ACT_POLICE=m ++CONFIG_NET_ACT_GACT=m ++CONFIG_GACT_PROB=y ++CONFIG_NET_ACT_MIRRED=m ++CONFIG_NET_ACT_NAT=m ++CONFIG_NET_ACT_PEDIT=m ++CONFIG_NET_ACT_SIMP=m ++CONFIG_NET_ACT_SKBEDIT=m ++CONFIG_NET_CLS_IND=y ++CONFIG_DCB=y ++CONFIG_DNS_RESOLVER=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++CONFIG_DEVTMPFS=y ++CONFIG_DEVTMPFS_MOUNT=y ++CONFIG_CONNECTOR=y ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_CRYPTOLOOP=m ++CONFIG_BLK_DEV_SX8=m ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_SIZE=16384 ++CONFIG_ATA_OVER_ETH=m ++CONFIG_RAID_ATTRS=m ++CONFIG_BLK_DEV_SD=y ++CONFIG_SCSI_CONSTANTS=y ++CONFIG_SCSI_LOGGING=y ++CONFIG_SCSI_SAS_ATA=y ++CONFIG_ISCSI_TCP=m ++CONFIG_SCSI_MVSAS=y ++# CONFIG_SCSI_MVSAS_DEBUG is not set ++CONFIG_SCSI_MVSAS_TASKLET=y ++CONFIG_ATA=y ++CONFIG_SATA_AHCI=y ++CONFIG_SATA_SIL24=y ++# CONFIG_ATA_SFF is not set ++CONFIG_MD=y ++CONFIG_BLK_DEV_MD=y ++CONFIG_MD_LINEAR=m 
++CONFIG_MD_RAID0=m ++CONFIG_MD_RAID1=m ++CONFIG_MD_RAID10=m ++CONFIG_MD_RAID456=m ++CONFIG_MD_FAULTY=m ++CONFIG_BLK_DEV_DM=m ++CONFIG_DM_DEBUG=y ++CONFIG_DM_CRYPT=m ++CONFIG_DM_SNAPSHOT=m ++CONFIG_DM_MIRROR=m ++CONFIG_DM_LOG_USERSPACE=m ++CONFIG_DM_ZERO=m ++CONFIG_DM_MULTIPATH=m ++CONFIG_DM_MULTIPATH_QL=m ++CONFIG_DM_MULTIPATH_ST=m ++CONFIG_DM_DELAY=m ++CONFIG_DM_UEVENT=y ++CONFIG_TARGET_CORE=m ++CONFIG_TCM_IBLOCK=m ++CONFIG_TCM_FILEIO=m ++CONFIG_TCM_PSCSI=m ++CONFIG_LOOPBACK_TARGET=m ++CONFIG_ISCSI_TARGET=m ++CONFIG_FUSION=y ++CONFIG_FUSION_SAS=y ++CONFIG_NETDEVICES=y ++CONFIG_BONDING=m ++CONFIG_DUMMY=m ++CONFIG_IFB=m ++CONFIG_MACVLAN=m ++CONFIG_MACVTAP=m ++CONFIG_NETCONSOLE=m ++CONFIG_NETCONSOLE_DYNAMIC=y ++CONFIG_TUN=y ++CONFIG_VETH=m ++CONFIG_NET_DSA_MV88E6060=y ++CONFIG_NET_DSA_MV88E6XXX=y ++CONFIG_SKY2=y ++CONFIG_PTP_1588_CLOCK_TILEGX=y ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_SERIAL_TILEGX=y ++CONFIG_HW_RANDOM=y ++CONFIG_HW_RANDOM_TIMERIOMEM=m ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_WATCHDOG_NOWAYOUT=y ++# CONFIG_VGA_ARB is not set ++CONFIG_DRM=m ++CONFIG_DRM_TDFX=m ++CONFIG_DRM_R128=m ++CONFIG_DRM_MGA=m ++CONFIG_DRM_VIA=m ++CONFIG_DRM_SAVAGE=m ++CONFIG_USB=y ++CONFIG_USB_EHCI_HCD=y ++CONFIG_USB_OHCI_HCD=y ++CONFIG_USB_STORAGE=y ++CONFIG_EDAC=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_TILE=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++CONFIG_EXT2_FS_POSIX_ACL=y ++CONFIG_EXT2_FS_SECURITY=y ++CONFIG_EXT2_FS_XIP=y ++CONFIG_EXT3_FS=y ++CONFIG_EXT3_FS_POSIX_ACL=y ++CONFIG_EXT3_FS_SECURITY=y ++CONFIG_EXT4_FS=y ++CONFIG_EXT4_FS_POSIX_ACL=y ++CONFIG_EXT4_FS_SECURITY=y ++CONFIG_XFS_FS=y ++CONFIG_XFS_QUOTA=y ++CONFIG_XFS_POSIX_ACL=y ++CONFIG_GFS2_FS=m ++CONFIG_GFS2_FS_LOCKING_DLM=y ++CONFIG_BTRFS_FS=m ++CONFIG_BTRFS_FS_POSIX_ACL=y ++CONFIG_QUOTA=y ++CONFIG_QUOTA_NETLINK_INTERFACE=y ++# CONFIG_PRINT_QUOTA_WARNING is not set ++CONFIG_QFMT_V2=y ++CONFIG_AUTOFS4_FS=m ++CONFIG_FUSE_FS=y ++CONFIG_CUSE=m ++CONFIG_FSCACHE=m ++CONFIG_FSCACHE_STATS=y ++CONFIG_CACHEFILES=m ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_ZISOFS=y ++CONFIG_UDF_FS=m ++CONFIG_MSDOS_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_FAT_DEFAULT_IOCHARSET="ascii" ++CONFIG_PROC_KCORE=y ++CONFIG_TMPFS=y ++CONFIG_TMPFS_POSIX_ACL=y ++CONFIG_HUGETLBFS=y ++CONFIG_ECRYPT_FS=m ++CONFIG_CRAMFS=m ++CONFIG_SQUASHFS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3_ACL=y ++CONFIG_NFS_V4=m ++CONFIG_NFS_V4_1=y ++CONFIG_NFS_FSCACHE=y ++CONFIG_NFSD=m ++CONFIG_NFSD_V3_ACL=y ++CONFIG_NFSD_V4=y ++CONFIG_CIFS=m ++CONFIG_CIFS_STATS=y ++CONFIG_CIFS_WEAK_PW_HASH=y ++CONFIG_CIFS_UPCALL=y ++CONFIG_CIFS_XATTR=y ++CONFIG_CIFS_POSIX=y ++CONFIG_CIFS_DFS_UPCALL=y ++CONFIG_CIFS_FSCACHE=y ++CONFIG_NLS_DEFAULT="utf8" ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_CODEPAGE_737=m ++CONFIG_NLS_CODEPAGE_775=m ++CONFIG_NLS_CODEPAGE_850=m ++CONFIG_NLS_CODEPAGE_852=m ++CONFIG_NLS_CODEPAGE_855=m ++CONFIG_NLS_CODEPAGE_857=m ++CONFIG_NLS_CODEPAGE_860=m ++CONFIG_NLS_CODEPAGE_861=m ++CONFIG_NLS_CODEPAGE_862=m ++CONFIG_NLS_CODEPAGE_863=m ++CONFIG_NLS_CODEPAGE_864=m ++CONFIG_NLS_CODEPAGE_865=m ++CONFIG_NLS_CODEPAGE_866=m ++CONFIG_NLS_CODEPAGE_869=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_CODEPAGE_950=m ++CONFIG_NLS_CODEPAGE_932=m ++CONFIG_NLS_CODEPAGE_949=m ++CONFIG_NLS_CODEPAGE_874=m ++CONFIG_NLS_ISO8859_8=m ++CONFIG_NLS_CODEPAGE_1250=m ++CONFIG_NLS_CODEPAGE_1251=m 
++CONFIG_NLS_ASCII=y ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_ISO8859_2=m ++CONFIG_NLS_ISO8859_3=m ++CONFIG_NLS_ISO8859_4=m ++CONFIG_NLS_ISO8859_5=m ++CONFIG_NLS_ISO8859_6=m ++CONFIG_NLS_ISO8859_7=m ++CONFIG_NLS_ISO8859_9=m ++CONFIG_NLS_ISO8859_13=m ++CONFIG_NLS_ISO8859_14=m ++CONFIG_NLS_ISO8859_15=m ++CONFIG_NLS_KOI8_R=m ++CONFIG_NLS_KOI8_U=m ++CONFIG_NLS_UTF8=m ++CONFIG_DLM=m ++CONFIG_DLM_DEBUG=y ++CONFIG_DYNAMIC_DEBUG=y ++CONFIG_DEBUG_INFO=y ++CONFIG_DEBUG_INFO_REDUCED=y ++# CONFIG_ENABLE_WARN_DEPRECATED is not set ++CONFIG_STRIP_ASM_SYMS=y ++CONFIG_DEBUG_FS=y ++CONFIG_HEADERS_CHECK=y ++# CONFIG_FRAME_POINTER is not set ++CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y ++CONFIG_DEBUG_VM=y ++CONFIG_DEBUG_MEMORY_INIT=y ++CONFIG_DEBUG_STACKOVERFLOW=y ++CONFIG_LOCKUP_DETECTOR=y ++CONFIG_SCHEDSTATS=y ++CONFIG_TIMER_STATS=y ++CONFIG_DEBUG_LIST=y ++CONFIG_DEBUG_CREDENTIALS=y ++CONFIG_RCU_CPU_STALL_TIMEOUT=60 ++CONFIG_ASYNC_RAID6_TEST=m ++CONFIG_KGDB=y ++CONFIG_SECURITY=y ++CONFIG_SECURITYFS=y ++CONFIG_SECURITY_NETWORK=y ++CONFIG_SECURITY_NETWORK_XFRM=y ++CONFIG_SECURITY_SELINUX=y ++CONFIG_SECURITY_SELINUX_BOOTPARAM=y ++CONFIG_SECURITY_SELINUX_DISABLE=y ++CONFIG_CRYPTO_PCRYPT=m ++CONFIG_CRYPTO_CRYPTD=m ++CONFIG_CRYPTO_TEST=m ++CONFIG_CRYPTO_CCM=m ++CONFIG_CRYPTO_GCM=m ++CONFIG_CRYPTO_CTS=m ++CONFIG_CRYPTO_LRW=m ++CONFIG_CRYPTO_PCBC=m ++CONFIG_CRYPTO_XTS=m ++CONFIG_CRYPTO_HMAC=y ++CONFIG_CRYPTO_XCBC=m ++CONFIG_CRYPTO_VMAC=m ++CONFIG_CRYPTO_MICHAEL_MIC=m ++CONFIG_CRYPTO_RMD128=m ++CONFIG_CRYPTO_RMD160=m ++CONFIG_CRYPTO_RMD256=m ++CONFIG_CRYPTO_RMD320=m ++CONFIG_CRYPTO_SHA1=y ++CONFIG_CRYPTO_SHA512=m ++CONFIG_CRYPTO_TGR192=m ++CONFIG_CRYPTO_WP512=m ++CONFIG_CRYPTO_ANUBIS=m ++CONFIG_CRYPTO_BLOWFISH=m ++CONFIG_CRYPTO_CAMELLIA=m ++CONFIG_CRYPTO_CAST5=m ++CONFIG_CRYPTO_CAST6=m ++CONFIG_CRYPTO_FCRYPT=m ++CONFIG_CRYPTO_KHAZAD=m ++CONFIG_CRYPTO_SEED=m ++CONFIG_CRYPTO_SERPENT=m ++CONFIG_CRYPTO_TEA=m ++CONFIG_CRYPTO_TWOFISH=m ++CONFIG_CRYPTO_LZO=m +diff --git a/arch/tile/configs/tilepro_defconfig b/arch/tile/configs/tilepro_defconfig +new file mode 100644 +index 000000000000..e8c4003cbd81 +--- /dev/null ++++ b/arch/tile/configs/tilepro_defconfig +@@ -0,0 +1,524 @@ ++CONFIG_SYSVIPC=y ++CONFIG_POSIX_MQUEUE=y ++CONFIG_AUDIT=y ++CONFIG_NO_HZ=y ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_BSD_PROCESS_ACCT=y ++CONFIG_BSD_PROCESS_ACCT_V3=y ++CONFIG_TASKSTATS=y ++CONFIG_TASK_DELAY_ACCT=y ++CONFIG_TASK_XACCT=y ++CONFIG_TASK_IO_ACCOUNTING=y ++CONFIG_LOG_BUF_SHIFT=19 ++CONFIG_CGROUPS=y ++CONFIG_CGROUP_DEBUG=y ++CONFIG_CGROUP_DEVICE=y ++CONFIG_CPUSETS=y ++CONFIG_CGROUP_CPUACCT=y ++CONFIG_CGROUP_SCHED=y ++CONFIG_RT_GROUP_SCHED=y ++CONFIG_BLK_CGROUP=y ++CONFIG_NAMESPACES=y ++CONFIG_RELAY=y ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_RD_XZ=y ++CONFIG_SYSCTL_SYSCALL=y ++CONFIG_EMBEDDED=y ++# CONFIG_COMPAT_BRK is not set ++CONFIG_PROFILING=y ++CONFIG_MODULES=y ++CONFIG_MODULE_FORCE_LOAD=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_BLK_DEV_INTEGRITY=y ++CONFIG_PARTITION_ADVANCED=y ++CONFIG_OSF_PARTITION=y ++CONFIG_AMIGA_PARTITION=y ++CONFIG_MAC_PARTITION=y ++CONFIG_BSD_DISKLABEL=y ++CONFIG_MINIX_SUBPARTITION=y ++CONFIG_SOLARIS_X86_PARTITION=y ++CONFIG_UNIXWARE_DISKLABEL=y ++CONFIG_SGI_PARTITION=y ++CONFIG_SUN_PARTITION=y ++CONFIG_KARMA_PARTITION=y ++CONFIG_CFQ_GROUP_IOSCHED=y ++CONFIG_HZ_100=y ++# CONFIG_COMPACTION is not set ++CONFIG_PREEMPT=y ++CONFIG_PCI_DEBUG=y ++# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set ++CONFIG_BINFMT_MISC=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_XFRM_USER=y ++CONFIG_XFRM_SUB_POLICY=y ++CONFIG_XFRM_STATISTICS=y 
++CONFIG_NET_KEY=m ++CONFIG_NET_KEY_MIGRATE=y ++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_IP_ADVANCED_ROUTER=y ++CONFIG_IP_MULTIPLE_TABLES=y ++CONFIG_IP_ROUTE_MULTIPATH=y ++CONFIG_IP_ROUTE_VERBOSE=y ++CONFIG_NET_IPIP=m ++CONFIG_IP_MROUTE=y ++CONFIG_IP_PIMSM_V1=y ++CONFIG_IP_PIMSM_V2=y ++CONFIG_SYN_COOKIES=y ++CONFIG_INET_AH=m ++CONFIG_INET_ESP=m ++CONFIG_INET_IPCOMP=m ++CONFIG_INET_XFRM_MODE_TRANSPORT=m ++CONFIG_INET_XFRM_MODE_TUNNEL=m ++CONFIG_INET_XFRM_MODE_BEET=m ++CONFIG_INET_DIAG=m ++CONFIG_TCP_CONG_ADVANCED=y ++CONFIG_TCP_CONG_HSTCP=m ++CONFIG_TCP_CONG_HYBLA=m ++CONFIG_TCP_CONG_SCALABLE=m ++CONFIG_TCP_CONG_LP=m ++CONFIG_TCP_CONG_VENO=m ++CONFIG_TCP_CONG_YEAH=m ++CONFIG_TCP_CONG_ILLINOIS=m ++CONFIG_TCP_MD5SIG=y ++CONFIG_IPV6=y ++CONFIG_IPV6_ROUTER_PREF=y ++CONFIG_IPV6_ROUTE_INFO=y ++CONFIG_IPV6_OPTIMISTIC_DAD=y ++CONFIG_INET6_AH=m ++CONFIG_INET6_ESP=m ++CONFIG_INET6_IPCOMP=m ++CONFIG_IPV6_MIP6=m ++CONFIG_INET6_XFRM_MODE_TRANSPORT=m ++CONFIG_INET6_XFRM_MODE_TUNNEL=m ++CONFIG_INET6_XFRM_MODE_BEET=m ++CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m ++CONFIG_IPV6_SIT=m ++CONFIG_IPV6_TUNNEL=m ++CONFIG_IPV6_MULTIPLE_TABLES=y ++CONFIG_IPV6_MROUTE=y ++CONFIG_IPV6_PIMSM_V2=y ++CONFIG_NETLABEL=y ++CONFIG_NETFILTER=y ++CONFIG_NF_CONNTRACK=m ++CONFIG_NF_CONNTRACK_SECMARK=y ++CONFIG_NF_CONNTRACK_ZONES=y ++CONFIG_NF_CONNTRACK_EVENTS=y ++CONFIG_NF_CT_PROTO_DCCP=m ++CONFIG_NF_CT_PROTO_UDPLITE=m ++CONFIG_NF_CONNTRACK_AMANDA=m ++CONFIG_NF_CONNTRACK_FTP=m ++CONFIG_NF_CONNTRACK_H323=m ++CONFIG_NF_CONNTRACK_IRC=m ++CONFIG_NF_CONNTRACK_NETBIOS_NS=m ++CONFIG_NF_CONNTRACK_PPTP=m ++CONFIG_NF_CONNTRACK_SANE=m ++CONFIG_NF_CONNTRACK_SIP=m ++CONFIG_NF_CONNTRACK_TFTP=m ++CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m ++CONFIG_NETFILTER_XT_TARGET_CONNMARK=m ++CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m ++CONFIG_NETFILTER_XT_TARGET_DSCP=m ++CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m ++CONFIG_NETFILTER_XT_TARGET_MARK=m ++CONFIG_NETFILTER_XT_TARGET_NFLOG=m ++CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m ++CONFIG_NETFILTER_XT_TARGET_NOTRACK=m ++CONFIG_NETFILTER_XT_TARGET_TEE=m ++CONFIG_NETFILTER_XT_TARGET_TPROXY=m ++CONFIG_NETFILTER_XT_TARGET_TRACE=m ++CONFIG_NETFILTER_XT_TARGET_SECMARK=m ++CONFIG_NETFILTER_XT_TARGET_TCPMSS=m ++CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m ++CONFIG_NETFILTER_XT_MATCH_CLUSTER=m ++CONFIG_NETFILTER_XT_MATCH_COMMENT=m ++CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m ++CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m ++CONFIG_NETFILTER_XT_MATCH_CONNMARK=m ++CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m ++CONFIG_NETFILTER_XT_MATCH_DCCP=m ++CONFIG_NETFILTER_XT_MATCH_DSCP=m ++CONFIG_NETFILTER_XT_MATCH_ESP=m ++CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m ++CONFIG_NETFILTER_XT_MATCH_HELPER=m ++CONFIG_NETFILTER_XT_MATCH_IPRANGE=m ++CONFIG_NETFILTER_XT_MATCH_IPVS=m ++CONFIG_NETFILTER_XT_MATCH_LENGTH=m ++CONFIG_NETFILTER_XT_MATCH_LIMIT=m ++CONFIG_NETFILTER_XT_MATCH_MAC=m ++CONFIG_NETFILTER_XT_MATCH_MARK=m ++CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m ++CONFIG_NETFILTER_XT_MATCH_OSF=m ++CONFIG_NETFILTER_XT_MATCH_OWNER=m ++CONFIG_NETFILTER_XT_MATCH_POLICY=m ++CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m ++CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m ++CONFIG_NETFILTER_XT_MATCH_QUOTA=m ++CONFIG_NETFILTER_XT_MATCH_RATEEST=m ++CONFIG_NETFILTER_XT_MATCH_REALM=m ++CONFIG_NETFILTER_XT_MATCH_RECENT=m ++CONFIG_NETFILTER_XT_MATCH_SOCKET=m ++CONFIG_NETFILTER_XT_MATCH_STATE=m ++CONFIG_NETFILTER_XT_MATCH_STATISTIC=m ++CONFIG_NETFILTER_XT_MATCH_STRING=m ++CONFIG_NETFILTER_XT_MATCH_TCPMSS=m ++CONFIG_NETFILTER_XT_MATCH_TIME=m ++CONFIG_NETFILTER_XT_MATCH_U32=m ++CONFIG_IP_VS=m ++CONFIG_IP_VS_IPV6=y 
++CONFIG_IP_VS_PROTO_TCP=y ++CONFIG_IP_VS_PROTO_UDP=y ++CONFIG_IP_VS_PROTO_ESP=y ++CONFIG_IP_VS_PROTO_AH=y ++CONFIG_IP_VS_PROTO_SCTP=y ++CONFIG_IP_VS_RR=m ++CONFIG_IP_VS_WRR=m ++CONFIG_IP_VS_LC=m ++CONFIG_IP_VS_WLC=m ++CONFIG_IP_VS_LBLC=m ++CONFIG_IP_VS_LBLCR=m ++CONFIG_IP_VS_SED=m ++CONFIG_IP_VS_NQ=m ++CONFIG_NF_CONNTRACK_IPV4=m ++# CONFIG_NF_CONNTRACK_PROC_COMPAT is not set ++CONFIG_IP_NF_IPTABLES=y ++CONFIG_IP_NF_MATCH_AH=m ++CONFIG_IP_NF_MATCH_ECN=m ++CONFIG_IP_NF_MATCH_TTL=m ++CONFIG_IP_NF_FILTER=y ++CONFIG_IP_NF_TARGET_REJECT=y ++CONFIG_IP_NF_MANGLE=m ++CONFIG_IP_NF_TARGET_ECN=m ++CONFIG_IP_NF_TARGET_TTL=m ++CONFIG_IP_NF_RAW=m ++CONFIG_IP_NF_SECURITY=m ++CONFIG_IP_NF_ARPTABLES=m ++CONFIG_IP_NF_ARPFILTER=m ++CONFIG_IP_NF_ARP_MANGLE=m ++CONFIG_NF_CONNTRACK_IPV6=m ++CONFIG_IP6_NF_MATCH_AH=m ++CONFIG_IP6_NF_MATCH_EUI64=m ++CONFIG_IP6_NF_MATCH_FRAG=m ++CONFIG_IP6_NF_MATCH_OPTS=m ++CONFIG_IP6_NF_MATCH_HL=m ++CONFIG_IP6_NF_MATCH_IPV6HEADER=m ++CONFIG_IP6_NF_MATCH_MH=m ++CONFIG_IP6_NF_MATCH_RT=m ++CONFIG_IP6_NF_TARGET_HL=m ++CONFIG_IP6_NF_FILTER=m ++CONFIG_IP6_NF_TARGET_REJECT=m ++CONFIG_IP6_NF_MANGLE=m ++CONFIG_IP6_NF_RAW=m ++CONFIG_IP6_NF_SECURITY=m ++CONFIG_BRIDGE_NF_EBTABLES=m ++CONFIG_BRIDGE_EBT_BROUTE=m ++CONFIG_BRIDGE_EBT_T_FILTER=m ++CONFIG_BRIDGE_EBT_T_NAT=m ++CONFIG_BRIDGE_EBT_802_3=m ++CONFIG_BRIDGE_EBT_AMONG=m ++CONFIG_BRIDGE_EBT_ARP=m ++CONFIG_BRIDGE_EBT_IP=m ++CONFIG_BRIDGE_EBT_IP6=m ++CONFIG_BRIDGE_EBT_LIMIT=m ++CONFIG_BRIDGE_EBT_MARK=m ++CONFIG_BRIDGE_EBT_PKTTYPE=m ++CONFIG_BRIDGE_EBT_STP=m ++CONFIG_BRIDGE_EBT_VLAN=m ++CONFIG_BRIDGE_EBT_ARPREPLY=m ++CONFIG_BRIDGE_EBT_DNAT=m ++CONFIG_BRIDGE_EBT_MARK_T=m ++CONFIG_BRIDGE_EBT_REDIRECT=m ++CONFIG_BRIDGE_EBT_SNAT=m ++CONFIG_BRIDGE_EBT_LOG=m ++CONFIG_BRIDGE_EBT_ULOG=m ++CONFIG_BRIDGE_EBT_NFLOG=m ++CONFIG_RDS=m ++CONFIG_RDS_TCP=m ++CONFIG_BRIDGE=m ++CONFIG_VLAN_8021Q=m ++CONFIG_VLAN_8021Q_GVRP=y ++CONFIG_PHONET=m ++CONFIG_NET_SCHED=y ++CONFIG_NET_SCH_CBQ=m ++CONFIG_NET_SCH_HTB=m ++CONFIG_NET_SCH_HFSC=m ++CONFIG_NET_SCH_PRIO=m ++CONFIG_NET_SCH_MULTIQ=m ++CONFIG_NET_SCH_RED=m ++CONFIG_NET_SCH_SFQ=m ++CONFIG_NET_SCH_TEQL=m ++CONFIG_NET_SCH_TBF=m ++CONFIG_NET_SCH_GRED=m ++CONFIG_NET_SCH_DSMARK=m ++CONFIG_NET_SCH_NETEM=m ++CONFIG_NET_SCH_DRR=m ++CONFIG_NET_SCH_INGRESS=m ++CONFIG_NET_CLS_BASIC=m ++CONFIG_NET_CLS_TCINDEX=m ++CONFIG_NET_CLS_ROUTE4=m ++CONFIG_NET_CLS_FW=m ++CONFIG_NET_CLS_U32=m ++CONFIG_CLS_U32_PERF=y ++CONFIG_CLS_U32_MARK=y ++CONFIG_NET_CLS_RSVP=m ++CONFIG_NET_CLS_RSVP6=m ++CONFIG_NET_CLS_FLOW=m ++CONFIG_NET_CLS_CGROUP=y ++CONFIG_NET_EMATCH=y ++CONFIG_NET_EMATCH_CMP=m ++CONFIG_NET_EMATCH_NBYTE=m ++CONFIG_NET_EMATCH_U32=m ++CONFIG_NET_EMATCH_META=m ++CONFIG_NET_EMATCH_TEXT=m ++CONFIG_NET_CLS_ACT=y ++CONFIG_NET_ACT_POLICE=m ++CONFIG_NET_ACT_GACT=m ++CONFIG_GACT_PROB=y ++CONFIG_NET_ACT_MIRRED=m ++CONFIG_NET_ACT_IPT=m ++CONFIG_NET_ACT_NAT=m ++CONFIG_NET_ACT_PEDIT=m ++CONFIG_NET_ACT_SIMP=m ++CONFIG_NET_ACT_SKBEDIT=m ++CONFIG_NET_CLS_IND=y ++CONFIG_DCB=y ++CONFIG_DNS_RESOLVER=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++CONFIG_DEVTMPFS=y ++CONFIG_DEVTMPFS_MOUNT=y ++CONFIG_CONNECTOR=y ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_CRYPTOLOOP=m ++CONFIG_BLK_DEV_SX8=m ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_SIZE=16384 ++CONFIG_ATA_OVER_ETH=m ++CONFIG_RAID_ATTRS=m ++CONFIG_BLK_DEV_SD=y ++CONFIG_SCSI_CONSTANTS=y ++CONFIG_SCSI_LOGGING=y ++CONFIG_ATA=y ++CONFIG_SATA_SIL24=y ++# CONFIG_ATA_SFF is not set ++CONFIG_MD=y ++CONFIG_BLK_DEV_MD=y ++CONFIG_MD_LINEAR=m ++CONFIG_MD_RAID0=m ++CONFIG_MD_RAID1=m 
++CONFIG_MD_RAID10=m ++CONFIG_MD_RAID456=m ++CONFIG_MD_FAULTY=m ++CONFIG_BLK_DEV_DM=m ++CONFIG_DM_DEBUG=y ++CONFIG_DM_CRYPT=m ++CONFIG_DM_SNAPSHOT=m ++CONFIG_DM_MIRROR=m ++CONFIG_DM_LOG_USERSPACE=m ++CONFIG_DM_ZERO=m ++CONFIG_DM_MULTIPATH=m ++CONFIG_DM_MULTIPATH_QL=m ++CONFIG_DM_MULTIPATH_ST=m ++CONFIG_DM_DELAY=m ++CONFIG_DM_UEVENT=y ++CONFIG_FUSION=y ++CONFIG_FUSION_SAS=y ++CONFIG_NETDEVICES=y ++CONFIG_BONDING=m ++CONFIG_DUMMY=m ++CONFIG_IFB=m ++CONFIG_MACVLAN=m ++CONFIG_MACVTAP=m ++CONFIG_NETCONSOLE=m ++CONFIG_NETCONSOLE_DYNAMIC=y ++CONFIG_TUN=y ++CONFIG_VETH=m ++CONFIG_NET_DSA_MV88E6060=y ++CONFIG_NET_DSA_MV88E6XXX=y ++# CONFIG_NET_VENDOR_3COM is not set ++CONFIG_E1000E=y ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_HW_RANDOM=y ++CONFIG_HW_RANDOM_TIMERIOMEM=m ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_WATCHDOG_NOWAYOUT=y ++# CONFIG_VGA_ARB is not set ++# CONFIG_USB_SUPPORT is not set ++CONFIG_EDAC=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_TILE=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++CONFIG_EXT2_FS_POSIX_ACL=y ++CONFIG_EXT2_FS_SECURITY=y ++CONFIG_EXT2_FS_XIP=y ++CONFIG_EXT3_FS=y ++CONFIG_EXT3_FS_POSIX_ACL=y ++CONFIG_EXT3_FS_SECURITY=y ++CONFIG_EXT4_FS=y ++CONFIG_EXT4_FS_POSIX_ACL=y ++CONFIG_EXT4_FS_SECURITY=y ++CONFIG_XFS_FS=y ++CONFIG_XFS_QUOTA=y ++CONFIG_XFS_POSIX_ACL=y ++CONFIG_GFS2_FS=m ++CONFIG_GFS2_FS_LOCKING_DLM=y ++CONFIG_BTRFS_FS=m ++CONFIG_BTRFS_FS_POSIX_ACL=y ++CONFIG_QUOTA=y ++CONFIG_QUOTA_NETLINK_INTERFACE=y ++# CONFIG_PRINT_QUOTA_WARNING is not set ++CONFIG_QFMT_V2=y ++CONFIG_AUTOFS4_FS=m ++CONFIG_FUSE_FS=y ++CONFIG_CUSE=m ++CONFIG_FSCACHE=m ++CONFIG_FSCACHE_STATS=y ++CONFIG_CACHEFILES=m ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_ZISOFS=y ++CONFIG_UDF_FS=m ++CONFIG_MSDOS_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_FAT_DEFAULT_IOCHARSET="ascii" ++CONFIG_PROC_KCORE=y ++CONFIG_TMPFS=y ++CONFIG_TMPFS_POSIX_ACL=y ++CONFIG_HUGETLBFS=y ++CONFIG_CONFIGFS_FS=m ++CONFIG_ECRYPT_FS=m ++CONFIG_CRAMFS=m ++CONFIG_SQUASHFS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3_ACL=y ++CONFIG_NFS_V4=m ++CONFIG_NFS_V4_1=y ++CONFIG_NFS_FSCACHE=y ++CONFIG_NFSD=m ++CONFIG_NFSD_V3_ACL=y ++CONFIG_NFSD_V4=y ++CONFIG_CIFS=m ++CONFIG_CIFS_STATS=y ++CONFIG_CIFS_WEAK_PW_HASH=y ++CONFIG_CIFS_UPCALL=y ++CONFIG_CIFS_XATTR=y ++CONFIG_CIFS_POSIX=y ++CONFIG_CIFS_DFS_UPCALL=y ++CONFIG_CIFS_FSCACHE=y ++CONFIG_NLS=y ++CONFIG_NLS_DEFAULT="utf8" ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_CODEPAGE_737=m ++CONFIG_NLS_CODEPAGE_775=m ++CONFIG_NLS_CODEPAGE_850=m ++CONFIG_NLS_CODEPAGE_852=m ++CONFIG_NLS_CODEPAGE_855=m ++CONFIG_NLS_CODEPAGE_857=m ++CONFIG_NLS_CODEPAGE_860=m ++CONFIG_NLS_CODEPAGE_861=m ++CONFIG_NLS_CODEPAGE_862=m ++CONFIG_NLS_CODEPAGE_863=m ++CONFIG_NLS_CODEPAGE_864=m ++CONFIG_NLS_CODEPAGE_865=m ++CONFIG_NLS_CODEPAGE_866=m ++CONFIG_NLS_CODEPAGE_869=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_CODEPAGE_950=m ++CONFIG_NLS_CODEPAGE_932=m ++CONFIG_NLS_CODEPAGE_949=m ++CONFIG_NLS_CODEPAGE_874=m ++CONFIG_NLS_ISO8859_8=m ++CONFIG_NLS_CODEPAGE_1250=m ++CONFIG_NLS_CODEPAGE_1251=m ++CONFIG_NLS_ASCII=y ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_ISO8859_2=m ++CONFIG_NLS_ISO8859_3=m ++CONFIG_NLS_ISO8859_4=m ++CONFIG_NLS_ISO8859_5=m ++CONFIG_NLS_ISO8859_6=m ++CONFIG_NLS_ISO8859_7=m ++CONFIG_NLS_ISO8859_9=m ++CONFIG_NLS_ISO8859_13=m ++CONFIG_NLS_ISO8859_14=m ++CONFIG_NLS_ISO8859_15=m ++CONFIG_NLS_KOI8_R=m 
++CONFIG_NLS_KOI8_U=m ++CONFIG_NLS_UTF8=m ++CONFIG_DLM=m ++CONFIG_DLM_DEBUG=y ++CONFIG_DYNAMIC_DEBUG=y ++CONFIG_DEBUG_INFO=y ++CONFIG_DEBUG_INFO_REDUCED=y ++# CONFIG_ENABLE_WARN_DEPRECATED is not set ++CONFIG_FRAME_WARN=2048 ++CONFIG_STRIP_ASM_SYMS=y ++CONFIG_DEBUG_FS=y ++CONFIG_HEADERS_CHECK=y ++# CONFIG_FRAME_POINTER is not set ++CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y ++CONFIG_MAGIC_SYSRQ=y ++CONFIG_DEBUG_VM=y ++CONFIG_DEBUG_MEMORY_INIT=y ++CONFIG_DEBUG_STACKOVERFLOW=y ++CONFIG_LOCKUP_DETECTOR=y ++CONFIG_SCHEDSTATS=y ++CONFIG_TIMER_STATS=y ++CONFIG_DEBUG_LIST=y ++CONFIG_DEBUG_CREDENTIALS=y ++CONFIG_RCU_CPU_STALL_TIMEOUT=60 ++CONFIG_ASYNC_RAID6_TEST=m ++CONFIG_SECURITY=y ++CONFIG_SECURITYFS=y ++CONFIG_SECURITY_NETWORK=y ++CONFIG_SECURITY_NETWORK_XFRM=y ++CONFIG_SECURITY_SELINUX=y ++CONFIG_SECURITY_SELINUX_BOOTPARAM=y ++CONFIG_SECURITY_SELINUX_DISABLE=y ++CONFIG_CRYPTO_PCRYPT=m ++CONFIG_CRYPTO_CRYPTD=m ++CONFIG_CRYPTO_TEST=m ++CONFIG_CRYPTO_CCM=m ++CONFIG_CRYPTO_GCM=m ++CONFIG_CRYPTO_CTS=m ++CONFIG_CRYPTO_LRW=m ++CONFIG_CRYPTO_PCBC=m ++CONFIG_CRYPTO_XTS=m ++CONFIG_CRYPTO_HMAC=y ++CONFIG_CRYPTO_XCBC=m ++CONFIG_CRYPTO_VMAC=m ++CONFIG_CRYPTO_MICHAEL_MIC=m ++CONFIG_CRYPTO_RMD128=m ++CONFIG_CRYPTO_RMD160=m ++CONFIG_CRYPTO_RMD256=m ++CONFIG_CRYPTO_RMD320=m ++CONFIG_CRYPTO_SHA1=y ++CONFIG_CRYPTO_SHA512=m ++CONFIG_CRYPTO_TGR192=m ++CONFIG_CRYPTO_WP512=m ++CONFIG_CRYPTO_ANUBIS=m ++CONFIG_CRYPTO_BLOWFISH=m ++CONFIG_CRYPTO_CAMELLIA=m ++CONFIG_CRYPTO_CAST5=m ++CONFIG_CRYPTO_CAST6=m ++CONFIG_CRYPTO_FCRYPT=m ++CONFIG_CRYPTO_KHAZAD=m ++CONFIG_CRYPTO_SEED=m ++CONFIG_CRYPTO_SERPENT=m ++CONFIG_CRYPTO_TEA=m ++CONFIG_CRYPTO_TWOFISH=m ++CONFIG_CRYPTO_LZO=m ++CONFIG_CRC_CCITT=m ++CONFIG_CRC7=m +diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig +index 0eb9f92f3717..e5890ae917e5 100644 +--- a/arch/x86/configs/i386_defconfig ++++ b/arch/x86/configs/i386_defconfig +@@ -41,7 +41,7 @@ CONFIG_SMP=y + CONFIG_X86_GENERIC=y + CONFIG_HPET_TIMER=y + CONFIG_SCHED_SMT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y + CONFIG_X86_MCE=y + CONFIG_X86_REBOOTFIXUPS=y +diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig +index e32fc1f274d8..4368ba4f7967 100644 +--- a/arch/x86/configs/x86_64_defconfig ++++ b/arch/x86/configs/x86_64_defconfig +@@ -40,7 +40,7 @@ CONFIG_SMP=y + CONFIG_CALGARY_IOMMU=y + CONFIG_NR_CPUS=64 + CONFIG_SCHED_SMT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y + CONFIG_X86_MCE=y + CONFIG_MICROCODE=y +diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt +index cd1655122ec0..9cf10230d5fb 100644 +--- a/kernel/Kconfig.preempt ++++ b/kernel/Kconfig.preempt +@@ -1,7 +1,7 @@ + + choice + prompt "Preemption Model" +- default PREEMPT_NONE ++ default PREEMPT + + config PREEMPT_NONE + bool "No Forced Preemption (Server)" +@@ -17,7 +17,7 @@ config PREEMPT_NONE + latencies. + + config PREEMPT_VOLUNTARY +- bool "Voluntary Kernel Preemption (Desktop)" ++ bool "Voluntary Kernel Preemption (Nothing)" + depends on !ARCH_NO_PREEMPT + help + This option reduces the latency of the kernel by adding more +@@ -32,7 +32,8 @@ config PREEMPT_VOLUNTARY + applications to run more 'smoothly' even when the system is + under load. + +- Select this if you are building a kernel for a desktop system. ++ Select this for no system in particular (choose Preemptible ++ instead on a desktop if you know what's good for you). 
+ + config PREEMPT + bool "Preemptible Kernel (Low-Latency Desktop)" +@@ -57,4 +58,4 @@ config PREEMPT + endchoice + + config PREEMPT_COUNT +- bool +\ No newline at end of file ++ bool +-- +2.17.1 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-0004-Expose-vmsplit-for-our-poor-32-bit-users.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-0004-Expose-vmsplit-for-our-poor-32-bit-users.patch new file mode 100644 index 00000000..0aaae235 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-0004-Expose-vmsplit-for-our-poor-32-bit-users.patch @@ -0,0 +1,48 @@ +From d67d0504370871bea9e73c69c840fb3d0a88d9cb Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Fri, 12 May 2017 13:07:37 +1000 +Subject: [PATCH 04/16] Expose vmsplit for our poor 32 bit users. + +--- + arch/x86/Kconfig | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 88bfccabcf3b..cfa268364ec7 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1495,7 +1495,7 @@ config HIGHMEM64G + endchoice + + choice +- prompt "Memory split" if EXPERT ++ prompt "Memory split" + default VMSPLIT_3G + depends on X86_32 + ---help--- +@@ -1515,17 +1515,17 @@ choice + option alone! + + config VMSPLIT_3G +- bool "3G/1G user/kernel split" ++ bool "Default 896MB lowmem (3G/1G user/kernel split)" + config VMSPLIT_3G_OPT + depends on !X86_PAE +- bool "3G/1G user/kernel split (for full 1G low memory)" ++ bool "1GB lowmem (3G/1G user/kernel split)" + config VMSPLIT_2G +- bool "2G/2G user/kernel split" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_2G_OPT + depends on !X86_PAE +- bool "2G/2G user/kernel split (for full 2G low memory)" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_1G +- bool "1G/3G user/kernel split" ++ bool "3GB lowmem (1G/3G user/kernel split)" + endchoice + + config PAGE_OFFSET +-- +2.17.1 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-0005-Create-highres-timeout-variants-of-schedule_timeout-.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-0005-Create-highres-timeout-variants-of-schedule_timeout-.patch new file mode 100644 index 00000000..500dd852 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-0005-Create-highres-timeout-variants-of-schedule_timeout-.patch @@ -0,0 +1,153 @@ +From 552f25751a108c7e185b82aa3110d43bfe1e59b1 Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Sat, 12 Aug 2017 11:53:39 +1000 +Subject: [PATCH 05/16] Create highres timeout variants of schedule_timeout + functions. 
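+
+[Editor's note, not part of the original patch: a minimal sketch of how the
+helpers introduced below are intended to be called, based on the conversions
+this series itself performs — the loop mirrors the msleep() rework in patch
+07, and the set_current_state() + schedule_min_hrtimeout() pair mirrors the
+swim and ucb1x00 driver hunks in patch 08. Any surrounding driver context is
+hypothetical.]
+
+	/* Sleep for msecs, resuming if woken early; falls back to regular
+	 * tick-based timers when hrtimer resolution is no better than Hz. */
+	while (msecs)
+		msecs = schedule_msec_hrtimeout_uninterruptible(msecs);
+
+	/* Shortest reliable sleep (~1ms) regardless of CONFIG_HZ; as with
+	 * schedule_timeout(), the caller sets the task state first. */
+	set_current_state(TASK_INTERRUPTIBLE);
+	schedule_min_hrtimeout();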
+ +--- + include/linux/freezer.h | 1 + + include/linux/sched.h | 31 ++++++++++++++++-- + kernel/time/hrtimer.c | 71 +++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 101 insertions(+), 2 deletions(-) + +diff --git a/include/linux/freezer.h b/include/linux/freezer.h +index 21f5aa0b217f..ee9b46394fdf 100644 +--- a/include/linux/freezer.h ++++ b/include/linux/freezer.h +@@ -297,6 +297,7 @@ static inline void set_freezable(void) {} + #define wait_event_freezekillable_unsafe(wq, condition) \ + wait_event_killable(wq, condition) + ++#define pm_freezing (false) + #endif /* !CONFIG_FREEZER */ + + #endif /* FREEZER_H_INCLUDED */ +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 1f8cea1436b4..1cd022304c64 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -211,13 +211,40 @@ struct task_group; + + extern void scheduler_tick(void); + +-#define MAX_SCHEDULE_TIMEOUT LONG_MAX +- ++#define MAX_SCHEDULE_TIMEOUT LONG_MAX + extern long schedule_timeout(long timeout); + extern long schedule_timeout_interruptible(long timeout); + extern long schedule_timeout_killable(long timeout); + extern long schedule_timeout_uninterruptible(long timeout); + extern long schedule_timeout_idle(long timeout); ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++extern long schedule_msec_hrtimeout(long timeout); ++extern long schedule_min_hrtimeout(void); ++extern long schedule_msec_hrtimeout_interruptible(long timeout); ++extern long schedule_msec_hrtimeout_uninterruptible(long timeout); ++#else ++static inline long schedule_msec_hrtimeout(long timeout) ++{ ++ return schedule_timeout(msecs_to_jiffies(timeout)); ++} ++ ++static inline long schedule_min_hrtimeout(void) ++{ ++ return schedule_timeout(1); ++} ++ ++static inline long schedule_msec_hrtimeout_interruptible(long timeout) ++{ ++ return schedule_timeout_interruptible(msecs_to_jiffies(timeout)); ++} ++ ++static inline long schedule_msec_hrtimeout_uninterruptible(long timeout) ++{ ++ return schedule_timeout_uninterruptible(msecs_to_jiffies(timeout)); ++} ++#endif ++ + asmlinkage void schedule(void); + extern void schedule_preempt_disabled(void); + +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index e1a549c9e399..12735724cce4 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -2026,3 +2026,74 @@ int __sched schedule_hrtimeout(ktime_t *expires, + return schedule_hrtimeout_range(expires, 0, mode); + } + EXPORT_SYMBOL_GPL(schedule_hrtimeout); ++ ++/* ++ * As per schedule_hrtimeout but taskes a millisecond value and returns how ++ * many milliseconds are left. ++ */ ++long __sched schedule_msec_hrtimeout(long timeout) ++{ ++ struct hrtimer_sleeper t; ++ int delta, secs, jiffs; ++ ktime_t expires; ++ ++ if (!timeout) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ jiffs = msecs_to_jiffies(timeout); ++ /* ++ * If regular timer resolution is adequate or hrtimer resolution is not ++ * (yet) better than Hz, as would occur during startup, use regular ++ * timers. 
++ */ ++ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ) ++ return schedule_timeout(jiffs); ++ ++ secs = timeout / 1000; ++ delta = (timeout % 1000) * NSEC_PER_MSEC; ++ expires = ktime_set(secs, delta); ++ ++ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ hrtimer_set_expires_range_ns(&t.timer, expires, delta); ++ ++ hrtimer_init_sleeper(&t, current); ++ ++ hrtimer_start_expires(&t.timer, HRTIMER_MODE_REL); ++ ++ if (likely(t.task)) ++ schedule(); ++ ++ hrtimer_cancel(&t.timer); ++ destroy_hrtimer_on_stack(&t.timer); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ expires = hrtimer_expires_remaining(&t.timer); ++ timeout = ktime_to_ms(expires); ++ return timeout < 0 ? 0 : timeout; ++} ++ ++EXPORT_SYMBOL(schedule_msec_hrtimeout); ++ ++long __sched schedule_min_hrtimeout(void) ++{ ++ return schedule_msec_hrtimeout(1); ++} ++ ++EXPORT_SYMBOL(schedule_min_hrtimeout); ++ ++long __sched schedule_msec_hrtimeout_interruptible(long timeout) ++{ ++ __set_current_state(TASK_INTERRUPTIBLE); ++ return schedule_msec_hrtimeout(timeout); ++} ++EXPORT_SYMBOL(schedule_msec_hrtimeout_interruptible); ++ ++long __sched schedule_msec_hrtimeout_uninterruptible(long timeout) ++{ ++ __set_current_state(TASK_UNINTERRUPTIBLE); ++ return schedule_msec_hrtimeout(timeout); ++} ++EXPORT_SYMBOL(schedule_msec_hrtimeout_uninterruptible); +-- +2.17.1 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-0006-Special-case-calls-of-schedule_timeout-1-to-use-the-.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-0006-Special-case-calls-of-schedule_timeout-1-to-use-the-.patch new file mode 100644 index 00000000..aa348475 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-0006-Special-case-calls-of-schedule_timeout-1-to-use-the-.patch @@ -0,0 +1,49 @@ +From 65ea992f1da66b8c0a5554776d1350417b9107cb Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Sat, 5 Nov 2016 09:27:36 +1100 +Subject: [PATCH 06/16] Special case calls of schedule_timeout(1) to use the + min hrtimeout of 1ms, working around low Hz resolutions. + +--- + kernel/time/timer.c | 16 ++++++++++++++-- + 1 file changed, 14 insertions(+), 2 deletions(-) + +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index 981eaddff95b..ae942d459aa2 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1801,6 +1801,18 @@ signed long __sched schedule_timeout(signed long timeout) + + expire = timeout + jiffies; + ++#ifdef CONFIG_HIGH_RES_TIMERS ++ if (timeout == 1 && hrtimer_resolution < NSEC_PER_SEC / HZ) { ++ /* ++ * Special case 1 as being a request for the minimum timeout ++ * and use highres timers to timeout after 1ms to workaround ++ * the granularity of low Hz tick timers. ++ */ ++ if (!schedule_min_hrtimeout()) ++ return 0; ++ goto out_timeout; ++ } ++#endif + timer.task = current; + timer_setup_on_stack(&timer.timer, process_timeout, 0); + __mod_timer(&timer.timer, expire, 0); +@@ -1809,10 +1821,10 @@ signed long __sched schedule_timeout(signed long timeout) + + /* Remove the timer from the object tracker */ + destroy_timer_on_stack(&timer.timer); +- ++out_timeout: + timeout = expire - jiffies; + +- out: ++out: + return timeout < 0 ? 
0 : timeout; + } + EXPORT_SYMBOL(schedule_timeout); +-- +2.17.1 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-0007-Convert-msleep-to-use-hrtimers-when-active.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-0007-Convert-msleep-to-use-hrtimers-when-active.patch new file mode 100644 index 00000000..e9edcd12 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-0007-Convert-msleep-to-use-hrtimers-when-active.patch @@ -0,0 +1,54 @@ +From 7b74daf29a88f3314af306509bd40d45c34f11c7 Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Fri, 4 Nov 2016 09:25:54 +1100 +Subject: [PATCH 07/16] Convert msleep to use hrtimers when active. + +--- + kernel/time/timer.c | 24 ++++++++++++++++++++++-- + 1 file changed, 22 insertions(+), 2 deletions(-) + +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index ae942d459aa2..542c13d98950 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1965,7 +1965,19 @@ void __init init_timers(void) + */ + void msleep(unsigned int msecs) + { +- unsigned long timeout = msecs_to_jiffies(msecs) + 1; ++ int jiffs = msecs_to_jiffies(msecs); ++ unsigned long timeout; ++ ++ /* ++ * Use high resolution timers where the resolution of tick based ++ * timers is inadequate. ++ */ ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ) { ++ while (msecs) ++ msecs = schedule_msec_hrtimeout_uninterruptible(msecs); ++ return; ++ } ++ timeout = msecs_to_jiffies(msecs) + 1; + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +@@ -1979,7 +1991,15 @@ EXPORT_SYMBOL(msleep); + */ + unsigned long msleep_interruptible(unsigned int msecs) + { +- unsigned long timeout = msecs_to_jiffies(msecs) + 1; ++ int jiffs = msecs_to_jiffies(msecs); ++ unsigned long timeout; ++ ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ) { ++ while (msecs && !signal_pending(current)) ++ msecs = schedule_msec_hrtimeout_interruptible(msecs); ++ return msecs; ++ } ++ timeout = msecs_to_jiffies(msecs) + 1; + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); +-- +2.17.1 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-0008-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-0008-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch new file mode 100644 index 00000000..f4b441d8 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-0008-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch @@ -0,0 +1,1009 @@ +From 8a679ba5279cbff1a8e4c47b55ac4bd6d66289f8 Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Mon, 20 Feb 2017 13:28:30 +1100 +Subject: [PATCH 08/16] Replace all schedule timeout(1) with + schedule_min_hrtimeout() + +--- + drivers/block/swim.c | 6 +- + drivers/bluetooth/hci_qca.c | 2 +- + drivers/char/ipmi/ipmi_msghandler.c | 2 +- + drivers/char/ipmi/ipmi_ssif.c | 2 +- + drivers/char/snsc.c | 4 +- + drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c | 2 +- + drivers/gpu/drm/vmwgfx/vmwgfx_irq.c | 2 +- + drivers/media/pci/ivtv/ivtv-ioctl.c | 2 +- + drivers/media/pci/ivtv/ivtv-streams.c | 2 +- + drivers/mfd/ucb1x00-core.c | 2 +- + drivers/misc/sgi-xp/xpc_channel.c | 2 +- + drivers/net/caif/caif_hsi.c | 2 +- + drivers/net/can/usb/peak_usb/pcan_usb.c | 2 +- + drivers/net/usb/lan78xx.c | 2 +- + drivers/net/usb/usbnet.c | 2 +- + drivers/scsi/fnic/fnic_scsi.c | 4 +- + drivers/scsi/snic/snic_scsi.c | 2 +- + .../staging/comedi/drivers/ni_mio_common.c | 2 +- + drivers/staging/lustre/lnet/lnet/lib-eq.c | 426 
++++++++++++++++++ + drivers/staging/rts5208/rtsx.c | 2 +- + drivers/staging/speakup/speakup_acntpc.c | 4 +- + drivers/staging/speakup/speakup_apollo.c | 2 +- + drivers/staging/speakup/speakup_decext.c | 2 +- + drivers/staging/speakup/speakup_decpc.c | 2 +- + drivers/staging/speakup/speakup_dectlk.c | 2 +- + drivers/staging/speakup/speakup_dtlk.c | 4 +- + drivers/staging/speakup/speakup_keypc.c | 4 +- + drivers/staging/speakup/synth.c | 14 +- + .../staging/unisys/visornic/visornic_main.c | 6 +- + drivers/video/fbdev/omap/hwa742.c | 2 +- + drivers/video/fbdev/pxafb.c | 2 +- + fs/btrfs/extent-tree.c | 2 +- + fs/btrfs/inode-map.c | 2 +- + sound/usb/line6/pcm.c | 2 +- + 34 files changed, 471 insertions(+), 51 deletions(-) + create mode 100644 drivers/staging/lustre/lnet/lnet/lib-eq.c + +diff --git a/drivers/block/swim.c b/drivers/block/swim.c +index 0e31884a9519..16fcfbde31d5 100644 +--- a/drivers/block/swim.c ++++ b/drivers/block/swim.c +@@ -332,7 +332,7 @@ static inline void swim_motor(struct swim __iomem *base, + if (swim_readbit(base, MOTOR_ON)) + break; + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + } else if (action == OFF) { + swim_action(base, MOTOR_OFF); +@@ -351,7 +351,7 @@ static inline void swim_eject(struct swim __iomem *base) + if (!swim_readbit(base, DISK_IN)) + break; + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + swim_select(base, RELAX); + } +@@ -375,7 +375,7 @@ static inline int swim_step(struct swim __iomem *base) + for (wait = 0; wait < HZ; wait++) { + + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + + swim_select(base, RELAX); + if (!swim_readbit(base, STEP)) +diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c +index 2fee65886d50..4ca0bae3df58 100644 +--- a/drivers/bluetooth/hci_qca.c ++++ b/drivers/bluetooth/hci_qca.c +@@ -980,7 +980,7 @@ static int qca_set_baudrate(struct hci_dev *hdev, uint8_t baudrate) + * then host can communicate with new baudrate to controller + */ + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(BAUDRATE_SETTLE_TIMEOUT_MS)); ++ schedule_msec_hrtimeout((BAUDRATE_SETTLE_TIMEOUT_MS)); + set_current_state(TASK_RUNNING); + + if (qcadev->btsoc_type == QCA_WCN3990) +diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c +index 7fc9612070a1..5a7f8a879001 100644 +--- a/drivers/char/ipmi/ipmi_msghandler.c ++++ b/drivers/char/ipmi/ipmi_msghandler.c +@@ -3453,7 +3453,7 @@ static void cleanup_smi_msgs(struct ipmi_smi *intf) + /* Current message first, to preserve order */ + while (intf->curr_msg && !list_empty(&intf->waiting_rcv_msgs)) { + /* Wait for the message to clear out. */ +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + + /* No need for locks, the interface is down. */ +diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c +index 29e67a80fb20..73bd0eca5fe5 100644 +--- a/drivers/char/ipmi/ipmi_ssif.c ++++ b/drivers/char/ipmi/ipmi_ssif.c +@@ -1208,7 +1208,7 @@ static void shutdown_ssif(void *send_info) + + /* make sure the driver is not looking for flags any more. 
*/ + while (ssif_info->ssif_state != SSIF_NORMAL) +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + + ssif_info->stopping = true; + del_timer_sync(&ssif_info->retry_timer); +diff --git a/drivers/char/snsc.c b/drivers/char/snsc.c +index 5918ea7499bb..5228e78df804 100644 +--- a/drivers/char/snsc.c ++++ b/drivers/char/snsc.c +@@ -198,7 +198,7 @@ scdrv_read(struct file *file, char __user *buf, size_t count, loff_t *f_pos) + add_wait_queue(&sd->sd_rq, &wait); + spin_unlock_irqrestore(&sd->sd_rlock, flags); + +- schedule_timeout(msecs_to_jiffies(SCDRV_TIMEOUT)); ++ schedule_msec_hrtimeout((SCDRV_TIMEOUT)); + + remove_wait_queue(&sd->sd_rq, &wait); + if (signal_pending(current)) { +@@ -294,7 +294,7 @@ scdrv_write(struct file *file, const char __user *buf, + add_wait_queue(&sd->sd_wq, &wait); + spin_unlock_irqrestore(&sd->sd_wlock, flags); + +- schedule_timeout(msecs_to_jiffies(SCDRV_TIMEOUT)); ++ schedule_msec_hrtimeout((SCDRV_TIMEOUT)); + + remove_wait_queue(&sd->sd_wq, &wait); + if (signal_pending(current)) { +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +index d0fd147ef75f..730ae4fe6b85 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +@@ -235,7 +235,7 @@ static int vmw_fifo_wait_noirq(struct vmw_private *dev_priv, + DRM_ERROR("SVGA device lockup.\n"); + break; + } +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + if (interruptible && signal_pending(current)) { + ret = -ERESTARTSYS; + break; +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c +index c3ad4478266b..7e2a29d56459 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c +@@ -202,7 +202,7 @@ int vmw_fallback_wait(struct vmw_private *dev_priv, + break; + } + if (lazy) +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + else if ((++count & 0x0F) == 0) { + /** + * FIXME: Use schedule_hr_timeout here for +diff --git a/drivers/media/pci/ivtv/ivtv-ioctl.c b/drivers/media/pci/ivtv/ivtv-ioctl.c +index 4cdc6d2be85d..22c0803cbff3 100644 +--- a/drivers/media/pci/ivtv/ivtv-ioctl.c ++++ b/drivers/media/pci/ivtv/ivtv-ioctl.c +@@ -1154,7 +1154,7 @@ void ivtv_s_std_dec(struct ivtv *itv, v4l2_std_id std) + TASK_UNINTERRUPTIBLE); + if ((read_reg(IVTV_REG_DEC_LINE_FIELD) >> 16) < 100) + break; +- schedule_timeout(msecs_to_jiffies(25)); ++ schedule_msec_hrtimeout((25)); + } + finish_wait(&itv->vsync_waitq, &wait); + mutex_lock(&itv->serialize_lock); +diff --git a/drivers/media/pci/ivtv/ivtv-streams.c b/drivers/media/pci/ivtv/ivtv-streams.c +index d27c6df97566..e9ffc4eeb478 100644 +--- a/drivers/media/pci/ivtv/ivtv-streams.c ++++ b/drivers/media/pci/ivtv/ivtv-streams.c +@@ -834,7 +834,7 @@ int ivtv_stop_v4l2_encode_stream(struct ivtv_stream *s, int gop_end) + while (!test_bit(IVTV_F_I_EOS, &itv->i_flags) && + time_before(jiffies, + then + msecs_to_jiffies(2000))) { +- schedule_timeout(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout((10)); + } + + /* To convert jiffies to ms, we must multiply by 1000 +diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c +index d6fb2e1a759a..7ac951b84beb 100644 +--- a/drivers/mfd/ucb1x00-core.c ++++ b/drivers/mfd/ucb1x00-core.c +@@ -253,7 +253,7 @@ unsigned int ucb1x00_adc_read(struct ucb1x00 *ucb, int adc_channel, int sync) + break; + /* yield to other processes */ + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + + return UCB_ADC_DAT(val); +diff --git 
a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c +index 05a890ce2ab8..f6eb97bc3a2c 100644 +--- a/drivers/misc/sgi-xp/xpc_channel.c ++++ b/drivers/misc/sgi-xp/xpc_channel.c +@@ -834,7 +834,7 @@ xpc_allocate_msg_wait(struct xpc_channel *ch) + + atomic_inc(&ch->n_on_msg_allocate_wq); + prepare_to_wait(&ch->msg_allocate_wq, &wait, TASK_INTERRUPTIBLE); +- ret = schedule_timeout(1); ++ ret = schedule_min_hrtimeout(); + finish_wait(&ch->msg_allocate_wq, &wait); + atomic_dec(&ch->n_on_msg_allocate_wq); + +diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c +index 433a14b9f731..4d197a99472b 100644 +--- a/drivers/net/caif/caif_hsi.c ++++ b/drivers/net/caif/caif_hsi.c +@@ -939,7 +939,7 @@ static void cfhsi_wake_down(struct work_struct *work) + break; + + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + retry--; + } + +diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c +index 13238a72a338..fc51ae55c63f 100644 +--- a/drivers/net/can/usb/peak_usb/pcan_usb.c ++++ b/drivers/net/can/usb/peak_usb/pcan_usb.c +@@ -250,7 +250,7 @@ static int pcan_usb_write_mode(struct peak_usb_device *dev, u8 onoff) + } else { + /* the PCAN-USB needs time to init */ + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(PCAN_USB_STARTUP_TIMEOUT)); ++ schedule_msec_hrtimeout((PCAN_USB_STARTUP_TIMEOUT)); + } + + return err; +diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c +index c3c9ba44e2a1..1bc66289699f 100644 +--- a/drivers/net/usb/lan78xx.c ++++ b/drivers/net/usb/lan78xx.c +@@ -2681,7 +2681,7 @@ static void lan78xx_terminate_urbs(struct lan78xx_net *dev) + while (!skb_queue_empty(&dev->rxq) && + !skb_queue_empty(&dev->txq) && + !skb_queue_empty(&dev->done)) { +- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); ++ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); + set_current_state(TASK_UNINTERRUPTIBLE); + netif_dbg(dev, ifdown, dev->net, + "waited for %d urb completions\n", temp); +diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c +index 770aa624147f..9384de186bf9 100644 +--- a/drivers/net/usb/usbnet.c ++++ b/drivers/net/usb/usbnet.c +@@ -770,7 +770,7 @@ static void wait_skb_queue_empty(struct sk_buff_head *q) + spin_lock_irqsave(&q->lock, flags); + while (!skb_queue_empty(q)) { + spin_unlock_irqrestore(&q->lock, flags); +- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); ++ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); + set_current_state(TASK_UNINTERRUPTIBLE); + spin_lock_irqsave(&q->lock, flags); + } +diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c +index 8cbd3c9f0b4c..7e3f9baa4ac6 100644 +--- a/drivers/scsi/fnic/fnic_scsi.c ++++ b/drivers/scsi/fnic/fnic_scsi.c +@@ -217,7 +217,7 @@ int fnic_fw_reset_handler(struct fnic *fnic) + + /* wait for io cmpl */ + while (atomic_read(&fnic->in_flight)) +- schedule_timeout(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout((1)); + + spin_lock_irqsave(&fnic->wq_copy_lock[0], flags); + +@@ -2255,7 +2255,7 @@ static int fnic_clean_pending_aborts(struct fnic *fnic, + } + } + +- schedule_timeout(msecs_to_jiffies(2 * fnic->config.ed_tov)); ++ schedule_msec_hrtimeout((2 * fnic->config.ed_tov)); + + /* walk again to check, if IOs are still pending in fw */ + if (fnic_is_abts_pending(fnic, lr_sc)) +diff --git a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c +index d9b2e46424aa..4a313a0f2039 100644 +--- a/drivers/scsi/snic/snic_scsi.c ++++ 
b/drivers/scsi/snic/snic_scsi.c +@@ -2354,7 +2354,7 @@ snic_reset(struct Scsi_Host *shost, struct scsi_cmnd *sc) + + /* Wait for all the IOs that are entered in Qcmd */ + while (atomic_read(&snic->ios_inflight)) +- schedule_timeout(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout((1)); + + ret = snic_issue_hba_reset(snic, sc); + if (ret) { +diff --git a/drivers/staging/comedi/drivers/ni_mio_common.c b/drivers/staging/comedi/drivers/ni_mio_common.c +index 4dee2fc37aed..2bb1c1157636 100644 +--- a/drivers/staging/comedi/drivers/ni_mio_common.c ++++ b/drivers/staging/comedi/drivers/ni_mio_common.c +@@ -4650,7 +4650,7 @@ static int cs5529_wait_for_idle(struct comedi_device *dev) + if ((status & NI67XX_CAL_STATUS_BUSY) == 0) + break; + set_current_state(TASK_INTERRUPTIBLE); +- if (schedule_timeout(1)) ++ if (schedule_min_hrtimeout()) + return -EIO; + } + if (i == timeout) { +diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c +new file mode 100644 +index 000000000000..8cca151741b2 +--- /dev/null ++++ b/drivers/staging/lustre/lnet/lnet/lib-eq.c +@@ -0,0 +1,426 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * GPL HEADER START ++ * ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 only, ++ * as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License version 2 for more details (a copy is included ++ * in the LICENSE file that accompanied this code). ++ * ++ * You should have received a copy of the GNU General Public License ++ * version 2 along with this program; If not, see ++ * http://www.gnu.org/licenses/gpl-2.0.html ++ * ++ * GPL HEADER END ++ */ ++/* ++ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Use is subject to license terms. ++ * ++ * Copyright (c) 2012, Intel Corporation. ++ */ ++/* ++ * This file is part of Lustre, http://www.lustre.org/ ++ * Lustre is a trademark of Sun Microsystems, Inc. ++ * ++ * lnet/lnet/lib-eq.c ++ * ++ * Library level Event queue management routines ++ */ ++ ++#define DEBUG_SUBSYSTEM S_LNET ++ ++#include ++ ++/** ++ * Create an event queue that has room for \a count number of events. ++ * ++ * The event queue is circular and older events will be overwritten by new ++ * ones if they are not removed in time by the user using the functions ++ * LNetEQGet(), LNetEQWait(), or LNetEQPoll(). It is up to the user to ++ * determine the appropriate size of the event queue to prevent this loss ++ * of events. Note that when EQ handler is specified in \a callback, no ++ * event loss can happen, since the handler is run for each event deposited ++ * into the EQ. ++ * ++ * \param count The number of events to be stored in the event queue. It ++ * will be rounded up to the next power of two. ++ * \param callback A handler function that runs when an event is deposited ++ * into the EQ. The constant value LNET_EQ_HANDLER_NONE can be used to ++ * indicate that no event handler is desired. ++ * \param handle On successful return, this location will hold a handle for ++ * the newly created EQ. ++ * ++ * \retval 0 On success. ++ * \retval -EINVAL If an parameter is not valid. 
++ * \retval -ENOMEM If memory for the EQ can't be allocated. ++ * ++ * \see lnet_eq_handler_t for the discussion on EQ handler semantics. ++ */ ++int ++LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback, ++ struct lnet_handle_eq *handle) ++{ ++ struct lnet_eq *eq; ++ ++ LASSERT(the_lnet.ln_refcount > 0); ++ ++ /* ++ * We need count to be a power of 2 so that when eq_{enq,deq}_seq ++ * overflow, they don't skip entries, so the queue has the same ++ * apparent capacity at all times ++ */ ++ if (count) ++ count = roundup_pow_of_two(count); ++ ++ if (callback != LNET_EQ_HANDLER_NONE && count) ++ CWARN("EQ callback is guaranteed to get every event, do you still want to set eqcount %d for polling event which will have locking overhead? Please contact with developer to confirm\n", count); ++ ++ /* ++ * count can be 0 if only need callback, we can eliminate ++ * overhead of enqueue event ++ */ ++ if (!count && callback == LNET_EQ_HANDLER_NONE) ++ return -EINVAL; ++ ++ eq = kzalloc(sizeof(*eq), GFP_NOFS); ++ if (!eq) ++ return -ENOMEM; ++ ++ if (count) { ++ eq->eq_events = kvmalloc_array(count, sizeof(struct lnet_event), ++ GFP_KERNEL | __GFP_ZERO); ++ if (!eq->eq_events) ++ goto failed; ++ /* ++ * NB allocator has set all event sequence numbers to 0, ++ * so all them should be earlier than eq_deq_seq ++ */ ++ } ++ ++ eq->eq_deq_seq = 1; ++ eq->eq_enq_seq = 1; ++ eq->eq_size = count; ++ eq->eq_callback = callback; ++ ++ eq->eq_refs = cfs_percpt_alloc(lnet_cpt_table(), ++ sizeof(*eq->eq_refs[0])); ++ if (!eq->eq_refs) ++ goto failed; ++ ++ /* MUST hold both exclusive lnet_res_lock */ ++ lnet_res_lock(LNET_LOCK_EX); ++ /* ++ * NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do ++ * both EQ lookup and poll event with only lnet_eq_wait_lock ++ */ ++ lnet_eq_wait_lock(); ++ ++ lnet_res_lh_initialize(&the_lnet.ln_eq_container, &eq->eq_lh); ++ list_add(&eq->eq_list, &the_lnet.ln_eq_container.rec_active); ++ ++ lnet_eq_wait_unlock(); ++ lnet_res_unlock(LNET_LOCK_EX); ++ ++ lnet_eq2handle(handle, eq); ++ return 0; ++ ++failed: ++ kvfree(eq->eq_events); ++ ++ if (eq->eq_refs) ++ cfs_percpt_free(eq->eq_refs); ++ ++ kfree(eq); ++ return -ENOMEM; ++} ++EXPORT_SYMBOL(LNetEQAlloc); ++ ++/** ++ * Release the resources associated with an event queue if it's idle; ++ * otherwise do nothing and it's up to the user to try again. ++ * ++ * \param eqh A handle for the event queue to be released. ++ * ++ * \retval 0 If the EQ is not in use and freed. ++ * \retval -ENOENT If \a eqh does not point to a valid EQ. ++ * \retval -EBUSY If the EQ is still in use by some MDs. 
++ */ ++int ++LNetEQFree(struct lnet_handle_eq eqh) ++{ ++ struct lnet_eq *eq; ++ struct lnet_event *events = NULL; ++ int **refs = NULL; ++ int *ref; ++ int rc = 0; ++ int size = 0; ++ int i; ++ ++ LASSERT(the_lnet.ln_refcount > 0); ++ ++ lnet_res_lock(LNET_LOCK_EX); ++ /* ++ * NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do ++ * both EQ lookup and poll event with only lnet_eq_wait_lock ++ */ ++ lnet_eq_wait_lock(); ++ ++ eq = lnet_handle2eq(&eqh); ++ if (!eq) { ++ rc = -ENOENT; ++ goto out; ++ } ++ ++ cfs_percpt_for_each(ref, i, eq->eq_refs) { ++ LASSERT(*ref >= 0); ++ if (!*ref) ++ continue; ++ ++ CDEBUG(D_NET, "Event equeue (%d: %d) busy on destroy.\n", ++ i, *ref); ++ rc = -EBUSY; ++ goto out; ++ } ++ ++ /* stash for free after lock dropped */ ++ events = eq->eq_events; ++ size = eq->eq_size; ++ refs = eq->eq_refs; ++ ++ lnet_res_lh_invalidate(&eq->eq_lh); ++ list_del(&eq->eq_list); ++ kfree(eq); ++ out: ++ lnet_eq_wait_unlock(); ++ lnet_res_unlock(LNET_LOCK_EX); ++ ++ kvfree(events); ++ if (refs) ++ cfs_percpt_free(refs); ++ ++ return rc; ++} ++EXPORT_SYMBOL(LNetEQFree); ++ ++void ++lnet_eq_enqueue_event(struct lnet_eq *eq, struct lnet_event *ev) ++{ ++ /* MUST called with resource lock hold but w/o lnet_eq_wait_lock */ ++ int index; ++ ++ if (!eq->eq_size) { ++ LASSERT(eq->eq_callback != LNET_EQ_HANDLER_NONE); ++ eq->eq_callback(ev); ++ return; ++ } ++ ++ lnet_eq_wait_lock(); ++ ev->sequence = eq->eq_enq_seq++; ++ ++ LASSERT(eq->eq_size == LOWEST_BIT_SET(eq->eq_size)); ++ index = ev->sequence & (eq->eq_size - 1); ++ ++ eq->eq_events[index] = *ev; ++ ++ if (eq->eq_callback != LNET_EQ_HANDLER_NONE) ++ eq->eq_callback(ev); ++ ++ /* Wake anyone waiting in LNetEQPoll() */ ++ if (waitqueue_active(&the_lnet.ln_eq_waitq)) ++ wake_up_all(&the_lnet.ln_eq_waitq); ++ lnet_eq_wait_unlock(); ++} ++ ++static int ++lnet_eq_dequeue_event(struct lnet_eq *eq, struct lnet_event *ev) ++{ ++ int new_index = eq->eq_deq_seq & (eq->eq_size - 1); ++ struct lnet_event *new_event = &eq->eq_events[new_index]; ++ int rc; ++ ++ /* must called with lnet_eq_wait_lock hold */ ++ if (LNET_SEQ_GT(eq->eq_deq_seq, new_event->sequence)) ++ return 0; ++ ++ /* We've got a new event... */ ++ *ev = *new_event; ++ ++ CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n", ++ new_event, eq->eq_deq_seq, eq->eq_size); ++ ++ /* ...but did it overwrite an event we've not seen yet? */ ++ if (eq->eq_deq_seq == new_event->sequence) { ++ rc = 1; ++ } else { ++ /* ++ * don't complain with CERROR: some EQs are sized small ++ * anyway; if it's important, the caller should complain ++ */ ++ CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n", ++ eq->eq_deq_seq, new_event->sequence); ++ rc = -EOVERFLOW; ++ } ++ ++ eq->eq_deq_seq = new_event->sequence + 1; ++ return rc; ++} ++ ++/** ++ * A nonblocking function that can be used to get the next event in an EQ. ++ * If an event handler is associated with the EQ, the handler will run before ++ * this function returns successfully. The event is removed from the queue. ++ * ++ * \param eventq A handle for the event queue. ++ * \param event On successful return (1 or -EOVERFLOW), this location will ++ * hold the next event in the EQ. ++ * ++ * \retval 0 No pending event in the EQ. ++ * \retval 1 Indicates success. ++ * \retval -ENOENT If \a eventq does not point to a valid EQ. 
++ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that ++ * at least one event between this event and the last event obtained from the ++ * EQ has been dropped due to limited space in the EQ. ++ */ ++ ++/** ++ * Block the calling process until there is an event in the EQ. ++ * If an event handler is associated with the EQ, the handler will run before ++ * this function returns successfully. This function returns the next event ++ * in the EQ and removes it from the EQ. ++ * ++ * \param eventq A handle for the event queue. ++ * \param event On successful return (1 or -EOVERFLOW), this location will ++ * hold the next event in the EQ. ++ * ++ * \retval 1 Indicates success. ++ * \retval -ENOENT If \a eventq does not point to a valid EQ. ++ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that ++ * at least one event between this event and the last event obtained from the ++ * EQ has been dropped due to limited space in the EQ. ++ */ ++ ++static int ++lnet_eq_wait_locked(int *timeout_ms, long state) ++__must_hold(&the_lnet.ln_eq_wait_lock) ++{ ++ int tms = *timeout_ms; ++ int wait; ++ wait_queue_entry_t wl; ++ unsigned long now; ++ ++ if (!tms) ++ return -ENXIO; /* don't want to wait and no new event */ ++ ++ init_waitqueue_entry(&wl, current); ++ set_current_state(state); ++ add_wait_queue(&the_lnet.ln_eq_waitq, &wl); ++ ++ lnet_eq_wait_unlock(); ++ ++ if (tms < 0) { ++ schedule(); ++ } else { ++ now = jiffies; ++ schedule_msec_hrtimeout((tms)); ++ tms -= jiffies_to_msecs(jiffies - now); ++ if (tms < 0) /* no more wait but may have new event */ ++ tms = 0; ++ } ++ ++ wait = tms; /* might need to call here again */ ++ *timeout_ms = tms; ++ ++ lnet_eq_wait_lock(); ++ remove_wait_queue(&the_lnet.ln_eq_waitq, &wl); ++ ++ return wait; ++} ++ ++/** ++ * Block the calling process until there's an event from a set of EQs or ++ * timeout happens. ++ * ++ * If an event handler is associated with the EQ, the handler will run before ++ * this function returns successfully, in which case the corresponding event ++ * is consumed. ++ * ++ * LNetEQPoll() provides a timeout to allow applications to poll, block for a ++ * fixed period, or block indefinitely. ++ * ++ * \param eventqs,neq An array of EQ handles, and size of the array. ++ * \param timeout_ms Time in milliseconds to wait for an event to occur on ++ * one of the EQs. The constant LNET_TIME_FOREVER can be used to indicate an ++ * infinite timeout. ++ * \param interruptible, if true, use TASK_INTERRUPTIBLE, else TASK_NOLOAD ++ * \param event,which On successful return (1 or -EOVERFLOW), \a event will ++ * hold the next event in the EQs, and \a which will contain the index of the ++ * EQ from which the event was taken. ++ * ++ * \retval 0 No pending event in the EQs after timeout. ++ * \retval 1 Indicates success. ++ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that ++ * at least one event between this event and the last event obtained from the ++ * EQ indicated by \a which has been dropped due to limited space in the EQ. ++ * \retval -ENOENT If there's an invalid handle in \a eventqs. 
++ */
++int
++LNetEQPoll(struct lnet_handle_eq *eventqs, int neq, int timeout_ms,
++	   int interruptible,
++	   struct lnet_event *event, int *which)
++{
++	int wait = 1;
++	int rc;
++	int i;
++
++	LASSERT(the_lnet.ln_refcount > 0);
++
++	if (neq < 1)
++		return -ENOENT;
++
++	lnet_eq_wait_lock();
++
++	for (;;) {
++		for (i = 0; i < neq; i++) {
++			struct lnet_eq *eq = lnet_handle2eq(&eventqs[i]);
++
++			if (!eq) {
++				lnet_eq_wait_unlock();
++				return -ENOENT;
++			}
++
++			rc = lnet_eq_dequeue_event(eq, event);
++			if (rc) {
++				lnet_eq_wait_unlock();
++				*which = i;
++				return rc;
++			}
++		}
++
++		if (!wait)
++			break;
++
++		/*
++		 * Return value of lnet_eq_wait_locked:
++		 * -1 : did nothing, and it is certain there is no new event
++		 *  1 : slept and waited for a new event
++		 *  0 : will not wait any longer, but a new event may have
++		 *      arrived, so dequeue must be called again
++		 */
++		wait = lnet_eq_wait_locked(&timeout_ms,
++					   interruptible ? TASK_INTERRUPTIBLE
++					   : TASK_NOLOAD);
++		if (wait < 0) /* no new event */
++			break;
++	}
++
++	lnet_eq_wait_unlock();
++	return 0;
++}
+diff --git a/drivers/staging/rts5208/rtsx.c b/drivers/staging/rts5208/rtsx.c
+index 69e6abe14abf..7d23e214ac21 100644
+--- a/drivers/staging/rts5208/rtsx.c
++++ b/drivers/staging/rts5208/rtsx.c
+@@ -507,7 +507,7 @@ static int rtsx_polling_thread(void *__dev)
+ 
+ 	for (;;) {
+ 		set_current_state(TASK_INTERRUPTIBLE);
+-		schedule_timeout(msecs_to_jiffies(POLLING_INTERVAL));
++		schedule_msec_hrtimeout((POLLING_INTERVAL));
+ 
+ 		/* lock the device pointers */
+ 		mutex_lock(&dev->dev_mutex);
+diff --git a/drivers/staging/speakup/speakup_acntpc.c b/drivers/staging/speakup/speakup_acntpc.c
+index 28519754b2f0..a96805bbec5c 100644
+--- a/drivers/staging/speakup/speakup_acntpc.c
++++ b/drivers/staging/speakup/speakup_acntpc.c
+@@ -198,7 +198,7 @@ static void do_catch_up(struct spk_synth *synth)
+ 		full_time_val = full_time->u.n.value;
+ 		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
+ 		if (synth_full()) {
+-			schedule_timeout(msecs_to_jiffies(full_time_val));
++			schedule_msec_hrtimeout((full_time_val));
+ 			continue;
+ 		}
+ 		set_current_state(TASK_RUNNING);
+@@ -226,7 +226,7 @@ static void do_catch_up(struct spk_synth *synth)
+ 		jiffy_delta_val = jiffy_delta->u.n.value;
+ 		delay_time_val = delay_time->u.n.value;
+ 		spin_unlock_irqrestore(&speakup_info.spinlock, flags);
+-		schedule_timeout(msecs_to_jiffies(delay_time_val));
++		schedule_msec_hrtimeout(delay_time_val);
+ 		jiff_max = jiffies + jiffy_delta_val;
+ 	}
+ }
+diff --git a/drivers/staging/speakup/speakup_apollo.c b/drivers/staging/speakup/speakup_apollo.c
+index 0877b4044c28..627102d048c1 100644
+--- a/drivers/staging/speakup/speakup_apollo.c
++++ b/drivers/staging/speakup/speakup_apollo.c
+@@ -165,7 +165,7 @@ static void do_catch_up(struct spk_synth *synth)
+ 		if (!synth->io_ops->synth_out(synth, ch)) {
+ 			synth->io_ops->tiocmset(0, UART_MCR_RTS);
+ 			synth->io_ops->tiocmset(UART_MCR_RTS, 0);
+-			schedule_timeout(msecs_to_jiffies(full_time_val));
++			schedule_msec_hrtimeout(full_time_val);
+ 			continue;
+ 		}
+ 		if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) {
+diff --git a/drivers/staging/speakup/speakup_decext.c b/drivers/staging/speakup/speakup_decext.c
+index 3741c0fcf5bb..bff857b4aa5f 100644
+--- a/drivers/staging/speakup/speakup_decext.c
++++ b/drivers/staging/speakup/speakup_decext.c
+@@ -176,7 +176,7 @@ static void do_catch_up(struct spk_synth *synth)
+ 		if (ch == '\n')
+ 			ch = 0x0D;
+ 		if (synth_full() || !synth->io_ops->synth_out(synth, ch)) {
+-			schedule_timeout(msecs_to_jiffies(delay_time_val));
++			
schedule_msec_hrtimeout(delay_time_val); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/speakup/speakup_decpc.c b/drivers/staging/speakup/speakup_decpc.c +index 6649309e0342..c60e4712d817 100644 +--- a/drivers/staging/speakup/speakup_decpc.c ++++ b/drivers/staging/speakup/speakup_decpc.c +@@ -394,7 +394,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (dt_sendchar(ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/speakup/speakup_dectlk.c b/drivers/staging/speakup/speakup_dectlk.c +index a144f28ee1a8..c34764fafe2b 100644 +--- a/drivers/staging/speakup/speakup_dectlk.c ++++ b/drivers/staging/speakup/speakup_dectlk.c +@@ -244,7 +244,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (synth_full_val || !synth->io_ops->synth_out(synth, ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/speakup/speakup_dtlk.c b/drivers/staging/speakup/speakup_dtlk.c +index dbebed0eeeec..6d83c13ca4a6 100644 +--- a/drivers/staging/speakup/speakup_dtlk.c ++++ b/drivers/staging/speakup/speakup_dtlk.c +@@ -211,7 +211,7 @@ static void do_catch_up(struct spk_synth *synth) + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -227,7 +227,7 @@ static void do_catch_up(struct spk_synth *synth) + delay_time_val = delay_time->u.n.value; + jiffy_delta_val = jiffy_delta->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff --git a/drivers/staging/speakup/speakup_keypc.c b/drivers/staging/speakup/speakup_keypc.c +index 3901734982a4..4e8a7a98b46d 100644 +--- a/drivers/staging/speakup/speakup_keypc.c ++++ b/drivers/staging/speakup/speakup_keypc.c +@@ -199,7 +199,7 @@ spin_lock_irqsave(&speakup_info.spinlock, flags); + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout((full_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -232,7 +232,7 @@ spin_lock_irqsave(&speakup_info.spinlock, flags); + jiffy_delta_val = jiffy_delta->u.n.value; + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + jiff_max = jiffies+jiffy_delta_val; + } + } +diff --git a/drivers/staging/speakup/synth.c b/drivers/staging/speakup/synth.c +index 25f259ee4ffc..b9721103e651 100644 +--- a/drivers/staging/speakup/synth.c ++++ b/drivers/staging/speakup/synth.c +@@ -93,12 +93,8 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (ch == '\n') + ch = synth->procspeech; +- if (unicode) +- ret = synth->io_ops->synth_out_unicode(synth, ch); +- else +- ret = synth->io_ops->synth_out(synth, ch); +- if (!ret) { +- 
schedule_timeout(msecs_to_jiffies(full_time_val)); ++ if (!synth->io_ops->synth_out(synth, ch)) { ++ schedule_msec_hrtimeout(full_time_val); + continue; + } + if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { +@@ -108,11 +104,9 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth->io_ops->synth_out(synth, synth->procspeech)) +- schedule_timeout( +- msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + else +- schedule_timeout( +- msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout(full_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c +index 3647b8f1ed28..9fb26ccc2b3b 100644 +--- a/drivers/staging/unisys/visornic/visornic_main.c ++++ b/drivers/staging/unisys/visornic/visornic_main.c +@@ -549,7 +549,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- wait += schedule_timeout(msecs_to_jiffies(10)); ++ wait += schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + } + +@@ -560,7 +560,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- schedule_timeout(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + if (atomic_read(&devdata->usage)) + break; +@@ -714,7 +714,7 @@ static int visornic_enable_with_timeout(struct net_device *netdev, + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- wait += schedule_timeout(msecs_to_jiffies(10)); ++ wait += schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + } + +diff --git a/drivers/video/fbdev/omap/hwa742.c b/drivers/video/fbdev/omap/hwa742.c +index 6199d4806193..7c7165f2dad4 100644 +--- a/drivers/video/fbdev/omap/hwa742.c ++++ b/drivers/video/fbdev/omap/hwa742.c +@@ -926,7 +926,7 @@ static void hwa742_resume(void) + if (hwa742_read_reg(HWA742_PLL_DIV_REG) & (1 << 7)) + break; + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(5)); ++ schedule_msec_hrtimeout((5)); + } + hwa742_set_update_mode(hwa742.update_mode_before_suspend); + } +diff --git a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c +index bbed039617a4..681ae041ea77 100644 +--- a/drivers/video/fbdev/pxafb.c ++++ b/drivers/video/fbdev/pxafb.c +@@ -1287,7 +1287,7 @@ static int pxafb_smart_thread(void *arg) + mutex_unlock(&fbi->ctrlr_lock); + + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(30)); ++ schedule_msec_hrtimeout((30)); + } + + pr_debug("%s(): task ending\n", __func__); +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index 2d9074295d7f..7df3e60e4e89 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -5905,7 +5905,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) + flush = BTRFS_RESERVE_FLUSH_LIMIT; + + if (btrfs_transaction_in_commit(fs_info)) +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + + if (delalloc_lock) +diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c +index ffca2abf13d0..89b2a7f7397e 100644 +--- 
a/fs/btrfs/inode-map.c ++++ b/fs/btrfs/inode-map.c +@@ -75,7 +75,7 @@ static int caching_kthread(void *data) + btrfs_release_path(path); + root->ino_cache_progress = last; + up_read(&fs_info->commit_root_sem); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + goto again; + } else + continue; +diff --git a/sound/usb/line6/pcm.c b/sound/usb/line6/pcm.c +index 72c6f8e82a7e..46d8c2a148ad 100644 +--- a/sound/usb/line6/pcm.c ++++ b/sound/usb/line6/pcm.c +@@ -131,7 +131,7 @@ static void line6_wait_clear_audio_urbs(struct snd_line6_pcm *line6pcm, + if (!alive) + break; + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } while (--timeout > 0); + if (alive) + dev_err(line6pcm->line6->ifcdev, +-- +2.17.1 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-0009-Replace-all-calls-to-schedule_timeout_interruptible-.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-0009-Replace-all-calls-to-schedule_timeout_interruptible-.patch new file mode 100644 index 00000000..0ddf57af --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-0009-Replace-all-calls-to-schedule_timeout_interruptible-.patch @@ -0,0 +1,311 @@ +From cd03bffabeee4f4c8438969b3b4d184d0d0bb81b Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Mon, 20 Feb 2017 13:30:07 +1100 +Subject: [PATCH 09/16] Replace all calls to schedule_timeout_interruptible of + potentially under 50ms to use schedule_msec_hrtimeout_interruptible. + +--- + drivers/hwmon/fam15h_power.c | 2 +- + drivers/iio/light/tsl2563.c | 6 +----- + drivers/media/i2c/msp3400-driver.c | 4 ++-- + drivers/media/pci/ivtv/ivtv-gpio.c | 6 +++--- + drivers/media/radio/radio-mr800.c | 2 +- + drivers/media/radio/radio-tea5777.c | 2 +- + drivers/media/radio/tea575x.c | 2 +- + drivers/parport/ieee1284.c | 2 +- + drivers/parport/ieee1284_ops.c | 2 +- + drivers/platform/x86/intel_ips.c | 8 ++++---- + net/core/pktgen.c | 2 +- + sound/soc/codecs/wm8350.c | 12 ++++++------ + sound/soc/codecs/wm8900.c | 2 +- + sound/soc/codecs/wm9713.c | 4 ++-- + 14 files changed, 26 insertions(+), 30 deletions(-) + +diff --git a/drivers/hwmon/fam15h_power.c b/drivers/hwmon/fam15h_power.c +index 9545a346044f..c24cf1302ec7 100644 +--- a/drivers/hwmon/fam15h_power.c ++++ b/drivers/hwmon/fam15h_power.c +@@ -237,7 +237,7 @@ static ssize_t power1_average_show(struct device *dev, + prev_ptsc[cu] = data->cpu_sw_pwr_ptsc[cu]; + } + +- leftover = schedule_timeout_interruptible(msecs_to_jiffies(data->power_period)); ++ leftover = schedule_msec_hrtimeout_interruptible((data->power_period)); + if (leftover) + return 0; + +diff --git a/drivers/iio/light/tsl2563.c b/drivers/iio/light/tsl2563.c +index 6bbb0b1e6032..f4b83648c405 100644 +--- a/drivers/iio/light/tsl2563.c ++++ b/drivers/iio/light/tsl2563.c +@@ -282,11 +282,7 @@ static void tsl2563_wait_adc(struct tsl2563_chip *chip) + default: + delay = 402; + } +- /* +- * TODO: Make sure that we wait at least required delay but why we +- * have to extend it one tick more? 
+- */ +- schedule_timeout_interruptible(msecs_to_jiffies(delay) + 2); ++ schedule_msec_hrtimeout_interruptible(delay + 1); + } + + static int tsl2563_adjust_gainlevel(struct tsl2563_chip *chip, u16 adc) +diff --git a/drivers/media/i2c/msp3400-driver.c b/drivers/media/i2c/msp3400-driver.c +index 3db966db83eb..f0fab7676f72 100644 +--- a/drivers/media/i2c/msp3400-driver.c ++++ b/drivers/media/i2c/msp3400-driver.c +@@ -179,7 +179,7 @@ static int msp_read(struct i2c_client *client, int dev, int addr) + break; + dev_warn(&client->dev, "I/O error #%d (read 0x%02x/0x%02x)\n", err, + dev, addr); +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + if (err == 3) { + dev_warn(&client->dev, "resetting chip, sound will go off.\n"); +@@ -220,7 +220,7 @@ static int msp_write(struct i2c_client *client, int dev, int addr, int val) + break; + dev_warn(&client->dev, "I/O error #%d (write 0x%02x/0x%02x)\n", err, + dev, addr); +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + if (err == 3) { + dev_warn(&client->dev, "resetting chip, sound will go off.\n"); +diff --git a/drivers/media/pci/ivtv/ivtv-gpio.c b/drivers/media/pci/ivtv/ivtv-gpio.c +index f752f3993687..23372af61ebf 100644 +--- a/drivers/media/pci/ivtv/ivtv-gpio.c ++++ b/drivers/media/pci/ivtv/ivtv-gpio.c +@@ -117,7 +117,7 @@ void ivtv_reset_ir_gpio(struct ivtv *itv) + curout = (curout & ~0xF) | 1; + write_reg(curout, IVTV_REG_GPIO_OUT); + /* We could use something else for smaller time */ +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + curout |= 2; + write_reg(curout, IVTV_REG_GPIO_OUT); + curdir &= ~0x80; +@@ -137,11 +137,11 @@ int ivtv_reset_tuner_gpio(void *dev, int component, int cmd, int value) + curout = read_reg(IVTV_REG_GPIO_OUT); + curout &= ~(1 << itv->card->xceive_pin); + write_reg(curout, IVTV_REG_GPIO_OUT); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + + curout |= 1 << itv->card->xceive_pin; + write_reg(curout, IVTV_REG_GPIO_OUT); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + return 0; + } + +diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c +index 0f292c6ba338..9d7f22fe1ca8 100644 +--- a/drivers/media/radio/radio-mr800.c ++++ b/drivers/media/radio/radio-mr800.c +@@ -378,7 +378,7 @@ static int vidioc_s_hw_freq_seek(struct file *file, void *priv, + retval = -ENODATA; + break; + } +- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { ++ if (schedule_msec_hrtimeout_interruptible((10))) { + retval = -ERESTARTSYS; + break; + } +diff --git a/drivers/media/radio/radio-tea5777.c b/drivers/media/radio/radio-tea5777.c +index 04ed1a5d1177..d593d28dc286 100644 +--- a/drivers/media/radio/radio-tea5777.c ++++ b/drivers/media/radio/radio-tea5777.c +@@ -245,7 +245,7 @@ static int radio_tea5777_update_read_reg(struct radio_tea5777 *tea, int wait) + } + + if (wait) { +- if (schedule_timeout_interruptible(msecs_to_jiffies(wait))) ++ if (schedule_msec_hrtimeout_interruptible((wait))) + return -ERESTARTSYS; + } + +diff --git a/drivers/media/radio/tea575x.c b/drivers/media/radio/tea575x.c +index 7412fe1b10c6..92dce75e6ce9 100644 +--- a/drivers/media/radio/tea575x.c ++++ b/drivers/media/radio/tea575x.c +@@ -416,7 +416,7 @@ int snd_tea575x_s_hw_freq_seek(struct file *file, struct snd_tea575x *tea, + for (;;) { + if 
(time_after(jiffies, timeout)) + break; +- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { ++ if (schedule_msec_hrtimeout_interruptible((10))) { + /* some signal arrived, stop search */ + tea->val &= ~TEA575X_BIT_SEARCH; + snd_tea575x_set_freq(tea); +diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c +index f12b9da69255..6ca6eecbdb2d 100644 +--- a/drivers/parport/ieee1284.c ++++ b/drivers/parport/ieee1284.c +@@ -208,7 +208,7 @@ int parport_wait_peripheral(struct parport *port, + /* parport_wait_event didn't time out, but the + * peripheral wasn't actually ready either. + * Wait for another 10ms. */ +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + } + +diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c +index 5d41dda6da4e..34705f6b423f 100644 +--- a/drivers/parport/ieee1284_ops.c ++++ b/drivers/parport/ieee1284_ops.c +@@ -537,7 +537,7 @@ size_t parport_ieee1284_ecp_read_data (struct parport *port, + /* Yield the port for a while. */ + if (count && dev->port->irq != PARPORT_IRQ_NONE) { + parport_release (dev); +- schedule_timeout_interruptible(msecs_to_jiffies(40)); ++ schedule_msec_hrtimeout_interruptible((40)); + parport_claim_or_block (dev); + } + else +diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c +index c5ece7ef08c6..9256fb502545 100644 +--- a/drivers/platform/x86/intel_ips.c ++++ b/drivers/platform/x86/intel_ips.c +@@ -809,7 +809,7 @@ static int ips_adjust(void *data) + ips_gpu_lower(ips); + + sleep: +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_ADJUST_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_ADJUST_PERIOD)); + } while (!kthread_should_stop()); + + dev_dbg(ips->dev, "ips-adjust thread stopped\n"); +@@ -985,7 +985,7 @@ static int ips_monitor(void *data) + seqno_timestamp = get_jiffies_64(); + + old_cpu_power = thm_readl(THM_CEC); +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + + /* Collect an initial average */ + for (i = 0; i < IPS_SAMPLE_COUNT; i++) { +@@ -1012,7 +1012,7 @@ static int ips_monitor(void *data) + mchp_samples[i] = mchp; + } + +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + if (kthread_should_stop()) + break; + } +@@ -1039,7 +1039,7 @@ static int ips_monitor(void *data) + * us to reduce the sample frequency if the CPU and GPU are idle. 
+ */ + old_cpu_power = thm_readl(THM_CEC); +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + last_sample_period = IPS_SAMPLE_PERIOD; + + timer_setup(&ips->timer, monitor_timeout, TIMER_DEFERRABLE); +diff --git a/net/core/pktgen.c b/net/core/pktgen.c +index 7f6938405fa1..369ad3eca2a3 100644 +--- a/net/core/pktgen.c ++++ b/net/core/pktgen.c +@@ -1900,7 +1900,7 @@ static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname) + mutex_unlock(&pktgen_thread_lock); + pr_debug("%s: waiting for %s to disappear....\n", + __func__, ifname); +- schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try)); ++ schedule_msec_hrtimeout_interruptible((msec_per_try)); + mutex_lock(&pktgen_thread_lock); + + if (++i >= max_tries) { +diff --git a/sound/soc/codecs/wm8350.c b/sound/soc/codecs/wm8350.c +index e92ebe52d485..88791ebb6df0 100644 +--- a/sound/soc/codecs/wm8350.c ++++ b/sound/soc/codecs/wm8350.c +@@ -236,10 +236,10 @@ static void wm8350_pga_work(struct work_struct *work) + out2->ramp == WM8350_RAMP_UP) { + /* delay is longer over 0dB as increases are larger */ + if (i >= WM8350_OUTn_0dB) +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (2)); + else +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (1)); + } else + udelay(50); /* doesn't matter if we delay longer */ +@@ -1123,7 +1123,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + (platform->dis_out4 << 6)); + + /* wait for discharge */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + cap_discharge_msecs)); + +@@ -1139,7 +1139,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + WM8350_VBUFEN); + + /* wait for vmid */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + vmid_charge_msecs)); + +@@ -1190,7 +1190,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1, pm1); + + /* wait */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + vmid_discharge_msecs)); + +@@ -1208,7 +1208,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + pm1 | WM8350_OUTPUT_DRAIN_EN); + + /* wait */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform->drain_msecs)); + + pm1 &= ~WM8350_BIASEN; +diff --git a/sound/soc/codecs/wm8900.c b/sound/soc/codecs/wm8900.c +index 1a14e902949d..68f17d9877ec 100644 +--- a/sound/soc/codecs/wm8900.c ++++ b/sound/soc/codecs/wm8900.c +@@ -1112,7 +1112,7 @@ static int wm8900_set_bias_level(struct snd_soc_component *component, + /* Need to let things settle before stopping the clock + * to ensure that restart works, see "Stopping the + * master clock" in the datasheet. */ +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible(1); + snd_soc_component_write(component, WM8900_REG_POWER2, + WM8900_REG_POWER2_SYSCLK_ENA); + break; +diff --git a/sound/soc/codecs/wm9713.c b/sound/soc/codecs/wm9713.c +index 643863bb32e0..fc318d71a8a3 100644 +--- a/sound/soc/codecs/wm9713.c ++++ b/sound/soc/codecs/wm9713.c +@@ -203,7 +203,7 @@ static int wm9713_voice_shutdown(struct snd_soc_dapm_widget *w, + + /* Gracefully shut down the voice interface. 
*/ + snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0200); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible(1); + snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0f00); + snd_soc_component_update_bits(component, AC97_EXTENDED_MID, 0x1000, 0x1000); + +@@ -872,7 +872,7 @@ static int wm9713_set_pll(struct snd_soc_component *component, + wm9713->pll_in = freq_in; + + /* wait 10ms AC97 link frames for the link to stabilise */ +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + return 0; + } + +-- +2.17.1 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-0010-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-0010-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch new file mode 100644 index 00000000..3eb7c889 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-0010-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch @@ -0,0 +1,160 @@ +From ba3f464ce9dd28a8999f56b327b458f869258a1a Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Mon, 20 Feb 2017 13:30:32 +1100 +Subject: [PATCH 10/16] Replace all calls to schedule_timeout_uninterruptible + of potentially under 50ms to use schedule_msec_hrtimeout_uninterruptible + +--- + drivers/media/pci/cx18/cx18-gpio.c | 4 ++-- + drivers/net/wireless/intel/ipw2x00/ipw2100.c | 4 ++-- + drivers/rtc/rtc-wm8350.c | 6 +++--- + drivers/scsi/lpfc/lpfc_scsi.c | 2 +- + sound/pci/maestro3.c | 4 ++-- + sound/soc/codecs/rt5631.c | 4 ++-- + sound/soc/soc-dapm.c | 2 +- + 7 files changed, 13 insertions(+), 13 deletions(-) + +diff --git a/drivers/media/pci/cx18/cx18-gpio.c b/drivers/media/pci/cx18/cx18-gpio.c +index 012859e6dc7b..206bd08265a5 100644 +--- a/drivers/media/pci/cx18/cx18-gpio.c ++++ b/drivers/media/pci/cx18/cx18-gpio.c +@@ -90,11 +90,11 @@ static void gpio_reset_seq(struct cx18 *cx, u32 active_lo, u32 active_hi, + + /* Assert */ + gpio_update(cx, mask, ~active_lo); +- schedule_timeout_uninterruptible(msecs_to_jiffies(assert_msecs)); ++ schedule_msec_hrtimeout_uninterruptible((assert_msecs)); + + /* Deassert */ + gpio_update(cx, mask, ~active_hi); +- schedule_timeout_uninterruptible(msecs_to_jiffies(recovery_msecs)); ++ schedule_msec_hrtimeout_uninterruptible((recovery_msecs)); + } + + /* +diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c +index 910db46db6a1..497b01ab32d4 100644 +--- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c ++++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c +@@ -830,7 +830,7 @@ static int ipw2100_hw_send_command(struct ipw2100_priv *priv, + * doesn't seem to have as many firmware restart cycles... + * + * As a test, we're sticking in a 1/100s delay here */ +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + + return 0; + +@@ -1281,7 +1281,7 @@ static int ipw2100_start_adapter(struct ipw2100_priv *priv) + IPW_DEBUG_FW("Waiting for f/w initialization to complete...\n"); + i = 5000; + do { +- schedule_timeout_uninterruptible(msecs_to_jiffies(40)); ++ schedule_msec_hrtimeout_uninterruptible((40)); + /* Todo... wait for sync command ... 
*/ + + read_register(priv->net_dev, IPW_REG_INTA, &inta); +diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c +index 483c7993516b..fddbaa475066 100644 +--- a/drivers/rtc/rtc-wm8350.c ++++ b/drivers/rtc/rtc-wm8350.c +@@ -119,7 +119,7 @@ static int wm8350_rtc_settime(struct device *dev, struct rtc_time *tm) + /* Wait until confirmation of stopping */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (--retries && !(rtc_ctrl & WM8350_RTC_STS)); + + if (!retries) { +@@ -202,7 +202,7 @@ static int wm8350_rtc_stop_alarm(struct wm8350 *wm8350) + /* Wait until confirmation of stopping */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (retries-- && !(rtc_ctrl & WM8350_RTC_ALMSTS)); + + if (!(rtc_ctrl & WM8350_RTC_ALMSTS)) +@@ -225,7 +225,7 @@ static int wm8350_rtc_start_alarm(struct wm8350 *wm8350) + /* Wait until confirmation */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (retries-- && rtc_ctrl & WM8350_RTC_ALMSTS); + + if (rtc_ctrl & WM8350_RTC_ALMSTS) +diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c +index 5c7858e735c9..b56a01420918 100644 +--- a/drivers/scsi/lpfc/lpfc_scsi.c ++++ b/drivers/scsi/lpfc/lpfc_scsi.c +@@ -5201,7 +5201,7 @@ lpfc_reset_flush_io_context(struct lpfc_vport *vport, uint16_t tgt_id, + tgt_id, lun_id, context); + later = msecs_to_jiffies(2 * vport->cfg_devloss_tmo * 1000) + jiffies; + while (time_after(later, jiffies) && cnt) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(20)); ++ schedule_msec_hrtimeout_uninterruptible((20)); + cnt = lpfc_sli_sum_iocb(vport, tgt_id, lun_id, context); + } + if (cnt) { +diff --git a/sound/pci/maestro3.c b/sound/pci/maestro3.c +index 62962178a9d7..87e486740da0 100644 +--- a/sound/pci/maestro3.c ++++ b/sound/pci/maestro3.c +@@ -2016,7 +2016,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) + outw(0, io + GPIO_DATA); + outw(dir | GPO_PRIMARY_AC97, io + GPIO_DIRECTION); + +- schedule_timeout_uninterruptible(msecs_to_jiffies(delay1)); ++ schedule_msec_hrtimeout_uninterruptible((delay1)); + + outw(GPO_PRIMARY_AC97, io + GPIO_DATA); + udelay(5); +@@ -2024,7 +2024,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) + outw(IO_SRAM_ENABLE | SERIAL_AC_LINK_ENABLE, io + RING_BUS_CTRL_A); + outw(~0, io + GPIO_MASK); + +- schedule_timeout_uninterruptible(msecs_to_jiffies(delay2)); ++ schedule_msec_hrtimeout_uninterruptible((delay2)); + + if (! 
snd_m3_try_read_vendor(chip)) + break; +diff --git a/sound/soc/codecs/rt5631.c b/sound/soc/codecs/rt5631.c +index 865f49ac38dd..3c1190dd114f 100644 +--- a/sound/soc/codecs/rt5631.c ++++ b/sound/soc/codecs/rt5631.c +@@ -419,7 +419,7 @@ static void onebit_depop_mute_stage(struct snd_soc_component *component, int ena + hp_zc = snd_soc_component_read32(component, RT5631_INT_ST_IRQ_CTRL_2); + snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); + if (enable) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + /* config one-bit depop parameter */ + rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x307f); + snd_soc_component_update_bits(component, RT5631_HP_OUT_VOL, +@@ -529,7 +529,7 @@ static void depop_seq_mute_stage(struct snd_soc_component *component, int enable + hp_zc = snd_soc_component_read32(component, RT5631_INT_ST_IRQ_CTRL_2); + snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); + if (enable) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + + /* config depop sequence parameter */ + rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x302f); +diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c +index 461d951917c0..b5eb57ba4af4 100644 +--- a/sound/soc/soc-dapm.c ++++ b/sound/soc/soc-dapm.c +@@ -131,7 +131,7 @@ static void dapm_assert_locked(struct snd_soc_dapm_context *dapm) + static void pop_wait(u32 pop_time) + { + if (pop_time) +- schedule_timeout_uninterruptible(msecs_to_jiffies(pop_time)); ++ schedule_msec_hrtimeout_uninterruptible((pop_time)); + } + + static void pop_dbg(struct device *dev, u32 pop_time, const char *fmt, ...) +-- +2.17.1 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-0011-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-0011-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch new file mode 100644 index 00000000..680e5fcd --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-0011-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch @@ -0,0 +1,69 @@ +From befdee72d814b6c302da85af524b15762e72e0cf Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Mon, 20 Feb 2017 13:32:58 +1100 +Subject: [PATCH 11/16] Don't use hrtimer overlay when pm_freezing since some + drivers still don't correctly use freezable timeouts. + +--- + kernel/time/hrtimer.c | 2 +- + kernel/time/timer.c | 9 +++++---- + 2 files changed, 6 insertions(+), 5 deletions(-) + +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index 12735724cce4..32fb7b6d9568 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -2048,7 +2048,7 @@ long __sched schedule_msec_hrtimeout(long timeout) + * (yet) better than Hz, as would occur during startup, use regular + * timers. + */ +- if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ) ++ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ || pm_freezing) + return schedule_timeout(jiffs); + + secs = timeout / 1000; +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index 542c13d98950..d5d0fa004d2b 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -44,6 +44,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -1972,12 +1973,12 @@ void msleep(unsigned int msecs) + * Use high resolution timers where the resolution of tick based + * timers is inadequate. 
+ */ +- if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ) { ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { + while (msecs) + msecs = schedule_msec_hrtimeout_uninterruptible(msecs); + return; + } +- timeout = msecs_to_jiffies(msecs) + 1; ++ timeout = jiffs + 1; + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +@@ -1994,12 +1995,12 @@ unsigned long msleep_interruptible(unsigned int msecs) + int jiffs = msecs_to_jiffies(msecs); + unsigned long timeout; + +- if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ) { ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { + while (msecs && !signal_pending(current)) + msecs = schedule_msec_hrtimeout_interruptible(msecs); + return msecs; + } +- timeout = msecs_to_jiffies(msecs) + 1; ++ timeout = jiffs + 1; + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); +-- +2.17.1 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-0012-Make-threaded-IRQs-optionally-the-default-which-can-.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-0012-Make-threaded-IRQs-optionally-the-default-which-can-.patch new file mode 100644 index 00000000..078a4ba0 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-0012-Make-threaded-IRQs-optionally-the-default-which-can-.patch @@ -0,0 +1,67 @@ +From df4136f6de5b3f45c2f4be7a3cc042903e983e0c Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Wed, 7 Dec 2016 21:13:16 +1100 +Subject: [PATCH 13/16] Make threaded IRQs optionally the default which can be + disabled. + +--- + kernel/irq/Kconfig | 17 +++++++++++++++++ + kernel/irq/manage.c | 11 +++++++++++ + 2 files changed, 28 insertions(+) + +diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig +index 5f3e2baefca9..de3e5740679b 100644 +--- a/kernel/irq/Kconfig ++++ b/kernel/irq/Kconfig +@@ -107,6 +107,23 @@ config GENERIC_IRQ_RESERVATION_MODE + config IRQ_FORCED_THREADING + bool + ++config FORCE_IRQ_THREADING ++ bool "Make IRQ threading compulsory" ++ depends on IRQ_FORCED_THREADING ++ default n ++ ---help--- ++ ++ Make IRQ threading mandatory for any IRQ handlers that support it ++ instead of being optional and requiring the threadirqs kernel ++ parameter. Instead they can be optionally disabled with the ++ nothreadirqs kernel parameter. ++ ++ Enabling this may make some architectures not boot with runqueue ++ sharing and MuQSS. ++ ++ Enable if you are building for a desktop or low latency system, ++ otherwise say N. 
++ + config SPARSE_IRQ + bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ + ---help--- +diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c +index fb86146037a7..b322b1a0caa0 100644 +--- a/kernel/irq/manage.c ++++ b/kernel/irq/manage.c +@@ -23,9 +23,20 @@ + #include "internals.h" + + #ifdef CONFIG_IRQ_FORCED_THREADING ++#ifdef CONFIG_FORCE_IRQ_THREADING ++__read_mostly bool force_irqthreads = true; ++#else + __read_mostly bool force_irqthreads; ++#endif + EXPORT_SYMBOL_GPL(force_irqthreads); + ++static int __init setup_noforced_irqthreads(char *arg) ++{ ++ force_irqthreads = false; ++ return 0; ++} ++early_param("nothreadirqs", setup_noforced_irqthreads); ++ + static int __init setup_forced_irqthreads(char *arg) + { + force_irqthreads = true; +-- +2.17.1 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-0013-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-0013-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch new file mode 100644 index 00000000..33a98653 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-0013-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch @@ -0,0 +1,81 @@ +From ace3d66508ad4e17da3f579eaf04c5582b8256a2 Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Wed, 7 Dec 2016 21:23:01 +1100 +Subject: [PATCH 14/16] Reinstate default Hz of 100 in combination with MuQSS + and -ck patches. + +--- + kernel/Kconfig.hz | 25 ++++++++++++++++++------- + 1 file changed, 18 insertions(+), 7 deletions(-) + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1806fcac8f14 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,8 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_100 if SCHED_MUQSS ++ default HZ_250_NODEF if !SCHED_MUQSS + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -19,11 +20,18 @@ choice + config HZ_100 + bool "100 HZ" + help ++ 100 Hz is a suitable choice in combination with MuQSS which does ++ not rely on ticks for rescheduling interrupts, and is not Hz limited ++ for timeouts and sleeps from both the kernel and userspace. ++ This allows us to benefit from the lower overhead and higher ++ throughput of fewer timer ticks. ++ ++ Non-MuQSS kernels: + 100 Hz is a typical choice for servers, SMP and NUMA systems + with lots of processors that may show reduced performance if + too many timer interrupts are occurring. + +- config HZ_250 ++ config HZ_250_NODEF + bool "250 HZ" + help + 250 Hz is a good compromise choice allowing server performance +@@ -31,7 +39,10 @@ choice + on SMP and NUMA systems. If you are going to be using NTSC video + or multimedia, selected 300Hz instead. + +- config HZ_300 ++ 250 Hz is the default choice for the mainline scheduler but not ++ advantageous in combination with MuQSS. ++ ++ config HZ_300_NODEF + bool "300 HZ" + help + 300 Hz is a good compromise choice allowing server performance +@@ -39,7 +50,7 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. 
+ +- config HZ_1000 ++ config HZ_1000_NODEF + bool "1000 HZ" + help + 1000 Hz is the preferred choice for desktop systems and other +@@ -50,9 +61,9 @@ endchoice + config HZ + int + default 100 if HZ_100 +- default 250 if HZ_250 +- default 300 if HZ_300 +- default 1000 if HZ_1000 ++ default 250 if HZ_250_NODEF ++ default 300 if HZ_300_NODEF ++ default 1000 if HZ_1000_NODEF + + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS +-- +2.17.1 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-0014-Swap-sucks.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-0014-Swap-sucks.patch new file mode 100644 index 00000000..bd68ebec --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-0014-Swap-sucks.patch @@ -0,0 +1,25 @@ +From d8f6f203f5bdabdf1a5ddb6bdc9e13fae2b640b9 Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Sat, 12 Aug 2017 12:02:04 +1000 +Subject: [PATCH 15/16] Swap sucks. + +--- + mm/vmscan.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index c5ef7240cbcb..3f04308b6445 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -159,7 +159,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 33; + /* + * The total number of pages which are beyond the high watermark within all + * zones. +-- +2.17.1 + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-0015-unfuck-MuQSS-on-linux-4_19_10+.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-0015-unfuck-MuQSS-on-linux-4_19_10+.patch new file mode 100644 index 00000000..fbff2e6e --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-0015-unfuck-MuQSS-on-linux-4_19_10+.patch @@ -0,0 +1,14 @@ +diff -Nur a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +--- a/kernel/sched/MuQSS.c 2019-02-09 19:46:07.899912055 +0000 ++++ b/kernel/sched/MuQSS.c 2019-02-09 19:48:03.743622465 +0000 +@@ -1011,6 +1011,10 @@ + #define CPUIDLE_THREAD_BUSY (16) + #define CPUIDLE_DIFF_NODE (32) + ++#ifdef CONFIG_SCHED_SMT ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++#endif ++ + /* + * The best idle CPU is chosen according to the CPUIDLE ranking above where the + * lowest value would give the most suitable CPU to schedule p onto next. 
The diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-ata-fix-NCQ-LOG-strings-and-move-to-debug.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-ata-fix-NCQ-LOG-strings-and-move-to-debug.patch new file mode 100644 index 00000000..344a8c4b --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-ata-fix-NCQ-LOG-strings-and-move-to-debug.patch @@ -0,0 +1,23 @@ +diff -Naur linux-4.16.5/drivers/ata/libata-core.c linux-4.16.5-p/drivers/ata/libata-core.c +--- linux-4.16.5/drivers/ata/libata-core.c 2018-04-26 11:00:39.000000000 +0200 ++++ linux-4.16.5-p/drivers/ata/libata-core.c 2018-04-28 02:19:06.632381413 +0200 +@@ -2201,7 +2201,7 @@ + unsigned int err_mask; + + if (!ata_log_supported(dev, ATA_LOG_NCQ_SEND_RECV)) { +- ata_dev_warn(dev, "NCQ Send/Recv Log not supported\n"); ++ ata_dev_dbg(dev, "NCQ Send/Recv Log not supported\n"); + return; + } + err_mask = ata_read_log_page(dev, ATA_LOG_NCQ_SEND_RECV, +@@ -2230,8 +2230,8 @@ + unsigned int err_mask; + + if (!ata_log_supported(dev, ATA_LOG_NCQ_NON_DATA)) { +- ata_dev_warn(dev, +- "NCQ Send/Recv Log not supported\n"); ++ ata_dev_dbg(dev, ++ "NCQ Non-Data Log not supported\n"); + return; + } + err_mask = ata_read_log_page(dev, ATA_LOG_NCQ_NON_DATA, diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-ath10k-drop-WARN_ON-added-in-cd93b83ad927b2c7979e0add0343ace59328b461.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-ath10k-drop-WARN_ON-added-in-cd93b83ad927b2c7979e0add0343ace59328b461.patch new file mode 100644 index 00000000..4e810797 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-ath10k-drop-WARN_ON-added-in-cd93b83ad927b2c7979e0add0343ace59328b461.patch @@ -0,0 +1,74 @@ + +Triggers on resume .. + +[ 487.909349] WARNING: CPU: 0 PID: 2125 at drivers/net/wireless/ath/ath10k/mac.c:5625 ath10k_bss_info_changed+0xb33/0xd90 [ath10k_core] +[ 487.909350] Modules linked in: ctr ccm cmac af_packet arc4 xt_tcpudp xt_state xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 libcrc32c bnep iptable_filter ip_tables x_tables bpfilter nls_utf8 nls_cp437 vfat fat amdgpu chash gpu_sched joydev uvcvideo iTCO_wdt snd_hda_codec_hdmi iTCO_vendor_support videobuf2_vmalloc videobuf2_memops videobuf2_v4l2 snd_hda_codec_conexant intel_wmi_thunderbolt videobuf2_common btusb snd_hda_codec_generic radeon btrtl btbcm btintel ath10k_pci videodev bluetooth ath10k_core media i915 ttm ecdh_generic coretemp ath intel_rapl x86_pkg_temp_thermal rtsx_usb_ms intel_powerclamp mac80211 memstick kvm_intel kvmgt vfio_mdev mdev vfio_iommu_type1 vfio kvm snd_soc_skl snd_soc_skl_ipc snd_soc_sst_ipc snd_soc_sst_dsp snd_hda_ext_core snd_soc_acpi_intel_match snd_soc_acpi +[ 487.909390] cec snd_soc_core rc_core drm_kms_helper efi_pstore irqbypass cfg80211 snd_compress drm snd_pcm_dmaengine psmouse intel_gtt evdev snd_hda_intel mac_hid agpgart i2c_algo_bit snd_hda_codec r8169 pcspkr efivars fb_sys_fops syscopyarea sysfillrect sysimgblt i2c_i801 snd_hda_core hwmon i2c_core libphy snd_hwdep ideapad_laptop sparse_keymap intel_pch_thermal rfkill thermal wmi battery acpi_pad ac video pcc_cpufreq button ppdev sch_fq_codel fuse snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi_event snd_seq snd_seq_device snd_timer snd soundcore lp parport_pc parport binfmt_misc sg ext4 crc32c_generic crc16 mbcache jbd2 fscrypto rtsx_usb_sdmmc mmc_core rtsx_usb sr_mod sd_mod cdrom crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel pcbc xhci_pci ahci xhci_hcd +[ 487.909436] libahci libata aesni_intel aes_x86_64 
crypto_simd usbcore cryptd scsi_mod glue_helper serio_raw dm_mirror dm_region_hash dm_log dm_mod efivarfs unix sha256_mb sha256_ssse3 sha256_generic sha1_mb mcryptd sha1_ssse3 sha1_generic hmac ipv6 autofs4 +[ 487.909454] CPU: 0 PID: 2125 Comm: kworker/u8:21 Not tainted 4.19.2-fw1 #1 +[ 487.909455] Hardware name: LENOVO 80UD/LNVNB161216, BIOS 1TCN26WW(V2.07) 03/29/2018 +[ 487.909459] Workqueue: events_unbound async_run_entry_fn +[ 487.909466] RIP: 0010:ath10k_bss_info_changed+0xb33/0xd90 [ath10k_core] +[ 487.909468] Code: ff ff b8 a1 ff ff ff e9 e9 f7 ff ff b8 a1 ff ff ff e9 a5 f6 ff ff b8 a1 ff ff ff e9 ef f5 ff ff b8 a1 ff ff ff e9 4f f7 ff ff <0f> 0b e9 37 f8 ff ff b8 a1 ff ff ff e9 f6 fa ff ff b8 a1 ff ff ff +[ 487.909469] RSP: 0018:ffff9b9042e47cd8 EFLAGS: 00010282 +[ 487.909471] RAX: 00000000fffffffe RBX: ffff904af2fb1540 RCX: 0000000000000000 +[ 487.909472] RDX: ffff904af2fb18b8 RSI: ffff9b9042e47cf8 RDI: ffff904af4181380 +[ 487.909473] RBP: ffff904af4181380 R08: 0000000000200000 R09: 0000000000000000 +[ 487.909474] R10: 000000000000001f R11: ffff904ae2b4a600 R12: 0000000002000000 +[ 487.909475] R13: ffff904af4181388 R14: ffff904af2fb24d0 R15: ffff904af2fb1540 +[ 487.909477] FS: 0000000000000000(0000) GS:ffff904af7200000(0000) knlGS:0000000000000000 +[ 487.909478] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 487.909479] CR2: 00007fa6a8300510 CR3: 000000002000a002 CR4: 00000000003606f0 +[ 487.909480] Call Trace: +[ 487.909488] ? ath10k_conf_tx+0x12d/0x4c0 [ath10k_core] +[ 487.909503] ieee80211_bss_info_change_notify+0xa9/0x1c0 [mac80211] +[ 487.909521] ieee80211_reconfig+0x9d7/0x14f0 [mac80211] +[ 487.909538] wiphy_resume+0x7e/0x150 [cfg80211] +[ 487.909549] ? wiphy_namespace+0x10/0x10 [cfg80211] +[ 487.909553] dpm_run_callback+0x2e/0x130 +[ 487.909556] device_resume+0x97/0x190 +[ 487.909558] async_resume+0x19/0x40 +[ 487.909561] async_run_entry_fn+0x37/0xe0 +[ 487.909563] process_one_work+0x1e9/0x410 +[ 487.909565] worker_thread+0x2d/0x3d0 +[ 487.909567] ? process_one_work+0x410/0x410 +[ 487.909569] kthread+0x113/0x130 +[ 487.909572] ? kthread_park+0x90/0x90 +[ 487.909575] ret_from_fork+0x35/0x40 +[ 487.909577] ---[ end trace 56a3ea97193bc4c5 ]--- + +Introduced by: + +..... + +commit cd93b83ad927b2c7979e0add0343ace59328b461 +Author: Pradeep Kumar Chitrapu +Date: Wed Jul 25 10:59:39 2018 +0300 + + ath10k: support for multicast rate control + + Issues a wmi command to firmware when multicast rate change is received with the + new BSS_CHANGED_MCAST_RATE flag. Also fixes the incorrect fixed_rate setting + for CCK rates which got introduced with addition of ath10k_rates_rev2 enum. + + Tested on QCA9984 with firmware ver 10.4-3.6-00104 + + Signed-off-by: Pradeep Kumar Chitrapu + Signed-off-by: Kalle Valo + +..... 
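+
+Editor's note, a minimal sketch of the fix pattern (the exact context is in
+the hunk below; the elided bodies are placeholders, not the real code):
+rather than wrapping the channel-context lookup in WARN_ON(), which emits the
+trace above whenever no channel context is bound yet (e.g. during resume),
+the return value of ath10k_mac_vif_chan() is checked directly, so the
+multicast-rate update is skipped quietly:
+
+	/* before: warns on resume, when no chanctx is assigned yet */
+	if (changed & BSS_CHANGED_MCAST_RATE &&
+	    !WARN_ON(ath10k_mac_vif_chan(arvif->vif, &def))) {
+		/* ... set mcast rate ... */
+	}
+
+	/* after: silently skip the update if there is no chanctx */
+	if (changed & BSS_CHANGED_MCAST_RATE &&
+	    !ath10k_mac_vif_chan(arvif->vif, &def)) {
+		/* ... set mcast rate ... */
+	}
+
+Behavior is unchanged in both branches (ath10k_mac_vif_chan() returns 0 on
+success, so the update still runs only when a channel context exists); only
+the spurious warning is dropped.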
+ + +diff -Naur linux-4.19.2/drivers/net/wireless/ath/ath10k/mac.c linux-4.19.2-p/drivers/net/wireless/ath/ath10k/mac.c +--- linux-4.19.2/drivers/net/wireless/ath/ath10k/mac.c 2018-11-13 20:09:00.000000000 +0100 ++++ linux-4.19.2-p/drivers/net/wireless/ath/ath10k/mac.c 2018-11-15 01:41:51.896601274 +0100 +@@ -5621,8 +5621,7 @@ + arvif->vdev_id, ret); + } + +- if (changed & BSS_CHANGED_MCAST_RATE && +- !WARN_ON(ath10k_mac_vif_chan(arvif->vif, &def))) { ++ if (changed & BSS_CHANGED_MCAST_RATE && !ath10k_mac_vif_chan(arvif->vif, &def)) { + band = def.chan->band; + rateidx = vif->bss_conf.mcast_rate[band] - 1; + diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-bfq-sq-mq-v9r1-2K190204-rc1.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-bfq-sq-mq-v9r1-2K190204-rc1.patch new file mode 100644 index 00000000..039c8fcd --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-bfq-sq-mq-v9r1-2K190204-rc1.patch @@ -0,0 +1,18511 @@ +diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt +index 8d8d8f06cab2..41d0200944f1 100644 +--- a/Documentation/block/bfq-iosched.txt ++++ b/Documentation/block/bfq-iosched.txt +@@ -1,3 +1,6 @@ ++[ THIS TREE CONTAINS ALSO THE DEV VERSION OF BFQ. ++ DETAILS AT THE END OF THIS DOCUMENT. ] ++ + BFQ (Budget Fair Queueing) + ========================== + +@@ -11,6 +14,15 @@ controllers), BFQ's main features are: + groups (switching back to time distribution when needed to keep + throughput high). + ++If bfq-mq patches have been applied, then the following three ++instances of BFQ are available (otherwise only the first instance): ++- bfq: mainline version of BFQ, for blk-mq ++- bfq-mq: development version of BFQ for blk-mq; this version contains ++ also all latest features and fixes not yet landed in mainline, plus many ++ safety checks ++- bfq-sq: BFQ for legacy blk; also this version contains latest features ++ and fixes, as well as safety checks ++ + In its default configuration, BFQ privileges latency over + throughput. So, when needed for achieving a lower latency, BFQ builds + schedules that may lead to a lower throughput. If your main or only +@@ -22,27 +34,42 @@ latency and throughput, or on how to maximize throughput. + + BFQ has a non-null overhead, which limits the maximum IOPS that a CPU + can process for a device scheduled with BFQ. To give an idea of the +-limits on slow or average CPUs, here are, first, the limits of BFQ for +-three different CPUs, on, respectively, an average laptop, an old +-desktop, and a cheap embedded system, in case full hierarchical +-support is enabled (i.e., CONFIG_BFQ_GROUP_IOSCHED is set), but ++limits on slow or average CPUs, here are, first, the limits of bfq-mq ++and bfq for three different CPUs, on, respectively, an average laptop, ++an old desktop, and a cheap embedded system, in case full hierarchical ++support is enabled (i.e., CONFIG_MQ_BFQ_GROUP_IOSCHED is set for ++bfq-mq, or CONFIG_BFQ_GROUP_IOSCHED is set for bfq), but + CONFIG_DEBUG_BLK_CGROUP is not set (Section 4-2): + - Intel i7-4850HQ: 400 KIOPS + - AMD A8-3850: 250 KIOPS + - ARM CortexTM-A53 Octa-core: 80 KIOPS + +-If CONFIG_DEBUG_BLK_CGROUP is set (and of course full hierarchical +-support is enabled), then the sustainable throughput with BFQ +-decreases, because all blkio.bfq* statistics are created and updated +-(Section 4-2). 
For BFQ, this leads to the following maximum +-sustainable throughputs, on the same systems as above: ++As for bfq-sq, it cannot reach the above IOPS, because of the ++inherent, lower parallelism of legacy blk and of the components within ++it (including bfq-sq itself). In particular, results with ++CONFIG_DEBUG_BLK_CGROUP unset are rather fluctuating. The limits ++reported below for the case CONFIG_DEBUG_BLK_CGROUP set will however ++provide a lower bound to the limits of bfq-sq. ++ ++Turning back to bfq-mq and bfq, If CONFIG_DEBUG_BLK_CGROUP is set (and ++of course full hierarchical support is enabled), then the sustainable ++throughput with bfq-mq and bfq decreases, because all blkio.bfq* ++statistics are created and updated (Section 4-2). For bfq-mq and bfq, ++this leads to the following maximum sustainable throughputs, on the ++same systems as above: + - Intel i7-4850HQ: 310 KIOPS + - AMD A8-3850: 200 KIOPS + - ARM CortexTM-A53 Octa-core: 56 KIOPS + +-BFQ works for multi-queue devices too. ++Finally, if CONFIG_DEBUG_BLK_CGROUP is set (and full hierarchical ++support is enabled), then bfq-sq exhibits the following limits: ++- Intel i7-4850HQ: 250 KIOPS ++- AMD A8-3850: 170 KIOPS ++- ARM CortexTM-A53 Octa-core: 45 KIOPS + +-The table of contents follow. Impatients can just jump to Section 3. ++BFQ works for multi-queue devices too (bfq and bfq-mq instances). ++ ++The table of contents follows. Impatients can just jump to Section 3. + + CONTENTS + +@@ -509,25 +536,27 @@ To get proportional sharing of bandwidth with BFQ for a given device, + BFQ must of course be the active scheduler for that device. + + Within each group directory, the names of the files associated with +-BFQ-specific cgroup parameters and stats begin with the "bfq." +-prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for +-BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group +-parameter to set the weight of a group with BFQ is blkio.bfq.weight ++BFQ-specific cgroup parameters and stats begin with the "bfq.", ++"bfq-sq." or "bfq-mq." prefix, depending on which instance of bfq you ++want to use. So, with cgroups-v1 or cgroups-v2, the full prefix for ++BFQ-specific files is "blkio.bfqX." or "io.bfqX.", where X can be "" ++(i.e., null string), "-sq" or "-mq". For example, the group parameter ++to set the weight of a group with the mainline BFQ is blkio.bfq.weight + or io.bfq.weight. + + As for cgroups-v1 (blkio controller), the exact set of stat files +-created, and kept up-to-date by bfq, depends on whether +-CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq creates all ++created, and kept up-to-date by bfq*, depends on whether ++CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq* creates all + the stat files documented in + Documentation/cgroup-v1/blkio-controller.txt. If, instead, +-CONFIG_DEBUG_BLK_CGROUP is not set, then bfq creates only the files +-blkio.bfq.io_service_bytes +-blkio.bfq.io_service_bytes_recursive +-blkio.bfq.io_serviced +-blkio.bfq.io_serviced_recursive ++CONFIG_DEBUG_BLK_CGROUP is not set, then bfq* creates only the files ++blkio.bfq*.io_service_bytes ++blkio.bfq*.io_service_bytes_recursive ++blkio.bfq*.io_serviced ++blkio.bfq*.io_serviced_recursive + + The value of CONFIG_DEBUG_BLK_CGROUP greatly influences the maximum +-throughput sustainable with bfq, because updating the blkio.bfq.* ++throughput sustainable with bfq*, because updating the blkio.bfq* + stats is rather costly, especially for some of the stats enabled by + CONFIG_DEBUG_BLK_CGROUP. 
+ +@@ -536,7 +565,7 @@ Parameters to set + + For each group, there is only the following parameter to set. + +-weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the ++weight (namely blkio.bfqX.weight or io.bfqX.weight): the weight of the + group inside its parent. Available values: 1..10000 (default 100). The + linear mapping between ioprio and weights, described at the beginning + of the tunable section, is still valid, but all weights higher than +@@ -559,3 +588,55 @@ applications. Unset this tunable if you need/want to control weights. + Slightly extended version: + http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite- + results.pdf ++ ++---------------------------------------------------------------------- ++ ++DETAILS ON THE DEV VERSIONS IN THIS TREE ++ ++The dev version of BFQ is available for both the legacy and the ++multi-queue block layers, as two additional I/O schedulers, named, ++respectively, bfq-sq-iosched and bfq-mq-iosched (the latter is ++available if also the changes introducing bfq-mq-iosched have been ++applied). In particular, this tree contains the dev version of bfq for ++Linux mainline 4.19.0, and has been obtained from the dev version for ++Linux 4.18.0. Rebasing from 4.18 to 4.19 involved two manual ++interventions. ++ ++First, some conflicts had to be resolved, as follows: ++ ++--------------------------------------------------------------- ++ ++diff --cc Makefile ++index 7727c1bf6fa5,69fa5c0310d8..c7cbdf0ad594 ++--- a/Makefile +++++ b/Makefile ++@@@ -1,9 -1,9 +1,9 @@@ ++ # SPDX-License-Identifier: GPL-2.0 ++ VERSION = 4 ++- PATCHLEVEL = 18 +++ PATCHLEVEL = 19 ++ SUBLEVEL = 0 ++ -EXTRAVERSION = ++ +EXTRAVERSION = -bfq-mq ++- NAME = Merciless Moray +++ NAME = "People's Front" ++ ++ # *DOCUMENTATION* ++ # To see a list of typical targets execute "make help" ++diff --cc include/linux/blkdev.h ++index 897c63322bd7,6980014357d4..8c4568ea6884 ++--- a/include/linux/blkdev.h +++++ b/include/linux/blkdev.h ++@@@ -56,7 -54,7 +54,7 @@@ struct blk_stat_callback ++ * Maximum number of blkcg policies allowed to be registered concurrently. ++ * Defined here to simplify include dependency. ++ */ ++--#define BLKCG_MAX_POLS 5 ++++#define BLKCG_MAX_POLS 7 ++ ++ typedef void (rq_end_io_fn)(struct request *, blk_status_t); ++ ++--------------------------------------------------------------- ++ ++Second, the following port commit had to be made: ++port commit "block: use ktime_get_ns() instead of sched_clock() for cfq and bfq" +diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig +index e32fc1f274d8..94cb28eb20ba 100644 +--- a/arch/x86/configs/x86_64_defconfig ++++ b/arch/x86/configs/x86_64_defconfig +@@ -12,6 +12,11 @@ CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y + CONFIG_LOG_BUF_SHIFT=18 + CONFIG_CGROUPS=y ++CONFIG_BLK_CGROUP=y ++CONFIG_IOSCHED_BFQ_SQ=y ++CONFIG_BFQ_SQ_GROUP_IOSCHED=y ++CONFIG_MQ_IOSCHED_BFQ=y ++CONFIG_MQ_BFQ_GROUP_IOSCHED=y + CONFIG_CGROUP_FREEZER=y + CONFIG_CPUSETS=y + CONFIG_CGROUP_CPUACCT=y +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched +index a4a8914bf7a4..299a6861fb90 100644 +--- a/block/Kconfig.iosched ++++ b/block/Kconfig.iosched +@@ -40,6 +40,26 @@ config CFQ_GROUP_IOSCHED + ---help--- + Enable group IO scheduling in CFQ. 
+ ++config IOSCHED_BFQ_SQ ++ tristate "BFQ-SQ I/O scheduler" ++ default n ++ ---help--- ++ The BFQ-SQ I/O scheduler (for legacy blk: SQ stands for ++ SingleQueue) distributes bandwidth among all processes ++ according to their weights, regardless of the device ++ parameters and with any workload. It also guarantees a low ++ latency to interactive and soft real-time applications. ++ Details in Documentation/block/bfq-iosched.txt ++ ++config BFQ_SQ_GROUP_IOSCHED ++ bool "BFQ-SQ hierarchical scheduling support" ++ depends on IOSCHED_BFQ_SQ && BLK_CGROUP ++ default n ++ ---help--- ++ ++ Enable hierarchical scheduling in BFQ-SQ, using the blkio ++ (cgroups-v1) or io (cgroups-v2) controller. ++ + choice + + prompt "Default I/O scheduler" +@@ -54,6 +74,16 @@ choice + config DEFAULT_CFQ + bool "CFQ" if IOSCHED_CFQ=y + ++ config DEFAULT_BFQ_SQ ++ bool "BFQ-SQ" if IOSCHED_BFQ_SQ=y ++ help ++ Selects BFQ-SQ as the default I/O scheduler which will be ++ used by default for all block devices. ++ The BFQ-SQ I/O scheduler aims at distributing the bandwidth ++ as desired, independently of the disk parameters and with ++ any workload. It also tries to guarantee low latency to ++ interactive and soft real-time applications. ++ + config DEFAULT_NOOP + bool "No-op" + +@@ -63,8 +93,28 @@ config DEFAULT_IOSCHED + string + default "deadline" if DEFAULT_DEADLINE + default "cfq" if DEFAULT_CFQ ++ default "bfq-sq" if DEFAULT_BFQ_SQ + default "noop" if DEFAULT_NOOP + ++config MQ_IOSCHED_BFQ ++ tristate "BFQ-MQ I/O Scheduler" ++ default y ++ ---help--- ++ BFQ I/O scheduler for BLK-MQ. BFQ-MQ distributes bandwidth ++ among all processes according to their weights, regardless of ++ the device parameters and with any workload. It also ++ guarantees a low latency to interactive and soft real-time ++ applications. Details in Documentation/block/bfq-iosched.txt ++ ++config MQ_BFQ_GROUP_IOSCHED ++ bool "BFQ-MQ hierarchical scheduling support" ++ depends on MQ_IOSCHED_BFQ && BLK_CGROUP ++ default n ++ ---help--- ++ ++ Enable hierarchical scheduling in BFQ-MQ, using the blkio ++ (cgroups-v1) or io (cgroups-v2) controller. ++ + config MQ_IOSCHED_DEADLINE + tristate "MQ deadline I/O scheduler" + default y +diff --git a/block/Makefile b/block/Makefile +index 572b33f32c07..1dd6ffdc2fee 100644 +--- a/block/Makefile ++++ b/block/Makefile +@@ -25,6 +25,8 @@ obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o + obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o + bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o + obj-$(CONFIG_IOSCHED_BFQ) += bfq.o ++obj-$(CONFIG_IOSCHED_BFQ_SQ) += bfq-sq-iosched.o ++obj-$(CONFIG_MQ_IOSCHED_BFQ) += bfq-mq-iosched.o + + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o + obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o +diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c +new file mode 100644 +index 000000000000..15459e50cd6a +--- /dev/null ++++ b/block/bfq-cgroup-included.c +@@ -0,0 +1,1359 @@ ++/* ++ * BFQ: CGROUPS support. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2015 Paolo Valente ++ * ++ * Copyright (C) 2016 Paolo Valente ++ * ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ ++ * file. 
++ */ ++ ++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) ++ ++/* bfqg stats flags */ ++enum bfqg_stats_flags { ++ BFQG_stats_waiting = 0, ++ BFQG_stats_idling, ++ BFQG_stats_empty, ++}; ++ ++#define BFQG_FLAG_FNS(name) \ ++static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ ++{ \ ++ stats->flags |= (1 << BFQG_stats_##name); \ ++} \ ++static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ ++{ \ ++ stats->flags &= ~(1 << BFQG_stats_##name); \ ++} \ ++static int bfqg_stats_##name(struct bfqg_stats *stats) \ ++{ \ ++ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ ++} \ ++ ++BFQG_FLAG_FNS(waiting) ++BFQG_FLAG_FNS(idling) ++BFQG_FLAG_FNS(empty) ++#undef BFQG_FLAG_FNS ++ ++#ifdef BFQ_MQ ++/* This should be called with the scheduler lock held. */ ++#else ++/* This should be called with the queue_lock held. */ ++#endif ++static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) ++{ ++ u64 now; ++ ++ if (!bfqg_stats_waiting(stats)) ++ return; ++ ++ now = ktime_get_ns(); ++ if (now > stats->start_group_wait_time) ++ blkg_stat_add(&stats->group_wait_time, ++ now - stats->start_group_wait_time); ++ bfqg_stats_clear_waiting(stats); ++} ++ ++#ifdef BFQ_MQ ++/* This should be called with the scheduler lock held. */ ++#else ++/* This should be called with the queue_lock held. */ ++#endif ++static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, ++ struct bfq_group *curr_bfqg) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ ++ if (bfqg_stats_waiting(stats)) ++ return; ++ if (bfqg == curr_bfqg) ++ return; ++ stats->start_group_wait_time = ktime_get_ns(); ++ bfqg_stats_mark_waiting(stats); ++} ++ ++#ifdef BFQ_MQ ++/* This should be called with the scheduler lock held. */ ++#else ++/* This should be called with the queue_lock held. */ ++#endif ++static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) ++{ ++ u64 now; ++ ++ if (!bfqg_stats_empty(stats)) ++ return; ++ ++ now = ktime_get_ns(); ++ if (now > stats->start_empty_time) ++ blkg_stat_add(&stats->empty_time, ++ now - stats->start_empty_time); ++ bfqg_stats_clear_empty(stats); ++} ++ ++static void bfqg_stats_update_dequeue(struct bfq_group *bfqg) ++{ ++ blkg_stat_add(&bfqg->stats.dequeue, 1); ++} ++ ++static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ ++ if (blkg_rwstat_total(&stats->queued)) ++ return; ++ ++ /* ++ * group is already marked empty. This can happen if bfqq got new ++ * request in parent group and moved to this group while being added ++ * to service tree. Just ignore the event and move on. 
++ */ ++ if (bfqg_stats_empty(stats)) ++ return; ++ ++ stats->start_empty_time = ktime_get_ns(); ++ bfqg_stats_mark_empty(stats); ++} ++ ++static void bfqg_stats_update_idle_time(struct bfq_group *bfqg) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ ++ if (bfqg_stats_idling(stats)) { ++ u64 now = ktime_get_ns(); ++ ++ if (now > stats->start_idle_time) ++ blkg_stat_add(&stats->idle_time, ++ now - stats->start_idle_time); ++ bfqg_stats_clear_idling(stats); ++ } ++} ++ ++static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ ++ stats->start_idle_time = ktime_get_ns(); ++ bfqg_stats_mark_idling(stats); ++} ++ ++static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ ++ blkg_stat_add(&stats->avg_queue_size_sum, ++ blkg_rwstat_total(&stats->queued)); ++ blkg_stat_add(&stats->avg_queue_size_samples, 1); ++ bfqg_stats_update_group_wait_time(stats); ++} ++ ++static void bfqg_stats_update_io_add(struct bfq_group *bfqg, ++ struct bfq_queue *bfqq, ++ unsigned int op) ++{ ++ blkg_rwstat_add(&bfqg->stats.queued, op, 1); ++ bfqg_stats_end_empty_time(&bfqg->stats); ++ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) ++ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); ++} ++ ++static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) ++{ ++ blkg_rwstat_add(&bfqg->stats.queued, op, -1); ++} ++ ++static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) ++{ ++ blkg_rwstat_add(&bfqg->stats.merged, op, 1); ++} ++ ++static void bfqg_stats_update_completion(struct bfq_group *bfqg, ++ u64 start_time_ns, ++ u64 io_start_time_ns, ++ unsigned int op) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ u64 now = ktime_get_ns(); ++ ++ if (now > io_start_time_ns) ++ blkg_rwstat_add(&stats->service_time, op, ++ now - io_start_time_ns); ++ if (io_start_time_ns > start_time_ns) ++ blkg_rwstat_add(&stats->wait_time, op, ++ io_start_time_ns - start_time_ns); ++} ++ ++#else /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ ++ ++static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, ++ struct bfq_queue *bfqq, unsigned int op) { } ++static inline void ++bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } ++static inline void ++bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } ++static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, ++ u64 start_time_ns, ++ u64 io_start_time_ns, ++ unsigned int op) { } ++static inline void ++bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, ++ struct bfq_group *curr_bfqg) { } ++static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } ++static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } ++static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } ++static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } ++static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } ++static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } ++ ++#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static struct blkcg_policy blkcg_policy_bfq; ++ ++/* ++ * blk-cgroup policy-related handlers ++ * The following functions help in converting between blk-cgroup ++ * internal structures and BFQ-specific structures. 
++ */
++
++static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
++{
++	return pd ? container_of(pd, struct bfq_group, pd) : NULL;
++}
++
++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
++{
++	return pd_to_blkg(&bfqg->pd);
++}
++
++static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
++{
++	struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq);
++
++	return pd_to_bfqg(pd);
++}
++
++/*
++ * bfq_group handlers
++ * The following functions help in navigating the bfq_group hierarchy
++ * by allowing to find the parent of a bfq_group or the bfq_group
++ * associated to a bfq_queue.
++ */
++
++static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
++{
++	struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
++
++	return pblkg ? blkg_to_bfqg(pblkg) : NULL;
++}
++
++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
++{
++	struct bfq_entity *group_entity = bfqq->entity.parent;
++
++	return group_entity ? container_of(group_entity, struct bfq_group,
++					   entity) :
++			      bfqq->bfqd->root_group;
++}
++
++/*
++ * The following two functions handle get and put of a bfq_group by
++ * wrapping the related blk-cgroup hooks.
++ */
++
++static void bfqg_get(struct bfq_group *bfqg)
++{
++#ifdef BFQ_MQ
++	bfqg->ref++;
++#else
++	blkg_get(bfqg_to_blkg(bfqg));
++#endif
++}
++
++static void bfqg_put(struct bfq_group *bfqg)
++{
++#ifdef BFQ_MQ
++	bfqg->ref--;
++
++	BUG_ON(bfqg->ref < 0);
++	if (bfqg->ref == 0)
++		kfree(bfqg);
++#else
++	blkg_put(bfqg_to_blkg(bfqg));
++#endif
++}
++
++#ifdef BFQ_MQ
++static void bfqg_and_blkg_get(struct bfq_group *bfqg)
++{
++	/* see comments in bfq_bic_update_cgroup for why refcounting bfqg */
++	bfqg_get(bfqg);
++
++	blkg_get(bfqg_to_blkg(bfqg));
++}
++
++static void bfqg_and_blkg_put(struct bfq_group *bfqg)
++{
++	blkg_put(bfqg_to_blkg(bfqg));
++
++	bfqg_put(bfqg);
++}
++#endif
++
++/* @stats = 0 */
++static void bfqg_stats_reset(struct bfqg_stats *stats)
++{
++#ifdef CONFIG_DEBUG_BLK_CGROUP
++	/* queued stats shouldn't be cleared */
++	blkg_rwstat_reset(&stats->merged);
++	blkg_rwstat_reset(&stats->service_time);
++	blkg_rwstat_reset(&stats->wait_time);
++	blkg_stat_reset(&stats->time);
++	blkg_stat_reset(&stats->avg_queue_size_sum);
++	blkg_stat_reset(&stats->avg_queue_size_samples);
++	blkg_stat_reset(&stats->dequeue);
++	blkg_stat_reset(&stats->group_wait_time);
++	blkg_stat_reset(&stats->idle_time);
++	blkg_stat_reset(&stats->empty_time);
++#endif
++}
++
++/* @to += @from */
++static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
++{
++	if (!to || !from)
++		return;
++
++#ifdef CONFIG_DEBUG_BLK_CGROUP
++	/* queued stats shouldn't be cleared */
++	blkg_rwstat_add_aux(&to->merged, &from->merged);
++	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
++	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
++	/* accumulate into @to, per the @to += @from contract above */
++	blkg_stat_add_aux(&to->time, &from->time);
++	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
++	blkg_stat_add_aux(&to->avg_queue_size_samples,
++			  &from->avg_queue_size_samples);
++	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
++	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
++	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
++	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
++#endif
++}
++
++/*
++ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors'
++ * recursive stats can still account for the amount used by this bfqg after
++ * it's gone.
++ */ ++static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) ++{ ++ struct bfq_group *parent; ++ ++ if (!bfqg) /* root_group */ ++ return; ++ ++ parent = bfqg_parent(bfqg); ++ ++ lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); ++ ++ if (unlikely(!parent)) ++ return; ++ ++ bfqg_stats_add_aux(&parent->stats, &bfqg->stats); ++ bfqg_stats_reset(&bfqg->stats); ++} ++ ++static void bfq_init_entity(struct bfq_entity *entity, ++ struct bfq_group *bfqg) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ entity->weight = entity->new_weight; ++ entity->orig_weight = entity->new_weight; ++ if (bfqq) { ++ bfqq->ioprio = bfqq->new_ioprio; ++ bfqq->ioprio_class = bfqq->new_ioprio_class; ++#ifdef BFQ_MQ ++ /* ++ * Make sure that bfqg and its associated blkg do not ++ * disappear before entity. ++ */ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "getting bfqg %p and blkg\n", ++ bfqg); ++ ++ bfqg_and_blkg_get(bfqg); ++#else ++ bfqg_get(bfqg); ++#endif ++ } ++ entity->parent = bfqg->my_entity; /* NULL for root group */ ++ entity->sched_data = &bfqg->sched_data; ++} ++ ++static void bfqg_stats_exit(struct bfqg_stats *stats) ++{ ++#ifdef CONFIG_DEBUG_BLK_CGROUP ++ blkg_rwstat_exit(&stats->merged); ++ blkg_rwstat_exit(&stats->service_time); ++ blkg_rwstat_exit(&stats->wait_time); ++ blkg_rwstat_exit(&stats->queued); ++ blkg_stat_exit(&stats->time); ++ blkg_stat_exit(&stats->avg_queue_size_sum); ++ blkg_stat_exit(&stats->avg_queue_size_samples); ++ blkg_stat_exit(&stats->dequeue); ++ blkg_stat_exit(&stats->group_wait_time); ++ blkg_stat_exit(&stats->idle_time); ++ blkg_stat_exit(&stats->empty_time); ++#endif ++} ++ ++static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) ++{ ++#ifdef CONFIG_DEBUG_BLK_CGROUP ++ if (blkg_rwstat_init(&stats->merged, gfp) || ++ blkg_rwstat_init(&stats->service_time, gfp) || ++ blkg_rwstat_init(&stats->wait_time, gfp) || ++ blkg_rwstat_init(&stats->queued, gfp) || ++ blkg_stat_init(&stats->time, gfp) || ++ blkg_stat_init(&stats->avg_queue_size_sum, gfp) || ++ blkg_stat_init(&stats->avg_queue_size_samples, gfp) || ++ blkg_stat_init(&stats->dequeue, gfp) || ++ blkg_stat_init(&stats->group_wait_time, gfp) || ++ blkg_stat_init(&stats->idle_time, gfp) || ++ blkg_stat_init(&stats->empty_time, gfp)) { ++ bfqg_stats_exit(stats); ++ return -ENOMEM; ++ } ++#endif ++ ++ return 0; ++} ++ ++static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) ++{ ++ return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; ++} ++ ++static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) ++{ ++ return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); ++} ++ ++static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) ++{ ++ struct bfq_group_data *bgd; ++ ++ bgd = kzalloc(sizeof(*bgd), gfp); ++ if (!bgd) ++ return NULL; ++ return &bgd->pd; ++} ++ ++static void bfq_cpd_init(struct blkcg_policy_data *cpd) ++{ ++ struct bfq_group_data *d = cpd_to_bfqgd(cpd); ++ ++ d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? 
++ CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; ++} ++ ++static void bfq_cpd_free(struct blkcg_policy_data *cpd) ++{ ++ kfree(cpd_to_bfqgd(cpd)); ++} ++ ++static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) ++{ ++ struct bfq_group *bfqg; ++ ++ bfqg = kzalloc_node(sizeof(*bfqg), gfp, node); ++ if (!bfqg) ++ return NULL; ++ ++ if (bfqg_stats_init(&bfqg->stats, gfp)) { ++ kfree(bfqg); ++ return NULL; ++ } ++#ifdef BFQ_MQ ++ /* see comments in bfq_bic_update_cgroup for why refcounting */ ++ bfqg_get(bfqg); ++#endif ++ return &bfqg->pd; ++} ++ ++static void bfq_pd_init(struct blkg_policy_data *pd) ++{ ++ struct blkcg_gq *blkg; ++ struct bfq_group *bfqg; ++ struct bfq_data *bfqd; ++ struct bfq_entity *entity; ++ struct bfq_group_data *d; ++ ++ blkg = pd_to_blkg(pd); ++ BUG_ON(!blkg); ++ bfqg = blkg_to_bfqg(blkg); ++ bfqd = blkg->q->elevator->elevator_data; ++ BUG_ON(bfqg == bfqd->root_group); ++ entity = &bfqg->entity; ++ d = blkcg_to_bfqgd(blkg->blkcg); ++ ++ entity->orig_weight = entity->weight = entity->new_weight = d->weight; ++ entity->my_sched_data = &bfqg->sched_data; ++ bfqg->my_entity = entity; /* ++ * the root_group's will be set to NULL ++ * in bfq_init_queue() ++ */ ++ bfqg->bfqd = bfqd; ++ bfqg->active_entities = 0; ++ bfqg->rq_pos_tree = RB_ROOT; ++} ++ ++static void bfq_pd_free(struct blkg_policy_data *pd) ++{ ++ struct bfq_group *bfqg = pd_to_bfqg(pd); ++ ++ bfqg_stats_exit(&bfqg->stats); ++#ifdef BFQ_MQ ++ bfqg_put(bfqg); ++#else ++ kfree(bfqg); ++#endif ++} ++ ++static void bfq_pd_reset_stats(struct blkg_policy_data *pd) ++{ ++ struct bfq_group *bfqg = pd_to_bfqg(pd); ++ ++ bfqg_stats_reset(&bfqg->stats); ++} ++ ++static void bfq_group_set_parent(struct bfq_group *bfqg, ++ struct bfq_group *parent) ++{ ++ struct bfq_entity *entity; ++ ++ BUG_ON(!parent); ++ BUG_ON(!bfqg); ++ BUG_ON(bfqg == parent); ++ ++ entity = &bfqg->entity; ++ entity->parent = parent->my_entity; ++ entity->sched_data = &parent->sched_data; ++} ++ ++static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, ++ struct blkcg *blkcg) ++{ ++ struct blkcg_gq *blkg; ++ ++ blkg = blkg_lookup(blkcg, bfqd->queue); ++ if (likely(blkg)) ++ return blkg_to_bfqg(blkg); ++ return NULL; ++} ++ ++static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, ++ struct blkcg *blkcg) ++{ ++ struct bfq_group *bfqg, *parent; ++ struct bfq_entity *entity; ++ ++ bfqg = bfq_lookup_bfqg(bfqd, blkcg); ++ ++ if (unlikely(!bfqg)) ++ return NULL; ++ ++ /* ++ * Update chain of bfq_groups as we might be handling a leaf group ++ * which, along with some of its relatives, has not been hooked yet ++ * to the private hierarchy of BFQ. ++ */ ++ entity = &bfqg->entity; ++ for_each_entity(entity) { ++ bfqg = container_of(entity, struct bfq_group, entity); ++ BUG_ON(!bfqg); ++ if (bfqg != bfqd->root_group) { ++ parent = bfqg_parent(bfqg); ++ if (!parent) ++ parent = bfqd->root_group; ++ BUG_ON(!parent); ++ bfq_group_set_parent(bfqg, parent); ++ } ++ } ++ ++ return bfqg; ++} ++ ++static void bfq_pos_tree_add_move(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq); ++ ++static void bfq_bfqq_expire(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool compensate, ++ enum bfqq_expiration reason); ++ ++/** ++ * bfq_bfqq_move - migrate @bfqq to @bfqg. ++ * @bfqd: queue descriptor. ++ * @bfqq: the queue to move. ++ * @bfqg: the group to move to. ++ * ++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating ++ * it on the new one. Avoid putting the entity on the old group idle tree. 
++ * ++#ifdef BFQ_MQ ++ * Must be called under the scheduler lock, to make sure that the blkg ++ * owning @bfqg does not disappear (see comments in ++ * bfq_bic_update_cgroup on guaranteeing the consistency of blkg ++ * objects). ++#else ++ * Must be called under the queue lock; the cgroup owning @bfqg must ++ * not disappear (by now this just means that we are called under ++ * rcu_read_lock()). ++#endif ++ */ ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct bfq_group *bfqg) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list)); ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st); ++ BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) ++ && entity->on_st && ++ bfqq != bfqd->in_service_queue); ++ BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue); ++ ++ /* If bfqq is empty, then bfq_bfqq_expire also invokes ++ * bfq_del_bfqq_busy, thereby removing bfqq and its entity ++ * from data structures related to current group. Otherwise we ++ * need to remove bfqq explicitly with bfq_deactivate_bfqq, as ++ * we do below. ++ */ ++ if (bfqq == bfqd->in_service_queue) ++ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, ++ false, BFQ_BFQQ_PREEMPTED); ++ ++ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) ++ && &bfq_entity_service_tree(entity)->idle != ++ entity->tree); ++ ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); ++ ++ if (bfq_bfqq_busy(bfqq)) ++ bfq_deactivate_bfqq(bfqd, bfqq, false, false); ++ else if (entity->on_st) { ++ BUG_ON(&bfq_entity_service_tree(entity)->idle != ++ entity->tree); ++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); ++ } ++#ifdef BFQ_MQ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "putting blkg and bfqg %p\n", bfqg); ++ ++ bfqg_and_blkg_put(bfqq_group(bfqq)); ++#else ++ bfqg_put(bfqq_group(bfqq)); ++#endif ++ ++ entity->parent = bfqg->my_entity; ++ entity->sched_data = &bfqg->sched_data; ++#ifdef BFQ_MQ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "getting blkg and bfqg %p\n", bfqg); ++ ++ /* pin down bfqg and its associated blkg */ ++ bfqg_and_blkg_get(bfqg); ++#else ++ bfqg_get(bfqg); ++#endif ++ ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); ++ if (bfq_bfqq_busy(bfqq)) { ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ bfq_activate_bfqq(bfqd, bfqq); ++ } ++ ++ if (!bfqd->in_service_queue && !bfqd->rq_in_driver) ++ bfq_schedule_dispatch(bfqd); ++ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) ++ && &bfq_entity_service_tree(entity)->idle != ++ entity->tree); ++} ++ ++/** ++ * __bfq_bic_change_cgroup - move @bic to @cgroup. ++ * @bfqd: the queue descriptor. ++ * @bic: the bic to move. ++ * @blkcg: the blk-cgroup to move to. ++ * ++#ifdef BFQ_MQ ++ * Move bic to blkcg, assuming that bfqd->lock is held; which makes ++ * sure that the reference to cgroup is valid across the call (see ++ * comments in bfq_bic_update_cgroup on this issue) ++#else ++ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller ++ * has to make sure that the reference to cgroup is valid across the call. ++#endif ++ * ++ * NOTE: an alternative approach might have been to store the current ++ * cgroup in bfqq and getting a reference to it, reducing the lookup ++ * time here, at the price of slightly more complex code. 
++ */ ++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, ++ struct bfq_io_cq *bic, ++ struct blkcg *blkcg) ++{ ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); ++ struct bfq_group *bfqg; ++ struct bfq_entity *entity; ++ ++ bfqg = bfq_find_set_group(bfqd, blkcg); ++ ++ if (unlikely(!bfqg)) ++ bfqg = bfqd->root_group; ++ ++ if (async_bfqq) { ++ entity = &async_bfqq->entity; ++ ++ if (entity->sched_data != &bfqg->sched_data) { ++ bic_set_bfqq(bic, NULL, 0); ++ bfq_log_bfqq(bfqd, async_bfqq, ++ "%p %d", ++ async_bfqq, ++ async_bfqq->ref); ++ bfq_put_queue(async_bfqq); ++ } ++ } ++ ++ if (sync_bfqq) { ++ entity = &sync_bfqq->entity; ++ if (entity->sched_data != &bfqg->sched_data) ++ bfq_bfqq_move(bfqd, sync_bfqq, bfqg); ++ } ++ ++ return bfqg; ++} ++ ++static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) ++{ ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ struct bfq_group *bfqg = NULL; ++ uint64_t serial_nr; ++ ++ rcu_read_lock(); ++ serial_nr = bio_blkcg(bio)->css.serial_nr; ++ ++ /* ++ * Check whether blkcg has changed. The condition may trigger ++ * spuriously on a newly created cic but there's no harm. ++ */ ++ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) ++ goto out; ++ ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); ++#ifdef BFQ_MQ ++ /* ++ * Update blkg_path for bfq_log_* functions. We cache this ++ * path, and update it here, for the following ++ * reasons. Operations on blkg objects in blk-cgroup are ++ * protected with the request_queue lock, and not with the ++ * lock that protects the instances of this scheduler ++ * (bfqd->lock). This exposes BFQ to the following sort of ++ * race. ++ * ++ * The blkg_lookup performed in bfq_get_queue, protected ++ * through rcu, may happen to return the address of a copy of ++ * the original blkg. If this is the case, then the ++ * bfqg_and_blkg_get performed in bfq_get_queue, to pin down ++ * the blkg, is useless: it does not prevent blk-cgroup code ++ * from destroying both the original blkg and all objects ++ * directly or indirectly referred by the copy of the ++ * blkg. ++ * ++ * On the bright side, destroy operations on a blkg invoke, as ++ * a first step, hooks of the scheduler associated with the ++ * blkg. And these hooks are executed with bfqd->lock held for ++ * BFQ. As a consequence, for any blkg associated with the ++ * request queue this instance of the scheduler is attached ++ * to, we are guaranteed that such a blkg is not destroyed, and ++ * that all the pointers it contains are consistent, while we ++ * are holding bfqd->lock. A blkg_lookup performed with ++ * bfqd->lock held then returns a fully consistent blkg, which ++ * remains consistent until this lock is held. ++ * ++ * Thanks to the last fact, and to the fact that: (1) bfqg has ++ * been obtained through a blkg_lookup in the above ++ * assignment, and (2) bfqd->lock is being held, here we can ++ * safely use the policy data for the involved blkg (i.e., the ++ * field bfqg->pd) to get to the blkg associated with bfqg, ++ * and then we can safely use any field of blkg. After we ++ * release bfqd->lock, even just getting blkg through this ++ * bfqg may cause dangling references to be traversed, as ++ * bfqg->pd may not exist any more. ++ * ++ * In view of the above facts, here we cache, in the bfqg, any ++ * blkg data we may need for this bic, and for its associated ++ * bfq_queue. 
As of now, we need to cache only the path of the ++ * blkg, which is used in the bfq_log_* functions. ++ * ++ * Finally, note that bfqg itself needs to be protected from ++ * destruction on the blkg_free of the original blkg (which ++ * invokes bfq_pd_free). We use an additional private ++ * refcounter for bfqg, to let it disappear only after no ++ * bfq_queue refers to it any longer. ++ */ ++ blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path)); ++#endif ++ bic->blkcg_serial_nr = serial_nr; ++out: ++ rcu_read_unlock(); ++} ++ ++/** ++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. ++ * @st: the service tree being flushed. ++ */ ++static void bfq_flush_idle_tree(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *entity = st->first_idle; ++ ++ for (; entity ; entity = st->first_idle) ++ __bfq_deactivate_entity(entity, false); ++} ++ ++/** ++ * bfq_reparent_leaf_entity - move leaf entity to the root_group. ++ * @bfqd: the device data structure with the root group. ++ * @entity: the entity to move. ++ */ ++static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ BUG_ON(!bfqq); ++ bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); ++} ++ ++/** ++ * bfq_reparent_active_entities - move to the root group all active ++ * entities. ++ * @bfqd: the device data structure with the root group. ++ * @bfqg: the group to move from. ++ * @st: the service tree with the entities. ++ */ ++static void bfq_reparent_active_entities(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ struct bfq_service_tree *st) ++{ ++ struct rb_root *active = &st->active; ++ struct bfq_entity *entity = NULL; ++ ++ if (!RB_EMPTY_ROOT(&st->active)) ++ entity = bfq_entity_of(rb_first(active)); ++ ++ for (; entity ; entity = bfq_entity_of(rb_first(active))) ++ bfq_reparent_leaf_entity(bfqd, entity); ++ ++ if (bfqg->sched_data.in_service_entity) ++ bfq_reparent_leaf_entity(bfqd, ++ bfqg->sched_data.in_service_entity); ++} ++ ++/** ++ * bfq_pd_offline - deactivate the entity associated with @pd, ++ * and reparent its children entities. ++ * @pd: descriptor of the policy going offline. ++ * ++ * blkio already grabs the queue_lock for us, so no need to use ++ * RCU-based magic ++ */ ++static void bfq_pd_offline(struct blkg_policy_data *pd) ++{ ++ struct bfq_service_tree *st; ++ struct bfq_group *bfqg; ++ struct bfq_data *bfqd; ++ struct bfq_entity *entity; ++#ifdef BFQ_MQ ++ unsigned long flags; ++#endif ++ int i; ++ ++ BUG_ON(!pd); ++ bfqg = pd_to_bfqg(pd); ++ BUG_ON(!bfqg); ++ bfqd = bfqg->bfqd; ++ BUG_ON(bfqd && !bfqd->root_group); ++ ++ entity = bfqg->my_entity; ++ ++#ifdef BFQ_MQ ++ spin_lock_irqsave(&bfqd->lock, flags); ++#endif ++ ++ if (!entity) /* root group */ ++ goto put_async_queues; ++ ++ /* ++ * Empty all service_trees belonging to this group before ++ * deactivating the group itself. ++ */ ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { ++ BUG_ON(!bfqg->sched_data.service_tree); ++ st = bfqg->sched_data.service_tree + i; ++ /* ++ * The idle tree may still contain bfq_queues belonging ++ * to exited task because they never migrated to a different ++ * cgroup from the one being destroyed now. ++ */ ++ bfq_flush_idle_tree(st); ++ ++ /* ++ * It may happen that some queues are still active ++ * (busy) upon group destruction (if the corresponding ++ * processes have been forced to terminate). We move ++ * all the leaf entities corresponding to these queues ++ * to the root_group. 
++ * Also, it may happen that the group has an entity ++ * in service, which is disconnected from the active ++ * tree: it must be moved, too. ++ * There is no need to put the sync queues, as the ++ * scheduler has taken no reference. ++ */ ++ bfq_reparent_active_entities(bfqd, bfqg, st); ++ BUG_ON(!RB_EMPTY_ROOT(&st->active)); ++ BUG_ON(!RB_EMPTY_ROOT(&st->idle)); ++ } ++ BUG_ON(bfqg->sched_data.next_in_service); ++ BUG_ON(bfqg->sched_data.in_service_entity); ++ ++ __bfq_deactivate_entity(entity, false); ++ ++put_async_queues: ++ bfq_put_async_queues(bfqd, bfqg); ++ ++#ifdef BFQ_MQ ++ spin_unlock_irqrestore(&bfqd->lock, flags); ++#endif ++ /* ++ * @blkg is going offline and will be ignored by ++ * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so ++ * that they don't get lost. If IOs complete after this point, the ++ * stats for them will be lost. Oh well... ++ */ ++ bfqg_stats_xfer_dead(bfqg); ++} ++ ++static void bfq_end_wr_async(struct bfq_data *bfqd) ++{ ++ struct blkcg_gq *blkg; ++ ++ list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { ++ struct bfq_group *bfqg = blkg_to_bfqg(blkg); ++ BUG_ON(!bfqg); ++ ++ bfq_end_wr_async_queues(bfqd, bfqg); ++ } ++ bfq_end_wr_async_queues(bfqd, bfqd->root_group); ++} ++ ++static int bfq_io_show_weight(struct seq_file *sf, void *v) ++{ ++ struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); ++ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); ++ unsigned int val = 0; ++ ++ if (bfqgd) ++ val = bfqgd->weight; ++ ++ seq_printf(sf, "%u\n", val); ++ ++ return 0; ++} ++ ++static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, ++ struct cftype *cftype, ++ u64 val) ++{ ++ struct blkcg *blkcg = css_to_blkcg(css); ++ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); ++ struct blkcg_gq *blkg; ++ int ret = -ERANGE; ++ ++ if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) ++ return ret; ++ ++ ret = 0; ++ spin_lock_irq(&blkcg->lock); ++ bfqgd->weight = (unsigned short)val; ++ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { ++ struct bfq_group *bfqg = blkg_to_bfqg(blkg); ++ ++ if (!bfqg) ++ continue; ++ /* ++ * Setting the prio_changed flag of the entity ++ * to 1 with new_weight == weight would re-set ++ * the value of the weight to its ioprio mapping. ++ * Set the flag only if necessary. ++ */ ++ if ((unsigned short)val != bfqg->entity.new_weight) { ++ bfqg->entity.new_weight = (unsigned short)val; ++ /* ++ * Make sure that the above new value has been ++ * stored in bfqg->entity.new_weight before ++ * setting the prio_changed flag. In fact, ++ * this flag may be read asynchronously (in ++ * critical sections protected by a different ++ * lock than that held here), and finding this ++ * flag set may cause the execution of the code ++ * for updating parameters whose value may ++ * depend also on bfqg->entity.new_weight (in ++ * __bfq_entity_update_weight_prio). ++ * This barrier makes sure that the new value ++ * of bfqg->entity.new_weight is correctly ++ * seen in that code. 
++ */ ++ smp_wmb(); ++ bfqg->entity.prio_changed = 1; ++ } ++ } ++ spin_unlock_irq(&blkcg->lock); ++ ++ return ret; ++} ++ ++static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, ++ char *buf, size_t nbytes, ++ loff_t off) ++{ ++ u64 weight; ++ /* First unsigned long found in the file is used */ ++ int ret = kstrtoull(strim(buf), 0, &weight); ++ ++ if (ret) ++ return ret; ++ ++ ret = bfq_io_set_weight_legacy(of_css(of), NULL, weight); ++ return ret ?: nbytes; ++} ++ ++#ifdef CONFIG_DEBUG_BLK_CGROUP ++static int bfqg_print_stat(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, ++ &blkcg_policy_bfq, seq_cft(sf)->private, false); ++ return 0; ++} ++ ++static int bfqg_print_rwstat(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, ++ &blkcg_policy_bfq, seq_cft(sf)->private, true); ++ return 0; ++} ++ ++static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, ++ struct blkg_policy_data *pd, int off) ++{ ++ u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), ++ &blkcg_policy_bfq, off); ++ return __blkg_prfill_u64(sf, pd, sum); ++} ++ ++static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, ++ struct blkg_policy_data *pd, int off) ++{ ++ struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), ++ &blkcg_policy_bfq, ++ off); ++ return __blkg_prfill_rwstat(sf, pd, &sum); ++} ++ ++static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), ++ bfqg_prfill_stat_recursive, &blkcg_policy_bfq, ++ seq_cft(sf)->private, false); ++ return 0; ++} ++ ++static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), ++ bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, ++ seq_cft(sf)->private, true); ++ return 0; ++} ++ ++static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, ++ int off) ++{ ++ u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); ++ ++ return __blkg_prfill_u64(sf, pd, sum >> 9); ++} ++ ++static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), ++ bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); ++ return 0; ++} ++ ++static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, ++ struct blkg_policy_data *pd, int off) ++{ ++ struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, ++ offsetof(struct blkcg_gq, stat_bytes)); ++ u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + ++ atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); ++ ++ return __blkg_prfill_u64(sf, pd, sum >> 9); ++} ++ ++static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), ++ bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, ++ false); ++ return 0; ++} ++ ++ ++static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, ++ struct blkg_policy_data *pd, int off) ++{ ++ struct bfq_group *bfqg = pd_to_bfqg(pd); ++ u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); ++ u64 v = 0; ++ ++ if (samples) { ++ v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); ++ v = div64_u64(v, samples); ++ } ++ __blkg_prfill_u64(sf, pd, v); ++ return 0; ++} ++ ++/* print avg_queue_size */ ++static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), ++ bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, ++ 0, false); ++ return 0; ++} ++#endif /* 
CONFIG_DEBUG_BLK_CGROUP */
++
++static struct bfq_group *
++bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
++{
++	int ret;
++
++	ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
++	if (ret)
++		return NULL;
++
++	return blkg_to_bfqg(bfqd->queue->root_blkg);
++}
++
++#ifdef BFQ_MQ
++#define BFQ_CGROUP_FNAME(param) "bfq-mq."#param
++#else
++#define BFQ_CGROUP_FNAME(param) "bfq-sq."#param
++#endif
++
++static struct cftype bfq_blkcg_legacy_files[] = {
++	{
++		.name = BFQ_CGROUP_FNAME(weight),
++		.flags = CFTYPE_NOT_ON_ROOT,
++		.seq_show = bfq_io_show_weight,
++		.write_u64 = bfq_io_set_weight_legacy,
++	},
++
++	/* statistics, covering only the tasks in the bfqg */
++	{
++		.name = BFQ_CGROUP_FNAME(io_service_bytes),
++		.private = (unsigned long)&blkcg_policy_bfq,
++		.seq_show = blkg_print_stat_bytes,
++	},
++	{
++		.name = BFQ_CGROUP_FNAME(io_serviced),
++		.private = (unsigned long)&blkcg_policy_bfq,
++		.seq_show = blkg_print_stat_ios,
++	},
++#ifdef CONFIG_DEBUG_BLK_CGROUP
++	{
++		.name = BFQ_CGROUP_FNAME(time),
++		.private = offsetof(struct bfq_group, stats.time),
++		.seq_show = bfqg_print_stat,
++	},
++	{
++		.name = BFQ_CGROUP_FNAME(sectors),
++		.seq_show = bfqg_print_stat_sectors,
++	},
++	{
++		.name = BFQ_CGROUP_FNAME(io_service_time),
++		.private = offsetof(struct bfq_group, stats.service_time),
++		.seq_show = bfqg_print_rwstat,
++	},
++	{
++		.name = BFQ_CGROUP_FNAME(io_wait_time),
++		.private = offsetof(struct bfq_group, stats.wait_time),
++		.seq_show = bfqg_print_rwstat,
++	},
++	{
++		.name = BFQ_CGROUP_FNAME(io_merged),
++		.private = offsetof(struct bfq_group, stats.merged),
++		.seq_show = bfqg_print_rwstat,
++	},
++	{
++		.name = BFQ_CGROUP_FNAME(io_queued),
++		.private = offsetof(struct bfq_group, stats.queued),
++		.seq_show = bfqg_print_rwstat,
++	},
++#endif /* CONFIG_DEBUG_BLK_CGROUP */
++
++	/* the same statistics, covering the bfqg and its descendants */
++	{
++		.name = BFQ_CGROUP_FNAME(io_service_bytes_recursive),
++		.private = (unsigned long)&blkcg_policy_bfq,
++		.seq_show = blkg_print_stat_bytes_recursive,
++	},
++	{
++		.name = BFQ_CGROUP_FNAME(io_serviced_recursive),
++		.private = (unsigned long)&blkcg_policy_bfq,
++		.seq_show = blkg_print_stat_ios_recursive,
++	},
++#ifdef CONFIG_DEBUG_BLK_CGROUP
++	{
++		.name = BFQ_CGROUP_FNAME(time_recursive),
++		.private = offsetof(struct bfq_group, stats.time),
++		.seq_show = bfqg_print_stat_recursive,
++	},
++	{
++		.name = BFQ_CGROUP_FNAME(sectors_recursive),
++		.seq_show = bfqg_print_stat_sectors_recursive,
++	},
++	{
++		.name = BFQ_CGROUP_FNAME(io_service_time_recursive),
++		.private = offsetof(struct bfq_group, stats.service_time),
++		.seq_show = bfqg_print_rwstat_recursive,
++	},
++	{
++		.name = BFQ_CGROUP_FNAME(io_wait_time_recursive),
++		.private = offsetof(struct bfq_group, stats.wait_time),
++		.seq_show = bfqg_print_rwstat_recursive,
++	},
++	{
++		.name = BFQ_CGROUP_FNAME(io_merged_recursive),
++		.private = offsetof(struct bfq_group, stats.merged),
++		.seq_show = bfqg_print_rwstat_recursive,
++	},
++	{
++		.name = BFQ_CGROUP_FNAME(io_queued_recursive),
++		.private = offsetof(struct bfq_group, stats.queued),
++		.seq_show = bfqg_print_rwstat_recursive,
++	},
++	{
++		.name = BFQ_CGROUP_FNAME(avg_queue_size),
++		.seq_show = bfqg_print_avg_queue_size,
++	},
++	{
++		.name = BFQ_CGROUP_FNAME(group_wait_time),
++		.private = offsetof(struct bfq_group, stats.group_wait_time),
++		.seq_show = bfqg_print_stat,
++	},
++	{
++		.name = BFQ_CGROUP_FNAME(idle_time),
++		.private = offsetof(struct bfq_group, stats.idle_time),
++ .seq_show = bfqg_print_stat, ++ }, ++ { ++ .name = BFQ_CGROUP_FNAME(empty_time), ++ .private = offsetof(struct bfq_group, stats.empty_time), ++ .seq_show = bfqg_print_stat, ++ }, ++ { ++ .name = BFQ_CGROUP_FNAME(dequeue), ++ .private = offsetof(struct bfq_group, stats.dequeue), ++ .seq_show = bfqg_print_stat, ++ }, ++#endif /* CONFIG_DEBUG_BLK_CGROUP */ ++ { } /* terminate */ ++}; ++ ++static struct cftype bfq_blkg_files[] = { ++ { ++ .name = BFQ_CGROUP_FNAME(weight), ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .seq_show = bfq_io_show_weight, ++ .write = bfq_io_set_weight, ++ }, ++ {} /* terminate */ ++}; ++ ++#undef BFQ_CGROUP_FNAME ++ ++#else /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct bfq_group *bfqg) {} ++ ++static void bfq_init_entity(struct bfq_entity *entity, ++ struct bfq_group *bfqg) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ entity->weight = entity->new_weight; ++ entity->orig_weight = entity->new_weight; ++ if (bfqq) { ++ bfqq->ioprio = bfqq->new_ioprio; ++ bfqq->ioprio_class = bfqq->new_ioprio_class; ++ } ++ entity->sched_data = &bfqg->sched_data; ++} ++ ++static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} ++ ++static void bfq_end_wr_async(struct bfq_data *bfqd) ++{ ++ bfq_end_wr_async_queues(bfqd, bfqd->root_group); ++} ++ ++static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, ++ struct blkcg *blkcg) ++{ ++ return bfqd->root_group; ++} ++ ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) ++{ ++ return bfqq->bfqd->root_group; ++} ++ ++static struct bfq_group * ++bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) ++{ ++ struct bfq_group *bfqg; ++ int i; ++ ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); ++ if (!bfqg) ++ return NULL; ++ ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; ++ ++ return bfqg; ++} ++#endif +diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c +new file mode 100644 +index 000000000000..fb7bb8f08b75 +--- /dev/null ++++ b/block/bfq-ioc.c +@@ -0,0 +1,36 @@ ++/* ++ * BFQ: I/O context handling. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2010 Paolo Valente ++ */ ++ ++/** ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. ++ * @icq: the iocontext queue. ++ */ ++static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) ++{ ++ /* bic->icq is the first member, %NULL will convert to %NULL */ ++ return container_of(icq, struct bfq_io_cq, icq); ++} ++ ++/** ++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. ++ * @bfqd: the lookup key. ++ * @ioc: the io_context of the process doing I/O. ++ * ++ * Queue lock must be held. ++ */ ++static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, ++ struct io_context *ioc) ++{ ++ if (ioc) ++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); ++ return NULL; ++} +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +new file mode 100644 +index 000000000000..47a49d9e6512 +--- /dev/null ++++ b/block/bfq-mq-iosched.c +@@ -0,0 +1,6548 @@ ++/* ++ * Budget Fair Queueing (BFQ) I/O scheduler. 
++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2015 Paolo Valente ++ * ++ * Copyright (C) 2017 Paolo Valente ++ * ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ ++ * file. ++ * ++ * BFQ is a proportional-share I/O scheduler, with some extra ++ * low-latency capabilities. BFQ also supports full hierarchical ++ * scheduling through cgroups. Next paragraphs provide an introduction ++ * on BFQ inner workings. Details on BFQ benefits and usage can be ++ * found in Documentation/block/bfq-iosched.txt. ++ * ++ * BFQ is a proportional-share storage-I/O scheduling algorithm based ++ * on the slice-by-slice service scheme of CFQ. But BFQ assigns ++ * budgets, measured in number of sectors, to processes instead of ++ * time slices. The device is not granted to the in-service process ++ * for a given time slice, but until it has exhausted its assigned ++ * budget. This change from the time to the service domain enables BFQ ++ * to distribute the device throughput among processes as desired, ++ * without any distortion due to throughput fluctuations, or to device ++ * internal queueing. BFQ uses an ad hoc internal scheduler, called ++ * B-WF2Q+, to schedule processes according to their budgets. More ++ * precisely, BFQ schedules queues associated with processes. Thanks to ++ * the accurate policy of B-WF2Q+, BFQ can afford to assign high ++ * budgets to I/O-bound processes issuing sequential requests (to ++ * boost the throughput), and yet guarantee a low latency to ++ * interactive and soft real-time applications. ++ * ++ * In particular, BFQ schedules I/O so as to achieve the latter goal-- ++ * low latency for interactive and soft real-time applications--if the ++ * low_latency parameter is set (default configuration). To this ++ * purpose, BFQ constantly tries to detect whether the I/O requests in ++ * a bfq_queue come from an interactive or a soft real-time ++ * application. For brevity, in these cases, the queue is said to be ++ * interactive or soft real-time. In both cases, BFQ privileges the ++ * service of the queue, over that of non-interactive and ++ * non-soft-real-time queues. This privileging is performed, mainly, ++ * by raising the weight of the queue. So, for brevity, we call just ++ * weight-raising periods the time periods during which a queue is ++ * privileged, because deemed interactive or soft real-time. ++ * ++ * The detection of soft real-time queues/applications is described in ++ * detail in the comments on the function ++ * bfq_bfqq_softrt_next_start. On the other hand, the detection of an ++ * interactive queue works as follows: a queue is deemed interactive ++ * if it is constantly non empty only for a limited time interval, ++ * after which it does become empty. The queue may be deemed ++ * interactive again (for a limited time), if it restarts being ++ * constantly non empty, provided that this happens only after the ++ * queue has remained empty for a given minimum idle time. ++ * ++ * By default, BFQ computes automatically the above maximum time ++ * interval, i.e., the time interval after which a constantly ++ * non-empty queue stops being deemed interactive. Since a queue is ++ * weight-raised while it is deemed interactive, this maximum time ++ * interval happens to coincide with the (maximum) duration of the ++ * weight-raising for interactive queues. 
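++ *
++ * To make the detection rule above concrete, here is a small, purely
++ * illustrative timeline (the two time values are invented for this
++ * example, they are not taken from the code): suppose the maximum
++ * interval is 10 s and the minimum idle time is 2 s. A queue that
++ * becomes constantly non-empty at t=0 and empties at t=4 s is deemed
++ * interactive over that whole period. If it instead remains
++ * constantly non-empty past t=10 s, it stops being deemed interactive
++ * at that point, and it can be deemed interactive again only after
++ * first staying empty for at least 2 s.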
++ *
++ * NOTE: if the main or only goal, with a given device, is to achieve
++ * the maximum-possible throughput at all times, then do switch off
++ * all low-latency heuristics for that device, by setting low_latency
++ * to 0.
++ *
++ * BFQ is described in [1], where also a reference to the initial,
++ * more theoretical paper on BFQ can be found. The interested reader
++ * can find in the latter paper full details on the main algorithm, as
++ * well as formulas of the guarantees and formal proofs of all the
++ * properties. With respect to the version of BFQ presented in these
++ * papers, this implementation adds a few more heuristics, such as the
++ * one that guarantees a low latency to soft real-time applications,
++ * and a hierarchical extension based on H-WF2Q+.
++ *
++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
++ * complexity derives from the one introduced with EEVDF in [3].
++ *
++ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
++ * Scheduler", Proceedings of the First Workshop on Mobile System
++ * Technologies (MST-2015), May 2015.
++ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
++ *
++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
++ *
++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
++ * Oct 1997.
++ *
++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
++ *
++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
++ * First: A Flexible and Accurate Mechanism for Proportional Share
++ * Resource Allocation,'' technical report.
++ *
++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
++ */
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/blkdev.h>
++#include <linux/cgroup.h>
++#include <linux/elevator.h>
++#include <linux/ktime.h>
++#include <linux/rbtree.h>
++#include <linux/ioprio.h>
++#include <linux/sbitmap.h>
++#include <linux/delay.h>
++
++#include "blk.h"
++#include "blk-mq.h"
++#include "blk-mq-tag.h"
++#include "blk-mq-sched.h"
++#include "bfq-mq.h"
++#include "blk-wbt.h"
++
++/* Expiration time of sync (0) and async (1) requests, in ns. */
++static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
++
++/* Maximum backwards seek, in KiB. */
++static const int bfq_back_max = (16 * 1024);
++
++/* Penalty of a backwards seek, in number of sectors. */
++static const int bfq_back_penalty = 2;
++
++/* Idling period duration, in ns. */
++static u32 bfq_slice_idle = (NSEC_PER_SEC / 125);
++
++/* Minimum number of assigned budgets for which stats are safe to compute. */
++static const int bfq_stats_min_budgets = 194;
++
++/* Default maximum budget values, in sectors and number of requests. */
++static const int bfq_default_max_budget = (16 * 1024);
++
++/*
++ * When a sync request is dispatched, the queue that contains that
++ * request, and all the ancestor entities of that queue, are charged
++ * with the number of sectors of the request. In contrast, if the
++ * request is async, then the queue and its ancestor entities are
++ * charged with the number of sectors of the request, multiplied by
++ * the factor below. This throttles the bandwidth for async I/O,
++ * w.r.t. sync I/O, and it is done to counter the tendency of async
++ * writes to steal I/O throughput from reads.
++ *
++ * The current value of this parameter is the result of a tuning with
++ * several hardware and software configurations.
We tried to find the
++ * lowest value for which writes do not cause noticeable problems to
++ * reads. In fact, the lower this parameter is, the more stable the
++ * I/O control is, in the following respect. The lower this parameter
++ * is, the less the bandwidth enjoyed by a group decreases
++ * - when the group does writes, w.r.t. when it does reads;
++ * - when other groups do reads, w.r.t. when they do writes.
++ */
++static const int bfq_async_charge_factor = 3;
++
++/* Default timeout values, in jiffies, approximating CFQ defaults. */
++static const int bfq_timeout = (HZ / 8);
++
++/*
++ * Time limit for merging (see comments in bfq_setup_cooperator). Set
++ * to the slowest value that, in our tests, proved to be effective in
++ * removing false positives, while not causing true positives to miss
++ * queue merging.
++ *
++ * As can be deduced from the low time limit below, queue merging, if
++ * successful, happens at the very beginning of the I/O of the involved
++ * cooperating processes, as a consequence of the arrival of the very
++ * first requests from each cooperator. After that, there is very
++ * little chance to find cooperators.
++ */
++static const unsigned long bfq_merge_time_limit = HZ/10;
++
++#define MAX_LENGTH_REASON_NAME 25
++
++static const char reason_name[][MAX_LENGTH_REASON_NAME] = {"TOO_IDLE",
++"BUDGET_TIMEOUT", "BUDGET_EXHAUSTED", "NO_MORE_REQUESTS",
++"PREEMPTED"};
++
++static struct kmem_cache *bfq_pool;
++
++/* Below this threshold (in ns), we consider thinktime immediate. */
++#define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)
++
++/* hw_tag detection: parallel requests threshold and min samples needed. */
++#define BFQ_HW_QUEUE_THRESHOLD	3
++#define BFQ_HW_QUEUE_SAMPLES	32
++
++#define BFQQ_SEEK_THR		(sector_t)(8 * 100)
++#define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
++#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \
++	(get_sdist(last_pos, rq) >		\
++	 BFQQ_SEEK_THR &&			\
++	 (!blk_queue_nonrot(bfqd->queue) ||	\
++	  blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT))
++#define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
++#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 19)
++
++/* Min number of samples required to perform peak-rate update */
++#define BFQ_RATE_MIN_SAMPLES	32
++/* Min observation time interval required to perform a peak-rate update (ns) */
++#define BFQ_RATE_MIN_INTERVAL	(300*NSEC_PER_MSEC)
++/* Target observation time interval for a peak-rate update (ns) */
++#define BFQ_RATE_REF_INTERVAL	NSEC_PER_SEC
++
++/*
++ * Shift used for peak-rate fixed precision calculations.
++ * With
++ * - the current shift: 16 positions
++ * - the current type used to store rate: u32
++ * - the current unit of measure for rate: [sectors/usec], or, more precisely,
++ *   [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift,
++ * the range of rates that can be stored is
++ * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec =
++ * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec =
++ * [15, 65G] sectors/sec
++ * Which, assuming a sector size of 512B, corresponds to a range of
++ * [7.5K, 33T] B/sec
++ */
++#define BFQ_RATE_SHIFT		16
++
++/*
++ * When configured for computing the duration of the weight-raising
++ * for interactive queues automatically (see the comments at the
++ * beginning of this file), BFQ does it using the following formula:
++ * duration = (ref_rate / r) * ref_wr_duration,
++ * where r is the peak rate of the device, and ref_rate and
++ * ref_wr_duration are two reference parameters; a worked instance
++ * of the formula is given a few paragraphs below.
In particular,
++ * ref_rate is the peak rate of the reference storage device (see
++ * below), and ref_wr_duration is about the maximum time needed, with
++ * BFQ and while reading two files in parallel, to load typical large
++ * applications on the reference device (see the comments on
++ * max_service_from_wr below, for more details on how ref_wr_duration
++ * is obtained). In practice, the slower/faster the device at hand
++ * is, the more/less it takes to load applications with respect to the
++ * reference device. Accordingly, the longer/shorter BFQ grants
++ * weight raising to interactive applications.
++ *
++ * BFQ uses two different reference pairs (ref_rate, ref_wr_duration),
++ * depending on whether the device is rotational or non-rotational.
++ *
++ * In the following definitions, ref_rate[0] and ref_wr_duration[0]
++ * are the reference values for a rotational device, whereas
++ * ref_rate[1] and ref_wr_duration[1] are the reference values for a
++ * non-rotational device. The reference rates are not the actual peak
++ * rates of the devices used as a reference, but slightly lower
++ * values. The reason for using slightly lower values is that the
++ * peak-rate estimator tends to yield slightly lower values than the
++ * actual peak rate (it can yield the actual peak rate only if there
++ * is only one process doing I/O, and the process does sequential
++ * I/O).
++ *
++ * The reference peak rates are measured in sectors/usec, left-shifted
++ * by BFQ_RATE_SHIFT.
++ */
++static int ref_rate[2] = {14000, 33000};
++/*
++ * To improve readability, a conversion function is used to initialize
++ * the following array, which entails that the array can be
++ * initialized only in a function.
++ */
++static int ref_wr_duration[2];
++
++/*
++ * BFQ uses the above-detailed, time-based weight-raising mechanism to
++ * privilege interactive tasks. This mechanism is vulnerable to the
++ * following false positives: I/O-bound applications that will go on
++ * doing I/O for much longer than the duration of weight
++ * raising. These applications have basically no benefit from being
++ * weight-raised at the beginning of their I/O. On the opposite end,
++ * while being weight-raised, these applications
++ * a) unjustly steal throughput from applications that may actually need
++ * low latency;
++ * b) make BFQ uselessly perform device idling; device idling results
++ * in loss of device throughput with most flash-based storage, and may
++ * increase latencies when used purposelessly.
++ *
++ * BFQ tries to reduce these problems, by adopting the following
++ * countermeasure. To introduce this countermeasure, we need first to
++ * finish explaining how the duration of weight-raising for
++ * interactive tasks is computed.
++ *
++ * For a bfq_queue deemed as interactive, the duration of weight
++ * raising is dynamically adjusted, as a function of the estimated
++ * peak rate of the device, so as to be equal to the time needed to
++ * execute the 'largest' interactive task we benchmarked so far. By
++ * largest task, we mean the task for which each involved process has
++ * to do more I/O than for any of the other tasks we benchmarked. This
++ * reference interactive task is the start-up of LibreOffice Writer,
++ * and in this task each process/bfq_queue needs to have at most ~110K
++ * sectors transferred.
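++ *
++ * As a quick, purely illustrative instance of the duration formula
++ * above (the measured rate is a made-up value, chosen only to show
++ * the scaling): for a rotational device whose estimated peak rate r
++ * is half the reference rate, i.e., r = ref_rate[0] / 2 = 7000 in
++ * the same left-shifted sectors/usec units, the formula yields
++ * duration = (14000 / 7000) * ref_wr_duration[0] =
++ * 2 * ref_wr_duration[0]: the slower device grants interactive
++ * weight-raising for twice as long as the reference device does.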
++ *
++ * This last piece of information enables BFQ to reduce the actual
++ * duration of weight-raising for at least one class of I/O-bound
++ * applications: those doing sequential or quasi-sequential I/O. An
++ * example is file copy. In fact, once started, the main I/O-bound
++ * processes of these applications usually consume the above 110K
++ * sectors in much less time than the processes of an application that
++ * is starting, because these I/O-bound processes will greedily devote
++ * almost all their CPU cycles only to their target,
++ * throughput-friendly I/O operations. This is even more true if BFQ
++ * happens to be underestimating the device peak rate, and thus
++ * overestimating the duration of weight raising. But, according to
++ * our measurements, once 110K sectors have been transferred, these
++ * processes no longer deserve to be weight-raised.
++ *
++ * Based on this last consideration, BFQ ends weight-raising for a
++ * bfq_queue if the latter happens to have received an amount of
++ * service at least equal to the following constant. The constant is
++ * set to slightly more than 110K, to have a minimum safety margin.
++ *
++ * This early ending of weight-raising reduces the amount of time
++ * during which interactive false positives cause the two problems
++ * described at the beginning of these comments.
++ */
++static const unsigned long max_service_from_wr = 120000;
++
++#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)	\
++				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
++
++#define RQ_BIC(rq)		icq_to_bic((rq)->elv.priv[0])
++#define RQ_BFQQ(rq)		((rq)->elv.priv[1])
++
++/**
++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
++ * @icq: the iocontext queue.
++ */
++static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
++{
++	/* bic->icq is the first member, %NULL will convert to %NULL */
++	return container_of(icq, struct bfq_io_cq, icq);
++}
++
++/**
++ * bfq_bic_lookup - search @ioc for a bic associated with @bfqd.
++ * @bfqd: the lookup key.
++ * @ioc: the io_context of the process doing I/O.
++ * @q: the request queue.
++ */
++static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
++					struct io_context *ioc,
++					struct request_queue *q)
++{
++	if (ioc) {
++		unsigned long flags;
++		struct bfq_io_cq *icq;
++
++		spin_lock_irqsave(q->queue_lock, flags);
++		icq = icq_to_bic(ioc_lookup_icq(ioc, q));
++		spin_unlock_irqrestore(q->queue_lock, flags);
++
++		return icq;
++	}
++
++	return NULL;
++}
++
++/*
++ * Scheduler run of queue, if there are requests pending and no one in the
++ * driver that will restart queueing.
++ */
++static void bfq_schedule_dispatch(struct bfq_data *bfqd)
++{
++	if (bfqd->queued != 0) {
++		bfq_log(bfqd, "");
++		blk_mq_run_hw_queues(bfqd->queue, true);
++	}
++}
++
++#define BFQ_MQ
++#include "bfq-sched.c"
++#include "bfq-cgroup-included.c"
++
++#define bfq_class_idle(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
++#define bfq_class_rt(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
++
++#define bfq_sample_valid(samples)	((samples) > 80)
++
++/*
++ * Lifted from AS - choose which of rq1 and rq2 is better served now.
++ * We choose the request that is closest to the head right now. Distance
++ * behind the head is penalized and only allowed to a certain extent.
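++ * (Editorial note: concretely, as the code below shows, a request N
++ * sectors behind the head competes as if it were
++ * N * bfqd->bfq_back_penalty sectors ahead of it, while requests more
++ * than bfqd->bfq_back_max KiB behind count as wrapped and lose to any
++ * non-wrapped request.)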
++ */ ++static struct request *bfq_choose_req(struct bfq_data *bfqd, ++ struct request *rq1, ++ struct request *rq2, ++ sector_t last) ++{ ++ sector_t s1, s2, d1 = 0, d2 = 0; ++ unsigned long back_max; ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ ++ unsigned int wrap = 0; /* bit mask: requests behind the disk head? */ ++ ++ if (!rq1 || rq1 == rq2) ++ return rq2; ++ if (!rq2) ++ return rq1; ++ ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) ++ return rq1; ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) ++ return rq2; ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) ++ return rq1; ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) ++ return rq2; ++ ++ s1 = blk_rq_pos(rq1); ++ s2 = blk_rq_pos(rq2); ++ ++ /* ++ * By definition, 1KiB is 2 sectors. ++ */ ++ back_max = bfqd->bfq_back_max * 2; ++ ++ /* ++ * Strict one way elevator _except_ in the case where we allow ++ * short backward seeks which are biased as twice the cost of a ++ * similar forward seek. ++ */ ++ if (s1 >= last) ++ d1 = s1 - last; ++ else if (s1 + back_max >= last) ++ d1 = (last - s1) * bfqd->bfq_back_penalty; ++ else ++ wrap |= BFQ_RQ1_WRAP; ++ ++ if (s2 >= last) ++ d2 = s2 - last; ++ else if (s2 + back_max >= last) ++ d2 = (last - s2) * bfqd->bfq_back_penalty; ++ else ++ wrap |= BFQ_RQ2_WRAP; ++ ++ /* Found required data */ ++ ++ /* ++ * By doing switch() on the bit mask "wrap" we avoid having to ++ * check two variables for all permutations: --> faster! ++ */ ++ switch (wrap) { ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ ++ if (d1 < d2) ++ return rq1; ++ else if (d2 < d1) ++ return rq2; ++ ++ if (s1 >= s2) ++ return rq1; ++ else ++ return rq2; ++ ++ case BFQ_RQ2_WRAP: ++ return rq1; ++ case BFQ_RQ1_WRAP: ++ return rq2; ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ ++ default: ++ /* ++ * Since both rqs are wrapped, ++ * start with the one that's further behind head ++ * (--> only *one* back seek required), ++ * since back seek takes more time than forward. ++ */ ++ if (s1 <= s2) ++ return rq1; ++ else ++ return rq2; ++ } ++} ++ ++/* ++ * Async I/O can easily starve sync I/O (both sync reads and sync ++ * writes), by consuming all tags. Similarly, storms of sync writes, ++ * such as those that sync(2) may trigger, can starve sync reads. ++ * Limit depths of async I/O and sync writes so as to counter both ++ * problems. ++ */ ++static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) ++{ ++ struct bfq_data *bfqd = data->q->elevator->elevator_data; ++ ++ if (op_is_sync(op) && !op_is_write(op)) ++ return; ++ ++ data->shallow_depth = ++ bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; ++ ++ bfq_log(bfqd, "wr_busy %d sync %d depth %u", ++ bfqd->wr_busy_queues, op_is_sync(op), ++ data->shallow_depth); ++} ++ ++static struct bfq_queue * ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, ++ sector_t sector, struct rb_node **ret_parent, ++ struct rb_node ***rb_link) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *bfqq = NULL; ++ ++ parent = NULL; ++ p = &root->rb_node; ++ while (*p) { ++ struct rb_node **n; ++ ++ parent = *p; ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ ++ /* ++ * Sort strictly based on sector. Smallest to the left, ++ * largest to the right. 
++ */ ++ if (sector > blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_right; ++ else if (sector < blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_left; ++ else ++ break; ++ p = n; ++ bfqq = NULL; ++ } ++ ++ *ret_parent = parent; ++ if (rb_link) ++ *rb_link = p; ++ ++ bfq_log(bfqd, "%llu: returning %d", ++ (unsigned long long) sector, ++ bfqq ? bfqq->pid : 0); ++ ++ return bfqq; ++} ++ ++static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) ++{ ++ return bfqq->service_from_backlogged > 0 && ++ time_is_before_jiffies(bfqq->first_IO_time + ++ bfq_merge_time_limit); ++} ++ ++static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *__bfqq; ++ ++ if (bfqq->pos_root) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ ++ /* ++ * bfqq cannot be merged any longer (see comments in ++ * bfq_setup_cooperator): no point in adding bfqq into the ++ * position tree. ++ */ ++ if (bfq_too_late_for_merging(bfqq)) ++ return; ++ ++ if (bfq_class_idle(bfqq)) ++ return; ++ if (!bfqq->next_rq) ++ return; ++ ++ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, ++ blk_rq_pos(bfqq->next_rq), &parent, &p); ++ if (!__bfqq) { ++ rb_link_node(&bfqq->pos_node, parent, p); ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); ++ } else ++ bfqq->pos_root = NULL; ++} ++ ++/* ++ * The following function returns true if every queue must receive the ++ * same share of the throughput (this condition is used when deciding ++ * whether idling may be disabled, see the comments in the function ++ * bfq_better_to_idle()). ++ * ++ * Such a scenario occurs when: ++ * 1) all active queues have the same weight, ++ * 2) all active queues belong to the same I/O-priority class, ++ * 3) all active groups at the same level in the groups tree have the same ++ * weight, ++ * 4) all active groups at the same level in the groups tree have the same ++ * number of children. ++ * ++ * Unfortunately, keeping the necessary state for evaluating exactly ++ * the last two symmetry sub-conditions above would be quite complex ++ * and time consuming. Therefore this function evaluates, instead, ++ * only the following stronger three sub-conditions, for which it is ++ * much easier to maintain the needed state: ++ * 1) all active queues have the same weight, ++ * 2) all active queues belong to the same I/O-priority class, ++ * 3) there are no active groups. ++ * In particular, the last condition is always true if hierarchical ++ * support or the cgroups interface are not enabled, thus no state ++ * needs to be maintained in this case. ++ */ ++static bool bfq_symmetric_scenario(struct bfq_data *bfqd) ++{ ++ /* ++ * For queue weights to differ, queue_weights_tree must contain ++ * at least two nodes. 
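++ * (Editorial note: a non-empty rb-tree whose root has neither a left
++ * nor a right child contains exactly one node, i.e., one distinct
++ * weight; the check below tests precisely this.)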
++	 */
++	bool varied_queue_weights = !RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
++		(bfqd->queue_weights_tree.rb_node->rb_left ||
++		 bfqd->queue_weights_tree.rb_node->rb_right);
++
++	bool multiple_classes_busy =
++		(bfqd->busy_queues[0] && bfqd->busy_queues[1]) ||
++		(bfqd->busy_queues[0] && bfqd->busy_queues[2]) ||
++		(bfqd->busy_queues[1] && bfqd->busy_queues[2]);
++
++	bfq_log(bfqd, "varied_queue_weights %d mul_classes %d",
++		varied_queue_weights, multiple_classes_busy);
++
++#ifdef BFQ_GROUP_IOSCHED_ENABLED
++	bfq_log(bfqd, "num_groups_with_pending_reqs %u",
++		bfqd->num_groups_with_pending_reqs);
++#endif
++
++	return !(varied_queue_weights || multiple_classes_busy
++#ifdef BFQ_GROUP_IOSCHED_ENABLED
++	       || bfqd->num_groups_with_pending_reqs > 0
++#endif
++		);
++}
++
++/*
++ * If the weight-counter tree passed as input contains no counter for
++ * the weight of the input queue, then add that counter; otherwise just
++ * increment the existing counter.
++ *
++ * Note that weight-counter trees contain few nodes in mostly symmetric
++ * scenarios. For example, if all queues have the same weight, then the
++ * weight-counter tree for the queues may contain at most one node.
++ * This holds even if low_latency is on, because weight-raised queues
++ * are not inserted in the tree.
++ * In most scenarios, the rate at which nodes are created/destroyed
++ * should be low too.
++ */
++static void bfq_weights_tree_add(struct bfq_data *bfqd,
++				 struct bfq_queue *bfqq,
++				 struct rb_root *root)
++{
++	struct bfq_entity *entity = &bfqq->entity;
++	struct rb_node **new = &(root->rb_node), *parent = NULL;
++
++	/*
++	 * Do not insert if the queue is already associated with a
++	 * counter, which happens if:
++	 * 1) a request arrival has caused the queue to become both
++	 *    non-weight-raised, and hence change its weight, and
++	 *    backlogged; in this respect, each of the two events
++	 *    causes an invocation of this function,
++	 * 2) this is the invocation of this function caused by the
++	 *    second event. This second invocation is actually useless,
++	 *    and we handle this fact by exiting immediately. More
++	 *    efficient or clearer solutions might possibly be adopted.
++	 */
++	if (bfqq->weight_counter)
++		return;
++
++	while (*new) {
++		struct bfq_weight_counter *__counter = container_of(*new,
++						struct bfq_weight_counter,
++						weights_node);
++		parent = *new;
++
++		if (entity->weight == __counter->weight) {
++			bfqq->weight_counter = __counter;
++			goto inc_counter;
++		}
++		if (entity->weight < __counter->weight)
++			new = &((*new)->rb_left);
++		else
++			new = &((*new)->rb_right);
++	}
++
++	bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
++				       GFP_ATOMIC);
++
++	/*
++	 * In the unlucky event of an allocation failure, we just
++	 * exit. This will cause the weight of the queue not to be
++	 * considered in bfq_symmetric_scenario, which, in turn,
++	 * causes the scenario to be wrongly deemed symmetric in case
++	 * bfqq's weight was the only weight making the scenario
++	 * asymmetric. On the bright side, however, no imbalance will
++	 * occur when bfqq becomes inactive again (the invocation of
++	 * this function is triggered by an activation of the
++	 * queue). In fact, bfq_weights_tree_remove does nothing
++	 * if !bfqq->weight_counter.
++ */ ++ if (unlikely(!bfqq->weight_counter)) ++ return; ++ ++ bfqq->weight_counter->weight = entity->weight; ++ rb_link_node(&bfqq->weight_counter->weights_node, parent, new); ++ rb_insert_color(&bfqq->weight_counter->weights_node, root); ++ ++inc_counter: ++ bfqq->weight_counter->num_active++; ++ bfqq->ref++; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "refs %d weight %d symmetric %d", ++ bfqq->ref, ++ entity->weight, ++ bfq_symmetric_scenario(bfqd)); ++} ++ ++/* ++ * Decrement the weight counter associated with the queue, and, if the ++ * counter reaches 0, remove the counter from the tree. ++ * See the comments to the function bfq_weights_tree_add() for considerations ++ * about overhead. ++ */ ++static void __bfq_weights_tree_remove(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct rb_root *root) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ if (!bfqq->weight_counter) ++ return; ++ ++ BUG_ON(RB_EMPTY_ROOT(root)); ++ BUG_ON(bfqq->weight_counter->weight != entity->weight); ++ ++ BUG_ON(!bfqq->weight_counter->num_active); ++ bfqq->weight_counter->num_active--; ++ ++ if (bfqq->weight_counter->num_active > 0) ++ goto reset_entity_pointer; ++ ++ rb_erase(&bfqq->weight_counter->weights_node, root); ++ kfree(bfqq->weight_counter); ++ ++reset_entity_pointer: ++ bfqq->weight_counter = NULL; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "refs %d weight %d symmetric %d", ++ bfqq->ref, ++ entity->weight, ++ bfq_symmetric_scenario(bfqd)); ++ bfq_put_queue(bfqq); ++} ++ ++/* ++ * Invoke __bfq_weights_tree_remove on bfqq and decrement the number ++ * of active groups for each queue's inactive parent entity. ++ */ ++static void bfq_weights_tree_remove(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = bfqq->entity.parent; ++ ++ for_each_entity(entity) { ++ struct bfq_sched_data *sd = entity->my_sched_data; ++ ++ BUG_ON(entity->sched_data == NULL); /* ++ * It would mean ++ * that this is ++ * the root group. ++ */ ++ ++ if (sd->next_in_service || sd->in_service_entity) { ++ BUG_ON(!entity->in_groups_with_pending_reqs); ++ /* ++ * entity is still active, because either ++ * next_in_service or in_service_entity is not ++ * NULL (see the comments on the definition of ++ * next_in_service for details on why ++ * in_service_entity must be checked too). ++ * ++ * As a consequence, its parent entities are ++ * active as well, and thus this loop must ++ * stop here. ++ */ ++ break; ++ } ++ ++ BUG_ON(!bfqd->num_groups_with_pending_reqs && ++ entity->in_groups_with_pending_reqs); ++ /* ++ * The decrement of num_groups_with_pending_reqs is ++ * not performed immediately upon the deactivation of ++ * entity, but it is delayed to when it also happens ++ * that the first leaf descendant bfqq of entity gets ++ * all its pending requests completed. The following ++ * instructions perform this delayed decrement, if ++ * needed. See the comments on ++ * num_groups_with_pending_reqs for details. ++ */ ++ if (entity->in_groups_with_pending_reqs) { ++ entity->in_groups_with_pending_reqs = false; ++ bfqd->num_groups_with_pending_reqs--; ++ } ++ bfq_log_bfqq(bfqd, bfqq, "num_groups_with_pending_reqs %u", ++ bfqd->num_groups_with_pending_reqs); ++ } ++ ++ /* ++ * Next function is invoked last, because it causes bfqq to be ++ * freed if the following holds: bfqq is not in service and ++ * has no dispatched request. DO NOT use bfqq after the next ++ * function invocation. 
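++ * (Editorial note: the weight counter holds a reference on bfqq,
++ * taken with bfqq->ref++ in bfq_weights_tree_add() and dropped with
++ * bfq_put_queue() in __bfq_weights_tree_remove(); dropping that
++ * reference is what may free bfqq here.)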
++ */ ++ __bfq_weights_tree_remove(bfqd, bfqq, ++ &bfqd->queue_weights_tree); ++} ++ ++/* ++ * Return expired entry, or NULL to just start from scratch in rbtree. ++ */ ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq, ++ struct request *last) ++{ ++ struct request *rq; ++ ++ if (bfq_bfqq_fifo_expire(bfqq)) ++ return NULL; ++ ++ bfq_mark_bfqq_fifo_expire(bfqq); ++ ++ rq = rq_entry_fifo(bfqq->fifo.next); ++ ++ if (rq == last || ktime_get_ns() < rq->fifo_time) ++ return NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "returned %p", rq); ++ BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); ++ return rq; ++} ++ ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct request *last) ++{ ++ struct rb_node *rbnext = rb_next(&last->rb_node); ++ struct rb_node *rbprev = rb_prev(&last->rb_node); ++ struct request *next, *prev = NULL; ++ ++ BUG_ON(list_empty(&bfqq->fifo)); ++ ++ /* Follow expired path, else get first next available. */ ++ next = bfq_check_fifo(bfqq, last); ++ if (next) { ++ BUG_ON(next == last); ++ return next; ++ } ++ ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); ++ ++ if (rbprev) ++ prev = rb_entry_rq(rbprev); ++ ++ if (rbnext) ++ next = rb_entry_rq(rbnext); ++ else { ++ rbnext = rb_first(&bfqq->sort_list); ++ if (rbnext && rbnext != &last->rb_node) ++ next = rb_entry_rq(rbnext); ++ } ++ ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); ++} ++ ++/* see the definition of bfq_async_charge_factor for details */ ++static unsigned long bfq_serv_to_charge(struct request *rq, ++ struct bfq_queue *bfqq) ++{ ++ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 || ++ !bfq_symmetric_scenario(bfqq->bfqd)) ++ return blk_rq_sectors(rq); ++ ++ return blk_rq_sectors(rq) * bfq_async_charge_factor; ++} ++ ++/** ++ * bfq_updated_next_req - update the queue after a new next_rq selection. ++ * @bfqd: the device data the queue belongs to. ++ * @bfqq: the queue to update. ++ * ++ * If the first request of a queue changes we make sure that the queue ++ * has enough budget to serve at least its first request (if the ++ * request has grown). We do this because if the queue has not enough ++ * budget for its first request, it has to go through two dispatch ++ * rounds to actually get it dispatched. ++ */ ++static void bfq_updated_next_req(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ struct request *next_rq = bfqq->next_rq; ++ unsigned long new_budget; ++ ++ if (!next_rq) ++ return; ++ ++ if (bfqq == bfqd->in_service_queue) ++ /* ++ * In order not to break guarantees, budgets cannot be ++ * changed after an entity has been selected. ++ */ ++ return; ++ ++ BUG_ON(entity->tree != &st->active); ++ BUG_ON(entity == entity->sched_data->in_service_entity); ++ ++ new_budget = max_t(unsigned long, ++ max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)), ++ entity->service); ++ if (entity->budget != new_budget) { ++ entity->budget = new_budget; ++ bfq_log_bfqq(bfqd, bfqq, "new budget %lu", ++ new_budget); ++ bfq_requeue_bfqq(bfqd, bfqq, false); ++ } ++} ++ ++static unsigned int bfq_wr_duration(struct bfq_data *bfqd) ++{ ++ u64 dur; ++ ++ if (bfqd->bfq_wr_max_time > 0) ++ return bfqd->bfq_wr_max_time; ++ ++ dur = bfqd->rate_dur_prod; ++ do_div(dur, bfqd->peak_rate); ++ ++ /* ++ * Limit duration between 3 and 25 seconds. 
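++ * (Editorial example, assuming, as the field names suggest, that
++ * rate_dur_prod caches the product ref_rate * ref_wr_duration: a
++ * device whose measured peak_rate equals the reference rate is then
++ * granted exactly the reference duration, and a device measured at
++ * half that rate gets twice the reference duration, before this
++ * clamp is applied.)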
The upper limit
++	 * has been conservatively set after the following worst case:
++	 * on a QEMU/KVM virtual machine
++	 * - running in a slow PC
++	 * - with a virtual disk stacked on a slow low-end 5400rpm HDD
++	 * - serving a heavy I/O workload, such as the sequential reading
++	 *   of several files
++	 * mplayer took 23 seconds to start, if constantly weight-raised.
++	 *
++	 * As for values higher than the one accommodating the above bad
++	 * scenario, tests show that they would often yield the opposite
++	 * of the desired result, i.e., worsen responsiveness by allowing
++	 * non-interactive applications to preserve weight raising for
++	 * too long.
++	 *
++	 * On the other hand, values lower than 3 seconds make it
++	 * difficult for most interactive tasks to complete their jobs
++	 * before weight-raising finishes.
++	 */
++	return clamp_val(dur, msecs_to_jiffies(3000), msecs_to_jiffies(25000));
++}
++
++/* switch back from soft real-time to interactive weight raising */
++static void switch_back_to_interactive_wr(struct bfq_queue *bfqq,
++					  struct bfq_data *bfqd)
++{
++	bfqq->wr_coeff = bfqd->bfq_wr_coeff;
++	bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
++	bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt;
++}
++
++static void
++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
++		      struct bfq_io_cq *bic, bool bfq_already_existing)
++{
++	unsigned int old_wr_coeff;
++	bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq);
++
++	if (bic->saved_has_short_ttime)
++		bfq_mark_bfqq_has_short_ttime(bfqq);
++	else
++		bfq_clear_bfqq_has_short_ttime(bfqq);
++
++	if (bic->saved_IO_bound)
++		bfq_mark_bfqq_IO_bound(bfqq);
++	else
++		bfq_clear_bfqq_IO_bound(bfqq);
++
++	if (unlikely(busy))
++		old_wr_coeff = bfqq->wr_coeff;
++
++	bfqq->ttime = bic->saved_ttime;
++	bfqq->wr_coeff = bic->saved_wr_coeff;
++	bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
++	BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt));
++	bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
++	bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
++	BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
++
++	bfq_log_bfqq(bfqq->bfqd, bfqq,
++		     "bic %p wr_coeff %d start_finish %lu max_time %lu",
++		     bic, bfqq->wr_coeff, bfqq->last_wr_start_finish,
++		     bfqq->wr_cur_max_time);
++
++	if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
++	    time_is_before_jiffies(bfqq->last_wr_start_finish +
++				   bfqq->wr_cur_max_time))) {
++		if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
++		    !bfq_bfqq_in_large_burst(bfqq) &&
++		    time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt +
++					     bfq_wr_duration(bfqd))) {
++			switch_back_to_interactive_wr(bfqq, bfqd);
++			bfq_log_bfqq(bfqq->bfqd, bfqq,
++				     "switching back to interactive");
++		} else {
++			bfqq->wr_coeff = 1;
++			bfq_log_bfqq(bfqq->bfqd, bfqq,
++				     "switching off wr (%lu + %lu < %lu)",
++				     bfqq->last_wr_start_finish, bfqq->wr_cur_max_time,
++				     jiffies);
++		}
++	}
++
++	/* make sure weight will be updated, however we got here */
++	bfqq->entity.prio_changed = 1;
++
++	if (likely(!busy))
++		return;
++
++	if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) {
++		bfqd->wr_busy_queues++;
++		BUG_ON(bfqd->wr_busy_queues > bfq_tot_busy_queues(bfqd));
++	} else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) {
++		bfqd->wr_busy_queues--;
++		BUG_ON(bfqd->wr_busy_queues < 0);
++	}
++}
++
++static int bfqq_process_refs(struct bfq_queue *bfqq)
++{
++	int process_refs, io_refs;
++
++	lockdep_assert_held(&bfqq->bfqd->lock);
++
++	io_refs =
bfqq->allocated; ++ process_refs = bfqq->ref - io_refs - bfqq->entity.on_st - ++ (bfqq->weight_counter != NULL); ++ BUG_ON(process_refs < 0); ++ return process_refs; ++} ++ ++/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ ++static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_queue *item; ++ struct hlist_node *n; ++ ++ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) ++ hlist_del_init(&item->burst_list_node); ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); ++ bfqd->burst_size = 1; ++ bfqd->burst_parent_entity = bfqq->entity.parent; ++} ++ ++/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ ++static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ /* Increment burst size to take into account also bfqq */ ++ bfqd->burst_size++; ++ ++ bfq_log_bfqq(bfqd, bfqq, "%d", bfqd->burst_size); ++ ++ BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); ++ ++ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { ++ struct bfq_queue *pos, *bfqq_item; ++ struct hlist_node *n; ++ ++ /* ++ * Enough queues have been activated shortly after each ++ * other to consider this burst as large. ++ */ ++ bfqd->large_burst = true; ++ bfq_log_bfqq(bfqd, bfqq, "large burst started"); ++ ++ /* ++ * We can now mark all queues in the burst list as ++ * belonging to a large burst. ++ */ ++ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, ++ burst_list_node) { ++ bfq_mark_bfqq_in_large_burst(bfqq_item); ++ bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); ++ } ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); ++ ++ /* ++ * From now on, and until the current burst finishes, any ++ * new queue being activated shortly after the last queue ++ * was inserted in the burst can be immediately marked as ++ * belonging to a large burst. So the burst list is not ++ * needed any more. Remove it. ++ */ ++ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, ++ burst_list_node) ++ hlist_del_init(&pos->burst_list_node); ++ } else /* ++ * Burst not yet large: add bfqq to the burst list. Do ++ * not increment the ref counter for bfqq, because bfqq ++ * is removed from the burst list before freeing bfqq ++ * in put_queue. ++ */ ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); ++} ++ ++/* ++ * If many queues belonging to the same group happen to be created ++ * shortly after each other, then the processes associated with these ++ * queues have typically a common goal. In particular, bursts of queue ++ * creations are usually caused by services or applications that spawn ++ * many parallel threads/processes. Examples are systemd during boot, ++ * or git grep. To help these processes get their job done as soon as ++ * possible, it is usually better to not grant either weight-raising ++ * or device idling to their queues. ++ * ++ * In this comment we describe, firstly, the reasons why this fact ++ * holds, and, secondly, the next function, which implements the main ++ * steps needed to properly mark these queues so that they can then be ++ * treated in a different way. ++ * ++ * The above services or applications benefit mostly from a high ++ * throughput: the quicker the requests of the activated queues are ++ * cumulatively served, the sooner the target job of these queues gets ++ * completed. 
As a consequence, weight-raising any of these queues, ++ * which also implies idling the device for it, is almost always ++ * counterproductive. In most cases it just lowers throughput. ++ * ++ * On the other hand, a burst of queue creations may be caused also by ++ * the start of an application that does not consist of a lot of ++ * parallel I/O-bound threads. In fact, with a complex application, ++ * several short processes may need to be executed to start-up the ++ * application. In this respect, to start an application as quickly as ++ * possible, the best thing to do is in any case to privilege the I/O ++ * related to the application with respect to all other ++ * I/O. Therefore, the best strategy to start as quickly as possible ++ * an application that causes a burst of queue creations is to ++ * weight-raise all the queues created during the burst. This is the ++ * exact opposite of the best strategy for the other type of bursts. ++ * ++ * In the end, to take the best action for each of the two cases, the ++ * two types of bursts need to be distinguished. Fortunately, this ++ * seems relatively easy, by looking at the sizes of the bursts. In ++ * particular, we found a threshold such that only bursts with a ++ * larger size than that threshold are apparently caused by ++ * services or commands such as systemd or git grep. For brevity, ++ * hereafter we call just 'large' these bursts. BFQ *does not* ++ * weight-raise queues whose creation occurs in a large burst. In ++ * addition, for each of these queues BFQ performs or does not perform ++ * idling depending on which choice boosts the throughput more. The ++ * exact choice depends on the device and request pattern at ++ * hand. ++ * ++ * Unfortunately, false positives may occur while an interactive task ++ * is starting (e.g., an application is being started). The ++ * consequence is that the queues associated with the task do not ++ * enjoy weight raising as expected. Fortunately these false positives ++ * are very rare. They typically occur if some service happens to ++ * start doing I/O exactly when the interactive task starts. ++ * ++ * Turning back to the next function, it implements all the steps ++ * needed to detect the occurrence of a large burst and to properly ++ * mark all the queues belonging to it (so that they can then be ++ * treated in a different way). This goal is achieved by maintaining a ++ * "burst list" that holds, temporarily, the queues that belong to the ++ * burst in progress. The list is then used to mark these queues as ++ * belonging to a large burst if the burst does become large. The main ++ * steps are the following. ++ * ++ * . when the very first queue is created, the queue is inserted into the ++ * list (as it could be the first queue in a possible burst) ++ * ++ * . if the current burst has not yet become large, and a queue Q that does ++ * not yet belong to the burst is activated shortly after the last time ++ * at which a new queue entered the burst list, then the function appends ++ * Q to the burst list ++ * ++ * . if, as a consequence of the previous step, the burst size reaches ++ * the large-burst threshold, then ++ * ++ * . all the queues in the burst list are marked as belonging to a ++ * large burst ++ * ++ * . the burst list is deleted; in fact, the burst list already served ++ * its purpose (keeping temporarily track of the queues in a burst, ++ * so as to be able to mark them as belonging to a large burst in the ++ * previous sub-step), and now is not needed any more ++ * ++ * . 
the device enters a large-burst mode
++ *
++ * . if a queue Q that does not belong to the burst is created while
++ *   the device is in large-burst mode and shortly after the last time
++ *   at which a queue either entered the burst list or was marked as
++ *   belonging to the current large burst, then Q is immediately marked
++ *   as belonging to a large burst.
++ *
++ * . if a queue Q that does not belong to the burst is created a while
++ *   later than the last time at which a queue either entered the burst
++ *   list or was marked as belonging to the current large burst, i.e.,
++ *   not shortly after it, then the current burst is deemed as finished
++ *   and:
++ *
++ *        . the large-burst mode is reset if set
++ *
++ *        . the burst list is emptied
++ *
++ *        . Q is inserted in the burst list, as Q may be the first queue
++ *          in a possible new burst (then the burst list contains just Q
++ *          after this step).
++ */
++static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
++{
++	/*
++	 * If bfqq is already in the burst list or is part of a large
++	 * burst, or finally has just been split, then there is
++	 * nothing else to do.
++	 */
++	if (!hlist_unhashed(&bfqq->burst_list_node) ||
++	    bfq_bfqq_in_large_burst(bfqq) ||
++	    time_is_after_eq_jiffies(bfqq->split_time +
++				     msecs_to_jiffies(10)))
++		return;
++
++	/*
++	 * If bfqq's creation happens late enough, or bfqq belongs to
++	 * a different group than the burst group, then the current
++	 * burst is finished, and related data structures must be
++	 * reset.
++	 *
++	 * In this respect, consider the special case where bfqq is
++	 * the very first queue created after BFQ is selected for this
++	 * device. In this case, last_ins_in_burst and
++	 * burst_parent_entity are not yet significant when we get
++	 * here. But it is easy to verify that, whether or not the
++	 * following condition is true, bfqq will end up being
++	 * inserted into the burst list. In particular the list will
++	 * happen to contain only bfqq. And this is exactly what has
++	 * to happen, as bfqq may be the first queue of the first
++	 * burst.
++	 */
++	if (time_is_before_jiffies(bfqd->last_ins_in_burst +
++	    bfqd->bfq_burst_interval) ||
++	    bfqq->entity.parent != bfqd->burst_parent_entity) {
++		bfqd->large_burst = false;
++		bfq_reset_burst_list(bfqd, bfqq);
++		bfq_log_bfqq(bfqd, bfqq,
++			"late activation or different group");
++		goto end;
++	}
++
++	/*
++	 * If we get here, then bfqq is being activated shortly after the
++	 * last queue. So, if the current burst is also large, we can mark
++	 * bfqq as belonging to this large burst immediately.
++	 */
++	if (bfqd->large_burst) {
++		bfq_log_bfqq(bfqd, bfqq, "marked in burst");
++		bfq_mark_bfqq_in_large_burst(bfqq);
++		goto end;
++	}
++
++	/*
++	 * If we get here, then a large-burst state has not yet been
++	 * reached, but bfqq is being activated shortly after the last
++	 * queue. Then we add bfqq to the burst.
++	 */
++	bfq_add_to_burst(bfqd, bfqq);
++end:
++	/*
++	 * At this point, bfqq either has been added to the current
++	 * burst or has caused the current burst to terminate and a
++	 * possible new burst to start. In particular, in the second
++	 * case, bfqq has become the first queue in the possible new
++	 * burst. In both cases last_ins_in_burst needs to be moved
++	 * forward.
++	 */
++	bfqd->last_ins_in_burst = jiffies;
++
++}
++
++static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
++{
++	struct bfq_entity *entity = &bfqq->entity;
++
++	if (entity->budget < entity->service) {
++		pr_crit("budget %d service %d\n",
++			entity->budget, entity->service);
++		BUG();
++	}
++	return entity->budget - entity->service;
++}
++
++/*
++ * If enough samples have been computed, return the current max budget
++ * stored in bfqd, which is dynamically updated according to the
++ * estimated disk peak rate; otherwise return the default max budget
++ */
++static int bfq_max_budget(struct bfq_data *bfqd)
++{
++	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
++		return bfq_default_max_budget;
++	else
++		return bfqd->bfq_max_budget;
++}
++
++/*
++ * Return min budget, which is a fraction of the current or default
++ * max budget (trying with 1/32)
++ */
++static int bfq_min_budget(struct bfq_data *bfqd)
++{
++	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
++		return bfq_default_max_budget / 32;
++	else
++		return bfqd->bfq_max_budget / 32;
++}
++
++static void bfq_bfqq_expire(struct bfq_data *bfqd,
++			    struct bfq_queue *bfqq,
++			    bool compensate,
++			    enum bfqq_expiration reason);
++
++/*
++ * The next function, invoked after the input queue bfqq switches from
++ * idle to busy, updates the budget of bfqq. The function also tells
++ * whether the in-service queue should be expired, by returning
++ * true. The purpose of expiring the in-service queue is to give bfqq
++ * the chance to possibly preempt the in-service queue, and the reason
++ * for preempting the in-service queue is to achieve one of the two
++ * goals below.
++ *
++ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
++ * expired because it has remained idle. In particular, bfqq may have
++ * expired for one of the following two reasons:
++ *
++ * - BFQ_BFQQ_NO_MORE_REQUESTS bfqq did not enjoy any device idling and
++ *   did not make it to issue a new request before its last request
++ *   was served;
++ *
++ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue
++ *   a new request before the expiration of the idling-time.
++ *
++ * Even if bfqq has expired for one of the above reasons, the process
++ * associated with the queue may, however, be issuing requests greedily,
++ * and thus be sensitive to the bandwidth it receives (bfqq may have
++ * remained idle for other reasons: high CPU load, bfqq not enjoying
++ * idling, I/O throttling somewhere in the path from the process to
++ * the I/O scheduler, ...). But if, after every expiration for one of
++ * the above two reasons, bfqq has to wait for the service of at least
++ * one full budget of another queue before being served again, then
++ * bfqq is likely to get a much lower bandwidth or resource time than
++ * its reserved ones. To address this issue, two countermeasures need
++ * to be taken.
++ *
++ * First, the budget and the timestamps of bfqq need to be updated in
++ * a special way on bfqq reactivation: they need to be updated as if
++ * bfqq did not remain idle and did not expire. In fact, if they are
++ * computed as if bfqq expired and remained idle until reactivation,
++ * then the process associated with bfqq is treated as if, instead of
++ * being greedy, it stopped issuing requests when bfqq remained idle,
++ * and restarts issuing requests only on this reactivation. In other
++ * words, the scheduler does not help the process recover the "service
++ * hole" between bfqq expiration and reactivation. As a consequence,
++ * the process receives a lower bandwidth than its reserved one. In
++ * contrast, to recover this hole, the budget must be updated as if
++ * bfqq was not expired at all before this reactivation, i.e., it must
++ * be set to the value of the remaining budget when bfqq was
++ * expired. Along the same line, timestamps need to be assigned the
++ * value they had the last time bfqq was selected for service, i.e.,
++ * before last expiration. Thus timestamps need to be back-shifted
++ * with respect to their normal computation (see [1] for more details
++ * on this tricky aspect).
++ *
++ * Secondly, to allow the process to recover the hole, the in-service
++ * queue must be expired too, to give bfqq the chance to preempt it
++ * immediately. In fact, if bfqq has to wait for a full budget of the
++ * in-service queue to be completed, then it may become impossible to
++ * let the process recover the hole, even if the back-shifted
++ * timestamps of bfqq are lower than those of the in-service queue. If
++ * this happens for most or all of the holes, then the process may not
++ * receive its reserved bandwidth. In this respect, it is worth noting
++ * that, since the service of outstanding requests cannot be
++ * preempted, a small fraction of the holes may, however, be
++ * unrecoverable, thereby causing a small loss of bandwidth.
++ *
++ * The last important point is detecting whether bfqq does need this
++ * bandwidth recovery. In this respect, the next function deems the
++ * process associated with bfqq greedy, and thus allows it to recover
++ * the hole, if: 1) the process is waiting for the arrival of a new
++ * request (which implies that bfqq expired for one of the above two
++ * reasons), and 2) such a request arrives soon after. The first
++ * condition is controlled through the flag non_blocking_wait_rq,
++ * while the second through the flag arrived_in_time. If both
++ * conditions hold, then the function computes the budget in the
++ * above-described special way, and signals that the in-service queue
++ * should be expired. Timestamp back-shifting is done later in
++ * __bfq_activate_entity.
++ *
++ * 2. Reduce latency. Even if timestamps are not back-shifted to let
++ * the process associated with bfqq recover a service hole, bfqq may
++ * however happen to have, after being (re)activated, a lower finish
++ * timestamp than the in-service queue. That is, the next budget of
++ * bfqq may have to be completed before the one of the in-service
++ * queue. If this is the case, then preempting the in-service queue
++ * allows this goal to be achieved, apart from the non-preemptible,
++ * outstanding requests mentioned above.
++ *
++ * Unfortunately, regardless of which of the above two goals one wants
++ * to achieve, service trees need first to be updated to know whether
++ * the in-service queue must be preempted. To have service trees
++ * correctly updated, the in-service queue must be expired and
++ * rescheduled, and bfqq must be scheduled too. This is one of the
++ * most costly operations (in future versions, the scheduling
++ * mechanism may be redesigned in such a way as to make it possible to
++ * know whether preemption is needed without needing to update service
++ * trees). In addition, queue preemptions almost always cause random
++ * I/O, and thus loss of throughput. Because of these facts, the next
++ * function adopts the following simple scheme to avoid both costly
++ * operations and too frequent preemptions: it requests the expiration
++ * of the in-service queue (unconditionally) only for queues that need
++ * to recover a hole, or that either are weight-raised or deserve to
++ * be weight-raised.
++ */
++static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
++						struct bfq_queue *bfqq,
++						bool arrived_in_time,
++						bool wr_or_deserves_wr)
++{
++	struct bfq_entity *entity = &bfqq->entity;
++
++	/*
++	 * In the next compound condition, we also check whether there
++	 * is some budget left, because otherwise there is no point in
++	 * trying to go on serving bfqq with this same budget: bfqq
++	 * would be expired immediately after being selected for
++	 * service. This would only cause useless overhead.
++	 */
++	if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time &&
++	    bfq_bfqq_budget_left(bfqq) > 0) {
++		/*
++		 * We do not clear the flag non_blocking_wait_rq here, as
++		 * the latter is used in bfq_activate_bfqq to signal
++		 * that timestamps need to be back-shifted (and is
++		 * cleared right after).
++		 */
++
++		/*
++		 * In the next assignment we rely on the fact that
++		 * neither entity->service nor entity->budget is
++		 * updated on expiration if bfqq is empty (see
++		 * __bfq_bfqq_recalc_budget). Thus both quantities
++		 * remain unchanged after such an expiration, and the
++		 * following statement therefore assigns to
++		 * entity->budget the remaining budget on such an
++		 * expiration.
++		 */
++		BUG_ON(bfqq->max_budget < 0);
++		entity->budget = min_t(unsigned long,
++				       bfq_bfqq_budget_left(bfqq),
++				       bfqq->max_budget);
++
++		BUG_ON(entity->budget < 0);
++
++		/*
++		 * At this point, we have used entity->service to get
++		 * the budget left (needed for updating
++		 * entity->budget). Thus we finally can, and have to,
++		 * reset entity->service. The latter must be reset
++		 * because bfqq would otherwise be charged again for
++		 * the service it has received during its previous
++		 * service slot(s).
++		 */
++		entity->service = 0;
++
++		return true;
++	}
++
++	/*
++	 * We can finally complete expiration, by setting service to 0.
++	 */
++	entity->service = 0;
++	BUG_ON(bfqq->max_budget < 0);
++	entity->budget = max_t(unsigned long, bfqq->max_budget,
++			       bfq_serv_to_charge(bfqq->next_rq, bfqq));
++	BUG_ON(entity->budget < 0);
++
++	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
++	return wr_or_deserves_wr;
++}
++
++/*
++ * Return the farthest past time instant according to jiffies
++ * macros.
++ */
++static unsigned long bfq_smallest_from_now(void)
++{
++	return jiffies - MAX_JIFFY_OFFSET;
++}
++
++static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
++					     struct bfq_queue *bfqq,
++					     unsigned int old_wr_coeff,
++					     bool wr_or_deserves_wr,
++					     bool interactive,
++					     bool in_burst,
++					     bool soft_rt)
++{
++	if (old_wr_coeff == 1 && wr_or_deserves_wr) {
++		/* start a weight-raising period */
++		if (interactive) {
++			bfqq->service_from_wr = 0;
++			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
++			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
++		} else {
++			/*
++			 * No interactive weight raising in progress
++			 * here: assign minus infinity to
++			 * wr_start_at_switch_to_srt, to make sure
++			 * that, at the end of the soft-real-time
++			 * weight raising period that is starting
++			 * now, no interactive weight-raising period
++			 * may be wrongly considered as still in
++			 * progress (and thus actually started by
++			 * mistake).
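++			 * (Editorial note: "minus infinity" is
++			 * implemented below by bfq_smallest_from_now(),
++			 * i.e., jiffies - MAX_JIFFY_OFFSET, the
++			 * farthest past instant representable with the
++			 * jiffies macros.)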
++ */ ++ bfqq->wr_start_at_switch_to_srt = ++ bfq_smallest_from_now(); ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff * ++ BFQ_SOFTRT_WEIGHT_FACTOR; ++ bfqq->wr_cur_max_time = ++ bfqd->bfq_wr_rt_max_time; ++ } ++ /* ++ * If needed, further reduce budget to make sure it is ++ * close to bfqq's backlog, so as to reduce the ++ * scheduling-error component due to a too large ++ * budget. Do not care about throughput consequences, ++ * but only about latency. Finally, do not assign a ++ * too small budget either, to avoid increasing ++ * latency by causing too frequent expirations. ++ */ ++ bfqq->entity.budget = min_t(unsigned long, ++ bfqq->entity.budget, ++ 2 * bfq_min_budget(bfqd)); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais starting at %lu, rais_max_time %u", ++ jiffies, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } else if (old_wr_coeff > 1) { ++ if (interactive) { /* update wr coeff and duration */ ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ } else if (in_burst) { ++ bfqq->wr_coeff = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais ending at %lu, rais_max_time %u", ++ jiffies, ++ jiffies_to_msecs(bfqq-> ++ wr_cur_max_time)); ++ } else if (soft_rt) { ++ /* ++ * The application is now or still meeting the ++ * requirements for being deemed soft rt. We ++ * can then correctly and safely (re)charge ++ * the weight-raising duration for the ++ * application with the weight-raising ++ * duration for soft rt applications. ++ * ++ * In particular, doing this recharge now, i.e., ++ * before the weight-raising period for the ++ * application finishes, reduces the probability ++ * of the following negative scenario: ++ * 1) the weight of a soft rt application is ++ * raised at startup (as for any newly ++ * created application), ++ * 2) since the application is not interactive, ++ * at a certain time weight-raising is ++ * stopped for the application, ++ * 3) at that time the application happens to ++ * still have pending requests, and hence ++ * is destined to not have a chance to be ++ * deemed soft rt before these requests are ++ * completed (see the comments to the ++ * function bfq_bfqq_softrt_next_start() ++ * for details on soft rt detection), ++ * 4) these pending requests experience a high ++ * latency because the application is not ++ * weight-raised while they are pending. ++ */ ++ if (bfqq->wr_cur_max_time != ++ bfqd->bfq_wr_rt_max_time) { ++ bfqq->wr_start_at_switch_to_srt = ++ bfqq->last_wr_start_finish; ++ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); ++ ++ bfqq->wr_cur_max_time = ++ bfqd->bfq_wr_rt_max_time; ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff * ++ BFQ_SOFTRT_WEIGHT_FACTOR; ++ bfq_log_bfqq(bfqd, bfqq, ++ "switching to soft_rt wr"); ++ } else ++ bfq_log_bfqq(bfqd, bfqq, ++ "moving forward soft_rt wr duration"); ++ bfqq->last_wr_start_finish = jiffies; ++ } ++ } ++} ++ ++static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ return bfqq->dispatched == 0 && ++ time_is_before_jiffies( ++ bfqq->budget_timeout + ++ bfqd->bfq_wr_min_idle_time); ++} ++ ++static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ int old_wr_coeff, ++ struct request *rq, ++ bool *interactive) ++{ ++ bool soft_rt, in_burst, wr_or_deserves_wr, ++ bfqq_wants_to_preempt, ++ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), ++ /* ++ * See the comments on ++ * bfq_bfqq_update_budg_for_activation for ++ * details on the usage of the next variable. 
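++		 * (Editorial note: per the
++		 * assignment below, a request counts
++		 * as arrived in time if it arrives
++		 * within three idle slices,
++		 * 3 * bfq_slice_idle ns, of the
++		 * completion of bfqq's last request.)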
++ */ ++ arrived_in_time = ktime_get_ns() <= ++ bfqq->ttime.last_end_request + ++ bfqd->bfq_slice_idle * 3; ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "bfq_add_request non-busy: " ++ "jiffies %lu, in_time %d, idle_long %d busyw %d " ++ "wr_coeff %u", ++ jiffies, arrived_in_time, ++ idle_for_long_time, ++ bfq_bfqq_non_blocking_wait_rq(bfqq), ++ old_wr_coeff); ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ BUG_ON(bfqq == bfqd->in_service_queue); ++ ++ /* ++ * bfqq deserves to be weight-raised if: ++ * - it is sync, ++ * - it does not belong to a large burst, ++ * - it has been idle for enough time or is soft real-time, ++ * - is linked to a bfq_io_cq (it is not shared in any sense) ++ */ ++ in_burst = bfq_bfqq_in_large_burst(bfqq); ++ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && ++ !in_burst && ++ time_is_before_jiffies(bfqq->soft_rt_next_start) && ++ bfqq->dispatched == 0; ++ *interactive = ++ !in_burst && ++ idle_for_long_time; ++ wr_or_deserves_wr = bfqd->low_latency && ++ (bfqq->wr_coeff > 1 || ++ (bfq_bfqq_sync(bfqq) && ++ bfqq->bic && (*interactive || soft_rt))); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "bfq_add_request: " ++ "in_burst %d, " ++ "soft_rt %d (next %lu), inter %d, bic %p", ++ bfq_bfqq_in_large_burst(bfqq), soft_rt, ++ bfqq->soft_rt_next_start, ++ *interactive, ++ bfqq->bic); ++ ++ /* ++ * Using the last flag, update budget and check whether bfqq ++ * may want to preempt the in-service queue. ++ */ ++ bfqq_wants_to_preempt = ++ bfq_bfqq_update_budg_for_activation(bfqd, bfqq, ++ arrived_in_time, ++ wr_or_deserves_wr); ++ ++ /* ++ * If bfqq happened to be activated in a burst, but has been ++ * idle for much more than an interactive queue, then we ++ * assume that, in the overall I/O initiated in the burst, the ++ * I/O associated with bfqq is finished. So bfqq does not need ++ * to be treated as a queue belonging to a burst ++ * anymore. Accordingly, we reset bfqq's in_large_burst flag ++ * if set, and remove bfqq from the burst list if it's ++ * there. We do not decrement burst_size, because the fact ++ * that bfqq does not need to belong to the burst list any ++ * more does not invalidate the fact that bfqq was created in ++ * a burst. ++ */ ++ if (likely(!bfq_bfqq_just_created(bfqq)) && ++ idle_for_long_time && ++ time_is_before_jiffies( ++ bfqq->budget_timeout + ++ msecs_to_jiffies(10000))) { ++ hlist_del_init(&bfqq->burst_list_node); ++ bfq_clear_bfqq_in_large_burst(bfqq); ++ } ++ ++ bfq_clear_bfqq_just_created(bfqq); ++ ++ if (!bfq_bfqq_IO_bound(bfqq)) { ++ if (arrived_in_time) { ++ bfqq->requests_within_timer++; ++ if (bfqq->requests_within_timer >= ++ bfqd->bfq_requests_within_timer) ++ bfq_mark_bfqq_IO_bound(bfqq); ++ } else ++ bfqq->requests_within_timer = 0; ++ bfq_log_bfqq(bfqd, bfqq, "requests in time %d", ++ bfqq->requests_within_timer); ++ } ++ ++ if (bfqd->low_latency) { ++ if (unlikely(time_is_after_jiffies(bfqq->split_time))) ++ /* wraparound */ ++ bfqq->split_time = ++ jiffies - bfqd->bfq_wr_min_idle_time - 1; ++ ++ if (time_is_before_jiffies(bfqq->split_time + ++ bfqd->bfq_wr_min_idle_time)) { ++ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, ++ old_wr_coeff, ++ wr_or_deserves_wr, ++ *interactive, ++ in_burst, ++ soft_rt); ++ ++ if (old_wr_coeff != bfqq->wr_coeff) ++ bfqq->entity.prio_changed = 1; ++ } ++ } ++ ++ bfqq->last_idle_bklogged = jiffies; ++ bfqq->service_from_backlogged = 0; ++ bfq_clear_bfqq_softrt_update(bfqq); ++ ++ bfq_add_bfqq_busy(bfqd, bfqq); ++ ++ /* ++ * Expire in-service queue only if preemption may be needed ++ * for guarantees. 
In this respect, the function ++ * next_queue_may_preempt just checks a simple, necessary ++ * condition, and not a sufficient condition based on ++ * timestamps. In fact, for the latter condition to be ++ * evaluated, timestamps would need first to be updated, and ++ * this operation is quite costly (see the comments on the ++ * function bfq_bfqq_update_budg_for_activation). ++ */ ++ if (bfqd->in_service_queue && bfqq_wants_to_preempt && ++ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && ++ next_queue_may_preempt(bfqd)) { ++ struct bfq_queue *in_serv = ++ bfqd->in_service_queue; ++ BUG_ON(in_serv == bfqq); ++ ++ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, ++ false, BFQ_BFQQ_PREEMPTED); ++ } ++} ++ ++static void bfq_add_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ struct request *next_rq, *prev; ++ unsigned int old_wr_coeff = bfqq->wr_coeff; ++ bool interactive = false; ++ ++ bfq_log_bfqq(bfqd, bfqq, "size %u %s", ++ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); ++ ++ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ ++ bfq_log_bfqq(bfqd, bfqq, ++ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", ++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time), ++ bfqq->wr_coeff, ++ bfqq->entity.weight, bfqq->entity.orig_weight); ++ ++ bfqq->queued[rq_is_sync(rq)]++; ++ bfqd->queued++; ++ ++ BUG_ON(!RQ_BFQQ(rq)); ++ BUG_ON(RQ_BFQQ(rq) != bfqq); ++ WARN_ON(blk_rq_sectors(rq) == 0); ++ ++ elv_rb_add(&bfqq->sort_list, rq); ++ ++ /* ++ * Check if this request is a better next-to-serve candidate. ++ */ ++ prev = bfqq->next_rq; ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); ++ BUG_ON(!next_rq); ++ BUG_ON(!RQ_BFQQ(next_rq)); ++ BUG_ON(RQ_BFQQ(next_rq) != bfqq); ++ bfqq->next_rq = next_rq; ++ ++ /* ++ * Adjust priority tree position, if next_rq changes. ++ */ ++ if (prev != bfqq->next_rq) ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ ++ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */ ++ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, ++ rq, &interactive); ++ else { ++ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && ++ time_is_before_jiffies( ++ bfqq->last_wr_start_finish + ++ bfqd->bfq_wr_min_inter_arr_async)) { ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ ++ bfqd->wr_busy_queues++; ++ BUG_ON(bfqd->wr_busy_queues > bfq_tot_busy_queues(bfqd)); ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "non-idle wrais starting, " ++ "wr_max_time %u wr_busy %d", ++ jiffies_to_msecs(bfqq->wr_cur_max_time), ++ bfqd->wr_busy_queues); ++ } ++ if (prev != bfqq->next_rq) ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ /* ++ * Assign jiffies to last_wr_start_finish in the following ++ * cases: ++ * ++ * . if bfqq is not going to be weight-raised, because, for ++ * non weight-raised queues, last_wr_start_finish stores the ++ * arrival time of the last request; as of now, this piece ++ * of information is used only for deciding whether to ++ * weight-raise async queues ++ * ++ * . if bfqq is not weight-raised, because, if bfqq is now ++ * switching to weight-raised, then last_wr_start_finish ++ * stores the time when weight-raising starts ++ * ++ * . 
if bfqq is interactive, because, regardless of whether ++ * bfqq is currently weight-raised, the weight-raising ++ * period must start or restart (this case is considered ++ * separately because it is not detected by the above ++ * conditions, if bfqq is already weight-raised) ++ * ++ * last_wr_start_finish has to be updated also if bfqq is soft ++ * real-time, because the weight-raising period is constantly ++ * restarted on idle-to-busy transitions for these queues, but ++ * this is already done in bfq_bfqq_handle_idle_busy_switch if ++ * needed. ++ */ ++ if (bfqd->low_latency && ++ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) ++ bfqq->last_wr_start_finish = jiffies; ++} ++ ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, ++ struct bio *bio, ++ struct request_queue *q) ++{ ++ struct bfq_queue *bfqq = bfqd->bio_bfqq; ++ ++ BUG_ON(!bfqd->bio_bfqq_set); ++ ++ if (bfqq) ++ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); ++ ++ return NULL; ++} ++ ++static sector_t get_sdist(sector_t last_pos, struct request *rq) ++{ ++ sector_t sdist = 0; ++ ++ if (last_pos) { ++ if (last_pos < blk_rq_pos(rq)) ++ sdist = blk_rq_pos(rq) - last_pos; ++ else ++ sdist = last_pos - blk_rq_pos(rq); ++ } ++ ++ return sdist; ++} ++ ++#if 0 /* Still not clear if we can do without next two functions */ ++static void bfq_activate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ bfqd->rq_in_driver++; ++} ++ ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ ++ BUG_ON(bfqd->rq_in_driver == 0); ++ bfqd->rq_in_driver--; ++} ++#endif ++ ++static void bfq_remove_request(struct request_queue *q, ++ struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ const int sync = rq_is_sync(rq); ++ ++ BUG_ON(bfqq->entity.service > bfqq->entity.budget); ++ ++ if (bfqq->next_rq == rq) { ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); ++ if (bfqq->next_rq && !RQ_BFQQ(bfqq->next_rq)) { ++ pr_crit("no bfqq! for next rq %p bfqq %p\n", ++ bfqq->next_rq, bfqq); ++ } ++ ++ BUG_ON(bfqq->next_rq && !RQ_BFQQ(bfqq->next_rq)); ++ if (bfqq->next_rq && RQ_BFQQ(bfqq->next_rq) != bfqq) { ++ pr_crit( ++ "wrong bfqq! for next rq %p, rq_bfqq %p bfqq %p\n", ++ bfqq->next_rq, RQ_BFQQ(bfqq->next_rq), bfqq); ++ } ++ BUG_ON(bfqq->next_rq && RQ_BFQQ(bfqq->next_rq) != bfqq); ++ ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ if (rq->queuelist.prev != &rq->queuelist) ++ list_del_init(&rq->queuelist); ++ BUG_ON(bfqq->queued[sync] == 0); ++ bfqq->queued[sync]--; ++ bfqd->queued--; ++ elv_rb_del(&bfqq->sort_list, rq); ++ ++ elv_rqhash_del(q, rq); ++ if (q->last_merge == rq) ++ q->last_merge = NULL; ++ ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ bfqq->next_rq = NULL; ++ ++ BUG_ON(bfqq->entity.budget < 0); ++ ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { ++ BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */ ++ bfq_del_bfqq_busy(bfqd, bfqq, false); ++ /* ++ * bfqq emptied. In normal operation, when ++ * bfqq is empty, bfqq->entity.service and ++ * bfqq->entity.budget must contain, ++ * respectively, the service received and the ++ * budget used last time bfqq emptied. These ++ * facts do not hold in this case, as at least ++ * this last removal occurred while bfqq is ++ * not in service. 
To avoid inconsistencies, ++ * reset both bfqq->entity.service and ++ * bfqq->entity.budget, if bfqq has still a ++ * process that may issue I/O requests to it. ++ */ ++ bfqq->entity.budget = bfqq->entity.service = 0; ++ } ++ ++ /* ++ * Remove queue from request-position tree as it is empty. ++ */ ++ if (bfqq->pos_root) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ } else { ++ BUG_ON(!bfqq->next_rq); ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ } ++ ++ if (rq->cmd_flags & REQ_META) { ++ BUG_ON(bfqq->meta_pending == 0); ++ bfqq->meta_pending--; ++ } ++} ++ ++static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) ++{ ++ struct request_queue *q = hctx->queue; ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct request *free = NULL; ++ /* ++ * bfq_bic_lookup grabs the queue_lock: invoke it now and ++ * store its return value for later use, to avoid nesting ++ * queue_lock inside the bfqd->lock. We assume that the bic ++ * returned by bfq_bic_lookup does not go away before ++ * bfqd->lock is taken. ++ */ ++ struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q); ++ bool ret; ++ ++ spin_lock_irq(&bfqd->lock); ++ ++ if (bic) ++ bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); ++ else ++ bfqd->bio_bfqq = NULL; ++ bfqd->bio_bic = bic; ++ /* Set next flag just for testing purposes */ ++ bfqd->bio_bfqq_set = true; ++ ++ ret = blk_mq_sched_try_merge(q, bio, &free); ++ ++ /* ++ * XXX Not yet freeing without lock held, to avoid an ++ * inconsistency with respect to the lock-protected invocation ++ * of blk_mq_sched_try_insert_merge in bfq_bio_merge. Waiting ++ * for clarifications from Jens. ++ */ ++ if (free) ++ blk_mq_free_request(free); ++ bfqd->bio_bfqq_set = false; ++ spin_unlock_irq(&bfqd->lock); ++ ++ return ret; ++} ++ ++static int bfq_request_merge(struct request_queue *q, struct request **req, ++ struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct request *__rq; ++ ++ __rq = bfq_find_rq_fmerge(bfqd, bio, q); ++ if (__rq && elv_bio_merge_ok(__rq, bio)) { ++ *req = __rq; ++ bfq_log(bfqd, "req %p", __rq); ++ ++ return ELEVATOR_FRONT_MERGE; ++ } ++ ++ return ELEVATOR_NO_MERGE; ++} ++ ++static struct bfq_queue *bfq_init_rq(struct request *rq); ++ ++static void bfq_request_merged(struct request_queue *q, struct request *req, ++ enum elv_merge type) ++{ ++ BUG_ON(req->rq_flags & RQF_DISP_LIST); ++ ++ if (type == ELEVATOR_FRONT_MERGE && ++ rb_prev(&req->rb_node) && ++ blk_rq_pos(req) < ++ blk_rq_pos(container_of(rb_prev(&req->rb_node), ++ struct request, rb_node))) { ++ struct bfq_queue *bfqq = bfq_init_rq(req); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ struct request *prev, *next_rq; ++ ++ /* Reposition request in its sort_list */ ++ elv_rb_del(&bfqq->sort_list, req); ++ BUG_ON(!RQ_BFQQ(req)); ++ BUG_ON(RQ_BFQQ(req) != bfqq); ++ elv_rb_add(&bfqq->sort_list, req); ++ ++ /* Choose next request to be served for bfqq */ ++ prev = bfqq->next_rq; ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, ++ bfqd->last_position); ++ BUG_ON(!next_rq); ++ ++ bfqq->next_rq = next_rq; ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "req %p prev %p next_rq %p bfqq %p", ++ req, prev, next_rq, bfqq); ++ ++ /* ++ * If next_rq changes, update both the queue's budget to ++ * fit the new request and the queue's position in its ++ * rq_pos_tree. 
++ */ ++ if (prev != bfqq->next_rq) { ++ bfq_updated_next_req(bfqd, bfqq); ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ } ++ } ++} ++ ++/* ++ * This function is called to notify the scheduler that the requests ++ * rq and 'next' have been merged, with 'next' going away. BFQ ++ * exploits this hook to address the following issue: if 'next' has a ++ * fifo_time lower than rq, then the fifo_time of rq must be set to ++ * the value of 'next', to not forget the greater age of 'next'. ++ * ++ * NOTE: in this function we assume that rq is in a bfq_queue, based ++ * on the fact that rq is picked from the hash table q->elevator->hash, which, ++ * in turn, is filled only with I/O requests present in ++ * bfq_queues, while BFQ is in use for the request queue q. In fact, ++ * the function that fills this hash table (elv_rqhash_add) is called ++ * only by bfq_insert_request. ++ */ ++static void bfq_requests_merged(struct request_queue *q, struct request *rq, ++ struct request *next) ++{ ++ struct bfq_queue *bfqq = bfq_init_rq(rq), ++ *next_bfqq = bfq_init_rq(next); ++ ++ BUG_ON(!RQ_BFQQ(rq)); ++ BUG_ON(!RQ_BFQQ(next)); /* this does not imply next is in a bfqq */ ++ BUG_ON(rq->rq_flags & RQF_DISP_LIST); ++ BUG_ON(next->rq_flags & RQF_DISP_LIST); ++ ++ lockdep_assert_held(&bfqq->bfqd->lock); ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "rq %p next %p bfqq %p next_bfqq %p", ++ rq, next, bfqq, next_bfqq); ++ ++ /* ++ * If next and rq belong to the same bfq_queue and next is older ++ * than rq, then reposition rq in the fifo (by substituting next ++ * with rq). Otherwise, if next and rq belong to different ++ * bfq_queues, never reposition rq: in fact, we would have to ++ * reposition it with respect to next's position in its own fifo, ++ * which would most certainly be too expensive with respect to ++ * the benefits. ++ */ ++ if (bfqq == next_bfqq && ++ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && ++ next->fifo_time < rq->fifo_time) { ++ list_del_init(&rq->queuelist); ++ list_replace_init(&next->queuelist, &rq->queuelist); ++ rq->fifo_time = next->fifo_time; ++ } ++ ++ if (bfqq->next_rq == next) ++ bfqq->next_rq = rq; ++ ++ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); ++} ++ ++/* Must be called with bfqq != NULL */ ++static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) ++{ ++ BUG_ON(!bfqq); ++ ++ if (bfq_bfqq_busy(bfqq)) { ++ bfqq->bfqd->wr_busy_queues--; ++ BUG_ON(bfqq->bfqd->wr_busy_queues < 0); ++ } ++ bfqq->wr_coeff = 1; ++ bfqq->wr_cur_max_time = 0; ++ bfqq->last_wr_start_finish = jiffies; ++ /* ++ * Trigger a weight change on the next invocation of ++ * __bfq_entity_update_weight_prio.
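The fifo_time handling in bfq_requests_merged() reduces to one rule: when two requests of the same queue merge, the survivor keeps the older deadline, or the merged request would lose its place in FIFO order. A stand-alone sketch of just that rule (simplified structs, illustrative only, not the kernel types):

#include <stdio.h>

struct req {
	unsigned long fifo_time; /* expiry deadline, in jiffies */
};

/* rq absorbs next: keep the earlier (older) deadline, as in
 * bfq_requests_merged(), so the merged request does not forget
 * the greater age of 'next'. */
static void merge_fifo_time(struct req *rq, const struct req *next)
{
	if (next->fifo_time < rq->fifo_time)
		rq->fifo_time = next->fifo_time;
}

int main(void)
{
	struct req rq = { .fifo_time = 500 }, next = { .fifo_time = 300 };

	merge_fifo_time(&rq, &next);
	printf("merged fifo_time = %lu\n", rq.fifo_time); /* 300 */
	return 0;
}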
++ */ ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "wrais ending at %lu, rais_max_time %u", ++ bfqq->last_wr_start_finish, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "wr_busy %d", ++ bfqq->bfqd->wr_busy_queues); ++} ++ ++static void bfq_end_wr_async_queues(struct bfq_data *bfqd, ++ struct bfq_group *bfqg) ++{ ++ int i, j; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_BE_NR; j++) ++ if (bfqg->async_bfqq[i][j]) ++ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); ++ if (bfqg->async_idle_bfqq) ++ bfq_bfqq_end_wr(bfqg->async_idle_bfqq); ++} ++ ++static void bfq_end_wr(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq; ++ ++ spin_lock_irq(&bfqd->lock); ++ ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) ++ bfq_bfqq_end_wr(bfqq); ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) ++ bfq_bfqq_end_wr(bfqq); ++ bfq_end_wr_async(bfqd); ++ ++ spin_unlock_irq(&bfqd->lock); ++} ++ ++static sector_t bfq_io_struct_pos(void *io_struct, bool request) ++{ ++ if (request) ++ return blk_rq_pos(io_struct); ++ else ++ return ((struct bio *)io_struct)->bi_iter.bi_sector; ++} ++ ++static int bfq_rq_close_to_sector(void *io_struct, bool request, ++ sector_t sector) ++{ ++ return abs(bfq_io_struct_pos(io_struct, request) - sector) <= ++ BFQQ_CLOSE_THR; ++} ++ ++static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ sector_t sector) ++{ ++ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; ++ struct rb_node *parent, *node; ++ struct bfq_queue *__bfqq; ++ ++ if (RB_EMPTY_ROOT(root)) ++ return NULL; ++ ++ /* ++ * First, if we find a request starting at the end of the last ++ * request, choose it. ++ */ ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); ++ if (__bfqq) ++ return __bfqq; ++ ++ /* ++ * If the exact sector wasn't found, the parent of the NULL leaf ++ * will contain the closest sector (rq_pos_tree sorted by ++ * next_request position). ++ */ ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) ++ return __bfqq; ++ ++ if (blk_rq_pos(__bfqq->next_rq) < sector) ++ node = rb_next(&__bfqq->pos_node); ++ else ++ node = rb_prev(&__bfqq->pos_node); ++ if (!node) ++ return NULL; ++ ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node); ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) ++ return __bfqq; ++ ++ return NULL; ++} ++ ++static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, ++ struct bfq_queue *cur_bfqq, ++ sector_t sector) ++{ ++ struct bfq_queue *bfqq; ++ ++ /* ++ * We shall notice if some of the queues are cooperating, ++ * e.g., working closely on the same area of the device. In ++ * that case, we can group them together and: 1) don't waste ++ * time idling, and 2) serve the union of their requests in ++ * the best possible order for throughput. ++ */ ++ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); ++ if (!bfqq || bfqq == cur_bfqq) ++ return NULL; ++ ++ return bfqq; ++} ++ ++static struct bfq_queue * ++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) ++{ ++ int process_refs, new_process_refs; ++ struct bfq_queue *__bfqq; ++ ++ /* ++ * If there are no process references on the new_bfqq, then it is ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain ++ * may have dropped their last reference (not just their last process ++ * reference). 
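bfqq_find_close() above exploits an rb-tree property: if the exact sector is not found, the last node visited (the would-be parent) is the closest node in key order, and only its in-order neighbor on the other side can be closer. A hedged user-space sketch of the same idea over a sorted array, with binary search standing in for the rb-tree descent (the threshold constant is illustrative, not the real BFQQ_CLOSE_THR value):

#include <stdio.h>

#define CLOSE_THR 16ULL /* illustrative stand-in for BFQQ_CLOSE_THR */

typedef unsigned long long sector_t;

static unsigned long long sdiff(sector_t a, sector_t b)
{
	return a > b ? a - b : b - a;
}

/* Return the index of an entry within CLOSE_THR of 'sector', or -1.
 * As in bfqq_find_close(), only the insertion point and its
 * immediate neighbor can be the closest entries. */
static int find_close(const sector_t *keys, int n, sector_t sector)
{
	int lo = 0, hi = n;

	while (lo < hi) { /* binary search: lo becomes insertion point */
		int mid = (lo + hi) / 2;

		if (keys[mid] < sector)
			lo = mid + 1;
		else
			hi = mid;
	}
	if (lo < n && sdiff(keys[lo], sector) <= CLOSE_THR)
		return lo;
	if (lo > 0 && sdiff(keys[lo - 1], sector) <= CLOSE_THR)
		return lo - 1;
	return -1;
}

int main(void)
{
	sector_t next_rq_pos[] = { 100, 200, 1000 };

	printf("%d\n", find_close(next_rq_pos, 3, 205)); /* 1 */
	printf("%d\n", find_close(next_rq_pos, 3, 500)); /* -1 */
	return 0;
}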
++ */ ++ if (!bfqq_process_refs(new_bfqq)) ++ return NULL; ++ ++ /* Avoid a circular list and skip interim queue merges. */ ++ while ((__bfqq = new_bfqq->new_bfqq)) { ++ if (__bfqq == bfqq) ++ return NULL; ++ new_bfqq = __bfqq; ++ } ++ ++ process_refs = bfqq_process_refs(bfqq); ++ new_process_refs = bfqq_process_refs(new_bfqq); ++ /* ++ * If the process for the bfqq has gone away, there is no ++ * sense in merging the queues. ++ */ ++ if (process_refs == 0 || new_process_refs == 0) ++ return NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", ++ new_bfqq->pid); ++ ++ /* ++ * Merging is just a redirection: the requests of the process ++ * owning one of the two queues are redirected to the other queue. ++ * The latter queue, in turn, is set as shared if this is the ++ * first time that the requests of some process are redirected to ++ * it. ++ * ++ * We redirect bfqq to new_bfqq and not the opposite, because ++ * we are in the context of the process owning bfqq, thus we ++ * have the io_cq of this process. So we can immediately ++ * configure this io_cq to redirect the requests of the ++ * process to new_bfqq. In contrast, the io_cq of new_bfqq is ++ * not available any more (new_bfqq->bic == NULL). ++ * ++ * Anyway, even in case new_bfqq coincides with the in-service ++ * queue, redirecting requests to the in-service queue is the ++ * best option, as we feed the in-service queue with new ++ * requests close to the last request served and, by doing so, ++ * are likely to increase the throughput. ++ */ ++ bfqq->new_bfqq = new_bfqq; ++ new_bfqq->ref += process_refs; ++ return new_bfqq; ++} ++ ++static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, ++ struct bfq_queue *new_bfqq) ++{ ++ if (bfq_too_late_for_merging(new_bfqq)) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "too late for bfq%d to be merged", ++ new_bfqq->pid); ++ return false; ++ } ++ ++ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || ++ (bfqq->ioprio_class != new_bfqq->ioprio_class)) ++ return false; ++ ++ /* ++ * If either of the queues has already been detected as seeky, ++ * then merging it with the other queue is unlikely to lead to ++ * sequential I/O. ++ */ ++ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) ++ return false; ++ ++ /* ++ * Interleaved I/O is known to be done by (some) applications ++ * only for reads, so it does not make sense to merge async ++ * queues. ++ */ ++ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * Attempt to schedule a merge of bfqq with the currently in-service ++ * queue or with a close queue among the scheduled queues. Return ++ * NULL if no merge was scheduled, a pointer to the shared bfq_queue ++ * structure otherwise. ++ * ++ * The OOM queue is not allowed to participate in cooperation: in fact, since ++ * the requests temporarily redirected to the OOM queue could be redirected ++ * again to dedicated queues at any time, the state needed to correctly ++ * handle merging with the OOM queue would be quite complex and expensive ++ * to maintain. Besides, in a condition as critical as out of memory, ++ * the benefits of queue merging may be of little relevance, or even negligible.
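The chain walk at the top of bfq_setup_merge() is worth isolating: merges are transitive (bfqq -> new_bfqq -> ...), so the function follows ->new_bfqq to the end of the chain and bails out if it would close a cycle back to bfqq. A sketch of just that traversal, under simplifying assumptions (a bare struct, no reference counting):

#include <stdio.h>
#include <stddef.h>

struct queue {
	const char *name;
	struct queue *new_queue; /* merge target, like bfqq->new_bfqq */
};

/* Follow the merge chain from 'target'; return the final target,
 * or NULL if the chain loops back to 'q' (circular merge). */
static struct queue *chain_end(struct queue *q, struct queue *target)
{
	struct queue *next;

	while ((next = target->new_queue)) {
		if (next == q)
			return NULL;
		target = next;
	}
	return target;
}

int main(void)
{
	struct queue a = { "a", NULL }, b = { "b", NULL }, c = { "c", NULL };

	b.new_queue = &c; /* b was already scheduled to merge into c */
	printf("%s\n", chain_end(&a, &b)->name);   /* c */

	c.new_queue = &a; /* ... and c into a: would be circular */
	printf("%p\n", (void *)chain_end(&a, &b)); /* (nil) */
	return 0;
}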
++ * ++ * WARNING: queue merging may impair fairness among non-weight raised ++ * queues, for at least two reasons: 1) the original weight of a ++ * merged queue may change during the merged state, 2) even if the ++ * weight remains the same, a merged queue may be bloated with many more ++ * requests than the ones produced by its originally-associated ++ * process. ++ */ ++static struct bfq_queue * ++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ void *io_struct, bool request) ++{ ++ struct bfq_queue *in_service_bfqq, *new_bfqq; ++ ++ /* ++ * Prevent bfqq from being merged if it has been created too ++ * long ago. The idea is that true cooperating processes, and ++ * thus their associated bfq_queues, are supposed to be ++ * created shortly after each other. This is the case, e.g., ++ * for KVM/QEMU and dump I/O threads. Based on this ++ * assumption, the following filtering greatly reduces the ++ * probability that two non-cooperating processes, which just ++ * happen to do close I/O for some short time interval, have ++ * their queues merged by mistake. ++ */ ++ if (bfq_too_late_for_merging(bfqq)) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "would have looked for coop, but too late"); ++ return NULL; ++ } ++ ++ if (bfqq->new_bfqq) ++ return bfqq->new_bfqq; ++ ++ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) ++ return NULL; ++ ++ /* If there is only one backlogged queue, don't search. */ ++ if (bfq_tot_busy_queues(bfqd) == 1) ++ return NULL; ++ ++ in_service_bfqq = bfqd->in_service_queue; ++ ++ if (in_service_bfqq && in_service_bfqq != bfqq && ++ likely(in_service_bfqq != &bfqd->oom_bfqq) && ++ bfq_rq_close_to_sector(io_struct, request, bfqd->in_serv_last_pos) && ++ bfqq->entity.parent == in_service_bfqq->entity.parent && ++ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { ++ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); ++ if (new_bfqq) ++ return new_bfqq; ++ } ++ /* ++ * Check whether there is a cooperator among currently scheduled ++ * queues. The only thing we need is that the bio/request is not ++ * NULL, as we need it to establish whether a cooperator exists. ++ */ ++ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, ++ bfq_io_struct_pos(io_struct, request)); ++ ++ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); ++ ++ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && ++ bfq_may_be_close_cooperator(bfqq, new_bfqq)) ++ return bfq_setup_merge(bfqq, new_bfqq); ++ ++ return NULL; ++} ++ ++static void bfq_bfqq_save_state(struct bfq_queue *bfqq) ++{ ++ struct bfq_io_cq *bic = bfqq->bic; ++ ++ /* ++ * If !bfqq->bic, the queue is already shared or its requests ++ * have already been redirected to a shared queue; both idle window ++ * and weight raising state have already been saved. Do nothing. ++ */ ++ if (!bic) ++ return; ++ ++ bic->saved_ttime = bfqq->ttime; ++ bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); ++ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); ++ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); ++ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); ++ if (unlikely(bfq_bfqq_just_created(bfqq) && ++ !bfq_bfqq_in_large_burst(bfqq) && ++ bfqq->bfqd->low_latency)) { ++ /* ++ * bfqq being merged right after being created: bfqq ++ * would have deserved interactive weight raising, but ++ * did not make it to be set in a weight-raised state, ++ * because of this early merge.
Store directly the ++ * weight-raising state that would have been assigned ++ * to bfqq, so that to avoid that bfqq unjustly fails ++ * to enjoy weight raising if split soon. ++ */ ++ bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; ++ bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); ++ bic->saved_last_wr_start_finish = jiffies; ++ } else { ++ bic->saved_wr_coeff = bfqq->wr_coeff; ++ bic->saved_wr_start_at_switch_to_srt = ++ bfqq->wr_start_at_switch_to_srt; ++ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; ++ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; ++ } ++ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "bic %p wr_coeff %d start_finish %lu max_time %lu", ++ bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, ++ bfqq->wr_cur_max_time); ++} ++ ++static void ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, ++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) ++{ ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", ++ (unsigned long) new_bfqq->pid); ++ BUG_ON(bfqq->bic && bfqq->bic == new_bfqq->bic); ++ /* Save weight raising and idle window of the merged queues */ ++ bfq_bfqq_save_state(bfqq); ++ bfq_bfqq_save_state(new_bfqq); ++ ++ if (bfq_bfqq_IO_bound(bfqq)) ++ bfq_mark_bfqq_IO_bound(new_bfqq); ++ bfq_clear_bfqq_IO_bound(bfqq); ++ ++ /* ++ * If bfqq is weight-raised, then let new_bfqq inherit ++ * weight-raising. To reduce false positives, neglect the case ++ * where bfqq has just been created, but has not yet made it ++ * to be weight-raised (which may happen because EQM may merge ++ * bfqq even before bfq_add_request is executed for the first ++ * time for bfqq). Handling this case would however be very ++ * easy, thanks to the flag just_created. ++ */ ++ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { ++ new_bfqq->wr_coeff = bfqq->wr_coeff; ++ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; ++ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; ++ new_bfqq->wr_start_at_switch_to_srt = ++ bfqq->wr_start_at_switch_to_srt; ++ if (bfq_bfqq_busy(new_bfqq)) { ++ bfqd->wr_busy_queues++; ++ BUG_ON(bfqd->wr_busy_queues > ++ bfq_tot_busy_queues(bfqd)); ++ } ++ ++ new_bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqd, new_bfqq, ++ "wr start after merge with %d, rais_max_time %u", ++ bfqq->pid, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ ++ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ ++ bfqq->wr_coeff = 1; ++ bfqq->entity.prio_changed = 1; ++ if (bfq_bfqq_busy(bfqq)) { ++ bfqd->wr_busy_queues--; ++ BUG_ON(bfqd->wr_busy_queues < 0); ++ } ++ ++ } ++ ++ bfq_log_bfqq(bfqd, new_bfqq, "wr_busy %d", ++ bfqd->wr_busy_queues); ++ ++ /* ++ * Merge queues (that is, let bic redirect its requests to new_bfqq) ++ */ ++ bic_set_bfqq(bic, new_bfqq, 1); ++ bfq_mark_bfqq_coop(new_bfqq); ++ /* ++ * new_bfqq now belongs to at least two bics (it is a shared queue): ++ * set new_bfqq->bic to NULL. bfqq either: ++ * - does not belong to any bic any more, and hence bfqq->bic must ++ * be set to NULL, or ++ * - is a queue whose owning bics have already been redirected to a ++ * different queue, hence the queue is destined to not belong to ++ * any bic soon and bfqq->bic is already NULL (therefore the next ++ * assignment causes no harm). 
++ */ ++ new_bfqq->bic = NULL; ++ bfqq->bic = NULL; ++ /* release process reference to bfqq */ ++ bfq_put_queue(bfqq); ++} ++ ++static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, ++ struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ bool is_sync = op_is_sync(bio->bi_opf); ++ struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq; ++ ++ assert_spin_locked(&bfqd->lock); ++ /* ++ * Disallow merge of a sync bio into an async request. ++ */ ++ if (is_sync && !rq_is_sync(rq)) ++ return false; ++ ++ /* ++ * Look up the bfqq that this bio will be queued with. Allow ++ * merge only if rq is queued there. ++ */ ++ BUG_ON(!bfqd->bio_bfqq_set); ++ if (!bfqq) ++ return false; ++ ++ /* ++ * We take advantage of this function to perform an early merge ++ * of the queues of possible cooperating processes. ++ */ ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); ++ BUG_ON(new_bfqq == bfqq); ++ if (new_bfqq) { ++ /* ++ * If bic still points to bfqq, then it has not yet been ++ * redirected to some other bfq_queue, and a queue ++ * merge between bfqq and new_bfqq can be safely ++ * fulfilled, i.e., bic can be redirected to new_bfqq ++ * and bfqq can be put. ++ */ ++ bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq, ++ new_bfqq); ++ /* ++ * If we get here, bio will be queued into new_bfqq, ++ * so use new_bfqq to decide whether bio and rq can be ++ * merged. ++ */ ++ bfqq = new_bfqq; ++ ++ /* ++ * Change also bfqd->bio_bfqq, as ++ * bfqd->bio_bic now points to new_bfqq, and ++ * this function may be invoked again (and then may ++ * use again bfqd->bio_bfqq). ++ */ ++ bfqd->bio_bfqq = bfqq; ++ } ++ return bfqq == RQ_BFQQ(rq); ++} ++ ++/* ++ * Set the maximum time for the in-service queue to consume its ++ * budget. This prevents seeky processes from lowering the throughput. ++ * In practice, a time-slice service scheme is used with seeky ++ * processes. ++ */ ++static void bfq_set_budget_timeout(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ unsigned int timeout_coeff; ++ ++ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) ++ timeout_coeff = 1; ++ else ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; ++ ++ bfqd->last_budget_start = ktime_get(); ++ ++ bfqq->budget_timeout = jiffies + ++ bfqd->bfq_timeout * timeout_coeff; ++ ++ bfq_log_bfqq(bfqd, bfqq, "%u", ++ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); ++} ++ ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ if (bfqq) { ++ bfq_clear_bfqq_fifo_expire(bfqq); ++ ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; ++ ++ BUG_ON(bfqq == bfqd->in_service_queue); ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ if (time_is_before_jiffies(bfqq->last_wr_start_finish) && ++ bfqq->wr_coeff > 1 && ++ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && ++ time_is_before_jiffies(bfqq->budget_timeout)) { ++ /* ++ * For soft real-time queues, move the start ++ * of the weight-raising period forward by the ++ * time the queue has not received any ++ * service. Otherwise, a relatively long ++ * service delay is likely to cause the ++ * weight-raising period of the queue to end, ++ * because of the short duration of the ++ * weight-raising period of a soft real-time ++ * queue. It is worth noting that this move ++ * is not so dangerous for the other queues, ++ * because soft real-time queues are not ++ * greedy.
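The budget-timeout rule in bfq_set_budget_timeout() above is simple arithmetic: weight-raised queues, except soft real-time ones, get their slice stretched by the ratio weight/orig_weight. A stand-alone sketch of the computation (plain integers; the 125 ms base and coefficient 30 are illustrative values, not necessarily the driver's defaults):

#include <stdio.h>

/* Slice length granted to the in-service queue, as in
 * bfq_set_budget_timeout(): soft real-time weight raising keeps
 * the base timeout, other weight raising scales it by
 * entity.weight / entity.orig_weight. */
static unsigned int slice_ms(unsigned int base_timeout_ms,
			     unsigned int weight, unsigned int orig_weight,
			     int soft_rt)
{
	unsigned int coeff = soft_rt ? 1 : weight / orig_weight;

	return base_timeout_ms * coeff;
}

int main(void)
{
	/* interactive raising: weight 30x the original */
	printf("%u ms\n", slice_ms(125, 30 * 10, 10, 0)); /* 3750 ms */
	/* soft real-time raising: slice not stretched */
	printf("%u ms\n", slice_ms(125, 30 * 10, 10, 1)); /* 125 ms */
	return 0;
}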
++ * ++ * To not add a further variable, we use the ++ * overloaded field budget_timeout to ++ * determine for how long the queue has not ++ * received service, i.e., how much time has ++ * elapsed since the queue expired. However, ++ * this is a little imprecise, because ++ * budget_timeout is set to jiffies if bfqq ++ * not only expires, but also remains with no ++ * request. ++ */ ++ if (time_after(bfqq->budget_timeout, ++ bfqq->last_wr_start_finish)) ++ bfqq->last_wr_start_finish += ++ jiffies - bfqq->budget_timeout; ++ else ++ bfqq->last_wr_start_finish = jiffies; ++ ++ if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { ++ pr_crit( ++ "BFQ WARNING:last %lu budget %lu jiffies %lu", ++ bfqq->last_wr_start_finish, ++ bfqq->budget_timeout, ++ jiffies); ++ pr_crit("diff %lu", jiffies - ++ max_t(unsigned long, ++ bfqq->last_wr_start_finish, ++ bfqq->budget_timeout)); ++ bfqq->last_wr_start_finish = jiffies; ++ } ++ } ++ ++ bfq_set_budget_timeout(bfqd, bfqq); ++ bfq_log_bfqq(bfqd, bfqq, ++ "cur-budget = %d prio_class %d", ++ bfqq->entity.budget, bfqq->ioprio_class); ++ } else ++ bfq_log(bfqd, "NULL"); ++ ++ bfqd->in_service_queue = bfqq; ++} ++ ++/* ++ * Get and set a new queue for service. ++ */ ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); ++ ++ __bfq_set_in_service_queue(bfqd, bfqq); ++ return bfqq; ++} ++ ++static void bfq_arm_slice_timer(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfqd->in_service_queue; ++ u32 sl; ++ ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ bfq_mark_bfqq_wait_request(bfqq); ++ ++ /* ++ * We don't want to idle for seeks, but we do want to allow ++ * fair distribution of slice time for a process doing back-to-back ++ * seeks. So allow a little bit of time for it to submit a new rq. ++ * ++ * To prevent processes with (partly) seeky workloads from ++ * being too ill-treated, grant them a small fraction of the ++ * assigned budget before reducing the waiting time to ++ * BFQ_MIN_TT. This happened to help reduce latency. ++ */ ++ sl = bfqd->bfq_slice_idle; ++ /* ++ * Unless the queue is being weight-raised or the scenario is ++ * asymmetric, grant only minimum idle time if the queue ++ * is seeky. Long idling is preserved for a weight-raised ++ * queue, or, more generally, in an asymmetric scenario, ++ * because long idling is needed to guarantee a queue ++ * its reserved share of the throughput (in particular, it is ++ * needed if the queue has a higher weight than some other ++ * queue). ++ */ ++ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && ++ bfq_symmetric_scenario(bfqd)) ++ sl = min_t(u32, sl, BFQ_MIN_TT); ++ ++ bfqd->last_idling_start = ktime_get(); ++ hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), ++ HRTIMER_MODE_REL); ++ bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); ++ bfq_log(bfqd, "arm idle: %ld/%ld ms", ++ sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); ++} ++ ++/* ++ * In autotuning mode, max_budget is dynamically recomputed as the ++ * number of sectors transferred in timeout at the estimated peak ++ * rate. This enables BFQ to utilize a full timeslice with a full ++ * budget, even if the in-service queue is served at peak rate. And ++ * this maximises throughput with sequential workloads.
++ */ ++static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) ++{ ++ return (u64)bfqd->peak_rate * USEC_PER_MSEC * ++ jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; ++} ++ ++/* ++ * Update parameters related to throughput and responsiveness, as a ++ * function of the estimated peak rate. See comments on ++ * bfq_calc_max_budget(), and on the ref_wr_duration array. ++ */ ++static void update_thr_responsiveness_params(struct bfq_data *bfqd) ++{ ++ if (bfqd->bfq_user_max_budget == 0) { ++ bfqd->bfq_max_budget = ++ bfq_calc_max_budget(bfqd); ++ BUG_ON(bfqd->bfq_max_budget < 0); ++ bfq_log(bfqd, "new max_budget = %d", ++ bfqd->bfq_max_budget); ++ } ++} ++ ++static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) ++{ ++ if (rq != NULL) { /* new rq dispatch now, reset accordingly */ ++ bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns(); ++ bfqd->peak_rate_samples = 1; ++ bfqd->sequential_samples = 0; ++ bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = ++ blk_rq_sectors(rq); ++ } else /* no new rq dispatched, just reset the number of samples */ ++ bfqd->peak_rate_samples = 0; /* full re-init on next disp. */ ++ ++ bfq_log(bfqd, ++ "at end, sample %u/%u tot_sects %llu", ++ bfqd->peak_rate_samples, bfqd->sequential_samples, ++ bfqd->tot_sectors_dispatched); ++} ++ ++static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) ++{ ++ u32 rate, weight, divisor; ++ ++ /* ++ * For the convergence property to hold (see comments on ++ * bfq_update_peak_rate()) and for the assessment to be ++ * reliable, a minimum number of samples must be present, and ++ * a minimum amount of time must have elapsed. If not so, do ++ * not compute new rate. Just reset parameters, to get ready ++ * for a new evaluation attempt. ++ */ ++ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || ++ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { ++ bfq_log(bfqd, ++ "only resetting, delta_first %lluus samples %d", ++ bfqd->delta_from_first>>10, bfqd->peak_rate_samples); ++ goto reset_computation; ++ } ++ ++ /* ++ * If a new request completion has occurred after last ++ * dispatch, then, to approximate the rate at which requests ++ * have been served by the device, it is more precise to ++ * extend the observation interval to the last completion. ++ */ ++ bfqd->delta_from_first = ++ max_t(u64, bfqd->delta_from_first, ++ bfqd->last_completion - bfqd->first_dispatch); ++ ++ BUG_ON(bfqd->delta_from_first == 0); ++ /* ++ * Rate computed in sects/usec, and not sects/nsec, for ++ * precision issues. ++ */ ++ rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT, ++ div_u64(bfqd->delta_from_first, NSEC_PER_USEC)); ++ ++ bfq_log(bfqd, ++"tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", ++ bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, ++ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), ++ rate > 20<<BFQ_RATE_SHIFT); ++ ++ /* ++ * Peak rate not updated if: ++ * - the percentage of sequential dispatches is below 3/4 of the ++ * total, and rate is below the current estimated peak rate ++ * - rate is unreasonably high (> 20M sectors/sec) ++ */ ++ if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && ++ rate <= bfqd->peak_rate) || ++ rate > 20<<BFQ_RATE_SHIFT) { ++ bfq_log(bfqd, ++ "goto reset, samples %u/%u rate/peak %llu/%llu", ++ bfqd->peak_rate_samples, bfqd->sequential_samples, ++ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), ++ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); ++ goto reset_computation; ++ } else { ++ bfq_log(bfqd, ++ "do update, samples %u/%u rate/peak %llu/%llu", ++ bfqd->peak_rate_samples, bfqd->sequential_samples, ++ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), ++ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); ++ } ++ ++ /* ++ * We have to update the peak rate, at last! To this purpose, ++ * we use a low-pass filter.
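Both bfq_calc_max_budget() and the rate computation above rely on one fixed-point convention: rates are kept in sectors per microsecond, left-shifted by BFQ_RATE_SHIFT fractional bits. A user-space sketch of the round trip (the shift value of 16 matches the mainline driver as far as I can tell, but treat it as an assumption here):

#include <stdio.h>
#include <stdint.h>

#define RATE_SHIFT 16 /* fractional bits; stand-in for BFQ_RATE_SHIFT */

/* Fixed-point dispatch rate in (sectors/usec) << RATE_SHIFT,
 * mirroring: rate = div64_ul(tot_sectors << BFQ_RATE_SHIFT,
 *                            delta_ns / NSEC_PER_USEC) */
static uint64_t rate_fp(uint64_t tot_sectors, uint64_t delta_ns)
{
	uint64_t delta_us = delta_ns / 1000;

	return (tot_sectors << RATE_SHIFT) / delta_us;
}

/* Back to a human-readable sectors/sec figure, as the bfq_log()
 * calls do with (USEC_PER_SEC * rate) >> BFQ_RATE_SHIFT. */
static uint64_t rate_sects_per_sec(uint64_t rate)
{
	return (1000000ULL * rate) >> RATE_SHIFT;
}

int main(void)
{
	/* 200000 sectors (~100 MB) dispatched over 500 ms */
	uint64_t r = rate_fp(200000, 500ULL * 1000 * 1000);

	printf("%llu sectors/sec\n",
	       (unsigned long long)rate_sects_per_sec(r)); /* ~400000 */
	return 0;
}

The fractional bits are what let the 20M sectors/sec sanity bound above be expressed as the compact constant 20<<BFQ_RATE_SHIFT.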
We compute the smoothing constant ++ * of the filter as a function of the 'weight' of the new ++ * measured rate. ++ * ++ * As can be seen in the next formulas, we define this weight as a ++ * quantity proportional to how sequential the workload is, ++ * and to how long the observation time interval is. ++ * ++ * The weight runs from 0 to 8. The maximum value of the ++ * weight, 8, yields the minimum value for the smoothing ++ * constant. At this minimum value for the smoothing constant, ++ * the measured rate contributes half of the next value of ++ * the estimated peak rate. ++ * ++ * So, the first step is to compute the weight as a function ++ * of how sequential the workload is. Note that the weight ++ * cannot reach 9, because bfqd->sequential_samples cannot ++ * become equal to bfqd->peak_rate_samples, which, in turn, ++ * holds true because bfqd->sequential_samples is not ++ * incremented for the first sample. ++ */ ++ weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; ++ ++ /* ++ * Second step: further refine the weight as a function of the ++ * duration of the observation interval. ++ */ ++ weight = min_t(u32, 8, ++ div_u64(weight * bfqd->delta_from_first, ++ BFQ_RATE_REF_INTERVAL)); ++ ++ /* ++ * Divisor ranging from 10, for minimum weight, to 2, for ++ * maximum weight. ++ */ ++ divisor = 10 - weight; ++ BUG_ON(divisor == 0); ++ ++ /* ++ * Finally, update peak rate: ++ * ++ * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor ++ */ ++ bfqd->peak_rate *= divisor-1; ++ bfqd->peak_rate /= divisor; ++ rate /= divisor; /* smoothing constant alpha = 1/divisor */ ++ ++ bfq_log(bfqd, ++ "divisor %d tmp_peak_rate %llu tmp_rate %u", ++ divisor, ++ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), ++ (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); ++ ++ BUG_ON(bfqd->peak_rate == 0); ++ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); ++ ++ bfqd->peak_rate += rate; ++ ++ /* ++ * For a very slow device, bfqd->peak_rate can reach 0 (see ++ * the minimum representable values reported in the comments ++ * on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid ++ * divisions by zero where bfqd->peak_rate is used as a ++ * divisor. ++ */ ++ bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate); ++ ++ update_thr_responsiveness_params(bfqd); ++ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); ++ ++reset_computation: ++ bfq_reset_rate_computation(bfqd, rq); ++} ++ ++/* ++ * Update the read/write peak rate (the main quantity used for ++ * auto-tuning, see update_thr_responsiveness_params()). ++ * ++ * It is not trivial to estimate the peak rate (correctly): because of ++ * the presence of sw and hw queues between the scheduler and the ++ * device components that finally serve I/O requests, it is hard to ++ * say exactly when a given dispatched request is served inside the ++ * device, and for how long. As a consequence, it is hard to know ++ * precisely at what rate a given set of requests is actually served ++ * by the device. ++ * ++ * On the opposite end, the dispatch time of any request is trivially ++ * available, and, from this piece of information, the "dispatch rate" ++ * of requests can be immediately computed. So, the idea in the next ++ * function is to use what is known, namely request dispatch times ++ * (plus, when useful, request completion times), to estimate what is ++ * unknown, namely in-device request service rate. ++ * ++ * The main issue is that, because of the above facts, the rate at ++ * which a certain set of requests is dispatched over a certain time ++ * interval can vary greatly with respect to the rate at which the ++ * same requests are then served. But, since the size of any ++ * intermediate queue is limited, and the service scheme is lossless ++ * (no request is silently dropped), the following obvious convergence ++ * property holds: the number of requests dispatched MUST become ++ * closer and closer to the number of requests completed as the ++ * observation interval grows. This is the key property used in ++ * the next function to estimate the peak service rate as a function ++ * of the observed dispatch rate. The function assumes to be invoked ++ * on every request dispatch. ++ */ ++static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) ++{ ++ u64 now_ns = ktime_get_ns(); ++ ++ if (bfqd->peak_rate_samples == 0) { /* first dispatch */ ++ bfq_log(bfqd, ++ "goto reset, samples %d", ++ bfqd->peak_rate_samples); ++ bfq_reset_rate_computation(bfqd, rq); ++ goto update_last_values; /* will add one sample */ ++ } ++ ++ /* ++ * Device idle for very long: the observation interval lasting ++ * up to this dispatch cannot be a valid observation interval ++ * for computing a new peak rate (similarly to the late- ++ * completion event in bfq_completed_request()).
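The filter above is a plain exponentially weighted moving average with a variable coefficient: peak = peak*(d-1)/d + rate/d, where the divisor d runs from 10 (distrust the sample) down to 2 (the sample counts for half). A minimal user-space sketch under those same formulas (illustrative numbers):

#include <stdio.h>
#include <stdint.h>

/* One low-pass filter step, as in bfq_update_rate_reset():
 * weight in [0..8] grows with how sequential and how long the
 * observation interval was; divisor = 10 - weight, so the
 * smoothing constant alpha = 1/divisor ranges from 1/10 to 1/2. */
static uint32_t filter_step(uint32_t peak, uint32_t sample, uint32_t weight)
{
	uint32_t divisor = 10 - weight;

	peak = peak * (divisor - 1) / divisor + sample / divisor;
	return peak ? peak : 1; /* push to 1 to avoid a zero divisor later */
}

int main(void)
{
	uint32_t peak = 1000;

	/* untrusted sample (weight 0): moves the estimate by 1/10 */
	printf("%u\n", filter_step(peak, 2000, 0)); /* 1100 */
	/* fully trusted sample (weight 8): moves it by 1/2 */
	printf("%u\n", filter_step(peak, 2000, 8)); /* 1500 */
	return 0;
}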
Go to ++ * update_rate_and_reset to have the following three steps ++ * taken: ++ * - close the observation interval at the last (previous) ++ * request dispatch or completion ++ * - compute rate, if possible, for that observation interval ++ * - start a new observation interval with this dispatch ++ */ ++ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && ++ bfqd->rq_in_driver == 0) { ++ bfq_log(bfqd, ++"jumping to updating&resetting delta_last %lluus samples %d", ++ (now_ns - bfqd->last_dispatch)>>10, ++ bfqd->peak_rate_samples) ; ++ goto update_rate_and_reset; ++ } ++ ++ /* Update sampling information */ ++ bfqd->peak_rate_samples++; ++ ++ if ((bfqd->rq_in_driver > 0 || ++ now_ns - bfqd->last_completion < BFQ_MIN_TT) ++ && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq)) ++ bfqd->sequential_samples++; ++ ++ bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); ++ ++ /* Reset max observed rq size every 32 dispatches */ ++ if (likely(bfqd->peak_rate_samples % 32)) ++ bfqd->last_rq_max_size = ++ max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); ++ else ++ bfqd->last_rq_max_size = blk_rq_sectors(rq); ++ ++ bfqd->delta_from_first = now_ns - bfqd->first_dispatch; ++ ++ bfq_log(bfqd, ++ "added samples %u/%u tot_sects %llu delta_first %lluus", ++ bfqd->peak_rate_samples, bfqd->sequential_samples, ++ bfqd->tot_sectors_dispatched, ++ bfqd->delta_from_first>>10); ++ ++ /* Target observation interval not yet reached, go on sampling */ ++ if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) ++ goto update_last_values; ++ ++update_rate_and_reset: ++ bfq_update_rate_reset(bfqd, rq); ++update_last_values: ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ if (RQ_BFQQ(rq) == bfqd->in_service_queue) ++ bfqd->in_serv_last_pos = bfqd->last_position; ++ bfqd->last_dispatch = now_ns; ++ ++ bfq_log(bfqd, ++ "delta_first %lluus last_pos %llu peak_rate %llu", ++ (now_ns - bfqd->first_dispatch)>>10, ++ (unsigned long long) bfqd->last_position, ++ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); ++ bfq_log(bfqd, ++ "samples at end %d", bfqd->peak_rate_samples); ++} ++ ++/* ++ * Remove request from internal lists. ++ */ ++static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ /* ++ * For consistency, the next instruction should have been ++ * executed after removing the request from the queue and ++ * dispatching it. We execute instead this instruction before ++ * bfq_remove_request() (and hence introduce a temporary ++ * inconsistency), for efficiency. In fact, should this ++ * dispatch occur for a non in-service bfqq, this anticipated ++ * increment prevents two counters related to bfqq->dispatched ++ * from risking to be, first, uselessly decremented, and then ++ * incremented again when the (new) value of bfqq->dispatched ++ * happens to be taken into account. ++ */ ++ bfqq->dispatched++; ++ bfq_update_peak_rate(q->elevator->elevator_data, rq); ++ ++ bfq_remove_request(q, rq); ++} ++ ++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ BUG_ON(bfqq != bfqd->in_service_queue); ++ ++ /* ++ * If this bfqq is shared between multiple processes, check ++ * to make sure that those processes are still issuing I/Os ++ * within the mean seek distance. If not, it may be time to ++ * break the queues apart again. 
++ */ ++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) ++ bfq_mark_bfqq_split_coop(bfqq); ++ ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ if (bfqq->dispatched == 0) ++ /* ++ * Overloading budget_timeout field to store ++ * the time at which the queue remains with no ++ * backlog and no outstanding request; used by ++ * the weight-raising mechanism. ++ */ ++ bfqq->budget_timeout = jiffies; ++ ++ bfq_del_bfqq_busy(bfqd, bfqq, true); ++ } else { ++ bfq_requeue_bfqq(bfqd, bfqq, true); ++ /* ++ * Resort priority tree of potential close cooperators. ++ */ ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ } ++ ++ /* ++ * All in-service entities must have been properly deactivated ++ * or requeued before executing the next function, which ++ * resets all in-service entities as no longer in service. ++ */ ++ __bfq_bfqd_reset_in_service(bfqd); ++} ++ ++/** ++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. ++ * @bfqd: device data. ++ * @bfqq: queue to update. ++ * @reason: reason for expiration. ++ * ++ * Handle the feedback on @bfqq budget at queue expiration. ++ * See the body for detailed comments. ++ */ ++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ enum bfqq_expiration reason) ++{ ++ struct request *next_rq; ++ int budget, min_budget; ++ ++ BUG_ON(bfqq != bfqd->in_service_queue); ++ ++ min_budget = bfq_min_budget(bfqd); ++ ++ if (bfqq->wr_coeff == 1) ++ budget = bfqq->max_budget; ++ else /* ++ * Use a constant, low budget for weight-raised queues, ++ * to help achieve a low latency. Keep it slightly higher ++ * than the minimum possible budget, to cause a little ++ * bit fewer expirations. ++ */ ++ budget = 2 * min_budget; ++ ++ bfq_log_bfqq(bfqd, bfqq, "last budg %d, budg left %d", ++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); ++ bfq_log_bfqq(bfqd, bfqq, "last max_budg %d, min budg %d", ++ budget, bfq_min_budget(bfqd)); ++ bfq_log_bfqq(bfqd, bfqq, "sync %d, seeky %d", ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); ++ ++ if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { ++ switch (reason) { ++ /* ++ * Caveat: in all the following cases we trade latency ++ * for throughput. ++ */ ++ case BFQ_BFQQ_TOO_IDLE: ++ /* ++ * This is the only case where we may reduce ++ * the budget: if there is no request of the ++ * process still waiting for completion, then ++ * we assume (tentatively) that the timer has ++ * expired because the batch of requests of ++ * the process could have been served with a ++ * smaller budget. Hence, betting that the ++ * process will behave in the same way when it ++ * becomes backlogged again, we reduce its ++ * next budget. As long as we guess right, ++ * this budget cut reduces the latency ++ * experienced by the process. ++ * ++ * However, if there are still outstanding ++ * requests, then the process may have not yet ++ * issued its next request just because it is ++ * still waiting for the completion of some of ++ * the still outstanding ones. So in this ++ * subcase we do not reduce its budget, on the ++ * contrary we increase it to possibly boost ++ * the throughput, as discussed in the ++ * comments to the BUDGET_TIMEOUT case.
++ */ ++ if (bfqq->dispatched > 0) /* still outstanding reqs */ ++ budget = min(budget * 2, bfqd->bfq_max_budget); ++ else { ++ if (budget > 5 * min_budget) ++ budget -= 4 * min_budget; ++ else ++ budget = min_budget; ++ } ++ break; ++ case BFQ_BFQQ_BUDGET_TIMEOUT: ++ /* ++ * We double the budget here because it gives ++ * the chance to boost the throughput if this ++ * is not a seeky process (and has bumped into ++ * this timeout because of, e.g., ZBR). ++ */ ++ budget = min(budget * 2, bfqd->bfq_max_budget); ++ break; ++ case BFQ_BFQQ_BUDGET_EXHAUSTED: ++ /* ++ * The process still has backlog, and did not ++ * let either the budget timeout or the disk ++ * idling timeout expire. Hence it is not ++ * seeky, has a short thinktime and may be ++ * happy with a higher budget too. So ++ * definitely increase the budget of this good ++ * candidate to boost the disk throughput. ++ */ ++ budget = min(budget * 4, bfqd->bfq_max_budget); ++ break; ++ case BFQ_BFQQ_NO_MORE_REQUESTS: ++ /* ++ * For queues that expire for this reason, it ++ * is particularly important to keep the ++ * budget close to the actual service they ++ * need. Doing so reduces the timestamp ++ * misalignment problem described in the ++ * comments in the body of ++ * __bfq_activate_entity. In fact, suppose ++ * that a queue systematically expires for ++ * BFQ_BFQQ_NO_MORE_REQUESTS and presents a ++ * new request in time to enjoy timestamp ++ * back-shifting. The larger the budget of the ++ * queue is with respect to the service the ++ * queue actually requests in each service ++ * slot, the more times the queue can be ++ * reactivated with the same virtual finish ++ * time. It follows that, even if this finish ++ * time is pushed to the system virtual time ++ * to reduce the consequent timestamp ++ * misalignment, the queue unjustly enjoys for ++ * many re-activations a lower finish time ++ * than all newly activated queues. ++ * ++ * The service needed by bfqq is measured ++ * quite precisely by bfqq->entity.service. ++ * Since bfqq does not enjoy device idling, ++ * bfqq->entity.service is equal to the number ++ * of sectors that the process associated with ++ * bfqq requested to read/write before waiting ++ * for request completions, or blocking for ++ * other reasons. ++ */ ++ budget = max_t(int, bfqq->entity.service, min_budget); ++ break; ++ default: ++ return; ++ } ++ } else if (!bfq_bfqq_sync(bfqq)) ++ /* ++ * Async queues get always the maximum possible ++ * budget, as for them we do not care about latency ++ * (in addition, their ability to dispatch is limited ++ * by the charging factor). ++ */ ++ budget = bfqd->bfq_max_budget; ++ ++ bfqq->max_budget = budget; ++ ++ if (bfqd->budgets_assigned >= bfq_stats_min_budgets && ++ !bfqd->bfq_user_max_budget) ++ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); ++ ++ /* ++ * If there is still backlog, then assign a new budget, making ++ * sure that it is large enough for the next request. Since ++ * the finish time of bfqq must be kept in sync with the ++ * budget, be sure to call __bfq_bfqq_expire() *after* this ++ * update. ++ * ++ * If there is no backlog, then no need to update the budget; ++ * it will be updated on the arrival of a new request. 
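Stripped of logging and BUG_ONs, the budget feedback in __bfq_bfqq_recalc_budget() is a small decision table. A compact user-space restatement of the four expiration cases, for illustration (max_budget and min_budget become plain parameters; this is a sketch, not the kernel function):

#include <stdio.h>

enum reason { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED, NO_MORE_REQUESTS };

static int min_i(int a, int b) { return a < b ? a : b; }
static int max_i(int a, int b) { return a > b ? a : b; }

/* Next budget for a sync, non-weight-raised queue, following the
 * switch in __bfq_bfqq_recalc_budget(). */
static int next_budget(enum reason r, int budget, int dispatched,
		       int service, int max_budget, int min_budget)
{
	switch (r) {
	case TOO_IDLE:
		if (dispatched > 0) /* still outstanding reqs: grow */
			return min_i(budget * 2, max_budget);
		/* really idle: bet on a smaller budget next time */
		return budget > 5 * min_budget ? budget - 4 * min_budget
					       : min_budget;
	case BUDGET_TIMEOUT:
		return min_i(budget * 2, max_budget);
	case BUDGET_EXHAUSTED: /* good candidate: grow aggressively */
		return min_i(budget * 4, max_budget);
	case NO_MORE_REQUESTS: /* track actual service closely */
		return max_i(service, min_budget);
	}
	return budget;
}

int main(void)
{
	printf("%d\n", next_budget(BUDGET_EXHAUSTED, 100, 0, 0, 1000, 16)); /* 400 */
	printf("%d\n", next_budget(TOO_IDLE, 100, 0, 0, 1000, 16));         /* 36 */
	return 0;
}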
++ */ ++ next_rq = bfqq->next_rq; ++ if (next_rq) { ++ BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || ++ reason == BFQ_BFQQ_NO_MORE_REQUESTS); ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)); ++ BUG_ON(!bfq_bfqq_busy(bfqq)); ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", ++ next_rq ? blk_rq_sectors(next_rq) : 0, ++ bfqq->entity.budget); ++} ++ ++/* ++ * Return true if the process associated with bfqq is "slow". The slow ++ * flag is used, in addition to the budget timeout, to reduce the ++ * amount of service provided to seeky processes, and thus reduce ++ * their chances to lower the throughput. More details in the comments ++ * on the function bfq_bfqq_expire(). ++ * ++ * An important observation is in order: as discussed in the comments ++ * on the function bfq_update_peak_rate(), with devices with internal ++ * queues, it is hard if ever possible to know when and for how long ++ * an I/O request is processed by the device (apart from the trivial ++ * I/O pattern where a new request is dispatched only after the ++ * previous one has been completed). This makes it hard to evaluate ++ * the real rate at which the I/O requests of each bfq_queue are ++ * served. In fact, for an I/O scheduler like BFQ, serving a ++ * bfq_queue means just dispatching its requests during its service ++ * slot (i.e., until the budget of the queue is exhausted, or the ++ * queue remains idle, or, finally, a timeout fires). But, during the ++ * service slot of a bfq_queue, around 100 ms at most, the device may ++ * be even still processing requests of bfq_queues served in previous ++ * service slots. On the opposite end, the requests of the in-service ++ * bfq_queue may be completed after the service slot of the queue ++ * finishes. ++ * ++ * Anyway, unless more sophisticated solutions are used ++ * (where possible), the sum of the sizes of the requests dispatched ++ * during the service slot of a bfq_queue is probably the only ++ * approximation available for the service received by the bfq_queue ++ * during its service slot. And this sum is the quantity used in this ++ * function to evaluate the I/O speed of a process. ++ */ ++static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ bool compensate, enum bfqq_expiration reason, ++ unsigned long *delta_ms) ++{ ++ ktime_t delta_ktime; ++ u32 delta_usecs; ++ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ ++ ++ if (!bfq_bfqq_sync(bfqq)) ++ return false; ++ ++ if (compensate) ++ delta_ktime = bfqd->last_idling_start; ++ else ++ delta_ktime = ktime_get(); ++ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); ++ delta_usecs = ktime_to_us(delta_ktime); ++ ++ /* don't use too short time intervals */ ++ if (delta_usecs < 1000) { ++ if (blk_queue_nonrot(bfqd->queue)) ++ /* ++ * give same worst-case guarantees as idling ++ * for seeky ++ */ ++ *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; ++ else /* charge at least one seek */ ++ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; ++ ++ bfq_log(bfqd, "too short %u", delta_usecs); ++ ++ return slow; ++ } ++ ++ *delta_ms = delta_usecs / USEC_PER_MSEC; ++ ++ /* ++ * Use only long (> 20ms) intervals to filter out excessive ++ * spikes in service rate estimation. ++ */ ++ if (delta_usecs > 20000) { ++ /* ++ * Caveat for rotational devices: processes doing I/O ++ * in the slower disk zones tend to be slow(er) even ++ * if not seeky. 
In this respect, the estimated peak ++ * rate is likely to be an average over the disk ++ * surface. Accordingly, to not be too harsh with ++ * unlucky processes, a process is deemed slow only if ++ * its rate has been lower than half of the estimated ++ * peak rate. ++ */ ++ slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; ++ bfq_log(bfqd, "relative rate %d/%d", ++ bfqq->entity.service, bfqd->bfq_max_budget); ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, "slow %d", slow); ++ ++ return slow; ++} ++ ++/* ++ * To be deemed as soft real-time, an application must meet two ++ * requirements. First, the application must not require an average ++ * bandwidth higher than the approximate bandwidth required to playback or ++ * record a compressed high-definition video. ++ * The next function is invoked on the completion of the last request of a ++ * batch, to compute the next-start time instant, soft_rt_next_start, such ++ * that, if the next request of the application does not arrive before ++ * soft_rt_next_start, then the above requirement on the bandwidth is met. ++ * ++ * The second requirement is that the request pattern of the application is ++ * isochronous, i.e., that, after issuing a request or a batch of requests, ++ * the application stops issuing new requests until all its pending requests ++ * have been completed. After that, the application may issue a new batch, ++ * and so on. ++ * For this reason the next function is invoked to compute ++ * soft_rt_next_start only for applications that meet this requirement, ++ * whereas soft_rt_next_start is set to infinity for applications that do ++ * not. ++ * ++ * Unfortunately, even a greedy (i.e., I/O-bound) application may ++ * happen to meet, occasionally or systematically, both the above ++ * bandwidth and isochrony requirements. This may happen at least in ++ * the following circumstances. First, if the CPU load is high. The ++ * application may stop issuing requests while the CPUs are busy ++ * serving other processes, then restart, then stop again for a while, ++ * and so on. The other circumstances are related to the storage ++ * device: the storage device is highly loaded or reaches a low-enough ++ * throughput with the I/O of the application (e.g., because the I/O ++ * is random and/or the device is slow). In all these cases, the ++ * I/O of the application may be simply slowed down enough to meet ++ * the bandwidth and isochrony requirements. To reduce the probability ++ * that greedy applications are deemed as soft real-time in these ++ * corner cases, a further rule is used in the computation of ++ * soft_rt_next_start: the return value of this function is forced to ++ * be higher than the maximum between the following two quantities. ++ * ++ * (a) Current time plus: (1) the maximum time for which the arrival ++ * of a request is waited for when a sync queue becomes idle, ++ * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We ++ * postpone for a moment the reason for adding a few extra ++ * jiffies; we get back to it after next item (b). Lower-bounding ++ * the return value of this function with the current time plus ++ * bfqd->bfq_slice_idle tends to filter out greedy applications, ++ * because the latter issue their next request as soon as possible ++ * after the last one has been completed. In contrast, a soft ++ * real-time application spends some time processing data, after a ++ * batch of its requests has been completed. ++ * ++ * (b) Current value of bfqq->soft_rt_next_start. 
As pointed out ++ * above, greedy applications may happen to meet both the ++ * bandwidth and isochrony requirements under heavy CPU or ++ * storage-device load. In more detail, in these scenarios, these ++ * applications happen, only for limited time periods, to do I/O ++ * slowly enough to meet all the requirements described so far, ++ * including the filtering in item (a) above. These slow-speed ++ * time intervals are usually interspersed between other time ++ * intervals during which these applications do I/O at a very high ++ * speed. Fortunately, exactly because of the high speed of the ++ * I/O in the high-speed intervals, the values returned by this ++ * function happen to be so high, near the end of any such ++ * high-speed interval, that they are likely to fall *after* the end of ++ * the low-speed time interval that follows. These high values are ++ * stored in bfqq->soft_rt_next_start after each invocation of ++ * this function. As a consequence, if the last value of ++ * bfqq->soft_rt_next_start is constantly used to lower-bound the ++ * next value that this function may return, then, from the very ++ * beginning of a low-speed interval, bfqq->soft_rt_next_start is ++ * likely to be constantly kept so high that any I/O request ++ * issued during the low-speed interval is considered as arriving ++ * too soon for the application to be deemed as soft ++ * real-time. Then, in the high-speed interval that follows, the ++ * application will not be deemed as soft real-time, just because ++ * it will do I/O at a high speed. And so on. ++ * ++ * Getting back to the filtering in item (a), in the following two ++ * cases this filtering might be easily passed by a greedy ++ * application, if the reference quantity was just ++ * bfqd->bfq_slice_idle: ++ * 1) HZ is so low that the duration of a jiffy is comparable to or ++ * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow ++ * devices with HZ=100. The time granularity may be so coarse ++ * that the approximation, in jiffies, of bfqd->bfq_slice_idle ++ * is rather lower than the exact value. ++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing ++ * for a while, then suddenly 'jump' by several units to recover the lost ++ * increments. This seems to happen, e.g., inside virtual machines. ++ * To address this issue, in the filtering in (a) we do not use as a ++ * reference time interval just bfqd->bfq_slice_idle, but ++ * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the ++ * minimum number of jiffies for which the filter seems to be quite ++ * precise also in embedded systems and KVM/QEMU virtual machines. ++ */ ++static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqd, bfqq, ++"service_blkg %lu soft_rate %u sects/sec interval %u", ++ bfqq->service_from_backlogged, ++ bfqd->bfq_wr_max_softrt_rate, ++ jiffies_to_msecs(HZ * bfqq->service_from_backlogged / ++ bfqd->bfq_wr_max_softrt_rate)); ++ ++ return max3(bfqq->soft_rt_next_start, ++ bfqq->last_idle_bklogged + ++ HZ * bfqq->service_from_backlogged / ++ bfqd->bfq_wr_max_softrt_rate, ++ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); ++} ++ ++static bool bfq_bfqq_injectable(struct bfq_queue *bfqq) ++{ ++ return BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && ++ blk_queue_nonrot(bfqq->bfqd->queue) && ++ bfqq->bfqd->hw_tag; ++} ++ ++/** ++ * bfq_bfqq_expire - expire a queue. ++ * @bfqd: device owning the queue. ++ * @bfqq: the queue to expire.
++ * @compensate: if true, compensate for the time spent idling. ++ * @reason: the reason causing the expiration. ++ * ++ * If the process associated with bfqq does slow I/O (e.g., because it ++ * issues random requests), we charge bfqq with the time it has been ++ * in service instead of the service it has received (see ++ * bfq_bfqq_charge_time for details on how this goal is achieved). As ++ * a consequence, bfqq will typically get higher timestamps upon ++ * reactivation, and hence it will be rescheduled as if it had ++ * received more service than what it has actually received. In the ++ * end, bfqq receives less service in proportion to how slowly its ++ * associated process consumes its budgets (and hence how seriously it ++ * tends to lower the throughput). In addition, this time-charging ++ * strategy guarantees time fairness among slow processes. In ++ * contrast, if the process associated with bfqq is not slow, we ++ * charge bfqq exactly with the service it has received. ++ * ++ * Charging time to the first type of queues and the exact service to ++ * the other has the effect of using the WF2Q+ policy to schedule the ++ * former on a timeslice basis, without violating service domain ++ * guarantees among the latter. ++ */ ++static void bfq_bfqq_expire(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool compensate, ++ enum bfqq_expiration reason) ++{ ++ bool slow; ++ unsigned long delta = 0; ++ struct bfq_entity *entity = &bfqq->entity; ++ int ref; ++ ++ BUG_ON(bfqq != bfqd->in_service_queue); ++ ++ /* ++ * Check whether the process is slow (see bfq_bfqq_is_slow). ++ */ ++ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); ++ ++ /* ++ * As above explained, charge slow (typically seeky) and ++ * timed-out queues with the time and not the service ++ * received, to favor sequential workloads. ++ * ++ * Processes doing I/O in the slower disk zones will tend to ++ * be slow(er) even if not seeky. Therefore, since the ++ * estimated peak rate is actually an average over the disk ++ * surface, these processes may timeout just for bad luck. To ++ * avoid punishing them, do not charge time to processes that ++ * succeeded in consuming at least 2/3 of their budget. This ++ * allows BFQ to preserve enough elasticity to still perform ++ * bandwidth, and not time, distribution with little unlucky ++ * or quasi-sequential processes. ++ */ ++ if (bfqq->wr_coeff == 1 && ++ (slow || ++ (reason == BFQ_BFQQ_BUDGET_TIMEOUT && ++ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) ++ bfq_bfqq_charge_time(bfqd, bfqq, delta); ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ if (reason == BFQ_BFQQ_TOO_IDLE && ++ entity->service <= 2 * entity->budget / 10) ++ bfq_clear_bfqq_IO_bound(bfqq); ++ ++ if (bfqd->low_latency && bfqq->wr_coeff == 1) ++ bfqq->last_wr_start_finish = jiffies; ++ ++ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && ++ RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ /* ++ * If we get here, and there are no outstanding ++ * requests, then the request pattern is isochronous ++ * (see the comments on the function ++ * bfq_bfqq_softrt_next_start()). Thus we can compute ++ * soft_rt_next_start. And we do it, unless bfqq is in ++ * interactive weight raising. We do not do it in the ++ * latter subcase, for the following reason. bfqq may ++ * be conveying the I/O needed to load a soft ++ * real-time application. Such an application will ++ * actually exhibit a soft real-time I/O pattern after ++ * it finally starts doing its job. 
But, if ++ * soft_rt_next_start is computed here for an ++ * interactive bfqq, and bfqq had received a lot of ++ * service before remaining with no outstanding ++ * request (likely to happen on a fast device), then ++ * soft_rt_next_start would be assigned such a high ++ * value that, for a very long time, bfqq would be ++ * prevented from being possibly considered as soft ++ * real time. ++ * ++ * If, instead, the queue still has outstanding ++ * requests, then we have to wait for the completion ++ * of all the outstanding requests to discover whether ++ * the request pattern is actually isochronous. ++ */ ++ BUG_ON(bfq_tot_busy_queues(bfqd) < 1); ++ if (bfqq->dispatched == 0 && ++ bfqq->wr_coeff != bfqd->bfq_wr_coeff) { ++ bfqq->soft_rt_next_start = ++ bfq_bfqq_softrt_next_start(bfqd, bfqq); ++ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", ++ bfqq->soft_rt_next_start); ++ } else if (bfqq->dispatched > 0) { ++ /* ++ * Schedule an update of soft_rt_next_start to when ++ * the task may be discovered to be isochronous. ++ */ ++ bfq_mark_bfqq_softrt_update(bfqq); ++ } ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "expire (%s, slow %d, num_disp %d, short %d, weight %d, serv %d/%d)", ++ reason_name[reason], slow, bfqq->dispatched, ++ bfq_bfqq_has_short_ttime(bfqq), entity->weight, ++ entity->service, entity->budget); ++ ++ /* ++ * Increase, decrease or leave budget unchanged according to ++ * reason. ++ */ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); ++ BUG_ON(bfqq->next_rq == NULL && ++ bfqq->entity.budget < bfqq->entity.service); ++ ref = bfqq->ref; ++ __bfq_bfqq_expire(bfqd, bfqq); ++ ++ if (ref == 1) /* bfqq is gone, no more actions on it */ ++ return; ++ ++ BUG_ON(ref > 1 && ++ !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && ++ !bfq_class_idle(bfqq)); ++ ++ bfqq->injected_service = 0; ++ ++ /* mark bfqq as waiting a request only if a bic still points to it */ ++ if (!bfq_bfqq_busy(bfqq) && ++ reason != BFQ_BFQQ_BUDGET_TIMEOUT && ++ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) { ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ BUG_ON(bfqq->next_rq); ++ bfq_mark_bfqq_non_blocking_wait_rq(bfqq); ++ /* ++ * Not setting service to 0, because, if the next rq ++ * arrives in time, the queue will go on receiving ++ * service with this same budget (as if it never expired) ++ */ ++ } else { ++ entity->service = 0; ++ bfq_log_bfqq(bfqd, bfqq, "resetting service"); ++ } ++ ++ /* ++ * Reset the received-service counter for every parent entity. ++ * Differently from what happens with bfqq->entity.service, ++ * the resetting of this counter never needs to be postponed ++ * for parent entities. In fact, in case bfqq may have a ++ * chance to go on being served using the last, partially ++ * consumed budget, bfqq->entity.service needs to be kept, ++ * because if bfqq then actually goes on being served using ++ * the same budget, the last value of bfqq->entity.service is ++ * needed to properly decrement bfqq->entity.budget by the ++ * portion already consumed. In contrast, it is not necessary ++ * to keep entity->service for parent entities too, because ++ * the bubble up of the new value of bfqq->entity.budget will ++ * make sure that the budgets of parent entities are correct, ++ * even in case bfqq and thus parent entities go on receiving ++ * service with the same budget. 
++ */ ++ entity = entity->parent; ++ for_each_entity(entity) ++ entity->service = 0; ++} ++ ++/* ++ * Budget timeout is not implemented through a dedicated timer, but ++ * just checked on request arrivals and completions, as well as on ++ * idle timer expirations. ++ */ ++static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) ++{ ++ return time_is_before_eq_jiffies(bfqq->budget_timeout); ++} ++ ++/* ++ * If we expire a queue that is actively waiting (i.e., with the ++ * device idled) for the arrival of a new request, then we may incur ++ * the timestamp misalignment problem described in the body of the ++ * function __bfq_activate_entity. Hence we return true only if this ++ * condition does not hold, or if the queue is slow enough to deserve ++ * only to be kicked off for preserving a high throughput. ++ */ ++static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "wait_request %d left %d timeout %d", ++ bfq_bfqq_wait_request(bfqq), ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, ++ bfq_bfqq_budget_timeout(bfqq)); ++ ++ return (!bfq_bfqq_wait_request(bfqq) || ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) ++ && ++ bfq_bfqq_budget_timeout(bfqq); ++} ++ ++static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ bool rot_without_queueing = ++ !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, ++ bfqq_sequential_and_IO_bound, ++ idling_boosts_thr; ++ ++ bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && ++ bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); ++ /* ++ * The next variable takes into account the cases where idling ++ * boosts the throughput. ++ * ++ * The value of the variable is computed considering, first, that ++ * idling is virtually always beneficial for the throughput if: ++ * (a) the device is not NCQ-capable and rotational, or ++ * (b) regardless of the presence of NCQ, the device is rotational and ++ * the request pattern for bfqq is I/O-bound and sequential, or ++ * (c) regardless of whether it is rotational, the device is ++ * not NCQ-capable and the request pattern for bfqq is ++ * I/O-bound and sequential. ++ * ++ * Secondly, and in contrast to the above item (b), idling an ++ * NCQ-capable flash-based device would not boost the ++ * throughput even with sequential I/O; rather it would lower ++ * the throughput in proportion to how fast the device ++ * is. Accordingly, the next variable is true if any of the ++ * above conditions (a), (b) or (c) is true, and, in ++ * particular, happens to be false if bfqd is an NCQ-capable ++ * flash-based device. ++ */ ++ idling_boosts_thr = rot_without_queueing || ++ ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && ++ bfqq_sequential_and_IO_bound); ++ ++ bfq_log_bfqq(bfqd, bfqq, "idling_boosts_thr %d", idling_boosts_thr); ++ ++ /* ++ * The return value of this function is equal to that of ++ * idling_boosts_thr, unless a special case holds. In this ++ * special case, described below, idling may cause problems to ++ * weight-raised queues. ++ * ++ * When the request pool is saturated (e.g., in the presence ++ * of write hogs), if the processes associated with ++ * non-weight-raised queues ask for requests at a lower rate, ++ * then processes associated with weight-raised queues have a ++ * higher probability to get a request from the pool ++ * immediately (or at least soon) when they need one. 
Thus ++ * they have a higher probability to actually get a fraction ++ * of the device throughput proportional to their high ++ * weight. This is especially true with NCQ-capable drives, ++ * which enqueue several requests in advance, and further ++ * reorder internally-queued requests. ++ * ++ * For this reason, we force to false the return value if ++ * there are weight-raised busy queues. In this case, and if ++ * bfqq is not weight-raised, this guarantees that the device ++ * is not idled for bfqq (if, instead, bfqq is weight-raised, ++ * then idling will be guaranteed by another variable, see ++ * below). Combined with the timestamping rules of BFQ (see ++ * [1] for details), this behavior causes bfqq, and hence any ++ * sync non-weight-raised queue, to get a lower number of ++ * requests served, and thus to ask for a lower number of ++ * requests from the request pool, before the busy ++ * weight-raised queues get served again. This often mitigates ++ * starvation problems in the presence of heavy write ++ * workloads and NCQ, thereby guaranteeing a higher ++ * application and system responsiveness in these hostile ++ * scenarios. ++ */ ++ return idling_boosts_thr && ++ bfqd->wr_busy_queues == 0; ++} ++ ++/* ++ * There is a case where idling must be performed not for ++ * throughput concerns, but to preserve service guarantees. ++ * ++ * To introduce this case, we can note that allowing the drive ++ * to enqueue more than one request at a time, and hence ++ * delegating de facto final scheduling decisions to the ++ * drive's internal scheduler, entails loss of control on the ++ * actual request service order. In particular, the critical ++ * situation is when requests from different processes happen ++ * to be present, at the same time, in the internal queue(s) ++ * of the drive. In such a situation, the drive, by deciding ++ * the service order of the internally-queued requests, does ++ * determine also the actual throughput distribution among ++ * these processes. But the drive typically has no notion or ++ * concern about per-process throughput distribution, and ++ * makes its decisions only on a per-request basis. Therefore, ++ * the service distribution enforced by the drive's internal ++ * scheduler is likely to coincide with the desired ++ * device-throughput distribution only in a completely ++ * symmetric scenario where: ++ * (i) each of these processes must get the same throughput as ++ * the others; ++ * (ii) the I/O of each process has the same properties, in ++ * terms of locality (sequential or random), direction ++ * (reads or writes), request sizes, greediness ++ * (from I/O-bound to sporadic), and so on. ++ * In fact, in such a scenario, the drive tends to treat ++ * the requests of each of these processes in about the same ++ * way as the requests of the others, and thus to provide ++ * each of these processes with about the same throughput ++ * (which is exactly the desired throughput distribution). In ++ * contrast, in any asymmetric scenario, device idling is ++ * certainly needed to guarantee that bfqq receives its ++ * assigned fraction of the device throughput (see [1] for ++ * details). ++ * The problem is that idling may significantly reduce ++ * throughput with certain combinations of types of I/O and ++ * devices. An important example is sync random I/O, on flash ++ * storage with command queueing. 
So, unless bfqq falls in the ++ * above cases where idling also boosts throughput, it would ++ * be important to check conditions (i) and (ii) accurately, ++ * so as to avoid idling when not strictly needed for service ++ * guarantees. ++ * ++ * Unfortunately, it is extremely difficult to thoroughly ++ * check condition (ii). And, in case there are active groups, ++ * it becomes very difficult to check condition (i) too. In ++ * fact, if there are active groups, then, for condition (i) ++ * to become false, it is enough that an active group contains ++ * more active processes or sub-groups than some other active ++ * group. More precisely, for condition (i) to hold because of ++ * such a group, it is not even necessary that the group is ++ * (still) active: it is sufficient that, even if the group ++ * has become inactive, some of its descendant processes still ++ * have some request already dispatched but still waiting for ++ * completion. In fact, requests have still to be guaranteed ++ * their share of the throughput even after being ++ * dispatched. In this respect, it is easy to show that, if a ++ * group frequently becomes inactive while still having ++ * in-flight requests, and if, when this happens, the group is ++ * not considered in the calculation of whether the scenario ++ * is asymmetric, then the group may fail to be guaranteed its ++ * fair share of the throughput (basically because idling may ++ * not be performed for the descendant processes of the group, ++ * but it had to be). We address this issue with the ++ * following bi-modal behavior, implemented in the function ++ * bfq_symmetric_scenario(). ++ * ++ * If there are groups with requests waiting for completion ++ * (as commented above, some of these groups may even be ++ * already inactive), then the scenario is tagged as ++ * asymmetric, conservatively, without checking any of the ++ * conditions (i) and (ii). So the device is idled for bfqq. ++ * This behavior matches also the fact that groups are created ++ * exactly if controlling I/O is a primary concern (to ++ * preserve bandwidth and latency guarantees). ++ * ++ * On the opposite end, if there are no groups with requests ++ * waiting for completion, then only condition (i) is actually ++ * controlled, i.e., provided that condition (i) holds, idling ++ * is not performed, regardless of whether condition (ii) ++ * holds. In other words, only if condition (i) does not hold, ++ * then idling is allowed, and the device tends to be ++ * prevented from queueing many requests, possibly of several ++ * processes. Since there are no groups with requests waiting ++ * for completion, then, to control condition (i) it is enough ++ * to check just whether all the queues with requests waiting ++ * for completion also have the same weight. ++ * ++ * Not checking condition (ii) evidently exposes bfqq to the ++ * risk of getting less throughput than its fair share. ++ * However, for queues with the same weight, a further ++ * mechanism, preemption, mitigates or even eliminates this ++ * problem. And it does so without consequences on overall ++ * throughput. This mechanism and its benefits are explained ++ * in the next three paragraphs. ++ * ++ * Even if a queue, say Q, is expired when it remains idle, Q ++ * can still preempt the new in-service queue if the next ++ * request of Q arrives soon (see the comments on ++ * bfq_bfqq_update_budg_for_activation). 
If all queues and
++ * groups have the same weight, this form of preemption,
++ * combined with the hole-recovery heuristic described in the
++ * comments on function bfq_bfqq_update_budg_for_activation,
++ * is enough to preserve a correct bandwidth distribution in
++ * the mid term, even without idling. In fact, even if not
++ * idling allows the internal queues of the device to contain
++ * many requests, and thus to reorder requests, we can rather
++ * safely assume that the internal scheduler still preserves a
++ * minimum of mid-term fairness.
++ *
++ * More precisely, this preemption-based, idleless approach
++ * provides fairness in terms of IOPS, and not sectors per
++ * second. This can be seen with a simple example. Suppose
++ * that there are two queues with the same weight, but that
++ * the first queue receives requests of 8 sectors, while the
++ * second queue receives requests of 1024 sectors. In
++ * addition, suppose that each of the two queues contains at
++ * most one request at a time, which implies that each queue
++ * always remains idle after it is served. Finally, after
++ * remaining idle, each queue receives very quickly a new
++ * request. It follows that the two queues are served
++ * alternately, preempting each other if needed. This
++ * implies that, although both queues have the same weight,
++ * the queue with large requests receives a service that is
++ * 1024/8 = 128 times as high as the service received by the
++ * other queue.
++ *
++ * The motivation for using preemption instead of idling (for
++ * queues with the same weight) is that, by not idling,
++ * service guarantees are preserved (completely or at least in
++ * part) without even minimally sacrificing throughput. And,
++ * if there is no active group, then the primary expectation
++ * for this device is probably a high throughput.
++ *
++ * We are now left only with explaining the additional
++ * compound condition that is checked below for deciding
++ * whether the scenario is asymmetric. To explain this
++ * compound condition, we need to add that the function
++ * bfq_symmetric_scenario checks the weights of only
++ * non-weight-raised queues, for efficiency reasons (see
++ * comments on bfq_weights_tree_add()). Then the fact that
++ * bfqq is weight-raised is checked explicitly here. More
++ * precisely, the compound condition below takes into account
++ * also the fact that, even if bfqq is being weight-raised,
++ * the scenario is still symmetric if all queues with requests
++ * waiting for completion happen to be
++ * weight-raised. Actually, we should be even more precise
++ * here, and differentiate between interactive weight raising
++ * and soft real-time weight raising.
++ *
++ * As a side note, it is worth considering that the above
++ * device-idling countermeasures may however fail in the
++ * following unlucky scenario: if idling is (correctly)
++ * disabled in a time period during which all symmetry
++ * sub-conditions hold, and hence the device is allowed to
++ * enqueue many requests, but at some later point in time some
++ * sub-condition stops holding, then it may become impossible
++ * to let requests be served in the desired order until all
++ * the requests already queued in the device have been served.
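++ *
++ * Summing up, as a compact restatement of the rules detailed
++ * above: the scenario is deemed asymmetric, and hence bfqq is
++ * idled, if bfqq is weight-raised while some busy queue is not,
++ * or if bfq_symmetric_scenario() returns false (which it does,
++ * conservatively, whenever some group has requests waiting for
++ * completion). This is exactly the compound condition computed
++ * by the function below.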
++ */ ++static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ bool asymmetric_scenario = (bfqq->wr_coeff > 1 && ++ bfqd->wr_busy_queues < ++ bfq_tot_busy_queues(bfqd)) || ++ !bfq_symmetric_scenario(bfqd); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "wr_coeff %d wr_busy %d busy %d asymmetric %d", ++ bfqq->wr_coeff, ++ bfqd->wr_busy_queues, ++ bfq_tot_busy_queues(bfqd), ++ asymmetric_scenario); ++ ++ return asymmetric_scenario; ++} ++ ++/* ++ * For a queue that becomes empty, device idling is allowed only if ++ * this function returns true for that queue. As a consequence, since ++ * device idling plays a critical role for both throughput boosting ++ * and service guarantees, the return value of this function plays a ++ * critical role as well. ++ * ++ * In a nutshell, this function returns true only if idling is ++ * beneficial for throughput or, even if detrimental for throughput, ++ * idling is however necessary to preserve service guarantees (low ++ * latency, desired throughput distribution, ...). In particular, on ++ * NCQ-capable devices, this function tries to return false, so as to ++ * help keep the drives' internal queues full, whenever this helps the ++ * device boost the throughput without causing any service-guarantee ++ * issue. ++ * ++ * Most of the issues taken into account to get the return value of ++ * this function are not trivial. We discuss these issues in the two ++ * functions providing the main pieces of information needed by this ++ * function. ++ */ ++static bool bfq_better_to_idle(struct bfq_queue *bfqq) ++{ ++ struct bfq_data *bfqd = bfqq->bfqd; ++ bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar; ++ ++ if (unlikely(bfqd->strict_guarantees)) ++ return true; ++ ++ /* ++ * Idling is performed only if slice_idle > 0. In addition, we ++ * do not idle if ++ * (a) bfqq is async ++ * (b) bfqq is in the idle io prio class: in this case we do ++ * not idle because we want to minimize the bandwidth that ++ * queues in this class can steal to higher-priority queues ++ */ ++ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || ++ bfq_class_idle(bfqq)) ++ return false; ++ ++ idling_boosts_thr_with_no_issue = ++ idling_boosts_thr_without_issues(bfqd, bfqq); ++ ++ idling_needed_for_service_guar = ++ idling_needed_for_service_guarantees(bfqd, bfqq); ++ ++ /* ++ * We have now the two components we need to compute the ++ * return value of the function, which is true only if idling ++ * either boosts the throughput (without issues), or is ++ * necessary to preserve service guarantees. ++ */ ++ bfq_log_bfqq(bfqd, bfqq, ++ "wr_busy %d boosts %d IO-bound %d guar %d", ++ bfqd->wr_busy_queues, ++ idling_boosts_thr_with_no_issue, ++ bfq_bfqq_IO_bound(bfqq), ++ idling_needed_for_service_guar); ++ ++ return idling_boosts_thr_with_no_issue || ++ idling_needed_for_service_guar; ++} ++ ++/* ++ * If the in-service queue is empty but the function bfq_better_to_idle ++ * returns true, then: ++ * 1) the queue must remain in service and cannot be expired, and ++ * 2) the device must be idled to wait for the possible arrival of a new ++ * request for the queue. ++ * See the comments on the function bfq_better_to_idle for the reasons ++ * why performing device idling is the best choice to boost the throughput ++ * and preserve service guarantees when bfq_better_to_idle itself ++ * returns true. 
++ */
++static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
++{
++	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq);
++}
++
++static struct bfq_queue *bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
++{
++	struct bfq_queue *bfqq;
++
++	/*
++	 * A linear search; but, with a high probability, very few
++	 * steps are needed to find a candidate queue, i.e., a queue
++	 * with enough budget left for its next request. In fact:
++	 * - BFQ dynamically updates the budget of every queue so as
++	 *   to accommodate the expected backlog of the queue;
++	 * - if a queue gets all its requests dispatched as injected
++	 *   service, then the queue is removed from the active list
++	 *   (and re-added only if it gets new requests, but with
++	 *   enough budget for its new backlog).
++	 */
++	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
++		if (!RB_EMPTY_ROOT(&bfqq->sort_list) &&
++		    bfq_serv_to_charge(bfqq->next_rq, bfqq) <=
++		    bfq_bfqq_budget_left(bfqq)) {
++			bfq_log_bfqq(bfqd, bfqq, "returned this queue");
++			return bfqq;
++		}
++
++	bfq_log(bfqd, "no queue found");
++	return NULL;
++}
++
++/*
++ * Select a queue for service. If we have a current queue in service,
++ * check whether to continue servicing it, or retrieve and set a new one.
++ */
++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
++{
++	struct bfq_queue *bfqq;
++	struct request *next_rq;
++	enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
++
++	bfqq = bfqd->in_service_queue;
++	if (!bfqq)
++		goto new_queue;
++
++	bfq_log_bfqq(bfqd, bfqq, "already in-service queue");
++
++	/*
++	 * Do not expire bfqq for budget timeout if bfqq may be about
++	 * to enjoy device idling. The reason why, in this case, we
++	 * prevent bfqq from expiring is the same as in the comments
++	 * on the case where bfq_bfqq_must_idle() returns true, in
++	 * bfq_completed_request().
++	 */
++	if (bfq_may_expire_for_budg_timeout(bfqq) &&
++	    !bfq_bfqq_must_idle(bfqq))
++		goto expire;
++
++check_queue:
++	/*
++	 * This loop is rarely executed more than once. Even when it
++	 * happens, it is much more convenient to re-execute this loop
++	 * than to return NULL and trigger a new dispatch to get a
++	 * request served.
++	 */
++	next_rq = bfqq->next_rq;
++	/*
++	 * If bfqq has requests queued and it has enough budget left to
++	 * serve them, keep the queue, otherwise expire it.
++	 */
++	if (next_rq) {
++		BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
++
++		if (bfq_serv_to_charge(next_rq, bfqq) >
++			bfq_bfqq_budget_left(bfqq)) {
++			/*
++			 * Expire the queue for budget exhaustion,
++			 * which makes sure that the next budget is
++			 * enough to serve the next request, even if
++			 * it comes from the fifo expired path.
++			 */
++			reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
++			goto expire;
++		} else {
++			/*
++			 * The idle timer may be pending because we may
++			 * not disable disk idling even when a new request
++			 * arrives.
++			 */
++			if (bfq_bfqq_wait_request(bfqq)) {
++				/*
++				 * If we get here: 1) at least a new request
++				 * has arrived but we have not disabled the
++				 * timer because the request was too small,
++				 * 2) then the block layer has unplugged
++				 * the device, causing the dispatch to be
++				 * invoked.
++				 *
++				 * Since the device is unplugged, now the
++				 * requests are probably large enough to
++				 * provide a reasonable throughput.
++				 * So we disable idling.
++				 */
++				bfq_clear_bfqq_wait_request(bfqq);
++				hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
++			}
++			goto keep_queue;
++		}
++	}
++
++	/*
++	 * No requests pending.
++	 * However, if the in-service queue is idling for a new
++	 * request, or has requests waiting for a completion and may
++	 * idle after their completion, then keep it anyway.
++	 *
++	 * Yet, to boost throughput, inject service from other queues if
++	 * possible.
++	 */
++	if (bfq_bfqq_wait_request(bfqq) ||
++	    (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
++		if (bfq_bfqq_injectable(bfqq) &&
++		    bfqq->injected_service * bfqq->inject_coeff <
++		    bfqq->entity.service * 10) {
++			bfq_log_bfqq(bfqd, bfqq, "looking for queue for injection");
++			bfqq = bfq_choose_bfqq_for_injection(bfqd);
++		} else {
++			if (BFQQ_SEEKY(bfqq))
++				bfq_log_bfqq(bfqd, bfqq,
++				"injection saturated %d * %d >= %d * 10",
++				bfqq->injected_service, bfqq->inject_coeff,
++				bfqq->entity.service);
++			bfqq = NULL;
++		}
++		goto keep_queue;
++	}
++
++	reason = BFQ_BFQQ_NO_MORE_REQUESTS;
++expire:
++	bfq_bfqq_expire(bfqd, bfqq, false, reason);
++new_queue:
++	bfqq = bfq_set_in_service_queue(bfqd);
++	if (bfqq) {
++		bfq_log_bfqq(bfqd, bfqq, "checking new queue");
++		goto check_queue;
++	}
++keep_queue:
++	if (bfqq)
++		bfq_log_bfqq(bfqd, bfqq, "returned this queue");
++	else
++		bfq_log(bfqd, "no queue returned");
++
++	return bfqq;
++}
++
++static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
++{
++	struct bfq_entity *entity = &bfqq->entity;
++
++	if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
++		BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
++		       time_is_after_jiffies(bfqq->last_wr_start_finish));
++
++		bfq_log_bfqq(bfqd, bfqq,
++			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",
++			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
++			jiffies_to_msecs(bfqq->wr_cur_max_time),
++			bfqq->wr_coeff,
++			bfqq->entity.weight, bfqq->entity.orig_weight);
++
++		BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
++		       entity->orig_weight * bfqq->wr_coeff);
++		if (entity->prio_changed)
++			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
++
++		/*
++		 * If the queue was activated in a burst, or too much
++		 * time has elapsed from the beginning of this
++		 * weight-raising period, then end weight raising.
++		 */
++		if (bfq_bfqq_in_large_burst(bfqq))
++			bfq_bfqq_end_wr(bfqq);
++		else if (time_is_before_jiffies(bfqq->last_wr_start_finish +
++						bfqq->wr_cur_max_time)) {
++			if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
++			time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
++					       bfq_wr_duration(bfqd)))
++				bfq_bfqq_end_wr(bfqq);
++			else {
++				switch_back_to_interactive_wr(bfqq, bfqd);
++				BUG_ON(time_is_after_jiffies(
++					       bfqq->last_wr_start_finish));
++				bfqq->entity.prio_changed = 1;
++				bfq_log_bfqq(bfqd, bfqq,
++					     "back to interactive wr");
++			}
++		}
++		if (bfqq->wr_coeff > 1 &&
++		    bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time &&
++		    bfqq->service_from_wr > max_service_from_wr) {
++			/* see comments on max_service_from_wr */
++			bfq_bfqq_end_wr(bfqq);
++			bfq_log_bfqq(bfqd, bfqq,
++				     "too much service");
++		}
++	}
++	/*
++	 * To improve latency (for this or other queues), immediately
++	 * update weight both if it must be raised and if it must be
++	 * lowered. Since the entity may be on some active tree here,
++	 * and might have a pending change of its ioprio class, invoke
++	 * the next function with the last parameter unset (see the
++	 * comments on the function).
++	 */
++	if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
++		__bfq_entity_update_weight_prio(bfq_entity_service_tree(entity),
++						entity, false);
++}
++
++/*
++ * Dispatch next request from bfqq.
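++ *
++ * Besides removing rq from bfqq and charging bfqq with its
++ * service, this path also accounts for injection: if rq belongs
++ * to a queue different from the in-service one, the same charge
++ * is added to the in-service queue's injected_service. As a
++ * worked example of the saturation check in bfq_select_queue():
++ * with the initial inject_coeff of 1 (see bfq_init_bfqq()),
++ * injection goes on while injected_service < 10 *
++ * entity.service, i.e., until about 10/11 (~90%) of the overall
++ * service has been injected.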
++ */
++static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
++						 struct bfq_queue *bfqq)
++{
++	struct request *rq = bfqq->next_rq;
++	unsigned long service_to_charge;
++
++	BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
++	BUG_ON(!rq);
++	service_to_charge = bfq_serv_to_charge(rq, bfqq);
++
++	BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq));
++
++	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
++
++	bfq_bfqq_served(bfqq, service_to_charge);
++
++	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
++
++	bfq_dispatch_remove(bfqd->queue, rq);
++
++	bfq_log_bfqq(bfqd, bfqq,
++		"dispatched %u sec req (%llu), budg left %d, new disp_nr %d",
++		blk_rq_sectors(rq),
++		(unsigned long long) blk_rq_pos(rq),
++		bfq_bfqq_budget_left(bfqq),
++		bfqq->dispatched);
++
++	if (bfqq != bfqd->in_service_queue) {
++		if (likely(bfqd->in_service_queue)) {
++			bfqd->in_service_queue->injected_service +=
++				bfq_serv_to_charge(rq, bfqq);
++			bfq_log_bfqq(bfqd, bfqd->in_service_queue,
++				     "injected_service increased to %d",
++				     bfqd->in_service_queue->injected_service);
++		}
++		goto return_rq;
++	}
++
++	/*
++	 * If weight raising has to terminate for bfqq, then the next
++	 * function causes an immediate update of bfqq's weight,
++	 * without waiting for the next activation. As a consequence,
++	 * on expiration, bfqq will be timestamped as if it had never
++	 * been weight-raised during this service slot, even if it has
++	 * received part or even most of the service as a
++	 * weight-raised queue. This inflates bfqq's timestamps, which
++	 * is beneficial, as bfqq is then more willing to leave the
++	 * device immediately to possible other weight-raised queues.
++	 */
++	bfq_update_wr_data(bfqd, bfqq);
++
++	/*
++	 * Expire bfqq, pretending that its budget expired, if bfqq
++	 * belongs to CLASS_IDLE and other queues are waiting for
++	 * service.
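++	 *
++	 * In this way a CLASS_IDLE queue yields the device right
++	 * after each dispatched request whenever other queues are
++	 * backlogged, in accordance with the semantics of the idle
++	 * class.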
++	 */
++	if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)))
++		goto return_rq;
++
++	bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED);
++
++return_rq:
++	return rq;
++}
++
++static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
++{
++	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
++
++	bfq_log(bfqd, "dispatch_non_empty %d busy_queues %d",
++		!list_empty_careful(&bfqd->dispatch), bfq_tot_busy_queues(bfqd) > 0);
++
++	/*
++	 * Avoiding lock: a race on bfqd->busy_queues should cause at
++	 * most a call to dispatch for nothing
++	 */
++	return !list_empty_careful(&bfqd->dispatch) ||
++		bfq_tot_busy_queues(bfqd) > 0;
++}
++
++static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
++{
++	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
++	struct request *rq = NULL;
++	struct bfq_queue *bfqq = NULL;
++
++	if (!list_empty(&bfqd->dispatch)) {
++		rq = list_first_entry(&bfqd->dispatch, struct request,
++				      queuelist);
++		list_del_init(&rq->queuelist);
++		rq->rq_flags &= ~RQF_DISP_LIST;
++
++		bfq_log(bfqd,
++			"picked %p from dispatch list", rq);
++		bfqq = RQ_BFQQ(rq);
++
++		if (bfqq) {
++			/*
++			 * Increment counters here, because this
++			 * dispatch does not follow the standard
++			 * dispatch flow (where counters are
++			 * incremented)
++			 */
++			bfqq->dispatched++;
++
++			/*
++			 * TESTING: reset DISP_LIST flag, because: 1)
++			 * this request has passed through
++			 * bfq_prepare_request, 2) then it will have
++			 * bfq_finish_requeue_request invoked on it, and 3) in
++			 * bfq_finish_requeue_request we use this flag to check
++			 * that bfq_finish_requeue_request is not invoked on
++			 * requests for which bfq_prepare_request has
++			 * been invoked.
++			 */
++			rq->rq_flags &= ~RQF_DISP_LIST;
++			goto inc_in_driver_start_rq;
++		}
++
++		/*
++		 * We exploit the bfq_finish_requeue_request hook to decrement
++		 * rq_in_driver, but bfq_finish_requeue_request will not be
++		 * invoked on this request. So, to avoid an imbalance,
++		 * just start this request, without incrementing
++		 * rq_in_driver. As a negative consequence,
++		 * rq_in_driver is deceptively lower than it should be
++		 * while this request is in service. This may cause
++		 * bfq_schedule_dispatch to be invoked uselessly.
++		 *
++		 * As for implementing an exact solution, the
++		 * bfq_finish_requeue_request hook, if defined, is probably
++		 * invoked also on this request. So, by exploiting
++		 * this hook, we could 1) increment rq_in_driver here,
++		 * and 2) decrement it in bfq_finish_requeue_request. Such a
++		 * solution would let the value of the counter be
++		 * always accurate, but it would entail using an extra
++		 * interface function. This cost seems higher than the
++		 * benefit, given the very low frequency of
++		 * non-elevator-private requests.
++		 */
++		goto start_rq;
++	}
++
++	bfq_log(bfqd, "%d busy queues", bfq_tot_busy_queues(bfqd));
++
++	if (bfq_tot_busy_queues(bfqd) == 0)
++		goto exit;
++
++	/*
++	 * Force the device to serve one request at a time if
++	 * strict_guarantees is true. Forcing this service scheme is
++	 * currently the ONLY way to guarantee that the request
++	 * service order enforced by the scheduler is respected by a
++	 * queueing device. Otherwise the device is free even to make
++	 * some unlucky request wait for as long as the device
++	 * wishes.
++	 *
++	 * Of course, serving one request at a time may cause loss of
++	 * throughput.
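++	 *
++	 * Concretely, with strict_guarantees set, the check below
++	 * makes this function return no request while some request
++	 * is still in flight (rq_in_driver > 0), so that at most one
++	 * request is ever outstanding on the device.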
++ */ ++ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) ++ goto exit; ++ ++ bfqq = bfq_select_queue(bfqd); ++ if (!bfqq) ++ goto exit; ++ ++ BUG_ON(bfqq == bfqd->in_service_queue && ++ bfqq->entity.budget < bfqq->entity.service); ++ ++ BUG_ON(bfqq == bfqd->in_service_queue && ++ bfq_bfqq_wait_request(bfqq)); ++ ++ rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq); ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ if (rq) { ++ inc_in_driver_start_rq: ++ bfqd->rq_in_driver++; ++ start_rq: ++ rq->rq_flags |= RQF_STARTED; ++ if (bfqq) ++ bfq_log_bfqq(bfqd, bfqq, ++ "%s request %p, rq_in_driver %d", ++ bfq_bfqq_sync(bfqq) ? "sync" : "async", ++ rq, ++ bfqd->rq_in_driver); ++ else ++ bfq_log(bfqd, ++ "request %p from dispatch list, rq_in_driver %d", ++ rq, bfqd->rq_in_driver); ++ } else ++ bfq_log(bfqd, ++ "returned NULL request, rq_in_driver %d", ++ bfqd->rq_in_driver); ++ ++exit: ++ return rq; ++} ++ ++ ++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) ++static void bfq_update_dispatch_stats(struct request_queue *q, ++ struct request *rq, ++ struct bfq_queue *in_serv_queue, ++ bool idle_timer_disabled) ++{ ++ struct bfq_queue *bfqq = rq ? RQ_BFQQ(rq) : NULL; ++ ++ if (!idle_timer_disabled && !bfqq) ++ return; ++ ++ /* ++ * rq and bfqq are guaranteed to exist until this function ++ * ends, for the following reasons. First, rq can be ++ * dispatched to the device, and then can be completed and ++ * freed, only after this function ends. Second, rq cannot be ++ * merged (and thus freed because of a merge) any longer, ++ * because it has already started. Thus rq cannot be freed ++ * before this function ends, and, since rq has a reference to ++ * bfqq, the same guarantee holds for bfqq too. ++ * ++ * In addition, the following queue lock guarantees that ++ * bfqq_group(bfqq) exists as well. ++ */ ++ spin_lock_irq(q->queue_lock); ++ if (idle_timer_disabled) ++ /* ++ * Since the idle timer has been disabled, ++ * in_serv_queue contained some request when ++ * __bfq_dispatch_request was invoked above, which ++ * implies that rq was picked exactly from ++ * in_serv_queue. Thus in_serv_queue == bfqq, and is ++ * therefore guaranteed to exist because of the above ++ * arguments. ++ */ ++ bfqg_stats_update_idle_time(bfqq_group(in_serv_queue)); ++ if (bfqq) { ++ struct bfq_group *bfqg = bfqq_group(bfqq); ++ ++ bfqg_stats_update_avg_queue_size(bfqg); ++ bfqg_stats_set_start_empty_time(bfqg); ++ bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); ++ } ++ spin_unlock_irq(q->queue_lock); ++} ++#else ++static inline void bfq_update_dispatch_stats(struct request_queue *q, ++ struct request *rq, ++ struct bfq_queue *in_serv_queue, ++ bool idle_timer_disabled) {} ++#endif ++static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) ++{ ++ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; ++ struct request *rq; ++ struct bfq_queue *in_serv_queue; ++ bool waiting_rq, idle_timer_disabled; ++ ++ spin_lock_irq(&bfqd->lock); ++ ++ in_serv_queue = bfqd->in_service_queue; ++ waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); ++ ++ rq = __bfq_dispatch_request(hctx); ++ ++ idle_timer_disabled = ++ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); ++ ++ spin_unlock_irq(&bfqd->lock); ++ ++ bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue, ++ idle_timer_disabled); ++ ++ return rq; ++} ++ ++/* ++ * Task holds one reference to the queue, dropped when task exits. 
Each rq
++ * in-flight on this queue also holds a reference, dropped when rq is freed.
++ *
++ * Scheduler lock must be held here. Recall not to use bfqq after calling
++ * this function on it.
++ */
++static void bfq_put_queue(struct bfq_queue *bfqq)
++{
++#ifdef BFQ_GROUP_IOSCHED_ENABLED
++	struct bfq_group *bfqg = bfqq_group(bfqq);
++#endif
++
++	assert_spin_locked(&bfqq->bfqd->lock);
++
++	BUG_ON(bfqq->ref <= 0);
++
++	if (bfqq->bfqd)
++		bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", bfqq, bfqq->ref);
++
++	bfqq->ref--;
++	if (bfqq->ref)
++		return;
++
++	BUG_ON(rb_first(&bfqq->sort_list));
++	BUG_ON(bfqq->allocated != 0);
++	BUG_ON(bfqq->entity.tree);
++	BUG_ON(bfq_bfqq_busy(bfqq));
++
++	if (!hlist_unhashed(&bfqq->burst_list_node)) {
++		hlist_del_init(&bfqq->burst_list_node);
++		/*
++		 * Decrement also burst size after the removal, if the
++		 * process associated with bfqq is exiting, and thus
++		 * does not contribute to the burst any longer. This
++		 * decrement helps filter out false positives of large
++		 * bursts, when some short-lived process (often due to
++		 * the execution of commands by some service) happens
++		 * to start and exit while a complex application is
++		 * starting, and thus spawning several processes that
++		 * do I/O (and that *must not* be treated as a large
++		 * burst, see comments on bfq_handle_burst).
++		 *
++		 * In particular, the decrement is performed only if:
++		 * 1) bfqq is not a merged queue, because, if it is,
++		 * then this free of bfqq is not triggered by the exit
++		 * of the process bfqq is associated with, but exactly
++		 * by the fact that bfqq has just been merged.
++		 * 2) burst_size is greater than 0, to handle
++		 * unbalanced decrements. Unbalanced decrements may
++		 * happen in the following case: bfqq is inserted into
++		 * the current burst list--without incrementing
++		 * burst_size--because of a split, but the current
++		 * burst list is not the burst list bfqq belonged to
++		 * (see comments on the case of a split in
++		 * bfq_set_request).
++		 */
++		if (bfqq->bic && bfqq->bfqd->burst_size > 0)
++			bfqq->bfqd->burst_size--;
++	}
++
++	if (bfqq->bfqd)
++		bfq_log_bfqq(bfqq->bfqd, bfqq, "%p freed", bfqq);
++
++#ifdef BFQ_GROUP_IOSCHED_ENABLED
++	bfq_log_bfqq(bfqq->bfqd, bfqq, "putting blkg and bfqg %p\n", bfqg);
++	bfqg_and_blkg_put(bfqg);
++#endif
++	kmem_cache_free(bfq_pool, bfqq);
++}
++
++static void bfq_put_cooperator(struct bfq_queue *bfqq)
++{
++	struct bfq_queue *__bfqq, *next;
++
++	/*
++	 * If this queue was scheduled to merge with another queue, be
++	 * sure to drop the reference taken on that queue (and others in
++	 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
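++	 *
++	 * The loop below simply walks the chain of new_bfqq
++	 * pointers, dropping one reference per node, and stops if
++	 * the chain happens to lead back to bfqq itself, so that a
++	 * cyclic chain cannot cause a double put of bfqq.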
++ */ ++ __bfqq = bfqq->new_bfqq; ++ while (__bfqq) { ++ if (__bfqq == bfqq) ++ break; ++ next = __bfqq->new_bfqq; ++ bfq_put_queue(__bfqq); ++ __bfqq = next; ++ } ++} ++ ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ if (bfqq == bfqd->in_service_queue) { ++ __bfq_bfqq_expire(bfqd, bfqq); ++ bfq_schedule_dispatch(bfqd); ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, "%p, %d", bfqq, bfqq->ref); ++ ++ bfq_put_cooperator(bfqq); ++ ++ bfq_put_queue(bfqq); /* release process reference */ ++} ++ ++static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) ++{ ++ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); ++ struct bfq_data *bfqd; ++ ++ if (bfqq) ++ bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ ++ ++ if (bfqq && bfqd) { ++ unsigned long flags; ++ ++ spin_lock_irqsave(&bfqd->lock, flags); ++ bfq_exit_bfqq(bfqd, bfqq); ++ bic_set_bfqq(bic, NULL, is_sync); ++ spin_unlock_irqrestore(&bfqd->lock, flags); ++ } ++} ++ ++static void bfq_exit_icq(struct io_cq *icq) ++{ ++ struct bfq_io_cq *bic = icq_to_bic(icq); ++ ++ BUG_ON(!bic); ++ bfq_exit_icq_bfqq(bic, true); ++ bfq_exit_icq_bfqq(bic, false); ++} ++ ++/* ++ * Update the entity prio values; note that the new values will not ++ * be used until the next (re)activation. ++ */ ++static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic) ++{ ++ struct task_struct *tsk = current; ++ int ioprio_class; ++ struct bfq_data *bfqd = bfqq->bfqd; ++ ++ WARN_ON(!bfqd); ++ if (!bfqd) ++ return; ++ ++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); ++ switch (ioprio_class) { ++ default: ++ dev_err(bfqq->bfqd->queue->backing_dev_info->dev, ++ "bfq: bad prio class %d\n", ioprio_class); ++ case IOPRIO_CLASS_NONE: ++ /* ++ * No prio set, inherit CPU scheduling settings. ++ */ ++ bfqq->new_ioprio = task_nice_ioprio(tsk); ++ bfqq->new_ioprio_class = task_nice_ioclass(tsk); ++ break; ++ case IOPRIO_CLASS_RT: ++ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ bfqq->new_ioprio_class = IOPRIO_CLASS_RT; ++ break; ++ case IOPRIO_CLASS_BE: ++ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ bfqq->new_ioprio_class = IOPRIO_CLASS_BE; ++ break; ++ case IOPRIO_CLASS_IDLE: ++ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; ++ bfqq->new_ioprio = 7; ++ break; ++ } ++ ++ if (bfqq->new_ioprio >= IOPRIO_BE_NR) { ++ pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", ++ bfqq->new_ioprio); ++ BUG(); ++ } ++ ++ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "bic_class %d prio %d class %d", ++ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); ++} ++ ++static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) ++{ ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ struct bfq_queue *bfqq; ++ unsigned long uninitialized_var(flags); ++ int ioprio = bic->icq.ioc->ioprio; ++ ++ /* ++ * This condition may trigger on a newly created bic, be sure to ++ * drop the lock before returning. 
++ */ ++ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) ++ return; ++ ++ bic->ioprio = ioprio; ++ ++ bfqq = bic_to_bfqq(bic, false); ++ if (bfqq) { ++ /* release process reference on this queue */ ++ bfq_put_queue(bfqq); ++ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); ++ bic_set_bfqq(bic, bfqq, false); ++ bfq_log_bfqq(bfqd, bfqq, ++ "bfqq %p %d", ++ bfqq, bfqq->ref); ++ } ++ ++ bfqq = bic_to_bfqq(bic, true); ++ if (bfqq) ++ bfq_set_next_ioprio_data(bfqq, bic); ++} ++ ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic, pid_t pid, int is_sync) ++{ ++ RB_CLEAR_NODE(&bfqq->entity.rb_node); ++ INIT_LIST_HEAD(&bfqq->fifo); ++ INIT_HLIST_NODE(&bfqq->burst_list_node); ++ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); ++ ++ bfqq->ref = 0; ++ bfqq->bfqd = bfqd; ++ ++ if (bic) ++ bfq_set_next_ioprio_data(bfqq, bic); ++ ++ if (is_sync) { ++ /* ++ * No need to mark as has_short_ttime if in ++ * idle_class, because no device idling is performed ++ * for queues in idle class ++ */ ++ if (!bfq_class_idle(bfqq)) ++ /* tentatively mark as has_short_ttime */ ++ bfq_mark_bfqq_has_short_ttime(bfqq); ++ bfq_mark_bfqq_sync(bfqq); ++ bfq_mark_bfqq_just_created(bfqq); ++ /* ++ * Aggressively inject a lot of service: up to 90%. ++ * This coefficient remains constant during bfqq life, ++ * but this behavior might be changed, after enough ++ * testing and tuning. ++ */ ++ bfqq->inject_coeff = 1; ++ } else ++ bfq_clear_bfqq_sync(bfqq); ++ ++ bfqq->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); ++ ++ bfq_mark_bfqq_IO_bound(bfqq); ++ ++ /* Tentative initial value to trade off between thr and lat */ ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; ++ bfqq->pid = pid; ++ ++ bfqq->wr_coeff = 1; ++ bfqq->last_wr_start_finish = jiffies; ++ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); ++ bfqq->budget_timeout = bfq_smallest_from_now(); ++ bfqq->split_time = bfq_smallest_from_now(); ++ ++ /* ++ * To not forget the possibly high bandwidth consumed by a ++ * process/queue in the recent past, ++ * bfq_bfqq_softrt_next_start() returns a value at least equal ++ * to the current value of bfqq->soft_rt_next_start (see ++ * comments on bfq_bfqq_softrt_next_start). Set ++ * soft_rt_next_start to now, to mean that bfqq has consumed ++ * no bandwidth so far. 
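++	 *
++	 * Initializing soft_rt_next_start to jiffies, i.e., to now,
++	 * is then the most permissive choice compatible with the
++	 * above rule: a newly created queue may be deemed soft
++	 * real-time as soon as it actually exhibits an isochronous
++	 * pattern.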
++ */ ++ bfqq->soft_rt_next_start = jiffies; ++ ++ /* first request is almost certainly seeky */ ++ bfqq->seek_history = 1; ++} ++ ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ int ioprio_class, int ioprio) ++{ ++ switch (ioprio_class) { ++ case IOPRIO_CLASS_RT: ++ return &bfqg->async_bfqq[0][ioprio]; ++ case IOPRIO_CLASS_NONE: ++ ioprio = IOPRIO_NORM; ++ /* fall through */ ++ case IOPRIO_CLASS_BE: ++ return &bfqg->async_bfqq[1][ioprio]; ++ case IOPRIO_CLASS_IDLE: ++ return &bfqg->async_idle_bfqq; ++ default: ++ BUG(); ++ } ++} ++ ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, ++ struct bio *bio, bool is_sync, ++ struct bfq_io_cq *bic) ++{ ++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); ++ struct bfq_queue **async_bfqq = NULL; ++ struct bfq_queue *bfqq; ++ struct bfq_group *bfqg; ++ ++ rcu_read_lock(); ++ ++ bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); ++ if (!bfqg) { ++ bfqq = &bfqd->oom_bfqq; ++ goto out; ++ } ++ ++ if (!is_sync) { ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ++ ioprio); ++ bfqq = *async_bfqq; ++ if (bfqq) ++ goto out; ++ } ++ ++ bfqq = kmem_cache_alloc_node(bfq_pool, ++ GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, ++ bfqd->queue->node); ++ ++ if (bfqq) { ++ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, ++ is_sync); ++ bfq_init_entity(&bfqq->entity, bfqg); ++ bfq_log_bfqq(bfqd, bfqq, "allocated"); ++ } else { ++ bfqq = &bfqd->oom_bfqq; ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); ++ goto out; ++ } ++ ++ /* ++ * Pin the queue now that it's allocated, scheduler exit will ++ * prune it. ++ */ ++ if (async_bfqq) { ++ bfqq->ref++; /* ++ * Extra group reference, w.r.t. sync ++ * queue. This extra reference is removed ++ * only if bfqq->bfqg disappears, to ++ * guarantee that this queue is not freed ++ * until its group goes away. ++ */ ++ bfq_log_bfqq(bfqd, bfqq, "bfqq not in async: %p, %d", ++ bfqq, bfqq->ref); ++ *async_bfqq = bfqq; ++ } ++ ++out: ++ bfqq->ref++; /* get a process reference to this queue */ ++ bfq_log_bfqq(bfqd, bfqq, "at end: %p, %d", bfqq, bfqq->ref); ++ rcu_read_unlock(); ++ return bfqq; ++} ++ ++static void bfq_update_io_thinktime(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ struct bfq_ttime *ttime = &bfqq->ttime; ++ u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; ++ ++ elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); ++ ++ ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8; ++ ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); ++ ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, ++ ttime->ttime_samples); ++} ++ ++static void ++bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ bfqq->seek_history <<= 1; ++ bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq); ++} ++ ++static void bfq_update_has_short_ttime(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic) ++{ ++ bool has_short_ttime = true; ++ ++ /* ++ * No need to update has_short_ttime if bfqq is async or in ++ * idle io prio class, or if bfq_slice_idle is zero, because ++ * no device idling is performed for bfqq in this case. ++ */ ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || ++ bfqd->bfq_slice_idle == 0) ++ return; ++ ++ /* Idle window just restored, statistics are meaningless. 
*/ ++ if (time_is_after_eq_jiffies(bfqq->split_time + ++ bfqd->bfq_wr_min_idle_time)) ++ return; ++ ++ /* Think time is infinite if no process is linked to ++ * bfqq. Otherwise check average think time to ++ * decide whether to mark as has_short_ttime ++ */ ++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || ++ (bfq_sample_valid(bfqq->ttime.ttime_samples) && ++ bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) ++ has_short_ttime = false; ++ ++ bfq_log_bfqq(bfqd, bfqq, "has_short_ttime %d", ++ has_short_ttime); ++ ++ if (has_short_ttime) ++ bfq_mark_bfqq_has_short_ttime(bfqq); ++ else ++ bfq_clear_bfqq_has_short_ttime(bfqq); ++} ++ ++/* ++ * Called when a new fs request (rq) is added to bfqq. Check if there's ++ * something we should do about it. ++ */ ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ struct bfq_io_cq *bic = RQ_BIC(rq); ++ ++ if (rq->cmd_flags & REQ_META) ++ bfqq->meta_pending++; ++ ++ bfq_update_io_thinktime(bfqd, bfqq); ++ bfq_update_has_short_ttime(bfqd, bfqq, bic); ++ bfq_update_io_seektime(bfqd, bfqq, rq); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "has_short_ttime=%d (seeky %d)", ++ bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); ++ ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ ++ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { ++ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && ++ blk_rq_sectors(rq) < 32; ++ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); ++ ++ /* ++ * There is just this request queued: if ++ * - the request is small, and ++ * - we are idling to boost throughput, and ++ * - the queue is not to be expired, ++ * then just exit. ++ * ++ * In this way, if the device is being idled to wait ++ * for a new request from the in-service queue, we ++ * avoid unplugging the device and committing the ++ * device to serve just a small request. In contrast ++ * we wait for the block layer to decide when to ++ * unplug the device: hopefully, new requests will be ++ * merged to this one quickly, then the device will be ++ * unplugged and larger requests will be dispatched. ++ */ ++ if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) && ++ !budget_timeout) ++ return; ++ ++ /* ++ * A large enough request arrived, or idling is being ++ * performed to preserve service guarantees, or ++ * finally the queue is to be expired: in all these ++ * cases disk idling is to be stopped, so clear ++ * wait_request flag and reset timer. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); ++ ++ /* ++ * The queue is not empty, because a new request just ++ * arrived. Hence we can safely expire the queue, in ++ * case of budget timeout, without risking that the ++ * timestamps of the queue are not updated correctly. ++ * See [1] for more details. ++ */ ++ if (budget_timeout) ++ bfq_bfqq_expire(bfqd, bfqq, false, ++ BFQ_BFQQ_BUDGET_TIMEOUT); ++ } ++} ++ ++/* returns true if it causes the idle timer to be disabled */ ++static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; ++ bool waiting, idle_timer_disabled = false; ++ BUG_ON(!bfqq); ++ ++ assert_spin_locked(&bfqd->lock); ++ ++ bfq_log_bfqq(bfqd, bfqq, "rq %p bfqq %p", rq, bfqq); ++ ++ /* ++ * An unplug may trigger a requeue of a request from the device ++ * driver: make sure we are in process context while trying to ++ * merge two bfq_queues. 
++	 */
++	if (!in_interrupt()) {
++		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
++		if (new_bfqq) {
++			BUG_ON(bic_to_bfqq(RQ_BIC(rq), 1) != bfqq);
++			/*
++			 * Release the request's reference to the old bfqq
++			 * and make sure one is taken to the shared queue.
++			 */
++			new_bfqq->allocated++;
++			bfqq->allocated--;
++			bfq_log_bfqq(bfqd, bfqq,
++				     "new allocated %d", bfqq->allocated);
++			bfq_log_bfqq(bfqd, new_bfqq,
++				     "new_bfqq new allocated %d",
++				     new_bfqq->allocated);
++
++			new_bfqq->ref++;
++			/*
++			 * If the bic associated with the process
++			 * issuing this request still points to bfqq
++			 * (and thus has not been already redirected
++			 * to new_bfqq or even some other bfq_queue),
++			 * then complete the merge and redirect it to
++			 * new_bfqq.
++			 */
++			if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
++				bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
++						bfqq, new_bfqq);
++
++			bfq_clear_bfqq_just_created(bfqq);
++			/*
++			 * rq is about to be enqueued into new_bfqq,
++			 * release rq reference on bfqq
++			 */
++			bfq_put_queue(bfqq);
++			rq->elv.priv[1] = new_bfqq;
++			bfqq = new_bfqq;
++		}
++	}
++
++	waiting = bfqq && bfq_bfqq_wait_request(bfqq);
++	bfq_add_request(rq);
++	idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq);
++
++	rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
++	list_add_tail(&rq->queuelist, &bfqq->fifo);
++
++	bfq_rq_enqueued(bfqd, bfqq, rq);
++
++	return idle_timer_disabled;
++}
++
++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP)
++static void bfq_update_insert_stats(struct request_queue *q,
++				    struct bfq_queue *bfqq,
++				    bool idle_timer_disabled,
++				    unsigned int cmd_flags)
++{
++	if (!bfqq)
++		return;
++
++	/*
++	 * bfqq still exists, because it can disappear only after
++	 * either it is merged with another queue, or the process it
++	 * is associated with exits. But both actions must be taken by
++	 * the same process currently executing this flow of
++	 * instructions.
++	 *
++	 * In addition, the following queue lock guarantees that
++	 * bfqq_group(bfqq) exists as well.
++	 */
++	spin_lock_irq(q->queue_lock);
++	bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
++	if (idle_timer_disabled)
++		bfqg_stats_update_idle_time(bfqq_group(bfqq));
++	spin_unlock_irq(q->queue_lock);
++}
++#else
++static inline void bfq_update_insert_stats(struct request_queue *q,
++					   struct bfq_queue *bfqq,
++					   bool idle_timer_disabled,
++					   unsigned int cmd_flags) {}
++#endif
++
++static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
++			       bool at_head)
++{
++	struct request_queue *q = hctx->queue;
++	struct bfq_data *bfqd = q->elevator->elevator_data;
++	struct bfq_queue *bfqq;
++	bool idle_timer_disabled = false;
++	unsigned int cmd_flags;
++
++	spin_lock_irq(&bfqd->lock);
++	if (blk_mq_sched_try_insert_merge(q, rq)) {
++		spin_unlock_irq(&bfqd->lock);
++		return;
++	}
++
++	spin_unlock_irq(&bfqd->lock);
++
++	blk_mq_sched_request_inserted(rq);
++
++	spin_lock_irq(&bfqd->lock);
++
++	bfqq = bfq_init_rq(rq);
++	BUG_ON(!bfqq && !(at_head || blk_rq_is_passthrough(rq)));
++	BUG_ON(bfqq && bic_to_bfqq(RQ_BIC(rq), rq_is_sync(rq)) != bfqq);
++
++	if (at_head || blk_rq_is_passthrough(rq)) {
++		if (at_head)
++			list_add(&rq->queuelist, &bfqd->dispatch);
++		else
++			list_add_tail(&rq->queuelist, &bfqd->dispatch);
++
++		rq->rq_flags |= RQF_DISP_LIST;
++		if (bfqq)
++			bfq_log_bfqq(bfqd, bfqq,
++				     "%p in disp: at_head %d",
++				     rq, at_head);
++		else
++			bfq_log(bfqd,
++				"%p in disp: at_head %d",
++				rq, at_head);
++	} else { /* bfqq is assumed to be non-null here */
++		BUG_ON(!bfqq);
++		BUG_ON(!(rq->rq_flags & RQF_GOT));
++		rq->rq_flags &= ~RQF_GOT;
++
++		idle_timer_disabled = __bfq_insert_request(bfqd, rq);
++		/*
++		 * Update bfqq, because, if a queue merge has occurred
++		 * in __bfq_insert_request, then rq has been
++		 * redirected into a new queue.
++		 */
++		bfqq = RQ_BFQQ(rq);
++
++		if (rq_mergeable(rq)) {
++			elv_rqhash_add(q, rq);
++			if (!q->last_merge)
++				q->last_merge = rq;
++		}
++	}
++
++	/*
++	 * Cache cmd_flags before releasing scheduler lock, because rq
++	 * may disappear afterwards (for example, because of a request
++	 * merge).
++	 */
++	cmd_flags = rq->cmd_flags;
++
++	spin_unlock_irq(&bfqd->lock);
++	bfq_update_insert_stats(q, bfqq, idle_timer_disabled,
++				cmd_flags);
++}
++
++static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
++				struct list_head *list, bool at_head)
++{
++	while (!list_empty(list)) {
++		struct request *rq;
++
++		rq = list_first_entry(list, struct request, queuelist);
++		list_del_init(&rq->queuelist);
++		bfq_insert_request(hctx, rq, at_head);
++	}
++}
++
++static void bfq_update_hw_tag(struct bfq_data *bfqd)
++{
++	struct bfq_queue *bfqq = bfqd->in_service_queue;
++
++	bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
++				       bfqd->rq_in_driver);
++
++	if (bfqd->hw_tag == 1)
++		return;
++
++	/*
++	 * This sample is valid if the number of outstanding requests
++	 * is large enough to allow a queueing behavior. Note that the
++	 * sum is not exact, as it's not taking into account deactivated
++	 * requests.
++	 */
++	if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD)
++		return;
++
++	/*
++	 * If the active queue doesn't have enough requests and can
++	 * idle, bfq might not dispatch sufficient requests to hardware.
++	 * Don't zero hw_tag in this case.
++	 */
++	if (bfqq && bfq_bfqq_has_short_ttime(bfqq) &&
++	    bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] <
++	    BFQ_HW_QUEUE_THRESHOLD &&
++	    bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD)
++		return;
++
++	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
++		return;
++
++	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
++	bfqd->max_rq_in_driver = 0;
++	bfqd->hw_tag_samples = 0;
++}
++
++static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
++{
++	u64 now_ns;
++	u32 delta_us;
++
++	bfq_update_hw_tag(bfqd);
++
++	BUG_ON(!bfqd->rq_in_driver);
++	BUG_ON(!bfqq->dispatched);
++	bfqd->rq_in_driver--;
++	bfqq->dispatched--;
++
++	bfq_log_bfqq(bfqd, bfqq,
++		"new disp %d, new rq_in_driver %d",
++		bfqq->dispatched, bfqd->rq_in_driver);
++
++	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
++		BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
++		/*
++		 * Set budget_timeout (which we overload to store the
++		 * time at which the queue remains with no backlog and
++		 * no outstanding request; used by the weight-raising
++		 * mechanism).
++		 */
++		bfqq->budget_timeout = jiffies;
++
++		bfq_weights_tree_remove(bfqd, bfqq);
++	}
++
++	now_ns = ktime_get_ns();
++
++	bfqq->ttime.last_end_request = now_ns;
++
++	/*
++	 * Using us instead of ns, to get a reasonable precision in
++	 * computing rate in next check.
++	 */
++	delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
++
++	bfq_log_bfqq(bfqd, bfqq,
++		"delta %uus/%luus max_size %u rate %llu/%llu",
++		delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size,
++		delta_us > 0 ?
++		(USEC_PER_SEC*
++		(u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us))
++			>>BFQ_RATE_SHIFT :
++		(USEC_PER_SEC*
++		(u64)(bfqd->last_rq_max_size<<BFQ_RATE_SHIFT))>>BFQ_RATE_SHIFT,
++		(USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT);
++
++	/*
++	 * If the request took rather long to complete, and, according
++	 * to the maximum request size recorded, this completion latency
++	 * implies that the request was certainly served at a very low
++	 * rate (less than 1M sectors/sec), then the whole observation
++	 * interval that lasts up to this time instant cannot be a
++	 * valid time interval for computing a new peak rate. Invoke
++	 * bfq_update_rate_reset to have the following three steps
++	 * taken:
++	 * - close the observation interval at the last (previous)
++	 *   request dispatch or completion
++	 * - compute rate, if possible, for that observation interval
++	 * - reset to zero samples, which will trigger a proper
++	 *   re-initialization of the observation interval on next
++	 *   dispatch
++	 */
++	if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
++	   (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
++			1UL<<(BFQ_RATE_SHIFT - 10))
++		bfq_update_rate_reset(bfqd, NULL);
++	bfqd->last_completion = now_ns;
++
++	/*
++	 * If we are waiting to discover whether the request pattern
++	 * of the task associated with the queue is actually
++	 * isochronous, and both requisites for this condition to hold
++	 * are now satisfied, then compute soft_rt_next_start (see the
++	 * comments on the function bfq_bfqq_softrt_next_start()). We
++	 * do not compute soft_rt_next_start if bfqq is in interactive
++	 * weight raising (see the comments in bfq_bfqq_expire() for
++	 * an explanation). We schedule this delayed update when bfqq
++	 * expires, if it still has in-flight requests.
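++	 *
++	 * Both requisites are checked below: the softrt_update flag
++	 * set at expiration, and the fact that, with this
++	 * completion, bfqq is left with no dispatched and no queued
++	 * requests; being in interactive weight raising is excluded
++	 * by re-checking wr_coeff against bfqd->bfq_wr_coeff.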
++	 */
++	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
++	    RB_EMPTY_ROOT(&bfqq->sort_list) &&
++	    bfqq->wr_coeff != bfqd->bfq_wr_coeff)
++		bfqq->soft_rt_next_start =
++			bfq_bfqq_softrt_next_start(bfqd, bfqq);
++
++	/*
++	 * If this is the in-service queue, check if it needs to be expired,
++	 * or if we want to idle in case it has no pending requests.
++	 */
++	if (bfqd->in_service_queue == bfqq) {
++		if (bfq_bfqq_must_idle(bfqq)) {
++			if (bfqq->dispatched == 0)
++				bfq_arm_slice_timer(bfqd);
++			/*
++			 * If we get here, we do not expire bfqq, even
++			 * if bfqq was in budget timeout or had no
++			 * more requests (as controlled in the next
++			 * conditional instructions). The reason for
++			 * not expiring bfqq is as follows.
++			 *
++			 * Here bfqq->dispatched > 0 holds, but
++			 * bfq_bfqq_must_idle() returned true. This
++			 * implies that, even if no request arrives
++			 * for bfqq before bfqq->dispatched reaches 0,
++			 * bfqq will, however, not be expired on the
++			 * completion event that causes bfqq->dispatched
++			 * to reach zero. In contrast, on this event,
++			 * bfqq will start enjoying device idling
++			 * (I/O-dispatch plugging).
++			 *
++			 * But, if we expired bfqq here, bfqq would
++			 * not have the chance to enjoy device idling
++			 * when bfqq->dispatched finally reaches
++			 * zero. This would expose bfqq to violation
++			 * of its reserved service guarantees.
++			 */
++			return;
++		} else if (bfq_may_expire_for_budg_timeout(bfqq))
++			bfq_bfqq_expire(bfqd, bfqq, false,
++					BFQ_BFQQ_BUDGET_TIMEOUT);
++		else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
++			 (bfqq->dispatched == 0 ||
++			  !bfq_better_to_idle(bfqq)))
++			bfq_bfqq_expire(bfqd, bfqq, false,
++					BFQ_BFQQ_NO_MORE_REQUESTS);
++	}
++}
++
++static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq)
++{
++	bfq_log_bfqq(bfqq->bfqd, bfqq,
++		     "allocated %d", bfqq->allocated);
++	BUG_ON(!bfqq->allocated);
++	bfqq->allocated--;
++
++	bfq_put_queue(bfqq);
++}
++
++/*
++ * Handle either a requeue or a finish for rq. The things to do are
++ * the same in both cases: all references to rq are to be dropped. In
++ * particular, rq is considered completed from the point of view of
++ * the scheduler.
++ */
++static void bfq_finish_requeue_request(struct request *rq)
++{
++	struct bfq_queue *bfqq;
++	struct bfq_data *bfqd;
++	struct bfq_io_cq *bic;
++
++	BUG_ON(!rq);
++
++	bfqq = RQ_BFQQ(rq);
++
++	/*
++	 * Requeue and finish hooks are invoked in blk-mq without
++	 * checking whether the involved request is actually still
++	 * referenced in the scheduler. To handle this fact, the
++	 * following two checks make this function exit in case of
++	 * spurious invocations, for which there is nothing to do.
++	 *
++	 * First, check whether rq has nothing to do with an elevator.
++	 */
++	if (unlikely(!(rq->rq_flags & RQF_ELVPRIV)))
++		return;
++
++	/*
++	 * rq either is not associated with any icq, or is an already
++	 * requeued request that has not (yet) been re-inserted into
++	 * a bfq_queue.
++ */ ++ if (!rq->elv.icq || !bfqq) ++ return; ++ ++ bic = RQ_BIC(rq); ++ BUG_ON(!bic); ++ ++ bfqd = bfqq->bfqd; ++ BUG_ON(!bfqd); ++ ++ if (rq->rq_flags & RQF_DISP_LIST) { ++ pr_crit("putting disp rq %p for %d", rq, bfqq->pid); ++ BUG(); ++ } ++ BUG_ON(rq->rq_flags & RQF_QUEUED); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "putting rq %p with %u sects left, STARTED %d", ++ rq, blk_rq_sectors(rq), ++ rq->rq_flags & RQF_STARTED); ++ ++ if (rq->rq_flags & RQF_STARTED) ++ bfqg_stats_update_completion(bfqq_group(bfqq), ++ rq->start_time_ns, ++ rq->io_start_time_ns, ++ rq->cmd_flags); ++ ++ WARN_ON(blk_rq_sectors(rq) == 0 && !(rq->rq_flags & RQF_STARTED)); ++ ++ if (likely(rq->rq_flags & RQF_STARTED)) { ++ unsigned long flags; ++ ++ spin_lock_irqsave(&bfqd->lock, flags); ++ ++ bfq_completed_request(bfqq, bfqd); ++ bfq_finish_requeue_request_body(bfqq); ++ ++ spin_unlock_irqrestore(&bfqd->lock, flags); ++ } else { ++ /* ++ * Request rq may be still/already in the scheduler, ++ * in which case we need to remove it (this should ++ * never happen in case of requeue). And we cannot ++ * defer such a check and removal, to avoid ++ * inconsistencies in the time interval from the end ++ * of this function to the start of the deferred work. ++ * This situation seems to occur only in process ++ * context, as a consequence of a merge. In the ++ * current version of the code, this implies that the ++ * lock is held. ++ */ ++ BUG_ON(in_interrupt()); ++ ++ assert_spin_locked(&bfqd->lock); ++ if (!RB_EMPTY_NODE(&rq->rb_node)) { ++ bfq_remove_request(rq->q, rq); ++ bfqg_stats_update_io_remove(bfqq_group(bfqq), ++ rq->cmd_flags); ++ } ++ bfq_finish_requeue_request_body(bfqq); ++ } ++ ++ /* ++ * Reset private fields. In case of a requeue, this allows ++ * this function to correctly do nothing if it is spuriously ++ * invoked again on this same request (see the check at the ++ * beginning of the function). Probably, a better general ++ * design would be to prevent blk-mq from invoking the requeue ++ * or finish hooks of an elevator, for a request that is not ++ * referred by that elevator. ++ * ++ * Resetting the following fields would break the ++ * request-insertion logic if rq is re-inserted into a bfq ++ * internal queue, without a re-preparation. Here we assume ++ * that re-insertions of requeued requests, without ++ * re-preparation, can happen only for pass_through or at_head ++ * requests (which are not re-inserted into bfq internal ++ * queues). ++ */ ++ rq->elv.priv[0] = NULL; ++ rq->elv.priv[1] = NULL; ++} ++ ++/* ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this ++ * was the last process referring to that bfqq. 
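++ *
++ * [Editor's note; not part of the original patch. Concretely: if two
++ * or more processes still share bfqq, bfqq_process_refs(bfqq) > 1,
++ * so the function drops this process' reference and returns NULL;
++ * the caller (see bfq_init_rq() below) then allocates a fresh queue
++ * through bfq_get_bfqq_handle_split().]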
++ */ ++static struct bfq_queue * ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); ++ ++ if (bfqq_process_refs(bfqq) == 1) { ++ bfqq->pid = current->pid; ++ bfq_clear_bfqq_coop(bfqq); ++ bfq_clear_bfqq_split_coop(bfqq); ++ return bfqq; ++ } ++ ++ bic_set_bfqq(bic, NULL, 1); ++ ++ bfq_put_cooperator(bfqq); ++ ++ bfq_put_queue(bfqq); ++ return NULL; ++} ++ ++static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, ++ struct bfq_io_cq *bic, ++ struct bio *bio, ++ bool split, bool is_sync, ++ bool *new_queue) ++{ ++ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); ++ ++ if (likely(bfqq && bfqq != &bfqd->oom_bfqq)) ++ return bfqq; ++ ++ if (new_queue) ++ *new_queue = true; ++ ++ if (bfqq) ++ bfq_put_queue(bfqq); ++ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); ++ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); ++ ++ bic_set_bfqq(bic, bfqq, is_sync); ++ if (split && is_sync) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "get_request: was_in_list %d " ++ "was_in_large_burst %d " ++ "large burst in progress %d", ++ bic->was_in_burst_list, ++ bic->saved_in_large_burst, ++ bfqd->large_burst); ++ ++ if ((bic->was_in_burst_list && bfqd->large_burst) || ++ bic->saved_in_large_burst) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "get_request: marking in " ++ "large burst"); ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ } else { ++ bfq_log_bfqq(bfqd, bfqq, ++ "get_request: clearing in " ++ "large burst"); ++ bfq_clear_bfqq_in_large_burst(bfqq); ++ if (bic->was_in_burst_list) ++ /* ++ * If bfqq was in the current ++ * burst list before being ++ * merged, then we have to add ++ * it back. And we do not need ++ * to increase burst_size, as ++ * we did not decrement ++ * burst_size when we removed ++ * bfqq from the burst list as ++ * a consequence of a merge ++ * (see comments in ++ * bfq_put_queue). In this ++ * respect, it would be rather ++ * costly to know whether the ++ * current burst list is still ++ * the same burst list from ++ * which bfqq was removed on ++ * the merge. To avoid this ++ * cost, if bfqq was in a ++ * burst list, then we add ++ * bfqq to the current burst ++ * list without any further ++ * check. This can cause ++ * inappropriate insertions, ++ * but rarely enough to not ++ * harm the detection of large ++ * bursts significantly. ++ */ ++ hlist_add_head(&bfqq->burst_list_node, ++ &bfqd->burst_list); ++ } ++ bfqq->split_time = jiffies; ++ } ++ ++ return bfqq; ++} ++ ++/* ++ * Only reset private fields. The actual request preparation will be ++ * performed by bfq_init_rq, when rq is either inserted or merged. See ++ * comments on bfq_init_rq for the reason behind this delayed ++ * preparation. ++*/ ++static void bfq_prepare_request(struct request *rq, struct bio *bio) ++{ ++ /* ++ * Regardless of whether we have an icq attached, we have to ++ * clear the scheduler pointers, as they might point to ++ * previously allocated bic/bfqq structs. ++ */ ++ rq->elv.priv[0] = rq->elv.priv[1] = NULL; ++} ++ ++/* ++ * If needed, init rq, allocate bfq data structures associated with ++ * rq, and increment reference counters in the destination bfq_queue ++ * for rq. Return the destination bfq_queue for rq, or NULL is rq is ++ * not associated with any bfq_queue. ++ * ++ * This function is invoked by the functions that perform rq insertion ++ * or merging. One may have expected the above preparation operations ++ * to be performed in bfq_prepare_request, and not delayed to when rq ++ * is inserted or merged. 
The rationale behind this delayed
++ * preparation is that, after the prepare_request hook is invoked for
++ * rq, rq may still be transformed into a request with no icq, i.e., a
++ * request not associated with any queue. No bfq hook is invoked to
++ * signal this transformation. As a consequence, should these
++ * preparation operations be performed when the prepare_request hook
++ * is invoked, and should rq be transformed one moment later, bfq
++ * would end up in an inconsistent state, because it would have
++ * incremented some queue counters for an rq destined to
++ * transformation, without any chance to correctly lower these
++ * counters back. In contrast, no transformation can still happen for
++ * rq after rq has been inserted or merged. So, it is safe to execute
++ * these preparation operations when rq is finally inserted or merged.
++ */
++static struct bfq_queue *bfq_init_rq(struct request *rq)
++{
++	struct request_queue *q = rq->q;
++	struct bio *bio = rq->bio;
++	struct bfq_data *bfqd = q->elevator->elevator_data;
++	struct bfq_io_cq *bic;
++	const int is_sync = rq_is_sync(rq);
++	struct bfq_queue *bfqq;
++	bool bfqq_already_existing = false, split = false;
++	bool new_queue = false;
++
++	if (unlikely(!rq->elv.icq))
++		return NULL;
++
++	/*
++	 * Assuming that elv.priv[1] is set only if everything is set
++	 * for this rq. This holds true, because this function is
++	 * invoked only for insertion or merging, and, after such
++	 * events, a request cannot be manipulated any longer before
++	 * being removed from bfq.
++	 */
++	if (rq->elv.priv[1]) {
++		BUG_ON(!(rq->rq_flags & RQF_ELVPRIV));
++		return rq->elv.priv[1];
++	}
++
++	bic = icq_to_bic(rq->elv.icq);
++
++	bfq_check_ioprio_change(bic, bio);
++
++	bfq_bic_update_cgroup(bic, bio);
++
++	bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync,
++					 &new_queue);
++
++	if (likely(!new_queue)) {
++		/* If the queue was seeky for too long, break it apart. */
++		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
++			BUG_ON(!is_sync);
++			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
++
++			/* Update bic before losing reference to bfqq */
++			if (bfq_bfqq_in_large_burst(bfqq))
++				bic->saved_in_large_burst = true;
++
++			bfqq = bfq_split_bfqq(bic, bfqq);
++			split = true;
++
++			if (!bfqq)
++				bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
++								 true, is_sync,
++								 NULL);
++			else
++				bfqq_already_existing = true;
++
++			BUG_ON(!bfqq);
++			BUG_ON(bfqq == &bfqd->oom_bfqq);
++		}
++	}
++
++	bfqq->allocated++;
++	bfq_log_bfqq(bfqq->bfqd, bfqq,
++		     "new allocated %d", bfqq->allocated);
++
++	bfqq->ref++;
++	bfq_log_bfqq(bfqd, bfqq, "%p: bfqq %p, %d", rq, bfqq, bfqq->ref);
++
++	rq->elv.priv[0] = bic;
++	rq->elv.priv[1] = bfqq;
++	rq->rq_flags &= ~RQF_DISP_LIST;
++
++	/*
++	 * If a bfq_queue has only one process reference, it is owned
++	 * by only this bic: we can then set bfqq->bic = bic. In
++	 * addition, if the queue has also just been split, we have to
++	 * resume its state.
++	 */
++	if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
++		bfqq->bic = bic;
++		if (split) {
++			/*
++			 * The queue has just been split from a shared
++			 * queue: restore the idle window and the
++			 * possible weight raising period.
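++			 * [Editor's note; not part of the original
++			 * patch. The state being resumed is the set of
++			 * saved_* fields kept in the bfq_io_cq (see
++			 * bfq-mq.h below), e.g. saved_wr_coeff and
++			 * saved_ttime, which bfq_bfqq_resume_state()
++			 * copies back into the newly split queue.]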
++ */ ++ bfq_bfqq_resume_state(bfqq, bfqd, bic, ++ bfqq_already_existing); ++ } ++ } ++ ++ if (unlikely(bfq_bfqq_just_created(bfqq))) ++ bfq_handle_burst(bfqd, bfqq); ++ ++ rq->rq_flags |= RQF_GOT; ++ ++ return bfqq; ++} ++ ++static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) ++{ ++ struct bfq_data *bfqd = bfqq->bfqd; ++ enum bfqq_expiration reason; ++ unsigned long flags; ++ ++ BUG_ON(!bfqd); ++ spin_lock_irqsave(&bfqd->lock, flags); ++ ++ bfq_log_bfqq(bfqd, bfqq, "handling slice_timer expiration"); ++ bfq_clear_bfqq_wait_request(bfqq); ++ ++ if (bfqq != bfqd->in_service_queue) { ++ spin_unlock_irqrestore(&bfqd->lock, flags); ++ return; ++ } ++ ++ if (bfq_bfqq_budget_timeout(bfqq)) ++ /* ++ * Also here the queue can be safely expired ++ * for budget timeout without wasting ++ * guarantees ++ */ ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) ++ /* ++ * The queue may not be empty upon timer expiration, ++ * because we may not disable the timer when the ++ * first request of the in-service queue arrives ++ * during disk idling. ++ */ ++ reason = BFQ_BFQQ_TOO_IDLE; ++ else ++ goto schedule_dispatch; ++ ++ bfq_bfqq_expire(bfqd, bfqq, true, reason); ++ ++schedule_dispatch: ++ spin_unlock_irqrestore(&bfqd->lock, flags); ++ bfq_schedule_dispatch(bfqd); ++} ++ ++/* ++ * Handler of the expiration of the timer running if the in-service queue ++ * is idling inside its time slice. ++ */ ++static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) ++{ ++ struct bfq_data *bfqd = container_of(timer, struct bfq_data, ++ idle_slice_timer); ++ struct bfq_queue *bfqq = bfqd->in_service_queue; ++ ++ bfq_log(bfqd, "expired"); ++ ++ /* ++ * Theoretical race here: the in-service queue can be NULL or ++ * different from the queue that was idling if a new request ++ * arrives for the current queue and there is a full dispatch ++ * cycle that changes the in-service queue. This can hardly ++ * happen, but in the worst case we just expire a queue too ++ * early. ++ */ ++ if (bfqq) ++ bfq_idle_slice_timer_body(bfqq); ++ ++ return HRTIMER_NORESTART; ++} ++ ++static void __bfq_put_async_bfqq(struct bfq_data *bfqd, ++ struct bfq_queue **bfqq_ptr) ++{ ++ struct bfq_group *root_group = bfqd->root_group; ++ struct bfq_queue *bfqq = *bfqq_ptr; ++ ++ bfq_log(bfqd, "%p", bfqq); ++ if (bfqq) { ++ bfq_bfqq_move(bfqd, bfqq, root_group); ++ bfq_log_bfqq(bfqd, bfqq, "putting %p, %d", ++ bfqq, bfqq->ref); ++ bfq_put_queue(bfqq); ++ *bfqq_ptr = NULL; ++ } ++} ++ ++/* ++ * Release all the bfqg references to its async queues. If we are ++ * deallocating the group these queues may still contain requests, so ++ * we reparent them to the root cgroup (i.e., the only one that will ++ * exist for sure until all the requests on a device are gone). ++ */ ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) ++{ ++ int i, j; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_BE_NR; j++) ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); ++ ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); ++} ++ ++/* ++ * See the comments on bfq_limit_depth for the purpose of ++ * the depths set in the function. Return minimum shallow depth we'll use. ++ */ ++static unsigned int bfq_update_depths(struct bfq_data *bfqd, ++ struct sbitmap_queue *bt) ++{ ++ unsigned int i, j, min_shallow = UINT_MAX; ++ ++ /* ++ * In-word depths if no bfq_queue is being weight-raised: ++ * leaving 25% of tags only for sync reads. 
++	 *
++	 * In next formulas, right-shift the value
++	 * (1U<<bt->sb.shift), instead of computing directly
++	 * (1U<<(bt->sb.shift - something)), to be robust against
++	 * any possible value of bt->sb.shift, without having to
++	 * limit 'something'.
++	 */
++	/* no more than 50% of tags for async I/O */
++	bfqd->word_depths[0][0] = max((1U<<bt->sb.shift)>>1, 1U);
++	/*
++	 * no more than 75% of tags for sync writes (25% extra tags
++	 * w.r.t. async I/O, to prevent async I/O from starving sync
++	 * writes)
++	 */
++	bfqd->word_depths[0][1] = max(((1U<<bt->sb.shift) * 3)>>2, 1U);
++
++	/*
++	 * In-word depths in case some bfq_queue is being weight-
++	 * raised: leaving ~63% of tags for sync reads. This is the
++	 * highest percentage for which, in our tests, application
++	 * start-up times didn't suffer from any regression due to tag
++	 * shortage.
++	 */
++	/* no more than ~18% of tags for async I/O */
++	bfqd->word_depths[1][0] = max(((1U<<bt->sb.shift) * 3)>>4, 1U);
++	/* no more than ~37% of tags for sync writes (~20% extra tags) */
++	bfqd->word_depths[1][1] = max(((1U<<bt->sb.shift) * 6)>>4, 1U);
++
++	for (i = 0; i < 2; i++)
++		for (j = 0; j < 2; j++)
++			min_shallow = min(min_shallow, bfqd->word_depths[i][j]);
++
++	return min_shallow;
++}
++
++static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
++{
++	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
++	struct blk_mq_tags *tags = hctx->sched_tags;
++	unsigned int min_shallow;
++
++	min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
++	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
++}
++
++static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
++{
++	bfq_depth_updated(hctx);
++	return 0;
++}
++
++static void bfq_exit_queue(struct elevator_queue *e)
++{
++	struct bfq_data *bfqd = e->elevator_data;
++	struct bfq_queue *bfqq, *n;
++
++	bfq_log(bfqd, "starting ...");
++
++	hrtimer_cancel(&bfqd->idle_slice_timer);
++
++	BUG_ON(bfqd->in_service_queue);
++	BUG_ON(!list_empty(&bfqd->active_list));
++
++	spin_lock_irq(&bfqd->lock);
++	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
++		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
++	spin_unlock_irq(&bfqd->lock);
++
++	hrtimer_cancel(&bfqd->idle_slice_timer);
++
++	BUG_ON(hrtimer_active(&bfqd->idle_slice_timer));
++
++#ifdef BFQ_GROUP_IOSCHED_ENABLED
++	/* release oom-queue reference to root group */
++	bfqg_and_blkg_put(bfqd->root_group);
++
++	blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
++#else
++	spin_lock_irq(&bfqd->lock);
++	bfq_put_async_queues(bfqd, bfqd->root_group);
++	kfree(bfqd->root_group);
++	spin_unlock_irq(&bfqd->lock);
++#endif
++
++	bfq_log(bfqd, "finished ...");
++	kfree(bfqd);
++}
++
++static void bfq_init_root_group(struct bfq_group *root_group,
++				struct bfq_data *bfqd)
++{
++	int i;
++
++#ifdef BFQ_GROUP_IOSCHED_ENABLED
++	root_group->entity.parent = NULL;
++	root_group->my_entity = NULL;
++	root_group->bfqd = bfqd;
++#endif
++	root_group->rq_pos_tree = RB_ROOT;
++	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
++		root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
++	root_group->sched_data.bfq_class_idle_last_service = jiffies;
++}
++
++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
++{
++	struct bfq_data *bfqd;
++	struct elevator_queue *eq;
++
++	eq = elevator_alloc(q, e);
++	if (!eq)
++		return -ENOMEM;
++
++	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
++	if (!bfqd) {
++		kobject_put(&eq->kobj);
++		return -ENOMEM;
++	}
++	eq->elevator_data = bfqd;
++
++
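++	/*
++	 * [Editor's note; not part of the original patch. Worked
++	 * example for bfq_update_depths() above, assuming
++	 * bt->sb.shift == 6, i.e. 64 tags per sbitmap word:
++	 *
++	 *   word_depths[0][0] = 64 >> 1       = 32  (50%)
++	 *   word_depths[0][1] = (64 * 3) >> 2 = 48  (75%)
++	 *   word_depths[1][0] = (64 * 3) >> 4 = 12  (~18%)
++	 *   word_depths[1][1] = (64 * 6) >> 4 = 24  (~37%)
++	 *
++	 * so min_shallow is 12, which bfq_depth_updated() passes to
++	 * sbitmap_queue_min_shallow_depth().]
++	 */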
spin_lock_irq(q->queue_lock); ++ q->elevator = eq; ++ spin_unlock_irq(q->queue_lock); ++ ++ /* ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. ++ * Grab a permanent reference to it, so that the normal code flow ++ * will not attempt to free it. ++ */ ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); ++ bfqd->oom_bfqq.ref++; ++ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; ++ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; ++ bfqd->oom_bfqq.entity.new_weight = ++ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); ++ ++ /* oom_bfqq does not participate to bursts */ ++ bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); ++ /* ++ * Trigger weight initialization, according to ioprio, at the ++ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio ++ * class won't be changed any more. ++ */ ++ bfqd->oom_bfqq.entity.prio_changed = 1; ++ ++ bfqd->queue = q; ++ INIT_LIST_HEAD(&bfqd->dispatch); ++ ++ hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, ++ HRTIMER_MODE_REL); ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; ++ ++ bfqd->queue_weights_tree = RB_ROOT; ++ bfqd->num_groups_with_pending_reqs = 0; ++ ++ INIT_LIST_HEAD(&bfqd->active_list); ++ INIT_LIST_HEAD(&bfqd->idle_list); ++ INIT_HLIST_HEAD(&bfqd->burst_list); ++ ++ bfqd->hw_tag = -1; ++ ++ bfqd->bfq_max_budget = bfq_default_max_budget; ++ ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; ++ bfqd->bfq_back_max = bfq_back_max; ++ bfqd->bfq_back_penalty = bfq_back_penalty; ++ bfqd->bfq_slice_idle = bfq_slice_idle; ++ bfqd->bfq_timeout = bfq_timeout; ++ ++ bfqd->bfq_requests_within_timer = 120; ++ ++ bfqd->bfq_large_burst_thresh = 8; ++ bfqd->bfq_burst_interval = msecs_to_jiffies(180); ++ ++ bfqd->low_latency = true; ++ ++ /* ++ * Trade-off between responsiveness and fairness. ++ */ ++ bfqd->bfq_wr_coeff = 30; ++ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); ++ bfqd->bfq_wr_max_time = 0; ++ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); ++ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); ++ bfqd->bfq_wr_max_softrt_rate = 7000; /* ++ * Approximate rate required ++ * to playback or record a ++ * high-definition compressed ++ * video. ++ */ ++ bfqd->wr_busy_queues = 0; ++ ++ /* ++ * Begin by assuming, optimistically, that the device peak ++ * rate is equal to 2/3 of the highest reference rate. ++ */ ++ bfqd->rate_dur_prod = ref_rate[blk_queue_nonrot(bfqd->queue)] * ++ ref_wr_duration[blk_queue_nonrot(bfqd->queue)]; ++ bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3; ++ ++ spin_lock_init(&bfqd->lock); ++ ++ /* ++ * The invocation of the next bfq_create_group_hierarchy ++ * function is the head of a chain of function calls ++ * (bfq_create_group_hierarchy->blkcg_activate_policy-> ++ * blk_mq_freeze_queue) that may lead to the invocation of the ++ * has_work hook function. For this reason, ++ * bfq_create_group_hierarchy is invoked only after all ++ * scheduler data has been initialized, apart from the fields ++ * that can be initialized only after invoking ++ * bfq_create_group_hierarchy. This, in particular, enables ++ * has_work to correctly return false. Of course, to avoid ++ * other inconsistencies, the blk-mq stack must then refrain ++ * from invoking further scheduler hooks before this init ++ * function is finished. 
++ */ ++ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); ++ if (!bfqd->root_group) ++ goto out_free; ++ bfq_init_root_group(bfqd->root_group, bfqd); ++ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); ++ ++ wbt_disable_default(q); ++ return 0; ++ ++out_free: ++ kfree(bfqd); ++ kobject_put(&eq->kobj); ++ return -ENOMEM; ++} ++ ++static void bfq_slab_kill(void) ++{ ++ kmem_cache_destroy(bfq_pool); ++} ++ ++static int __init bfq_slab_setup(void) ++{ ++ bfq_pool = KMEM_CACHE(bfq_queue, 0); ++ if (!bfq_pool) ++ return -ENOMEM; ++ return 0; ++} ++ ++static ssize_t bfq_var_show(unsigned int var, char *page) ++{ ++ return sprintf(page, "%u\n", var); ++} ++ ++static ssize_t bfq_var_store(unsigned long *var, const char *page, ++ size_t count) ++{ ++ unsigned long new_val; ++ int ret = kstrtoul(page, 10, &new_val); ++ ++ if (ret == 0) ++ *var = new_val; ++ ++ return count; ++} ++ ++static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ ++ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? ++ jiffies_to_msecs(bfqd->bfq_wr_max_time) : ++ jiffies_to_msecs(bfq_wr_duration(bfqd))); ++} ++ ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_queue *bfqq; ++ struct bfq_data *bfqd = e->elevator_data; ++ ssize_t num_char = 0; ++ ++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", ++ bfqd->queued); ++ ++ spin_lock_irq(&bfqd->lock); ++ ++ num_char += sprintf(page + num_char, "Active:\n"); ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, nr_queued %d %d, ", ++ bfqq->pid, ++ bfqq->entity.weight, ++ bfqq->queued[0], ++ bfqq->queued[1]); ++ num_char += sprintf(page + num_char, ++ "dur %d/%u\n", ++ jiffies_to_msecs( ++ jiffies - ++ bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ ++ num_char += sprintf(page + num_char, "Idle:\n"); ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, dur %d/%u\n", ++ bfqq->pid, ++ bfqq->entity.weight, ++ jiffies_to_msecs(jiffies - ++ bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ ++ spin_unlock_irq(&bfqd->lock); ++ ++ return num_char; ++} ++ ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ u64 __data = __VAR; \ ++ if (__CONV == 1) \ ++ __data = jiffies_to_msecs(__data); \ ++ else if (__CONV == 2) \ ++ __data = div_u64(__data, NSEC_PER_MSEC); \ ++ return bfq_var_show(__data, (page)); \ ++} ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); ++SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); ++SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); ++SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); ++SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); 
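++
++/*
++ * [Editor's note; not part of the original patch. These SHOW/STORE
++ * pairs back the sysfs attributes declared in bfq_attrs[] below.
++ * Illustrative usage, assuming a disk exposed as /dev/sda with
++ * bfq-mq selected:
++ *
++ *   echo 8 > /sys/block/sda/queue/iosched/slice_idle
++ *
++ * goes through bfq_slice_idle_store() with __CONV == 2, storing
++ * 8 * NSEC_PER_MSEC = 8000000 ns in bfqd->bfq_slice_idle, while the
++ * slice_idle_us variants read and write the same field in
++ * microseconds.]
++ */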
++SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, ++ 1); ++SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); ++#undef SHOW_FUNCTION ++ ++#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ u64 __data = __VAR; \ ++ __data = div_u64(__data, NSEC_PER_USEC); \ ++ return bfq_var_show(__data, (page)); \ ++} ++USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); ++#undef USEC_SHOW_FUNCTION ++ ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ ++static ssize_t \ ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned long uninitialized_var(__data); \ ++ int ret = bfq_var_store(&__data, (page), count); \ ++ if (__data < (MIN)) \ ++ __data = (MIN); \ ++ else if (__data > (MAX)) \ ++ __data = (MAX); \ ++ if (__CONV == 1) \ ++ *(__PTR) = msecs_to_jiffies(__data); \ ++ else if (__CONV == 2) \ ++ *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ ++ else \ ++ *(__PTR) = __data; \ ++ return ret; \ ++} ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, ++ INT_MAX, 2); ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, ++ INT_MAX, 2); ++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, ++ INT_MAX, 0); ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); ++STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); ++STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, ++ 1); ++STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, ++ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, ++ INT_MAX, 0); ++#undef STORE_FUNCTION ++ ++#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ ++static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned long uninitialized_var(__data); \ ++ int ret = bfq_var_store(&__data, (page), count); \ ++ if (__data < (MIN)) \ ++ __data = (MIN); \ ++ else if (__data > (MAX)) \ ++ __data = (MAX); \ ++ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ ++ return ret; \ ++} ++USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, ++ UINT_MAX); ++#undef USEC_STORE_FUNCTION ++ ++/* do nothing for the moment */ ++static ssize_t bfq_weights_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ return count; ++} ++ ++static ssize_t bfq_max_budget_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data == 0) ++ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); ++ else { ++ if (__data > INT_MAX) ++ __data = INT_MAX; ++ bfqd->bfq_max_budget = __data; ++ } ++ ++ bfqd->bfq_user_max_budget = __data; ++ ++ return ret; ++} ++ ++/* ++ * Leaving this name to preserve name compatibility with cfq ++ * parameters, but this timeout is used for both sync and async. 
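++ *
++ * [Editor's note; not part of the original patch. For instance,
++ * "echo 200 > /sys/block/sda/queue/iosched/timeout_sync" (device
++ * name assumed as above) sets bfqd->bfq_timeout to
++ * msecs_to_jiffies(200), which is then applied to sync and async
++ * queues alike; it also triggers bfq_calc_max_budget() when
++ * max_budget is auto-tuned.]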
++ */ ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data < 1) ++ __data = 1; ++ else if (__data > INT_MAX) ++ __data = INT_MAX; ++ ++ bfqd->bfq_timeout = msecs_to_jiffies(__data); ++ if (bfqd->bfq_user_max_budget == 0) ++ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); ++ ++ return ret; ++} ++ ++static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data > 1) ++ __data = 1; ++ if (!bfqd->strict_guarantees && __data == 1 ++ && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) ++ bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; ++ ++ bfqd->strict_guarantees = __data; ++ ++ return ret; ++} ++ ++static ssize_t bfq_low_latency_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data > 1) ++ __data = 1; ++ if (__data == 0 && bfqd->low_latency != 0) ++ bfq_end_wr(bfqd); ++ bfqd->low_latency = __data; ++ ++ return ret; ++} ++ ++#define BFQ_ATTR(name) \ ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) ++ ++static struct elv_fs_entry bfq_attrs[] = { ++ BFQ_ATTR(fifo_expire_sync), ++ BFQ_ATTR(fifo_expire_async), ++ BFQ_ATTR(back_seek_max), ++ BFQ_ATTR(back_seek_penalty), ++ BFQ_ATTR(slice_idle), ++ BFQ_ATTR(slice_idle_us), ++ BFQ_ATTR(max_budget), ++ BFQ_ATTR(timeout_sync), ++ BFQ_ATTR(strict_guarantees), ++ BFQ_ATTR(low_latency), ++ BFQ_ATTR(wr_coeff), ++ BFQ_ATTR(wr_max_time), ++ BFQ_ATTR(wr_rt_max_time), ++ BFQ_ATTR(wr_min_idle_time), ++ BFQ_ATTR(wr_min_inter_arr_async), ++ BFQ_ATTR(wr_max_softrt_rate), ++ BFQ_ATTR(weights), ++ __ATTR_NULL ++}; ++ ++static struct elevator_type iosched_bfq_mq = { ++ .ops.mq = { ++ .limit_depth = bfq_limit_depth, ++ .prepare_request = bfq_prepare_request, ++ .requeue_request = bfq_finish_requeue_request, ++ .finish_request = bfq_finish_requeue_request, ++ .exit_icq = bfq_exit_icq, ++ .insert_requests = bfq_insert_requests, ++ .dispatch_request = bfq_dispatch_request, ++ .next_request = elv_rb_latter_request, ++ .former_request = elv_rb_former_request, ++ .allow_merge = bfq_allow_bio_merge, ++ .bio_merge = bfq_bio_merge, ++ .request_merge = bfq_request_merge, ++ .requests_merged = bfq_requests_merged, ++ .request_merged = bfq_request_merged, ++ .has_work = bfq_has_work, ++ .depth_updated = bfq_depth_updated, ++ .init_hctx = bfq_init_hctx, ++ .init_sched = bfq_init_queue, ++ .exit_sched = bfq_exit_queue, ++ }, ++ ++ .uses_mq = true, ++ .icq_size = sizeof(struct bfq_io_cq), ++ .icq_align = __alignof__(struct bfq_io_cq), ++ .elevator_attrs = bfq_attrs, ++ .elevator_name = "bfq-mq", ++ .elevator_owner = THIS_MODULE, ++}; ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static struct blkcg_policy blkcg_policy_bfq = { ++ .dfl_cftypes = bfq_blkg_files, ++ .legacy_cftypes = bfq_blkcg_legacy_files, ++ ++ .cpd_alloc_fn = bfq_cpd_alloc, ++ .cpd_init_fn = bfq_cpd_init, ++ .cpd_bind_fn = bfq_cpd_init, ++ .cpd_free_fn = bfq_cpd_free, ++ ++ .pd_alloc_fn = bfq_pd_alloc, ++ .pd_init_fn = bfq_pd_init, ++ .pd_offline_fn = bfq_pd_offline, ++ .pd_free_fn = bfq_pd_free, ++ .pd_reset_stats_fn = 
bfq_pd_reset_stats,
++};
++#endif
++
++static int __init bfq_init(void)
++{
++	int ret;
++	char msg[60] = "BFQ I/O-scheduler: v9";
++
++#ifdef BFQ_GROUP_IOSCHED_ENABLED
++	ret = blkcg_policy_register(&blkcg_policy_bfq);
++	if (ret)
++		return ret;
++#endif
++
++	ret = -ENOMEM;
++	if (bfq_slab_setup())
++		goto err_pol_unreg;
++
++	/*
++	 * Times to load large popular applications for the typical
++	 * systems installed on the reference devices (see the
++	 * comments before the definition of the next
++	 * array). Actually, we use slightly lower values, as the
++	 * estimated peak rate tends to be smaller than the actual
++	 * peak rate. The reason for this last fact is that estimates
++	 * are computed over much shorter time intervals than the long
++	 * intervals typically used for benchmarking. Why? First, to
++	 * adapt more quickly to variations. Second, because an I/O
++	 * scheduler cannot rely on a peak-rate-evaluation workload to
++	 * be run for a long time.
++	 */
++	ref_wr_duration[0] = msecs_to_jiffies(7000); /* actually 8 sec */
++	ref_wr_duration[1] = msecs_to_jiffies(2500); /* actually 3 sec */
++
++	ret = elv_register(&iosched_bfq_mq);
++	if (ret)
++		goto slab_kill;
++
++#ifdef BFQ_GROUP_IOSCHED_ENABLED
++	strcat(msg, " (with cgroups support)");
++#endif
++	pr_info("%s", msg);
++
++	return 0;
++
++slab_kill:
++	bfq_slab_kill();
++err_pol_unreg:
++#ifdef BFQ_GROUP_IOSCHED_ENABLED
++	blkcg_policy_unregister(&blkcg_policy_bfq);
++#endif
++	return ret;
++}
++
++static void __exit bfq_exit(void)
++{
++	elv_unregister(&iosched_bfq_mq);
++#ifdef BFQ_GROUP_IOSCHED_ENABLED
++	blkcg_policy_unregister(&blkcg_policy_bfq);
++#endif
++	bfq_slab_kill();
++}
++
++module_init(bfq_init);
++module_exit(bfq_exit);
++
++MODULE_AUTHOR("Paolo Valente");
++MODULE_LICENSE("GPL");
++MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler");
+diff --git a/block/bfq-mq.h b/block/bfq-mq.h
+new file mode 100644
+index 000000000000..ceb291132a1a
+--- /dev/null
++++ b/block/bfq-mq.h
+@@ -0,0 +1,1077 @@
++/*
++ * BFQ v9: data structures and common functions prototypes.
++ *
++ * Based on ideas and code from CFQ:
++ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
++ *
++ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
++ *		      Paolo Valente <paolo.valente@unimore.it>
++ *
++ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
++ *
++ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
++ */
++
++#ifndef _BFQ_H
++#define _BFQ_H
++
++#include
++#include
++
++/* see comments on CONFIG_BFQ_GROUP_IOSCHED in bfq.h */
++#ifdef CONFIG_MQ_BFQ_GROUP_IOSCHED
++#define BFQ_GROUP_IOSCHED_ENABLED
++#endif
++
++#define BFQ_IOPRIO_CLASSES	3
++#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)
++
++#define BFQ_MIN_WEIGHT			1
++#define BFQ_MAX_WEIGHT			1000
++#define BFQ_WEIGHT_CONVERSION_COEFF	10
++
++#define BFQ_DEFAULT_QUEUE_IOPRIO	4
++
++#define BFQ_WEIGHT_LEGACY_DFL	100
++#define BFQ_DEFAULT_GRP_IOPRIO	0
++#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE
++
++/*
++ * Soft real-time applications are far more latency-sensitive than
++ * interactive ones. Over-raise the weight of the former to
++ * privilege them against the latter.
++ */
++#define BFQ_SOFTRT_WEIGHT_FACTOR	100
++
++struct bfq_entity;
++
++/**
++ * struct bfq_service_tree - per ioprio_class service tree.
++ *
++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
++ * ioprio_class has its own independent scheduler, and so its own
++ * bfq_service_tree. All the fields are protected by the queue lock
++ * of the containing bfqd.
++ */
++struct bfq_service_tree {
++	/* tree for active entities (i.e., those backlogged) */
++	struct rb_root active;
++	/* tree for idle entities (i.e., not backlogged, with V <= F_i)*/
++	struct rb_root idle;
++
++	struct bfq_entity *first_idle;	/* idle entity with minimum F_i */
++	struct bfq_entity *last_idle;	/* idle entity with maximum F_i */
++
++	u64 vtime;	/* scheduler virtual time */
++	/* scheduler weight sum; active and idle entities contribute to it */
++	unsigned long wsum;
++};
++
++/**
++ * struct bfq_sched_data - multi-class scheduler.
++ *
++ * bfq_sched_data is the basic scheduler queue. It supports three
++ * ioprio_classes, and can be used either as a toplevel queue or as an
++ * intermediate queue in a hierarchical setup.
++ *
++ * The supported ioprio_classes are the same as in CFQ, in descending
++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
++ * Requests from higher priority queues are served before all the
++ * requests from lower priority queues; among requests of the same
++ * queue requests are served according to B-WF2Q+.
++ *
++ * The schedule is implemented by the service trees, plus the field
++ * @next_in_service, which points to the entity on the active trees
++ * that will be served next, if 1) no changes in the schedule occur
++ * before the current in-service entity is expired, 2) the in-service
++ * queue becomes idle when it expires, and 3) if the entity pointed by
++ * in_service_entity is not a queue, then the in-service child entity
++ * of the entity pointed by in_service_entity becomes idle on
++ * expiration. This peculiar definition allows for the following
++ * optimization, not yet exploited: while a given entity is still in
++ * service, we already know which is the best candidate for next
++ * service among the other active entities in the same parent
++ * entity. We can then quickly compare the timestamps of the
++ * in-service entity with those of such best candidate.
++ *
++ * All the fields are protected by the queue lock of the containing
++ * bfqd.
++ */
++struct bfq_sched_data {
++	struct bfq_entity *in_service_entity;  /* entity in service */
++	/* head-of-the-line entity in the scheduler (see comments above) */
++	struct bfq_entity *next_in_service;
++	/* array of service trees, one per ioprio_class */
++	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
++	/* last time CLASS_IDLE was served */
++	unsigned long bfq_class_idle_last_service;
++
++};
++
++/**
++ * struct bfq_weight_counter - counter of the number of all active queues
++ *                             with a given weight.
++ */
++struct bfq_weight_counter {
++	unsigned int weight; /* weight of the queues this counter refers to */
++	unsigned int num_active; /* nr of active queues with this weight */
++	/*
++	 * Weights tree member (see bfq_data's @queue_weights_tree)
++	 */
++	struct rb_node weights_node;
++};
++
++/**
++ * struct bfq_entity - schedulable entity.
++ *
++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
++ * entity belongs to the sched_data of the parent group in the cgroup
++ * hierarchy. Non-leaf entities have also their own sched_data, stored
++ * in @my_sched_data.
++ *
++ * Each entity stores independently its priority values; this would
++ * allow different weights on different devices, but this
++ * functionality is not exported to userspace as of now.
Priorities and ++ * weights are updated lazily, first storing the new values into the ++ * new_* fields, then setting the @prio_changed flag. As soon as ++ * there is a transition in the entity state that allows the priority ++ * update to take place the effective and the requested priority ++ * values are synchronized. ++ * ++ * Unless cgroups are used, the weight value is calculated from the ++ * ioprio to export the same interface as CFQ. When dealing with ++ * ``well-behaved'' queues (i.e., queues that do not spend too much ++ * time to consume their budget and have true sequential behavior, and ++ * when there are no external factors breaking anticipation) the ++ * relative weights at each level of the cgroups hierarchy should be ++ * guaranteed. All the fields are protected by the queue lock of the ++ * containing bfqd. ++ */ ++struct bfq_entity { ++ struct rb_node rb_node; /* service_tree member */ ++ ++ /* ++ * Flag, true if the entity is on a tree (either the active or ++ * the idle one of its service_tree) or is in service. ++ */ ++ bool on_st; ++ ++ u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ ++ u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ ++ ++ /* tree the entity is enqueued into; %NULL if not on a tree */ ++ struct rb_root *tree; ++ ++ /* ++ * minimum start time of the (active) subtree rooted at this ++ * entity; used for O(log N) lookups into active trees ++ */ ++ u64 min_start; ++ ++ /* amount of service received during the last service slot */ ++ int service; ++ ++ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ ++ int budget; ++ ++ unsigned int weight; /* weight of the queue */ ++ unsigned int new_weight; /* next weight if a change is in progress */ ++ ++ /* original weight, used to implement weight boosting */ ++ unsigned int orig_weight; ++ ++ /* parent entity, for hierarchical scheduling */ ++ struct bfq_entity *parent; ++ ++ /* ++ * For non-leaf nodes in the hierarchy, the associated ++ * scheduler queue, %NULL on leaf nodes. ++ */ ++ struct bfq_sched_data *my_sched_data; ++ /* the scheduler queue this entity belongs to */ ++ struct bfq_sched_data *sched_data; ++ ++ /* flag, set to request a weight, ioprio or ioprio_class change */ ++ int prio_changed; ++ ++ /* flag, set if the entity is counted in groups_with_pending_reqs */ ++ bool in_groups_with_pending_reqs; ++}; ++ ++struct bfq_group; ++ ++/** ++ * struct bfq_ttime - per process thinktime stats. ++ */ ++struct bfq_ttime { ++ u64 last_end_request; /* completion time of last request */ ++ ++ u64 ttime_total; /* total process thinktime */ ++ unsigned long ttime_samples; /* number of thinktime samples */ ++ u64 ttime_mean; /* average process thinktime */ ++ ++}; ++ ++/** ++ * struct bfq_queue - leaf schedulable entity. ++ * ++ * A bfq_queue is a leaf request queue; it can be associated with an ++ * io_context or more, if it is async or shared between cooperating ++ * processes. @cgroup holds a reference to the cgroup, to be sure that it ++ * does not disappear while a bfqq still references it (mostly to avoid ++ * races between request issuing and task migration followed by cgroup ++ * destruction). ++ * All the fields are protected by the queue lock of the containing bfqd. 
++ */ ++struct bfq_queue { ++ /* reference counter */ ++ int ref; ++ /* parent bfq_data */ ++ struct bfq_data *bfqd; ++ ++ /* current ioprio and ioprio class */ ++ unsigned short ioprio, ioprio_class; ++ /* next ioprio and ioprio class if a change is in progress */ ++ unsigned short new_ioprio, new_ioprio_class; ++ ++ /* ++ * Shared bfq_queue if queue is cooperating with one or more ++ * other queues. ++ */ ++ struct bfq_queue *new_bfqq; ++ /* request-position tree member (see bfq_group's @rq_pos_tree) */ ++ struct rb_node pos_node; ++ /* request-position tree root (see bfq_group's @rq_pos_tree) */ ++ struct rb_root *pos_root; ++ ++ /* sorted list of pending requests */ ++ struct rb_root sort_list; ++ /* if fifo isn't expired, next request to serve */ ++ struct request *next_rq; ++ /* number of sync and async requests queued */ ++ int queued[2]; ++ /* number of requests currently allocated */ ++ int allocated; ++ /* number of pending metadata requests */ ++ int meta_pending; ++ /* fifo list of requests in sort_list */ ++ struct list_head fifo; ++ ++ /* entity representing this queue in the scheduler */ ++ struct bfq_entity entity; ++ ++ /* pointer to the weight counter associated with this queue */ ++ struct bfq_weight_counter *weight_counter; ++ ++ /* maximum budget allowed from the feedback mechanism */ ++ int max_budget; ++ /* budget expiration (in jiffies) */ ++ unsigned long budget_timeout; ++ ++ /* number of requests on the dispatch list or inside driver */ ++ int dispatched; ++ ++ unsigned int flags; /* status flags.*/ ++ ++ /* node for active/idle bfqq list inside parent bfqd */ ++ struct list_head bfqq_list; ++ ++ /* associated @bfq_ttime struct */ ++ struct bfq_ttime ttime; ++ ++ /* bit vector: a 1 for each seeky requests in history */ ++ u32 seek_history; ++ ++ /* node for the device's burst list */ ++ struct hlist_node burst_list_node; ++ ++ /* position of the last request enqueued */ ++ sector_t last_request_pos; ++ ++ /* Number of consecutive pairs of request completion and ++ * arrival, such that the queue becomes idle after the ++ * completion, but the next request arrives within an idle ++ * time slice; used only if the queue's IO_bound flag has been ++ * cleared. ++ */ ++ unsigned int requests_within_timer; ++ ++ /* pid of the process owning the queue, used for logging purposes */ ++ pid_t pid; ++ ++ /* ++ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL ++ * if the queue is shared. ++ */ ++ struct bfq_io_cq *bic; ++ ++ /* current maximum weight-raising time for this queue */ ++ unsigned long wr_cur_max_time; ++ /* ++ * Minimum time instant such that, only if a new request is ++ * enqueued after this time instant in an idle @bfq_queue with ++ * no outstanding requests, then the task associated with the ++ * queue it is deemed as soft real-time (see the comments on ++ * the function bfq_bfqq_softrt_next_start()) ++ */ ++ unsigned long soft_rt_next_start; ++ /* ++ * Start time of the current weight-raising period if ++ * the @bfq-queue is being weight-raised, otherwise ++ * finish time of the last weight-raising period. ++ */ ++ unsigned long last_wr_start_finish; ++ /* factor by which the weight of this queue is multiplied */ ++ unsigned int wr_coeff; ++ /* ++ * Time of the last transition of the @bfq_queue from idle to ++ * backlogged. ++ */ ++ unsigned long last_idle_bklogged; ++ /* ++ * Cumulative service received from the @bfq_queue since the ++ * last transition from idle to backlogged. 
++ */
++	unsigned long service_from_backlogged;
++	/*
++	 * Cumulative service received from the @bfq_queue since its
++	 * last transition to weight-raised state.
++	 */
++	unsigned long service_from_wr;
++	/*
++	 * Value of wr start time when switching to soft rt
++	 */
++	unsigned long wr_start_at_switch_to_srt;
++
++	unsigned long split_time; /* time of last split */
++	unsigned long first_IO_time; /* time of first I/O for this queue */
++
++	/* max service rate measured so far */
++	u32 max_service_rate;
++	/*
++	 * Ratio between the service received by bfqq while it is in
++	 * service, and the cumulative service (of requests of other
++	 * queues) that may be injected while bfqq is empty but still
++	 * in service. To increase precision, the coefficient is
++	 * measured in tenths of unit. Here are some examples of (1)
++	 * ratios, (2) resulting percentages of service injected
++	 * w.r.t. the total service dispatched while bfqq is in
++	 * service, and (3) corresponding values of the coefficient:
++	 * 1 (50%) -> 10
++	 * 2 (33%) -> 20
++	 * 10 (9%) -> 100
++	 * 9.9 (9%) -> 99
++	 * 1.5 (40%) -> 15
++	 * 0.5 (66%) -> 5
++	 * 0.1 (90%) -> 1
++	 *
++	 * So, if the coefficient is lower than 10, then
++	 * injected service is more than bfqq service.
++	 */
++	unsigned int inject_coeff;
++	/* amount of service injected in current service slot */
++	unsigned int injected_service;
++};
++
++/**
++ * struct bfq_io_cq - per (request_queue, io_context) structure.
++ */
++struct bfq_io_cq {
++	/* associated io_cq structure */
++	struct io_cq icq; /* must be the first member */
++	/* array of two process queues, the sync and the async */
++	struct bfq_queue *bfqq[2];
++	/* per (request_queue, blkcg) ioprio */
++	int ioprio;
++#ifdef BFQ_GROUP_IOSCHED_ENABLED
++	uint64_t blkcg_serial_nr; /* the current blkcg serial */
++#endif
++
++	/*
++	 * Snapshot of the has_short_ttime flag before merging; taken
++	 * to remember its value while the queue is merged, so as to
++	 * be able to restore it in case of split.
++	 */
++	bool saved_has_short_ttime;
++	/*
++	 * Same purpose as the previous two fields for the I/O bound
++	 * classification of a queue.
++	 */
++	bool saved_IO_bound;
++
++	/*
++	 * Same purpose as the previous fields for the value of the
++	 * field keeping the queue's belonging to a large burst
++	 */
++	bool saved_in_large_burst;
++	/*
++	 * True if the queue belonged to a burst list before its merge
++	 * with another cooperating queue.
++	 */
++	bool was_in_burst_list;
++
++	/*
++	 * Similar to previous fields: save wr information.
++	 */
++	unsigned long saved_wr_coeff;
++	unsigned long saved_last_wr_start_finish;
++	unsigned long saved_wr_start_at_switch_to_srt;
++	unsigned int saved_wr_cur_max_time;
++	struct bfq_ttime saved_ttime;
++};
++
++/**
++ * struct bfq_data - per-device data structure.
++ *
++ * All the fields are protected by @lock.
++ */
++struct bfq_data {
++	/* device request queue */
++	struct request_queue *queue;
++	/* dispatch queue */
++	struct list_head dispatch;
++
++	/* root bfq_group for the device */
++	struct bfq_group *root_group;
++
++	/*
++	 * rbtree of weight counters of @bfq_queues, sorted by
++	 * weight. Used to keep track of whether all @bfq_queues have
++	 * the same weight. The tree contains one counter for each
++	 * distinct weight associated to some active and not
++	 * weight-raised @bfq_queue (see the comments to the functions
++	 * bfq_weights_tree_[add|remove] for further details).
++ */ ++ struct rb_root queue_weights_tree; ++ ++ /* ++ * Number of groups with at least one descendant process that ++ * has at least one request waiting for completion. Note that ++ * this accounts for also requests already dispatched, but not ++ * yet completed. Therefore this number of groups may differ ++ * (be larger) than the number of active groups, as a group is ++ * considered active only if its corresponding entity has ++ * descendant queues with at least one request queued. This ++ * number is used to decide whether a scenario is symmetric. ++ * For a detailed explanation see comments on the computation ++ * of the variable asymmetric_scenario in the function ++ * bfq_better_to_idle(). ++ * ++ * However, it is hard to compute this number exactly, for ++ * groups with multiple descendant processes. Consider a group ++ * that is inactive, i.e., that has no descendant process with ++ * pending I/O inside BFQ queues. Then suppose that ++ * num_groups_with_pending_reqs is still accounting for this ++ * group, because the group has descendant processes with some ++ * I/O request still in flight. num_groups_with_pending_reqs ++ * should be decremented when the in-flight request of the ++ * last descendant process is finally completed (assuming that ++ * nothing else has changed for the group in the meantime, in ++ * terms of composition of the group and active/inactive state of child ++ * groups and processes). To accomplish this, an additional ++ * pending-request counter must be added to entities, and must ++ * be updated correctly. To avoid this additional field and operations, ++ * we resort to the following tradeoff between simplicity and ++ * accuracy: for an inactive group that is still counted in ++ * num_groups_with_pending_reqs, we decrement ++ * num_groups_with_pending_reqs when the first descendant ++ * process of the group remains with no request waiting for ++ * completion. ++ * ++ * Even this simpler decrement strategy requires a little ++ * carefulness: to avoid multiple decrements, we flag a group, ++ * more precisely an entity representing a group, as still ++ * counted in num_groups_with_pending_reqs when it becomes ++ * inactive. Then, when the first descendant queue of the ++ * entity remains with no request waiting for completion, ++ * num_groups_with_pending_reqs is decremented, and this flag ++ * is reset. After this flag is reset for the entity, ++ * num_groups_with_pending_reqs won't be decremented any ++ * longer in case a new descendant queue of the entity remains ++ * with no request waiting for completion. ++ */ ++ unsigned int num_groups_with_pending_reqs; ++ ++ /* ++ * Per-class (RT, BE, IDLE) number of bfq_queues containing ++ * requests (including the queue in service, even if it is ++ * idling). ++ */ ++ unsigned int busy_queues[3]; ++ /* number of weight-raised busy @bfq_queues */ ++ int wr_busy_queues; ++ /* number of queued requests */ ++ int queued; ++ /* number of requests dispatched and waiting for completion */ ++ int rq_in_driver; ++ ++ /* ++ * Maximum number of requests in driver in the last ++ * @hw_tag_samples completed requests. ++ */ ++ int max_rq_in_driver; ++ /* number of samples used to calculate hw_tag */ ++ int hw_tag_samples; ++ /* flag set to one if the driver is showing a queueing behavior */ ++ int hw_tag; ++ ++ /* number of budgets assigned */ ++ int budgets_assigned; ++ ++ /* ++ * Timer set when idling (waiting) for the next request from ++ * the queue in service. 
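++	 *
++	 * [Editor's note; not part of the original patch. This timer
++	 * is normally armed with bfqd->bfq_slice_idle (8 ms by
++	 * upstream default) by bfq_arm_slice_timer(), and its handler
++	 * is bfq_idle_slice_timer() in bfq-mq.c, which expires the
++	 * in-service queue if no new request arrives in time.]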
++ */ ++ struct hrtimer idle_slice_timer; ++ ++ /* bfq_queue in service */ ++ struct bfq_queue *in_service_queue; ++ ++ /* on-disk position of the last served request */ ++ sector_t last_position; ++ ++ /* position of the last served request for the in-service queue */ ++ sector_t in_serv_last_pos; ++ ++ /* time of last request completion (ns) */ ++ u64 last_completion; ++ ++ /* time of first rq dispatch in current observation interval (ns) */ ++ u64 first_dispatch; ++ /* time of last rq dispatch in current observation interval (ns) */ ++ u64 last_dispatch; ++ ++ /* beginning of the last budget */ ++ ktime_t last_budget_start; ++ /* beginning of the last idle slice */ ++ ktime_t last_idling_start; ++ ++ /* number of samples in current observation interval */ ++ int peak_rate_samples; ++ /* num of samples of seq dispatches in current observation interval */ ++ u32 sequential_samples; ++ /* total num of sectors transferred in current observation interval */ ++ u64 tot_sectors_dispatched; ++ /* max rq size seen during current observation interval (sectors) */ ++ u32 last_rq_max_size; ++ /* time elapsed from first dispatch in current observ. interval (us) */ ++ u64 delta_from_first; ++ /* ++ * Current estimate of the device peak rate, measured in ++ * [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by ++ * BFQ_RATE_SHIFT is performed to increase precision in ++ * fixed-point calculations. ++ */ ++ u32 peak_rate; ++ ++ /* maximum budget allotted to a bfq_queue before rescheduling */ ++ int bfq_max_budget; ++ ++ /* list of all the bfq_queues active on the device */ ++ struct list_head active_list; ++ /* list of all the bfq_queues idle on the device */ ++ struct list_head idle_list; ++ ++ /* ++ * Timeout for async/sync requests; when it fires, requests ++ * are served in fifo order. ++ */ ++ u64 bfq_fifo_expire[2]; ++ /* weight of backward seeks wrt forward ones */ ++ unsigned int bfq_back_penalty; ++ /* maximum allowed backward seek */ ++ unsigned int bfq_back_max; ++ /* maximum idling time */ ++ u32 bfq_slice_idle; ++ ++ /* user-configured max budget value (0 for auto-tuning) */ ++ int bfq_user_max_budget; ++ /* ++ * Timeout for bfq_queues to consume their budget; used to ++ * prevent seeky queues from imposing long latencies to ++ * sequential or quasi-sequential ones (this also implies that ++ * seeky queues cannot receive guarantees in the service ++ * domain; after a timeout they are charged for the time they ++ * have been in service, to preserve fairness among them, but ++ * without service-domain guarantees). ++ */ ++ unsigned int bfq_timeout; ++ ++ /* ++ * Number of consecutive requests that must be issued within ++ * the idle time slice to set again idling to a queue which ++ * was marked as non-I/O-bound (see the definition of the ++ * IO_bound flag for further details). ++ */ ++ unsigned int bfq_requests_within_timer; ++ ++ /* ++ * Force device idling whenever needed to provide accurate ++ * service guarantees, without caring about throughput ++ * issues. CAVEAT: this may even increase latencies, in case ++ * of useless idling for processes that did stop doing I/O. ++ */ ++ bool strict_guarantees; ++ ++ /* ++ * Last time at which a queue entered the current burst of ++ * queues being activated shortly after each other; for more ++ * details about this and the following parameters related to ++ * a burst of activations, see the comments on the function ++ * bfq_handle_burst. 
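++	 *
++	 * [Editor's note; not part of the original patch. With the
++	 * defaults set in bfq_init_queue() (bfq_burst_interval =
++	 * 180 ms, bfq_large_burst_thresh = 8), a burst is deemed
++	 * large roughly when more than eight queues are activated,
++	 * each within 180 ms of the previous one, as happens e.g. at
++	 * boot or when a command spawns many short-lived readers;
++	 * queues belonging to a large burst are then denied the
++	 * weight-raising reserved for interactive queues.]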
++ */ ++ unsigned long last_ins_in_burst; ++ /* ++ * Reference time interval used to decide whether a queue has ++ * been activated shortly after @last_ins_in_burst. ++ */ ++ unsigned long bfq_burst_interval; ++ /* number of queues in the current burst of queue activations */ ++ int burst_size; ++ ++ /* common parent entity for the queues in the burst */ ++ struct bfq_entity *burst_parent_entity; ++ /* Maximum burst size above which the current queue-activation ++ * burst is deemed as 'large'. ++ */ ++ unsigned long bfq_large_burst_thresh; ++ /* true if a large queue-activation burst is in progress */ ++ bool large_burst; ++ /* ++ * Head of the burst list (as for the above fields, more ++ * details in the comments on the function bfq_handle_burst). ++ */ ++ struct hlist_head burst_list; ++ ++ /* if set to true, low-latency heuristics are enabled */ ++ bool low_latency; ++ /* ++ * Maximum factor by which the weight of a weight-raised queue ++ * is multiplied. ++ */ ++ unsigned int bfq_wr_coeff; ++ /* maximum duration of a weight-raising period (jiffies) */ ++ unsigned int bfq_wr_max_time; ++ ++ /* Maximum weight-raising duration for soft real-time processes */ ++ unsigned int bfq_wr_rt_max_time; ++ /* ++ * Minimum idle period after which weight-raising may be ++ * reactivated for a queue (in jiffies). ++ */ ++ unsigned int bfq_wr_min_idle_time; ++ /* ++ * Minimum period between request arrivals after which ++ * weight-raising may be reactivated for an already busy async ++ * queue (in jiffies). ++ */ ++ unsigned long bfq_wr_min_inter_arr_async; ++ ++ /* Max service-rate for a soft real-time queue, in sectors/sec */ ++ unsigned int bfq_wr_max_softrt_rate; ++ /* ++ * Cached value of the product ref_rate*ref_wr_duration, used ++ * for computing the maximum duration of weight raising ++ * automatically. ++ */ ++ u64 rate_dur_prod; ++ ++ /* fallback dummy bfqq for extreme OOM conditions */ ++ struct bfq_queue oom_bfqq; ++ ++ spinlock_t lock; ++ ++ /* ++ * bic associated with the task issuing current bio for ++ * merging. This and the next field are used as a support to ++ * be able to perform the bic lookup, needed by bio-merge ++ * functions, before the scheduler lock is taken, and thus ++ * avoid taking the request-queue lock while the scheduler ++ * lock is being held. ++ */ ++ struct bfq_io_cq *bio_bic; ++ /* bfqq associated with the task issuing current bio for merging */ ++ struct bfq_queue *bio_bfqq; ++ /* Extra flag used only for TESTING */ ++ bool bio_bfqq_set; ++ ++ /* ++ * Depth limits used in bfq_limit_depth (see comments on the ++ * function) ++ */ ++ unsigned int word_depths[2][2]; ++}; ++ ++enum bfqq_state_flags { ++ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ ++ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ ++ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* ++ * waiting for a request ++ * without idling the device ++ */ ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ ++ BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ ++ BFQ_BFQQ_FLAG_IO_bound, /* ++ * bfqq has timed-out at least once ++ * having consumed at most 2/10 of ++ * its budget ++ */ ++ BFQ_BFQQ_FLAG_in_large_burst, /* ++ * bfqq activated in a large burst, ++ * see comments to bfq_handle_burst. 
++ */ ++ BFQ_BFQQ_FLAG_softrt_update, /* ++ * may need softrt-next-start ++ * update ++ */ ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ ++ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ ++}; ++ ++#define BFQ_BFQQ_FNS(name) \ ++static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ ++{ \ ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ ++} \ ++static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ ++{ \ ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ ++} \ ++static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ ++{ \ ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ ++} ++ ++BFQ_BFQQ_FNS(just_created); ++BFQ_BFQQ_FNS(busy); ++BFQ_BFQQ_FNS(wait_request); ++BFQ_BFQQ_FNS(non_blocking_wait_rq); ++BFQ_BFQQ_FNS(fifo_expire); ++BFQ_BFQQ_FNS(has_short_ttime); ++BFQ_BFQQ_FNS(sync); ++BFQ_BFQQ_FNS(IO_bound); ++BFQ_BFQQ_FNS(in_large_burst); ++BFQ_BFQQ_FNS(coop); ++BFQ_BFQQ_FNS(split_coop); ++BFQ_BFQQ_FNS(softrt_update); ++#undef BFQ_BFQQ_FNS ++ ++/* Logging facilities. */ ++#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE ++ ++static const char *checked_dev_name(const struct device *dev) ++{ ++ static const char nodev[] = "nodev"; ++ ++ if (dev) ++ return dev_name(dev); ++ ++ return nodev; ++} ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); ++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ ++ pr_crit("%s bfq%d%c %s [%s] " fmt "\n", \ ++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ ++ (bfqq)->pid, \ ++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ bfqq_group(bfqq)->blkg_path, __func__, ##args); \ ++} while (0) ++ ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ ++ pr_crit("%s %s [%s] " fmt "\n", \ ++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ ++ bfqg->blkg_path, __func__, ##args); \ ++} while (0) ++ ++#else /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ ++ pr_crit("%s bfq%d%c [%s] " fmt "\n", \ ++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ ++ (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ __func__, ##args) ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) ++ ++#endif /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++#define bfq_log(bfqd, fmt, args...) \ ++ pr_crit("%s bfq [%s] " fmt "\n", \ ++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ ++ __func__, ##args) ++ ++#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ ++ ++#if !defined(CONFIG_BLK_DEV_IO_TRACE) ++ ++/* Avoid possible "unused-variable" warning. See commit message. */ ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) ((void) (bfqq)) ++ ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) ((void) (bfqg)) ++ ++#define bfq_log(bfqd, fmt, args...) do {} while (0) ++ ++#else /* CONFIG_BLK_DEV_IO_TRACE */ ++ ++#include <linux/blktrace_api.h> ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); ++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s [%s] " fmt, \ ++ (bfqq)->pid, \ ++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ bfqq_group(bfqq)->blkg_path, __func__, ##args); \ ++} while (0) ++ ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ ++ blk_add_trace_msg((bfqd)->queue, "%s [%s] " fmt, bfqg->blkg_path, \ ++ __func__, ##args);\ ++} while (0) ++ ++#else /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)
\ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d%c [%s] " fmt, (bfqq)->pid, \ ++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ __func__, ##args) ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) ++ ++#endif /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++#define bfq_log(bfqd, fmt, args...) \ ++ blk_add_trace_msg((bfqd)->queue, "bfq [%s] " fmt, __func__, ##args) ++ ++#endif /* CONFIG_BLK_DEV_IO_TRACE */ ++#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ ++ ++/* Expiration reasons. */ ++enum bfqq_expiration { ++ BFQ_BFQQ_TOO_IDLE = 0, /* ++ * queue has been idling for ++ * too long ++ */ ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ ++ BFQ_BFQQ_PREEMPTED /* preemption in progress */ ++}; ++ ++ ++struct bfqg_stats { ++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) ++ /* number of ios merged */ ++ struct blkg_rwstat merged; ++ /* total time spent on device in ns, may not be accurate w/ queueing */ ++ struct blkg_rwstat service_time; ++ /* total time spent waiting in scheduler queue in ns */ ++ struct blkg_rwstat wait_time; ++ /* number of IOs queued up */ ++ struct blkg_rwstat queued; ++ /* total disk time and nr sectors dispatched by this group */ ++ struct blkg_stat time; ++ /* sum of number of ios queued across all samples */ ++ struct blkg_stat avg_queue_size_sum; ++ /* count of samples taken for average */ ++ struct blkg_stat avg_queue_size_samples; ++ /* how many times this group has been removed from service tree */ ++ struct blkg_stat dequeue; ++ /* total time spent waiting for it to be assigned a timeslice. */ ++ struct blkg_stat group_wait_time; ++ /* time spent idling for this blkcg_gq */ ++ struct blkg_stat idle_time; ++ /* total time with empty current active q with other requests queued */ ++ struct blkg_stat empty_time; ++ /* fields after this shouldn't be cleared on stat reset */ ++ u64 start_group_wait_time; ++ u64 start_idle_time; ++ u64 start_empty_time; ++ uint16_t flags; ++#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ ++}; ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++/* ++ * struct bfq_group_data - per-blkcg storage for the blkio subsystem. ++ * ++ * @ps: @blkcg_policy_storage that this structure inherits ++ * @weight: weight of the bfq_group ++ */ ++struct bfq_group_data { ++ /* must be the first member */ ++ struct blkcg_policy_data pd; ++ ++ unsigned int weight; ++}; ++ ++/** ++ * struct bfq_group - per (device, cgroup) data structure. ++ * @entity: schedulable entity to insert into the parent group sched_data. ++ * @sched_data: own sched_data, to contain child entities (they may be ++ * both bfq_queues and bfq_groups). ++ * @bfqd: the bfq_data for the device this group acts upon. ++ * @async_bfqq: array of async queues for all the tasks belonging to ++ * the group, one queue per ioprio value per ioprio_class, ++ * except for the idle class that has only one queue. ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used ++ * to avoid too many special cases during group creation/ ++ * migration. ++ * @active_entities: number of active entities belonging to the group; ++ * unused for the root group. Used to know whether there ++ * are groups with more than one active @bfq_entity ++ * (see the comments to the function ++ * bfq_bfqq_may_idle()). 
++ * @rq_pos_tree: rbtree sorted by next_request position, used when ++ * determining if two or more queues have interleaving ++ * requests (see bfq_find_close_cooperator()). ++ * ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup ++ * there is a set of bfq_groups, each one collecting the lower-level ++ * entities belonging to the group that are acting on the same device. ++ * ++ * Locking works as follows: ++ * o @bfqd is protected by the queue lock, RCU is used to access it ++ * from the readers. ++ * o All the other fields are protected by the @bfqd queue lock. ++ */ ++struct bfq_group { ++ /* must be the first member */ ++ struct blkg_policy_data pd; ++ ++ /* cached path for this blkg (see comments in bfq_bic_update_cgroup) */ ++ char blkg_path[128]; ++ ++ /* reference counter (see comments in bfq_bic_update_cgroup) */ ++ int ref; ++ ++ struct bfq_entity entity; ++ struct bfq_sched_data sched_data; ++ ++ void *bfqd; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++ ++ struct bfq_entity *my_entity; ++ ++ int active_entities; ++ ++ struct rb_root rq_pos_tree; ++ ++ struct bfqg_stats stats; ++}; ++ ++#else ++struct bfq_group { ++ struct bfq_sched_data sched_data; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++ ++ struct rb_root rq_pos_tree; ++}; ++#endif ++ ++static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); ++ ++static unsigned int bfq_class_idx(struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ return bfqq ? bfqq->ioprio_class - 1 : ++ BFQ_DEFAULT_GRP_CLASS - 1; ++} ++ ++static unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd) ++{ ++ return bfqd->busy_queues[0] + bfqd->busy_queues[1] + ++ bfqd->busy_queues[2]; ++} ++ ++static struct bfq_service_tree * ++bfq_entity_service_tree(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sched_data = entity->sched_data; ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ unsigned int idx = bfq_class_idx(entity); ++ ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); ++ BUG_ON(sched_data == NULL); ++ ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "%p %d", ++ sched_data->service_tree + idx, idx); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "%p %d", ++ sched_data->service_tree + idx, idx); ++ } ++#endif ++ return sched_data->service_tree + idx; ++} ++ ++static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) ++{ ++ return bic->bfqq[is_sync]; ++} ++ ++static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, ++ bool is_sync) ++{ ++ bic->bfqq[is_sync] = bfqq; ++} ++ ++static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) ++{ ++ return bic->icq.q->elevator->elevator_data; ++} ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ ++static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *group_entity = bfqq->entity.parent; ++ ++ if (!group_entity) ++ group_entity = &bfqq->bfqd->root_group->entity; ++ ++ return container_of(group_entity, struct bfq_group, entity); ++} ++ ++#else ++ ++static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) ++{ ++ return bfqq->bfqd->root_group; ++} ++ ++#endif ++ ++static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); ++static void bfq_put_queue(struct bfq_queue *bfqq); ++static struct bfq_queue 
*bfq_get_queue(struct bfq_data *bfqd, ++ struct bio *bio, bool is_sync, ++ struct bfq_io_cq *bic); ++static void bfq_end_wr_async_queues(struct bfq_data *bfqd, ++ struct bfq_group *bfqg); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); ++#endif ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); ++ ++#endif /* _BFQ_H */ +diff --git a/block/bfq-sched.c b/block/bfq-sched.c +new file mode 100644 +index 000000000000..7a4923231106 +--- /dev/null ++++ b/block/bfq-sched.c +@@ -0,0 +1,2077 @@ ++/* ++ * BFQ: Hierarchical B-WF2Q+ scheduler. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2015 Paolo Valente ++ * ++ * Copyright (C) 2016 Paolo Valente ++ */ ++ ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); ++ ++/** ++ * bfq_gt - compare two timestamps. ++ * @a: first ts. ++ * @b: second ts. ++ * ++ * Return @a > @b, dealing with wrapping correctly. ++ */ ++static int bfq_gt(u64 a, u64 b) ++{ ++ return (s64)(a - b) > 0; ++} ++ ++static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree) ++{ ++ struct rb_node *node = tree->rb_node; ++ ++ return rb_entry(node, struct bfq_entity, rb_node); ++} ++ ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, ++ bool expiration); ++ ++static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); ++ ++/** ++ * bfq_update_next_in_service - update sd->next_in_service ++ * @sd: sched_data for which to perform the update. ++ * @new_entity: if not NULL, pointer to the entity whose activation, ++ * requeueing or repositioning triggered the invocation of ++ * this function. ++ * @expiration: if true, this function is being invoked after the ++ * expiration of the in-service entity ++ * ++ * This function is called to update sd->next_in_service, which, in ++ * its turn, may change as a consequence of the insertion or ++ * extraction of an entity into/from one of the active trees of ++ * sd. These insertions/extractions occur as a consequence of ++ * activations/deactivations of entities, with some activations being ++ * 'true' activations, and other activations being requeueings (i.e., ++ * implementing the second, requeueing phase of the mechanism used to ++ * reposition an entity in its active tree; see comments on ++ * __bfq_activate_entity and __bfq_requeue_entity for details). In ++ * both the last two activation sub-cases, new_entity points to the ++ * just activated or requeued entity. ++ * ++ * Returns true if sd->next_in_service changes in such a way that ++ * entity->parent may become the next_in_service for its parent ++ * entity. ++ */ ++static bool bfq_update_next_in_service(struct bfq_sched_data *sd, ++ struct bfq_entity *new_entity, ++ bool expiration) ++{ ++ struct bfq_entity *next_in_service = sd->next_in_service; ++ struct bfq_queue *bfqq; ++ bool parent_sched_may_change = false; ++ bool change_without_lookup = false; ++ ++ /* ++ * If this update is triggered by the activation, requeueing ++ * or repositioning of an entity that does not coincide with ++ * sd->next_in_service, then a full lookup in the active tree ++ * can be avoided. In fact, it is enough to check whether the ++ * just-modified entity has the same priority as ++ * sd->next_in_service, is eligible and has a lower virtual ++ * finish time than sd->next_in_service.
If this compound ++ * condition holds, then the new entity becomes the new ++ * next_in_service. Otherwise no change is needed. ++ */ ++ if (new_entity && new_entity != sd->next_in_service) { ++ /* ++ * Flag used to decide whether to replace ++ * sd->next_in_service with new_entity. Tentatively ++ * set to true, and left as true if ++ * sd->next_in_service is NULL. ++ */ ++ change_without_lookup = true; ++ ++ /* ++ * If there is already a next_in_service candidate ++ * entity, then compare timestamps to decide whether ++ * to replace sd->next_in_service with new_entity. ++ */ ++ if (next_in_service) { ++ unsigned int new_entity_class_idx = ++ bfq_class_idx(new_entity); ++ struct bfq_service_tree *st = ++ sd->service_tree + new_entity_class_idx; ++ ++ change_without_lookup = ++ (new_entity_class_idx == ++ bfq_class_idx(next_in_service) ++ && ++ !bfq_gt(new_entity->start, st->vtime) ++ && ++ bfq_gt(next_in_service->finish, ++ new_entity->finish)); ++ } ++ ++ if (change_without_lookup) { ++ next_in_service = new_entity; ++ bfqq = bfq_entity_to_bfqq(next_in_service); ++ ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "chose without lookup"); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ else { ++ struct bfq_group *bfqg = ++ container_of(next_in_service, ++ struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data*)bfqg->bfqd, bfqg, ++ "chose without lookup"); ++ } ++#endif ++ } ++ } ++ ++ if (!change_without_lookup) /* lookup needed */ ++ next_in_service = bfq_lookup_next_entity(sd, expiration); ++ ++ if (next_in_service) { ++ bool new_budget_triggers_change = ++ bfq_update_parent_budget(next_in_service); ++ ++ parent_sched_may_change = !sd->next_in_service || ++ new_budget_triggers_change; ++ } ++ ++ sd->next_in_service = next_in_service; ++ ++ if (!next_in_service) ++ return parent_sched_may_change; ++ ++ bfqq = bfq_entity_to_bfqq(next_in_service); ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "chosen this queue"); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ else { ++ struct bfq_group *bfqg = ++ container_of(next_in_service, ++ struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "chosen this entity"); ++ } ++#endif ++ return parent_sched_may_change; ++} ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++/* both next loops stop at one of the child entities of the root group */ ++#define for_each_entity(entity) \ ++ for (; entity ; entity = entity->parent) ++ ++/* ++ * For each iteration, compute parent in advance, so as to be safe if ++ * entity is deallocated during the iteration. Such a deallocation may ++ * happen as a consequence of a bfq_put_queue that frees the bfq_queue ++ * containing entity. ++ */ ++#define for_each_entity_safe(entity, parent) \ ++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) ++ ++/* ++ * Returns true if this budget change may let next_in_service->parent ++ * become the next_in_service entity for its parent entity. ++ */ ++static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) ++{ ++ struct bfq_entity *bfqg_entity; ++ struct bfq_group *bfqg; ++ struct bfq_sched_data *group_sd; ++ bool ret = false; ++ ++ BUG_ON(!next_in_service); ++ ++ group_sd = next_in_service->sched_data; ++ ++ bfqg = container_of(group_sd, struct bfq_group, sched_data); ++ /* ++ * bfq_group's my_entity field is not NULL only if the group ++ * is not the root group. We must not touch the root entity ++ * as it must never become an in-service entity.
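++ * ++ * Worked example for the assignment below (invented numbers): if ++ * the group entity's budget is 16384 while the chosen ++ * next_in_service child has budget 8192, the group budget is ++ * lowered to 8192; the group's virtual finish time shrinks ++ * accordingly, so the group may in turn become the next_in_service ++ * entity at the parent level, and true is returned.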
++ */ ++ bfqg_entity = bfqg->my_entity; ++ if (bfqg_entity) { ++ if (bfqg_entity->budget > next_in_service->budget) ++ ret = true; ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "old budg: %d, new budg: %d", ++ bfqg_entity->budget, next_in_service->budget); ++ bfqg_entity->budget = next_in_service->budget; ++ } ++ ++ return ret; ++} ++ ++/* ++ * This function tells whether entity stops being a candidate for next ++ * service, according to the restrictive definition of the field ++ * next_in_service. In particular, this function is invoked for an ++ * entity that is about to be set in service. ++ * ++ * If entity is a queue, then the entity is no longer a candidate for ++ * next service according to that definition, because entity is ++ * about to become the in-service queue. This function then returns ++ * true if entity is a queue. ++ * ++ * In contrast, entity could still be a candidate for next service if ++ * it is not a queue, and has more than one active child. In fact, ++ * even if one of its children is about to be set in service, other ++ * active children may still be the next to serve, for the parent ++ * entity, even according to the above definition. As a consequence, a ++ * non-queue entity is not a candidate for next-service only if it has ++ * only one active child. And only if this condition holds, then this ++ * function returns true for a non-queue entity. ++ */ ++static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) ++{ ++ struct bfq_group *bfqg; ++ ++ if (bfq_entity_to_bfqq(entity)) ++ return true; ++ ++ bfqg = container_of(entity, struct bfq_group, entity); ++ ++ BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group); ++ BUG_ON(bfqg->active_entities == 0); ++ /* ++ * The field active_entities does not always contain the ++ * actual number of active children entities: it happens to ++ * not account for the in-service entity in case the latter is ++ * removed from its active tree (which may get done after ++ * invoking the function bfq_no_longer_next_in_service in ++ * bfq_get_next_queue). Fortunately, here, i.e., while ++ * bfq_no_longer_next_in_service is not yet completed in ++ * bfq_get_next_queue, bfq_active_extract has not yet been ++ * invoked, and thus active_entities still coincides with the ++ * actual number of active entities. ++ */ ++ if (bfqg->active_entities == 1) ++ return true; ++ ++ return false; ++} ++ ++#else /* BFQ_GROUP_IOSCHED_ENABLED */ ++#define for_each_entity(entity) \ ++ for (; entity ; entity = NULL) ++ ++#define for_each_entity_safe(entity, parent) \ ++ for (parent = NULL; entity ; entity = parent) ++ ++static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) ++{ ++ return false; ++} ++ ++static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) ++{ ++ return true; ++} ++ ++#endif /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++/* ++ * Shift for timestamp calculations. This actually limits the maximum ++ * service allowed in one timestamp delta (small shift values increase it), ++ * the maximum total weight that can be used for the queues in the system ++ * (big shift values increase it), and the period of virtual time ++ * wraparounds. ++ */ ++#define WFQ_SERVICE_SHIFT 22 ++ ++static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = NULL; ++ ++ BUG_ON(!entity); ++ ++ if (!entity->my_sched_data) ++ bfqq = container_of(entity, struct bfq_queue, entity); ++ ++ return bfqq; ++} ++ ++ ++/** ++ * bfq_delta - map service into the virtual time domain.
++ * @service: amount of service. ++ * @weight: scale factor (weight of an entity or weight sum). ++ */ ++static u64 bfq_delta(unsigned long service, unsigned long weight) ++{ ++ u64 d = (u64)service << WFQ_SERVICE_SHIFT; ++ ++ do_div(d, weight); ++ return d; ++} ++ ++/** ++ * bfq_calc_finish - assign the finish time to an entity. ++ * @entity: the entity to act upon. ++ * @service: the service to be charged to the entity. ++ */ ++static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ unsigned long long start, finish, delta; ++ ++ BUG_ON(entity->weight == 0); ++ ++ entity->finish = entity->start + ++ bfq_delta(service, entity->weight); ++ ++ start = ((entity->start>>10)*1000)>>12; ++ finish = ((entity->finish>>10)*1000)>>12; ++ delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12; ++ ++ if (bfqq) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "serv %lu, w %d", ++ service, entity->weight); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "start %llu, finish %llu, delta %llu", ++ start, finish, delta); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ } else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "group: serv %lu, w %d", ++ service, entity->weight); ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "group: start %llu, finish %llu, delta %llu", ++ start, finish, delta); ++#endif ++ } ++} ++ ++/** ++ * bfq_entity_of - get an entity from a node. ++ * @node: the node field of the entity. ++ * ++ * Convert a node pointer to the relative entity. This is used only ++ * to simplify the logic of some functions and not as the generic ++ * conversion mechanism because, e.g., in the tree walking functions, ++ * the check for a %NULL value would be redundant. ++ */ ++static struct bfq_entity *bfq_entity_of(struct rb_node *node) ++{ ++ struct bfq_entity *entity = NULL; ++ ++ if (node) ++ entity = rb_entry(node, struct bfq_entity, rb_node); ++ ++ return entity; ++} ++ ++/** ++ * bfq_extract - remove an entity from a tree. ++ * @root: the tree root. ++ * @entity: the entity to remove. ++ */ ++static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) ++{ ++ BUG_ON(entity->tree != root); ++ ++ entity->tree = NULL; ++ rb_erase(&entity->rb_node, root); ++} ++ ++/** ++ * bfq_idle_extract - extract an entity from the idle tree. ++ * @st: the service tree of the owning @entity. ++ * @entity: the entity being removed. ++ */ ++static void bfq_idle_extract(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *next; ++ ++ BUG_ON(entity->tree != &st->idle); ++ ++ if (entity == st->first_idle) { ++ next = rb_next(&entity->rb_node); ++ st->first_idle = bfq_entity_of(next); ++ } ++ ++ if (entity == st->last_idle) { ++ next = rb_prev(&entity->rb_node); ++ st->last_idle = bfq_entity_of(next); ++ } ++ ++ bfq_extract(&st->idle, entity); ++ ++ if (bfqq) ++ list_del(&bfqq->bfqq_list); ++} ++ ++/** ++ * bfq_insert - generic tree insertion. ++ * @root: tree root. ++ * @entity: entity to insert. ++ * ++ * This is used for the idle and the active tree, since they are both ++ * ordered by finish time. 
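++ * ++ * Note that the ordering relies on the wrap-safe comparison ++ * bfq_gt(): e.g., bfq_gt(10, ULLONG_MAX - 5) is true, because ++ * (s64)(10 - (ULLONG_MAX - 5)) == 16 > 0, so finish times that ++ * have wrapped around 2^64 are still sorted correctly.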
++ */ ++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) ++{ ++ struct bfq_entity *entry; ++ struct rb_node **node = &root->rb_node; ++ struct rb_node *parent = NULL; ++ ++ BUG_ON(entity->tree); ++ ++ while (*node) { ++ parent = *node; ++ entry = rb_entry(parent, struct bfq_entity, rb_node); ++ ++ if (bfq_gt(entry->finish, entity->finish)) ++ node = &parent->rb_left; ++ else ++ node = &parent->rb_right; ++ } ++ ++ rb_link_node(&entity->rb_node, parent, node); ++ rb_insert_color(&entity->rb_node, root); ++ ++ entity->tree = root; ++} ++ ++/** ++ * bfq_update_min - update the min_start field of an entity. ++ * @entity: the entity to update. ++ * @node: one of its children. ++ * ++ * This function is called when @entity may store an invalid value for ++ * min_start due to updates to the active tree. The function assumes ++ * that the subtree rooted at @node (which may be its left or its right ++ * child) has a valid min_start value. ++ */ ++static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) ++{ ++ struct bfq_entity *child; ++ ++ if (node) { ++ child = rb_entry(node, struct bfq_entity, rb_node); ++ if (bfq_gt(entity->min_start, child->min_start)) ++ entity->min_start = child->min_start; ++ } ++} ++ ++/** ++ * bfq_update_active_node - recalculate min_start. ++ * @node: the node to update. ++ * ++ * @node may have changed position or one of its children may have moved; ++ * this function updates its min_start value. The left and right subtrees ++ * are assumed to hold a correct min_start value. ++ */ ++static void bfq_update_active_node(struct rb_node *node) ++{ ++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ entity->min_start = entity->start; ++ bfq_update_min(entity, node->rb_right); ++ bfq_update_min(entity, node->rb_left); ++ ++ if (bfqq) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "new min_start %llu", ++ ((entity->min_start>>10)*1000)>>12); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ } else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "new min_start %llu", ++ ((entity->min_start>>10)*1000)>>12); ++#endif ++ } ++} ++ ++/** ++ * bfq_update_active_tree - update min_start for the whole active tree. ++ * @node: the starting node. ++ * ++ * @node must be the deepest modified node after an update. This function ++ * updates its min_start using the values held by its children, assuming ++ * that they did not change, and then updates all the nodes that may have ++ * changed in the path to the root. The only nodes that may have changed ++ * are the ones in the path or their siblings.
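++ * ++ * For instance (invented values): a node with start 7 whose left ++ * subtree has min_start 3 and whose right subtree has min_start 9 ++ * ends up with min_start 3 after bfq_update_active_node().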
++ */ ++static void bfq_update_active_tree(struct rb_node *node) ++{ ++ struct rb_node *parent; ++ ++up: ++ bfq_update_active_node(node); ++ ++ parent = rb_parent(node); ++ if (!parent) ++ return; ++ ++ if (node == parent->rb_left && parent->rb_right) ++ bfq_update_active_node(parent->rb_right); ++ else if (parent->rb_left) ++ bfq_update_active_node(parent->rb_left); ++ ++ node = parent; ++ goto up; ++} ++ ++static void bfq_weights_tree_add(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct rb_root *root); ++ ++static void __bfq_weights_tree_remove(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct rb_root *root); ++ ++static void bfq_weights_tree_remove(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq); ++ ++ ++/** ++ * bfq_active_insert - insert an entity in the active tree of its ++ * group/device. ++ * @st: the service tree of the entity. ++ * @entity: the entity being inserted. ++ * ++ * The active tree is ordered by finish time, but an extra key is kept ++ * per each node, containing the minimum value for the start times of ++ * its children (and the node itself), so it's possible to search for ++ * the eligible node with the lowest finish time in logarithmic time. ++ */ ++static void bfq_active_insert(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *node = &entity->rb_node; ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ struct bfq_sched_data *sd = NULL; ++ struct bfq_group *bfqg = NULL; ++ struct bfq_data *bfqd = NULL; ++#endif ++ ++ bfq_insert(&st->active, entity); ++ ++ if (node->rb_left) ++ node = node->rb_left; ++ else if (node->rb_right) ++ node = node->rb_right; ++ ++ bfq_update_active_tree(node); ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ sd = entity->sched_data; ++ bfqg = container_of(sd, struct bfq_group, sched_data); ++ BUG_ON(!bfqg); ++ bfqd = (struct bfq_data *)bfqg->bfqd; ++#endif ++ if (bfqq) ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ if (bfqg != bfqd->root_group) { ++ BUG_ON(!bfqg); ++ BUG_ON(!bfqd); ++ bfqg->active_entities++; ++ } ++#endif ++} ++ ++/** ++ * bfq_ioprio_to_weight - calc a weight from an ioprio. ++ * @ioprio: the ioprio value to convert. ++ */ ++static unsigned short bfq_ioprio_to_weight(int ioprio) ++{ ++ BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); ++ return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; ++} ++ ++/** ++ * bfq_weight_to_ioprio - calc an ioprio from a weight. ++ * @weight: the weight value to convert. ++ * ++ * To preserve as much as possible the old only-ioprio user interface, ++ * 0 is used as an escape ioprio value for weights (numerically) equal or ++ * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. ++ */ ++static unsigned short bfq_weight_to_ioprio(int weight) ++{ ++ BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); ++ return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ? ++ 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight; ++} ++ ++static void bfq_get_entity(struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ if (bfqq) { ++ bfqq->ref++; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", ++ bfqq, bfqq->ref); ++ } ++} ++ ++/** ++ * bfq_find_deepest - find the deepest node that an extraction can modify. ++ * @node: the node being removed. 
++ * ++ * Do the first step of an extraction in an rb tree, looking for the ++ * node that will replace @node, and returning the deepest node that ++ * the following modifications to the tree can touch. If @node is the ++ * last node in the tree return %NULL. ++ */ ++static struct rb_node *bfq_find_deepest(struct rb_node *node) ++{ ++ struct rb_node *deepest; ++ ++ if (!node->rb_right && !node->rb_left) ++ deepest = rb_parent(node); ++ else if (!node->rb_right) ++ deepest = node->rb_left; ++ else if (!node->rb_left) ++ deepest = node->rb_right; ++ else { ++ deepest = rb_next(node); ++ if (deepest->rb_right) ++ deepest = deepest->rb_right; ++ else if (rb_parent(deepest) != node) ++ deepest = rb_parent(deepest); ++ } ++ ++ return deepest; ++} ++ ++/** ++ * bfq_active_extract - remove an entity from the active tree. ++ * @st: the service_tree containing the tree. ++ * @entity: the entity being removed. ++ */ ++static void bfq_active_extract(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *node; ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ struct bfq_sched_data *sd = NULL; ++ struct bfq_group *bfqg = NULL; ++ struct bfq_data *bfqd = NULL; ++#endif ++ ++ node = bfq_find_deepest(&entity->rb_node); ++ bfq_extract(&st->active, entity); ++ ++ if (node) ++ bfq_update_active_tree(node); ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ sd = entity->sched_data; ++ bfqg = container_of(sd, struct bfq_group, sched_data); ++ BUG_ON(!bfqg); ++ bfqd = (struct bfq_data *)bfqg->bfqd; ++#endif ++ if (bfqq) ++ list_del(&bfqq->bfqq_list); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ if (bfqg != bfqd->root_group) { ++ BUG_ON(!bfqg); ++ BUG_ON(!bfqd); ++ BUG_ON(!bfqg->active_entities); ++ bfqg->active_entities--; ++ } ++#endif ++} ++ ++/** ++ * bfq_idle_insert - insert an entity into the idle tree. ++ * @st: the service tree containing the tree. ++ * @entity: the entity to insert. ++ */ ++static void bfq_idle_insert(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct bfq_entity *first_idle = st->first_idle; ++ struct bfq_entity *last_idle = st->last_idle; ++ ++ if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) ++ st->first_idle = entity; ++ if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) ++ st->last_idle = entity; ++ ++ bfq_insert(&st->idle, entity); ++ ++ if (bfqq) ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); ++} ++ ++/** ++ * bfq_forget_entity - do not consider entity any longer for scheduling ++ * @st: the service tree. ++ * @entity: the entity being removed. ++ * @is_in_service: true if entity is currently the in-service entity. ++ * ++ * Forget everything about @entity. In addition, if entity represents ++ * a queue, and the latter is not in service, then release the service ++ * reference to the queue (the one taken through bfq_get_entity). In ++ * fact, in this case, there is really no more service reference to ++ * the queue, as the latter is also outside any service tree. If, ++ * instead, the queue is in service, then __bfq_bfqd_reset_in_service ++ * will take care of putting the reference when the queue finally ++ * stops being served. 
++ */ ++static void bfq_forget_entity(struct bfq_service_tree *st, ++ struct bfq_entity *entity, ++ bool is_in_service) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ BUG_ON(!entity->on_st); ++ ++ entity->on_st = false; ++ st->wsum -= entity->weight; ++ if (bfqq && !is_in_service) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "(before): %p %d", ++ bfqq, bfqq->ref); ++ bfq_put_queue(bfqq); ++ } ++} ++ ++/** ++ * bfq_put_idle_entity - release the idle tree ref of an entity. ++ * @st: service tree for the entity. ++ * @entity: the entity being released. ++ */ ++static void bfq_put_idle_entity(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ bfq_idle_extract(st, entity); ++ bfq_forget_entity(st, entity, ++ entity == entity->sched_data->in_service_entity); ++} ++ ++/** ++ * bfq_forget_idle - update the idle tree if necessary. ++ * @st: the service tree to act upon. ++ * ++ * To preserve the global O(log N) complexity we only remove one entry here; ++ * as the idle tree will not grow indefinitely this can be done safely. ++ */ ++static void bfq_forget_idle(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *first_idle = st->first_idle; ++ struct bfq_entity *last_idle = st->last_idle; ++ ++ if (RB_EMPTY_ROOT(&st->active) && last_idle && ++ !bfq_gt(last_idle->finish, st->vtime)) { ++ /* ++ * Forget the whole idle tree, increasing the vtime past ++ * the last finish time of idle entities. ++ */ ++ st->vtime = last_idle->finish; ++ } ++ ++ if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) ++ bfq_put_idle_entity(st, first_idle); ++} ++ ++/* ++ * Update weight and priority of entity. If update_class_too is true, ++ * then update the ioprio_class of entity too. ++ * ++ * The reason why the update of ioprio_class is controlled through the ++ * last parameter is as follows. Changing the ioprio class of an ++ * entity implies changing the destination service trees for that ++ * entity. If such a change occurred when the entity is already on one ++ * of the service trees for its previous class, then the state of the ++ * entity would become more complex: none of the new possible service ++ * trees for the entity, according to bfq_entity_service_tree(), would ++ * match any of the possible service trees on which the entity ++ * is. Complex operations involving these trees, such as entity ++ * activations and deactivations, should take into account this ++ * additional complexity. To avoid this issue, this function is ++ * invoked with update_class_too unset in the points in the code where ++ * entity may happen to be on some tree. 
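++ * ++ * For example, switching a queue from the RT to the BE class while ++ * the queue still sits on an RT service tree would make every tree ++ * returned by bfq_entity_service_tree() for the new class disagree ++ * with the tree the queue is on; such callers therefore pass ++ * update_class_too unset, and the class change remains pending.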
++ */ ++static struct bfq_service_tree * ++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, ++ struct bfq_entity *entity, ++ bool update_class_too) ++{ ++ struct bfq_service_tree *new_st = old_st; ++ ++ if (entity->prio_changed) { ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ unsigned int prev_weight, new_weight; ++ struct bfq_data *bfqd = NULL; ++ struct rb_root *root; ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ struct bfq_sched_data *sd; ++ struct bfq_group *bfqg; ++#endif ++ ++ if (bfqq) ++ bfqd = bfqq->bfqd; ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ else { ++ sd = entity->my_sched_data; ++ bfqg = container_of(sd, struct bfq_group, sched_data); ++ BUG_ON(!bfqg); ++ bfqd = (struct bfq_data *)bfqg->bfqd; ++ BUG_ON(!bfqd); ++ } ++#endif ++ ++ BUG_ON(entity->tree && update_class_too); ++ BUG_ON(old_st->wsum < entity->weight); ++ old_st->wsum -= entity->weight; ++ ++ if (entity->new_weight != entity->orig_weight) { ++ if (entity->new_weight < BFQ_MIN_WEIGHT || ++ entity->new_weight > BFQ_MAX_WEIGHT) { ++ pr_crit("update_weight_prio: new_weight %d\n", ++ entity->new_weight); ++ if (entity->new_weight < BFQ_MIN_WEIGHT) ++ entity->new_weight = BFQ_MIN_WEIGHT; ++ else ++ entity->new_weight = BFQ_MAX_WEIGHT; ++ } ++ entity->orig_weight = entity->new_weight; ++ if (bfqq) ++ bfqq->ioprio = ++ bfq_weight_to_ioprio(entity->orig_weight); ++ } ++ ++ if (bfqq && update_class_too) ++ bfqq->ioprio_class = bfqq->new_ioprio_class; ++ ++ /* ++ * Reset prio_changed only if the ioprio_class change ++ * is not pending any longer. ++ */ ++ if (!bfqq || bfqq->ioprio_class == bfqq->new_ioprio_class) ++ entity->prio_changed = 0; ++ ++ /* ++ * NOTE: here we may be changing the weight too early, ++ * this will cause unfairness. The correct approach ++ * would have required additional complexity to defer ++ * weight changes to the proper time instants (i.e., ++ * when entity->finish <= old_st->vtime). ++ */ ++ new_st = bfq_entity_service_tree(entity); ++ ++ prev_weight = entity->weight; ++ new_weight = entity->orig_weight * ++ (bfqq ? bfqq->wr_coeff : 1); ++ /* ++ * If the weight of the entity changes and the entity is a ++ * queue, remove the entity from its old weight counter (if ++ * there is a counter associated with the entity). ++ */ ++ if (prev_weight != new_weight && bfqq) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "weight changed %d %d(%d %d)", ++ prev_weight, new_weight, ++ entity->orig_weight, ++ bfqq->wr_coeff); ++ ++ root = &bfqd->queue_weights_tree; ++ __bfq_weights_tree_remove(bfqd, bfqq, root); ++ } ++ entity->weight = new_weight; ++ /* ++ * Add the entity, if it is not a weight-raised queue, to the ++ * counter associated with its new weight. ++ */ ++ if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1) { ++ /* If we get here, root has been initialized. */ ++ bfq_weights_tree_add(bfqd, bfqq, root); ++ } ++ ++ new_st->wsum += entity->weight; ++ ++ if (new_st != old_st) { ++ BUG_ON(!update_class_too); ++ entity->start = new_st->vtime; ++ } ++ } ++ ++ return new_st; ++} ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); ++#endif ++ ++/** ++ * bfq_bfqq_served - update the scheduler status after selection for ++ * service. ++ * @bfqq: the queue being served. ++ * @served: bytes to transfer. ++ * ++ * NOTE: this can be optimized, as the timestamps of upper level entities ++ * are synchronized every time a new bfqq is selected for service. By now, ++ * we keep it to better check consistency. 
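++ * ++ * Illustrative arithmetic (invented values): with WFQ_SERVICE_SHIFT ++ * == 22 and st->wsum == 300, charging served == 3000 advances the ++ * virtual time by bfq_delta(3000, 300) == (3000 << 22) / 300 == ++ * 10 << 22 == 41943040 units.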
++ */ ++static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_service_tree *st; ++ ++ if (!bfqq->service_from_backlogged) ++ bfqq->first_IO_time = jiffies; ++ ++ if (bfqq->wr_coeff > 1) ++ bfqq->service_from_wr += served; ++ ++ bfqq->service_from_backlogged += served; ++ for_each_entity(entity) { ++ st = bfq_entity_service_tree(entity); ++ ++ entity->service += served; ++ ++ BUG_ON(st->wsum == 0); ++ ++ st->vtime += bfq_delta(served, st->wsum); ++ bfq_forget_idle(st); ++ } ++#ifndef BFQ_MQ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); ++#endif ++#endif ++ st = bfq_entity_service_tree(&bfqq->entity); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", ++ served, ((st->vtime>>10)*1000)>>12, st); ++} ++ ++/** ++ * bfq_bfqq_charge_time - charge an amount of service equivalent to the length ++ * of the time interval during which bfqq has been in ++ * service. ++ * @bfqd: the device ++ * @bfqq: the queue that needs a service update. ++ * @time_ms: the amount of time during which the queue has received service ++ * ++ * If a queue does not consume its budget fast enough, then providing ++ * the queue with service fairness may impair throughput, more or less ++ * severely. For this reason, queues that consume their budget slowly ++ * are provided with time fairness instead of service fairness. This ++ * goal is achieved through the BFQ scheduling engine, even if such an ++ * engine works in the service, and not in the time domain. The trick ++ * is charging these queues with an inflated amount of service, equal ++ * to the amount of service that they would have received during their ++ * service slot if they had been fast, i.e., if their requests had ++ * been dispatched at a rate equal to the estimated peak rate. ++ * ++ * It is worth noting that time fairness can cause important ++ * distortions in terms of bandwidth distribution, on devices with ++ * internal queueing. The reason is that I/O requests dispatched ++ * during the service slot of a queue may be served after that service ++ * slot is finished, and may have a total processing time loosely ++ * correlated with the duration of the service slot. This is ++ * especially true for short service slots. 
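++ * ++ * Worked example (invented values): with bfq_max_budget == 16384 ++ * and a timeout of 125 ms, a queue expired after 25 ms in service ++ * is charged at least (16384 * 25) / 125 == 3276 sectors-worth of ++ * service, however little it actually transferred.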
++ */ ++static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ unsigned long time_ms) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ unsigned long timeout_ms = jiffies_to_msecs(bfq_timeout); ++ unsigned long bounded_time_ms = min(time_ms, timeout_ms); ++ int serv_to_charge_for_time = ++ (bfqd->bfq_max_budget * bounded_time_ms) / timeout_ms; ++ int tot_serv_to_charge = max(serv_to_charge_for_time, entity->service); ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "%lu/%lu ms, %d/%d/%d/%d sectors", ++ time_ms, timeout_ms, ++ entity->service, ++ tot_serv_to_charge, ++ bfqd->bfq_max_budget, ++ entity->budget); ++ ++ /* Increase budget to avoid inconsistencies */ ++ if (tot_serv_to_charge > entity->budget) ++ entity->budget = tot_serv_to_charge; ++ ++ bfq_bfqq_served(bfqq, ++ max_t(int, 0, tot_serv_to_charge - entity->service)); ++} ++ ++static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, ++ struct bfq_service_tree *st, ++ bool backshifted) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct bfq_sched_data *sd = entity->sched_data; ++ ++ /* ++ * When this function is invoked, entity is not in any service ++ * tree, then it is safe to invoke next function with the last ++ * parameter set (see the comments on the function). ++ */ ++ BUG_ON(entity->tree); ++ st = __bfq_entity_update_weight_prio(st, entity, true); ++ bfq_calc_finish(entity, entity->budget); ++ ++ /* ++ * If some queues enjoy backshifting for a while, then their ++ * (virtual) finish timestamps may happen to become lower and ++ * lower than the system virtual time. In particular, if ++ * these queues often happen to be idle for short time ++ * periods, and during such time periods other queues with ++ * higher timestamps happen to be busy, then the backshifted ++ * timestamps of the former queues can become much lower than ++ * the system virtual time. In fact, to serve the queues with ++ * higher timestamps while the ones with lower timestamps are ++ * idle, the system virtual time may be pushed-up to much ++ * higher values than the finish timestamps of the idle ++ * queues. As a consequence, the finish timestamps of all new ++ * or newly activated queues may end up being much larger than ++ * those of lucky queues with backshifted timestamps. The ++ * latter queues may then monopolize the device for a lot of ++ * time. This would simply break service guarantees. ++ * ++ * To reduce this problem, push up a little bit the ++ * backshifted timestamps of the queue associated with this ++ * entity (only a queue can happen to have the backshifted ++ * flag set): just enough to let the finish timestamp of the ++ * queue be equal to the current value of the system virtual ++ * time. This may introduce a little unfairness among queues ++ * with backshifted timestamps, but it does not break ++ * worst-case fairness guarantees. ++ * ++ * As a special case, if bfqq is weight-raised, push up ++ * timestamps much less, to keep very low the probability that ++ * this push up causes the backshifted finish timestamps of ++ * weight-raised queues to become higher than the backshifted ++ * finish timestamps of non weight-raised queues. 
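++ * ++ * Numerical sketch (invented values): with st->vtime == 1000 and ++ * entity->finish == 400, the push-up below is delta == 600 for a ++ * non-weight-raised queue, whose finish thus becomes exactly 1000, ++ * but only 600 / 30 == 20 for a queue with wr_coeff == 30, whose ++ * finish becomes 420.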
++ */ ++ if (backshifted && bfq_gt(st->vtime, entity->finish)) { ++ unsigned long delta = st->vtime - entity->finish; ++ ++ if (bfqq) ++ delta /= bfqq->wr_coeff; ++ ++ entity->start += delta; ++ entity->finish += delta; ++ ++ if (bfqq) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "new queue finish %llu", ++ ((entity->finish>>10)*1000)>>12); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ } else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "new group finish %llu", ++ ((entity->finish>>10)*1000)>>12); ++#endif ++ } ++ } ++ ++ bfq_active_insert(st, entity); ++ ++ if (bfqq) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "queue %seligible in st %p", ++ entity->start <= st->vtime ? "" : "non ", st); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ } else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "group %seligible in st %p", ++ entity->start <= st->vtime ? "" : "non ", st); ++#endif ++ } ++ BUG_ON(RB_EMPTY_ROOT(&st->active)); ++ BUG_ON(&st->active != &sd->service_tree->active && ++ &st->active != &(sd->service_tree+1)->active && ++ &st->active != &(sd->service_tree+2)->active); ++} ++ ++/** ++ * __bfq_activate_entity - handle activation of entity. ++ * @entity: the entity being activated. ++ * @non_blocking_wait_rq: true if entity was waiting for a request ++ * ++ * Called for a 'true' activation, i.e., if entity is not active and ++ * one of its children receives a new request. ++ * ++ * Basically, this function updates the timestamps of entity and ++ * inserts entity into its active tree, after possibly extracting it ++ * from its idle tree. ++ */ ++static void __bfq_activate_entity(struct bfq_entity *entity, ++ bool non_blocking_wait_rq) ++{ ++ struct bfq_sched_data *sd = entity->sched_data; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ bool backshifted = false; ++ unsigned long long min_vstart; ++ ++ BUG_ON(!sd); ++ BUG_ON(!st); ++ ++ /* See comments on bfq_bfqq_update_budg_for_activation */ ++ if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { ++ backshifted = true; ++ min_vstart = entity->finish; ++ } else ++ min_vstart = st->vtime; ++ ++ if (entity->tree == &st->idle) { ++ /* ++ * Must be on the idle tree, bfq_idle_extract() will ++ * check for that. ++ */ ++ bfq_idle_extract(st, entity); ++ BUG_ON(entity->tree); ++ entity->start = bfq_gt(min_vstart, entity->finish) ? ++ min_vstart : entity->finish; ++ } else { ++ BUG_ON(entity->tree); ++ /* ++ * The finish time of the entity may be invalid, and ++ * it is in the past for sure, otherwise the queue ++ * would have been on the idle tree. ++ */ ++ entity->start = min_vstart; ++ st->wsum += entity->weight; ++ /* ++ * entity is about to be inserted into a service tree, ++ * and then set in service: get a reference to make ++ * sure entity does not disappear until it is no ++ * longer in service or scheduled for service.
++ */ ++ bfq_get_entity(entity); ++ ++ BUG_ON(entity->on_st && bfqq); ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ if (entity->on_st && !bfqq) { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, ++ entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, ++ bfqg, ++ "activate bug, class %d in_service %p", ++ bfq_class_idx(entity), sd->in_service_entity); ++ } ++#endif ++ BUG_ON(entity->on_st && !bfqq); ++ entity->on_st = true; ++ } ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */ ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ struct bfq_data *bfqd = bfqg->bfqd; ++ ++ BUG_ON(!bfqd); ++ if (!entity->in_groups_with_pending_reqs) { ++ entity->in_groups_with_pending_reqs = true; ++ bfqd->num_groups_with_pending_reqs++; ++ } ++ bfq_log_bfqg(bfqd, bfqg, "num_groups_with_pending_reqs %u", ++ bfqd->num_groups_with_pending_reqs); ++ } ++#endif ++ ++ bfq_update_fin_time_enqueue(entity, st, backshifted); ++} ++ ++/** ++ * __bfq_requeue_entity - handle requeueing or repositioning of an entity. ++ * @entity: the entity being requeued or repositioned. ++ * ++ * Requeueing is needed if this entity stops being served, which ++ * happens if a leaf descendant entity has expired. On the other hand, ++ * repositioning is needed if the next_in_service entity for the child ++ * entity has changed. See the comments inside the function for ++ * details. ++ * ++ * Basically, this function: 1) removes entity from its active tree if ++ * present there, 2) updates the timestamps of entity and 3) inserts ++ * entity back into its active tree (in the new, right position for ++ * the new values of the timestamps). ++ */ ++static void __bfq_requeue_entity(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sd = entity->sched_data; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ ++ BUG_ON(!sd); ++ BUG_ON(!st); ++ ++ BUG_ON(entity != sd->in_service_entity && ++ entity->tree != &st->active); ++ ++ if (entity == sd->in_service_entity) { ++ /* ++ * We are requeueing the current in-service entity, ++ * which may have to be done for one of the following ++ * reasons: ++ * - entity represents the in-service queue, and the ++ * in-service queue is being requeued after an ++ * expiration; ++ * - entity represents a group, and its budget has ++ * changed because one of its child entities has ++ * just been either activated or requeued for some ++ * reason; the timestamps of the entity need then to ++ * be updated, and the entity needs to be enqueued ++ * or repositioned accordingly. ++ * ++ * In particular, before requeueing, the start time of ++ * the entity must be moved forward to account for the ++ * service that the entity has received while in ++ * service. This is done by the next instructions. The ++ * finish time will then be updated according to this ++ * new value of the start time, and to the budget of ++ * the entity. ++ */ ++ bfq_calc_finish(entity, entity->service); ++ entity->start = entity->finish; ++ BUG_ON(entity->tree && entity->tree == &st->idle); ++ BUG_ON(entity->tree && entity->tree != &st->active); ++ /* ++ * In addition, if the entity had more than one child ++ * when set in service, then it was not extracted from ++ * the active tree.
This implies that the position of ++ * the entity in the active tree may need to be ++ * changed now, because we have just updated the start ++ * time of the entity, and we will update its finish ++ * time in a moment (the requeueing is then, more ++ * precisely, a repositioning in this case). To ++ * implement this repositioning, we: 1) dequeue the ++ * entity here, 2) update the finish time and requeue ++ * the entity according to the new timestamps below. ++ */ ++ if (entity->tree) ++ bfq_active_extract(st, entity); ++ } else { /* The entity is already active, and not in service */ ++ /* ++ * In this case, this function gets called only if the ++ * next_in_service entity below this entity has ++ * changed, and this change has caused the budget of ++ * this entity to change, which, finally implies that ++ * the finish time of this entity must be ++ * updated. Such an update may cause the scheduling, ++ * i.e., the position in the active tree, of this ++ * entity to change. We handle this change by: 1) ++ * dequeueing the entity here, 2) updating the finish ++ * time and requeueing the entity according to the new ++ * timestamps below. This is the same approach as the ++ * non-extracted-entity sub-case above. ++ */ ++ bfq_active_extract(st, entity); ++ } ++ ++ bfq_update_fin_time_enqueue(entity, st, false); ++} ++ ++static void __bfq_activate_requeue_entity(struct bfq_entity *entity, ++ struct bfq_sched_data *sd, ++ bool non_blocking_wait_rq) ++{ ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ ++ if (sd->in_service_entity == entity || entity->tree == &st->active) ++ /* ++ * in service or already queued on the active tree, ++ * requeue or reposition ++ */ ++ __bfq_requeue_entity(entity); ++ else ++ /* ++ * Not in service and not queued on its active tree: ++ * the activity is idle and this is a true activation. ++ */ ++ __bfq_activate_entity(entity, non_blocking_wait_rq); ++} ++ ++ ++/** ++ * bfq_activate_requeue_entity - activate or requeue an entity representing a bfq_queue, ++ * and activate, requeue or reposition all ancestors ++ * for which such an update becomes necessary. ++ * @entity: the entity to activate. ++ * @non_blocking_wait_rq: true if this entity was waiting for a request ++ * @requeue: true if this is a requeue, which implies that bfqq is ++ * being expired; thus ALL its ancestors stop being served and must ++ * therefore be requeued ++ * @expiration: true if this function is being invoked in the expiration path ++ * of the in-service queue ++ */ ++static void bfq_activate_requeue_entity(struct bfq_entity *entity, ++ bool non_blocking_wait_rq, ++ bool requeue, bool expiration) ++{ ++ struct bfq_sched_data *sd; ++ ++ for_each_entity(entity) { ++ BUG_ON(!entity); ++ sd = entity->sched_data; ++ __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); ++ ++ BUG_ON(RB_EMPTY_ROOT(&sd->service_tree->active) && ++ RB_EMPTY_ROOT(&(sd->service_tree+1)->active) && ++ RB_EMPTY_ROOT(&(sd->service_tree+2)->active)); ++ ++ if (!bfq_update_next_in_service(sd, entity, expiration) && ++ !requeue) { ++ BUG_ON(!sd->next_in_service); ++ break; ++ } ++ BUG_ON(!sd->next_in_service); ++ } ++} ++ ++/** ++ * __bfq_deactivate_entity - update sched_data and service trees for ++ * entity, so as to represent entity as inactive ++ * @entity: the entity being deactivated. ++ * @ins_into_idle_tree: if false, the entity will not be put into the ++ * idle tree. ++ * ++ * If necessary and allowed, puts entity into the idle tree. NOTE: ++ * entity may be on no tree if in service. 
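++ * ++ * For instance (invented values), with st->vtime == 500: a ++ * deactivated entity with finish == 700 is parked on the idle ++ * tree, to preserve service guarantees, while an entity with ++ * finish == 300 is simply forgotten.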
++ */ ++static bool __bfq_deactivate_entity(struct bfq_entity *entity, ++ bool ins_into_idle_tree) ++{ ++ struct bfq_sched_data *sd = entity->sched_data; ++ struct bfq_service_tree *st; ++ bool is_in_service; ++ ++ if (!entity->on_st) { /* entity never activated, or already inactive */ ++ BUG_ON(sd && entity == sd->in_service_entity); ++ return false; ++ } ++ ++ /* ++ * If we get here, then entity is active, which implies that ++ * bfq_group_set_parent has already been invoked for the group ++ * represented by entity. Therefore, the field ++ * entity->sched_data has been set, and we can safely use it. ++ */ ++ st = bfq_entity_service_tree(entity); ++ is_in_service = entity == sd->in_service_entity; ++ ++ BUG_ON(is_in_service && entity->tree && entity->tree != &st->active); ++ ++ bfq_calc_finish(entity, entity->service); ++ ++ if (is_in_service) { ++ sd->in_service_entity = NULL; ++ } else ++ /* ++ * Non in-service entity: nobody will take care of ++ * resetting its service counter on expiration. Do it ++ * now. ++ */ ++ entity->service = 0; ++ ++ if (entity->tree == &st->active) ++ bfq_active_extract(st, entity); ++ else if (!is_in_service && entity->tree == &st->idle) ++ bfq_idle_extract(st, entity); ++ else if (entity->tree) ++ BUG(); ++ ++ if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime)) ++ bfq_forget_entity(st, entity, is_in_service); ++ else ++ bfq_idle_insert(st, entity); ++ ++ return true; ++} ++ ++/** ++ * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. ++ * @entity: the entity to deactivate. ++ * @ins_into_idle_tree: true if the entity can be put into the idle tree ++ * @expiration: true if this function is being invoked in the expiration path ++ * of the in-service queue ++ */ ++static void bfq_deactivate_entity(struct bfq_entity *entity, ++ bool ins_into_idle_tree, ++ bool expiration) ++{ ++ struct bfq_sched_data *sd; ++ struct bfq_entity *parent = NULL; ++ ++ for_each_entity_safe(entity, parent) { ++ sd = entity->sched_data; ++ ++ BUG_ON(sd == NULL); /* ++ * It would mean that this is the ++ * root group. ++ */ ++ ++ BUG_ON(expiration && entity != sd->in_service_entity); ++ ++ BUG_ON(entity != sd->in_service_entity && ++ entity->tree == ++ &bfq_entity_service_tree(entity)->active && ++ !sd->next_in_service); ++ ++ if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) { ++ /* ++ * entity is not in any tree any more, so ++ * this deactivation is a no-op, and there is ++ * nothing to change for upper-level entities ++ * (in case of expiration, this can never ++ * happen). ++ */ ++ BUG_ON(expiration); /* ++ * entity cannot be already out of ++ * any tree ++ */ ++ return; ++ } ++ ++ if (sd->next_in_service == entity) ++ /* ++ * entity was the next_in_service entity, ++ * then, since entity has just been ++ * deactivated, a new one must be found. ++ */ ++ bfq_update_next_in_service(sd, NULL, expiration); ++ ++ if (sd->next_in_service || sd->in_service_entity) { ++ /* ++ * The parent entity is still active, because ++ * either next_in_service or in_service_entity ++ * is not NULL. So, no further upwards ++ * deactivation must be performed. Yet, ++ * next_in_service has changed. Then the ++ * schedule does need to be updated upwards. ++ * ++ * NOTE If in_service_entity is not NULL, then ++ * next_in_service may happen to be NULL, ++ * although the parent entity is evidently ++ * active. 
This happens if 1) the entity
++		 * pointed by in_service_entity is the only
++		 * active entity in the parent entity, and 2)
++		 * according to the definition of
++		 * next_in_service, the in_service_entity
++		 * cannot be considered as
++		 * next_in_service. See the comments on the
++		 * definition of next_in_service for details.
++		 */
++		BUG_ON(sd->next_in_service == entity);
++		BUG_ON(sd->in_service_entity == entity);
++		break;
++	}
++
++	/*
++	 * If we get here, then the parent is no longer
++	 * backlogged and we need to propagate the
++	 * deactivation upwards. Thus let the loop go on.
++	 */
++
++	/*
++	 * Also let parent be queued into the idle tree on
++	 * deactivation, to preserve service guarantees, and
++	 * assuming that the caller of this function does not
++	 * need parent entities to be removed completely as
++	 * well.
++	 */
++	ins_into_idle_tree = true;
++	}
++
++	/*
++	 * If the deactivation loop is fully executed, then there are
++	 * no more entities to touch and the next loop is not executed
++	 * at all. Otherwise, requeue remaining entities if they are
++	 * about to stop receiving service, or reposition them if this
++	 * is not the case.
++	 */
++	entity = parent;
++	for_each_entity(entity) {
++		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
++
++		/*
++		 * Invoke __bfq_requeue_entity on entity, even if
++		 * already active, to requeue/reposition it in the
++		 * active tree (because sd->next_in_service has
++		 * changed)
++		 */
++		__bfq_requeue_entity(entity);
++
++		sd = entity->sched_data;
++		BUG_ON(expiration && sd->in_service_entity != entity);
++
++		if (bfqq)
++			bfq_log_bfqq(bfqq->bfqd, bfqq,
++				     "invoking update_next for this queue");
++#ifdef BFQ_GROUP_IOSCHED_ENABLED
++		else {
++			struct bfq_group *bfqg =
++				container_of(entity,
++					     struct bfq_group, entity);
++
++			bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
++				     "invoking update_next for this entity");
++		}
++#endif
++		if (!bfq_update_next_in_service(sd, entity, expiration) &&
++		    !expiration)
++			/*
++			 * next_in_service unchanged or not causing
++			 * any change in entity->parent->sd, and no
++			 * requeueing needed for expiration: stop
++			 * here.
++			 */
++			break;
++	}
++}
++
++/**
++ * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
++ *                       if needed, to have at least one entity eligible.
++ * @st: the service tree to act upon.
++ *
++ * Assumes that st is not empty.
++ */
++static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
++{
++	struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);
++
++	if (bfq_gt(root_entity->min_start, st->vtime)) {
++		struct bfq_queue *bfqq = bfq_entity_to_bfqq(root_entity);
++
++		if (bfqq)
++			bfq_log_bfqq(bfqq->bfqd, bfqq,
++				     "new value %llu",
++				     ((root_entity->min_start>>10)*1000)>>12);
++#ifdef BFQ_GROUP_IOSCHED_ENABLED
++		else {
++			struct bfq_group *bfqg =
++				container_of(root_entity, struct bfq_group,
++					     entity);
++
++			bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
++				     "new value %llu",
++				     ((root_entity->min_start>>10)*1000)>>12);
++		}
++#endif
++		return root_entity->min_start;
++	}
++	return st->vtime;
++}
++
++static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
++{
++	if (new_value > st->vtime) {
++		st->vtime = new_value;
++		bfq_forget_idle(st);
++	}
++}
++
++/**
++ * bfq_first_active_entity - find the eligible entity with
++ *                           the smallest finish time
++ * @st: the service tree to select from.
++ * @vtime: the system virtual time to use as a reference for eligibility.
++ *
++ * This function searches the first schedulable entity, starting from the
++ * root of the tree and going on the left every time on this side there is
++ * a subtree with at least one eligible (start <= vtime) entity. The path on
++ * the right is followed only if a) the left subtree contains no eligible
++ * entities and b) no eligible entity has been found yet.
++ */
++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
++						  u64 vtime)
++{
++	struct bfq_entity *entry, *first = NULL;
++	struct rb_node *node = st->active.rb_node;
++
++	while (node) {
++		entry = rb_entry(node, struct bfq_entity, rb_node);
++left:
++		if (!bfq_gt(entry->start, vtime))
++			first = entry;
++
++		BUG_ON(bfq_gt(entry->min_start, vtime));
++
++		if (node->rb_left) {
++			entry = rb_entry(node->rb_left,
++					 struct bfq_entity, rb_node);
++			if (!bfq_gt(entry->min_start, vtime)) {
++				node = node->rb_left;
++				goto left;
++			}
++		}
++		if (first)
++			break;
++		node = node->rb_right;
++	}
++
++	BUG_ON(!first && !RB_EMPTY_ROOT(&st->active));
++	return first;
++}
++
++/**
++ * __bfq_lookup_next_entity - return the first eligible entity in @st.
++ * @st: the service tree.
++ *
++ * If there is no in-service entity for the sched_data st belongs to,
++ * then return the entity that will be set in service if:
++ * 1) the parent entity this st belongs to is set in service;
++ * 2) no entity belonging to such parent entity undergoes a state change
++ * that would influence the timestamps of the entity (e.g., becomes idle,
++ * becomes backlogged, changes its budget, ...).
++ *
++ * In this first case, update the virtual time in @st too (see the
++ * comments on this update inside the function).
++ *
++ * In contrast, if there is an in-service entity, then return the
++ * entity that would be set in service if not only the above
++ * conditions, but also the next one holds true: the currently
++ * in-service entity, on expiration,
++ * 1) gets a finish time equal to the current one, or
++ * 2) is not eligible any more, or
++ * 3) is idle.
++ */
++static struct bfq_entity *
++__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
++{
++	struct bfq_entity *entity;
++	u64 new_vtime;
++	struct bfq_queue *bfqq;
++
++	if (RB_EMPTY_ROOT(&st->active))
++		return NULL;
++
++	/*
++	 * Get the value of the system virtual time for which at
++	 * least one entity is eligible.
++	 */
++	new_vtime = bfq_calc_vtime_jump(st);
++
++	/*
++	 * If there is no in-service entity for the sched_data this
++	 * active tree belongs to, then push the system virtual time
++	 * up to the value that guarantees that at least one entity is
++	 * eligible. If, instead, there is an in-service entity, then
++	 * do not make any such update, because there is already an
++	 * eligible entity, namely the in-service one (even if the
++	 * entity is not on st, because it was extracted when set in
++	 * service).
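++	 *
++	 * As a concrete illustration of the jump computed by
++	 * bfq_calc_vtime_jump above: if st->vtime is, say, 100 while
++	 * the smallest start time among the active entities
++	 * (root_entity->min_start) is 130, then new_vtime is 130, and
++	 * pushing st->vtime up to it makes the entity with that start
++	 * time eligible (start <= vtime).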
++	 */
++	if (!in_service)
++		bfq_update_vtime(st, new_vtime);
++
++	entity = bfq_first_active_entity(st, new_vtime);
++	BUG_ON(bfq_gt(entity->start, new_vtime));
++
++	/* Log some information */
++	bfqq = bfq_entity_to_bfqq(entity);
++	if (bfqq)
++		bfq_log_bfqq(bfqq->bfqd, bfqq,
++			     "start %llu vtime %llu st %p",
++			     ((entity->start>>10)*1000)>>12,
++			     ((new_vtime>>10)*1000)>>12, st);
++#ifdef BFQ_GROUP_IOSCHED_ENABLED
++	else {
++		struct bfq_group *bfqg =
++			container_of(entity, struct bfq_group, entity);
++
++		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
++			     "start %llu vtime %llu (%llu) st %p",
++			     ((entity->start>>10)*1000)>>12,
++			     ((st->vtime>>10)*1000)>>12,
++			     ((new_vtime>>10)*1000)>>12, st);
++	}
++#endif
++
++	BUG_ON(!entity);
++
++	return entity;
++}
++
++/**
++ * bfq_lookup_next_entity - return the first eligible entity in @sd.
++ * @sd: the sched_data.
++ * @expiration: true if we are on the expiration path of the in-service queue
++ *
++ * This function is invoked when there has been a change in the trees
++ * for sd, and we need to know what is the new next entity to serve
++ * after this change.
++ */
++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
++						 bool expiration)
++{
++	struct bfq_service_tree *st = sd->service_tree;
++	struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
++	struct bfq_entity *entity = NULL;
++	struct bfq_queue *bfqq;
++	int class_idx = 0;
++
++	BUG_ON(!sd);
++	BUG_ON(!st);
++	/*
++	 * Choose from idle class, if needed to guarantee a minimum
++	 * bandwidth to this class (and if there is some active entity
++	 * in idle class). This should also mitigate
++	 * priority-inversion problems in case a low priority task is
++	 * holding file system resources.
++	 */
++	if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
++				   BFQ_CL_IDLE_TIMEOUT)) {
++		if (!RB_EMPTY_ROOT(&idle_class_st->active))
++			class_idx = BFQ_IOPRIO_CLASSES - 1;
++		/* About to be served if backlogged, or not yet backlogged */
++		sd->bfq_class_idle_last_service = jiffies;
++	}
++
++	/*
++	 * Find the next entity to serve for the highest-priority
++	 * class, unless the idle class needs to be served.
++	 */
++	for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
++		/*
++		 * If expiration is true, then bfq_lookup_next_entity
++		 * is being invoked as a part of the expiration path
++		 * of the in-service queue. In this case, even if
++		 * sd->in_service_entity is not NULL,
++		 * sd->in_service_entity at this point is actually not
++		 * in service any more, and, if needed, has already
++		 * been properly queued or requeued into the right
++		 * tree. The reason why sd->in_service_entity is still
++		 * not NULL here, even if expiration is true, is that
++		 * sd->in_service_entity is reset as a last step in the
++		 * expiration path. So, if expiration is true, tell
++		 * __bfq_lookup_next_entity that there is no
++		 * sd->in_service_entity.
++ */ ++ entity = __bfq_lookup_next_entity(st + class_idx, ++ sd->in_service_entity && ++ !expiration); ++ ++ if (entity) ++ break; ++ } ++ ++ BUG_ON(!entity && ++ (!RB_EMPTY_ROOT(&st->active) || !RB_EMPTY_ROOT(&(st+1)->active) || ++ !RB_EMPTY_ROOT(&(st+2)->active))); ++ ++ if (!entity) ++ return NULL; ++ ++ /* Log some information */ ++ bfqq = bfq_entity_to_bfqq(entity); ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d", ++ st + class_idx, class_idx); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "chosen from st %p %d", ++ st + class_idx, class_idx); ++ } ++#endif ++ ++ return entity; ++} ++ ++static bool next_queue_may_preempt(struct bfq_data *bfqd) ++{ ++ struct bfq_sched_data *sd = &bfqd->root_group->sched_data; ++ ++ return sd->next_in_service != sd->in_service_entity; ++} ++ ++/* ++ * Get next queue for service. ++ */ ++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_entity *entity = NULL; ++ struct bfq_sched_data *sd; ++ struct bfq_queue *bfqq; ++ ++ BUG_ON(bfqd->in_service_queue); ++ ++ if (bfq_tot_busy_queues(bfqd) == 0) ++ return NULL; ++ ++ /* ++ * Traverse the path from the root to the leaf entity to ++ * serve. Set in service all the entities visited along the ++ * way. ++ */ ++ sd = &bfqd->root_group->sched_data; ++ for (; sd ; sd = entity->my_sched_data) { ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ if (entity) { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg(bfqd, bfqg, ++ "lookup in this group"); ++ if (!sd->next_in_service) ++ pr_crit("lookup in this group"); ++ } else { ++ bfq_log_bfqg(bfqd, bfqd->root_group, ++ "lookup in root group"); ++ if (!sd->next_in_service) ++ pr_crit("lookup in root group"); ++ } ++#endif ++ ++ BUG_ON(!sd->next_in_service); ++ ++ /* ++ * WARNING. We are about to set the in-service entity ++ * to sd->next_in_service, i.e., to the (cached) value ++ * returned by bfq_lookup_next_entity(sd) the last ++ * time it was invoked, i.e., the last time when the ++ * service order in sd changed as a consequence of the ++ * activation or deactivation of an entity. In this ++ * respect, if we execute bfq_lookup_next_entity(sd) ++ * in this very moment, it may, although with low ++ * probability, yield a different entity than that ++ * pointed to by sd->next_in_service. This rare event ++ * happens in case there was no CLASS_IDLE entity to ++ * serve for sd when bfq_lookup_next_entity(sd) was ++ * invoked for the last time, while there is now one ++ * such entity. ++ * ++ * If the above event happens, then the scheduling of ++ * such entity in CLASS_IDLE is postponed until the ++ * service of the sd->next_in_service entity ++ * finishes. In fact, when the latter is expired, ++ * bfq_lookup_next_entity(sd) gets called again, ++ * exactly to update sd->next_in_service. ++ */ ++ ++ /* Make next_in_service entity become in_service_entity */ ++ entity = sd->next_in_service; ++ sd->in_service_entity = entity; ++ ++ /* ++ * If entity is no longer a candidate for next ++ * service, then it must be extracted from its active ++ * tree, so as to make sure that it won't be ++ * considered when computing next_in_service. See the ++ * comments on the function ++ * bfq_no_longer_next_in_service() for details. 
++ */ ++ if (bfq_no_longer_next_in_service(entity)) ++ bfq_active_extract(bfq_entity_service_tree(entity), ++ entity); ++ ++ /* ++ * Even if entity is not to be extracted according to ++ * the above check, a descendant entity may get ++ * extracted in one of the next iterations of this ++ * loop. Such an event could cause a change in ++ * next_in_service for the level of the descendant ++ * entity, and thus possibly back to this level. ++ * ++ * However, we cannot perform the resulting needed ++ * update of next_in_service for this level before the ++ * end of the whole loop, because, to know which is ++ * the correct next-to-serve candidate entity for each ++ * level, we need first to find the leaf entity to set ++ * in service. In fact, only after we know which is ++ * the next-to-serve leaf entity, we can discover ++ * whether the parent entity of the leaf entity ++ * becomes the next-to-serve, and so on. ++ */ ++ ++ /* Log some information */ ++ bfqq = bfq_entity_to_bfqq(entity); ++ if (bfqq) ++ bfq_log_bfqq(bfqd, bfqq, ++ "this queue, finish %llu", ++ (((entity->finish>>10)*1000)>>10)>>2); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg(bfqd, bfqg, ++ "this entity, finish %llu", ++ (((entity->finish>>10)*1000)>>10)>>2); ++ } ++#endif ++ ++ } ++ ++ BUG_ON(!entity); ++ bfqq = bfq_entity_to_bfqq(entity); ++ BUG_ON(!bfqq); ++ ++ /* ++ * We can finally update all next-to-serve entities along the ++ * path from the leaf entity just set in service to the root. ++ */ ++ for_each_entity(entity) { ++ struct bfq_sched_data *sd = entity->sched_data; ++ ++ if (!bfq_update_next_in_service(sd, NULL, false)) ++ break; ++ } ++ ++ return bfqq; ++} ++ ++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; ++ struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; ++ struct bfq_entity *entity = in_serv_entity; ++ ++#ifndef BFQ_MQ ++ if (bfqd->in_service_bic) { ++ put_io_context(bfqd->in_service_bic->icq.ioc); ++ bfqd->in_service_bic = NULL; ++ } ++#endif ++ ++ bfq_clear_bfqq_wait_request(in_serv_bfqq); ++ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); ++ bfqd->in_service_queue = NULL; ++ ++ /* ++ * When this function is called, all in-service entities have ++ * been properly deactivated or requeued, so we can safely ++ * execute the final step: reset in_service_entity along the ++ * path from entity to the root. ++ */ ++ for_each_entity(entity) ++ entity->sched_data->in_service_entity = NULL; ++ ++ /* ++ * in_serv_entity is no longer in service, so, if it is in no ++ * service tree either, then release the service reference to ++ * the queue it represents (taken with bfq_get_entity). 
++ */ ++ if (!in_serv_entity->on_st) ++ bfq_put_queue(in_serv_bfqq); ++} ++ ++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ bool ins_into_idle_tree, bool expiration) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ bfq_deactivate_entity(entity, ins_into_idle_tree, expiration); ++} ++ ++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ ++ BUG_ON(bfqq == bfqd->in_service_queue); ++ BUG_ON(entity->tree != &st->active && entity->tree != &st->idle && ++ entity->on_st); ++ ++ bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), ++ false, false); ++ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); ++} ++ ++static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ bool expiration) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ bfq_activate_requeue_entity(entity, false, ++ bfqq == bfqd->in_service_queue, expiration); ++} ++ ++static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); ++ ++/* ++ * Called when the bfqq no longer has requests pending, remove it from ++ * the service tree. As a special case, it can be invoked during an ++ * expiration. ++ */ ++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ bool expiration) ++{ ++ BUG_ON(!bfq_bfqq_busy(bfqq)); ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ bfq_log_bfqq(bfqd, bfqq, "del from busy"); ++ ++ bfq_clear_bfqq_busy(bfqq); ++ ++ BUG_ON(bfq_tot_busy_queues(bfqd) == 0); ++ bfqd->busy_queues[bfqq->ioprio_class - 1]--; ++ ++ if (bfqq->wr_coeff > 1) { ++ bfqd->wr_busy_queues--; ++ BUG_ON(bfqd->wr_busy_queues < 0); ++ } ++ ++ bfqg_stats_update_dequeue(bfqq_group(bfqq)); ++ ++ BUG_ON(bfqq->entity.budget < 0); ++ ++ bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); ++ if (!bfqq->dispatched) ++ bfq_weights_tree_remove(bfqd, bfqq); ++} ++ ++/* ++ * Called when an inactive queue receives a new request. ++ */ ++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ BUG_ON(bfq_bfqq_busy(bfqq)); ++ BUG_ON(bfqq == bfqd->in_service_queue); ++ ++ bfq_log_bfqq(bfqd, bfqq, "add to busy"); ++ ++ bfq_activate_bfqq(bfqd, bfqq); ++ ++ bfq_mark_bfqq_busy(bfqq); ++ bfqd->busy_queues[bfqq->ioprio_class - 1]++; ++ ++ if (!bfqq->dispatched) ++ if (bfqq->wr_coeff == 1) ++ bfq_weights_tree_add(bfqd, bfqq, ++ &bfqd->queue_weights_tree); ++ ++ if (bfqq->wr_coeff > 1) { ++ bfqd->wr_busy_queues++; ++ BUG_ON(bfqd->wr_busy_queues > bfq_tot_busy_queues(bfqd)); ++ } ++ ++} +diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c +new file mode 100644 +index 000000000000..6da94eef0cf1 +--- /dev/null ++++ b/block/bfq-sq-iosched.c +@@ -0,0 +1,5957 @@ ++/* ++ * Budget Fair Queueing (BFQ) I/O scheduler. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2015 Paolo Valente ++ * ++ * Copyright (C) 2017 Paolo Valente ++ * ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ ++ * file. ++ * ++ * BFQ is a proportional-share I/O scheduler, with some extra ++ * low-latency capabilities. BFQ also supports full hierarchical ++ * scheduling through cgroups. Next paragraphs provide an introduction ++ * on BFQ inner workings. Details on BFQ benefits and usage can be ++ * found in Documentation/block/bfq-iosched.txt. 
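++ *
++ * (As a first intuition of proportional share: with two I/O-bound
++ * processes whose weights are, say, 2 and 1, BFQ assigns budgets so
++ * that, in the long term, the first process is granted twice as many
++ * sectors of service as the second.)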
++ * ++ * BFQ is a proportional-share storage-I/O scheduling algorithm based ++ * on the slice-by-slice service scheme of CFQ. But BFQ assigns ++ * budgets, measured in number of sectors, to processes instead of ++ * time slices. The device is not granted to the in-service process ++ * for a given time slice, but until it has exhausted its assigned ++ * budget. This change from the time to the service domain enables BFQ ++ * to distribute the device throughput among processes as desired, ++ * without any distortion due to throughput fluctuations, or to device ++ * internal queueing. BFQ uses an ad hoc internal scheduler, called ++ * B-WF2Q+, to schedule processes according to their budgets. More ++ * precisely, BFQ schedules queues associated with processes. Thanks to ++ * the accurate policy of B-WF2Q+, BFQ can afford to assign high ++ * budgets to I/O-bound processes issuing sequential requests (to ++ * boost the throughput), and yet guarantee a low latency to ++ * interactive and soft real-time applications. ++ * ++ * In particular, BFQ schedules I/O so as to achieve the latter goal-- ++ * low latency for interactive and soft real-time applications--if the ++ * low_latency parameter is set (default configuration). To this ++ * purpose, BFQ constantly tries to detect whether the I/O requests in ++ * a bfq_queue come from an interactive or a soft real-time ++ * application. For brevity, in these cases, the queue is said to be ++ * interactive or soft real-time. In both cases, BFQ privileges the ++ * service of the queue, over that of non-interactive and ++ * non-soft-real-time queues. This privileging is performed, mainly, ++ * by raising the weight of the queue. So, for brevity, we call just ++ * weight-raising periods the time periods during which a queue is ++ * privileged, because deemed interactive or soft real-time. ++ * ++ * The detection of soft real-time queues/applications is described in ++ * detail in the comments on the function ++ * bfq_bfqq_softrt_next_start. On the other hand, the detection of an ++ * interactive queue works as follows: a queue is deemed interactive ++ * if it is constantly non empty only for a limited time interval, ++ * after which it does become empty. The queue may be deemed ++ * interactive again (for a limited time), if it restarts being ++ * constantly non empty, provided that this happens only after the ++ * queue has remained empty for a given minimum idle time. ++ * ++ * By default, BFQ computes automatically the above maximum time ++ * interval, i.e., the time interval after which a constantly ++ * non-empty queue stops being deemed interactive. Since a queue is ++ * weight-raised while it is deemed interactive, this maximum time ++ * interval happens to coincide with the (maximum) duration of the ++ * weight-raising for interactive queues. ++ * ++ * NOTE: if the main or only goal, with a given device, is to achieve ++ * the maximum-possible throughput at all times, then do switch off ++ * all low-latency heuristics for that device, by setting low_latency ++ * to 0. ++ * ++ * BFQ is described in [1], where also a reference to the initial, ++ * more theoretical paper on BFQ can be found. The interested reader ++ * can find in the latter paper full details on the main algorithm, as ++ * well as formulas of the guarantees and formal proofs of all the ++ * properties. 
With respect to the version of BFQ presented in these
++ * papers, this implementation adds a few more heuristics, such as the
++ * one that guarantees a low latency to soft real-time applications,
++ * and a hierarchical extension based on H-WF2Q+.
++ *
++ * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
++ * complexity derives from the one introduced with EEVDF in [3].
++ *
++ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
++ *     Scheduler", Proceedings of the First Workshop on Mobile System
++ *     Technologies (MST-2015), May 2015.
++ *     http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
++ *
++ *     http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
++ *
++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
++ *     Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
++ *     Oct 1997.
++ *
++ *     http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
++ *
++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
++ *     First: A Flexible and Accurate Mechanism for Proportional Share
++ *     Resource Allocation,'' technical report.
++ *
++ *     http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
++ */
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/blkdev.h>
++#include <linux/cgroup.h>
++#include <linux/elevator.h>
++#include <linux/jiffies.h>
++#include <linux/rbtree.h>
++#include <linux/ioprio.h>
++#include "blk.h"
++#include "bfq.h"
++#include "blk-wbt.h"
++
++/* Expiration time of sync (0) and async (1) requests, in ns. */
++static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
++
++/* Maximum backwards seek, in KiB. */
++static const int bfq_back_max = (16 * 1024);
++
++/* Penalty of a backwards seek, in number of sectors. */
++static const int bfq_back_penalty = 2;
++
++/* Idling period duration, in ns. */
++static u32 bfq_slice_idle = (NSEC_PER_SEC / 125);
++
++/* Minimum number of assigned budgets for which stats are safe to compute. */
++static const int bfq_stats_min_budgets = 194;
++
++/* Default maximum budget values, in sectors and number of requests. */
++static const int bfq_default_max_budget = (16 * 1024);
++
++/*
++ * When a sync request is dispatched, the queue that contains that
++ * request, and all the ancestor entities of that queue, are charged
++ * with the number of sectors of the request. In contrast, if the
++ * request is async, then the queue and its ancestor entities are
++ * charged with the number of sectors of the request, multiplied by
++ * the factor below. This throttles the bandwidth for async I/O,
++ * w.r.t. sync I/O, and it is done to counter the tendency of async
++ * writes to steal I/O throughput from reads.
++ *
++ * The current value of this parameter is the result of a tuning with
++ * several hardware and software configurations. We tried to find the
++ * lowest value for which writes do not cause noticeable problems to
++ * reads. In fact, the lower this parameter, the more stable the I/O
++ * control, in the following respect. The lower this parameter is,
++ * the less the bandwidth enjoyed by a group decreases
++ * - when the group does writes, w.r.t. when it does reads;
++ * - when other groups do reads, w.r.t. when they do writes.
++ */
++static const int bfq_async_charge_factor = 3;
++
++/* Default timeout values, in jiffies, approximating CFQ defaults. */
++static const int bfq_timeout = (HZ / 8);
++
++/*
++ * Time limit for merging (see comments in bfq_setup_cooperator). Set
++ * to the slowest value that, in our tests, proved to be effective in
++ * removing false positives, while not causing true positives to miss
++ * queue merging.
++ *
++ * As can be deduced from the low time limit below, queue merging, if
++ * successful, happens at the very beginning of the I/O of the involved
++ * cooperating processes, as a consequence of the arrival of the very
++ * first requests from each cooperator. After that, there is very
++ * little chance to find cooperators.
++ */
++static const unsigned long bfq_merge_time_limit = HZ/10;
++
++#define MAX_LENGTH_REASON_NAME 25
++
++static const char reason_name[][MAX_LENGTH_REASON_NAME] = {"TOO_IDLE",
++"BUDGET_TIMEOUT", "BUDGET_EXHAUSTED", "NO_MORE_REQUESTS",
++"PREEMPTED"};
++
++static struct kmem_cache *bfq_pool;
++
++/* Below this threshold (in ns), we consider thinktime immediate. */
++#define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)
++
++/* hw_tag detection: parallel requests threshold and min samples needed. */
++#define BFQ_HW_QUEUE_THRESHOLD	3
++#define BFQ_HW_QUEUE_SAMPLES	32
++
++#define BFQQ_SEEK_THR		(sector_t)(8 * 100)
++#define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
++#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \
++	(get_sdist(last_pos, rq) >		\
++	 BFQQ_SEEK_THR &&			\
++	 (!blk_queue_nonrot(bfqd->queue) ||	\
++	  blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT))
++#define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
++#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 19)
++
++/* Min number of samples required to perform peak-rate update */
++#define BFQ_RATE_MIN_SAMPLES	32
++/* Min observation time interval required to perform a peak-rate update (ns) */
++#define BFQ_RATE_MIN_INTERVAL	(300*NSEC_PER_MSEC)
++/* Target observation time interval for a peak-rate update (ns) */
++#define BFQ_RATE_REF_INTERVAL	NSEC_PER_SEC
++
++/*
++ * Shift used for peak-rate fixed precision calculations.
++ * With
++ * - the current shift: 16 positions
++ * - the current type used to store rate: u32
++ * - the current unit of measure for rate: [sectors/usec], or, more precisely,
++ *   [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift,
++ * the range of rates that can be stored is
++ * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec =
++ * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec =
++ * [15, 65G] sectors/sec
++ * Which, assuming a sector size of 512B, corresponds to a range of
++ * [7.5K, 33T] B/sec
++ */
++#define BFQ_RATE_SHIFT		16
++
++/*
++ * When configured for computing the duration of the weight-raising
++ * for interactive queues automatically (see the comments at the
++ * beginning of this file), BFQ does it using the following formula:
++ * duration = (ref_rate / r) * ref_wr_duration,
++ * where r is the peak rate of the device, and ref_rate and
++ * ref_wr_duration are two reference parameters. In particular,
++ * ref_rate is the peak rate of the reference storage device (see
++ * below), and ref_wr_duration is about the maximum time needed, with
++ * BFQ and while reading two files in parallel, to load typical large
++ * applications on the reference device (see the comments on
++ * max_service_from_wr below, for more details on how ref_wr_duration
++ * is obtained). In practice, the slower/faster the device at hand
++ * is, the more/less it takes to load applications with respect to the
++ * reference device. Accordingly, the longer/shorter BFQ grants
++ * weight raising to interactive applications.
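++ *
++ * (Illustrative numbers only: with the rotational ref_rate of 14000
++ * defined below and a ref_wr_duration of, say, 8 seconds, a device
++ * whose estimated peak rate r is half the reference rate would get
++ * duration = (14000 / 7000) * 8 s = 16 s of weight-raising, while a
++ * device twice as fast as the reference would get 4 s; the result is
++ * further clamped in bfq_wr_duration().)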
++ *
++ * BFQ uses two different reference pairs (ref_rate, ref_wr_duration),
++ * depending on whether the device is rotational or non-rotational.
++ *
++ * In the following definitions, ref_rate[0] and ref_wr_duration[0]
++ * are the reference values for a rotational device, whereas
++ * ref_rate[1] and ref_wr_duration[1] are the reference values for a
++ * non-rotational device. The reference rates are not the actual peak
++ * rates of the devices used as a reference, but slightly lower
++ * values. The reason for using slightly lower values is that the
++ * peak-rate estimator tends to yield slightly lower values than the
++ * actual peak rate (it can yield the actual peak rate only if there
++ * is only one process doing I/O, and the process does sequential
++ * I/O).
++ *
++ * The reference peak rates are measured in sectors/usec, left-shifted
++ * by BFQ_RATE_SHIFT.
++ */
++static int ref_rate[2] = {14000, 33000};
++/*
++ * To improve readability, a conversion function is used to initialize
++ * the following array, which entails that the array can be
++ * initialized only in a function.
++ */
++static int ref_wr_duration[2];
++
++/*
++ * BFQ uses the above-detailed, time-based weight-raising mechanism to
++ * privilege interactive tasks. This mechanism is vulnerable to the
++ * following false positives: I/O-bound applications that will go on
++ * doing I/O for much longer than the duration of weight
++ * raising. These applications derive basically no benefit from being
++ * weight-raised at the beginning of their I/O. On the opposite end,
++ * while being weight-raised, these applications
++ * a) unjustly steal throughput from applications that may actually need
++ * low latency;
++ * b) make BFQ uselessly perform device idling; device idling results
++ * in loss of device throughput with most flash-based storage, and may
++ * increase latencies when used purposelessly.
++ *
++ * BFQ tries to reduce these problems, by adopting the following
++ * countermeasure. To introduce this countermeasure, we need first to
++ * finish explaining how the duration of weight-raising for
++ * interactive tasks is computed.
++ *
++ * For a bfq_queue deemed as interactive, the duration of weight
++ * raising is dynamically adjusted, as a function of the estimated
++ * peak rate of the device, so as to be equal to the time needed to
++ * execute the 'largest' interactive task we benchmarked so far. By
++ * largest task, we mean the task for which each involved process has
++ * to do more I/O than for any of the other tasks we benchmarked. This
++ * reference interactive task is the start-up of LibreOffice Writer,
++ * and in this task each process/bfq_queue needs to have at most ~110K
++ * sectors transferred.
++ *
++ * This last piece of information enables BFQ to reduce the actual
++ * duration of weight-raising for at least one class of I/O-bound
++ * applications: those doing sequential or quasi-sequential I/O. An
++ * example is file copy. In fact, once started, the main I/O-bound
++ * processes of these applications usually consume the above 110K
++ * sectors in much less time than the processes of an application that
++ * is starting, because these I/O-bound processes will greedily devote
++ * almost all their CPU cycles only to their target,
++ * throughput-friendly I/O operations. This is even more true if BFQ
++ * happens to be underestimating the device peak rate, and thus
++ * overestimating the duration of weight raising. But, according to
++ * our measurements, once they have transferred 110K sectors, these
++ * processes have no right to be weight-raised any longer.
++ *
++ * Based on the last consideration, BFQ ends weight-raising for a
++ * bfq_queue if the latter happens to have received an amount of
++ * service at least equal to the following constant. The constant is
++ * set to slightly more than 110K, to have a minimum safety margin.
++ *
++ * This early ending of weight-raising reduces the amount of time
++ * during which interactive false positives cause the two problems
++ * described at the beginning of these comments.
++ */
++static const unsigned long max_service_from_wr = 120000;
++
++#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)	\
++				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
++
++#define RQ_BIC(rq)		icq_to_bic((rq)->elv.priv[0])
++#define RQ_BFQQ(rq)		((rq)->elv.priv[1])
++
++static void bfq_schedule_dispatch(struct bfq_data *bfqd);
++
++#include "bfq-ioc.c"
++#include "bfq-sched.c"
++#include "bfq-cgroup-included.c"
++
++#define bfq_class_idle(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
++#define bfq_class_rt(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
++
++#define bfq_sample_valid(samples)	((samples) > 80)
++
++/*
++ * Scheduler run of queue, if there are requests pending and no one in the
++ * driver that will restart queueing.
++ */
++static void bfq_schedule_dispatch(struct bfq_data *bfqd)
++{
++	if (bfqd->queued != 0) {
++		bfq_log(bfqd, "");
++		kblockd_schedule_work(&bfqd->unplug_work);
++	}
++}
++
++/*
++ * Lifted from AS - choose which of rq1 and rq2 is best served now.
++ * We choose the request that is closest to the head right now. Distance
++ * behind the head is penalized and only allowed to a certain extent.
++ */
++static struct request *bfq_choose_req(struct bfq_data *bfqd,
++				      struct request *rq1,
++				      struct request *rq2,
++				      sector_t last)
++{
++	sector_t s1, s2, d1 = 0, d2 = 0;
++	unsigned long back_max;
++#define BFQ_RQ1_WRAP	0x01 /* request 1 wraps */
++#define BFQ_RQ2_WRAP	0x02 /* request 2 wraps */
++	unsigned int wrap = 0; /* bit mask: requests behind the disk head? */
++
++	if (!rq1 || rq1 == rq2)
++		return rq2;
++	if (!rq2)
++		return rq1;
++
++	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
++		return rq1;
++	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
++		return rq2;
++	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
++		return rq1;
++	else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
++		return rq2;
++
++	s1 = blk_rq_pos(rq1);
++	s2 = blk_rq_pos(rq2);
++
++	/*
++	 * By definition, 1KiB is 2 sectors.
++	 */
++	back_max = bfqd->bfq_back_max * 2;
++
++	/*
++	 * Strict one way elevator _except_ in the case where we allow
++	 * short backward seeks which are biased as twice the cost of a
++	 * similar forward seek.
++	 */
++	if (s1 >= last)
++		d1 = s1 - last;
++	else if (s1 + back_max >= last)
++		d1 = (last - s1) * bfqd->bfq_back_penalty;
++	else
++		wrap |= BFQ_RQ1_WRAP;
++
++	if (s2 >= last)
++		d2 = s2 - last;
++	else if (s2 + back_max >= last)
++		d2 = (last - s2) * bfqd->bfq_back_penalty;
++	else
++		wrap |= BFQ_RQ2_WRAP;
++
++	/* Found required data */
++
++	/*
++	 * By doing switch() on the bit mask "wrap" we avoid having to
++	 * check two variables for all permutations: --> faster!
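++	 *
++	 * Concretely: wrap == 0 means that neither request is behind
++	 * the head, so the smaller (penalized) distance wins; with only
++	 * BFQ_RQ1_WRAP set, rq1 is behind the head and rq2 wins (and
++	 * symmetrically for BFQ_RQ2_WRAP); with both bits set, the
++	 * request further behind the head is chosen.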
++ */ ++ switch (wrap) { ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ ++ if (d1 < d2) ++ return rq1; ++ else if (d2 < d1) ++ return rq2; ++ ++ if (s1 >= s2) ++ return rq1; ++ else ++ return rq2; ++ ++ case BFQ_RQ2_WRAP: ++ return rq1; ++ case BFQ_RQ1_WRAP: ++ return rq2; ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ ++ default: ++ /* ++ * Since both rqs are wrapped, ++ * start with the one that's further behind head ++ * (--> only *one* back seek required), ++ * since back seek takes more time than forward. ++ */ ++ if (s1 <= s2) ++ return rq1; ++ else ++ return rq2; ++ } ++} ++ ++static struct bfq_queue * ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, ++ sector_t sector, struct rb_node **ret_parent, ++ struct rb_node ***rb_link) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *bfqq = NULL; ++ ++ parent = NULL; ++ p = &root->rb_node; ++ while (*p) { ++ struct rb_node **n; ++ ++ parent = *p; ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ ++ /* ++ * Sort strictly based on sector. Smallest to the left, ++ * largest to the right. ++ */ ++ if (sector > blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_right; ++ else if (sector < blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_left; ++ else ++ break; ++ p = n; ++ bfqq = NULL; ++ } ++ ++ *ret_parent = parent; ++ if (rb_link) ++ *rb_link = p; ++ ++ bfq_log(bfqd, "%llu: returning %d", ++ (unsigned long long) sector, ++ bfqq ? bfqq->pid : 0); ++ ++ return bfqq; ++} ++ ++static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) ++{ ++ return bfqq->service_from_backlogged > 0 && ++ time_is_before_jiffies(bfqq->first_IO_time + ++ bfq_merge_time_limit); ++} ++ ++static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *__bfqq; ++ ++ if (bfqq->pos_root) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ ++ /* ++ * bfqq cannot be merged any longer (see comments in ++ * bfq_setup_cooperator): no point in adding bfqq into the ++ * position tree. ++ */ ++ if (bfq_too_late_for_merging(bfqq)) ++ return; ++ ++ if (bfq_class_idle(bfqq)) ++ return; ++ if (!bfqq->next_rq) ++ return; ++ ++ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, ++ blk_rq_pos(bfqq->next_rq), &parent, &p); ++ if (!__bfqq) { ++ rb_link_node(&bfqq->pos_node, parent, p); ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); ++ } else ++ bfqq->pos_root = NULL; ++} ++ ++/* ++ * The following function returns true if every queue must receive the ++ * same share of the throughput (this condition is used when deciding ++ * whether idling may be disabled, see the comments in the function ++ * bfq_better_to_idle()). ++ * ++ * Such a scenario occurs when: ++ * 1) all active queues have the same weight, ++ * 2) all active queues belong to the same I/O-priority class, ++ * 3) all active groups at the same level in the groups tree have the same ++ * weight, ++ * 4) all active groups at the same level in the groups tree have the same ++ * number of children. ++ * ++ * Unfortunately, keeping the necessary state for evaluating exactly ++ * the last two symmetry sub-conditions above would be quite complex ++ * and time consuming. 
Therefore this function evaluates, instead, ++ * only the following stronger three sub-conditions, for which it is ++ * much easier to maintain the needed state: ++ * 1) all active queues have the same weight, ++ * 2) all active queues belong to the same I/O-priority class, ++ * 3) there are no active groups. ++ * In particular, the last condition is always true if hierarchical ++ * support or the cgroups interface are not enabled, thus no state ++ * needs to be maintained in this case. ++ */ ++static bool bfq_symmetric_scenario(struct bfq_data *bfqd) ++{ ++ /* ++ * For queue weights to differ, queue_weights_tree must contain ++ * at least two nodes. ++ */ ++ bool varied_queue_weights = !RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && ++ (bfqd->queue_weights_tree.rb_node->rb_left || ++ bfqd->queue_weights_tree.rb_node->rb_right); ++ ++ bool multiple_classes_busy = ++ (bfqd->busy_queues[0] && bfqd->busy_queues[1]) || ++ (bfqd->busy_queues[0] && bfqd->busy_queues[2]) || ++ (bfqd->busy_queues[1] && bfqd->busy_queues[2]); ++ ++ bfq_log(bfqd, "varied_queue_weights %d mul_classes %d", ++ varied_queue_weights, multiple_classes_busy); ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ bfq_log(bfqd, "num_groups_with_pending_reqs %u", ++ bfqd->num_groups_with_pending_reqs); ++#endif ++ ++ return !(varied_queue_weights || multiple_classes_busy ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ || bfqd->num_groups_with_pending_reqs > 0 ++#endif ++ ); ++} ++ ++/* ++ * If the weight-counter tree passed as input contains no counter for ++ * the weight of the input queue, then add that counter; otherwise just ++ * increment the existing counter. ++ * ++ * Note that weight-counter trees contain few nodes in mostly symmetric ++ * scenarios. For example, if all queues have the same weight, then the ++ * weight-counter tree for the queues may contain at most one node. ++ * This holds even if low_latency is on, because weight-raised queues ++ * are not inserted in the tree. ++ * In most scenarios, the rate at which nodes are created/destroyed ++ * should be low too. ++ */ ++static void bfq_weights_tree_add(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct rb_root *root) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct rb_node **new = &(root->rb_node), *parent = NULL; ++ ++ /* ++ * Do not insert if the queue is already associated with a ++ * counter, which happens if: ++ * 1) a request arrival has caused the queue to become both ++ * non-weight-raised, and hence change its weight, and ++ * backlogged; in this respect, each of the two events ++ * causes an invocation of this function, ++ * 2) this is the invocation of this function caused by the ++ * second event. This second invocation is actually useless, ++ * and we handle this fact by exiting immediately. More ++ * efficient or clearer solutions might possibly be adopted. ++ */ ++ if (bfqq->weight_counter) ++ return; ++ ++ while (*new) { ++ struct bfq_weight_counter *__counter = container_of(*new, ++ struct bfq_weight_counter, ++ weights_node); ++ parent = *new; ++ ++ if (entity->weight == __counter->weight) { ++ bfqq->weight_counter = __counter; ++ goto inc_counter; ++ } ++ if (entity->weight < __counter->weight) ++ new = &((*new)->rb_left); ++ else ++ new = &((*new)->rb_right); ++ } ++ ++ bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), ++ GFP_ATOMIC); ++ ++ /* ++ * In the unlucky event of an allocation failure, we just ++ * exit. 
This will cause the weight of queue to not be ++ * considered in bfq_symmetric_scenario, which, in its turn, ++ * causes the scenario to be deemed wrongly symmetric in case ++ * bfqq's weight would have been the only weight making the ++ * scenario asymmetric. On the bright side, no unbalance will ++ * however occur when bfqq becomes inactive again (the ++ * invocation of this function is triggered by an activation ++ * of queue). In fact, bfq_weights_tree_remove does nothing ++ * if !bfqq->weight_counter. ++ */ ++ if (unlikely(!bfqq->weight_counter)) ++ return; ++ ++ bfqq->weight_counter->weight = entity->weight; ++ rb_link_node(&bfqq->weight_counter->weights_node, parent, new); ++ rb_insert_color(&bfqq->weight_counter->weights_node, root); ++ ++inc_counter: ++ bfqq->weight_counter->num_active++; ++ bfqq->ref++; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "refs %d weight %d symmetric %d", ++ bfqq->ref, ++ entity->weight, ++ bfq_symmetric_scenario(bfqd)); ++} ++ ++/* ++ * Decrement the weight counter associated with the queue, and, if the ++ * counter reaches 0, remove the counter from the tree. ++ * See the comments to the function bfq_weights_tree_add() for considerations ++ * about overhead. ++ */ ++static void __bfq_weights_tree_remove(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct rb_root *root) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ if (!bfqq->weight_counter) ++ return; ++ ++ BUG_ON(RB_EMPTY_ROOT(root)); ++ BUG_ON(bfqq->weight_counter->weight != entity->weight); ++ ++ BUG_ON(!bfqq->weight_counter->num_active); ++ bfqq->weight_counter->num_active--; ++ ++ if (bfqq->weight_counter->num_active > 0) ++ goto reset_entity_pointer; ++ ++ rb_erase(&bfqq->weight_counter->weights_node, root); ++ kfree(bfqq->weight_counter); ++ ++reset_entity_pointer: ++ bfqq->weight_counter = NULL; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "refs %d weight %d symmetric %d", ++ bfqq->ref, ++ entity->weight, ++ bfq_symmetric_scenario(bfqd)); ++ bfq_put_queue(bfqq); ++} ++ ++/* ++ * Invoke __bfq_weights_tree_remove on bfqq and decrement the number ++ * of active groups for each queue's inactive parent entity. ++ */ ++static void bfq_weights_tree_remove(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = bfqq->entity.parent; ++ ++ for_each_entity(entity) { ++ struct bfq_sched_data *sd = entity->my_sched_data; ++ ++ BUG_ON(entity->sched_data == NULL); /* ++ * It would mean ++ * that this is ++ * the root group. ++ */ ++ ++ if (sd->next_in_service || sd->in_service_entity) { ++ BUG_ON(!entity->in_groups_with_pending_reqs); ++ /* ++ * entity is still active, because either ++ * next_in_service or in_service_entity is not ++ * NULL (see the comments on the definition of ++ * next_in_service for details on why ++ * in_service_entity must be checked too). ++ * ++ * As a consequence, its parent entities are ++ * active as well, and thus this loop must ++ * stop here. ++ */ ++ break; ++ } ++ ++ BUG_ON(!bfqd->num_groups_with_pending_reqs && ++ entity->in_groups_with_pending_reqs); ++ /* ++ * The decrement of num_groups_with_pending_reqs is ++ * not performed immediately upon the deactivation of ++ * entity, but it is delayed to when it also happens ++ * that the first leaf descendant bfqq of entity gets ++ * all its pending requests completed. The following ++ * instructions perform this delayed decrement, if ++ * needed. See the comments on ++ * num_groups_with_pending_reqs for details. 
++ */ ++ if (entity->in_groups_with_pending_reqs) { ++ entity->in_groups_with_pending_reqs = false; ++ bfqd->num_groups_with_pending_reqs--; ++ } ++ bfq_log_bfqq(bfqd, bfqq, "num_groups_with_pending_reqs %u", ++ bfqd->num_groups_with_pending_reqs); ++ } ++ ++ /* ++ * Next function is invoked last, because it causes bfqq to be ++ * freed if the following holds: bfqq is not in service and ++ * has no dispatched request. DO NOT use bfqq after the next ++ * function invocation. ++ */ ++ __bfq_weights_tree_remove(bfqd, bfqq, ++ &bfqd->queue_weights_tree); ++} ++ ++/* ++ * Return expired entry, or NULL to just start from scratch in rbtree. ++ */ ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq, ++ struct request *last) ++{ ++ struct request *rq; ++ ++ if (bfq_bfqq_fifo_expire(bfqq)) ++ return NULL; ++ ++ bfq_mark_bfqq_fifo_expire(bfqq); ++ ++ rq = rq_entry_fifo(bfqq->fifo.next); ++ ++ if (rq == last || ktime_get_ns() < rq->fifo_time) ++ return NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "returned %p", rq); ++ BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); ++ return rq; ++} ++ ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct request *last) ++{ ++ struct rb_node *rbnext = rb_next(&last->rb_node); ++ struct rb_node *rbprev = rb_prev(&last->rb_node); ++ struct request *next, *prev = NULL; ++ ++ BUG_ON(list_empty(&bfqq->fifo)); ++ ++ /* Follow expired path, else get first next available. */ ++ next = bfq_check_fifo(bfqq, last); ++ if (next) { ++ BUG_ON(next == last); ++ return next; ++ } ++ ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); ++ ++ if (rbprev) ++ prev = rb_entry_rq(rbprev); ++ ++ if (rbnext) ++ next = rb_entry_rq(rbnext); ++ else { ++ rbnext = rb_first(&bfqq->sort_list); ++ if (rbnext && rbnext != &last->rb_node) ++ next = rb_entry_rq(rbnext); ++ } ++ ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); ++} ++ ++/* see the definition of bfq_async_charge_factor for details */ ++static unsigned long bfq_serv_to_charge(struct request *rq, ++ struct bfq_queue *bfqq) ++{ ++ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 || ++ !bfq_symmetric_scenario(bfqq->bfqd)) ++ return blk_rq_sectors(rq); ++ ++ return blk_rq_sectors(rq) * bfq_async_charge_factor; ++} ++ ++/** ++ * bfq_updated_next_req - update the queue after a new next_rq selection. ++ * @bfqd: the device data the queue belongs to. ++ * @bfqq: the queue to update. ++ * ++ * If the first request of a queue changes we make sure that the queue ++ * has enough budget to serve at least its first request (if the ++ * request has grown). We do this because if the queue has not enough ++ * budget for its first request, it has to go through two dispatch ++ * rounds to actually get it dispatched. ++ */ ++static void bfq_updated_next_req(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ struct request *next_rq = bfqq->next_rq; ++ unsigned long new_budget; ++ ++ if (!next_rq) ++ return; ++ ++ if (bfqq == bfqd->in_service_queue) ++ /* ++ * In order not to break guarantees, budgets cannot be ++ * changed after an entity has been selected. 
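++		 * (Changing the budget now would retroactively change
++		 * the finish time already computed from that budget,
++		 * and hence the position of the entity in the active
++		 * tree.)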
++		 */
++		return;
++
++	BUG_ON(entity->tree != &st->active);
++	BUG_ON(entity == entity->sched_data->in_service_entity);
++
++	new_budget = max_t(unsigned long,
++			   max_t(unsigned long, bfqq->max_budget,
++				 bfq_serv_to_charge(next_rq, bfqq)),
++			   entity->service);
++	if (entity->budget != new_budget) {
++		entity->budget = new_budget;
++		bfq_log_bfqq(bfqd, bfqq, "new budget %lu",
++			     new_budget);
++		bfq_requeue_bfqq(bfqd, bfqq, false);
++	}
++}
++
++static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
++{
++	u64 dur;
++
++	if (bfqd->bfq_wr_max_time > 0)
++		return bfqd->bfq_wr_max_time;
++
++	dur = bfqd->rate_dur_prod;
++	do_div(dur, bfqd->peak_rate);
++
++	/*
++	 * Limit duration between 3 and 25 seconds. The upper limit
++	 * has been conservatively set after the following worst case:
++	 * on a QEMU/KVM virtual machine
++	 * - running in a slow PC
++	 * - with a virtual disk stacked on a slow low-end 5400rpm HDD
++	 * - serving a heavy I/O workload, such as the sequential reading
++	 *   of several files
++	 * mplayer took 23 seconds to start, if constantly weight-raised.
++	 *
++	 * As for values higher than that needed to accommodate the
++	 * above bad scenario, tests show that higher values would often
++	 * yield the opposite of the desired result, i.e., would worsen
++	 * responsiveness by allowing non-interactive applications to
++	 * preserve weight raising for too long.
++	 *
++	 * On the other end, lower values than 3 seconds make it
++	 * difficult for most interactive tasks to complete their jobs
++	 * before weight-raising finishes.
++	 */
++	return clamp_val(dur, msecs_to_jiffies(3000), msecs_to_jiffies(25000));
++}
++
++/* switch back from soft real-time to interactive weight raising */
++static void switch_back_to_interactive_wr(struct bfq_queue *bfqq,
++					  struct bfq_data *bfqd)
++{
++	bfqq->wr_coeff = bfqd->bfq_wr_coeff;
++	bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
++	bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt;
++}
++
++static void
++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
++		      struct bfq_io_cq *bic, bool bfq_already_existing)
++{
++	unsigned int old_wr_coeff;
++	bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq);
++
++	if (bic->saved_has_short_ttime)
++		bfq_mark_bfqq_has_short_ttime(bfqq);
++	else
++		bfq_clear_bfqq_has_short_ttime(bfqq);
++
++	if (bic->saved_IO_bound)
++		bfq_mark_bfqq_IO_bound(bfqq);
++	else
++		bfq_clear_bfqq_IO_bound(bfqq);
++
++	if (unlikely(busy))
++		old_wr_coeff = bfqq->wr_coeff;
++
++	bfqq->wr_coeff = bic->saved_wr_coeff;
++	bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
++	BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt));
++	bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
++	bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
++	BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
++
++	bfq_log_bfqq(bfqq->bfqd, bfqq,
++		     "bic %p wr_coeff %d start_finish %lu max_time %lu",
++		     bic, bfqq->wr_coeff, bfqq->last_wr_start_finish,
++		     bfqq->wr_cur_max_time);
++
++	if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
++	    time_is_before_jiffies(bfqq->last_wr_start_finish +
++				   bfqq->wr_cur_max_time))) {
++		if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
++		    !bfq_bfqq_in_large_burst(bfqq) &&
++		    time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt +
++					     bfq_wr_duration(bfqd))) {
++			switch_back_to_interactive_wr(bfqq, bfqd);
++			bfq_log_bfqq(bfqq->bfqd, bfqq,
++				     "switching back to interactive");
++		} else {
++			bfqq->wr_coeff = 1;
++			bfq_log_bfqq(bfqq->bfqd, bfqq,
++ "switching off wr (%lu + %lu < %lu)", ++ bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, ++ jiffies); ++ } ++ } ++ ++ /* make sure weight will be updated, however we got here */ ++ bfqq->entity.prio_changed = 1; ++ ++ if (likely(!busy)) ++ return; ++ ++ if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) { ++ bfqd->wr_busy_queues++; ++ BUG_ON(bfqd->wr_busy_queues > bfq_tot_busy_queues(bfqd)); ++ } else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) { ++ bfqd->wr_busy_queues--; ++ BUG_ON(bfqd->wr_busy_queues < 0); ++ } ++} ++ ++static int bfqq_process_refs(struct bfq_queue *bfqq) ++{ ++ int process_refs, io_refs; ++ ++ lockdep_assert_held(bfqq->bfqd->queue->queue_lock); ++ ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; ++ process_refs = bfqq->ref - io_refs - bfqq->entity.on_st - ++ (bfqq->weight_counter != NULL); ++ BUG_ON(process_refs < 0); ++ return process_refs; ++} ++ ++/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ ++static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_queue *item; ++ struct hlist_node *n; ++ ++ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) ++ hlist_del_init(&item->burst_list_node); ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); ++ bfqd->burst_size = 1; ++ bfqd->burst_parent_entity = bfqq->entity.parent; ++} ++ ++/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ ++static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ /* Increment burst size to take into account also bfqq */ ++ bfqd->burst_size++; ++ ++ bfq_log_bfqq(bfqd, bfqq, "%d", bfqd->burst_size); ++ ++ BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); ++ ++ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { ++ struct bfq_queue *pos, *bfqq_item; ++ struct hlist_node *n; ++ ++ /* ++ * Enough queues have been activated shortly after each ++ * other to consider this burst as large. ++ */ ++ bfqd->large_burst = true; ++ bfq_log_bfqq(bfqd, bfqq, "large burst started"); ++ ++ /* ++ * We can now mark all queues in the burst list as ++ * belonging to a large burst. ++ */ ++ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, ++ burst_list_node) { ++ bfq_mark_bfqq_in_large_burst(bfqq_item); ++ bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); ++ } ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); ++ ++ /* ++ * From now on, and until the current burst finishes, any ++ * new queue being activated shortly after the last queue ++ * was inserted in the burst can be immediately marked as ++ * belonging to a large burst. So the burst list is not ++ * needed any more. Remove it. ++ */ ++ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, ++ burst_list_node) ++ hlist_del_init(&pos->burst_list_node); ++ } else /* ++ * Burst not yet large: add bfqq to the burst list. Do ++ * not increment the ref counter for bfqq, because bfqq ++ * is removed from the burst list before freeing bfqq ++ * in put_queue. ++ */ ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); ++} ++ ++/* ++ * If many queues belonging to the same group happen to be created ++ * shortly after each other, then the processes associated with these ++ * queues have typically a common goal. In particular, bursts of queue ++ * creations are usually caused by services or applications that spawn ++ * many parallel threads/processes. Examples are systemd during boot, ++ * or git grep. 
To help these processes get their job done as soon as ++ * possible, it is usually better to not grant either weight-raising ++ * or device idling to their queues. ++ * ++ * In this comment we describe, firstly, the reasons why this fact ++ * holds, and, secondly, the next function, which implements the main ++ * steps needed to properly mark these queues so that they can then be ++ * treated in a different way. ++ * ++ * The above services or applications benefit mostly from a high ++ * throughput: the quicker the requests of the activated queues are ++ * cumulatively served, the sooner the target job of these queues gets ++ * completed. As a consequence, weight-raising any of these queues, ++ * which also implies idling the device for it, is almost always ++ * counterproductive. In most cases it just lowers throughput. ++ * ++ * On the other hand, a burst of queue creations may be caused also by ++ * the start of an application that does not consist of a lot of ++ * parallel I/O-bound threads. In fact, with a complex application, ++ * several short processes may need to be executed to start-up the ++ * application. In this respect, to start an application as quickly as ++ * possible, the best thing to do is in any case to privilege the I/O ++ * related to the application with respect to all other ++ * I/O. Therefore, the best strategy to start as quickly as possible ++ * an application that causes a burst of queue creations is to ++ * weight-raise all the queues created during the burst. This is the ++ * exact opposite of the best strategy for the other type of bursts. ++ * ++ * In the end, to take the best action for each of the two cases, the ++ * two types of bursts need to be distinguished. Fortunately, this ++ * seems relatively easy, by looking at the sizes of the bursts. In ++ * particular, we found a threshold such that only bursts with a ++ * larger size than that threshold are apparently caused by ++ * services or commands such as systemd or git grep. For brevity, ++ * hereafter we call just 'large' these bursts. BFQ *does not* ++ * weight-raise queues whose creation occurs in a large burst. In ++ * addition, for each of these queues BFQ performs or does not perform ++ * idling depending on which choice boosts the throughput more. The ++ * exact choice depends on the device and request pattern at ++ * hand. ++ * ++ * Unfortunately, false positives may occur while an interactive task ++ * is starting (e.g., an application is being started). The ++ * consequence is that the queues associated with the task do not ++ * enjoy weight raising as expected. Fortunately these false positives ++ * are very rare. They typically occur if some service happens to ++ * start doing I/O exactly when the interactive task starts. ++ * ++ * Turning back to the next function, it implements all the steps ++ * needed to detect the occurrence of a large burst and to properly ++ * mark all the queues belonging to it (so that they can then be ++ * treated in a different way). This goal is achieved by maintaining a ++ * "burst list" that holds, temporarily, the queues that belong to the ++ * burst in progress. The list is then used to mark these queues as ++ * belonging to a large burst if the burst does become large. The main ++ * steps are the following. ++ * ++ * . when the very first queue is created, the queue is inserted into the ++ * list (as it could be the first queue in a possible burst) ++ * ++ * . 
if the current burst has not yet become large, and a queue Q that does ++ * not yet belong to the burst is activated shortly after the last time ++ * at which a new queue entered the burst list, then the function appends ++ * Q to the burst list ++ * ++ * . if, as a consequence of the previous step, the burst size reaches ++ * the large-burst threshold, then ++ * ++ * . all the queues in the burst list are marked as belonging to a ++ * large burst ++ * ++ * . the burst list is deleted; in fact, the burst list already served ++ * its purpose (keeping temporarily track of the queues in a burst, ++ * so as to be able to mark them as belonging to a large burst in the ++ * previous sub-step), and now is not needed any more ++ * ++ * . the device enters a large-burst mode ++ * ++ * . if a queue Q that does not belong to the burst is created while ++ * the device is in large-burst mode and shortly after the last time ++ * at which a queue either entered the burst list or was marked as ++ * belonging to the current large burst, then Q is immediately marked ++ * as belonging to a large burst. ++ * ++ * . if a queue Q that does not belong to the burst is created a while ++ * later, i.e., not shortly after, than the last time at which a queue ++ * either entered the burst list or was marked as belonging to the ++ * current large burst, then the current burst is deemed as finished and: ++ * ++ * . the large-burst mode is reset if set ++ * ++ * . the burst list is emptied ++ * ++ * . Q is inserted in the burst list, as Q may be the first queue ++ * in a possible new burst (then the burst list contains just Q ++ * after this step). ++ */ ++static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ /* ++ * If bfqq is already in the burst list or is part of a large ++ * burst, or finally has just been split, then there is ++ * nothing else to do. ++ */ ++ if (!hlist_unhashed(&bfqq->burst_list_node) || ++ bfq_bfqq_in_large_burst(bfqq) || ++ time_is_after_eq_jiffies(bfqq->split_time + ++ msecs_to_jiffies(10))) ++ return; ++ ++ /* ++ * If bfqq's creation happens late enough, or bfqq belongs to ++ * a different group than the burst group, then the current ++ * burst is finished, and related data structures must be ++ * reset. ++ * ++ * In this respect, consider the special case where bfqq is ++ * the very first queue created after BFQ is selected for this ++ * device. In this case, last_ins_in_burst and ++ * burst_parent_entity are not yet significant when we get ++ * here. But it is easy to verify that, whether or not the ++ * following condition is true, bfqq will end up being ++ * inserted into the burst list. In particular the list will ++ * happen to contain only bfqq. And this is exactly what has ++ * to happen, as bfqq may be the first queue of the first ++ * burst. ++ */ ++ if (time_is_before_jiffies(bfqd->last_ins_in_burst + ++ bfqd->bfq_burst_interval) || ++ bfqq->entity.parent != bfqd->burst_parent_entity) { ++ bfqd->large_burst = false; ++ bfq_reset_burst_list(bfqd, bfqq); ++ bfq_log_bfqq(bfqd, bfqq, ++ "late activation or different group"); ++ goto end; ++ } ++ ++ /* ++ * If we get here, then bfqq is being activated shortly after the ++ * last queue. So, if the current burst is also large, we can mark ++ * bfqq as belonging to this large burst immediately. 
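++ * ++ * (Aside: as the big comment above bfq_handle_burst explains, being ++ * marked in_large_burst makes bfqq ineligible for weight raising, and ++ * idling is then granted or denied to it on throughput grounds only.)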
++ */ ++ if (bfqd->large_burst) { ++ bfq_log_bfqq(bfqd, bfqq, "marked in burst"); ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ goto end; ++ } ++ ++ /* ++ * If we get here, then a large-burst state has not yet been ++ * reached, but bfqq is being activated shortly after the last ++ * queue. Then we add bfqq to the burst. ++ */ ++ bfq_add_to_burst(bfqd, bfqq); ++end: ++ /* ++ * At this point, bfqq either has been added to the current ++ * burst or has caused the current burst to terminate and a ++ * possible new burst to start. In particular, in the second ++ * case, bfqq has become the first queue in the possible new ++ * burst. In both cases last_ins_in_burst needs to be moved ++ * forward. ++ */ ++ bfqd->last_ins_in_burst = jiffies; ++ ++} ++ ++static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ if (entity->budget < entity->service) { ++ pr_crit("budget %d service %d\n", ++ entity->budget, entity->service); ++ BUG(); ++ } ++ return entity->budget - entity->service; ++} ++ ++/* ++ * If enough samples have been computed, return the current max budget ++ * stored in bfqd, which is dynamically updated according to the ++ * estimated disk peak rate; otherwise return the default max budget ++ */ ++static int bfq_max_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < bfq_stats_min_budgets) ++ return bfq_default_max_budget; ++ else ++ return bfqd->bfq_max_budget; ++} ++ ++/* ++ * Return min budget, which is a fraction of the current or default ++ * max budget (trying with 1/32) ++ */ ++static int bfq_min_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < bfq_stats_min_budgets) ++ return bfq_default_max_budget / 32; ++ else ++ return bfqd->bfq_max_budget / 32; ++} ++ ++static void bfq_bfqq_expire(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool compensate, ++ enum bfqq_expiration reason); ++ ++/* ++ * The next function, invoked after the input queue bfqq switches from ++ * idle to busy, updates the budget of bfqq. The function also tells ++ * whether the in-service queue should be expired, by returning ++ * true. The purpose of expiring the in-service queue is to give bfqq ++ * the chance to possibly preempt the in-service queue, and the reason ++ * for preempting the in-service queue is to achieve one of the two ++ * goals below. ++ * ++ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has ++ * expired because it has remained idle. In particular, bfqq may have ++ * expired for one of the following two reasons: ++ * ++ * - BFQ_BFQQ_NO_MORE_REQUESTS bfqq did not enjoy any device idling and ++ * did not make it to issue a new request before its last request ++ * was served; ++ * ++ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue ++ * a new request before the expiration of the idling-time. ++ * ++ * Even if bfqq has expired for one of the above reasons, the process ++ * associated with the queue may however be issuing requests greedily, ++ * and thus be sensitive to the bandwidth it receives (bfqq may have ++ * remained idle for other reasons: CPU high load, bfqq not enjoying ++ * idling, I/O throttling somewhere in the path from the process to ++ * the I/O scheduler, ...). But if, after every expiration for one of ++ * the above two reasons, bfqq has to wait for the service of at least ++ * one full budget of another queue before being served again, then ++ * bfqq is likely to get a much lower bandwidth or resource time than ++ * its reserved ones.
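++ * ++ * (Illustrative numbers, not taken from the code: on a device doing ++ * 100 MB/s, waiting for one 8 MB budget of another queue costs bfqq ++ * about 80 ms per expiration, so frequent expirations alone can push ++ * the bandwidth bfqq actually receives well below its reserved share.) ++ *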
To address this issue, two countermeasures need ++ * to be taken. ++ * ++ * First, the budget and the timestamps of bfqq need to be updated in ++ * a special way on bfqq reactivation: they need to be updated as if ++ * bfqq did not remain idle and did not expire. In fact, if they are ++ * computed as if bfqq expired and remained idle until reactivation, ++ * then the process associated with bfqq is treated as if, instead of ++ * being greedy, it stopped issuing requests when bfqq remained idle, ++ * and restarts issuing requests only on this reactivation. In other ++ * words, the scheduler does not help the process recover the "service ++ * hole" between bfqq expiration and reactivation. As a consequence, ++ * the process receives a lower bandwidth than its reserved one. In ++ * contrast, to recover this hole, the budget must be updated as if ++ * bfqq was not expired at all before this reactivation, i.e., it must ++ * be set to the value of the remaining budget when bfqq was ++ * expired. Along the same line, timestamps need to be assigned the ++ * value they had the last time bfqq was selected for service, i.e., ++ * before last expiration. Thus timestamps need to be back-shifted ++ * with respect to their normal computation (see [1] for more details ++ * on this tricky aspect). ++ * ++ * Secondly, to allow the process to recover the hole, the in-service ++ * queue must be expired too, to give bfqq the chance to preempt it ++ * immediately. In fact, if bfqq has to wait for a full budget of the ++ * in-service queue to be completed, then it may become impossible to ++ * let the process recover the hole, even if the back-shifted ++ * timestamps of bfqq are lower than those of the in-service queue. If ++ * this happens for most or all of the holes, then the process may not ++ * receive its reserved bandwidth. In this respect, it is worth noting ++ * that, being the service of outstanding requests unpreemptible, a ++ * little fraction of the holes may however be unrecoverable, thereby ++ * causing a little loss of bandwidth. ++ * ++ * The last important point is detecting whether bfqq does need this ++ * bandwidth recovery. In this respect, the next function deems the ++ * process associated with bfqq greedy, and thus allows it to recover ++ * the hole, if: 1) the process is waiting for the arrival of a new ++ * request (which implies that bfqq expired for one of the above two ++ * reasons), and 2) such a request has arrived soon. The first ++ * condition is controlled through the flag non_blocking_wait_rq, ++ * while the second through the flag arrived_in_time. If both ++ * conditions hold, then the function computes the budget in the ++ * above-described special way, and signals that the in-service queue ++ * should be expired. Timestamp back-shifting is done later in ++ * __bfq_activate_entity. ++ * ++ * 2. Reduce latency. Even if timestamps are not backshifted to let ++ * the process associated with bfqq recover a service hole, bfqq may ++ * however happen to have, after being (re)activated, a lower finish ++ * timestamp than the in-service queue. That is, the next budget of ++ * bfqq may have to be completed before the one of the in-service ++ * queue. If this is the case, then preempting the in-service queue ++ * allows this goal to be achieved, apart from the unpreemptible, ++ * outstanding requests mentioned above. 
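++ * ++ * (Sketch of the idea: if F() denotes virtual finish times after ++ * reactivation, preemption pays off when F(bfqq) < F(in-service queue); ++ * as detailed below, only a cheap necessary condition for this is ++ * actually checked, see next_queue_may_preempt().)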
++ * ++ * Unfortunately, regardless of which of the above two goals one wants ++ * to achieve, service trees need first to be updated to know whether ++ * the in-service queue must be preempted. To have service trees ++ * correctly updated, the in-service queue must be expired and ++ * rescheduled, and bfqq must be scheduled too. This is one of the ++ * most costly operations (in future versions, the scheduling ++ * mechanism may be re-designed in such a way to make it possible to ++ * know whether preemption is needed without needing to update service ++ * trees). In addition, queue preemptions almost always cause random ++ * I/O, and thus loss of throughput. Because of these facts, the next ++ * function adopts the following simple scheme to avoid both costly ++ * operations and too frequent preemptions: it requests the expiration ++ * of the in-service queue (unconditionally) only for queues that need ++ * to recover a hole, or that either are weight-raised or deserve to ++ * be weight-raised. ++ */ ++static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool arrived_in_time, ++ bool wr_or_deserves_wr) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ /* ++ * In the next compound condition, we check also whether there ++ * is some budget left, because otherwise there is no point in ++ * trying to go on serving bfqq with this same budget: bfqq ++ * would be expired immediately after being selected for ++ * service. This would only cause useless overhead. ++ */ ++ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time && ++ bfq_bfqq_budget_left(bfqq) > 0) { ++ /* ++ * We do not clear the flag non_blocking_wait_rq here, as ++ * the latter is used in bfq_activate_bfqq to signal ++ * that timestamps need to be back-shifted (and is ++ * cleared right after). ++ */ ++ ++ /* ++ * In next assignment we rely on that either ++ * entity->service or entity->budget are not updated ++ * on expiration if bfqq is empty (see ++ * __bfq_bfqq_recalc_budget). Thus both quantities ++ * remain unchanged after such an expiration, and the ++ * following statement therefore assigns to ++ * entity->budget the remaining budget on such an ++ * expiration. ++ */ ++ BUG_ON(bfqq->max_budget < 0); ++ entity->budget = min_t(unsigned long, ++ bfq_bfqq_budget_left(bfqq), ++ bfqq->max_budget); ++ ++ BUG_ON(entity->budget < 0); ++ ++ /* ++ * At this point, we have used entity->service to get ++ * the budget left (needed for updating ++ * entity->budget). Thus we finally can, and have to, ++ * reset entity->service. The latter must be reset ++ * because bfqq would otherwise be charged again for ++ * the service it has received during its previous ++ * service slot(s). ++ */ ++ entity->service = 0; ++ ++ return true; ++ } ++ ++ /* ++ * We can finally complete expiration, by setting service to 0. ++ */ ++ entity->service = 0; ++ BUG_ON(bfqq->max_budget < 0); ++ entity->budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(bfqq->next_rq, bfqq)); ++ BUG_ON(entity->budget < 0); ++ ++ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); ++ return wr_or_deserves_wr; ++} ++ ++/* ++ * Return the farthest past time instant according to jiffies ++ * macros. 
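++ * ++ * (jiffies - MAX_JIFFY_OFFSET is the most distant past instant that the ++ * time_is_before_jiffies()/time_is_after_jiffies() macros still order ++ * correctly, so it serves as a safe "minus infinity" for timestamps.)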
++ */ ++static unsigned long bfq_smallest_from_now(void) ++{ ++ return jiffies - MAX_JIFFY_OFFSET; ++} ++ ++static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ unsigned int old_wr_coeff, ++ bool wr_or_deserves_wr, ++ bool interactive, ++ bool in_burst, ++ bool soft_rt) ++{ ++ if (old_wr_coeff == 1 && wr_or_deserves_wr) { ++ /* start a weight-raising period */ ++ if (interactive) { ++ bfqq->service_from_wr = 0; ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ } else { ++ /* ++ * No interactive weight raising in progress ++ * here: assign minus infinity to ++ * wr_start_at_switch_to_srt, to make sure ++ * that, at the end of the soft-real-time ++ * weight raising period that is starting ++ * now, no interactive weight-raising period ++ * may be wrongly considered as still in ++ * progress (and thus actually started by ++ * mistake). ++ */ ++ bfqq->wr_start_at_switch_to_srt = ++ bfq_smallest_from_now(); ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff * ++ BFQ_SOFTRT_WEIGHT_FACTOR; ++ bfqq->wr_cur_max_time = ++ bfqd->bfq_wr_rt_max_time; ++ } ++ /* ++ * If needed, further reduce budget to make sure it is ++ * close to bfqq's backlog, so as to reduce the ++ * scheduling-error component due to a too large ++ * budget. Do not care about throughput consequences, ++ * but only about latency. Finally, do not assign a ++ * too small budget either, to avoid increasing ++ * latency by causing too frequent expirations. ++ */ ++ bfqq->entity.budget = min_t(unsigned long, ++ bfqq->entity.budget, ++ 2 * bfq_min_budget(bfqd)); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais starting at %lu, rais_max_time %u", ++ jiffies, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } else if (old_wr_coeff > 1) { ++ if (interactive) { /* update wr coeff and duration */ ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ } else if (in_burst) { ++ bfqq->wr_coeff = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais ending at %lu, rais_max_time %u", ++ jiffies, ++ jiffies_to_msecs(bfqq-> ++ wr_cur_max_time)); ++ } else if (soft_rt) { ++ /* ++ * The application is now or still meeting the ++ * requirements for being deemed soft rt. We ++ * can then correctly and safely (re)charge ++ * the weight-raising duration for the ++ * application with the weight-raising ++ * duration for soft rt applications. ++ * ++ * In particular, doing this recharge now, i.e., ++ * before the weight-raising period for the ++ * application finishes, reduces the probability ++ * of the following negative scenario: ++ * 1) the weight of a soft rt application is ++ * raised at startup (as for any newly ++ * created application), ++ * 2) since the application is not interactive, ++ * at a certain time weight-raising is ++ * stopped for the application, ++ * 3) at that time the application happens to ++ * still have pending requests, and hence ++ * is destined to not have a chance to be ++ * deemed soft rt before these requests are ++ * completed (see the comments to the ++ * function bfq_bfqq_softrt_next_start() ++ * for details on soft rt detection), ++ * 4) these pending requests experience a high ++ * latency because the application is not ++ * weight-raised while they are pending. 
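++ * ++ * (Concretely, the recharge just restarts the soft-rt period: ++ * wr_cur_max_time is set back to bfq_wr_rt_max_time and wr_coeff to ++ * bfq_wr_coeff * BFQ_SOFTRT_WEIGHT_FACTOR, as in the code below.)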
++ */ ++ if (bfqq->wr_cur_max_time != ++ bfqd->bfq_wr_rt_max_time) { ++ bfqq->wr_start_at_switch_to_srt = ++ bfqq->last_wr_start_finish; ++ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); ++ ++ bfqq->wr_cur_max_time = ++ bfqd->bfq_wr_rt_max_time; ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff * ++ BFQ_SOFTRT_WEIGHT_FACTOR; ++ bfq_log_bfqq(bfqd, bfqq, ++ "switching to soft_rt wr"); ++ } else ++ bfq_log_bfqq(bfqd, bfqq, ++ "moving forward soft_rt wr duration"); ++ bfqq->last_wr_start_finish = jiffies; ++ } ++ } ++} ++ ++static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ return bfqq->dispatched == 0 && ++ time_is_before_jiffies( ++ bfqq->budget_timeout + ++ bfqd->bfq_wr_min_idle_time); ++} ++ ++static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ int old_wr_coeff, ++ struct request *rq, ++ bool *interactive) ++{ ++ bool soft_rt, in_burst, wr_or_deserves_wr, ++ bfqq_wants_to_preempt, ++ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), ++ /* ++ * See the comments on ++ * bfq_bfqq_update_budg_for_activation for ++ * details on the usage of the next variable. ++ */ ++ arrived_in_time = ktime_get_ns() <= ++ RQ_BIC(rq)->ttime.last_end_request + ++ bfqd->bfq_slice_idle * 3; ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "bfq_add_request non-busy: " ++ "jiffies %lu, in_time %d, idle_long %d busyw %d " ++ "wr_coeff %u", ++ jiffies, arrived_in_time, ++ idle_for_long_time, ++ bfq_bfqq_non_blocking_wait_rq(bfqq), ++ old_wr_coeff); ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ BUG_ON(bfqq == bfqd->in_service_queue); ++ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); ++ ++ /* ++ * bfqq deserves to be weight-raised if: ++ * - it is sync, ++ * - it does not belong to a large burst, ++ * - it has been idle for enough time or is soft real-time, ++ * - is linked to a bfq_io_cq (it is not shared in any sense) ++ */ ++ in_burst = bfq_bfqq_in_large_burst(bfqq); ++ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && ++ !in_burst && ++ time_is_before_jiffies(bfqq->soft_rt_next_start) && ++ bfqq->dispatched == 0; ++ *interactive = ++ !in_burst && ++ idle_for_long_time; ++ wr_or_deserves_wr = bfqd->low_latency && ++ (bfqq->wr_coeff > 1 || ++ (bfq_bfqq_sync(bfqq) && ++ bfqq->bic && (*interactive || soft_rt))); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "bfq_add_request: " ++ "in_burst %d, " ++ "soft_rt %d (next %lu), inter %d, bic %p", ++ bfq_bfqq_in_large_burst(bfqq), soft_rt, ++ bfqq->soft_rt_next_start, ++ *interactive, ++ bfqq->bic); ++ ++ /* ++ * Using the last flag, update budget and check whether bfqq ++ * may want to preempt the in-service queue. ++ */ ++ bfqq_wants_to_preempt = ++ bfq_bfqq_update_budg_for_activation(bfqd, bfqq, ++ arrived_in_time, ++ wr_or_deserves_wr); ++ ++ /* ++ * If bfqq happened to be activated in a burst, but has been ++ * idle for much more than an interactive queue, then we ++ * assume that, in the overall I/O initiated in the burst, the ++ * I/O associated with bfqq is finished. So bfqq does not need ++ * to be treated as a queue belonging to a burst ++ * anymore. Accordingly, we reset bfqq's in_large_burst flag ++ * if set, and remove bfqq from the burst list if it's ++ * there. We do not decrement burst_size, because the fact ++ * that bfqq does not need to belong to the burst list any ++ * more does not invalidate the fact that bfqq was created in ++ * a burst. 
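++ * ++ * (Concretely, "idle for much more than an interactive queue" is ++ * implemented below as: bfqq has no dispatched requests and its ++ * budget_timeout lies more than 10 seconds in the past, hence the ++ * msecs_to_jiffies(10000) in the check.)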
++ */ ++ if (likely(!bfq_bfqq_just_created(bfqq)) && ++ idle_for_long_time && ++ time_is_before_jiffies( ++ bfqq->budget_timeout + ++ msecs_to_jiffies(10000))) { ++ hlist_del_init(&bfqq->burst_list_node); ++ bfq_clear_bfqq_in_large_burst(bfqq); ++ } ++ ++ bfq_clear_bfqq_just_created(bfqq); ++ ++ if (!bfq_bfqq_IO_bound(bfqq)) { ++ if (arrived_in_time) { ++ bfqq->requests_within_timer++; ++ if (bfqq->requests_within_timer >= ++ bfqd->bfq_requests_within_timer) ++ bfq_mark_bfqq_IO_bound(bfqq); ++ } else ++ bfqq->requests_within_timer = 0; ++ bfq_log_bfqq(bfqd, bfqq, "requests in time %d", ++ bfqq->requests_within_timer); ++ } ++ ++ if (bfqd->low_latency) { ++ if (unlikely(time_is_after_jiffies(bfqq->split_time))) ++ /* wraparound */ ++ bfqq->split_time = ++ jiffies - bfqd->bfq_wr_min_idle_time - 1; ++ ++ if (time_is_before_jiffies(bfqq->split_time + ++ bfqd->bfq_wr_min_idle_time)) { ++ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, ++ old_wr_coeff, ++ wr_or_deserves_wr, ++ *interactive, ++ in_burst, ++ soft_rt); ++ ++ if (old_wr_coeff != bfqq->wr_coeff) ++ bfqq->entity.prio_changed = 1; ++ } ++ } ++ ++ bfqq->last_idle_bklogged = jiffies; ++ bfqq->service_from_backlogged = 0; ++ bfq_clear_bfqq_softrt_update(bfqq); ++ ++ bfq_add_bfqq_busy(bfqd, bfqq); ++ ++ /* ++ * Expire in-service queue only if preemption may be needed ++ * for guarantees. In this respect, the function ++ * next_queue_may_preempt just checks a simple, necessary ++ * condition, and not a sufficient condition based on ++ * timestamps. In fact, for the latter condition to be ++ * evaluated, timestamps would need first to be updated, and ++ * this operation is quite costly (see the comments on the ++ * function bfq_bfqq_update_budg_for_activation). ++ */ ++ if (bfqd->in_service_queue && bfqq_wants_to_preempt && ++ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && ++ next_queue_may_preempt(bfqd)) { ++ struct bfq_queue *in_serv = ++ bfqd->in_service_queue; ++ BUG_ON(in_serv == bfqq); ++ ++ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, ++ false, BFQ_BFQQ_PREEMPTED); ++ } ++} ++ ++static void bfq_add_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ struct request *next_rq, *prev; ++ unsigned int old_wr_coeff = bfqq->wr_coeff; ++ bool interactive = false; ++ ++ bfq_log_bfqq(bfqd, bfqq, "size %u %s", ++ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); ++ ++ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ ++ bfq_log_bfqq(bfqd, bfqq, ++ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", ++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time), ++ bfqq->wr_coeff, ++ bfqq->entity.weight, bfqq->entity.orig_weight); ++ ++ bfqq->queued[rq_is_sync(rq)]++; ++ bfqd->queued++; ++ ++ elv_rb_add(&bfqq->sort_list, rq); ++ ++ /* ++ * Check if this request is a better next-to-serve candidate. ++ */ ++ prev = bfqq->next_rq; ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); ++ BUG_ON(!next_rq); ++ bfqq->next_rq = next_rq; ++ ++ /* ++ * Adjust priority tree position, if next_rq changes. ++ */ ++ if (prev != bfqq->next_rq) ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ ++ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... 
*/ ++ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, ++ rq, &interactive); ++ else { ++ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && ++ time_is_before_jiffies( ++ bfqq->last_wr_start_finish + ++ bfqd->bfq_wr_min_inter_arr_async)) { ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ ++ bfqd->wr_busy_queues++; ++ BUG_ON(bfqd->wr_busy_queues > bfq_tot_busy_queues(bfqd)); ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "non-idle wrais starting, " ++ "wr_max_time %u wr_busy %d", ++ jiffies_to_msecs(bfqq->wr_cur_max_time), ++ bfqd->wr_busy_queues); ++ } ++ if (prev != bfqq->next_rq) ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ /* ++ * Assign jiffies to last_wr_start_finish in the following ++ * cases: ++ * ++ * . if bfqq is not going to be weight-raised, because, for ++ * non weight-raised queues, last_wr_start_finish stores the ++ * arrival time of the last request; as of now, this piece ++ * of information is used only for deciding whether to ++ * weight-raise async queues ++ * ++ * . if bfqq is not weight-raised, because, if bfqq is now ++ * switching to weight-raised, then last_wr_start_finish ++ * stores the time when weight-raising starts ++ * ++ * . if bfqq is interactive, because, regardless of whether ++ * bfqq is currently weight-raised, the weight-raising ++ * period must start or restart (this case is considered ++ * separately because it is not detected by the above ++ * conditions, if bfqq is already weight-raised) ++ * ++ * last_wr_start_finish has to be updated also if bfqq is soft ++ * real-time, because the weight-raising period is constantly ++ * restarted on idle-to-busy transitions for these queues, but ++ * this is already done in bfq_bfqq_handle_idle_busy_switch if ++ * needed. ++ */ ++ if (bfqd->low_latency && ++ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) ++ bfqq->last_wr_start_finish = jiffies; ++} ++ ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, ++ struct bio *bio) ++{ ++ struct task_struct *tsk = current; ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq; ++ ++ bic = bfq_bic_lookup(bfqd, tsk->io_context); ++ if (!bic) ++ return NULL; ++ ++ bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); ++ if (bfqq) ++ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); ++ ++ return NULL; ++} ++ ++static sector_t get_sdist(sector_t last_pos, struct request *rq) ++{ ++ sector_t sdist = 0; ++ ++ if (last_pos) { ++ if (last_pos < blk_rq_pos(rq)) ++ sdist = blk_rq_pos(rq) - last_pos; ++ else ++ sdist = last_pos - blk_rq_pos(rq); ++ } ++ ++ return sdist; ++} ++ ++static void bfq_activate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ bfqd->rq_in_driver++; ++} ++ ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ ++ BUG_ON(bfqd->rq_in_driver == 0); ++ bfqd->rq_in_driver--; ++} ++ ++static void bfq_remove_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ const int sync = rq_is_sync(rq); ++ ++ /* ++ * NOTE: ++ * (bfqq->entity.service > bfqq->entity.budget) may hold here, ++ * in case of forced dispatches. 
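++ * ++ * (Forced dispatches drain the queues regardless of the budget left, ++ * so the accounted service may temporarily overshoot the budget; this ++ * is why no BUG_ON enforces service <= budget at this point.)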
++ */ ++ ++ if (bfqq->next_rq == rq) { ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ if (rq->queuelist.prev != &rq->queuelist) ++ list_del_init(&rq->queuelist); ++ BUG_ON(bfqq->queued[sync] == 0); ++ bfqq->queued[sync]--; ++ bfqd->queued--; ++ elv_rb_del(&bfqq->sort_list, rq); ++ ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ bfqq->next_rq = NULL; ++ ++ BUG_ON(bfqq->entity.budget < 0); ++ ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { ++ BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */ ++ bfq_del_bfqq_busy(bfqd, bfqq, false); ++ /* ++ * bfqq emptied. In normal operation, when ++ * bfqq is empty, bfqq->entity.service and ++ * bfqq->entity.budget must contain, ++ * respectively, the service received and the ++ * budget used last time bfqq emptied. These ++ * facts do not hold in this case, as at least ++ * this last removal occurred while bfqq is ++ * not in service. To avoid inconsistencies, ++ * reset both bfqq->entity.service and ++ * bfqq->entity.budget, if bfqq has still a ++ * process that may issue I/O requests to it. ++ */ ++ bfqq->entity.budget = bfqq->entity.service = 0; ++ } ++ ++ /* ++ * Remove queue from request-position tree as it is empty. ++ */ ++ if (bfqq->pos_root) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ } else { ++ BUG_ON(!bfqq->next_rq); ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ } ++ ++ if (rq->cmd_flags & REQ_META) { ++ BUG_ON(bfqq->meta_pending == 0); ++ bfqq->meta_pending--; ++ } ++ bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); ++} ++ ++static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, ++ struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct request *__rq; ++ ++ __rq = bfq_find_rq_fmerge(bfqd, bio); ++ if (__rq && elv_bio_merge_ok(__rq, bio)) { ++ *req = __rq; ++ return ELEVATOR_FRONT_MERGE; ++ } ++ ++ return ELEVATOR_NO_MERGE; ++} ++ ++static void bfq_merged_request(struct request_queue *q, struct request *req, ++ enum elv_merge type) ++{ ++ if (type == ELEVATOR_FRONT_MERGE && ++ rb_prev(&req->rb_node) && ++ blk_rq_pos(req) < ++ blk_rq_pos(container_of(rb_prev(&req->rb_node), ++ struct request, rb_node))) { ++ struct bfq_queue *bfqq = RQ_BFQQ(req); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ struct request *prev, *next_rq; ++ ++ /* Reposition request in its sort_list */ ++ elv_rb_del(&bfqq->sort_list, req); ++ elv_rb_add(&bfqq->sort_list, req); ++ /* Choose next request to be served for bfqq */ ++ prev = bfqq->next_rq; ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, ++ bfqd->last_position); ++ BUG_ON(!next_rq); ++ bfqq->next_rq = next_rq; ++ /* ++ * If next_rq changes, update both the queue's budget to ++ * fit the new request and the queue's position in its ++ * rq_pos_tree. ++ */ ++ if (prev != bfqq->next_rq) { ++ bfq_updated_next_req(bfqd, bfqq); ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ } ++ } ++} ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static void bfq_bio_merged(struct request_queue *q, struct request *req, ++ struct bio *bio) ++{ ++ bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); ++} ++#endif ++ ++static void bfq_merged_requests(struct request_queue *q, struct request *rq, ++ struct request *next) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); ++ ++ /* ++ * If next and rq belong to the same bfq_queue and next is older ++ * than rq, then reposition rq in the fifo (by substituting next ++ * with rq). 
Otherwise, if next and rq belong to different ++ * bfq_queues, never reposition rq: in fact, we would have to ++ * reposition it with respect to next's position in its own fifo, ++ * which would most certainly be too expensive with respect to ++ * the benefits. ++ */ ++ if (bfqq == next_bfqq && ++ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && ++ next->fifo_time < rq->fifo_time) { ++ list_del_init(&rq->queuelist); ++ list_replace_init(&next->queuelist, &rq->queuelist); ++ rq->fifo_time = next->fifo_time; ++ } ++ ++ if (bfqq->next_rq == next) ++ bfqq->next_rq = rq; ++ ++ bfq_remove_request(next); ++ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); ++} ++ ++/* Must be called with bfqq != NULL */ ++static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) ++{ ++ BUG_ON(!bfqq); ++ ++ if (bfq_bfqq_busy(bfqq)) { ++ bfqq->bfqd->wr_busy_queues--; ++ BUG_ON(bfqq->bfqd->wr_busy_queues < 0); ++ } ++ bfqq->wr_coeff = 1; ++ bfqq->wr_cur_max_time = 0; ++ bfqq->last_wr_start_finish = jiffies; ++ /* ++ * Trigger a weight change on the next invocation of ++ * __bfq_entity_update_weight_prio. ++ */ ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "wrais ending at %lu, rais_max_time %u", ++ bfqq->last_wr_start_finish, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "wr_busy %d", ++ bfqq->bfqd->wr_busy_queues); ++} ++ ++static void bfq_end_wr_async_queues(struct bfq_data *bfqd, ++ struct bfq_group *bfqg) ++{ ++ int i, j; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_BE_NR; j++) ++ if (bfqg->async_bfqq[i][j]) ++ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); ++ if (bfqg->async_idle_bfqq) ++ bfq_bfqq_end_wr(bfqg->async_idle_bfqq); ++} ++ ++static void bfq_end_wr(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq; ++ ++ spin_lock_irq(bfqd->queue->queue_lock); ++ ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) ++ bfq_bfqq_end_wr(bfqq); ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) ++ bfq_bfqq_end_wr(bfqq); ++ bfq_end_wr_async(bfqd); ++ ++ spin_unlock_irq(bfqd->queue->queue_lock); ++} ++ ++static sector_t bfq_io_struct_pos(void *io_struct, bool request) ++{ ++ if (request) ++ return blk_rq_pos(io_struct); ++ else ++ return ((struct bio *)io_struct)->bi_iter.bi_sector; ++} ++ ++static int bfq_rq_close_to_sector(void *io_struct, bool request, ++ sector_t sector) ++{ ++ return abs(bfq_io_struct_pos(io_struct, request) - sector) <= ++ BFQQ_CLOSE_THR; ++} ++ ++static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ sector_t sector) ++{ ++ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; ++ struct rb_node *parent, *node; ++ struct bfq_queue *__bfqq; ++ ++ if (RB_EMPTY_ROOT(root)) ++ return NULL; ++ ++ /* ++ * First, if we find a request starting at the end of the last ++ * request, choose it. ++ */ ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); ++ if (__bfqq) ++ return __bfqq; ++ ++ /* ++ * If the exact sector wasn't found, the parent of the NULL leaf ++ * will contain the closest sector (rq_pos_tree sorted by ++ * next_request position). 
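++ * ++ * (So the lookup probes at most three candidates: an exact match, the ++ * parent of the failed lookup, and one neighbour of that parent; each ++ * is accepted only if within BFQQ_CLOSE_THR sectors of the target, see ++ * bfq_rq_close_to_sector().)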
++ */ ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) ++ return __bfqq; ++ ++ if (blk_rq_pos(__bfqq->next_rq) < sector) ++ node = rb_next(&__bfqq->pos_node); ++ else ++ node = rb_prev(&__bfqq->pos_node); ++ if (!node) ++ return NULL; ++ ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node); ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) ++ return __bfqq; ++ ++ return NULL; ++} ++ ++static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, ++ struct bfq_queue *cur_bfqq, ++ sector_t sector) ++{ ++ struct bfq_queue *bfqq; ++ ++ /* ++ * We shall notice if some of the queues are cooperating, ++ * e.g., working closely on the same area of the device. In ++ * that case, we can group them together and: 1) don't waste ++ * time idling, and 2) serve the union of their requests in ++ * the best possible order for throughput. ++ */ ++ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); ++ if (!bfqq || bfqq == cur_bfqq) ++ return NULL; ++ ++ return bfqq; ++} ++ ++static struct bfq_queue * ++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) ++{ ++ int process_refs, new_process_refs; ++ struct bfq_queue *__bfqq; ++ ++ /* ++ * If there are no process references on the new_bfqq, then it is ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain ++ * may have dropped their last reference (not just their last process ++ * reference). ++ */ ++ if (!bfqq_process_refs(new_bfqq)) ++ return NULL; ++ ++ /* Avoid a circular list and skip interim queue merges. */ ++ while ((__bfqq = new_bfqq->new_bfqq)) { ++ if (__bfqq == bfqq) ++ return NULL; ++ new_bfqq = __bfqq; ++ } ++ ++ process_refs = bfqq_process_refs(bfqq); ++ new_process_refs = bfqq_process_refs(new_bfqq); ++ /* ++ * If the process for the bfqq has gone away, there is no ++ * sense in merging the queues. ++ */ ++ if (process_refs == 0 || new_process_refs == 0) ++ return NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", ++ new_bfqq->pid); ++ ++ /* ++ * Merging is just a redirection: the requests of the process ++ * owning one of the two queues are redirected to the other queue. ++ * The latter queue, in its turn, is set as shared if this is the ++ * first time that the requests of some process are redirected to ++ * it. ++ * ++ * We redirect bfqq to new_bfqq and not the opposite, because we ++ * are in the context of the process owning bfqq, hence we have ++ * the io_cq of this process. So we can immediately configure this ++ * io_cq to redirect the requests of the process to new_bfqq. ++ * ++ * NOTE, even if new_bfqq coincides with the in-service queue, the ++ * io_cq of new_bfqq is not available, because, if the in-service ++ * queue is shared, bfqd->in_service_bic may not point to the ++ * io_cq of the in-service queue. ++ * Redirecting the requests of the process owning bfqq to the ++ * currently in-service queue is in any case the best option, as ++ * we feed the in-service queue with new requests close to the ++ * last request served and, by doing so, hopefully increase the ++ * throughput. 
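++ * ++ * (The reference bump below mirrors this: every process reference that ++ * will be redirected from bfqq contributes one queue reference to ++ * new_bfqq, hence new_bfqq->ref += process_refs.)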
++ */ ++ bfqq->new_bfqq = new_bfqq; ++ new_bfqq->ref += process_refs; ++ return new_bfqq; ++} ++ ++static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, ++ struct bfq_queue *new_bfqq) ++{ ++ if (bfq_too_late_for_merging(new_bfqq)) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "too late for bfq%d to be merged", ++ new_bfqq->pid); ++ return false; ++ } ++ ++ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || ++ (bfqq->ioprio_class != new_bfqq->ioprio_class)) ++ return false; ++ ++ /* ++ * If either of the queues has already been detected as seeky, ++ * then merging it with the other queue is unlikely to lead to ++ * sequential I/O. ++ */ ++ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) ++ return false; ++ ++ /* ++ * Interleaved I/O is known to be done by (some) applications ++ * only for reads, so it does not make sense to merge async ++ * queues. ++ */ ++ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * Attempt to schedule a merge of bfqq with the currently in-service ++ * queue or with a close queue among the scheduled queues. Return ++ * NULL if no merge was scheduled, a pointer to the shared bfq_queue ++ * structure otherwise. ++ * ++ * The OOM queue is not allowed to participate in cooperation: in fact, since ++ * the requests temporarily redirected to the OOM queue could be redirected ++ * again to dedicated queues at any time, the state needed to correctly ++ * handle merging with the OOM queue would be quite complex and expensive ++ * to maintain. Besides, in such a critical condition as an out of memory, ++ * the benefits of queue merging may be of little relevance, or even negligible. ++ * ++ * WARNING: queue merging may impair fairness among non-weight raised ++ * queues, for at least two reasons: 1) the original weight of a ++ * merged queue may change during the merged state, 2) even if the ++ * weight stays the same, a merged queue may be bloated with many more ++ * requests than the ones produced by its originally-associated ++ * process. ++ */ ++static struct bfq_queue * ++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ void *io_struct, bool request) ++{ ++ struct bfq_queue *in_service_bfqq, *new_bfqq; ++ ++ /* ++ * Prevent bfqq from being merged if it has been created too ++ * long ago. The idea is that true cooperating processes, and ++ * thus their associated bfq_queues, are supposed to be ++ * created shortly after each other. This is the case, e.g., ++ * for KVM/QEMU and dump I/O threads. Based on this ++ * assumption, the following filtering greatly reduces the ++ * probability that two non-cooperating processes, which just ++ * happen to do close I/O for some short time interval, have ++ * their queues merged by mistake. ++ */ ++ if (bfq_too_late_for_merging(bfqq)) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "would have looked for coop, but too late"); ++ return NULL; ++ } ++ ++ if (bfqq->new_bfqq) ++ return bfqq->new_bfqq; ++ ++ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) ++ return NULL; ++ ++ /* If there is only one backlogged queue, don't search. 
*/ ++ if (bfq_tot_busy_queues(bfqd) == 1) ++ return NULL; ++ ++ in_service_bfqq = bfqd->in_service_queue; ++ ++ if (in_service_bfqq && in_service_bfqq != bfqq && ++ likely(in_service_bfqq != &bfqd->oom_bfqq) && ++ bfq_rq_close_to_sector(io_struct, request, bfqd->in_serv_last_pos) && ++ bfqq->entity.parent == in_service_bfqq->entity.parent && ++ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { ++ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); ++ if (new_bfqq) ++ return new_bfqq; ++ } ++ /* ++ * Check whether there is a cooperator among currently scheduled ++ * queues. The only thing we need is that the bio/request is not ++ * NULL, as we need it to establish whether a cooperator exists. ++ */ ++ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, ++ bfq_io_struct_pos(io_struct, request)); ++ ++ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); ++ ++ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && ++ bfq_may_be_close_cooperator(bfqq, new_bfqq)) ++ return bfq_setup_merge(bfqq, new_bfqq); ++ ++ return NULL; ++} ++ ++static void bfq_bfqq_save_state(struct bfq_queue *bfqq) ++{ ++ struct bfq_io_cq *bic = bfqq->bic; ++ ++ /* ++ * If !bfqq->bic, the queue is already shared or its requests ++ * have already been redirected to a shared queue; both idle window ++ * and weight raising state have already been saved. Do nothing. ++ */ ++ if (!bic) ++ return; ++ ++ bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); ++ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); ++ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); ++ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); ++ if (unlikely(bfq_bfqq_just_created(bfqq) && ++ !bfq_bfqq_in_large_burst(bfqq) && ++ bfqq->bfqd->low_latency)) { ++ /* ++ * bfqq being merged right after being created: bfqq ++ * would have deserved interactive weight raising, but ++ * did not make it to be set in a weight-raised state, ++ * because of this early merge. Store directly the ++ * weight-raising state that would have been assigned ++ * to bfqq, so as to avoid that bfqq unjustly fails ++ * to enjoy weight raising if split soon. ++ */ ++ bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; ++ bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); ++ bic->saved_last_wr_start_finish = jiffies; ++ } else { ++ bic->saved_wr_coeff = bfqq->wr_coeff; ++ bic->saved_wr_start_at_switch_to_srt = ++ bfqq->wr_start_at_switch_to_srt; ++ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; ++ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; ++ } ++ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); ++} ++ ++static void bfq_get_bic_reference(struct bfq_queue *bfqq) ++{ ++ /* ++ * If bfqq->bic has a non-NULL value, the bic to which it belongs ++ * is about to begin using a shared bfq_queue. ++ */ ++ if (bfqq->bic) ++ atomic_long_inc(&bfqq->bic->icq.ioc->refcount); ++} ++ ++static void ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, ++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) ++{ ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", ++ (unsigned long) new_bfqq->pid); ++ /* Save weight raising and idle window of the merged queues */ ++ bfq_bfqq_save_state(bfqq); ++ bfq_bfqq_save_state(new_bfqq); ++ if (bfq_bfqq_IO_bound(bfqq)) ++ bfq_mark_bfqq_IO_bound(new_bfqq); ++ bfq_clear_bfqq_IO_bound(bfqq); ++ ++ /* ++ * If bfqq is weight-raised, then let new_bfqq inherit ++ * weight-raising. 
To reduce false positives, neglect the case ++ * where bfqq has just been created, but has not yet made it ++ * to be weight-raised (which may happen because EQM may merge ++ * bfqq even before bfq_add_request is executed for the first ++ * time for bfqq). Handling this case would however be very ++ * easy, thanks to the flag just_created. ++ */ ++ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { ++ new_bfqq->wr_coeff = bfqq->wr_coeff; ++ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; ++ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; ++ new_bfqq->wr_start_at_switch_to_srt = ++ bfqq->wr_start_at_switch_to_srt; ++ if (bfq_bfqq_busy(new_bfqq)) { ++ bfqd->wr_busy_queues++; ++ BUG_ON(bfqd->wr_busy_queues > ++ bfq_tot_busy_queues(bfqd)); ++ } ++ ++ new_bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqd, new_bfqq, ++ "wr start after merge with %d, rais_max_time %u", ++ bfqq->pid, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ ++ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ ++ bfqq->wr_coeff = 1; ++ bfqq->entity.prio_changed = 1; ++ if (bfq_bfqq_busy(bfqq)) { ++ bfqd->wr_busy_queues--; ++ BUG_ON(bfqd->wr_busy_queues < 0); ++ } ++ ++ } ++ ++ bfq_log_bfqq(bfqd, new_bfqq, "wr_busy %d", ++ bfqd->wr_busy_queues); ++ ++ /* ++ * Grab a reference to the bic, to prevent it from being destroyed ++ * before being possibly touched by a bfq_split_bfqq(). ++ */ ++ bfq_get_bic_reference(bfqq); ++ bfq_get_bic_reference(new_bfqq); ++ /* ++ * Merge queues (that is, let bic redirect its requests to new_bfqq) ++ */ ++ bic_set_bfqq(bic, new_bfqq, 1); ++ bfq_mark_bfqq_coop(new_bfqq); ++ /* ++ * new_bfqq now belongs to at least two bics (it is a shared queue): ++ * set new_bfqq->bic to NULL. bfqq either: ++ * - does not belong to any bic any more, and hence bfqq->bic must ++ * be set to NULL, or ++ * - is a queue whose owning bics have already been redirected to a ++ * different queue, hence the queue is destined to not belong to ++ * any bic soon and bfqq->bic is already NULL (therefore the next ++ * assignment causes no harm). ++ */ ++ new_bfqq->bic = NULL; ++ bfqq->bic = NULL; ++ /* release process reference to bfqq */ ++ bfq_put_queue(bfqq); ++} ++ ++static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, ++ struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ bool is_sync = op_is_sync(bio->bi_opf); ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq, *new_bfqq; ++ ++ /* ++ * Disallow merge of a sync bio into an async request. ++ */ ++ if (is_sync && !rq_is_sync(rq)) ++ return false; ++ ++ /* ++ * Lookup the bfqq that this bio will be queued with. Allow ++ * merge only if rq is queued there. ++ * Queue lock is held here. ++ */ ++ bic = bfq_bic_lookup(bfqd, current->io_context); ++ if (!bic) ++ return false; ++ ++ bfqq = bic_to_bfqq(bic, is_sync); ++ /* ++ * We take advantage of this function to perform an early merge ++ * of the queues of possible cooperating processes. ++ */ ++ if (bfqq) { ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); ++ if (new_bfqq) { ++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); ++ /* ++ * If we get here, the bio will be queued in the ++ * shared queue, i.e., new_bfqq, so use new_bfqq ++ * to decide whether bio and rq can be merged. 
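++ * ++ * (The final bfqq == RQ_BFQQ(rq) test below is what enforces the ++ * "allow merge only if rq is queued in the same queue" rule stated at ++ * the beginning of this function.)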
++ */ ++ bfqq = new_bfqq; ++ } ++ } ++ ++ return bfqq == RQ_BFQQ(rq); ++} ++ ++static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, ++ struct request *next) ++{ ++ return RQ_BFQQ(rq) == RQ_BFQQ(next); ++} ++ ++/* ++ * Set the maximum time for the in-service queue to consume its ++ * budget. This prevents seeky processes from lowering the throughput. ++ * In practice, a time-slice service scheme is used with seeky ++ * processes. ++ */ ++static void bfq_set_budget_timeout(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ unsigned int timeout_coeff; ++ ++ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) ++ timeout_coeff = 1; ++ else ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; ++ ++ bfqd->last_budget_start = ktime_get(); ++ ++ bfqq->budget_timeout = jiffies + ++ bfqd->bfq_timeout * timeout_coeff; ++ ++ bfq_log_bfqq(bfqd, bfqq, "%u", ++ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); ++} ++ ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ if (bfqq) { ++ bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); ++ bfq_mark_bfqq_must_alloc(bfqq); ++ bfq_clear_bfqq_fifo_expire(bfqq); ++ ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; ++ ++ BUG_ON(bfqq == bfqd->in_service_queue); ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ if (time_is_before_jiffies(bfqq->last_wr_start_finish) && ++ bfqq->wr_coeff > 1 && ++ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && ++ time_is_before_jiffies(bfqq->budget_timeout)) { ++ /* ++ * For soft real-time queues, move the start ++ * of the weight-raising period forward by the ++ * time the queue has not received any ++ * service. Otherwise, a relatively long ++ * service delay is likely to cause the ++ * weight-raising period of the queue to end, ++ * because of the short duration of the ++ * weight-raising period of a soft real-time ++ * queue. It is worth noting that this move ++ * is not so dangerous for the other queues, ++ * because soft real-time queues are not ++ * greedy. ++ * ++ * To not add a further variable, we use the ++ * overloaded field budget_timeout to ++ * determine for how long the queue has not ++ * received service, i.e., how much time has ++ * elapsed since the queue expired. However, ++ * this is a little imprecise, because ++ * budget_timeout is set to jiffies if bfqq ++ * not only expires, but also remains with no ++ * request. ++ */ ++ if (time_after(bfqq->budget_timeout, ++ bfqq->last_wr_start_finish)) ++ bfqq->last_wr_start_finish += ++ jiffies - bfqq->budget_timeout; ++ else ++ bfqq->last_wr_start_finish = jiffies; ++ ++ if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { ++ pr_crit( ++ "BFQ WARNING:last %lu budget %lu jiffies %lu", ++ bfqq->last_wr_start_finish, ++ bfqq->budget_timeout, ++ jiffies); ++ pr_crit("diff %lu", jiffies - ++ max_t(unsigned long, ++ bfqq->last_wr_start_finish, ++ bfqq->budget_timeout)); ++ bfqq->last_wr_start_finish = jiffies; ++ } ++ } ++ ++ bfq_set_budget_timeout(bfqd, bfqq); ++ bfq_log_bfqq(bfqd, bfqq, ++ "cur-budget = %d prio_class %d", ++ bfqq->entity.budget, bfqq->ioprio_class); ++ } else ++ bfq_log(bfqd, "NULL"); ++ ++ bfqd->in_service_queue = bfqq; ++} ++ ++/* ++ * Get and set a new queue for service. 
++ */ ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); ++ ++ __bfq_set_in_service_queue(bfqd, bfqq); ++ return bfqq; ++} ++ ++static void bfq_arm_slice_timer(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfqd->in_service_queue; ++ struct bfq_io_cq *bic; ++ u32 sl; ++ ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ /* Processes have exited, don't wait. */ ++ bic = bfqd->in_service_bic; ++ if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) ++ return; ++ ++ bfq_mark_bfqq_wait_request(bfqq); ++ ++ /* ++ * We don't want to idle for seeks, but we do want to allow ++ * fair distribution of slice time for a process doing back-to-back ++ * seeks. So allow a little bit of time for it to submit a new rq. ++ * ++ * To prevent processes with (partly) seeky workloads from ++ * being too ill-treated, grant them a small fraction of the ++ * assigned budget before reducing the waiting time to ++ * BFQ_MIN_TT. This happened to help reduce latency. ++ */ ++ sl = bfqd->bfq_slice_idle; ++ /* ++ * Unless the queue is being weight-raised or the scenario is ++ * asymmetric, grant only minimum idle time if the queue ++ * is seeky. A long idling is preserved for a weight-raised ++ * queue, or, more in general, in an asymmetric scenario, ++ * because a long idling is needed for guaranteeing to a queue ++ * its reserved share of the throughput (in particular, it is ++ * needed if the queue has a higher weight than some other ++ * queue). ++ */ ++ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && ++ bfq_symmetric_scenario(bfqd)) ++ sl = min_t(u32, sl, BFQ_MIN_TT); ++ ++ bfqd->last_idling_start = ktime_get(); ++ hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), ++ HRTIMER_MODE_REL); ++ bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); ++ bfq_log(bfqd, "arm idle: %ld/%ld ms", ++ sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); ++} ++ ++/* ++ * In autotuning mode, max_budget is dynamically recomputed as the ++ * amount of sectors transferred in timeout at the estimated peak ++ * rate. This enables BFQ to utilize a full timeslice with a full ++ * budget, even if the in-service queue is served at peak rate. And ++ * this maximises throughput with sequential workloads. ++ */ ++static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) ++{ ++ return (u64)bfqd->peak_rate * USEC_PER_MSEC * ++ jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; ++} ++ ++/* ++ * Update parameters related to throughput and responsiveness, as a ++ * function of the estimated peak rate. See comments on ++ * bfq_calc_max_budget(), and on the ref_wr_duration array. ++ */ ++static void update_thr_responsiveness_params(struct bfq_data *bfqd) ++{ ++ if (bfqd->bfq_user_max_budget == 0) { ++ bfqd->bfq_max_budget = ++ bfq_calc_max_budget(bfqd); ++ BUG_ON(bfqd->bfq_max_budget < 0); ++ bfq_log(bfqd, "new max_budget = %d", ++ bfqd->bfq_max_budget); ++ } ++} ++ ++static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) ++{ ++ if (rq != NULL) { /* new rq dispatch now, reset accordingly */ ++ bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; ++ bfqd->peak_rate_samples = 1; ++ bfqd->sequential_samples = 0; ++ bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = ++ blk_rq_sectors(rq); ++ } else /* no new rq dispatched, just reset the number of samples */ ++ bfqd->peak_rate_samples = 0; /* full re-init on next disp. 
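++ * (with peak_rate_samples == 0, the next call to bfq_update_peak_rate() ++ * starts a brand-new observation interval via ++ * bfq_reset_rate_computation())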
*/ ++ ++ bfq_log(bfqd, ++ "at end, sample %u/%u tot_sects %llu", ++ bfqd->peak_rate_samples, bfqd->sequential_samples, ++ bfqd->tot_sectors_dispatched); ++} ++ ++static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) ++{ ++ u32 rate, weight, divisor; ++ ++ /* ++ * For the convergence property to hold (see comments on ++ * bfq_update_peak_rate()) and for the assessment to be ++ * reliable, a minimum number of samples must be present, and ++ * a minimum amount of time must have elapsed. If not so, do ++ * not compute new rate. Just reset parameters, to get ready ++ * for a new evaluation attempt. ++ */ ++ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || ++ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { ++ bfq_log(bfqd, ++ "only resetting, delta_first %lluus samples %d", ++ bfqd->delta_from_first>>10, bfqd->peak_rate_samples); ++ goto reset_computation; ++ } ++ ++ /* ++ * If a new request completion has occurred after last ++ * dispatch, then, to approximate the rate at which requests ++ * have been served by the device, it is more precise to ++ * extend the observation interval to the last completion. ++ */ ++ bfqd->delta_from_first = ++ max_t(u64, bfqd->delta_from_first, ++ bfqd->last_completion - bfqd->first_dispatch); ++ ++ BUG_ON(bfqd->delta_from_first == 0); ++ /* ++ * Rate computed in sects/usec, and not sects/nsec, for ++ * precision issues. ++ */ ++ rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT, ++ div_u64(bfqd->delta_from_first, NSEC_PER_USEC)); ++ ++ bfq_log(bfqd, ++"tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", ++ bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, ++ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), ++ rate > 20<<BFQ_RATE_SHIFT); ++ ++ /* ++ * Peak rate not updated if: ++ * - the percentage of sequential dispatches is below 3/4 of the ++ * total, and rate is below the current estimated peak rate ++ * - rate is unreasonably high (> 20M sectors/sec) ++ */ ++ if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && ++ rate <= bfqd->peak_rate) || ++ rate > 20<<BFQ_RATE_SHIFT) { ++ bfq_log(bfqd, ++ "goto reset, samples %u/%u rate/peak %llu/%llu", ++ bfqd->peak_rate_samples, bfqd->sequential_samples, ++ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), ++ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); ++ goto reset_computation; ++ } else { ++ bfq_log(bfqd, ++ "do update, samples %u/%u rate/peak %llu/%llu", ++ bfqd->peak_rate_samples, bfqd->sequential_samples, ++ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), ++ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); ++ } ++ ++ /* ++ * We have to update the peak rate, at last! To this purpose, ++ * we use a low-pass filter. We compute the smoothing constant ++ * of the filter as a function of the 'weight' of the new ++ * measured rate. ++ * ++ * As can be seen in next formulas, we define this weight as a ++ * quantity proportional to how sequential the workload is, ++ * and to how long the observation time interval is. ++ * ++ * The weight runs from 0 to 8. The maximum value of the ++ * weight, 8, yields the minimum value for the smoothing ++ * constant. At this minimum value for the smoothing constant, ++ * the measured rate contributes for half of the next value of ++ * the estimated peak rate. ++ * ++ * So, the first step is to compute the weight as a function ++ * of how sequential the workload is. Note that the weight ++ * cannot reach 9, because bfqd->sequential_samples cannot ++ * become equal to bfqd->peak_rate_samples, which, in its ++ * turn, holds true because bfqd->sequential_samples is not ++ * incremented for the first sample. ++ */ ++ weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; ++ ++ /* ++ * Second step: further refine the weight as a function of the ++ * duration of the observation interval. 
++ */ ++ weight = min_t(u32, 8, ++ div_u64(weight * bfqd->delta_from_first, ++ BFQ_RATE_REF_INTERVAL)); ++ ++ /* ++ * Divisor ranging from 10, for minimum weight, to 2, for ++ * maximum weight. ++ */ ++ divisor = 10 - weight; ++ BUG_ON(divisor == 0); ++ ++ /* ++ * Finally, update peak rate: ++ * ++ * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor ++ */ ++ bfqd->peak_rate *= divisor-1; ++ bfqd->peak_rate /= divisor; ++ rate /= divisor; /* smoothing constant alpha = 1/divisor */ ++ ++ bfq_log(bfqd, ++ "divisor %d tmp_peak_rate %llu tmp_rate %u", ++ divisor, ++ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), ++ (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); ++ ++ BUG_ON(bfqd->peak_rate == 0); ++ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); ++ ++ bfqd->peak_rate += rate; ++ ++ /* ++ * For a very slow device, bfqd->peak_rate can reach 0 (see ++ * the minimum representable values reported in the comments ++ * on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid ++ * divisions by zero where bfqd->peak_rate is used as a ++ * divisor. ++ */ ++ bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate); ++ ++ update_thr_responsiveness_params(bfqd); ++ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); ++ ++reset_computation: ++ bfq_reset_rate_computation(bfqd, rq); ++} ++ ++/* ++ * Update the read/write peak rate (the main quantity used for ++ * auto-tuning, see update_thr_responsiveness_params()). ++ * ++ * It is not trivial to estimate the peak rate (correctly): because of ++ * the presence of sw and hw queues between the scheduler and the ++ * device components that finally serve I/O requests, it is hard to ++ * say exactly when a given dispatched request is served inside the ++ * device, and for how long. As a consequence, it is hard to know ++ * precisely at what rate a given set of requests is actually served ++ * by the device. ++ * ++ * On the opposite end, the dispatch time of any request is trivially ++ * available, and, from this piece of information, the "dispatch rate" ++ * of requests can be immediately computed. So, the idea in the next ++ * function is to use what is known, namely request dispatch times ++ * (plus, when useful, request completion times), to estimate what is ++ * unknown, namely in-device request service rate. ++ * ++ * The main issue is that, because of the above facts, the rate at ++ * which a certain set of requests is dispatched over a certain time ++ * interval can vary greatly with respect to the rate at which the ++ * same requests are then served. But, since the size of any ++ * intermediate queue is limited, and the service scheme is lossless ++ * (no request is silently dropped), the following obvious convergence ++ * property holds: the number of requests dispatched MUST become ++ * closer and closer to the number of requests completed as the ++ * observation interval grows. This is the key property used in ++ * the next function to estimate the peak service rate as a function ++ * of the observed dispatch rate. The function assumes to be invoked ++ * on every request dispatch. ++ */ ++static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) ++{ ++ u64 now_ns = ktime_get_ns(); ++ ++ if (bfqd->peak_rate_samples == 0) { /* first dispatch */ ++ bfq_log(bfqd, ++ "goto reset, samples %d", ++ bfqd->peak_rate_samples) ; ++ bfq_reset_rate_computation(bfqd, rq); ++ goto update_last_values; /* will add one sample */ ++ } ++ ++ /* ++ * Device idle for very long: the observation interval lasting ++ * up to this dispatch cannot be a valid observation interval ++ * for computing a new peak rate (similarly to the late- ++ * completion event in bfq_completed_request()). Go to ++ * update_rate_and_reset to have the following three steps ++ * taken: ++ * - close the observation interval at the last (previous) ++ * request dispatch or completion ++ * - compute rate, if possible, for that observation interval ++ * - start a new observation interval with this dispatch ++ */ ++ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && ++ bfqd->rq_in_driver == 0) { ++ bfq_log(bfqd, ++"jumping to updating&resetting delta_last %lluus samples %d", ++ (now_ns - bfqd->last_dispatch)>>10, ++ bfqd->peak_rate_samples) ; ++ goto update_rate_and_reset; ++ } ++ ++ /* Update sampling information */ ++ bfqd->peak_rate_samples++; ++ ++ if ((bfqd->rq_in_driver > 0 || ++ now_ns - bfqd->last_completion < BFQ_MIN_TT) ++ && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq)) ++ bfqd->sequential_samples++; ++ ++ bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); ++ ++ /* Reset max observed rq size every 32 dispatches */ ++ if (likely(bfqd->peak_rate_samples % 32)) ++ bfqd->last_rq_max_size = ++ max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); ++ else ++ bfqd->last_rq_max_size = blk_rq_sectors(rq); ++ ++ bfqd->delta_from_first = now_ns - bfqd->first_dispatch; ++ ++ bfq_log(bfqd, ++ "added samples %u/%u tot_sects %llu delta_first %lluus", ++ bfqd->peak_rate_samples, bfqd->sequential_samples, ++ bfqd->tot_sectors_dispatched, ++ bfqd->delta_from_first>>10); ++ ++ /* Target observation interval not yet reached, go on sampling */ ++ if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) ++ goto update_last_values; ++ ++update_rate_and_reset: ++ bfq_update_rate_reset(bfqd, rq); ++update_last_values: ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ if (RQ_BFQQ(rq) == bfqd->in_service_queue) ++ bfqd->in_serv_last_pos = bfqd->last_position; ++ bfqd->last_dispatch = now_ns; ++ ++ bfq_log(bfqd, ++ "delta_first %lluus last_pos %llu peak_rate %llu", ++ (now_ns - 
++
++/*
++ * Move request from internal lists to the dispatch list of the request queue
++ */
++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
++{
++	struct bfq_queue *bfqq = RQ_BFQQ(rq);
++
++	/*
++	 * For consistency, the next instruction should have been executed
++	 * after removing the request from the queue and dispatching it.
++	 * We execute instead this instruction before bfq_remove_request()
++	 * (and hence introduce a temporary inconsistency), for efficiency.
++	 * In fact, in a forced_dispatch, this prevents two counters related
++	 * to bfqq->dispatched from being uselessly decremented if bfqq
++	 * is not in service, and then incremented again after
++	 * bfqq->dispatched itself is incremented.
++	 */
++	bfqq->dispatched++;
++	bfq_update_peak_rate(q->elevator->elevator_data, rq);
++
++	bfq_remove_request(rq);
++	elv_dispatch_sort(q, rq);
++}
++
++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
++{
++	BUG_ON(bfqq != bfqd->in_service_queue);
++
++	/*
++	 * If this bfqq is shared between multiple processes, check
++	 * to make sure that those processes are still issuing I/Os
++	 * within the mean seek distance. If not, it may be time to
++	 * break the queues apart again.
++	 */
++	if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
++		bfq_mark_bfqq_split_coop(bfqq);
++
++	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
++		if (bfqq->dispatched == 0)
++			/*
++			 * Overloading budget_timeout field to store
++			 * the time at which the queue remains with no
++			 * backlog and no outstanding request; used by
++			 * the weight-raising mechanism.
++			 */
++			bfqq->budget_timeout = jiffies;
++
++		bfq_del_bfqq_busy(bfqd, bfqq, true);
++	} else {
++		bfq_requeue_bfqq(bfqd, bfqq, true);
++		/*
++		 * Resort priority tree of potential close cooperators.
++		 */
++		bfq_pos_tree_add_move(bfqd, bfqq);
++	}
++
++	/*
++	 * All in-service entities must have been properly deactivated
++	 * or requeued before executing the next function, which
++	 * resets all in-service entities as no more in service.
++	 */
++	__bfq_bfqd_reset_in_service(bfqd);
++}
++
++/**
++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
++ * @bfqd: device data.
++ * @bfqq: queue to update.
++ * @reason: reason for expiration.
++ *
++ * Handle the feedback on @bfqq budget at queue expiration.
++ * See the body for detailed comments.
++ */
++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
++				     struct bfq_queue *bfqq,
++				     enum bfqq_expiration reason)
++{
++	struct request *next_rq;
++	int budget, min_budget;
++
++	BUG_ON(bfqq != bfqd->in_service_queue);
++
++	min_budget = bfq_min_budget(bfqd);
++
++	if (bfqq->wr_coeff == 1)
++		budget = bfqq->max_budget;
++	else /*
++	      * Use a constant, low budget for weight-raised queues,
++	      * to help achieve a low latency. Keep it slightly higher
++	      * than the minimum possible budget, to cause a little
++	      * bit fewer expirations.
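++	      * In other words, weight-raised queues always restart
++	      * from twice the minimum budget, instead of from the
++	      * adaptive max_budget used for queues that are not
++	      * weight-raised.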
++ */ ++ budget = 2 * min_budget; ++ ++ bfq_log_bfqq(bfqd, bfqq, "last budg %d, budg left %d", ++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); ++ bfq_log_bfqq(bfqd, bfqq, "last max_budg %d, min budg %d", ++ budget, bfq_min_budget(bfqd)); ++ bfq_log_bfqq(bfqd, bfqq, "sync %d, seeky %d", ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); ++ ++ if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { ++ switch (reason) { ++ /* ++ * Caveat: in all the following cases we trade latency ++ * for throughput. ++ */ ++ case BFQ_BFQQ_TOO_IDLE: ++ /* ++ * This is the only case where we may reduce ++ * the budget: if there is no request of the ++ * process still waiting for completion, then ++ * we assume (tentatively) that the timer has ++ * expired because the batch of requests of ++ * the process could have been served with a ++ * smaller budget. Hence, betting that ++ * process will behave in the same way when it ++ * becomes backlogged again, we reduce its ++ * next budget. As long as we guess right, ++ * this budget cut reduces the latency ++ * experienced by the process. ++ * ++ * However, if there are still outstanding ++ * requests, then the process may have not yet ++ * issued its next request just because it is ++ * still waiting for the completion of some of ++ * the still outstanding ones. So in this ++ * subcase we do not reduce its budget, on the ++ * contrary we increase it to possibly boost ++ * the throughput, as discussed in the ++ * comments to the BUDGET_TIMEOUT case. ++ */ ++ if (bfqq->dispatched > 0) /* still outstanding reqs */ ++ budget = min(budget * 2, bfqd->bfq_max_budget); ++ else { ++ if (budget > 5 * min_budget) ++ budget -= 4 * min_budget; ++ else ++ budget = min_budget; ++ } ++ break; ++ case BFQ_BFQQ_BUDGET_TIMEOUT: ++ /* ++ * We double the budget here because it gives ++ * the chance to boost the throughput if this ++ * is not a seeky process (and has bumped into ++ * this timeout because of, e.g., ZBR). ++ */ ++ budget = min(budget * 2, bfqd->bfq_max_budget); ++ break; ++ case BFQ_BFQQ_BUDGET_EXHAUSTED: ++ /* ++ * The process still has backlog, and did not ++ * let either the budget timeout or the disk ++ * idling timeout expire. Hence it is not ++ * seeky, has a short thinktime and may be ++ * happy with a higher budget too. So ++ * definitely increase the budget of this good ++ * candidate to boost the disk throughput. ++ */ ++ budget = min(budget * 4, bfqd->bfq_max_budget); ++ break; ++ case BFQ_BFQQ_NO_MORE_REQUESTS: ++ /* ++ * For queues that expire for this reason, it ++ * is particularly important to keep the ++ * budget close to the actual service they ++ * need. Doing so reduces the timestamp ++ * misalignment problem described in the ++ * comments in the body of ++ * __bfq_activate_entity. In fact, suppose ++ * that a queue systematically expires for ++ * BFQ_BFQQ_NO_MORE_REQUESTS and presents a ++ * new request in time to enjoy timestamp ++ * back-shifting. The larger the budget of the ++ * queue is with respect to the service the ++ * queue actually requests in each service ++ * slot, the more times the queue can be ++ * reactivated with the same virtual finish ++ * time. It follows that, even if this finish ++ * time is pushed to the system virtual time ++ * to reduce the consequent timestamp ++ * misalignment, the queue unjustly enjoys for ++ * many re-activations a lower finish time ++ * than all newly activated queues. ++ * ++ * The service needed by bfqq is measured ++ * quite precisely by bfqq->entity.service. 
++ * Since bfqq does not enjoy device idling, ++ * bfqq->entity.service is equal to the number ++ * of sectors that the process associated with ++ * bfqq requested to read/write before waiting ++ * for request completions, or blocking for ++ * other reasons. ++ */ ++ budget = max_t(int, bfqq->entity.service, min_budget); ++ break; ++ default: ++ return; ++ } ++ } else if (!bfq_bfqq_sync(bfqq)) ++ /* ++ * Async queues get always the maximum possible ++ * budget, as for them we do not care about latency ++ * (in addition, their ability to dispatch is limited ++ * by the charging factor). ++ */ ++ budget = bfqd->bfq_max_budget; ++ ++ bfqq->max_budget = budget; ++ ++ if (bfqd->budgets_assigned >= bfq_stats_min_budgets && ++ !bfqd->bfq_user_max_budget) ++ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); ++ ++ /* ++ * If there is still backlog, then assign a new budget, making ++ * sure that it is large enough for the next request. Since ++ * the finish time of bfqq must be kept in sync with the ++ * budget, be sure to call __bfq_bfqq_expire() *after* this ++ * update. ++ * ++ * If there is no backlog, then no need to update the budget; ++ * it will be updated on the arrival of a new request. ++ */ ++ next_rq = bfqq->next_rq; ++ if (next_rq) { ++ BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || ++ reason == BFQ_BFQQ_NO_MORE_REQUESTS); ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)); ++ BUG_ON(!bfq_bfqq_busy(bfqq)); ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", ++ next_rq ? blk_rq_sectors(next_rq) : 0, ++ bfqq->entity.budget); ++} ++ ++/* ++ * Return true if the process associated with bfqq is "slow". The slow ++ * flag is used, in addition to the budget timeout, to reduce the ++ * amount of service provided to seeky processes, and thus reduce ++ * their chances to lower the throughput. More details in the comments ++ * on the function bfq_bfqq_expire(). ++ * ++ * An important observation is in order: as discussed in the comments ++ * on the function bfq_update_peak_rate(), with devices with internal ++ * queues, it is hard if ever possible to know when and for how long ++ * an I/O request is processed by the device (apart from the trivial ++ * I/O pattern where a new request is dispatched only after the ++ * previous one has been completed). This makes it hard to evaluate ++ * the real rate at which the I/O requests of each bfq_queue are ++ * served. In fact, for an I/O scheduler like BFQ, serving a ++ * bfq_queue means just dispatching its requests during its service ++ * slot (i.e., until the budget of the queue is exhausted, or the ++ * queue remains idle, or, finally, a timeout fires). But, during the ++ * service slot of a bfq_queue, around 100 ms at most, the device may ++ * be even still processing requests of bfq_queues served in previous ++ * service slots. On the opposite end, the requests of the in-service ++ * bfq_queue may be completed after the service slot of the queue ++ * finishes. ++ * ++ * Anyway, unless more sophisticated solutions are used ++ * (where possible), the sum of the sizes of the requests dispatched ++ * during the service slot of a bfq_queue is probably the only ++ * approximation available for the service received by the bfq_queue ++ * during its service slot. And this sum is the quantity used in this ++ * function to evaluate the I/O speed of a process. 
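++ *
++ * As a concrete reading of the check performed below: over
++ * observation intervals longer than 20 ms, a sync queue is deemed
++ * slow if the service it received, approximated as just described,
++ * amounts to less than half of bfqd->bfq_max_budget.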
++ */ ++static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ bool compensate, enum bfqq_expiration reason, ++ unsigned long *delta_ms) ++{ ++ ktime_t delta_ktime; ++ u32 delta_usecs; ++ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ ++ ++ if (!bfq_bfqq_sync(bfqq)) ++ return false; ++ ++ if (compensate) ++ delta_ktime = bfqd->last_idling_start; ++ else ++ delta_ktime = ktime_get(); ++ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); ++ delta_usecs = ktime_to_us(delta_ktime); ++ ++ /* don't use too short time intervals */ ++ if (delta_usecs < 1000) { ++ if (blk_queue_nonrot(bfqd->queue)) ++ /* ++ * give same worst-case guarantees as idling ++ * for seeky ++ */ ++ *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; ++ else /* charge at least one seek */ ++ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; ++ ++ bfq_log(bfqd, "too short %u", delta_usecs); ++ ++ return slow; ++ } ++ ++ *delta_ms = delta_usecs / USEC_PER_MSEC; ++ ++ /* ++ * Use only long (> 20ms) intervals to filter out excessive ++ * spikes in service rate estimation. ++ */ ++ if (delta_usecs > 20000) { ++ /* ++ * Caveat for rotational devices: processes doing I/O ++ * in the slower disk zones tend to be slow(er) even ++ * if not seeky. In this respect, the estimated peak ++ * rate is likely to be an average over the disk ++ * surface. Accordingly, to not be too harsh with ++ * unlucky processes, a process is deemed slow only if ++ * its rate has been lower than half of the estimated ++ * peak rate. ++ */ ++ slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; ++ bfq_log(bfqd, "relative rate %d/%d", ++ bfqq->entity.service, bfqd->bfq_max_budget); ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, "slow %d", slow); ++ ++ return slow; ++} ++ ++/* ++ * To be deemed as soft real-time, an application must meet two ++ * requirements. First, the application must not require an average ++ * bandwidth higher than the approximate bandwidth required to playback or ++ * record a compressed high-definition video. ++ * The next function is invoked on the completion of the last request of a ++ * batch, to compute the next-start time instant, soft_rt_next_start, such ++ * that, if the next request of the application does not arrive before ++ * soft_rt_next_start, then the above requirement on the bandwidth is met. ++ * ++ * The second requirement is that the request pattern of the application is ++ * isochronous, i.e., that, after issuing a request or a batch of requests, ++ * the application stops issuing new requests until all its pending requests ++ * have been completed. After that, the application may issue a new batch, ++ * and so on. ++ * For this reason the next function is invoked to compute ++ * soft_rt_next_start only for applications that meet this requirement, ++ * whereas soft_rt_next_start is set to infinity for applications that do ++ * not. ++ * ++ * Unfortunately, even a greedy (i.e., I/O-bound) application may ++ * happen to meet, occasionally or systematically, both the above ++ * bandwidth and isochrony requirements. This may happen at least in ++ * the following circumstances. First, if the CPU load is high. The ++ * application may stop issuing requests while the CPUs are busy ++ * serving other processes, then restart, then stop again for a while, ++ * and so on. 
The other circumstances are related to the storage
++ * device: the storage device is highly loaded or reaches a low-enough
++ * throughput with the I/O of the application (e.g., because the I/O
++ * is random and/or the device is slow). In all these cases, the
++ * I/O of the application may be simply slowed down enough to meet
++ * the bandwidth and isochrony requirements. To reduce the probability
++ * that greedy applications are deemed as soft real-time in these
++ * corner cases, a further rule is used in the computation of
++ * soft_rt_next_start: the return value of this function is forced to
++ * be higher than the maximum between the following two quantities.
++ *
++ * (a) Current time plus: (1) the maximum time for which the arrival
++ *     of a request is waited for when a sync queue becomes idle,
++ *     namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We
++ *     postpone for a moment the reason for adding a few extra
++ *     jiffies; we get back to it after next item (b). Lower-bounding
++ *     the return value of this function with the current time plus
++ *     bfqd->bfq_slice_idle tends to filter out greedy applications,
++ *     because the latter issue their next request as soon as possible
++ *     after the last one has been completed. In contrast, a soft
++ *     real-time application spends some time processing data, after a
++ *     batch of its requests has been completed.
++ *
++ * (b) Current value of bfqq->soft_rt_next_start. As pointed out
++ *     above, greedy applications may happen to meet both the
++ *     bandwidth and isochrony requirements under heavy CPU or
++ *     storage-device load. In more detail, in these scenarios, these
++ *     applications happen, only for limited time periods, to do I/O
++ *     slowly enough to meet all the requirements described so far,
++ *     including the filtering in above item (a). These slow-speed
++ *     time intervals are usually interspersed between other time
++ *     intervals during which these applications do I/O at a very high
++ *     speed. Fortunately, exactly because of the high speed of the
++ *     I/O in the high-speed intervals, the values returned by this
++ *     function happen to be so high, near the end of any such
++ *     high-speed interval, to be likely to fall *after* the end of
++ *     the low-speed time interval that follows. These high values are
++ *     stored in bfqq->soft_rt_next_start after each invocation of
++ *     this function. As a consequence, if the last value of
++ *     bfqq->soft_rt_next_start is constantly used to lower-bound the
++ *     next value that this function may return, then, from the very
++ *     beginning of a low-speed interval, bfqq->soft_rt_next_start is
++ *     likely to be constantly kept so high that any I/O request
++ *     issued during the low-speed interval is considered as arriving
++ *     too soon for the application to be deemed as soft
++ *     real-time. Then, in the high-speed interval that follows, the
++ *     application will not be deemed as soft real-time, just because
++ *     it will do I/O at a high speed. And so on.
++ *
++ * Getting back to the filtering in item (a), in the following two
++ * cases this filtering might be easily passed by a greedy
++ * application, if the reference quantity was just
++ * bfqd->bfq_slice_idle:
++ * 1) HZ is so low that the duration of a jiffy is comparable to or
++ *    higher than bfqd->bfq_slice_idle. This happens, e.g., on slow
++ *    devices with HZ=100. The time granularity may be so coarse
++ *    that the approximation, in jiffies, of bfqd->bfq_slice_idle
++ *    is rather lower than the exact value.
++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing ++ * for a while, then suddenly 'jump' by several units to recover the lost ++ * increments. This seems to happen, e.g., inside virtual machines. ++ * To address this issue, in the filtering in (a) we do not use as a ++ * reference time interval just bfqd->bfq_slice_idle, but ++ * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the ++ * minimum number of jiffies for which the filter seems to be quite ++ * precise also in embedded systems and KVM/QEMU virtual machines. ++ */ ++static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqd, bfqq, ++"service_blkg %lu soft_rate %u sects/sec interval %u", ++ bfqq->service_from_backlogged, ++ bfqd->bfq_wr_max_softrt_rate, ++ jiffies_to_msecs(HZ * bfqq->service_from_backlogged / ++ bfqd->bfq_wr_max_softrt_rate)); ++ ++ return max3(bfqq->soft_rt_next_start, ++ bfqq->last_idle_bklogged + ++ HZ * bfqq->service_from_backlogged / ++ bfqd->bfq_wr_max_softrt_rate, ++ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); ++} ++ ++static bool bfq_bfqq_injectable(struct bfq_queue *bfqq) ++{ ++ return BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && ++ blk_queue_nonrot(bfqq->bfqd->queue) && ++ bfqq->bfqd->hw_tag; ++} ++ ++/** ++ * bfq_bfqq_expire - expire a queue. ++ * @bfqd: device owning the queue. ++ * @bfqq: the queue to expire. ++ * @compensate: if true, compensate for the time spent idling. ++ * @reason: the reason causing the expiration. ++ * ++ * If the process associated with bfqq does slow I/O (e.g., because it ++ * issues random requests), we charge bfqq with the time it has been ++ * in service instead of the service it has received (see ++ * bfq_bfqq_charge_time for details on how this goal is achieved). As ++ * a consequence, bfqq will typically get higher timestamps upon ++ * reactivation, and hence it will be rescheduled as if it had ++ * received more service than what it has actually received. In the ++ * end, bfqq receives less service in proportion to how slowly its ++ * associated process consumes its budgets (and hence how seriously it ++ * tends to lower the throughput). In addition, this time-charging ++ * strategy guarantees time fairness among slow processes. In ++ * contrast, if the process associated with bfqq is not slow, we ++ * charge bfqq exactly with the service it has received. ++ * ++ * Charging time to the first type of queues and the exact service to ++ * the other has the effect of using the WF2Q+ policy to schedule the ++ * former on a timeslice basis, without violating service domain ++ * guarantees among the latter. ++ */ ++static void bfq_bfqq_expire(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool compensate, ++ enum bfqq_expiration reason) ++{ ++ bool slow; ++ unsigned long delta = 0; ++ struct bfq_entity *entity = &bfqq->entity; ++ int ref; ++ ++ BUG_ON(bfqq != bfqd->in_service_queue); ++ ++ /* ++ * Check whether the process is slow (see bfq_bfqq_is_slow). ++ */ ++ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); ++ ++ /* ++ * As above explained, charge slow (typically seeky) and ++ * timed-out queues with the time and not the service ++ * received, to favor sequential workloads. ++ * ++ * Processes doing I/O in the slower disk zones will tend to ++ * be slow(er) even if not seeky. Therefore, since the ++ * estimated peak rate is actually an average over the disk ++ * surface, these processes may timeout just for bad luck. 
To ++ * avoid punishing them, do not charge time to processes that ++ * succeeded in consuming at least 2/3 of their budget. This ++ * allows BFQ to preserve enough elasticity to still perform ++ * bandwidth, and not time, distribution with little unlucky ++ * or quasi-sequential processes. ++ */ ++ if (bfqq->wr_coeff == 1 && ++ (slow || ++ (reason == BFQ_BFQQ_BUDGET_TIMEOUT && ++ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) ++ bfq_bfqq_charge_time(bfqd, bfqq, delta); ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ if (reason == BFQ_BFQQ_TOO_IDLE && ++ entity->service <= 2 * entity->budget / 10) ++ bfq_clear_bfqq_IO_bound(bfqq); ++ ++ if (bfqd->low_latency && bfqq->wr_coeff == 1) ++ bfqq->last_wr_start_finish = jiffies; ++ ++ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && ++ RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ /* ++ * If we get here, and there are no outstanding ++ * requests, then the request pattern is isochronous ++ * (see the comments on the function ++ * bfq_bfqq_softrt_next_start()). Thus we can compute ++ * soft_rt_next_start. And we do it, unless bfqq is in ++ * interactive weight raising. We do not do it in the ++ * latter subcase, for the following reason. bfqq may ++ * be conveying the I/O needed to load a soft ++ * real-time application. Such an application will ++ * actually exhibit a soft real-time I/O pattern after ++ * it finally starts doing its job. But, if ++ * soft_rt_next_start is computed here for an ++ * interactive bfqq, and bfqq had received a lot of ++ * service before remaining with no outstanding ++ * request (likely to happen on a fast device), then ++ * soft_rt_next_start would be assigned such a high ++ * value that, for a very long time, bfqq would be ++ * prevented from being possibly considered as soft ++ * real time. ++ * ++ * If, instead, the queue still has outstanding ++ * requests, then we have to wait for the completion ++ * of all the outstanding requests to discover whether ++ * the request pattern is actually isochronous. ++ */ ++ BUG_ON(bfq_tot_busy_queues(bfqd) < 1); ++ if (bfqq->dispatched == 0 && ++ bfqq->wr_coeff != bfqd->bfq_wr_coeff) { ++ bfqq->soft_rt_next_start = ++ bfq_bfqq_softrt_next_start(bfqd, bfqq); ++ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", ++ bfqq->soft_rt_next_start); ++ } else if (bfqq->dispatched > 0) { ++ /* ++ * Schedule an update of soft_rt_next_start to when ++ * the task may be discovered to be isochronous. ++ */ ++ bfq_mark_bfqq_softrt_update(bfqq); ++ } ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "expire (%s, slow %d, num_disp %d, short %d, weight %d, serv %d/%d)", ++ reason_name[reason], slow, bfqq->dispatched, ++ bfq_bfqq_has_short_ttime(bfqq), entity->weight, ++ entity->service, entity->budget); ++ ++ /* ++ * Increase, decrease or leave budget unchanged according to ++ * reason. 
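++	 * Roughly: the budget may be cut after BFQ_BFQQ_TOO_IDLE
++	 * (unless requests are still outstanding), is doubled after
++	 * BFQ_BFQQ_BUDGET_TIMEOUT, is quadrupled after
++	 * BFQ_BFQQ_BUDGET_EXHAUSTED, and is set to the service
++	 * actually received after BFQ_BFQQ_NO_MORE_REQUESTS (see
++	 * __bfq_bfqq_recalc_budget()).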
++ */ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); ++ BUG_ON(bfqq->next_rq == NULL && ++ bfqq->entity.budget < bfqq->entity.service); ++ ref = bfqq->ref; ++ __bfq_bfqq_expire(bfqd, bfqq); ++ ++ if (ref == 1) /* bfqq is gone, no more actions on it */ ++ return; ++ ++ BUG_ON(ref > 1 && ++ !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && ++ !bfq_class_idle(bfqq)); ++ ++ bfqq->injected_service = 0; ++ ++ /* mark bfqq as waiting a request only if a bic still points to it */ ++ if (!bfq_bfqq_busy(bfqq) && ++ reason != BFQ_BFQQ_BUDGET_TIMEOUT && ++ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) { ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ BUG_ON(bfqq->next_rq); ++ bfq_mark_bfqq_non_blocking_wait_rq(bfqq); ++ /* ++ * Not setting service to 0, because, if the next rq ++ * arrives in time, the queue will go on receiving ++ * service with this same budget (as if it never expired) ++ */ ++ } else { ++ entity->service = 0; ++ bfq_log_bfqq(bfqd, bfqq, "resetting service"); ++ } ++ ++ /* ++ * Reset the received-service counter for every parent entity. ++ * Differently from what happens with bfqq->entity.service, ++ * the resetting of this counter never needs to be postponed ++ * for parent entities. In fact, in case bfqq may have a ++ * chance to go on being served using the last, partially ++ * consumed budget, bfqq->entity.service needs to be kept, ++ * because if bfqq then actually goes on being served using ++ * the same budget, the last value of bfqq->entity.service is ++ * needed to properly decrement bfqq->entity.budget by the ++ * portion already consumed. In contrast, it is not necessary ++ * to keep entity->service for parent entities too, because ++ * the bubble up of the new value of bfqq->entity.budget will ++ * make sure that the budgets of parent entities are correct, ++ * even in case bfqq and thus parent entities go on receiving ++ * service with the same budget. ++ */ ++ entity = entity->parent; ++ for_each_entity(entity) ++ entity->service = 0; ++} ++ ++/* ++ * Budget timeout is not implemented through a dedicated timer, but ++ * just checked on request arrivals and completions, as well as on ++ * idle timer expirations. ++ */ ++static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) ++{ ++ return time_is_before_eq_jiffies(bfqq->budget_timeout); ++} ++ ++/* ++ * If we expire a queue that is actively waiting (i.e., with the ++ * device idled) for the arrival of a new request, then we may incur ++ * the timestamp misalignment problem described in the body of the ++ * function __bfq_activate_entity. Hence we return true only if this ++ * condition does not hold, or if the queue is slow enough to deserve ++ * only to be kicked off for preserving a high throughput. 
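++ *
++ * In practice, for a queue that is actively waiting for a new
++ * request, the check below allows expiration on budget timeout only
++ * if at least a third of the budget is still unused, i.e., only if
++ * the queue has been consuming its budget slowly enough.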
++ */ ++static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "wait_request %d left %d timeout %d", ++ bfq_bfqq_wait_request(bfqq), ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, ++ bfq_bfqq_budget_timeout(bfqq)); ++ ++ return (!bfq_bfqq_wait_request(bfqq) || ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) ++ && ++ bfq_bfqq_budget_timeout(bfqq); ++} ++ ++static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ bool rot_without_queueing = ++ !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, ++ bfqq_sequential_and_IO_bound, ++ idling_boosts_thr; ++ ++ bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && ++ bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); ++ /* ++ * The next variable takes into account the cases where idling ++ * boosts the throughput. ++ * ++ * The value of the variable is computed considering, first, that ++ * idling is virtually always beneficial for the throughput if: ++ * (a) the device is not NCQ-capable and rotational, or ++ * (b) regardless of the presence of NCQ, the device is rotational and ++ * the request pattern for bfqq is I/O-bound and sequential, or ++ * (c) regardless of whether it is rotational, the device is ++ * not NCQ-capable and the request pattern for bfqq is ++ * I/O-bound and sequential. ++ * ++ * Secondly, and in contrast to the above item (b), idling an ++ * NCQ-capable flash-based device would not boost the ++ * throughput even with sequential I/O; rather it would lower ++ * the throughput in proportion to how fast the device ++ * is. Accordingly, the next variable is true if any of the ++ * above conditions (a), (b) or (c) is true, and, in ++ * particular, happens to be false if bfqd is an NCQ-capable ++ * flash-based device. ++ */ ++ idling_boosts_thr = rot_without_queueing || ++ ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && ++ bfqq_sequential_and_IO_bound); ++ ++ bfq_log_bfqq(bfqd, bfqq, "idling_boosts_thr %d", idling_boosts_thr); ++ ++ /* ++ * The return value of this function is equal to that of ++ * idling_boosts_thr, unless a special case holds. In this ++ * special case, described below, idling may cause problems to ++ * weight-raised queues. ++ * ++ * When the request pool is saturated (e.g., in the presence ++ * of write hogs), if the processes associated with ++ * non-weight-raised queues ask for requests at a lower rate, ++ * then processes associated with weight-raised queues have a ++ * higher probability to get a request from the pool ++ * immediately (or at least soon) when they need one. Thus ++ * they have a higher probability to actually get a fraction ++ * of the device throughput proportional to their high ++ * weight. This is especially true with NCQ-capable drives, ++ * which enqueue several requests in advance, and further ++ * reorder internally-queued requests. ++ * ++ * For this reason, we force to false the return value if ++ * there are weight-raised busy queues. In this case, and if ++ * bfqq is not weight-raised, this guarantees that the device ++ * is not idled for bfqq (if, instead, bfqq is weight-raised, ++ * then idling will be guaranteed by another variable, see ++ * below). 
Combined with the timestamping rules of BFQ (see ++ * [1] for details), this behavior causes bfqq, and hence any ++ * sync non-weight-raised queue, to get a lower number of ++ * requests served, and thus to ask for a lower number of ++ * requests from the request pool, before the busy ++ * weight-raised queues get served again. This often mitigates ++ * starvation problems in the presence of heavy write ++ * workloads and NCQ, thereby guaranteeing a higher ++ * application and system responsiveness in these hostile ++ * scenarios. ++ */ ++ return idling_boosts_thr && ++ bfqd->wr_busy_queues == 0; ++} ++ ++/* ++ * There is a case where idling must be performed not for ++ * throughput concerns, but to preserve service guarantees. ++ * ++ * To introduce this case, we can note that allowing the drive ++ * to enqueue more than one request at a time, and hence ++ * delegating de facto final scheduling decisions to the ++ * drive's internal scheduler, entails loss of control on the ++ * actual request service order. In particular, the critical ++ * situation is when requests from different processes happen ++ * to be present, at the same time, in the internal queue(s) ++ * of the drive. In such a situation, the drive, by deciding ++ * the service order of the internally-queued requests, does ++ * determine also the actual throughput distribution among ++ * these processes. But the drive typically has no notion or ++ * concern about per-process throughput distribution, and ++ * makes its decisions only on a per-request basis. Therefore, ++ * the service distribution enforced by the drive's internal ++ * scheduler is likely to coincide with the desired ++ * device-throughput distribution only in a completely ++ * symmetric scenario where: ++ * (i) each of these processes must get the same throughput as ++ * the others; ++ * (ii) the I/O of each process has the same properties, in ++ * terms of locality (sequential or random), direction ++ * (reads or writes), request sizes, greediness ++ * (from I/O-bound to sporadic), and so on. ++ * In fact, in such a scenario, the drive tends to treat ++ * the requests of each of these processes in about the same ++ * way as the requests of the others, and thus to provide ++ * each of these processes with about the same throughput ++ * (which is exactly the desired throughput distribution). In ++ * contrast, in any asymmetric scenario, device idling is ++ * certainly needed to guarantee that bfqq receives its ++ * assigned fraction of the device throughput (see [1] for ++ * details). ++ * The problem is that idling may significantly reduce ++ * throughput with certain combinations of types of I/O and ++ * devices. An important example is sync random I/O, on flash ++ * storage with command queueing. So, unless bfqq falls in the ++ * above cases where idling also boosts throughput, it would ++ * be important to check conditions (i) and (ii) accurately, ++ * so as to avoid idling when not strictly needed for service ++ * guarantees. ++ * ++ * Unfortunately, it is extremely difficult to thoroughly ++ * check condition (ii). And, in case there are active groups, ++ * it becomes very difficult to check condition (i) too. In ++ * fact, if there are active groups, then, for condition (i) ++ * to become false, it is enough that an active group contains ++ * more active processes or sub-groups than some other active ++ * group. 
More precisely, for condition (i) to hold because of ++ * such a group, it is not even necessary that the group is ++ * (still) active: it is sufficient that, even if the group ++ * has become inactive, some of its descendant processes still ++ * have some request already dispatched but still waiting for ++ * completion. In fact, requests have still to be guaranteed ++ * their share of the throughput even after being ++ * dispatched. In this respect, it is easy to show that, if a ++ * group frequently becomes inactive while still having ++ * in-flight requests, and if, when this happens, the group is ++ * not considered in the calculation of whether the scenario ++ * is asymmetric, then the group may fail to be guaranteed its ++ * fair share of the throughput (basically because idling may ++ * not be performed for the descendant processes of the group, ++ * but it had to be). We address this issue with the ++ * following bi-modal behavior, implemented in the function ++ * bfq_symmetric_scenario(). ++ * ++ * If there are groups with requests waiting for completion ++ * (as commented above, some of these groups may even be ++ * already inactive), then the scenario is tagged as ++ * asymmetric, conservatively, without checking any of the ++ * conditions (i) and (ii). So the device is idled for bfqq. ++ * This behavior matches also the fact that groups are created ++ * exactly if controlling I/O is a primary concern (to ++ * preserve bandwidth and latency guarantees). ++ * ++ * On the opposite end, if there are no groups with requests ++ * waiting for completion, then only condition (i) is actually ++ * controlled, i.e., provided that condition (i) holds, idling ++ * is not performed, regardless of whether condition (ii) ++ * holds. In other words, only if condition (i) does not hold, ++ * then idling is allowed, and the device tends to be ++ * prevented from queueing many requests, possibly of several ++ * processes. Since there are no groups with requests waiting ++ * for completion, then, to control condition (i) it is enough ++ * to check just whether all the queues with requests waiting ++ * for completion also have the same weight. ++ * ++ * Not checking condition (ii) evidently exposes bfqq to the ++ * risk of getting less throughput than its fair share. ++ * However, for queues with the same weight, a further ++ * mechanism, preemption, mitigates or even eliminates this ++ * problem. And it does so without consequences on overall ++ * throughput. This mechanism and its benefits are explained ++ * in the next three paragraphs. ++ * ++ * Even if a queue, say Q, is expired when it remains idle, Q ++ * can still preempt the new in-service queue if the next ++ * request of Q arrives soon (see the comments on ++ * bfq_bfqq_update_budg_for_activation). If all queues and ++ * groups have the same weight, this form of preemption, ++ * combined with the hole-recovery heuristic described in the ++ * comments on function bfq_bfqq_update_budg_for_activation, ++ * are enough to preserve a correct bandwidth distribution in ++ * the mid term, even without idling. In fact, even if not ++ * idling allows the internal queues of the device to contain ++ * many requests, and thus to reorder requests, we can rather ++ * safely assume that the internal scheduler still preserves a ++ * minimum of mid-term fairness. ++ * ++ * More precisely, this preemption-based, idleless approach ++ * provides fairness in terms of IOPS, and not sectors per ++ * second. This can be seen with a simple example. 
Suppose ++ * that there are two queues with the same weight, but that ++ * the first queue receives requests of 8 sectors, while the ++ * second queue receives requests of 1024 sectors. In ++ * addition, suppose that each of the two queues contains at ++ * most one request at a time, which implies that each queue ++ * always remains idle after it is served. Finally, after ++ * remaining idle, each queue receives very quickly a new ++ * request. It follows that the two queues are served ++ * alternatively, preempting each other if needed. This ++ * implies that, although both queues have the same weight, ++ * the queue with large requests receives a service that is ++ * 1024/8 times as high as the service received by the other ++ * queue. ++ * ++ * The motivation for using preemption instead of idling (for ++ * queues with the same weight) is that, by not idling, ++ * service guarantees are preserved (completely or at least in ++ * part) without minimally sacrificing throughput. And, if ++ * there is no active group, then the primary expectation for ++ * this device is probably a high throughput. ++ * ++ * We are now left only with explaining the additional ++ * compound condition that is checked below for deciding ++ * whether the scenario is asymmetric. To explain this ++ * compound condition, we need to add that the function ++ * bfq_symmetric_scenario checks the weights of only ++ * non-weight-raised queues, for efficiency reasons (see ++ * comments on bfq_weights_tree_add()). Then the fact that ++ * bfqq is weight-raised is checked explicitly here. More ++ * precisely, the compound condition below takes into account ++ * also the fact that, even if bfqq is being weight-raised, ++ * the scenario is still symmetric if all queues with requests ++ * waiting for completion happen to be ++ * weight-raised. Actually, we should be even more precise ++ * here, and differentiate between interactive weight raising ++ * and soft real-time weight raising. ++ * ++ * As a side note, it is worth considering that the above ++ * device-idling countermeasures may however fail in the ++ * following unlucky scenario: if idling is (correctly) ++ * disabled in a time period during which all symmetry ++ * sub-conditions hold, and hence the device is allowed to ++ * enqueue many requests, but at some later point in time some ++ * sub-condition stops to hold, then it may become impossible ++ * to let requests be served in the desired order until all ++ * the requests already queued in the device have been served. ++ */ ++static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ bool asymmetric_scenario = (bfqq->wr_coeff > 1 && ++ bfqd->wr_busy_queues < ++ bfq_tot_busy_queues(bfqd)) || ++ !bfq_symmetric_scenario(bfqd); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "wr_coeff %d wr_busy %d busy %d asymmetric %d", ++ bfqq->wr_coeff, ++ bfqd->wr_busy_queues, ++ bfq_tot_busy_queues(bfqd), ++ asymmetric_scenario); ++ ++ return asymmetric_scenario; ++} ++ ++/* ++ * For a queue that becomes empty, device idling is allowed only if ++ * this function returns true for that queue. As a consequence, since ++ * device idling plays a critical role for both throughput boosting ++ * and service guarantees, the return value of this function plays a ++ * critical role as well. 
++ *
++ * In a nutshell, this function returns true only if idling is
++ * beneficial for throughput or, even if detrimental for throughput,
++ * idling is however necessary to preserve service guarantees (low
++ * latency, desired throughput distribution, ...). In particular, on
++ * NCQ-capable devices, this function tries to return false, so as to
++ * help keep the drives' internal queues full, whenever this helps the
++ * device boost the throughput without causing any service-guarantee
++ * issue.
++ *
++ * Most of the issues taken into account to get the return value of
++ * this function are not trivial. We discuss these issues in the two
++ * functions providing the main pieces of information needed by this
++ * function.
++ */
++static bool bfq_better_to_idle(struct bfq_queue *bfqq)
++{
++	struct bfq_data *bfqd = bfqq->bfqd;
++	bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar;
++
++	if (unlikely(bfqd->strict_guarantees))
++		return true;
++
++	/*
++	 * Idling is performed only if slice_idle > 0. In addition, we
++	 * do not idle if
++	 * (a) bfqq is async
++	 * (b) bfqq is in the idle io prio class: in this case we do
++	 * not idle because we want to minimize the bandwidth that
++	 * queues in this class can steal from higher-priority queues
++	 */
++	if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
++	    bfq_class_idle(bfqq))
++		return false;
++
++	idling_boosts_thr_with_no_issue =
++		idling_boosts_thr_without_issues(bfqd, bfqq);
++
++	idling_needed_for_service_guar =
++		idling_needed_for_service_guarantees(bfqd, bfqq);
++
++	/*
++	 * We have now the two components we need to compute the
++	 * return value of the function, which is true only if idling
++	 * either boosts the throughput (without issues), or is
++	 * necessary to preserve service guarantees.
++	 */
++	bfq_log_bfqq(bfqd, bfqq,
++		     "wr_busy %d boosts %d IO-bound %d guar %d",
++		     bfqd->wr_busy_queues,
++		     idling_boosts_thr_with_no_issue,
++		     bfq_bfqq_IO_bound(bfqq),
++		     idling_needed_for_service_guar);
++
++	return idling_boosts_thr_with_no_issue ||
++		idling_needed_for_service_guar;
++}
++
++/*
++ * If the in-service queue is empty but the function bfq_better_to_idle
++ * returns true, then:
++ * 1) the queue must remain in service and cannot be expired, and
++ * 2) the device must be idled to wait for the possible arrival of a new
++ *    request for the queue.
++ * See the comments on the function bfq_better_to_idle for the reasons
++ * why performing device idling is the best choice to boost the throughput
++ * and preserve service guarantees when bfq_better_to_idle itself
++ * returns true.
++ */
++static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
++{
++	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq);
++}
++
++static struct bfq_queue *bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
++{
++	struct bfq_queue *bfqq;
++
++	/*
++	 * A linear search; but, with a high probability, very few
++	 * steps are needed to find a candidate queue, i.e., a queue
++	 * with enough budget left for its next request. In fact:
++	 * - BFQ dynamically updates the budget of every queue so as
++	 *   to accommodate the expected backlog of the queue;
++	 * - if a queue gets all its requests dispatched as injected
++	 *   service, then the queue is removed from the active list
++	 *   (and re-added only if it gets new requests, but with
++	 *   enough budget for its new backlog).
++ */ ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) ++ if (!RB_EMPTY_ROOT(&bfqq->sort_list) && ++ bfq_serv_to_charge(bfqq->next_rq, bfqq) <= ++ bfq_bfqq_budget_left(bfqq)) { ++ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); ++ return bfqq; ++ } ++ ++ bfq_log(bfqd, "no queue found"); ++ return NULL; ++} ++ ++/* ++ * Select a queue for service. If we have a current queue in service, ++ * check whether to continue servicing it, or retrieve and set a new one. ++ */ ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq; ++ struct request *next_rq; ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ ++ bfqq = bfqd->in_service_queue; ++ if (!bfqq) ++ goto new_queue; ++ ++ bfq_log_bfqq(bfqd, bfqq, "already in-service queue"); ++ ++ /* ++ * Do not expire bfqq for budget timeout if bfqq may be about ++ * to enjoy device idling. The reason why, in this case, we ++ * prevent bfqq from expiring is the same as in the comments ++ * on the case where bfq_bfqq_must_idle() returns true, in ++ * bfq_completed_request(). ++ */ ++ if (bfq_may_expire_for_budg_timeout(bfqq) && ++ !bfq_bfqq_must_idle(bfqq)) ++ goto expire; ++ ++check_queue: ++ /* ++ * This loop is rarely executed more than once. Even when it ++ * happens, it is much more convenient to re-execute this loop ++ * than to return NULL and trigger a new dispatch to get a ++ * request served. ++ */ ++ next_rq = bfqq->next_rq; ++ /* ++ * If bfqq has requests queued and it has enough budget left to ++ * serve them, keep the queue, otherwise expire it. ++ */ ++ if (next_rq) { ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ if (bfq_serv_to_charge(next_rq, bfqq) > ++ bfq_bfqq_budget_left(bfqq)) { ++ /* ++ * Expire the queue for budget exhaustion, ++ * which makes sure that the next budget is ++ * enough to serve the next request, even if ++ * it comes from the fifo expired path. ++ */ ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; ++ goto expire; ++ } else { ++ /* ++ * The idle timer may be pending because we may ++ * not disable disk idling even when a new request ++ * arrives. ++ */ ++ if (bfq_bfqq_wait_request(bfqq)) { ++ BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); ++ /* ++ * If we get here: 1) at least a new request ++ * has arrived but we have not disabled the ++ * timer because the request was too small, ++ * 2) then the block layer has unplugged ++ * the device, causing the dispatch to be ++ * invoked. ++ * ++ * Since the device is unplugged, now the ++ * requests are probably large enough to ++ * provide a reasonable throughput. ++ * So we disable idling. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); ++ bfqg_stats_update_idle_time(bfqq_group(bfqq)); ++ } ++ goto keep_queue; ++ } ++ } ++ ++ /* ++ * No requests pending. However, if the in-service queue is idling ++ * for a new request, or has requests waiting for a completion and ++ * may idle after their completion, then keep it anyway. ++ * ++ * Yet, to boost throughput, inject service from other queues if ++ * possible. 
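++	 * Injection is allowed only as long as the service already
++	 * injected stays below ten times the service received by bfqq
++	 * itself (given the initial inject_coeff of 1 set in
++	 * bfq_init_bfqq()), i.e., injection may use up to roughly 90%
++	 * of the total service.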
++ */ ++ if (hrtimer_active(&bfqd->idle_slice_timer) || ++ (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { ++ if (bfq_bfqq_injectable(bfqq) && ++ bfqq->injected_service * bfqq->inject_coeff < ++ bfqq->entity.service * 10) { ++ bfq_log_bfqq(bfqd, bfqq, "looking for queue for injection"); ++ bfqq = bfq_choose_bfqq_for_injection(bfqd); ++ } else { ++ if (BFQQ_SEEKY(bfqq)) ++ bfq_log_bfqq(bfqd, bfqq, ++ "injection saturated %d * %d >= %d * 10", ++ bfqq->injected_service, bfqq->inject_coeff, ++ bfqq->entity.service); ++ bfqq = NULL; ++ } ++ goto keep_queue; ++ } ++ ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS; ++expire: ++ bfq_bfqq_expire(bfqd, bfqq, false, reason); ++new_queue: ++ bfqq = bfq_set_in_service_queue(bfqd); ++ if (bfqq) { ++ bfq_log_bfqq(bfqd, bfqq, "checking new queue"); ++ goto check_queue; ++ } ++keep_queue: ++ if (bfqq) ++ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); ++ else ++ bfq_log(bfqd, "no queue returned"); ++ ++ return bfqq; ++} ++ ++static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ ++ BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && ++ time_is_after_jiffies(bfqq->last_wr_start_finish)); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", ++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time), ++ bfqq->wr_coeff, ++ bfqq->entity.weight, bfqq->entity.orig_weight); ++ ++ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != ++ entity->orig_weight * bfqq->wr_coeff); ++ if (entity->prio_changed) ++ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); ++ ++ /* ++ * If the queue was activated in a burst, or too much ++ * time has elapsed from the beginning of this ++ * weight-raising period, then end weight raising. ++ */ ++ if (bfq_bfqq_in_large_burst(bfqq)) ++ bfq_bfqq_end_wr(bfqq); ++ else if (time_is_before_jiffies(bfqq->last_wr_start_finish + ++ bfqq->wr_cur_max_time)) { ++ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || ++ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + ++ bfq_wr_duration(bfqd))) ++ bfq_bfqq_end_wr(bfqq); ++ else { ++ switch_back_to_interactive_wr(bfqq, bfqd); ++ BUG_ON(time_is_after_jiffies( ++ bfqq->last_wr_start_finish)); ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "back to interactive wr"); ++ } ++ } ++ if (bfqq->wr_coeff > 1 && ++ bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time && ++ bfqq->service_from_wr > max_service_from_wr) { ++ /* see comments on max_service_from_wr */ ++ bfq_bfqq_end_wr(bfqq); ++ bfq_log_bfqq(bfqd, bfqq, ++ "too much service"); ++ } ++ } ++ /* ++ * To improve latency (for this or other queues), immediately ++ * update weight both if it must be raised and if it must be ++ * lowered. Since, entity may be on some active tree here, and ++ * might have a pending change of its ioprio class, invoke ++ * next function with the last parameter unset (see the ++ * comments on the function). ++ */ ++ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) ++ __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity), ++ entity, false); ++} ++ ++/* ++ * Dispatch one request from bfqq, moving it to the request queue ++ * dispatch list. 
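++ *
++ * If bfqq is not the in-service queue, then the dispatch is an
++ * injection, and the service charged for the request is also added
++ * to the in-service queue's injected_service (see the code below),
++ * so that the injection limit checked in bfq_select_queue() remains
++ * enforced.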
++ */
++static int bfq_dispatch_request(struct bfq_data *bfqd,
++				struct bfq_queue *bfqq)
++{
++	int dispatched = 0;
++	struct request *rq = bfqq->next_rq;
++	unsigned long service_to_charge;
++
++	BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
++	BUG_ON(!rq);
++	service_to_charge = bfq_serv_to_charge(rq, bfqq);
++
++	BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq));
++
++	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
++
++	bfq_bfqq_served(bfqq, service_to_charge);
++
++	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
++
++	bfq_dispatch_insert(bfqd->queue, rq);
++
++	bfq_log_bfqq(bfqd, bfqq,
++	"dispatched %u sec req (%llu), budg left %d, new disp_nr %d",
++		     blk_rq_sectors(rq),
++		     (unsigned long long) blk_rq_pos(rq),
++		     bfq_bfqq_budget_left(bfqq),
++		     bfqq->dispatched);
++
++	dispatched++;
++
++	if (bfqq != bfqd->in_service_queue) {
++		if (likely(bfqd->in_service_queue)) {
++			bfqd->in_service_queue->injected_service +=
++				bfq_serv_to_charge(rq, bfqq);
++			bfq_log_bfqq(bfqd, bfqd->in_service_queue,
++				     "injected_service increased to %d",
++				     bfqd->in_service_queue->injected_service);
++		}
++		return dispatched;
++	}
++
++	/*
++	 * If weight raising has to terminate for bfqq, then the next
++	 * function causes an immediate update of bfqq's weight,
++	 * without waiting for next activation. As a consequence, on
++	 * expiration, bfqq will be timestamped as if it had never been
++	 * weight-raised during this service slot, even if it has
++	 * received part or even most of the service as a
++	 * weight-raised queue. This inflates bfqq's timestamps, which
++	 * is beneficial, as bfqq is then more willing to leave the
++	 * device immediately to possible other weight-raised queues.
++	 */
++	bfq_update_wr_data(bfqd, bfqq);
++
++	if (!bfqd->in_service_bic) {
++		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
++		bfqd->in_service_bic = RQ_BIC(rq);
++		BUG_ON(!bfqd->in_service_bic);
++	}
++
++	if (bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq))
++		goto expire;
++
++	return dispatched;
++
++expire:
++	bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED);
++	return dispatched;
++}
++
++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
++{
++	int dispatched = 0;
++
++	while (bfqq->next_rq) {
++		bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
++		dispatched++;
++	}
++
++	BUG_ON(!list_empty(&bfqq->fifo));
++	return dispatched;
++}
++
++/*
++ * Drain our current requests.
++ * Used for barriers and when switching io schedulers on-the-fly.
++ */
++static int bfq_forced_dispatch(struct bfq_data *bfqd)
++{
++	struct bfq_queue *bfqq, *n;
++	struct bfq_service_tree *st;
++	int dispatched = 0;
++
++	bfqq = bfqd->in_service_queue;
++	if (bfqq)
++		__bfq_bfqq_expire(bfqd, bfqq);
++
++	/*
++	 * Loop through classes, and be careful to leave the scheduler
++	 * in a consistent state, as feedback mechanisms and vtime
++	 * updates cannot be disabled during the process.
++	 */
++	list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
++		st = bfq_entity_service_tree(&bfqq->entity);
++
++		dispatched += __bfq_forced_dispatch_bfqq(bfqq);
++
++		bfqq->max_budget = bfq_max_budget(bfqd);
++		bfq_forget_idle(st);
++	}
++
++	BUG_ON(bfq_tot_busy_queues(bfqd) != 0);
++
++	return dispatched;
++}
++
++static int bfq_dispatch_requests(struct request_queue *q, int force)
++{
++	struct bfq_data *bfqd = q->elevator->elevator_data;
++	struct bfq_queue *bfqq;
++
++	bfq_log(bfqd, "%d busy queues", bfq_tot_busy_queues(bfqd));
++
++	if (bfq_tot_busy_queues(bfqd) == 0)
++		return 0;
++
++	if (unlikely(force))
++		return bfq_forced_dispatch(bfqd);
++
++	/*
++	 * Force device to serve one request at a time if
++	 * strict_guarantees is true. Forcing this service scheme is
++	 * currently the ONLY way to guarantee that the request
++	 * service order enforced by the scheduler is respected by a
++	 * queueing device. Otherwise the device is free even to make
++	 * some unlucky request wait for as long as the device
++	 * wishes.
++	 *
++	 * Of course, serving one request at a time may cause loss of
++	 * throughput.
++	 */
++	if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
++		return 0;
++
++	bfqq = bfq_select_queue(bfqd);
++	if (!bfqq)
++		return 0;
++
++	BUG_ON(bfqq == bfqd->in_service_queue &&
++	       bfqq->entity.budget < bfqq->entity.service);
++
++	BUG_ON(bfqq == bfqd->in_service_queue &&
++	       bfq_bfqq_wait_request(bfqq));
++
++	if (!bfq_dispatch_request(bfqd, bfqq))
++		return 0;
++
++	bfq_log_bfqq(bfqd, bfqq, "%s request",
++		     bfq_bfqq_sync(bfqq) ? "sync" : "async");
++
++	BUG_ON(bfqq->next_rq == NULL &&
++	       bfqq->entity.budget < bfqq->entity.service);
++	return 1;
++}
++
++/*
++ * Task holds one reference to the queue, dropped when task exits. Each rq
++ * in-flight on this queue also holds a reference, dropped when rq is freed.
++ *
++ * Queue lock must be held here. Recall not to use bfqq after calling
++ * this function on it.
++ */
++static void bfq_put_queue(struct bfq_queue *bfqq)
++{
++#ifdef BFQ_GROUP_IOSCHED_ENABLED
++	struct bfq_group *bfqg = bfqq_group(bfqq);
++#endif
++
++	BUG_ON(bfqq->ref <= 0);
++
++	bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", bfqq, bfqq->ref);
++	bfqq->ref--;
++	if (bfqq->ref)
++		return;
++
++	BUG_ON(rb_first(&bfqq->sort_list));
++	BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
++	BUG_ON(bfqq->entity.tree);
++	BUG_ON(bfq_bfqq_busy(bfqq));
++
++	if (!hlist_unhashed(&bfqq->burst_list_node)) {
++		hlist_del_init(&bfqq->burst_list_node);
++		/*
++		 * Decrement also burst size after the removal, if the
++		 * process associated with bfqq is exiting, and thus
++		 * does not contribute to the burst any longer. This
++		 * decrement helps filter out false positives of large
++		 * bursts, when some short-lived process (often due to
++		 * the execution of commands by some service) happens
++		 * to start and exit while a complex application is
++		 * starting, and thus spawning several processes that
++		 * do I/O (and that *must not* be treated as a large
++		 * burst, see comments on bfq_handle_burst).
++		 *
++		 * In particular, the decrement is performed only if:
++		 * 1) bfqq is not a merged queue, because, if it is,
++		 * then this free of bfqq is not triggered by the exit
++		 * of the process bfqq is associated with, but exactly
++		 * by the fact that bfqq has just been merged.
++		 * 2) burst_size is greater than 0, to handle
++		 * unbalanced decrements.
Unbalanced decrements may
++		 * happen in the following case: bfqq is inserted into
++		 * the current burst list--without incrementing
++		 * burst_size--because of a split, but the current
++		 * burst list is not the burst list bfqq belonged to
++		 * (see comments on the case of a split in
++		 * bfq_set_request).
++		 */
++		if (bfqq->bic && bfqq->bfqd->burst_size > 0)
++			bfqq->bfqd->burst_size--;
++	}
++
++	bfq_log_bfqq(bfqq->bfqd, bfqq, "%p freed", bfqq);
++
++	kmem_cache_free(bfq_pool, bfqq);
++#ifdef BFQ_GROUP_IOSCHED_ENABLED
++	bfqg_put(bfqg);
++#endif
++}
++
++static void bfq_put_cooperator(struct bfq_queue *bfqq)
++{
++	struct bfq_queue *__bfqq, *next;
++
++	/*
++	 * If this queue was scheduled to merge with another queue, be
++	 * sure to drop the reference taken on that queue (and others in
++	 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
++	 */
++	__bfqq = bfqq->new_bfqq;
++	while (__bfqq) {
++		if (__bfqq == bfqq)
++			break;
++		next = __bfqq->new_bfqq;
++		bfq_put_queue(__bfqq);
++		__bfqq = next;
++	}
++}
++
++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
++{
++	if (bfqq == bfqd->in_service_queue) {
++		__bfq_bfqq_expire(bfqd, bfqq);
++		bfq_schedule_dispatch(bfqd);
++	}
++
++	bfq_log_bfqq(bfqd, bfqq, "%p, %d", bfqq, bfqq->ref);
++
++	bfq_put_cooperator(bfqq);
++
++	bfq_put_queue(bfqq); /* release process reference */
++}
++
++static void bfq_init_icq(struct io_cq *icq)
++{
++	icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32);
++}
++
++static void bfq_exit_icq(struct io_cq *icq)
++{
++	struct bfq_io_cq *bic = icq_to_bic(icq);
++	struct bfq_data *bfqd = bic_to_bfqd(bic);
++
++	if (bic_to_bfqq(bic, false)) {
++		bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false));
++		bic_set_bfqq(bic, NULL, false);
++	}
++
++	if (bic_to_bfqq(bic, true)) {
++		/*
++		 * If the bic is using a shared queue, put the reference
++		 * taken on the io_context when the bic started using a
++		 * shared bfq_queue.
++		 */
++		if (bfq_bfqq_coop(bic_to_bfqq(bic, true)))
++			put_io_context(icq->ioc);
++		bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true));
++		bic_set_bfqq(bic, NULL, true);
++	}
++}
++
++/*
++ * Update the entity prio values; note that the new values will not
++ * be used until the next (re)activation.
++ */
++static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq,
++				     struct bfq_io_cq *bic)
++{
++	struct task_struct *tsk = current;
++	int ioprio_class;
++
++	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
++	switch (ioprio_class) {
++	default:
++		dev_err(bfqq->bfqd->queue->backing_dev_info->dev,
++			"bfq: bad prio class %d\n", ioprio_class);
++	case IOPRIO_CLASS_NONE:
++		/*
++		 * No prio set, inherit CPU scheduling settings.
++ */ ++ bfqq->new_ioprio = task_nice_ioprio(tsk); ++ bfqq->new_ioprio_class = task_nice_ioclass(tsk); ++ break; ++ case IOPRIO_CLASS_RT: ++ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ bfqq->new_ioprio_class = IOPRIO_CLASS_RT; ++ break; ++ case IOPRIO_CLASS_BE: ++ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ bfqq->new_ioprio_class = IOPRIO_CLASS_BE; ++ break; ++ case IOPRIO_CLASS_IDLE: ++ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; ++ bfqq->new_ioprio = 7; ++ break; ++ } ++ ++ if (bfqq->new_ioprio >= IOPRIO_BE_NR) { ++ pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", ++ bfqq->new_ioprio); ++ BUG(); ++ } ++ ++ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "bic_class %d prio %d class %d", ++ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); ++} ++ ++static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) ++{ ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ struct bfq_queue *bfqq; ++ unsigned long uninitialized_var(flags); ++ int ioprio = bic->icq.ioc->ioprio; ++ ++ /* ++ * This condition may trigger on a newly created bic, be sure to ++ * drop the lock before returning. ++ */ ++ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) ++ return; ++ ++ bic->ioprio = ioprio; ++ ++ bfqq = bic_to_bfqq(bic, false); ++ if (bfqq) { ++ /* release process reference on this queue */ ++ bfq_put_queue(bfqq); ++ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); ++ bic_set_bfqq(bic, bfqq, false); ++ bfq_log_bfqq(bfqd, bfqq, ++ "bfqq %p %d", ++ bfqq, bfqq->ref); ++ } ++ ++ bfqq = bic_to_bfqq(bic, true); ++ if (bfqq) ++ bfq_set_next_ioprio_data(bfqq, bic); ++} ++ ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic, pid_t pid, int is_sync) ++{ ++ RB_CLEAR_NODE(&bfqq->entity.rb_node); ++ INIT_LIST_HEAD(&bfqq->fifo); ++ INIT_HLIST_NODE(&bfqq->burst_list_node); ++ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); ++ ++ bfqq->ref = 0; ++ bfqq->bfqd = bfqd; ++ ++ if (bic) ++ bfq_set_next_ioprio_data(bfqq, bic); ++ ++ if (is_sync) { ++ /* ++ * No need to mark as has_short_ttime if in ++ * idle_class, because no device idling is performed ++ * for queues in idle class ++ */ ++ if (!bfq_class_idle(bfqq)) ++ /* tentatively mark as has_short_ttime */ ++ bfq_mark_bfqq_has_short_ttime(bfqq); ++ bfq_mark_bfqq_sync(bfqq); ++ bfq_mark_bfqq_just_created(bfqq); ++ /* ++ * Aggressively inject a lot of service: up to 90%. ++ * This coefficient remains constant during bfqq life, ++ * but this behavior might be changed, after enough ++ * testing and tuning. ++ */ ++ bfqq->inject_coeff = 1; ++ } else ++ bfq_clear_bfqq_sync(bfqq); ++ bfq_mark_bfqq_IO_bound(bfqq); ++ ++ /* Tentative initial value to trade off between thr and lat */ ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; ++ bfqq->pid = pid; ++ ++ bfqq->wr_coeff = 1; ++ bfqq->last_wr_start_finish = jiffies; ++ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); ++ bfqq->budget_timeout = bfq_smallest_from_now(); ++ bfqq->split_time = bfq_smallest_from_now(); ++ ++ /* ++ * To not forget the possibly high bandwidth consumed by a ++ * process/queue in the recent past, ++ * bfq_bfqq_softrt_next_start() returns a value at least equal ++ * to the current value of bfqq->soft_rt_next_start (see ++ * comments on bfq_bfqq_softrt_next_start). Set ++ * soft_rt_next_start to now, to mean that bfqq has consumed ++ * no bandwidth so far. 
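For context, the bic->ioprio value decoded above reaches the kernel through the ioprio_set() system call; the class/data packing in this sketch mirrors the stable uapi ioprio.h constants (IOPRIO_CLASS_SHIFT and friends), redefined locally so the example is self-contained:

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_PRIO_VALUE(cl, data)	(((cl) << IOPRIO_CLASS_SHIFT) | (data))

#define IOPRIO_CLASS_BE		2	/* best-effort class */
#define IOPRIO_WHO_PROCESS	1

int main(void)
{
	/* best-effort class, level 3 (0 is highest, 7 is lowest) */
	int ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3);

	if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0, ioprio))
		perror("ioprio_set");
	return 0;
}

Under the ioprio-to-weight conversion this scheduler uses (bfq_ioprio_to_weight(), not shown in this hunk), level 3 would plausibly map to (IOPRIO_BE_NR - 3) * BFQ_WEIGHT_CONVERSION_COEFF = 50, assuming the coefficient of 10 defined in bfq.h further down.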
++ */ ++ bfqq->soft_rt_next_start = jiffies; ++ ++ /* first request is almost certainly seeky */ ++ bfqq->seek_history = 1; ++} ++ ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ int ioprio_class, int ioprio) ++{ ++ switch (ioprio_class) { ++ case IOPRIO_CLASS_RT: ++ return &bfqg->async_bfqq[0][ioprio]; ++ case IOPRIO_CLASS_NONE: ++ ioprio = IOPRIO_NORM; ++ /* fall through */ ++ case IOPRIO_CLASS_BE: ++ return &bfqg->async_bfqq[1][ioprio]; ++ case IOPRIO_CLASS_IDLE: ++ return &bfqg->async_idle_bfqq; ++ default: ++ BUG(); ++ } ++} ++ ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, ++ struct bio *bio, bool is_sync, ++ struct bfq_io_cq *bic) ++{ ++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); ++ struct bfq_queue **async_bfqq = NULL; ++ struct bfq_queue *bfqq; ++ struct bfq_group *bfqg; ++ ++ rcu_read_lock(); ++ ++ bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); ++ if (!bfqg) { ++ bfqq = &bfqd->oom_bfqq; ++ goto out; ++ } ++ ++ if (!is_sync) { ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ++ ioprio); ++ bfqq = *async_bfqq; ++ if (bfqq) ++ goto out; ++ } ++ ++ bfqq = kmem_cache_alloc_node(bfq_pool, ++ GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, ++ bfqd->queue->node); ++ ++ if (bfqq) { ++ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, ++ is_sync); ++ bfq_init_entity(&bfqq->entity, bfqg); ++ bfq_log_bfqq(bfqd, bfqq, "allocated"); ++ } else { ++ bfqq = &bfqd->oom_bfqq; ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); ++ goto out; ++ } ++ ++ /* ++ * Pin the queue now that it's allocated, scheduler exit will ++ * prune it. ++ */ ++ if (async_bfqq) { ++ bfqq->ref++; /* ++ * Extra group reference, w.r.t. sync ++ * queue. This extra reference is removed ++ * only if bfqq->bfqg disappears, to ++ * guarantee that this queue is not freed ++ * until its group goes away. ++ */ ++ bfq_log_bfqq(bfqd, bfqq, "bfqq not in async: %p, %d", ++ bfqq, bfqq->ref); ++ *async_bfqq = bfqq; ++ } ++ ++out: ++ bfqq->ref++; /* get a process reference to this queue */ ++ bfq_log_bfqq(bfqd, bfqq, "at end: %p, %d", bfqq, bfqq->ref); ++ rcu_read_unlock(); ++ return bfqq; ++} ++ ++static void bfq_update_io_thinktime(struct bfq_data *bfqd, ++ struct bfq_io_cq *bic) ++{ ++ struct bfq_ttime *ttime = &bic->ttime; ++ u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; ++ ++ elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); ++ ++ ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; ++ ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); ++ ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, ++ ttime->ttime_samples); ++} ++ ++static void ++bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ bfqq->seek_history <<= 1; ++ bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq); ++} ++ ++static void bfq_update_has_short_ttime(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic) ++{ ++ bool has_short_ttime = true; ++ ++ /* ++ * No need to update has_short_ttime if bfqq is async or in ++ * idle io prio class, or if bfq_slice_idle is zero, because ++ * no device idling is performed for bfqq in this case. ++ */ ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || ++ bfqd->bfq_slice_idle == 0) ++ return; ++ ++ /* Idle window just restored, statistics are meaningless. 
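The think-time update in bfq_update_io_thinktime() above is a fixed-point decaying average: ttime_samples converges to 256, so ttime_total ends up holding the mean think time scaled by 256, and the final division recovers it. A standalone sketch of the same arithmetic (illustrative values, not from the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t samples = 0, total = 0, mean;
	/* pretend the process thinks 1000ns, then 3000ns, alternating */
	uint64_t trace[] = { 1000, 3000, 1000, 3000, 1000, 3000 };

	for (unsigned int i = 0; i < sizeof(trace) / sizeof(trace[0]); i++) {
		samples = (7 * samples + 256) / 8;	/* saturates at 256 */
		total = (7 * total + 256 * trace[i]) / 8;
		mean = (total + 128) / samples;		/* +128 rounds */
		printf("sample %u: mean think time %llu ns\n", i,
		       (unsigned long long)mean);
	}
	return 0;	/* mean settles near 2000ns, biased to recent I/O */
}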
*/ ++ if (time_is_after_eq_jiffies(bfqq->split_time + ++ bfqd->bfq_wr_min_idle_time)) ++ return; ++ ++ /* Think time is infinite if no process is linked to ++ * bfqq. Otherwise check average think time to ++ * decide whether to mark as has_short_ttime ++ */ ++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || ++ (bfq_sample_valid(bic->ttime.ttime_samples) && ++ bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) ++ has_short_ttime = false; ++ ++ bfq_log_bfqq(bfqd, bfqq, "has_short_ttime %d", ++ has_short_ttime); ++ ++ if (has_short_ttime) ++ bfq_mark_bfqq_has_short_ttime(bfqq); ++ else ++ bfq_clear_bfqq_has_short_ttime(bfqq); ++} ++ ++/* ++ * Called when a new fs request (rq) is added to bfqq. Check if there's ++ * something we should do about it. ++ */ ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ struct bfq_io_cq *bic = RQ_BIC(rq); ++ ++ if (rq->cmd_flags & REQ_META) ++ bfqq->meta_pending++; ++ ++ bfq_update_io_thinktime(bfqd, bic); ++ bfq_update_has_short_ttime(bfqd, bfqq, bic); ++ bfq_update_io_seektime(bfqd, bfqq, rq); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "has_short_ttime=%d (seeky %d)", ++ bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); ++ ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ ++ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { ++ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && ++ blk_rq_sectors(rq) < 32; ++ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); ++ ++ /* ++ * There is just this request queued: if ++ * - the request is small, and ++ * - we are idling to boost throughput, and ++ * - the queue is not to be expired, ++ * then just exit. ++ * ++ * In this way, if the device is being idled to wait ++ * for a new request from the in-service queue, we ++ * avoid unplugging the device and committing the ++ * device to serve just a small request. In contrast ++ * we wait for the block layer to decide when to ++ * unplug the device: hopefully, new requests will be ++ * merged to this one quickly, then the device will be ++ * unplugged and larger requests will be dispatched. ++ */ ++ if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) && ++ !budget_timeout) ++ return; ++ ++ /* ++ * A large enough request arrived, or idling is being ++ * performed to preserve service guarantees, or ++ * finally the queue is to be expired: in all these ++ * cases disk idling is to be stopped, so clear ++ * wait_request flag and reset timer. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); ++ bfqg_stats_update_idle_time(bfqq_group(bfqq)); ++ ++ /* ++ * The queue is not empty, because a new request just ++ * arrived. Hence we can safely expire the queue, in ++ * case of budget timeout, without risking that the ++ * timestamps of the queue are not updated correctly. ++ * See [1] for more details. ++ */ ++ if (budget_timeout) ++ bfq_bfqq_expire(bfqd, bfqq, false, ++ BFQ_BFQQ_BUDGET_TIMEOUT); ++ ++ /* ++ * Let the request rip immediately, or let a new queue be ++ * selected if bfqq has just been expired. 
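bfq_update_io_seektime() above keeps a 32-bit shift register of per-request seekiness bits, and the "seeky" value logged here is a population count over that history. A sketch of the mechanism; the cutoff of 19 set bits is an assumption, since the real BFQQ_SEEKY() threshold is defined outside this hunk:

#include <stdint.h>
#include <stdio.h>

static int popcount32(uint32_t v)
{
	int n = 0;

	for (; v; v &= v - 1)	/* clear lowest set bit */
		n++;
	return n;
}

int main(void)
{
	uint32_t seek_history = 1;	/* first request assumed seeky */

	/* 8 sequential requests, then 16 random ones */
	for (int i = 0; i < 24; i++) {
		int seeky = i >= 8;

		seek_history = (seek_history << 1) | seeky;
		printf("req %2d: %2d seeky bits -> %s\n", i,
		       popcount32(seek_history),
		       popcount32(seek_history) > 19 ? "seeky" : "not seeky");
	}
	return 0;
}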
++ */ ++ __blk_run_queue(bfqd->queue); ++ } ++} ++ ++static void bfq_insert_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; ++ ++ assert_spin_locked(bfqd->queue->queue_lock); ++ ++ /* ++ * An unplug may trigger a requeue of a request from the device ++ * driver: make sure we are in process context while trying to ++ * merge two bfq_queues. ++ */ ++ if (!in_interrupt()) { ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); ++ if (new_bfqq) { ++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) ++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); ++ /* ++ * Release the request's reference to the old bfqq ++ * and make sure one is taken to the shared queue. ++ */ ++ new_bfqq->allocated[rq_data_dir(rq)]++; ++ bfqq->allocated[rq_data_dir(rq)]--; ++ new_bfqq->ref++; ++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) ++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), ++ bfqq, new_bfqq); ++ ++ bfq_clear_bfqq_just_created(bfqq); ++ /* ++ * rq is about to be enqueued into new_bfqq, ++ * release rq reference on bfqq ++ */ ++ bfq_put_queue(bfqq); ++ rq->elv.priv[1] = new_bfqq; ++ bfqq = new_bfqq; ++ } ++ } ++ ++ bfq_add_request(rq); ++ ++ rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; ++ list_add_tail(&rq->queuelist, &bfqq->fifo); ++ ++ bfq_rq_enqueued(bfqd, bfqq, rq); ++} ++ ++static void bfq_update_hw_tag(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfqd->in_service_queue; ++ ++ bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, ++ bfqd->rq_in_driver); ++ ++ if (bfqd->hw_tag == 1) ++ return; ++ ++ /* ++ * This sample is valid if the number of outstanding requests ++ * is large enough to allow a queueing behavior. Note that the ++ * sum is not exact, as it's not taking into account deactivated ++ * requests. ++ */ ++ if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) ++ return; ++ ++ /* ++ * If active queue hasn't enough requests and can idle, bfq might not ++ * dispatch sufficient requests to hardware. Don't zero hw_tag in this ++ * case ++ */ ++ if (bfqq && bfq_bfqq_has_short_ttime(bfqq) && ++ bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] < ++ BFQ_HW_QUEUE_THRESHOLD && bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) ++ return; ++ ++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) ++ return; ++ ++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; ++ bfqd->max_rq_in_driver = 0; ++ bfqd->hw_tag_samples = 0; ++} ++ ++static void bfq_completed_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ u64 now_ns; ++ u32 delta_us; ++ ++ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", ++ blk_rq_sectors(rq)); ++ ++ assert_spin_locked(bfqd->queue->queue_lock); ++ bfq_update_hw_tag(bfqd); ++ ++ BUG_ON(!bfqd->rq_in_driver); ++ BUG_ON(!bfqq->dispatched); ++ bfqd->rq_in_driver--; ++ bfqq->dispatched--; ++ bfqg_stats_update_completion(bfqq_group(bfqq), ++ rq->start_time_ns, ++ rq->io_start_time_ns, ++ rq->cmd_flags); ++ ++ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ /* ++ * Set budget_timeout (which we overload to store the ++ * time at which the queue remains with no backlog and ++ * no outstanding request; used by the weight-raising ++ * mechanism). 
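The hw_tag heuristic in bfq_update_hw_tag() above can be modelled in isolation: record the maximum number of requests the driver kept in flight over a window of completions, then flag the device as queueing (NCQ-like) if that maximum exceeded the threshold. A toy model; the threshold and window values are assumptions standing in for BFQ_HW_QUEUE_THRESHOLD and BFQ_HW_QUEUE_SAMPLES, which are defined elsewhere:

#include <stdio.h>

#define HW_QUEUE_THRESHOLD	4
#define HW_QUEUE_SAMPLES	32

int main(void)
{
	int max_rq_in_driver = 0, samples = 0, hw_tag = -1;

	for (int i = 0; i < 64; i++) {
		int rq_in_driver = (i < 40) ? 2 : 8;	/* load ramps up */

		if (rq_in_driver > max_rq_in_driver)
			max_rq_in_driver = rq_in_driver;
		if (++samples < HW_QUEUE_SAMPLES)
			continue;
		hw_tag = max_rq_in_driver > HW_QUEUE_THRESHOLD;
		printf("window closed: max in driver %d -> hw_tag=%d\n",
		       max_rq_in_driver, hw_tag);
		max_rq_in_driver = 0;
		samples = 0;
	}
	return 0;	/* first window reports hw_tag=0, second hw_tag=1 */
}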
++ */ ++ bfqq->budget_timeout = jiffies; ++ ++ bfq_weights_tree_remove(bfqd, bfqq); ++ } ++ ++ now_ns = ktime_get_ns(); ++ ++ RQ_BIC(rq)->ttime.last_end_request = now_ns; ++ ++ /* ++ * Using us instead of ns, to get a reasonable precision in ++ * computing rate in next check. ++ */ ++ delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); ++ ++ bfq_log(bfqd, "delta %uus/%luus max_size %u rate %llu/%llu", ++ delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, ++ delta_us > 0 ? ++ (USEC_PER_SEC* ++ (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us))>>BFQ_RATE_SHIFT : ++ (USEC_PER_SEC* ++ (u64)(bfqd->last_rq_max_size<<BFQ_RATE_SHIFT))>>BFQ_RATE_SHIFT, ++ (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); ++ ++ /* ++ * If the request took rather long to complete, and, according ++ * to the maximum request size recorded, this completion latency ++ * implies that the request was certainly served at a very low ++ * rate (less than 1M sectors/sec), then the whole observation ++ * interval that lasts up to this time instant cannot be a ++ * valid time interval for computing a new peak rate. Invoke ++ * bfq_update_rate_reset to have the following three steps ++ * taken: ++ * - close the observation interval at the last (previous) ++ * request dispatch or completion ++ * - compute rate, if possible, for that observation interval ++ * - reset to zero samples, which will trigger a proper ++ * re-initialization of the observation interval on next ++ * dispatch ++ */ ++ if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && ++ (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us < ++ 1UL<<(BFQ_RATE_SHIFT - 10)) ++ bfq_update_rate_reset(bfqd, NULL); ++ ++ bfqd->last_completion = now_ns; ++ ++ /* ++ * If we are waiting to discover whether the request pattern ++ * of the task associated with the queue is actually ++ * isochronous, and both requisites for this condition to hold ++ * are now satisfied, then compute soft_rt_next_start (see the ++ * comments on the function bfq_bfqq_softrt_next_start()). We ++ * do not compute soft_rt_next_start if bfqq is in interactive ++ * weight raising (see the comments in bfq_bfqq_expire() for ++ * an explanation). We schedule this delayed update when bfqq ++ * expires, if it still has in-flight requests. ++ */ ++ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && ++ RB_EMPTY_ROOT(&bfqq->sort_list) && ++ bfqq->wr_coeff != bfqd->bfq_wr_coeff) ++ bfqq->soft_rt_next_start = ++ bfq_bfqq_softrt_next_start(bfqd, bfqq); ++ ++ /* ++ * If this is the in-service queue, check if it needs to be expired, ++ * or if we want to idle in case it has no pending requests. ++ */ ++ if (bfqd->in_service_queue == bfqq) { ++ if (bfq_bfqq_must_idle(bfqq)) { ++ if (bfqq->dispatched == 0) ++ bfq_arm_slice_timer(bfqd); ++ /* ++ * If we get here, we do not expire bfqq, even ++ * if bfqq was in budget timeout or had no ++ * more requests (as controlled in the next ++ * conditional instructions). The reason for ++ * not expiring bfqq is as follows. ++ * ++ * Here bfqq->dispatched > 0 holds, but ++ * bfq_bfqq_must_idle() returned true. This ++ * implies that, even if no request arrives ++ * for bfqq before bfqq->dispatched reaches 0, ++ * bfqq will, however, not be expired on the ++ * completion event that causes bfqq->dispatch ++ * to reach zero. In contrast, on this event, ++ * bfqq will start enjoying device idling ++ * (I/O-dispatch plugging). ++ * ++ * But, if we expired bfqq here, bfqq would ++ * not have the chance to enjoy device idling ++ * when bfqq->dispatched finally reaches ++ * zero. This would expose bfqq to violation ++ * of its reserved service guarantees.
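The shift arithmetic above is easier to read with numbers plugged in: rates are tracked in sectors/usec left-shifted by BFQ_RATE_SHIFT, and the cutoff 1UL << (BFQ_RATE_SHIFT - 10) is the "less than 1M sectors/sec" bound mentioned in the comment. A numeric check, assuming BFQ_RATE_SHIFT = 16 (the define lives elsewhere in the scheduler):

#include <stdint.h>
#include <stdio.h>

#define BFQ_RATE_SHIFT	16

int main(void)
{
	uint32_t last_rq_max_size = 2048;	/* sectors: a 1 MiB request */
	uint64_t delta_us = 4000;		/* completed in 4 ms */

	/* fixed-point rate in sectors/usec, scaled by 2^BFQ_RATE_SHIFT */
	uint64_t rate = ((uint64_t)last_rq_max_size << BFQ_RATE_SHIFT) /
			delta_us;
	uint64_t thresh = 1UL << (BFQ_RATE_SHIFT - 10);	/* ~1M sectors/s */

	printf("rate %llu vs threshold %llu -> %s\n",
	       (unsigned long long)rate, (unsigned long long)thresh,
	       rate < thresh ? "discard observation interval"
			     : "keep observation interval");
	return 0;	/* 2048<<16 / 4000 = 33554 >= 64, interval kept */
}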
++ */ ++ goto out; ++ } else if (bfq_may_expire_for_budg_timeout(bfqq)) ++ bfq_bfqq_expire(bfqd, bfqq, false, ++ BFQ_BFQQ_BUDGET_TIMEOUT); ++ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && ++ (bfqq->dispatched == 0 || ++ !bfq_better_to_idle(bfqq))) ++ bfq_bfqq_expire(bfqd, bfqq, false, ++ BFQ_BFQQ_NO_MORE_REQUESTS); ++ } ++ ++ if (!bfqd->rq_in_driver) ++ bfq_schedule_dispatch(bfqd); ++ ++out: ++ return; ++} ++ ++static int __bfq_may_queue(struct bfq_queue *bfqq) ++{ ++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { ++ bfq_clear_bfqq_must_alloc(bfqq); ++ return ELV_MQUEUE_MUST; ++ } ++ ++ return ELV_MQUEUE_MAY; ++} ++ ++static int bfq_may_queue(struct request_queue *q, unsigned int op) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct task_struct *tsk = current; ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq; ++ ++ /* ++ * Don't force setup of a queue from here, as a call to may_queue ++ * does not necessarily imply that a request actually will be ++ * queued. So just lookup a possibly existing queue, or return ++ * 'may queue' if that fails. ++ */ ++ bic = bfq_bic_lookup(bfqd, tsk->io_context); ++ if (!bic) ++ return ELV_MQUEUE_MAY; ++ ++ bfqq = bic_to_bfqq(bic, op_is_sync(op)); ++ if (bfqq) ++ return __bfq_may_queue(bfqq); ++ ++ return ELV_MQUEUE_MAY; ++} ++ ++/* ++ * Queue lock held here. ++ */ ++static void bfq_put_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ if (bfqq) { ++ const int rw = rq_data_dir(rq); ++ ++ BUG_ON(!bfqq->allocated[rw]); ++ bfqq->allocated[rw]--; ++ ++ rq->elv.priv[0] = NULL; ++ rq->elv.priv[1] = NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p, %d", ++ bfqq, bfqq->ref); ++ bfq_put_queue(bfqq); ++ } ++} ++ ++/* ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this ++ * was the last process referring to that bfqq. ++ */ ++static struct bfq_queue * ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); ++ ++ put_io_context(bic->icq.ioc); ++ ++ if (bfqq_process_refs(bfqq) == 1) { ++ bfqq->pid = current->pid; ++ bfq_clear_bfqq_coop(bfqq); ++ bfq_clear_bfqq_split_coop(bfqq); ++ return bfqq; ++ } ++ ++ bic_set_bfqq(bic, NULL, 1); ++ ++ bfq_put_cooperator(bfqq); ++ ++ bfq_put_queue(bfqq); ++ return NULL; ++} ++ ++/* ++ * Allocate bfq data structures associated with this request. 
++ */ ++static int bfq_set_request(struct request_queue *q, struct request *rq, ++ struct bio *bio, gfp_t gfp_mask) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); ++ const int rw = rq_data_dir(rq); ++ const int is_sync = rq_is_sync(rq); ++ struct bfq_queue *bfqq; ++ unsigned long flags; ++ bool bfqq_already_existing = false, split = false; ++ ++ spin_lock_irqsave(q->queue_lock, flags); ++ ++ if (!bic) ++ goto queue_fail; ++ ++ bfq_check_ioprio_change(bic, bio); ++ ++ bfq_bic_update_cgroup(bic, bio); ++ ++new_queue: ++ bfqq = bic_to_bfqq(bic, is_sync); ++ if (!bfqq || bfqq == &bfqd->oom_bfqq) { ++ if (bfqq) ++ bfq_put_queue(bfqq); ++ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); ++ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); ++ ++ bic_set_bfqq(bic, bfqq, is_sync); ++ if (split && is_sync) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "was_in_list %d " ++ "was_in_large_burst %d " ++ "large burst in progress %d", ++ bic->was_in_burst_list, ++ bic->saved_in_large_burst, ++ bfqd->large_burst); ++ ++ if ((bic->was_in_burst_list && bfqd->large_burst) || ++ bic->saved_in_large_burst) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "marking in " ++ "large burst"); ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ } else { ++ bfq_log_bfqq(bfqd, bfqq, ++ "clearing in " ++ "large burst"); ++ bfq_clear_bfqq_in_large_burst(bfqq); ++ if (bic->was_in_burst_list) ++ /* ++ * If bfqq was in the current ++ * burst list before being ++ * merged, then we have to add ++ * it back. And we do not need ++ * to increase burst_size, as ++ * we did not decrement ++ * burst_size when we removed ++ * bfqq from the burst list as ++ * a consequence of a merge ++ * (see comments in ++ * bfq_put_queue). In this ++ * respect, it would be rather ++ * costly to know whether the ++ * current burst list is still ++ * the same burst list from ++ * which bfqq was removed on ++ * the merge. To avoid this ++ * cost, if bfqq was in a ++ * burst list, then we add ++ * bfqq to the current burst ++ * list without any further ++ * check. This can cause ++ * inappropriate insertions, ++ * but rarely enough to not ++ * harm the detection of large ++ * bursts significantly. ++ */ ++ hlist_add_head(&bfqq->burst_list_node, ++ &bfqd->burst_list); ++ } ++ bfqq->split_time = jiffies; ++ } ++ } else { ++ /* If the queue was seeky for too long, break it apart. */ ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); ++ ++ /* Update bic before losing reference to bfqq */ ++ if (bfq_bfqq_in_large_burst(bfqq)) ++ bic->saved_in_large_burst = true; ++ ++ bfqq = bfq_split_bfqq(bic, bfqq); ++ split = true; ++ if (!bfqq) ++ goto new_queue; ++ else ++ bfqq_already_existing = true; ++ } ++ } ++ ++ bfqq->allocated[rw]++; ++ bfqq->ref++; ++ bfq_log_bfqq(bfqd, bfqq, "bfqq %p, %d", bfqq, bfqq->ref); ++ ++ rq->elv.priv[0] = bic; ++ rq->elv.priv[1] = bfqq; ++ ++ /* ++ * If a bfq_queue has only one process reference, it is owned ++ * by only one bfq_io_cq: we can set the bic field of the ++ * bfq_queue to the address of that structure. Also, if the ++ * queue has just been split, mark a flag so that the ++ * information is available to the other scheduler hooks. ++ */ ++ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { ++ bfqq->bic = bic; ++ if (split) { ++ /* ++ * If the queue has just been split from a shared ++ * queue, restore the idle window and the possible ++ * weight raising period. 
++ */ ++ bfq_bfqq_resume_state(bfqq, bfqd, bic, ++ bfqq_already_existing); ++ } ++ } ++ ++ if (unlikely(bfq_bfqq_just_created(bfqq))) ++ bfq_handle_burst(bfqd, bfqq); ++ ++ spin_unlock_irqrestore(q->queue_lock, flags); ++ ++ return 0; ++ ++queue_fail: ++ bfq_schedule_dispatch(bfqd); ++ spin_unlock_irqrestore(q->queue_lock, flags); ++ ++ return 1; ++} ++ ++static void bfq_kick_queue(struct work_struct *work) ++{ ++ struct bfq_data *bfqd = ++ container_of(work, struct bfq_data, unplug_work); ++ struct request_queue *q = bfqd->queue; ++ ++ spin_lock_irq(q->queue_lock); ++ __blk_run_queue(q); ++ spin_unlock_irq(q->queue_lock); ++} ++ ++/* ++ * Handler of the expiration of the timer running if the in-service queue ++ * is idling inside its time slice. ++ */ ++static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) ++{ ++ struct bfq_data *bfqd = container_of(timer, struct bfq_data, ++ idle_slice_timer); ++ struct bfq_queue *bfqq; ++ unsigned long flags; ++ enum bfqq_expiration reason; ++ ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags); ++ ++ bfqq = bfqd->in_service_queue; ++ /* ++ * Theoretical race here: the in-service queue can be NULL or ++ * different from the queue that was idling if the timer handler ++ * spins on the queue_lock and a new request arrives for the ++ * current queue and there is a full dispatch cycle that changes ++ * the in-service queue. This can hardly happen, but in the worst ++ * case we just expire a queue too early. ++ */ ++ if (bfqq) { ++ bfq_log_bfqq(bfqd, bfqq, "expired"); ++ bfq_clear_bfqq_wait_request(bfqq); ++ ++ if (bfq_bfqq_budget_timeout(bfqq)) ++ /* ++ * Also here the queue can be safely expired ++ * for budget timeout without wasting ++ * guarantees ++ */ ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) ++ /* ++ * The queue may not be empty upon timer expiration, ++ * because we may not disable the timer when the ++ * first request of the in-service queue arrives ++ * during disk idling. ++ */ ++ reason = BFQ_BFQQ_TOO_IDLE; ++ else ++ goto schedule_dispatch; ++ ++ bfq_bfqq_expire(bfqd, bfqq, true, reason); ++ } ++ ++schedule_dispatch: ++ bfq_schedule_dispatch(bfqd); ++ ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); ++ return HRTIMER_NORESTART; ++} ++ ++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) ++{ ++ hrtimer_cancel(&bfqd->idle_slice_timer); ++ cancel_work_sync(&bfqd->unplug_work); ++} ++ ++static void __bfq_put_async_bfqq(struct bfq_data *bfqd, ++ struct bfq_queue **bfqq_ptr) ++{ ++ struct bfq_group *root_group = bfqd->root_group; ++ struct bfq_queue *bfqq = *bfqq_ptr; ++ ++ bfq_log(bfqd, "%p", bfqq); ++ if (bfqq) { ++ bfq_bfqq_move(bfqd, bfqq, root_group); ++ bfq_log_bfqq(bfqd, bfqq, "putting %p, %d", ++ bfqq, bfqq->ref); ++ bfq_put_queue(bfqq); ++ *bfqq_ptr = NULL; ++ } ++} ++ ++/* ++ * Release all the bfqg references to its async queues. If we are ++ * deallocating the group these queues may still contain requests, so ++ * we reparent them to the root cgroup (i.e., the only one that will ++ * exist for sure until all the requests on a device are gone). 
++ */ ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) ++{ ++ int i, j; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_BE_NR; j++) ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); ++ ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); ++} ++ ++static void bfq_exit_queue(struct elevator_queue *e) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ struct request_queue *q = bfqd->queue; ++ struct bfq_queue *bfqq, *n; ++ ++ bfq_shutdown_timer_wq(bfqd); ++ ++ spin_lock_irq(q->queue_lock); ++ ++ BUG_ON(bfqd->in_service_queue); ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) ++ bfq_deactivate_bfqq(bfqd, bfqq, false, false); ++ ++ spin_unlock_irq(q->queue_lock); ++ ++ bfq_shutdown_timer_wq(bfqd); ++ ++ BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ /* release oom-queue reference to root group */ ++ bfqg_put(bfqd->root_group); ++ ++ blkcg_deactivate_policy(q, &blkcg_policy_bfq); ++#else ++ bfq_put_async_queues(bfqd, bfqd->root_group); ++ kfree(bfqd->root_group); ++#endif ++ ++ kfree(bfqd); ++} ++ ++static void bfq_init_root_group(struct bfq_group *root_group, ++ struct bfq_data *bfqd) ++{ ++ int i; ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ root_group->entity.parent = NULL; ++ root_group->my_entity = NULL; ++ root_group->bfqd = bfqd; ++#endif ++ root_group->rq_pos_tree = RB_ROOT; ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) ++ root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; ++ root_group->sched_data.bfq_class_idle_last_service = jiffies; ++} ++ ++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) ++{ ++ struct bfq_data *bfqd; ++ struct elevator_queue *eq; ++ ++ eq = elevator_alloc(q, e); ++ if (!eq) ++ return -ENOMEM; ++ ++ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); ++ if (!bfqd) { ++ kobject_put(&eq->kobj); ++ return -ENOMEM; ++ } ++ eq->elevator_data = bfqd; ++ ++ /* ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. ++ * Grab a permanent reference to it, so that the normal code flow ++ * will not attempt to free it. ++ */ ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); ++ bfqd->oom_bfqq.ref++; ++ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; ++ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; ++ bfqd->oom_bfqq.entity.new_weight = ++ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); ++ ++ /* oom_bfqq does not participate to bursts */ ++ bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); ++ /* ++ * Trigger weight initialization, according to ioprio, at the ++ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio ++ * class won't be changed any more. 
++ */ ++ bfqd->oom_bfqq.entity.prio_changed = 1; ++ ++ bfqd->queue = q; ++ ++ spin_lock_irq(q->queue_lock); ++ q->elevator = eq; ++ spin_unlock_irq(q->queue_lock); ++ ++ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); ++ if (!bfqd->root_group) ++ goto out_free; ++ bfq_init_root_group(bfqd->root_group, bfqd); ++ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); ++ ++ hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, ++ HRTIMER_MODE_REL); ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; ++ ++ bfqd->queue_weights_tree = RB_ROOT; ++ bfqd->num_groups_with_pending_reqs = 0; ++ ++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); ++ ++ INIT_LIST_HEAD(&bfqd->active_list); ++ INIT_LIST_HEAD(&bfqd->idle_list); ++ INIT_HLIST_HEAD(&bfqd->burst_list); ++ ++ bfqd->hw_tag = -1; ++ ++ bfqd->bfq_max_budget = bfq_default_max_budget; ++ ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; ++ bfqd->bfq_back_max = bfq_back_max; ++ bfqd->bfq_back_penalty = bfq_back_penalty; ++ bfqd->bfq_slice_idle = bfq_slice_idle; ++ bfqd->bfq_timeout = bfq_timeout; ++ ++ bfqd->bfq_requests_within_timer = 120; ++ ++ bfqd->bfq_large_burst_thresh = 8; ++ bfqd->bfq_burst_interval = msecs_to_jiffies(180); ++ ++ bfqd->low_latency = true; ++ ++ /* ++ * Trade-off between responsiveness and fairness. ++ */ ++ bfqd->bfq_wr_coeff = 30; ++ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); ++ bfqd->bfq_wr_max_time = 0; ++ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); ++ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); ++ bfqd->bfq_wr_max_softrt_rate = 7000; /* ++ * Approximate rate required ++ * to playback or record a ++ * high-definition compressed ++ * video. ++ */ ++ bfqd->wr_busy_queues = 0; ++ ++ /* ++ * Begin by assuming, optimistically, that the device peak ++ * rate is equal to 2/3 of the highest reference rate. ++ */ ++ bfqd->rate_dur_prod = ref_rate[blk_queue_nonrot(bfqd->queue)] * ++ ref_wr_duration[blk_queue_nonrot(bfqd->queue)]; ++ bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3; ++ ++ return 0; ++ ++out_free: ++ kfree(bfqd); ++ kobject_put(&eq->kobj); ++ return -ENOMEM; ++} ++ ++static void bfq_registered_queue(struct request_queue *q) ++{ ++ wbt_disable_default(q); ++} ++ ++static void bfq_slab_kill(void) ++{ ++ kmem_cache_destroy(bfq_pool); ++} ++ ++static int __init bfq_slab_setup(void) ++{ ++ bfq_pool = KMEM_CACHE(bfq_queue, 0); ++ if (!bfq_pool) ++ return -ENOMEM; ++ return 0; ++} ++ ++static ssize_t bfq_var_show(unsigned int var, char *page) ++{ ++ return sprintf(page, "%u\n", var); ++} ++ ++static ssize_t bfq_var_store(unsigned long *var, const char *page, ++ size_t count) ++{ ++ unsigned long new_val; ++ int ret = kstrtoul(page, 10, &new_val); ++ ++ if (ret == 0) ++ *var = new_val; ++ ++ return count; ++} ++ ++static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ ++ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? 
++ jiffies_to_msecs(bfqd->bfq_wr_max_time) : ++ jiffies_to_msecs(bfq_wr_duration(bfqd))); ++} ++ ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_queue *bfqq; ++ struct bfq_data *bfqd = e->elevator_data; ++ ssize_t num_char = 0; ++ ++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", ++ bfqd->queued); ++ ++ spin_lock_irq(bfqd->queue->queue_lock); ++ ++ num_char += sprintf(page + num_char, "Active:\n"); ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, nr_queued %d %d, ", ++ bfqq->pid, ++ bfqq->entity.weight, ++ bfqq->queued[0], ++ bfqq->queued[1]); ++ num_char += sprintf(page + num_char, ++ "dur %d/%u\n", ++ jiffies_to_msecs( ++ jiffies - ++ bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ ++ num_char += sprintf(page + num_char, "Idle:\n"); ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, dur %d/%u\n", ++ bfqq->pid, ++ bfqq->entity.weight, ++ jiffies_to_msecs(jiffies - ++ bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ ++ spin_unlock_irq(bfqd->queue->queue_lock); ++ ++ return num_char; ++} ++ ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ u64 __data = __VAR; \ ++ if (__CONV == 1) \ ++ __data = jiffies_to_msecs(__data); \ ++ else if (__CONV == 2) \ ++ __data = div_u64(__data, NSEC_PER_MSEC); \ ++ return bfq_var_show(__data, (page)); \ ++} ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); ++SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); ++SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); ++SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); ++SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); ++SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, ++ 1); ++SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); ++#undef SHOW_FUNCTION ++ ++#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ u64 __data = __VAR; \ ++ __data = div_u64(__data, NSEC_PER_USEC); \ ++ return bfq_var_show(__data, (page)); \ ++} ++USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); ++#undef USEC_SHOW_FUNCTION ++ ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ ++static ssize_t \ ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned long uninitialized_var(__data); \ ++ int ret = bfq_var_store(&__data, (page), count); \ ++ if (__data < (MIN)) \ ++ __data = (MIN); \ ++ else if (__data > (MAX)) \ ++ __data = (MAX); \ ++ if (__CONV == 1) \ ++ *(__PTR) = msecs_to_jiffies(__data); \ ++ else if (__CONV == 2) \ ++ *(__PTR) = (u64)__data 
* NSEC_PER_MSEC; \ ++ else \ ++ *(__PTR) = __data; \ ++ return ret; \ ++} ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, ++ INT_MAX, 2); ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, ++ INT_MAX, 2); ++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, ++ INT_MAX, 0); ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); ++STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); ++STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, ++ 1); ++STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, ++ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, ++ INT_MAX, 0); ++#undef STORE_FUNCTION ++ ++#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ ++static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned long uninitialized_var(__data); \ ++ int ret = bfq_var_store(&__data, (page), count); \ ++ if (__data < (MIN)) \ ++ __data = (MIN); \ ++ else if (__data > (MAX)) \ ++ __data = (MAX); \ ++ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ ++ return ret; \ ++} ++USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, ++ UINT_MAX); ++#undef USEC_STORE_FUNCTION ++ ++/* do nothing for the moment */ ++static ssize_t bfq_weights_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ return count; ++} ++ ++static ssize_t bfq_max_budget_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data == 0) ++ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); ++ else { ++ if (__data > INT_MAX) ++ __data = INT_MAX; ++ bfqd->bfq_max_budget = __data; ++ } ++ ++ bfqd->bfq_user_max_budget = __data; ++ ++ return ret; ++} ++ ++/* ++ * Leaving this name to preserve name compatibility with cfq ++ * parameters, but this timeout is used for both sync and async. 
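Since every tunable goes through the same SHOW_FUNCTION/STORE_FUNCTION templates, hand-expanding one instance makes the generated shape explicit. Below is bfq_back_seek_penalty_store written out by hand for __CONV == 0, with the dead conversion branches constant-folded away and uninitialized_var() dropped for readability; this mirrors the macro above rather than adding new patch code:

static ssize_t
bfq_back_seek_penalty_store(struct elevator_queue *e, const char *page,
			    size_t count)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned long __data;
	int ret = bfq_var_store(&__data, page, count);

	if (__data < 1)			/* MIN */
		__data = 1;
	else if (__data > INT_MAX)	/* MAX */
		__data = INT_MAX;
	bfqd->bfq_back_penalty = __data;	/* __CONV == 0: stored raw */
	return ret;
}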
++ */ ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data < 1) ++ __data = 1; ++ else if (__data > INT_MAX) ++ __data = INT_MAX; ++ ++ bfqd->bfq_timeout = msecs_to_jiffies(__data); ++ if (bfqd->bfq_user_max_budget == 0) ++ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); ++ ++ return ret; ++} ++ ++static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data > 1) ++ __data = 1; ++ if (!bfqd->strict_guarantees && __data == 1 ++ && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) ++ bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; ++ ++ bfqd->strict_guarantees = __data; ++ ++ return ret; ++} ++ ++static ssize_t bfq_low_latency_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data > 1) ++ __data = 1; ++ if (__data == 0 && bfqd->low_latency != 0) ++ bfq_end_wr(bfqd); ++ bfqd->low_latency = __data; ++ ++ return ret; ++} ++ ++#define BFQ_ATTR(name) \ ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) ++ ++static struct elv_fs_entry bfq_attrs[] = { ++ BFQ_ATTR(fifo_expire_sync), ++ BFQ_ATTR(fifo_expire_async), ++ BFQ_ATTR(back_seek_max), ++ BFQ_ATTR(back_seek_penalty), ++ BFQ_ATTR(slice_idle), ++ BFQ_ATTR(slice_idle_us), ++ BFQ_ATTR(max_budget), ++ BFQ_ATTR(timeout_sync), ++ BFQ_ATTR(strict_guarantees), ++ BFQ_ATTR(low_latency), ++ BFQ_ATTR(wr_coeff), ++ BFQ_ATTR(wr_max_time), ++ BFQ_ATTR(wr_rt_max_time), ++ BFQ_ATTR(wr_min_idle_time), ++ BFQ_ATTR(wr_min_inter_arr_async), ++ BFQ_ATTR(wr_max_softrt_rate), ++ BFQ_ATTR(weights), ++ __ATTR_NULL ++}; ++ ++static struct elevator_type iosched_bfq = { ++ .ops.sq = { ++ .elevator_merge_fn = bfq_merge, ++ .elevator_merged_fn = bfq_merged_request, ++ .elevator_merge_req_fn = bfq_merged_requests, ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ .elevator_bio_merged_fn = bfq_bio_merged, ++#endif ++ .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, ++ .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, ++ .elevator_dispatch_fn = bfq_dispatch_requests, ++ .elevator_add_req_fn = bfq_insert_request, ++ .elevator_activate_req_fn = bfq_activate_request, ++ .elevator_deactivate_req_fn = bfq_deactivate_request, ++ .elevator_completed_req_fn = bfq_completed_request, ++ .elevator_former_req_fn = elv_rb_former_request, ++ .elevator_latter_req_fn = elv_rb_latter_request, ++ .elevator_init_icq_fn = bfq_init_icq, ++ .elevator_exit_icq_fn = bfq_exit_icq, ++ .elevator_set_req_fn = bfq_set_request, ++ .elevator_put_req_fn = bfq_put_request, ++ .elevator_may_queue_fn = bfq_may_queue, ++ .elevator_init_fn = bfq_init_queue, ++ .elevator_exit_fn = bfq_exit_queue, ++ .elevator_registered_fn = bfq_registered_queue, ++ }, ++ .icq_size = sizeof(struct bfq_io_cq), ++ .icq_align = __alignof__(struct bfq_io_cq), ++ .elevator_attrs = bfq_attrs, ++ .elevator_name = "bfq-sq", ++ .elevator_owner = THIS_MODULE, ++}; ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static struct blkcg_policy blkcg_policy_bfq = { ++ .dfl_cftypes = bfq_blkg_files, ++ .legacy_cftypes = bfq_blkcg_legacy_files, ++ ++ .cpd_alloc_fn = bfq_cpd_alloc, 
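Once this elevator is active for a disk, the attributes declared in bfq_attrs surface under /sys/block/<dev>/queue/iosched/. A userspace sketch that flips one of them; the device path is illustrative:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* attr names come from BFQ_ATTR(...); the disk name is assumed */
	const char *attr = "/sys/block/sda/queue/iosched/low_latency";
	int fd = open(attr, O_WRONLY);

	if (fd < 0) {
		perror("open");	/* not bfq-sq, or no such device */
		return 1;
	}
	if (write(fd, "1", 1) != 1)
		perror("write");
	close(fd);
	return 0;
}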
++ .cpd_init_fn = bfq_cpd_init, ++ .cpd_bind_fn = bfq_cpd_init, ++ .cpd_free_fn = bfq_cpd_free, ++ ++ .pd_alloc_fn = bfq_pd_alloc, ++ .pd_init_fn = bfq_pd_init, ++ .pd_offline_fn = bfq_pd_offline, ++ .pd_free_fn = bfq_pd_free, ++ .pd_reset_stats_fn = bfq_pd_reset_stats, ++}; ++#endif ++ ++static int __init bfq_init(void) ++{ ++ int ret; ++ char msg[60] = "BFQ I/O-scheduler: v9"; ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ ret = blkcg_policy_register(&blkcg_policy_bfq); ++ if (ret) ++ return ret; ++#endif ++ ++ ret = -ENOMEM; ++ if (bfq_slab_setup()) ++ goto err_pol_unreg; ++ ++ /* ++ * Times to load large popular applications for the typical ++ * systems installed on the reference devices (see the ++ * comments before the definition of the next ++ * array). Actually, we use slightly lower values, as the ++ * estimated peak rate tends to be smaller than the actual ++ * peak rate. The reason for this last fact is that estimates ++ * are computed over much shorter time intervals than the long ++ * intervals typically used for benchmarking. Why? First, to ++ * adapt more quickly to variations. Second, because an I/O ++ * scheduler cannot rely on a peak-rate-evaluation workload to ++ * be run for a long time. ++ */ ++ ref_wr_duration[0] = msecs_to_jiffies(7000); /* actually 8 sec */ ++ ref_wr_duration[1] = msecs_to_jiffies(2500); /* actually 3 sec */ ++ ++ ret = elv_register(&iosched_bfq); ++ if (ret) ++ goto slab_kill; ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ strcat(msg, " (with cgroups support)"); ++#endif ++ pr_info("%s", msg); ++ ++ return 0; ++ ++slab_kill: ++ bfq_slab_kill(); ++err_pol_unreg: ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ blkcg_policy_unregister(&blkcg_policy_bfq); ++#endif ++ return ret; ++} ++ ++static void __exit bfq_exit(void) ++{ ++ elv_unregister(&iosched_bfq); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ blkcg_policy_unregister(&blkcg_policy_bfq); ++#endif ++ bfq_slab_kill(); ++} ++ ++module_init(bfq_init); ++module_exit(bfq_exit); ++ ++MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); ++MODULE_LICENSE("GPL"); +diff --git a/block/bfq.h b/block/bfq.h +new file mode 100644 +index 000000000000..0177fc7205d7 +--- /dev/null ++++ b/block/bfq.h +@@ -0,0 +1,1074 @@ ++/* ++ * BFQ v9: data structures and common functions prototypes. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> ++ * ++ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> ++ * Paolo Valente <paolo.valente@unimore.it> ++ * ++ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> ++ * ++ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> ++ */ ++ ++#ifndef _BFQ_H ++#define _BFQ_H ++ ++#include <linux/blktrace_api.h> ++#include <linux/hrtimer.h> ++ ++/* ++ * Define an alternative macro to compile cgroups support. This is one ++ * of the steps needed to let bfq-mq share the files bfq-sched.c and ++ * bfq-cgroup.c with bfq-sq. For bfq-mq, the macro ++ * BFQ_GROUP_IOSCHED_ENABLED will be defined as a function of whether ++ * the configuration option CONFIG_BFQ_MQ_GROUP_IOSCHED, and not ++ * CONFIG_BFQ_GROUP_IOSCHED, is defined. ++ */ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#define BFQ_GROUP_IOSCHED_ENABLED ++#endif ++ ++#define BFQ_IOPRIO_CLASSES 3 ++#define BFQ_CL_IDLE_TIMEOUT (HZ/5) ++ ++#define BFQ_MIN_WEIGHT 1 ++#define BFQ_MAX_WEIGHT 1000 ++#define BFQ_WEIGHT_CONVERSION_COEFF 10 ++ ++#define BFQ_DEFAULT_QUEUE_IOPRIO 4 ++ ++#define BFQ_WEIGHT_LEGACY_DFL 100 ++#define BFQ_DEFAULT_GRP_IOPRIO 0 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE ++ ++/* ++ * Soft real-time applications are extremely more latency sensitive ++ * than interactive ones.
Over-raise the weight of the former to ++ * privilege them against the latter. ++ */ ++#define BFQ_SOFTRT_WEIGHT_FACTOR 100 ++ ++struct bfq_entity; ++ ++/** ++ * struct bfq_service_tree - per ioprio_class service tree. ++ * ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each ++ * ioprio_class has its own independent scheduler, and so its own ++ * bfq_service_tree. All the fields are protected by the queue lock ++ * of the containing bfqd. ++ */ ++struct bfq_service_tree { ++ /* tree for active entities (i.e., those backlogged) */ ++ struct rb_root active; ++ /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ ++ struct rb_root idle; ++ ++ struct bfq_entity *first_idle; /* idle entity with minimum F_i */ ++ struct bfq_entity *last_idle; /* idle entity with maximum F_i */ ++ ++ u64 vtime; /* scheduler virtual time */ ++ /* scheduler weight sum; active and idle entities contribute to it */ ++ unsigned long wsum; ++}; ++ ++/** ++ * struct bfq_sched_data - multi-class scheduler. ++ * ++ * bfq_sched_data is the basic scheduler queue. It supports three ++ * ioprio_classes, and can be used either as a toplevel queue or as an ++ * intermediate queue in a hierarchical setup. ++ * ++ * The supported ioprio_classes are the same as in CFQ, in descending ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. ++ * Requests from higher priority queues are served before all the ++ * requests from lower priority queues; among requests of the same ++ * queue requests are served according to B-WF2Q+. ++ * ++ * The schedule is implemented by the service trees, plus the field ++ * @next_in_service, which points to the entity on the active trees ++ * that will be served next, if 1) no changes in the schedule occur ++ * before the current in-service entity is expired, 2) the in-service ++ * queue becomes idle when it expires, and 3) if the entity pointed by ++ * in_service_entity is not a queue, then the in-service child entity ++ * of the entity pointed by in_service_entity becomes idle on ++ * expiration. This peculiar definition allows for the following ++ * optimization, not yet exploited: while a given entity is still in ++ * service, we already know which is the best candidate for next ++ * service among the other active entities in the same parent ++ * entity. We can then quickly compare the timestamps of the ++ * in-service entity with those of such best candidate. ++ * ++ * All the fields are protected by the queue lock of the containing ++ * bfqd. ++ */ ++struct bfq_sched_data { ++ struct bfq_entity *in_service_entity; /* entity in service */ ++ /* head-of-the-line entity in the scheduler (see comments above) */ ++ struct bfq_entity *next_in_service; ++ /* array of service trees, one per ioprio_class */ ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; ++ /* last time CLASS_IDLE was served */ ++ unsigned long bfq_class_idle_last_service; ++ ++}; ++ ++/** ++ * struct bfq_weight_counter - counter of the number of all active queues ++ * with a given weight. ++ */ ++struct bfq_weight_counter { ++ unsigned int weight; /* weight of the queues this counter refers to */ ++ unsigned int num_active; /* nr of active queues with this weight */ ++ /* ++ * Weights tree member (see bfq_data's @queue_weights_tree) ++ */ ++ struct rb_node weights_node; ++}; ++ ++/** ++ * struct bfq_entity - schedulable entity.
++ * ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each ++ * entity belongs to the sched_data of the parent group in the cgroup ++ * hierarchy. Non-leaf entities have also their own sched_data, stored ++ * in @my_sched_data. ++ * ++ * Each entity stores independently its priority values; this would ++ * allow different weights on different devices, but this ++ * functionality is not exported to userspace by now. Priorities and ++ * weights are updated lazily, first storing the new values into the ++ * new_* fields, then setting the @prio_changed flag. As soon as ++ * there is a transition in the entity state that allows the priority ++ * update to take place the effective and the requested priority ++ * values are synchronized. ++ * ++ * Unless cgroups are used, the weight value is calculated from the ++ * ioprio to export the same interface as CFQ. When dealing with ++ * ``well-behaved'' queues (i.e., queues that do not spend too much ++ * time to consume their budget and have true sequential behavior, and ++ * when there are no external factors breaking anticipation) the ++ * relative weights at each level of the cgroups hierarchy should be ++ * guaranteed. All the fields are protected by the queue lock of the ++ * containing bfqd. ++ */ ++struct bfq_entity { ++ struct rb_node rb_node; /* service_tree member */ ++ ++ /* ++ * Flag, true if the entity is on a tree (either the active or ++ * the idle one of its service_tree) or is in service. ++ */ ++ bool on_st; ++ ++ u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ ++ u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ ++ ++ /* tree the entity is enqueued into; %NULL if not on a tree */ ++ struct rb_root *tree; ++ ++ /* ++ * minimum start time of the (active) subtree rooted at this ++ * entity; used for O(log N) lookups into active trees ++ */ ++ u64 min_start; ++ ++ /* amount of service received during the last service slot */ ++ int service; ++ ++ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ ++ int budget; ++ ++ unsigned int weight; /* weight of the queue */ ++ unsigned int new_weight; /* next weight if a change is in progress */ ++ ++ /* original weight, used to implement weight boosting */ ++ unsigned int orig_weight; ++ ++ /* parent entity, for hierarchical scheduling */ ++ struct bfq_entity *parent; ++ ++ /* ++ * For non-leaf nodes in the hierarchy, the associated ++ * scheduler queue, %NULL on leaf nodes. ++ */ ++ struct bfq_sched_data *my_sched_data; ++ /* the scheduler queue this entity belongs to */ ++ struct bfq_sched_data *sched_data; ++ ++ /* flag, set to request a weight, ioprio or ioprio_class change */ ++ int prio_changed; ++ ++ /* flag, set if the entity is counted in groups_with_pending_reqs */ ++ bool in_groups_with_pending_reqs; ++}; ++ ++struct bfq_group; ++ ++/** ++ * struct bfq_queue - leaf schedulable entity. ++ * ++ * A bfq_queue is a leaf request queue; it can be associated with an ++ * io_context or more, if it is async or shared between cooperating ++ * processes. @cgroup holds a reference to the cgroup, to be sure that it ++ * does not disappear while a bfqq still references it (mostly to avoid ++ * races between request issuing and task migration followed by cgroup ++ * destruction). ++ * All the fields are protected by the queue lock of the containing bfqd. 
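The F_i = S_i + budget/weight rule quoted above is what makes weights matter: for equal budgets, a heavier entity gets an earlier finish timestamp, so B-WF2Q+ selects it proportionally more often. A worked numeric example in abstract service units (illustrative only):

#include <stdio.h>

int main(void)
{
	int budget = 8192;			/* service granted */
	unsigned long long start = 1000;	/* virtual start S_i */

	for (int weight = 100; weight <= 300; weight += 100) {
		unsigned long long finish = start + budget / weight;

		printf("weight %3d: F_i = %llu + %d/%d = %llu\n",
		       weight, start, budget, weight, finish);
	}
	return 0;	/* weight 300 finishes at 1027, weight 100 at 1081 */
}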
++ */ ++struct bfq_queue { ++ /* reference counter */ ++ int ref; ++ /* parent bfq_data */ ++ struct bfq_data *bfqd; ++ ++ /* current ioprio and ioprio class */ ++ unsigned short ioprio, ioprio_class; ++ /* next ioprio and ioprio class if a change is in progress */ ++ unsigned short new_ioprio, new_ioprio_class; ++ ++ /* ++ * Shared bfq_queue if queue is cooperating with one or more ++ * other queues. ++ */ ++ struct bfq_queue *new_bfqq; ++ /* request-position tree member (see bfq_group's @rq_pos_tree) */ ++ struct rb_node pos_node; ++ /* request-position tree root (see bfq_group's @rq_pos_tree) */ ++ struct rb_root *pos_root; ++ ++ /* sorted list of pending requests */ ++ struct rb_root sort_list; ++ /* if fifo isn't expired, next request to serve */ ++ struct request *next_rq; ++ /* number of sync and async requests queued */ ++ int queued[2]; ++ /* number of sync and async requests currently allocated */ ++ int allocated[2]; ++ /* number of pending metadata requests */ ++ int meta_pending; ++ /* fifo list of requests in sort_list */ ++ struct list_head fifo; ++ ++ /* entity representing this queue in the scheduler */ ++ struct bfq_entity entity; ++ ++ /* pointer to the weight counter associated with this queue */ ++ struct bfq_weight_counter *weight_counter; ++ ++ /* maximum budget allowed from the feedback mechanism */ ++ int max_budget; ++ /* budget expiration (in jiffies) */ ++ unsigned long budget_timeout; ++ ++ /* number of requests on the dispatch list or inside driver */ ++ int dispatched; ++ ++ unsigned int flags; /* status flags */ ++ ++ /* node for active/idle bfqq list inside parent bfqd */ ++ struct list_head bfqq_list; ++ ++ /* bit vector: a 1 for each seeky request in history */ ++ u32 seek_history; ++ ++ /* node for the device's burst list */ ++ struct hlist_node burst_list_node; ++ ++ /* position of the last request enqueued */ ++ sector_t last_request_pos; ++ ++ /* Number of consecutive pairs of request completion and ++ * arrival, such that the queue becomes idle after the ++ * completion, but the next request arrives within an idle ++ * time slice; used only if the queue's IO_bound flag has been ++ * cleared. ++ */ ++ unsigned int requests_within_timer; ++ ++ /* pid of the process owning the queue, used for logging purposes */ ++ pid_t pid; ++ ++ /* ++ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL ++ * if the queue is shared. ++ */ ++ struct bfq_io_cq *bic; ++ ++ /* current maximum weight-raising time for this queue */ ++ unsigned long wr_cur_max_time; ++ /* ++ * Minimum time instant such that, only if a new request is ++ * enqueued after this time instant in an idle @bfq_queue with ++ * no outstanding requests, then the task associated with the ++ * queue is deemed as soft real-time (see the comments on ++ * the function bfq_bfqq_softrt_next_start()) ++ */ ++ unsigned long soft_rt_next_start; ++ /* ++ * Start time of the current weight-raising period if ++ * the @bfq-queue is being weight-raised, otherwise ++ * finish time of the last weight-raising period. ++ */ ++ unsigned long last_wr_start_finish; ++ /* factor by which the weight of this queue is multiplied */ ++ unsigned int wr_coeff; ++ /* ++ * Time of the last transition of the @bfq_queue from idle to ++ * backlogged. ++ */ ++ unsigned long last_idle_bklogged; ++ /* ++ * Cumulative service received from the @bfq_queue since the ++ * last transition from idle to backlogged.
++ */ ++ unsigned long service_from_backlogged; ++ /* ++ * Cumulative service received from the @bfq_queue since its ++ * last transition to weight-raised state. ++ */ ++ unsigned long service_from_wr; ++ /* ++ * Value of wr start time when switching to soft rt ++ */ ++ unsigned long wr_start_at_switch_to_srt; ++ ++ unsigned long split_time; /* time of last split */ ++ ++ unsigned long first_IO_time; /* time of first I/O for this queue */ ++ ++ /* max service rate measured so far */ ++ u32 max_service_rate; ++ /* ++ * Ratio between the service received by bfqq while it is in ++ * service, and the cumulative service (of requests of other ++ * queues) that may be injected while bfqq is empty but still ++ * in service. To increase precision, the coefficient is ++ * measured in tenths of unit. Here are some examples of (1) ++ * ratios, (2) resulting percentages of service injected ++ * w.r.t. the total service dispatched while bfqq is in ++ * service, and (3) corresponding values of the coefficient: ++ * 1 (50%) -> 10 ++ * 2 (33%) -> 20 ++ * 10 (9%) -> 100 ++ * 9.9 (9%) -> 99 ++ * 1.5 (40%) -> 15 ++ * 0.5 (66%) -> 5 ++ * 0.1 (90%) -> 1 ++ * ++ * So, if the coefficient is lower than 10, then ++ * injected service is more than bfqq service. ++ */ ++ unsigned int inject_coeff; ++ /* amount of service injected in current service slot */ ++ unsigned int injected_service; ++}; ++ ++/** ++ * struct bfq_ttime - per process thinktime stats. ++ */ ++struct bfq_ttime { ++ u64 last_end_request; /* completion time of last request */ ++ ++ u64 ttime_total; /* total process thinktime */ ++ unsigned long ttime_samples; /* number of thinktime samples */ ++ u64 ttime_mean; /* average process thinktime */ ++ ++}; ++ ++/** ++ * struct bfq_io_cq - per (request_queue, io_context) structure. ++ */ ++struct bfq_io_cq { ++ /* associated io_cq structure */ ++ struct io_cq icq; /* must be the first member */ ++ /* array of two process queues, the sync and the async */ ++ struct bfq_queue *bfqq[2]; ++ /* associated @bfq_ttime struct */ ++ struct bfq_ttime ttime; ++ /* per (request_queue, blkcg) ioprio */ ++ int ioprio; ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ uint64_t blkcg_serial_nr; /* the current blkcg serial */ ++#endif ++ ++ /* ++ * Snapshot of the has_short_ttime flag before merging; taken ++ * to remember its value while the queue is merged, so as to ++ * be able to restore it in case of split. ++ */ ++ bool saved_has_short_ttime; ++ /* ++ * Same purpose as the previous two fields for the I/O bound ++ * classification of a queue. ++ */ ++ bool saved_IO_bound; ++ ++ /* ++ * Same purpose as the previous fields for the value of the ++ * field keeping the queue's belonging to a large burst ++ */ ++ bool saved_in_large_burst; ++ /* ++ * True if the queue belonged to a burst list before its merge ++ * with another cooperating queue. ++ */ ++ bool was_in_burst_list; ++ ++ /* ++ * Similar to previous fields: save wr information. ++ */ ++ unsigned long saved_wr_coeff; ++ unsigned long saved_last_wr_start_finish; ++ unsigned long saved_wr_start_at_switch_to_srt; ++ unsigned int saved_wr_cur_max_time; ++}; ++ ++/** ++ * struct bfq_data - per-device data structure. ++ * ++ * All the fields are protected by the @queue lock. ++ */ ++struct bfq_data { ++ /* request queue for the device */ ++ struct request_queue *queue; ++ ++ /* root bfq_group for the device */ ++ struct bfq_group *root_group; ++ ++ /* ++ * rbtree of weight counters of @bfq_queues, sorted by ++ * weight.
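The inject_coeff table above follows from a single formula: with the ratio kept in tenths, the injected share of the total service dispatched is 10/(10 + coeff). A quick check that reproduces the listed percentages:

#include <stdio.h>

int main(void)
{
	int coeffs[] = { 10, 20, 100, 99, 15, 5, 1 };

	for (unsigned int i = 0; i < sizeof(coeffs) / sizeof(coeffs[0]); i++)
		printf("coeff %3d -> injected %2d%% of total service\n",
		       coeffs[i], 100 * 10 / (10 + coeffs[i]));
	return 0;	/* prints 50, 33, 9, 9, 40, 66, 90: matches table */
}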
Used to keep track of whether all @bfq_queues have ++ * the same weight. The tree contains one counter for each ++ * distinct weight associated to some active and not ++ * weight-raised @bfq_queue (see the comments to the functions ++ * bfq_weights_tree_[add|remove] for further details). ++ */ ++ struct rb_root queue_weights_tree; ++ ++ /* ++ * Number of groups with at least one descendant process that ++ * has at least one request waiting for completion. Note that ++ * this accounts for also requests already dispatched, but not ++ * yet completed. Therefore this number of groups may differ ++ * (be larger) than the number of active groups, as a group is ++ * considered active only if its corresponding entity has ++ * descendant queues with at least one request queued. This ++ * number is used to decide whether a scenario is symmetric. ++ * For a detailed explanation see comments on the computation ++ * of the variable asymmetric_scenario in the function ++ * bfq_better_to_idle(). ++ * ++ * However, it is hard to compute this number exactly, for ++ * groups with multiple descendant processes. Consider a group ++ * that is inactive, i.e., that has no descendant process with ++ * pending I/O inside BFQ queues. Then suppose that ++ * num_groups_with_pending_reqs is still accounting for this ++ * group, because the group has descendant processes with some ++ * I/O request still in flight. num_groups_with_pending_reqs ++ * should be decremented when the in-flight request of the ++ * last descendant process is finally completed (assuming that ++ * nothing else has changed for the group in the meantime, in ++ * terms of composition of the group and active/inactive state of child ++ * groups and processes). To accomplish this, an additional ++ * pending-request counter must be added to entities, and must ++ * be updated correctly. To avoid this additional field and operations, ++ * we resort to the following tradeoff between simplicity and ++ * accuracy: for an inactive group that is still counted in ++ * num_groups_with_pending_reqs, we decrement ++ * num_groups_with_pending_reqs when the first descendant ++ * process of the group remains with no request waiting for ++ * completion. ++ * ++ * Even this simpler decrement strategy requires a little ++ * carefulness: to avoid multiple decrements, we flag a group, ++ * more precisely an entity representing a group, as still ++ * counted in num_groups_with_pending_reqs when it becomes ++ * inactive. Then, when the first descendant queue of the ++ * entity remains with no request waiting for completion, ++ * num_groups_with_pending_reqs is decremented, and this flag ++ * is reset. After this flag is reset for the entity, ++ * num_groups_with_pending_reqs won't be decremented any ++ * longer in case a new descendant queue of the entity remains ++ * with no request waiting for completion. ++ */ ++ unsigned int num_groups_with_pending_reqs; ++ ++ /* ++ * Per-class (RT, BE, IDLE) number of bfq_queues containing ++ * requests (including the queue in service, even if it is ++ * idling). ++ */ ++ unsigned int busy_queues[3]; ++ /* number of weight-raised busy @bfq_queues */ ++ int wr_busy_queues; ++ /* number of queued requests */ ++ int queued; ++ /* number of requests dispatched and waiting for completion */ ++ int rq_in_driver; ++ ++ /* ++ * Maximum number of requests in driver in the last ++ * @hw_tag_samples completed requests. 
++ */ ++ int max_rq_in_driver; ++ /* number of samples used to calculate hw_tag */ ++ int hw_tag_samples; ++ /* flag set to one if the driver is showing a queueing behavior */ ++ int hw_tag; ++ ++ /* number of budgets assigned */ ++ int budgets_assigned; ++ ++ /* ++ * Timer set when idling (waiting) for the next request from ++ * the queue in service. ++ */ ++ struct hrtimer idle_slice_timer; ++ /* delayed work to restart dispatching on the request queue */ ++ struct work_struct unplug_work; ++ ++ /* bfq_queue in service */ ++ struct bfq_queue *in_service_queue; ++ /* bfq_io_cq (bic) associated with the @in_service_queue */ ++ struct bfq_io_cq *in_service_bic; ++ ++ /* on-disk position of the last served request */ ++ sector_t last_position; ++ ++ /* position of the last served request for the in-service queue */ ++ sector_t in_serv_last_pos; ++ ++ /* time of last request completion (ns) */ ++ u64 last_completion; ++ ++ /* time of first rq dispatch in current observation interval (ns) */ ++ u64 first_dispatch; ++ /* time of last rq dispatch in current observation interval (ns) */ ++ u64 last_dispatch; ++ ++ /* beginning of the last budget */ ++ ktime_t last_budget_start; ++ /* beginning of the last idle slice */ ++ ktime_t last_idling_start; ++ ++ /* number of samples in current observation interval */ ++ int peak_rate_samples; ++ /* num of samples of seq dispatches in current observation interval */ ++ u32 sequential_samples; ++ /* total num of sectors transferred in current observation interval */ ++ u64 tot_sectors_dispatched; ++ /* max rq size seen during current observation interval (sectors) */ ++ u32 last_rq_max_size; ++ /* time elapsed from first dispatch in current observ. interval (us) */ ++ u64 delta_from_first; ++ /* ++ * Current estimate of the device peak rate, measured in ++ * [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by ++ * BFQ_RATE_SHIFT is performed to increase precision in ++ * fixed-point calculations. ++ */ ++ u32 peak_rate; ++ ++ /* maximum budget allotted to a bfq_queue before rescheduling */ ++ int bfq_max_budget; ++ ++ /* list of all the bfq_queues active on the device */ ++ struct list_head active_list; ++ /* list of all the bfq_queues idle on the device */ ++ struct list_head idle_list; ++ ++ /* ++ * Timeout for async/sync requests; when it fires, requests ++ * are served in fifo order. ++ */ ++ u64 bfq_fifo_expire[2]; ++ /* weight of backward seeks wrt forward ones */ ++ unsigned int bfq_back_penalty; ++ /* maximum allowed backward seek */ ++ unsigned int bfq_back_max; ++ /* maximum idling time */ ++ u32 bfq_slice_idle; ++ ++ /* user-configured max budget value (0 for auto-tuning) */ ++ int bfq_user_max_budget; ++ /* ++ * Timeout for bfq_queues to consume their budget; used to ++ * prevent seeky queues from imposing long latencies to ++ * sequential or quasi-sequential ones (this also implies that ++ * seeky queues cannot receive guarantees in the service ++ * domain; after a timeout they are charged for the time they ++ * have been in service, to preserve fairness among them, but ++ * without service-domain guarantees). ++ */ ++ unsigned int bfq_timeout; ++ ++ /* ++ * Number of consecutive requests that must be issued within ++ * the idle time slice to set again idling to a queue which ++ * was marked as non-I/O-bound (see the definition of the ++ * IO_bound flag for further details). 
++ */ ++ unsigned int bfq_requests_within_timer; ++ ++ /* ++ * Force device idling whenever needed to provide accurate ++ * service guarantees, without caring about throughput ++ * issues. CAVEAT: this may even increase latencies, in case ++ * of useless idling for processes that did stop doing I/O. ++ */ ++ bool strict_guarantees; ++ ++ /* ++ * Last time at which a queue entered the current burst of ++ * queues being activated shortly after each other; for more ++ * details about this and the following parameters related to ++ * a burst of activations, see the comments on the function ++ * bfq_handle_burst. ++ */ ++ unsigned long last_ins_in_burst; ++ /* ++ * Reference time interval used to decide whether a queue has ++ * been activated shortly after @last_ins_in_burst. ++ */ ++ unsigned long bfq_burst_interval; ++ /* number of queues in the current burst of queue activations */ ++ int burst_size; ++ ++ /* common parent entity for the queues in the burst */ ++ struct bfq_entity *burst_parent_entity; ++ /* Maximum burst size above which the current queue-activation ++ * burst is deemed as 'large'. ++ */ ++ unsigned long bfq_large_burst_thresh; ++ /* true if a large queue-activation burst is in progress */ ++ bool large_burst; ++ /* ++ * Head of the burst list (as for the above fields, more ++ * details in the comments on the function bfq_handle_burst). ++ */ ++ struct hlist_head burst_list; ++ ++ /* if set to true, low-latency heuristics are enabled */ ++ bool low_latency; ++ /* ++ * Maximum factor by which the weight of a weight-raised queue ++ * is multiplied. ++ */ ++ unsigned int bfq_wr_coeff; ++ /* maximum duration of a weight-raising period (jiffies) */ ++ unsigned int bfq_wr_max_time; ++ ++ /* Maximum weight-raising duration for soft real-time processes */ ++ unsigned int bfq_wr_rt_max_time; ++ /* ++ * Minimum idle period after which weight-raising may be ++ * reactivated for a queue (in jiffies). ++ */ ++ unsigned int bfq_wr_min_idle_time; ++ /* ++ * Minimum period between request arrivals after which ++ * weight-raising may be reactivated for an already busy async ++ * queue (in jiffies). ++ */ ++ unsigned long bfq_wr_min_inter_arr_async; ++ ++ /* Max service-rate for a soft real-time queue, in sectors/sec */ ++ unsigned int bfq_wr_max_softrt_rate; ++ /* ++ * Cached value of the product ref_rate*ref_wr_duration, used ++ * for computing the maximum duration of weight raising ++ * automatically. ++ */ ++ u64 rate_dur_prod; ++ ++ /* fallback dummy bfqq for extreme OOM conditions */ ++ struct bfq_queue oom_bfqq; ++}; ++ ++enum bfqq_state_flags { ++ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ ++ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ ++ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* ++ * waiting for a request ++ * without idling the device ++ */ ++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ ++ BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ ++ BFQ_BFQQ_FLAG_IO_bound, /* ++ * bfqq has timed-out at least once ++ * having consumed at most 2/10 of ++ * its budget ++ */ ++ BFQ_BFQQ_FLAG_in_large_burst, /* ++ * bfqq activated in a large burst, ++ * see comments to bfq_handle_burst. 
++ */
++ BFQ_BFQQ_FLAG_softrt_update, /*
++ * may need softrt-next-start
++ * update
++ */
++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
++ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */
++};
++
++#define BFQ_BFQQ_FNS(name) \
++static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
++{ \
++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
++} \
++static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
++{ \
++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
++} \
++static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
++{ \
++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
++}
++
++BFQ_BFQQ_FNS(just_created);
++BFQ_BFQQ_FNS(busy);
++BFQ_BFQQ_FNS(wait_request);
++BFQ_BFQQ_FNS(non_blocking_wait_rq);
++BFQ_BFQQ_FNS(must_alloc);
++BFQ_BFQQ_FNS(fifo_expire);
++BFQ_BFQQ_FNS(has_short_ttime);
++BFQ_BFQQ_FNS(sync);
++BFQ_BFQQ_FNS(IO_bound);
++BFQ_BFQQ_FNS(in_large_burst);
++BFQ_BFQQ_FNS(coop);
++BFQ_BFQQ_FNS(split_coop);
++BFQ_BFQQ_FNS(softrt_update);
++#undef BFQ_BFQQ_FNS
++
++/* Logging facilities. */
++#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE
++
++static const char *checked_dev_name(const struct device *dev)
++{
++ static const char nodev[] = "nodev";
++
++ if (dev)
++ return dev_name(dev);
++
++ return nodev;
++}
++
++#ifdef BFQ_GROUP_IOSCHED_ENABLED
++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
++
++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
++ char __pbuf[128]; \
++ \
++ assert_spin_locked((bfqd)->queue->queue_lock); \
++ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
++ pr_crit("%s bfq%d%c %s [%s] " fmt "\n", \
++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \
++ (bfqq)->pid, \
++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
++ __pbuf, __func__, ##args); \
++} while (0)
++
++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
++ char __pbuf[128]; \
++ \
++ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \
++ pr_crit("%s %s [%s] " fmt "\n", \
++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \
++ __pbuf, __func__, ##args); \
++} while (0)
++
++#else /* BFQ_GROUP_IOSCHED_ENABLED */
++
++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
++ pr_crit("%s bfq%d%c [%s] " fmt "\n", \
++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \
++ (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
++ __func__, ##args)
++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
++
++#endif /* BFQ_GROUP_IOSCHED_ENABLED */
++
++#define bfq_log(bfqd, fmt, args...) \
++ pr_crit("%s bfq [%s] " fmt "\n", \
++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \
++ __func__, ##args)
++
++#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */
++
++#if !defined(CONFIG_BLK_DEV_IO_TRACE)
++
++/* Avoid possible "unused-variable" warning. See commit message. */
++
++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) ((void) (bfqq))
++
++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) ((void) (bfqg))
++
++#define bfq_log(bfqd, fmt, args...) do {} while (0)
++
++#else /* CONFIG_BLK_DEV_IO_TRACE */
++
++#include <linux/blktrace_api.h>
++
++#ifdef BFQ_GROUP_IOSCHED_ENABLED
++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
++
++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)
do { \ ++ char __pbuf[128]; \ ++ \ ++ assert_spin_locked((bfqd)->queue->queue_lock); \ ++ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s [%s] " fmt, \ ++ (bfqq)->pid, \ ++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ __pbuf, __func__, ##args); \ ++} while (0) ++ ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ ++ char __pbuf[128]; \ ++ \ ++ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ ++ blk_add_trace_msg((bfqd)->queue, "%s [%s] " fmt, __pbuf, \ ++ __func__, ##args); \ ++} while (0) ++ ++#else /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d%c [%s] " fmt, (bfqq)->pid, \ ++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ __func__, ##args) ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) ++ ++#endif /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++#define bfq_log(bfqd, fmt, args...) \ ++ blk_add_trace_msg((bfqd)->queue, "bfq [%s] " fmt, __func__, ##args) ++ ++#endif /* CONFIG_BLK_DEV_IO_TRACE */ ++#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ ++ ++/* Expiration reasons. */ ++enum bfqq_expiration { ++ BFQ_BFQQ_TOO_IDLE = 0, /* ++ * queue has been idling for ++ * too long ++ */ ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ ++ BFQ_BFQQ_PREEMPTED /* preemption in progress */ ++}; ++ ++ ++struct bfqg_stats { ++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) ++ /* number of ios merged */ ++ struct blkg_rwstat merged; ++ /* total time spent on device in ns, may not be accurate w/ queueing */ ++ struct blkg_rwstat service_time; ++ /* total time spent waiting in scheduler queue in ns */ ++ struct blkg_rwstat wait_time; ++ /* number of IOs queued up */ ++ struct blkg_rwstat queued; ++ /* total disk time and nr sectors dispatched by this group */ ++ struct blkg_stat time; ++ /* sum of number of ios queued across all samples */ ++ struct blkg_stat avg_queue_size_sum; ++ /* count of samples taken for average */ ++ struct blkg_stat avg_queue_size_samples; ++ /* how many times this group has been removed from service tree */ ++ struct blkg_stat dequeue; ++ /* total time spent waiting for it to be assigned a timeslice. */ ++ struct blkg_stat group_wait_time; ++ /* time spent idling for this blkcg_gq */ ++ struct blkg_stat idle_time; ++ /* total time with empty current active q with other requests queued */ ++ struct blkg_stat empty_time; ++ /* fields after this shouldn't be cleared on stat reset */ ++ uint64_t start_group_wait_time; ++ uint64_t start_idle_time; ++ uint64_t start_empty_time; ++ uint16_t flags; ++#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ ++}; ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++/* ++ * struct bfq_group_data - per-blkcg storage for the blkio subsystem. ++ * ++ * @ps: @blkcg_policy_storage that this structure inherits ++ * @weight: weight of the bfq_group ++ */ ++struct bfq_group_data { ++ /* must be the first member */ ++ struct blkcg_policy_data pd; ++ ++ unsigned int weight; ++}; ++ ++/** ++ * struct bfq_group - per (device, cgroup) data structure. ++ * @entity: schedulable entity to insert into the parent group sched_data. ++ * @sched_data: own sched_data, to contain child entities (they may be ++ * both bfq_queues and bfq_groups). ++ * @bfqd: the bfq_data for the device this group acts upon. 
++ * @async_bfqq: array of async queues for all the tasks belonging to ++ * the group, one queue per ioprio value per ioprio_class, ++ * except for the idle class that has only one queue. ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used ++ * to avoid too many special cases during group creation/ ++ * migration. ++ * @active_entities: number of active entities belonging to the group; ++ * unused for the root group. Used to know whether there ++ * are groups with more than one active @bfq_entity ++ * (see the comments to the function ++ * bfq_bfqq_may_idle()). ++ * @rq_pos_tree: rbtree sorted by next_request position, used when ++ * determining if two or more queues have interleaving ++ * requests (see bfq_find_close_cooperator()). ++ * ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup ++ * there is a set of bfq_groups, each one collecting the lower-level ++ * entities belonging to the group that are acting on the same device. ++ * ++ * Locking works as follows: ++ * o @bfqd is protected by the queue lock, RCU is used to access it ++ * from the readers. ++ * o All the other fields are protected by the @bfqd queue lock. ++ */ ++struct bfq_group { ++ /* must be the first member */ ++ struct blkg_policy_data pd; ++ ++ struct bfq_entity entity; ++ struct bfq_sched_data sched_data; ++ ++ void *bfqd; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++ ++ struct bfq_entity *my_entity; ++ ++ int active_entities; ++ ++ struct rb_root rq_pos_tree; ++ ++ struct bfqg_stats stats; ++}; ++ ++#else ++struct bfq_group { ++ struct bfq_sched_data sched_data; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++ ++ struct rb_root rq_pos_tree; ++}; ++#endif ++ ++static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); ++ ++static unsigned int bfq_class_idx(struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ return bfqq ? 
bfqq->ioprio_class - 1 : ++ BFQ_DEFAULT_GRP_CLASS - 1; ++} ++ ++static unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd) ++{ ++ return bfqd->busy_queues[0] + bfqd->busy_queues[1] + ++ bfqd->busy_queues[2]; ++} ++ ++static struct bfq_service_tree * ++bfq_entity_service_tree(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sched_data = entity->sched_data; ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ unsigned int idx = bfq_class_idx(entity); ++ ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); ++ BUG_ON(sched_data == NULL); ++ ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "%p %d", ++ sched_data->service_tree + idx, idx); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "%p %d", ++ sched_data->service_tree + idx, idx); ++ } ++#endif ++ return sched_data->service_tree + idx; ++} ++ ++static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) ++{ ++ return bic->bfqq[is_sync]; ++} ++ ++static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, ++ bool is_sync) ++{ ++ bic->bfqq[is_sync] = bfqq; ++} ++ ++static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) ++{ ++ return bic->icq.q->elevator->elevator_data; ++} ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ ++static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *group_entity = bfqq->entity.parent; ++ ++ if (!group_entity) ++ group_entity = &bfqq->bfqd->root_group->entity; ++ ++ return container_of(group_entity, struct bfq_group, entity); ++} ++ ++#else ++ ++static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) ++{ ++ return bfqq->bfqd->root_group; ++} ++ ++#endif ++ ++static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); ++static void bfq_put_queue(struct bfq_queue *bfqq); ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, ++ struct bio *bio, bool is_sync, ++ struct bfq_io_cq *bic); ++static void bfq_end_wr_async_queues(struct bfq_data *bfqd, ++ struct bfq_group *bfqg); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); ++#endif ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); ++ ++#endif /* _BFQ_H */ +diff --git a/block/blk-mq.c b/block/blk-mq.c +index e3c39ea8e17b..7a57368841f6 100644 +--- a/block/blk-mq.c ++++ b/block/blk-mq.c +@@ -2878,6 +2878,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) + } + if (ret) + break; ++ if (q->elevator && q->elevator->type->ops.mq.depth_updated) ++ q->elevator->type->ops.mq.depth_updated(hctx); + } + + if (!ret) +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 6980014357d4..8c4568ea6884 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -54,7 +54,7 @@ struct blk_stat_callback; + * Maximum number of blkcg policies allowed to be registered concurrently. + * Defined here to simplify include dependency. 
+ */ +-#define BLKCG_MAX_POLS 5 ++#define BLKCG_MAX_POLS 7 + + typedef void (rq_end_io_fn)(struct request *, blk_status_t); + +@@ -127,6 +127,10 @@ typedef __u32 __bitwise req_flags_t; + #define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20)) + /* ->timeout has been called, don't expire again */ + #define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) ++/* DEBUG: rq in bfq-mq dispatch list */ ++#define RQF_DISP_LIST ((__force req_flags_t)(1 << 22)) ++/* DEBUG: rq had get_rq_private executed on it */ ++#define RQF_GOT ((__force req_flags_t)(1 << 23)) + + /* flags that prevent us from merging requests: */ + #define RQF_NOMERGE_FLAGS \ +diff --git a/include/linux/elevator.h b/include/linux/elevator.h +index a02deea30185..a2bf4a6b9316 100644 +--- a/include/linux/elevator.h ++++ b/include/linux/elevator.h +@@ -99,6 +99,7 @@ struct elevator_mq_ops { + void (*exit_sched)(struct elevator_queue *); + int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); + void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); ++ void (*depth_updated)(struct blk_mq_hw_ctx *); + + bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); + bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *); diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-drop_ancient-and-wrong-msg.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-drop_ancient-and-wrong-msg.patch new file mode 100644 index 00000000..f184b08e --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-drop_ancient-and-wrong-msg.patch @@ -0,0 +1,29 @@ +diff -Naur linux-4.15.1/drivers/edac/amd64_edac.c linux-4.15.1-p/drivers/edac/amd64_edac.c +--- linux-4.15.1/drivers/edac/amd64_edac.c 2018-02-03 17:58:44.000000000 +0100 ++++ linux-4.15.1-p/drivers/edac/amd64_edac.c 2018-02-12 01:52:10.411149240 +0100 +@@ -3020,17 +3020,6 @@ + amd64_warn("Error restoring NB MCGCTL settings!\n"); + } + +-/* +- * EDAC requires that the BIOS have ECC enabled before +- * taking over the processing of ECC errors. A command line +- * option allows to force-enable hardware ECC later in +- * enable_ecc_error_reporting(). +- */ +-static const char *ecc_msg = +- "ECC disabled in the BIOS or no ECC capability, module will not load.\n" +- " Either enable ECC checking or force module loading by setting " +- "'ecc_enable_override'.\n" +- " (Note that use of the override may cause unknown side effects.)\n"; + + static bool ecc_enabled(struct pci_dev *F3, u16 nid) + { +@@ -3083,7 +3072,6 @@ + nid, (ecc_en ? 
"enabled" : "disabled")); + + if (!ecc_en || !nb_mce_en) { +- amd64_info("%s", ecc_msg); + return false; + } + return true; diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-enable_alx_wol.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-enable_alx_wol.patch new file mode 100644 index 00000000..1b7f6e13 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-enable_alx_wol.patch @@ -0,0 +1,485 @@ +diff --git a/drivers/net/ethernet/atheros/alx/ethtool.c b/drivers/net/ethernet/atheros/alx/ethtool.c +index 2f4eabf652e8..859e27236ce4 100644 +--- a/drivers/net/ethernet/atheros/alx/ethtool.c ++++ b/drivers/net/ethernet/atheros/alx/ethtool.c +@@ -310,11 +310,47 @@ static int alx_get_sset_count(struct net_device *netdev, int sset) + } + } + ++static void alx_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) ++{ ++ struct alx_priv *alx = netdev_priv(netdev); ++ struct alx_hw *hw = &alx->hw; ++ ++ wol->supported = WAKE_MAGIC | WAKE_PHY; ++ wol->wolopts = 0; ++ ++ if (hw->sleep_ctrl & ALX_SLEEP_WOL_MAGIC) ++ wol->wolopts |= WAKE_MAGIC; ++ if (hw->sleep_ctrl & ALX_SLEEP_WOL_PHY) ++ wol->wolopts |= WAKE_PHY; ++} ++ ++static int alx_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) ++{ ++ struct alx_priv *alx = netdev_priv(netdev); ++ struct alx_hw *hw = &alx->hw; ++ ++ if (wol->wolopts & ~(WAKE_MAGIC | WAKE_PHY)) ++ return -EOPNOTSUPP; ++ ++ hw->sleep_ctrl = 0; ++ ++ if (wol->wolopts & WAKE_MAGIC) ++ hw->sleep_ctrl |= ALX_SLEEP_WOL_MAGIC; ++ if (wol->wolopts & WAKE_PHY) ++ hw->sleep_ctrl |= ALX_SLEEP_WOL_PHY; ++ ++ device_set_wakeup_enable(&alx->hw.pdev->dev, hw->sleep_ctrl); ++ ++ return 0; ++} ++ + const struct ethtool_ops alx_ethtool_ops = { + .get_pauseparam = alx_get_pauseparam, + .set_pauseparam = alx_set_pauseparam, + .get_msglevel = alx_get_msglevel, + .set_msglevel = alx_set_msglevel, ++ .get_wol = alx_get_wol, ++ .set_wol = alx_set_wol, + .get_link = ethtool_op_get_link, + .get_strings = alx_get_strings, + .get_sset_count = alx_get_sset_count, +diff --git a/drivers/net/ethernet/atheros/alx/hw.c b/drivers/net/ethernet/atheros/alx/hw.c +index 6ac40b0003a3..4791b9dbbe26 100644 +--- a/drivers/net/ethernet/atheros/alx/hw.c ++++ b/drivers/net/ethernet/atheros/alx/hw.c +@@ -332,6 +332,16 @@ void alx_set_macaddr(struct alx_hw *hw, const u8 *addr) + alx_write_mem32(hw, ALX_STAD1, val); + } + ++static void alx_enable_osc(struct alx_hw *hw) ++{ ++ u32 val; ++ ++ /* rising edge */ ++ val = alx_read_mem32(hw, ALX_MISC); ++ alx_write_mem32(hw, ALX_MISC, val & ~ALX_MISC_INTNLOSC_OPEN); ++ alx_write_mem32(hw, ALX_MISC, val | ALX_MISC_INTNLOSC_OPEN); ++} ++ + static void alx_reset_osc(struct alx_hw *hw, u8 rev) + { + u32 val, val2; +@@ -848,6 +858,66 @@ void alx_post_phy_link(struct alx_hw *hw) + } + } + ++ ++/* NOTE: ++ * 1. phy link must be established before calling this function ++ * 2. wol option (pattern,magic,link,etc.) is configed before call it. 
++ */ ++int alx_pre_suspend(struct alx_hw *hw, int speed, u8 duplex) ++{ ++ u32 master, mac, phy, val; ++ int err = 0; ++ ++ master = alx_read_mem32(hw, ALX_MASTER); ++ master &= ~ALX_MASTER_PCLKSEL_SRDS; ++ mac = hw->rx_ctrl; ++ /* 10/100 half */ ++ ALX_SET_FIELD(mac, ALX_MAC_CTRL_SPEED, ALX_MAC_CTRL_SPEED_10_100); ++ mac &= ~(ALX_MAC_CTRL_FULLD | ALX_MAC_CTRL_RX_EN | ALX_MAC_CTRL_TX_EN); ++ ++ phy = alx_read_mem32(hw, ALX_PHY_CTRL); ++ phy &= ~(ALX_PHY_CTRL_DSPRST_OUT | ALX_PHY_CTRL_CLS); ++ phy |= ALX_PHY_CTRL_RST_ANALOG | ALX_PHY_CTRL_HIB_PULSE | ++ ALX_PHY_CTRL_HIB_EN; ++ ++ /* without any activity */ ++ if (!(hw->sleep_ctrl & ALX_SLEEP_ACTIVE)) { ++ err = alx_write_phy_reg(hw, ALX_MII_IER, 0); ++ if (err) ++ return err; ++ phy |= ALX_PHY_CTRL_IDDQ | ALX_PHY_CTRL_POWER_DOWN; ++ } else { ++ if (hw->sleep_ctrl & (ALX_SLEEP_WOL_MAGIC | ALX_SLEEP_CIFS)) ++ mac |= ALX_MAC_CTRL_RX_EN | ALX_MAC_CTRL_BRD_EN; ++ if (hw->sleep_ctrl & ALX_SLEEP_CIFS) ++ mac |= ALX_MAC_CTRL_TX_EN; ++ if (duplex == DUPLEX_FULL) ++ mac |= ALX_MAC_CTRL_FULLD; ++ if (speed == SPEED_1000) ++ ALX_SET_FIELD(mac, ALX_MAC_CTRL_SPEED, ++ ALX_MAC_CTRL_SPEED_1000); ++ phy |= ALX_PHY_CTRL_DSPRST_OUT; ++ err = alx_write_phy_ext(hw, ALX_MIIEXT_ANEG, ++ ALX_MIIEXT_S3DIG10, ++ ALX_MIIEXT_S3DIG10_SL); ++ if (err) ++ return err; ++ } ++ ++ alx_enable_osc(hw); ++ hw->rx_ctrl = mac; ++ alx_write_mem32(hw, ALX_MASTER, master); ++ alx_write_mem32(hw, ALX_MAC_CTRL, mac); ++ alx_write_mem32(hw, ALX_PHY_CTRL, phy); ++ ++ /* set val of PDLL D3PLLOFF */ ++ val = alx_read_mem32(hw, ALX_PDLL_TRNS1); ++ val |= ALX_PDLL_TRNS1_D3PLLOFF_EN; ++ alx_write_mem32(hw, ALX_PDLL_TRNS1, val); ++ ++ return 0; ++} ++ + bool alx_phy_configured(struct alx_hw *hw) + { + u32 cfg, hw_cfg; +@@ -920,6 +990,26 @@ int alx_clear_phy_intr(struct alx_hw *hw) + return alx_read_phy_reg(hw, ALX_MII_ISR, &isr); + } + ++int alx_config_wol(struct alx_hw *hw) ++{ ++ u32 wol = 0; ++ int err = 0; ++ ++ /* turn on magic packet event */ ++ if (hw->sleep_ctrl & ALX_SLEEP_WOL_MAGIC) ++ wol |= ALX_WOL0_MAGIC_EN | ALX_WOL0_PME_MAGIC_EN; ++ ++ /* turn on link up event */ ++ if (hw->sleep_ctrl & ALX_SLEEP_WOL_PHY) { ++ wol |= ALX_WOL0_LINK_EN | ALX_WOL0_PME_LINK; ++ /* only link up can wake up */ ++ err = alx_write_phy_reg(hw, ALX_MII_IER, ALX_IER_LINK_UP); ++ } ++ alx_write_mem32(hw, ALX_WOL0, wol); ++ ++ return err; ++} ++ + void alx_disable_rss(struct alx_hw *hw) + { + u32 ctrl = alx_read_mem32(hw, ALX_RXQ0); +@@ -1045,6 +1135,71 @@ void alx_mask_msix(struct alx_hw *hw, int index, bool mask) + } + + ++int alx_select_powersaving_speed(struct alx_hw *hw, int *speed, u8 *duplex) ++{ ++ int i, err; ++ u16 lpa; ++ ++ err = alx_read_phy_link(hw); ++ if (err) ++ return err; ++ ++ if (hw->link_speed == SPEED_UNKNOWN) { ++ *speed = SPEED_UNKNOWN; ++ *duplex = DUPLEX_UNKNOWN; ++ return 0; ++ } ++ ++ err = alx_read_phy_reg(hw, MII_LPA, &lpa); ++ if (err) ++ return err; ++ ++ if (!(lpa & LPA_LPACK)) { ++ *speed = hw->link_speed; ++ return 0; ++ } ++ ++ if (lpa & LPA_10FULL) { ++ *speed = SPEED_10; ++ *duplex = DUPLEX_FULL; ++ } else if (lpa & LPA_10HALF) { ++ *speed = SPEED_10; ++ *duplex = DUPLEX_HALF; ++ } else if (lpa & LPA_100FULL) { ++ *speed = SPEED_100; ++ *duplex = DUPLEX_FULL; ++ } else { ++ *speed = SPEED_100; ++ *duplex = DUPLEX_HALF; ++ } ++ ++ if (*speed == hw->link_speed && *duplex == hw->duplex) ++ return 0; ++ err = alx_write_phy_reg(hw, ALX_MII_IER, 0); ++ if (err) ++ return err; ++ err = alx_setup_speed_duplex(hw, alx_speed_to_ethadv(*speed, *duplex) | ++ ADVERTISED_Autoneg, 
ALX_FC_ANEG | ++ ALX_FC_RX | ALX_FC_TX); ++ if (err) ++ return err; ++ ++ /* wait for linkup */ ++ for (i = 0; i < ALX_MAX_SETUP_LNK_CYCLE; i++) { ++ msleep(100); ++ ++ err = alx_read_phy_link(hw); ++ if (err < 0) ++ return err; ++ if (hw->link_speed != SPEED_UNKNOWN) ++ break; ++ } ++ if (i == ALX_MAX_SETUP_LNK_CYCLE) ++ return -ETIMEDOUT; ++ ++ return 0; ++} ++ + bool alx_get_phy_info(struct alx_hw *hw) + { + u16 devs1, devs2; +diff --git a/drivers/net/ethernet/atheros/alx/hw.h b/drivers/net/ethernet/atheros/alx/hw.h +index e42d7e0947eb..a7fb6c8d846a 100644 +--- a/drivers/net/ethernet/atheros/alx/hw.h ++++ b/drivers/net/ethernet/atheros/alx/hw.h +@@ -487,6 +487,8 @@ struct alx_hw { + u8 flowctrl; + u32 adv_cfg; + ++ u32 sleep_ctrl; ++ + spinlock_t mdio_lock; + struct mdio_if_info mdio; + u16 phy_id[2]; +@@ -549,12 +551,14 @@ void alx_reset_pcie(struct alx_hw *hw); + void alx_enable_aspm(struct alx_hw *hw, bool l0s_en, bool l1_en); + int alx_setup_speed_duplex(struct alx_hw *hw, u32 ethadv, u8 flowctrl); + void alx_post_phy_link(struct alx_hw *hw); ++int alx_pre_suspend(struct alx_hw *hw, int speed, u8 duplex); + int alx_read_phy_reg(struct alx_hw *hw, u16 reg, u16 *phy_data); + int alx_write_phy_reg(struct alx_hw *hw, u16 reg, u16 phy_data); + int alx_read_phy_ext(struct alx_hw *hw, u8 dev, u16 reg, u16 *pdata); + int alx_write_phy_ext(struct alx_hw *hw, u8 dev, u16 reg, u16 data); + int alx_read_phy_link(struct alx_hw *hw); + int alx_clear_phy_intr(struct alx_hw *hw); ++int alx_config_wol(struct alx_hw *hw); + void alx_cfg_mac_flowcontrol(struct alx_hw *hw, u8 fc); + void alx_start_mac(struct alx_hw *hw); + int alx_reset_mac(struct alx_hw *hw); +@@ -563,6 +567,7 @@ bool alx_phy_configured(struct alx_hw *hw); + void alx_configure_basic(struct alx_hw *hw); + void alx_mask_msix(struct alx_hw *hw, int index, bool mask); + void alx_disable_rss(struct alx_hw *hw); ++int alx_select_powersaving_speed(struct alx_hw *hw, int *speed, u8 *duplex); + bool alx_get_phy_info(struct alx_hw *hw); + void alx_update_hw_stats(struct alx_hw *hw); + +diff --git a/drivers/net/ethernet/atheros/alx/main.c b/drivers/net/ethernet/atheros/alx/main.c +index 5e5022fa1d04..7adaf10a1929 100644 +--- a/drivers/net/ethernet/atheros/alx/main.c ++++ b/drivers/net/ethernet/atheros/alx/main.c +@@ -1070,6 +1070,7 @@ static int alx_init_sw(struct alx_priv *alx) + alx->dev->max_mtu = ALX_MAX_FRAME_LEN(ALX_MAX_FRAME_SIZE); + alx->tx_ringsz = 256; + alx->rx_ringsz = 512; ++ hw->sleep_ctrl = ALX_SLEEP_WOL_MAGIC | ALX_SLEEP_WOL_PHY; + hw->imt = 200; + alx->int_mask = ALX_ISR_MISC; + hw->dma_chnl = hw->max_dma_chnl; +@@ -1345,6 +1346,65 @@ static int alx_stop(struct net_device *netdev) + __alx_stop(netdev_priv(netdev)); + return 0; + } ++static int __alx_shutdown(struct pci_dev *pdev, bool *wol_en) ++{ ++ struct alx_priv *alx = pci_get_drvdata(pdev); ++ struct net_device *netdev = alx->dev; ++ struct alx_hw *hw = &alx->hw; ++ int err, speed; ++ u8 duplex; ++ ++ netif_device_detach(netdev); ++ ++ if (netif_running(netdev)) ++ __alx_stop(alx); ++ ++#ifdef CONFIG_PM_SLEEP ++ err = pci_save_state(pdev); ++ if (err) ++ return err; ++#endif ++ ++ err = alx_select_powersaving_speed(hw, &speed, &duplex); ++ if (err) ++ return err; ++ err = alx_clear_phy_intr(hw); ++ if (err) ++ return err; ++ err = alx_pre_suspend(hw, speed, duplex); ++ if (err) ++ return err; ++ err = alx_config_wol(hw); ++ if (err) ++ return err; ++ ++ *wol_en = false; ++ if (hw->sleep_ctrl & ALX_SLEEP_ACTIVE) { ++ netif_info(alx, wol, netdev, ++ "wol: ctrl=%X, 
speed=%X\n", ++ hw->sleep_ctrl, speed); ++ device_set_wakeup_enable(&pdev->dev, true); ++ *wol_en = true; ++ } ++ ++ pci_disable_device(pdev); ++ ++ return 0; ++} ++ ++static void alx_shutdown(struct pci_dev *pdev) ++{ ++ int err; ++ bool wol_en; ++ ++ err = __alx_shutdown(pdev, &wol_en); ++ if (!err) { ++ pci_wake_from_d3(pdev, wol_en); ++ pci_set_power_state(pdev, PCI_D3hot); ++ } else { ++ dev_err(&pdev->dev, "shutdown fail %d\n", err); ++ } ++} + + static void alx_link_check(struct work_struct *work) + { +@@ -1841,6 +1901,8 @@ static int alx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) + goto out_unmap; + } + ++ device_set_wakeup_enable(&pdev->dev, hw->sleep_ctrl); ++ + netdev_info(netdev, + "Qualcomm Atheros AR816x/AR817x Ethernet [%pM]\n", + netdev->dev_addr); +@@ -1883,12 +1945,21 @@ static void alx_remove(struct pci_dev *pdev) + static int alx_suspend(struct device *dev) + { + struct pci_dev *pdev = to_pci_dev(dev); +- struct alx_priv *alx = pci_get_drvdata(pdev); ++ int err; ++ bool wol_en; + +- if (!netif_running(alx->dev)) +- return 0; +- netif_device_detach(alx->dev); +- __alx_stop(alx); ++ err = __alx_shutdown(pdev, &wol_en); ++ if (err) { ++ dev_err(&pdev->dev, "shutdown fail in suspend %d\n", err); ++ return err; ++ } ++ ++ if (wol_en) { ++ pci_prepare_to_sleep(pdev); ++ } else { ++ pci_wake_from_d3(pdev, false); ++ pci_set_power_state(pdev, PCI_D3hot); ++ } + return 0; + } + +@@ -1896,26 +1967,49 @@ static int alx_resume(struct device *dev) + { + struct pci_dev *pdev = to_pci_dev(dev); + struct alx_priv *alx = pci_get_drvdata(pdev); +- struct alx_hw *hw = &alx->hw; +- int err; +- +- alx_reset_phy(hw); +- +- if (!netif_running(alx->dev)) +- return 0; +- netif_device_attach(alx->dev); +- +- rtnl_lock(); +- err = __alx_open(alx, true); +- rtnl_unlock(); +- ++ struct net_device *netdev = alx->dev; ++ struct alx_hw *hw = &alx->hw; ++ int err; ++ ++ pci_set_power_state(pdev, PCI_D0); ++ pci_restore_state(pdev); ++ pci_save_state(pdev); ++ ++ pci_enable_wake(pdev, PCI_D3hot, 0); ++ pci_enable_wake(pdev, PCI_D3cold, 0); ++ ++ hw->link_speed = SPEED_UNKNOWN; ++ alx->int_mask = ALX_ISR_MISC; ++ ++ alx_reset_pcie(hw); ++ alx_reset_phy(hw); ++ ++ err = alx_reset_mac(hw); ++ if (err) { ++ netif_err(alx, hw, alx->dev, ++ "resume:reset_mac fail %d\n", err); ++ return -EIO; ++ } ++ ++ err = alx_setup_speed_duplex(hw, hw->adv_cfg, hw->flowctrl); ++ if (err) { ++ netif_err(alx, hw, alx->dev, ++ "resume:setup_speed_duplex fail %d\n", err); ++ return -EIO; ++ } ++ ++ if (netif_running(netdev)) { ++ rtnl_lock(); ++ err = __alx_open(alx, true); ++ rtnl_unlock(); ++ if (err) ++ return err; ++ } ++ ++ netif_device_attach(netdev); + return err; + } + +-static SIMPLE_DEV_PM_OPS(alx_pm_ops, alx_suspend, alx_resume); +-#define ALX_PM_OPS (&alx_pm_ops) +-#else +-#define ALX_PM_OPS NULL + #endif + + +@@ -1961,6 +2055,8 @@ static pci_ers_result_t alx_pci_error_slot_reset(struct pci_dev *pdev) + } + + pci_set_master(pdev); ++ pci_enable_wake(pdev, PCI_D3hot, 0); ++ pci_enable_wake(pdev, PCI_D3cold, 0); + + alx_reset_pcie(hw); + if (!alx_reset_mac(hw)) +@@ -2012,11 +2108,19 @@ static const struct pci_device_id alx_pci_tbl[] = { + {} + }; + ++#ifdef CONFIG_PM_SLEEP ++static SIMPLE_DEV_PM_OPS(alx_pm_ops, alx_suspend, alx_resume); ++#define ALX_PM_OPS (&alx_pm_ops) ++#else ++#define ALX_PM_OPS NULL ++#endif ++ + static struct pci_driver alx_driver = { + .name = alx_drv_name, + .id_table = alx_pci_tbl, + .probe = alx_probe, + .remove = alx_remove, ++ .shutdown = alx_shutdown, + .err_handler = 
&alx_err_handlers,
+ .driver.pm = ALX_PM_OPS,
+ };
diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-linux-hardened.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-linux-hardened.patch
new file mode 100644
index 00000000..42ba2084
--- /dev/null
+++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-linux-hardened.patch
@@ -0,0 +1,2732 @@
+diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
+index f5acf35c712f..191e7eb6b9ce 100644
+--- a/Documentation/admin-guide/kernel-parameters.txt
++++ b/Documentation/admin-guide/kernel-parameters.txt
+@@ -496,16 +496,6 @@
+ nosocket -- Disable socket memory accounting.
+ nokmem -- Disable kernel memory accounting.
+
+- checkreqprot [SELINUX] Set initial checkreqprot flag value.
+- Format: { "0" | "1" }
+- See security/selinux/Kconfig help text.
+- 0 -- check protection applied by kernel (includes
+- any implied execute protection).
+- 1 -- check protection requested by application.
+- Default value is set via a kernel config option.
+- Value can be changed at runtime via
+- /selinux/checkreqprot.
+-
+ cio_ignore= [S390]
+ See Documentation/s390/CommonIO for details.
+ clk_ignore_unused
+@@ -3105,6 +3095,11 @@
+ the specified number of seconds. This is to be used if
+ your oopses keep scrolling off the screen.
+
++ extra_latent_entropy
++ Enable a very simple form of latent entropy extraction
++ from the first 4GB of memory as the bootmem allocator
++ passes the memory pages to the buddy allocator.
++
+ pcbit= [HW,ISDN]
+
+ pcd. [PARIDE]
+diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
+index 37a679501ddc..59b747920f4d 100644
+--- a/Documentation/sysctl/kernel.txt
++++ b/Documentation/sysctl/kernel.txt
+@@ -94,6 +94,7 @@ show up in /proc/sys/kernel:
+ - sysctl_writes_strict
+ - tainted
+ - threads-max
++- tiocsti_restrict
+ - unknown_nmi_panic
+ - watchdog
+ - watchdog_thresh
+@@ -1041,6 +1042,26 @@ available RAM pages threads-max is reduced accordingly.
+
+ ==============================================================
+
++tiocsti_restrict:
++
++This toggle indicates whether unprivileged users are prevented
++from using the TIOCSTI ioctl to inject commands into other processes
++which share a tty session.
++
++When tiocsti_restrict is set to (0) there are no restrictions (except
++ the default restriction of only being able to inject commands into
++ one's own tty). When tiocsti_restrict is set to (1), users must
++have CAP_SYS_ADMIN to use the TIOCSTI ioctl.
++
++When user namespaces are in use, the check for the capability
++CAP_SYS_ADMIN is done against the user namespace that originally
++opened the tty.
++
++The kernel config option CONFIG_SECURITY_TIOCSTI_RESTRICT sets the
++default value of tiocsti_restrict.
++
++==============================================================
++
+ unknown_nmi_panic:
+
+ The value in this file affects behavior of handling NMI.
When the +diff --git a/Makefile b/Makefile +index f1859811dca1..432040e2d299 100644 +--- a/Makefile ++++ b/Makefile +@@ -698,6 +698,9 @@ stackp-flags-$(CONFIG_STACKPROTECTOR_STRONG) := -fstack-protector-strong + KBUILD_CFLAGS += $(stackp-flags-y) + + ifeq ($(cc-name),clang) ++ifdef CONFIG_LOCAL_INIT ++KBUILD_CFLAGS += -fsanitize=local-init ++endif + KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) + KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier) + KBUILD_CFLAGS += $(call cc-disable-warning, gnu) +diff --git a/arch/Kconfig b/arch/Kconfig +index 6801123932a5..d331769f18cd 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -598,7 +598,7 @@ config ARCH_MMAP_RND_BITS + int "Number of bits to use for ASLR of mmap base address" if EXPERT + range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX + default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT +- default ARCH_MMAP_RND_BITS_MIN ++ default ARCH_MMAP_RND_BITS_MAX + depends on HAVE_ARCH_MMAP_RND_BITS + help + This value can be used to select the number of bits to use to +@@ -632,7 +632,7 @@ config ARCH_MMAP_RND_COMPAT_BITS + int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT + range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX + default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT +- default ARCH_MMAP_RND_COMPAT_BITS_MIN ++ default ARCH_MMAP_RND_COMPAT_BITS_MAX + depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS + help + This value can be used to select the number of bits to use to +@@ -837,6 +837,7 @@ config ARCH_HAS_REFCOUNT + + config REFCOUNT_FULL + bool "Perform full reference count validation at the expense of speed" ++ default y + help + Enabling this switches the refcounting infrastructure from a fast + unchecked atomic_t implementation to a fully state checked +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 1b1a0e95c751..2397d505747f 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -1013,6 +1013,7 @@ endif + + config ARM64_SW_TTBR0_PAN + bool "Emulate Privileged Access Never using TTBR0_EL1 switching" ++ default y + help + Enabling this option prevents the kernel from accessing + user-space memory directly by pointing TTBR0_EL1 to a reserved +@@ -1188,6 +1189,7 @@ config RANDOMIZE_BASE + bool "Randomize the address of the kernel image" + select ARM64_MODULE_PLTS if MODULES + select RELOCATABLE ++ default y + help + Randomizes the virtual address at which the kernel image is + loaded, as a security feature that deters exploit attempts +diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug +index 69c9170bdd24..a786227db0e3 100644 +--- a/arch/arm64/Kconfig.debug ++++ b/arch/arm64/Kconfig.debug +@@ -42,6 +42,7 @@ config ARM64_RANDOMIZE_TEXT_OFFSET + config DEBUG_WX + bool "Warn on W+X mappings at boot" + select ARM64_PTDUMP_CORE ++ default y + ---help--- + Generate a warning if any W+X mappings are found at boot. + +diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig +index db8d364f8476..67441db36c07 100644 +--- a/arch/arm64/configs/defconfig ++++ b/arch/arm64/configs/defconfig +@@ -1,4 +1,3 @@ +-CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_AUDIT=y + CONFIG_NO_HZ_IDLE=y +diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h +index 433b9554c6a1..1f4b06317c9f 100644 +--- a/arch/arm64/include/asm/elf.h ++++ b/arch/arm64/include/asm/elf.h +@@ -114,10 +114,10 @@ + + /* + * This is the base location for PIE (ET_DYN with INTERP) loads. 
On +- * 64-bit, this is above 4GB to leave the entire 32-bit address ++ * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * space open for things that want to use the area for 32-bit pointers. + */ +-#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) ++#define ELF_ET_DYN_BASE 0x100000000UL + + #ifndef __ASSEMBLY__ + +@@ -171,10 +171,10 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, + /* 1GB of VA */ + #ifdef CONFIG_COMPAT + #define STACK_RND_MASK (test_thread_flag(TIF_32BIT) ? \ +- 0x7ff >> (PAGE_SHIFT - 12) : \ +- 0x3ffff >> (PAGE_SHIFT - 12)) ++ ((1UL << mmap_rnd_compat_bits) - 1) >> (PAGE_SHIFT - 12) : \ ++ ((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) + #else +-#define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12)) ++#define STACK_RND_MASK (((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) + #endif + + #ifdef __AARCH64EB__ +diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c +index 7f1628effe6d..38bd2f95a961 100644 +--- a/arch/arm64/kernel/process.c ++++ b/arch/arm64/kernel/process.c +@@ -481,9 +481,9 @@ unsigned long arch_align_stack(unsigned long sp) + unsigned long arch_randomize_brk(struct mm_struct *mm) + { + if (is_compat_task()) +- return randomize_page(mm->brk, SZ_32M); ++ return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; + else +- return randomize_page(mm->brk, SZ_1G); ++ return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; + } + + /* +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 44c6a82b7ce5..62aba195aae8 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1189,8 +1189,7 @@ config VM86 + default X86_LEGACY_VM86 + + config X86_16BIT +- bool "Enable support for 16-bit segments" if EXPERT +- default y ++ bool "Enable support for 16-bit segments" + depends on MODIFY_LDT_SYSCALL + ---help--- + This option is required by programs like Wine to run 16-bit +@@ -2280,7 +2279,7 @@ config COMPAT_VDSO + choice + prompt "vsyscall table for legacy applications" + depends on X86_64 +- default LEGACY_VSYSCALL_EMULATE ++ default LEGACY_VSYSCALL_NONE + help + Legacy user code that does not know how to find the vDSO expects + to be able to issue three syscalls by calling fixed addresses in +@@ -2361,8 +2360,7 @@ config CMDLINE_OVERRIDE + be set to 'N' under normal conditions. + + config MODIFY_LDT_SYSCALL +- bool "Enable the LDT (local descriptor table)" if EXPERT +- default y ++ bool "Enable the LDT (local descriptor table)" + ---help--- + Linux can allow user programs to install a per-process x86 + Local Descriptor Table (LDT) using the modify_ldt(2) system +diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug +index 7d68f0c7cfb1..85f04bbeadd8 100644 +--- a/arch/x86/Kconfig.debug ++++ b/arch/x86/Kconfig.debug +@@ -101,6 +101,7 @@ config EFI_PGT_DUMP + config DEBUG_WX + bool "Warn on W+X mappings at boot" + select X86_PTDUMP_CORE ++ default y + ---help--- + Generate a warning if any W+X mappings are found at boot. 
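The arch_randomize_brk() hunks above (and their x86 counterparts further down) replace randomize_page() with an explicit base + get_random_long() % window + PAGE_SIZE computation: the heap base is shifted by a uniform random offset within a fixed window (SZ_32M for compat tasks, SZ_1G for native 64-bit), plus one page so the result never equals the unrandomized base. The userspace sketch below shows the same arithmetic; it is illustrative only and not part of the patch — rand() stands in for the kernel's get_random_long(), the base address is hypothetical, and a 4 KiB page size is assumed.

/* Sketch of the hardened brk randomization arithmetic; illustrative only. */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL               /* assumed page size */
#define SZ_32M (32UL * 1024 * 1024)    /* window for compat tasks */
#define SZ_1G (1024UL * 1024 * 1024)   /* window for native 64-bit tasks */

/*
 * Mirrors the patched arch_randomize_brk(): base + random % window +
 * PAGE_SIZE. rand() is only a stand-in for the kernel's get_random_long().
 */
static unsigned long randomize_brk(unsigned long base, unsigned long window)
{
    unsigned long rnd = ((unsigned long)rand() << 31) ^ (unsigned long)rand();

    return base + rnd % window + PAGE_SIZE;
}

int main(void)
{
    unsigned long base = 0x555555560000UL; /* hypothetical brk base */

    srand(42);
    printf("native 64-bit: %#lx\n", randomize_brk(base, SZ_1G));
    printf("compat/32-bit: %#lx\n", randomize_brk(base, SZ_32M));
    return 0;
}

The unconditional + PAGE_SIZE term is the visible behavioural difference from randomize_page(), which could leave the base unshifted when the random draw was zero.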
+ +diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig +index e32fc1f274d8..d08acc76502a 100644 +--- a/arch/x86/configs/x86_64_defconfig ++++ b/arch/x86/configs/x86_64_defconfig +@@ -1,5 +1,4 @@ + # CONFIG_LOCALVERSION_AUTO is not set +-CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_TASKSTATS=y +diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c +index 5b8b556dbb12..a569f08b4478 100644 +--- a/arch/x86/entry/vdso/vma.c ++++ b/arch/x86/entry/vdso/vma.c +@@ -204,55 +204,9 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) + } + + #ifdef CONFIG_X86_64 +-/* +- * Put the vdso above the (randomized) stack with another randomized +- * offset. This way there is no hole in the middle of address space. +- * To save memory make sure it is still in the same PTE as the stack +- * top. This doesn't give that many random bits. +- * +- * Note that this algorithm is imperfect: the distribution of the vdso +- * start address within a PMD is biased toward the end. +- * +- * Only used for the 64-bit and x32 vdsos. +- */ +-static unsigned long vdso_addr(unsigned long start, unsigned len) +-{ +- unsigned long addr, end; +- unsigned offset; +- +- /* +- * Round up the start address. It can start out unaligned as a result +- * of stack start randomization. +- */ +- start = PAGE_ALIGN(start); +- +- /* Round the lowest possible end address up to a PMD boundary. */ +- end = (start + len + PMD_SIZE - 1) & PMD_MASK; +- if (end >= TASK_SIZE_MAX) +- end = TASK_SIZE_MAX; +- end -= len; +- +- if (end > start) { +- offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); +- addr = start + (offset << PAGE_SHIFT); +- } else { +- addr = start; +- } +- +- /* +- * Forcibly align the final address in case we have a hardware +- * issue that requires alignment for performance reasons. +- */ +- addr = align_vdso_addr(addr); +- +- return addr; +-} +- + static int map_vdso_randomized(const struct vdso_image *image) + { +- unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start); +- +- return map_vdso(image, addr); ++ return map_vdso(image, 0); + } + #endif + +diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h +index 0d157d2a1e2a..770c8ae97f92 100644 +--- a/arch/x86/include/asm/elf.h ++++ b/arch/x86/include/asm/elf.h +@@ -249,11 +249,11 @@ extern int force_personality32; + + /* + * This is the base location for PIE (ET_DYN with INTERP) loads. On +- * 64-bit, this is above 4GB to leave the entire 32-bit address ++ * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * space open for things that want to use the area for 32-bit pointers. + */ + #define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ +- (DEFAULT_MAP_WINDOW / 3 * 2)) ++ 0x100000000UL) + + /* This yields a mask that user programs can use to figure out what + instruction set this CPU supports. This could be done in user space, +@@ -313,8 +313,8 @@ extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len); + + #ifdef CONFIG_X86_32 + +-#define __STACK_RND_MASK(is32bit) (0x7ff) +-#define STACK_RND_MASK (0x7ff) ++#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) ++#define STACK_RND_MASK ((1UL << mmap_rnd_bits) - 1) + + #define ARCH_DLINFO ARCH_DLINFO_IA32 + +@@ -323,7 +323,11 @@ extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len); + #else /* CONFIG_X86_32 */ + + /* 1GB for 64bit, 8MB for 32bit */ +-#define __STACK_RND_MASK(is32bit) ((is32bit) ? 
0x7ff : 0x3fffff) ++#ifdef CONFIG_COMPAT ++#define __STACK_RND_MASK(is32bit) ((is32bit) ? (1UL << mmap_rnd_compat_bits) - 1 : (1UL << mmap_rnd_bits) - 1) ++#else ++#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) ++#endif + #define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32()) + + #define ARCH_DLINFO \ +@@ -381,5 +385,4 @@ struct va_alignment { + } ____cacheline_aligned; + + extern struct va_alignment va_align; +-extern unsigned long align_vdso_addr(unsigned long); + #endif /* _ASM_X86_ELF_H */ +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 79ec7add5f98..2950448e00ac 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -310,6 +310,7 @@ static inline void cr4_set_bits(unsigned long mask) + + local_irq_save(flags); + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + if ((cr4 | mask) != cr4) + __cr4_set(cr4 | mask); + local_irq_restore(flags); +@@ -322,6 +323,7 @@ static inline void cr4_clear_bits(unsigned long mask) + + local_irq_save(flags); + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + if ((cr4 & ~mask) != cr4) + __cr4_set(cr4 & ~mask); + local_irq_restore(flags); +@@ -332,6 +334,7 @@ static inline void cr4_toggle_bits_irqsoff(unsigned long mask) + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + __cr4_set(cr4 ^ mask); + } + +@@ -438,6 +441,7 @@ static inline void __native_flush_tlb_global(void) + raw_local_irq_save(flags); + + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + /* toggle PGE */ + native_write_cr4(cr4 ^ X86_CR4_PGE); + /* write old PGE again and flush TLBs */ +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 44c4ef3d989b..05943ca7b59a 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1730,7 +1730,6 @@ void cpu_init(void) + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); + +- x86_configure_nx(); + x2apic_setup(); + + /* +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index 7d31192296a8..4f87550d814c 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -39,6 +39,8 @@ + #include + #include + #include ++#include ++#include + + #include "process.h" + +@@ -779,7 +781,10 @@ unsigned long arch_align_stack(unsigned long sp) + + unsigned long arch_randomize_brk(struct mm_struct *mm) + { +- return randomize_page(mm->brk, 0x02000000); ++ if (mmap_is_ia32()) ++ return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; ++ else ++ return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; + } + + /* +diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c +index 6a78d4b36a79..715009f7a96c 100644 +--- a/arch/x86/kernel/sys_x86_64.c ++++ b/arch/x86/kernel/sys_x86_64.c +@@ -54,13 +54,6 @@ static unsigned long get_align_bits(void) + return va_align.bits & get_align_mask(); + } + +-unsigned long align_vdso_addr(unsigned long addr) +-{ +- unsigned long align_mask = get_align_mask(); +- addr = (addr + align_mask) & ~align_mask; +- return addr | get_align_bits(); +-} +- + static int __init control_va_addr_alignment(char *str) + { + /* guard against enabling this on other CPU families */ +@@ -122,10 +115,7 @@ static void find_start_end(unsigned long addr, unsigned long flags, + } + + *begin = get_mmap_base(1); +- if (in_compat_syscall()) +- *end = task_size_32bit(); +- else +- *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW); ++ *end = get_mmap_base(0); + } + + unsigned 
long +@@ -210,7 +200,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; +- info.low_limit = PAGE_SIZE; ++ info.low_limit = get_mmap_base(1); + info.high_limit = get_mmap_base(0); + + /* +diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c +index 979e0a02cbe1..d6ab882a0091 100644 +--- a/arch/x86/mm/init_32.c ++++ b/arch/x86/mm/init_32.c +@@ -560,9 +560,9 @@ static void __init pagetable_init(void) + + #define DEFAULT_PTE_MASK ~(_PAGE_NX | _PAGE_GLOBAL) + /* Bits supported by the hardware: */ +-pteval_t __supported_pte_mask __read_mostly = DEFAULT_PTE_MASK; ++pteval_t __supported_pte_mask __ro_after_init = DEFAULT_PTE_MASK; + /* Bits allowed in normal kernel mappings: */ +-pteval_t __default_kernel_pte_mask __read_mostly = DEFAULT_PTE_MASK; ++pteval_t __default_kernel_pte_mask __ro_after_init = DEFAULT_PTE_MASK; + EXPORT_SYMBOL_GPL(__supported_pte_mask); + /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ + EXPORT_SYMBOL(__default_kernel_pte_mask); +@@ -873,7 +873,7 @@ int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) + #endif + #endif + +-int kernel_set_to_readonly __read_mostly; ++int kernel_set_to_readonly __ro_after_init; + + void set_kernel_text_rw(void) + { +@@ -925,12 +925,11 @@ void mark_rodata_ro(void) + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; + ++ kernel_set_to_readonly = 1; + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel text: %luk\n", + size >> 10); + +- kernel_set_to_readonly = 1; +- + #ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", + start, start+size); +diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c +index a3e9c6ee3cf2..40bbcd978b0a 100644 +--- a/arch/x86/mm/init_64.c ++++ b/arch/x86/mm/init_64.c +@@ -66,9 +66,9 @@ + */ + + /* Bits supported by the hardware: */ +-pteval_t __supported_pte_mask __read_mostly = ~0; ++pteval_t __supported_pte_mask __ro_after_init = ~0; + /* Bits allowed in normal kernel mappings: */ +-pteval_t __default_kernel_pte_mask __read_mostly = ~0; ++pteval_t __default_kernel_pte_mask __ro_after_init = ~0; + EXPORT_SYMBOL_GPL(__supported_pte_mask); + /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ + EXPORT_SYMBOL(__default_kernel_pte_mask); +@@ -1201,7 +1201,7 @@ void __init mem_init(void) + mem_init_print_info(NULL); + } + +-int kernel_set_to_readonly; ++int kernel_set_to_readonly __ro_after_init; + + void set_kernel_text_rw(void) + { +@@ -1250,9 +1250,8 @@ void mark_rodata_ro(void) + + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", + (end - start) >> 10); +- set_memory_ro(start, (end - start) >> PAGE_SHIFT); +- + kernel_set_to_readonly = 1; ++ set_memory_ro(start, (end - start) >> PAGE_SHIFT); + + /* + * The rodata/data/bss/brk section (but not the kernel text!) +diff --git a/block/blk-softirq.c b/block/blk-softirq.c +index 15c1f5e12eb8..ff72cccec5b8 100644 +--- a/block/blk-softirq.c ++++ b/block/blk-softirq.c +@@ -20,7 +20,7 @@ static DEFINE_PER_CPU(struct list_head, blk_cpu_done); + * Softirq action handler - move entries to local list and loop over them + * while passing them to the queue registered handler. 
+ */ +-static __latent_entropy void blk_done_softirq(struct softirq_action *h) ++static __latent_entropy void blk_done_softirq(void) + { + struct list_head *cpu_list, local_list; + +diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c +index b8c3f9e6af89..bf65bc091cb6 100644 +--- a/drivers/ata/libata-core.c ++++ b/drivers/ata/libata-core.c +@@ -5157,7 +5157,7 @@ void ata_qc_free(struct ata_queued_cmd *qc) + struct ata_port *ap; + unsigned int tag; + +- WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ ++ BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ + ap = qc->ap; + + qc->flags = 0; +@@ -5174,7 +5174,7 @@ void __ata_qc_complete(struct ata_queued_cmd *qc) + struct ata_port *ap; + struct ata_link *link; + +- WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ ++ BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ + WARN_ON_ONCE(!(qc->flags & ATA_QCFLAG_ACTIVE)); + ap = qc->ap; + link = qc->dev->link; +diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig +index 40728491f37b..b4f3ccfa2993 100644 +--- a/drivers/char/Kconfig ++++ b/drivers/char/Kconfig +@@ -9,7 +9,6 @@ source "drivers/tty/Kconfig" + + config DEVMEM + bool "/dev/mem virtual device support" +- default y + help + Say Y here if you want to support the /dev/mem device. + The /dev/mem device is used to access areas of physical +@@ -531,7 +530,6 @@ config TELCLOCK + config DEVPORT + bool "/dev/port character device" + depends on ISA || PCI +- default y + help + Say Y here if you want to support the /dev/port device. The /dev/port + device is similar to /dev/mem, but for I/O ports. +diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig +index 0840d27381ea..ae292fcedaca 100644 +--- a/drivers/tty/Kconfig ++++ b/drivers/tty/Kconfig +@@ -122,7 +122,6 @@ config UNIX98_PTYS + + config LEGACY_PTYS + bool "Legacy (BSD) PTY support" +- default y + ---help--- + A pseudo terminal (PTY) is a software device consisting of two + halves: a master and a slave. 
The slave device behaves identical to +diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c +index e7d192ebecd7..1c682abd31ca 100644 +--- a/drivers/tty/tty_io.c ++++ b/drivers/tty/tty_io.c +@@ -172,6 +172,7 @@ static void free_tty_struct(struct tty_struct *tty) + put_device(tty->dev); + kfree(tty->write_buf); + tty->magic = 0xDEADDEAD; ++ put_user_ns(tty->owner_user_ns); + kfree(tty); + } + +@@ -2175,11 +2176,19 @@ static int tty_fasync(int fd, struct file *filp, int on) + * FIXME: may race normal receive processing + */ + ++int tiocsti_restrict = IS_ENABLED(CONFIG_SECURITY_TIOCSTI_RESTRICT); ++ + static int tiocsti(struct tty_struct *tty, char __user *p) + { + char ch, mbz = 0; + struct tty_ldisc *ld; + ++ if (tiocsti_restrict && ++ !ns_capable(tty->owner_user_ns, CAP_SYS_ADMIN)) { ++ dev_warn_ratelimited(tty->dev, ++ "Denied TIOCSTI ioctl for non-privileged process\n"); ++ return -EPERM; ++ } + if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + if (get_user(ch, p)) +@@ -2863,6 +2872,7 @@ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx) + tty->index = idx; + tty_line_name(driver, idx, tty->name); + tty->dev = tty_get_device(tty); ++ tty->owner_user_ns = get_user_ns(current_user_ns()); + + return tty; + } +diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c +index cc62707c0251..21d78ae4b4ae 100644 +--- a/drivers/usb/core/hub.c ++++ b/drivers/usb/core/hub.c +@@ -41,6 +41,8 @@ + #define USB_TP_TRANSMISSION_DELAY 40 /* ns */ + #define USB_TP_TRANSMISSION_DELAY_MAX 65535 /* ns */ + ++extern int deny_new_usb; ++ + /* Protect struct usb_device->state and ->children members + * Note: Both are also protected by ->dev.sem, except that ->state can + * change to USB_STATE_NOTATTACHED even when the semaphore isn't held. 
*/ +@@ -4933,6 +4935,12 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, + goto done; + return; + } ++ ++ if (deny_new_usb) { ++ dev_err(&port_dev->dev, "denied insert of USB device on port %d\n", port1); ++ goto done; ++ } ++ + if (hub_is_superspeed(hub->hdev)) + unit_load = 150; + else +diff --git a/fs/exec.c b/fs/exec.c +index 1ebf6e5a521d..73b8d839927c 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -62,6 +62,7 @@ + #include + #include + #include ++#include <linux/random.h> + + #include + #include +@@ -320,6 +321,8 @@ static int __bprm_mm_init(struct linux_binprm *bprm) + arch_bprm_mm_init(mm, vma); + up_write(&mm->mmap_sem); + bprm->p = vma->vm_end - sizeof(void *); ++ if (randomize_va_space) ++ bprm->p ^= get_random_int() & ~PAGE_MASK; + return 0; + err: + up_write(&mm->mmap_sem); +diff --git a/fs/namei.c b/fs/namei.c +index 914178cdbe94..7422b5ce077a 100644 +--- a/fs/namei.c ++++ b/fs/namei.c +@@ -885,10 +885,10 @@ static inline void put_link(struct nameidata *nd) + path_put(&last->link); + } + +-int sysctl_protected_symlinks __read_mostly = 0; +-int sysctl_protected_hardlinks __read_mostly = 0; +-int sysctl_protected_fifos __read_mostly; +-int sysctl_protected_regular __read_mostly; ++int sysctl_protected_symlinks __read_mostly = 1; ++int sysctl_protected_hardlinks __read_mostly = 1; ++int sysctl_protected_fifos __read_mostly = 2; ++int sysctl_protected_regular __read_mostly = 2; + + /** + * may_follow_link - Check symlink following for unsafe situations +diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig +index 5f93cfacb3d1..cea0d7d3b23e 100644 +--- a/fs/nfs/Kconfig ++++ b/fs/nfs/Kconfig +@@ -195,4 +195,3 @@ config NFS_DEBUG + bool + depends on NFS_FS && SUNRPC_DEBUG + select CRC32 +- default y +diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig +index 817c02b13b1d..b8cd62b5cbc3 100644 +--- a/fs/proc/Kconfig ++++ b/fs/proc/Kconfig +@@ -40,7 +40,6 @@ config PROC_KCORE + config PROC_VMCORE + bool "/proc/vmcore support" + depends on PROC_FS && CRASH_DUMP +- default y + help + Exports the dump image of crashed kernel in ELF format.
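The fs/namei.c hunk above turns the link-protection sysctls on by default: symlink and hardlink protection go to 1, and the fifo/regular protections to their strictest value, 2. A minimal userspace sketch, not part of the patch itself, that reads the standard procfs knobs to confirm the new defaults on a booted kernel:

/* check_protected.c - illustrative only; build with: gcc -O2 -o check_protected check_protected.c */
#include <stdio.h>

int main(void)
{
	static const char *knobs[] = {
		"/proc/sys/fs/protected_symlinks",  /* patched default: 1 */
		"/proc/sys/fs/protected_hardlinks", /* patched default: 1 */
		"/proc/sys/fs/protected_fifos",     /* patched default: 2 */
		"/proc/sys/fs/protected_regular",   /* patched default: 2 */
	};
	for (int i = 0; i < 4; i++) {
		FILE *f = fopen(knobs[i], "r");
		int val = -1;

		if (!f || fscanf(f, "%d", &val) != 1)
			val = -1;	/* unreadable, or a pre-patch kernel */
		if (f)
			fclose(f);
		printf("%-35s = %d\n", knobs[i], val);
	}
	return 0;
}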
+ +diff --git a/fs/stat.c b/fs/stat.c +index f8e6fb2c3657..240c1432e18f 100644 +--- a/fs/stat.c ++++ b/fs/stat.c +@@ -40,8 +40,13 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) + stat->gid = inode->i_gid; + stat->rdev = inode->i_rdev; + stat->size = i_size_read(inode); +- stat->atime = inode->i_atime; +- stat->mtime = inode->i_mtime; ++ if (is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) { ++ stat->atime = inode->i_ctime; ++ stat->mtime = inode->i_ctime; ++ } else { ++ stat->atime = inode->i_atime; ++ stat->mtime = inode->i_mtime; ++ } + stat->ctime = inode->i_ctime; + stat->blksize = i_blocksize(inode); + stat->blocks = inode->i_blocks; +@@ -75,9 +80,14 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, + stat->result_mask |= STATX_BASIC_STATS; + request_mask &= STATX_ALL; + query_flags &= KSTAT_QUERY_FLAGS; +- if (inode->i_op->getattr) +- return inode->i_op->getattr(path, stat, request_mask, +- query_flags); ++ if (inode->i_op->getattr) { ++ int retval = inode->i_op->getattr(path, stat, request_mask, query_flags); ++ if (!retval && is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) { ++ stat->atime = stat->ctime; ++ stat->mtime = stat->ctime; ++ } ++ return retval; ++ } + + generic_fillattr(inode, stat); + return 0; +diff --git a/include/linux/cache.h b/include/linux/cache.h +index 750621e41d1c..e7157c18c62c 100644 +--- a/include/linux/cache.h ++++ b/include/linux/cache.h +@@ -31,6 +31,8 @@ + #define __ro_after_init __attribute__((__section__(".data..ro_after_init"))) + #endif + ++#define __read_only __ro_after_init ++ + #ifndef ____cacheline_aligned + #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) + #endif +diff --git a/include/linux/capability.h b/include/linux/capability.h +index f640dcbc880c..2b4f5d651f19 100644 +--- a/include/linux/capability.h ++++ b/include/linux/capability.h +@@ -207,6 +207,7 @@ extern bool has_capability_noaudit(struct task_struct *t, int cap); + extern bool has_ns_capability_noaudit(struct task_struct *t, + struct user_namespace *ns, int cap); + extern bool capable(int cap); ++extern bool capable_noaudit(int cap); + extern bool ns_capable(struct user_namespace *ns, int cap); + extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); + #else +@@ -232,6 +233,10 @@ static inline bool capable(int cap) + { + return true; + } ++static inline bool capable_noaudit(int cap) ++{ ++ return true; ++} + static inline bool ns_capable(struct user_namespace *ns, int cap) + { + return true; +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 7b6084854bfe..cee4467da4a7 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -3456,4 +3456,15 @@ extern void inode_nohighmem(struct inode *inode); + extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, + int advice); + ++extern int device_sidechannel_restrict; ++ ++static inline bool is_sidechannel_device(const struct inode *inode) ++{ ++ umode_t mode; ++ if (!device_sidechannel_restrict) ++ return false; ++ mode = inode->i_mode; ++ return ((S_ISCHR(mode) || S_ISBLK(mode)) && (mode & (S_IROTH | S_IWOTH))); ++} ++ + #endif /* _LINUX_FS_H */ +diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h +index fd1ce10553bf..1905d2476d32 100644 +--- a/include/linux/fsnotify.h ++++ b/include/linux/fsnotify.h +@@ -177,6 +177,9 @@ static inline void fsnotify_access(struct file *file) + struct inode *inode = file_inode(file); + __u32 mask = FS_ACCESS; + ++ if (is_sidechannel_device(inode)) ++ return; ++ + if 
(S_ISDIR(inode->i_mode)) + mask |= FS_ISDIR; + +@@ -195,6 +198,9 @@ static inline void fsnotify_modify(struct file *file) + struct inode *inode = file_inode(file); + __u32 mask = FS_MODIFY; + ++ if (is_sidechannel_device(inode)) ++ return; ++ + if (S_ISDIR(inode->i_mode)) + mask |= FS_ISDIR; + +diff --git a/include/linux/gfp.h b/include/linux/gfp.h +index 24bcc5eec6b4..b1cdfc350596 100644 +--- a/include/linux/gfp.h ++++ b/include/linux/gfp.h +@@ -530,9 +530,9 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, + extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); + extern unsigned long get_zeroed_page(gfp_t gfp_mask); + +-void *alloc_pages_exact(size_t size, gfp_t gfp_mask); ++void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __attribute__((alloc_size(1))); + void free_pages_exact(void *virt, size_t size); +-void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); ++void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __attribute__((alloc_size(2))); + + #define __get_free_page(gfp_mask) \ + __get_free_pages((gfp_mask), 0) +diff --git a/include/linux/highmem.h b/include/linux/highmem.h +index 0690679832d4..b9394bc86fad 100644 +--- a/include/linux/highmem.h ++++ b/include/linux/highmem.h +@@ -191,6 +191,13 @@ static inline void clear_highpage(struct page *page) + kunmap_atomic(kaddr); + } + ++static inline void verify_zero_highpage(struct page *page) ++{ ++ void *kaddr = kmap_atomic(page); ++ BUG_ON(memchr_inv(kaddr, 0, PAGE_SIZE)); ++ kunmap_atomic(kaddr); ++} ++ + static inline void zero_user_segments(struct page *page, + unsigned start1, unsigned end1, + unsigned start2, unsigned end2) +diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h +index eeceac3376fc..78ad558bce5f 100644 +--- a/include/linux/interrupt.h ++++ b/include/linux/interrupt.h +@@ -490,7 +490,7 @@ extern const char * const softirq_to_name[NR_SOFTIRQS]; + + struct softirq_action + { +- void (*action)(struct softirq_action *); ++ void (*action)(void); + }; + + asmlinkage void do_softirq(void); +@@ -505,7 +505,7 @@ static inline void do_softirq_own_stack(void) + } + #endif + +-extern void open_softirq(int nr, void (*action)(struct softirq_action *)); ++extern void __init open_softirq(int nr, void (*action)(void)); + extern void softirq_init(void); + extern void __raise_softirq_irqoff(unsigned int nr); + +diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h +index 069aa2ebef90..cb9e3637a620 100644 +--- a/include/linux/kobject_ns.h ++++ b/include/linux/kobject_ns.h +@@ -45,7 +45,7 @@ struct kobj_ns_type_operations { + void (*drop_ns)(void *); + }; + +-int kobj_ns_type_register(const struct kobj_ns_type_operations *ops); ++int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops); + int kobj_ns_type_registered(enum kobj_ns_type type); + const struct kobj_ns_type_operations *kobj_child_ns_ops(struct kobject *parent); + const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj); +diff --git a/include/linux/mm.h b/include/linux/mm.h +index e899460f1bc5..bca0cbed3269 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -571,7 +571,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) + } + #endif + +-extern void *kvmalloc_node(size_t size, gfp_t flags, int node); ++extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __attribute__((alloc_size(1))); + static inline void *kvmalloc(size_t size, gfp_t flags) + { + return kvmalloc_node(size, flags, NUMA_NO_NODE); 
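The kvmalloc_node() annotation above and the percpu, slab and vmalloc prototypes that follow all gain __attribute__((alloc_size(n))), telling the compiler which parameter carries the size of the returned buffer. A standalone sketch of the effect (my_alloc is a hypothetical stand-in, not a kernel function): with GCC at -O2 the printf reports 32, and dropping the attribute turns the result into (size_t)-1, i.e. unknown. This is exactly the primitive the fortified string.h helpers later in this patch query via __builtin_object_size():

/* alloc_size_demo.c - illustrative only; build with: gcc -O2 alloc_size_demo.c */
#include <stdio.h>
#include <stdlib.h>

/* noinline keeps the demo honest: without it GCC could see through
 * the call at -O2 and learn the size from malloc() itself. */
static void *my_alloc(size_t n) __attribute__((noinline, alloc_size(1)));
static void *my_alloc(size_t n)
{
	return malloc(n);
}

int main(void)
{
	char *p = my_alloc(32);

	/* Known at compile time only because of alloc_size(1). */
	printf("compiler-visible object size: %zu\n",
	       __builtin_object_size(p, 0));
	free(p);
	return 0;
}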
+diff --git a/include/linux/percpu.h b/include/linux/percpu.h +index 70b7123f38c7..09f3019489b2 100644 +--- a/include/linux/percpu.h ++++ b/include/linux/percpu.h +@@ -129,7 +129,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_populate_pte_fn_t populate_pte_fn); + #endif + +-extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align); ++extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __attribute__((alloc_size(1))); + extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr); + extern bool is_kernel_percpu_address(unsigned long addr); + +@@ -137,8 +137,8 @@ extern bool is_kernel_percpu_address(unsigned long addr); + extern void __init setup_per_cpu_areas(void); + #endif + +-extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); +-extern void __percpu *__alloc_percpu(size_t size, size_t align); ++extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __attribute__((alloc_size(1))); ++extern void __percpu *__alloc_percpu(size_t size, size_t align) __attribute__((alloc_size(1))); + extern void free_percpu(void __percpu *__pdata); + extern phys_addr_t per_cpu_ptr_to_phys(void *addr); + +diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h +index 53c500f0ca79..15c236b8aba3 100644 +--- a/include/linux/perf_event.h ++++ b/include/linux/perf_event.h +@@ -1179,6 +1179,11 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, + int perf_event_max_stack_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + ++static inline bool perf_paranoid_any(void) ++{ ++ return sysctl_perf_event_paranoid > 2; ++} ++ + static inline bool perf_paranoid_tracepoint_raw(void) + { + return sysctl_perf_event_paranoid > -1; +diff --git a/include/linux/slab.h b/include/linux/slab.h +index ed9cbddeb4a6..e76e18c7165f 100644 +--- a/include/linux/slab.h ++++ b/include/linux/slab.h +@@ -178,8 +178,8 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *); + /* + * Common kmalloc functions provided by all allocators + */ +-void * __must_check __krealloc(const void *, size_t, gfp_t); +-void * __must_check krealloc(const void *, size_t, gfp_t); ++void * __must_check __krealloc(const void *, size_t, gfp_t) __attribute__((alloc_size(2))); ++void * __must_check krealloc(const void *, size_t, gfp_t) __attribute__((alloc_size(2))); + void kfree(const void *); + void kzfree(const void *); + size_t ksize(const void *); +@@ -352,7 +352,7 @@ static __always_inline unsigned int kmalloc_index(size_t size) + } + #endif /* !CONFIG_SLOB */ + +-void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc; ++void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc __attribute__((alloc_size(1))); + void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc; + void kmem_cache_free(struct kmem_cache *, void *); + +@@ -376,7 +376,7 @@ static __always_inline void kfree_bulk(size_t size, void **p) + } + + #ifdef CONFIG_NUMA +-void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc; ++void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc __attribute__((alloc_size(1))); + void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc; + #else + static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node) +@@ -498,7 +498,7 @@
static __always_inline void *kmalloc_large(size_t size, gfp_t flags) + * for general use, and so are not documented here. For a full list of + * potential flags, always refer to linux/gfp.h. + */ +-static __always_inline void *kmalloc(size_t size, gfp_t flags) ++static __always_inline __attribute__((alloc_size(1))) void *kmalloc(size_t size, gfp_t flags) + { + if (__builtin_constant_p(size)) { + if (size > KMALLOC_MAX_CACHE_SIZE) +@@ -538,7 +538,7 @@ static __always_inline unsigned int kmalloc_size(unsigned int n) + return 0; + } + +-static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) ++static __always_inline __attribute__((alloc_size(1))) void *kmalloc_node(size_t size, gfp_t flags, int node) + { + #ifndef CONFIG_SLOB + if (__builtin_constant_p(size) && +diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h +index 3a1a1dbc6f49..ff38fec9eb76 100644 +--- a/include/linux/slub_def.h ++++ b/include/linux/slub_def.h +@@ -121,6 +121,11 @@ struct kmem_cache { + unsigned long random; + #endif + ++#ifdef CONFIG_SLAB_CANARY ++ unsigned long random_active; ++ unsigned long random_inactive; ++#endif ++ + #ifdef CONFIG_NUMA + /* + * Defragmentation by allocating from a remote node. +diff --git a/include/linux/string.h b/include/linux/string.h +index 4a5a0eb7df51..be86cf21d0ce 100644 +--- a/include/linux/string.h ++++ b/include/linux/string.h +@@ -235,10 +235,16 @@ void __read_overflow2(void) __compiletime_error("detected read beyond size of ob + void __read_overflow3(void) __compiletime_error("detected read beyond size of object passed as 3rd parameter"); + void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter"); + ++#ifdef CONFIG_FORTIFY_SOURCE_STRICT_STRING ++#define __string_size(p) __builtin_object_size(p, 1) ++#else ++#define __string_size(p) __builtin_object_size(p, 0) ++#endif ++ + #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) + __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) + { +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + if (__builtin_constant_p(size) && p_size < size) + __write_overflow(); + if (p_size < size) +@@ -248,7 +254,7 @@ __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) + + __FORTIFY_INLINE char *strcat(char *p, const char *q) + { +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + if (p_size == (size_t)-1) + return __builtin_strcat(p, q); + if (strlcat(p, q, p_size) >= p_size) +@@ -259,7 +265,7 @@ __FORTIFY_INLINE char *strcat(char *p, const char *q) + __FORTIFY_INLINE __kernel_size_t strlen(const char *p) + { + __kernel_size_t ret; +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + + /* Work around gcc excess stack consumption issue */ + if (p_size == (size_t)-1 || +@@ -274,7 +280,7 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) + extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); + __FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen) + { +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? 
maxlen : p_size); + if (p_size <= ret && maxlen != ret) + fortify_panic(__func__); +@@ -286,8 +292,8 @@ extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); + __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) + { + size_t ret; +- size_t p_size = __builtin_object_size(p, 0); +- size_t q_size = __builtin_object_size(q, 0); ++ size_t p_size = __string_size(p); ++ size_t q_size = __string_size(q); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __real_strlcpy(p, q, size); + ret = strlen(q); +@@ -307,8 +313,8 @@ __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) + __FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) + { + size_t p_len, copy_len; +- size_t p_size = __builtin_object_size(p, 0); +- size_t q_size = __builtin_object_size(q, 0); ++ size_t p_size = __string_size(p); ++ size_t q_size = __string_size(q); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __builtin_strncat(p, q, count); + p_len = strlen(p); +@@ -421,8 +427,8 @@ __FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp) + /* defined after fortified strlen and memcpy to reuse them */ + __FORTIFY_INLINE char *strcpy(char *p, const char *q) + { +- size_t p_size = __builtin_object_size(p, 0); +- size_t q_size = __builtin_object_size(q, 0); ++ size_t p_size = __string_size(p); ++ size_t q_size = __string_size(q); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __builtin_strcpy(p, q); + memcpy(p, q, strlen(q) + 1); +diff --git a/include/linux/tty.h b/include/linux/tty.h +index 808fbfe86f85..e4429b7d6e8e 100644 +--- a/include/linux/tty.h ++++ b/include/linux/tty.h +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include <linux/user_namespace.h> + + + /* +@@ -336,6 +337,7 @@ struct tty_struct { + /* If the tty has a pending do_SAK, queue it here - akpm */ + struct work_struct SAK_work; + struct tty_port *port; ++ struct user_namespace *owner_user_ns; + } __randomize_layout; + + /* Each of a tty's open files has private_data pointing to tty_file_private */ +@@ -345,6 +347,8 @@ struct tty_file_private { + struct list_head list; + }; + ++extern int tiocsti_restrict; ++ + /* tty magic number */ + #define TTY_MAGIC 0x5401 + +diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h +index 398e9c95cd61..baab7195306a 100644 +--- a/include/linux/vmalloc.h ++++ b/include/linux/vmalloc.h +@@ -69,19 +69,19 @@ static inline void vmalloc_init(void) + } + #endif + +-extern void *vmalloc(unsigned long size); +-extern void *vzalloc(unsigned long size); +-extern void *vmalloc_user(unsigned long size); +-extern void *vmalloc_node(unsigned long size, int node); +-extern void *vzalloc_node(unsigned long size, int node); +-extern void *vmalloc_exec(unsigned long size); +-extern void *vmalloc_32(unsigned long size); +-extern void *vmalloc_32_user(unsigned long size); +-extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); ++extern void *vmalloc(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vzalloc(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_user(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_node(unsigned long size, int node) __attribute__((alloc_size(1))); ++extern void *vzalloc_node(unsigned long size, int node) __attribute__((alloc_size(1))); ++extern void *vmalloc_exec(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_32(unsigned long size) __attribute__((alloc_size(1))); ++extern void
*vmalloc_32_user(unsigned long size) __attribute__((alloc_size(1))); ++extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) __attribute__((alloc_size(1))); + extern void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags, int node, +- const void *caller); ++ const void *caller) __attribute__((alloc_size(1))); + #ifndef CONFIG_MMU + extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); + static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, +diff --git a/init/Kconfig b/init/Kconfig +index 864af10bb1b9..643bb9448bb9 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -323,6 +323,7 @@ config USELIB + config AUDIT + bool "Auditing support" + depends on NET ++ default y + help + Enable auditing infrastructure that can be used with another + kernel subsystem, such as SELinux (which requires this for +@@ -1088,6 +1089,12 @@ config CC_OPTIMIZE_FOR_SIZE + + endchoice + ++config LOCAL_INIT ++ bool "Zero uninitialized locals" ++ help ++ Zero-fill uninitialized local variables, other than variable-length ++ arrays. Requires compiler support. ++ + config HAVE_LD_DEAD_CODE_DATA_ELIMINATION + bool + help +@@ -1374,8 +1381,7 @@ config SHMEM + which may be appropriate on small systems without swap. + + config AIO +- bool "Enable AIO support" if EXPERT +- default y ++ bool "Enable AIO support" + help + This option enables POSIX asynchronous I/O which may be used + by some high performance threaded applications. Disabling +@@ -1592,7 +1598,7 @@ config VM_EVENT_COUNTERS + + config SLUB_DEBUG + default y +- bool "Enable SLUB debugging support" if EXPERT ++ bool "Enable SLUB debugging support" + depends on SLUB && SYSFS + help + SLUB has extensive debug support features. Disabling these can +@@ -1616,7 +1622,6 @@ config SLUB_MEMCG_SYSFS_ON + + config COMPAT_BRK + bool "Disable heap randomization" +- default y + help + Randomizing heap placement makes heap exploits harder, but it + also breaks ancient binaries (including anything libc5 based). +@@ -1663,7 +1668,6 @@ endchoice + + config SLAB_MERGE_DEFAULT + bool "Allow slab caches to be merged" +- default y + help + For reduced kernel memory fragmentation, slab caches can be + merged when they share the same size and other characteristics. +@@ -1676,9 +1680,9 @@ config SLAB_MERGE_DEFAULT + command line. + + config SLAB_FREELIST_RANDOM +- default n + depends on SLAB || SLUB + bool "SLAB freelist randomization" ++ default y + help + Randomizes the freelist order used on creating new pages. This + security feature reduces the predictability of the kernel slab +@@ -1687,12 +1691,56 @@ config SLAB_FREELIST_RANDOM + config SLAB_FREELIST_HARDENED + bool "Harden slab freelist metadata" + depends on SLUB ++ default y + help + Many kernel heap attacks try to target slab cache metadata and + other infrastructure. This option makes minor performance + sacrifices to harden the kernel slab allocator against common + freelist exploit methods. + ++config SLAB_HARDENED ++ default y ++ depends on SLUB ++ bool "Hardened SLAB infrastructure" ++ help ++ Make minor performance sacrifices to harden the kernel slab ++ allocator. ++ ++config SLAB_CANARY ++ depends on SLUB ++ depends on !SLAB_MERGE_DEFAULT ++ bool "SLAB canaries" ++ default y ++ help ++ Place canaries at the end of kernel slab allocations, sacrificing ++ some performance and memory usage for security.
++ ++ Canaries can detect some forms of heap corruption when allocations ++ are freed and as part of the HARDENED_USERCOPY feature. It provides ++ basic use-after-free detection for HARDENED_USERCOPY. ++ ++ Canaries absorb small overflows (rendering them harmless), mitigate ++ non-NUL terminated C string overflows on 64-bit via a guaranteed zero ++ byte and provide basic double-free detection. ++ ++config SLAB_SANITIZE ++ bool "Sanitize SLAB allocations" ++ depends on SLUB ++ default y ++ help ++ Zero fill slab allocations on free, reducing the lifetime of ++ sensitive data and helping to mitigate use-after-free bugs. ++ ++ For slabs with debug poisoning enabled, this has no impact. ++ ++config SLAB_SANITIZE_VERIFY ++ depends on SLAB_SANITIZE && PAGE_SANITIZE ++ default y ++ bool "Verify sanitized SLAB allocations" ++ help ++ Verify that newly allocated slab allocations are zeroed to detect ++ write-after-free bugs. ++ + config SLUB_CPU_PARTIAL + default y + depends on SLUB && SMP +diff --git a/kernel/audit.c b/kernel/audit.c +index 2a8058764aa6..14e7a763db43 100644 +--- a/kernel/audit.c ++++ b/kernel/audit.c +@@ -1628,6 +1628,9 @@ static int __init audit_enable(char *str) + + if (audit_default == AUDIT_OFF) + audit_initialized = AUDIT_DISABLED; ++ else if (!audit_ever_enabled) ++ audit_initialized = AUDIT_UNINITIALIZED; ++ + if (audit_set_enabled(audit_default)) + pr_err("audit: error setting audit state (%d)\n", + audit_default); +diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c +index 474525e3a9db..644a87f6ad28 100644 +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -368,7 +368,7 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) + #ifdef CONFIG_BPF_JIT + /* All BPF JIT sysctl knobs here. */ + int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); +-int bpf_jit_harden __read_mostly; ++int bpf_jit_harden __read_mostly = 2; + int bpf_jit_kallsyms __read_mostly; + + static __always_inline void +diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c +index 382c09dddf93..11f436e79170 100644 +--- a/kernel/bpf/syscall.c ++++ b/kernel/bpf/syscall.c +@@ -48,7 +48,7 @@ static DEFINE_SPINLOCK(prog_idr_lock); + static DEFINE_IDR(map_idr); + static DEFINE_SPINLOCK(map_idr_lock); + +-int sysctl_unprivileged_bpf_disabled __read_mostly; ++int sysctl_unprivileged_bpf_disabled __read_mostly = 1; + + static const struct bpf_map_ops * const bpf_map_types[] = { + #define BPF_PROG_TYPE(_id, _ops) +diff --git a/kernel/capability.c b/kernel/capability.c +index 1e1c0236f55b..452062fe45ce 100644 +--- a/kernel/capability.c ++++ b/kernel/capability.c +@@ -431,6 +431,12 @@ bool capable(int cap) + return ns_capable(&init_user_ns, cap); + } + EXPORT_SYMBOL(capable); ++ ++bool capable_noaudit(int cap) ++{ ++ return ns_capable_noaudit(&init_user_ns, cap); ++} ++EXPORT_SYMBOL(capable_noaudit); + #endif /* CONFIG_MULTIUSER */ + + /** +diff --git a/kernel/events/core.c b/kernel/events/core.c +index 5a97f34bc14c..a4a4fc1e1586 100644 +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -397,8 +397,13 @@ static cpumask_var_t perf_online_mask; + * 0 - disallow raw tracepoint access for unpriv + * 1 - disallow cpu events for unpriv + * 2 - disallow kernel profiling for unpriv ++ * 3 - disallow all unpriv perf event use + */ ++#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT ++int sysctl_perf_event_paranoid __read_mostly = 3; ++#else + int sysctl_perf_event_paranoid __read_mostly = 2; ++#endif + + /* Minimum for 512 kiB + 1 user control page */ + int sysctl_perf_event_mlock __read_mostly = 512
+ (PAGE_SIZE / 1024); /* 'free' kiB per user */ +@@ -10410,6 +10415,9 @@ SYSCALL_DEFINE5(perf_event_open, + if (flags & ~PERF_FLAG_ALL) + return -EINVAL; + ++ if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN)) ++ return -EACCES; ++ + err = perf_copy_attr(attr_uptr, &attr); + if (err) + return err; +diff --git a/kernel/fork.c b/kernel/fork.c +index 64ef113e387e..42d257e43e04 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -103,6 +103,11 @@ + + #define CREATE_TRACE_POINTS + #include ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#else ++#define unprivileged_userns_clone 0 ++#endif + + /* + * Minimum number of threads to boot the kernel +@@ -1649,6 +1654,10 @@ static __latent_entropy struct task_struct *copy_process( + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + ++ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) ++ if (!capable(CAP_SYS_ADMIN)) ++ return ERR_PTR(-EPERM); ++ + /* + * Thread groups must share signals as well, and detached threads + * can only be started up within the thread group. +@@ -2476,6 +2485,12 @@ int ksys_unshare(unsigned long unshare_flags) + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; + ++ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { ++ err = -EPERM; ++ if (!capable(CAP_SYS_ADMIN)) ++ goto bad_unshare_out; ++ } ++ + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; +diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c +index 3d37c279c090..0789ca413f09 100644 +--- a/kernel/power/snapshot.c ++++ b/kernel/power/snapshot.c +@@ -1138,7 +1138,7 @@ void free_basic_memory_bitmaps(void) + + void clear_free_pages(void) + { +-#ifdef CONFIG_PAGE_POISONING_ZERO ++#if defined(CONFIG_PAGE_POISONING_ZERO) || defined(CONFIG_PAGE_SANITIZE) + struct memory_bitmap *bm = free_pages_map; + unsigned long pfn; + +@@ -1155,7 +1155,7 @@ void clear_free_pages(void) + } + memory_bm_position_reset(bm); + pr_info("free pages cleared after restore\n"); +-#endif /* PAGE_POISONING_ZERO */ ++#endif /* PAGE_POISONING_ZERO || PAGE_SANITIZE */ + } + + /** +diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c +index befc9321a89c..61e19256560c 100644 +--- a/kernel/rcu/tiny.c ++++ b/kernel/rcu/tiny.c +@@ -162,7 +162,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) + } + } + +-static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) ++static __latent_entropy void rcu_process_callbacks(void) + { + __rcu_process_callbacks(&rcu_sched_ctrlblk); + __rcu_process_callbacks(&rcu_bh_ctrlblk); +diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c +index 15301ed19da6..2a799dea7016 100644 +--- a/kernel/rcu/tree.c ++++ b/kernel/rcu/tree.c +@@ -2862,7 +2862,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) + /* + * Do RCU core processing for the current CPU. + */ +-static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) ++static __latent_entropy void rcu_process_callbacks(void) + { + struct rcu_state *rsp; + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 7137bc343b4a..104e0855a018 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9593,7 +9593,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) + * run_rebalance_domains is triggered when needed from the scheduler tick. + * Also triggered for nohz idle balancing (with nohz_balancing_kick set). 
+ */ +-static __latent_entropy void run_rebalance_domains(struct softirq_action *h) ++static __latent_entropy void run_rebalance_domains(void) + { + struct rq *this_rq = this_rq(); + enum cpu_idle_type idle = this_rq->idle_balance ? +diff --git a/kernel/softirq.c b/kernel/softirq.c +index 6f584861d329..1943fe60f3b9 100644 +--- a/kernel/softirq.c ++++ b/kernel/softirq.c +@@ -53,7 +53,7 @@ DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); + EXPORT_PER_CPU_SYMBOL(irq_stat); + #endif + +-static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; ++static struct softirq_action softirq_vec[NR_SOFTIRQS] __ro_after_init __aligned(PAGE_SIZE); + + DEFINE_PER_CPU(struct task_struct *, ksoftirqd); + +@@ -289,7 +289,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) + kstat_incr_softirqs_this_cpu(vec_nr); + + trace_softirq_entry(vec_nr); +- h->action(h); ++ h->action(); + trace_softirq_exit(vec_nr); + if (unlikely(prev_count != preempt_count())) { + pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n", +@@ -451,7 +451,7 @@ void __raise_softirq_irqoff(unsigned int nr) + or_softirq_pending(1UL << nr); + } + +-void open_softirq(int nr, void (*action)(struct softirq_action *)) ++void __init open_softirq(int nr, void (*action)(void)) + { + softirq_vec[nr].action = action; + } +@@ -497,8 +497,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) + } + EXPORT_SYMBOL(__tasklet_hi_schedule); + +-static void tasklet_action_common(struct softirq_action *a, +- struct tasklet_head *tl_head, ++static void tasklet_action_common(struct tasklet_head *tl_head, + unsigned int softirq_nr) + { + struct tasklet_struct *list; +@@ -535,14 +534,14 @@ static void tasklet_action_common(struct softirq_action *a, + } + } + +-static __latent_entropy void tasklet_action(struct softirq_action *a) ++static __latent_entropy void tasklet_action(void) + { +- tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); ++ tasklet_action_common(this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); + } + +-static __latent_entropy void tasklet_hi_action(struct softirq_action *a) ++static __latent_entropy void tasklet_hi_action(void) + { +- tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); ++ tasklet_action_common(this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); + } + + void tasklet_init(struct tasklet_struct *t, +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index cc02050fd0c4..cca161854186 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -67,6 +67,7 @@ + #include + #include + #include ++#include <linux/tty.h> + + #include + #include +@@ -99,12 +100,19 @@ + #if defined(CONFIG_SYSCTL) + + /* External variables not in a header file.
*/ ++#if IS_ENABLED(CONFIG_USB) ++int deny_new_usb __read_mostly = 0; ++EXPORT_SYMBOL(deny_new_usb); ++#endif + extern int suid_dumpable; + #ifdef CONFIG_COREDUMP + extern int core_uses_pid; + extern char core_pattern[]; + extern unsigned int core_pipe_limit; + #endif ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#endif + extern int pid_max; + extern int pid_max_min, pid_max_max; + extern int percpu_pagelist_fraction; +@@ -116,33 +124,33 @@ extern int sysctl_nr_trim_pages; + + /* Constants used for minimum and maximum */ + #ifdef CONFIG_LOCKUP_DETECTOR +-static int sixty = 60; ++static int sixty __read_only = 60; + #endif + +-static int __maybe_unused neg_one = -1; ++static int __maybe_unused neg_one __read_only = -1; + + static int zero; +-static int __maybe_unused one = 1; +-static int __maybe_unused two = 2; +-static int __maybe_unused four = 4; +-static unsigned long one_ul = 1; +-static int one_hundred = 100; +-static int one_thousand = 1000; ++static int __maybe_unused one __read_only = 1; ++static int __maybe_unused two __read_only = 2; ++static int __maybe_unused four __read_only = 4; ++static unsigned long one_ul __read_only = 1; ++static int one_hundred __read_only = 100; ++static int one_thousand __read_only = 1000; + #ifdef CONFIG_PRINTK +-static int ten_thousand = 10000; ++static int ten_thousand __read_only = 10000; + #endif + #ifdef CONFIG_PERF_EVENTS +-static int six_hundred_forty_kb = 640 * 1024; ++static int six_hundred_forty_kb __read_only = 640 * 1024; + #endif + + /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ +-static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; ++static unsigned long dirty_bytes_min __read_only = 2 * PAGE_SIZE; + + /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ +-static int maxolduid = 65535; +-static int minolduid; ++static int maxolduid __read_only = 65535; ++static int minolduid __read_only; + +-static int ngroups_max = NGROUPS_MAX; ++static int ngroups_max __read_only = NGROUPS_MAX; + static const int cap_last_cap = CAP_LAST_CAP; + + /* +@@ -150,9 +158,12 @@ static const int cap_last_cap = CAP_LAST_CAP; + * and hung_task_check_interval_secs + */ + #ifdef CONFIG_DETECT_HUNG_TASK +-static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); ++static unsigned long hung_task_timeout_max __read_only = (LONG_MAX/HZ); + #endif + ++int device_sidechannel_restrict __read_mostly = 1; ++EXPORT_SYMBOL(device_sidechannel_restrict); ++ + #ifdef CONFIG_INOTIFY_USER + #include + #endif +@@ -296,19 +307,19 @@ static struct ctl_table sysctl_base_table[] = { + }; + + #ifdef CONFIG_SCHED_DEBUG +-static int min_sched_granularity_ns = 100000; /* 100 usecs */ +-static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ +-static int min_wakeup_granularity_ns; /* 0 usecs */ +-static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ ++static int min_sched_granularity_ns __read_only = 100000; /* 100 usecs */ ++static int max_sched_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */ ++static int min_wakeup_granularity_ns __read_only; /* 0 usecs */ ++static int max_wakeup_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */ + #ifdef CONFIG_SMP +-static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +-static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; ++static int min_sched_tunable_scaling __read_only = SCHED_TUNABLESCALING_NONE; ++static int max_sched_tunable_scaling __read_only = SCHED_TUNABLESCALING_END-1; + #endif /* CONFIG_SMP */ + 
#endif /* CONFIG_SCHED_DEBUG */ + + #ifdef CONFIG_COMPACTION +-static int min_extfrag_threshold; +-static int max_extfrag_threshold = 1000; ++static int min_extfrag_threshold __read_only; ++static int max_extfrag_threshold __read_only = 1000; + #endif + + static struct ctl_table kern_table[] = { +@@ -514,6 +525,15 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_USER_NS ++ { ++ .procname = "unprivileged_userns_clone", ++ .data = &unprivileged_userns_clone, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++#endif + #ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", +@@ -862,6 +882,37 @@ static struct ctl_table kern_table[] = { + .extra1 = &zero, + .extra2 = &two, + }, ++#endif ++#if defined CONFIG_TTY ++ { ++ .procname = "tiocsti_restrict", ++ .data = &tiocsti_restrict, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_minmax_sysadmin, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, ++#endif ++ { ++ .procname = "device_sidechannel_restrict", ++ .data = &device_sidechannel_restrict, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_minmax_sysadmin, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, ++#if IS_ENABLED(CONFIG_USB) ++ { ++ .procname = "deny_new_usb", ++ .data = &deny_new_usb, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_minmax_sysadmin, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, + #endif + { + .procname = "ngroups_max", +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index e1a549c9e399..c560063e3a8c 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -1462,7 +1462,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, + } + } + +-static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) ++static __latent_entropy void hrtimer_run_softirq(void) + { + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + unsigned long flags; +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index fa49cd753dea..a16f8613282e 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1688,7 +1688,7 @@ static inline void __run_timers(struct timer_base *base) + /* + * This function runs timers and the timer-tq in bottom half context. + */ +-static __latent_entropy void run_timer_softirq(struct softirq_action *h) ++static __latent_entropy void run_timer_softirq(void) + { + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index 923414a246e9..6b9dbc257e34 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -26,6 +26,9 @@ + #include + #include + ++/* sysctl */ ++int unprivileged_userns_clone; ++ + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); + +diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug +index 4966c4fbe7f7..7a685272c155 100644 +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -950,6 +950,7 @@ endmenu # "Debug lockups and hangs" + + config PANIC_ON_OOPS + bool "Panic on Oops" ++ default y + help + Say Y here to enable the kernel to panic when it oopses. This + has the same effect as setting oops=panic on the kernel command +@@ -959,7 +960,7 @@ config PANIC_ON_OOPS + anything erroneous after an oops which could result in data + corruption or other issues. + +- Say N if unsure. ++ Say Y if unsure. 
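PANIC_ON_OOPS above, together with DEBUG_LIST and BUG_ON_DATA_CORRUPTION just below, shifts these kernels toward failing closed when corruption is detected. A simplified sketch of the kind of check DEBUG_LIST enables, modeled loosely on lib/list_debug.c rather than copied from it: before linking a node, verify that the neighbours still point at each other, so corruption is caught at the manipulation site instead of much later:

/* list_debug_demo.c - illustrative only; build with: gcc -O2 list_debug_demo.c */
#include <stdbool.h>
#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

static bool list_add_valid(struct list_head *new,
			   struct list_head *prev,
			   struct list_head *next)
{
	/* With BUG_ON_DATA_CORRUPTION=y the kernel BUGs at this point
	 * instead of merely warning and refusing the operation. */
	if (next->prev != prev || prev->next != next) {
		fprintf(stderr, "list corruption between %p and %p\n",
			(void *)prev, (void *)next);
		return false;
	}
	return new != prev && new != next;
}

int main(void)
{
	struct list_head head = { &head, &head };
	struct list_head node;

	printf("intact list ok: %d\n", list_add_valid(&node, &head, head.next));
	head.prev = &node;	/* simulate corruption of the back pointer */
	printf("corrupt list ok: %d\n", list_add_valid(&node, &head, head.next));
	return 0;
}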
+ + config PANIC_ON_OOPS_VALUE + int +@@ -1328,6 +1329,7 @@ config DEBUG_BUGVERBOSE + config DEBUG_LIST + bool "Debug linked list manipulation" + depends on DEBUG_KERNEL || BUG_ON_DATA_CORRUPTION ++ default y + help + Enable this to turn on extended checks in the linked-list + walking routines. +@@ -1982,6 +1984,7 @@ config MEMTEST + config BUG_ON_DATA_CORRUPTION + bool "Trigger a BUG when data corruption is detected" + select DEBUG_LIST ++ default y + help + Select this option if the kernel should BUG when it encounters + data corruption in kernel memory structures when they get checked +@@ -2021,6 +2024,7 @@ config STRICT_DEVMEM + config IO_STRICT_DEVMEM + bool "Filter I/O access to /dev/mem" + depends on STRICT_DEVMEM ++ default y + ---help--- + If this option is disabled, you allow userspace (root) access to all + io-memory regardless of whether a driver is actively using that +diff --git a/lib/irq_poll.c b/lib/irq_poll.c +index 86a709954f5a..6f15787fcb1b 100644 +--- a/lib/irq_poll.c ++++ b/lib/irq_poll.c +@@ -75,7 +75,7 @@ void irq_poll_complete(struct irq_poll *iop) + } + EXPORT_SYMBOL(irq_poll_complete); + +-static void __latent_entropy irq_poll_softirq(struct softirq_action *h) ++static void __latent_entropy irq_poll_softirq(void) + { + struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); + int rearm = 0, budget = irq_poll_budget; +diff --git a/lib/kobject.c b/lib/kobject.c +index 97d86dc17c42..388257c2878b 100644 +--- a/lib/kobject.c ++++ b/lib/kobject.c +@@ -978,9 +978,9 @@ EXPORT_SYMBOL_GPL(kset_create_and_add); + + + static DEFINE_SPINLOCK(kobj_ns_type_lock); +-static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES]; ++static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES] __ro_after_init; + +-int kobj_ns_type_register(const struct kobj_ns_type_operations *ops) ++int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops) + { + enum kobj_ns_type type = ops->type; + int error; +diff --git a/lib/nlattr.c b/lib/nlattr.c +index e335bcafa9e4..f6334f882b1f 100644 +--- a/lib/nlattr.c ++++ b/lib/nlattr.c +@@ -364,6 +364,8 @@ int nla_memcpy(void *dest, const struct nlattr *src, int count) + { + int minlen = min_t(int, count, nla_len(src)); + ++ BUG_ON(minlen < 0); ++ + memcpy(dest, nla_data(src), minlen); + if (count > minlen) + memset(dest + minlen, 0, count - minlen); +diff --git a/lib/vsprintf.c b/lib/vsprintf.c +index 812e59e13fe6..2c2104884c81 100644 +--- a/lib/vsprintf.c ++++ b/lib/vsprintf.c +@@ -1371,7 +1371,7 @@ char *pointer_string(char *buf, char *end, const void *ptr, + return number(buf, end, (unsigned long int)ptr, spec); + } + +-int kptr_restrict __read_mostly; ++int kptr_restrict __read_mostly = 2; + + static noinline_for_stack + char *restricted_pointer(char *buf, char *end, const void *ptr, +diff --git a/mm/Kconfig b/mm/Kconfig +index de64ea658716..8bff017856eb 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -311,7 +311,8 @@ config KSM + config DEFAULT_MMAP_MIN_ADDR + int "Low address space to protect from user allocation" + depends on MMU +- default 4096 ++ default 32768 if ARM || (ARM64 && COMPAT) ++ default 65536 + help + This is the portion of low virtual memory which should be protected + from userspace allocation. 
Keeping a user from writing to low pages +diff --git a/mm/mmap.c b/mm/mmap.c +index f7cd9cb966c0..fda49841f4f2 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -229,6 +229,13 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) + + newbrk = PAGE_ALIGN(brk); + oldbrk = PAGE_ALIGN(mm->brk); ++ /* properly handle unaligned min_brk as an empty heap */ ++ if (min_brk & ~PAGE_MASK) { ++ if (brk == min_brk) ++ newbrk -= PAGE_SIZE; ++ if (mm->brk == min_brk) ++ oldbrk -= PAGE_SIZE; ++ } + if (oldbrk == newbrk) + goto set_brk; + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 9e45553cabd6..f5ec01e1498c 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -66,6 +66,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -99,6 +100,15 @@ int _node_numa_mem_[MAX_NUMNODES]; + DEFINE_MUTEX(pcpu_drain_mutex); + DEFINE_PER_CPU(struct work_struct, pcpu_drain); + ++bool __meminitdata extra_latent_entropy; ++ ++static int __init setup_extra_latent_entropy(char *str) ++{ ++ extra_latent_entropy = true; ++ return 0; ++} ++early_param("extra_latent_entropy", setup_extra_latent_entropy); ++ + #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY + volatile unsigned long latent_entropy __latent_entropy; + EXPORT_SYMBOL(latent_entropy); +@@ -1027,6 +1037,13 @@ static __always_inline bool free_pages_prepare(struct page *page, + debug_check_no_obj_freed(page_address(page), + PAGE_SIZE << order); + } ++ ++ if (IS_ENABLED(CONFIG_PAGE_SANITIZE)) { ++ int i; ++ for (i = 0; i < (1 << order); i++) ++ clear_highpage(page + i); ++ } ++ + arch_free_page(page, order); + kernel_poison_pages(page, 1 << order, 0); + kernel_map_pages(page, 1 << order, 0); +@@ -1267,6 +1284,21 @@ static void __init __free_pages_boot_core(struct page *page, unsigned int order) + __ClearPageReserved(p); + set_page_count(p, 0); + ++ if (extra_latent_entropy && !PageHighMem(page) && page_to_pfn(page) < 0x100000) { ++ unsigned long hash = 0; ++ size_t index, end = PAGE_SIZE * nr_pages / sizeof hash; ++ const unsigned long *data = lowmem_page_address(page); ++ ++ for (index = 0; index < end; index++) ++ hash ^= hash + data[index]; ++#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY ++ latent_entropy ^= hash; ++ add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy)); ++#else ++ add_device_randomness((const void *)&hash, sizeof(hash)); ++#endif ++ } ++ + page_zone(page)->managed_pages += nr_pages; + set_page_refcounted(page); + __free_pages(page, order); +@@ -1855,8 +1887,8 @@ static inline int check_new_page(struct page *page) + + static inline bool free_pages_prezeroed(void) + { +- return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && +- page_poisoning_enabled(); ++ return IS_ENABLED(CONFIG_PAGE_SANITIZE) || ++ (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && page_poisoning_enabled()); + } + + #ifdef CONFIG_DEBUG_VM +@@ -1913,6 +1945,11 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags + + post_alloc_hook(page, order, gfp_flags); + ++ if (IS_ENABLED(CONFIG_PAGE_SANITIZE_VERIFY)) { ++ for (i = 0; i < (1 << order); i++) ++ verify_zero_highpage(page + i); ++ } ++ + if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO)) + for (i = 0; i < (1 << order); i++) + clear_highpage(page + i); +diff --git a/mm/slab.h b/mm/slab.h +index 58c6c1c2a78e..86d7a6e7ad25 100644 +--- a/mm/slab.h ++++ b/mm/slab.h +@@ -313,7 +313,11 @@ static inline bool is_root_cache(struct kmem_cache *s) + static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) + { ++#ifdef CONFIG_SLAB_HARDENED ++ return p == s; ++#else + 
return true; ++#endif + } + + static inline const char *cache_name(struct kmem_cache *s) +@@ -365,18 +369,26 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) + * to not do even the assignment. In that case, slab_equal_or_root + * will also be a constant. + */ +- if (!memcg_kmem_enabled() && ++ if (!IS_ENABLED(CONFIG_SLAB_HARDENED) && ++ !memcg_kmem_enabled() && + !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS)) + return s; + + page = virt_to_head_page(x); ++#ifdef CONFIG_SLAB_HARDENED ++ BUG_ON(!PageSlab(page)); ++#endif + cachep = page->slab_cache; + if (slab_equal_or_root(cachep, s)) + return cachep; + + pr_err("%s: Wrong slab cache. %s but object is from %s\n", + __func__, s->name, cachep->name); ++#ifdef CONFIG_BUG_ON_DATA_CORRUPTION ++ BUG_ON(1); ++#else + WARN_ON_ONCE(1); ++#endif + return s; + } + +@@ -401,7 +413,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) + * back there or track user information then we can + * only use the space before that information. + */ +- if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) ++ if ((s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) || IS_ENABLED(CONFIG_SLAB_CANARY)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation +diff --git a/mm/slab_common.c b/mm/slab_common.c +index 3a7ac4f15194..a567cc1807ae 100644 +--- a/mm/slab_common.c ++++ b/mm/slab_common.c +@@ -27,10 +27,10 @@ + + #include "slab.h" + +-enum slab_state slab_state; ++enum slab_state slab_state __ro_after_init; + LIST_HEAD(slab_caches); + DEFINE_MUTEX(slab_mutex); +-struct kmem_cache *kmem_cache; ++struct kmem_cache *kmem_cache __ro_after_init; + + #ifdef CONFIG_HARDENED_USERCOPY + bool usercopy_fallback __ro_after_init = +@@ -58,7 +58,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, + /* + * Merge control. If this is set then no merging of slab caches will occur. 
+ */ +-static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); ++static bool slab_nomerge __ro_after_init = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); + + static int __init setup_slab_nomerge(char *str) + { +diff --git a/mm/slub.c b/mm/slub.c +index 8da34a8af53d..f05bc9ca8489 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -124,6 +124,16 @@ static inline int kmem_cache_debug(struct kmem_cache *s) + #endif + } + ++static inline bool has_sanitize(struct kmem_cache *s) ++{ ++ return IS_ENABLED(CONFIG_SLAB_SANITIZE) && !(s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)); ++} ++ ++static inline bool has_sanitize_verify(struct kmem_cache *s) ++{ ++ return IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && has_sanitize(s); ++} ++ + void *fixup_red_left(struct kmem_cache *s, void *p) + { + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) +@@ -297,6 +307,35 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) + *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr); + } + ++#ifdef CONFIG_SLAB_CANARY ++static inline unsigned long *get_canary(struct kmem_cache *s, void *object) ++{ ++ if (s->offset) ++ return object + s->offset + sizeof(void *); ++ return object + s->inuse; ++} ++ ++static inline unsigned long get_canary_value(const void *canary, unsigned long value) ++{ ++ return (value ^ (unsigned long)canary) & CANARY_MASK; ++} ++ ++static inline void set_canary(struct kmem_cache *s, void *object, unsigned long value) ++{ ++ unsigned long *canary = get_canary(s, object); ++ *canary = get_canary_value(canary, value); ++} ++ ++static inline void check_canary(struct kmem_cache *s, void *object, unsigned long value) ++{ ++ unsigned long *canary = get_canary(s, object); ++ BUG_ON(*canary != get_canary_value(canary, value)); ++} ++#else ++#define set_canary(s, object, value) ++#define check_canary(s, object, value) ++#endif ++ + /* Loop over all objects in a slab */ + #define for_each_object(__p, __s, __addr, __objects) \ + for (__p = fixup_red_left(__s, __addr); \ +@@ -469,13 +508,13 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p) + * Debug settings: + */ + #if defined(CONFIG_SLUB_DEBUG_ON) +-static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; ++static slab_flags_t slub_debug __ro_after_init = DEBUG_DEFAULT_FLAGS; + #else +-static slab_flags_t slub_debug; ++static slab_flags_t slub_debug __ro_after_init; + #endif + +-static char *slub_debug_slabs; +-static int disable_higher_order_debug; ++static char *slub_debug_slabs __ro_after_init; ++static int disable_higher_order_debug __ro_after_init; + + /* + * slub is about to manipulate internal object metadata. This memory lies +@@ -535,6 +574,9 @@ static struct track *get_track(struct kmem_cache *s, void *object, + else + p = object + s->inuse; + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ p = (void *)p + sizeof(void *); ++ + return p + alloc; + } + +@@ -674,6 +716,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) + else + off = s->inuse; + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ off += sizeof(void *); ++ + if (s->flags & SLAB_STORE_USER) + off += 2 * sizeof(struct track); + +@@ -803,6 +848,9 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) + /* Freepointer is placed after the object. 
*/ + off += sizeof(void *); + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ off += sizeof(void *); ++ + if (s->flags & SLAB_STORE_USER) + /* We also have user information there */ + off += 2 * sizeof(struct track); +@@ -1417,8 +1465,9 @@ static void setup_object(struct kmem_cache *s, struct page *page, + void *object) + { + setup_object_debug(s, page, object); ++ set_canary(s, object, s->random_inactive); + kasan_init_slab_obj(s, object); +- if (unlikely(s->ctor)) { ++ if (unlikely(s->ctor) && !has_sanitize_verify(s)) { + kasan_unpoison_object_data(s, object); + s->ctor(object); + kasan_poison_object_data(s, object); +@@ -2700,9 +2749,21 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, + stat(s, ALLOC_FASTPATH); + } + +- if (unlikely(gfpflags & __GFP_ZERO) && object) ++ if (has_sanitize_verify(s) && object) { ++ size_t offset = s->offset ? 0 : sizeof(void *); ++ BUG_ON(memchr_inv(object + offset, 0, s->object_size - offset)); ++ if (s->ctor) ++ s->ctor(object); ++ if (unlikely(gfpflags & __GFP_ZERO) && offset) ++ memset(object, 0, sizeof(void *)); ++ } else if (unlikely(gfpflags & __GFP_ZERO) && object) + memset(object, 0, s->object_size); + ++ if (object) { ++ check_canary(s, object, s->random_inactive); ++ set_canary(s, object, s->random_active); ++ } ++ + slab_post_alloc_hook(s, gfpflags, 1, &object); + + return object; +@@ -2909,6 +2970,27 @@ static __always_inline void do_slab_free(struct kmem_cache *s, + void *tail_obj = tail ? : head; + struct kmem_cache_cpu *c; + unsigned long tid; ++ bool sanitize = has_sanitize(s); ++ ++ if (IS_ENABLED(CONFIG_SLAB_CANARY) || sanitize) { ++ __maybe_unused int offset = s->offset ? 0 : sizeof(void *); ++ void *x = head; ++ ++ while (1) { ++ check_canary(s, x, s->random_active); ++ set_canary(s, x, s->random_inactive); ++ ++ if (sanitize) { ++ memset(x + offset, 0, s->object_size - offset); ++ if (!IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && s->ctor) ++ s->ctor(x); ++ } ++ if (x == tail_obj) ++ break; ++ x = get_freepointer(s, x); ++ } ++ } ++ + redo: + /* + * Determine the currently cpus per cpu slab. +@@ -3085,7 +3167,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) + { + struct kmem_cache_cpu *c; +- int i; ++ int i, k; + + /* memcg and kmem_cache debug support */ + s = slab_pre_alloc_hook(s, flags); +@@ -3122,13 +3204,29 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + local_irq_enable(); + + /* Clear memory outside IRQ disabled fastpath loop */ +- if (unlikely(flags & __GFP_ZERO)) { ++ if (has_sanitize_verify(s)) { ++ int j; ++ ++ for (j = 0; j < i; j++) { ++ size_t offset = s->offset ? 0 : sizeof(void *); ++ BUG_ON(memchr_inv(p[j] + offset, 0, s->object_size - offset)); ++ if (s->ctor) ++ s->ctor(p[j]); ++ if (unlikely(flags & __GFP_ZERO) && offset) ++ memset(p[j], 0, sizeof(void *)); ++ } ++ } else if (unlikely(flags & __GFP_ZERO)) { + int j; + + for (j = 0; j < i; j++) + memset(p[j], 0, s->object_size); + } + ++ for (k = 0; k < i; k++) { ++ check_canary(s, p[k], s->random_inactive); ++ set_canary(s, p[k], s->random_active); ++ } ++ + /* memcg and kmem_cache debug support */ + slab_post_alloc_hook(s, flags, size, p); + return i; +@@ -3160,9 +3258,9 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); + * and increases the number of allocations possible without having to + * take the list_lock. 
+ */ +-static unsigned int slub_min_order; +-static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +-static unsigned int slub_min_objects; ++static unsigned int slub_min_order __ro_after_init; ++static unsigned int slub_max_order __ro_after_init = PAGE_ALLOC_COSTLY_ORDER; ++static unsigned int slub_min_objects __ro_after_init; + + /* + * Calculate the order of allocation given an slab object size. +@@ -3334,6 +3432,7 @@ static void early_kmem_cache_node_alloc(int node) + init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); + init_tracking(kmem_cache_node, n); + #endif ++ set_canary(kmem_cache_node, n, kmem_cache_node->random_active); + kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), + GFP_KERNEL); + init_kmem_cache_node(n); +@@ -3490,6 +3589,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) + size += sizeof(void *); + } + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ size += sizeof(void *); ++ + #ifdef CONFIG_SLUB_DEBUG + if (flags & SLAB_STORE_USER) + /* +@@ -3559,6 +3661,10 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) + #ifdef CONFIG_SLAB_FREELIST_HARDENED + s->random = get_random_long(); + #endif ++#ifdef CONFIG_SLAB_CANARY ++ s->random_active = get_random_long(); ++ s->random_inactive = get_random_long(); ++#endif + + if (!calculate_sizes(s, -1)) + goto error; +@@ -3835,6 +3941,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, + offset -= s->red_left_pad; + } + ++ check_canary(s, (void *)ptr - offset, s->random_active); ++ + /* Allow address range falling entirely within usercopy region. */ + if (offset >= s->useroffset && + offset - s->useroffset <= s->usersize && +@@ -3868,7 +3976,11 @@ static size_t __ksize(const void *object) + page = virt_to_head_page(object); + + if (unlikely(!PageSlab(page))) { ++#ifdef CONFIG_BUG_ON_DATA_CORRUPTION ++ BUG_ON(!PageCompound(page)); ++#else + WARN_ON(!PageCompound(page)); ++#endif + return PAGE_SIZE << compound_order(page); + } + +@@ -4728,7 +4840,7 @@ enum slab_stat_type { + #define SO_TOTAL (1 << SL_TOTAL) + + #ifdef CONFIG_MEMCG +-static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); ++static bool memcg_sysfs_enabled __ro_after_init = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); + + static int __init setup_slub_memcg_sysfs(char *str) + { +diff --git a/mm/swap.c b/mm/swap.c +index 26fc9b5f1b6c..7c9312ca8982 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -93,6 +93,13 @@ static void __put_compound_page(struct page *page) + if (!PageHuge(page)) + __page_cache_release(page); + dtor = get_compound_page_dtor(page); ++ if (!PageHuge(page)) ++ BUG_ON(dtor != free_compound_page ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++ && dtor != free_transhuge_page ++#endif ++ ); ++ + (*dtor)(page); + } + +diff --git a/net/core/dev.c b/net/core/dev.c +index af097ca9cb4f..fda1753e5b65 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -4519,7 +4519,7 @@ int netif_rx_ni(struct sk_buff *skb) + } + EXPORT_SYMBOL(netif_rx_ni); + +-static __latent_entropy void net_tx_action(struct softirq_action *h) ++static __latent_entropy void net_tx_action(void) + { + struct softnet_data *sd = this_cpu_ptr(&softnet_data); + +@@ -6302,7 +6302,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) + return work; + } + +-static __latent_entropy void net_rx_action(struct softirq_action *h) ++static __latent_entropy void net_rx_action(void) + { + struct softnet_data *sd = this_cpu_ptr(&softnet_data); + unsigned long time_limit = jiffies + +diff --git 
a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 32cae39cdff6..9141d7ae99b2 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -266,6 +266,7 @@ config IP_PIMSM_V2 + + config SYN_COOKIES + bool "IP: TCP syncookie support" ++ default y + ---help--- + Normal TCP/IP networking is open to an attack known as "SYN + flooding". This denial-of-service attack prevents legitimate remote +diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig +index cb0c889e13aa..305f52f58c1a 100644 +--- a/scripts/gcc-plugins/Kconfig ++++ b/scripts/gcc-plugins/Kconfig +@@ -59,6 +59,11 @@ config GCC_PLUGIN_LATENT_ENTROPY + is some slowdown of the boot process (about 0.5%) and fork and + irq processing. + ++ When extra_latent_entropy is passed on the kernel command line, ++ entropy will be extracted from up to the first 4GB of RAM while the ++ runtime memory allocator is being initialized. This costs even more ++ slowdown of the boot process. ++ + Note that entropy extracted this way is not cryptographically + secure! + +diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c +index 5a5b3780456f..01eac2c6e7eb 100644 +--- a/scripts/mod/modpost.c ++++ b/scripts/mod/modpost.c +@@ -35,6 +35,7 @@ static int vmlinux_section_warnings = 1; + static int warn_unresolved = 0; + /* How a symbol is exported */ + static int sec_mismatch_count = 0; ++static int writable_fptr_count = 0; + static int sec_mismatch_verbose = 1; + static int sec_mismatch_fatal = 0; + /* ignore missing files */ +@@ -954,6 +955,7 @@ enum mismatch { + ANY_EXIT_TO_ANY_INIT, + EXPORT_TO_INIT_EXIT, + EXTABLE_TO_NON_TEXT, ++ DATA_TO_TEXT + }; + + /** +@@ -1080,6 +1082,12 @@ static const struct sectioncheck sectioncheck[] = { + .good_tosec = {ALL_TEXT_SECTIONS , NULL}, + .mismatch = EXTABLE_TO_NON_TEXT, + .handler = extable_mismatch_handler, ++}, ++/* Do not reference code from writable data */ ++{ ++ .fromsec = { DATA_SECTIONS, NULL }, ++ .bad_tosec = { ALL_TEXT_SECTIONS, NULL }, ++ .mismatch = DATA_TO_TEXT + } + }; + +@@ -1229,10 +1237,10 @@ static Elf_Sym *find_elf_symbol(struct elf_info *elf, Elf64_Sword addr, + continue; + if (ELF_ST_TYPE(sym->st_info) == STT_SECTION) + continue; +- if (sym->st_value == addr) +- return sym; + /* Find a symbol nearby - addr are maybe negative */ + d = sym->st_value - addr; ++ if (d == 0) ++ return sym; + if (d < 0) + d = addr - sym->st_value; + if (d < distance) { +@@ -1391,7 +1399,11 @@ static void report_sec_mismatch(const char *modname, + char *prl_from; + char *prl_to; + +- sec_mismatch_count++; ++ if (mismatch->mismatch == DATA_TO_TEXT) ++ writable_fptr_count++; ++ else ++ sec_mismatch_count++; ++ + if (!sec_mismatch_verbose) + return; + +@@ -1515,6 +1527,14 @@ static void report_sec_mismatch(const char *modname, + fatal("There's a special handler for this mismatch type, " + "we should never get here."); + break; ++ case DATA_TO_TEXT: ++#if 0 ++ fprintf(stderr, ++ "The %s %s:%s references\n" ++ "the %s %s:%s%s\n", ++ from, fromsec, fromsym, to, tosec, tosym, to_p); ++#endif ++ break; + } + fprintf(stderr, "\n"); + } +@@ -2526,6 +2546,14 @@ int main(int argc, char **argv) + } + } + free(buf.p); ++ if (writable_fptr_count) { ++ if (!sec_mismatch_verbose) { ++ warn("modpost: Found %d writable function pointer(s).\n" ++ "To see full details build your kernel with:\n" ++ "'make CONFIG_DEBUG_SECTION_MISMATCH=y'\n", ++ writable_fptr_count); ++ } ++ } + + return err; + } +diff --git a/security/Kconfig b/security/Kconfig +index d9aa521b5206..a921713b76ec 100644 +--- a/security/Kconfig ++++ b/security/Kconfig 
+@@ -8,7 +8,7 @@ source security/keys/Kconfig + + config SECURITY_DMESG_RESTRICT + bool "Restrict unprivileged access to the kernel syslog" +- default n ++ default y + help + This enforces restrictions on unprivileged users reading the kernel + syslog via dmesg(8). +@@ -18,10 +18,34 @@ config SECURITY_DMESG_RESTRICT + + If you are unsure how to answer this question, answer N. + ++config SECURITY_PERF_EVENTS_RESTRICT ++ bool "Restrict unprivileged use of performance events" ++ depends on PERF_EVENTS ++ default y ++ help ++ If you say Y here, the kernel.perf_event_paranoid sysctl ++ will be set to 3 by default, and no unprivileged use of the ++ perf_event_open syscall will be permitted unless it is ++ changed. ++ ++config SECURITY_TIOCSTI_RESTRICT ++ bool "Restrict unprivileged use of tiocsti command injection" ++ default y ++ help ++ This enforces restrictions on unprivileged users injecting commands ++ into other processes which share a tty session using the TIOCSTI ++ ioctl. This option makes TIOCSTI use require CAP_SYS_ADMIN. ++ ++ If this option is not selected, no restrictions will be enforced ++ unless the tiocsti_restrict sysctl is explicitly set to (1). ++ ++ If you are unsure how to answer this question, answer N. ++ + config SECURITY + bool "Enable different security models" + depends on SYSFS + depends on MULTIUSER ++ default y + help + This allows you to choose different security modules to be + configured into your kernel. +@@ -48,6 +72,7 @@ config SECURITYFS + config SECURITY_NETWORK + bool "Socket and Networking Security Hooks" + depends on SECURITY ++ default y + help + This enables the socket and networking security hooks. + If enabled, a security module can use these hooks to +@@ -154,6 +179,7 @@ config HARDENED_USERCOPY + bool "Harden memory copies between kernel and userspace" + depends on HAVE_HARDENED_USERCOPY_ALLOCATOR + imply STRICT_DEVMEM ++ default y + help + This option checks for obviously wrong memory regions when + copying memory to/from the kernel (via copy_to_user() and +@@ -166,7 +192,6 @@ config HARDENED_USERCOPY + config HARDENED_USERCOPY_FALLBACK + bool "Allow usercopy whitelist violations to fallback to object size" + depends on HARDENED_USERCOPY +- default y + help + This is a temporary option that allows missing usercopy whitelists + to be discovered via a WARN() to the kernel log, instead of +@@ -191,10 +216,36 @@ config HARDENED_USERCOPY_PAGESPAN + config FORTIFY_SOURCE + bool "Harden common str/mem functions against buffer overflows" + depends on ARCH_HAS_FORTIFY_SOURCE ++ default y + help + Detect overflows of buffers in common string and memory functions + where the compiler can determine and validate the buffer sizes. + ++config FORTIFY_SOURCE_STRICT_STRING ++ bool "Harden common functions against buffer overflows" ++ depends on FORTIFY_SOURCE ++ depends on EXPERT ++ help ++ Perform stricter overflow checks catching overflows within objects ++ for common C string functions rather than only between objects. ++ ++ This is not yet intended for production use, only bug finding. ++ ++config PAGE_SANITIZE ++ bool "Sanitize pages" ++ default y ++ help ++ Zero fill page allocations on free, reducing the lifetime of ++ sensitive data and helping to mitigate use-after-free bugs. ++ ++config PAGE_SANITIZE_VERIFY ++ bool "Verify sanitized pages" ++ depends on PAGE_SANITIZE ++ default y ++ help ++ Verify that newly allocated pages are zeroed to detect ++ write-after-free bugs. 
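The PAGE_SANITIZE/PAGE_SANITIZE_VERIFY pair described in the help text above follows the same two-step contract as the SLAB_SANITIZE/SLAB_SANITIZE_VERIFY logic in the mm/slub.c hunks earlier in this patch: zero the memory when it is freed, then check on the next allocation that it is still all zero, so any write through a dangling pointer turns into an immediate BUG() instead of silent corruption. A minimal userspace sketch of that contract follows; every name in it is a hypothetical stand-in for illustration, not the kernel API.

#include <assert.h>
#include <stddef.h>
#include <string.h>

/* Hypothetical stand-ins for the kernel's sanitize/verify hooks. */
static void sanitize_on_free(void *obj, size_t size)
{
    /* PAGE_SANITIZE / SLAB_SANITIZE: wipe the object at free time. */
    memset(obj, 0, size);
}

static void verify_on_alloc(const void *obj, size_t size)
{
    /* PAGE_SANITIZE_VERIFY / SLAB_SANITIZE_VERIFY: the object must
     * still be all zero; a write through a dangling pointer between
     * free and reallocation breaks this check. */
    const unsigned char *p = obj;
    size_t i;

    for (i = 0; i < size; i++)
        assert(p[i] == 0);
}

int main(void)
{
    unsigned char slot[64];

    memset(slot, 0xaa, sizeof(slot));     /* object in use */
    sanitize_on_free(slot, sizeof(slot)); /* free path */
    /* slot[13] = 0xff; */                /* a use-after-free write would trip the check */
    verify_on_alloc(slot, sizeof(slot));  /* next allocation path */
    return 0;
}

In the actual slub.c hunk above the verify half is the BUG_ON(memchr_inv(object + offset, 0, s->object_size - offset)) in slab_alloc_node(), with the offset skipping the freelist pointer stored inside the object.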
++ + config STATIC_USERMODEHELPER + bool "Force all usermode helper calls through a single binary" + help +diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig +index 8af7a690eb40..6539694b0fd3 100644 +--- a/security/selinux/Kconfig ++++ b/security/selinux/Kconfig +@@ -2,7 +2,7 @@ config SECURITY_SELINUX + bool "NSA SELinux Support" + depends on SECURITY_NETWORK && AUDIT && NET && INET + select NETWORK_SECMARK +- default n ++ default y + help + This selects NSA Security-Enhanced Linux (SELinux). + You will also need a policy configuration and a labeled filesystem. +@@ -79,23 +79,3 @@ config SECURITY_SELINUX_AVC_STATS + This option collects access vector cache statistics to + /selinux/avc/cache_stats, which may be monitored via + tools such as avcstat. +- +-config SECURITY_SELINUX_CHECKREQPROT_VALUE +- int "NSA SELinux checkreqprot default value" +- depends on SECURITY_SELINUX +- range 0 1 +- default 0 +- help +- This option sets the default value for the 'checkreqprot' flag +- that determines whether SELinux checks the protection requested +- by the application or the protection that will be applied by the +- kernel (including any implied execute for read-implies-exec) for +- mmap and mprotect calls. If this option is set to 0 (zero), +- SELinux will default to checking the protection that will be applied +- by the kernel. If this option is set to 1 (one), SELinux will +- default to checking the protection requested by the application. +- The checkreqprot flag may be changed from the default via the +- 'checkreqprot=' boot parameter. It may also be changed at runtime +- via /selinux/checkreqprot if authorized by policy. +- +- If you are unsure how to answer this question, answer 0. +diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c +index 3c3878f0d2fa..553e52f19f28 100644 +--- a/security/selinux/hooks.c ++++ b/security/selinux/hooks.c +@@ -135,18 +135,7 @@ __setup("selinux=", selinux_enabled_setup); + int selinux_enabled = 1; + #endif + +-static unsigned int selinux_checkreqprot_boot = +- CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE; +- +-static int __init checkreqprot_setup(char *str) +-{ +- unsigned long checkreqprot; +- +- if (!kstrtoul(str, 0, &checkreqprot)) +- selinux_checkreqprot_boot = checkreqprot ? 1 : 0; +- return 1; +-} +-__setup("checkreqprot=", checkreqprot_setup); ++static const unsigned int selinux_checkreqprot_boot; + + static struct kmem_cache *sel_inode_cache; + static struct kmem_cache *file_security_cache; +diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c +index f3a5a138a096..d95f3c5fe6f0 100644 +--- a/security/selinux/selinuxfs.c ++++ b/security/selinux/selinuxfs.c +@@ -640,7 +640,6 @@ static ssize_t sel_read_checkreqprot(struct file *filp, char __user *buf, + static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) + { +- struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info; + char *page; + ssize_t length; + unsigned int new_value; +@@ -664,10 +663,9 @@ static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, + return PTR_ERR(page); + + length = -EINVAL; +- if (sscanf(page, "%u", &new_value) != 1) ++ if (sscanf(page, "%u", &new_value) != 1 || new_value) + goto out; + +- fsi->state->checkreqprot = new_value ? 
1 : 0;
+ length = count;
+ out:
+ kfree(page);
+diff --git a/security/yama/Kconfig b/security/yama/Kconfig
+index 96b27405558a..485c1b85c325 100644
+--- a/security/yama/Kconfig
++++ b/security/yama/Kconfig
+@@ -1,7 +1,7 @@
+ config SECURITY_YAMA
+ bool "Yama support"
+ depends on SECURITY
+- default n
++ default y
+ help
+ This selects Yama, which extends DAC support with additional
+ system-wide security settings beyond regular Linux discretionary
diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-mute-pps_state_mismatch.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-mute-pps_state_mismatch.patch
new file mode 100644
index 00000000..5bc1eff7
--- /dev/null
+++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-mute-pps_state_mismatch.patch
@@ -0,0 +1,16 @@
+diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c
+index 158438bb0389..734718e45aaa 100644
+--- a/drivers/gpu/drm/i915/intel_dp.c
++++ b/drivers/gpu/drm/i915/intel_dp.c
+@@ -5245,7 +5245,10 @@ intel_pps_verify_state(struct drm_i915_private *dev_priv,
+
+ if (hw.t1_t3 != sw->t1_t3 || hw.t8 != sw->t8 || hw.t9 != sw->t9 ||
+ hw.t10 != sw->t10 || hw.t11_t12 != sw->t11_t12) {
+- DRM_ERROR("PPS state mismatch\n");
++ /* This check seems buggy on 4.14.x, so mute it for now,
++ * even though muting is not a real fix:
++ * DRM_ERROR("PPS state mismatch\n");
++ */
+ intel_pps_dump_state("sw", sw);
+ intel_pps_dump_state("hw", &hw);
+ }
diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-nouveau-pascal-backlight.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-nouveau-pascal-backlight.patch
new file mode 100644
index 00000000..754d982a
--- /dev/null
+++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-nouveau-pascal-backlight.patch
@@ -0,0 +1,11 @@
+diff -up linux-4.16/drivers/gpu/drm/nouveau/nouveau_backlight.c.omv~ linux-4.16/drivers/gpu/drm/nouveau/nouveau_backlight.c
+--- linux-4.16/drivers/gpu/drm/nouveau/nouveau_backlight.c.omv~ 2018-04-06 01:04:34.573357055 +0200
++++ linux-4.16/drivers/gpu/drm/nouveau/nouveau_backlight.c 2018-04-06 01:05:46.985579248 +0200
+@@ -287,6 +287,7 @@ nouveau_backlight_init(struct drm_device
+ case NV_DEVICE_INFO_V0_FERMI:
+ case NV_DEVICE_INFO_V0_KEPLER:
+ case NV_DEVICE_INFO_V0_MAXWELL:
++ case NV_DEVICE_INFO_V0_PASCAL:
+ return nv50_backlight_init(connector);
+ default:
+ break;
diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-radeon_dp_aux_transfer_native-no-ratelimited_debug.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-radeon_dp_aux_transfer_native-no-ratelimited_debug.patch
new file mode 100644
index 00000000..6ffcb42c
--- /dev/null
+++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-radeon_dp_aux_transfer_native-no-ratelimited_debug.patch
@@ -0,0 +1,13 @@
+diff --git a/drivers/gpu/drm/radeon/radeon_dp_auxch.c b/drivers/gpu/drm/radeon/radeon_dp_auxch.c
+index 12eac4e75542..a26b8ddd7d3f 100644
+--- a/drivers/gpu/drm/radeon/radeon_dp_auxch.c
++++ b/drivers/gpu/drm/radeon/radeon_dp_auxch.c
+@@ -168,7 +168,7 @@ radeon_dp_aux_transfer_native(struct drm_dp_aux *aux, struct drm_dp_aux_msg *msg
+ goto done;
+ }
+ if (tmp & AUX_RX_ERROR_FLAGS) {
+- DRM_DEBUG_KMS_RATELIMITED("dp_aux_ch flags not zero: %08x\n",
++ DRM_DEBUG_KMS("dp_aux_ch flags not zero: %08x\n",
+ tmp);
+ ret = -EIO;
+ goto done;
diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-redcore-lts-amd64.config b/sys-kernel/linux-sources-redcore-lts/files/4.19-redcore-lts-amd64.config
new file mode 100644
index 00000000..c5bedf65
--- /dev/null
+++ 
b/sys-kernel/linux-sources-redcore-lts/files/4.19-redcore-lts-amd64.config @@ -0,0 +1,9348 @@ +# +# Automatically generated file; DO NOT EDIT. +# Linux/x86 4.19.20-redcore-lts Kernel Configuration +# + +# +# Compiler: gcc (Gentoo Hardened 8.2.0-r1337 p1.6) 8.2.0 +# +CONFIG_CC_IS_GCC=y +CONFIG_GCC_VERSION=80200 +CONFIG_CLANG_VERSION=0 +CONFIG_IRQ_WORK=y +CONFIG_BUILDTIME_EXTABLE_SORT=y +CONFIG_THREAD_INFO_IN_TASK=y + +# +# General setup +# +CONFIG_SCHED_MUQSS=y +CONFIG_INIT_ENV_ARG_LIMIT=32 +# CONFIG_COMPILE_TEST is not set +CONFIG_LOCALVERSION="" +CONFIG_LOCALVERSION_AUTO=y +CONFIG_BUILD_SALT="" +CONFIG_HAVE_KERNEL_GZIP=y +CONFIG_HAVE_KERNEL_BZIP2=y +CONFIG_HAVE_KERNEL_LZMA=y +CONFIG_HAVE_KERNEL_XZ=y +CONFIG_HAVE_KERNEL_LZO=y +CONFIG_HAVE_KERNEL_LZ4=y +# CONFIG_KERNEL_GZIP is not set +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set +# CONFIG_KERNEL_XZ is not set +# CONFIG_KERNEL_LZO is not set +CONFIG_KERNEL_LZ4=y +CONFIG_DEFAULT_HOSTNAME="(none)" +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_SYSVIPC_SYSCTL=y +CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y +CONFIG_CROSS_MEMORY_ATTACH=y +# CONFIG_USELIB is not set +CONFIG_AUDIT=y +CONFIG_HAVE_ARCH_AUDITSYSCALL=y +CONFIG_AUDITSYSCALL=y +CONFIG_AUDIT_WATCH=y +CONFIG_AUDIT_TREE=y + +# +# IRQ subsystem +# +CONFIG_GENERIC_IRQ_PROBE=y +CONFIG_GENERIC_IRQ_SHOW=y +CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y +CONFIG_GENERIC_PENDING_IRQ=y +CONFIG_GENERIC_IRQ_MIGRATION=y +CONFIG_GENERIC_IRQ_CHIP=y +CONFIG_IRQ_DOMAIN=y +CONFIG_IRQ_SIM=y +CONFIG_IRQ_DOMAIN_HIERARCHY=y +CONFIG_GENERIC_MSI_IRQ=y +CONFIG_GENERIC_MSI_IRQ_DOMAIN=y +CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR=y +CONFIG_GENERIC_IRQ_RESERVATION_MODE=y +CONFIG_IRQ_FORCED_THREADING=y +# CONFIG_FORCE_IRQ_THREADING is not set +CONFIG_SPARSE_IRQ=y +# CONFIG_GENERIC_IRQ_DEBUGFS is not set +CONFIG_CLOCKSOURCE_WATCHDOG=y +CONFIG_ARCH_CLOCKSOURCE_DATA=y +CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y +CONFIG_GENERIC_TIME_VSYSCALL=y +CONFIG_GENERIC_CLOCKEVENTS=y +CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y +CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y +CONFIG_GENERIC_CMOS_UPDATE=y + +# +# Timers subsystem +# +CONFIG_TICK_ONESHOT=y +CONFIG_HZ_PERIODIC=y +# CONFIG_NO_HZ_IDLE is not set +# CONFIG_NO_HZ_FULL is not set +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +# CONFIG_PREEMPT_NONE is not set +# CONFIG_PREEMPT_VOLUNTARY is not set +CONFIG_PREEMPT=y +CONFIG_PREEMPT_COUNT=y + +# +# CPU/Task time and stats accounting +# +CONFIG_VIRT_CPU_ACCOUNTING=y +# CONFIG_TICK_CPU_ACCOUNTING is not set +CONFIG_VIRT_CPU_ACCOUNTING_GEN=y +CONFIG_IRQ_TIME_ACCOUNTING=y +CONFIG_HAVE_SCHED_AVG_IRQ=y +CONFIG_BSD_PROCESS_ACCT=y +# CONFIG_BSD_PROCESS_ACCT_V3 is not set +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_CPU_ISOLATION=y + +# +# RCU Subsystem +# +CONFIG_PREEMPT_RCU=y +# CONFIG_RCU_EXPERT is not set +CONFIG_SRCU=y +CONFIG_TREE_SRCU=y +CONFIG_TASKS_RCU=y +CONFIG_RCU_STALL_COMMON=y +CONFIG_RCU_NEED_SEGCBLIST=y +CONFIG_CONTEXT_TRACKING=y +# CONFIG_CONTEXT_TRACKING_FORCE is not set +CONFIG_BUILD_BIN2C=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_LOG_BUF_SHIFT=17 +CONFIG_LOG_CPU_MAX_BUF_SHIFT=12 +CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=13 +CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y +CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y +CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y +CONFIG_ARCH_SUPPORTS_INT128=y +CONFIG_CGROUPS=y +CONFIG_PAGE_COUNTER=y +CONFIG_MEMCG=y +CONFIG_MEMCG_SWAP=y +CONFIG_MEMCG_SWAP_ENABLED=y +CONFIG_MEMCG_KMEM=y +CONFIG_BLK_CGROUP=y +# CONFIG_DEBUG_BLK_CGROUP is not set 
+CONFIG_CGROUP_WRITEBACK=y +CONFIG_CGROUP_SCHED=y +CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_RDMA=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_HUGETLB=y +CONFIG_CPUSETS=y +CONFIG_PROC_PID_CPUSET=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_PERF=y +CONFIG_CGROUP_BPF=y +# CONFIG_CGROUP_DEBUG is not set +CONFIG_SOCK_CGROUP_DATA=y +CONFIG_NAMESPACES=y +CONFIG_UTS_NS=y +CONFIG_IPC_NS=y +CONFIG_USER_NS=y +CONFIG_PID_NS=y +CONFIG_NET_NS=y +# CONFIG_CHECKPOINT_RESTORE is not set +# CONFIG_SYSFS_DEPRECATED is not set +CONFIG_RELAY=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +CONFIG_RD_BZIP2=y +CONFIG_RD_LZMA=y +CONFIG_RD_XZ=y +CONFIG_RD_LZO=y +CONFIG_RD_LZ4=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set +# CONFIG_LOCAL_INIT is not set +CONFIG_SYSCTL=y +CONFIG_ANON_INODES=y +CONFIG_HAVE_UID16=y +CONFIG_SYSCTL_EXCEPTION_TRACE=y +CONFIG_HAVE_PCSPKR_PLATFORM=y +CONFIG_BPF=y +# CONFIG_EXPERT is not set +CONFIG_UID16=y +CONFIG_MULTIUSER=y +CONFIG_SGETMASK_SYSCALL=y +CONFIG_SYSFS_SYSCALL=y +CONFIG_FHANDLE=y +CONFIG_POSIX_TIMERS=y +CONFIG_PRINTK=y +CONFIG_PRINTK_NMI=y +CONFIG_BUG=y +CONFIG_ELF_CORE=y +CONFIG_PCSPKR_PLATFORM=y +CONFIG_BASE_FULL=y +CONFIG_FUTEX=y +CONFIG_FUTEX_PI=y +CONFIG_EPOLL=y +CONFIG_SIGNALFD=y +CONFIG_TIMERFD=y +CONFIG_EVENTFD=y +CONFIG_SHMEM=y +CONFIG_AIO=y +CONFIG_ADVISE_SYSCALLS=y +CONFIG_MEMBARRIER=y +CONFIG_KALLSYMS=y +CONFIG_KALLSYMS_ALL=y +CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y +CONFIG_KALLSYMS_BASE_RELATIVE=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_USERFAULTFD=y +CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y +CONFIG_RSEQ=y +# CONFIG_EMBEDDED is not set +CONFIG_HAVE_PERF_EVENTS=y + +# +# Kernel Performance Events And Counters +# +CONFIG_PERF_EVENTS=y +# CONFIG_DEBUG_PERF_USE_VMALLOC is not set +CONFIG_VM_EVENT_COUNTERS=y +CONFIG_SLUB_DEBUG=y +# CONFIG_COMPAT_BRK is not set +# CONFIG_SLAB is not set +CONFIG_SLUB=y +CONFIG_SLAB_MERGE_DEFAULT=y +CONFIG_SLAB_FREELIST_RANDOM=y +CONFIG_SLAB_FREELIST_HARDENED=y +CONFIG_SLAB_HARDENED=y +CONFIG_SLAB_SANITIZE=y +CONFIG_SLAB_SANITIZE_VERIFY=y +CONFIG_SLUB_CPU_PARTIAL=y +CONFIG_SYSTEM_DATA_VERIFICATION=y +CONFIG_PROFILING=y +CONFIG_TRACEPOINTS=y +CONFIG_64BIT=y +CONFIG_X86_64=y +CONFIG_X86=y +CONFIG_INSTRUCTION_DECODER=y +CONFIG_OUTPUT_FORMAT="elf64-x86-64" +CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig" +CONFIG_LOCKDEP_SUPPORT=y +CONFIG_STACKTRACE_SUPPORT=y +CONFIG_MMU=y +CONFIG_ARCH_MMAP_RND_BITS_MIN=28 +CONFIG_ARCH_MMAP_RND_BITS_MAX=32 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16 +CONFIG_GENERIC_ISA_DMA=y +CONFIG_GENERIC_BUG=y +CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y +CONFIG_GENERIC_HWEIGHT=y +CONFIG_ARCH_MAY_HAVE_PC_FDC=y +CONFIG_RWSEM_XCHGADD_ALGORITHM=y +CONFIG_GENERIC_CALIBRATE_DELAY=y +CONFIG_ARCH_HAS_CPU_RELAX=y +CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y +CONFIG_ARCH_HAS_FILTER_PGPROT=y +CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y +CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y +CONFIG_ARCH_HIBERNATION_POSSIBLE=y +CONFIG_ARCH_SUSPEND_POSSIBLE=y +CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y +CONFIG_ARCH_WANT_GENERAL_HUGETLB=y +CONFIG_ZONE_DMA32=y +CONFIG_AUDIT_ARCH=y +CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y +CONFIG_HAVE_INTEL_TXT=y +CONFIG_X86_64_SMP=y +CONFIG_ARCH_SUPPORTS_UPROBES=y +CONFIG_FIX_EARLYCON_MEM=y +CONFIG_DYNAMIC_PHYSICAL_MASK=y +CONFIG_PGTABLE_LEVELS=4 +CONFIG_CC_HAS_SANE_STACKPROTECTOR=y + +# +# Processor type and features +# +CONFIG_ZONE_DMA=y +CONFIG_SMP=y 
+CONFIG_X86_FEATURE_NAMES=y +CONFIG_X86_X2APIC=y +CONFIG_X86_MPPARSE=y +# CONFIG_GOLDFISH is not set +CONFIG_RETPOLINE=y +CONFIG_INTEL_RDT=y +# CONFIG_X86_EXTENDED_PLATFORM is not set +CONFIG_X86_INTEL_LPSS=y +CONFIG_X86_AMD_PLATFORM_DEVICE=y +CONFIG_IOSF_MBI=y +# CONFIG_IOSF_MBI_DEBUG is not set +CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y +CONFIG_SCHED_OMIT_FRAME_POINTER=y +CONFIG_HYPERVISOR_GUEST=y +CONFIG_PARAVIRT=y +# CONFIG_PARAVIRT_DEBUG is not set +# CONFIG_PARAVIRT_SPINLOCKS is not set +# CONFIG_XEN is not set +CONFIG_KVM_GUEST=y +# CONFIG_KVM_DEBUG_FS is not set +# CONFIG_PARAVIRT_TIME_ACCOUNTING is not set +CONFIG_PARAVIRT_CLOCK=y +CONFIG_JAILHOUSE_GUEST=y +CONFIG_NO_BOOTMEM=y +# CONFIG_MK8 is not set +# CONFIG_MPSC is not set +# CONFIG_MCORE2 is not set +# CONFIG_MATOM is not set +CONFIG_GENERIC_CPU=y +CONFIG_X86_INTERNODE_CACHE_SHIFT=6 +CONFIG_X86_L1_CACHE_SHIFT=6 +CONFIG_X86_TSC=y +CONFIG_X86_CMPXCHG64=y +CONFIG_X86_CMOV=y +CONFIG_X86_MINIMUM_CPU_FAMILY=64 +CONFIG_X86_DEBUGCTLMSR=y +CONFIG_CPU_SUP_INTEL=y +CONFIG_CPU_SUP_AMD=y +CONFIG_CPU_SUP_CENTAUR=y +CONFIG_HPET_TIMER=y +CONFIG_HPET_EMULATE_RTC=y +CONFIG_DMI=y +CONFIG_GART_IOMMU=y +# CONFIG_CALGARY_IOMMU is not set +CONFIG_MAXSMP=y +CONFIG_NR_CPUS_RANGE_BEGIN=8192 +CONFIG_NR_CPUS_RANGE_END=8192 +CONFIG_NR_CPUS_DEFAULT=8192 +CONFIG_NR_CPUS=8192 +CONFIG_SCHED_SMT=y +CONFIG_SMT_NICE=y +CONFIG_SCHED_MC=y +CONFIG_SCHED_MC_PRIO=y +# CONFIG_RQ_NONE is not set +# CONFIG_RQ_SMT is not set +CONFIG_RQ_MC=y +# CONFIG_RQ_SMP is not set +CONFIG_SHARERQ=2 +CONFIG_X86_LOCAL_APIC=y +CONFIG_X86_IO_APIC=y +CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y +CONFIG_X86_MCE=y +# CONFIG_X86_MCELOG_LEGACY is not set +CONFIG_X86_MCE_INTEL=y +CONFIG_X86_MCE_AMD=y +CONFIG_X86_MCE_THRESHOLD=y +# CONFIG_X86_MCE_INJECT is not set +CONFIG_X86_THERMAL_VECTOR=y + +# +# Performance monitoring +# +CONFIG_PERF_EVENTS_INTEL_UNCORE=y +CONFIG_PERF_EVENTS_INTEL_RAPL=y +CONFIG_PERF_EVENTS_INTEL_CSTATE=y +CONFIG_PERF_EVENTS_AMD_POWER=m +CONFIG_X86_16BIT=y +CONFIG_X86_ESPFIX64=y +CONFIG_X86_VSYSCALL_EMULATION=y +CONFIG_I8K=m +CONFIG_MICROCODE=y +CONFIG_MICROCODE_INTEL=y +CONFIG_MICROCODE_AMD=y +CONFIG_MICROCODE_OLD_INTERFACE=y +CONFIG_X86_MSR=m +CONFIG_X86_CPUID=m +# CONFIG_X86_5LEVEL is not set +CONFIG_X86_DIRECT_GBPAGES=y +CONFIG_ARCH_HAS_MEM_ENCRYPT=y +CONFIG_AMD_MEM_ENCRYPT=y +# CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT is not set +CONFIG_ARCH_USE_MEMREMAP_PROT=y +CONFIG_NUMA=y +CONFIG_AMD_NUMA=y +CONFIG_X86_64_ACPI_NUMA=y +CONFIG_NODES_SPAN_OTHER_NODES=y +# CONFIG_NUMA_EMU is not set +CONFIG_NODES_SHIFT=10 +CONFIG_ARCH_SPARSEMEM_ENABLE=y +CONFIG_ARCH_SPARSEMEM_DEFAULT=y +CONFIG_ARCH_SELECT_MEMORY_MODEL=y +CONFIG_ARCH_MEMORY_PROBE=y +CONFIG_ARCH_PROC_KCORE_TEXT=y +CONFIG_ILLEGAL_POINTER_VALUE=0xdead000000000000 +CONFIG_X86_PMEM_LEGACY_DEVICE=y +CONFIG_X86_PMEM_LEGACY=m +CONFIG_X86_CHECK_BIOS_CORRUPTION=y +CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y +CONFIG_X86_RESERVE_LOW=64 +CONFIG_MTRR=y +CONFIG_MTRR_SANITIZER=y +CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=0 +CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=1 +CONFIG_X86_PAT=y +CONFIG_ARCH_USES_PG_UNCACHED=y +CONFIG_ARCH_RANDOM=y +CONFIG_X86_SMAP=y +CONFIG_X86_INTEL_UMIP=y +CONFIG_X86_INTEL_MPX=y +CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y +CONFIG_EFI=y +CONFIG_EFI_STUB=y +CONFIG_EFI_MIXED=y +CONFIG_SECCOMP=y +CONFIG_HZ_100=y +# CONFIG_HZ_250_NODEF is not set +# CONFIG_HZ_300_NODEF is not set +# CONFIG_HZ_1000_NODEF is not set +CONFIG_HZ=100 +CONFIG_SCHED_HRTICK=y +CONFIG_KEXEC=y +# CONFIG_CRASH_DUMP is not set +CONFIG_KEXEC_JUMP=y 
+CONFIG_PHYSICAL_START=0x1000000 +CONFIG_RELOCATABLE=y +CONFIG_RANDOMIZE_BASE=y +CONFIG_X86_NEED_RELOCS=y +CONFIG_PHYSICAL_ALIGN=0x1000000 +CONFIG_DYNAMIC_MEMORY_LAYOUT=y +CONFIG_RANDOMIZE_MEMORY=y +CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING=0xa +CONFIG_HOTPLUG_CPU=y +CONFIG_BOOTPARAM_HOTPLUG_CPU0=y +# CONFIG_DEBUG_HOTPLUG_CPU0 is not set +# CONFIG_COMPAT_VDSO is not set +CONFIG_LEGACY_VSYSCALL_EMULATE=y +# CONFIG_LEGACY_VSYSCALL_NONE is not set +# CONFIG_CMDLINE_BOOL is not set +CONFIG_MODIFY_LDT_SYSCALL=y +CONFIG_HAVE_LIVEPATCH=y +# CONFIG_LIVEPATCH is not set +CONFIG_ARCH_HAS_ADD_PAGES=y +CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y +CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y +CONFIG_USE_PERCPU_NUMA_NODE_ID=y +CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y +CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y +CONFIG_ARCH_ENABLE_THP_MIGRATION=y + +# +# Power management and ACPI options +# +CONFIG_ARCH_HIBERNATION_HEADER=y +CONFIG_SUSPEND=y +CONFIG_SUSPEND_FREEZER=y +CONFIG_HIBERNATE_CALLBACKS=y +CONFIG_HIBERNATION=y +CONFIG_PM_STD_PARTITION="" +CONFIG_PM_SLEEP=y +CONFIG_PM_SLEEP_SMP=y +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=100 +CONFIG_PM_WAKELOCKS_GC=y +CONFIG_PM=y +# CONFIG_PM_DEBUG is not set +CONFIG_PM_CLK=y +CONFIG_PM_GENERIC_DOMAINS=y +# CONFIG_WQ_POWER_EFFICIENT_DEFAULT is not set +CONFIG_PM_GENERIC_DOMAINS_SLEEP=y +CONFIG_ARCH_SUPPORTS_ACPI=y +CONFIG_ACPI=y +CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y +CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y +CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y +# CONFIG_ACPI_DEBUGGER is not set +CONFIG_ACPI_SPCR_TABLE=y +CONFIG_ACPI_LPIT=y +CONFIG_ACPI_SLEEP=y +# CONFIG_ACPI_PROCFS_POWER is not set +CONFIG_ACPI_REV_OVERRIDE_POSSIBLE=y +# CONFIG_ACPI_EC_DEBUGFS is not set +CONFIG_ACPI_AC=m +CONFIG_ACPI_BATTERY=m +CONFIG_ACPI_BUTTON=m +CONFIG_ACPI_VIDEO=m +CONFIG_ACPI_FAN=m +CONFIG_ACPI_TAD=m +CONFIG_ACPI_DOCK=y +CONFIG_ACPI_CPU_FREQ_PSS=y +CONFIG_ACPI_PROCESSOR_CSTATE=y +CONFIG_ACPI_PROCESSOR_IDLE=y +CONFIG_ACPI_CPPC_LIB=y +CONFIG_ACPI_PROCESSOR=y +CONFIG_ACPI_IPMI=m +CONFIG_ACPI_HOTPLUG_CPU=y +CONFIG_ACPI_PROCESSOR_AGGREGATOR=m +CONFIG_ACPI_THERMAL=m +CONFIG_ACPI_NUMA=y +CONFIG_ARCH_HAS_ACPI_TABLE_UPGRADE=y +CONFIG_ACPI_TABLE_UPGRADE=y +# CONFIG_ACPI_DEBUG is not set +CONFIG_ACPI_PCI_SLOT=y +CONFIG_ACPI_CONTAINER=y +CONFIG_ACPI_HOTPLUG_MEMORY=y +CONFIG_ACPI_HOTPLUG_IOAPIC=y +CONFIG_ACPI_SBS=m +CONFIG_ACPI_HED=y +# CONFIG_ACPI_CUSTOM_METHOD is not set +CONFIG_ACPI_BGRT=y +CONFIG_ACPI_NFIT=m +CONFIG_HAVE_ACPI_APEI=y +CONFIG_HAVE_ACPI_APEI_NMI=y +CONFIG_ACPI_APEI=y +CONFIG_ACPI_APEI_GHES=y +CONFIG_ACPI_APEI_PCIEAER=y +CONFIG_ACPI_APEI_MEMORY_FAILURE=y +# CONFIG_ACPI_APEI_EINJ is not set +# CONFIG_ACPI_APEI_ERST_DEBUG is not set +CONFIG_DPTF_POWER=m +CONFIG_ACPI_WATCHDOG=y +CONFIG_ACPI_EXTLOG=m +CONFIG_PMIC_OPREGION=y +# CONFIG_XPOWER_PMIC_OPREGION is not set +# CONFIG_BXT_WC_PMIC_OPREGION is not set +CONFIG_CHT_DC_TI_PMIC_OPREGION=y +CONFIG_ACPI_CONFIGFS=m +CONFIG_X86_PM_TIMER=y +CONFIG_SFI=y + +# +# CPU Frequency scaling +# +CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_GOV_ATTR_SET=y +CONFIG_CPU_FREQ_GOV_COMMON=y +CONFIG_CPU_FREQ_STAT=y +CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set +CONFIG_CPU_FREQ_GOV_PERFORMANCE=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=y +CONFIG_CPU_FREQ_GOV_USERSPACE=y +CONFIG_CPU_FREQ_GOV_ONDEMAND=y 
+CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y +CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y + +# +# CPU frequency scaling drivers +# +CONFIG_X86_INTEL_PSTATE=y +CONFIG_X86_PCC_CPUFREQ=m +CONFIG_X86_ACPI_CPUFREQ=m +CONFIG_X86_ACPI_CPUFREQ_CPB=y +CONFIG_X86_POWERNOW_K8=m +CONFIG_X86_AMD_FREQ_SENSITIVITY=m +# CONFIG_X86_SPEEDSTEP_CENTRINO is not set +# CONFIG_X86_P4_CLOCKMOD is not set + +# +# shared options +# + +# +# CPU Idle +# +CONFIG_CPU_IDLE=y +CONFIG_CPU_IDLE_GOV_LADDER=y +CONFIG_CPU_IDLE_GOV_MENU=y +CONFIG_INTEL_IDLE=y + +# +# Bus options (PCI etc.) +# +CONFIG_PCI=y +CONFIG_PCI_DIRECT=y +CONFIG_PCI_MMCONFIG=y +CONFIG_PCI_DOMAINS=y +CONFIG_MMCONF_FAM10H=y +CONFIG_PCIEPORTBUS=y +CONFIG_HOTPLUG_PCI_PCIE=y +CONFIG_PCIEAER=y +CONFIG_PCIEAER_INJECT=m +CONFIG_PCIE_ECRC=y +CONFIG_PCIEASPM=y +# CONFIG_PCIEASPM_DEBUG is not set +CONFIG_PCIEASPM_DEFAULT=y +# CONFIG_PCIEASPM_POWERSAVE is not set +# CONFIG_PCIEASPM_POWER_SUPERSAVE is not set +# CONFIG_PCIEASPM_PERFORMANCE is not set +CONFIG_PCIE_PME=y +CONFIG_PCIE_DPC=y +CONFIG_PCIE_PTM=y +CONFIG_PCI_MSI=y +CONFIG_PCI_MSI_IRQ_DOMAIN=y +CONFIG_PCI_QUIRKS=y +# CONFIG_PCI_DEBUG is not set +CONFIG_PCI_REALLOC_ENABLE_AUTO=y +CONFIG_PCI_STUB=m +CONFIG_PCI_PF_STUB=m +CONFIG_PCI_ATS=y +CONFIG_PCI_LOCKLESS_CONFIG=y +CONFIG_PCI_IOV=y +CONFIG_PCI_PRI=y +CONFIG_PCI_PASID=y +CONFIG_PCI_LABEL=y +CONFIG_PCI_HYPERV=m +CONFIG_HOTPLUG_PCI=y +CONFIG_HOTPLUG_PCI_ACPI=y +CONFIG_HOTPLUG_PCI_ACPI_IBM=m +CONFIG_HOTPLUG_PCI_CPCI=y +CONFIG_HOTPLUG_PCI_CPCI_ZT5550=m +CONFIG_HOTPLUG_PCI_CPCI_GENERIC=m +# CONFIG_HOTPLUG_PCI_SHPC is not set + +# +# PCI controller drivers +# + +# +# Cadence PCIe controllers support +# +CONFIG_VMD=m + +# +# DesignWare PCI Core Support +# +# CONFIG_PCIE_DW_PLAT_HOST is not set +# CONFIG_PCIE_DW_PLAT_EP is not set + +# +# PCI Endpoint +# +CONFIG_PCI_ENDPOINT=y +CONFIG_PCI_ENDPOINT_CONFIGFS=y +# CONFIG_PCI_EPF_TEST is not set + +# +# PCI switch controller drivers +# +CONFIG_PCI_SW_SWITCHTEC=m +CONFIG_ISA_DMA_API=y +CONFIG_AMD_NB=y +CONFIG_PCCARD=m +CONFIG_PCMCIA=m +CONFIG_PCMCIA_LOAD_CIS=y +CONFIG_CARDBUS=y + +# +# PC-card bridges +# +CONFIG_YENTA=m +CONFIG_YENTA_O2=y +CONFIG_YENTA_RICOH=y +CONFIG_YENTA_TI=y +CONFIG_YENTA_ENE_TUNE=y +CONFIG_YENTA_TOSHIBA=y +CONFIG_PD6729=m +CONFIG_I82092=m +CONFIG_PCCARD_NONSTATIC=y +CONFIG_RAPIDIO=y +CONFIG_RAPIDIO_TSI721=y +CONFIG_RAPIDIO_DISC_TIMEOUT=30 +CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS=y +CONFIG_RAPIDIO_DMA_ENGINE=y +# CONFIG_RAPIDIO_DEBUG is not set +CONFIG_RAPIDIO_ENUM_BASIC=m +CONFIG_RAPIDIO_CHMAN=m +CONFIG_RAPIDIO_MPORT_CDEV=m + +# +# RapidIO Switch drivers +# +CONFIG_RAPIDIO_TSI57X=y +CONFIG_RAPIDIO_CPS_XX=y +CONFIG_RAPIDIO_TSI568=y +CONFIG_RAPIDIO_CPS_GEN2=y +CONFIG_RAPIDIO_RXS_GEN3=m +CONFIG_X86_SYSFB=y + +# +# Binary Emulations +# +CONFIG_IA32_EMULATION=y +CONFIG_IA32_AOUT=y +CONFIG_X86_X32=y +CONFIG_COMPAT_32=y +CONFIG_COMPAT=y +CONFIG_COMPAT_FOR_U64_ALIGNMENT=y +CONFIG_SYSVIPC_COMPAT=y +CONFIG_X86_DEV_DMA_OPS=y +CONFIG_HAVE_GENERIC_GUP=y + +# +# Firmware Drivers +# +CONFIG_EDD=m +# CONFIG_EDD_OFF is not set +CONFIG_FIRMWARE_MEMMAP=y +CONFIG_DELL_RBU=m +CONFIG_DCDBAS=m +CONFIG_DMIID=y +CONFIG_DMI_SYSFS=m +CONFIG_DMI_SCAN_MACHINE_NON_EFI_FALLBACK=y +CONFIG_ISCSI_IBFT_FIND=y +CONFIG_ISCSI_IBFT=m +CONFIG_FW_CFG_SYSFS=m +# CONFIG_FW_CFG_SYSFS_CMDLINE is not set +CONFIG_GOOGLE_FIRMWARE=y +CONFIG_GOOGLE_SMI=m +CONFIG_GOOGLE_COREBOOT_TABLE=m +CONFIG_GOOGLE_COREBOOT_TABLE_ACPI=m +CONFIG_GOOGLE_MEMCONSOLE=m +CONFIG_GOOGLE_MEMCONSOLE_X86_LEGACY=m +CONFIG_GOOGLE_FRAMEBUFFER_COREBOOT=m +CONFIG_GOOGLE_MEMCONSOLE_COREBOOT=m 
+CONFIG_GOOGLE_VPD=m + +# +# EFI (Extensible Firmware Interface) Support +# +CONFIG_EFI_VARS=m +CONFIG_EFI_ESRT=y +CONFIG_EFI_VARS_PSTORE=m +CONFIG_EFI_VARS_PSTORE_DEFAULT_DISABLE=y +CONFIG_EFI_RUNTIME_MAP=y +# CONFIG_EFI_FAKE_MEMMAP is not set +CONFIG_EFI_RUNTIME_WRAPPERS=y +CONFIG_EFI_BOOTLOADER_CONTROL=m +CONFIG_EFI_CAPSULE_LOADER=m +CONFIG_EFI_TEST=m +CONFIG_APPLE_PROPERTIES=y +CONFIG_RESET_ATTACK_MITIGATION=y +CONFIG_UEFI_CPER=y +CONFIG_UEFI_CPER_X86=y +CONFIG_EFI_DEV_PATH_PARSER=y + +# +# Tegra firmware driver +# +CONFIG_HAVE_KVM=y +CONFIG_HAVE_KVM_IRQCHIP=y +CONFIG_HAVE_KVM_IRQFD=y +CONFIG_HAVE_KVM_IRQ_ROUTING=y +CONFIG_HAVE_KVM_EVENTFD=y +CONFIG_KVM_MMIO=y +CONFIG_KVM_ASYNC_PF=y +CONFIG_HAVE_KVM_MSI=y +CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y +CONFIG_KVM_VFIO=y +CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y +CONFIG_KVM_COMPAT=y +CONFIG_HAVE_KVM_IRQ_BYPASS=y +CONFIG_VIRTUALIZATION=y +CONFIG_KVM=m +CONFIG_KVM_INTEL=m +CONFIG_KVM_AMD=m +CONFIG_KVM_AMD_SEV=y +# CONFIG_KVM_MMU_AUDIT is not set +CONFIG_VHOST_NET=m +CONFIG_VHOST_SCSI=m +CONFIG_VHOST_VSOCK=m +CONFIG_VHOST=m +# CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set + +# +# General architecture-dependent options +# +CONFIG_CRASH_CORE=y +CONFIG_KEXEC_CORE=y +CONFIG_HOTPLUG_SMT=y +# CONFIG_OPROFILE is not set +CONFIG_HAVE_OPROFILE=y +CONFIG_OPROFILE_NMI_TIMER=y +CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y +# CONFIG_STATIC_KEYS_SELFTEST is not set +CONFIG_OPTPROBES=y +CONFIG_KPROBES_ON_FTRACE=y +CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y +CONFIG_ARCH_USE_BUILTIN_BSWAP=y +CONFIG_KRETPROBES=y +CONFIG_USER_RETURN_NOTIFIER=y +CONFIG_HAVE_IOREMAP_PROT=y +CONFIG_HAVE_KPROBES=y +CONFIG_HAVE_KRETPROBES=y +CONFIG_HAVE_OPTPROBES=y +CONFIG_HAVE_KPROBES_ON_FTRACE=y +CONFIG_HAVE_FUNCTION_ERROR_INJECTION=y +CONFIG_HAVE_NMI=y +CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_HAVE_DMA_CONTIGUOUS=y +CONFIG_GENERIC_SMP_IDLE_THREAD=y +CONFIG_ARCH_HAS_FORTIFY_SOURCE=y +CONFIG_ARCH_HAS_SET_MEMORY=y +CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y +CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y +CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y +CONFIG_HAVE_RSEQ=y +CONFIG_HAVE_CLK=y +CONFIG_HAVE_HW_BREAKPOINT=y +CONFIG_HAVE_MIXED_BREAKPOINTS_REGS=y +CONFIG_HAVE_USER_RETURN_NOTIFIER=y +CONFIG_HAVE_PERF_EVENTS_NMI=y +CONFIG_HAVE_HARDLOCKUP_DETECTOR_PERF=y +CONFIG_HAVE_PERF_REGS=y +CONFIG_HAVE_PERF_USER_STACK_DUMP=y +CONFIG_HAVE_ARCH_JUMP_LABEL=y +CONFIG_HAVE_RCU_TABLE_FREE=y +CONFIG_HAVE_RCU_TABLE_INVALIDATE=y +CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y +CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y +CONFIG_HAVE_CMPXCHG_LOCAL=y +CONFIG_HAVE_CMPXCHG_DOUBLE=y +CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION=y +CONFIG_ARCH_WANT_OLD_COMPAT_IPC=y +CONFIG_HAVE_ARCH_SECCOMP_FILTER=y +CONFIG_SECCOMP_FILTER=y +CONFIG_HAVE_STACKPROTECTOR=y +CONFIG_CC_HAS_STACKPROTECTOR_NONE=y +CONFIG_STACKPROTECTOR=y +CONFIG_STACKPROTECTOR_STRONG=y +CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES=y +CONFIG_HAVE_CONTEXT_TRACKING=y +CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y +CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=y +CONFIG_HAVE_ARCH_HUGE_VMAP=y +CONFIG_HAVE_ARCH_SOFT_DIRTY=y +CONFIG_HAVE_MOD_ARCH_SPECIFIC=y +CONFIG_MODULES_USE_ELF_RELA=y +CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK=y +CONFIG_ARCH_HAS_ELF_RANDOMIZE=y +CONFIG_HAVE_ARCH_MMAP_RND_BITS=y +CONFIG_HAVE_EXIT_THREAD=y +CONFIG_ARCH_MMAP_RND_BITS=32 +CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y +CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16 +CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES=y +CONFIG_HAVE_COPY_THREAD_TLS=y +CONFIG_HAVE_STACK_VALIDATION=y 
+CONFIG_HAVE_RELIABLE_STACKTRACE=y +CONFIG_ISA_BUS_API=y +CONFIG_OLD_SIGSUSPEND3=y +CONFIG_COMPAT_OLD_SIGACTION=y +CONFIG_COMPAT_32BIT_TIME=y +CONFIG_HAVE_ARCH_VMAP_STACK=y +CONFIG_VMAP_STACK=y +CONFIG_ARCH_HAS_STRICT_KERNEL_RWX=y +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y +CONFIG_STRICT_MODULE_RWX=y +CONFIG_ARCH_HAS_REFCOUNT=y +CONFIG_REFCOUNT_FULL=y +CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y + +# +# GCOV-based kernel profiling +# +# CONFIG_GCOV_KERNEL is not set +CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y +CONFIG_PLUGIN_HOSTCC="g++" +CONFIG_HAVE_GCC_PLUGINS=y +# CONFIG_GCC_PLUGINS is not set +CONFIG_RT_MUTEXES=y +CONFIG_BASE_SMALL=0 +CONFIG_MODULES=y +CONFIG_MODULE_FORCE_LOAD=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_FORCE_UNLOAD=y +CONFIG_MODVERSIONS=y +CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_MODULE_SIG=y +# CONFIG_MODULE_SIG_FORCE is not set +CONFIG_MODULE_SIG_ALL=y +# CONFIG_MODULE_SIG_SHA1 is not set +# CONFIG_MODULE_SIG_SHA224 is not set +# CONFIG_MODULE_SIG_SHA256 is not set +# CONFIG_MODULE_SIG_SHA384 is not set +CONFIG_MODULE_SIG_SHA512=y +CONFIG_MODULE_SIG_HASH="sha512" +CONFIG_MODULE_COMPRESS=y +CONFIG_MODULE_COMPRESS_GZIP=y +# CONFIG_MODULE_COMPRESS_XZ is not set +# CONFIG_TRIM_UNUSED_KSYMS is not set +CONFIG_MODULES_TREE_LOOKUP=y +CONFIG_BLOCK=y +CONFIG_BLK_SCSI_REQUEST=y +CONFIG_BLK_DEV_BSG=y +CONFIG_BLK_DEV_BSGLIB=y +CONFIG_BLK_DEV_INTEGRITY=y +CONFIG_BLK_DEV_ZONED=y +CONFIG_BLK_DEV_THROTTLING=y +# CONFIG_BLK_DEV_THROTTLING_LOW is not set +CONFIG_BLK_CMDLINE_PARSER=y +CONFIG_BLK_WBT=y +# CONFIG_BLK_CGROUP_IOLATENCY is not set +CONFIG_BLK_WBT_SQ=y +CONFIG_BLK_WBT_MQ=y +CONFIG_BLK_DEBUG_FS=y +CONFIG_BLK_DEBUG_FS_ZONED=y +# CONFIG_BLK_SED_OPAL is not set + +# +# Partition Types +# +CONFIG_PARTITION_ADVANCED=y +CONFIG_ACORN_PARTITION=y +CONFIG_ACORN_PARTITION_CUMANA=y +CONFIG_ACORN_PARTITION_EESOX=y +CONFIG_ACORN_PARTITION_ICS=y +CONFIG_ACORN_PARTITION_ADFS=y +CONFIG_ACORN_PARTITION_POWERTEC=y +CONFIG_ACORN_PARTITION_RISCIX=y +CONFIG_AIX_PARTITION=y +CONFIG_OSF_PARTITION=y +CONFIG_AMIGA_PARTITION=y +CONFIG_ATARI_PARTITION=y +CONFIG_MAC_PARTITION=y +CONFIG_MSDOS_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_LDM_PARTITION=y +CONFIG_LDM_DEBUG=y +CONFIG_SGI_PARTITION=y +CONFIG_ULTRIX_PARTITION=y +CONFIG_SUN_PARTITION=y +CONFIG_KARMA_PARTITION=y +CONFIG_EFI_PARTITION=y +CONFIG_SYSV68_PARTITION=y +CONFIG_CMDLINE_PARTITION=y +CONFIG_BLOCK_COMPAT=y +CONFIG_BLK_MQ_PCI=y +CONFIG_BLK_MQ_VIRTIO=y +CONFIG_BLK_MQ_RDMA=y + +# +# IO Schedulers +# +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_DEADLINE=y +CONFIG_IOSCHED_CFQ=y +CONFIG_CFQ_GROUP_IOSCHED=y +CONFIG_IOSCHED_BFQ_SQ=y +CONFIG_BFQ_SQ_GROUP_IOSCHED=y +# CONFIG_DEFAULT_DEADLINE is not set +# CONFIG_DEFAULT_CFQ is not set +CONFIG_DEFAULT_BFQ_SQ=y +# CONFIG_DEFAULT_NOOP is not set +CONFIG_DEFAULT_IOSCHED="bfq-sq" +CONFIG_MQ_IOSCHED_BFQ=y +CONFIG_MQ_BFQ_GROUP_IOSCHED=y +CONFIG_MQ_IOSCHED_DEADLINE=y +# CONFIG_MQ_IOSCHED_KYBER is not set +CONFIG_IOSCHED_BFQ=y +CONFIG_BFQ_GROUP_IOSCHED=y +CONFIG_PREEMPT_NOTIFIERS=y +CONFIG_PADATA=y +CONFIG_ASN1=y +CONFIG_UNINLINE_SPIN_UNLOCK=y +CONFIG_ARCH_SUPPORTS_ATOMIC_RMW=y +CONFIG_MUTEX_SPIN_ON_OWNER=y +CONFIG_RWSEM_SPIN_ON_OWNER=y +CONFIG_LOCK_SPIN_ON_OWNER=y +CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y +CONFIG_QUEUED_SPINLOCKS=y +CONFIG_ARCH_USE_QUEUED_RWLOCKS=y +CONFIG_QUEUED_RWLOCKS=y +CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE=y +CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y +CONFIG_FREEZER=y + +# +# Executable file formats +# 
+CONFIG_BINFMT_ELF=y +CONFIG_COMPAT_BINFMT_ELF=y +CONFIG_ELFCORE=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y +CONFIG_BINFMT_SCRIPT=y +CONFIG_BINFMT_MISC=y +CONFIG_COREDUMP=y + +# +# Memory Management options +# +CONFIG_SELECT_MEMORY_MODEL=y +CONFIG_SPARSEMEM_MANUAL=y +CONFIG_SPARSEMEM=y +CONFIG_NEED_MULTIPLE_NODES=y +CONFIG_HAVE_MEMORY_PRESENT=y +CONFIG_SPARSEMEM_EXTREME=y +CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y +CONFIG_SPARSEMEM_VMEMMAP=y +CONFIG_HAVE_MEMBLOCK=y +CONFIG_HAVE_MEMBLOCK_NODE_MAP=y +CONFIG_ARCH_DISCARD_MEMBLOCK=y +CONFIG_MEMORY_ISOLATION=y +CONFIG_HAVE_BOOTMEM_INFO_NODE=y +CONFIG_MEMORY_HOTPLUG=y +CONFIG_MEMORY_HOTPLUG_SPARSE=y +# CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE is not set +CONFIG_MEMORY_HOTREMOVE=y +CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_MEMORY_BALLOON=y +CONFIG_BALLOON_COMPACTION=y +CONFIG_COMPACTION=y +CONFIG_MIGRATION=y +CONFIG_PHYS_ADDR_T_64BIT=y +CONFIG_BOUNCE=y +CONFIG_VIRT_TO_BUS=y +CONFIG_MMU_NOTIFIER=y +CONFIG_KSM=y +CONFIG_UKSM=y +# CONFIG_KSM_LEGACY is not set +CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 +CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y +CONFIG_MEMORY_FAILURE=y +# CONFIG_HWPOISON_INJECT is not set +CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y +# CONFIG_TRANSPARENT_HUGEPAGE_MADVISE is not set +CONFIG_ARCH_WANTS_THP_SWAP=y +CONFIG_THP_SWAP=y +CONFIG_TRANSPARENT_HUGE_PAGECACHE=y +CONFIG_CLEANCACHE=y +CONFIG_FRONTSWAP=y +CONFIG_CMA=y +# CONFIG_CMA_DEBUG is not set +# CONFIG_CMA_DEBUGFS is not set +CONFIG_CMA_AREAS=7 +# CONFIG_ZSWAP is not set +CONFIG_ZPOOL=m +CONFIG_ZBUD=m +CONFIG_Z3FOLD=m +CONFIG_ZSMALLOC=y +# CONFIG_PGTABLE_MAPPING is not set +# CONFIG_ZSMALLOC_STAT is not set +CONFIG_GENERIC_EARLY_IOREMAP=y +# CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set +# CONFIG_IDLE_PAGE_TRACKING is not set +CONFIG_ARCH_HAS_ZONE_DEVICE=y +# CONFIG_ZONE_DEVICE is not set +CONFIG_FRAME_VECTOR=y +CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y +CONFIG_ARCH_HAS_PKEYS=y +# CONFIG_PERCPU_STATS is not set +# CONFIG_GUP_BENCHMARK is not set +CONFIG_ARCH_HAS_PTE_SPECIAL=y +CONFIG_NET=y +CONFIG_COMPAT_NETLINK_MESSAGES=y +CONFIG_NET_INGRESS=y +CONFIG_NET_EGRESS=y + +# +# Networking options +# +CONFIG_PACKET=m +CONFIG_PACKET_DIAG=m +CONFIG_UNIX=m +CONFIG_UNIX_DIAG=m +CONFIG_TLS=m +# CONFIG_TLS_DEVICE is not set +CONFIG_XFRM=y +CONFIG_XFRM_OFFLOAD=y +CONFIG_XFRM_ALGO=m +CONFIG_XFRM_USER=m +# CONFIG_XFRM_INTERFACE is not set +CONFIG_XFRM_SUB_POLICY=y +CONFIG_XFRM_MIGRATE=y +CONFIG_XFRM_STATISTICS=y +CONFIG_XFRM_IPCOMP=m +CONFIG_NET_KEY=m +CONFIG_NET_KEY_MIGRATE=y +CONFIG_SMC=m +CONFIG_SMC_DIAG=m +CONFIG_XDP_SOCKETS=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_FIB_TRIE_STATS=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_IP_ROUTE_CLASSID=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y +CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE_DEMUX=m +CONFIG_NET_IP_TUNNEL=m +CONFIG_NET_IPGRE=m +CONFIG_NET_IPGRE_BROADCAST=y +CONFIG_IP_MROUTE_COMMON=y +CONFIG_IP_MROUTE=y +CONFIG_IP_MROUTE_MULTIPLE_TABLES=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_NET_IPVTI=m +CONFIG_NET_UDP_TUNNEL=m +CONFIG_NET_FOU=m +CONFIG_NET_FOU_IP_TUNNELS=y +CONFIG_INET_AH=m +CONFIG_INET_ESP=m +CONFIG_INET_ESP_OFFLOAD=m +CONFIG_INET_IPCOMP=m +CONFIG_INET_XFRM_TUNNEL=m +CONFIG_INET_TUNNEL=m +CONFIG_INET_XFRM_MODE_TRANSPORT=m +CONFIG_INET_XFRM_MODE_TUNNEL=m +CONFIG_INET_XFRM_MODE_BEET=m +CONFIG_INET_DIAG=m +CONFIG_INET_TCP_DIAG=m +CONFIG_INET_UDP_DIAG=m +CONFIG_INET_RAW_DIAG=m 
+CONFIG_INET_DIAG_DESTROY=y +CONFIG_TCP_CONG_ADVANCED=y +CONFIG_TCP_CONG_BIC=m +CONFIG_TCP_CONG_CUBIC=m +CONFIG_TCP_CONG_WESTWOOD=m +CONFIG_TCP_CONG_HTCP=m +CONFIG_TCP_CONG_HSTCP=m +CONFIG_TCP_CONG_HYBLA=m +CONFIG_TCP_CONG_VEGAS=m +CONFIG_TCP_CONG_NV=m +CONFIG_TCP_CONG_SCALABLE=m +CONFIG_TCP_CONG_LP=m +CONFIG_TCP_CONG_VENO=m +CONFIG_TCP_CONG_YEAH=m +CONFIG_TCP_CONG_ILLINOIS=m +CONFIG_TCP_CONG_DCTCP=m +# CONFIG_TCP_CONG_CDG is not set +CONFIG_TCP_CONG_BBR=m +CONFIG_DEFAULT_RENO=y +CONFIG_DEFAULT_TCP_CONG="reno" +CONFIG_TCP_MD5SIG=y +CONFIG_IPV6=m +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=m +CONFIG_INET6_ESP=m +CONFIG_INET6_ESP_OFFLOAD=m +CONFIG_INET6_IPCOMP=m +CONFIG_IPV6_MIP6=m +CONFIG_IPV6_ILA=m +CONFIG_INET6_XFRM_TUNNEL=m +CONFIG_INET6_TUNNEL=m +CONFIG_INET6_XFRM_MODE_TRANSPORT=m +CONFIG_INET6_XFRM_MODE_TUNNEL=m +CONFIG_INET6_XFRM_MODE_BEET=m +CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m +CONFIG_IPV6_VTI=m +CONFIG_IPV6_SIT=m +CONFIG_IPV6_SIT_6RD=y +CONFIG_IPV6_NDISC_NODETYPE=y +CONFIG_IPV6_TUNNEL=m +CONFIG_IPV6_GRE=m +CONFIG_IPV6_FOU=m +CONFIG_IPV6_FOU_TUNNEL=m +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_SUBTREES=y +CONFIG_IPV6_MROUTE=y +CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=y +CONFIG_IPV6_PIMSM_V2=y +CONFIG_IPV6_SEG6_LWTUNNEL=y +CONFIG_IPV6_SEG6_HMAC=y +# CONFIG_NETLABEL is not set +CONFIG_NETWORK_SECMARK=y +CONFIG_NET_PTP_CLASSIFY=y +CONFIG_NETWORK_PHY_TIMESTAMPING=y +CONFIG_NETFILTER=y +CONFIG_NETFILTER_ADVANCED=y +CONFIG_BRIDGE_NETFILTER=m + +# +# Core Netfilter Configuration +# +CONFIG_NETFILTER_INGRESS=y +CONFIG_NETFILTER_NETLINK=m +CONFIG_NETFILTER_FAMILY_BRIDGE=y +CONFIG_NETFILTER_FAMILY_ARP=y +CONFIG_NETFILTER_NETLINK_ACCT=m +CONFIG_NETFILTER_NETLINK_QUEUE=m +CONFIG_NETFILTER_NETLINK_LOG=m +CONFIG_NETFILTER_NETLINK_OSF=m +CONFIG_NF_CONNTRACK=m +CONFIG_NF_LOG_COMMON=m +CONFIG_NF_LOG_NETDEV=m +CONFIG_NETFILTER_CONNCOUNT=m +CONFIG_NF_CONNTRACK_MARK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_ZONES=y +CONFIG_NF_CONNTRACK_PROCFS=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_TIMEOUT=y +CONFIG_NF_CONNTRACK_TIMESTAMP=y +CONFIG_NF_CONNTRACK_LABELS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_GRE=m +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=m +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_H323=m +CONFIG_NF_CONNTRACK_IRC=m +CONFIG_NF_CONNTRACK_BROADCAST=m +CONFIG_NF_CONNTRACK_NETBIOS_NS=m +CONFIG_NF_CONNTRACK_SNMP=m +CONFIG_NF_CONNTRACK_PPTP=m +CONFIG_NF_CONNTRACK_SANE=m +CONFIG_NF_CONNTRACK_SIP=m +CONFIG_NF_CONNTRACK_TFTP=m +CONFIG_NF_CT_NETLINK=m +CONFIG_NF_CT_NETLINK_TIMEOUT=m +CONFIG_NF_CT_NETLINK_HELPER=m +CONFIG_NETFILTER_NETLINK_GLUE_CT=y +CONFIG_NF_NAT=m +CONFIG_NF_NAT_NEEDED=y +CONFIG_NF_NAT_PROTO_DCCP=y +CONFIG_NF_NAT_PROTO_UDPLITE=y +CONFIG_NF_NAT_PROTO_SCTP=y +CONFIG_NF_NAT_AMANDA=m +CONFIG_NF_NAT_FTP=m +CONFIG_NF_NAT_IRC=m +CONFIG_NF_NAT_SIP=m +CONFIG_NF_NAT_TFTP=m +CONFIG_NF_NAT_REDIRECT=y +CONFIG_NETFILTER_SYNPROXY=m +CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_SET=m +CONFIG_NF_TABLES_INET=y +CONFIG_NF_TABLES_NETDEV=y +CONFIG_NFT_NUMGEN=m +CONFIG_NFT_CT=m +CONFIG_NFT_FLOW_OFFLOAD=m +CONFIG_NFT_COUNTER=m +CONFIG_NFT_CONNLIMIT=m +CONFIG_NFT_LOG=m +CONFIG_NFT_LIMIT=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_REDIR=m +CONFIG_NFT_NAT=m +# CONFIG_NFT_TUNNEL is not set +CONFIG_NFT_OBJREF=m +CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m +CONFIG_NFT_REJECT=m +CONFIG_NFT_REJECT_INET=m +CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m +CONFIG_NFT_FIB=m +CONFIG_NFT_FIB_INET=m +CONFIG_NFT_SOCKET=m +# 
CONFIG_NFT_OSF is not set +# CONFIG_NFT_TPROXY is not set +CONFIG_NF_DUP_NETDEV=m +CONFIG_NFT_DUP_NETDEV=m +CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m +CONFIG_NF_FLOW_TABLE_INET=m +CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES=m + +# +# Xtables combined modules +# +CONFIG_NETFILTER_XT_MARK=m +CONFIG_NETFILTER_XT_CONNMARK=m +CONFIG_NETFILTER_XT_SET=m + +# +# Xtables targets +# +CONFIG_NETFILTER_XT_TARGET_AUDIT=m +CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m +CONFIG_NETFILTER_XT_TARGET_CONNMARK=m +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m +CONFIG_NETFILTER_XT_TARGET_CT=m +CONFIG_NETFILTER_XT_TARGET_DSCP=m +CONFIG_NETFILTER_XT_TARGET_HL=m +CONFIG_NETFILTER_XT_TARGET_HMARK=m +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m +CONFIG_NETFILTER_XT_TARGET_LED=m +CONFIG_NETFILTER_XT_TARGET_LOG=m +CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_NAT=m +CONFIG_NETFILTER_XT_TARGET_NETMAP=m +CONFIG_NETFILTER_XT_TARGET_NFLOG=m +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_NOTRACK=m +CONFIG_NETFILTER_XT_TARGET_RATEEST=m +CONFIG_NETFILTER_XT_TARGET_REDIRECT=m +CONFIG_NETFILTER_XT_TARGET_TEE=m +CONFIG_NETFILTER_XT_TARGET_TPROXY=m +CONFIG_NETFILTER_XT_TARGET_TRACE=m +CONFIG_NETFILTER_XT_TARGET_SECMARK=m +CONFIG_NETFILTER_XT_TARGET_TCPMSS=m +CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m + +# +# Xtables matches +# +CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m +CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_CGROUP=m +CONFIG_NETFILTER_XT_MATCH_CLUSTER=m +CONFIG_NETFILTER_XT_MATCH_COMMENT=m +CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m +CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m +CONFIG_NETFILTER_XT_MATCH_CONNMARK=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_CPU=m +CONFIG_NETFILTER_XT_MATCH_DCCP=m +CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m +CONFIG_NETFILTER_XT_MATCH_DSCP=m +CONFIG_NETFILTER_XT_MATCH_ECN=m +CONFIG_NETFILTER_XT_MATCH_ESP=m +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m +CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_HL=m +CONFIG_NETFILTER_XT_MATCH_IPCOMP=m +CONFIG_NETFILTER_XT_MATCH_IPRANGE=m +CONFIG_NETFILTER_XT_MATCH_IPVS=m +CONFIG_NETFILTER_XT_MATCH_L2TP=m +CONFIG_NETFILTER_XT_MATCH_LENGTH=m +CONFIG_NETFILTER_XT_MATCH_LIMIT=m +CONFIG_NETFILTER_XT_MATCH_MAC=m +CONFIG_NETFILTER_XT_MATCH_MARK=m +CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m +CONFIG_NETFILTER_XT_MATCH_NFACCT=m +CONFIG_NETFILTER_XT_MATCH_OSF=m +CONFIG_NETFILTER_XT_MATCH_OWNER=m +CONFIG_NETFILTER_XT_MATCH_POLICY=m +CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m +CONFIG_NETFILTER_XT_MATCH_QUOTA=m +CONFIG_NETFILTER_XT_MATCH_RATEEST=m +CONFIG_NETFILTER_XT_MATCH_REALM=m +CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_SCTP=m +CONFIG_NETFILTER_XT_MATCH_SOCKET=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STATISTIC=m +CONFIG_NETFILTER_XT_MATCH_STRING=m +CONFIG_NETFILTER_XT_MATCH_TCPMSS=m +CONFIG_NETFILTER_XT_MATCH_TIME=m +CONFIG_NETFILTER_XT_MATCH_U32=m +CONFIG_IP_SET=m +CONFIG_IP_SET_MAX=256 +CONFIG_IP_SET_BITMAP_IP=m +CONFIG_IP_SET_BITMAP_IPMAC=m +CONFIG_IP_SET_BITMAP_PORT=m +CONFIG_IP_SET_HASH_IP=m +CONFIG_IP_SET_HASH_IPMARK=m +CONFIG_IP_SET_HASH_IPPORT=m +CONFIG_IP_SET_HASH_IPPORTIP=m +CONFIG_IP_SET_HASH_IPPORTNET=m +CONFIG_IP_SET_HASH_IPMAC=m +CONFIG_IP_SET_HASH_MAC=m +CONFIG_IP_SET_HASH_NETPORTNET=m +CONFIG_IP_SET_HASH_NET=m +CONFIG_IP_SET_HASH_NETNET=m +CONFIG_IP_SET_HASH_NETPORT=m +CONFIG_IP_SET_HASH_NETIFACE=m +CONFIG_IP_SET_LIST_SET=m +CONFIG_IP_VS=m +CONFIG_IP_VS_IPV6=y +# 
CONFIG_IP_VS_DEBUG is not set +CONFIG_IP_VS_TAB_BITS=12 + +# +# IPVS transport protocol load balancing support +# +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_PROTO_UDP=y +CONFIG_IP_VS_PROTO_AH_ESP=y +CONFIG_IP_VS_PROTO_ESP=y +CONFIG_IP_VS_PROTO_AH=y +CONFIG_IP_VS_PROTO_SCTP=y + +# +# IPVS scheduler +# +CONFIG_IP_VS_RR=m +CONFIG_IP_VS_WRR=m +CONFIG_IP_VS_LC=m +CONFIG_IP_VS_WLC=m +CONFIG_IP_VS_FO=m +CONFIG_IP_VS_OVF=m +CONFIG_IP_VS_LBLC=m +CONFIG_IP_VS_LBLCR=m +CONFIG_IP_VS_DH=m +CONFIG_IP_VS_SH=m +CONFIG_IP_VS_MH=m +CONFIG_IP_VS_SED=m +CONFIG_IP_VS_NQ=m + +# +# IPVS SH scheduler +# +CONFIG_IP_VS_SH_TAB_BITS=8 + +# +# IPVS MH scheduler +# +CONFIG_IP_VS_MH_TAB_INDEX=12 + +# +# IPVS application helper +# +CONFIG_IP_VS_FTP=m +CONFIG_IP_VS_NFCT=y +CONFIG_IP_VS_PE_SIP=m + +# +# IP: Netfilter Configuration +# +CONFIG_NF_DEFRAG_IPV4=m +CONFIG_NF_SOCKET_IPV4=m +CONFIG_NF_TPROXY_IPV4=m +CONFIG_NF_TABLES_IPV4=y +CONFIG_NFT_CHAIN_ROUTE_IPV4=m +CONFIG_NFT_REJECT_IPV4=m +CONFIG_NFT_DUP_IPV4=m +CONFIG_NFT_FIB_IPV4=m +CONFIG_NF_TABLES_ARP=y +CONFIG_NF_FLOW_TABLE_IPV4=m +CONFIG_NF_DUP_IPV4=m +CONFIG_NF_LOG_ARP=m +CONFIG_NF_LOG_IPV4=m +CONFIG_NF_REJECT_IPV4=m +CONFIG_NF_NAT_IPV4=m +CONFIG_NF_NAT_MASQUERADE_IPV4=y +CONFIG_NFT_CHAIN_NAT_IPV4=m +CONFIG_NFT_MASQ_IPV4=m +CONFIG_NFT_REDIR_IPV4=m +CONFIG_NF_NAT_SNMP_BASIC=m +CONFIG_NF_NAT_PROTO_GRE=m +CONFIG_NF_NAT_PPTP=m +CONFIG_NF_NAT_H323=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_AH=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_RPFILTER=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_TARGET_SYNPROXY=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_TARGET_NETMAP=m +CONFIG_IP_NF_TARGET_REDIRECT=m +CONFIG_IP_NF_MANGLE=m +CONFIG_IP_NF_TARGET_CLUSTERIP=m +CONFIG_IP_NF_TARGET_ECN=m +CONFIG_IP_NF_TARGET_TTL=m +CONFIG_IP_NF_RAW=m +# CONFIG_IP_NF_SECURITY is not set +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m +CONFIG_IP_NF_ARP_MANGLE=m + +# +# IPv6: Netfilter Configuration +# +CONFIG_NF_SOCKET_IPV6=m +CONFIG_NF_TPROXY_IPV6=m +CONFIG_NF_TABLES_IPV6=y +CONFIG_NFT_CHAIN_ROUTE_IPV6=m +CONFIG_NFT_CHAIN_NAT_IPV6=m +CONFIG_NFT_MASQ_IPV6=m +CONFIG_NFT_REDIR_IPV6=m +CONFIG_NFT_REJECT_IPV6=m +CONFIG_NFT_DUP_IPV6=m +CONFIG_NFT_FIB_IPV6=m +CONFIG_NF_FLOW_TABLE_IPV6=m +CONFIG_NF_DUP_IPV6=m +CONFIG_NF_REJECT_IPV6=m +CONFIG_NF_LOG_IPV6=m +CONFIG_NF_NAT_IPV6=m +CONFIG_NF_NAT_MASQUERADE_IPV6=y +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_AH=m +CONFIG_IP6_NF_MATCH_EUI64=m +CONFIG_IP6_NF_MATCH_FRAG=m +CONFIG_IP6_NF_MATCH_OPTS=m +CONFIG_IP6_NF_MATCH_HL=m +CONFIG_IP6_NF_MATCH_IPV6HEADER=m +CONFIG_IP6_NF_MATCH_MH=m +CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IP6_NF_MATCH_RT=m +CONFIG_IP6_NF_MATCH_SRH=m +CONFIG_IP6_NF_TARGET_HL=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_REJECT=m +CONFIG_IP6_NF_TARGET_SYNPROXY=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_RAW=m +# CONFIG_IP6_NF_SECURITY is not set +CONFIG_IP6_NF_NAT=m +CONFIG_IP6_NF_TARGET_MASQUERADE=m +CONFIG_IP6_NF_TARGET_NPT=m +CONFIG_NF_DEFRAG_IPV6=m + +# +# DECnet: Netfilter Configuration +# +CONFIG_DECNET_NF_GRABULATOR=m +CONFIG_NF_TABLES_BRIDGE=y +CONFIG_NFT_BRIDGE_REJECT=m +CONFIG_NF_LOG_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES=m +CONFIG_BRIDGE_EBT_BROUTE=m +CONFIG_BRIDGE_EBT_T_FILTER=m +CONFIG_BRIDGE_EBT_T_NAT=m +CONFIG_BRIDGE_EBT_802_3=m +CONFIG_BRIDGE_EBT_AMONG=m +CONFIG_BRIDGE_EBT_ARP=m +CONFIG_BRIDGE_EBT_IP=m +CONFIG_BRIDGE_EBT_IP6=m +CONFIG_BRIDGE_EBT_LIMIT=m +CONFIG_BRIDGE_EBT_MARK=m +CONFIG_BRIDGE_EBT_PKTTYPE=m +CONFIG_BRIDGE_EBT_STP=m 
+CONFIG_BRIDGE_EBT_VLAN=m +CONFIG_BRIDGE_EBT_ARPREPLY=m +CONFIG_BRIDGE_EBT_DNAT=m +CONFIG_BRIDGE_EBT_MARK_T=m +CONFIG_BRIDGE_EBT_REDIRECT=m +CONFIG_BRIDGE_EBT_SNAT=m +CONFIG_BRIDGE_EBT_LOG=m +CONFIG_BRIDGE_EBT_NFLOG=m +CONFIG_BPFILTER=y +CONFIG_BPFILTER_UMH=m +CONFIG_IP_DCCP=m +CONFIG_INET_DCCP_DIAG=m + +# +# DCCP CCIDs Configuration +# +# CONFIG_IP_DCCP_CCID2_DEBUG is not set +CONFIG_IP_DCCP_CCID3=y +# CONFIG_IP_DCCP_CCID3_DEBUG is not set +CONFIG_IP_DCCP_TFRC_LIB=y + +# +# DCCP Kernel Hacking +# +# CONFIG_IP_DCCP_DEBUG is not set +CONFIG_IP_SCTP=m +# CONFIG_SCTP_DBG_OBJCNT is not set +CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5=y +# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1 is not set +# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE is not set +CONFIG_SCTP_COOKIE_HMAC_MD5=y +CONFIG_SCTP_COOKIE_HMAC_SHA1=y +CONFIG_INET_SCTP_DIAG=m +CONFIG_RDS=m +CONFIG_RDS_RDMA=m +CONFIG_RDS_TCP=m +# CONFIG_RDS_DEBUG is not set +CONFIG_TIPC=m +CONFIG_TIPC_MEDIA_IB=y +CONFIG_TIPC_MEDIA_UDP=y +CONFIG_TIPC_DIAG=m +CONFIG_ATM=m +CONFIG_ATM_CLIP=m +# CONFIG_ATM_CLIP_NO_ICMP is not set +CONFIG_ATM_LANE=m +CONFIG_ATM_MPOA=m +CONFIG_ATM_BR2684=m +# CONFIG_ATM_BR2684_IPFILTER is not set +CONFIG_L2TP=m +# CONFIG_L2TP_DEBUGFS is not set +CONFIG_L2TP_V3=y +CONFIG_L2TP_IP=m +CONFIG_L2TP_ETH=m +CONFIG_STP=m +CONFIG_GARP=m +CONFIG_MRP=m +CONFIG_BRIDGE=m +CONFIG_BRIDGE_IGMP_SNOOPING=y +CONFIG_BRIDGE_VLAN_FILTERING=y +CONFIG_HAVE_NET_DSA=y +CONFIG_NET_DSA=m +CONFIG_NET_DSA_LEGACY=y +CONFIG_NET_DSA_TAG_BRCM=y +CONFIG_NET_DSA_TAG_BRCM_PREPEND=y +CONFIG_NET_DSA_TAG_DSA=y +CONFIG_NET_DSA_TAG_EDSA=y +CONFIG_NET_DSA_TAG_KSZ=y +CONFIG_NET_DSA_TAG_LAN9303=y +CONFIG_NET_DSA_TAG_MTK=y +CONFIG_NET_DSA_TAG_TRAILER=y +CONFIG_NET_DSA_TAG_QCA=y +CONFIG_VLAN_8021Q=m +CONFIG_VLAN_8021Q_GVRP=y +CONFIG_VLAN_8021Q_MVRP=y +CONFIG_DECNET=m +CONFIG_DECNET_ROUTER=y +CONFIG_LLC=m +CONFIG_LLC2=m +CONFIG_ATALK=m +CONFIG_DEV_APPLETALK=m +CONFIG_IPDDP=m +CONFIG_IPDDP_ENCAP=y +CONFIG_X25=m +CONFIG_LAPB=m +CONFIG_PHONET=m +CONFIG_6LOWPAN=m +# CONFIG_6LOWPAN_DEBUGFS is not set +CONFIG_6LOWPAN_NHC=m +CONFIG_6LOWPAN_NHC_DEST=m +CONFIG_6LOWPAN_NHC_FRAGMENT=m +CONFIG_6LOWPAN_NHC_HOP=m +CONFIG_6LOWPAN_NHC_IPV6=m +CONFIG_6LOWPAN_NHC_MOBILITY=m +CONFIG_6LOWPAN_NHC_ROUTING=m +CONFIG_6LOWPAN_NHC_UDP=m +CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m +CONFIG_6LOWPAN_GHC_UDP=m +CONFIG_6LOWPAN_GHC_ICMPV6=m +CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m +CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m +CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m +CONFIG_IEEE802154=m +CONFIG_IEEE802154_NL802154_EXPERIMENTAL=y +CONFIG_IEEE802154_SOCKET=m +CONFIG_IEEE802154_6LOWPAN=m +CONFIG_MAC802154=m +CONFIG_NET_SCHED=y + +# +# Queueing/Scheduling +# +CONFIG_NET_SCH_CBQ=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_HFSC=m +CONFIG_NET_SCH_ATM=m +CONFIG_NET_SCH_PRIO=m +CONFIG_NET_SCH_MULTIQ=m +CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFB=m +CONFIG_NET_SCH_SFQ=m +CONFIG_NET_SCH_TEQL=m +CONFIG_NET_SCH_TBF=m +CONFIG_NET_SCH_CBS=m +# CONFIG_NET_SCH_ETF is not set +CONFIG_NET_SCH_GRED=m +CONFIG_NET_SCH_DSMARK=m +CONFIG_NET_SCH_NETEM=m +CONFIG_NET_SCH_DRR=m +CONFIG_NET_SCH_MQPRIO=m +# CONFIG_NET_SCH_SKBPRIO is not set +CONFIG_NET_SCH_CHOKE=m +CONFIG_NET_SCH_QFQ=m +CONFIG_NET_SCH_CODEL=m +CONFIG_NET_SCH_FQ_CODEL=m +# CONFIG_NET_SCH_CAKE is not set +CONFIG_NET_SCH_FQ=m +CONFIG_NET_SCH_HHF=m +CONFIG_NET_SCH_PIE=m +CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_SCH_PLUG=m +# CONFIG_NET_SCH_DEFAULT is not set + +# +# Classification +# +CONFIG_NET_CLS=y +CONFIG_NET_CLS_BASIC=m +CONFIG_NET_CLS_TCINDEX=m +CONFIG_NET_CLS_ROUTE4=m +CONFIG_NET_CLS_FW=m +CONFIG_NET_CLS_U32=m 
+CONFIG_CLS_U32_PERF=y +CONFIG_CLS_U32_MARK=y +CONFIG_NET_CLS_RSVP=m +CONFIG_NET_CLS_RSVP6=m +CONFIG_NET_CLS_FLOW=m +CONFIG_NET_CLS_CGROUP=m +CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_FLOWER=m +CONFIG_NET_CLS_MATCHALL=m +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_STACK=32 +CONFIG_NET_EMATCH_CMP=m +CONFIG_NET_EMATCH_NBYTE=m +CONFIG_NET_EMATCH_U32=m +CONFIG_NET_EMATCH_META=m +CONFIG_NET_EMATCH_TEXT=m +CONFIG_NET_EMATCH_CANID=m +CONFIG_NET_EMATCH_IPSET=m +CONFIG_NET_EMATCH_IPT=m +CONFIG_NET_CLS_ACT=y +CONFIG_NET_ACT_POLICE=m +CONFIG_NET_ACT_GACT=m +CONFIG_GACT_PROB=y +CONFIG_NET_ACT_MIRRED=m +CONFIG_NET_ACT_SAMPLE=m +CONFIG_NET_ACT_IPT=m +CONFIG_NET_ACT_NAT=m +CONFIG_NET_ACT_PEDIT=m +# CONFIG_NET_ACT_SIMP is not set +CONFIG_NET_ACT_SKBEDIT=m +CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_VLAN=m +CONFIG_NET_ACT_BPF=m +CONFIG_NET_ACT_CONNMARK=m +CONFIG_NET_ACT_SKBMOD=m +CONFIG_NET_ACT_IFE=m +CONFIG_NET_ACT_TUNNEL_KEY=m +CONFIG_NET_IFE_SKBMARK=m +CONFIG_NET_IFE_SKBPRIO=m +CONFIG_NET_IFE_SKBTCINDEX=m +CONFIG_NET_CLS_IND=y +CONFIG_NET_SCH_FIFO=y +CONFIG_DCB=y +CONFIG_DNS_RESOLVER=y +CONFIG_BATMAN_ADV=m +# CONFIG_BATMAN_ADV_BATMAN_V is not set +CONFIG_BATMAN_ADV_BLA=y +CONFIG_BATMAN_ADV_DAT=y +CONFIG_BATMAN_ADV_NC=y +CONFIG_BATMAN_ADV_MCAST=y +CONFIG_BATMAN_ADV_DEBUGFS=y +# CONFIG_BATMAN_ADV_DEBUG is not set +CONFIG_OPENVSWITCH=m +CONFIG_OPENVSWITCH_GRE=m +CONFIG_OPENVSWITCH_VXLAN=m +CONFIG_OPENVSWITCH_GENEVE=m +CONFIG_VSOCKETS=m +CONFIG_VSOCKETS_DIAG=m +CONFIG_VMWARE_VMCI_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS_COMMON=m +CONFIG_HYPERV_VSOCKETS=m +CONFIG_NETLINK_DIAG=m +CONFIG_MPLS=y +CONFIG_NET_MPLS_GSO=m +CONFIG_MPLS_ROUTING=m +CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m +CONFIG_HSR=m +CONFIG_NET_SWITCHDEV=y +CONFIG_NET_L3_MASTER_DEV=y +# CONFIG_NET_NCSI is not set +CONFIG_RPS=y +CONFIG_RFS_ACCEL=y +CONFIG_XPS=y +CONFIG_CGROUP_NET_PRIO=y +CONFIG_CGROUP_NET_CLASSID=y +CONFIG_NET_RX_BUSY_POLL=y +CONFIG_BQL=y +CONFIG_BPF_JIT=y +CONFIG_BPF_STREAM_PARSER=y +CONFIG_NET_FLOW_LIMIT=y + +# +# Network testing +# +# CONFIG_NET_PKTGEN is not set +# CONFIG_NET_DROP_MONITOR is not set +CONFIG_HAMRADIO=y + +# +# Packet Radio protocols +# +CONFIG_AX25=m +CONFIG_AX25_DAMA_SLAVE=y +CONFIG_NETROM=m +CONFIG_ROSE=m + +# +# AX.25 network device drivers +# +CONFIG_MKISS=m +CONFIG_6PACK=m +CONFIG_BPQETHER=m +CONFIG_BAYCOM_SER_FDX=m +CONFIG_BAYCOM_SER_HDX=m +CONFIG_BAYCOM_PAR=m +CONFIG_YAM=m +CONFIG_CAN=m +CONFIG_CAN_RAW=m +CONFIG_CAN_BCM=m +CONFIG_CAN_GW=m + +# +# CAN Device Drivers +# +CONFIG_CAN_VCAN=m +CONFIG_CAN_VXCAN=m +CONFIG_CAN_SLCAN=m +CONFIG_CAN_DEV=m +CONFIG_CAN_CALC_BITTIMING=y +CONFIG_CAN_JANZ_ICAN3=m +CONFIG_CAN_C_CAN=m +CONFIG_CAN_C_CAN_PLATFORM=m +CONFIG_CAN_C_CAN_PCI=m +CONFIG_CAN_CC770=m +CONFIG_CAN_CC770_ISA=m +CONFIG_CAN_CC770_PLATFORM=m +CONFIG_CAN_IFI_CANFD=m +CONFIG_CAN_M_CAN=m +CONFIG_CAN_PEAK_PCIEFD=m +CONFIG_CAN_SJA1000=m +CONFIG_CAN_SJA1000_ISA=m +CONFIG_CAN_SJA1000_PLATFORM=m +CONFIG_CAN_EMS_PCMCIA=m +CONFIG_CAN_EMS_PCI=m +CONFIG_CAN_PEAK_PCMCIA=m +CONFIG_CAN_PEAK_PCI=m +CONFIG_CAN_PEAK_PCIEC=y +CONFIG_CAN_KVASER_PCI=m +CONFIG_CAN_PLX_PCI=m +CONFIG_CAN_SOFTING=m +CONFIG_CAN_SOFTING_CS=m + +# +# CAN SPI interfaces +# +CONFIG_CAN_HI311X=m +CONFIG_CAN_MCP251X=m + +# +# CAN USB interfaces +# +CONFIG_CAN_8DEV_USB=m +CONFIG_CAN_EMS_USB=m +CONFIG_CAN_ESD_USB2=m +CONFIG_CAN_GS_USB=m +CONFIG_CAN_KVASER_USB=m +CONFIG_CAN_MCBA_USB=m +CONFIG_CAN_PEAK_USB=m +# CONFIG_CAN_UCAN is not set +# CONFIG_CAN_DEBUG_DEVICES is not set +CONFIG_BT=m +CONFIG_BT_BREDR=y +CONFIG_BT_RFCOMM=m +CONFIG_BT_RFCOMM_TTY=y 
+CONFIG_BT_BNEP=m +CONFIG_BT_BNEP_MC_FILTER=y +CONFIG_BT_BNEP_PROTO_FILTER=y +CONFIG_BT_CMTP=m +CONFIG_BT_HIDP=m +CONFIG_BT_HS=y +CONFIG_BT_LE=y +CONFIG_BT_6LOWPAN=m +CONFIG_BT_LEDS=y +# CONFIG_BT_SELFTEST is not set +CONFIG_BT_DEBUGFS=y + +# +# Bluetooth device drivers +# +CONFIG_BT_INTEL=m +CONFIG_BT_BCM=m +CONFIG_BT_RTL=m +CONFIG_BT_QCA=m +CONFIG_BT_HCIBTUSB=m +# CONFIG_BT_HCIBTUSB_AUTOSUSPEND is not set +CONFIG_BT_HCIBTUSB_BCM=y +CONFIG_BT_HCIBTUSB_RTL=y +CONFIG_BT_HCIBTSDIO=m +CONFIG_BT_HCIUART=m +CONFIG_BT_HCIUART_SERDEV=y +CONFIG_BT_HCIUART_H4=y +CONFIG_BT_HCIUART_NOKIA=m +CONFIG_BT_HCIUART_BCSP=y +CONFIG_BT_HCIUART_ATH3K=y +CONFIG_BT_HCIUART_LL=y +CONFIG_BT_HCIUART_3WIRE=y +CONFIG_BT_HCIUART_INTEL=y +# CONFIG_BT_HCIUART_RTL is not set +CONFIG_BT_HCIUART_QCA=y +CONFIG_BT_HCIUART_AG6XX=y +CONFIG_BT_HCIUART_MRVL=y +CONFIG_BT_HCIBCM203X=m +CONFIG_BT_HCIBPA10X=m +CONFIG_BT_HCIBFUSB=m +CONFIG_BT_HCIDTL1=m +CONFIG_BT_HCIBT3C=m +CONFIG_BT_HCIBLUECARD=m +CONFIG_BT_HCIVHCI=m +CONFIG_BT_MRVL=m +CONFIG_BT_MRVL_SDIO=m +CONFIG_BT_ATH3K=m +CONFIG_BT_WILINK=m +# CONFIG_BT_MTKUART is not set +CONFIG_BT_HCIRSI=m +CONFIG_AF_RXRPC=m +CONFIG_AF_RXRPC_IPV6=y +# CONFIG_AF_RXRPC_INJECT_LOSS is not set +# CONFIG_AF_RXRPC_DEBUG is not set +# CONFIG_RXKAD is not set +CONFIG_AF_KCM=m +CONFIG_STREAM_PARSER=y +CONFIG_FIB_RULES=y +CONFIG_WIRELESS=y +CONFIG_WIRELESS_EXT=y +CONFIG_WEXT_CORE=y +CONFIG_WEXT_PROC=y +CONFIG_WEXT_SPY=y +CONFIG_WEXT_PRIV=y +CONFIG_CFG80211=m +CONFIG_NL80211_TESTMODE=y +# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set +CONFIG_CFG80211_REQUIRE_SIGNED_REGDB=y +CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS=y +CONFIG_CFG80211_DEFAULT_PS=y +# CONFIG_CFG80211_DEBUGFS is not set +CONFIG_CFG80211_CRDA_SUPPORT=y +CONFIG_CFG80211_WEXT=y +CONFIG_CFG80211_WEXT_EXPORT=y +CONFIG_LIB80211=m +CONFIG_LIB80211_CRYPT_WEP=m +CONFIG_LIB80211_CRYPT_CCMP=m +CONFIG_LIB80211_CRYPT_TKIP=m +# CONFIG_LIB80211_DEBUG is not set +CONFIG_MAC80211=m +CONFIG_MAC80211_HAS_RC=y +CONFIG_MAC80211_RC_MINSTREL=y +CONFIG_MAC80211_RC_MINSTREL_HT=y +CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y +CONFIG_MAC80211_RC_DEFAULT="minstrel_ht" +CONFIG_MAC80211_MESH=y +CONFIG_MAC80211_LEDS=y +# CONFIG_MAC80211_DEBUGFS is not set +# CONFIG_MAC80211_MESSAGE_TRACING is not set +# CONFIG_MAC80211_DEBUG_MENU is not set +CONFIG_MAC80211_STA_HASH_MAX_SIZE=0 +CONFIG_WIMAX=m +CONFIG_WIMAX_DEBUG_LEVEL=8 +CONFIG_RFKILL=m +CONFIG_RFKILL_LEDS=y +CONFIG_RFKILL_INPUT=y +CONFIG_RFKILL_GPIO=m +CONFIG_NET_9P=m +CONFIG_NET_9P_VIRTIO=m +CONFIG_NET_9P_RDMA=m +# CONFIG_NET_9P_DEBUG is not set +CONFIG_CAIF=m +# CONFIG_CAIF_DEBUG is not set +CONFIG_CAIF_NETDEV=m +CONFIG_CAIF_USB=m +CONFIG_CEPH_LIB=m +# CONFIG_CEPH_LIB_PRETTYDEBUG is not set +CONFIG_CEPH_LIB_USE_DNS_RESOLVER=y +CONFIG_NFC=m +CONFIG_NFC_DIGITAL=m +CONFIG_NFC_NCI=m +CONFIG_NFC_NCI_SPI=m +CONFIG_NFC_NCI_UART=m +CONFIG_NFC_HCI=m +CONFIG_NFC_SHDLC=y + +# +# Near Field Communication (NFC) devices +# +CONFIG_NFC_TRF7970A=m +CONFIG_NFC_MEI_PHY=m +CONFIG_NFC_SIM=m +CONFIG_NFC_PORT100=m +CONFIG_NFC_FDP=m +CONFIG_NFC_FDP_I2C=m +CONFIG_NFC_PN544=m +CONFIG_NFC_PN544_I2C=m +CONFIG_NFC_PN544_MEI=m +CONFIG_NFC_PN533=m +CONFIG_NFC_PN533_USB=m +CONFIG_NFC_PN533_I2C=m +CONFIG_NFC_MICROREAD=m +CONFIG_NFC_MICROREAD_I2C=m +CONFIG_NFC_MICROREAD_MEI=m +CONFIG_NFC_MRVL=m +CONFIG_NFC_MRVL_USB=m +CONFIG_NFC_MRVL_UART=m +CONFIG_NFC_MRVL_I2C=m +CONFIG_NFC_MRVL_SPI=m +CONFIG_NFC_ST21NFCA=m +CONFIG_NFC_ST21NFCA_I2C=m +CONFIG_NFC_ST_NCI=m +CONFIG_NFC_ST_NCI_I2C=m +CONFIG_NFC_ST_NCI_SPI=m +CONFIG_NFC_NXP_NCI=m +CONFIG_NFC_NXP_NCI_I2C=m 
+CONFIG_NFC_S3FWRN5=m +CONFIG_NFC_S3FWRN5_I2C=m +CONFIG_NFC_ST95HF=m +CONFIG_PSAMPLE=m +CONFIG_NET_IFE=m +CONFIG_LWTUNNEL=y +CONFIG_LWTUNNEL_BPF=y +CONFIG_DST_CACHE=y +CONFIG_GRO_CELLS=y +CONFIG_NET_DEVLINK=m +CONFIG_MAY_USE_DEVLINK=m +CONFIG_FAILOVER=m +CONFIG_HAVE_EBPF_JIT=y + +# +# Device Drivers +# + +# +# Generic Driver Options +# +# CONFIG_UEVENT_HELPER is not set +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_STANDALONE=y +CONFIG_PREVENT_FIRMWARE_BUILD=y + +# +# Firmware loader +# +CONFIG_FW_LOADER=y +CONFIG_EXTRA_FIRMWARE="" +CONFIG_FW_LOADER_USER_HELPER=y +# CONFIG_FW_LOADER_USER_HELPER_FALLBACK is not set +CONFIG_WANT_DEV_COREDUMP=y +CONFIG_ALLOW_DEV_COREDUMP=y +CONFIG_DEV_COREDUMP=y +# CONFIG_DEBUG_DRIVER is not set +# CONFIG_DEBUG_DEVRES is not set +# CONFIG_DEBUG_TEST_DRIVER_REMOVE is not set +CONFIG_TEST_ASYNC_DRIVER_PROBE=m +CONFIG_GENERIC_CPU_AUTOPROBE=y +CONFIG_GENERIC_CPU_VULNERABILITIES=y +CONFIG_REGMAP=y +CONFIG_REGMAP_I2C=m +CONFIG_REGMAP_SPI=y +CONFIG_REGMAP_SPMI=m +CONFIG_REGMAP_W1=m +CONFIG_REGMAP_MMIO=y +CONFIG_REGMAP_IRQ=y +CONFIG_REGMAP_SOUNDWIRE=m +CONFIG_DMA_SHARED_BUFFER=y +# CONFIG_DMA_FENCE_TRACE is not set +# CONFIG_DMA_CMA is not set + +# +# Bus devices +# +CONFIG_CONNECTOR=m +# CONFIG_GNSS is not set +CONFIG_MTD=m +CONFIG_MTD_TESTS=m +CONFIG_MTD_REDBOOT_PARTS=m +CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1 +CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED=y +CONFIG_MTD_REDBOOT_PARTS_READONLY=y +CONFIG_MTD_CMDLINE_PARTS=m +CONFIG_MTD_AR7_PARTS=m + +# +# Partition parsers +# + +# +# User Modules And Translation Layers +# +CONFIG_MTD_BLKDEVS=m +CONFIG_MTD_BLOCK=m +CONFIG_MTD_BLOCK_RO=m +CONFIG_FTL=m +CONFIG_NFTL=m +CONFIG_NFTL_RW=y +CONFIG_INFTL=m +CONFIG_RFD_FTL=m +CONFIG_SSFDC=m +CONFIG_SM_FTL=m +CONFIG_MTD_OOPS=m +CONFIG_MTD_SWAP=m +# CONFIG_MTD_PARTITIONED_MASTER is not set + +# +# RAM/ROM/Flash chip drivers +# +CONFIG_MTD_CFI=m +CONFIG_MTD_JEDECPROBE=m +CONFIG_MTD_GEN_PROBE=m +# CONFIG_MTD_CFI_ADV_OPTIONS is not set +CONFIG_MTD_MAP_BANK_WIDTH_1=y +CONFIG_MTD_MAP_BANK_WIDTH_2=y +CONFIG_MTD_MAP_BANK_WIDTH_4=y +CONFIG_MTD_CFI_I1=y +CONFIG_MTD_CFI_I2=y +CONFIG_MTD_CFI_INTELEXT=m +CONFIG_MTD_CFI_AMDSTD=m +CONFIG_MTD_CFI_STAA=m +CONFIG_MTD_CFI_UTIL=m +CONFIG_MTD_RAM=m +CONFIG_MTD_ROM=m +CONFIG_MTD_ABSENT=m + +# +# Mapping drivers for chip access +# +CONFIG_MTD_COMPLEX_MAPPINGS=y +CONFIG_MTD_PHYSMAP=m +# CONFIG_MTD_PHYSMAP_COMPAT is not set +CONFIG_MTD_SBC_GXX=m +CONFIG_MTD_AMD76XROM=m +CONFIG_MTD_ICHXROM=m +CONFIG_MTD_ESB2ROM=m +CONFIG_MTD_CK804XROM=m +CONFIG_MTD_SCB2_FLASH=m +CONFIG_MTD_NETtel=m +CONFIG_MTD_L440GX=m +CONFIG_MTD_PCI=m +CONFIG_MTD_PCMCIA=m +# CONFIG_MTD_PCMCIA_ANONYMOUS is not set +CONFIG_MTD_GPIO_ADDR=m +CONFIG_MTD_INTEL_VR_NOR=m +CONFIG_MTD_PLATRAM=m +CONFIG_MTD_LATCH_ADDR=m + +# +# Self-contained MTD device drivers +# +CONFIG_MTD_PMC551=m +CONFIG_MTD_PMC551_BUGFIX=y +# CONFIG_MTD_PMC551_DEBUG is not set +CONFIG_MTD_DATAFLASH=m +CONFIG_MTD_DATAFLASH_WRITE_VERIFY=y +CONFIG_MTD_DATAFLASH_OTP=y +CONFIG_MTD_M25P80=m +CONFIG_MTD_MCHP23K256=m +CONFIG_MTD_SST25L=m +CONFIG_MTD_SLRAM=m +CONFIG_MTD_PHRAM=m +CONFIG_MTD_MTDRAM=m +CONFIG_MTDRAM_TOTAL_SIZE=4096 +CONFIG_MTDRAM_ERASE_SIZE=128 +CONFIG_MTD_BLOCK2MTD=m + +# +# Disk-On-Chip Device Drivers +# +CONFIG_MTD_DOCG3=m +CONFIG_BCH_CONST_M=14 +CONFIG_BCH_CONST_T=4 +CONFIG_MTD_ONENAND=m +CONFIG_MTD_ONENAND_VERIFY_WRITE=y +CONFIG_MTD_ONENAND_GENERIC=m +CONFIG_MTD_ONENAND_OTP=y +CONFIG_MTD_ONENAND_2X_PROGRAM=y +CONFIG_MTD_NAND_ECC=m +CONFIG_MTD_NAND_ECC_SMC=y +CONFIG_MTD_NAND=m +CONFIG_MTD_NAND_BCH=m 
+CONFIG_MTD_NAND_ECC_BCH=y +CONFIG_MTD_SM_COMMON=m +CONFIG_MTD_NAND_DENALI=m +CONFIG_MTD_NAND_DENALI_PCI=m +CONFIG_MTD_NAND_GPIO=m +CONFIG_MTD_NAND_RICOH=m +CONFIG_MTD_NAND_DISKONCHIP=m +CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED=y +CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0x0 +CONFIG_MTD_NAND_DISKONCHIP_PROBE_HIGH=y +CONFIG_MTD_NAND_DISKONCHIP_BBTWRITE=y +CONFIG_MTD_NAND_DOCG4=m +CONFIG_MTD_NAND_CAFE=m +CONFIG_MTD_NAND_NANDSIM=m +CONFIG_MTD_NAND_PLATFORM=m +# CONFIG_MTD_SPI_NAND is not set + +# +# LPDDR & LPDDR2 PCM memory drivers +# +CONFIG_MTD_LPDDR=m +CONFIG_MTD_QINFO_PROBE=m +CONFIG_MTD_SPI_NOR=m +CONFIG_MTD_MT81xx_NOR=m +CONFIG_MTD_SPI_NOR_USE_4K_SECTORS=y +CONFIG_SPI_INTEL_SPI=m +CONFIG_SPI_INTEL_SPI_PCI=m +CONFIG_SPI_INTEL_SPI_PLATFORM=m +CONFIG_MTD_UBI=m +CONFIG_MTD_UBI_WL_THRESHOLD=4096 +CONFIG_MTD_UBI_BEB_LIMIT=20 +CONFIG_MTD_UBI_FASTMAP=y +# CONFIG_MTD_UBI_GLUEBI is not set +CONFIG_MTD_UBI_BLOCK=y +# CONFIG_OF is not set +CONFIG_ARCH_MIGHT_HAVE_PC_PARPORT=y +CONFIG_PARPORT=m +CONFIG_PARPORT_PC=m +CONFIG_PARPORT_SERIAL=m +CONFIG_PARPORT_PC_FIFO=y +CONFIG_PARPORT_PC_SUPERIO=y +CONFIG_PARPORT_PC_PCMCIA=m +CONFIG_PARPORT_AX88796=m +CONFIG_PARPORT_1284=y +CONFIG_PARPORT_NOT_PC=y +CONFIG_PNP=y +# CONFIG_PNP_DEBUG_MESSAGES is not set + +# +# Protocols +# +CONFIG_PNPACPI=y +CONFIG_BLK_DEV=y +# CONFIG_BLK_DEV_NULL_BLK is not set +CONFIG_BLK_DEV_FD=m +CONFIG_CDROM=m +CONFIG_PARIDE=m + +# +# Parallel IDE high-level drivers +# +CONFIG_PARIDE_PD=m +CONFIG_PARIDE_PCD=m +CONFIG_PARIDE_PF=m +CONFIG_PARIDE_PT=m +CONFIG_PARIDE_PG=m + +# +# Parallel IDE protocol modules +# +CONFIG_PARIDE_ATEN=m +CONFIG_PARIDE_BPCK=m +CONFIG_PARIDE_COMM=m +CONFIG_PARIDE_DSTR=m +CONFIG_PARIDE_FIT2=m +CONFIG_PARIDE_FIT3=m +CONFIG_PARIDE_EPAT=m +CONFIG_PARIDE_EPATC8=y +CONFIG_PARIDE_EPIA=m +CONFIG_PARIDE_FRIQ=m +CONFIG_PARIDE_FRPW=m +CONFIG_PARIDE_KBIC=m +CONFIG_PARIDE_KTTI=m +CONFIG_PARIDE_ON20=m +CONFIG_PARIDE_ON26=m +CONFIG_BLK_DEV_PCIESSD_MTIP32XX=m +CONFIG_ZRAM=m +CONFIG_ZRAM_WRITEBACK=y +# CONFIG_ZRAM_MEMORY_TRACKING is not set +CONFIG_BLK_DEV_DAC960=m +CONFIG_BLK_DEV_UMEM=m +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 +CONFIG_BLK_DEV_CRYPTOLOOP=m +CONFIG_BLK_DEV_DRBD=m +# CONFIG_DRBD_FAULT_INJECTION is not set +CONFIG_BLK_DEV_NBD=m +CONFIG_BLK_DEV_SKD=m +CONFIG_BLK_DEV_SX8=m +CONFIG_BLK_DEV_RAM=m +CONFIG_BLK_DEV_RAM_COUNT=16 +CONFIG_BLK_DEV_RAM_SIZE=16384 +CONFIG_CDROM_PKTCDVD=m +CONFIG_CDROM_PKTCDVD_BUFFERS=8 +CONFIG_CDROM_PKTCDVD_WCACHE=y +CONFIG_ATA_OVER_ETH=m +CONFIG_VIRTIO_BLK=m +# CONFIG_VIRTIO_BLK_SCSI is not set +CONFIG_BLK_DEV_RBD=m +CONFIG_BLK_DEV_RSXX=m + +# +# NVME Support +# +CONFIG_NVME_CORE=y +CONFIG_BLK_DEV_NVME=y +# CONFIG_NVME_MULTIPATH is not set +CONFIG_NVME_FABRICS=m +CONFIG_NVME_RDMA=m +CONFIG_NVME_FC=m +CONFIG_NVME_TARGET=m +CONFIG_NVME_TARGET_LOOP=m +CONFIG_NVME_TARGET_RDMA=m +CONFIG_NVME_TARGET_FC=m +CONFIG_NVME_TARGET_FCLOOP=m + +# +# Misc devices +# +CONFIG_SENSORS_LIS3LV02D=m +CONFIG_AD525X_DPOT=m +CONFIG_AD525X_DPOT_I2C=m +CONFIG_AD525X_DPOT_SPI=m +# CONFIG_DUMMY_IRQ is not set +CONFIG_IBM_ASM=m +CONFIG_PHANTOM=m +CONFIG_SGI_IOC4=m +CONFIG_TIFM_CORE=m +CONFIG_TIFM_7XX1=m +CONFIG_ICS932S401=m +CONFIG_ENCLOSURE_SERVICES=m +CONFIG_HP_ILO=m +CONFIG_APDS9802ALS=m +CONFIG_ISL29003=m +CONFIG_ISL29020=m +CONFIG_SENSORS_TSL2550=m +CONFIG_SENSORS_BH1770=m +CONFIG_SENSORS_APDS990X=m +CONFIG_HMC6352=m +CONFIG_DS1682=m +CONFIG_VMWARE_BALLOON=m +CONFIG_USB_SWITCH_FSA9480=m +CONFIG_LATTICE_ECP3_CONFIG=m +CONFIG_SRAM=y +CONFIG_PCI_ENDPOINT_TEST=m +CONFIG_MISC_RTSX=m +CONFIG_C2PORT=m 
+CONFIG_C2PORT_DURAMAR_2150=m + +# +# EEPROM support +# +CONFIG_EEPROM_AT24=m +CONFIG_EEPROM_AT25=m +CONFIG_EEPROM_LEGACY=m +CONFIG_EEPROM_MAX6875=m +CONFIG_EEPROM_93CX6=m +CONFIG_EEPROM_93XX46=m +CONFIG_EEPROM_IDT_89HPESX=m +CONFIG_CB710_CORE=m +# CONFIG_CB710_DEBUG is not set +CONFIG_CB710_DEBUG_ASSUMPTIONS=y + +# +# Texas Instruments shared transport line discipline +# +CONFIG_TI_ST=m +CONFIG_SENSORS_LIS3_I2C=m + +# +# Altera FPGA firmware download module (requires I2C) +# +CONFIG_ALTERA_STAPL=m +CONFIG_INTEL_MEI=y +CONFIG_INTEL_MEI_ME=y +CONFIG_INTEL_MEI_TXE=m +CONFIG_VMWARE_VMCI=m + +# +# Intel MIC & related support +# + +# +# Intel MIC Bus Driver +# +CONFIG_INTEL_MIC_BUS=m + +# +# SCIF Bus Driver +# +CONFIG_SCIF_BUS=m + +# +# VOP Bus Driver +# +CONFIG_VOP_BUS=m + +# +# Intel MIC Host Driver +# +CONFIG_INTEL_MIC_HOST=m + +# +# Intel MIC Card Driver +# +CONFIG_INTEL_MIC_CARD=m + +# +# SCIF Driver +# +CONFIG_SCIF=m + +# +# Intel MIC Coprocessor State Management (COSM) Drivers +# +CONFIG_MIC_COSM=m + +# +# VOP Driver +# +CONFIG_VOP=m +CONFIG_VHOST_RING=m +CONFIG_GENWQE=m +CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY=0 +CONFIG_ECHO=m +CONFIG_MISC_RTSX_PCI=m +CONFIG_MISC_RTSX_USB=m +CONFIG_HAVE_IDE=y +# CONFIG_IDE is not set + +# +# SCSI device support +# +CONFIG_SCSI_MOD=m +CONFIG_RAID_ATTRS=m +CONFIG_SCSI=m +CONFIG_SCSI_DMA=y +CONFIG_SCSI_NETLINK=y +# CONFIG_SCSI_MQ_DEFAULT is not set +CONFIG_SCSI_PROC_FS=y + +# +# SCSI support type (disk, tape, CD-ROM) +# +CONFIG_BLK_DEV_SD=m +CONFIG_CHR_DEV_ST=m +CONFIG_CHR_DEV_OSST=m +CONFIG_BLK_DEV_SR=m +CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_CHR_DEV_SG=m +CONFIG_CHR_DEV_SCH=m +CONFIG_SCSI_ENCLOSURE=m +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_LOGGING=y +CONFIG_SCSI_SCAN_ASYNC=y + +# +# SCSI Transports +# +CONFIG_SCSI_SPI_ATTRS=m +CONFIG_SCSI_FC_ATTRS=m +CONFIG_SCSI_ISCSI_ATTRS=m +CONFIG_SCSI_SAS_ATTRS=m +CONFIG_SCSI_SAS_LIBSAS=m +CONFIG_SCSI_SAS_ATA=y +CONFIG_SCSI_SAS_HOST_SMP=y +CONFIG_SCSI_SRP_ATTRS=m +CONFIG_SCSI_LOWLEVEL=y +CONFIG_ISCSI_TCP=m +CONFIG_ISCSI_BOOT_SYSFS=m +CONFIG_SCSI_CXGB3_ISCSI=m +CONFIG_SCSI_CXGB4_ISCSI=m +CONFIG_SCSI_BNX2_ISCSI=m +CONFIG_SCSI_BNX2X_FCOE=m +CONFIG_BE2ISCSI=m +CONFIG_BLK_DEV_3W_XXXX_RAID=m +CONFIG_SCSI_HPSA=m +CONFIG_SCSI_3W_9XXX=m +CONFIG_SCSI_3W_SAS=m +CONFIG_SCSI_ACARD=m +CONFIG_SCSI_AACRAID=m +CONFIG_SCSI_AIC7XXX=m +CONFIG_AIC7XXX_CMDS_PER_DEVICE=4 +CONFIG_AIC7XXX_RESET_DELAY_MS=15000 +# CONFIG_AIC7XXX_DEBUG_ENABLE is not set +CONFIG_AIC7XXX_DEBUG_MASK=0 +# CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set +CONFIG_SCSI_AIC79XX=m +CONFIG_AIC79XX_CMDS_PER_DEVICE=4 +CONFIG_AIC79XX_RESET_DELAY_MS=15000 +# CONFIG_AIC79XX_DEBUG_ENABLE is not set +CONFIG_AIC79XX_DEBUG_MASK=0 +# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set +CONFIG_SCSI_AIC94XX=m +# CONFIG_AIC94XX_DEBUG is not set +CONFIG_SCSI_MVSAS=m +# CONFIG_SCSI_MVSAS_DEBUG is not set +CONFIG_SCSI_MVSAS_TASKLET=y +CONFIG_SCSI_MVUMI=m +CONFIG_SCSI_DPT_I2O=m +CONFIG_SCSI_ADVANSYS=m +CONFIG_SCSI_ARCMSR=m +CONFIG_SCSI_ESAS2R=m +CONFIG_MEGARAID_NEWGEN=y +CONFIG_MEGARAID_MM=m +CONFIG_MEGARAID_MAILBOX=m +CONFIG_MEGARAID_LEGACY=m +CONFIG_MEGARAID_SAS=m +CONFIG_SCSI_MPT3SAS=m +CONFIG_SCSI_MPT2SAS_MAX_SGE=128 +CONFIG_SCSI_MPT3SAS_MAX_SGE=128 +CONFIG_SCSI_MPT2SAS=m +CONFIG_SCSI_SMARTPQI=m +CONFIG_SCSI_UFSHCD=m +CONFIG_SCSI_UFSHCD_PCI=m +CONFIG_SCSI_UFS_DWC_TC_PCI=m +CONFIG_SCSI_UFSHCD_PLATFORM=m +CONFIG_SCSI_UFS_DWC_TC_PLATFORM=m +CONFIG_SCSI_HPTIOP=m +CONFIG_SCSI_BUSLOGIC=m +CONFIG_SCSI_FLASHPOINT=y +CONFIG_VMWARE_PVSCSI=m +CONFIG_HYPERV_STORAGE=m +CONFIG_LIBFC=m +CONFIG_LIBFCOE=m +CONFIG_FCOE=m 
+CONFIG_FCOE_FNIC=m +CONFIG_SCSI_SNIC=m +# CONFIG_SCSI_SNIC_DEBUG_FS is not set +CONFIG_SCSI_DMX3191D=m +CONFIG_SCSI_GDTH=m +CONFIG_SCSI_ISCI=m +CONFIG_SCSI_IPS=m +CONFIG_SCSI_INITIO=m +CONFIG_SCSI_INIA100=m +CONFIG_SCSI_PPA=m +CONFIG_SCSI_IMM=m +# CONFIG_SCSI_IZIP_EPP16 is not set +# CONFIG_SCSI_IZIP_SLOW_CTR is not set +CONFIG_SCSI_STEX=m +CONFIG_SCSI_SYM53C8XX_2=m +CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 +CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 +CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 +CONFIG_SCSI_SYM53C8XX_MMIO=y +CONFIG_SCSI_IPR=m +# CONFIG_SCSI_IPR_TRACE is not set +# CONFIG_SCSI_IPR_DUMP is not set +CONFIG_SCSI_QLOGIC_1280=m +CONFIG_SCSI_QLA_FC=m +CONFIG_TCM_QLA2XXX=m +# CONFIG_TCM_QLA2XXX_DEBUG is not set +CONFIG_SCSI_QLA_ISCSI=m +CONFIG_QEDI=m +CONFIG_QEDF=m +CONFIG_SCSI_LPFC=m +# CONFIG_SCSI_LPFC_DEBUG_FS is not set +CONFIG_SCSI_DC395x=m +CONFIG_SCSI_AM53C974=m +CONFIG_SCSI_WD719X=m +# CONFIG_SCSI_DEBUG is not set +CONFIG_SCSI_PMCRAID=m +CONFIG_SCSI_PM8001=m +CONFIG_SCSI_BFA_FC=m +CONFIG_SCSI_VIRTIO=m +CONFIG_SCSI_CHELSIO_FCOE=m +CONFIG_SCSI_LOWLEVEL_PCMCIA=y +CONFIG_PCMCIA_AHA152X=m +CONFIG_PCMCIA_QLOGIC=m +CONFIG_PCMCIA_SYM53C500=m +CONFIG_SCSI_DH=y +CONFIG_SCSI_DH_RDAC=m +CONFIG_SCSI_DH_HP_SW=m +CONFIG_SCSI_DH_EMC=m +CONFIG_SCSI_DH_ALUA=m +CONFIG_SCSI_OSD_INITIATOR=m +CONFIG_SCSI_OSD_ULD=m +CONFIG_SCSI_OSD_DPRINT_SENSE=1 +# CONFIG_SCSI_OSD_DEBUG is not set +CONFIG_ATA=m +CONFIG_ATA_VERBOSE_ERROR=y +CONFIG_ATA_ACPI=y +CONFIG_SATA_ZPODD=y +CONFIG_SATA_PMP=y + +# +# Controllers with non-SFF native interface +# +CONFIG_SATA_AHCI=m +CONFIG_SATA_MOBILE_LPM_POLICY=0 +CONFIG_SATA_AHCI_PLATFORM=m +CONFIG_SATA_INIC162X=m +CONFIG_SATA_ACARD_AHCI=m +CONFIG_SATA_SIL24=m +CONFIG_ATA_SFF=y + +# +# SFF controllers with custom DMA interface +# +CONFIG_PDC_ADMA=m +CONFIG_SATA_QSTOR=m +CONFIG_SATA_SX4=m +CONFIG_ATA_BMDMA=y + +# +# SATA SFF controllers with BMDMA +# +CONFIG_ATA_PIIX=m +CONFIG_SATA_DWC=m +# CONFIG_SATA_DWC_OLD_DMA is not set +# CONFIG_SATA_DWC_DEBUG is not set +CONFIG_SATA_MV=m +CONFIG_SATA_NV=m +CONFIG_SATA_PROMISE=m +CONFIG_SATA_SIL=m +CONFIG_SATA_SIS=m +CONFIG_SATA_SVW=m +CONFIG_SATA_ULI=m +CONFIG_SATA_VIA=m +CONFIG_SATA_VITESSE=m + +# +# PATA SFF controllers with BMDMA +# +CONFIG_PATA_ALI=m +CONFIG_PATA_AMD=m +CONFIG_PATA_ARTOP=m +CONFIG_PATA_ATIIXP=m +CONFIG_PATA_ATP867X=m +CONFIG_PATA_CMD64X=m +CONFIG_PATA_CYPRESS=m +CONFIG_PATA_EFAR=m +CONFIG_PATA_HPT366=m +CONFIG_PATA_HPT37X=m +CONFIG_PATA_HPT3X2N=m +CONFIG_PATA_HPT3X3=m +CONFIG_PATA_HPT3X3_DMA=y +CONFIG_PATA_IT8213=m +CONFIG_PATA_IT821X=m +CONFIG_PATA_JMICRON=m +CONFIG_PATA_MARVELL=m +CONFIG_PATA_NETCELL=m +CONFIG_PATA_NINJA32=m +CONFIG_PATA_NS87415=m +CONFIG_PATA_OLDPIIX=m +CONFIG_PATA_OPTIDMA=m +CONFIG_PATA_PDC2027X=m +CONFIG_PATA_PDC_OLD=m +CONFIG_PATA_RADISYS=m +CONFIG_PATA_RDC=m +CONFIG_PATA_SCH=m +CONFIG_PATA_SERVERWORKS=m +CONFIG_PATA_SIL680=m +CONFIG_PATA_SIS=m +CONFIG_PATA_TOSHIBA=m +CONFIG_PATA_TRIFLEX=m +CONFIG_PATA_VIA=m +CONFIG_PATA_WINBOND=m + +# +# PIO-only SFF controllers +# +CONFIG_PATA_CMD640_PCI=m +CONFIG_PATA_MPIIX=m +CONFIG_PATA_NS87410=m +CONFIG_PATA_OPTI=m +CONFIG_PATA_PCMCIA=m +CONFIG_PATA_RZ1000=m + +# +# Generic fallback / legacy drivers +# +CONFIG_PATA_ACPI=m +CONFIG_ATA_GENERIC=m +CONFIG_PATA_LEGACY=m +CONFIG_MD=y +CONFIG_BLK_DEV_MD=m +CONFIG_MD_LINEAR=m +CONFIG_MD_RAID0=m +CONFIG_MD_RAID1=m +CONFIG_MD_RAID10=m +CONFIG_MD_RAID456=m +CONFIG_MD_MULTIPATH=m +CONFIG_MD_FAULTY=m +CONFIG_MD_CLUSTER=m +CONFIG_BCACHE=m +# CONFIG_BCACHE_DEBUG is not set +# CONFIG_BCACHE_CLOSURES_DEBUG is not set 
+CONFIG_BLK_DEV_DM_BUILTIN=y +CONFIG_BLK_DEV_DM=m +# CONFIG_DM_MQ_DEFAULT is not set +# CONFIG_DM_DEBUG is not set +CONFIG_DM_BUFIO=m +# CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING is not set +CONFIG_DM_BIO_PRISON=m +CONFIG_DM_PERSISTENT_DATA=m +CONFIG_DM_UNSTRIPED=m +CONFIG_DM_CRYPT=m +CONFIG_DM_SNAPSHOT=m +CONFIG_DM_THIN_PROVISIONING=m +CONFIG_DM_CACHE=m +CONFIG_DM_CACHE_SMQ=m +CONFIG_DM_WRITECACHE=m +CONFIG_DM_ERA=m +CONFIG_DM_MIRROR=m +CONFIG_DM_LOG_USERSPACE=m +CONFIG_DM_RAID=m +CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m +CONFIG_DM_MULTIPATH_QL=m +CONFIG_DM_MULTIPATH_ST=m +CONFIG_DM_DELAY=m +CONFIG_DM_UEVENT=y +CONFIG_DM_FLAKEY=m +CONFIG_DM_VERITY=m +# CONFIG_DM_VERITY_FEC is not set +CONFIG_DM_SWITCH=m +CONFIG_DM_LOG_WRITES=m +CONFIG_DM_INTEGRITY=m +CONFIG_DM_ZONED=m +CONFIG_TARGET_CORE=m +CONFIG_TCM_IBLOCK=m +CONFIG_TCM_FILEIO=m +CONFIG_TCM_PSCSI=m +CONFIG_TCM_USER2=m +CONFIG_LOOPBACK_TARGET=m +CONFIG_TCM_FC=m +CONFIG_ISCSI_TARGET=m +CONFIG_ISCSI_TARGET_CXGB4=m +CONFIG_SBP_TARGET=m +CONFIG_FUSION=y +CONFIG_FUSION_SPI=m +CONFIG_FUSION_FC=m +CONFIG_FUSION_SAS=m +CONFIG_FUSION_MAX_SGE=128 +CONFIG_FUSION_CTL=m +CONFIG_FUSION_LAN=m +CONFIG_FUSION_LOGGING=y + +# +# IEEE 1394 (FireWire) support +# +CONFIG_FIREWIRE=m +CONFIG_FIREWIRE_OHCI=m +CONFIG_FIREWIRE_SBP2=m +CONFIG_FIREWIRE_NET=m +CONFIG_FIREWIRE_NOSY=m +CONFIG_MACINTOSH_DRIVERS=y +CONFIG_MAC_EMUMOUSEBTN=m +CONFIG_NETDEVICES=y +CONFIG_MII=m +CONFIG_NET_CORE=y +CONFIG_BONDING=m +CONFIG_DUMMY=m +CONFIG_EQUALIZER=m +CONFIG_NET_FC=y +CONFIG_IFB=m +CONFIG_NET_TEAM=m +CONFIG_NET_TEAM_MODE_BROADCAST=m +CONFIG_NET_TEAM_MODE_ROUNDROBIN=m +CONFIG_NET_TEAM_MODE_RANDOM=m +CONFIG_NET_TEAM_MODE_ACTIVEBACKUP=m +CONFIG_NET_TEAM_MODE_LOADBALANCE=m +CONFIG_MACVLAN=m +CONFIG_MACVTAP=m +CONFIG_IPVLAN=m +CONFIG_IPVTAP=m +CONFIG_VXLAN=m +CONFIG_GENEVE=m +CONFIG_GTP=m +CONFIG_MACSEC=m +CONFIG_NETCONSOLE=m +CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETPOLL=y +CONFIG_NET_POLL_CONTROLLER=y +# CONFIG_NTB_NETDEV is not set +CONFIG_RIONET=m +CONFIG_RIONET_TX_SIZE=128 +CONFIG_RIONET_RX_SIZE=128 +CONFIG_TUN=m +CONFIG_TAP=m +# CONFIG_TUN_VNET_CROSS_LE is not set +CONFIG_VETH=m +CONFIG_VIRTIO_NET=m +CONFIG_NLMON=m +CONFIG_NET_VRF=m +CONFIG_VSOCKMON=m +CONFIG_SUNGEM_PHY=m +CONFIG_ARCNET=m +CONFIG_ARCNET_1201=m +CONFIG_ARCNET_1051=m +CONFIG_ARCNET_RAW=m +CONFIG_ARCNET_CAP=m +CONFIG_ARCNET_COM90xx=m +CONFIG_ARCNET_COM90xxIO=m +CONFIG_ARCNET_RIM_I=m +CONFIG_ARCNET_COM20020=m +CONFIG_ARCNET_COM20020_PCI=m +CONFIG_ARCNET_COM20020_CS=m +CONFIG_ATM_DRIVERS=y +# CONFIG_ATM_DUMMY is not set +CONFIG_ATM_TCP=m +CONFIG_ATM_LANAI=m +CONFIG_ATM_ENI=m +# CONFIG_ATM_ENI_DEBUG is not set +# CONFIG_ATM_ENI_TUNE_BURST is not set +CONFIG_ATM_FIRESTREAM=m +CONFIG_ATM_ZATM=m +# CONFIG_ATM_ZATM_DEBUG is not set +CONFIG_ATM_NICSTAR=m +CONFIG_ATM_NICSTAR_USE_SUNI=y +CONFIG_ATM_NICSTAR_USE_IDT77105=y +CONFIG_ATM_IDT77252=m +# CONFIG_ATM_IDT77252_DEBUG is not set +# CONFIG_ATM_IDT77252_RCV_ALL is not set +CONFIG_ATM_IDT77252_USE_SUNI=y +CONFIG_ATM_AMBASSADOR=m +# CONFIG_ATM_AMBASSADOR_DEBUG is not set +CONFIG_ATM_HORIZON=m +# CONFIG_ATM_HORIZON_DEBUG is not set +CONFIG_ATM_IA=m +# CONFIG_ATM_IA_DEBUG is not set +CONFIG_ATM_FORE200E=m +CONFIG_ATM_FORE200E_USE_TASKLET=y +CONFIG_ATM_FORE200E_TX_RETRY=16 +CONFIG_ATM_FORE200E_DEBUG=0 +CONFIG_ATM_HE=m +CONFIG_ATM_HE_USE_SUNI=y +CONFIG_ATM_SOLOS=m + +# +# CAIF transport drivers +# +CONFIG_CAIF_TTY=m +CONFIG_CAIF_SPI_SLAVE=m +CONFIG_CAIF_SPI_SYNC=y +CONFIG_CAIF_HSI=m +CONFIG_CAIF_VIRTIO=m + +# +# Distributed Switch Architecture drivers +# +CONFIG_B53=m 
+CONFIG_B53_SPI_DRIVER=m +CONFIG_B53_MDIO_DRIVER=m +CONFIG_B53_MMAP_DRIVER=m +CONFIG_B53_SRAB_DRIVER=m +# CONFIG_NET_DSA_BCM_SF2 is not set +CONFIG_NET_DSA_LOOP=m +CONFIG_NET_DSA_MT7530=m +CONFIG_NET_DSA_MV88E6060=m +CONFIG_MICROCHIP_KSZ=m +CONFIG_MICROCHIP_KSZ_SPI_DRIVER=m +CONFIG_NET_DSA_MV88E6XXX=m +CONFIG_NET_DSA_MV88E6XXX_GLOBAL2=y +CONFIG_NET_DSA_MV88E6XXX_PTP=y +CONFIG_NET_DSA_QCA8K=m +# CONFIG_NET_DSA_REALTEK_SMI is not set +CONFIG_NET_DSA_SMSC_LAN9303=m +CONFIG_NET_DSA_SMSC_LAN9303_I2C=m +CONFIG_NET_DSA_SMSC_LAN9303_MDIO=m +CONFIG_ETHERNET=y +CONFIG_MDIO=m +CONFIG_NET_VENDOR_3COM=y +CONFIG_PCMCIA_3C574=m +CONFIG_PCMCIA_3C589=m +CONFIG_VORTEX=m +CONFIG_TYPHOON=m +CONFIG_NET_VENDOR_ADAPTEC=y +CONFIG_ADAPTEC_STARFIRE=m +CONFIG_NET_VENDOR_AGERE=y +CONFIG_ET131X=m +CONFIG_NET_VENDOR_ALACRITECH=y +CONFIG_SLICOSS=m +CONFIG_NET_VENDOR_ALTEON=y +CONFIG_ACENIC=m +# CONFIG_ACENIC_OMIT_TIGON_I is not set +CONFIG_ALTERA_TSE=m +CONFIG_NET_VENDOR_AMAZON=y +CONFIG_ENA_ETHERNET=m +CONFIG_NET_VENDOR_AMD=y +CONFIG_AMD8111_ETH=m +CONFIG_PCNET32=m +CONFIG_PCMCIA_NMCLAN=m +CONFIG_AMD_XGBE=m +CONFIG_AMD_XGBE_DCB=y +CONFIG_AMD_XGBE_HAVE_ECC=y +CONFIG_NET_VENDOR_AQUANTIA=y +CONFIG_AQTION=m +CONFIG_NET_VENDOR_ARC=y +CONFIG_NET_VENDOR_ATHEROS=y +CONFIG_ATL2=m +CONFIG_ATL1=m +CONFIG_ATL1E=m +CONFIG_ATL1C=m +CONFIG_ALX=m +CONFIG_NET_VENDOR_AURORA=y +CONFIG_AURORA_NB8800=m +CONFIG_NET_VENDOR_BROADCOM=y +CONFIG_B44=m +CONFIG_B44_PCI_AUTOSELECT=y +CONFIG_B44_PCICORE_AUTOSELECT=y +CONFIG_B44_PCI=y +# CONFIG_BCMGENET is not set +CONFIG_BNX2=m +CONFIG_CNIC=m +CONFIG_TIGON3=m +CONFIG_TIGON3_HWMON=y +CONFIG_BNX2X=m +CONFIG_BNX2X_SRIOV=y +# CONFIG_SYSTEMPORT is not set +CONFIG_BNXT=m +CONFIG_BNXT_SRIOV=y +CONFIG_BNXT_FLOWER_OFFLOAD=y +CONFIG_BNXT_DCB=y +CONFIG_BNXT_HWMON=y +CONFIG_NET_VENDOR_BROCADE=y +CONFIG_BNA=m +CONFIG_NET_VENDOR_CADENCE=y +CONFIG_MACB=m +CONFIG_MACB_USE_HWSTAMP=y +CONFIG_MACB_PCI=m +CONFIG_NET_VENDOR_CAVIUM=y +CONFIG_THUNDER_NIC_PF=m +CONFIG_THUNDER_NIC_VF=m +CONFIG_THUNDER_NIC_BGX=m +CONFIG_THUNDER_NIC_RGX=m +CONFIG_CAVIUM_PTP=m +CONFIG_LIQUIDIO=m +CONFIG_LIQUIDIO_VF=m +CONFIG_NET_VENDOR_CHELSIO=y +CONFIG_CHELSIO_T1=m +CONFIG_CHELSIO_T1_1G=y +CONFIG_CHELSIO_T3=m +CONFIG_CHELSIO_T4=m +CONFIG_CHELSIO_T4_DCB=y +# CONFIG_CHELSIO_T4_FCOE is not set +CONFIG_CHELSIO_T4VF=m +CONFIG_CHELSIO_LIB=m +CONFIG_NET_VENDOR_CISCO=y +CONFIG_ENIC=m +# CONFIG_NET_VENDOR_CORTINA is not set +CONFIG_CX_ECAT=m +CONFIG_DNET=m +CONFIG_NET_VENDOR_DEC=y +CONFIG_NET_TULIP=y +CONFIG_DE2104X=m +CONFIG_DE2104X_DSL=0 +CONFIG_TULIP=m +CONFIG_TULIP_MWI=y +CONFIG_TULIP_MMIO=y +CONFIG_TULIP_NAPI=y +CONFIG_TULIP_NAPI_HW_MITIGATION=y +CONFIG_DE4X5=m +CONFIG_WINBOND_840=m +CONFIG_DM9102=m +CONFIG_ULI526X=m +CONFIG_PCMCIA_XIRCOM=m +CONFIG_NET_VENDOR_DLINK=y +CONFIG_DL2K=m +CONFIG_SUNDANCE=m +# CONFIG_SUNDANCE_MMIO is not set +CONFIG_NET_VENDOR_EMULEX=y +CONFIG_BE2NET=m +CONFIG_BE2NET_HWMON=y +CONFIG_BE2NET_BE2=y +CONFIG_BE2NET_BE3=y +CONFIG_BE2NET_LANCER=y +CONFIG_BE2NET_SKYHAWK=y +CONFIG_NET_VENDOR_EZCHIP=y +CONFIG_NET_VENDOR_FUJITSU=y +CONFIG_PCMCIA_FMVJ18X=m +CONFIG_NET_VENDOR_HP=y +CONFIG_HP100=m +CONFIG_NET_VENDOR_HUAWEI=y +CONFIG_HINIC=m +CONFIG_NET_VENDOR_I825XX=y +CONFIG_NET_VENDOR_INTEL=y +CONFIG_E100=m +CONFIG_E1000=m +CONFIG_E1000E=m +CONFIG_E1000E_HWTS=y +CONFIG_IGB=m +CONFIG_IGB_HWMON=y +CONFIG_IGB_DCA=y +CONFIG_IGBVF=m +CONFIG_IXGB=m +CONFIG_IXGBE=m +CONFIG_IXGBE_HWMON=y +CONFIG_IXGBE_DCA=y +CONFIG_IXGBE_DCB=y +CONFIG_IXGBEVF=m +CONFIG_I40E=m +CONFIG_I40E_DCB=y +CONFIG_I40EVF=m +CONFIG_ICE=m +CONFIG_FM10K=m +CONFIG_JME=m 
+CONFIG_NET_VENDOR_MARVELL=y +CONFIG_MVMDIO=m +CONFIG_SKGE=m +# CONFIG_SKGE_DEBUG is not set +CONFIG_SKGE_GENESIS=y +CONFIG_SKY2=m +# CONFIG_SKY2_DEBUG is not set +CONFIG_NET_VENDOR_MELLANOX=y +CONFIG_MLX4_EN=m +CONFIG_MLX4_EN_DCB=y +CONFIG_MLX4_CORE=m +CONFIG_MLX4_DEBUG=y +CONFIG_MLX4_CORE_GEN2=y +CONFIG_MLX5_CORE=m +CONFIG_MLX5_ACCEL=y +CONFIG_MLX5_FPGA=y +# CONFIG_MLX5_CORE_EN is not set +CONFIG_MLXSW_CORE=m +CONFIG_MLXSW_CORE_HWMON=y +CONFIG_MLXSW_CORE_THERMAL=y +CONFIG_MLXSW_PCI=m +CONFIG_MLXSW_I2C=m +CONFIG_MLXSW_SWITCHIB=m +CONFIG_MLXSW_SWITCHX2=m +CONFIG_MLXSW_SPECTRUM=m +CONFIG_MLXSW_SPECTRUM_DCB=y +CONFIG_MLXSW_MINIMAL=m +CONFIG_MLXFW=m +CONFIG_NET_VENDOR_MICREL=y +CONFIG_KS8842=m +CONFIG_KS8851=m +CONFIG_KS8851_MLL=m +CONFIG_KSZ884X_PCI=m +CONFIG_NET_VENDOR_MICROCHIP=y +CONFIG_ENC28J60=m +# CONFIG_ENC28J60_WRITEVERIFY is not set +CONFIG_ENCX24J600=m +CONFIG_LAN743X=m +CONFIG_NET_VENDOR_MICROSEMI=y +CONFIG_MSCC_OCELOT_SWITCH=m +CONFIG_MSCC_OCELOT_SWITCH_OCELOT=m +CONFIG_NET_VENDOR_MYRI=y +CONFIG_MYRI10GE=m +CONFIG_MYRI10GE_DCA=y +CONFIG_FEALNX=m +CONFIG_NET_VENDOR_NATSEMI=y +CONFIG_NATSEMI=m +CONFIG_NS83820=m +CONFIG_NET_VENDOR_NETERION=y +CONFIG_S2IO=m +CONFIG_VXGE=m +# CONFIG_VXGE_DEBUG_TRACE_ALL is not set +CONFIG_NET_VENDOR_NETRONOME=y +CONFIG_NFP=m +# CONFIG_NFP_APP_FLOWER is not set +CONFIG_NFP_APP_ABM_NIC=y +# CONFIG_NFP_DEBUG is not set +CONFIG_NET_VENDOR_NI=y +CONFIG_NET_VENDOR_8390=y +CONFIG_PCMCIA_AXNET=m +CONFIG_NE2K_PCI=m +CONFIG_PCMCIA_PCNET=m +CONFIG_NET_VENDOR_NVIDIA=y +CONFIG_FORCEDETH=m +CONFIG_NET_VENDOR_OKI=y +CONFIG_ETHOC=m +CONFIG_NET_VENDOR_PACKET_ENGINES=y +CONFIG_HAMACHI=m +CONFIG_YELLOWFIN=m +CONFIG_NET_VENDOR_QLOGIC=y +CONFIG_QLA3XXX=m +CONFIG_QLCNIC=m +CONFIG_QLCNIC_SRIOV=y +CONFIG_QLCNIC_DCB=y +CONFIG_QLCNIC_HWMON=y +CONFIG_QLGE=m +CONFIG_NETXEN_NIC=m +CONFIG_QED=m +CONFIG_QED_LL2=y +CONFIG_QED_SRIOV=y +CONFIG_QEDE=m +CONFIG_QED_RDMA=y +CONFIG_QED_ISCSI=y +CONFIG_QED_FCOE=y +CONFIG_QED_OOO=y +CONFIG_NET_VENDOR_QUALCOMM=y +CONFIG_QCOM_EMAC=m +CONFIG_RMNET=m +CONFIG_NET_VENDOR_RDC=y +CONFIG_R6040=m +CONFIG_NET_VENDOR_REALTEK=y +CONFIG_ATP=m +CONFIG_8139CP=m +CONFIG_8139TOO=m +# CONFIG_8139TOO_PIO is not set +CONFIG_8139TOO_TUNE_TWISTER=y +CONFIG_8139TOO_8129=y +# CONFIG_8139_OLD_RX_RESET is not set +CONFIG_R8169=m +CONFIG_NET_VENDOR_RENESAS=y +CONFIG_NET_VENDOR_ROCKER=y +CONFIG_ROCKER=m +CONFIG_NET_VENDOR_SAMSUNG=y +CONFIG_SXGBE_ETH=m +CONFIG_NET_VENDOR_SEEQ=y +CONFIG_NET_VENDOR_SOLARFLARE=y +CONFIG_SFC=m +CONFIG_SFC_MTD=y +CONFIG_SFC_MCDI_MON=y +CONFIG_SFC_SRIOV=y +CONFIG_SFC_MCDI_LOGGING=y +CONFIG_SFC_FALCON=m +CONFIG_SFC_FALCON_MTD=y +CONFIG_NET_VENDOR_SILAN=y +CONFIG_SC92031=m +CONFIG_NET_VENDOR_SIS=y +CONFIG_SIS900=m +CONFIG_SIS190=m +CONFIG_NET_VENDOR_SMSC=y +CONFIG_PCMCIA_SMC91C92=m +CONFIG_EPIC100=m +CONFIG_SMSC911X=m +CONFIG_SMSC9420=m +# CONFIG_NET_VENDOR_SOCIONEXT is not set +CONFIG_NET_VENDOR_STMICRO=y +CONFIG_STMMAC_ETH=m +CONFIG_STMMAC_PLATFORM=m +CONFIG_DWMAC_GENERIC=m +CONFIG_STMMAC_PCI=m +CONFIG_NET_VENDOR_SUN=y +CONFIG_HAPPYMEAL=m +CONFIG_SUNGEM=m +CONFIG_CASSINI=m +CONFIG_NIU=m +CONFIG_NET_VENDOR_SYNOPSYS=y +CONFIG_DWC_XLGMAC=m +CONFIG_DWC_XLGMAC_PCI=m +CONFIG_NET_VENDOR_TEHUTI=y +CONFIG_TEHUTI=m +CONFIG_NET_VENDOR_TI=y +CONFIG_TI_CPSW_ALE=m +CONFIG_TLAN=m +CONFIG_NET_VENDOR_VIA=y +CONFIG_VIA_RHINE=m +CONFIG_VIA_RHINE_MMIO=y +CONFIG_VIA_VELOCITY=m +CONFIG_NET_VENDOR_WIZNET=y +CONFIG_WIZNET_W5100=m +CONFIG_WIZNET_W5300=m +# CONFIG_WIZNET_BUS_DIRECT is not set +# CONFIG_WIZNET_BUS_INDIRECT is not set +CONFIG_WIZNET_BUS_ANY=y 
+CONFIG_WIZNET_W5100_SPI=m +CONFIG_NET_VENDOR_XIRCOM=y +CONFIG_PCMCIA_XIRC2PS=m +CONFIG_FDDI=m +CONFIG_DEFXX=m +# CONFIG_DEFXX_MMIO is not set +CONFIG_SKFP=m +CONFIG_HIPPI=y +CONFIG_ROADRUNNER=m +CONFIG_ROADRUNNER_LARGE_RINGS=y +CONFIG_NET_SB1000=m +CONFIG_MDIO_DEVICE=m +CONFIG_MDIO_BUS=m +# CONFIG_MDIO_BCM_UNIMAC is not set +CONFIG_MDIO_BITBANG=m +CONFIG_MDIO_CAVIUM=m +CONFIG_MDIO_GPIO=m +CONFIG_MDIO_I2C=m +CONFIG_MDIO_MSCC_MIIM=m +CONFIG_MDIO_THUNDER=m +CONFIG_PHYLINK=m +CONFIG_PHYLIB=m +CONFIG_SWPHY=y +CONFIG_LED_TRIGGER_PHY=y + +# +# MII PHY device drivers +# +CONFIG_SFP=m +CONFIG_AMD_PHY=m +CONFIG_AQUANTIA_PHY=m +CONFIG_ASIX_PHY=m +CONFIG_AT803X_PHY=m +CONFIG_BCM7XXX_PHY=m +CONFIG_BCM87XX_PHY=m +CONFIG_BCM_NET_PHYLIB=m +CONFIG_BROADCOM_PHY=m +CONFIG_CICADA_PHY=m +CONFIG_CORTINA_PHY=m +CONFIG_DAVICOM_PHY=m +CONFIG_DP83822_PHY=m +CONFIG_DP83TC811_PHY=m +CONFIG_DP83848_PHY=m +CONFIG_DP83867_PHY=m +CONFIG_FIXED_PHY=m +CONFIG_ICPLUS_PHY=m +CONFIG_INTEL_XWAY_PHY=m +CONFIG_LSI_ET1011C_PHY=m +CONFIG_LXT_PHY=m +CONFIG_MARVELL_PHY=m +CONFIG_MARVELL_10G_PHY=m +CONFIG_MICREL_PHY=m +CONFIG_MICROCHIP_PHY=m +CONFIG_MICROCHIP_T1_PHY=m +CONFIG_MICROSEMI_PHY=m +CONFIG_NATIONAL_PHY=m +CONFIG_QSEMI_PHY=m +CONFIG_REALTEK_PHY=m +CONFIG_RENESAS_PHY=m +CONFIG_ROCKCHIP_PHY=m +CONFIG_SMSC_PHY=m +CONFIG_STE10XP=m +CONFIG_TERANETICS_PHY=m +CONFIG_VITESSE_PHY=m +CONFIG_XILINX_GMII2RGMII=m +CONFIG_MICREL_KS8995MA=m +CONFIG_PLIP=m +CONFIG_PPP=m +CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_FILTER=y +CONFIG_PPP_MPPE=m +CONFIG_PPP_MULTILINK=y +CONFIG_PPPOATM=m +CONFIG_PPPOE=m +CONFIG_PPTP=m +CONFIG_PPPOL2TP=m +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m +CONFIG_SLIP=m +CONFIG_SLHC=m +CONFIG_SLIP_COMPRESSED=y +CONFIG_SLIP_SMART=y +CONFIG_SLIP_MODE_SLIP6=y + +# +# Host-side USB support is needed for USB Network Adapter support +# +CONFIG_USB_NET_DRIVERS=m +CONFIG_USB_CATC=m +CONFIG_USB_KAWETH=m +CONFIG_USB_PEGASUS=m +CONFIG_USB_RTL8150=m +CONFIG_USB_RTL8152=m +CONFIG_USB_LAN78XX=m +CONFIG_USB_USBNET=m +CONFIG_USB_NET_AX8817X=m +CONFIG_USB_NET_AX88179_178A=m +CONFIG_USB_NET_CDCETHER=m +CONFIG_USB_NET_CDC_EEM=m +CONFIG_USB_NET_CDC_NCM=m +CONFIG_USB_NET_HUAWEI_CDC_NCM=m +CONFIG_USB_NET_CDC_MBIM=m +CONFIG_USB_NET_DM9601=m +CONFIG_USB_NET_SR9700=m +CONFIG_USB_NET_SR9800=m +CONFIG_USB_NET_SMSC75XX=m +CONFIG_USB_NET_SMSC95XX=m +CONFIG_USB_NET_GL620A=m +CONFIG_USB_NET_NET1080=m +CONFIG_USB_NET_PLUSB=m +CONFIG_USB_NET_MCS7830=m +CONFIG_USB_NET_RNDIS_HOST=m +CONFIG_USB_NET_CDC_SUBSET_ENABLE=m +CONFIG_USB_NET_CDC_SUBSET=m +CONFIG_USB_ALI_M5632=y +CONFIG_USB_AN2720=y +CONFIG_USB_BELKIN=y +CONFIG_USB_ARMLINUX=y +CONFIG_USB_EPSON2888=y +CONFIG_USB_KC2190=y +CONFIG_USB_NET_ZAURUS=m +CONFIG_USB_NET_CX82310_ETH=m +CONFIG_USB_NET_KALMIA=m +CONFIG_USB_NET_QMI_WWAN=m +CONFIG_USB_HSO=m +CONFIG_USB_NET_INT51X1=m +CONFIG_USB_CDC_PHONET=m +CONFIG_USB_IPHETH=m +CONFIG_USB_SIERRA_NET=m +CONFIG_USB_VL600=m +CONFIG_USB_NET_CH9200=m +CONFIG_WLAN=y +CONFIG_WLAN_VENDOR_ADMTEK=y +CONFIG_ADM8211=m +CONFIG_ATH_COMMON=m +CONFIG_WLAN_VENDOR_ATH=y +# CONFIG_ATH_DEBUG is not set +CONFIG_ATH5K=m +# CONFIG_ATH5K_DEBUG is not set +# CONFIG_ATH5K_TRACER is not set +CONFIG_ATH5K_PCI=y +CONFIG_ATH9K_HW=m +CONFIG_ATH9K_COMMON=m +CONFIG_ATH9K_BTCOEX_SUPPORT=y +CONFIG_ATH9K=m +CONFIG_ATH9K_PCI=y +CONFIG_ATH9K_AHB=y +# CONFIG_ATH9K_DEBUGFS is not set +CONFIG_ATH9K_DYNACK=y +CONFIG_ATH9K_WOW=y +CONFIG_ATH9K_RFKILL=y +CONFIG_ATH9K_CHANNEL_CONTEXT=y +CONFIG_ATH9K_PCOEM=y +CONFIG_ATH9K_HTC=m +# CONFIG_ATH9K_HTC_DEBUGFS is not set +CONFIG_ATH9K_HWRNG=y 
+CONFIG_CARL9170=m +CONFIG_CARL9170_LEDS=y +CONFIG_CARL9170_WPC=y +# CONFIG_CARL9170_HWRNG is not set +CONFIG_ATH6KL=m +CONFIG_ATH6KL_SDIO=m +CONFIG_ATH6KL_USB=m +# CONFIG_ATH6KL_DEBUG is not set +# CONFIG_ATH6KL_TRACING is not set +CONFIG_AR5523=m +CONFIG_WIL6210=m +CONFIG_WIL6210_ISR_COR=y +CONFIG_WIL6210_TRACING=y +CONFIG_WIL6210_DEBUGFS=y +CONFIG_ATH10K=m +CONFIG_ATH10K_CE=y +CONFIG_ATH10K_PCI=m +CONFIG_ATH10K_SDIO=m +CONFIG_ATH10K_USB=m +# CONFIG_ATH10K_DEBUG is not set +# CONFIG_ATH10K_DEBUGFS is not set +# CONFIG_ATH10K_TRACING is not set +CONFIG_WCN36XX=m +# CONFIG_WCN36XX_DEBUGFS is not set +CONFIG_WLAN_VENDOR_ATMEL=y +CONFIG_ATMEL=m +CONFIG_PCI_ATMEL=m +CONFIG_PCMCIA_ATMEL=m +CONFIG_AT76C50X_USB=m +CONFIG_WLAN_VENDOR_BROADCOM=y +CONFIG_B43=m +CONFIG_B43_BCMA=y +CONFIG_B43_SSB=y +CONFIG_B43_BUSES_BCMA_AND_SSB=y +# CONFIG_B43_BUSES_BCMA is not set +# CONFIG_B43_BUSES_SSB is not set +CONFIG_B43_PCI_AUTOSELECT=y +CONFIG_B43_PCICORE_AUTOSELECT=y +CONFIG_B43_SDIO=y +CONFIG_B43_BCMA_PIO=y +CONFIG_B43_PIO=y +CONFIG_B43_PHY_G=y +CONFIG_B43_PHY_N=y +CONFIG_B43_PHY_LP=y +CONFIG_B43_PHY_HT=y +CONFIG_B43_LEDS=y +CONFIG_B43_HWRNG=y +# CONFIG_B43_DEBUG is not set +CONFIG_B43LEGACY=m +CONFIG_B43LEGACY_PCI_AUTOSELECT=y +CONFIG_B43LEGACY_PCICORE_AUTOSELECT=y +CONFIG_B43LEGACY_LEDS=y +CONFIG_B43LEGACY_HWRNG=y +# CONFIG_B43LEGACY_DEBUG is not set +CONFIG_B43LEGACY_DMA=y +CONFIG_B43LEGACY_PIO=y +CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y +# CONFIG_B43LEGACY_DMA_MODE is not set +# CONFIG_B43LEGACY_PIO_MODE is not set +CONFIG_BRCMUTIL=m +CONFIG_BRCMSMAC=m +CONFIG_BRCMFMAC=m +CONFIG_BRCMFMAC_PROTO_BCDC=y +CONFIG_BRCMFMAC_PROTO_MSGBUF=y +CONFIG_BRCMFMAC_SDIO=y +CONFIG_BRCMFMAC_USB=y +CONFIG_BRCMFMAC_PCIE=y +CONFIG_BRCM_TRACING=y +# CONFIG_BRCMDBG is not set +CONFIG_WLAN_VENDOR_CISCO=y +CONFIG_AIRO=m +CONFIG_AIRO_CS=m +CONFIG_WLAN_VENDOR_INTEL=y +CONFIG_IPW2100=m +CONFIG_IPW2100_MONITOR=y +# CONFIG_IPW2100_DEBUG is not set +CONFIG_IPW2200=m +CONFIG_IPW2200_MONITOR=y +CONFIG_IPW2200_RADIOTAP=y +CONFIG_IPW2200_PROMISCUOUS=y +CONFIG_IPW2200_QOS=y +# CONFIG_IPW2200_DEBUG is not set +CONFIG_LIBIPW=m +# CONFIG_LIBIPW_DEBUG is not set +CONFIG_IWLEGACY=m +CONFIG_IWL4965=m +CONFIG_IWL3945=m + +# +# iwl3945 / iwl4965 Debugging Options +# +# CONFIG_IWLEGACY_DEBUG is not set +CONFIG_IWLWIFI=m +CONFIG_IWLWIFI_LEDS=y +CONFIG_IWLDVM=m +CONFIG_IWLMVM=m +CONFIG_IWLWIFI_OPMODE_MODULAR=y +CONFIG_IWLWIFI_BCAST_FILTERING=y + +# +# Debugging Options +# +# CONFIG_IWLWIFI_DEBUG is not set +CONFIG_IWLWIFI_DEVICE_TRACING=y +CONFIG_WLAN_VENDOR_INTERSIL=y +CONFIG_HOSTAP=m +CONFIG_HOSTAP_FIRMWARE=y +CONFIG_HOSTAP_FIRMWARE_NVRAM=y +CONFIG_HOSTAP_PLX=m +CONFIG_HOSTAP_PCI=m +CONFIG_HOSTAP_CS=m +CONFIG_HERMES=m +CONFIG_HERMES_PRISM=y +CONFIG_HERMES_CACHE_FW_ON_INIT=y +CONFIG_PLX_HERMES=m +CONFIG_TMD_HERMES=m +CONFIG_NORTEL_HERMES=m +CONFIG_PCI_HERMES=m +CONFIG_PCMCIA_HERMES=m +CONFIG_PCMCIA_SPECTRUM=m +CONFIG_ORINOCO_USB=m +CONFIG_P54_COMMON=m +CONFIG_P54_USB=m +CONFIG_P54_PCI=m +CONFIG_P54_SPI=m +CONFIG_P54_SPI_DEFAULT_EEPROM=y +CONFIG_P54_LEDS=y +CONFIG_PRISM54=m +CONFIG_WLAN_VENDOR_MARVELL=y +CONFIG_LIBERTAS=m +CONFIG_LIBERTAS_USB=m +CONFIG_LIBERTAS_CS=m +CONFIG_LIBERTAS_SDIO=m +CONFIG_LIBERTAS_SPI=m +# CONFIG_LIBERTAS_DEBUG is not set +CONFIG_LIBERTAS_MESH=y +CONFIG_LIBERTAS_THINFIRM=m +# CONFIG_LIBERTAS_THINFIRM_DEBUG is not set +CONFIG_LIBERTAS_THINFIRM_USB=m +CONFIG_MWIFIEX=m +CONFIG_MWIFIEX_SDIO=m +CONFIG_MWIFIEX_PCIE=m +CONFIG_MWIFIEX_USB=m +CONFIG_MWL8K=m +CONFIG_WLAN_VENDOR_MEDIATEK=y +CONFIG_MT7601U=m +CONFIG_MT76_CORE=m 
+CONFIG_MT76_LEDS=y +CONFIG_MT76x2_COMMON=m +# CONFIG_MT76x0U is not set +CONFIG_MT76x2E=m +# CONFIG_MT76x2U is not set +CONFIG_WLAN_VENDOR_RALINK=y +CONFIG_RT2X00=m +CONFIG_RT2400PCI=m +CONFIG_RT2500PCI=m +CONFIG_RT61PCI=m +CONFIG_RT2800PCI=m +CONFIG_RT2800PCI_RT33XX=y +CONFIG_RT2800PCI_RT35XX=y +CONFIG_RT2800PCI_RT53XX=y +CONFIG_RT2800PCI_RT3290=y +CONFIG_RT2500USB=m +CONFIG_RT73USB=m +CONFIG_RT2800USB=m +CONFIG_RT2800USB_RT33XX=y +CONFIG_RT2800USB_RT35XX=y +CONFIG_RT2800USB_RT3573=y +CONFIG_RT2800USB_RT53XX=y +CONFIG_RT2800USB_RT55XX=y +CONFIG_RT2800USB_UNKNOWN=y +CONFIG_RT2800_LIB=m +CONFIG_RT2800_LIB_MMIO=m +CONFIG_RT2X00_LIB_MMIO=m +CONFIG_RT2X00_LIB_PCI=m +CONFIG_RT2X00_LIB_USB=m +CONFIG_RT2X00_LIB=m +CONFIG_RT2X00_LIB_FIRMWARE=y +CONFIG_RT2X00_LIB_CRYPTO=y +CONFIG_RT2X00_LIB_LEDS=y +# CONFIG_RT2X00_DEBUG is not set +CONFIG_WLAN_VENDOR_REALTEK=y +CONFIG_RTL8180=m +CONFIG_RTL8187=m +CONFIG_RTL8187_LEDS=y +CONFIG_RTL_CARDS=m +CONFIG_RTL8192CE=m +CONFIG_RTL8192SE=m +CONFIG_RTL8192DE=m +CONFIG_RTL8723AE=m +CONFIG_RTL8723BE=m +CONFIG_RTL8188EE=m +CONFIG_RTL8192EE=m +CONFIG_RTL8821AE=m +CONFIG_RTL8192CU=m +CONFIG_RTLWIFI=m +CONFIG_RTLWIFI_PCI=m +CONFIG_RTLWIFI_USB=m +# CONFIG_RTLWIFI_DEBUG is not set +CONFIG_RTL8192C_COMMON=m +CONFIG_RTL8723_COMMON=m +CONFIG_RTLBTCOEXIST=m +CONFIG_RTL8XXXU=m +CONFIG_RTL8XXXU_UNTESTED=y +CONFIG_WLAN_VENDOR_RSI=y +CONFIG_RSI_91X=m +# CONFIG_RSI_DEBUGFS is not set +CONFIG_RSI_SDIO=m +CONFIG_RSI_USB=m +CONFIG_RSI_COEX=y +CONFIG_WLAN_VENDOR_ST=y +CONFIG_CW1200=m +CONFIG_CW1200_WLAN_SDIO=m +CONFIG_CW1200_WLAN_SPI=m +CONFIG_WLAN_VENDOR_TI=y +CONFIG_WL1251=m +CONFIG_WL1251_SPI=m +CONFIG_WL1251_SDIO=m +CONFIG_WL12XX=m +CONFIG_WL18XX=m +CONFIG_WLCORE=m +CONFIG_WLCORE_SDIO=m +CONFIG_WILINK_PLATFORM_DATA=y +CONFIG_WLAN_VENDOR_ZYDAS=y +CONFIG_USB_ZD1201=m +CONFIG_ZD1211RW=m +# CONFIG_ZD1211RW_DEBUG is not set +CONFIG_WLAN_VENDOR_QUANTENNA=y +CONFIG_QTNFMAC=m +CONFIG_QTNFMAC_PEARL_PCIE=m +CONFIG_PCMCIA_RAYCS=m +CONFIG_PCMCIA_WL3501=m +# CONFIG_MAC80211_HWSIM is not set +CONFIG_USB_NET_RNDIS_WLAN=m + +# +# WiMAX Wireless Broadband devices +# +CONFIG_WIMAX_I2400M=m +CONFIG_WIMAX_I2400M_USB=m +CONFIG_WIMAX_I2400M_DEBUG_LEVEL=8 +CONFIG_WAN=y +CONFIG_LANMEDIA=m +CONFIG_HDLC=m +CONFIG_HDLC_RAW=m +CONFIG_HDLC_RAW_ETH=m +CONFIG_HDLC_CISCO=m +CONFIG_HDLC_FR=m +CONFIG_HDLC_PPP=m +CONFIG_HDLC_X25=m +CONFIG_PCI200SYN=m +CONFIG_WANXL=m +CONFIG_PC300TOO=m +CONFIG_FARSYNC=m +CONFIG_DSCC4=m +CONFIG_DSCC4_PCISYNC=y +CONFIG_DSCC4_PCI_RST=y +CONFIG_DLCI=m +CONFIG_DLCI_MAX=8 +CONFIG_LAPBETHER=m +CONFIG_X25_ASY=m +CONFIG_SBNI=m +CONFIG_SBNI_MULTILINE=y +CONFIG_IEEE802154_DRIVERS=m +CONFIG_IEEE802154_FAKELB=m +CONFIG_IEEE802154_AT86RF230=m +# CONFIG_IEEE802154_AT86RF230_DEBUGFS is not set +CONFIG_IEEE802154_MRF24J40=m +CONFIG_IEEE802154_CC2520=m +CONFIG_IEEE802154_ATUSB=m +CONFIG_IEEE802154_ADF7242=m +CONFIG_IEEE802154_CA8210=m +# CONFIG_IEEE802154_CA8210_DEBUGFS is not set +CONFIG_IEEE802154_MCR20A=m +# CONFIG_IEEE802154_HWSIM is not set +CONFIG_VMXNET3=m +CONFIG_FUJITSU_ES=m +CONFIG_THUNDERBOLT_NET=m +CONFIG_HYPERV_NET=m +CONFIG_NETDEVSIM=m +CONFIG_NET_FAILOVER=m +CONFIG_ISDN=y +CONFIG_ISDN_I4L=m +CONFIG_ISDN_PPP=y +CONFIG_ISDN_PPP_VJ=y +CONFIG_ISDN_MPP=y +CONFIG_IPPP_FILTER=y +CONFIG_ISDN_PPP_BSDCOMP=m +CONFIG_ISDN_AUDIO=y +CONFIG_ISDN_TTY_FAX=y +CONFIG_ISDN_X25=y + +# +# ISDN feature submodules +# +CONFIG_ISDN_DIVERSION=m + +# +# ISDN4Linux hardware drivers +# + +# +# Passive cards +# +CONFIG_ISDN_DRV_HISAX=m + +# +# D-channel protocol features +# +CONFIG_HISAX_EURO=y +CONFIG_DE_AOC=y 
+# CONFIG_HISAX_NO_SENDCOMPLETE is not set +# CONFIG_HISAX_NO_LLC is not set +# CONFIG_HISAX_NO_KEYPAD is not set +CONFIG_HISAX_1TR6=y +CONFIG_HISAX_NI1=y +CONFIG_HISAX_MAX_CARDS=8 + +# +# HiSax supported cards +# +CONFIG_HISAX_16_3=y +CONFIG_HISAX_TELESPCI=y +CONFIG_HISAX_S0BOX=y +CONFIG_HISAX_FRITZPCI=y +CONFIG_HISAX_AVM_A1_PCMCIA=y +CONFIG_HISAX_ELSA=y +CONFIG_HISAX_DIEHLDIVA=y +CONFIG_HISAX_SEDLBAUER=y +CONFIG_HISAX_NETJET=y +CONFIG_HISAX_NETJET_U=y +CONFIG_HISAX_NICCY=y +CONFIG_HISAX_BKM_A4T=y +CONFIG_HISAX_SCT_QUADRO=y +CONFIG_HISAX_GAZEL=y +CONFIG_HISAX_HFC_PCI=y +CONFIG_HISAX_W6692=y +CONFIG_HISAX_HFC_SX=y +CONFIG_HISAX_ENTERNOW_PCI=y +# CONFIG_HISAX_DEBUG is not set + +# +# HiSax PCMCIA card service modules +# +CONFIG_HISAX_SEDLBAUER_CS=m +CONFIG_HISAX_ELSA_CS=m +CONFIG_HISAX_AVM_A1_CS=m +CONFIG_HISAX_TELES_CS=m + +# +# HiSax sub driver modules +# +CONFIG_HISAX_ST5481=m +CONFIG_HISAX_HFCUSB=m +CONFIG_HISAX_HFC4S8S=m +CONFIG_HISAX_FRITZ_PCIPNP=m +CONFIG_ISDN_CAPI=m +CONFIG_CAPI_TRACE=y +CONFIG_ISDN_CAPI_CAPI20=m +CONFIG_ISDN_CAPI_MIDDLEWARE=y +CONFIG_ISDN_CAPI_CAPIDRV=m +# CONFIG_ISDN_CAPI_CAPIDRV_VERBOSE is not set + +# +# CAPI hardware drivers +# +CONFIG_CAPI_AVM=y +CONFIG_ISDN_DRV_AVMB1_B1PCI=m +CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y +CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m +CONFIG_ISDN_DRV_AVMB1_AVM_CS=m +CONFIG_ISDN_DRV_AVMB1_T1PCI=m +CONFIG_ISDN_DRV_AVMB1_C4=m +CONFIG_CAPI_EICON=y +CONFIG_ISDN_DIVAS=m +CONFIG_ISDN_DIVAS_BRIPCI=y +CONFIG_ISDN_DIVAS_PRIPCI=y +CONFIG_ISDN_DIVAS_DIVACAPI=m +CONFIG_ISDN_DIVAS_USERIDI=m +CONFIG_ISDN_DIVAS_MAINT=m +CONFIG_ISDN_DRV_GIGASET=m +CONFIG_GIGASET_CAPI=y +CONFIG_GIGASET_BASE=m +CONFIG_GIGASET_M105=m +CONFIG_GIGASET_M101=m +# CONFIG_GIGASET_DEBUG is not set +CONFIG_HYSDN=m +CONFIG_HYSDN_CAPI=y +CONFIG_MISDN=m +CONFIG_MISDN_DSP=m +CONFIG_MISDN_L1OIP=m + +# +# mISDN hardware drivers +# +CONFIG_MISDN_HFCPCI=m +CONFIG_MISDN_HFCMULTI=m +CONFIG_MISDN_HFCUSB=m +CONFIG_MISDN_AVMFRITZ=m +CONFIG_MISDN_SPEEDFAX=m +CONFIG_MISDN_INFINEON=m +CONFIG_MISDN_W6692=m +CONFIG_MISDN_NETJET=m +CONFIG_MISDN_IPAC=m +CONFIG_MISDN_ISAR=m +CONFIG_ISDN_HDLC=m +CONFIG_NVM=y +CONFIG_NVM_PBLK=m +# CONFIG_NVM_PBLK_DEBUG is not set + +# +# Input device support +# +CONFIG_INPUT=y +CONFIG_INPUT_LEDS=y +CONFIG_INPUT_FF_MEMLESS=m +CONFIG_INPUT_POLLDEV=m +CONFIG_INPUT_SPARSEKMAP=m +CONFIG_INPUT_MATRIXKMAP=m + +# +# Userland interfaces +# +CONFIG_INPUT_MOUSEDEV=y +CONFIG_INPUT_MOUSEDEV_PSAUX=y +CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 +CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 +CONFIG_INPUT_JOYDEV=m +CONFIG_INPUT_EVDEV=m +# CONFIG_INPUT_EVBUG is not set + +# +# Input Device Drivers +# +CONFIG_INPUT_KEYBOARD=y +CONFIG_KEYBOARD_ADC=m +CONFIG_KEYBOARD_ADP5588=m +CONFIG_KEYBOARD_ADP5589=m +CONFIG_KEYBOARD_ATKBD=y +CONFIG_KEYBOARD_QT1070=m +CONFIG_KEYBOARD_QT2160=m +CONFIG_KEYBOARD_DLINK_DIR685=m +CONFIG_KEYBOARD_LKKBD=m +CONFIG_KEYBOARD_GPIO=m +CONFIG_KEYBOARD_GPIO_POLLED=m +CONFIG_KEYBOARD_TCA6416=m +CONFIG_KEYBOARD_TCA8418=m +CONFIG_KEYBOARD_MATRIX=m +CONFIG_KEYBOARD_LM8323=m +CONFIG_KEYBOARD_LM8333=m +CONFIG_KEYBOARD_MAX7359=m +CONFIG_KEYBOARD_MCS=m +CONFIG_KEYBOARD_MPR121=m +CONFIG_KEYBOARD_NEWTON=m +CONFIG_KEYBOARD_OPENCORES=m +CONFIG_KEYBOARD_SAMSUNG=m +CONFIG_KEYBOARD_STOWAWAY=m +CONFIG_KEYBOARD_SUNKBD=m +CONFIG_KEYBOARD_TM2_TOUCHKEY=m +CONFIG_KEYBOARD_XTKBD=m +CONFIG_KEYBOARD_CROS_EC=m +CONFIG_KEYBOARD_MTK_PMIC=m +CONFIG_INPUT_MOUSE=y +CONFIG_MOUSE_PS2=m +CONFIG_MOUSE_PS2_ALPS=y +CONFIG_MOUSE_PS2_BYD=y +CONFIG_MOUSE_PS2_LOGIPS2PP=y +CONFIG_MOUSE_PS2_SYNAPTICS=y +CONFIG_MOUSE_PS2_SYNAPTICS_SMBUS=y 
+CONFIG_MOUSE_PS2_CYPRESS=y +CONFIG_MOUSE_PS2_LIFEBOOK=y +CONFIG_MOUSE_PS2_TRACKPOINT=y +CONFIG_MOUSE_PS2_ELANTECH=y +CONFIG_MOUSE_PS2_ELANTECH_SMBUS=y +CONFIG_MOUSE_PS2_SENTELIC=y +CONFIG_MOUSE_PS2_TOUCHKIT=y +CONFIG_MOUSE_PS2_FOCALTECH=y +# CONFIG_MOUSE_PS2_VMMOUSE is not set +CONFIG_MOUSE_PS2_SMBUS=y +CONFIG_MOUSE_SERIAL=m +CONFIG_MOUSE_APPLETOUCH=m +CONFIG_MOUSE_BCM5974=m +CONFIG_MOUSE_CYAPA=m +CONFIG_MOUSE_ELAN_I2C=m +CONFIG_MOUSE_ELAN_I2C_I2C=y +CONFIG_MOUSE_ELAN_I2C_SMBUS=y +CONFIG_MOUSE_VSXXXAA=m +CONFIG_MOUSE_GPIO=m +CONFIG_MOUSE_SYNAPTICS_I2C=m +CONFIG_MOUSE_SYNAPTICS_USB=m +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_ANALOG=m +CONFIG_JOYSTICK_A3D=m +CONFIG_JOYSTICK_ADI=m +CONFIG_JOYSTICK_COBRA=m +CONFIG_JOYSTICK_GF2K=m +CONFIG_JOYSTICK_GRIP=m +CONFIG_JOYSTICK_GRIP_MP=m +CONFIG_JOYSTICK_GUILLEMOT=m +CONFIG_JOYSTICK_INTERACT=m +CONFIG_JOYSTICK_SIDEWINDER=m +CONFIG_JOYSTICK_TMDC=m +CONFIG_JOYSTICK_IFORCE=m +CONFIG_JOYSTICK_IFORCE_USB=y +CONFIG_JOYSTICK_IFORCE_232=y +CONFIG_JOYSTICK_WARRIOR=m +CONFIG_JOYSTICK_MAGELLAN=m +CONFIG_JOYSTICK_SPACEORB=m +CONFIG_JOYSTICK_SPACEBALL=m +CONFIG_JOYSTICK_STINGER=m +CONFIG_JOYSTICK_TWIDJOY=m +CONFIG_JOYSTICK_ZHENHUA=m +CONFIG_JOYSTICK_DB9=m +CONFIG_JOYSTICK_GAMECON=m +CONFIG_JOYSTICK_TURBOGRAFX=m +CONFIG_JOYSTICK_AS5011=m +# CONFIG_JOYSTICK_JOYDUMP is not set +CONFIG_JOYSTICK_XPAD=m +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_JOYSTICK_WALKERA0701=m +CONFIG_JOYSTICK_PSXPAD_SPI=m +CONFIG_JOYSTICK_PSXPAD_SPI_FF=y +CONFIG_JOYSTICK_PXRC=m +CONFIG_INPUT_TABLET=y +CONFIG_TABLET_USB_ACECAD=m +CONFIG_TABLET_USB_AIPTEK=m +CONFIG_TABLET_USB_GTCO=m +CONFIG_TABLET_USB_HANWANG=m +CONFIG_TABLET_USB_KBTAB=m +CONFIG_TABLET_USB_PEGASUS=m +CONFIG_TABLET_SERIAL_WACOM4=m +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_TOUCHSCREEN_PROPERTIES=y +CONFIG_TOUCHSCREEN_ADS7846=m +CONFIG_TOUCHSCREEN_AD7877=m +CONFIG_TOUCHSCREEN_AD7879=m +CONFIG_TOUCHSCREEN_AD7879_I2C=m +CONFIG_TOUCHSCREEN_AD7879_SPI=m +# CONFIG_TOUCHSCREEN_ADC is not set +CONFIG_TOUCHSCREEN_ATMEL_MXT=m +# CONFIG_TOUCHSCREEN_ATMEL_MXT_T37 is not set +CONFIG_TOUCHSCREEN_AUO_PIXCIR=m +CONFIG_TOUCHSCREEN_BU21013=m +# CONFIG_TOUCHSCREEN_BU21029 is not set +CONFIG_TOUCHSCREEN_CHIPONE_ICN8505=m +CONFIG_TOUCHSCREEN_CY8CTMG110=m +CONFIG_TOUCHSCREEN_CYTTSP_CORE=m +CONFIG_TOUCHSCREEN_CYTTSP_I2C=m +CONFIG_TOUCHSCREEN_CYTTSP_SPI=m +CONFIG_TOUCHSCREEN_CYTTSP4_CORE=m +CONFIG_TOUCHSCREEN_CYTTSP4_I2C=m +CONFIG_TOUCHSCREEN_CYTTSP4_SPI=m +CONFIG_TOUCHSCREEN_DA9052=m +CONFIG_TOUCHSCREEN_DYNAPRO=m +CONFIG_TOUCHSCREEN_HAMPSHIRE=m +CONFIG_TOUCHSCREEN_EETI=m +CONFIG_TOUCHSCREEN_EGALAX_SERIAL=m +CONFIG_TOUCHSCREEN_EXC3000=m +CONFIG_TOUCHSCREEN_FUJITSU=m +CONFIG_TOUCHSCREEN_GOODIX=m +CONFIG_TOUCHSCREEN_HIDEEP=m +CONFIG_TOUCHSCREEN_ILI210X=m +CONFIG_TOUCHSCREEN_S6SY761=m +CONFIG_TOUCHSCREEN_GUNZE=m +CONFIG_TOUCHSCREEN_EKTF2127=m +CONFIG_TOUCHSCREEN_ELAN=m +CONFIG_TOUCHSCREEN_ELO=m +CONFIG_TOUCHSCREEN_WACOM_W8001=m +CONFIG_TOUCHSCREEN_WACOM_I2C=m +CONFIG_TOUCHSCREEN_MAX11801=m +CONFIG_TOUCHSCREEN_MCS5000=m +CONFIG_TOUCHSCREEN_MMS114=m +CONFIG_TOUCHSCREEN_MELFAS_MIP4=m +CONFIG_TOUCHSCREEN_MTOUCH=m +CONFIG_TOUCHSCREEN_INEXIO=m +CONFIG_TOUCHSCREEN_MK712=m +CONFIG_TOUCHSCREEN_PENMOUNT=m +CONFIG_TOUCHSCREEN_EDT_FT5X06=m +CONFIG_TOUCHSCREEN_TOUCHRIGHT=m +CONFIG_TOUCHSCREEN_TOUCHWIN=m +CONFIG_TOUCHSCREEN_TI_AM335X_TSC=m +CONFIG_TOUCHSCREEN_UCB1400=m +CONFIG_TOUCHSCREEN_PIXCIR=m +CONFIG_TOUCHSCREEN_WDT87XX_I2C=m +CONFIG_TOUCHSCREEN_WM831X=m +CONFIG_TOUCHSCREEN_WM97XX=m +CONFIG_TOUCHSCREEN_WM9705=y +CONFIG_TOUCHSCREEN_WM9712=y 
+CONFIG_TOUCHSCREEN_WM9713=y +CONFIG_TOUCHSCREEN_USB_COMPOSITE=m +CONFIG_TOUCHSCREEN_MC13783=m +CONFIG_TOUCHSCREEN_USB_EGALAX=y +CONFIG_TOUCHSCREEN_USB_PANJIT=y +CONFIG_TOUCHSCREEN_USB_3M=y +CONFIG_TOUCHSCREEN_USB_ITM=y +CONFIG_TOUCHSCREEN_USB_ETURBO=y +CONFIG_TOUCHSCREEN_USB_GUNZE=y +CONFIG_TOUCHSCREEN_USB_DMC_TSC10=y +CONFIG_TOUCHSCREEN_USB_IRTOUCH=y +CONFIG_TOUCHSCREEN_USB_IDEALTEK=y +CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH=y +CONFIG_TOUCHSCREEN_USB_GOTOP=y +CONFIG_TOUCHSCREEN_USB_JASTEC=y +CONFIG_TOUCHSCREEN_USB_ELO=y +CONFIG_TOUCHSCREEN_USB_E2I=y +CONFIG_TOUCHSCREEN_USB_ZYTRONIC=y +CONFIG_TOUCHSCREEN_USB_ETT_TC45USB=y +CONFIG_TOUCHSCREEN_USB_NEXIO=y +CONFIG_TOUCHSCREEN_USB_EASYTOUCH=y +CONFIG_TOUCHSCREEN_TOUCHIT213=m +CONFIG_TOUCHSCREEN_TSC_SERIO=m +CONFIG_TOUCHSCREEN_TSC200X_CORE=m +CONFIG_TOUCHSCREEN_TSC2004=m +CONFIG_TOUCHSCREEN_TSC2005=m +CONFIG_TOUCHSCREEN_TSC2007=m +CONFIG_TOUCHSCREEN_TSC2007_IIO=y +CONFIG_TOUCHSCREEN_PCAP=m +CONFIG_TOUCHSCREEN_RM_TS=m +CONFIG_TOUCHSCREEN_SILEAD=m +CONFIG_TOUCHSCREEN_SIS_I2C=m +CONFIG_TOUCHSCREEN_ST1232=m +CONFIG_TOUCHSCREEN_STMFTS=m +CONFIG_TOUCHSCREEN_SUR40=m +CONFIG_TOUCHSCREEN_SURFACE3_SPI=m +CONFIG_TOUCHSCREEN_SX8654=m +CONFIG_TOUCHSCREEN_TPS6507X=m +CONFIG_TOUCHSCREEN_ZET6223=m +CONFIG_TOUCHSCREEN_ZFORCE=m +CONFIG_TOUCHSCREEN_ROHM_BU21023=m +CONFIG_INPUT_MISC=y +CONFIG_INPUT_88PM80X_ONKEY=m +CONFIG_INPUT_AD714X=m +CONFIG_INPUT_AD714X_I2C=m +CONFIG_INPUT_AD714X_SPI=m +CONFIG_INPUT_ARIZONA_HAPTICS=m +CONFIG_INPUT_BMA150=m +CONFIG_INPUT_E3X0_BUTTON=m +CONFIG_INPUT_PCSPKR=m +CONFIG_INPUT_MAX77693_HAPTIC=m +CONFIG_INPUT_MC13783_PWRBUTTON=m +CONFIG_INPUT_MMA8450=m +CONFIG_INPUT_APANEL=m +CONFIG_INPUT_GP2A=m +CONFIG_INPUT_GPIO_BEEPER=m +CONFIG_INPUT_GPIO_DECODER=m +CONFIG_INPUT_ATLAS_BTNS=m +CONFIG_INPUT_ATI_REMOTE2=m +CONFIG_INPUT_KEYSPAN_REMOTE=m +CONFIG_INPUT_KXTJ9=m +CONFIG_INPUT_KXTJ9_POLLED_MODE=y +CONFIG_INPUT_POWERMATE=m +CONFIG_INPUT_YEALINK=m +CONFIG_INPUT_CM109=m +CONFIG_INPUT_REGULATOR_HAPTIC=m +CONFIG_INPUT_RETU_PWRBUTTON=m +CONFIG_INPUT_AXP20X_PEK=m +CONFIG_INPUT_UINPUT=m +CONFIG_INPUT_PCF50633_PMU=m +CONFIG_INPUT_PCF8574=m +CONFIG_INPUT_PWM_BEEPER=m +CONFIG_INPUT_PWM_VIBRA=m +CONFIG_INPUT_GPIO_ROTARY_ENCODER=m +CONFIG_INPUT_DA9052_ONKEY=m +CONFIG_INPUT_DA9063_ONKEY=m +CONFIG_INPUT_WM831X_ON=m +CONFIG_INPUT_PCAP=m +CONFIG_INPUT_ADXL34X=m +CONFIG_INPUT_ADXL34X_I2C=m +CONFIG_INPUT_ADXL34X_SPI=m +CONFIG_INPUT_IMS_PCU=m +CONFIG_INPUT_CMA3000=m +CONFIG_INPUT_CMA3000_I2C=m +CONFIG_INPUT_IDEAPAD_SLIDEBAR=m +CONFIG_INPUT_SOC_BUTTON_ARRAY=m +CONFIG_INPUT_DRV260X_HAPTICS=m +CONFIG_INPUT_DRV2665_HAPTICS=m +CONFIG_INPUT_DRV2667_HAPTICS=m +CONFIG_INPUT_RAVE_SP_PWRBUTTON=m +CONFIG_RMI4_CORE=m +CONFIG_RMI4_I2C=m +CONFIG_RMI4_SPI=m +CONFIG_RMI4_SMB=m +CONFIG_RMI4_F03=y +CONFIG_RMI4_F03_SERIO=m +CONFIG_RMI4_2D_SENSOR=y +CONFIG_RMI4_F11=y +CONFIG_RMI4_F12=y +CONFIG_RMI4_F30=y +CONFIG_RMI4_F34=y +CONFIG_RMI4_F54=y +CONFIG_RMI4_F55=y + +# +# Hardware I/O ports +# +CONFIG_SERIO=y +CONFIG_ARCH_MIGHT_HAVE_PC_SERIO=y +CONFIG_SERIO_I8042=y +CONFIG_SERIO_SERPORT=m +CONFIG_SERIO_CT82C710=m +CONFIG_SERIO_PARKBD=m +CONFIG_SERIO_PCIPS2=m +CONFIG_SERIO_LIBPS2=y +CONFIG_SERIO_RAW=m +CONFIG_SERIO_ALTERA_PS2=m +CONFIG_SERIO_PS2MULT=m +CONFIG_SERIO_ARC_PS2=m +CONFIG_HYPERV_KEYBOARD=m +CONFIG_SERIO_GPIO_PS2=m +CONFIG_USERIO=m +CONFIG_GAMEPORT=m +CONFIG_GAMEPORT_NS558=m +CONFIG_GAMEPORT_L4=m +CONFIG_GAMEPORT_EMU10K1=m +CONFIG_GAMEPORT_FM801=m + +# +# Character devices +# +CONFIG_TTY=y +CONFIG_VT=y +CONFIG_CONSOLE_TRANSLATIONS=y +CONFIG_VT_CONSOLE=y 
+CONFIG_VT_CONSOLE_SLEEP=y +CONFIG_HW_CONSOLE=y +CONFIG_VT_HW_CONSOLE_BINDING=y +CONFIG_UNIX98_PTYS=y +CONFIG_LEGACY_PTYS=y +CONFIG_LEGACY_PTY_COUNT=256 +CONFIG_SERIAL_NONSTANDARD=y +CONFIG_ROCKETPORT=m +CONFIG_CYCLADES=m +CONFIG_CYZ_INTR=y +CONFIG_MOXA_INTELLIO=m +CONFIG_MOXA_SMARTIO=m +CONFIG_SYNCLINK=m +CONFIG_SYNCLINKMP=m +CONFIG_SYNCLINK_GT=m +CONFIG_NOZOMI=m +CONFIG_ISI=m +CONFIG_N_HDLC=m +CONFIG_N_GSM=m +CONFIG_TRACE_ROUTER=m +CONFIG_TRACE_SINK=m +CONFIG_DEVMEM=y +# CONFIG_DEVKMEM is not set + +# +# Serial drivers +# +CONFIG_SERIAL_EARLYCON=y +CONFIG_SERIAL_8250=y +# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set +CONFIG_SERIAL_8250_PNP=y +CONFIG_SERIAL_8250_FINTEK=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_DMA=y +CONFIG_SERIAL_8250_PCI=y +CONFIG_SERIAL_8250_EXAR=y +CONFIG_SERIAL_8250_CS=m +CONFIG_SERIAL_8250_MEN_MCB=m +CONFIG_SERIAL_8250_NR_UARTS=4 +CONFIG_SERIAL_8250_RUNTIME_UARTS=4 +CONFIG_SERIAL_8250_EXTENDED=y +CONFIG_SERIAL_8250_MANY_PORTS=y +CONFIG_SERIAL_8250_SHARE_IRQ=y +CONFIG_SERIAL_8250_DETECT_IRQ=y +CONFIG_SERIAL_8250_RSA=y +CONFIG_SERIAL_8250_DW=m +CONFIG_SERIAL_8250_RT288X=y +CONFIG_SERIAL_8250_LPSS=y +CONFIG_SERIAL_8250_MID=y +CONFIG_SERIAL_8250_MOXA=m + +# +# Non-8250 serial port support +# +CONFIG_SERIAL_MAX3100=m +CONFIG_SERIAL_MAX310X=y +CONFIG_SERIAL_UARTLITE=m +CONFIG_SERIAL_UARTLITE_NR_UARTS=1 +CONFIG_SERIAL_CORE=y +CONFIG_SERIAL_CORE_CONSOLE=y +CONFIG_SERIAL_JSM=m +CONFIG_SERIAL_SCCNXP=m +CONFIG_SERIAL_SC16IS7XX_CORE=m +CONFIG_SERIAL_SC16IS7XX=m +CONFIG_SERIAL_SC16IS7XX_I2C=y +CONFIG_SERIAL_SC16IS7XX_SPI=y +CONFIG_SERIAL_ALTERA_JTAGUART=m +CONFIG_SERIAL_ALTERA_UART=m +CONFIG_SERIAL_ALTERA_UART_MAXPORTS=4 +CONFIG_SERIAL_ALTERA_UART_BAUDRATE=115200 +CONFIG_SERIAL_IFX6X60=m +CONFIG_SERIAL_ARC=m +CONFIG_SERIAL_ARC_NR_PORTS=1 +CONFIG_SERIAL_RP2=m +CONFIG_SERIAL_RP2_NR_UARTS=32 +CONFIG_SERIAL_FSL_LPUART=m +CONFIG_SERIAL_MEN_Z135=m +CONFIG_SERIAL_DEV_BUS=m +CONFIG_PRINTER=m +CONFIG_LP_CONSOLE=y +CONFIG_PPDEV=m +CONFIG_HVC_DRIVER=y +CONFIG_VIRTIO_CONSOLE=m +CONFIG_IPMI_HANDLER=m +CONFIG_IPMI_DMI_DECODE=y +CONFIG_IPMI_PANIC_EVENT=y +CONFIG_IPMI_PANIC_STRING=y +CONFIG_IPMI_DEVICE_INTERFACE=m +CONFIG_IPMI_SI=m +CONFIG_IPMI_SSIF=m +CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m +CONFIG_HW_RANDOM=m +CONFIG_HW_RANDOM_TIMERIOMEM=m +CONFIG_HW_RANDOM_INTEL=m +CONFIG_HW_RANDOM_AMD=m +CONFIG_HW_RANDOM_VIA=m +CONFIG_HW_RANDOM_VIRTIO=m +CONFIG_NVRAM=m +CONFIG_R3964=m +CONFIG_APPLICOM=m + +# +# PCMCIA character devices +# +CONFIG_SYNCLINK_CS=m +CONFIG_CARDMAN_4000=m +CONFIG_CARDMAN_4040=m +CONFIG_SCR24X=m +CONFIG_IPWIRELESS=m +CONFIG_MWAVE=m +CONFIG_RAW_DRIVER=m +CONFIG_MAX_RAW_DEVS=256 +CONFIG_HPET=y +CONFIG_HPET_MMAP=y +CONFIG_HPET_MMAP_DEFAULT=y +CONFIG_HANGCHECK_TIMER=m +CONFIG_TCG_TPM=m +CONFIG_HW_RANDOM_TPM=y +CONFIG_TCG_TIS_CORE=m +CONFIG_TCG_TIS=m +CONFIG_TCG_TIS_SPI=m +CONFIG_TCG_TIS_I2C_ATMEL=m +CONFIG_TCG_TIS_I2C_INFINEON=m +CONFIG_TCG_TIS_I2C_NUVOTON=m +CONFIG_TCG_NSC=m +CONFIG_TCG_ATMEL=m +CONFIG_TCG_INFINEON=m +CONFIG_TCG_CRB=m +CONFIG_TCG_VTPM_PROXY=m +CONFIG_TCG_TIS_ST33ZP24=m +CONFIG_TCG_TIS_ST33ZP24_I2C=m +CONFIG_TCG_TIS_ST33ZP24_SPI=m +CONFIG_TELCLOCK=m +CONFIG_DEVPORT=y +CONFIG_XILLYBUS=m +CONFIG_XILLYBUS_PCIE=m +# CONFIG_RANDOM_TRUST_CPU is not set + +# +# I2C support +# +CONFIG_I2C=m +CONFIG_I2C_BOARDINFO=y +CONFIG_I2C_COMPAT=y +CONFIG_I2C_CHARDEV=m +CONFIG_I2C_MUX=m + +# +# Multiplexer I2C Chip support +# +CONFIG_I2C_MUX_GPIO=m +CONFIG_I2C_MUX_LTC4306=m +CONFIG_I2C_MUX_PCA9541=m +CONFIG_I2C_MUX_PCA954x=m +CONFIG_I2C_MUX_REG=m 
+CONFIG_I2C_MUX_MLXCPLD=m +CONFIG_I2C_HELPER_AUTO=y +CONFIG_I2C_SMBUS=m +CONFIG_I2C_ALGOBIT=m +CONFIG_I2C_ALGOPCA=m + +# +# I2C Hardware Bus support +# + +# +# PC SMBus host controller drivers +# +CONFIG_I2C_ALI1535=m +CONFIG_I2C_ALI1563=m +CONFIG_I2C_ALI15X3=m +CONFIG_I2C_AMD756=m +CONFIG_I2C_AMD756_S4882=m +CONFIG_I2C_AMD8111=m +CONFIG_I2C_I801=m +CONFIG_I2C_ISCH=m +CONFIG_I2C_ISMT=m +CONFIG_I2C_PIIX4=m +CONFIG_I2C_NFORCE2=m +CONFIG_I2C_NFORCE2_S4985=m +CONFIG_I2C_SIS5595=m +CONFIG_I2C_SIS630=m +CONFIG_I2C_SIS96X=m +CONFIG_I2C_VIA=m +CONFIG_I2C_VIAPRO=m + +# +# ACPI drivers +# +CONFIG_I2C_SCMI=m + +# +# I2C system bus drivers (mostly embedded / system-on-chip) +# +CONFIG_I2C_CBUS_GPIO=m +CONFIG_I2C_DESIGNWARE_CORE=m +CONFIG_I2C_DESIGNWARE_PLATFORM=m +# CONFIG_I2C_DESIGNWARE_SLAVE is not set +CONFIG_I2C_DESIGNWARE_PCI=m +# CONFIG_I2C_DESIGNWARE_BAYTRAIL is not set +CONFIG_I2C_EMEV2=m +CONFIG_I2C_GPIO=m +# CONFIG_I2C_GPIO_FAULT_INJECTOR is not set +CONFIG_I2C_KEMPLD=m +CONFIG_I2C_OCORES=m +CONFIG_I2C_PCA_PLATFORM=m +CONFIG_I2C_SIMTEC=m +CONFIG_I2C_XILINX=m + +# +# External I2C/SMBus adapter drivers +# +CONFIG_I2C_DIOLAN_U2C=m +CONFIG_I2C_DLN2=m +CONFIG_I2C_PARPORT=m +CONFIG_I2C_PARPORT_LIGHT=m +CONFIG_I2C_ROBOTFUZZ_OSIF=m +CONFIG_I2C_TAOS_EVM=m +CONFIG_I2C_TINY_USB=m +CONFIG_I2C_VIPERBOARD=m + +# +# Other I2C/SMBus bus drivers +# +CONFIG_I2C_MLXCPLD=m +CONFIG_I2C_CROS_EC_TUNNEL=m +# CONFIG_I2C_STUB is not set +CONFIG_I2C_SLAVE=y +CONFIG_I2C_SLAVE_EEPROM=m +# CONFIG_I2C_DEBUG_CORE is not set +# CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +CONFIG_SPI=y +# CONFIG_SPI_DEBUG is not set +CONFIG_SPI_MASTER=y +CONFIG_SPI_MEM=y + +# +# SPI Master Controller Drivers +# +CONFIG_SPI_ALTERA=m +CONFIG_SPI_AXI_SPI_ENGINE=m +CONFIG_SPI_BITBANG=m +CONFIG_SPI_BUTTERFLY=m +CONFIG_SPI_CADENCE=m +CONFIG_SPI_DESIGNWARE=m +CONFIG_SPI_DW_PCI=m +CONFIG_SPI_DW_MID_DMA=y +CONFIG_SPI_DW_MMIO=m +CONFIG_SPI_DLN2=m +CONFIG_SPI_GPIO=m +CONFIG_SPI_LM70_LLP=m +CONFIG_SPI_OC_TINY=m +CONFIG_SPI_PXA2XX=m +CONFIG_SPI_PXA2XX_PCI=m +CONFIG_SPI_ROCKCHIP=m +CONFIG_SPI_SC18IS602=m +CONFIG_SPI_XCOMM=m +CONFIG_SPI_XILINX=m +CONFIG_SPI_ZYNQMP_GQSPI=m + +# +# SPI Protocol Masters +# +CONFIG_SPI_SPIDEV=m +CONFIG_SPI_LOOPBACK_TEST=m +CONFIG_SPI_TLE62X0=m +CONFIG_SPI_SLAVE=y +CONFIG_SPI_SLAVE_TIME=m +CONFIG_SPI_SLAVE_SYSTEM_CONTROL=m +CONFIG_SPMI=m +CONFIG_HSI=m +CONFIG_HSI_BOARDINFO=y + +# +# HSI controllers +# + +# +# HSI clients +# +CONFIG_HSI_CHAR=m +CONFIG_PPS=y +# CONFIG_PPS_DEBUG is not set +# CONFIG_NTP_PPS is not set + +# +# PPS clients support +# +# CONFIG_PPS_CLIENT_KTIMER is not set +CONFIG_PPS_CLIENT_LDISC=m +CONFIG_PPS_CLIENT_PARPORT=m +CONFIG_PPS_CLIENT_GPIO=m + +# +# PPS generators support +# + +# +# PTP clock support +# +CONFIG_PTP_1588_CLOCK=y +CONFIG_DP83640_PHY=m +CONFIG_PTP_1588_CLOCK_KVM=m +CONFIG_PINCTRL=y +CONFIG_PINMUX=y +CONFIG_PINCONF=y +CONFIG_GENERIC_PINCONF=y +# CONFIG_DEBUG_PINCTRL is not set +CONFIG_PINCTRL_AMD=m +CONFIG_PINCTRL_MCP23S08=m +CONFIG_PINCTRL_BAYTRAIL=y +CONFIG_PINCTRL_CHERRYVIEW=m +CONFIG_PINCTRL_INTEL=m +CONFIG_PINCTRL_BROXTON=m +CONFIG_PINCTRL_CANNONLAKE=m +CONFIG_PINCTRL_CEDARFORK=m +CONFIG_PINCTRL_DENVERTON=m +CONFIG_PINCTRL_GEMINILAKE=m +# CONFIG_PINCTRL_ICELAKE is not set +CONFIG_PINCTRL_LEWISBURG=m +CONFIG_PINCTRL_SUNRISEPOINT=m +CONFIG_GPIOLIB=y +CONFIG_GPIOLIB_FASTPATH_LIMIT=512 +CONFIG_GPIO_ACPI=y +CONFIG_GPIOLIB_IRQCHIP=y +# CONFIG_DEBUG_GPIO is not set +CONFIG_GPIO_SYSFS=y +CONFIG_GPIO_GENERIC=m +CONFIG_GPIO_MAX730X=m + +# +# Memory mapped GPIO drivers +# 
+CONFIG_GPIO_AMDPT=m +CONFIG_GPIO_DWAPB=m +CONFIG_GPIO_EXAR=m +CONFIG_GPIO_GENERIC_PLATFORM=m +CONFIG_GPIO_ICH=m +CONFIG_GPIO_LYNXPOINT=y +CONFIG_GPIO_MB86S7X=m +CONFIG_GPIO_MENZ127=m +CONFIG_GPIO_MOCKUP=m +CONFIG_GPIO_VX855=m + +# +# Port-mapped I/O GPIO drivers +# +CONFIG_GPIO_F7188X=m +CONFIG_GPIO_IT87=m +CONFIG_GPIO_SCH=m +CONFIG_GPIO_SCH311X=m +CONFIG_GPIO_WINBOND=m +CONFIG_GPIO_WS16C48=m + +# +# I2C GPIO expanders +# +CONFIG_GPIO_ADP5588=m +CONFIG_GPIO_MAX7300=m +CONFIG_GPIO_MAX732X=m +CONFIG_GPIO_PCA953X=m +CONFIG_GPIO_PCF857X=m +CONFIG_GPIO_TPIC2810=m + +# +# MFD GPIO expanders +# +CONFIG_GPIO_ARIZONA=m +CONFIG_GPIO_BD9571MWV=m +CONFIG_GPIO_DA9052=m +CONFIG_GPIO_DLN2=m +CONFIG_GPIO_JANZ_TTL=m +CONFIG_GPIO_KEMPLD=m +CONFIG_GPIO_LP3943=m +CONFIG_GPIO_LP873X=m +CONFIG_GPIO_TPS65086=m +CONFIG_GPIO_TPS65912=m +CONFIG_GPIO_UCB1400=m +CONFIG_GPIO_WHISKEY_COVE=m +CONFIG_GPIO_WM831X=m +CONFIG_GPIO_WM8994=m + +# +# PCI GPIO expanders +# +CONFIG_GPIO_AMD8111=m +CONFIG_GPIO_ML_IOH=m +CONFIG_GPIO_PCI_IDIO_16=m +CONFIG_GPIO_PCIE_IDIO_24=m +CONFIG_GPIO_RDC321X=m + +# +# SPI GPIO expanders +# +CONFIG_GPIO_MAX3191X=m +CONFIG_GPIO_MAX7301=m +CONFIG_GPIO_MC33880=m +CONFIG_GPIO_PISOSR=m +CONFIG_GPIO_XRA1403=m + +# +# USB GPIO expanders +# +CONFIG_GPIO_VIPERBOARD=m +CONFIG_W1=m +CONFIG_W1_CON=y + +# +# 1-wire Bus Masters +# +CONFIG_W1_MASTER_MATROX=m +CONFIG_W1_MASTER_DS2490=m +CONFIG_W1_MASTER_DS2482=m +CONFIG_W1_MASTER_DS1WM=m +CONFIG_W1_MASTER_GPIO=m + +# +# 1-wire Slaves +# +CONFIG_W1_SLAVE_THERM=m +CONFIG_W1_SLAVE_SMEM=m +# CONFIG_W1_SLAVE_DS2405 is not set +CONFIG_W1_SLAVE_DS2408=m +# CONFIG_W1_SLAVE_DS2408_READBACK is not set +CONFIG_W1_SLAVE_DS2413=m +CONFIG_W1_SLAVE_DS2406=m +CONFIG_W1_SLAVE_DS2423=m +CONFIG_W1_SLAVE_DS2805=m +CONFIG_W1_SLAVE_DS2431=m +CONFIG_W1_SLAVE_DS2433=m +CONFIG_W1_SLAVE_DS2433_CRC=y +CONFIG_W1_SLAVE_DS2438=m +CONFIG_W1_SLAVE_DS2780=m +CONFIG_W1_SLAVE_DS2781=m +CONFIG_W1_SLAVE_DS28E04=m +CONFIG_W1_SLAVE_DS28E17=m +CONFIG_POWER_AVS=y +CONFIG_POWER_RESET=y +# CONFIG_POWER_RESET_RESTART is not set +CONFIG_POWER_SUPPLY=y +# CONFIG_POWER_SUPPLY_DEBUG is not set +CONFIG_PDA_POWER=m +CONFIG_GENERIC_ADC_BATTERY=m +CONFIG_WM831X_BACKUP=m +CONFIG_WM831X_POWER=m +# CONFIG_TEST_POWER is not set +# CONFIG_CHARGER_ADP5061 is not set +CONFIG_BATTERY_DS2760=m +CONFIG_BATTERY_DS2780=m +CONFIG_BATTERY_DS2781=m +CONFIG_BATTERY_DS2782=m +CONFIG_BATTERY_SBS=m +CONFIG_CHARGER_SBS=m +CONFIG_MANAGER_SBS=m +CONFIG_BATTERY_BQ27XXX=m +CONFIG_BATTERY_BQ27XXX_I2C=m +CONFIG_BATTERY_BQ27XXX_HDQ=m +# CONFIG_BATTERY_BQ27XXX_DT_UPDATES_NVM is not set +CONFIG_BATTERY_DA9052=m +CONFIG_CHARGER_DA9150=m +CONFIG_BATTERY_DA9150=m +CONFIG_CHARGER_AXP20X=m +CONFIG_BATTERY_AXP20X=m +CONFIG_AXP20X_POWER=m +CONFIG_AXP288_CHARGER=m +CONFIG_AXP288_FUEL_GAUGE=m +CONFIG_BATTERY_MAX17040=m +CONFIG_BATTERY_MAX17042=m +CONFIG_BATTERY_MAX1721X=m +CONFIG_CHARGER_PCF50633=m +CONFIG_CHARGER_ISP1704=m +CONFIG_CHARGER_MAX8903=m +CONFIG_CHARGER_LP8727=m +CONFIG_CHARGER_GPIO=m +CONFIG_CHARGER_MANAGER=y +CONFIG_CHARGER_LTC3651=m +CONFIG_CHARGER_MAX14577=m +CONFIG_CHARGER_MAX77693=m +CONFIG_CHARGER_BQ2415X=m +CONFIG_CHARGER_BQ24190=m +CONFIG_CHARGER_BQ24257=m +CONFIG_CHARGER_BQ24735=m +CONFIG_CHARGER_BQ25890=m +CONFIG_CHARGER_SMB347=m +CONFIG_BATTERY_GAUGE_LTC2941=m +CONFIG_BATTERY_RT5033=m +CONFIG_CHARGER_RT9455=m +# CONFIG_CHARGER_CROS_USBPD is not set +CONFIG_HWMON=m +CONFIG_HWMON_VID=m +# CONFIG_HWMON_DEBUG_CHIP is not set + +# +# Native drivers +# +CONFIG_SENSORS_ABITUGURU=m +CONFIG_SENSORS_ABITUGURU3=m 
+CONFIG_SENSORS_AD7314=m +CONFIG_SENSORS_AD7414=m +CONFIG_SENSORS_AD7418=m +CONFIG_SENSORS_ADM1021=m +CONFIG_SENSORS_ADM1025=m +CONFIG_SENSORS_ADM1026=m +CONFIG_SENSORS_ADM1029=m +CONFIG_SENSORS_ADM1031=m +CONFIG_SENSORS_ADM9240=m +CONFIG_SENSORS_ADT7X10=m +CONFIG_SENSORS_ADT7310=m +CONFIG_SENSORS_ADT7410=m +CONFIG_SENSORS_ADT7411=m +CONFIG_SENSORS_ADT7462=m +CONFIG_SENSORS_ADT7470=m +CONFIG_SENSORS_ADT7475=m +CONFIG_SENSORS_ASC7621=m +CONFIG_SENSORS_K8TEMP=m +CONFIG_SENSORS_K10TEMP=m +CONFIG_SENSORS_FAM15H_POWER=m +CONFIG_SENSORS_APPLESMC=m +CONFIG_SENSORS_ASB100=m +CONFIG_SENSORS_ASPEED=m +CONFIG_SENSORS_ATXP1=m +CONFIG_SENSORS_DS620=m +CONFIG_SENSORS_DS1621=m +CONFIG_SENSORS_DELL_SMM=m +CONFIG_SENSORS_DA9052_ADC=m +CONFIG_SENSORS_I5K_AMB=m +CONFIG_SENSORS_F71805F=m +CONFIG_SENSORS_F71882FG=m +CONFIG_SENSORS_F75375S=m +CONFIG_SENSORS_MC13783_ADC=m +CONFIG_SENSORS_FSCHMD=m +CONFIG_SENSORS_FTSTEUTATES=m +CONFIG_SENSORS_GL518SM=m +CONFIG_SENSORS_GL520SM=m +CONFIG_SENSORS_G760A=m +CONFIG_SENSORS_G762=m +CONFIG_SENSORS_HIH6130=m +CONFIG_SENSORS_IBMAEM=m +CONFIG_SENSORS_IBMPEX=m +CONFIG_SENSORS_IIO_HWMON=m +CONFIG_SENSORS_I5500=m +CONFIG_SENSORS_CORETEMP=m +CONFIG_SENSORS_IT87=m +CONFIG_SENSORS_JC42=m +CONFIG_SENSORS_POWR1220=m +CONFIG_SENSORS_LINEAGE=m +CONFIG_SENSORS_LTC2945=m +CONFIG_SENSORS_LTC2990=m +CONFIG_SENSORS_LTC4151=m +CONFIG_SENSORS_LTC4215=m +CONFIG_SENSORS_LTC4222=m +CONFIG_SENSORS_LTC4245=m +CONFIG_SENSORS_LTC4260=m +CONFIG_SENSORS_LTC4261=m +CONFIG_SENSORS_MAX1111=m +CONFIG_SENSORS_MAX16065=m +CONFIG_SENSORS_MAX1619=m +CONFIG_SENSORS_MAX1668=m +CONFIG_SENSORS_MAX197=m +CONFIG_SENSORS_MAX31722=m +CONFIG_SENSORS_MAX6621=m +CONFIG_SENSORS_MAX6639=m +CONFIG_SENSORS_MAX6642=m +CONFIG_SENSORS_MAX6650=m +CONFIG_SENSORS_MAX6697=m +CONFIG_SENSORS_MAX31790=m +CONFIG_SENSORS_MCP3021=m +# CONFIG_SENSORS_MLXREG_FAN is not set +CONFIG_SENSORS_TC654=m +CONFIG_SENSORS_MENF21BMC_HWMON=m +CONFIG_SENSORS_ADCXX=m +CONFIG_SENSORS_LM63=m +CONFIG_SENSORS_LM70=m +CONFIG_SENSORS_LM73=m +CONFIG_SENSORS_LM75=m +CONFIG_SENSORS_LM77=m +CONFIG_SENSORS_LM78=m +CONFIG_SENSORS_LM80=m +CONFIG_SENSORS_LM83=m +CONFIG_SENSORS_LM85=m +CONFIG_SENSORS_LM87=m +CONFIG_SENSORS_LM90=m +CONFIG_SENSORS_LM92=m +CONFIG_SENSORS_LM93=m +CONFIG_SENSORS_LM95234=m +CONFIG_SENSORS_LM95241=m +CONFIG_SENSORS_LM95245=m +CONFIG_SENSORS_PC87360=m +CONFIG_SENSORS_PC87427=m +CONFIG_SENSORS_NTC_THERMISTOR=m +CONFIG_SENSORS_NCT6683=m +CONFIG_SENSORS_NCT6775=m +CONFIG_SENSORS_NCT7802=m +CONFIG_SENSORS_NCT7904=m +# CONFIG_SENSORS_NPCM7XX is not set +CONFIG_SENSORS_PCF8591=m +CONFIG_PMBUS=m +CONFIG_SENSORS_PMBUS=m +CONFIG_SENSORS_ADM1275=m +CONFIG_SENSORS_IBM_CFFPS=m +CONFIG_SENSORS_IR35221=m +CONFIG_SENSORS_LM25066=m +CONFIG_SENSORS_LTC2978=m +CONFIG_SENSORS_LTC2978_REGULATOR=y +CONFIG_SENSORS_LTC3815=m +CONFIG_SENSORS_MAX16064=m +CONFIG_SENSORS_MAX20751=m +CONFIG_SENSORS_MAX31785=m +CONFIG_SENSORS_MAX34440=m +CONFIG_SENSORS_MAX8688=m +CONFIG_SENSORS_TPS40422=m +CONFIG_SENSORS_TPS53679=m +CONFIG_SENSORS_UCD9000=m +CONFIG_SENSORS_UCD9200=m +CONFIG_SENSORS_ZL6100=m +CONFIG_SENSORS_SHT15=m +CONFIG_SENSORS_SHT21=m +CONFIG_SENSORS_SHT3x=m +CONFIG_SENSORS_SHTC1=m +CONFIG_SENSORS_SIS5595=m +CONFIG_SENSORS_DME1737=m +CONFIG_SENSORS_EMC1403=m +CONFIG_SENSORS_EMC2103=m +CONFIG_SENSORS_EMC6W201=m +CONFIG_SENSORS_SMSC47M1=m +CONFIG_SENSORS_SMSC47M192=m +CONFIG_SENSORS_SMSC47B397=m +CONFIG_SENSORS_SCH56XX_COMMON=m +CONFIG_SENSORS_SCH5627=m +CONFIG_SENSORS_SCH5636=m +CONFIG_SENSORS_STTS751=m +CONFIG_SENSORS_SMM665=m +CONFIG_SENSORS_ADC128D818=m 
+CONFIG_SENSORS_ADS1015=m +CONFIG_SENSORS_ADS7828=m +CONFIG_SENSORS_ADS7871=m +CONFIG_SENSORS_AMC6821=m +CONFIG_SENSORS_INA209=m +CONFIG_SENSORS_INA2XX=m +CONFIG_SENSORS_INA3221=m +CONFIG_SENSORS_TC74=m +CONFIG_SENSORS_THMC50=m +CONFIG_SENSORS_TMP102=m +CONFIG_SENSORS_TMP103=m +CONFIG_SENSORS_TMP108=m +CONFIG_SENSORS_TMP401=m +CONFIG_SENSORS_TMP421=m +CONFIG_SENSORS_VIA_CPUTEMP=m +CONFIG_SENSORS_VIA686A=m +CONFIG_SENSORS_VT1211=m +CONFIG_SENSORS_VT8231=m +CONFIG_SENSORS_W83773G=m +CONFIG_SENSORS_W83781D=m +CONFIG_SENSORS_W83791D=m +CONFIG_SENSORS_W83792D=m +CONFIG_SENSORS_W83793=m +CONFIG_SENSORS_W83795=m +# CONFIG_SENSORS_W83795_FANCTRL is not set +CONFIG_SENSORS_W83L785TS=m +CONFIG_SENSORS_W83L786NG=m +CONFIG_SENSORS_W83627HF=m +CONFIG_SENSORS_W83627EHF=m +CONFIG_SENSORS_WM831X=m +CONFIG_SENSORS_XGENE=m + +# +# ACPI drivers +# +CONFIG_SENSORS_ACPI_POWER=m +CONFIG_SENSORS_ATK0110=m +CONFIG_THERMAL=y +# CONFIG_THERMAL_STATISTICS is not set +CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=0 +CONFIG_THERMAL_WRITABLE_TRIPS=y +CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y +# CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE is not set +# CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE is not set +# CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR is not set +CONFIG_THERMAL_GOV_FAIR_SHARE=y +CONFIG_THERMAL_GOV_STEP_WISE=y +CONFIG_THERMAL_GOV_BANG_BANG=y +CONFIG_THERMAL_GOV_USER_SPACE=y +# CONFIG_THERMAL_GOV_POWER_ALLOCATOR is not set +CONFIG_CLOCK_THERMAL=y +CONFIG_DEVFREQ_THERMAL=y +# CONFIG_THERMAL_EMULATION is not set +CONFIG_INTEL_POWERCLAMP=m +CONFIG_X86_PKG_TEMP_THERMAL=m +CONFIG_INTEL_SOC_DTS_IOSF_CORE=m +CONFIG_INTEL_SOC_DTS_THERMAL=m + +# +# ACPI INT340X thermal drivers +# +CONFIG_INT340X_THERMAL=m +CONFIG_ACPI_THERMAL_REL=m +CONFIG_INT3406_THERMAL=m +CONFIG_INTEL_BXT_PMIC_THERMAL=m +CONFIG_INTEL_PCH_THERMAL=m +CONFIG_GENERIC_ADC_THERMAL=m +CONFIG_WATCHDOG=y +CONFIG_WATCHDOG_CORE=y +# CONFIG_WATCHDOG_NOWAYOUT is not set +CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED=y +# CONFIG_WATCHDOG_SYSFS is not set + +# +# Watchdog Device Drivers +# +CONFIG_SOFT_WATCHDOG=m +# CONFIG_SOFT_WATCHDOG_PRETIMEOUT is not set +CONFIG_DA9052_WATCHDOG=m +CONFIG_DA9063_WATCHDOG=m +CONFIG_DA9062_WATCHDOG=m +CONFIG_MENF21BMC_WATCHDOG=m +# CONFIG_MENZ069_WATCHDOG is not set +CONFIG_WDAT_WDT=m +CONFIG_WM831X_WATCHDOG=m +CONFIG_XILINX_WATCHDOG=m +CONFIG_ZIIRAVE_WATCHDOG=m +CONFIG_RAVE_SP_WATCHDOG=m +CONFIG_CADENCE_WATCHDOG=m +CONFIG_DW_WATCHDOG=m +CONFIG_MAX63XX_WATCHDOG=m +CONFIG_RETU_WATCHDOG=m +CONFIG_ACQUIRE_WDT=m +CONFIG_ADVANTECH_WDT=m +CONFIG_ALIM1535_WDT=m +CONFIG_ALIM7101_WDT=m +CONFIG_EBC_C384_WDT=m +CONFIG_F71808E_WDT=m +# CONFIG_SP5100_TCO is not set +CONFIG_SBC_FITPC2_WATCHDOG=m +CONFIG_EUROTECH_WDT=m +CONFIG_IB700_WDT=m +CONFIG_IBMASR=m +CONFIG_WAFER_WDT=m +CONFIG_I6300ESB_WDT=m +CONFIG_IE6XX_WDT=m +CONFIG_ITCO_WDT=m +CONFIG_ITCO_VENDOR_SUPPORT=y +CONFIG_IT8712F_WDT=m +CONFIG_IT87_WDT=m +CONFIG_HP_WATCHDOG=m +CONFIG_KEMPLD_WDT=m +CONFIG_HPWDT_NMI_DECODING=y +CONFIG_SC1200_WDT=m +CONFIG_PC87413_WDT=m +CONFIG_NV_TCO=m +CONFIG_60XX_WDT=m +CONFIG_CPU5_WDT=m +CONFIG_SMSC_SCH311X_WDT=m +CONFIG_SMSC37B787_WDT=m +CONFIG_VIA_WDT=m +CONFIG_W83627HF_WDT=m +CONFIG_W83877F_WDT=m +CONFIG_W83977F_WDT=m +CONFIG_MACHZ_WDT=m +CONFIG_SBC_EPX_C3_WATCHDOG=m +CONFIG_INTEL_MEI_WDT=m +CONFIG_NI903X_WDT=m +CONFIG_NIC7018_WDT=m +CONFIG_MEN_A21_WDT=m + +# +# PCI-based Watchdog Cards +# +CONFIG_PCIPCWATCHDOG=m +CONFIG_WDTPCI=m + +# +# USB-based Watchdog Cards +# +CONFIG_USBPCWATCHDOG=m + +# +# Watchdog Pretimeout Governors +# +CONFIG_WATCHDOG_PRETIMEOUT_GOV=y
+# CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_NOOP is not set +CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC=y +CONFIG_WATCHDOG_PRETIMEOUT_GOV_NOOP=m +CONFIG_WATCHDOG_PRETIMEOUT_GOV_PANIC=y +CONFIG_SSB_POSSIBLE=y +CONFIG_SSB=m +CONFIG_SSB_SPROM=y +CONFIG_SSB_BLOCKIO=y +CONFIG_SSB_PCIHOST_POSSIBLE=y +CONFIG_SSB_PCIHOST=y +CONFIG_SSB_B43_PCI_BRIDGE=y +CONFIG_SSB_PCMCIAHOST_POSSIBLE=y +CONFIG_SSB_PCMCIAHOST=y +CONFIG_SSB_SDIOHOST_POSSIBLE=y +CONFIG_SSB_SDIOHOST=y +CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y +CONFIG_SSB_DRIVER_PCICORE=y +CONFIG_SSB_DRIVER_GPIO=y +CONFIG_BCMA_POSSIBLE=y +CONFIG_BCMA=m +CONFIG_BCMA_BLOCKIO=y +CONFIG_BCMA_HOST_PCI_POSSIBLE=y +CONFIG_BCMA_HOST_PCI=y +CONFIG_BCMA_HOST_SOC=y +CONFIG_BCMA_DRIVER_PCI=y +CONFIG_BCMA_SFLASH=y +CONFIG_BCMA_DRIVER_GMAC_CMN=y +CONFIG_BCMA_DRIVER_GPIO=y +# CONFIG_BCMA_DEBUG is not set + +# +# Multifunction device drivers +# +CONFIG_MFD_CORE=y +CONFIG_MFD_BCM590XX=m +CONFIG_MFD_BD9571MWV=m +CONFIG_MFD_AXP20X=m +CONFIG_MFD_AXP20X_I2C=m +CONFIG_MFD_CROS_EC=m +# CONFIG_MFD_CROS_EC_CHARDEV is not set +# CONFIG_MFD_MADERA is not set +CONFIG_PMIC_DA9052=y +CONFIG_MFD_DA9052_SPI=y +CONFIG_MFD_DA9062=m +CONFIG_MFD_DA9063=m +CONFIG_MFD_DA9150=m +CONFIG_MFD_DLN2=m +CONFIG_MFD_MC13XXX=m +CONFIG_MFD_MC13XXX_SPI=m +CONFIG_MFD_MC13XXX_I2C=m +CONFIG_HTC_PASIC3=m +CONFIG_MFD_INTEL_QUARK_I2C_GPIO=m +CONFIG_LPC_ICH=m +CONFIG_LPC_SCH=m +CONFIG_INTEL_SOC_PMIC_BXTWC=m +CONFIG_INTEL_SOC_PMIC_CHTDC_TI=m +CONFIG_MFD_INTEL_LPSS=m +CONFIG_MFD_INTEL_LPSS_ACPI=m +CONFIG_MFD_INTEL_LPSS_PCI=m +CONFIG_MFD_JANZ_CMODIO=m +CONFIG_MFD_KEMPLD=m +CONFIG_MFD_88PM800=m +CONFIG_MFD_88PM805=m +CONFIG_MFD_MAX14577=m +CONFIG_MFD_MAX77693=m +CONFIG_MFD_MAX8907=m +CONFIG_MFD_MT6397=m +CONFIG_MFD_MENF21BMC=m +CONFIG_EZX_PCAP=y +CONFIG_MFD_VIPERBOARD=m +CONFIG_MFD_RETU=m +CONFIG_MFD_PCF50633=m +CONFIG_PCF50633_ADC=m +CONFIG_PCF50633_GPIO=m +CONFIG_UCB1400_CORE=m +CONFIG_MFD_RDC321X=m +CONFIG_MFD_RT5033=m +CONFIG_MFD_SI476X_CORE=m +CONFIG_MFD_SM501=m +CONFIG_MFD_SM501_GPIO=y +CONFIG_MFD_SKY81452=m +CONFIG_ABX500_CORE=y +CONFIG_MFD_SYSCON=y +CONFIG_MFD_TI_AM335X_TSCADC=m +CONFIG_MFD_LP3943=m +CONFIG_MFD_TI_LMU=m +CONFIG_TPS6105X=m +CONFIG_TPS65010=m +CONFIG_TPS6507X=m +CONFIG_MFD_TPS65086=m +CONFIG_MFD_TI_LP873X=m +CONFIG_MFD_TPS65912=y +CONFIG_MFD_TPS65912_I2C=m +CONFIG_MFD_TPS65912_SPI=y +CONFIG_MFD_WL1273_CORE=m +CONFIG_MFD_LM3533=m +CONFIG_MFD_VX855=m +CONFIG_MFD_ARIZONA=y +CONFIG_MFD_ARIZONA_I2C=m +CONFIG_MFD_ARIZONA_SPI=m +CONFIG_MFD_CS47L24=y +CONFIG_MFD_WM5102=y +CONFIG_MFD_WM5110=y +CONFIG_MFD_WM8997=y +CONFIG_MFD_WM8998=y +CONFIG_MFD_WM831X=y +CONFIG_MFD_WM831X_SPI=y +CONFIG_MFD_WM8994=m +CONFIG_RAVE_SP_CORE=m +CONFIG_REGULATOR=y +# CONFIG_REGULATOR_DEBUG is not set +CONFIG_REGULATOR_FIXED_VOLTAGE=m +CONFIG_REGULATOR_VIRTUAL_CONSUMER=m +CONFIG_REGULATOR_USERSPACE_CONSUMER=m +CONFIG_REGULATOR_88PG86X=m +CONFIG_REGULATOR_88PM800=m +CONFIG_REGULATOR_ACT8865=m +CONFIG_REGULATOR_AD5398=m +CONFIG_REGULATOR_ANATOP=m +CONFIG_REGULATOR_ARIZONA_LDO1=m +CONFIG_REGULATOR_ARIZONA_MICSUPP=m +CONFIG_REGULATOR_AXP20X=m +CONFIG_REGULATOR_BCM590XX=m +CONFIG_REGULATOR_BD9571MWV=m +CONFIG_REGULATOR_DA9052=m +CONFIG_REGULATOR_DA9062=m +CONFIG_REGULATOR_DA9063=m +CONFIG_REGULATOR_DA9210=m +CONFIG_REGULATOR_DA9211=m +CONFIG_REGULATOR_FAN53555=m +CONFIG_REGULATOR_GPIO=m +CONFIG_REGULATOR_ISL9305=m +CONFIG_REGULATOR_ISL6271A=m +CONFIG_REGULATOR_LM363X=m +CONFIG_REGULATOR_LP3971=m +CONFIG_REGULATOR_LP3972=m +CONFIG_REGULATOR_LP872X=m +CONFIG_REGULATOR_LP8755=m +CONFIG_REGULATOR_LTC3589=m +CONFIG_REGULATOR_LTC3676=m
+CONFIG_REGULATOR_MAX14577=m +CONFIG_REGULATOR_MAX1586=m +CONFIG_REGULATOR_MAX8649=m +CONFIG_REGULATOR_MAX8660=m +CONFIG_REGULATOR_MAX8907=m +CONFIG_REGULATOR_MAX8952=m +CONFIG_REGULATOR_MAX77693=m +CONFIG_REGULATOR_MC13XXX_CORE=m +CONFIG_REGULATOR_MC13783=m +CONFIG_REGULATOR_MC13892=m +CONFIG_REGULATOR_MT6311=m +CONFIG_REGULATOR_MT6323=m +CONFIG_REGULATOR_MT6397=m +CONFIG_REGULATOR_PCAP=m +CONFIG_REGULATOR_PCF50633=m +CONFIG_REGULATOR_PFUZE100=m +CONFIG_REGULATOR_PV88060=m +CONFIG_REGULATOR_PV88080=m +CONFIG_REGULATOR_PV88090=m +CONFIG_REGULATOR_PWM=m +CONFIG_REGULATOR_QCOM_SPMI=m +CONFIG_REGULATOR_RT5033=m +CONFIG_REGULATOR_SKY81452=m +CONFIG_REGULATOR_TPS51632=m +CONFIG_REGULATOR_TPS6105X=m +CONFIG_REGULATOR_TPS62360=m +CONFIG_REGULATOR_TPS65023=m +CONFIG_REGULATOR_TPS6507X=m +CONFIG_REGULATOR_TPS65086=m +CONFIG_REGULATOR_TPS65132=m +CONFIG_REGULATOR_TPS6524X=m +CONFIG_REGULATOR_TPS65912=m +CONFIG_REGULATOR_WM831X=m +CONFIG_REGULATOR_WM8994=m +CONFIG_CEC_CORE=m +CONFIG_CEC_NOTIFIER=y +CONFIG_CEC_PIN=y +CONFIG_RC_CORE=m +CONFIG_RC_MAP=m +CONFIG_LIRC=y +CONFIG_RC_DECODERS=y +CONFIG_IR_NEC_DECODER=m +CONFIG_IR_RC5_DECODER=m +CONFIG_IR_RC6_DECODER=m +CONFIG_IR_JVC_DECODER=m +CONFIG_IR_SONY_DECODER=m +CONFIG_IR_SANYO_DECODER=m +CONFIG_IR_SHARP_DECODER=m +CONFIG_IR_MCE_KBD_DECODER=m +CONFIG_IR_XMP_DECODER=m +CONFIG_IR_IMON_DECODER=m +CONFIG_RC_DEVICES=y +CONFIG_RC_ATI_REMOTE=m +CONFIG_IR_ENE=m +CONFIG_IR_IMON=m +CONFIG_IR_IMON_RAW=m +CONFIG_IR_MCEUSB=m +CONFIG_IR_ITE_CIR=m +CONFIG_IR_FINTEK=m +CONFIG_IR_NUVOTON=m +CONFIG_IR_REDRAT3=m +CONFIG_IR_STREAMZAP=m +CONFIG_IR_WINBOND_CIR=m +CONFIG_IR_IGORPLUGUSB=m +CONFIG_IR_IGUANA=m +CONFIG_IR_TTUSBIR=m +CONFIG_RC_LOOPBACK=m +CONFIG_IR_SERIAL=m +CONFIG_IR_SERIAL_TRANSMITTER=y +CONFIG_IR_SIR=m +CONFIG_MEDIA_SUPPORT=m + +# +# Multimedia core support +# +CONFIG_MEDIA_CAMERA_SUPPORT=y +CONFIG_MEDIA_ANALOG_TV_SUPPORT=y +CONFIG_MEDIA_DIGITAL_TV_SUPPORT=y +CONFIG_MEDIA_RADIO_SUPPORT=y +CONFIG_MEDIA_SDR_SUPPORT=y +CONFIG_MEDIA_CEC_SUPPORT=y +CONFIG_MEDIA_CEC_RC=y +# CONFIG_CEC_PIN_ERROR_INJ is not set +CONFIG_MEDIA_CONTROLLER=y +CONFIG_MEDIA_CONTROLLER_DVB=y +CONFIG_VIDEO_DEV=m +CONFIG_VIDEO_V4L2_SUBDEV_API=y +CONFIG_VIDEO_V4L2=m +# CONFIG_VIDEO_ADV_DEBUG is not set +# CONFIG_VIDEO_FIXED_MINOR_RANGES is not set +CONFIG_VIDEO_PCI_SKELETON=m +CONFIG_VIDEO_TUNER=m +CONFIG_V4L2_MEM2MEM_DEV=m +CONFIG_V4L2_FLASH_LED_CLASS=m +CONFIG_V4L2_FWNODE=m +CONFIG_VIDEOBUF_GEN=m +CONFIG_VIDEOBUF_DMA_SG=m +CONFIG_VIDEOBUF_VMALLOC=m +CONFIG_DVB_CORE=m +# CONFIG_DVB_MMAP is not set +CONFIG_DVB_NET=y +CONFIG_TTPCI_EEPROM=m +CONFIG_DVB_MAX_ADAPTERS=8 +CONFIG_DVB_DYNAMIC_MINORS=y +# CONFIG_DVB_DEMUX_SECTION_LOSS_LOG is not set +# CONFIG_DVB_ULE_DEBUG is not set + +# +# Media drivers +# +CONFIG_MEDIA_USB_SUPPORT=y + +# +# Webcam devices +# +CONFIG_USB_VIDEO_CLASS=m +CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y +CONFIG_USB_GSPCA=m +CONFIG_USB_M5602=m +CONFIG_USB_STV06XX=m +CONFIG_USB_GL860=m +CONFIG_USB_GSPCA_BENQ=m +CONFIG_USB_GSPCA_CONEX=m +CONFIG_USB_GSPCA_CPIA1=m +CONFIG_USB_GSPCA_DTCS033=m +CONFIG_USB_GSPCA_ETOMS=m +CONFIG_USB_GSPCA_FINEPIX=m +CONFIG_USB_GSPCA_JEILINJ=m +CONFIG_USB_GSPCA_JL2005BCD=m +CONFIG_USB_GSPCA_KINECT=m +CONFIG_USB_GSPCA_KONICA=m +CONFIG_USB_GSPCA_MARS=m +CONFIG_USB_GSPCA_MR97310A=m +CONFIG_USB_GSPCA_NW80X=m +CONFIG_USB_GSPCA_OV519=m +CONFIG_USB_GSPCA_OV534=m +CONFIG_USB_GSPCA_OV534_9=m +CONFIG_USB_GSPCA_PAC207=m +CONFIG_USB_GSPCA_PAC7302=m +CONFIG_USB_GSPCA_PAC7311=m +CONFIG_USB_GSPCA_SE401=m +CONFIG_USB_GSPCA_SN9C2028=m +CONFIG_USB_GSPCA_SN9C20X=m 
+CONFIG_USB_GSPCA_SONIXB=m +CONFIG_USB_GSPCA_SONIXJ=m +CONFIG_USB_GSPCA_SPCA500=m +CONFIG_USB_GSPCA_SPCA501=m +CONFIG_USB_GSPCA_SPCA505=m +CONFIG_USB_GSPCA_SPCA506=m +CONFIG_USB_GSPCA_SPCA508=m +CONFIG_USB_GSPCA_SPCA561=m +CONFIG_USB_GSPCA_SPCA1528=m +CONFIG_USB_GSPCA_SQ905=m +CONFIG_USB_GSPCA_SQ905C=m +CONFIG_USB_GSPCA_SQ930X=m +CONFIG_USB_GSPCA_STK014=m +CONFIG_USB_GSPCA_STK1135=m +CONFIG_USB_GSPCA_STV0680=m +CONFIG_USB_GSPCA_SUNPLUS=m +CONFIG_USB_GSPCA_T613=m +CONFIG_USB_GSPCA_TOPRO=m +CONFIG_USB_GSPCA_TOUPTEK=m +CONFIG_USB_GSPCA_TV8532=m +CONFIG_USB_GSPCA_VC032X=m +CONFIG_USB_GSPCA_VICAM=m +CONFIG_USB_GSPCA_XIRLINK_CIT=m +CONFIG_USB_GSPCA_ZC3XX=m +CONFIG_USB_PWC=m +# CONFIG_USB_PWC_DEBUG is not set +CONFIG_USB_PWC_INPUT_EVDEV=y +CONFIG_VIDEO_CPIA2=m +CONFIG_USB_ZR364XX=m +CONFIG_USB_STKWEBCAM=m +CONFIG_USB_S2255=m +CONFIG_VIDEO_USBTV=m + +# +# Analog TV USB devices +# +CONFIG_VIDEO_PVRUSB2=m +CONFIG_VIDEO_PVRUSB2_SYSFS=y +CONFIG_VIDEO_PVRUSB2_DVB=y +# CONFIG_VIDEO_PVRUSB2_DEBUGIFC is not set +CONFIG_VIDEO_HDPVR=m +CONFIG_VIDEO_USBVISION=m +CONFIG_VIDEO_STK1160_COMMON=m +CONFIG_VIDEO_STK1160=m +CONFIG_VIDEO_GO7007=m +CONFIG_VIDEO_GO7007_USB=m +CONFIG_VIDEO_GO7007_LOADER=m +CONFIG_VIDEO_GO7007_USB_S2250_BOARD=m + +# +# Analog/digital TV USB devices +# +CONFIG_VIDEO_AU0828=m +CONFIG_VIDEO_AU0828_V4L2=y +CONFIG_VIDEO_AU0828_RC=y +CONFIG_VIDEO_CX231XX=m +CONFIG_VIDEO_CX231XX_RC=y +CONFIG_VIDEO_CX231XX_ALSA=m +CONFIG_VIDEO_CX231XX_DVB=m +CONFIG_VIDEO_TM6000=m +CONFIG_VIDEO_TM6000_ALSA=m +CONFIG_VIDEO_TM6000_DVB=m + +# +# Digital TV USB devices +# +CONFIG_DVB_USB=m +# CONFIG_DVB_USB_DEBUG is not set +CONFIG_DVB_USB_DIB3000MC=m +CONFIG_DVB_USB_A800=m +CONFIG_DVB_USB_DIBUSB_MB=m +CONFIG_DVB_USB_DIBUSB_MB_FAULTY=y +CONFIG_DVB_USB_DIBUSB_MC=m +CONFIG_DVB_USB_DIB0700=m +CONFIG_DVB_USB_UMT_010=m +CONFIG_DVB_USB_CXUSB=m +CONFIG_DVB_USB_M920X=m +CONFIG_DVB_USB_DIGITV=m +CONFIG_DVB_USB_VP7045=m +CONFIG_DVB_USB_VP702X=m +CONFIG_DVB_USB_GP8PSK=m +CONFIG_DVB_USB_NOVA_T_USB2=m +CONFIG_DVB_USB_TTUSB2=m +CONFIG_DVB_USB_DTT200U=m +CONFIG_DVB_USB_OPERA1=m +CONFIG_DVB_USB_AF9005=m +CONFIG_DVB_USB_AF9005_REMOTE=m +CONFIG_DVB_USB_PCTV452E=m +CONFIG_DVB_USB_DW2102=m +CONFIG_DVB_USB_CINERGY_T2=m +CONFIG_DVB_USB_DTV5100=m +CONFIG_DVB_USB_AZ6027=m +CONFIG_DVB_USB_TECHNISAT_USB2=m +CONFIG_DVB_USB_V2=m +CONFIG_DVB_USB_AF9015=m +CONFIG_DVB_USB_AF9035=m +CONFIG_DVB_USB_ANYSEE=m +CONFIG_DVB_USB_AU6610=m +CONFIG_DVB_USB_AZ6007=m +CONFIG_DVB_USB_CE6230=m +CONFIG_DVB_USB_EC168=m +CONFIG_DVB_USB_GL861=m +CONFIG_DVB_USB_LME2510=m +CONFIG_DVB_USB_MXL111SF=m +CONFIG_DVB_USB_RTL28XXU=m +CONFIG_DVB_USB_DVBSKY=m +CONFIG_DVB_USB_ZD1301=m +CONFIG_DVB_TTUSB_BUDGET=m +CONFIG_DVB_TTUSB_DEC=m +CONFIG_SMS_USB_DRV=m +CONFIG_DVB_B2C2_FLEXCOP_USB=m +# CONFIG_DVB_B2C2_FLEXCOP_USB_DEBUG is not set +CONFIG_DVB_AS102=m + +# +# Webcam, TV (analog/digital) USB devices +# +CONFIG_VIDEO_EM28XX=m +CONFIG_VIDEO_EM28XX_V4L2=m +CONFIG_VIDEO_EM28XX_ALSA=m +CONFIG_VIDEO_EM28XX_DVB=m +CONFIG_VIDEO_EM28XX_RC=m + +# +# Software defined radio USB devices +# +CONFIG_USB_AIRSPY=m +CONFIG_USB_HACKRF=m +CONFIG_USB_MSI2500=m + +# +# USB HDMI CEC adapters +# +CONFIG_USB_PULSE8_CEC=m +CONFIG_USB_RAINSHADOW_CEC=m +CONFIG_MEDIA_PCI_SUPPORT=y + +# +# Media capture support +# +CONFIG_VIDEO_MEYE=m +CONFIG_VIDEO_SOLO6X10=m +CONFIG_VIDEO_TW5864=m +CONFIG_VIDEO_TW68=m +CONFIG_VIDEO_TW686X=m + +# +# Media capture/analog TV support +# +CONFIG_VIDEO_IVTV=m +# CONFIG_VIDEO_IVTV_DEPRECATED_IOCTLS is not set +CONFIG_VIDEO_IVTV_ALSA=m +CONFIG_VIDEO_FB_IVTV=m 
+CONFIG_VIDEO_HEXIUM_GEMINI=m +CONFIG_VIDEO_HEXIUM_ORION=m +CONFIG_VIDEO_MXB=m +CONFIG_VIDEO_DT3155=m + +# +# Media capture/analog/hybrid TV support +# +CONFIG_VIDEO_CX18=m +CONFIG_VIDEO_CX18_ALSA=m +CONFIG_VIDEO_CX23885=m +CONFIG_MEDIA_ALTERA_CI=m +CONFIG_VIDEO_CX25821=m +CONFIG_VIDEO_CX25821_ALSA=m +CONFIG_VIDEO_CX88=m +CONFIG_VIDEO_CX88_ALSA=m +CONFIG_VIDEO_CX88_BLACKBIRD=m +CONFIG_VIDEO_CX88_DVB=m +CONFIG_VIDEO_CX88_ENABLE_VP3054=y +CONFIG_VIDEO_CX88_VP3054=m +CONFIG_VIDEO_CX88_MPEG=m +CONFIG_VIDEO_BT848=m +CONFIG_DVB_BT8XX=m +CONFIG_VIDEO_SAA7134=m +CONFIG_VIDEO_SAA7134_ALSA=m +CONFIG_VIDEO_SAA7134_RC=y +CONFIG_VIDEO_SAA7134_DVB=m +CONFIG_VIDEO_SAA7134_GO7007=m +CONFIG_VIDEO_SAA7164=m +# CONFIG_VIDEO_COBALT is not set + +# +# Media digital TV PCI Adapters +# +CONFIG_DVB_AV7110_IR=y +CONFIG_DVB_AV7110=m +CONFIG_DVB_AV7110_OSD=y +CONFIG_DVB_BUDGET_CORE=m +CONFIG_DVB_BUDGET=m +CONFIG_DVB_BUDGET_CI=m +CONFIG_DVB_BUDGET_AV=m +CONFIG_DVB_BUDGET_PATCH=m +CONFIG_DVB_B2C2_FLEXCOP_PCI=m +# CONFIG_DVB_B2C2_FLEXCOP_PCI_DEBUG is not set +CONFIG_DVB_PLUTO2=m +CONFIG_DVB_DM1105=m +CONFIG_DVB_PT1=m +CONFIG_DVB_PT3=m +CONFIG_MANTIS_CORE=m +CONFIG_DVB_MANTIS=m +CONFIG_DVB_HOPPER=m +CONFIG_DVB_NGENE=m +CONFIG_DVB_DDBRIDGE=m +# CONFIG_DVB_DDBRIDGE_MSIENABLE is not set +CONFIG_DVB_SMIPCIE=m +CONFIG_DVB_NETUP_UNIDVB=m +CONFIG_VIDEO_IPU3_CIO2=m +CONFIG_V4L_PLATFORM_DRIVERS=y +CONFIG_VIDEO_CAFE_CCIC=m +CONFIG_VIDEO_CADENCE=y +CONFIG_VIDEO_CADENCE_CSI2RX=m +CONFIG_VIDEO_CADENCE_CSI2TX=m +CONFIG_SOC_CAMERA=m +CONFIG_SOC_CAMERA_PLATFORM=m +CONFIG_V4L_MEM2MEM_DRIVERS=y +CONFIG_VIDEO_MEM2MEM_DEINTERLACE=m +CONFIG_VIDEO_SH_VEU=m +# CONFIG_V4L_TEST_DRIVERS is not set +CONFIG_DVB_PLATFORM_DRIVERS=y +CONFIG_CEC_PLATFORM_DRIVERS=y +# CONFIG_VIDEO_CROS_EC_CEC is not set +CONFIG_CEC_GPIO=m +CONFIG_SDR_PLATFORM_DRIVERS=y + +# +# Supported MMC/SDIO adapters +# +CONFIG_SMS_SDIO_DRV=m +CONFIG_RADIO_ADAPTERS=y +CONFIG_RADIO_TEA575X=m +CONFIG_RADIO_SI470X=m +CONFIG_USB_SI470X=m +CONFIG_I2C_SI470X=m +CONFIG_RADIO_SI4713=m +CONFIG_USB_SI4713=m +CONFIG_PLATFORM_SI4713=m +CONFIG_I2C_SI4713=m +CONFIG_RADIO_SI476X=m +CONFIG_USB_MR800=m +CONFIG_USB_DSBR=m +CONFIG_RADIO_MAXIRADIO=m +CONFIG_RADIO_SHARK=m +CONFIG_RADIO_SHARK2=m +CONFIG_USB_KEENE=m +CONFIG_USB_RAREMONO=m +CONFIG_USB_MA901=m +CONFIG_RADIO_TEA5764=m +CONFIG_RADIO_SAA7706H=m +CONFIG_RADIO_TEF6862=m +CONFIG_RADIO_WL1273=m + +# +# Texas Instruments WL128x FM driver (ST based) +# +CONFIG_RADIO_WL128X=m + +# +# Supported FireWire (IEEE 1394) Adapters +# +CONFIG_DVB_FIREDTV=m +CONFIG_DVB_FIREDTV_INPUT=y +CONFIG_MEDIA_COMMON_OPTIONS=y + +# +# common driver options +# +CONFIG_VIDEO_CX2341X=m +CONFIG_VIDEO_TVEEPROM=m +CONFIG_CYPRESS_FIRMWARE=m +CONFIG_VIDEOBUF2_CORE=m +CONFIG_VIDEOBUF2_V4L2=m +CONFIG_VIDEOBUF2_MEMOPS=m +CONFIG_VIDEOBUF2_DMA_CONTIG=m +CONFIG_VIDEOBUF2_VMALLOC=m +CONFIG_VIDEOBUF2_DMA_SG=m +CONFIG_VIDEOBUF2_DVB=m +CONFIG_DVB_B2C2_FLEXCOP=m +CONFIG_VIDEO_SAA7146=m +CONFIG_VIDEO_SAA7146_VV=m +CONFIG_SMS_SIANO_MDTV=m +CONFIG_SMS_SIANO_RC=y +# CONFIG_SMS_SIANO_DEBUGFS is not set + +# +# Media ancillary drivers (tuners, sensors, i2c, spi, frontends) +# +CONFIG_MEDIA_SUBDRV_AUTOSELECT=y +CONFIG_MEDIA_ATTACH=y +CONFIG_VIDEO_IR_I2C=m + +# +# Audio decoders, processors and mixers +# +CONFIG_VIDEO_TVAUDIO=m +CONFIG_VIDEO_TDA7432=m +CONFIG_VIDEO_TDA9840=m +CONFIG_VIDEO_TEA6415C=m +CONFIG_VIDEO_TEA6420=m +CONFIG_VIDEO_MSP3400=m +CONFIG_VIDEO_CS3308=m +CONFIG_VIDEO_CS5345=m +CONFIG_VIDEO_CS53L32A=m +CONFIG_VIDEO_UDA1342=m +CONFIG_VIDEO_WM8775=m +CONFIG_VIDEO_WM8739=m 
+CONFIG_VIDEO_VP27SMPX=m +CONFIG_VIDEO_SONY_BTF_MPX=m + +# +# RDS decoders +# +CONFIG_VIDEO_SAA6588=m + +# +# Video decoders +# +CONFIG_VIDEO_BT819=m +CONFIG_VIDEO_BT856=m +CONFIG_VIDEO_BT866=m +CONFIG_VIDEO_KS0127=m +CONFIG_VIDEO_SAA7110=m +CONFIG_VIDEO_SAA711X=m +CONFIG_VIDEO_TVP5150=m +CONFIG_VIDEO_TW2804=m +CONFIG_VIDEO_TW9903=m +CONFIG_VIDEO_TW9906=m +CONFIG_VIDEO_VPX3220=m + +# +# Video and audio decoders +# +CONFIG_VIDEO_SAA717X=m +CONFIG_VIDEO_CX25840=m + +# +# Video encoders +# +CONFIG_VIDEO_SAA7127=m +CONFIG_VIDEO_SAA7185=m +CONFIG_VIDEO_ADV7170=m +CONFIG_VIDEO_ADV7175=m + +# +# Camera sensor devices +# +CONFIG_VIDEO_OV2640=m +CONFIG_VIDEO_OV7640=m +CONFIG_VIDEO_OV7670=m +CONFIG_VIDEO_MT9M111=m +CONFIG_VIDEO_MT9V011=m + +# +# Flash devices +# + +# +# Video improvement chips +# +CONFIG_VIDEO_UPD64031A=m +CONFIG_VIDEO_UPD64083=m + +# +# Audio/Video compression chips +# +CONFIG_VIDEO_SAA6752HS=m + +# +# SDR tuner chips +# + +# +# Miscellaneous helper chips +# +CONFIG_VIDEO_M52790=m + +# +# Sensors used on soc_camera driver +# + +# +# soc_camera sensor drivers +# +CONFIG_SOC_CAMERA_MT9M001=m +CONFIG_SOC_CAMERA_MT9M111=m +CONFIG_SOC_CAMERA_MT9T112=m +CONFIG_SOC_CAMERA_MT9V022=m +CONFIG_SOC_CAMERA_OV5642=m +CONFIG_SOC_CAMERA_OV772X=m +CONFIG_SOC_CAMERA_OV9640=m +CONFIG_SOC_CAMERA_OV9740=m +CONFIG_SOC_CAMERA_RJ54N1=m +CONFIG_SOC_CAMERA_TW9910=m + +# +# Media SPI Adapters +# +CONFIG_CXD2880_SPI_DRV=m +CONFIG_MEDIA_TUNER=m +CONFIG_MEDIA_TUNER_SIMPLE=m +CONFIG_MEDIA_TUNER_TDA18250=m +CONFIG_MEDIA_TUNER_TDA8290=m +CONFIG_MEDIA_TUNER_TDA827X=m +CONFIG_MEDIA_TUNER_TDA18271=m +CONFIG_MEDIA_TUNER_TDA9887=m +CONFIG_MEDIA_TUNER_TEA5761=m +CONFIG_MEDIA_TUNER_TEA5767=m +CONFIG_MEDIA_TUNER_MSI001=m +CONFIG_MEDIA_TUNER_MT20XX=m +CONFIG_MEDIA_TUNER_MT2060=m +CONFIG_MEDIA_TUNER_MT2063=m +CONFIG_MEDIA_TUNER_MT2266=m +CONFIG_MEDIA_TUNER_MT2131=m +CONFIG_MEDIA_TUNER_QT1010=m +CONFIG_MEDIA_TUNER_XC2028=m +CONFIG_MEDIA_TUNER_XC5000=m +CONFIG_MEDIA_TUNER_XC4000=m +CONFIG_MEDIA_TUNER_MXL5005S=m +CONFIG_MEDIA_TUNER_MXL5007T=m +CONFIG_MEDIA_TUNER_MC44S803=m +CONFIG_MEDIA_TUNER_MAX2165=m +CONFIG_MEDIA_TUNER_TDA18218=m +CONFIG_MEDIA_TUNER_FC0011=m +CONFIG_MEDIA_TUNER_FC0012=m +CONFIG_MEDIA_TUNER_FC0013=m +CONFIG_MEDIA_TUNER_TDA18212=m +CONFIG_MEDIA_TUNER_E4000=m +CONFIG_MEDIA_TUNER_FC2580=m +CONFIG_MEDIA_TUNER_M88RS6000T=m +CONFIG_MEDIA_TUNER_TUA9001=m +CONFIG_MEDIA_TUNER_SI2157=m +CONFIG_MEDIA_TUNER_IT913X=m +CONFIG_MEDIA_TUNER_R820T=m +CONFIG_MEDIA_TUNER_MXL301RF=m +CONFIG_MEDIA_TUNER_QM1D1C0042=m +CONFIG_MEDIA_TUNER_QM1D1B0004=m + +# +# Multistandard (satellite) frontends +# +CONFIG_DVB_STB0899=m +CONFIG_DVB_STB6100=m +CONFIG_DVB_STV090x=m +CONFIG_DVB_STV0910=m +CONFIG_DVB_STV6110x=m +CONFIG_DVB_STV6111=m +CONFIG_DVB_MXL5XX=m +CONFIG_DVB_M88DS3103=m + +# +# Multistandard (cable + terrestrial) frontends +# +CONFIG_DVB_DRXK=m +CONFIG_DVB_TDA18271C2DD=m +CONFIG_DVB_SI2165=m +CONFIG_DVB_MN88472=m +CONFIG_DVB_MN88473=m + +# +# DVB-S (satellite) frontends +# +CONFIG_DVB_CX24110=m +CONFIG_DVB_CX24123=m +CONFIG_DVB_MT312=m +CONFIG_DVB_ZL10036=m +CONFIG_DVB_ZL10039=m +CONFIG_DVB_S5H1420=m +CONFIG_DVB_STV0288=m +CONFIG_DVB_STB6000=m +CONFIG_DVB_STV0299=m +CONFIG_DVB_STV6110=m +CONFIG_DVB_STV0900=m +CONFIG_DVB_TDA8083=m +CONFIG_DVB_TDA10086=m +CONFIG_DVB_TDA8261=m +CONFIG_DVB_VES1X93=m +CONFIG_DVB_TUNER_ITD1000=m +CONFIG_DVB_TUNER_CX24113=m +CONFIG_DVB_TDA826X=m +CONFIG_DVB_TUA6100=m +CONFIG_DVB_CX24116=m +CONFIG_DVB_CX24117=m +CONFIG_DVB_CX24120=m +CONFIG_DVB_SI21XX=m +CONFIG_DVB_TS2020=m +CONFIG_DVB_DS3000=m 
+CONFIG_DVB_MB86A16=m +CONFIG_DVB_TDA10071=m + +# +# DVB-T (terrestrial) frontends +# +CONFIG_DVB_SP8870=m +CONFIG_DVB_SP887X=m +CONFIG_DVB_CX22700=m +CONFIG_DVB_CX22702=m +CONFIG_DVB_DRXD=m +CONFIG_DVB_L64781=m +CONFIG_DVB_TDA1004X=m +CONFIG_DVB_NXT6000=m +CONFIG_DVB_MT352=m +CONFIG_DVB_ZL10353=m +CONFIG_DVB_DIB3000MB=m +CONFIG_DVB_DIB3000MC=m +CONFIG_DVB_DIB7000M=m +CONFIG_DVB_DIB7000P=m +CONFIG_DVB_TDA10048=m +CONFIG_DVB_AF9013=m +CONFIG_DVB_EC100=m +CONFIG_DVB_STV0367=m +CONFIG_DVB_CXD2820R=m +CONFIG_DVB_CXD2841ER=m +CONFIG_DVB_RTL2830=m +CONFIG_DVB_RTL2832=m +CONFIG_DVB_RTL2832_SDR=m +CONFIG_DVB_SI2168=m +CONFIG_DVB_AS102_FE=m +CONFIG_DVB_ZD1301_DEMOD=m +CONFIG_DVB_GP8PSK_FE=m + +# +# DVB-C (cable) frontends +# +CONFIG_DVB_VES1820=m +CONFIG_DVB_TDA10021=m +CONFIG_DVB_TDA10023=m +CONFIG_DVB_STV0297=m + +# +# ATSC (North American/Korean Terrestrial/Cable DTV) frontends +# +CONFIG_DVB_NXT200X=m +CONFIG_DVB_OR51211=m +CONFIG_DVB_OR51132=m +CONFIG_DVB_BCM3510=m +CONFIG_DVB_LGDT330X=m +CONFIG_DVB_LGDT3305=m +CONFIG_DVB_LGDT3306A=m +CONFIG_DVB_LG2160=m +CONFIG_DVB_S5H1409=m +CONFIG_DVB_AU8522=m +CONFIG_DVB_AU8522_DTV=m +CONFIG_DVB_AU8522_V4L=m +CONFIG_DVB_S5H1411=m + +# +# ISDB-T (terrestrial) frontends +# +CONFIG_DVB_S921=m +CONFIG_DVB_DIB8000=m +CONFIG_DVB_MB86A20S=m + +# +# ISDB-S (satellite) & ISDB-T (terrestrial) frontends +# +CONFIG_DVB_TC90522=m + +# +# Digital terrestrial only tuners/PLL +# +CONFIG_DVB_PLL=m +CONFIG_DVB_TUNER_DIB0070=m +CONFIG_DVB_TUNER_DIB0090=m + +# +# SEC control devices for DVB-S +# +CONFIG_DVB_DRX39XYJ=m +CONFIG_DVB_LNBH25=m +CONFIG_DVB_LNBP21=m +CONFIG_DVB_LNBP22=m +CONFIG_DVB_ISL6405=m +CONFIG_DVB_ISL6421=m +CONFIG_DVB_ISL6423=m +CONFIG_DVB_A8293=m +CONFIG_DVB_LGS8GXX=m +CONFIG_DVB_ATBM8830=m +CONFIG_DVB_TDA665x=m +CONFIG_DVB_IX2505V=m +CONFIG_DVB_M88RS2000=m +CONFIG_DVB_AF9033=m +CONFIG_DVB_HORUS3A=m +CONFIG_DVB_ASCOT2E=m +CONFIG_DVB_HELENE=m + +# +# Common Interface (EN50221) controller drivers +# +CONFIG_DVB_CXD2099=m +CONFIG_DVB_SP2=m + +# +# Tools to develop new frontends +# +CONFIG_DVB_DUMMY_FE=m + +# +# Graphics support +# +CONFIG_AGP=m +CONFIG_AGP_AMD64=m +CONFIG_AGP_INTEL=m +CONFIG_AGP_SIS=m +CONFIG_AGP_VIA=m +CONFIG_INTEL_GTT=m +CONFIG_VGA_ARB=y +CONFIG_VGA_ARB_MAX_GPUS=16 +CONFIG_VGA_SWITCHEROO=y +CONFIG_DRM=m +CONFIG_DRM_MIPI_DSI=y +CONFIG_DRM_DP_AUX_CHARDEV=y +CONFIG_DRM_DEBUG_SELFTEST=m +CONFIG_DRM_KMS_HELPER=m +CONFIG_DRM_KMS_FB_HELPER=y +CONFIG_DRM_FBDEV_EMULATION=y +CONFIG_DRM_FBDEV_OVERALLOC=100 +# CONFIG_DRM_LOAD_EDID_FIRMWARE is not set +# CONFIG_DRM_DP_CEC is not set +CONFIG_DRM_TTM=m +CONFIG_DRM_GEM_CMA_HELPER=y +CONFIG_DRM_KMS_CMA_HELPER=y +CONFIG_DRM_VM=y +CONFIG_DRM_SCHED=m + +# +# I2C encoder or helper chips +# +CONFIG_DRM_I2C_CH7006=m +CONFIG_DRM_I2C_SIL164=m +CONFIG_DRM_I2C_NXP_TDA998X=m +CONFIG_DRM_I2C_NXP_TDA9950=m +CONFIG_DRM_RADEON=m +# CONFIG_DRM_RADEON_USERPTR is not set +CONFIG_DRM_AMDGPU=m +CONFIG_DRM_AMDGPU_SI=y +CONFIG_DRM_AMDGPU_CIK=y +CONFIG_DRM_AMDGPU_USERPTR=y +# CONFIG_DRM_AMDGPU_GART_DEBUGFS is not set + +# +# ACP (Audio CoProcessor) Configuration +# +CONFIG_DRM_AMD_ACP=y + +# +# Display Engine Configuration +# +CONFIG_DRM_AMD_DC=y +CONFIG_DRM_AMD_DC_DCN1_0=y +# CONFIG_DEBUG_KERNEL_DC is not set + +# +# AMD Library routines +# +CONFIG_CHASH=m +# CONFIG_CHASH_STATS is not set +# CONFIG_CHASH_SELFTEST is not set +CONFIG_DRM_NOUVEAU=m +CONFIG_NOUVEAU_DEBUG=5 +CONFIG_NOUVEAU_DEBUG_DEFAULT=3 +# CONFIG_NOUVEAU_DEBUG_MMU is not set +CONFIG_DRM_NOUVEAU_BACKLIGHT=y +CONFIG_DRM_I915=m
+# CONFIG_DRM_I915_ALPHA_SUPPORT is not set +CONFIG_DRM_I915_CAPTURE_ERROR=y +CONFIG_DRM_I915_COMPRESS_ERROR=y +CONFIG_DRM_I915_USERPTR=y +CONFIG_DRM_I915_GVT=y +CONFIG_DRM_I915_GVT_KVMGT=m +CONFIG_DRM_VGEM=m +# CONFIG_DRM_VKMS is not set +CONFIG_DRM_VMWGFX=m +CONFIG_DRM_VMWGFX_FBCON=y +CONFIG_DRM_GMA500=m +CONFIG_DRM_GMA600=y +CONFIG_DRM_GMA3600=y +CONFIG_DRM_UDL=m +CONFIG_DRM_AST=m +CONFIG_DRM_MGAG200=m +CONFIG_DRM_CIRRUS_QEMU=m +CONFIG_DRM_QXL=m +CONFIG_DRM_BOCHS=m +CONFIG_DRM_VIRTIO_GPU=m +CONFIG_DRM_PANEL=y + +# +# Display Panels +# +CONFIG_DRM_PANEL_RASPBERRYPI_TOUCHSCREEN=m +CONFIG_DRM_BRIDGE=y +CONFIG_DRM_PANEL_BRIDGE=y + +# +# Display Interface Bridges +# +CONFIG_DRM_ANALOGIX_ANX78XX=m +CONFIG_HSA_AMD=m +CONFIG_DRM_HISI_HIBMC=m +CONFIG_DRM_TINYDRM=m +CONFIG_TINYDRM_MIPI_DBI=m +CONFIG_TINYDRM_ILI9225=m +# CONFIG_TINYDRM_ILI9341 is not set +CONFIG_TINYDRM_MI0283QT=m +CONFIG_TINYDRM_REPAPER=m +CONFIG_TINYDRM_ST7586=m +CONFIG_TINYDRM_ST7735R=m +# CONFIG_DRM_LEGACY is not set +CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y +CONFIG_DRM_LIB_RANDOM=y + +# +# Frame buffer Devices +# +CONFIG_FB=y +CONFIG_FIRMWARE_EDID=y +CONFIG_FB_CMDLINE=y +CONFIG_FB_NOTIFY=y +CONFIG_FB_BOOT_VESA_SUPPORT=y +CONFIG_FB_CFB_FILLRECT=y +CONFIG_FB_CFB_COPYAREA=y +CONFIG_FB_CFB_IMAGEBLIT=y +CONFIG_FB_SYS_FILLRECT=m +CONFIG_FB_SYS_COPYAREA=m +CONFIG_FB_SYS_IMAGEBLIT=m +CONFIG_FB_FOREIGN_ENDIAN=y +CONFIG_FB_BOTH_ENDIAN=y +# CONFIG_FB_BIG_ENDIAN is not set +# CONFIG_FB_LITTLE_ENDIAN is not set +CONFIG_FB_SYS_FOPS=m +CONFIG_FB_DEFERRED_IO=y +CONFIG_FB_BACKLIGHT=y +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_TILEBLITTING=y + +# +# Frame buffer hardware drivers +# +# CONFIG_FB_CIRRUS is not set +# CONFIG_FB_PM2 is not set +# CONFIG_FB_CYBER2000 is not set +# CONFIG_FB_ARC is not set +# CONFIG_FB_ASILIANT is not set +# CONFIG_FB_IMSTT is not set +# CONFIG_FB_VGA16 is not set +# CONFIG_FB_UVESA is not set +CONFIG_FB_VESA=y +CONFIG_FB_EFI=y +# CONFIG_FB_N411 is not set +# CONFIG_FB_HGA is not set +# CONFIG_FB_OPENCORES is not set +# CONFIG_FB_S1D13XXX is not set +# CONFIG_FB_NVIDIA is not set +# CONFIG_FB_RIVA is not set +# CONFIG_FB_I740 is not set +# CONFIG_FB_LE80578 is not set +# CONFIG_FB_MATROX is not set +# CONFIG_FB_RADEON is not set +# CONFIG_FB_ATY128 is not set +# CONFIG_FB_ATY is not set +# CONFIG_FB_S3 is not set +# CONFIG_FB_SAVAGE is not set +# CONFIG_FB_SIS is not set +# CONFIG_FB_VIA is not set +# CONFIG_FB_NEOMAGIC is not set +# CONFIG_FB_KYRO is not set +# CONFIG_FB_3DFX is not set +# CONFIG_FB_VOODOO1 is not set +# CONFIG_FB_VT8623 is not set +# CONFIG_FB_TRIDENT is not set +# CONFIG_FB_ARK is not set +# CONFIG_FB_PM3 is not set +# CONFIG_FB_CARMINE is not set +# CONFIG_FB_SM501 is not set +# CONFIG_FB_SMSCUFX is not set +# CONFIG_FB_UDL is not set +# CONFIG_FB_IBM_GXT4500 is not set +# CONFIG_FB_VIRTUAL is not set +# CONFIG_FB_METRONOME is not set +# CONFIG_FB_MB862XX is not set +# CONFIG_FB_BROADSHEET is not set +# CONFIG_FB_HYPERV is not set +CONFIG_FB_SIMPLE=y +# CONFIG_FB_SM712 is not set +CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_LCD_CLASS_DEVICE=m +CONFIG_LCD_L4F00242T03=m +CONFIG_LCD_LMS283GF05=m +CONFIG_LCD_LTV350QV=m +CONFIG_LCD_ILI922X=m +CONFIG_LCD_ILI9320=m +CONFIG_LCD_TDO24M=m +CONFIG_LCD_VGG2432A4=m +CONFIG_LCD_PLATFORM=m +CONFIG_LCD_S6E63M0=m +CONFIG_LCD_LD9040=m +CONFIG_LCD_AMS369FG06=m +CONFIG_LCD_LMS501KF03=m +CONFIG_LCD_HX8357=m +CONFIG_LCD_OTM3225A=m +CONFIG_BACKLIGHT_CLASS_DEVICE=y +CONFIG_BACKLIGHT_GENERIC=m +CONFIG_BACKLIGHT_LM3533=m +CONFIG_BACKLIGHT_PWM=m +CONFIG_BACKLIGHT_DA9052=m +CONFIG_BACKLIGHT_APPLE=m
+CONFIG_BACKLIGHT_PM8941_WLED=m +CONFIG_BACKLIGHT_SAHARA=m +CONFIG_BACKLIGHT_WM831X=m +CONFIG_BACKLIGHT_ADP8860=m +CONFIG_BACKLIGHT_ADP8870=m +CONFIG_BACKLIGHT_PCF50633=m +CONFIG_BACKLIGHT_LM3630A=m +CONFIG_BACKLIGHT_LM3639=m +CONFIG_BACKLIGHT_LP855X=m +CONFIG_BACKLIGHT_SKY81452=m +CONFIG_BACKLIGHT_GPIO=m +CONFIG_BACKLIGHT_LV5207LP=m +CONFIG_BACKLIGHT_BD6107=m +CONFIG_BACKLIGHT_ARCXCNN=m +CONFIG_BACKLIGHT_RAVE_SP=m +CONFIG_HDMI=y + +# +# Console display driver support +# +CONFIG_VGA_CONSOLE=y +CONFIG_VGACON_SOFT_SCROLLBACK=y +CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 +# CONFIG_VGACON_SOFT_SCROLLBACK_PERSISTENT_ENABLE_BY_DEFAULT is not set +CONFIG_DUMMY_CONSOLE=y +CONFIG_DUMMY_CONSOLE_COLUMNS=80 +CONFIG_DUMMY_CONSOLE_ROWS=25 +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y +CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y +# CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER is not set +CONFIG_LOGO=y +CONFIG_LOGO_LINUX_MONO=y +CONFIG_LOGO_LINUX_VGA16=y +CONFIG_LOGO_LINUX_CLUT224=y +CONFIG_SOUND=m +CONFIG_SOUND_OSS_CORE=y +CONFIG_SOUND_OSS_CORE_PRECLAIM=y +CONFIG_SND=m +CONFIG_SND_TIMER=m +CONFIG_SND_PCM=m +CONFIG_SND_PCM_ELD=y +CONFIG_SND_PCM_IEC958=y +CONFIG_SND_DMAENGINE_PCM=m +CONFIG_SND_HWDEP=m +CONFIG_SND_SEQ_DEVICE=m +CONFIG_SND_RAWMIDI=m +CONFIG_SND_COMPRESS_OFFLOAD=m +CONFIG_SND_JACK=y +CONFIG_SND_JACK_INPUT_DEV=y +CONFIG_SND_OSSEMUL=y +CONFIG_SND_MIXER_OSS=m +CONFIG_SND_PCM_OSS=m +CONFIG_SND_PCM_OSS_PLUGINS=y +CONFIG_SND_PCM_TIMER=y +CONFIG_SND_HRTIMER=m +CONFIG_SND_DYNAMIC_MINORS=y +CONFIG_SND_MAX_CARDS=32 +CONFIG_SND_SUPPORT_OLD_API=y +CONFIG_SND_PROC_FS=y +CONFIG_SND_VERBOSE_PROCFS=y +# CONFIG_SND_VERBOSE_PRINTK is not set +# CONFIG_SND_DEBUG is not set +CONFIG_SND_VMASTER=y +CONFIG_SND_DMA_SGBUF=y +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_SEQUENCER_OSS=m +CONFIG_SND_SEQ_HRTIMER_DEFAULT=y +CONFIG_SND_SEQ_MIDI_EVENT=m +CONFIG_SND_SEQ_MIDI=m +CONFIG_SND_SEQ_MIDI_EMUL=m +CONFIG_SND_SEQ_VIRMIDI=m +CONFIG_SND_MPU401_UART=m +CONFIG_SND_OPL3_LIB=m +CONFIG_SND_OPL3_LIB_SEQ=m +CONFIG_SND_VX_LIB=m +CONFIG_SND_AC97_CODEC=m +CONFIG_SND_DRIVERS=y +# CONFIG_SND_PCSP is not set +CONFIG_SND_DUMMY=m +CONFIG_SND_ALOOP=m +CONFIG_SND_VIRMIDI=m +CONFIG_SND_MTPAV=m +CONFIG_SND_MTS64=m +CONFIG_SND_SERIAL_U16550=m +CONFIG_SND_MPU401=m +CONFIG_SND_PORTMAN2X4=m +CONFIG_SND_AC97_POWER_SAVE=y +CONFIG_SND_AC97_POWER_SAVE_DEFAULT=0 +CONFIG_SND_SB_COMMON=m +CONFIG_SND_PCI=y +CONFIG_SND_AD1889=m +CONFIG_SND_ALS300=m +CONFIG_SND_ALS4000=m +CONFIG_SND_ALI5451=m +CONFIG_SND_ASIHPI=m +CONFIG_SND_ATIIXP=m +CONFIG_SND_ATIIXP_MODEM=m +CONFIG_SND_AU8810=m +CONFIG_SND_AU8820=m +CONFIG_SND_AU8830=m +CONFIG_SND_AW2=m +CONFIG_SND_AZT3328=m +CONFIG_SND_BT87X=m +CONFIG_SND_BT87X_OVERCLOCK=y +CONFIG_SND_CA0106=m +CONFIG_SND_CMIPCI=m +CONFIG_SND_OXYGEN_LIB=m +CONFIG_SND_OXYGEN=m +CONFIG_SND_CS4281=m +CONFIG_SND_CS46XX=m +CONFIG_SND_CS46XX_NEW_DSP=y +CONFIG_SND_CTXFI=m +CONFIG_SND_DARLA20=m +CONFIG_SND_GINA20=m +CONFIG_SND_LAYLA20=m +CONFIG_SND_DARLA24=m +CONFIG_SND_GINA24=m +CONFIG_SND_LAYLA24=m +CONFIG_SND_MONA=m +CONFIG_SND_MIA=m +CONFIG_SND_ECHO3G=m +CONFIG_SND_INDIGO=m +CONFIG_SND_INDIGOIO=m +CONFIG_SND_INDIGODJ=m +CONFIG_SND_INDIGOIOX=m +CONFIG_SND_INDIGODJX=m +CONFIG_SND_EMU10K1=m +CONFIG_SND_EMU10K1_SEQ=m +CONFIG_SND_EMU10K1X=m +CONFIG_SND_ENS1370=m +CONFIG_SND_ENS1371=m +CONFIG_SND_ES1938=m +CONFIG_SND_ES1968=m +CONFIG_SND_ES1968_INPUT=y +CONFIG_SND_ES1968_RADIO=y +CONFIG_SND_FM801=m +CONFIG_SND_FM801_TEA575X_BOOL=y +CONFIG_SND_HDSP=m +CONFIG_SND_HDSPM=m +CONFIG_SND_ICE1712=m 
+CONFIG_SND_ICE1724=m +CONFIG_SND_INTEL8X0=m +CONFIG_SND_INTEL8X0M=m +CONFIG_SND_KORG1212=m +CONFIG_SND_LOLA=m +CONFIG_SND_LX6464ES=m +CONFIG_SND_MAESTRO3=m +CONFIG_SND_MAESTRO3_INPUT=y +CONFIG_SND_MIXART=m +CONFIG_SND_NM256=m +CONFIG_SND_PCXHR=m +CONFIG_SND_RIPTIDE=m +CONFIG_SND_RME32=m +CONFIG_SND_RME96=m +CONFIG_SND_RME9652=m +CONFIG_SND_SONICVIBES=m +CONFIG_SND_TRIDENT=m +CONFIG_SND_VIA82XX=m +CONFIG_SND_VIA82XX_MODEM=m +CONFIG_SND_VIRTUOSO=m +CONFIG_SND_VX222=m +CONFIG_SND_YMFPCI=m + +# +# HD-Audio +# +CONFIG_SND_HDA=m +CONFIG_SND_HDA_INTEL=m +CONFIG_SND_HDA_HWDEP=y +CONFIG_SND_HDA_RECONFIG=y +CONFIG_SND_HDA_INPUT_BEEP=y +CONFIG_SND_HDA_INPUT_BEEP_MODE=1 +CONFIG_SND_HDA_PATCH_LOADER=y +CONFIG_SND_HDA_CODEC_REALTEK=m +CONFIG_SND_HDA_CODEC_ANALOG=m +CONFIG_SND_HDA_CODEC_SIGMATEL=m +CONFIG_SND_HDA_CODEC_VIA=m +CONFIG_SND_HDA_CODEC_HDMI=m +CONFIG_SND_HDA_CODEC_CIRRUS=m +CONFIG_SND_HDA_CODEC_CONEXANT=m +CONFIG_SND_HDA_CODEC_CA0110=m +CONFIG_SND_HDA_CODEC_CA0132=m +CONFIG_SND_HDA_CODEC_CA0132_DSP=y +CONFIG_SND_HDA_CODEC_CMEDIA=m +CONFIG_SND_HDA_CODEC_SI3054=m +CONFIG_SND_HDA_GENERIC=m +CONFIG_SND_HDA_POWER_SAVE_DEFAULT=0 +CONFIG_SND_HDA_CORE=m +CONFIG_SND_HDA_DSP_LOADER=y +CONFIG_SND_HDA_COMPONENT=y +CONFIG_SND_HDA_I915=y +CONFIG_SND_HDA_EXT_CORE=m +CONFIG_SND_HDA_PREALLOC_SIZE=64 +CONFIG_SND_SPI=y +CONFIG_SND_USB=y +CONFIG_SND_USB_AUDIO=m +CONFIG_SND_USB_UA101=m +CONFIG_SND_USB_USX2Y=m +CONFIG_SND_USB_CAIAQ=m +CONFIG_SND_USB_CAIAQ_INPUT=y +CONFIG_SND_USB_US122L=m +CONFIG_SND_USB_6FIRE=m +CONFIG_SND_USB_HIFACE=m +CONFIG_SND_BCD2000=m +CONFIG_SND_USB_LINE6=m +CONFIG_SND_USB_POD=m +CONFIG_SND_USB_PODHD=m +CONFIG_SND_USB_TONEPORT=m +CONFIG_SND_USB_VARIAX=m +CONFIG_SND_FIREWIRE=y +CONFIG_SND_FIREWIRE_LIB=m +CONFIG_SND_DICE=m +CONFIG_SND_OXFW=m +CONFIG_SND_ISIGHT=m +CONFIG_SND_FIREWORKS=m +CONFIG_SND_BEBOB=m +CONFIG_SND_FIREWIRE_DIGI00X=m +CONFIG_SND_FIREWIRE_TASCAM=m +CONFIG_SND_FIREWIRE_MOTU=m +CONFIG_SND_FIREFACE=m +CONFIG_SND_PCMCIA=y +CONFIG_SND_VXPOCKET=m +CONFIG_SND_PDAUDIOCF=m +CONFIG_SND_SOC=m +CONFIG_SND_SOC_GENERIC_DMAENGINE_PCM=y +CONFIG_SND_SOC_COMPRESS=y +CONFIG_SND_SOC_TOPOLOGY=y +CONFIG_SND_SOC_ACPI=m +CONFIG_SND_SOC_AMD_ACP=m +CONFIG_SND_SOC_AMD_CZ_DA7219MX98357_MACH=m +CONFIG_SND_SOC_AMD_CZ_RT5645_MACH=m +CONFIG_SND_ATMEL_SOC=m +CONFIG_SND_DESIGNWARE_I2S=m +CONFIG_SND_DESIGNWARE_PCM=y + +# +# SoC Audio for Freescale CPUs +# + +# +# Common SoC Audio options for Freescale CPUs: +# +CONFIG_SND_SOC_FSL_ASRC=m +CONFIG_SND_SOC_FSL_SAI=m +CONFIG_SND_SOC_FSL_SSI=m +CONFIG_SND_SOC_FSL_SPDIF=m +CONFIG_SND_SOC_FSL_ESAI=m +CONFIG_SND_SOC_IMX_AUDMUX=m +CONFIG_SND_I2S_HI6210_I2S=m +CONFIG_SND_SOC_IMG=y +CONFIG_SND_SOC_IMG_I2S_IN=m +CONFIG_SND_SOC_IMG_I2S_OUT=m +CONFIG_SND_SOC_IMG_PARALLEL_OUT=m +CONFIG_SND_SOC_IMG_SPDIF_IN=m +CONFIG_SND_SOC_IMG_SPDIF_OUT=m +CONFIG_SND_SOC_IMG_PISTACHIO_INTERNAL_DAC=m +CONFIG_SND_SOC_INTEL_SST_TOPLEVEL=y +CONFIG_SND_SST_IPC=m +CONFIG_SND_SST_IPC_PCI=m +CONFIG_SND_SST_IPC_ACPI=m +CONFIG_SND_SOC_INTEL_SST_ACPI=m +CONFIG_SND_SOC_INTEL_SST=m +CONFIG_SND_SOC_INTEL_SST_FIRMWARE=m +CONFIG_SND_SOC_INTEL_HASWELL=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_PCI=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_ACPI=m +CONFIG_SND_SOC_INTEL_SKYLAKE_SSP_CLK=m +CONFIG_SND_SOC_INTEL_SKYLAKE=m +CONFIG_SND_SOC_ACPI_INTEL_MATCH=m +CONFIG_SND_SOC_INTEL_MACH=y +CONFIG_SND_SOC_INTEL_HASWELL_MACH=m +CONFIG_SND_SOC_INTEL_BDW_RT5677_MACH=m +CONFIG_SND_SOC_INTEL_BROADWELL_MACH=m +CONFIG_SND_SOC_INTEL_BYTCR_RT5640_MACH=m +CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=m 
+CONFIG_SND_SOC_INTEL_CHT_BSW_RT5672_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_RT5645_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_MAX98090_TI_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_NAU8824_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_DA7213_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_ES8316_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_NOCODEC_MACH=m +CONFIG_SND_SOC_INTEL_SKL_RT286_MACH=m +CONFIG_SND_SOC_INTEL_SKL_NAU88L25_SSM4567_MACH=m +CONFIG_SND_SOC_INTEL_SKL_NAU88L25_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_BXT_RT298_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5663_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5663_RT5514_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98357A_MACH=m +# CONFIG_SND_SOC_INTEL_GLK_RT5682_MAX98357A_MACH is not set + +# +# STMicroelectronics STM32 SOC audio support +# +CONFIG_SND_SOC_XTFPGA_I2S=m +CONFIG_ZX_TDM=m +CONFIG_SND_SOC_I2C_AND_SPI=m + +# +# CODEC drivers +# +# CONFIG_SND_SOC_AC97_CODEC is not set +CONFIG_SND_SOC_ADAU_UTILS=m +CONFIG_SND_SOC_ADAU1701=m +CONFIG_SND_SOC_ADAU17X1=m +CONFIG_SND_SOC_ADAU1761=m +CONFIG_SND_SOC_ADAU1761_I2C=m +CONFIG_SND_SOC_ADAU1761_SPI=m +CONFIG_SND_SOC_ADAU7002=m +CONFIG_SND_SOC_AK4104=m +CONFIG_SND_SOC_AK4458=m +CONFIG_SND_SOC_AK4554=m +CONFIG_SND_SOC_AK4613=m +CONFIG_SND_SOC_AK4642=m +CONFIG_SND_SOC_AK5386=m +CONFIG_SND_SOC_AK5558=m +CONFIG_SND_SOC_ALC5623=m +CONFIG_SND_SOC_BD28623=m +# CONFIG_SND_SOC_BT_SCO is not set +CONFIG_SND_SOC_CS35L32=m +CONFIG_SND_SOC_CS35L33=m +CONFIG_SND_SOC_CS35L34=m +CONFIG_SND_SOC_CS35L35=m +CONFIG_SND_SOC_CS42L42=m +CONFIG_SND_SOC_CS42L51=m +CONFIG_SND_SOC_CS42L51_I2C=m +CONFIG_SND_SOC_CS42L52=m +CONFIG_SND_SOC_CS42L56=m +CONFIG_SND_SOC_CS42L73=m +CONFIG_SND_SOC_CS4265=m +CONFIG_SND_SOC_CS4270=m +CONFIG_SND_SOC_CS4271=m +CONFIG_SND_SOC_CS4271_I2C=m +CONFIG_SND_SOC_CS4271_SPI=m +CONFIG_SND_SOC_CS42XX8=m +CONFIG_SND_SOC_CS42XX8_I2C=m +CONFIG_SND_SOC_CS43130=m +CONFIG_SND_SOC_CS4349=m +CONFIG_SND_SOC_CS53L30=m +CONFIG_SND_SOC_DA7213=m +CONFIG_SND_SOC_DA7219=m +CONFIG_SND_SOC_DMIC=m +CONFIG_SND_SOC_HDMI_CODEC=m +CONFIG_SND_SOC_ES7134=m +# CONFIG_SND_SOC_ES7241 is not set +CONFIG_SND_SOC_ES8316=m +CONFIG_SND_SOC_ES8328=m +CONFIG_SND_SOC_ES8328_I2C=m +CONFIG_SND_SOC_ES8328_SPI=m +CONFIG_SND_SOC_GTM601=m +CONFIG_SND_SOC_HDAC_HDMI=m +CONFIG_SND_SOC_INNO_RK3036=m +CONFIG_SND_SOC_MAX98090=m +CONFIG_SND_SOC_MAX98357A=m +CONFIG_SND_SOC_MAX98504=m +CONFIG_SND_SOC_MAX9867=m +CONFIG_SND_SOC_MAX98927=m +CONFIG_SND_SOC_MAX98373=m +CONFIG_SND_SOC_MAX9860=m +CONFIG_SND_SOC_MSM8916_WCD_ANALOG=m +CONFIG_SND_SOC_MSM8916_WCD_DIGITAL=m +CONFIG_SND_SOC_PCM1681=m +CONFIG_SND_SOC_PCM1789=m +CONFIG_SND_SOC_PCM1789_I2C=m +CONFIG_SND_SOC_PCM179X=m +CONFIG_SND_SOC_PCM179X_I2C=m +CONFIG_SND_SOC_PCM179X_SPI=m +CONFIG_SND_SOC_PCM186X=m +CONFIG_SND_SOC_PCM186X_I2C=m +CONFIG_SND_SOC_PCM186X_SPI=m +CONFIG_SND_SOC_PCM3168A=m +CONFIG_SND_SOC_PCM3168A_I2C=m +CONFIG_SND_SOC_PCM3168A_SPI=m +CONFIG_SND_SOC_PCM512x=m +CONFIG_SND_SOC_PCM512x_I2C=m +CONFIG_SND_SOC_PCM512x_SPI=m +CONFIG_SND_SOC_RL6231=m +CONFIG_SND_SOC_RL6347A=m +CONFIG_SND_SOC_RT286=m +CONFIG_SND_SOC_RT298=m +CONFIG_SND_SOC_RT5514=m +CONFIG_SND_SOC_RT5514_SPI=m +CONFIG_SND_SOC_RT5616=m +CONFIG_SND_SOC_RT5631=m +CONFIG_SND_SOC_RT5640=m +CONFIG_SND_SOC_RT5645=m +CONFIG_SND_SOC_RT5651=m +CONFIG_SND_SOC_RT5663=m +CONFIG_SND_SOC_RT5670=m +CONFIG_SND_SOC_RT5677=m +CONFIG_SND_SOC_RT5677_SPI=m +CONFIG_SND_SOC_SGTL5000=m +CONFIG_SND_SOC_SI476X=m +CONFIG_SND_SOC_SIGMADSP=m +CONFIG_SND_SOC_SIGMADSP_I2C=m +CONFIG_SND_SOC_SIGMADSP_REGMAP=m
+# CONFIG_SND_SOC_SIMPLE_AMPLIFIER is not set +CONFIG_SND_SOC_SIRF_AUDIO_CODEC=m +CONFIG_SND_SOC_SPDIF=m +CONFIG_SND_SOC_SSM2305=m +CONFIG_SND_SOC_SSM2602=m +CONFIG_SND_SOC_SSM2602_SPI=m +CONFIG_SND_SOC_SSM2602_I2C=m +CONFIG_SND_SOC_SSM4567=m +CONFIG_SND_SOC_STA32X=m +CONFIG_SND_SOC_STA350=m +CONFIG_SND_SOC_STI_SAS=m +CONFIG_SND_SOC_TAS2552=m +CONFIG_SND_SOC_TAS5086=m +CONFIG_SND_SOC_TAS571X=m +CONFIG_SND_SOC_TAS5720=m +CONFIG_SND_SOC_TAS6424=m +CONFIG_SND_SOC_TDA7419=m +CONFIG_SND_SOC_TFA9879=m +CONFIG_SND_SOC_TLV320AIC23=m +CONFIG_SND_SOC_TLV320AIC23_I2C=m +CONFIG_SND_SOC_TLV320AIC23_SPI=m +CONFIG_SND_SOC_TLV320AIC31XX=m +CONFIG_SND_SOC_TLV320AIC32X4=m +CONFIG_SND_SOC_TLV320AIC32X4_I2C=m +CONFIG_SND_SOC_TLV320AIC32X4_SPI=m +CONFIG_SND_SOC_TLV320AIC3X=m +CONFIG_SND_SOC_TS3A227E=m +CONFIG_SND_SOC_TSCS42XX=m +CONFIG_SND_SOC_TSCS454=m +CONFIG_SND_SOC_WM8510=m +CONFIG_SND_SOC_WM8523=m +CONFIG_SND_SOC_WM8524=m +CONFIG_SND_SOC_WM8580=m +CONFIG_SND_SOC_WM8711=m +CONFIG_SND_SOC_WM8728=m +CONFIG_SND_SOC_WM8731=m +CONFIG_SND_SOC_WM8737=m +CONFIG_SND_SOC_WM8741=m +CONFIG_SND_SOC_WM8750=m +CONFIG_SND_SOC_WM8753=m +CONFIG_SND_SOC_WM8770=m +CONFIG_SND_SOC_WM8776=m +CONFIG_SND_SOC_WM8782=m +CONFIG_SND_SOC_WM8804=m +CONFIG_SND_SOC_WM8804_I2C=m +CONFIG_SND_SOC_WM8804_SPI=m +CONFIG_SND_SOC_WM8903=m +CONFIG_SND_SOC_WM8960=m +CONFIG_SND_SOC_WM8962=m +CONFIG_SND_SOC_WM8974=m +CONFIG_SND_SOC_WM8978=m +CONFIG_SND_SOC_WM8985=m +CONFIG_SND_SOC_ZX_AUD96P22=m +CONFIG_SND_SOC_MAX9759=m +CONFIG_SND_SOC_MT6351=m +CONFIG_SND_SOC_NAU8540=m +CONFIG_SND_SOC_NAU8810=m +CONFIG_SND_SOC_NAU8824=m +CONFIG_SND_SOC_NAU8825=m +CONFIG_SND_SOC_TPA6130A2=m +CONFIG_SND_SIMPLE_CARD_UTILS=m +CONFIG_SND_SIMPLE_CARD=m +CONFIG_SND_X86=y +CONFIG_HDMI_LPE_AUDIO=m +CONFIG_SND_SYNTH_EMUX=m +CONFIG_AC97_BUS=m + +# +# HID support +# +CONFIG_HID=m +CONFIG_HID_BATTERY_STRENGTH=y +CONFIG_HIDRAW=y +CONFIG_UHID=m +CONFIG_HID_GENERIC=m + +# +# Special HID drivers +# +CONFIG_HID_A4TECH=m +CONFIG_HID_ACCUTOUCH=m +CONFIG_HID_ACRUX=m +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=m +CONFIG_HID_APPLEIR=m +CONFIG_HID_ASUS=m +CONFIG_HID_AUREAL=m +CONFIG_HID_BELKIN=m +CONFIG_HID_BETOP_FF=m +CONFIG_HID_CHERRY=m +CONFIG_HID_CHICONY=m +CONFIG_HID_CORSAIR=m +# CONFIG_HID_COUGAR is not set +CONFIG_HID_PRODIKEYS=m +CONFIG_HID_CMEDIA=m +CONFIG_HID_CP2112=m +CONFIG_HID_CYPRESS=m +CONFIG_HID_DRAGONRISE=m +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=m +CONFIG_HID_ELAN=m +CONFIG_HID_ELECOM=m +CONFIG_HID_ELO=m +CONFIG_HID_EZKEY=m +CONFIG_HID_GEMBIRD=m +CONFIG_HID_GFRM=m +CONFIG_HID_HOLTEK=m +CONFIG_HOLTEK_FF=y +CONFIG_HID_GOOGLE_HAMMER=m +CONFIG_HID_GT683R=m +CONFIG_HID_KEYTOUCH=m +CONFIG_HID_KYE=m +CONFIG_HID_UCLOGIC=m +CONFIG_HID_WALTOP=m +CONFIG_HID_GYRATION=m +CONFIG_HID_ICADE=m +CONFIG_HID_ITE=m +CONFIG_HID_JABRA=m +CONFIG_HID_TWINHAN=m +CONFIG_HID_KENSINGTON=m +CONFIG_HID_LCPOWER=m +CONFIG_HID_LED=m +CONFIG_HID_LENOVO=m +CONFIG_HID_LOGITECH=m +CONFIG_HID_LOGITECH_DJ=m +CONFIG_HID_LOGITECH_HIDPP=m +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_LOGIWHEELS_FF=y +CONFIG_HID_MAGICMOUSE=m +CONFIG_HID_MAYFLASH=m +CONFIG_HID_REDRAGON=m +CONFIG_HID_MICROSOFT=m +CONFIG_HID_MONTEREY=m +CONFIG_HID_MULTITOUCH=m +CONFIG_HID_NTI=m +CONFIG_HID_NTRIG=m +CONFIG_HID_ORTEK=m +CONFIG_HID_PANTHERLORD=m +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PENMOUNT=m +CONFIG_HID_PETALYNX=m +CONFIG_HID_PICOLCD=m +CONFIG_HID_PICOLCD_FB=y +CONFIG_HID_PICOLCD_BACKLIGHT=y +CONFIG_HID_PICOLCD_LCD=y +CONFIG_HID_PICOLCD_LEDS=y +CONFIG_HID_PICOLCD_CIR=y +CONFIG_HID_PLANTRONICS=m +CONFIG_HID_PRIMAX=m +CONFIG_HID_RETRODE=m
+CONFIG_HID_ROCCAT=m +CONFIG_HID_SAITEK=m +CONFIG_HID_SAMSUNG=m +CONFIG_HID_SONY=m +CONFIG_SONY_FF=y +CONFIG_HID_SPEEDLINK=m +CONFIG_HID_STEAM=m +CONFIG_HID_STEELSERIES=m +CONFIG_HID_SUNPLUS=m +CONFIG_HID_RMI=m +CONFIG_HID_GREENASIA=m +CONFIG_GREENASIA_FF=y +CONFIG_HID_HYPERV_MOUSE=m +CONFIG_HID_SMARTJOYPLUS=m +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=m +CONFIG_HID_TOPSEED=m +CONFIG_HID_THINGM=m +CONFIG_HID_THRUSTMASTER=m +CONFIG_THRUSTMASTER_FF=y +CONFIG_HID_UDRAW_PS3=m +CONFIG_HID_WACOM=m +CONFIG_HID_WIIMOTE=m +CONFIG_HID_XINMO=m +CONFIG_HID_ZEROPLUS=m +CONFIG_ZEROPLUS_FF=y +CONFIG_HID_ZYDACRON=m +CONFIG_HID_SENSOR_HUB=m +CONFIG_HID_SENSOR_CUSTOM_SENSOR=m +CONFIG_HID_ALPS=m + +# +# USB HID support +# +CONFIG_USB_HID=m +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y + +# +# I2C HID support +# +CONFIG_I2C_HID=m + +# +# Intel ISH HID support +# +CONFIG_INTEL_ISH_HID=m +CONFIG_USB_OHCI_LITTLE_ENDIAN=y +CONFIG_USB_SUPPORT=y +CONFIG_USB_COMMON=y +CONFIG_USB_ARCH_HAS_HCD=y +CONFIG_USB=m +CONFIG_USB_PCI=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y + +# +# Miscellaneous USB options +# +CONFIG_USB_DEFAULT_PERSIST=y +CONFIG_USB_DYNAMIC_MINORS=y +CONFIG_USB_OTG=y +# CONFIG_USB_OTG_WHITELIST is not set +# CONFIG_USB_OTG_BLACKLIST_HUB is not set +CONFIG_USB_OTG_FSM=m +CONFIG_USB_LEDS_TRIGGER_USBPORT=m +CONFIG_USB_MON=m +CONFIG_USB_WUSB=m +CONFIG_USB_WUSB_CBAF=m +# CONFIG_USB_WUSB_CBAF_DEBUG is not set + +# +# USB Host Controller Drivers +# +CONFIG_USB_C67X00_HCD=m +CONFIG_USB_XHCI_HCD=m +CONFIG_USB_XHCI_DBGCAP=y +CONFIG_USB_XHCI_PCI=m +CONFIG_USB_XHCI_PLATFORM=m +CONFIG_USB_EHCI_HCD=m +CONFIG_USB_EHCI_ROOT_HUB_TT=y +CONFIG_USB_EHCI_TT_NEWSCHED=y +CONFIG_USB_EHCI_PCI=m +CONFIG_USB_EHCI_HCD_PLATFORM=m +CONFIG_USB_OXU210HP_HCD=m +CONFIG_USB_ISP116X_HCD=m +CONFIG_USB_FOTG210_HCD=m +CONFIG_USB_MAX3421_HCD=m +CONFIG_USB_OHCI_HCD=m +CONFIG_USB_OHCI_HCD_PCI=m +CONFIG_USB_OHCI_HCD_SSB=y +CONFIG_USB_OHCI_HCD_PLATFORM=m +CONFIG_USB_UHCI_HCD=m +CONFIG_USB_U132_HCD=m +CONFIG_USB_SL811_HCD=m +# CONFIG_USB_SL811_HCD_ISO is not set +CONFIG_USB_SL811_CS=m +CONFIG_USB_R8A66597_HCD=m +CONFIG_USB_WHCI_HCD=m +CONFIG_USB_HWA_HCD=m +CONFIG_USB_HCD_BCMA=m +CONFIG_USB_HCD_SSB=m +# CONFIG_USB_HCD_TEST_MODE is not set + +# +# USB Device Class drivers +# +CONFIG_USB_ACM=m +CONFIG_USB_PRINTER=m +CONFIG_USB_WDM=m +CONFIG_USB_TMC=m + +# +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may +# + +# +# also be needed; see USB_STORAGE Help for more info +# +CONFIG_USB_STORAGE=m +# CONFIG_USB_STORAGE_DEBUG is not set +CONFIG_USB_STORAGE_REALTEK=m +CONFIG_REALTEK_AUTOPM=y +CONFIG_USB_STORAGE_DATAFAB=m +CONFIG_USB_STORAGE_FREECOM=m +CONFIG_USB_STORAGE_ISD200=m +CONFIG_USB_STORAGE_USBAT=m +CONFIG_USB_STORAGE_SDDR09=m +CONFIG_USB_STORAGE_SDDR55=m +CONFIG_USB_STORAGE_JUMPSHOT=m +CONFIG_USB_STORAGE_ALAUDA=m +CONFIG_USB_STORAGE_ONETOUCH=m +CONFIG_USB_STORAGE_KARMA=m +CONFIG_USB_STORAGE_CYPRESS_ATACB=m +CONFIG_USB_STORAGE_ENE_UB6250=m +CONFIG_USB_UAS=m + +# +# USB Imaging devices +# +CONFIG_USB_MDC800=m +CONFIG_USB_MICROTEK=m +CONFIG_USBIP_CORE=m +CONFIG_USBIP_VHCI_HCD=m +CONFIG_USBIP_VHCI_HC_PORTS=8 +CONFIG_USBIP_VHCI_NR_HCS=1 +CONFIG_USBIP_HOST=m +CONFIG_USBIP_VUDC=m +# CONFIG_USBIP_DEBUG is not set +CONFIG_USB_MUSB_HDRC=m +# CONFIG_USB_MUSB_HOST is not set +# CONFIG_USB_MUSB_GADGET is not set +CONFIG_USB_MUSB_DUAL_ROLE=y + +# +# Platform Glue Layer +# + +# +# MUSB DMA mode +# +CONFIG_MUSB_PIO_ONLY=y +CONFIG_USB_DWC3=m +# CONFIG_USB_DWC3_ULPI is not set +# CONFIG_USB_DWC3_HOST is not set +# CONFIG_USB_DWC3_GADGET is not set +CONFIG_USB_DWC3_DUAL_ROLE=y + 
+# +# Platform Glue Driver Support +# +CONFIG_USB_DWC3_PCI=m +CONFIG_USB_DWC3_HAPS=m +CONFIG_USB_DWC2=m +# CONFIG_USB_DWC2_HOST is not set + +# +# Gadget/Dual-role mode requires USB Gadget support to be enabled +# +# CONFIG_USB_DWC2_PERIPHERAL is not set +CONFIG_USB_DWC2_DUAL_ROLE=y +CONFIG_USB_DWC2_PCI=m +# CONFIG_USB_DWC2_DEBUG is not set +# CONFIG_USB_DWC2_TRACK_MISSED_SOFS is not set +CONFIG_USB_CHIPIDEA=m +CONFIG_USB_CHIPIDEA_PCI=m +CONFIG_USB_CHIPIDEA_UDC=y +CONFIG_USB_CHIPIDEA_HOST=y +CONFIG_USB_ISP1760=m +CONFIG_USB_ISP1760_HCD=y +CONFIG_USB_ISP1761_UDC=y +# CONFIG_USB_ISP1760_HOST_ROLE is not set +# CONFIG_USB_ISP1760_GADGET_ROLE is not set +CONFIG_USB_ISP1760_DUAL_ROLE=y + +# +# USB port drivers +# +CONFIG_USB_USS720=m +CONFIG_USB_SERIAL=m +CONFIG_USB_SERIAL_GENERIC=y +CONFIG_USB_SERIAL_SIMPLE=m +CONFIG_USB_SERIAL_AIRCABLE=m +CONFIG_USB_SERIAL_ARK3116=m +CONFIG_USB_SERIAL_BELKIN=m +CONFIG_USB_SERIAL_CH341=m +CONFIG_USB_SERIAL_WHITEHEAT=m +CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m +CONFIG_USB_SERIAL_CP210X=m +CONFIG_USB_SERIAL_CYPRESS_M8=m +CONFIG_USB_SERIAL_EMPEG=m +CONFIG_USB_SERIAL_FTDI_SIO=m +CONFIG_USB_SERIAL_VISOR=m +CONFIG_USB_SERIAL_IPAQ=m +CONFIG_USB_SERIAL_IR=m +CONFIG_USB_SERIAL_EDGEPORT=m +CONFIG_USB_SERIAL_EDGEPORT_TI=m +CONFIG_USB_SERIAL_F81232=m +CONFIG_USB_SERIAL_F8153X=m +CONFIG_USB_SERIAL_GARMIN=m +CONFIG_USB_SERIAL_IPW=m +CONFIG_USB_SERIAL_IUU=m +CONFIG_USB_SERIAL_KEYSPAN_PDA=m +CONFIG_USB_SERIAL_KEYSPAN=m +CONFIG_USB_SERIAL_KLSI=m +CONFIG_USB_SERIAL_KOBIL_SCT=m +CONFIG_USB_SERIAL_MCT_U232=m +CONFIG_USB_SERIAL_METRO=m +CONFIG_USB_SERIAL_MOS7720=m +CONFIG_USB_SERIAL_MOS7715_PARPORT=y +CONFIG_USB_SERIAL_MOS7840=m +CONFIG_USB_SERIAL_MXUPORT=m +CONFIG_USB_SERIAL_NAVMAN=m +CONFIG_USB_SERIAL_PL2303=m +CONFIG_USB_SERIAL_OTI6858=m +CONFIG_USB_SERIAL_QCAUX=m +CONFIG_USB_SERIAL_QUALCOMM=m +CONFIG_USB_SERIAL_SPCP8X5=m +CONFIG_USB_SERIAL_SAFE=m +CONFIG_USB_SERIAL_SAFE_PADDED=y +CONFIG_USB_SERIAL_SIERRAWIRELESS=m +CONFIG_USB_SERIAL_SYMBOL=m +CONFIG_USB_SERIAL_TI=m +CONFIG_USB_SERIAL_CYBERJACK=m +CONFIG_USB_SERIAL_XIRCOM=m +CONFIG_USB_SERIAL_WWAN=m +CONFIG_USB_SERIAL_OPTION=m +CONFIG_USB_SERIAL_OMNINET=m +CONFIG_USB_SERIAL_OPTICON=m +CONFIG_USB_SERIAL_XSENS_MT=m +CONFIG_USB_SERIAL_WISHBONE=m +CONFIG_USB_SERIAL_SSU100=m +CONFIG_USB_SERIAL_QT2=m +# CONFIG_USB_SERIAL_UPD78F0730 is not set +# CONFIG_USB_SERIAL_DEBUG is not set + +# +# USB Miscellaneous drivers +# +CONFIG_USB_EMI62=m +CONFIG_USB_EMI26=m +CONFIG_USB_ADUTUX=m +CONFIG_USB_SEVSEG=m +CONFIG_USB_RIO500=m +CONFIG_USB_LEGOTOWER=m +CONFIG_USB_LCD=m +CONFIG_USB_CYPRESS_CY7C63=m +CONFIG_USB_CYTHERM=m +CONFIG_USB_IDMOUSE=m +CONFIG_USB_FTDI_ELAN=m +CONFIG_USB_APPLEDISPLAY=m +CONFIG_USB_SISUSBVGA=m +CONFIG_USB_SISUSBVGA_CON=y +CONFIG_USB_LD=m +CONFIG_USB_TRANCEVIBRATOR=m +CONFIG_USB_IOWARRIOR=m +CONFIG_USB_TEST=m +CONFIG_USB_EHSET_TEST_FIXTURE=m +CONFIG_USB_ISIGHTFW=m +CONFIG_USB_YUREX=m +CONFIG_USB_EZUSB_FX2=m +CONFIG_USB_HUB_USB251XB=m +CONFIG_USB_HSIC_USB3503=m +CONFIG_USB_HSIC_USB4604=m +CONFIG_USB_LINK_LAYER_TEST=m +CONFIG_USB_CHAOSKEY=m +CONFIG_USB_ATM=m +CONFIG_USB_SPEEDTOUCH=m +CONFIG_USB_CXACRU=m +CONFIG_USB_UEAGLEATM=m +CONFIG_USB_XUSBATM=m + +# +# USB Physical Layer drivers +# +CONFIG_USB_PHY=y +CONFIG_NOP_USB_XCEIV=m +CONFIG_USB_GPIO_VBUS=m +CONFIG_TAHVO_USB=m +CONFIG_TAHVO_USB_HOST_BY_DEFAULT=y +CONFIG_USB_ISP1301=m +CONFIG_USB_GADGET=m +# CONFIG_USB_GADGET_DEBUG is not set +# CONFIG_USB_GADGET_DEBUG_FILES is not set +# CONFIG_USB_GADGET_DEBUG_FS is not set +CONFIG_USB_GADGET_VBUS_DRAW=2 
+CONFIG_USB_GADGET_STORAGE_NUM_BUFFERS=2 +CONFIG_U_SERIAL_CONSOLE=y + +# +# USB Peripheral Controller +# +CONFIG_USB_FOTG210_UDC=m +CONFIG_USB_GR_UDC=m +CONFIG_USB_R8A66597=m +CONFIG_USB_PXA27X=m +CONFIG_USB_MV_UDC=m +CONFIG_USB_MV_U3D=m +CONFIG_USB_SNP_CORE=m +CONFIG_USB_M66592=m +CONFIG_USB_BDC_UDC=m + +# +# Platform Support +# +CONFIG_USB_BDC_PCI=m +CONFIG_USB_AMD5536UDC=m +CONFIG_USB_NET2272=m +CONFIG_USB_NET2272_DMA=y +CONFIG_USB_NET2280=m +CONFIG_USB_GOKU=m +CONFIG_USB_EG20T=m +# CONFIG_USB_DUMMY_HCD is not set +CONFIG_USB_LIBCOMPOSITE=m +CONFIG_USB_F_ACM=m +CONFIG_USB_F_SS_LB=m +CONFIG_USB_U_SERIAL=m +CONFIG_USB_U_ETHER=m +CONFIG_USB_U_AUDIO=m +CONFIG_USB_F_SERIAL=m +CONFIG_USB_F_OBEX=m +CONFIG_USB_F_NCM=m +CONFIG_USB_F_ECM=m +CONFIG_USB_F_PHONET=m +CONFIG_USB_F_EEM=m +CONFIG_USB_F_SUBSET=m +CONFIG_USB_F_RNDIS=m +CONFIG_USB_F_MASS_STORAGE=m +CONFIG_USB_F_FS=m +CONFIG_USB_F_UAC1=m +CONFIG_USB_F_UAC2=m +CONFIG_USB_F_UVC=m +CONFIG_USB_F_MIDI=m +CONFIG_USB_F_HID=m +CONFIG_USB_F_PRINTER=m +CONFIG_USB_F_TCM=m +CONFIG_USB_CONFIGFS=m +CONFIG_USB_CONFIGFS_SERIAL=y +CONFIG_USB_CONFIGFS_ACM=y +CONFIG_USB_CONFIGFS_OBEX=y +CONFIG_USB_CONFIGFS_NCM=y +CONFIG_USB_CONFIGFS_ECM=y +CONFIG_USB_CONFIGFS_ECM_SUBSET=y +CONFIG_USB_CONFIGFS_RNDIS=y +CONFIG_USB_CONFIGFS_EEM=y +CONFIG_USB_CONFIGFS_PHONET=y +CONFIG_USB_CONFIGFS_MASS_STORAGE=y +CONFIG_USB_CONFIGFS_F_LB_SS=y +CONFIG_USB_CONFIGFS_F_FS=y +CONFIG_USB_CONFIGFS_F_UAC1=y +# CONFIG_USB_CONFIGFS_F_UAC1_LEGACY is not set +CONFIG_USB_CONFIGFS_F_UAC2=y +CONFIG_USB_CONFIGFS_F_MIDI=y +CONFIG_USB_CONFIGFS_F_HID=y +CONFIG_USB_CONFIGFS_F_UVC=y +CONFIG_USB_CONFIGFS_F_PRINTER=y +CONFIG_USB_CONFIGFS_F_TCM=y +CONFIG_USB_ZERO=m +CONFIG_USB_ZERO_HNPTEST=y +CONFIG_USB_AUDIO=m +CONFIG_GADGET_UAC1=y +# CONFIG_GADGET_UAC1_LEGACY is not set +CONFIG_USB_ETH=m +CONFIG_USB_ETH_RNDIS=y +CONFIG_USB_ETH_EEM=y +CONFIG_USB_G_NCM=m +CONFIG_USB_GADGETFS=m +CONFIG_USB_FUNCTIONFS=m +CONFIG_USB_FUNCTIONFS_ETH=y +CONFIG_USB_FUNCTIONFS_RNDIS=y +CONFIG_USB_FUNCTIONFS_GENERIC=y +CONFIG_USB_MASS_STORAGE=m +CONFIG_USB_GADGET_TARGET=m +CONFIG_USB_G_SERIAL=m +CONFIG_USB_MIDI_GADGET=m +CONFIG_USB_G_PRINTER=m +CONFIG_USB_CDC_COMPOSITE=m +CONFIG_USB_G_NOKIA=m +CONFIG_USB_G_ACM_MS=m +CONFIG_USB_G_MULTI=m +CONFIG_USB_G_MULTI_RNDIS=y +CONFIG_USB_G_MULTI_CDC=y +CONFIG_USB_G_HID=m +# CONFIG_USB_G_DBGP is not set +CONFIG_USB_G_WEBCAM=m +CONFIG_TYPEC=m +CONFIG_TYPEC_TCPM=m +CONFIG_TYPEC_TCPCI=m +CONFIG_TYPEC_RT1711H=m +CONFIG_TYPEC_FUSB302=m +CONFIG_TYPEC_UCSI=m +CONFIG_UCSI_ACPI=m +CONFIG_TYPEC_TPS6598X=m + +# +# USB Type-C Multiplexer/DeMultiplexer Switch support +# +CONFIG_TYPEC_MUX_PI3USB30532=m + +# +# USB Type-C Alternate Mode drivers +# +# CONFIG_TYPEC_DP_ALTMODE is not set +CONFIG_USB_ROLE_SWITCH=m +CONFIG_USB_ROLES_INTEL_XHCI=m +CONFIG_USB_LED_TRIG=y +CONFIG_USB_ULPI_BUS=m +CONFIG_UWB=m +CONFIG_UWB_HWA=m +CONFIG_UWB_WHCI=m +CONFIG_UWB_I1480U=m +CONFIG_MMC=m +CONFIG_MMC_BLOCK=m +CONFIG_MMC_BLOCK_MINORS=8 +CONFIG_SDIO_UART=m +# CONFIG_MMC_TEST is not set + +# +# MMC/SD/SDIO Host Controller Drivers +# +# CONFIG_MMC_DEBUG is not set +CONFIG_MMC_SDHCI=m +CONFIG_MMC_SDHCI_PCI=m +CONFIG_MMC_RICOH_MMC=y +CONFIG_MMC_SDHCI_ACPI=m +CONFIG_MMC_SDHCI_PLTFM=m +CONFIG_MMC_SDHCI_F_SDH30=m +CONFIG_MMC_WBSD=m +CONFIG_MMC_TIFM_SD=m +CONFIG_MMC_SPI=m +CONFIG_MMC_SDRICOH_CS=m +CONFIG_MMC_CB710=m +CONFIG_MMC_VIA_SDMMC=m +CONFIG_MMC_VUB300=m +CONFIG_MMC_USHC=m +CONFIG_MMC_USDHI6ROL0=m +CONFIG_MMC_REALTEK_PCI=m +CONFIG_MMC_REALTEK_USB=m +CONFIG_MMC_CQHCI=m +CONFIG_MMC_TOSHIBA_PCI=m +CONFIG_MMC_MTK=m 
+CONFIG_MMC_SDHCI_XENON=m +CONFIG_MEMSTICK=m +# CONFIG_MEMSTICK_DEBUG is not set + +# +# MemoryStick drivers +# +# CONFIG_MEMSTICK_UNSAFE_RESUME is not set +CONFIG_MSPRO_BLOCK=m +CONFIG_MS_BLOCK=m + +# +# MemoryStick Host Controller Drivers +# +CONFIG_MEMSTICK_TIFM_MS=m +CONFIG_MEMSTICK_JMICRON_38X=m +CONFIG_MEMSTICK_R592=m +CONFIG_MEMSTICK_REALTEK_PCI=m +CONFIG_MEMSTICK_REALTEK_USB=m +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=y +CONFIG_LEDS_CLASS_FLASH=m +# CONFIG_LEDS_BRIGHTNESS_HW_CHANGED is not set + +# +# LED drivers +# +CONFIG_LEDS_APU=m +CONFIG_LEDS_AS3645A=m +CONFIG_LEDS_LM3530=m +CONFIG_LEDS_LM3533=m +CONFIG_LEDS_LM3642=m +CONFIG_LEDS_LM3601X=m +CONFIG_LEDS_MT6323=m +CONFIG_LEDS_PCA9532=m +CONFIG_LEDS_PCA9532_GPIO=y +CONFIG_LEDS_GPIO=m +CONFIG_LEDS_LP3944=m +CONFIG_LEDS_LP3952=m +CONFIG_LEDS_LP55XX_COMMON=m +CONFIG_LEDS_LP5521=m +CONFIG_LEDS_LP5523=m +CONFIG_LEDS_LP5562=m +CONFIG_LEDS_LP8501=m +CONFIG_LEDS_CLEVO_MAIL=m +CONFIG_LEDS_PCA955X=m +# CONFIG_LEDS_PCA955X_GPIO is not set +CONFIG_LEDS_PCA963X=m +CONFIG_LEDS_WM831X_STATUS=m +CONFIG_LEDS_DA9052=m +CONFIG_LEDS_DAC124S085=m +CONFIG_LEDS_PWM=m +CONFIG_LEDS_REGULATOR=m +CONFIG_LEDS_BD2802=m +CONFIG_LEDS_INTEL_SS4200=m +CONFIG_LEDS_LT3593=m +CONFIG_LEDS_MC13783=m +CONFIG_LEDS_TCA6507=m +CONFIG_LEDS_TLC591XX=m +CONFIG_LEDS_LM355x=m +CONFIG_LEDS_MENF21BMC=m + +# +# LED driver for blink(1) USB RGB LED is under Special HID drivers (HID_THINGM) +# +CONFIG_LEDS_BLINKM=m +CONFIG_LEDS_MLXCPLD=m +CONFIG_LEDS_MLXREG=m +CONFIG_LEDS_USER=m +CONFIG_LEDS_NIC78BX=m + +# +# LED Triggers +# +CONFIG_LEDS_TRIGGERS=y +CONFIG_LEDS_TRIGGER_TIMER=m +CONFIG_LEDS_TRIGGER_ONESHOT=m +CONFIG_LEDS_TRIGGER_DISK=y +# CONFIG_LEDS_TRIGGER_MTD is not set +CONFIG_LEDS_TRIGGER_HEARTBEAT=m +CONFIG_LEDS_TRIGGER_BACKLIGHT=m +CONFIG_LEDS_TRIGGER_CPU=y +CONFIG_LEDS_TRIGGER_ACTIVITY=m +CONFIG_LEDS_TRIGGER_GPIO=m +CONFIG_LEDS_TRIGGER_DEFAULT_ON=m + +# +# iptables trigger is under Netfilter config (LED target) +# +CONFIG_LEDS_TRIGGER_TRANSIENT=m +CONFIG_LEDS_TRIGGER_CAMERA=m +CONFIG_LEDS_TRIGGER_PANIC=y +CONFIG_LEDS_TRIGGER_NETDEV=m +CONFIG_ACCESSIBILITY=y +CONFIG_A11Y_BRAILLE_CONSOLE=y +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_USER_MAD=m +CONFIG_INFINIBAND_USER_ACCESS=m +# CONFIG_INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI is not set +CONFIG_INFINIBAND_USER_MEM=y +CONFIG_INFINIBAND_ON_DEMAND_PAGING=y +CONFIG_INFINIBAND_ADDR_TRANS=y +CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS=y +CONFIG_INFINIBAND_MTHCA=m +CONFIG_INFINIBAND_MTHCA_DEBUG=y +CONFIG_INFINIBAND_QIB=m +CONFIG_INFINIBAND_QIB_DCA=y +CONFIG_INFINIBAND_CXGB3=m +CONFIG_INFINIBAND_CXGB4=m +CONFIG_INFINIBAND_I40IW=m +CONFIG_MLX4_INFINIBAND=m +CONFIG_MLX5_INFINIBAND=m +CONFIG_INFINIBAND_NES=m +# CONFIG_INFINIBAND_NES_DEBUG is not set +CONFIG_INFINIBAND_OCRDMA=m +CONFIG_INFINIBAND_VMWARE_PVRDMA=m +CONFIG_INFINIBAND_USNIC=m +CONFIG_INFINIBAND_IPOIB=m +CONFIG_INFINIBAND_IPOIB_CM=y +CONFIG_INFINIBAND_IPOIB_DEBUG=y +# CONFIG_INFINIBAND_IPOIB_DEBUG_DATA is not set +CONFIG_INFINIBAND_SRP=m +CONFIG_INFINIBAND_SRPT=m +CONFIG_INFINIBAND_ISER=m +CONFIG_INFINIBAND_ISERT=m +CONFIG_INFINIBAND_OPA_VNIC=m +CONFIG_INFINIBAND_RDMAVT=m +CONFIG_RDMA_RXE=m +CONFIG_INFINIBAND_HFI1=m +# CONFIG_HFI1_DEBUG_SDMA_ORDER is not set +# CONFIG_SDMA_VERBOSITY is not set +CONFIG_INFINIBAND_QEDR=m +CONFIG_INFINIBAND_BNXT_RE=m +CONFIG_EDAC_ATOMIC_SCRUB=y +CONFIG_EDAC_SUPPORT=y +CONFIG_EDAC=y +CONFIG_EDAC_LEGACY_SYSFS=y +# CONFIG_EDAC_DEBUG is not set +CONFIG_EDAC_DECODE_MCE=m +CONFIG_EDAC_GHES=y +CONFIG_EDAC_AMD64=m +CONFIG_EDAC_AMD64_ERROR_INJECTION=y +CONFIG_EDAC_E752X=m 
+CONFIG_EDAC_I82975X=m +CONFIG_EDAC_I3000=m +CONFIG_EDAC_I3200=m +CONFIG_EDAC_IE31200=m +CONFIG_EDAC_X38=m +CONFIG_EDAC_I5400=m +CONFIG_EDAC_I7CORE=m +CONFIG_EDAC_I5000=m +CONFIG_EDAC_I5100=m +CONFIG_EDAC_I7300=m +CONFIG_EDAC_SBRIDGE=m +CONFIG_EDAC_SKX=m +CONFIG_EDAC_PND2=m +CONFIG_RTC_LIB=y +CONFIG_RTC_MC146818_LIB=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_HCTOSYS=y +CONFIG_RTC_HCTOSYS_DEVICE="rtc0" +CONFIG_RTC_SYSTOHC=y +CONFIG_RTC_SYSTOHC_DEVICE="rtc0" +# CONFIG_RTC_DEBUG is not set +CONFIG_RTC_NVMEM=y + +# +# RTC interfaces +# +CONFIG_RTC_INTF_SYSFS=y +CONFIG_RTC_INTF_PROC=y +CONFIG_RTC_INTF_DEV=y +CONFIG_RTC_INTF_DEV_UIE_EMUL=y +# CONFIG_RTC_DRV_TEST is not set + +# +# I2C RTC drivers +# +CONFIG_RTC_DRV_88PM80X=m +CONFIG_RTC_DRV_ABB5ZES3=m +CONFIG_RTC_DRV_ABX80X=m +CONFIG_RTC_DRV_DS1307=m +# CONFIG_RTC_DRV_DS1307_CENTURY is not set +CONFIG_RTC_DRV_DS1374=m +CONFIG_RTC_DRV_DS1374_WDT=y +CONFIG_RTC_DRV_DS1672=m +CONFIG_RTC_DRV_MAX6900=m +CONFIG_RTC_DRV_MAX8907=m +CONFIG_RTC_DRV_RS5C372=m +CONFIG_RTC_DRV_ISL1208=m +CONFIG_RTC_DRV_ISL12022=m +CONFIG_RTC_DRV_X1205=m +CONFIG_RTC_DRV_PCF8523=m +CONFIG_RTC_DRV_PCF85063=m +CONFIG_RTC_DRV_PCF85363=m +CONFIG_RTC_DRV_PCF8563=m +CONFIG_RTC_DRV_PCF8583=m +CONFIG_RTC_DRV_M41T80=m +CONFIG_RTC_DRV_M41T80_WDT=y +CONFIG_RTC_DRV_BQ32K=m +CONFIG_RTC_DRV_S35390A=m +CONFIG_RTC_DRV_FM3130=m +CONFIG_RTC_DRV_RX8010=m +CONFIG_RTC_DRV_RX8581=m +CONFIG_RTC_DRV_RX8025=m +CONFIG_RTC_DRV_EM3027=m +CONFIG_RTC_DRV_RV8803=m + +# +# SPI RTC drivers +# +CONFIG_RTC_DRV_M41T93=m +CONFIG_RTC_DRV_M41T94=m +CONFIG_RTC_DRV_DS1302=m +CONFIG_RTC_DRV_DS1305=m +CONFIG_RTC_DRV_DS1343=m +CONFIG_RTC_DRV_DS1347=m +CONFIG_RTC_DRV_DS1390=m +CONFIG_RTC_DRV_MAX6916=m +CONFIG_RTC_DRV_R9701=m +CONFIG_RTC_DRV_RX4581=m +CONFIG_RTC_DRV_RX6110=m +CONFIG_RTC_DRV_RS5C348=m +CONFIG_RTC_DRV_MAX6902=m +CONFIG_RTC_DRV_PCF2123=m +CONFIG_RTC_DRV_MCP795=m +CONFIG_RTC_I2C_AND_SPI=m + +# +# SPI and I2C RTC drivers +# +CONFIG_RTC_DRV_DS3232=m +CONFIG_RTC_DRV_DS3232_HWMON=y +CONFIG_RTC_DRV_PCF2127=m +CONFIG_RTC_DRV_RV3029C2=m +CONFIG_RTC_DRV_RV3029_HWMON=y + +# +# Platform RTC drivers +# +CONFIG_RTC_DRV_CMOS=y +CONFIG_RTC_DRV_DS1286=m +CONFIG_RTC_DRV_DS1511=m +CONFIG_RTC_DRV_DS1553=m +CONFIG_RTC_DRV_DS1685_FAMILY=m +CONFIG_RTC_DRV_DS1685=y +# CONFIG_RTC_DRV_DS1689 is not set +# CONFIG_RTC_DRV_DS17285 is not set +# CONFIG_RTC_DRV_DS17485 is not set +# CONFIG_RTC_DRV_DS17885 is not set +CONFIG_RTC_DS1685_PROC_REGS=y +CONFIG_RTC_DRV_DS1742=m +CONFIG_RTC_DRV_DS2404=m +CONFIG_RTC_DRV_DA9052=m +CONFIG_RTC_DRV_DA9063=m +CONFIG_RTC_DRV_STK17TA8=m +CONFIG_RTC_DRV_M48T86=m +CONFIG_RTC_DRV_M48T35=m +CONFIG_RTC_DRV_M48T59=m +CONFIG_RTC_DRV_MSM6242=m +CONFIG_RTC_DRV_BQ4802=m +CONFIG_RTC_DRV_RP5C01=m +CONFIG_RTC_DRV_V3020=m +CONFIG_RTC_DRV_WM831X=m +CONFIG_RTC_DRV_PCF50633=m +# CONFIG_RTC_DRV_CROS_EC is not set + +# +# on-CPU RTC drivers +# +CONFIG_RTC_DRV_FTRTC010=m +CONFIG_RTC_DRV_PCAP=m +CONFIG_RTC_DRV_MC13XXX=m +CONFIG_RTC_DRV_MT6397=m + +# +# HID Sensor RTC drivers +# +CONFIG_RTC_DRV_HID_SENSOR_TIME=m +CONFIG_DMADEVICES=y +# CONFIG_DMADEVICES_DEBUG is not set + +# +# DMA Devices +# +CONFIG_DMA_ENGINE=y +CONFIG_DMA_VIRTUAL_CHANNELS=y +CONFIG_DMA_ACPI=y +CONFIG_ALTERA_MSGDMA=m +CONFIG_INTEL_IDMA64=m +CONFIG_INTEL_IOATDMA=m +CONFIG_INTEL_MIC_X100_DMA=m +CONFIG_QCOM_HIDMA_MGMT=m +CONFIG_QCOM_HIDMA=m +CONFIG_DW_DMAC_CORE=y +CONFIG_DW_DMAC=m +CONFIG_DW_DMAC_PCI=y +CONFIG_HSU_DMA=y + +# +# DMA Clients +# +CONFIG_ASYNC_TX_DMA=y +# CONFIG_DMATEST is not set +CONFIG_DMA_ENGINE_RAID=y + +# +# DMABUF options +# +CONFIG_SYNC_FILE=y +# 
CONFIG_SW_SYNC is not set +CONFIG_DCA=m +CONFIG_AUXDISPLAY=y +CONFIG_HD44780=m +CONFIG_KS0108=m +CONFIG_KS0108_PORT=0x378 +CONFIG_KS0108_DELAY=2 +CONFIG_CFAG12864B=m +CONFIG_CFAG12864B_RATE=20 +CONFIG_IMG_ASCII_LCD=m +CONFIG_PANEL=m +CONFIG_PANEL_PARPORT=0 +CONFIG_PANEL_PROFILE=5 +# CONFIG_PANEL_CHANGE_MESSAGE is not set +CONFIG_CHARLCD=m +CONFIG_UIO=m +CONFIG_UIO_CIF=m +CONFIG_UIO_PDRV_GENIRQ=m +CONFIG_UIO_DMEM_GENIRQ=m +CONFIG_UIO_AEC=m +CONFIG_UIO_SERCOS3=m +CONFIG_UIO_PCI_GENERIC=m +CONFIG_UIO_NETX=m +CONFIG_UIO_PRUSS=m +CONFIG_UIO_MF624=m +CONFIG_UIO_HV_GENERIC=m +CONFIG_VFIO_IOMMU_TYPE1=m +CONFIG_VFIO_VIRQFD=m +CONFIG_VFIO=m +# CONFIG_VFIO_NOIOMMU is not set +CONFIG_VFIO_PCI=m +CONFIG_VFIO_PCI_VGA=y +CONFIG_VFIO_PCI_MMAP=y +CONFIG_VFIO_PCI_INTX=y +CONFIG_VFIO_PCI_IGD=y +CONFIG_VFIO_MDEV=m +CONFIG_VFIO_MDEV_DEVICE=m +CONFIG_IRQ_BYPASS_MANAGER=m +CONFIG_VIRT_DRIVERS=y +# CONFIG_VBOXGUEST is not set +CONFIG_VIRTIO=m +CONFIG_VIRTIO_MENU=y +CONFIG_VIRTIO_PCI=m +CONFIG_VIRTIO_PCI_LEGACY=y +CONFIG_VIRTIO_BALLOON=m +CONFIG_VIRTIO_INPUT=m +CONFIG_VIRTIO_MMIO=m +CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y + +# +# Microsoft Hyper-V guest support +# +CONFIG_HYPERV=m +CONFIG_HYPERV_TSCPAGE=y +CONFIG_HYPERV_UTILS=m +CONFIG_HYPERV_BALLOON=m +CONFIG_STAGING=y +CONFIG_PRISM2_USB=m +CONFIG_COMEDI=m +# CONFIG_COMEDI_DEBUG is not set +CONFIG_COMEDI_DEFAULT_BUF_SIZE_KB=2048 +CONFIG_COMEDI_DEFAULT_BUF_MAXSIZE_KB=20480 +CONFIG_COMEDI_MISC_DRIVERS=y +CONFIG_COMEDI_BOND=m +CONFIG_COMEDI_TEST=m +CONFIG_COMEDI_PARPORT=m +CONFIG_COMEDI_ISA_DRIVERS=y +CONFIG_COMEDI_PCL711=m +CONFIG_COMEDI_PCL724=m +CONFIG_COMEDI_PCL726=m +CONFIG_COMEDI_PCL730=m +CONFIG_COMEDI_PCL812=m +CONFIG_COMEDI_PCL816=m +CONFIG_COMEDI_PCL818=m +CONFIG_COMEDI_PCM3724=m +CONFIG_COMEDI_AMPLC_DIO200_ISA=m +CONFIG_COMEDI_AMPLC_PC236_ISA=m +CONFIG_COMEDI_AMPLC_PC263_ISA=m +CONFIG_COMEDI_RTI800=m +CONFIG_COMEDI_RTI802=m +CONFIG_COMEDI_DAC02=m +CONFIG_COMEDI_DAS16M1=m +CONFIG_COMEDI_DAS08_ISA=m +CONFIG_COMEDI_DAS16=m +CONFIG_COMEDI_DAS800=m +CONFIG_COMEDI_DAS1800=m +CONFIG_COMEDI_DAS6402=m +CONFIG_COMEDI_DT2801=m +CONFIG_COMEDI_DT2811=m +CONFIG_COMEDI_DT2814=m +CONFIG_COMEDI_DT2815=m +CONFIG_COMEDI_DT2817=m +CONFIG_COMEDI_DT282X=m +CONFIG_COMEDI_DMM32AT=m +CONFIG_COMEDI_FL512=m +CONFIG_COMEDI_AIO_AIO12_8=m +CONFIG_COMEDI_AIO_IIRO_16=m +CONFIG_COMEDI_II_PCI20KC=m +CONFIG_COMEDI_C6XDIGIO=m +CONFIG_COMEDI_MPC624=m +CONFIG_COMEDI_ADQ12B=m +CONFIG_COMEDI_NI_AT_A2150=m +CONFIG_COMEDI_NI_AT_AO=m +CONFIG_COMEDI_NI_ATMIO=m +CONFIG_COMEDI_NI_ATMIO16D=m +CONFIG_COMEDI_NI_LABPC_ISA=m +CONFIG_COMEDI_PCMAD=m +CONFIG_COMEDI_PCMDA12=m +CONFIG_COMEDI_PCMMIO=m +CONFIG_COMEDI_PCMUIO=m +CONFIG_COMEDI_MULTIQ3=m +CONFIG_COMEDI_S526=m +CONFIG_COMEDI_PCI_DRIVERS=m +CONFIG_COMEDI_8255_PCI=m +CONFIG_COMEDI_ADDI_WATCHDOG=m +CONFIG_COMEDI_ADDI_APCI_1032=m +CONFIG_COMEDI_ADDI_APCI_1500=m +CONFIG_COMEDI_ADDI_APCI_1516=m +CONFIG_COMEDI_ADDI_APCI_1564=m +CONFIG_COMEDI_ADDI_APCI_16XX=m +CONFIG_COMEDI_ADDI_APCI_2032=m +CONFIG_COMEDI_ADDI_APCI_2200=m +CONFIG_COMEDI_ADDI_APCI_3120=m +CONFIG_COMEDI_ADDI_APCI_3501=m +CONFIG_COMEDI_ADDI_APCI_3XXX=m +CONFIG_COMEDI_ADL_PCI6208=m +CONFIG_COMEDI_ADL_PCI7X3X=m +CONFIG_COMEDI_ADL_PCI8164=m +CONFIG_COMEDI_ADL_PCI9111=m +CONFIG_COMEDI_ADL_PCI9118=m +CONFIG_COMEDI_ADV_PCI1710=m +CONFIG_COMEDI_ADV_PCI1720=m +CONFIG_COMEDI_ADV_PCI1723=m +CONFIG_COMEDI_ADV_PCI1724=m +CONFIG_COMEDI_ADV_PCI1760=m +CONFIG_COMEDI_ADV_PCI_DIO=m +CONFIG_COMEDI_AMPLC_DIO200_PCI=m +CONFIG_COMEDI_AMPLC_PC236_PCI=m +CONFIG_COMEDI_AMPLC_PC263_PCI=m +CONFIG_COMEDI_AMPLC_PCI224=m 
+CONFIG_COMEDI_AMPLC_PCI230=m +CONFIG_COMEDI_CONTEC_PCI_DIO=m +CONFIG_COMEDI_DAS08_PCI=m +CONFIG_COMEDI_DT3000=m +CONFIG_COMEDI_DYNA_PCI10XX=m +CONFIG_COMEDI_GSC_HPDI=m +CONFIG_COMEDI_MF6X4=m +CONFIG_COMEDI_ICP_MULTI=m +CONFIG_COMEDI_DAQBOARD2000=m +CONFIG_COMEDI_JR3_PCI=m +CONFIG_COMEDI_KE_COUNTER=m +CONFIG_COMEDI_CB_PCIDAS64=m +CONFIG_COMEDI_CB_PCIDAS=m +CONFIG_COMEDI_CB_PCIDDA=m +CONFIG_COMEDI_CB_PCIMDAS=m +CONFIG_COMEDI_CB_PCIMDDA=m +CONFIG_COMEDI_ME4000=m +CONFIG_COMEDI_ME_DAQ=m +CONFIG_COMEDI_NI_6527=m +CONFIG_COMEDI_NI_65XX=m +CONFIG_COMEDI_NI_660X=m +CONFIG_COMEDI_NI_670X=m +CONFIG_COMEDI_NI_LABPC_PCI=m +CONFIG_COMEDI_NI_PCIDIO=m +CONFIG_COMEDI_NI_PCIMIO=m +CONFIG_COMEDI_RTD520=m +CONFIG_COMEDI_S626=m +CONFIG_COMEDI_MITE=m +CONFIG_COMEDI_NI_TIOCMD=m +CONFIG_COMEDI_PCMCIA_DRIVERS=m +CONFIG_COMEDI_CB_DAS16_CS=m +CONFIG_COMEDI_DAS08_CS=m +CONFIG_COMEDI_NI_DAQ_700_CS=m +CONFIG_COMEDI_NI_DAQ_DIO24_CS=m +CONFIG_COMEDI_NI_LABPC_CS=m +CONFIG_COMEDI_NI_MIO_CS=m +CONFIG_COMEDI_QUATECH_DAQP_CS=m +CONFIG_COMEDI_USB_DRIVERS=m +CONFIG_COMEDI_DT9812=m +CONFIG_COMEDI_NI_USB6501=m +CONFIG_COMEDI_USBDUX=m +CONFIG_COMEDI_USBDUXFAST=m +CONFIG_COMEDI_USBDUXSIGMA=m +CONFIG_COMEDI_VMK80XX=m +CONFIG_COMEDI_8254=m +CONFIG_COMEDI_8255=m +CONFIG_COMEDI_8255_SA=m +CONFIG_COMEDI_KCOMEDILIB=m +CONFIG_COMEDI_AMPLC_DIO200=m +CONFIG_COMEDI_AMPLC_PC236=m +CONFIG_COMEDI_DAS08=m +CONFIG_COMEDI_ISADMA=m +CONFIG_COMEDI_NI_LABPC=m +CONFIG_COMEDI_NI_LABPC_ISADMA=m +CONFIG_COMEDI_NI_TIO=m +CONFIG_RTL8192U=m +CONFIG_RTLLIB=m +CONFIG_RTLLIB_CRYPTO_CCMP=m +CONFIG_RTLLIB_CRYPTO_TKIP=m +CONFIG_RTLLIB_CRYPTO_WEP=m +CONFIG_RTL8192E=m +CONFIG_RTL8723BS=m +CONFIG_R8712U=m +CONFIG_R8188EU=m +CONFIG_88EU_AP_MODE=y +CONFIG_R8822BE=m +CONFIG_RTLWIFI_DEBUG_ST=y +CONFIG_RTS5208=m +CONFIG_VT6655=m +CONFIG_VT6656=m + +# +# IIO staging drivers +# + +# +# Accelerometers +# +CONFIG_ADIS16203=m +CONFIG_ADIS16240=m + +# +# Analog to digital converters +# +CONFIG_AD7606=m +CONFIG_AD7606_IFACE_PARALLEL=m +CONFIG_AD7606_IFACE_SPI=m +CONFIG_AD7780=m +CONFIG_AD7816=m +CONFIG_AD7192=m +CONFIG_AD7280=m + +# +# Analog digital bi-direction converters +# +CONFIG_ADT7316=m +CONFIG_ADT7316_SPI=m +CONFIG_ADT7316_I2C=m + +# +# Capacitance to digital converters +# +CONFIG_AD7150=m +CONFIG_AD7152=m +CONFIG_AD7746=m + +# +# Direct Digital Synthesis +# +CONFIG_AD9832=m +CONFIG_AD9834=m + +# +# Network Analyzer, Impedance Converters +# +CONFIG_AD5933=m + +# +# Active energy metering IC +# +CONFIG_ADE7854=m +CONFIG_ADE7854_I2C=m +CONFIG_ADE7854_SPI=m + +# +# Resolver to digital converters +# +CONFIG_AD2S90=m +CONFIG_AD2S1210=m +CONFIG_FB_SM750=m +CONFIG_FB_XGI=m + +# +# Speakup console speech +# +CONFIG_SPEAKUP=m +CONFIG_SPEAKUP_SYNTH_ACNTSA=m +CONFIG_SPEAKUP_SYNTH_APOLLO=m +CONFIG_SPEAKUP_SYNTH_AUDPTR=m +CONFIG_SPEAKUP_SYNTH_BNS=m +CONFIG_SPEAKUP_SYNTH_DECTLK=m +CONFIG_SPEAKUP_SYNTH_DECEXT=m +CONFIG_SPEAKUP_SYNTH_LTLK=m +CONFIG_SPEAKUP_SYNTH_SOFT=m +CONFIG_SPEAKUP_SYNTH_SPKOUT=m +CONFIG_SPEAKUP_SYNTH_TXPRT=m +# CONFIG_SPEAKUP_SYNTH_DUMMY is not set +CONFIG_STAGING_MEDIA=y +CONFIG_I2C_BCM2048=m +CONFIG_SOC_CAMERA_IMX074=m +CONFIG_SOC_CAMERA_MT9T031=m +CONFIG_VIDEO_ZORAN=m +CONFIG_VIDEO_ZORAN_DC30=m +CONFIG_VIDEO_ZORAN_ZR36060=m +CONFIG_VIDEO_ZORAN_BUZ=m +CONFIG_VIDEO_ZORAN_DC10=m +CONFIG_VIDEO_ZORAN_LML33=m +CONFIG_VIDEO_ZORAN_LML33R10=m +CONFIG_VIDEO_ZORAN_AVS6EYES=m + +# +# Android +# +CONFIG_LTE_GDM724X=m +CONFIG_FIREWIRE_SERIAL=m +CONFIG_FWTTY_MAX_TOTAL_PORTS=64 +CONFIG_FWTTY_MAX_CARD_PORTS=32 +CONFIG_MTD_SPINAND_MT29F=m +CONFIG_MTD_SPINAND_ONDIEECC=y 
+CONFIG_DGNC=m +CONFIG_GS_FPGABOOT=m +CONFIG_UNISYSSPAR=y +CONFIG_FB_TFT=m +CONFIG_FB_TFT_AGM1264K_FL=m +CONFIG_FB_TFT_BD663474=m +CONFIG_FB_TFT_HX8340BN=m +CONFIG_FB_TFT_HX8347D=m +CONFIG_FB_TFT_HX8353D=m +# CONFIG_FB_TFT_HX8357D is not set +# CONFIG_FB_TFT_ILI9163 is not set +CONFIG_FB_TFT_ILI9320=m +CONFIG_FB_TFT_ILI9325=m +CONFIG_FB_TFT_ILI9340=m +CONFIG_FB_TFT_ILI9341=m +CONFIG_FB_TFT_ILI9481=m +CONFIG_FB_TFT_ILI9486=m +CONFIG_FB_TFT_PCD8544=m +CONFIG_FB_TFT_RA8875=m +CONFIG_FB_TFT_S6D02A1=m +CONFIG_FB_TFT_S6D1121=m +CONFIG_FB_TFT_SH1106=m +CONFIG_FB_TFT_SSD1289=m +CONFIG_FB_TFT_SSD1305=m +CONFIG_FB_TFT_SSD1306=m +CONFIG_FB_TFT_SSD1331=m +CONFIG_FB_TFT_SSD1351=m +CONFIG_FB_TFT_ST7735R=m +CONFIG_FB_TFT_ST7789V=m +CONFIG_FB_TFT_TINYLCD=m +CONFIG_FB_TFT_TLS8204=m +CONFIG_FB_TFT_UC1611=m +CONFIG_FB_TFT_UC1701=m +CONFIG_FB_TFT_UPD161704=m +CONFIG_FB_TFT_WATTEROTT=m +CONFIG_FB_FLEX=m +CONFIG_FB_TFT_FBTFT_DEVICE=m +CONFIG_WILC1000=m +CONFIG_WILC1000_SDIO=m +CONFIG_WILC1000_SPI=m +# CONFIG_WILC1000_HW_OOB_INTR is not set +CONFIG_MOST=m +CONFIG_MOST_CDEV=m +CONFIG_MOST_NET=m +CONFIG_MOST_SOUND=m +CONFIG_MOST_VIDEO=m +CONFIG_MOST_I2C=m +CONFIG_MOST_USB=m +CONFIG_KS7010=m +CONFIG_GREYBUS=m +CONFIG_GREYBUS_ES2=m +CONFIG_GREYBUS_AUDIO=m +CONFIG_GREYBUS_BOOTROM=m +CONFIG_GREYBUS_FIRMWARE=m +CONFIG_GREYBUS_HID=m +CONFIG_GREYBUS_LIGHT=m +CONFIG_GREYBUS_LOG=m +CONFIG_GREYBUS_LOOPBACK=m +CONFIG_GREYBUS_POWER=m +CONFIG_GREYBUS_RAW=m +CONFIG_GREYBUS_VIBRATOR=m +CONFIG_GREYBUS_BRIDGED_PHY=m +CONFIG_GREYBUS_GPIO=m +CONFIG_GREYBUS_I2C=m +CONFIG_GREYBUS_PWM=m +CONFIG_GREYBUS_SDIO=m +CONFIG_GREYBUS_SPI=m +CONFIG_GREYBUS_UART=m +CONFIG_GREYBUS_USB=m +# CONFIG_DRM_VBOXVIDEO is not set +CONFIG_PI433=m +CONFIG_MTK_MMC=m +# CONFIG_MTK_AEE_KDUMP is not set +# CONFIG_MTK_MMC_CD_POLL is not set + +# +# Gasket devices +# +# CONFIG_STAGING_GASKET_FRAMEWORK is not set +# CONFIG_XIL_AXIS_FIFO is not set +# CONFIG_EROFS_FS is not set +CONFIG_X86_PLATFORM_DEVICES=y +CONFIG_ACER_WMI=m +CONFIG_ACER_WIRELESS=m +CONFIG_ACERHDF=m +CONFIG_ALIENWARE_WMI=m +CONFIG_ASUS_LAPTOP=m +CONFIG_DELL_SMBIOS=m +CONFIG_DELL_SMBIOS_WMI=y +CONFIG_DELL_SMBIOS_SMM=y +CONFIG_DELL_LAPTOP=m +CONFIG_DELL_WMI=m +CONFIG_DELL_WMI_DESCRIPTOR=m +CONFIG_DELL_WMI_AIO=m +CONFIG_DELL_WMI_LED=m +CONFIG_DELL_SMO8800=m +CONFIG_DELL_RBTN=m +CONFIG_FUJITSU_LAPTOP=m +CONFIG_FUJITSU_TABLET=m +CONFIG_AMILO_RFKILL=m +CONFIG_GPD_POCKET_FAN=m +CONFIG_HP_ACCEL=m +CONFIG_HP_WIRELESS=m +CONFIG_HP_WMI=m +CONFIG_MSI_LAPTOP=m +CONFIG_PANASONIC_LAPTOP=m +CONFIG_COMPAL_LAPTOP=m +CONFIG_SONY_LAPTOP=m +CONFIG_SONYPI_COMPAT=y +CONFIG_IDEAPAD_LAPTOP=m +CONFIG_SURFACE3_WMI=m +CONFIG_THINKPAD_ACPI=m +CONFIG_THINKPAD_ACPI_ALSA_SUPPORT=y +# CONFIG_THINKPAD_ACPI_DEBUGFACILITIES is not set +# CONFIG_THINKPAD_ACPI_DEBUG is not set +# CONFIG_THINKPAD_ACPI_UNSAFE_LEDS is not set +CONFIG_THINKPAD_ACPI_VIDEO=y +CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y +CONFIG_SENSORS_HDAPS=m +CONFIG_INTEL_MENLOW=m +CONFIG_EEEPC_LAPTOP=m +CONFIG_ASUS_WMI=m +CONFIG_ASUS_NB_WMI=m +CONFIG_EEEPC_WMI=m +CONFIG_ASUS_WIRELESS=m +CONFIG_ACPI_WMI=m +CONFIG_WMI_BMOF=m +CONFIG_INTEL_WMI_THUNDERBOLT=m +CONFIG_MSI_WMI=m +CONFIG_PEAQ_WMI=m +CONFIG_TOPSTAR_LAPTOP=m +CONFIG_ACPI_TOSHIBA=m +CONFIG_TOSHIBA_BT_RFKILL=m +CONFIG_TOSHIBA_HAPS=m +CONFIG_TOSHIBA_WMI=m +CONFIG_ACPI_CMPC=m +CONFIG_INTEL_CHT_INT33FE=m +CONFIG_INTEL_INT0002_VGPIO=m +CONFIG_INTEL_HID_EVENT=m +CONFIG_INTEL_VBTN=m +CONFIG_INTEL_IPS=m +CONFIG_INTEL_PMC_CORE=y +CONFIG_IBM_RTL=m +CONFIG_SAMSUNG_LAPTOP=m +CONFIG_MXM_WMI=m +CONFIG_INTEL_OAKTRAIL=m 
+CONFIG_SAMSUNG_Q10=m +CONFIG_APPLE_GMUX=m +CONFIG_INTEL_RST=m +CONFIG_INTEL_SMARTCONNECT=m +CONFIG_PVPANIC=m +CONFIG_INTEL_PMC_IPC=m +CONFIG_INTEL_BXTWC_PMIC_TMU=m +CONFIG_SURFACE_PRO3_BUTTON=m +CONFIG_SURFACE_3_BUTTON=m +CONFIG_INTEL_PUNIT_IPC=m +CONFIG_INTEL_TELEMETRY=m +CONFIG_MLX_PLATFORM=m +# CONFIG_INTEL_TURBO_MAX_3 is not set +CONFIG_INTEL_CHTDC_TI_PWRBTN=m +# CONFIG_I2C_MULTI_INSTANTIATE is not set +CONFIG_PMC_ATOM=y +CONFIG_CHROME_PLATFORMS=y +CONFIG_CHROMEOS_LAPTOP=m +CONFIG_CHROMEOS_PSTORE=m +CONFIG_CHROMEOS_TBMC=m +# CONFIG_CROS_EC_I2C is not set +# CONFIG_CROS_EC_SPI is not set +CONFIG_CROS_EC_LPC=m +CONFIG_CROS_EC_LPC_MEC=y +CONFIG_CROS_EC_PROTO=y +CONFIG_CROS_KBD_LED_BACKLIGHT=m +CONFIG_MELLANOX_PLATFORM=y +CONFIG_MLXREG_HOTPLUG=m +# CONFIG_MLXREG_IO is not set +CONFIG_CLKDEV_LOOKUP=y +CONFIG_HAVE_CLK_PREPARE=y +CONFIG_COMMON_CLK=y + +# +# Common Clock Framework +# +CONFIG_COMMON_CLK_WM831X=m +# CONFIG_COMMON_CLK_MAX9485 is not set +CONFIG_COMMON_CLK_SI5351=m +CONFIG_COMMON_CLK_SI544=m +CONFIG_COMMON_CLK_CDCE706=m +CONFIG_COMMON_CLK_CS2000_CP=m +CONFIG_COMMON_CLK_PWM=m +CONFIG_HWSPINLOCK=y + +# +# Clock Source drivers +# +CONFIG_CLKEVT_I8253=y +CONFIG_I8253_LOCK=y +CONFIG_CLKBLD_I8253=y +CONFIG_MAILBOX=y +CONFIG_PCC=y +CONFIG_ALTERA_MBOX=m +CONFIG_IOMMU_API=y +CONFIG_IOMMU_SUPPORT=y + +# +# Generic IOMMU Pagetable Support +# +# CONFIG_IOMMU_DEBUGFS is not set +# CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set +CONFIG_IOMMU_IOVA=y +CONFIG_AMD_IOMMU=y +CONFIG_AMD_IOMMU_V2=m +CONFIG_DMAR_TABLE=y +CONFIG_INTEL_IOMMU=y +CONFIG_INTEL_IOMMU_SVM=y +# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set +CONFIG_INTEL_IOMMU_FLOPPY_WA=y +CONFIG_IRQ_REMAP=y + +# +# Remoteproc drivers +# +CONFIG_REMOTEPROC=m + +# +# Rpmsg drivers +# +CONFIG_RPMSG=m +CONFIG_RPMSG_CHAR=m +CONFIG_RPMSG_QCOM_GLINK_NATIVE=m +CONFIG_RPMSG_QCOM_GLINK_RPM=m +CONFIG_RPMSG_VIRTIO=m +CONFIG_SOUNDWIRE=y + +# +# SoundWire Devices +# +CONFIG_SOUNDWIRE_BUS=m +CONFIG_SOUNDWIRE_CADENCE=m +CONFIG_SOUNDWIRE_INTEL=m + +# +# SOC (System On Chip) specific Drivers +# + +# +# Amlogic SoC drivers +# + +# +# Broadcom SoC drivers +# + +# +# NXP/Freescale QorIQ SoC drivers +# + +# +# i.MX SoC drivers +# + +# +# Qualcomm SoC drivers +# +CONFIG_SOC_TI=y + +# +# Xilinx SoC drivers +# +CONFIG_XILINX_VCU=m +CONFIG_PM_DEVFREQ=y + +# +# DEVFREQ Governors +# +CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=y +CONFIG_DEVFREQ_GOV_PERFORMANCE=y +CONFIG_DEVFREQ_GOV_POWERSAVE=y +CONFIG_DEVFREQ_GOV_USERSPACE=y +CONFIG_DEVFREQ_GOV_PASSIVE=m + +# +# DEVFREQ Drivers +# +CONFIG_PM_DEVFREQ_EVENT=y +CONFIG_EXTCON=y + +# +# Extcon Device Drivers +# +CONFIG_EXTCON_ADC_JACK=m +CONFIG_EXTCON_ARIZONA=m +CONFIG_EXTCON_AXP288=m +CONFIG_EXTCON_GPIO=m +# CONFIG_EXTCON_INTEL_INT3496 is not set +CONFIG_EXTCON_MAX14577=m +CONFIG_EXTCON_MAX3355=m +CONFIG_EXTCON_MAX77693=m +CONFIG_EXTCON_RT8973A=m +CONFIG_EXTCON_SM5502=m +# CONFIG_EXTCON_USB_GPIO is not set +CONFIG_EXTCON_USBC_CROS_EC=m +CONFIG_MEMORY=y +CONFIG_IIO=m +CONFIG_IIO_BUFFER=y +CONFIG_IIO_BUFFER_CB=m +CONFIG_IIO_BUFFER_HW_CONSUMER=m +CONFIG_IIO_KFIFO_BUF=m +CONFIG_IIO_TRIGGERED_BUFFER=m +CONFIG_IIO_CONFIGFS=m +CONFIG_IIO_TRIGGER=y +CONFIG_IIO_CONSUMERS_PER_TRIGGER=2 +CONFIG_IIO_SW_DEVICE=m +CONFIG_IIO_SW_TRIGGER=m +CONFIG_IIO_TRIGGERED_EVENT=m + +# +# Accelerometers +# +CONFIG_ADIS16201=m +CONFIG_ADIS16209=m +CONFIG_BMA180=m +CONFIG_BMA220=m +CONFIG_BMC150_ACCEL=m +CONFIG_BMC150_ACCEL_I2C=m +CONFIG_BMC150_ACCEL_SPI=m +CONFIG_DA280=m +CONFIG_DA311=m +CONFIG_DMARD09=m +CONFIG_DMARD10=m +CONFIG_HID_SENSOR_ACCEL_3D=m 
+CONFIG_IIO_CROS_EC_ACCEL_LEGACY=m +CONFIG_IIO_ST_ACCEL_3AXIS=m +CONFIG_IIO_ST_ACCEL_I2C_3AXIS=m +CONFIG_IIO_ST_ACCEL_SPI_3AXIS=m +CONFIG_KXSD9=m +CONFIG_KXSD9_SPI=m +CONFIG_KXSD9_I2C=m +CONFIG_KXCJK1013=m +CONFIG_MC3230=m +CONFIG_MMA7455=m +CONFIG_MMA7455_I2C=m +CONFIG_MMA7455_SPI=m +CONFIG_MMA7660=m +CONFIG_MMA8452=m +CONFIG_MMA9551_CORE=m +CONFIG_MMA9551=m +CONFIG_MMA9553=m +CONFIG_MXC4005=m +CONFIG_MXC6255=m +CONFIG_SCA3000=m +CONFIG_STK8312=m +CONFIG_STK8BA50=m + +# +# Analog to digital converters +# +CONFIG_AD_SIGMA_DELTA=m +CONFIG_AD7266=m +CONFIG_AD7291=m +CONFIG_AD7298=m +CONFIG_AD7476=m +CONFIG_AD7766=m +CONFIG_AD7791=m +CONFIG_AD7793=m +CONFIG_AD7887=m +CONFIG_AD7923=m +CONFIG_AD799X=m +CONFIG_AXP20X_ADC=m +CONFIG_AXP288_ADC=m +CONFIG_CC10001_ADC=m +CONFIG_DA9150_GPADC=m +CONFIG_DLN2_ADC=m +CONFIG_HI8435=m +CONFIG_HX711=m +CONFIG_INA2XX_ADC=m +CONFIG_LTC2471=m +CONFIG_LTC2485=m +CONFIG_LTC2497=m +CONFIG_MAX1027=m +CONFIG_MAX11100=m +CONFIG_MAX1118=m +CONFIG_MAX1363=m +CONFIG_MAX9611=m +CONFIG_MCP320X=m +CONFIG_MCP3422=m +CONFIG_MEN_Z188_ADC=m +CONFIG_NAU7802=m +CONFIG_QCOM_VADC_COMMON=m +CONFIG_QCOM_SPMI_IADC=m +CONFIG_QCOM_SPMI_VADC=m +CONFIG_TI_ADC081C=m +CONFIG_TI_ADC0832=m +CONFIG_TI_ADC084S021=m +CONFIG_TI_ADC12138=m +CONFIG_TI_ADC108S102=m +CONFIG_TI_ADC128S052=m +CONFIG_TI_ADC161S626=m +CONFIG_TI_ADS1015=m +CONFIG_TI_ADS7950=m +CONFIG_TI_AM335X_ADC=m +CONFIG_TI_TLC4541=m +CONFIG_VIPERBOARD_ADC=m + +# +# Analog Front Ends +# + +# +# Amplifiers +# +CONFIG_AD8366=m + +# +# Chemical Sensors +# +CONFIG_ATLAS_PH_SENSOR=m +# CONFIG_BME680 is not set +CONFIG_CCS811=m +CONFIG_IAQCORE=m +CONFIG_VZ89X=m +CONFIG_IIO_CROS_EC_SENSORS_CORE=m +CONFIG_IIO_CROS_EC_SENSORS=m + +# +# Hid Sensor IIO Common +# +CONFIG_HID_SENSOR_IIO_COMMON=m +CONFIG_HID_SENSOR_IIO_TRIGGER=m +CONFIG_IIO_MS_SENSORS_I2C=m + +# +# SSP Sensor Common +# +CONFIG_IIO_SSP_SENSORS_COMMONS=m +CONFIG_IIO_SSP_SENSORHUB=m +CONFIG_IIO_ST_SENSORS_I2C=m +CONFIG_IIO_ST_SENSORS_SPI=m +CONFIG_IIO_ST_SENSORS_CORE=m + +# +# Counters +# + +# +# Digital to analog converters +# +CONFIG_AD5064=m +CONFIG_AD5360=m +CONFIG_AD5380=m +CONFIG_AD5421=m +CONFIG_AD5446=m +CONFIG_AD5449=m +CONFIG_AD5592R_BASE=m +CONFIG_AD5592R=m +CONFIG_AD5593R=m +CONFIG_AD5504=m +CONFIG_AD5624R_SPI=m +CONFIG_LTC2632=m +CONFIG_AD5686=m +CONFIG_AD5686_SPI=m +CONFIG_AD5696_I2C=m +CONFIG_AD5755=m +# CONFIG_AD5758 is not set +CONFIG_AD5761=m +CONFIG_AD5764=m +CONFIG_AD5791=m +CONFIG_AD7303=m +CONFIG_AD8801=m +CONFIG_DS4424=m +CONFIG_M62332=m +CONFIG_MAX517=m +CONFIG_MCP4725=m +CONFIG_MCP4922=m +CONFIG_TI_DAC082S085=m +CONFIG_TI_DAC5571=m + +# +# IIO dummy driver +# +# CONFIG_IIO_SIMPLE_DUMMY is not set + +# +# Frequency Synthesizers DDS/PLL +# + +# +# Clock Generator/Distribution +# +CONFIG_AD9523=m + +# +# Phase-Locked Loop (PLL) frequency synthesizers +# +CONFIG_ADF4350=m + +# +# Digital gyroscope sensors +# +CONFIG_ADIS16080=m +CONFIG_ADIS16130=m +CONFIG_ADIS16136=m +CONFIG_ADIS16260=m +CONFIG_ADXRS450=m +CONFIG_BMG160=m +CONFIG_BMG160_I2C=m +CONFIG_BMG160_SPI=m +CONFIG_HID_SENSOR_GYRO_3D=m +CONFIG_MPU3050=m +CONFIG_MPU3050_I2C=m +CONFIG_IIO_ST_GYRO_3AXIS=m +CONFIG_IIO_ST_GYRO_I2C_3AXIS=m +CONFIG_IIO_ST_GYRO_SPI_3AXIS=m +CONFIG_ITG3200=m + +# +# Health Sensors +# + +# +# Heart Rate Monitors +# +CONFIG_AFE4403=m +CONFIG_AFE4404=m +CONFIG_MAX30100=m +CONFIG_MAX30102=m + +# +# Humidity sensors +# +CONFIG_AM2315=m +CONFIG_DHT11=m +CONFIG_HDC100X=m +CONFIG_HID_SENSOR_HUMIDITY=m +CONFIG_HTS221=m +CONFIG_HTS221_I2C=m +CONFIG_HTS221_SPI=m +CONFIG_HTU21=m +CONFIG_SI7005=m 
+CONFIG_SI7020=m + +# +# Inertial measurement units +# +CONFIG_ADIS16400=m +CONFIG_ADIS16480=m +CONFIG_BMI160=m +CONFIG_BMI160_I2C=m +CONFIG_BMI160_SPI=m +CONFIG_KMX61=m +CONFIG_INV_MPU6050_IIO=m +CONFIG_INV_MPU6050_I2C=m +CONFIG_INV_MPU6050_SPI=m +CONFIG_IIO_ST_LSM6DSX=m +CONFIG_IIO_ST_LSM6DSX_I2C=m +CONFIG_IIO_ST_LSM6DSX_SPI=m +CONFIG_IIO_ADIS_LIB=m +CONFIG_IIO_ADIS_LIB_BUFFER=y + +# +# Light sensors +# +# CONFIG_ACPI_ALS is not set +CONFIG_ADJD_S311=m +CONFIG_AL3320A=m +CONFIG_APDS9300=m +CONFIG_APDS9960=m +CONFIG_BH1750=m +CONFIG_BH1780=m +CONFIG_CM32181=m +CONFIG_CM3232=m +CONFIG_CM3323=m +CONFIG_CM36651=m +CONFIG_IIO_CROS_EC_LIGHT_PROX=m +CONFIG_GP2AP020A00F=m +CONFIG_SENSORS_ISL29018=m +CONFIG_SENSORS_ISL29028=m +CONFIG_ISL29125=m +CONFIG_HID_SENSOR_ALS=m +CONFIG_HID_SENSOR_PROX=m +CONFIG_JSA1212=m +CONFIG_RPR0521=m +CONFIG_SENSORS_LM3533=m +CONFIG_LTR501=m +CONFIG_LV0104CS=m +CONFIG_MAX44000=m +CONFIG_OPT3001=m +CONFIG_PA12203001=m +# CONFIG_SI1133 is not set +CONFIG_SI1145=m +CONFIG_STK3310=m +CONFIG_ST_UVIS25=m +CONFIG_ST_UVIS25_I2C=m +CONFIG_ST_UVIS25_SPI=m +CONFIG_TCS3414=m +CONFIG_TCS3472=m +CONFIG_SENSORS_TSL2563=m +CONFIG_TSL2583=m +CONFIG_TSL2772=m +CONFIG_TSL4531=m +CONFIG_US5182D=m +CONFIG_VCNL4000=m +CONFIG_VEML6070=m +CONFIG_VL6180=m +CONFIG_ZOPT2201=m + +# +# Magnetometer sensors +# +CONFIG_AK8975=m +CONFIG_AK09911=m +CONFIG_BMC150_MAGN=m +CONFIG_BMC150_MAGN_I2C=m +CONFIG_BMC150_MAGN_SPI=m +CONFIG_MAG3110=m +CONFIG_HID_SENSOR_MAGNETOMETER_3D=m +CONFIG_MMC35240=m +CONFIG_IIO_ST_MAGN_3AXIS=m +CONFIG_IIO_ST_MAGN_I2C_3AXIS=m +CONFIG_IIO_ST_MAGN_SPI_3AXIS=m +CONFIG_SENSORS_HMC5843=m +CONFIG_SENSORS_HMC5843_I2C=m +CONFIG_SENSORS_HMC5843_SPI=m + +# +# Multiplexers +# + +# +# Inclinometer sensors +# +CONFIG_HID_SENSOR_INCLINOMETER_3D=m +CONFIG_HID_SENSOR_DEVICE_ROTATION=m + +# +# Triggers - standalone +# +CONFIG_IIO_HRTIMER_TRIGGER=m +CONFIG_IIO_INTERRUPT_TRIGGER=m +CONFIG_IIO_TIGHTLOOP_TRIGGER=m +CONFIG_IIO_SYSFS_TRIGGER=m + +# +# Digital potentiometers +# +CONFIG_AD5272=m +CONFIG_DS1803=m +CONFIG_MAX5481=m +CONFIG_MAX5487=m +CONFIG_MCP4018=m +CONFIG_MCP4131=m +CONFIG_MCP4531=m +CONFIG_TPL0102=m + +# +# Digital potentiostats +# +CONFIG_LMP91000=m + +# +# Pressure sensors +# +CONFIG_ABP060MG=m +CONFIG_BMP280=m +CONFIG_BMP280_I2C=m +CONFIG_BMP280_SPI=m +CONFIG_IIO_CROS_EC_BARO=m +CONFIG_HID_SENSOR_PRESS=m +CONFIG_HP03=m +CONFIG_MPL115=m +CONFIG_MPL115_I2C=m +CONFIG_MPL115_SPI=m +CONFIG_MPL3115=m +CONFIG_MS5611=m +CONFIG_MS5611_I2C=m +CONFIG_MS5611_SPI=m +CONFIG_MS5637=m +CONFIG_IIO_ST_PRESS=m +CONFIG_IIO_ST_PRESS_I2C=m +CONFIG_IIO_ST_PRESS_SPI=m +CONFIG_T5403=m +CONFIG_HP206C=m +CONFIG_ZPA2326=m +CONFIG_ZPA2326_I2C=m +CONFIG_ZPA2326_SPI=m + +# +# Lightning sensors +# +CONFIG_AS3935=m + +# +# Proximity and distance sensors +# +# CONFIG_ISL29501 is not set +CONFIG_LIDAR_LITE_V2=m +CONFIG_RFD77402=m +CONFIG_SRF04=m +CONFIG_SX9500=m +CONFIG_SRF08=m + +# +# Resolver to digital converters +# +CONFIG_AD2S1200=m + +# +# Temperature sensors +# +CONFIG_MAXIM_THERMOCOUPLE=m +CONFIG_HID_SENSOR_TEMP=m +CONFIG_MLX90614=m +CONFIG_MLX90632=m +CONFIG_TMP006=m +CONFIG_TMP007=m +CONFIG_TSYS01=m +CONFIG_TSYS02D=m +CONFIG_NTB=m +CONFIG_NTB_AMD=m +CONFIG_NTB_IDT=m +CONFIG_NTB_INTEL=m +CONFIG_NTB_SWITCHTEC=m +CONFIG_NTB_PINGPONG=m +CONFIG_NTB_TOOL=m +CONFIG_NTB_PERF=m +CONFIG_NTB_TRANSPORT=m +CONFIG_VME_BUS=y + +# +# VME Bridge Drivers +# +CONFIG_VME_CA91CX42=m +CONFIG_VME_TSI148=m +CONFIG_VME_FAKE=m + +# +# VME Board Drivers +# +CONFIG_VMIVME_7805=m + +# +# VME Device Drivers +# +CONFIG_VME_USER=m 
+CONFIG_PWM=y +CONFIG_PWM_SYSFS=y +CONFIG_PWM_CROS_EC=m +CONFIG_PWM_LP3943=m +CONFIG_PWM_LPSS=m +CONFIG_PWM_LPSS_PCI=m +CONFIG_PWM_LPSS_PLATFORM=m +CONFIG_PWM_PCA9685=m + +# +# IRQ chip support +# +CONFIG_ARM_GIC_MAX_NR=1 +CONFIG_IPACK_BUS=m +CONFIG_BOARD_TPCI200=m +CONFIG_SERIAL_IPOCTAL=m +CONFIG_RESET_CONTROLLER=y +CONFIG_RESET_TI_SYSCON=m +CONFIG_FMC=m +CONFIG_FMC_FAKEDEV=m +CONFIG_FMC_TRIVIAL=m +CONFIG_FMC_WRITE_EEPROM=m +CONFIG_FMC_CHARDEV=m + +# +# PHY Subsystem +# +CONFIG_GENERIC_PHY=y +CONFIG_BCM_KONA_USB2_PHY=m +CONFIG_PHY_PXA_28NM_HSIC=m +CONFIG_PHY_PXA_28NM_USB2=m +CONFIG_PHY_CPCAP_USB=m +CONFIG_PHY_QCOM_USB_HS=m +CONFIG_PHY_QCOM_USB_HSIC=m +CONFIG_PHY_SAMSUNG_USB2=m +CONFIG_PHY_TUSB1210=m +CONFIG_POWERCAP=y +CONFIG_INTEL_RAPL=m +# CONFIG_IDLE_INJECT is not set +CONFIG_MCB=m +CONFIG_MCB_PCI=m +CONFIG_MCB_LPC=m + +# +# Performance monitor support +# +CONFIG_RAS=y +CONFIG_RAS_CEC=y +CONFIG_THUNDERBOLT=m + +# +# Android +# +# CONFIG_ANDROID is not set +CONFIG_LIBNVDIMM=m +CONFIG_BLK_DEV_PMEM=m +CONFIG_ND_BLK=m +CONFIG_ND_CLAIM=y +CONFIG_ND_BTT=m +CONFIG_BTT=y +CONFIG_DAX_DRIVER=y +CONFIG_DAX=y +CONFIG_DEV_DAX=m +CONFIG_NVMEM=y +CONFIG_RAVE_SP_EEPROM=m + +# +# HW tracing support +# +CONFIG_STM=m +# CONFIG_STM_DUMMY is not set +CONFIG_STM_SOURCE_CONSOLE=m +CONFIG_STM_SOURCE_HEARTBEAT=m +# CONFIG_STM_SOURCE_FTRACE is not set +CONFIG_INTEL_TH=m +CONFIG_INTEL_TH_PCI=m +CONFIG_INTEL_TH_ACPI=m +CONFIG_INTEL_TH_GTH=m +CONFIG_INTEL_TH_STH=m +CONFIG_INTEL_TH_MSU=m +CONFIG_INTEL_TH_PTI=m +# CONFIG_INTEL_TH_DEBUG is not set +CONFIG_FPGA=m +CONFIG_ALTERA_PR_IP_CORE=m +CONFIG_FPGA_MGR_ALTERA_PS_SPI=m +CONFIG_FPGA_MGR_ALTERA_CVP=m +CONFIG_FPGA_MGR_XILINX_SPI=m +CONFIG_FPGA_MGR_MACHXO2_SPI=m +CONFIG_FPGA_BRIDGE=m +CONFIG_XILINX_PR_DECOUPLER=m +CONFIG_FPGA_REGION=m +# CONFIG_FPGA_DFL is not set +CONFIG_PM_OPP=y +# CONFIG_UNISYS_VISORBUS is not set +CONFIG_SIOX=m +CONFIG_SIOX_BUS_GPIO=m +CONFIG_SLIMBUS=m +CONFIG_SLIM_QCOM_CTRL=m + +# +# File systems +# +CONFIG_DCACHE_WORD_ACCESS=y +CONFIG_FS_IOMAP=y +CONFIG_EXT2_FS=m +CONFIG_EXT2_FS_XATTR=y +CONFIG_EXT2_FS_POSIX_ACL=y +CONFIG_EXT2_FS_SECURITY=y +CONFIG_EXT3_FS=m +CONFIG_EXT3_FS_POSIX_ACL=y +CONFIG_EXT3_FS_SECURITY=y +CONFIG_EXT4_FS=m +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_EXT4_ENCRYPTION=y +CONFIG_EXT4_FS_ENCRYPTION=y +# CONFIG_EXT4_DEBUG is not set +CONFIG_JBD2=m +# CONFIG_JBD2_DEBUG is not set +CONFIG_FS_MBCACHE=m +CONFIG_REISERFS_FS=m +# CONFIG_REISERFS_CHECK is not set +CONFIG_REISERFS_PROC_INFO=y +CONFIG_REISERFS_FS_XATTR=y +CONFIG_REISERFS_FS_POSIX_ACL=y +CONFIG_REISERFS_FS_SECURITY=y +CONFIG_JFS_FS=m +CONFIG_JFS_POSIX_ACL=y +CONFIG_JFS_SECURITY=y +# CONFIG_JFS_DEBUG is not set +CONFIG_JFS_STATISTICS=y +CONFIG_XFS_FS=m +CONFIG_XFS_QUOTA=y +CONFIG_XFS_POSIX_ACL=y +CONFIG_XFS_RT=y +CONFIG_XFS_ONLINE_SCRUB=y +# CONFIG_XFS_ONLINE_REPAIR is not set +# CONFIG_XFS_WARN is not set +# CONFIG_XFS_DEBUG is not set +CONFIG_GFS2_FS=m +CONFIG_GFS2_FS_LOCKING_DLM=y +CONFIG_OCFS2_FS=m +CONFIG_OCFS2_FS_O2CB=m +CONFIG_OCFS2_FS_USERSPACE_CLUSTER=m +CONFIG_OCFS2_FS_STATS=y +CONFIG_OCFS2_DEBUG_MASKLOG=y +CONFIG_OCFS2_DEBUG_FS=y +CONFIG_BTRFS_FS=m +CONFIG_BTRFS_FS_POSIX_ACL=y +# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set +# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set +# CONFIG_BTRFS_DEBUG is not set +# CONFIG_BTRFS_ASSERT is not set +# CONFIG_BTRFS_FS_REF_VERIFY is not set +CONFIG_NILFS2_FS=m +CONFIG_F2FS_FS=m +CONFIG_F2FS_STAT_FS=y +CONFIG_F2FS_FS_XATTR=y +CONFIG_F2FS_FS_POSIX_ACL=y +CONFIG_F2FS_FS_SECURITY=y +CONFIG_F2FS_CHECK_FS=y 
+CONFIG_F2FS_FS_ENCRYPTION=y +# CONFIG_F2FS_IO_TRACE is not set +# CONFIG_F2FS_FAULT_INJECTION is not set +CONFIG_FS_DAX=y +CONFIG_FS_POSIX_ACL=y +CONFIG_EXPORTFS=y +CONFIG_EXPORTFS_BLOCK_OPS=y +CONFIG_FILE_LOCKING=y +CONFIG_MANDATORY_FILE_LOCKING=y +CONFIG_FS_ENCRYPTION=m +CONFIG_FSNOTIFY=y +CONFIG_DNOTIFY=y +CONFIG_INOTIFY_USER=y +CONFIG_FANOTIFY=y +# CONFIG_FANOTIFY_ACCESS_PERMISSIONS is not set +CONFIG_QUOTA=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +# CONFIG_PRINT_QUOTA_WARNING is not set +# CONFIG_QUOTA_DEBUG is not set +CONFIG_QUOTA_TREE=m +CONFIG_QFMT_V1=m +CONFIG_QFMT_V2=m +CONFIG_QUOTACTL=y +CONFIG_QUOTACTL_COMPAT=y +CONFIG_AUTOFS4_FS=m +CONFIG_AUTOFS_FS=m +CONFIG_FUSE_FS=m +CONFIG_CUSE=m +CONFIG_OVERLAY_FS=m +# CONFIG_OVERLAY_FS_REDIRECT_DIR is not set +CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW=y +CONFIG_OVERLAY_FS_INDEX=y +CONFIG_OVERLAY_FS_NFS_EXPORT=y +# CONFIG_OVERLAY_FS_XINO_AUTO is not set +# CONFIG_OVERLAY_FS_METACOPY is not set + +# +# Caches +# +CONFIG_FSCACHE=m +CONFIG_FSCACHE_STATS=y +# CONFIG_FSCACHE_HISTOGRAM is not set +# CONFIG_FSCACHE_DEBUG is not set +# CONFIG_FSCACHE_OBJECT_LIST is not set +CONFIG_CACHEFILES=m +# CONFIG_CACHEFILES_DEBUG is not set +# CONFIG_CACHEFILES_HISTOGRAM is not set + +# +# CD-ROM/DVD Filesystems +# +CONFIG_ISO9660_FS=m +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_UDF_FS=m + +# +# DOS/FAT/NT Filesystems +# +CONFIG_FAT_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=437 +CONFIG_FAT_DEFAULT_IOCHARSET="utf8" +# CONFIG_FAT_DEFAULT_UTF8 is not set +CONFIG_NTFS_FS=m +# CONFIG_NTFS_DEBUG is not set +CONFIG_NTFS_RW=y + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +CONFIG_PROC_KCORE=y +CONFIG_PROC_SYSCTL=y +CONFIG_PROC_PAGE_MONITOR=y +# CONFIG_PROC_CHILDREN is not set +CONFIG_KERNFS=y +CONFIG_SYSFS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TMPFS_XATTR=y +CONFIG_HUGETLBFS=y +CONFIG_HUGETLB_PAGE=y +CONFIG_MEMFD_CREATE=y +CONFIG_ARCH_HAS_GIGANTIC_PAGE=y +CONFIG_CONFIGFS_FS=y +CONFIG_EFIVAR_FS=m +CONFIG_MISC_FILESYSTEMS=y +CONFIG_ORANGEFS_FS=m +CONFIG_ADFS_FS=m +# CONFIG_ADFS_FS_RW is not set +CONFIG_AFFS_FS=m +CONFIG_ECRYPT_FS=m +CONFIG_ECRYPT_FS_MESSAGING=y +CONFIG_HFS_FS=m +CONFIG_HFSPLUS_FS=m +CONFIG_BEFS_FS=m +# CONFIG_BEFS_DEBUG is not set +CONFIG_BFS_FS=m +CONFIG_EFS_FS=m +CONFIG_JFFS2_FS=m +CONFIG_JFFS2_FS_DEBUG=0 +CONFIG_JFFS2_FS_WRITEBUFFER=y +CONFIG_JFFS2_FS_WBUF_VERIFY=y +CONFIG_JFFS2_SUMMARY=y +CONFIG_JFFS2_FS_XATTR=y +CONFIG_JFFS2_FS_POSIX_ACL=y +CONFIG_JFFS2_FS_SECURITY=y +CONFIG_JFFS2_COMPRESSION_OPTIONS=y +CONFIG_JFFS2_ZLIB=y +CONFIG_JFFS2_LZO=y +CONFIG_JFFS2_RTIME=y +CONFIG_JFFS2_RUBIN=y +# CONFIG_JFFS2_CMODE_NONE is not set +CONFIG_JFFS2_CMODE_PRIORITY=y +# CONFIG_JFFS2_CMODE_SIZE is not set +# CONFIG_JFFS2_CMODE_FAVOURLZO is not set +CONFIG_UBIFS_FS=m +CONFIG_UBIFS_FS_ADVANCED_COMPR=y +CONFIG_UBIFS_FS_LZO=y +CONFIG_UBIFS_FS_ZLIB=y +CONFIG_UBIFS_ATIME_SUPPORT=y +CONFIG_UBIFS_FS_XATTR=y +CONFIG_UBIFS_FS_ENCRYPTION=y +CONFIG_UBIFS_FS_SECURITY=y +CONFIG_CRAMFS=m +CONFIG_CRAMFS_BLOCKDEV=y +CONFIG_CRAMFS_MTD=y +CONFIG_SQUASHFS=m +# CONFIG_SQUASHFS_FILE_CACHE is not set +CONFIG_SQUASHFS_FILE_DIRECT=y +# CONFIG_SQUASHFS_DECOMP_SINGLE is not set +# CONFIG_SQUASHFS_DECOMP_MULTI is not set +CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU=y +CONFIG_SQUASHFS_XATTR=y +CONFIG_SQUASHFS_ZLIB=y +CONFIG_SQUASHFS_LZ4=y +CONFIG_SQUASHFS_LZO=y +CONFIG_SQUASHFS_XZ=y +CONFIG_SQUASHFS_ZSTD=y +CONFIG_SQUASHFS_4K_DEVBLK_SIZE=y +CONFIG_SQUASHFS_EMBEDDED=y +CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3 +CONFIG_VXFS_FS=m +CONFIG_MINIX_FS=m +CONFIG_OMFS_FS=m 
+CONFIG_HPFS_FS=m +CONFIG_QNX4FS_FS=m +CONFIG_QNX6FS_FS=m +# CONFIG_QNX6FS_DEBUG is not set +CONFIG_ROMFS_FS=m +# CONFIG_ROMFS_BACKED_BY_BLOCK is not set +# CONFIG_ROMFS_BACKED_BY_MTD is not set +CONFIG_ROMFS_BACKED_BY_BOTH=y +CONFIG_ROMFS_ON_BLOCK=y +CONFIG_ROMFS_ON_MTD=y +CONFIG_PSTORE=y +CONFIG_PSTORE_DEFLATE_COMPRESS=y +CONFIG_PSTORE_LZO_COMPRESS=y +CONFIG_PSTORE_LZ4_COMPRESS=y +CONFIG_PSTORE_LZ4HC_COMPRESS=y +CONFIG_PSTORE_842_COMPRESS=y +CONFIG_PSTORE_ZSTD_COMPRESS=y +CONFIG_PSTORE_COMPRESS=y +# CONFIG_PSTORE_DEFLATE_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZO_COMPRESS_DEFAULT is not set +CONFIG_PSTORE_LZ4_COMPRESS_DEFAULT=y +# CONFIG_PSTORE_LZ4HC_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_842_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_ZSTD_COMPRESS_DEFAULT is not set +CONFIG_PSTORE_COMPRESS_DEFAULT="lz4" +# CONFIG_PSTORE_CONSOLE is not set +CONFIG_PSTORE_PMSG=y +# CONFIG_PSTORE_FTRACE is not set +CONFIG_PSTORE_RAM=m +CONFIG_SYSV_FS=m +CONFIG_UFS_FS=m +# CONFIG_UFS_FS_WRITE is not set +# CONFIG_UFS_DEBUG is not set +CONFIG_EXOFS_FS=m +# CONFIG_EXOFS_DEBUG is not set +CONFIG_ORE=m +CONFIG_NETWORK_FILESYSTEMS=y +CONFIG_NFS_FS=m +CONFIG_NFS_V2=m +CONFIG_NFS_V3=m +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=m +CONFIG_NFS_SWAP=y +CONFIG_NFS_V4_1=y +CONFIG_NFS_V4_2=y +CONFIG_PNFS_FILE_LAYOUT=m +CONFIG_PNFS_BLOCK=m +CONFIG_PNFS_FLEXFILE_LAYOUT=m +CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org" +CONFIG_NFS_V4_1_MIGRATION=y +CONFIG_NFS_V4_SECURITY_LABEL=y +CONFIG_NFS_FSCACHE=y +# CONFIG_NFS_USE_LEGACY_DNS is not set +CONFIG_NFS_USE_KERNEL_DNS=y +CONFIG_NFSD=m +CONFIG_NFSD_V2_ACL=y +CONFIG_NFSD_V3=y +CONFIG_NFSD_V3_ACL=y +CONFIG_NFSD_V4=y +CONFIG_NFSD_PNFS=y +CONFIG_NFSD_BLOCKLAYOUT=y +CONFIG_NFSD_SCSILAYOUT=y +CONFIG_NFSD_FLEXFILELAYOUT=y +# CONFIG_NFSD_V4_SECURITY_LABEL is not set +# CONFIG_NFSD_FAULT_INJECTION is not set +CONFIG_GRACE_PERIOD=m +CONFIG_LOCKD=m +CONFIG_LOCKD_V4=y +CONFIG_NFS_ACL_SUPPORT=m +CONFIG_NFS_COMMON=y +CONFIG_SUNRPC=m +CONFIG_SUNRPC_GSS=m +CONFIG_SUNRPC_BACKCHANNEL=y +CONFIG_SUNRPC_SWAP=y +CONFIG_RPCSEC_GSS_KRB5=m +# CONFIG_SUNRPC_DEBUG is not set +CONFIG_SUNRPC_XPRT_RDMA=m +CONFIG_CEPH_FS=m +CONFIG_CEPH_FSCACHE=y +CONFIG_CEPH_FS_POSIX_ACL=y +CONFIG_CIFS=m +CONFIG_CIFS_STATS2=y +CONFIG_CIFS_ALLOW_INSECURE_LEGACY=y +CONFIG_CIFS_WEAK_PW_HASH=y +CONFIG_CIFS_UPCALL=y +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_POSIX=y +CONFIG_CIFS_ACL=y +# CONFIG_CIFS_DEBUG is not set +CONFIG_CIFS_DFS_UPCALL=y +# CONFIG_CIFS_SMB_DIRECT is not set +CONFIG_CIFS_FSCACHE=y +CONFIG_CODA_FS=m +CONFIG_AFS_FS=m +# CONFIG_AFS_DEBUG is not set +CONFIG_AFS_FSCACHE=y +CONFIG_9P_FS=m +CONFIG_9P_FSCACHE=y +CONFIG_9P_FS_POSIX_ACL=y +CONFIG_9P_FS_SECURITY=y +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_737=m +CONFIG_NLS_CODEPAGE_775=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_CODEPAGE_852=m +CONFIG_NLS_CODEPAGE_855=m +CONFIG_NLS_CODEPAGE_857=m +CONFIG_NLS_CODEPAGE_860=m +CONFIG_NLS_CODEPAGE_861=m +CONFIG_NLS_CODEPAGE_862=m +CONFIG_NLS_CODEPAGE_863=m +CONFIG_NLS_CODEPAGE_864=m +CONFIG_NLS_CODEPAGE_865=m +CONFIG_NLS_CODEPAGE_866=m +CONFIG_NLS_CODEPAGE_869=m +CONFIG_NLS_CODEPAGE_936=m +CONFIG_NLS_CODEPAGE_950=m +CONFIG_NLS_CODEPAGE_932=m +CONFIG_NLS_CODEPAGE_949=m +CONFIG_NLS_CODEPAGE_874=m +CONFIG_NLS_ISO8859_8=m +CONFIG_NLS_CODEPAGE_1250=m +CONFIG_NLS_CODEPAGE_1251=m +CONFIG_NLS_ASCII=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_ISO8859_2=m +CONFIG_NLS_ISO8859_3=m +CONFIG_NLS_ISO8859_4=m +CONFIG_NLS_ISO8859_5=m +CONFIG_NLS_ISO8859_6=m +CONFIG_NLS_ISO8859_7=m 
+CONFIG_NLS_ISO8859_9=m +CONFIG_NLS_ISO8859_13=m +CONFIG_NLS_ISO8859_14=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_KOI8_R=m +CONFIG_NLS_KOI8_U=m +CONFIG_NLS_MAC_ROMAN=m +CONFIG_NLS_MAC_CELTIC=m +CONFIG_NLS_MAC_CENTEURO=m +CONFIG_NLS_MAC_CROATIAN=m +CONFIG_NLS_MAC_CYRILLIC=m +CONFIG_NLS_MAC_GAELIC=m +CONFIG_NLS_MAC_GREEK=m +CONFIG_NLS_MAC_ICELAND=m +CONFIG_NLS_MAC_INUIT=m +CONFIG_NLS_MAC_ROMANIAN=m +CONFIG_NLS_MAC_TURKISH=m +CONFIG_NLS_UTF8=m +CONFIG_DLM=m +# CONFIG_DLM_DEBUG is not set + +# +# Security options +# +CONFIG_KEYS=y +CONFIG_KEYS_COMPAT=y +CONFIG_PERSISTENT_KEYRINGS=y +# CONFIG_BIG_KEYS is not set +CONFIG_TRUSTED_KEYS=m +CONFIG_ENCRYPTED_KEYS=m +# CONFIG_KEY_DH_OPERATIONS is not set +CONFIG_SECURITY_DMESG_RESTRICT=y +CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y +CONFIG_SECURITY_TIOCSTI_RESTRICT=y +CONFIG_SECURITY=y +CONFIG_SECURITYFS=y +CONFIG_SECURITY_NETWORK=y +CONFIG_PAGE_TABLE_ISOLATION=y +# CONFIG_SECURITY_INFINIBAND is not set +# CONFIG_SECURITY_NETWORK_XFRM is not set +CONFIG_SECURITY_PATH=y +CONFIG_INTEL_TXT=y +CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y +CONFIG_HARDENED_USERCOPY=y +CONFIG_HARDENED_USERCOPY_FALLBACK=y +CONFIG_FORTIFY_SOURCE=y +CONFIG_PAGE_SANITIZE=y +CONFIG_PAGE_SANITIZE_VERIFY=y +# CONFIG_STATIC_USERMODEHELPER is not set +# CONFIG_SECURITY_SELINUX is not set +# CONFIG_SECURITY_SMACK is not set +# CONFIG_SECURITY_TOMOYO is not set +CONFIG_SECURITY_APPARMOR=y +CONFIG_SECURITY_APPARMOR_BOOTPARAM_VALUE=1 +CONFIG_SECURITY_APPARMOR_HASH=y +CONFIG_SECURITY_APPARMOR_HASH_DEFAULT=y +# CONFIG_SECURITY_APPARMOR_DEBUG is not set +# CONFIG_SECURITY_LOADPIN is not set +CONFIG_SECURITY_YAMA=y +# CONFIG_INTEGRITY is not set +CONFIG_DEFAULT_SECURITY_APPARMOR=y +# CONFIG_DEFAULT_SECURITY_DAC is not set +CONFIG_DEFAULT_SECURITY="apparmor" +CONFIG_XOR_BLOCKS=m +CONFIG_ASYNC_CORE=m +CONFIG_ASYNC_MEMCPY=m +CONFIG_ASYNC_XOR=m +CONFIG_ASYNC_PQ=m +CONFIG_ASYNC_RAID6_RECOV=m +CONFIG_CRYPTO=y + +# +# Crypto core or helper +# +CONFIG_CRYPTO_ALGAPI=y +CONFIG_CRYPTO_ALGAPI2=y +CONFIG_CRYPTO_AEAD=m +CONFIG_CRYPTO_AEAD2=y +CONFIG_CRYPTO_BLKCIPHER=y +CONFIG_CRYPTO_BLKCIPHER2=y +CONFIG_CRYPTO_HASH=y +CONFIG_CRYPTO_HASH2=y +CONFIG_CRYPTO_RNG=m +CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_RNG_DEFAULT=m +CONFIG_CRYPTO_AKCIPHER2=y +CONFIG_CRYPTO_AKCIPHER=y +CONFIG_CRYPTO_KPP2=y +CONFIG_CRYPTO_KPP=m +CONFIG_CRYPTO_ACOMP2=y +CONFIG_CRYPTO_RSA=y +CONFIG_CRYPTO_DH=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_MANAGER=y +CONFIG_CRYPTO_MANAGER2=y +CONFIG_CRYPTO_USER=m +CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y +CONFIG_CRYPTO_GF128MUL=m +CONFIG_CRYPTO_NULL=m +CONFIG_CRYPTO_NULL2=y +CONFIG_CRYPTO_PCRYPT=m +CONFIG_CRYPTO_WORKQUEUE=y +CONFIG_CRYPTO_CRYPTD=m +CONFIG_CRYPTO_MCRYPTD=m +CONFIG_CRYPTO_AUTHENC=m +CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_SIMD=m +CONFIG_CRYPTO_GLUE_HELPER_X86=m +CONFIG_CRYPTO_ENGINE=m + +# +# Authenticated Encryption with Associated Data +# +CONFIG_CRYPTO_CCM=m +CONFIG_CRYPTO_GCM=m +CONFIG_CRYPTO_CHACHA20POLY1305=m +CONFIG_CRYPTO_AEGIS128=m +CONFIG_CRYPTO_AEGIS128L=m +CONFIG_CRYPTO_AEGIS256=m +CONFIG_CRYPTO_AEGIS128_AESNI_SSE2=m +CONFIG_CRYPTO_AEGIS128L_AESNI_SSE2=m +CONFIG_CRYPTO_AEGIS256_AESNI_SSE2=m +CONFIG_CRYPTO_MORUS640=m +CONFIG_CRYPTO_MORUS640_GLUE=m +CONFIG_CRYPTO_MORUS640_SSE2=m +CONFIG_CRYPTO_MORUS1280=m +CONFIG_CRYPTO_MORUS1280_GLUE=m +CONFIG_CRYPTO_MORUS1280_SSE2=m +CONFIG_CRYPTO_MORUS1280_AVX2=m +CONFIG_CRYPTO_SEQIV=m +CONFIG_CRYPTO_ECHAINIV=m + +# +# Block modes +# +CONFIG_CRYPTO_CBC=m +CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTR=m +CONFIG_CRYPTO_CTS=m +CONFIG_CRYPTO_ECB=y +CONFIG_CRYPTO_LRW=m 
+CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m +CONFIG_CRYPTO_KEYWRAP=m + +# +# Hash modes +# +CONFIG_CRYPTO_CMAC=m +CONFIG_CRYPTO_HMAC=m +CONFIG_CRYPTO_XCBC=m +CONFIG_CRYPTO_VMAC=m + +# +# Digest +# +CONFIG_CRYPTO_CRC32C=m +CONFIG_CRYPTO_CRC32C_INTEL=m +CONFIG_CRYPTO_CRC32=m +CONFIG_CRYPTO_CRC32_PCLMUL=m +CONFIG_CRYPTO_CRCT10DIF=y +CONFIG_CRYPTO_CRCT10DIF_PCLMUL=m +CONFIG_CRYPTO_GHASH=m +CONFIG_CRYPTO_POLY1305=m +CONFIG_CRYPTO_POLY1305_X86_64=m +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_RMD128=m +CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_RMD256=m +CONFIG_CRYPTO_RMD320=m +CONFIG_CRYPTO_SHA1=y +CONFIG_CRYPTO_SHA1_SSSE3=m +CONFIG_CRYPTO_SHA256_SSSE3=m +CONFIG_CRYPTO_SHA512_SSSE3=m +CONFIG_CRYPTO_SHA1_MB=m +CONFIG_CRYPTO_SHA256_MB=m +CONFIG_CRYPTO_SHA512_MB=m +CONFIG_CRYPTO_SHA256=m +CONFIG_CRYPTO_SHA512=y +CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m +CONFIG_CRYPTO_TGR192=m +CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=m + +# +# Ciphers +# +CONFIG_CRYPTO_AES=y +CONFIG_CRYPTO_AES_TI=m +CONFIG_CRYPTO_AES_X86_64=m +CONFIG_CRYPTO_AES_NI_INTEL=m +CONFIG_CRYPTO_ANUBIS=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_BLOWFISH=m +CONFIG_CRYPTO_BLOWFISH_COMMON=m +CONFIG_CRYPTO_BLOWFISH_X86_64=m +CONFIG_CRYPTO_CAMELLIA=m +CONFIG_CRYPTO_CAMELLIA_X86_64=m +CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64=m +CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m +CONFIG_CRYPTO_CAST_COMMON=m +CONFIG_CRYPTO_CAST5=m +CONFIG_CRYPTO_CAST5_AVX_X86_64=m +CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_CAST6_AVX_X86_64=m +CONFIG_CRYPTO_DES=m +CONFIG_CRYPTO_DES3_EDE_X86_64=m +CONFIG_CRYPTO_FCRYPT=m +CONFIG_CRYPTO_KHAZAD=m +CONFIG_CRYPTO_SALSA20=m +CONFIG_CRYPTO_CHACHA20=m +CONFIG_CRYPTO_CHACHA20_X86_64=m +CONFIG_CRYPTO_SEED=m +CONFIG_CRYPTO_SERPENT=m +CONFIG_CRYPTO_SERPENT_SSE2_X86_64=m +CONFIG_CRYPTO_SERPENT_AVX_X86_64=m +CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m +CONFIG_CRYPTO_SM4=m +CONFIG_CRYPTO_TEA=m +CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_TWOFISH_COMMON=m +CONFIG_CRYPTO_TWOFISH_X86_64=m +CONFIG_CRYPTO_TWOFISH_X86_64_3WAY=m +CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m + +# +# Compression +# +CONFIG_CRYPTO_DEFLATE=y +CONFIG_CRYPTO_LZO=y +CONFIG_CRYPTO_842=y +CONFIG_CRYPTO_LZ4=y +CONFIG_CRYPTO_LZ4HC=y +CONFIG_CRYPTO_ZSTD=y + +# +# Random Number Generation +# +CONFIG_CRYPTO_ANSI_CPRNG=m +CONFIG_CRYPTO_DRBG_MENU=m +CONFIG_CRYPTO_DRBG_HMAC=y +CONFIG_CRYPTO_DRBG_HASH=y +CONFIG_CRYPTO_DRBG_CTR=y +CONFIG_CRYPTO_DRBG=m +CONFIG_CRYPTO_JITTERENTROPY=m +CONFIG_CRYPTO_USER_API=m +CONFIG_CRYPTO_USER_API_HASH=m +CONFIG_CRYPTO_USER_API_SKCIPHER=m +CONFIG_CRYPTO_USER_API_RNG=m +CONFIG_CRYPTO_USER_API_AEAD=m +CONFIG_CRYPTO_HASH_INFO=y +CONFIG_CRYPTO_HW=y +CONFIG_CRYPTO_DEV_PADLOCK=m +CONFIG_CRYPTO_DEV_PADLOCK_AES=m +CONFIG_CRYPTO_DEV_PADLOCK_SHA=m +CONFIG_CRYPTO_DEV_CCP=y +CONFIG_CRYPTO_DEV_CCP_DD=m +CONFIG_CRYPTO_DEV_SP_CCP=y +CONFIG_CRYPTO_DEV_CCP_CRYPTO=m +CONFIG_CRYPTO_DEV_SP_PSP=y +CONFIG_CRYPTO_DEV_QAT=m +CONFIG_CRYPTO_DEV_QAT_DH895xCC=m +CONFIG_CRYPTO_DEV_QAT_C3XXX=m +CONFIG_CRYPTO_DEV_QAT_C62X=m +CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m +CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m +CONFIG_CRYPTO_DEV_QAT_C62XVF=m +CONFIG_CRYPTO_DEV_NITROX=m +CONFIG_CRYPTO_DEV_NITROX_CNN55XX=m +CONFIG_CRYPTO_DEV_CHELSIO=m +CONFIG_CHELSIO_IPSEC_INLINE=y +CONFIG_CRYPTO_DEV_CHELSIO_TLS=m +CONFIG_CRYPTO_DEV_VIRTIO=m +CONFIG_ASYMMETRIC_KEY_TYPE=y +CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS7_MESSAGE_PARSER=y +CONFIG_PKCS7_TEST_KEY=m +CONFIG_SIGNED_PE_FILE_VERIFICATION=y + +# +# Certificates for signature checking +# 
+CONFIG_MODULE_SIG_KEY="certs/signing_key.pem" +CONFIG_SYSTEM_TRUSTED_KEYRING=y +CONFIG_SYSTEM_TRUSTED_KEYS="" +# CONFIG_SYSTEM_EXTRA_CERTIFICATE is not set +CONFIG_SECONDARY_TRUSTED_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_HASH_LIST="" +CONFIG_BINARY_PRINTF=y + +# +# Library routines +# +CONFIG_RAID6_PQ=m +CONFIG_BITREVERSE=y +CONFIG_RATIONAL=y +CONFIG_GENERIC_STRNCPY_FROM_USER=y +CONFIG_GENERIC_STRNLEN_USER=y +CONFIG_GENERIC_NET_UTILS=y +CONFIG_GENERIC_FIND_FIRST_BIT=y +CONFIG_GENERIC_PCI_IOMAP=y +CONFIG_GENERIC_IOMAP=y +CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y +CONFIG_ARCH_HAS_FAST_MULTIPLIER=y +CONFIG_CRC_CCITT=m +CONFIG_CRC16=m +CONFIG_CRC_T10DIF=y +CONFIG_CRC_ITU_T=m +CONFIG_CRC32=y +# CONFIG_CRC32_SELFTEST is not set +CONFIG_CRC32_SLICEBY8=y +# CONFIG_CRC32_SLICEBY4 is not set +# CONFIG_CRC32_SARWATE is not set +# CONFIG_CRC32_BIT is not set +CONFIG_CRC64=m +CONFIG_CRC4=m +CONFIG_CRC7=m +CONFIG_LIBCRC32C=m +CONFIG_CRC8=m +CONFIG_XXHASH=y +# CONFIG_RANDOM32_SELFTEST is not set +CONFIG_842_COMPRESS=y +CONFIG_842_DECOMPRESS=y +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=y +CONFIG_LZO_COMPRESS=y +CONFIG_LZO_DECOMPRESS=y +CONFIG_LZ4_COMPRESS=y +CONFIG_LZ4HC_COMPRESS=y +CONFIG_LZ4_DECOMPRESS=y +CONFIG_ZSTD_COMPRESS=y +CONFIG_ZSTD_DECOMPRESS=y +CONFIG_XZ_DEC=y +CONFIG_XZ_DEC_X86=y +CONFIG_XZ_DEC_POWERPC=y +CONFIG_XZ_DEC_IA64=y +CONFIG_XZ_DEC_ARM=y +CONFIG_XZ_DEC_ARMTHUMB=y +CONFIG_XZ_DEC_SPARC=y +CONFIG_XZ_DEC_BCJ=y +# CONFIG_XZ_DEC_TEST is not set +CONFIG_DECOMPRESS_GZIP=y +CONFIG_DECOMPRESS_BZIP2=y +CONFIG_DECOMPRESS_LZMA=y +CONFIG_DECOMPRESS_XZ=y +CONFIG_DECOMPRESS_LZO=y +CONFIG_DECOMPRESS_LZ4=y +CONFIG_GENERIC_ALLOCATOR=y +CONFIG_REED_SOLOMON=m +CONFIG_REED_SOLOMON_ENC8=y +CONFIG_REED_SOLOMON_DEC8=y +CONFIG_REED_SOLOMON_DEC16=y +CONFIG_BCH=m +CONFIG_BCH_CONST_PARAMS=y +CONFIG_TEXTSEARCH=y +CONFIG_TEXTSEARCH_KMP=m +CONFIG_TEXTSEARCH_BM=m +CONFIG_TEXTSEARCH_FSM=m +CONFIG_BTREE=y +CONFIG_INTERVAL_TREE=y +CONFIG_RADIX_TREE_MULTIORDER=y +CONFIG_ASSOCIATIVE_ARRAY=y +CONFIG_HAS_IOMEM=y +CONFIG_HAS_IOPORT_MAP=y +CONFIG_HAS_DMA=y +CONFIG_NEED_SG_DMA_LENGTH=y +CONFIG_NEED_DMA_MAP_STATE=y +CONFIG_ARCH_DMA_ADDR_T_64BIT=y +CONFIG_DMA_DIRECT_OPS=y +CONFIG_DMA_VIRT_OPS=y +CONFIG_SWIOTLB=y +CONFIG_SGL_ALLOC=y +CONFIG_IOMMU_HELPER=y +CONFIG_CHECK_SIGNATURE=y +CONFIG_CPUMASK_OFFSTACK=y +CONFIG_CPU_RMAP=y +CONFIG_DQL=y +CONFIG_GLOB=y +# CONFIG_GLOB_SELFTEST is not set +CONFIG_NLATTR=y +CONFIG_LRU_CACHE=m +CONFIG_CLZ_TAB=y +CONFIG_CORDIC=m +CONFIG_DDR=y +CONFIG_IRQ_POLL=y +CONFIG_MPILIB=y +CONFIG_OID_REGISTRY=y +CONFIG_UCS2_STRING=y +CONFIG_FONT_SUPPORT=y +CONFIG_FONTS=y +CONFIG_FONT_8x8=y +CONFIG_FONT_8x16=y +# CONFIG_FONT_6x11 is not set +# CONFIG_FONT_7x14 is not set +# CONFIG_FONT_PEARL_8x8 is not set +# CONFIG_FONT_ACORN_8x8 is not set +# CONFIG_FONT_MINI_4x6 is not set +# CONFIG_FONT_6x10 is not set +# CONFIG_FONT_10x18 is not set +# CONFIG_FONT_SUN8x16 is not set +# CONFIG_FONT_SUN12x22 is not set +CONFIG_SG_POOL=y +CONFIG_ARCH_HAS_SG_CHAIN=y +CONFIG_ARCH_HAS_PMEM_API=y +CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE=y +CONFIG_ARCH_HAS_UACCESS_MCSAFE=y +CONFIG_SBITMAP=y +CONFIG_PARMAN=m +CONFIG_PRIME_NUMBERS=m +# CONFIG_STRING_SELFTEST is not set + +# +# Kernel hacking +# + +# +# printk and dmesg options +# +# CONFIG_PRINTK_TIME is not set +CONFIG_CONSOLE_LOGLEVEL_DEFAULT=1 +CONFIG_CONSOLE_LOGLEVEL_QUIET=4 +CONFIG_MESSAGE_LOGLEVEL_DEFAULT=1 +# CONFIG_BOOT_PRINTK_DELAY is not set +# CONFIG_DYNAMIC_DEBUG is not set + +# +# Compile-time checks and compiler options +# +# CONFIG_DEBUG_INFO is 
not set +# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_FRAME_WARN=0 +CONFIG_STRIP_ASM_SYMS=y +# CONFIG_READABLE_ASM is not set +# CONFIG_UNUSED_SYMBOLS is not set +# CONFIG_PAGE_OWNER is not set +CONFIG_DEBUG_FS=y +# CONFIG_HEADERS_CHECK is not set +# CONFIG_DEBUG_SECTION_MISMATCH is not set +CONFIG_SECTION_MISMATCH_WARN_ONLY=y +CONFIG_STACK_VALIDATION=y +# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set +CONFIG_MAGIC_SYSRQ=y +CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x1 +CONFIG_MAGIC_SYSRQ_SERIAL=y +CONFIG_DEBUG_KERNEL=y + +# +# Memory Debugging +# +# CONFIG_PAGE_EXTENSION is not set +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_PAGE_POISONING is not set +# CONFIG_DEBUG_PAGE_REF is not set +# CONFIG_DEBUG_RODATA_TEST is not set +# CONFIG_DEBUG_OBJECTS is not set +CONFIG_SLUB_DEBUG_ON=y +# CONFIG_SLUB_STATS is not set +CONFIG_HAVE_DEBUG_KMEMLEAK=y +# CONFIG_DEBUG_KMEMLEAK is not set +# CONFIG_DEBUG_STACK_USAGE is not set +# CONFIG_DEBUG_VM is not set +CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y +# CONFIG_DEBUG_VIRTUAL is not set +CONFIG_DEBUG_MEMORY_INIT=y +# CONFIG_DEBUG_PER_CPU_MAPS is not set +CONFIG_HAVE_DEBUG_STACKOVERFLOW=y +# CONFIG_DEBUG_STACKOVERFLOW is not set +CONFIG_HAVE_ARCH_KASAN=y +# CONFIG_KASAN is not set +CONFIG_ARCH_HAS_KCOV=y +CONFIG_CC_HAS_SANCOV_TRACE_PC=y +# CONFIG_KCOV is not set +# CONFIG_DEBUG_SHIRQ is not set + +# +# Debug Lockups and Hangs +# +# CONFIG_SOFTLOCKUP_DETECTOR is not set +CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y +# CONFIG_HARDLOCKUP_DETECTOR is not set +# CONFIG_DETECT_HUNG_TASK is not set +# CONFIG_WQ_WATCHDOG is not set +# CONFIG_PANIC_ON_OOPS is not set +CONFIG_PANIC_ON_OOPS_VALUE=0 +CONFIG_PANIC_TIMEOUT=0 +CONFIG_SCHED_DEBUG=y +CONFIG_SCHED_INFO=y +CONFIG_SCHEDSTATS=y +CONFIG_SCHED_STACK_END_CHECK=y +# CONFIG_DEBUG_TIMEKEEPING is not set +# CONFIG_DEBUG_PREEMPT is not set + +# +# Lock Debugging (spinlocks, mutexes, etc...) 
+# +CONFIG_LOCK_DEBUGGING_SUPPORT=y +# CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set +# CONFIG_DEBUG_RT_MUTEXES is not set +# CONFIG_DEBUG_SPINLOCK is not set +# CONFIG_DEBUG_MUTEXES is not set +# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set +# CONFIG_DEBUG_RWSEMS is not set +# CONFIG_DEBUG_LOCK_ALLOC is not set +# CONFIG_DEBUG_ATOMIC_SLEEP is not set +# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +# CONFIG_LOCK_TORTURE_TEST is not set +# CONFIG_WW_MUTEX_SELFTEST is not set +CONFIG_STACKTRACE=y +# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set +# CONFIG_DEBUG_KOBJECT is not set +CONFIG_DEBUG_BUGVERBOSE=y +CONFIG_DEBUG_LIST=y +CONFIG_DEBUG_PI_LIST=y +CONFIG_DEBUG_SG=y +CONFIG_DEBUG_NOTIFIERS=y +CONFIG_DEBUG_CREDENTIALS=y + +# +# RCU Debugging +# +CONFIG_TORTURE_TEST=m +CONFIG_RCU_PERF_TEST=m +# CONFIG_RCU_TORTURE_TEST is not set +CONFIG_RCU_CPU_STALL_TIMEOUT=60 +# CONFIG_RCU_TRACE is not set +# CONFIG_RCU_EQS_DEBUG is not set +# CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set +# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set +# CONFIG_CPU_HOTPLUG_STATE_CONTROL is not set +# CONFIG_NOTIFIER_ERROR_INJECTION is not set +CONFIG_FUNCTION_ERROR_INJECTION=y +# CONFIG_FAULT_INJECTION is not set +CONFIG_LATENCYTOP=y +CONFIG_USER_STACKTRACE_SUPPORT=y +CONFIG_NOP_TRACER=y +CONFIG_HAVE_FUNCTION_TRACER=y +CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y +CONFIG_HAVE_DYNAMIC_FTRACE=y +CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS=y +CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y +CONFIG_HAVE_FENTRY=y +CONFIG_HAVE_C_RECORDMCOUNT=y +CONFIG_TRACE_CLOCK=y +CONFIG_RING_BUFFER=y +CONFIG_EVENT_TRACING=y +CONFIG_CONTEXT_SWITCH_TRACER=y +CONFIG_TRACING=y +CONFIG_GENERIC_TRACER=y +CONFIG_TRACING_SUPPORT=y +CONFIG_FTRACE=y +CONFIG_FUNCTION_TRACER=y +# CONFIG_FUNCTION_GRAPH_TRACER is not set +# CONFIG_PREEMPTIRQ_EVENTS is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_PREEMPT_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_HWLAT_TRACER is not set +CONFIG_FTRACE_SYSCALLS=y +# CONFIG_TRACER_SNAPSHOT is not set +CONFIG_BRANCH_PROFILE_NONE=y +# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set +CONFIG_STACK_TRACER=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_KPROBE_EVENTS=y +# CONFIG_KPROBE_EVENTS_ON_NOTRACE is not set +# CONFIG_UPROBE_EVENTS is not set +CONFIG_BPF_EVENTS=y +CONFIG_PROBE_EVENTS=y +CONFIG_DYNAMIC_FTRACE=y +CONFIG_DYNAMIC_FTRACE_WITH_REGS=y +CONFIG_FUNCTION_PROFILER=y +# CONFIG_BPF_KPROBE_OVERRIDE is not set +CONFIG_FTRACE_MCOUNT_RECORD=y +# CONFIG_FTRACE_STARTUP_TEST is not set +# CONFIG_MMIOTRACE is not set +# CONFIG_HIST_TRIGGERS is not set +# CONFIG_TRACEPOINT_BENCHMARK is not set +# CONFIG_RING_BUFFER_BENCHMARK is not set +# CONFIG_RING_BUFFER_STARTUP_TEST is not set +# CONFIG_PREEMPTIRQ_DELAY_TEST is not set +# CONFIG_TRACE_EVAL_MAP_FILE is not set +CONFIG_TRACING_EVENTS_GPIO=y +# CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set +# CONFIG_DMA_API_DEBUG is not set +CONFIG_RUNTIME_TESTING_MENU=y +CONFIG_LKDTM=m +# CONFIG_TEST_LIST_SORT is not set +# CONFIG_TEST_SORT is not set +# CONFIG_KPROBES_SANITY_TEST is not set +# CONFIG_BACKTRACE_SELF_TEST is not set +# CONFIG_RBTREE_TEST is not set +# CONFIG_INTERVAL_TREE_TEST is not set +# CONFIG_PERCPU_TEST is not set +# CONFIG_ATOMIC64_SELFTEST is not set +# CONFIG_ASYNC_RAID6_TEST is not set +# CONFIG_TEST_HEXDUMP is not set +# CONFIG_TEST_STRING_HELPERS is not set +# CONFIG_TEST_KSTRTOX is not set +# CONFIG_TEST_PRINTF is not set +# CONFIG_TEST_BITMAP is not set +# CONFIG_TEST_BITFIELD is not set +# CONFIG_TEST_UUID is not set +# CONFIG_TEST_OVERFLOW is 
not set +# CONFIG_TEST_RHASHTABLE is not set +# CONFIG_TEST_HASH is not set +# CONFIG_TEST_IDA is not set +# CONFIG_TEST_PARMAN is not set +# CONFIG_TEST_LKM is not set +# CONFIG_TEST_USER_COPY is not set +# CONFIG_TEST_BPF is not set +# CONFIG_FIND_BIT_BENCHMARK is not set +# CONFIG_TEST_FIRMWARE is not set +# CONFIG_TEST_SYSCTL is not set +# CONFIG_TEST_UDELAY is not set +# CONFIG_TEST_STATIC_KEYS is not set +# CONFIG_TEST_KMOD is not set +CONFIG_MEMTEST=y +# CONFIG_BUG_ON_DATA_CORRUPTION is not set +# CONFIG_SAMPLES is not set +CONFIG_HAVE_ARCH_KGDB=y +# CONFIG_KGDB is not set +CONFIG_ARCH_HAS_UBSAN_SANITIZE_ALL=y +# CONFIG_UBSAN is not set +CONFIG_ARCH_HAS_DEVMEM_IS_ALLOWED=y +CONFIG_STRICT_DEVMEM=y +CONFIG_IO_STRICT_DEVMEM=y +CONFIG_TRACE_IRQFLAGS_SUPPORT=y +CONFIG_EARLY_PRINTK_USB=y +CONFIG_X86_VERBOSE_BOOTUP=y +CONFIG_EARLY_PRINTK=y +CONFIG_EARLY_PRINTK_DBGP=y +CONFIG_EARLY_PRINTK_EFI=y +CONFIG_EARLY_PRINTK_USB_XDBC=y +CONFIG_X86_PTDUMP_CORE=y +# CONFIG_X86_PTDUMP is not set +# CONFIG_EFI_PGT_DUMP is not set +CONFIG_DEBUG_WX=y +CONFIG_DOUBLEFAULT=y +# CONFIG_DEBUG_TLBFLUSH is not set +# CONFIG_IOMMU_DEBUG is not set +CONFIG_HAVE_MMIOTRACE_SUPPORT=y +# CONFIG_X86_DECODER_SELFTEST is not set +CONFIG_IO_DELAY_TYPE_0X80=0 +CONFIG_IO_DELAY_TYPE_0XED=1 +CONFIG_IO_DELAY_TYPE_UDELAY=2 +CONFIG_IO_DELAY_TYPE_NONE=3 +CONFIG_IO_DELAY_0X80=y +# CONFIG_IO_DELAY_0XED is not set +# CONFIG_IO_DELAY_UDELAY is not set +# CONFIG_IO_DELAY_NONE is not set +CONFIG_DEFAULT_IO_DELAY_TYPE=0 +# CONFIG_DEBUG_BOOT_PARAMS is not set +# CONFIG_CPA_DEBUG is not set +# CONFIG_OPTIMIZE_INLINING is not set +# CONFIG_DEBUG_ENTRY is not set +# CONFIG_DEBUG_NMI_SELFTEST is not set +CONFIG_X86_DEBUG_FPU=y +# CONFIG_PUNIT_ATOM_DEBUG is not set +CONFIG_UNWINDER_ORC=y +# CONFIG_UNWINDER_FRAME_POINTER is not set diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-revert-patches-causing-instant-reboot.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-revert-patches-causing-instant-reboot.patch new file mode 100644 index 00000000..a2127cff --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-revert-patches-causing-instant-reboot.patch @@ -0,0 +1,314 @@ +diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S +index 8169e8b7a4dc..12915511be61 100644 +--- a/arch/x86/boot/compressed/head_64.S ++++ b/arch/x86/boot/compressed/head_64.S +@@ -305,48 +305,13 @@ ENTRY(startup_64) + /* Set up the stack */ + leaq boot_stack_end(%rbx), %rsp + +- /* +- * paging_prepare() and cleanup_trampoline() below can have GOT +- * references. Adjust the table with address we are running at. +- * +- * Zero RAX for adjust_got: the GOT was not adjusted before; +- * there's no adjustment to undo. +- */ +- xorq %rax, %rax +- +- /* +- * Calculate the address the binary is loaded at and use it as +- * a GOT adjustment. +- */ +- call 1f +-1: popq %rdi +- subq $1b, %rdi +- +- call adjust_got +- + /* + * At this point we are in long mode with 4-level paging enabled, +- * but we might want to enable 5-level paging or vice versa. +- * +- * The problem is that we cannot do it directly. Setting or clearing +- * CR4.LA57 in long mode would trigger #GP. So we need to switch off +- * long mode and paging first. +- * +- * We also need a trampoline in lower memory to switch over from +- * 4- to 5-level paging for cases when the bootloader puts the kernel +- * above 4G, but didn't enable 5-level paging for us. 
+- * +- * The same trampoline can be used to switch from 5- to 4-level paging +- * mode, like when starting 4-level paging kernel via kexec() when +- * original kernel worked in 5-level paging mode. +- * +- * For the trampoline, we need the top page table to reside in lower +- * memory as we don't have a way to load 64-bit values into CR3 in +- * 32-bit mode. ++ * but we want to enable 5-level paging. + * +- * We go though the trampoline even if we don't have to: if we're +- * already in a desired paging mode. This way the trampoline code gets +- * tested on every boot. ++ * The problem is that we cannot do it directly. Setting LA57 in ++ * long mode would trigger #GP. So we need to switch off long mode ++ * first. + */ + + /* Make sure we have GDT with 32-bit code segment */ +@@ -371,32 +336,40 @@ ENTRY(startup_64) + /* Save the trampoline address in RCX */ + movq %rax, %rcx + ++ /* Check if we need to enable 5-level paging */ ++ cmpq $0, %rdx ++ jz lvl5 ++ ++ /* Clear additional page table */ ++ leaq lvl5_pgtable(%rbx), %rdi ++ xorq %rax, %rax ++ movq $(PAGE_SIZE/8), %rcx ++ rep stosq ++ + /* +- * Load the address of trampoline_return() into RDI. +- * It will be used by the trampoline to return to the main code. ++ * Setup current CR3 as the first and only entry in a new top level ++ * page table. + */ +- leaq trampoline_return(%rip), %rdi ++ movq %cr3, %rdi ++ leaq 0x7 (%rdi), %rax ++ movq %rax, lvl5_pgtable(%rbx) + + /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ + pushq $__KERNEL32_CS +- leaq TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax ++ leaq compatible_mode(%rip), %rax + pushq %rax + lretq +-trampoline_return: ++lvl5: + /* Restore the stack, the 32-bit trampoline uses its own stack */ + leaq boot_stack_end(%rbx), %rsp + + /* + * cleanup_trampoline() would restore trampoline memory. + * +- * RDI is address of the page table to use instead of page table +- * in trampoline memory (if required). +- * + * RSI holds real mode data and needs to be preserved across + * this function call. + */ + pushq %rsi +- leaq top_pgtable(%rbx), %rdi + call cleanup_trampoline + popq %rsi + +@@ -404,21 +377,6 @@ trampoline_return: + pushq $0 + popfq + +- /* +- * Previously we've adjusted the GOT with address the binary was +- * loaded at. Now we need to re-adjust for relocation address. +- * +- * Calculate the address the binary is loaded at, so that we can +- * undo the previous GOT adjustment. +- */ +- call 1f +-1: popq %rax +- subq $1b, %rax +- +- /* The new adjustment is the relocation address */ +- movq %rbx, %rdi +- call adjust_got +- + /* + * Copy the compressed kernel to the end of our buffer + * where decompression in place becomes safe. +@@ -519,6 +477,19 @@ relocated: + shrq $3, %rcx + rep stosq + ++/* ++ * Adjust our own GOT ++ */ ++ leaq _got(%rip), %rdx ++ leaq _egot(%rip), %rcx ++1: ++ cmpq %rcx, %rdx ++ jae 2f ++ addq %rbx, (%rdx) ++ addq $8, %rdx ++ jmp 1b ++2: ++ + /* + * Do the extraction, and jump to the new kernel.. + */ +@@ -537,36 +508,9 @@ relocated: + */ + jmp *%rax + +-/* +- * Adjust the global offset table +- * +- * RAX is the previous adjustment of the table to undo (use 0 if it's the +- * first time we touch GOT). +- * RDI is the new adjustment to apply. 
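For readers following this revert: the adjust_got helper being deleted here walks the global offset table between _got and _egot, undoing the previously applied load-address bias and applying a new one to every 8-byte entry. A minimal C sketch of that loop (illustrative only, not code from either kernel tree; RAX carries the bias to undo, RDI the bias to apply):

	static void adjust_got_sketch(unsigned long *got, unsigned long *egot,
				      unsigned long undo_bias, unsigned long apply_bias)
	{
		unsigned long *entry;

		/* mirror of: subq %rax,(%rdx); addq %rdi,(%rdx); addq $8,%rdx */
		for (entry = got; entry < egot; entry++)
			*entry = *entry - undo_bias + apply_bias;
	}

The revert replaces this two-pass scheme with the older single pass restored above, which simply adds the relocation base in %rbx to each entry once the kernel has been moved.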
+- */ +-adjust_got: +- /* Walk through the GOT adding the address to the entries */ +- leaq _got(%rip), %rdx +- leaq _egot(%rip), %rcx +-1: +- cmpq %rcx, %rdx +- jae 2f +- subq %rax, (%rdx) /* Undo previous adjustment */ +- addq %rdi, (%rdx) /* Apply the new adjustment */ +- addq $8, %rdx +- jmp 1b +-2: +- ret +- + .code32 +-/* +- * This is the 32-bit trampoline that will be copied over to low memory. +- * +- * RDI contains the return address (might be above 4G). +- * ECX contains the base address of the trampoline memory. +- * Non zero RDX on return means we need to enable 5-level paging. +- */ + ENTRY(trampoline_32bit_src) ++compatible_mode: + /* Set up data and stack segments */ + movl $__KERNEL_DS, %eax + movl %eax, %ds +@@ -580,61 +524,33 @@ ENTRY(trampoline_32bit_src) + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + +- /* Check what paging mode we want to be in after the trampoline */ +- cmpl $0, %edx +- jz 1f ++ /* Point CR3 to 5-level paging */ ++ leal lvl5_pgtable(%ebx), %eax ++ movl %eax, %cr3 + +- /* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */ ++ /* Enable PAE and LA57 mode */ + movl %cr4, %eax +- testl $X86_CR4_LA57, %eax +- jnz 3f +- jmp 2f +-1: +- /* We want 4-level paging: don't touch CR3 if it already points to 4-level page tables */ +- movl %cr4, %eax +- testl $X86_CR4_LA57, %eax +- jz 3f +-2: +- /* Point CR3 to the trampoline's new top level page table */ +- leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax +- movl %eax, %cr3 +-3: +- /* Enable PAE and LA57 (if required) paging modes */ +- movl $X86_CR4_PAE, %eax +- cmpl $0, %edx +- jz 1f +- orl $X86_CR4_LA57, %eax +-1: ++ orl $(X86_CR4_PAE | X86_CR4_LA57), %eax + movl %eax, %cr4 + +- /* Calculate address of paging_enabled() once we are executing in the trampoline */ +- leal paging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax ++ /* Calculate address we are running at */ ++ call 1f ++1: popl %edi ++ subl $1b, %edi + +- /* Prepare the stack for far return to Long Mode */ ++ /* Prepare stack for far return to Long Mode */ + pushl $__KERNEL_CS +- pushl %eax ++ leal lvl5(%edi), %eax ++ push %eax + +- /* Enable paging again */ ++ /* Enable paging back */ + movl $(X86_CR0_PG | X86_CR0_PE), %eax + movl %eax, %cr0 + + lret + +- .code64 +-paging_enabled: +- /* Return from the trampoline */ +- jmp *%rdi +- +- /* +- * The trampoline code has a size limit. +- * Make sure we fail to compile if the trampoline code grows +- * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes. +- */ +- .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE +- +- .code32 + no_longmode: +- /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */ ++ /* This isn't an x86-64 CPU so hang */ + 1: + hlt + jmp 1b +@@ -695,10 +611,5 @@ boot_stack_end: + .balign 4096 + pgtable: + .fill BOOT_PGT_SIZE, 1, 0 +- +-/* +- * The page table is going to be used instead of page table in the trampoline +- * memory. +- */ +-top_pgtable: ++lvl5_pgtable: + .fill PAGE_SIZE, 1, 0 +diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c +index a362fa0b849c..32af1cbcd903 100644 +--- a/arch/x86/boot/compressed/pgtable_64.c ++++ b/arch/x86/boot/compressed/pgtable_64.c +@@ -22,6 +22,14 @@ struct paging_config { + /* Buffer to preserve trampoline memory */ + static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; + ++/* ++ * The page table is going to be used instead of page table in the trampoline ++ * memory. 
++ * ++ * It must not be in BSS as BSS is cleared after cleanup_trampoline(). ++ */ ++static char top_pgtable[PAGE_SIZE] __aligned(PAGE_SIZE) __section(.data); ++ + /* + * Trampoline address will be printed by extract_kernel() for debugging + * purposes. +@@ -126,7 +134,7 @@ struct paging_config paging_prepare(void) + return paging_config; + } + +-void cleanup_trampoline(void *pgtable) ++void cleanup_trampoline(void) + { + void *trampoline_pgtable; + +@@ -137,8 +145,8 @@ void cleanup_trampoline(void *pgtable) + * if it's there. + */ + if ((void *)__native_read_cr3() == trampoline_pgtable) { +- memcpy(pgtable, trampoline_pgtable, PAGE_SIZE); +- native_write_cr3((unsigned long)pgtable); ++ memcpy(top_pgtable, trampoline_pgtable, PAGE_SIZE); ++ native_write_cr3((unsigned long)top_pgtable); + } + + /* Restore trampoline memory */ diff --git a/sys-kernel/linux-sources-redcore-lts/files/4.19-uksm-linux-hardened.patch b/sys-kernel/linux-sources-redcore-lts/files/4.19-uksm-linux-hardened.patch new file mode 100644 index 00000000..afb30db2 --- /dev/null +++ b/sys-kernel/linux-sources-redcore-lts/files/4.19-uksm-linux-hardened.patch @@ -0,0 +1,6941 @@ +diff -Nur a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX +--- a/Documentation/vm/00-INDEX 2019-02-06 16:30:16.000000000 +0000 ++++ b/Documentation/vm/00-INDEX 2019-02-09 17:23:06.726863699 +0000 +@@ -18,6 +18,8 @@ + - explains what hwpoison is + ksm.rst + - how to use the Kernel Samepage Merging feature. ++uksm.txt ++ - Introduction to Ultra KSM + mmu_notifier.rst + - a note about clearing pte/pmd and mmu notifications + numa.rst +diff -Nur a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt +--- a/Documentation/vm/uksm.txt 1970-01-01 01:00:00.000000000 +0100 ++++ b/Documentation/vm/uksm.txt 2019-02-09 17:23:06.726863699 +0000 +@@ -0,0 +1,61 @@ ++The Ultra Kernel Samepage Merging feature ++---------------------------------------------- ++/* ++ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia ++ * ++ * This is an improvement upon KSM. Some basic data structures and routines ++ * are borrowed from ksm.c. ++ * ++ * Its new features: ++ * 1. Full system scan: ++ * It automatically scans all user processes' anonymous VMAs. Kernel-user ++ * interaction to submit a memory area to KSM is no longer needed. ++ * ++ * 2. Rich area detection: ++ * It automatically detects rich areas containing abundant duplicated ++ * pages. Rich areas are given a full scan speed. Poor areas are ++ * sampled at a reasonable speed with very low CPU consumption. ++ * ++ * 3. Ultra per-page scan speed improvement: ++ * A new hash algorithm is proposed. As a result, on a machine with a ++ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it ++ * can scan memory areas that do not contain duplicated pages at a speed of ++ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of ++ * 477MB/sec ~ 923MB/sec. ++ * ++ * 4. Thrashing area avoidance: ++ * A thrashing area (a VMA that has frequent KSM page break-outs) can be ++ * filtered out. My benchmark shows it's more efficient than KSM's per-page ++ * hash-value-based volatile page detection. ++ * ++ * ++ * 5. Misc changes upon KSM: ++ * * It has a fully x86-optimized memcmp dedicated for 4-byte-aligned page ++ * comparison. It's much faster than the default C version on x86. ++ * * rmap_item now has a struct *page member to loosely cache an ++ * address-->page mapping, which avoids many costly ++ * follow_page() calls. ++ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
++ * try_to_merge_two_pages() now can revert a pte if it fails. No break_ ++ * ksm is needed for this case. ++ * ++ * 6. Full Zero Page consideration (contributed by Figo Zhang) ++ * Now uksmd considers full zero pages as special pages and merges them ++ * into a special unswappable uksm zero page. ++ */ ++ ++ChangeLog: ++ ++2012-05-05 The creation of this Doc ++2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up. ++2012-05-28 UKSM 0.1.1.2 bug fix release ++2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2 ++2012-07-02 UKSM 0.1.2-beta2 ++2012-07-10 UKSM 0.1.2-beta3 ++2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization. ++2012-10-13 UKSM 0.1.2.1 Bug fixes. ++2012-12-31 UKSM 0.1.2.2 Minor bug fixes. ++2014-07-02 UKSM 0.1.2.3 Fix a " __this_cpu_read() in preemptible bug". ++2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger annoying warnings. ++2016-09-10 UKSM 0.1.2.5 Fix a bug in dedup ratio calculation. ++2017-02-26 UKSM 0.1.2.6 Fix a bug in hugetlbpage handling and a race bug with page migration. +diff -Nur a/fs/exec.c b/fs/exec.c +--- a/fs/exec.c 2019-02-09 17:20:30.471820869 +0000 ++++ b/fs/exec.c 2019-02-09 17:26:11.522863979 +0000 +@@ -63,6 +63,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -1382,6 +1383,7 @@ + /* An exec changes our domain. We are no longer part of the thread + group */ + current->self_exec_id++; ++ + flush_signal_handlers(current, 0); + } + EXPORT_SYMBOL(setup_new_exec); +diff -Nur a/fs/proc/meminfo.c b/fs/proc/meminfo.c +--- a/fs/proc/meminfo.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/fs/proc/meminfo.c 2019-02-09 17:23:06.726863699 +0000 +@@ -106,6 +106,10 @@ + global_zone_page_state(NR_KERNEL_STACK_KB)); + show_val_kb(m, "PageTables: ", + global_zone_page_state(NR_PAGETABLE)); ++#ifdef CONFIG_UKSM ++ show_val_kb(m, "KsmZeroPages: ", ++ global_zone_page_state(NR_UKSM_ZERO_PAGES)); ++#endif + #ifdef CONFIG_QUICKLIST + show_val_kb(m, "Quicklists: ", quicklist_total_size()); + #endif +diff -Nur a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h +--- a/include/asm-generic/pgtable.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/asm-generic/pgtable.h 2019-02-09 17:23:06.726863699 +0000 +@@ -817,12 +817,25 @@ + extern void untrack_pfn_moved(struct vm_area_struct *vma); + #endif + ++#ifdef CONFIG_UKSM ++static inline int is_uksm_zero_pfn(unsigned long pfn) ++{ ++ extern unsigned long uksm_zero_pfn; ++ return pfn == uksm_zero_pfn; ++} ++#else ++static inline int is_uksm_zero_pfn(unsigned long pfn) ++{ ++ return 0; ++} ++#endif ++ + #ifdef __HAVE_COLOR_ZERO_PAGE + static inline int is_zero_pfn(unsigned long pfn) + { + extern unsigned long zero_pfn; + unsigned long offset_from_zero_pfn = pfn - zero_pfn; +- return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); ++ return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn); + } + + #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) +@@ -831,7 +844,7 @@ + static inline int is_zero_pfn(unsigned long pfn) + { + extern unsigned long zero_pfn; +- return pfn == zero_pfn; ++ return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn)); + } + + static inline unsigned long my_zero_pfn(unsigned long addr) +diff -Nur a/include/linux/ksm.h b/include/linux/ksm.h +--- a/include/linux/ksm.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/linux/ksm.h 2019-02-09 17:23:06.726863699 +0000 +@@ -1,4 +1,4 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ ++/* SPDX-License-Identifier: GPL-3.0 */ + #ifndef 
__LINUX_KSM_H + #define __LINUX_KSM_H + /* +@@ -21,20 +21,16 @@ + #ifdef CONFIG_KSM + int ksm_madvise(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int advice, unsigned long *vm_flags); +-int __ksm_enter(struct mm_struct *mm); +-void __ksm_exit(struct mm_struct *mm); + +-static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) ++static inline struct stable_node *page_stable_node(struct page *page) + { +- if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) +- return __ksm_enter(mm); +- return 0; ++ return PageKsm(page) ? page_rmapping(page) : NULL; + } + +-static inline void ksm_exit(struct mm_struct *mm) ++static inline void set_page_stable_node(struct page *page, ++ struct stable_node *stable_node) + { +- if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) +- __ksm_exit(mm); ++ page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); + } + + /* +@@ -54,6 +50,33 @@ + void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); + void ksm_migrate_page(struct page *newpage, struct page *oldpage); + ++#ifdef CONFIG_KSM_LEGACY ++int __ksm_enter(struct mm_struct *mm); ++void __ksm_exit(struct mm_struct *mm); ++static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) ++{ ++ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) ++ return __ksm_enter(mm); ++ return 0; ++} ++ ++static inline void ksm_exit(struct mm_struct *mm) ++{ ++ if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) ++ __ksm_exit(mm); ++} ++ ++#elif defined(CONFIG_UKSM) ++static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) ++{ ++ return 0; ++} ++ ++static inline void ksm_exit(struct mm_struct *mm) ++{ ++} ++#endif /* !CONFIG_UKSM */ ++ + #else /* !CONFIG_KSM */ + + static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +@@ -89,4 +112,6 @@ + #endif /* CONFIG_MMU */ + #endif /* !CONFIG_KSM */ + ++#include ++ + #endif /* __LINUX_KSM_H */ +diff -Nur a/include/linux/mm_types.h b/include/linux/mm_types.h +--- a/include/linux/mm_types.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/linux/mm_types.h 2019-02-09 17:23:06.726863699 +0000 +@@ -323,6 +323,9 @@ + struct mempolicy *vm_policy; /* NUMA policy for the VMA */ + #endif + struct vm_userfaultfd_ctx vm_userfaultfd_ctx; ++#ifdef CONFIG_UKSM ++ struct vma_slot *uksm_vma_slot; ++#endif + } __randomize_layout; + + struct core_thread { +diff -Nur a/include/linux/mmzone.h b/include/linux/mmzone.h +--- a/include/linux/mmzone.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/linux/mmzone.h 2019-02-09 17:23:06.726863699 +0000 +@@ -148,6 +148,9 @@ + NR_ZSPAGES, /* allocated in zsmalloc */ + #endif + NR_FREE_CMA_PAGES, ++#ifdef CONFIG_UKSM ++ NR_UKSM_ZERO_PAGES, ++#endif + NR_VM_ZONE_STAT_ITEMS }; + + enum node_stat_item { +@@ -867,7 +870,7 @@ + } + + /** +- * is_highmem - helper function to quickly check if a struct zone is a ++ * is_highmem - helper function to quickly check if a struct zone is a + * highmem zone or not. This is an attempt to keep references + * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. 
+ * @zone - pointer to struct zone variable +diff -Nur a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h +--- a/include/linux/sradix-tree.h 1970-01-01 01:00:00.000000000 +0100 ++++ b/include/linux/sradix-tree.h 2019-02-09 17:23:06.726863699 +0000 +@@ -0,0 +1,77 @@ ++#ifndef _LINUX_SRADIX_TREE_H ++#define _LINUX_SRADIX_TREE_H ++ ++ ++#define INIT_SRADIX_TREE(root, mask) \ ++do { \ ++ (root)->height = 0; \ ++ (root)->gfp_mask = (mask); \ ++ (root)->rnode = NULL; \ ++} while (0) ++ ++#define ULONG_BITS (sizeof(unsigned long) * 8) ++#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) ++//#define SRADIX_TREE_MAP_SHIFT 6 ++//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT) ++//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1) ++ ++struct sradix_tree_node { ++ unsigned int height; /* Height from the bottom */ ++ unsigned int count; ++ unsigned int fulls; /* Number of full sublevel trees */ ++ struct sradix_tree_node *parent; ++ void *stores[0]; ++}; ++ ++/* A simple radix tree implementation */ ++struct sradix_tree_root { ++ unsigned int height; ++ struct sradix_tree_node *rnode; ++ ++ /* Where found to have available empty stores in its sublevels */ ++ struct sradix_tree_node *enter_node; ++ unsigned int shift; ++ unsigned int stores_size; ++ unsigned int mask; ++ unsigned long min; /* The first hole index */ ++ unsigned long num; ++ //unsigned long *height_to_maxindex; ++ ++ /* How the node is allocated and freed. */ ++ struct sradix_tree_node *(*alloc)(void); ++ void (*free)(struct sradix_tree_node *node); ++ ++ /* When a new node is added and removed */ ++ void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child); ++ void (*assign)(struct sradix_tree_node *node, unsigned int index, void *item); ++ void (*rm)(struct sradix_tree_node *node, unsigned int offset); ++}; ++ ++struct sradix_tree_path { ++ struct sradix_tree_node *node; ++ int offset; ++}; ++ ++static inline ++void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift) ++{ ++ root->height = 0; ++ root->rnode = NULL; ++ root->shift = shift; ++ root->stores_size = 1UL << shift; ++ root->mask = root->stores_size - 1; ++} ++ ++ ++extern void *sradix_tree_next(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index, ++ int (*iter)(void *, unsigned long)); ++ ++extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num); ++ ++extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index); ++ ++extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index); ++ ++#endif /* _LINUX_SRADIX_TREE_H */ +diff -Nur a/include/linux/uksm.h b/include/linux/uksm.h +--- a/include/linux/uksm.h 1970-01-01 01:00:00.000000000 +0100 ++++ b/include/linux/uksm.h 2019-02-09 17:23:06.726863699 +0000 +@@ -0,0 +1,149 @@ ++#ifndef __LINUX_UKSM_H ++#define __LINUX_UKSM_H ++/* ++ * Memory merging support. ++ * ++ * This code enables dynamic sharing of identical pages found in different ++ * memory areas, even if they are not shared by fork(). ++ */ ++ ++/* if !CONFIG_UKSM this file should not be compiled at all. 
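To make the callback contract of the sradix tree API above concrete, here is a minimal, hypothetical consumer (names and sizes are illustrative; the real consumer is the slot tree in mm/uksm.c further down this patch): the user embeds struct sradix_tree_node in a container that supplies the stores[] array, then wires up the alloc/free hooks.

	#define MY_SHIFT 4	/* 16 slots per node */

	struct my_node {
		struct sradix_tree_node snode;
		void *stores[1UL << MY_SHIFT];	/* backs snode.stores[0] */
	};

	static struct sradix_tree_node *my_alloc(void)
	{
		struct my_node *p = kzalloc(sizeof(*p), GFP_KERNEL);

		return p ? &p->snode : NULL;
	}

	static void my_free(struct sradix_tree_node *node)
	{
		kfree(container_of(node, struct my_node, snode));
	}

	static struct sradix_tree_root my_root;

	static void my_root_setup(void)
	{
		init_sradix_tree_root(&my_root, MY_SHIFT);
		my_root.alloc = my_alloc;
		my_root.free = my_free;
		/* extend/assign/rm are optional bookkeeping callbacks */
	}

After that, sradix_tree_enter(&my_root, &item, 1) stores one item at the lowest free index, and sradix_tree_lookup(&my_root, idx) retrieves it.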
*/ ++#ifdef CONFIG_UKSM ++ ++#include ++#include ++#include ++#include ++#include ++ ++extern unsigned long zero_pfn __read_mostly; ++extern unsigned long uksm_zero_pfn __read_mostly; ++extern struct page *empty_uksm_zero_page; ++ ++/* must be done before linked to mm */ ++extern void uksm_vma_add_new(struct vm_area_struct *vma); ++extern void uksm_remove_vma(struct vm_area_struct *vma); ++ ++#define UKSM_SLOT_NEED_SORT (1 << 0) ++#define UKSM_SLOT_NEED_RERAND (1 << 1) ++#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */ ++#define UKSM_SLOT_FUL_SCANNED (1 << 3) ++#define UKSM_SLOT_IN_UKSM (1 << 4) ++ ++struct vma_slot { ++ struct sradix_tree_node *snode; ++ unsigned long sindex; ++ ++ struct list_head slot_list; ++ unsigned long fully_scanned_round; ++ unsigned long dedup_num; ++ unsigned long pages_scanned; ++ unsigned long this_sampled; ++ unsigned long last_scanned; ++ unsigned long pages_to_scan; ++ struct scan_rung *rung; ++ struct page **rmap_list_pool; ++ unsigned int *pool_counts; ++ unsigned long pool_size; ++ struct vm_area_struct *vma; ++ struct mm_struct *mm; ++ unsigned long ctime_j; ++ unsigned long pages; ++ unsigned long flags; ++ unsigned long pages_cowed; /* pages cowed this round */ ++ unsigned long pages_merged; /* pages merged this round */ ++ unsigned long pages_bemerged; ++ ++ /* when it has page merged in this eval round */ ++ struct list_head dedup_list; ++}; ++ ++static inline void uksm_unmap_zero_page(pte_t pte) ++{ ++ if (pte_pfn(pte) == uksm_zero_pfn) ++ __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); ++} ++ ++static inline void uksm_map_zero_page(pte_t pte) ++{ ++ if (pte_pfn(pte) == uksm_zero_pfn) ++ __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); ++} ++ ++static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) ++{ ++ if (vma->uksm_vma_slot && PageKsm(page)) ++ vma->uksm_vma_slot->pages_cowed++; ++} ++ ++static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) ++{ ++ if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn) ++ vma->uksm_vma_slot->pages_cowed++; ++} ++ ++static inline int uksm_flags_can_scan(unsigned long vm_flags) ++{ ++#ifdef VM_SAO ++ if (vm_flags & VM_SAO) ++ return 0; ++#endif ++ ++ return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND | ++ VM_HUGETLB | VM_MIXEDMAP | VM_SHARED ++ | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN)); ++} ++ ++static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) ++{ ++ if (uksm_flags_can_scan(*vm_flags_p)) ++ *vm_flags_p |= VM_MERGEABLE; ++} ++ ++/* ++ * Just a wrapper for BUG_ON for where ksm_zeropage must not be. TODO: it will ++ * be removed when uksm zero page patch is stable enough. 
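To make the VMA filtering above concrete, here is the effect of uksm_vm_flags_mod() on two illustrative flag combinations (the outcomes follow directly from uksm_flags_can_scan() as defined above):

	unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;

	uksm_vm_flags_mod(&flags);	/* private anonymous mapping: VM_MERGEABLE is added */

	flags = VM_READ | VM_SHARED | VM_MAYSHARE;
	uksm_vm_flags_mod(&flags);	/* VM_SHARED/VM_MAYSHARE are excluded: flags unchanged */

This is how mm/mmap.c (patched later in this file) marks every eligible new mapping mergeable without any madvise() call from user space.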
++ */ ++static inline void uksm_bugon_zeropage(pte_t pte) ++{ ++ BUG_ON(pte_pfn(pte) == uksm_zero_pfn); ++} ++#else ++static inline void uksm_vma_add_new(struct vm_area_struct *vma) ++{ ++} ++ ++static inline void uksm_remove_vma(struct vm_area_struct *vma) ++{ ++} ++ ++static inline void uksm_unmap_zero_page(pte_t pte) ++{ ++} ++ ++static inline void uksm_map_zero_page(pte_t pte) ++{ ++} ++ ++static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) ++{ ++} ++ ++static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) ++{ ++} ++ ++static inline int uksm_flags_can_scan(unsigned long vm_flags) ++{ ++ return 0; ++} ++ ++static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) ++{ ++} ++ ++static inline void uksm_bugon_zeropage(pte_t pte) ++{ ++} ++#endif /* !CONFIG_UKSM */ ++#endif /* __LINUX_UKSM_H */ +diff -Nur a/kernel/fork.c b/kernel/fork.c +--- a/kernel/fork.c 2019-02-09 17:20:30.481821193 +0000 ++++ b/kernel/fork.c 2019-02-09 17:23:06.736864024 +0000 +@@ -543,7 +543,7 @@ + __vma_link_rb(mm, tmp, rb_link, rb_parent); + rb_link = &tmp->vm_rb.rb_right; + rb_parent = &tmp->vm_rb; +- ++ uksm_vma_add_new(tmp); + mm->map_count++; + if (!(tmp->vm_flags & VM_WIPEONFORK)) + retval = copy_page_range(mm, oldmm, mpnt); +diff -Nur a/lib/Makefile b/lib/Makefile +--- a/lib/Makefile 2019-02-06 16:30:16.000000000 +0000 ++++ b/lib/Makefile 2019-02-09 17:23:06.736864024 +0000 +@@ -18,7 +18,7 @@ + KCOV_INSTRUMENT_dynamic_debug.o := n + + lib-y := ctype.o string.o vsprintf.o cmdline.o \ +- rbtree.o radix-tree.o timerqueue.o\ ++ rbtree.o radix-tree.o sradix-tree.o timerqueue.o\ + idr.o int_sqrt.o extable.o \ + sha1.o chacha20.o irq_regs.o argv_split.o \ + flex_proportions.o ratelimit.o show_mem.o \ +diff -Nur a/lib/sradix-tree.c b/lib/sradix-tree.c +--- a/lib/sradix-tree.c 1970-01-01 01:00:00.000000000 +0100 ++++ b/lib/sradix-tree.c 2019-02-09 17:23:06.736864024 +0000 +@@ -0,0 +1,476 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node) ++{ ++ return node->fulls == root->stores_size || ++ (node->height == 1 && node->count == root->stores_size); ++} ++ ++/* ++ * Extend a sradix tree so it can store key @index. ++ */ ++static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index) ++{ ++ struct sradix_tree_node *node; ++ unsigned int height; ++ ++ if (unlikely(root->rnode == NULL)) { ++ if (!(node = root->alloc())) ++ return -ENOMEM; ++ ++ node->height = 1; ++ root->rnode = node; ++ root->height = 1; ++ } ++ ++ /* Figure out what the height should be. */ ++ height = root->height; ++ index >>= root->shift * height; ++ ++ while (index) { ++ index >>= root->shift; ++ height++; ++ } ++ ++ while (height > root->height) { ++ unsigned int newheight; ++ ++ if (!(node = root->alloc())) ++ return -ENOMEM; ++ ++ /* Increase the height. */ ++ node->stores[0] = root->rnode; ++ root->rnode->parent = node; ++ if (root->extend) ++ root->extend(node, root->rnode); ++ ++ newheight = root->height + 1; ++ node->height = newheight; ++ node->count = 1; ++ if (sradix_node_full(root, root->rnode)) ++ node->fulls = 1; ++ ++ root->rnode = node; ++ root->height = newheight; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Search for the next item from the current node that is not NULL ++ * and can satisfy root->iter(). 
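Throughout this file a key is consumed root->shift bits per level, top-down; the per-level slot is computed as

	offset = (index >> (root->shift * (node->height - 1))) & root->mask;

For example, with shift = 8 (so mask = 0xff) and a tree of height 2, index 0x1234 selects slot 0x12 in the root node and slot 0x34 in the leaf; UKSM's slot tree below uses exactly this shape via SLOT_TREE_NODE_SHIFT = 8.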
++ */ ++void *sradix_tree_next(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index, ++ int (*iter)(void *item, unsigned long height)) ++{ ++ unsigned long offset; ++ void *item; ++ ++ if (unlikely(node == NULL)) { ++ node = root->rnode; ++ for (offset = 0; offset < root->stores_size; offset++) { ++ item = node->stores[offset]; ++ if (item && (!iter || iter(item, node->height))) ++ break; ++ } ++ ++ if (unlikely(offset >= root->stores_size)) ++ return NULL; ++ ++ if (node->height == 1) ++ return item; ++ else ++ goto go_down; ++ } ++ ++ while (node) { ++ offset = (index & root->mask) + 1; ++ for (; offset < root->stores_size; offset++) { ++ item = node->stores[offset]; ++ if (item && (!iter || iter(item, node->height))) ++ break; ++ } ++ ++ if (offset < root->stores_size) ++ break; ++ ++ node = node->parent; ++ index >>= root->shift; ++ } ++ ++ if (!node) ++ return NULL; ++ ++ while (node->height > 1) { ++go_down: ++ node = item; ++ for (offset = 0; offset < root->stores_size; offset++) { ++ item = node->stores[offset]; ++ if (item && (!iter || iter(item, node->height))) ++ break; ++ } ++ ++ if (unlikely(offset >= root->stores_size)) ++ return NULL; ++ } ++ ++ BUG_ON(offset > root->stores_size); ++ ++ return item; ++} ++ ++/* ++ * Blindly insert the item to the tree. Typically, we reuse the ++ * first empty store item. ++ */ ++int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num) ++{ ++ unsigned long index; ++ unsigned int height; ++ struct sradix_tree_node *node, *tmp = NULL; ++ int offset, offset_saved; ++ void **store = NULL; ++ int error, i, j, shift; ++ ++go_on: ++ index = root->min; ++ ++ if (root->enter_node && !sradix_node_full(root, root->enter_node)) { ++ node = root->enter_node; ++ BUG_ON((index >> (root->shift * root->height))); ++ } else { ++ node = root->rnode; ++ if (node == NULL || (index >> (root->shift * root->height)) ++ || sradix_node_full(root, node)) { ++ error = sradix_tree_extend(root, index); ++ if (error) ++ return error; ++ ++ node = root->rnode; ++ } ++ } ++ ++ ++ height = node->height; ++ shift = (height - 1) * root->shift; ++ offset = (index >> shift) & root->mask; ++ while (shift > 0) { ++ offset_saved = offset; ++ for (; offset < root->stores_size; offset++) { ++ store = &node->stores[offset]; ++ tmp = *store; ++ ++ if (!tmp || !sradix_node_full(root, tmp)) ++ break; ++ } ++ BUG_ON(offset >= root->stores_size); ++ ++ if (offset != offset_saved) { ++ index += (offset - offset_saved) << shift; ++ index &= ~((1UL << shift) - 1); ++ } ++ ++ if (!tmp) { ++ if (!(tmp = root->alloc())) ++ return -ENOMEM; ++ ++ tmp->height = shift / root->shift; ++ *store = tmp; ++ tmp->parent = node; ++ node->count++; ++// if (root->extend) ++// root->extend(node, tmp); ++ } ++ ++ node = tmp; ++ shift -= root->shift; ++ offset = (index >> shift) & root->mask; ++ } ++ ++ BUG_ON(node->height != 1); ++ ++ ++ store = &node->stores[offset]; ++ for (i = 0, j = 0; ++ j < root->stores_size - node->count && ++ i < root->stores_size - offset && j < num; i++) { ++ if (!store[i]) { ++ store[i] = item[j]; ++ if (root->assign) ++ root->assign(node, index + i, item[j]); ++ j++; ++ } ++ } ++ ++ node->count += j; ++ root->num += j; ++ num -= j; ++ ++ while (sradix_node_full(root, node)) { ++ node = node->parent; ++ if (!node) ++ break; ++ ++ node->fulls++; ++ } ++ ++ if (unlikely(!node)) { ++ /* All nodes are full */ ++ root->min = 1 << (root->height * root->shift); ++ root->enter_node = NULL; ++ } else { ++ root->min = index + i - 1; ++ 
root->min |= (1UL << (node->height - 1)) - 1; ++ root->min++; ++ root->enter_node = node; ++ } ++ ++ if (num) { ++ item += j; ++ goto go_on; ++ } ++ ++ return 0; ++} ++ ++ ++/** ++ * sradix_tree_shrink - shrink height of a sradix tree to minimal ++ * @root sradix tree root ++ * ++ */ ++static inline void sradix_tree_shrink(struct sradix_tree_root *root) ++{ ++ /* try to shrink tree height */ ++ while (root->height > 1) { ++ struct sradix_tree_node *to_free = root->rnode; ++ ++ /* ++ * The candidate node has more than one child, or its child ++ * is not at the leftmost store, we cannot shrink. ++ */ ++ if (to_free->count != 1 || !to_free->stores[0]) ++ break; ++ ++ root->rnode = to_free->stores[0]; ++ root->rnode->parent = NULL; ++ root->height--; ++ if (unlikely(root->enter_node == to_free)) ++ root->enter_node = NULL; ++ root->free(to_free); ++ } ++} ++ ++/* ++ * Del the item on the known leaf node and index ++ */ ++void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index) ++{ ++ unsigned int offset; ++ struct sradix_tree_node *start, *end; ++ ++ BUG_ON(node->height != 1); ++ ++ start = node; ++ while (node && !(--node->count)) ++ node = node->parent; ++ ++ end = node; ++ if (!node) { ++ root->rnode = NULL; ++ root->height = 0; ++ root->min = 0; ++ root->num = 0; ++ root->enter_node = NULL; ++ } else { ++ offset = (index >> (root->shift * (node->height - 1))) & root->mask; ++ if (root->rm) ++ root->rm(node, offset); ++ node->stores[offset] = NULL; ++ root->num--; ++ if (root->min > index) { ++ root->min = index; ++ root->enter_node = node; ++ } ++ } ++ ++ if (start != end) { ++ do { ++ node = start; ++ start = start->parent; ++ if (unlikely(root->enter_node == node)) ++ root->enter_node = end; ++ root->free(node); ++ } while (start != end); ++ ++ /* ++ * Note that shrink may free "end", so enter_node still need to ++ * be checked inside. ++ */ ++ sradix_tree_shrink(root); ++ } else if (node->count == root->stores_size - 1) { ++ /* It WAS a full leaf node. Update the ancestors */ ++ node = node->parent; ++ while (node) { ++ node->fulls--; ++ if (node->fulls != root->stores_size - 1) ++ break; ++ ++ node = node->parent; ++ } ++ } ++} ++ ++void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index) ++{ ++ unsigned int height, offset; ++ struct sradix_tree_node *node; ++ int shift; ++ ++ node = root->rnode; ++ if (node == NULL || (index >> (root->shift * root->height))) ++ return NULL; ++ ++ height = root->height; ++ shift = (height - 1) * root->shift; ++ ++ do { ++ offset = (index >> shift) & root->mask; ++ node = node->stores[offset]; ++ if (!node) ++ return NULL; ++ ++ shift -= root->shift; ++ } while (shift >= 0); ++ ++ return node; ++} ++ ++/* ++ * Return the item if it exists, otherwise create it in place ++ * and return the created item. 
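A hypothetical call site for the find-or-create helper that follows (names here are assumed for illustration only):

	static void *my_item_alloc(void)
	{
		return kzalloc(sizeof(struct my_item), GFP_KERNEL);
	}

	struct my_item *it;

	it = sradix_tree_lookup_create(&my_root, idx, my_item_alloc);
	if (!it)
		return NULL;	/* out of memory */

Note that, as the comment inside the function body says, root->assign is deliberately not invoked for a freshly created item; the caller initializes it itself.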
++ */ ++void *sradix_tree_lookup_create(struct sradix_tree_root *root, ++ unsigned long index, void *(*item_alloc)(void)) ++{ ++ unsigned int height, offset; ++ struct sradix_tree_node *node, *tmp; ++ void *item; ++ int shift, error; ++ ++ if (root->rnode == NULL || (index >> (root->shift * root->height))) { ++ if (item_alloc) { ++ error = sradix_tree_extend(root, index); ++ if (error) ++ return NULL; ++ } else { ++ return NULL; ++ } ++ } ++ ++ node = root->rnode; ++ height = root->height; ++ shift = (height - 1) * root->shift; ++ ++ do { ++ offset = (index >> shift) & root->mask; ++ if (!node->stores[offset]) { ++ if (!(tmp = root->alloc())) ++ return NULL; ++ ++ tmp->height = shift / root->shift; ++ node->stores[offset] = tmp; ++ tmp->parent = node; ++ node->count++; ++ node = tmp; ++ } else { ++ node = node->stores[offset]; ++ } ++ ++ shift -= root->shift; ++ } while (shift > 0); ++ ++ BUG_ON(node->height != 1); ++ offset = index & root->mask; ++ if (node->stores[offset]) { ++ return node->stores[offset]; ++ } else if (item_alloc) { ++ if (!(item = item_alloc())) ++ return NULL; ++ ++ node->stores[offset] = item; ++ ++ /* ++ * NOTE: we do NOT call root->assign here, since this item is ++ * newly created by us and has no meaning yet. The caller can ++ * call it itself if necessary. ++ */ ++ ++ node->count++; ++ root->num++; ++ ++ while (sradix_node_full(root, node)) { ++ node = node->parent; ++ if (!node) ++ break; ++ ++ node->fulls++; ++ } ++ ++ if (unlikely(!node)) { ++ /* All nodes are full */ ++ root->min = 1 << (root->height * root->shift); ++ } else { ++ if (root->min == index) { ++ root->min |= (1UL << (node->height - 1)) - 1; ++ root->min++; ++ root->enter_node = node; ++ } ++ } ++ ++ return item; ++ } else { ++ return NULL; ++ } ++ ++} ++ ++int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index) ++{ ++ unsigned int height, offset; ++ struct sradix_tree_node *node; ++ int shift; ++ ++ node = root->rnode; ++ if (node == NULL || (index >> (root->shift * root->height))) ++ return -ENOENT; ++ ++ height = root->height; ++ shift = (height - 1) * root->shift; ++ ++ do { ++ offset = (index >> shift) & root->mask; ++ node = node->stores[offset]; ++ if (!node) ++ return -ENOENT; ++ ++ shift -= root->shift; ++ } while (shift > 0); ++ ++ offset = index & root->mask; ++ if (!node->stores[offset]) ++ return -ENOENT; ++ ++ sradix_tree_delete_from_leaf(root, node, index); ++ ++ return 0; ++} +diff -Nur a/mm/Kconfig b/mm/Kconfig +--- a/mm/Kconfig 2019-02-09 17:20:30.491821512 +0000 ++++ b/mm/Kconfig 2019-02-09 17:23:06.736864024 +0000 +@@ -307,6 +307,32 @@ + See Documentation/vm/ksm.rst for more information: KSM is inactive + until a program has madvised that an area is MADV_MERGEABLE, and + root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). ++choice ++ prompt "Choose UKSM/KSM strategy" ++ default UKSM ++ depends on KSM ++ help ++ This option allows you to select a UKSM/KSM strategy. ++ ++config UKSM ++ bool "Ultra-KSM for page merging" ++ depends on KSM ++ help ++ UKSM is inspired by the Linux kernel's KSM (Kernel Samepage ++ Merging), but with a fundamentally rewritten core algorithm. With ++ an advanced algorithm, UKSM can now transparently scan all anonymously ++ mapped user space applications with significantly improved scan speed ++ and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from ++ UKSM. UKSM now has its first stable release and its first real-world enterprise user. 
++ For more information, please goto its project page. ++ (www.kerneldedup.org) ++ ++config KSM_LEGACY ++ bool "Legacy KSM implementation" ++ depends on KSM ++ help ++ The legacy KSM implementation from Red Hat. ++endchoice + + config DEFAULT_MMAP_MIN_ADDR + int "Low address space to protect from user allocation" +diff -Nur a/mm/ksm.c b/mm/ksm.c +--- a/mm/ksm.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/mm/ksm.c 2019-02-09 17:23:06.736864024 +0000 +@@ -842,17 +842,6 @@ + return err; + } + +-static inline struct stable_node *page_stable_node(struct page *page) +-{ +- return PageKsm(page) ? page_rmapping(page) : NULL; +-} +- +-static inline void set_page_stable_node(struct page *page, +- struct stable_node *stable_node) +-{ +- page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); +-} +- + #ifdef CONFIG_SYSFS + /* + * Only called through the sysfs control interface: +diff -Nur a/mm/Makefile b/mm/Makefile +--- a/mm/Makefile 2019-02-06 16:30:16.000000000 +0000 ++++ b/mm/Makefile 2019-02-09 17:23:06.736864024 +0000 +@@ -64,7 +64,8 @@ + obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o + obj-$(CONFIG_SLOB) += slob.o + obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o +-obj-$(CONFIG_KSM) += ksm.o ++obj-$(CONFIG_KSM_LEGACY) += ksm.o ++obj-$(CONFIG_UKSM) += uksm.o + obj-$(CONFIG_PAGE_POISONING) += page_poison.o + obj-$(CONFIG_SLAB) += slab.o + obj-$(CONFIG_SLUB) += slub.o +diff -Nur a/mm/memory.c b/mm/memory.c +--- a/mm/memory.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/mm/memory.c 2019-02-09 17:23:06.736864024 +0000 +@@ -128,6 +128,25 @@ + + unsigned long highest_memmap_pfn __read_mostly; + ++#ifdef CONFIG_UKSM ++unsigned long uksm_zero_pfn __read_mostly; ++EXPORT_SYMBOL_GPL(uksm_zero_pfn); ++struct page *empty_uksm_zero_page; ++ ++static int __init setup_uksm_zero_page(void) ++{ ++ empty_uksm_zero_page = alloc_pages(__GFP_ZERO & ~__GFP_MOVABLE, 0); ++ if (!empty_uksm_zero_page) ++ panic("Oh boy, that early out of memory?"); ++ ++ SetPageReserved(empty_uksm_zero_page); ++ uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page); ++ ++ return 0; ++} ++core_initcall(setup_uksm_zero_page); ++#endif ++ + /* + * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() + */ +@@ -139,6 +158,7 @@ + core_initcall(init_zero_pfn); + + ++ + #if defined(SPLIT_RSS_COUNTING) + + void sync_mm_rss(struct mm_struct *mm) +@@ -1040,6 +1060,9 @@ + get_page(page); + page_dup_rmap(page, false); + rss[mm_counter(page)]++; ++ ++ /* Should return NULL in vm_normal_page() */ ++ uksm_bugon_zeropage(pte); + } else if (pte_devmap(pte)) { + page = pte_page(pte); + +@@ -1053,6 +1076,8 @@ + page_dup_rmap(page, false); + rss[mm_counter(page)]++; + } ++ } else { ++ uksm_map_zero_page(pte); + } + + out_set_pte: +@@ -1322,8 +1347,10 @@ + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + tlb_remove_tlb_entry(tlb, pte, addr); +- if (unlikely(!page)) ++ if (unlikely(!page)) { ++ uksm_unmap_zero_page(ptent); + continue; ++ } + + if (!PageAnon(page)) { + if (pte_dirty(ptent)) { +@@ -2353,8 +2380,10 @@ + clear_page(kaddr); + kunmap_atomic(kaddr); + flush_dcache_page(dst); +- } else ++ } else { + copy_user_highpage(dst, src, va, vma); ++ uksm_cow_page(vma, src); ++ } + } + + static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) +@@ -2503,6 +2532,7 @@ + vmf->address); + if (!new_page) + goto oom; ++ uksm_cow_pte(vma, vmf->orig_pte); + } else { + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, + vmf->address); +@@ -2529,7 +2559,9 @@ + mm_counter_file(old_page)); + inc_mm_counter_fast(mm, 
MM_ANONPAGES); + } ++ uksm_bugon_zeropage(vmf->orig_pte); + } else { ++ uksm_unmap_zero_page(vmf->orig_pte); + inc_mm_counter_fast(mm, MM_ANONPAGES); + } + flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); +diff -Nur a/mm/mmap.c b/mm/mmap.c +--- a/mm/mmap.c 2019-02-09 17:20:30.491821512 +0000 ++++ b/mm/mmap.c 2019-02-09 17:23:06.736864024 +0000 +@@ -45,6 +45,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -182,6 +183,7 @@ + if (vma->vm_file) + fput(vma->vm_file); + mpol_put(vma_policy(vma)); ++ uksm_remove_vma(vma); + vm_area_free(vma); + return next; + } +@@ -708,9 +710,16 @@ + long adjust_next = 0; + int remove_next = 0; + ++/* ++ * to avoid deadlock, ksm_remove_vma must be done before any spin_lock is ++ * acquired ++ */ ++ uksm_remove_vma(vma); ++ + if (next && !insert) { + struct vm_area_struct *exporter = NULL, *importer = NULL; + ++ uksm_remove_vma(next); + if (end >= next->vm_end) { + /* + * vma expands, overlapping all the next, and +@@ -843,6 +852,7 @@ + end_changed = true; + } + vma->vm_pgoff = pgoff; ++ + if (adjust_next) { + next->vm_start += adjust_next << PAGE_SHIFT; + next->vm_pgoff += adjust_next; +@@ -948,6 +958,7 @@ + if (remove_next == 2) { + remove_next = 1; + end = next->vm_end; ++ uksm_remove_vma(next); + goto again; + } + else if (next) +@@ -974,10 +985,14 @@ + */ + VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma)); + } ++ } else { ++ if (next && !insert) ++ uksm_vma_add_new(next); + } + if (insert && file) + uprobe_mmap(insert); + ++ uksm_vma_add_new(vma); + validate_mm(mm); + + return 0; +@@ -1434,6 +1449,9 @@ + vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | + mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + ++ /* If uksm is enabled, we add VM_MERGEABLE to new VMAs. */ ++ uksm_vm_flags_mod(&vm_flags); ++ + if (flags & MAP_LOCKED) + if (!can_do_mlock()) + return -EPERM; +@@ -1798,6 +1816,7 @@ + allow_write_access(file); + } + file = vma->vm_file; ++ uksm_vma_add_new(vma); + out: + perf_event_mmap(vma); + +@@ -1840,6 +1859,7 @@ + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); + free_vma: ++ uksm_remove_vma(vma); + vm_area_free(vma); + unacct_error: + if (charged) +@@ -2659,6 +2679,8 @@ + else + err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + ++ uksm_vma_add_new(new); ++ + /* Success. */ + if (!err) + return 0; +@@ -2944,6 +2966,7 @@ + if ((flags & (~VM_EXEC)) != 0) + return -EINVAL; + flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; ++ uksm_vm_flags_mod(&flags); + + error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + if (offset_in_page(error)) +@@ -3000,6 +3023,7 @@ + vma->vm_flags = flags; + vma->vm_page_prot = vm_get_page_prot(flags); + vma_link(mm, vma, prev, rb_link, rb_parent); ++ uksm_vma_add_new(vma); + out: + perf_event_mmap(vma); + mm->total_vm += len >> PAGE_SHIFT; +@@ -3077,6 +3101,12 @@ + up_write(&mm->mmap_sem); + } + ++ /* ++ * Taking write lock on mmap_sem does not harm others, ++ * but it's crucial for uksm to avoid races. 
++ */ ++ down_write(&mm->mmap_sem); ++ + if (mm->locked_vm) { + vma = mm->mmap; + while (vma) { +@@ -3111,6 +3141,11 @@ + vma = remove_vma(vma); + } + vm_unacct_memory(nr_accounted); ++ ++ mm->mmap = NULL; ++ mm->mm_rb = RB_ROOT; ++ vmacache_invalidate(mm); ++ up_write(&mm->mmap_sem); + } + + /* Insert vm structure into process list sorted by address +@@ -3218,6 +3253,7 @@ + new_vma->vm_ops->open(new_vma); + vma_link(mm, new_vma, prev, rb_link, rb_parent); + *need_rmap_locks = false; ++ uksm_vma_add_new(new_vma); + } + return new_vma; + +@@ -3368,6 +3404,7 @@ + vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); + + perf_event_mmap(vma); ++ uksm_vma_add_new(vma); + + return vma; + +diff -Nur a/mm/rmap.c b/mm/rmap.c +--- a/mm/rmap.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/mm/rmap.c 2019-02-09 17:23:06.736864024 +0000 +@@ -1017,9 +1017,9 @@ + + /** + * __page_set_anon_rmap - set up new anonymous rmap +- * @page: Page to add to rmap ++ * @page: Page to add to rmap + * @vma: VM area to add page to. +- * @address: User virtual address of the mapping ++ * @address: User virtual address of the mapping + * @exclusive: the page is exclusively owned by the current process + */ + static void __page_set_anon_rmap(struct page *page, +diff -Nur a/mm/uksm.c b/mm/uksm.c +--- a/mm/uksm.c 1970-01-01 01:00:00.000000000 +0100 ++++ b/mm/uksm.c 2019-02-09 17:23:06.736864024 +0000 +@@ -0,0 +1,5584 @@ ++/* ++ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia ++ * ++ * This is an improvement upon KSM. Some basic data structures and routines ++ * are borrowed from ksm.c. ++ * ++ * Its new features: ++ * 1. Full system scan: ++ * It automatically scans all user processes' anonymous VMAs. Kernel-user ++ * interaction to submit a memory area to KSM is no longer needed. ++ * ++ * 2. Rich area detection: ++ * It automatically detects rich areas containing abundant duplicated ++ * pages. Rich areas are given a full scan speed. Poor areas are ++ * sampled at a reasonable speed with very low CPU consumption. ++ * ++ * 3. Ultra per-page scan speed improvement: ++ * A new hash algorithm is proposed. As a result, on a machine with a ++ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it ++ * can scan memory areas that do not contain duplicated pages at a speed of ++ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of ++ * 477MB/sec ~ 923MB/sec. ++ * ++ * 4. Thrashing area avoidance: ++ * A thrashing area (a VMA that has frequent KSM page break-outs) can be ++ * filtered out. My benchmark shows it's more efficient than KSM's per-page ++ * hash-value-based volatile page detection. ++ * ++ * ++ * 5. Misc changes upon KSM: ++ * * It has a fully x86-optimized memcmp dedicated for 4-byte-aligned page ++ * comparison. It's much faster than the default C version on x86. ++ * * rmap_item now has a struct *page member to loosely cache an ++ * address-->page mapping, which avoids many costly ++ * follow_page() calls. ++ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. ++ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_ ++ * ksm is needed for this case. ++ * ++ * 6. Full Zero Page consideration (contributed by Figo Zhang) ++ * Now uksmd considers full zero pages as special pages and merges them ++ * into a special unswappable uksm zero page. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include "internal.h" ++ ++#ifdef CONFIG_X86 ++#undef memcmp ++ ++#ifdef CONFIG_X86_32 ++#define memcmp memcmpx86_32 ++/* ++ * Compare 4-byte-aligned address s1 and s2, with length n ++ */ ++int memcmpx86_32(void *s1, void *s2, size_t n) ++{ ++ size_t num = n / 4; ++ register int res; ++ ++ __asm__ __volatile__ ++ ( ++ "testl %3,%3\n\t" ++ "repe; cmpsd\n\t" ++ "je 1f\n\t" ++ "sbbl %0,%0\n\t" ++ "orl $1,%0\n" ++ "1:" ++ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) ++ : "0" (0) ++ : "cc"); ++ ++ return res; ++} ++ ++/* ++ * Check the page is all zero ? ++ */ ++static int is_full_zero(const void *s1, size_t len) ++{ ++ unsigned char same; ++ ++ len /= 4; ++ ++ __asm__ __volatile__ ++ ("repe; scasl;" ++ "sete %0" ++ : "=qm" (same), "+D" (s1), "+c" (len) ++ : "a" (0) ++ : "cc"); ++ ++ return same; ++} ++ ++ ++#elif defined(CONFIG_X86_64) ++#define memcmp memcmpx86_64 ++/* ++ * Compare 8-byte-aligned address s1 and s2, with length n ++ */ ++int memcmpx86_64(void *s1, void *s2, size_t n) ++{ ++ size_t num = n / 8; ++ register int res; ++ ++ __asm__ __volatile__ ++ ( ++ "testq %q3,%q3\n\t" ++ "repe; cmpsq\n\t" ++ "je 1f\n\t" ++ "sbbq %q0,%q0\n\t" ++ "orq $1,%q0\n" ++ "1:" ++ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) ++ : "0" (0) ++ : "cc"); ++ ++ return res; ++} ++ ++static int is_full_zero(const void *s1, size_t len) ++{ ++ unsigned char same; ++ ++ len /= 8; ++ ++ __asm__ __volatile__ ++ ("repe; scasq;" ++ "sete %0" ++ : "=qm" (same), "+D" (s1), "+c" (len) ++ : "a" (0) ++ : "cc"); ++ ++ return same; ++} ++ ++#endif ++#else ++static int is_full_zero(const void *s1, size_t len) ++{ ++ unsigned long *src = s1; ++ int i; ++ ++ len /= sizeof(*src); ++ ++ for (i = 0; i < len; i++) { ++ if (src[i]) ++ return 0; ++ } ++ ++ return 1; ++} ++#endif ++ ++#define UKSM_RUNG_ROUND_FINISHED (1 << 0) ++#define TIME_RATIO_SCALE 10000 ++ ++#define SLOT_TREE_NODE_SHIFT 8 ++#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT) ++struct slot_tree_node { ++ unsigned long size; ++ struct sradix_tree_node snode; ++ void *stores[SLOT_TREE_NODE_STORE_SIZE]; ++}; ++ ++static struct kmem_cache *slot_tree_node_cachep; ++ ++static struct sradix_tree_node *slot_tree_node_alloc(void) ++{ ++ struct slot_tree_node *p; ++ ++ p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (!p) ++ return NULL; ++ ++ return &p->snode; ++} ++ ++static void slot_tree_node_free(struct sradix_tree_node *node) ++{ ++ struct slot_tree_node *p; ++ ++ p = container_of(node, struct slot_tree_node, snode); ++ kmem_cache_free(slot_tree_node_cachep, p); ++} ++ ++static void slot_tree_node_extend(struct sradix_tree_node *parent, ++ struct sradix_tree_node *child) ++{ ++ struct slot_tree_node *p, *c; ++ ++ p = container_of(parent, struct slot_tree_node, snode); ++ c = container_of(child, struct slot_tree_node, snode); ++ ++ p->size += c->size; ++} ++ ++void slot_tree_node_assign(struct sradix_tree_node *node, ++ unsigned int index, void *item) ++{ ++ struct vma_slot *slot = item; ++ struct slot_tree_node *cur; ++ ++ slot->snode = node; ++ slot->sindex = index; ++ ++ while (node) { ++ cur = container_of(node, struct 
slot_tree_node, snode); ++ cur->size += slot->pages; ++ node = node->parent; ++ } ++} ++ ++void slot_tree_node_rm(struct sradix_tree_node *node, unsigned int offset) ++{ ++ struct vma_slot *slot; ++ struct slot_tree_node *cur; ++ unsigned long pages; ++ ++ if (node->height == 1) { ++ slot = node->stores[offset]; ++ pages = slot->pages; ++ } else { ++ cur = container_of(node->stores[offset], ++ struct slot_tree_node, snode); ++ pages = cur->size; ++ } ++ ++ while (node) { ++ cur = container_of(node, struct slot_tree_node, snode); ++ cur->size -= pages; ++ node = node->parent; ++ } ++} ++ ++unsigned long slot_iter_index; ++int slot_iter(void *item, unsigned long height) ++{ ++ struct slot_tree_node *node; ++ struct vma_slot *slot; ++ ++ if (height == 1) { ++ slot = item; ++ if (slot_iter_index < slot->pages) { ++ /*in this one*/ ++ return 1; ++ } else { ++ slot_iter_index -= slot->pages; ++ return 0; ++ } ++ ++ } else { ++ node = container_of(item, struct slot_tree_node, snode); ++ if (slot_iter_index < node->size) { ++ /*in this one*/ ++ return 1; ++ } else { ++ slot_iter_index -= node->size; ++ return 0; ++ } ++ } ++} ++ ++ ++static inline void slot_tree_init_root(struct sradix_tree_root *root) ++{ ++ init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT); ++ root->alloc = slot_tree_node_alloc; ++ root->free = slot_tree_node_free; ++ root->extend = slot_tree_node_extend; ++ root->assign = slot_tree_node_assign; ++ root->rm = slot_tree_node_rm; ++} ++ ++void slot_tree_init(void) ++{ ++ slot_tree_node_cachep = kmem_cache_create("slot_tree_node", ++ sizeof(struct slot_tree_node), 0, ++ SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, ++ NULL); ++} ++ ++ ++/* Each rung of this ladder is a list of VMAs having a same scan ratio */ ++struct scan_rung { ++ //struct list_head scanned_list; ++ struct sradix_tree_root vma_root; ++ struct sradix_tree_root vma_root2; ++ ++ struct vma_slot *current_scan; ++ unsigned long current_offset; ++ ++ /* ++ * The initial value for current_offset, it should loop over ++ * [0~ step - 1] to let all slot have its chance to be scanned. ++ */ ++ unsigned long offset_init; ++ unsigned long step; /* dynamic step for current_offset */ ++ unsigned int flags; ++ unsigned long pages_to_scan; ++ //unsigned long fully_scanned_slots; ++ /* ++ * a little bit tricky - if cpu_time_ratio > 0, then the value is the ++ * the cpu time ratio it can spend in rung_i for every scan ++ * period. if < 0, then it is the cpu time ratio relative to the ++ * max cpu percentage user specified. Both in unit of ++ * 1/TIME_RATIO_SCALE ++ */ ++ int cpu_ratio; ++ ++ /* ++ * How long it will take for all slots in this rung to be fully ++ * scanned? If it's zero, we don't care about the cover time: ++ * it's fully scanned. 
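Reading the cpu_ratio convention above with concrete numbers (taken from the uksm_cpu_preset table defined later in this file, with TIME_RATIO_SCALE = 10000): in the "full" preset the two busiest rungs use cpu_ratio = -2500 and -10000 with max_cpu = 95, i.e. 2500/10000 = 25% and 10000/10000 = 100% of the 95% cap (roughly 23.75% and 95% of one CPU), while the two slowest rungs use positive ratios of 20 and 40, i.e. fixed absolute shares of 20/10000 = 0.2% and 0.4% of CPU time per scan period.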
++ */ ++ unsigned int cover_msecs; ++ //unsigned long vma_num; ++ //unsigned long pages; /* Sum of all slot's pages in rung */ ++}; ++ ++/** ++ * node of either the stable or unstable rbtree ++ * ++ */ ++struct tree_node { ++ struct rb_node node; /* link in the main (un)stable rbtree */ ++ struct rb_root sub_root; /* rb_root for sublevel collision rbtree */ ++ u32 hash; ++ unsigned long count; /* TODO: merged with sub_root */ ++ struct list_head all_list; /* all tree nodes in stable/unstable tree */ ++}; ++ ++/** ++ * struct stable_node - node of the stable rbtree ++ * @node: rb node of this ksm page in the stable tree ++ * @hlist: hlist head of rmap_items using this ksm page ++ * @kpfn: page frame number of this ksm page ++ */ ++struct stable_node { ++ struct rb_node node; /* link in sub-rbtree */ ++ struct tree_node *tree_node; /* its tree node root in the stable tree, NULL if it's in the hell list */ ++ struct hlist_head hlist; ++ unsigned long kpfn; ++ u32 hash_max; /* if ==0 then it's not been calculated yet */ ++ struct list_head all_list; /* in a list for all stable nodes */ ++}; ++ ++/** ++ * struct node_vma - group rmap_items linked in the same stable ++ * node together. ++ */ ++struct node_vma { ++ union { ++ struct vma_slot *slot; ++ unsigned long key; /* slot is used as key sorted on hlist */ ++ }; ++ struct hlist_node hlist; ++ struct hlist_head rmap_hlist; ++ struct stable_node *head; ++}; ++ ++/** ++ * struct rmap_item - reverse mapping item for virtual addresses ++ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list ++ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree ++ * @mm: the memory structure this rmap_item is pointing into ++ * @address: the virtual address this rmap_item tracks (+ flags in low bits) ++ * @node: rb node of this rmap_item in the unstable tree ++ * @head: pointer to stable_node heading this list in the stable tree ++ * @hlist: link into hlist of rmap_items hanging off that stable_node ++ */ ++struct rmap_item { ++ struct vma_slot *slot; ++ struct page *page; ++ unsigned long address; /* + low bits used for flags below */ ++ unsigned long hash_round; ++ unsigned long entry_index; ++ union { ++ struct {/* when in unstable tree */ ++ struct rb_node node; ++ struct tree_node *tree_node; ++ u32 hash_max; ++ }; ++ struct { /* when in stable tree */ ++ struct node_vma *head; ++ struct hlist_node hlist; ++ struct anon_vma *anon_vma; ++ }; ++ }; ++} __aligned(4); ++ ++struct rmap_list_entry { ++ union { ++ struct rmap_item *item; ++ unsigned long addr; ++ }; ++ /* lowest bit is used for is_addr tag */ ++} __aligned(4); /* 4-byte aligned to fit into pages */ ++ ++ ++/* Basic data structure definition ends */ ++ ++ ++/* ++ * Flags for rmap_item to judge if it's listed in the stable/unstable tree. 
++ * The flags use the low bits of rmap_item.address ++ */ ++#define UNSTABLE_FLAG 0x1 ++#define STABLE_FLAG 0x2 ++#define get_rmap_addr(x) ((x)->address & PAGE_MASK) ++ ++/* ++ * rmap_list_entry helpers ++ */ ++#define IS_ADDR_FLAG 1 ++#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG) ++#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG) ++#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG)) ++ ++ ++/* ++ * High speed caches for frequently allocated and freed structs ++ */ ++static struct kmem_cache *rmap_item_cache; ++static struct kmem_cache *stable_node_cache; ++static struct kmem_cache *node_vma_cache; ++static struct kmem_cache *vma_slot_cache; ++static struct kmem_cache *tree_node_cache; ++#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\ ++ sizeof(struct __struct), __alignof__(struct __struct),\ ++ (__flags), NULL) ++ ++/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */ ++#define SCAN_LADDER_SIZE 4 ++static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE]; ++ ++/* The evaluation rounds uksmd has finished */ ++static unsigned long long uksm_eval_round = 1; ++ ++/* ++ * we add 1 to this var when we consider we should rebuild the whole ++ * unstable tree. ++ */ ++static unsigned long uksm_hash_round = 1; ++ ++/* ++ * How many times the whole memory is scanned. ++ */ ++static unsigned long long fully_scanned_round = 1; ++ ++/* The total number of virtual pages of all vma slots */ ++static u64 uksm_pages_total; ++ ++/* The number of pages that have been scanned since startup */ ++static u64 uksm_pages_scanned; ++ ++static u64 scanned_virtual_pages; ++ ++/* The number of pages that have been scanned since the last encode_benefit call */ ++static u64 uksm_pages_scanned_last; ++ ++/* If the scanned number is too large, we encode it here */ ++static u64 pages_scanned_stored; ++ ++static unsigned long pages_scanned_base; ++ ++/* The number of nodes in the stable tree */ ++static unsigned long uksm_pages_shared; ++ ++/* The number of page slots additionally sharing those nodes */ ++static unsigned long uksm_pages_sharing; ++ ++/* The number of nodes in the unstable tree */ ++static unsigned long uksm_pages_unshared; ++ ++/* ++ * Milliseconds ksmd should sleep between scans, ++ * >= 100ms to be consistent with ++ * scan_time_to_sleep_msec() ++ */ ++static unsigned int uksm_sleep_jiffies; ++ ++/* The real value for the uksmd next sleep */ ++static unsigned int uksm_sleep_real; ++ ++/* Saved value for user input uksm_sleep_jiffies when it's enlarged */ ++static unsigned int uksm_sleep_saved; ++ ++/* Max percentage of cpu utilization ksmd can take to scan in one batch */ ++static unsigned int uksm_max_cpu_percentage; ++ ++static int uksm_cpu_governor; ++ ++static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" }; ++ ++struct uksm_cpu_preset_s { ++ int cpu_ratio[SCAN_LADDER_SIZE]; ++ unsigned int cover_msecs[SCAN_LADDER_SIZE]; ++ unsigned int max_cpu; /* percentage */ ++}; ++ ++struct uksm_cpu_preset_s uksm_cpu_preset[4] = { ++ { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95}, ++ { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50}, ++ { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20}, ++ { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1}, ++}; ++ ++/* The default value for uksm_ema_page_time if it's not initialized */ ++#define UKSM_PAGE_TIME_DEFAULT 500 ++ ++/* Cost to scan one page, as an exponential moving average, in nsecs */ ++static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; ++ 
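The page-scan cost above is smoothed with a standard exponential moving average. With EMA_ALPHA = 20 (a percentage, defined just below), one update step has the following shape (a sketch of the standard formula; the helper name is illustrative):

	static unsigned long ema_update(unsigned long sample_ns, unsigned long avg_ns)
	{
		/* new = 20% of the fresh sample + 80% of the running average */
		return (EMA_ALPHA * sample_ns + (100 - EMA_ALPHA) * avg_ns) / 100;
	}

Starting from UKSM_PAGE_TIME_DEFAULT = 500, a single 1000 ns sample moves the average to (20 * 1000 + 80 * 500) / 100 = 600 ns.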
++/* The exponential moving average alpha weight, in percentage. */
++#define EMA_ALPHA 20
++
++/*
++ * The threshold used to filter out thrashing areas.
++ * If it is 0, filtering is disabled; otherwise it is the percentage
++ * upper bound of the thrashing ratio of all areas. Any area with a
++ * bigger thrashing ratio will be considered as having a zero
++ * duplication ratio.
++ */
++static unsigned int uksm_thrash_threshold = 50;
++
++/* The dedup ratio at or above which an area is considered abundant */
++static unsigned int uksm_abundant_threshold = 10;
++
++/* All slots having merged pages in this eval round. */
++struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup);
++
++/* How many times ksmd has slept since startup */
++static unsigned long long uksm_sleep_times;
++
++#define UKSM_RUN_STOP 0
++#define UKSM_RUN_MERGE 1
++static unsigned int uksm_run = 1;
++
++static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
++static DEFINE_MUTEX(uksm_thread_mutex);
++
++/*
++ * List vma_slot_new is for newly created vma_slots waiting to be added by
++ * ksmd. If one cannot be added (e.g. because it is too small), it is moved
++ * to vma_slot_noadd. vma_slot_del is the list for vma_slots whose
++ * corresponding VMA has been removed/freed.
++ */
++struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
++struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
++struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
++static DEFINE_SPINLOCK(vma_slot_list_lock);
++
++/* The unstable tree heads */
++static struct rb_root root_unstable_tree = RB_ROOT;
++
++/*
++ * All tree_nodes are on a list so they can be freed at once when the
++ * unstable tree is freed after each scan round.
++ */
++static struct list_head unstable_tree_node_list =
++ LIST_HEAD_INIT(unstable_tree_node_list);
++
++/* List containing all stable nodes */
++static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
++
++/*
++ * When the hash strength is changed, the stable tree must be delta_hashed and
++ * re-structured. We use two sets of the structs below to speed up the
++ * re-structuring of the stable tree.
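++ * Only one set is live at a time: root_stable_treep and
++ * stable_tree_node_listp below point at it, so a re-structure can build
++ * into the other set and then switch the pointers; stable_tree_index
++ * appears to track which of the two is in use.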
++ */ ++static struct list_head ++stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]), ++ LIST_HEAD_INIT(stable_tree_node_list[1])}; ++ ++static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0]; ++static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT}; ++static struct rb_root *root_stable_treep = &root_stable_tree[0]; ++static unsigned long stable_tree_index; ++ ++/* The hash strength needed to hash a full page */ ++#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32)) ++ ++/* The hash strength needed for loop-back hashing */ ++#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10) ++ ++/* The random offsets in a page */ ++static u32 *random_nums; ++ ++/* The hash strength */ ++static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4; ++ ++/* The delta value each time the hash strength increases or decreases */ ++static unsigned long hash_strength_delta; ++#define HASH_STRENGTH_DELTA_MAX 5 ++ ++/* The time we have saved due to random_sample_hash */ ++static u64 rshash_pos; ++ ++/* The time we have wasted due to hash collision */ ++static u64 rshash_neg; ++ ++struct uksm_benefit { ++ u64 pos; ++ u64 neg; ++ u64 scanned; ++ unsigned long base; ++} benefit; ++ ++/* ++ * The relative cost of memcmp, compared to 1 time unit of random sample ++ * hash, this value is tested when ksm module is initialized ++ */ ++static unsigned long memcmp_cost; ++ ++static unsigned long rshash_neg_cont_zero; ++static unsigned long rshash_cont_obscure; ++ ++/* The possible states of hash strength adjustment heuristic */ ++enum rshash_states { ++ RSHASH_STILL, ++ RSHASH_TRYUP, ++ RSHASH_TRYDOWN, ++ RSHASH_NEW, ++ RSHASH_PRE_STILL, ++}; ++ ++/* The possible direction we are about to adjust hash strength */ ++enum rshash_direct { ++ GO_UP, ++ GO_DOWN, ++ OBSCURE, ++ STILL, ++}; ++ ++/* random sampling hash state machine */ ++static struct { ++ enum rshash_states state; ++ enum rshash_direct pre_direct; ++ u8 below_count; ++ /* Keep a lookup window of size 5, iff above_count/below_count > 3 ++ * in this window we stop trying. ++ */ ++ u8 lookup_window_index; ++ u64 stable_benefit; ++ unsigned long turn_point_down; ++ unsigned long turn_benefit_down; ++ unsigned long turn_point_up; ++ unsigned long turn_benefit_up; ++ unsigned long stable_point; ++} rshash_state; ++ ++/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/ ++static u32 *zero_hash_table; ++ ++static inline struct node_vma *alloc_node_vma(void) ++{ ++ struct node_vma *node_vma; ++ ++ node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (node_vma) { ++ INIT_HLIST_HEAD(&node_vma->rmap_hlist); ++ INIT_HLIST_NODE(&node_vma->hlist); ++ } ++ return node_vma; ++} ++ ++static inline void free_node_vma(struct node_vma *node_vma) ++{ ++ kmem_cache_free(node_vma_cache, node_vma); ++} ++ ++ ++static inline struct vma_slot *alloc_vma_slot(void) ++{ ++ struct vma_slot *slot; ++ ++ /* ++ * In case ksm is not initialized by now. ++ * Oops, we need to consider the call site of uksm_init() in the future. 
++ */ ++ if (!vma_slot_cache) ++ return NULL; ++ ++ slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (slot) { ++ INIT_LIST_HEAD(&slot->slot_list); ++ INIT_LIST_HEAD(&slot->dedup_list); ++ slot->flags |= UKSM_SLOT_NEED_RERAND; ++ } ++ return slot; ++} ++ ++static inline void free_vma_slot(struct vma_slot *vma_slot) ++{ ++ kmem_cache_free(vma_slot_cache, vma_slot); ++} ++ ++ ++ ++static inline struct rmap_item *alloc_rmap_item(void) ++{ ++ struct rmap_item *rmap_item; ++ ++ rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (rmap_item) { ++ /* bug on lowest bit is not clear for flag use */ ++ BUG_ON(is_addr(rmap_item)); ++ } ++ return rmap_item; ++} ++ ++static inline void free_rmap_item(struct rmap_item *rmap_item) ++{ ++ rmap_item->slot = NULL; /* debug safety */ ++ kmem_cache_free(rmap_item_cache, rmap_item); ++} ++ ++static inline struct stable_node *alloc_stable_node(void) ++{ ++ struct stable_node *node; ++ ++ node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (!node) ++ return NULL; ++ ++ INIT_HLIST_HEAD(&node->hlist); ++ list_add(&node->all_list, &stable_node_list); ++ return node; ++} ++ ++static inline void free_stable_node(struct stable_node *stable_node) ++{ ++ list_del(&stable_node->all_list); ++ kmem_cache_free(stable_node_cache, stable_node); ++} ++ ++static inline struct tree_node *alloc_tree_node(struct list_head *list) ++{ ++ struct tree_node *node; ++ ++ node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (!node) ++ return NULL; ++ ++ list_add(&node->all_list, list); ++ return node; ++} ++ ++static inline void free_tree_node(struct tree_node *node) ++{ ++ list_del(&node->all_list); ++ kmem_cache_free(tree_node_cache, node); ++} ++ ++static void uksm_drop_anon_vma(struct rmap_item *rmap_item) ++{ ++ struct anon_vma *anon_vma = rmap_item->anon_vma; ++ ++ put_anon_vma(anon_vma); ++} ++ ++ ++/** ++ * Remove a stable node from stable_tree, may unlink from its tree_node and ++ * may remove its parent tree_node if no other stable node is pending. ++ * ++ * @stable_node The node need to be removed ++ * @unlink_rb Will this node be unlinked from the rbtree? ++ * @remove_tree_ node Will its tree_node be removed if empty? 
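++ *
++ * Accounting note for the code below: the loop decrements
++ * uksm_pages_sharing once per rmap_item, then the final shared/sharing
++ * adjustment re-classifies one mapping, since one mapping per stable
++ * node is counted in uksm_pages_shared rather than in uksm_pages_sharing.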
++ */ ++static void remove_node_from_stable_tree(struct stable_node *stable_node, ++ int unlink_rb, int remove_tree_node) ++{ ++ struct node_vma *node_vma; ++ struct rmap_item *rmap_item; ++ struct hlist_node *n; ++ ++ if (!hlist_empty(&stable_node->hlist)) { ++ hlist_for_each_entry_safe(node_vma, n, ++ &stable_node->hlist, hlist) { ++ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { ++ uksm_pages_sharing--; ++ ++ uksm_drop_anon_vma(rmap_item); ++ rmap_item->address &= PAGE_MASK; ++ } ++ free_node_vma(node_vma); ++ cond_resched(); ++ } ++ ++ /* the last one is counted as shared */ ++ uksm_pages_shared--; ++ uksm_pages_sharing++; ++ } ++ ++ if (stable_node->tree_node && unlink_rb) { ++ rb_erase(&stable_node->node, ++ &stable_node->tree_node->sub_root); ++ ++ if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) && ++ remove_tree_node) { ++ rb_erase(&stable_node->tree_node->node, ++ root_stable_treep); ++ free_tree_node(stable_node->tree_node); ++ } else { ++ stable_node->tree_node->count--; ++ } ++ } ++ ++ free_stable_node(stable_node); ++} ++ ++ ++/* ++ * get_uksm_page: checks if the page indicated by the stable node ++ * is still its ksm page, despite having held no reference to it. ++ * In which case we can trust the content of the page, and it ++ * returns the gotten page; but if the page has now been zapped, ++ * remove the stale node from the stable tree and return NULL. ++ * ++ * You would expect the stable_node to hold a reference to the ksm page. ++ * But if it increments the page's count, swapping out has to wait for ++ * ksmd to come around again before it can free the page, which may take ++ * seconds or even minutes: much too unresponsive. So instead we use a ++ * "keyhole reference": access to the ksm page from the stable node peeps ++ * out through its keyhole to see if that page still holds the right key, ++ * pointing back to this stable node. This relies on freeing a PageAnon ++ * page to reset its page->mapping to NULL, and relies on no other use of ++ * a page to put something that might look like our key in page->mapping. ++ * ++ * include/linux/pagemap.h page_cache_get_speculative() is a good reference, ++ * but this is different - made simpler by uksm_thread_mutex being held, but ++ * interesting for assuming that no other use of the struct page could ever ++ * put our expected_mapping into page->mapping (or a field of the union which ++ * coincides with page->mapping). The RCU calls are not for KSM at all, but ++ * to keep the page_count protocol described with page_cache_get_speculative. ++ * ++ * Note: it is possible that get_uksm_page() will return NULL one moment, ++ * then page the next, if the page is in between page_freeze_refs() and ++ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page ++ * is on its way to being freed; but it is an anomaly to bear in mind. ++ * ++ * @unlink_rb: if the removal of this node will firstly unlink from ++ * its rbtree. stable_node_reinsert will prevent this when restructuring the ++ * node from its old tree. ++ * ++ * @remove_tree_node: if this is the last one of its tree_node, will the ++ * tree_node be freed ? If we are inserting stable node, this tree_node may ++ * be reused, so don't free it. 
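++ *
++ * Condensed, the lookup below is (illustrative pseudo-code only):
++ *
++ *    page = pfn_to_page(stable_node->kpfn);
++ *    if page->mapping != stable_node | PAGE_MAPPING_KSM      -> stale
++ *    if !get_page_unless_zero(page) and !PageSwapCache(page) -> stale
++ *    recheck page->mapping with the reference and page lock held
++ *    on stale: retry if stable_node->kpfn moved (migration),
++ *    otherwise remove_node_from_stable_tree().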
++ */ ++static struct page *get_uksm_page(struct stable_node *stable_node, ++ int unlink_rb, int remove_tree_node) ++{ ++ struct page *page; ++ void *expected_mapping; ++ unsigned long kpfn; ++ ++ expected_mapping = (void *)((unsigned long)stable_node | ++ PAGE_MAPPING_KSM); ++again: ++ kpfn = READ_ONCE(stable_node->kpfn); ++ page = pfn_to_page(kpfn); ++ ++ /* ++ * page is computed from kpfn, so on most architectures reading ++ * page->mapping is naturally ordered after reading node->kpfn, ++ * but on Alpha we need to be more careful. ++ */ ++ smp_read_barrier_depends(); ++ ++ if (READ_ONCE(page->mapping) != expected_mapping) ++ goto stale; ++ ++ /* ++ * We cannot do anything with the page while its refcount is 0. ++ * Usually 0 means free, or tail of a higher-order page: in which ++ * case this node is no longer referenced, and should be freed; ++ * however, it might mean that the page is under page_freeze_refs(). ++ * The __remove_mapping() case is easy, again the node is now stale; ++ * but if page is swapcache in migrate_page_move_mapping(), it might ++ * still be our page, in which case it's essential to keep the node. ++ */ ++ while (!get_page_unless_zero(page)) { ++ /* ++ * Another check for page->mapping != expected_mapping would ++ * work here too. We have chosen the !PageSwapCache test to ++ * optimize the common case, when the page is or is about to ++ * be freed: PageSwapCache is cleared (under spin_lock_irq) ++ * in the freeze_refs section of __remove_mapping(); but Anon ++ * page->mapping reset to NULL later, in free_pages_prepare(). ++ */ ++ if (!PageSwapCache(page)) ++ goto stale; ++ cpu_relax(); ++ } ++ ++ if (READ_ONCE(page->mapping) != expected_mapping) { ++ put_page(page); ++ goto stale; ++ } ++ ++ lock_page(page); ++ if (READ_ONCE(page->mapping) != expected_mapping) { ++ unlock_page(page); ++ put_page(page); ++ goto stale; ++ } ++ unlock_page(page); ++ return page; ++stale: ++ /* ++ * We come here from above when page->mapping or !PageSwapCache ++ * suggests that the node is stale; but it might be under migration. ++ * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(), ++ * before checking whether node->kpfn has been changed. ++ */ ++ smp_rmb(); ++ if (stable_node->kpfn != kpfn) ++ goto again; ++ ++ remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node); ++ ++ return NULL; ++} ++ ++/* ++ * Removing rmap_item from stable or unstable tree. ++ * This function will clean the information from the stable/unstable tree. ++ */ ++static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item) ++{ ++ if (rmap_item->address & STABLE_FLAG) { ++ struct stable_node *stable_node; ++ struct node_vma *node_vma; ++ struct page *page; ++ ++ node_vma = rmap_item->head; ++ stable_node = node_vma->head; ++ page = get_uksm_page(stable_node, 1, 1); ++ if (!page) ++ goto out; ++ ++ /* ++ * page lock is needed because it's racing with ++ * try_to_unmap_ksm(), etc. ++ */ ++ lock_page(page); ++ hlist_del(&rmap_item->hlist); ++ ++ if (hlist_empty(&node_vma->rmap_hlist)) { ++ hlist_del(&node_vma->hlist); ++ free_node_vma(node_vma); ++ } ++ unlock_page(page); ++ ++ put_page(page); ++ if (hlist_empty(&stable_node->hlist)) { ++ /* do NOT call remove_node_from_stable_tree() here, ++ * it's possible for a forked rmap_item not in ++ * stable tree while the in-tree rmap_items were ++ * deleted. 
++ */ ++ uksm_pages_shared--; ++ } else ++ uksm_pages_sharing--; ++ ++ ++ uksm_drop_anon_vma(rmap_item); ++ } else if (rmap_item->address & UNSTABLE_FLAG) { ++ if (rmap_item->hash_round == uksm_hash_round) { ++ ++ rb_erase(&rmap_item->node, ++ &rmap_item->tree_node->sub_root); ++ if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) { ++ rb_erase(&rmap_item->tree_node->node, ++ &root_unstable_tree); ++ ++ free_tree_node(rmap_item->tree_node); ++ } else ++ rmap_item->tree_node->count--; ++ } ++ uksm_pages_unshared--; ++ } ++ ++ rmap_item->address &= PAGE_MASK; ++ rmap_item->hash_max = 0; ++ ++out: ++ cond_resched(); /* we're called from many long loops */ ++} ++ ++static inline int slot_in_uksm(struct vma_slot *slot) ++{ ++ return list_empty(&slot->slot_list); ++} ++ ++/* ++ * Test if the mm is exiting ++ */ ++static inline bool uksm_test_exit(struct mm_struct *mm) ++{ ++ return atomic_read(&mm->mm_users) == 0; ++} ++ ++static inline unsigned long vma_pool_size(struct vma_slot *slot) ++{ ++ return round_up(sizeof(struct rmap_list_entry) * slot->pages, ++ PAGE_SIZE) >> PAGE_SHIFT; ++} ++ ++#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta)) ++ ++/* must be done with sem locked */ ++static int slot_pool_alloc(struct vma_slot *slot) ++{ ++ unsigned long pool_size; ++ ++ if (slot->rmap_list_pool) ++ return 0; ++ ++ pool_size = vma_pool_size(slot); ++ slot->rmap_list_pool = kcalloc(pool_size, sizeof(struct page *), ++ GFP_KERNEL); ++ if (!slot->rmap_list_pool) ++ return -ENOMEM; ++ ++ slot->pool_counts = kcalloc(pool_size, sizeof(unsigned int), ++ GFP_KERNEL); ++ if (!slot->pool_counts) { ++ kfree(slot->rmap_list_pool); ++ return -ENOMEM; ++ } ++ ++ slot->pool_size = pool_size; ++ BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages)); ++ slot->flags |= UKSM_SLOT_IN_UKSM; ++ uksm_pages_total += slot->pages; ++ ++ return 0; ++} ++ ++/* ++ * Called after vma is unlinked from its mm ++ */ ++void uksm_remove_vma(struct vm_area_struct *vma) ++{ ++ struct vma_slot *slot; ++ ++ if (!vma->uksm_vma_slot) ++ return; ++ ++ spin_lock(&vma_slot_list_lock); ++ slot = vma->uksm_vma_slot; ++ if (!slot) ++ goto out; ++ ++ if (slot_in_uksm(slot)) { ++ /** ++ * This slot has been added by ksmd, so move to the del list ++ * waiting ksmd to free it. ++ */ ++ list_add_tail(&slot->slot_list, &vma_slot_del); ++ } else { ++ /** ++ * It's still on new list. It's ok to free slot directly. ++ */ ++ list_del(&slot->slot_list); ++ free_vma_slot(slot); ++ } ++out: ++ vma->uksm_vma_slot = NULL; ++ spin_unlock(&vma_slot_list_lock); ++} ++ ++/** ++ * Need to do two things: ++ * 1. check if slot was moved to del list ++ * 2. make sure the mmap_sem is manipulated under valid vma. ++ * ++ * My concern here is that in some cases, this may make ++ * vma_slot_list_lock() waiters to serialized further by some ++ * sem->wait_lock, can this really be expensive? ++ * ++ * ++ * @return ++ * 0: if successfully locked mmap_sem ++ * -ENOENT: this slot was moved to del list ++ * -EBUSY: vma lock failed ++ */ ++static int try_down_read_slot_mmap_sem(struct vma_slot *slot) ++{ ++ struct vm_area_struct *vma; ++ struct mm_struct *mm; ++ struct rw_semaphore *sem; ++ ++ spin_lock(&vma_slot_list_lock); ++ ++ /* the slot_list was removed and inited from new list, when it enters ++ * uksm_list. 
If it is not empty now, then it must have been moved to the del list
++ */
++ if (!slot_in_uksm(slot)) {
++ spin_unlock(&vma_slot_list_lock);
++ return -ENOENT;
++ }
++
++ BUG_ON(slot->pages != vma_pages(slot->vma));
++ /* Ok, vma still valid */
++ vma = slot->vma;
++ mm = vma->vm_mm;
++ sem = &mm->mmap_sem;
++
++ if (uksm_test_exit(mm)) {
++ spin_unlock(&vma_slot_list_lock);
++ return -ENOENT;
++ }
++
++ if (down_read_trylock(sem)) {
++ spin_unlock(&vma_slot_list_lock);
++ if (slot_pool_alloc(slot)) {
++ uksm_remove_vma(vma);
++ up_read(sem);
++ return -ENOENT;
++ }
++ return 0;
++ }
++
++ spin_unlock(&vma_slot_list_lock);
++ return -EBUSY;
++}
++
++static inline unsigned long
++vma_page_address(struct page *page, struct vm_area_struct *vma)
++{
++ pgoff_t pgoff = page->index;
++ unsigned long address;
++
++ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
++ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
++ /* page should be within @vma mapping range */
++ return -EFAULT;
++ }
++ return address;
++}
++
++
++/* return 0 on success with the item's mmap_sem locked */
++static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
++{
++ struct mm_struct *mm;
++ struct vma_slot *slot = item->slot;
++ int err = -EINVAL;
++
++ struct page *page;
++
++ /*
++ * try_down_read_slot_mmap_sem() returns non-zero if the slot
++ * has been removed by uksm_remove_vma().
++ */
++ if (try_down_read_slot_mmap_sem(slot))
++ return -EBUSY;
++
++ mm = slot->vma->vm_mm;
++
++ if (uksm_test_exit(mm))
++ goto failout_up;
++
++ page = item->page;
++ rcu_read_lock();
++ if (!get_page_unless_zero(page)) {
++ rcu_read_unlock();
++ goto failout_up;
++ }
++
++ /* No need to consider huge page here. */
++ if (item->slot->vma->anon_vma != page_anon_vma(page) ||
++ vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
++ /*
++ * TODO:
++ * should we release this item because of its stale page
++ * mapping?
++ */
++ put_page(page);
++ rcu_read_unlock();
++ goto failout_up;
++ }
++ rcu_read_unlock();
++ return 0;
++
++failout_up:
++ up_read(&mm->mmap_sem);
++ return err;
++}
++
++/*
++ * Which kinds of VMA are considered?
++ */
++static inline int vma_can_enter(struct vm_area_struct *vma)
++{
++ return uksm_flags_can_scan(vma->vm_flags);
++}
++
++/*
++ * Called whenever a fresh new vma is created. A new vma_slot
++ * is created and inserted into a global list. Must be called
++ * after the vma is inserted into its mm.
++ */ ++void uksm_vma_add_new(struct vm_area_struct *vma) ++{ ++ struct vma_slot *slot; ++ ++ if (!vma_can_enter(vma)) { ++ vma->uksm_vma_slot = NULL; ++ return; ++ } ++ ++ slot = alloc_vma_slot(); ++ if (!slot) { ++ vma->uksm_vma_slot = NULL; ++ return; ++ } ++ ++ vma->uksm_vma_slot = slot; ++ vma->vm_flags |= VM_MERGEABLE; ++ slot->vma = vma; ++ slot->mm = vma->vm_mm; ++ slot->ctime_j = jiffies; ++ slot->pages = vma_pages(vma); ++ spin_lock(&vma_slot_list_lock); ++ list_add_tail(&slot->slot_list, &vma_slot_new); ++ spin_unlock(&vma_slot_list_lock); ++} ++ ++/* 32/3 < they < 32/2 */ ++#define shiftl 8 ++#define shiftr 12 ++ ++#define HASH_FROM_TO(from, to) \ ++for (index = from; index < to; index++) { \ ++ pos = random_nums[index]; \ ++ hash += key[pos]; \ ++ hash += (hash << shiftl); \ ++ hash ^= (hash >> shiftr); \ ++} ++ ++ ++#define HASH_FROM_DOWN_TO(from, to) \ ++for (index = from - 1; index >= to; index--) { \ ++ hash ^= (hash >> shiftr); \ ++ hash ^= (hash >> (shiftr*2)); \ ++ hash -= (hash << shiftl); \ ++ hash += (hash << (shiftl*2)); \ ++ pos = random_nums[index]; \ ++ hash -= key[pos]; \ ++} ++ ++/* ++ * The main random sample hash function. ++ */ ++static u32 random_sample_hash(void *addr, u32 hash_strength) ++{ ++ u32 hash = 0xdeadbeef; ++ int index, pos, loop = hash_strength; ++ u32 *key = (u32 *)addr; ++ ++ if (loop > HASH_STRENGTH_FULL) ++ loop = HASH_STRENGTH_FULL; ++ ++ HASH_FROM_TO(0, loop); ++ ++ if (hash_strength > HASH_STRENGTH_FULL) { ++ loop = hash_strength - HASH_STRENGTH_FULL; ++ HASH_FROM_TO(0, loop); ++ } ++ ++ return hash; ++} ++ ++ ++/** ++ * It's used when hash strength is adjusted ++ * ++ * @addr The page's virtual address ++ * @from The original hash strength ++ * @to The hash strength changed to ++ * @hash The hash value generated with "from" hash value ++ * ++ * return the hash value ++ */ ++static u32 delta_hash(void *addr, int from, int to, u32 hash) ++{ ++ u32 *key = (u32 *)addr; ++ int index, pos; /* make sure they are int type */ ++ ++ if (to > from) { ++ if (from >= HASH_STRENGTH_FULL) { ++ from -= HASH_STRENGTH_FULL; ++ to -= HASH_STRENGTH_FULL; ++ HASH_FROM_TO(from, to); ++ } else if (to <= HASH_STRENGTH_FULL) { ++ HASH_FROM_TO(from, to); ++ } else { ++ HASH_FROM_TO(from, HASH_STRENGTH_FULL); ++ HASH_FROM_TO(0, to - HASH_STRENGTH_FULL); ++ } ++ } else { ++ if (from <= HASH_STRENGTH_FULL) { ++ HASH_FROM_DOWN_TO(from, to); ++ } else if (to >= HASH_STRENGTH_FULL) { ++ from -= HASH_STRENGTH_FULL; ++ to -= HASH_STRENGTH_FULL; ++ HASH_FROM_DOWN_TO(from, to); ++ } else { ++ HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0); ++ HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to); ++ } ++ } ++ ++ return hash; ++} ++ ++/** ++ * ++ * Called when: rshash_pos or rshash_neg is about to overflow or a scan round ++ * has finished. ++ * ++ * return 0 if no page has been scanned since last call, 1 otherwise. 
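++ *
++ * The pos/neg/scanned counters share one scale: a stored value v with
++ * benefit.base == b stands for roughly (v << b). For example, with
++ * base == 2 a stored benefit.scanned of 1000 represents about 4000
++ * scanned pages. When any field would overflow u64, all three are
++ * halved and base is incremented, which preserves the pos:neg:scanned
++ * ratios.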
++ */ ++static inline int encode_benefit(void) ++{ ++ u64 scanned_delta, pos_delta, neg_delta; ++ unsigned long base = benefit.base; ++ ++ scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last; ++ ++ if (!scanned_delta) ++ return 0; ++ ++ scanned_delta >>= base; ++ pos_delta = rshash_pos >> base; ++ neg_delta = rshash_neg >> base; ++ ++ if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) || ++ CAN_OVERFLOW_U64(benefit.neg, neg_delta) || ++ CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) { ++ benefit.scanned >>= 1; ++ benefit.neg >>= 1; ++ benefit.pos >>= 1; ++ benefit.base++; ++ scanned_delta >>= 1; ++ pos_delta >>= 1; ++ neg_delta >>= 1; ++ } ++ ++ benefit.pos += pos_delta; ++ benefit.neg += neg_delta; ++ benefit.scanned += scanned_delta; ++ ++ BUG_ON(!benefit.scanned); ++ ++ rshash_pos = rshash_neg = 0; ++ uksm_pages_scanned_last = uksm_pages_scanned; ++ ++ return 1; ++} ++ ++static inline void reset_benefit(void) ++{ ++ benefit.pos = 0; ++ benefit.neg = 0; ++ benefit.base = 0; ++ benefit.scanned = 0; ++} ++ ++static inline void inc_rshash_pos(unsigned long delta) ++{ ++ if (CAN_OVERFLOW_U64(rshash_pos, delta)) ++ encode_benefit(); ++ ++ rshash_pos += delta; ++} ++ ++static inline void inc_rshash_neg(unsigned long delta) ++{ ++ if (CAN_OVERFLOW_U64(rshash_neg, delta)) ++ encode_benefit(); ++ ++ rshash_neg += delta; ++} ++ ++ ++static inline u32 page_hash(struct page *page, unsigned long hash_strength, ++ int cost_accounting) ++{ ++ u32 val; ++ unsigned long delta; ++ ++ void *addr = kmap_atomic(page); ++ ++ val = random_sample_hash(addr, hash_strength); ++ kunmap_atomic(addr); ++ ++ if (cost_accounting) { ++ if (hash_strength < HASH_STRENGTH_FULL) ++ delta = HASH_STRENGTH_FULL - hash_strength; ++ else ++ delta = 0; ++ ++ inc_rshash_pos(delta); ++ } ++ ++ return val; ++} ++ ++static int memcmp_pages(struct page *page1, struct page *page2, ++ int cost_accounting) ++{ ++ char *addr1, *addr2; ++ int ret; ++ ++ addr1 = kmap_atomic(page1); ++ addr2 = kmap_atomic(page2); ++ ret = memcmp(addr1, addr2, PAGE_SIZE); ++ kunmap_atomic(addr2); ++ kunmap_atomic(addr1); ++ ++ if (cost_accounting) ++ inc_rshash_neg(memcmp_cost); ++ ++ return ret; ++} ++ ++static inline int pages_identical(struct page *page1, struct page *page2) ++{ ++ return !memcmp_pages(page1, page2, 0); ++} ++ ++static inline int is_page_full_zero(struct page *page) ++{ ++ char *addr; ++ int ret; ++ ++ addr = kmap_atomic(page); ++ ret = is_full_zero(addr, PAGE_SIZE); ++ kunmap_atomic(addr); ++ ++ return ret; ++} ++ ++static int write_protect_page(struct vm_area_struct *vma, struct page *page, ++ pte_t *orig_pte, pte_t *old_pte) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ struct page_vma_mapped_walk pvmw = { ++ .page = page, ++ .vma = vma, ++ }; ++ int swapped; ++ int err = -EFAULT; ++ unsigned long mmun_start; /* For mmu_notifiers */ ++ unsigned long mmun_end; /* For mmu_notifiers */ ++ ++ pvmw.address = page_address_in_vma(page, vma); ++ if (pvmw.address == -EFAULT) ++ goto out; ++ ++ BUG_ON(PageTransCompound(page)); ++ ++ mmun_start = pvmw.address; ++ mmun_end = pvmw.address + PAGE_SIZE; ++ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ++ ++ if (!page_vma_mapped_walk(&pvmw)) ++ goto out_mn; ++ if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) ++ goto out_unlock; ++ ++ if (old_pte) ++ *old_pte = *pvmw.pte; ++ ++ if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || ++ (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || mm_tlb_flush_pending(mm)) { ++ pte_t entry; ++ ++ swapped = PageSwapCache(page); ++ 
flush_cache_page(vma, pvmw.address, page_to_pfn(page));
++ /*
++ * Ok, this is tricky: when get_user_pages_fast() runs it doesn't
++ * take any lock, therefore the check that we are going to make
++ * with the pagecount against the mapcount is racy and
++ * O_DIRECT can happen right after the check.
++ * So we clear the pte and flush the tlb before the check;
++ * this assures us that no O_DIRECT can happen after the check
++ * or in the middle of the check.
++ */
++ entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
++ /*
++ * Check that no O_DIRECT or similar I/O is in progress on the
++ * page
++ */
++ if (page_mapcount(page) + 1 + swapped != page_count(page)) {
++ set_pte_at(mm, pvmw.address, pvmw.pte, entry);
++ goto out_unlock;
++ }
++ if (pte_dirty(entry))
++ set_page_dirty(page);
++
++ if (pte_protnone(entry))
++ entry = pte_mkclean(pte_clear_savedwrite(entry));
++ else
++ entry = pte_mkclean(pte_wrprotect(entry));
++
++ set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
++ }
++ *orig_pte = *pvmw.pte;
++ err = 0;
++
++out_unlock:
++ page_vma_mapped_walk_done(&pvmw);
++out_mn:
++ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
++out:
++ return err;
++}
++
++#define MERGE_ERR_PGERR 1 /* the page is invalid, cannot continue */
++#define MERGE_ERR_COLLI 2 /* there is a collision */
++#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */
++#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */
++
++
++/**
++ * replace_page - replace page in vma by new ksm page
++ * @vma: vma that holds the pte pointing to page
++ * @page: the page we are replacing by kpage
++ * @kpage: the ksm page we replace page by
++ * @orig_pte: the original value of the pte
++ *
++ * Returns 0 on success, MERGE_ERR_PGERR on failure.
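++ *
++ * Typical usage, as in try_to_merge_with_uksm_page() below:
++ *
++ *    if (write_protect_page(vma, page, &orig_pte, NULL) == 0 &&
++ *        pages_identical(page, kpage))
++ *        err = replace_page(vma, page, kpage, orig_pte);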
++ */
++static int replace_page(struct vm_area_struct *vma, struct page *page,
++ struct page *kpage, pte_t orig_pte)
++{
++ struct mm_struct *mm = vma->vm_mm;
++ pgd_t *pgd;
++ p4d_t *p4d;
++ pud_t *pud;
++ pmd_t *pmd;
++ pte_t *ptep;
++ spinlock_t *ptl;
++ pte_t entry;
++
++ unsigned long addr;
++ int err = MERGE_ERR_PGERR;
++ unsigned long mmun_start; /* For mmu_notifiers */
++ unsigned long mmun_end; /* For mmu_notifiers */
++
++ addr = page_address_in_vma(page, vma);
++ if (addr == -EFAULT)
++ goto out;
++
++ pgd = pgd_offset(mm, addr);
++ if (!pgd_present(*pgd))
++ goto out;
++
++ p4d = p4d_offset(pgd, addr);
++ pud = pud_offset(p4d, addr);
++ if (!pud_present(*pud))
++ goto out;
++
++ pmd = pmd_offset(pud, addr);
++ BUG_ON(pmd_trans_huge(*pmd));
++ if (!pmd_present(*pmd))
++ goto out;
++
++ mmun_start = addr;
++ mmun_end = addr + PAGE_SIZE;
++ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
++
++ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
++ if (!pte_same(*ptep, orig_pte)) {
++ pte_unmap_unlock(ptep, ptl);
++ goto out_mn;
++ }
++
++ flush_cache_page(vma, addr, pte_pfn(*ptep));
++ ptep_clear_flush_notify(vma, addr, ptep);
++ entry = mk_pte(kpage, vma->vm_page_prot);
++
++ /* special treatment is needed for zero_page */
++ if ((page_to_pfn(kpage) == uksm_zero_pfn) ||
++ (page_to_pfn(kpage) == zero_pfn)) {
++ entry = pte_mkspecial(entry);
++ dec_mm_counter(mm, MM_ANONPAGES);
++ inc_zone_page_state(page, NR_UKSM_ZERO_PAGES);
++ } else {
++ get_page(kpage);
++ page_add_anon_rmap(kpage, vma, addr, false);
++ }
++
++ set_pte_at_notify(mm, addr, ptep, entry);
++
++ page_remove_rmap(page, false);
++ if (!page_mapped(page))
++ try_to_free_swap(page);
++ put_page(page);
++
++ pte_unmap_unlock(ptep, ptl);
++ err = 0;
++out_mn:
++ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
++out:
++ return err;
++}
++
++
++/**
++ * Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash value.
++ * The zero hash value at HASH_STRENGTH_MAX is used to indicate that the
++ * hash_max member has not been calculated.
++ *
++ * @page The page that needs to be hashed
++ * @hash_old The hash value calculated with the current hash strength
++ *
++ * return the new hash value calculated at HASH_STRENGTH_MAX
++ */
++static inline u32 page_hash_max(struct page *page, u32 hash_old)
++{
++ u32 hash_max = 0;
++ void *addr;
++
++ addr = kmap_atomic(page);
++ hash_max = delta_hash(addr, hash_strength,
++ HASH_STRENGTH_MAX, hash_old);
++
++ kunmap_atomic(addr);
++
++ if (!hash_max)
++ hash_max = 1;
++
++ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
++ return hash_max;
++}
++
++/*
++ * We compare the hash again, to ensure that it is really a hash collision
++ * instead of being caused by a page write.
++ */
++static inline int check_collision(struct rmap_item *rmap_item,
++ u32 hash)
++{
++ int err;
++ struct page *page = rmap_item->page;
++
++ /* if this rmap_item has already been hash_maxed, then the collision
++ * must appear in the second-level rbtree search. In this case we check
++ * if its hash_max value has been changed. Otherwise, the collision
++ * happens in the first-level rbtree search, so we check against its
++ * current hash value.
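++ *
++ * Either way, the wasted memcmp and re-hash work is charged to
++ * rshash_neg below, feeding the benefit heuristic that tunes
++ * hash_strength.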
++ */ ++ if (rmap_item->hash_max) { ++ inc_rshash_neg(memcmp_cost); ++ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); ++ ++ if (rmap_item->hash_max == page_hash_max(page, hash)) ++ err = MERGE_ERR_COLLI; ++ else ++ err = MERGE_ERR_CHANGED; ++ } else { ++ inc_rshash_neg(memcmp_cost + hash_strength); ++ ++ if (page_hash(page, hash_strength, 0) == hash) ++ err = MERGE_ERR_COLLI; ++ else ++ err = MERGE_ERR_CHANGED; ++ } ++ ++ return err; ++} ++ ++/** ++ * Try to merge a rmap_item.page with a kpage in stable node. kpage must ++ * already be a ksm page. ++ * ++ * @return 0 if the pages were merged, -EFAULT otherwise. ++ */ ++static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item, ++ struct page *kpage, u32 hash) ++{ ++ struct vm_area_struct *vma = rmap_item->slot->vma; ++ struct mm_struct *mm = vma->vm_mm; ++ pte_t orig_pte = __pte(0); ++ int err = MERGE_ERR_PGERR; ++ struct page *page; ++ ++ if (uksm_test_exit(mm)) ++ goto out; ++ ++ page = rmap_item->page; ++ ++ if (page == kpage) { /* ksm page forked */ ++ err = 0; ++ goto out; ++ } ++ ++ /* ++ * We need the page lock to read a stable PageSwapCache in ++ * write_protect_page(). We use trylock_page() instead of ++ * lock_page() because we don't want to wait here - we ++ * prefer to continue scanning and merging different pages, ++ * then come back to this page when it is unlocked. ++ */ ++ if (!trylock_page(page)) ++ goto out; ++ ++ if (!PageAnon(page) || !PageKsm(kpage)) ++ goto out_unlock; ++ ++ if (PageTransCompound(page)) { ++ err = split_huge_page(page); ++ if (err) ++ goto out_unlock; ++ } ++ ++ /* ++ * If this anonymous page is mapped only here, its pte may need ++ * to be write-protected. If it's mapped elsewhere, all of its ++ * ptes are necessarily already write-protected. But in either ++ * case, we need to lock and check page_count is not raised. ++ */ ++ if (write_protect_page(vma, page, &orig_pte, NULL) == 0) { ++ if (pages_identical(page, kpage)) ++ err = replace_page(vma, page, kpage, orig_pte); ++ else ++ err = check_collision(rmap_item, hash); ++ } ++ ++ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { ++ munlock_vma_page(page); ++ if (!PageMlocked(kpage)) { ++ unlock_page(page); ++ lock_page(kpage); ++ mlock_vma_page(kpage); ++ page = kpage; /* for final unlock */ ++ } ++ } ++ ++out_unlock: ++ unlock_page(page); ++out: ++ return err; ++} ++ ++ ++ ++/** ++ * If two pages fail to merge in try_to_merge_two_pages, then we have a chance ++ * to restore a page mapping that has been changed in try_to_merge_two_pages. ++ * ++ * @return 0 on success. ++ */ ++static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr, ++ pte_t orig_pte, pte_t wprt_pte) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ pgd_t *pgd; ++ p4d_t *p4d; ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *ptep; ++ spinlock_t *ptl; ++ ++ int err = -EFAULT; ++ ++ pgd = pgd_offset(mm, addr); ++ if (!pgd_present(*pgd)) ++ goto out; ++ ++ p4d = p4d_offset(pgd, addr); ++ pud = pud_offset(p4d, addr); ++ if (!pud_present(*pud)) ++ goto out; ++ ++ pmd = pmd_offset(pud, addr); ++ if (!pmd_present(*pmd)) ++ goto out; ++ ++ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); ++ if (!pte_same(*ptep, wprt_pte)) { ++ /* already copied, let it be */ ++ pte_unmap_unlock(ptep, ptl); ++ goto out; ++ } ++ ++ /* ++ * Good boy, still here. When we still get the ksm page, it does not ++ * return to the free page pool, there is no way that a pte was changed ++ * to other page and gets back to this page. And remind that ksm page ++ * do not reuse in do_wp_page(). 
So it's safe to restore the original ++ * pte. ++ */ ++ flush_cache_page(vma, addr, pte_pfn(*ptep)); ++ ptep_clear_flush_notify(vma, addr, ptep); ++ set_pte_at_notify(mm, addr, ptep, orig_pte); ++ ++ pte_unmap_unlock(ptep, ptl); ++ err = 0; ++out: ++ return err; ++} ++ ++/** ++ * try_to_merge_two_pages() - take two identical pages and prepare ++ * them to be merged into one page(rmap_item->page) ++ * ++ * @return 0 if we successfully merged two identical pages into ++ * one ksm page. MERGE_ERR_COLLI if it's only a hash collision ++ * search in rbtree. MERGE_ERR_CHANGED if rmap_item has been ++ * changed since it's hashed. MERGE_ERR_PGERR otherwise. ++ * ++ */ ++static int try_to_merge_two_pages(struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ u32 hash) ++{ ++ pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0); ++ pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0); ++ struct vm_area_struct *vma1 = rmap_item->slot->vma; ++ struct vm_area_struct *vma2 = tree_rmap_item->slot->vma; ++ struct page *page = rmap_item->page; ++ struct page *tree_page = tree_rmap_item->page; ++ int err = MERGE_ERR_PGERR; ++ struct address_space *saved_mapping; ++ ++ ++ if (rmap_item->page == tree_rmap_item->page) ++ goto out; ++ ++ if (!trylock_page(page)) ++ goto out; ++ ++ if (!PageAnon(page)) ++ goto out_unlock; ++ ++ if (PageTransCompound(page)) { ++ err = split_huge_page(page); ++ if (err) ++ goto out_unlock; ++ } ++ ++ if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) { ++ unlock_page(page); ++ goto out; ++ } ++ ++ /* ++ * While we hold page lock, upgrade page from ++ * PageAnon+anon_vma to PageKsm+NULL stable_node: ++ * stable_tree_insert() will update stable_node. ++ */ ++ saved_mapping = page->mapping; ++ set_page_stable_node(page, NULL); ++ mark_page_accessed(page); ++ if (!PageDirty(page)) ++ SetPageDirty(page); ++ ++ unlock_page(page); ++ ++ if (!trylock_page(tree_page)) ++ goto restore_out; ++ ++ if (!PageAnon(tree_page)) { ++ unlock_page(tree_page); ++ goto restore_out; ++ } ++ ++ if (PageTransCompound(tree_page)) { ++ err = split_huge_page(tree_page); ++ if (err) { ++ unlock_page(tree_page); ++ goto restore_out; ++ } ++ } ++ ++ if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) { ++ unlock_page(tree_page); ++ goto restore_out; ++ } ++ ++ if (pages_identical(page, tree_page)) { ++ err = replace_page(vma2, tree_page, page, wprt_pte2); ++ if (err) { ++ unlock_page(tree_page); ++ goto restore_out; ++ } ++ ++ if ((vma2->vm_flags & VM_LOCKED)) { ++ munlock_vma_page(tree_page); ++ if (!PageMlocked(page)) { ++ unlock_page(tree_page); ++ lock_page(page); ++ mlock_vma_page(page); ++ tree_page = page; /* for final unlock */ ++ } ++ } ++ ++ unlock_page(tree_page); ++ ++ goto out; /* success */ ++ ++ } else { ++ if (tree_rmap_item->hash_max && ++ tree_rmap_item->hash_max == rmap_item->hash_max) { ++ err = MERGE_ERR_COLLI_MAX; ++ } else if (page_hash(page, hash_strength, 0) == ++ page_hash(tree_page, hash_strength, 0)) { ++ inc_rshash_neg(memcmp_cost + hash_strength * 2); ++ err = MERGE_ERR_COLLI; ++ } else { ++ err = MERGE_ERR_CHANGED; ++ } ++ ++ unlock_page(tree_page); ++ } ++ ++restore_out: ++ lock_page(page); ++ if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item), ++ orig_pte1, wprt_pte1)) ++ page->mapping = saved_mapping; ++ ++out_unlock: ++ unlock_page(page); ++out: ++ return err; ++} ++ ++static inline int hash_cmp(u32 new_val, u32 node_val) ++{ ++ if (new_val > node_val) ++ return 1; ++ else if (new_val < node_val) ++ return -1; ++ else ++ return 0; 
++} ++ ++static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash) ++{ ++ u32 hash_max = item->hash_max; ++ ++ if (!hash_max) { ++ hash_max = page_hash_max(item->page, hash); ++ ++ item->hash_max = hash_max; ++ } ++ ++ return hash_max; ++} ++ ++ ++ ++/** ++ * stable_tree_search() - search the stable tree for a page ++ * ++ * @item: the rmap_item we are comparing with ++ * @hash: the hash value of this item->page already calculated ++ * ++ * @return the page we have found, NULL otherwise. The page returned has ++ * been gotten. ++ */ ++static struct page *stable_tree_search(struct rmap_item *item, u32 hash) ++{ ++ struct rb_node *node = root_stable_treep->rb_node; ++ struct tree_node *tree_node; ++ unsigned long hash_max; ++ struct page *page = item->page; ++ struct stable_node *stable_node; ++ ++ stable_node = page_stable_node(page); ++ if (stable_node) { ++ /* ksm page forked, that is ++ * if (PageKsm(page) && !in_stable_tree(rmap_item)) ++ * it's actually gotten once outside. ++ */ ++ get_page(page); ++ return page; ++ } ++ ++ while (node) { ++ int cmp; ++ ++ tree_node = rb_entry(node, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) ++ node = node->rb_left; ++ else if (cmp > 0) ++ node = node->rb_right; ++ else ++ break; ++ } ++ ++ if (!node) ++ return NULL; ++ ++ if (tree_node->count == 1) { ++ stable_node = rb_entry(tree_node->sub_root.rb_node, ++ struct stable_node, node); ++ BUG_ON(!stable_node); ++ ++ goto get_page_out; ++ } ++ ++ /* ++ * ok, we have to search the second ++ * level subtree, hash the page to a ++ * full strength. ++ */ ++ node = tree_node->sub_root.rb_node; ++ BUG_ON(!node); ++ hash_max = rmap_item_hash_max(item, hash); ++ ++ while (node) { ++ int cmp; ++ ++ stable_node = rb_entry(node, struct stable_node, node); ++ ++ cmp = hash_cmp(hash_max, stable_node->hash_max); ++ ++ if (cmp < 0) ++ node = node->rb_left; ++ else if (cmp > 0) ++ node = node->rb_right; ++ else ++ goto get_page_out; ++ } ++ ++ return NULL; ++ ++get_page_out: ++ page = get_uksm_page(stable_node, 1, 1); ++ return page; ++} ++ ++static int try_merge_rmap_item(struct rmap_item *item, ++ struct page *kpage, ++ struct page *tree_page) ++{ ++ struct vm_area_struct *vma = item->slot->vma; ++ struct page_vma_mapped_walk pvmw = { ++ .page = kpage, ++ .vma = vma, ++ }; ++ ++ pvmw.address = get_rmap_addr(item); ++ if (!page_vma_mapped_walk(&pvmw)) ++ return 0; ++ ++ if (pte_write(*pvmw.pte)) { ++ /* has changed, abort! */ ++ page_vma_mapped_walk_done(&pvmw); ++ return 0; ++ } ++ ++ get_page(tree_page); ++ page_add_anon_rmap(tree_page, vma, pvmw.address, false); ++ ++ flush_cache_page(vma, pvmw.address, page_to_pfn(kpage)); ++ ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); ++ set_pte_at_notify(vma->vm_mm, pvmw.address, pvmw.pte, ++ mk_pte(tree_page, vma->vm_page_prot)); ++ ++ page_remove_rmap(kpage, false); ++ put_page(kpage); ++ ++ page_vma_mapped_walk_done(&pvmw); ++ ++ return 1; ++} ++ ++/** ++ * try_to_merge_with_stable_page() - when two rmap_items need to be inserted ++ * into stable tree, the page was found to be identical to a stable ksm page, ++ * this is the last chance we can merge them into one. ++ * ++ * @item1: the rmap_item holding the page which we wanted to insert ++ * into stable tree. 
++ * @item2: the other rmap_item we found during the unstable tree search
++ * @oldpage: the page currently mapped by the two rmap_items
++ * @tree_page: the identical page we found in the stable tree node
++ * @success1: return if item1 is successfully merged
++ * @success2: return if item2 is successfully merged
++ */
++static void try_merge_with_stable(struct rmap_item *item1,
++ struct rmap_item *item2,
++ struct page **kpage,
++ struct page *tree_page,
++ int *success1, int *success2)
++{
++ struct vm_area_struct *vma1 = item1->slot->vma;
++ struct vm_area_struct *vma2 = item2->slot->vma;
++ *success1 = 0;
++ *success2 = 0;
++
++ if (unlikely(*kpage == tree_page)) {
++ /* I don't think this can really happen */
++ pr_warn("UKSM: unexpected condition detected in "
++ "%s -- *kpage == tree_page !\n", __func__);
++ *success1 = 1;
++ *success2 = 1;
++ return;
++ }
++
++ if (!PageAnon(*kpage) || !PageKsm(*kpage))
++ goto failed;
++
++ if (!trylock_page(tree_page))
++ goto failed;
++
++ /* If the oldpage is still ksm, still pointed to in the right
++ * place, and still write protected, we are confident it has not
++ * changed, so there is no need to memcmp anymore.
++ * Beware: we cannot take nested pte locks here; that would be a
++ * deadlock risk.
++ */
++ if (!try_merge_rmap_item(item1, *kpage, tree_page))
++ goto unlock_failed;
++
++ /* Ok, now vma2; note that pte1 has already been set */
++ if (!try_merge_rmap_item(item2, *kpage, tree_page))
++ goto success_1;
++
++ *success2 = 1;
++success_1:
++ *success1 = 1;
++
++
++ if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
++ (*success2 && vma2->vm_flags & VM_LOCKED)) {
++ munlock_vma_page(*kpage);
++ if (!PageMlocked(tree_page))
++ mlock_vma_page(tree_page);
++ }
++
++ /*
++ * The caller does not need oldpage any more, so we can drop its lock
++ * now.
++ */
++ unlock_page(*kpage);
++ *kpage = tree_page; /* Gets unlocked outside.
*/ ++ return; ++ ++unlock_failed: ++ unlock_page(tree_page); ++failed: ++ return; ++} ++ ++static inline void stable_node_hash_max(struct stable_node *node, ++ struct page *page, u32 hash) ++{ ++ u32 hash_max = node->hash_max; ++ ++ if (!hash_max) { ++ hash_max = page_hash_max(page, hash); ++ node->hash_max = hash_max; ++ } ++} ++ ++static inline ++struct stable_node *new_stable_node(struct tree_node *tree_node, ++ struct page *kpage, u32 hash_max) ++{ ++ struct stable_node *new_stable_node; ++ ++ new_stable_node = alloc_stable_node(); ++ if (!new_stable_node) ++ return NULL; ++ ++ new_stable_node->kpfn = page_to_pfn(kpage); ++ new_stable_node->hash_max = hash_max; ++ new_stable_node->tree_node = tree_node; ++ set_page_stable_node(kpage, new_stable_node); ++ ++ return new_stable_node; ++} ++ ++static inline ++struct stable_node *first_level_insert(struct tree_node *tree_node, ++ struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ struct page **kpage, u32 hash, ++ int *success1, int *success2) ++{ ++ int cmp; ++ struct page *tree_page; ++ u32 hash_max = 0; ++ struct stable_node *stable_node, *new_snode; ++ struct rb_node *parent = NULL, **new; ++ ++ /* this tree node contains no sub-tree yet */ ++ stable_node = rb_entry(tree_node->sub_root.rb_node, ++ struct stable_node, node); ++ ++ tree_page = get_uksm_page(stable_node, 1, 0); ++ if (tree_page) { ++ cmp = memcmp_pages(*kpage, tree_page, 1); ++ if (!cmp) { ++ try_merge_with_stable(rmap_item, tree_rmap_item, kpage, ++ tree_page, success1, success2); ++ put_page(tree_page); ++ if (!*success1 && !*success2) ++ goto failed; ++ ++ return stable_node; ++ ++ } else { ++ /* ++ * collision in first level try to create a subtree. ++ * A new node need to be created. ++ */ ++ put_page(tree_page); ++ ++ stable_node_hash_max(stable_node, tree_page, ++ tree_node->hash); ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ cmp = hash_cmp(hash_max, stable_node->hash_max); ++ ++ parent = &stable_node->node; ++ if (cmp < 0) ++ new = &parent->rb_left; ++ else if (cmp > 0) ++ new = &parent->rb_right; ++ else ++ goto failed; ++ } ++ ++ } else { ++ /* the only stable_node deleted, we reuse its tree_node. 
++ */ ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ ++ new_snode = new_stable_node(tree_node, *kpage, hash_max); ++ if (!new_snode) ++ goto failed; ++ ++ rb_link_node(&new_snode->node, parent, new); ++ rb_insert_color(&new_snode->node, &tree_node->sub_root); ++ tree_node->count++; ++ *success1 = *success2 = 1; ++ ++ return new_snode; ++ ++failed: ++ return NULL; ++} ++ ++static inline ++struct stable_node *stable_subtree_insert(struct tree_node *tree_node, ++ struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ struct page **kpage, u32 hash, ++ int *success1, int *success2) ++{ ++ struct page *tree_page; ++ u32 hash_max; ++ struct stable_node *stable_node, *new_snode; ++ struct rb_node *parent, **new; ++ ++research: ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ BUG_ON(!*new); ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ while (*new) { ++ int cmp; ++ ++ stable_node = rb_entry(*new, struct stable_node, node); ++ ++ cmp = hash_cmp(hash_max, stable_node->hash_max); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else { ++ tree_page = get_uksm_page(stable_node, 1, 0); ++ if (tree_page) { ++ cmp = memcmp_pages(*kpage, tree_page, 1); ++ if (!cmp) { ++ try_merge_with_stable(rmap_item, ++ tree_rmap_item, kpage, ++ tree_page, success1, success2); ++ ++ put_page(tree_page); ++ if (!*success1 && !*success2) ++ goto failed; ++ /* ++ * successfully merged with a stable ++ * node ++ */ ++ return stable_node; ++ } else { ++ put_page(tree_page); ++ goto failed; ++ } ++ } else { ++ /* ++ * stable node may be deleted, ++ * and subtree maybe ++ * restructed, cannot ++ * continue, research it. ++ */ ++ if (tree_node->count) { ++ goto research; ++ } else { ++ /* reuse the tree node*/ ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ } ++ } ++ } ++ ++ new_snode = new_stable_node(tree_node, *kpage, hash_max); ++ if (!new_snode) ++ goto failed; ++ ++ rb_link_node(&new_snode->node, parent, new); ++ rb_insert_color(&new_snode->node, &tree_node->sub_root); ++ tree_node->count++; ++ *success1 = *success2 = 1; ++ ++ return new_snode; ++ ++failed: ++ return NULL; ++} ++ ++ ++/** ++ * stable_tree_insert() - try to insert a merged page in unstable tree to ++ * the stable tree ++ * ++ * @kpage: the page need to be inserted ++ * @hash: the current hash of this page ++ * @rmap_item: the rmap_item being scanned ++ * @tree_rmap_item: the rmap_item found on unstable tree ++ * @success1: return if rmap_item is merged ++ * @success2: return if tree_rmap_item is merged ++ * ++ * @return the stable_node on stable tree if at least one ++ * rmap_item is inserted into stable tree, NULL ++ * otherwise. 
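++ *
++ * The walk below mirrors the two-level layout of the stable tree: the
++ * first-level rbtree is keyed by the sampled hash (tree_node->hash),
++ * and each tree_node holds a sub-rbtree keyed by the full-strength
++ * hash_max (stable_node->hash_max), which is only computed once a
++ * first-level collision actually occurs.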
++ */ ++static struct stable_node * ++stable_tree_insert(struct page **kpage, u32 hash, ++ struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ int *success1, int *success2) ++{ ++ struct rb_node **new = &root_stable_treep->rb_node; ++ struct rb_node *parent = NULL; ++ struct stable_node *stable_node; ++ struct tree_node *tree_node; ++ u32 hash_max = 0; ++ ++ *success1 = *success2 = 0; ++ ++ while (*new) { ++ int cmp; ++ ++ tree_node = rb_entry(*new, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else ++ break; ++ } ++ ++ if (*new) { ++ if (tree_node->count == 1) { ++ stable_node = first_level_insert(tree_node, rmap_item, ++ tree_rmap_item, kpage, ++ hash, success1, success2); ++ } else { ++ stable_node = stable_subtree_insert(tree_node, ++ rmap_item, tree_rmap_item, kpage, ++ hash, success1, success2); ++ } ++ } else { ++ ++ /* no tree node found */ ++ tree_node = alloc_tree_node(stable_tree_node_listp); ++ if (!tree_node) { ++ stable_node = NULL; ++ goto out; ++ } ++ ++ stable_node = new_stable_node(tree_node, *kpage, hash_max); ++ if (!stable_node) { ++ free_tree_node(tree_node); ++ goto out; ++ } ++ ++ tree_node->hash = hash; ++ rb_link_node(&tree_node->node, parent, new); ++ rb_insert_color(&tree_node->node, root_stable_treep); ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ ++ rb_link_node(&stable_node->node, parent, new); ++ rb_insert_color(&stable_node->node, &tree_node->sub_root); ++ tree_node->count++; ++ *success1 = *success2 = 1; ++ } ++ ++out: ++ return stable_node; ++} ++ ++ ++/** ++ * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem ++ * ++ * @return 0 on success, -EBUSY if unable to lock the mmap_sem, ++ * -EINVAL if the page mapping has been changed. ++ */ ++static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item) ++{ ++ int err; ++ ++ err = get_mergeable_page_lock_mmap(tree_rmap_item); ++ ++ if (err == -EINVAL) { ++ /* its page map has been changed, remove it */ ++ remove_rmap_item_from_tree(tree_rmap_item); ++ } ++ ++ /* The page is gotten and mmap_sem is locked now. */ ++ return err; ++} ++ ++ ++/** ++ * unstable_tree_search_insert() - search an unstable tree rmap_item with the ++ * same hash value. 
Get its page and trylock the mmap_sem ++ */ ++static inline ++struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, ++ u32 hash) ++ ++{ ++ struct rb_node **new = &root_unstable_tree.rb_node; ++ struct rb_node *parent = NULL; ++ struct tree_node *tree_node; ++ u32 hash_max; ++ struct rmap_item *tree_rmap_item; ++ ++ while (*new) { ++ int cmp; ++ ++ tree_node = rb_entry(*new, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else ++ break; ++ } ++ ++ if (*new) { ++ /* got the tree_node */ ++ if (tree_node->count == 1) { ++ tree_rmap_item = rb_entry(tree_node->sub_root.rb_node, ++ struct rmap_item, node); ++ BUG_ON(!tree_rmap_item); ++ ++ goto get_page_out; ++ } ++ ++ /* well, search the collision subtree */ ++ new = &tree_node->sub_root.rb_node; ++ BUG_ON(!*new); ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ ++ while (*new) { ++ int cmp; ++ ++ tree_rmap_item = rb_entry(*new, struct rmap_item, ++ node); ++ ++ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); ++ parent = *new; ++ if (cmp < 0) ++ new = &parent->rb_left; ++ else if (cmp > 0) ++ new = &parent->rb_right; ++ else ++ goto get_page_out; ++ } ++ } else { ++ /* alloc a new tree_node */ ++ tree_node = alloc_tree_node(&unstable_tree_node_list); ++ if (!tree_node) ++ return NULL; ++ ++ tree_node->hash = hash; ++ rb_link_node(&tree_node->node, parent, new); ++ rb_insert_color(&tree_node->node, &root_unstable_tree); ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ ++ /* did not found even in sub-tree */ ++ rmap_item->tree_node = tree_node; ++ rmap_item->address |= UNSTABLE_FLAG; ++ rmap_item->hash_round = uksm_hash_round; ++ rb_link_node(&rmap_item->node, parent, new); ++ rb_insert_color(&rmap_item->node, &tree_node->sub_root); ++ ++ uksm_pages_unshared++; ++ return NULL; ++ ++get_page_out: ++ if (tree_rmap_item->page == rmap_item->page) ++ return NULL; ++ ++ if (get_tree_rmap_item_page(tree_rmap_item)) ++ return NULL; ++ ++ return tree_rmap_item; ++} ++ ++static void hold_anon_vma(struct rmap_item *rmap_item, ++ struct anon_vma *anon_vma) ++{ ++ rmap_item->anon_vma = anon_vma; ++ get_anon_vma(anon_vma); ++} ++ ++ ++/** ++ * stable_tree_append() - append a rmap_item to a stable node. Deduplication ++ * ratio statistics is done in this function. 
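++ *
++ * Every other vma_slot already hanging off this stable node is credited
++ * with rung->step "bemerged" pages, i.e. (as the loop below does):
++ *
++ *    node_vma->slot->pages_bemerged += factor;
++ *
++ * and first-time slots are queued on vma_slot_dedup, the list of all
++ * slots that merged pages in this eval round.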
++ * ++ */ ++static void stable_tree_append(struct rmap_item *rmap_item, ++ struct stable_node *stable_node, int logdedup) ++{ ++ struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL; ++ unsigned long key = (unsigned long)rmap_item->slot; ++ unsigned long factor = rmap_item->slot->rung->step; ++ ++ BUG_ON(!stable_node); ++ rmap_item->address |= STABLE_FLAG; ++ ++ if (hlist_empty(&stable_node->hlist)) { ++ uksm_pages_shared++; ++ goto node_vma_new; ++ } else { ++ uksm_pages_sharing++; ++ } ++ ++ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { ++ if (node_vma->key >= key) ++ break; ++ ++ if (logdedup) { ++ node_vma->slot->pages_bemerged += factor; ++ if (list_empty(&node_vma->slot->dedup_list)) ++ list_add(&node_vma->slot->dedup_list, ++ &vma_slot_dedup); ++ } ++ } ++ ++ if (node_vma) { ++ if (node_vma->key == key) { ++ node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist); ++ goto node_vma_ok; ++ } else if (node_vma->key > key) { ++ node_vma_cont = node_vma; ++ } ++ } ++ ++node_vma_new: ++ /* no same vma already in node, alloc a new node_vma */ ++ new_node_vma = alloc_node_vma(); ++ BUG_ON(!new_node_vma); ++ new_node_vma->head = stable_node; ++ new_node_vma->slot = rmap_item->slot; ++ ++ if (!node_vma) { ++ hlist_add_head(&new_node_vma->hlist, &stable_node->hlist); ++ } else if (node_vma->key != key) { ++ if (node_vma->key < key) ++ hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist); ++ else { ++ hlist_add_before(&new_node_vma->hlist, ++ &node_vma->hlist); ++ } ++ ++ } ++ node_vma = new_node_vma; ++ ++node_vma_ok: /* ok, ready to add to the list */ ++ rmap_item->head = node_vma; ++ hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist); ++ hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma); ++ if (logdedup) { ++ rmap_item->slot->pages_merged++; ++ if (node_vma_cont) { ++ node_vma = node_vma_cont; ++ hlist_for_each_entry_continue(node_vma, hlist) { ++ node_vma->slot->pages_bemerged += factor; ++ if (list_empty(&node_vma->slot->dedup_list)) ++ list_add(&node_vma->slot->dedup_list, ++ &vma_slot_dedup); ++ } ++ } ++ } ++} ++ ++/* ++ * We use break_ksm to break COW on a ksm page: it's a stripped down ++ * ++ * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1) ++ * put_page(page); ++ * ++ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, ++ * in case the application has unmapped and remapped mm,addr meanwhile. ++ * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP ++ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. ++ */ ++static int break_ksm(struct vm_area_struct *vma, unsigned long addr) ++{ ++ struct page *page; ++ int ret = 0; ++ ++ do { ++ cond_resched(); ++ page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); ++ if (IS_ERR_OR_NULL(page)) ++ break; ++ if (PageKsm(page)) { ++ ret = handle_mm_fault(vma, addr, ++ FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE); ++ } else ++ ret = VM_FAULT_WRITE; ++ put_page(page); ++ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); ++ /* ++ * We must loop because handle_mm_fault() may back out if there's ++ * any difficulty e.g. if pte accessed bit gets updated concurrently. ++ * ++ * VM_FAULT_WRITE is what we have been hoping for: it indicates that ++ * COW has been broken, even if the vma does not permit VM_WRITE; ++ * but note that a concurrent fault might break PageKsm for us. 
++ * ++ * VM_FAULT_SIGBUS could occur if we race with truncation of the ++ * backing file, which also invalidates anonymous pages: that's ++ * okay, that truncation will have unmapped the PageKsm for us. ++ * ++ * VM_FAULT_OOM: at the time of writing (late July 2009), setting ++ * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the ++ * current task has TIF_MEMDIE set, and will be OOM killed on return ++ * to user; and ksmd, having no mm, would never be chosen for that. ++ * ++ * But if the mm is in a limited mem_cgroup, then the fault may fail ++ * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and ++ * even ksmd can fail in this way - though it's usually breaking ksm ++ * just to undo a merge it made a moment before, so unlikely to oom. ++ * ++ * That's a pity: we might therefore have more kernel pages allocated ++ * than we're counting as nodes in the stable tree; but uksm_do_scan ++ * will retry to break_cow on each pass, so should recover the page ++ * in due course. The important thing is to not let VM_MERGEABLE ++ * be cleared while any such pages might remain in the area. ++ */ ++ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; ++} ++ ++static void break_cow(struct rmap_item *rmap_item) ++{ ++ struct vm_area_struct *vma = rmap_item->slot->vma; ++ struct mm_struct *mm = vma->vm_mm; ++ unsigned long addr = get_rmap_addr(rmap_item); ++ ++ if (uksm_test_exit(mm)) ++ goto out; ++ ++ break_ksm(vma, addr); ++out: ++ return; ++} ++ ++/* ++ * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather ++ * than check every pte of a given vma, the locking doesn't quite work for ++ * that - an rmap_item is assigned to the stable tree after inserting ksm ++ * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing ++ * rmap_items from parent to child at fork time (so as not to waste time ++ * if exit comes before the next scan reaches it). ++ * ++ * Similarly, although we'd like to remove rmap_items (so updating counts ++ * and freeing memory) when unmerging an area, it's easier to leave that ++ * to the next pass of ksmd - consider, for example, how ksmd might be ++ * in cmp_and_merge_page on one of the rmap_items we would be removing. 
++ */ ++inline int unmerge_uksm_pages(struct vm_area_struct *vma, ++ unsigned long start, unsigned long end) ++{ ++ unsigned long addr; ++ int err = 0; ++ ++ for (addr = start; addr < end && !err; addr += PAGE_SIZE) { ++ if (uksm_test_exit(vma->vm_mm)) ++ break; ++ if (signal_pending(current)) ++ err = -ERESTARTSYS; ++ else ++ err = break_ksm(vma, addr); ++ } ++ return err; ++} ++ ++static inline void inc_uksm_pages_scanned(void) ++{ ++ u64 delta; ++ ++ ++ if (uksm_pages_scanned == U64_MAX) { ++ encode_benefit(); ++ ++ delta = uksm_pages_scanned >> pages_scanned_base; ++ ++ if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) { ++ pages_scanned_stored >>= 1; ++ delta >>= 1; ++ pages_scanned_base++; ++ } ++ ++ pages_scanned_stored += delta; ++ ++ uksm_pages_scanned = uksm_pages_scanned_last = 0; ++ } ++ ++ uksm_pages_scanned++; ++} ++ ++static inline int find_zero_page_hash(int strength, u32 hash) ++{ ++ return (zero_hash_table[strength] == hash); ++} ++ ++static ++int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page) ++{ ++ struct page *zero_page = empty_uksm_zero_page; ++ struct mm_struct *mm = vma->vm_mm; ++ pte_t orig_pte = __pte(0); ++ int err = -EFAULT; ++ ++ if (uksm_test_exit(mm)) ++ goto out; ++ ++ if (!trylock_page(page)) ++ goto out; ++ ++ if (!PageAnon(page)) ++ goto out_unlock; ++ ++ if (PageTransCompound(page)) { ++ err = split_huge_page(page); ++ if (err) ++ goto out_unlock; ++ } ++ ++ if (write_protect_page(vma, page, &orig_pte, 0) == 0) { ++ if (is_page_full_zero(page)) ++ err = replace_page(vma, page, zero_page, orig_pte); ++ } ++ ++out_unlock: ++ unlock_page(page); ++out: ++ return err; ++} ++ ++/* ++ * cmp_and_merge_page() - first see if page can be merged into the stable ++ * tree; if not, compare hash to previous and if it's the same, see if page ++ * can be inserted into the unstable tree, or merged with a page already there ++ * and both transferred to the stable tree. ++ * ++ * @page: the page that we are searching identical page to. ++ * @rmap_item: the reverse mapping into the virtual address of this page ++ */ ++static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash) ++{ ++ struct rmap_item *tree_rmap_item; ++ struct page *page; ++ struct page *kpage = NULL; ++ u32 hash_max; ++ int err; ++ unsigned int success1, success2; ++ struct stable_node *snode; ++ int cmp; ++ struct rb_node *parent = NULL, **new; ++ ++ remove_rmap_item_from_tree(rmap_item); ++ page = rmap_item->page; ++ ++ /* We first start with searching the page inside the stable tree */ ++ kpage = stable_tree_search(rmap_item, hash); ++ if (kpage) { ++ err = try_to_merge_with_uksm_page(rmap_item, kpage, ++ hash); ++ if (!err) { ++ /* ++ * The page was successfully merged, add ++ * its rmap_item to the stable tree. ++ * page lock is needed because it's ++ * racing with try_to_unmap_ksm(), etc. ++ */ ++ lock_page(kpage); ++ snode = page_stable_node(kpage); ++ stable_tree_append(rmap_item, snode, 1); ++ unlock_page(kpage); ++ put_page(kpage); ++ return; /* success */ ++ } ++ put_page(kpage); ++ ++ /* ++ * if it's a collision and it has been search in sub-rbtree ++ * (hash_max != 0), we want to abort, because if it is ++ * successfully merged in unstable tree, the collision trends to ++ * happen again. 
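++ *
++ * (To illustrate the two-level scheme with made-up values: pages A
++ * and B may share the sampled level-1 hash 0x1234 and therefore the
++ * same tree_node, while their full-strength hash_max values, say
++ * 0x9a and 0x9b, still separate them inside that node's sub-rbtree.)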
++ */ ++ if (err == MERGE_ERR_COLLI && rmap_item->hash_max) ++ return; ++ } ++ ++ tree_rmap_item = ++ unstable_tree_search_insert(rmap_item, hash); ++ if (tree_rmap_item) { ++ err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash); ++ /* ++ * As soon as we merge this page, we want to remove the ++ * rmap_item of the page we have merged with from the unstable ++ * tree, and insert it instead as new node in the stable tree. ++ */ ++ if (!err) { ++ kpage = page; ++ remove_rmap_item_from_tree(tree_rmap_item); ++ lock_page(kpage); ++ snode = stable_tree_insert(&kpage, hash, ++ rmap_item, tree_rmap_item, ++ &success1, &success2); ++ ++ /* ++ * Do not log dedup for tree item, it's not counted as ++ * scanned in this round. ++ */ ++ if (success2) ++ stable_tree_append(tree_rmap_item, snode, 0); ++ ++ /* ++ * The order of these two stable append is important: ++ * we are scanning rmap_item. ++ */ ++ if (success1) ++ stable_tree_append(rmap_item, snode, 1); ++ ++ /* ++ * The original kpage may be unlocked inside ++ * stable_tree_insert() already. This page ++ * should be unlocked before doing ++ * break_cow(). ++ */ ++ unlock_page(kpage); ++ ++ if (!success1) ++ break_cow(rmap_item); ++ ++ if (!success2) ++ break_cow(tree_rmap_item); ++ ++ } else if (err == MERGE_ERR_COLLI) { ++ BUG_ON(tree_rmap_item->tree_node->count > 1); ++ ++ rmap_item_hash_max(tree_rmap_item, ++ tree_rmap_item->tree_node->hash); ++ ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); ++ parent = &tree_rmap_item->node; ++ if (cmp < 0) ++ new = &parent->rb_left; ++ else if (cmp > 0) ++ new = &parent->rb_right; ++ else ++ goto put_up_out; ++ ++ rmap_item->tree_node = tree_rmap_item->tree_node; ++ rmap_item->address |= UNSTABLE_FLAG; ++ rmap_item->hash_round = uksm_hash_round; ++ rb_link_node(&rmap_item->node, parent, new); ++ rb_insert_color(&rmap_item->node, ++ &tree_rmap_item->tree_node->sub_root); ++ rmap_item->tree_node->count++; ++ } else { ++ /* ++ * either one of the page has changed or they collide ++ * at the max hash, we consider them as ill items. 
++ */ ++ remove_rmap_item_from_tree(tree_rmap_item); ++ } ++put_up_out: ++ put_page(tree_rmap_item->page); ++ up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem); ++ } ++} ++ ++ ++ ++ ++static inline unsigned long get_pool_index(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT; ++ if (pool_index >= slot->pool_size) ++ BUG(); ++ return pool_index; ++} ++ ++static inline unsigned long index_page_offset(unsigned long index) ++{ ++ return offset_in_page(sizeof(struct rmap_list_entry *) * index); ++} ++ ++static inline ++struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot, ++ unsigned long index, int need_alloc) ++{ ++ unsigned long pool_index; ++ struct page *page; ++ void *addr; ++ ++ ++ pool_index = get_pool_index(slot, index); ++ if (!slot->rmap_list_pool[pool_index]) { ++ if (!need_alloc) ++ return NULL; ++ ++ page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); ++ if (!page) ++ return NULL; ++ ++ slot->rmap_list_pool[pool_index] = page; ++ } ++ ++ addr = kmap(slot->rmap_list_pool[pool_index]); ++ addr += index_page_offset(index); ++ ++ return addr; ++} ++ ++static inline void put_rmap_list_entry(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ BUG_ON(!slot->rmap_list_pool[pool_index]); ++ kunmap(slot->rmap_list_pool[pool_index]); ++} ++ ++static inline int entry_is_new(struct rmap_list_entry *entry) ++{ ++ return !entry->item; ++} ++ ++static inline unsigned long get_index_orig_addr(struct vma_slot *slot, ++ unsigned long index) ++{ ++ return slot->vma->vm_start + (index << PAGE_SHIFT); ++} ++ ++static inline unsigned long get_entry_address(struct rmap_list_entry *entry) ++{ ++ unsigned long addr; ++ ++ if (is_addr(entry->addr)) ++ addr = get_clean_addr(entry->addr); ++ else if (entry->item) ++ addr = get_rmap_addr(entry->item); ++ else ++ BUG(); ++ ++ return addr; ++} ++ ++static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry) ++{ ++ if (is_addr(entry->addr)) ++ return NULL; ++ ++ return entry->item; ++} ++ ++static inline void inc_rmap_list_pool_count(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ BUG_ON(!slot->rmap_list_pool[pool_index]); ++ slot->pool_counts[pool_index]++; ++} ++ ++static inline void dec_rmap_list_pool_count(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ BUG_ON(!slot->rmap_list_pool[pool_index]); ++ BUG_ON(!slot->pool_counts[pool_index]); ++ slot->pool_counts[pool_index]--; ++} ++ ++static inline int entry_has_rmap(struct rmap_list_entry *entry) ++{ ++ return !is_addr(entry->addr) && entry->item; ++} ++ ++static inline void swap_entries(struct rmap_list_entry *entry1, ++ unsigned long index1, ++ struct rmap_list_entry *entry2, ++ unsigned long index2) ++{ ++ struct rmap_list_entry tmp; ++ ++ /* swapping two new entries is meaningless */ ++ BUG_ON(entry_is_new(entry1) && entry_is_new(entry2)); ++ ++ tmp = *entry1; ++ *entry1 = *entry2; ++ *entry2 = tmp; ++ ++ if (entry_has_rmap(entry1)) ++ entry1->item->entry_index = index1; ++ ++ if (entry_has_rmap(entry2)) ++ entry2->item->entry_index = index2; ++ ++ if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) { ++ inc_rmap_list_pool_count(entry1->item->slot, index1); ++ dec_rmap_list_pool_count(entry1->item->slot, index2); ++ } 
else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) { ++ inc_rmap_list_pool_count(entry2->item->slot, index2); ++ dec_rmap_list_pool_count(entry2->item->slot, index1); ++ } ++} ++ ++static inline void free_entry_item(struct rmap_list_entry *entry) ++{ ++ unsigned long index; ++ struct rmap_item *item; ++ ++ if (!is_addr(entry->addr)) { ++ BUG_ON(!entry->item); ++ item = entry->item; ++ entry->addr = get_rmap_addr(item); ++ set_is_addr(entry->addr); ++ index = item->entry_index; ++ remove_rmap_item_from_tree(item); ++ dec_rmap_list_pool_count(item->slot, index); ++ free_rmap_item(item); ++ } ++} ++ ++static inline int pool_entry_boundary(unsigned long index) ++{ ++ unsigned long linear_addr; ++ ++ linear_addr = sizeof(struct rmap_list_entry *) * index; ++ return index && !offset_in_page(linear_addr); ++} ++ ++static inline void try_free_last_pool(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ if (slot->rmap_list_pool[pool_index] && ++ !slot->pool_counts[pool_index]) { ++ __free_page(slot->rmap_list_pool[pool_index]); ++ slot->rmap_list_pool[pool_index] = NULL; ++ slot->flags |= UKSM_SLOT_NEED_SORT; ++ } ++ ++} ++ ++static inline unsigned long vma_item_index(struct vm_area_struct *vma, ++ struct rmap_item *item) ++{ ++ return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT; ++} ++ ++static int within_same_pool(struct vma_slot *slot, ++ unsigned long i, unsigned long j) ++{ ++ unsigned long pool_i, pool_j; ++ ++ pool_i = get_pool_index(slot, i); ++ pool_j = get_pool_index(slot, j); ++ ++ return (pool_i == pool_j); ++} ++ ++static void sort_rmap_entry_list(struct vma_slot *slot) ++{ ++ unsigned long i, j; ++ struct rmap_list_entry *entry, *swap_entry; ++ ++ entry = get_rmap_list_entry(slot, 0, 0); ++ for (i = 0; i < slot->pages; ) { ++ ++ if (!entry) ++ goto skip_whole_pool; ++ ++ if (entry_is_new(entry)) ++ goto next_entry; ++ ++ if (is_addr(entry->addr)) { ++ entry->addr = 0; ++ goto next_entry; ++ } ++ ++ j = vma_item_index(slot->vma, entry->item); ++ if (j == i) ++ goto next_entry; ++ ++ if (within_same_pool(slot, i, j)) ++ swap_entry = entry + j - i; ++ else ++ swap_entry = get_rmap_list_entry(slot, j, 1); ++ ++ swap_entries(entry, i, swap_entry, j); ++ if (!within_same_pool(slot, i, j)) ++ put_rmap_list_entry(slot, j); ++ continue; ++ ++skip_whole_pool: ++ i += PAGE_SIZE / sizeof(*entry); ++ if (i < slot->pages) ++ entry = get_rmap_list_entry(slot, i, 0); ++ continue; ++ ++next_entry: ++ if (i >= slot->pages - 1 || ++ !within_same_pool(slot, i, i + 1)) { ++ put_rmap_list_entry(slot, i); ++ if (i + 1 < slot->pages) ++ entry = get_rmap_list_entry(slot, i + 1, 0); ++ } else ++ entry++; ++ i++; ++ continue; ++ } ++ ++ /* free empty pool entries which contain no rmap_item */ ++ /* CAN be simplied to based on only pool_counts when bug freed !!!!! 
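++ * (The placement loop above is effectively a cycle sort: each entry
++ * is swapped directly to its home index j = vma_item_index(), and i
++ * only advances once it holds its own entry, so the total number of
++ * swaps is bounded by the number of entries.)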
*/ ++ for (i = 0; i < slot->pool_size; i++) { ++ unsigned char has_rmap; ++ void *addr; ++ ++ if (!slot->rmap_list_pool[i]) ++ continue; ++ ++ has_rmap = 0; ++ addr = kmap(slot->rmap_list_pool[i]); ++ BUG_ON(!addr); ++ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { ++ entry = (struct rmap_list_entry *)addr + j; ++ if (is_addr(entry->addr)) ++ continue; ++ if (!entry->item) ++ continue; ++ has_rmap = 1; ++ } ++ kunmap(slot->rmap_list_pool[i]); ++ if (!has_rmap) { ++ BUG_ON(slot->pool_counts[i]); ++ __free_page(slot->rmap_list_pool[i]); ++ slot->rmap_list_pool[i] = NULL; ++ } ++ } ++ ++ slot->flags &= ~UKSM_SLOT_NEED_SORT; ++} ++ ++/* ++ * vma_fully_scanned() - if all the pages in this slot have been scanned. ++ */ ++static inline int vma_fully_scanned(struct vma_slot *slot) ++{ ++ return slot->pages_scanned == slot->pages; ++} ++ ++/** ++ * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to ++ * its random permutation. This function is embedded with the random ++ * permutation index management code. ++ */ ++static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash) ++{ ++ unsigned long rand_range, addr, swap_index, scan_index; ++ struct rmap_item *item = NULL; ++ struct rmap_list_entry *scan_entry, *swap_entry = NULL; ++ struct page *page; ++ ++ scan_index = swap_index = slot->pages_scanned % slot->pages; ++ ++ if (pool_entry_boundary(scan_index)) ++ try_free_last_pool(slot, scan_index - 1); ++ ++ if (vma_fully_scanned(slot)) { ++ if (slot->flags & UKSM_SLOT_NEED_SORT) ++ slot->flags |= UKSM_SLOT_NEED_RERAND; ++ else ++ slot->flags &= ~UKSM_SLOT_NEED_RERAND; ++ if (slot->flags & UKSM_SLOT_NEED_SORT) ++ sort_rmap_entry_list(slot); ++ } ++ ++ scan_entry = get_rmap_list_entry(slot, scan_index, 1); ++ if (!scan_entry) ++ return NULL; ++ ++ if (entry_is_new(scan_entry)) { ++ scan_entry->addr = get_index_orig_addr(slot, scan_index); ++ set_is_addr(scan_entry->addr); ++ } ++ ++ if (slot->flags & UKSM_SLOT_NEED_RERAND) { ++ rand_range = slot->pages - scan_index; ++ BUG_ON(!rand_range); ++ swap_index = scan_index + (prandom_u32() % rand_range); ++ } ++ ++ if (swap_index != scan_index) { ++ swap_entry = get_rmap_list_entry(slot, swap_index, 1); ++ if (entry_is_new(swap_entry)) { ++ swap_entry->addr = get_index_orig_addr(slot, ++ swap_index); ++ set_is_addr(swap_entry->addr); ++ } ++ swap_entries(scan_entry, scan_index, swap_entry, swap_index); ++ } ++ ++ addr = get_entry_address(scan_entry); ++ item = get_entry_item(scan_entry); ++ BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start); ++ ++ page = follow_page(slot->vma, addr, FOLL_GET); ++ if (IS_ERR_OR_NULL(page)) ++ goto nopage; ++ ++ if (!PageAnon(page)) ++ goto putpage; ++ ++ /*check is zero_page pfn or uksm_zero_page*/ ++ if ((page_to_pfn(page) == zero_pfn) ++ || (page_to_pfn(page) == uksm_zero_pfn)) ++ goto putpage; ++ ++ flush_anon_page(slot->vma, page, addr); ++ flush_dcache_page(page); ++ ++ ++ *hash = page_hash(page, hash_strength, 1); ++ inc_uksm_pages_scanned(); ++ /*if the page content all zero, re-map to zero-page*/ ++ if (find_zero_page_hash(hash_strength, *hash)) { ++ if (!cmp_and_merge_zero_page(slot->vma, page)) { ++ slot->pages_merged++; ++ ++ /* For full-zero pages, no need to create rmap item */ ++ goto putpage; ++ } else { ++ inc_rshash_neg(memcmp_cost / 2); ++ } ++ } ++ ++ if (!item) { ++ item = alloc_rmap_item(); ++ if (item) { ++ /* It has already been zeroed */ ++ item->slot = slot; ++ item->address = addr; ++ item->entry_index = scan_index; ++ scan_entry->item = item; 
++ inc_rmap_list_pool_count(slot, scan_index);
++ } else
++ goto putpage;
++ }
++
++ BUG_ON(item->slot != slot);
++ /* the page may have changed */
++ item->page = page;
++ put_rmap_list_entry(slot, scan_index);
++ if (swap_entry)
++ put_rmap_list_entry(slot, swap_index);
++ return item;
++
++putpage:
++ put_page(page);
++ page = NULL;
++nopage:
++ /* no page, store addr back and free rmap_item if possible */
++ free_entry_item(scan_entry);
++ put_rmap_list_entry(slot, scan_index);
++ if (swap_entry)
++ put_rmap_list_entry(slot, swap_index);
++ return NULL;
++}
++
++static inline int in_stable_tree(struct rmap_item *rmap_item)
++{
++ return rmap_item->address & STABLE_FLAG;
++}
++
++/**
++ * scan_vma_one_page() - scan the next page in a vma_slot. Called with
++ * mmap_sem locked.
++ */
++static noinline void scan_vma_one_page(struct vma_slot *slot)
++{
++ u32 hash;
++ struct mm_struct *mm;
++ struct rmap_item *rmap_item = NULL;
++ struct vm_area_struct *vma = slot->vma;
++
++ mm = vma->vm_mm;
++ BUG_ON(!mm);
++ BUG_ON(!slot);
++
++ rmap_item = get_next_rmap_item(slot, &hash);
++ if (!rmap_item)
++ goto out1;
++
++ if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
++ goto out2;
++
++ cmp_and_merge_page(rmap_item, hash);
++out2:
++ put_page(rmap_item->page);
++out1:
++ slot->pages_scanned++;
++ slot->this_sampled++;
++ if (slot->fully_scanned_round != fully_scanned_round)
++ scanned_virtual_pages++;
++
++ if (vma_fully_scanned(slot))
++ slot->fully_scanned_round = fully_scanned_round;
++}
++
++static inline unsigned long rung_get_pages(struct scan_rung *rung)
++{
++ struct slot_tree_node *node;
++
++ if (!rung->vma_root.rnode)
++ return 0;
++
++ node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode);
++
++ return node->size;
++}
++
++#define RUNG_SAMPLED_MIN 3
++
++static inline
++void uksm_calc_rung_step(struct scan_rung *rung,
++ unsigned long page_time, unsigned long ratio)
++{
++ unsigned long sampled, pages;
++
++ /* will be fully scanned ? */
++ if (!rung->cover_msecs) {
++ rung->step = 1;
++ return;
++ }
++
++ sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
++ * ratio / page_time;
++
++ /*
++ * Before we finish a scan round and its expensive per-round jobs,
++ * we need a chance to estimate the per-page time, so the sampled
++ * number cannot be too small.
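++ *
++ * Reading of the formula above: cover_msecs is the wall-clock budget
++ * to cover this rung once; multiplying by the rung's CPU ratio (in
++ * units of 1/TIME_RATIO_SCALE) and dividing by the estimated per-page
++ * cost in nanoseconds (page_time) yields how many pages we can afford
++ * to sample, and step = pages / sampled then spreads those samples
++ * evenly across the rung.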
++ */ ++ if (sampled < RUNG_SAMPLED_MIN) ++ sampled = RUNG_SAMPLED_MIN; ++ ++ pages = rung_get_pages(rung); ++ if (likely(pages > sampled)) ++ rung->step = pages / sampled; ++ else ++ rung->step = 1; ++} ++ ++static inline int step_need_recalc(struct scan_rung *rung) ++{ ++ unsigned long pages, stepmax; ++ ++ pages = rung_get_pages(rung); ++ stepmax = pages / RUNG_SAMPLED_MIN; ++ ++ return pages && (rung->step > pages || ++ (stepmax && rung->step > stepmax)); ++} ++ ++static inline ++void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc) ++{ ++ struct vma_slot *slot; ++ ++ if (finished) ++ rung->flags |= UKSM_RUNG_ROUND_FINISHED; ++ ++ if (step_recalc || step_need_recalc(rung)) { ++ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); ++ BUG_ON(step_need_recalc(rung)); ++ } ++ ++ slot_iter_index = prandom_u32() % rung->step; ++ BUG_ON(!rung->vma_root.rnode); ++ slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter); ++ BUG_ON(!slot); ++ ++ rung->current_scan = slot; ++ rung->current_offset = slot_iter_index; ++} ++ ++static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot) ++{ ++ return &slot->rung->vma_root; ++} ++ ++/* ++ * return if resetted. ++ */ ++static int advance_current_scan(struct scan_rung *rung) ++{ ++ unsigned short n; ++ struct vma_slot *slot, *next = NULL; ++ ++ BUG_ON(!rung->vma_root.num); ++ ++ slot = rung->current_scan; ++ n = (slot->pages - rung->current_offset) % rung->step; ++ slot_iter_index = rung->step - n; ++ next = sradix_tree_next(&rung->vma_root, slot->snode, ++ slot->sindex, slot_iter); ++ ++ if (next) { ++ rung->current_offset = slot_iter_index; ++ rung->current_scan = next; ++ return 0; ++ } else { ++ reset_current_scan(rung, 1, 0); ++ return 1; ++ } ++} ++ ++static inline void rung_rm_slot(struct vma_slot *slot) ++{ ++ struct scan_rung *rung = slot->rung; ++ struct sradix_tree_root *root; ++ ++ if (rung->current_scan == slot) ++ advance_current_scan(rung); ++ ++ root = slot_get_root(slot); ++ sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex); ++ slot->snode = NULL; ++ if (step_need_recalc(rung)) { ++ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); ++ BUG_ON(step_need_recalc(rung)); ++ } ++ ++ /* In case advance_current_scan loop back to this slot again */ ++ if (rung->vma_root.num && rung->current_scan == slot) ++ reset_current_scan(slot->rung, 1, 0); ++} ++ ++static inline void rung_add_new_slots(struct scan_rung *rung, ++ struct vma_slot **slots, unsigned long num) ++{ ++ int err; ++ struct vma_slot *slot; ++ unsigned long i; ++ struct sradix_tree_root *root = &rung->vma_root; ++ ++ err = sradix_tree_enter(root, (void **)slots, num); ++ BUG_ON(err); ++ ++ for (i = 0; i < num; i++) { ++ slot = slots[i]; ++ slot->rung = rung; ++ BUG_ON(vma_fully_scanned(slot)); ++ } ++ ++ if (rung->vma_root.num == num) ++ reset_current_scan(rung, 0, 1); ++} ++ ++static inline int rung_add_one_slot(struct scan_rung *rung, ++ struct vma_slot *slot) ++{ ++ int err; ++ ++ err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1); ++ if (err) ++ return err; ++ ++ slot->rung = rung; ++ if (rung->vma_root.num == 1) ++ reset_current_scan(rung, 0, 1); ++ ++ return 0; ++} ++ ++/* ++ * Return true if the slot is deleted from its rung. 
++ */ ++static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung) ++{ ++ struct scan_rung *old_rung = slot->rung; ++ int err; ++ ++ if (old_rung == rung) ++ return 0; ++ ++ rung_rm_slot(slot); ++ err = rung_add_one_slot(rung, slot); ++ if (err) { ++ err = rung_add_one_slot(old_rung, slot); ++ WARN_ON(err); /* OOPS, badly OOM, we lost this slot */ ++ } ++ ++ return 1; ++} ++ ++static inline int vma_rung_up(struct vma_slot *slot) ++{ ++ struct scan_rung *rung; ++ ++ rung = slot->rung; ++ if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1]) ++ rung++; ++ ++ return vma_rung_enter(slot, rung); ++} ++ ++static inline int vma_rung_down(struct vma_slot *slot) ++{ ++ struct scan_rung *rung; ++ ++ rung = slot->rung; ++ if (slot->rung != &uksm_scan_ladder[0]) ++ rung--; ++ ++ return vma_rung_enter(slot, rung); ++} ++ ++/** ++ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. ++ */ ++static unsigned long cal_dedup_ratio(struct vma_slot *slot) ++{ ++ unsigned long ret; ++ unsigned long pages; ++ ++ pages = slot->this_sampled; ++ if (!pages) ++ return 0; ++ ++ BUG_ON(slot->pages_scanned == slot->last_scanned); ++ ++ ret = slot->pages_merged; ++ ++ /* Thrashing area filtering */ ++ if (ret && uksm_thrash_threshold) { ++ if (slot->pages_cowed * 100 / slot->pages_merged ++ > uksm_thrash_threshold) { ++ ret = 0; ++ } else { ++ ret = slot->pages_merged - slot->pages_cowed; ++ } ++ } ++ ++ return ret * 100 / pages; ++} ++ ++/** ++ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. ++ */ ++static unsigned long cal_dedup_ratio_old(struct vma_slot *slot) ++{ ++ unsigned long ret; ++ unsigned long pages; ++ ++ pages = slot->pages; ++ if (!pages) ++ return 0; ++ ++ ret = slot->pages_bemerged; ++ ++ /* Thrashing area filtering */ ++ if (ret && uksm_thrash_threshold) { ++ if (slot->pages_cowed * 100 / slot->pages_bemerged ++ > uksm_thrash_threshold) { ++ ret = 0; ++ } else { ++ ret = slot->pages_bemerged - slot->pages_cowed; ++ } ++ } ++ ++ return ret * 100 / pages; ++} ++ ++/** ++ * stable_node_reinsert() - When the hash_strength has been adjusted, the ++ * stable tree need to be restructured, this is the function re-inserting the ++ * stable node. 
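++ *
++ * (Context: the patch keeps two stable-tree roots; when the hash
++ * strength changes, stable_tree_delta_hash() below flips
++ * stable_tree_index and rebuilds into the idle root by re-inserting
++ * every node with its recomputed level-1 hash; nodes that still
++ * collide at both hash levels are left out of the rebuilt tree, with
++ * tree_node staying NULL.)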
++ */ ++static inline void stable_node_reinsert(struct stable_node *new_node, ++ struct page *page, ++ struct rb_root *root_treep, ++ struct list_head *tree_node_listp, ++ u32 hash) ++{ ++ struct rb_node **new = &root_treep->rb_node; ++ struct rb_node *parent = NULL; ++ struct stable_node *stable_node; ++ struct tree_node *tree_node; ++ struct page *tree_page; ++ int cmp; ++ ++ while (*new) { ++ int cmp; ++ ++ tree_node = rb_entry(*new, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else ++ break; ++ } ++ ++ if (*new) { ++ /* find a stable tree node with same first level hash value */ ++ stable_node_hash_max(new_node, page, hash); ++ if (tree_node->count == 1) { ++ stable_node = rb_entry(tree_node->sub_root.rb_node, ++ struct stable_node, node); ++ tree_page = get_uksm_page(stable_node, 1, 0); ++ if (tree_page) { ++ stable_node_hash_max(stable_node, ++ tree_page, hash); ++ put_page(tree_page); ++ ++ /* prepare for stable node insertion */ ++ ++ cmp = hash_cmp(new_node->hash_max, ++ stable_node->hash_max); ++ parent = &stable_node->node; ++ if (cmp < 0) ++ new = &parent->rb_left; ++ else if (cmp > 0) ++ new = &parent->rb_right; ++ else ++ goto failed; ++ ++ goto add_node; ++ } else { ++ /* the only stable_node deleted, the tree node ++ * was not deleted. ++ */ ++ goto tree_node_reuse; ++ } ++ } ++ ++ /* well, search the collision subtree */ ++ new = &tree_node->sub_root.rb_node; ++ parent = NULL; ++ BUG_ON(!*new); ++ while (*new) { ++ int cmp; ++ ++ stable_node = rb_entry(*new, struct stable_node, node); ++ ++ cmp = hash_cmp(new_node->hash_max, ++ stable_node->hash_max); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else { ++ /* oh, no, still a collision */ ++ goto failed; ++ } ++ } ++ ++ goto add_node; ++ } ++ ++ /* no tree node found */ ++ tree_node = alloc_tree_node(tree_node_listp); ++ if (!tree_node) { ++ pr_err("UKSM: memory allocation error!\n"); ++ goto failed; ++ } else { ++ tree_node->hash = hash; ++ rb_link_node(&tree_node->node, parent, new); ++ rb_insert_color(&tree_node->node, root_treep); ++ ++tree_node_reuse: ++ /* prepare for stable node insertion */ ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ ++add_node: ++ rb_link_node(&new_node->node, parent, new); ++ rb_insert_color(&new_node->node, &tree_node->sub_root); ++ new_node->tree_node = tree_node; ++ tree_node->count++; ++ return; ++ ++failed: ++ /* This can only happen when two nodes have collided ++ * in two levels. ++ */ ++ new_node->tree_node = NULL; ++ return; ++} ++ ++static inline void free_all_tree_nodes(struct list_head *list) ++{ ++ struct tree_node *node, *tmp; ++ ++ list_for_each_entry_safe(node, tmp, list, all_list) { ++ free_tree_node(node); ++ } ++} ++ ++/** ++ * stable_tree_delta_hash() - Delta hash the stable tree from previous hash ++ * strength to the current hash_strength. It re-structures the hole tree. 
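++ *
++ * Note the incremental trick below: for nodes already in the rbtree,
++ * the stored level-1 hash is adjusted from prev_hash_strength to the
++ * new hash_strength via delta_hash() over the mapped page, instead of
++ * recomputing page_hash() from scratch; only nodes that never made it
++ * into the rbtree take the full page_hash() path.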
++ */ ++static inline void stable_tree_delta_hash(u32 prev_hash_strength) ++{ ++ struct stable_node *node, *tmp; ++ struct rb_root *root_new_treep; ++ struct list_head *new_tree_node_listp; ++ ++ stable_tree_index = (stable_tree_index + 1) % 2; ++ root_new_treep = &root_stable_tree[stable_tree_index]; ++ new_tree_node_listp = &stable_tree_node_list[stable_tree_index]; ++ *root_new_treep = RB_ROOT; ++ BUG_ON(!list_empty(new_tree_node_listp)); ++ ++ /* ++ * we need to be safe, the node could be removed by get_uksm_page() ++ */ ++ list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) { ++ void *addr; ++ struct page *node_page; ++ u32 hash; ++ ++ /* ++ * We are completely re-structuring the stable nodes to a new ++ * stable tree. We don't want to touch the old tree unlinks and ++ * old tree_nodes. The old tree_nodes will be freed at once. ++ */ ++ node_page = get_uksm_page(node, 0, 0); ++ if (!node_page) ++ continue; ++ ++ if (node->tree_node) { ++ hash = node->tree_node->hash; ++ ++ addr = kmap_atomic(node_page); ++ ++ hash = delta_hash(addr, prev_hash_strength, ++ hash_strength, hash); ++ kunmap_atomic(addr); ++ } else { ++ /* ++ *it was not inserted to rbtree due to collision in last ++ *round scan. ++ */ ++ hash = page_hash(node_page, hash_strength, 0); ++ } ++ ++ stable_node_reinsert(node, node_page, root_new_treep, ++ new_tree_node_listp, hash); ++ put_page(node_page); ++ } ++ ++ root_stable_treep = root_new_treep; ++ free_all_tree_nodes(stable_tree_node_listp); ++ BUG_ON(!list_empty(stable_tree_node_listp)); ++ stable_tree_node_listp = new_tree_node_listp; ++} ++ ++static inline void inc_hash_strength(unsigned long delta) ++{ ++ hash_strength += 1 << delta; ++ if (hash_strength > HASH_STRENGTH_MAX) ++ hash_strength = HASH_STRENGTH_MAX; ++} ++ ++static inline void dec_hash_strength(unsigned long delta) ++{ ++ unsigned long change = 1 << delta; ++ ++ if (hash_strength <= change + 1) ++ hash_strength = 1; ++ else ++ hash_strength -= change; ++} ++ ++static inline void inc_hash_strength_delta(void) ++{ ++ hash_strength_delta++; ++ if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX) ++ hash_strength_delta = HASH_STRENGTH_DELTA_MAX; ++} ++ ++static inline unsigned long get_current_neg_ratio(void) ++{ ++ u64 pos = benefit.pos; ++ u64 neg = benefit.neg; ++ ++ if (!neg) ++ return 0; ++ ++ if (!pos || neg > pos) ++ return 100; ++ ++ if (neg > div64_u64(U64_MAX, 100)) ++ pos = div64_u64(pos, 100); ++ else ++ neg *= 100; ++ ++ return div64_u64(neg, pos); ++} ++ ++static inline unsigned long get_current_benefit(void) ++{ ++ u64 pos = benefit.pos; ++ u64 neg = benefit.neg; ++ u64 scanned = benefit.scanned; ++ ++ if (neg > pos) ++ return 0; ++ ++ return div64_u64((pos - neg), scanned); ++} ++ ++static inline int judge_rshash_direction(void) ++{ ++ u64 current_neg_ratio, stable_benefit; ++ u64 current_benefit, delta = 0; ++ int ret = STILL; ++ ++ /* ++ * Try to probe a value after the boot, and in case the system ++ * are still for a long time. 
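++ * (Concretely: fully_scanned_round & 0xFF == 10 forces an OBSCURE
++ * probe once every 256 fully-scanned rounds, so a long-quiet system
++ * still re-samples its hash strength occasionally.)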
++ */ ++ if ((fully_scanned_round & 0xFFULL) == 10) { ++ ret = OBSCURE; ++ goto out; ++ } ++ ++ current_neg_ratio = get_current_neg_ratio(); ++ ++ if (current_neg_ratio == 0) { ++ rshash_neg_cont_zero++; ++ if (rshash_neg_cont_zero > 2) ++ return GO_DOWN; ++ else ++ return STILL; ++ } ++ rshash_neg_cont_zero = 0; ++ ++ if (current_neg_ratio > 90) { ++ ret = GO_UP; ++ goto out; ++ } ++ ++ current_benefit = get_current_benefit(); ++ stable_benefit = rshash_state.stable_benefit; ++ ++ if (!stable_benefit) { ++ ret = OBSCURE; ++ goto out; ++ } ++ ++ if (current_benefit > stable_benefit) ++ delta = current_benefit - stable_benefit; ++ else if (current_benefit < stable_benefit) ++ delta = stable_benefit - current_benefit; ++ ++ delta = div64_u64(100 * delta, stable_benefit); ++ ++ if (delta > 50) { ++ rshash_cont_obscure++; ++ if (rshash_cont_obscure > 2) ++ return OBSCURE; ++ else ++ return STILL; ++ } ++ ++out: ++ rshash_cont_obscure = 0; ++ return ret; ++} ++ ++/** ++ * rshash_adjust() - The main function to control the random sampling state ++ * machine for hash strength adapting. ++ * ++ * return true if hash_strength has changed. ++ */ ++static inline int rshash_adjust(void) ++{ ++ unsigned long prev_hash_strength = hash_strength; ++ ++ if (!encode_benefit()) ++ return 0; ++ ++ switch (rshash_state.state) { ++ case RSHASH_STILL: ++ switch (judge_rshash_direction()) { ++ case GO_UP: ++ if (rshash_state.pre_direct == GO_DOWN) ++ hash_strength_delta = 0; ++ ++ inc_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ rshash_state.stable_benefit = get_current_benefit(); ++ rshash_state.pre_direct = GO_UP; ++ break; ++ ++ case GO_DOWN: ++ if (rshash_state.pre_direct == GO_UP) ++ hash_strength_delta = 0; ++ ++ dec_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ rshash_state.stable_benefit = get_current_benefit(); ++ rshash_state.pre_direct = GO_DOWN; ++ break; ++ ++ case OBSCURE: ++ rshash_state.stable_point = hash_strength; ++ rshash_state.turn_point_down = hash_strength; ++ rshash_state.turn_point_up = hash_strength; ++ rshash_state.turn_benefit_down = get_current_benefit(); ++ rshash_state.turn_benefit_up = get_current_benefit(); ++ rshash_state.lookup_window_index = 0; ++ rshash_state.state = RSHASH_TRYDOWN; ++ dec_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ break; ++ ++ case STILL: ++ break; ++ default: ++ BUG(); ++ } ++ break; ++ ++ case RSHASH_TRYDOWN: ++ if (rshash_state.lookup_window_index++ % 5 == 0) ++ rshash_state.below_count = 0; ++ ++ if (get_current_benefit() < rshash_state.stable_benefit) ++ rshash_state.below_count++; ++ else if (get_current_benefit() > ++ rshash_state.turn_benefit_down) { ++ rshash_state.turn_point_down = hash_strength; ++ rshash_state.turn_benefit_down = get_current_benefit(); ++ } ++ ++ if (rshash_state.below_count >= 3 || ++ judge_rshash_direction() == GO_UP || ++ hash_strength == 1) { ++ hash_strength = rshash_state.stable_point; ++ hash_strength_delta = 0; ++ inc_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ rshash_state.lookup_window_index = 0; ++ rshash_state.state = RSHASH_TRYUP; ++ hash_strength_delta = 0; ++ } else { ++ dec_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ } ++ break; ++ ++ case RSHASH_TRYUP: ++ if (rshash_state.lookup_window_index++ % 5 == 0) ++ rshash_state.below_count = 0; ++ ++ if (get_current_benefit() < rshash_state.turn_benefit_down) ++ rshash_state.below_count++; ++ else if (get_current_benefit() > 
rshash_state.turn_benefit_up) { ++ rshash_state.turn_point_up = hash_strength; ++ rshash_state.turn_benefit_up = get_current_benefit(); ++ } ++ ++ if (rshash_state.below_count >= 3 || ++ judge_rshash_direction() == GO_DOWN || ++ hash_strength == HASH_STRENGTH_MAX) { ++ hash_strength = rshash_state.turn_benefit_up > ++ rshash_state.turn_benefit_down ? ++ rshash_state.turn_point_up : ++ rshash_state.turn_point_down; ++ ++ rshash_state.state = RSHASH_PRE_STILL; ++ } else { ++ inc_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ } ++ ++ break; ++ ++ case RSHASH_NEW: ++ case RSHASH_PRE_STILL: ++ rshash_state.stable_benefit = get_current_benefit(); ++ rshash_state.state = RSHASH_STILL; ++ hash_strength_delta = 0; ++ break; ++ default: ++ BUG(); ++ } ++ ++ /* rshash_neg = rshash_pos = 0; */ ++ reset_benefit(); ++ ++ if (prev_hash_strength != hash_strength) ++ stable_tree_delta_hash(prev_hash_strength); ++ ++ return prev_hash_strength != hash_strength; ++} ++ ++/** ++ * round_update_ladder() - The main function to do update of all the ++ * adjustments whenever a scan round is finished. ++ */ ++static noinline void round_update_ladder(void) ++{ ++ int i; ++ unsigned long dedup; ++ struct vma_slot *slot, *tmp_slot; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) ++ uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED; ++ ++ list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) { ++ ++ /* slot may be rung_rm_slot() when mm exits */ ++ if (slot->snode) { ++ dedup = cal_dedup_ratio_old(slot); ++ if (dedup && dedup >= uksm_abundant_threshold) ++ vma_rung_up(slot); ++ } ++ ++ slot->pages_bemerged = 0; ++ slot->pages_cowed = 0; ++ ++ list_del_init(&slot->dedup_list); ++ } ++} ++ ++static void uksm_del_vma_slot(struct vma_slot *slot) ++{ ++ int i, j; ++ struct rmap_list_entry *entry; ++ ++ if (slot->snode) { ++ /* ++ * In case it just failed when entering the rung, it's not ++ * necessary. 
++ */
++ rung_rm_slot(slot);
++ }
++
++ if (!list_empty(&slot->dedup_list))
++ list_del(&slot->dedup_list);
++
++ if (!slot->rmap_list_pool || !slot->pool_counts) {
++ /* In case it OOMed in uksm_vma_enter() */
++ goto out;
++ }
++
++ for (i = 0; i < slot->pool_size; i++) {
++ void *addr;
++
++ if (!slot->rmap_list_pool[i])
++ continue;
++
++ addr = kmap(slot->rmap_list_pool[i]);
++ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
++ entry = (struct rmap_list_entry *)addr + j;
++ if (is_addr(entry->addr))
++ continue;
++ if (!entry->item)
++ continue;
++
++ remove_rmap_item_from_tree(entry->item);
++ free_rmap_item(entry->item);
++ slot->pool_counts[i]--;
++ }
++ BUG_ON(slot->pool_counts[i]);
++ kunmap(slot->rmap_list_pool[i]);
++ __free_page(slot->rmap_list_pool[i]);
++ }
++ kfree(slot->rmap_list_pool);
++ kfree(slot->pool_counts);
++
++out:
++ slot->rung = NULL;
++ if (slot->flags & UKSM_SLOT_IN_UKSM) {
++ BUG_ON(uksm_pages_total < slot->pages);
++ uksm_pages_total -= slot->pages;
++ }
++
++ if (slot->fully_scanned_round == fully_scanned_round)
++ scanned_virtual_pages -= slot->pages;
++ else
++ scanned_virtual_pages -= slot->pages_scanned;
++ free_vma_slot(slot);
++}
++
++
++#define SPIN_LOCK_PERIOD 32
++static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD];
++static inline void cleanup_vma_slots(void)
++{
++ struct vma_slot *slot;
++ int i;
++
++ i = 0;
++ spin_lock(&vma_slot_list_lock);
++ while (!list_empty(&vma_slot_del)) {
++ slot = list_entry(vma_slot_del.next,
++ struct vma_slot, slot_list);
++ list_del(&slot->slot_list);
++ cleanup_slots[i++] = slot;
++ if (i == SPIN_LOCK_PERIOD) {
++ spin_unlock(&vma_slot_list_lock);
++ while (--i >= 0)
++ uksm_del_vma_slot(cleanup_slots[i]);
++ i = 0;
++ spin_lock(&vma_slot_list_lock);
++ }
++ }
++ spin_unlock(&vma_slot_list_lock);
++
++ while (--i >= 0)
++ uksm_del_vma_slot(cleanup_slots[i]);
++}
++
++/*
++ * Exponential moving average formula
++ */
++static inline unsigned long ema(unsigned long curr, unsigned long last_ema)
++{
++ /*
++ * For a very high burst, even the ema cannot work well: a falsely
++ * high per-page time estimate feeds back into very high context
++ * switch and rung-update overhead, which in turn inflates the
++ * per-page time further, so it may never converge.
++ *
++ * Instead, we try to approach this value in a binary manner.
++ */
++ if (curr > last_ema * 10)
++ return last_ema * 2;
++
++ return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
++}
++
++/*
++ * Convert a cpu ratio, configured by the user in units of
++ * 1/TIME_RATIO_SCALE, to nanoseconds based on the current
++ * uksm_sleep_jiffies.
++ */
++static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio)
++{
++ return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) /
++ (TIME_RATIO_SCALE - ratio) * ratio;
++}
++
++
++static inline unsigned long rung_real_ratio(int cpu_time_ratio)
++{
++ unsigned long ret;
++
++ BUG_ON(!cpu_time_ratio);
++
++ if (cpu_time_ratio > 0)
++ ret = cpu_time_ratio;
++ else
++ ret = (unsigned long)(-cpu_time_ratio) *
++ uksm_max_cpu_percentage / 100UL;
++
++ return ret ?
ret : 1; ++} ++ ++static noinline void uksm_calc_scan_pages(void) ++{ ++ struct scan_rung *ladder = uksm_scan_ladder; ++ unsigned long sleep_usecs, nsecs; ++ unsigned long ratio; ++ int i; ++ unsigned long per_page; ++ ++ if (uksm_ema_page_time > 100000 || ++ (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL)) ++ uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; ++ ++ per_page = uksm_ema_page_time; ++ BUG_ON(!per_page); ++ ++ /* ++ * For every 8 eval round, we try to probe a uksm_sleep_jiffies value ++ * based on saved user input. ++ */ ++ if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL) ++ uksm_sleep_jiffies = uksm_sleep_saved; ++ ++ /* We require a rung scan at least 1 page in a period. */ ++ nsecs = per_page; ++ ratio = rung_real_ratio(ladder[0].cpu_ratio); ++ if (cpu_ratio_to_nsec(ratio) < nsecs) { ++ sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio ++ / NSEC_PER_USEC; ++ uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1; ++ } ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ ratio = rung_real_ratio(ladder[i].cpu_ratio); ++ ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) / ++ per_page; ++ BUG_ON(!ladder[i].pages_to_scan); ++ uksm_calc_rung_step(&ladder[i], per_page, ratio); ++ } ++} ++ ++/* ++ * From the scan time of this round (ns) to next expected min sleep time ++ * (ms), be careful of the possible overflows. ratio is taken from ++ * rung_real_ratio() ++ */ ++static inline ++unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio) ++{ ++ scan_time >>= 20; /* to msec level now */ ++ BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE)); ++ ++ return (unsigned int) ((unsigned long) scan_time * ++ (TIME_RATIO_SCALE - ratio) / ratio); ++} ++ ++#define __round_mask(x, y) ((__typeof__(x))((y)-1)) ++#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) ++ ++static void uksm_vma_enter(struct vma_slot **slots, unsigned long num) ++{ ++ struct scan_rung *rung; ++ ++ rung = &uksm_scan_ladder[0]; ++ rung_add_new_slots(rung, slots, num); ++} ++ ++static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE]; ++ ++static void uksm_enter_all_slots(void) ++{ ++ struct vma_slot *slot; ++ unsigned long index; ++ struct list_head empty_vma_list; ++ int i; ++ ++ i = 0; ++ index = 0; ++ INIT_LIST_HEAD(&empty_vma_list); ++ ++ spin_lock(&vma_slot_list_lock); ++ while (!list_empty(&vma_slot_new)) { ++ slot = list_entry(vma_slot_new.next, ++ struct vma_slot, slot_list); ++ ++ if (!slot->vma->anon_vma) { ++ list_move(&slot->slot_list, &empty_vma_list); ++ } else if (vma_can_enter(slot->vma)) { ++ batch_slots[index++] = slot; ++ list_del_init(&slot->slot_list); ++ } else { ++ list_move(&slot->slot_list, &vma_slot_noadd); ++ } ++ ++ if (++i == SPIN_LOCK_PERIOD || ++ (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) { ++ spin_unlock(&vma_slot_list_lock); ++ ++ if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) { ++ uksm_vma_enter(batch_slots, index); ++ index = 0; ++ } ++ i = 0; ++ cond_resched(); ++ spin_lock(&vma_slot_list_lock); ++ } ++ } ++ ++ list_splice(&empty_vma_list, &vma_slot_new); ++ ++ spin_unlock(&vma_slot_list_lock); ++ ++ if (index) ++ uksm_vma_enter(batch_slots, index); ++ ++} ++ ++static inline int rung_round_finished(struct scan_rung *rung) ++{ ++ return rung->flags & UKSM_RUNG_ROUND_FINISHED; ++} ++ ++static inline void judge_slot(struct vma_slot *slot) ++{ ++ struct scan_rung *rung = slot->rung; ++ unsigned long dedup; ++ int deleted; ++ ++ dedup = cal_dedup_ratio(slot); ++ if (vma_fully_scanned(slot) && uksm_thrash_threshold) ++ deleted = 
vma_rung_enter(slot, &uksm_scan_ladder[0]); ++ else if (dedup && dedup >= uksm_abundant_threshold) ++ deleted = vma_rung_up(slot); ++ else ++ deleted = vma_rung_down(slot); ++ ++ slot->pages_merged = 0; ++ slot->pages_cowed = 0; ++ slot->this_sampled = 0; ++ ++ if (vma_fully_scanned(slot)) ++ slot->pages_scanned = 0; ++ ++ slot->last_scanned = slot->pages_scanned; ++ ++ /* If its deleted in above, then rung was already advanced. */ ++ if (!deleted) ++ advance_current_scan(rung); ++} ++ ++ ++static inline int hash_round_finished(void) ++{ ++ if (scanned_virtual_pages > (uksm_pages_total >> 2)) { ++ scanned_virtual_pages = 0; ++ if (uksm_pages_scanned) ++ fully_scanned_round++; ++ ++ return 1; ++ } else { ++ return 0; ++ } ++} ++ ++#define UKSM_MMSEM_BATCH 5 ++#define BUSY_RETRY 100 ++ ++/** ++ * uksm_do_scan() - the main worker function. ++ */ ++static noinline void uksm_do_scan(void) ++{ ++ struct vma_slot *slot, *iter; ++ struct mm_struct *busy_mm; ++ unsigned char round_finished, all_rungs_emtpy; ++ int i, err, mmsem_batch; ++ unsigned long pcost; ++ long long delta_exec; ++ unsigned long vpages, max_cpu_ratio; ++ unsigned long long start_time, end_time, scan_time; ++ unsigned int expected_jiffies; ++ ++ might_sleep(); ++ ++ vpages = 0; ++ ++ start_time = task_sched_runtime(current); ++ max_cpu_ratio = 0; ++ mmsem_batch = 0; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE;) { ++ struct scan_rung *rung = &uksm_scan_ladder[i]; ++ unsigned long ratio; ++ int busy_retry; ++ ++ if (!rung->pages_to_scan) { ++ i++; ++ continue; ++ } ++ ++ if (!rung->vma_root.num) { ++ rung->pages_to_scan = 0; ++ i++; ++ continue; ++ } ++ ++ ratio = rung_real_ratio(rung->cpu_ratio); ++ if (ratio > max_cpu_ratio) ++ max_cpu_ratio = ratio; ++ ++ busy_retry = BUSY_RETRY; ++ /* ++ * Do not consider rung_round_finished() here, just used up the ++ * rung->pages_to_scan quota. 
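++ *
++ * (Each rung's pages_to_scan acts as its per-wakeup quota here, and
++ * mmsem_batch lets the loop hold one mm's mmap_sem across up to
++ * UKSM_MMSEM_BATCH consecutive pages of the same slot to cut the
++ * down_read()/up_read() churn.)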
++ */ ++ while (rung->pages_to_scan && rung->vma_root.num && ++ likely(!freezing(current))) { ++ int reset = 0; ++ ++ slot = rung->current_scan; ++ ++ BUG_ON(vma_fully_scanned(slot)); ++ ++ if (mmsem_batch) ++ err = 0; ++ else ++ err = try_down_read_slot_mmap_sem(slot); ++ ++ if (err == -ENOENT) { ++rm_slot: ++ rung_rm_slot(slot); ++ continue; ++ } ++ ++ busy_mm = slot->mm; ++ ++ if (err == -EBUSY) { ++ /* skip other vmas on the same mm */ ++ do { ++ reset = advance_current_scan(rung); ++ iter = rung->current_scan; ++ busy_retry--; ++ if (iter->vma->vm_mm != busy_mm || ++ !busy_retry || reset) ++ break; ++ } while (1); ++ ++ if (iter->vma->vm_mm != busy_mm) { ++ continue; ++ } else { ++ /* scan round finsished */ ++ break; ++ } ++ } ++ ++ BUG_ON(!vma_can_enter(slot->vma)); ++ if (uksm_test_exit(slot->vma->vm_mm)) { ++ mmsem_batch = 0; ++ up_read(&slot->vma->vm_mm->mmap_sem); ++ goto rm_slot; ++ } ++ ++ if (mmsem_batch) ++ mmsem_batch--; ++ else ++ mmsem_batch = UKSM_MMSEM_BATCH; ++ ++ /* Ok, we have take the mmap_sem, ready to scan */ ++ scan_vma_one_page(slot); ++ rung->pages_to_scan--; ++ vpages++; ++ ++ if (rung->current_offset + rung->step > slot->pages - 1 ++ || vma_fully_scanned(slot)) { ++ up_read(&slot->vma->vm_mm->mmap_sem); ++ judge_slot(slot); ++ mmsem_batch = 0; ++ } else { ++ rung->current_offset += rung->step; ++ if (!mmsem_batch) ++ up_read(&slot->vma->vm_mm->mmap_sem); ++ } ++ ++ busy_retry = BUSY_RETRY; ++ cond_resched(); ++ } ++ ++ if (mmsem_batch) { ++ up_read(&slot->vma->vm_mm->mmap_sem); ++ mmsem_batch = 0; ++ } ++ ++ if (freezing(current)) ++ break; ++ ++ cond_resched(); ++ } ++ end_time = task_sched_runtime(current); ++ delta_exec = end_time - start_time; ++ ++ if (freezing(current)) ++ return; ++ ++ cleanup_vma_slots(); ++ uksm_enter_all_slots(); ++ ++ round_finished = 1; ++ all_rungs_emtpy = 1; ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ struct scan_rung *rung = &uksm_scan_ladder[i]; ++ ++ if (rung->vma_root.num) { ++ all_rungs_emtpy = 0; ++ if (!rung_round_finished(rung)) ++ round_finished = 0; ++ } ++ } ++ ++ if (all_rungs_emtpy) ++ round_finished = 0; ++ ++ if (round_finished) { ++ round_update_ladder(); ++ uksm_eval_round++; ++ ++ if (hash_round_finished() && rshash_adjust()) { ++ /* Reset the unstable root iff hash strength changed */ ++ uksm_hash_round++; ++ root_unstable_tree = RB_ROOT; ++ free_all_tree_nodes(&unstable_tree_node_list); ++ } ++ ++ /* ++ * A number of pages can hang around indefinitely on per-cpu ++ * pagevecs, raised page count preventing write_protect_page ++ * from merging them. Though it doesn't really matter much, ++ * it is puzzling to see some stuck in pages_volatile until ++ * other activity jostles them out, and they also prevented ++ * LTP's KSM test from succeeding deterministically; so drain ++ * them here (here rather than on entry to uksm_do_scan(), ++ * so we don't IPI too often when pages_to_scan is set low). 
++ */ ++ lru_add_drain_all(); ++ } ++ ++ ++ if (vpages && delta_exec > 0) { ++ pcost = (unsigned long) delta_exec / vpages; ++ if (likely(uksm_ema_page_time)) ++ uksm_ema_page_time = ema(pcost, uksm_ema_page_time); ++ else ++ uksm_ema_page_time = pcost; ++ } ++ ++ uksm_calc_scan_pages(); ++ uksm_sleep_real = uksm_sleep_jiffies; ++ /* in case of radical cpu bursts, apply the upper bound */ ++ end_time = task_sched_runtime(current); ++ if (max_cpu_ratio && end_time > start_time) { ++ scan_time = end_time - start_time; ++ expected_jiffies = msecs_to_jiffies( ++ scan_time_to_sleep(scan_time, max_cpu_ratio)); ++ ++ if (expected_jiffies > uksm_sleep_real) ++ uksm_sleep_real = expected_jiffies; ++ ++ /* We have a 1 second up bound for responsiveness. */ ++ if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC) ++ uksm_sleep_real = msecs_to_jiffies(1000); ++ } ++ ++ return; ++} ++ ++static int ksmd_should_run(void) ++{ ++ return uksm_run & UKSM_RUN_MERGE; ++} ++ ++static int uksm_scan_thread(void *nothing) ++{ ++ set_freezable(); ++ set_user_nice(current, 5); ++ ++ while (!kthread_should_stop()) { ++ mutex_lock(&uksm_thread_mutex); ++ if (ksmd_should_run()) ++ uksm_do_scan(); ++ mutex_unlock(&uksm_thread_mutex); ++ ++ try_to_freeze(); ++ ++ if (ksmd_should_run()) { ++ schedule_timeout_interruptible(uksm_sleep_real); ++ uksm_sleep_times++; ++ } else { ++ wait_event_freezable(uksm_thread_wait, ++ ksmd_should_run() || kthread_should_stop()); ++ } ++ } ++ return 0; ++} ++ ++void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) ++{ ++ struct stable_node *stable_node; ++ struct node_vma *node_vma; ++ struct rmap_item *rmap_item; ++ int search_new_forks = 0; ++ unsigned long address; ++ ++ VM_BUG_ON_PAGE(!PageKsm(page), page); ++ VM_BUG_ON_PAGE(!PageLocked(page), page); ++ ++ stable_node = page_stable_node(page); ++ if (!stable_node) ++ return; ++again: ++ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { ++ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { ++ struct anon_vma *anon_vma = rmap_item->anon_vma; ++ struct anon_vma_chain *vmac; ++ struct vm_area_struct *vma; ++ ++ cond_resched(); ++ anon_vma_lock_read(anon_vma); ++ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, ++ 0, ULONG_MAX) { ++ cond_resched(); ++ vma = vmac->vma; ++ address = get_rmap_addr(rmap_item); ++ ++ if (address < vma->vm_start || ++ address >= vma->vm_end) ++ continue; ++ ++ if ((rmap_item->slot->vma == vma) == ++ search_new_forks) ++ continue; ++ ++ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) ++ continue; ++ ++ if (!rwc->rmap_one(page, vma, address, rwc->arg)) { ++ anon_vma_unlock_read(anon_vma); ++ return; ++ } ++ ++ if (rwc->done && rwc->done(page)) { ++ anon_vma_unlock_read(anon_vma); ++ return; ++ } ++ } ++ anon_vma_unlock_read(anon_vma); ++ } ++ } ++ if (!search_new_forks++) ++ goto again; ++} ++ ++#ifdef CONFIG_MIGRATION ++/* Common ksm interface but may be specific to uksm */ ++void ksm_migrate_page(struct page *newpage, struct page *oldpage) ++{ ++ struct stable_node *stable_node; ++ ++ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); ++ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); ++ VM_BUG_ON(newpage->mapping != oldpage->mapping); ++ ++ stable_node = page_stable_node(newpage); ++ if (stable_node) { ++ VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); ++ stable_node->kpfn = page_to_pfn(newpage); ++ /* ++ * newpage->mapping was set in advance; now we need smp_wmb() ++ * to make sure that the new stable_node->kpfn is visible ++ * to get_ksm_page() before it 
can see that oldpage->mapping ++ * has gone stale (or that PageSwapCache has been cleared). ++ */ ++ smp_wmb(); ++ set_page_stable_node(oldpage, NULL); ++ } ++} ++#endif /* CONFIG_MIGRATION */ ++ ++#ifdef CONFIG_MEMORY_HOTREMOVE ++static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn, ++ unsigned long end_pfn) ++{ ++ struct rb_node *node; ++ ++ for (node = rb_first(root_stable_treep); node; node = rb_next(node)) { ++ struct stable_node *stable_node; ++ ++ stable_node = rb_entry(node, struct stable_node, node); ++ if (stable_node->kpfn >= start_pfn && ++ stable_node->kpfn < end_pfn) ++ return stable_node; ++ } ++ return NULL; ++} ++ ++static int uksm_memory_callback(struct notifier_block *self, ++ unsigned long action, void *arg) ++{ ++ struct memory_notify *mn = arg; ++ struct stable_node *stable_node; ++ ++ switch (action) { ++ case MEM_GOING_OFFLINE: ++ /* ++ * Keep it very simple for now: just lock out ksmd and ++ * MADV_UNMERGEABLE while any memory is going offline. ++ * mutex_lock_nested() is necessary because lockdep was alarmed ++ * that here we take uksm_thread_mutex inside notifier chain ++ * mutex, and later take notifier chain mutex inside ++ * uksm_thread_mutex to unlock it. But that's safe because both ++ * are inside mem_hotplug_mutex. ++ */ ++ mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING); ++ break; ++ ++ case MEM_OFFLINE: ++ /* ++ * Most of the work is done by page migration; but there might ++ * be a few stable_nodes left over, still pointing to struct ++ * pages which have been offlined: prune those from the tree. ++ */ ++ while ((stable_node = uksm_check_stable_tree(mn->start_pfn, ++ mn->start_pfn + mn->nr_pages)) != NULL) ++ remove_node_from_stable_tree(stable_node, 1, 1); ++ /* fallthrough */ ++ ++ case MEM_CANCEL_OFFLINE: ++ mutex_unlock(&uksm_thread_mutex); ++ break; ++ } ++ return NOTIFY_OK; ++} ++#endif /* CONFIG_MEMORY_HOTREMOVE */ ++ ++#ifdef CONFIG_SYSFS ++/* ++ * This all compiles without CONFIG_SYSFS, but is a waste of space. 
++ */ ++ ++#define UKSM_ATTR_RO(_name) \ ++ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) ++#define UKSM_ATTR(_name) \ ++ static struct kobj_attribute _name##_attr = \ ++ __ATTR(_name, 0644, _name##_show, _name##_store) ++ ++static ssize_t max_cpu_percentage_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_max_cpu_percentage); ++} ++ ++static ssize_t max_cpu_percentage_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ unsigned long max_cpu_percentage; ++ int err; ++ ++ err = kstrtoul(buf, 10, &max_cpu_percentage); ++ if (err || max_cpu_percentage > 100) ++ return -EINVAL; ++ ++ if (max_cpu_percentage == 100) ++ max_cpu_percentage = 99; ++ else if (max_cpu_percentage < 10) ++ max_cpu_percentage = 10; ++ ++ uksm_max_cpu_percentage = max_cpu_percentage; ++ ++ return count; ++} ++UKSM_ATTR(max_cpu_percentage); ++ ++static ssize_t sleep_millisecs_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies)); ++} ++ ++static ssize_t sleep_millisecs_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ unsigned long msecs; ++ int err; ++ ++ err = kstrtoul(buf, 10, &msecs); ++ if (err || msecs > MSEC_PER_SEC) ++ return -EINVAL; ++ ++ uksm_sleep_jiffies = msecs_to_jiffies(msecs); ++ uksm_sleep_saved = uksm_sleep_jiffies; ++ ++ return count; ++} ++UKSM_ATTR(sleep_millisecs); ++ ++ ++static ssize_t cpu_governor_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); ++ int i; ++ ++ buf[0] = '\0'; ++ for (i = 0; i < n ; i++) { ++ if (uksm_cpu_governor == i) ++ strcat(buf, "["); ++ ++ strcat(buf, uksm_cpu_governor_str[i]); ++ ++ if (uksm_cpu_governor == i) ++ strcat(buf, "]"); ++ ++ strcat(buf, " "); ++ } ++ strcat(buf, "\n"); ++ ++ return strlen(buf); ++} ++ ++static inline void init_performance_values(void) ++{ ++ int i; ++ struct scan_rung *rung; ++ struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor; ++ ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = uksm_scan_ladder + i; ++ rung->cpu_ratio = preset->cpu_ratio[i]; ++ rung->cover_msecs = preset->cover_msecs[i]; ++ } ++ ++ uksm_max_cpu_percentage = preset->max_cpu; ++} ++ ++static ssize_t cpu_governor_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); ++ ++ for (n--; n >= 0 ; n--) { ++ if (!strncmp(buf, uksm_cpu_governor_str[n], ++ strlen(uksm_cpu_governor_str[n]))) ++ break; ++ } ++ ++ if (n < 0) ++ return -EINVAL; ++ else ++ uksm_cpu_governor = n; ++ ++ init_performance_values(); ++ ++ return count; ++} ++UKSM_ATTR(cpu_governor); ++ ++static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_run); ++} ++ ++static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int err; ++ unsigned long flags; ++ ++ err = kstrtoul(buf, 10, &flags); ++ if (err || flags > UINT_MAX) ++ return -EINVAL; ++ if (flags > UKSM_RUN_MERGE) ++ return -EINVAL; ++ ++ mutex_lock(&uksm_thread_mutex); ++ if (uksm_run != flags) ++ uksm_run = flags; ++ mutex_unlock(&uksm_thread_mutex); ++ ++ if (flags & UKSM_RUN_MERGE) ++ wake_up_interruptible(&uksm_thread_wait); ++ ++ return count; ++} 
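++/*
++ * Usage sketch for the attribute defined below (assuming the uksm
++ * kobject is registered under /sys/kernel/mm/uksm, mirroring mainline
++ * KSM's /sys/kernel/mm/ksm):
++ *
++ *   echo 1 > /sys/kernel/mm/uksm/run    # set UKSM_RUN_MERGE, wake ksmd
++ *   echo 0 > /sys/kernel/mm/uksm/run    # stop scanning
++ */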
++UKSM_ATTR(run);
++
++static ssize_t abundant_threshold_show(struct kobject *kobj,
++				struct kobj_attribute *attr, char *buf)
++{
++	return sprintf(buf, "%u\n", uksm_abundant_threshold);
++}
++
++static ssize_t abundant_threshold_store(struct kobject *kobj,
++				struct kobj_attribute *attr,
++				const char *buf, size_t count)
++{
++	int err;
++	unsigned long flags;
++
++	err = kstrtoul(buf, 10, &flags);
++	if (err || flags > 99)
++		return -EINVAL;
++
++	uksm_abundant_threshold = flags;
++
++	return count;
++}
++UKSM_ATTR(abundant_threshold);
++
++static ssize_t thrash_threshold_show(struct kobject *kobj,
++				struct kobj_attribute *attr, char *buf)
++{
++	return sprintf(buf, "%u\n", uksm_thrash_threshold);
++}
++
++static ssize_t thrash_threshold_store(struct kobject *kobj,
++				struct kobj_attribute *attr,
++				const char *buf, size_t count)
++{
++	int err;
++	unsigned long flags;
++
++	err = kstrtoul(buf, 10, &flags);
++	if (err || flags > 99)
++		return -EINVAL;
++
++	uksm_thrash_threshold = flags;
++
++	return count;
++}
++UKSM_ATTR(thrash_threshold);
++
++static ssize_t cpu_ratios_show(struct kobject *kobj,
++			       struct kobj_attribute *attr, char *buf)
++{
++	int i, size;
++	struct scan_rung *rung;
++	char *p = buf;
++
++	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++		rung = &uksm_scan_ladder[i];
++
++		if (rung->cpu_ratio > 0)
++			size = sprintf(p, "%d ", rung->cpu_ratio);
++		else
++			size = sprintf(p, "MAX/%d ",
++				TIME_RATIO_SCALE / -rung->cpu_ratio);
++
++		p += size;
++	}
++
++	*p++ = '\n';
++	*p = '\0';
++
++	return p - buf;
++}
++
++static ssize_t cpu_ratios_store(struct kobject *kobj,
++				      struct kobj_attribute *attr,
++				      const char *buf, size_t count)
++{
++	int i, cpuratios[SCAN_LADDER_SIZE], err;
++	unsigned long value;
++	struct scan_rung *rung;
++	char *p, *orig, *end = NULL;
++	ssize_t ret = count;
++
++	/* +1 so the copy is always NUL-terminated for strchr()/kstrtoul() */
++	p = kzalloc(count + 1, GFP_KERNEL);
++	if (!p)
++		return -ENOMEM;
++
++	memcpy(p, buf, count);
++	orig = p;
++
++	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++		if (i != SCAN_LADDER_SIZE - 1) {
++			end = strchr(p, ' ');
++			if (!end) {
++				ret = -EINVAL;
++				goto out;
++			}
++
++			*end = '\0';
++		}
++
++		if (strstr(p, "MAX/")) {
++			p = strchr(p, '/') + 1;
++			err = kstrtoul(p, 10, &value);
++			if (err || value > TIME_RATIO_SCALE || !value) {
++				ret = -EINVAL;
++				goto out;
++			}
++
++			cpuratios[i] = -(int) (TIME_RATIO_SCALE / value);
++		} else {
++			err = kstrtoul(p, 10, &value);
++			if (err || value > TIME_RATIO_SCALE || !value) {
++				ret = -EINVAL;
++				goto out;
++			}
++
++			cpuratios[i] = value;
++		}
++
++		p = end + 1;
++	}
++
++	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++		rung = &uksm_scan_ladder[i];
++
++		rung->cpu_ratio = cpuratios[i];
++	}
++
++out:
++	/* free the scratch copy on all paths; p itself has advanced */
++	kfree(orig);
++	return ret;
++}
++UKSM_ATTR(cpu_ratios);
++
++static ssize_t eval_intervals_show(struct kobject *kobj,
++				 struct kobj_attribute *attr, char *buf)
++{
++	int i, size;
++	struct scan_rung *rung;
++	char *p = buf;
++
++	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++		rung = &uksm_scan_ladder[i];
++		size = sprintf(p, "%u ", rung->cover_msecs);
++		p += size;
++	}
++
++	*p++ = '\n';
++	*p = '\0';
++
++	return p - buf;
++}
++
++static ssize_t eval_intervals_store(struct kobject *kobj,
++				 struct kobj_attribute *attr,
++				 const char *buf, size_t count)
++{
++	int i, err;
++	unsigned long values[SCAN_LADDER_SIZE];
++	struct scan_rung *rung;
++	char *p, *orig, *end = NULL;
++	ssize_t ret = count;
++
++	p = kzalloc(count + 2, GFP_KERNEL);
++	if (!p)
++		return -ENOMEM;
++
++	memcpy(p, buf, count);
++	orig = p;
++
++	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++		if (i != SCAN_LADDER_SIZE - 1) {
++			end = strchr(p, ' ');
++			if (!end) {
++				ret = -EINVAL;
++				goto out;
++			}
++
++			*end = '\0';
++		}
++
++		err = kstrtoul(p, 10, &values[i]);
++		if (err) {
++			ret = -EINVAL;
++			goto out;
++		}
++
++		p = end + 1;
++	}
++
++	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++		rung = &uksm_scan_ladder[i];
++
++		rung->cover_msecs = values[i];
++	}
++
++out:
++	/* p has been advanced past each token; free the saved base pointer */
++	kfree(orig);
++	return ret;
++}
++UKSM_ATTR(eval_intervals);
++
++static ssize_t ema_per_page_time_show(struct kobject *kobj,
++				 struct kobj_attribute *attr, char *buf)
++{
++	return sprintf(buf, "%lu\n", uksm_ema_page_time);
++}
++UKSM_ATTR_RO(ema_per_page_time);
++
++static ssize_t pages_shared_show(struct kobject *kobj,
++				 struct kobj_attribute *attr, char *buf)
++{
++	return sprintf(buf, "%lu\n", uksm_pages_shared);
++}
++UKSM_ATTR_RO(pages_shared);
++
++static ssize_t pages_sharing_show(struct kobject *kobj,
++				  struct kobj_attribute *attr, char *buf)
++{
++	return sprintf(buf, "%lu\n", uksm_pages_sharing);
++}
++UKSM_ATTR_RO(pages_sharing);
++
++static ssize_t pages_unshared_show(struct kobject *kobj,
++				   struct kobj_attribute *attr, char *buf)
++{
++	return sprintf(buf, "%lu\n", uksm_pages_unshared);
++}
++UKSM_ATTR_RO(pages_unshared);
++
++static ssize_t full_scans_show(struct kobject *kobj,
++			       struct kobj_attribute *attr, char *buf)
++{
++	return sprintf(buf, "%llu\n", fully_scanned_round);
++}
++UKSM_ATTR_RO(full_scans);
++
++static ssize_t pages_scanned_show(struct kobject *kobj,
++				  struct kobj_attribute *attr, char *buf)
++{
++	unsigned long base = 0;
++	u64 delta, ret;
++
++	if (pages_scanned_stored) {
++		base = pages_scanned_base;
++		ret = pages_scanned_stored;
++		delta = uksm_pages_scanned >> base;
++		if (CAN_OVERFLOW_U64(ret, delta)) {
++			ret >>= 1;
++			delta >>= 1;
++			base++;
++			ret += delta;
++		}
++	} else {
++		ret = uksm_pages_scanned;
++	}
++
++	while (ret > ULONG_MAX) {
++		ret >>= 1;
++		base++;
++	}
++
++	if (base)
++		return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
++	else
++		return sprintf(buf, "%lu\n", (unsigned long)ret);
++}
++UKSM_ATTR_RO(pages_scanned);
++
++static ssize_t hash_strength_show(struct kobject *kobj,
++				  struct kobj_attribute *attr, char *buf)
++{
++	return sprintf(buf, "%lu\n", hash_strength);
++}
++UKSM_ATTR_RO(hash_strength);
++
++static ssize_t sleep_times_show(struct kobject *kobj,
++				struct kobj_attribute *attr, char *buf)
++{
++	return sprintf(buf, "%llu\n", uksm_sleep_times);
++}
++UKSM_ATTR_RO(sleep_times);
++
++
++static struct attribute *uksm_attrs[] = {
++	&max_cpu_percentage_attr.attr,
++	&sleep_millisecs_attr.attr,
++	&cpu_governor_attr.attr,
++	&run_attr.attr,
++	&ema_per_page_time_attr.attr,
++	&pages_shared_attr.attr,
++	&pages_sharing_attr.attr,
++	&pages_unshared_attr.attr,
++	&full_scans_attr.attr,
++	&pages_scanned_attr.attr,
++	&hash_strength_attr.attr,
++	&sleep_times_attr.attr,
++	&thrash_threshold_attr.attr,
++	&abundant_threshold_attr.attr,
++	&cpu_ratios_attr.attr,
++	&eval_intervals_attr.attr,
++	NULL,
++};
++
++static struct attribute_group uksm_attr_group = {
++	.attrs = uksm_attrs,
++	.name = "uksm",
++};
++#endif /* CONFIG_SYSFS */
++
++static inline void init_scan_ladder(void)
++{
++	int i;
++	struct scan_rung *rung;
++
++	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++		rung = uksm_scan_ladder + i;
++		slot_tree_init_root(&rung->vma_root);
++	}
++
++	init_performance_values();
++	uksm_calc_scan_pages();
++}
++
++static inline int cal_positive_negative_costs(void)
++{
++	struct page *p1, *p2;
++	unsigned char *addr1, *addr2;
++	unsigned long i, time_start, hash_cost;
++	unsigned long loopnum = 0;
++
++	/*IMPORTANT: volatile is needed to prevent over-optimization by gcc. */
++	volatile u32 hash;
++	volatile int ret;
++
++	p1 = alloc_page(GFP_KERNEL);
++	if (!p1)
++		return -ENOMEM;
++
++	p2 = alloc_page(GFP_KERNEL);
++	if (!p2) {
++		/* don't leak p1 when the second allocation fails */
++		__free_page(p1);
++		return -ENOMEM;
++	}
++
++	addr1 = kmap_atomic(p1);
++	addr2 = kmap_atomic(p2);
++	memset(addr1, prandom_u32(), PAGE_SIZE);
++	memcpy(addr2, addr1, PAGE_SIZE);
++
++	/* make sure that the two pages differ in last byte */
++	addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
++	kunmap_atomic(addr2);
++	kunmap_atomic(addr1);
++
++	time_start = jiffies;
++	while (jiffies - time_start < 100) {
++		for (i = 0; i < 100; i++)
++			hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
++		loopnum += 100;
++	}
++	hash_cost = (jiffies - time_start);
++
++	time_start = jiffies;
++	for (i = 0; i < loopnum; i++)
++		ret = pages_identical(p1, p2);
++	memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
++	memcmp_cost /= hash_cost;
++	pr_info("UKSM: relative memcmp_cost = %lu "
++		"hash=%u cmp_ret=%d.\n",
++		memcmp_cost, hash, ret);
++
++	__free_page(p1);
++	__free_page(p2);
++	return 0;
++}
++
++static int init_zeropage_hash_table(void)
++{
++	struct page *page;
++	char *addr;
++	int i;
++
++	page = alloc_page(GFP_KERNEL);
++	if (!page)
++		return -ENOMEM;
++
++	addr = kmap_atomic(page);
++	memset(addr, 0, PAGE_SIZE);
++	kunmap_atomic(addr);
++
++	zero_hash_table = kmalloc_array(HASH_STRENGTH_MAX, sizeof(u32),
++					GFP_KERNEL);
++	if (!zero_hash_table) {
++		/* don't leak the scratch page when the table allocation fails */
++		__free_page(page);
++		return -ENOMEM;
++	}
++
++	for (i = 0; i < HASH_STRENGTH_MAX; i++)
++		zero_hash_table[i] = page_hash(page, i, 0);
++
++	__free_page(page);
++
++	return 0;
++}
++
++static inline int init_random_sampling(void)
++{
++	unsigned long i;
++
++	random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
++	if (!random_nums)
++		return -ENOMEM;
++
++	for (i = 0; i < HASH_STRENGTH_FULL; i++)
++		random_nums[i] = i;
++
++	for (i = 0; i < HASH_STRENGTH_FULL; i++) {
++		unsigned long rand_range, swap_index, tmp;
++
++		rand_range = HASH_STRENGTH_FULL - i;
++		swap_index = i + prandom_u32() % rand_range;
++		tmp = random_nums[i];
++		random_nums[i] = random_nums[swap_index];
++		random_nums[swap_index] = tmp;
++	}
++
++	rshash_state.state = RSHASH_NEW;
++	rshash_state.below_count = 0;
++	rshash_state.lookup_window_index = 0;
++
++	return cal_positive_negative_costs();
++}
++
++static int __init uksm_slab_init(void)
++{
++	rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
++	if (!rmap_item_cache)
++		goto out;
++
++	stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
++	if (!stable_node_cache)
++		goto out_free1;
++
++	node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0);
++	if (!node_vma_cache)
++		goto out_free2;
++
++	vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0);
++	if (!vma_slot_cache)
++		goto out_free3;
++
++	tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0);
++	if (!tree_node_cache)
++		goto out_free4;
++
++	return 0;
++
++out_free4:
++	kmem_cache_destroy(vma_slot_cache);
++out_free3:
++	kmem_cache_destroy(node_vma_cache);
++out_free2:
++	kmem_cache_destroy(stable_node_cache);
++out_free1:
++	kmem_cache_destroy(rmap_item_cache);
++out:
++	return -ENOMEM;
++}
++
++static void __init uksm_slab_free(void)
++{
++	kmem_cache_destroy(stable_node_cache);
++	kmem_cache_destroy(rmap_item_cache);
++	kmem_cache_destroy(node_vma_cache);
++	kmem_cache_destroy(vma_slot_cache);
++	kmem_cache_destroy(tree_node_cache);
++}
++
++/* Common interface to ksm, but the behaviour differs. 
*/ ++int ksm_madvise(struct vm_area_struct *vma, unsigned long start, ++ unsigned long end, int advice, unsigned long *vm_flags) ++{ ++ int err; ++ ++ switch (advice) { ++ case MADV_MERGEABLE: ++ return 0; /* just ignore the advice */ ++ ++ case MADV_UNMERGEABLE: ++ if (!(*vm_flags & VM_MERGEABLE) || !uksm_flags_can_scan(*vm_flags)) ++ return 0; /* just ignore the advice */ ++ ++ if (vma->anon_vma) { ++ err = unmerge_uksm_pages(vma, start, end); ++ if (err) ++ return err; ++ } ++ ++ uksm_remove_vma(vma); ++ *vm_flags &= ~VM_MERGEABLE; ++ break; ++ } ++ ++ return 0; ++} ++ ++/* Common interface to ksm, actually the same. */ ++struct page *ksm_might_need_to_copy(struct page *page, ++ struct vm_area_struct *vma, unsigned long address) ++{ ++ struct anon_vma *anon_vma = page_anon_vma(page); ++ struct page *new_page; ++ ++ if (PageKsm(page)) { ++ if (page_stable_node(page)) ++ return page; /* no need to copy it */ ++ } else if (!anon_vma) { ++ return page; /* no need to copy it */ ++ } else if (anon_vma->root == vma->anon_vma->root && ++ page->index == linear_page_index(vma, address)) { ++ return page; /* still no need to copy it */ ++ } ++ if (!PageUptodate(page)) ++ return page; /* let do_swap_page report the error */ ++ ++ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); ++ if (new_page) { ++ copy_user_highpage(new_page, page, address, vma); ++ ++ SetPageDirty(new_page); ++ __SetPageUptodate(new_page); ++ __SetPageLocked(new_page); ++ } ++ ++ return new_page; ++} ++ ++static int __init uksm_init(void) ++{ ++ struct task_struct *uksm_thread; ++ int err; ++ ++ uksm_sleep_jiffies = msecs_to_jiffies(100); ++ uksm_sleep_saved = uksm_sleep_jiffies; ++ ++ slot_tree_init(); ++ init_scan_ladder(); ++ ++ ++ err = init_random_sampling(); ++ if (err) ++ goto out_free2; ++ ++ err = uksm_slab_init(); ++ if (err) ++ goto out_free1; ++ ++ err = init_zeropage_hash_table(); ++ if (err) ++ goto out_free0; ++ ++ uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd"); ++ if (IS_ERR(uksm_thread)) { ++ pr_err("uksm: creating kthread failed\n"); ++ err = PTR_ERR(uksm_thread); ++ goto out_free; ++ } ++ ++#ifdef CONFIG_SYSFS ++ err = sysfs_create_group(mm_kobj, &uksm_attr_group); ++ if (err) { ++ pr_err("uksm: register sysfs failed\n"); ++ kthread_stop(uksm_thread); ++ goto out_free; ++ } ++#else ++ uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */ ++ ++#endif /* CONFIG_SYSFS */ ++ ++#ifdef CONFIG_MEMORY_HOTREMOVE ++ /* ++ * Choose a high priority since the callback takes uksm_thread_mutex: ++ * later callbacks could only be taking locks which nest within that. 
++	 */
++	hotplug_memory_notifier(uksm_memory_callback, 100);
++#endif
++	return 0;
++
++out_free:
++	kfree(zero_hash_table);
++out_free0:
++	uksm_slab_free();
++out_free1:
++	kfree(random_nums);
++out_free2:
++	kfree(uksm_scan_ladder);
++	return err;
++}
++
++#ifdef MODULE
++subsys_initcall(uksm_init);
++#else
++late_initcall(uksm_init);
++#endif
++
+diff -Nur a/mm/vmstat.c b/mm/vmstat.c
+--- a/mm/vmstat.c	2019-02-06 16:30:16.000000000 +0000
++++ b/mm/vmstat.c	2019-02-09 17:23:06.736864024 +0000
+@@ -1163,6 +1163,9 @@
+ 	"nr_written",
+ 	"", /* nr_indirectly_reclaimable */
+ 
++#ifdef CONFIG_UKSM
++	"nr_uksm_zero_pages",
++#endif
+ 	/* enum writeback_stat_item counters */
+ 	"nr_dirty_threshold",
+ 	"nr_dirty_background_threshold",
diff --git a/sys-kernel/linux-sources-redcore-lts/files/Revert-ath10k-activate-user-space-firmware-loading.patch b/sys-kernel/linux-sources-redcore-lts/files/Revert-ath10k-activate-user-space-firmware-loading.patch
deleted file mode 100644
index 28f9b2f6..00000000
--- a/sys-kernel/linux-sources-redcore-lts/files/Revert-ath10k-activate-user-space-firmware-loading.patch
+++ /dev/null
@@ -1,13 +0,0 @@
-diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c
-index a4f635820..9b4c4facf 100644
---- a/drivers/net/wireless/ath/ath10k/core.c
-+++ b/drivers/net/wireless/ath/ath10k/core.c
-@@ -519,7 +519,7 @@ static const struct firmware *ath10k_fetch_fw_file(struct ath10k *ar,
- 		dir = ".";
- 
- 	snprintf(filename, sizeof(filename), "%s/%s", dir, file);
--	ret = request_firmware(&fw, filename, ar->dev);
-+	ret = request_firmware_direct(&fw, filename, ar->dev);
- 	ath10k_dbg(ar, ATH10K_DBG_BOOT, "boot fw request '%s': %d\n",
- 		   filename, ret);
- 
diff --git a/sys-kernel/linux-sources-redcore-lts/files/introduce-NUMA-identity-node-sched-domain.patch b/sys-kernel/linux-sources-redcore-lts/files/introduce-NUMA-identity-node-sched-domain.patch
deleted file mode 100644
index 2376edae..00000000
--- a/sys-kernel/linux-sources-redcore-lts/files/introduce-NUMA-identity-node-sched-domain.patch
+++ /dev/null
@@ -1,46 +0,0 @@
-diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
-index 808998fe1..18d3321ef 100644
---- a/kernel/sched/topology.c
-+++ b/kernel/sched/topology.c
-@@ -1339,6 +1339,10 @@ void sched_init_numa(void)
- 	if (!sched_domains_numa_distance)
- 		return;
- 
-+	/* Includes NUMA identity node at level 0. */
-+	sched_domains_numa_distance[level++] = curr_distance;
-+	sched_domains_numa_levels = level;
-+
- 	/*
- 	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
- 	 * unique distances in the node_distance() table.
-@@ -1386,8 +1390,7 @@ void sched_init_numa(void)
- 		return;
- 
- 	/*
--	 * 'level' contains the number of unique distances, excluding the
--	 * identity distance node_distance(i,i).
-+	 * 'level' contains the number of unique distances
- 	 *
- 	 * The sched_domains_numa_distance[] array includes the actual distance
- 	 * numbers.
-@@ -1448,10 +1451,19 @@ void sched_init_numa(void)
- 	for (i = 0; sched_domain_topology[i].mask; i++)
- 		tl[i] = sched_domain_topology[i];
- 
-+	/*
-+	 * Add the NUMA identity distance, aka single NODE.
-+	 */
-+	tl[i++] = (struct sched_domain_topology_level){
-+		.mask = sd_numa_mask,
-+		.numa_level = 0,
-+		SD_INIT_NAME(NODE)
-+	};
-+
- 	/*
- 	 * .. and append 'j' levels of NUMA goodness.
- */ -- for (j = 0; j < level; i++, j++) { -+ for (j = 1; j < level; i++, j++) { - tl[i] = (struct sched_domain_topology_level){ - .mask = sd_numa_mask, - .sd_flags = cpu_numa_flags, diff --git a/sys-kernel/linux-sources-redcore-lts/files/k10temp-add-ZEN-support.patch b/sys-kernel/linux-sources-redcore-lts/files/k10temp-add-ZEN-support.patch deleted file mode 100644 index b1e8a9b0..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/k10temp-add-ZEN-support.patch +++ /dev/null @@ -1,177 +0,0 @@ -diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c -index ce3b91f22..0721e1756 100644 ---- a/drivers/hwmon/k10temp.c -+++ b/drivers/hwmon/k10temp.c -@@ -36,6 +36,10 @@ MODULE_PARM_DESC(force, "force loading on processors with erratum 319"); - /* Provide lock for writing to NB_SMU_IND_ADDR */ - static DEFINE_MUTEX(nb_smu_ind_mutex); - -+#ifndef PCI_DEVICE_ID_AMD_17H_DF_F3 -+#define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463 -+#endif -+ - /* CPUID function 0x80000001, ebx */ - #define CPUID_PKGTYPE_MASK 0xf0000000 - #define CPUID_PKGTYPE_F 0x00000000 -@@ -61,31 +65,72 @@ static DEFINE_MUTEX(nb_smu_ind_mutex); - */ - #define F15H_M60H_REPORTED_TEMP_CTRL_OFFSET 0xd8200ca4 - --static void amd_nb_smu_index_read(struct pci_dev *pdev, unsigned int devfn, -- int offset, u32 *val) -+/* F17h M01h Access througn SMN */ -+#define F17H_M01H_REPORTED_TEMP_CTRL_OFFSET 0x00059800 -+ -+struct k10temp_data { -+ struct pci_dev *pdev; -+ void (*read_tempreg)(struct pci_dev *pdev, u32 *regval); -+ int temp_offset; -+}; -+ -+struct tctl_offset { -+ u8 model; -+ char const *id; -+ int offset; -+}; -+ -+static const struct tctl_offset tctl_offset_table[] = { -+ { 0x17, "AMD Ryzen 5 1600X", 20000 }, -+ { 0x17, "AMD Ryzen 7 1700X", 20000 }, -+ { 0x17, "AMD Ryzen 7 1800X", 20000 }, -+ { 0x17, "AMD Ryzen Threadripper 1950X", 27000 }, -+ { 0x17, "AMD Ryzen Threadripper 1920X", 27000 }, -+ { 0x17, "AMD Ryzen Threadripper 1950", 10000 }, -+ { 0x17, "AMD Ryzen Threadripper 1920", 10000 }, -+ { 0x17, "AMD Ryzen Threadripper 1910", 10000 }, -+}; -+ -+static void read_tempreg_pci(struct pci_dev *pdev, u32 *regval) -+{ -+ pci_read_config_dword(pdev, REG_REPORTED_TEMPERATURE, regval); -+} -+ -+static void amd_nb_index_read(struct pci_dev *pdev, unsigned int devfn, -+ unsigned int base, int offset, u32 *val) - { - mutex_lock(&nb_smu_ind_mutex); - pci_bus_write_config_dword(pdev->bus, devfn, -- 0xb8, offset); -+ base, offset); - pci_bus_read_config_dword(pdev->bus, devfn, -- 0xbc, val); -+ base + 4, val); - mutex_unlock(&nb_smu_ind_mutex); - } - -+static void read_tempreg_nb_f15(struct pci_dev *pdev, u32 *regval) -+{ -+ amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0xb8, -+ F15H_M60H_REPORTED_TEMP_CTRL_OFFSET, regval); -+} -+ -+static void read_tempreg_nb_f17(struct pci_dev *pdev, u32 *regval) -+{ -+ amd_nb_index_read(pdev, PCI_DEVFN(0, 0), 0x60, -+ F17H_M01H_REPORTED_TEMP_CTRL_OFFSET, regval); -+} -+ - static ssize_t temp1_input_show(struct device *dev, - struct device_attribute *attr, char *buf) - { -+ struct k10temp_data *data = dev_get_drvdata(dev); - u32 regval; -- struct pci_dev *pdev = dev_get_drvdata(dev); -- -- if (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model == 0x60) { -- amd_nb_smu_index_read(pdev, PCI_DEVFN(0, 0), -- F15H_M60H_REPORTED_TEMP_CTRL_OFFSET, -- ®val); -- } else { -- pci_read_config_dword(pdev, REG_REPORTED_TEMPERATURE, ®val); -- } -- return sprintf(buf, "%u\n", (regval >> 21) * 125); -+ unsigned int temp; -+ -+ data->read_tempreg(data->pdev, ®val); -+ temp = (regval >> 21) * 125; -+ temp -= 
data->temp_offset; -+ -+ return sprintf(buf, "%u\n", temp); - } - - static ssize_t temp1_max_show(struct device *dev, -@@ -98,11 +143,12 @@ static ssize_t show_temp_crit(struct device *dev, - struct device_attribute *devattr, char *buf) - { - struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); -+ struct k10temp_data *data = dev_get_drvdata(dev); - int show_hyst = attr->index; - u32 regval; - int value; - -- pci_read_config_dword(dev_get_drvdata(dev), -+ pci_read_config_dword(data->pdev, - REG_HARDWARE_THERMAL_CONTROL, ®val); - value = ((regval >> 16) & 0x7f) * 500 + 52000; - if (show_hyst) -@@ -119,7 +165,8 @@ static umode_t k10temp_is_visible(struct kobject *kobj, - struct attribute *attr, int index) - { - struct device *dev = container_of(kobj, struct device, kobj); -- struct pci_dev *pdev = dev_get_drvdata(dev); -+ struct k10temp_data *data = dev_get_drvdata(dev); -+ struct pci_dev *pdev = data->pdev; - - if (index >= 2) { - u32 reg_caps, reg_htc; -@@ -187,7 +234,9 @@ static int k10temp_probe(struct pci_dev *pdev, - { - int unreliable = has_erratum_319(pdev); - struct device *dev = &pdev->dev; -+ struct k10temp_data *data; - struct device *hwmon_dev; -+ int i; - - if (unreliable) { - if (!force) { -@@ -199,7 +248,31 @@ static int k10temp_probe(struct pci_dev *pdev, - "unreliable CPU thermal sensor; check erratum 319\n"); - } - -- hwmon_dev = devm_hwmon_device_register_with_groups(dev, "k10temp", pdev, -+ data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); -+ if (!data) -+ return -ENOMEM; -+ -+ data->pdev = pdev; -+ -+ if (boot_cpu_data.x86 == 0x15 && (boot_cpu_data.x86_model == 0x60 || -+ boot_cpu_data.x86_model == 0x70)) -+ data->read_tempreg = read_tempreg_nb_f15; -+ else if (boot_cpu_data.x86 == 0x17) -+ data->read_tempreg = read_tempreg_nb_f17; -+ else -+ data->read_tempreg = read_tempreg_pci; -+ -+ for (i = 0; i < ARRAY_SIZE(tctl_offset_table); i++) { -+ const struct tctl_offset *entry = &tctl_offset_table[i]; -+ -+ if (boot_cpu_data.x86 == entry->model && -+ strstr(boot_cpu_data.x86_model_id, entry->id)) { -+ data->temp_offset = entry->offset; -+ break; -+ } -+ } -+ -+ hwmon_dev = devm_hwmon_device_register_with_groups(dev, "k10temp", data, - k10temp_groups); - return PTR_ERR_OR_ZERO(hwmon_dev); - } -@@ -214,6 +287,7 @@ static const struct pci_device_id k10temp_id_table[] = { - { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) }, - { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, - { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, -+ { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, - {} - }; - MODULE_DEVICE_TABLE(pci, k10temp_id_table); diff --git a/sys-kernel/linux-sources-redcore-lts/files/linux-hardened.patch b/sys-kernel/linux-sources-redcore-lts/files/linux-hardened.patch deleted file mode 100644 index d07e831b..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/linux-hardened.patch +++ /dev/null @@ -1,2868 +0,0 @@ -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 5f3d58142600..c5566972d058 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -490,16 +490,6 @@ - nosocket -- Disable socket memory accounting. - nokmem -- Disable kernel memory accounting. - -- checkreqprot [SELINUX] Set initial checkreqprot flag value. -- Format: { "0" | "1" } -- See security/selinux/Kconfig help text. -- 0 -- check protection applied by kernel (includes -- any implied execute protection). 
-- 1 -- check protection requested by application. -- Default value is set via a kernel config option. -- Value can be changed at runtime via -- /selinux/checkreqprot. -- - cio_ignore= [S390] - See Documentation/s390/CommonIO for details. - clk_ignore_unused -@@ -2981,6 +2971,11 @@ - the specified number of seconds. This is to be used if - your oopses keep scrolling off the screen. - -+ extra_latent_entropy -+ Enable a very simple form of latent entropy extraction -+ from the first 4GB of memory as the bootmem allocator -+ passes the memory pages to the buddy allocator. -+ - pcbit= [HW,ISDN] - - pcd. [PARIDE] -diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt -index 694968c7523c..002d86416ef8 100644 ---- a/Documentation/sysctl/kernel.txt -+++ b/Documentation/sysctl/kernel.txt -@@ -91,6 +91,7 @@ show up in /proc/sys/kernel: - - sysctl_writes_strict - - tainted - - threads-max -+- tiocsti_restrict - - unknown_nmi_panic - - watchdog - - watchdog_thresh -@@ -999,6 +1000,26 @@ available RAM pages threads-max is reduced accordingly. - - ============================================================== - -+tiocsti_restrict: -+ -+This toggle indicates whether unprivileged users are prevented -+from using the TIOCSTI ioctl to inject commands into other processes -+which share a tty session. -+ -+When tiocsti_restrict is set to (0) there are no restrictions(accept -+the default restriction of only being able to injection commands into -+one's own tty). When tiocsti_restrict is set to (1), users must -+have CAP_SYS_ADMIN to use the TIOCSTI ioctl. -+ -+When user namespaces are in use, the check for the capability -+CAP_SYS_ADMIN is done against the user namespace that originally -+opened the tty. -+ -+The kernel config option CONFIG_SECURITY_TIOCSTI_RESTRICT sets the -+default value of tiocsti_restrict. -+ -+============================================================== -+ - unknown_nmi_panic: - - The value in this file affects behavior of handling NMI. When the -diff --git a/Makefile b/Makefile -index 280c7193e246..c869bc294766 100644 ---- a/Makefile -+++ b/Makefile -@@ -714,6 +714,9 @@ endif - KBUILD_CFLAGS += $(stackp-flag) - - ifeq ($(cc-name),clang) -+ifdef CONFIG_LOCAL_INIT -+KBUILD_CFLAGS += -fsanitize=local-init -+endif - KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) - KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier) - KBUILD_CFLAGS += $(call cc-disable-warning, gnu) -diff --git a/arch/Kconfig b/arch/Kconfig -index 77b3e21c4844..3dff252446ac 100644 ---- a/arch/Kconfig -+++ b/arch/Kconfig -@@ -446,6 +446,11 @@ config GCC_PLUGIN_LATENT_ENTROPY - is some slowdown of the boot process (about 0.5%) and fork and - irq processing. - -+ When extra_latent_entropy is passed on the kernel command line, -+ entropy will be extracted from up to the first 4GB of RAM while the -+ runtime memory allocator is being initialized. This costs even more -+ slowdown of the boot process. -+ - Note that entropy extracted this way is not cryptographically - secure! - -@@ -539,7 +544,7 @@ config CC_STACKPROTECTOR - choice - prompt "Stack Protector buffer overflow detection" - depends on HAVE_CC_STACKPROTECTOR -- default CC_STACKPROTECTOR_NONE -+ default CC_STACKPROTECTOR_STRONG - help - This option turns on the "stack-protector" GCC feature. 
This - feature puts, at the beginning of functions, a canary value on -@@ -741,7 +746,7 @@ config ARCH_MMAP_RND_BITS - int "Number of bits to use for ASLR of mmap base address" if EXPERT - range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX - default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT -- default ARCH_MMAP_RND_BITS_MIN -+ default ARCH_MMAP_RND_BITS_MAX - depends on HAVE_ARCH_MMAP_RND_BITS - help - This value can be used to select the number of bits to use to -@@ -775,7 +780,7 @@ config ARCH_MMAP_RND_COMPAT_BITS - int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT - range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX - default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT -- default ARCH_MMAP_RND_COMPAT_BITS_MIN -+ default ARCH_MMAP_RND_COMPAT_BITS_MAX - depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS - help - This value can be used to select the number of bits to use to -@@ -958,6 +963,7 @@ config ARCH_HAS_REFCOUNT - - config REFCOUNT_FULL - bool "Perform full reference count validation at the expense of speed" -+ default y - help - Enabling this switches the refcounting infrastructure from a fast - unchecked atomic_t implementation to a fully state checked -diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig -index c30cd78b6918..ba32a283f027 100644 ---- a/arch/arm64/Kconfig -+++ b/arch/arm64/Kconfig -@@ -926,6 +926,7 @@ endif - - config ARM64_SW_TTBR0_PAN - bool "Emulate Privileged Access Never using TTBR0_EL1 switching" -+ default y - help - Enabling this option prevents the kernel from accessing - user-space memory directly by pointing TTBR0_EL1 to a reserved -@@ -1052,6 +1053,7 @@ config RANDOMIZE_BASE - bool "Randomize the address of the kernel image" - select ARM64_MODULE_PLTS if MODULES - select RELOCATABLE -+ default y - help - Randomizes the virtual address at which the kernel image is - loaded, as a security feature that deters exploit attempts -diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug -index cc6bd559af85..01d5442d4722 100644 ---- a/arch/arm64/Kconfig.debug -+++ b/arch/arm64/Kconfig.debug -@@ -45,6 +45,7 @@ config ARM64_RANDOMIZE_TEXT_OFFSET - config DEBUG_WX - bool "Warn on W+X mappings at boot" - select ARM64_PTDUMP_CORE -+ default y - ---help--- - Generate a warning if any W+X mappings are found at boot. - -diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig -index b05796578e7a..8f6e2099717d 100644 ---- a/arch/arm64/configs/defconfig -+++ b/arch/arm64/configs/defconfig -@@ -1,4 +1,3 @@ --CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_AUDIT=y - CONFIG_NO_HZ_IDLE=y -diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h -index 33be513ef24c..6f0c0e3ef0dd 100644 ---- a/arch/arm64/include/asm/elf.h -+++ b/arch/arm64/include/asm/elf.h -@@ -114,10 +114,10 @@ - - /* - * This is the base location for PIE (ET_DYN with INTERP) loads. On -- * 64-bit, this is above 4GB to leave the entire 32-bit address -+ * 64-bit, this is raised to 4GB to leave the entire 32-bit address - * space open for things that want to use the area for 32-bit pointers. - */ --#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) -+#define ELF_ET_DYN_BASE 0x100000000UL - - #ifndef __ASSEMBLY__ - -@@ -158,10 +158,10 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, - /* 1GB of VA */ - #ifdef CONFIG_COMPAT - #define STACK_RND_MASK (test_thread_flag(TIF_32BIT) ? 
\ -- 0x7ff >> (PAGE_SHIFT - 12) : \ -- 0x3ffff >> (PAGE_SHIFT - 12)) -+ ((1UL << mmap_rnd_compat_bits) - 1) >> (PAGE_SHIFT - 12) : \ -+ ((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) - #else --#define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12)) -+#define STACK_RND_MASK (((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) - #endif - - #ifdef __AARCH64EB__ -diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c -index 9e773732520c..91359f45b5fc 100644 ---- a/arch/arm64/kernel/process.c -+++ b/arch/arm64/kernel/process.c -@@ -419,9 +419,9 @@ unsigned long arch_align_stack(unsigned long sp) - unsigned long arch_randomize_brk(struct mm_struct *mm) - { - if (is_compat_task()) -- return randomize_page(mm->brk, SZ_32M); -+ return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; - else -- return randomize_page(mm->brk, SZ_1G); -+ return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; - } - - /* -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 4f393eb9745f..1a31f8fc82ed 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -1145,8 +1145,7 @@ config VM86 - default X86_LEGACY_VM86 - - config X86_16BIT -- bool "Enable support for 16-bit segments" if EXPERT -- default y -+ bool "Enable support for 16-bit segments" - depends on MODIFY_LDT_SYSCALL - ---help--- - This option is required by programs like Wine to run 16-bit -@@ -2220,7 +2219,7 @@ config COMPAT_VDSO - choice - prompt "vsyscall table for legacy applications" - depends on X86_64 -- default LEGACY_VSYSCALL_EMULATE -+ default LEGACY_VSYSCALL_NONE - help - Legacy user code that does not know how to find the vDSO expects - to be able to issue three syscalls by calling fixed addresses in -@@ -2310,8 +2309,7 @@ config CMDLINE_OVERRIDE - be set to 'N' under normal conditions. - - config MODIFY_LDT_SYSCALL -- bool "Enable the LDT (local descriptor table)" if EXPERT -- default y -+ bool "Enable the LDT (local descriptor table)" - ---help--- - Linux can allow user programs to install a per-process x86 - Local Descriptor Table (LDT) using the modify_ldt(2) system -diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug -index 6293a8768a91..add82e0f1df3 100644 ---- a/arch/x86/Kconfig.debug -+++ b/arch/x86/Kconfig.debug -@@ -101,6 +101,7 @@ config EFI_PGT_DUMP - config DEBUG_WX - bool "Warn on W+X mappings at boot" - select X86_PTDUMP_CORE -+ default y - ---help--- - Generate a warning if any W+X mappings are found at boot. - -diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig -index e32fc1f274d8..d08acc76502a 100644 ---- a/arch/x86/configs/x86_64_defconfig -+++ b/arch/x86/configs/x86_64_defconfig -@@ -1,5 +1,4 @@ - # CONFIG_LOCALVERSION_AUTO is not set --CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_TASKSTATS=y -diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c -index 1911310959f8..bba8dbbc07a8 100644 ---- a/arch/x86/entry/vdso/vma.c -+++ b/arch/x86/entry/vdso/vma.c -@@ -203,55 +203,9 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) - } - - #ifdef CONFIG_X86_64 --/* -- * Put the vdso above the (randomized) stack with another randomized -- * offset. This way there is no hole in the middle of address space. -- * To save memory make sure it is still in the same PTE as the stack -- * top. This doesn't give that many random bits. -- * -- * Note that this algorithm is imperfect: the distribution of the vdso -- * start address within a PMD is biased toward the end. -- * -- * Only used for the 64-bit and x32 vdsos. 
-- */ --static unsigned long vdso_addr(unsigned long start, unsigned len) --{ -- unsigned long addr, end; -- unsigned offset; -- -- /* -- * Round up the start address. It can start out unaligned as a result -- * of stack start randomization. -- */ -- start = PAGE_ALIGN(start); -- -- /* Round the lowest possible end address up to a PMD boundary. */ -- end = (start + len + PMD_SIZE - 1) & PMD_MASK; -- if (end >= TASK_SIZE_MAX) -- end = TASK_SIZE_MAX; -- end -= len; -- -- if (end > start) { -- offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); -- addr = start + (offset << PAGE_SHIFT); -- } else { -- addr = start; -- } -- -- /* -- * Forcibly align the final address in case we have a hardware -- * issue that requires alignment for performance reasons. -- */ -- addr = align_vdso_addr(addr); -- -- return addr; --} -- - static int map_vdso_randomized(const struct vdso_image *image) - { -- unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start); -- -- return map_vdso(image, addr); -+ return map_vdso(image, 0); - } - #endif - -diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h -index 3a091cea36c5..0931c05a3348 100644 ---- a/arch/x86/include/asm/elf.h -+++ b/arch/x86/include/asm/elf.h -@@ -249,11 +249,11 @@ extern int force_personality32; - - /* - * This is the base location for PIE (ET_DYN with INTERP) loads. On -- * 64-bit, this is above 4GB to leave the entire 32-bit address -+ * 64-bit, this is raised to 4GB to leave the entire 32-bit address - * space open for things that want to use the area for 32-bit pointers. - */ - #define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ -- (DEFAULT_MAP_WINDOW / 3 * 2)) -+ 0x100000000UL) - - /* This yields a mask that user programs can use to figure out what - instruction set this CPU supports. This could be done in user space, -@@ -312,8 +312,8 @@ extern unsigned long get_mmap_base(int is_legacy); - - #ifdef CONFIG_X86_32 - --#define __STACK_RND_MASK(is32bit) (0x7ff) --#define STACK_RND_MASK (0x7ff) -+#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) -+#define STACK_RND_MASK ((1UL << mmap_rnd_bits) - 1) - - #define ARCH_DLINFO ARCH_DLINFO_IA32 - -@@ -322,7 +322,11 @@ extern unsigned long get_mmap_base(int is_legacy); - #else /* CONFIG_X86_32 */ - - /* 1GB for 64bit, 8MB for 32bit */ --#define __STACK_RND_MASK(is32bit) ((is32bit) ? 0x7ff : 0x3fffff) -+#ifdef CONFIG_COMPAT -+#define __STACK_RND_MASK(is32bit) ((is32bit) ? 
(1UL << mmap_rnd_compat_bits) - 1 : (1UL << mmap_rnd_bits) - 1) -+#else -+#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) -+#endif - #define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32()) - - #define ARCH_DLINFO \ -@@ -380,5 +384,4 @@ struct va_alignment { - } ____cacheline_aligned; - - extern struct va_alignment va_align; --extern unsigned long align_vdso_addr(unsigned long); - #endif /* _ASM_X86_ELF_H */ -diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h -index e31040333f0c..14f3f214c9d1 100644 ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -302,6 +302,7 @@ static inline void cr4_set_bits(unsigned long mask) - unsigned long cr4; - - cr4 = this_cpu_read(cpu_tlbstate.cr4); -+ BUG_ON(cr4 != __read_cr4()); - if ((cr4 | mask) != cr4) { - cr4 |= mask; - this_cpu_write(cpu_tlbstate.cr4, cr4); -@@ -315,6 +316,7 @@ static inline void cr4_clear_bits(unsigned long mask) - unsigned long cr4; - - cr4 = this_cpu_read(cpu_tlbstate.cr4); -+ BUG_ON(cr4 != __read_cr4()); - if ((cr4 & ~mask) != cr4) { - cr4 &= ~mask; - this_cpu_write(cpu_tlbstate.cr4, cr4); -@@ -327,6 +329,7 @@ static inline void cr4_toggle_bits(unsigned long mask) - unsigned long cr4; - - cr4 = this_cpu_read(cpu_tlbstate.cr4); -+ BUG_ON(cr4 != __read_cr4()); - cr4 ^= mask; - this_cpu_write(cpu_tlbstate.cr4, cr4); - __write_cr4(cr4); -@@ -435,6 +438,7 @@ static inline void __native_flush_tlb_global(void) - raw_local_irq_save(flags); - - cr4 = this_cpu_read(cpu_tlbstate.cr4); -+ BUG_ON(cr4 != __read_cr4()); - /* toggle PGE */ - native_write_cr4(cr4 ^ X86_CR4_PGE); - /* write old PGE again and flush TLBs */ -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index 51e49f6fe8e1..7ee813033624 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -1669,7 +1669,6 @@ void cpu_init(void) - wrmsrl(MSR_KERNEL_GS_BASE, 0); - barrier(); - -- x86_configure_nx(); - x2apic_setup(); - - /* -diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c -index a98d1cdd6299..7426eb5d1c03 100644 ---- a/arch/x86/kernel/process.c -+++ b/arch/x86/kernel/process.c -@@ -40,6 +40,8 @@ - #include - #include - #include -+#include -+#include - - #include "process.h" - -@@ -782,7 +784,10 @@ unsigned long arch_align_stack(unsigned long sp) - - unsigned long arch_randomize_brk(struct mm_struct *mm) - { -- return randomize_page(mm->brk, 0x02000000); -+ if (mmap_is_ia32()) -+ return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; -+ else -+ return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; - } - - /* -diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c -index a63fe77b3217..e1085e76043e 100644 ---- a/arch/x86/kernel/sys_x86_64.c -+++ b/arch/x86/kernel/sys_x86_64.c -@@ -54,13 +54,6 @@ static unsigned long get_align_bits(void) - return va_align.bits & get_align_mask(); - } - --unsigned long align_vdso_addr(unsigned long addr) --{ -- unsigned long align_mask = get_align_mask(); -- addr = (addr + align_mask) & ~align_mask; -- return addr | get_align_bits(); --} -- - static int __init control_va_addr_alignment(char *str) - { - /* guard against enabling this on other CPU families */ -@@ -122,10 +115,7 @@ static void find_start_end(unsigned long addr, unsigned long flags, - } - - *begin = get_mmap_base(1); -- if (in_compat_syscall()) -- *end = task_size_32bit(); -- else -- *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW); -+ *end = get_mmap_base(0); - } - - unsigned long -@@ -206,7 +196,7 @@ 
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; -- info.low_limit = PAGE_SIZE; -+ info.low_limit = get_mmap_base(1); - info.high_limit = get_mmap_base(0); - - /* -diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c -index 3141e67ec24c..e93173193f60 100644 ---- a/arch/x86/mm/init_32.c -+++ b/arch/x86/mm/init_32.c -@@ -558,7 +558,7 @@ static void __init pagetable_init(void) - permanent_kmaps_init(pgd_base); - } - --pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); -+pteval_t __supported_pte_mask __ro_after_init = ~(_PAGE_NX | _PAGE_GLOBAL); - EXPORT_SYMBOL_GPL(__supported_pte_mask); - - /* user-defined highmem size */ -@@ -865,7 +865,7 @@ int arch_remove_memory(u64 start, u64 size) - #endif - #endif - --int kernel_set_to_readonly __read_mostly; -+int kernel_set_to_readonly __ro_after_init; - - void set_kernel_text_rw(void) - { -@@ -917,12 +917,11 @@ void mark_rodata_ro(void) - unsigned long start = PFN_ALIGN(_text); - unsigned long size = PFN_ALIGN(_etext) - start; - -+ kernel_set_to_readonly = 1; - set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); - printk(KERN_INFO "Write protecting the kernel text: %luk\n", - size >> 10); - -- kernel_set_to_readonly = 1; -- - #ifdef CONFIG_CPA_DEBUG - printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", - start, start+size); -diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c -index 642357aff216..8bbf93ce3cd2 100644 ---- a/arch/x86/mm/init_64.c -+++ b/arch/x86/mm/init_64.c -@@ -65,7 +65,7 @@ - * around without checking the pgd every time. - */ - --pteval_t __supported_pte_mask __read_mostly = ~0; -+pteval_t __supported_pte_mask __ro_after_init = ~0; - EXPORT_SYMBOL_GPL(__supported_pte_mask); - - int force_personality32; -@@ -1185,7 +1185,7 @@ void __init mem_init(void) - mem_init_print_info(NULL); - } - --int kernel_set_to_readonly; -+int kernel_set_to_readonly __ro_after_init; - - void set_kernel_text_rw(void) - { -@@ -1234,9 +1234,8 @@ void mark_rodata_ro(void) - - printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", - (end - start) >> 10); -- set_memory_ro(start, (end - start) >> PAGE_SHIFT); -- - kernel_set_to_readonly = 1; -+ set_memory_ro(start, (end - start) >> PAGE_SHIFT); - - /* - * The rodata/data/bss/brk section (but not the kernel text!) -diff --git a/block/blk-softirq.c b/block/blk-softirq.c -index 01e2b353a2b9..9aeddca4a29f 100644 ---- a/block/blk-softirq.c -+++ b/block/blk-softirq.c -@@ -20,7 +20,7 @@ static DEFINE_PER_CPU(struct list_head, blk_cpu_done); - * Softirq action handler - move entries to local list and loop over them - * while passing them to the queue registered handler. 
- */ --static __latent_entropy void blk_done_softirq(struct softirq_action *h) -+static __latent_entropy void blk_done_softirq(void) - { - struct list_head *cpu_list, local_list; - -diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c -index 04f406d7e973..60d8c59fa824 100644 ---- a/drivers/ata/libata-core.c -+++ b/drivers/ata/libata-core.c -@@ -5148,7 +5148,7 @@ void ata_qc_free(struct ata_queued_cmd *qc) - struct ata_port *ap; - unsigned int tag; - -- WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ -+ BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ - ap = qc->ap; - - qc->flags = 0; -@@ -5165,7 +5165,7 @@ void __ata_qc_complete(struct ata_queued_cmd *qc) - struct ata_port *ap; - struct ata_link *link; - -- WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ -+ BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ - WARN_ON_ONCE(!(qc->flags & ATA_QCFLAG_ACTIVE)); - ap = qc->ap; - link = qc->dev->link; -diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig -index c28dca0c613d..d4813f0d25ca 100644 ---- a/drivers/char/Kconfig -+++ b/drivers/char/Kconfig -@@ -9,7 +9,6 @@ source "drivers/tty/Kconfig" - - config DEVMEM - bool "/dev/mem virtual device support" -- default y - help - Say Y here if you want to support the /dev/mem device. - The /dev/mem device is used to access areas of physical -@@ -568,7 +567,6 @@ config TELCLOCK - config DEVPORT - bool "/dev/port character device" - depends on ISA || PCI -- default y - help - Say Y here if you want to support the /dev/port device. The /dev/port - device is similar to /dev/mem, but for I/O ports. -diff --git a/drivers/media/dvb-frontends/cx24116.c b/drivers/media/dvb-frontends/cx24116.c -index e105532bfba8..e07d52bb9b62 100644 ---- a/drivers/media/dvb-frontends/cx24116.c -+++ b/drivers/media/dvb-frontends/cx24116.c -@@ -1462,7 +1462,7 @@ static int cx24116_tune(struct dvb_frontend *fe, bool re_tune, - return cx24116_read_status(fe, status); - } - --static int cx24116_get_algo(struct dvb_frontend *fe) -+static enum dvbfe_algo cx24116_get_algo(struct dvb_frontend *fe) - { - return DVBFE_ALGO_HW; - } -diff --git a/drivers/media/dvb-frontends/cx24117.c b/drivers/media/dvb-frontends/cx24117.c -index d37cb7762bd6..97e0feff0ede 100644 ---- a/drivers/media/dvb-frontends/cx24117.c -+++ b/drivers/media/dvb-frontends/cx24117.c -@@ -1555,7 +1555,7 @@ static int cx24117_tune(struct dvb_frontend *fe, bool re_tune, - return cx24117_read_status(fe, status); - } - --static int cx24117_get_algo(struct dvb_frontend *fe) -+static enum dvbfe_algo cx24117_get_algo(struct dvb_frontend *fe) - { - return DVBFE_ALGO_HW; - } -diff --git a/drivers/media/dvb-frontends/cx24120.c b/drivers/media/dvb-frontends/cx24120.c -index 7f11dcc94d85..01da670760ba 100644 ---- a/drivers/media/dvb-frontends/cx24120.c -+++ b/drivers/media/dvb-frontends/cx24120.c -@@ -1491,7 +1491,7 @@ static int cx24120_tune(struct dvb_frontend *fe, bool re_tune, - return cx24120_read_status(fe, status); - } - --static int cx24120_get_algo(struct dvb_frontend *fe) -+static enum dvbfe_algo cx24120_get_algo(struct dvb_frontend *fe) - { - return DVBFE_ALGO_HW; - } -diff --git a/drivers/media/dvb-frontends/cx24123.c b/drivers/media/dvb-frontends/cx24123.c -index 1d59d1d3bd82..41cd0e9ea199 100644 ---- a/drivers/media/dvb-frontends/cx24123.c -+++ b/drivers/media/dvb-frontends/cx24123.c -@@ -1005,7 +1005,7 @@ static int cx24123_tune(struct dvb_frontend *fe, - return retval; - } - --static int cx24123_get_algo(struct dvb_frontend *fe) -+static 
enum dvbfe_algo cx24123_get_algo(struct dvb_frontend *fe) - { - return DVBFE_ALGO_HW; - } -diff --git a/drivers/media/dvb-frontends/cxd2820r_core.c b/drivers/media/dvb-frontends/cxd2820r_core.c -index f6ebbb47b9b2..3e0d8cbd76da 100644 ---- a/drivers/media/dvb-frontends/cxd2820r_core.c -+++ b/drivers/media/dvb-frontends/cxd2820r_core.c -@@ -403,7 +403,7 @@ static enum dvbfe_search cxd2820r_search(struct dvb_frontend *fe) - return DVBFE_ALGO_SEARCH_ERROR; - } - --static int cxd2820r_get_frontend_algo(struct dvb_frontend *fe) -+static enum dvbfe_algo cxd2820r_get_frontend_algo(struct dvb_frontend *fe) - { - return DVBFE_ALGO_CUSTOM; - } -diff --git a/drivers/media/dvb-frontends/mb86a20s.c b/drivers/media/dvb-frontends/mb86a20s.c -index e8ac8c3e2ec0..e0f4ba8302d1 100644 ---- a/drivers/media/dvb-frontends/mb86a20s.c -+++ b/drivers/media/dvb-frontends/mb86a20s.c -@@ -2055,7 +2055,7 @@ static void mb86a20s_release(struct dvb_frontend *fe) - kfree(state); - } - --static int mb86a20s_get_frontend_algo(struct dvb_frontend *fe) -+static enum dvbfe_algo mb86a20s_get_frontend_algo(struct dvb_frontend *fe) - { - return DVBFE_ALGO_HW; - } -diff --git a/drivers/media/dvb-frontends/s921.c b/drivers/media/dvb-frontends/s921.c -index 274544a3ae0e..9ef9b9bc1bd2 100644 ---- a/drivers/media/dvb-frontends/s921.c -+++ b/drivers/media/dvb-frontends/s921.c -@@ -464,7 +464,7 @@ static int s921_tune(struct dvb_frontend *fe, - return rc; - } - --static int s921_get_algo(struct dvb_frontend *fe) -+static enum dvbfe_algo s921_get_algo(struct dvb_frontend *fe) - { - return DVBFE_ALGO_HW; - } -diff --git a/drivers/media/pci/bt8xx/dst.c b/drivers/media/pci/bt8xx/dst.c -index 7166d2279465..fa682f9fdc4b 100644 ---- a/drivers/media/pci/bt8xx/dst.c -+++ b/drivers/media/pci/bt8xx/dst.c -@@ -1657,7 +1657,7 @@ static int dst_tune_frontend(struct dvb_frontend* fe, - return 0; - } - --static int dst_get_tuning_algo(struct dvb_frontend *fe) -+static enum dvbfe_algo dst_get_tuning_algo(struct dvb_frontend *fe) - { - return dst_algo ? 
DVBFE_ALGO_HW : DVBFE_ALGO_SW; - } -diff --git a/drivers/media/pci/pt1/va1j5jf8007s.c b/drivers/media/pci/pt1/va1j5jf8007s.c -index f75f69556be7..d913a6050e8c 100644 ---- a/drivers/media/pci/pt1/va1j5jf8007s.c -+++ b/drivers/media/pci/pt1/va1j5jf8007s.c -@@ -98,7 +98,7 @@ static int va1j5jf8007s_read_snr(struct dvb_frontend *fe, u16 *snr) - return 0; - } - --static int va1j5jf8007s_get_frontend_algo(struct dvb_frontend *fe) -+static enum dvbfe_algo va1j5jf8007s_get_frontend_algo(struct dvb_frontend *fe) - { - return DVBFE_ALGO_HW; - } -diff --git a/drivers/media/pci/pt1/va1j5jf8007t.c b/drivers/media/pci/pt1/va1j5jf8007t.c -index 63fda79a75c0..4115c3ccd4a8 100644 ---- a/drivers/media/pci/pt1/va1j5jf8007t.c -+++ b/drivers/media/pci/pt1/va1j5jf8007t.c -@@ -88,7 +88,7 @@ static int va1j5jf8007t_read_snr(struct dvb_frontend *fe, u16 *snr) - return 0; - } - --static int va1j5jf8007t_get_frontend_algo(struct dvb_frontend *fe) -+static enum dvbfe_algo va1j5jf8007t_get_frontend_algo(struct dvb_frontend *fe) - { - return DVBFE_ALGO_HW; - } -diff --git a/drivers/misc/lkdtm_core.c b/drivers/misc/lkdtm_core.c -index 981b3ef71e47..9883da1da383 100644 ---- a/drivers/misc/lkdtm_core.c -+++ b/drivers/misc/lkdtm_core.c -@@ -78,7 +78,7 @@ static irqreturn_t jp_handle_irq_event(unsigned int irq, - return 0; - } - --static void jp_tasklet_action(struct softirq_action *a) -+static void jp_tasklet_action(void) - { - lkdtm_handler(); - jprobe_return(); -diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig -index b811442c5ce6..4f62a63cbcb1 100644 ---- a/drivers/tty/Kconfig -+++ b/drivers/tty/Kconfig -@@ -122,7 +122,6 @@ config UNIX98_PTYS - - config LEGACY_PTYS - bool "Legacy (BSD) PTY support" -- default y - ---help--- - A pseudo terminal (PTY) is a software device consisting of two - halves: a master and a slave. 
The slave device behaves identical to -diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c -index 83376caa571b..4aa47ca17268 100644 ---- a/drivers/tty/tty_io.c -+++ b/drivers/tty/tty_io.c -@@ -171,6 +171,7 @@ static void free_tty_struct(struct tty_struct *tty) - put_device(tty->dev); - kfree(tty->write_buf); - tty->magic = 0xDEADDEAD; -+ put_user_ns(tty->owner_user_ns); - kfree(tty); - } - -@@ -2159,11 +2160,19 @@ static int tty_fasync(int fd, struct file *filp, int on) - * FIXME: may race normal receive processing - */ - -+int tiocsti_restrict = IS_ENABLED(CONFIG_SECURITY_TIOCSTI_RESTRICT); -+ - static int tiocsti(struct tty_struct *tty, char __user *p) - { - char ch, mbz = 0; - struct tty_ldisc *ld; - -+ if (tiocsti_restrict && -+ !ns_capable(tty->owner_user_ns, CAP_SYS_ADMIN)) { -+ dev_warn_ratelimited(tty->dev, -+ "Denied TIOCSTI ioctl for non-privileged process\n"); -+ return -EPERM; -+ } - if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) - return -EPERM; - if (get_user(ch, p)) -@@ -2846,6 +2855,7 @@ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx) - tty->index = idx; - tty_line_name(driver, idx, tty->name); - tty->dev = tty_get_device(tty); -+ tty->owner_user_ns = get_user_ns(current_user_ns()); - - return tty; - } -diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c -index a073cb5be013..e9dfece7b7ce 100644 ---- a/drivers/usb/core/hub.c -+++ b/drivers/usb/core/hub.c -@@ -38,6 +38,8 @@ - #define USB_VENDOR_GENESYS_LOGIC 0x05e3 - #define HUB_QUIRK_CHECK_PORT_AUTOSUSPEND 0x01 - -+extern int deny_new_usb; -+ - /* Protect struct usb_device->state and ->children members - * Note: Both are also protected by ->dev.sem, except that ->state can - * change to USB_STATE_NOTATTACHED even when the semaphore isn't held. 
*/ -@@ -4818,6 +4820,12 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, - goto done; - return; - } -+ -+ if (deny_new_usb) { -+ dev_err(&port_dev->dev, "denied insert of USB device on port %d\n", port1); -+ goto done; -+ } -+ - if (hub_is_superspeed(hub->hdev)) - unit_load = 150; - else -diff --git a/fs/exec.c b/fs/exec.c -index 0da4d748b4e6..69fcee853363 100644 ---- a/fs/exec.c -+++ b/fs/exec.c -@@ -62,6 +62,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -321,6 +322,8 @@ static int __bprm_mm_init(struct linux_binprm *bprm) - arch_bprm_mm_init(mm, vma); - up_write(&mm->mmap_sem); - bprm->p = vma->vm_end - sizeof(void *); -+ if (randomize_va_space) -+ bprm->p ^= get_random_int() & ~PAGE_MASK; - return 0; - err: - up_write(&mm->mmap_sem); -diff --git a/fs/namei.c b/fs/namei.c -index d1e467b7b9de..0d96ad71b700 100644 ---- a/fs/namei.c -+++ b/fs/namei.c -@@ -902,10 +902,10 @@ static inline void put_link(struct nameidata *nd) - path_put(&last->link); - } - --int sysctl_protected_symlinks __read_mostly = 0; --int sysctl_protected_hardlinks __read_mostly = 0; --int sysctl_protected_fifos __read_mostly; --int sysctl_protected_regular __read_mostly; -+int sysctl_protected_symlinks __read_mostly = 1; -+int sysctl_protected_hardlinks __read_mostly = 1; -+int sysctl_protected_fifos __read_mostly = 2; -+int sysctl_protected_regular __read_mostly = 2; - - /** - * may_follow_link - Check symlink following for unsafe situations -diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig -index 5f93cfacb3d1..cea0d7d3b23e 100644 ---- a/fs/nfs/Kconfig -+++ b/fs/nfs/Kconfig -@@ -195,4 +195,3 @@ config NFS_DEBUG - bool - depends on NFS_FS && SUNRPC_DEBUG - select CRC32 -- default y -diff --git a/fs/pipe.c b/fs/pipe.c -index 8ef7d7bef775..b82f305ec13d 100644 ---- a/fs/pipe.c -+++ b/fs/pipe.c -@@ -38,7 +38,7 @@ unsigned int pipe_max_size = 1048576; - /* - * Minimum pipe size, as required by POSIX - */ --unsigned int pipe_min_size = PAGE_SIZE; -+unsigned int pipe_min_size __read_only = PAGE_SIZE; - - /* Maximum allocatable pages per user. Hard limit is unset by default, soft - * matches default values. -diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig -index 1ade1206bb89..60b0f76dec47 100644 ---- a/fs/proc/Kconfig -+++ b/fs/proc/Kconfig -@@ -39,7 +39,6 @@ config PROC_KCORE - config PROC_VMCORE - bool "/proc/vmcore support" - depends on PROC_FS && CRASH_DUMP -- default y - help - Exports the dump image of crashed kernel in ELF format. 
- -diff --git a/fs/stat.c b/fs/stat.c -index 873785dae022..d3c2ada8b9c7 100644 ---- a/fs/stat.c -+++ b/fs/stat.c -@@ -40,8 +40,13 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) - stat->gid = inode->i_gid; - stat->rdev = inode->i_rdev; - stat->size = i_size_read(inode); -- stat->atime = inode->i_atime; -- stat->mtime = inode->i_mtime; -+ if (is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) { -+ stat->atime = inode->i_ctime; -+ stat->mtime = inode->i_ctime; -+ } else { -+ stat->atime = inode->i_atime; -+ stat->mtime = inode->i_mtime; -+ } - stat->ctime = inode->i_ctime; - stat->blksize = i_blocksize(inode); - stat->blocks = inode->i_blocks; -@@ -75,9 +80,14 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, - stat->result_mask |= STATX_BASIC_STATS; - request_mask &= STATX_ALL; - query_flags &= KSTAT_QUERY_FLAGS; -- if (inode->i_op->getattr) -- return inode->i_op->getattr(path, stat, request_mask, -- query_flags); -+ if (inode->i_op->getattr) { -+ int retval = inode->i_op->getattr(path, stat, request_mask, query_flags); -+ if (!retval && is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) { -+ stat->atime = stat->ctime; -+ stat->mtime = stat->ctime; -+ } -+ return retval; -+ } - - generic_fillattr(inode, stat); - return 0; -diff --git a/include/linux/cache.h b/include/linux/cache.h -index 750621e41d1c..e7157c18c62c 100644 ---- a/include/linux/cache.h -+++ b/include/linux/cache.h -@@ -31,6 +31,8 @@ - #define __ro_after_init __attribute__((__section__(".data..ro_after_init"))) - #endif - -+#define __read_only __ro_after_init -+ - #ifndef ____cacheline_aligned - #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) - #endif -diff --git a/include/linux/capability.h b/include/linux/capability.h -index f640dcbc880c..2b4f5d651f19 100644 ---- a/include/linux/capability.h -+++ b/include/linux/capability.h -@@ -207,6 +207,7 @@ extern bool has_capability_noaudit(struct task_struct *t, int cap); - extern bool has_ns_capability_noaudit(struct task_struct *t, - struct user_namespace *ns, int cap); - extern bool capable(int cap); -+extern bool capable_noaudit(int cap); - extern bool ns_capable(struct user_namespace *ns, int cap); - extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); - #else -@@ -232,6 +233,10 @@ static inline bool capable(int cap) - { - return true; - } -+static inline bool capable_noaudit(int cap) -+{ -+ return true; -+} - static inline bool ns_capable(struct user_namespace *ns, int cap) - { - return true; -diff --git a/include/linux/fs.h b/include/linux/fs.h -index f6a577edec67..fa3a6caeca6c 100644 ---- a/include/linux/fs.h -+++ b/include/linux/fs.h -@@ -3383,4 +3383,15 @@ static inline bool dir_relax_shared(struct inode *inode) - extern bool path_noexec(const struct path *path); - extern void inode_nohighmem(struct inode *inode); - -+extern int device_sidechannel_restrict; -+ -+static inline bool is_sidechannel_device(const struct inode *inode) -+{ -+ umode_t mode; -+ if (!device_sidechannel_restrict) -+ return false; -+ mode = inode->i_mode; -+ return ((S_ISCHR(mode) || S_ISBLK(mode)) && (mode & (S_IROTH | S_IWOTH))); -+} -+ - #endif /* _LINUX_FS_H */ -diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h -index bdaf22582f6e..326ff15d4637 100644 ---- a/include/linux/fsnotify.h -+++ b/include/linux/fsnotify.h -@@ -181,6 +181,9 @@ static inline void fsnotify_access(struct file *file) - struct inode *inode = path->dentry->d_inode; - __u32 mask = FS_ACCESS; - -+ if 
(is_sidechannel_device(inode)) -+ return; -+ - if (S_ISDIR(inode->i_mode)) - mask |= FS_ISDIR; - -@@ -199,6 +202,9 @@ static inline void fsnotify_modify(struct file *file) - struct inode *inode = path->dentry->d_inode; - __u32 mask = FS_MODIFY; - -+ if (is_sidechannel_device(inode)) -+ return; -+ - if (S_ISDIR(inode->i_mode)) - mask |= FS_ISDIR; - -diff --git a/include/linux/gfp.h b/include/linux/gfp.h -index b041f94678de..fd8bb5a78b75 100644 ---- a/include/linux/gfp.h -+++ b/include/linux/gfp.h -@@ -518,9 +518,9 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, - extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); - extern unsigned long get_zeroed_page(gfp_t gfp_mask); - --void *alloc_pages_exact(size_t size, gfp_t gfp_mask); -+void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __attribute__((alloc_size(1))); - void free_pages_exact(void *virt, size_t size); --void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); -+void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __attribute__((alloc_size(2))); - - #define __get_free_page(gfp_mask) \ - __get_free_pages((gfp_mask), 0) -diff --git a/include/linux/highmem.h b/include/linux/highmem.h -index 776f90f3a1cd..3f5c47000059 100644 ---- a/include/linux/highmem.h -+++ b/include/linux/highmem.h -@@ -191,6 +191,13 @@ static inline void clear_highpage(struct page *page) - kunmap_atomic(kaddr); - } - -+static inline void verify_zero_highpage(struct page *page) -+{ -+ void *kaddr = kmap_atomic(page); -+ BUG_ON(memchr_inv(kaddr, 0, PAGE_SIZE)); -+ kunmap_atomic(kaddr); -+} -+ - static inline void zero_user_segments(struct page *page, - unsigned start1, unsigned end1, - unsigned start2, unsigned end2) -diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h -index 69c238210325..ee487ea4f48f 100644 ---- a/include/linux/interrupt.h -+++ b/include/linux/interrupt.h -@@ -485,7 +485,7 @@ extern const char * const softirq_to_name[NR_SOFTIRQS]; - - struct softirq_action - { -- void (*action)(struct softirq_action *); -+ void (*action)(void); - }; - - asmlinkage void do_softirq(void); -@@ -500,7 +500,7 @@ static inline void do_softirq_own_stack(void) - } - #endif - --extern void open_softirq(int nr, void (*action)(struct softirq_action *)); -+extern void __init open_softirq(int nr, void (*action)(void)); - extern void softirq_init(void); - extern void __raise_softirq_irqoff(unsigned int nr); - -diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h -index df32d2508290..c992d130b94d 100644 ---- a/include/linux/kobject_ns.h -+++ b/include/linux/kobject_ns.h -@@ -46,7 +46,7 @@ struct kobj_ns_type_operations { - void (*drop_ns)(void *); - }; - --int kobj_ns_type_register(const struct kobj_ns_type_operations *ops); -+int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops); - int kobj_ns_type_registered(enum kobj_ns_type type); - const struct kobj_ns_type_operations *kobj_child_ns_ops(struct kobject *parent); - const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj); -diff --git a/include/linux/mm.h b/include/linux/mm.h -index 58f2263de4de..e90dc5d98c7f 100644 ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -525,7 +525,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) - } - #endif - --extern void *kvmalloc_node(size_t size, gfp_t flags, int node); -+extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __attribute__((alloc_size(1))); - static inline void *kvmalloc(size_t size, gfp_t flags) 
- { - return kvmalloc_node(size, flags, NUMA_NO_NODE); -diff --git a/include/linux/percpu.h b/include/linux/percpu.h -index 296bbe49d5d1..b26652c9a98d 100644 ---- a/include/linux/percpu.h -+++ b/include/linux/percpu.h -@@ -129,7 +129,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, - pcpu_fc_populate_pte_fn_t populate_pte_fn); - #endif - --extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align); -+extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __attribute__((alloc_size(1))); - extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr); - extern bool is_kernel_percpu_address(unsigned long addr); - -@@ -137,8 +137,8 @@ extern bool is_kernel_percpu_address(unsigned long addr); - extern void __init setup_per_cpu_areas(void); - #endif - --extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); --extern void __percpu *__alloc_percpu(size_t size, size_t align); -+extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __attribute__((alloc_size(1))); -+extern void __percpu *__alloc_percpu(size_t size, size_t align) __attribute__((alloc_size(1))); - extern void free_percpu(void __percpu *__pdata); - extern phys_addr_t per_cpu_ptr_to_phys(void *addr); - -diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h -index 8e22f24ded6a..b7fecdfa6de5 100644 ---- a/include/linux/perf_event.h -+++ b/include/linux/perf_event.h -@@ -1165,6 +1165,11 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, - int perf_event_max_stack_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); - -+static inline bool perf_paranoid_any(void) -+{ -+ return sysctl_perf_event_paranoid > 2; -+} -+ - static inline bool perf_paranoid_tracepoint_raw(void) - { - return sysctl_perf_event_paranoid > -1; -diff --git a/include/linux/slab.h b/include/linux/slab.h -index ae5ed6492d54..fd0786124504 100644 ---- a/include/linux/slab.h -+++ b/include/linux/slab.h -@@ -146,8 +146,8 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *); - /* - * Common kmalloc functions provided by all allocators - */ --void * __must_check __krealloc(const void *, size_t, gfp_t); --void * __must_check krealloc(const void *, size_t, gfp_t); -+void * __must_check __krealloc(const void *, size_t, gfp_t) __attribute__((alloc_size(2))); -+void * __must_check krealloc(const void *, size_t, gfp_t) __attribute((alloc_size(2))); - void kfree(const void *); - void kzfree(const void *); - size_t ksize(const void *); -@@ -324,7 +324,7 @@ static __always_inline int kmalloc_index(size_t size) - } - #endif /* !CONFIG_SLOB */ - --void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc; -+void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc __attribute__((alloc_size(1))); - void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc; - void kmem_cache_free(struct kmem_cache *, void *); - -@@ -348,7 +348,7 @@ static __always_inline void kfree_bulk(size_t size, void **p) - } - - #ifdef CONFIG_NUMA --void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc; -+void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc __attribute__((alloc_size(1))); - void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc; - #else - static __always_inline void *__kmalloc_node(size_t size, 
gfp_t flags, int node) -@@ -473,7 +473,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) - * for general use, and so are not documented here. For a full list of - * potential flags, always refer to linux/gfp.h. - */ --static __always_inline void *kmalloc(size_t size, gfp_t flags) -+static __always_inline __attribute__((alloc_size(1))) void *kmalloc(size_t size, gfp_t flags) - { - if (__builtin_constant_p(size)) { - if (size > KMALLOC_MAX_CACHE_SIZE) -@@ -513,7 +513,7 @@ static __always_inline int kmalloc_size(int n) - return 0; - } - --static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) -+static __always_inline __attribute__((alloc_size(1))) void *kmalloc_node(size_t size, gfp_t flags, int node) - { - #ifndef CONFIG_SLOB - if (__builtin_constant_p(size) && -diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h -index f8ced87a2efe..cd61c8d2aa6e 100644 ---- a/include/linux/slub_def.h -+++ b/include/linux/slub_def.h -@@ -121,6 +121,11 @@ struct kmem_cache { - unsigned long random; - #endif - -+#ifdef CONFIG_SLAB_CANARY -+ unsigned long random_active; -+ unsigned long random_inactive; -+#endif -+ - #ifdef CONFIG_NUMA - /* - * Defragmentation by allocating from a remote node. -diff --git a/include/linux/string.h b/include/linux/string.h -index 96115bf561b4..f93d908c5bbc 100644 ---- a/include/linux/string.h -+++ b/include/linux/string.h -@@ -234,10 +234,16 @@ void __read_overflow2(void) __compiletime_error("detected read beyond size of ob - void __read_overflow3(void) __compiletime_error("detected read beyond size of object passed as 3rd parameter"); - void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter"); - -+#ifdef CONFIG_FORTIFY_SOURCE_STRICT_STRING -+#define __string_size(p) __builtin_object_size(p, 1) -+#else -+#define __string_size(p) __builtin_object_size(p, 0) -+#endif -+ - #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) - __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) - { -- size_t p_size = __builtin_object_size(p, 0); -+ size_t p_size = __string_size(p); - if (__builtin_constant_p(size) && p_size < size) - __write_overflow(); - if (p_size < size) -@@ -247,7 +253,7 @@ __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) - - __FORTIFY_INLINE char *strcat(char *p, const char *q) - { -- size_t p_size = __builtin_object_size(p, 0); -+ size_t p_size = __string_size(p); - if (p_size == (size_t)-1) - return __builtin_strcat(p, q); - if (strlcat(p, q, p_size) >= p_size) -@@ -258,7 +264,7 @@ __FORTIFY_INLINE char *strcat(char *p, const char *q) - __FORTIFY_INLINE __kernel_size_t strlen(const char *p) - { - __kernel_size_t ret; -- size_t p_size = __builtin_object_size(p, 0); -+ size_t p_size = __string_size(p); - - /* Work around gcc excess stack consumption issue */ - if (p_size == (size_t)-1 || -@@ -273,7 +279,7 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) - extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); - __FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen) - { -- size_t p_size = __builtin_object_size(p, 0); -+ size_t p_size = __string_size(p); - __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? 
maxlen : p_size); - if (p_size <= ret && maxlen != ret) - fortify_panic(__func__); -@@ -285,8 +291,8 @@ extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); - __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) - { - size_t ret; -- size_t p_size = __builtin_object_size(p, 0); -- size_t q_size = __builtin_object_size(q, 0); -+ size_t p_size = __string_size(p); -+ size_t q_size = __string_size(q); - if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __real_strlcpy(p, q, size); - ret = strlen(q); -@@ -306,8 +312,8 @@ __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) - __FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) - { - size_t p_len, copy_len; -- size_t p_size = __builtin_object_size(p, 0); -- size_t q_size = __builtin_object_size(q, 0); -+ size_t p_size = __string_size(p); -+ size_t q_size = __string_size(q); - if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __builtin_strncat(p, q, count); - p_len = strlen(p); -@@ -420,8 +426,8 @@ __FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp) - /* defined after fortified strlen and memcpy to reuse them */ - __FORTIFY_INLINE char *strcpy(char *p, const char *q) - { -- size_t p_size = __builtin_object_size(p, 0); -- size_t q_size = __builtin_object_size(q, 0); -+ size_t p_size = __string_size(p); -+ size_t q_size = __string_size(q); - if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __builtin_strcpy(p, q); - memcpy(p, q, strlen(q) + 1); -diff --git a/include/linux/tty.h b/include/linux/tty.h -index 1dd587ba6d88..9a9a04fb641d 100644 ---- a/include/linux/tty.h -+++ b/include/linux/tty.h -@@ -13,6 +13,7 @@ - #include - #include - #include -+#include - - - /* -@@ -335,6 +336,7 @@ struct tty_struct { - /* If the tty has a pending do_SAK, queue it here - akpm */ - struct work_struct SAK_work; - struct tty_port *port; -+ struct user_namespace *owner_user_ns; - } __randomize_layout; - - /* Each of a tty's open files has private_data pointing to tty_file_private */ -@@ -344,6 +346,8 @@ struct tty_file_private { - struct list_head list; - }; - -+extern int tiocsti_restrict; -+ - /* tty magic number */ - #define TTY_MAGIC 0x5401 - -diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h -index 1e5d8c392f15..66d0e49c9987 100644 ---- a/include/linux/vmalloc.h -+++ b/include/linux/vmalloc.h -@@ -68,19 +68,19 @@ static inline void vmalloc_init(void) - } - #endif - --extern void *vmalloc(unsigned long size); --extern void *vzalloc(unsigned long size); --extern void *vmalloc_user(unsigned long size); --extern void *vmalloc_node(unsigned long size, int node); --extern void *vzalloc_node(unsigned long size, int node); --extern void *vmalloc_exec(unsigned long size); --extern void *vmalloc_32(unsigned long size); --extern void *vmalloc_32_user(unsigned long size); --extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); -+extern void *vmalloc(unsigned long size) __attribute__((alloc_size(1))); -+extern void *vzalloc(unsigned long size) __attribute__((alloc_size(1))); -+extern void *vmalloc_user(unsigned long size) __attribute__((alloc_size(1))); -+extern void *vmalloc_node(unsigned long size, int node) __attribute__((alloc_size(1))); -+extern void *vzalloc_node(unsigned long size, int node) __attribute__((alloc_size(1))); -+extern void *vmalloc_exec(unsigned long size) __attribute__((alloc_size(1))); -+extern void *vmalloc_32(unsigned long size) __attribute__((alloc_size(1))); -+extern void 
*vmalloc_32_user(unsigned long size) __attribute__((alloc_size(1))); -+extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) __attribute__((alloc_size(1))); - extern void *__vmalloc_node_range(unsigned long size, unsigned long align, - unsigned long start, unsigned long end, gfp_t gfp_mask, - pgprot_t prot, unsigned long vm_flags, int node, -- const void *caller); -+ const void *caller) __attribute__((alloc_size(1))); - #ifndef CONFIG_MMU - extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); - static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, -diff --git a/init/Kconfig b/init/Kconfig -index 46075327c165..0c78750bc76d 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -309,6 +309,7 @@ config USELIB - config AUDIT - bool "Auditing support" - depends on NET -+ default y - help - Enable auditing infrastructure that can be used with another - kernel subsystem, such as SELinux (which requires this for -@@ -1052,6 +1053,12 @@ config CC_OPTIMIZE_FOR_SIZE - - endchoice - -+config LOCAL_INIT -+ bool "Zero uninitialized locals" -+ help -+ Zero-fill uninitialized local variables, other than variable-length -+ arrays. Requires compiler support. -+ - config SYSCTL - bool - -@@ -1361,8 +1368,7 @@ config SHMEM - which may be appropriate on small systems without swap. - - config AIO -- bool "Enable AIO support" if EXPERT -- default y -+ bool "Enable AIO support" - help - This option enables POSIX asynchronous I/O which may by used - by some high performance threaded applications. Disabling -@@ -1491,7 +1497,7 @@ config VM_EVENT_COUNTERS - - config SLUB_DEBUG - default y -- bool "Enable SLUB debugging support" if EXPERT -+ bool "Enable SLUB debugging support" - depends on SLUB && SYSFS - help - SLUB has extensive debug support features. Disabling these can -@@ -1515,7 +1521,6 @@ config SLUB_MEMCG_SYSFS_ON - - config COMPAT_BRK - bool "Disable heap randomization" -- default y - help - Randomizing heap placement makes heap exploits harder, but it - also breaks ancient binaries (including anything libc5 based). -@@ -1562,7 +1567,6 @@ endchoice - - config SLAB_MERGE_DEFAULT - bool "Allow slab caches to be merged" -- default y - help - For reduced kernel memory fragmentation, slab caches can be - merged when they share the same size and other characteristics. -@@ -1575,9 +1579,9 @@ config SLAB_MERGE_DEFAULT - command line. - - config SLAB_FREELIST_RANDOM -- default n - depends on SLAB || SLUB - bool "SLAB freelist randomization" -+ default y - help - Randomizes the freelist order used on creating new pages. This - security feature reduces the predictability of the kernel slab -@@ -1586,12 +1590,56 @@ config SLAB_FREELIST_RANDOM - config SLAB_FREELIST_HARDENED - bool "Harden slab freelist metadata" - depends on SLUB -+ default y - help - Many kernel heap attacks try to target slab cache metadata and - other infrastructure. This options makes minor performance - sacrifies to harden the kernel slab allocator against common - freelist exploit methods. - -+config SLAB_HARDENED -+ default y -+ depends on SLUB -+ bool "Hardened SLAB infrastructure" -+ help -+ Make minor performance sacrifices to harden the kernel slab -+ allocator. -+ -+config SLAB_CANARY -+ depends on SLUB -+ depends on !SLAB_MERGE_DEFAULT -+ bool "SLAB canaries" -+ default y -+ help -+ Place canaries at the end of kernel slab allocations, sacrificing -+ some performance and memory usage for security. 
-+ -+ Canaries can detect some forms of heap corruption when allocations -+ are freed and as part of the HARDENED_USERCOPY feature. It provides -+ basic use-after-free detection for HARDENED_USERCOPY. -+ -+ Canaries absorb small overflows (rendering them harmless), mitigate -+ non-NUL terminated C string overflows on 64-bit via a guaranteed zero -+ byte and provide basic double-free detection. -+ -+config SLAB_SANITIZE -+ bool "Sanitize SLAB allocations" -+ depends on SLUB -+ default y -+ help -+ Zero fill slab allocations on free, reducing the lifetime of -+ sensitive data and helping to mitigate use-after-free bugs. -+ -+ For slabs with debug poisoning enabling, this has no impact. -+ -+config SLAB_SANITIZE_VERIFY -+ depends on SLAB_SANITIZE && PAGE_SANITIZE -+ default y -+ bool "Verify sanitized SLAB allocations" -+ help -+ Verify that newly allocated slab allocations are zeroed to detect -+ write-after-free bugs. -+ - config SLUB_CPU_PARTIAL - default y - depends on SLUB && SMP -diff --git a/kernel/audit.c b/kernel/audit.c -index d301276bca58..d55a1e290cea 100644 ---- a/kernel/audit.c -+++ b/kernel/audit.c -@@ -1575,6 +1575,9 @@ static int __init audit_enable(char *str) - audit_default = !!simple_strtol(str, NULL, 0); - if (!audit_default) - audit_initialized = AUDIT_DISABLED; -+ else -+ audit_initialized = AUDIT_UNINITIALIZED; -+ - audit_enabled = audit_default; - audit_ever_enabled = !!audit_enabled; - -diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c -index d203a5d6b726..2a6c3e2c57a6 100644 ---- a/kernel/bpf/core.c -+++ b/kernel/bpf/core.c -@@ -539,7 +539,7 @@ void __weak bpf_jit_free(struct bpf_prog *fp) - bpf_prog_unlock_free(fp); - } - --int bpf_jit_harden __read_mostly; -+int bpf_jit_harden __read_mostly = 2; - - static int bpf_jit_blind_insn(const struct bpf_insn *from, - const struct bpf_insn *aux, -diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c -index 5c9deed4524e..6d90aabecfc7 100644 ---- a/kernel/bpf/syscall.c -+++ b/kernel/bpf/syscall.c -@@ -37,7 +37,7 @@ static DEFINE_SPINLOCK(prog_idr_lock); - static DEFINE_IDR(map_idr); - static DEFINE_SPINLOCK(map_idr_lock); - --int sysctl_unprivileged_bpf_disabled __read_mostly; -+int sysctl_unprivileged_bpf_disabled __read_mostly = 1; - - static const struct bpf_map_ops * const bpf_map_types[] = { - #define BPF_PROG_TYPE(_id, _ops) -diff --git a/kernel/capability.c b/kernel/capability.c -index 1e1c0236f55b..452062fe45ce 100644 ---- a/kernel/capability.c -+++ b/kernel/capability.c -@@ -431,6 +431,12 @@ bool capable(int cap) - return ns_capable(&init_user_ns, cap); - } - EXPORT_SYMBOL(capable); -+ -+bool capable_noaudit(int cap) -+{ -+ return ns_capable_noaudit(&init_user_ns, cap); -+} -+EXPORT_SYMBOL(capable_noaudit); - #endif /* CONFIG_MULTIUSER */ - - /** -diff --git a/kernel/events/core.c b/kernel/events/core.c -index 991af683ef9e..66f66b648707 100644 ---- a/kernel/events/core.c -+++ b/kernel/events/core.c -@@ -397,8 +397,13 @@ static cpumask_var_t perf_online_mask; - * 0 - disallow raw tracepoint access for unpriv - * 1 - disallow cpu events for unpriv - * 2 - disallow kernel profiling for unpriv -+ * 3 - disallow all unpriv perf event use - */ -+#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT -+int sysctl_perf_event_paranoid __read_mostly = 3; -+#else - int sysctl_perf_event_paranoid __read_mostly = 2; -+#endif - - /* Minimum for 512 kiB + 1 user control page */ - int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ -@@ -9984,6 +9989,9 @@ SYSCALL_DEFINE5(perf_event_open, - if 
(flags & ~PERF_FLAG_ALL) - return -EINVAL; - -+ if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN)) -+ return -EACCES; -+ - err = perf_copy_attr(attr_uptr, &attr); - if (err) - return err; -diff --git a/kernel/fork.c b/kernel/fork.c -index 6a219fea4926..013703c020f6 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -102,6 +102,11 @@ - - #define CREATE_TRACE_POINTS - #include -+#ifdef CONFIG_USER_NS -+extern int unprivileged_userns_clone; -+#else -+#define unprivileged_userns_clone 0 -+#endif - - /* - * Minimum number of threads to boot the kernel -@@ -1555,6 +1560,10 @@ static __latent_entropy struct task_struct *copy_process( - if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) - return ERR_PTR(-EINVAL); - -+ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) -+ if (!capable(CAP_SYS_ADMIN)) -+ return ERR_PTR(-EPERM); -+ - /* - * Thread groups must share signals as well, and detached threads - * can only be started up within the thread group. -@@ -2348,6 +2357,12 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) - if (unshare_flags & CLONE_NEWNS) - unshare_flags |= CLONE_FS; - -+ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { -+ err = -EPERM; -+ if (!capable(CAP_SYS_ADMIN)) -+ goto bad_unshare_out; -+ } -+ - err = check_unshare_flags(unshare_flags); - if (err) - goto bad_unshare_out; -diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c -index 0972a8e09d08..00dde7aad47a 100644 ---- a/kernel/power/snapshot.c -+++ b/kernel/power/snapshot.c -@@ -1136,7 +1136,7 @@ void free_basic_memory_bitmaps(void) - - void clear_free_pages(void) - { --#ifdef CONFIG_PAGE_POISONING_ZERO -+#if defined(CONFIG_PAGE_POISONING_ZERO) || defined(CONFIG_PAGE_SANITIZE) - struct memory_bitmap *bm = free_pages_map; - unsigned long pfn; - -@@ -1153,7 +1153,7 @@ void clear_free_pages(void) - } - memory_bm_position_reset(bm); - pr_info("PM: free pages cleared after restore\n"); --#endif /* PAGE_POISONING_ZERO */ -+#endif /* PAGE_POISONING_ZERO || PAGE_SANITIZE */ - } - - /** -diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c -index a64eee0db39e..4d7de378fe4c 100644 ---- a/kernel/rcu/tiny.c -+++ b/kernel/rcu/tiny.c -@@ -164,7 +164,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) - } - } - --static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) -+static __latent_entropy void rcu_process_callbacks(void) - { - __rcu_process_callbacks(&rcu_sched_ctrlblk); - __rcu_process_callbacks(&rcu_bh_ctrlblk); -diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c -index 710ce1d6b982..4013b634e820 100644 ---- a/kernel/rcu/tree.c -+++ b/kernel/rcu/tree.c -@@ -2927,7 +2927,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) - /* - * Do RCU core processing for the current CPU. - */ --static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) -+static __latent_entropy void rcu_process_callbacks(void) - { - struct rcu_state *rsp; - -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 7240bb4a4090..9adcec5bcbd9 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -9005,7 +9005,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } - * run_rebalance_domains is triggered when needed from the scheduler tick. - * Also triggered for nohz idle balancing (with nohz_balancing_kick set). 
- */ --static __latent_entropy void run_rebalance_domains(struct softirq_action *h) -+static __latent_entropy void run_rebalance_domains(void) - { - struct rq *this_rq = this_rq(); - enum cpu_idle_type idle = this_rq->idle_balance ? -diff --git a/kernel/softirq.c b/kernel/softirq.c -index a4c87cf27f9d..efb97a8dc568 100644 ---- a/kernel/softirq.c -+++ b/kernel/softirq.c -@@ -53,7 +53,7 @@ irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned; - EXPORT_SYMBOL(irq_stat); - #endif - --static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; -+static struct softirq_action softirq_vec[NR_SOFTIRQS] __ro_after_init __aligned(PAGE_SIZE); - - DEFINE_PER_CPU(struct task_struct *, ksoftirqd); - -@@ -285,7 +285,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) - kstat_incr_softirqs_this_cpu(vec_nr); - - trace_softirq_entry(vec_nr); -- h->action(h); -+ h->action(); - trace_softirq_exit(vec_nr); - if (unlikely(prev_count != preempt_count())) { - pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n", -@@ -448,7 +448,7 @@ void __raise_softirq_irqoff(unsigned int nr) - or_softirq_pending(1UL << nr); - } - --void open_softirq(int nr, void (*action)(struct softirq_action *)) -+void __init open_softirq(int nr, void (*action)(void)) - { - softirq_vec[nr].action = action; - } -@@ -490,7 +490,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) - } - EXPORT_SYMBOL(__tasklet_hi_schedule); - --static __latent_entropy void tasklet_action(struct softirq_action *a) -+static __latent_entropy void tasklet_action(void) - { - struct tasklet_struct *list; - -@@ -526,7 +526,7 @@ static __latent_entropy void tasklet_action(struct softirq_action *a) - } - } - --static __latent_entropy void tasklet_hi_action(struct softirq_action *a) -+static __latent_entropy void tasklet_hi_action(void) - { - struct tasklet_struct *list; - -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index d330b1ce3b94..050278b12928 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -66,6 +66,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -98,12 +99,19 @@ - #if defined(CONFIG_SYSCTL) - - /* External variables not in a header file. 
*/ -+#if IS_ENABLED(CONFIG_USB) -+int deny_new_usb __read_mostly = 0; -+EXPORT_SYMBOL(deny_new_usb); -+#endif - extern int suid_dumpable; - #ifdef CONFIG_COREDUMP - extern int core_uses_pid; - extern char core_pattern[]; - extern unsigned int core_pipe_limit; - #endif -+#ifdef CONFIG_USER_NS -+extern int unprivileged_userns_clone; -+#endif - extern int pid_max; - extern int pid_max_min, pid_max_max; - extern int percpu_pagelist_fraction; -@@ -115,40 +123,43 @@ extern int sysctl_nr_trim_pages; - - /* Constants used for minimum and maximum */ - #ifdef CONFIG_LOCKUP_DETECTOR --static int sixty = 60; -+static int sixty __read_only = 60; - #endif - --static int __maybe_unused neg_one = -1; -+static int __maybe_unused neg_one __read_only = -1; - - static int zero; --static int __maybe_unused one = 1; --static int __maybe_unused two = 2; --static int __maybe_unused four = 4; --static unsigned long one_ul = 1; --static int one_hundred = 100; --static int one_thousand = 1000; -+static int __maybe_unused one __read_only = 1; -+static int __maybe_unused two __read_only = 2; -+static int __maybe_unused four __read_only = 4; -+static unsigned long one_ul __read_only = 1; -+static int one_hundred __read_only = 100; -+static int one_thousand __read_only = 1000; - #ifdef CONFIG_PRINTK --static int ten_thousand = 10000; -+static int ten_thousand __read_only = 10000; - #endif - #ifdef CONFIG_PERF_EVENTS --static int six_hundred_forty_kb = 640 * 1024; -+static int six_hundred_forty_kb __read_only = 640 * 1024; - #endif - - /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ --static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; -+static unsigned long dirty_bytes_min __read_only = 2 * PAGE_SIZE; - - /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ --static int maxolduid = 65535; --static int minolduid; -+static int maxolduid __read_only = 65535; -+static int minolduid __read_only; - --static int ngroups_max = NGROUPS_MAX; -+static int ngroups_max __read_only = NGROUPS_MAX; - static const int cap_last_cap = CAP_LAST_CAP; - - /*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */ - #ifdef CONFIG_DETECT_HUNG_TASK --static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); -+static unsigned long hung_task_timeout_max __read_only = (LONG_MAX/HZ); - #endif - -+int device_sidechannel_restrict __read_mostly = 1; -+EXPORT_SYMBOL(device_sidechannel_restrict); -+ - #ifdef CONFIG_INOTIFY_USER - #include - #endif -@@ -286,19 +297,19 @@ static struct ctl_table sysctl_base_table[] = { - }; - - #ifdef CONFIG_SCHED_DEBUG --static int min_sched_granularity_ns = 100000; /* 100 usecs */ --static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ --static int min_wakeup_granularity_ns; /* 0 usecs */ --static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ -+static int min_sched_granularity_ns __read_only = 100000; /* 100 usecs */ -+static int max_sched_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */ -+static int min_wakeup_granularity_ns __read_only; /* 0 usecs */ -+static int max_wakeup_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */ - #ifdef CONFIG_SMP --static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; --static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -+static int min_sched_tunable_scaling __read_only = SCHED_TUNABLESCALING_NONE; -+static int max_sched_tunable_scaling __read_only = SCHED_TUNABLESCALING_END-1; - #endif /* CONFIG_SMP */ - #endif /* CONFIG_SCHED_DEBUG */ - - 
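The sysctl bounds rewritten above use the __read_only alias that the include/linux/cache.h hunk earlier maps to __ro_after_init: such objects stay writable while the kernel boots, then their section is remapped read-only and any later store faults. An illustrative kernel-style fragment (not part of the patch) showing the intended usage pattern:

    #include <linux/cache.h>
    #include <linux/init.h>

    static int max_widgets __ro_after_init;

    static int __init widgets_setup(void)
    {
            max_widgets = 100;      /* fine: still in the init phase */
            return 0;
    }
    early_initcall(widgets_setup);

    /* After mark_rodata_ro(), `max_widgets = 0;` would oops. */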
#ifdef CONFIG_COMPACTION --static int min_extfrag_threshold; --static int max_extfrag_threshold = 1000; -+static int min_extfrag_threshold __read_only; -+static int max_extfrag_threshold __read_only = 1000; - #endif - - static struct ctl_table kern_table[] = { -@@ -512,6 +523,15 @@ static struct ctl_table kern_table[] = { - .proc_handler = proc_dointvec, - }, - #endif -+#ifdef CONFIG_USER_NS -+ { -+ .procname = "unprivileged_userns_clone", -+ .data = &unprivileged_userns_clone, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec, -+ }, -+#endif - #ifdef CONFIG_PROC_SYSCTL - { - .procname = "tainted", -@@ -853,6 +873,37 @@ static struct ctl_table kern_table[] = { - .extra1 = &zero, - .extra2 = &two, - }, -+#endif -+#if defined CONFIG_TTY -+ { -+ .procname = "tiocsti_restrict", -+ .data = &tiocsti_restrict, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec_minmax_sysadmin, -+ .extra1 = &zero, -+ .extra2 = &one, -+ }, -+#endif -+ { -+ .procname = "device_sidechannel_restrict", -+ .data = &device_sidechannel_restrict, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec_minmax_sysadmin, -+ .extra1 = &zero, -+ .extra2 = &one, -+ }, -+#if IS_ENABLED(CONFIG_USB) -+ { -+ .procname = "deny_new_usb", -+ .data = &deny_new_usb, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec_minmax_sysadmin, -+ .extra1 = &zero, -+ .extra2 = &one, -+ }, - #endif - { - .procname = "ngroups_max", -diff --git a/kernel/time/timer.c b/kernel/time/timer.c -index f17c76a1a05f..50f079d11488 100644 ---- a/kernel/time/timer.c -+++ b/kernel/time/timer.c -@@ -1640,7 +1640,7 @@ static inline void __run_timers(struct timer_base *base) - /* - * This function runs timers and the timer-tq in bottom half context. - */ --static __latent_entropy void run_timer_softirq(struct softirq_action *h) -+static __latent_entropy void run_timer_softirq(void) - { - struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); - -diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index ed80a88980f0..ff6d27d06af0 100644 ---- a/kernel/user_namespace.c -+++ b/kernel/user_namespace.c -@@ -24,6 +24,9 @@ - #include - #include - -+/* sysctl */ -+int unprivileged_userns_clone; -+ - static struct kmem_cache *user_ns_cachep __read_mostly; - static DEFINE_MUTEX(userns_state_mutex); - -diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index 62d0e25c054c..3953072277eb 100644 ---- a/lib/Kconfig.debug -+++ b/lib/Kconfig.debug -@@ -937,6 +937,7 @@ endmenu # "Debug lockups and hangs" - - config PANIC_ON_OOPS - bool "Panic on Oops" -+ default y - help - Say Y here to enable the kernel to panic when it oopses. This - has the same effect as setting oops=panic on the kernel command -@@ -946,7 +947,7 @@ config PANIC_ON_OOPS - anything erroneous after an oops which could result in data - corruption or other issues. - -- Say N if unsure. -+ Say Y if unsure. - - config PANIC_ON_OOPS_VALUE - int -@@ -1319,6 +1320,7 @@ config DEBUG_BUGVERBOSE - config DEBUG_LIST - bool "Debug linked list manipulation" - depends on DEBUG_KERNEL || BUG_ON_DATA_CORRUPTION -+ default y - help - Enable this to turn on extended checks in the linked-list - walking routines. 
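The ctl_table entries above surface the new knobs under /proc/sys/kernel, clamped to the range [0,1] by extra1/extra2; as the handler name suggests, proc_dointvec_minmax_sysadmin additionally rejects writes from callers lacking CAP_SYS_ADMIN. A minimal sketch of flipping one of them from userspace (assumes the patched kernel and sufficient privilege):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/proc/sys/kernel/deny_new_usb", O_WRONLY);
            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* "1" = refuse hot-plugged USB devices from now on. */
            if (write(fd, "1\n", 2) != 2)
                    perror("write");
            close(fd);
            return 0;
    }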
-@@ -1932,6 +1934,7 @@ config MEMTEST - config BUG_ON_DATA_CORRUPTION - bool "Trigger a BUG when data corruption is detected" - select DEBUG_LIST -+ default y - help - Select this option if the kernel should BUG when it encounters - data corruption in kernel memory structures when they get checked -@@ -1952,7 +1955,7 @@ config STRICT_DEVMEM - bool "Filter access to /dev/mem" - depends on MMU && DEVMEM - depends on ARCH_HAS_DEVMEM_IS_ALLOWED -- default y if TILE || PPC -+ default y - ---help--- - If this option is disabled, you allow userspace (root) access to all - of memory, including kernel and userspace memory. Accidental -@@ -1971,6 +1974,7 @@ config STRICT_DEVMEM - config IO_STRICT_DEVMEM - bool "Filter I/O access to /dev/mem" - depends on STRICT_DEVMEM -+ default y - ---help--- - If this option is disabled, you allow userspace (root) access to all - io-memory regardless of whether a driver is actively using that -diff --git a/lib/irq_poll.c b/lib/irq_poll.c -index 86a709954f5a..6f15787fcb1b 100644 ---- a/lib/irq_poll.c -+++ b/lib/irq_poll.c -@@ -75,7 +75,7 @@ void irq_poll_complete(struct irq_poll *iop) - } - EXPORT_SYMBOL(irq_poll_complete); - --static void __latent_entropy irq_poll_softirq(struct softirq_action *h) -+static void __latent_entropy irq_poll_softirq(void) - { - struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); - int rearm = 0, budget = irq_poll_budget; -diff --git a/lib/kobject.c b/lib/kobject.c -index bbbb067de8ec..fec2f780cf9b 100644 ---- a/lib/kobject.c -+++ b/lib/kobject.c -@@ -956,9 +956,9 @@ EXPORT_SYMBOL_GPL(kset_create_and_add); - - - static DEFINE_SPINLOCK(kobj_ns_type_lock); --static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES]; -+static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES] __ro_after_init; - --int kobj_ns_type_register(const struct kobj_ns_type_operations *ops) -+int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops) - { - enum kobj_ns_type type = ops->type; - int error; -diff --git a/lib/nlattr.c b/lib/nlattr.c -index 3d8295c85505..3fa3b3409d69 100644 ---- a/lib/nlattr.c -+++ b/lib/nlattr.c -@@ -341,6 +341,8 @@ int nla_memcpy(void *dest, const struct nlattr *src, int count) - { - int minlen = min_t(int, count, nla_len(src)); - -+ BUG_ON(minlen < 0); -+ - memcpy(dest, nla_data(src), minlen); - if (count > minlen) - memset(dest + minlen, 0, count - minlen); -diff --git a/lib/vsprintf.c b/lib/vsprintf.c -index 4a990f3fd345..3df8db5af0ba 100644 ---- a/lib/vsprintf.c -+++ b/lib/vsprintf.c -@@ -1588,7 +1588,7 @@ char *device_node_string(char *buf, char *end, struct device_node *dn, - return widen_string(buf, buf - buf_start, end, spec); - } - --int kptr_restrict __read_mostly; -+int kptr_restrict __read_mostly = 2; - - /* - * Show a '%p' thing. A kernel extension is that the '%p' is followed -diff --git a/mm/Kconfig b/mm/Kconfig -index 59efbd3337e0..c070e14ec83d 100644 ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -319,7 +319,8 @@ config KSM - config DEFAULT_MMAP_MIN_ADDR - int "Low address space to protect from user allocation" - depends on MMU -- default 4096 -+ default 32768 if ARM || (ARM64 && COMPAT) -+ default 65536 - help - This is the portion of low virtual memory which should be protected - from userspace allocation. 
Keeping a user from writing to low pages -diff --git a/mm/mmap.c b/mm/mmap.c -index 2398776195d2..a8ffa2223ad1 100644 ---- a/mm/mmap.c -+++ b/mm/mmap.c -@@ -220,6 +220,13 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) - - newbrk = PAGE_ALIGN(brk); - oldbrk = PAGE_ALIGN(mm->brk); -+ /* properly handle unaligned min_brk as an empty heap */ -+ if (min_brk & ~PAGE_MASK) { -+ if (brk == min_brk) -+ newbrk -= PAGE_SIZE; -+ if (mm->brk == min_brk) -+ oldbrk -= PAGE_SIZE; -+ } - if (oldbrk == newbrk) - goto set_brk; - -diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index a2f365f40433..5e726e59de20 100644 ---- a/mm/page_alloc.c -+++ b/mm/page_alloc.c -@@ -67,6 +67,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -98,6 +99,15 @@ int _node_numa_mem_[MAX_NUMNODES]; - DEFINE_MUTEX(pcpu_drain_mutex); - DEFINE_PER_CPU(struct work_struct, pcpu_drain); - -+bool __meminitdata extra_latent_entropy; -+ -+static int __init setup_extra_latent_entropy(char *str) -+{ -+ extra_latent_entropy = true; -+ return 0; -+} -+early_param("extra_latent_entropy", setup_extra_latent_entropy); -+ - #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY - volatile unsigned long latent_entropy __latent_entropy; - EXPORT_SYMBOL(latent_entropy); -@@ -1063,6 +1073,13 @@ static __always_inline bool free_pages_prepare(struct page *page, - debug_check_no_obj_freed(page_address(page), - PAGE_SIZE << order); - } -+ -+ if (IS_ENABLED(CONFIG_PAGE_SANITIZE)) { -+ int i; -+ for (i = 0; i < (1 << order); i++) -+ clear_highpage(page + i); -+ } -+ - arch_free_page(page, order); - kernel_poison_pages(page, 1 << order, 0); - kernel_map_pages(page, 1 << order, 0); -@@ -1278,6 +1295,21 @@ static void __init __free_pages_boot_core(struct page *page, unsigned int order) - __ClearPageReserved(p); - set_page_count(p, 0); - -+ if (extra_latent_entropy && !PageHighMem(page) && page_to_pfn(page) < 0x100000) { -+ unsigned long hash = 0; -+ size_t index, end = PAGE_SIZE * nr_pages / sizeof hash; -+ const unsigned long *data = lowmem_page_address(page); -+ -+ for (index = 0; index < end; index++) -+ hash ^= hash + data[index]; -+#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY -+ latent_entropy ^= hash; -+ add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy)); -+#else -+ add_device_randomness((const void *)&hash, sizeof(hash)); -+#endif -+ } -+ - page_zone(page)->managed_pages += nr_pages; - set_page_refcounted(page); - __free_pages(page, order); -@@ -1718,8 +1750,8 @@ static inline int check_new_page(struct page *page) - - static inline bool free_pages_prezeroed(void) - { -- return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && -- page_poisoning_enabled(); -+ return IS_ENABLED(CONFIG_PAGE_SANITIZE) || -+ (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && page_poisoning_enabled()); - } - - #ifdef CONFIG_DEBUG_VM -@@ -1776,6 +1808,11 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags - - post_alloc_hook(page, order, gfp_flags); - -+ if (IS_ENABLED(CONFIG_PAGE_SANITIZE_VERIFY)) { -+ for (i = 0; i < (1 << order); i++) -+ verify_zero_highpage(page + i); -+ } -+ - if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO)) - for (i = 0; i < (1 << order); i++) - clear_highpage(page + i); -diff --git a/mm/slab.h b/mm/slab.h -index 485d9fbb8802..436461588804 100644 ---- a/mm/slab.h -+++ b/mm/slab.h -@@ -311,7 +311,11 @@ static inline bool is_root_cache(struct kmem_cache *s) - static inline bool slab_equal_or_root(struct kmem_cache *s, - struct kmem_cache *p) - { -+#ifdef CONFIG_SLAB_HARDENED -+ return p == s; -+#else - 
return true; -+#endif - } - - static inline const char *cache_name(struct kmem_cache *s) -@@ -363,18 +367,26 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) - * to not do even the assignment. In that case, slab_equal_or_root - * will also be a constant. - */ -- if (!memcg_kmem_enabled() && -+ if (!IS_ENABLED(CONFIG_SLAB_HARDENED) && -+ !memcg_kmem_enabled() && - !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS)) - return s; - - page = virt_to_head_page(x); -+#ifdef CONFIG_SLAB_HARDENED -+ BUG_ON(!PageSlab(page)); -+#endif - cachep = page->slab_cache; - if (slab_equal_or_root(cachep, s)) - return cachep; - - pr_err("%s: Wrong slab cache. %s but object is from %s\n", - __func__, s->name, cachep->name); -+#ifdef CONFIG_BUG_ON_DATA_CORRUPTION -+ BUG_ON(1); -+#else - WARN_ON_ONCE(1); -+#endif - return s; - } - -@@ -399,7 +411,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) - * back there or track user information then we can - * only use the space before that information. - */ -- if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) -+ if ((s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) || IS_ENABLED(CONFIG_SLAB_CANARY)) - return s->inuse; - /* - * Else we can use all the padding etc for the allocation -diff --git a/mm/slab_common.c b/mm/slab_common.c -index f6764cf162b8..015c8e4df318 100644 ---- a/mm/slab_common.c -+++ b/mm/slab_common.c -@@ -26,10 +26,10 @@ - - #include "slab.h" - --enum slab_state slab_state; -+enum slab_state slab_state __ro_after_init; - LIST_HEAD(slab_caches); - DEFINE_MUTEX(slab_mutex); --struct kmem_cache *kmem_cache; -+struct kmem_cache *kmem_cache __ro_after_init; - - static LIST_HEAD(slab_caches_to_rcu_destroy); - static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work); -@@ -49,7 +49,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, - /* - * Merge control. If this is set then no merging of slab caches will occur. - */ --static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); -+static bool slab_nomerge __ro_after_init = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); - - static int __init setup_slab_nomerge(char *str) - { -@@ -931,7 +931,7 @@ EXPORT_SYMBOL(kmalloc_dma_caches); - * of two cache sizes there. The size of larger slabs can be determined using - * fls. 
- */ --static s8 size_index[24] = { -+static s8 size_index[24] __ro_after_init = { - 3, /* 8 */ - 4, /* 16 */ - 5, /* 24 */ -diff --git a/mm/slub.c b/mm/slub.c -index 220d42e592ef..3decf87b1cf2 100644 ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -125,6 +125,16 @@ static inline int kmem_cache_debug(struct kmem_cache *s) - #endif - } - -+static inline bool has_sanitize(struct kmem_cache *s) -+{ -+ return IS_ENABLED(CONFIG_SLAB_SANITIZE) && !(s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)); -+} -+ -+static inline bool has_sanitize_verify(struct kmem_cache *s) -+{ -+ return IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && has_sanitize(s); -+} -+ - void *fixup_red_left(struct kmem_cache *s, void *p) - { - if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) -@@ -297,6 +307,35 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) - *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr); - } - -+#ifdef CONFIG_SLAB_CANARY -+static inline unsigned long *get_canary(struct kmem_cache *s, void *object) -+{ -+ if (s->offset) -+ return object + s->offset + sizeof(void *); -+ return object + s->inuse; -+} -+ -+static inline unsigned long get_canary_value(const void *canary, unsigned long value) -+{ -+ return (value ^ (unsigned long)canary) & CANARY_MASK; -+} -+ -+static inline void set_canary(struct kmem_cache *s, void *object, unsigned long value) -+{ -+ unsigned long *canary = get_canary(s, object); -+ *canary = get_canary_value(canary, value); -+} -+ -+static inline void check_canary(struct kmem_cache *s, void *object, unsigned long value) -+{ -+ unsigned long *canary = get_canary(s, object); -+ BUG_ON(*canary != get_canary_value(canary, value)); -+} -+#else -+#define set_canary(s, object, value) -+#define check_canary(s, object, value) -+#endif -+ - /* Loop over all objects in a slab */ - #define for_each_object(__p, __s, __addr, __objects) \ - for (__p = fixup_red_left(__s, __addr); \ -@@ -484,13 +523,13 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p) - * Debug settings: - */ - #if defined(CONFIG_SLUB_DEBUG_ON) --static int slub_debug = DEBUG_DEFAULT_FLAGS; -+static int slub_debug __ro_after_init = DEBUG_DEFAULT_FLAGS; - #else --static int slub_debug; -+static int slub_debug __ro_after_init; - #endif - --static char *slub_debug_slabs; --static int disable_higher_order_debug; -+static char *slub_debug_slabs __ro_after_init; -+static int disable_higher_order_debug __ro_after_init; - - /* - * slub is about to manipulate internal object metadata. This memory lies -@@ -550,6 +589,9 @@ static struct track *get_track(struct kmem_cache *s, void *object, - else - p = object + s->inuse; - -+ if (IS_ENABLED(CONFIG_SLAB_CANARY)) -+ p = (void *)p + sizeof(void *); -+ - return p + alloc; - } - -@@ -688,6 +730,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) - else - off = s->inuse; - -+ if (IS_ENABLED(CONFIG_SLAB_CANARY)) -+ off += sizeof(void *); -+ - if (s->flags & SLAB_STORE_USER) - off += 2 * sizeof(struct track); - -@@ -817,6 +862,9 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) - /* Freepointer is placed after the object. 
*/ - off += sizeof(void *); - -+ if (IS_ENABLED(CONFIG_SLAB_CANARY)) -+ off += sizeof(void *); -+ - if (s->flags & SLAB_STORE_USER) - /* We also have user information there */ - off += 2 * sizeof(struct track); -@@ -1416,8 +1464,9 @@ static void setup_object(struct kmem_cache *s, struct page *page, - void *object) - { - setup_object_debug(s, page, object); -+ set_canary(s, object, s->random_inactive); - kasan_init_slab_obj(s, object); -- if (unlikely(s->ctor)) { -+ if (unlikely(s->ctor) && !has_sanitize_verify(s)) { - kasan_unpoison_object_data(s, object); - s->ctor(object); - kasan_poison_object_data(s, object); -@@ -2717,9 +2766,21 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, - stat(s, ALLOC_FASTPATH); - } - -- if (unlikely(gfpflags & __GFP_ZERO) && object) -+ if (has_sanitize_verify(s) && object) { -+ size_t offset = s->offset ? 0 : sizeof(void *); -+ BUG_ON(memchr_inv(object + offset, 0, s->object_size - offset)); -+ if (s->ctor) -+ s->ctor(object); -+ if (unlikely(gfpflags & __GFP_ZERO) && offset) -+ memset(object, 0, sizeof(void *)); -+ } else if (unlikely(gfpflags & __GFP_ZERO) && object) - memset(object, 0, s->object_size); - -+ if (object) { -+ check_canary(s, object, s->random_inactive); -+ set_canary(s, object, s->random_active); -+ } -+ - slab_post_alloc_hook(s, gfpflags, 1, &object); - - return object; -@@ -2926,6 +2987,27 @@ static __always_inline void do_slab_free(struct kmem_cache *s, - void *tail_obj = tail ? : head; - struct kmem_cache_cpu *c; - unsigned long tid; -+ bool sanitize = has_sanitize(s); -+ -+ if (IS_ENABLED(CONFIG_SLAB_CANARY) || sanitize) { -+ __maybe_unused int offset = s->offset ? 0 : sizeof(void *); -+ void *x = head; -+ -+ while (1) { -+ check_canary(s, x, s->random_active); -+ set_canary(s, x, s->random_inactive); -+ -+ if (sanitize) { -+ memset(x + offset, 0, s->object_size - offset); -+ if (!IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && s->ctor) -+ s->ctor(x); -+ } -+ if (x == tail_obj) -+ break; -+ x = get_freepointer(s, x); -+ } -+ } -+ - redo: - /* - * Determine the currently cpus per cpu slab. -@@ -3104,7 +3186,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) - { - struct kmem_cache_cpu *c; -- int i; -+ int i, k; - - /* memcg and kmem_cache debug support */ - s = slab_pre_alloc_hook(s, flags); -@@ -3141,13 +3223,29 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - local_irq_enable(); - - /* Clear memory outside IRQ disabled fastpath loop */ -- if (unlikely(flags & __GFP_ZERO)) { -+ if (has_sanitize_verify(s)) { -+ int j; -+ -+ for (j = 0; j < i; j++) { -+ size_t offset = s->offset ? 0 : sizeof(void *); -+ BUG_ON(memchr_inv(p[j] + offset, 0, s->object_size - offset)); -+ if (s->ctor) -+ s->ctor(p[j]); -+ if (unlikely(flags & __GFP_ZERO) && offset) -+ memset(p[j], 0, sizeof(void *)); -+ } -+ } else if (unlikely(flags & __GFP_ZERO)) { - int j; - - for (j = 0; j < i; j++) - memset(p[j], 0, s->object_size); - } - -+ for (k = 0; k < i; k++) { -+ check_canary(s, p[k], s->random_inactive); -+ set_canary(s, p[k], s->random_active); -+ } -+ - /* memcg and kmem_cache debug support */ - slab_post_alloc_hook(s, flags, size, p); - return i; -@@ -3179,9 +3277,9 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); - * and increases the number of allocations possible without having to - * take the list_lock. 
- */ --static int slub_min_order; --static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; --static int slub_min_objects; -+static int slub_min_order __ro_after_init; -+static int slub_max_order __ro_after_init = PAGE_ALLOC_COSTLY_ORDER; -+static int slub_min_objects __ro_after_init; - - /* - * Calculate the order of allocation given an slab object size. -@@ -3351,6 +3449,7 @@ static void early_kmem_cache_node_alloc(int node) - init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); - init_tracking(kmem_cache_node, n); - #endif -+ set_canary(kmem_cache_node, n, kmem_cache_node->random_active); - kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), - GFP_KERNEL); - init_kmem_cache_node(n); -@@ -3507,6 +3606,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) - size += sizeof(void *); - } - -+ if (IS_ENABLED(CONFIG_SLAB_CANARY)) -+ size += sizeof(void *); -+ - #ifdef CONFIG_SLUB_DEBUG - if (flags & SLAB_STORE_USER) - /* -@@ -3577,6 +3679,10 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) - #ifdef CONFIG_SLAB_FREELIST_HARDENED - s->random = get_random_long(); - #endif -+#ifdef CONFIG_SLAB_CANARY -+ s->random_active = get_random_long(); -+ s->random_inactive = get_random_long(); -+#endif - - if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) - s->reserved = sizeof(struct rcu_head); -@@ -3841,6 +3947,8 @@ const char *__check_heap_object(const void *ptr, unsigned long n, - offset -= s->red_left_pad; - } - -+ check_canary(s, (void *)ptr - offset, s->random_active); -+ - /* Allow address range falling entirely within object size. */ - if (offset <= object_size && n <= object_size - offset) - return NULL; -@@ -3859,7 +3967,11 @@ static size_t __ksize(const void *object) - page = virt_to_head_page(object); - - if (unlikely(!PageSlab(page))) { -+#ifdef CONFIG_BUG_ON_DATA_CORRUPTION -+ BUG_ON(!PageCompound(page)); -+#else - WARN_ON(!PageCompound(page)); -+#endif - return PAGE_SIZE << compound_order(page); - } - -@@ -4724,7 +4836,7 @@ enum slab_stat_type { - #define SO_TOTAL (1 << SL_TOTAL) - - #ifdef CONFIG_MEMCG --static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); -+static bool memcg_sysfs_enabled __ro_after_init = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); - - static int __init setup_slub_memcg_sysfs(char *str) - { -diff --git a/mm/swap.c b/mm/swap.c -index a77d68f2c1b6..d1f1d75f4d1f 100644 ---- a/mm/swap.c -+++ b/mm/swap.c -@@ -92,6 +92,13 @@ static void __put_compound_page(struct page *page) - if (!PageHuge(page)) - __page_cache_release(page); - dtor = get_compound_page_dtor(page); -+ if (!PageHuge(page)) -+ BUG_ON(dtor != free_compound_page -+#ifdef CONFIG_TRANSPARENT_HUGEPAGE -+ && dtor != free_transhuge_page -+#endif -+ ); -+ - (*dtor)(page); - } - -diff --git a/net/core/dev.c b/net/core/dev.c -index 4337450a5fdb..5a3c7d217719 100644 ---- a/net/core/dev.c -+++ b/net/core/dev.c -@@ -4117,7 +4117,7 @@ int netif_rx_ni(struct sk_buff *skb) - } - EXPORT_SYMBOL(netif_rx_ni); - --static __latent_entropy void net_tx_action(struct softirq_action *h) -+static __latent_entropy void net_tx_action(void) - { - struct softnet_data *sd = this_cpu_ptr(&softnet_data); - -@@ -5635,7 +5635,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) - return work; - } - --static __latent_entropy void net_rx_action(struct softirq_action *h) -+static __latent_entropy void net_rx_action(void) - { - struct softnet_data *sd = this_cpu_ptr(&softnet_data); - unsigned long time_limit = jiffies + -diff --git a/net/ipv4/Kconfig 
b/net/ipv4/Kconfig -index f48fe6fc7e8c..d78c52835c08 100644 ---- a/net/ipv4/Kconfig -+++ b/net/ipv4/Kconfig -@@ -261,6 +261,7 @@ config IP_PIMSM_V2 - - config SYN_COOKIES - bool "IP: TCP syncookie support" -+ default y - ---help--- - Normal TCP/IP networking is open to an attack known as "SYN - flooding". This denial-of-service attack prevents legitimate remote -diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c -index 957f6041dd79..7be404c9fb47 100644 ---- a/scripts/mod/modpost.c -+++ b/scripts/mod/modpost.c -@@ -37,6 +37,7 @@ static int vmlinux_section_warnings = 1; - static int warn_unresolved = 0; - /* How a symbol is exported */ - static int sec_mismatch_count = 0; -+static int writable_fptr_count = 0; - static int sec_mismatch_verbose = 1; - static int sec_mismatch_fatal = 0; - /* ignore missing files */ -@@ -965,6 +966,7 @@ enum mismatch { - ANY_EXIT_TO_ANY_INIT, - EXPORT_TO_INIT_EXIT, - EXTABLE_TO_NON_TEXT, -+ DATA_TO_TEXT - }; - - /** -@@ -1091,6 +1093,12 @@ static const struct sectioncheck sectioncheck[] = { - .good_tosec = {ALL_TEXT_SECTIONS , NULL}, - .mismatch = EXTABLE_TO_NON_TEXT, - .handler = extable_mismatch_handler, -+}, -+/* Do not reference code from writable data */ -+{ -+ .fromsec = { DATA_SECTIONS, NULL }, -+ .bad_tosec = { ALL_TEXT_SECTIONS, NULL }, -+ .mismatch = DATA_TO_TEXT - } - }; - -@@ -1240,10 +1248,10 @@ static Elf_Sym *find_elf_symbol(struct elf_info *elf, Elf64_Sword addr, - continue; - if (ELF_ST_TYPE(sym->st_info) == STT_SECTION) - continue; -- if (sym->st_value == addr) -- return sym; - /* Find a symbol nearby - addr are maybe negative */ - d = sym->st_value - addr; -+ if (d == 0) -+ return sym; - if (d < 0) - d = addr - sym->st_value; - if (d < distance) { -@@ -1402,7 +1410,11 @@ static void report_sec_mismatch(const char *modname, - char *prl_from; - char *prl_to; - -- sec_mismatch_count++; -+ if (mismatch->mismatch == DATA_TO_TEXT) -+ writable_fptr_count++; -+ else -+ sec_mismatch_count++; -+ - if (!sec_mismatch_verbose) - return; - -@@ -1526,6 +1538,14 @@ static void report_sec_mismatch(const char *modname, - fatal("There's a special handler for this mismatch type, " - "we should never get here."); - break; -+ case DATA_TO_TEXT: -+#if 0 -+ fprintf(stderr, -+ "The %s %s:%s references\n" -+ "the %s %s:%s%s\n", -+ from, fromsec, fromsym, to, tosec, tosym, to_p); -+#endif -+ break; - } - fprintf(stderr, "\n"); - } -@@ -2539,6 +2559,14 @@ int main(int argc, char **argv) - } - } - free(buf.p); -+ if (writable_fptr_count) { -+ if (!sec_mismatch_verbose) { -+ warn("modpost: Found %d writable function pointer(s).\n" -+ "To see full details build your kernel with:\n" -+ "'make CONFIG_DEBUG_SECTION_MISMATCH=y'\n", -+ writable_fptr_count); -+ } -+ } - - return err; - } -diff --git a/security/Kconfig b/security/Kconfig -index 87f2a6f842fd..7bdbb7edf5bf 100644 ---- a/security/Kconfig -+++ b/security/Kconfig -@@ -8,7 +8,7 @@ source security/keys/Kconfig - - config SECURITY_DMESG_RESTRICT - bool "Restrict unprivileged access to the kernel syslog" -- default n -+ default y - help - This enforces restrictions on unprivileged users reading the kernel - syslog via dmesg(8). -@@ -18,10 +18,34 @@ config SECURITY_DMESG_RESTRICT - - If you are unsure how to answer this question, answer N. 
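The scripts/mod/modpost.c hunks above add a DATA_TO_TEXT mismatch class: a reference from a writable data section into a text section usually means a mutable function pointer, a favorite target for overwrite attacks. A sketch of the pattern being counted versus a variant that should escape it (assuming the usual section placement, since a const-qualified pointer lands in .rodata rather than .data):

    static void do_work(void) { }

    /* Writable .data: exactly what writable_fptr_count tallies. */
    static void (*hook)(void) = do_work;

    /* Const pointer goes to .rodata, out of DATA_SECTIONS' reach. */
    static void (*const hook_ro)(void) = do_work;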
- -+config SECURITY_PERF_EVENTS_RESTRICT -+ bool "Restrict unprivileged use of performance events" -+ depends on PERF_EVENTS -+ default y -+ help -+ If you say Y here, the kernel.perf_event_paranoid sysctl -+ will be set to 3 by default, and no unprivileged use of the -+ perf_event_open syscall will be permitted unless it is -+ changed. -+ -+config SECURITY_TIOCSTI_RESTRICT -+ bool "Restrict unprivileged use of tiocsti command injection" -+ default y -+ help -+ This enforces restrictions on unprivileged users injecting commands -+ into other processes which share a tty session using the TIOCSTI -+ ioctl. This option makes TIOCSTI use require CAP_SYS_ADMIN. -+ -+ If this option is not selected, no restrictions will be enforced -+ unless the tiocsti_restrict sysctl is explicitly set to (1). -+ -+ If you are unsure how to answer this question, answer N. -+ - config SECURITY - bool "Enable different security models" - depends on SYSFS - depends on MULTIUSER -+ default y - help - This allows you to choose different security modules to be - configured into your kernel. -@@ -48,6 +72,7 @@ config SECURITYFS - config SECURITY_NETWORK - bool "Socket and Networking Security Hooks" - depends on SECURITY -+ default y - help - This enables the socket and networking security hooks. - If enabled, a security module can use these hooks to -@@ -155,6 +180,7 @@ config HARDENED_USERCOPY - depends on HAVE_HARDENED_USERCOPY_ALLOCATOR - select BUG - imply STRICT_DEVMEM -+ default y - help - This option checks for obviously wrong memory regions when - copying memory to/from the kernel (via copy_to_user() and -@@ -178,10 +204,36 @@ config HARDENED_USERCOPY_PAGESPAN - config FORTIFY_SOURCE - bool "Harden common str/mem functions against buffer overflows" - depends on ARCH_HAS_FORTIFY_SOURCE -+ default y - help - Detect overflows of buffers in common string and memory functions - where the compiler can determine and validate the buffer sizes. - -+config FORTIFY_SOURCE_STRICT_STRING -+ bool "Harden common functions against buffer overflows" -+ depends on FORTIFY_SOURCE -+ depends on EXPERT -+ help -+ Perform stricter overflow checks catching overflows within objects -+ for common C string functions rather than only between objects. -+ -+ This is not yet intended for production use, only bug finding. -+ -+config PAGE_SANITIZE -+ bool "Sanitize pages" -+ default y -+ help -+ Zero fill page allocations on free, reducing the lifetime of -+ sensitive data and helping to mitigate use-after-free bugs. -+ -+config PAGE_SANITIZE_VERIFY -+ bool "Verify sanitized pages" -+ depends on PAGE_SANITIZE -+ default y -+ help -+ Verify that newly allocated pages are zeroed to detect -+ write-after-free bugs. -+ - config STATIC_USERMODEHELPER - bool "Force all usermode helper calls through a single binary" - help -diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig -index 8af7a690eb40..6539694b0fd3 100644 ---- a/security/selinux/Kconfig -+++ b/security/selinux/Kconfig -@@ -2,7 +2,7 @@ config SECURITY_SELINUX - bool "NSA SELinux Support" - depends on SECURITY_NETWORK && AUDIT && NET && INET - select NETWORK_SECMARK -- default n -+ default y - help - This selects NSA Security-Enhanced Linux (SELinux). - You will also need a policy configuration and a labeled filesystem. -@@ -79,23 +79,3 @@ config SECURITY_SELINUX_AVC_STATS - This option collects access vector cache statistics to - /selinux/avc/cache_stats, which may be monitored via - tools such as avcstat. 
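The TIOCSTI restriction described above targets a classic escape: any process holding a file descriptor to a shared tty can queue bytes as if they were typed, injecting commands into whatever shell reads that terminal next. A minimal demonstration of the ioctl being restricted; with tiocsti_restrict=1 and no CAP_SYS_ADMIN it should fail with EPERM:

    #include <stdio.h>
    #include <sys/ioctl.h>

    int main(void)
    {
            const char *cmd = "id\n";
            /* TIOCSTI pushes one byte at a time into the tty input queue. */
            for (const char *p = cmd; *p; p++)
                    if (ioctl(0, TIOCSTI, p) < 0) {
                            perror("TIOCSTI");
                            return 1;
                    }
            return 0;
    }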
-- --config SECURITY_SELINUX_CHECKREQPROT_VALUE -- int "NSA SELinux checkreqprot default value" -- depends on SECURITY_SELINUX -- range 0 1 -- default 0 -- help -- This option sets the default value for the 'checkreqprot' flag -- that determines whether SELinux checks the protection requested -- by the application or the protection that will be applied by the -- kernel (including any implied execute for read-implies-exec) for -- mmap and mprotect calls. If this option is set to 0 (zero), -- SELinux will default to checking the protection that will be applied -- by the kernel. If this option is set to 1 (one), SELinux will -- default to checking the protection requested by the application. -- The checkreqprot flag may be changed from the default via the -- 'checkreqprot=' boot parameter. It may also be changed at runtime -- via /selinux/checkreqprot if authorized by policy. -- -- If you are unsure how to answer this question, answer 0. -diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h -index 1649cd18eb0b..067f35559aa7 100644 ---- a/security/selinux/include/objsec.h -+++ b/security/selinux/include/objsec.h -@@ -150,6 +150,6 @@ struct pkey_security_struct { - u32 sid; /* SID of pkey */ - }; - --extern unsigned int selinux_checkreqprot; -+extern const unsigned int selinux_checkreqprot; - - #endif /* _SELINUX_OBJSEC_H_ */ -diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c -index 00eed842c491..8f7b8d7e6f91 100644 ---- a/security/selinux/selinuxfs.c -+++ b/security/selinux/selinuxfs.c -@@ -41,16 +41,7 @@ - #include "objsec.h" - #include "conditional.h" - --unsigned int selinux_checkreqprot = CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE; -- --static int __init checkreqprot_setup(char *str) --{ -- unsigned long checkreqprot; -- if (!kstrtoul(str, 0, &checkreqprot)) -- selinux_checkreqprot = checkreqprot ? 1 : 0; -- return 1; --} --__setup("checkreqprot=", checkreqprot_setup); -+const unsigned int selinux_checkreqprot; - - static DEFINE_MUTEX(sel_mutex); - -@@ -610,10 +601,9 @@ static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, - return PTR_ERR(page); - - length = -EINVAL; -- if (sscanf(page, "%u", &new_value) != 1) -+ if (sscanf(page, "%u", &new_value) != 1 || new_value) - goto out; - -- selinux_checkreqprot = new_value ? 
1 : 0; - length = count; - out: - kfree(page); -diff --git a/security/yama/Kconfig b/security/yama/Kconfig -index 96b27405558a..485c1b85c325 100644 ---- a/security/yama/Kconfig -+++ b/security/yama/Kconfig -@@ -1,7 +1,7 @@ - config SECURITY_YAMA - bool "Yama support" - depends on SECURITY -- default n -+ default y - help - This selects Yama, which extends DAC support with additional - system-wide security settings beyond regular Linux discretionary diff --git a/sys-kernel/linux-sources-redcore-lts/files/mute-pps_state_mismatch.patch b/sys-kernel/linux-sources-redcore-lts/files/mute-pps_state_mismatch.patch deleted file mode 100644 index dc1d254b..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/mute-pps_state_mismatch.patch +++ /dev/null @@ -1,16 +0,0 @@ -diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c -index 09f274419..595bc5844 100644 ---- a/drivers/gpu/drm/i915/intel_dp.c -+++ b/drivers/gpu/drm/i915/intel_dp.c -@@ -5249,7 +5249,10 @@ intel_pps_verify_state(struct drm_i915_private *dev_priv, - - if (hw.t1_t3 != sw->t1_t3 || hw.t8 != sw->t8 || hw.t9 != sw->t9 || - hw.t10 != sw->t10 || hw.t11_t12 != sw->t11_t12) { -- DRM_ERROR("PPS state mismatch\n"); -+ /* seem buggy on 4.14.x .. mute that for now -+ * even is not a real solution .. -+ * DRM_ERROR("PPS state mismatch\n"); -+ */ - intel_pps_dump_state("sw", sw); - intel_pps_dump_state("hw", &hw); - } diff --git a/sys-kernel/linux-sources-redcore-lts/files/redcore-lts-amd64.config b/sys-kernel/linux-sources-redcore-lts/files/redcore-lts-amd64.config deleted file mode 100644 index 23e35863..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/redcore-lts-amd64.config +++ /dev/null @@ -1,9106 +0,0 @@ -# -# Automatically generated file; DO NOT EDIT. -# Linux/x86 4.14.90-redcore-lts Kernel Configuration -# -CONFIG_64BIT=y -CONFIG_X86_64=y -CONFIG_X86=y -CONFIG_INSTRUCTION_DECODER=y -CONFIG_OUTPUT_FORMAT="elf64-x86-64" -CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig" -CONFIG_LOCKDEP_SUPPORT=y -CONFIG_STACKTRACE_SUPPORT=y -CONFIG_MMU=y -CONFIG_ARCH_MMAP_RND_BITS_MIN=28 -CONFIG_ARCH_MMAP_RND_BITS_MAX=32 -CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8 -CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16 -CONFIG_NEED_DMA_MAP_STATE=y -CONFIG_NEED_SG_DMA_LENGTH=y -CONFIG_GENERIC_ISA_DMA=y -CONFIG_GENERIC_BUG=y -CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y -CONFIG_GENERIC_HWEIGHT=y -CONFIG_ARCH_MAY_HAVE_PC_FDC=y -CONFIG_RWSEM_XCHGADD_ALGORITHM=y -CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_ARCH_HAS_CPU_RELAX=y -CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y -CONFIG_HAVE_SETUP_PER_CPU_AREA=y -CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y -CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y -CONFIG_ARCH_HIBERNATION_POSSIBLE=y -CONFIG_ARCH_SUSPEND_POSSIBLE=y -CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y -CONFIG_ARCH_WANT_GENERAL_HUGETLB=y -CONFIG_ZONE_DMA32=y -CONFIG_AUDIT_ARCH=y -CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y -CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y -CONFIG_HAVE_INTEL_TXT=y -CONFIG_X86_64_SMP=y -CONFIG_ARCH_SUPPORTS_UPROBES=y -CONFIG_FIX_EARLYCON_MEM=y -CONFIG_PGTABLE_LEVELS=4 -CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" -CONFIG_IRQ_WORK=y -CONFIG_BUILDTIME_EXTABLE_SORT=y -CONFIG_THREAD_INFO_IN_TASK=y - -# -# General setup -# -CONFIG_SCHED_MUQSS=y -CONFIG_INIT_ENV_ARG_LIMIT=32 -CONFIG_CROSS_COMPILE="" -# CONFIG_COMPILE_TEST is not set -CONFIG_LOCALVERSION="" -CONFIG_LOCALVERSION_AUTO=y -CONFIG_HAVE_KERNEL_GZIP=y -CONFIG_HAVE_KERNEL_BZIP2=y -CONFIG_HAVE_KERNEL_LZMA=y -CONFIG_HAVE_KERNEL_XZ=y -CONFIG_HAVE_KERNEL_LZO=y -CONFIG_HAVE_KERNEL_LZ4=y -# 
CONFIG_KERNEL_GZIP is not set -# CONFIG_KERNEL_BZIP2 is not set -# CONFIG_KERNEL_LZMA is not set -# CONFIG_KERNEL_XZ is not set -# CONFIG_KERNEL_LZO is not set -CONFIG_KERNEL_LZ4=y -CONFIG_DEFAULT_HOSTNAME="(none)" -CONFIG_SWAP=y -CONFIG_SYSVIPC=y -CONFIG_SYSVIPC_SYSCTL=y -CONFIG_POSIX_MQUEUE=y -CONFIG_POSIX_MQUEUE_SYSCTL=y -CONFIG_CROSS_MEMORY_ATTACH=y -CONFIG_FHANDLE=y -# CONFIG_USELIB is not set -CONFIG_AUDIT=y -CONFIG_HAVE_ARCH_AUDITSYSCALL=y -CONFIG_AUDITSYSCALL=y -CONFIG_AUDIT_WATCH=y -CONFIG_AUDIT_TREE=y - -# -# IRQ subsystem -# -CONFIG_GENERIC_IRQ_PROBE=y -CONFIG_GENERIC_IRQ_SHOW=y -CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y -CONFIG_GENERIC_PENDING_IRQ=y -CONFIG_GENERIC_IRQ_MIGRATION=y -CONFIG_GENERIC_IRQ_CHIP=y -CONFIG_IRQ_DOMAIN=y -CONFIG_IRQ_SIM=y -CONFIG_IRQ_DOMAIN_HIERARCHY=y -CONFIG_GENERIC_MSI_IRQ=y -CONFIG_GENERIC_MSI_IRQ_DOMAIN=y -# CONFIG_IRQ_DOMAIN_DEBUG is not set -CONFIG_IRQ_FORCED_THREADING=y -CONFIG_FORCE_IRQ_THREADING=y -CONFIG_SPARSE_IRQ=y -# CONFIG_GENERIC_IRQ_DEBUGFS is not set -CONFIG_CLOCKSOURCE_WATCHDOG=y -CONFIG_ARCH_CLOCKSOURCE_DATA=y -CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y -CONFIG_GENERIC_TIME_VSYSCALL=y -CONFIG_GENERIC_CLOCKEVENTS=y -CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y -CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y -CONFIG_GENERIC_CMOS_UPDATE=y - -# -# Timers subsystem -# -CONFIG_TICK_ONESHOT=y -CONFIG_HZ_PERIODIC=y -# CONFIG_NO_HZ_IDLE is not set -# CONFIG_NO_HZ_FULL is not set -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y - -# -# CPU/Task time and stats accounting -# -CONFIG_VIRT_CPU_ACCOUNTING=y -# CONFIG_TICK_CPU_ACCOUNTING is not set -CONFIG_VIRT_CPU_ACCOUNTING_GEN=y -CONFIG_IRQ_TIME_ACCOUNTING=y -CONFIG_BSD_PROCESS_ACCT=y -# CONFIG_BSD_PROCESS_ACCT_V3 is not set -CONFIG_TASKSTATS=y -CONFIG_TASK_DELAY_ACCT=y -CONFIG_TASK_XACCT=y -CONFIG_TASK_IO_ACCOUNTING=y - -# -# RCU Subsystem -# -CONFIG_PREEMPT_RCU=y -# CONFIG_RCU_EXPERT is not set -CONFIG_SRCU=y -CONFIG_TREE_SRCU=y -CONFIG_TASKS_RCU=y -CONFIG_RCU_STALL_COMMON=y -CONFIG_RCU_NEED_SEGCBLIST=y -CONFIG_CONTEXT_TRACKING=y -# CONFIG_CONTEXT_TRACKING_FORCE is not set -CONFIG_BUILD_BIN2C=y -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -CONFIG_LOG_BUF_SHIFT=17 -CONFIG_LOG_CPU_MAX_BUF_SHIFT=13 -CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=13 -CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y -CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y -CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y -CONFIG_ARCH_SUPPORTS_INT128=y -CONFIG_CGROUPS=y -CONFIG_PAGE_COUNTER=y -CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y -CONFIG_MEMCG_SWAP_ENABLED=y -CONFIG_BLK_CGROUP=y -# CONFIG_DEBUG_BLK_CGROUP is not set -CONFIG_CGROUP_WRITEBACK=y -CONFIG_CGROUP_SCHED=y -CONFIG_CGROUP_PIDS=y -# CONFIG_CGROUP_RDMA is not set -CONFIG_CGROUP_FREEZER=y -CONFIG_CGROUP_HUGETLB=y -CONFIG_CPUSETS=y -CONFIG_PROC_PID_CPUSET=y -CONFIG_CGROUP_DEVICE=y -CONFIG_CGROUP_PERF=y -CONFIG_CGROUP_BPF=y -# CONFIG_CGROUP_DEBUG is not set -CONFIG_SOCK_CGROUP_DATA=y -# CONFIG_CHECKPOINT_RESTORE is not set -CONFIG_NAMESPACES=y -CONFIG_UTS_NS=y -CONFIG_IPC_NS=y -CONFIG_USER_NS=y -CONFIG_PID_NS=y -CONFIG_NET_NS=y -# CONFIG_SYSFS_DEPRECATED is not set -CONFIG_RELAY=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_INITRAMFS_SOURCE="" -CONFIG_RD_GZIP=y -CONFIG_RD_BZIP2=y -CONFIG_RD_LZMA=y -CONFIG_RD_XZ=y -CONFIG_RD_LZO=y -CONFIG_RD_LZ4=y -CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y -# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -# CONFIG_LOCAL_INIT is not set -CONFIG_SYSCTL=y -CONFIG_ANON_INODES=y -CONFIG_HAVE_UID16=y -CONFIG_SYSCTL_EXCEPTION_TRACE=y -CONFIG_HAVE_PCSPKR_PLATFORM=y -CONFIG_BPF=y -# CONFIG_EXPERT is not set -CONFIG_UID16=y 
-CONFIG_MULTIUSER=y -CONFIG_SGETMASK_SYSCALL=y -CONFIG_SYSFS_SYSCALL=y -# CONFIG_SYSCTL_SYSCALL is not set -CONFIG_POSIX_TIMERS=y -CONFIG_KALLSYMS=y -# CONFIG_KALLSYMS_ALL is not set -CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y -CONFIG_KALLSYMS_BASE_RELATIVE=y -CONFIG_PRINTK=y -CONFIG_PRINTK_NMI=y -CONFIG_BUG=y -CONFIG_ELF_CORE=y -CONFIG_PCSPKR_PLATFORM=y -CONFIG_BASE_FULL=y -CONFIG_FUTEX=y -CONFIG_FUTEX_PI=y -CONFIG_EPOLL=y -CONFIG_SIGNALFD=y -CONFIG_TIMERFD=y -CONFIG_EVENTFD=y -CONFIG_BPF_SYSCALL=y -CONFIG_BPF_JIT_ALWAYS_ON=y -CONFIG_SHMEM=y -CONFIG_AIO=y -CONFIG_ADVISE_SYSCALLS=y -CONFIG_USERFAULTFD=y -CONFIG_PCI_QUIRKS=y -CONFIG_MEMBARRIER=y -# CONFIG_EMBEDDED is not set -CONFIG_HAVE_PERF_EVENTS=y -# CONFIG_PC104 is not set - -# -# Kernel Performance Events And Counters -# -CONFIG_PERF_EVENTS=y -# CONFIG_DEBUG_PERF_USE_VMALLOC is not set -CONFIG_VM_EVENT_COUNTERS=y -CONFIG_SLUB_DEBUG=y -# CONFIG_SLUB_MEMCG_SYSFS_ON is not set -# CONFIG_COMPAT_BRK is not set -# CONFIG_SLAB is not set -CONFIG_SLUB=y -CONFIG_SLAB_MERGE_DEFAULT=y -CONFIG_SLAB_FREELIST_RANDOM=y -CONFIG_SLAB_FREELIST_HARDENED=y -CONFIG_SLAB_HARDENED=y -CONFIG_SLAB_SANITIZE=y -CONFIG_SLAB_SANITIZE_VERIFY=y -CONFIG_SLUB_CPU_PARTIAL=y -CONFIG_SYSTEM_DATA_VERIFICATION=y -CONFIG_PROFILING=y -CONFIG_TRACEPOINTS=y -CONFIG_CRASH_CORE=y -CONFIG_KEXEC_CORE=y -CONFIG_HOTPLUG_SMT=y -# CONFIG_OPROFILE is not set -CONFIG_HAVE_OPROFILE=y -CONFIG_OPROFILE_NMI_TIMER=y -CONFIG_KPROBES=y -CONFIG_JUMP_LABEL=y -# CONFIG_STATIC_KEYS_SELFTEST is not set -CONFIG_OPTPROBES=y -CONFIG_UPROBES=y -# CONFIG_HAVE_64BIT_ALIGNED_ACCESS is not set -CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y -CONFIG_ARCH_USE_BUILTIN_BSWAP=y -CONFIG_KRETPROBES=y -CONFIG_USER_RETURN_NOTIFIER=y -CONFIG_HAVE_IOREMAP_PROT=y -CONFIG_HAVE_KPROBES=y -CONFIG_HAVE_KRETPROBES=y -CONFIG_HAVE_OPTPROBES=y -CONFIG_HAVE_KPROBES_ON_FTRACE=y -CONFIG_HAVE_NMI=y -CONFIG_HAVE_ARCH_TRACEHOOK=y -CONFIG_HAVE_DMA_CONTIGUOUS=y -CONFIG_GENERIC_SMP_IDLE_THREAD=y -CONFIG_ARCH_HAS_FORTIFY_SOURCE=y -CONFIG_ARCH_HAS_SET_MEMORY=y -CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y -CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y -CONFIG_HAVE_CLK=y -CONFIG_HAVE_DMA_API_DEBUG=y -CONFIG_HAVE_HW_BREAKPOINT=y -CONFIG_HAVE_MIXED_BREAKPOINTS_REGS=y -CONFIG_HAVE_USER_RETURN_NOTIFIER=y -CONFIG_HAVE_PERF_EVENTS_NMI=y -CONFIG_HAVE_HARDLOCKUP_DETECTOR_PERF=y -CONFIG_HAVE_PERF_REGS=y -CONFIG_HAVE_PERF_USER_STACK_DUMP=y -CONFIG_HAVE_ARCH_JUMP_LABEL=y -CONFIG_HAVE_RCU_TABLE_FREE=y -CONFIG_HAVE_RCU_TABLE_INVALIDATE=y -CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y -CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y -CONFIG_HAVE_CMPXCHG_LOCAL=y -CONFIG_HAVE_CMPXCHG_DOUBLE=y -CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION=y -CONFIG_ARCH_WANT_OLD_COMPAT_IPC=y -CONFIG_HAVE_ARCH_SECCOMP_FILTER=y -CONFIG_SECCOMP_FILTER=y -CONFIG_HAVE_GCC_PLUGINS=y -# CONFIG_GCC_PLUGINS is not set -CONFIG_HAVE_CC_STACKPROTECTOR=y -CONFIG_CC_STACKPROTECTOR=y -# CONFIG_CC_STACKPROTECTOR_NONE is not set -# CONFIG_CC_STACKPROTECTOR_REGULAR is not set -CONFIG_CC_STACKPROTECTOR_STRONG=y -CONFIG_THIN_ARCHIVES=y -CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES=y -CONFIG_HAVE_CONTEXT_TRACKING=y -CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y -CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y -CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y -CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=y -CONFIG_HAVE_ARCH_HUGE_VMAP=y -CONFIG_HAVE_ARCH_SOFT_DIRTY=y -CONFIG_HAVE_MOD_ARCH_SPECIFIC=y -CONFIG_MODULES_USE_ELF_RELA=y -CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK=y -CONFIG_ARCH_HAS_ELF_RANDOMIZE=y -CONFIG_HAVE_ARCH_MMAP_RND_BITS=y -CONFIG_HAVE_EXIT_THREAD=y 
-CONFIG_ARCH_MMAP_RND_BITS=32 -CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y -CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16 -CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES=y -CONFIG_HAVE_COPY_THREAD_TLS=y -CONFIG_HAVE_STACK_VALIDATION=y -# CONFIG_HAVE_ARCH_HASH is not set -# CONFIG_ISA_BUS_API is not set -CONFIG_OLD_SIGSUSPEND3=y -CONFIG_COMPAT_OLD_SIGACTION=y -# CONFIG_CPU_NO_EFFICIENT_FFS is not set -CONFIG_HAVE_ARCH_VMAP_STACK=y -CONFIG_VMAP_STACK=y -# CONFIG_ARCH_OPTIONAL_KERNEL_RWX is not set -# CONFIG_ARCH_OPTIONAL_KERNEL_RWX_DEFAULT is not set -CONFIG_ARCH_HAS_STRICT_KERNEL_RWX=y -CONFIG_STRICT_KERNEL_RWX=y -CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y -CONFIG_STRICT_MODULE_RWX=y -CONFIG_ARCH_HAS_REFCOUNT=y -CONFIG_REFCOUNT_FULL=y - -# -# GCOV-based kernel profiling -# -# CONFIG_GCOV_KERNEL is not set -CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y -# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set -CONFIG_SLABINFO=y -CONFIG_RT_MUTEXES=y -CONFIG_BASE_SMALL=0 -CONFIG_MODULES=y -CONFIG_MODULE_FORCE_LOAD=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y -CONFIG_MODVERSIONS=y -CONFIG_MODULE_SRCVERSION_ALL=y -CONFIG_MODULE_SIG=y -# CONFIG_MODULE_SIG_FORCE is not set -CONFIG_MODULE_SIG_ALL=y -# CONFIG_MODULE_SIG_SHA1 is not set -# CONFIG_MODULE_SIG_SHA224 is not set -# CONFIG_MODULE_SIG_SHA256 is not set -# CONFIG_MODULE_SIG_SHA384 is not set -CONFIG_MODULE_SIG_SHA512=y -CONFIG_MODULE_SIG_HASH="sha512" -CONFIG_MODULE_COMPRESS=y -CONFIG_MODULE_COMPRESS_GZIP=y -# CONFIG_MODULE_COMPRESS_XZ is not set -# CONFIG_TRIM_UNUSED_KSYMS is not set -CONFIG_MODULES_TREE_LOOKUP=y -CONFIG_BLOCK=y -CONFIG_BLK_SCSI_REQUEST=y -CONFIG_BLK_DEV_BSG=y -CONFIG_BLK_DEV_BSGLIB=y -CONFIG_BLK_DEV_INTEGRITY=y -CONFIG_BLK_DEV_ZONED=y -CONFIG_BLK_DEV_THROTTLING=y -# CONFIG_BLK_DEV_THROTTLING_LOW is not set -CONFIG_BLK_CMDLINE_PARSER=y -CONFIG_BLK_WBT=y -CONFIG_BLK_WBT_SQ=y -CONFIG_BLK_WBT_MQ=y -CONFIG_BLK_DEBUG_FS=y -# CONFIG_BLK_SED_OPAL is not set - -# -# Partition Types -# -CONFIG_PARTITION_ADVANCED=y -# CONFIG_ACORN_PARTITION is not set -# CONFIG_AIX_PARTITION is not set -# CONFIG_OSF_PARTITION is not set -# CONFIG_AMIGA_PARTITION is not set -# CONFIG_ATARI_PARTITION is not set -# CONFIG_MAC_PARTITION is not set -CONFIG_MSDOS_PARTITION=y -# CONFIG_BSD_DISKLABEL is not set -# CONFIG_MINIX_SUBPARTITION is not set -# CONFIG_SOLARIS_X86_PARTITION is not set -# CONFIG_UNIXWARE_DISKLABEL is not set -CONFIG_LDM_PARTITION=y -CONFIG_LDM_DEBUG=y -# CONFIG_SGI_PARTITION is not set -# CONFIG_ULTRIX_PARTITION is not set -# CONFIG_SUN_PARTITION is not set -# CONFIG_KARMA_PARTITION is not set -CONFIG_EFI_PARTITION=y -# CONFIG_SYSV68_PARTITION is not set -CONFIG_CMDLINE_PARTITION=y -CONFIG_BLOCK_COMPAT=y -CONFIG_BLK_MQ_PCI=y -CONFIG_BLK_MQ_VIRTIO=y -CONFIG_BLK_MQ_RDMA=y - -# -# IO Schedulers -# -CONFIG_IOSCHED_NOOP=y -CONFIG_IOSCHED_DEADLINE=y -CONFIG_IOSCHED_CFQ=y -CONFIG_CFQ_GROUP_IOSCHED=y -CONFIG_IOSCHED_BFQ_SQ=y -CONFIG_BFQ_SQ_GROUP_IOSCHED=y -# CONFIG_DEFAULT_DEADLINE is not set -# CONFIG_DEFAULT_CFQ is not set -CONFIG_DEFAULT_BFQ_SQ=y -# CONFIG_DEFAULT_NOOP is not set -CONFIG_DEFAULT_IOSCHED="bfq-sq" -CONFIG_MQ_IOSCHED_BFQ=y -CONFIG_MQ_BFQ_GROUP_IOSCHED=y -CONFIG_MQ_IOSCHED_DEADLINE=y -# CONFIG_MQ_IOSCHED_KYBER is not set -CONFIG_IOSCHED_BFQ=y -CONFIG_BFQ_GROUP_IOSCHED=y -CONFIG_PREEMPT_NOTIFIERS=y -CONFIG_PADATA=y -CONFIG_ASN1=y -CONFIG_UNINLINE_SPIN_UNLOCK=y -CONFIG_ARCH_SUPPORTS_ATOMIC_RMW=y -CONFIG_MUTEX_SPIN_ON_OWNER=y -CONFIG_RWSEM_SPIN_ON_OWNER=y -CONFIG_LOCK_SPIN_ON_OWNER=y -CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y -CONFIG_QUEUED_SPINLOCKS=y 
-CONFIG_ARCH_USE_QUEUED_RWLOCKS=y -CONFIG_QUEUED_RWLOCKS=y -CONFIG_FREEZER=y - -# -# Processor type and features -# -CONFIG_ZONE_DMA=y -CONFIG_SMP=y -CONFIG_X86_FEATURE_NAMES=y -CONFIG_X86_FAST_FEATURE_TESTS=y -CONFIG_X86_X2APIC=y -CONFIG_X86_MPPARSE=y -# CONFIG_GOLDFISH is not set -CONFIG_RETPOLINE=y -CONFIG_INTEL_RDT=y -# CONFIG_X86_EXTENDED_PLATFORM is not set -CONFIG_X86_INTEL_LPSS=y -CONFIG_X86_AMD_PLATFORM_DEVICE=y -CONFIG_IOSF_MBI=y -# CONFIG_IOSF_MBI_DEBUG is not set -CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y -CONFIG_SCHED_OMIT_FRAME_POINTER=y -CONFIG_HYPERVISOR_GUEST=y -CONFIG_PARAVIRT=y -# CONFIG_PARAVIRT_DEBUG is not set -# CONFIG_PARAVIRT_SPINLOCKS is not set -# CONFIG_XEN is not set -CONFIG_KVM_GUEST=y -# CONFIG_KVM_DEBUG_FS is not set -# CONFIG_PARAVIRT_TIME_ACCOUNTING is not set -CONFIG_PARAVIRT_CLOCK=y -CONFIG_NO_BOOTMEM=y -# CONFIG_MK8 is not set -# CONFIG_MPSC is not set -# CONFIG_MCORE2 is not set -# CONFIG_MATOM is not set -CONFIG_GENERIC_CPU=y -CONFIG_X86_INTERNODE_CACHE_SHIFT=6 -CONFIG_X86_L1_CACHE_SHIFT=6 -CONFIG_X86_TSC=y -CONFIG_X86_CMPXCHG64=y -CONFIG_X86_CMOV=y -CONFIG_X86_MINIMUM_CPU_FAMILY=64 -CONFIG_X86_DEBUGCTLMSR=y -CONFIG_CPU_SUP_INTEL=y -CONFIG_CPU_SUP_AMD=y -CONFIG_CPU_SUP_CENTAUR=y -CONFIG_HPET_TIMER=y -CONFIG_HPET_EMULATE_RTC=y -CONFIG_DMI=y -CONFIG_GART_IOMMU=y -CONFIG_CALGARY_IOMMU=y -CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y -CONFIG_SWIOTLB=y -CONFIG_IOMMU_HELPER=y -CONFIG_MAXSMP=y -CONFIG_NR_CPUS=8192 -CONFIG_SCHED_SMT=y -CONFIG_SMT_NICE=y -CONFIG_SCHED_MC=y -CONFIG_SCHED_MC_PRIO=y -# CONFIG_PREEMPT_NONE is not set -# CONFIG_PREEMPT_VOLUNTARY is not set -CONFIG_PREEMPT=y -CONFIG_PREEMPT_COUNT=y -CONFIG_X86_LOCAL_APIC=y -CONFIG_X86_IO_APIC=y -CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y -CONFIG_X86_MCE=y -# CONFIG_X86_MCELOG_LEGACY is not set -CONFIG_X86_MCE_INTEL=y -CONFIG_X86_MCE_AMD=y -CONFIG_X86_MCE_THRESHOLD=y -# CONFIG_X86_MCE_INJECT is not set -CONFIG_X86_THERMAL_VECTOR=y - -# -# Performance monitoring -# -CONFIG_PERF_EVENTS_INTEL_UNCORE=y -CONFIG_PERF_EVENTS_INTEL_RAPL=y -CONFIG_PERF_EVENTS_INTEL_CSTATE=y -CONFIG_PERF_EVENTS_AMD_POWER=m -# CONFIG_VM86 is not set -# CONFIG_X86_16BIT is not set -CONFIG_X86_VSYSCALL_EMULATION=y -CONFIG_I8K=m -CONFIG_MICROCODE=y -CONFIG_MICROCODE_INTEL=y -CONFIG_MICROCODE_AMD=y -CONFIG_MICROCODE_OLD_INTERFACE=y -CONFIG_X86_MSR=m -CONFIG_X86_CPUID=m -# CONFIG_X86_5LEVEL is not set -CONFIG_ARCH_PHYS_ADDR_T_64BIT=y -CONFIG_ARCH_DMA_ADDR_T_64BIT=y -CONFIG_X86_DIRECT_GBPAGES=y -CONFIG_ARCH_HAS_MEM_ENCRYPT=y -CONFIG_AMD_MEM_ENCRYPT=y -# CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT is not set -CONFIG_ARCH_USE_MEMREMAP_PROT=y -CONFIG_NUMA=y -CONFIG_AMD_NUMA=y -CONFIG_X86_64_ACPI_NUMA=y -CONFIG_NODES_SPAN_OTHER_NODES=y -# CONFIG_NUMA_EMU is not set -CONFIG_NODES_SHIFT=10 -CONFIG_ARCH_SPARSEMEM_ENABLE=y -CONFIG_ARCH_SPARSEMEM_DEFAULT=y -CONFIG_ARCH_SELECT_MEMORY_MODEL=y -CONFIG_ARCH_MEMORY_PROBE=y -CONFIG_ARCH_PROC_KCORE_TEXT=y -CONFIG_ILLEGAL_POINTER_VALUE=0xdead000000000000 -CONFIG_SELECT_MEMORY_MODEL=y -CONFIG_SPARSEMEM_MANUAL=y -CONFIG_SPARSEMEM=y -CONFIG_NEED_MULTIPLE_NODES=y -CONFIG_HAVE_MEMORY_PRESENT=y -CONFIG_SPARSEMEM_EXTREME=y -CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y -CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER=y -CONFIG_SPARSEMEM_VMEMMAP=y -CONFIG_HAVE_MEMBLOCK=y -CONFIG_HAVE_MEMBLOCK_NODE_MAP=y -CONFIG_HAVE_GENERIC_GUP=y -CONFIG_ARCH_DISCARD_MEMBLOCK=y -CONFIG_MEMORY_ISOLATION=y -CONFIG_HAVE_BOOTMEM_INFO_NODE=y -CONFIG_MEMORY_HOTPLUG=y -CONFIG_MEMORY_HOTPLUG_SPARSE=y -# CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE is not set 
-CONFIG_MEMORY_HOTREMOVE=y -CONFIG_SPLIT_PTLOCK_CPUS=4 -CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y -CONFIG_MEMORY_BALLOON=y -CONFIG_BALLOON_COMPACTION=y -CONFIG_COMPACTION=y -CONFIG_MIGRATION=y -CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y -CONFIG_ARCH_ENABLE_THP_MIGRATION=y -CONFIG_PHYS_ADDR_T_64BIT=y -CONFIG_BOUNCE=y -CONFIG_VIRT_TO_BUS=y -CONFIG_MMU_NOTIFIER=y -CONFIG_KSM=y -CONFIG_UKSM=y -# CONFIG_KSM_LEGACY is not set -CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 -CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y -CONFIG_MEMORY_FAILURE=y -# CONFIG_HWPOISON_INJECT is not set -CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y -# CONFIG_TRANSPARENT_HUGEPAGE_MADVISE is not set -CONFIG_ARCH_WANTS_THP_SWAP=y -CONFIG_THP_SWAP=y -CONFIG_TRANSPARENT_HUGE_PAGECACHE=y -CONFIG_CLEANCACHE=y -CONFIG_FRONTSWAP=y -CONFIG_CMA=y -# CONFIG_CMA_DEBUG is not set -# CONFIG_CMA_DEBUGFS is not set -CONFIG_CMA_AREAS=7 -# CONFIG_ZSWAP is not set -CONFIG_ZPOOL=m -CONFIG_ZBUD=m -CONFIG_Z3FOLD=m -CONFIG_ZSMALLOC=y -# CONFIG_PGTABLE_MAPPING is not set -# CONFIG_ZSMALLOC_STAT is not set -CONFIG_GENERIC_EARLY_IOREMAP=y -CONFIG_ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT=y -# CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set -# CONFIG_IDLE_PAGE_TRACKING is not set -CONFIG_ARCH_HAS_ZONE_DEVICE=y -# CONFIG_ZONE_DEVICE is not set -CONFIG_FRAME_VECTOR=y -CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y -CONFIG_ARCH_HAS_PKEYS=y -# CONFIG_PERCPU_STATS is not set -CONFIG_X86_PMEM_LEGACY_DEVICE=y -CONFIG_X86_PMEM_LEGACY=m -CONFIG_X86_CHECK_BIOS_CORRUPTION=y -CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y -CONFIG_X86_RESERVE_LOW=64 -CONFIG_MTRR=y -CONFIG_MTRR_SANITIZER=y -CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=0 -CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=1 -CONFIG_X86_PAT=y -CONFIG_ARCH_USES_PG_UNCACHED=y -CONFIG_ARCH_RANDOM=y -CONFIG_X86_SMAP=y -CONFIG_X86_INTEL_MPX=y -CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y -CONFIG_EFI=y -CONFIG_EFI_STUB=y -CONFIG_EFI_MIXED=y -CONFIG_SECCOMP=y -CONFIG_HZ_100=y -# CONFIG_HZ_250_NODEF is not set -# CONFIG_HZ_300_NODEF is not set -# CONFIG_HZ_1000_NODEF is not set -CONFIG_HZ=100 -CONFIG_SCHED_HRTICK=y -CONFIG_KEXEC=y -# CONFIG_CRASH_DUMP is not set -# CONFIG_KEXEC_JUMP is not set -CONFIG_PHYSICAL_START=0x1000000 -CONFIG_RELOCATABLE=y -CONFIG_RANDOMIZE_BASE=y -CONFIG_X86_NEED_RELOCS=y -CONFIG_PHYSICAL_ALIGN=0x1000000 -CONFIG_RANDOMIZE_MEMORY=y -CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING=0xa -CONFIG_HOTPLUG_CPU=y -CONFIG_BOOTPARAM_HOTPLUG_CPU0=y -# CONFIG_DEBUG_HOTPLUG_CPU0 is not set -# CONFIG_COMPAT_VDSO is not set -CONFIG_LEGACY_VSYSCALL_NATIVE=y -# CONFIG_LEGACY_VSYSCALL_EMULATE is not set -# CONFIG_LEGACY_VSYSCALL_NONE is not set -# CONFIG_CMDLINE_BOOL is not set -CONFIG_MODIFY_LDT_SYSCALL=y -CONFIG_HAVE_LIVEPATCH=y -CONFIG_ARCH_HAS_ADD_PAGES=y -CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y -CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y -CONFIG_USE_PERCPU_NUMA_NODE_ID=y - -# -# Power management and ACPI options -# -CONFIG_ARCH_HIBERNATION_HEADER=y -CONFIG_SUSPEND=y -CONFIG_SUSPEND_FREEZER=y -CONFIG_HIBERNATE_CALLBACKS=y -CONFIG_HIBERNATION=y -CONFIG_PM_STD_PARTITION="" -CONFIG_PM_SLEEP=y -CONFIG_PM_SLEEP_SMP=y -CONFIG_PM_AUTOSLEEP=y -CONFIG_PM_WAKELOCKS=y -CONFIG_PM_WAKELOCKS_LIMIT=100 -CONFIG_PM_WAKELOCKS_GC=y -CONFIG_PM=y -# CONFIG_PM_DEBUG is not set -CONFIG_PM_OPP=y -CONFIG_PM_CLK=y -CONFIG_PM_GENERIC_DOMAINS=y -# CONFIG_WQ_POWER_EFFICIENT_DEFAULT is not set -CONFIG_PM_GENERIC_DOMAINS_SLEEP=y -CONFIG_ACPI=y -CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y -CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y -CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y -# CONFIG_ACPI_DEBUGGER 
is not set -CONFIG_ACPI_SLEEP=y -# CONFIG_ACPI_PROCFS_POWER is not set -CONFIG_ACPI_REV_OVERRIDE_POSSIBLE=y -# CONFIG_ACPI_EC_DEBUGFS is not set -CONFIG_ACPI_AC=m -CONFIG_ACPI_BATTERY=m -CONFIG_ACPI_BUTTON=m -CONFIG_ACPI_VIDEO=m -CONFIG_ACPI_FAN=m -CONFIG_ACPI_DOCK=y -CONFIG_ACPI_CPU_FREQ_PSS=y -CONFIG_ACPI_PROCESSOR_CSTATE=y -CONFIG_ACPI_PROCESSOR_IDLE=y -CONFIG_ACPI_CPPC_LIB=y -CONFIG_ACPI_PROCESSOR=y -CONFIG_ACPI_IPMI=m -CONFIG_ACPI_HOTPLUG_CPU=y -CONFIG_ACPI_PROCESSOR_AGGREGATOR=m -CONFIG_ACPI_THERMAL=m -CONFIG_ACPI_NUMA=y -# CONFIG_ACPI_CUSTOM_DSDT is not set -CONFIG_ARCH_HAS_ACPI_TABLE_UPGRADE=y -CONFIG_ACPI_TABLE_UPGRADE=y -# CONFIG_ACPI_DEBUG is not set -CONFIG_ACPI_PCI_SLOT=y -CONFIG_X86_PM_TIMER=y -CONFIG_ACPI_CONTAINER=y -CONFIG_ACPI_HOTPLUG_MEMORY=y -CONFIG_ACPI_HOTPLUG_IOAPIC=y -CONFIG_ACPI_SBS=m -CONFIG_ACPI_HED=y -# CONFIG_ACPI_CUSTOM_METHOD is not set -CONFIG_ACPI_BGRT=y -# CONFIG_ACPI_REDUCED_HARDWARE_ONLY is not set -CONFIG_ACPI_NFIT=m -CONFIG_HAVE_ACPI_APEI=y -CONFIG_HAVE_ACPI_APEI_NMI=y -CONFIG_ACPI_APEI=y -CONFIG_ACPI_APEI_GHES=y -CONFIG_ACPI_APEI_PCIEAER=y -CONFIG_ACPI_APEI_MEMORY_FAILURE=y -# CONFIG_ACPI_APEI_EINJ is not set -# CONFIG_ACPI_APEI_ERST_DEBUG is not set -CONFIG_DPTF_POWER=m -CONFIG_ACPI_WATCHDOG=y -CONFIG_ACPI_EXTLOG=m -CONFIG_PMIC_OPREGION=y -# CONFIG_XPOWER_PMIC_OPREGION is not set -# CONFIG_BXT_WC_PMIC_OPREGION is not set -CONFIG_ACPI_CONFIGFS=m -CONFIG_SFI=y - -# -# CPU Frequency scaling -# -CONFIG_CPU_FREQ=y -CONFIG_CPU_FREQ_GOV_ATTR_SET=y -CONFIG_CPU_FREQ_GOV_COMMON=y -CONFIG_CPU_FREQ_STAT=y -CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y -# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set -CONFIG_CPU_FREQ_GOV_PERFORMANCE=y -CONFIG_CPU_FREQ_GOV_POWERSAVE=y -CONFIG_CPU_FREQ_GOV_USERSPACE=y -CONFIG_CPU_FREQ_GOV_ONDEMAND=y -CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y -CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y - -# -# CPU frequency scaling drivers -# -CONFIG_X86_INTEL_PSTATE=y -CONFIG_X86_PCC_CPUFREQ=m -CONFIG_X86_ACPI_CPUFREQ=m -CONFIG_X86_ACPI_CPUFREQ_CPB=y -CONFIG_X86_POWERNOW_K8=m -CONFIG_X86_AMD_FREQ_SENSITIVITY=m -# CONFIG_X86_SPEEDSTEP_CENTRINO is not set -# CONFIG_X86_P4_CLOCKMOD is not set - -# -# shared options -# -# CONFIG_X86_SPEEDSTEP_LIB is not set - -# -# CPU Idle -# -CONFIG_CPU_IDLE=y -CONFIG_CPU_IDLE_GOV_LADDER=y -CONFIG_CPU_IDLE_GOV_MENU=y -# CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED is not set -CONFIG_INTEL_IDLE=y - -# -# Bus options (PCI etc.) 
-# -CONFIG_PCI=y -CONFIG_PCI_DIRECT=y -CONFIG_PCI_MMCONFIG=y -CONFIG_PCI_DOMAINS=y -CONFIG_PCIEPORTBUS=y -CONFIG_HOTPLUG_PCI_PCIE=y -CONFIG_PCIEAER=y -CONFIG_PCIE_ECRC=y -CONFIG_PCIEAER_INJECT=m -CONFIG_PCIEASPM=y -# CONFIG_PCIEASPM_DEBUG is not set -CONFIG_PCIEASPM_DEFAULT=y -# CONFIG_PCIEASPM_POWERSAVE is not set -# CONFIG_PCIEASPM_POWER_SUPERSAVE is not set -# CONFIG_PCIEASPM_PERFORMANCE is not set -CONFIG_PCIE_PME=y -CONFIG_PCIE_DPC=y -CONFIG_PCIE_PTM=y -CONFIG_PCI_BUS_ADDR_T_64BIT=y -CONFIG_PCI_MSI=y -CONFIG_PCI_MSI_IRQ_DOMAIN=y -# CONFIG_PCI_DEBUG is not set -CONFIG_PCI_REALLOC_ENABLE_AUTO=y -CONFIG_PCI_STUB=m -CONFIG_HT_IRQ=y -CONFIG_PCI_ATS=y -CONFIG_PCI_LOCKLESS_CONFIG=y -CONFIG_PCI_IOV=y -CONFIG_PCI_PRI=y -CONFIG_PCI_PASID=y -CONFIG_PCI_LABEL=y -CONFIG_PCI_HYPERV=m -CONFIG_HOTPLUG_PCI=y -CONFIG_HOTPLUG_PCI_ACPI=y -CONFIG_HOTPLUG_PCI_ACPI_IBM=m -CONFIG_HOTPLUG_PCI_CPCI=y -CONFIG_HOTPLUG_PCI_CPCI_ZT5550=m -CONFIG_HOTPLUG_PCI_CPCI_GENERIC=m -CONFIG_HOTPLUG_PCI_SHPC=m - -# -# DesignWare PCI Core Support -# -# CONFIG_PCIE_DW_PLAT is not set - -# -# PCI host controller drivers -# -CONFIG_VMD=m - -# -# PCI Endpoint -# -# CONFIG_PCI_ENDPOINT is not set - -# -# PCI switch controller drivers -# -CONFIG_PCI_SW_SWITCHTEC=m -CONFIG_ISA_DMA_API=y -CONFIG_AMD_NB=y -CONFIG_PCCARD=m -CONFIG_PCMCIA=m -CONFIG_PCMCIA_LOAD_CIS=y -CONFIG_CARDBUS=y - -# -# PC-card bridges -# -CONFIG_YENTA=m -CONFIG_YENTA_O2=y -CONFIG_YENTA_RICOH=y -CONFIG_YENTA_TI=y -CONFIG_YENTA_ENE_TUNE=y -CONFIG_YENTA_TOSHIBA=y -CONFIG_PD6729=m -CONFIG_I82092=m -CONFIG_PCCARD_NONSTATIC=y -CONFIG_RAPIDIO=y -CONFIG_RAPIDIO_TSI721=y -CONFIG_RAPIDIO_DISC_TIMEOUT=30 -CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS=y -CONFIG_RAPIDIO_DMA_ENGINE=y -# CONFIG_RAPIDIO_DEBUG is not set -CONFIG_RAPIDIO_ENUM_BASIC=m -CONFIG_RAPIDIO_CHMAN=m -CONFIG_RAPIDIO_MPORT_CDEV=m - -# -# RapidIO Switch drivers -# -CONFIG_RAPIDIO_TSI57X=y -CONFIG_RAPIDIO_CPS_XX=y -CONFIG_RAPIDIO_TSI568=y -CONFIG_RAPIDIO_CPS_GEN2=y -CONFIG_RAPIDIO_RXS_GEN3=m -CONFIG_X86_SYSFB=y - -# -# Executable file formats / Emulations -# -CONFIG_BINFMT_ELF=y -CONFIG_COMPAT_BINFMT_ELF=y -CONFIG_ELFCORE=y -CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y -CONFIG_BINFMT_SCRIPT=y -# CONFIG_HAVE_AOUT is not set -CONFIG_BINFMT_MISC=y -CONFIG_COREDUMP=y -CONFIG_IA32_EMULATION=y -CONFIG_IA32_AOUT=y -CONFIG_X86_X32=y -CONFIG_COMPAT_32=y -CONFIG_COMPAT=y -CONFIG_COMPAT_FOR_U64_ALIGNMENT=y -CONFIG_SYSVIPC_COMPAT=y -CONFIG_X86_DEV_DMA_OPS=y -CONFIG_NET=y -CONFIG_COMPAT_NETLINK_MESSAGES=y -CONFIG_NET_INGRESS=y -CONFIG_NET_EGRESS=y - -# -# Networking options -# -CONFIG_PACKET=m -CONFIG_PACKET_DIAG=m -CONFIG_UNIX=m -CONFIG_UNIX_DIAG=m -CONFIG_TLS=m -CONFIG_XFRM=y -CONFIG_XFRM_OFFLOAD=y -CONFIG_XFRM_ALGO=m -CONFIG_XFRM_USER=m -CONFIG_XFRM_SUB_POLICY=y -CONFIG_XFRM_MIGRATE=y -CONFIG_XFRM_STATISTICS=y -CONFIG_XFRM_IPCOMP=m -CONFIG_NET_KEY=m -CONFIG_NET_KEY_MIGRATE=y -CONFIG_SMC=m -CONFIG_SMC_DIAG=m -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_ADVANCED_ROUTER=y -CONFIG_IP_FIB_TRIE_STATS=y -CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_ROUTE_MULTIPATH=y -CONFIG_IP_ROUTE_VERBOSE=y -CONFIG_IP_ROUTE_CLASSID=y -CONFIG_IP_PNP=y -CONFIG_IP_PNP_DHCP=y -CONFIG_IP_PNP_BOOTP=y -CONFIG_IP_PNP_RARP=y -CONFIG_NET_IPIP=m -CONFIG_NET_IPGRE_DEMUX=m -CONFIG_NET_IP_TUNNEL=m -CONFIG_NET_IPGRE=m -CONFIG_NET_IPGRE_BROADCAST=y -CONFIG_IP_MROUTE=y -CONFIG_IP_MROUTE_MULTIPLE_TABLES=y -CONFIG_IP_PIMSM_V1=y -CONFIG_IP_PIMSM_V2=y -CONFIG_SYN_COOKIES=y -CONFIG_NET_IPVTI=m -CONFIG_NET_UDP_TUNNEL=m -CONFIG_NET_FOU=m -CONFIG_NET_FOU_IP_TUNNELS=y -CONFIG_INET_AH=m 
-CONFIG_INET_ESP=m -CONFIG_INET_ESP_OFFLOAD=m -CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_TUNNEL=m -CONFIG_INET_TUNNEL=m -CONFIG_INET_XFRM_MODE_TRANSPORT=m -CONFIG_INET_XFRM_MODE_TUNNEL=m -CONFIG_INET_XFRM_MODE_BEET=m -CONFIG_INET_DIAG=m -CONFIG_INET_TCP_DIAG=m -CONFIG_INET_UDP_DIAG=m -# CONFIG_INET_RAW_DIAG is not set -# CONFIG_INET_DIAG_DESTROY is not set -CONFIG_TCP_CONG_ADVANCED=y -CONFIG_TCP_CONG_BIC=m -CONFIG_TCP_CONG_CUBIC=m -CONFIG_TCP_CONG_WESTWOOD=m -CONFIG_TCP_CONG_HTCP=m -CONFIG_TCP_CONG_HSTCP=m -CONFIG_TCP_CONG_HYBLA=m -CONFIG_TCP_CONG_VEGAS=m -CONFIG_TCP_CONG_NV=m -CONFIG_TCP_CONG_SCALABLE=m -CONFIG_TCP_CONG_LP=m -CONFIG_TCP_CONG_VENO=m -CONFIG_TCP_CONG_YEAH=m -CONFIG_TCP_CONG_ILLINOIS=m -CONFIG_TCP_CONG_DCTCP=m -# CONFIG_TCP_CONG_CDG is not set -CONFIG_TCP_CONG_BBR=m -CONFIG_DEFAULT_RENO=y -CONFIG_DEFAULT_TCP_CONG="reno" -CONFIG_TCP_MD5SIG=y -CONFIG_IPV6=m -CONFIG_IPV6_ROUTER_PREF=y -CONFIG_IPV6_ROUTE_INFO=y -CONFIG_IPV6_OPTIMISTIC_DAD=y -CONFIG_INET6_AH=m -CONFIG_INET6_ESP=m -CONFIG_INET6_ESP_OFFLOAD=m -CONFIG_INET6_IPCOMP=m -CONFIG_IPV6_MIP6=m -CONFIG_IPV6_ILA=m -CONFIG_INET6_XFRM_TUNNEL=m -CONFIG_INET6_TUNNEL=m -CONFIG_INET6_XFRM_MODE_TRANSPORT=m -CONFIG_INET6_XFRM_MODE_TUNNEL=m -CONFIG_INET6_XFRM_MODE_BEET=m -CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m -CONFIG_IPV6_VTI=m -CONFIG_IPV6_SIT=m -CONFIG_IPV6_SIT_6RD=y -CONFIG_IPV6_NDISC_NODETYPE=y -CONFIG_IPV6_TUNNEL=m -CONFIG_IPV6_GRE=m -CONFIG_IPV6_FOU=m -CONFIG_IPV6_FOU_TUNNEL=m -CONFIG_IPV6_MULTIPLE_TABLES=y -CONFIG_IPV6_SUBTREES=y -CONFIG_IPV6_MROUTE=y -CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=y -CONFIG_IPV6_PIMSM_V2=y -CONFIG_IPV6_SEG6_LWTUNNEL=y -CONFIG_IPV6_SEG6_HMAC=y -# CONFIG_NETLABEL is not set -CONFIG_NETWORK_SECMARK=y -CONFIG_NET_PTP_CLASSIFY=y -CONFIG_NETWORK_PHY_TIMESTAMPING=y -CONFIG_NETFILTER=y -CONFIG_NETFILTER_ADVANCED=y -CONFIG_BRIDGE_NETFILTER=m - -# -# Core Netfilter Configuration -# -CONFIG_NETFILTER_INGRESS=y -CONFIG_NETFILTER_NETLINK=m -CONFIG_NETFILTER_NETLINK_ACCT=m -CONFIG_NETFILTER_NETLINK_QUEUE=m -CONFIG_NETFILTER_NETLINK_LOG=m -CONFIG_NF_CONNTRACK=m -CONFIG_NF_LOG_COMMON=m -CONFIG_NF_LOG_NETDEV=m -CONFIG_NF_CONNTRACK_MARK=y -CONFIG_NF_CONNTRACK_SECMARK=y -CONFIG_NF_CONNTRACK_ZONES=y -CONFIG_NF_CONNTRACK_PROCFS=y -CONFIG_NF_CONNTRACK_EVENTS=y -CONFIG_NF_CONNTRACK_TIMEOUT=y -CONFIG_NF_CONNTRACK_TIMESTAMP=y -CONFIG_NF_CONNTRACK_LABELS=y -CONFIG_NF_CT_PROTO_DCCP=y -CONFIG_NF_CT_PROTO_GRE=m -CONFIG_NF_CT_PROTO_SCTP=y -CONFIG_NF_CT_PROTO_UDPLITE=y -CONFIG_NF_CONNTRACK_AMANDA=m -CONFIG_NF_CONNTRACK_FTP=m -CONFIG_NF_CONNTRACK_H323=m -CONFIG_NF_CONNTRACK_IRC=m -CONFIG_NF_CONNTRACK_BROADCAST=m -CONFIG_NF_CONNTRACK_NETBIOS_NS=m -CONFIG_NF_CONNTRACK_SNMP=m -CONFIG_NF_CONNTRACK_PPTP=m -CONFIG_NF_CONNTRACK_SANE=m -CONFIG_NF_CONNTRACK_SIP=m -CONFIG_NF_CONNTRACK_TFTP=m -CONFIG_NF_CT_NETLINK=m -CONFIG_NF_CT_NETLINK_TIMEOUT=m -CONFIG_NF_CT_NETLINK_HELPER=m -CONFIG_NETFILTER_NETLINK_GLUE_CT=y -CONFIG_NF_NAT=m -CONFIG_NF_NAT_NEEDED=y -CONFIG_NF_NAT_PROTO_DCCP=y -CONFIG_NF_NAT_PROTO_UDPLITE=y -CONFIG_NF_NAT_PROTO_SCTP=y -CONFIG_NF_NAT_AMANDA=m -CONFIG_NF_NAT_FTP=m -CONFIG_NF_NAT_IRC=m -CONFIG_NF_NAT_SIP=m -CONFIG_NF_NAT_TFTP=m -CONFIG_NF_NAT_REDIRECT=m -CONFIG_NETFILTER_SYNPROXY=m -CONFIG_NF_TABLES=m -CONFIG_NF_TABLES_INET=m -CONFIG_NF_TABLES_NETDEV=m -CONFIG_NFT_EXTHDR=m -CONFIG_NFT_META=m -CONFIG_NFT_RT=m -CONFIG_NFT_NUMGEN=m -CONFIG_NFT_CT=m -CONFIG_NFT_SET_RBTREE=m -CONFIG_NFT_SET_HASH=m -CONFIG_NFT_SET_BITMAP=m -CONFIG_NFT_COUNTER=m -CONFIG_NFT_LOG=m -CONFIG_NFT_LIMIT=m -CONFIG_NFT_MASQ=m -CONFIG_NFT_REDIR=m 
-CONFIG_NFT_NAT=m -CONFIG_NFT_OBJREF=m -CONFIG_NFT_QUEUE=m -CONFIG_NFT_QUOTA=m -CONFIG_NFT_REJECT=m -CONFIG_NFT_REJECT_INET=m -CONFIG_NFT_COMPAT=m -CONFIG_NFT_HASH=m -CONFIG_NFT_FIB=m -CONFIG_NFT_FIB_INET=m -CONFIG_NF_DUP_NETDEV=m -CONFIG_NFT_DUP_NETDEV=m -CONFIG_NFT_FWD_NETDEV=m -CONFIG_NFT_FIB_NETDEV=m -CONFIG_NETFILTER_XTABLES=m - -# -# Xtables combined modules -# -CONFIG_NETFILTER_XT_MARK=m -CONFIG_NETFILTER_XT_CONNMARK=m -CONFIG_NETFILTER_XT_SET=m - -# -# Xtables targets -# -CONFIG_NETFILTER_XT_TARGET_AUDIT=m -CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m -CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m -CONFIG_NETFILTER_XT_TARGET_CONNMARK=m -CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m -CONFIG_NETFILTER_XT_TARGET_CT=m -CONFIG_NETFILTER_XT_TARGET_DSCP=m -CONFIG_NETFILTER_XT_TARGET_HL=m -CONFIG_NETFILTER_XT_TARGET_HMARK=m -CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m -CONFIG_NETFILTER_XT_TARGET_LED=m -CONFIG_NETFILTER_XT_TARGET_LOG=m -CONFIG_NETFILTER_XT_TARGET_MARK=m -CONFIG_NETFILTER_XT_NAT=m -CONFIG_NETFILTER_XT_TARGET_NETMAP=m -CONFIG_NETFILTER_XT_TARGET_NFLOG=m -CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m -CONFIG_NETFILTER_XT_TARGET_NOTRACK=m -CONFIG_NETFILTER_XT_TARGET_RATEEST=m -CONFIG_NETFILTER_XT_TARGET_REDIRECT=m -CONFIG_NETFILTER_XT_TARGET_TEE=m -CONFIG_NETFILTER_XT_TARGET_TPROXY=m -CONFIG_NETFILTER_XT_TARGET_TRACE=m -CONFIG_NETFILTER_XT_TARGET_SECMARK=m -CONFIG_NETFILTER_XT_TARGET_TCPMSS=m -CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m - -# -# Xtables matches -# -CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m -CONFIG_NETFILTER_XT_MATCH_BPF=m -CONFIG_NETFILTER_XT_MATCH_CGROUP=m -CONFIG_NETFILTER_XT_MATCH_CLUSTER=m -CONFIG_NETFILTER_XT_MATCH_COMMENT=m -CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m -CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m -CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m -CONFIG_NETFILTER_XT_MATCH_CONNMARK=m -CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m -CONFIG_NETFILTER_XT_MATCH_CPU=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m -CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m -CONFIG_NETFILTER_XT_MATCH_DSCP=m -CONFIG_NETFILTER_XT_MATCH_ECN=m -CONFIG_NETFILTER_XT_MATCH_ESP=m -CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m -CONFIG_NETFILTER_XT_MATCH_HELPER=m -CONFIG_NETFILTER_XT_MATCH_HL=m -CONFIG_NETFILTER_XT_MATCH_IPCOMP=m -CONFIG_NETFILTER_XT_MATCH_IPRANGE=m -CONFIG_NETFILTER_XT_MATCH_IPVS=m -CONFIG_NETFILTER_XT_MATCH_L2TP=m -CONFIG_NETFILTER_XT_MATCH_LENGTH=m -CONFIG_NETFILTER_XT_MATCH_LIMIT=m -CONFIG_NETFILTER_XT_MATCH_MAC=m -CONFIG_NETFILTER_XT_MATCH_MARK=m -CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m -CONFIG_NETFILTER_XT_MATCH_NFACCT=m -CONFIG_NETFILTER_XT_MATCH_OSF=m -CONFIG_NETFILTER_XT_MATCH_OWNER=m -CONFIG_NETFILTER_XT_MATCH_POLICY=m -CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m -CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m -CONFIG_NETFILTER_XT_MATCH_QUOTA=m -CONFIG_NETFILTER_XT_MATCH_RATEEST=m -CONFIG_NETFILTER_XT_MATCH_REALM=m -CONFIG_NETFILTER_XT_MATCH_RECENT=m -CONFIG_NETFILTER_XT_MATCH_SCTP=m -CONFIG_NETFILTER_XT_MATCH_SOCKET=m -CONFIG_NETFILTER_XT_MATCH_STATE=m -CONFIG_NETFILTER_XT_MATCH_STATISTIC=m -CONFIG_NETFILTER_XT_MATCH_STRING=m -CONFIG_NETFILTER_XT_MATCH_TCPMSS=m -CONFIG_NETFILTER_XT_MATCH_TIME=m -CONFIG_NETFILTER_XT_MATCH_U32=m -CONFIG_IP_SET=m -CONFIG_IP_SET_MAX=256 -CONFIG_IP_SET_BITMAP_IP=m -CONFIG_IP_SET_BITMAP_IPMAC=m -CONFIG_IP_SET_BITMAP_PORT=m -CONFIG_IP_SET_HASH_IP=m -CONFIG_IP_SET_HASH_IPMARK=m -CONFIG_IP_SET_HASH_IPPORT=m -CONFIG_IP_SET_HASH_IPPORTIP=m -CONFIG_IP_SET_HASH_IPPORTNET=m -CONFIG_IP_SET_HASH_IPMAC=m -CONFIG_IP_SET_HASH_MAC=m -CONFIG_IP_SET_HASH_NETPORTNET=m -CONFIG_IP_SET_HASH_NET=m -CONFIG_IP_SET_HASH_NETNET=m -CONFIG_IP_SET_HASH_NETPORT=m 
-CONFIG_IP_SET_HASH_NETIFACE=m -CONFIG_IP_SET_LIST_SET=m -CONFIG_IP_VS=m -CONFIG_IP_VS_IPV6=y -# CONFIG_IP_VS_DEBUG is not set -CONFIG_IP_VS_TAB_BITS=12 - -# -# IPVS transport protocol load balancing support -# -CONFIG_IP_VS_PROTO_TCP=y -CONFIG_IP_VS_PROTO_UDP=y -CONFIG_IP_VS_PROTO_AH_ESP=y -CONFIG_IP_VS_PROTO_ESP=y -CONFIG_IP_VS_PROTO_AH=y -CONFIG_IP_VS_PROTO_SCTP=y - -# -# IPVS scheduler -# -CONFIG_IP_VS_RR=m -CONFIG_IP_VS_WRR=m -CONFIG_IP_VS_LC=m -CONFIG_IP_VS_WLC=m -CONFIG_IP_VS_FO=m -CONFIG_IP_VS_OVF=m -CONFIG_IP_VS_LBLC=m -CONFIG_IP_VS_LBLCR=m -CONFIG_IP_VS_DH=m -CONFIG_IP_VS_SH=m -CONFIG_IP_VS_SED=m -CONFIG_IP_VS_NQ=m - -# -# IPVS SH scheduler -# -CONFIG_IP_VS_SH_TAB_BITS=8 - -# -# IPVS application helper -# -CONFIG_IP_VS_FTP=m -CONFIG_IP_VS_NFCT=y -CONFIG_IP_VS_PE_SIP=m - -# -# IP: Netfilter Configuration -# -CONFIG_NF_DEFRAG_IPV4=m -CONFIG_NF_CONNTRACK_IPV4=m -CONFIG_NF_SOCKET_IPV4=m -CONFIG_NF_TABLES_IPV4=m -CONFIG_NFT_CHAIN_ROUTE_IPV4=m -CONFIG_NFT_REJECT_IPV4=m -CONFIG_NFT_DUP_IPV4=m -CONFIG_NFT_FIB_IPV4=m -CONFIG_NF_TABLES_ARP=m -CONFIG_NF_DUP_IPV4=m -CONFIG_NF_LOG_ARP=m -CONFIG_NF_LOG_IPV4=m -CONFIG_NF_REJECT_IPV4=m -CONFIG_NF_NAT_IPV4=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NF_NAT_MASQUERADE_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m -CONFIG_NF_NAT_SNMP_BASIC=m -CONFIG_NF_NAT_PROTO_GRE=m -CONFIG_NF_NAT_PPTP=m -CONFIG_NF_NAT_H323=m -CONFIG_IP_NF_IPTABLES=m -CONFIG_IP_NF_MATCH_AH=m -CONFIG_IP_NF_MATCH_ECN=m -CONFIG_IP_NF_MATCH_RPFILTER=m -CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m -CONFIG_IP_NF_TARGET_REJECT=m -CONFIG_IP_NF_TARGET_SYNPROXY=m -CONFIG_IP_NF_NAT=m -CONFIG_IP_NF_TARGET_MASQUERADE=m -CONFIG_IP_NF_TARGET_NETMAP=m -CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m -CONFIG_IP_NF_TARGET_ECN=m -CONFIG_IP_NF_TARGET_TTL=m -CONFIG_IP_NF_RAW=m -# CONFIG_IP_NF_SECURITY is not set -CONFIG_IP_NF_ARPTABLES=m -CONFIG_IP_NF_ARPFILTER=m -CONFIG_IP_NF_ARP_MANGLE=m - -# -# IPv6: Netfilter Configuration -# -CONFIG_NF_DEFRAG_IPV6=m -CONFIG_NF_CONNTRACK_IPV6=m -CONFIG_NF_SOCKET_IPV6=m -CONFIG_NF_TABLES_IPV6=m -CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m -CONFIG_NFT_REJECT_IPV6=m -CONFIG_NFT_DUP_IPV6=m -CONFIG_NFT_FIB_IPV6=m -CONFIG_NF_DUP_IPV6=m -CONFIG_NF_REJECT_IPV6=m -CONFIG_NF_LOG_IPV6=m -CONFIG_NF_NAT_IPV6=m -CONFIG_NF_NAT_MASQUERADE_IPV6=m -CONFIG_IP6_NF_IPTABLES=m -CONFIG_IP6_NF_MATCH_AH=m -CONFIG_IP6_NF_MATCH_EUI64=m -CONFIG_IP6_NF_MATCH_FRAG=m -CONFIG_IP6_NF_MATCH_OPTS=m -CONFIG_IP6_NF_MATCH_HL=m -CONFIG_IP6_NF_MATCH_IPV6HEADER=m -CONFIG_IP6_NF_MATCH_MH=m -CONFIG_IP6_NF_MATCH_RPFILTER=m -CONFIG_IP6_NF_MATCH_RT=m -CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m -CONFIG_IP6_NF_TARGET_REJECT=m -CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m -CONFIG_IP6_NF_RAW=m -# CONFIG_IP6_NF_SECURITY is not set -CONFIG_IP6_NF_NAT=m -CONFIG_IP6_NF_TARGET_MASQUERADE=m -CONFIG_IP6_NF_TARGET_NPT=m - -# -# DECnet: Netfilter Configuration -# -CONFIG_DECNET_NF_GRABULATOR=m -CONFIG_NF_TABLES_BRIDGE=m -CONFIG_NFT_BRIDGE_META=m -CONFIG_NFT_BRIDGE_REJECT=m -CONFIG_NF_LOG_BRIDGE=m -CONFIG_BRIDGE_NF_EBTABLES=m -CONFIG_BRIDGE_EBT_BROUTE=m -CONFIG_BRIDGE_EBT_T_FILTER=m -CONFIG_BRIDGE_EBT_T_NAT=m -CONFIG_BRIDGE_EBT_802_3=m -CONFIG_BRIDGE_EBT_AMONG=m -CONFIG_BRIDGE_EBT_ARP=m -CONFIG_BRIDGE_EBT_IP=m -CONFIG_BRIDGE_EBT_IP6=m -CONFIG_BRIDGE_EBT_LIMIT=m -CONFIG_BRIDGE_EBT_MARK=m -CONFIG_BRIDGE_EBT_PKTTYPE=m -CONFIG_BRIDGE_EBT_STP=m -CONFIG_BRIDGE_EBT_VLAN=m -CONFIG_BRIDGE_EBT_ARPREPLY=m 
-CONFIG_BRIDGE_EBT_DNAT=m -CONFIG_BRIDGE_EBT_MARK_T=m -CONFIG_BRIDGE_EBT_REDIRECT=m -CONFIG_BRIDGE_EBT_SNAT=m -CONFIG_BRIDGE_EBT_LOG=m -CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -CONFIG_INET_DCCP_DIAG=m - -# -# DCCP CCIDs Configuration -# -# CONFIG_IP_DCCP_CCID2_DEBUG is not set -CONFIG_IP_DCCP_CCID3=y -# CONFIG_IP_DCCP_CCID3_DEBUG is not set -CONFIG_IP_DCCP_TFRC_LIB=y - -# -# DCCP Kernel Hacking -# -# CONFIG_IP_DCCP_DEBUG is not set -# CONFIG_NET_DCCPPROBE is not set -CONFIG_IP_SCTP=m -# CONFIG_NET_SCTPPROBE is not set -# CONFIG_SCTP_DBG_OBJCNT is not set -CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5=y -# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1 is not set -# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE is not set -CONFIG_SCTP_COOKIE_HMAC_MD5=y -CONFIG_SCTP_COOKIE_HMAC_SHA1=y -CONFIG_INET_SCTP_DIAG=m -CONFIG_RDS=m -CONFIG_RDS_RDMA=m -CONFIG_RDS_TCP=m -# CONFIG_RDS_DEBUG is not set -CONFIG_TIPC=m -CONFIG_TIPC_MEDIA_IB=y -CONFIG_TIPC_MEDIA_UDP=y -CONFIG_ATM=m -CONFIG_ATM_CLIP=m -# CONFIG_ATM_CLIP_NO_ICMP is not set -CONFIG_ATM_LANE=m -CONFIG_ATM_MPOA=m -CONFIG_ATM_BR2684=m -# CONFIG_ATM_BR2684_IPFILTER is not set -CONFIG_L2TP=m -# CONFIG_L2TP_DEBUGFS is not set -CONFIG_L2TP_V3=y -CONFIG_L2TP_IP=m -CONFIG_L2TP_ETH=m -CONFIG_STP=m -CONFIG_GARP=m -CONFIG_MRP=m -CONFIG_BRIDGE=m -CONFIG_BRIDGE_IGMP_SNOOPING=y -CONFIG_BRIDGE_VLAN_FILTERING=y -CONFIG_HAVE_NET_DSA=y -CONFIG_NET_DSA=m -CONFIG_NET_DSA_TAG_DSA=y -CONFIG_NET_DSA_TAG_EDSA=y -CONFIG_NET_DSA_TAG_KSZ=y -CONFIG_NET_DSA_TAG_LAN9303=y -CONFIG_NET_DSA_TAG_MTK=y -CONFIG_NET_DSA_TAG_TRAILER=y -CONFIG_NET_DSA_TAG_QCA=y -CONFIG_VLAN_8021Q=m -CONFIG_VLAN_8021Q_GVRP=y -CONFIG_VLAN_8021Q_MVRP=y -CONFIG_DECNET=m -CONFIG_DECNET_ROUTER=y -CONFIG_LLC=m -CONFIG_LLC2=m -CONFIG_IPX=m -CONFIG_IPX_INTERN=y -CONFIG_ATALK=m -CONFIG_DEV_APPLETALK=m -CONFIG_IPDDP=m -CONFIG_IPDDP_ENCAP=y -CONFIG_X25=m -CONFIG_LAPB=m -CONFIG_PHONET=m -CONFIG_6LOWPAN=m -# CONFIG_6LOWPAN_DEBUGFS is not set -CONFIG_6LOWPAN_NHC=m -CONFIG_6LOWPAN_NHC_DEST=m -CONFIG_6LOWPAN_NHC_FRAGMENT=m -CONFIG_6LOWPAN_NHC_HOP=m -CONFIG_6LOWPAN_NHC_IPV6=m -CONFIG_6LOWPAN_NHC_MOBILITY=m -CONFIG_6LOWPAN_NHC_ROUTING=m -CONFIG_6LOWPAN_NHC_UDP=m -CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m -CONFIG_6LOWPAN_GHC_UDP=m -CONFIG_6LOWPAN_GHC_ICMPV6=m -CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m -CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m -CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m -CONFIG_IEEE802154=m -CONFIG_IEEE802154_NL802154_EXPERIMENTAL=y -CONFIG_IEEE802154_SOCKET=m -CONFIG_IEEE802154_6LOWPAN=m -CONFIG_MAC802154=m -CONFIG_NET_SCHED=y - -# -# Queueing/Scheduling -# -CONFIG_NET_SCH_CBQ=m -CONFIG_NET_SCH_HTB=m -CONFIG_NET_SCH_HFSC=m -CONFIG_NET_SCH_ATM=m -CONFIG_NET_SCH_PRIO=m -CONFIG_NET_SCH_MULTIQ=m -CONFIG_NET_SCH_RED=m -CONFIG_NET_SCH_SFB=m -CONFIG_NET_SCH_SFQ=m -CONFIG_NET_SCH_TEQL=m -CONFIG_NET_SCH_TBF=m -CONFIG_NET_SCH_GRED=m -CONFIG_NET_SCH_DSMARK=m -CONFIG_NET_SCH_NETEM=m -CONFIG_NET_SCH_DRR=m -CONFIG_NET_SCH_MQPRIO=m -CONFIG_NET_SCH_CHOKE=m -CONFIG_NET_SCH_QFQ=m -CONFIG_NET_SCH_CODEL=m -CONFIG_NET_SCH_FQ_CODEL=m -CONFIG_NET_SCH_FQ=m -CONFIG_NET_SCH_HHF=m -CONFIG_NET_SCH_PIE=m -CONFIG_NET_SCH_INGRESS=m -CONFIG_NET_SCH_PLUG=m -# CONFIG_NET_SCH_DEFAULT is not set - -# -# Classification -# -CONFIG_NET_CLS=y -CONFIG_NET_CLS_BASIC=m -CONFIG_NET_CLS_TCINDEX=m -CONFIG_NET_CLS_ROUTE4=m -CONFIG_NET_CLS_FW=m -CONFIG_NET_CLS_U32=m -CONFIG_CLS_U32_PERF=y -CONFIG_CLS_U32_MARK=y -CONFIG_NET_CLS_RSVP=m -CONFIG_NET_CLS_RSVP6=m -CONFIG_NET_CLS_FLOW=m -CONFIG_NET_CLS_CGROUP=m -CONFIG_NET_CLS_BPF=m -CONFIG_NET_CLS_FLOWER=m -CONFIG_NET_CLS_MATCHALL=m -CONFIG_NET_EMATCH=y 
-CONFIG_NET_EMATCH_STACK=32 -CONFIG_NET_EMATCH_CMP=m -CONFIG_NET_EMATCH_NBYTE=m -CONFIG_NET_EMATCH_U32=m -CONFIG_NET_EMATCH_META=m -CONFIG_NET_EMATCH_TEXT=m -CONFIG_NET_EMATCH_CANID=m -CONFIG_NET_EMATCH_IPSET=m -CONFIG_NET_CLS_ACT=y -CONFIG_NET_ACT_POLICE=m -CONFIG_NET_ACT_GACT=m -CONFIG_GACT_PROB=y -CONFIG_NET_ACT_MIRRED=m -CONFIG_NET_ACT_SAMPLE=m -CONFIG_NET_ACT_IPT=m -CONFIG_NET_ACT_NAT=m -CONFIG_NET_ACT_PEDIT=m -# CONFIG_NET_ACT_SIMP is not set -CONFIG_NET_ACT_SKBEDIT=m -CONFIG_NET_ACT_CSUM=m -CONFIG_NET_ACT_VLAN=m -CONFIG_NET_ACT_BPF=m -CONFIG_NET_ACT_CONNMARK=m -CONFIG_NET_ACT_SKBMOD=m -CONFIG_NET_ACT_IFE=m -CONFIG_NET_ACT_TUNNEL_KEY=m -CONFIG_NET_IFE_SKBMARK=m -CONFIG_NET_IFE_SKBPRIO=m -CONFIG_NET_IFE_SKBTCINDEX=m -CONFIG_NET_CLS_IND=y -CONFIG_NET_SCH_FIFO=y -CONFIG_DCB=y -CONFIG_DNS_RESOLVER=y -CONFIG_BATMAN_ADV=m -# CONFIG_BATMAN_ADV_BATMAN_V is not set -CONFIG_BATMAN_ADV_BLA=y -CONFIG_BATMAN_ADV_DAT=y -CONFIG_BATMAN_ADV_NC=y -CONFIG_BATMAN_ADV_MCAST=y -CONFIG_BATMAN_ADV_DEBUGFS=y -# CONFIG_BATMAN_ADV_DEBUG is not set -CONFIG_OPENVSWITCH=m -CONFIG_OPENVSWITCH_GRE=m -CONFIG_OPENVSWITCH_VXLAN=m -CONFIG_OPENVSWITCH_GENEVE=m -CONFIG_VSOCKETS=m -CONFIG_VMWARE_VMCI_VSOCKETS=m -CONFIG_VIRTIO_VSOCKETS=m -CONFIG_VIRTIO_VSOCKETS_COMMON=m -CONFIG_HYPERV_VSOCKETS=m -CONFIG_NETLINK_DIAG=m -CONFIG_MPLS=y -CONFIG_NET_MPLS_GSO=m -CONFIG_MPLS_ROUTING=m -CONFIG_MPLS_IPTUNNEL=m -CONFIG_NET_NSH=m -CONFIG_HSR=m -CONFIG_NET_SWITCHDEV=y -CONFIG_NET_L3_MASTER_DEV=y -# CONFIG_NET_NCSI is not set -CONFIG_RPS=y -CONFIG_RFS_ACCEL=y -CONFIG_XPS=y -CONFIG_CGROUP_NET_PRIO=y -CONFIG_CGROUP_NET_CLASSID=y -CONFIG_NET_RX_BUSY_POLL=y -CONFIG_BQL=y -CONFIG_BPF_JIT=y -# CONFIG_BPF_STREAM_PARSER is not set -CONFIG_NET_FLOW_LIMIT=y - -# -# Network testing -# -# CONFIG_NET_PKTGEN is not set -# CONFIG_NET_TCPPROBE is not set -# CONFIG_NET_DROP_MONITOR is not set -CONFIG_HAMRADIO=y - -# -# Packet Radio protocols -# -CONFIG_AX25=m -CONFIG_AX25_DAMA_SLAVE=y -CONFIG_NETROM=m -CONFIG_ROSE=m - -# -# AX.25 network device drivers -# -CONFIG_MKISS=m -CONFIG_6PACK=m -CONFIG_BPQETHER=m -CONFIG_BAYCOM_SER_FDX=m -CONFIG_BAYCOM_SER_HDX=m -CONFIG_BAYCOM_PAR=m -CONFIG_YAM=m -CONFIG_CAN=m -CONFIG_CAN_RAW=m -CONFIG_CAN_BCM=m -CONFIG_CAN_GW=m - -# -# CAN Device Drivers -# -CONFIG_CAN_VCAN=m -CONFIG_CAN_VXCAN=m -CONFIG_CAN_SLCAN=m -CONFIG_CAN_DEV=m -CONFIG_CAN_CALC_BITTIMING=y -CONFIG_CAN_LEDS=y -CONFIG_CAN_JANZ_ICAN3=m -CONFIG_CAN_C_CAN=m -CONFIG_CAN_C_CAN_PLATFORM=m -CONFIG_CAN_C_CAN_PCI=m -CONFIG_CAN_CC770=m -CONFIG_CAN_CC770_ISA=m -CONFIG_CAN_CC770_PLATFORM=m -CONFIG_CAN_IFI_CANFD=m -CONFIG_CAN_M_CAN=m -CONFIG_CAN_PEAK_PCIEFD=m -CONFIG_CAN_SJA1000=m -CONFIG_CAN_SJA1000_ISA=m -CONFIG_CAN_SJA1000_PLATFORM=m -CONFIG_CAN_EMS_PCMCIA=m -CONFIG_CAN_EMS_PCI=m -CONFIG_CAN_PEAK_PCMCIA=m -CONFIG_CAN_PEAK_PCI=m -CONFIG_CAN_PEAK_PCIEC=y -CONFIG_CAN_KVASER_PCI=m -CONFIG_CAN_PLX_PCI=m -CONFIG_CAN_SOFTING=m -CONFIG_CAN_SOFTING_CS=m - -# -# CAN SPI interfaces -# -CONFIG_CAN_HI311X=m -CONFIG_CAN_MCP251X=m - -# -# CAN USB interfaces -# -CONFIG_CAN_EMS_USB=m -CONFIG_CAN_ESD_USB2=m -CONFIG_CAN_GS_USB=m -CONFIG_CAN_KVASER_USB=m -CONFIG_CAN_PEAK_USB=m -CONFIG_CAN_8DEV_USB=m -CONFIG_CAN_MCBA_USB=m -# CONFIG_CAN_DEBUG_DEVICES is not set -CONFIG_BT=m -CONFIG_BT_BREDR=y -CONFIG_BT_RFCOMM=m -CONFIG_BT_RFCOMM_TTY=y -CONFIG_BT_BNEP=m -CONFIG_BT_BNEP_MC_FILTER=y -CONFIG_BT_BNEP_PROTO_FILTER=y -CONFIG_BT_CMTP=m -CONFIG_BT_HIDP=m -CONFIG_BT_HS=y -CONFIG_BT_LE=y -CONFIG_BT_6LOWPAN=m -CONFIG_BT_LEDS=y -# CONFIG_BT_SELFTEST is not set -CONFIG_BT_DEBUGFS=y - -# -# Bluetooth 
device drivers -# -CONFIG_BT_INTEL=m -CONFIG_BT_BCM=m -CONFIG_BT_RTL=m -CONFIG_BT_QCA=m -CONFIG_BT_HCIBTUSB=m -CONFIG_BT_HCIBTUSB_BCM=y -CONFIG_BT_HCIBTUSB_RTL=y -CONFIG_BT_HCIBTSDIO=m -CONFIG_BT_HCIUART=m -CONFIG_BT_HCIUART_SERDEV=y -CONFIG_BT_HCIUART_H4=y -CONFIG_BT_HCIUART_NOKIA=m -CONFIG_BT_HCIUART_BCSP=y -CONFIG_BT_HCIUART_ATH3K=y -CONFIG_BT_HCIUART_LL=y -CONFIG_BT_HCIUART_3WIRE=y -CONFIG_BT_HCIUART_INTEL=y -CONFIG_BT_HCIUART_BCM=y -CONFIG_BT_HCIUART_QCA=y -CONFIG_BT_HCIUART_AG6XX=y -CONFIG_BT_HCIUART_MRVL=y -CONFIG_BT_HCIBCM203X=m -CONFIG_BT_HCIBPA10X=m -CONFIG_BT_HCIBFUSB=m -CONFIG_BT_HCIDTL1=m -CONFIG_BT_HCIBT3C=m -CONFIG_BT_HCIBLUECARD=m -CONFIG_BT_HCIBTUART=m -CONFIG_BT_HCIVHCI=m -CONFIG_BT_MRVL=m -CONFIG_BT_MRVL_SDIO=m -CONFIG_BT_ATH3K=m -CONFIG_BT_WILINK=m -CONFIG_AF_RXRPC=m -CONFIG_AF_RXRPC_IPV6=y -# CONFIG_AF_RXRPC_INJECT_LOSS is not set -# CONFIG_AF_RXRPC_DEBUG is not set -# CONFIG_RXKAD is not set -CONFIG_AF_KCM=m -CONFIG_STREAM_PARSER=m -CONFIG_FIB_RULES=y -CONFIG_WIRELESS=y -CONFIG_WIRELESS_EXT=y -CONFIG_WEXT_CORE=y -CONFIG_WEXT_PROC=y -CONFIG_WEXT_SPY=y -CONFIG_WEXT_PRIV=y -CONFIG_CFG80211=m -CONFIG_NL80211_TESTMODE=y -# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set -CONFIG_CFG80211_DEFAULT_PS=y -# CONFIG_CFG80211_DEBUGFS is not set -# CONFIG_CFG80211_INTERNAL_REGDB is not set -CONFIG_CFG80211_CRDA_SUPPORT=y -CONFIG_CFG80211_WEXT=y -CONFIG_CFG80211_WEXT_EXPORT=y -CONFIG_LIB80211=m -CONFIG_LIB80211_CRYPT_WEP=m -CONFIG_LIB80211_CRYPT_CCMP=m -CONFIG_LIB80211_CRYPT_TKIP=m -# CONFIG_LIB80211_DEBUG is not set -CONFIG_MAC80211=m -CONFIG_MAC80211_HAS_RC=y -CONFIG_MAC80211_RC_MINSTREL=y -CONFIG_MAC80211_RC_MINSTREL_HT=y -# CONFIG_MAC80211_RC_MINSTREL_VHT is not set -CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y -CONFIG_MAC80211_RC_DEFAULT="minstrel_ht" -CONFIG_MAC80211_MESH=y -CONFIG_MAC80211_LEDS=y -# CONFIG_MAC80211_DEBUGFS is not set -# CONFIG_MAC80211_MESSAGE_TRACING is not set -# CONFIG_MAC80211_DEBUG_MENU is not set -CONFIG_MAC80211_STA_HASH_MAX_SIZE=0 -CONFIG_WIMAX=m -CONFIG_WIMAX_DEBUG_LEVEL=8 -CONFIG_RFKILL=m -CONFIG_RFKILL_LEDS=y -CONFIG_RFKILL_INPUT=y -CONFIG_RFKILL_GPIO=m -CONFIG_NET_9P=m -CONFIG_NET_9P_VIRTIO=m -CONFIG_NET_9P_RDMA=m -# CONFIG_NET_9P_DEBUG is not set -CONFIG_CAIF=m -# CONFIG_CAIF_DEBUG is not set -CONFIG_CAIF_NETDEV=m -CONFIG_CAIF_USB=m -CONFIG_CEPH_LIB=m -# CONFIG_CEPH_LIB_PRETTYDEBUG is not set -CONFIG_CEPH_LIB_USE_DNS_RESOLVER=y -CONFIG_NFC=m -CONFIG_NFC_DIGITAL=m -CONFIG_NFC_NCI=m -CONFIG_NFC_NCI_SPI=m -CONFIG_NFC_NCI_UART=m -CONFIG_NFC_HCI=m -CONFIG_NFC_SHDLC=y - -# -# Near Field Communication (NFC) devices -# -CONFIG_NFC_TRF7970A=m -CONFIG_NFC_MEI_PHY=m -CONFIG_NFC_SIM=m -CONFIG_NFC_PORT100=m -CONFIG_NFC_FDP=m -CONFIG_NFC_FDP_I2C=m -CONFIG_NFC_PN544=m -CONFIG_NFC_PN544_I2C=m -CONFIG_NFC_PN544_MEI=m -CONFIG_NFC_PN533=m -CONFIG_NFC_PN533_USB=m -CONFIG_NFC_PN533_I2C=m -CONFIG_NFC_MICROREAD=m -CONFIG_NFC_MICROREAD_I2C=m -CONFIG_NFC_MICROREAD_MEI=m -CONFIG_NFC_MRVL=m -CONFIG_NFC_MRVL_USB=m -CONFIG_NFC_MRVL_UART=m -CONFIG_NFC_MRVL_I2C=m -CONFIG_NFC_MRVL_SPI=m -CONFIG_NFC_ST21NFCA=m -CONFIG_NFC_ST21NFCA_I2C=m -CONFIG_NFC_ST_NCI=m -CONFIG_NFC_ST_NCI_I2C=m -CONFIG_NFC_ST_NCI_SPI=m -CONFIG_NFC_NXP_NCI=m -CONFIG_NFC_NXP_NCI_I2C=m -CONFIG_NFC_S3FWRN5=m -CONFIG_NFC_S3FWRN5_I2C=m -CONFIG_NFC_ST95HF=m -CONFIG_PSAMPLE=m -CONFIG_NET_IFE=m -CONFIG_LWTUNNEL=y -CONFIG_LWTUNNEL_BPF=y -CONFIG_DST_CACHE=y -CONFIG_GRO_CELLS=y -CONFIG_NET_DEVLINK=m -CONFIG_MAY_USE_DEVLINK=m -CONFIG_HAVE_EBPF_JIT=y - -# -# Device Drivers -# - -# -# Generic Driver Options -# -# 
CONFIG_UEVENT_HELPER is not set -CONFIG_DEVTMPFS=y -CONFIG_DEVTMPFS_MOUNT=y -CONFIG_STANDALONE=y -CONFIG_PREVENT_FIRMWARE_BUILD=y -CONFIG_FW_LOADER=y -# CONFIG_FIRMWARE_IN_KERNEL is not set -CONFIG_EXTRA_FIRMWARE="" -CONFIG_FW_LOADER_USER_HELPER=y -# CONFIG_FW_LOADER_USER_HELPER_FALLBACK is not set -CONFIG_WANT_DEV_COREDUMP=y -CONFIG_ALLOW_DEV_COREDUMP=y -CONFIG_DEV_COREDUMP=y -# CONFIG_DEBUG_DRIVER is not set -# CONFIG_DEBUG_DEVRES is not set -# CONFIG_DEBUG_TEST_DRIVER_REMOVE is not set -CONFIG_TEST_ASYNC_DRIVER_PROBE=m -# CONFIG_SYS_HYPERVISOR is not set -# CONFIG_GENERIC_CPU_DEVICES is not set -CONFIG_GENERIC_CPU_AUTOPROBE=y -CONFIG_GENERIC_CPU_VULNERABILITIES=y -CONFIG_REGMAP=y -CONFIG_REGMAP_I2C=m -CONFIG_REGMAP_SPI=y -CONFIG_REGMAP_SPMI=m -CONFIG_REGMAP_W1=m -CONFIG_REGMAP_MMIO=y -CONFIG_REGMAP_IRQ=y -CONFIG_DMA_SHARED_BUFFER=y -# CONFIG_DMA_FENCE_TRACE is not set -# CONFIG_DMA_CMA is not set - -# -# Bus devices -# -CONFIG_CONNECTOR=m -CONFIG_MTD=m -CONFIG_MTD_TESTS=m -CONFIG_MTD_REDBOOT_PARTS=m -CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1 -CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED=y -CONFIG_MTD_REDBOOT_PARTS_READONLY=y -CONFIG_MTD_CMDLINE_PARTS=m -CONFIG_MTD_AR7_PARTS=m - -# -# Partition parsers -# - -# -# User Modules And Translation Layers -# -CONFIG_MTD_BLKDEVS=m -CONFIG_MTD_BLOCK=m -CONFIG_MTD_BLOCK_RO=m -CONFIG_FTL=m -CONFIG_NFTL=m -CONFIG_NFTL_RW=y -CONFIG_INFTL=m -CONFIG_RFD_FTL=m -CONFIG_SSFDC=m -CONFIG_SM_FTL=m -CONFIG_MTD_OOPS=m -CONFIG_MTD_SWAP=m -# CONFIG_MTD_PARTITIONED_MASTER is not set - -# -# RAM/ROM/Flash chip drivers -# -CONFIG_MTD_CFI=m -CONFIG_MTD_JEDECPROBE=m -CONFIG_MTD_GEN_PROBE=m -# CONFIG_MTD_CFI_ADV_OPTIONS is not set -CONFIG_MTD_MAP_BANK_WIDTH_1=y -CONFIG_MTD_MAP_BANK_WIDTH_2=y -CONFIG_MTD_MAP_BANK_WIDTH_4=y -# CONFIG_MTD_MAP_BANK_WIDTH_8 is not set -# CONFIG_MTD_MAP_BANK_WIDTH_16 is not set -# CONFIG_MTD_MAP_BANK_WIDTH_32 is not set -CONFIG_MTD_CFI_I1=y -CONFIG_MTD_CFI_I2=y -# CONFIG_MTD_CFI_I4 is not set -# CONFIG_MTD_CFI_I8 is not set -CONFIG_MTD_CFI_INTELEXT=m -CONFIG_MTD_CFI_AMDSTD=m -CONFIG_MTD_CFI_STAA=m -CONFIG_MTD_CFI_UTIL=m -CONFIG_MTD_RAM=m -CONFIG_MTD_ROM=m -CONFIG_MTD_ABSENT=m - -# -# Mapping drivers for chip access -# -CONFIG_MTD_COMPLEX_MAPPINGS=y -CONFIG_MTD_PHYSMAP=m -# CONFIG_MTD_PHYSMAP_COMPAT is not set -CONFIG_MTD_SBC_GXX=m -CONFIG_MTD_AMD76XROM=m -CONFIG_MTD_ICHXROM=m -CONFIG_MTD_ESB2ROM=m -CONFIG_MTD_CK804XROM=m -CONFIG_MTD_SCB2_FLASH=m -CONFIG_MTD_NETtel=m -CONFIG_MTD_L440GX=m -CONFIG_MTD_PCI=m -CONFIG_MTD_PCMCIA=m -# CONFIG_MTD_PCMCIA_ANONYMOUS is not set -CONFIG_MTD_GPIO_ADDR=m -CONFIG_MTD_INTEL_VR_NOR=m -CONFIG_MTD_PLATRAM=m -CONFIG_MTD_LATCH_ADDR=m - -# -# Self-contained MTD device drivers -# -CONFIG_MTD_PMC551=m -CONFIG_MTD_PMC551_BUGFIX=y -# CONFIG_MTD_PMC551_DEBUG is not set -CONFIG_MTD_DATAFLASH=m -CONFIG_MTD_DATAFLASH_WRITE_VERIFY=y -CONFIG_MTD_DATAFLASH_OTP=y -CONFIG_MTD_M25P80=m -CONFIG_MTD_MCHP23K256=m -CONFIG_MTD_SST25L=m -CONFIG_MTD_SLRAM=m -CONFIG_MTD_PHRAM=m -CONFIG_MTD_MTDRAM=m -CONFIG_MTDRAM_TOTAL_SIZE=4096 -CONFIG_MTDRAM_ERASE_SIZE=128 -CONFIG_MTD_BLOCK2MTD=m - -# -# Disk-On-Chip Device Drivers -# -CONFIG_MTD_DOCG3=m -CONFIG_BCH_CONST_M=14 -CONFIG_BCH_CONST_T=4 -CONFIG_MTD_NAND_ECC=m -CONFIG_MTD_NAND_ECC_SMC=y -CONFIG_MTD_NAND=m -CONFIG_MTD_NAND_BCH=m -CONFIG_MTD_NAND_ECC_BCH=y -CONFIG_MTD_SM_COMMON=m -CONFIG_MTD_NAND_DENALI=m -CONFIG_MTD_NAND_DENALI_PCI=m -CONFIG_MTD_NAND_GPIO=m -# CONFIG_MTD_NAND_OMAP_BCH_BUILD is not set -CONFIG_MTD_NAND_RICOH=m -CONFIG_MTD_NAND_DISKONCHIP=m -CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED=y 
-CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0x0 -CONFIG_MTD_NAND_DISKONCHIP_PROBE_HIGH=y -CONFIG_MTD_NAND_DISKONCHIP_BBTWRITE=y -CONFIG_MTD_NAND_DOCG4=m -CONFIG_MTD_NAND_CAFE=m -CONFIG_MTD_NAND_NANDSIM=m -CONFIG_MTD_NAND_PLATFORM=m -CONFIG_MTD_ONENAND=m -CONFIG_MTD_ONENAND_VERIFY_WRITE=y -CONFIG_MTD_ONENAND_GENERIC=m -CONFIG_MTD_ONENAND_OTP=y -CONFIG_MTD_ONENAND_2X_PROGRAM=y - -# -# LPDDR & LPDDR2 PCM memory drivers -# -CONFIG_MTD_LPDDR=m -CONFIG_MTD_QINFO_PROBE=m -CONFIG_MTD_SPI_NOR=m -CONFIG_MTD_MT81xx_NOR=m -CONFIG_MTD_SPI_NOR_USE_4K_SECTORS=y -CONFIG_MTD_UBI=m -CONFIG_MTD_UBI_WL_THRESHOLD=4096 -CONFIG_MTD_UBI_BEB_LIMIT=20 -CONFIG_MTD_UBI_FASTMAP=y -# CONFIG_MTD_UBI_GLUEBI is not set -CONFIG_MTD_UBI_BLOCK=y -# CONFIG_OF is not set -CONFIG_ARCH_MIGHT_HAVE_PC_PARPORT=y -CONFIG_PARPORT=m -CONFIG_PARPORT_PC=m -CONFIG_PARPORT_SERIAL=m -CONFIG_PARPORT_PC_FIFO=y -CONFIG_PARPORT_PC_SUPERIO=y -CONFIG_PARPORT_PC_PCMCIA=m -# CONFIG_PARPORT_GSC is not set -CONFIG_PARPORT_AX88796=m -CONFIG_PARPORT_1284=y -CONFIG_PARPORT_NOT_PC=y -CONFIG_PNP=y -# CONFIG_PNP_DEBUG_MESSAGES is not set - -# -# Protocols -# -CONFIG_PNPACPI=y -CONFIG_BLK_DEV=y -# CONFIG_BLK_DEV_NULL_BLK is not set -CONFIG_BLK_DEV_FD=m -CONFIG_PARIDE=m - -# -# Parallel IDE high-level drivers -# -CONFIG_PARIDE_PD=m -CONFIG_PARIDE_PCD=m -CONFIG_PARIDE_PF=m -CONFIG_PARIDE_PT=m -CONFIG_PARIDE_PG=m - -# -# Parallel IDE protocol modules -# -CONFIG_PARIDE_ATEN=m -CONFIG_PARIDE_BPCK=m -CONFIG_PARIDE_COMM=m -CONFIG_PARIDE_DSTR=m -CONFIG_PARIDE_FIT2=m -CONFIG_PARIDE_FIT3=m -CONFIG_PARIDE_EPAT=m -CONFIG_PARIDE_EPATC8=y -CONFIG_PARIDE_EPIA=m -CONFIG_PARIDE_FRIQ=m -CONFIG_PARIDE_FRPW=m -CONFIG_PARIDE_KBIC=m -CONFIG_PARIDE_KTTI=m -CONFIG_PARIDE_ON20=m -CONFIG_PARIDE_ON26=m -CONFIG_BLK_DEV_PCIESSD_MTIP32XX=m -CONFIG_ZRAM=m -CONFIG_ZRAM_WRITEBACK=y -CONFIG_BLK_DEV_DAC960=m -CONFIG_BLK_DEV_UMEM=m -# CONFIG_BLK_DEV_COW_COMMON is not set -CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 -CONFIG_BLK_DEV_CRYPTOLOOP=m -CONFIG_BLK_DEV_DRBD=m -# CONFIG_DRBD_FAULT_INJECTION is not set -CONFIG_BLK_DEV_NBD=m -CONFIG_BLK_DEV_SKD=m -CONFIG_BLK_DEV_SX8=m -CONFIG_BLK_DEV_RAM=m -CONFIG_BLK_DEV_RAM_COUNT=16 -CONFIG_BLK_DEV_RAM_SIZE=16384 -CONFIG_BLK_DEV_RAM_DAX=y -CONFIG_CDROM_PKTCDVD=m -CONFIG_CDROM_PKTCDVD_BUFFERS=8 -CONFIG_CDROM_PKTCDVD_WCACHE=y -CONFIG_ATA_OVER_ETH=m -CONFIG_VIRTIO_BLK=m -# CONFIG_VIRTIO_BLK_SCSI is not set -CONFIG_BLK_DEV_RBD=m -CONFIG_BLK_DEV_RSXX=m -CONFIG_NVME_CORE=m -CONFIG_BLK_DEV_NVME=m -CONFIG_NVME_FABRICS=m -CONFIG_NVME_RDMA=m -CONFIG_NVME_FC=m -CONFIG_NVME_TARGET=m -CONFIG_NVME_TARGET_LOOP=m -CONFIG_NVME_TARGET_RDMA=m -CONFIG_NVME_TARGET_FC=m -CONFIG_NVME_TARGET_FCLOOP=m - -# -# Misc devices -# -CONFIG_SENSORS_LIS3LV02D=m -CONFIG_AD525X_DPOT=m -CONFIG_AD525X_DPOT_I2C=m -CONFIG_AD525X_DPOT_SPI=m -# CONFIG_DUMMY_IRQ is not set -CONFIG_IBM_ASM=m -CONFIG_PHANTOM=m -CONFIG_SGI_IOC4=m -CONFIG_TIFM_CORE=m -CONFIG_TIFM_7XX1=m -CONFIG_ICS932S401=m -CONFIG_ENCLOSURE_SERVICES=m -CONFIG_HP_ILO=m -CONFIG_APDS9802ALS=m -CONFIG_ISL29003=m -CONFIG_ISL29020=m -CONFIG_SENSORS_TSL2550=m -CONFIG_SENSORS_BH1770=m -CONFIG_SENSORS_APDS990X=m -CONFIG_HMC6352=m -CONFIG_DS1682=m -CONFIG_TI_DAC7512=m -CONFIG_VMWARE_BALLOON=m -CONFIG_USB_SWITCH_FSA9480=m -CONFIG_LATTICE_ECP3_CONFIG=m -CONFIG_SRAM=y -CONFIG_PCI_ENDPOINT_TEST=m -CONFIG_C2PORT=m -CONFIG_C2PORT_DURAMAR_2150=m - -# -# EEPROM support -# -CONFIG_EEPROM_AT24=m -CONFIG_EEPROM_AT25=m -CONFIG_EEPROM_LEGACY=m -CONFIG_EEPROM_MAX6875=m -CONFIG_EEPROM_93CX6=m -CONFIG_EEPROM_93XX46=m -CONFIG_EEPROM_IDT_89HPESX=m 
-CONFIG_CB710_CORE=m -# CONFIG_CB710_DEBUG is not set -CONFIG_CB710_DEBUG_ASSUMPTIONS=y - -# -# Texas Instruments shared transport line discipline -# -CONFIG_TI_ST=m -CONFIG_SENSORS_LIS3_I2C=m - -# -# Altera FPGA firmware download module -# -CONFIG_ALTERA_STAPL=m -CONFIG_INTEL_MEI=y -CONFIG_INTEL_MEI_ME=y -CONFIG_INTEL_MEI_TXE=m -CONFIG_VMWARE_VMCI=m - -# -# Intel MIC Bus Driver -# -CONFIG_INTEL_MIC_BUS=m - -# -# SCIF Bus Driver -# -CONFIG_SCIF_BUS=m - -# -# VOP Bus Driver -# -CONFIG_VOP_BUS=m - -# -# Intel MIC Host Driver -# -CONFIG_INTEL_MIC_HOST=m - -# -# Intel MIC Card Driver -# -CONFIG_INTEL_MIC_CARD=m - -# -# SCIF Driver -# -CONFIG_SCIF=m - -# -# Intel MIC Coprocessor State Management (COSM) Drivers -# -CONFIG_MIC_COSM=m - -# -# VOP Driver -# -CONFIG_VOP=m -CONFIG_VHOST_RING=m -CONFIG_GENWQE=m -CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY=0 -CONFIG_ECHO=m -# CONFIG_CXL_BASE is not set -# CONFIG_CXL_AFU_DRIVER_OPS is not set -# CONFIG_CXL_LIB is not set -CONFIG_HAVE_IDE=y -# CONFIG_IDE is not set - -# -# SCSI device support -# -CONFIG_SCSI_MOD=m -CONFIG_RAID_ATTRS=m -CONFIG_SCSI=m -CONFIG_SCSI_DMA=y -CONFIG_SCSI_NETLINK=y -# CONFIG_SCSI_MQ_DEFAULT is not set -CONFIG_SCSI_PROC_FS=y - -# -# SCSI support type (disk, tape, CD-ROM) -# -CONFIG_BLK_DEV_SD=m -CONFIG_CHR_DEV_ST=m -CONFIG_CHR_DEV_OSST=m -CONFIG_BLK_DEV_SR=m -CONFIG_BLK_DEV_SR_VENDOR=y -CONFIG_CHR_DEV_SG=m -CONFIG_CHR_DEV_SCH=m -CONFIG_SCSI_ENCLOSURE=m -CONFIG_SCSI_CONSTANTS=y -CONFIG_SCSI_LOGGING=y -CONFIG_SCSI_SCAN_ASYNC=y - -# -# SCSI Transports -# -CONFIG_SCSI_SPI_ATTRS=m -CONFIG_SCSI_FC_ATTRS=m -CONFIG_SCSI_ISCSI_ATTRS=m -CONFIG_SCSI_SAS_ATTRS=m -CONFIG_SCSI_SAS_LIBSAS=m -CONFIG_SCSI_SAS_ATA=y -CONFIG_SCSI_SAS_HOST_SMP=y -CONFIG_SCSI_SRP_ATTRS=m -CONFIG_SCSI_LOWLEVEL=y -CONFIG_ISCSI_TCP=m -CONFIG_ISCSI_BOOT_SYSFS=m -CONFIG_SCSI_CXGB3_ISCSI=m -CONFIG_SCSI_CXGB4_ISCSI=m -CONFIG_SCSI_BNX2_ISCSI=m -CONFIG_SCSI_BNX2X_FCOE=m -CONFIG_BE2ISCSI=m -CONFIG_BLK_DEV_3W_XXXX_RAID=m -CONFIG_SCSI_HPSA=m -CONFIG_SCSI_3W_9XXX=m -CONFIG_SCSI_3W_SAS=m -CONFIG_SCSI_ACARD=m -CONFIG_SCSI_AACRAID=m -CONFIG_SCSI_AIC7XXX=m -CONFIG_AIC7XXX_CMDS_PER_DEVICE=4 -CONFIG_AIC7XXX_RESET_DELAY_MS=15000 -# CONFIG_AIC7XXX_DEBUG_ENABLE is not set -CONFIG_AIC7XXX_DEBUG_MASK=0 -# CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set -CONFIG_SCSI_AIC79XX=m -CONFIG_AIC79XX_CMDS_PER_DEVICE=4 -CONFIG_AIC79XX_RESET_DELAY_MS=15000 -# CONFIG_AIC79XX_DEBUG_ENABLE is not set -CONFIG_AIC79XX_DEBUG_MASK=0 -# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set -CONFIG_SCSI_AIC94XX=m -# CONFIG_AIC94XX_DEBUG is not set -CONFIG_SCSI_MVSAS=m -# CONFIG_SCSI_MVSAS_DEBUG is not set -CONFIG_SCSI_MVSAS_TASKLET=y -CONFIG_SCSI_MVUMI=m -CONFIG_SCSI_DPT_I2O=m -CONFIG_SCSI_ADVANSYS=m -CONFIG_SCSI_ARCMSR=m -CONFIG_SCSI_ESAS2R=m -CONFIG_MEGARAID_NEWGEN=y -CONFIG_MEGARAID_MM=m -CONFIG_MEGARAID_MAILBOX=m -CONFIG_MEGARAID_LEGACY=m -CONFIG_MEGARAID_SAS=m -CONFIG_SCSI_MPT3SAS=m -CONFIG_SCSI_MPT2SAS_MAX_SGE=128 -CONFIG_SCSI_MPT3SAS_MAX_SGE=128 -CONFIG_SCSI_MPT2SAS=m -CONFIG_SCSI_SMARTPQI=m -CONFIG_SCSI_UFSHCD=m -CONFIG_SCSI_UFSHCD_PCI=m -CONFIG_SCSI_UFS_DWC_TC_PCI=m -CONFIG_SCSI_UFSHCD_PLATFORM=m -CONFIG_SCSI_UFS_DWC_TC_PLATFORM=m -CONFIG_SCSI_HPTIOP=m -CONFIG_SCSI_BUSLOGIC=m -CONFIG_SCSI_FLASHPOINT=y -CONFIG_VMWARE_PVSCSI=m -CONFIG_HYPERV_STORAGE=m -CONFIG_LIBFC=m -CONFIG_LIBFCOE=m -CONFIG_FCOE=m -CONFIG_FCOE_FNIC=m -CONFIG_SCSI_SNIC=m -# CONFIG_SCSI_SNIC_DEBUG_FS is not set -CONFIG_SCSI_DMX3191D=m -CONFIG_SCSI_EATA=m -# CONFIG_SCSI_EATA_TAGGED_QUEUE is not set -# CONFIG_SCSI_EATA_LINKED_COMMANDS is not set 
-CONFIG_SCSI_EATA_MAX_TAGS=16 -CONFIG_SCSI_FUTURE_DOMAIN=m -CONFIG_SCSI_GDTH=m -CONFIG_SCSI_ISCI=m -CONFIG_SCSI_IPS=m -CONFIG_SCSI_INITIO=m -CONFIG_SCSI_INIA100=m -CONFIG_SCSI_PPA=m -CONFIG_SCSI_IMM=m -# CONFIG_SCSI_IZIP_EPP16 is not set -# CONFIG_SCSI_IZIP_SLOW_CTR is not set -CONFIG_SCSI_STEX=m -CONFIG_SCSI_SYM53C8XX_2=m -CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 -CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 -CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 -CONFIG_SCSI_SYM53C8XX_MMIO=y -CONFIG_SCSI_IPR=m -# CONFIG_SCSI_IPR_TRACE is not set -# CONFIG_SCSI_IPR_DUMP is not set -CONFIG_SCSI_QLOGIC_1280=m -CONFIG_SCSI_QLA_FC=m -CONFIG_TCM_QLA2XXX=m -# CONFIG_TCM_QLA2XXX_DEBUG is not set -CONFIG_SCSI_QLA_ISCSI=m -CONFIG_QEDI=m -CONFIG_QEDF=m -CONFIG_SCSI_LPFC=m -# CONFIG_SCSI_LPFC_DEBUG_FS is not set -CONFIG_SCSI_DC395x=m -CONFIG_SCSI_AM53C974=m -CONFIG_SCSI_WD719X=m -# CONFIG_SCSI_DEBUG is not set -CONFIG_SCSI_PMCRAID=m -CONFIG_SCSI_PM8001=m -CONFIG_SCSI_BFA_FC=m -CONFIG_SCSI_VIRTIO=m -CONFIG_SCSI_CHELSIO_FCOE=m -CONFIG_SCSI_LOWLEVEL_PCMCIA=y -CONFIG_PCMCIA_AHA152X=m -CONFIG_PCMCIA_FDOMAIN=m -CONFIG_PCMCIA_QLOGIC=m -CONFIG_PCMCIA_SYM53C500=m -CONFIG_SCSI_DH=y -CONFIG_SCSI_DH_RDAC=m -CONFIG_SCSI_DH_HP_SW=m -CONFIG_SCSI_DH_EMC=m -CONFIG_SCSI_DH_ALUA=m -CONFIG_SCSI_OSD_INITIATOR=m -CONFIG_SCSI_OSD_ULD=m -CONFIG_SCSI_OSD_DPRINT_SENSE=1 -# CONFIG_SCSI_OSD_DEBUG is not set -CONFIG_ATA=m -# CONFIG_ATA_NONSTANDARD is not set -CONFIG_ATA_VERBOSE_ERROR=y -CONFIG_ATA_ACPI=y -CONFIG_SATA_ZPODD=y -CONFIG_SATA_PMP=y - -# -# Controllers with non-SFF native interface -# -CONFIG_SATA_AHCI=m -CONFIG_SATA_AHCI_PLATFORM=m -CONFIG_SATA_INIC162X=m -CONFIG_SATA_ACARD_AHCI=m -CONFIG_SATA_SIL24=m -CONFIG_ATA_SFF=y - -# -# SFF controllers with custom DMA interface -# -CONFIG_PDC_ADMA=m -CONFIG_SATA_QSTOR=m -CONFIG_SATA_SX4=m -CONFIG_ATA_BMDMA=y - -# -# SATA SFF controllers with BMDMA -# -CONFIG_ATA_PIIX=m -CONFIG_SATA_DWC=m -# CONFIG_SATA_DWC_OLD_DMA is not set -# CONFIG_SATA_DWC_DEBUG is not set -CONFIG_SATA_MV=m -CONFIG_SATA_NV=m -CONFIG_SATA_PROMISE=m -CONFIG_SATA_SIL=m -CONFIG_SATA_SIS=m -CONFIG_SATA_SVW=m -CONFIG_SATA_ULI=m -CONFIG_SATA_VIA=m -CONFIG_SATA_VITESSE=m - -# -# PATA SFF controllers with BMDMA -# -CONFIG_PATA_ALI=m -CONFIG_PATA_AMD=m -CONFIG_PATA_ARTOP=m -CONFIG_PATA_ATIIXP=m -CONFIG_PATA_ATP867X=m -CONFIG_PATA_CMD64X=m -CONFIG_PATA_CYPRESS=m -CONFIG_PATA_EFAR=m -CONFIG_PATA_HPT366=m -CONFIG_PATA_HPT37X=m -CONFIG_PATA_HPT3X2N=m -CONFIG_PATA_HPT3X3=m -CONFIG_PATA_HPT3X3_DMA=y -CONFIG_PATA_IT8213=m -CONFIG_PATA_IT821X=m -CONFIG_PATA_JMICRON=m -CONFIG_PATA_MARVELL=m -CONFIG_PATA_NETCELL=m -CONFIG_PATA_NINJA32=m -CONFIG_PATA_NS87415=m -CONFIG_PATA_OLDPIIX=m -CONFIG_PATA_OPTIDMA=m -CONFIG_PATA_PDC2027X=m -CONFIG_PATA_PDC_OLD=m -CONFIG_PATA_RADISYS=m -CONFIG_PATA_RDC=m -CONFIG_PATA_SCH=m -CONFIG_PATA_SERVERWORKS=m -CONFIG_PATA_SIL680=m -CONFIG_PATA_SIS=m -CONFIG_PATA_TOSHIBA=m -CONFIG_PATA_TRIFLEX=m -CONFIG_PATA_VIA=m -CONFIG_PATA_WINBOND=m - -# -# PIO-only SFF controllers -# -CONFIG_PATA_CMD640_PCI=m -CONFIG_PATA_MPIIX=m -CONFIG_PATA_NS87410=m -CONFIG_PATA_OPTI=m -CONFIG_PATA_PCMCIA=m -CONFIG_PATA_RZ1000=m - -# -# Generic fallback / legacy drivers -# -CONFIG_PATA_ACPI=m -CONFIG_ATA_GENERIC=m -CONFIG_PATA_LEGACY=m -CONFIG_MD=y -CONFIG_BLK_DEV_MD=m -CONFIG_MD_LINEAR=m -CONFIG_MD_RAID0=m -CONFIG_MD_RAID1=m -CONFIG_MD_RAID10=m -CONFIG_MD_RAID456=m -CONFIG_MD_MULTIPATH=m -CONFIG_MD_FAULTY=m -CONFIG_MD_CLUSTER=m -CONFIG_BCACHE=m -# CONFIG_BCACHE_DEBUG is not set -# CONFIG_BCACHE_CLOSURES_DEBUG is not set 
-CONFIG_BLK_DEV_DM_BUILTIN=y -CONFIG_BLK_DEV_DM=m -# CONFIG_DM_MQ_DEFAULT is not set -# CONFIG_DM_DEBUG is not set -CONFIG_DM_BUFIO=m -# CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING is not set -CONFIG_DM_BIO_PRISON=m -CONFIG_DM_PERSISTENT_DATA=m -CONFIG_DM_CRYPT=m -CONFIG_DM_SNAPSHOT=m -CONFIG_DM_THIN_PROVISIONING=m -CONFIG_DM_CACHE=m -CONFIG_DM_CACHE_SMQ=m -CONFIG_DM_ERA=m -CONFIG_DM_MIRROR=m -CONFIG_DM_LOG_USERSPACE=m -CONFIG_DM_RAID=m -CONFIG_DM_ZERO=m -CONFIG_DM_MULTIPATH=m -CONFIG_DM_MULTIPATH_QL=m -CONFIG_DM_MULTIPATH_ST=m -CONFIG_DM_DELAY=m -CONFIG_DM_UEVENT=y -CONFIG_DM_FLAKEY=m -CONFIG_DM_VERITY=m -# CONFIG_DM_VERITY_FEC is not set -CONFIG_DM_SWITCH=m -CONFIG_DM_LOG_WRITES=m -CONFIG_DM_INTEGRITY=m -CONFIG_DM_ZONED=m -CONFIG_TARGET_CORE=m -CONFIG_TCM_IBLOCK=m -CONFIG_TCM_FILEIO=m -CONFIG_TCM_PSCSI=m -CONFIG_TCM_USER2=m -CONFIG_LOOPBACK_TARGET=m -CONFIG_TCM_FC=m -CONFIG_ISCSI_TARGET=m -CONFIG_ISCSI_TARGET_CXGB4=m -CONFIG_SBP_TARGET=m -CONFIG_FUSION=y -CONFIG_FUSION_SPI=m -CONFIG_FUSION_FC=m -CONFIG_FUSION_SAS=m -CONFIG_FUSION_MAX_SGE=128 -CONFIG_FUSION_CTL=m -CONFIG_FUSION_LAN=m -CONFIG_FUSION_LOGGING=y - -# -# IEEE 1394 (FireWire) support -# -CONFIG_FIREWIRE=m -CONFIG_FIREWIRE_OHCI=m -CONFIG_FIREWIRE_SBP2=m -CONFIG_FIREWIRE_NET=m -CONFIG_FIREWIRE_NOSY=m -CONFIG_MACINTOSH_DRIVERS=y -CONFIG_MAC_EMUMOUSEBTN=m -CONFIG_NETDEVICES=y -CONFIG_MII=m -CONFIG_NET_CORE=y -CONFIG_BONDING=m -CONFIG_DUMMY=m -CONFIG_EQUALIZER=m -CONFIG_NET_FC=y -CONFIG_IFB=m -CONFIG_NET_TEAM=m -CONFIG_NET_TEAM_MODE_BROADCAST=m -CONFIG_NET_TEAM_MODE_ROUNDROBIN=m -CONFIG_NET_TEAM_MODE_RANDOM=m -CONFIG_NET_TEAM_MODE_ACTIVEBACKUP=m -CONFIG_NET_TEAM_MODE_LOADBALANCE=m -CONFIG_MACVLAN=m -CONFIG_MACVTAP=m -CONFIG_IPVLAN=m -CONFIG_IPVTAP=m -CONFIG_VXLAN=m -CONFIG_GENEVE=m -CONFIG_GTP=m -CONFIG_MACSEC=m -CONFIG_NETCONSOLE=m -CONFIG_NETCONSOLE_DYNAMIC=y -CONFIG_NETPOLL=y -CONFIG_NET_POLL_CONTROLLER=y -# CONFIG_NTB_NETDEV is not set -CONFIG_RIONET=m -CONFIG_RIONET_TX_SIZE=128 -CONFIG_RIONET_RX_SIZE=128 -CONFIG_TUN=m -CONFIG_TAP=m -# CONFIG_TUN_VNET_CROSS_LE is not set -CONFIG_VETH=m -CONFIG_VIRTIO_NET=m -CONFIG_NLMON=m -CONFIG_NET_VRF=m -CONFIG_VSOCKMON=m -CONFIG_SUNGEM_PHY=m -CONFIG_ARCNET=m -CONFIG_ARCNET_1201=m -CONFIG_ARCNET_1051=m -CONFIG_ARCNET_RAW=m -CONFIG_ARCNET_CAP=m -CONFIG_ARCNET_COM90xx=m -CONFIG_ARCNET_COM90xxIO=m -CONFIG_ARCNET_RIM_I=m -CONFIG_ARCNET_COM20020=m -CONFIG_ARCNET_COM20020_PCI=m -CONFIG_ARCNET_COM20020_CS=m -CONFIG_ATM_DRIVERS=y -# CONFIG_ATM_DUMMY is not set -CONFIG_ATM_TCP=m -CONFIG_ATM_LANAI=m -CONFIG_ATM_ENI=m -# CONFIG_ATM_ENI_DEBUG is not set -# CONFIG_ATM_ENI_TUNE_BURST is not set -CONFIG_ATM_FIRESTREAM=m -CONFIG_ATM_ZATM=m -# CONFIG_ATM_ZATM_DEBUG is not set -CONFIG_ATM_NICSTAR=m -CONFIG_ATM_NICSTAR_USE_SUNI=y -CONFIG_ATM_NICSTAR_USE_IDT77105=y -CONFIG_ATM_IDT77252=m -# CONFIG_ATM_IDT77252_DEBUG is not set -# CONFIG_ATM_IDT77252_RCV_ALL is not set -CONFIG_ATM_IDT77252_USE_SUNI=y -CONFIG_ATM_AMBASSADOR=m -# CONFIG_ATM_AMBASSADOR_DEBUG is not set -CONFIG_ATM_HORIZON=m -# CONFIG_ATM_HORIZON_DEBUG is not set -CONFIG_ATM_IA=m -# CONFIG_ATM_IA_DEBUG is not set -CONFIG_ATM_FORE200E=m -CONFIG_ATM_FORE200E_USE_TASKLET=y -CONFIG_ATM_FORE200E_TX_RETRY=16 -CONFIG_ATM_FORE200E_DEBUG=0 -CONFIG_ATM_HE=m -CONFIG_ATM_HE_USE_SUNI=y -CONFIG_ATM_SOLOS=m - -# -# CAIF transport drivers -# -CONFIG_CAIF_TTY=m -CONFIG_CAIF_SPI_SLAVE=m -CONFIG_CAIF_SPI_SYNC=y -CONFIG_CAIF_HSI=m -CONFIG_CAIF_VIRTIO=m - -# -# Distributed Switch Architecture drivers -# -CONFIG_B53=m -CONFIG_B53_SPI_DRIVER=m -CONFIG_B53_MDIO_DRIVER=m 
-CONFIG_B53_MMAP_DRIVER=m -CONFIG_B53_SRAB_DRIVER=m -CONFIG_NET_DSA_LOOP=m -CONFIG_NET_DSA_MT7530=m -CONFIG_NET_DSA_MV88E6060=m -CONFIG_MICROCHIP_KSZ=m -CONFIG_MICROCHIP_KSZ_SPI_DRIVER=m -CONFIG_NET_DSA_MV88E6XXX=m -CONFIG_NET_DSA_MV88E6XXX_GLOBAL2=y -CONFIG_NET_DSA_QCA8K=m -CONFIG_NET_DSA_SMSC_LAN9303=m -CONFIG_NET_DSA_SMSC_LAN9303_I2C=m -CONFIG_NET_DSA_SMSC_LAN9303_MDIO=m -CONFIG_ETHERNET=y -CONFIG_MDIO=m -CONFIG_NET_VENDOR_3COM=y -CONFIG_PCMCIA_3C574=m -CONFIG_PCMCIA_3C589=m -CONFIG_VORTEX=m -CONFIG_TYPHOON=m -CONFIG_NET_VENDOR_ADAPTEC=y -CONFIG_ADAPTEC_STARFIRE=m -CONFIG_NET_VENDOR_AGERE=y -CONFIG_ET131X=m -CONFIG_NET_VENDOR_ALACRITECH=y -CONFIG_SLICOSS=m -CONFIG_NET_VENDOR_ALTEON=y -CONFIG_ACENIC=m -# CONFIG_ACENIC_OMIT_TIGON_I is not set -CONFIG_ALTERA_TSE=m -CONFIG_NET_VENDOR_AMAZON=y -CONFIG_ENA_ETHERNET=m -CONFIG_NET_VENDOR_AMD=y -CONFIG_AMD8111_ETH=m -CONFIG_PCNET32=m -CONFIG_PCMCIA_NMCLAN=m -CONFIG_AMD_XGBE=m -CONFIG_AMD_XGBE_DCB=y -CONFIG_AMD_XGBE_HAVE_ECC=y -CONFIG_NET_VENDOR_AQUANTIA=y -CONFIG_AQTION=m -CONFIG_NET_VENDOR_ARC=y -CONFIG_NET_VENDOR_ATHEROS=y -CONFIG_ATL2=m -CONFIG_ATL1=m -CONFIG_ATL1E=m -CONFIG_ATL1C=m -CONFIG_ALX=m -CONFIG_NET_VENDOR_AURORA=y -CONFIG_AURORA_NB8800=m -CONFIG_NET_CADENCE=y -CONFIG_MACB=m -CONFIG_MACB_USE_HWSTAMP=y -CONFIG_MACB_PCI=m -CONFIG_NET_VENDOR_BROADCOM=y -CONFIG_B44=m -CONFIG_B44_PCI_AUTOSELECT=y -CONFIG_B44_PCICORE_AUTOSELECT=y -CONFIG_B44_PCI=y -CONFIG_BNX2=m -CONFIG_CNIC=m -CONFIG_TIGON3=m -CONFIG_TIGON3_HWMON=y -CONFIG_BNX2X=m -CONFIG_BNX2X_SRIOV=y -CONFIG_BNXT=m -CONFIG_BNXT_SRIOV=y -CONFIG_BNXT_FLOWER_OFFLOAD=y -CONFIG_BNXT_DCB=y -CONFIG_NET_VENDOR_BROCADE=y -CONFIG_BNA=m -CONFIG_NET_VENDOR_CAVIUM=y -CONFIG_THUNDER_NIC_PF=m -CONFIG_THUNDER_NIC_VF=m -CONFIG_THUNDER_NIC_BGX=m -CONFIG_THUNDER_NIC_RGX=m -CONFIG_LIQUIDIO=m -CONFIG_LIQUIDIO_VF=m -CONFIG_NET_VENDOR_CHELSIO=y -CONFIG_CHELSIO_T1=m -CONFIG_CHELSIO_T1_1G=y -CONFIG_CHELSIO_T3=m -CONFIG_CHELSIO_T4=m -CONFIG_CHELSIO_T4_DCB=y -# CONFIG_CHELSIO_T4_FCOE is not set -CONFIG_CHELSIO_T4VF=m -CONFIG_CHELSIO_LIB=m -CONFIG_NET_VENDOR_CISCO=y -CONFIG_ENIC=m -CONFIG_CX_ECAT=m -CONFIG_DNET=m -CONFIG_NET_VENDOR_DEC=y -CONFIG_NET_TULIP=y -CONFIG_DE2104X=m -CONFIG_DE2104X_DSL=0 -CONFIG_TULIP=m -CONFIG_TULIP_MWI=y -CONFIG_TULIP_MMIO=y -CONFIG_TULIP_NAPI=y -CONFIG_TULIP_NAPI_HW_MITIGATION=y -CONFIG_DE4X5=m -CONFIG_WINBOND_840=m -CONFIG_DM9102=m -CONFIG_ULI526X=m -CONFIG_PCMCIA_XIRCOM=m -CONFIG_NET_VENDOR_DLINK=y -CONFIG_DL2K=m -CONFIG_SUNDANCE=m -# CONFIG_SUNDANCE_MMIO is not set -CONFIG_NET_VENDOR_EMULEX=y -CONFIG_BE2NET=m -CONFIG_BE2NET_HWMON=y -CONFIG_NET_VENDOR_EZCHIP=y -CONFIG_NET_VENDOR_EXAR=y -CONFIG_S2IO=m -CONFIG_VXGE=m -# CONFIG_VXGE_DEBUG_TRACE_ALL is not set -CONFIG_NET_VENDOR_FUJITSU=y -CONFIG_PCMCIA_FMVJ18X=m -CONFIG_NET_VENDOR_HP=y -CONFIG_HP100=m -CONFIG_NET_VENDOR_HUAWEI=y -CONFIG_HINIC=m -CONFIG_NET_VENDOR_INTEL=y -CONFIG_E100=m -CONFIG_E1000=m -CONFIG_E1000E=m -CONFIG_E1000E_HWTS=y -CONFIG_IGB=m -CONFIG_IGB_HWMON=y -CONFIG_IGB_DCA=y -CONFIG_IGBVF=m -CONFIG_IXGB=m -CONFIG_IXGBE=m -CONFIG_IXGBE_HWMON=y -CONFIG_IXGBE_DCA=y -CONFIG_IXGBE_DCB=y -CONFIG_IXGBEVF=m -CONFIG_I40E=m -CONFIG_I40E_DCB=y -CONFIG_I40EVF=m -CONFIG_FM10K=m -CONFIG_NET_VENDOR_I825XX=y -CONFIG_JME=m -CONFIG_NET_VENDOR_MARVELL=y -CONFIG_MVMDIO=m -CONFIG_SKGE=m -# CONFIG_SKGE_DEBUG is not set -CONFIG_SKGE_GENESIS=y -CONFIG_SKY2=m -# CONFIG_SKY2_DEBUG is not set -CONFIG_NET_VENDOR_MELLANOX=y -CONFIG_MLX4_EN=m -CONFIG_MLX4_EN_DCB=y -CONFIG_MLX4_CORE=m -CONFIG_MLX4_DEBUG=y -CONFIG_MLX5_CORE=m -CONFIG_MLX5_ACCEL=y 
-CONFIG_MLX5_FPGA=y -# CONFIG_MLX5_CORE_EN is not set -CONFIG_MLXSW_CORE=m -CONFIG_MLXSW_CORE_HWMON=y -CONFIG_MLXSW_CORE_THERMAL=y -CONFIG_MLXSW_PCI=m -CONFIG_MLXSW_I2C=m -CONFIG_MLXSW_SWITCHIB=m -CONFIG_MLXSW_SWITCHX2=m -CONFIG_MLXSW_SPECTRUM=m -CONFIG_MLXSW_SPECTRUM_DCB=y -CONFIG_MLXSW_MINIMAL=m -CONFIG_MLXFW=m -CONFIG_NET_VENDOR_MICREL=y -CONFIG_KS8842=m -CONFIG_KS8851=m -CONFIG_KS8851_MLL=m -CONFIG_KSZ884X_PCI=m -CONFIG_NET_VENDOR_MICROCHIP=y -CONFIG_ENC28J60=m -# CONFIG_ENC28J60_WRITEVERIFY is not set -CONFIG_ENCX24J600=m -CONFIG_NET_VENDOR_MYRI=y -CONFIG_MYRI10GE=m -CONFIG_MYRI10GE_DCA=y -CONFIG_FEALNX=m -CONFIG_NET_VENDOR_NATSEMI=y -CONFIG_NATSEMI=m -CONFIG_NS83820=m -CONFIG_NET_VENDOR_NETRONOME=y -CONFIG_NFP=m -# CONFIG_NFP_APP_FLOWER is not set -# CONFIG_NFP_DEBUG is not set -CONFIG_NET_VENDOR_8390=y -CONFIG_PCMCIA_AXNET=m -CONFIG_NE2K_PCI=m -CONFIG_PCMCIA_PCNET=m -CONFIG_NET_VENDOR_NVIDIA=y -CONFIG_FORCEDETH=m -CONFIG_NET_VENDOR_OKI=y -CONFIG_ETHOC=m -CONFIG_NET_PACKET_ENGINE=y -CONFIG_HAMACHI=m -CONFIG_YELLOWFIN=m -CONFIG_NET_VENDOR_QLOGIC=y -CONFIG_QLA3XXX=m -CONFIG_QLCNIC=m -CONFIG_QLCNIC_SRIOV=y -CONFIG_QLCNIC_DCB=y -CONFIG_QLCNIC_HWMON=y -CONFIG_QLGE=m -CONFIG_NETXEN_NIC=m -CONFIG_QED=m -CONFIG_QED_LL2=y -CONFIG_QED_SRIOV=y -CONFIG_QEDE=m -CONFIG_QED_RDMA=y -CONFIG_QED_ISCSI=y -CONFIG_QED_FCOE=y -CONFIG_NET_VENDOR_QUALCOMM=y -CONFIG_QCOM_EMAC=m -CONFIG_RMNET=m -CONFIG_NET_VENDOR_REALTEK=y -CONFIG_ATP=m -CONFIG_8139CP=m -CONFIG_8139TOO=m -# CONFIG_8139TOO_PIO is not set -CONFIG_8139TOO_TUNE_TWISTER=y -CONFIG_8139TOO_8129=y -# CONFIG_8139_OLD_RX_RESET is not set -CONFIG_R8169=m -CONFIG_NET_VENDOR_RENESAS=y -CONFIG_NET_VENDOR_RDC=y -CONFIG_R6040=m -CONFIG_NET_VENDOR_ROCKER=y -CONFIG_ROCKER=m -CONFIG_NET_VENDOR_SAMSUNG=y -CONFIG_SXGBE_ETH=m -CONFIG_NET_VENDOR_SEEQ=y -CONFIG_NET_VENDOR_SILAN=y -CONFIG_SC92031=m -CONFIG_NET_VENDOR_SIS=y -CONFIG_SIS900=m -CONFIG_SIS190=m -CONFIG_NET_VENDOR_SOLARFLARE=y -CONFIG_SFC=m -CONFIG_SFC_MTD=y -CONFIG_SFC_MCDI_MON=y -CONFIG_SFC_SRIOV=y -CONFIG_SFC_MCDI_LOGGING=y -CONFIG_SFC_FALCON=m -CONFIG_SFC_FALCON_MTD=y -CONFIG_NET_VENDOR_SMSC=y -CONFIG_PCMCIA_SMC91C92=m -CONFIG_EPIC100=m -CONFIG_SMSC911X=m -# CONFIG_SMSC911X_ARCH_HOOKS is not set -CONFIG_SMSC9420=m -CONFIG_NET_VENDOR_STMICRO=y -CONFIG_STMMAC_ETH=m -CONFIG_STMMAC_PLATFORM=m -CONFIG_DWMAC_GENERIC=m -CONFIG_STMMAC_PCI=m -CONFIG_NET_VENDOR_SUN=y -CONFIG_HAPPYMEAL=m -CONFIG_SUNGEM=m -CONFIG_CASSINI=m -CONFIG_NIU=m -CONFIG_NET_VENDOR_TEHUTI=y -CONFIG_TEHUTI=m -CONFIG_NET_VENDOR_TI=y -CONFIG_TI_CPSW_ALE=m -CONFIG_TLAN=m -CONFIG_NET_VENDOR_VIA=y -CONFIG_VIA_RHINE=m -CONFIG_VIA_RHINE_MMIO=y -CONFIG_VIA_VELOCITY=m -CONFIG_NET_VENDOR_WIZNET=y -CONFIG_WIZNET_W5100=m -CONFIG_WIZNET_W5300=m -# CONFIG_WIZNET_BUS_DIRECT is not set -# CONFIG_WIZNET_BUS_INDIRECT is not set -CONFIG_WIZNET_BUS_ANY=y -CONFIG_WIZNET_W5100_SPI=m -CONFIG_NET_VENDOR_XIRCOM=y -CONFIG_PCMCIA_XIRC2PS=m -CONFIG_NET_VENDOR_SYNOPSYS=y -CONFIG_DWC_XLGMAC=m -CONFIG_DWC_XLGMAC_PCI=m -CONFIG_FDDI=m -CONFIG_DEFXX=m -# CONFIG_DEFXX_MMIO is not set -CONFIG_SKFP=m -CONFIG_HIPPI=y -CONFIG_ROADRUNNER=m -CONFIG_ROADRUNNER_LARGE_RINGS=y -CONFIG_NET_SB1000=m -CONFIG_MDIO_DEVICE=m -CONFIG_MDIO_BUS=m -CONFIG_MDIO_BITBANG=m -CONFIG_MDIO_CAVIUM=m -CONFIG_MDIO_GPIO=m -CONFIG_MDIO_THUNDER=m -CONFIG_PHYLIB=m -CONFIG_SWPHY=y -CONFIG_LED_TRIGGER_PHY=y - -# -# MII PHY device drivers -# -CONFIG_AMD_PHY=m -CONFIG_AQUANTIA_PHY=m -CONFIG_AT803X_PHY=m -CONFIG_BCM7XXX_PHY=m -CONFIG_BCM87XX_PHY=m -CONFIG_BCM_NET_PHYLIB=m -CONFIG_BROADCOM_PHY=m 
-CONFIG_CICADA_PHY=m
-CONFIG_CORTINA_PHY=m
-CONFIG_DAVICOM_PHY=m
-CONFIG_DP83848_PHY=m
-CONFIG_DP83867_PHY=m
-CONFIG_FIXED_PHY=m
-CONFIG_ICPLUS_PHY=m
-CONFIG_INTEL_XWAY_PHY=m
-CONFIG_LSI_ET1011C_PHY=m
-CONFIG_LXT_PHY=m
-CONFIG_MARVELL_PHY=m
-CONFIG_MARVELL_10G_PHY=m
-CONFIG_MICREL_PHY=m
-CONFIG_MICROCHIP_PHY=m
-CONFIG_MICROSEMI_PHY=m
-CONFIG_NATIONAL_PHY=m
-CONFIG_QSEMI_PHY=m
-CONFIG_REALTEK_PHY=m
-CONFIG_ROCKCHIP_PHY=m
-CONFIG_SMSC_PHY=m
-CONFIG_STE10XP=m
-CONFIG_TERANETICS_PHY=m
-CONFIG_VITESSE_PHY=m
-CONFIG_XILINX_GMII2RGMII=m
-CONFIG_MICREL_KS8995MA=m
-CONFIG_PLIP=m
-CONFIG_PPP=m
-CONFIG_PPP_BSDCOMP=m
-CONFIG_PPP_DEFLATE=m
-CONFIG_PPP_FILTER=y
-CONFIG_PPP_MPPE=m
-CONFIG_PPP_MULTILINK=y
-CONFIG_PPPOATM=m
-CONFIG_PPPOE=m
-CONFIG_PPTP=m
-CONFIG_PPPOL2TP=m
-CONFIG_PPP_ASYNC=m
-CONFIG_PPP_SYNC_TTY=m
-CONFIG_SLIP=m
-CONFIG_SLHC=m
-CONFIG_SLIP_COMPRESSED=y
-CONFIG_SLIP_SMART=y
-CONFIG_SLIP_MODE_SLIP6=y
-
-#
-# Host-side USB support is needed for USB Network Adapter support
-#
-CONFIG_USB_NET_DRIVERS=m
-CONFIG_USB_CATC=m
-CONFIG_USB_KAWETH=m
-CONFIG_USB_PEGASUS=m
-CONFIG_USB_RTL8150=m
-CONFIG_USB_RTL8152=m
-CONFIG_USB_LAN78XX=m
-CONFIG_USB_USBNET=m
-CONFIG_USB_NET_AX8817X=m
-CONFIG_USB_NET_AX88179_178A=m
-CONFIG_USB_NET_CDCETHER=m
-CONFIG_USB_NET_CDC_EEM=m
-CONFIG_USB_NET_CDC_NCM=m
-CONFIG_USB_NET_HUAWEI_CDC_NCM=m
-CONFIG_USB_NET_CDC_MBIM=m
-CONFIG_USB_NET_DM9601=m
-CONFIG_USB_NET_SR9700=m
-CONFIG_USB_NET_SR9800=m
-CONFIG_USB_NET_SMSC75XX=m
-CONFIG_USB_NET_SMSC95XX=m
-CONFIG_USB_NET_GL620A=m
-CONFIG_USB_NET_NET1080=m
-CONFIG_USB_NET_PLUSB=m
-CONFIG_USB_NET_MCS7830=m
-CONFIG_USB_NET_RNDIS_HOST=m
-CONFIG_USB_NET_CDC_SUBSET_ENABLE=m
-CONFIG_USB_NET_CDC_SUBSET=m
-CONFIG_USB_ALI_M5632=y
-CONFIG_USB_AN2720=y
-CONFIG_USB_BELKIN=y
-CONFIG_USB_ARMLINUX=y
-CONFIG_USB_EPSON2888=y
-CONFIG_USB_KC2190=y
-CONFIG_USB_NET_ZAURUS=m
-CONFIG_USB_NET_CX82310_ETH=m
-CONFIG_USB_NET_KALMIA=m
-CONFIG_USB_NET_QMI_WWAN=m
-CONFIG_USB_HSO=m
-CONFIG_USB_NET_INT51X1=m
-CONFIG_USB_CDC_PHONET=m
-CONFIG_USB_IPHETH=m
-CONFIG_USB_SIERRA_NET=m
-CONFIG_USB_VL600=m
-CONFIG_USB_NET_CH9200=m
-CONFIG_WLAN=y
-CONFIG_WLAN_VENDOR_ADMTEK=y
-CONFIG_ADM8211=m
-CONFIG_ATH_COMMON=m
-CONFIG_WLAN_VENDOR_ATH=y
-# CONFIG_ATH_DEBUG is not set
-CONFIG_ATH5K=m
-# CONFIG_ATH5K_DEBUG is not set
-# CONFIG_ATH5K_TRACER is not set
-CONFIG_ATH5K_PCI=y
-CONFIG_ATH9K_HW=m
-CONFIG_ATH9K_COMMON=m
-CONFIG_ATH9K_BTCOEX_SUPPORT=y
-CONFIG_ATH9K=m
-CONFIG_ATH9K_PCI=y
-CONFIG_ATH9K_AHB=y
-# CONFIG_ATH9K_DEBUGFS is not set
-CONFIG_ATH9K_DYNACK=y
-CONFIG_ATH9K_WOW=y
-CONFIG_ATH9K_RFKILL=y
-CONFIG_ATH9K_CHANNEL_CONTEXT=y
-CONFIG_ATH9K_PCOEM=y
-CONFIG_ATH9K_HTC=m
-# CONFIG_ATH9K_HTC_DEBUGFS is not set
-CONFIG_ATH9K_HWRNG=y
-CONFIG_CARL9170=m
-CONFIG_CARL9170_LEDS=y
-CONFIG_CARL9170_WPC=y
-# CONFIG_CARL9170_HWRNG is not set
-CONFIG_ATH6KL=m
-CONFIG_ATH6KL_SDIO=m
-CONFIG_ATH6KL_USB=m
-# CONFIG_ATH6KL_DEBUG is not set
-# CONFIG_ATH6KL_TRACING is not set
-CONFIG_AR5523=m
-CONFIG_WIL6210=m
-CONFIG_WIL6210_ISR_COR=y
-CONFIG_WIL6210_TRACING=y
-CONFIG_WIL6210_DEBUGFS=y
-CONFIG_ATH10K=m
-CONFIG_ATH10K_PCI=m
-CONFIG_ATH10K_SDIO=m
-CONFIG_ATH10K_USB=m
-# CONFIG_ATH10K_DEBUG is not set
-# CONFIG_ATH10K_DEBUGFS is not set
-# CONFIG_ATH10K_TRACING is not set
-CONFIG_WCN36XX=m
-# CONFIG_WCN36XX_DEBUGFS is not set
-CONFIG_WLAN_VENDOR_ATMEL=y
-CONFIG_ATMEL=m
-CONFIG_PCI_ATMEL=m
-CONFIG_PCMCIA_ATMEL=m
-CONFIG_AT76C50X_USB=m
-CONFIG_WLAN_VENDOR_BROADCOM=y
-CONFIG_B43=m
-CONFIG_B43_BCMA=y
-CONFIG_B43_SSB=y
-CONFIG_B43_BUSES_BCMA_AND_SSB=y
-# CONFIG_B43_BUSES_BCMA is not set
-# CONFIG_B43_BUSES_SSB is not set
-CONFIG_B43_PCI_AUTOSELECT=y
-CONFIG_B43_PCICORE_AUTOSELECT=y
-CONFIG_B43_SDIO=y
-CONFIG_B43_BCMA_PIO=y
-CONFIG_B43_PIO=y
-CONFIG_B43_PHY_G=y
-CONFIG_B43_PHY_N=y
-CONFIG_B43_PHY_LP=y
-CONFIG_B43_PHY_HT=y
-CONFIG_B43_LEDS=y
-CONFIG_B43_HWRNG=y
-# CONFIG_B43_DEBUG is not set
-CONFIG_B43LEGACY=m
-CONFIG_B43LEGACY_PCI_AUTOSELECT=y
-CONFIG_B43LEGACY_PCICORE_AUTOSELECT=y
-CONFIG_B43LEGACY_LEDS=y
-CONFIG_B43LEGACY_HWRNG=y
-# CONFIG_B43LEGACY_DEBUG is not set
-CONFIG_B43LEGACY_DMA=y
-CONFIG_B43LEGACY_PIO=y
-CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y
-# CONFIG_B43LEGACY_DMA_MODE is not set
-# CONFIG_B43LEGACY_PIO_MODE is not set
-CONFIG_BRCMUTIL=m
-CONFIG_BRCMSMAC=m
-CONFIG_BRCMFMAC=m
-CONFIG_BRCMFMAC_PROTO_BCDC=y
-CONFIG_BRCMFMAC_PROTO_MSGBUF=y
-CONFIG_BRCMFMAC_SDIO=y
-CONFIG_BRCMFMAC_USB=y
-CONFIG_BRCMFMAC_PCIE=y
-CONFIG_BRCM_TRACING=y
-# CONFIG_BRCMDBG is not set
-CONFIG_WLAN_VENDOR_CISCO=y
-CONFIG_AIRO=m
-CONFIG_AIRO_CS=m
-CONFIG_WLAN_VENDOR_INTEL=y
-CONFIG_IPW2100=m
-CONFIG_IPW2100_MONITOR=y
-# CONFIG_IPW2100_DEBUG is not set
-CONFIG_IPW2200=m
-CONFIG_IPW2200_MONITOR=y
-CONFIG_IPW2200_RADIOTAP=y
-CONFIG_IPW2200_PROMISCUOUS=y
-CONFIG_IPW2200_QOS=y
-# CONFIG_IPW2200_DEBUG is not set
-CONFIG_LIBIPW=m
-# CONFIG_LIBIPW_DEBUG is not set
-CONFIG_IWLEGACY=m
-CONFIG_IWL4965=m
-CONFIG_IWL3945=m
-
-#
-# iwl3945 / iwl4965 Debugging Options
-#
-# CONFIG_IWLEGACY_DEBUG is not set
-CONFIG_IWLWIFI=m
-CONFIG_IWLWIFI_LEDS=y
-CONFIG_IWLDVM=m
-CONFIG_IWLMVM=m
-CONFIG_IWLWIFI_OPMODE_MODULAR=y
-CONFIG_IWLWIFI_BCAST_FILTERING=y
-
-#
-# Debugging Options
-#
-# CONFIG_IWLWIFI_DEBUG is not set
-CONFIG_IWLWIFI_DEVICE_TRACING=y
-CONFIG_WLAN_VENDOR_INTERSIL=y
-CONFIG_HOSTAP=m
-CONFIG_HOSTAP_FIRMWARE=y
-CONFIG_HOSTAP_FIRMWARE_NVRAM=y
-CONFIG_HOSTAP_PLX=m
-CONFIG_HOSTAP_PCI=m
-CONFIG_HOSTAP_CS=m
-CONFIG_HERMES=m
-CONFIG_HERMES_PRISM=y
-CONFIG_HERMES_CACHE_FW_ON_INIT=y
-CONFIG_PLX_HERMES=m
-CONFIG_TMD_HERMES=m
-CONFIG_NORTEL_HERMES=m
-CONFIG_PCI_HERMES=m
-CONFIG_PCMCIA_HERMES=m
-CONFIG_PCMCIA_SPECTRUM=m
-CONFIG_ORINOCO_USB=m
-CONFIG_P54_COMMON=m
-CONFIG_P54_USB=m
-CONFIG_P54_PCI=m
-CONFIG_P54_SPI=m
-CONFIG_P54_SPI_DEFAULT_EEPROM=y
-CONFIG_P54_LEDS=y
-CONFIG_PRISM54=m
-CONFIG_WLAN_VENDOR_MARVELL=y
-CONFIG_LIBERTAS=m
-CONFIG_LIBERTAS_USB=m
-CONFIG_LIBERTAS_CS=m
-CONFIG_LIBERTAS_SDIO=m
-CONFIG_LIBERTAS_SPI=m
-# CONFIG_LIBERTAS_DEBUG is not set
-CONFIG_LIBERTAS_MESH=y
-CONFIG_LIBERTAS_THINFIRM=m
-# CONFIG_LIBERTAS_THINFIRM_DEBUG is not set
-CONFIG_LIBERTAS_THINFIRM_USB=m
-CONFIG_MWIFIEX=m
-CONFIG_MWIFIEX_SDIO=m
-CONFIG_MWIFIEX_PCIE=m
-CONFIG_MWIFIEX_USB=m
-CONFIG_MWL8K=m
-CONFIG_WLAN_VENDOR_MEDIATEK=y
-CONFIG_MT7601U=m
-CONFIG_WLAN_VENDOR_RALINK=y
-CONFIG_RT2X00=m
-CONFIG_RT2400PCI=m
-CONFIG_RT2500PCI=m
-CONFIG_RT61PCI=m
-CONFIG_RT2800PCI=m
-CONFIG_RT2800PCI_RT33XX=y
-CONFIG_RT2800PCI_RT35XX=y
-CONFIG_RT2800PCI_RT53XX=y
-CONFIG_RT2800PCI_RT3290=y
-CONFIG_RT2500USB=m
-CONFIG_RT73USB=m
-CONFIG_RT2800USB=m
-CONFIG_RT2800USB_RT33XX=y
-CONFIG_RT2800USB_RT35XX=y
-CONFIG_RT2800USB_RT3573=y
-CONFIG_RT2800USB_RT53XX=y
-CONFIG_RT2800USB_RT55XX=y
-CONFIG_RT2800USB_UNKNOWN=y
-CONFIG_RT2800_LIB=m
-CONFIG_RT2800_LIB_MMIO=m
-CONFIG_RT2X00_LIB_MMIO=m
-CONFIG_RT2X00_LIB_PCI=m
-CONFIG_RT2X00_LIB_USB=m
-CONFIG_RT2X00_LIB=m
-CONFIG_RT2X00_LIB_FIRMWARE=y
-CONFIG_RT2X00_LIB_CRYPTO=y
-CONFIG_RT2X00_LIB_LEDS=y
-# CONFIG_RT2X00_DEBUG is not set
-CONFIG_WLAN_VENDOR_REALTEK=y
-CONFIG_RTL8180=m
-CONFIG_RTL8187=m
-CONFIG_RTL8187_LEDS=y
-CONFIG_RTL_CARDS=m
-CONFIG_RTL8192CE=m
-CONFIG_RTL8192SE=m
-CONFIG_RTL8192DE=m
-CONFIG_RTL8723AE=m
-CONFIG_RTL8723BE=m
-CONFIG_RTL8188EE=m
-CONFIG_RTL8192EE=m
-CONFIG_RTL8821AE=m
-CONFIG_RTL8192CU=m
-CONFIG_RTLWIFI=m
-CONFIG_RTLWIFI_PCI=m
-CONFIG_RTLWIFI_USB=m
-# CONFIG_RTLWIFI_DEBUG is not set
-CONFIG_RTL8192C_COMMON=m
-CONFIG_RTL8723_COMMON=m
-CONFIG_RTLBTCOEXIST=m
-CONFIG_RTL8XXXU=m
-CONFIG_RTL8XXXU_UNTESTED=y
-CONFIG_WLAN_VENDOR_RSI=y
-CONFIG_RSI_91X=m
-# CONFIG_RSI_DEBUGFS is not set
-CONFIG_RSI_SDIO=m
-CONFIG_RSI_USB=m
-CONFIG_WLAN_VENDOR_ST=y
-CONFIG_CW1200=m
-CONFIG_CW1200_WLAN_SDIO=m
-CONFIG_CW1200_WLAN_SPI=m
-CONFIG_WLAN_VENDOR_TI=y
-CONFIG_WL1251=m
-CONFIG_WL1251_SPI=m
-CONFIG_WL1251_SDIO=m
-CONFIG_WL12XX=m
-CONFIG_WL18XX=m
-CONFIG_WLCORE=m
-CONFIG_WLCORE_SDIO=m
-CONFIG_WILINK_PLATFORM_DATA=y
-CONFIG_WLAN_VENDOR_ZYDAS=y
-CONFIG_USB_ZD1201=m
-CONFIG_ZD1211RW=m
-# CONFIG_ZD1211RW_DEBUG is not set
-CONFIG_WLAN_VENDOR_QUANTENNA=y
-CONFIG_QTNFMAC=m
-CONFIG_QTNFMAC_PEARL_PCIE=m
-CONFIG_PCMCIA_RAYCS=m
-CONFIG_PCMCIA_WL3501=m
-# CONFIG_MAC80211_HWSIM is not set
-CONFIG_USB_NET_RNDIS_WLAN=m
-
-#
-# WiMAX Wireless Broadband devices
-#
-CONFIG_WIMAX_I2400M=m
-CONFIG_WIMAX_I2400M_USB=m
-CONFIG_WIMAX_I2400M_DEBUG_LEVEL=8
-CONFIG_WAN=y
-CONFIG_LANMEDIA=m
-CONFIG_HDLC=m
-CONFIG_HDLC_RAW=m
-CONFIG_HDLC_RAW_ETH=m
-CONFIG_HDLC_CISCO=m
-CONFIG_HDLC_FR=m
-CONFIG_HDLC_PPP=m
-CONFIG_HDLC_X25=m
-CONFIG_PCI200SYN=m
-CONFIG_WANXL=m
-CONFIG_PC300TOO=m
-CONFIG_FARSYNC=m
-CONFIG_DSCC4=m
-CONFIG_DSCC4_PCISYNC=y
-CONFIG_DSCC4_PCI_RST=y
-CONFIG_DLCI=m
-CONFIG_DLCI_MAX=8
-CONFIG_LAPBETHER=m
-CONFIG_X25_ASY=m
-CONFIG_SBNI=m
-CONFIG_SBNI_MULTILINE=y
-CONFIG_IEEE802154_DRIVERS=m
-CONFIG_IEEE802154_FAKELB=m
-CONFIG_IEEE802154_AT86RF230=m
-# CONFIG_IEEE802154_AT86RF230_DEBUGFS is not set
-CONFIG_IEEE802154_MRF24J40=m
-CONFIG_IEEE802154_CC2520=m
-CONFIG_IEEE802154_ATUSB=m
-CONFIG_IEEE802154_ADF7242=m
-CONFIG_IEEE802154_CA8210=m
-# CONFIG_IEEE802154_CA8210_DEBUGFS is not set
-CONFIG_VMXNET3=m
-CONFIG_FUJITSU_ES=m
-CONFIG_HYPERV_NET=m
-CONFIG_ISDN=y
-CONFIG_ISDN_I4L=m
-CONFIG_ISDN_PPP=y
-CONFIG_ISDN_PPP_VJ=y
-CONFIG_ISDN_MPP=y
-CONFIG_IPPP_FILTER=y
-CONFIG_ISDN_PPP_BSDCOMP=m
-CONFIG_ISDN_AUDIO=y
-CONFIG_ISDN_TTY_FAX=y
-CONFIG_ISDN_X25=y
-
-#
-# ISDN feature submodules
-#
-CONFIG_ISDN_DIVERSION=m
-
-#
-# ISDN4Linux hardware drivers
-#
-
-#
-# Passive cards
-#
-CONFIG_ISDN_DRV_HISAX=m
-
-#
-# D-channel protocol features
-#
-CONFIG_HISAX_EURO=y
-CONFIG_DE_AOC=y
-# CONFIG_HISAX_NO_SENDCOMPLETE is not set
-# CONFIG_HISAX_NO_LLC is not set
-# CONFIG_HISAX_NO_KEYPAD is not set
-CONFIG_HISAX_1TR6=y
-CONFIG_HISAX_NI1=y
-CONFIG_HISAX_MAX_CARDS=8
-
-#
-# HiSax supported cards
-#
-CONFIG_HISAX_16_3=y
-CONFIG_HISAX_TELESPCI=y
-CONFIG_HISAX_S0BOX=y
-CONFIG_HISAX_FRITZPCI=y
-CONFIG_HISAX_AVM_A1_PCMCIA=y
-CONFIG_HISAX_ELSA=y
-CONFIG_HISAX_DIEHLDIVA=y
-CONFIG_HISAX_SEDLBAUER=y
-CONFIG_HISAX_NETJET=y
-CONFIG_HISAX_NETJET_U=y
-CONFIG_HISAX_NICCY=y
-CONFIG_HISAX_BKM_A4T=y
-CONFIG_HISAX_SCT_QUADRO=y
-CONFIG_HISAX_GAZEL=y
-CONFIG_HISAX_HFC_PCI=y
-CONFIG_HISAX_W6692=y
-CONFIG_HISAX_HFC_SX=y
-CONFIG_HISAX_ENTERNOW_PCI=y
-# CONFIG_HISAX_DEBUG is not set
-
-#
-# HiSax PCMCIA card service modules
-#
-CONFIG_HISAX_SEDLBAUER_CS=m
-CONFIG_HISAX_ELSA_CS=m
-CONFIG_HISAX_AVM_A1_CS=m
-CONFIG_HISAX_TELES_CS=m
-
-#
-# HiSax sub driver modules
-#
-CONFIG_HISAX_ST5481=m
-CONFIG_HISAX_HFCUSB=m
-CONFIG_HISAX_HFC4S8S=m
-CONFIG_HISAX_FRITZ_PCIPNP=m
-CONFIG_ISDN_CAPI=m
-CONFIG_CAPI_TRACE=y
-CONFIG_ISDN_CAPI_CAPI20=m
-CONFIG_ISDN_CAPI_MIDDLEWARE=y
-CONFIG_ISDN_CAPI_CAPIDRV=m
-# CONFIG_ISDN_CAPI_CAPIDRV_VERBOSE is not set
-
-#
-# CAPI hardware drivers
-#
-CONFIG_CAPI_AVM=y
-CONFIG_ISDN_DRV_AVMB1_B1PCI=m
-CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y
-CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m
-CONFIG_ISDN_DRV_AVMB1_AVM_CS=m
-CONFIG_ISDN_DRV_AVMB1_T1PCI=m
-CONFIG_ISDN_DRV_AVMB1_C4=m
-CONFIG_CAPI_EICON=y
-CONFIG_ISDN_DIVAS=m
-CONFIG_ISDN_DIVAS_BRIPCI=y
-CONFIG_ISDN_DIVAS_PRIPCI=y
-CONFIG_ISDN_DIVAS_DIVACAPI=m
-CONFIG_ISDN_DIVAS_USERIDI=m
-CONFIG_ISDN_DIVAS_MAINT=m
-CONFIG_ISDN_DRV_GIGASET=m
-CONFIG_GIGASET_CAPI=y
-# CONFIG_GIGASET_I4L is not set
-# CONFIG_GIGASET_DUMMYLL is not set
-CONFIG_GIGASET_BASE=m
-CONFIG_GIGASET_M105=m
-CONFIG_GIGASET_M101=m
-# CONFIG_GIGASET_DEBUG is not set
-CONFIG_HYSDN=m
-CONFIG_HYSDN_CAPI=y
-CONFIG_MISDN=m
-CONFIG_MISDN_DSP=m
-CONFIG_MISDN_L1OIP=m
-
-#
-# mISDN hardware drivers
-#
-CONFIG_MISDN_HFCPCI=m
-CONFIG_MISDN_HFCMULTI=m
-CONFIG_MISDN_HFCUSB=m
-CONFIG_MISDN_AVMFRITZ=m
-CONFIG_MISDN_SPEEDFAX=m
-CONFIG_MISDN_INFINEON=m
-CONFIG_MISDN_W6692=m
-CONFIG_MISDN_NETJET=m
-CONFIG_MISDN_IPAC=m
-CONFIG_MISDN_ISAR=m
-CONFIG_ISDN_HDLC=m
-CONFIG_NVM=y
-# CONFIG_NVM_DEBUG is not set
-CONFIG_NVM_RRPC=m
-CONFIG_NVM_PBLK=m
-
-#
-# Input device support
-#
-CONFIG_INPUT=y
-CONFIG_INPUT_LEDS=y
-CONFIG_INPUT_FF_MEMLESS=m
-CONFIG_INPUT_POLLDEV=m
-CONFIG_INPUT_SPARSEKMAP=m
-CONFIG_INPUT_MATRIXKMAP=m
-
-#
-# Userland interfaces
-#
-CONFIG_INPUT_MOUSEDEV=y
-CONFIG_INPUT_MOUSEDEV_PSAUX=y
-CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
-CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
-CONFIG_INPUT_JOYDEV=m
-CONFIG_INPUT_EVDEV=m
-# CONFIG_INPUT_EVBUG is not set
-
-#
-# Input Device Drivers
-#
-CONFIG_INPUT_KEYBOARD=y
-CONFIG_KEYBOARD_ADC=m
-CONFIG_KEYBOARD_ADP5588=m
-CONFIG_KEYBOARD_ADP5589=m
-CONFIG_KEYBOARD_ATKBD=y
-CONFIG_KEYBOARD_QT1070=m
-CONFIG_KEYBOARD_QT2160=m
-CONFIG_KEYBOARD_DLINK_DIR685=m
-CONFIG_KEYBOARD_LKKBD=m
-CONFIG_KEYBOARD_GPIO=m
-CONFIG_KEYBOARD_GPIO_POLLED=m
-CONFIG_KEYBOARD_TCA6416=m
-CONFIG_KEYBOARD_TCA8418=m
-CONFIG_KEYBOARD_MATRIX=m
-CONFIG_KEYBOARD_LM8323=m
-CONFIG_KEYBOARD_LM8333=m
-CONFIG_KEYBOARD_MAX7359=m
-CONFIG_KEYBOARD_MCS=m
-CONFIG_KEYBOARD_MPR121=m
-CONFIG_KEYBOARD_NEWTON=m
-CONFIG_KEYBOARD_OPENCORES=m
-CONFIG_KEYBOARD_SAMSUNG=m
-CONFIG_KEYBOARD_STOWAWAY=m
-CONFIG_KEYBOARD_SUNKBD=m
-CONFIG_KEYBOARD_TM2_TOUCHKEY=m
-CONFIG_KEYBOARD_XTKBD=m
-CONFIG_KEYBOARD_CROS_EC=m
-CONFIG_INPUT_MOUSE=y
-CONFIG_MOUSE_PS2=m
-CONFIG_MOUSE_PS2_ALPS=y
-CONFIG_MOUSE_PS2_BYD=y
-CONFIG_MOUSE_PS2_LOGIPS2PP=y
-CONFIG_MOUSE_PS2_SYNAPTICS=y
-CONFIG_MOUSE_PS2_SYNAPTICS_SMBUS=y
-CONFIG_MOUSE_PS2_CYPRESS=y
-CONFIG_MOUSE_PS2_LIFEBOOK=y
-CONFIG_MOUSE_PS2_TRACKPOINT=y
-CONFIG_MOUSE_PS2_ELANTECH=y
-CONFIG_MOUSE_PS2_SENTELIC=y
-CONFIG_MOUSE_PS2_TOUCHKIT=y
-CONFIG_MOUSE_PS2_FOCALTECH=y
-# CONFIG_MOUSE_PS2_VMMOUSE is not set
-CONFIG_MOUSE_PS2_SMBUS=y
-CONFIG_MOUSE_SERIAL=m
-CONFIG_MOUSE_APPLETOUCH=m
-CONFIG_MOUSE_BCM5974=m
-CONFIG_MOUSE_CYAPA=m
-CONFIG_MOUSE_ELAN_I2C=m
-CONFIG_MOUSE_ELAN_I2C_I2C=y
-CONFIG_MOUSE_ELAN_I2C_SMBUS=y
-CONFIG_MOUSE_VSXXXAA=m
-CONFIG_MOUSE_GPIO=m
-CONFIG_MOUSE_SYNAPTICS_I2C=m
-CONFIG_MOUSE_SYNAPTICS_USB=m
-CONFIG_INPUT_JOYSTICK=y
-CONFIG_JOYSTICK_ANALOG=m
-CONFIG_JOYSTICK_A3D=m
-CONFIG_JOYSTICK_ADI=m
-CONFIG_JOYSTICK_COBRA=m
-CONFIG_JOYSTICK_GF2K=m
-CONFIG_JOYSTICK_GRIP=m
-CONFIG_JOYSTICK_GRIP_MP=m
-CONFIG_JOYSTICK_GUILLEMOT=m
-CONFIG_JOYSTICK_INTERACT=m
-CONFIG_JOYSTICK_SIDEWINDER=m
-CONFIG_JOYSTICK_TMDC=m
-CONFIG_JOYSTICK_IFORCE=m
-CONFIG_JOYSTICK_IFORCE_USB=y
-CONFIG_JOYSTICK_IFORCE_232=y
-CONFIG_JOYSTICK_WARRIOR=m
-CONFIG_JOYSTICK_MAGELLAN=m
-CONFIG_JOYSTICK_SPACEORB=m
-CONFIG_JOYSTICK_SPACEBALL=m
-CONFIG_JOYSTICK_STINGER=m
-CONFIG_JOYSTICK_TWIDJOY=m -CONFIG_JOYSTICK_ZHENHUA=m -CONFIG_JOYSTICK_DB9=m -CONFIG_JOYSTICK_GAMECON=m -CONFIG_JOYSTICK_TURBOGRAFX=m -CONFIG_JOYSTICK_AS5011=m -# CONFIG_JOYSTICK_JOYDUMP is not set -CONFIG_JOYSTICK_XPAD=m -CONFIG_JOYSTICK_XPAD_FF=y -CONFIG_JOYSTICK_XPAD_LEDS=y -CONFIG_JOYSTICK_WALKERA0701=m -CONFIG_JOYSTICK_PSXPAD_SPI=m -CONFIG_JOYSTICK_PSXPAD_SPI_FF=y -CONFIG_INPUT_TABLET=y -CONFIG_TABLET_USB_ACECAD=m -CONFIG_TABLET_USB_AIPTEK=m -CONFIG_TABLET_USB_GTCO=m -CONFIG_TABLET_USB_HANWANG=m -CONFIG_TABLET_USB_KBTAB=m -CONFIG_TABLET_USB_PEGASUS=m -CONFIG_TABLET_SERIAL_WACOM4=m -CONFIG_INPUT_TOUCHSCREEN=y -CONFIG_TOUCHSCREEN_PROPERTIES=y -CONFIG_TOUCHSCREEN_ADS7846=m -CONFIG_TOUCHSCREEN_AD7877=m -CONFIG_TOUCHSCREEN_AD7879=m -CONFIG_TOUCHSCREEN_AD7879_I2C=m -CONFIG_TOUCHSCREEN_AD7879_SPI=m -CONFIG_TOUCHSCREEN_ATMEL_MXT=m -# CONFIG_TOUCHSCREEN_ATMEL_MXT_T37 is not set -CONFIG_TOUCHSCREEN_AUO_PIXCIR=m -CONFIG_TOUCHSCREEN_BU21013=m -CONFIG_TOUCHSCREEN_CY8CTMG110=m -CONFIG_TOUCHSCREEN_CYTTSP_CORE=m -CONFIG_TOUCHSCREEN_CYTTSP_I2C=m -CONFIG_TOUCHSCREEN_CYTTSP_SPI=m -CONFIG_TOUCHSCREEN_CYTTSP4_CORE=m -CONFIG_TOUCHSCREEN_CYTTSP4_I2C=m -CONFIG_TOUCHSCREEN_CYTTSP4_SPI=m -CONFIG_TOUCHSCREEN_DA9052=m -CONFIG_TOUCHSCREEN_DYNAPRO=m -CONFIG_TOUCHSCREEN_HAMPSHIRE=m -CONFIG_TOUCHSCREEN_EETI=m -CONFIG_TOUCHSCREEN_EGALAX_SERIAL=m -CONFIG_TOUCHSCREEN_FUJITSU=m -CONFIG_TOUCHSCREEN_GOODIX=m -CONFIG_TOUCHSCREEN_ILI210X=m -CONFIG_TOUCHSCREEN_GUNZE=m -CONFIG_TOUCHSCREEN_EKTF2127=m -CONFIG_TOUCHSCREEN_ELAN=m -CONFIG_TOUCHSCREEN_ELO=m -CONFIG_TOUCHSCREEN_WACOM_W8001=m -CONFIG_TOUCHSCREEN_WACOM_I2C=m -CONFIG_TOUCHSCREEN_MAX11801=m -CONFIG_TOUCHSCREEN_MCS5000=m -CONFIG_TOUCHSCREEN_MMS114=m -CONFIG_TOUCHSCREEN_MELFAS_MIP4=m -CONFIG_TOUCHSCREEN_MTOUCH=m -CONFIG_TOUCHSCREEN_INEXIO=m -CONFIG_TOUCHSCREEN_MK712=m -CONFIG_TOUCHSCREEN_PENMOUNT=m -CONFIG_TOUCHSCREEN_EDT_FT5X06=m -CONFIG_TOUCHSCREEN_TOUCHRIGHT=m -CONFIG_TOUCHSCREEN_TOUCHWIN=m -CONFIG_TOUCHSCREEN_TI_AM335X_TSC=m -CONFIG_TOUCHSCREEN_UCB1400=m -CONFIG_TOUCHSCREEN_PIXCIR=m -CONFIG_TOUCHSCREEN_WDT87XX_I2C=m -CONFIG_TOUCHSCREEN_WM831X=m -CONFIG_TOUCHSCREEN_WM97XX=m -CONFIG_TOUCHSCREEN_WM9705=y -CONFIG_TOUCHSCREEN_WM9712=y -CONFIG_TOUCHSCREEN_WM9713=y -CONFIG_TOUCHSCREEN_USB_COMPOSITE=m -CONFIG_TOUCHSCREEN_MC13783=m -CONFIG_TOUCHSCREEN_USB_EGALAX=y -CONFIG_TOUCHSCREEN_USB_PANJIT=y -CONFIG_TOUCHSCREEN_USB_3M=y -CONFIG_TOUCHSCREEN_USB_ITM=y -CONFIG_TOUCHSCREEN_USB_ETURBO=y -CONFIG_TOUCHSCREEN_USB_GUNZE=y -CONFIG_TOUCHSCREEN_USB_DMC_TSC10=y -CONFIG_TOUCHSCREEN_USB_IRTOUCH=y -CONFIG_TOUCHSCREEN_USB_IDEALTEK=y -CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH=y -CONFIG_TOUCHSCREEN_USB_GOTOP=y -CONFIG_TOUCHSCREEN_USB_JASTEC=y -CONFIG_TOUCHSCREEN_USB_ELO=y -CONFIG_TOUCHSCREEN_USB_E2I=y -CONFIG_TOUCHSCREEN_USB_ZYTRONIC=y -CONFIG_TOUCHSCREEN_USB_ETT_TC45USB=y -CONFIG_TOUCHSCREEN_USB_NEXIO=y -CONFIG_TOUCHSCREEN_USB_EASYTOUCH=y -CONFIG_TOUCHSCREEN_TOUCHIT213=m -CONFIG_TOUCHSCREEN_TSC_SERIO=m -CONFIG_TOUCHSCREEN_TSC200X_CORE=m -CONFIG_TOUCHSCREEN_TSC2004=m -CONFIG_TOUCHSCREEN_TSC2005=m -CONFIG_TOUCHSCREEN_TSC2007=m -CONFIG_TOUCHSCREEN_TSC2007_IIO=y -CONFIG_TOUCHSCREEN_PCAP=m -CONFIG_TOUCHSCREEN_RM_TS=m -CONFIG_TOUCHSCREEN_SILEAD=m -CONFIG_TOUCHSCREEN_SIS_I2C=m -CONFIG_TOUCHSCREEN_ST1232=m -CONFIG_TOUCHSCREEN_STMFTS=m -CONFIG_TOUCHSCREEN_SUR40=m -CONFIG_TOUCHSCREEN_SURFACE3_SPI=m -CONFIG_TOUCHSCREEN_SX8654=m -CONFIG_TOUCHSCREEN_TPS6507X=m -CONFIG_TOUCHSCREEN_ZET6223=m -CONFIG_TOUCHSCREEN_ZFORCE=m -CONFIG_TOUCHSCREEN_ROHM_BU21023=m -CONFIG_INPUT_MISC=y -CONFIG_INPUT_88PM80X_ONKEY=m 
-CONFIG_INPUT_AD714X=m -CONFIG_INPUT_AD714X_I2C=m -CONFIG_INPUT_AD714X_SPI=m -CONFIG_INPUT_ARIZONA_HAPTICS=m -CONFIG_INPUT_BMA150=m -CONFIG_INPUT_E3X0_BUTTON=m -CONFIG_INPUT_PCSPKR=m -CONFIG_INPUT_MAX77693_HAPTIC=m -CONFIG_INPUT_MC13783_PWRBUTTON=m -CONFIG_INPUT_MMA8450=m -CONFIG_INPUT_APANEL=m -CONFIG_INPUT_GP2A=m -CONFIG_INPUT_GPIO_BEEPER=m -CONFIG_INPUT_GPIO_TILT_POLLED=m -CONFIG_INPUT_GPIO_DECODER=m -CONFIG_INPUT_ATLAS_BTNS=m -CONFIG_INPUT_ATI_REMOTE2=m -CONFIG_INPUT_KEYSPAN_REMOTE=m -CONFIG_INPUT_KXTJ9=m -CONFIG_INPUT_KXTJ9_POLLED_MODE=y -CONFIG_INPUT_POWERMATE=m -CONFIG_INPUT_YEALINK=m -CONFIG_INPUT_CM109=m -CONFIG_INPUT_REGULATOR_HAPTIC=m -CONFIG_INPUT_RETU_PWRBUTTON=m -CONFIG_INPUT_TPS65218_PWRBUTTON=m -CONFIG_INPUT_AXP20X_PEK=m -CONFIG_INPUT_UINPUT=m -CONFIG_INPUT_PCF50633_PMU=m -CONFIG_INPUT_PCF8574=m -CONFIG_INPUT_PWM_BEEPER=m -CONFIG_INPUT_PWM_VIBRA=m -CONFIG_INPUT_GPIO_ROTARY_ENCODER=m -CONFIG_INPUT_DA9052_ONKEY=m -CONFIG_INPUT_DA9063_ONKEY=m -CONFIG_INPUT_WM831X_ON=m -CONFIG_INPUT_PCAP=m -CONFIG_INPUT_ADXL34X=m -CONFIG_INPUT_ADXL34X_I2C=m -CONFIG_INPUT_ADXL34X_SPI=m -CONFIG_INPUT_IMS_PCU=m -CONFIG_INPUT_CMA3000=m -CONFIG_INPUT_CMA3000_I2C=m -CONFIG_INPUT_IDEAPAD_SLIDEBAR=m -CONFIG_INPUT_SOC_BUTTON_ARRAY=m -CONFIG_INPUT_DRV260X_HAPTICS=m -CONFIG_INPUT_DRV2665_HAPTICS=m -CONFIG_INPUT_DRV2667_HAPTICS=m -CONFIG_RMI4_CORE=m -CONFIG_RMI4_I2C=m -CONFIG_RMI4_SPI=m -CONFIG_RMI4_SMB=m -CONFIG_RMI4_F03=y -CONFIG_RMI4_F03_SERIO=m -CONFIG_RMI4_2D_SENSOR=y -CONFIG_RMI4_F11=y -CONFIG_RMI4_F12=y -CONFIG_RMI4_F30=y -CONFIG_RMI4_F34=y -CONFIG_RMI4_F54=y -CONFIG_RMI4_F55=y - -# -# Hardware I/O ports -# -CONFIG_SERIO=y -CONFIG_ARCH_MIGHT_HAVE_PC_SERIO=y -CONFIG_SERIO_I8042=y -CONFIG_SERIO_SERPORT=m -CONFIG_SERIO_CT82C710=m -CONFIG_SERIO_PARKBD=m -CONFIG_SERIO_PCIPS2=m -CONFIG_SERIO_LIBPS2=y -CONFIG_SERIO_RAW=m -CONFIG_SERIO_ALTERA_PS2=m -CONFIG_SERIO_PS2MULT=m -CONFIG_SERIO_ARC_PS2=m -CONFIG_HYPERV_KEYBOARD=m -CONFIG_SERIO_GPIO_PS2=m -CONFIG_USERIO=m -CONFIG_GAMEPORT=m -CONFIG_GAMEPORT_NS558=m -CONFIG_GAMEPORT_L4=m -CONFIG_GAMEPORT_EMU10K1=m -CONFIG_GAMEPORT_FM801=m - -# -# Character devices -# -CONFIG_TTY=y -CONFIG_VT=y -CONFIG_CONSOLE_TRANSLATIONS=y -CONFIG_VT_CONSOLE=y -CONFIG_VT_CONSOLE_SLEEP=y -CONFIG_HW_CONSOLE=y -CONFIG_VT_HW_CONSOLE_BINDING=y -CONFIG_UNIX98_PTYS=y -CONFIG_LEGACY_PTYS=y -CONFIG_LEGACY_PTY_COUNT=256 -CONFIG_SERIAL_NONSTANDARD=y -CONFIG_ROCKETPORT=m -CONFIG_CYCLADES=m -CONFIG_CYZ_INTR=y -CONFIG_MOXA_INTELLIO=m -CONFIG_MOXA_SMARTIO=m -CONFIG_SYNCLINK=m -CONFIG_SYNCLINKMP=m -CONFIG_SYNCLINK_GT=m -CONFIG_NOZOMI=m -CONFIG_ISI=m -CONFIG_N_HDLC=m -CONFIG_N_GSM=m -CONFIG_TRACE_ROUTER=m -CONFIG_TRACE_SINK=m -CONFIG_DEVMEM=y -# CONFIG_DEVKMEM is not set - -# -# Serial drivers -# -CONFIG_SERIAL_EARLYCON=y -CONFIG_SERIAL_8250=y -# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set -CONFIG_SERIAL_8250_PNP=y -CONFIG_SERIAL_8250_FINTEK=y -CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_SERIAL_8250_DMA=y -CONFIG_SERIAL_8250_PCI=y -CONFIG_SERIAL_8250_EXAR=y -CONFIG_SERIAL_8250_CS=m -CONFIG_SERIAL_8250_MEN_MCB=m -CONFIG_SERIAL_8250_NR_UARTS=4 -CONFIG_SERIAL_8250_RUNTIME_UARTS=4 -CONFIG_SERIAL_8250_EXTENDED=y -CONFIG_SERIAL_8250_MANY_PORTS=y -CONFIG_SERIAL_8250_SHARE_IRQ=y -CONFIG_SERIAL_8250_DETECT_IRQ=y -CONFIG_SERIAL_8250_RSA=y -# CONFIG_SERIAL_8250_FSL is not set -CONFIG_SERIAL_8250_DW=m -CONFIG_SERIAL_8250_RT288X=y -CONFIG_SERIAL_8250_LPSS=y -CONFIG_SERIAL_8250_MID=y -CONFIG_SERIAL_8250_MOXA=m - -# -# Non-8250 serial port support -# -CONFIG_SERIAL_MAX3100=m -CONFIG_SERIAL_MAX310X=y 
-CONFIG_SERIAL_UARTLITE=m -CONFIG_SERIAL_UARTLITE_NR_UARTS=1 -CONFIG_SERIAL_CORE=y -CONFIG_SERIAL_CORE_CONSOLE=y -CONFIG_SERIAL_JSM=m -CONFIG_SERIAL_SCCNXP=m -CONFIG_SERIAL_SC16IS7XX_CORE=m -CONFIG_SERIAL_SC16IS7XX=m -CONFIG_SERIAL_SC16IS7XX_I2C=y -CONFIG_SERIAL_SC16IS7XX_SPI=y -CONFIG_SERIAL_ALTERA_JTAGUART=m -CONFIG_SERIAL_ALTERA_UART=m -CONFIG_SERIAL_ALTERA_UART_MAXPORTS=4 -CONFIG_SERIAL_ALTERA_UART_BAUDRATE=115200 -CONFIG_SERIAL_IFX6X60=m -CONFIG_SERIAL_ARC=m -CONFIG_SERIAL_ARC_NR_PORTS=1 -CONFIG_SERIAL_RP2=m -CONFIG_SERIAL_RP2_NR_UARTS=32 -CONFIG_SERIAL_FSL_LPUART=m -CONFIG_SERIAL_MEN_Z135=m -CONFIG_SERIAL_DEV_BUS=m -CONFIG_PRINTER=m -CONFIG_LP_CONSOLE=y -CONFIG_PPDEV=m -CONFIG_HVC_DRIVER=y -CONFIG_VIRTIO_CONSOLE=m -CONFIG_IPMI_HANDLER=m -CONFIG_IPMI_DMI_DECODE=y -CONFIG_IPMI_PANIC_EVENT=y -CONFIG_IPMI_PANIC_STRING=y -CONFIG_IPMI_DEVICE_INTERFACE=m -CONFIG_IPMI_SI=m -CONFIG_IPMI_SSIF=m -CONFIG_IPMI_WATCHDOG=m -CONFIG_IPMI_POWEROFF=m -CONFIG_HW_RANDOM=m -CONFIG_HW_RANDOM_TIMERIOMEM=m -CONFIG_HW_RANDOM_INTEL=m -CONFIG_HW_RANDOM_AMD=m -CONFIG_HW_RANDOM_VIA=m -CONFIG_HW_RANDOM_VIRTIO=m -CONFIG_HW_RANDOM_TPM=m -CONFIG_NVRAM=m -CONFIG_R3964=m -CONFIG_APPLICOM=m - -# -# PCMCIA character devices -# -CONFIG_SYNCLINK_CS=m -CONFIG_CARDMAN_4000=m -CONFIG_CARDMAN_4040=m -CONFIG_SCR24X=m -CONFIG_IPWIRELESS=m -CONFIG_MWAVE=m -CONFIG_RAW_DRIVER=m -CONFIG_MAX_RAW_DEVS=256 -CONFIG_HPET=y -CONFIG_HPET_MMAP=y -CONFIG_HPET_MMAP_DEFAULT=y -CONFIG_HANGCHECK_TIMER=m -CONFIG_TCG_TPM=m -CONFIG_TCG_TIS_CORE=m -CONFIG_TCG_TIS=m -CONFIG_TCG_TIS_SPI=m -CONFIG_TCG_TIS_I2C_ATMEL=m -CONFIG_TCG_TIS_I2C_INFINEON=m -CONFIG_TCG_TIS_I2C_NUVOTON=m -CONFIG_TCG_NSC=m -CONFIG_TCG_ATMEL=m -CONFIG_TCG_INFINEON=m -CONFIG_TCG_CRB=m -CONFIG_TCG_VTPM_PROXY=m -CONFIG_TCG_TIS_ST33ZP24=m -CONFIG_TCG_TIS_ST33ZP24_I2C=m -CONFIG_TCG_TIS_ST33ZP24_SPI=m -CONFIG_TELCLOCK=m -CONFIG_DEVPORT=y -CONFIG_XILLYBUS=m -CONFIG_XILLYBUS_PCIE=m - -# -# I2C support -# -CONFIG_I2C=m -CONFIG_I2C_BOARDINFO=y -CONFIG_I2C_COMPAT=y -CONFIG_I2C_CHARDEV=m -CONFIG_I2C_MUX=m - -# -# Multiplexer I2C Chip support -# -CONFIG_I2C_MUX_GPIO=m -CONFIG_I2C_MUX_LTC4306=m -CONFIG_I2C_MUX_PCA9541=m -CONFIG_I2C_MUX_PCA954x=m -CONFIG_I2C_MUX_REG=m -CONFIG_I2C_MUX_MLXCPLD=m -CONFIG_I2C_HELPER_AUTO=y -CONFIG_I2C_SMBUS=m -CONFIG_I2C_ALGOBIT=m -CONFIG_I2C_ALGOPCA=m - -# -# I2C Hardware Bus support -# - -# -# PC SMBus host controller drivers -# -CONFIG_I2C_ALI1535=m -CONFIG_I2C_ALI1563=m -CONFIG_I2C_ALI15X3=m -CONFIG_I2C_AMD756=m -CONFIG_I2C_AMD756_S4882=m -CONFIG_I2C_AMD8111=m -CONFIG_I2C_I801=m -CONFIG_I2C_ISCH=m -CONFIG_I2C_ISMT=m -CONFIG_I2C_PIIX4=m -CONFIG_I2C_NFORCE2=m -CONFIG_I2C_NFORCE2_S4985=m -CONFIG_I2C_SIS5595=m -CONFIG_I2C_SIS630=m -CONFIG_I2C_SIS96X=m -CONFIG_I2C_VIA=m -CONFIG_I2C_VIAPRO=m - -# -# ACPI drivers -# -CONFIG_I2C_SCMI=m - -# -# I2C system bus drivers (mostly embedded / system-on-chip) -# -CONFIG_I2C_CBUS_GPIO=m -CONFIG_I2C_DESIGNWARE_CORE=m -CONFIG_I2C_DESIGNWARE_PLATFORM=m -# CONFIG_I2C_DESIGNWARE_SLAVE is not set -CONFIG_I2C_DESIGNWARE_PCI=m -# CONFIG_I2C_DESIGNWARE_BAYTRAIL is not set -CONFIG_I2C_EMEV2=m -CONFIG_I2C_GPIO=m -CONFIG_I2C_KEMPLD=m -CONFIG_I2C_OCORES=m -CONFIG_I2C_PCA_PLATFORM=m -# CONFIG_I2C_PXA_PCI is not set -CONFIG_I2C_SIMTEC=m -CONFIG_I2C_XILINX=m - -# -# External I2C/SMBus adapter drivers -# -CONFIG_I2C_DIOLAN_U2C=m -CONFIG_I2C_DLN2=m -CONFIG_I2C_PARPORT=m -CONFIG_I2C_PARPORT_LIGHT=m -CONFIG_I2C_ROBOTFUZZ_OSIF=m -CONFIG_I2C_TAOS_EVM=m -CONFIG_I2C_TINY_USB=m -CONFIG_I2C_VIPERBOARD=m - -# -# Other I2C/SMBus bus drivers -# 
-CONFIG_I2C_MLXCPLD=m -CONFIG_I2C_CROS_EC_TUNNEL=m -# CONFIG_I2C_STUB is not set -CONFIG_I2C_SLAVE=y -CONFIG_I2C_SLAVE_EEPROM=m -# CONFIG_I2C_DEBUG_CORE is not set -# CONFIG_I2C_DEBUG_ALGO is not set -# CONFIG_I2C_DEBUG_BUS is not set -CONFIG_SPI=y -# CONFIG_SPI_DEBUG is not set -CONFIG_SPI_MASTER=y - -# -# SPI Master Controller Drivers -# -CONFIG_SPI_ALTERA=m -CONFIG_SPI_AXI_SPI_ENGINE=m -CONFIG_SPI_BITBANG=m -CONFIG_SPI_BUTTERFLY=m -CONFIG_SPI_CADENCE=m -CONFIG_SPI_DESIGNWARE=m -CONFIG_SPI_DW_PCI=m -CONFIG_SPI_DW_MID_DMA=y -CONFIG_SPI_DW_MMIO=m -CONFIG_SPI_DLN2=m -CONFIG_SPI_GPIO=m -CONFIG_SPI_LM70_LLP=m -CONFIG_SPI_OC_TINY=m -CONFIG_SPI_PXA2XX=m -CONFIG_SPI_PXA2XX_PCI=m -CONFIG_SPI_ROCKCHIP=m -CONFIG_SPI_SC18IS602=m -CONFIG_SPI_XCOMM=m -CONFIG_SPI_XILINX=m -CONFIG_SPI_ZYNQMP_GQSPI=m - -# -# SPI Protocol Masters -# -CONFIG_SPI_SPIDEV=m -CONFIG_SPI_LOOPBACK_TEST=m -CONFIG_SPI_TLE62X0=m -CONFIG_SPI_SLAVE=y -CONFIG_SPI_SLAVE_TIME=m -CONFIG_SPI_SLAVE_SYSTEM_CONTROL=m -CONFIG_SPMI=m -CONFIG_HSI=m -CONFIG_HSI_BOARDINFO=y - -# -# HSI controllers -# - -# -# HSI clients -# -CONFIG_HSI_CHAR=m -CONFIG_PPS=m -# CONFIG_PPS_DEBUG is not set -# CONFIG_NTP_PPS is not set - -# -# PPS clients support -# -# CONFIG_PPS_CLIENT_KTIMER is not set -CONFIG_PPS_CLIENT_LDISC=m -CONFIG_PPS_CLIENT_PARPORT=m -CONFIG_PPS_CLIENT_GPIO=m - -# -# PPS generators support -# - -# -# PTP clock support -# -CONFIG_PTP_1588_CLOCK=m -CONFIG_DP83640_PHY=m -CONFIG_PTP_1588_CLOCK_KVM=m -CONFIG_PINCTRL=y - -# -# Pin controllers -# -CONFIG_PINMUX=y -CONFIG_PINCONF=y -CONFIG_GENERIC_PINCONF=y -# CONFIG_DEBUG_PINCTRL is not set -CONFIG_PINCTRL_AMD=m -CONFIG_PINCTRL_MCP23S08=m -CONFIG_PINCTRL_BAYTRAIL=y -CONFIG_PINCTRL_CHERRYVIEW=m -CONFIG_PINCTRL_INTEL=m -CONFIG_PINCTRL_BROXTON=m -CONFIG_PINCTRL_CANNONLAKE=m -CONFIG_PINCTRL_DENVERTON=m -CONFIG_PINCTRL_GEMINILAKE=m -CONFIG_PINCTRL_LEWISBURG=m -CONFIG_PINCTRL_SUNRISEPOINT=m -CONFIG_GPIOLIB=y -CONFIG_GPIO_ACPI=y -CONFIG_GPIOLIB_IRQCHIP=y -# CONFIG_DEBUG_GPIO is not set -CONFIG_GPIO_SYSFS=y -CONFIG_GPIO_GENERIC=m -CONFIG_GPIO_MAX730X=m - -# -# Memory mapped GPIO drivers -# -CONFIG_GPIO_AMDPT=m -CONFIG_GPIO_AXP209=m -CONFIG_GPIO_DWAPB=m -CONFIG_GPIO_EXAR=m -CONFIG_GPIO_GENERIC_PLATFORM=m -CONFIG_GPIO_ICH=m -CONFIG_GPIO_LYNXPOINT=y -CONFIG_GPIO_MENZ127=m -CONFIG_GPIO_MOCKUP=m -CONFIG_GPIO_VX855=m - -# -# Port-mapped I/O GPIO drivers -# -CONFIG_GPIO_F7188X=m -CONFIG_GPIO_IT87=m -CONFIG_GPIO_SCH=m -CONFIG_GPIO_SCH311X=m - -# -# I2C GPIO expanders -# -CONFIG_GPIO_ADP5588=m -CONFIG_GPIO_MAX7300=m -CONFIG_GPIO_MAX732X=m -CONFIG_GPIO_PCA953X=m -CONFIG_GPIO_PCF857X=m -CONFIG_GPIO_TPIC2810=m - -# -# MFD GPIO expanders -# -CONFIG_GPIO_ARIZONA=m -CONFIG_GPIO_BD9571MWV=m -CONFIG_GPIO_DA9052=m -CONFIG_GPIO_DLN2=m -CONFIG_GPIO_JANZ_TTL=m -CONFIG_GPIO_KEMPLD=m -CONFIG_GPIO_LP3943=m -CONFIG_GPIO_LP873X=m -CONFIG_GPIO_TPS65086=m -CONFIG_GPIO_TPS65218=m -CONFIG_GPIO_TPS65912=m -CONFIG_GPIO_UCB1400=m -CONFIG_GPIO_WHISKEY_COVE=m -CONFIG_GPIO_WM831X=m -CONFIG_GPIO_WM8994=m - -# -# PCI GPIO expanders -# -CONFIG_GPIO_AMD8111=m -CONFIG_GPIO_ML_IOH=m -CONFIG_GPIO_PCI_IDIO_16=m -CONFIG_GPIO_RDC321X=m - -# -# SPI GPIO expanders -# -CONFIG_GPIO_MAX7301=m -CONFIG_GPIO_MC33880=m -CONFIG_GPIO_PISOSR=m -CONFIG_GPIO_XRA1403=m - -# -# USB GPIO expanders -# -CONFIG_GPIO_VIPERBOARD=m -CONFIG_W1=m -CONFIG_W1_CON=y - -# -# 1-wire Bus Masters -# -CONFIG_W1_MASTER_MATROX=m -CONFIG_W1_MASTER_DS2490=m -CONFIG_W1_MASTER_DS2482=m -CONFIG_W1_MASTER_DS1WM=m -CONFIG_W1_MASTER_GPIO=m - -# -# 1-wire Slaves -# -CONFIG_W1_SLAVE_THERM=m 
-CONFIG_W1_SLAVE_SMEM=m -# CONFIG_W1_SLAVE_DS2405 is not set -CONFIG_W1_SLAVE_DS2408=m -# CONFIG_W1_SLAVE_DS2408_READBACK is not set -CONFIG_W1_SLAVE_DS2413=m -CONFIG_W1_SLAVE_DS2406=m -CONFIG_W1_SLAVE_DS2423=m -CONFIG_W1_SLAVE_DS2805=m -CONFIG_W1_SLAVE_DS2431=m -CONFIG_W1_SLAVE_DS2433=m -CONFIG_W1_SLAVE_DS2433_CRC=y -CONFIG_W1_SLAVE_DS2438=m -CONFIG_W1_SLAVE_DS2760=m -CONFIG_W1_SLAVE_DS2780=m -CONFIG_W1_SLAVE_DS2781=m -CONFIG_W1_SLAVE_DS28E04=m -CONFIG_POWER_AVS=y -CONFIG_POWER_RESET=y -# CONFIG_POWER_RESET_RESTART is not set -CONFIG_POWER_SUPPLY=y -# CONFIG_POWER_SUPPLY_DEBUG is not set -CONFIG_PDA_POWER=m -CONFIG_GENERIC_ADC_BATTERY=m -CONFIG_WM831X_BACKUP=m -CONFIG_WM831X_POWER=m -# CONFIG_TEST_POWER is not set -CONFIG_BATTERY_DS2760=m -CONFIG_BATTERY_DS2780=m -CONFIG_BATTERY_DS2781=m -CONFIG_BATTERY_DS2782=m -CONFIG_BATTERY_SBS=m -CONFIG_CHARGER_SBS=m -CONFIG_BATTERY_BQ27XXX=m -CONFIG_BATTERY_BQ27XXX_I2C=m -CONFIG_BATTERY_BQ27XXX_HDQ=m -# CONFIG_BATTERY_BQ27XXX_DT_UPDATES_NVM is not set -CONFIG_BATTERY_DA9052=m -CONFIG_CHARGER_DA9150=m -CONFIG_BATTERY_DA9150=m -CONFIG_CHARGER_AXP20X=m -CONFIG_BATTERY_AXP20X=m -CONFIG_AXP20X_POWER=m -CONFIG_AXP288_CHARGER=m -CONFIG_AXP288_FUEL_GAUGE=m -CONFIG_BATTERY_MAX17040=m -CONFIG_BATTERY_MAX17042=m -CONFIG_BATTERY_MAX1721X=m -CONFIG_CHARGER_PCF50633=m -CONFIG_CHARGER_ISP1704=m -CONFIG_CHARGER_MAX8903=m -CONFIG_CHARGER_LP8727=m -CONFIG_CHARGER_GPIO=m -CONFIG_CHARGER_MANAGER=y -CONFIG_CHARGER_LTC3651=m -CONFIG_CHARGER_MAX14577=m -CONFIG_CHARGER_MAX77693=m -CONFIG_CHARGER_BQ2415X=m -CONFIG_CHARGER_BQ24190=m -CONFIG_CHARGER_BQ24257=m -CONFIG_CHARGER_BQ24735=m -CONFIG_CHARGER_BQ25890=m -CONFIG_CHARGER_SMB347=m -CONFIG_CHARGER_TPS65217=m -CONFIG_BATTERY_GAUGE_LTC2941=m -CONFIG_BATTERY_RT5033=m -CONFIG_CHARGER_RT9455=m -CONFIG_HWMON=m -CONFIG_HWMON_VID=m -# CONFIG_HWMON_DEBUG_CHIP is not set - -# -# Native drivers -# -CONFIG_SENSORS_ABITUGURU=m -CONFIG_SENSORS_ABITUGURU3=m -CONFIG_SENSORS_AD7314=m -CONFIG_SENSORS_AD7414=m -CONFIG_SENSORS_AD7418=m -CONFIG_SENSORS_ADM1021=m -CONFIG_SENSORS_ADM1025=m -CONFIG_SENSORS_ADM1026=m -CONFIG_SENSORS_ADM1029=m -CONFIG_SENSORS_ADM1031=m -CONFIG_SENSORS_ADM9240=m -CONFIG_SENSORS_ADT7X10=m -CONFIG_SENSORS_ADT7310=m -CONFIG_SENSORS_ADT7410=m -CONFIG_SENSORS_ADT7411=m -CONFIG_SENSORS_ADT7462=m -CONFIG_SENSORS_ADT7470=m -CONFIG_SENSORS_ADT7475=m -CONFIG_SENSORS_ASC7621=m -CONFIG_SENSORS_K8TEMP=m -CONFIG_SENSORS_K10TEMP=m -CONFIG_SENSORS_FAM15H_POWER=m -CONFIG_SENSORS_APPLESMC=m -CONFIG_SENSORS_ASB100=m -CONFIG_SENSORS_ASPEED=m -CONFIG_SENSORS_ATXP1=m -CONFIG_SENSORS_DS620=m -CONFIG_SENSORS_DS1621=m -CONFIG_SENSORS_DELL_SMM=m -CONFIG_SENSORS_DA9052_ADC=m -CONFIG_SENSORS_I5K_AMB=m -CONFIG_SENSORS_F71805F=m -CONFIG_SENSORS_F71882FG=m -CONFIG_SENSORS_F75375S=m -CONFIG_SENSORS_MC13783_ADC=m -CONFIG_SENSORS_FSCHMD=m -CONFIG_SENSORS_FTSTEUTATES=m -CONFIG_SENSORS_GL518SM=m -CONFIG_SENSORS_GL520SM=m -CONFIG_SENSORS_G760A=m -CONFIG_SENSORS_G762=m -CONFIG_SENSORS_GPIO_FAN=m -CONFIG_SENSORS_HIH6130=m -CONFIG_SENSORS_IBMAEM=m -CONFIG_SENSORS_IBMPEX=m -CONFIG_SENSORS_IIO_HWMON=m -CONFIG_SENSORS_I5500=m -CONFIG_SENSORS_CORETEMP=m -CONFIG_SENSORS_IT87=m -CONFIG_SENSORS_JC42=m -CONFIG_SENSORS_POWR1220=m -CONFIG_SENSORS_LINEAGE=m -CONFIG_SENSORS_LTC2945=m -CONFIG_SENSORS_LTC2990=m -CONFIG_SENSORS_LTC4151=m -CONFIG_SENSORS_LTC4215=m -CONFIG_SENSORS_LTC4222=m -CONFIG_SENSORS_LTC4245=m -CONFIG_SENSORS_LTC4260=m -CONFIG_SENSORS_LTC4261=m -CONFIG_SENSORS_MAX1111=m -CONFIG_SENSORS_MAX16065=m -CONFIG_SENSORS_MAX1619=m -CONFIG_SENSORS_MAX1668=m 
-CONFIG_SENSORS_MAX197=m -CONFIG_SENSORS_MAX31722=m -CONFIG_SENSORS_MAX6639=m -CONFIG_SENSORS_MAX6642=m -CONFIG_SENSORS_MAX6650=m -CONFIG_SENSORS_MAX6697=m -CONFIG_SENSORS_MAX31790=m -CONFIG_SENSORS_MCP3021=m -CONFIG_SENSORS_TC654=m -CONFIG_SENSORS_MENF21BMC_HWMON=m -CONFIG_SENSORS_ADCXX=m -CONFIG_SENSORS_LM63=m -CONFIG_SENSORS_LM70=m -CONFIG_SENSORS_LM73=m -CONFIG_SENSORS_LM75=m -CONFIG_SENSORS_LM77=m -CONFIG_SENSORS_LM78=m -CONFIG_SENSORS_LM80=m -CONFIG_SENSORS_LM83=m -CONFIG_SENSORS_LM85=m -CONFIG_SENSORS_LM87=m -CONFIG_SENSORS_LM90=m -CONFIG_SENSORS_LM92=m -CONFIG_SENSORS_LM93=m -CONFIG_SENSORS_LM95234=m -CONFIG_SENSORS_LM95241=m -CONFIG_SENSORS_LM95245=m -CONFIG_SENSORS_PC87360=m -CONFIG_SENSORS_PC87427=m -CONFIG_SENSORS_NTC_THERMISTOR=m -CONFIG_SENSORS_NCT6683=m -CONFIG_SENSORS_NCT6775=m -CONFIG_SENSORS_NCT7802=m -CONFIG_SENSORS_NCT7904=m -CONFIG_SENSORS_PCF8591=m -CONFIG_PMBUS=m -CONFIG_SENSORS_PMBUS=m -CONFIG_SENSORS_ADM1275=m -CONFIG_SENSORS_IBM_CFFPS=m -CONFIG_SENSORS_IR35221=m -CONFIG_SENSORS_LM25066=m -CONFIG_SENSORS_LTC2978=m -CONFIG_SENSORS_LTC2978_REGULATOR=y -CONFIG_SENSORS_LTC3815=m -CONFIG_SENSORS_MAX16064=m -CONFIG_SENSORS_MAX20751=m -CONFIG_SENSORS_MAX34440=m -CONFIG_SENSORS_MAX8688=m -CONFIG_SENSORS_TPS40422=m -CONFIG_SENSORS_TPS53679=m -CONFIG_SENSORS_UCD9000=m -CONFIG_SENSORS_UCD9200=m -CONFIG_SENSORS_ZL6100=m -CONFIG_SENSORS_SHT15=m -CONFIG_SENSORS_SHT21=m -CONFIG_SENSORS_SHT3x=m -CONFIG_SENSORS_SHTC1=m -CONFIG_SENSORS_SIS5595=m -CONFIG_SENSORS_DME1737=m -CONFIG_SENSORS_EMC1403=m -CONFIG_SENSORS_EMC2103=m -CONFIG_SENSORS_EMC6W201=m -CONFIG_SENSORS_SMSC47M1=m -CONFIG_SENSORS_SMSC47M192=m -CONFIG_SENSORS_SMSC47B397=m -CONFIG_SENSORS_SCH56XX_COMMON=m -CONFIG_SENSORS_SCH5627=m -CONFIG_SENSORS_SCH5636=m -CONFIG_SENSORS_STTS751=m -CONFIG_SENSORS_SMM665=m -CONFIG_SENSORS_ADC128D818=m -CONFIG_SENSORS_ADS1015=m -CONFIG_SENSORS_ADS7828=m -CONFIG_SENSORS_ADS7871=m -CONFIG_SENSORS_AMC6821=m -CONFIG_SENSORS_INA209=m -CONFIG_SENSORS_INA2XX=m -CONFIG_SENSORS_INA3221=m -CONFIG_SENSORS_TC74=m -CONFIG_SENSORS_THMC50=m -CONFIG_SENSORS_TMP102=m -CONFIG_SENSORS_TMP103=m -CONFIG_SENSORS_TMP108=m -CONFIG_SENSORS_TMP401=m -CONFIG_SENSORS_TMP421=m -CONFIG_SENSORS_VIA_CPUTEMP=m -CONFIG_SENSORS_VIA686A=m -CONFIG_SENSORS_VT1211=m -CONFIG_SENSORS_VT8231=m -CONFIG_SENSORS_W83781D=m -CONFIG_SENSORS_W83791D=m -CONFIG_SENSORS_W83792D=m -CONFIG_SENSORS_W83793=m -CONFIG_SENSORS_W83795=m -# CONFIG_SENSORS_W83795_FANCTRL is not set -CONFIG_SENSORS_W83L785TS=m -CONFIG_SENSORS_W83L786NG=m -CONFIG_SENSORS_W83627HF=m -CONFIG_SENSORS_W83627EHF=m -CONFIG_SENSORS_WM831X=m -CONFIG_SENSORS_XGENE=m - -# -# ACPI drivers -# -CONFIG_SENSORS_ACPI_POWER=m -CONFIG_SENSORS_ATK0110=m -CONFIG_THERMAL=y -CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=0 -CONFIG_THERMAL_WRITABLE_TRIPS=y -CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y -# CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE is not set -# CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE is not set -# CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR is not set -CONFIG_THERMAL_GOV_FAIR_SHARE=y -CONFIG_THERMAL_GOV_STEP_WISE=y -CONFIG_THERMAL_GOV_BANG_BANG=y -CONFIG_THERMAL_GOV_USER_SPACE=y -# CONFIG_THERMAL_GOV_POWER_ALLOCATOR is not set -CONFIG_CLOCK_THERMAL=y -CONFIG_DEVFREQ_THERMAL=y -# CONFIG_THERMAL_EMULATION is not set -CONFIG_INTEL_POWERCLAMP=m -CONFIG_X86_PKG_TEMP_THERMAL=m -CONFIG_INTEL_SOC_DTS_IOSF_CORE=m -CONFIG_INTEL_SOC_DTS_THERMAL=m - -# -# ACPI INT340X thermal drivers -# -CONFIG_INT340X_THERMAL=m -CONFIG_ACPI_THERMAL_REL=m -CONFIG_INT3406_THERMAL=m -CONFIG_INTEL_BXT_PMIC_THERMAL=m 
[diff hunk continues: removal of the old kernel configuration — machine-generated CONFIG_* option lines (one "-CONFIG_…" entry per line in the original patch) covering the watchdog, SSB/BCMA, multifunction-device (MFD), regulator, remote-control/IR, media (V4L2/DVB/radio/SDR), graphics (AGP/DRM/framebuffer/backlight/console), sound (ALSA, HD-Audio, USB/FireWire audio, ASoC codecs), HID, USB (host controllers, device classes, serial, gadget, Type-C), MMC/MemStick, LED, InfiniBand, EDAC, RTC, DMA-engine, auxdisplay, UIO/VFIO, virtio/Hyper-V, staging (IrDA, Comedi, Realtek Wi-Fi, IIO staging, Speakup, fbtft, Greybus), x86 and Chrome platform, common-clock/mailbox/IOMMU, remoteproc/rpmsg, devfreq, extcon and IIO sensor subsystems.]
-CONFIG_IIO_ST_GYRO_SPI_3AXIS=m -CONFIG_ITG3200=m - -# -# Health Sensors -# - -# -# Heart Rate Monitors -# -CONFIG_AFE4403=m -CONFIG_AFE4404=m -CONFIG_MAX30100=m -CONFIG_MAX30102=m - -# -# Humidity sensors -# -CONFIG_AM2315=m -CONFIG_DHT11=m -CONFIG_HDC100X=m -CONFIG_HID_SENSOR_HUMIDITY=m -CONFIG_HTS221=m -CONFIG_HTS221_I2C=m -CONFIG_HTS221_SPI=m -CONFIG_HTU21=m -CONFIG_SI7005=m -CONFIG_SI7020=m - -# -# Inertial measurement units -# -CONFIG_ADIS16400=m -CONFIG_ADIS16480=m -CONFIG_BMI160=m -CONFIG_BMI160_I2C=m -CONFIG_BMI160_SPI=m -CONFIG_KMX61=m -CONFIG_INV_MPU6050_IIO=m -CONFIG_INV_MPU6050_I2C=m -CONFIG_INV_MPU6050_SPI=m -CONFIG_IIO_ST_LSM6DSX=m -CONFIG_IIO_ST_LSM6DSX_I2C=m -CONFIG_IIO_ST_LSM6DSX_SPI=m -CONFIG_IIO_ADIS_LIB=m -CONFIG_IIO_ADIS_LIB_BUFFER=y - -# -# Light sensors -# -# CONFIG_ACPI_ALS is not set -CONFIG_ADJD_S311=m -CONFIG_AL3320A=m -CONFIG_APDS9300=m -CONFIG_APDS9960=m -CONFIG_BH1750=m -CONFIG_BH1780=m -CONFIG_CM32181=m -CONFIG_CM3232=m -CONFIG_CM3323=m -CONFIG_CM36651=m -CONFIG_IIO_CROS_EC_LIGHT_PROX=m -CONFIG_GP2AP020A00F=m -CONFIG_SENSORS_ISL29018=m -CONFIG_SENSORS_ISL29028=m -CONFIG_ISL29125=m -CONFIG_HID_SENSOR_ALS=m -CONFIG_HID_SENSOR_PROX=m -CONFIG_JSA1212=m -CONFIG_RPR0521=m -CONFIG_SENSORS_LM3533=m -CONFIG_LTR501=m -CONFIG_MAX44000=m -CONFIG_OPT3001=m -CONFIG_PA12203001=m -CONFIG_SI1145=m -CONFIG_STK3310=m -CONFIG_TCS3414=m -CONFIG_TCS3472=m -CONFIG_SENSORS_TSL2563=m -CONFIG_TSL2583=m -CONFIG_TSL4531=m -CONFIG_US5182D=m -CONFIG_VCNL4000=m -CONFIG_VEML6070=m -CONFIG_VL6180=m - -# -# Magnetometer sensors -# -CONFIG_AK8975=m -CONFIG_AK09911=m -CONFIG_BMC150_MAGN=m -CONFIG_BMC150_MAGN_I2C=m -CONFIG_BMC150_MAGN_SPI=m -CONFIG_MAG3110=m -CONFIG_HID_SENSOR_MAGNETOMETER_3D=m -CONFIG_MMC35240=m -CONFIG_IIO_ST_MAGN_3AXIS=m -CONFIG_IIO_ST_MAGN_I2C_3AXIS=m -CONFIG_IIO_ST_MAGN_SPI_3AXIS=m -CONFIG_SENSORS_HMC5843=m -CONFIG_SENSORS_HMC5843_I2C=m -CONFIG_SENSORS_HMC5843_SPI=m - -# -# Multiplexers -# - -# -# Inclinometer sensors -# -CONFIG_HID_SENSOR_INCLINOMETER_3D=m -CONFIG_HID_SENSOR_DEVICE_ROTATION=m - -# -# Triggers - standalone -# -CONFIG_IIO_HRTIMER_TRIGGER=m -CONFIG_IIO_INTERRUPT_TRIGGER=m -CONFIG_IIO_TIGHTLOOP_TRIGGER=m -CONFIG_IIO_SYSFS_TRIGGER=m - -# -# Digital potentiometers -# -CONFIG_DS1803=m -CONFIG_MAX5481=m -CONFIG_MAX5487=m -CONFIG_MCP4131=m -CONFIG_MCP4531=m -CONFIG_TPL0102=m - -# -# Digital potentiostats -# -CONFIG_LMP91000=m - -# -# Pressure sensors -# -CONFIG_ABP060MG=m -CONFIG_BMP280=m -CONFIG_BMP280_I2C=m -CONFIG_BMP280_SPI=m -CONFIG_IIO_CROS_EC_BARO=m -CONFIG_HID_SENSOR_PRESS=m -CONFIG_HP03=m -CONFIG_MPL115=m -CONFIG_MPL115_I2C=m -CONFIG_MPL115_SPI=m -CONFIG_MPL3115=m -CONFIG_MS5611=m -CONFIG_MS5611_I2C=m -CONFIG_MS5611_SPI=m -CONFIG_MS5637=m -CONFIG_IIO_ST_PRESS=m -CONFIG_IIO_ST_PRESS_I2C=m -CONFIG_IIO_ST_PRESS_SPI=m -CONFIG_T5403=m -CONFIG_HP206C=m -CONFIG_ZPA2326=m -CONFIG_ZPA2326_I2C=m -CONFIG_ZPA2326_SPI=m - -# -# Lightning sensors -# -CONFIG_AS3935=m - -# -# Proximity and distance sensors -# -CONFIG_LIDAR_LITE_V2=m -CONFIG_SRF04=m -CONFIG_SX9500=m -CONFIG_SRF08=m - -# -# Temperature sensors -# -CONFIG_MAXIM_THERMOCOUPLE=m -CONFIG_HID_SENSOR_TEMP=m -CONFIG_MLX90614=m -CONFIG_TMP006=m -CONFIG_TMP007=m -CONFIG_TSYS01=m -CONFIG_TSYS02D=m -CONFIG_NTB=m -CONFIG_NTB_AMD=m -CONFIG_NTB_IDT=m -CONFIG_NTB_INTEL=m -CONFIG_NTB_PINGPONG=m -CONFIG_NTB_TOOL=m -CONFIG_NTB_PERF=m -CONFIG_NTB_TRANSPORT=m -CONFIG_VME_BUS=y - -# -# VME Bridge Drivers -# -CONFIG_VME_CA91CX42=m -CONFIG_VME_TSI148=m -CONFIG_VME_FAKE=m - -# -# VME Board Drivers -# -CONFIG_VMIVME_7805=m - -# -# 
VME Device Drivers -# -CONFIG_VME_USER=m -CONFIG_VME_PIO2=m -CONFIG_PWM=y -CONFIG_PWM_SYSFS=y -CONFIG_PWM_CROS_EC=m -CONFIG_PWM_LP3943=m -CONFIG_PWM_LPSS=m -CONFIG_PWM_LPSS_PCI=m -CONFIG_PWM_LPSS_PLATFORM=m -CONFIG_PWM_PCA9685=m -CONFIG_ARM_GIC_MAX_NR=1 -CONFIG_IPACK_BUS=m -CONFIG_BOARD_TPCI200=m -CONFIG_SERIAL_IPOCTAL=m -CONFIG_RESET_CONTROLLER=y -# CONFIG_RESET_ATH79 is not set -# CONFIG_RESET_BERLIN is not set -# CONFIG_RESET_IMX7 is not set -# CONFIG_RESET_LANTIQ is not set -# CONFIG_RESET_LPC18XX is not set -# CONFIG_RESET_MESON is not set -# CONFIG_RESET_PISTACHIO is not set -# CONFIG_RESET_SOCFPGA is not set -# CONFIG_RESET_STM32 is not set -# CONFIG_RESET_SUNXI is not set -CONFIG_RESET_TI_SYSCON=m -# CONFIG_RESET_ZYNQ is not set -# CONFIG_RESET_TEGRA_BPMP is not set -CONFIG_FMC=m -CONFIG_FMC_FAKEDEV=m -CONFIG_FMC_TRIVIAL=m -CONFIG_FMC_WRITE_EEPROM=m -CONFIG_FMC_CHARDEV=m - -# -# PHY Subsystem -# -CONFIG_GENERIC_PHY=y -CONFIG_BCM_KONA_USB2_PHY=m -CONFIG_PHY_PXA_28NM_HSIC=m -CONFIG_PHY_PXA_28NM_USB2=m -CONFIG_PHY_CPCAP_USB=m -CONFIG_PHY_QCOM_USB_HS=m -CONFIG_PHY_QCOM_USB_HSIC=m -CONFIG_PHY_SAMSUNG_USB2=m -# CONFIG_PHY_EXYNOS4210_USB2 is not set -# CONFIG_PHY_EXYNOS4X12_USB2 is not set -# CONFIG_PHY_EXYNOS5250_USB2 is not set -CONFIG_PHY_TUSB1210=m -CONFIG_POWERCAP=y -CONFIG_INTEL_RAPL=m -CONFIG_MCB=m -CONFIG_MCB_PCI=m -CONFIG_MCB_LPC=m - -# -# Performance monitor support -# -CONFIG_RAS=y -CONFIG_RAS_CEC=y -CONFIG_THUNDERBOLT=m - -# -# Android -# -# CONFIG_ANDROID is not set -CONFIG_LIBNVDIMM=m -CONFIG_BLK_DEV_PMEM=m -CONFIG_ND_BLK=m -CONFIG_ND_CLAIM=y -CONFIG_ND_BTT=m -CONFIG_BTT=y -CONFIG_DAX=y -CONFIG_DEV_DAX=m -CONFIG_NVMEM=y -CONFIG_STM=m -# CONFIG_STM_DUMMY is not set -CONFIG_STM_SOURCE_CONSOLE=m -CONFIG_STM_SOURCE_HEARTBEAT=m -CONFIG_INTEL_TH=m -CONFIG_INTEL_TH_PCI=m -CONFIG_INTEL_TH_GTH=m -CONFIG_INTEL_TH_STH=m -CONFIG_INTEL_TH_MSU=m -CONFIG_INTEL_TH_PTI=m -# CONFIG_INTEL_TH_DEBUG is not set -CONFIG_FPGA=m -CONFIG_FPGA_MGR_ALTERA_CVP=m -CONFIG_FPGA_MGR_ALTERA_PS_SPI=m -CONFIG_FPGA_MGR_XILINX_SPI=m -CONFIG_ALTERA_PR_IP_CORE=m - -# -# FSI support -# -CONFIG_FSI=m -CONFIG_FSI_MASTER_GPIO=m -CONFIG_FSI_MASTER_HUB=m -CONFIG_FSI_SCOM=m - -# -# Firmware Drivers -# -CONFIG_EDD=m -# CONFIG_EDD_OFF is not set -CONFIG_FIRMWARE_MEMMAP=y -CONFIG_DELL_RBU=m -CONFIG_DCDBAS=m -CONFIG_DMIID=y -CONFIG_DMI_SYSFS=m -CONFIG_DMI_SCAN_MACHINE_NON_EFI_FALLBACK=y -CONFIG_ISCSI_IBFT_FIND=y -CONFIG_ISCSI_IBFT=m -CONFIG_FW_CFG_SYSFS=m -# CONFIG_FW_CFG_SYSFS_CMDLINE is not set -CONFIG_GOOGLE_FIRMWARE=y -CONFIG_GOOGLE_SMI=m -CONFIG_GOOGLE_COREBOOT_TABLE=m -CONFIG_GOOGLE_COREBOOT_TABLE_ACPI=m -CONFIG_GOOGLE_MEMCONSOLE=m -CONFIG_GOOGLE_MEMCONSOLE_X86_LEGACY=m -CONFIG_GOOGLE_MEMCONSOLE_COREBOOT=m -CONFIG_GOOGLE_VPD=m - -# -# EFI (Extensible Firmware Interface) Support -# -CONFIG_EFI_VARS=m -CONFIG_EFI_ESRT=y -CONFIG_EFI_VARS_PSTORE=m -CONFIG_EFI_VARS_PSTORE_DEFAULT_DISABLE=y -CONFIG_EFI_RUNTIME_MAP=y -# CONFIG_EFI_FAKE_MEMMAP is not set -CONFIG_EFI_RUNTIME_WRAPPERS=y -CONFIG_EFI_BOOTLOADER_CONTROL=m -CONFIG_EFI_CAPSULE_LOADER=m -CONFIG_EFI_TEST=m -CONFIG_APPLE_PROPERTIES=y -CONFIG_RESET_ATTACK_MITIGATION=y -CONFIG_UEFI_CPER=y -CONFIG_EFI_DEV_PATH_PARSER=y - -# -# Tegra firmware driver -# - -# -# File systems -# -CONFIG_DCACHE_WORD_ACCESS=y -CONFIG_FS_IOMAP=y -CONFIG_EXT2_FS=m -CONFIG_EXT2_FS_XATTR=y -CONFIG_EXT2_FS_POSIX_ACL=y -CONFIG_EXT2_FS_SECURITY=y -CONFIG_EXT3_FS=m -CONFIG_EXT3_FS_POSIX_ACL=y -CONFIG_EXT3_FS_SECURITY=y -CONFIG_EXT4_FS=m -CONFIG_EXT4_FS_POSIX_ACL=y -CONFIG_EXT4_FS_SECURITY=y 
-CONFIG_EXT4_ENCRYPTION=y -CONFIG_EXT4_FS_ENCRYPTION=y -# CONFIG_EXT4_DEBUG is not set -CONFIG_JBD2=m -# CONFIG_JBD2_DEBUG is not set -CONFIG_FS_MBCACHE=m -# CONFIG_REISERFS_FS is not set -# CONFIG_JFS_FS is not set -CONFIG_XFS_FS=m -CONFIG_XFS_QUOTA=y -CONFIG_XFS_POSIX_ACL=y -CONFIG_XFS_RT=y -# CONFIG_XFS_WARN is not set -# CONFIG_XFS_DEBUG is not set -# CONFIG_GFS2_FS is not set -# CONFIG_OCFS2_FS is not set -CONFIG_BTRFS_FS=m -CONFIG_BTRFS_FS_POSIX_ACL=y -# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set -# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set -# CONFIG_BTRFS_DEBUG is not set -# CONFIG_BTRFS_ASSERT is not set -CONFIG_NILFS2_FS=m -CONFIG_F2FS_FS=m -CONFIG_F2FS_STAT_FS=y -CONFIG_F2FS_FS_XATTR=y -CONFIG_F2FS_FS_POSIX_ACL=y -CONFIG_F2FS_FS_SECURITY=y -CONFIG_F2FS_CHECK_FS=y -CONFIG_F2FS_FS_ENCRYPTION=y -# CONFIG_F2FS_FAULT_INJECTION is not set -CONFIG_FS_DAX=y -CONFIG_FS_POSIX_ACL=y -CONFIG_EXPORTFS=y -CONFIG_EXPORTFS_BLOCK_OPS=y -CONFIG_FILE_LOCKING=y -CONFIG_MANDATORY_FILE_LOCKING=y -CONFIG_FS_ENCRYPTION=m -CONFIG_FSNOTIFY=y -CONFIG_DNOTIFY=y -CONFIG_INOTIFY_USER=y -CONFIG_FANOTIFY=y -# CONFIG_FANOTIFY_ACCESS_PERMISSIONS is not set -CONFIG_QUOTA=y -CONFIG_QUOTA_NETLINK_INTERFACE=y -# CONFIG_PRINT_QUOTA_WARNING is not set -# CONFIG_QUOTA_DEBUG is not set -CONFIG_QUOTA_TREE=m -CONFIG_QFMT_V1=m -CONFIG_QFMT_V2=m -CONFIG_QUOTACTL=y -CONFIG_QUOTACTL_COMPAT=y -CONFIG_AUTOFS4_FS=m -CONFIG_FUSE_FS=m -CONFIG_CUSE=m -CONFIG_OVERLAY_FS=m -# CONFIG_OVERLAY_FS_REDIRECT_DIR is not set -CONFIG_OVERLAY_FS_INDEX=y - -# -# Caches -# -CONFIG_FSCACHE=m -CONFIG_FSCACHE_STATS=y -# CONFIG_FSCACHE_HISTOGRAM is not set -# CONFIG_FSCACHE_DEBUG is not set -# CONFIG_FSCACHE_OBJECT_LIST is not set -CONFIG_CACHEFILES=m -# CONFIG_CACHEFILES_DEBUG is not set -# CONFIG_CACHEFILES_HISTOGRAM is not set - -# -# CD-ROM/DVD Filesystems -# -CONFIG_ISO9660_FS=m -CONFIG_JOLIET=y -CONFIG_ZISOFS=y -CONFIG_UDF_FS=m -CONFIG_UDF_NLS=y - -# -# DOS/FAT/NT Filesystems -# -CONFIG_FAT_FS=m -CONFIG_MSDOS_FS=m -CONFIG_VFAT_FS=m -CONFIG_FAT_DEFAULT_CODEPAGE=437 -CONFIG_FAT_DEFAULT_IOCHARSET="utf8" -# CONFIG_FAT_DEFAULT_UTF8 is not set -CONFIG_NTFS_FS=m -# CONFIG_NTFS_DEBUG is not set -CONFIG_NTFS_RW=y - -# -# Pseudo filesystems -# -CONFIG_PROC_FS=y -CONFIG_PROC_KCORE=y -CONFIG_PROC_SYSCTL=y -CONFIG_PROC_PAGE_MONITOR=y -# CONFIG_PROC_CHILDREN is not set -CONFIG_KERNFS=y -CONFIG_SYSFS=y -CONFIG_TMPFS=y -CONFIG_TMPFS_POSIX_ACL=y -CONFIG_TMPFS_XATTR=y -CONFIG_HUGETLBFS=y -CONFIG_HUGETLB_PAGE=y -CONFIG_ARCH_HAS_GIGANTIC_PAGE=y -CONFIG_CONFIGFS_FS=m -CONFIG_EFIVAR_FS=m -CONFIG_MISC_FILESYSTEMS=y -CONFIG_ORANGEFS_FS=m -CONFIG_ADFS_FS=m -# CONFIG_ADFS_FS_RW is not set -CONFIG_AFFS_FS=m -CONFIG_ECRYPT_FS=m -CONFIG_ECRYPT_FS_MESSAGING=y -CONFIG_HFS_FS=m -CONFIG_HFSPLUS_FS=m -CONFIG_HFSPLUS_FS_POSIX_ACL=y -CONFIG_BEFS_FS=m -# CONFIG_BEFS_DEBUG is not set -CONFIG_BFS_FS=m -CONFIG_EFS_FS=m -CONFIG_JFFS2_FS=m -CONFIG_JFFS2_FS_DEBUG=0 -CONFIG_JFFS2_FS_WRITEBUFFER=y -CONFIG_JFFS2_FS_WBUF_VERIFY=y -CONFIG_JFFS2_SUMMARY=y -CONFIG_JFFS2_FS_XATTR=y -CONFIG_JFFS2_FS_POSIX_ACL=y -CONFIG_JFFS2_FS_SECURITY=y -CONFIG_JFFS2_COMPRESSION_OPTIONS=y -CONFIG_JFFS2_ZLIB=y -CONFIG_JFFS2_LZO=y -CONFIG_JFFS2_RTIME=y -CONFIG_JFFS2_RUBIN=y -# CONFIG_JFFS2_CMODE_NONE is not set -CONFIG_JFFS2_CMODE_PRIORITY=y -# CONFIG_JFFS2_CMODE_SIZE is not set -# CONFIG_JFFS2_CMODE_FAVOURLZO is not set -CONFIG_UBIFS_FS=m -CONFIG_UBIFS_FS_ADVANCED_COMPR=y -CONFIG_UBIFS_FS_LZO=y -CONFIG_UBIFS_FS_ZLIB=y -CONFIG_UBIFS_ATIME_SUPPORT=y -CONFIG_UBIFS_FS_ENCRYPTION=y -CONFIG_UBIFS_FS_SECURITY=y 
-CONFIG_CRAMFS=m -CONFIG_SQUASHFS=m -# CONFIG_SQUASHFS_FILE_CACHE is not set -CONFIG_SQUASHFS_FILE_DIRECT=y -# CONFIG_SQUASHFS_DECOMP_SINGLE is not set -# CONFIG_SQUASHFS_DECOMP_MULTI is not set -CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU=y -CONFIG_SQUASHFS_XATTR=y -CONFIG_SQUASHFS_ZLIB=y -CONFIG_SQUASHFS_LZ4=y -CONFIG_SQUASHFS_LZO=y -CONFIG_SQUASHFS_XZ=y -CONFIG_SQUASHFS_ZSTD=y -CONFIG_SQUASHFS_4K_DEVBLK_SIZE=y -CONFIG_SQUASHFS_EMBEDDED=y -CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3 -CONFIG_VXFS_FS=m -CONFIG_MINIX_FS=m -CONFIG_OMFS_FS=m -CONFIG_HPFS_FS=m -CONFIG_QNX4FS_FS=m -CONFIG_QNX6FS_FS=m -# CONFIG_QNX6FS_DEBUG is not set -CONFIG_ROMFS_FS=m -# CONFIG_ROMFS_BACKED_BY_BLOCK is not set -# CONFIG_ROMFS_BACKED_BY_MTD is not set -CONFIG_ROMFS_BACKED_BY_BOTH=y -CONFIG_ROMFS_ON_BLOCK=y -CONFIG_ROMFS_ON_MTD=y -CONFIG_PSTORE=y -# CONFIG_PSTORE_ZLIB_COMPRESS is not set -# CONFIG_PSTORE_LZO_COMPRESS is not set -CONFIG_PSTORE_LZ4_COMPRESS=y -# CONFIG_PSTORE_CONSOLE is not set -CONFIG_PSTORE_PMSG=y -CONFIG_PSTORE_RAM=m -CONFIG_SYSV_FS=m -CONFIG_UFS_FS=m -# CONFIG_UFS_FS_WRITE is not set -# CONFIG_UFS_DEBUG is not set -CONFIG_EXOFS_FS=m -# CONFIG_EXOFS_DEBUG is not set -CONFIG_ORE=m -CONFIG_NETWORK_FILESYSTEMS=y -CONFIG_NFS_FS=m -CONFIG_NFS_V2=m -CONFIG_NFS_V3=m -CONFIG_NFS_V3_ACL=y -CONFIG_NFS_V4=m -CONFIG_NFS_SWAP=y -CONFIG_NFS_V4_1=y -CONFIG_NFS_V4_2=y -CONFIG_PNFS_FILE_LAYOUT=m -CONFIG_PNFS_BLOCK=m -CONFIG_PNFS_FLEXFILE_LAYOUT=m -CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org" -CONFIG_NFS_V4_1_MIGRATION=y -CONFIG_NFS_V4_SECURITY_LABEL=y -CONFIG_NFS_FSCACHE=y -# CONFIG_NFS_USE_LEGACY_DNS is not set -CONFIG_NFS_USE_KERNEL_DNS=y -CONFIG_NFSD=m -CONFIG_NFSD_V2_ACL=y -CONFIG_NFSD_V3=y -CONFIG_NFSD_V3_ACL=y -CONFIG_NFSD_V4=y -CONFIG_NFSD_PNFS=y -CONFIG_NFSD_BLOCKLAYOUT=y -CONFIG_NFSD_SCSILAYOUT=y -CONFIG_NFSD_FLEXFILELAYOUT=y -# CONFIG_NFSD_V4_SECURITY_LABEL is not set -# CONFIG_NFSD_FAULT_INJECTION is not set -CONFIG_GRACE_PERIOD=m -CONFIG_LOCKD=m -CONFIG_LOCKD_V4=y -CONFIG_NFS_ACL_SUPPORT=m -CONFIG_NFS_COMMON=y -CONFIG_SUNRPC=m -CONFIG_SUNRPC_GSS=m -CONFIG_SUNRPC_BACKCHANNEL=y -CONFIG_SUNRPC_SWAP=y -CONFIG_RPCSEC_GSS_KRB5=m -# CONFIG_SUNRPC_DEBUG is not set -CONFIG_SUNRPC_XPRT_RDMA=m -CONFIG_CEPH_FS=m -CONFIG_CEPH_FSCACHE=y -CONFIG_CEPH_FS_POSIX_ACL=y -CONFIG_CIFS=m -CONFIG_CIFS_STATS=y -CONFIG_CIFS_STATS2=y -CONFIG_CIFS_WEAK_PW_HASH=y -CONFIG_CIFS_UPCALL=y -CONFIG_CIFS_XATTR=y -CONFIG_CIFS_ACL=y -# CONFIG_CIFS_DEBUG is not set -CONFIG_CIFS_DFS_UPCALL=y -CONFIG_CIFS_SMB311=y -CONFIG_CIFS_FSCACHE=y -CONFIG_NCP_FS=m -CONFIG_NCPFS_PACKET_SIGNING=y -CONFIG_NCPFS_IOCTL_LOCKING=y -CONFIG_NCPFS_STRONG=y -CONFIG_NCPFS_NFS_NS=y -CONFIG_NCPFS_OS2_NS=y -CONFIG_NCPFS_SMALLDOS=y -CONFIG_NCPFS_NLS=y -CONFIG_NCPFS_EXTRAS=y -CONFIG_CODA_FS=m -CONFIG_AFS_FS=m -# CONFIG_AFS_DEBUG is not set -CONFIG_AFS_FSCACHE=y -CONFIG_9P_FS=m -CONFIG_9P_FSCACHE=y -CONFIG_9P_FS_POSIX_ACL=y -CONFIG_9P_FS_SECURITY=y -CONFIG_NLS=y -CONFIG_NLS_DEFAULT="utf8" -CONFIG_NLS_CODEPAGE_437=m -CONFIG_NLS_CODEPAGE_737=m -CONFIG_NLS_CODEPAGE_775=m -CONFIG_NLS_CODEPAGE_850=m -CONFIG_NLS_CODEPAGE_852=m -CONFIG_NLS_CODEPAGE_855=m -CONFIG_NLS_CODEPAGE_857=m -CONFIG_NLS_CODEPAGE_860=m -CONFIG_NLS_CODEPAGE_861=m -CONFIG_NLS_CODEPAGE_862=m -CONFIG_NLS_CODEPAGE_863=m -CONFIG_NLS_CODEPAGE_864=m -CONFIG_NLS_CODEPAGE_865=m -CONFIG_NLS_CODEPAGE_866=m -CONFIG_NLS_CODEPAGE_869=m -CONFIG_NLS_CODEPAGE_936=m -CONFIG_NLS_CODEPAGE_950=m -CONFIG_NLS_CODEPAGE_932=m -CONFIG_NLS_CODEPAGE_949=m -CONFIG_NLS_CODEPAGE_874=m -CONFIG_NLS_ISO8859_8=m -CONFIG_NLS_CODEPAGE_1250=m 
-CONFIG_NLS_CODEPAGE_1251=m -CONFIG_NLS_ASCII=m -CONFIG_NLS_ISO8859_1=m -CONFIG_NLS_ISO8859_2=m -CONFIG_NLS_ISO8859_3=m -CONFIG_NLS_ISO8859_4=m -CONFIG_NLS_ISO8859_5=m -CONFIG_NLS_ISO8859_6=m -CONFIG_NLS_ISO8859_7=m -CONFIG_NLS_ISO8859_9=m -CONFIG_NLS_ISO8859_13=m -CONFIG_NLS_ISO8859_14=m -CONFIG_NLS_ISO8859_15=m -CONFIG_NLS_KOI8_R=m -CONFIG_NLS_KOI8_U=m -CONFIG_NLS_MAC_ROMAN=m -CONFIG_NLS_MAC_CELTIC=m -CONFIG_NLS_MAC_CENTEURO=m -CONFIG_NLS_MAC_CROATIAN=m -CONFIG_NLS_MAC_CYRILLIC=m -CONFIG_NLS_MAC_GAELIC=m -CONFIG_NLS_MAC_GREEK=m -CONFIG_NLS_MAC_ICELAND=m -CONFIG_NLS_MAC_INUIT=m -CONFIG_NLS_MAC_ROMANIAN=m -CONFIG_NLS_MAC_TURKISH=m -CONFIG_NLS_UTF8=m -CONFIG_DLM=m -# CONFIG_DLM_DEBUG is not set - -# -# Kernel hacking -# -CONFIG_TRACE_IRQFLAGS_SUPPORT=y - -# -# printk and dmesg options -# -# CONFIG_PRINTK_TIME is not set -CONFIG_CONSOLE_LOGLEVEL_DEFAULT=1 -CONFIG_MESSAGE_LOGLEVEL_DEFAULT=1 -# CONFIG_BOOT_PRINTK_DELAY is not set -# CONFIG_DYNAMIC_DEBUG is not set - -# -# Compile-time checks and compiler options -# -# CONFIG_DEBUG_INFO is not set -# CONFIG_ENABLE_WARN_DEPRECATED is not set -# CONFIG_ENABLE_MUST_CHECK is not set -CONFIG_FRAME_WARN=0 -CONFIG_STRIP_ASM_SYMS=y -# CONFIG_READABLE_ASM is not set -# CONFIG_UNUSED_SYMBOLS is not set -# CONFIG_PAGE_OWNER is not set -CONFIG_DEBUG_FS=y -# CONFIG_HEADERS_CHECK is not set -# CONFIG_DEBUG_SECTION_MISMATCH is not set -CONFIG_SECTION_MISMATCH_WARN_ONLY=y -CONFIG_STACK_VALIDATION=y -# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set -CONFIG_MAGIC_SYSRQ=y -CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x1 -CONFIG_MAGIC_SYSRQ_SERIAL=y -CONFIG_DEBUG_KERNEL=y - -# -# Memory Debugging -# -# CONFIG_PAGE_EXTENSION is not set -# CONFIG_DEBUG_PAGEALLOC is not set -# CONFIG_PAGE_POISONING is not set -# CONFIG_DEBUG_PAGE_REF is not set -# CONFIG_DEBUG_RODATA_TEST is not set -# CONFIG_DEBUG_OBJECTS is not set -CONFIG_SLUB_DEBUG_ON=y -# CONFIG_SLUB_STATS is not set -CONFIG_HAVE_DEBUG_KMEMLEAK=y -# CONFIG_DEBUG_KMEMLEAK is not set -# CONFIG_DEBUG_STACK_USAGE is not set -# CONFIG_DEBUG_VM is not set -CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y -# CONFIG_DEBUG_VIRTUAL is not set -CONFIG_DEBUG_MEMORY_INIT=y -# CONFIG_DEBUG_PER_CPU_MAPS is not set -CONFIG_HAVE_DEBUG_STACKOVERFLOW=y -# CONFIG_DEBUG_STACKOVERFLOW is not set -CONFIG_HAVE_ARCH_KASAN=y -# CONFIG_KASAN is not set -CONFIG_ARCH_HAS_KCOV=y -# CONFIG_KCOV is not set -# CONFIG_DEBUG_SHIRQ is not set - -# -# Debug Lockups and Hangs -# -# CONFIG_SOFTLOCKUP_DETECTOR is not set -CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y -# CONFIG_HARDLOCKUP_DETECTOR is not set -# CONFIG_DETECT_HUNG_TASK is not set -# CONFIG_WQ_WATCHDOG is not set -# CONFIG_PANIC_ON_OOPS is not set -CONFIG_PANIC_ON_OOPS_VALUE=0 -CONFIG_PANIC_TIMEOUT=0 -CONFIG_SCHED_DEBUG=y -CONFIG_SCHED_INFO=y -CONFIG_SCHEDSTATS=y -CONFIG_SCHED_STACK_END_CHECK=y -# CONFIG_DEBUG_TIMEKEEPING is not set -# CONFIG_DEBUG_PREEMPT is not set - -# -# Lock Debugging (spinlocks, mutexes, etc...) 
-# -# CONFIG_DEBUG_RT_MUTEXES is not set -# CONFIG_DEBUG_SPINLOCK is not set -# CONFIG_DEBUG_MUTEXES is not set -# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set -# CONFIG_DEBUG_LOCK_ALLOC is not set -# CONFIG_PROVE_LOCKING is not set -# CONFIG_LOCK_STAT is not set -# CONFIG_DEBUG_ATOMIC_SLEEP is not set -# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set -# CONFIG_LOCK_TORTURE_TEST is not set -# CONFIG_WW_MUTEX_SELFTEST is not set -CONFIG_STACKTRACE=y -# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set -# CONFIG_DEBUG_KOBJECT is not set -CONFIG_DEBUG_BUGVERBOSE=y -CONFIG_DEBUG_LIST=y -CONFIG_DEBUG_PI_LIST=y -CONFIG_DEBUG_SG=y -CONFIG_DEBUG_NOTIFIERS=y -CONFIG_DEBUG_CREDENTIALS=y - -# -# RCU Debugging -# -# CONFIG_PROVE_RCU is not set -CONFIG_TORTURE_TEST=m -CONFIG_RCU_PERF_TEST=m -# CONFIG_RCU_TORTURE_TEST is not set -CONFIG_RCU_CPU_STALL_TIMEOUT=60 -# CONFIG_RCU_TRACE is not set -# CONFIG_RCU_EQS_DEBUG is not set -# CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set -# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set -# CONFIG_CPU_HOTPLUG_STATE_CONTROL is not set -# CONFIG_NOTIFIER_ERROR_INJECTION is not set -# CONFIG_FAULT_INJECTION is not set -# CONFIG_LATENCYTOP is not set -CONFIG_USER_STACKTRACE_SUPPORT=y -CONFIG_NOP_TRACER=y -CONFIG_HAVE_FUNCTION_TRACER=y -CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y -CONFIG_HAVE_DYNAMIC_FTRACE=y -CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS=y -CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y -CONFIG_HAVE_SYSCALL_TRACEPOINTS=y -CONFIG_HAVE_FENTRY=y -CONFIG_HAVE_C_RECORDMCOUNT=y -CONFIG_TRACE_CLOCK=y -CONFIG_RING_BUFFER=y -CONFIG_EVENT_TRACING=y -CONFIG_CONTEXT_SWITCH_TRACER=y -CONFIG_TRACING=y -CONFIG_TRACING_SUPPORT=y -CONFIG_FTRACE=y -# CONFIG_FUNCTION_TRACER is not set -# CONFIG_IRQSOFF_TRACER is not set -# CONFIG_PREEMPT_TRACER is not set -# CONFIG_SCHED_TRACER is not set -# CONFIG_HWLAT_TRACER is not set -# CONFIG_ENABLE_DEFAULT_TRACERS is not set -# CONFIG_FTRACE_SYSCALLS is not set -# CONFIG_TRACER_SNAPSHOT is not set -CONFIG_BRANCH_PROFILE_NONE=y -# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set -# CONFIG_STACK_TRACER is not set -# CONFIG_BLK_DEV_IO_TRACE is not set -CONFIG_KPROBE_EVENTS=y -CONFIG_UPROBE_EVENTS=y -CONFIG_BPF_EVENTS=y -CONFIG_PROBE_EVENTS=y -# CONFIG_MMIOTRACE is not set -# CONFIG_HIST_TRIGGERS is not set -# CONFIG_TRACEPOINT_BENCHMARK is not set -# CONFIG_RING_BUFFER_BENCHMARK is not set -# CONFIG_RING_BUFFER_STARTUP_TEST is not set -# CONFIG_TRACE_EVAL_MAP_FILE is not set -# CONFIG_TRACING_EVENTS_GPIO is not set -# CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set -# CONFIG_DMA_API_DEBUG is not set - -# -# Runtime Testing -# -CONFIG_LKDTM=m -# CONFIG_TEST_LIST_SORT is not set -# CONFIG_TEST_SORT is not set -# CONFIG_KPROBES_SANITY_TEST is not set -# CONFIG_BACKTRACE_SELF_TEST is not set -# CONFIG_RBTREE_TEST is not set -# CONFIG_INTERVAL_TREE_TEST is not set -# CONFIG_PERCPU_TEST is not set -# CONFIG_ATOMIC64_SELFTEST is not set -# CONFIG_ASYNC_RAID6_TEST is not set -# CONFIG_TEST_HEXDUMP is not set -# CONFIG_TEST_STRING_HELPERS is not set -# CONFIG_TEST_KSTRTOX is not set -CONFIG_TEST_PRINTF=m -CONFIG_TEST_BITMAP=m -CONFIG_TEST_UUID=m -# CONFIG_TEST_RHASHTABLE is not set -CONFIG_TEST_HASH=m -# CONFIG_TEST_PARMAN is not set -CONFIG_TEST_LKM=m -# CONFIG_TEST_USER_COPY is not set -# CONFIG_TEST_BPF is not set -# CONFIG_TEST_FIRMWARE is not set -CONFIG_TEST_SYSCTL=m -# CONFIG_TEST_UDELAY is not set -CONFIG_TEST_STATIC_KEYS=m -CONFIG_TEST_KMOD=m -CONFIG_MEMTEST=y -# CONFIG_BUG_ON_DATA_CORRUPTION is not set -# CONFIG_SAMPLES is not set -CONFIG_HAVE_ARCH_KGDB=y -# CONFIG_KGDB is not set 
-CONFIG_ARCH_HAS_UBSAN_SANITIZE_ALL=y -# CONFIG_ARCH_WANTS_UBSAN_NO_NULL is not set -# CONFIG_UBSAN is not set -CONFIG_ARCH_HAS_DEVMEM_IS_ALLOWED=y -CONFIG_STRICT_DEVMEM=y -CONFIG_IO_STRICT_DEVMEM=y -CONFIG_EARLY_PRINTK_USB=y -CONFIG_X86_VERBOSE_BOOTUP=y -CONFIG_EARLY_PRINTK=y -# CONFIG_EARLY_PRINTK_DBGP is not set -# CONFIG_EARLY_PRINTK_EFI is not set -CONFIG_EARLY_PRINTK_USB_XDBC=y -CONFIG_X86_PTDUMP_CORE=y -# CONFIG_X86_PTDUMP is not set -# CONFIG_EFI_PGT_DUMP is not set -CONFIG_DEBUG_WX=y -CONFIG_DOUBLEFAULT=y -# CONFIG_DEBUG_TLBFLUSH is not set -# CONFIG_IOMMU_DEBUG is not set -# CONFIG_IOMMU_STRESS is not set -CONFIG_HAVE_MMIOTRACE_SUPPORT=y -# CONFIG_X86_DECODER_SELFTEST is not set -CONFIG_IO_DELAY_TYPE_0X80=0 -CONFIG_IO_DELAY_TYPE_0XED=1 -CONFIG_IO_DELAY_TYPE_UDELAY=2 -CONFIG_IO_DELAY_TYPE_NONE=3 -CONFIG_IO_DELAY_0X80=y -# CONFIG_IO_DELAY_0XED is not set -# CONFIG_IO_DELAY_UDELAY is not set -# CONFIG_IO_DELAY_NONE is not set -CONFIG_DEFAULT_IO_DELAY_TYPE=0 -# CONFIG_DEBUG_BOOT_PARAMS is not set -# CONFIG_CPA_DEBUG is not set -# CONFIG_OPTIMIZE_INLINING is not set -# CONFIG_DEBUG_ENTRY is not set -# CONFIG_DEBUG_NMI_SELFTEST is not set -CONFIG_X86_DEBUG_FPU=y -# CONFIG_PUNIT_ATOM_DEBUG is not set -CONFIG_UNWINDER_ORC=y -# CONFIG_UNWINDER_FRAME_POINTER is not set - -# -# Security options -# -CONFIG_KEYS=y -CONFIG_KEYS_COMPAT=y -CONFIG_PERSISTENT_KEYRINGS=y -# CONFIG_BIG_KEYS is not set -CONFIG_TRUSTED_KEYS=m -CONFIG_ENCRYPTED_KEYS=m -# CONFIG_KEY_DH_OPERATIONS is not set -CONFIG_SECURITY_DMESG_RESTRICT=y -CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y -CONFIG_SECURITY_TIOCSTI_RESTRICT=y -CONFIG_SECURITY=y -# CONFIG_SECURITY_WRITABLE_HOOKS is not set -CONFIG_SECURITYFS=y -CONFIG_SECURITY_NETWORK=y -CONFIG_PAGE_TABLE_ISOLATION=y -# CONFIG_SECURITY_INFINIBAND is not set -# CONFIG_SECURITY_NETWORK_XFRM is not set -CONFIG_SECURITY_PATH=y -CONFIG_INTEL_TXT=y -CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y -CONFIG_HARDENED_USERCOPY=y -CONFIG_FORTIFY_SOURCE=y -CONFIG_PAGE_SANITIZE=y -CONFIG_PAGE_SANITIZE_VERIFY=y -# CONFIG_STATIC_USERMODEHELPER is not set -# CONFIG_SECURITY_SELINUX is not set -# CONFIG_SECURITY_SMACK is not set -# CONFIG_SECURITY_TOMOYO is not set -CONFIG_SECURITY_APPARMOR=y -CONFIG_SECURITY_APPARMOR_BOOTPARAM_VALUE=1 -CONFIG_SECURITY_APPARMOR_HASH=y -CONFIG_SECURITY_APPARMOR_HASH_DEFAULT=y -# CONFIG_SECURITY_APPARMOR_DEBUG is not set -# CONFIG_SECURITY_LOADPIN is not set -CONFIG_SECURITY_YAMA=y -# CONFIG_INTEGRITY is not set -CONFIG_DEFAULT_SECURITY_APPARMOR=y -# CONFIG_DEFAULT_SECURITY_DAC is not set -CONFIG_DEFAULT_SECURITY="apparmor" -CONFIG_XOR_BLOCKS=m -CONFIG_ASYNC_CORE=m -CONFIG_ASYNC_MEMCPY=m -CONFIG_ASYNC_XOR=m -CONFIG_ASYNC_PQ=m -CONFIG_ASYNC_RAID6_RECOV=m -CONFIG_CRYPTO=y - -# -# Crypto core or helper -# -CONFIG_CRYPTO_ALGAPI=y -CONFIG_CRYPTO_ALGAPI2=y -CONFIG_CRYPTO_AEAD=m -CONFIG_CRYPTO_AEAD2=y -CONFIG_CRYPTO_BLKCIPHER=y -CONFIG_CRYPTO_BLKCIPHER2=y -CONFIG_CRYPTO_HASH=y -CONFIG_CRYPTO_HASH2=y -CONFIG_CRYPTO_RNG=m -CONFIG_CRYPTO_RNG2=y -CONFIG_CRYPTO_RNG_DEFAULT=m -CONFIG_CRYPTO_AKCIPHER2=y -CONFIG_CRYPTO_AKCIPHER=y -CONFIG_CRYPTO_KPP2=y -CONFIG_CRYPTO_KPP=m -CONFIG_CRYPTO_ACOMP2=y -CONFIG_CRYPTO_RSA=y -CONFIG_CRYPTO_DH=m -CONFIG_CRYPTO_ECDH=m -CONFIG_CRYPTO_MANAGER=y -CONFIG_CRYPTO_MANAGER2=y -CONFIG_CRYPTO_USER=m -CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y -CONFIG_CRYPTO_GF128MUL=m -CONFIG_CRYPTO_NULL=m -CONFIG_CRYPTO_NULL2=y -CONFIG_CRYPTO_PCRYPT=m -CONFIG_CRYPTO_WORKQUEUE=y -CONFIG_CRYPTO_CRYPTD=m -CONFIG_CRYPTO_MCRYPTD=m -CONFIG_CRYPTO_AUTHENC=m -CONFIG_CRYPTO_TEST=m 
-CONFIG_CRYPTO_ABLK_HELPER=m -CONFIG_CRYPTO_SIMD=m -CONFIG_CRYPTO_GLUE_HELPER_X86=m -CONFIG_CRYPTO_ENGINE=m - -# -# Authenticated Encryption with Associated Data -# -CONFIG_CRYPTO_CCM=m -CONFIG_CRYPTO_GCM=m -CONFIG_CRYPTO_CHACHA20POLY1305=m -CONFIG_CRYPTO_SEQIV=m -CONFIG_CRYPTO_ECHAINIV=m - -# -# Block modes -# -CONFIG_CRYPTO_CBC=m -CONFIG_CRYPTO_CTR=m -CONFIG_CRYPTO_CTS=m -CONFIG_CRYPTO_ECB=y -CONFIG_CRYPTO_LRW=m -CONFIG_CRYPTO_PCBC=m -CONFIG_CRYPTO_XTS=m -CONFIG_CRYPTO_KEYWRAP=m - -# -# Hash modes -# -CONFIG_CRYPTO_CMAC=m -CONFIG_CRYPTO_HMAC=m -CONFIG_CRYPTO_XCBC=m -CONFIG_CRYPTO_VMAC=m - -# -# Digest -# -CONFIG_CRYPTO_CRC32C=m -CONFIG_CRYPTO_CRC32C_INTEL=m -CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_CRC32_PCLMUL=m -CONFIG_CRYPTO_CRCT10DIF=y -CONFIG_CRYPTO_CRCT10DIF_PCLMUL=m -CONFIG_CRYPTO_GHASH=m -CONFIG_CRYPTO_POLY1305=m -CONFIG_CRYPTO_POLY1305_X86_64=m -CONFIG_CRYPTO_MD4=m -CONFIG_CRYPTO_MD5=y -CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_RMD128=m -CONFIG_CRYPTO_RMD160=m -CONFIG_CRYPTO_RMD256=m -CONFIG_CRYPTO_RMD320=m -CONFIG_CRYPTO_SHA1=y -CONFIG_CRYPTO_SHA1_SSSE3=m -CONFIG_CRYPTO_SHA256_SSSE3=m -CONFIG_CRYPTO_SHA512_SSSE3=m -CONFIG_CRYPTO_SHA1_MB=m -CONFIG_CRYPTO_SHA256_MB=m -CONFIG_CRYPTO_SHA512_MB=m -CONFIG_CRYPTO_SHA256=m -CONFIG_CRYPTO_SHA512=y -CONFIG_CRYPTO_SHA3=m -CONFIG_CRYPTO_TGR192=m -CONFIG_CRYPTO_WP512=m -CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=m - -# -# Ciphers -# -CONFIG_CRYPTO_AES=y -CONFIG_CRYPTO_AES_TI=m -CONFIG_CRYPTO_AES_X86_64=m -CONFIG_CRYPTO_AES_NI_INTEL=m -CONFIG_CRYPTO_ANUBIS=m -CONFIG_CRYPTO_ARC4=m -CONFIG_CRYPTO_BLOWFISH=m -CONFIG_CRYPTO_BLOWFISH_COMMON=m -CONFIG_CRYPTO_BLOWFISH_X86_64=m -CONFIG_CRYPTO_CAMELLIA=m -CONFIG_CRYPTO_CAMELLIA_X86_64=m -CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64=m -CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m -CONFIG_CRYPTO_CAST_COMMON=m -CONFIG_CRYPTO_CAST5=m -CONFIG_CRYPTO_CAST5_AVX_X86_64=m -CONFIG_CRYPTO_CAST6=m -CONFIG_CRYPTO_CAST6_AVX_X86_64=m -CONFIG_CRYPTO_DES=m -CONFIG_CRYPTO_DES3_EDE_X86_64=m -CONFIG_CRYPTO_FCRYPT=m -CONFIG_CRYPTO_KHAZAD=m -CONFIG_CRYPTO_SALSA20=m -CONFIG_CRYPTO_CHACHA20=m -CONFIG_CRYPTO_CHACHA20_X86_64=m -CONFIG_CRYPTO_SEED=m -CONFIG_CRYPTO_SERPENT=m -CONFIG_CRYPTO_SERPENT_SSE2_X86_64=m -CONFIG_CRYPTO_SERPENT_AVX_X86_64=m -CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m -CONFIG_CRYPTO_TEA=m -CONFIG_CRYPTO_TWOFISH=m -CONFIG_CRYPTO_TWOFISH_COMMON=m -CONFIG_CRYPTO_TWOFISH_X86_64=m -CONFIG_CRYPTO_TWOFISH_X86_64_3WAY=m -CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m - -# -# Compression -# -CONFIG_CRYPTO_DEFLATE=m -CONFIG_CRYPTO_LZO=y -CONFIG_CRYPTO_842=m -CONFIG_CRYPTO_LZ4=m -CONFIG_CRYPTO_LZ4HC=m - -# -# Random Number Generation -# -CONFIG_CRYPTO_ANSI_CPRNG=m -CONFIG_CRYPTO_DRBG_MENU=m -CONFIG_CRYPTO_DRBG_HMAC=y -CONFIG_CRYPTO_DRBG_HASH=y -CONFIG_CRYPTO_DRBG_CTR=y -CONFIG_CRYPTO_DRBG=m -CONFIG_CRYPTO_JITTERENTROPY=m -CONFIG_CRYPTO_USER_API=m -CONFIG_CRYPTO_USER_API_HASH=m -CONFIG_CRYPTO_USER_API_SKCIPHER=m -CONFIG_CRYPTO_USER_API_RNG=m -CONFIG_CRYPTO_USER_API_AEAD=m -CONFIG_CRYPTO_HASH_INFO=y -CONFIG_CRYPTO_HW=y -CONFIG_CRYPTO_DEV_PADLOCK=m -CONFIG_CRYPTO_DEV_PADLOCK_AES=m -CONFIG_CRYPTO_DEV_PADLOCK_SHA=m -# CONFIG_CRYPTO_DEV_FSL_CAAM_CRYPTO_API_DESC is not set -CONFIG_CRYPTO_DEV_CCP=y -CONFIG_CRYPTO_DEV_CCP_DD=m -CONFIG_CRYPTO_DEV_SP_CCP=y -CONFIG_CRYPTO_DEV_CCP_CRYPTO=m -CONFIG_CRYPTO_DEV_QAT=m -CONFIG_CRYPTO_DEV_QAT_DH895xCC=m -CONFIG_CRYPTO_DEV_QAT_C3XXX=m -CONFIG_CRYPTO_DEV_QAT_C62X=m -CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m -CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m -CONFIG_CRYPTO_DEV_QAT_C62XVF=m -CONFIG_CRYPTO_DEV_NITROX=m 
-CONFIG_CRYPTO_DEV_NITROX_CNN55XX=m -CONFIG_CRYPTO_DEV_CHELSIO=m -CONFIG_CRYPTO_DEV_VIRTIO=m -CONFIG_ASYMMETRIC_KEY_TYPE=y -CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y -CONFIG_X509_CERTIFICATE_PARSER=y -CONFIG_PKCS7_MESSAGE_PARSER=y -# CONFIG_PKCS7_TEST_KEY is not set -# CONFIG_SIGNED_PE_FILE_VERIFICATION is not set - -# -# Certificates for signature checking -# -CONFIG_MODULE_SIG_KEY="certs/signing_key.pem" -CONFIG_SYSTEM_TRUSTED_KEYRING=y -CONFIG_SYSTEM_TRUSTED_KEYS="" -# CONFIG_SYSTEM_EXTRA_CERTIFICATE is not set -CONFIG_SECONDARY_TRUSTED_KEYRING=y -CONFIG_SYSTEM_BLACKLIST_KEYRING=y -CONFIG_SYSTEM_BLACKLIST_HASH_LIST="" -CONFIG_HAVE_KVM=y -CONFIG_HAVE_KVM_IRQCHIP=y -CONFIG_HAVE_KVM_IRQFD=y -CONFIG_HAVE_KVM_IRQ_ROUTING=y -CONFIG_HAVE_KVM_EVENTFD=y -CONFIG_KVM_MMIO=y -CONFIG_KVM_ASYNC_PF=y -CONFIG_HAVE_KVM_MSI=y -CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y -CONFIG_KVM_VFIO=y -CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y -CONFIG_KVM_COMPAT=y -CONFIG_HAVE_KVM_IRQ_BYPASS=y -CONFIG_VIRTUALIZATION=y -CONFIG_KVM=m -CONFIG_KVM_INTEL=m -CONFIG_KVM_AMD=m -# CONFIG_KVM_MMU_AUDIT is not set -CONFIG_VHOST_NET=m -CONFIG_VHOST_SCSI=m -CONFIG_VHOST_VSOCK=m -CONFIG_VHOST=m -# CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set -CONFIG_BINARY_PRINTF=y - -# -# Library routines -# -CONFIG_RAID6_PQ=m -CONFIG_BITREVERSE=y -# CONFIG_HAVE_ARCH_BITREVERSE is not set -CONFIG_RATIONAL=y -CONFIG_GENERIC_STRNCPY_FROM_USER=y -CONFIG_GENERIC_STRNLEN_USER=y -CONFIG_GENERIC_NET_UTILS=y -CONFIG_GENERIC_FIND_FIRST_BIT=y -CONFIG_GENERIC_PCI_IOMAP=y -CONFIG_GENERIC_IOMAP=y -CONFIG_GENERIC_IO=y -CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y -CONFIG_ARCH_HAS_FAST_MULTIPLIER=y -CONFIG_CRC_CCITT=m -CONFIG_CRC16=m -CONFIG_CRC_T10DIF=y -CONFIG_CRC_ITU_T=m -CONFIG_CRC32=y -# CONFIG_CRC32_SELFTEST is not set -CONFIG_CRC32_SLICEBY8=y -# CONFIG_CRC32_SLICEBY4 is not set -# CONFIG_CRC32_SARWATE is not set -# CONFIG_CRC32_BIT is not set -CONFIG_CRC4=m -CONFIG_CRC7=m -CONFIG_LIBCRC32C=m -CONFIG_CRC8=m -CONFIG_XXHASH=m -# CONFIG_AUDIT_ARCH_COMPAT_GENERIC is not set -# CONFIG_RANDOM32_SELFTEST is not set -CONFIG_842_COMPRESS=m -CONFIG_842_DECOMPRESS=m -CONFIG_ZLIB_INFLATE=y -CONFIG_ZLIB_DEFLATE=y -CONFIG_LZO_COMPRESS=y -CONFIG_LZO_DECOMPRESS=y -CONFIG_LZ4_COMPRESS=y -CONFIG_LZ4HC_COMPRESS=m -CONFIG_LZ4_DECOMPRESS=y -CONFIG_ZSTD_COMPRESS=m -CONFIG_ZSTD_DECOMPRESS=m -CONFIG_XZ_DEC=y -CONFIG_XZ_DEC_X86=y -CONFIG_XZ_DEC_POWERPC=y -CONFIG_XZ_DEC_IA64=y -CONFIG_XZ_DEC_ARM=y -CONFIG_XZ_DEC_ARMTHUMB=y -CONFIG_XZ_DEC_SPARC=y -CONFIG_XZ_DEC_BCJ=y -# CONFIG_XZ_DEC_TEST is not set -CONFIG_DECOMPRESS_GZIP=y -CONFIG_DECOMPRESS_BZIP2=y -CONFIG_DECOMPRESS_LZMA=y -CONFIG_DECOMPRESS_XZ=y -CONFIG_DECOMPRESS_LZO=y -CONFIG_DECOMPRESS_LZ4=y -CONFIG_GENERIC_ALLOCATOR=y -CONFIG_REED_SOLOMON=m -CONFIG_REED_SOLOMON_ENC8=y -CONFIG_REED_SOLOMON_DEC8=y -CONFIG_REED_SOLOMON_DEC16=y -CONFIG_BCH=m -CONFIG_BCH_CONST_PARAMS=y -CONFIG_TEXTSEARCH=y -CONFIG_TEXTSEARCH_KMP=m -CONFIG_TEXTSEARCH_BM=m -CONFIG_TEXTSEARCH_FSM=m -CONFIG_BTREE=y -CONFIG_INTERVAL_TREE=y -CONFIG_RADIX_TREE_MULTIORDER=y -CONFIG_ASSOCIATIVE_ARRAY=y -CONFIG_HAS_IOMEM=y -CONFIG_HAS_IOPORT_MAP=y -CONFIG_HAS_DMA=y -# CONFIG_DMA_NOOP_OPS is not set -CONFIG_DMA_VIRT_OPS=y -CONFIG_CHECK_SIGNATURE=y -CONFIG_CPUMASK_OFFSTACK=y -CONFIG_CPU_RMAP=y -CONFIG_DQL=y -CONFIG_GLOB=y -# CONFIG_GLOB_SELFTEST is not set -CONFIG_NLATTR=y -CONFIG_LRU_CACHE=m -CONFIG_CLZ_TAB=y -CONFIG_CORDIC=m -CONFIG_DDR=y -CONFIG_IRQ_POLL=y -CONFIG_MPILIB=y -CONFIG_OID_REGISTRY=y -CONFIG_UCS2_STRING=y -CONFIG_FONT_SUPPORT=y -CONFIG_FONTS=y -CONFIG_FONT_8x8=y 
-CONFIG_FONT_8x16=y -# CONFIG_FONT_6x11 is not set -# CONFIG_FONT_7x14 is not set -# CONFIG_FONT_PEARL_8x8 is not set -# CONFIG_FONT_ACORN_8x8 is not set -# CONFIG_FONT_MINI_4x6 is not set -# CONFIG_FONT_6x10 is not set -# CONFIG_FONT_10x18 is not set -# CONFIG_FONT_SUN8x16 is not set -# CONFIG_FONT_SUN12x22 is not set -# CONFIG_SG_SPLIT is not set -CONFIG_SG_POOL=y -CONFIG_ARCH_HAS_SG_CHAIN=y -CONFIG_ARCH_HAS_PMEM_API=y -CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE=y -CONFIG_SBITMAP=y -CONFIG_PARMAN=m -# CONFIG_STRING_SELFTEST is not set diff --git a/sys-kernel/linux-sources-redcore-lts/files/restore-SD_PREFER_SIBLING-on-MC-domains.patch b/sys-kernel/linux-sources-redcore-lts/files/restore-SD_PREFER_SIBLING-on-MC-domains.patch deleted file mode 100644 index b6be46cc..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/restore-SD_PREFER_SIBLING-on-MC-domains.patch +++ /dev/null @@ -1,12 +0,0 @@ -diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c -index 093f2ceba..808998fe1 100644 ---- a/kernel/sched/topology.c -+++ b/kernel/sched/topology.c -@@ -1164,6 +1164,7 @@ sd_init(struct sched_domain_topology_level *tl, - sd->smt_gain = 1178; /* ~15% */ - - } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { -+ sd->flags |= SD_PREFER_SIBLING; - sd->imbalance_pct = 117; - sd->cache_nice_tries = 1; - sd->busy_idx = 2; diff --git a/sys-kernel/linux-sources-redcore-lts/files/uksm-linux-hardened.patch b/sys-kernel/linux-sources-redcore-lts/files/uksm-linux-hardened.patch deleted file mode 100644 index f0596117..00000000 --- a/sys-kernel/linux-sources-redcore-lts/files/uksm-linux-hardened.patch +++ /dev/null @@ -1,6919 +0,0 @@ -diff -Nur a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX ---- a/Documentation/vm/00-INDEX 2018-05-25 15:18:02.000000000 +0100 -+++ b/Documentation/vm/00-INDEX 2018-05-26 19:30:55.783140311 +0100 -@@ -20,6 +20,8 @@ - - description of the idle page tracking feature. - ksm.txt - - how to use the Kernel Samepage Merging feature. -+uksm.txt -+ - Introduction to Ultra KSM - numa - - information about NUMA specific code in the Linux vm. - numa_memory_policy.txt -diff -Nur a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt ---- a/Documentation/vm/uksm.txt 1970-01-01 01:00:00.000000000 +0100 -+++ b/Documentation/vm/uksm.txt 2018-05-26 19:30:55.783140311 +0100 -@@ -0,0 +1,61 @@ -+The Ultra Kernel Samepage Merging feature -+---------------------------------------------- -+/* -+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia -+ * -+ * This is an improvement upon KSM. Some basic data structures and routines -+ * are borrowed from ksm.c . -+ * -+ * Its new features: -+ * 1. Full system scan: -+ * It automatically scans all user processes' anonymous VMAs. Kernel-user -+ * interaction to submit a memory area to KSM is no longer needed. -+ * -+ * 2. Rich area detection: -+ * It automatically detects rich areas containing abundant duplicated -+ * pages based. Rich areas are given a full scan speed. Poor areas are -+ * sampled at a reasonable speed with very low CPU consumption. -+ * -+ * 3. Ultra Per-page scan speed improvement: -+ * A new hash algorithm is proposed. As a result, on a machine with -+ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it -+ * can scan memory areas that does not contain duplicated pages at speed of -+ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of -+ * 477MB/sec ~ 923MB/sec. -+ * -+ * 4. Thrashing area avoidance: -+ * Thrashing area(an VMA that has frequent Ksm page break-out) can be -+ * filtered out. 
My benchmark shows it's more efficient than KSM's per-page -+ * hash value based volatile page detection. -+ * -+ * -+ * 5. Misc changes upon KSM: -+ * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page -+ * comparison. It's much faster than default C version on x86. -+ * * rmap_item now has an struct *page member to loosely cache a -+ * address-->page mapping, which reduces too much time-costly -+ * follow_page(). -+ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. -+ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_ -+ * ksm is needed for this case. -+ * -+ * 6. Full Zero Page consideration(contributed by Figo Zhang) -+ * Now uksmd consider full zero pages as special pages and merge them to an -+ * special unswappable uksm zero page. -+ */ -+ -+ChangeLog: -+ -+2012-05-05 The creation of this Doc -+2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up. -+2012-05-28 UKSM 0.1.1.2 bug fix release -+2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2 -+2012-07-2 UKSM 0.1.2-beta2 -+2012-07-10 UKSM 0.1.2-beta3 -+2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization. -+2012-10-13 UKSM 0.1.2.1 Bug fixes. -+2012-12-31 UKSM 0.1.2.2 Minor bug fixes. -+2014-07-02 UKSM 0.1.2.3 Fix a " __this_cpu_read() in preemptible bug". -+2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger anonying warnings. -+2016-09-10 UKSM 0.1.2.5 Fix a bug in dedup ratio calculation. -+2017-02-26 UKSM 0.1.2.6 Fix a bug in hugetlbpage handling and a race bug with page migration. -diff -Nur a/fs/exec.c b/fs/exec.c ---- a/fs/exec.c 2018-05-26 19:24:34.831782903 +0100 -+++ b/fs/exec.c 2018-05-26 19:31:18.404873956 +0100 -@@ -63,6 +63,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -1377,6 +1378,7 @@ - /* An exec changes our domain. 
We are no longer part of the thread - group */ - current->self_exec_id++; -+ - flush_signal_handlers(current, 0); - } - EXPORT_SYMBOL(setup_new_exec); -diff -Nur a/fs/proc/meminfo.c b/fs/proc/meminfo.c ---- a/fs/proc/meminfo.c 2018-05-25 15:18:02.000000000 +0100 -+++ b/fs/proc/meminfo.c 2018-05-26 19:30:55.784140344 +0100 -@@ -118,6 +118,10 @@ - global_zone_page_state(NR_KERNEL_STACK_KB)); - show_val_kb(m, "PageTables: ", - global_zone_page_state(NR_PAGETABLE)); -+#ifdef CONFIG_UKSM -+ show_val_kb(m, "KsmZeroPages: ", -+ global_zone_page_state(NR_UKSM_ZERO_PAGES)); -+#endif - #ifdef CONFIG_QUICKLIST - show_val_kb(m, "Quicklists: ", quicklist_total_size()); - #endif -diff -Nur a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h ---- a/include/asm-generic/pgtable.h 2018-05-25 15:18:02.000000000 +0100 -+++ b/include/asm-generic/pgtable.h 2018-05-26 19:30:55.784140344 +0100 -@@ -789,12 +789,25 @@ - extern void untrack_pfn_moved(struct vm_area_struct *vma); - #endif - -+#ifdef CONFIG_UKSM -+static inline int is_uksm_zero_pfn(unsigned long pfn) -+{ -+ extern unsigned long uksm_zero_pfn; -+ return pfn == uksm_zero_pfn; -+} -+#else -+static inline int is_uksm_zero_pfn(unsigned long pfn) -+{ -+ return 0; -+} -+#endif -+ - #ifdef __HAVE_COLOR_ZERO_PAGE - static inline int is_zero_pfn(unsigned long pfn) - { - extern unsigned long zero_pfn; - unsigned long offset_from_zero_pfn = pfn - zero_pfn; -- return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); -+ return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn); - } - - #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) -@@ -803,7 +816,7 @@ - static inline int is_zero_pfn(unsigned long pfn) - { - extern unsigned long zero_pfn; -- return pfn == zero_pfn; -+ return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn)); - } - - static inline unsigned long my_zero_pfn(unsigned long addr) -diff -Nur a/include/linux/ksm.h b/include/linux/ksm.h ---- a/include/linux/ksm.h 2018-05-25 15:18:02.000000000 +0100 -+++ b/include/linux/ksm.h 2018-05-26 19:30:55.784140344 +0100 -@@ -21,21 +21,6 @@ - #ifdef CONFIG_KSM - int ksm_madvise(struct vm_area_struct *vma, unsigned long start, - unsigned long end, int advice, unsigned long *vm_flags); --int __ksm_enter(struct mm_struct *mm); --void __ksm_exit(struct mm_struct *mm); -- --static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) --{ -- if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) -- return __ksm_enter(mm); -- return 0; --} -- --static inline void ksm_exit(struct mm_struct *mm) --{ -- if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) -- __ksm_exit(mm); --} - - static inline struct stable_node *page_stable_node(struct page *page) - { -@@ -65,6 +50,33 @@ - void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); - void ksm_migrate_page(struct page *newpage, struct page *oldpage); - -+#ifdef CONFIG_KSM_LEGACY -+int __ksm_enter(struct mm_struct *mm); -+void __ksm_exit(struct mm_struct *mm); -+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) -+{ -+ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) -+ return __ksm_enter(mm); -+ return 0; -+} -+ -+static inline void ksm_exit(struct mm_struct *mm) -+{ -+ if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) -+ __ksm_exit(mm); -+} -+ -+#elif defined(CONFIG_UKSM) -+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) -+{ -+ return 0; -+} -+ -+static inline void ksm_exit(struct mm_struct *mm) -+{ -+} -+#endif /* !CONFIG_UKSM */ -+ - #else /* !CONFIG_KSM */ - - static inline 
int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) -@@ -106,4 +118,6 @@ - #endif /* CONFIG_MMU */ - #endif /* !CONFIG_KSM */ - -+#include <linux/uksm.h> -+ - #endif /* __LINUX_KSM_H */ -diff -Nur a/include/linux/mm_types.h b/include/linux/mm_types.h ---- a/include/linux/mm_types.h 2018-05-25 15:18:02.000000000 +0100 -+++ b/include/linux/mm_types.h 2018-05-26 19:30:55.784140344 +0100 -@@ -337,6 +337,9 @@ - struct mempolicy *vm_policy; /* NUMA policy for the VMA */ - #endif - struct vm_userfaultfd_ctx vm_userfaultfd_ctx; -+#ifdef CONFIG_UKSM -+ struct vma_slot *uksm_vma_slot; -+#endif - } __randomize_layout; - - struct core_thread { -diff -Nur a/include/linux/mmzone.h b/include/linux/mmzone.h ---- a/include/linux/mmzone.h 2018-05-25 15:18:02.000000000 +0100 -+++ b/include/linux/mmzone.h 2018-05-26 19:30:55.785140376 +0100 -@@ -148,6 +148,9 @@ - NR_ZSPAGES, /* allocated in zsmalloc */ - #endif - NR_FREE_CMA_PAGES, -+#ifdef CONFIG_UKSM -+ NR_UKSM_ZERO_PAGES, -+#endif - NR_VM_ZONE_STAT_ITEMS }; - - enum node_stat_item { -@@ -872,7 +875,7 @@ - } - - /** -- * is_highmem - helper function to quickly check if a struct zone is a -+ * is_highmem - helper function to quickly check if a struct zone is a - * highmem zone or not. This is an attempt to keep references - * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. - * @zone - pointer to struct zone variable -diff -Nur a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h ---- a/include/linux/sradix-tree.h 1970-01-01 01:00:00.000000000 +0100 -+++ b/include/linux/sradix-tree.h 2018-05-26 19:30:55.785140376 +0100 -@@ -0,0 +1,77 @@ -+#ifndef _LINUX_SRADIX_TREE_H -+#define _LINUX_SRADIX_TREE_H -+ -+ -+#define INIT_SRADIX_TREE(root, mask) \ -+do { \ -+ (root)->height = 0; \ -+ (root)->gfp_mask = (mask); \ -+ (root)->rnode = NULL; \ -+} while (0) -+ -+#define ULONG_BITS (sizeof(unsigned long) * 8) -+#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) -+//#define SRADIX_TREE_MAP_SHIFT 6 -+//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT) -+//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1) -+ -+struct sradix_tree_node { -+ unsigned int height; /* Height from the bottom */ -+ unsigned int count; -+ unsigned int fulls; /* Number of full sublevel trees */ -+ struct sradix_tree_node *parent; -+ void *stores[0]; -+}; -+ -+/* A simple radix tree implementation */ -+struct sradix_tree_root { -+ unsigned int height; -+ struct sradix_tree_node *rnode; -+ -+ /* Where found to have available empty stores in its sublevels */ -+ struct sradix_tree_node *enter_node; -+ unsigned int shift; -+ unsigned int stores_size; -+ unsigned int mask; -+ unsigned long min; /* The first hole index */ -+ unsigned long num; -+ //unsigned long *height_to_maxindex; -+ -+ /* How the node is allocated and freed. 
*/ -+ struct sradix_tree_node *(*alloc)(void); -+ void (*free)(struct sradix_tree_node *node); -+ -+ /* When a new node is added and removed */ -+ void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child); -+ void (*assign)(struct sradix_tree_node *node, unsigned int index, void *item); -+ void (*rm)(struct sradix_tree_node *node, unsigned int offset); -+}; -+ -+struct sradix_tree_path { -+ struct sradix_tree_node *node; -+ int offset; -+}; -+ -+static inline -+void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift) -+{ -+ root->height = 0; -+ root->rnode = NULL; -+ root->shift = shift; -+ root->stores_size = 1UL << shift; -+ root->mask = root->stores_size - 1; -+} -+ -+ -+extern void *sradix_tree_next(struct sradix_tree_root *root, -+ struct sradix_tree_node *node, unsigned long index, -+ int (*iter)(void *, unsigned long)); -+ -+extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num); -+ -+extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, -+ struct sradix_tree_node *node, unsigned long index); -+ -+extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index); -+ -+#endif /* _LINUX_SRADIX_TREE_H */ -diff -Nur a/include/linux/uksm.h b/include/linux/uksm.h ---- a/include/linux/uksm.h 1970-01-01 01:00:00.000000000 +0100 -+++ b/include/linux/uksm.h 2018-05-26 19:30:55.785140376 +0100 -@@ -0,0 +1,149 @@ -+#ifndef __LINUX_UKSM_H -+#define __LINUX_UKSM_H -+/* -+ * Memory merging support. -+ * -+ * This code enables dynamic sharing of identical pages found in different -+ * memory areas, even if they are not shared by fork(). -+ */ -+ -+/* if !CONFIG_UKSM this file should not be compiled at all. */ -+#ifdef CONFIG_UKSM -+ -+#include -+#include -+#include -+#include -+#include -+ -+extern unsigned long zero_pfn __read_mostly; -+extern unsigned long uksm_zero_pfn __read_mostly; -+extern struct page *empty_uksm_zero_page; -+ -+/* must be done before linked to mm */ -+extern void uksm_vma_add_new(struct vm_area_struct *vma); -+extern void uksm_remove_vma(struct vm_area_struct *vma); -+ -+#define UKSM_SLOT_NEED_SORT (1 << 0) -+#define UKSM_SLOT_NEED_RERAND (1 << 1) -+#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */ -+#define UKSM_SLOT_FUL_SCANNED (1 << 3) -+#define UKSM_SLOT_IN_UKSM (1 << 4) -+ -+struct vma_slot { -+ struct sradix_tree_node *snode; -+ unsigned long sindex; -+ -+ struct list_head slot_list; -+ unsigned long fully_scanned_round; -+ unsigned long dedup_num; -+ unsigned long pages_scanned; -+ unsigned long this_sampled; -+ unsigned long last_scanned; -+ unsigned long pages_to_scan; -+ struct scan_rung *rung; -+ struct page **rmap_list_pool; -+ unsigned int *pool_counts; -+ unsigned long pool_size; -+ struct vm_area_struct *vma; -+ struct mm_struct *mm; -+ unsigned long ctime_j; -+ unsigned long pages; -+ unsigned long flags; -+ unsigned long pages_cowed; /* pages cowed this round */ -+ unsigned long pages_merged; /* pages merged this round */ -+ unsigned long pages_bemerged; -+ -+ /* when it has page merged in this eval round */ -+ struct list_head dedup_list; -+}; -+ -+static inline void uksm_unmap_zero_page(pte_t pte) -+{ -+ if (pte_pfn(pte) == uksm_zero_pfn) -+ __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); -+} -+ -+static inline void uksm_map_zero_page(pte_t pte) -+{ -+ if (pte_pfn(pte) == uksm_zero_pfn) -+ __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); -+} -+ -+static inline void uksm_cow_page(struct vm_area_struct 
*vma, struct page *page) -+{ -+ if (vma->uksm_vma_slot && PageKsm(page)) -+ vma->uksm_vma_slot->pages_cowed++; -+} -+ -+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) -+{ -+ if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn) -+ vma->uksm_vma_slot->pages_cowed++; -+} -+ -+static inline int uksm_flags_can_scan(unsigned long vm_flags) -+{ -+#ifdef VM_SAO -+ if (vm_flags & VM_SAO) -+ return 0; -+#endif -+ -+ return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND | -+ VM_HUGETLB | VM_MIXEDMAP | VM_SHARED -+ | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN)); -+} -+ -+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) -+{ -+ if (uksm_flags_can_scan(*vm_flags_p)) -+ *vm_flags_p |= VM_MERGEABLE; -+} -+ -+/* -+ * Just a wrapper for BUG_ON for where ksm_zeropage must not be. TODO: it will -+ * be removed when uksm zero page patch is stable enough. -+ */ -+static inline void uksm_bugon_zeropage(pte_t pte) -+{ -+ BUG_ON(pte_pfn(pte) == uksm_zero_pfn); -+} -+#else -+static inline void uksm_vma_add_new(struct vm_area_struct *vma) -+{ -+} -+ -+static inline void uksm_remove_vma(struct vm_area_struct *vma) -+{ -+} -+ -+static inline void uksm_unmap_zero_page(pte_t pte) -+{ -+} -+ -+static inline void uksm_map_zero_page(pte_t pte) -+{ -+} -+ -+static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) -+{ -+} -+ -+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) -+{ -+} -+ -+static inline int uksm_flags_can_scan(unsigned long vm_flags) -+{ -+ return 0; -+} -+ -+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) -+{ -+} -+ -+static inline void uksm_bugon_zeropage(pte_t pte) -+{ -+} -+#endif /* !CONFIG_UKSM */ -+#endif /* __LINUX_UKSM_H */ -diff -Nur a/kernel/fork.c b/kernel/fork.c ---- a/kernel/fork.c 2018-05-26 19:24:34.840783196 +0100 -+++ b/kernel/fork.c 2018-05-26 19:30:55.785140376 +0100 -@@ -655,7 +655,7 @@ - goto fail_nomem; - charge = len; - } -- tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); -+ tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); - if (!tmp) - goto fail_nomem; - *tmp = *mpnt; -@@ -714,7 +714,7 @@ - __vma_link_rb(mm, tmp, rb_link, rb_parent); - rb_link = &tmp->vm_rb.rb_right; - rb_parent = &tmp->vm_rb; -- -+ uksm_vma_add_new(tmp); - mm->map_count++; - if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(mm, oldmm, mpnt); -diff -Nur a/lib/Makefile b/lib/Makefile ---- a/lib/Makefile 2018-05-25 15:18:02.000000000 +0100 -+++ b/lib/Makefile 2018-05-26 19:30:55.786140408 +0100 -@@ -18,7 +18,7 @@ - KCOV_INSTRUMENT_dynamic_debug.o := n - - lib-y := ctype.o string.o vsprintf.o cmdline.o \ -- rbtree.o radix-tree.o dump_stack.o timerqueue.o\ -+ rbtree.o radix-tree.o sradix-tree.o dump_stack.o timerqueue.o\ - idr.o int_sqrt.o extable.o \ - sha1.o chacha20.o irq_regs.o argv_split.o \ - flex_proportions.o ratelimit.o show_mem.o \ -diff -Nur a/lib/sradix-tree.c b/lib/sradix-tree.c ---- a/lib/sradix-tree.c 1970-01-01 01:00:00.000000000 +0100 -+++ b/lib/sradix-tree.c 2018-05-26 19:30:55.786140408 +0100 -@@ -0,0 +1,476 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node) -+{ -+ return node->fulls == root->stores_size || -+ (node->height == 1 && node->count == root->stores_size); -+} -+ -+/* -+ * Extend a sradix tree so it can store key @index. 
-+ */
-+static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index)
-+{
-+	struct sradix_tree_node *node;
-+	unsigned int height;
-+
-+	if (unlikely(root->rnode == NULL)) {
-+		if (!(node = root->alloc()))
-+			return -ENOMEM;
-+
-+		node->height = 1;
-+		root->rnode = node;
-+		root->height = 1;
-+	}
-+
-+	/* Figure out what the height should be. */
-+	height = root->height;
-+	index >>= root->shift * height;
-+
-+	while (index) {
-+		index >>= root->shift;
-+		height++;
-+	}
-+
-+	while (height > root->height) {
-+		unsigned int newheight;
-+
-+		if (!(node = root->alloc()))
-+			return -ENOMEM;
-+
-+		/* Increase the height. */
-+		node->stores[0] = root->rnode;
-+		root->rnode->parent = node;
-+		if (root->extend)
-+			root->extend(node, root->rnode);
-+
-+		newheight = root->height + 1;
-+		node->height = newheight;
-+		node->count = 1;
-+		if (sradix_node_full(root, root->rnode))
-+			node->fulls = 1;
-+
-+		root->rnode = node;
-+		root->height = newheight;
-+	}
-+
-+	return 0;
-+}
-+
-+/*
-+ * Search for the next item from the current node that is not NULL
-+ * and can satisfy root->iter().
-+ */
-+void *sradix_tree_next(struct sradix_tree_root *root,
-+		       struct sradix_tree_node *node, unsigned long index,
-+		       int (*iter)(void *item, unsigned long height))
-+{
-+	unsigned long offset;
-+	void *item;
-+
-+	if (unlikely(node == NULL)) {
-+		node = root->rnode;
-+		for (offset = 0; offset < root->stores_size; offset++) {
-+			item = node->stores[offset];
-+			if (item && (!iter || iter(item, node->height)))
-+				break;
-+		}
-+
-+		if (unlikely(offset >= root->stores_size))
-+			return NULL;
-+
-+		if (node->height == 1)
-+			return item;
-+		else
-+			goto go_down;
-+	}
-+
-+	while (node) {
-+		offset = (index & root->mask) + 1;
-+		for (; offset < root->stores_size; offset++) {
-+			item = node->stores[offset];
-+			if (item && (!iter || iter(item, node->height)))
-+				break;
-+		}
-+
-+		if (offset < root->stores_size)
-+			break;
-+
-+		node = node->parent;
-+		index >>= root->shift;
-+	}
-+
-+	if (!node)
-+		return NULL;
-+
-+	while (node->height > 1) {
-+go_down:
-+		node = item;
-+		for (offset = 0; offset < root->stores_size; offset++) {
-+			item = node->stores[offset];
-+			if (item && (!iter || iter(item, node->height)))
-+				break;
-+		}
-+
-+		if (unlikely(offset >= root->stores_size))
-+			return NULL;
-+	}
-+
-+	BUG_ON(offset > root->stores_size);
-+
-+	return item;
-+}
-+
-+/*
-+ * Blindly insert the item into the tree. Typically, we reuse the
-+ * first empty store item.
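-+ *
-+ * A sketch of the descent (assuming shift == 8 and height == 3):
-+ * index 0x123456 walks through offsets 0x12, 0x34 and 0x56, since at
-+ * each level offset = (index >> shift) & root->mask. Fully occupied
-+ * subtrees are recognized by their ->fulls count and skipped, while
-+ * root->min caches the lowest index that may still be free.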
-+ */ -+int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num) -+{ -+ unsigned long index; -+ unsigned int height; -+ struct sradix_tree_node *node, *tmp = NULL; -+ int offset, offset_saved; -+ void **store = NULL; -+ int error, i, j, shift; -+ -+go_on: -+ index = root->min; -+ -+ if (root->enter_node && !sradix_node_full(root, root->enter_node)) { -+ node = root->enter_node; -+ BUG_ON((index >> (root->shift * root->height))); -+ } else { -+ node = root->rnode; -+ if (node == NULL || (index >> (root->shift * root->height)) -+ || sradix_node_full(root, node)) { -+ error = sradix_tree_extend(root, index); -+ if (error) -+ return error; -+ -+ node = root->rnode; -+ } -+ } -+ -+ -+ height = node->height; -+ shift = (height - 1) * root->shift; -+ offset = (index >> shift) & root->mask; -+ while (shift > 0) { -+ offset_saved = offset; -+ for (; offset < root->stores_size; offset++) { -+ store = &node->stores[offset]; -+ tmp = *store; -+ -+ if (!tmp || !sradix_node_full(root, tmp)) -+ break; -+ } -+ BUG_ON(offset >= root->stores_size); -+ -+ if (offset != offset_saved) { -+ index += (offset - offset_saved) << shift; -+ index &= ~((1UL << shift) - 1); -+ } -+ -+ if (!tmp) { -+ if (!(tmp = root->alloc())) -+ return -ENOMEM; -+ -+ tmp->height = shift / root->shift; -+ *store = tmp; -+ tmp->parent = node; -+ node->count++; -+// if (root->extend) -+// root->extend(node, tmp); -+ } -+ -+ node = tmp; -+ shift -= root->shift; -+ offset = (index >> shift) & root->mask; -+ } -+ -+ BUG_ON(node->height != 1); -+ -+ -+ store = &node->stores[offset]; -+ for (i = 0, j = 0; -+ j < root->stores_size - node->count && -+ i < root->stores_size - offset && j < num; i++) { -+ if (!store[i]) { -+ store[i] = item[j]; -+ if (root->assign) -+ root->assign(node, index + i, item[j]); -+ j++; -+ } -+ } -+ -+ node->count += j; -+ root->num += j; -+ num -= j; -+ -+ while (sradix_node_full(root, node)) { -+ node = node->parent; -+ if (!node) -+ break; -+ -+ node->fulls++; -+ } -+ -+ if (unlikely(!node)) { -+ /* All nodes are full */ -+ root->min = 1 << (root->height * root->shift); -+ root->enter_node = NULL; -+ } else { -+ root->min = index + i - 1; -+ root->min |= (1UL << (node->height - 1)) - 1; -+ root->min++; -+ root->enter_node = node; -+ } -+ -+ if (num) { -+ item += j; -+ goto go_on; -+ } -+ -+ return 0; -+} -+ -+ -+/** -+ * sradix_tree_shrink - shrink height of a sradix tree to minimal -+ * @root sradix tree root -+ * -+ */ -+static inline void sradix_tree_shrink(struct sradix_tree_root *root) -+{ -+ /* try to shrink tree height */ -+ while (root->height > 1) { -+ struct sradix_tree_node *to_free = root->rnode; -+ -+ /* -+ * The candidate node has more than one child, or its child -+ * is not at the leftmost store, we cannot shrink. 
-+ */ -+ if (to_free->count != 1 || !to_free->stores[0]) -+ break; -+ -+ root->rnode = to_free->stores[0]; -+ root->rnode->parent = NULL; -+ root->height--; -+ if (unlikely(root->enter_node == to_free)) -+ root->enter_node = NULL; -+ root->free(to_free); -+ } -+} -+ -+/* -+ * Del the item on the known leaf node and index -+ */ -+void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, -+ struct sradix_tree_node *node, unsigned long index) -+{ -+ unsigned int offset; -+ struct sradix_tree_node *start, *end; -+ -+ BUG_ON(node->height != 1); -+ -+ start = node; -+ while (node && !(--node->count)) -+ node = node->parent; -+ -+ end = node; -+ if (!node) { -+ root->rnode = NULL; -+ root->height = 0; -+ root->min = 0; -+ root->num = 0; -+ root->enter_node = NULL; -+ } else { -+ offset = (index >> (root->shift * (node->height - 1))) & root->mask; -+ if (root->rm) -+ root->rm(node, offset); -+ node->stores[offset] = NULL; -+ root->num--; -+ if (root->min > index) { -+ root->min = index; -+ root->enter_node = node; -+ } -+ } -+ -+ if (start != end) { -+ do { -+ node = start; -+ start = start->parent; -+ if (unlikely(root->enter_node == node)) -+ root->enter_node = end; -+ root->free(node); -+ } while (start != end); -+ -+ /* -+ * Note that shrink may free "end", so enter_node still need to -+ * be checked inside. -+ */ -+ sradix_tree_shrink(root); -+ } else if (node->count == root->stores_size - 1) { -+ /* It WAS a full leaf node. Update the ancestors */ -+ node = node->parent; -+ while (node) { -+ node->fulls--; -+ if (node->fulls != root->stores_size - 1) -+ break; -+ -+ node = node->parent; -+ } -+ } -+} -+ -+void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index) -+{ -+ unsigned int height, offset; -+ struct sradix_tree_node *node; -+ int shift; -+ -+ node = root->rnode; -+ if (node == NULL || (index >> (root->shift * root->height))) -+ return NULL; -+ -+ height = root->height; -+ shift = (height - 1) * root->shift; -+ -+ do { -+ offset = (index >> shift) & root->mask; -+ node = node->stores[offset]; -+ if (!node) -+ return NULL; -+ -+ shift -= root->shift; -+ } while (shift >= 0); -+ -+ return node; -+} -+ -+/* -+ * Return the item if it exists, otherwise create it in place -+ * and return the created item. -+ */ -+void *sradix_tree_lookup_create(struct sradix_tree_root *root, -+ unsigned long index, void *(*item_alloc)(void)) -+{ -+ unsigned int height, offset; -+ struct sradix_tree_node *node, *tmp; -+ void *item; -+ int shift, error; -+ -+ if (root->rnode == NULL || (index >> (root->shift * root->height))) { -+ if (item_alloc) { -+ error = sradix_tree_extend(root, index); -+ if (error) -+ return NULL; -+ } else { -+ return NULL; -+ } -+ } -+ -+ node = root->rnode; -+ height = root->height; -+ shift = (height - 1) * root->shift; -+ -+ do { -+ offset = (index >> shift) & root->mask; -+ if (!node->stores[offset]) { -+ if (!(tmp = root->alloc())) -+ return NULL; -+ -+ tmp->height = shift / root->shift; -+ node->stores[offset] = tmp; -+ tmp->parent = node; -+ node->count++; -+ node = tmp; -+ } else { -+ node = node->stores[offset]; -+ } -+ -+ shift -= root->shift; -+ } while (shift > 0); -+ -+ BUG_ON(node->height != 1); -+ offset = index & root->mask; -+ if (node->stores[offset]) { -+ return node->stores[offset]; -+ } else if (item_alloc) { -+ if (!(item = item_alloc())) -+ return NULL; -+ -+ node->stores[offset] = item; -+ -+ /* -+ * NOTE: we do NOT call root->assign here, since this item is -+ * newly created by us having no meaning. 
Caller can call this
-+	 * if it's necessary to do so.
-+	 */
-+
-+		node->count++;
-+		root->num++;
-+
-+		while (sradix_node_full(root, node)) {
-+			node = node->parent;
-+			if (!node)
-+				break;
-+
-+			node->fulls++;
-+		}
-+
-+		if (unlikely(!node)) {
-+			/* All nodes are full */
-+			root->min = 1 << (root->height * root->shift);
-+		} else {
-+			if (root->min == index) {
-+				root->min |= (1UL << (node->height - 1)) - 1;
-+				root->min++;
-+				root->enter_node = node;
-+			}
-+		}
-+
-+		return item;
-+	} else {
-+		return NULL;
-+	}
-+
-+}
-+
-+int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index)
-+{
-+	unsigned int height, offset;
-+	struct sradix_tree_node *node;
-+	int shift;
-+
-+	node = root->rnode;
-+	if (node == NULL || (index >> (root->shift * root->height)))
-+		return -ENOENT;
-+
-+	height = root->height;
-+	shift = (height - 1) * root->shift;
-+
-+	do {
-+		offset = (index >> shift) & root->mask;
-+		node = node->stores[offset];
-+		if (!node)
-+			return -ENOENT;
-+
-+		shift -= root->shift;
-+	} while (shift > 0);
-+
-+	offset = index & root->mask;
-+	if (!node->stores[offset])
-+		return -ENOENT;
-+
-+	sradix_tree_delete_from_leaf(root, node, index);
-+
-+	return 0;
-+}
-diff -Nur a/mm/Kconfig b/mm/Kconfig
---- a/mm/Kconfig	2018-05-26 19:24:34.846783391 +0100
-+++ b/mm/Kconfig	2018-05-26 19:30:55.786140408 +0100
-@@ -315,6 +315,32 @@
- 	  See Documentation/vm/ksm.txt for more information: KSM is inactive
- 	  until a program has madvised that an area is MADV_MERGEABLE, and
- 	  root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
-+choice
-+	prompt "Choose UKSM/KSM strategy"
-+	default UKSM
-+	depends on KSM
-+	help
-+	  This option allows you to select a UKSM/KSM strategy.
-+
-+config UKSM
-+	bool "Ultra-KSM for page merging"
-+	depends on KSM
-+	help
-+	UKSM is inspired by the Linux kernel project KSM (Kernel Samepage
-+	Merging), but with a fundamentally rewritten core algorithm. With
-+	this advanced algorithm, UKSM can now transparently scan all anonymously
-+	mapped user space applications with significantly improved scan speed
-+	and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from
-+	UKSM. UKSM now has its first stable release and its first real-world
-+	enterprise user. For more information, please go to its project page.
-+	(www.kerneldedup.org)
-+
-+config KSM_LEGACY
-+	bool "Legacy KSM implementation"
-+	depends on KSM
-+	help
-+	  The legacy KSM implementation from Red Hat.
-+endchoice - - config DEFAULT_MMAP_MIN_ADDR - int "Low address space to protect from user allocation" -diff -Nur a/mm/Makefile b/mm/Makefile ---- a/mm/Makefile 2018-05-25 15:18:02.000000000 +0100 -+++ b/mm/Makefile 2018-05-26 19:30:55.786140408 +0100 -@@ -65,7 +65,8 @@ - obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o - obj-$(CONFIG_SLOB) += slob.o - obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o --obj-$(CONFIG_KSM) += ksm.o -+obj-$(CONFIG_KSM_LEGACY) += ksm.o -+obj-$(CONFIG_UKSM) += uksm.o - obj-$(CONFIG_PAGE_POISONING) += page_poison.o - obj-$(CONFIG_SLAB) += slab.o - obj-$(CONFIG_SLUB) += slub.o -diff -Nur a/mm/memory.c b/mm/memory.c ---- a/mm/memory.c 2018-05-25 15:18:02.000000000 +0100 -+++ b/mm/memory.c 2018-05-26 19:30:55.787140441 +0100 -@@ -129,6 +129,25 @@ - - unsigned long highest_memmap_pfn __read_mostly; - -+#ifdef CONFIG_UKSM -+unsigned long uksm_zero_pfn __read_mostly; -+EXPORT_SYMBOL_GPL(uksm_zero_pfn); -+struct page *empty_uksm_zero_page; -+ -+static int __init setup_uksm_zero_page(void) -+{ -+ empty_uksm_zero_page = alloc_pages(__GFP_ZERO & ~__GFP_MOVABLE, 0); -+ if (!empty_uksm_zero_page) -+ panic("Oh boy, that early out of memory?"); -+ -+ SetPageReserved(empty_uksm_zero_page); -+ uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page); -+ -+ return 0; -+} -+core_initcall(setup_uksm_zero_page); -+#endif -+ - /* - * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() - */ -@@ -140,6 +159,7 @@ - core_initcall(init_zero_pfn); - - -+ - #if defined(SPLIT_RSS_COUNTING) - - void sync_mm_rss(struct mm_struct *mm) -@@ -1035,6 +1055,9 @@ - get_page(page); - page_dup_rmap(page, false); - rss[mm_counter(page)]++; -+ -+ /* Should return NULL in vm_normal_page() */ -+ uksm_bugon_zeropage(pte); - } else if (pte_devmap(pte)) { - page = pte_page(pte); - -@@ -1048,6 +1071,8 @@ - page_dup_rmap(page, false); - rss[mm_counter(page)]++; - } -+ } else { -+ uksm_map_zero_page(pte); - } - - out_set_pte: -@@ -1317,8 +1342,10 @@ - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); - tlb_remove_tlb_entry(tlb, pte, addr); -- if (unlikely(!page)) -+ if (unlikely(!page)) { -+ uksm_unmap_zero_page(ptent); - continue; -+ } - - if (!PageAnon(page)) { - if (pte_dirty(ptent)) { -@@ -2318,8 +2345,10 @@ - clear_page(kaddr); - kunmap_atomic(kaddr); - flush_dcache_page(dst); -- } else -+ } else { - copy_user_highpage(dst, src, va, vma); -+ uksm_cow_page(vma, src); -+ } - } - - static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) -@@ -2468,6 +2497,7 @@ - vmf->address); - if (!new_page) - goto oom; -+ uksm_cow_pte(vma, vmf->orig_pte); - } else { - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, - vmf->address); -@@ -2494,7 +2524,9 @@ - mm_counter_file(old_page)); - inc_mm_counter_fast(mm, MM_ANONPAGES); - } -+ uksm_bugon_zeropage(vmf->orig_pte); - } else { -+ uksm_unmap_zero_page(vmf->orig_pte); - inc_mm_counter_fast(mm, MM_ANONPAGES); - } - flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); -diff -Nur a/mm/mmap.c b/mm/mmap.c ---- a/mm/mmap.c 2018-05-26 19:24:34.847783423 +0100 -+++ b/mm/mmap.c 2018-05-26 19:30:55.788140473 +0100 -@@ -45,6 +45,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -173,6 +174,7 @@ - if (vma->vm_file) - fput(vma->vm_file); - mpol_put(vma_policy(vma)); -+ uksm_remove_vma(vma); - kmem_cache_free(vm_area_cachep, vma); - return next; - } -@@ -699,9 +701,16 @@ - long adjust_next = 0; - int remove_next = 0; - -+/* -+ * to avoid deadlock, ksm_remove_vma must be done before any spin_lock is -+ * acquired -+ */ -+ 
uksm_remove_vma(vma); -+ - if (next && !insert) { - struct vm_area_struct *exporter = NULL, *importer = NULL; - -+ uksm_remove_vma(next); - if (end >= next->vm_end) { - /* - * vma expands, overlapping all the next, and -@@ -834,6 +843,7 @@ - end_changed = true; - } - vma->vm_pgoff = pgoff; -+ - if (adjust_next) { - next->vm_start += adjust_next << PAGE_SHIFT; - next->vm_pgoff += adjust_next; -@@ -939,6 +949,7 @@ - if (remove_next == 2) { - remove_next = 1; - end = next->vm_end; -+ uksm_remove_vma(next); - goto again; - } - else if (next) -@@ -965,10 +976,14 @@ - */ - VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma)); - } -+ } else { -+ if (next && !insert) -+ uksm_vma_add_new(next); - } - if (insert && file) - uprobe_mmap(insert); - -+ uksm_vma_add_new(vma); - validate_mm(mm); - - return 0; -@@ -1385,6 +1400,9 @@ - vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | - mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; - -+ /* If uksm is enabled, we add VM_MERGEABLE to new VMAs. */ -+ uksm_vm_flags_mod(&vm_flags); -+ - if (flags & MAP_LOCKED) - if (!can_do_mlock()) - return -EPERM; -@@ -1724,6 +1742,7 @@ - allow_write_access(file); - } - file = vma->vm_file; -+ uksm_vma_add_new(vma); - out: - perf_event_mmap(vma); - -@@ -1765,6 +1784,7 @@ - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); - free_vma: -+ uksm_remove_vma(vma); - kmem_cache_free(vm_area_cachep, vma); - unacct_error: - if (charged) -@@ -2589,6 +2609,8 @@ - else - err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); - -+ uksm_vma_add_new(new); -+ - /* Success. */ - if (!err) - return 0; -@@ -2881,6 +2903,7 @@ - if ((flags & (~VM_EXEC)) != 0) - return -EINVAL; - flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; -+ uksm_vm_flags_mod(&flags); - - error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); - if (offset_in_page(error)) -@@ -2938,6 +2961,7 @@ - vma->vm_flags = flags; - vma->vm_page_prot = vm_get_page_prot(flags); - vma_link(mm, vma, prev, rb_link, rb_parent); -+ uksm_vma_add_new(vma); - out: - perf_event_mmap(vma); - mm->total_vm += len >> PAGE_SHIFT; -@@ -3015,6 +3039,12 @@ - up_write(&mm->mmap_sem); - } - -+ /* -+ * Taking write lock on mmap_sem does not harm others, -+ * but it's crucial for uksm to avoid races. -+ */ -+ down_write(&mm->mmap_sem); -+ - if (mm->locked_vm) { - vma = mm->mmap; - while (vma) { -@@ -3049,6 +3079,11 @@ - vma = remove_vma(vma); - } - vm_unacct_memory(nr_accounted); -+ -+ mm->mmap = NULL; -+ mm->mm_rb = RB_ROOT; -+ vmacache_invalidate(mm); -+ up_write(&mm->mmap_sem); - } - - /* Insert vm structure into process list sorted by address -@@ -3158,6 +3193,7 @@ - new_vma->vm_ops->open(new_vma); - vma_link(mm, new_vma, prev, rb_link, rb_parent); - *need_rmap_locks = false; -+ uksm_vma_add_new(new_vma); - } - return new_vma; - -@@ -3308,6 +3344,7 @@ - vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); - - perf_event_mmap(vma); -+ uksm_vma_add_new(vma); - - return vma; - -diff -Nur a/mm/rmap.c b/mm/rmap.c ---- a/mm/rmap.c 2018-05-25 15:18:02.000000000 +0100 -+++ b/mm/rmap.c 2018-05-26 19:30:55.788140473 +0100 -@@ -1013,9 +1013,9 @@ - - /** - * __page_set_anon_rmap - set up new anonymous rmap -- * @page: Page to add to rmap -+ * @page: Page to add to rmap - * @vma: VM area to add page to. 
-- * @address: User virtual address of the mapping
-+ * @address:	User virtual address of the mapping
-  * @exclusive:	the page is exclusively owned by the current process
-  */
-  static void __page_set_anon_rmap(struct page *page,
-diff -Nur a/mm/uksm.c b/mm/uksm.c
---- a/mm/uksm.c	1970-01-01 01:00:00.000000000 +0100
-+++ b/mm/uksm.c	2018-05-26 19:30:55.791140570 +0100
-@@ -0,0 +1,5584 @@
-+/*
-+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
-+ *
-+ * This is an improvement upon KSM. Some basic data structures and routines
-+ * are borrowed from ksm.c .
-+ *
-+ * Its new features:
-+ * 1. Full system scan:
-+ *      It automatically scans all user processes' anonymous VMAs. Kernel-user
-+ *      interaction to submit a memory area to KSM is no longer needed.
-+ *
-+ * 2. Rich area detection:
-+ *      It automatically detects rich areas containing abundant duplicated
-+ *      pages. Rich areas are given a full scan speed. Poor areas are
-+ *      sampled at a reasonable speed with very low CPU consumption.
-+ *
-+ * 3. Ultra Per-page scan speed improvement:
-+ *      A new hash algorithm is proposed. As a result, on a machine with
-+ *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it
-+ *      can scan memory areas that do not contain duplicated pages at a speed of
-+ *      627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
-+ *      477MB/sec ~ 923MB/sec.
-+ *
-+ * 4. Thrashing area avoidance:
-+ *      A thrashing area (a VMA that has frequent KSM page break-outs) can be
-+ *      filtered out. My benchmark shows it's more efficient than KSM's per-page
-+ *      hash-value-based volatile page detection.
-+ *
-+ *
-+ * 5. Misc changes upon KSM:
-+ *      * It has a fully x86-optimized memcmp dedicated for 4-byte-aligned page
-+ *        comparison. It's much faster than the default C version on x86.
-+ *      * rmap_item now has a struct page *page member to loosely cache an
-+ *        address-->page mapping, which avoids many time-costly
-+ *        follow_page() calls.
-+ *      * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
-+ *      * try_to_merge_two_pages() now can revert a pte if it fails. No break_
-+ *        ksm is needed for this case.
-+ *
-+ * 6. Full Zero Page consideration (contributed by Figo Zhang)
-+ *      Now uksmd considers full zero pages as special pages and merges them into
-+ *      a special unswappable uksm zero page.
-+ */
-+
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+#include
-+
-+#include
-+#include "internal.h"
-+
-+#ifdef CONFIG_X86
-+#undef memcmp
-+
-+#ifdef CONFIG_X86_32
-+#define memcmp memcmpx86_32
-+/*
-+ * Compare 4-byte-aligned addresses s1 and s2, with length n
-+ */
-+int memcmpx86_32(void *s1, void *s2, size_t n)
-+{
-+	size_t num = n / 4;
-+	register int res;
-+
-+	__asm__ __volatile__
-+	(
-+	 "testl %3,%3\n\t"
-+	 "repe; cmpsd\n\t"
-+	 "je 1f\n\t"
-+	 "sbbl %0,%0\n\t"
-+	 "orl $1,%0\n"
-+	 "1:"
-+	 : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
-+	 : "0" (0)
-+	 : "cc");
-+
-+	return res;
-+}
-+
-+/*
-+ * Check whether the page is all zero.
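-+ * (How the asm below works: "repe; scasl" compares successive 32-bit
-+ * words of the page against %eax == 0; the Z flag survives the loop
-+ * only if every word matched, and "sete" turns that flag into the
-+ * boolean return value.)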
-+ */ -+static int is_full_zero(const void *s1, size_t len) -+{ -+ unsigned char same; -+ -+ len /= 4; -+ -+ __asm__ __volatile__ -+ ("repe; scasl;" -+ "sete %0" -+ : "=qm" (same), "+D" (s1), "+c" (len) -+ : "a" (0) -+ : "cc"); -+ -+ return same; -+} -+ -+ -+#elif defined(CONFIG_X86_64) -+#define memcmp memcmpx86_64 -+/* -+ * Compare 8-byte-aligned address s1 and s2, with length n -+ */ -+int memcmpx86_64(void *s1, void *s2, size_t n) -+{ -+ size_t num = n / 8; -+ register int res; -+ -+ __asm__ __volatile__ -+ ( -+ "testq %q3,%q3\n\t" -+ "repe; cmpsq\n\t" -+ "je 1f\n\t" -+ "sbbq %q0,%q0\n\t" -+ "orq $1,%q0\n" -+ "1:" -+ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) -+ : "0" (0) -+ : "cc"); -+ -+ return res; -+} -+ -+static int is_full_zero(const void *s1, size_t len) -+{ -+ unsigned char same; -+ -+ len /= 8; -+ -+ __asm__ __volatile__ -+ ("repe; scasq;" -+ "sete %0" -+ : "=qm" (same), "+D" (s1), "+c" (len) -+ : "a" (0) -+ : "cc"); -+ -+ return same; -+} -+ -+#endif -+#else -+static int is_full_zero(const void *s1, size_t len) -+{ -+ unsigned long *src = s1; -+ int i; -+ -+ len /= sizeof(*src); -+ -+ for (i = 0; i < len; i++) { -+ if (src[i]) -+ return 0; -+ } -+ -+ return 1; -+} -+#endif -+ -+#define UKSM_RUNG_ROUND_FINISHED (1 << 0) -+#define TIME_RATIO_SCALE 10000 -+ -+#define SLOT_TREE_NODE_SHIFT 8 -+#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT) -+struct slot_tree_node { -+ unsigned long size; -+ struct sradix_tree_node snode; -+ void *stores[SLOT_TREE_NODE_STORE_SIZE]; -+}; -+ -+static struct kmem_cache *slot_tree_node_cachep; -+ -+static struct sradix_tree_node *slot_tree_node_alloc(void) -+{ -+ struct slot_tree_node *p; -+ -+ p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL | -+ __GFP_NORETRY | __GFP_NOWARN); -+ if (!p) -+ return NULL; -+ -+ return &p->snode; -+} -+ -+static void slot_tree_node_free(struct sradix_tree_node *node) -+{ -+ struct slot_tree_node *p; -+ -+ p = container_of(node, struct slot_tree_node, snode); -+ kmem_cache_free(slot_tree_node_cachep, p); -+} -+ -+static void slot_tree_node_extend(struct sradix_tree_node *parent, -+ struct sradix_tree_node *child) -+{ -+ struct slot_tree_node *p, *c; -+ -+ p = container_of(parent, struct slot_tree_node, snode); -+ c = container_of(child, struct slot_tree_node, snode); -+ -+ p->size += c->size; -+} -+ -+void slot_tree_node_assign(struct sradix_tree_node *node, -+ unsigned int index, void *item) -+{ -+ struct vma_slot *slot = item; -+ struct slot_tree_node *cur; -+ -+ slot->snode = node; -+ slot->sindex = index; -+ -+ while (node) { -+ cur = container_of(node, struct slot_tree_node, snode); -+ cur->size += slot->pages; -+ node = node->parent; -+ } -+} -+ -+void slot_tree_node_rm(struct sradix_tree_node *node, unsigned int offset) -+{ -+ struct vma_slot *slot; -+ struct slot_tree_node *cur; -+ unsigned long pages; -+ -+ if (node->height == 1) { -+ slot = node->stores[offset]; -+ pages = slot->pages; -+ } else { -+ cur = container_of(node->stores[offset], -+ struct slot_tree_node, snode); -+ pages = cur->size; -+ } -+ -+ while (node) { -+ cur = container_of(node, struct slot_tree_node, snode); -+ cur->size -= pages; -+ node = node->parent; -+ } -+} -+ -+unsigned long slot_iter_index; -+int slot_iter(void *item, unsigned long height) -+{ -+ struct slot_tree_node *node; -+ struct vma_slot *slot; -+ -+ if (height == 1) { -+ slot = item; -+ if (slot_iter_index < slot->pages) { -+ /*in this one*/ -+ return 1; -+ } else { -+ slot_iter_index -= slot->pages; -+ return 0; -+ } -+ -+ } else { -+ node = 
container_of(item, struct slot_tree_node, snode);
-+		if (slot_iter_index < node->size) {
-+			/*in this one*/
-+			return 1;
-+		} else {
-+			slot_iter_index -= node->size;
-+			return 0;
-+		}
-+	}
-+}
-+
-+
-+static inline void slot_tree_init_root(struct sradix_tree_root *root)
-+{
-+	init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT);
-+	root->alloc = slot_tree_node_alloc;
-+	root->free = slot_tree_node_free;
-+	root->extend = slot_tree_node_extend;
-+	root->assign = slot_tree_node_assign;
-+	root->rm = slot_tree_node_rm;
-+}
-+
-+void slot_tree_init(void)
-+{
-+	slot_tree_node_cachep = kmem_cache_create("slot_tree_node",
-+			sizeof(struct slot_tree_node), 0,
-+			SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
-+			NULL);
-+}
-+
-+
-+/* Each rung of this ladder is a list of VMAs having the same scan ratio */
-+struct scan_rung {
-+	//struct list_head scanned_list;
-+	struct sradix_tree_root vma_root;
-+	struct sradix_tree_root vma_root2;
-+
-+	struct vma_slot *current_scan;
-+	unsigned long current_offset;
-+
-+	/*
-+	 * The initial value for current_offset, it should loop over
-+	 * [0~ step - 1] to let all slots have their chance to be scanned.
-+	 */
-+	unsigned long offset_init;
-+	unsigned long step; /* dynamic step for current_offset */
-+	unsigned int flags;
-+	unsigned long pages_to_scan;
-+	//unsigned long fully_scanned_slots;
-+	/*
-+	 * a little bit tricky - if cpu_time_ratio > 0, then the value is
-+	 * the cpu time ratio it can spend in rung_i for every scan
-+	 * period. if < 0, then it is the cpu time ratio relative to the
-+	 * max cpu percentage user specified. Both in unit of
-+	 * 1/TIME_RATIO_SCALE
-+	 */
-+	int cpu_ratio;
-+
-+	/*
-+	 * How long it will take for all slots in this rung to be fully
-+	 * scanned? If it's zero, we don't care about the cover time:
-+	 * it's fully scanned.
-+	 */
-+	unsigned int cover_msecs;
-+	//unsigned long vma_num;
-+	//unsigned long pages;  /* Sum of all slot's pages in rung */
-+};
-+
-+/**
-+ * node of either the stable or unstable rbtree
-+ *
-+ */
-+struct tree_node {
-+	struct rb_node node; /* link in the main (un)stable rbtree */
-+	struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
-+	u32 hash;
-+	unsigned long count; /* TODO: merged with sub_root */
-+	struct list_head all_list; /* all tree nodes in stable/unstable tree */
-+};
-+
-+/**
-+ * struct stable_node - node of the stable rbtree
-+ * @node: rb node of this ksm page in the stable tree
-+ * @hlist: hlist head of rmap_items using this ksm page
-+ * @kpfn: page frame number of this ksm page
-+ */
-+struct stable_node {
-+	struct rb_node node; /* link in sub-rbtree */
-+	struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */
-+	struct hlist_head hlist;
-+	unsigned long kpfn;
-+	u32 hash_max; /* if ==0 then it's not been calculated yet */
-+	struct list_head all_list; /* in a list for all stable nodes */
-+};
-+
-+/**
-+ * struct node_vma - group rmap_items linked in the same stable
-+ * node together.
-+ */ -+struct node_vma { -+ union { -+ struct vma_slot *slot; -+ unsigned long key; /* slot is used as key sorted on hlist */ -+ }; -+ struct hlist_node hlist; -+ struct hlist_head rmap_hlist; -+ struct stable_node *head; -+}; -+ -+/** -+ * struct rmap_item - reverse mapping item for virtual addresses -+ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list -+ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree -+ * @mm: the memory structure this rmap_item is pointing into -+ * @address: the virtual address this rmap_item tracks (+ flags in low bits) -+ * @node: rb node of this rmap_item in the unstable tree -+ * @head: pointer to stable_node heading this list in the stable tree -+ * @hlist: link into hlist of rmap_items hanging off that stable_node -+ */ -+struct rmap_item { -+ struct vma_slot *slot; -+ struct page *page; -+ unsigned long address; /* + low bits used for flags below */ -+ unsigned long hash_round; -+ unsigned long entry_index; -+ union { -+ struct {/* when in unstable tree */ -+ struct rb_node node; -+ struct tree_node *tree_node; -+ u32 hash_max; -+ }; -+ struct { /* when in stable tree */ -+ struct node_vma *head; -+ struct hlist_node hlist; -+ struct anon_vma *anon_vma; -+ }; -+ }; -+} __aligned(4); -+ -+struct rmap_list_entry { -+ union { -+ struct rmap_item *item; -+ unsigned long addr; -+ }; -+ /* lowest bit is used for is_addr tag */ -+} __aligned(4); /* 4 aligned to fit in to pages*/ -+ -+ -+/* Basic data structure definition ends */ -+ -+ -+/* -+ * Flags for rmap_item to judge if it's listed in the stable/unstable tree. -+ * The flags use the low bits of rmap_item.address -+ */ -+#define UNSTABLE_FLAG 0x1 -+#define STABLE_FLAG 0x2 -+#define get_rmap_addr(x) ((x)->address & PAGE_MASK) -+ -+/* -+ * rmap_list_entry helpers -+ */ -+#define IS_ADDR_FLAG 1 -+#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG) -+#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG) -+#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG)) -+ -+ -+/* -+ * High speed caches for frequently allocated and freed structs -+ */ -+static struct kmem_cache *rmap_item_cache; -+static struct kmem_cache *stable_node_cache; -+static struct kmem_cache *node_vma_cache; -+static struct kmem_cache *vma_slot_cache; -+static struct kmem_cache *tree_node_cache; -+#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\ -+ sizeof(struct __struct), __alignof__(struct __struct),\ -+ (__flags), NULL) -+ -+/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */ -+#define SCAN_LADDER_SIZE 4 -+static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE]; -+ -+/* The evaluation rounds uksmd has finished */ -+static unsigned long long uksm_eval_round = 1; -+ -+/* -+ * we add 1 to this var when we consider we should rebuild the whole -+ * unstable tree. -+ */ -+static unsigned long uksm_hash_round = 1; -+ -+/* -+ * How many times the whole memory is scanned. 
-+ */
-+static unsigned long long fully_scanned_round = 1;
-+
-+/* The total number of virtual pages of all vma slots */
-+static u64 uksm_pages_total;
-+
-+/* The number of pages scanned since startup */
-+static u64 uksm_pages_scanned;
-+
-+static u64 scanned_virtual_pages;
-+
-+/* The number of pages scanned since the last encode_benefit call */
-+static u64 uksm_pages_scanned_last;
-+
-+/* If the scanned number is too large, we encode it here */
-+static u64 pages_scanned_stored;
-+
-+static unsigned long pages_scanned_base;
-+
-+/* The number of nodes in the stable tree */
-+static unsigned long uksm_pages_shared;
-+
-+/* The number of page slots additionally sharing those nodes */
-+static unsigned long uksm_pages_sharing;
-+
-+/* The number of nodes in the unstable tree */
-+static unsigned long uksm_pages_unshared;
-+
-+/*
-+ * Milliseconds ksmd should sleep between scans,
-+ * >= 100ms to be consistent with
-+ * scan_time_to_sleep_msec()
-+ */
-+static unsigned int uksm_sleep_jiffies;
-+
-+/* The real value for the uksmd next sleep */
-+static unsigned int uksm_sleep_real;
-+
-+/* Saved value for user input uksm_sleep_jiffies when it's enlarged */
-+static unsigned int uksm_sleep_saved;
-+
-+/* Max percentage of cpu utilization ksmd can take to scan in one batch */
-+static unsigned int uksm_max_cpu_percentage;
-+
-+static int uksm_cpu_governor;
-+
-+static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" };
-+
-+struct uksm_cpu_preset_s {
-+	int cpu_ratio[SCAN_LADDER_SIZE];
-+	unsigned int cover_msecs[SCAN_LADDER_SIZE];
-+	unsigned int max_cpu; /* percentage */
-+};
-+
-+struct uksm_cpu_preset_s uksm_cpu_preset[4] = {
-+	{ {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95},
-+	{ {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50},
-+	{ {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20},
-+	{ {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1},
-+};
-+
-+/* The default value for uksm_ema_page_time if it's not initialized */
-+#define UKSM_PAGE_TIME_DEFAULT	500
-+
-+/* cost to scan one page, tracked as an exponential moving average, in nsecs */
-+static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
-+
-+/* The exponential moving average alpha weight, in percentage. */
-+#define EMA_ALPHA	20
-+
-+/*
-+ * The threshold used to filter out thrashing areas.
-+ * If it == 0, filtering is disabled, otherwise it's the percentage up-bound
-+ * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio
-+ * will be considered as having a zero duplication ratio.
-+ */
-+static unsigned int uksm_thrash_threshold = 50;
-+
-+/* How high the dedup ratio must be to be considered abundant */
-+static unsigned int uksm_abundant_threshold = 10;
-+
-+/* All slots having merged pages in this eval round. */
-+struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup);
-+
-+/* How many times the ksmd has slept since startup */
-+static unsigned long long uksm_sleep_times;
-+
-+#define UKSM_RUN_STOP	0
-+#define UKSM_RUN_MERGE	1
-+static unsigned int uksm_run = 1;
-+
-+static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
-+static DEFINE_MUTEX(uksm_thread_mutex);
-+
-+/*
-+ * List vma_slot_new is for newly created vma_slot waiting to be added by
-+ * ksmd. If one cannot be added (e.g. because it's too small), it's moved to
-+ * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding
-+ * VMA has been removed/freed.
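-+ *
-+ * The resulting slot lifecycle, sketched: a new VMA is hooked into
-+ * vma_slot_new; ksmd either adopts the slot for scanning or parks it
-+ * on vma_slot_noadd; when the VMA dies, an adopted slot is moved to
-+ * vma_slot_del for ksmd to free later, while an unadopted one is
-+ * freed on the spot (see uksm_remove_vma()).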
-+ */ -+struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new); -+struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd); -+struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del); -+static DEFINE_SPINLOCK(vma_slot_list_lock); -+ -+/* The unstable tree heads */ -+static struct rb_root root_unstable_tree = RB_ROOT; -+ -+/* -+ * All tree_nodes are in a list to be freed at once when unstable tree is -+ * freed after each scan round. -+ */ -+static struct list_head unstable_tree_node_list = -+ LIST_HEAD_INIT(unstable_tree_node_list); -+ -+/* List contains all stable nodes */ -+static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list); -+ -+/* -+ * When the hash strength is changed, the stable tree must be delta_hashed and -+ * re-structured. We use two set of below structs to speed up the -+ * re-structuring of stable tree. -+ */ -+static struct list_head -+stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]), -+ LIST_HEAD_INIT(stable_tree_node_list[1])}; -+ -+static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0]; -+static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT}; -+static struct rb_root *root_stable_treep = &root_stable_tree[0]; -+static unsigned long stable_tree_index; -+ -+/* The hash strength needed to hash a full page */ -+#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32)) -+ -+/* The hash strength needed for loop-back hashing */ -+#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10) -+ -+/* The random offsets in a page */ -+static u32 *random_nums; -+ -+/* The hash strength */ -+static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4; -+ -+/* The delta value each time the hash strength increases or decreases */ -+static unsigned long hash_strength_delta; -+#define HASH_STRENGTH_DELTA_MAX 5 -+ -+/* The time we have saved due to random_sample_hash */ -+static u64 rshash_pos; -+ -+/* The time we have wasted due to hash collision */ -+static u64 rshash_neg; -+ -+struct uksm_benefit { -+ u64 pos; -+ u64 neg; -+ u64 scanned; -+ unsigned long base; -+} benefit; -+ -+/* -+ * The relative cost of memcmp, compared to 1 time unit of random sample -+ * hash, this value is tested when ksm module is initialized -+ */ -+static unsigned long memcmp_cost; -+ -+static unsigned long rshash_neg_cont_zero; -+static unsigned long rshash_cont_obscure; -+ -+/* The possible states of hash strength adjustment heuristic */ -+enum rshash_states { -+ RSHASH_STILL, -+ RSHASH_TRYUP, -+ RSHASH_TRYDOWN, -+ RSHASH_NEW, -+ RSHASH_PRE_STILL, -+}; -+ -+/* The possible direction we are about to adjust hash strength */ -+enum rshash_direct { -+ GO_UP, -+ GO_DOWN, -+ OBSCURE, -+ STILL, -+}; -+ -+/* random sampling hash state machine */ -+static struct { -+ enum rshash_states state; -+ enum rshash_direct pre_direct; -+ u8 below_count; -+ /* Keep a lookup window of size 5, iff above_count/below_count > 3 -+ * in this window we stop trying. 
-+ */ -+ u8 lookup_window_index; -+ u64 stable_benefit; -+ unsigned long turn_point_down; -+ unsigned long turn_benefit_down; -+ unsigned long turn_point_up; -+ unsigned long turn_benefit_up; -+ unsigned long stable_point; -+} rshash_state; -+ -+/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/ -+static u32 *zero_hash_table; -+ -+static inline struct node_vma *alloc_node_vma(void) -+{ -+ struct node_vma *node_vma; -+ -+ node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL | -+ __GFP_NORETRY | __GFP_NOWARN); -+ if (node_vma) { -+ INIT_HLIST_HEAD(&node_vma->rmap_hlist); -+ INIT_HLIST_NODE(&node_vma->hlist); -+ } -+ return node_vma; -+} -+ -+static inline void free_node_vma(struct node_vma *node_vma) -+{ -+ kmem_cache_free(node_vma_cache, node_vma); -+} -+ -+ -+static inline struct vma_slot *alloc_vma_slot(void) -+{ -+ struct vma_slot *slot; -+ -+ /* -+ * In case ksm is not initialized by now. -+ * Oops, we need to consider the call site of uksm_init() in the future. -+ */ -+ if (!vma_slot_cache) -+ return NULL; -+ -+ slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL | -+ __GFP_NORETRY | __GFP_NOWARN); -+ if (slot) { -+ INIT_LIST_HEAD(&slot->slot_list); -+ INIT_LIST_HEAD(&slot->dedup_list); -+ slot->flags |= UKSM_SLOT_NEED_RERAND; -+ } -+ return slot; -+} -+ -+static inline void free_vma_slot(struct vma_slot *vma_slot) -+{ -+ kmem_cache_free(vma_slot_cache, vma_slot); -+} -+ -+ -+ -+static inline struct rmap_item *alloc_rmap_item(void) -+{ -+ struct rmap_item *rmap_item; -+ -+ rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | -+ __GFP_NORETRY | __GFP_NOWARN); -+ if (rmap_item) { -+ /* bug on lowest bit is not clear for flag use */ -+ BUG_ON(is_addr(rmap_item)); -+ } -+ return rmap_item; -+} -+ -+static inline void free_rmap_item(struct rmap_item *rmap_item) -+{ -+ rmap_item->slot = NULL; /* debug safety */ -+ kmem_cache_free(rmap_item_cache, rmap_item); -+} -+ -+static inline struct stable_node *alloc_stable_node(void) -+{ -+ struct stable_node *node; -+ -+ node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | -+ __GFP_NORETRY | __GFP_NOWARN); -+ if (!node) -+ return NULL; -+ -+ INIT_HLIST_HEAD(&node->hlist); -+ list_add(&node->all_list, &stable_node_list); -+ return node; -+} -+ -+static inline void free_stable_node(struct stable_node *stable_node) -+{ -+ list_del(&stable_node->all_list); -+ kmem_cache_free(stable_node_cache, stable_node); -+} -+ -+static inline struct tree_node *alloc_tree_node(struct list_head *list) -+{ -+ struct tree_node *node; -+ -+ node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | -+ __GFP_NORETRY | __GFP_NOWARN); -+ if (!node) -+ return NULL; -+ -+ list_add(&node->all_list, list); -+ return node; -+} -+ -+static inline void free_tree_node(struct tree_node *node) -+{ -+ list_del(&node->all_list); -+ kmem_cache_free(tree_node_cache, node); -+} -+ -+static void uksm_drop_anon_vma(struct rmap_item *rmap_item) -+{ -+ struct anon_vma *anon_vma = rmap_item->anon_vma; -+ -+ put_anon_vma(anon_vma); -+} -+ -+ -+/** -+ * Remove a stable node from stable_tree, may unlink from its tree_node and -+ * may remove its parent tree_node if no other stable node is pending. -+ * -+ * @stable_node The node need to be removed -+ * @unlink_rb Will this node be unlinked from the rbtree? -+ * @remove_tree_ node Will its tree_node be removed if empty? 
-+ */ -+static void remove_node_from_stable_tree(struct stable_node *stable_node, -+ int unlink_rb, int remove_tree_node) -+{ -+ struct node_vma *node_vma; -+ struct rmap_item *rmap_item; -+ struct hlist_node *n; -+ -+ if (!hlist_empty(&stable_node->hlist)) { -+ hlist_for_each_entry_safe(node_vma, n, -+ &stable_node->hlist, hlist) { -+ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { -+ uksm_pages_sharing--; -+ -+ uksm_drop_anon_vma(rmap_item); -+ rmap_item->address &= PAGE_MASK; -+ } -+ free_node_vma(node_vma); -+ cond_resched(); -+ } -+ -+ /* the last one is counted as shared */ -+ uksm_pages_shared--; -+ uksm_pages_sharing++; -+ } -+ -+ if (stable_node->tree_node && unlink_rb) { -+ rb_erase(&stable_node->node, -+ &stable_node->tree_node->sub_root); -+ -+ if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) && -+ remove_tree_node) { -+ rb_erase(&stable_node->tree_node->node, -+ root_stable_treep); -+ free_tree_node(stable_node->tree_node); -+ } else { -+ stable_node->tree_node->count--; -+ } -+ } -+ -+ free_stable_node(stable_node); -+} -+ -+ -+/* -+ * get_uksm_page: checks if the page indicated by the stable node -+ * is still its ksm page, despite having held no reference to it. -+ * In which case we can trust the content of the page, and it -+ * returns the gotten page; but if the page has now been zapped, -+ * remove the stale node from the stable tree and return NULL. -+ * -+ * You would expect the stable_node to hold a reference to the ksm page. -+ * But if it increments the page's count, swapping out has to wait for -+ * ksmd to come around again before it can free the page, which may take -+ * seconds or even minutes: much too unresponsive. So instead we use a -+ * "keyhole reference": access to the ksm page from the stable node peeps -+ * out through its keyhole to see if that page still holds the right key, -+ * pointing back to this stable node. This relies on freeing a PageAnon -+ * page to reset its page->mapping to NULL, and relies on no other use of -+ * a page to put something that might look like our key in page->mapping. -+ * -+ * include/linux/pagemap.h page_cache_get_speculative() is a good reference, -+ * but this is different - made simpler by uksm_thread_mutex being held, but -+ * interesting for assuming that no other use of the struct page could ever -+ * put our expected_mapping into page->mapping (or a field of the union which -+ * coincides with page->mapping). The RCU calls are not for KSM at all, but -+ * to keep the page_count protocol described with page_cache_get_speculative. -+ * -+ * Note: it is possible that get_uksm_page() will return NULL one moment, -+ * then page the next, if the page is in between page_freeze_refs() and -+ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page -+ * is on its way to being freed; but it is an anomaly to bear in mind. -+ * -+ * @unlink_rb: if the removal of this node will firstly unlink from -+ * its rbtree. stable_node_reinsert will prevent this when restructuring the -+ * node from its old tree. -+ * -+ * @remove_tree_node: if this is the last one of its tree_node, will the -+ * tree_node be freed ? If we are inserting stable node, this tree_node may -+ * be reused, so don't free it. 
-+ */ -+static struct page *get_uksm_page(struct stable_node *stable_node, -+ int unlink_rb, int remove_tree_node) -+{ -+ struct page *page; -+ void *expected_mapping; -+ unsigned long kpfn; -+ -+ expected_mapping = (void *)((unsigned long)stable_node | -+ PAGE_MAPPING_KSM); -+again: -+ kpfn = READ_ONCE(stable_node->kpfn); -+ page = pfn_to_page(kpfn); -+ -+ /* -+ * page is computed from kpfn, so on most architectures reading -+ * page->mapping is naturally ordered after reading node->kpfn, -+ * but on Alpha we need to be more careful. -+ */ -+ smp_read_barrier_depends(); -+ -+ if (READ_ONCE(page->mapping) != expected_mapping) -+ goto stale; -+ -+ /* -+ * We cannot do anything with the page while its refcount is 0. -+ * Usually 0 means free, or tail of a higher-order page: in which -+ * case this node is no longer referenced, and should be freed; -+ * however, it might mean that the page is under page_freeze_refs(). -+ * The __remove_mapping() case is easy, again the node is now stale; -+ * but if page is swapcache in migrate_page_move_mapping(), it might -+ * still be our page, in which case it's essential to keep the node. -+ */ -+ while (!get_page_unless_zero(page)) { -+ /* -+ * Another check for page->mapping != expected_mapping would -+ * work here too. We have chosen the !PageSwapCache test to -+ * optimize the common case, when the page is or is about to -+ * be freed: PageSwapCache is cleared (under spin_lock_irq) -+ * in the freeze_refs section of __remove_mapping(); but Anon -+ * page->mapping reset to NULL later, in free_pages_prepare(). -+ */ -+ if (!PageSwapCache(page)) -+ goto stale; -+ cpu_relax(); -+ } -+ -+ if (READ_ONCE(page->mapping) != expected_mapping) { -+ put_page(page); -+ goto stale; -+ } -+ -+ lock_page(page); -+ if (READ_ONCE(page->mapping) != expected_mapping) { -+ unlock_page(page); -+ put_page(page); -+ goto stale; -+ } -+ unlock_page(page); -+ return page; -+stale: -+ /* -+ * We come here from above when page->mapping or !PageSwapCache -+ * suggests that the node is stale; but it might be under migration. -+ * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(), -+ * before checking whether node->kpfn has been changed. -+ */ -+ smp_rmb(); -+ if (stable_node->kpfn != kpfn) -+ goto again; -+ -+ remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node); -+ -+ return NULL; -+} -+ -+/* -+ * Removing rmap_item from stable or unstable tree. -+ * This function will clean the information from the stable/unstable tree. -+ */ -+static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item) -+{ -+ if (rmap_item->address & STABLE_FLAG) { -+ struct stable_node *stable_node; -+ struct node_vma *node_vma; -+ struct page *page; -+ -+ node_vma = rmap_item->head; -+ stable_node = node_vma->head; -+ page = get_uksm_page(stable_node, 1, 1); -+ if (!page) -+ goto out; -+ -+ /* -+ * page lock is needed because it's racing with -+ * try_to_unmap_ksm(), etc. -+ */ -+ lock_page(page); -+ hlist_del(&rmap_item->hlist); -+ -+ if (hlist_empty(&node_vma->rmap_hlist)) { -+ hlist_del(&node_vma->hlist); -+ free_node_vma(node_vma); -+ } -+ unlock_page(page); -+ -+ put_page(page); -+ if (hlist_empty(&stable_node->hlist)) { -+ /* do NOT call remove_node_from_stable_tree() here, -+ * it's possible for a forked rmap_item not in -+ * stable tree while the in-tree rmap_items were -+ * deleted. 
-+ */ -+ uksm_pages_shared--; -+ } else -+ uksm_pages_sharing--; -+ -+ -+ uksm_drop_anon_vma(rmap_item); -+ } else if (rmap_item->address & UNSTABLE_FLAG) { -+ if (rmap_item->hash_round == uksm_hash_round) { -+ -+ rb_erase(&rmap_item->node, -+ &rmap_item->tree_node->sub_root); -+ if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) { -+ rb_erase(&rmap_item->tree_node->node, -+ &root_unstable_tree); -+ -+ free_tree_node(rmap_item->tree_node); -+ } else -+ rmap_item->tree_node->count--; -+ } -+ uksm_pages_unshared--; -+ } -+ -+ rmap_item->address &= PAGE_MASK; -+ rmap_item->hash_max = 0; -+ -+out: -+ cond_resched(); /* we're called from many long loops */ -+} -+ -+static inline int slot_in_uksm(struct vma_slot *slot) -+{ -+ return list_empty(&slot->slot_list); -+} -+ -+/* -+ * Test if the mm is exiting -+ */ -+static inline bool uksm_test_exit(struct mm_struct *mm) -+{ -+ return atomic_read(&mm->mm_users) == 0; -+} -+ -+static inline unsigned long vma_pool_size(struct vma_slot *slot) -+{ -+ return round_up(sizeof(struct rmap_list_entry) * slot->pages, -+ PAGE_SIZE) >> PAGE_SHIFT; -+} -+ -+#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta)) -+ -+/* must be done with sem locked */ -+static int slot_pool_alloc(struct vma_slot *slot) -+{ -+ unsigned long pool_size; -+ -+ if (slot->rmap_list_pool) -+ return 0; -+ -+ pool_size = vma_pool_size(slot); -+ slot->rmap_list_pool = kcalloc(pool_size, sizeof(struct page *), -+ GFP_KERNEL); -+ if (!slot->rmap_list_pool) -+ return -ENOMEM; -+ -+ slot->pool_counts = kcalloc(pool_size, sizeof(unsigned int), -+ GFP_KERNEL); -+ if (!slot->pool_counts) { -+ kfree(slot->rmap_list_pool); -+ return -ENOMEM; -+ } -+ -+ slot->pool_size = pool_size; -+ BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages)); -+ slot->flags |= UKSM_SLOT_IN_UKSM; -+ uksm_pages_total += slot->pages; -+ -+ return 0; -+} -+ -+/* -+ * Called after vma is unlinked from its mm -+ */ -+void uksm_remove_vma(struct vm_area_struct *vma) -+{ -+ struct vma_slot *slot; -+ -+ if (!vma->uksm_vma_slot) -+ return; -+ -+ spin_lock(&vma_slot_list_lock); -+ slot = vma->uksm_vma_slot; -+ if (!slot) -+ goto out; -+ -+ if (slot_in_uksm(slot)) { -+ /** -+ * This slot has been added by ksmd, so move to the del list -+ * waiting ksmd to free it. -+ */ -+ list_add_tail(&slot->slot_list, &vma_slot_del); -+ } else { -+ /** -+ * It's still on new list. It's ok to free slot directly. -+ */ -+ list_del(&slot->slot_list); -+ free_vma_slot(slot); -+ } -+out: -+ vma->uksm_vma_slot = NULL; -+ spin_unlock(&vma_slot_list_lock); -+} -+ -+/** -+ * Need to do two things: -+ * 1. check if slot was moved to del list -+ * 2. make sure the mmap_sem is manipulated under valid vma. -+ * -+ * My concern here is that in some cases, this may make -+ * vma_slot_list_lock() waiters to serialized further by some -+ * sem->wait_lock, can this really be expensive? -+ * -+ * -+ * @return -+ * 0: if successfully locked mmap_sem -+ * -ENOENT: this slot was moved to del list -+ * -EBUSY: vma lock failed -+ */ -+static int try_down_read_slot_mmap_sem(struct vma_slot *slot) -+{ -+ struct vm_area_struct *vma; -+ struct mm_struct *mm; -+ struct rw_semaphore *sem; -+ -+ spin_lock(&vma_slot_list_lock); -+ -+ /* the slot_list was removed and inited from new list, when it enters -+ * uksm_list. 
If now it's not empty, then it must be moved to del list -+ */ -+ if (!slot_in_uksm(slot)) { -+ spin_unlock(&vma_slot_list_lock); -+ return -ENOENT; -+ } -+ -+ BUG_ON(slot->pages != vma_pages(slot->vma)); -+ /* Ok, vma still valid */ -+ vma = slot->vma; -+ mm = vma->vm_mm; -+ sem = &mm->mmap_sem; -+ -+ if (uksm_test_exit(mm)) { -+ spin_unlock(&vma_slot_list_lock); -+ return -ENOENT; -+ } -+ -+ if (down_read_trylock(sem)) { -+ spin_unlock(&vma_slot_list_lock); -+ if (slot_pool_alloc(slot)) { -+ uksm_remove_vma(vma); -+ up_read(sem); -+ return -ENOENT; -+ } -+ return 0; -+ } -+ -+ spin_unlock(&vma_slot_list_lock); -+ return -EBUSY; -+} -+ -+static inline unsigned long -+vma_page_address(struct page *page, struct vm_area_struct *vma) -+{ -+ pgoff_t pgoff = page->index; -+ unsigned long address; -+ -+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); -+ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { -+ /* page should be within @vma mapping range */ -+ return -EFAULT; -+ } -+ return address; -+} -+ -+ -+/* return 0 on success with the item's mmap_sem locked */ -+static inline int get_mergeable_page_lock_mmap(struct rmap_item *item) -+{ -+ struct mm_struct *mm; -+ struct vma_slot *slot = item->slot; -+ int err = -EINVAL; -+ -+ struct page *page; -+ -+ /* -+ * try_down_read_slot_mmap_sem() returns non-zero if the slot -+ * has been removed by uksm_remove_vma(). -+ */ -+ if (try_down_read_slot_mmap_sem(slot)) -+ return -EBUSY; -+ -+ mm = slot->vma->vm_mm; -+ -+ if (uksm_test_exit(mm)) -+ goto failout_up; -+ -+ page = item->page; -+ rcu_read_lock(); -+ if (!get_page_unless_zero(page)) { -+ rcu_read_unlock(); -+ goto failout_up; -+ } -+ -+ /* No need to consider huge page here. */ -+ if (item->slot->vma->anon_vma != page_anon_vma(page) || -+ vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) { -+ /* -+ * TODO: -+ * should we release this item becase of its stale page -+ * mapping? -+ */ -+ put_page(page); -+ rcu_read_unlock(); -+ goto failout_up; -+ } -+ rcu_read_unlock(); -+ return 0; -+ -+failout_up: -+ up_read(&mm->mmap_sem); -+ return err; -+} -+ -+/* -+ * What kind of VMA is considered ? -+ */ -+static inline int vma_can_enter(struct vm_area_struct *vma) -+{ -+ return uksm_flags_can_scan(vma->vm_flags); -+} -+ -+/* -+ * Called whenever a fresh new vma is created A new vma_slot. -+ * is created and inserted into a global list Must be called. -+ * after vma is inserted to its mm. 
-+ */ -+void uksm_vma_add_new(struct vm_area_struct *vma) -+{ -+ struct vma_slot *slot; -+ -+ if (!vma_can_enter(vma)) { -+ vma->uksm_vma_slot = NULL; -+ return; -+ } -+ -+ slot = alloc_vma_slot(); -+ if (!slot) { -+ vma->uksm_vma_slot = NULL; -+ return; -+ } -+ -+ vma->uksm_vma_slot = slot; -+ vma->vm_flags |= VM_MERGEABLE; -+ slot->vma = vma; -+ slot->mm = vma->vm_mm; -+ slot->ctime_j = jiffies; -+ slot->pages = vma_pages(vma); -+ spin_lock(&vma_slot_list_lock); -+ list_add_tail(&slot->slot_list, &vma_slot_new); -+ spin_unlock(&vma_slot_list_lock); -+} -+ -+/* 32/3 < they < 32/2 */ -+#define shiftl 8 -+#define shiftr 12 -+ -+#define HASH_FROM_TO(from, to) \ -+for (index = from; index < to; index++) { \ -+ pos = random_nums[index]; \ -+ hash += key[pos]; \ -+ hash += (hash << shiftl); \ -+ hash ^= (hash >> shiftr); \ -+} -+ -+ -+#define HASH_FROM_DOWN_TO(from, to) \ -+for (index = from - 1; index >= to; index--) { \ -+ hash ^= (hash >> shiftr); \ -+ hash ^= (hash >> (shiftr*2)); \ -+ hash -= (hash << shiftl); \ -+ hash += (hash << (shiftl*2)); \ -+ pos = random_nums[index]; \ -+ hash -= key[pos]; \ -+} -+ -+/* -+ * The main random sample hash function. -+ */ -+static u32 random_sample_hash(void *addr, u32 hash_strength) -+{ -+ u32 hash = 0xdeadbeef; -+ int index, pos, loop = hash_strength; -+ u32 *key = (u32 *)addr; -+ -+ if (loop > HASH_STRENGTH_FULL) -+ loop = HASH_STRENGTH_FULL; -+ -+ HASH_FROM_TO(0, loop); -+ -+ if (hash_strength > HASH_STRENGTH_FULL) { -+ loop = hash_strength - HASH_STRENGTH_FULL; -+ HASH_FROM_TO(0, loop); -+ } -+ -+ return hash; -+} -+ -+ -+/** -+ * It's used when hash strength is adjusted -+ * -+ * @addr The page's virtual address -+ * @from The original hash strength -+ * @to The hash strength changed to -+ * @hash The hash value generated with "from" hash value -+ * -+ * return the hash value -+ */ -+static u32 delta_hash(void *addr, int from, int to, u32 hash) -+{ -+ u32 *key = (u32 *)addr; -+ int index, pos; /* make sure they are int type */ -+ -+ if (to > from) { -+ if (from >= HASH_STRENGTH_FULL) { -+ from -= HASH_STRENGTH_FULL; -+ to -= HASH_STRENGTH_FULL; -+ HASH_FROM_TO(from, to); -+ } else if (to <= HASH_STRENGTH_FULL) { -+ HASH_FROM_TO(from, to); -+ } else { -+ HASH_FROM_TO(from, HASH_STRENGTH_FULL); -+ HASH_FROM_TO(0, to - HASH_STRENGTH_FULL); -+ } -+ } else { -+ if (from <= HASH_STRENGTH_FULL) { -+ HASH_FROM_DOWN_TO(from, to); -+ } else if (to >= HASH_STRENGTH_FULL) { -+ from -= HASH_STRENGTH_FULL; -+ to -= HASH_STRENGTH_FULL; -+ HASH_FROM_DOWN_TO(from, to); -+ } else { -+ HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0); -+ HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to); -+ } -+ } -+ -+ return hash; -+} -+ -+/** -+ * -+ * Called when: rshash_pos or rshash_neg is about to overflow or a scan round -+ * has finished. -+ * -+ * return 0 if no page has been scanned since last call, 1 otherwise. 
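-+ *
-+ * Numeric sketch: all three accumulators share benefit.base, so with
-+ * base == 2 each new delta is pre-shifted right by two bits, i.e.
-+ * accounted in units of four pages. When an addition would overflow,
-+ * pos, neg and scanned are all halved and base is bumped, keeping the
-+ * pos/neg/scanned ratios mutually comparable.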
-+ */ -+static inline int encode_benefit(void) -+{ -+ u64 scanned_delta, pos_delta, neg_delta; -+ unsigned long base = benefit.base; -+ -+ scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last; -+ -+ if (!scanned_delta) -+ return 0; -+ -+ scanned_delta >>= base; -+ pos_delta = rshash_pos >> base; -+ neg_delta = rshash_neg >> base; -+ -+ if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) || -+ CAN_OVERFLOW_U64(benefit.neg, neg_delta) || -+ CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) { -+ benefit.scanned >>= 1; -+ benefit.neg >>= 1; -+ benefit.pos >>= 1; -+ benefit.base++; -+ scanned_delta >>= 1; -+ pos_delta >>= 1; -+ neg_delta >>= 1; -+ } -+ -+ benefit.pos += pos_delta; -+ benefit.neg += neg_delta; -+ benefit.scanned += scanned_delta; -+ -+ BUG_ON(!benefit.scanned); -+ -+ rshash_pos = rshash_neg = 0; -+ uksm_pages_scanned_last = uksm_pages_scanned; -+ -+ return 1; -+} -+ -+static inline void reset_benefit(void) -+{ -+ benefit.pos = 0; -+ benefit.neg = 0; -+ benefit.base = 0; -+ benefit.scanned = 0; -+} -+ -+static inline void inc_rshash_pos(unsigned long delta) -+{ -+ if (CAN_OVERFLOW_U64(rshash_pos, delta)) -+ encode_benefit(); -+ -+ rshash_pos += delta; -+} -+ -+static inline void inc_rshash_neg(unsigned long delta) -+{ -+ if (CAN_OVERFLOW_U64(rshash_neg, delta)) -+ encode_benefit(); -+ -+ rshash_neg += delta; -+} -+ -+ -+static inline u32 page_hash(struct page *page, unsigned long hash_strength, -+ int cost_accounting) -+{ -+ u32 val; -+ unsigned long delta; -+ -+ void *addr = kmap_atomic(page); -+ -+ val = random_sample_hash(addr, hash_strength); -+ kunmap_atomic(addr); -+ -+ if (cost_accounting) { -+ if (hash_strength < HASH_STRENGTH_FULL) -+ delta = HASH_STRENGTH_FULL - hash_strength; -+ else -+ delta = 0; -+ -+ inc_rshash_pos(delta); -+ } -+ -+ return val; -+} -+ -+static int memcmp_pages(struct page *page1, struct page *page2, -+ int cost_accounting) -+{ -+ char *addr1, *addr2; -+ int ret; -+ -+ addr1 = kmap_atomic(page1); -+ addr2 = kmap_atomic(page2); -+ ret = memcmp(addr1, addr2, PAGE_SIZE); -+ kunmap_atomic(addr2); -+ kunmap_atomic(addr1); -+ -+ if (cost_accounting) -+ inc_rshash_neg(memcmp_cost); -+ -+ return ret; -+} -+ -+static inline int pages_identical(struct page *page1, struct page *page2) -+{ -+ return !memcmp_pages(page1, page2, 0); -+} -+ -+static inline int is_page_full_zero(struct page *page) -+{ -+ char *addr; -+ int ret; -+ -+ addr = kmap_atomic(page); -+ ret = is_full_zero(addr, PAGE_SIZE); -+ kunmap_atomic(addr); -+ -+ return ret; -+} -+ -+static int write_protect_page(struct vm_area_struct *vma, struct page *page, -+ pte_t *orig_pte, pte_t *old_pte) -+{ -+ struct mm_struct *mm = vma->vm_mm; -+ struct page_vma_mapped_walk pvmw = { -+ .page = page, -+ .vma = vma, -+ }; -+ int swapped; -+ int err = -EFAULT; -+ unsigned long mmun_start; /* For mmu_notifiers */ -+ unsigned long mmun_end; /* For mmu_notifiers */ -+ -+ pvmw.address = page_address_in_vma(page, vma); -+ if (pvmw.address == -EFAULT) -+ goto out; -+ -+ BUG_ON(PageTransCompound(page)); -+ -+ mmun_start = pvmw.address; -+ mmun_end = pvmw.address + PAGE_SIZE; -+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); -+ -+ if (!page_vma_mapped_walk(&pvmw)) -+ goto out_mn; -+ if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) -+ goto out_unlock; -+ -+ if (old_pte) -+ *old_pte = *pvmw.pte; -+ -+ if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || -+ (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || mm_tlb_flush_pending(mm)) { -+ pte_t entry; -+ -+ swapped = PageSwapCache(page); -+ 
flush_cache_page(vma, pvmw.address, page_to_pfn(page)); -+ /* -+ * Ok, this is tricky: when get_user_pages_fast() runs, it doesn't -+ * take any lock, therefore the check that we are going to make -+ * with the pagecount against the mapcount is racy and -+ * O_DIRECT can happen right after the check. -+ * So we clear the pte and flush the tlb before the check; -+ * this assures us that no O_DIRECT can happen after the check -+ * or in the middle of the check. -+ */ -+ entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); -+ /* -+ * Check that no O_DIRECT or similar I/O is in progress on the -+ * page -+ */ -+ if (page_mapcount(page) + 1 + swapped != page_count(page)) { -+ set_pte_at(mm, pvmw.address, pvmw.pte, entry); -+ goto out_unlock; -+ } -+ if (pte_dirty(entry)) -+ set_page_dirty(page); -+ -+ if (pte_protnone(entry)) -+ entry = pte_mkclean(pte_clear_savedwrite(entry)); -+ else -+ entry = pte_mkclean(pte_wrprotect(entry)); -+ -+ set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); -+ } -+ *orig_pte = *pvmw.pte; -+ err = 0; -+ -+out_unlock: -+ page_vma_mapped_walk_done(&pvmw); -+out_mn: -+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); -+out: -+ return err; -+} -+ -+#define MERGE_ERR_PGERR 1 /* the page is invalid, cannot continue */ -+#define MERGE_ERR_COLLI 2 /* there is a collision */ -+#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */ -+#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */ -+ -+ -+/** -+ * replace_page - replace page in vma by new ksm page -+ * @vma: vma that holds the pte pointing to page -+ * @page: the page we are replacing by kpage -+ * @kpage: the ksm page we replace page by -+ * @orig_pte: the original value of the pte -+ * -+ * Returns 0 on success, MERGE_ERR_PGERR on failure.
-+ */ -+static int replace_page(struct vm_area_struct *vma, struct page *page, -+ struct page *kpage, pte_t orig_pte) -+{ -+ struct mm_struct *mm = vma->vm_mm; -+ pgd_t *pgd; -+ p4d_t *p4d; -+ pud_t *pud; -+ pmd_t *pmd; -+ pte_t *ptep; -+ spinlock_t *ptl; -+ pte_t entry; -+ -+ unsigned long addr; -+ int err = MERGE_ERR_PGERR; -+ unsigned long mmun_start; /* For mmu_notifiers */ -+ unsigned long mmun_end; /* For mmu_notifiers */ -+ -+ addr = page_address_in_vma(page, vma); -+ if (addr == -EFAULT) -+ goto out; -+ -+ pgd = pgd_offset(mm, addr); -+ if (!pgd_present(*pgd)) -+ goto out; -+ -+ p4d = p4d_offset(pgd, addr); -+ pud = pud_offset(p4d, addr); -+ if (!pud_present(*pud)) -+ goto out; -+ -+ pmd = pmd_offset(pud, addr); -+ BUG_ON(pmd_trans_huge(*pmd)); -+ if (!pmd_present(*pmd)) -+ goto out; -+ -+ mmun_start = addr; -+ mmun_end = addr + PAGE_SIZE; -+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); -+ -+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); -+ if (!pte_same(*ptep, orig_pte)) { -+ pte_unmap_unlock(ptep, ptl); -+ goto out_mn; -+ } -+ -+ flush_cache_page(vma, addr, pte_pfn(*ptep)); -+ ptep_clear_flush_notify(vma, addr, ptep); -+ entry = mk_pte(kpage, vma->vm_page_prot); -+ -+ /* special treatment is needed for zero_page */ -+ if ((page_to_pfn(kpage) == uksm_zero_pfn) || -+ (page_to_pfn(kpage) == zero_pfn)) { -+ entry = pte_mkspecial(entry); -+ dec_mm_counter(mm, MM_ANONPAGES); -+ inc_zone_page_state(page, NR_UKSM_ZERO_PAGES); -+ } else { -+ get_page(kpage); -+ page_add_anon_rmap(kpage, vma, addr, false); -+ } -+ -+ set_pte_at_notify(mm, addr, ptep, entry); -+ -+ page_remove_rmap(page, false); -+ if (!page_mapped(page)) -+ try_to_free_swap(page); -+ put_page(page); -+ -+ pte_unmap_unlock(ptep, ptl); -+ err = 0; -+out_mn: -+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); -+out: -+ return err; -+} -+ -+ -+/** -+ * Fully hash a page with HASH_STRENGTH_MAX return a non-zero hash value. The -+ * zero hash value at HASH_STRENGTH_MAX is used to indicated that its -+ * hash_max member has not been calculated. -+ * -+ * @page The page needs to be hashed -+ * @hash_old The hash value calculated with current hash strength -+ * -+ * return the new hash value calculated at HASH_STRENGTH_MAX -+ */ -+static inline u32 page_hash_max(struct page *page, u32 hash_old) -+{ -+ u32 hash_max = 0; -+ void *addr; -+ -+ addr = kmap_atomic(page); -+ hash_max = delta_hash(addr, hash_strength, -+ HASH_STRENGTH_MAX, hash_old); -+ -+ kunmap_atomic(addr); -+ -+ if (!hash_max) -+ hash_max = 1; -+ -+ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); -+ return hash_max; -+} -+ -+/* -+ * We compare the hash again, to ensure that it is really a hash collision -+ * instead of being caused by page write. -+ */ -+static inline int check_collision(struct rmap_item *rmap_item, -+ u32 hash) -+{ -+ int err; -+ struct page *page = rmap_item->page; -+ -+ /* if this rmap_item has already been hash_maxed, then the collision -+ * must appears in the second-level rbtree search. In this case we check -+ * if its hash_max value has been changed. Otherwise, the collision -+ * happens in the first-level rbtree search, so we check against it's -+ * current hash value. 
-+ */ -+ if (rmap_item->hash_max) { -+ inc_rshash_neg(memcmp_cost); -+ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); -+ -+ if (rmap_item->hash_max == page_hash_max(page, hash)) -+ err = MERGE_ERR_COLLI; -+ else -+ err = MERGE_ERR_CHANGED; -+ } else { -+ inc_rshash_neg(memcmp_cost + hash_strength); -+ -+ if (page_hash(page, hash_strength, 0) == hash) -+ err = MERGE_ERR_COLLI; -+ else -+ err = MERGE_ERR_CHANGED; -+ } -+ -+ return err; -+} -+ -+/** -+ * Try to merge a rmap_item.page with a kpage in stable node. kpage must -+ * already be a ksm page. -+ * -+ * @return 0 if the pages were merged, -EFAULT otherwise. -+ */ -+static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item, -+ struct page *kpage, u32 hash) -+{ -+ struct vm_area_struct *vma = rmap_item->slot->vma; -+ struct mm_struct *mm = vma->vm_mm; -+ pte_t orig_pte = __pte(0); -+ int err = MERGE_ERR_PGERR; -+ struct page *page; -+ -+ if (uksm_test_exit(mm)) -+ goto out; -+ -+ page = rmap_item->page; -+ -+ if (page == kpage) { /* ksm page forked */ -+ err = 0; -+ goto out; -+ } -+ -+ /* -+ * We need the page lock to read a stable PageSwapCache in -+ * write_protect_page(). We use trylock_page() instead of -+ * lock_page() because we don't want to wait here - we -+ * prefer to continue scanning and merging different pages, -+ * then come back to this page when it is unlocked. -+ */ -+ if (!trylock_page(page)) -+ goto out; -+ -+ if (!PageAnon(page) || !PageKsm(kpage)) -+ goto out_unlock; -+ -+ if (PageTransCompound(page)) { -+ err = split_huge_page(page); -+ if (err) -+ goto out_unlock; -+ } -+ -+ /* -+ * If this anonymous page is mapped only here, its pte may need -+ * to be write-protected. If it's mapped elsewhere, all of its -+ * ptes are necessarily already write-protected. But in either -+ * case, we need to lock and check page_count is not raised. -+ */ -+ if (write_protect_page(vma, page, &orig_pte, NULL) == 0) { -+ if (pages_identical(page, kpage)) -+ err = replace_page(vma, page, kpage, orig_pte); -+ else -+ err = check_collision(rmap_item, hash); -+ } -+ -+ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { -+ munlock_vma_page(page); -+ if (!PageMlocked(kpage)) { -+ unlock_page(page); -+ lock_page(kpage); -+ mlock_vma_page(kpage); -+ page = kpage; /* for final unlock */ -+ } -+ } -+ -+out_unlock: -+ unlock_page(page); -+out: -+ return err; -+} -+ -+ -+ -+/** -+ * If two pages fail to merge in try_to_merge_two_pages, then we have a chance -+ * to restore a page mapping that has been changed in try_to_merge_two_pages. -+ * -+ * @return 0 on success. -+ */ -+static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr, -+ pte_t orig_pte, pte_t wprt_pte) -+{ -+ struct mm_struct *mm = vma->vm_mm; -+ pgd_t *pgd; -+ p4d_t *p4d; -+ pud_t *pud; -+ pmd_t *pmd; -+ pte_t *ptep; -+ spinlock_t *ptl; -+ -+ int err = -EFAULT; -+ -+ pgd = pgd_offset(mm, addr); -+ if (!pgd_present(*pgd)) -+ goto out; -+ -+ p4d = p4d_offset(pgd, addr); -+ pud = pud_offset(p4d, addr); -+ if (!pud_present(*pud)) -+ goto out; -+ -+ pmd = pmd_offset(pud, addr); -+ if (!pmd_present(*pmd)) -+ goto out; -+ -+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); -+ if (!pte_same(*ptep, wprt_pte)) { -+ /* already copied, let it be */ -+ pte_unmap_unlock(ptep, ptl); -+ goto out; -+ } -+ -+ /* -+ * Good boy, still here. When we still get the ksm page, it does not -+ * return to the free page pool, there is no way that a pte was changed -+ * to other page and gets back to this page. And remind that ksm page -+ * do not reuse in do_wp_page(). 
So it's safe to restore the original -+ * pte. -+ */ -+ flush_cache_page(vma, addr, pte_pfn(*ptep)); -+ ptep_clear_flush_notify(vma, addr, ptep); -+ set_pte_at_notify(mm, addr, ptep, orig_pte); -+ -+ pte_unmap_unlock(ptep, ptl); -+ err = 0; -+out: -+ return err; -+} -+ -+/** -+ * try_to_merge_two_pages() - take two identical pages and prepare -+ * them to be merged into one page(rmap_item->page) -+ * -+ * @return 0 if we successfully merged two identical pages into -+ * one ksm page. MERGE_ERR_COLLI if it's only a hash collision -+ * search in rbtree. MERGE_ERR_CHANGED if rmap_item has been -+ * changed since it's hashed. MERGE_ERR_PGERR otherwise. -+ * -+ */ -+static int try_to_merge_two_pages(struct rmap_item *rmap_item, -+ struct rmap_item *tree_rmap_item, -+ u32 hash) -+{ -+ pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0); -+ pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0); -+ struct vm_area_struct *vma1 = rmap_item->slot->vma; -+ struct vm_area_struct *vma2 = tree_rmap_item->slot->vma; -+ struct page *page = rmap_item->page; -+ struct page *tree_page = tree_rmap_item->page; -+ int err = MERGE_ERR_PGERR; -+ struct address_space *saved_mapping; -+ -+ -+ if (rmap_item->page == tree_rmap_item->page) -+ goto out; -+ -+ if (!trylock_page(page)) -+ goto out; -+ -+ if (!PageAnon(page)) -+ goto out_unlock; -+ -+ if (PageTransCompound(page)) { -+ err = split_huge_page(page); -+ if (err) -+ goto out_unlock; -+ } -+ -+ if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) { -+ unlock_page(page); -+ goto out; -+ } -+ -+ /* -+ * While we hold page lock, upgrade page from -+ * PageAnon+anon_vma to PageKsm+NULL stable_node: -+ * stable_tree_insert() will update stable_node. -+ */ -+ saved_mapping = page->mapping; -+ set_page_stable_node(page, NULL); -+ mark_page_accessed(page); -+ if (!PageDirty(page)) -+ SetPageDirty(page); -+ -+ unlock_page(page); -+ -+ if (!trylock_page(tree_page)) -+ goto restore_out; -+ -+ if (!PageAnon(tree_page)) { -+ unlock_page(tree_page); -+ goto restore_out; -+ } -+ -+ if (PageTransCompound(tree_page)) { -+ err = split_huge_page(tree_page); -+ if (err) { -+ unlock_page(tree_page); -+ goto restore_out; -+ } -+ } -+ -+ if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) { -+ unlock_page(tree_page); -+ goto restore_out; -+ } -+ -+ if (pages_identical(page, tree_page)) { -+ err = replace_page(vma2, tree_page, page, wprt_pte2); -+ if (err) { -+ unlock_page(tree_page); -+ goto restore_out; -+ } -+ -+ if ((vma2->vm_flags & VM_LOCKED)) { -+ munlock_vma_page(tree_page); -+ if (!PageMlocked(page)) { -+ unlock_page(tree_page); -+ lock_page(page); -+ mlock_vma_page(page); -+ tree_page = page; /* for final unlock */ -+ } -+ } -+ -+ unlock_page(tree_page); -+ -+ goto out; /* success */ -+ -+ } else { -+ if (tree_rmap_item->hash_max && -+ tree_rmap_item->hash_max == rmap_item->hash_max) { -+ err = MERGE_ERR_COLLI_MAX; -+ } else if (page_hash(page, hash_strength, 0) == -+ page_hash(tree_page, hash_strength, 0)) { -+ inc_rshash_neg(memcmp_cost + hash_strength * 2); -+ err = MERGE_ERR_COLLI; -+ } else { -+ err = MERGE_ERR_CHANGED; -+ } -+ -+ unlock_page(tree_page); -+ } -+ -+restore_out: -+ lock_page(page); -+ if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item), -+ orig_pte1, wprt_pte1)) -+ page->mapping = saved_mapping; -+ -+out_unlock: -+ unlock_page(page); -+out: -+ return err; -+} -+ -+static inline int hash_cmp(u32 new_val, u32 node_val) -+{ -+ if (new_val > node_val) -+ return 1; -+ else if (new_val < node_val) -+ return -1; -+ else -+ return 0; 
-+} -+ -+static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash) -+{ -+ u32 hash_max = item->hash_max; -+ -+ if (!hash_max) { -+ hash_max = page_hash_max(item->page, hash); -+ -+ item->hash_max = hash_max; -+ } -+ -+ return hash_max; -+} -+ -+ -+ -+/** -+ * stable_tree_search() - search the stable tree for a page -+ * -+ * @item: the rmap_item we are comparing with -+ * @hash: the hash value of this item->page already calculated -+ * -+ * @return the page we have found, NULL otherwise. The page returned has -+ * been gotten. -+ */ -+static struct page *stable_tree_search(struct rmap_item *item, u32 hash) -+{ -+ struct rb_node *node = root_stable_treep->rb_node; -+ struct tree_node *tree_node; -+ unsigned long hash_max; -+ struct page *page = item->page; -+ struct stable_node *stable_node; -+ -+ stable_node = page_stable_node(page); -+ if (stable_node) { -+ /* ksm page forked, that is -+ * if (PageKsm(page) && !in_stable_tree(rmap_item)) -+ * it's actually gotten once outside. -+ */ -+ get_page(page); -+ return page; -+ } -+ -+ while (node) { -+ int cmp; -+ -+ tree_node = rb_entry(node, struct tree_node, node); -+ -+ cmp = hash_cmp(hash, tree_node->hash); -+ -+ if (cmp < 0) -+ node = node->rb_left; -+ else if (cmp > 0) -+ node = node->rb_right; -+ else -+ break; -+ } -+ -+ if (!node) -+ return NULL; -+ -+ if (tree_node->count == 1) { -+ stable_node = rb_entry(tree_node->sub_root.rb_node, -+ struct stable_node, node); -+ BUG_ON(!stable_node); -+ -+ goto get_page_out; -+ } -+ -+ /* -+ * ok, we have to search the second -+ * level subtree, hash the page to a -+ * full strength. -+ */ -+ node = tree_node->sub_root.rb_node; -+ BUG_ON(!node); -+ hash_max = rmap_item_hash_max(item, hash); -+ -+ while (node) { -+ int cmp; -+ -+ stable_node = rb_entry(node, struct stable_node, node); -+ -+ cmp = hash_cmp(hash_max, stable_node->hash_max); -+ -+ if (cmp < 0) -+ node = node->rb_left; -+ else if (cmp > 0) -+ node = node->rb_right; -+ else -+ goto get_page_out; -+ } -+ -+ return NULL; -+ -+get_page_out: -+ page = get_uksm_page(stable_node, 1, 1); -+ return page; -+} -+ -+static int try_merge_rmap_item(struct rmap_item *item, -+ struct page *kpage, -+ struct page *tree_page) -+{ -+ struct vm_area_struct *vma = item->slot->vma; -+ struct page_vma_mapped_walk pvmw = { -+ .page = kpage, -+ .vma = vma, -+ }; -+ -+ pvmw.address = get_rmap_addr(item); -+ if (!page_vma_mapped_walk(&pvmw)) -+ return 0; -+ -+ if (pte_write(*pvmw.pte)) { -+ /* has changed, abort! */ -+ page_vma_mapped_walk_done(&pvmw); -+ return 0; -+ } -+ -+ get_page(tree_page); -+ page_add_anon_rmap(tree_page, vma, pvmw.address, false); -+ -+ flush_cache_page(vma, pvmw.address, page_to_pfn(kpage)); -+ ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); -+ set_pte_at_notify(vma->vm_mm, pvmw.address, pvmw.pte, -+ mk_pte(tree_page, vma->vm_page_prot)); -+ -+ page_remove_rmap(kpage, false); -+ put_page(kpage); -+ -+ page_vma_mapped_walk_done(&pvmw); -+ -+ return 1; -+} -+ -+/** -+ * try_to_merge_with_stable_page() - when two rmap_items need to be inserted -+ * into stable tree, the page was found to be identical to a stable ksm page, -+ * this is the last chance we can merge them into one. -+ * -+ * @item1: the rmap_item holding the page which we wanted to insert -+ * into stable tree. 
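stable_tree_search() above is a two-level lookup: the first tree is keyed by the cheap sampled hash, and only a tree_node that already holds more than one stable page pays for the full-strength hash_max and a second-level descent; a singleton node short-circuits, which is what keeps the strong hash lazy. A flat-array sketch of that shape (invented types; linear scans stand in for the two rbtrees):

#include <stdint.h>
#include <stdio.h>

struct leaf  { uint32_t hash_max; const char *page; };
struct tnode { uint32_t hash; const struct leaf *leaves; int count; };

static const char *two_level_search(const struct tnode *t, int n,
				    uint32_t hash, uint32_t hash_max)
{
	for (int i = 0; i < n; i++) {
		if (t[i].hash != hash)
			continue;               /* level 1 miss */
		if (t[i].count == 1)
			return t[i].leaves[0].page; /* no collision subtree */
		for (int j = 0; j < t[i].count; j++) /* level 2: hash_max */
			if (t[i].leaves[j].hash_max == hash_max)
				return t[i].leaves[j].page;
		return NULL;
	}
	return NULL;
}

int main(void)
{
	static const struct leaf a[] = { { 7, "pageA" } };
	static const struct leaf b[] = { { 3, "pageB" }, { 9, "pageC" } };
	static const struct tnode tree[] = {
		{ 100, a, 1 }, /* singleton: hash_max never consulted */
		{ 200, b, 2 }, /* collision: resolved by hash_max */
	};

	printf("%s\n", two_level_search(tree, 2, 100, 0)); /* pageA */
	printf("%s\n", two_level_search(tree, 2, 200, 9)); /* pageC */
	return 0;
}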
-+ * @item2: the other rmap_item we found during the unstable tree search -+ * @oldpage: the page currently mapped by the two rmap_items -+ * @tree_page: the page we found identical in stable tree node -+ * @success1: return if item1 is successfully merged -+ * @success2: return if item2 is successfully merged -+ */ -+static void try_merge_with_stable(struct rmap_item *item1, -+ struct rmap_item *item2, -+ struct page **kpage, -+ struct page *tree_page, -+ int *success1, int *success2) -+{ -+ struct vm_area_struct *vma1 = item1->slot->vma; -+ struct vm_area_struct *vma2 = item2->slot->vma; -+ *success1 = 0; -+ *success2 = 0; -+ -+ if (unlikely(*kpage == tree_page)) { -+ /* I don't think this can really happen */ -+ pr_warn("UKSM: unexpected condition detected in " -+ "%s -- *kpage == tree_page !\n", __func__); -+ *success1 = 1; -+ *success2 = 1; -+ return; -+ } -+ -+ if (!PageAnon(*kpage) || !PageKsm(*kpage)) -+ goto failed; -+ -+ if (!trylock_page(tree_page)) -+ goto failed; -+ -+ /* If the oldpage is still ksm and still pointed -+ * to in the right place, and still write protected, -+ * we are confident it hasn't changed; no need to -+ * memcmp anymore. -+ * Beware: we cannot take nested pte locks; -+ * deadlock risk. -+ */ -+ if (!try_merge_rmap_item(item1, *kpage, tree_page)) -+ goto unlock_failed; -+ -+ /* ok, then vma2; remember that pte1 is already set */ -+ if (!try_merge_rmap_item(item2, *kpage, tree_page)) -+ goto success_1; -+ -+ *success2 = 1; -+success_1: -+ *success1 = 1; -+ -+ -+ if ((*success1 && vma1->vm_flags & VM_LOCKED) || -+ (*success2 && vma2->vm_flags & VM_LOCKED)) { -+ munlock_vma_page(*kpage); -+ if (!PageMlocked(tree_page)) -+ mlock_vma_page(tree_page); -+ } -+ -+ /* -+ * We do not need oldpage any more in the caller, so we can drop the lock -+ * now. -+ */ -+ unlock_page(*kpage); -+ *kpage = tree_page; /* Get unlocked outside.
*/ -+ return; -+ -+unlock_failed: -+ unlock_page(tree_page); -+failed: -+ return; -+} -+ -+static inline void stable_node_hash_max(struct stable_node *node, -+ struct page *page, u32 hash) -+{ -+ u32 hash_max = node->hash_max; -+ -+ if (!hash_max) { -+ hash_max = page_hash_max(page, hash); -+ node->hash_max = hash_max; -+ } -+} -+ -+static inline -+struct stable_node *new_stable_node(struct tree_node *tree_node, -+ struct page *kpage, u32 hash_max) -+{ -+ struct stable_node *new_stable_node; -+ -+ new_stable_node = alloc_stable_node(); -+ if (!new_stable_node) -+ return NULL; -+ -+ new_stable_node->kpfn = page_to_pfn(kpage); -+ new_stable_node->hash_max = hash_max; -+ new_stable_node->tree_node = tree_node; -+ set_page_stable_node(kpage, new_stable_node); -+ -+ return new_stable_node; -+} -+ -+static inline -+struct stable_node *first_level_insert(struct tree_node *tree_node, -+ struct rmap_item *rmap_item, -+ struct rmap_item *tree_rmap_item, -+ struct page **kpage, u32 hash, -+ int *success1, int *success2) -+{ -+ int cmp; -+ struct page *tree_page; -+ u32 hash_max = 0; -+ struct stable_node *stable_node, *new_snode; -+ struct rb_node *parent = NULL, **new; -+ -+ /* this tree node contains no sub-tree yet */ -+ stable_node = rb_entry(tree_node->sub_root.rb_node, -+ struct stable_node, node); -+ -+ tree_page = get_uksm_page(stable_node, 1, 0); -+ if (tree_page) { -+ cmp = memcmp_pages(*kpage, tree_page, 1); -+ if (!cmp) { -+ try_merge_with_stable(rmap_item, tree_rmap_item, kpage, -+ tree_page, success1, success2); -+ put_page(tree_page); -+ if (!*success1 && !*success2) -+ goto failed; -+ -+ return stable_node; -+ -+ } else { -+ /* -+ * collision in first level try to create a subtree. -+ * A new node need to be created. -+ */ -+ put_page(tree_page); -+ -+ stable_node_hash_max(stable_node, tree_page, -+ tree_node->hash); -+ hash_max = rmap_item_hash_max(rmap_item, hash); -+ cmp = hash_cmp(hash_max, stable_node->hash_max); -+ -+ parent = &stable_node->node; -+ if (cmp < 0) -+ new = &parent->rb_left; -+ else if (cmp > 0) -+ new = &parent->rb_right; -+ else -+ goto failed; -+ } -+ -+ } else { -+ /* the only stable_node deleted, we reuse its tree_node. 
-+ */ -+ parent = NULL; -+ new = &tree_node->sub_root.rb_node; -+ } -+ -+ new_snode = new_stable_node(tree_node, *kpage, hash_max); -+ if (!new_snode) -+ goto failed; -+ -+ rb_link_node(&new_snode->node, parent, new); -+ rb_insert_color(&new_snode->node, &tree_node->sub_root); -+ tree_node->count++; -+ *success1 = *success2 = 1; -+ -+ return new_snode; -+ -+failed: -+ return NULL; -+} -+ -+static inline -+struct stable_node *stable_subtree_insert(struct tree_node *tree_node, -+ struct rmap_item *rmap_item, -+ struct rmap_item *tree_rmap_item, -+ struct page **kpage, u32 hash, -+ int *success1, int *success2) -+{ -+ struct page *tree_page; -+ u32 hash_max; -+ struct stable_node *stable_node, *new_snode; -+ struct rb_node *parent, **new; -+ -+research: -+ parent = NULL; -+ new = &tree_node->sub_root.rb_node; -+ BUG_ON(!*new); -+ hash_max = rmap_item_hash_max(rmap_item, hash); -+ while (*new) { -+ int cmp; -+ -+ stable_node = rb_entry(*new, struct stable_node, node); -+ -+ cmp = hash_cmp(hash_max, stable_node->hash_max); -+ -+ if (cmp < 0) { -+ parent = *new; -+ new = &parent->rb_left; -+ } else if (cmp > 0) { -+ parent = *new; -+ new = &parent->rb_right; -+ } else { -+ tree_page = get_uksm_page(stable_node, 1, 0); -+ if (tree_page) { -+ cmp = memcmp_pages(*kpage, tree_page, 1); -+ if (!cmp) { -+ try_merge_with_stable(rmap_item, -+ tree_rmap_item, kpage, -+ tree_page, success1, success2); -+ -+ put_page(tree_page); -+ if (!*success1 && !*success2) -+ goto failed; -+ /* -+ * successfully merged with a stable -+ * node -+ */ -+ return stable_node; -+ } else { -+ put_page(tree_page); -+ goto failed; -+ } -+ } else { -+ /* -+ * stable node may be deleted, -+ * and subtree maybe -+ * restructed, cannot -+ * continue, research it. -+ */ -+ if (tree_node->count) { -+ goto research; -+ } else { -+ /* reuse the tree node*/ -+ parent = NULL; -+ new = &tree_node->sub_root.rb_node; -+ } -+ } -+ } -+ } -+ -+ new_snode = new_stable_node(tree_node, *kpage, hash_max); -+ if (!new_snode) -+ goto failed; -+ -+ rb_link_node(&new_snode->node, parent, new); -+ rb_insert_color(&new_snode->node, &tree_node->sub_root); -+ tree_node->count++; -+ *success1 = *success2 = 1; -+ -+ return new_snode; -+ -+failed: -+ return NULL; -+} -+ -+ -+/** -+ * stable_tree_insert() - try to insert a merged page in unstable tree to -+ * the stable tree -+ * -+ * @kpage: the page need to be inserted -+ * @hash: the current hash of this page -+ * @rmap_item: the rmap_item being scanned -+ * @tree_rmap_item: the rmap_item found on unstable tree -+ * @success1: return if rmap_item is merged -+ * @success2: return if tree_rmap_item is merged -+ * -+ * @return the stable_node on stable tree if at least one -+ * rmap_item is inserted into stable tree, NULL -+ * otherwise. 
-+ */ -+static struct stable_node * -+stable_tree_insert(struct page **kpage, u32 hash, -+ struct rmap_item *rmap_item, -+ struct rmap_item *tree_rmap_item, -+ int *success1, int *success2) -+{ -+ struct rb_node **new = &root_stable_treep->rb_node; -+ struct rb_node *parent = NULL; -+ struct stable_node *stable_node; -+ struct tree_node *tree_node; -+ u32 hash_max = 0; -+ -+ *success1 = *success2 = 0; -+ -+ while (*new) { -+ int cmp; -+ -+ tree_node = rb_entry(*new, struct tree_node, node); -+ -+ cmp = hash_cmp(hash, tree_node->hash); -+ -+ if (cmp < 0) { -+ parent = *new; -+ new = &parent->rb_left; -+ } else if (cmp > 0) { -+ parent = *new; -+ new = &parent->rb_right; -+ } else -+ break; -+ } -+ -+ if (*new) { -+ if (tree_node->count == 1) { -+ stable_node = first_level_insert(tree_node, rmap_item, -+ tree_rmap_item, kpage, -+ hash, success1, success2); -+ } else { -+ stable_node = stable_subtree_insert(tree_node, -+ rmap_item, tree_rmap_item, kpage, -+ hash, success1, success2); -+ } -+ } else { -+ -+ /* no tree node found */ -+ tree_node = alloc_tree_node(stable_tree_node_listp); -+ if (!tree_node) { -+ stable_node = NULL; -+ goto out; -+ } -+ -+ stable_node = new_stable_node(tree_node, *kpage, hash_max); -+ if (!stable_node) { -+ free_tree_node(tree_node); -+ goto out; -+ } -+ -+ tree_node->hash = hash; -+ rb_link_node(&tree_node->node, parent, new); -+ rb_insert_color(&tree_node->node, root_stable_treep); -+ parent = NULL; -+ new = &tree_node->sub_root.rb_node; -+ -+ rb_link_node(&stable_node->node, parent, new); -+ rb_insert_color(&stable_node->node, &tree_node->sub_root); -+ tree_node->count++; -+ *success1 = *success2 = 1; -+ } -+ -+out: -+ return stable_node; -+} -+ -+ -+/** -+ * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem -+ * -+ * @return 0 on success, -EBUSY if unable to lock the mmap_sem, -+ * -EINVAL if the page mapping has been changed. -+ */ -+static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item) -+{ -+ int err; -+ -+ err = get_mergeable_page_lock_mmap(tree_rmap_item); -+ -+ if (err == -EINVAL) { -+ /* its page map has been changed, remove it */ -+ remove_rmap_item_from_tree(tree_rmap_item); -+ } -+ -+ /* The page is gotten and mmap_sem is locked now. */ -+ return err; -+} -+ -+ -+/** -+ * unstable_tree_search_insert() - search an unstable tree rmap_item with the -+ * same hash value. 
Get its page and trylock the mmap_sem -+ */ -+static inline -+struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, -+ u32 hash) -+ -+{ -+ struct rb_node **new = &root_unstable_tree.rb_node; -+ struct rb_node *parent = NULL; -+ struct tree_node *tree_node; -+ u32 hash_max; -+ struct rmap_item *tree_rmap_item; -+ -+ while (*new) { -+ int cmp; -+ -+ tree_node = rb_entry(*new, struct tree_node, node); -+ -+ cmp = hash_cmp(hash, tree_node->hash); -+ -+ if (cmp < 0) { -+ parent = *new; -+ new = &parent->rb_left; -+ } else if (cmp > 0) { -+ parent = *new; -+ new = &parent->rb_right; -+ } else -+ break; -+ } -+ -+ if (*new) { -+ /* got the tree_node */ -+ if (tree_node->count == 1) { -+ tree_rmap_item = rb_entry(tree_node->sub_root.rb_node, -+ struct rmap_item, node); -+ BUG_ON(!tree_rmap_item); -+ -+ goto get_page_out; -+ } -+ -+ /* well, search the collision subtree */ -+ new = &tree_node->sub_root.rb_node; -+ BUG_ON(!*new); -+ hash_max = rmap_item_hash_max(rmap_item, hash); -+ -+ while (*new) { -+ int cmp; -+ -+ tree_rmap_item = rb_entry(*new, struct rmap_item, -+ node); -+ -+ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); -+ parent = *new; -+ if (cmp < 0) -+ new = &parent->rb_left; -+ else if (cmp > 0) -+ new = &parent->rb_right; -+ else -+ goto get_page_out; -+ } -+ } else { -+ /* alloc a new tree_node */ -+ tree_node = alloc_tree_node(&unstable_tree_node_list); -+ if (!tree_node) -+ return NULL; -+ -+ tree_node->hash = hash; -+ rb_link_node(&tree_node->node, parent, new); -+ rb_insert_color(&tree_node->node, &root_unstable_tree); -+ parent = NULL; -+ new = &tree_node->sub_root.rb_node; -+ } -+ -+ /* did not found even in sub-tree */ -+ rmap_item->tree_node = tree_node; -+ rmap_item->address |= UNSTABLE_FLAG; -+ rmap_item->hash_round = uksm_hash_round; -+ rb_link_node(&rmap_item->node, parent, new); -+ rb_insert_color(&rmap_item->node, &tree_node->sub_root); -+ -+ uksm_pages_unshared++; -+ return NULL; -+ -+get_page_out: -+ if (tree_rmap_item->page == rmap_item->page) -+ return NULL; -+ -+ if (get_tree_rmap_item_page(tree_rmap_item)) -+ return NULL; -+ -+ return tree_rmap_item; -+} -+ -+static void hold_anon_vma(struct rmap_item *rmap_item, -+ struct anon_vma *anon_vma) -+{ -+ rmap_item->anon_vma = anon_vma; -+ get_anon_vma(anon_vma); -+} -+ -+ -+/** -+ * stable_tree_append() - append a rmap_item to a stable node. Deduplication -+ * ratio statistics is done in this function. 
-+ * -+ */ -+static void stable_tree_append(struct rmap_item *rmap_item, -+ struct stable_node *stable_node, int logdedup) -+{ -+ struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL; -+ unsigned long key = (unsigned long)rmap_item->slot; -+ unsigned long factor = rmap_item->slot->rung->step; -+ -+ BUG_ON(!stable_node); -+ rmap_item->address |= STABLE_FLAG; -+ -+ if (hlist_empty(&stable_node->hlist)) { -+ uksm_pages_shared++; -+ goto node_vma_new; -+ } else { -+ uksm_pages_sharing++; -+ } -+ -+ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { -+ if (node_vma->key >= key) -+ break; -+ -+ if (logdedup) { -+ node_vma->slot->pages_bemerged += factor; -+ if (list_empty(&node_vma->slot->dedup_list)) -+ list_add(&node_vma->slot->dedup_list, -+ &vma_slot_dedup); -+ } -+ } -+ -+ if (node_vma) { -+ if (node_vma->key == key) { -+ node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist); -+ goto node_vma_ok; -+ } else if (node_vma->key > key) { -+ node_vma_cont = node_vma; -+ } -+ } -+ -+node_vma_new: -+ /* no same vma already in node, alloc a new node_vma */ -+ new_node_vma = alloc_node_vma(); -+ BUG_ON(!new_node_vma); -+ new_node_vma->head = stable_node; -+ new_node_vma->slot = rmap_item->slot; -+ -+ if (!node_vma) { -+ hlist_add_head(&new_node_vma->hlist, &stable_node->hlist); -+ } else if (node_vma->key != key) { -+ if (node_vma->key < key) -+ hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist); -+ else { -+ hlist_add_before(&new_node_vma->hlist, -+ &node_vma->hlist); -+ } -+ -+ } -+ node_vma = new_node_vma; -+ -+node_vma_ok: /* ok, ready to add to the list */ -+ rmap_item->head = node_vma; -+ hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist); -+ hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma); -+ if (logdedup) { -+ rmap_item->slot->pages_merged++; -+ if (node_vma_cont) { -+ node_vma = node_vma_cont; -+ hlist_for_each_entry_continue(node_vma, hlist) { -+ node_vma->slot->pages_bemerged += factor; -+ if (list_empty(&node_vma->slot->dedup_list)) -+ list_add(&node_vma->slot->dedup_list, -+ &vma_slot_dedup); -+ } -+ } -+ } -+} -+ -+/* -+ * We use break_ksm to break COW on a ksm page: it's a stripped down -+ * -+ * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1) -+ * put_page(page); -+ * -+ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, -+ * in case the application has unmapped and remapped mm,addr meanwhile. -+ * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP -+ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. -+ */ -+static int break_ksm(struct vm_area_struct *vma, unsigned long addr) -+{ -+ struct page *page; -+ int ret = 0; -+ -+ do { -+ cond_resched(); -+ page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); -+ if (IS_ERR_OR_NULL(page)) -+ break; -+ if (PageKsm(page)) { -+ ret = handle_mm_fault(vma, addr, -+ FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE); -+ } else -+ ret = VM_FAULT_WRITE; -+ put_page(page); -+ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); -+ /* -+ * We must loop because handle_mm_fault() may back out if there's -+ * any difficulty e.g. if pte accessed bit gets updated concurrently. -+ * -+ * VM_FAULT_WRITE is what we have been hoping for: it indicates that -+ * COW has been broken, even if the vma does not permit VM_WRITE; -+ * but note that a concurrent fault might break PageKsm for us. 
-+ * -+ * VM_FAULT_SIGBUS could occur if we race with truncation of the -+ * backing file, which also invalidates anonymous pages: that's -+ * okay, that truncation will have unmapped the PageKsm for us. -+ * -+ * VM_FAULT_OOM: at the time of writing (late July 2009), setting -+ * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the -+ * current task has TIF_MEMDIE set, and will be OOM killed on return -+ * to user; and ksmd, having no mm, would never be chosen for that. -+ * -+ * But if the mm is in a limited mem_cgroup, then the fault may fail -+ * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and -+ * even ksmd can fail in this way - though it's usually breaking ksm -+ * just to undo a merge it made a moment before, so unlikely to oom. -+ * -+ * That's a pity: we might therefore have more kernel pages allocated -+ * than we're counting as nodes in the stable tree; but uksm_do_scan -+ * will retry to break_cow on each pass, so should recover the page -+ * in due course. The important thing is to not let VM_MERGEABLE -+ * be cleared while any such pages might remain in the area. -+ */ -+ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; -+} -+ -+static void break_cow(struct rmap_item *rmap_item) -+{ -+ struct vm_area_struct *vma = rmap_item->slot->vma; -+ struct mm_struct *mm = vma->vm_mm; -+ unsigned long addr = get_rmap_addr(rmap_item); -+ -+ if (uksm_test_exit(mm)) -+ goto out; -+ -+ break_ksm(vma, addr); -+out: -+ return; -+} -+ -+/* -+ * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather -+ * than check every pte of a given vma, the locking doesn't quite work for -+ * that - an rmap_item is assigned to the stable tree after inserting ksm -+ * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing -+ * rmap_items from parent to child at fork time (so as not to waste time -+ * if exit comes before the next scan reaches it). -+ * -+ * Similarly, although we'd like to remove rmap_items (so updating counts -+ * and freeing memory) when unmerging an area, it's easier to leave that -+ * to the next pass of ksmd - consider, for example, how ksmd might be -+ * in cmp_and_merge_page on one of the rmap_items we would be removing. 
-+ */ -+inline int unmerge_uksm_pages(struct vm_area_struct *vma, -+ unsigned long start, unsigned long end) -+{ -+ unsigned long addr; -+ int err = 0; -+ -+ for (addr = start; addr < end && !err; addr += PAGE_SIZE) { -+ if (uksm_test_exit(vma->vm_mm)) -+ break; -+ if (signal_pending(current)) -+ err = -ERESTARTSYS; -+ else -+ err = break_ksm(vma, addr); -+ } -+ return err; -+} -+ -+static inline void inc_uksm_pages_scanned(void) -+{ -+ u64 delta; -+ -+ -+ if (uksm_pages_scanned == U64_MAX) { -+ encode_benefit(); -+ -+ delta = uksm_pages_scanned >> pages_scanned_base; -+ -+ if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) { -+ pages_scanned_stored >>= 1; -+ delta >>= 1; -+ pages_scanned_base++; -+ } -+ -+ pages_scanned_stored += delta; -+ -+ uksm_pages_scanned = uksm_pages_scanned_last = 0; -+ } -+ -+ uksm_pages_scanned++; -+} -+ -+static inline int find_zero_page_hash(int strength, u32 hash) -+{ -+ return (zero_hash_table[strength] == hash); -+} -+ -+static -+int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page) -+{ -+ struct page *zero_page = empty_uksm_zero_page; -+ struct mm_struct *mm = vma->vm_mm; -+ pte_t orig_pte = __pte(0); -+ int err = -EFAULT; -+ -+ if (uksm_test_exit(mm)) -+ goto out; -+ -+ if (!trylock_page(page)) -+ goto out; -+ -+ if (!PageAnon(page)) -+ goto out_unlock; -+ -+ if (PageTransCompound(page)) { -+ err = split_huge_page(page); -+ if (err) -+ goto out_unlock; -+ } -+ -+ if (write_protect_page(vma, page, &orig_pte, 0) == 0) { -+ if (is_page_full_zero(page)) -+ err = replace_page(vma, page, zero_page, orig_pte); -+ } -+ -+out_unlock: -+ unlock_page(page); -+out: -+ return err; -+} -+ -+/* -+ * cmp_and_merge_page() - first see if page can be merged into the stable -+ * tree; if not, compare hash to previous and if it's the same, see if page -+ * can be inserted into the unstable tree, or merged with a page already there -+ * and both transferred to the stable tree. -+ * -+ * @page: the page that we are searching identical page to. -+ * @rmap_item: the reverse mapping into the virtual address of this page -+ */ -+static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash) -+{ -+ struct rmap_item *tree_rmap_item; -+ struct page *page; -+ struct page *kpage = NULL; -+ u32 hash_max; -+ int err; -+ unsigned int success1, success2; -+ struct stable_node *snode; -+ int cmp; -+ struct rb_node *parent = NULL, **new; -+ -+ remove_rmap_item_from_tree(rmap_item); -+ page = rmap_item->page; -+ -+ /* We first start with searching the page inside the stable tree */ -+ kpage = stable_tree_search(rmap_item, hash); -+ if (kpage) { -+ err = try_to_merge_with_uksm_page(rmap_item, kpage, -+ hash); -+ if (!err) { -+ /* -+ * The page was successfully merged, add -+ * its rmap_item to the stable tree. -+ * page lock is needed because it's -+ * racing with try_to_unmap_ksm(), etc. -+ */ -+ lock_page(kpage); -+ snode = page_stable_node(kpage); -+ stable_tree_append(rmap_item, snode, 1); -+ unlock_page(kpage); -+ put_page(kpage); -+ return; /* success */ -+ } -+ put_page(kpage); -+ -+ /* -+ * if it's a collision and it has been search in sub-rbtree -+ * (hash_max != 0), we want to abort, because if it is -+ * successfully merged in unstable tree, the collision trends to -+ * happen again. 
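find_zero_page_hash() above relies on zero_hash_table[] being precomputed once per possible strength, so spotting a zero-page candidate during scanning costs a single compare, and is_page_full_zero() then confirms before the mapping is switched to the zero page (a hash match alone could be a collision). A userspace sketch of that precompute-then-compare idea; sample_pos[] and the table size are invented stand-ins for the patch's random_nums sampling plan:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define STRENGTHS 4

/* Hypothetical sampling plan; the patch draws these from random_nums. */
static const int sample_pos[STRENGTHS] = { 3, 17, 42, 99 };
static uint32_t zero_hash_table[STRENGTHS + 1];

static uint32_t sample_hash(const uint32_t *key, int strength)
{
	uint32_t hash = 0xdeadbeef;

	for (int i = 0; i < strength; i++) {
		hash += key[sample_pos[i]];
		hash += hash << 8;
		hash ^= hash >> 12;
	}
	return hash;
}

int main(void)
{
	uint32_t page[1024]; /* one 4 KiB page as 32-bit words */

	memset(page, 0, sizeof(page));

	/* precompute: hash of the all-zero page at every strength */
	for (int s = 0; s <= STRENGTHS; s++)
		zero_hash_table[s] = sample_hash(page, s);

	/* scan time: one compare flags a candidate for full verification */
	uint32_t h = sample_hash(page, STRENGTHS);
	printf("zero page? %s\n",
	       h == zero_hash_table[STRENGTHS] ? "candidate" : "no");
	return 0;
}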
-+ */ -+ if (err == MERGE_ERR_COLLI && rmap_item->hash_max) -+ return; -+ } -+ -+ tree_rmap_item = -+ unstable_tree_search_insert(rmap_item, hash); -+ if (tree_rmap_item) { -+ err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash); -+ /* -+ * As soon as we merge this page, we want to remove the -+ * rmap_item of the page we have merged with from the unstable -+ * tree, and insert it instead as new node in the stable tree. -+ */ -+ if (!err) { -+ kpage = page; -+ remove_rmap_item_from_tree(tree_rmap_item); -+ lock_page(kpage); -+ snode = stable_tree_insert(&kpage, hash, -+ rmap_item, tree_rmap_item, -+ &success1, &success2); -+ -+ /* -+ * Do not log dedup for tree item, it's not counted as -+ * scanned in this round. -+ */ -+ if (success2) -+ stable_tree_append(tree_rmap_item, snode, 0); -+ -+ /* -+ * The order of these two stable append is important: -+ * we are scanning rmap_item. -+ */ -+ if (success1) -+ stable_tree_append(rmap_item, snode, 1); -+ -+ /* -+ * The original kpage may be unlocked inside -+ * stable_tree_insert() already. This page -+ * should be unlocked before doing -+ * break_cow(). -+ */ -+ unlock_page(kpage); -+ -+ if (!success1) -+ break_cow(rmap_item); -+ -+ if (!success2) -+ break_cow(tree_rmap_item); -+ -+ } else if (err == MERGE_ERR_COLLI) { -+ BUG_ON(tree_rmap_item->tree_node->count > 1); -+ -+ rmap_item_hash_max(tree_rmap_item, -+ tree_rmap_item->tree_node->hash); -+ -+ hash_max = rmap_item_hash_max(rmap_item, hash); -+ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); -+ parent = &tree_rmap_item->node; -+ if (cmp < 0) -+ new = &parent->rb_left; -+ else if (cmp > 0) -+ new = &parent->rb_right; -+ else -+ goto put_up_out; -+ -+ rmap_item->tree_node = tree_rmap_item->tree_node; -+ rmap_item->address |= UNSTABLE_FLAG; -+ rmap_item->hash_round = uksm_hash_round; -+ rb_link_node(&rmap_item->node, parent, new); -+ rb_insert_color(&rmap_item->node, -+ &tree_rmap_item->tree_node->sub_root); -+ rmap_item->tree_node->count++; -+ } else { -+ /* -+ * either one of the page has changed or they collide -+ * at the max hash, we consider them as ill items. 
-+ */ -+ remove_rmap_item_from_tree(tree_rmap_item); -+ } -+put_up_out: -+ put_page(tree_rmap_item->page); -+ up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem); -+ } -+} -+ -+ -+ -+ -+static inline unsigned long get_pool_index(struct vma_slot *slot, -+ unsigned long index) -+{ -+ unsigned long pool_index; -+ -+ pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT; -+ if (pool_index >= slot->pool_size) -+ BUG(); -+ return pool_index; -+} -+ -+static inline unsigned long index_page_offset(unsigned long index) -+{ -+ return offset_in_page(sizeof(struct rmap_list_entry *) * index); -+} -+ -+static inline -+struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot, -+ unsigned long index, int need_alloc) -+{ -+ unsigned long pool_index; -+ struct page *page; -+ void *addr; -+ -+ -+ pool_index = get_pool_index(slot, index); -+ if (!slot->rmap_list_pool[pool_index]) { -+ if (!need_alloc) -+ return NULL; -+ -+ page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); -+ if (!page) -+ return NULL; -+ -+ slot->rmap_list_pool[pool_index] = page; -+ } -+ -+ addr = kmap(slot->rmap_list_pool[pool_index]); -+ addr += index_page_offset(index); -+ -+ return addr; -+} -+ -+static inline void put_rmap_list_entry(struct vma_slot *slot, -+ unsigned long index) -+{ -+ unsigned long pool_index; -+ -+ pool_index = get_pool_index(slot, index); -+ BUG_ON(!slot->rmap_list_pool[pool_index]); -+ kunmap(slot->rmap_list_pool[pool_index]); -+} -+ -+static inline int entry_is_new(struct rmap_list_entry *entry) -+{ -+ return !entry->item; -+} -+ -+static inline unsigned long get_index_orig_addr(struct vma_slot *slot, -+ unsigned long index) -+{ -+ return slot->vma->vm_start + (index << PAGE_SHIFT); -+} -+ -+static inline unsigned long get_entry_address(struct rmap_list_entry *entry) -+{ -+ unsigned long addr; -+ -+ if (is_addr(entry->addr)) -+ addr = get_clean_addr(entry->addr); -+ else if (entry->item) -+ addr = get_rmap_addr(entry->item); -+ else -+ BUG(); -+ -+ return addr; -+} -+ -+static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry) -+{ -+ if (is_addr(entry->addr)) -+ return NULL; -+ -+ return entry->item; -+} -+ -+static inline void inc_rmap_list_pool_count(struct vma_slot *slot, -+ unsigned long index) -+{ -+ unsigned long pool_index; -+ -+ pool_index = get_pool_index(slot, index); -+ BUG_ON(!slot->rmap_list_pool[pool_index]); -+ slot->pool_counts[pool_index]++; -+} -+ -+static inline void dec_rmap_list_pool_count(struct vma_slot *slot, -+ unsigned long index) -+{ -+ unsigned long pool_index; -+ -+ pool_index = get_pool_index(slot, index); -+ BUG_ON(!slot->rmap_list_pool[pool_index]); -+ BUG_ON(!slot->pool_counts[pool_index]); -+ slot->pool_counts[pool_index]--; -+} -+ -+static inline int entry_has_rmap(struct rmap_list_entry *entry) -+{ -+ return !is_addr(entry->addr) && entry->item; -+} -+ -+static inline void swap_entries(struct rmap_list_entry *entry1, -+ unsigned long index1, -+ struct rmap_list_entry *entry2, -+ unsigned long index2) -+{ -+ struct rmap_list_entry tmp; -+ -+ /* swapping two new entries is meaningless */ -+ BUG_ON(entry_is_new(entry1) && entry_is_new(entry2)); -+ -+ tmp = *entry1; -+ *entry1 = *entry2; -+ *entry2 = tmp; -+ -+ if (entry_has_rmap(entry1)) -+ entry1->item->entry_index = index1; -+ -+ if (entry_has_rmap(entry2)) -+ entry2->item->entry_index = index2; -+ -+ if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) { -+ inc_rmap_list_pool_count(entry1->item->slot, index1); -+ dec_rmap_list_pool_count(entry1->item->slot, index2); -+ } 
else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) { -+ inc_rmap_list_pool_count(entry2->item->slot, index2); -+ dec_rmap_list_pool_count(entry2->item->slot, index1); -+ } -+} -+ -+static inline void free_entry_item(struct rmap_list_entry *entry) -+{ -+ unsigned long index; -+ struct rmap_item *item; -+ -+ if (!is_addr(entry->addr)) { -+ BUG_ON(!entry->item); -+ item = entry->item; -+ entry->addr = get_rmap_addr(item); -+ set_is_addr(entry->addr); -+ index = item->entry_index; -+ remove_rmap_item_from_tree(item); -+ dec_rmap_list_pool_count(item->slot, index); -+ free_rmap_item(item); -+ } -+} -+ -+static inline int pool_entry_boundary(unsigned long index) -+{ -+ unsigned long linear_addr; -+ -+ linear_addr = sizeof(struct rmap_list_entry *) * index; -+ return index && !offset_in_page(linear_addr); -+} -+ -+static inline void try_free_last_pool(struct vma_slot *slot, -+ unsigned long index) -+{ -+ unsigned long pool_index; -+ -+ pool_index = get_pool_index(slot, index); -+ if (slot->rmap_list_pool[pool_index] && -+ !slot->pool_counts[pool_index]) { -+ __free_page(slot->rmap_list_pool[pool_index]); -+ slot->rmap_list_pool[pool_index] = NULL; -+ slot->flags |= UKSM_SLOT_NEED_SORT; -+ } -+ -+} -+ -+static inline unsigned long vma_item_index(struct vm_area_struct *vma, -+ struct rmap_item *item) -+{ -+ return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT; -+} -+ -+static int within_same_pool(struct vma_slot *slot, -+ unsigned long i, unsigned long j) -+{ -+ unsigned long pool_i, pool_j; -+ -+ pool_i = get_pool_index(slot, i); -+ pool_j = get_pool_index(slot, j); -+ -+ return (pool_i == pool_j); -+} -+ -+static void sort_rmap_entry_list(struct vma_slot *slot) -+{ -+ unsigned long i, j; -+ struct rmap_list_entry *entry, *swap_entry; -+ -+ entry = get_rmap_list_entry(slot, 0, 0); -+ for (i = 0; i < slot->pages; ) { -+ -+ if (!entry) -+ goto skip_whole_pool; -+ -+ if (entry_is_new(entry)) -+ goto next_entry; -+ -+ if (is_addr(entry->addr)) { -+ entry->addr = 0; -+ goto next_entry; -+ } -+ -+ j = vma_item_index(slot->vma, entry->item); -+ if (j == i) -+ goto next_entry; -+ -+ if (within_same_pool(slot, i, j)) -+ swap_entry = entry + j - i; -+ else -+ swap_entry = get_rmap_list_entry(slot, j, 1); -+ -+ swap_entries(entry, i, swap_entry, j); -+ if (!within_same_pool(slot, i, j)) -+ put_rmap_list_entry(slot, j); -+ continue; -+ -+skip_whole_pool: -+ i += PAGE_SIZE / sizeof(*entry); -+ if (i < slot->pages) -+ entry = get_rmap_list_entry(slot, i, 0); -+ continue; -+ -+next_entry: -+ if (i >= slot->pages - 1 || -+ !within_same_pool(slot, i, i + 1)) { -+ put_rmap_list_entry(slot, i); -+ if (i + 1 < slot->pages) -+ entry = get_rmap_list_entry(slot, i + 1, 0); -+ } else -+ entry++; -+ i++; -+ continue; -+ } -+ -+ /* free empty pool entries which contain no rmap_item */ -+ /* CAN be simplied to based on only pool_counts when bug freed !!!!! 
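sort_rmap_entry_list() above undoes the scan-time shuffling by repeatedly swapping the entry at position i with the entry that belongs at the current occupant's home index (vma_item_index()); every swap parks at least one entry at its final slot, so the pass is linear despite the nested-looking control flow. A tiny sketch of that swap-to-home loop, assuming the array values form a permutation of their home indices:

#include <stdio.h>

int main(void)
{
	/* a[i] plays the role of the home index of the entry at slot i */
	int a[8] = { 5, 2, 0, 7, 4, 1, 6, 3 };

	for (int i = 0; i < 8; ) {
		int j = a[i];

		if (j == i) {    /* already home: advance */
			i++;
			continue;
		}
		/* park a[i] at its home slot j; at least one entry settles */
		int tmp = a[i];
		a[i] = a[j];
		a[j] = tmp;
	}

	for (int i = 0; i < 8; i++)
		printf("%d ", a[i]); /* 0 1 2 3 4 5 6 7 */
	printf("\n");
	return 0;
}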
*/ -+ for (i = 0; i < slot->pool_size; i++) { -+ unsigned char has_rmap; -+ void *addr; -+ -+ if (!slot->rmap_list_pool[i]) -+ continue; -+ -+ has_rmap = 0; -+ addr = kmap(slot->rmap_list_pool[i]); -+ BUG_ON(!addr); -+ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { -+ entry = (struct rmap_list_entry *)addr + j; -+ if (is_addr(entry->addr)) -+ continue; -+ if (!entry->item) -+ continue; -+ has_rmap = 1; -+ } -+ kunmap(slot->rmap_list_pool[i]); -+ if (!has_rmap) { -+ BUG_ON(slot->pool_counts[i]); -+ __free_page(slot->rmap_list_pool[i]); -+ slot->rmap_list_pool[i] = NULL; -+ } -+ } -+ -+ slot->flags &= ~UKSM_SLOT_NEED_SORT; -+} -+ -+/* -+ * vma_fully_scanned() - if all the pages in this slot have been scanned. -+ */ -+static inline int vma_fully_scanned(struct vma_slot *slot) -+{ -+ return slot->pages_scanned == slot->pages; -+} -+ -+/** -+ * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to -+ * its random permutation. This function is embedded with the random -+ * permutation index management code. -+ */ -+static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash) -+{ -+ unsigned long rand_range, addr, swap_index, scan_index; -+ struct rmap_item *item = NULL; -+ struct rmap_list_entry *scan_entry, *swap_entry = NULL; -+ struct page *page; -+ -+ scan_index = swap_index = slot->pages_scanned % slot->pages; -+ -+ if (pool_entry_boundary(scan_index)) -+ try_free_last_pool(slot, scan_index - 1); -+ -+ if (vma_fully_scanned(slot)) { -+ if (slot->flags & UKSM_SLOT_NEED_SORT) -+ slot->flags |= UKSM_SLOT_NEED_RERAND; -+ else -+ slot->flags &= ~UKSM_SLOT_NEED_RERAND; -+ if (slot->flags & UKSM_SLOT_NEED_SORT) -+ sort_rmap_entry_list(slot); -+ } -+ -+ scan_entry = get_rmap_list_entry(slot, scan_index, 1); -+ if (!scan_entry) -+ return NULL; -+ -+ if (entry_is_new(scan_entry)) { -+ scan_entry->addr = get_index_orig_addr(slot, scan_index); -+ set_is_addr(scan_entry->addr); -+ } -+ -+ if (slot->flags & UKSM_SLOT_NEED_RERAND) { -+ rand_range = slot->pages - scan_index; -+ BUG_ON(!rand_range); -+ swap_index = scan_index + (prandom_u32() % rand_range); -+ } -+ -+ if (swap_index != scan_index) { -+ swap_entry = get_rmap_list_entry(slot, swap_index, 1); -+ if (entry_is_new(swap_entry)) { -+ swap_entry->addr = get_index_orig_addr(slot, -+ swap_index); -+ set_is_addr(swap_entry->addr); -+ } -+ swap_entries(scan_entry, scan_index, swap_entry, swap_index); -+ } -+ -+ addr = get_entry_address(scan_entry); -+ item = get_entry_item(scan_entry); -+ BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start); -+ -+ page = follow_page(slot->vma, addr, FOLL_GET); -+ if (IS_ERR_OR_NULL(page)) -+ goto nopage; -+ -+ if (!PageAnon(page)) -+ goto putpage; -+ -+ /*check is zero_page pfn or uksm_zero_page*/ -+ if ((page_to_pfn(page) == zero_pfn) -+ || (page_to_pfn(page) == uksm_zero_pfn)) -+ goto putpage; -+ -+ flush_anon_page(slot->vma, page, addr); -+ flush_dcache_page(page); -+ -+ -+ *hash = page_hash(page, hash_strength, 1); -+ inc_uksm_pages_scanned(); -+ /*if the page content all zero, re-map to zero-page*/ -+ if (find_zero_page_hash(hash_strength, *hash)) { -+ if (!cmp_and_merge_zero_page(slot->vma, page)) { -+ slot->pages_merged++; -+ -+ /* For full-zero pages, no need to create rmap item */ -+ goto putpage; -+ } else { -+ inc_rshash_neg(memcmp_cost / 2); -+ } -+ } -+ -+ if (!item) { -+ item = alloc_rmap_item(); -+ if (item) { -+ /* It has already been zeroed */ -+ item->slot = slot; -+ item->address = addr; -+ item->entry_index = scan_index; -+ scan_entry->item = item; 
-+ inc_rmap_list_pool_count(slot, scan_index); -+ } else -+ goto putpage; -+ } -+ -+ BUG_ON(item->slot != slot); -+ /* the page may have changed */ -+ item->page = page; -+ put_rmap_list_entry(slot, scan_index); -+ if (swap_entry) -+ put_rmap_list_entry(slot, swap_index); -+ return item; -+ -+putpage: -+ put_page(page); -+ page = NULL; -+nopage: -+ /* no page, store addr back and free rmap_item if possible */ -+ free_entry_item(scan_entry); -+ put_rmap_list_entry(slot, scan_index); -+ if (swap_entry) -+ put_rmap_list_entry(slot, swap_index); -+ return NULL; -+} -+ -+static inline int in_stable_tree(struct rmap_item *rmap_item) -+{ -+ return rmap_item->address & STABLE_FLAG; -+} -+ -+/** -+ * scan_vma_one_page() - scan the next page in a vma_slot. Called with -+ * mmap_sem locked. -+ */ -+static noinline void scan_vma_one_page(struct vma_slot *slot) -+{ -+ u32 hash; -+ struct mm_struct *mm; -+ struct rmap_item *rmap_item = NULL; -+ struct vm_area_struct *vma = slot->vma; -+ -+ mm = vma->vm_mm; -+ BUG_ON(!mm); -+ BUG_ON(!slot); -+ -+ rmap_item = get_next_rmap_item(slot, &hash); -+ if (!rmap_item) -+ goto out1; -+ -+ if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item)) -+ goto out2; -+ -+ cmp_and_merge_page(rmap_item, hash); -+out2: -+ put_page(rmap_item->page); -+out1: -+ slot->pages_scanned++; -+ slot->this_sampled++; -+ if (slot->fully_scanned_round != fully_scanned_round) -+ scanned_virtual_pages++; -+ -+ if (vma_fully_scanned(slot)) -+ slot->fully_scanned_round = fully_scanned_round; -+} -+ -+static inline unsigned long rung_get_pages(struct scan_rung *rung) -+{ -+ struct slot_tree_node *node; -+ -+ if (!rung->vma_root.rnode) -+ return 0; -+ -+ node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode); -+ -+ return node->size; -+} -+ -+#define RUNG_SAMPLED_MIN 3 -+ -+static inline -+void uksm_calc_rung_step(struct scan_rung *rung, -+ unsigned long page_time, unsigned long ratio) -+{ -+ unsigned long sampled, pages; -+ -+ /* will be fully scanned ? */ -+ if (!rung->cover_msecs) { -+ rung->step = 1; -+ return; -+ } -+ -+ sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE) -+ * ratio / page_time; -+ -+ /* -+ * Before we finsish a scan round and expensive per-round jobs, -+ * we need to have a chance to estimate the per page time. So -+ * the sampled number can not be too small. 
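The swap step inside get_next_rmap_item() above (pick a uniformly random partner in [scan_index, pages) and swap the two entries) is an online Fisher-Yates shuffle: across one full pass every page index is visited exactly once, in a fresh random order. A sketch of the ordering it yields; rand() stands in for prandom_u32():

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	enum { N = 8 };
	int order[N];

	for (int i = 0; i < N; i++)
		order[i] = i;

	srand(42);
	for (int i = 0; i < N; i++) {
		/* uniformly random j in [i, N), like scan_index + rand */
		int j = i + rand() % (N - i);
		int tmp = order[i];

		order[i] = order[j];
		order[j] = tmp;
		printf("scan %d: page index %d\n", i, order[i]);
	}
	return 0;
}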
-+ */ -+ if (sampled < RUNG_SAMPLED_MIN) -+ sampled = RUNG_SAMPLED_MIN; -+ -+ pages = rung_get_pages(rung); -+ if (likely(pages > sampled)) -+ rung->step = pages / sampled; -+ else -+ rung->step = 1; -+} -+ -+static inline int step_need_recalc(struct scan_rung *rung) -+{ -+ unsigned long pages, stepmax; -+ -+ pages = rung_get_pages(rung); -+ stepmax = pages / RUNG_SAMPLED_MIN; -+ -+ return pages && (rung->step > pages || -+ (stepmax && rung->step > stepmax)); -+} -+ -+static inline -+void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc) -+{ -+ struct vma_slot *slot; -+ -+ if (finished) -+ rung->flags |= UKSM_RUNG_ROUND_FINISHED; -+ -+ if (step_recalc || step_need_recalc(rung)) { -+ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); -+ BUG_ON(step_need_recalc(rung)); -+ } -+ -+ slot_iter_index = prandom_u32() % rung->step; -+ BUG_ON(!rung->vma_root.rnode); -+ slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter); -+ BUG_ON(!slot); -+ -+ rung->current_scan = slot; -+ rung->current_offset = slot_iter_index; -+} -+ -+static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot) -+{ -+ return &slot->rung->vma_root; -+} -+ -+/* -+ * return if resetted. -+ */ -+static int advance_current_scan(struct scan_rung *rung) -+{ -+ unsigned short n; -+ struct vma_slot *slot, *next = NULL; -+ -+ BUG_ON(!rung->vma_root.num); -+ -+ slot = rung->current_scan; -+ n = (slot->pages - rung->current_offset) % rung->step; -+ slot_iter_index = rung->step - n; -+ next = sradix_tree_next(&rung->vma_root, slot->snode, -+ slot->sindex, slot_iter); -+ -+ if (next) { -+ rung->current_offset = slot_iter_index; -+ rung->current_scan = next; -+ return 0; -+ } else { -+ reset_current_scan(rung, 1, 0); -+ return 1; -+ } -+} -+ -+static inline void rung_rm_slot(struct vma_slot *slot) -+{ -+ struct scan_rung *rung = slot->rung; -+ struct sradix_tree_root *root; -+ -+ if (rung->current_scan == slot) -+ advance_current_scan(rung); -+ -+ root = slot_get_root(slot); -+ sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex); -+ slot->snode = NULL; -+ if (step_need_recalc(rung)) { -+ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); -+ BUG_ON(step_need_recalc(rung)); -+ } -+ -+ /* In case advance_current_scan loop back to this slot again */ -+ if (rung->vma_root.num && rung->current_scan == slot) -+ reset_current_scan(slot->rung, 1, 0); -+} -+ -+static inline void rung_add_new_slots(struct scan_rung *rung, -+ struct vma_slot **slots, unsigned long num) -+{ -+ int err; -+ struct vma_slot *slot; -+ unsigned long i; -+ struct sradix_tree_root *root = &rung->vma_root; -+ -+ err = sradix_tree_enter(root, (void **)slots, num); -+ BUG_ON(err); -+ -+ for (i = 0; i < num; i++) { -+ slot = slots[i]; -+ slot->rung = rung; -+ BUG_ON(vma_fully_scanned(slot)); -+ } -+ -+ if (rung->vma_root.num == num) -+ reset_current_scan(rung, 0, 1); -+} -+ -+static inline int rung_add_one_slot(struct scan_rung *rung, -+ struct vma_slot *slot) -+{ -+ int err; -+ -+ err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1); -+ if (err) -+ return err; -+ -+ slot->rung = rung; -+ if (rung->vma_root.num == 1) -+ reset_current_scan(rung, 0, 1); -+ -+ return 0; -+} -+ -+/* -+ * Return true if the slot is deleted from its rung. 
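uksm_calc_rung_step() above sizes the sampling stride from the rung's time budget: sampled = cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE) * ratio / page_time, floored at RUNG_SAMPLED_MIN, then step = pages / sampled. A worked example with invented numbers (the constants here are illustrative stand-ins, not necessarily the patch's values):

#include <stdio.h>

#define NSEC_PER_MSEC    1000000UL
#define TIME_RATIO_SCALE 100     /* assumed scale, for arithmetic only */
#define RUNG_SAMPLED_MIN 3

int main(void)
{
	/* hypothetical rung: 1,000,000 pages, 500 ms budget,
	 * cpu ratio 20, ~2000 ns measured per page */
	unsigned long cover_msecs = 500, ratio = 20, page_time = 2000;
	unsigned long pages = 1000000;

	unsigned long sampled = cover_msecs
				* (NSEC_PER_MSEC / TIME_RATIO_SCALE)
				* ratio / page_time;
	if (sampled < RUNG_SAMPLED_MIN)
		sampled = RUNG_SAMPLED_MIN;

	unsigned long step = pages > sampled ? pages / sampled : 1;

	/* sampled = 500 * 10000 * 20 / 2000 = 50000, so step = 20 */
	printf("sampled=%lu step=%lu\n", sampled, step);
	return 0;
}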
-+ */ -+static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung) -+{ -+ struct scan_rung *old_rung = slot->rung; -+ int err; -+ -+ if (old_rung == rung) -+ return 0; -+ -+ rung_rm_slot(slot); -+ err = rung_add_one_slot(rung, slot); -+ if (err) { -+ err = rung_add_one_slot(old_rung, slot); -+ WARN_ON(err); /* OOPS, badly OOM, we lost this slot */ -+ } -+ -+ return 1; -+} -+ -+static inline int vma_rung_up(struct vma_slot *slot) -+{ -+ struct scan_rung *rung; -+ -+ rung = slot->rung; -+ if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1]) -+ rung++; -+ -+ return vma_rung_enter(slot, rung); -+} -+ -+static inline int vma_rung_down(struct vma_slot *slot) -+{ -+ struct scan_rung *rung; -+ -+ rung = slot->rung; -+ if (slot->rung != &uksm_scan_ladder[0]) -+ rung--; -+ -+ return vma_rung_enter(slot, rung); -+} -+ -+/** -+ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. -+ */ -+static unsigned long cal_dedup_ratio(struct vma_slot *slot) -+{ -+ unsigned long ret; -+ unsigned long pages; -+ -+ pages = slot->this_sampled; -+ if (!pages) -+ return 0; -+ -+ BUG_ON(slot->pages_scanned == slot->last_scanned); -+ -+ ret = slot->pages_merged; -+ -+ /* Thrashing area filtering */ -+ if (ret && uksm_thrash_threshold) { -+ if (slot->pages_cowed * 100 / slot->pages_merged -+ > uksm_thrash_threshold) { -+ ret = 0; -+ } else { -+ ret = slot->pages_merged - slot->pages_cowed; -+ } -+ } -+ -+ return ret * 100 / pages; -+} -+ -+/** -+ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. -+ */ -+static unsigned long cal_dedup_ratio_old(struct vma_slot *slot) -+{ -+ unsigned long ret; -+ unsigned long pages; -+ -+ pages = slot->pages; -+ if (!pages) -+ return 0; -+ -+ ret = slot->pages_bemerged; -+ -+ /* Thrashing area filtering */ -+ if (ret && uksm_thrash_threshold) { -+ if (slot->pages_cowed * 100 / slot->pages_bemerged -+ > uksm_thrash_threshold) { -+ ret = 0; -+ } else { -+ ret = slot->pages_bemerged - slot->pages_cowed; -+ } -+ } -+ -+ return ret * 100 / pages; -+} -+ -+/** -+ * stable_node_reinsert() - When the hash_strength has been adjusted, the -+ * stable tree need to be restructured, this is the function re-inserting the -+ * stable node. 
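cal_dedup_ratio() above filters thrashing areas: if more than uksm_thrash_threshold percent of a slot's merged pages were COW-broken again, the slot scores zero; otherwise the re-COWed pages are subtracted before the percentage is taken against the sampled count. The same arithmetic with made-up slot numbers:

#include <stdio.h>

int main(void)
{
	/* hypothetical slot: 1000 pages sampled, 400 merged, 60 re-COWed */
	unsigned long sampled = 1000, merged = 400, cowed = 60;
	unsigned long threshold = 20; /* plays uksm_thrash_threshold */
	unsigned long ret = merged;

	if (ret && threshold) {
		if (cowed * 100 / merged > threshold)
			ret = 0;              /* thrashing area: ignore */
		else
			ret = merged - cowed; /* discount re-COWed pages */
	}

	/* 60*100/400 = 15 <= 20, so ratio = 340*100/1000 = 34% */
	printf("dedup ratio = %lu%%\n", ret * 100 / sampled);
	return 0;
}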
-+ */ -+static inline void stable_node_reinsert(struct stable_node *new_node, -+ struct page *page, -+ struct rb_root *root_treep, -+ struct list_head *tree_node_listp, -+ u32 hash) -+{ -+ struct rb_node **new = &root_treep->rb_node; -+ struct rb_node *parent = NULL; -+ struct stable_node *stable_node; -+ struct tree_node *tree_node; -+ struct page *tree_page; -+ int cmp; -+ -+ while (*new) { -+ int cmp; -+ -+ tree_node = rb_entry(*new, struct tree_node, node); -+ -+ cmp = hash_cmp(hash, tree_node->hash); -+ -+ if (cmp < 0) { -+ parent = *new; -+ new = &parent->rb_left; -+ } else if (cmp > 0) { -+ parent = *new; -+ new = &parent->rb_right; -+ } else -+ break; -+ } -+ -+ if (*new) { -+ /* find a stable tree node with same first level hash value */ -+ stable_node_hash_max(new_node, page, hash); -+ if (tree_node->count == 1) { -+ stable_node = rb_entry(tree_node->sub_root.rb_node, -+ struct stable_node, node); -+ tree_page = get_uksm_page(stable_node, 1, 0); -+ if (tree_page) { -+ stable_node_hash_max(stable_node, -+ tree_page, hash); -+ put_page(tree_page); -+ -+ /* prepare for stable node insertion */ -+ -+ cmp = hash_cmp(new_node->hash_max, -+ stable_node->hash_max); -+ parent = &stable_node->node; -+ if (cmp < 0) -+ new = &parent->rb_left; -+ else if (cmp > 0) -+ new = &parent->rb_right; -+ else -+ goto failed; -+ -+ goto add_node; -+ } else { -+ /* the only stable_node deleted, the tree node -+ * was not deleted. -+ */ -+ goto tree_node_reuse; -+ } -+ } -+ -+ /* well, search the collision subtree */ -+ new = &tree_node->sub_root.rb_node; -+ parent = NULL; -+ BUG_ON(!*new); -+ while (*new) { -+ int cmp; -+ -+ stable_node = rb_entry(*new, struct stable_node, node); -+ -+ cmp = hash_cmp(new_node->hash_max, -+ stable_node->hash_max); -+ -+ if (cmp < 0) { -+ parent = *new; -+ new = &parent->rb_left; -+ } else if (cmp > 0) { -+ parent = *new; -+ new = &parent->rb_right; -+ } else { -+ /* oh, no, still a collision */ -+ goto failed; -+ } -+ } -+ -+ goto add_node; -+ } -+ -+ /* no tree node found */ -+ tree_node = alloc_tree_node(tree_node_listp); -+ if (!tree_node) { -+ pr_err("UKSM: memory allocation error!\n"); -+ goto failed; -+ } else { -+ tree_node->hash = hash; -+ rb_link_node(&tree_node->node, parent, new); -+ rb_insert_color(&tree_node->node, root_treep); -+ -+tree_node_reuse: -+ /* prepare for stable node insertion */ -+ parent = NULL; -+ new = &tree_node->sub_root.rb_node; -+ } -+ -+add_node: -+ rb_link_node(&new_node->node, parent, new); -+ rb_insert_color(&new_node->node, &tree_node->sub_root); -+ new_node->tree_node = tree_node; -+ tree_node->count++; -+ return; -+ -+failed: -+ /* This can only happen when two nodes have collided -+ * in two levels. -+ */ -+ new_node->tree_node = NULL; -+ return; -+} -+ -+static inline void free_all_tree_nodes(struct list_head *list) -+{ -+ struct tree_node *node, *tmp; -+ -+ list_for_each_entry_safe(node, tmp, list, all_list) { -+ free_tree_node(node); -+ } -+} -+ -+/** -+ * stable_tree_delta_hash() - Delta hash the stable tree from previous hash -+ * strength to the current hash_strength. It re-structures the hole tree. 
-+ */
-+static inline void stable_tree_delta_hash(u32 prev_hash_strength)
-+{
-+	struct stable_node *node, *tmp;
-+	struct rb_root *root_new_treep;
-+	struct list_head *new_tree_node_listp;
-+
-+	stable_tree_index = (stable_tree_index + 1) % 2;
-+	root_new_treep = &root_stable_tree[stable_tree_index];
-+	new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
-+	*root_new_treep = RB_ROOT;
-+	BUG_ON(!list_empty(new_tree_node_listp));
-+
-+	/*
-+	 * we need to be safe, the node could be removed by get_uksm_page()
-+	 */
-+	list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
-+		void *addr;
-+		struct page *node_page;
-+		u32 hash;
-+
-+		/*
-+		 * We are completely re-structuring the stable nodes to a new
-+		 * stable tree. We don't want to touch the old tree unlinks and
-+		 * old tree_nodes. The old tree_nodes will be freed at once.
-+		 */
-+		node_page = get_uksm_page(node, 0, 0);
-+		if (!node_page)
-+			continue;
-+
-+		if (node->tree_node) {
-+			hash = node->tree_node->hash;
-+
-+			addr = kmap_atomic(node_page);
-+
-+			hash = delta_hash(addr, prev_hash_strength,
-+					  hash_strength, hash);
-+			kunmap_atomic(addr);
-+		} else {
-+			/*
-+			 * it was not inserted into the rbtree due to a
-+			 * collision in the last round of scanning.
-+			 */
-+			hash = page_hash(node_page, hash_strength, 0);
-+		}
-+
-+		stable_node_reinsert(node, node_page, root_new_treep,
-+				     new_tree_node_listp, hash);
-+		put_page(node_page);
-+	}
-+
-+	root_stable_treep = root_new_treep;
-+	free_all_tree_nodes(stable_tree_node_listp);
-+	BUG_ON(!list_empty(stable_tree_node_listp));
-+	stable_tree_node_listp = new_tree_node_listp;
-+}
-+
-+static inline void inc_hash_strength(unsigned long delta)
-+{
-+	hash_strength += 1 << delta;
-+	if (hash_strength > HASH_STRENGTH_MAX)
-+		hash_strength = HASH_STRENGTH_MAX;
-+}
-+
-+static inline void dec_hash_strength(unsigned long delta)
-+{
-+	unsigned long change = 1 << delta;
-+
-+	if (hash_strength <= change + 1)
-+		hash_strength = 1;
-+	else
-+		hash_strength -= change;
-+}
-+
-+static inline void inc_hash_strength_delta(void)
-+{
-+	hash_strength_delta++;
-+	if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
-+		hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
-+}
-+
-+static inline unsigned long get_current_neg_ratio(void)
-+{
-+	u64 pos = benefit.pos;
-+	u64 neg = benefit.neg;
-+
-+	if (!neg)
-+		return 0;
-+
-+	if (!pos || neg > pos)
-+		return 100;
-+
-+	if (neg > div64_u64(U64_MAX, 100))
-+		pos = div64_u64(pos, 100);
-+	else
-+		neg *= 100;
-+
-+	return div64_u64(neg, pos);
-+}
-+
-+static inline unsigned long get_current_benefit(void)
-+{
-+	u64 pos = benefit.pos;
-+	u64 neg = benefit.neg;
-+	u64 scanned = benefit.scanned;
-+
-+	if (neg > pos)
-+		return 0;
-+
-+	return div64_u64((pos - neg), scanned);
-+}
-+
-+static inline int judge_rshash_direction(void)
-+{
-+	u64 current_neg_ratio, stable_benefit;
-+	u64 current_benefit, delta = 0;
-+	int ret = STILL;
-+
-+	/*
-+	 * Try to probe a value shortly after boot, and again in case the
-+	 * system stays stable for a long time.
-+ */ -+ if ((fully_scanned_round & 0xFFULL) == 10) { -+ ret = OBSCURE; -+ goto out; -+ } -+ -+ current_neg_ratio = get_current_neg_ratio(); -+ -+ if (current_neg_ratio == 0) { -+ rshash_neg_cont_zero++; -+ if (rshash_neg_cont_zero > 2) -+ return GO_DOWN; -+ else -+ return STILL; -+ } -+ rshash_neg_cont_zero = 0; -+ -+ if (current_neg_ratio > 90) { -+ ret = GO_UP; -+ goto out; -+ } -+ -+ current_benefit = get_current_benefit(); -+ stable_benefit = rshash_state.stable_benefit; -+ -+ if (!stable_benefit) { -+ ret = OBSCURE; -+ goto out; -+ } -+ -+ if (current_benefit > stable_benefit) -+ delta = current_benefit - stable_benefit; -+ else if (current_benefit < stable_benefit) -+ delta = stable_benefit - current_benefit; -+ -+ delta = div64_u64(100 * delta, stable_benefit); -+ -+ if (delta > 50) { -+ rshash_cont_obscure++; -+ if (rshash_cont_obscure > 2) -+ return OBSCURE; -+ else -+ return STILL; -+ } -+ -+out: -+ rshash_cont_obscure = 0; -+ return ret; -+} -+ -+/** -+ * rshash_adjust() - The main function to control the random sampling state -+ * machine for hash strength adapting. -+ * -+ * return true if hash_strength has changed. -+ */ -+static inline int rshash_adjust(void) -+{ -+ unsigned long prev_hash_strength = hash_strength; -+ -+ if (!encode_benefit()) -+ return 0; -+ -+ switch (rshash_state.state) { -+ case RSHASH_STILL: -+ switch (judge_rshash_direction()) { -+ case GO_UP: -+ if (rshash_state.pre_direct == GO_DOWN) -+ hash_strength_delta = 0; -+ -+ inc_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ rshash_state.stable_benefit = get_current_benefit(); -+ rshash_state.pre_direct = GO_UP; -+ break; -+ -+ case GO_DOWN: -+ if (rshash_state.pre_direct == GO_UP) -+ hash_strength_delta = 0; -+ -+ dec_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ rshash_state.stable_benefit = get_current_benefit(); -+ rshash_state.pre_direct = GO_DOWN; -+ break; -+ -+ case OBSCURE: -+ rshash_state.stable_point = hash_strength; -+ rshash_state.turn_point_down = hash_strength; -+ rshash_state.turn_point_up = hash_strength; -+ rshash_state.turn_benefit_down = get_current_benefit(); -+ rshash_state.turn_benefit_up = get_current_benefit(); -+ rshash_state.lookup_window_index = 0; -+ rshash_state.state = RSHASH_TRYDOWN; -+ dec_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ break; -+ -+ case STILL: -+ break; -+ default: -+ BUG(); -+ } -+ break; -+ -+ case RSHASH_TRYDOWN: -+ if (rshash_state.lookup_window_index++ % 5 == 0) -+ rshash_state.below_count = 0; -+ -+ if (get_current_benefit() < rshash_state.stable_benefit) -+ rshash_state.below_count++; -+ else if (get_current_benefit() > -+ rshash_state.turn_benefit_down) { -+ rshash_state.turn_point_down = hash_strength; -+ rshash_state.turn_benefit_down = get_current_benefit(); -+ } -+ -+ if (rshash_state.below_count >= 3 || -+ judge_rshash_direction() == GO_UP || -+ hash_strength == 1) { -+ hash_strength = rshash_state.stable_point; -+ hash_strength_delta = 0; -+ inc_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ rshash_state.lookup_window_index = 0; -+ rshash_state.state = RSHASH_TRYUP; -+ hash_strength_delta = 0; -+ } else { -+ dec_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ } -+ break; -+ -+ case RSHASH_TRYUP: -+ if (rshash_state.lookup_window_index++ % 5 == 0) -+ rshash_state.below_count = 0; -+ -+ if (get_current_benefit() < rshash_state.turn_benefit_down) -+ rshash_state.below_count++; -+ else if (get_current_benefit() > 
rshash_state.turn_benefit_up) { -+ rshash_state.turn_point_up = hash_strength; -+ rshash_state.turn_benefit_up = get_current_benefit(); -+ } -+ -+ if (rshash_state.below_count >= 3 || -+ judge_rshash_direction() == GO_DOWN || -+ hash_strength == HASH_STRENGTH_MAX) { -+ hash_strength = rshash_state.turn_benefit_up > -+ rshash_state.turn_benefit_down ? -+ rshash_state.turn_point_up : -+ rshash_state.turn_point_down; -+ -+ rshash_state.state = RSHASH_PRE_STILL; -+ } else { -+ inc_hash_strength(hash_strength_delta); -+ inc_hash_strength_delta(); -+ } -+ -+ break; -+ -+ case RSHASH_NEW: -+ case RSHASH_PRE_STILL: -+ rshash_state.stable_benefit = get_current_benefit(); -+ rshash_state.state = RSHASH_STILL; -+ hash_strength_delta = 0; -+ break; -+ default: -+ BUG(); -+ } -+ -+ /* rshash_neg = rshash_pos = 0; */ -+ reset_benefit(); -+ -+ if (prev_hash_strength != hash_strength) -+ stable_tree_delta_hash(prev_hash_strength); -+ -+ return prev_hash_strength != hash_strength; -+} -+ -+/** -+ * round_update_ladder() - The main function to do update of all the -+ * adjustments whenever a scan round is finished. -+ */ -+static noinline void round_update_ladder(void) -+{ -+ int i; -+ unsigned long dedup; -+ struct vma_slot *slot, *tmp_slot; -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) -+ uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED; -+ -+ list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) { -+ -+ /* slot may be rung_rm_slot() when mm exits */ -+ if (slot->snode) { -+ dedup = cal_dedup_ratio_old(slot); -+ if (dedup && dedup >= uksm_abundant_threshold) -+ vma_rung_up(slot); -+ } -+ -+ slot->pages_bemerged = 0; -+ slot->pages_cowed = 0; -+ -+ list_del_init(&slot->dedup_list); -+ } -+} -+ -+static void uksm_del_vma_slot(struct vma_slot *slot) -+{ -+ int i, j; -+ struct rmap_list_entry *entry; -+ -+ if (slot->snode) { -+ /* -+ * In case it just failed when entering the rung, it's not -+ * necessary. 
-+		 */
-+		rung_rm_slot(slot);
-+	}
-+
-+	if (!list_empty(&slot->dedup_list))
-+		list_del(&slot->dedup_list);
-+
-+	if (!slot->rmap_list_pool || !slot->pool_counts) {
-+		/* In case it OOMed in uksm_vma_enter() */
-+		goto out;
-+	}
-+
-+	for (i = 0; i < slot->pool_size; i++) {
-+		void *addr;
-+
-+		if (!slot->rmap_list_pool[i])
-+			continue;
-+
-+		addr = kmap(slot->rmap_list_pool[i]);
-+		for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
-+			entry = (struct rmap_list_entry *)addr + j;
-+			if (is_addr(entry->addr))
-+				continue;
-+			if (!entry->item)
-+				continue;
-+
-+			remove_rmap_item_from_tree(entry->item);
-+			free_rmap_item(entry->item);
-+			slot->pool_counts[i]--;
-+		}
-+		BUG_ON(slot->pool_counts[i]);
-+		kunmap(slot->rmap_list_pool[i]);
-+		__free_page(slot->rmap_list_pool[i]);
-+	}
-+	kfree(slot->rmap_list_pool);
-+	kfree(slot->pool_counts);
-+
-+out:
-+	slot->rung = NULL;
-+	if (slot->flags & UKSM_SLOT_IN_UKSM) {
-+		BUG_ON(uksm_pages_total < slot->pages);
-+		uksm_pages_total -= slot->pages;
-+	}
-+
-+	if (slot->fully_scanned_round == fully_scanned_round)
-+		scanned_virtual_pages -= slot->pages;
-+	else
-+		scanned_virtual_pages -= slot->pages_scanned;
-+	free_vma_slot(slot);
-+}
-+
-+
-+#define SPIN_LOCK_PERIOD 32
-+static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD];
-+static inline void cleanup_vma_slots(void)
-+{
-+	struct vma_slot *slot;
-+	int i;
-+
-+	i = 0;
-+	spin_lock(&vma_slot_list_lock);
-+	while (!list_empty(&vma_slot_del)) {
-+		slot = list_entry(vma_slot_del.next,
-+				  struct vma_slot, slot_list);
-+		list_del(&slot->slot_list);
-+		cleanup_slots[i++] = slot;
-+		if (i == SPIN_LOCK_PERIOD) {
-+			spin_unlock(&vma_slot_list_lock);
-+			while (--i >= 0)
-+				uksm_del_vma_slot(cleanup_slots[i]);
-+			i = 0;
-+			spin_lock(&vma_slot_list_lock);
-+		}
-+	}
-+	spin_unlock(&vma_slot_list_lock);
-+
-+	while (--i >= 0)
-+		uksm_del_vma_slot(cleanup_slots[i]);
-+}
-+
-+/*
-+ * Exponential moving average formula
-+ */
-+static inline unsigned long ema(unsigned long curr, unsigned long last_ema)
-+{
-+	/*
-+	 * For a very high burst, even the ema cannot work well: a falsely
-+	 * high per-page time estimate feeds back into very high context
-+	 * switch and rung update overhead, which then leads to an even
-+	 * higher per-page time, so the estimate may never converge.
-+	 *
-+	 * Instead, we try to approach this value in a binary manner.
-+	 */
-+	if (curr > last_ema * 10)
-+		return last_ema * 2;
-+
-+	return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
-+}
-+
-+/*
-+ * convert cpu ratio in 1/TIME_RATIO_SCALE configured by user to
-+ * nanoseconds based on current uksm_sleep_jiffies.
-+ */
-+static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio)
-+{
-+	return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) /
-+		(TIME_RATIO_SCALE - ratio) * ratio;
-+}
-+
-+
-+static inline unsigned long rung_real_ratio(int cpu_time_ratio)
-+{
-+	unsigned long ret;
-+
-+	BUG_ON(!cpu_time_ratio);
-+
-+	if (cpu_time_ratio > 0)
-+		ret = cpu_time_ratio;
-+	else
-+		ret = (unsigned long)(-cpu_time_ratio) *
-+			uksm_max_cpu_percentage / 100UL;
-+
-+	return ret ?
ret : 1; -+} -+ -+static noinline void uksm_calc_scan_pages(void) -+{ -+ struct scan_rung *ladder = uksm_scan_ladder; -+ unsigned long sleep_usecs, nsecs; -+ unsigned long ratio; -+ int i; -+ unsigned long per_page; -+ -+ if (uksm_ema_page_time > 100000 || -+ (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL)) -+ uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; -+ -+ per_page = uksm_ema_page_time; -+ BUG_ON(!per_page); -+ -+ /* -+ * For every 8 eval round, we try to probe a uksm_sleep_jiffies value -+ * based on saved user input. -+ */ -+ if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL) -+ uksm_sleep_jiffies = uksm_sleep_saved; -+ -+ /* We require a rung scan at least 1 page in a period. */ -+ nsecs = per_page; -+ ratio = rung_real_ratio(ladder[0].cpu_ratio); -+ if (cpu_ratio_to_nsec(ratio) < nsecs) { -+ sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio -+ / NSEC_PER_USEC; -+ uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1; -+ } -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ ratio = rung_real_ratio(ladder[i].cpu_ratio); -+ ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) / -+ per_page; -+ BUG_ON(!ladder[i].pages_to_scan); -+ uksm_calc_rung_step(&ladder[i], per_page, ratio); -+ } -+} -+ -+/* -+ * From the scan time of this round (ns) to next expected min sleep time -+ * (ms), be careful of the possible overflows. ratio is taken from -+ * rung_real_ratio() -+ */ -+static inline -+unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio) -+{ -+ scan_time >>= 20; /* to msec level now */ -+ BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE)); -+ -+ return (unsigned int) ((unsigned long) scan_time * -+ (TIME_RATIO_SCALE - ratio) / ratio); -+} -+ -+#define __round_mask(x, y) ((__typeof__(x))((y)-1)) -+#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) -+ -+static void uksm_vma_enter(struct vma_slot **slots, unsigned long num) -+{ -+ struct scan_rung *rung; -+ -+ rung = &uksm_scan_ladder[0]; -+ rung_add_new_slots(rung, slots, num); -+} -+ -+static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE]; -+ -+static void uksm_enter_all_slots(void) -+{ -+ struct vma_slot *slot; -+ unsigned long index; -+ struct list_head empty_vma_list; -+ int i; -+ -+ i = 0; -+ index = 0; -+ INIT_LIST_HEAD(&empty_vma_list); -+ -+ spin_lock(&vma_slot_list_lock); -+ while (!list_empty(&vma_slot_new)) { -+ slot = list_entry(vma_slot_new.next, -+ struct vma_slot, slot_list); -+ -+ if (!slot->vma->anon_vma) { -+ list_move(&slot->slot_list, &empty_vma_list); -+ } else if (vma_can_enter(slot->vma)) { -+ batch_slots[index++] = slot; -+ list_del_init(&slot->slot_list); -+ } else { -+ list_move(&slot->slot_list, &vma_slot_noadd); -+ } -+ -+ if (++i == SPIN_LOCK_PERIOD || -+ (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) { -+ spin_unlock(&vma_slot_list_lock); -+ -+ if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) { -+ uksm_vma_enter(batch_slots, index); -+ index = 0; -+ } -+ i = 0; -+ cond_resched(); -+ spin_lock(&vma_slot_list_lock); -+ } -+ } -+ -+ list_splice(&empty_vma_list, &vma_slot_new); -+ -+ spin_unlock(&vma_slot_list_lock); -+ -+ if (index) -+ uksm_vma_enter(batch_slots, index); -+ -+} -+ -+static inline int rung_round_finished(struct scan_rung *rung) -+{ -+ return rung->flags & UKSM_RUNG_ROUND_FINISHED; -+} -+ -+static inline void judge_slot(struct vma_slot *slot) -+{ -+ struct scan_rung *rung = slot->rung; -+ unsigned long dedup; -+ int deleted; -+ -+ dedup = cal_dedup_ratio(slot); -+ if (vma_fully_scanned(slot) && uksm_thrash_threshold) -+ deleted = 
vma_rung_enter(slot, &uksm_scan_ladder[0]);
-+	else if (dedup && dedup >= uksm_abundant_threshold)
-+		deleted = vma_rung_up(slot);
-+	else
-+		deleted = vma_rung_down(slot);
-+
-+	slot->pages_merged = 0;
-+	slot->pages_cowed = 0;
-+	slot->this_sampled = 0;
-+
-+	if (vma_fully_scanned(slot))
-+		slot->pages_scanned = 0;
-+
-+	slot->last_scanned = slot->pages_scanned;
-+
-+	/* If it was deleted above, then the rung was already advanced. */
-+	if (!deleted)
-+		advance_current_scan(rung);
-+}
-+
-+
-+static inline int hash_round_finished(void)
-+{
-+	if (scanned_virtual_pages > (uksm_pages_total >> 2)) {
-+		scanned_virtual_pages = 0;
-+		if (uksm_pages_scanned)
-+			fully_scanned_round++;
-+
-+		return 1;
-+	} else {
-+		return 0;
-+	}
-+}
-+
-+#define UKSM_MMSEM_BATCH 5
-+#define BUSY_RETRY 100
-+
-+/**
-+ * uksm_do_scan() - the main worker function.
-+ */
-+static noinline void uksm_do_scan(void)
-+{
-+	struct vma_slot *slot, *iter;
-+	struct mm_struct *busy_mm;
-+	unsigned char round_finished, all_rungs_empty;
-+	int i, err, mmsem_batch;
-+	unsigned long pcost;
-+	long long delta_exec;
-+	unsigned long vpages, max_cpu_ratio;
-+	unsigned long long start_time, end_time, scan_time;
-+	unsigned int expected_jiffies;
-+
-+	might_sleep();
-+
-+	vpages = 0;
-+
-+	start_time = task_sched_runtime(current);
-+	max_cpu_ratio = 0;
-+	mmsem_batch = 0;
-+
-+	for (i = 0; i < SCAN_LADDER_SIZE;) {
-+		struct scan_rung *rung = &uksm_scan_ladder[i];
-+		unsigned long ratio;
-+		int busy_retry;
-+
-+		if (!rung->pages_to_scan) {
-+			i++;
-+			continue;
-+		}
-+
-+		if (!rung->vma_root.num) {
-+			rung->pages_to_scan = 0;
-+			i++;
-+			continue;
-+		}
-+
-+		ratio = rung_real_ratio(rung->cpu_ratio);
-+		if (ratio > max_cpu_ratio)
-+			max_cpu_ratio = ratio;
-+
-+		busy_retry = BUSY_RETRY;
-+		/*
-+		 * Do not consider rung_round_finished() here, just use up
-+		 * the rung->pages_to_scan quota.
-+		 */
-+		while (rung->pages_to_scan && rung->vma_root.num &&
-+		       likely(!freezing(current))) {
-+			int reset = 0;
-+
-+			slot = rung->current_scan;
-+
-+			BUG_ON(vma_fully_scanned(slot));
-+
-+			if (mmsem_batch)
-+				err = 0;
-+			else
-+				err = try_down_read_slot_mmap_sem(slot);
-+
-+			if (err == -ENOENT) {
-+rm_slot:
-+				rung_rm_slot(slot);
-+				continue;
-+			}
-+
-+			busy_mm = slot->mm;
-+
-+			if (err == -EBUSY) {
-+				/* skip other vmas on the same mm */
-+				do {
-+					reset = advance_current_scan(rung);
-+					iter = rung->current_scan;
-+					busy_retry--;
-+					if (iter->vma->vm_mm != busy_mm ||
-+					    !busy_retry || reset)
-+						break;
-+				} while (1);
-+
-+				if (iter->vma->vm_mm != busy_mm) {
-+					continue;
-+				} else {
-+					/* scan round finished */
-+					break;
-+				}
-+			}
-+
-+			BUG_ON(!vma_can_enter(slot->vma));
-+			if (uksm_test_exit(slot->vma->vm_mm)) {
-+				mmsem_batch = 0;
-+				up_read(&slot->vma->vm_mm->mmap_sem);
-+				goto rm_slot;
-+			}
-+
-+			if (mmsem_batch)
-+				mmsem_batch--;
-+			else
-+				mmsem_batch = UKSM_MMSEM_BATCH;
-+
-+			/* Ok, we have taken the mmap_sem, ready to scan */
-+			scan_vma_one_page(slot);
-+			rung->pages_to_scan--;
-+			vpages++;
-+
-+			if (rung->current_offset + rung->step > slot->pages - 1
-+			    || vma_fully_scanned(slot)) {
-+				up_read(&slot->vma->vm_mm->mmap_sem);
-+				judge_slot(slot);
-+				mmsem_batch = 0;
-+			} else {
-+				rung->current_offset += rung->step;
-+				if (!mmsem_batch)
-+					up_read(&slot->vma->vm_mm->mmap_sem);
-+			}
-+
-+			busy_retry = BUSY_RETRY;
-+			cond_resched();
-+		}
-+
-+		if (mmsem_batch) {
-+			up_read(&slot->vma->vm_mm->mmap_sem);
-+			mmsem_batch = 0;
-+		}
-+
-+		if (freezing(current))
-+			break;
-+
-+		cond_resched();
-+	}
-+	end_time = task_sched_runtime(current);
-+	delta_exec = end_time - start_time;
-+
-+	if (freezing(current))
-+		return;
-+
-+	cleanup_vma_slots();
-+	uksm_enter_all_slots();
-+
-+	round_finished = 1;
-+	all_rungs_empty = 1;
-+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+		struct scan_rung *rung = &uksm_scan_ladder[i];
-+
-+		if (rung->vma_root.num) {
-+			all_rungs_empty = 0;
-+			if (!rung_round_finished(rung))
-+				round_finished = 0;
-+		}
-+	}
-+
-+	if (all_rungs_empty)
-+		round_finished = 0;
-+
-+	if (round_finished) {
-+		round_update_ladder();
-+		uksm_eval_round++;
-+
-+		if (hash_round_finished() && rshash_adjust()) {
-+			/* Reset the unstable root iff hash strength changed */
-+			uksm_hash_round++;
-+			root_unstable_tree = RB_ROOT;
-+			free_all_tree_nodes(&unstable_tree_node_list);
-+		}
-+
-+		/*
-+		 * A number of pages can hang around indefinitely on per-cpu
-+		 * pagevecs, raised page count preventing write_protect_page
-+		 * from merging them. Though it doesn't really matter much,
-+		 * it is puzzling to see some stuck in pages_volatile until
-+		 * other activity jostles them out, and they also prevented
-+		 * LTP's KSM test from succeeding deterministically; so drain
-+		 * them here (here rather than on entry to uksm_do_scan(),
-+		 * so we don't IPI too often when pages_to_scan is set low).
-+ */ -+ lru_add_drain_all(); -+ } -+ -+ -+ if (vpages && delta_exec > 0) { -+ pcost = (unsigned long) delta_exec / vpages; -+ if (likely(uksm_ema_page_time)) -+ uksm_ema_page_time = ema(pcost, uksm_ema_page_time); -+ else -+ uksm_ema_page_time = pcost; -+ } -+ -+ uksm_calc_scan_pages(); -+ uksm_sleep_real = uksm_sleep_jiffies; -+ /* in case of radical cpu bursts, apply the upper bound */ -+ end_time = task_sched_runtime(current); -+ if (max_cpu_ratio && end_time > start_time) { -+ scan_time = end_time - start_time; -+ expected_jiffies = msecs_to_jiffies( -+ scan_time_to_sleep(scan_time, max_cpu_ratio)); -+ -+ if (expected_jiffies > uksm_sleep_real) -+ uksm_sleep_real = expected_jiffies; -+ -+ /* We have a 1 second up bound for responsiveness. */ -+ if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC) -+ uksm_sleep_real = msecs_to_jiffies(1000); -+ } -+ -+ return; -+} -+ -+static int ksmd_should_run(void) -+{ -+ return uksm_run & UKSM_RUN_MERGE; -+} -+ -+static int uksm_scan_thread(void *nothing) -+{ -+ set_freezable(); -+ set_user_nice(current, 5); -+ -+ while (!kthread_should_stop()) { -+ mutex_lock(&uksm_thread_mutex); -+ if (ksmd_should_run()) -+ uksm_do_scan(); -+ mutex_unlock(&uksm_thread_mutex); -+ -+ try_to_freeze(); -+ -+ if (ksmd_should_run()) { -+ schedule_timeout_interruptible(uksm_sleep_real); -+ uksm_sleep_times++; -+ } else { -+ wait_event_freezable(uksm_thread_wait, -+ ksmd_should_run() || kthread_should_stop()); -+ } -+ } -+ return 0; -+} -+ -+void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) -+{ -+ struct stable_node *stable_node; -+ struct node_vma *node_vma; -+ struct rmap_item *rmap_item; -+ int search_new_forks = 0; -+ unsigned long address; -+ -+ VM_BUG_ON_PAGE(!PageKsm(page), page); -+ VM_BUG_ON_PAGE(!PageLocked(page), page); -+ -+ stable_node = page_stable_node(page); -+ if (!stable_node) -+ return; -+again: -+ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { -+ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { -+ struct anon_vma *anon_vma = rmap_item->anon_vma; -+ struct anon_vma_chain *vmac; -+ struct vm_area_struct *vma; -+ -+ cond_resched(); -+ anon_vma_lock_read(anon_vma); -+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, -+ 0, ULONG_MAX) { -+ cond_resched(); -+ vma = vmac->vma; -+ address = get_rmap_addr(rmap_item); -+ -+ if (address < vma->vm_start || -+ address >= vma->vm_end) -+ continue; -+ -+ if ((rmap_item->slot->vma == vma) == -+ search_new_forks) -+ continue; -+ -+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) -+ continue; -+ -+ if (!rwc->rmap_one(page, vma, address, rwc->arg)) { -+ anon_vma_unlock_read(anon_vma); -+ return; -+ } -+ -+ if (rwc->done && rwc->done(page)) { -+ anon_vma_unlock_read(anon_vma); -+ return; -+ } -+ } -+ anon_vma_unlock_read(anon_vma); -+ } -+ } -+ if (!search_new_forks++) -+ goto again; -+} -+ -+#ifdef CONFIG_MIGRATION -+/* Common ksm interface but may be specific to uksm */ -+void ksm_migrate_page(struct page *newpage, struct page *oldpage) -+{ -+ struct stable_node *stable_node; -+ -+ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); -+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); -+ VM_BUG_ON(newpage->mapping != oldpage->mapping); -+ -+ stable_node = page_stable_node(newpage); -+ if (stable_node) { -+ VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); -+ stable_node->kpfn = page_to_pfn(newpage); -+ /* -+ * newpage->mapping was set in advance; now we need smp_wmb() -+ * to make sure that the new stable_node->kpfn is visible -+ * to get_ksm_page() before it 
can see that oldpage->mapping -+ * has gone stale (or that PageSwapCache has been cleared). -+ */ -+ smp_wmb(); -+ set_page_stable_node(oldpage, NULL); -+ } -+} -+#endif /* CONFIG_MIGRATION */ -+ -+#ifdef CONFIG_MEMORY_HOTREMOVE -+static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn, -+ unsigned long end_pfn) -+{ -+ struct rb_node *node; -+ -+ for (node = rb_first(root_stable_treep); node; node = rb_next(node)) { -+ struct stable_node *stable_node; -+ -+ stable_node = rb_entry(node, struct stable_node, node); -+ if (stable_node->kpfn >= start_pfn && -+ stable_node->kpfn < end_pfn) -+ return stable_node; -+ } -+ return NULL; -+} -+ -+static int uksm_memory_callback(struct notifier_block *self, -+ unsigned long action, void *arg) -+{ -+ struct memory_notify *mn = arg; -+ struct stable_node *stable_node; -+ -+ switch (action) { -+ case MEM_GOING_OFFLINE: -+ /* -+ * Keep it very simple for now: just lock out ksmd and -+ * MADV_UNMERGEABLE while any memory is going offline. -+ * mutex_lock_nested() is necessary because lockdep was alarmed -+ * that here we take uksm_thread_mutex inside notifier chain -+ * mutex, and later take notifier chain mutex inside -+ * uksm_thread_mutex to unlock it. But that's safe because both -+ * are inside mem_hotplug_mutex. -+ */ -+ mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING); -+ break; -+ -+ case MEM_OFFLINE: -+ /* -+ * Most of the work is done by page migration; but there might -+ * be a few stable_nodes left over, still pointing to struct -+ * pages which have been offlined: prune those from the tree. -+ */ -+ while ((stable_node = uksm_check_stable_tree(mn->start_pfn, -+ mn->start_pfn + mn->nr_pages)) != NULL) -+ remove_node_from_stable_tree(stable_node, 1, 1); -+ /* fallthrough */ -+ -+ case MEM_CANCEL_OFFLINE: -+ mutex_unlock(&uksm_thread_mutex); -+ break; -+ } -+ return NOTIFY_OK; -+} -+#endif /* CONFIG_MEMORY_HOTREMOVE */ -+ -+#ifdef CONFIG_SYSFS -+/* -+ * This all compiles without CONFIG_SYSFS, but is a waste of space. 
-+ */ -+ -+#define UKSM_ATTR_RO(_name) \ -+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) -+#define UKSM_ATTR(_name) \ -+ static struct kobj_attribute _name##_attr = \ -+ __ATTR(_name, 0644, _name##_show, _name##_store) -+ -+static ssize_t max_cpu_percentage_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%u\n", uksm_max_cpu_percentage); -+} -+ -+static ssize_t max_cpu_percentage_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ unsigned long max_cpu_percentage; -+ int err; -+ -+ err = kstrtoul(buf, 10, &max_cpu_percentage); -+ if (err || max_cpu_percentage > 100) -+ return -EINVAL; -+ -+ if (max_cpu_percentage == 100) -+ max_cpu_percentage = 99; -+ else if (max_cpu_percentage < 10) -+ max_cpu_percentage = 10; -+ -+ uksm_max_cpu_percentage = max_cpu_percentage; -+ -+ return count; -+} -+UKSM_ATTR(max_cpu_percentage); -+ -+static ssize_t sleep_millisecs_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies)); -+} -+ -+static ssize_t sleep_millisecs_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ unsigned long msecs; -+ int err; -+ -+ err = kstrtoul(buf, 10, &msecs); -+ if (err || msecs > MSEC_PER_SEC) -+ return -EINVAL; -+ -+ uksm_sleep_jiffies = msecs_to_jiffies(msecs); -+ uksm_sleep_saved = uksm_sleep_jiffies; -+ -+ return count; -+} -+UKSM_ATTR(sleep_millisecs); -+ -+ -+static ssize_t cpu_governor_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); -+ int i; -+ -+ buf[0] = '\0'; -+ for (i = 0; i < n ; i++) { -+ if (uksm_cpu_governor == i) -+ strcat(buf, "["); -+ -+ strcat(buf, uksm_cpu_governor_str[i]); -+ -+ if (uksm_cpu_governor == i) -+ strcat(buf, "]"); -+ -+ strcat(buf, " "); -+ } -+ strcat(buf, "\n"); -+ -+ return strlen(buf); -+} -+ -+static inline void init_performance_values(void) -+{ -+ int i; -+ struct scan_rung *rung; -+ struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor; -+ -+ -+ for (i = 0; i < SCAN_LADDER_SIZE; i++) { -+ rung = uksm_scan_ladder + i; -+ rung->cpu_ratio = preset->cpu_ratio[i]; -+ rung->cover_msecs = preset->cover_msecs[i]; -+ } -+ -+ uksm_max_cpu_percentage = preset->max_cpu; -+} -+ -+static ssize_t cpu_governor_store(struct kobject *kobj, -+ struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); -+ -+ for (n--; n >= 0 ; n--) { -+ if (!strncmp(buf, uksm_cpu_governor_str[n], -+ strlen(uksm_cpu_governor_str[n]))) -+ break; -+ } -+ -+ if (n < 0) -+ return -EINVAL; -+ else -+ uksm_cpu_governor = n; -+ -+ init_performance_values(); -+ -+ return count; -+} -+UKSM_ATTR(cpu_governor); -+ -+static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, -+ char *buf) -+{ -+ return sprintf(buf, "%u\n", uksm_run); -+} -+ -+static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int err; -+ unsigned long flags; -+ -+ err = kstrtoul(buf, 10, &flags); -+ if (err || flags > UINT_MAX) -+ return -EINVAL; -+ if (flags > UKSM_RUN_MERGE) -+ return -EINVAL; -+ -+ mutex_lock(&uksm_thread_mutex); -+ if (uksm_run != flags) -+ uksm_run = flags; -+ mutex_unlock(&uksm_thread_mutex); -+ -+ if (flags & UKSM_RUN_MERGE) -+ wake_up_interruptible(&uksm_thread_wait); -+ -+ return count; -+} 
-+UKSM_ATTR(run);
-+
-+static ssize_t abundant_threshold_show(struct kobject *kobj,
-+				       struct kobj_attribute *attr, char *buf)
-+{
-+	return sprintf(buf, "%u\n", uksm_abundant_threshold);
-+}
-+
-+static ssize_t abundant_threshold_store(struct kobject *kobj,
-+					struct kobj_attribute *attr,
-+					const char *buf, size_t count)
-+{
-+	int err;
-+	unsigned long flags;
-+
-+	err = kstrtoul(buf, 10, &flags);
-+	if (err || flags > 99)
-+		return -EINVAL;
-+
-+	uksm_abundant_threshold = flags;
-+
-+	return count;
-+}
-+UKSM_ATTR(abundant_threshold);
-+
-+static ssize_t thrash_threshold_show(struct kobject *kobj,
-+				     struct kobj_attribute *attr, char *buf)
-+{
-+	return sprintf(buf, "%u\n", uksm_thrash_threshold);
-+}
-+
-+static ssize_t thrash_threshold_store(struct kobject *kobj,
-+				      struct kobj_attribute *attr,
-+				      const char *buf, size_t count)
-+{
-+	int err;
-+	unsigned long flags;
-+
-+	err = kstrtoul(buf, 10, &flags);
-+	if (err || flags > 99)
-+		return -EINVAL;
-+
-+	uksm_thrash_threshold = flags;
-+
-+	return count;
-+}
-+UKSM_ATTR(thrash_threshold);
-+
-+static ssize_t cpu_ratios_show(struct kobject *kobj,
-+			       struct kobj_attribute *attr, char *buf)
-+{
-+	int i, size;
-+	struct scan_rung *rung;
-+	char *p = buf;
-+
-+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+		rung = &uksm_scan_ladder[i];
-+
-+		if (rung->cpu_ratio > 0)
-+			size = sprintf(p, "%d ", rung->cpu_ratio);
-+		else
-+			size = sprintf(p, "MAX/%d ",
-+				       TIME_RATIO_SCALE / -rung->cpu_ratio);
-+
-+		p += size;
-+	}
-+
-+	*p++ = '\n';
-+	*p = '\0';
-+
-+	return p - buf;
-+}
-+
-+static ssize_t cpu_ratios_store(struct kobject *kobj,
-+				struct kobj_attribute *attr,
-+				const char *buf, size_t count)
-+{
-+	int i, cpuratios[SCAN_LADDER_SIZE], err;
-+	unsigned long value;
-+	struct scan_rung *rung;
-+	char *p, *base, *end = NULL;
-+	ssize_t ret = count;
-+
-+	/*
-+	 * +2 guarantees NUL termination; keep the allocation base around,
-+	 * since p is advanced while parsing and must not be passed to kfree.
-+	 */
-+	base = p = kzalloc(count + 2, GFP_KERNEL);
-+	if (!p)
-+		return -ENOMEM;
-+
-+	memcpy(p, buf, count);
-+
-+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+		if (i != SCAN_LADDER_SIZE - 1) {
-+			end = strchr(p, ' ');
-+			if (!end) {
-+				ret = -EINVAL;
-+				goto out;
-+			}
-+
-+			*end = '\0';
-+		}
-+
-+		if (strstr(p, "MAX/")) {
-+			p = strchr(p, '/') + 1;
-+			err = kstrtoul(p, 10, &value);
-+			if (err || value > TIME_RATIO_SCALE || !value) {
-+				ret = -EINVAL;
-+				goto out;
-+			}
-+
-+			cpuratios[i] = -(int) (TIME_RATIO_SCALE / value);
-+		} else {
-+			err = kstrtoul(p, 10, &value);
-+			if (err || value > TIME_RATIO_SCALE || !value) {
-+				ret = -EINVAL;
-+				goto out;
-+			}
-+
-+			cpuratios[i] = value;
-+		}
-+
-+		p = end + 1;
-+	}
-+
-+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+		rung = &uksm_scan_ladder[i];
-+
-+		rung->cpu_ratio = cpuratios[i];
-+	}
-+
-+out:
-+	kfree(base);
-+	return ret;
-+}
-+UKSM_ATTR(cpu_ratios);
-+
-+static ssize_t eval_intervals_show(struct kobject *kobj,
-+				   struct kobj_attribute *attr, char *buf)
-+{
-+	int i, size;
-+	struct scan_rung *rung;
-+	char *p = buf;
-+
-+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+		rung = &uksm_scan_ladder[i];
-+		size = sprintf(p, "%u ", rung->cover_msecs);
-+		p += size;
-+	}
-+
-+	*p++ = '\n';
-+	*p = '\0';
-+
-+	return p - buf;
-+}
-+
-+static ssize_t eval_intervals_store(struct kobject *kobj,
-+				    struct kobj_attribute *attr,
-+				    const char *buf, size_t count)
-+{
-+	int i, err;
-+	unsigned long values[SCAN_LADDER_SIZE];
-+	struct scan_rung *rung;
-+	char *p, *base, *end = NULL;
-+	ssize_t ret = count;
-+
-+	/* keep the allocation base; p is advanced while parsing */
-+	base = p = kzalloc(count + 2, GFP_KERNEL);
-+	if (!p)
-+		return -ENOMEM;
-+
-+	memcpy(p, buf, count);
-+
-+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+		if (i != SCAN_LADDER_SIZE - 1) {
-+			end = strchr(p, ' ');
-+			if (!end) {
-+				ret = -EINVAL;
-+				goto out;
-+			}
-+
-+			*end = '\0';
-+		}
-+
-+		err = kstrtoul(p, 10, &values[i]);
-+		if (err) {
-+			ret = -EINVAL;
-+			goto out;
-+		}
-+
-+		p = end + 1;
-+	}
-+
-+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+		rung = &uksm_scan_ladder[i];
-+
-+		rung->cover_msecs = values[i];
-+	}
-+
-+out:
-+	kfree(base);
-+	return ret;
-+}
-+UKSM_ATTR(eval_intervals);
-+
-+static ssize_t ema_per_page_time_show(struct kobject *kobj,
-+				      struct kobj_attribute *attr, char *buf)
-+{
-+	return sprintf(buf, "%lu\n", uksm_ema_page_time);
-+}
-+UKSM_ATTR_RO(ema_per_page_time);
-+
-+static ssize_t pages_shared_show(struct kobject *kobj,
-+				 struct kobj_attribute *attr, char *buf)
-+{
-+	return sprintf(buf, "%lu\n", uksm_pages_shared);
-+}
-+UKSM_ATTR_RO(pages_shared);
-+
-+static ssize_t pages_sharing_show(struct kobject *kobj,
-+				  struct kobj_attribute *attr, char *buf)
-+{
-+	return sprintf(buf, "%lu\n", uksm_pages_sharing);
-+}
-+UKSM_ATTR_RO(pages_sharing);
-+
-+static ssize_t pages_unshared_show(struct kobject *kobj,
-+				   struct kobj_attribute *attr, char *buf)
-+{
-+	return sprintf(buf, "%lu\n", uksm_pages_unshared);
-+}
-+UKSM_ATTR_RO(pages_unshared);
-+
-+static ssize_t full_scans_show(struct kobject *kobj,
-+			       struct kobj_attribute *attr, char *buf)
-+{
-+	return sprintf(buf, "%llu\n", fully_scanned_round);
-+}
-+UKSM_ATTR_RO(full_scans);
-+
-+static ssize_t pages_scanned_show(struct kobject *kobj,
-+				  struct kobj_attribute *attr, char *buf)
-+{
-+	unsigned long base = 0;
-+	u64 delta, ret;
-+
-+	if (pages_scanned_stored) {
-+		base = pages_scanned_base;
-+		ret = pages_scanned_stored;
-+		delta = uksm_pages_scanned >> base;
-+		if (CAN_OVERFLOW_U64(ret, delta)) {
-+			ret >>= 1;
-+			delta >>= 1;
-+			base++;
-+			ret += delta;
-+		}
-+	} else {
-+		ret = uksm_pages_scanned;
-+	}
-+
-+	while (ret > ULONG_MAX) {
-+		ret >>= 1;
-+		base++;
-+	}
-+
-+	if (base)
-+		return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
-+	else
-+		return sprintf(buf, "%lu\n", (unsigned long)ret);
-+}
-+UKSM_ATTR_RO(pages_scanned);
-+
-+static ssize_t hash_strength_show(struct kobject *kobj,
-+				  struct kobj_attribute *attr, char *buf)
-+{
-+	return sprintf(buf, "%lu\n", hash_strength);
-+}
-+UKSM_ATTR_RO(hash_strength);
-+
-+static ssize_t sleep_times_show(struct kobject *kobj,
-+				struct kobj_attribute *attr, char *buf)
-+{
-+	return sprintf(buf, "%llu\n", uksm_sleep_times);
-+}
-+UKSM_ATTR_RO(sleep_times);
-+
-+
-+static struct attribute *uksm_attrs[] = {
-+	&max_cpu_percentage_attr.attr,
-+	&sleep_millisecs_attr.attr,
-+	&cpu_governor_attr.attr,
-+	&run_attr.attr,
-+	&ema_per_page_time_attr.attr,
-+	&pages_shared_attr.attr,
-+	&pages_sharing_attr.attr,
-+	&pages_unshared_attr.attr,
-+	&full_scans_attr.attr,
-+	&pages_scanned_attr.attr,
-+	&hash_strength_attr.attr,
-+	&sleep_times_attr.attr,
-+	&thrash_threshold_attr.attr,
-+	&abundant_threshold_attr.attr,
-+	&cpu_ratios_attr.attr,
-+	&eval_intervals_attr.attr,
-+	NULL,
-+};
-+
-+static struct attribute_group uksm_attr_group = {
-+	.attrs = uksm_attrs,
-+	.name = "uksm",
-+};
-+#endif /* CONFIG_SYSFS */
-+
-+static inline void init_scan_ladder(void)
-+{
-+	int i;
-+	struct scan_rung *rung;
-+
-+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+		rung = uksm_scan_ladder + i;
-+		slot_tree_init_root(&rung->vma_root);
-+	}
-+
-+	init_performance_values();
-+	uksm_calc_scan_pages();
-+}
-+
-+static inline int cal_positive_negative_costs(void)
-+{
-+	struct page *p1, *p2;
-+	unsigned char *addr1, *addr2;
-+	unsigned long i, time_start, hash_cost;
-+	unsigned long loopnum = 0;
-+
-+	/* IMPORTANT: volatile is needed to prevent over-optimization by gcc. */
-+	volatile u32 hash;
-+	volatile int ret;
-+
-+	p1 = alloc_page(GFP_KERNEL);
-+	if (!p1)
-+		return -ENOMEM;
-+
-+	p2 = alloc_page(GFP_KERNEL);
-+	if (!p2) {
-+		__free_page(p1);
-+		return -ENOMEM;
-+	}
-+
-+	addr1 = kmap_atomic(p1);
-+	addr2 = kmap_atomic(p2);
-+	memset(addr1, prandom_u32(), PAGE_SIZE);
-+	memcpy(addr2, addr1, PAGE_SIZE);
-+
-+	/* make sure that the two pages differ in last byte */
-+	addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
-+	kunmap_atomic(addr2);
-+	kunmap_atomic(addr1);
-+
-+	time_start = jiffies;
-+	while (jiffies - time_start < 100) {
-+		for (i = 0; i < 100; i++)
-+			hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
-+		loopnum += 100;
-+	}
-+	hash_cost = (jiffies - time_start);
-+
-+	time_start = jiffies;
-+	for (i = 0; i < loopnum; i++)
-+		ret = pages_identical(p1, p2);
-+	memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
-+	memcmp_cost /= hash_cost;
-+	pr_info("UKSM: relative memcmp_cost = %lu "
-+		"hash=%u cmp_ret=%d.\n",
-+		memcmp_cost, hash, ret);
-+
-+	__free_page(p1);
-+	__free_page(p2);
-+	return 0;
-+}
-+
-+static int init_zeropage_hash_table(void)
-+{
-+	struct page *page;
-+	char *addr;
-+	int i;
-+
-+	page = alloc_page(GFP_KERNEL);
-+	if (!page)
-+		return -ENOMEM;
-+
-+	addr = kmap_atomic(page);
-+	memset(addr, 0, PAGE_SIZE);
-+	kunmap_atomic(addr);
-+
-+	zero_hash_table = kmalloc_array(HASH_STRENGTH_MAX, sizeof(u32),
-+					GFP_KERNEL);
-+	if (!zero_hash_table) {
-+		__free_page(page);
-+		return -ENOMEM;
-+	}
-+
-+	for (i = 0; i < HASH_STRENGTH_MAX; i++)
-+		zero_hash_table[i] = page_hash(page, i, 0);
-+
-+	__free_page(page);
-+
-+	return 0;
-+}
-+
-+static inline int init_random_sampling(void)
-+{
-+	unsigned long i;
-+
-+	random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
-+	if (!random_nums)
-+		return -ENOMEM;
-+
-+	for (i = 0; i < HASH_STRENGTH_FULL; i++)
-+		random_nums[i] = i;
-+
-+	for (i = 0; i < HASH_STRENGTH_FULL; i++) {
-+		unsigned long rand_range, swap_index, tmp;
-+
-+		rand_range = HASH_STRENGTH_FULL - i;
-+		swap_index = i + prandom_u32() % rand_range;
-+		tmp = random_nums[i];
-+		random_nums[i] = random_nums[swap_index];
-+		random_nums[swap_index] = tmp;
-+	}
-+
-+	rshash_state.state = RSHASH_NEW;
-+	rshash_state.below_count = 0;
-+	rshash_state.lookup_window_index = 0;
-+
-+	return cal_positive_negative_costs();
-+}
-+
-+static int __init uksm_slab_init(void)
-+{
-+	rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
-+	if (!rmap_item_cache)
-+		goto out;
-+
-+	stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
-+	if (!stable_node_cache)
-+		goto out_free1;
-+
-+	node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0);
-+	if (!node_vma_cache)
-+		goto out_free2;
-+
-+	vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0);
-+	if (!vma_slot_cache)
-+		goto out_free3;
-+
-+	tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0);
-+	if (!tree_node_cache)
-+		goto out_free4;
-+
-+	return 0;
-+
-+out_free4:
-+	kmem_cache_destroy(vma_slot_cache);
-+out_free3:
-+	kmem_cache_destroy(node_vma_cache);
-+out_free2:
-+	kmem_cache_destroy(stable_node_cache);
-+out_free1:
-+	kmem_cache_destroy(rmap_item_cache);
-+out:
-+	return -ENOMEM;
-+}
-+
-+static void __init uksm_slab_free(void)
-+{
-+	kmem_cache_destroy(stable_node_cache);
-+	kmem_cache_destroy(rmap_item_cache);
-+	kmem_cache_destroy(node_vma_cache);
-+	kmem_cache_destroy(vma_slot_cache);
-+	kmem_cache_destroy(tree_node_cache);
-+}
-+
-+/* Common interface to ksm, but different from it. */
-+int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
-+		unsigned long end, int advice, unsigned long *vm_flags)
-+{
-+	int err;
-+
-+	switch (advice) {
-+	case MADV_MERGEABLE:
-+		return 0;		/* just ignore the advice */
-+
-+	case MADV_UNMERGEABLE:
-+		if (!(*vm_flags & VM_MERGEABLE) || !uksm_flags_can_scan(*vm_flags))
-+			return 0;		/* just ignore the advice */
-+
-+		if (vma->anon_vma) {
-+			err = unmerge_uksm_pages(vma, start, end);
-+			if (err)
-+				return err;
-+		}
-+
-+		uksm_remove_vma(vma);
-+		*vm_flags &= ~VM_MERGEABLE;
-+		break;
-+	}
-+
-+	return 0;
-+}
-+
-+/* Common interface to ksm, actually the same. */
-+struct page *ksm_might_need_to_copy(struct page *page,
-+			struct vm_area_struct *vma, unsigned long address)
-+{
-+	struct anon_vma *anon_vma = page_anon_vma(page);
-+	struct page *new_page;
-+
-+	if (PageKsm(page)) {
-+		if (page_stable_node(page))
-+			return page;	/* no need to copy it */
-+	} else if (!anon_vma) {
-+		return page;		/* no need to copy it */
-+	} else if (anon_vma->root == vma->anon_vma->root &&
-+		   page->index == linear_page_index(vma, address)) {
-+		return page;		/* still no need to copy it */
-+	}
-+	if (!PageUptodate(page))
-+		return page;		/* let do_swap_page report the error */
-+
-+	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-+	if (new_page) {
-+		copy_user_highpage(new_page, page, address, vma);
-+
-+		SetPageDirty(new_page);
-+		__SetPageUptodate(new_page);
-+		__SetPageLocked(new_page);
-+	}
-+
-+	return new_page;
-+}
-+
-+static int __init uksm_init(void)
-+{
-+	struct task_struct *uksm_thread;
-+	int err;
-+
-+	uksm_sleep_jiffies = msecs_to_jiffies(100);
-+	uksm_sleep_saved = uksm_sleep_jiffies;
-+
-+	slot_tree_init();
-+	init_scan_ladder();
-+
-+
-+	err = init_random_sampling();
-+	if (err)
-+		goto out_free2;
-+
-+	err = uksm_slab_init();
-+	if (err)
-+		goto out_free1;
-+
-+	err = init_zeropage_hash_table();
-+	if (err)
-+		goto out_free0;
-+
-+	uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd");
-+	if (IS_ERR(uksm_thread)) {
-+		pr_err("uksm: creating kthread failed\n");
-+		err = PTR_ERR(uksm_thread);
-+		goto out_free;
-+	}
-+
-+#ifdef CONFIG_SYSFS
-+	err = sysfs_create_group(mm_kobj, &uksm_attr_group);
-+	if (err) {
-+		pr_err("uksm: register sysfs failed\n");
-+		kthread_stop(uksm_thread);
-+		goto out_free;
-+	}
-+#else
-+	uksm_run = UKSM_RUN_MERGE;	/* no way for user to start it */
-+
-+#endif /* CONFIG_SYSFS */
-+
-+#ifdef CONFIG_MEMORY_HOTREMOVE
-+	/*
-+	 * Choose a high priority since the callback takes uksm_thread_mutex:
-+	 * later callbacks could only be taking locks which nest within that.
-+	 */
-+	hotplug_memory_notifier(uksm_memory_callback, 100);
-+#endif
-+	return 0;
-+
-+out_free:
-+	kfree(zero_hash_table);
-+out_free0:
-+	uksm_slab_free();
-+out_free1:
-+	kfree(random_nums);
-+out_free2:
-+	kfree(uksm_scan_ladder);
-+	return err;
-+}
-+
-+#ifdef MODULE
-+subsys_initcall(uksm_init);
-+#else
-+late_initcall(uksm_init);
-+#endif
-+
-diff -Nur a/mm/vmstat.c b/mm/vmstat.c
---- a/mm/vmstat.c	2018-05-25 15:18:02.000000000 +0100
-+++ b/mm/vmstat.c	2018-05-26 19:30:55.791140570 +0100
-@@ -1091,6 +1091,9 @@
- 	"nr_dirtied",
- 	"nr_written",
- 
-+#ifdef CONFIG_UKSM
-+	"nr_uksm_zero_pages",
-+#endif
- 	/* enum writeback_stat_item counters */
- 	"nr_dirty_threshold",
- 	"nr_dirty_background_threshold",
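
For readers following the deleted scanner above: the ema() helper that feeds
uksm_calc_scan_pages() is a plain exponential moving average with a guard that
refuses to chase bursts, doubling toward them instead. Below is a minimal
userspace sketch of just that estimator. The value EMA_ALPHA = 20 (percent
weight of the newest sample), the file name, and the sample costs are
assumptions for illustration only; the real constant is defined elsewhere in
uksm.c.

/* ema_sketch.c - standalone sketch of the ema() estimator above.
 * EMA_ALPHA = 20 is an assumed weighting, used here for illustration. */
#include <stdio.h>

#define EMA_ALPHA 20	/* assumed: percent weight of the newest sample */

static unsigned long ema(unsigned long curr, unsigned long last_ema)
{
	/* Refuse to chase a burst; approach it by doubling instead. */
	if (curr > last_ema * 10)
		return last_ema * 2;

	return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
}

int main(void)
{
	/* hypothetical per-page scan costs in nanoseconds */
	unsigned long samples[] = { 480, 520, 9000, 650, 600 };
	unsigned long est = 500;	/* arbitrary starting estimate */
	unsigned long i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		est = ema(samples[i], est);
		printf("sample=%lu ns -> estimate=%lu ns\n", samples[i], est);
	}
	return 0;
}

With these assumed inputs the 9000 ns outlier moves the estimate from roughly
500 ns to 1000 ns rather than being averaged in directly, which is what keeps a
single scheduling hiccup from collapsing pages_to_scan for many rounds.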