From 92a797f3beda2038fde56650b58d09f16e0dc347 Mon Sep 17 00:00:00 2001
From: V3n3RiX <venerix@redcorelinux.org>
Date: Sat, 7 Aug 2021 12:40:58 +0100
Subject: sys-kernel/linux-{image,sources}-redcore-lts : version bump

---
 sys-kernel/linux-image-redcore-lts/Manifest        |    2 +-
 ...ctl-and-CONFIG-to-disallow-unprivileged-C.patch |  154 +
 .../files/5.10-amd64.config                        |   21 +-
 .../files/5.10-linux-hardened.patch                | 3479 ----------
 .../files/5.10-uksm-linux-hardened.patch           | 6911 -------------------
 .../linux-image-redcore-lts/files/5.10-uksm.patch  | 6935 ++++++++++++++++++++
 .../linux-image-redcore-lts-5.10.47-r1.ebuild      |  162 -
 .../linux-image-redcore-lts-5.10.56.ebuild         |  162 +
 8 files changed, 7255 insertions(+), 10571 deletions(-)
 create mode 100644 sys-kernel/linux-image-redcore-lts/files/5.10-ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-C.patch
 delete mode 100644 sys-kernel/linux-image-redcore-lts/files/5.10-linux-hardened.patch
 delete mode 100644 sys-kernel/linux-image-redcore-lts/files/5.10-uksm-linux-hardened.patch
 create mode 100644 sys-kernel/linux-image-redcore-lts/files/5.10-uksm.patch
 delete mode 100644 sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-5.10.47-r1.ebuild
 create mode 100644 sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-5.10.56.ebuild

(limited to 'sys-kernel/linux-image-redcore-lts')

diff --git a/sys-kernel/linux-image-redcore-lts/Manifest b/sys-kernel/linux-image-redcore-lts/Manifest
index 6b661c69..d5b573d7 100644
--- a/sys-kernel/linux-image-redcore-lts/Manifest
+++ b/sys-kernel/linux-image-redcore-lts/Manifest
@@ -1 +1 @@
-DIST linux-5.10.47.tar.xz 116362372 BLAKE2B 6cfe773c5004d742e1f47f8ec408cd37b8183de5662370437b62a53ccb7960dc97499a90e41d94c8ff25207c4cc9428ca9fe6e15388e4be83ba34e83bb2df9da SHA512 80760ce0e55f146b1434cb21975cb1b3f94a6fa7f5c8edd9e534084596e8262ee5945f2b25b98039d9d405232083f995782bbdaafbb7b387bb785eafc3e2e9c8
+DIST linux-5.10.56.tar.xz 116391116 BLAKE2B 026ed9e08e0dfba2551e249880c2ab0490d2aadb068997cfba6e1a5b7b610567db9f76c3cd8d6bcba3c18511c70037812e305f3ed6503a101776b653dd284d3a SHA512 61438e9354074b47e0aa834ab277e9947c8b7353761e0ee9dfe2f2e4ecd3e8c406e7efbe4db5cc3d8e234e95a52a83f6009b7f0ae7710dc33c9afd9e78471e9e
diff --git a/sys-kernel/linux-image-redcore-lts/files/5.10-ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-C.patch b/sys-kernel/linux-image-redcore-lts/files/5.10-ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-C.patch
new file mode 100644
index 00000000..31a1d918
--- /dev/null
+++ b/sys-kernel/linux-image-redcore-lts/files/5.10-ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-C.patch
@@ -0,0 +1,154 @@
+From f615330c6169a5fe5750706f1db7cbdd520f9534 Mon Sep 17 00:00:00 2001
+From: "Jan Alexander Steffens (heftig)" <jan.steffens@gmail.com>
+Date: Mon, 16 Sep 2019 04:53:20 +0200
+Subject: [PATCH 1/2] ZEN: Add sysctl and CONFIG to disallow unprivileged
+ CLONE_NEWUSER
+
+Our default behavior continues to match the vanilla kernel.
+---
+ include/linux/user_namespace.h |  4 ++++
+ init/Kconfig                   | 16 ++++++++++++++++
+ kernel/fork.c                  | 14 ++++++++++++++
+ kernel/sysctl.c                | 12 ++++++++++++
+ kernel/user_namespace.c        |  7 +++++++
+ 5 files changed, 53 insertions(+)
+
+diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
+index 6ef1c7109fc4..2140091b0b8d 100644
+--- a/include/linux/user_namespace.h
++++ b/include/linux/user_namespace.h
+@@ -106,6 +106,8 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
+ 
+ #ifdef CONFIG_USER_NS
+ 
++extern int unprivileged_userns_clone;
++
+ static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
+ {
+ 	if (ns)
+@@ -139,6 +141,8 @@ extern bool current_in_userns(const struct user_namespace *target_ns);
+ struct ns_common *ns_get_owner(struct ns_common *ns);
+ #else
+ 
++#define unprivileged_userns_clone 0
++
+ static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
+ {
+ 	return &init_user_ns;
+diff --git a/init/Kconfig b/init/Kconfig
+index 0872a5a2e759..a40d8afeb1bb 100644
+--- a/init/Kconfig
++++ b/init/Kconfig
+@@ -1173,6 +1173,22 @@ config USER_NS
+ 
+ 	  If unsure, say N.
+ 
++config USER_NS_UNPRIVILEGED
++	bool "Allow unprivileged users to create namespaces"
++	default y
++	depends on USER_NS
++	help
++	  When disabled, unprivileged users will not be able to create
++	  new namespaces. Allowing users to create their own namespaces
++	  has been part of several recent local privilege escalation
++	  exploits, so if you need user namespaces but are
++	  paranoid^Wsecurity-conscious you want to disable this.
++
++	  This setting can be overridden at runtime via the
++	  kernel.unprivileged_userns_clone sysctl.
++
++	  If unsure, say Y.
++
+ config PID_NS
+ 	bool "PID Namespaces"
+ 	default y
+diff --git a/kernel/fork.c b/kernel/fork.c
+index c675fdbd3dce..9266039e28e4 100644
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -97,6 +97,10 @@
+ #include <linux/scs.h>
+ #include <linux/io_uring.h>
+ 
++#ifdef CONFIG_USER_NS
++#include <linux/user_namespace.h>
++#endif
++
+ #include <asm/pgalloc.h>
+ #include <linux/uaccess.h>
+ #include <asm/mmu_context.h>
+@@ -1863,6 +1867,10 @@ static __latent_entropy struct task_struct *copy_process(
+ 	if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
+ 		return ERR_PTR(-EINVAL);
+ 
++	if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone)
++		if (!capable(CAP_SYS_ADMIN))
++			return ERR_PTR(-EPERM);
++
+ 	/*
+ 	 * Thread groups must share signals as well, and detached threads
+ 	 * can only be started up within the thread group.
+@@ -2928,6 +2936,12 @@ int ksys_unshare(unsigned long unshare_flags)
+ 	if (unshare_flags & CLONE_NEWNS)
+ 		unshare_flags |= CLONE_FS;
+ 
++	if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) {
++		err = -EPERM;
++		if (!capable(CAP_SYS_ADMIN))
++			goto bad_unshare_out;
++	}
++
+ 	err = check_unshare_flags(unshare_flags);
+ 	if (err)
+ 		goto bad_unshare_out;
+diff --git a/kernel/sysctl.c b/kernel/sysctl.c
+index afad085960b8..a94828fb31c2 100644
+--- a/kernel/sysctl.c
++++ b/kernel/sysctl.c
+@@ -103,6 +103,9 @@
+ #ifdef CONFIG_LOCKUP_DETECTOR
+ #include <linux/nmi.h>
+ #endif
++#ifdef CONFIG_USER_NS
++#include <linux/user_namespace.h>
++#endif
+ 
+ #if defined(CONFIG_SYSCTL)
+ 
+@@ -1902,6 +1905,15 @@ static struct ctl_table kern_table[] = {
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ #endif
++#ifdef CONFIG_USER_NS
++	{
++		.procname	= "unprivileged_userns_clone",
++		.data		= &unprivileged_userns_clone,
++		.maxlen		= sizeof(int),
++		.mode		= 0644,
++		.proc_handler	= proc_dointvec,
++	},
++#endif
+ #ifdef CONFIG_PROC_SYSCTL
+ 	{
+ 		.procname	= "tainted",
+diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
+index e703d5d9cbe8..5758274feaee 100644
+--- a/kernel/user_namespace.c
++++ b/kernel/user_namespace.c
+@@ -21,6 +21,13 @@
+ #include <linux/bsearch.h>
+ #include <linux/sort.h>
+ 
+/* Backs the kernel.unprivileged_userns_clone sysctl; default set by Kconfig. */
+#ifdef CONFIG_USER_NS_UNPRIVILEGED
+int unprivileged_userns_clone = 1;	/* nonzero: no extra CAP_SYS_ADMIN check */
+#else
+int unprivileged_userns_clone;	/* starts at 0: CLONE_NEWUSER needs CAP_SYS_ADMIN */
+#endif
++
+ static struct kmem_cache *user_ns_cachep __read_mostly;
+ static DEFINE_MUTEX(userns_state_mutex);
+ 
+-- 
+2.31.1
+
diff --git a/sys-kernel/linux-image-redcore-lts/files/5.10-amd64.config b/sys-kernel/linux-image-redcore-lts/files/5.10-amd64.config
index 0d48c66c..f2fd796d 100644
--- a/sys-kernel/linux-image-redcore-lts/files/5.10-amd64.config
+++ b/sys-kernel/linux-image-redcore-lts/files/5.10-amd64.config
@@ -1,6 +1,6 @@
 #
 # Automatically generated file; DO NOT EDIT.
-# Linux/x86 5.10.47-redcore-lts Kernel Configuration
+# Linux/x86 5.10.56-redcore-lts Kernel Configuration
 #
 CONFIG_CC_VERSION_TEXT="gcc (Gentoo Hardened 10.3.0-r10 p1) 10.3.0"
 CONFIG_CC_IS_GCC=y
@@ -251,7 +251,6 @@ CONFIG_BPF_JIT_ALWAYS_ON=y
 CONFIG_BPF_JIT_DEFAULT_ON=y
 # CONFIG_BPF_PRELOAD is not set
 CONFIG_USERFAULTFD=y
-# CONFIG_USERFAULTFD_UNPRIVILEGED is not set
 CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y
 CONFIG_KCMP=y
 CONFIG_RSEQ=y
@@ -806,9 +805,9 @@ CONFIG_MODULES_USE_ELF_RELA=y
 CONFIG_ARCH_HAS_ELF_RANDOMIZE=y
 CONFIG_HAVE_ARCH_MMAP_RND_BITS=y
 CONFIG_HAVE_EXIT_THREAD=y
-CONFIG_ARCH_MMAP_RND_BITS=32
+CONFIG_ARCH_MMAP_RND_BITS=28
 CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y
-CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16
+CONFIG_ARCH_MMAP_RND_COMPAT_BITS=8
 CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES=y
 CONFIG_HAVE_STACK_VALIDATION=y
 CONFIG_HAVE_RELIABLE_STACKTRACE=y
@@ -1123,7 +1122,6 @@ CONFIG_TCP_CONG_BBR=m
 CONFIG_DEFAULT_RENO=y
 CONFIG_DEFAULT_TCP_CONG="reno"
 CONFIG_TCP_MD5SIG=y
-# CONFIG_TCP_SIMULT_CONNECT_DEFAULT_ON is not set
 CONFIG_IPV6=m
 CONFIG_IPV6_ROUTER_PREF=y
 CONFIG_IPV6_ROUTE_INFO=y
@@ -2246,7 +2244,6 @@ CONFIG_RFD_FTL=m
 CONFIG_SSFDC=m
 CONFIG_SM_FTL=m
 CONFIG_MTD_OOPS=m
-CONFIG_MTD_PSTORE=m
 CONFIG_MTD_SWAP=m
 # CONFIG_MTD_PARTITIONED_MASTER is not set
 
@@ -9102,7 +9099,6 @@ CONFIG_IPACK_BUS=m
 CONFIG_BOARD_TPCI200=m
 CONFIG_SERIAL_IPOCTAL=m
 CONFIG_RESET_CONTROLLER=y
-# CONFIG_RESET_BRCMSTB_RESCAL is not set
 CONFIG_RESET_TI_SYSCON=m
 
 #
@@ -9489,12 +9485,6 @@ CONFIG_PSTORE_COMPRESS_DEFAULT="zstd"
 CONFIG_PSTORE_PMSG=y
 # CONFIG_PSTORE_FTRACE is not set
 CONFIG_PSTORE_RAM=m
-CONFIG_PSTORE_ZONE=m
-CONFIG_PSTORE_BLK=m
-CONFIG_PSTORE_BLK_BLKDEV=""
-CONFIG_PSTORE_BLK_KMSG_SIZE=64
-CONFIG_PSTORE_BLK_MAX_REASON=2
-CONFIG_PSTORE_BLK_PMSG_SIZE=64
 CONFIG_SYSV_FS=m
 CONFIG_UFS_FS=m
 # CONFIG_UFS_FS_WRITE is not set
@@ -9642,8 +9632,6 @@ CONFIG_TRUSTED_KEYS=m
 CONFIG_ENCRYPTED_KEYS=m
 # CONFIG_KEY_DH_OPERATIONS is not set
 CONFIG_SECURITY_DMESG_RESTRICT=y
-CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y
-CONFIG_SECURITY_TIOCSTI_RESTRICT=y
 CONFIG_SECURITY=y
 CONFIG_SECURITYFS=y
 CONFIG_SECURITY_NETWORK=y
@@ -9688,8 +9676,6 @@ CONFIG_LSM="yama,loadpin,safesetid,integrity,apparmor"
 CONFIG_INIT_STACK_NONE=y
 # CONFIG_INIT_ON_ALLOC_DEFAULT_ON is not set
 # CONFIG_INIT_ON_FREE_DEFAULT_ON is not set
-CONFIG_PAGE_SANITIZE_VERIFY=y
-CONFIG_SLAB_SANITIZE_VERIFY=y
 # end of Memory initialization
 # end of Kernel hardening options
 # end of Security options
@@ -10143,7 +10129,6 @@ CONFIG_STRIP_ASM_SYMS=y
 # CONFIG_HEADERS_INSTALL is not set
 # CONFIG_DEBUG_SECTION_MISMATCH is not set
 CONFIG_SECTION_MISMATCH_WARN_ONLY=y
-# CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE is not set
 CONFIG_STACK_VALIDATION=y
 # CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set
 # end of Compile-time checks and compiler options
diff --git a/sys-kernel/linux-image-redcore-lts/files/5.10-linux-hardened.patch b/sys-kernel/linux-image-redcore-lts/files/5.10-linux-hardened.patch
deleted file mode 100644
index e304e170..00000000
--- a/sys-kernel/linux-image-redcore-lts/files/5.10-linux-hardened.patch
+++ /dev/null
@@ -1,3479 +0,0 @@
-diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
-index 26bfe7ae711b..0e8e3fdd7005 100644
---- a/Documentation/admin-guide/kernel-parameters.txt
-+++ b/Documentation/admin-guide/kernel-parameters.txt
-@@ -518,17 +518,6 @@
- 			nosocket -- Disable socket memory accounting.
- 			nokmem -- Disable kernel memory accounting.
- 
--	checkreqprot	[SELINUX] Set initial checkreqprot flag value.
--			Format: { "0" | "1" }
--			See security/selinux/Kconfig help text.
--			0 -- check protection applied by kernel (includes
--				any implied execute protection).
--			1 -- check protection requested by application.
--			Default value is set via a kernel config option.
--			Value can be changed at runtime via
--				/sys/fs/selinux/checkreqprot.
--			Setting checkreqprot to 1 is deprecated.
--
- 	cio_ignore=	[S390]
- 			See Documentation/s390/common_io.rst for details.
- 	clk_ignore_unused
-@@ -3566,6 +3555,11 @@
- 			the specified number of seconds.  This is to be used if
- 			your oopses keep scrolling off the screen.
- 
-+	extra_latent_entropy
-+			Enable a very simple form of latent entropy extraction
-+			from the first 4GB of memory as the bootmem allocator
-+			passes the memory pages to the buddy allocator.
-+
- 	pcbit=		[HW,ISDN]
- 
- 	pcd.		[PARIDE]
-diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
-index d4b32cc32bb7..3cd263f8ac46 100644
---- a/Documentation/admin-guide/sysctl/kernel.rst
-+++ b/Documentation/admin-guide/sysctl/kernel.rst
-@@ -860,6 +860,8 @@ with respect to CAP_PERFMON use cases.
- >=1  Disallow CPU event access by users without ``CAP_PERFMON``.
- 
- >=2  Disallow kernel profiling by users without ``CAP_PERFMON``.
-+
-+>=3  Disallow use of any event by users without ``CAP_PERFMON``.
- ===  ==================================================================
- 
- 
-@@ -1383,6 +1385,26 @@ If a value outside of this range is written to ``threads-max`` an
- ``EINVAL`` error occurs.
- 
- 
-+tiocsti_restrict
-+================
-+
-+This toggle indicates whether unprivileged users are prevented from using the
-+``TIOCSTI`` ioctl to inject commands into other processes which share a tty
-+session.
-+
-+= ============================================================================
-+0 No restriction, except the default one of only being able to inject commands
-+  into one's own tty.
-+1 Users must have ``CAP_SYS_ADMIN`` to use the ``TIOCSTI`` ioctl.
-+= ============================================================================
-+
-+When user namespaces are in use, the check for ``CAP_SYS_ADMIN`` is done
-+against the user namespace that originally opened the tty.
-+
-+The kernel config option ``CONFIG_SECURITY_TIOCSTI_RESTRICT`` sets the default
-+value of ``tiocsti_restrict``.
-+
-+
- traceoff_on_warning
- ===================
- 
-diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
-index 4abcfff15e38..fa2d0a9709f2 100644
---- a/Documentation/networking/ip-sysctl.rst
-+++ b/Documentation/networking/ip-sysctl.rst
-@@ -664,6 +664,24 @@ tcp_comp_sack_nr - INTEGER
- 
- 	Default : 44
- 
-+tcp_simult_connect - BOOLEAN
-+	Enable TCP simultaneous connect that adds a weakness in Linux's strict
-+	implementation of TCP that allows two clients to connect to each other
-+	without either entering a listening state. The weakness allows an attacker
-+	to easily prevent a client from connecting to a known server provided the
-+	source port for the connection is guessed correctly.
-+
-+	As the weakness could be used to prevent an antivirus or IPS from fetching
-+	updates, or prevent an SSL gateway from fetching a CRL, it should be
-+	eliminated by disabling this option. Though Linux is one of few operating
-+	systems supporting simultaneous connect, it has no legitimate use in
-+	practice and is rarely supported by firewalls.
-+
-+	Disabling this may break TCP STUNT which is used by some applications for
-+	NAT traversal.
-+
-+	Default: Value of CONFIG_TCP_SIMULT_CONNECT_DEFAULT_ON
-+
- tcp_slow_start_after_idle - BOOLEAN
- 	If set, provide RFC2861 behavior and time out the congestion
- 	window after an idle period.  An idle period is defined at
-diff --git a/Makefile b/Makefile
-index fb2937bca41b..711389d443ab 100644
---- a/Makefile
-+++ b/Makefile
-@@ -2,7 +2,7 @@
- VERSION = 5
- PATCHLEVEL = 10
- SUBLEVEL = 47
--EXTRAVERSION =
-+EXTRAVERSION = -hardened1
- NAME = Dare mighty things
- 
- # *DOCUMENTATION*
-diff --git a/arch/Kconfig b/arch/Kconfig
-index 69fe7133c765..8b5c346d5dd8 100644
---- a/arch/Kconfig
-+++ b/arch/Kconfig
-@@ -752,7 +752,7 @@ config ARCH_MMAP_RND_BITS
- 	int "Number of bits to use for ASLR of mmap base address" if EXPERT
- 	range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX
- 	default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT
--	default ARCH_MMAP_RND_BITS_MIN
-+	default ARCH_MMAP_RND_BITS_MAX
- 	depends on HAVE_ARCH_MMAP_RND_BITS
- 	help
- 	  This value can be used to select the number of bits to use to
-@@ -786,7 +786,7 @@ config ARCH_MMAP_RND_COMPAT_BITS
- 	int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT
- 	range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX
- 	default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT
--	default ARCH_MMAP_RND_COMPAT_BITS_MIN
-+	default ARCH_MMAP_RND_COMPAT_BITS_MAX
- 	depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS
- 	help
- 	  This value can be used to select the number of bits to use to
-diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
-index 5e5cf3af6351..d13da5ae03e7 100644
---- a/arch/arm64/Kconfig
-+++ b/arch/arm64/Kconfig
-@@ -1200,6 +1200,7 @@ config RODATA_FULL_DEFAULT_ENABLED
- 
- config ARM64_SW_TTBR0_PAN
- 	bool "Emulate Privileged Access Never using TTBR0_EL1 switching"
-+	default y
- 	help
- 	  Enabling this option prevents the kernel from accessing
- 	  user-space memory directly by pointing TTBR0_EL1 to a reserved
-@@ -1794,6 +1795,7 @@ config RANDOMIZE_BASE
- 	bool "Randomize the address of the kernel image"
- 	select ARM64_MODULE_PLTS if MODULES
- 	select RELOCATABLE
-+	default y
- 	help
- 	  Randomizes the virtual address at which the kernel image is
- 	  loaded, as a security feature that deters exploit attempts
-diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
-index 5cfe3cf6f2ac..f25871361bdc 100644
---- a/arch/arm64/configs/defconfig
-+++ b/arch/arm64/configs/defconfig
-@@ -1,4 +1,3 @@
--CONFIG_SYSVIPC=y
- CONFIG_POSIX_MQUEUE=y
- CONFIG_AUDIT=y
- CONFIG_NO_HZ_IDLE=y
-diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
-index 8d1c8dcb87fd..32c1609a1158 100644
---- a/arch/arm64/include/asm/elf.h
-+++ b/arch/arm64/include/asm/elf.h
-@@ -124,14 +124,10 @@
- 
- /*
-  * This is the base location for PIE (ET_DYN with INTERP) loads. On
-- * 64-bit, this is above 4GB to leave the entire 32-bit address
-+ * 64-bit, this is raised to 4GB to leave the entire 32-bit address
-  * space open for things that want to use the area for 32-bit pointers.
-  */
--#ifdef CONFIG_ARM64_FORCE_52BIT
--#define ELF_ET_DYN_BASE		(2 * TASK_SIZE_64 / 3)
--#else
--#define ELF_ET_DYN_BASE		(2 * DEFAULT_MAP_WINDOW_64 / 3)
--#endif /* CONFIG_ARM64_FORCE_52BIT */
-+#define ELF_ET_DYN_BASE		0x100000000UL
- 
- #ifndef __ASSEMBLY__
- 
-@@ -189,10 +185,10 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
- /* 1GB of VA */
- #ifdef CONFIG_COMPAT
- #define STACK_RND_MASK			(test_thread_flag(TIF_32BIT) ? \
--						0x7ff >> (PAGE_SHIFT - 12) : \
--						0x3ffff >> (PAGE_SHIFT - 12))
-+						((1UL << mmap_rnd_compat_bits) - 1) >> (PAGE_SHIFT - 12) : \
-+						((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12))
- #else
--#define STACK_RND_MASK			(0x3ffff >> (PAGE_SHIFT - 12))
-+#define STACK_RND_MASK			(((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12))
- #endif
- 
- #ifdef __AARCH64EB__
-diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
-index f3c8a8110f60..372192b9ebd1 100644
---- a/arch/x86/Kconfig
-+++ b/arch/x86/Kconfig
-@@ -1195,8 +1195,7 @@ config VM86
- 	default X86_LEGACY_VM86
- 
- config X86_16BIT
--	bool "Enable support for 16-bit segments" if EXPERT
--	default y
-+	bool "Enable support for 16-bit segments"
- 	depends on MODIFY_LDT_SYSCALL
- 	help
- 	  This option is required by programs like Wine to run 16-bit
-@@ -2298,7 +2297,7 @@ config COMPAT_VDSO
- choice
- 	prompt "vsyscall table for legacy applications"
- 	depends on X86_64
--	default LEGACY_VSYSCALL_XONLY
-+	default LEGACY_VSYSCALL_NONE
- 	help
- 	  Legacy user code that does not know how to find the vDSO expects
- 	  to be able to issue three syscalls by calling fixed addresses in
-@@ -2394,8 +2393,7 @@ config CMDLINE_OVERRIDE
- 	  be set to 'N' under normal conditions.
- 
- config MODIFY_LDT_SYSCALL
--	bool "Enable the LDT (local descriptor table)" if EXPERT
--	default y
-+	bool "Enable the LDT (local descriptor table)"
- 	help
- 	  Linux can allow user programs to install a per-process x86
- 	  Local Descriptor Table (LDT) using the modify_ldt(2) system
-diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
-index 9936528e1939..981ee8c0e330 100644
---- a/arch/x86/configs/x86_64_defconfig
-+++ b/arch/x86/configs/x86_64_defconfig
-@@ -1,5 +1,4 @@
- # CONFIG_LOCALVERSION_AUTO is not set
--CONFIG_SYSVIPC=y
- CONFIG_POSIX_MQUEUE=y
- CONFIG_AUDIT=y
- CONFIG_NO_HZ=y
-diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
-index 9185cb1d13b9..543912071557 100644
---- a/arch/x86/entry/vdso/vma.c
-+++ b/arch/x86/entry/vdso/vma.c
-@@ -315,55 +315,9 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr)
- }
- 
- #ifdef CONFIG_X86_64
--/*
-- * Put the vdso above the (randomized) stack with another randomized
-- * offset.  This way there is no hole in the middle of address space.
-- * To save memory make sure it is still in the same PTE as the stack
-- * top.  This doesn't give that many random bits.
-- *
-- * Note that this algorithm is imperfect: the distribution of the vdso
-- * start address within a PMD is biased toward the end.
-- *
-- * Only used for the 64-bit and x32 vdsos.
-- */
--static unsigned long vdso_addr(unsigned long start, unsigned len)
--{
--	unsigned long addr, end;
--	unsigned offset;
--
--	/*
--	 * Round up the start address.  It can start out unaligned as a result
--	 * of stack start randomization.
--	 */
--	start = PAGE_ALIGN(start);
--
--	/* Round the lowest possible end address up to a PMD boundary. */
--	end = (start + len + PMD_SIZE - 1) & PMD_MASK;
--	if (end >= TASK_SIZE_MAX)
--		end = TASK_SIZE_MAX;
--	end -= len;
--
--	if (end > start) {
--		offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1);
--		addr = start + (offset << PAGE_SHIFT);
--	} else {
--		addr = start;
--	}
--
--	/*
--	 * Forcibly align the final address in case we have a hardware
--	 * issue that requires alignment for performance reasons.
--	 */
--	addr = align_vdso_addr(addr);
--
--	return addr;
--}
--
- static int map_vdso_randomized(const struct vdso_image *image)
- {
--	unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start);
--
--	return map_vdso(image, addr);
-+	return map_vdso(image, 0);
- }
- #endif
- 
-diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
-index b9a5d488f1a5..608cca19cf8c 100644
---- a/arch/x86/include/asm/elf.h
-+++ b/arch/x86/include/asm/elf.h
-@@ -246,11 +246,11 @@ extern int force_personality32;
- 
- /*
-  * This is the base location for PIE (ET_DYN with INTERP) loads. On
-- * 64-bit, this is above 4GB to leave the entire 32-bit address
-+ * 64-bit, this is raised to 4GB to leave the entire 32-bit address
-  * space open for things that want to use the area for 32-bit pointers.
-  */
- #define ELF_ET_DYN_BASE		(mmap_is_ia32() ? 0x000400000UL : \
--						  (DEFAULT_MAP_WINDOW / 3 * 2))
-+						  0x100000000UL)
- 
- /* This yields a mask that user programs can use to figure out what
-    instruction set this CPU supports.  This could be done in user space,
-@@ -330,8 +330,8 @@ extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len);
- 
- #ifdef CONFIG_X86_32
- 
--#define __STACK_RND_MASK(is32bit) (0x7ff)
--#define STACK_RND_MASK (0x7ff)
-+#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1)
-+#define STACK_RND_MASK ((1UL << mmap_rnd_bits) - 1)
- 
- #define ARCH_DLINFO		ARCH_DLINFO_IA32
- 
-@@ -340,7 +340,11 @@ extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len);
- #else /* CONFIG_X86_32 */
- 
- /* 1GB for 64bit, 8MB for 32bit */
--#define __STACK_RND_MASK(is32bit) ((is32bit) ? 0x7ff : 0x3fffff)
-+#ifdef CONFIG_COMPAT
-+#define __STACK_RND_MASK(is32bit) ((is32bit) ? (1UL << mmap_rnd_compat_bits) - 1 : (1UL << mmap_rnd_bits) - 1)
-+#else
-+#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1)
-+#endif
- #define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32())
- 
- #define ARCH_DLINFO							\
-@@ -398,5 +402,4 @@ struct va_alignment {
- } ____cacheline_aligned;
- 
- extern struct va_alignment va_align;
--extern unsigned long align_vdso_addr(unsigned long);
- #endif /* _ASM_X86_ELF_H */
-diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
-index 25148ebd3634..1a41d2c767a1 100644
---- a/arch/x86/kernel/cpu/common.c
-+++ b/arch/x86/kernel/cpu/common.c
-@@ -399,6 +399,7 @@ EXPORT_SYMBOL_GPL(native_write_cr4);
- void cr4_update_irqsoff(unsigned long set, unsigned long clear)
- {
- 	unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4);
-+	BUG_ON(cr4 != __read_cr4());
- 
- 	lockdep_assert_irqs_disabled();
- 
-diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
-index 145a7ac0c19a..161e25d02fd5 100644
---- a/arch/x86/kernel/process.c
-+++ b/arch/x86/kernel/process.c
-@@ -43,6 +43,8 @@
- #include <asm/io_bitmap.h>
- #include <asm/proto.h>
- #include <asm/frame.h>
-+#include <asm/elf.h>
-+#include <linux/sizes.h>
- 
- #include "process.h"
- 
-@@ -596,6 +598,7 @@ void speculation_ctrl_update_current(void)
- static inline void cr4_toggle_bits_irqsoff(unsigned long mask)
- {
- 	unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4);
-+	BUG_ON(cr4 != __read_cr4());
- 
- 	newval = cr4 ^ mask;
- 	if (newval != cr4) {
-@@ -905,7 +908,10 @@ unsigned long arch_align_stack(unsigned long sp)
- 
- unsigned long arch_randomize_brk(struct mm_struct *mm)
- {
--	return randomize_page(mm->brk, 0x02000000);
-+	if (mmap_is_ia32())
-+		return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE;
-+	else
-+		return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE;
- }
- 
- /*
-diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
-index 504fa5425bce..e30ec4c750d1 100644
---- a/arch/x86/kernel/sys_x86_64.c
-+++ b/arch/x86/kernel/sys_x86_64.c
-@@ -52,13 +52,6 @@ static unsigned long get_align_bits(void)
- 	return va_align.bits & get_align_mask();
- }
- 
--unsigned long align_vdso_addr(unsigned long addr)
--{
--	unsigned long align_mask = get_align_mask();
--	addr = (addr + align_mask) & ~align_mask;
--	return addr | get_align_bits();
--}
--
- static int __init control_va_addr_alignment(char *str)
- {
- 	/* guard against enabling this on other CPU families */
-@@ -120,10 +113,7 @@ static void find_start_end(unsigned long addr, unsigned long flags,
- 	}
- 
- 	*begin	= get_mmap_base(1);
--	if (in_32bit_syscall())
--		*end = task_size_32bit();
--	else
--		*end = task_size_64bit(addr > DEFAULT_MAP_WINDOW);
-+	*end	= get_mmap_base(0);
- }
- 
- unsigned long
-@@ -200,7 +190,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
- 
- 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
- 	info.length = len;
--	info.low_limit = PAGE_SIZE;
-+	info.low_limit = get_mmap_base(1);
- 	info.high_limit = get_mmap_base(0);
- 
- 	/*
-diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
-index 7c055259de3a..291b7b4476a9 100644
---- a/arch/x86/mm/init_32.c
-+++ b/arch/x86/mm/init_32.c
-@@ -546,9 +546,9 @@ static void __init pagetable_init(void)
- 
- #define DEFAULT_PTE_MASK ~(_PAGE_NX | _PAGE_GLOBAL)
- /* Bits supported by the hardware: */
--pteval_t __supported_pte_mask __read_mostly = DEFAULT_PTE_MASK;
-+pteval_t __supported_pte_mask __ro_after_init = DEFAULT_PTE_MASK;
- /* Bits allowed in normal kernel mappings: */
--pteval_t __default_kernel_pte_mask __read_mostly = DEFAULT_PTE_MASK;
-+pteval_t __default_kernel_pte_mask __ro_after_init = DEFAULT_PTE_MASK;
- EXPORT_SYMBOL_GPL(__supported_pte_mask);
- /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */
- EXPORT_SYMBOL(__default_kernel_pte_mask);
-diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
-index b5a3fa4033d3..c3d771ffc178 100644
---- a/arch/x86/mm/init_64.c
-+++ b/arch/x86/mm/init_64.c
-@@ -97,9 +97,9 @@ DEFINE_ENTRY(pte, pte, init)
-  */
- 
- /* Bits supported by the hardware: */
--pteval_t __supported_pte_mask __read_mostly = ~0;
-+pteval_t __supported_pte_mask __ro_after_init = ~0;
- /* Bits allowed in normal kernel mappings: */
--pteval_t __default_kernel_pte_mask __read_mostly = ~0;
-+pteval_t __default_kernel_pte_mask __ro_after_init = ~0;
- EXPORT_SYMBOL_GPL(__supported_pte_mask);
- /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */
- EXPORT_SYMBOL(__default_kernel_pte_mask);
-diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
-index 569ac1d57f55..044d88da4aee 100644
---- a/arch/x86/mm/tlb.c
-+++ b/arch/x86/mm/tlb.c
-@@ -1066,6 +1066,7 @@ STATIC_NOPV void native_flush_tlb_global(void)
- 	raw_local_irq_save(flags);
- 
- 	cr4 = this_cpu_read(cpu_tlbstate.cr4);
-+	BUG_ON(cr4 != __read_cr4());
- 	/* toggle PGE */
- 	native_write_cr4(cr4 ^ X86_CR4_PGE);
- 	/* write old PGE again and flush TLBs */
-diff --git a/block/blk-mq.c b/block/blk-mq.c
-index 4bf9449b4586..3215e9d0025c 100644
---- a/block/blk-mq.c
-+++ b/block/blk-mq.c
-@@ -569,7 +569,7 @@ EXPORT_SYMBOL(blk_mq_end_request);
-  * Softirq action handler - move entries to local list and loop over them
-  * while passing them to the queue registered handler.
-  */
--static __latent_entropy void blk_done_softirq(struct softirq_action *h)
-+static __latent_entropy void blk_done_softirq(void)
- {
- 	struct list_head *cpu_list, local_list;
- 
-diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
-index 61c762961ca8..02a83039c25b 100644
---- a/drivers/ata/libata-core.c
-+++ b/drivers/ata/libata-core.c
-@@ -4540,7 +4540,7 @@ void ata_qc_free(struct ata_queued_cmd *qc)
- 	struct ata_port *ap;
- 	unsigned int tag;
- 
--	WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */
-+	BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */
- 	ap = qc->ap;
- 
- 	qc->flags = 0;
-@@ -4557,7 +4557,7 @@ void __ata_qc_complete(struct ata_queued_cmd *qc)
- 	struct ata_port *ap;
- 	struct ata_link *link;
- 
--	WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */
-+	BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */
- 	WARN_ON_ONCE(!(qc->flags & ATA_QCFLAG_ACTIVE));
- 	ap = qc->ap;
- 	link = qc->dev->link;
-diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
-index d229a2d0c017..2fd45f01e7a2 100644
---- a/drivers/char/Kconfig
-+++ b/drivers/char/Kconfig
-@@ -327,7 +327,6 @@ config NSC_GPIO
- 
- config DEVMEM
- 	bool "/dev/mem virtual device support"
--	default y
- 	help
- 	  Say Y here if you want to support the /dev/mem device.
- 	  The /dev/mem device is used to access areas of physical
-@@ -391,7 +390,6 @@ config MAX_RAW_DEVS
- config DEVPORT
- 	bool "/dev/port character device"
- 	depends on ISA || PCI
--	default y
- 	help
- 	  Say Y here if you want to support the /dev/port device. The /dev/port
- 	  device is similar to /dev/mem, but for I/O ports.
-diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig
-index 93fd984eb2f5..d9086484d2de 100644
---- a/drivers/tty/Kconfig
-+++ b/drivers/tty/Kconfig
-@@ -122,7 +122,6 @@ config UNIX98_PTYS
- 
- config LEGACY_PTYS
- 	bool "Legacy (BSD) PTY support"
--	default y
- 	help
- 	  A pseudo terminal (PTY) is a software device consisting of two
- 	  halves: a master and a slave. The slave device behaves identical to
-diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
-index bc5314092aa4..4de9e74c701c 100644
---- a/drivers/tty/tty_io.c
-+++ b/drivers/tty/tty_io.c
-@@ -171,6 +171,7 @@ static void free_tty_struct(struct tty_struct *tty)
- 	put_device(tty->dev);
- 	kfree(tty->write_buf);
- 	tty->magic = 0xDEADDEAD;
-+	put_user_ns(tty->owner_user_ns);
- 	kfree(tty);
- }
- 
-@@ -2261,11 +2262,19 @@ static int tty_fasync(int fd, struct file *filp, int on)
-  *	FIXME: may race normal receive processing
-  */
- 
-+int tiocsti_restrict = IS_ENABLED(CONFIG_SECURITY_TIOCSTI_RESTRICT);
-+
- static int tiocsti(struct tty_struct *tty, char __user *p)
- {
- 	char ch, mbz = 0;
- 	struct tty_ldisc *ld;
- 
-+	if (tiocsti_restrict &&
-+		!ns_capable(tty->owner_user_ns, CAP_SYS_ADMIN)) {
-+		dev_warn_ratelimited(tty->dev,
-+			"Denied TIOCSTI ioctl for non-privileged process\n");
-+		return -EPERM;
-+	}
- 	if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN))
- 		return -EPERM;
- 	if (get_user(ch, p))
-@@ -3100,6 +3109,7 @@ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx)
- 	tty->index = idx;
- 	tty_line_name(driver, idx, tty->name);
- 	tty->dev = tty_get_device(tty);
-+	tty->owner_user_ns = get_user_ns(current_user_ns());
- 
- 	return tty;
- }
-diff --git a/drivers/usb/core/Makefile b/drivers/usb/core/Makefile
-index 18e874b0441e..fc7a3a9aa72a 100644
---- a/drivers/usb/core/Makefile
-+++ b/drivers/usb/core/Makefile
-@@ -11,6 +11,7 @@ usbcore-y += phy.o port.o
- usbcore-$(CONFIG_OF)		+= of.o
- usbcore-$(CONFIG_USB_PCI)		+= hcd-pci.o
- usbcore-$(CONFIG_ACPI)		+= usb-acpi.o
-+usbcore-$(CONFIG_SYSCTL)		+= sysctl.o
- 
- obj-$(CONFIG_USB)		+= usbcore.o
- 
-diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
-index 357730e8f52f..3884416d9029 100644
---- a/drivers/usb/core/hub.c
-+++ b/drivers/usb/core/hub.c
-@@ -5116,6 +5116,12 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus,
- 			goto done;
- 		return;
- 	}
-+
-+	if (deny_new_usb) {
-+		dev_err(&port_dev->dev, "denied insert of USB device on port %d\n", port1);
-+		goto done;
-+	}
-+
- 	if (hub_is_superspeed(hub->hdev))
- 		unit_load = 150;
- 	else
-diff --git a/drivers/usb/core/sysctl.c b/drivers/usb/core/sysctl.c
-new file mode 100644
-index 000000000000..3fa188ac8f67
---- /dev/null
-+++ b/drivers/usb/core/sysctl.c
-@@ -0,0 +1,44 @@
-+#include <linux/errno.h>
-+#include <linux/init.h>
-+#include <linux/kmemleak.h>
-+#include <linux/sysctl.h>
-+#include <linux/usb.h>
-+
-+static struct ctl_table usb_table[] = {
-+	{
-+		.procname	= "deny_new_usb",
-+		.data		= &deny_new_usb,
-+		.maxlen		= sizeof(int),
-+		.mode		= 0644,
-+		.proc_handler	= proc_dointvec_minmax_sysadmin,
-+		.extra1		= SYSCTL_ZERO,
-+		.extra2		= SYSCTL_ONE,
-+	},
-+	{ }
-+};
-+
-+static struct ctl_table usb_root_table[] = {
-+	{ .procname	= "kernel",
-+	  .mode		= 0555,
-+	  .child	= usb_table },
-+	{ }
-+};
-+
-+static struct ctl_table_header *usb_table_header;
-+
-+int __init usb_init_sysctl(void)
-+{
-+	usb_table_header = register_sysctl_table(usb_root_table);
-+	if (!usb_table_header) {
-+		pr_warn("usb: sysctl registration failed\n");
-+		return -ENOMEM;
-+	}
-+
-+	kmemleak_not_leak(usb_table_header);
-+	return 0;
-+}
-+
-+void usb_exit_sysctl(void)
-+{
-+	unregister_sysctl_table(usb_table_header);
-+}
-diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
-index db4de5367737..28bdbd91e33c 100644
---- a/drivers/usb/core/usb.c
-+++ b/drivers/usb/core/usb.c
-@@ -72,6 +72,9 @@ MODULE_PARM_DESC(autosuspend, "default autosuspend delay");
- #define usb_autosuspend_delay		0
- #endif
- 
-+int deny_new_usb __read_mostly = 0;
-+EXPORT_SYMBOL(deny_new_usb);
-+
- static bool match_endpoint(struct usb_endpoint_descriptor *epd,
- 		struct usb_endpoint_descriptor **bulk_in,
- 		struct usb_endpoint_descriptor **bulk_out,
-@@ -1010,6 +1013,9 @@ static int __init usb_init(void)
- 	usb_debugfs_init();
- 
- 	usb_acpi_register();
-+	retval = usb_init_sysctl();
-+	if (retval)
-+		goto sysctl_init_failed;
- 	retval = bus_register(&usb_bus_type);
- 	if (retval)
- 		goto bus_register_failed;
-@@ -1044,6 +1050,8 @@ static int __init usb_init(void)
- bus_notifier_failed:
- 	bus_unregister(&usb_bus_type);
- bus_register_failed:
-+	usb_exit_sysctl();
-+sysctl_init_failed:
- 	usb_acpi_unregister();
- 	usb_debugfs_cleanup();
- out:
-@@ -1067,6 +1075,7 @@ static void __exit usb_exit(void)
- 	usb_hub_cleanup();
- 	bus_unregister_notifier(&usb_bus_type, &usb_bus_nb);
- 	bus_unregister(&usb_bus_type);
-+	usb_exit_sysctl();
- 	usb_acpi_unregister();
- 	usb_debugfs_cleanup();
- 	idr_destroy(&usb_bus_idr);
-diff --git a/fs/exec.c b/fs/exec.c
-index ca89e0e3ef10..d2a03d32e195 100644
---- a/fs/exec.c
-+++ b/fs/exec.c
-@@ -34,6 +34,7 @@
- #include <linux/swap.h>
- #include <linux/string.h>
- #include <linux/init.h>
-+#include <linux/sched.h>
- #include <linux/sched/mm.h>
- #include <linux/sched/coredump.h>
- #include <linux/sched/signal.h>
-@@ -64,6 +65,7 @@
- #include <linux/compat.h>
- #include <linux/vmalloc.h>
- #include <linux/io_uring.h>
-+#include <linux/random.h>
- 
- #include <linux/uaccess.h>
- #include <asm/mmu_context.h>
-@@ -280,6 +282,8 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
- 	mm->stack_vm = mm->total_vm = 1;
- 	mmap_write_unlock(mm);
- 	bprm->p = vma->vm_end - sizeof(void *);
-+	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
-+		bprm->p ^= get_random_int() & ~PAGE_MASK;
- 	return 0;
- err:
- 	mmap_write_unlock(mm);
-diff --git a/fs/inode.c b/fs/inode.c
-index 5eea9912a0b9..f86f383a3e1d 100644
---- a/fs/inode.c
-+++ b/fs/inode.c
-@@ -116,6 +116,10 @@ int proc_nr_inodes(struct ctl_table *table, int write,
- }
- #endif
- 
-+/* sysctl */
-+int device_sidechannel_restrict __read_mostly = 1;
-+EXPORT_SYMBOL(device_sidechannel_restrict);
-+
- static int no_open(struct inode *inode, struct file *file)
- {
- 	return -ENXIO;
-diff --git a/fs/namei.c b/fs/namei.c
-index 4c9d0c36545d..e05f9512934a 100644
---- a/fs/namei.c
-+++ b/fs/namei.c
-@@ -932,10 +932,10 @@ static inline void put_link(struct nameidata *nd)
- 		path_put(&last->link);
- }
- 
--int sysctl_protected_symlinks __read_mostly = 0;
--int sysctl_protected_hardlinks __read_mostly = 0;
--int sysctl_protected_fifos __read_mostly;
--int sysctl_protected_regular __read_mostly;
-+int sysctl_protected_symlinks __read_mostly = 1;
-+int sysctl_protected_hardlinks __read_mostly = 1;
-+int sysctl_protected_fifos __read_mostly = 2;
-+int sysctl_protected_regular __read_mostly = 2;
- 
- /**
-  * may_follow_link - Check symlink following for unsafe situations
-diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
-index 14a72224b657..080a8027c6b1 100644
---- a/fs/nfs/Kconfig
-+++ b/fs/nfs/Kconfig
-@@ -195,7 +195,6 @@ config NFS_DEBUG
- 	bool
- 	depends on NFS_FS && SUNRPC_DEBUG
- 	select CRC32
--	default y
- 
- config NFS_DISABLE_UDP_SUPPORT
-        bool "NFS: Disable NFS UDP protocol support"
-diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
-index c930001056f9..6a0a51b3f593 100644
---- a/fs/proc/Kconfig
-+++ b/fs/proc/Kconfig
-@@ -41,7 +41,6 @@ config PROC_KCORE
- config PROC_VMCORE
- 	bool "/proc/vmcore support"
- 	depends on PROC_FS && CRASH_DUMP
--	default y
- 	help
- 	  Exports the dump image of crashed kernel in ELF format.
- 
-diff --git a/fs/stat.c b/fs/stat.c
-index 1196af4d1ea0..4291a2c694e5 100644
---- a/fs/stat.c
-+++ b/fs/stat.c
-@@ -43,8 +43,13 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)
- 	stat->gid = inode->i_gid;
- 	stat->rdev = inode->i_rdev;
- 	stat->size = i_size_read(inode);
--	stat->atime = inode->i_atime;
--	stat->mtime = inode->i_mtime;
-+	if (is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) {
-+		stat->atime = inode->i_ctime;
-+		stat->mtime = inode->i_ctime;
-+	} else {
-+		stat->atime = inode->i_atime;
-+		stat->mtime = inode->i_mtime;
-+	}
- 	stat->ctime = inode->i_ctime;
- 	stat->blksize = i_blocksize(inode);
- 	stat->blocks = inode->i_blocks;
-@@ -91,9 +96,14 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
- 	stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT |
- 				  STATX_ATTR_DAX);
- 
--	if (inode->i_op->getattr)
--		return inode->i_op->getattr(path, stat, request_mask,
--					    query_flags);
-+	if (inode->i_op->getattr) {
-+		int retval = inode->i_op->getattr(path, stat, request_mask, query_flags);
-+		if (!retval && is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) {
-+			stat->atime = stat->ctime;
-+			stat->mtime = stat->ctime;
-+		}
-+		return retval;
-+	}
- 
- 	generic_fillattr(inode, stat);
- 	return 0;
-diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
-index 000b457ad087..06d35ecdcbc8 100644
---- a/fs/userfaultfd.c
-+++ b/fs/userfaultfd.c
-@@ -28,7 +28,11 @@
- #include <linux/security.h>
- #include <linux/hugetlb.h>
- 
-+#ifdef CONFIG_USERFAULTFD_UNPRIVILEGED
- int sysctl_unprivileged_userfaultfd __read_mostly = 1;
-+#else
-+int sysctl_unprivileged_userfaultfd __read_mostly;
-+#endif
- 
- static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
- 
-diff --git a/include/linux/cache.h b/include/linux/cache.h
-index d742c57eaee5..f0222c070458 100644
---- a/include/linux/cache.h
-+++ b/include/linux/cache.h
-@@ -37,6 +37,8 @@
- #define __ro_after_init __section(".data..ro_after_init")
- #endif
- 
-+#define __read_only __ro_after_init
-+
- #ifndef ____cacheline_aligned
- #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES)))
- #endif
-diff --git a/include/linux/capability.h b/include/linux/capability.h
-index 1e7fe311cabe..a5b6d4c9acf5 100644
---- a/include/linux/capability.h
-+++ b/include/linux/capability.h
-@@ -208,6 +208,7 @@ extern bool has_capability_noaudit(struct task_struct *t, int cap);
- extern bool has_ns_capability_noaudit(struct task_struct *t,
- 				      struct user_namespace *ns, int cap);
- extern bool capable(int cap);
-+extern bool capable_noaudit(int cap);
- extern bool ns_capable(struct user_namespace *ns, int cap);
- extern bool ns_capable_noaudit(struct user_namespace *ns, int cap);
- extern bool ns_capable_setid(struct user_namespace *ns, int cap);
-@@ -234,6 +235,10 @@ static inline bool capable(int cap)
- {
- 	return true;
- }
-+static inline bool capable_noaudit(int cap)
-+{
-+	return true;
-+}
- static inline bool ns_capable(struct user_namespace *ns, int cap)
- {
- 	return true;
-diff --git a/include/linux/dccp.h b/include/linux/dccp.h
-index 07e547c02fd8..504afa1a4be6 100644
---- a/include/linux/dccp.h
-+++ b/include/linux/dccp.h
-@@ -259,6 +259,7 @@ struct dccp_ackvec;
-  * @dccps_sync_scheduled - flag which signals "send out-of-band message soon"
-  * @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets
-  * @dccps_xmit_timer - used by the TX CCID to delay sending (rate-based pacing)
-+ * @dccps_ccid_timer - used by the CCIDs
-  * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs)
-  */
- struct dccp_sock {
-@@ -303,6 +304,7 @@ struct dccp_sock {
- 	__u8				dccps_sync_scheduled:1;
- 	struct tasklet_struct		dccps_xmitlet;
- 	struct timer_list		dccps_xmit_timer;
-+	struct timer_list		dccps_ccid_timer;
- };
- 
- static inline struct dccp_sock *dccp_sk(const struct sock *sk)
-diff --git a/include/linux/fs.h b/include/linux/fs.h
-index 8bde32cf9711..83d50b0a2a18 100644
---- a/include/linux/fs.h
-+++ b/include/linux/fs.h
-@@ -3475,4 +3475,15 @@ static inline int inode_drain_writes(struct inode *inode)
- 	return filemap_write_and_wait(inode->i_mapping);
- }
- 
-+extern int device_sidechannel_restrict;
-+
-+static inline bool is_sidechannel_device(const struct inode *inode)
-+{
-+	umode_t mode;
-+	if (!device_sidechannel_restrict)
-+		return false;
-+	mode = inode->i_mode;
-+	return ((S_ISCHR(mode) || S_ISBLK(mode)) && (mode & (S_IROTH | S_IWOTH)));
-+}
-+
- #endif /* _LINUX_FS_H */
-diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
-index f8acddcf54fb..7b109980327f 100644
---- a/include/linux/fsnotify.h
-+++ b/include/linux/fsnotify.h
-@@ -83,10 +83,14 @@ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask)
- static inline int fsnotify_file(struct file *file, __u32 mask)
- {
- 	const struct path *path = &file->f_path;
-+	struct inode *inode = file_inode(file);
- 
- 	if (file->f_mode & FMODE_NONOTIFY)
- 		return 0;
- 
-+	if (mask & (FS_ACCESS | FS_MODIFY) && is_sidechannel_device(inode))
-+		return 0;
-+
- 	return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
- }
- 
-diff --git a/include/linux/gfp.h b/include/linux/gfp.h
-index c603237e006c..893378b0262e 100644
---- a/include/linux/gfp.h
-+++ b/include/linux/gfp.h
-@@ -568,9 +568,9 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
- extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
- extern unsigned long get_zeroed_page(gfp_t gfp_mask);
- 
--void *alloc_pages_exact(size_t size, gfp_t gfp_mask);
-+void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __attribute__((alloc_size(1)));
- void free_pages_exact(void *virt, size_t size);
--void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
-+void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __attribute__((alloc_size(2)));
- 
- #define __get_free_page(gfp_mask) \
- 		__get_free_pages((gfp_mask), 0)
-diff --git a/include/linux/highmem.h b/include/linux/highmem.h
-index 14e6202ce47f..4348ad7f5c50 100644
---- a/include/linux/highmem.h
-+++ b/include/linux/highmem.h
-@@ -284,6 +284,13 @@ static inline void clear_highpage(struct page *page)
- 	kunmap_atomic(kaddr);
- }
- 
-+static inline void verify_zero_highpage(struct page *page)
-+{
-+	void *kaddr = kmap_atomic(page);
-+	BUG_ON(memchr_inv(kaddr, 0, PAGE_SIZE));
-+	kunmap_atomic(kaddr);
-+}
-+
- static inline void zero_user_segments(struct page *page,
- 	unsigned start1, unsigned end1,
- 	unsigned start2, unsigned end2)
-diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
-index ee8299eb1f52..4381b79f76cf 100644
---- a/include/linux/interrupt.h
-+++ b/include/linux/interrupt.h
-@@ -554,7 +554,7 @@ extern const char * const softirq_to_name[NR_SOFTIRQS];
- 
- struct softirq_action
- {
--	void	(*action)(struct softirq_action *);
-+	void	(*action)(void);
- };
- 
- asmlinkage void do_softirq(void);
-@@ -569,7 +569,7 @@ static inline void do_softirq_own_stack(void)
- }
- #endif
- 
--extern void open_softirq(int nr, void (*action)(struct softirq_action *));
-+extern void __init open_softirq(int nr, void (*action)(void));
- extern void softirq_init(void);
- extern void __raise_softirq_irqoff(unsigned int nr);
- 
-diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h
-index 2b5b64256cf4..8cdce21dce0f 100644
---- a/include/linux/kobject_ns.h
-+++ b/include/linux/kobject_ns.h
-@@ -45,7 +45,7 @@ struct kobj_ns_type_operations {
- 	void (*drop_ns)(void *);
- };
- 
--int kobj_ns_type_register(const struct kobj_ns_type_operations *ops);
-+int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops);
- int kobj_ns_type_registered(enum kobj_ns_type type);
- const struct kobj_ns_type_operations *kobj_child_ns_ops(struct kobject *parent);
- const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj);
-diff --git a/include/linux/mm.h b/include/linux/mm.h
-index 289c26f055cd..0a691a57044d 100644
---- a/include/linux/mm.h
-+++ b/include/linux/mm.h
-@@ -759,7 +759,7 @@ static inline int is_vmalloc_or_module_addr(const void *x)
- }
- #endif
- 
--extern void *kvmalloc_node(size_t size, gfp_t flags, int node);
-+extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __attribute__((alloc_size(1)));
- static inline void *kvmalloc(size_t size, gfp_t flags)
- {
- 	return kvmalloc_node(size, flags, NUMA_NO_NODE);
-@@ -894,10 +894,15 @@ static inline void set_compound_page_dtor(struct page *page,
- 	page[1].compound_dtor = compound_dtor;
- }
- 
--static inline void destroy_compound_page(struct page *page)
-+static inline compound_page_dtor *get_compound_page_dtor(struct page *page)
- {
- 	VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page);
--	compound_page_dtors[page[1].compound_dtor](page);
-+	return compound_page_dtors[page[1].compound_dtor];
-+}
-+
-+static inline void destroy_compound_page(struct page *page)
-+{
-+	(*get_compound_page_dtor(page))(page);
- }
- 
- static inline unsigned int compound_order(struct page *page)
-diff --git a/include/linux/percpu.h b/include/linux/percpu.h
-index 5e76af742c80..9a6c682ec127 100644
---- a/include/linux/percpu.h
-+++ b/include/linux/percpu.h
-@@ -123,7 +123,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size,
- 				pcpu_fc_populate_pte_fn_t populate_pte_fn);
- #endif
- 
--extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align);
-+extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __attribute__((alloc_size(1)));
- extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr);
- extern bool is_kernel_percpu_address(unsigned long addr);
- 
-@@ -131,8 +131,8 @@ extern bool is_kernel_percpu_address(unsigned long addr);
- extern void __init setup_per_cpu_areas(void);
- #endif
- 
--extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp);
--extern void __percpu *__alloc_percpu(size_t size, size_t align);
-+extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __attribute__((alloc_size(1)));
-+extern void __percpu *__alloc_percpu(size_t size, size_t align) __attribute__((alloc_size(1)));
- extern void free_percpu(void __percpu *__pdata);
- extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
- 
-diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
-index 072ac6c1ef2b..2c8f98a8f8b0 100644
---- a/include/linux/perf_event.h
-+++ b/include/linux/perf_event.h
-@@ -1315,6 +1315,14 @@ static inline int perf_is_paranoid(void)
- 	return sysctl_perf_event_paranoid > -1;
- }
- 
-+static inline int perf_allow_open(struct perf_event_attr *attr)
-+{
-+	if (sysctl_perf_event_paranoid > 2 && !perfmon_capable())
-+		return -EACCES;
-+
-+	return security_perf_event_open(attr, PERF_SECURITY_OPEN);
-+}
-+
- static inline int perf_allow_kernel(struct perf_event_attr *attr)
- {
- 	if (sysctl_perf_event_paranoid > 1 && !perfmon_capable())
-diff --git a/include/linux/slab.h b/include/linux/slab.h
-index dd6897f62010..78f99835b91b 100644
---- a/include/linux/slab.h
-+++ b/include/linux/slab.h
-@@ -181,7 +181,7 @@ int kmem_cache_shrink(struct kmem_cache *);
- /*
-  * Common kmalloc functions provided by all allocators
-  */
--void * __must_check krealloc(const void *, size_t, gfp_t);
-+void * __must_check krealloc(const void *, size_t, gfp_t) __attribute((alloc_size(2)));
- void kfree(const void *);
- void kfree_sensitive(const void *);
- size_t __ksize(const void *);
-@@ -386,7 +386,7 @@ static __always_inline unsigned int kmalloc_index(size_t size)
- }
- #endif /* !CONFIG_SLOB */
- 
--void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc;
-+void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc __attribute__((alloc_size(1)));
- void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc;
- void kmem_cache_free(struct kmem_cache *, void *);
- 
-@@ -410,7 +410,7 @@ static __always_inline void kfree_bulk(size_t size, void **p)
- }
- 
- #ifdef CONFIG_NUMA
--void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc;
-+void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc __attribute__((alloc_size(1)));
- void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc;
- #else
- static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
-@@ -535,7 +535,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
-  *	Try really hard to succeed the allocation but fail
-  *	eventually.
-  */
--static __always_inline void *kmalloc(size_t size, gfp_t flags)
-+static __always_inline __attribute__((alloc_size(1))) void *kmalloc(size_t size, gfp_t flags)
- {
- 	if (__builtin_constant_p(size)) {
- #ifndef CONFIG_SLOB
-@@ -557,7 +557,7 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags)
- 	return __kmalloc(size, flags);
- }
- 
--static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
-+static __always_inline __attribute__((alloc_size(1))) void *kmalloc_node(size_t size, gfp_t flags, int node)
- {
- #ifndef CONFIG_SLOB
- 	if (__builtin_constant_p(size) &&
-diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
-index 1be0ed5befa1..c71cf30b5987 100644
---- a/include/linux/slub_def.h
-+++ b/include/linux/slub_def.h
-@@ -113,6 +113,11 @@ struct kmem_cache {
- 	unsigned long random;
- #endif
- 
-+#ifdef CONFIG_SLAB_CANARY
-+	unsigned long random_active;
-+	unsigned long random_inactive;
-+#endif
-+
- #ifdef CONFIG_NUMA
- 	/*
- 	 * Defragmentation by allocating from a remote node.
-diff --git a/include/linux/string.h b/include/linux/string.h
-index b1f3894a0a3e..4c5564a6ad80 100644
---- a/include/linux/string.h
-+++ b/include/linux/string.h
-@@ -264,6 +264,12 @@ void __read_overflow2(void) __compiletime_error("detected read beyond size of ob
- void __read_overflow3(void) __compiletime_error("detected read beyond size of object passed as 3rd parameter");
- void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter");
- 
-+#ifdef CONFIG_FORTIFY_SOURCE_STRICT_STRING
-+#define __string_size(p) __builtin_object_size(p, 1)
-+#else
-+#define __string_size(p) __builtin_object_size(p, 0)
-+#endif
-+
- #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE)
- 
- #ifdef CONFIG_KASAN
-@@ -292,7 +298,7 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size)
- 
- __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size)
- {
--	size_t p_size = __builtin_object_size(p, 0);
-+	size_t p_size = __string_size(p);
- 	if (__builtin_constant_p(size) && p_size < size)
- 		__write_overflow();
- 	if (p_size < size)
-@@ -302,7 +308,7 @@ __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size)
- 
- __FORTIFY_INLINE char *strcat(char *p, const char *q)
- {
--	size_t p_size = __builtin_object_size(p, 0);
-+	size_t p_size = __string_size(p);
- 	if (p_size == (size_t)-1)
- 		return __underlying_strcat(p, q);
- 	if (strlcat(p, q, p_size) >= p_size)
-@@ -313,7 +319,7 @@ __FORTIFY_INLINE char *strcat(char *p, const char *q)
- __FORTIFY_INLINE __kernel_size_t strlen(const char *p)
- {
- 	__kernel_size_t ret;
--	size_t p_size = __builtin_object_size(p, 0);
-+	size_t p_size = __string_size(p);
- 
- 	/* Work around gcc excess stack consumption issue */
- 	if (p_size == (size_t)-1 ||
-@@ -328,7 +334,7 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p)
- extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen);
- __FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen)
- {
--	size_t p_size = __builtin_object_size(p, 0);
-+	size_t p_size = __string_size(p);
- 	__kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size);
- 	if (p_size <= ret && maxlen != ret)
- 		fortify_panic(__func__);
-@@ -340,8 +346,8 @@ extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy);
- __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size)
- {
- 	size_t ret;
--	size_t p_size = __builtin_object_size(p, 0);
--	size_t q_size = __builtin_object_size(q, 0);
-+	size_t p_size = __string_size(p);
-+	size_t q_size = __string_size(q);
- 	if (p_size == (size_t)-1 && q_size == (size_t)-1)
- 		return __real_strlcpy(p, q, size);
- 	ret = strlen(q);
-@@ -361,8 +367,8 @@ __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size)
- __FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count)
- {
- 	size_t p_len, copy_len;
--	size_t p_size = __builtin_object_size(p, 0);
--	size_t q_size = __builtin_object_size(q, 0);
-+	size_t p_size = __string_size(p);
-+	size_t q_size = __string_size(q);
- 	if (p_size == (size_t)-1 && q_size == (size_t)-1)
- 		return __underlying_strncat(p, q, count);
- 	p_len = strlen(p);
-@@ -475,8 +481,8 @@ __FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp)
- /* defined after fortified strlen and memcpy to reuse them */
- __FORTIFY_INLINE char *strcpy(char *p, const char *q)
- {
--	size_t p_size = __builtin_object_size(p, 0);
--	size_t q_size = __builtin_object_size(q, 0);
-+	size_t p_size = __string_size(p);
-+	size_t q_size = __string_size(q);
- 	if (p_size == (size_t)-1 && q_size == (size_t)-1)
- 		return __underlying_strcpy(p, q);
- 	memcpy(p, q, strlen(q) + 1);
-diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
-index 51298a4f4623..b835c57330f2 100644
---- a/include/linux/sysctl.h
-+++ b/include/linux/sysctl.h
-@@ -53,6 +53,8 @@ int proc_douintvec(struct ctl_table *, int, void *, size_t *, loff_t *);
- int proc_dointvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *);
- int proc_douintvec_minmax(struct ctl_table *table, int write, void *buffer,
- 		size_t *lenp, loff_t *ppos);
-+int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
-+				  void *buffer, size_t *lenp, loff_t *ppos);
- int proc_dointvec_jiffies(struct ctl_table *, int, void *, size_t *, loff_t *);
- int proc_dointvec_userhz_jiffies(struct ctl_table *, int, void *, size_t *,
- 		loff_t *);
-diff --git a/include/linux/tty.h b/include/linux/tty.h
-index 5972f43b9d5a..b1750024d570 100644
---- a/include/linux/tty.h
-+++ b/include/linux/tty.h
-@@ -14,6 +14,7 @@
- #include <uapi/linux/tty.h>
- #include <linux/rwsem.h>
- #include <linux/llist.h>
-+#include <linux/user_namespace.h>
- 
- 
- /*
-@@ -341,6 +342,7 @@ struct tty_struct {
- 	/* If the tty has a pending do_SAK, queue it here - akpm */
- 	struct work_struct SAK_work;
- 	struct tty_port *port;
-+	struct user_namespace *owner_user_ns;
- } __randomize_layout;
- 
- /* Each of a tty's open files has private_data pointing to tty_file_private */
-@@ -350,6 +352,8 @@ struct tty_file_private {
- 	struct list_head list;
- };
- 
-+extern int tiocsti_restrict;
-+
- /* tty magic number */
- #define TTY_MAGIC		0x5401
- 
-diff --git a/include/linux/usb.h b/include/linux/usb.h
-index d6a41841b93e..f7f3d138b4e6 100644
---- a/include/linux/usb.h
-+++ b/include/linux/usb.h
-@@ -2037,6 +2037,17 @@ extern void usb_led_activity(enum usb_led_event ev);
- static inline void usb_led_activity(enum usb_led_event ev) {}
- #endif
- 
-+/* sysctl.c */
-+extern int deny_new_usb;
-+#ifdef CONFIG_SYSCTL
-+extern int usb_init_sysctl(void);
-+extern void usb_exit_sysctl(void);
-+#else
-+static inline int usb_init_sysctl(void) { return 0; }
-+static inline void usb_exit_sysctl(void) { }
-+#endif /* CONFIG_SYSCTL */
-+
-+
- #endif  /* __KERNEL__ */
- 
- #endif
-diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
-index 7616c7bf4b24..bdbfcfe5df1e 100644
---- a/include/linux/user_namespace.h
-+++ b/include/linux/user_namespace.h
-@@ -109,6 +109,8 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
- 
- #ifdef CONFIG_USER_NS
- 
-+extern int unprivileged_userns_clone;
-+
- static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
- {
- 	if (ns)
-@@ -142,6 +144,8 @@ extern bool current_in_userns(const struct user_namespace *target_ns);
- struct ns_common *ns_get_owner(struct ns_common *ns);
- #else
- 
-+#define unprivileged_userns_clone 0
-+
- static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
- {
- 	return &init_user_ns;
-diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
-index 76dad53a410a..35de3a67efa4 100644
---- a/include/linux/vmalloc.h
-+++ b/include/linux/vmalloc.h
-@@ -97,18 +97,18 @@ static inline void vmalloc_init(void)
- static inline unsigned long vmalloc_nr_pages(void) { return 0; }
- #endif
- 
--extern void *vmalloc(unsigned long size);
--extern void *vzalloc(unsigned long size);
--extern void *vmalloc_user(unsigned long size);
--extern void *vmalloc_node(unsigned long size, int node);
--extern void *vzalloc_node(unsigned long size, int node);
--extern void *vmalloc_32(unsigned long size);
--extern void *vmalloc_32_user(unsigned long size);
--extern void *__vmalloc(unsigned long size, gfp_t gfp_mask);
-+extern void *vmalloc(unsigned long size) __attribute__((alloc_size(1)));
-+extern void *vzalloc(unsigned long size) __attribute__((alloc_size(1)));
-+extern void *vmalloc_user(unsigned long size) __attribute__((alloc_size(1)));
-+extern void *vmalloc_node(unsigned long size, int node) __attribute__((alloc_size(1)));
-+extern void *vzalloc_node(unsigned long size, int node) __attribute__((alloc_size(1)));
-+extern void *vmalloc_32(unsigned long size) __attribute__((alloc_size(1)));
-+extern void *vmalloc_32_user(unsigned long size) __attribute__((alloc_size(1)));
-+extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __attribute__((alloc_size(1)));
- extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
- 			unsigned long start, unsigned long end, gfp_t gfp_mask,
- 			pgprot_t prot, unsigned long vm_flags, int node,
--			const void *caller);
-+			const void *caller) __attribute__((alloc_size(1)));
- void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
- 		int node, const void *caller);
- 
-diff --git a/include/net/tcp.h b/include/net/tcp.h
-index 7d66c61d22c7..cbb8c45ac186 100644
---- a/include/net/tcp.h
-+++ b/include/net/tcp.h
-@@ -245,6 +245,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
- /* sysctl variables for tcp */
- extern int sysctl_tcp_max_orphans;
- extern long sysctl_tcp_mem[3];
-+extern int sysctl_tcp_simult_connect;
- 
- #define TCP_RACK_LOSS_DETECTION  0x1 /* Use RACK to detect losses */
- #define TCP_RACK_STATIC_REO_WND  0x2 /* Use static RACK reo wnd */
-diff --git a/init/Kconfig b/init/Kconfig
-index fc4c9f416fad..36edd8448d40 100644
---- a/init/Kconfig
-+++ b/init/Kconfig
-@@ -418,6 +418,7 @@ config USELIB
- config AUDIT
- 	bool "Auditing support"
- 	depends on NET
-+	default y
- 	help
- 	  Enable auditing infrastructure that can be used with another
- 	  kernel subsystem, such as SELinux (which requires this for
-@@ -1172,6 +1173,22 @@ config USER_NS
- 
- 	  If unsure, say N.
- 
-+config USER_NS_UNPRIVILEGED
-+	bool "Allow unprivileged users to create namespaces"
-+	depends on USER_NS
-+	default n
-+	help
-+	  When disabled, unprivileged users will not be able to create
-+	  new namespaces. Allowing users to create their own namespaces
-+	  has been part of several recent local privilege escalation
-+	  exploits, so if you need user namespaces but are
-+	  paranoid^Wsecurity-conscious you want to disable this.
-+
-+	  This setting can be overridden at runtime via the
-+	  kernel.unprivileged_userns_clone sysctl.
-+
-+	  If unsure, say N.
-+
- config PID_NS
- 	bool "PID Namespaces"
- 	default y
-@@ -1402,9 +1419,8 @@ menuconfig EXPERT
- 	  Only use this if you really know what you are doing.
- 
- config UID16
--	bool "Enable 16-bit UID system calls" if EXPERT
-+	bool "Enable 16-bit UID system calls"
- 	depends on HAVE_UID16 && MULTIUSER
--	default y
- 	help
- 	  This enables the legacy 16-bit UID syscall wrappers.
- 
-@@ -1433,14 +1449,13 @@ config SGETMASK_SYSCALL
- 	  If unsure, leave the default option here.
- 
- config SYSFS_SYSCALL
--	bool "Sysfs syscall support" if EXPERT
--	default y
-+	bool "Sysfs syscall support"
- 	help
- 	  sys_sysfs is an obsolete system call no longer supported in libc.
- 	  Note that disabling this option is more secure but might break
- 	  compatibility with some systems.
- 
--	  If unsure say Y here.
-+	  If unsure say N here.
- 
- config FHANDLE
- 	bool "open by fhandle syscalls" if EXPERT
-@@ -1591,8 +1606,7 @@ config SHMEM
- 	  which may be appropriate on small systems without swap.
- 
- config AIO
--	bool "Enable AIO support" if EXPERT
--	default y
-+	bool "Enable AIO support"
- 	help
- 	  This option enables POSIX asynchronous I/O which may by used
- 	  by some high performance threaded applications. Disabling
-@@ -1731,6 +1745,23 @@ config USERFAULTFD
- 	  Enable the userfaultfd() system call that allows to intercept and
- 	  handle page faults in userland.
- 
-+config USERFAULTFD_UNPRIVILEGED
-+	bool "Allow unprivileged users to use the userfaultfd syscall"
-+	depends on USERFAULTFD
-+	default n
-+	help
-+	  When disabled, unprivileged users will not be able to use the userfaultfd
-+	  syscall. Userfaultfd provide attackers with a way to stall a kernel
-+	  thread in the middle of memory accesses from userspace by initiating an
-+	  access on an unmapped page. To avoid various heap grooming and heap
-+	  spraying techniques for exploiting use-after-free flaws this should be
-+	  disabled by default.
-+
-+	  This setting can be overridden at runtime via the
-+	  vm.unprivileged_userfaultfd sysctl.
-+
-+	  If unsure, say N.
-+
- config ARCH_HAS_MEMBARRIER_CALLBACKS
- 	bool
- 
-@@ -1853,7 +1884,7 @@ config VM_EVENT_COUNTERS
- 
- config SLUB_DEBUG
- 	default y
--	bool "Enable SLUB debugging support" if EXPERT
-+	bool "Enable SLUB debugging support"
- 	depends on SLUB && SYSFS
- 	help
- 	  SLUB has extensive debug support features. Disabling these can
-@@ -1877,7 +1908,6 @@ config SLUB_MEMCG_SYSFS_ON
- 
- config COMPAT_BRK
- 	bool "Disable heap randomization"
--	default y
- 	help
- 	  Randomizing heap placement makes heap exploits harder, but it
- 	  also breaks ancient binaries (including anything libc5 based).
-@@ -1924,7 +1954,6 @@ endchoice
- 
- config SLAB_MERGE_DEFAULT
- 	bool "Allow slab caches to be merged"
--	default y
- 	help
- 	  For reduced kernel memory fragmentation, slab caches can be
- 	  merged when they share the same size and other characteristics.
-@@ -1939,6 +1968,7 @@ config SLAB_MERGE_DEFAULT
- config SLAB_FREELIST_RANDOM
- 	bool "Randomize slab freelist"
- 	depends on SLAB || SLUB
-+	default y
- 	help
- 	  Randomizes the freelist order used on creating new pages. This
- 	  security feature reduces the predictability of the kernel slab
-@@ -1947,6 +1977,7 @@ config SLAB_FREELIST_RANDOM
- config SLAB_FREELIST_HARDENED
- 	bool "Harden slab freelist metadata"
- 	depends on SLAB || SLUB
-+	default y
- 	help
- 	  Many kernel heap attacks try to target slab cache metadata and
- 	  other infrastructure. This options makes minor performance
-@@ -1955,6 +1986,23 @@ config SLAB_FREELIST_HARDENED
- 	  sanity-checking than others. This option is most effective with
- 	  CONFIG_SLUB.
- 
-+config SLAB_CANARY
-+	depends on SLUB
-+	depends on !SLAB_MERGE_DEFAULT
-+	bool "SLAB canaries"
-+	default y
-+	help
-+	  Place canaries at the end of kernel slab allocations, sacrificing
-+	  some performance and memory usage for security.
-+
-+	  Canaries can detect some forms of heap corruption when allocations
-+	  are freed and as part of the HARDENED_USERCOPY feature. It provides
-+	  basic use-after-free detection for HARDENED_USERCOPY.
-+
-+	  Canaries absorb small overflows (rendering them harmless), mitigate
-+	  non-NUL terminated C string overflows on 64-bit via a guaranteed zero
-+	  byte and provide basic double-free detection.
-+
- config SHUFFLE_PAGE_ALLOCATOR
- 	bool "Page allocator randomization"
- 	default SLAB_FREELIST_RANDOM && ACPI_NUMA
-diff --git a/kernel/audit.c b/kernel/audit.c
-index 68cee3bc8cfe..2059c66f7c9b 100644
---- a/kernel/audit.c
-+++ b/kernel/audit.c
-@@ -1693,6 +1693,9 @@ static int __init audit_enable(char *str)
- 
- 	if (audit_default == AUDIT_OFF)
- 		audit_initialized = AUDIT_DISABLED;
-+	else if (!audit_ever_enabled)
-+		audit_initialized = AUDIT_UNINITIALIZED;
-+
- 	if (audit_set_enabled(audit_default))
- 		pr_err("audit: error setting audit state (%d)\n",
- 		       audit_default);
-diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
-index 182e162f8fd0..1705707b3b90 100644
---- a/kernel/bpf/core.c
-+++ b/kernel/bpf/core.c
-@@ -524,7 +524,7 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)
- /* All BPF JIT sysctl knobs here. */
- int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
- int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
--int bpf_jit_harden   __read_mostly;
-+int bpf_jit_harden   __read_mostly = 2;
- long bpf_jit_limit   __read_mostly;
- 
- static void
-diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
-index 9433ab9995cd..348c36273f1a 100644
---- a/kernel/bpf/syscall.c
-+++ b/kernel/bpf/syscall.c
-@@ -50,7 +50,7 @@ static DEFINE_SPINLOCK(map_idr_lock);
- static DEFINE_IDR(link_idr);
- static DEFINE_SPINLOCK(link_idr_lock);
- 
--int sysctl_unprivileged_bpf_disabled __read_mostly;
-+int sysctl_unprivileged_bpf_disabled __read_mostly = 1;
- 
- static const struct bpf_map_ops * const bpf_map_types[] = {
- #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
-diff --git a/kernel/capability.c b/kernel/capability.c
-index de7eac903a2a..5602178f3d21 100644
---- a/kernel/capability.c
-+++ b/kernel/capability.c
-@@ -449,6 +449,12 @@ bool capable(int cap)
- 	return ns_capable(&init_user_ns, cap);
- }
- EXPORT_SYMBOL(capable);
-+
-+bool capable_noaudit(int cap)
-+{
-+	return ns_capable_noaudit(&init_user_ns, cap);
-+}
-+EXPORT_SYMBOL(capable_noaudit);
- #endif /* CONFIG_MULTIUSER */
- 
- /**
-diff --git a/kernel/events/core.c b/kernel/events/core.c
-index 7e0fdc19043e..42636279e201 100644
---- a/kernel/events/core.c
-+++ b/kernel/events/core.c
-@@ -408,8 +408,13 @@ static cpumask_var_t perf_online_mask;
-  *   0 - disallow raw tracepoint access for unpriv
-  *   1 - disallow cpu events for unpriv
-  *   2 - disallow kernel profiling for unpriv
-+ *   3 - disallow all unpriv perf event use
-  */
-+#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT
-+int sysctl_perf_event_paranoid __read_mostly = 3;
-+#else
- int sysctl_perf_event_paranoid __read_mostly = 2;
-+#endif
- 
- /* Minimum for 512 kiB + 1 user control page */
- int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
-@@ -11692,7 +11697,7 @@ SYSCALL_DEFINE5(perf_event_open,
- 		return -EINVAL;
- 
- 	/* Do we allow access to perf_event_open(2) ? */
--	err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
-+	err = perf_allow_open(&attr);
- 	if (err)
- 		return err;
- 
-diff --git a/kernel/fork.c b/kernel/fork.c
-index 7c044d377926..8066141b692f 100644
---- a/kernel/fork.c
-+++ b/kernel/fork.c
-@@ -82,6 +82,7 @@
- #include <linux/perf_event.h>
- #include <linux/posix-timers.h>
- #include <linux/user-return-notifier.h>
-+#include <linux/user_namespace.h>
- #include <linux/oom.h>
- #include <linux/khugepaged.h>
- #include <linux/signalfd.h>
-@@ -1871,6 +1872,10 @@ static __latent_entropy struct task_struct *copy_process(
- 	if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
- 		return ERR_PTR(-EINVAL);
- 
-+	if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone)
-+		if (!capable(CAP_SYS_ADMIN))
-+			return ERR_PTR(-EPERM);
-+
- 	/*
- 	 * Thread groups must share signals as well, and detached threads
- 	 * can only be started up within the thread group.
-@@ -2936,6 +2941,12 @@ int ksys_unshare(unsigned long unshare_flags)
- 	if (unshare_flags & CLONE_NEWNS)
- 		unshare_flags |= CLONE_FS;
- 
-+	if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) {
-+		err = -EPERM;
-+		if (!capable(CAP_SYS_ADMIN))
-+			goto bad_unshare_out;
-+	}
-+
- 	err = check_unshare_flags(unshare_flags);
- 	if (err)
- 		goto bad_unshare_out;
-diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
-index aa897c3f2e92..d8976886fd68 100644
---- a/kernel/rcu/tiny.c
-+++ b/kernel/rcu/tiny.c
-@@ -101,7 +101,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head)
- }
- 
- /* Invoke the RCU callbacks whose grace period has elapsed.  */
--static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
-+static __latent_entropy void rcu_process_callbacks(void)
- {
- 	struct rcu_head *next, *list;
- 	unsigned long flags;
-diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
-index 61e250cdd7c9..9ef3aa84f3c9 100644
---- a/kernel/rcu/tree.c
-+++ b/kernel/rcu/tree.c
-@@ -2727,7 +2727,7 @@ static __latent_entropy void rcu_core(void)
- 		queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work);
- }
- 
--static void rcu_core_si(struct softirq_action *h)
-+static void rcu_core_si(void)
- {
- 	rcu_core();
- }
-diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
-index d6e1c90de570..03ea833c66a5 100644
---- a/kernel/sched/fair.c
-+++ b/kernel/sched/fair.c
-@@ -10669,7 +10669,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
-  * run_rebalance_domains is triggered when needed from the scheduler tick.
-  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
-  */
--static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
-+static __latent_entropy void run_rebalance_domains(void)
- {
- 	struct rq *this_rq = this_rq();
- 	enum cpu_idle_type idle = this_rq->idle_balance ?
-diff --git a/kernel/softirq.c b/kernel/softirq.c
-index 09229ad82209..6a02d63b135a 100644
---- a/kernel/softirq.c
-+++ b/kernel/softirq.c
-@@ -52,7 +52,7 @@ DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat);
- EXPORT_PER_CPU_SYMBOL(irq_stat);
- #endif
- 
--static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
-+static struct softirq_action softirq_vec[NR_SOFTIRQS] __ro_after_init __aligned(PAGE_SIZE);
- 
- DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
- 
-@@ -295,7 +295,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
- 		kstat_incr_softirqs_this_cpu(vec_nr);
- 
- 		trace_softirq_entry(vec_nr);
--		h->action(h);
-+		h->action();
- 		trace_softirq_exit(vec_nr);
- 		if (unlikely(prev_count != preempt_count())) {
- 			pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
-@@ -486,7 +486,7 @@ void __raise_softirq_irqoff(unsigned int nr)
- 	or_softirq_pending(1UL << nr);
- }
- 
--void open_softirq(int nr, void (*action)(struct softirq_action *))
-+void __init open_softirq(int nr, void (*action)(void))
- {
- 	softirq_vec[nr].action = action;
- }
-@@ -532,8 +532,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
- }
- EXPORT_SYMBOL(__tasklet_hi_schedule);
- 
--static void tasklet_action_common(struct softirq_action *a,
--				  struct tasklet_head *tl_head,
-+static void tasklet_action_common(struct tasklet_head *tl_head,
- 				  unsigned int softirq_nr)
- {
- 	struct tasklet_struct *list;
-@@ -573,14 +572,14 @@ static void tasklet_action_common(struct softirq_action *a,
- 	}
- }
- 
--static __latent_entropy void tasklet_action(struct softirq_action *a)
-+static __latent_entropy void tasklet_action(void)
- {
--	tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
-+	tasklet_action_common(this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
- }
- 
--static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
-+static __latent_entropy void tasklet_hi_action(void)
- {
--	tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
-+	tasklet_action_common(this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
- }
- 
- void tasklet_setup(struct tasklet_struct *t,
-diff --git a/kernel/sysctl.c b/kernel/sysctl.c
-index b9306d2bb426..c88545fb5967 100644
---- a/kernel/sysctl.c
-+++ b/kernel/sysctl.c
-@@ -103,38 +103,44 @@
- #ifdef CONFIG_LOCKUP_DETECTOR
- #include <linux/nmi.h>
- #endif
-+#ifdef CONFIG_USER_NS
-+#include <linux/user_namespace.h>
-+#endif
-+#if defined CONFIG_TTY
-+#include <linux/tty.h>
-+#endif
- 
- #if defined(CONFIG_SYSCTL)
- 
- /* Constants used for minimum and  maximum */
- #ifdef CONFIG_LOCKUP_DETECTOR
--static int sixty = 60;
--#endif
--
--static int __maybe_unused neg_one = -1;
--static int __maybe_unused two = 2;
--static int __maybe_unused four = 4;
--static unsigned long zero_ul;
--static unsigned long one_ul = 1;
--static unsigned long long_max = LONG_MAX;
--static int one_hundred = 100;
--static int two_hundred = 200;
--static int one_thousand = 1000;
-+static int sixty __read_only = 60;
-+#endif
-+
-+static int __maybe_unused neg_one __read_only = -1;
-+static int __maybe_unused two __read_only = 2;
-+static int __maybe_unused four __read_only = 4;
-+static unsigned long zero_ul __read_only;
-+static unsigned long one_ul __read_only = 1;
-+static unsigned long long_max __read_only = LONG_MAX;
-+static int one_hundred __read_only = 100;
-+static int two_hundred __read_only = 200;
-+static int one_thousand __read_only = 1000;
- #ifdef CONFIG_PRINTK
--static int ten_thousand = 10000;
-+static int ten_thousand __read_only = 10000;
- #endif
- #ifdef CONFIG_PERF_EVENTS
--static int six_hundred_forty_kb = 640 * 1024;
-+static int six_hundred_forty_kb __read_only = 640 * 1024;
- #endif
- 
- /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
--static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
-+static unsigned long dirty_bytes_min __read_only = 2 * PAGE_SIZE;
- 
- /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
--static int maxolduid = 65535;
--static int minolduid;
-+static int maxolduid __read_only = 65535;
-+static int minolduid __read_only;
- 
--static int ngroups_max = NGROUPS_MAX;
-+static int ngroups_max __read_only = NGROUPS_MAX;
- static const int cap_last_cap = CAP_LAST_CAP;
- 
- /*
-@@ -142,7 +148,7 @@ static const int cap_last_cap = CAP_LAST_CAP;
-  * and hung_task_check_interval_secs
-  */
- #ifdef CONFIG_DETECT_HUNG_TASK
--static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
-+static unsigned long hung_task_timeout_max __read_only = (LONG_MAX/HZ);
- #endif
- 
- #ifdef CONFIG_INOTIFY_USER
-@@ -185,19 +191,19 @@ int sysctl_legacy_va_layout;
- #endif
- 
- #ifdef CONFIG_SCHED_DEBUG
--static int min_sched_granularity_ns = 100000;		/* 100 usecs */
--static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
--static int min_wakeup_granularity_ns;			/* 0 usecs */
--static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
-+static int min_sched_granularity_ns __read_only = 100000;		/* 100 usecs */
-+static int max_sched_granularity_ns __read_only = NSEC_PER_SEC;	/* 1 second */
-+static int min_wakeup_granularity_ns __read_only;			/* 0 usecs */
-+static int max_wakeup_granularity_ns __read_only = NSEC_PER_SEC;	/* 1 second */
- #ifdef CONFIG_SMP
--static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
--static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-+static int min_sched_tunable_scaling __read_only = SCHED_TUNABLESCALING_NONE;
-+static int max_sched_tunable_scaling __read_only = SCHED_TUNABLESCALING_END-1;
- #endif /* CONFIG_SMP */
- #endif /* CONFIG_SCHED_DEBUG */
- 
- #ifdef CONFIG_COMPACTION
--static int min_extfrag_threshold;
--static int max_extfrag_threshold = 1000;
-+static int min_extfrag_threshold __read_only;
-+static int max_extfrag_threshold __read_only = 1000;
- #endif
- 
- #endif /* CONFIG_SYSCTL */
-@@ -887,8 +893,27 @@ static int proc_taint(struct ctl_table *table, int write,
- 	return err;
- }
- 
--#ifdef CONFIG_PRINTK
--static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
-+/**
-+ * proc_dointvec_minmax_sysadmin - read a vector of integers with min/max values
-+ * checking CAP_SYS_ADMIN on write
-+ * @table: the sysctl table
-+ * @write: %TRUE if this is a write to the sysctl file
-+ * @buffer: the user buffer
-+ * @lenp: the size of the user buffer
-+ * @ppos: file position
-+ *
-+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
-+ * values from/to the user buffer, treated as an ASCII string.
-+ *
-+ * This routine will ensure the values are within the range specified by
-+ * table->extra1 (min) and table->extra2 (max).
-+ *
-+ * Writing is only allowed when root has CAP_SYS_ADMIN.
-+ *
-+ * Returns 0 on success, -EPERM on permission failure or -EINVAL on write
-+ * when the range check fails.
-+ */
-+int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
- 				void *buffer, size_t *lenp, loff_t *ppos)
- {
- 	if (write && !capable(CAP_SYS_ADMIN))
-@@ -896,7 +921,6 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
- 
- 	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- }
--#endif
- 
- /**
-  * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
-@@ -1582,6 +1606,12 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
- 	return -ENOSYS;
- }
- 
-+int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
-+				  void *buffer, size_t *lenp, loff_t *ppos)
-+{
-+	return -ENOSYS;
-+}
-+
- int proc_dointvec_jiffies(struct ctl_table *table, int write,
- 		    void *buffer, size_t *lenp, loff_t *ppos)
- {
-@@ -1902,6 +1932,15 @@ static struct ctl_table kern_table[] = {
- 		.proc_handler	= proc_dointvec,
- 	},
- #endif
-+#ifdef CONFIG_USER_NS
-+	{
-+		.procname	= "unprivileged_userns_clone",
-+		.data		= &unprivileged_userns_clone,
-+		.maxlen		= sizeof(int),
-+		.mode		= 0644,
-+		.proc_handler	= proc_dointvec,
-+	},
-+#endif
- #ifdef CONFIG_PROC_SYSCTL
- 	{
- 		.procname	= "tainted",
-@@ -2260,6 +2299,26 @@ static struct ctl_table kern_table[] = {
- 		.extra2		= &two,
- 	},
- #endif
-+#if defined CONFIG_TTY
-+	{
-+		.procname	= "tiocsti_restrict",
-+		.data		= &tiocsti_restrict,
-+		.maxlen		= sizeof(int),
-+		.mode		= 0644,
-+		.proc_handler	= proc_dointvec_minmax_sysadmin,
-+		.extra1		= SYSCTL_ZERO,
-+		.extra2		= SYSCTL_ONE,
-+	},
-+#endif
-+	{
-+		.procname	= "device_sidechannel_restrict",
-+		.data		= &device_sidechannel_restrict,
-+		.maxlen		= sizeof(int),
-+		.mode		= 0644,
-+		.proc_handler	= proc_dointvec_minmax_sysadmin,
-+		.extra1		= SYSCTL_ZERO,
-+		.extra2		= SYSCTL_ONE,
-+	},
- 	{
- 		.procname	= "ngroups_max",
- 		.data		= &ngroups_max,
-@@ -3415,6 +3474,7 @@ EXPORT_SYMBOL(proc_douintvec);
- EXPORT_SYMBOL(proc_dointvec_jiffies);
- EXPORT_SYMBOL(proc_dointvec_minmax);
- EXPORT_SYMBOL_GPL(proc_douintvec_minmax);
-+EXPORT_SYMBOL(proc_dointvec_minmax_sysadmin);
- EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
- EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
- EXPORT_SYMBOL(proc_dostring);
-diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
-index 9505b1f21cdf..b67bb69052af 100644
---- a/kernel/time/hrtimer.c
-+++ b/kernel/time/hrtimer.c
-@@ -1605,7 +1605,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
- 	}
- }
- 
--static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
-+static __latent_entropy void hrtimer_run_softirq(void)
- {
- 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
- 	unsigned long flags;
-diff --git a/kernel/time/timer.c b/kernel/time/timer.c
-index c3ad64fb9d8b..217bc49a3856 100644
---- a/kernel/time/timer.c
-+++ b/kernel/time/timer.c
-@@ -1753,7 +1753,7 @@ static inline void __run_timers(struct timer_base *base)
- /*
-  * This function runs timers and the timer-tq in bottom half context.
-  */
--static __latent_entropy void run_timer_softirq(struct softirq_action *h)
-+static __latent_entropy void run_timer_softirq(void)
- {
- 	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
- 
-diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
-index ce396ea4de60..c3a6ef1f10ed 100644
---- a/kernel/user_namespace.c
-+++ b/kernel/user_namespace.c
-@@ -21,6 +21,13 @@
- #include <linux/bsearch.h>
- #include <linux/sort.h>
- 
-+/* sysctl */
-+#ifdef CONFIG_USER_NS_UNPRIVILEGED
-+int unprivileged_userns_clone = 1;
-+#else
-+int unprivileged_userns_clone;
-+#endif
-+
- static struct kmem_cache *user_ns_cachep __read_mostly;
- static DEFINE_MUTEX(userns_state_mutex);
- 
-diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
-index dcf4a9028e16..82f084142d8b 100644
---- a/lib/Kconfig.debug
-+++ b/lib/Kconfig.debug
-@@ -374,6 +374,9 @@ config DEBUG_FORCE_FUNCTION_ALIGN_32B
- 
- 	  It is mainly for debug and performance tuning use.
- 
-+config DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE
-+	bool "Enable verbose reporting of writable function pointers"
-+
- #
- # Select this config option from the architecture Kconfig, if it
- # is preferred to always offer frame pointers as a config
-@@ -488,7 +491,7 @@ config DEBUG_FS
- choice
- 	prompt "Debugfs default access"
- 	depends on DEBUG_FS
--	default DEBUG_FS_ALLOW_ALL
-+	default DEBUG_FS_ALLOW_NONE
- 	help
- 	  This selects the default access restrictions for debugfs.
- 	  It can be overridden with kernel command line option
-@@ -894,6 +897,7 @@ menu "Debug Oops, Lockups and Hangs"
- 
- config PANIC_ON_OOPS
- 	bool "Panic on Oops"
-+	default y
- 	help
- 	  Say Y here to enable the kernel to panic when it oopses. This
- 	  has the same effect as setting oops=panic on the kernel command
-@@ -903,7 +907,7 @@ config PANIC_ON_OOPS
- 	  anything erroneous after an oops which could result in data
- 	  corruption or other issues.
- 
--	  Say N if unsure.
-+	  Say Y if unsure.
- 
- config PANIC_ON_OOPS_VALUE
- 	int
-@@ -1471,6 +1475,7 @@ menu "Debug kernel data structures"
- config DEBUG_LIST
- 	bool "Debug linked list manipulation"
- 	depends on DEBUG_KERNEL || BUG_ON_DATA_CORRUPTION
-+	default y
- 	help
- 	  Enable this to turn on extended checks in the linked-list
- 	  walking routines.
-@@ -1510,6 +1515,7 @@ config DEBUG_NOTIFIERS
- config BUG_ON_DATA_CORRUPTION
- 	bool "Trigger a BUG when data corruption is detected"
- 	select DEBUG_LIST
-+	default y
- 	help
- 	  Select this option if the kernel should BUG when it encounters
- 	  data corruption in kernel memory structures when they get checked
-@@ -1665,6 +1671,7 @@ config STRICT_DEVMEM
- config IO_STRICT_DEVMEM
- 	bool "Filter I/O access to /dev/mem"
- 	depends on STRICT_DEVMEM
-+	default y
- 	help
- 	  If this option is disabled, you allow userspace (root) access to all
- 	  io-memory regardless of whether a driver is actively using that
-diff --git a/lib/irq_poll.c b/lib/irq_poll.c
-index 2f17b488d58e..b6e7996a0058 100644
---- a/lib/irq_poll.c
-+++ b/lib/irq_poll.c
-@@ -75,7 +75,7 @@ void irq_poll_complete(struct irq_poll *iop)
- }
- EXPORT_SYMBOL(irq_poll_complete);
- 
--static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
-+static void __latent_entropy irq_poll_softirq(void)
- {
- 	struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);
- 	int rearm = 0, budget = irq_poll_budget;
-diff --git a/lib/kobject.c b/lib/kobject.c
-index ea53b30cf483..5343bbeea5f8 100644
---- a/lib/kobject.c
-+++ b/lib/kobject.c
-@@ -1023,9 +1023,9 @@ EXPORT_SYMBOL_GPL(kset_create_and_add);
- 
- 
- static DEFINE_SPINLOCK(kobj_ns_type_lock);
--static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES];
-+static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES] __ro_after_init;
- 
--int kobj_ns_type_register(const struct kobj_ns_type_operations *ops)
-+int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops)
- {
- 	enum kobj_ns_type type = ops->type;
- 	int error;
-diff --git a/lib/nlattr.c b/lib/nlattr.c
-index fe60f9ae9db1..0e9d8d239973 100644
---- a/lib/nlattr.c
-+++ b/lib/nlattr.c
-@@ -778,6 +778,8 @@ int nla_memcpy(void *dest, const struct nlattr *src, int count)
- {
- 	int minlen = min_t(int, count, nla_len(src));
- 
-+	BUG_ON(minlen < 0);
-+
- 	memcpy(dest, nla_data(src), minlen);
- 	if (count > minlen)
- 		memset(dest + minlen, 0, count - minlen);
-diff --git a/lib/vsprintf.c b/lib/vsprintf.c
-index fd0fde639ec9..a4c940a6aff2 100644
---- a/lib/vsprintf.c
-+++ b/lib/vsprintf.c
-@@ -821,7 +821,7 @@ static char *ptr_to_id(char *buf, char *end, const void *ptr,
- 	return pointer_string(buf, end, (const void *)hashval, spec);
- }
- 
--int kptr_restrict __read_mostly;
-+int kptr_restrict __read_mostly = 2;
- 
- static noinline_for_stack
- char *restricted_pointer(char *buf, char *end, const void *ptr,
-diff --git a/mm/Kconfig b/mm/Kconfig
-index 390165ffbb0f..3b24c9e3535e 100644
---- a/mm/Kconfig
-+++ b/mm/Kconfig
-@@ -321,7 +321,8 @@ config KSM
- config DEFAULT_MMAP_MIN_ADDR
- 	int "Low address space to protect from user allocation"
- 	depends on MMU
--	default 4096
-+	default 32768 if ARM || (ARM64 && COMPAT)
-+	default 65536
- 	help
- 	  This is the portion of low virtual memory which should be protected
- 	  from userspace allocation.  Keeping a user from writing to low pages
-diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
-index 864f129f1937..929d585bd267 100644
---- a/mm/Kconfig.debug
-+++ b/mm/Kconfig.debug
-@@ -126,6 +126,7 @@ config DEBUG_WX
- 	depends on ARCH_HAS_DEBUG_WX
- 	depends on MMU
- 	select PTDUMP_CORE
-+	default y
- 	help
- 	  Generate a warning if any W+X mappings are found at boot.
- 
-diff --git a/mm/mmap.c b/mm/mmap.c
-index 5c8b4485860d..0e26c225bb53 100644
---- a/mm/mmap.c
-+++ b/mm/mmap.c
-@@ -231,6 +231,13 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
- 
- 	newbrk = PAGE_ALIGN(brk);
- 	oldbrk = PAGE_ALIGN(mm->brk);
-+	/* properly handle unaligned min_brk as an empty heap */
-+	if (min_brk & ~PAGE_MASK) {
-+		if (brk == min_brk)
-+			newbrk -= PAGE_SIZE;
-+		if (mm->brk == min_brk)
-+			oldbrk -= PAGE_SIZE;
-+	}
- 	if (oldbrk == newbrk) {
- 		mm->brk = brk;
- 		goto success;
-diff --git a/mm/page_alloc.c b/mm/page_alloc.c
-index 81cc7fdc9c8f..254def6fa5b3 100644
---- a/mm/page_alloc.c
-+++ b/mm/page_alloc.c
-@@ -70,6 +70,7 @@
- #include <linux/psi.h>
- #include <linux/padata.h>
- #include <linux/khugepaged.h>
-+#include <linux/random.h>
- 
- #include <asm/sections.h>
- #include <asm/tlbflush.h>
-@@ -136,6 +137,15 @@ struct pcpu_drain {
- static DEFINE_MUTEX(pcpu_drain_mutex);
- static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
- 
-+bool __meminitdata extra_latent_entropy;
-+
-+static int __init setup_extra_latent_entropy(char *str)
-+{
-+	extra_latent_entropy = true;
-+	return 0;
-+}
-+early_param("extra_latent_entropy", setup_extra_latent_entropy);
-+
- #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
- volatile unsigned long latent_entropy __latent_entropy;
- EXPORT_SYMBOL(latent_entropy);
-@@ -1529,6 +1539,25 @@ static void __free_pages_ok(struct page *page, unsigned int order,
- 	local_irq_restore(flags);
- }
- 
-+static void __init __gather_extra_latent_entropy(struct page *page,
-+						 unsigned int nr_pages)
-+{
-+	if (extra_latent_entropy && !PageHighMem(page) && page_to_pfn(page) < 0x100000) {
-+		unsigned long hash = 0;
-+		size_t index, end = PAGE_SIZE * nr_pages / sizeof hash;
-+		const unsigned long *data = lowmem_page_address(page);
-+
-+		for (index = 0; index < end; index++)
-+			hash ^= hash + data[index];
-+#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
-+		latent_entropy ^= hash;
-+		add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy));
-+#else
-+		add_device_randomness((const void *)&hash, sizeof(hash));
-+#endif
-+	}
-+}
-+
- void __free_pages_core(struct page *page, unsigned int order)
- {
- 	unsigned int nr_pages = 1 << order;
-@@ -1548,7 +1577,6 @@ void __free_pages_core(struct page *page, unsigned int order)
- 	}
- 	__ClearPageReserved(p);
- 	set_page_count(p, 0);
--
- 	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
- 
- 	/*
-@@ -1607,6 +1635,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
- {
- 	if (early_page_uninitialised(pfn))
- 		return;
-+	__gather_extra_latent_entropy(page, 1 << order);
- 	__free_pages_core(page, order);
- }
- 
-@@ -1698,6 +1727,7 @@ static void __init deferred_free_range(unsigned long pfn,
- 	if (nr_pages == pageblock_nr_pages &&
- 	    (pfn & (pageblock_nr_pages - 1)) == 0) {
- 		set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-+		__gather_extra_latent_entropy(page, 1 << pageblock_order);
- 		__free_pages_core(page, pageblock_order);
- 		return;
- 	}
-@@ -1705,6 +1735,7 @@ static void __init deferred_free_range(unsigned long pfn,
- 	for (i = 0; i < nr_pages; i++, page++, pfn++) {
- 		if ((pfn & (pageblock_nr_pages - 1)) == 0)
- 			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-+		__gather_extra_latent_entropy(page, 1);
- 		__free_pages_core(page, 0);
- 	}
- }
-@@ -2284,6 +2315,12 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
- {
- 	post_alloc_hook(page, order, gfp_flags);
- 
-+	if (IS_ENABLED(CONFIG_PAGE_SANITIZE_VERIFY) && want_init_on_free()) {
-+		int i;
-+		for (i = 0; i < (1 << order); i++)
-+			verify_zero_highpage(page + i);
-+	}
-+
- 	if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags))
- 		kernel_init_free_pages(page, 1 << order);
- 
-diff --git a/mm/slab.h b/mm/slab.h
-index e258ffcfb0ef..6208d0d5ef15 100644
---- a/mm/slab.h
-+++ b/mm/slab.h
-@@ -433,9 +433,13 @@ static inline struct kmem_cache *virt_to_cache(const void *obj)
- 	struct page *page;
- 
- 	page = virt_to_head_page(obj);
-+#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
-+	BUG_ON(!PageSlab(page));
-+#else
- 	if (WARN_ONCE(!PageSlab(page), "%s: Object is not a Slab page!\n",
- 					__func__))
- 		return NULL;
-+#endif
- 	return page->slab_cache;
- }
- 
-@@ -465,10 +469,15 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
- 		return s;
- 
- 	cachep = virt_to_cache(x);
--	if (WARN(cachep && cachep != s,
--		  "%s: Wrong slab cache. %s but object is from %s\n",
--		  __func__, s->name, cachep->name))
-+	if (cachep && cachep != s) {
-+#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
-+		BUG();
-+#else
-+		WARN(1, "%s: Wrong slab cache. %s but object is from %s\n",
-+			__func__, s->name, cachep->name);
-+#endif
- 		print_tracking(cachep, x);
-+	}
- 	return cachep;
- }
- 
-@@ -493,7 +502,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
- 	 * back there or track user information then we can
- 	 * only use the space before that information.
- 	 */
--	if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER))
-+	if ((s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) || IS_ENABLED(CONFIG_SLAB_CANARY))
- 		return s->inuse;
- 	/*
- 	 * Else we can use all the padding etc for the allocation
-@@ -619,8 +628,10 @@ static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { }
- static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c)
- {
- 	if (static_branch_unlikely(&init_on_alloc)) {
-+#ifndef CONFIG_SLUB
- 		if (c->ctor)
- 			return false;
-+#endif
- 		if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))
- 			return flags & __GFP_ZERO;
- 		return true;
-@@ -630,9 +641,15 @@ static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c)
- 
- static inline bool slab_want_init_on_free(struct kmem_cache *c)
- {
--	if (static_branch_unlikely(&init_on_free))
--		return !(c->ctor ||
--			 (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)));
-+	if (static_branch_unlikely(&init_on_free)) {
-+#ifndef CONFIG_SLUB
-+		if (c->ctor)
-+			return false;
-+#endif
-+		if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))
-+			return false;
-+		return true;
-+	}
- 	return false;
- }
- 
-diff --git a/mm/slab_common.c b/mm/slab_common.c
-index ec832904f408..c24e2f33c9fb 100644
---- a/mm/slab_common.c
-+++ b/mm/slab_common.c
-@@ -30,10 +30,10 @@
- 
- #include "slab.h"
- 
--enum slab_state slab_state;
-+enum slab_state slab_state __ro_after_init;
- LIST_HEAD(slab_caches);
- DEFINE_MUTEX(slab_mutex);
--struct kmem_cache *kmem_cache;
-+struct kmem_cache *kmem_cache __ro_after_init;
- 
- #ifdef CONFIG_HARDENED_USERCOPY
- bool usercopy_fallback __ro_after_init =
-@@ -61,7 +61,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
- /*
-  * Merge control. If this is set then no merging of slab caches will occur.
-  */
--static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT);
-+static bool slab_nomerge __ro_after_init = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT);
- 
- static int __init setup_slab_nomerge(char *str)
- {
-diff --git a/mm/slub.c b/mm/slub.c
-index f5fc44208bdc..d1021b7544a6 100644
---- a/mm/slub.c
-+++ b/mm/slub.c
-@@ -128,6 +128,12 @@ static inline bool kmem_cache_debug(struct kmem_cache *s)
- 	return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
- }
- 
-+static inline bool has_sanitize_verify(struct kmem_cache *s)
-+{
-+	return IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) &&
-+	       slab_want_init_on_free(s);
-+}
-+
- void *fixup_red_left(struct kmem_cache *s, void *p)
- {
- 	if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
-@@ -433,6 +439,55 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
- 	return false;
- }
- 
-+#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SLAB_CANARY)
-+/*
-+ * See comment in calculate_sizes().
-+ */
-+static inline bool freeptr_outside_object(struct kmem_cache *s)
-+{
-+	return s->offset >= s->inuse;
-+}
-+
-+/*
-+ * Return offset of the end of info block which is inuse + free pointer if
-+ * not overlapping with object.
-+ */
-+static inline unsigned int get_info_end(struct kmem_cache *s)
-+{
-+	if (freeptr_outside_object(s))
-+		return s->inuse + sizeof(void *);
-+	else
-+		return s->inuse;
-+}
-+#endif
-+
-+#ifdef CONFIG_SLAB_CANARY
-+static inline unsigned long *get_canary(struct kmem_cache *s, void *object)
-+{
-+	return object + get_info_end(s);
-+}
-+
-+static inline unsigned long get_canary_value(const void *canary, unsigned long value)
-+{
-+	return (value ^ (unsigned long)canary) & CANARY_MASK;
-+}
-+
-+static inline void set_canary(struct kmem_cache *s, void *object, unsigned long value)
-+{
-+	unsigned long *canary = get_canary(s, object);
-+	*canary = get_canary_value(canary, value);
-+}
-+
-+static inline void check_canary(struct kmem_cache *s, void *object, unsigned long value)
-+{
-+	unsigned long *canary = get_canary(s, object);
-+	BUG_ON(*canary != get_canary_value(canary, value));
-+}
-+#else
-+#define set_canary(s, object, value)
-+#define check_canary(s, object, value)
-+#endif
-+
- #ifdef CONFIG_SLUB_DEBUG
- static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
- static DEFINE_SPINLOCK(object_map_lock);
-@@ -487,13 +542,13 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p)
-  * Debug settings:
-  */
- #if defined(CONFIG_SLUB_DEBUG_ON)
--static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
-+static slab_flags_t slub_debug __ro_after_init = DEBUG_DEFAULT_FLAGS;
- #else
--static slab_flags_t slub_debug;
-+static slab_flags_t slub_debug __ro_after_init;
- #endif
- 
--static char *slub_debug_string;
--static int disable_higher_order_debug;
-+static char *slub_debug_string __ro_after_init;
-+static int disable_higher_order_debug __ro_after_init;
- 
- /*
-  * slub is about to manipulate internal object metadata.  This memory lies
-@@ -544,26 +599,6 @@ static void print_section(char *level, char *text, u8 *addr,
- 	metadata_access_disable();
- }
- 
--/*
-- * See comment in calculate_sizes().
-- */
--static inline bool freeptr_outside_object(struct kmem_cache *s)
--{
--	return s->offset >= s->inuse;
--}
--
--/*
-- * Return offset of the end of info block which is inuse + free pointer if
-- * not overlapping with object.
-- */
--static inline unsigned int get_info_end(struct kmem_cache *s)
--{
--	if (freeptr_outside_object(s))
--		return s->inuse + sizeof(void *);
--	else
--		return s->inuse;
--}
--
- static struct track *get_track(struct kmem_cache *s, void *object,
- 	enum track_item alloc)
- {
-@@ -571,6 +606,9 @@ static struct track *get_track(struct kmem_cache *s, void *object,
- 
- 	p = object + get_info_end(s);
- 
-+	if (IS_ENABLED(CONFIG_SLAB_CANARY))
-+		p = (void *)p + sizeof(void *);
-+
- 	return p + alloc;
- }
- 
-@@ -712,6 +750,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
- 
- 	off = get_info_end(s);
- 
-+	if (IS_ENABLED(CONFIG_SLAB_CANARY))
-+		off += sizeof(void *);
-+
- 	if (s->flags & SLAB_STORE_USER)
- 		off += 2 * sizeof(struct track);
- 
-@@ -820,8 +861,9 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
-  * 	Meta data starts here.
-  *
-  * 	A. Free pointer (if we cannot overwrite object on free)
-- * 	B. Tracking data for SLAB_STORE_USER
-- * 	C. Padding to reach required alignment boundary or at mininum
-+ * 	B. Canary for SLAB_CANARY
-+ * 	C. Tracking data for SLAB_STORE_USER
-+ * 	D. Padding to reach required alignment boundary or at mininum
-  * 		one word if debugging is on to be able to detect writes
-  * 		before the word boundary.
-  *
-@@ -839,6 +881,9 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
- {
- 	unsigned long off = get_info_end(s);	/* The end of info */
- 
-+	if (IS_ENABLED(CONFIG_SLAB_CANARY))
-+		off += sizeof(void *);
-+
- 	if (s->flags & SLAB_STORE_USER)
- 		/* We also have user information there */
- 		off += 2 * sizeof(struct track);
-@@ -1559,6 +1604,8 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
- 		object = next;
- 		next = get_freepointer(s, object);
- 
-+		check_canary(s, object, s->random_active);
-+
- 		if (slab_want_init_on_free(s)) {
- 			/*
- 			 * Clear the object and the metadata, but don't touch
-@@ -1569,8 +1616,12 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
- 							   : 0;
- 			memset((char *)object + s->inuse, 0,
- 			       s->size - s->inuse - rsize);
--
-+			if (!IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && s->ctor)
-+				s->ctor(object);
- 		}
-+
-+		set_canary(s, object, s->random_inactive);
-+
- 		/* If object's reuse doesn't have to be delayed */
- 		if (!slab_free_hook(s, object)) {
- 			/* Move object to the new freelist */
-@@ -1578,6 +1629,18 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
- 			*head = object;
- 			if (!*tail)
- 				*tail = object;
-+		} else if (slab_want_init_on_free(s) && s->ctor) {
-+			/* Objects that are put into quarantine by KASAN will
-+			 * still undergo free_consistency_checks() and thus
-+			 * need to show a valid freepointer to check_object().
-+			 *
-+			 * Note that doing this for all caches (not just ctor
-+			 * ones, which have s->offset >= object_size)) causes a
-+			 * GPF, due to KASAN poisoning and the way
-+			 * set_freepointer() eventually dereferences the
-+			 * freepointer.
-+			 */
-+			set_freepointer(s, object, NULL);
- 		}
- 	} while (object != old_tail);
- 
-@@ -1591,8 +1654,9 @@ static void *setup_object(struct kmem_cache *s, struct page *page,
- 				void *object)
- {
- 	setup_object_debug(s, page, object);
-+	set_canary(s, object, s->random_inactive);
- 	object = kasan_init_slab_obj(s, object);
--	if (unlikely(s->ctor)) {
-+	if (unlikely(s->ctor) && !has_sanitize_verify(s)) {
- 		kasan_unpoison_object_data(s, object);
- 		s->ctor(object);
- 		kasan_poison_object_data(s, object);
-@@ -2883,8 +2947,28 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
- 
- 	maybe_wipe_obj_freeptr(s, object);
- 
--	if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
-+	if (has_sanitize_verify(s) && object) {
-+		/* KASAN hasn't unpoisoned the object yet (this is done in the
-+		 * post-alloc hook), so let's do it temporarily.
-+		 */
-+		kasan_unpoison_object_data(s, object);
-+		BUG_ON(memchr_inv(object, 0, s->object_size));
-+		if (s->ctor)
-+			s->ctor(object);
-+		kasan_poison_object_data(s, object);
-+	} else if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) {
- 		memset(object, 0, s->object_size);
-+		if (s->ctor) {
-+			kasan_unpoison_object_data(s, object);
-+			s->ctor(object);
-+			kasan_poison_object_data(s, object);
-+		}
-+	}
-+
-+	if (object) {
-+		check_canary(s, object, s->random_inactive);
-+		set_canary(s, object, s->random_active);
-+	}
- 
- 	slab_post_alloc_hook(s, objcg, gfpflags, 1, &object);
- 
-@@ -3273,7 +3357,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
- 			  void **p)
- {
- 	struct kmem_cache_cpu *c;
--	int i;
-+	int i, k;
- 	struct obj_cgroup *objcg = NULL;
- 
- 	/* memcg and kmem_cache debug support */
-@@ -3323,11 +3407,35 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
- 	local_irq_enable();
- 
- 	/* Clear memory outside IRQ disabled fastpath loop */
--	if (unlikely(slab_want_init_on_alloc(flags, s))) {
-+	if (has_sanitize_verify(s)) {
-+		int j;
-+
-+		for (j = 0; j < i; j++) {
-+			/* KASAN hasn't unpoisoned the object yet (this is done
-+			 * in the post-alloc hook), so let's do it temporarily.
-+			 */
-+			kasan_unpoison_object_data(s, p[j]);
-+			BUG_ON(memchr_inv(p[j], 0, s->object_size));
-+			if (s->ctor)
-+				s->ctor(p[j]);
-+			kasan_poison_object_data(s, p[j]);
-+		}
-+	} else if (unlikely(slab_want_init_on_alloc(flags, s))) {
- 		int j;
- 
--		for (j = 0; j < i; j++)
-+		for (j = 0; j < i; j++) {
- 			memset(p[j], 0, s->object_size);
-+			if (s->ctor) {
-+				kasan_unpoison_object_data(s, p[j]);
-+				s->ctor(p[j]);
-+				kasan_poison_object_data(s, p[j]);
-+			}
-+		}
-+	}
-+
-+	for (k = 0; k < i; k++) {
-+		check_canary(s, p[k], s->random_inactive);
-+		set_canary(s, p[k], s->random_active);
- 	}
- 
- 	/* memcg and kmem_cache debug support */
-@@ -3361,9 +3469,9 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk);
-  * and increases the number of allocations possible without having to
-  * take the list_lock.
-  */
--static unsigned int slub_min_order;
--static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
--static unsigned int slub_min_objects;
-+static unsigned int slub_min_order __ro_after_init;
-+static unsigned int slub_max_order __ro_after_init = PAGE_ALLOC_COSTLY_ORDER;
-+static unsigned int slub_min_objects __ro_after_init;
- 
- /*
-  * Calculate the order of allocation given an slab object size.
-@@ -3531,6 +3639,7 @@ static void early_kmem_cache_node_alloc(int node)
- 	init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
- 	init_tracking(kmem_cache_node, n);
- #endif
-+	set_canary(kmem_cache_node, n, kmem_cache_node->random_active);
- 	n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
- 		      GFP_KERNEL);
- 	page->freelist = get_freepointer(kmem_cache_node, n);
-@@ -3705,6 +3814,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
- 		s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
- 	}
- 
-+	if (IS_ENABLED(CONFIG_SLAB_CANARY))
-+		size += sizeof(void *);
-+
- #ifdef CONFIG_SLUB_DEBUG
- 	if (flags & SLAB_STORE_USER)
- 		/*
-@@ -3778,6 +3890,10 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
- #ifdef CONFIG_SLAB_FREELIST_HARDENED
- 	s->random = get_random_long();
- #endif
-+#ifdef CONFIG_SLAB_CANARY
-+	s->random_active = get_random_long();
-+	s->random_inactive = get_random_long();
-+#endif
- 
- 	if (!calculate_sizes(s, -1))
- 		goto error;
-@@ -4051,6 +4167,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
- 		offset -= s->red_left_pad;
- 	}
- 
-+	check_canary(s, (void *)ptr - offset, s->random_active);
-+
- 	/* Allow address range falling entirely within usercopy region. */
- 	if (offset >= s->useroffset &&
- 	    offset - s->useroffset <= s->usersize &&
-@@ -4084,7 +4202,11 @@ size_t __ksize(const void *object)
- 	page = virt_to_head_page(object);
- 
- 	if (unlikely(!PageSlab(page))) {
-+#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
-+		BUG_ON(!PageCompound(page));
-+#else
- 		WARN_ON(!PageCompound(page));
-+#endif
- 		return page_size(page);
- 	}
- 
-@@ -4875,7 +4997,7 @@ enum slab_stat_type {
- #define SO_TOTAL	(1 << SL_TOTAL)
- 
- #ifdef CONFIG_MEMCG
--static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON);
-+static bool memcg_sysfs_enabled __ro_after_init = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON);
- 
- static int __init setup_slub_memcg_sysfs(char *str)
- {
-diff --git a/mm/swap.c b/mm/swap.c
-index 47a47681c86b..762095d95092 100644
---- a/mm/swap.c
-+++ b/mm/swap.c
-@@ -102,6 +102,8 @@ static void __put_single_page(struct page *page)
- 
- static void __put_compound_page(struct page *page)
- {
-+	compound_page_dtor *dtor;
-+
- 	/*
- 	 * __page_cache_release() is supposed to be called for thp, not for
- 	 * hugetlb. This is because hugetlb page does never have PageLRU set
-@@ -110,7 +112,15 @@ static void __put_compound_page(struct page *page)
- 	 */
- 	if (!PageHuge(page))
- 		__page_cache_release(page);
--	destroy_compound_page(page);
-+	dtor = get_compound_page_dtor(page);
-+	if (!PageHuge(page))
-+		BUG_ON(dtor != free_compound_page
-+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-+			&& dtor != free_transhuge_page
-+#endif
-+		);
-+
-+	(*dtor)(page);
- }
- 
- void __put_page(struct page *page)
-diff --git a/mm/util.c b/mm/util.c
-index 4ddb6e186dd5..62ed34dfceb7 100644
---- a/mm/util.c
-+++ b/mm/util.c
-@@ -336,9 +336,9 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
- {
- 	/* Is the current task 32bit ? */
- 	if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
--		return randomize_page(mm->brk, SZ_32M);
-+		return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE;
- 
--	return randomize_page(mm->brk, SZ_1G);
-+	return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE;
- }
- 
- unsigned long arch_mmap_rnd(void)
-diff --git a/net/core/dev.c b/net/core/dev.c
-index 0c9ce36afc8c..c77d2c765b03 100644
---- a/net/core/dev.c
-+++ b/net/core/dev.c
-@@ -4870,7 +4870,7 @@ int netif_rx_any_context(struct sk_buff *skb)
- }
- EXPORT_SYMBOL(netif_rx_any_context);
- 
--static __latent_entropy void net_tx_action(struct softirq_action *h)
-+static __latent_entropy void net_tx_action(void)
- {
- 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
- 
-@@ -6838,7 +6838,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
- 	return work;
- }
- 
--static __latent_entropy void net_rx_action(struct softirq_action *h)
-+static __latent_entropy void net_rx_action(void)
- {
- 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
- 	unsigned long time_limit = jiffies +
-diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
-index 3da1f77bd039..dbca1f1e2449 100644
---- a/net/dccp/ccids/ccid2.c
-+++ b/net/dccp/ccids/ccid2.c
-@@ -126,21 +126,26 @@ static void dccp_tasklet_schedule(struct sock *sk)
- 
- static void ccid2_hc_tx_rto_expire(struct timer_list *t)
- {
--	struct ccid2_hc_tx_sock *hc = from_timer(hc, t, tx_rtotimer);
--	struct sock *sk = hc->sk;
--	const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
-+	struct dccp_sock *dp = from_timer(dp, t, dccps_ccid_timer);
-+	struct sock *sk = (struct sock *)dp;
-+	struct ccid2_hc_tx_sock *hc;
-+	bool sender_was_blocked;
- 
- 	bh_lock_sock(sk);
-+
-+	if (inet_sk_state_load(sk) == DCCP_CLOSED)
-+		goto out;
-+
-+	hc = ccid_priv(dp->dccps_hc_tx_ccid);
-+	sender_was_blocked = ccid2_cwnd_network_limited(hc);
-+
- 	if (sock_owned_by_user(sk)) {
--		sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + HZ / 5);
-+		sk_reset_timer(sk, &dp->dccps_ccid_timer, jiffies + HZ / 5);
- 		goto out;
- 	}
- 
- 	ccid2_pr_debug("RTO_EXPIRE\n");
- 
--	if (sk->sk_state == DCCP_CLOSED)
--		goto out;
--
- 	/* back-off timer */
- 	hc->tx_rto <<= 1;
- 	if (hc->tx_rto > DCCP_RTO_MAX)
-@@ -166,7 +171,7 @@ static void ccid2_hc_tx_rto_expire(struct timer_list *t)
- 	if (sender_was_blocked)
- 		dccp_tasklet_schedule(sk);
- 	/* restart backed-off timer */
--	sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
-+	sk_reset_timer(sk, &dp->dccps_ccid_timer, jiffies + hc->tx_rto);
- out:
- 	bh_unlock_sock(sk);
- 	sock_put(sk);
-@@ -330,7 +335,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
- 	}
- #endif
- 
--	sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
-+	sk_reset_timer(sk, &dp->dccps_ccid_timer, jiffies + hc->tx_rto);
- 
- #ifdef CONFIG_IP_DCCP_CCID2_DEBUG
- 	do {
-@@ -700,9 +705,9 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
- 
- 	/* restart RTO timer if not all outstanding data has been acked */
- 	if (hc->tx_pipe == 0)
--		sk_stop_timer(sk, &hc->tx_rtotimer);
-+		sk_stop_timer(sk, &dp->dccps_ccid_timer);
- 	else
--		sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
-+		sk_reset_timer(sk, &dp->dccps_ccid_timer, jiffies + hc->tx_rto);
- done:
- 	/* check if incoming Acks allow pending packets to be sent */
- 	if (sender_was_blocked && !ccid2_cwnd_network_limited(hc))
-@@ -737,17 +742,18 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
- 	hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_jiffies32;
- 	hc->tx_cwnd_used = 0;
- 	hc->sk		 = sk;
--	timer_setup(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, 0);
-+	timer_setup(&dp->dccps_ccid_timer, ccid2_hc_tx_rto_expire, 0);
- 	INIT_LIST_HEAD(&hc->tx_av_chunks);
- 	return 0;
- }
- 
- static void ccid2_hc_tx_exit(struct sock *sk)
- {
-+	struct dccp_sock *dp = dccp_sk(sk);
- 	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- 	int i;
- 
--	sk_stop_timer(sk, &hc->tx_rtotimer);
-+	sk_stop_timer(sk, &dp->dccps_ccid_timer);
- 
- 	for (i = 0; i < hc->tx_seqbufc; i++)
- 		kfree(hc->tx_seqbuf[i]);
-diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
-index b9ee1a4a8955..685f4d046c0d 100644
---- a/net/dccp/ccids/ccid3.c
-+++ b/net/dccp/ccids/ccid3.c
-@@ -184,17 +184,24 @@ static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hc,
- 
- static void ccid3_hc_tx_no_feedback_timer(struct timer_list *t)
- {
--	struct ccid3_hc_tx_sock *hc = from_timer(hc, t, tx_no_feedback_timer);
--	struct sock *sk = hc->sk;
-+	struct dccp_sock *dp = from_timer(dp, t, dccps_ccid_timer);
-+	struct ccid3_hc_tx_sock *hc;
-+	struct sock *sk = (struct sock *)dp;
- 	unsigned long t_nfb = USEC_PER_SEC / 5;
- 
- 	bh_lock_sock(sk);
-+
-+	if (inet_sk_state_load(sk) == DCCP_CLOSED)
-+		goto out;
-+
- 	if (sock_owned_by_user(sk)) {
- 		/* Try again later. */
- 		/* XXX: set some sensible MIB */
- 		goto restart_timer;
- 	}
- 
-+	hc = ccid_priv(dp->dccps_hc_tx_ccid);
-+
- 	ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk,
- 		       ccid3_tx_state_name(hc->tx_state));
- 
-@@ -250,8 +257,8 @@ static void ccid3_hc_tx_no_feedback_timer(struct timer_list *t)
- 		t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi);
- 
- restart_timer:
--	sk_reset_timer(sk, &hc->tx_no_feedback_timer,
--			   jiffies + usecs_to_jiffies(t_nfb));
-+	sk_reset_timer(sk, &dp->dccps_ccid_timer,
-+		       jiffies + usecs_to_jiffies(t_nfb));
- out:
- 	bh_unlock_sock(sk);
- 	sock_put(sk);
-@@ -280,7 +287,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
- 		return -EBADMSG;
- 
- 	if (hc->tx_state == TFRC_SSTATE_NO_SENT) {
--		sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies +
-+		sk_reset_timer(sk, &dp->dccps_ccid_timer, (jiffies +
- 			       usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
- 		hc->tx_last_win_count	= 0;
- 		hc->tx_t_last_win_count = now;
-@@ -354,6 +361,7 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len)
- static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
- {
- 	struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
-+	struct dccp_sock *dp = dccp_sk(sk);
- 	struct tfrc_tx_hist_entry *acked;
- 	ktime_t now;
- 	unsigned long t_nfb;
-@@ -420,7 +428,7 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
- 			       (unsigned int)(hc->tx_x >> 6));
- 
- 	/* unschedule no feedback timer */
--	sk_stop_timer(sk, &hc->tx_no_feedback_timer);
-+	sk_stop_timer(sk, &dp->dccps_ccid_timer);
- 
- 	/*
- 	 * As we have calculated new ipi, delta, t_nom it is possible
-@@ -445,8 +453,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
- 		       "expire in %lu jiffies (%luus)\n",
- 		       dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb);
- 
--	sk_reset_timer(sk, &hc->tx_no_feedback_timer,
--			   jiffies + usecs_to_jiffies(t_nfb));
-+	sk_reset_timer(sk, &dp->dccps_ccid_timer,
-+		       jiffies + usecs_to_jiffies(t_nfb));
- }
- 
- static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
-@@ -488,21 +496,23 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
- 
- static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
- {
-+	struct dccp_sock *dp = dccp_sk(sk);
- 	struct ccid3_hc_tx_sock *hc = ccid_priv(ccid);
- 
- 	hc->tx_state = TFRC_SSTATE_NO_SENT;
- 	hc->tx_hist  = NULL;
- 	hc->sk	     = sk;
--	timer_setup(&hc->tx_no_feedback_timer,
-+	timer_setup(&dp->dccps_ccid_timer,
- 		    ccid3_hc_tx_no_feedback_timer, 0);
- 	return 0;
- }
- 
- static void ccid3_hc_tx_exit(struct sock *sk)
- {
-+	struct dccp_sock *dp = dccp_sk(sk);
- 	struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- 
--	sk_stop_timer(sk, &hc->tx_no_feedback_timer);
-+	sk_stop_timer(sk, &dp->dccps_ccid_timer);
- 	tfrc_tx_hist_purge(&hc->tx_hist);
- }
- 
-diff --git a/net/dccp/proto.c b/net/dccp/proto.c
-index 6d705d90c614..359e848dba6c 100644
---- a/net/dccp/proto.c
-+++ b/net/dccp/proto.c
-@@ -279,7 +279,9 @@ int dccp_disconnect(struct sock *sk, int flags)
- 
- 	dccp_clear_xmit_timers(sk);
- 	ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
-+	ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
- 	dp->dccps_hc_rx_ccid = NULL;
-+	dp->dccps_hc_tx_ccid = NULL;
- 
- 	__skb_queue_purge(&sk->sk_receive_queue);
- 	__skb_queue_purge(&sk->sk_write_queue);
-diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
-index 87983e70f03f..d1584b4b39f9 100644
---- a/net/ipv4/Kconfig
-+++ b/net/ipv4/Kconfig
-@@ -267,6 +267,7 @@ config IP_PIMSM_V2
- 
- config SYN_COOKIES
- 	bool "IP: TCP syncookie support"
-+	default y
- 	help
- 	  Normal TCP/IP networking is open to an attack known as "SYN
- 	  flooding". This denial-of-service attack prevents legitimate remote
-@@ -742,3 +743,26 @@ config TCP_MD5SIG
- 	  on the Internet.
- 
- 	  If unsure, say N.
-+
-+config TCP_SIMULT_CONNECT_DEFAULT_ON
-+	bool "Enable TCP simultaneous connect"
-+	help
-+	  Enable TCP simultaneous connect that adds a weakness in Linux's strict
-+	  implementation of TCP that allows two clients to connect to each other
-+	  without either entering a listening state. The weakness allows an
-+	  attacker to easily prevent a client from connecting to a known server
-+	  provided the source port for the connection is guessed correctly.
-+
-+	  As the weakness could be used to prevent an antivirus or IPS from
-+	  fetching updates, or prevent an SSL gateway from fetching a CRL, it
-+	  should be eliminated by disabling this option. Though Linux is one of
-+	  few operating systems supporting simultaneous connect, it has no
-+	  legitimate use in practice and is rarely supported by firewalls.
-+
-+	  Disabling this may break TCP STUNT which is used by some applications
-+	  for NAT traversal.
-+
-+	  This setting can be overridden at runtime via the
-+	  net.ipv4.tcp_simult_connect sysctl.
-+
-+	  If unsure, say N.
-diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
-index 08829809e88b..d06be35bacbe 100644
---- a/net/ipv4/sysctl_net_ipv4.c
-+++ b/net/ipv4/sysctl_net_ipv4.c
-@@ -588,6 +588,15 @@ static struct ctl_table ipv4_table[] = {
- 		.mode		= 0644,
- 		.proc_handler	= proc_do_static_key,
- 	},
-+	{
-+		.procname	= "tcp_simult_connect",
-+		.data		= &sysctl_tcp_simult_connect,
-+		.maxlen		= sizeof(int),
-+		.mode		= 0644,
-+		.proc_handler	= proc_dointvec_minmax,
-+		.extra1		= SYSCTL_ZERO,
-+		.extra2		= SYSCTL_ONE,
-+	},
- 	{ }
- };
- 
-diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
-index fac5c1469cee..7c3ffb3f4002 100644
---- a/net/ipv4/tcp_input.c
-+++ b/net/ipv4/tcp_input.c
-@@ -82,6 +82,7 @@
- #include <net/mptcp.h>
- 
- int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
-+int sysctl_tcp_simult_connect __read_mostly = IS_ENABLED(CONFIG_TCP_SIMULT_CONNECT_DEFAULT_ON);
- 
- #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
- #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
-@@ -6197,7 +6198,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
- 	    tcp_paws_reject(&tp->rx_opt, 0))
- 		goto discard_and_undo;
- 
--	if (th->syn) {
-+	if (th->syn && sysctl_tcp_simult_connect) {
- 		/* We see SYN without ACK. It is attempt of
- 		 * simultaneous connect with crossed SYNs.
- 		 * Particularly, it can be connect to self.
-diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost
-index 12a87be0fb44..f4c69e330a53 100644
---- a/scripts/Makefile.modpost
-+++ b/scripts/Makefile.modpost
-@@ -47,6 +47,7 @@ MODPOST = scripts/mod/modpost								\
- 	$(if $(CONFIG_MODVERSIONS),-m)							\
- 	$(if $(CONFIG_MODULE_SRCVERSION_ALL),-a)					\
- 	$(if $(CONFIG_SECTION_MISMATCH_WARN_ONLY),,-E)					\
-+	$(if $(CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE),-f)			\
- 	$(if $(KBUILD_MODPOST_WARN),-w) \
- 	-o $@
- 
-diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig
-index ae19fb0243b9..ad78375ece5e 100644
---- a/scripts/gcc-plugins/Kconfig
-+++ b/scripts/gcc-plugins/Kconfig
-@@ -53,6 +53,11 @@ config GCC_PLUGIN_LATENT_ENTROPY
- 	  is some slowdown of the boot process (about 0.5%) and fork and
- 	  irq processing.
- 
-+	  When extra_latent_entropy is passed on the kernel command line,
-+	  entropy will be extracted from up to the first 4GB of RAM while the
-+	  runtime memory allocator is being initialized.  This costs even more
-+	  slowdown of the boot process.
-+
- 	  Note that entropy extracted this way is not cryptographically
- 	  secure!
- 
-diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
-index e08f75aed429..649595efc541 100644
---- a/scripts/mod/modpost.c
-+++ b/scripts/mod/modpost.c
-@@ -35,6 +35,8 @@ static int warn_unresolved = 0;
- /* How a symbol is exported */
- static int sec_mismatch_count = 0;
- static int sec_mismatch_fatal = 0;
-+static int writable_fptr_count = 0;
-+static int writable_fptr_verbose = 0;
- /* ignore missing files */
- static int ignore_missing_files;
- /* If set to 1, only warn (instead of error) about missing ns imports */
-@@ -1007,6 +1009,7 @@ enum mismatch {
- 	ANY_EXIT_TO_ANY_INIT,
- 	EXPORT_TO_INIT_EXIT,
- 	EXTABLE_TO_NON_TEXT,
-+	DATA_TO_TEXT
- };
- 
- /**
-@@ -1133,6 +1136,12 @@ static const struct sectioncheck sectioncheck[] = {
- 	.good_tosec = {ALL_TEXT_SECTIONS , NULL},
- 	.mismatch = EXTABLE_TO_NON_TEXT,
- 	.handler = extable_mismatch_handler,
-+},
-+/* Do not reference code from writable data */
-+{
-+	.fromsec = { DATA_SECTIONS, NULL },
-+	.bad_tosec = { ALL_TEXT_SECTIONS, NULL },
-+	.mismatch = DATA_TO_TEXT
- }
- };
- 
-@@ -1320,10 +1329,10 @@ static Elf_Sym *find_elf_symbol(struct elf_info *elf, Elf64_Sword addr,
- 			continue;
- 		if (!is_valid_name(elf, sym))
- 			continue;
--		if (sym->st_value == addr)
--			return sym;
- 		/* Find a symbol nearby - addr are maybe negative */
- 		d = sym->st_value - addr;
-+		if (d == 0)
-+			return sym;
- 		if (d < 0)
- 			d = addr - sym->st_value;
- 		if (d < distance) {
-@@ -1458,7 +1467,13 @@ static void report_sec_mismatch(const char *modname,
- 	char *prl_from;
- 	char *prl_to;
- 
--	sec_mismatch_count++;
-+	if (mismatch->mismatch == DATA_TO_TEXT) {
-+		writable_fptr_count++;
-+		if (!writable_fptr_verbose)
-+			return;
-+	} else {
-+		sec_mismatch_count++;
-+	}
- 
- 	get_pretty_name(from_is_func, &from, &from_p);
- 	get_pretty_name(to_is_func, &to, &to_p);
-@@ -1580,6 +1595,12 @@ static void report_sec_mismatch(const char *modname,
- 		fatal("There's a special handler for this mismatch type, "
- 		      "we should never get here.");
- 		break;
-+	case DATA_TO_TEXT:
-+		fprintf(stderr,
-+		"The %s %s:%s references\n"
-+		"the %s %s:%s%s\n",
-+		from, fromsec, fromsym, to, tosec, tosym, to_p);
-+		break;
- 	}
- 	fprintf(stderr, "\n");
- }
-@@ -2546,7 +2567,7 @@ int main(int argc, char **argv)
- 	struct dump_list *dump_read_start = NULL;
- 	struct dump_list **dump_read_iter = &dump_read_start;
- 
--	while ((opt = getopt(argc, argv, "ei:mnT:o:awENd:")) != -1) {
-+	while ((opt = getopt(argc, argv, "ei:fmnT:o:awENd:")) != -1) {
- 		switch (opt) {
- 		case 'e':
- 			external_module = 1;
-@@ -2557,6 +2578,9 @@ int main(int argc, char **argv)
- 			(*dump_read_iter)->file = optarg;
- 			dump_read_iter = &(*dump_read_iter)->next;
- 			break;
-+		case 'f':
-+			writable_fptr_verbose = 1;
-+			break;
- 		case 'm':
- 			modversions = 1;
- 			break;
-@@ -2657,6 +2681,11 @@ int main(int argc, char **argv)
- 	}
- 
- 	free(buf.p);
-+	if (writable_fptr_count && !writable_fptr_verbose)
-+		warn("modpost: Found %d writable function pointer%s.\n"
-+		     "To see full details build your kernel with:\n"
-+		     "'make CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE=y'\n",
-+		     writable_fptr_count, (writable_fptr_count == 1 ? "" : "s"));
- 
- 	return err;
- }
-diff --git a/security/Kconfig b/security/Kconfig
-index 7561f6f99f1d..615205c0113b 100644
---- a/security/Kconfig
-+++ b/security/Kconfig
-@@ -9,7 +9,7 @@ source "security/keys/Kconfig"
- 
- config SECURITY_DMESG_RESTRICT
- 	bool "Restrict unprivileged access to the kernel syslog"
--	default n
-+	default y
- 	help
- 	  This enforces restrictions on unprivileged users reading the kernel
- 	  syslog via dmesg(8).
-@@ -19,10 +19,34 @@ config SECURITY_DMESG_RESTRICT
- 
- 	  If you are unsure how to answer this question, answer N.
- 
-+config SECURITY_PERF_EVENTS_RESTRICT
-+	bool "Restrict unprivileged use of performance events"
-+	depends on PERF_EVENTS
-+	default y
-+	help
-+	  If you say Y here, the kernel.perf_event_paranoid sysctl
-+	  will be set to 3 by default, and no unprivileged use of the
-+	  perf_event_open syscall will be permitted unless it is
-+	  changed.
-+
-+config SECURITY_TIOCSTI_RESTRICT
-+	bool "Restrict unprivileged use of tiocsti command injection"
-+	default y
-+	help
-+	  This enforces restrictions on unprivileged users injecting commands
-+	  into other processes which share a tty session using the TIOCSTI
-+	  ioctl. This option makes TIOCSTI use require CAP_SYS_ADMIN.
-+
-+	  If this option is not selected, no restrictions will be enforced
-+	  unless the tiocsti_restrict sysctl is explicitly set to (1).
-+
-+	  If you are unsure how to answer this question, answer N.
-+
- config SECURITY
- 	bool "Enable different security models"
- 	depends on SYSFS
- 	depends on MULTIUSER
-+	default y
- 	help
- 	  This allows you to choose different security modules to be
- 	  configured into your kernel.
-@@ -48,6 +72,7 @@ config SECURITYFS
- config SECURITY_NETWORK
- 	bool "Socket and Networking Security Hooks"
- 	depends on SECURITY
-+	default y
- 	help
- 	  This enables the socket and networking security hooks.
- 	  If enabled, a security module can use these hooks to
-@@ -154,6 +179,7 @@ config HARDENED_USERCOPY
- 	bool "Harden memory copies between kernel and userspace"
- 	depends on HAVE_HARDENED_USERCOPY_ALLOCATOR
- 	imply STRICT_DEVMEM
-+	default y
- 	help
- 	  This option checks for obviously wrong memory regions when
- 	  copying memory to/from the kernel (via copy_to_user() and
-@@ -166,7 +192,6 @@ config HARDENED_USERCOPY
- config HARDENED_USERCOPY_FALLBACK
- 	bool "Allow usercopy whitelist violations to fallback to object size"
- 	depends on HARDENED_USERCOPY
--	default y
- 	help
- 	  This is a temporary option that allows missing usercopy whitelists
- 	  to be discovered via a WARN() to the kernel log, instead of
-@@ -191,10 +216,21 @@ config HARDENED_USERCOPY_PAGESPAN
- config FORTIFY_SOURCE
- 	bool "Harden common str/mem functions against buffer overflows"
- 	depends on ARCH_HAS_FORTIFY_SOURCE
-+	default y
- 	help
- 	  Detect overflows of buffers in common string and memory functions
- 	  where the compiler can determine and validate the buffer sizes.
- 
-+config FORTIFY_SOURCE_STRICT_STRING
-+	bool "Harden common functions against buffer overflows"
-+	depends on FORTIFY_SOURCE
-+	depends on EXPERT
-+	help
-+	  Perform stricter overflow checks catching overflows within objects
-+	  for common C string functions rather than only between objects.
-+
-+	  This is not yet intended for production use, only bug finding.
-+
- config STATIC_USERMODEHELPER
- 	bool "Force all usermode helper calls through a single binary"
- 	help
-diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening
-index 269967c4fc1b..7dede18f1074 100644
---- a/security/Kconfig.hardening
-+++ b/security/Kconfig.hardening
-@@ -190,6 +190,7 @@ config STACKLEAK_RUNTIME_DISABLE
- 
- config INIT_ON_ALLOC_DEFAULT_ON
- 	bool "Enable heap memory zeroing on allocation by default"
-+	default yes
- 	help
- 	  This has the effect of setting "init_on_alloc=1" on the kernel
- 	  command line. This can be disabled with "init_on_alloc=0".
-@@ -202,6 +203,7 @@ config INIT_ON_ALLOC_DEFAULT_ON
- 
- config INIT_ON_FREE_DEFAULT_ON
- 	bool "Enable heap memory zeroing on free by default"
-+	default yes
- 	help
- 	  This has the effect of setting "init_on_free=1" on the kernel
- 	  command line. This can be disabled with "init_on_free=0".
-@@ -217,6 +219,21 @@ config INIT_ON_FREE_DEFAULT_ON
- 	  touching "cold" memory areas. Most cases see 3-5% impact. Some
- 	  synthetic workloads have measured as high as 8%.
- 
-+config PAGE_SANITIZE_VERIFY
-+	bool "Verify sanitized pages"
-+	default y
-+	help
-+	  When init_on_free is enabled, verify that newly allocated pages
-+	  are zeroed to detect write-after-free bugs.
-+
-+config SLAB_SANITIZE_VERIFY
-+	bool "Verify sanitized SLAB allocations"
-+	default y
-+	depends on !KASAN
-+	help
-+	  When init_on_free is enabled, verify that newly allocated slab
-+	  objects are zeroed to detect write-after-free bugs.
-+
- endmenu
- 
- endmenu
-diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig
-index 9e921fc72538..ae851a826c26 100644
---- a/security/selinux/Kconfig
-+++ b/security/selinux/Kconfig
-@@ -3,7 +3,7 @@ config SECURITY_SELINUX
- 	bool "NSA SELinux Support"
- 	depends on SECURITY_NETWORK && AUDIT && NET && INET
- 	select NETWORK_SECMARK
--	default n
-+	default y
- 	help
- 	  This selects NSA Security-Enhanced Linux (SELinux).
- 	  You will also need a policy configuration and a labeled filesystem.
-@@ -70,29 +70,6 @@ config SECURITY_SELINUX_AVC_STATS
- 	  /sys/fs/selinux/avc/cache_stats, which may be monitored via
- 	  tools such as avcstat.
- 
--config SECURITY_SELINUX_CHECKREQPROT_VALUE
--	int "NSA SELinux checkreqprot default value"
--	depends on SECURITY_SELINUX
--	range 0 1
--	default 0
--	help
--	  This option sets the default value for the 'checkreqprot' flag
--	  that determines whether SELinux checks the protection requested
--	  by the application or the protection that will be applied by the
--	  kernel (including any implied execute for read-implies-exec) for
--	  mmap and mprotect calls.  If this option is set to 0 (zero),
--	  SELinux will default to checking the protection that will be applied
--	  by the kernel.  If this option is set to 1 (one), SELinux will
--	  default to checking the protection requested by the application.
--	  The checkreqprot flag may be changed from the default via the
--	  'checkreqprot=' boot parameter.  It may also be changed at runtime
--	  via /sys/fs/selinux/checkreqprot if authorized by policy.
--
--	  WARNING: this option is deprecated and will be removed in a future
--	  kernel release.
--
--	  If you are unsure how to answer this question, answer 0.
--
- config SECURITY_SELINUX_SIDTAB_HASH_BITS
- 	int "NSA SELinux sidtab hashtable size"
- 	depends on SECURITY_SELINUX
-diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
-index 227eb8967963..a8fe132825cd 100644
---- a/security/selinux/hooks.c
-+++ b/security/selinux/hooks.c
-@@ -136,21 +136,7 @@ static int __init selinux_enabled_setup(char *str)
- __setup("selinux=", selinux_enabled_setup);
- #endif
- 
--static unsigned int selinux_checkreqprot_boot =
--	CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE;
--
--static int __init checkreqprot_setup(char *str)
--{
--	unsigned long checkreqprot;
--
--	if (!kstrtoul(str, 0, &checkreqprot)) {
--		selinux_checkreqprot_boot = checkreqprot ? 1 : 0;
--		if (checkreqprot)
--			pr_warn("SELinux: checkreqprot set to 1 via kernel parameter.  This is deprecated and will be rejected in a future kernel release.\n");
--	}
--	return 1;
--}
--__setup("checkreqprot=", checkreqprot_setup);
-+static const unsigned int selinux_checkreqprot_boot;
- 
- /**
-  * selinux_secmark_enabled - Check to see if SECMARK is currently enabled
-diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
-index 2b745ae8cb98..de739d432da6 100644
---- a/security/selinux/selinuxfs.c
-+++ b/security/selinux/selinuxfs.c
-@@ -724,7 +724,6 @@ static ssize_t sel_read_checkreqprot(struct file *filp, char __user *buf,
- static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf,
- 				      size_t count, loff_t *ppos)
- {
--	struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info;
- 	char *page;
- 	ssize_t length;
- 	unsigned int new_value;
-@@ -748,18 +747,9 @@ static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf,
- 		return PTR_ERR(page);
- 
- 	length = -EINVAL;
--	if (sscanf(page, "%u", &new_value) != 1)
-+	if (sscanf(page, "%u", &new_value) != 1 || new_value)
- 		goto out;
- 
--	if (new_value) {
--		char comm[sizeof(current->comm)];
--
--		memcpy(comm, current->comm, sizeof(comm));
--		pr_warn_once("SELinux: %s (%d) set checkreqprot to 1. This is deprecated and will be rejected in a future kernel release.\n",
--			     comm, current->pid);
--	}
--
--	checkreqprot_set(fsi->state, (new_value ? 1 : 0));
- 	length = count;
- out:
- 	kfree(page);
-diff --git a/security/yama/Kconfig b/security/yama/Kconfig
-index a810304123ca..b809050b25d2 100644
---- a/security/yama/Kconfig
-+++ b/security/yama/Kconfig
-@@ -2,7 +2,7 @@
- config SECURITY_YAMA
- 	bool "Yama support"
- 	depends on SECURITY
--	default n
-+	default y
- 	help
- 	  This selects Yama, which extends DAC support with additional
- 	  system-wide security settings beyond regular Linux discretionary
-diff --git a/tools/perf/Documentation/security.txt b/tools/perf/Documentation/security.txt
-index 4fe3b8b1958f..a7d88cc23a70 100644
---- a/tools/perf/Documentation/security.txt
-+++ b/tools/perf/Documentation/security.txt
-@@ -148,6 +148,7 @@ Perf tool provides a message similar to the one below:
-    >= 0: Disallow raw and ftrace function tracepoint access
-    >= 1: Disallow CPU event access
-    >= 2: Disallow kernel profiling
-+   >= 3: Disallow use of any event
-    To make the adjusted perf_event_paranoid setting permanent preserve it
-    in /etc/sysctl.conf (e.g. kernel.perf_event_paranoid = <setting>)
- 
diff --git a/sys-kernel/linux-image-redcore-lts/files/5.10-uksm-linux-hardened.patch b/sys-kernel/linux-image-redcore-lts/files/5.10-uksm-linux-hardened.patch
deleted file mode 100644
index f85a1de1..00000000
--- a/sys-kernel/linux-image-redcore-lts/files/5.10-uksm-linux-hardened.patch
+++ /dev/null
@@ -1,6911 +0,0 @@
-diff -Nur a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt
---- a/Documentation/vm/uksm.txt	1970-01-01 01:00:00.000000000 +0100
-+++ b/Documentation/vm/uksm.txt	2021-01-03 14:22:34.498459039 +0000
-@@ -0,0 +1,61 @@
-+The Ultra Kernel Samepage Merging feature
-+----------------------------------------------
-+/*
-+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
-+ *
-+ * This is an improvement upon KSM. Some basic data structures and routines
-+ * are borrowed from ksm.c .
-+ *
-+ * Its new features:
-+ * 1. Full system scan:
-+ *      It automatically scans all user processes' anonymous VMAs. Kernel-user
-+ *      interaction to submit a memory area to KSM is no longer needed.
-+ *
-+ * 2. Rich area detection:
-+ *      It automatically detects rich areas containing abundant duplicated
-+ *      pages based. Rich areas are given a full scan speed. Poor areas are
-+ *      sampled at a reasonable speed with very low CPU consumption.
-+ *
-+ * 3. Ultra Per-page scan speed improvement:
-+ *      A new hash algorithm is proposed. As a result, on a machine with
-+ *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
-+ *      can scan memory areas that does not contain duplicated pages at speed of
-+ *      627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of
-+ *      477MB/sec ~ 923MB/sec.
-+ *
-+ * 4. Thrashing area avoidance:
-+ *      Thrashing area(an VMA that has frequent Ksm page break-out) can be
-+ *      filtered out. My benchmark shows it's more efficient than KSM's per-page
-+ *      hash value based volatile page detection.
-+ *
-+ *
-+ * 5. Misc changes upon KSM:
-+ *      * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page
-+ *        comparison. It's much faster than default C version on x86.
-+ *      * rmap_item now has an struct *page member to loosely cache a
-+ *        address-->page mapping, which reduces too much time-costly
-+ *        follow_page().
-+ *      * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
-+ *      * try_to_merge_two_pages() now can revert a pte if it fails. No break_
-+ *        ksm is needed for this case.
-+ *
-+ * 6. Full Zero Page consideration(contributed by Figo Zhang)
-+ *    Now uksmd consider full zero pages as special pages and merge them to an
-+ *    special unswappable uksm zero page.
-+ */
-+
-+ChangeLog:
-+
-+2012-05-05 The creation of this Doc
-+2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up.
-+2012-05-28 UKSM 0.1.1.2 bug fix release
-+2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2
-+2012-07-2  UKSM 0.1.2-beta2
-+2012-07-10 UKSM 0.1.2-beta3
-+2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization.
-+2012-10-13 UKSM 0.1.2.1 Bug fixes.
-+2012-12-31 UKSM 0.1.2.2 Minor bug fixes.
-+2014-07-02 UKSM 0.1.2.3 Fix a " __this_cpu_read() in preemptible bug".
-+2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger anonying warnings.
-+2016-09-10 UKSM 0.1.2.5 Fix a bug in dedup ratio calculation.
-+2017-02-26 UKSM 0.1.2.6 Fix a bug in hugetlbpage handling and a race bug with page migration.
-diff -Nur a/fs/exec.c b/fs/exec.c
---- a/fs/exec.c	2021-01-03 14:20:51.258372089 +0000
-+++ b/fs/exec.c	2021-01-03 14:23:32.755627017 +0000
-@@ -65,6 +65,7 @@
- #include <linux/vmalloc.h>
- #include <linux/io_uring.h>
- #include <linux/random.h>
-+#include <linux/ksm.h>
- 
- #include <linux/uaccess.h>
- #include <asm/mmu_context.h>
-diff -Nur a/fs/proc/meminfo.c b/fs/proc/meminfo.c
---- a/fs/proc/meminfo.c	2020-12-30 10:54:29.000000000 +0000
-+++ b/fs/proc/meminfo.c	2021-01-03 14:22:34.498459039 +0000
-@@ -108,7 +108,10 @@
- #endif
- 	show_val_kb(m, "PageTables:     ",
- 		    global_zone_page_state(NR_PAGETABLE));
--
-+#ifdef CONFIG_UKSM
-+	show_val_kb(m, "KsmZeroPages:     ",
-+		    global_zone_page_state(NR_UKSM_ZERO_PAGES));
-+#endif
- 	show_val_kb(m, "NFS_Unstable:   ", 0);
- 	show_val_kb(m, "Bounce:         ",
- 		    global_zone_page_state(NR_BOUNCE));
-diff -Nur a/include/linux/ksm.h b/include/linux/ksm.h
---- a/include/linux/ksm.h	2020-12-30 10:54:29.000000000 +0000
-+++ b/include/linux/ksm.h	2021-01-03 14:22:34.498459039 +0000
-@@ -21,20 +21,16 @@
- #ifdef CONFIG_KSM
- int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
- 		unsigned long end, int advice, unsigned long *vm_flags);
--int __ksm_enter(struct mm_struct *mm);
--void __ksm_exit(struct mm_struct *mm);
- 
--static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
-+static inline struct stable_node *page_stable_node(struct page *page)
- {
--	if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
--		return __ksm_enter(mm);
--	return 0;
-+	return PageKsm(page) ? page_rmapping(page) : NULL;
- }
- 
--static inline void ksm_exit(struct mm_struct *mm)
-+static inline void set_page_stable_node(struct page *page,
-+					struct stable_node *stable_node)
- {
--	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
--		__ksm_exit(mm);
-+	page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
- }
- 
- /*
-@@ -54,6 +50,33 @@
- void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
- void ksm_migrate_page(struct page *newpage, struct page *oldpage);
- 
-+#ifdef CONFIG_KSM_LEGACY
-+int __ksm_enter(struct mm_struct *mm);
-+void __ksm_exit(struct mm_struct *mm);
-+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
-+{
-+	if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
-+		return __ksm_enter(mm);
-+	return 0;
-+}
-+
-+static inline void ksm_exit(struct mm_struct *mm)
-+{
-+	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
-+		__ksm_exit(mm);
-+}
-+
-+#elif defined(CONFIG_UKSM)
-+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
-+{
-+	return 0;
-+}
-+
-+static inline void ksm_exit(struct mm_struct *mm)
-+{
-+}
-+#endif /* !CONFIG_UKSM */
-+
- #else  /* !CONFIG_KSM */
- 
- static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
-@@ -89,4 +112,6 @@
- #endif /* CONFIG_MMU */
- #endif /* !CONFIG_KSM */
- 
-+#include <linux/uksm.h>
-+
- #endif /* __LINUX_KSM_H */
-diff -Nur a/include/linux/mm_types.h b/include/linux/mm_types.h
---- a/include/linux/mm_types.h	2020-12-30 10:54:29.000000000 +0000
-+++ b/include/linux/mm_types.h	2021-01-03 14:22:34.498459039 +0000
-@@ -372,6 +372,9 @@
- 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
- #endif
- 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
-+#ifdef CONFIG_UKSM
-+	struct vma_slot *uksm_vma_slot;
-+#endif
- } __randomize_layout;
- 
- struct core_thread {
-diff -Nur a/include/linux/mmzone.h b/include/linux/mmzone.h
---- a/include/linux/mmzone.h	2020-12-30 10:54:29.000000000 +0000
-+++ b/include/linux/mmzone.h	2021-01-03 14:22:34.498459039 +0000
-@@ -159,6 +159,9 @@
- 	NR_ZSPAGES,		/* allocated in zsmalloc */
- #endif
- 	NR_FREE_CMA_PAGES,
-+#ifdef CONFIG_UKSM
-+	NR_UKSM_ZERO_PAGES,
-+#endif
- 	NR_VM_ZONE_STAT_ITEMS };
- 
- enum node_stat_item {
-diff -Nur a/include/linux/pgtable.h b/include/linux/pgtable.h
---- a/include/linux/pgtable.h	2020-12-30 10:54:29.000000000 +0000
-+++ b/include/linux/pgtable.h	2021-01-03 14:22:34.498459039 +0000
-@@ -1060,12 +1060,25 @@
- extern void untrack_pfn_moved(struct vm_area_struct *vma);
- #endif
- 
-+#ifdef CONFIG_UKSM
-+static inline int is_uksm_zero_pfn(unsigned long pfn)
-+{
-+	extern unsigned long uksm_zero_pfn;
-+	return pfn == uksm_zero_pfn;
-+}
-+#else
-+static inline int is_uksm_zero_pfn(unsigned long pfn)
-+{
-+	return 0;
-+}
-+#endif
-+
- #ifdef __HAVE_COLOR_ZERO_PAGE
- static inline int is_zero_pfn(unsigned long pfn)
- {
- 	extern unsigned long zero_pfn;
- 	unsigned long offset_from_zero_pfn = pfn - zero_pfn;
--	return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
-+	return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn);
- }
- 
- #define my_zero_pfn(addr)	page_to_pfn(ZERO_PAGE(addr))
-@@ -1074,7 +1087,7 @@
- static inline int is_zero_pfn(unsigned long pfn)
- {
- 	extern unsigned long zero_pfn;
--	return pfn == zero_pfn;
-+	return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn));
- }
- 
- static inline unsigned long my_zero_pfn(unsigned long addr)
-diff -Nur a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h
---- a/include/linux/sradix-tree.h	1970-01-01 01:00:00.000000000 +0100
-+++ b/include/linux/sradix-tree.h	2021-01-03 14:22:34.498459039 +0000
-@@ -0,0 +1,77 @@
-+#ifndef _LINUX_SRADIX_TREE_H
-+#define _LINUX_SRADIX_TREE_H
-+
-+
-+#define INIT_SRADIX_TREE(root, mask)					\
-+do {									\
-+	(root)->height = 0;						\
-+	(root)->gfp_mask = (mask);					\
-+	(root)->rnode = NULL;						\
-+} while (0)
-+
-+#define ULONG_BITS	(sizeof(unsigned long) * 8)
-+#define SRADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
-+//#define SRADIX_TREE_MAP_SHIFT	6
-+//#define SRADIX_TREE_MAP_SIZE	(1UL << SRADIX_TREE_MAP_SHIFT)
-+//#define SRADIX_TREE_MAP_MASK	(SRADIX_TREE_MAP_SIZE-1)
-+
-+struct sradix_tree_node {
-+	unsigned int	height;		/* Height from the bottom */
-+	unsigned int	count;
-+	unsigned int	fulls;		/* Number of full sublevel trees */
-+	struct sradix_tree_node *parent;
-+	void *stores[0];
-+};
-+
-+/* A simple radix tree implementation */
-+struct sradix_tree_root {
-+	unsigned int            height;
-+	struct sradix_tree_node *rnode;
-+
-+	/* Where found to have available empty stores in its sublevels */
-+	struct sradix_tree_node *enter_node;
-+	unsigned int shift;
-+	unsigned int stores_size;
-+	unsigned int mask;
-+	unsigned long min;	/* The first hole index */
-+	unsigned long num;
-+	//unsigned long *height_to_maxindex;
-+
-+	/* How the node is allocated and freed. */
-+	struct sradix_tree_node *(*alloc)(void);
-+	void (*free)(struct sradix_tree_node *node);
-+
-+	/* When a new node is added and removed */
-+	void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child);
-+	void (*assign)(struct sradix_tree_node *node, unsigned int index, void *item);
-+	void (*rm)(struct sradix_tree_node *node, unsigned int offset);
-+};
-+
-+struct sradix_tree_path {
-+	struct sradix_tree_node *node;
-+	int offset;
-+};
-+
-+static inline
-+void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift)
-+{
-+	root->height = 0;
-+	root->rnode = NULL;
-+	root->shift = shift;
-+	root->stores_size = 1UL << shift;
-+	root->mask = root->stores_size - 1;
-+}
-+
-+
-+extern void *sradix_tree_next(struct sradix_tree_root *root,
-+		       struct sradix_tree_node *node, unsigned long index,
-+		       int (*iter)(void *, unsigned long));
-+
-+extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num);
-+
-+extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
-+			struct sradix_tree_node *node, unsigned long index);
-+
-+extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index);
-+
-+#endif /* _LINUX_SRADIX_TREE_H */
-diff -Nur a/include/linux/uksm.h b/include/linux/uksm.h
---- a/include/linux/uksm.h	1970-01-01 01:00:00.000000000 +0100
-+++ b/include/linux/uksm.h	2021-01-03 14:22:34.498459039 +0000
-@@ -0,0 +1,149 @@
-+#ifndef __LINUX_UKSM_H
-+#define __LINUX_UKSM_H
-+/*
-+ * Memory merging support.
-+ *
-+ * This code enables dynamic sharing of identical pages found in different
-+ * memory areas, even if they are not shared by fork().
-+ */
-+
-+/* if !CONFIG_UKSM this file should not be compiled at all. */
-+#ifdef CONFIG_UKSM
-+
-+#include <linux/bitops.h>
-+#include <linux/mm.h>
-+#include <linux/pagemap.h>
-+#include <linux/rmap.h>
-+#include <linux/sched.h>
-+
-+extern unsigned long zero_pfn __read_mostly;
-+extern unsigned long uksm_zero_pfn __read_mostly;
-+extern struct page *empty_uksm_zero_page;
-+
-+/* must be done before linked to mm */
-+extern void uksm_vma_add_new(struct vm_area_struct *vma);
-+extern void uksm_remove_vma(struct vm_area_struct *vma);
-+
-+#define UKSM_SLOT_NEED_SORT	(1 << 0)
-+#define UKSM_SLOT_NEED_RERAND	(1 << 1)
-+#define UKSM_SLOT_SCANNED	(1 << 2) /* It's scanned in this round */
-+#define UKSM_SLOT_FUL_SCANNED	(1 << 3)
-+#define UKSM_SLOT_IN_UKSM	(1 << 4)
-+
-+struct vma_slot {
-+	struct sradix_tree_node *snode;
-+	unsigned long sindex;
-+
-+	struct list_head slot_list;
-+	unsigned long fully_scanned_round;
-+	unsigned long dedup_num;
-+	unsigned long pages_scanned;
-+	unsigned long this_sampled;
-+	unsigned long last_scanned;
-+	unsigned long pages_to_scan;
-+	struct scan_rung *rung;
-+	struct page **rmap_list_pool;
-+	unsigned int *pool_counts;
-+	unsigned long pool_size;
-+	struct vm_area_struct *vma;
-+	struct mm_struct *mm;
-+	unsigned long ctime_j;
-+	unsigned long pages;
-+	unsigned long flags;
-+	unsigned long pages_cowed; /* pages cowed this round */
-+	unsigned long pages_merged; /* pages merged this round */
-+	unsigned long pages_bemerged;
-+
-+	/* when it has page merged in this eval round */
-+	struct list_head dedup_list;
-+};
-+
-+static inline void uksm_unmap_zero_page(pte_t pte)
-+{
-+	if (pte_pfn(pte) == uksm_zero_pfn)
-+		__dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
-+}
-+
-+static inline void uksm_map_zero_page(pte_t pte)
-+{
-+	if (pte_pfn(pte) == uksm_zero_pfn)
-+		__inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
-+}
-+
-+static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
-+{
-+	if (vma->uksm_vma_slot && PageKsm(page))
-+		vma->uksm_vma_slot->pages_cowed++;
-+}
-+
-+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
-+{
-+	if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn)
-+		vma->uksm_vma_slot->pages_cowed++;
-+}
-+
-+static inline int uksm_flags_can_scan(unsigned long vm_flags)
-+{
-+#ifdef VM_SAO
-+		if (vm_flags & VM_SAO)
-+			return 0;
-+#endif
-+
-+	return !(vm_flags & (VM_PFNMAP | VM_IO  | VM_DONTEXPAND |
-+			     VM_HUGETLB | VM_MIXEDMAP | VM_SHARED
-+			     | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN));
-+}
-+
-+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
-+{
-+	if (uksm_flags_can_scan(*vm_flags_p))
-+		*vm_flags_p |= VM_MERGEABLE;
-+}
-+
-+/*
-+ * Just a wrapper for BUG_ON for where ksm_zeropage must not be. TODO: it will
-+ * be removed when uksm zero page patch is stable enough.
-+ */
-+static inline void uksm_bugon_zeropage(pte_t pte)
-+{
-+	BUG_ON(pte_pfn(pte) == uksm_zero_pfn);
-+}
-+#else
-+static inline void uksm_vma_add_new(struct vm_area_struct *vma)
-+{
-+}
-+
-+static inline void uksm_remove_vma(struct vm_area_struct *vma)
-+{
-+}
-+
-+static inline void uksm_unmap_zero_page(pte_t pte)
-+{
-+}
-+
-+static inline void uksm_map_zero_page(pte_t pte)
-+{
-+}
-+
-+static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
-+{
-+}
-+
-+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
-+{
-+}
-+
-+static inline int uksm_flags_can_scan(unsigned long vm_flags)
-+{
-+	return 0;
-+}
-+
-+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
-+{
-+}
-+
-+static inline void uksm_bugon_zeropage(pte_t pte)
-+{
-+}
-+#endif /* !CONFIG_UKSM */
-+#endif /* __LINUX_UKSM_H */
-diff -Nur a/kernel/fork.c b/kernel/fork.c
---- a/kernel/fork.c	2021-01-03 14:20:51.263372191 +0000
-+++ b/kernel/fork.c	2021-01-03 14:22:34.499459059 +0000
-@@ -588,7 +588,7 @@
- 		__vma_link_rb(mm, tmp, rb_link, rb_parent);
- 		rb_link = &tmp->vm_rb.rb_right;
- 		rb_parent = &tmp->vm_rb;
--
-+		uksm_vma_add_new(tmp);
- 		mm->map_count++;
- 		if (!(tmp->vm_flags & VM_WIPEONFORK))
- 			retval = copy_page_range(tmp, mpnt);
-diff -Nur a/lib/Makefile b/lib/Makefile
---- a/lib/Makefile	2020-12-30 10:54:29.000000000 +0000
-+++ b/lib/Makefile	2021-01-03 14:22:34.499459059 +0000
-@@ -31,7 +31,7 @@
- KCSAN_SANITIZE_random32.o := n
- 
- lib-y := ctype.o string.o vsprintf.o cmdline.o \
--	 rbtree.o radix-tree.o timerqueue.o xarray.o \
-+	 rbtree.o radix-tree.o sradix-tree.o timerqueue.o xarray.o \
- 	 idr.o extable.o sha1.o irq_regs.o argv_split.o \
- 	 flex_proportions.o ratelimit.o show_mem.o \
- 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
-diff -Nur a/lib/sradix-tree.c b/lib/sradix-tree.c
---- a/lib/sradix-tree.c	1970-01-01 01:00:00.000000000 +0100
-+++ b/lib/sradix-tree.c	2021-01-03 14:22:34.499459059 +0000
-@@ -0,0 +1,476 @@
-+#include <linux/errno.h>
-+#include <linux/mm.h>
-+#include <linux/mman.h>
-+#include <linux/spinlock.h>
-+#include <linux/slab.h>
-+#include <linux/gcd.h>
-+#include <linux/sradix-tree.h>
-+
-+static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node)
-+{
-+	return node->fulls == root->stores_size ||
-+		(node->height == 1 && node->count == root->stores_size);
-+}
-+
-+/*
-+ *	Extend a sradix tree so it can store key @index.
-+ */
-+static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index)
-+{
-+	struct sradix_tree_node *node;
-+	unsigned int height;
-+
-+	if (unlikely(root->rnode == NULL)) {
-+		if (!(node = root->alloc()))
-+			return -ENOMEM;
-+
-+		node->height = 1;
-+		root->rnode = node;
-+		root->height = 1;
-+	}
-+
-+	/* Figure out what the height should be.  */
-+	height = root->height;
-+	index >>= root->shift * height;
-+
-+	while (index) {
-+		index >>= root->shift;
-+		height++;
-+	}
-+
-+	while (height > root->height) {
-+		unsigned int newheight;
-+
-+		if (!(node = root->alloc()))
-+			return -ENOMEM;
-+
-+		/* Increase the height.  */
-+		node->stores[0] = root->rnode;
-+		root->rnode->parent = node;
-+		if (root->extend)
-+			root->extend(node, root->rnode);
-+
-+		newheight = root->height + 1;
-+		node->height = newheight;
-+		node->count = 1;
-+		if (sradix_node_full(root, root->rnode))
-+			node->fulls = 1;
-+
-+		root->rnode = node;
-+		root->height = newheight;
-+	}
-+
-+	return 0;
-+}
-+
-+/*
-+ * Search the next item from the current node, that is not NULL
-+ * and can satify root->iter().
-+ */
-+void *sradix_tree_next(struct sradix_tree_root *root,
-+		       struct sradix_tree_node *node, unsigned long index,
-+		       int (*iter)(void *item, unsigned long height))
-+{
-+	unsigned long offset;
-+	void *item;
-+
-+	if (unlikely(node == NULL)) {
-+		node = root->rnode;
-+		for (offset = 0; offset < root->stores_size; offset++) {
-+			item = node->stores[offset];
-+			if (item && (!iter || iter(item, node->height)))
-+				break;
-+		}
-+
-+		if (unlikely(offset >= root->stores_size))
-+			return NULL;
-+
-+		if (node->height == 1)
-+			return item;
-+		else
-+			goto go_down;
-+	}
-+
-+	while (node) {
-+		offset = (index & root->mask) + 1;
-+		for (; offset < root->stores_size; offset++) {
-+			item = node->stores[offset];
-+			if (item && (!iter || iter(item, node->height)))
-+				break;
-+		}
-+
-+		if (offset < root->stores_size)
-+			break;
-+
-+		node = node->parent;
-+		index >>= root->shift;
-+	}
-+
-+	if (!node)
-+		return NULL;
-+
-+	while (node->height > 1) {
-+go_down:
-+		node = item;
-+		for (offset = 0; offset < root->stores_size; offset++) {
-+			item = node->stores[offset];
-+			if (item && (!iter || iter(item, node->height)))
-+				break;
-+		}
-+
-+		if (unlikely(offset >= root->stores_size))
-+			return NULL;
-+	}
-+
-+	BUG_ON(offset > root->stores_size);
-+
-+	return item;
-+}
-+
-+/*
-+ * Blindly insert the item to the tree. Typically, we reuse the
-+ * first empty store item.
-+ */
-+int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num)
-+{
-+	unsigned long index;
-+	unsigned int height;
-+	struct sradix_tree_node *node, *tmp = NULL;
-+	int offset, offset_saved;
-+	void **store = NULL;
-+	int error, i, j, shift;
-+
-+go_on:
-+	index = root->min;
-+
-+	if (root->enter_node && !sradix_node_full(root, root->enter_node)) {
-+		node = root->enter_node;
-+		BUG_ON((index >> (root->shift * root->height)));
-+	} else {
-+		node = root->rnode;
-+		if (node == NULL || (index >> (root->shift * root->height))
-+		    || sradix_node_full(root, node)) {
-+			error = sradix_tree_extend(root, index);
-+			if (error)
-+				return error;
-+
-+			node = root->rnode;
-+		}
-+	}
-+
-+
-+	height = node->height;
-+	shift = (height - 1) * root->shift;
-+	offset = (index >> shift) & root->mask;
-+	while (shift > 0) {
-+		offset_saved = offset;
-+		for (; offset < root->stores_size; offset++) {
-+			store = &node->stores[offset];
-+			tmp = *store;
-+
-+			if (!tmp || !sradix_node_full(root, tmp))
-+				break;
-+		}
-+		BUG_ON(offset >= root->stores_size);
-+
-+		if (offset != offset_saved) {
-+			index += (offset - offset_saved) << shift;
-+			index &= ~((1UL << shift) - 1);
-+		}
-+
-+		if (!tmp) {
-+			if (!(tmp = root->alloc()))
-+				return -ENOMEM;
-+
-+			tmp->height = shift / root->shift;
-+			*store = tmp;
-+			tmp->parent = node;
-+			node->count++;
-+//			if (root->extend)
-+//				root->extend(node, tmp);
-+		}
-+
-+		node = tmp;
-+		shift -= root->shift;
-+		offset = (index >> shift) & root->mask;
-+	}
-+
-+	BUG_ON(node->height != 1);
-+
-+
-+	store = &node->stores[offset];
-+	for (i = 0, j = 0;
-+	      j < root->stores_size - node->count &&
-+	      i < root->stores_size - offset && j < num; i++) {
-+		if (!store[i]) {
-+			store[i] = item[j];
-+			if (root->assign)
-+				root->assign(node, index + i, item[j]);
-+			j++;
-+		}
-+	}
-+
-+	node->count += j;
-+	root->num += j;
-+	num -= j;
-+
-+	while (sradix_node_full(root, node)) {
-+		node = node->parent;
-+		if (!node)
-+			break;
-+
-+		node->fulls++;
-+	}
-+
-+	if (unlikely(!node)) {
-+		/* All nodes are full */
-+		root->min = 1 << (root->height * root->shift);
-+		root->enter_node = NULL;
-+	} else {
-+		root->min = index + i - 1;
-+		root->min |= (1UL << (node->height - 1)) - 1;
-+		root->min++;
-+		root->enter_node = node;
-+	}
-+
-+	if (num) {
-+		item += j;
-+		goto go_on;
-+	}
-+
-+	return 0;
-+}
-+
-+
-+/**
-+ *	sradix_tree_shrink    -    shrink height of a sradix tree to minimal
-+ *      @root		sradix tree root
-+ *
-+ */
-+static inline void sradix_tree_shrink(struct sradix_tree_root *root)
-+{
-+	/* try to shrink tree height */
-+	while (root->height > 1) {
-+		struct sradix_tree_node *to_free = root->rnode;
-+
-+		/*
-+		 * The candidate node has more than one child, or its child
-+		 * is not at the leftmost store, we cannot shrink.
-+		 */
-+		if (to_free->count != 1 || !to_free->stores[0])
-+			break;
-+
-+		root->rnode = to_free->stores[0];
-+		root->rnode->parent = NULL;
-+		root->height--;
-+		if (unlikely(root->enter_node == to_free))
-+			root->enter_node = NULL;
-+		root->free(to_free);
-+	}
-+}
-+
-+/*
-+ * Del the item on the known leaf node and index
-+ */
-+void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
-+				  struct sradix_tree_node *node, unsigned long index)
-+{
-+	unsigned int offset;
-+	struct sradix_tree_node *start, *end;
-+
-+	BUG_ON(node->height != 1);
-+
-+	start = node;
-+	while (node && !(--node->count))
-+		node = node->parent;
-+
-+	end = node;
-+	if (!node) {
-+		root->rnode = NULL;
-+		root->height = 0;
-+		root->min = 0;
-+		root->num = 0;
-+		root->enter_node = NULL;
-+	} else {
-+		offset = (index >> (root->shift * (node->height - 1))) & root->mask;
-+		if (root->rm)
-+			root->rm(node, offset);
-+		node->stores[offset] = NULL;
-+		root->num--;
-+		if (root->min > index) {
-+			root->min = index;
-+			root->enter_node = node;
-+		}
-+	}
-+
-+	if (start != end) {
-+		do {
-+			node = start;
-+			start = start->parent;
-+			if (unlikely(root->enter_node == node))
-+				root->enter_node = end;
-+			root->free(node);
-+		} while (start != end);
-+
-+		/*
-+		 * Note that shrink may free "end", so enter_node still need to
-+		 * be checked inside.
-+		 */
-+		sradix_tree_shrink(root);
-+	} else if (node->count == root->stores_size - 1) {
-+		/* It WAS a full leaf node. Update the ancestors */
-+		node = node->parent;
-+		while (node) {
-+			node->fulls--;
-+			if (node->fulls != root->stores_size - 1)
-+				break;
-+
-+			node = node->parent;
-+		}
-+	}
-+}
-+
-+void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index)
-+{
-+	unsigned int height, offset;
-+	struct sradix_tree_node *node;
-+	int shift;
-+
-+	node = root->rnode;
-+	if (node == NULL || (index >> (root->shift * root->height)))
-+		return NULL;
-+
-+	height = root->height;
-+	shift = (height - 1) * root->shift;
-+
-+	do {
-+		offset = (index >> shift) & root->mask;
-+		node = node->stores[offset];
-+		if (!node)
-+			return NULL;
-+
-+		shift -= root->shift;
-+	} while (shift >= 0);
-+
-+	return node;
-+}
-+
-+/*
-+ * Return the item if it exists, otherwise create it in place
-+ * and return the created item.
-+ */
-+void *sradix_tree_lookup_create(struct sradix_tree_root *root,
-+			unsigned long index, void *(*item_alloc)(void))
-+{
-+	unsigned int height, offset;
-+	struct sradix_tree_node *node, *tmp;
-+	void *item;
-+	int shift, error;
-+
-+	if (root->rnode == NULL || (index >> (root->shift * root->height))) {
-+		if (item_alloc) {
-+			error = sradix_tree_extend(root, index);
-+			if (error)
-+				return NULL;
-+		} else {
-+			return NULL;
-+		}
-+	}
-+
-+	node = root->rnode;
-+	height = root->height;
-+	shift = (height - 1) * root->shift;
-+
-+	do {
-+		offset = (index >> shift) & root->mask;
-+		if (!node->stores[offset]) {
-+			if (!(tmp = root->alloc()))
-+				return NULL;
-+
-+			tmp->height = shift / root->shift;
-+			node->stores[offset] = tmp;
-+			tmp->parent = node;
-+			node->count++;
-+			node = tmp;
-+		} else {
-+			node = node->stores[offset];
-+		}
-+
-+		shift -= root->shift;
-+	} while (shift > 0);
-+
-+	BUG_ON(node->height != 1);
-+	offset = index & root->mask;
-+	if (node->stores[offset]) {
-+		return node->stores[offset];
-+	} else if (item_alloc) {
-+		if (!(item = item_alloc()))
-+			return NULL;
-+
-+		node->stores[offset] = item;
-+
-+		/*
-+		 * NOTE: we do NOT call root->assign here, since this item is
-+		 * newly created by us having no meaning. Caller can call this
-+		 * if it's necessary to do so.
-+		 */
-+
-+		node->count++;
-+		root->num++;
-+
-+		while (sradix_node_full(root, node)) {
-+			node = node->parent;
-+			if (!node)
-+				break;
-+
-+			node->fulls++;
-+		}
-+
-+		if (unlikely(!node)) {
-+			/* All nodes are full */
-+			root->min = 1 << (root->height * root->shift);
-+		} else {
-+			if (root->min == index) {
-+				root->min |= (1UL << (node->height - 1)) - 1;
-+				root->min++;
-+				root->enter_node = node;
-+			}
-+		}
-+
-+		return item;
-+	} else {
-+		return NULL;
-+	}
-+
-+}
-+
-+int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index)
-+{
-+	unsigned int height, offset;
-+	struct sradix_tree_node *node;
-+	int shift;
-+
-+	node = root->rnode;
-+	if (node == NULL || (index >> (root->shift * root->height)))
-+		return -ENOENT;
-+
-+	height = root->height;
-+	shift = (height - 1) * root->shift;
-+
-+	do {
-+		offset = (index >> shift) & root->mask;
-+		node = node->stores[offset];
-+		if (!node)
-+			return -ENOENT;
-+
-+		shift -= root->shift;
-+	} while (shift > 0);
-+
-+	offset = index & root->mask;
-+	if (!node->stores[offset])
-+		return -ENOENT;
-+
-+	sradix_tree_delete_from_leaf(root, node, index);
-+
-+	return 0;
-+}
-diff -Nur a/mm/Kconfig b/mm/Kconfig
---- a/mm/Kconfig	2021-01-03 14:20:51.266372252 +0000
-+++ b/mm/Kconfig	2021-01-03 14:22:34.499459059 +0000
-@@ -317,6 +317,32 @@
- 	  See Documentation/vm/ksm.rst for more information: KSM is inactive
- 	  until a program has madvised that an area is MADV_MERGEABLE, and
- 	  root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
-+choice
-+	prompt "Choose UKSM/KSM strategy"
-+	default UKSM
-+	depends on KSM
-+	help
-+	  This option allows selecting a UKSM/KSM strategy.
-+
-+config UKSM
-+	bool "Ultra-KSM for page merging"
-+	depends on KSM
-+	help
-+	UKSM is inspired by the Linux kernel project - KSM (Kernel Samepage
-+	Merging), but with a fundamentally rewritten core algorithm. With
-+	an advanced algorithm, UKSM can now transparently scan all anonymously
-+	mapped user space applications with a significantly improved scan speed
-+	and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from
-+	UKSM. UKSM now has its first stable release and first real-world enterprise user.
-+	For more information, please go to its project page.
-+	(github.com/dolohow/uksm)
-+
-+config KSM_LEGACY
-+	bool "Legacy KSM implementation"
-+	depends on KSM
-+	help
-+	The legacy KSM implementation from Red Hat.
-+endchoice
- 
- config DEFAULT_MMAP_MIN_ADDR
- 	int "Low address space to protect from user allocation"
-diff -Nur a/mm/ksm.c b/mm/ksm.c
---- a/mm/ksm.c	2020-12-30 10:54:29.000000000 +0000
-+++ b/mm/ksm.c	2021-01-03 14:22:34.499459059 +0000
-@@ -858,17 +858,6 @@
- 	return err;
- }
- 
--static inline struct stable_node *page_stable_node(struct page *page)
--{
--	return PageKsm(page) ? page_rmapping(page) : NULL;
--}
--
--static inline void set_page_stable_node(struct page *page,
--					struct stable_node *stable_node)
--{
--	page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
--}
--
- #ifdef CONFIG_SYSFS
- /*
-  * Only called through the sysfs control interface:
-diff -Nur a/mm/Makefile b/mm/Makefile
---- a/mm/Makefile	2020-12-30 10:54:29.000000000 +0000
-+++ b/mm/Makefile	2021-01-03 14:22:34.499459059 +0000
-@@ -76,7 +76,8 @@
- obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
- obj-$(CONFIG_SLOB) += slob.o
- obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
--obj-$(CONFIG_KSM) += ksm.o
-+obj-$(CONFIG_KSM_LEGACY) += ksm.o
-+obj-$(CONFIG_UKSM) += uksm.o
- obj-$(CONFIG_PAGE_POISONING) += page_poison.o
- obj-$(CONFIG_SLAB) += slab.o
- obj-$(CONFIG_SLUB) += slub.o
-diff -Nur a/mm/memory.c b/mm/memory.c
---- a/mm/memory.c	2020-12-30 10:54:29.000000000 +0000
-+++ b/mm/memory.c	2021-01-03 14:22:34.500459079 +0000
-@@ -146,6 +146,25 @@
- 
- unsigned long highest_memmap_pfn __read_mostly;
- 
-+#ifdef CONFIG_UKSM
-+unsigned long uksm_zero_pfn __read_mostly;
-+EXPORT_SYMBOL_GPL(uksm_zero_pfn);
-+struct page *empty_uksm_zero_page;
-+
-+static int __init setup_uksm_zero_page(void)
-+{
-+	empty_uksm_zero_page = alloc_pages(__GFP_ZERO & ~__GFP_MOVABLE, 0);
-+	if (!empty_uksm_zero_page)
-+		panic("Oh boy, that early out of memory?");
-+
-+	SetPageReserved(empty_uksm_zero_page);
-+	uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page);
-+
-+	return 0;
-+}
-+core_initcall(setup_uksm_zero_page);
-+#endif
-+
- /*
-  * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
-  */
-@@ -161,6 +180,7 @@
- 	trace_rss_stat(mm, member, count);
- }
- 
-+
- #if defined(SPLIT_RSS_COUNTING)
- 
- void sync_mm_rss(struct mm_struct *mm)
-@@ -869,6 +889,11 @@
- 		get_page(page);
- 		page_dup_rmap(page, false);
- 		rss[mm_counter(page)]++;
-+
-+		/* Should return NULL in vm_normal_page() */
-+		uksm_bugon_zeropage(pte);
-+	} else {
-+		uksm_map_zero_page(pte);
- 	}
- 
- 	/*
-@@ -1248,8 +1273,10 @@
- 			ptent = ptep_get_and_clear_full(mm, addr, pte,
- 							tlb->fullmm);
- 			tlb_remove_tlb_entry(tlb, pte, addr);
--			if (unlikely(!page))
-+			if (unlikely(!page)) {
-+				uksm_unmap_zero_page(ptent);
- 				continue;
-+			}
- 
- 			if (!PageAnon(page)) {
- 				if (pte_dirty(ptent)) {
-@@ -2597,6 +2624,7 @@
- 
- 	if (likely(src)) {
- 		copy_user_highpage(dst, src, addr, vma);
-+		uksm_cow_page(vma, src);
- 		return true;
- 	}
- 
-@@ -2843,6 +2871,7 @@
- 							      vmf->address);
- 		if (!new_page)
- 			goto oom;
-+		uksm_cow_pte(vma, vmf->orig_pte);
- 	} else {
- 		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
- 				vmf->address);
-@@ -2885,7 +2914,9 @@
- 						mm_counter_file(old_page));
- 				inc_mm_counter_fast(mm, MM_ANONPAGES);
- 			}
-+			uksm_bugon_zeropage(vmf->orig_pte);
- 		} else {
-+			uksm_unmap_zero_page(vmf->orig_pte);
- 			inc_mm_counter_fast(mm, MM_ANONPAGES);
- 		}
- 		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
-diff -Nur a/mm/mmap.c b/mm/mmap.c
---- a/mm/mmap.c	2021-01-03 14:20:51.267372272 +0000
-+++ b/mm/mmap.c	2021-01-03 14:22:34.500459079 +0000
-@@ -46,6 +46,7 @@
- #include <linux/moduleparam.h>
- #include <linux/pkeys.h>
- #include <linux/oom.h>
-+#include <linux/ksm.h>
- #include <linux/sched/mm.h>
- 
- #include <linux/uaccess.h>
-@@ -181,6 +182,7 @@
- 	if (vma->vm_file)
- 		fput(vma->vm_file);
- 	mpol_put(vma_policy(vma));
-+       uksm_remove_vma(vma);
- 	vm_area_free(vma);
- 	return next;
- }
-@@ -757,9 +759,16 @@
- 	long adjust_next = 0;
- 	int remove_next = 0;
- 
-+/*
-+ * to avoid deadlock, uksm_remove_vma must be done before any spin_lock is
-+ * acquired
-+ */
-+	uksm_remove_vma(vma);
-+
- 	if (next && !insert) {
- 		struct vm_area_struct *exporter = NULL, *importer = NULL;
- 
-+		uksm_remove_vma(next);
- 		if (end >= next->vm_end) {
- 			/*
- 			 * vma expands, overlapping all the next, and
-@@ -890,6 +899,7 @@
- 		end_changed = true;
- 	}
- 	vma->vm_pgoff = pgoff;
-+
- 	if (adjust_next) {
- 		next->vm_start += adjust_next;
- 		next->vm_pgoff += adjust_next >> PAGE_SHIFT;
-@@ -994,6 +1004,7 @@
- 		if (remove_next == 2) {
- 			remove_next = 1;
- 			end = next->vm_end;
-+			uksm_remove_vma(next);
- 			goto again;
- 		}
- 		else if (next)
-@@ -1020,10 +1031,14 @@
- 			 */
- 			VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
- 		}
-+	} else {
-+		if (next && !insert)
-+			uksm_vma_add_new(next);
- 	}
- 	if (insert && file)
- 		uprobe_mmap(insert);
- 
-+	uksm_vma_add_new(vma);
- 	validate_mm(mm);
- 
- 	return 0;
-@@ -1479,6 +1494,9 @@
- 	vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
- 			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
- 
-+	/* If uksm is enabled, we add VM_MERGEABLE to new VMAs. */
-+	uksm_vm_flags_mod(&vm_flags);
-+
- 	if (flags & MAP_LOCKED)
- 		if (!can_do_mlock())
- 			return -EPERM;
-@@ -1874,6 +1892,7 @@
- 			allow_write_access(file);
- 	}
- 	file = vma->vm_file;
-+	uksm_vma_add_new(vma);
- out:
- 	perf_event_mmap(vma);
- 
-@@ -1916,6 +1935,7 @@
- 	if (vm_flags & VM_DENYWRITE)
- 		allow_write_access(file);
- free_vma:
-+	uksm_remove_vma(vma);
- 	vm_area_free(vma);
- unacct_error:
- 	if (charged)
-@@ -2775,6 +2795,8 @@
- 	else
- 		err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
- 
-+	uksm_vma_add_new(new);
-+
- 	/* Success. */
- 	if (!err)
- 		return 0;
-@@ -3082,6 +3104,7 @@
- 	if ((flags & (~VM_EXEC)) != 0)
- 		return -EINVAL;
- 	flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
-+	uksm_vm_flags_mod(&flags);
- 
- 	mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
- 	if (IS_ERR_VALUE(mapped_addr))
-@@ -3127,6 +3150,7 @@
- 	vma->vm_flags = flags;
- 	vma->vm_page_prot = vm_get_page_prot(flags);
- 	vma_link(mm, vma, prev, rb_link, rb_parent);
-+	uksm_vma_add_new(vma);
- out:
- 	perf_event_mmap(vma);
- 	mm->total_vm += len >> PAGE_SHIFT;
-@@ -3204,6 +3228,12 @@
- 		mmap_write_unlock(mm);
- 	}
- 
-+	/*
-+	 * Taking write lock on mmap does not harm others,
-+	 * but it's crucial for uksm to avoid races.
-+	 */
-+	mmap_write_lock(mm);
-+
- 	if (mm->locked_vm) {
- 		vma = mm->mmap;
- 		while (vma) {
-@@ -3239,6 +3269,11 @@
- 		cond_resched();
- 	}
- 	vm_unacct_memory(nr_accounted);
-+
-+	mm->mmap = NULL;
-+	mm->mm_rb = RB_ROOT;
-+	vmacache_invalidate(mm);
-+	mmap_write_unlock(mm);
- }
- 
- /* Insert vm structure into process list sorted by address
-@@ -3346,6 +3381,7 @@
- 			new_vma->vm_ops->open(new_vma);
- 		vma_link(mm, new_vma, prev, rb_link, rb_parent);
- 		*need_rmap_locks = false;
-+		uksm_vma_add_new(new_vma);
- 	}
- 	return new_vma;
- 
-@@ -3498,6 +3534,7 @@
- 	vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
- 
- 	perf_event_mmap(vma);
-+	uksm_vma_add_new(vma);
- 
- 	return vma;
- 
-diff -Nur a/mm/uksm.c b/mm/uksm.c
---- a/mm/uksm.c	1970-01-01 01:00:00.000000000 +0100
-+++ b/mm/uksm.c	2021-01-03 14:22:34.501459099 +0000
-@@ -0,0 +1,5614 @@
-+/*
-+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
-+ *
-+ * This is an improvement upon KSM. Some basic data structures and routines
-+ * are borrowed from ksm.c .
-+ *
-+ * Its new features:
-+ * 1. Full system scan:
-+ *      It automatically scans all user processes' anonymous VMAs. Kernel-user
-+ *      interaction to submit a memory area to KSM is no longer needed.
-+ *
-+ * 2. Rich area detection:
-+ *      It automatically detects rich areas containing abundant duplicated
-+ *      pages. Rich areas are given a full scan speed. Poor areas are
-+ *      sampled at a reasonable speed with very low CPU consumption.
-+ *
-+ * 3. Ultra Per-page scan speed improvement:
-+ *      A new hash algorithm is proposed. As a result, on a machine with
-+ *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
-+ *      can scan memory areas that do not contain duplicated pages at a speed of
-+ *      627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
-+ *      477MB/sec ~ 923MB/sec.
-+ *
-+ * 4. Thrashing area avoidance:
-+ *      Thrashing areas (VMAs that have frequent KSM page break-outs) can be
-+ *      filtered out. My benchmark shows it's more efficient than KSM's per-page
-+ *      hash value based volatile page detection.
-+ *
-+ *
-+ * 5. Misc changes upon KSM:
-+ *      * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page
-+ *        comparison. It's much faster than the default C version on x86.
-+ *      * rmap_item now has a struct page * member to loosely cache an
-+ *        address-->page mapping, which avoids many time-costly
-+ *        follow_page() calls.
-+ *      * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
-+ *      * try_to_merge_two_pages() now can revert a pte if it fails. No
-+ *        break_ksm is needed for this case.
-+ *
-+ * 6. Full Zero Page consideration(contributed by Figo Zhang)
-+ *    Now uksmd considers full zero pages as special pages and merges them into
-+ *    a special unswappable uksm zero page.
-+ */
-+
-+#include <linux/errno.h>
-+#include <linux/mm.h>
-+#include <linux/fs.h>
-+#include <linux/mman.h>
-+#include <linux/sched.h>
-+#include <linux/sched/mm.h>
-+#include <linux/sched/coredump.h>
-+#include <linux/sched/cputime.h>
-+#include <linux/rwsem.h>
-+#include <linux/pagemap.h>
-+#include <linux/rmap.h>
-+#include <linux/spinlock.h>
-+#include <linux/jhash.h>
-+#include <linux/delay.h>
-+#include <linux/kthread.h>
-+#include <linux/wait.h>
-+#include <linux/slab.h>
-+#include <linux/rbtree.h>
-+#include <linux/memory.h>
-+#include <linux/mmu_notifier.h>
-+#include <linux/swap.h>
-+#include <linux/ksm.h>
-+#include <linux/crypto.h>
-+#include <linux/scatterlist.h>
-+#include <crypto/hash.h>
-+#include <linux/random.h>
-+#include <linux/math64.h>
-+#include <linux/gcd.h>
-+#include <linux/freezer.h>
-+#include <linux/oom.h>
-+#include <linux/numa.h>
-+#include <linux/sradix-tree.h>
-+
-+#include <asm/tlbflush.h>
-+#include "internal.h"
-+
-+#ifdef CONFIG_X86
-+#undef memcmp
-+
-+#ifdef CONFIG_X86_32
-+#define memcmp memcmpx86_32
-+/*
-+ * Compare 4-byte-aligned address s1 and s2, with length n
-+ */
-+int memcmpx86_32(void *s1, void *s2, size_t n)
-+{
-+	size_t num = n / 4;
-+	register int res;
-+
-+	__asm__ __volatile__
-+	(
-+	 "testl %3,%3\n\t"
-+	 "repe; cmpsd\n\t"
-+	 "je        1f\n\t"
-+	 "sbbl      %0,%0\n\t"
-+	 "orl       $1,%0\n"
-+	 "1:"
-+	 : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
-+	 : "0" (0)
-+	 : "cc");
-+
-+	return res;
-+}
-+
-+/*
-+ * Check whether the page is all zero.
-+ */
-+static int is_full_zero(const void *s1, size_t len)
-+{
-+	unsigned char same;
-+
-+	len /= 4;
-+
-+	__asm__ __volatile__
-+	("repe; scasl;"
-+	 "sete %0"
-+	 : "=qm" (same), "+D" (s1), "+c" (len)
-+	 : "a" (0)
-+	 : "cc");
-+
-+	return same;
-+}
-+
-+
-+#elif defined(CONFIG_X86_64)
-+#define memcmp memcmpx86_64
-+/*
-+ * Compare 8-byte-aligned address s1 and s2, with length n
-+ */
-+int memcmpx86_64(void *s1, void *s2, size_t n)
-+{
-+	size_t num = n / 8;
-+	register int res;
-+
-+	__asm__ __volatile__
-+	(
-+	 "testq %q3,%q3\n\t"
-+	 "repe; cmpsq\n\t"
-+	 "je        1f\n\t"
-+	 "sbbq      %q0,%q0\n\t"
-+	 "orq       $1,%q0\n"
-+	 "1:"
-+	 : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
-+	 : "0" (0)
-+	 : "cc");
-+
-+	return res;
-+}
-+
-+static int is_full_zero(const void *s1, size_t len)
-+{
-+	unsigned char same;
-+
-+	len /= 8;
-+
-+	__asm__ __volatile__
-+	("repe; scasq;"
-+	 "sete %0"
-+	 : "=qm" (same), "+D" (s1), "+c" (len)
-+	 : "a" (0)
-+	 : "cc");
-+
-+	return same;
-+}
-+
-+#endif
-+#else
-+static int is_full_zero(const void *s1, size_t len)
-+{
-+	unsigned long *src = s1;
-+	int i;
-+
-+	len /= sizeof(*src);
-+
-+	for (i = 0; i < len; i++) {
-+		if (src[i])
-+			return 0;
-+	}
-+
-+	return 1;
-+}
-+#endif
-+
-+#define UKSM_RUNG_ROUND_FINISHED  (1 << 0)
-+#define TIME_RATIO_SCALE	10000
-+
-+#define SLOT_TREE_NODE_SHIFT	8
-+#define SLOT_TREE_NODE_STORE_SIZE	(1UL << SLOT_TREE_NODE_SHIFT)
-+struct slot_tree_node {
-+	unsigned long size;
-+	struct sradix_tree_node snode;
-+	void *stores[SLOT_TREE_NODE_STORE_SIZE];
-+};
-+
-+static struct kmem_cache *slot_tree_node_cachep;
-+
-+static struct sradix_tree_node *slot_tree_node_alloc(void)
-+{
-+	struct slot_tree_node *p;
-+
-+	p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL |
-+			      __GFP_NORETRY | __GFP_NOWARN);
-+	if (!p)
-+		return NULL;
-+
-+	return &p->snode;
-+}
-+
-+static void slot_tree_node_free(struct sradix_tree_node *node)
-+{
-+	struct slot_tree_node *p;
-+
-+	p = container_of(node, struct slot_tree_node, snode);
-+	kmem_cache_free(slot_tree_node_cachep, p);
-+}
-+
-+static void slot_tree_node_extend(struct sradix_tree_node *parent,
-+				  struct sradix_tree_node *child)
-+{
-+	struct slot_tree_node *p, *c;
-+
-+	p = container_of(parent, struct slot_tree_node, snode);
-+	c = container_of(child, struct slot_tree_node, snode);
-+
-+	p->size += c->size;
-+}
-+
-+void slot_tree_node_assign(struct sradix_tree_node *node,
-+			   unsigned int index, void *item)
-+{
-+	struct vma_slot *slot = item;
-+	struct slot_tree_node *cur;
-+
-+	slot->snode = node;
-+	slot->sindex = index;
-+
-+	while (node) {
-+		cur = container_of(node, struct slot_tree_node, snode);
-+		cur->size += slot->pages;
-+		node = node->parent;
-+	}
-+}
-+
-+void slot_tree_node_rm(struct sradix_tree_node *node, unsigned int offset)
-+{
-+	struct vma_slot *slot;
-+	struct slot_tree_node *cur;
-+	unsigned long pages;
-+
-+	if (node->height == 1) {
-+		slot = node->stores[offset];
-+		pages = slot->pages;
-+	} else {
-+		cur = container_of(node->stores[offset],
-+				   struct slot_tree_node, snode);
-+		pages = cur->size;
-+	}
-+
-+	while (node) {
-+		cur = container_of(node, struct slot_tree_node, snode);
-+		cur->size -= pages;
-+		node = node->parent;
-+	}
-+}
-+
-+unsigned long slot_iter_index;
-+int slot_iter(void *item,  unsigned long height)
-+{
-+	struct slot_tree_node *node;
-+	struct vma_slot *slot;
-+
-+	if (height == 1) {
-+		slot = item;
-+		if (slot_iter_index < slot->pages) {
-+			/*in this one*/
-+			return 1;
-+		} else {
-+			slot_iter_index -= slot->pages;
-+			return 0;
-+		}
-+
-+	} else {
-+		node = container_of(item, struct slot_tree_node, snode);
-+		if (slot_iter_index < node->size) {
-+			/*in this one*/
-+			return 1;
-+		} else {
-+			slot_iter_index -= node->size;
-+			return 0;
-+		}
-+	}
-+}
-+
-+
-+static inline void slot_tree_init_root(struct sradix_tree_root *root)
-+{
-+	init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT);
-+	root->alloc = slot_tree_node_alloc;
-+	root->free = slot_tree_node_free;
-+	root->extend = slot_tree_node_extend;
-+	root->assign = slot_tree_node_assign;
-+	root->rm = slot_tree_node_rm;
-+}
-+
-+void slot_tree_init(void)
-+{
-+	slot_tree_node_cachep = kmem_cache_create("slot_tree_node",
-+				sizeof(struct slot_tree_node), 0,
-+				SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
-+				NULL);
-+}
-+
-+
-+/* Each rung of this ladder is a list of VMAs having a same scan ratio */
-+struct scan_rung {
-+	//struct list_head scanned_list;
-+	struct sradix_tree_root vma_root;
-+	struct sradix_tree_root vma_root2;
-+
-+	struct vma_slot *current_scan;
-+	unsigned long current_offset;
-+
-+	/*
-+	 * The initial value for current_offset, it should loop over
-+	 * [0~ step - 1] to let all slot have its chance to be scanned.
-+	 */
-+	unsigned long offset_init;
-+	unsigned long step; /* dynamic step for current_offset */
-+	unsigned int flags;
-+	unsigned long pages_to_scan;
-+	//unsigned long fully_scanned_slots;
-+	/*
-+	 * a little bit tricky - if cpu_ratio > 0, then the value is
-+	 * the cpu time ratio it can spend in rung_i for every scan
-+	 * period. if < 0, then it is the cpu time ratio relative to the
-+	 * max cpu percentage user specified. Both in unit of
-+	 * 1/TIME_RATIO_SCALE
-+	 */
-+	int cpu_ratio;
-+
-+	/*
-+	 * How long it will take for all slots in this rung to be fully
-+	 * scanned? If it's zero, we don't care about the cover time:
-+	 * it's fully scanned.
-+	 */
-+	unsigned int cover_msecs;
-+	//unsigned long vma_num;
-+	//unsigned long pages; /* Sum of all slot's pages in rung */
-+};
-+
-+/**
-+ * node of either the stable or unstable rbtree
-+ *
-+ */
-+struct tree_node {
-+	struct rb_node node; /* link in the main (un)stable rbtree */
-+	struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
-+	u32 hash;
-+	unsigned long count; /* TODO: merged with sub_root */
-+	struct list_head all_list; /* all tree nodes in stable/unstable tree */
-+};
-+
-+/**
-+ * struct stable_node - node of the stable rbtree
-+ * @node: rb node of this ksm page in the stable tree
-+ * @hlist: hlist head of rmap_items using this ksm page
-+ * @kpfn: page frame number of this ksm page
-+ */
-+struct stable_node {
-+	struct rb_node node; /* link in sub-rbtree */
-+	struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */
-+	struct hlist_head hlist;
-+	unsigned long kpfn;
-+	u32 hash_max; /* if ==0 then it's not been calculated yet */
-+	struct list_head all_list; /* in a list for all stable nodes */
-+};
-+
-+/**
-+ * struct node_vma - group rmap_items linked in a same stable
-+ * node together.
-+ */
-+struct node_vma {
-+	union {
-+		struct vma_slot *slot;
-+		unsigned long key;  /* slot is used as key sorted on hlist */
-+	};
-+	struct hlist_node hlist;
-+	struct hlist_head rmap_hlist;
-+	struct stable_node *head;
-+};
-+
-+/**
-+ * struct rmap_item - reverse mapping item for virtual addresses
-+ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
-+ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
-+ * @mm: the memory structure this rmap_item is pointing into
-+ * @address: the virtual address this rmap_item tracks (+ flags in low bits)
-+ * @node: rb node of this rmap_item in the unstable tree
-+ * @head: pointer to stable_node heading this list in the stable tree
-+ * @hlist: link into hlist of rmap_items hanging off that stable_node
-+ */
-+struct rmap_item {
-+	struct vma_slot *slot;
-+	struct page *page;
-+	unsigned long address;	/* + low bits used for flags below */
-+	unsigned long hash_round;
-+	unsigned long entry_index;
-+	union {
-+		struct {/* when in unstable tree */
-+			struct rb_node node;
-+			struct tree_node *tree_node;
-+			u32 hash_max;
-+		};
-+		struct { /* when in stable tree */
-+			struct node_vma *head;
-+			struct hlist_node hlist;
-+			struct anon_vma *anon_vma;
-+		};
-+	};
-+} __aligned(4);
-+
-+struct rmap_list_entry {
-+	union {
-+		struct rmap_item *item;
-+		unsigned long addr;
-+	};
-+	/* lowest bit is used for is_addr tag */
-+} __aligned(4); /* 4 aligned to fit in to pages*/
-+
-+
-+/* Basic data structure definition ends */
-+
-+
-+/*
-+ * Flags for rmap_item to judge if it's listed in the stable/unstable tree.
-+ * The flags use the low bits of rmap_item.address
-+ */
-+#define UNSTABLE_FLAG	0x1
-+#define STABLE_FLAG	0x2
-+#define get_rmap_addr(x)	((x)->address & PAGE_MASK)
-+
-+/*
-+ * rmap_list_entry helpers
-+ */
-+#define IS_ADDR_FLAG	1
-+#define is_addr(ptr)		((unsigned long)(ptr) & IS_ADDR_FLAG)
-+#define set_is_addr(ptr)	((ptr) |= IS_ADDR_FLAG)
-+#define get_clean_addr(ptr)	(((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
-+
-+
-+/*
-+ * High speed caches for frequently allocated and freed structs
-+ */
-+static struct kmem_cache *rmap_item_cache;
-+static struct kmem_cache *stable_node_cache;
-+static struct kmem_cache *node_vma_cache;
-+static struct kmem_cache *vma_slot_cache;
-+static struct kmem_cache *tree_node_cache;
-+#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\
-+		sizeof(struct __struct), __alignof__(struct __struct),\
-+		(__flags), NULL)
-+
-+/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */
-+#define SCAN_LADDER_SIZE 4
-+static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE];
-+
-+/* The evaluation rounds uksmd has finished */
-+static unsigned long long uksm_eval_round = 1;
-+
-+/*
-+ * we add 1 to this var when we consider we should rebuild the whole
-+ * unstable tree.
-+ */
-+static unsigned long uksm_hash_round = 1;
-+
-+/*
-+ * How many times the whole memory is scanned.
-+ */
-+static unsigned long long fully_scanned_round = 1;
-+
-+/* The total number of virtual pages of all vma slots */
-+static u64 uksm_pages_total;
-+
-+/* The number of pages has been scanned since the start up */
-+static u64 uksm_pages_scanned;
-+
-+static u64 scanned_virtual_pages;
-+
-+/* The number of pages has been scanned since last encode_benefit call */
-+static u64 uksm_pages_scanned_last;
-+
-+/* If the scanned number is too large, we encode it here */
-+static u64 pages_scanned_stored;
-+
-+static unsigned long pages_scanned_base;
-+
-+/* The number of nodes in the stable tree */
-+static unsigned long uksm_pages_shared;
-+
-+/* The number of page slots additionally sharing those nodes */
-+static unsigned long uksm_pages_sharing;
-+
-+/* The number of nodes in the unstable tree */
-+static unsigned long uksm_pages_unshared;
-+
-+/*
-+ * Milliseconds ksmd should sleep between scans,
-+ * >= 100ms to be consistent with
-+ * scan_time_to_sleep_msec()
-+ */
-+static unsigned int uksm_sleep_jiffies;
-+
-+/* The real value for the uksmd next sleep */
-+static unsigned int uksm_sleep_real;
-+
-+/* Saved value for user input uksm_sleep_jiffies when it's enlarged */
-+static unsigned int uksm_sleep_saved;
-+
-+/* Max percentage of cpu utilization ksmd can take to scan in one batch */
-+static unsigned int uksm_max_cpu_percentage;
-+
-+static int uksm_cpu_governor;
-+
-+static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" };
-+
-+struct uksm_cpu_preset_s {
-+	int cpu_ratio[SCAN_LADDER_SIZE];
-+	unsigned int cover_msecs[SCAN_LADDER_SIZE];
-+	unsigned int max_cpu; /* percentage */
-+};
-+
-+struct uksm_cpu_preset_s uksm_cpu_preset[4] = {
-+	{ {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95},
-+	{ {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50},
-+	{ {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20},
-+	{ {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1},
-+};
-+
-+/* The default value for uksm_ema_page_time if it's not initialized */
-+#define UKSM_PAGE_TIME_DEFAULT	500
-+
-+/* cost to scan one page, as an exponential moving average, in nsecs */
-+static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
-+
-+/* The exponential moving average alpha weight, in percentage. */
-+#define EMA_ALPHA	20
-+
-+/*
-+ * The threshold used to filter out thrashing areas,
-+ * If it == 0, filtering is disabled, otherwise it's the percentage up-bound
-+ * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio
-+ * will be considered as having a zero duplication ratio.
-+ */
-+static unsigned int uksm_thrash_threshold = 50;
-+
-+/* How much dedup ratio is considered to be abundant*/
-+static unsigned int uksm_abundant_threshold = 10;
-+
-+/* All slots having merged pages in this eval round. */
-+struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup);
-+
-+/* How many times the ksmd has slept since startup */
-+static unsigned long long uksm_sleep_times;
-+
-+#define UKSM_RUN_STOP	0
-+#define UKSM_RUN_MERGE	1
-+static unsigned int uksm_run = 1;
-+
-+static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
-+static DEFINE_MUTEX(uksm_thread_mutex);
-+
-+/*
-+ * List vma_slot_new is for newly created vma_slot waiting to be added by
-+ * ksmd. If one cannot be added (e.g. because it's too small), it's moved to
-+ * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding
-+ * VMA has been removed/freed.
-+ */
-+struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
-+struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
-+struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
-+static DEFINE_SPINLOCK(vma_slot_list_lock);
-+
-+/* The unstable tree heads */
-+static struct rb_root root_unstable_tree = RB_ROOT;
-+
-+/*
-+ * All tree_nodes are in a list to be freed at once when unstable tree is
-+ * freed after each scan round.
-+ */
-+static struct list_head unstable_tree_node_list =
-+				LIST_HEAD_INIT(unstable_tree_node_list);
-+
-+/* List contains all stable nodes */
-+static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
-+
-+/*
-+ * When the hash strength is changed, the stable tree must be delta_hashed and
-+ * re-structured. We use two sets of the structs below to speed up the
-+ * re-structuring of the stable tree.
-+ */
-+static struct list_head
-+stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
-+			    LIST_HEAD_INIT(stable_tree_node_list[1])};
-+
-+static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
-+static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
-+static struct rb_root *root_stable_treep = &root_stable_tree[0];
-+static unsigned long stable_tree_index;
-+
-+/* The hash strength needed to hash a full page */
-+#define HASH_STRENGTH_FULL		(PAGE_SIZE / sizeof(u32))
-+
-+/* The hash strength needed for loop-back hashing */
-+#define HASH_STRENGTH_MAX		(HASH_STRENGTH_FULL + 10)
-+
-+/* The random offsets in a page */
-+static u32 *random_nums;
-+
-+/* The hash strength */
-+static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
-+
-+/* The delta value each time the hash strength increases or decreases */
-+static unsigned long hash_strength_delta;
-+#define HASH_STRENGTH_DELTA_MAX	5
-+
-+/* The time we have saved due to random_sample_hash */
-+static u64 rshash_pos;
-+
-+/* The time we have wasted due to hash collision */
-+static u64 rshash_neg;
-+
-+struct uksm_benefit {
-+	u64 pos;
-+	u64 neg;
-+	u64 scanned;
-+	unsigned long base;
-+} benefit;
-+
-+/*
-+ * The relative cost of memcmp, compared to 1 time unit of random sample
-+ * hash, this value is tested when ksm module is initialized
-+ */
-+static unsigned long memcmp_cost;
-+
-+static unsigned long  rshash_neg_cont_zero;
-+static unsigned long  rshash_cont_obscure;
-+
-+/* The possible states of hash strength adjustment heuristic */
-+enum rshash_states {
-+		RSHASH_STILL,
-+		RSHASH_TRYUP,
-+		RSHASH_TRYDOWN,
-+		RSHASH_NEW,
-+		RSHASH_PRE_STILL,
-+};
-+
-+/* The possible direction we are about to adjust hash strength */
-+enum rshash_direct {
-+	GO_UP,
-+	GO_DOWN,
-+	OBSCURE,
-+	STILL,
-+};
-+
-+/* random sampling hash state machine */
-+static struct {
-+	enum rshash_states state;
-+	enum rshash_direct pre_direct;
-+	u8 below_count;
-+	/* Keep a lookup window of size 5, iff above_count/below_count > 3
-+	 * in this window we stop trying.
-+	 */
-+	u8 lookup_window_index;
-+	u64 stable_benefit;
-+	unsigned long turn_point_down;
-+	unsigned long turn_benefit_down;
-+	unsigned long turn_point_up;
-+	unsigned long turn_benefit_up;
-+	unsigned long stable_point;
-+} rshash_state;
-+
-+/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/
-+static u32 *zero_hash_table;
-+
-+static inline struct node_vma *alloc_node_vma(void)
-+{
-+	struct node_vma *node_vma;
-+
-+	node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL |
-+				     __GFP_NORETRY | __GFP_NOWARN);
-+	if (node_vma) {
-+		INIT_HLIST_HEAD(&node_vma->rmap_hlist);
-+		INIT_HLIST_NODE(&node_vma->hlist);
-+	}
-+	return node_vma;
-+}
-+
-+static inline void free_node_vma(struct node_vma *node_vma)
-+{
-+	kmem_cache_free(node_vma_cache, node_vma);
-+}
-+
-+
-+static inline struct vma_slot *alloc_vma_slot(void)
-+{
-+	struct vma_slot *slot;
-+
-+	/*
-+	 * In case ksm is not initialized by now.
-+	 * Oops, we need to consider the call site of uksm_init() in the future.
-+	 */
-+	if (!vma_slot_cache)
-+		return NULL;
-+
-+	slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL |
-+				 __GFP_NORETRY | __GFP_NOWARN);
-+	if (slot) {
-+		INIT_LIST_HEAD(&slot->slot_list);
-+		INIT_LIST_HEAD(&slot->dedup_list);
-+		slot->flags |= UKSM_SLOT_NEED_RERAND;
-+	}
-+	return slot;
-+}
-+
-+static inline void free_vma_slot(struct vma_slot *vma_slot)
-+{
-+	kmem_cache_free(vma_slot_cache, vma_slot);
-+}
-+
-+
-+
-+static inline struct rmap_item *alloc_rmap_item(void)
-+{
-+	struct rmap_item *rmap_item;
-+
-+	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
-+				      __GFP_NORETRY | __GFP_NOWARN);
-+	if (rmap_item) {
-+		/* BUG if the lowest bit is not clear: it is reserved for flag use */
-+		BUG_ON(is_addr(rmap_item));
-+	}
-+	return rmap_item;
-+}
-+
-+static inline void free_rmap_item(struct rmap_item *rmap_item)
-+{
-+	rmap_item->slot = NULL;	/* debug safety */
-+	kmem_cache_free(rmap_item_cache, rmap_item);
-+}
-+
-+static inline struct stable_node *alloc_stable_node(void)
-+{
-+	struct stable_node *node;
-+
-+	node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL |
-+				__GFP_NORETRY | __GFP_NOWARN);
-+	if (!node)
-+		return NULL;
-+
-+	INIT_HLIST_HEAD(&node->hlist);
-+	list_add(&node->all_list, &stable_node_list);
-+	return node;
-+}
-+
-+static inline void free_stable_node(struct stable_node *stable_node)
-+{
-+	list_del(&stable_node->all_list);
-+	kmem_cache_free(stable_node_cache, stable_node);
-+}
-+
-+static inline struct tree_node *alloc_tree_node(struct list_head *list)
-+{
-+	struct tree_node *node;
-+
-+	node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL |
-+				 __GFP_NORETRY | __GFP_NOWARN);
-+	if (!node)
-+		return NULL;
-+
-+	list_add(&node->all_list, list);
-+	return node;
-+}
-+
-+static inline void free_tree_node(struct tree_node *node)
-+{
-+	list_del(&node->all_list);
-+	kmem_cache_free(tree_node_cache, node);
-+}
-+
-+static void uksm_drop_anon_vma(struct rmap_item *rmap_item)
-+{
-+	struct anon_vma *anon_vma = rmap_item->anon_vma;
-+
-+	put_anon_vma(anon_vma);
-+}
-+
-+
-+/**
-+ * Remove a stable node from stable_tree, may unlink from its tree_node and
-+ * may remove its parent tree_node if no other stable node is pending.
-+ *
-+ * @stable_node	    The node need to be removed
-+ * @unlink_rb	    Will this node be unlinked from the rbtree?
-+ * @remove_tree_    node Will its tree_node be removed if empty?
-+ */
-+static void remove_node_from_stable_tree(struct stable_node *stable_node,
-+					 int unlink_rb,  int remove_tree_node)
-+{
-+	struct node_vma *node_vma;
-+	struct rmap_item *rmap_item;
-+	struct hlist_node *n;
-+
-+	if (!hlist_empty(&stable_node->hlist)) {
-+		hlist_for_each_entry_safe(node_vma, n,
-+					  &stable_node->hlist, hlist) {
-+			hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
-+				uksm_pages_sharing--;
-+
-+				uksm_drop_anon_vma(rmap_item);
-+				rmap_item->address &= PAGE_MASK;
-+			}
-+			free_node_vma(node_vma);
-+			cond_resched();
-+		}
-+
-+		/* the last one is counted as shared */
-+		uksm_pages_shared--;
-+		uksm_pages_sharing++;
-+	}
-+
-+	if (stable_node->tree_node && unlink_rb) {
-+		rb_erase(&stable_node->node,
-+			 &stable_node->tree_node->sub_root);
-+
-+		if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
-+		    remove_tree_node) {
-+			rb_erase(&stable_node->tree_node->node,
-+				 root_stable_treep);
-+			free_tree_node(stable_node->tree_node);
-+		} else {
-+			stable_node->tree_node->count--;
-+		}
-+	}
-+
-+	free_stable_node(stable_node);
-+}
-+
-+
-+/*
-+ * get_uksm_page: checks if the page indicated by the stable node
-+ * is still its ksm page, despite having held no reference to it.
-+ * In which case we can trust the content of the page, and it
-+ * returns the gotten page; but if the page has now been zapped,
-+ * remove the stale node from the stable tree and return NULL.
-+ *
-+ * You would expect the stable_node to hold a reference to the ksm page.
-+ * But if it increments the page's count, swapping out has to wait for
-+ * ksmd to come around again before it can free the page, which may take
-+ * seconds or even minutes: much too unresponsive.  So instead we use a
-+ * "keyhole reference": access to the ksm page from the stable node peeps
-+ * out through its keyhole to see if that page still holds the right key,
-+ * pointing back to this stable node.  This relies on freeing a PageAnon
-+ * page to reset its page->mapping to NULL, and relies on no other use of
-+ * a page to put something that might look like our key in page->mapping.
-+ *
-+ * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
-+ * but this is different - made simpler by uksm_thread_mutex being held, but
-+ * interesting for assuming that no other use of the struct page could ever
-+ * put our expected_mapping into page->mapping (or a field of the union which
-+ * coincides with page->mapping).  The RCU calls are not for KSM at all, but
-+ * to keep the page_count protocol described with page_cache_get_speculative.
-+ *
-+ * Note: it is possible that get_uksm_page() will return NULL one moment,
-+ * then page the next, if the page is in between page_freeze_refs() and
-+ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
-+ * is on its way to being freed; but it is an anomaly to bear in mind.
-+ *
-+ * @unlink_rb:			if the removal of this node will firstly unlink from
-+ * its rbtree. stable_node_reinsert will prevent this when restructuring the
-+ * node from its old tree.
-+ *
-+ * @remove_tree_node:	if this is the last one of its tree_node, will the
-+ * tree_node be freed ? If we are inserting stable node, this tree_node may
-+ * be reused, so don't free it.
-+ */
-+static struct page *get_uksm_page(struct stable_node *stable_node,
-+				 int unlink_rb, int remove_tree_node)
-+{
-+	struct page *page;
-+	void *expected_mapping;
-+	unsigned long kpfn;
-+
-+	expected_mapping = (void *)((unsigned long)stable_node |
-+				    PAGE_MAPPING_KSM);
-+again:
-+	kpfn = READ_ONCE(stable_node->kpfn);
-+	page = pfn_to_page(kpfn);
-+
-+	/*
-+	 * page is computed from kpfn, so on most architectures reading
-+	 * page->mapping is naturally ordered after reading node->kpfn,
-+	 * but on Alpha we need to be more careful.
-+	 */
-+	smp_rmb();
-+
-+	if (READ_ONCE(page->mapping) != expected_mapping)
-+		goto stale;
-+
-+	/*
-+	 * We cannot do anything with the page while its refcount is 0.
-+	 * Usually 0 means free, or tail of a higher-order page: in which
-+	 * case this node is no longer referenced, and should be freed;
-+	 * however, it might mean that the page is under page_freeze_refs().
-+	 * The __remove_mapping() case is easy, again the node is now stale;
-+	 * but if page is swapcache in migrate_page_move_mapping(), it might
-+	 * still be our page, in which case it's essential to keep the node.
-+	 */
-+	while (!get_page_unless_zero(page)) {
-+		/*
-+		 * Another check for page->mapping != expected_mapping would
-+		 * work here too.  We have chosen the !PageSwapCache test to
-+		 * optimize the common case, when the page is or is about to
-+		 * be freed: PageSwapCache is cleared (under spin_lock_irq)
-+		 * in the freeze_refs section of __remove_mapping(); but Anon
-+		 * page->mapping reset to NULL later, in free_pages_prepare().
-+		 */
-+		if (!PageSwapCache(page))
-+			goto stale;
-+		cpu_relax();
-+	}
-+
-+	if (READ_ONCE(page->mapping) != expected_mapping) {
-+		put_page(page);
-+		goto stale;
-+	}
-+
-+	lock_page(page);
-+	if (READ_ONCE(page->mapping) != expected_mapping) {
-+		unlock_page(page);
-+		put_page(page);
-+		goto stale;
-+	}
-+	unlock_page(page);
-+	return page;
-+stale:
-+	/*
-+	 * We come here from above when page->mapping or !PageSwapCache
-+	 * suggests that the node is stale; but it might be under migration.
-+	 * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
-+	 * before checking whether node->kpfn has been changed.
-+	 */
-+	smp_rmb();
-+	if (stable_node->kpfn != kpfn)
-+		goto again;
-+
-+	remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
-+
-+	return NULL;
-+}
-+
-+/*
-+ * Removing rmap_item from stable or unstable tree.
-+ * This function will clean the information from the stable/unstable tree.
-+ */
-+static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
-+{
-+	if (rmap_item->address & STABLE_FLAG) {
-+		struct stable_node *stable_node;
-+		struct node_vma *node_vma;
-+		struct page *page;
-+
-+		node_vma = rmap_item->head;
-+		stable_node = node_vma->head;
-+		page = get_uksm_page(stable_node, 1, 1);
-+		if (!page)
-+			goto out;
-+
-+		/*
-+		 * page lock is needed because it's racing with
-+		 * try_to_unmap_ksm(), etc.
-+		 */
-+		lock_page(page);
-+		hlist_del(&rmap_item->hlist);
-+
-+		if (hlist_empty(&node_vma->rmap_hlist)) {
-+			hlist_del(&node_vma->hlist);
-+			free_node_vma(node_vma);
-+		}
-+		unlock_page(page);
-+
-+		put_page(page);
-+		if (hlist_empty(&stable_node->hlist)) {
-+			/* do NOT call remove_node_from_stable_tree() here,
-+			 * it's possible for a forked rmap_item not in
-+			 * stable tree while the in-tree rmap_items were
-+			 * deleted.
-+			 */
-+			uksm_pages_shared--;
-+		} else
-+			uksm_pages_sharing--;
-+
-+
-+		uksm_drop_anon_vma(rmap_item);
-+	} else if (rmap_item->address & UNSTABLE_FLAG) {
-+		if (rmap_item->hash_round == uksm_hash_round) {
-+
-+			rb_erase(&rmap_item->node,
-+				 &rmap_item->tree_node->sub_root);
-+			if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
-+				rb_erase(&rmap_item->tree_node->node,
-+					 &root_unstable_tree);
-+
-+				free_tree_node(rmap_item->tree_node);
-+			} else
-+				rmap_item->tree_node->count--;
-+		}
-+		uksm_pages_unshared--;
-+	}
-+
-+	rmap_item->address &= PAGE_MASK;
-+	rmap_item->hash_max = 0;
-+
-+out:
-+	cond_resched();		/* we're called from many long loops */
-+}
-+
-+static inline int slot_in_uksm(struct vma_slot *slot)
-+{
-+	return list_empty(&slot->slot_list);
-+}
-+
-+/*
-+ * Test if the mm is exiting
-+ */
-+static inline bool uksm_test_exit(struct mm_struct *mm)
-+{
-+	return atomic_read(&mm->mm_users) == 0;
-+}
-+
-+static inline unsigned long vma_pool_size(struct vma_slot *slot)
-+{
-+	return round_up(sizeof(struct rmap_list_entry) * slot->pages,
-+			PAGE_SIZE) >> PAGE_SHIFT;
-+}
-+
-+#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))
-+
-+/* must be done with sem locked */
-+static int slot_pool_alloc(struct vma_slot *slot)
-+{
-+	unsigned long pool_size;
-+
-+	if (slot->rmap_list_pool)
-+		return 0;
-+
-+	pool_size = vma_pool_size(slot);
-+	slot->rmap_list_pool = kcalloc(pool_size, sizeof(struct page *),
-+				       GFP_KERNEL);
-+	if (!slot->rmap_list_pool)
-+		return -ENOMEM;
-+
-+	slot->pool_counts = kcalloc(pool_size, sizeof(unsigned int),
-+				    GFP_KERNEL);
-+	if (!slot->pool_counts) {
-+		kfree(slot->rmap_list_pool);
-+		return -ENOMEM;
-+	}
-+
-+	slot->pool_size = pool_size;
-+	BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages));
-+	slot->flags |= UKSM_SLOT_IN_UKSM;
-+	uksm_pages_total += slot->pages;
-+
-+	return 0;
-+}
-+
-+/*
-+ * Called after vma is unlinked from its mm
-+ */
-+void uksm_remove_vma(struct vm_area_struct *vma)
-+{
-+	struct vma_slot *slot;
-+
-+	if (!vma->uksm_vma_slot)
-+		return;
-+
-+	spin_lock(&vma_slot_list_lock);
-+	slot = vma->uksm_vma_slot;
-+	if (!slot)
-+		goto out;
-+
-+	if (slot_in_uksm(slot)) {
-+		/**
-+		 * This slot has been added by ksmd, so move to the del list
-+		 * waiting ksmd to free it.
-+		 */
-+		list_add_tail(&slot->slot_list, &vma_slot_del);
-+	} else {
-+		/**
-+		 * It's still on new list. It's ok to free slot directly.
-+		 */
-+		list_del(&slot->slot_list);
-+		free_vma_slot(slot);
-+	}
-+out:
-+	vma->uksm_vma_slot = NULL;
-+	spin_unlock(&vma_slot_list_lock);
-+}
-+
-+/**
-+ * Need to do two things:
-+ * 1. check if slot was moved to del list
-+ * 2. make sure the mmap_sem is manipulated under valid vma.
-+ *
-+ * My concern here is that in some cases, this may make
-+ * vma_slot_list_lock() waiters to serialized further by some
-+ * sem->wait_lock, can this really be expensive?
-+ *
-+ *
-+ * @return
-+ * 0: if successfully locked mmap_sem
-+ * -ENOENT: this slot was moved to del list
-+ * -EBUSY: vma lock failed
-+ */
-+static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
-+{
-+	struct vm_area_struct *vma;
-+	struct mm_struct *mm;
-+	struct rw_semaphore *sem;
-+
-+	spin_lock(&vma_slot_list_lock);
-+
-+	/* the slot_list was removed and inited from new list, when it enters
-+	 * uksm_list. If now it's not empty, then it must be moved to del list
-+	 */
-+	if (!slot_in_uksm(slot)) {
-+		spin_unlock(&vma_slot_list_lock);
-+		return -ENOENT;
-+	}
-+
-+	BUG_ON(slot->pages != vma_pages(slot->vma));
-+	/* Ok, vma still valid */
-+	vma = slot->vma;
-+	mm = vma->vm_mm;
-+	sem = &mm->mmap_lock;
-+
-+	if (uksm_test_exit(mm)) {
-+		spin_unlock(&vma_slot_list_lock);
-+		return -ENOENT;
-+	}
-+
-+	if (down_read_trylock(sem)) {
-+		spin_unlock(&vma_slot_list_lock);
-+		if (slot_pool_alloc(slot)) {
-+			uksm_remove_vma(vma);
-+			up_read(sem);
-+			return -ENOENT;
-+		}
-+		return 0;
-+	}
-+
-+	spin_unlock(&vma_slot_list_lock);
-+	return -EBUSY;
-+}
-+
-+static inline unsigned long
-+vma_page_address(struct page *page, struct vm_area_struct *vma)
-+{
-+	pgoff_t pgoff = page->index;
-+	unsigned long address;
-+
-+	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-+	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
-+		/* page should be within @vma mapping range */
-+		return -EFAULT;
-+	}
-+	return address;
-+}
-+
-+
-+/* return 0 on success with the item's mmap_sem locked */
-+static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
-+{
-+	struct mm_struct *mm;
-+	struct vma_slot *slot = item->slot;
-+	int err = -EINVAL;
-+
-+	struct page *page;
-+
-+	/*
-+	 * try_down_read_slot_mmap_sem() returns non-zero if the slot
-+	 * has been removed by uksm_remove_vma().
-+	 */
-+	if (try_down_read_slot_mmap_sem(slot))
-+		return -EBUSY;
-+
-+	mm = slot->vma->vm_mm;
-+
-+	if (uksm_test_exit(mm))
-+		goto failout_up;
-+
-+	page = item->page;
-+	rcu_read_lock();
-+	if (!get_page_unless_zero(page)) {
-+		rcu_read_unlock();
-+		goto failout_up;
-+	}
-+
-+	/* No need to consider huge page here. */
-+	if (item->slot->vma->anon_vma != page_anon_vma(page) ||
-+	    vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
-+		/*
-+		 * TODO:
-+		 * should we release this item becase of its stale page
-+		 * mapping?
-+		 */
-+		put_page(page);
-+		rcu_read_unlock();
-+		goto failout_up;
-+	}
-+	rcu_read_unlock();
-+	return 0;
-+
-+failout_up:
-+	mmap_read_unlock(mm);
-+	return err;
-+}
-+
-+/*
-+ * What kind of VMA is considered ?
-+ */
-+static inline int vma_can_enter(struct vm_area_struct *vma)
-+{
-+	return uksm_flags_can_scan(vma->vm_flags);
-+}
-+
-+/*
-+ * Called whenever a fresh new vma is created A new vma_slot.
-+ * is created and inserted into a global list Must be called.
-+ * after vma is inserted to its mm.
-+ */
-+void uksm_vma_add_new(struct vm_area_struct *vma)
-+{
-+	struct vma_slot *slot;
-+
-+	if (!vma_can_enter(vma)) {
-+		vma->uksm_vma_slot = NULL;
-+		return;
-+	}
-+
-+	slot = alloc_vma_slot();
-+	if (!slot) {
-+		vma->uksm_vma_slot = NULL;
-+		return;
-+	}
-+
-+	vma->uksm_vma_slot = slot;
-+	vma->vm_flags |= VM_MERGEABLE;
-+	slot->vma = vma;
-+	slot->mm = vma->vm_mm;
-+	slot->ctime_j = jiffies;
-+	slot->pages = vma_pages(vma);
-+	spin_lock(&vma_slot_list_lock);
-+	list_add_tail(&slot->slot_list, &vma_slot_new);
-+	spin_unlock(&vma_slot_list_lock);
-+}
-+
-+/*   32/3 < they < 32/2 */
-+#define shiftl	8
-+#define shiftr	12
-+
-+#define HASH_FROM_TO(from, to)			\
-+for (index = from; index < to; index++) {	\
-+	pos = random_nums[index];		\
-+	hash += key[pos];			\
-+	hash += (hash << shiftl);		\
-+	hash ^= (hash >> shiftr);		\
-+}
-+
-+
-+#define HASH_FROM_DOWN_TO(from, to)		\
-+for (index = from - 1; index >= to; index--) {	\
-+	hash ^= (hash >> shiftr);		\
-+	hash ^= (hash >> (shiftr*2));		\
-+	hash -= (hash << shiftl);		\
-+	hash += (hash << (shiftl*2));		\
-+	pos = random_nums[index];		\
-+	hash -= key[pos];			\
-+}
-+
-+/*
-+ * The main random sample hash function.
-+ */
-+static u32 random_sample_hash(void *addr, u32 hash_strength)
-+{
-+	u32 hash = 0xdeadbeef;
-+	int index, pos, loop = hash_strength;
-+	u32 *key = (u32 *)addr;
-+
-+	if (loop > HASH_STRENGTH_FULL)
-+		loop = HASH_STRENGTH_FULL;
-+
-+	HASH_FROM_TO(0, loop);
-+
-+	if (hash_strength > HASH_STRENGTH_FULL) {
-+		loop = hash_strength - HASH_STRENGTH_FULL;
-+		HASH_FROM_TO(0, loop);
-+	}
-+
-+	return hash;
-+}
-+
-+
-+/**
-+ * It's used when hash strength is adjusted
-+ *
-+ * @addr The page's virtual address
-+ * @from The original hash strength
-+ * @to   The hash strength changed to
-+ * @hash The hash value generated with "from" hash value
-+ *
-+ * return the hash value
-+ */
-+static u32 delta_hash(void *addr, int from, int to, u32 hash)
-+{
-+	u32 *key = (u32 *)addr;
-+	int index, pos; /* make sure they are int type */
-+
-+	if (to > from) {
-+		if (from >= HASH_STRENGTH_FULL) {
-+			from -= HASH_STRENGTH_FULL;
-+			to -= HASH_STRENGTH_FULL;
-+			HASH_FROM_TO(from, to);
-+		} else if (to <= HASH_STRENGTH_FULL) {
-+			HASH_FROM_TO(from, to);
-+		} else {
-+			HASH_FROM_TO(from, HASH_STRENGTH_FULL);
-+			HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
-+		}
-+	} else {
-+		if (from <= HASH_STRENGTH_FULL) {
-+			HASH_FROM_DOWN_TO(from, to);
-+		} else if (to >= HASH_STRENGTH_FULL) {
-+			from -= HASH_STRENGTH_FULL;
-+			to -= HASH_STRENGTH_FULL;
-+			HASH_FROM_DOWN_TO(from, to);
-+		} else {
-+			HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
-+			HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
-+		}
-+	}
-+
-+	return hash;
-+}
-+
-+/**
-+ *
-+ * Called when: rshash_pos or rshash_neg is about to overflow or a scan round
-+ * has finished.
-+ *
-+ * return 0 if no page has been scanned since last call, 1 otherwise.
-+ */
-+static inline int encode_benefit(void)
-+{
-+	u64 scanned_delta, pos_delta, neg_delta;
-+	unsigned long base = benefit.base;
-+
-+	scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last;
-+
-+	if (!scanned_delta)
-+		return 0;
-+
-+	scanned_delta >>= base;
-+	pos_delta = rshash_pos >> base;
-+	neg_delta = rshash_neg >> base;
-+
-+	if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
-+	    CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
-+	    CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
-+		benefit.scanned >>= 1;
-+		benefit.neg >>= 1;
-+		benefit.pos >>= 1;
-+		benefit.base++;
-+		scanned_delta >>= 1;
-+		pos_delta >>= 1;
-+		neg_delta >>= 1;
-+	}
-+
-+	benefit.pos += pos_delta;
-+	benefit.neg += neg_delta;
-+	benefit.scanned += scanned_delta;
-+
-+	BUG_ON(!benefit.scanned);
-+
-+	rshash_pos = rshash_neg = 0;
-+	uksm_pages_scanned_last = uksm_pages_scanned;
-+
-+	return 1;
-+}
-+
-+static inline void reset_benefit(void)
-+{
-+	benefit.pos = 0;
-+	benefit.neg = 0;
-+	benefit.base = 0;
-+	benefit.scanned = 0;
-+}
-+
-+static inline void inc_rshash_pos(unsigned long delta)
-+{
-+	if (CAN_OVERFLOW_U64(rshash_pos, delta))
-+		encode_benefit();
-+
-+	rshash_pos += delta;
-+}
-+
-+static inline void inc_rshash_neg(unsigned long delta)
-+{
-+	if (CAN_OVERFLOW_U64(rshash_neg, delta))
-+		encode_benefit();
-+
-+	rshash_neg += delta;
-+}
-+
-+
-+static inline u32 page_hash(struct page *page, unsigned long hash_strength,
-+			    int cost_accounting)
-+{
-+	u32 val;
-+	unsigned long delta;
-+
-+	void *addr = kmap_atomic(page);
-+
-+	val = random_sample_hash(addr, hash_strength);
-+	kunmap_atomic(addr);
-+
-+	if (cost_accounting) {
-+		if (hash_strength < HASH_STRENGTH_FULL)
-+			delta = HASH_STRENGTH_FULL - hash_strength;
-+		else
-+			delta = 0;
-+
-+		inc_rshash_pos(delta);
-+	}
-+
-+	return val;
-+}
-+
-+static int memcmp_pages_with_cost(struct page *page1, struct page *page2,
-+			int cost_accounting)
-+{
-+	char *addr1, *addr2;
-+	int ret;
-+
-+	addr1 = kmap_atomic(page1);
-+	addr2 = kmap_atomic(page2);
-+	ret = memcmp(addr1, addr2, PAGE_SIZE);
-+	kunmap_atomic(addr2);
-+	kunmap_atomic(addr1);
-+
-+	if (cost_accounting)
-+		inc_rshash_neg(memcmp_cost);
-+
-+	return ret;
-+}
-+
-+static inline int pages_identical_with_cost(struct page *page1, struct page *page2)
-+{
-+	return !memcmp_pages_with_cost(page1, page2, 0);
-+}
-+
-+static inline int is_page_full_zero(struct page *page)
-+{
-+	char *addr;
-+	int ret;
-+
-+	addr = kmap_atomic(page);
-+	ret = is_full_zero(addr, PAGE_SIZE);
-+	kunmap_atomic(addr);
-+
-+	return ret;
-+}
-+
-+static int write_protect_page(struct vm_area_struct *vma, struct page *page,
-+			      pte_t *orig_pte, pte_t *old_pte)
-+{
-+	struct mm_struct *mm = vma->vm_mm;
-+	struct page_vma_mapped_walk pvmw = {
-+		.page = page,
-+		.vma = vma,
-+	};
-+       struct mmu_notifier_range range;
-+	int swapped;
-+	int err = -EFAULT;
-+
-+	pvmw.address = page_address_in_vma(page, vma);
-+	if (pvmw.address == -EFAULT)
-+		goto out;
-+
-+	BUG_ON(PageTransCompound(page));
-+
-+        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, pvmw.address,
-+                                pvmw.address + PAGE_SIZE);
-+	mmu_notifier_invalidate_range_start(&range);
-+
-+	if (!page_vma_mapped_walk(&pvmw))
-+		goto out_mn;
-+	if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
-+		goto out_unlock;
-+
-+	if (old_pte)
-+		*old_pte = *pvmw.pte;
-+
-+	if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
-+	    (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || mm_tlb_flush_pending(mm)) {
-+		pte_t entry;
-+
-+		swapped = PageSwapCache(page);
-+		flush_cache_page(vma, pvmw.address, page_to_pfn(page));
-+		/*
-+		 * Ok this is tricky, when get_user_pages_fast() run it doesn't
-+		 * take any lock, therefore the check that we are going to make
-+		 * with the pagecount against the mapcount is racey and
-+		 * O_DIRECT can happen right after the check.
-+		 * So we clear the pte and flush the tlb before the check
-+		 * this assure us that no O_DIRECT can happen after the check
-+		 * or in the middle of the check.
-+		 */
-+		entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
-+		/*
-+		 * Check that no O_DIRECT or similar I/O is in progress on the
-+		 * page
-+		 */
-+		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
-+			set_pte_at(mm, pvmw.address, pvmw.pte, entry);
-+			goto out_unlock;
-+		}
-+		if (pte_dirty(entry))
-+			set_page_dirty(page);
-+
-+		if (pte_protnone(entry))
-+			entry = pte_mkclean(pte_clear_savedwrite(entry));
-+		else
-+			entry = pte_mkclean(pte_wrprotect(entry));
-+
-+		set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
-+	}
-+	*orig_pte = *pvmw.pte;
-+	err = 0;
-+
-+out_unlock:
-+	page_vma_mapped_walk_done(&pvmw);
-+out_mn:
-+	mmu_notifier_invalidate_range_end(&range);
-+out:
-+	return err;
-+}
-+
-+#define MERGE_ERR_PGERR		1 /* the page is invalid cannot continue */
-+#define MERGE_ERR_COLLI		2 /* there is a collision */
-+#define MERGE_ERR_COLLI_MAX	3 /* collision at the max hash strength */
-+#define MERGE_ERR_CHANGED	4 /* the page has changed since last hash */
-+
-+
-+/**
-+ * replace_page - replace page in vma by new ksm page
-+ * @vma:      vma that holds the pte pointing to page
-+ * @page:     the page we are replacing by kpage
-+ * @kpage:    the ksm page we replace page by
-+ * @orig_pte: the original value of the pte
-+ *
-+ * Returns 0 on success, MERGE_ERR_PGERR on failure.
-+ */
-+static int replace_page(struct vm_area_struct *vma, struct page *page,
-+			struct page *kpage, pte_t orig_pte)
-+{
-+	struct mm_struct *mm = vma->vm_mm;
-+       struct mmu_notifier_range range;
-+	pgd_t *pgd;
-+	p4d_t *p4d;
-+	pud_t *pud;
-+	pmd_t *pmd;
-+	pte_t *ptep;
-+	spinlock_t *ptl;
-+	pte_t entry;
-+
-+	unsigned long addr;
-+	int err = MERGE_ERR_PGERR;
-+
-+	addr = page_address_in_vma(page, vma);
-+	if (addr == -EFAULT)
-+		goto out;
-+
-+	pgd = pgd_offset(mm, addr);
-+	if (!pgd_present(*pgd))
-+		goto out;
-+
-+	p4d = p4d_offset(pgd, addr);
-+	pud = pud_offset(p4d, addr);
-+	if (!pud_present(*pud))
-+		goto out;
-+
-+	pmd = pmd_offset(pud, addr);
-+	BUG_ON(pmd_trans_huge(*pmd));
-+	if (!pmd_present(*pmd))
-+		goto out;
-+
-+        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
-+                                addr + PAGE_SIZE);
-+	mmu_notifier_invalidate_range_start(&range);
-+
-+	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
-+	if (!pte_same(*ptep, orig_pte)) {
-+		pte_unmap_unlock(ptep, ptl);
-+		goto out_mn;
-+	}
-+
-+	flush_cache_page(vma, addr, pte_pfn(*ptep));
-+	ptep_clear_flush_notify(vma, addr, ptep);
-+	entry = mk_pte(kpage, vma->vm_page_prot);
-+
-+	/* special treatment is needed for zero_page */
-+	if ((page_to_pfn(kpage) == uksm_zero_pfn) ||
-+				(page_to_pfn(kpage) == zero_pfn)) {
-+		entry = pte_mkspecial(entry);
-+		dec_mm_counter(mm, MM_ANONPAGES);
-+		inc_zone_page_state(page, NR_UKSM_ZERO_PAGES);
-+	} else {
-+		get_page(kpage);
-+		page_add_anon_rmap(kpage, vma, addr, false);
-+	}
-+
-+	set_pte_at_notify(mm, addr, ptep, entry);
-+
-+	page_remove_rmap(page, false);
-+	if (!page_mapped(page))
-+		try_to_free_swap(page);
-+	put_page(page);
-+
-+	pte_unmap_unlock(ptep, ptl);
-+	err = 0;
-+out_mn:
-+	mmu_notifier_invalidate_range_end(&range);
-+out:
-+	return err;
-+}
-+
-+
-+/**
-+ *  Fully hash a page with HASH_STRENGTH_MAX return a non-zero hash value. The
-+ *  zero hash value at HASH_STRENGTH_MAX is used to indicated that its
-+ *  hash_max member has not been calculated.
-+ *
-+ * @page The page needs to be hashed
-+ * @hash_old The hash value calculated with current hash strength
-+ *
-+ * return the new hash value calculated at HASH_STRENGTH_MAX
-+ */
-+static inline u32 page_hash_max(struct page *page, u32 hash_old)
-+{
-+	u32 hash_max = 0;
-+	void *addr;
-+
-+	addr = kmap_atomic(page);
-+	hash_max = delta_hash(addr, hash_strength,
-+			      HASH_STRENGTH_MAX, hash_old);
-+
-+	kunmap_atomic(addr);
-+
-+	if (!hash_max)
-+		hash_max = 1;
-+
-+	inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
-+	return hash_max;
-+}
-+
-+/*
-+ * We compare the hash again, to ensure that it is really a hash collision
-+ * instead of being caused by page write.
-+ */
-+static inline int check_collision(struct rmap_item *rmap_item,
-+				  u32 hash)
-+{
-+	int err;
-+	struct page *page = rmap_item->page;
-+
-+	/* if this rmap_item has already been hash_maxed, then the collision
-+	 * must appears in the second-level rbtree search. In this case we check
-+	 * if its hash_max value has been changed. Otherwise, the collision
-+	 * happens in the first-level rbtree search, so we check against it's
-+	 * current hash value.
-+	 */
-+	if (rmap_item->hash_max) {
-+		inc_rshash_neg(memcmp_cost);
-+		inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
-+
-+		if (rmap_item->hash_max == page_hash_max(page, hash))
-+			err = MERGE_ERR_COLLI;
-+		else
-+			err = MERGE_ERR_CHANGED;
-+	} else {
-+		inc_rshash_neg(memcmp_cost + hash_strength);
-+
-+		if (page_hash(page, hash_strength, 0) == hash)
-+			err = MERGE_ERR_COLLI;
-+		else
-+			err = MERGE_ERR_CHANGED;
-+	}
-+
-+	return err;
-+}
-+
-+/**
-+ * Try to merge a rmap_item.page with a kpage in stable node. kpage must
-+ * already be a ksm page.
-+ *
-+ * @return 0 if the pages were merged, -EFAULT otherwise.
-+ */
-+static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item,
-+				      struct page *kpage, u32 hash)
-+{
-+	struct vm_area_struct *vma = rmap_item->slot->vma;
-+	struct mm_struct *mm = vma->vm_mm;
-+	pte_t orig_pte = __pte(0);
-+	int err = MERGE_ERR_PGERR;
-+	struct page *page;
-+
-+	if (uksm_test_exit(mm))
-+		goto out;
-+
-+	page = rmap_item->page;
-+
-+	if (page == kpage) { /* ksm page forked */
-+		err = 0;
-+		goto out;
-+	}
-+
-+	/*
-+	 * We need the page lock to read a stable PageSwapCache in
-+	 * write_protect_page().  We use trylock_page() instead of
-+	 * lock_page() because we don't want to wait here - we
-+	 * prefer to continue scanning and merging different pages,
-+	 * then come back to this page when it is unlocked.
-+	 */
-+	if (!trylock_page(page))
-+		goto out;
-+
-+	if (!PageAnon(page) || !PageKsm(kpage))
-+		goto out_unlock;
-+
-+	if (PageTransCompound(page)) {
-+		err = split_huge_page(page);
-+		if (err)
-+			goto out_unlock;
-+	}
-+
-+	/*
-+	 * If this anonymous page is mapped only here, its pte may need
-+	 * to be write-protected.  If it's mapped elsewhere, all of its
-+	 * ptes are necessarily already write-protected.  But in either
-+	 * case, we need to lock and check page_count is not raised.
-+	 */
-+	if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
-+		if (pages_identical_with_cost(page, kpage))
-+			err = replace_page(vma, page, kpage, orig_pte);
-+		else
-+			err = check_collision(rmap_item, hash);
-+	}
-+
-+	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
-+		munlock_vma_page(page);
-+		if (!PageMlocked(kpage)) {
-+			unlock_page(page);
-+			lock_page(kpage);
-+			mlock_vma_page(kpage);
-+			page = kpage;		/* for final unlock */
-+		}
-+	}
-+
-+out_unlock:
-+	unlock_page(page);
-+out:
-+	return err;
-+}
-+
-+
-+
-+/**
-+ * If two pages fail to merge in try_to_merge_two_pages, then we have a chance
-+ * to restore a page mapping that has been changed in try_to_merge_two_pages.
-+ *
-+ * @return 0 on success.
-+ */
-+static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
-+			     pte_t orig_pte, pte_t wprt_pte)
-+{
-+	struct mm_struct *mm = vma->vm_mm;
-+	pgd_t *pgd;
-+	p4d_t *p4d;
-+	pud_t *pud;
-+	pmd_t *pmd;
-+	pte_t *ptep;
-+	spinlock_t *ptl;
-+
-+	int err = -EFAULT;
-+
-+	pgd = pgd_offset(mm, addr);
-+	if (!pgd_present(*pgd))
-+		goto out;
-+
-+	p4d = p4d_offset(pgd, addr);
-+	pud = pud_offset(p4d, addr);
-+	if (!pud_present(*pud))
-+		goto out;
-+
-+	pmd = pmd_offset(pud, addr);
-+	if (!pmd_present(*pmd))
-+		goto out;
-+
-+	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
-+	if (!pte_same(*ptep, wprt_pte)) {
-+		/* already copied, let it be */
-+		pte_unmap_unlock(ptep, ptl);
-+		goto out;
-+	}
-+
-+	/*
-+	 * Good boy, still here. When we still get the ksm page, it does not
-+	 * return to the free page pool, there is no way that a pte was changed
-+	 * to other page and gets back to this page. And remind that ksm page
-+	 * do not reuse in do_wp_page(). So it's safe to restore the original
-+	 * pte.
-+	 */
-+	flush_cache_page(vma, addr, pte_pfn(*ptep));
-+	ptep_clear_flush_notify(vma, addr, ptep);
-+	set_pte_at_notify(mm, addr, ptep, orig_pte);
-+
-+	pte_unmap_unlock(ptep, ptl);
-+	err = 0;
-+out:
-+	return err;
-+}
-+
-+/**
-+ * try_to_merge_two_pages() - take two identical pages and prepare
-+ * them to be merged into one page(rmap_item->page)
-+ *
-+ * @return 0 if we successfully merged two identical pages into
-+ *         one ksm page. MERGE_ERR_COLLI if it's only a hash collision
-+ *         search in rbtree. MERGE_ERR_CHANGED if rmap_item has been
-+ *         changed since it's hashed. MERGE_ERR_PGERR otherwise.
-+ *
-+ */
-+static int try_to_merge_two_pages(struct rmap_item *rmap_item,
-+				  struct rmap_item *tree_rmap_item,
-+				  u32 hash)
-+{
-+	pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
-+	pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
-+	struct vm_area_struct *vma1 = rmap_item->slot->vma;
-+	struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
-+	struct page *page = rmap_item->page;
-+	struct page *tree_page = tree_rmap_item->page;
-+	int err = MERGE_ERR_PGERR;
-+	struct address_space *saved_mapping;
-+
-+
-+	if (rmap_item->page == tree_rmap_item->page)
-+		goto out;
-+
-+	if (!trylock_page(page))
-+		goto out;
-+
-+	if (!PageAnon(page))
-+		goto out_unlock;
-+
-+	if (PageTransCompound(page)) {
-+		err = split_huge_page(page);
-+		if (err)
-+			goto out_unlock;
-+	}
-+
-+	if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
-+		unlock_page(page);
-+		goto out;
-+	}
-+
-+	/*
-+	 * While we hold page lock, upgrade page from
-+	 * PageAnon+anon_vma to PageKsm+NULL stable_node:
-+	 * stable_tree_insert() will update stable_node.
-+	 */
-+	saved_mapping = page->mapping;
-+	set_page_stable_node(page, NULL);
-+	mark_page_accessed(page);
-+	if (!PageDirty(page))
-+		SetPageDirty(page);
-+
-+	unlock_page(page);
-+
-+	if (!trylock_page(tree_page))
-+		goto restore_out;
-+
-+	if (!PageAnon(tree_page)) {
-+		unlock_page(tree_page);
-+		goto restore_out;
-+	}
-+
-+	if (PageTransCompound(tree_page)) {
-+		err = split_huge_page(tree_page);
-+		if (err) {
-+			unlock_page(tree_page);
-+			goto restore_out;
-+		}
-+	}
-+
-+	if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
-+		unlock_page(tree_page);
-+		goto restore_out;
-+	}
-+
-+	if (pages_identical_with_cost(page, tree_page)) {
-+		err = replace_page(vma2, tree_page, page, wprt_pte2);
-+		if (err) {
-+			unlock_page(tree_page);
-+			goto restore_out;
-+		}
-+
-+		if ((vma2->vm_flags & VM_LOCKED)) {
-+			munlock_vma_page(tree_page);
-+			if (!PageMlocked(page)) {
-+				unlock_page(tree_page);
-+				lock_page(page);
-+				mlock_vma_page(page);
-+				tree_page = page; /* for final unlock */
-+			}
-+		}
-+
-+		unlock_page(tree_page);
-+
-+		goto out; /* success */
-+
-+	} else {
-+		if (tree_rmap_item->hash_max &&
-+		    tree_rmap_item->hash_max == rmap_item->hash_max) {
-+			err = MERGE_ERR_COLLI_MAX;
-+		} else if (page_hash(page, hash_strength, 0) ==
-+		    page_hash(tree_page, hash_strength, 0)) {
-+			inc_rshash_neg(memcmp_cost + hash_strength * 2);
-+			err = MERGE_ERR_COLLI;
-+		} else {
-+			err = MERGE_ERR_CHANGED;
-+		}
-+
-+		unlock_page(tree_page);
-+	}
-+
-+restore_out:
-+	lock_page(page);
-+	if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item),
-+				  orig_pte1, wprt_pte1))
-+		page->mapping = saved_mapping;
-+
-+out_unlock:
-+	unlock_page(page);
-+out:
-+	return err;
-+}
-+
-+static inline int hash_cmp(u32 new_val, u32 node_val)
-+{
-+	if (new_val > node_val)
-+		return 1;
-+	else if (new_val < node_val)
-+		return -1;
-+	else
-+		return 0;
-+}
-+
-+static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
-+{
-+	u32 hash_max = item->hash_max;
-+
-+	if (!hash_max) {
-+		hash_max = page_hash_max(item->page, hash);
-+
-+		item->hash_max = hash_max;
-+	}
-+
-+	return hash_max;
-+}
-+
-+
-+
-+/**
-+ * stable_tree_search() - search the stable tree for a page
-+ *
-+ * @item:	the rmap_item we are comparing with
-+ * @hash:	the hash value of this item->page already calculated
-+ *
-+ * @return	the page we have found, NULL otherwise. The page returned has
-+ *			been gotten.
-+ */
-+static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
-+{
-+	struct rb_node *node = root_stable_treep->rb_node;
-+	struct tree_node *tree_node;
-+	unsigned long hash_max;
-+	struct page *page = item->page;
-+	struct stable_node *stable_node;
-+
-+	stable_node = page_stable_node(page);
-+	if (stable_node) {
-+		/* ksm page forked, that is
-+		 * if (PageKsm(page) && !in_stable_tree(rmap_item))
-+		 * it's actually gotten once outside.
-+		 */
-+		get_page(page);
-+		return page;
-+	}
-+
-+	while (node) {
-+		int cmp;
-+
-+		tree_node = rb_entry(node, struct tree_node, node);
-+
-+		cmp = hash_cmp(hash, tree_node->hash);
-+
-+		if (cmp < 0)
-+			node = node->rb_left;
-+		else if (cmp > 0)
-+			node = node->rb_right;
-+		else
-+			break;
-+	}
-+
-+	if (!node)
-+		return NULL;
-+
-+	if (tree_node->count == 1) {
-+		stable_node = rb_entry(tree_node->sub_root.rb_node,
-+				       struct stable_node, node);
-+		BUG_ON(!stable_node);
-+
-+		goto get_page_out;
-+	}
-+
-+	/*
-+	 * ok, we have to search the second
-+	 * level subtree, hash the page to a
-+	 * full strength.
-+	 */
-+	node = tree_node->sub_root.rb_node;
-+	BUG_ON(!node);
-+	hash_max = rmap_item_hash_max(item, hash);
-+
-+	while (node) {
-+		int cmp;
-+
-+		stable_node = rb_entry(node, struct stable_node, node);
-+
-+		cmp = hash_cmp(hash_max, stable_node->hash_max);
-+
-+		if (cmp < 0)
-+			node = node->rb_left;
-+		else if (cmp > 0)
-+			node = node->rb_right;
-+		else
-+			goto get_page_out;
-+	}
-+
-+	return NULL;
-+
-+get_page_out:
-+	page = get_uksm_page(stable_node, 1, 1);
-+	return page;
-+}
-+
-+static int try_merge_rmap_item(struct rmap_item *item,
-+			       struct page *kpage,
-+			       struct page *tree_page)
-+{
-+	struct vm_area_struct *vma = item->slot->vma;
-+	struct page_vma_mapped_walk pvmw = {
-+		.page = kpage,
-+		.vma = vma,
-+	};
-+
-+	pvmw.address = get_rmap_addr(item);
-+	if (!page_vma_mapped_walk(&pvmw))
-+		return 0;
-+
-+	if (pte_write(*pvmw.pte)) {
-+		/* has changed, abort! */
-+		page_vma_mapped_walk_done(&pvmw);
-+		return 0;
-+	}
-+
-+	get_page(tree_page);
-+	page_add_anon_rmap(tree_page, vma, pvmw.address, false);
-+
-+	flush_cache_page(vma, pvmw.address, page_to_pfn(kpage));
-+	ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
-+	set_pte_at_notify(vma->vm_mm, pvmw.address, pvmw.pte,
-+			  mk_pte(tree_page, vma->vm_page_prot));
-+
-+	page_remove_rmap(kpage, false);
-+	put_page(kpage);
-+
-+	page_vma_mapped_walk_done(&pvmw);
-+
-+	return 1;
-+}
-+
-+/**
-+ * try_to_merge_with_stable_page() - when two rmap_items need to be inserted
-+ * into stable tree, the page was found to be identical to a stable ksm page,
-+ * this is the last chance we can merge them into one.
-+ *
-+ * @item1:	the rmap_item holding the page which we wanted to insert
-+ *		into stable tree.
-+ * @item2:	the other rmap_item we found when unstable tree search
-+ * @oldpage:	the page currently mapped by the two rmap_items
-+ * @tree_page:	the page we found identical in stable tree node
-+ * @success1:	return if item1 is successfully merged
-+ * @success2:	return if item2 is successfully merged
-+ */
-+static void try_merge_with_stable(struct rmap_item *item1,
-+				  struct rmap_item *item2,
-+				  struct page **kpage,
-+				  struct page *tree_page,
-+				  int *success1, int *success2)
-+{
-+	struct vm_area_struct *vma1 = item1->slot->vma;
-+	struct vm_area_struct *vma2 = item2->slot->vma;
-+	*success1 = 0;
-+	*success2 = 0;
-+
-+	if (unlikely(*kpage == tree_page)) {
-+		/* I don't think this can really happen */
-+		pr_warn("UKSM: unexpected condition detected in "
-+			"%s -- *kpage == tree_page !\n", __func__);
-+		*success1 = 1;
-+		*success2 = 1;
-+		return;
-+	}
-+
-+	if (!PageAnon(*kpage) || !PageKsm(*kpage))
-+		goto failed;
-+
-+	if (!trylock_page(tree_page))
-+		goto failed;
-+
-+	/* If the oldpage is still ksm and still pointed
-+	 * to in the right place, and still write protected,
-+	 * we are confident it's not changed, no need to
-+	 * memcmp anymore.
-+	 * be ware, we cannot take nested pte locks,
-+	 * deadlock risk.
-+	 */
-+	if (!try_merge_rmap_item(item1, *kpage, tree_page))
-+		goto unlock_failed;
-+
-+	/* ok, then vma2, remind that pte1 already set */
-+	if (!try_merge_rmap_item(item2, *kpage, tree_page))
-+		goto success_1;
-+
-+	*success2 = 1;
-+success_1:
-+	*success1 = 1;
-+
-+
-+	if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
-+	    (*success2 && vma2->vm_flags & VM_LOCKED)) {
-+		munlock_vma_page(*kpage);
-+		if (!PageMlocked(tree_page))
-+			mlock_vma_page(tree_page);
-+	}
-+
-+	/*
-+	 * We do not need oldpage any more in the caller, so can break the lock
-+	 * now.
-+	 */
-+	unlock_page(*kpage);
-+	*kpage = tree_page; /* Get unlocked outside. */
-+	return;
-+
-+unlock_failed:
-+	unlock_page(tree_page);
-+failed:
-+	return;
-+}
-+
-+static inline void stable_node_hash_max(struct stable_node *node,
-+					 struct page *page, u32 hash)
-+{
-+	u32 hash_max = node->hash_max;
-+
-+	if (!hash_max) {
-+		hash_max = page_hash_max(page, hash);
-+		node->hash_max = hash_max;
-+	}
-+}
-+
-+static inline
-+struct stable_node *new_stable_node(struct tree_node *tree_node,
-+				    struct page *kpage, u32 hash_max)
-+{
-+	struct stable_node *new_stable_node;
-+
-+	new_stable_node = alloc_stable_node();
-+	if (!new_stable_node)
-+		return NULL;
-+
-+	new_stable_node->kpfn = page_to_pfn(kpage);
-+	new_stable_node->hash_max = hash_max;
-+	new_stable_node->tree_node = tree_node;
-+	set_page_stable_node(kpage, new_stable_node);
-+
-+	return new_stable_node;
-+}
-+
-+static inline
-+struct stable_node *first_level_insert(struct tree_node *tree_node,
-+				       struct rmap_item *rmap_item,
-+				       struct rmap_item *tree_rmap_item,
-+				       struct page **kpage, u32 hash,
-+				       int *success1, int *success2)
-+{
-+	int cmp;
-+	struct page *tree_page;
-+	u32 hash_max = 0;
-+	struct stable_node *stable_node, *new_snode;
-+	struct rb_node *parent = NULL, **new;
-+
-+	/* this tree node contains no sub-tree yet */
-+	stable_node = rb_entry(tree_node->sub_root.rb_node,
-+			       struct stable_node, node);
-+
-+	tree_page = get_uksm_page(stable_node, 1, 0);
-+	if (tree_page) {
-+		cmp = memcmp_pages_with_cost(*kpage, tree_page, 1);
-+		if (!cmp) {
-+			try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
-+					      tree_page, success1, success2);
-+			put_page(tree_page);
-+			if (!*success1 && !*success2)
-+				goto failed;
-+
-+			return stable_node;
-+
-+		} else {
-+			/*
-+			 * collision in first level try to create a subtree.
-+			 * A new node need to be created.
-+			 */
-+			put_page(tree_page);
-+
-+			stable_node_hash_max(stable_node, tree_page,
-+					     tree_node->hash);
-+			hash_max = rmap_item_hash_max(rmap_item, hash);
-+			cmp = hash_cmp(hash_max, stable_node->hash_max);
-+
-+			parent = &stable_node->node;
-+			if (cmp < 0)
-+				new = &parent->rb_left;
-+			else if (cmp > 0)
-+				new = &parent->rb_right;
-+			else
-+				goto failed;
-+		}
-+
-+	} else {
-+		/* the only stable_node deleted, we reuse its tree_node.
-+		 */
-+		parent = NULL;
-+		new = &tree_node->sub_root.rb_node;
-+	}
-+
-+	new_snode = new_stable_node(tree_node, *kpage, hash_max);
-+	if (!new_snode)
-+		goto failed;
-+
-+	rb_link_node(&new_snode->node, parent, new);
-+	rb_insert_color(&new_snode->node, &tree_node->sub_root);
-+	tree_node->count++;
-+	*success1 = *success2 = 1;
-+
-+	return new_snode;
-+
-+failed:
-+	return NULL;
-+}
-+
-+static inline
-+struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
-+					  struct rmap_item *rmap_item,
-+					  struct rmap_item *tree_rmap_item,
-+					  struct page **kpage, u32 hash,
-+					  int *success1, int *success2)
-+{
-+	struct page *tree_page;
-+	u32 hash_max;
-+	struct stable_node *stable_node, *new_snode;
-+	struct rb_node *parent, **new;
-+
-+research:
-+	parent = NULL;
-+	new = &tree_node->sub_root.rb_node;
-+	BUG_ON(!*new);
-+	hash_max = rmap_item_hash_max(rmap_item, hash);
-+	while (*new) {
-+		int cmp;
-+
-+		stable_node = rb_entry(*new, struct stable_node, node);
-+
-+		cmp = hash_cmp(hash_max, stable_node->hash_max);
-+
-+		if (cmp < 0) {
-+			parent = *new;
-+			new = &parent->rb_left;
-+		} else if (cmp > 0) {
-+			parent = *new;
-+			new = &parent->rb_right;
-+		} else {
-+			tree_page = get_uksm_page(stable_node, 1, 0);
-+			if (tree_page) {
-+				cmp = memcmp_pages_with_cost(*kpage, tree_page, 1);
-+				if (!cmp) {
-+					try_merge_with_stable(rmap_item,
-+						tree_rmap_item, kpage,
-+						tree_page, success1, success2);
-+
-+					put_page(tree_page);
-+					if (!*success1 && !*success2)
-+						goto failed;
-+					/*
-+					 * successfully merged with a stable
-+					 * node
-+					 */
-+					return stable_node;
-+				} else {
-+					put_page(tree_page);
-+					goto failed;
-+				}
-+			} else {
-+				/*
-+				 * stable node may be deleted,
-+				 * and subtree maybe
-+				 * restructed, cannot
-+				 * continue, research it.
-+				 */
-+				if (tree_node->count) {
-+					goto research;
-+				} else {
-+					/* reuse the tree node*/
-+					parent = NULL;
-+					new = &tree_node->sub_root.rb_node;
-+				}
-+			}
-+		}
-+	}
-+
-+	new_snode = new_stable_node(tree_node, *kpage, hash_max);
-+	if (!new_snode)
-+		goto failed;
-+
-+	rb_link_node(&new_snode->node, parent, new);
-+	rb_insert_color(&new_snode->node, &tree_node->sub_root);
-+	tree_node->count++;
-+	*success1 = *success2 = 1;
-+
-+	return new_snode;
-+
-+failed:
-+	return NULL;
-+}
-+
-+
-+/**
-+ * stable_tree_insert() - try to insert a merged page in unstable tree to
-+ * the stable tree
-+ *
-+ * @kpage:		the page need to be inserted
-+ * @hash:		the current hash of this page
-+ * @rmap_item:		the rmap_item being scanned
-+ * @tree_rmap_item:	the rmap_item found on unstable tree
-+ * @success1:		return if rmap_item is merged
-+ * @success2:		return if tree_rmap_item is merged
-+ *
-+ * @return		the stable_node on stable tree if at least one
-+ *			rmap_item is inserted into stable tree, NULL
-+ *			otherwise.
-+ */
-+static struct stable_node *
-+stable_tree_insert(struct page **kpage, u32 hash,
-+		   struct rmap_item *rmap_item,
-+		   struct rmap_item *tree_rmap_item,
-+		   int *success1, int *success2)
-+{
-+	struct rb_node **new = &root_stable_treep->rb_node;
-+	struct rb_node *parent = NULL;
-+	struct stable_node *stable_node;
-+	struct tree_node *tree_node;
-+	u32 hash_max = 0;
-+
-+	*success1 = *success2 = 0;
-+
-+	while (*new) {
-+		int cmp;
-+
-+		tree_node = rb_entry(*new, struct tree_node, node);
-+
-+		cmp = hash_cmp(hash, tree_node->hash);
-+
-+		if (cmp < 0) {
-+			parent = *new;
-+			new = &parent->rb_left;
-+		} else if (cmp > 0) {
-+			parent = *new;
-+			new = &parent->rb_right;
-+		} else
-+			break;
-+	}
-+
-+	if (*new) {
-+		if (tree_node->count == 1) {
-+			stable_node = first_level_insert(tree_node, rmap_item,
-+						tree_rmap_item, kpage,
-+						hash, success1, success2);
-+		} else {
-+			stable_node = stable_subtree_insert(tree_node,
-+					rmap_item, tree_rmap_item, kpage,
-+					hash, success1, success2);
-+		}
-+	} else {
-+
-+		/* no tree node found */
-+		tree_node = alloc_tree_node(stable_tree_node_listp);
-+		if (!tree_node) {
-+			stable_node = NULL;
-+			goto out;
-+		}
-+
-+		stable_node = new_stable_node(tree_node, *kpage, hash_max);
-+		if (!stable_node) {
-+			free_tree_node(tree_node);
-+			goto out;
-+		}
-+
-+		tree_node->hash = hash;
-+		rb_link_node(&tree_node->node, parent, new);
-+		rb_insert_color(&tree_node->node, root_stable_treep);
-+		parent = NULL;
-+		new = &tree_node->sub_root.rb_node;
-+
-+		rb_link_node(&stable_node->node, parent, new);
-+		rb_insert_color(&stable_node->node, &tree_node->sub_root);
-+		tree_node->count++;
-+		*success1 = *success2 = 1;
-+	}
-+
-+out:
-+	return stable_node;
-+}
-+
-+
-+/**
-+ * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem
-+ *
-+ * @return	0 on success, -EBUSY if unable to lock the mmap_sem,
-+ *		-EINVAL if the page mapping has been changed.
-+ */
-+static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
-+{
-+	int err;
-+
-+	err = get_mergeable_page_lock_mmap(tree_rmap_item);
-+
-+	if (err == -EINVAL) {
-+		/* its page map has been changed, remove it */
-+		remove_rmap_item_from_tree(tree_rmap_item);
-+	}
-+
-+	/* The page is gotten and mmap_sem is locked now. */
-+	return err;
-+}
-+
-+
-+/**
-+ * unstable_tree_search_insert() - search an unstable tree rmap_item with the
-+ * same hash value. Get its page and trylock the mmap_sem
-+ */
-+static inline
-+struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
-+					      u32 hash)
-+
-+{
-+	struct rb_node **new = &root_unstable_tree.rb_node;
-+	struct rb_node *parent = NULL;
-+	struct tree_node *tree_node;
-+	u32 hash_max;
-+	struct rmap_item *tree_rmap_item;
-+
-+	while (*new) {
-+		int cmp;
-+
-+		tree_node = rb_entry(*new, struct tree_node, node);
-+
-+		cmp = hash_cmp(hash, tree_node->hash);
-+
-+		if (cmp < 0) {
-+			parent = *new;
-+			new = &parent->rb_left;
-+		} else if (cmp > 0) {
-+			parent = *new;
-+			new = &parent->rb_right;
-+		} else
-+			break;
-+	}
-+
-+	if (*new) {
-+		/* got the tree_node */
-+		if (tree_node->count == 1) {
-+			tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
-+						  struct rmap_item, node);
-+			BUG_ON(!tree_rmap_item);
-+
-+			goto get_page_out;
-+		}
-+
-+		/* well, search the collision subtree */
-+		new = &tree_node->sub_root.rb_node;
-+		BUG_ON(!*new);
-+		hash_max = rmap_item_hash_max(rmap_item, hash);
-+
-+		while (*new) {
-+			int cmp;
-+
-+			tree_rmap_item = rb_entry(*new, struct rmap_item,
-+						  node);
-+
-+			cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
-+			parent = *new;
-+			if (cmp < 0)
-+				new = &parent->rb_left;
-+			else if (cmp > 0)
-+				new = &parent->rb_right;
-+			else
-+				goto get_page_out;
-+		}
-+	} else {
-+		/* alloc a new tree_node */
-+		tree_node = alloc_tree_node(&unstable_tree_node_list);
-+		if (!tree_node)
-+			return NULL;
-+
-+		tree_node->hash = hash;
-+		rb_link_node(&tree_node->node, parent, new);
-+		rb_insert_color(&tree_node->node, &root_unstable_tree);
-+		parent = NULL;
-+		new = &tree_node->sub_root.rb_node;
-+	}
-+
-+	/* did not found even in sub-tree */
-+	rmap_item->tree_node = tree_node;
-+	rmap_item->address |= UNSTABLE_FLAG;
-+	rmap_item->hash_round = uksm_hash_round;
-+	rb_link_node(&rmap_item->node, parent, new);
-+	rb_insert_color(&rmap_item->node, &tree_node->sub_root);
-+
-+	uksm_pages_unshared++;
-+	return NULL;
-+
-+get_page_out:
-+	if (tree_rmap_item->page == rmap_item->page)
-+		return NULL;
-+
-+	if (get_tree_rmap_item_page(tree_rmap_item))
-+		return NULL;
-+
-+	return tree_rmap_item;
-+}
-+
-+static void hold_anon_vma(struct rmap_item *rmap_item,
-+			  struct anon_vma *anon_vma)
-+{
-+	rmap_item->anon_vma = anon_vma;
-+	get_anon_vma(anon_vma);
-+}
-+
-+
-+/**
-+ * stable_tree_append() - append a rmap_item to a stable node. Deduplication
-+ * ratio statistics is done in this function.
-+ *
-+ */
-+static void stable_tree_append(struct rmap_item *rmap_item,
-+			       struct stable_node *stable_node, int logdedup)
-+{
-+	struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL;
-+	unsigned long key = (unsigned long)rmap_item->slot;
-+	unsigned long factor = rmap_item->slot->rung->step;
-+
-+	BUG_ON(!stable_node);
-+	rmap_item->address |= STABLE_FLAG;
-+
-+	if (hlist_empty(&stable_node->hlist)) {
-+		uksm_pages_shared++;
-+		goto node_vma_new;
-+	} else {
-+		uksm_pages_sharing++;
-+	}
-+
-+	hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
-+		if (node_vma->key >= key)
-+			break;
-+
-+		if (logdedup) {
-+			node_vma->slot->pages_bemerged += factor;
-+			if (list_empty(&node_vma->slot->dedup_list))
-+				list_add(&node_vma->slot->dedup_list,
-+					 &vma_slot_dedup);
-+		}
-+	}
-+
-+	if (node_vma) {
-+		if (node_vma->key == key) {
-+			node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist);
-+			goto node_vma_ok;
-+		} else if (node_vma->key > key) {
-+			node_vma_cont = node_vma;
-+		}
-+	}
-+
-+node_vma_new:
-+	/* no same vma already in node, alloc a new node_vma */
-+	new_node_vma = alloc_node_vma();
-+	BUG_ON(!new_node_vma);
-+	new_node_vma->head = stable_node;
-+	new_node_vma->slot = rmap_item->slot;
-+
-+	if (!node_vma) {
-+		hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
-+	} else if (node_vma->key != key) {
-+		if (node_vma->key < key)
-+			hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist);
-+		else {
-+			hlist_add_before(&new_node_vma->hlist,
-+					 &node_vma->hlist);
-+		}
-+
-+	}
-+	node_vma = new_node_vma;
-+
-+node_vma_ok: /* ok, ready to add to the list */
-+	rmap_item->head = node_vma;
-+	hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist);
-+	hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma);
-+	if (logdedup) {
-+		rmap_item->slot->pages_merged++;
-+		if (node_vma_cont) {
-+			node_vma = node_vma_cont;
-+			hlist_for_each_entry_continue(node_vma, hlist) {
-+				node_vma->slot->pages_bemerged += factor;
-+				if (list_empty(&node_vma->slot->dedup_list))
-+					list_add(&node_vma->slot->dedup_list,
-+						 &vma_slot_dedup);
-+			}
-+		}
-+	}
-+}
-+
-+/*
-+ * We use break_ksm to break COW on a ksm page: it's a stripped down
-+ *
-+ *	if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
-+ *		put_page(page);
-+ *
-+ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
-+ * in case the application has unmapped and remapped mm,addr meanwhile.
-+ * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
-+ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
-+ */
-+static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
-+{
-+	struct page *page;
-+	int ret = 0;
-+
-+	do {
-+		cond_resched();
-+		page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
-+		if (IS_ERR_OR_NULL(page))
-+			break;
-+		if (PageKsm(page)) {
-+			ret = handle_mm_fault(vma, addr,
-+					      FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
-+                                             NULL);
-+		} else
-+			ret = VM_FAULT_WRITE;
-+		put_page(page);
-+	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
-+	/*
-+	 * We must loop because handle_mm_fault() may back out if there's
-+	 * any difficulty e.g. if pte accessed bit gets updated concurrently.
-+	 *
-+	 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
-+	 * COW has been broken, even if the vma does not permit VM_WRITE;
-+	 * but note that a concurrent fault might break PageKsm for us.
-+	 *
-+	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
-+	 * backing file, which also invalidates anonymous pages: that's
-+	 * okay, that truncation will have unmapped the PageKsm for us.
-+	 *
-+	 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
-+	 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
-+	 * current task has TIF_MEMDIE set, and will be OOM killed on return
-+	 * to user; and ksmd, having no mm, would never be chosen for that.
-+	 *
-+	 * But if the mm is in a limited mem_cgroup, then the fault may fail
-+	 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
-+	 * even ksmd can fail in this way - though it's usually breaking ksm
-+	 * just to undo a merge it made a moment before, so unlikely to oom.
-+	 *
-+	 * That's a pity: we might therefore have more kernel pages allocated
-+	 * than we're counting as nodes in the stable tree; but uksm_do_scan
-+	 * will retry to break_cow on each pass, so should recover the page
-+	 * in due course.  The important thing is to not let VM_MERGEABLE
-+	 * be cleared while any such pages might remain in the area.
-+	 */
-+	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
-+}
-+
-+static void break_cow(struct rmap_item *rmap_item)
-+{
-+	struct vm_area_struct *vma = rmap_item->slot->vma;
-+	struct mm_struct *mm = vma->vm_mm;
-+	unsigned long addr = get_rmap_addr(rmap_item);
-+
-+	if (uksm_test_exit(mm))
-+		goto out;
-+
-+	break_ksm(vma, addr);
-+out:
-+	return;
-+}
-+
-+/*
-+ * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
-+ * than check every pte of a given vma, the locking doesn't quite work for
-+ * that - an rmap_item is assigned to the stable tree after inserting ksm
-+ * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
-+ * rmap_items from parent to child at fork time (so as not to waste time
-+ * if exit comes before the next scan reaches it).
-+ *
-+ * Similarly, although we'd like to remove rmap_items (so updating counts
-+ * and freeing memory) when unmerging an area, it's easier to leave that
-+ * to the next pass of ksmd - consider, for example, how ksmd might be
-+ * in cmp_and_merge_page on one of the rmap_items we would be removing.
-+ */
-+inline int unmerge_uksm_pages(struct vm_area_struct *vma,
-+		      unsigned long start, unsigned long end)
-+{
-+	unsigned long addr;
-+	int err = 0;
-+
-+	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
-+		if (uksm_test_exit(vma->vm_mm))
-+			break;
-+		if (signal_pending(current))
-+			err = -ERESTARTSYS;
-+		else
-+			err = break_ksm(vma, addr);
-+	}
-+	return err;
-+}
-+
-+static inline void inc_uksm_pages_scanned(void)
-+{
-+	u64 delta;
-+
-+
-+	if (uksm_pages_scanned == U64_MAX) {
-+		encode_benefit();
-+
-+		delta = uksm_pages_scanned >> pages_scanned_base;
-+
-+		if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
-+			pages_scanned_stored >>= 1;
-+			delta >>= 1;
-+			pages_scanned_base++;
-+		}
-+
-+		pages_scanned_stored += delta;
-+
-+		uksm_pages_scanned = uksm_pages_scanned_last = 0;
-+	}
-+
-+	uksm_pages_scanned++;
-+}
-+
-+static inline int find_zero_page_hash(int strength, u32 hash)
-+{
-+	return (zero_hash_table[strength] == hash);
-+}
-+
-+static
-+int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
-+{
-+	struct page *zero_page = empty_uksm_zero_page;
-+	struct mm_struct *mm = vma->vm_mm;
-+	pte_t orig_pte = __pte(0);
-+	int err = -EFAULT;
-+
-+	if (uksm_test_exit(mm))
-+		goto out;
-+
-+	if (!trylock_page(page))
-+		goto out;
-+
-+	if (!PageAnon(page))
-+		goto out_unlock;
-+
-+	if (PageTransCompound(page)) {
-+		err = split_huge_page(page);
-+		if (err)
-+			goto out_unlock;
-+	}
-+
-+	if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
-+		if (is_page_full_zero(page))
-+			err = replace_page(vma, page, zero_page, orig_pte);
-+	}
-+
-+out_unlock:
-+	unlock_page(page);
-+out:
-+	return err;
-+}
-+
-+/*
-+ * cmp_and_merge_page() - first see if page can be merged into the stable
-+ * tree; if not, compare hash to previous and if it's the same, see if page
-+ * can be inserted into the unstable tree, or merged with a page already there
-+ * and both transferred to the stable tree.
-+ *
-+ * @page: the page that we are searching identical page to.
-+ * @rmap_item: the reverse mapping into the virtual address of this page
-+ */
-+static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash)
-+{
-+	struct rmap_item *tree_rmap_item;
-+	struct page *page;
-+	struct page *kpage = NULL;
-+	u32 hash_max;
-+	int err;
-+	unsigned int success1, success2;
-+	struct stable_node *snode;
-+	int cmp;
-+	struct rb_node *parent = NULL, **new;
-+
-+	remove_rmap_item_from_tree(rmap_item);
-+	page = rmap_item->page;
-+
-+	/* We first start with searching the page inside the stable tree */
-+	kpage = stable_tree_search(rmap_item, hash);
-+	if (kpage) {
-+		err = try_to_merge_with_uksm_page(rmap_item, kpage,
-+						 hash);
-+		if (!err) {
-+			/*
-+			 * The page was successfully merged, add
-+			 * its rmap_item to the stable tree.
-+			 * page lock is needed because it's
-+			 * racing with try_to_unmap_ksm(), etc.
-+			 */
-+			lock_page(kpage);
-+			snode = page_stable_node(kpage);
-+			stable_tree_append(rmap_item, snode, 1);
-+			unlock_page(kpage);
-+			put_page(kpage);
-+			return; /* success */
-+		}
-+		put_page(kpage);
-+
-+		/*
-+		 * if it's a collision and it has been search in sub-rbtree
-+		 * (hash_max != 0), we want to abort, because if it is
-+		 * successfully merged in unstable tree, the collision trends to
-+		 * happen again.
-+		 */
-+		if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
-+			return;
-+	}
-+
-+	tree_rmap_item =
-+		unstable_tree_search_insert(rmap_item, hash);
-+	if (tree_rmap_item) {
-+		err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
-+		/*
-+		 * As soon as we merge this page, we want to remove the
-+		 * rmap_item of the page we have merged with from the unstable
-+		 * tree, and insert it instead as new node in the stable tree.
-+		 */
-+		if (!err) {
-+			kpage = page;
-+			remove_rmap_item_from_tree(tree_rmap_item);
-+			lock_page(kpage);
-+			snode = stable_tree_insert(&kpage, hash,
-+						   rmap_item, tree_rmap_item,
-+						   &success1, &success2);
-+
-+			/*
-+			 * Do not log dedup for tree item, it's not counted as
-+			 * scanned in this round.
-+			 */
-+			if (success2)
-+				stable_tree_append(tree_rmap_item, snode, 0);
-+
-+			/*
-+			 * The order of these two stable append is important:
-+			 * we are scanning rmap_item.
-+			 */
-+			if (success1)
-+				stable_tree_append(rmap_item, snode, 1);
-+
-+			/*
-+			 * The original kpage may be unlocked inside
-+			 * stable_tree_insert() already. This page
-+			 * should be unlocked before doing
-+			 * break_cow().
-+			 */
-+			unlock_page(kpage);
-+
-+			if (!success1)
-+				break_cow(rmap_item);
-+
-+			if (!success2)
-+				break_cow(tree_rmap_item);
-+
-+		} else if (err == MERGE_ERR_COLLI) {
-+			BUG_ON(tree_rmap_item->tree_node->count > 1);
-+
-+			rmap_item_hash_max(tree_rmap_item,
-+					   tree_rmap_item->tree_node->hash);
-+
-+			hash_max = rmap_item_hash_max(rmap_item, hash);
-+			cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
-+			parent = &tree_rmap_item->node;
-+			if (cmp < 0)
-+				new = &parent->rb_left;
-+			else if (cmp > 0)
-+				new = &parent->rb_right;
-+			else
-+				goto put_up_out;
-+
-+			rmap_item->tree_node = tree_rmap_item->tree_node;
-+			rmap_item->address |= UNSTABLE_FLAG;
-+			rmap_item->hash_round = uksm_hash_round;
-+			rb_link_node(&rmap_item->node, parent, new);
-+			rb_insert_color(&rmap_item->node,
-+					&tree_rmap_item->tree_node->sub_root);
-+			rmap_item->tree_node->count++;
-+		} else {
-+			/*
-+			 * either one of the page has changed or they collide
-+			 * at the max hash, we consider them as ill items.
-+			 */
-+			remove_rmap_item_from_tree(tree_rmap_item);
-+		}
-+put_up_out:
-+		put_page(tree_rmap_item->page);
-+		mmap_read_unlock(tree_rmap_item->slot->vma->vm_mm);
-+	}
-+}
-+
-+
-+
-+
-+static inline unsigned long get_pool_index(struct vma_slot *slot,
-+					   unsigned long index)
-+{
-+	unsigned long pool_index;
-+
-+	pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT;
-+	if (pool_index >= slot->pool_size)
-+		BUG();
-+	return pool_index;
-+}
-+
-+static inline unsigned long index_page_offset(unsigned long index)
-+{
-+	return offset_in_page(sizeof(struct rmap_list_entry *) * index);
-+}
-+
-+static inline
-+struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot,
-+					    unsigned long index, int need_alloc)
-+{
-+	unsigned long pool_index;
-+	struct page *page;
-+	void *addr;
-+
-+
-+	pool_index = get_pool_index(slot, index);
-+	if (!slot->rmap_list_pool[pool_index]) {
-+		if (!need_alloc)
-+			return NULL;
-+
-+		page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
-+		if (!page)
-+			return NULL;
-+
-+		slot->rmap_list_pool[pool_index] = page;
-+	}
-+
-+	addr = kmap(slot->rmap_list_pool[pool_index]);
-+	addr += index_page_offset(index);
-+
-+	return addr;
-+}
-+
-+static inline void put_rmap_list_entry(struct vma_slot *slot,
-+				       unsigned long index)
-+{
-+	unsigned long pool_index;
-+
-+	pool_index = get_pool_index(slot, index);
-+	BUG_ON(!slot->rmap_list_pool[pool_index]);
-+	kunmap(slot->rmap_list_pool[pool_index]);
-+}
-+
-+static inline int entry_is_new(struct rmap_list_entry *entry)
-+{
-+	return !entry->item;
-+}
-+
-+static inline unsigned long get_index_orig_addr(struct vma_slot *slot,
-+						unsigned long index)
-+{
-+	return slot->vma->vm_start + (index << PAGE_SHIFT);
-+}
-+
-+static inline unsigned long get_entry_address(struct rmap_list_entry *entry)
-+{
-+	unsigned long addr;
-+
-+	if (is_addr(entry->addr))
-+		addr = get_clean_addr(entry->addr);
-+	else if (entry->item)
-+		addr = get_rmap_addr(entry->item);
-+	else
-+		BUG();
-+
-+	return addr;
-+}
-+
-+static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry)
-+{
-+	if (is_addr(entry->addr))
-+		return NULL;
-+
-+	return entry->item;
-+}
-+
-+static inline void inc_rmap_list_pool_count(struct vma_slot *slot,
-+					    unsigned long index)
-+{
-+	unsigned long pool_index;
-+
-+	pool_index = get_pool_index(slot, index);
-+	BUG_ON(!slot->rmap_list_pool[pool_index]);
-+	slot->pool_counts[pool_index]++;
-+}
-+
-+static inline void dec_rmap_list_pool_count(struct vma_slot *slot,
-+					    unsigned long index)
-+{
-+	unsigned long pool_index;
-+
-+	pool_index = get_pool_index(slot, index);
-+	BUG_ON(!slot->rmap_list_pool[pool_index]);
-+	BUG_ON(!slot->pool_counts[pool_index]);
-+	slot->pool_counts[pool_index]--;
-+}
-+
-+static inline int entry_has_rmap(struct rmap_list_entry *entry)
-+{
-+	return !is_addr(entry->addr) && entry->item;
-+}
-+
-+static inline void swap_entries(struct rmap_list_entry *entry1,
-+				unsigned long index1,
-+				struct rmap_list_entry *entry2,
-+				unsigned long index2)
-+{
-+	struct rmap_list_entry tmp;
-+
-+	/* swapping two new entries is meaningless */
-+	BUG_ON(entry_is_new(entry1) && entry_is_new(entry2));
-+
-+	tmp = *entry1;
-+	*entry1 = *entry2;
-+	*entry2 = tmp;
-+
-+	if (entry_has_rmap(entry1))
-+		entry1->item->entry_index = index1;
-+
-+	if (entry_has_rmap(entry2))
-+		entry2->item->entry_index = index2;
-+
-+	if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) {
-+		inc_rmap_list_pool_count(entry1->item->slot, index1);
-+		dec_rmap_list_pool_count(entry1->item->slot, index2);
-+	} else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
-+		inc_rmap_list_pool_count(entry2->item->slot, index2);
-+		dec_rmap_list_pool_count(entry2->item->slot, index1);
-+	}
-+}
-+
-+static inline void free_entry_item(struct rmap_list_entry *entry)
-+{
-+	unsigned long index;
-+	struct rmap_item *item;
-+
-+	if (!is_addr(entry->addr)) {
-+		BUG_ON(!entry->item);
-+		item = entry->item;
-+		entry->addr = get_rmap_addr(item);
-+		set_is_addr(entry->addr);
-+		index = item->entry_index;
-+		remove_rmap_item_from_tree(item);
-+		dec_rmap_list_pool_count(item->slot, index);
-+		free_rmap_item(item);
-+	}
-+}
-+
-+static inline int pool_entry_boundary(unsigned long index)
-+{
-+	unsigned long linear_addr;
-+
-+	linear_addr = sizeof(struct rmap_list_entry *) * index;
-+	return index && !offset_in_page(linear_addr);
-+}
-+
-+static inline void try_free_last_pool(struct vma_slot *slot,
-+				      unsigned long index)
-+{
-+	unsigned long pool_index;
-+
-+	pool_index = get_pool_index(slot, index);
-+	if (slot->rmap_list_pool[pool_index] &&
-+	    !slot->pool_counts[pool_index]) {
-+		__free_page(slot->rmap_list_pool[pool_index]);
-+		slot->rmap_list_pool[pool_index] = NULL;
-+		slot->flags |= UKSM_SLOT_NEED_SORT;
-+	}
-+
-+}
-+
-+static inline unsigned long vma_item_index(struct vm_area_struct *vma,
-+					   struct rmap_item *item)
-+{
-+	return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
-+}
-+
-+static int within_same_pool(struct vma_slot *slot,
-+			    unsigned long i, unsigned long j)
-+{
-+	unsigned long pool_i, pool_j;
-+
-+	pool_i = get_pool_index(slot, i);
-+	pool_j = get_pool_index(slot, j);
-+
-+	return (pool_i == pool_j);
-+}
-+
-+static void sort_rmap_entry_list(struct vma_slot *slot)
-+{
-+	unsigned long i, j;
-+	struct rmap_list_entry *entry, *swap_entry;
-+
-+	entry = get_rmap_list_entry(slot, 0, 0);
-+	for (i = 0; i < slot->pages; ) {
-+
-+		if (!entry)
-+			goto skip_whole_pool;
-+
-+		if (entry_is_new(entry))
-+			goto next_entry;
-+
-+		if (is_addr(entry->addr)) {
-+			entry->addr = 0;
-+			goto next_entry;
-+		}
-+
-+		j = vma_item_index(slot->vma, entry->item);
-+		if (j == i)
-+			goto next_entry;
-+
-+		if (within_same_pool(slot, i, j))
-+			swap_entry = entry + j - i;
-+		else
-+			swap_entry = get_rmap_list_entry(slot, j, 1);
-+
-+		swap_entries(entry, i, swap_entry, j);
-+		if (!within_same_pool(slot, i, j))
-+			put_rmap_list_entry(slot, j);
-+		continue;
-+
-+skip_whole_pool:
-+		i += PAGE_SIZE / sizeof(*entry);
-+		if (i < slot->pages)
-+			entry = get_rmap_list_entry(slot, i, 0);
-+		continue;
-+
-+next_entry:
-+		if (i >= slot->pages - 1 ||
-+		    !within_same_pool(slot, i, i + 1)) {
-+			put_rmap_list_entry(slot, i);
-+			if (i + 1 < slot->pages)
-+				entry = get_rmap_list_entry(slot, i + 1, 0);
-+		} else
-+			entry++;
-+		i++;
-+		continue;
-+	}
-+
-+	/* free empty pool entries which contain no rmap_item */
-+	/* CAN be simplied to based on only pool_counts when bug freed !!!!! */
-+	for (i = 0; i < slot->pool_size; i++) {
-+		unsigned char has_rmap;
-+		void *addr;
-+
-+		if (!slot->rmap_list_pool[i])
-+			continue;
-+
-+		has_rmap = 0;
-+		addr = kmap(slot->rmap_list_pool[i]);
-+		BUG_ON(!addr);
-+		for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
-+			entry = (struct rmap_list_entry *)addr + j;
-+			if (is_addr(entry->addr))
-+				continue;
-+			if (!entry->item)
-+				continue;
-+			has_rmap = 1;
-+		}
-+		kunmap(slot->rmap_list_pool[i]);
-+		if (!has_rmap) {
-+			BUG_ON(slot->pool_counts[i]);
-+			__free_page(slot->rmap_list_pool[i]);
-+			slot->rmap_list_pool[i] = NULL;
-+		}
-+	}
-+
-+	slot->flags &= ~UKSM_SLOT_NEED_SORT;
-+}
-+
-+/*
-+ * vma_fully_scanned() - if all the pages in this slot have been scanned.
-+ */
-+static inline int vma_fully_scanned(struct vma_slot *slot)
-+{
-+	return slot->pages_scanned == slot->pages;
-+}
-+
-+/**
-+ * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
-+ * its random permutation. This function is embedded with the random
-+ * permutation index management code.
-+ */
-+static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash)
-+{
-+	unsigned long rand_range, addr, swap_index, scan_index;
-+	struct rmap_item *item = NULL;
-+	struct rmap_list_entry *scan_entry, *swap_entry = NULL;
-+	struct page *page;
-+
-+	scan_index = swap_index = slot->pages_scanned % slot->pages;
-+
-+	if (pool_entry_boundary(scan_index))
-+		try_free_last_pool(slot, scan_index - 1);
-+
-+	if (vma_fully_scanned(slot)) {
-+		if (slot->flags & UKSM_SLOT_NEED_SORT)
-+			slot->flags |= UKSM_SLOT_NEED_RERAND;
-+		else
-+			slot->flags &= ~UKSM_SLOT_NEED_RERAND;
-+		if (slot->flags & UKSM_SLOT_NEED_SORT)
-+			sort_rmap_entry_list(slot);
-+	}
-+
-+	scan_entry = get_rmap_list_entry(slot, scan_index, 1);
-+	if (!scan_entry)
-+		return NULL;
-+
-+	if (entry_is_new(scan_entry)) {
-+		scan_entry->addr = get_index_orig_addr(slot, scan_index);
-+		set_is_addr(scan_entry->addr);
-+	}
-+
-+	if (slot->flags & UKSM_SLOT_NEED_RERAND) {
-+		rand_range = slot->pages - scan_index;
-+		BUG_ON(!rand_range);
-+		swap_index = scan_index + (prandom_u32() % rand_range);
-+	}
-+
-+	if (swap_index != scan_index) {
-+		swap_entry = get_rmap_list_entry(slot, swap_index, 1);
-+
-+		if (!swap_entry)
-+			return NULL;
-+
-+		if (entry_is_new(swap_entry)) {
-+			swap_entry->addr = get_index_orig_addr(slot,
-+							       swap_index);
-+			set_is_addr(swap_entry->addr);
-+		}
-+		swap_entries(scan_entry, scan_index, swap_entry, swap_index);
-+	}
-+
-+	addr = get_entry_address(scan_entry);
-+	item = get_entry_item(scan_entry);
-+	BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
-+
-+	page = follow_page(slot->vma, addr, FOLL_GET);
-+	if (IS_ERR_OR_NULL(page))
-+		goto nopage;
-+
-+	if (!PageAnon(page))
-+		goto putpage;
-+
-+	/*check is zero_page pfn or uksm_zero_page*/
-+	if ((page_to_pfn(page) == zero_pfn)
-+			|| (page_to_pfn(page) == uksm_zero_pfn))
-+		goto putpage;
-+
-+	flush_anon_page(slot->vma, page, addr);
-+	flush_dcache_page(page);
-+
-+
-+	*hash = page_hash(page, hash_strength, 1);
-+	inc_uksm_pages_scanned();
-+	/*if the page content all zero, re-map to zero-page*/
-+	if (find_zero_page_hash(hash_strength, *hash)) {
-+		if (!cmp_and_merge_zero_page(slot->vma, page)) {
-+			slot->pages_merged++;
-+
-+			/* For full-zero pages, no need to create rmap item */
-+			goto putpage;
-+		} else {
-+			inc_rshash_neg(memcmp_cost / 2);
-+		}
-+	}
-+
-+	if (!item) {
-+		item = alloc_rmap_item();
-+		if (item) {
-+			/* It has already been zeroed */
-+			item->slot = slot;
-+			item->address = addr;
-+			item->entry_index = scan_index;
-+			scan_entry->item = item;
-+			inc_rmap_list_pool_count(slot, scan_index);
-+		} else
-+			goto putpage;
-+	}
-+
-+	BUG_ON(item->slot != slot);
-+	/* the page may have changed */
-+	item->page = page;
-+	put_rmap_list_entry(slot, scan_index);
-+	if (swap_entry)
-+		put_rmap_list_entry(slot, swap_index);
-+	return item;
-+
-+putpage:
-+	put_page(page);
-+	page = NULL;
-+nopage:
-+	/* no page, store addr back and free rmap_item if possible */
-+	free_entry_item(scan_entry);
-+	put_rmap_list_entry(slot, scan_index);
-+	if (swap_entry)
-+		put_rmap_list_entry(slot, swap_index);
-+	return NULL;
-+}
-+
-+static inline int in_stable_tree(struct rmap_item *rmap_item)
-+{
-+	return rmap_item->address & STABLE_FLAG;
-+}
-+
-+/**
-+ * scan_vma_one_page() - scan the next page in a vma_slot. Called with
-+ * mmap_sem locked.
-+ */
-+static noinline void scan_vma_one_page(struct vma_slot *slot)
-+{
-+	u32 hash;
-+	struct mm_struct *mm;
-+	struct rmap_item *rmap_item = NULL;
-+	struct vm_area_struct *vma = slot->vma;
-+
-+	mm = vma->vm_mm;
-+	BUG_ON(!mm);
-+	BUG_ON(!slot);
-+
-+	rmap_item = get_next_rmap_item(slot, &hash);
-+	if (!rmap_item)
-+		goto out1;
-+
-+	if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
-+		goto out2;
-+
-+	cmp_and_merge_page(rmap_item, hash);
-+out2:
-+	put_page(rmap_item->page);
-+out1:
-+	slot->pages_scanned++;
-+	slot->this_sampled++;
-+	if (slot->fully_scanned_round != fully_scanned_round)
-+		scanned_virtual_pages++;
-+
-+	if (vma_fully_scanned(slot))
-+		slot->fully_scanned_round = fully_scanned_round;
-+}
-+
-+static inline unsigned long rung_get_pages(struct scan_rung *rung)
-+{
-+	struct slot_tree_node *node;
-+
-+	if (!rung->vma_root.rnode)
-+		return 0;
-+
-+	node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode);
-+
-+	return node->size;
-+}
-+
-+#define RUNG_SAMPLED_MIN	3
-+
-+static inline
-+void uksm_calc_rung_step(struct scan_rung *rung,
-+			 unsigned long page_time, unsigned long ratio)
-+{
-+	unsigned long sampled, pages;
-+
-+	/* will be fully scanned ? */
-+	if (!rung->cover_msecs) {
-+		rung->step = 1;
-+		return;
-+	}
-+
-+	sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
-+		  * ratio / page_time;
-+
-+	/*
-+	 *  Before we finsish a scan round and expensive per-round jobs,
-+	 *  we need to have a chance to estimate the per page time. So
-+	 *  the sampled number can not be too small.
-+	 */
-+	if (sampled < RUNG_SAMPLED_MIN)
-+		sampled = RUNG_SAMPLED_MIN;
-+
-+	pages = rung_get_pages(rung);
-+	if (likely(pages > sampled))
-+		rung->step = pages / sampled;
-+	else
-+		rung->step = 1;
-+}
-+
-+static inline int step_need_recalc(struct scan_rung *rung)
-+{
-+	unsigned long pages, stepmax;
-+
-+	pages = rung_get_pages(rung);
-+	stepmax = pages / RUNG_SAMPLED_MIN;
-+
-+	return pages && (rung->step > pages ||
-+			 (stepmax && rung->step > stepmax));
-+}
-+
-+static inline
-+void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc)
-+{
-+	struct vma_slot *slot;
-+
-+	if (finished)
-+		rung->flags |= UKSM_RUNG_ROUND_FINISHED;
-+
-+	if (step_recalc || step_need_recalc(rung)) {
-+		uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
-+		BUG_ON(step_need_recalc(rung));
-+	}
-+
-+	slot_iter_index = prandom_u32() % rung->step;
-+	BUG_ON(!rung->vma_root.rnode);
-+	slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter);
-+	BUG_ON(!slot);
-+
-+	rung->current_scan = slot;
-+	rung->current_offset = slot_iter_index;
-+}
-+
-+static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot)
-+{
-+	return &slot->rung->vma_root;
-+}
-+
-+/*
-+ * return if resetted.
-+ */
-+static int advance_current_scan(struct scan_rung *rung)
-+{
-+	unsigned short n;
-+	struct vma_slot *slot, *next = NULL;
-+
-+	BUG_ON(!rung->vma_root.num);
-+
-+	slot = rung->current_scan;
-+	n = (slot->pages - rung->current_offset) % rung->step;
-+	slot_iter_index = rung->step - n;
-+	next = sradix_tree_next(&rung->vma_root, slot->snode,
-+				slot->sindex, slot_iter);
-+
-+	if (next) {
-+		rung->current_offset = slot_iter_index;
-+		rung->current_scan = next;
-+		return 0;
-+	} else {
-+		reset_current_scan(rung, 1, 0);
-+		return 1;
-+	}
-+}
-+
-+static inline void rung_rm_slot(struct vma_slot *slot)
-+{
-+	struct scan_rung *rung = slot->rung;
-+	struct sradix_tree_root *root;
-+
-+	if (rung->current_scan == slot)
-+		advance_current_scan(rung);
-+
-+	root = slot_get_root(slot);
-+	sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex);
-+	slot->snode = NULL;
-+	if (step_need_recalc(rung)) {
-+		uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
-+		BUG_ON(step_need_recalc(rung));
-+	}
-+
-+	/* In case advance_current_scan loop back to this slot again */
-+	if (rung->vma_root.num && rung->current_scan == slot)
-+		reset_current_scan(slot->rung, 1, 0);
-+}
-+
-+static inline void rung_add_new_slots(struct scan_rung *rung,
-+			struct vma_slot **slots, unsigned long num)
-+{
-+	int err;
-+	struct vma_slot *slot;
-+	unsigned long i;
-+	struct sradix_tree_root *root = &rung->vma_root;
-+
-+	err = sradix_tree_enter(root, (void **)slots, num);
-+	BUG_ON(err);
-+
-+	for (i = 0; i < num; i++) {
-+		slot = slots[i];
-+		slot->rung = rung;
-+		BUG_ON(vma_fully_scanned(slot));
-+	}
-+
-+	if (rung->vma_root.num == num)
-+		reset_current_scan(rung, 0, 1);
-+}
-+
-+static inline int rung_add_one_slot(struct scan_rung *rung,
-+				     struct vma_slot *slot)
-+{
-+	int err;
-+
-+	err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1);
-+	if (err)
-+		return err;
-+
-+	slot->rung = rung;
-+	if (rung->vma_root.num == 1)
-+		reset_current_scan(rung, 0, 1);
-+
-+	return 0;
-+}
-+
-+/*
-+ * Return true if the slot is deleted from its rung.
-+ */
-+static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung)
-+{
-+	struct scan_rung *old_rung = slot->rung;
-+	int err;
-+
-+	if (old_rung == rung)
-+		return 0;
-+
-+	rung_rm_slot(slot);
-+	err = rung_add_one_slot(rung, slot);
-+	if (err) {
-+		err = rung_add_one_slot(old_rung, slot);
-+		WARN_ON(err); /* OOPS, badly OOM, we lost this slot */
-+	}
-+
-+	return 1;
-+}
-+
-+static inline int vma_rung_up(struct vma_slot *slot)
-+{
-+	struct scan_rung *rung;
-+
-+	rung = slot->rung;
-+	if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1])
-+		rung++;
-+
-+	return vma_rung_enter(slot, rung);
-+}
-+
-+static inline int vma_rung_down(struct vma_slot *slot)
-+{
-+	struct scan_rung *rung;
-+
-+	rung = slot->rung;
-+	if (slot->rung != &uksm_scan_ladder[0])
-+		rung--;
-+
-+	return vma_rung_enter(slot, rung);
-+}
-+
-+/**
-+ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
-+ */
-+static unsigned long cal_dedup_ratio(struct vma_slot *slot)
-+{
-+	unsigned long ret;
-+	unsigned long pages;
-+
-+	pages = slot->this_sampled;
-+	if (!pages)
-+		return 0;
-+
-+	BUG_ON(slot->pages_scanned == slot->last_scanned);
-+
-+	ret = slot->pages_merged;
-+
-+	/* Thrashing area filtering */
-+	if (ret && uksm_thrash_threshold) {
-+		if (slot->pages_cowed * 100 / slot->pages_merged
-+		    > uksm_thrash_threshold) {
-+			ret = 0;
-+		} else {
-+			ret = slot->pages_merged - slot->pages_cowed;
-+		}
-+	}
-+
-+	return ret * 100 / pages;
-+}
-+
-+/**
-+ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
-+ */
-+static unsigned long cal_dedup_ratio_old(struct vma_slot *slot)
-+{
-+	unsigned long ret;
-+	unsigned long pages;
-+
-+	pages = slot->pages;
-+	if (!pages)
-+		return 0;
-+
-+	ret = slot->pages_bemerged;
-+
-+	/* Thrashing area filtering */
-+	if (ret && uksm_thrash_threshold) {
-+		if (slot->pages_cowed * 100 / slot->pages_bemerged
-+		    > uksm_thrash_threshold) {
-+			ret = 0;
-+		} else {
-+			ret = slot->pages_bemerged - slot->pages_cowed;
-+		}
-+	}
-+
-+	return ret * 100 / pages;
-+}
-+
-+/**
-+ * stable_node_reinsert() - When the hash_strength has been adjusted, the
-+ * stable tree need to be restructured, this is the function re-inserting the
-+ * stable node.
-+ */
-+static inline void stable_node_reinsert(struct stable_node *new_node,
-+					struct page *page,
-+					struct rb_root *root_treep,
-+					struct list_head *tree_node_listp,
-+					u32 hash)
-+{
-+	struct rb_node **new = &root_treep->rb_node;
-+	struct rb_node *parent = NULL;
-+	struct stable_node *stable_node;
-+	struct tree_node *tree_node;
-+	struct page *tree_page;
-+	int cmp;
-+
-+	while (*new) {
-+		int cmp;
-+
-+		tree_node = rb_entry(*new, struct tree_node, node);
-+
-+		cmp = hash_cmp(hash, tree_node->hash);
-+
-+		if (cmp < 0) {
-+			parent = *new;
-+			new = &parent->rb_left;
-+		} else if (cmp > 0) {
-+			parent = *new;
-+			new = &parent->rb_right;
-+		} else
-+			break;
-+	}
-+
-+	if (*new) {
-+		/* find a stable tree node with same first level hash value */
-+		stable_node_hash_max(new_node, page, hash);
-+		if (tree_node->count == 1) {
-+			stable_node = rb_entry(tree_node->sub_root.rb_node,
-+					       struct stable_node, node);
-+			tree_page = get_uksm_page(stable_node, 1, 0);
-+			if (tree_page) {
-+				stable_node_hash_max(stable_node,
-+						      tree_page, hash);
-+				put_page(tree_page);
-+
-+				/* prepare for stable node insertion */
-+
-+				cmp = hash_cmp(new_node->hash_max,
-+						   stable_node->hash_max);
-+				parent = &stable_node->node;
-+				if (cmp < 0)
-+					new = &parent->rb_left;
-+				else if (cmp > 0)
-+					new = &parent->rb_right;
-+				else
-+					goto failed;
-+
-+				goto add_node;
-+			} else {
-+				/* the only stable_node deleted, the tree node
-+				 * was not deleted.
-+				 */
-+				goto tree_node_reuse;
-+			}
-+		}
-+
-+		/* well, search the collision subtree */
-+		new = &tree_node->sub_root.rb_node;
-+		parent = NULL;
-+		BUG_ON(!*new);
-+		while (*new) {
-+			int cmp;
-+
-+			stable_node = rb_entry(*new, struct stable_node, node);
-+
-+			cmp = hash_cmp(new_node->hash_max,
-+					   stable_node->hash_max);
-+
-+			if (cmp < 0) {
-+				parent = *new;
-+				new = &parent->rb_left;
-+			} else if (cmp > 0) {
-+				parent = *new;
-+				new = &parent->rb_right;
-+			} else {
-+				/* oh, no, still a collision */
-+				goto failed;
-+			}
-+		}
-+
-+		goto add_node;
-+	}
-+
-+	/* no tree node found */
-+	tree_node = alloc_tree_node(tree_node_listp);
-+	if (!tree_node) {
-+		pr_err("UKSM: memory allocation error!\n");
-+		goto failed;
-+	} else {
-+		tree_node->hash = hash;
-+		rb_link_node(&tree_node->node, parent, new);
-+		rb_insert_color(&tree_node->node, root_treep);
-+
-+tree_node_reuse:
-+		/* prepare for stable node insertion */
-+		parent = NULL;
-+		new = &tree_node->sub_root.rb_node;
-+	}
-+
-+add_node:
-+	rb_link_node(&new_node->node, parent, new);
-+	rb_insert_color(&new_node->node, &tree_node->sub_root);
-+	new_node->tree_node = tree_node;
-+	tree_node->count++;
-+	return;
-+
-+failed:
-+	/* This can only happen when two nodes have collided
-+	 * in two levels.
-+	 */
-+	new_node->tree_node = NULL;
-+	return;
-+}
-+
-+static inline void free_all_tree_nodes(struct list_head *list)
-+{
-+	struct tree_node *node, *tmp;
-+
-+	list_for_each_entry_safe(node, tmp, list, all_list) {
-+		free_tree_node(node);
-+	}
-+}
-+
-+/**
-+ * stable_tree_delta_hash() - Delta hash the stable tree from previous hash
-+ * strength to the current hash_strength. It re-structures the hole tree.
-+ */
-+static inline void stable_tree_delta_hash(u32 prev_hash_strength)
-+{
-+	struct stable_node *node, *tmp;
-+	struct rb_root *root_new_treep;
-+	struct list_head *new_tree_node_listp;
-+
-+	stable_tree_index = (stable_tree_index + 1) % 2;
-+	root_new_treep = &root_stable_tree[stable_tree_index];
-+	new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
-+	*root_new_treep = RB_ROOT;
-+	BUG_ON(!list_empty(new_tree_node_listp));
-+
-+	/*
-+	 * we need to be safe, the node could be removed by get_uksm_page()
-+	 */
-+	list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
-+		void *addr;
-+		struct page *node_page;
-+		u32 hash;
-+
-+		/*
-+		 * We are completely re-structuring the stable nodes to a new
-+		 * stable tree. We don't want to touch the old tree unlinks and
-+		 * old tree_nodes. The old tree_nodes will be freed at once.
-+		 */
-+		node_page = get_uksm_page(node, 0, 0);
-+		if (!node_page)
-+			continue;
-+
-+		if (node->tree_node) {
-+			hash = node->tree_node->hash;
-+
-+			addr = kmap_atomic(node_page);
-+
-+			hash = delta_hash(addr, prev_hash_strength,
-+					  hash_strength, hash);
-+			kunmap_atomic(addr);
-+		} else {
-+			/*
-+			 *it was not inserted to rbtree due to collision in last
-+			 *round scan.
-+			 */
-+			hash = page_hash(node_page, hash_strength, 0);
-+		}
-+
-+		stable_node_reinsert(node, node_page, root_new_treep,
-+				     new_tree_node_listp, hash);
-+		put_page(node_page);
-+	}
-+
-+	root_stable_treep = root_new_treep;
-+	free_all_tree_nodes(stable_tree_node_listp);
-+	BUG_ON(!list_empty(stable_tree_node_listp));
-+	stable_tree_node_listp = new_tree_node_listp;
-+}
-+
-+static inline void inc_hash_strength(unsigned long delta)
-+{
-+	hash_strength += 1 << delta;
-+	if (hash_strength > HASH_STRENGTH_MAX)
-+		hash_strength = HASH_STRENGTH_MAX;
-+}
-+
-+static inline void dec_hash_strength(unsigned long delta)
-+{
-+	unsigned long change = 1 << delta;
-+
-+	if (hash_strength <= change + 1)
-+		hash_strength = 1;
-+	else
-+		hash_strength -= change;
-+}
-+
-+static inline void inc_hash_strength_delta(void)
-+{
-+	hash_strength_delta++;
-+	if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
-+		hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
-+}
-+
-+static inline unsigned long get_current_neg_ratio(void)
-+{
-+	u64 pos = benefit.pos;
-+	u64 neg = benefit.neg;
-+
-+	if (!neg)
-+		return 0;
-+
-+	if (!pos || neg > pos)
-+		return 100;
-+
-+	if (neg > div64_u64(U64_MAX, 100))
-+		pos = div64_u64(pos, 100);
-+	else
-+		neg *= 100;
-+
-+	return div64_u64(neg, pos);
-+}
-+
-+static inline unsigned long get_current_benefit(void)
-+{
-+	u64 pos = benefit.pos;
-+	u64 neg = benefit.neg;
-+	u64 scanned = benefit.scanned;
-+
-+	if (neg > pos)
-+		return 0;
-+
-+	return div64_u64((pos - neg), scanned);
-+}
-+
-+static inline int judge_rshash_direction(void)
-+{
-+	u64 current_neg_ratio, stable_benefit;
-+	u64 current_benefit, delta = 0;
-+	int ret = STILL;
-+
-+	/*
-+	 * Try to probe a value after the boot, and in case the system
-+	 * are still for a long time.
-+	 */
-+	if ((fully_scanned_round & 0xFFULL) == 10) {
-+		ret = OBSCURE;
-+		goto out;
-+	}
-+
-+	current_neg_ratio = get_current_neg_ratio();
-+
-+	if (current_neg_ratio == 0) {
-+		rshash_neg_cont_zero++;
-+		if (rshash_neg_cont_zero > 2)
-+			return GO_DOWN;
-+		else
-+			return STILL;
-+	}
-+	rshash_neg_cont_zero = 0;
-+
-+	if (current_neg_ratio > 90) {
-+		ret = GO_UP;
-+		goto out;
-+	}
-+
-+	current_benefit = get_current_benefit();
-+	stable_benefit = rshash_state.stable_benefit;
-+
-+	if (!stable_benefit) {
-+		ret = OBSCURE;
-+		goto out;
-+	}
-+
-+	if (current_benefit > stable_benefit)
-+		delta = current_benefit - stable_benefit;
-+	else if (current_benefit < stable_benefit)
-+		delta = stable_benefit - current_benefit;
-+
-+	delta = div64_u64(100 * delta, stable_benefit);
-+
-+	if (delta > 50) {
-+		rshash_cont_obscure++;
-+		if (rshash_cont_obscure > 2)
-+			return OBSCURE;
-+		else
-+			return STILL;
-+	}
-+
-+out:
-+	rshash_cont_obscure = 0;
-+	return ret;
-+}
-+
-+/**
-+ * rshash_adjust() - The main function to control the random sampling state
-+ * machine for hash strength adapting.
-+ *
-+ * return true if hash_strength has changed.
-+ */
-+static inline int rshash_adjust(void)
-+{
-+	unsigned long prev_hash_strength = hash_strength;
-+
-+	if (!encode_benefit())
-+		return 0;
-+
-+	switch (rshash_state.state) {
-+	case RSHASH_STILL:
-+		switch (judge_rshash_direction()) {
-+		case GO_UP:
-+			if (rshash_state.pre_direct == GO_DOWN)
-+				hash_strength_delta = 0;
-+
-+			inc_hash_strength(hash_strength_delta);
-+			inc_hash_strength_delta();
-+			rshash_state.stable_benefit = get_current_benefit();
-+			rshash_state.pre_direct = GO_UP;
-+			break;
-+
-+		case GO_DOWN:
-+			if (rshash_state.pre_direct == GO_UP)
-+				hash_strength_delta = 0;
-+
-+			dec_hash_strength(hash_strength_delta);
-+			inc_hash_strength_delta();
-+			rshash_state.stable_benefit = get_current_benefit();
-+			rshash_state.pre_direct = GO_DOWN;
-+			break;
-+
-+		case OBSCURE:
-+			rshash_state.stable_point = hash_strength;
-+			rshash_state.turn_point_down = hash_strength;
-+			rshash_state.turn_point_up = hash_strength;
-+			rshash_state.turn_benefit_down = get_current_benefit();
-+			rshash_state.turn_benefit_up = get_current_benefit();
-+			rshash_state.lookup_window_index = 0;
-+			rshash_state.state = RSHASH_TRYDOWN;
-+			dec_hash_strength(hash_strength_delta);
-+			inc_hash_strength_delta();
-+			break;
-+
-+		case STILL:
-+			break;
-+		default:
-+			BUG();
-+		}
-+		break;
-+
-+	case RSHASH_TRYDOWN:
-+		if (rshash_state.lookup_window_index++ % 5 == 0)
-+			rshash_state.below_count = 0;
-+
-+		if (get_current_benefit() < rshash_state.stable_benefit)
-+			rshash_state.below_count++;
-+		else if (get_current_benefit() >
-+			 rshash_state.turn_benefit_down) {
-+			rshash_state.turn_point_down = hash_strength;
-+			rshash_state.turn_benefit_down = get_current_benefit();
-+		}
-+
-+		if (rshash_state.below_count >= 3 ||
-+		    judge_rshash_direction() == GO_UP ||
-+		    hash_strength == 1) {
-+			hash_strength = rshash_state.stable_point;
-+			hash_strength_delta = 0;
-+			inc_hash_strength(hash_strength_delta);
-+			inc_hash_strength_delta();
-+			rshash_state.lookup_window_index = 0;
-+			rshash_state.state = RSHASH_TRYUP;
-+			hash_strength_delta = 0;
-+		} else {
-+			dec_hash_strength(hash_strength_delta);
-+			inc_hash_strength_delta();
-+		}
-+		break;
-+
-+	case RSHASH_TRYUP:
-+		if (rshash_state.lookup_window_index++ % 5 == 0)
-+			rshash_state.below_count = 0;
-+
-+		if (get_current_benefit() < rshash_state.turn_benefit_down)
-+			rshash_state.below_count++;
-+		else if (get_current_benefit() > rshash_state.turn_benefit_up) {
-+			rshash_state.turn_point_up = hash_strength;
-+			rshash_state.turn_benefit_up = get_current_benefit();
-+		}
-+
-+		if (rshash_state.below_count >= 3 ||
-+		    judge_rshash_direction() == GO_DOWN ||
-+		    hash_strength == HASH_STRENGTH_MAX) {
-+			hash_strength = rshash_state.turn_benefit_up >
-+				rshash_state.turn_benefit_down ?
-+				rshash_state.turn_point_up :
-+				rshash_state.turn_point_down;
-+
-+			rshash_state.state = RSHASH_PRE_STILL;
-+		} else {
-+			inc_hash_strength(hash_strength_delta);
-+			inc_hash_strength_delta();
-+		}
-+
-+		break;
-+
-+	case RSHASH_NEW:
-+	case RSHASH_PRE_STILL:
-+		rshash_state.stable_benefit = get_current_benefit();
-+		rshash_state.state = RSHASH_STILL;
-+		hash_strength_delta = 0;
-+		break;
-+	default:
-+		BUG();
-+	}
-+
-+	/* rshash_neg = rshash_pos = 0; */
-+	reset_benefit();
-+
-+	if (prev_hash_strength != hash_strength)
-+		stable_tree_delta_hash(prev_hash_strength);
-+
-+	return prev_hash_strength != hash_strength;
-+}
-+
-+/**
-+ * round_update_ladder() - The main function to do update of all the
-+ * adjustments whenever a scan round is finished.
-+ */
-+static noinline void round_update_ladder(void)
-+{
-+	int i;
-+	unsigned long dedup;
-+	struct vma_slot *slot, *tmp_slot;
-+
-+	for (i = 0; i < SCAN_LADDER_SIZE; i++)
-+		uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED;
-+
-+	list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) {
-+
-+		/* slot may be rung_rm_slot() when mm exits */
-+		if (slot->snode) {
-+			dedup = cal_dedup_ratio_old(slot);
-+			if (dedup && dedup >= uksm_abundant_threshold)
-+				vma_rung_up(slot);
-+		}
-+
-+		slot->pages_bemerged = 0;
-+		slot->pages_cowed = 0;
-+
-+		list_del_init(&slot->dedup_list);
-+	}
-+}
-+
-+static void uksm_del_vma_slot(struct vma_slot *slot)
-+{
-+	int i, j;
-+	struct rmap_list_entry *entry;
-+
-+	if (slot->snode) {
-+		/*
-+		 * In case it just failed when entering the rung, it's not
-+		 * necessary.
-+		 */
-+		rung_rm_slot(slot);
-+	}
-+
-+	if (!list_empty(&slot->dedup_list))
-+		list_del(&slot->dedup_list);
-+
-+	if (!slot->rmap_list_pool || !slot->pool_counts) {
-+		/* In case it OOMed in uksm_vma_enter() */
-+		goto out;
-+	}
-+
-+	for (i = 0; i < slot->pool_size; i++) {
-+		void *addr;
-+
-+		if (!slot->rmap_list_pool[i])
-+			continue;
-+
-+		addr = kmap(slot->rmap_list_pool[i]);
-+		for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
-+			entry = (struct rmap_list_entry *)addr + j;
-+			if (is_addr(entry->addr))
-+				continue;
-+			if (!entry->item)
-+				continue;
-+
-+			remove_rmap_item_from_tree(entry->item);
-+			free_rmap_item(entry->item);
-+			slot->pool_counts[i]--;
-+		}
-+		BUG_ON(slot->pool_counts[i]);
-+		kunmap(slot->rmap_list_pool[i]);
-+		__free_page(slot->rmap_list_pool[i]);
-+	}
-+	kfree(slot->rmap_list_pool);
-+	kfree(slot->pool_counts);
-+
-+out:
-+	slot->rung = NULL;
-+	if (slot->flags & UKSM_SLOT_IN_UKSM) {
-+		BUG_ON(uksm_pages_total < slot->pages);
-+		uksm_pages_total -= slot->pages;
-+	}
-+
-+	if (slot->fully_scanned_round == fully_scanned_round)
-+		scanned_virtual_pages -= slot->pages;
-+	else
-+		scanned_virtual_pages -= slot->pages_scanned;
-+	free_vma_slot(slot);
-+}
-+
-+
-+#define SPIN_LOCK_PERIOD	32
-+static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD];
-+static inline void cleanup_vma_slots(void)
-+{
-+	struct vma_slot *slot;
-+	int i;
-+
-+	i = 0;
-+	spin_lock(&vma_slot_list_lock);
-+	while (!list_empty(&vma_slot_del)) {
-+		slot = list_entry(vma_slot_del.next,
-+				  struct vma_slot, slot_list);
-+		list_del(&slot->slot_list);
-+		cleanup_slots[i++] = slot;
-+		if (i == SPIN_LOCK_PERIOD) {
-+			spin_unlock(&vma_slot_list_lock);
-+			while (--i >= 0)
-+				uksm_del_vma_slot(cleanup_slots[i]);
-+			i = 0;
-+			spin_lock(&vma_slot_list_lock);
-+		}
-+	}
-+	spin_unlock(&vma_slot_list_lock);
-+
-+	while (--i >= 0)
-+		uksm_del_vma_slot(cleanup_slots[i]);
-+}
-+
-+/*
-+ * Expotional moving average formula
-+ */
-+static inline unsigned long ema(unsigned long curr, unsigned long last_ema)
-+{
-+	/*
-+	 * For a very high burst, even the ema cannot work well, a false very
-+	 * high per-page time estimation can result in feedback in very high
-+	 * overhead of context switch and rung update -- this will then lead
-+	 * to higher per-paper time, this may not converge.
-+	 *
-+	 * Instead, we try to approach this value in a binary manner.
-+	 */
-+	if (curr > last_ema * 10)
-+		return last_ema * 2;
-+
-+	return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
-+}
-+
-+/*
-+ * convert cpu ratio in 1/TIME_RATIO_SCALE configured by user to
-+ * nanoseconds based on current uksm_sleep_jiffies.
-+ */
-+static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio)
-+{
-+	return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) /
-+		(TIME_RATIO_SCALE - ratio) * ratio;
-+}
-+
-+
-+static inline unsigned long rung_real_ratio(int cpu_time_ratio)
-+{
-+	unsigned long ret;
-+
-+	BUG_ON(!cpu_time_ratio);
-+
-+	if (cpu_time_ratio > 0)
-+		ret = cpu_time_ratio;
-+	else
-+		ret = (unsigned long)(-cpu_time_ratio) *
-+			uksm_max_cpu_percentage / 100UL;
-+
-+	return ret ? ret : 1;
-+}
-+
-+static noinline void uksm_calc_scan_pages(void)
-+{
-+	struct scan_rung *ladder = uksm_scan_ladder;
-+	unsigned long sleep_usecs, nsecs;
-+	unsigned long ratio;
-+	int i;
-+	unsigned long per_page;
-+
-+	if (uksm_ema_page_time > 100000 ||
-+	    (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL))
-+		uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
-+
-+	per_page = uksm_ema_page_time;
-+	BUG_ON(!per_page);
-+
-+	/*
-+	 * For every 8 eval round, we try to probe a uksm_sleep_jiffies value
-+	 * based on saved user input.
-+	 */
-+	if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL)
-+		uksm_sleep_jiffies = uksm_sleep_saved;
-+
-+	/* We require a rung scan at least 1 page in a period. */
-+	nsecs = per_page;
-+	ratio = rung_real_ratio(ladder[0].cpu_ratio);
-+	if (cpu_ratio_to_nsec(ratio) < nsecs) {
-+		sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio
-+				/ NSEC_PER_USEC;
-+		uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1;
-+	}
-+
-+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+		ratio = rung_real_ratio(ladder[i].cpu_ratio);
-+		ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) /
-+					per_page;
-+		BUG_ON(!ladder[i].pages_to_scan);
-+		uksm_calc_rung_step(&ladder[i], per_page, ratio);
-+	}
-+}
-+
-+/*
-+ * From the scan time of this round (ns) to next expected min sleep time
-+ * (ms), be careful of the possible overflows. ratio is taken from
-+ * rung_real_ratio()
-+ */
-+static inline
-+unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio)
-+{
-+	scan_time >>= 20; /* to msec level now */
-+	BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE));
-+
-+	return (unsigned int) ((unsigned long) scan_time *
-+			       (TIME_RATIO_SCALE - ratio) / ratio);
-+}
-+
-+#define __round_mask(x, y) ((__typeof__(x))((y)-1))
-+#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
-+
-+static void uksm_vma_enter(struct vma_slot **slots, unsigned long num)
-+{
-+	struct scan_rung *rung;
-+
-+	rung = &uksm_scan_ladder[0];
-+	rung_add_new_slots(rung, slots, num);
-+}
-+
-+static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE];
-+
-+static void uksm_enter_all_slots(void)
-+{
-+	struct vma_slot *slot;
-+	unsigned long index;
-+	struct list_head empty_vma_list;
-+	int i;
-+
-+	i = 0;
-+	index = 0;
-+	INIT_LIST_HEAD(&empty_vma_list);
-+
-+	spin_lock(&vma_slot_list_lock);
-+	while (!list_empty(&vma_slot_new)) {
-+		slot = list_entry(vma_slot_new.next,
-+				  struct vma_slot, slot_list);
-+
-+		if (!slot->vma->anon_vma) {
-+			list_move(&slot->slot_list, &empty_vma_list);
-+		} else if (vma_can_enter(slot->vma)) {
-+			batch_slots[index++] = slot;
-+			list_del_init(&slot->slot_list);
-+		} else {
-+			list_move(&slot->slot_list, &vma_slot_noadd);
-+		}
-+
-+		if (++i == SPIN_LOCK_PERIOD ||
-+		    (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) {
-+			spin_unlock(&vma_slot_list_lock);
-+
-+			if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) {
-+				uksm_vma_enter(batch_slots, index);
-+				index = 0;
-+			}
-+			i = 0;
-+			cond_resched();
-+			spin_lock(&vma_slot_list_lock);
-+		}
-+	}
-+
-+	list_splice(&empty_vma_list, &vma_slot_new);
-+
-+	spin_unlock(&vma_slot_list_lock);
-+
-+	if (index)
-+		uksm_vma_enter(batch_slots, index);
-+
-+}
-+
-+static inline int rung_round_finished(struct scan_rung *rung)
-+{
-+	return rung->flags & UKSM_RUNG_ROUND_FINISHED;
-+}
-+
-+static inline void judge_slot(struct vma_slot *slot)
-+{
-+	struct scan_rung *rung = slot->rung;
-+	unsigned long dedup;
-+	int deleted;
-+
-+	dedup = cal_dedup_ratio(slot);
-+	if (vma_fully_scanned(slot) && uksm_thrash_threshold)
-+		deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]);
-+	else if (dedup && dedup >= uksm_abundant_threshold)
-+		deleted = vma_rung_up(slot);
-+	else
-+		deleted = vma_rung_down(slot);
-+
-+	slot->pages_merged = 0;
-+	slot->pages_cowed = 0;
-+	slot->this_sampled = 0;
-+
-+	if (vma_fully_scanned(slot))
-+		slot->pages_scanned = 0;
-+
-+	slot->last_scanned = slot->pages_scanned;
-+
-+	/* If its deleted in above, then rung was already advanced. */
-+	if (!deleted)
-+		advance_current_scan(rung);
-+}
-+
-+
-+static inline int hash_round_finished(void)
-+{
-+	if (scanned_virtual_pages > (uksm_pages_total >> 2)) {
-+		scanned_virtual_pages = 0;
-+		if (uksm_pages_scanned)
-+			fully_scanned_round++;
-+
-+		return 1;
-+	} else {
-+		return 0;
-+	}
-+}
-+
-+#define UKSM_MMSEM_BATCH	5
-+#define BUSY_RETRY		100
-+
-+/**
-+ * uksm_do_scan()  - the main worker function.
-+ */
-+static noinline void uksm_do_scan(void)
-+{
-+	struct vma_slot *slot, *iter;
-+	struct mm_struct *busy_mm;
-+	unsigned char round_finished, all_rungs_emtpy;
-+	int i, err, mmsem_batch;
-+	unsigned long pcost;
-+	long long delta_exec;
-+	unsigned long vpages, max_cpu_ratio;
-+	unsigned long long start_time, end_time, scan_time;
-+	unsigned int expected_jiffies;
-+
-+	might_sleep();
-+
-+	vpages = 0;
-+
-+	start_time = task_sched_runtime(current);
-+	max_cpu_ratio = 0;
-+	mmsem_batch = 0;
-+
-+	for (i = 0; i < SCAN_LADDER_SIZE;) {
-+		struct scan_rung *rung = &uksm_scan_ladder[i];
-+		unsigned long ratio;
-+		int busy_retry;
-+
-+		if (!rung->pages_to_scan) {
-+			i++;
-+			continue;
-+		}
-+
-+		if (!rung->vma_root.num) {
-+			rung->pages_to_scan = 0;
-+			i++;
-+			continue;
-+		}
-+
-+		ratio = rung_real_ratio(rung->cpu_ratio);
-+		if (ratio > max_cpu_ratio)
-+			max_cpu_ratio = ratio;
-+
-+		busy_retry = BUSY_RETRY;
-+		/*
-+		 * Do not consider rung_round_finished() here, just used up the
-+		 * rung->pages_to_scan quota.
-+		 */
-+		while (rung->pages_to_scan && rung->vma_root.num &&
-+		       likely(!freezing(current))) {
-+			int reset = 0;
-+
-+			slot = rung->current_scan;
-+
-+			BUG_ON(vma_fully_scanned(slot));
-+
-+			if (mmsem_batch)
-+				err = 0;
-+			else
-+				err = try_down_read_slot_mmap_sem(slot);
-+
-+			if (err == -ENOENT) {
-+rm_slot:
-+				rung_rm_slot(slot);
-+				continue;
-+			}
-+
-+			busy_mm = slot->mm;
-+
-+			if (err == -EBUSY) {
-+				/* skip other vmas on the same mm */
-+				do {
-+					reset = advance_current_scan(rung);
-+					iter = rung->current_scan;
-+					busy_retry--;
-+					if (iter->vma->vm_mm != busy_mm ||
-+					    !busy_retry || reset)
-+						break;
-+				} while (1);
-+
-+				if (iter->vma->vm_mm != busy_mm) {
-+					continue;
-+				} else {
-+					/* scan round finsished */
-+					break;
-+				}
-+			}
-+
-+			BUG_ON(!vma_can_enter(slot->vma));
-+			if (uksm_test_exit(slot->vma->vm_mm)) {
-+				mmsem_batch = 0;
-+				mmap_read_unlock(slot->vma->vm_mm);
-+				goto rm_slot;
-+			}
-+
-+			if (mmsem_batch)
-+				mmsem_batch--;
-+			else
-+				mmsem_batch = UKSM_MMSEM_BATCH;
-+
-+			/* Ok, we have take the mmap_sem, ready to scan */
-+			scan_vma_one_page(slot);
-+			rung->pages_to_scan--;
-+			vpages++;
-+
-+			if (rung->current_offset + rung->step > slot->pages - 1
-+			    || vma_fully_scanned(slot)) {
-+				mmap_read_unlock(slot->vma->vm_mm);
-+				judge_slot(slot);
-+				mmsem_batch = 0;
-+			} else {
-+				rung->current_offset += rung->step;
-+				if (!mmsem_batch)
-+					mmap_read_unlock(slot->vma->vm_mm);
-+			}
-+
-+			busy_retry = BUSY_RETRY;
-+			cond_resched();
-+		}
-+
-+		if (mmsem_batch) {
-+			mmap_read_unlock(slot->vma->vm_mm);
-+			mmsem_batch = 0;
-+		}
-+
-+		if (freezing(current))
-+			break;
-+
-+		cond_resched();
-+	}
-+	end_time = task_sched_runtime(current);
-+	delta_exec = end_time - start_time;
-+
-+	if (freezing(current))
-+		return;
-+
-+	cleanup_vma_slots();
-+	uksm_enter_all_slots();
-+
-+	round_finished = 1;
-+	all_rungs_emtpy = 1;
-+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+		struct scan_rung *rung = &uksm_scan_ladder[i];
-+
-+		if (rung->vma_root.num) {
-+			all_rungs_emtpy = 0;
-+			if (!rung_round_finished(rung))
-+				round_finished = 0;
-+		}
-+	}
-+
-+	if (all_rungs_emtpy)
-+		round_finished = 0;
-+
-+	if (round_finished) {
-+		round_update_ladder();
-+		uksm_eval_round++;
-+
-+		if (hash_round_finished() && rshash_adjust()) {
-+			/* Reset the unstable root iff hash strength changed */
-+			uksm_hash_round++;
-+			root_unstable_tree = RB_ROOT;
-+			free_all_tree_nodes(&unstable_tree_node_list);
-+		}
-+
-+		/*
-+		 * A number of pages can hang around indefinitely on per-cpu
-+		 * pagevecs, raised page count preventing write_protect_page
-+		 * from merging them.  Though it doesn't really matter much,
-+		 * it is puzzling to see some stuck in pages_volatile until
-+		 * other activity jostles them out, and they also prevented
-+		 * LTP's KSM test from succeeding deterministically; so drain
-+		 * them here (here rather than on entry to uksm_do_scan(),
-+		 * so we don't IPI too often when pages_to_scan is set low).
-+		 */
-+		lru_add_drain_all();
-+	}
-+
-+
-+	if (vpages && delta_exec > 0) {
-+		pcost = (unsigned long) delta_exec / vpages;
-+		if (likely(uksm_ema_page_time))
-+			uksm_ema_page_time = ema(pcost, uksm_ema_page_time);
-+		else
-+			uksm_ema_page_time = pcost;
-+	}
-+
-+	uksm_calc_scan_pages();
-+	uksm_sleep_real = uksm_sleep_jiffies;
-+	/* in case of radical cpu bursts, apply the upper bound */
-+	end_time = task_sched_runtime(current);
-+	if (max_cpu_ratio && end_time > start_time) {
-+		scan_time = end_time - start_time;
-+		expected_jiffies = msecs_to_jiffies(
-+			scan_time_to_sleep(scan_time, max_cpu_ratio));
-+
-+		if (expected_jiffies > uksm_sleep_real)
-+			uksm_sleep_real = expected_jiffies;
-+
-+		/* We have a 1 second up bound for responsiveness. */
-+		if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC)
-+			uksm_sleep_real = msecs_to_jiffies(1000);
-+	}
-+
-+	return;
-+}
-+
-+static int ksmd_should_run(void)
-+{
-+	return uksm_run & UKSM_RUN_MERGE;
-+}
-+
-+static int uksm_scan_thread(void *nothing)
-+{
-+	set_freezable();
-+	set_user_nice(current, 5);
-+
-+	while (!kthread_should_stop()) {
-+		mutex_lock(&uksm_thread_mutex);
-+		if (ksmd_should_run())
-+			uksm_do_scan();
-+		mutex_unlock(&uksm_thread_mutex);
-+
-+		try_to_freeze();
-+
-+		if (ksmd_should_run()) {
-+			schedule_timeout_interruptible(uksm_sleep_real);
-+			uksm_sleep_times++;
-+		} else {
-+			wait_event_freezable(uksm_thread_wait,
-+				ksmd_should_run() || kthread_should_stop());
-+		}
-+	}
-+	return 0;
-+}
-+
-+void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
-+{
-+	struct stable_node *stable_node;
-+	struct node_vma *node_vma;
-+	struct rmap_item *rmap_item;
-+	int search_new_forks = 0;
-+	unsigned long address;
-+
-+	VM_BUG_ON_PAGE(!PageKsm(page), page);
-+	VM_BUG_ON_PAGE(!PageLocked(page), page);
-+
-+	stable_node = page_stable_node(page);
-+	if (!stable_node)
-+		return;
-+again:
-+	hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
-+		hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
-+			struct anon_vma *anon_vma = rmap_item->anon_vma;
-+			struct anon_vma_chain *vmac;
-+			struct vm_area_struct *vma;
-+
-+			cond_resched();
-+			anon_vma_lock_read(anon_vma);
-+			anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
-+						       0, ULONG_MAX) {
-+				cond_resched();
-+				vma = vmac->vma;
-+				address = get_rmap_addr(rmap_item);
-+
-+				if (address < vma->vm_start ||
-+				    address >= vma->vm_end)
-+					continue;
-+
-+				if ((rmap_item->slot->vma == vma) ==
-+				    search_new_forks)
-+					continue;
-+
-+				if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
-+					continue;
-+
-+				if (!rwc->rmap_one(page, vma, address, rwc->arg)) {
-+					anon_vma_unlock_read(anon_vma);
-+					return;
-+				}
-+
-+				if (rwc->done && rwc->done(page)) {
-+					anon_vma_unlock_read(anon_vma);
-+					return;
-+				}
-+			}
-+			anon_vma_unlock_read(anon_vma);
-+		}
-+	}
-+	if (!search_new_forks++)
-+		goto again;
-+}
-+
-+#ifdef CONFIG_MIGRATION
-+/* Common ksm interface but may be specific to uksm */
-+void ksm_migrate_page(struct page *newpage, struct page *oldpage)
-+{
-+	struct stable_node *stable_node;
-+
-+	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
-+	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
-+	VM_BUG_ON(newpage->mapping != oldpage->mapping);
-+
-+	stable_node = page_stable_node(newpage);
-+	if (stable_node) {
-+		VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
-+		stable_node->kpfn = page_to_pfn(newpage);
-+		/*
-+		 * newpage->mapping was set in advance; now we need smp_wmb()
-+		 * to make sure that the new stable_node->kpfn is visible
-+		 * to get_ksm_page() before it can see that oldpage->mapping
-+		 * has gone stale (or that PageSwapCache has been cleared).
-+		 */
-+		smp_wmb();
-+		set_page_stable_node(oldpage, NULL);
-+	}
-+}
-+#endif /* CONFIG_MIGRATION */
-+
-+#ifdef CONFIG_MEMORY_HOTREMOVE
-+static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn,
-+						 unsigned long end_pfn)
-+{
-+	struct rb_node *node;
-+
-+	for (node = rb_first(root_stable_treep); node; node = rb_next(node)) {
-+		struct stable_node *stable_node;
-+
-+		stable_node = rb_entry(node, struct stable_node, node);
-+		if (stable_node->kpfn >= start_pfn &&
-+		    stable_node->kpfn < end_pfn)
-+			return stable_node;
-+	}
-+	return NULL;
-+}
-+
-+static int uksm_memory_callback(struct notifier_block *self,
-+			       unsigned long action, void *arg)
-+{
-+	struct memory_notify *mn = arg;
-+	struct stable_node *stable_node;
-+
-+	switch (action) {
-+	case MEM_GOING_OFFLINE:
-+		/*
-+		 * Keep it very simple for now: just lock out ksmd and
-+		 * MADV_UNMERGEABLE while any memory is going offline.
-+		 * mutex_lock_nested() is necessary because lockdep was alarmed
-+		 * that here we take uksm_thread_mutex inside notifier chain
-+		 * mutex, and later take notifier chain mutex inside
-+		 * uksm_thread_mutex to unlock it.   But that's safe because both
-+		 * are inside mem_hotplug_mutex.
-+		 */
-+		mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING);
-+		break;
-+
-+	case MEM_OFFLINE:
-+		/*
-+		 * Most of the work is done by page migration; but there might
-+		 * be a few stable_nodes left over, still pointing to struct
-+		 * pages which have been offlined: prune those from the tree.
-+		 */
-+		while ((stable_node = uksm_check_stable_tree(mn->start_pfn,
-+					mn->start_pfn + mn->nr_pages)) != NULL)
-+			remove_node_from_stable_tree(stable_node, 1, 1);
-+		/* fallthrough */
-+
-+	case MEM_CANCEL_OFFLINE:
-+		mutex_unlock(&uksm_thread_mutex);
-+		break;
-+	}
-+	return NOTIFY_OK;
-+}
-+#endif /* CONFIG_MEMORY_HOTREMOVE */
-+
-+#ifdef CONFIG_SYSFS
-+/*
-+ * This all compiles without CONFIG_SYSFS, but is a waste of space.
-+ */
-+
-+#define UKSM_ATTR_RO(_name) \
-+	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
-+#define UKSM_ATTR(_name) \
-+	static struct kobj_attribute _name##_attr = \
-+		__ATTR(_name, 0644, _name##_show, _name##_store)
-+
-+static ssize_t max_cpu_percentage_show(struct kobject *kobj,
-+				    struct kobj_attribute *attr, char *buf)
-+{
-+	return sprintf(buf, "%u\n", uksm_max_cpu_percentage);
-+}
-+
-+static ssize_t max_cpu_percentage_store(struct kobject *kobj,
-+				     struct kobj_attribute *attr,
-+				     const char *buf, size_t count)
-+{
-+	unsigned long max_cpu_percentage;
-+	int err;
-+
-+	err = kstrtoul(buf, 10, &max_cpu_percentage);
-+	if (err || max_cpu_percentage > 100)
-+		return -EINVAL;
-+
-+	if (max_cpu_percentage == 100)
-+		max_cpu_percentage = 99;
-+	else if (max_cpu_percentage < 10)
-+		max_cpu_percentage = 10;
-+
-+	uksm_max_cpu_percentage = max_cpu_percentage;
-+
-+	return count;
-+}
-+UKSM_ATTR(max_cpu_percentage);
-+
-+static ssize_t sleep_millisecs_show(struct kobject *kobj,
-+				    struct kobj_attribute *attr, char *buf)
-+{
-+	return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies));
-+}
-+
-+static ssize_t sleep_millisecs_store(struct kobject *kobj,
-+				     struct kobj_attribute *attr,
-+				     const char *buf, size_t count)
-+{
-+	unsigned long msecs;
-+	int err;
-+
-+	err = kstrtoul(buf, 10, &msecs);
-+	if (err || msecs > MSEC_PER_SEC)
-+		return -EINVAL;
-+
-+	uksm_sleep_jiffies = msecs_to_jiffies(msecs);
-+	uksm_sleep_saved = uksm_sleep_jiffies;
-+
-+	return count;
-+}
-+UKSM_ATTR(sleep_millisecs);
-+
-+
-+static ssize_t cpu_governor_show(struct kobject *kobj,
-+				  struct kobj_attribute *attr, char *buf)
-+{
-+	int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
-+	int i;
-+
-+	buf[0] = '\0';
-+	for (i = 0; i < n ; i++) {
-+		if (uksm_cpu_governor == i)
-+			strcat(buf, "[");
-+
-+		strcat(buf, uksm_cpu_governor_str[i]);
-+
-+		if (uksm_cpu_governor == i)
-+			strcat(buf, "]");
-+
-+		strcat(buf, " ");
-+	}
-+	strcat(buf, "\n");
-+
-+	return strlen(buf);
-+}
-+
-+static inline void init_performance_values(void)
-+{
-+	int i;
-+	struct scan_rung *rung;
-+	struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor;
-+
-+
-+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+		rung = uksm_scan_ladder + i;
-+		rung->cpu_ratio = preset->cpu_ratio[i];
-+		rung->cover_msecs = preset->cover_msecs[i];
-+	}
-+
-+	uksm_max_cpu_percentage = preset->max_cpu;
-+}
-+
-+static ssize_t cpu_governor_store(struct kobject *kobj,
-+				   struct kobj_attribute *attr,
-+				   const char *buf, size_t count)
-+{
-+	int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
-+
-+	for (n--; n >= 0 ; n--) {
-+		if (!strncmp(buf, uksm_cpu_governor_str[n],
-+			     strlen(uksm_cpu_governor_str[n])))
-+			break;
-+	}
-+
-+	if (n < 0)
-+		return -EINVAL;
-+	else
-+		uksm_cpu_governor = n;
-+
-+	init_performance_values();
-+
-+	return count;
-+}
-+UKSM_ATTR(cpu_governor);
-+
-+static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
-+			char *buf)
-+{
-+	return sprintf(buf, "%u\n", uksm_run);
-+}
-+
-+static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
-+			 const char *buf, size_t count)
-+{
-+	int err;
-+	unsigned long flags;
-+
-+	err = kstrtoul(buf, 10, &flags);
-+	if (err || flags > UINT_MAX)
-+		return -EINVAL;
-+	if (flags > UKSM_RUN_MERGE)
-+		return -EINVAL;
-+
-+	mutex_lock(&uksm_thread_mutex);
-+	if (uksm_run != flags)
-+		uksm_run = flags;
-+	mutex_unlock(&uksm_thread_mutex);
-+
-+	if (flags & UKSM_RUN_MERGE)
-+		wake_up_interruptible(&uksm_thread_wait);
-+
-+	return count;
-+}
-+UKSM_ATTR(run);
-+
-+static ssize_t abundant_threshold_show(struct kobject *kobj,
-+				     struct kobj_attribute *attr, char *buf)
-+{
-+	return sprintf(buf, "%u\n", uksm_abundant_threshold);
-+}
-+
-+static ssize_t abundant_threshold_store(struct kobject *kobj,
-+				      struct kobj_attribute *attr,
-+				      const char *buf, size_t count)
-+{
-+	int err;
-+	unsigned long flags;
-+
-+	err = kstrtoul(buf, 10, &flags);
-+	if (err || flags > 99)
-+		return -EINVAL;
-+
-+	uksm_abundant_threshold = flags;
-+
-+	return count;
-+}
-+UKSM_ATTR(abundant_threshold);
-+
-+static ssize_t thrash_threshold_show(struct kobject *kobj,
-+				     struct kobj_attribute *attr, char *buf)
-+{
-+	return sprintf(buf, "%u\n", uksm_thrash_threshold);
-+}
-+
-+static ssize_t thrash_threshold_store(struct kobject *kobj,
-+				      struct kobj_attribute *attr,
-+				      const char *buf, size_t count)
-+{
-+	int err;
-+	unsigned long flags;
-+
-+	err = kstrtoul(buf, 10, &flags);
-+	if (err || flags > 99)
-+		return -EINVAL;
-+
-+	uksm_thrash_threshold = flags;
-+
-+	return count;
-+}
-+UKSM_ATTR(thrash_threshold);
-+
-+static ssize_t cpu_ratios_show(struct kobject *kobj,
-+			       struct kobj_attribute *attr, char *buf)
-+{
-+	int i, size;
-+	struct scan_rung *rung;
-+	char *p = buf;
-+
-+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+		rung = &uksm_scan_ladder[i];
-+
-+		if (rung->cpu_ratio > 0)
-+			size = sprintf(p, "%d ", rung->cpu_ratio);
-+		else
-+			size = sprintf(p, "MAX/%d ",
-+					TIME_RATIO_SCALE / -rung->cpu_ratio);
-+
-+		p += size;
-+	}
-+
-+	*p++ = '\n';
-+	*p = '\0';
-+
-+	return p - buf;
-+}
-+
-+static ssize_t cpu_ratios_store(struct kobject *kobj,
-+				      struct kobj_attribute *attr,
-+				      const char *buf, size_t count)
-+{
-+	int i, cpuratios[SCAN_LADDER_SIZE], err;
-+	unsigned long value;
-+	struct scan_rung *rung;
-+	char *p, *end = NULL;
-+
-+	p = kzalloc(count, GFP_KERNEL);
-+	if (!p)
-+		return -ENOMEM;
-+
-+	memcpy(p, buf, count);
-+
-+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+		if (i != SCAN_LADDER_SIZE - 1) {
-+			end = strchr(p, ' ');
-+			if (!end)
-+				return -EINVAL;
-+
-+			*end = '\0';
-+		}
-+
-+		if (strstr(p, "MAX/")) {
-+			p = strchr(p, '/') + 1;
-+			err = kstrtoul(p, 10, &value);
-+			if (err || value > TIME_RATIO_SCALE || !value)
-+				return -EINVAL;
-+
-+			cpuratios[i] = -(int) (TIME_RATIO_SCALE / value);
-+		} else {
-+			err = kstrtoul(p, 10, &value);
-+			if (err || value > TIME_RATIO_SCALE || !value)
-+				return -EINVAL;
-+
-+			cpuratios[i] = value;
-+		}
-+
-+		p = end + 1;
-+	}
-+
-+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+		rung = &uksm_scan_ladder[i];
-+
-+		rung->cpu_ratio = cpuratios[i];
-+	}
-+
-+	return count;
-+}
-+UKSM_ATTR(cpu_ratios);
-+
-+static ssize_t eval_intervals_show(struct kobject *kobj,
-+			       struct kobj_attribute *attr, char *buf)
-+{
-+	int i, size;
-+	struct scan_rung *rung;
-+	char *p = buf;
-+
-+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+		rung = &uksm_scan_ladder[i];
-+		size = sprintf(p, "%u ", rung->cover_msecs);
-+		p += size;
-+	}
-+
-+	*p++ = '\n';
-+	*p = '\0';
-+
-+	return p - buf;
-+}
-+
-+static ssize_t eval_intervals_store(struct kobject *kobj,
-+				      struct kobj_attribute *attr,
-+				      const char *buf, size_t count)
-+{
-+	int i, err;
-+	unsigned long values[SCAN_LADDER_SIZE];
-+	struct scan_rung *rung;
-+	char *p, *end = NULL;
-+	ssize_t ret = count;
-+
-+	p = kzalloc(count + 2, GFP_KERNEL);
-+	if (!p)
-+		return -ENOMEM;
-+
-+	memcpy(p, buf, count);
-+
-+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+		if (i != SCAN_LADDER_SIZE - 1) {
-+			end = strchr(p, ' ');
-+			if (!end) {
-+				ret = -EINVAL;
-+				goto out;
-+			}
-+
-+			*end = '\0';
-+		}
-+
-+		err = kstrtoul(p, 10, &values[i]);
-+		if (err) {
-+			ret = -EINVAL;
-+			goto out;
-+		}
-+
-+		p = end + 1;
-+	}
-+
-+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+		rung = &uksm_scan_ladder[i];
-+
-+		rung->cover_msecs = values[i];
-+	}
-+
-+out:
-+	kfree(p);
-+	return ret;
-+}
-+UKSM_ATTR(eval_intervals);
-+
-+static ssize_t ema_per_page_time_show(struct kobject *kobj,
-+				 struct kobj_attribute *attr, char *buf)
-+{
-+	return sprintf(buf, "%lu\n", uksm_ema_page_time);
-+}
-+UKSM_ATTR_RO(ema_per_page_time);
-+
-+static ssize_t pages_shared_show(struct kobject *kobj,
-+				 struct kobj_attribute *attr, char *buf)
-+{
-+	return sprintf(buf, "%lu\n", uksm_pages_shared);
-+}
-+UKSM_ATTR_RO(pages_shared);
-+
-+static ssize_t pages_sharing_show(struct kobject *kobj,
-+				  struct kobj_attribute *attr, char *buf)
-+{
-+	return sprintf(buf, "%lu\n", uksm_pages_sharing);
-+}
-+UKSM_ATTR_RO(pages_sharing);
-+
-+static ssize_t pages_unshared_show(struct kobject *kobj,
-+				   struct kobj_attribute *attr, char *buf)
-+{
-+	return sprintf(buf, "%lu\n", uksm_pages_unshared);
-+}
-+UKSM_ATTR_RO(pages_unshared);
-+
-+static ssize_t full_scans_show(struct kobject *kobj,
-+			       struct kobj_attribute *attr, char *buf)
-+{
-+	return sprintf(buf, "%llu\n", fully_scanned_round);
-+}
-+UKSM_ATTR_RO(full_scans);
-+
-+static ssize_t pages_scanned_show(struct kobject *kobj,
-+				  struct kobj_attribute *attr, char *buf)
-+{
-+	unsigned long base = 0;
-+	u64 delta, ret;
-+
-+	if (pages_scanned_stored) {
-+		base = pages_scanned_base;
-+		ret = pages_scanned_stored;
-+		delta = uksm_pages_scanned >> base;
-+		if (CAN_OVERFLOW_U64(ret, delta)) {
-+			ret >>= 1;
-+			delta >>= 1;
-+			base++;
-+			ret += delta;
-+		}
-+	} else {
-+		ret = uksm_pages_scanned;
-+	}
-+
-+	while (ret > ULONG_MAX) {
-+		ret >>= 1;
-+		base++;
-+	}
-+
-+	if (base)
-+		return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
-+	else
-+		return sprintf(buf, "%lu\n", (unsigned long)ret);
-+}
-+UKSM_ATTR_RO(pages_scanned);
-+
-+static ssize_t hash_strength_show(struct kobject *kobj,
-+				  struct kobj_attribute *attr, char *buf)
-+{
-+	return sprintf(buf, "%lu\n", hash_strength);
-+}
-+UKSM_ATTR_RO(hash_strength);
-+
-+static ssize_t sleep_times_show(struct kobject *kobj,
-+				  struct kobj_attribute *attr, char *buf)
-+{
-+	return sprintf(buf, "%llu\n", uksm_sleep_times);
-+}
-+UKSM_ATTR_RO(sleep_times);
-+
-+
-+static struct attribute *uksm_attrs[] = {
-+	&max_cpu_percentage_attr.attr,
-+	&sleep_millisecs_attr.attr,
-+	&cpu_governor_attr.attr,
-+	&run_attr.attr,
-+	&ema_per_page_time_attr.attr,
-+	&pages_shared_attr.attr,
-+	&pages_sharing_attr.attr,
-+	&pages_unshared_attr.attr,
-+	&full_scans_attr.attr,
-+	&pages_scanned_attr.attr,
-+	&hash_strength_attr.attr,
-+	&sleep_times_attr.attr,
-+	&thrash_threshold_attr.attr,
-+	&abundant_threshold_attr.attr,
-+	&cpu_ratios_attr.attr,
-+	&eval_intervals_attr.attr,
-+	NULL,
-+};
-+
-+static struct attribute_group uksm_attr_group = {
-+	.attrs = uksm_attrs,
-+	.name = "uksm",
-+};
-+#endif /* CONFIG_SYSFS */
-+
-+static inline void init_scan_ladder(void)
-+{
-+	int i;
-+	struct scan_rung *rung;
-+
-+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
-+		rung = uksm_scan_ladder + i;
-+		slot_tree_init_root(&rung->vma_root);
-+	}
-+
-+	init_performance_values();
-+	uksm_calc_scan_pages();
-+}
-+
-+static inline int cal_positive_negative_costs(void)
-+{
-+	struct page *p1, *p2;
-+	unsigned char *addr1, *addr2;
-+	unsigned long i, time_start, hash_cost;
-+	unsigned long loopnum = 0;
-+
-+	/*IMPORTANT: volatile is needed to prevent over-optimization by gcc. */
-+	volatile u32 hash;
-+	volatile int ret;
-+
-+	p1 = alloc_page(GFP_KERNEL);
-+	if (!p1)
-+		return -ENOMEM;
-+
-+	p2 = alloc_page(GFP_KERNEL);
-+	if (!p2)
-+		return -ENOMEM;
-+
-+	addr1 = kmap_atomic(p1);
-+	addr2 = kmap_atomic(p2);
-+	memset(addr1, prandom_u32(), PAGE_SIZE);
-+	memcpy(addr2, addr1, PAGE_SIZE);
-+
-+	/* make sure that the two pages differ in last byte */
-+	addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
-+	kunmap_atomic(addr2);
-+	kunmap_atomic(addr1);
-+
-+	time_start = jiffies;
-+	while (jiffies - time_start < 100) {
-+		for (i = 0; i < 100; i++)
-+			hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
-+		loopnum += 100;
-+	}
-+	hash_cost = (jiffies - time_start);
-+
-+	time_start = jiffies;
-+	for (i = 0; i < loopnum; i++)
-+		ret = pages_identical_with_cost(p1, p2);
-+	memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
-+	memcmp_cost /= hash_cost;
-+	pr_info("UKSM: relative memcmp_cost = %lu "
-+		"hash=%u cmp_ret=%d.\n",
-+		memcmp_cost, hash, ret);
-+
-+	__free_page(p1);
-+	__free_page(p2);
-+	return 0;
-+}
-+
-+static int init_zeropage_hash_table(void)
-+{
-+	struct page *page;
-+	char *addr;
-+	int i;
-+
-+	page = alloc_page(GFP_KERNEL);
-+	if (!page)
-+		return -ENOMEM;
-+
-+	addr = kmap_atomic(page);
-+	memset(addr, 0, PAGE_SIZE);
-+	kunmap_atomic(addr);
-+
-+	zero_hash_table = kmalloc_array(HASH_STRENGTH_MAX, sizeof(u32),
-+		GFP_KERNEL);
-+	if (!zero_hash_table)
-+		return -ENOMEM;
-+
-+	for (i = 0; i < HASH_STRENGTH_MAX; i++)
-+		zero_hash_table[i] = page_hash(page, i, 0);
-+
-+	__free_page(page);
-+
-+	return 0;
-+}
-+
-+static inline int init_random_sampling(void)
-+{
-+	unsigned long i;
-+
-+	random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
-+	if (!random_nums)
-+		return -ENOMEM;
-+
-+	for (i = 0; i < HASH_STRENGTH_FULL; i++)
-+		random_nums[i] = i;
-+
-+	for (i = 0; i < HASH_STRENGTH_FULL; i++) {
-+		unsigned long rand_range, swap_index, tmp;
-+
-+		rand_range = HASH_STRENGTH_FULL - i;
-+		swap_index = i + prandom_u32() % rand_range;
-+		tmp = random_nums[i];
-+		random_nums[i] =  random_nums[swap_index];
-+		random_nums[swap_index] = tmp;
-+	}
-+
-+	rshash_state.state = RSHASH_NEW;
-+	rshash_state.below_count = 0;
-+	rshash_state.lookup_window_index = 0;
-+
-+	return cal_positive_negative_costs();
-+}
-+
-+static int __init uksm_slab_init(void)
-+{
-+	rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
-+	if (!rmap_item_cache)
-+		goto out;
-+
-+	stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
-+	if (!stable_node_cache)
-+		goto out_free1;
-+
-+	node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0);
-+	if (!node_vma_cache)
-+		goto out_free2;
-+
-+	vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0);
-+	if (!vma_slot_cache)
-+		goto out_free3;
-+
-+	tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0);
-+	if (!tree_node_cache)
-+		goto out_free4;
-+
-+	return 0;
-+
-+out_free4:
-+	kmem_cache_destroy(vma_slot_cache);
-+out_free3:
-+	kmem_cache_destroy(node_vma_cache);
-+out_free2:
-+	kmem_cache_destroy(stable_node_cache);
-+out_free1:
-+	kmem_cache_destroy(rmap_item_cache);
-+out:
-+	return -ENOMEM;
-+}
-+
-+static void __init uksm_slab_free(void)
-+{
-+	kmem_cache_destroy(stable_node_cache);
-+	kmem_cache_destroy(rmap_item_cache);
-+	kmem_cache_destroy(node_vma_cache);
-+	kmem_cache_destroy(vma_slot_cache);
-+	kmem_cache_destroy(tree_node_cache);
-+}
-+
-+/* Common interface to ksm, different to it. */
-+int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
-+		unsigned long end, int advice, unsigned long *vm_flags)
-+{
-+	int err;
-+
-+	switch (advice) {
-+	case MADV_MERGEABLE:
-+		return 0;		/* just ignore the advice */
-+
-+	case MADV_UNMERGEABLE:
-+		if (!(*vm_flags & VM_MERGEABLE) || !uksm_flags_can_scan(*vm_flags))
-+			return 0;		/* just ignore the advice */
-+
-+		if (vma->anon_vma) {
-+			err = unmerge_uksm_pages(vma, start, end);
-+			if (err)
-+				return err;
-+		}
-+
-+		uksm_remove_vma(vma);
-+		*vm_flags &= ~VM_MERGEABLE;
-+		break;
-+	}
-+
-+	return 0;
-+}
-+
-+/* Common interface to ksm, actually the same. */
-+struct page *ksm_might_need_to_copy(struct page *page,
-+			struct vm_area_struct *vma, unsigned long address)
-+{
-+	struct anon_vma *anon_vma = page_anon_vma(page);
-+	struct page *new_page;
-+
-+	if (PageKsm(page)) {
-+		if (page_stable_node(page))
-+			return page;	/* no need to copy it */
-+	} else if (!anon_vma) {
-+		return page;		/* no need to copy it */
-+	} else if (anon_vma->root == vma->anon_vma->root &&
-+		 page->index == linear_page_index(vma, address)) {
-+		return page;		/* still no need to copy it */
-+	}
-+	if (!PageUptodate(page))
-+		return page;		/* let do_swap_page report the error */
-+
-+	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-+	if (new_page) {
-+		copy_user_highpage(new_page, page, address, vma);
-+
-+		SetPageDirty(new_page);
-+		__SetPageUptodate(new_page);
-+		__SetPageLocked(new_page);
-+	}
-+
-+	return new_page;
-+}
-+
-+/* Copied from mm/ksm.c and required from 5.1 */
-+bool reuse_ksm_page(struct page *page,
-+		    struct vm_area_struct *vma,
-+		    unsigned long address)
-+{
-+#ifdef CONFIG_DEBUG_VM
-+	if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
-+			WARN_ON(!page_mapped(page)) ||
-+			WARN_ON(!PageLocked(page))) {
-+		dump_page(page, "reuse_ksm_page");
-+		return false;
-+	}
-+#endif
-+
-+	if (PageSwapCache(page) || !page_stable_node(page))
-+		return false;
-+	/* Prohibit parallel get_ksm_page() */
-+	if (!page_ref_freeze(page, 1))
-+		return false;
-+
-+	page_move_anon_rmap(page, vma);
-+	page->index = linear_page_index(vma, address);
-+	page_ref_unfreeze(page, 1);
-+
-+	return true;
-+}
-+
-+static int __init uksm_init(void)
-+{
-+	struct task_struct *uksm_thread;
-+	int err;
-+
-+	uksm_sleep_jiffies = msecs_to_jiffies(100);
-+	uksm_sleep_saved = uksm_sleep_jiffies;
-+
-+	slot_tree_init();
-+	init_scan_ladder();
-+
-+
-+	err = init_random_sampling();
-+	if (err)
-+		goto out_free2;
-+
-+	err = uksm_slab_init();
-+	if (err)
-+		goto out_free1;
-+
-+	err = init_zeropage_hash_table();
-+	if (err)
-+		goto out_free0;
-+
-+	uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd");
-+	if (IS_ERR(uksm_thread)) {
-+		pr_err("uksm: creating kthread failed\n");
-+		err = PTR_ERR(uksm_thread);
-+		goto out_free;
-+	}
-+
-+#ifdef CONFIG_SYSFS
-+	err = sysfs_create_group(mm_kobj, &uksm_attr_group);
-+	if (err) {
-+		pr_err("uksm: register sysfs failed\n");
-+		kthread_stop(uksm_thread);
-+		goto out_free;
-+	}
-+#else
-+	uksm_run = UKSM_RUN_MERGE;	/* no way for user to start it */
-+
-+#endif /* CONFIG_SYSFS */
-+
-+#ifdef CONFIG_MEMORY_HOTREMOVE
-+	/*
-+	 * Choose a high priority since the callback takes uksm_thread_mutex:
-+	 * later callbacks could only be taking locks which nest within that.
-+	 */
-+	hotplug_memory_notifier(uksm_memory_callback, 100);
-+#endif
-+	return 0;
-+
-+out_free:
-+	kfree(zero_hash_table);
-+out_free0:
-+	uksm_slab_free();
-+out_free1:
-+	kfree(random_nums);
-+out_free2:
-+	kfree(uksm_scan_ladder);
-+	return err;
-+}
-+
-+#ifdef MODULE
-+subsys_initcall(ksm_init);
-+#else
-+late_initcall(uksm_init);
-+#endif
-+
-diff -Nur a/mm/vmstat.c b/mm/vmstat.c
---- a/mm/vmstat.c	2020-12-30 10:54:29.000000000 +0000
-+++ b/mm/vmstat.c	2021-01-03 14:22:34.502459119 +0000
-@@ -1216,6 +1216,9 @@
- 	"nr_shadow_call_stack",
- #endif
- 
-+#ifdef CONFIG_UKSM
-+	"nr_uksm_zero_pages",
-+#endif
- 	/* enum writeback_stat_item counters */
- 	"nr_dirty_threshold",
- 	"nr_dirty_background_threshold",
diff --git a/sys-kernel/linux-image-redcore-lts/files/5.10-uksm.patch b/sys-kernel/linux-image-redcore-lts/files/5.10-uksm.patch
new file mode 100644
index 00000000..eacd7b69
--- /dev/null
+++ b/sys-kernel/linux-image-redcore-lts/files/5.10-uksm.patch
@@ -0,0 +1,6935 @@
+diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt
+new file mode 100644
+index 000000000000..be19a3127001
+--- /dev/null
++++ b/Documentation/vm/uksm.txt
+@@ -0,0 +1,61 @@
++The Ultra Kernel Samepage Merging feature
++----------------------------------------------
++/*
++ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
++ *
++ * This is an improvement upon KSM. Some basic data structures and routines
++ * are borrowed from ksm.c .
++ *
++ * Its new features:
++ * 1. Full system scan:
++ *      It automatically scans all user processes' anonymous VMAs. Kernel-user
++ *      interaction to submit a memory area to KSM is no longer needed.
++ *
++ * 2. Rich area detection:
++ *      It automatically detects rich areas containing abundant duplicated
++ *      pages based. Rich areas are given a full scan speed. Poor areas are
++ *      sampled at a reasonable speed with very low CPU consumption.
++ *
++ * 3. Ultra Per-page scan speed improvement:
++ *      A new hash algorithm is proposed. As a result, on a machine with
++ *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
++ *      can scan memory areas that do not contain duplicated pages at speed of
++ *      627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of
++ *      477MB/sec ~ 923MB/sec.
++ *
++ * 4. Thrashing area avoidance:
++ *      Thrashing area (a VMA that has frequent KSM page break-out) can be
++ *      filtered out. My benchmark shows it's more efficient than KSM's per-page
++ *      hash value based volatile page detection.
++ *
++ *
++ * 5. Misc changes upon KSM:
++ *      * It has a fully x86-optimized memcmp dedicated for 4-byte-aligned page
++ *        comparison. It's much faster than default C version on x86.
++ *      * rmap_item now has an struct *page member to loosely cache a
++ *        address-->page mapping, which reduces too much time-costly
++ *        follow_page().
++ *      * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
++ *      * try_to_merge_two_pages() now can revert a pte if it fails. No break_
++ *        ksm is needed for this case.
++ *
++ * 6. Full Zero Page consideration(contributed by Figo Zhang)
++ *    Now uksmd considers full zero pages as special pages and merges them into
++ *    a special unswappable uksm zero page.
++ */
++
++ChangeLog:
++
++2012-05-05 The creation of this Doc
++2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up.
++2012-05-28 UKSM 0.1.1.2 bug fix release
++2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2
++2012-07-2  UKSM 0.1.2-beta2
++2012-07-10 UKSM 0.1.2-beta3
++2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization.
++2012-10-13 UKSM 0.1.2.1 Bug fixes.
++2012-12-31 UKSM 0.1.2.2 Minor bug fixes.
++2014-07-02 UKSM 0.1.2.3 Fix a " __this_cpu_read() in preemptible bug".
++2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger annoying warnings.
++2016-09-10 UKSM 0.1.2.5 Fix a bug in dedup ratio calculation.
++2017-02-26 UKSM 0.1.2.6 Fix a bug in hugetlbpage handling and a race bug with page migration.
+diff --git a/fs/exec.c b/fs/exec.c
+index 547a2390baf5..fc64a20db6bd 100644
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -64,6 +64,7 @@
+ #include <linux/compat.h>
+ #include <linux/vmalloc.h>
+ #include <linux/io_uring.h>
++#include <linux/ksm.h>
+ 
+ #include <linux/uaccess.h>
+ #include <asm/mmu_context.h>
+diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
+index 887a5532e449..581a6762868e 100644
+--- a/fs/proc/meminfo.c
++++ b/fs/proc/meminfo.c
+@@ -108,7 +108,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
+ #endif
+ 	show_val_kb(m, "PageTables:     ",
+ 		    global_zone_page_state(NR_PAGETABLE));
+-
++#ifdef CONFIG_UKSM
++	show_val_kb(m, "KsmZeroPages:     ",
++		    global_zone_page_state(NR_UKSM_ZERO_PAGES));
++#endif
+ 	show_val_kb(m, "NFS_Unstable:   ", 0);
+ 	show_val_kb(m, "Bounce:         ",
+ 		    global_zone_page_state(NR_BOUNCE));
+diff --git a/include/linux/ksm.h b/include/linux/ksm.h
+index 161e8164abcf..f0dbdf3c986a 100644
+--- a/include/linux/ksm.h
++++ b/include/linux/ksm.h
+@@ -21,20 +21,16 @@ struct mem_cgroup;
+ #ifdef CONFIG_KSM
+ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
+ 		unsigned long end, int advice, unsigned long *vm_flags);
+-int __ksm_enter(struct mm_struct *mm);
+-void __ksm_exit(struct mm_struct *mm);
+ 
+-static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
++static inline struct stable_node *page_stable_node(struct page *page)
+ {
+-	if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
+-		return __ksm_enter(mm);
+-	return 0;
++	return PageKsm(page) ? page_rmapping(page) : NULL;
+ }
+ 
+-static inline void ksm_exit(struct mm_struct *mm)
++static inline void set_page_stable_node(struct page *page,
++					struct stable_node *stable_node)
+ {
+-	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
+-		__ksm_exit(mm);
++	page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
+ }
+ 
+ /*
+@@ -54,6 +50,33 @@ struct page *ksm_might_need_to_copy(struct page *page,
+ void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
+ void ksm_migrate_page(struct page *newpage, struct page *oldpage);
+ 
++#ifdef CONFIG_KSM_LEGACY
++int __ksm_enter(struct mm_struct *mm);
++void __ksm_exit(struct mm_struct *mm);
++static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
++{
++	if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
++		return __ksm_enter(mm);
++	return 0;
++}
++
++static inline void ksm_exit(struct mm_struct *mm)
++{
++	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
++		__ksm_exit(mm);
++}
++
++#elif defined(CONFIG_UKSM)
++static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
++{
++	return 0;
++}
++
++static inline void ksm_exit(struct mm_struct *mm)
++{
++}
++#endif /* CONFIG_KSM_LEGACY / CONFIG_UKSM */
++
+ #else  /* !CONFIG_KSM */
+ 
+ static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
+@@ -89,4 +112,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
+ #endif /* CONFIG_MMU */
+ #endif /* !CONFIG_KSM */
+ 
++#include <linux/uksm.h>
++
+ #endif /* __LINUX_KSM_H */
+diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
+index 5a9238f6caad..5dd1ccf5cb69 100644
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -371,6 +371,9 @@ struct vm_area_struct {
+ 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
+ #endif
+ 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
++#ifdef CONFIG_UKSM
++	struct vma_slot *uksm_vma_slot;
++#endif
+ } __randomize_layout;
+ 
+ struct core_thread {
+diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
+index fb3bf696c05e..e4477c3a9a4b 100644
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -159,6 +159,9 @@ enum zone_stat_item {
+ 	NR_ZSPAGES,		/* allocated in zsmalloc */
+ #endif
+ 	NR_FREE_CMA_PAGES,
++#ifdef CONFIG_UKSM
++	NR_UKSM_ZERO_PAGES,
++#endif
+ 	NR_VM_ZONE_STAT_ITEMS };
+ 
+ enum node_stat_item {
+diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
+index e237004d498d..092cdfb7090b 100644
+--- a/include/linux/pgtable.h
++++ b/include/linux/pgtable.h
+@@ -1060,12 +1060,25 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
+ extern void untrack_pfn_moved(struct vm_area_struct *vma);
+ #endif
+ 
++#ifdef CONFIG_UKSM
++static inline int is_uksm_zero_pfn(unsigned long pfn)
++{
++	extern unsigned long uksm_zero_pfn;
++	return pfn == uksm_zero_pfn;
++}
++#else
++static inline int is_uksm_zero_pfn(unsigned long pfn)
++{
++	return 0;
++}
++#endif
++
+ #ifdef __HAVE_COLOR_ZERO_PAGE
+ static inline int is_zero_pfn(unsigned long pfn)
+ {
+ 	extern unsigned long zero_pfn;
+ 	unsigned long offset_from_zero_pfn = pfn - zero_pfn;
+-	return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
++	return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn);
+ }
+ 
+ #define my_zero_pfn(addr)	page_to_pfn(ZERO_PAGE(addr))
+@@ -1074,7 +1087,7 @@ static inline int is_zero_pfn(unsigned long pfn)
+ static inline int is_zero_pfn(unsigned long pfn)
+ {
+ 	extern unsigned long zero_pfn;
+-	return pfn == zero_pfn;
++	return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn));
+ }
+ 
+ static inline unsigned long my_zero_pfn(unsigned long addr)
+diff --git a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h
+new file mode 100644
+index 000000000000..d71edba6b63f
+--- /dev/null
++++ b/include/linux/sradix-tree.h
+@@ -0,0 +1,77 @@
++#ifndef _LINUX_SRADIX_TREE_H
++#define _LINUX_SRADIX_TREE_H
++
++
++#define INIT_SRADIX_TREE(root, mask)					\
++do {									\
++	(root)->height = 0;						\
++	(root)->gfp_mask = (mask);					\
++	(root)->rnode = NULL;						\
++} while (0)
++
++#define ULONG_BITS	(sizeof(unsigned long) * 8)
++#define SRADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
++//#define SRADIX_TREE_MAP_SHIFT	6
++//#define SRADIX_TREE_MAP_SIZE	(1UL << SRADIX_TREE_MAP_SHIFT)
++//#define SRADIX_TREE_MAP_MASK	(SRADIX_TREE_MAP_SIZE-1)
++
++struct sradix_tree_node {
++	unsigned int	height;		/* Height from the bottom */
++	unsigned int	count;
++	unsigned int	fulls;		/* Number of full sublevel trees */
++	struct sradix_tree_node *parent;
++	void *stores[0];
++};
++
++/* A simple radix tree implementation */
++struct sradix_tree_root {
++	unsigned int            height;
++	struct sradix_tree_node *rnode;
++
++	/* Where found to have available empty stores in its sublevels */
++	struct sradix_tree_node *enter_node;
++	unsigned int shift;
++	unsigned int stores_size;
++	unsigned int mask;
++	unsigned long min;	/* The first hole index */
++	unsigned long num;
++	//unsigned long *height_to_maxindex;
++
++	/* How the node is allocated and freed. */
++	struct sradix_tree_node *(*alloc)(void);
++	void (*free)(struct sradix_tree_node *node);
++
++	/* When a new node is added and removed */
++	void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child);
++	void (*assign)(struct sradix_tree_node *node, unsigned int index, void *item);
++	void (*rm)(struct sradix_tree_node *node, unsigned int offset);
++};
++
++struct sradix_tree_path {
++	struct sradix_tree_node *node;
++	int offset;
++};
++
++static inline
++void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift)
++{
++	root->height = 0;
++	root->rnode = NULL;
++	root->shift = shift;
++	root->stores_size = 1UL << shift;
++	root->mask = root->stores_size - 1;
++}
++
++
++extern void *sradix_tree_next(struct sradix_tree_root *root,
++		       struct sradix_tree_node *node, unsigned long index,
++		       int (*iter)(void *, unsigned long));
++
++extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num);
++
++extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
++			struct sradix_tree_node *node, unsigned long index);
++
++extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index);
++
++#endif /* _LINUX_SRADIX_TREE_H */
+diff --git a/include/linux/uksm.h b/include/linux/uksm.h
+new file mode 100644
+index 000000000000..bb8651f534f2
+--- /dev/null
++++ b/include/linux/uksm.h
+@@ -0,0 +1,149 @@
++#ifndef __LINUX_UKSM_H
++#define __LINUX_UKSM_H
++/*
++ * Memory merging support.
++ *
++ * This code enables dynamic sharing of identical pages found in different
++ * memory areas, even if they are not shared by fork().
++ */
++
++/* if !CONFIG_UKSM this file should not be compiled at all. */
++#ifdef CONFIG_UKSM
++
++#include <linux/bitops.h>
++#include <linux/mm.h>
++#include <linux/pagemap.h>
++#include <linux/rmap.h>
++#include <linux/sched.h>
++
++extern unsigned long zero_pfn __read_mostly;
++extern unsigned long uksm_zero_pfn __read_mostly;
++extern struct page *empty_uksm_zero_page;
++
++/* must be done before linked to mm */
++extern void uksm_vma_add_new(struct vm_area_struct *vma);
++extern void uksm_remove_vma(struct vm_area_struct *vma);
++
++#define UKSM_SLOT_NEED_SORT	(1 << 0)
++#define UKSM_SLOT_NEED_RERAND	(1 << 1)
++#define UKSM_SLOT_SCANNED	(1 << 2) /* It's scanned in this round */
++#define UKSM_SLOT_FUL_SCANNED	(1 << 3)
++#define UKSM_SLOT_IN_UKSM	(1 << 4)
++
++struct vma_slot {
++	struct sradix_tree_node *snode;
++	unsigned long sindex;
++
++	struct list_head slot_list;
++	unsigned long fully_scanned_round;
++	unsigned long dedup_num;
++	unsigned long pages_scanned;
++	unsigned long this_sampled;
++	unsigned long last_scanned;
++	unsigned long pages_to_scan;
++	struct scan_rung *rung;
++	struct page **rmap_list_pool;
++	unsigned int *pool_counts;
++	unsigned long pool_size;
++	struct vm_area_struct *vma;
++	struct mm_struct *mm;
++	unsigned long ctime_j;
++	unsigned long pages;
++	unsigned long flags;
++	unsigned long pages_cowed; /* pages cowed this round */
++	unsigned long pages_merged; /* pages merged this round */
++	unsigned long pages_bemerged;
++
++	/* when it has page merged in this eval round */
++	struct list_head dedup_list;
++};
++
++static inline void uksm_unmap_zero_page(pte_t pte)
++{
++	if (pte_pfn(pte) == uksm_zero_pfn)
++		__dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
++}
++
++static inline void uksm_map_zero_page(pte_t pte)
++{
++	if (pte_pfn(pte) == uksm_zero_pfn)
++		__inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
++}
++
++static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
++{
++	if (vma->uksm_vma_slot && PageKsm(page))
++		vma->uksm_vma_slot->pages_cowed++;
++}
++
++static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
++{
++	if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn)
++		vma->uksm_vma_slot->pages_cowed++;
++}
++
++static inline int uksm_flags_can_scan(unsigned long vm_flags)
++{
++#ifdef VM_SAO
++		if (vm_flags & VM_SAO)
++			return 0;
++#endif
++
++	return !(vm_flags & (VM_PFNMAP | VM_IO  | VM_DONTEXPAND |
++			     VM_HUGETLB | VM_MIXEDMAP | VM_SHARED
++			     | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN));
++}
++
++static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
++{
++	if (uksm_flags_can_scan(*vm_flags_p))
++		*vm_flags_p |= VM_MERGEABLE;
++}
++
++/*
++ * Just a wrapper for BUG_ON for where ksm_zeropage must not be. TODO: it will
++ * be removed when uksm zero page patch is stable enough.
++ */
++static inline void uksm_bugon_zeropage(pte_t pte)
++{
++	BUG_ON(pte_pfn(pte) == uksm_zero_pfn);
++}
++#else
++static inline void uksm_vma_add_new(struct vm_area_struct *vma)
++{
++}
++
++static inline void uksm_remove_vma(struct vm_area_struct *vma)
++{
++}
++
++static inline void uksm_unmap_zero_page(pte_t pte)
++{
++}
++
++static inline void uksm_map_zero_page(pte_t pte)
++{
++}
++
++static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
++{
++}
++
++static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
++{
++}
++
++static inline int uksm_flags_can_scan(unsigned long vm_flags)
++{
++	return 0;
++}
++
++static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
++{
++}
++
++static inline void uksm_bugon_zeropage(pte_t pte)
++{
++}
++#endif /* !CONFIG_UKSM */
++#endif /* __LINUX_UKSM_H */
+diff --git a/kernel/fork.c b/kernel/fork.c
+index 6d266388d380..bb52f8731dd6 100644
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -587,7 +587,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
+ 		__vma_link_rb(mm, tmp, rb_link, rb_parent);
+ 		rb_link = &tmp->vm_rb.rb_right;
+ 		rb_parent = &tmp->vm_rb;
+-
++		uksm_vma_add_new(tmp);
+ 		mm->map_count++;
+ 		if (!(tmp->vm_flags & VM_WIPEONFORK))
+ 			retval = copy_page_range(tmp, mpnt);
+diff --git a/lib/Makefile b/lib/Makefile
+index d415fc7067c5..e4045ebec8cd 100644
+--- a/lib/Makefile
++++ b/lib/Makefile
+@@ -31,7 +31,7 @@ endif
+ KCSAN_SANITIZE_random32.o := n
+ 
+ lib-y := ctype.o string.o vsprintf.o cmdline.o \
+-	 rbtree.o radix-tree.o timerqueue.o xarray.o \
++	 rbtree.o radix-tree.o sradix-tree.o timerqueue.o xarray.o \
+ 	 idr.o extable.o sha1.o irq_regs.o argv_split.o \
+ 	 flex_proportions.o ratelimit.o show_mem.o \
+ 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
+diff --git a/lib/sradix-tree.c b/lib/sradix-tree.c
+new file mode 100644
+index 000000000000..ab21e6309b93
+--- /dev/null
++++ b/lib/sradix-tree.c
+@@ -0,0 +1,476 @@
++#include <linux/errno.h>
++#include <linux/mm.h>
++#include <linux/mman.h>
++#include <linux/spinlock.h>
++#include <linux/slab.h>
++#include <linux/gcd.h>
++#include <linux/sradix-tree.h>
++
++static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node)
++{
++	return node->fulls == root->stores_size ||
++		(node->height == 1 && node->count == root->stores_size);
++}
++
++/*
++ *	Extend a sradix tree so it can store key @index.
++ */
++static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index)
++{
++	struct sradix_tree_node *node;
++	unsigned int height;
++
++	if (unlikely(root->rnode == NULL)) {
++		if (!(node = root->alloc()))
++			return -ENOMEM;
++
++		node->height = 1;
++		root->rnode = node;
++		root->height = 1;
++	}
++
++	/* Figure out what the height should be.  */
++	height = root->height;
++	index >>= root->shift * height;
++
++	while (index) {
++		index >>= root->shift;
++		height++;
++	}
++
++	while (height > root->height) {
++		unsigned int newheight;
++
++		if (!(node = root->alloc()))
++			return -ENOMEM;
++
++		/* Increase the height.  */
++		node->stores[0] = root->rnode;
++		root->rnode->parent = node;
++		if (root->extend)
++			root->extend(node, root->rnode);
++
++		newheight = root->height + 1;
++		node->height = newheight;
++		node->count = 1;
++		if (sradix_node_full(root, root->rnode))
++			node->fulls = 1;
++
++		root->rnode = node;
++		root->height = newheight;
++	}
++
++	return 0;
++}
++
++/*
++ * Search the next item from the current node, that is not NULL
++ * and can satisfy iter().
++ */
++void *sradix_tree_next(struct sradix_tree_root *root,
++		       struct sradix_tree_node *node, unsigned long index,
++		       int (*iter)(void *item, unsigned long height))
++{
++	unsigned long offset;
++	void *item;
++
++	if (unlikely(node == NULL)) {
++		node = root->rnode;
++		for (offset = 0; offset < root->stores_size; offset++) {
++			item = node->stores[offset];
++			if (item && (!iter || iter(item, node->height)))
++				break;
++		}
++
++		if (unlikely(offset >= root->stores_size))
++			return NULL;
++
++		if (node->height == 1)
++			return item;
++		else
++			goto go_down;
++	}
++
++	while (node) {
++		offset = (index & root->mask) + 1;
++		for (; offset < root->stores_size; offset++) {
++			item = node->stores[offset];
++			if (item && (!iter || iter(item, node->height)))
++				break;
++		}
++
++		if (offset < root->stores_size)
++			break;
++
++		node = node->parent;
++		index >>= root->shift;
++	}
++
++	if (!node)
++		return NULL;
++
++	while (node->height > 1) {
++go_down:
++		node = item;
++		for (offset = 0; offset < root->stores_size; offset++) {
++			item = node->stores[offset];
++			if (item && (!iter || iter(item, node->height)))
++				break;
++		}
++
++		if (unlikely(offset >= root->stores_size))
++			return NULL;
++	}
++
++	BUG_ON(offset > root->stores_size);
++
++	return item;
++}
++
++/*
++ * Insert @num items from @item[] into the first free slots at or after
++ * root->min. Returns 0, or a negative errno from extend/alloc failure.
++ */
++int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num)
++{
++	unsigned long index;
++	unsigned int height;
++	struct sradix_tree_node *node, *tmp = NULL;
++	int offset, offset_saved;
++	void **store = NULL;
++	int error, i, j, shift;
++
++go_on:
++	index = root->min;
++
++	if (root->enter_node && !sradix_node_full(root, root->enter_node)) {
++		node = root->enter_node;
++		BUG_ON((index >> (root->shift * root->height)));
++	} else {
++		node = root->rnode;
++		if (node == NULL || (index >> (root->shift * root->height))
++		    || sradix_node_full(root, node)) {
++			error = sradix_tree_extend(root, index);
++			if (error)
++				return error;
++
++			node = root->rnode;
++		}
++	}
++
++
++	height = node->height;
++	shift = (height - 1) * root->shift;
++	offset = (index >> shift) & root->mask;
++	while (shift > 0) {
++		offset_saved = offset;
++		for (; offset < root->stores_size; offset++) {
++			store = &node->stores[offset];
++			tmp = *store;
++
++			if (!tmp || !sradix_node_full(root, tmp))
++				break;
++		}
++		BUG_ON(offset >= root->stores_size);
++
++		if (offset != offset_saved) {
++			index += (unsigned long)(offset - offset_saved) << shift;
++			index &= ~((1UL << shift) - 1);
++		}
++
++		if (!tmp) {
++			if (!(tmp = root->alloc()))
++				return -ENOMEM;
++
++			tmp->height = shift / root->shift;
++			*store = tmp;
++			tmp->parent = node;
++			node->count++;
++//			if (root->extend)
++//				root->extend(node, tmp);
++		}
++
++		node = tmp;
++		shift -= root->shift;
++		offset = (index >> shift) & root->mask;
++	}
++
++	BUG_ON(node->height != 1);
++
++
++	store = &node->stores[offset];
++	for (i = 0, j = 0;
++	      j < root->stores_size - node->count &&
++	      i < root->stores_size - offset && j < num; i++) {
++		if (!store[i]) {
++			store[i] = item[j];
++			if (root->assign)
++				root->assign(node, index + i, item[j]);
++			j++;
++		}
++	}
++
++	node->count += j;
++	root->num += j;
++	num -= j;
++
++	while (sradix_node_full(root, node)) {
++		node = node->parent;
++		if (!node)
++			break;
++
++		node->fulls++;
++	}
++
++	if (unlikely(!node)) {
++		/* All nodes are full; 1UL keeps the shift out of 32-bit int */
++		root->min = 1UL << (root->height * root->shift);
++		root->enter_node = NULL;
++	} else {
++		root->min = index + i - 1;
++		root->min |= (1UL << (node->height - 1)) - 1;
++		root->min++;
++		root->enter_node = node;
++	}
++
++	if (num) {
++		item += j;
++		goto go_on;
++	}
++
++	return 0;
++}
++
++
++/**
++ *	sradix_tree_shrink    -    shrink height of a sradix tree to minimal
++ *      @root		sradix tree root
++ *
++ */
++static inline void sradix_tree_shrink(struct sradix_tree_root *root)
++{
++	/* try to shrink tree height */
++	while (root->height > 1) {
++		struct sradix_tree_node *to_free = root->rnode;
++
++		/*
++		 * The candidate node has more than one child, or its child
++		 * is not at the leftmost store, we cannot shrink.
++		 */
++		if (to_free->count != 1 || !to_free->stores[0])
++			break;
++
++		root->rnode = to_free->stores[0];
++		root->rnode->parent = NULL;
++		root->height--;
++		if (unlikely(root->enter_node == to_free))
++			root->enter_node = NULL;
++		root->free(to_free);
++	}
++}
++
++/*
++ * Del the item on the known leaf node and index
++ */
++void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
++				  struct sradix_tree_node *node, unsigned long index)
++{
++	unsigned int offset;
++	struct sradix_tree_node *start, *end;
++
++	BUG_ON(node->height != 1);
++
++	start = node;
++	while (node && !(--node->count))
++		node = node->parent;
++
++	end = node;
++	if (!node) {
++		root->rnode = NULL;
++		root->height = 0;
++		root->min = 0;
++		root->num = 0;
++		root->enter_node = NULL;
++	} else {
++		offset = (index >> (root->shift * (node->height - 1))) & root->mask;
++		if (root->rm)
++			root->rm(node, offset);
++		node->stores[offset] = NULL;
++		root->num--;
++		if (root->min > index) {
++			root->min = index;
++			root->enter_node = node;
++		}
++	}
++
++	if (start != end) {
++		do {
++			node = start;
++			start = start->parent;
++			if (unlikely(root->enter_node == node))
++				root->enter_node = end;
++			root->free(node);
++		} while (start != end);
++
++		/*
++		 * Note that shrink may free "end", so enter_node still need to
++		 * be checked inside.
++		 */
++		sradix_tree_shrink(root);
++	} else if (node->count == root->stores_size - 1) {
++		/* It WAS a full leaf node. Update the ancestors */
++		node = node->parent;
++		while (node) {
++			node->fulls--;
++			if (node->fulls != root->stores_size - 1)
++				break;
++
++			node = node->parent;
++		}
++	}
++}
++
++void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index)
++{
++	unsigned int height, offset;
++	struct sradix_tree_node *node;
++	int shift;
++
++	node = root->rnode;
++	if (node == NULL || (index >> (root->shift * root->height)))
++		return NULL;
++
++	height = root->height;
++	shift = (height - 1) * root->shift;
++
++	do {
++		offset = (index >> shift) & root->mask;
++		node = node->stores[offset];
++		if (!node)
++			return NULL;
++
++		shift -= root->shift;
++	} while (shift >= 0);
++
++	return node;
++}
++
++/*
++ * Return the item at @index if it exists; otherwise, when @item_alloc is
++ * given, create it in place and return it. Returns NULL on failure/absence.
++ */
++void *sradix_tree_lookup_create(struct sradix_tree_root *root,
++			unsigned long index, void *(*item_alloc)(void))
++{
++	unsigned int height, offset;
++	struct sradix_tree_node *node, *tmp;
++	void *item;
++	int shift, error;
++
++	if (root->rnode == NULL || (index >> (root->shift * root->height))) {
++		if (item_alloc) {
++			error = sradix_tree_extend(root, index);
++			if (error)
++				return NULL;
++		} else {
++			return NULL;
++		}
++	}
++
++	node = root->rnode;
++	height = root->height;
++	shift = (height - 1) * root->shift;
++
++	/* Descend interior levels only; a height-1 root is already the leaf. */
++	while (shift > 0) {
++		offset = (index >> shift) & root->mask;
++		if (!node->stores[offset]) {
++			if (!(tmp = root->alloc()))
++				return NULL;
++
++			tmp->height = shift / root->shift;
++			node->stores[offset] = tmp;
++			tmp->parent = node;
++			node->count++;
++			node = tmp;
++		} else {
++			node = node->stores[offset];
++		}
++
++		shift -= root->shift;
++	}
++
++	BUG_ON(node->height != 1);
++	offset = index & root->mask;
++	if (node->stores[offset]) {
++		return node->stores[offset];
++	} else if (item_alloc) {
++		if (!(item = item_alloc()))
++			return NULL;
++
++		node->stores[offset] = item;
++
++		/*
++		 * NOTE: we do NOT call root->assign here, since this item is
++		 * newly created by us having no meaning. Caller can call this
++		 * if it's necessary to do so.
++		 */
++
++		node->count++;
++		root->num++;
++
++		while (sradix_node_full(root, node)) {
++			node = node->parent;
++			if (!node)
++				break;
++
++			node->fulls++;
++		}
++
++		if (unlikely(!node)) {
++			/* All nodes are full */
++			root->min = 1UL << (root->height * root->shift);
++		} else {
++			if (root->min == index) {
++				root->min |= (1UL << (node->height - 1)) - 1;
++				root->min++;
++				root->enter_node = node;
++			}
++		}
++
++		return item;
++	} else {
++		return NULL;
++	}
++}
++
++/* Delete the item stored at @index. Returns 0, or -ENOENT if it is absent. */
++int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index)
++{
++	unsigned int height, offset;
++	struct sradix_tree_node *node;
++	int shift;
++
++	node = root->rnode;
++	if (node == NULL || (index >> (root->shift * root->height)))
++		return -ENOENT;
++
++	height = root->height;
++	shift = (height - 1) * root->shift;
++
++	/* Descend interior levels only; a height-1 root is already the leaf. */
++	while (shift > 0) {
++		offset = (index >> shift) & root->mask;
++		node = node->stores[offset];
++		if (!node)
++			return -ENOENT;
++		shift -= root->shift;
++	}
++
++	offset = index & root->mask;
++	if (!node->stores[offset])
++		return -ENOENT;
++
++	sradix_tree_delete_from_leaf(root, node, index);
++	return 0;
++}
+diff --git a/mm/Kconfig b/mm/Kconfig
+index 390165ffbb0f..50d02cfa0b68 100644
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -317,6 +317,32 @@ config KSM
+ 	  See Documentation/vm/ksm.rst for more information: KSM is inactive
+ 	  until a program has madvised that an area is MADV_MERGEABLE, and
+ 	  root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
++choice
++	prompt "Choose UKSM/KSM strategy"
++	default UKSM
++	depends on KSM
++	help
++	  This option selects the UKSM/KSM strategy.
++
++config UKSM
++	bool "Ultra-KSM for page merging"
++	depends on KSM
++	help
++	UKSM is inspired by the Linux kernel project - KSM (Kernel Same
++	page Merging), but with a fundamentally rewritten core algorithm. With
++	an advanced algorithm, UKSM now can transparently scans all anonymously
++	mapped user space applications with an significantly improved scan speed
++	and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from
++	UKSM. Now UKSM has its first stable release and first real world enterprise user.
++	For more information, please go to its project page.
++	(github.com/dolohow/uksm)
++
++config KSM_LEGACY
++	bool "Legacy KSM implementation"
++	depends on KSM
++	help
++	The legacy KSM implementation from Red Hat.
++endchoice
+ 
+ config DEFAULT_MMAP_MIN_ADDR
+ 	int "Low address space to protect from user allocation"
+diff --git a/mm/Makefile b/mm/Makefile
+index d73aed0fc99c..d6612b76c5da 100644
+--- a/mm/Makefile
++++ b/mm/Makefile
+@@ -76,7 +76,8 @@ obj-$(CONFIG_SPARSEMEM)	+= sparse.o
+ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
+ obj-$(CONFIG_SLOB) += slob.o
+ obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
+-obj-$(CONFIG_KSM) += ksm.o
++obj-$(CONFIG_KSM_LEGACY) += ksm.o
++obj-$(CONFIG_UKSM) += uksm.o
+ obj-$(CONFIG_PAGE_POISONING) += page_poison.o
+ obj-$(CONFIG_SLAB) += slab.o
+ obj-$(CONFIG_SLUB) += slub.o
+diff --git a/mm/ksm.c b/mm/ksm.c
+index 0960750bb316..ae17158cb67a 100644
+--- a/mm/ksm.c
++++ b/mm/ksm.c
+@@ -858,17 +858,6 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
+ 	return err;
+ }
+ 
+-static inline struct stable_node *page_stable_node(struct page *page)
+-{
+-	return PageKsm(page) ? page_rmapping(page) : NULL;
+-}
+-
+-static inline void set_page_stable_node(struct page *page,
+-					struct stable_node *stable_node)
+-{
+-	page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
+-}
+-
+ #ifdef CONFIG_SYSFS
+ /*
+  * Only called through the sysfs control interface:
+diff --git a/mm/memory.c b/mm/memory.c
+index c48f8df6e502..db47ee177008 100644
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -146,6 +146,25 @@ EXPORT_SYMBOL(zero_pfn);
+ 
+ unsigned long highest_memmap_pfn __read_mostly;
+ 
++#ifdef CONFIG_UKSM
++unsigned long uksm_zero_pfn __read_mostly;
++EXPORT_SYMBOL_GPL(uksm_zero_pfn);
++struct page *empty_uksm_zero_page;
++
++static int __init setup_uksm_zero_page(void)
++{
++	/* Zeroed, non-movable order-0 page used as the shared uksm zero page. */
++	empty_uksm_zero_page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0);
++	if (!empty_uksm_zero_page)
++		panic("Oh boy, that early out of memory?");
++	SetPageReserved(empty_uksm_zero_page);
++	uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page);
++
++	return 0;
++}
++core_initcall(setup_uksm_zero_page);
++#endif
++
+ /*
+  * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
+  */
+@@ -161,6 +180,7 @@ void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
+ 	trace_rss_stat(mm, member, count);
+ }
+ 
++
+ #if defined(SPLIT_RSS_COUNTING)
+ 
+ void sync_mm_rss(struct mm_struct *mm)
+@@ -869,6 +889,11 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ 		get_page(page);
+ 		page_dup_rmap(page, false);
+ 		rss[mm_counter(page)]++;
++
++		/* Should return NULL in vm_normal_page() */
++		uksm_bugon_zeropage(pte);
++	} else {
++		uksm_map_zero_page(pte);
+ 	}
+ 
+ 	/*
+@@ -1237,8 +1262,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
+ 			ptent = ptep_get_and_clear_full(mm, addr, pte,
+ 							tlb->fullmm);
+ 			tlb_remove_tlb_entry(tlb, pte, addr);
+-			if (unlikely(!page))
++			if (unlikely(!page)) {
++				uksm_unmap_zero_page(ptent);
+ 				continue;
++			}
+ 
+ 			if (!PageAnon(page)) {
+ 				if (pte_dirty(ptent)) {
+@@ -2586,6 +2613,7 @@ static inline bool cow_user_page(struct page *dst, struct page *src,
+ 
+ 	if (likely(src)) {
+ 		copy_user_highpage(dst, src, addr, vma);
++		uksm_cow_page(vma, src);
+ 		return true;
+ 	}
+ 
+@@ -2832,6 +2860,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
+ 							      vmf->address);
+ 		if (!new_page)
+ 			goto oom;
++		uksm_cow_pte(vma, vmf->orig_pte);
+ 	} else {
+ 		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
+ 				vmf->address);
+@@ -2874,7 +2903,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
+ 						mm_counter_file(old_page));
+ 				inc_mm_counter_fast(mm, MM_ANONPAGES);
+ 			}
++			uksm_bugon_zeropage(vmf->orig_pte);
+ 		} else {
++			uksm_unmap_zero_page(vmf->orig_pte);
+ 			inc_mm_counter_fast(mm, MM_ANONPAGES);
+ 		}
+ 		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
+diff --git a/mm/mmap.c b/mm/mmap.c
+index 5c8b4485860d..b8dd56dd900d 100644
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -46,6 +46,7 @@
+ #include <linux/moduleparam.h>
+ #include <linux/pkeys.h>
+ #include <linux/oom.h>
++#include <linux/ksm.h>
+ #include <linux/sched/mm.h>
+ 
+ #include <linux/uaccess.h>
+@@ -181,6 +182,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
+ 	if (vma->vm_file)
+ 		fput(vma->vm_file);
+ 	mpol_put(vma_policy(vma));
++       uksm_remove_vma(vma);
+ 	vm_area_free(vma);
+ 	return next;
+ }
+@@ -750,9 +752,16 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
+ 	long adjust_next = 0;
+ 	int remove_next = 0;
+ 
++/*
++ * to avoid deadlock, uksm_remove_vma must be done before any spin_lock is
++ * acquired
++ */
++	uksm_remove_vma(vma);
++
+ 	if (next && !insert) {
+ 		struct vm_area_struct *exporter = NULL, *importer = NULL;
+ 
++		uksm_remove_vma(next);
+ 		if (end >= next->vm_end) {
+ 			/*
+ 			 * vma expands, overlapping all the next, and
+@@ -883,6 +892,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
+ 		end_changed = true;
+ 	}
+ 	vma->vm_pgoff = pgoff;
++
+ 	if (adjust_next) {
+ 		next->vm_start += adjust_next;
+ 		next->vm_pgoff += adjust_next >> PAGE_SHIFT;
+@@ -987,6 +997,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
+ 		if (remove_next == 2) {
+ 			remove_next = 1;
+ 			end = next->vm_end;
++			uksm_remove_vma(next);
+ 			goto again;
+ 		}
+ 		else if (next)
+@@ -1013,10 +1024,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
+ 			 */
+ 			VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
+ 		}
++	} else {
++		if (next && !insert)
++			uksm_vma_add_new(next);
+ 	}
+ 	if (insert && file)
+ 		uprobe_mmap(insert);
+ 
++	uksm_vma_add_new(vma);
+ 	validate_mm(mm);
+ 
+ 	return 0;
+@@ -1472,6 +1487,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
+ 	vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
+ 			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+ 
++	/* If uksm is enabled, we add VM_MERGEABLE to new VMAs. */
++	uksm_vm_flags_mod(&vm_flags);
++
+ 	if (flags & MAP_LOCKED)
+ 		if (!can_do_mlock())
+ 			return -EPERM;
+@@ -1867,6 +1885,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
+ 			allow_write_access(file);
+ 	}
+ 	file = vma->vm_file;
++	uksm_vma_add_new(vma);
+ out:
+ 	perf_event_mmap(vma);
+ 
+@@ -1909,6 +1928,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
+ 	if (vm_flags & VM_DENYWRITE)
+ 		allow_write_access(file);
+ free_vma:
++	uksm_remove_vma(vma);
+ 	vm_area_free(vma);
+ unacct_error:
+ 	if (charged)
+@@ -2768,6 +2788,8 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+ 	else
+ 		err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
+ 
++	uksm_vma_add_new(new);
++
+ 	/* Success. */
+ 	if (!err)
+ 		return 0;
+@@ -3075,6 +3097,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
+ 	if ((flags & (~VM_EXEC)) != 0)
+ 		return -EINVAL;
+ 	flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
++	uksm_vm_flags_mod(&flags);
+ 
+ 	mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
+ 	if (IS_ERR_VALUE(mapped_addr))
+@@ -3120,6 +3143,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
+ 	vma->vm_flags = flags;
+ 	vma->vm_page_prot = vm_get_page_prot(flags);
+ 	vma_link(mm, vma, prev, rb_link, rb_parent);
++	uksm_vma_add_new(vma);
+ out:
+ 	perf_event_mmap(vma);
+ 	mm->total_vm += len >> PAGE_SHIFT;
+@@ -3197,6 +3221,12 @@ void exit_mmap(struct mm_struct *mm)
+ 		mmap_write_unlock(mm);
+ 	}
+ 
++	/*
++	 * Taking write lock on mmap does not harm others,
++	 * but it's crucial for uksm to avoid races.
++	 */
++	mmap_write_lock(mm);
++
+ 	if (mm->locked_vm) {
+ 		vma = mm->mmap;
+ 		while (vma) {
+@@ -3232,6 +3262,11 @@ void exit_mmap(struct mm_struct *mm)
+ 		cond_resched();
+ 	}
+ 	vm_unacct_memory(nr_accounted);
++
++	mm->mmap = NULL;
++	mm->mm_rb = RB_ROOT;
++	vmacache_invalidate(mm);
++	mmap_write_unlock(mm);
+ }
+ 
+ /* Insert vm structure into process list sorted by address
+@@ -3339,6 +3374,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
+ 			new_vma->vm_ops->open(new_vma);
+ 		vma_link(mm, new_vma, prev, rb_link, rb_parent);
+ 		*need_rmap_locks = false;
++		uksm_vma_add_new(new_vma);
+ 	}
+ 	return new_vma;
+ 
+@@ -3491,6 +3527,7 @@ static struct vm_area_struct *__install_special_mapping(
+ 	vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
+ 
+ 	perf_event_mmap(vma);
++	uksm_vma_add_new(vma);
+ 
+ 	return vma;
+ 
+diff --git a/mm/uksm.c b/mm/uksm.c
+new file mode 100644
+index 000000000000..e4732c00be69
+--- /dev/null
++++ b/mm/uksm.c
+@@ -0,0 +1,5614 @@
++/*
++ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
++ *
++ * This is an improvement upon KSM. Some basic data structures and routines
++ * are borrowed from ksm.c .
++ *
++ * Its new features:
++ * 1. Full system scan:
++ *      It automatically scans all user processes' anonymous VMAs. Kernel-user
++ *      interaction to submit a memory area to KSM is no longer needed.
++ *
++ * 2. Rich area detection:
++ *      It automatically detects rich areas containing abundant duplicated
++ *      pages. Rich areas are given a full scan speed. Poor areas are
++ *      sampled at a reasonable speed with very low CPU consumption.
++ *
++ * 3. Ultra Per-page scan speed improvement:
++ *      A new hash algorithm is proposed. As a result, on a machine with
++ *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
++ *      can scan memory areas that do not contain duplicated pages at a speed of
++ *      627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of
++ *      477MB/sec ~ 923MB/sec.
++ *
++ * 4. Thrashing area avoidance:
++ *      Thrashing areas (VMAs that have frequent KSM page break-outs) can be
++ *      filtered out. My benchmark shows it's more efficient than KSM's per-page
++ *      hash value based volatile page detection.
++ *
++ *
++ * 5. Misc changes upon KSM:
++ *      * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page
++ *        comparison. It's much faster than default C version on x86.
++ *      * rmap_item now has a struct page * member to loosely cache an
++ *        address-->page mapping, which avoids many time-costly
++ *        follow_page() calls.
++ *      * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
++ *      * try_to_merge_two_pages() now can revert a pte if it fails. No break_
++ *        ksm is needed for this case.
++ *
++ * 6. Full Zero Page consideration(contributed by Figo Zhang)
++ *    Now uksmd considers full zero pages as special pages and merges them
++ *    into a special unswappable uksm zero page.
++ */
++
++#include <linux/errno.h>
++#include <linux/mm.h>
++#include <linux/fs.h>
++#include <linux/mman.h>
++#include <linux/sched.h>
++#include <linux/sched/mm.h>
++#include <linux/sched/coredump.h>
++#include <linux/sched/cputime.h>
++#include <linux/rwsem.h>
++#include <linux/pagemap.h>
++#include <linux/rmap.h>
++#include <linux/spinlock.h>
++#include <linux/jhash.h>
++#include <linux/delay.h>
++#include <linux/kthread.h>
++#include <linux/wait.h>
++#include <linux/slab.h>
++#include <linux/rbtree.h>
++#include <linux/memory.h>
++#include <linux/mmu_notifier.h>
++#include <linux/swap.h>
++#include <linux/ksm.h>
++#include <linux/crypto.h>
++#include <linux/scatterlist.h>
++#include <crypto/hash.h>
++#include <linux/random.h>
++#include <linux/math64.h>
++#include <linux/gcd.h>
++#include <linux/freezer.h>
++#include <linux/oom.h>
++#include <linux/numa.h>
++#include <linux/sradix-tree.h>
++
++#include <asm/tlbflush.h>
++#include "internal.h"
++
++#ifdef CONFIG_X86
++#undef memcmp
++
++#ifdef CONFIG_X86_32
++#define memcmp memcmpx86_32
++/*
++ * Compare n bytes (as n / 4 dwords) of 4-byte-aligned s1 and s2; 0 iff equal.
++ */
++int memcmpx86_32(void *s1, void *s2, size_t n)
++{
++	size_t num = n / 4;
++	register int res;
++
++	__asm__ __volatile__
++	(
++	 "testl %3,%3\n\t"
++	 "repe; cmpsd\n\t"
++	 "je        1f\n\t"
++	 "sbbl      %0,%0\n\t"
++	 "orl       $1,%0\n"
++	 "1:"
++	 : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
++	 : "0" (0)
++	 : "cc", "memory");	/* the asm reads both buffers behind gcc's back */
++
++	return res;
++}
++
++/*
++ * Return non-zero iff the len bytes at s1 (scanned as dwords) are all zero.
++ */
++static int is_full_zero(const void *s1, size_t len)
++{
++	unsigned char same;
++
++	len /= 4;
++	/* test presets ZF: repe runs zero times (flags untouched) if len is 0 */
++	__asm__ __volatile__
++	("testl %2,%2; repe; scasl;"
++	 "sete %0"
++	 : "=qm" (same), "+D" (s1), "+c" (len)
++	 : "a" (0)
++	 : "cc", "memory");	/* the asm reads the buffer behind gcc's back */
++
++	return same;
++}
++
++
++#elif defined(CONFIG_X86_64)
++#define memcmp memcmpx86_64
++/*
++ * Compare n bytes (as n / 8 qwords) of 8-byte-aligned s1 and s2; 0 iff equal.
++ */
++int memcmpx86_64(void *s1, void *s2, size_t n)
++{
++	size_t num = n / 8;
++	register int res;
++
++	__asm__ __volatile__
++	(
++	 "testq %q3,%q3\n\t"
++	 "repe; cmpsq\n\t"
++	 "je        1f\n\t"
++	 "sbbq      %q0,%q0\n\t"
++	 "orq       $1,%q0\n"
++	 "1:"
++	 : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
++	 : "0" (0)
++	 : "cc", "memory");	/* the asm reads both buffers behind gcc's back */
++
++	return res;
++}
++
++static int is_full_zero(const void *s1, size_t len)
++{
++	unsigned char same;
++
++	len /= 8;
++	/* all-zero check on qwords; test presets ZF for the len == 0 case */
++	__asm__ __volatile__
++	("testq %q2,%q2; repe; scasq;"
++	 "sete %0"
++	 : "=qm" (same), "+D" (s1), "+c" (len)
++	 : "a" (0)
++	 : "cc", "memory");	/* the asm reads the buffer behind gcc's back */
++
++	return same;
++}
++
++#endif
++#else
++static int is_full_zero(const void *s1, size_t len)
++{
++	const unsigned long *src = s1;
++	size_t i;
++
++	len /= sizeof(*src);
++	/* Scan word by word; any trailing partial word is ignored. */
++	for (i = 0; i < len; i++) {
++		if (src[i])
++			return 0;
++	}
++
++	return 1;
++}
++#endif
++
++#define UKSM_RUNG_ROUND_FINISHED  (1 << 0)
++#define TIME_RATIO_SCALE	10000
++
++#define SLOT_TREE_NODE_SHIFT	8
++#define SLOT_TREE_NODE_STORE_SIZE	(1UL << SLOT_TREE_NODE_SHIFT)
++struct slot_tree_node {
++	unsigned long size;
++	struct sradix_tree_node snode;
++	void *stores[SLOT_TREE_NODE_STORE_SIZE];
++};
++
++static struct kmem_cache *slot_tree_node_cachep;
++
++static struct sradix_tree_node *slot_tree_node_alloc(void)
++{
++	struct slot_tree_node *p;
++
++	p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL |
++			      __GFP_NORETRY | __GFP_NOWARN);
++	if (!p)
++		return NULL;
++
++	return &p->snode;
++}
++
++static void slot_tree_node_free(struct sradix_tree_node *node)
++{
++	struct slot_tree_node *p;
++
++	p = container_of(node, struct slot_tree_node, snode);
++	kmem_cache_free(slot_tree_node_cachep, p);
++}
++
++static void slot_tree_node_extend(struct sradix_tree_node *parent,
++				  struct sradix_tree_node *child)
++{
++	struct slot_tree_node *p, *c;
++
++	p = container_of(parent, struct slot_tree_node, snode);
++	c = container_of(child, struct slot_tree_node, snode);
++
++	p->size += c->size;
++}
++
++void slot_tree_node_assign(struct sradix_tree_node *node,
++			   unsigned int index, void *item)
++{
++	struct vma_slot *slot = item;
++	struct slot_tree_node *cur;
++
++	slot->snode = node;
++	slot->sindex = index;
++
++	while (node) {
++		cur = container_of(node, struct slot_tree_node, snode);
++		cur->size += slot->pages;
++		node = node->parent;
++	}
++}
++
++void slot_tree_node_rm(struct sradix_tree_node *node, unsigned int offset)
++{
++	struct vma_slot *slot;
++	struct slot_tree_node *cur;
++	unsigned long pages;
++
++	if (node->height == 1) {
++		slot = node->stores[offset];
++		pages = slot->pages;
++	} else {
++		cur = container_of(node->stores[offset],
++				   struct slot_tree_node, snode);
++		pages = cur->size;
++	}
++
++	while (node) {
++		cur = container_of(node, struct slot_tree_node, snode);
++		cur->size -= pages;
++		node = node->parent;
++	}
++}
++
++unsigned long slot_iter_index;
++int slot_iter(void *item,  unsigned long height)
++{
++	struct slot_tree_node *node;
++	struct vma_slot *slot;
++
++	if (height == 1) {
++		slot = item;
++		if (slot_iter_index < slot->pages) {
++			/*in this one*/
++			return 1;
++		} else {
++			slot_iter_index -= slot->pages;
++			return 0;
++		}
++
++	} else {
++		node = container_of(item, struct slot_tree_node, snode);
++		if (slot_iter_index < node->size) {
++			/*in this one*/
++			return 1;
++		} else {
++			slot_iter_index -= node->size;
++			return 0;
++		}
++	}
++}
++
++
++static inline void slot_tree_init_root(struct sradix_tree_root *root)
++{
++	init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT);
++	root->alloc = slot_tree_node_alloc;
++	root->free = slot_tree_node_free;
++	root->extend = slot_tree_node_extend;
++	root->assign = slot_tree_node_assign;
++	root->rm = slot_tree_node_rm;
++}
++
++void slot_tree_init(void)
++{
++	slot_tree_node_cachep = kmem_cache_create("slot_tree_node",
++				sizeof(struct slot_tree_node), 0,
++				SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
++				NULL);
++}
++
++
++/* Each rung of this ladder is a list of VMAs having a same scan ratio */
++struct scan_rung {
++	//struct list_head scanned_list;
++	struct sradix_tree_root vma_root;
++	struct sradix_tree_root vma_root2;
++
++	struct vma_slot *current_scan;
++	unsigned long current_offset;
++
++	/*
++	 * The initial value for current_offset, it should loop over
++	 * [0~ step - 1] to let all slot have its chance to be scanned.
++	 */
++	unsigned long offset_init;
++	unsigned long step; /* dynamic step for current_offset */
++	unsigned int flags;
++	unsigned long pages_to_scan;
++	//unsigned long fully_scanned_slots;
++	/*
++	 * a little bit tricky - if cpu_ratio > 0, then the value is the
++	 * cpu time ratio it can spend in rung_i for every scan
++	 * period. if < 0, then it is the cpu time ratio relative to the
++	 * max cpu percentage user specified. Both in unit of
++	 * 1/TIME_RATIO_SCALE
++	 */
++	int cpu_ratio;
++
++	/*
++	 * How long it will take for all slots in this rung to be fully
++	 * scanned? If it's zero, we don't care about the cover time:
++	 * it's fully scanned.
++	 */
++	unsigned int cover_msecs;
++	//unsigned long vma_num;
++	//unsigned long pages; /* Sum of all slot's pages in rung */
++};
++
++/**
++ * node of either the stable or unstable rbtree
++ *
++ */
++struct tree_node {
++	struct rb_node node; /* link in the main (un)stable rbtree */
++	struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
++	u32 hash;
++	unsigned long count; /* TODO: merged with sub_root */
++	struct list_head all_list; /* all tree nodes in stable/unstable tree */
++};
++
++/**
++ * struct stable_node - node of the stable rbtree
++ * @node: rb node of this ksm page in the stable tree
++ * @hlist: hlist head of rmap_items using this ksm page
++ * @kpfn: page frame number of this ksm page
++ */
++struct stable_node {
++	struct rb_node node; /* link in sub-rbtree */
++	struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */
++	struct hlist_head hlist;
++	unsigned long kpfn;
++	u32 hash_max; /* if ==0 then it's not been calculated yet */
++	struct list_head all_list; /* in a list for all stable nodes */
++};
++
++/**
++ * struct node_vma - group rmap_items linked in a same stable
++ * node together.
++ */
++struct node_vma {
++	union {
++		struct vma_slot *slot;
++		unsigned long key;  /* slot is used as key sorted on hlist */
++	};
++	struct hlist_node hlist;
++	struct hlist_head rmap_hlist;
++	struct stable_node *head;
++};
++
++/**
++ * struct rmap_item - reverse mapping item for virtual addresses
++ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
++ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
++ * @mm: the memory structure this rmap_item is pointing into
++ * @address: the virtual address this rmap_item tracks (+ flags in low bits)
++ * @node: rb node of this rmap_item in the unstable tree
++ * @head: pointer to the node_vma grouping this item under its stable_node
++ * @hlist: link into hlist of rmap_items hanging off that stable_node
++ */
++struct rmap_item {
++	struct vma_slot *slot;
++	struct page *page;
++	unsigned long address;	/* + low bits used for flags below */
++	unsigned long hash_round;
++	unsigned long entry_index;
++	union {
++		struct {/* when in unstable tree */
++			struct rb_node node;
++			struct tree_node *tree_node;
++			u32 hash_max;
++		};
++		struct { /* when in stable tree */
++			struct node_vma *head;
++			struct hlist_node hlist;
++			struct anon_vma *anon_vma;
++		};
++	};
++} __aligned(4);
++
++struct rmap_list_entry {
++	union {
++		struct rmap_item *item;
++		unsigned long addr;
++	};
++	/* lowest bit is used for is_addr tag */
++} __aligned(4); /* 4 aligned to fit in to pages*/
++
++
++/* Basic data structure definition ends */
++
++
++/*
++ * Flags for rmap_item to judge if it's listed in the stable/unstable tree.
++ * The flags use the low bits of rmap_item.address
++ */
++#define UNSTABLE_FLAG	0x1
++#define STABLE_FLAG	0x2
++#define get_rmap_addr(x)	((x)->address & PAGE_MASK)
++
++/*
++ * rmap_list_entry helpers
++ */
++#define IS_ADDR_FLAG	1
++#define is_addr(ptr)		((unsigned long)(ptr) & IS_ADDR_FLAG)
++#define set_is_addr(ptr)	((ptr) |= IS_ADDR_FLAG)
++#define get_clean_addr(ptr)	(((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
++
++
++/*
++ * High speed caches for frequently allocated and freed structs
++ */
++static struct kmem_cache *rmap_item_cache;
++static struct kmem_cache *stable_node_cache;
++static struct kmem_cache *node_vma_cache;
++static struct kmem_cache *vma_slot_cache;
++static struct kmem_cache *tree_node_cache;
++#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\
++		sizeof(struct __struct), __alignof__(struct __struct),\
++		(__flags), NULL)
++
++/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */
++#define SCAN_LADDER_SIZE 4
++static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE];
++
++/* The evaluation rounds uksmd has finished */
++static unsigned long long uksm_eval_round = 1;
++
++/*
++ * we add 1 to this var when we consider we should rebuild the whole
++ * unstable tree.
++ */
++static unsigned long uksm_hash_round = 1;
++
++/*
++ * How many times the whole memory is scanned.
++ */
++static unsigned long long fully_scanned_round = 1;
++
++/* The total number of virtual pages of all vma slots */
++static u64 uksm_pages_total;
++
++/* The number of pages that have been scanned since startup */
++static u64 uksm_pages_scanned;
++
++static u64 scanned_virtual_pages;
++
++/* The number of pages scanned since the last encode_benefit call */
++static u64 uksm_pages_scanned_last;
++
++/* If the scanned number is too large, we encode it here */
++static u64 pages_scanned_stored;
++
++static unsigned long pages_scanned_base;
++
++/* The number of nodes in the stable tree */
++static unsigned long uksm_pages_shared;
++
++/* The number of page slots additionally sharing those nodes */
++static unsigned long uksm_pages_sharing;
++
++/* The number of nodes in the unstable tree */
++static unsigned long uksm_pages_unshared;
++
++/*
++ * Milliseconds ksmd should sleep between scans,
++ * >= 100ms to be consistent with
++ * scan_time_to_sleep_msec()
++ */
++static unsigned int uksm_sleep_jiffies;
++
++/* The real value for the uksmd next sleep */
++static unsigned int uksm_sleep_real;
++
++/* Saved value for user input uksm_sleep_jiffies when it's enlarged */
++static unsigned int uksm_sleep_saved;
++
++/* Max percentage of cpu utilization ksmd can take to scan in one batch */
++static unsigned int uksm_max_cpu_percentage;
++
++static int uksm_cpu_governor;
++
++static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" };
++
++struct uksm_cpu_preset_s {
++	int cpu_ratio[SCAN_LADDER_SIZE];
++	unsigned int cover_msecs[SCAN_LADDER_SIZE];
++	unsigned int max_cpu; /* percentage */
++};
++
++struct uksm_cpu_preset_s uksm_cpu_preset[4] = {
++	{ {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95},
++	{ {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50},
++	{ {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20},
++	{ {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1},
++};
++
++/* The default value for uksm_ema_page_time if it's not initialized */
++#define UKSM_PAGE_TIME_DEFAULT	500
++
++/* Cost to scan one page, as an exponential moving average, in nsecs */
++static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
++
++/* The exponential moving average alpha weight, in percentage. */
++#define EMA_ALPHA	20
++
++/*
++ * The threshold used to filter out thrashing areas,
++ * If it == 0, filtering is disabled, otherwise it's the percentage up-bound
++ * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio
++ * will be considered as having a zero duplication ratio.
++ */
++static unsigned int uksm_thrash_threshold = 50;
++
++/* How much dedup ratio is considered to be abundant*/
++static unsigned int uksm_abundant_threshold = 10;
++
++/* All slots having merged pages in this eval round. */
++struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup);
++
++/* How many times the ksmd has slept since startup */
++static unsigned long long uksm_sleep_times;
++
++#define UKSM_RUN_STOP	0
++#define UKSM_RUN_MERGE	1
++static unsigned int uksm_run = 1;
++
++static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
++static DEFINE_MUTEX(uksm_thread_mutex);
++
++/*
++ * List vma_slot_new is for newly created vma_slot waiting to be added by
++ * ksmd. If one cannot be added(e.g. due to it's too small), it's moved to
++ * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding
++ * VMA has been removed/freed.
++ */
++struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
++struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
++struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
++static DEFINE_SPINLOCK(vma_slot_list_lock);
++
++/* The unstable tree heads */
++static struct rb_root root_unstable_tree = RB_ROOT;
++
++/*
++ * All tree_nodes are in a list to be freed at once when unstable tree is
++ * freed after each scan round.
++ */
++static struct list_head unstable_tree_node_list =
++				LIST_HEAD_INIT(unstable_tree_node_list);
++
++/* List contains all stable nodes */
++static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
++
++/*
++ * When the hash strength is changed, the stable tree must be delta_hashed and
++ * re-structured. We use two sets of the structs below to speed up the
++ * re-structuring of the stable tree.
++ */
++static struct list_head
++stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
++			    LIST_HEAD_INIT(stable_tree_node_list[1])};
++
++static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
++static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
++static struct rb_root *root_stable_treep = &root_stable_tree[0];
++static unsigned long stable_tree_index;
++
++/* The hash strength needed to hash a full page */
++#define HASH_STRENGTH_FULL		(PAGE_SIZE / sizeof(u32))
++
++/* The hash strength needed for loop-back hashing */
++#define HASH_STRENGTH_MAX		(HASH_STRENGTH_FULL + 10)
++
++/* The random offsets in a page */
++static u32 *random_nums;
++
++/* The hash strength */
++static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
++
++/* The delta value each time the hash strength increases or decreases */
++static unsigned long hash_strength_delta;
++#define HASH_STRENGTH_DELTA_MAX	5
++
++/* The time we have saved due to random_sample_hash */
++static u64 rshash_pos;
++
++/* The time we have wasted due to hash collision */
++static u64 rshash_neg;
++
++struct uksm_benefit {
++	u64 pos;
++	u64 neg;
++	u64 scanned;
++	unsigned long base;
++} benefit;
++
++/*
++ * The relative cost of memcmp, compared to 1 time unit of random sample
++ * hash, this value is tested when ksm module is initialized
++ */
++static unsigned long memcmp_cost;
++
++static unsigned long  rshash_neg_cont_zero;
++static unsigned long  rshash_cont_obscure;
++
++/* The possible states of hash strength adjustment heuristic */
++enum rshash_states {
++		RSHASH_STILL,
++		RSHASH_TRYUP,
++		RSHASH_TRYDOWN,
++		RSHASH_NEW,
++		RSHASH_PRE_STILL,
++};
++
++/* The possible direction we are about to adjust hash strength */
++enum rshash_direct {
++	GO_UP,
++	GO_DOWN,
++	OBSCURE,
++	STILL,
++};
++
++/* random sampling hash state machine */
++static struct {
++	enum rshash_states state;
++	enum rshash_direct pre_direct;
++	u8 below_count;
++	/* Keep a lookup window of size 5, iff above_count/below_count > 3
++	 * in this window we stop trying.
++	 */
++	u8 lookup_window_index;
++	u64 stable_benefit;
++	unsigned long turn_point_down;
++	unsigned long turn_benefit_down;
++	unsigned long turn_point_up;
++	unsigned long turn_benefit_up;
++	unsigned long stable_point;
++} rshash_state;
++
++/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/
++static u32 *zero_hash_table;
++
++static inline struct node_vma *alloc_node_vma(void)
++{
++	struct node_vma *node_vma;
++
++	node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL |
++				     __GFP_NORETRY | __GFP_NOWARN);
++	if (node_vma) {
++		INIT_HLIST_HEAD(&node_vma->rmap_hlist);
++		INIT_HLIST_NODE(&node_vma->hlist);
++	}
++	return node_vma;
++}
++
++static inline void free_node_vma(struct node_vma *node_vma)
++{
++	kmem_cache_free(node_vma_cache, node_vma);
++}
++
++
++static inline struct vma_slot *alloc_vma_slot(void)
++{
++	struct vma_slot *slot;
++
++	/*
++	 * In case ksm is not initialized by now.
++	 * Oops, we need to consider the call site of uksm_init() in the future.
++	 */
++	if (!vma_slot_cache)
++		return NULL;
++
++	slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL |
++				 __GFP_NORETRY | __GFP_NOWARN);
++	if (slot) {
++		INIT_LIST_HEAD(&slot->slot_list);
++		INIT_LIST_HEAD(&slot->dedup_list);
++		slot->flags |= UKSM_SLOT_NEED_RERAND;
++	}
++	return slot;
++}
++
++static inline void free_vma_slot(struct vma_slot *vma_slot)
++{
++	kmem_cache_free(vma_slot_cache, vma_slot);
++}
++
++
++
++static inline struct rmap_item *alloc_rmap_item(void)
++{
++	struct rmap_item *rmap_item;
++
++	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
++				      __GFP_NORETRY | __GFP_NOWARN);
++	if (rmap_item) {
++		/* bug on lowest bit is not clear for flag use */
++		BUG_ON(is_addr(rmap_item));
++	}
++	return rmap_item;
++}
++
++static inline void free_rmap_item(struct rmap_item *rmap_item)
++{
++	rmap_item->slot = NULL;	/* debug safety */
++	kmem_cache_free(rmap_item_cache, rmap_item);
++}
++
++static inline struct stable_node *alloc_stable_node(void)
++{
++	struct stable_node *node;
++
++	node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL |
++				__GFP_NORETRY | __GFP_NOWARN);
++	if (!node)
++		return NULL;
++
++	INIT_HLIST_HEAD(&node->hlist);
++	list_add(&node->all_list, &stable_node_list);
++	return node;
++}
++
++static inline void free_stable_node(struct stable_node *stable_node)
++{
++	list_del(&stable_node->all_list);
++	kmem_cache_free(stable_node_cache, stable_node);
++}
++
++static inline struct tree_node *alloc_tree_node(struct list_head *list)
++{
++	struct tree_node *node;
++
++	node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL |
++				 __GFP_NORETRY | __GFP_NOWARN);
++	if (!node)
++		return NULL;
++
++	list_add(&node->all_list, list);
++	return node;
++}
++
++static inline void free_tree_node(struct tree_node *node)
++{
++	list_del(&node->all_list);
++	kmem_cache_free(tree_node_cache, node);
++}
++
++static void uksm_drop_anon_vma(struct rmap_item *rmap_item)
++{
++	struct anon_vma *anon_vma = rmap_item->anon_vma;
++
++	put_anon_vma(anon_vma);
++}
++
++
++/**
++ * Remove a stable node from stable_tree, may unlink from its tree_node and
++ * may remove its parent tree_node if no other stable node is pending.
++ *
++ * @stable_node:      The node that needs to be removed
++ * @unlink_rb:        Will this node be unlinked from the rbtree?
++ * @remove_tree_node: Will its tree_node be removed if empty?
++ */
++static void remove_node_from_stable_tree(struct stable_node *stable_node,
++					 int unlink_rb,  int remove_tree_node)
++{
++	struct node_vma *node_vma;
++	struct rmap_item *rmap_item;
++	struct hlist_node *n;
++
++	if (!hlist_empty(&stable_node->hlist)) {
++		hlist_for_each_entry_safe(node_vma, n,
++					  &stable_node->hlist, hlist) {
++			hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
++				uksm_pages_sharing--;
++
++				uksm_drop_anon_vma(rmap_item);
++				rmap_item->address &= PAGE_MASK;
++			}
++			free_node_vma(node_vma);
++			cond_resched();
++		}
++
++		/* the last one is counted as shared */
++		uksm_pages_shared--;
++		uksm_pages_sharing++;
++	}
++
++	if (stable_node->tree_node && unlink_rb) {
++		rb_erase(&stable_node->node,
++			 &stable_node->tree_node->sub_root);
++
++		if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
++		    remove_tree_node) {
++			rb_erase(&stable_node->tree_node->node,
++				 root_stable_treep);
++			free_tree_node(stable_node->tree_node);
++		} else {
++			stable_node->tree_node->count--;
++		}
++	}
++
++	free_stable_node(stable_node);
++}
++
++
++/*
++ * get_uksm_page: checks if the page indicated by the stable node
++ * is still its ksm page, despite having held no reference to it.
++ * In which case we can trust the content of the page, and it
++ * returns the gotten page; but if the page has now been zapped,
++ * remove the stale node from the stable tree and return NULL.
++ *
++ * You would expect the stable_node to hold a reference to the ksm page.
++ * But if it increments the page's count, swapping out has to wait for
++ * ksmd to come around again before it can free the page, which may take
++ * seconds or even minutes: much too unresponsive.  So instead we use a
++ * "keyhole reference": access to the ksm page from the stable node peeps
++ * out through its keyhole to see if that page still holds the right key,
++ * pointing back to this stable node.  This relies on freeing a PageAnon
++ * page to reset its page->mapping to NULL, and relies on no other use of
++ * a page to put something that might look like our key in page->mapping.
++ *
++ * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
++ * but this is different - made simpler by uksm_thread_mutex being held, but
++ * interesting for assuming that no other use of the struct page could ever
++ * put our expected_mapping into page->mapping (or a field of the union which
++ * coincides with page->mapping).  The RCU calls are not for KSM at all, but
++ * to keep the page_count protocol described with page_cache_get_speculative.
++ *
++ * Note: it is possible that get_uksm_page() will return NULL one moment,
++ * then page the next, if the page is in between page_freeze_refs() and
++ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
++ * is on its way to being freed; but it is an anomaly to bear in mind.
++ *
++ * @unlink_rb:			if the removal of this node will firstly unlink from
++ * its rbtree. stable_node_reinsert will prevent this when restructuring the
++ * node from its old tree.
++ *
++ * @remove_tree_node:	if this is the last one of its tree_node, will the
++ * tree_node be freed ? If we are inserting stable node, this tree_node may
++ * be reused, so don't free it.
++ */
++static struct page *get_uksm_page(struct stable_node *stable_node,
++				 int unlink_rb, int remove_tree_node)
++{
++	struct page *page;
++	void *expected_mapping;
++	unsigned long kpfn;
++
++	expected_mapping = (void *)((unsigned long)stable_node |
++				    PAGE_MAPPING_KSM);
++again:
++	kpfn = READ_ONCE(stable_node->kpfn);
++	page = pfn_to_page(kpfn);
++
++	/*
++	 * page is computed from kpfn, so on most architectures reading
++	 * page->mapping is naturally ordered after reading node->kpfn,
++	 * but on Alpha we need to be more careful.
++	 */
++	smp_rmb();
++
++	if (READ_ONCE(page->mapping) != expected_mapping)
++		goto stale;
++
++	/*
++	 * We cannot do anything with the page while its refcount is 0.
++	 * Usually 0 means free, or tail of a higher-order page: in which
++	 * case this node is no longer referenced, and should be freed;
++	 * however, it might mean that the page is under page_freeze_refs().
++	 * The __remove_mapping() case is easy, again the node is now stale;
++	 * but if page is swapcache in migrate_page_move_mapping(), it might
++	 * still be our page, in which case it's essential to keep the node.
++	 */
++	while (!get_page_unless_zero(page)) {
++		/*
++		 * Another check for page->mapping != expected_mapping would
++		 * work here too.  We have chosen the !PageSwapCache test to
++		 * optimize the common case, when the page is or is about to
++		 * be freed: PageSwapCache is cleared (under spin_lock_irq)
++		 * in the freeze_refs section of __remove_mapping(); but Anon
++		 * page->mapping reset to NULL later, in free_pages_prepare().
++		 */
++		if (!PageSwapCache(page))
++			goto stale;
++		cpu_relax();
++	}
++
++	if (READ_ONCE(page->mapping) != expected_mapping) {
++		put_page(page);
++		goto stale;
++	}
++
++	lock_page(page);
++	if (READ_ONCE(page->mapping) != expected_mapping) {
++		unlock_page(page);
++		put_page(page);
++		goto stale;
++	}
++	unlock_page(page);
++	return page;
++stale:
++	/*
++	 * We come here from above when page->mapping or !PageSwapCache
++	 * suggests that the node is stale; but it might be under migration.
++	 * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
++	 * before checking whether node->kpfn has been changed.
++	 */
++	smp_rmb();
++	if (READ_ONCE(stable_node->kpfn) != kpfn)
++		goto again;
++
++	remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
++
++	return NULL;
++}
++
++/*
++ * Unlink @rmap_item from the stable or unstable tree, adjust the UKSM page
++ * counters, then reset its flag bits and hash_max (skipped for a stale node).
++ */
++static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
++{
++	if (rmap_item->address & STABLE_FLAG) {
++		struct stable_node *stable_node;
++		struct node_vma *node_vma;
++		struct page *page;
++
++		node_vma = rmap_item->head;
++		stable_node = node_vma->head;
++		page = get_uksm_page(stable_node, 1, 1);
++		if (!page)
++			goto out;	/* node was stale and already removed */
++
++		/*
++		 * page lock is needed because it's racing with
++		 * try_to_unmap_ksm(), etc.
++		 */
++		lock_page(page);
++		hlist_del(&rmap_item->hlist);
++
++		if (hlist_empty(&node_vma->rmap_hlist)) {
++			hlist_del(&node_vma->hlist);
++			free_node_vma(node_vma);
++		}
++		unlock_page(page);
++
++		put_page(page);
++		if (hlist_empty(&stable_node->hlist)) {
++			/* do NOT call remove_node_from_stable_tree() here,
++			 * it's possible for a forked rmap_item not in
++			 * stable tree while the in-tree rmap_items were
++			 * deleted.
++			 */
++			uksm_pages_shared--;
++		} else
++			uksm_pages_sharing--;
++
++
++		uksm_drop_anon_vma(rmap_item);
++	} else if (rmap_item->address & UNSTABLE_FLAG) {
++		if (rmap_item->hash_round == uksm_hash_round) {
++
++			rb_erase(&rmap_item->node,
++				 &rmap_item->tree_node->sub_root);
++			if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
++				rb_erase(&rmap_item->tree_node->node,
++					 &root_unstable_tree);
++
++				free_tree_node(rmap_item->tree_node);
++			} else
++				rmap_item->tree_node->count--;
++		}
++		uksm_pages_unshared--;
++	}
++
++	rmap_item->address &= PAGE_MASK;	/* clears the sub-page (flag) bits */
++	rmap_item->hash_max = 0;
++
++out:
++	cond_resched();		/* we're called from many long loops */
++}
++
++static inline int slot_in_uksm(struct vma_slot *slot)
++{
++	return list_empty(&slot->slot_list);	/* on neither the new nor the del list */
++}
++
++/*
++ * Test if the mm is exiting, i.e. mm->mm_users has dropped to zero
++ */
++static inline bool uksm_test_exit(struct mm_struct *mm)
++{
++	return atomic_read(&mm->mm_users) == 0;
++}
++
++static inline unsigned long vma_pool_size(struct vma_slot *slot)
++{
++	return round_up(sizeof(struct rmap_list_entry) * slot->pages,
++			PAGE_SIZE) >> PAGE_SHIFT;	/* bytes -> whole pages */
++}
++
++#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta)) /* x + delta > U64_MAX */
++
++/* must be done with sem locked; returns 0, or -ENOMEM with no pool left behind */
++static int slot_pool_alloc(struct vma_slot *slot)
++{
++	unsigned long pool_size;
++
++	if (slot->rmap_list_pool)
++		return 0;
++
++	pool_size = vma_pool_size(slot);
++	slot->rmap_list_pool = kcalloc(pool_size, sizeof(struct page *),
++				       GFP_KERNEL);
++	if (!slot->rmap_list_pool)
++		return -ENOMEM;
++
++	slot->pool_counts = kcalloc(pool_size, sizeof(unsigned int),
++				    GFP_KERNEL);
++	if (!slot->pool_counts) {
++		kfree(slot->rmap_list_pool);
++		slot->rmap_list_pool = NULL;	/* else the check above sees a freed pool */
++		return -ENOMEM;
++	}
++
++	slot->pool_size = pool_size;
++	BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages));
++	slot->flags |= UKSM_SLOT_IN_UKSM;
++	uksm_pages_total += slot->pages;
++	return 0;
++}
++
++/*
++ * Called after vma is unlinked from its mm (and on slot_pool_alloc() failure)
++ */
++void uksm_remove_vma(struct vm_area_struct *vma)
++{
++	struct vma_slot *slot;
++
++	if (!vma->uksm_vma_slot)	/* unlocked early exit */
++		return;
++
++	spin_lock(&vma_slot_list_lock);
++	slot = vma->uksm_vma_slot;	/* re-read under vma_slot_list_lock */
++	if (!slot)
++		goto out;
++
++	if (slot_in_uksm(slot)) {
++		/*
++		 * This slot has been added by ksmd, so move it to the del
++		 * list and wait for ksmd to free it.
++		 */
++		list_add_tail(&slot->slot_list, &vma_slot_del);
++	} else {
++		/*
++		 * It's still on the new list. It's ok to free slot directly.
++		 */
++		list_del(&slot->slot_list);
++		free_vma_slot(slot);
++	}
++out:
++	vma->uksm_vma_slot = NULL;
++	spin_unlock(&vma_slot_list_lock);
++}
++
++/*
++ * Need to do two things:
++ * 1. check if slot was moved to del list
++ * 2. make sure the mmap lock is only taken while the vma is still valid.
++ *
++ * My concern here is that in some cases this may cause
++ * vma_slot_list_lock waiters to be serialized further behind some
++ * sem->wait_lock; can this really be expensive?
++ *
++ * Once the lock is taken, the slot's pools are set up by slot_pool_alloc().
++ * @return
++ * 0: mm->mmap_lock is now held for read
++ * -ENOENT: slot is on the del list, the mm is exiting, or pool alloc failed
++ * -EBUSY: down_read_trylock() on the mmap lock failed
++ */
++static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
++{
++	struct vm_area_struct *vma;
++	struct mm_struct *mm;
++	struct rw_semaphore *sem;
++
++	spin_lock(&vma_slot_list_lock);
++
++	/* slot_list was removed from the new list and re-inited when the slot
++	 * entered uksm_list. If it is not empty now, it was moved to del list
++	 */
++	if (!slot_in_uksm(slot)) {
++		spin_unlock(&vma_slot_list_lock);
++		return -ENOENT;
++	}
++
++	BUG_ON(slot->pages != vma_pages(slot->vma));
++	/* Ok, vma still valid */
++	vma = slot->vma;
++	mm = vma->vm_mm;
++	sem = &mm->mmap_lock;
++
++	if (uksm_test_exit(mm)) {
++		spin_unlock(&vma_slot_list_lock);
++		return -ENOENT;
++	}
++
++	if (down_read_trylock(sem)) {
++		spin_unlock(&vma_slot_list_lock);
++		if (slot_pool_alloc(slot)) {
++			uksm_remove_vma(vma);
++			up_read(sem);
++			return -ENOENT;
++		}
++		return 0;
++	}
++
++	spin_unlock(&vma_slot_list_lock);
++	return -EBUSY;
++}
++
++static inline unsigned long
++vma_page_address(struct page *page, struct vm_area_struct *vma)
++{
++	pgoff_t pgoff = page->index;
++	unsigned long address;
++
++	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
++	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
++		/* page lies outside @vma: -EFAULT comes back as unsigned long */
++		return -EFAULT;
++	}
++	return address;
++}
++
++
++/* return 0 with the mmap lock read-held and a ref on item->page, else -errno */
++static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
++{
++	struct mm_struct *mm;
++	struct vma_slot *slot = item->slot;
++	int err = -EINVAL;
++
++	struct page *page;
++
++	/*
++	 * try_down_read_slot_mmap_sem() returns non-zero if the slot is being
++	 * removed or the lock is contended; both are reported as -EBUSY here.
++	 */
++	if (try_down_read_slot_mmap_sem(slot))
++		return -EBUSY;
++
++	mm = slot->vma->vm_mm;
++
++	if (uksm_test_exit(mm))
++		goto failout_up;
++
++	page = item->page;
++	rcu_read_lock();
++	if (!get_page_unless_zero(page)) {
++		rcu_read_unlock();
++		goto failout_up;
++	}
++
++	/* No need to consider huge page here. */
++	if (item->slot->vma->anon_vma != page_anon_vma(page) ||
++	    vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
++		/*
++		 * TODO:
++		 * should we release this item because of its stale page
++		 * mapping?
++		 */
++		put_page(page);
++		rcu_read_unlock();
++		goto failout_up;
++	}
++	rcu_read_unlock();
++	return 0;
++
++failout_up:
++	mmap_read_unlock(mm);
++	return err;
++}
++
++/*
++ * A VMA is considered only if uksm_flags_can_scan() accepts its vm_flags.
++ */
++static inline int vma_can_enter(struct vm_area_struct *vma)
++{
++	return uksm_flags_can_scan(vma->vm_flags);
++}
++
++/*
++ * Called whenever a fresh new vma is created. A new vma_slot
++ * is created and inserted into a global list. Must be called
++ * after vma is inserted to its mm.
++ */
++void uksm_vma_add_new(struct vm_area_struct *vma)
++{
++	struct vma_slot *slot;
++
++	if (!vma_can_enter(vma)) {
++		vma->uksm_vma_slot = NULL;
++		return;
++	}
++
++	slot = alloc_vma_slot();
++	if (!slot) {
++		vma->uksm_vma_slot = NULL;	/* no slot: vma is simply not tracked */
++		return;
++	}
++
++	vma->uksm_vma_slot = slot;
++	vma->vm_flags |= VM_MERGEABLE;
++	slot->vma = vma;
++	slot->mm = vma->vm_mm;
++	slot->ctime_j = jiffies;
++	slot->pages = vma_pages(vma);
++	spin_lock(&vma_slot_list_lock);
++	list_add_tail(&slot->slot_list, &vma_slot_new);
++	spin_unlock(&vma_slot_list_lock);
++}
++
++/* 32/3 < they < 32/2 -- NOTE(review): shiftl (8) is below 32/3, confirm intent */
++#define shiftl	8
++#define shiftr	12
++
++#define HASH_FROM_TO(from, to)			\
++for (index = from; index < to; index++) {	\
++	pos = random_nums[index];		\
++	hash += key[pos];			\
++	hash += (hash << shiftl);		\
++	hash ^= (hash >> shiftr);		\
++}
++
++/* Undoes HASH_FROM_TO step by step; both use the caller's index, pos, key, hash */
++#define HASH_FROM_DOWN_TO(from, to)		\
++for (index = from - 1; index >= to; index--) {	\
++	hash ^= (hash >> shiftr);		\
++	hash ^= (hash >> (shiftr*2));		\
++	hash -= (hash << shiftl);		\
++	hash += (hash << (shiftl*2));		\
++	pos = random_nums[index];		\
++	hash -= key[pos];			\
++}
++
++/*
++ * The main random sample hash function: mixes @hash_strength sampled u32s of @addr.
++ */
++static u32 random_sample_hash(void *addr, u32 hash_strength)
++{
++	u32 hash = 0xdeadbeef;
++	int index, pos, loop = hash_strength;
++	u32 *key = (u32 *)addr;
++
++	if (loop > HASH_STRENGTH_FULL)
++		loop = HASH_STRENGTH_FULL;
++
++	HASH_FROM_TO(0, loop);
++	/* strengths beyond FULL take a second pass starting from index 0 again */
++	if (hash_strength > HASH_STRENGTH_FULL) {
++		loop = hash_strength - HASH_STRENGTH_FULL;
++		HASH_FROM_TO(0, loop);
++	}
++
++	return hash;
++}
++
++
++/**
++ * Incrementally convert a hash from one strength to another, without
++ * rehashing from scratch; used when the hash strength is adjusted.
++ * @addr The page's virtual address
++ * @from The original hash strength
++ * @to   The hash strength changed to
++ * @hash The hash value generated with the "from" hash strength
++ *
++ * return the hash value at strength "to"
++ */
++static u32 delta_hash(void *addr, int from, int to, u32 hash)
++{
++	u32 *key = (u32 *)addr;
++	int index, pos; /* make sure they are int type */
++
++	if (to > from) {
++		if (from >= HASH_STRENGTH_FULL) {
++			from -= HASH_STRENGTH_FULL;
++			to -= HASH_STRENGTH_FULL;
++			HASH_FROM_TO(from, to);
++		} else if (to <= HASH_STRENGTH_FULL) {
++			HASH_FROM_TO(from, to);
++		} else {
++			HASH_FROM_TO(from, HASH_STRENGTH_FULL);
++			HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
++		}
++	} else {
++		if (from <= HASH_STRENGTH_FULL) {
++			HASH_FROM_DOWN_TO(from, to);
++		} else if (to >= HASH_STRENGTH_FULL) {
++			from -= HASH_STRENGTH_FULL;
++			to -= HASH_STRENGTH_FULL;
++			HASH_FROM_DOWN_TO(from, to);
++		} else {
++			HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
++			HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
++		}
++	}
++
++	return hash;
++}
++
++/**
++ * Fold rshash_pos/rshash_neg and the newly scanned page count into benefit.
++ * Called when: rshash_pos or rshash_neg is about to overflow or a scan round
++ * has finished.
++ *
++ * return 0 if no page has been scanned since last call, 1 otherwise.
++ */
++static inline int encode_benefit(void)
++{
++	u64 scanned_delta, pos_delta, neg_delta;
++	unsigned long base = benefit.base;
++
++	scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last;
++
++	if (!scanned_delta)
++		return 0;
++	/* scale the new deltas down by the current benefit.base */
++	scanned_delta >>= base;
++	pos_delta = rshash_pos >> base;
++	neg_delta = rshash_neg >> base;
++	/* if any sum would overflow, halve totals and deltas and bump benefit.base */
++	if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
++	    CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
++	    CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
++		benefit.scanned >>= 1;
++		benefit.neg >>= 1;
++		benefit.pos >>= 1;
++		benefit.base++;
++		scanned_delta >>= 1;
++		pos_delta >>= 1;
++		neg_delta >>= 1;
++	}
++
++	benefit.pos += pos_delta;
++	benefit.neg += neg_delta;
++	benefit.scanned += scanned_delta;
++
++	BUG_ON(!benefit.scanned);
++
++	rshash_pos = rshash_neg = 0;
++	uksm_pages_scanned_last = uksm_pages_scanned;
++
++	return 1;
++}
++
++static inline void reset_benefit(void)
++{
++	benefit.pos = 0;
++	benefit.neg = 0;
++	benefit.base = 0;	/* the right-shift encode_benefit() applies */
++	benefit.scanned = 0;
++}
++
++static inline void inc_rshash_pos(unsigned long delta)
++{
++	if (CAN_OVERFLOW_U64(rshash_pos, delta))
++		encode_benefit();	/* zeroes rshash_pos unless it returns 0 */
++
++	rshash_pos += delta;
++}
++
++static inline void inc_rshash_neg(unsigned long delta)
++{
++	if (CAN_OVERFLOW_U64(rshash_neg, delta))
++		encode_benefit();	/* zeroes rshash_neg unless it returns 0 */
++
++	rshash_neg += delta;
++}
++
++
++static inline u32 page_hash(struct page *page, unsigned long hash_strength,
++			    int cost_accounting)
++{
++	u32 val;
++	unsigned long delta;
++
++	void *addr = kmap_atomic(page);
++
++	val = random_sample_hash(addr, hash_strength);
++	kunmap_atomic(addr);
++	/* the shortfall below HASH_STRENGTH_FULL is what gets added to rshash_pos */
++	if (cost_accounting) {
++		if (hash_strength < HASH_STRENGTH_FULL)
++			delta = HASH_STRENGTH_FULL - hash_strength;
++		else
++			delta = 0;
++
++		inc_rshash_pos(delta);
++	}
++
++	return val;
++}
++
++static int memcmp_pages_with_cost(struct page *page1, struct page *page2,
++			int cost_accounting)
++{
++	char *addr1, *addr2;
++	int ret;
++
++	addr1 = kmap_atomic(page1);
++	addr2 = kmap_atomic(page2);
++	ret = memcmp(addr1, addr2, PAGE_SIZE);
++	kunmap_atomic(addr2);
++	kunmap_atomic(addr1);
++	/* charge memcmp_cost to rshash_neg when asked to account for it */
++	if (cost_accounting)
++		inc_rshash_neg(memcmp_cost);
++
++	return ret;
++}
++
++static inline int pages_identical_with_cost(struct page *page1, struct page *page2)
++{
++	return !memcmp_pages_with_cost(page1, page2, 0);	/* 0: no cost accounting */
++}
++
++static inline int is_page_full_zero(struct page *page)
++{
++	char *addr;
++	int ret;
++
++	addr = kmap_atomic(page);
++	ret = is_full_zero(addr, PAGE_SIZE);	/* checked over the whole page */
++	kunmap_atomic(addr);
++
++	return ret;
++}
++
++static int write_protect_page(struct vm_area_struct *vma, struct page *page,
++			      pte_t *orig_pte, pte_t *old_pte)
++{
++	struct mm_struct *mm = vma->vm_mm;
++	struct page_vma_mapped_walk pvmw = {
++		.page = page,
++		.vma = vma,
++	};
++       struct mmu_notifier_range range;
++	int swapped;
++	int err = -EFAULT;
++
++	pvmw.address = page_address_in_vma(page, vma);
++	if (pvmw.address == -EFAULT)
++		goto out;
++
++	BUG_ON(PageTransCompound(page));
++
++        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, pvmw.address,
++                                pvmw.address + PAGE_SIZE);
++	mmu_notifier_invalidate_range_start(&range);
++
++	if (!page_vma_mapped_walk(&pvmw))
++		goto out_mn;
++	if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
++		goto out_unlock;
++	/* optionally report the pte as found, before any write-protection */
++	if (old_pte)
++		*old_pte = *pvmw.pte;
++
++	if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
++	    (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || mm_tlb_flush_pending(mm)) {
++		pte_t entry;
++
++		swapped = PageSwapCache(page);
++		flush_cache_page(vma, pvmw.address, page_to_pfn(page));
++		/*
++		 * Ok this is tricky, when get_user_pages_fast() runs it doesn't
++		 * take any lock, therefore the check that we are going to make
++		 * with the pagecount against the mapcount is racy and
++		 * O_DIRECT can happen right after the check.
++		 * So we clear the pte and flush the tlb before the check;
++		 * this assures us that no O_DIRECT can happen after the check
++		 * or in the middle of the check.
++		 */
++		entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
++		/*
++		 * Check that no O_DIRECT or similar I/O is in progress on the
++		 * page
++		 */
++		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
++			set_pte_at(mm, pvmw.address, pvmw.pte, entry);
++			goto out_unlock;
++		}
++		if (pte_dirty(entry))
++			set_page_dirty(page);
++
++		if (pte_protnone(entry))
++			entry = pte_mkclean(pte_clear_savedwrite(entry));
++		else
++			entry = pte_mkclean(pte_wrprotect(entry));
++
++		set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
++	}
++	*orig_pte = *pvmw.pte;	/* the pte as it stands now */
++	err = 0;
++
++out_unlock:
++	page_vma_mapped_walk_done(&pvmw);
++out_mn:
++	mmu_notifier_invalidate_range_end(&range);
++out:
++	return err;
++}
++
++#define MERGE_ERR_PGERR		1 /* the page is invalid, cannot continue */
++#define MERGE_ERR_COLLI		2 /* there is a collision */
++#define MERGE_ERR_COLLI_MAX	3 /* collision at the max hash strength */
++#define MERGE_ERR_CHANGED	4 /* the page has changed since last hash */
++
++
++/**
++ * replace_page - replace page in vma by new ksm page
++ * @vma:      vma that holds the pte pointing to page
++ * @page:     the page we are replacing by kpage
++ * @kpage:    the ksm page we replace page by
++ * @orig_pte: the pte value expected for page; nothing is replaced if it changed
++ *
++ * Returns 0 on success, MERGE_ERR_PGERR on failure.
++ */
++static int replace_page(struct vm_area_struct *vma, struct page *page,
++			struct page *kpage, pte_t orig_pte)
++{
++	struct mm_struct *mm = vma->vm_mm;
++       struct mmu_notifier_range range;
++	pgd_t *pgd;
++	p4d_t *p4d;
++	pud_t *pud;
++	pmd_t *pmd;
++	pte_t *ptep;
++	spinlock_t *ptl;
++	pte_t entry;
++
++	unsigned long addr;
++	int err = MERGE_ERR_PGERR;
++
++	addr = page_address_in_vma(page, vma);
++	if (addr == -EFAULT)
++		goto out;
++
++	pgd = pgd_offset(mm, addr);
++	if (!pgd_present(*pgd))
++		goto out;
++
++	p4d = p4d_offset(pgd, addr);	/* NOTE(review): no p4d_present() check */
++	pud = pud_offset(p4d, addr);
++	if (!pud_present(*pud))
++		goto out;
++
++	pmd = pmd_offset(pud, addr);
++	BUG_ON(pmd_trans_huge(*pmd));
++	if (!pmd_present(*pmd))
++		goto out;
++
++        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
++                                addr + PAGE_SIZE);
++	mmu_notifier_invalidate_range_start(&range);
++
++	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
++	if (!pte_same(*ptep, orig_pte)) {
++		pte_unmap_unlock(ptep, ptl);
++		goto out_mn;
++	}
++
++	flush_cache_page(vma, addr, pte_pfn(*ptep));
++	ptep_clear_flush_notify(vma, addr, ptep);
++	entry = mk_pte(kpage, vma->vm_page_prot);
++
++	/* zero pages get a special pte and no page reference or anon rmap */
++	if ((page_to_pfn(kpage) == uksm_zero_pfn) ||
++				(page_to_pfn(kpage) == zero_pfn)) {
++		entry = pte_mkspecial(entry);
++		dec_mm_counter(mm, MM_ANONPAGES);
++		inc_zone_page_state(page, NR_UKSM_ZERO_PAGES);
++	} else {
++		get_page(kpage);
++		page_add_anon_rmap(kpage, vma, addr, false);
++	}
++
++	set_pte_at_notify(mm, addr, ptep, entry);
++
++	page_remove_rmap(page, false);
++	if (!page_mapped(page))
++		try_to_free_swap(page);
++	put_page(page);
++
++	pte_unmap_unlock(ptep, ptl);
++	err = 0;
++out_mn:
++	mmu_notifier_invalidate_range_end(&range);
++out:
++	return err;
++}
++
++
++/**
++ *  Fully hash a page at HASH_STRENGTH_MAX and return a non-zero hash value.
++ *  A zero hash_max is reserved to indicate that the hash_max member has not
++ *  been calculated yet, so a computed 0 is mapped to 1.
++ *
++ * @page The page needs to be hashed
++ * @hash_old The hash value calculated with current hash strength
++ *
++ * return the new hash value calculated at HASH_STRENGTH_MAX
++ */
++static inline u32 page_hash_max(struct page *page, u32 hash_old)
++{
++	u32 hash_max = 0;
++	void *addr;
++
++	addr = kmap_atomic(page);
++	hash_max = delta_hash(addr, hash_strength,
++			      HASH_STRENGTH_MAX, hash_old);
++
++	kunmap_atomic(addr);
++
++	if (!hash_max)
++		hash_max = 1;
++
++	inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
++	return hash_max;
++}
++
++/*
++ * We compare the hash again, to ensure that it is really a hash collision
++ * instead of being caused by page write.
++ */
++static inline int check_collision(struct rmap_item *rmap_item,
++				  u32 hash)
++{
++	int err;
++	struct page *page = rmap_item->page;
++
++	/* if this rmap_item has already been hash_maxed, then the collision
++	 * must have appeared in the second-level rbtree search. In this case
++	 * we check if its hash_max value has been changed. Otherwise, the
++	 * collision happened in the first-level rbtree search, so we check
++	 * against its current hash value.
++	 */
++	if (rmap_item->hash_max) {
++		inc_rshash_neg(memcmp_cost);
++		inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
++		/* NOTE(review): page_hash_max() below adds the same delta again */
++		if (rmap_item->hash_max == page_hash_max(page, hash))
++			err = MERGE_ERR_COLLI;
++		else
++			err = MERGE_ERR_CHANGED;
++	} else {
++		inc_rshash_neg(memcmp_cost + hash_strength);
++
++		if (page_hash(page, hash_strength, 0) == hash)
++			err = MERGE_ERR_COLLI;
++		else
++			err = MERGE_ERR_CHANGED;
++	}
++
++	return err;
++}
++
++/**
++ * Try to merge a rmap_item.page with a kpage in stable node. kpage must
++ * already be a ksm page.
++ * @return 0 if the pages were merged (or page already is kpage), else a
++ * MERGE_ERR_ code or the negative errno from split_huge_page().
++ */
++static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item,
++				      struct page *kpage, u32 hash)
++{
++	struct vm_area_struct *vma = rmap_item->slot->vma;
++	struct mm_struct *mm = vma->vm_mm;
++	pte_t orig_pte = __pte(0);
++	int err = MERGE_ERR_PGERR;
++	struct page *page;
++
++	if (uksm_test_exit(mm))
++		goto out;
++
++	page = rmap_item->page;
++
++	if (page == kpage) { /* ksm page forked */
++		err = 0;
++		goto out;
++	}
++
++	/*
++	 * We need the page lock to read a stable PageSwapCache in
++	 * write_protect_page().  We use trylock_page() instead of
++	 * lock_page() because we don't want to wait here - we
++	 * prefer to continue scanning and merging different pages,
++	 * then come back to this page when it is unlocked.
++	 */
++	if (!trylock_page(page))
++		goto out;
++
++	if (!PageAnon(page) || !PageKsm(kpage))
++		goto out_unlock;
++
++	if (PageTransCompound(page)) {
++		err = split_huge_page(page);
++		if (err)
++			goto out_unlock;
++	}
++
++	/*
++	 * If this anonymous page is mapped only here, its pte may need
++	 * to be write-protected.  If it's mapped elsewhere, all of its
++	 * ptes are necessarily already write-protected.  But in either
++	 * case, we need to lock and check page_count is not raised.
++	 */
++	if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
++		if (pages_identical_with_cost(page, kpage))
++			err = replace_page(vma, page, kpage, orig_pte);
++		else
++			err = check_collision(rmap_item, hash);
++	}
++
++	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
++		munlock_vma_page(page);
++		if (!PageMlocked(kpage)) {
++			unlock_page(page);
++			lock_page(kpage);
++			mlock_vma_page(kpage);
++			page = kpage;		/* for final unlock */
++		}
++	}
++
++out_unlock:
++	unlock_page(page);
++out:
++	return err;
++}
++
++
++
++/**
++ * If two pages fail to merge in try_to_merge_two_pages, then we have a chance
++ * to restore a page mapping that has been changed in try_to_merge_two_pages.
++ *
++ * @return 0 on success, -EFAULT if the pte is gone or differs from @wprt_pte.
++ */
++static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
++			     pte_t orig_pte, pte_t wprt_pte)
++{
++	struct mm_struct *mm = vma->vm_mm;
++	pgd_t *pgd;
++	p4d_t *p4d;
++	pud_t *pud;
++	pmd_t *pmd;
++	pte_t *ptep;
++	spinlock_t *ptl;
++
++	int err = -EFAULT;
++
++	pgd = pgd_offset(mm, addr);
++	if (!pgd_present(*pgd))
++		goto out;
++
++	p4d = p4d_offset(pgd, addr);
++	pud = pud_offset(p4d, addr);
++	if (!pud_present(*pud))
++		goto out;
++
++	pmd = pmd_offset(pud, addr);
++	if (!pmd_present(*pmd))
++		goto out;
++
++	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
++	if (!pte_same(*ptep, wprt_pte)) {
++		/* already copied, let it be */
++		pte_unmap_unlock(ptep, ptl);
++		goto out;
++	}
++
++	/*
++	 * Good boy, still here. While we still hold the ksm page, it cannot
++	 * return to the free page pool, so there is no way that a pte was
++	 * changed to another page and got back to this page. And remember
++	 * that ksm pages are not reused in do_wp_page(). So it's safe to
++	 * restore the original pte.
++	 */
++	flush_cache_page(vma, addr, pte_pfn(*ptep));
++	ptep_clear_flush_notify(vma, addr, ptep);
++	set_pte_at_notify(mm, addr, ptep, orig_pte);
++
++	pte_unmap_unlock(ptep, ptl);
++	err = 0;
++out:
++	return err;
++}
++
++/**
++ * try_to_merge_two_pages() - take two identical pages and prepare
++ * them to be merged into one page(rmap_item->page)
++ *
++ * @return 0 if we successfully merged two identical pages into
++ *         one ksm page. MERGE_ERR_COLLI (or MERGE_ERR_COLLI_MAX when the
++ *         hash_max values also match) for a mere hash collision,
++ *         MERGE_ERR_CHANGED if a page changed since it was hashed,
++ *         MERGE_ERR_PGERR or a split_huge_page() errno otherwise.
++ */
++static int try_to_merge_two_pages(struct rmap_item *rmap_item,
++				  struct rmap_item *tree_rmap_item,
++				  u32 hash)
++{
++	pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
++	pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
++	struct vm_area_struct *vma1 = rmap_item->slot->vma;
++	struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
++	struct page *page = rmap_item->page;
++	struct page *tree_page = tree_rmap_item->page;
++	int err = MERGE_ERR_PGERR;
++	struct address_space *saved_mapping;
++
++
++	if (rmap_item->page == tree_rmap_item->page)
++		goto out;
++
++	if (!trylock_page(page))
++		goto out;
++
++	if (!PageAnon(page))
++		goto out_unlock;
++
++	if (PageTransCompound(page)) {
++		err = split_huge_page(page);
++		if (err)
++			goto out_unlock;
++	}
++
++	if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
++		unlock_page(page);
++		goto out;
++	}
++
++	/*
++	 * While we hold page lock, upgrade page from
++	 * PageAnon+anon_vma to PageKsm+NULL stable_node:
++	 * stable_tree_insert() will update stable_node.
++	 */
++	saved_mapping = page->mapping;
++	set_page_stable_node(page, NULL);
++	mark_page_accessed(page);
++	if (!PageDirty(page))
++		SetPageDirty(page);
++
++	unlock_page(page);
++
++	if (!trylock_page(tree_page))
++		goto restore_out;
++
++	if (!PageAnon(tree_page)) {
++		unlock_page(tree_page);
++		goto restore_out;
++	}
++
++	if (PageTransCompound(tree_page)) {
++		err = split_huge_page(tree_page);
++		if (err) {
++			unlock_page(tree_page);
++			goto restore_out;
++		}
++	}
++
++	if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
++		unlock_page(tree_page);
++		goto restore_out;
++	}
++
++	if (pages_identical_with_cost(page, tree_page)) {
++		err = replace_page(vma2, tree_page, page, wprt_pte2);
++		if (err) {
++			unlock_page(tree_page);
++			goto restore_out;
++		}
++
++		if ((vma2->vm_flags & VM_LOCKED)) {
++			munlock_vma_page(tree_page);
++			if (!PageMlocked(page)) {
++				unlock_page(tree_page);
++				lock_page(page);
++				mlock_vma_page(page);
++				tree_page = page; /* for final unlock */
++			}
++		}
++
++		unlock_page(tree_page);
++
++		goto out; /* success */
++
++	} else {
++		if (tree_rmap_item->hash_max &&
++		    tree_rmap_item->hash_max == rmap_item->hash_max) {
++			err = MERGE_ERR_COLLI_MAX;
++		} else if (page_hash(page, hash_strength, 0) ==
++		    page_hash(tree_page, hash_strength, 0)) {
++			inc_rshash_neg(memcmp_cost + hash_strength * 2);
++			err = MERGE_ERR_COLLI;
++		} else {
++			err = MERGE_ERR_CHANGED;
++		}
++
++		unlock_page(tree_page);
++	}
++	/* merge failed: try to undo page's write-protection and KSM upgrade */
++restore_out:
++	lock_page(page);
++	if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item),
++				  orig_pte1, wprt_pte1))
++		page->mapping = saved_mapping;
++
++out_unlock:
++	unlock_page(page);
++out:
++	return err;
++}
++
++static inline int hash_cmp(u32 new_val, u32 node_val)
++{
++	/* three-way compare: -1, 0 or 1 as new_val is below, equal or above */
++	if (new_val < node_val)
++		return -1;
++	if (new_val > node_val)
++		return 1;
++	return 0;
++}
++
++static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
++{
++	u32 hash_max = item->hash_max;
++
++	if (!hash_max) {
++		hash_max = page_hash_max(item->page, hash);
++		/* cache it in the item; page_hash_max() never returns 0 */
++		item->hash_max = hash_max;
++	}
++
++	return hash_max;
++}
++
++
++
++/**
++ * stable_tree_search() - search the stable tree for a page
++ *
++ * @item:	the rmap_item we are comparing with
++ * @hash:	the hash value of this item->page already calculated
++ *
++ * @return	the page we have found, NULL otherwise. The page returned has
++ *			had its refcount raised.
++ */
++static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
++{
++	struct rb_node *node = root_stable_treep->rb_node;
++	struct tree_node *tree_node;
++	unsigned long hash_max;
++	struct page *page = item->page;
++	struct stable_node *stable_node;
++
++	stable_node = page_stable_node(page);
++	if (stable_node) {
++		/* ksm page forked, that is
++		 * if (PageKsm(page) && !in_stable_tree(rmap_item))
++		 * it's actually gotten once outside.
++		 */
++		get_page(page);
++		return page;
++	}
++
++	while (node) {
++		int cmp;
++
++		tree_node = rb_entry(node, struct tree_node, node);
++
++		cmp = hash_cmp(hash, tree_node->hash);
++
++		if (cmp < 0)
++			node = node->rb_left;
++		else if (cmp > 0)
++			node = node->rb_right;
++		else
++			break;
++	}
++
++	if (!node)
++		return NULL;
++
++	if (tree_node->count == 1) {
++		stable_node = rb_entry(tree_node->sub_root.rb_node,
++				       struct stable_node, node);
++		BUG_ON(!stable_node);
++
++		goto get_page_out;
++	}
++
++	/*
++	 * ok, we have to search the second
++	 * level subtree, hash the page to a
++	 * full strength.
++	 */
++	node = tree_node->sub_root.rb_node;
++	BUG_ON(!node);
++	hash_max = rmap_item_hash_max(item, hash);
++
++	while (node) {
++		int cmp;
++
++		stable_node = rb_entry(node, struct stable_node, node);
++
++		cmp = hash_cmp(hash_max, stable_node->hash_max);
++
++		if (cmp < 0)
++			node = node->rb_left;
++		else if (cmp > 0)
++			node = node->rb_right;
++		else
++			goto get_page_out;
++	}
++
++	return NULL;
++
++get_page_out:
++	page = get_uksm_page(stable_node, 1, 1);	/* NULL if the node was stale */
++	return page;
++}
++
++static int try_merge_rmap_item(struct rmap_item *item,
++			       struct page *kpage,
++			       struct page *tree_page)
++{
++	struct vm_area_struct *vma = item->slot->vma;
++	struct page_vma_mapped_walk pvmw = {
++		.page = kpage,
++		.vma = vma,
++	};
++
++	pvmw.address = get_rmap_addr(item);
++	if (!page_vma_mapped_walk(&pvmw))
++		return 0;
++
++	if (pte_write(*pvmw.pte)) {
++		/* has changed, abort! */
++		page_vma_mapped_walk_done(&pvmw);
++		return 0;
++	}
++	/* repoint the pte from kpage to tree_page, moving the rmap and page ref */
++	get_page(tree_page);
++	page_add_anon_rmap(tree_page, vma, pvmw.address, false);
++
++	flush_cache_page(vma, pvmw.address, page_to_pfn(kpage));
++	ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
++	set_pte_at_notify(vma->vm_mm, pvmw.address, pvmw.pte,
++			  mk_pte(tree_page, vma->vm_page_prot));
++
++	page_remove_rmap(kpage, false);
++	put_page(kpage);
++
++	page_vma_mapped_walk_done(&pvmw);
++
++	return 1;
++}
++
++/**
++ * try_merge_with_stable() - when two rmap_items need to be inserted
++ * into stable tree, the page was found to be identical to a stable ksm page,
++ * this is the last chance we can merge them into one.
++ *
++ * @item1:	the rmap_item holding the page which we wanted to insert
++ *		into stable tree.
++ * @item2:	the other rmap_item we found when unstable tree search
++ * @kpage:	in: page mapped by both rmap_items; out: tree_page once merged
++ * @tree_page:	the page we found identical in stable tree node
++ * @success1:	return if item1 is successfully merged
++ * @success2:	return if item2 is successfully merged
++ */
++static void try_merge_with_stable(struct rmap_item *item1,
++				  struct rmap_item *item2,
++				  struct page **kpage,
++				  struct page *tree_page,
++				  int *success1, int *success2)
++{
++	struct vm_area_struct *vma1 = item1->slot->vma;
++	struct vm_area_struct *vma2 = item2->slot->vma;
++	*success1 = 0;
++	*success2 = 0;
++
++	if (unlikely(*kpage == tree_page)) {
++		/* I don't think this can really happen */
++		pr_warn("UKSM: unexpected condition detected in "
++			"%s -- *kpage == tree_page !\n", __func__);
++		*success1 = 1;
++		*success2 = 1;
++		return;
++	}
++
++	if (!PageAnon(*kpage) || !PageKsm(*kpage))
++		goto failed;
++
++	if (!trylock_page(tree_page))
++		goto failed;
++
++	/* If the oldpage is still ksm and still pointed
++	 * to in the right place, and still write protected,
++	 * we are confident it's not changed, no need to
++	 * memcmp anymore.
++	 * Beware, we cannot take nested pte locks,
++	 * deadlock risk.
++	 */
++	if (!try_merge_rmap_item(item1, *kpage, tree_page))
++		goto unlock_failed;
++
++	/* ok, then vma2, remind that pte1 already set */
++	if (!try_merge_rmap_item(item2, *kpage, tree_page))
++		goto success_1;
++
++	*success2 = 1;
++success_1:
++	*success1 = 1;
++
++
++	if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
++	    (*success2 && vma2->vm_flags & VM_LOCKED)) {
++		munlock_vma_page(*kpage);
++		if (!PageMlocked(tree_page))
++			mlock_vma_page(tree_page);
++	}
++
++	/*
++	 * We do not need oldpage any more in the caller, so can break the lock
++	 * now.
++	 */
++	unlock_page(*kpage);
++	*kpage = tree_page; /* Get unlocked outside. */
++	return;
++
++unlock_failed:
++	unlock_page(tree_page);
++failed:
++	return;
++}
++
++static inline void stable_node_hash_max(struct stable_node *node,
++					 struct page *page, u32 hash)
++{
++	/* A non-zero hash_max is already cached in the node: keep it. */
++	if (node->hash_max)
++		return;
++
++	/* Otherwise derive it from the page once and remember it. */
++	node->hash_max = page_hash_max(page, hash);
++}
++
++static inline
++struct stable_node *new_stable_node(struct tree_node *tree_node,
++				    struct page *kpage, u32 hash_max)
++{
++	struct stable_node *snode = alloc_stable_node();
++
++	/* Allocation failure is reported to the caller as NULL. */
++	if (!snode)
++		return NULL;
++
++	snode->tree_node = tree_node;
++	snode->hash_max = hash_max;
++	snode->kpfn = page_to_pfn(kpage);
++	set_page_stable_node(kpage, snode);
++
++	return snode;
++}
++
++/*
++ * first_level_insert() - insert into a tree_node that holds one stable_node:
++ * merge into it if the pages are identical, else start a hash_max subtree.
++ * Returns the stable_node used, or NULL if nothing was merged or inserted.
++ */
++static inline
++struct stable_node *first_level_insert(struct tree_node *tree_node,
++				       struct rmap_item *rmap_item,
++				       struct rmap_item *tree_rmap_item,
++				       struct page **kpage, u32 hash,
++				       int *success1, int *success2)
++{
++	int cmp;
++	struct page *tree_page;
++	u32 hash_max = 0;
++	struct stable_node *stable_node, *new_snode;
++	struct rb_node *parent = NULL, **new;
++
++	/* this tree node contains no sub-tree yet */
++	stable_node = rb_entry(tree_node->sub_root.rb_node,
++			       struct stable_node, node);
++
++	tree_page = get_uksm_page(stable_node, 1, 0);
++	if (tree_page) {
++		cmp = memcmp_pages_with_cost(*kpage, tree_page, 1);
++		if (!cmp) {
++			try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
++					      tree_page, success1, success2);
++			put_page(tree_page);
++			if (!*success1 && !*success2)
++				goto failed;
++			return stable_node;
++		} else {
++			/*
++			 * collision in first level try to create a subtree.
++			 * A new node need to be created.
++			 */
++			stable_node_hash_max(stable_node, tree_page,
++					     tree_node->hash);
++			/* drop our reference only after tree_page was hashed */
++			put_page(tree_page);
++			hash_max = rmap_item_hash_max(rmap_item, hash);
++			cmp = hash_cmp(hash_max, stable_node->hash_max);
++			parent = &stable_node->node;
++			if (cmp < 0)
++				new = &parent->rb_left;
++			else if (cmp > 0)
++				new = &parent->rb_right;
++			else
++				goto failed;
++		}
++	} else {
++		/* the only stable_node deleted, we reuse its tree_node. */
++		parent = NULL;
++		new = &tree_node->sub_root.rb_node;
++	}
++
++	new_snode = new_stable_node(tree_node, *kpage, hash_max);
++	if (!new_snode)
++		goto failed;
++
++	rb_link_node(&new_snode->node, parent, new);
++	rb_insert_color(&new_snode->node, &tree_node->sub_root);
++	tree_node->count++;
++	*success1 = *success2 = 1;
++
++	return new_snode;
++
++failed:
++	return NULL;
++}
++
++static inline
++struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
++					  struct rmap_item *rmap_item,
++					  struct rmap_item *tree_rmap_item,
++					  struct page **kpage, u32 hash,
++					  int *success1, int *success2)
++{	/* Returns the stable_node merged into or newly inserted, else NULL. */
++	struct page *tree_page;
++	u32 hash_max;
++	struct stable_node *stable_node, *new_snode;
++	struct rb_node *parent, **new;
++
++research:	/* (re)start the hash_max walk from the sub-tree root */
++	parent = NULL;
++	new = &tree_node->sub_root.rb_node;
++	BUG_ON(!*new);
++	hash_max = rmap_item_hash_max(rmap_item, hash);
++	while (*new) {
++		int cmp;
++
++		stable_node = rb_entry(*new, struct stable_node, node);
++
++		cmp = hash_cmp(hash_max, stable_node->hash_max);
++
++		if (cmp < 0) {
++			parent = *new;
++			new = &parent->rb_left;
++		} else if (cmp > 0) {
++			parent = *new;
++			new = &parent->rb_right;
++		} else {
++			tree_page = get_uksm_page(stable_node, 1, 0);
++			if (tree_page) {
++				cmp = memcmp_pages_with_cost(*kpage, tree_page, 1);
++				if (!cmp) {
++					try_merge_with_stable(rmap_item,
++						tree_rmap_item, kpage,
++						tree_page, success1, success2);
++
++					put_page(tree_page);
++					if (!*success1 && !*success2)
++						goto failed;
++					/*
++					 * successfully merged with a stable
++					 * node
++					 */
++					return stable_node;
++				} else {
++					put_page(tree_page);
++					goto failed;
++				}
++			} else {
++				/*
++				 * the stable node may have been deleted
++				 * and the subtree may have been
++				 * restructured; we cannot
++				 * continue, so search it again.
++				 */
++				if (tree_node->count) {
++					goto research;
++				} else {
++					/* count is zero: reuse the tree node */
++					parent = NULL;
++					new = &tree_node->sub_root.rb_node;
++				}
++			}
++		}
++	}
++
++	new_snode = new_stable_node(tree_node, *kpage, hash_max);
++	if (!new_snode)
++		goto failed;
++
++	rb_link_node(&new_snode->node, parent, new);
++	rb_insert_color(&new_snode->node, &tree_node->sub_root);
++	tree_node->count++;
++	*success1 = *success2 = 1;
++
++	return new_snode;
++
++failed:
++	return NULL;
++}
++
++
++/**
++ * stable_tree_insert() - try to insert a merged page in unstable tree to
++ * the stable tree
++ *
++ * @kpage:		in: the page to insert; out: the stable page merged into
++ * @hash:		the current hash of this page
++ * @rmap_item:		the rmap_item being scanned
++ * @tree_rmap_item:	the rmap_item found on unstable tree
++ * @success1:		return if rmap_item is merged
++ * @success2:		return if tree_rmap_item is merged
++ *
++ * @return		the stable_node on stable tree if at least one
++ *			rmap_item is inserted into stable tree, NULL
++ *			otherwise.
++ */
++static struct stable_node *
++stable_tree_insert(struct page **kpage, u32 hash,
++		   struct rmap_item *rmap_item,
++		   struct rmap_item *tree_rmap_item,
++		   int *success1, int *success2)
++{
++	struct rb_node **new = &root_stable_treep->rb_node;
++	struct rb_node *parent = NULL;
++	struct stable_node *stable_node;
++	struct tree_node *tree_node;
++	u32 hash_max = 0;	/* never changed here; passed to new_stable_node() */
++
++	*success1 = *success2 = 0;
++
++	while (*new) {	/* first level: locate the tree_node keyed by @hash */
++		int cmp;
++
++		tree_node = rb_entry(*new, struct tree_node, node);
++
++		cmp = hash_cmp(hash, tree_node->hash);
++
++		if (cmp < 0) {
++			parent = *new;
++			new = &parent->rb_left;
++		} else if (cmp > 0) {
++			parent = *new;
++			new = &parent->rb_right;
++		} else
++			break;
++	}
++
++	if (*new) {
++		if (tree_node->count == 1) {
++			stable_node = first_level_insert(tree_node, rmap_item,
++						tree_rmap_item, kpage,
++						hash, success1, success2);
++		} else {
++			stable_node = stable_subtree_insert(tree_node,
++					rmap_item, tree_rmap_item, kpage,
++					hash, success1, success2);
++		}
++	} else {
++
++		/* no tree node found: create one holding a single stable_node */
++		tree_node = alloc_tree_node(stable_tree_node_listp);
++		if (!tree_node) {
++			stable_node = NULL;
++			goto out;
++		}
++
++		stable_node = new_stable_node(tree_node, *kpage, hash_max);
++		if (!stable_node) {
++			free_tree_node(tree_node);
++			goto out;
++		}
++
++		tree_node->hash = hash;
++		rb_link_node(&tree_node->node, parent, new);
++		rb_insert_color(&tree_node->node, root_stable_treep);
++		parent = NULL;
++		new = &tree_node->sub_root.rb_node;
++
++		rb_link_node(&stable_node->node, parent, new);
++		rb_insert_color(&stable_node->node, &tree_node->sub_root);
++		tree_node->count++;
++		*success1 = *success2 = 1;
++	}
++
++out:
++	return stable_node;
++}
++
++
++/**
++ * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem
++ *
++ * @tree_rmap_item:	the unstable tree item whose page is to be gotten
++ *
++ * An item whose page mapping has changed is removed from its tree here.
++ * @return	0 on success, -EBUSY if unable to lock the mmap_sem,
++ *		-EINVAL if the page mapping has been changed.
++ */
++static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
++{
++	int err = get_mergeable_page_lock_mmap(tree_rmap_item);
++
++	/* A changed page mapping makes the item stale: drop it. */
++	if (err == -EINVAL)
++		remove_rmap_item_from_tree(tree_rmap_item);
++
++	/* Only a zero result means page gotten and mmap_sem locked. */
++	return err;
++}
++
++
++/**
++ * unstable_tree_search_insert() - look up an unstable tree rmap_item with the
++ * same hash and get its page; if none matches, insert @rmap_item, return NULL
++ */
++static inline
++struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
++					      u32 hash)
++
++{
++	struct rb_node **new = &root_unstable_tree.rb_node;
++	struct rb_node *parent = NULL;
++	struct tree_node *tree_node;
++	u32 hash_max;
++	struct rmap_item *tree_rmap_item;
++
++	while (*new) {	/* first level: tree_nodes keyed by @hash */
++		int cmp;
++
++		tree_node = rb_entry(*new, struct tree_node, node);
++
++		cmp = hash_cmp(hash, tree_node->hash);
++
++		if (cmp < 0) {
++			parent = *new;
++			new = &parent->rb_left;
++		} else if (cmp > 0) {
++			parent = *new;
++			new = &parent->rb_right;
++		} else
++			break;
++	}
++
++	if (*new) {
++		/* got the tree_node */
++		if (tree_node->count == 1) {
++			tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
++						  struct rmap_item, node);
++			BUG_ON(!tree_rmap_item);
++
++			goto get_page_out;
++		}
++
++		/* well, search the collision subtree (keyed by hash_max) */
++		new = &tree_node->sub_root.rb_node;
++		BUG_ON(!*new);
++		hash_max = rmap_item_hash_max(rmap_item, hash);
++
++		while (*new) {
++			int cmp;
++
++			tree_rmap_item = rb_entry(*new, struct rmap_item,
++						  node);
++
++			cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
++			parent = *new;
++			if (cmp < 0)
++				new = &parent->rb_left;
++			else if (cmp > 0)
++				new = &parent->rb_right;
++			else
++				goto get_page_out;
++		}
++	} else {
++		/* alloc a new tree_node */
++		tree_node = alloc_tree_node(&unstable_tree_node_list);
++		if (!tree_node)
++			return NULL;
++
++		tree_node->hash = hash;
++		rb_link_node(&tree_node->node, parent, new);
++		rb_insert_color(&tree_node->node, &root_unstable_tree);
++		parent = NULL;
++		new = &tree_node->sub_root.rb_node;
++	}
++
++	/* not found even in sub-tree; NOTE(review): count is not bumped here */
++	rmap_item->tree_node = tree_node;
++	rmap_item->address |= UNSTABLE_FLAG;
++	rmap_item->hash_round = uksm_hash_round;
++	rb_link_node(&rmap_item->node, parent, new);
++	rb_insert_color(&rmap_item->node, &tree_node->sub_root);
++
++	uksm_pages_unshared++;
++	return NULL;
++
++get_page_out:
++	if (tree_rmap_item->page == rmap_item->page)	/* same page: no merge */
++		return NULL;
++
++	if (get_tree_rmap_item_page(tree_rmap_item))	/* non-zero is an error */
++		return NULL;
++
++	return tree_rmap_item;
++}
++
++static void hold_anon_vma(struct rmap_item *rmap_item,
++			  struct anon_vma *anon_vma)
++{
++	get_anon_vma(anon_vma);		/* take the reference ... */
++	rmap_item->anon_vma = anon_vma;	/* ... and record it in the item */
++}
++
++
++/**
++ * stable_tree_append() - append a rmap_item to a stable node. Deduplication
++ * ratio statistics is done in this function.
++ *
++ * @rmap_item:	the item to hang off @stable_node
++ * @stable_node:	the stable node it was merged into (must not be NULL)
++ * @logdedup:	non-zero to update the per-slot dedup counters
++ */
++static void stable_tree_append(struct rmap_item *rmap_item,
++			       struct stable_node *stable_node, int logdedup)
++{
++	struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL;
++	struct node_vma *tail = NULL;	/* last node_vma with key < ours */
++	unsigned long key = (unsigned long)rmap_item->slot;
++	unsigned long factor = rmap_item->slot->rung->step;
++
++	BUG_ON(!stable_node);
++	rmap_item->address |= STABLE_FLAG;
++
++	if (hlist_empty(&stable_node->hlist)) {
++		uksm_pages_shared++;
++		goto node_vma_new;
++	}
++	uksm_pages_sharing++;
++
++	hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
++		if (node_vma->key >= key)
++			break;
++		tail = node_vma;
++
++		if (logdedup) {
++			node_vma->slot->pages_bemerged += factor;
++			if (list_empty(&node_vma->slot->dedup_list))
++				list_add(&node_vma->slot->dedup_list,
++					 &vma_slot_dedup);
++		}
++	}
++
++	if (node_vma) {
++		if (node_vma->key == key) {
++			node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist);
++			goto node_vma_ok;
++		} else if (node_vma->key > key) {
++			node_vma_cont = node_vma;
++		}
++	}
++
++node_vma_new:
++	/* no same vma already in node, alloc a new node_vma */
++	new_node_vma = alloc_node_vma();
++	BUG_ON(!new_node_vma);
++	new_node_vma->head = stable_node;
++	new_node_vma->slot = rmap_item->slot;
++
++	/* the cursor is NULL after a full walk, so use tail to keep order */
++	if (node_vma)
++		hlist_add_before(&new_node_vma->hlist, &node_vma->hlist);
++	else if (tail)
++		hlist_add_behind(&new_node_vma->hlist, &tail->hlist);
++	else
++		hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
++	node_vma = new_node_vma;
++
++node_vma_ok: /* ok, ready to add to the list */
++	rmap_item->head = node_vma;
++	hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist);
++	hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma);
++	if (logdedup) {
++		rmap_item->slot->pages_merged++;
++		if (node_vma_cont) {
++			node_vma = node_vma_cont;
++			hlist_for_each_entry_continue(node_vma, hlist) {
++				node_vma->slot->pages_bemerged += factor;
++				if (list_empty(&node_vma->slot->dedup_list))
++					list_add(&node_vma->slot->dedup_list,
++						 &vma_slot_dedup);
++			}
++		}
++	}
++}
++
++/*
++ * We use break_ksm to break COW on a ksm page: it's a stripped down
++ *
++ *	if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
++ *		put_page(page);
++ *
++ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
++ * in case the application has unmapped and remapped mm,addr meanwhile.
++ * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
++ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
++ */
++static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
++{
++	struct page *page;
++	int ret = 0;
++
++	do {
++		cond_resched();
++		page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
++		if (IS_ERR_OR_NULL(page))
++			break;
++		if (PageKsm(page)) {
++			ret = handle_mm_fault(vma, addr,
++					      FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
++                                             NULL);
++		} else
++			ret = VM_FAULT_WRITE;	/* not a ksm page: nothing to break */
++		put_page(page);
++	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
++	/*
++	 * We must loop because handle_mm_fault() may back out if there's
++	 * any difficulty e.g. if pte accessed bit gets updated concurrently.
++	 *
++	 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
++	 * COW has been broken, even if the vma does not permit VM_WRITE;
++	 * but note that a concurrent fault might break PageKsm for us.
++	 *
++	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
++	 * backing file, which also invalidates anonymous pages: that's
++	 * okay, that truncation will have unmapped the PageKsm for us.
++	 *
++	 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
++	 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
++	 * current task has TIF_MEMDIE set, and will be OOM killed on return
++	 * to user; and ksmd, having no mm, would never be chosen for that.
++	 *
++	 * But if the mm is in a limited mem_cgroup, then the fault may fail
++	 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
++	 * even ksmd can fail in this way - though it's usually breaking ksm
++	 * just to undo a merge it made a moment before, so unlikely to oom.
++	 *
++	 * That's a pity: we might therefore have more kernel pages allocated
++	 * than we're counting as nodes in the stable tree; but uksm_do_scan
++	 * will retry to break_cow on each pass, so should recover the page
++	 * in due course.  The important thing is to not let VM_MERGEABLE
++	 * be cleared while any such pages might remain in the area.
++	 */
++	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;	/* only OOM is reported */
++}
++
++/* break_cow() - break COW at the address tracked by @rmap_item. */
++static void break_cow(struct rmap_item *rmap_item)
++{
++	unsigned long addr = get_rmap_addr(rmap_item);
++	struct vm_area_struct *vma = rmap_item->slot->vma;
++
++	/*
++	 * Leave the mapping alone when uksm_test_exit() is true for the
++	 * owning mm; otherwise hand the address to break_ksm().
++	 */
++	if (!uksm_test_exit(vma->vm_mm))
++		break_ksm(vma, addr);	/* its return value is not propagated */
++}
++
++/*
++ * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
++ * than check every pte of a given vma, the locking doesn't quite work for
++ * that - an rmap_item is assigned to the stable tree after inserting ksm
++ * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
++ * rmap_items from parent to child at fork time (so as not to waste time
++ * if exit comes before the next scan reaches it).
++ *
++ * Similarly, although we'd like to remove rmap_items (so updating counts
++ * and freeing memory) when unmerging an area, it's easier to leave that
++ * to the next pass of ksmd - consider, for example, how ksmd might be
++ * in cmp_and_merge_page on one of the rmap_items we would be removing.
++ */
++inline int unmerge_uksm_pages(struct vm_area_struct *vma,
++		      unsigned long start, unsigned long end)
++{
++	unsigned long addr = start;
++	int err = 0;
++
++	/* Walk [start, end) page by page until done, exit, signal or error. */
++	while (addr < end && !err) {
++		if (uksm_test_exit(vma->vm_mm))
++			break;
++		err = signal_pending(current) ? -ERESTARTSYS :
++						break_ksm(vma, addr);
++		addr += PAGE_SIZE;
++	}
++	return err;
++}
++
++static inline void inc_uksm_pages_scanned(void)
++{
++	/*
++	 * Once the live counter reaches U64_MAX, fold it (shifted down by
++	 * pages_scanned_base) into pages_scanned_stored and restart from 0.
++	 */
++	if (uksm_pages_scanned == U64_MAX) {
++		u64 folded;
++
++		encode_benefit();
++		folded = uksm_pages_scanned >> pages_scanned_base;
++		/* Halve both terms and bump the base when the macro says so. */
++		if (CAN_OVERFLOW_U64(pages_scanned_stored, folded)) {
++			pages_scanned_stored >>= 1;
++			folded >>= 1;
++			pages_scanned_base++;
++		}
++		pages_scanned_stored += folded;
++		uksm_pages_scanned_last = 0;
++		uksm_pages_scanned = 0;
++	}
++	uksm_pages_scanned++;
++}
++
++static inline int find_zero_page_hash(int strength, u32 hash)
++{
++	return hash == zero_hash_table[strength];	/* 1 on a match, else 0 */
++}
++
++static
++int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
++{
++	struct page *zero_page = empty_uksm_zero_page;
++	struct mm_struct *mm = vma->vm_mm;
++	pte_t orig_pte = __pte(0);
++	int err = -EFAULT;
++
++	if (uksm_test_exit(mm))
++		goto out;
++
++	if (!trylock_page(page))
++		goto out;
++
++	if (!PageAnon(page))
++		goto out_unlock;
++
++	if (PageTransCompound(page)) {
++		err = split_huge_page(page);
++		if (err)
++			goto out_unlock;
++	}
++
++	if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
++		if (is_page_full_zero(page))
++			err = replace_page(vma, page, zero_page, orig_pte);
++	}
++
++out_unlock:
++	unlock_page(page);
++out:
++	return err;
++}
++
++/*
++ * cmp_and_merge_page() - first see if page can be merged into the stable
++ * tree; if not, compare hash to previous and if it's the same, see if page
++ * can be inserted into the unstable tree, or merged with a page already there
++ * and both transferred to the stable tree.
++ *
++ * @rmap_item: reverse mapping of the scanned page (rmap_item->page)
++ * @hash: hash of that page, used as the key in both trees
++ */
++static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash)
++{
++	struct rmap_item *tree_rmap_item;
++	struct page *page;
++	struct page *kpage = NULL;
++	u32 hash_max;
++	int err;
++	int success1, success2;	/* int: stable_tree_insert() takes int * */
++	struct stable_node *snode;
++	int cmp;
++	struct rb_node *parent = NULL, **new;
++
++	remove_rmap_item_from_tree(rmap_item);
++	page = rmap_item->page;
++
++	/* We first start with searching the page inside the stable tree */
++	kpage = stable_tree_search(rmap_item, hash);
++	if (kpage) {
++		err = try_to_merge_with_uksm_page(rmap_item, kpage,
++						 hash);
++		if (!err) {
++			/*
++			 * The page was successfully merged, add
++			 * its rmap_item to the stable tree.
++			 * page lock is needed because it's
++			 * racing with try_to_unmap_ksm(), etc.
++			 */
++			lock_page(kpage);
++			snode = page_stable_node(kpage);
++			stable_tree_append(rmap_item, snode, 1);
++			unlock_page(kpage);
++			put_page(kpage);
++			return; /* success */
++		}
++		put_page(kpage);
++
++		/*
++		 * if it's a collision and it has been searched in the
++		 * sub-rbtree (hash_max != 0), we want to abort, because if it
++		 * is successfully merged in unstable tree, the collision tends
++		 * to happen again.
++		 */
++		if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
++			return;
++	}
++
++	tree_rmap_item =
++		unstable_tree_search_insert(rmap_item, hash);
++	if (tree_rmap_item) {
++		err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
++		/*
++		 * As soon as we merge this page, we want to remove the
++		 * rmap_item of the page we have merged with from the unstable
++		 * tree, and insert it instead as new node in the stable tree.
++		 */
++		if (!err) {
++			kpage = page;
++			remove_rmap_item_from_tree(tree_rmap_item);
++			lock_page(kpage);
++			snode = stable_tree_insert(&kpage, hash,
++						   rmap_item, tree_rmap_item,
++						   &success1, &success2);
++
++			/*
++			 * Do not log dedup for tree item, it's not counted as
++			 * scanned in this round.
++			 */
++			if (success2)
++				stable_tree_append(tree_rmap_item, snode, 0);
++
++			/*
++			 * The order of these two stable append is important:
++			 * we are scanning rmap_item.
++			 */
++			if (success1)
++				stable_tree_append(rmap_item, snode, 1);
++
++			/*
++			 * The original kpage may be unlocked inside
++			 * stable_tree_insert() already. This page
++			 * should be unlocked before doing
++			 * break_cow().
++			 */
++			unlock_page(kpage);
++
++			if (!success1)
++				break_cow(rmap_item);
++
++			if (!success2)
++				break_cow(tree_rmap_item);
++
++		} else if (err == MERGE_ERR_COLLI) {
++			BUG_ON(tree_rmap_item->tree_node->count > 1);
++
++			rmap_item_hash_max(tree_rmap_item,
++					   tree_rmap_item->tree_node->hash);
++
++			hash_max = rmap_item_hash_max(rmap_item, hash);
++			cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
++			parent = &tree_rmap_item->node;
++			if (cmp < 0)
++				new = &parent->rb_left;
++			else if (cmp > 0)
++				new = &parent->rb_right;
++			else
++				goto put_up_out;
++
++			rmap_item->tree_node = tree_rmap_item->tree_node;
++			rmap_item->address |= UNSTABLE_FLAG;
++			rmap_item->hash_round = uksm_hash_round;
++			rb_link_node(&rmap_item->node, parent, new);
++			rb_insert_color(&rmap_item->node,
++					&tree_rmap_item->tree_node->sub_root);
++			rmap_item->tree_node->count++;
++		} else {
++			/*
++			 * either one of the page has changed or they collide
++			 * at the max hash, we consider them as ill items.
++			 */
++			remove_rmap_item_from_tree(tree_rmap_item);
++		}
++put_up_out:
++		put_page(tree_rmap_item->page);
++		mmap_read_unlock(tree_rmap_item->slot->vma->vm_mm);
++	}
++}
++
++
++
++
++static inline unsigned long get_pool_index(struct vma_slot *slot,
++					   unsigned long index)
++{
++	/* Byte offset of entry @index, reduced to the pool page holding it. */
++	unsigned long byte_off = sizeof(struct rmap_list_entry *) * index;
++	unsigned long pool_index = byte_off >> PAGE_SHIFT;
++
++	BUG_ON(pool_index >= slot->pool_size);	/* beyond the slot's pool */
++	return pool_index;
++}
++
++static inline unsigned long index_page_offset(unsigned long index)
++{
++	return offset_in_page(index * sizeof(struct rmap_list_entry *));
++}
++
++static inline
++struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot,
++					    unsigned long index, int need_alloc)
++{
++	unsigned long pool_index = get_pool_index(slot, index);
++	struct page *pool_page = slot->rmap_list_pool[pool_index];
++	void *addr;
++
++	if (!pool_page) {
++		/* Missing pool page: allocate it only on request. */
++		if (!need_alloc)
++			return NULL;
++
++		pool_page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
++		if (!pool_page)
++			return NULL;
++
++		slot->rmap_list_pool[pool_index] = pool_page;
++	}
++
++	/* Mapped with kmap(); put_rmap_list_entry() does the kunmap(). */
++	addr = kmap(pool_page);
++	addr += index_page_offset(index);
++
++	return addr;
++}
++
++static inline void put_rmap_list_entry(struct vma_slot *slot,
++				       unsigned long index)
++{
++	struct page *pool_page = slot->rmap_list_pool[get_pool_index(slot, index)];
++
++	/* kunmap() the pool page that holds entry @index. */
++	BUG_ON(!pool_page);
++	kunmap(pool_page);
++}
++
++static inline int entry_is_new(struct rmap_list_entry *entry)
++{
++	return entry->item == NULL;	/* nothing stored in the entry yet */
++}
++
++static inline unsigned long get_index_orig_addr(struct vma_slot *slot,
++						unsigned long index)
++{
++	return (index << PAGE_SHIFT) + slot->vma->vm_start;	/* page @index of the vma */
++}
++
++static inline unsigned long get_entry_address(struct rmap_list_entry *entry)
++{
++	/* An address-tagged entry yields it through get_clean_addr(). */
++	if (is_addr(entry->addr))
++		return get_clean_addr(entry->addr);
++
++	/*
++	 * Anything else has to reference an rmap_item, whose address is
++	 * used; an entry with neither is a bug.
++	 */
++	BUG_ON(!entry->item);
++	return get_rmap_addr(entry->item);
++}
++
++static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry)
++{
++	/*
++	 * Address-tagged entries have no rmap_item; others return entry->item.
++	 */
++	return is_addr(entry->addr) ? NULL : entry->item;
++}
++
++static inline void inc_rmap_list_pool_count(struct vma_slot *slot,
++					    unsigned long index)
++{
++	unsigned long pool_index = get_pool_index(slot, index);
++
++	/* Only a pool page that exists may gain a user. */
++	BUG_ON(!slot->rmap_list_pool[pool_index]);
++	slot->pool_counts[pool_index] += 1;
++}
++
++static inline void dec_rmap_list_pool_count(struct vma_slot *slot,
++					    unsigned long index)
++{
++	unsigned long pool_index = get_pool_index(slot, index);
++
++	/* The pool page must exist and still have a user to drop. */
++	BUG_ON(!slot->rmap_list_pool[pool_index]);
++	BUG_ON(!slot->pool_counts[pool_index]);
++	slot->pool_counts[pool_index] -= 1;
++}
++
++static inline int entry_has_rmap(struct rmap_list_entry *entry)
++{
++	return !(is_addr(entry->addr) || !entry->item);	/* untagged and set */
++}
++
++static inline void swap_entries(struct rmap_list_entry *entry1,
++				unsigned long index1,
++				struct rmap_list_entry *entry2,
++				unsigned long index2)
++{	/* Exchange two list entries and fix up item indexes and pool counts. */
++	struct rmap_list_entry tmp;
++
++	/* swapping two new entries is meaningless */
++	BUG_ON(entry_is_new(entry1) && entry_is_new(entry2));
++
++	tmp = *entry1;
++	*entry1 = *entry2;
++	*entry2 = tmp;
++
++	if (entry_has_rmap(entry1))	/* item now lives at index1 */
++		entry1->item->entry_index = index1;
++
++	if (entry_has_rmap(entry2))	/* item now lives at index2 */
++		entry2->item->entry_index = index2;
++
++	if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) {
++		inc_rmap_list_pool_count(entry1->item->slot, index1);
++		dec_rmap_list_pool_count(entry1->item->slot, index2);
++	} else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
++		inc_rmap_list_pool_count(entry2->item->slot, index2);
++		dec_rmap_list_pool_count(entry2->item->slot, index1);
++	}	/* both or neither hold an item: counts are unchanged */
++}
++
++static inline void free_entry_item(struct rmap_list_entry *entry)
++{
++	struct rmap_item *item;
++	unsigned long index;
++
++	if (is_addr(entry->addr))	/* no rmap_item to release */
++		return;
++	item = entry->item;
++	BUG_ON(!item);
++	entry->addr = get_rmap_addr(item);	/* store the address back */
++	set_is_addr(entry->addr);
++	index = item->entry_index;	/* read before the item is torn down */
++	remove_rmap_item_from_tree(item);
++	dec_rmap_list_pool_count(item->slot, index);
++	free_rmap_item(item);
++}
++
++static inline int pool_entry_boundary(unsigned long index)
++{
++	/* Index 0 never counts as a boundary. */
++	if (!index)
++		return 0;
++	return !offset_in_page(sizeof(struct rmap_list_entry *) * index);
++}
++
++static inline void try_free_last_pool(struct vma_slot *slot,
++				      unsigned long index)
++{
++	unsigned long pool_index = get_pool_index(slot, index);
++	struct page *pool_page = slot->rmap_list_pool[pool_index];
++
++	/* Keep the page while it is absent or still counted as in use. */
++	if (!pool_page || slot->pool_counts[pool_index])
++		return;
++
++	__free_page(pool_page);
++	slot->rmap_list_pool[pool_index] = NULL;
++	slot->flags |= UKSM_SLOT_NEED_SORT;
++}
++
++static inline unsigned long vma_item_index(struct vm_area_struct *vma,
++					   struct rmap_item *item)
++{
++	return (get_rmap_addr(item) - vma->vm_start) / PAGE_SIZE;	/* page offset in vma */
++}
++
++static int within_same_pool(struct vma_slot *slot,
++			    unsigned long i, unsigned long j)
++{
++	unsigned long first = get_pool_index(slot, i);
++	unsigned long second = get_pool_index(slot, j);
++
++	/* i and j share a pool page exactly when their pool indexes match */
++	/* (each lookup is bounds-checked by get_pool_index()) */
++	return first == second;
++}
++
++static void sort_rmap_entry_list(struct vma_slot *slot)
++{	/* Move every rmap_item entry back to the index its address maps to. */
++	unsigned long i, j;
++	struct rmap_list_entry *entry, *swap_entry;
++
++	entry = get_rmap_list_entry(slot, 0, 0);
++	for (i = 0; i < slot->pages; ) {
++
++		if (!entry)	/* no pool page behind index i */
++			goto skip_whole_pool;
++
++		if (entry_is_new(entry))
++			goto next_entry;
++
++		if (is_addr(entry->addr)) {	/* bare address: reset the entry */
++			entry->addr = 0;
++			goto next_entry;
++		}
++
++		j = vma_item_index(slot->vma, entry->item);	/* home index */
++		if (j == i)
++			goto next_entry;
++
++		if (within_same_pool(slot, i, j))
++			swap_entry = entry + j - i;
++		else	/* NOTE(review): a NULL return on alloc failure is not handled */
++			swap_entry = get_rmap_list_entry(slot, j, 1);
++
++		swap_entries(entry, i, swap_entry, j);
++		if (!within_same_pool(slot, i, j))
++			put_rmap_list_entry(slot, j);
++		continue;	/* re-examine what was swapped into index i */
++
++skip_whole_pool:
++		i += PAGE_SIZE / sizeof(*entry);
++		if (i < slot->pages)
++			entry = get_rmap_list_entry(slot, i, 0);
++		continue;
++
++next_entry:
++		if (i >= slot->pages - 1 ||
++		    !within_same_pool(slot, i, i + 1)) {
++			put_rmap_list_entry(slot, i);	/* leaving this pool page */
++			if (i + 1 < slot->pages)
++				entry = get_rmap_list_entry(slot, i + 1, 0);
++		} else
++			entry++;
++		i++;
++		continue;
++	}
++
++	/* free pool pages which contain no rmap_item */
++	/* Can be simplified to use only pool_counts once proven bug free. */
++	for (i = 0; i < slot->pool_size; i++) {
++		unsigned char has_rmap;
++		void *addr;
++
++		if (!slot->rmap_list_pool[i])
++			continue;
++
++		has_rmap = 0;
++		addr = kmap(slot->rmap_list_pool[i]);
++		BUG_ON(!addr);
++		for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
++			entry = (struct rmap_list_entry *)addr + j;
++			if (is_addr(entry->addr))
++				continue;
++			if (!entry->item)
++				continue;
++			has_rmap = 1;
++		}
++		kunmap(slot->rmap_list_pool[i]);
++		if (!has_rmap) {
++			BUG_ON(slot->pool_counts[i]);
++			__free_page(slot->rmap_list_pool[i]);
++			slot->rmap_list_pool[i] = NULL;
++		}
++	}
++
++	slot->flags &= ~UKSM_SLOT_NEED_SORT;
++}
++
++/*
++ * vma_fully_scanned() - true once pages_scanned has reached the slot's pages.
++ */
++static inline int vma_fully_scanned(struct vma_slot *slot)
++{
++	return slot->pages == slot->pages_scanned;
++}
++
++/**
++ * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
++ * its random permutation. This function is embedded with the random
++ * permutation index management code.
++ * Return: item with a page reference held and *hash set, or NULL.
++ */
++static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash)
++{
++	unsigned long rand_range, addr, swap_index, scan_index;
++	struct rmap_item *item = NULL;
++	struct rmap_list_entry *scan_entry, *swap_entry = NULL;
++	struct page *page;
++
++	scan_index = swap_index = slot->pages_scanned % slot->pages;
++
++	if (pool_entry_boundary(scan_index))
++		try_free_last_pool(slot, scan_index - 1);
++
++	if (vma_fully_scanned(slot)) {
++		if (slot->flags & UKSM_SLOT_NEED_SORT)
++			slot->flags |= UKSM_SLOT_NEED_RERAND;
++		else
++			slot->flags &= ~UKSM_SLOT_NEED_RERAND;
++		if (slot->flags & UKSM_SLOT_NEED_SORT)
++			sort_rmap_entry_list(slot);
++	}
++
++	scan_entry = get_rmap_list_entry(slot, scan_index, 1);
++	if (!scan_entry)
++		return NULL;
++
++	if (entry_is_new(scan_entry)) {
++		scan_entry->addr = get_index_orig_addr(slot, scan_index);
++		set_is_addr(scan_entry->addr);
++	}
++
++	if (slot->flags & UKSM_SLOT_NEED_RERAND) {
++		rand_range = slot->pages - scan_index;
++		BUG_ON(!rand_range);
++		swap_index = scan_index + (prandom_u32() % rand_range);
++	}
++
++	if (swap_index != scan_index) {
++		swap_entry = get_rmap_list_entry(slot, swap_index, 1);
++		if (!swap_entry) {
++			put_rmap_list_entry(slot, scan_index);	/* undo its kmap */
++			return NULL;
++		}
++
++		if (entry_is_new(swap_entry)) {
++			swap_entry->addr = get_index_orig_addr(slot,
++							       swap_index);
++			set_is_addr(swap_entry->addr);
++		}
++		swap_entries(scan_entry, scan_index, swap_entry, swap_index);
++	}
++
++	addr = get_entry_address(scan_entry);
++	item = get_entry_item(scan_entry);
++	BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
++
++	page = follow_page(slot->vma, addr, FOLL_GET);
++	if (IS_ERR_OR_NULL(page))
++		goto nopage;
++
++	if (!PageAnon(page))
++		goto putpage;
++
++	/*check is zero_page pfn or uksm_zero_page*/
++	if ((page_to_pfn(page) == zero_pfn)
++			|| (page_to_pfn(page) == uksm_zero_pfn))
++		goto putpage;
++
++	flush_anon_page(slot->vma, page, addr);
++	flush_dcache_page(page);
++
++	*hash = page_hash(page, hash_strength, 1);
++	inc_uksm_pages_scanned();
++	/*if the page content all zero, re-map to zero-page*/
++	if (find_zero_page_hash(hash_strength, *hash)) {
++		if (!cmp_and_merge_zero_page(slot->vma, page)) {
++			slot->pages_merged++;
++
++			/* For full-zero pages, no need to create rmap item */
++			goto putpage;
++		} else {
++			inc_rshash_neg(memcmp_cost / 2);
++		}
++	}
++
++	if (!item) {
++		item = alloc_rmap_item();
++		if (!item)
++			goto putpage;
++		/* It has already been zeroed */
++		item->slot = slot;
++		item->address = addr;
++		item->entry_index = scan_index;
++		scan_entry->item = item;
++		inc_rmap_list_pool_count(slot, scan_index);
++	}
++
++	BUG_ON(item->slot != slot);
++	/* the page may have changed */
++	item->page = page;
++	put_rmap_list_entry(slot, scan_index);
++	if (swap_entry)
++		put_rmap_list_entry(slot, swap_index);
++	return item;
++
++putpage:
++	put_page(page);
++	page = NULL;
++nopage:
++	/* no page, store addr back and free rmap_item if possible */
++	free_entry_item(scan_entry);
++	put_rmap_list_entry(slot, scan_index);
++	if (swap_entry)
++		put_rmap_list_entry(slot, swap_index);
++	return NULL;
++}
++
++static inline int in_stable_tree(struct rmap_item *rmap_item)
++{
++	return STABLE_FLAG & rmap_item->address;
++}
++
++/**
++ * scan_vma_one_page() - scan the next page in a vma_slot and account for it
++ * in the slot's scan counters. Called with mmap_sem locked.
++ */
++static noinline void scan_vma_one_page(struct vma_slot *slot)
++{
++	u32 hash;
++	struct rmap_item *rmap_item;
++
++	/*
++	 * Check slot before dereferencing it; its vma must still have an mm.
++	 */
++	BUG_ON(!slot);
++	BUG_ON(!slot->vma->vm_mm);
++
++	rmap_item = get_next_rmap_item(slot, &hash);
++	if (!rmap_item)
++		goto out1;
++
++	if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
++		goto out2;
++
++	cmp_and_merge_page(rmap_item, hash);
++out2:
++	put_page(rmap_item->page);
++out1:
++	slot->pages_scanned++;
++	slot->this_sampled++;
++	if (slot->fully_scanned_round != fully_scanned_round)
++		scanned_virtual_pages++;
++
++	if (vma_fully_scanned(slot))
++		slot->fully_scanned_round = fully_scanned_round;
++}
++
++static inline unsigned long rung_get_pages(struct scan_rung *rung)
++{
++	struct slot_tree_node *top;
++
++	if (rung->vma_root.rnode) {
++		top = container_of(rung->vma_root.rnode,
++				   struct slot_tree_node, snode);
++		return top->size;
++	}
++	return 0;	/* no root node: the rung holds no pages */
++}
++
++#define RUNG_SAMPLED_MIN	3
++
++static inline
++void uksm_calc_rung_step(struct scan_rung *rung,
++			 unsigned long page_time, unsigned long ratio)
++{
++	unsigned long nr_samples, total;
++
++	/* cover_msecs == 0: the rung will be fully scanned, step by one. */
++	if (!rung->cover_msecs)
++		goto every_page;
++
++	nr_samples = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
++		     * ratio / page_time;
++
++	/*
++	 * The per-page time has to be estimated before a scan round and
++	 * its expensive per-round jobs complete, so never sample fewer
++	 * than RUNG_SAMPLED_MIN pages.
++	 */
++	if (nr_samples < RUNG_SAMPLED_MIN)
++		nr_samples = RUNG_SAMPLED_MIN;
++
++	total = rung_get_pages(rung);
++	if (likely(total > nr_samples)) {
++		rung->step = total / nr_samples;
++		return;
++	}
++every_page:
++	rung->step = 1;
++}
++
++static inline int step_need_recalc(struct scan_rung *rung)
++{
++	unsigned long total = rung_get_pages(rung);
++	unsigned long limit = total / RUNG_SAMPLED_MIN;
++
++	/* An empty rung never needs a new step. */
++	if (!total)
++		return 0;
++	return rung->step > total || (limit && rung->step > limit);
++}
++
++static inline
++void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc)
++{
++	struct vma_slot *first;
++
++	/* Record that this rung completed its round, if the caller says so. */
++	if (finished)
++		rung->flags |= UKSM_RUNG_ROUND_FINISHED;
++	if (step_recalc || step_need_recalc(rung)) {
++		uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
++		BUG_ON(step_need_recalc(rung));
++	}
++	/* Restart the iteration from a random index below the step. */
++	slot_iter_index = prandom_u32() % rung->step;
++	BUG_ON(!rung->vma_root.rnode);
++	first = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter);
++	BUG_ON(!first);
++
++	rung->current_offset = slot_iter_index;
++	rung->current_scan = first;
++}
++
++static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot)
++{
++	return &(slot->rung)->vma_root;
++}
++
++/*
++ * Step to the next slot; return 1 if reset_current_scan() had to run, else 0.
++ */
++static int advance_current_scan(struct scan_rung *rung)
++{
++	/* unsigned long: the remainder can be as large as rung->step - 1 */
++	unsigned long n;
++	struct vma_slot *slot, *next = NULL;
++
++	BUG_ON(!rung->vma_root.num);
++
++	slot = rung->current_scan;
++	n = (slot->pages - rung->current_offset) % rung->step;
++	slot_iter_index = rung->step - n;
++	next = sradix_tree_next(&rung->vma_root, slot->snode,
++				slot->sindex, slot_iter);
++
++	if (!next) {
++		reset_current_scan(rung, 1, 0);
++		return 1;
++	}
++	rung->current_offset = slot_iter_index;
++	rung->current_scan = next;
++	return 0;
++}
++
++static inline void rung_rm_slot(struct vma_slot *slot)
++{
++	struct scan_rung *rung = slot->rung;
++
++	/* Step the scan cursor off this slot before it leaves the tree. */
++	if (rung->current_scan == slot)
++		advance_current_scan(rung);
++
++	sradix_tree_delete_from_leaf(slot_get_root(slot), slot->snode,
++				     slot->sindex);
++	slot->snode = NULL;
++	if (step_need_recalc(rung)) {
++		uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
++		BUG_ON(step_need_recalc(rung));
++	}
++
++	/* The cursor may have looped back onto this very slot again. */
++	if (rung->vma_root.num && rung->current_scan == slot)
++		reset_current_scan(rung, 1, 0);
++}
++
++static inline void rung_add_new_slots(struct scan_rung *rung,
++			struct vma_slot **slots, unsigned long num)
++{
++	struct sradix_tree_root *tree = &rung->vma_root;
++	unsigned long idx = 0;
++	int ret;
++
++	ret = sradix_tree_enter(tree, (void **)slots, num);
++	BUG_ON(ret);
++
++	while (idx < num) {
++		slots[idx]->rung = rung;
++		BUG_ON(vma_fully_scanned(slots[idx]));
++		idx++;
++	}
++
++	/* The tree now holds exactly this batch: set up its scan cursor. */
++	if (rung->vma_root.num == num)
++		reset_current_scan(rung, 0, 1);
++}
++
++static inline int rung_add_one_slot(struct scan_rung *rung,
++				     struct vma_slot *slot)
++{
++	int ret = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1);
++
++	if (!ret) {
++		slot->rung = rung;
++		/* Only slot in the tree: set up the scan cursor. */
++		if (rung->vma_root.num == 1)
++			reset_current_scan(rung, 0, 1);
++	}
++
++	/* 0 on success, else the error from sradix_tree_enter() */
++	return ret;
++}
++
++/*
++ * Move slot to rung; return 1 if it left its old rung, 0 if already there.
++ */
++static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung)
++{
++	struct scan_rung *prev = slot->rung;
++	int ret;
++
++	if (prev == rung)
++		return 0;
++
++	rung_rm_slot(slot);
++	ret = rung_add_one_slot(rung, slot);
++	if (!ret)
++		return 1;
++	/* Fall back to the old rung; badly OOM if even that fails. */
++	ret = rung_add_one_slot(prev, slot);
++	WARN_ON(ret); /* OOPS, badly OOM, we lost this slot */
++	return 1;
++}
++
++static inline int vma_rung_up(struct vma_slot *slot)
++{
++	struct scan_rung *top = &uksm_scan_ladder[SCAN_LADDER_SIZE-1];
++	struct scan_rung *target = slot->rung;
++
++	/* Climb one rung unless the slot already sits on the last one. */
++	if (target != top)
++		target++;
++	return vma_rung_enter(slot, target);
++}
++
++static inline int vma_rung_down(struct vma_slot *slot)
++{
++	struct scan_rung *bottom = &uksm_scan_ladder[0];
++	struct scan_rung *target = slot->rung;
++
++	/* Descend one rung unless the slot already sits on the first one. */
++	if (target != bottom)
++		target--;
++	return vma_rung_enter(slot, target);
++}
++
++/**
++ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot:
++ * the percentage of pages sampled since the slot was last judged that were
++ * merged, with thrashing areas filtered out.
++ */
++static unsigned long cal_dedup_ratio(struct vma_slot *slot)
++{
++	unsigned long sampled = slot->this_sampled;
++	unsigned long merged;
++
++	if (!sampled)
++		return 0;
++
++	BUG_ON(slot->pages_scanned == slot->last_scanned);
++
++	merged = slot->pages_merged;
++
++	/* Thrashing area filtering */
++	if (merged && uksm_thrash_threshold) {
++		if (slot->pages_cowed * 100 / slot->pages_merged
++		    <= uksm_thrash_threshold)
++			merged = slot->pages_merged - slot->pages_cowed;
++		else
++			merged = 0;
++	}
++
++	return merged * 100 / sampled;
++}
++
++/**
++ * cal_dedup_ratio_old() - Calculate the deduplication ratio of this slot
++ * from pages_bemerged as a percentage of all the slot's pages, with
++ * thrashing areas filtered out.
++ */
++static unsigned long cal_dedup_ratio_old(struct vma_slot *slot)
++{
++	unsigned long total = slot->pages;
++	unsigned long merged;
++
++	if (!total)
++		return 0;
++
++	merged = slot->pages_bemerged;
++
++	/* Thrashing area filtering */
++	if (merged && uksm_thrash_threshold) {
++		if (slot->pages_cowed * 100 / slot->pages_bemerged
++		    <= uksm_thrash_threshold)
++			merged = slot->pages_bemerged - slot->pages_cowed;
++		else
++			merged = 0;
++	}
++
++	return merged * 100 / total;
++}
++
++/**
++ * stable_node_reinsert() - When the hash_strength has been adjusted, the
++ * stable tree needs to be restructured; this function re-inserts one stable
++ * node into the new tree. On failure new_node->tree_node is set to NULL.
++ */
++static inline void stable_node_reinsert(struct stable_node *new_node,
++					struct page *page,
++					struct rb_root *root_treep,
++					struct list_head *tree_node_listp,
++					u32 hash)
++{
++	struct rb_node **new = &root_treep->rb_node;
++	struct rb_node *parent = NULL;
++	struct stable_node *stable_node;
++	struct tree_node *tree_node;
++	struct page *tree_page;
++	int cmp;
++
++	while (*new) {
++		int cmp;
++
++		tree_node = rb_entry(*new, struct tree_node, node);
++
++		cmp = hash_cmp(hash, tree_node->hash);
++
++		if (cmp < 0) {
++			parent = *new;
++			new = &parent->rb_left;
++		} else if (cmp > 0) {
++			parent = *new;
++			new = &parent->rb_right;
++		} else
++			break;
++	}
++
++	if (*new) {
++		/* found a tree node with the same first level hash value */
++		stable_node_hash_max(new_node, page, hash);
++		if (tree_node->count == 1) {
++			stable_node = rb_entry(tree_node->sub_root.rb_node,
++					       struct stable_node, node);
++			tree_page = get_uksm_page(stable_node, 1, 0);
++			if (tree_page) {
++				stable_node_hash_max(stable_node,
++						      tree_page, hash);
++				put_page(tree_page);
++
++				/* prepare for stable node insertion */
++
++				cmp = hash_cmp(new_node->hash_max,
++						   stable_node->hash_max);
++				parent = &stable_node->node;
++				if (cmp < 0)
++					new = &parent->rb_left;
++				else if (cmp > 0)
++					new = &parent->rb_right;
++				else
++					goto failed;
++
++				goto add_node;
++			} else {
++				/* the only stable_node was deleted, but the
++				 * tree node was not; reuse the tree node.
++				 */
++				goto tree_node_reuse;
++			}
++		}
++
++		/* search the collision subtree, ordered by hash_max */
++		new = &tree_node->sub_root.rb_node;
++		parent = NULL;
++		BUG_ON(!*new);
++		while (*new) {
++			int cmp;
++
++			stable_node = rb_entry(*new, struct stable_node, node);
++
++			cmp = hash_cmp(new_node->hash_max,
++					   stable_node->hash_max);
++
++			if (cmp < 0) {
++				parent = *new;
++				new = &parent->rb_left;
++			} else if (cmp > 0) {
++				parent = *new;
++				new = &parent->rb_right;
++			} else {
++				/* oh, no, still a collision */
++				goto failed;
++			}
++		}
++
++		goto add_node;
++	}
++
++	/* no tree node found, allocate one and link it at the search position */
++	tree_node = alloc_tree_node(tree_node_listp);
++	if (!tree_node) {
++		pr_err("UKSM: memory allocation error!\n");
++		goto failed;
++	} else {
++		tree_node->hash = hash;
++		rb_link_node(&tree_node->node, parent, new);
++		rb_insert_color(&tree_node->node, root_treep);
++
++tree_node_reuse:
++		/* prepare for stable node insertion */
++		parent = NULL;
++		new = &tree_node->sub_root.rb_node;
++	}
++
++add_node:
++	rb_link_node(&new_node->node, parent, new);
++	rb_insert_color(&new_node->node, &tree_node->sub_root);
++	new_node->tree_node = tree_node;
++	tree_node->count++;
++	return;
++
++failed:
++	/* Reached when two nodes collide on both hash levels, or when
++	 * the tree node allocation above fails.
++	 */
++	new_node->tree_node = NULL;
++	return;
++}
++
++static inline void free_all_tree_nodes(struct list_head *list)
++{
++	struct tree_node *cur, *next;
++
++	/* _safe iteration, since each node is handed to free_tree_node(). */
++	list_for_each_entry_safe(cur, next, list, all_list)
++		free_tree_node(cur);
++}
++
++/**
++ * stable_tree_delta_hash() - Delta hash the stable tree from previous hash
++ * strength to the current hash_strength. It re-structures the whole tree.
++ */
++static inline void stable_tree_delta_hash(u32 prev_hash_strength)
++{
++	struct stable_node *node, *tmp;
++	struct rb_root *root_new_treep;
++	struct list_head *new_tree_node_listp;
++
++	stable_tree_index = (stable_tree_index + 1) % 2;
++	root_new_treep = &root_stable_tree[stable_tree_index];
++	new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
++	*root_new_treep = RB_ROOT;
++	BUG_ON(!list_empty(new_tree_node_listp));
++
++	/*
++	 * we need to be safe, the node could be removed by get_uksm_page()
++	 */
++	list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
++		void *addr;
++		struct page *node_page;
++		u32 hash;
++
++		/*
++		 * We are completely re-structuring the stable nodes to a new
++		 * stable tree. We don't want to touch the old tree unlinks and
++		 * old tree_nodes. The old tree_nodes will be freed at once.
++		 */
++		node_page = get_uksm_page(node, 0, 0);
++		if (!node_page)
++			continue;
++
++		if (node->tree_node) {
++			hash = node->tree_node->hash;
++
++			addr = kmap_atomic(node_page);
++
++			hash = delta_hash(addr, prev_hash_strength,
++					  hash_strength, hash);
++			kunmap_atomic(addr);
++		} else {
++			/*
++			 * it was not inserted into the rbtree due to a
++			 * collision in the last round scan; hash it afresh.
++			 */
++			hash = page_hash(node_page, hash_strength, 0);
++		}
++
++		stable_node_reinsert(node, node_page, root_new_treep,
++				     new_tree_node_listp, hash);
++		put_page(node_page);
++	}
++
++	root_stable_treep = root_new_treep;
++	free_all_tree_nodes(stable_tree_node_listp);
++	BUG_ON(!list_empty(stable_tree_node_listp));
++	stable_tree_node_listp = new_tree_node_listp;
++}
++
++static inline void inc_hash_strength(unsigned long delta)
++{
++	hash_strength = hash_strength + (1 << delta);
++	if (HASH_STRENGTH_MAX < hash_strength)
++		hash_strength = HASH_STRENGTH_MAX;	/* clamp at the max */
++}
++
++static inline void dec_hash_strength(unsigned long delta)
++{
++	unsigned long step = 1 << delta;
++
++	if (hash_strength > step + 1)
++		hash_strength -= step;
++	else
++		hash_strength = 1;	/* never drop below 1 */
++}
++
++static inline void inc_hash_strength_delta(void)
++{
++	/* Bump the delta, saturating at HASH_STRENGTH_DELTA_MAX. */
++	if (++hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
++		hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
++}
++
++static inline unsigned long get_current_neg_ratio(void)
++{
++	u64 gain = benefit.pos, cost = benefit.neg;
++
++	if (!cost)
++		return 0;
++
++	if (!gain || cost > gain)
++		return 100;
++
++	/* Scale whichever side keeps cost * 100 from overflowing a u64. */
++	if (cost <= div64_u64(U64_MAX, 100))
++		cost *= 100;
++	else
++		gain = div64_u64(gain, 100);
++
++	return div64_u64(cost, gain);
++}
++
++static inline unsigned long get_current_benefit(void)
++{
++	u64 pos = benefit.pos, neg = benefit.neg;
++	u64 scanned = benefit.scanned;
++
++	/* No net gain, or nothing scanned: 0, and no division by zero. */
++	if (!scanned || neg > pos)
++		return 0;
++
++	return div64_u64((pos - neg), scanned);
++}
++
++static inline int judge_rshash_direction(void)
++{
++	u64 current_neg_ratio, stable_benefit;
++	u64 current_benefit, delta = 0;
++	int ret = STILL;
++
++	/*
++	 * Force a probe when the round counter's low byte is 10: shortly
++	 * after boot, and every 256 rounds in case the system stays still.
++	 */
++	if ((fully_scanned_round & 0xFFULL) == 10) {
++		ret = OBSCURE;
++		goto out;
++	}
++
++	current_neg_ratio = get_current_neg_ratio();
++
++	if (current_neg_ratio == 0) {
++		rshash_neg_cont_zero++;
++		if (rshash_neg_cont_zero > 2)
++			return GO_DOWN;
++		else
++			return STILL;
++	}
++	rshash_neg_cont_zero = 0;
++
++	if (current_neg_ratio > 90) {
++		ret = GO_UP;
++		goto out;
++	}
++
++	current_benefit = get_current_benefit();
++	stable_benefit = rshash_state.stable_benefit;
++
++	if (!stable_benefit) {
++		ret = OBSCURE;
++		goto out;
++	}
++
++	if (current_benefit > stable_benefit)
++		delta = current_benefit - stable_benefit;
++	else if (current_benefit < stable_benefit)
++		delta = stable_benefit - current_benefit;
++
++	delta = div64_u64(100 * delta, stable_benefit);
++
++	if (delta > 50) {
++		rshash_cont_obscure++;
++		if (rshash_cont_obscure > 2)
++			return OBSCURE;
++		else
++			return STILL;
++	}
++
++out:
++	rshash_cont_obscure = 0;
++	return ret;
++}
++
++/**
++ * rshash_adjust() - The main function to control the random sampling state
++ * machine for hash strength adapting.
++ *
++ * Return: true if hash_strength has changed, false otherwise.
++ */
++static inline int rshash_adjust(void)
++{
++	unsigned long prev_hash_strength = hash_strength;
++
++	if (!encode_benefit())
++		return 0;
++
++	switch (rshash_state.state) {
++	case RSHASH_STILL:
++		switch (judge_rshash_direction()) {
++		case GO_UP:
++			if (rshash_state.pre_direct == GO_DOWN)
++				hash_strength_delta = 0;
++
++			inc_hash_strength(hash_strength_delta);
++			inc_hash_strength_delta();
++			rshash_state.stable_benefit = get_current_benefit();
++			rshash_state.pre_direct = GO_UP;
++			break;
++
++		case GO_DOWN:
++			if (rshash_state.pre_direct == GO_UP)
++				hash_strength_delta = 0;
++
++			dec_hash_strength(hash_strength_delta);
++			inc_hash_strength_delta();
++			rshash_state.stable_benefit = get_current_benefit();
++			rshash_state.pre_direct = GO_DOWN;
++			break;
++
++		case OBSCURE:
++			rshash_state.stable_point = hash_strength;
++			rshash_state.turn_point_down = hash_strength;
++			rshash_state.turn_point_up = hash_strength;
++			rshash_state.turn_benefit_down = get_current_benefit();
++			rshash_state.turn_benefit_up = get_current_benefit();
++			rshash_state.lookup_window_index = 0;
++			rshash_state.state = RSHASH_TRYDOWN;
++			dec_hash_strength(hash_strength_delta);
++			inc_hash_strength_delta();
++			break;
++
++		case STILL:
++			break;
++		default:
++			BUG();
++		}
++		break;
++
++	case RSHASH_TRYDOWN:
++		if (rshash_state.lookup_window_index++ % 5 == 0)
++			rshash_state.below_count = 0;
++
++		if (get_current_benefit() < rshash_state.stable_benefit)
++			rshash_state.below_count++;
++		else if (get_current_benefit() >
++			 rshash_state.turn_benefit_down) {
++			rshash_state.turn_point_down = hash_strength;
++			rshash_state.turn_benefit_down = get_current_benefit();
++		}
++
++		if (rshash_state.below_count >= 3 ||
++		    judge_rshash_direction() == GO_UP ||
++		    hash_strength == 1) {
++			hash_strength = rshash_state.stable_point;
++			hash_strength_delta = 0;
++			inc_hash_strength(hash_strength_delta);
++			inc_hash_strength_delta();
++			rshash_state.lookup_window_index = 0;
++			rshash_state.state = RSHASH_TRYUP;
++			hash_strength_delta = 0;
++		} else {
++			dec_hash_strength(hash_strength_delta);
++			inc_hash_strength_delta();
++		}
++		break;
++
++	case RSHASH_TRYUP:
++		if (rshash_state.lookup_window_index++ % 5 == 0)
++			rshash_state.below_count = 0;
++
++		if (get_current_benefit() < rshash_state.turn_benefit_down)
++			rshash_state.below_count++;
++		else if (get_current_benefit() > rshash_state.turn_benefit_up) {
++			rshash_state.turn_point_up = hash_strength;
++			rshash_state.turn_benefit_up = get_current_benefit();
++		}
++
++		if (rshash_state.below_count >= 3 ||
++		    judge_rshash_direction() == GO_DOWN ||
++		    hash_strength == HASH_STRENGTH_MAX) {
++			hash_strength = rshash_state.turn_benefit_up >
++				rshash_state.turn_benefit_down ?
++				rshash_state.turn_point_up :
++				rshash_state.turn_point_down;
++
++			rshash_state.state = RSHASH_PRE_STILL;
++		} else {
++			inc_hash_strength(hash_strength_delta);
++			inc_hash_strength_delta();
++		}
++
++		break;
++
++	case RSHASH_NEW:
++	case RSHASH_PRE_STILL:
++		rshash_state.stable_benefit = get_current_benefit();
++		rshash_state.state = RSHASH_STILL;
++		hash_strength_delta = 0;
++		break;
++	default:
++		BUG();
++	}
++
++	/* start the next evaluation window with cleared benefit counters */
++	reset_benefit();
++
++	if (prev_hash_strength != hash_strength)
++		stable_tree_delta_hash(prev_hash_strength);
++
++	return prev_hash_strength != hash_strength;
++}
++
++/**
++ * round_update_ladder() - Per-round adjustments run whenever a scan round
++ * has finished: clear the rungs' finished flags and promote dedup slots.
++ */
++static noinline void round_update_ladder(void)
++{
++	struct vma_slot *slot, *next;
++	struct scan_rung *rung;
++	unsigned long ratio;
++
++	for (rung = uksm_scan_ladder;
++	     rung < uksm_scan_ladder + SCAN_LADDER_SIZE; rung++)
++		rung->flags &= ~UKSM_RUNG_ROUND_FINISHED;
++
++	list_for_each_entry_safe(slot, next, &vma_slot_dedup, dedup_list) {
++		/* slot may have been rung_rm_slot()ed when its mm exited */
++		if (slot->snode) {
++			ratio = cal_dedup_ratio_old(slot);
++			if (ratio && ratio >= uksm_abundant_threshold)
++				vma_rung_up(slot);
++		}
++
++		/* Clear the per-round counters and leave the dedup list. */
++		slot->pages_cowed = 0;
++		slot->pages_bemerged = 0;
++		list_del_init(&slot->dedup_list);
++	}
++}
++
++static void uksm_del_vma_slot(struct vma_slot *slot)
++{
++	int i, j;
++	struct rmap_list_entry *entry;
++
++	if (slot->snode) {
++		/*
++		 * A slot that failed to enter a rung has no snode and
++		 * needs no removal.
++		 */
++		rung_rm_slot(slot);
++	}
++
++	if (!list_empty(&slot->dedup_list))
++		list_del(&slot->dedup_list);
++
++	if (!slot->rmap_list_pool || !slot->pool_counts) {
++		/* In case it OOMed in uksm_vma_enter() */
++		goto out;
++	}
++
++	for (i = 0; i < slot->pool_size; i++) {
++		void *addr;
++
++		if (!slot->rmap_list_pool[i])
++			continue;
++
++		addr = kmap(slot->rmap_list_pool[i]);
++		for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
++			entry = (struct rmap_list_entry *)addr + j;
++			if (is_addr(entry->addr))
++				continue;
++			if (!entry->item)
++				continue;
++
++			remove_rmap_item_from_tree(entry->item);
++			free_rmap_item(entry->item);
++			slot->pool_counts[i]--;
++		}
++		BUG_ON(slot->pool_counts[i]);
++		kunmap(slot->rmap_list_pool[i]);
++		__free_page(slot->rmap_list_pool[i]);
++	}
++	kfree(slot->rmap_list_pool);
++	kfree(slot->pool_counts);
++
++out:
++	slot->rung = NULL;
++	if (slot->flags & UKSM_SLOT_IN_UKSM) {
++		BUG_ON(uksm_pages_total < slot->pages);
++		uksm_pages_total -= slot->pages;
++	}
++
++	if (slot->fully_scanned_round == fully_scanned_round)
++		scanned_virtual_pages -= slot->pages;
++	else
++		scanned_virtual_pages -= slot->pages_scanned;
++	free_vma_slot(slot);
++}
++
++
++#define SPIN_LOCK_PERIOD	32
++static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD];
++static inline void cleanup_vma_slots(void)
++{
++	struct vma_slot *victim;
++	int nr = 0;
++
++	spin_lock(&vma_slot_list_lock);
++	while (!list_empty(&vma_slot_del)) {
++		victim = list_first_entry(&vma_slot_del,
++					  struct vma_slot, slot_list);
++		list_del(&victim->slot_list);
++		cleanup_slots[nr++] = victim;
++		if (nr < SPIN_LOCK_PERIOD)
++			continue;
++		/* Batch is full: delete it with the lock dropped. */
++		spin_unlock(&vma_slot_list_lock);
++		while (nr > 0)
++			uksm_del_vma_slot(cleanup_slots[--nr]);
++		spin_lock(&vma_slot_list_lock);
++	}
++	spin_unlock(&vma_slot_list_lock);
++
++	/* Delete whatever is left in the last, partial batch. */
++	while (nr > 0)
++		uksm_del_vma_slot(cleanup_slots[--nr]);
++}
++
++/*
++ * Exponential moving average formula
++ */
++static inline unsigned long ema(unsigned long curr, unsigned long last_ema)
++{
++	unsigned long blended;
++
++	/*
++	 * A very high burst defeats even the EMA: a falsely high per-page
++	 * time estimate feeds back into heavy context-switch and rung-update
++	 * overhead, which raises the per-page time further and may never
++	 * converge. So approach such a value in a binary manner instead.
++	 */
++	if (curr > last_ema * 10)
++		return last_ema * 2;
++	blended = EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema;
++	return blended / 100;
++}
++
++/*
++ * Convert a cpu ratio, configured by the user in units of
++ * 1/TIME_RATIO_SCALE, to nanoseconds based on current uksm_sleep_jiffies.
++ */
++static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio)
++{
++	return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies)
++		/ (TIME_RATIO_SCALE - ratio) * ratio;
++}
++
++
++static inline unsigned long rung_real_ratio(int cpu_time_ratio)
++{
++	unsigned long ratio;
++
++	BUG_ON(!cpu_time_ratio);
++
++	/* Negative values are scaled by uksm_max_cpu_percentage / 100. */
++	if (cpu_time_ratio < 0)
++		ratio = (unsigned long)(-cpu_time_ratio) *
++			uksm_max_cpu_percentage / 100UL;
++	else
++		ratio = cpu_time_ratio;
++	return ratio ? ratio : 1;
++}
++
++static noinline void uksm_calc_scan_pages(void)
++{
++	struct scan_rung *rung = uksm_scan_ladder;
++	unsigned long page_ns, ratio, sleep_usecs;
++	int i;
++
++	/*
++	 * Restart from the default per-page time when the estimate runs
++	 * away (> 100000) and on every 256th eval round.
++	 */
++	if (uksm_ema_page_time > 100000 ||
++	    (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL))
++		uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
++
++	page_ns = uksm_ema_page_time;
++	BUG_ON(!page_ns);
++
++	/*
++	 * Every 8 eval rounds, probe a uksm_sleep_jiffies value based on
++	 * the saved user input.
++	 */
++	if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL)
++		uksm_sleep_jiffies = uksm_sleep_saved;
++
++	/* The first rung must be able to scan at least 1 page per period. */
++	ratio = rung_real_ratio(rung->cpu_ratio);
++	if (cpu_ratio_to_nsec(ratio) < page_ns) {
++		sleep_usecs = page_ns * (TIME_RATIO_SCALE - ratio) / ratio
++				/ NSEC_PER_USEC;
++		uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1;
++	}
++
++	for (i = 0; i < SCAN_LADDER_SIZE; i++, rung++) {
++		ratio = rung_real_ratio(rung->cpu_ratio);
++		rung->pages_to_scan = cpu_ratio_to_nsec(ratio) / page_ns;
++		BUG_ON(!rung->pages_to_scan);
++		uksm_calc_rung_step(rung, page_ns, ratio);
++	}
++}
++
++/*
++ * Convert this round's scan time (ns) to the next expected minimum sleep
++ * time (ms), guarding against overflow. ratio is from rung_real_ratio().
++ */
++static inline
++unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio)
++{
++	unsigned long msecs;
++
++	scan_time >>= 20; /* to msec level now */
++	BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE));
++	msecs = (unsigned long) scan_time;
++	return (unsigned int) (msecs * (TIME_RATIO_SCALE - ratio) / ratio);
++}
++
++#define __round_mask(x, y) ((__typeof__(x))((y)-1))
++#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
++
++static void uksm_vma_enter(struct vma_slot **slots, unsigned long num)
++{
++	/*
++	 * New slots always start on the first rung of the scan ladder.
++	 */
++	rung_add_new_slots(&uksm_scan_ladder[0], slots, num);
++}
++
++static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE];
++
++static void uksm_enter_all_slots(void)
++{
++	struct vma_slot *slot;
++	unsigned long index;
++	struct list_head empty_vma_list;
++	int i;
++
++	i = 0;
++	index = 0;
++	INIT_LIST_HEAD(&empty_vma_list);
++	/* Drain vma_slot_new, dropping the lock periodically to reschedule. */
++	spin_lock(&vma_slot_list_lock);
++	while (!list_empty(&vma_slot_new)) {
++		slot = list_entry(vma_slot_new.next,
++				  struct vma_slot, slot_list);
++
++		if (!slot->vma->anon_vma) {
++			list_move(&slot->slot_list, &empty_vma_list);
++		} else if (vma_can_enter(slot->vma)) {
++			batch_slots[index++] = slot;
++			list_del_init(&slot->slot_list);
++		} else {
++			list_move(&slot->slot_list, &vma_slot_noadd);
++		}
++
++		if (++i == SPIN_LOCK_PERIOD ||
++		    (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) {
++			spin_unlock(&vma_slot_list_lock);
++
++			if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) {
++				uksm_vma_enter(batch_slots, index);
++				index = 0;
++			}
++			i = 0;
++			cond_resched();
++			spin_lock(&vma_slot_list_lock);
++		}
++	}
++	/* Slots without an anon_vma go back on vma_slot_new for later. */
++	list_splice(&empty_vma_list, &vma_slot_new);
++
++	spin_unlock(&vma_slot_list_lock);
++	/* Enter the final, partial batch. */
++	if (index)
++		uksm_vma_enter(batch_slots, index);
++
++}
++
++static inline int rung_round_finished(struct scan_rung *rung)
++{
++	return UKSM_RUNG_ROUND_FINISHED & rung->flags;
++}
++
++static inline void judge_slot(struct vma_slot *slot)
++{
++	struct scan_rung *old_rung = slot->rung;
++	unsigned long ratio = cal_dedup_ratio(slot);
++	int moved;
++
++	if (vma_fully_scanned(slot) && uksm_thrash_threshold)
++		moved = vma_rung_enter(slot, &uksm_scan_ladder[0]);
++	else if (ratio && ratio >= uksm_abundant_threshold)
++		moved = vma_rung_up(slot);
++	else
++		moved = vma_rung_down(slot);
++
++	/* Open a fresh sampling window for the slot. */
++	slot->this_sampled = 0;
++	slot->pages_cowed = 0;
++	slot->pages_merged = 0;
++
++	if (vma_fully_scanned(slot))
++		slot->pages_scanned = 0;
++	slot->last_scanned = slot->pages_scanned;
++
++	/* A slot that changed rung already advanced its old rung's scan. */
++	if (moved)
++		return;
++	advance_current_scan(old_rung);
++}
++
++
++static inline int hash_round_finished(void)
++{
++	/* Done once scanned pages exceed a quarter of uksm_pages_total. */
++	if (scanned_virtual_pages <= (uksm_pages_total >> 2))
++		return 0;
++
++	scanned_virtual_pages = 0;
++	if (uksm_pages_scanned)
++		fully_scanned_round++;
++
++	return 1;
++}
++
++#define UKSM_MMSEM_BATCH	5
++#define BUSY_RETRY		100
++
++/**
++ * uksm_do_scan()  - the main worker function.
++ */
++static noinline void uksm_do_scan(void)
++{
++	struct vma_slot *slot, *iter;
++	struct mm_struct *busy_mm;
++	unsigned char round_finished, all_rungs_emtpy;
++	int i, err, mmsem_batch;
++	unsigned long pcost;
++	long long delta_exec;
++	unsigned long vpages, max_cpu_ratio;
++	unsigned long long start_time, end_time, scan_time;
++	unsigned int expected_jiffies;
++
++	might_sleep();
++
++	vpages = 0;
++
++	start_time = task_sched_runtime(current);
++	max_cpu_ratio = 0;
++	mmsem_batch = 0;
++
++	for (i = 0; i < SCAN_LADDER_SIZE;) {
++		struct scan_rung *rung = &uksm_scan_ladder[i];
++		unsigned long ratio;
++		int busy_retry;
++
++		if (!rung->pages_to_scan) {
++			i++;
++			continue;
++		}
++
++		if (!rung->vma_root.num) {
++			rung->pages_to_scan = 0;
++			i++;
++			continue;
++		}
++
++		ratio = rung_real_ratio(rung->cpu_ratio);
++		if (ratio > max_cpu_ratio)
++			max_cpu_ratio = ratio;
++
++		busy_retry = BUSY_RETRY;
++		/*
++		 * Do not consider rung_round_finished() here, just used up the
++		 * rung->pages_to_scan quota.
++		 */
++		while (rung->pages_to_scan && rung->vma_root.num &&
++		       likely(!freezing(current))) {
++			int reset = 0;
++
++			slot = rung->current_scan;
++
++			BUG_ON(vma_fully_scanned(slot));
++
++			if (mmsem_batch)
++				err = 0;
++			else
++				err = try_down_read_slot_mmap_sem(slot);
++
++			if (err == -ENOENT) {
++rm_slot:
++				rung_rm_slot(slot);
++				continue;
++			}
++
++			busy_mm = slot->mm;
++
++			if (err == -EBUSY) {
++				/* skip other vmas on the same mm */
++				do {
++					reset = advance_current_scan(rung);
++					iter = rung->current_scan;
++					busy_retry--;
++					if (iter->vma->vm_mm != busy_mm ||
++					    !busy_retry || reset)
++						break;
++				} while (1);
++
++				if (iter->vma->vm_mm != busy_mm) {
++					continue;
++				} else {
++					/* scan round finsished */
++					break;
++				}
++			}
++
++			BUG_ON(!vma_can_enter(slot->vma));
++			if (uksm_test_exit(slot->vma->vm_mm)) {
++				mmsem_batch = 0;
++				mmap_read_unlock(slot->vma->vm_mm);
++				goto rm_slot;
++			}
++
++			if (mmsem_batch)
++				mmsem_batch--;
++			else
++				mmsem_batch = UKSM_MMSEM_BATCH;
++
++			/* Ok, we have take the mmap_sem, ready to scan */
++			scan_vma_one_page(slot);
++			rung->pages_to_scan--;
++			vpages++;
++
++			if (rung->current_offset + rung->step > slot->pages - 1
++			    || vma_fully_scanned(slot)) {
++				mmap_read_unlock(slot->vma->vm_mm);
++				judge_slot(slot);
++				mmsem_batch = 0;
++			} else {
++				rung->current_offset += rung->step;
++				if (!mmsem_batch)
++					mmap_read_unlock(slot->vma->vm_mm);
++			}
++
++			busy_retry = BUSY_RETRY;
++			cond_resched();
++		}
++
++		if (mmsem_batch) {
++			mmap_read_unlock(slot->vma->vm_mm);
++			mmsem_batch = 0;
++		}
++
++		if (freezing(current))
++			break;
++
++		cond_resched();
++	}
++	end_time = task_sched_runtime(current);
++	delta_exec = end_time - start_time;
++
++	if (freezing(current))
++		return;
++
++	cleanup_vma_slots();
++	uksm_enter_all_slots();
++
++	round_finished = 1;
++	all_rungs_emtpy = 1;
++	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++		struct scan_rung *rung = &uksm_scan_ladder[i];
++
++		if (rung->vma_root.num) {
++			all_rungs_emtpy = 0;
++			if (!rung_round_finished(rung))
++				round_finished = 0;
++		}
++	}
++
++	if (all_rungs_emtpy)
++		round_finished = 0;
++
++	if (round_finished) {
++		round_update_ladder();
++		uksm_eval_round++;
++
++		if (hash_round_finished() && rshash_adjust()) {
++			/* Reset the unstable root iff hash strength changed */
++			uksm_hash_round++;
++			root_unstable_tree = RB_ROOT;
++			free_all_tree_nodes(&unstable_tree_node_list);
++		}
++
++		/*
++		 * A number of pages can hang around indefinitely on per-cpu
++		 * pagevecs, raised page count preventing write_protect_page
++		 * from merging them.  Though it doesn't really matter much,
++		 * it is puzzling to see some stuck in pages_volatile until
++		 * other activity jostles them out, and they also prevented
++		 * LTP's KSM test from succeeding deterministically; so drain
++		 * them here (here rather than on entry to uksm_do_scan(),
++		 * so we don't IPI too often when pages_to_scan is set low).
++		 */
++		lru_add_drain_all();
++	}
++
++
++	if (vpages && delta_exec > 0) {
++		pcost = (unsigned long) delta_exec / vpages;
++		if (likely(uksm_ema_page_time))
++			uksm_ema_page_time = ema(pcost, uksm_ema_page_time);
++		else
++			uksm_ema_page_time = pcost;
++	}
++
++	uksm_calc_scan_pages();
++	uksm_sleep_real = uksm_sleep_jiffies;
++	/* in case of radical cpu bursts, apply the upper bound */
++	end_time = task_sched_runtime(current);
++	if (max_cpu_ratio && end_time > start_time) {
++		scan_time = end_time - start_time;
++		expected_jiffies = msecs_to_jiffies(
++			scan_time_to_sleep(scan_time, max_cpu_ratio));
++
++		if (expected_jiffies > uksm_sleep_real)
++			uksm_sleep_real = expected_jiffies;
++
++		/* We have a 1 second upper bound for responsiveness. */
++		if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC)
++			uksm_sleep_real = msecs_to_jiffies(1000);
++	}
++
++	return;
++}
++/* Return nonzero when the UKSM_RUN_MERGE bit is set in uksm_run. */
++static int ksmd_should_run(void)
++{
++	return uksm_run & UKSM_RUN_MERGE;
++}
++/* Body of the "uksmd" kthread: scan while merging is enabled, otherwise wait to be woken. */
++static int uksm_scan_thread(void *nothing)
++{
++	set_freezable();
++	set_user_nice(current, 5);
++
++	while (!kthread_should_stop()) {
++		mutex_lock(&uksm_thread_mutex);	/* uksm_do_scan() runs under uksm_thread_mutex */
++		if (ksmd_should_run())
++			uksm_do_scan();
++		mutex_unlock(&uksm_thread_mutex);
++
++		try_to_freeze();
++
++		if (ksmd_should_run()) {	/* still enabled: sleep uksm_sleep_real jiffies, then rescan */
++			schedule_timeout_interruptible(uksm_sleep_real);
++			uksm_sleep_times++;
++		} else {	/* disabled: block on uksm_thread_wait until enabled or stopped */
++			wait_event_freezable(uksm_thread_wait,
++				ksmd_should_run() || kthread_should_stop());
++		}
++	}
++	return 0;
++}
++/* Reverse-map walk for KSM page @page: run rwc->rmap_one() on the VMAs reachable from its rmap_items. */
++void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
++{
++	struct stable_node *stable_node;
++	struct node_vma *node_vma;
++	struct rmap_item *rmap_item;
++	int search_new_forks = 0;
++	unsigned long address;
++
++	VM_BUG_ON_PAGE(!PageKsm(page), page);
++	VM_BUG_ON_PAGE(!PageLocked(page), page);
++
++	stable_node = page_stable_node(page);
++	if (!stable_node)
++		return;	/* no stable node attached: nothing to walk */
++again:
++	hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
++		hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
++			struct anon_vma *anon_vma = rmap_item->anon_vma;
++			struct anon_vma_chain *vmac;
++			struct vm_area_struct *vma;
++
++			cond_resched();
++			anon_vma_lock_read(anon_vma);
++			anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
++						       0, ULONG_MAX) {
++				cond_resched();
++				vma = vmac->vma;
++				address = get_rmap_addr(rmap_item);
++
++				if (address < vma->vm_start ||
++				    address >= vma->vm_end)
++					continue;	/* address is outside this vma */
++
++				if ((rmap_item->slot->vma == vma) ==
++				    search_new_forks)
++					continue;	/* pass 0: only the slot's own vma; pass 1: only the others */
++
++				if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
++					continue;
++
++				if (!rwc->rmap_one(page, vma, address, rwc->arg)) {	/* false return ends the walk */
++					anon_vma_unlock_read(anon_vma);
++					return;
++				}
++
++				if (rwc->done && rwc->done(page)) {
++					anon_vma_unlock_read(anon_vma);
++					return;
++				}
++			}
++			anon_vma_unlock_read(anon_vma);
++		}
++	}
++	if (!search_new_forks++)	/* repeat the whole walk once with search_new_forks == 1 */
++		goto again;
++}
++
++#ifdef CONFIG_MIGRATION
++/* Common ksm interface (may be specific to uksm): retarget the stable_node from oldpage to newpage. */
++void ksm_migrate_page(struct page *newpage, struct page *oldpage)
++{
++	struct stable_node *stable_node;
++
++	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
++	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
++	VM_BUG_ON(newpage->mapping != oldpage->mapping);
++
++	stable_node = page_stable_node(newpage);
++	if (stable_node) {
++		VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
++		stable_node->kpfn = page_to_pfn(newpage);
++		/*
++		 * newpage->mapping was set in advance; now we need smp_wmb()
++		 * to make sure that the new stable_node->kpfn is visible
++		 * to get_ksm_page() before it can see that oldpage->mapping
++		 * has gone stale (or that PageSwapCache has been cleared).
++		 */
++		smp_wmb();
++		set_page_stable_node(oldpage, NULL);	/* oldpage no longer refers to the stable node */
++	}
++}
++#endif /* CONFIG_MIGRATION */
++
++#ifdef CONFIG_MEMORY_HOTREMOVE
++static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn,
++						 unsigned long end_pfn)
++{	/* Find a stable node whose kpfn lies in [start_pfn, end_pfn); NULL if there is none. */
++	struct rb_node *node;
++
++	for (node = rb_first(root_stable_treep); node; node = rb_next(node)) {
++		struct stable_node *stable_node;
++
++		stable_node = rb_entry(node, struct stable_node, node);
++		if (stable_node->kpfn >= start_pfn &&
++		    stable_node->kpfn < end_pfn)
++			return stable_node;	/* first match in rb_first()/rb_next() order */
++	}
++	return NULL;
++}
++/* Memory-hotplug notifier: hold uksm_thread_mutex across an offline attempt and prune stale stable nodes. */
++static int uksm_memory_callback(struct notifier_block *self,
++			       unsigned long action, void *arg)
++{
++	struct memory_notify *mn = arg;
++	struct stable_node *stable_node;
++
++	switch (action) {
++	case MEM_GOING_OFFLINE:
++		/*
++		 * Keep it very simple for now: just lock out ksmd and
++		 * MADV_UNMERGEABLE while any memory is going offline.
++		 * mutex_lock_nested() is necessary because lockdep was alarmed
++		 * that here we take uksm_thread_mutex inside notifier chain
++		 * mutex, and later take notifier chain mutex inside
++		 * uksm_thread_mutex to unlock it.   But that's safe because both
++		 * are inside mem_hotplug_mutex.
++		 */
++		mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING);
++		break;
++
++	case MEM_OFFLINE:
++		/*
++		 * Most of the work is done by page migration; but there might
++		 * be a few stable_nodes left over, still pointing to struct
++		 * pages which have been offlined: prune those from the tree.
++		 */
++		while ((stable_node = uksm_check_stable_tree(mn->start_pfn,
++					mn->start_pfn + mn->nr_pages)) != NULL)
++			remove_node_from_stable_tree(stable_node, 1, 1);
++		fallthrough;	/* then drop the mutex taken at MEM_GOING_OFFLINE */
++
++	case MEM_CANCEL_OFFLINE:
++		mutex_unlock(&uksm_thread_mutex);
++		break;
++	}
++	return NOTIFY_OK;
++}
++#endif /* CONFIG_MEMORY_HOTREMOVE */
++
++#ifdef CONFIG_SYSFS
++/*
++ * This all compiles without CONFIG_SYSFS, but is a waste of space.
++ */
++/* Define <name>_attr: UKSM_ATTR_RO via __ATTR_RO; UKSM_ATTR is mode 0644 with <name>_show/<name>_store. */
++#define UKSM_ATTR_RO(_name) \
++	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
++#define UKSM_ATTR(_name) \
++	static struct kobj_attribute _name##_attr = \
++		__ATTR(_name, 0644, _name##_show, _name##_store)
++/* sysfs show: print uksm_max_cpu_percentage as an unsigned decimal followed by a newline. */
++static ssize_t max_cpu_percentage_show(struct kobject *kobj,
++				    struct kobj_attribute *attr, char *buf)
++{
++	return sysfs_emit(buf, "%u\n", uksm_max_cpu_percentage);
++}
++/* sysfs store: accept a decimal 0..100 and save it clamped to 10..99 in uksm_max_cpu_percentage. */
++static ssize_t max_cpu_percentage_store(struct kobject *kobj,
++				     struct kobj_attribute *attr,
++				     const char *buf, size_t count)
++{
++	unsigned long max_cpu_percentage;
++	int err;
++
++	err = kstrtoul(buf, 10, &max_cpu_percentage);
++	if (err || max_cpu_percentage > 100)
++		return -EINVAL;
++
++	if (max_cpu_percentage == 100)
++		max_cpu_percentage = 99;	/* 100 is accepted but stored as 99 */
++	else if (max_cpu_percentage < 10)
++		max_cpu_percentage = 10;	/* values below 10 are raised to 10 */
++
++	uksm_max_cpu_percentage = max_cpu_percentage;
++
++	return count;
++}
++UKSM_ATTR(max_cpu_percentage);
++/* sysfs show: print uksm_sleep_jiffies converted to milliseconds, followed by a newline. */
++static ssize_t sleep_millisecs_show(struct kobject *kobj,
++				    struct kobj_attribute *attr, char *buf)
++{
++	return sysfs_emit(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies));
++}
++/* sysfs store: accept 0..MSEC_PER_SEC milliseconds; saved as jiffies in uksm_sleep_jiffies and _saved. */
++static ssize_t sleep_millisecs_store(struct kobject *kobj,
++				     struct kobj_attribute *attr,
++				     const char *buf, size_t count)
++{
++	unsigned long msecs;
++	int err;
++
++	err = kstrtoul(buf, 10, &msecs);
++	if (err || msecs > MSEC_PER_SEC)
++		return -EINVAL;
++
++	uksm_sleep_jiffies = msecs_to_jiffies(msecs);
++	uksm_sleep_saved = uksm_sleep_jiffies;
++
++	return count;
++}
++UKSM_ATTR(sleep_millisecs);
++
++/*
++ * sysfs show: print all governor names space-separated, bracketing the active
++ * one; sysfs_emit_at() bounds the output to the PAGE_SIZE sysfs buffer.
++ */
++static ssize_t cpu_governor_show(struct kobject *kobj,
++				  struct kobj_attribute *attr, char *buf)
++{
++	int n = ARRAY_SIZE(uksm_cpu_governor_str);
++	int len = 0;
++	int i;
++
++	for (i = 0; i < n; i++) {
++		if (uksm_cpu_governor == i)
++			len += sysfs_emit_at(buf, len, "[%s] ",
++					     uksm_cpu_governor_str[i]);
++		else
++			len += sysfs_emit_at(buf, len, "%s ",
++					     uksm_cpu_governor_str[i]);
++	}
++	len += sysfs_emit_at(buf, len, "\n");
++
++	return len;
++}
++/* Copy the preset selected by uksm_cpu_governor into every rung and into uksm_max_cpu_percentage. */
++static inline void init_performance_values(void)
++{
++	int i;
++	struct scan_rung *rung;
++	struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor;
++
++
++	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++		rung = uksm_scan_ladder + i;
++		rung->cpu_ratio = preset->cpu_ratio[i];
++		rung->cover_msecs = preset->cover_msecs[i];
++	}
++
++	uksm_max_cpu_percentage = preset->max_cpu;
++}
++/* sysfs store: select the governor whose name is a prefix of the input, then reload the preset values. */
++static ssize_t cpu_governor_store(struct kobject *kobj,
++				   struct kobj_attribute *attr,
++				   const char *buf, size_t count)
++{
++	int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
++
++	for (n--; n >= 0 ; n--) {	/* search from the last name down to index 0 */
++		if (!strncmp(buf, uksm_cpu_governor_str[n],
++			     strlen(uksm_cpu_governor_str[n])))
++			break;	/* prefix match: text after the name is ignored */
++	}
++
++	if (n < 0)
++		return -EINVAL;	/* no governor name matched */
++	else
++		uksm_cpu_governor = n;
++
++	init_performance_values();
++
++	return count;
++}
++UKSM_ATTR(cpu_governor);
++/* sysfs show: print the uksm_run flags as an unsigned decimal followed by a newline. */
++static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
++			char *buf)
++{
++	return sysfs_emit(buf, "%u\n", uksm_run);
++}
++/* sysfs store: set uksm_run (values above UKSM_RUN_MERGE are rejected) and wake uksmd when enabling. */
++static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
++			 const char *buf, size_t count)
++{
++	int err;
++	unsigned long flags;
++
++	err = kstrtoul(buf, 10, &flags);
++	if (err || flags > UINT_MAX)
++		return -EINVAL;
++	if (flags > UKSM_RUN_MERGE)
++		return -EINVAL;
++
++	mutex_lock(&uksm_thread_mutex);	/* same mutex the scan thread holds around uksm_do_scan() */
++	if (uksm_run != flags)
++		uksm_run = flags;
++	mutex_unlock(&uksm_thread_mutex);
++
++	if (flags & UKSM_RUN_MERGE)
++		wake_up_interruptible(&uksm_thread_wait);	/* scan thread may be waiting on this queue */
++
++	return count;
++}
++UKSM_ATTR(run);
++/* sysfs show: print uksm_abundant_threshold as an unsigned decimal followed by a newline. */
++static ssize_t abundant_threshold_show(struct kobject *kobj,
++				     struct kobj_attribute *attr, char *buf)
++{
++	return sysfs_emit(buf, "%u\n", uksm_abundant_threshold);
++}
++/* sysfs store: accept a decimal 0..99 and save it in uksm_abundant_threshold. */
++static ssize_t abundant_threshold_store(struct kobject *kobj,
++				      struct kobj_attribute *attr,
++				      const char *buf, size_t count)
++{
++	int err;
++	unsigned long flags;
++
++	err = kstrtoul(buf, 10, &flags);
++	if (err || flags > 99)
++		return -EINVAL;
++
++	uksm_abundant_threshold = flags;
++
++	return count;
++}
++UKSM_ATTR(abundant_threshold);
++/* sysfs show: print uksm_thrash_threshold as an unsigned decimal followed by a newline. */
++static ssize_t thrash_threshold_show(struct kobject *kobj,
++				     struct kobj_attribute *attr, char *buf)
++{
++	return sysfs_emit(buf, "%u\n", uksm_thrash_threshold);
++}
++/* sysfs store: accept a decimal 0..99 and save it in uksm_thrash_threshold. */
++static ssize_t thrash_threshold_store(struct kobject *kobj,
++				      struct kobj_attribute *attr,
++				      const char *buf, size_t count)
++{
++	int err;
++	unsigned long flags;
++
++	err = kstrtoul(buf, 10, &flags);
++	if (err || flags > 99)
++		return -EINVAL;
++
++	uksm_thrash_threshold = flags;
++
++	return count;
++}
++UKSM_ATTR(thrash_threshold);
++/*
++ * sysfs show: print each rung's cpu_ratio; non-positive ratios are printed in
++ * the "MAX/<divisor>" form. The text is followed by a newline.
++ * Output is built with sysfs_emit_at() so it stays within the sysfs page.
++ */
++static ssize_t cpu_ratios_show(struct kobject *kobj,
++			       struct kobj_attribute *attr, char *buf)
++{
++	struct scan_rung *rung;
++	int i, len = 0;
++
++	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++		rung = &uksm_scan_ladder[i];
++
++		if (rung->cpu_ratio > 0)
++			len += sysfs_emit_at(buf, len, "%d ", rung->cpu_ratio);
++		else
++			len += sysfs_emit_at(buf, len, "MAX/%d ",
++					TIME_RATIO_SCALE / -rung->cpu_ratio);
++	}
++
++	len += sysfs_emit_at(buf, len, "\n");
++
++	return len;
++}
++/* sysfs store: parse SCAN_LADDER_SIZE space-separated "N" or "MAX/N" ratios; applied only if all are valid. */
++static ssize_t cpu_ratios_store(struct kobject *kobj,
++				      struct kobj_attribute *attr,
++				      const char *buf, size_t count)
++{
++	int i, cpuratios[SCAN_LADDER_SIZE], err;
++	unsigned long value;
++	char *copy, *p, *end = NULL, *max_form;
++	ssize_t ret = -EINVAL;
++
++	/* One spare zeroed byte keeps the copy NUL-terminated for strchr()/kstrtoul(). */
++	copy = kzalloc(count + 1, GFP_KERNEL);
++	if (!copy)
++		return -ENOMEM;
++
++	memcpy(copy, buf, count);
++	p = copy;	/* parse through 'p'; 'copy' keeps the address kfree() needs */
++
++	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++		if (i != SCAN_LADDER_SIZE - 1) {
++			end = strchr(p, ' ');
++			if (!end)
++				goto out;
++
++			*end = '\0';
++		}
++
++		max_form = strstr(p, "MAX/");
++		if (max_form)
++			p = strchr(p, '/') + 1;
++
++		err = kstrtoul(p, 10, &value);
++		if (err || value > TIME_RATIO_SCALE || !value)
++			goto out;
++
++		if (max_form)
++			cpuratios[i] = -(int) (TIME_RATIO_SCALE / value);
++		else
++			cpuratios[i] = value;
++		if (end)
++			p = end + 1;
++	}
++
++	for (i = 0; i < SCAN_LADDER_SIZE; i++)
++		uksm_scan_ladder[i].cpu_ratio = cpuratios[i];
++
++	ret = count;
++out:
++	kfree(copy);
++	return ret;
++}
++UKSM_ATTR(cpu_ratios);
++/*
++ * sysfs show: print each rung's cover_msecs, space-separated, followed by a
++ * newline; sysfs_emit_at() keeps the output within the sysfs page.
++ */
++static ssize_t eval_intervals_show(struct kobject *kobj,
++			       struct kobj_attribute *attr, char *buf)
++{
++	struct scan_rung *rung;
++	int i, len = 0;
++
++	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++		rung = &uksm_scan_ladder[i];
++		len += sysfs_emit_at(buf, len, "%u ", rung->cover_msecs);
++	}
++
++	len += sysfs_emit_at(buf, len, "\n");
++
++	return len;
++}
++/*
++ * sysfs store: parse SCAN_LADDER_SIZE space-separated unsigned values into the
++ * rungs' cover_msecs; nothing is changed unless every value parses.
++ */
++static ssize_t eval_intervals_store(struct kobject *kobj,
++				      struct kobj_attribute *attr,
++				      const char *buf, size_t count)
++{
++	int i, err;
++	unsigned long values[SCAN_LADDER_SIZE];
++	char *copy, *p, *end = NULL;
++	ssize_t ret = count;
++
++	copy = kzalloc(count + 2, GFP_KERNEL);
++	if (!copy)
++		return -ENOMEM;
++
++	memcpy(copy, buf, count);
++	p = copy;	/* parse through 'p'; 'copy' keeps the address kfree() needs */
++
++	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++		if (i != SCAN_LADDER_SIZE - 1) {
++			end = strchr(p, ' ');
++			if (!end) {
++				ret = -EINVAL;
++				goto out;
++			}
++
++			*end = '\0';
++		}
++
++		err = kstrtoul(p, 10, &values[i]);
++		if (err) {
++			ret = -EINVAL;
++			goto out;
++		}
++
++		if (end)
++			p = end + 1;
++	}
++
++	for (i = 0; i < SCAN_LADDER_SIZE; i++)
++		uksm_scan_ladder[i].cover_msecs = values[i];
++out:
++	kfree(copy);
++	return ret;
++}
++UKSM_ATTR(eval_intervals);
++/* sysfs show: print uksm_ema_page_time as an unsigned long decimal followed by a newline. */
++static ssize_t ema_per_page_time_show(struct kobject *kobj,
++				 struct kobj_attribute *attr, char *buf)
++{
++	return sysfs_emit(buf, "%lu\n", uksm_ema_page_time);
++}
++UKSM_ATTR_RO(ema_per_page_time);
++/* sysfs show: print uksm_pages_shared as an unsigned long decimal followed by a newline. */
++static ssize_t pages_shared_show(struct kobject *kobj,
++				 struct kobj_attribute *attr, char *buf)
++{
++	return sysfs_emit(buf, "%lu\n", uksm_pages_shared);
++}
++UKSM_ATTR_RO(pages_shared);
++/* sysfs show: print uksm_pages_sharing as an unsigned long decimal followed by a newline. */
++static ssize_t pages_sharing_show(struct kobject *kobj,
++				  struct kobj_attribute *attr, char *buf)
++{
++	return sysfs_emit(buf, "%lu\n", uksm_pages_sharing);
++}
++UKSM_ATTR_RO(pages_sharing);
++/* sysfs show: print uksm_pages_unshared as an unsigned long decimal followed by a newline. */
++static ssize_t pages_unshared_show(struct kobject *kobj,
++				   struct kobj_attribute *attr, char *buf)
++{
++	return sysfs_emit(buf, "%lu\n", uksm_pages_unshared);
++}
++UKSM_ATTR_RO(pages_unshared);
++/* sysfs show: print fully_scanned_round as an unsigned 64-bit decimal followed by a newline. */
++static ssize_t full_scans_show(struct kobject *kobj,
++			       struct kobj_attribute *attr, char *buf)
++{
++	return sysfs_emit(buf, "%llu\n", fully_scanned_round);
++}
++UKSM_ATTR_RO(full_scans);
++/* sysfs show: print the scanned-page total, as "V * 2^B" once it has been scaled down by 2^B. */
++static ssize_t pages_scanned_show(struct kobject *kobj,
++				  struct kobj_attribute *attr, char *buf)
++{
++	unsigned long base = 0;
++	u64 delta, ret;
++
++	if (pages_scanned_stored) {
++		base = pages_scanned_base;
++		ret = pages_scanned_stored;
++		delta = uksm_pages_scanned >> base;
++		if (CAN_OVERFLOW_U64(ret, delta)) {
++			ret >>= 1;
++			delta >>= 1;
++			base++;
++		}
++		ret += delta;	/* add the live counter whether or not a rescale was needed */
++	} else {
++		ret = uksm_pages_scanned;
++	}
++
++	while (ret > ULONG_MAX) {
++		ret >>= 1;
++		base++;
++	}
++
++	if (base)
++		return sysfs_emit(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
++	else
++		return sysfs_emit(buf, "%lu\n", (unsigned long)ret);
++}
++UKSM_ATTR_RO(pages_scanned);
++/* sysfs show: print hash_strength as an unsigned long decimal followed by a newline. */
++static ssize_t hash_strength_show(struct kobject *kobj,
++				  struct kobj_attribute *attr, char *buf)
++{
++	return sysfs_emit(buf, "%lu\n", hash_strength);
++}
++UKSM_ATTR_RO(hash_strength);
++/* sysfs show: print uksm_sleep_times as an unsigned 64-bit decimal followed by a newline. */
++static ssize_t sleep_times_show(struct kobject *kobj,
++				  struct kobj_attribute *attr, char *buf)
++{
++	return sysfs_emit(buf, "%llu\n", uksm_sleep_times);
++}
++UKSM_ATTR_RO(sleep_times);
++
++/* NULL-terminated list of the attributes exposed through uksm_attr_group. */
++static struct attribute *uksm_attrs[] = {
++	&max_cpu_percentage_attr.attr,
++	&sleep_millisecs_attr.attr,
++	&cpu_governor_attr.attr,
++	&run_attr.attr,
++	&ema_per_page_time_attr.attr,
++	&pages_shared_attr.attr,
++	&pages_sharing_attr.attr,
++	&pages_unshared_attr.attr,
++	&full_scans_attr.attr,
++	&pages_scanned_attr.attr,
++	&hash_strength_attr.attr,
++	&sleep_times_attr.attr,
++	&thrash_threshold_attr.attr,
++	&abundant_threshold_attr.attr,
++	&cpu_ratios_attr.attr,
++	&eval_intervals_attr.attr,
++	NULL,
++};
++/* sysfs attribute group named "uksm"; uksm_init() registers it on mm_kobj. */
++static struct attribute_group uksm_attr_group = {
++	.attrs = uksm_attrs,
++	.name = "uksm",
++};
++#endif /* CONFIG_SYSFS */
++/* Initialise each rung's vma_root, load the governor preset, then recompute the scan page counts. */
++static inline void init_scan_ladder(void)
++{
++	int i;
++	struct scan_rung *rung;
++
++	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
++		rung = uksm_scan_ladder + i;
++		slot_tree_init_root(&rung->vma_root);
++	}
++
++	init_performance_values();
++	uksm_calc_scan_pages();
++}
++/* Init-time benchmark: time page_hash() against pages_identical_with_cost() and set memcmp_cost. */
++static inline int cal_positive_negative_costs(void)
++{
++	struct page *p1, *p2;
++	unsigned char *addr1, *addr2;
++	unsigned long i, time_start, hash_cost, loopnum = 0;
++
++	/* IMPORTANT: volatile is needed to prevent over-optimization by gcc. */
++	volatile u32 hash;
++	volatile int ret;
++
++	p1 = alloc_page(GFP_KERNEL);
++	if (!p1)
++		return -ENOMEM;
++
++	p2 = alloc_page(GFP_KERNEL);
++	if (!p2) {
++		__free_page(p1);	/* do not leak the first page */
++		return -ENOMEM;
++	}
++
++	addr1 = kmap_atomic(p1);
++	addr2 = kmap_atomic(p2);
++	memset(addr1, prandom_u32(), PAGE_SIZE);
++	memcpy(addr2, addr1, PAGE_SIZE);
++
++	/* make sure that the two pages differ in last byte */
++	addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
++	kunmap_atomic(addr2);
++	kunmap_atomic(addr1);
++
++	time_start = jiffies;
++	while (jiffies - time_start < 100) {
++		for (i = 0; i < 100; i++)
++			hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
++		loopnum += 100;
++	}
++	hash_cost = (jiffies - time_start);
++
++	time_start = jiffies;
++	for (i = 0; i < loopnum; i++)
++		ret = pages_identical_with_cost(p1, p2);
++	memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
++	memcmp_cost /= hash_cost;
++	pr_info("UKSM: relative memcmp_cost = %lu hash=%u cmp_ret=%d.\n",
++		memcmp_cost, hash, ret);
++
++	__free_page(p1);
++	__free_page(p2);
++	return 0;
++}
++/* Fill zero_hash_table[i] with page_hash(zeroed page, i, 0) for every i below HASH_STRENGTH_MAX. */
++static int init_zeropage_hash_table(void)
++{
++	struct page *page;
++	char *addr;
++	int i;
++
++	page = alloc_page(GFP_KERNEL);
++	if (!page)
++		return -ENOMEM;
++
++	addr = kmap_atomic(page);
++	memset(addr, 0, PAGE_SIZE);
++	kunmap_atomic(addr);
++
++	zero_hash_table = kmalloc_array(HASH_STRENGTH_MAX, sizeof(u32), GFP_KERNEL);
++	if (!zero_hash_table) {
++		__free_page(page);	/* the scratch page must be released on this path too */
++		return -ENOMEM;
++	}
++
++	for (i = 0; i < HASH_STRENGTH_MAX; i++)
++		zero_hash_table[i] = page_hash(page, i, 0);
++
++	__free_page(page);
++	return 0;
++}
++/* Fill random_nums with a shuffled 0..HASH_STRENGTH_FULL-1, reset rshash_state, then run the benchmark. */
++static inline int init_random_sampling(void)
++{
++	unsigned long i;
++
++	random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
++	if (!random_nums)
++		return -ENOMEM;
++
++	for (i = 0; i < HASH_STRENGTH_FULL; i++)
++		random_nums[i] = i;
++
++	for (i = 0; i < HASH_STRENGTH_FULL; i++) {
++		unsigned long rand_range, swap_index, tmp;
++
++		rand_range = HASH_STRENGTH_FULL - i;
++		swap_index = i + prandom_u32() % rand_range;	/* random index in [i, HASH_STRENGTH_FULL) */
++		tmp = random_nums[i];
++		random_nums[i] =  random_nums[swap_index];
++		random_nums[swap_index] = tmp;
++	}
++
++	rshash_state.state = RSHASH_NEW;
++	rshash_state.below_count = 0;
++	rshash_state.lookup_window_index = 0;
++
++	return cal_positive_negative_costs();	/* random_nums is not freed here if this fails */
++}
++/* Create the five UKSM slab caches; on failure destroy the ones already created and return -ENOMEM. */
++static int __init uksm_slab_init(void)
++{
++	rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
++	if (!rmap_item_cache)
++		goto out;
++
++	stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
++	if (!stable_node_cache)
++		goto out_free1;
++
++	node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0);
++	if (!node_vma_cache)
++		goto out_free2;
++
++	vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0);
++	if (!vma_slot_cache)
++		goto out_free3;
++
++	tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0);
++	if (!tree_node_cache)
++		goto out_free4;
++
++	return 0;
++
++out_free4:	/* labels unwind in reverse order of creation */
++	kmem_cache_destroy(vma_slot_cache);
++out_free3:
++	kmem_cache_destroy(node_vma_cache);
++out_free2:
++	kmem_cache_destroy(stable_node_cache);
++out_free1:
++	kmem_cache_destroy(rmap_item_cache);
++out:
++	return -ENOMEM;
++}
++/* Destroy all five slab caches created by uksm_slab_init(). */
++static void __init uksm_slab_free(void)
++{
++	kmem_cache_destroy(stable_node_cache);
++	kmem_cache_destroy(rmap_item_cache);
++	kmem_cache_destroy(node_vma_cache);
++	kmem_cache_destroy(vma_slot_cache);
++	kmem_cache_destroy(tree_node_cache);
++}
++
++/* Common interface to ksm, different to it: MADV_MERGEABLE is a no-op, MADV_UNMERGEABLE drops the vma. */
++int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
++		unsigned long end, int advice, unsigned long *vm_flags)
++{
++	int err;
++
++	switch (advice) {
++	case MADV_MERGEABLE:
++		return 0;		/* just ignore the advice */
++
++	case MADV_UNMERGEABLE:
++		if (!(*vm_flags & VM_MERGEABLE) || !uksm_flags_can_scan(*vm_flags))
++			return 0;		/* just ignore the advice */
++
++		if (vma->anon_vma) {
++			err = unmerge_uksm_pages(vma, start, end);
++			if (err)
++				return err;	/* vma and *vm_flags are left untouched on failure */
++		}
++
++		uksm_remove_vma(vma);
++		*vm_flags &= ~VM_MERGEABLE;
++		break;
++	}
++
++	return 0;
++}
++
++/* Common interface to ksm, actually the same: return @page itself or a new locked copy (NULL if no memory). */
++struct page *ksm_might_need_to_copy(struct page *page,
++			struct vm_area_struct *vma, unsigned long address)
++{
++	struct anon_vma *anon_vma = page_anon_vma(page);
++	struct page *new_page;
++
++	if (PageKsm(page)) {
++		if (page_stable_node(page))
++			return page;	/* no need to copy it */
++	} else if (!anon_vma) {
++		return page;		/* no need to copy it */
++	} else if (anon_vma->root == vma->anon_vma->root &&
++		 page->index == linear_page_index(vma, address)) {
++		return page;		/* still no need to copy it */
++	}
++	if (!PageUptodate(page))
++		return page;		/* let do_swap_page report the error */
++
++	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
++	if (new_page) {
++		copy_user_highpage(new_page, page, address, vma);
++
++		SetPageDirty(new_page);
++		__SetPageUptodate(new_page);
++		__SetPageLocked(new_page);	/* the copy is handed back locked */
++	}
++
++	return new_page;
++}
++
++/* Copied from mm/ksm.c and required from 5.1: on success @page is re-homed to @vma at @address. */
++bool reuse_ksm_page(struct page *page,
++		    struct vm_area_struct *vma,
++		    unsigned long address)
++{
++#ifdef CONFIG_DEBUG_VM
++	if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
++			WARN_ON(!page_mapped(page)) ||
++			WARN_ON(!PageLocked(page))) {
++		dump_page(page, "reuse_ksm_page");
++		return false;
++	}
++#endif
++
++	if (PageSwapCache(page) || !page_stable_node(page))
++		return false;
++	/* Prohibit parallel get_ksm_page() */
++	if (!page_ref_freeze(page, 1))
++		return false;	/* freeze failed, so the refcount was not exactly 1 */
++
++	page_move_anon_rmap(page, vma);
++	page->index = linear_page_index(vma, address);
++	page_ref_unfreeze(page, 1);
++
++	return true;
++}
++/* late_initcall: set defaults, allocate UKSM tables/caches, start uksmd, register sysfs and hotplug hooks. */
++static int __init uksm_init(void)
++{
++	struct task_struct *uksm_thread;
++	int err;
++
++	uksm_sleep_jiffies = msecs_to_jiffies(100);
++	uksm_sleep_saved = uksm_sleep_jiffies;
++
++	slot_tree_init();
++	init_scan_ladder();
++
++	/* random_nums may already be allocated when this fails, so unwind via out_free1 */
++	err = init_random_sampling();
++	if (err)
++		goto out_free1;
++
++	err = uksm_slab_init();
++	if (err)
++		goto out_free1;
++
++	err = init_zeropage_hash_table();
++	if (err)
++		goto out_free0;
++
++	uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd");
++	if (IS_ERR(uksm_thread)) {
++		pr_err("uksm: creating kthread failed\n");
++		err = PTR_ERR(uksm_thread);
++		goto out_free;
++	}
++
++#ifdef CONFIG_SYSFS
++	err = sysfs_create_group(mm_kobj, &uksm_attr_group);
++	if (err) {
++		pr_err("uksm: register sysfs failed\n");
++		kthread_stop(uksm_thread);
++		goto out_free;
++	}
++#else
++	uksm_run = UKSM_RUN_MERGE;	/* no way for user to start it */
++
++#endif /* CONFIG_SYSFS */
++
++#ifdef CONFIG_MEMORY_HOTREMOVE
++	/*
++	 * Choose a high priority since the callback takes uksm_thread_mutex:
++	 * later callbacks could only be taking locks which nest within that.
++	 */
++	hotplug_memory_notifier(uksm_memory_callback, 100);
++#endif
++	return 0;
++
++out_free:
++	kfree(zero_hash_table);
++out_free0:
++	uksm_slab_free();
++out_free1:
++	/* NOTE(review): uksm_scan_ladder is not allocated in the visible init code
++	 * (presumably a static array), so it is deliberately not kfree()d here. */
++	kfree(random_nums);	/* kfree(NULL) is a no-op if the table was never allocated */
++	return err;
++}
++
++#ifdef MODULE	/* both branches must register uksm_init(), the init function defined above */
++subsys_initcall(uksm_init);
++#else
++late_initcall(uksm_init);
++#endif
++
+diff --git a/mm/vmstat.c b/mm/vmstat.c
+index 698bc0bc18d1..b7590f4944ca 100644
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -1216,6 +1216,9 @@ const char * const vmstat_text[] = {
+ 	"nr_shadow_call_stack",
+ #endif
+ 
++#ifdef CONFIG_UKSM
++	"nr_uksm_zero_pages",
++#endif
+ 	/* enum writeback_stat_item counters */
+ 	"nr_dirty_threshold",
+ 	"nr_dirty_background_threshold",
diff --git a/sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-5.10.47-r1.ebuild b/sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-5.10.47-r1.ebuild
deleted file mode 100644
index ed371f40..00000000
--- a/sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-5.10.47-r1.ebuild
+++ /dev/null
@@ -1,162 +0,0 @@
-# Copyright 1999-2017 Gentoo Foundation
-# Distributed under the terms of the GNU General Public License v2
-
-EAPI=6
-
-inherit eutils
-
-EXTRAVERSION="redcore-lts-r1"
-KV_FULL="${PV}-${EXTRAVERSION}"
-KV_MAJOR="5.10"
-
-DESCRIPTION="Redcore Linux Kernel Image (LTS)"
-HOMEPAGE="https://redcorelinux.org"
-SRC_URI="https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-${PV}.tar.xz"
-
-KEYWORDS="~amd64"
-LICENSE="GPL-2"
-SLOT="${PVR}"
-IUSE="+cryptsetup +dmraid +dracut +dkms +mdadm"
-
-RESTRICT="binchecks strip mirror"
-DEPEND="
-	app-arch/lz4
-	app-arch/xz-utils
-	sys-devel/autoconf
-	sys-devel/bc
-	sys-devel/make
-	cryptsetup? ( sys-fs/cryptsetup )
-	dmraid? ( sys-fs/dmraid )
-	dracut? ( >=sys-kernel/dracut-0.44-r8 )
-	dkms? ( sys-kernel/dkms sys-kernel/linux-sources-redcore-lts:${SLOT} )
-	mdadm? ( sys-fs/mdadm )
-	>=sys-kernel/linux-firmware-20180314"
-RDEPEND="${DEPEND}"
-
-PATCHES=(
-	"${FILESDIR}"/"${KV_MAJOR}"-ath10k-be-quiet.patch
-	"${FILESDIR}"/"${KV_MAJOR}"-ata-fix-NCQ-LOG-strings-and-move-to-debug.patch
-	"${FILESDIR}"/"${KV_MAJOR}"-radeon_dp_aux_transfer_native-no-ratelimited_debug.patch
-	"${FILESDIR}"/"${KV_MAJOR}"-acpi-use-kern_warning_even_when_error.patch
-	"${FILESDIR}"/"${KV_MAJOR}"-Unknow-SSD-HFM128GDHTNG-8310B-QUIRK_NO_APST.patch
-	"${FILESDIR}"/"${KV_MAJOR}"-nvme-Patriot_Viper_VPN100-QUIRK_IGNORE_DEV_SUBNQN.patch
-	"${FILESDIR}"/"${KV_MAJOR}"-do_not_bug_the_next_18-years.patch
-	"${FILESDIR}"/"${KV_MAJOR}"-iwlwifi-use-debug-for-debug-infos.patch
-	"${FILESDIR}"/"${KV_MAJOR}"-compress-modules-zstd-support.patch
-	"${FILESDIR}"/"${KV_MAJOR}"-fix-bootconfig-makefile.patch
-	"${FILESDIR}"/"${KV_MAJOR}"-apic_vector-spam-in-debug-mode-only.patch
-	"${FILESDIR}"/"${KV_MAJOR}"-iwlwifi-fix-5e003982b07ae.patch
-	"${FILESDIR}"/"${KV_MAJOR}"-enable-new-amd-energy-driver-for-all-ryzen.patch
-	"${FILESDIR}"/"${KV_MAJOR}"-0001-Revert-hwmon-k10temp-Remove-support-for-displaying-v.patch
-	"${FILESDIR}"/"${KV_MAJOR}"-k10temp-fix-ZEN2-desktop-add-ZEN3-desktop.patch
-	"${FILESDIR}"/"${KV_MAJOR}"-add-amd-sfh-hid_driver.patch
-	"${FILESDIR}"/"${KV_MAJOR}"-add-sbtsi_driver.patch
-	"${FILESDIR}"/"${KV_MAJOR}"-0001-Revert-cpufreq-Avoid-configuring-old-governors-as-de.patch
-	"${FILESDIR}"/"${KV_MAJOR}"-revert-parts-of-a00ec3874e7d326ab2dffbed92faddf6a77a84e9-no-Intel-NO.patch
-	"${FILESDIR}"/"${KV_MAJOR}"-linux-hardened.patch
-	"${FILESDIR}"/"${KV_MAJOR}"-uksm-linux-hardened.patch
-)
-
-S="${WORKDIR}"/linux-"${PV}"
-
-pkg_setup() {
-	export KBUILD_BUILD_USER="nexus"
-	export KBUILD_BUILD_HOST="nexus.redcorelinux.org"
-
-	export REAL_ARCH="$ARCH"
-	unset ARCH ; unset LDFLAGS #will interfere with Makefile if set
-}
-
-src_prepare() {
-	default
-	emake mrproper
-	sed -ri "s|^(EXTRAVERSION =).*|\1 -${EXTRAVERSION}|" Makefile
-	cp "${FILESDIR}"/"${KV_MAJOR}"-amd64.config .config
-	rm -rf $(find . -type f|grep -F \.orig)
-}
-
-src_compile() {
-	emake prepare modules_prepare bzImage modules
-}
-
-src_install() {
-	dodir boot
-	insinto boot
-	newins .config config-"${KV_FULL}"
-	newins System.map System.map-"${KV_FULL}"
-	newins arch/x86/boot/bzImage vmlinuz-"${KV_FULL}"
-
-	dodir usr/src/linux-"${KV_FULL}"
-	insinto usr/src/linux-"${KV_FULL}"
-	doins Module.symvers
-	doins System.map
-	exeinto usr/src/linux-"${KV_FULL}"
-	doexe vmlinux
-
-	emake INSTALL_MOD_PATH="${D}" modules_install
-
-	rm -f "${D}"lib/modules/"${KV_FULL}"/build
-	rm -f "${D}"lib/modules/"${KV_FULL}"/source
-	export local KSYMS
-	for KSYMS in build source ; do
-		dosym ../../../usr/src/linux-"${KV_FULL}" lib/modules/"${KV_FULL}"/"${KSYMS}"
-	done
-}
-
-_grub2_update_grubcfg() {
-	if [[ -x $(which grub2-mkconfig) ]]; then
-		elog "Updating GRUB-2 bootloader configuration, please wait"
-		grub2-mkconfig -o "${ROOT}"boot/grub/grub.cfg
-	else
-		elog "It looks like you're not using GRUB-2, you must update bootloader configuration by hand"
-	fi
-}
-
-_dracut_initrd_create() {
-	if [[ -x $(which dracut) ]]; then
-		elog "Generating initrd for "${KV_FULL}", please wait"
-		addpredict /etc/ld.so.cache~
-		dracut -N -f --kver="${KV_FULL}" "${ROOT}"boot/initrd-"${KV_FULL}"
-	else
-		elog "It looks like you're not using dracut, you must generate an initrd by hand"
-	fi
-}
-
-_dracut_initrd_delete() {
-	rm -rf "${ROOT}"boot/initrd-"${KV_FULL}"
-}
-
-_dkms_modules_delete() {
-	if [[ -x $(which dkms) ]] ; then
-		export local DKMSMOD
-		for DKMSMOD in $(dkms status | cut -d " " -f1,2 | sed -e 's/,//g' | sed -e 's/ /\//g' | sed -e 's/://g' | uniq) ; do
-			dkms remove "${DKMSMOD}" -k "${KV_FULL}"
-		done
-	fi
-}
-
-_kernel_modules_delete() {
-	rm -rf "${ROOT}"lib/modules/"${KV_FULL}"
-}
-
-pkg_postinst() {
-	if [ $(stat -c %d:%i /) == $(stat -c %d:%i /proc/1/root/.) ]; then
-		if use dracut; then
-			_dracut_initrd_create
-		fi
-		_grub2_update_grubcfg
-	fi
-}
-
-pkg_postrm() {
-	if [ $(stat -c %d:%i /) == $(stat -c %d:%i /proc/1/root/.) ]; then
-		if use dracut; then
-			_dracut_initrd_delete
-		fi
-		_grub2_update_grubcfg
-	fi
-	if use dkms; then
-		_dkms_modules_delete
-	fi
-	_kernel_modules_delete
-}
diff --git a/sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-5.10.56.ebuild b/sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-5.10.56.ebuild
new file mode 100644
index 00000000..3746106b
--- /dev/null
+++ b/sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-5.10.56.ebuild
@@ -0,0 +1,162 @@
+# Copyright 1999-2017 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+
+EAPI=6
+
+inherit eutils
+
+EXTRAVERSION="redcore-lts"  # suffix src_prepare writes into the kernel Makefile's EXTRAVERSION line
+KV_FULL="${PV}-${EXTRAVERSION}"  # full kernel version string; used in every installed file/dir name below
+KV_MAJOR="5.10"  # prefix shared by the config and patch files under FILESDIR
+
+DESCRIPTION="Redcore Linux Kernel Image (LTS)"
+HOMEPAGE="https://redcorelinux.org"
+SRC_URI="https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-${PV}.tar.xz"
+
+KEYWORDS="~amd64"
+LICENSE="GPL-2"
+SLOT="${PVR}"  # one slot per version-revision
+IUSE="+cryptsetup +dmraid +dracut +dkms +mdadm"
+
+RESTRICT="binchecks strip mirror"
+DEPEND="
+	app-arch/lz4
+	app-arch/xz-utils
+	sys-devel/autoconf
+	sys-devel/bc
+	sys-devel/make
+	cryptsetup? ( sys-fs/cryptsetup )
+	dmraid? ( sys-fs/dmraid )
+	dracut? ( >=sys-kernel/dracut-0.44-r8 )
+	dkms? ( sys-kernel/dkms sys-kernel/linux-sources-redcore-lts:${SLOT} )
+	mdadm? ( sys-fs/mdadm )
+	>=sys-kernel/linux-firmware-20180314"
+RDEPEND="${DEPEND}"  # runtime dependencies are the same list as the build dependencies
+
+PATCHES=(  # all files live under FILESDIR and carry the KV_MAJOR prefix; presumably applied in this order by the default src_prepare
+	"${FILESDIR}"/"${KV_MAJOR}"-ath10k-be-quiet.patch
+	"${FILESDIR}"/"${KV_MAJOR}"-ata-fix-NCQ-LOG-strings-and-move-to-debug.patch
+	"${FILESDIR}"/"${KV_MAJOR}"-radeon_dp_aux_transfer_native-no-ratelimited_debug.patch
+	"${FILESDIR}"/"${KV_MAJOR}"-acpi-use-kern_warning_even_when_error.patch
+	"${FILESDIR}"/"${KV_MAJOR}"-Unknow-SSD-HFM128GDHTNG-8310B-QUIRK_NO_APST.patch
+	"${FILESDIR}"/"${KV_MAJOR}"-nvme-Patriot_Viper_VPN100-QUIRK_IGNORE_DEV_SUBNQN.patch
+	"${FILESDIR}"/"${KV_MAJOR}"-do_not_bug_the_next_18-years.patch
+	"${FILESDIR}"/"${KV_MAJOR}"-iwlwifi-use-debug-for-debug-infos.patch
+	"${FILESDIR}"/"${KV_MAJOR}"-compress-modules-zstd-support.patch
+	"${FILESDIR}"/"${KV_MAJOR}"-fix-bootconfig-makefile.patch
+	"${FILESDIR}"/"${KV_MAJOR}"-apic_vector-spam-in-debug-mode-only.patch
+	"${FILESDIR}"/"${KV_MAJOR}"-iwlwifi-fix-5e003982b07ae.patch
+	"${FILESDIR}"/"${KV_MAJOR}"-enable-new-amd-energy-driver-for-all-ryzen.patch
+	"${FILESDIR}"/"${KV_MAJOR}"-0001-Revert-hwmon-k10temp-Remove-support-for-displaying-v.patch
+	"${FILESDIR}"/"${KV_MAJOR}"-k10temp-fix-ZEN2-desktop-add-ZEN3-desktop.patch
+	"${FILESDIR}"/"${KV_MAJOR}"-add-amd-sfh-hid_driver.patch
+	"${FILESDIR}"/"${KV_MAJOR}"-add-sbtsi_driver.patch
+	"${FILESDIR}"/"${KV_MAJOR}"-0001-Revert-cpufreq-Avoid-configuring-old-governors-as-de.patch
+	"${FILESDIR}"/"${KV_MAJOR}"-revert-parts-of-a00ec3874e7d326ab2dffbed92faddf6a77a84e9-no-Intel-NO.patch
+	"${FILESDIR}"/"${KV_MAJOR}"-ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-C.patch
+	"${FILESDIR}"/"${KV_MAJOR}"-uksm.patch
+)
+
+S="${WORKDIR}"/linux-"${PV}"  # presumably the directory the linux-${PV} tarball from SRC_URI unpacks to
+
+pkg_setup() {
+	# Fixed build user/host exported for the kernel build; the incoming ARCH is kept in REAL_ARCH.
+	export KBUILD_BUILD_USER="nexus" KBUILD_BUILD_HOST="nexus.redcorelinux.org"
+	export REAL_ARCH="${ARCH}"
+	# ARCH and LDFLAGS will interfere with the kernel Makefile if left set.
+	unset ARCH LDFLAGS
+}
+
+src_prepare() {	# patch the tree, stamp EXTRAVERSION, drop in the shipped config
+	default	# EAPI 6 default phase: applies the PATCHES array
+	emake mrproper
+	sed -ri "s|^(EXTRAVERSION =).*|\1 -${EXTRAVERSION}|" Makefile || die	# kernel reports itself as KV_FULL
+	cp "${FILESDIR}"/"${KV_MAJOR}"-amd64.config .config || die
+	find . -type f -name '*.orig' -delete || die	# backup files by name only; no word-splitting of paths
+}
+
+src_compile() {	# one emake run: the two prepare targets, then the bzImage and modules targets
+	emake prepare modules_prepare bzImage modules
+}
+
+src_install() {
+	# Config, symbol map and bzImage go under boot, each suffixed with KV_FULL.
+	dodir boot
+	insinto boot
+	newins .config config-"${KV_FULL}"
+	newins System.map System.map-"${KV_FULL}"
+	newins arch/x86/boot/bzImage vmlinuz-"${KV_FULL}"
+
+	dodir usr/src/linux-"${KV_FULL}"
+	insinto usr/src/linux-"${KV_FULL}"
+	doins Module.symvers System.map
+	exeinto usr/src/linux-"${KV_FULL}"
+	doexe vmlinux
+
+	emake INSTALL_MOD_PATH="${D}" modules_install
+
+	# Swap any build/source entries left by modules_install for relative links into usr/src.
+	local KSYMS	# loop variable stays function-local instead of being exported
+	for KSYMS in build source ; do
+		rm -f "${D}"lib/modules/"${KV_FULL}"/"${KSYMS}"
+		dosym ../../../usr/src/linux-"${KV_FULL}" lib/modules/"${KV_FULL}"/"${KSYMS}"
+	done
+}
+
+_grub2_update_grubcfg() {	# regenerate boot/grub/grub.cfg when grub2-mkconfig is on PATH
+	if type -P grub2-mkconfig > /dev/null ; then	# builtin lookup; no dependency on the external which
+		elog "Updating GRUB-2 bootloader configuration, please wait"
+		grub2-mkconfig -o "${ROOT}"boot/grub/grub.cfg
+	else
+		elog "It looks like you're not using GRUB-2, you must update bootloader configuration by hand"
+	fi
+}
+
+_dracut_initrd_create() {	# write boot/initrd-KV_FULL with dracut when dracut is on PATH
+	if type -P dracut > /dev/null ; then	# builtin lookup; no dependency on the external which
+		elog "Generating initrd for ${KV_FULL}, please wait"
+		addpredict /etc/ld.so.cache~	# sandbox helper; presumably dracut touches this path - confirm
+		dracut -N -f --kver="${KV_FULL}" "${ROOT}"boot/initrd-"${KV_FULL}"
+	else
+		elog "It looks like you're not using dracut, you must generate an initrd by hand"
+	fi
+}
+
+_dracut_initrd_delete() {	# remove the initrd path that _dracut_initrd_create writes
+	rm -rf "${ROOT}boot/initrd-${KV_FULL}"
+}
+
+_dkms_modules_delete() {	# ask dkms to remove every module it lists, for kernel KV_FULL only
+	if type -P dkms > /dev/null ; then	# builtin lookup; no dependency on the external which
+		local DKMSMOD	# first two fields of a status line, stripped of , and : and joined by /
+		for DKMSMOD in $(dkms status | cut -d " " -f1,2 | sed -e 's/,//g' -e 's/ /\//g' -e 's/://g' | uniq) ; do
+			dkms remove "${DKMSMOD}" -k "${KV_FULL}"
+		done
+	fi
+}
+
+_kernel_modules_delete() {	# remove the whole lib/modules tree of this kernel version
+	rm -rf "${ROOT}lib/modules/${KV_FULL}"
+}
+
+pkg_postinst() {
+	# Only act when / is the same dev:inode as PID 1's root (presumably: not in a
+	# chroot). The quoted [[ ]] test is plainly false if the /proc lookup prints nothing.
+	if [[ $(stat -c %d:%i /) == "$(stat -c %d:%i /proc/1/root/.)" ]]; then
+		use dracut && _dracut_initrd_create
+		_grub2_update_grubcfg
+	fi
+}
+
+pkg_postrm() {
+	# Initrd and grub.cfg are touched only when / is the same dev:inode as PID 1's root.
+	if [[ $(stat -c %d:%i /) == "$(stat -c %d:%i /proc/1/root/.)" ]]; then
+		if use dracut; then
+			_dracut_initrd_delete
+		fi
+		_grub2_update_grubcfg
+	fi
+	# Module cleanup runs regardless of the check above.
+	use dkms && _dkms_modules_delete
+	_kernel_modules_delete
+}
-- 
cgit v1.2.3