merge-gve-1.0.1 from branch/tag: gve-1.0.1 into branch: lakitu-4.19 Changelog: ------------------------------------------------------------- Kuo Zhao (1): gve: Add gve-1.0.1 source. BUG=b:151083587 TEST=tryjob, validation and K8s e2e RELEASE_NOTE=None Signed-off-by: Vaibhav Rustagi <vaibhavrustagi@google.com> Change-Id: I79794dae89590e2ac2362b2fdba9603f871c210a

commit: 8bd983177d60daba1e1df7e90bf02cfd0294e9b9 [log] [tgz]
author: Vaibhav Rustagi <vaibhavrustagi@google.com> Tue Mar 10 15:10:28 2020 -0700
committer: Vaibhav Rustagi <vaibhavrustagi@google.com> Tue Mar 10 15:11:30 2020 -0700
tree: 1493a270e26d1bedd8cbd8507e1b896415e31d79
parent: b0f6c7137076f6294fffb18fed6f715573372042 [diff]
parent: cbfc70d9eef0ebbd53fcf581cf1af76777e7071b [diff]
diff --git a/.gitignore b/.gitignore
index 97ba6b7..98e745c 100644
--- a/.gitignore
+++ b/.gitignore

@@ -94,6 +94,9 @@
 include/ksym
 arch/*/include/generated
 
+# kernelconfig build directory
+/build/
+
 # stgit generated dirs
 patches-*
 

diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab
index 29601d9..d742c6c 100644
--- a/Documentation/ABI/testing/sysfs-kernel-slab
+++ b/Documentation/ABI/testing/sysfs-kernel-slab

@@ -106,6 +106,15 @@
 		are from ZONE_DMA.
 		Available when CONFIG_ZONE_DMA is enabled.
 
+What:		/sys/kernel/slab/cache/cache_dma32
+Date:		December 2018
+KernelVersion:	4.21
+Contact:	Nicolas Boichat <drinkcat@chromium.org>
+Description:
+		The cache_dma32 file is read-only and specifies whether objects
+		are from ZONE_DMA32.
+		Available when CONFIG_ZONE_DMA32 is enabled.
+
 What:		/sys/kernel/slab/cache/cpu_slabs
 Date:		May 2007
 KernelVersion:	2.6.22

diff --git a/Documentation/ABI/testing/sysfs-kernel-wakeup_reasons b/Documentation/ABI/testing/sysfs-kernel-wakeup_reasons
new file mode 100644
index 0000000..acb19b9
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-wakeup_reasons

@@ -0,0 +1,16 @@
+What:		/sys/kernel/wakeup_reasons/last_resume_reason
+Date:		February 2014
+Contact:	Ruchi Kandoi <kandoiruchi@google.com>
+Description:
+		The /sys/kernel/wakeup_reasons/last_resume_reason is
+		used to report wakeup reasons after system exited suspend.
+
+What:		/sys/kernel/wakeup_reasons/last_suspend_time
+Date:		March 2015
+Contact:	jinqian <jinqian@google.com>
+Description:
+		The /sys/kernel/wakeup_reasons/last_suspend_time is
+		used to report time spent in last suspend cycle. It contains
+		two numbers (in seconds) separated by space. First number is
+		the time spent in suspend and resume processes. Second number
+		is the time spent in sleep state.
\ No newline at end of file

diff --git a/Documentation/admin-guide/LSM/LoadPin.rst b/Documentation/admin-guide/LSM/LoadPin.rst
index 3207076..716ad9b 100644
--- a/Documentation/admin-guide/LSM/LoadPin.rst
+++ b/Documentation/admin-guide/LSM/LoadPin.rst

@@ -19,3 +19,13 @@
 created to toggle pinning: ``/proc/sys/kernel/loadpin/enabled``. (Having
 a mutable filesystem means pinning is mutable too, but having the
 sysctl allows for easy testing on systems with a mutable filesystem.)
+
+It's also possible to exclude specific file types from LoadPin using kernel
+command line option "``loadpin.exclude``". By default, all files are
+included, but they can be excluded using kernel command line option such
+as "``loadpin.exclude=kernel-module,kexec-image``". This allows to use
+different mechanisms such as ``CONFIG_MODULE_SIG`` and
+``CONFIG_KEXEC_VERIFY_SIG`` to verify kernel module and kernel image while
+still use LoadPin to protect the integrity of other files kernel loads. The
+full list of valid file types can be found in ``kernel_read_file_str``
+defined in ``include/linux/fs.h``.

diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
index 8d8d8f0..1a0f2ac0 100644
--- a/Documentation/block/bfq-iosched.txt
+++ b/Documentation/block/bfq-iosched.txt

@@ -20,13 +20,26 @@
 details on how to configure BFQ for the desired tradeoff between
 latency and throughput, or on how to maximize throughput.
 
-BFQ has a non-null overhead, which limits the maximum IOPS that a CPU
-can process for a device scheduled with BFQ. To give an idea of the
-limits on slow or average CPUs, here are, first, the limits of BFQ for
-three different CPUs, on, respectively, an average laptop, an old
-desktop, and a cheap embedded system, in case full hierarchical
-support is enabled (i.e., CONFIG_BFQ_GROUP_IOSCHED is set), but
-CONFIG_DEBUG_BLK_CGROUP is not set (Section 4-2):
+As every I/O scheduler, BFQ adds some overhead to per-I/O-request
+processing. To give an idea of this overhead, the total,
+single-lock-protected, per-request processing time of BFQ---i.e., the
+sum of the execution times of the request insertion, dispatch and
+completion hooks---is, e.g., 1.9 us on an Intel Core i7-2760QM@2.40GHz
+(dated CPU for notebooks; time measured with simple code
+instrumentation, and using the throughput-sync.sh script of the S
+suite [1], in performance-profiling mode). To put this result into
+context, the total, single-lock-protected, per-request execution time
+of the lightest I/O scheduler available in blk-mq, mq-deadline, is 0.7
+us (mq-deadline is ~800 LOC, against ~10500 LOC for BFQ).
+
+Scheduling overhead further limits the maximum IOPS that a CPU can
+process (already limited by the execution of the rest of the I/O
+stack). To give an idea of the limits with BFQ, on slow or average
+CPUs, here are, first, the limits of BFQ for three different CPUs, on,
+respectively, an average laptop, an old desktop, and a cheap embedded
+system, in case full hierarchical support is enabled (i.e.,
+CONFIG_BFQ_GROUP_IOSCHED is set), but CONFIG_DEBUG_BLK_CGROUP is not
+set (Section 4-2):
 - Intel i7-4850HQ: 400 KIOPS
 - AMD A8-3850: 250 KIOPS
 - ARM CortexTM-A53 Octa-core: 80 KIOPS
@@ -357,6 +370,13 @@
 than maximum throughput. In these cases, consider setting the
 strict_guarantees parameter.
 
+slice_idle_us
+-------------
+
+Controls the same tuning parameter as slice_idle, but in microseconds.
+Either tunable can be used to set idling behavior.  Afterwards, the
+other tunable will reflect the newly set value in sysfs.
+
 strict_guarantees
 -----------------
 
@@ -559,3 +579,5 @@
     Slightly extended version:
     http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-
 							results.pdf
+
+[3] https://github.com/Algodev-github/S

diff --git a/Documentation/dev-tools/gcov.rst b/Documentation/dev-tools/gcov.rst
index 69a7d90..46aae52 100644
--- a/Documentation/dev-tools/gcov.rst
+++ b/Documentation/dev-tools/gcov.rst

@@ -34,10 +34,6 @@
         CONFIG_DEBUG_FS=y
         CONFIG_GCOV_KERNEL=y
 
-select the gcc's gcov format, default is autodetect based on gcc version::
-
-        CONFIG_GCOV_FORMAT_AUTODETECT=y
-
 and to get coverage data for the entire kernel::
 
         CONFIG_GCOV_PROFILE_ALL=y
@@ -169,6 +165,20 @@
       [user@build] gcov -o /tmp/coverage/tmp/out/init main.c
 
 
+Note on compilers
+-----------------
+
+GCC and LLVM gcov tools are not necessarily compatible. Use gcov_ to work with
+GCC-generated .gcno and .gcda files, and use llvm-cov_ for Clang.
+
+.. _gcov: http://gcc.gnu.org/onlinedocs/gcc/Gcov.html
+.. _llvm-cov: https://llvm.org/docs/CommandGuide/llvm-cov.html
+
+Build differences between GCC and Clang gcov are handled by Kconfig. It
+automatically selects the appropriate gcov format depending on the detected
+toolchain.
+
+
 Troubleshooting
 ---------------
 

diff --git a/Documentation/device-mapper/dm-init.txt b/Documentation/device-mapper/dm-init.txt
new file mode 100644
index 0000000..8464ee7
--- /dev/null
+++ b/Documentation/device-mapper/dm-init.txt

@@ -0,0 +1,114 @@
+Early creation of mapped devices
+====================================
+
+It is possible to configure a device-mapper device to act as the root device for
+your system in two ways.
+
+The first is to build an initial ramdisk which boots to a minimal userspace
+which configures the device, then pivot_root(8) in to it.
+
+The second is to create one or more device-mappers using the module parameter
+"dm-mod.create=" through the kernel boot command line argument.
+
+The format is specified as a string of data separated by commas and optionally
+semi-colons, where:
+ - a comma is used to separate fields like name, uuid, flags and table
+   (specifies one device)
+ - a semi-colon is used to separate devices.
+
+So the format will look like this:
+
+ dm-mod.create=<name>,<uuid>,<minor>,<flags>,<table>[,<table>+][;<name>,<uuid>,<minor>,<flags>,<table>[,<table>+]+]
+
+Where,
+	<name>		::= The device name.
+	<uuid>		::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | ""
+	<minor>		::= The device minor number | ""
+	<flags>		::= "ro" | "rw"
+	<table>		::= <start_sector> <num_sectors> <target_type> <target_args>
+	<target_type>	::= "verity" | "linear" | ... (see list below)
+
+The dm line should be equivalent to the one used by the dmsetup tool with the
+--concise argument.
+
+Target types
+============
+
+Not all target types are available as there are serious risks in allowing
+activation of certain DM targets without first using userspace tools to check
+the validity of associated metadata.
+
+	"cache":		constrained, userspace should verify cache device
+	"crypt":		allowed
+	"delay":		allowed
+	"era":			constrained, userspace should verify metadata device
+	"flakey":		constrained, meant for test
+	"linear":		allowed
+	"log-writes":		constrained, userspace should verify metadata device
+	"mirror":		constrained, userspace should verify main/mirror device
+	"raid":			constrained, userspace should verify metadata device
+	"snapshot":		constrained, userspace should verify src/dst device
+	"snapshot-origin":	allowed
+	"snapshot-merge":	constrained, userspace should verify src/dst device
+	"striped":		allowed
+	"switch":		constrained, userspace should verify dev path
+	"thin":			constrained, requires dm target message from userspace
+	"thin-pool":		constrained, requires dm target message from userspace
+	"verity":		allowed
+	"writecache":		constrained, userspace should verify cache device
+	"zero":			constrained, not meant for rootfs
+
+If the target is not listed above, it is constrained by default (not tested).
+
+Examples
+========
+An example of booting to a linear array made up of user-mode linux block
+devices:
+
+  dm-mod.create="lroot,,,rw, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" root=/dev/dm-0
+
+This will boot to a rw dm-linear target of 8192 sectors split across two block
+devices identified by their major:minor numbers.  After boot, udev will rename
+this target to /dev/mapper/lroot (depending on the rules). No uuid was assigned.
+
+An example of multiple device-mappers, with the dm-mod.create="..." contents is shown here
+split on multiple lines for readability:
+
+  vroot,,,ro,
+    0 1740800 verity 254:0 254:0 1740800 sha1
+      76e9be054b15884a9fa85973e9cb274c93afadb6
+      5b3549d54d6c7a3837b9b81ed72e49463a64c03680c47835bef94d768e5646fe;
+  vram,,,rw,
+    0 32768 linear 1:0 0,
+    32768 32768 linear 1:1 0
+
+Other examples (per target):
+
+"crypt":
+  dm-crypt,,8,ro,
+    0 1048576 crypt aes-xts-plain64
+    babebabebabebabebabebabebabebabebabebabebabebabebabebabebabebabe 0
+    /dev/sda 0 1 allow_discards
+
+"delay":
+  dm-delay,,4,ro,0 409600 delay /dev/sda1 0 500
+
+"linear":
+  dm-linear,,,rw,
+    0 32768 linear /dev/sda1 0,
+    32768 1024000 linear /dev/sda2 0,
+    1056768 204800 linear /dev/sda3 0,
+    1261568 512000 linear /dev/sda4 0
+
+"snapshot-origin":
+  dm-snap-orig,,4,ro,0 409600 snapshot-origin 8:2
+
+"striped":
+  dm-striped,,4,ro,0 1638400 striped 4 4096
+  /dev/sda1 0 /dev/sda2 0 /dev/sda3 0 /dev/sda4 0
+
+"verity":
+  dm-verity,,4,ro,
+    0 1638400 verity 1 8:1 8:2 4096 4096 204800 1 sha256
+    fb1a5a0f00deb908d8b53cb270858975e76cf64105d412ce764225d53b8f3cfd
+    51934789604d1b92399c52e7cb149d1b3a1b74bbbcb103b2a0aaacbed5c08584

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 0d0ecc7..94ce383 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt

@@ -398,6 +398,8 @@
  [stack]                  = the stack of the main process
  [vdso]                   = the "virtual dynamic shared object",
                             the kernel system call handler
+ [anon:<name>]            = an anonymous mapping that has been
+                            named by userspace
 
  or if empty, the mapping is anonymous.
 
@@ -427,6 +429,7 @@
 Locked:                0 kB
 THPeligible:           0
 VmFlags: rd ex mr mw me dw
+Name:           name from userspace
 
 the first of these lines shows the same information as is displayed for the
 mapping in /proc/PID/maps.  The remaining lines show the size of the mapping
@@ -503,6 +506,9 @@
 might change in future as well. So each consumer of these flags has to
 follow each specific kernel version for the exact semantic.
 
+The "Name" field will only be present on a mapping that has been named by
+userspace, and will show the name passed in by userspace.
+
 This file is only present if the CONFIG_MMU kernel configuration option is
 enabled.
 

diff --git a/Documentation/lzo.txt b/Documentation/lzo.txt
index 6fa6a93..f799342 100644
--- a/Documentation/lzo.txt
+++ b/Documentation/lzo.txt

@@ -78,16 +78,34 @@
      is an implementation design choice independent on the algorithm or
      encoding.
 
+Versions
+
+0: Original version
+1: LZO-RLE
+
+Version 1 of LZO implements an extension to encode runs of zeros using run
+length encoding. This improves speed for data with many zeros, which is a
+common case for zram. This modifies the bitstream in a backwards compatible way
+(v1 can correctly decompress v0 compressed data, but v0 cannot read v1 data).
+
+For maximum compatibility, both versions are available under different names
+(lzo and lzo-rle). Differences in the encoding are noted in this document with
+e.g.: version 1 only.
+
 Byte sequences
 ==============
 
   First byte encoding::
 
-      0..17   : follow regular instruction encoding, see below. It is worth
-                noting that codes 16 and 17 will represent a block copy from
-                the dictionary which is empty, and that they will always be
+      0..16   : follow regular instruction encoding, see below. It is worth
+                noting that code 16 will represent a block copy from the
+                dictionary which is empty, and that it will always be
                 invalid at this place.
 
+      17      : bitstream version. If the first byte is 17, the next byte
+                gives the bitstream version (version 1 only). If the first byte
+                is not 17, the bitstream version is 0.
+
       18..21  : copy 0..3 literals
                 state = (byte - 17) = 0..3  [ copy <state> literals ]
                 skip byte
@@ -140,6 +158,11 @@
            state = S (copy S literals after this block)
            End of stream is reached if distance == 16384
 
+        In version 1 only, this instruction is also used to encode a run of
+        zeros if distance = 0xbfff, i.e. H = 1 and the D bits are all 1.
+           In this case, it is followed by a fourth byte, X.
+           run length = ((X << 3) | (0 0 0 0 0 L L L)) + 4.
+
       0 0 1 L L L L L  (32..63)
            Copy of small block within 16kB distance (preferably less than 34B)
            length = 2 + (L ?: 31 + (zero_bytes * 255) + non_zero_byte)
@@ -165,7 +188,9 @@
 =======
 
   This document was written by Willy Tarreau <w@1wt.eu> on 2014/07/19 during an
-  analysis of the decompression code available in Linux 3.16-rc5. The code is
-  tricky, it is possible that this document contains mistakes or that a few
-  corner cases were overlooked. In any case, please report any doubt, fix, or
-  proposed updates to the author(s) so that the document can be updated.
+  analysis of the decompression code available in Linux 3.16-rc5, and updated
+  by Dave Rodgman <dave.rodgman@arm.com> on 2018/10/30 to introduce run-length
+  encoding. The code is tricky, it is possible that this document contains
+  mistakes or that a few corner cases were overlooked. In any case, please
+  report any doubt, fix, or proposed updates to the author(s) so that the
+  document can be updated.

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 7eb9366..7294284 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt

@@ -639,6 +639,16 @@
 	0 to disable the blackhole detection.
 	By default, it is set to 1hr.
 
+tcp_fwmark_accept - BOOLEAN
+	If set, incoming connections to listening sockets that do not have a
+	socket mark will set the mark of the accepting socket to the fwmark of
+	the incoming SYN packet. This will cause all packets on that connection
+	(starting from the first SYNACK) to be sent with that fwmark. The
+	listening socket's mark is unchanged. Listening sockets that already
+	have a fwmark set via setsockopt(SOL_SOCKET, SO_MARK, ...) are
+	unaffected.
+	Default: 0
+
 tcp_syn_retries - INTEGER
 	Number of times initial SYNs for an active TCP connection attempt
 	will be retransmitted. Should not be higher than 127. Default value

diff --git a/Documentation/networking/nf_flowtable.txt b/Documentation/networking/nf_flowtable.txt
index 54128c5..b01c918 100644
--- a/Documentation/networking/nf_flowtable.txt
+++ b/Documentation/networking/nf_flowtable.txt

@@ -76,7 +76,7 @@
 
         table inet x {
 		flowtable f {
-			hook ingress priority 0 devices = { eth0, eth1 };
+			hook ingress priority 0; devices = { eth0, eth1 };
 		}
                 chain y {
                         type filter hook forward priority 0; policy accept;

diff --git a/Documentation/scheduler/sched-tune.txt b/Documentation/scheduler/sched-tune.txt
new file mode 100644
index 0000000..1a10371
--- /dev/null
+++ b/Documentation/scheduler/sched-tune.txt

@@ -0,0 +1,388 @@
+             Central, scheduler-driven, power-performance control
+                               (EXPERIMENTAL)
+
+Abstract
+========
+
+The topic of a single simple power-performance tunable, that is wholly
+scheduler centric, and has well defined and predictable properties has come up
+on several occasions in the past [1,2]. With techniques such as a scheduler
+driven DVFS [3], we now have a good framework for implementing such a tunable.
+This document describes the overall ideas behind its design and implementation.
+
+
+Table of Contents
+=================
+
+1. Motivation
+2. Introduction
+3. Signal Boosting Strategy
+4. OPP selection using boosted CPU utilization
+5. Per task group boosting
+6. Per-task wakeup-placement-strategy Selection
+7. Question and Answers
+   - What about "auto" mode?
+   - What about boosting on a congested system?
+   - How CPUs are boosted when we have tasks with multiple boost values?
+8. References
+
+
+1. Motivation
+=============
+
+Schedutil [3] is a utilization-driven cpufreq governor which allows the
+scheduler to select the optimal DVFS operating point (OPP) for running a task
+allocated to a CPU.
+
+However, sometimes it may be desired to intentionally boost the performance of
+a workload even if that could imply a reasonable increase in energy
+consumption. For example, in order to reduce the response time of a task, we
+may want to run the task at a higher OPP than the one that is actually required
+by it's CPU bandwidth demand.
+
+This last requirement is especially important if we consider that one of the
+main goals of the utilization-driven governor component is to replace all
+currently available CPUFreq policies. Since schedutil is event-based, as
+opposed to the sampling driven governors we currently have, they are already
+more responsive at selecting the optimal OPP to run tasks allocated to a CPU.
+However, just tracking the actual task utilization may not be enough from a
+performance standpoint.  For example, it is not possible to get behaviors
+similar to those provided by the "performance" and "interactive" CPUFreq
+governors.
+
+This document describes an implementation of a tunable, stacked on top of the
+utilization-driven governor which extends its functionality to support task
+performance boosting.
+
+By "performance boosting" we mean the reduction of the time required to
+complete a task activation, i.e. the time elapsed from a task wakeup to its
+next deactivation (e.g. because it goes back to sleep or it terminates).  For
+example, if we consider a simple periodic task which executes the same workload
+for 5[s] every 20[s] while running at a certain OPP, a boosted execution of
+that task must complete each of its activations in less than 5[s].
+
+The rest of this document introduces in more details the proposed solution
+which has been named SchedTune.
+
+
+2. Introduction
+===============
+
+SchedTune exposes a simple user-space interface provided through a new
+CGroup controller 'stune' which provides two power-performance tunables
+per group:
+
+  /<stune cgroup mount point>/schedtune.prefer_idle
+  /<stune cgroup mount point>/schedtune.boost
+
+The CGroup implementation permits arbitrary user-space defined task
+classification to tune the scheduler for different goals depending on the
+specific nature of the task, e.g. background vs interactive vs low-priority.
+
+More details are given in section 5.
+
+2.1 Boosting
+============
+
+The boost value is expressed as an integer in the range [0..100].
+
+A value of 0 (default) configures the CFS scheduler for maximum energy
+efficiency. This means that schedutil runs the tasks at the minimum OPP
+required to satisfy their workload demand.
+
+A value of 100 configures scheduler for maximum performance, which translates
+to the selection of the maximum OPP on that CPU.
+
+The range between 0 and 100 can be set to satisfy other scenarios suitably. For
+example to satisfy interactive response or depending on other system events
+(battery level etc).
+
+The overall design of the SchedTune module is built on top of "Per-Entity Load
+Tracking" (PELT) signals and schedutil by introducing a bias on the OPP
+selection.
+
+Each time a task is allocated on a CPU, cpufreq is given the opportunity to tune
+the operating frequency of that CPU to better match the workload demand. The
+selection of the actual OPP being activated is influenced by the boost value
+for the task CGroup.
+
+This simple biasing approach leverages existing frameworks, which means minimal
+modifications to the scheduler, and yet it allows to achieve a range of
+different behaviours all from a single simple tunable knob.
+
+In EAS schedulers, we use boosted task and CPU utilization for energy
+calculation and energy-aware task placement.
+
+2.2 prefer_idle
+===============
+
+This is a flag which indicates to the scheduler that userspace would like
+the scheduler to focus on energy or to focus on performance.
+
+A value of 0 (default) signals to the CFS scheduler that tasks in this group
+can be placed according to the energy-aware wakeup strategy.
+
+A value of 1 signals to the CFS scheduler that tasks in this group should be
+placed to minimise wakeup latency.
+
+Android platforms typically use this flag for application tasks which the
+user is currently interacting with.
+
+
+3. Signal Boosting Strategy
+===========================
+
+The whole PELT machinery works based on the value of a few load tracking signals
+which basically track the CPU bandwidth requirements for tasks and the capacity
+of CPUs. The basic idea behind the SchedTune knob is to artificially inflate
+some of these load tracking signals to make a task or RQ appears more demanding
+that it actually is.
+
+Which signals have to be inflated depends on the specific "consumer".  However,
+independently from the specific (signal, consumer) pair, it is important to
+define a simple and possibly consistent strategy for the concept of boosting a
+signal.
+
+A boosting strategy defines how the "abstract" user-space defined
+sched_cfs_boost value is translated into an internal "margin" value to be added
+to a signal to get its inflated value:
+
+  margin         := boosting_strategy(sched_cfs_boost, signal)
+  boosted_signal := signal + margin
+
+The boosting strategy currently implemented in SchedTune is called 'Signal
+Proportional Compensation' (SPC). With SPC, the sched_cfs_boost value is used to
+compute a margin which is proportional to the complement of the original signal.
+When a signal has a maximum possible value, its complement is defined as
+the delta from the actual value and its possible maximum.
+
+Since the tunable implementation uses signals which have SCHED_CAPACITY_SCALE as
+the maximum possible value, the margin becomes:
+
+	margin := sched_cfs_boost * (SCHED_CAPACITY_SCALE - signal)
+
+Using this boosting strategy:
+- a 100% sched_cfs_boost means that the signal is scaled to the maximum value
+- each value in the range of sched_cfs_boost effectively inflates the signal in
+  question by a quantity which is proportional to the maximum value.
+
+For example, by applying the SPC boosting strategy to the selection of the OPP
+to run a task it is possible to achieve these behaviors:
+
+-   0% boosting: run the task at the minimum OPP required by its workload
+- 100% boosting: run the task at the maximum OPP available for the CPU
+-  50% boosting: run at the half-way OPP between minimum and maximum
+
+Which means that, at 50% boosting, a task will be scheduled to run at half of
+the maximum theoretically achievable performance on the specific target
+platform.
+
+A graphical representation of an SPC boosted signal is represented in the
+following figure where:
+ a) "-" represents the original signal
+ b) "b" represents a  50% boosted signal
+ c) "p" represents a 100% boosted signal
+
+
+   ^
+   |  SCHED_CAPACITY_SCALE
+   +-----------------------------------------------------------------+
+   |pppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
+   |
+   |                                             boosted_signal
+   |                                          bbbbbbbbbbbbbbbbbbbbbbbb
+   |
+   |                                            original signal
+   |                  bbbbbbbbbbbbbbbbbbbbbbbb+----------------------+
+   |                                          |
+   |bbbbbbbbbbbbbbbbbb                        |
+   |                                          |
+   |                                          |
+   |                                          |
+   |                  +-----------------------+
+   |                  |
+   |                  |
+   |                  |
+   |------------------+
+   |
+   |
+   +----------------------------------------------------------------------->
+
+The plot above shows a ramped load signal (titled 'original_signal') and it's
+boosted equivalent. For each step of the original signal the boosted signal
+corresponding to a 50% boost is midway from the original signal and the upper
+bound. Boosting by 100% generates a boosted signal which is always saturated to
+the upper bound.
+
+
+4. OPP selection using boosted CPU utilization
+==============================================
+
+It is worth calling out that the implementation does not introduce any new load
+signals. Instead, it provides an API to tune existing signals. This tuning is
+done on demand and only in scheduler code paths where it is sensible to do so.
+The new API calls are defined to return either the default signal or a boosted
+one, depending on the value of sched_cfs_boost. This is a clean an non invasive
+modification of the existing existing code paths.
+
+The signal representing a CPU's utilization is boosted according to the
+previously described SPC boosting strategy. To schedutil, this allows a CPU
+(ie CFS run-queue) to appear more used then it actually is.
+
+Thus, with the sched_cfs_boost enabled we have the following main functions to
+get the current utilization of a CPU:
+
+  cpu_util()
+  boosted_cpu_util()
+
+The new boosted_cpu_util() is similar to the first but returns a boosted
+utilization signal which is a function of the sched_cfs_boost value.
+
+This function is used in the CFS scheduler code paths where schedutil needs to
+decide the OPP to run a CPU at. For example, this allows selecting the highest
+OPP for a CPU which has the boost value set to 100%.
+
+
+5. Per task group boosting
+==========================
+
+On battery powered devices there usually are many background services which are
+long running and need energy efficient scheduling. On the other hand, some
+applications are more performance sensitive and require an interactive
+response and/or maximum performance, regardless of the energy cost.
+
+To better service such scenarios, the SchedTune implementation has an extension
+that provides a more fine grained boosting interface.
+
+A new CGroup controller, namely "schedtune", can be enabled which allows to
+defined and configure task groups with different boosting values.
+Tasks that require special performance can be put into separate CGroups.
+The value of the boost associated with the tasks in this group can be specified
+using a single knob exposed by the CGroup controller:
+
+   schedtune.boost
+
+This knob allows the definition of a boost value that is to be used for
+SPC boosting of all tasks attached to this group.
+
+The current schedtune controller implementation is really simple and has these
+main characteristics:
+
+  1) It is only possible to create 1 level depth hierarchies
+
+     The root control groups define the system-wide boost value to be applied
+     by default to all tasks. Its direct subgroups are named "boost groups" and
+     they define the boost value for specific set of tasks.
+     Further nested subgroups are not allowed since they do not have a sensible
+     meaning from a user-space standpoint.
+
+  2) It is possible to define only a limited number of "boost groups"
+
+     This number is defined at compile time and by default configured to 16.
+     This is a design decision motivated by two main reasons:
+     a) In a real system we do not expect utilization scenarios with more than
+        a few boost groups. For example, a reasonable collection of groups could
+        be just "background", "interactive" and "performance".
+     b) It simplifies the implementation considerably, especially for the code
+	which has to compute the per CPU boosting once there are multiple
+        RUNNABLE tasks with different boost values.
+
+Such a simple design should allow servicing the main utilization scenarios
+identified so far. It provides a simple interface which can be used to manage
+the power-performance of all tasks or only selected tasks.
+Moreover, this interface can be easily integrated by user-space run-times (e.g.
+Android, ChromeOS) to implement a QoS solution for task boosting based on tasks
+classification, which has been a long standing requirement.
+
+Setup and usage
+---------------
+
+0. Use a kernel with CONFIG_SCHED_TUNE support enabled
+
+1. Check that the "schedtune" CGroup controller is available:
+
+   root@linaro-nano:~# cat /proc/cgroups
+   #subsys_name	hierarchy	num_cgroups	enabled
+   cpuset  	0		1		1
+   cpu     	0		1		1
+   schedtune	0		1		1
+
+2. Mount a tmpfs to create the CGroups mount point (Optional)
+
+   root@linaro-nano:~# sudo mount -t tmpfs cgroups /sys/fs/cgroup
+
+3. Mount the "schedtune" controller
+
+   root@linaro-nano:~# mkdir /sys/fs/cgroup/stune
+   root@linaro-nano:~# sudo mount -t cgroup -o schedtune stune /sys/fs/cgroup/stune
+
+4. Create task groups and configure their specific boost value (Optional)
+
+   For example here we create a "performance" boost group configure to boost
+   all its tasks to 100%
+
+   root@linaro-nano:~# mkdir /sys/fs/cgroup/stune/performance
+   root@linaro-nano:~# echo 100 > /sys/fs/cgroup/stune/performance/schedtune.boost
+
+5. Move tasks into the boost group
+
+   For example, the following moves the tasks with PID $TASKPID (and all its
+   threads) into the "performance" boost group.
+
+   root@linaro-nano:~# echo "TASKPID > /sys/fs/cgroup/stune/performance/cgroup.procs
+
+This simple configuration allows only the threads of the $TASKPID task to run,
+when needed, at the highest OPP in the most capable CPU of the system.
+
+
+6. Per-task wakeup-placement-strategy Selection
+===============================================
+
+Many devices have a number of CFS tasks in use which require an absolute
+minimum wakeup latency, and many tasks for which wakeup latency is not
+important.
+
+For touch-driven environments, removing additional wakeup latency can be
+critical.
+
+When you use the Schedtume CGroup controller, you have access to a second
+parameter which allows a group to be marked such that energy_aware task
+placement is bypassed for tasks belonging to that group.
+
+prefer_idle=0 (default - use energy-aware task placement if available)
+prefer_idle=1 (never use energy-aware task placement for these tasks)
+
+Since the regular wakeup task placement algorithm in CFS is biased for
+performance, this has the effect of restoring minimum wakeup latency
+for the desired tasks whilst still allowing energy-aware wakeup placement
+to save energy for other tasks.
+
+
+7. Question and Answers
+=======================
+
+What about "auto" mode?
+-----------------------
+
+The 'auto' mode as described in [5] can be implemented by interfacing SchedTune
+with some suitable user-space element. This element could use the exposed
+system-wide or cgroup based interface.
+
+How are multiple groups of tasks with different boost values managed?
+---------------------------------------------------------------------
+
+The current SchedTune implementation keeps track of the boosted RUNNABLE tasks
+on a CPU. The CPU utilization seen by schedutil (and used to select an
+appropriate OPP) is boosted with a value which is the maximum of the boost
+values of the currently RUNNABLE tasks in its RQ.
+
+This allows cpufreq to boost a CPU only while there are boosted tasks ready
+to run and switch back to the energy efficient mode as soon as the last boosted
+task is dequeued.
+
+
+8. References
+=============
+[1] http://lwn.net/Articles/552889
+[2] http://lkml.org/lkml/2012/5/18/91
+[3] https://lkml.org/lkml/2016/3/29/1041

diff --git a/MAINTAINERS b/MAINTAINERS
index d735500..b9f9da0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS

@@ -7340,7 +7340,7 @@
 M:	Rodrigo Vivi <rodrigo.vivi@intel.com>
 L:	intel-gfx@lists.freedesktop.org
 W:	https://01.org/linuxgraphics/
-B:	https://01.org/linuxgraphics/documentation/how-report-bugs
+B:	https://gitlab.freedesktop.org/drm/intel/-/wikis/How-to-file-i915-bugs
 C:	irc://chat.freenode.net/intel-gfx
 Q:	http://patchwork.freedesktop.org/project/intel-gfx/
 T:	git git://anongit.freedesktop.org/drm-intel

diff --git a/Makefile b/Makefile
index 004d964..cec5475 100644
--- a/Makefile
+++ b/Makefile

@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 VERSION = 4
 PATCHLEVEL = 19
-SUBLEVEL = 104
+SUBLEVEL = 108
 EXTRAVERSION =
 NAME = "People's Front"
 
@@ -391,7 +391,7 @@
 
 CHECKFLAGS     := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \
 		  -Wbitwise -Wno-return-void -Wno-unknown-attribute $(CF)
-NOSTDINC_FLAGS  =
+NOSTDINC_FLAGS :=
 CFLAGS_MODULE   =
 AFLAGS_MODULE   =
 LDFLAGS_MODULE  =
@@ -481,7 +481,7 @@
 	    $(srctree) $(objtree) $(VERSION) $(PATCHLEVEL)
 endif
 
-ifeq ($(cc-name),clang)
+ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep clang),)
 ifneq ($(CROSS_COMPILE),)
 CLANG_FLAGS	+= --target=$(notdir $(CROSS_COMPILE:%-=%))
 GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)elfedit))
@@ -507,9 +507,6 @@
 export RETPOLINE_CFLAGS
 export RETPOLINE_VDSO_CFLAGS
 
-KBUILD_CFLAGS	+= $(call cc-option,-fno-PIE)
-KBUILD_AFLAGS	+= $(call cc-option,-fno-PIE)
-
 # The expansion should be delayed until arch/$(SRCARCH)/Makefile is included.
 # Some architectures define CROSS_COMPILE in arch/$(SRCARCH)/Makefile.
 # CC_VERSION_TEXT is referenced from Kconfig (so it needs export),
@@ -596,6 +593,8 @@
 # Defaults to vmlinux, but the arch makefile usually adds further targets
 all: vmlinux
 
+KBUILD_CFLAGS	+= $(call cc-option,-fno-PIE)
+KBUILD_AFLAGS	+= $(call cc-option,-fno-PIE)
 CFLAGS_GCOV	:= -fprofile-arcs -ftest-coverage \
 	$(call cc-option,-fno-tree-loop-im) \
 	$(call cc-disable-warning,maybe-uninitialized,)
@@ -672,6 +671,12 @@
 # Tell gcc to never replace conditional load with a non-conditional one
 KBUILD_CFLAGS	+= $(call cc-option,--param=allow-store-data-races=0)
 
+# check for 'asm goto'
+ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC) $(KBUILD_CFLAGS)), y)
+	KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO
+	KBUILD_AFLAGS += -DCC_HAVE_ASM_GOTO
+endif
+
 include scripts/Makefile.kcov
 include scripts/Makefile.gcc-plugins
 
@@ -695,17 +700,19 @@
 
 KBUILD_CFLAGS += $(stackp-flags-y)
 
-ifeq ($(cc-name),clang)
-KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
-KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier)
-KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
+ifdef CONFIG_CC_IS_CLANG
+KBUILD_CPPFLAGS += -Qunused-arguments
+KBUILD_CFLAGS += -Wno-format-invalid-specifier
+KBUILD_CFLAGS += -Wno-gnu
+KBUILD_CFLAGS += -Wno-address-of-packed-member
+KBUILD_CFLAGS += -Wno-duplicate-decl-specifier
 # Quiet clang warning: comparison of unsigned expression < 0 is always false
-KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare)
+KBUILD_CFLAGS += -Wno-tautological-compare
+KBUILD_CFLAGS += -Wno-constant-conversion
 # CLANG uses a _MergedGlobals as optimization, but this breaks modpost, as the
 # source of a reference will be _MergedGlobals and not on of the whitelisted names.
 # See modpost pattern 2
-KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,)
-KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior)
+KBUILD_CFLAGS += -mno-global-merge
 else
 
 # These warnings generated too much noise in a regular build.

diff --git a/PRESUBMIT.cfg b/PRESUBMIT.cfg
new file mode 100644
index 0000000..4fb5526
--- /dev/null
+++ b/PRESUBMIT.cfg

@@ -0,0 +1,9 @@
+[Hook Overrides]
+checkpatch_check: true
+aosp_license_check: false
+cros_license_check: false
+long_line_check: false
+signoff_check: true
+stray_whitespace_check: false
+tab_check: false
+tabbed_indent_required_check: false

diff --git a/arch/Kconfig b/arch/Kconfig
index a336548..6801123 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig

@@ -71,7 +71,6 @@
 config JUMP_LABEL
        bool "Optimize very unlikely/likely branches"
        depends on HAVE_ARCH_JUMP_LABEL
-       depends on CC_HAS_ASM_GOTO
        help
          This option enables a transparent branch optimization that
 	 makes certain almost-always-true or almost-always-false branch

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 185e552..e2f7c50 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig

@@ -61,7 +61,7 @@
 	select HAVE_EBPF_JIT if !CPU_ENDIAN_BE32
 	select HAVE_CONTEXT_TRACKING
 	select HAVE_C_RECORDMCOUNT
-	select HAVE_DEBUG_KMEMLEAK
+	select HAVE_DEBUG_KMEMLEAK if !XIP_KERNEL
 	select HAVE_DMA_CONTIGUOUS if MMU
 	select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL) && !CPU_ENDIAN_BE32 && MMU
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE
@@ -2008,7 +2008,7 @@
 config KEXEC
 	bool "Kexec system call (EXPERIMENTAL)"
 	depends on (!SMP || PM_SLEEP_SMP)
-	depends on !CPU_V7M
+	depends on MMU
 	select KEXEC_CORE
 	help
 	  kexec is a system call that implements the ability to shutdown your

diff --git a/arch/arm/boot/dts/imx6qdl-zii-rdu2.dtsi b/arch/arm/boot/dts/imx6qdl-zii-rdu2.dtsi
index 315d0e7..bc5f2de 100644
--- a/arch/arm/boot/dts/imx6qdl-zii-rdu2.dtsi
+++ b/arch/arm/boot/dts/imx6qdl-zii-rdu2.dtsi

@@ -657,7 +657,7 @@
 	pinctrl-0 = <&pinctrl_usdhc2>;
 	bus-width = <4>;
 	cd-gpios = <&gpio2 2 GPIO_ACTIVE_LOW>;
-	wp-gpios = <&gpio2 3 GPIO_ACTIVE_HIGH>;
+	disable-wp;
 	vmmc-supply = <&reg_3p3v_sd>;
 	vqmmc-supply = <&reg_3p3v>;
 	no-1-8-v;
@@ -670,7 +670,7 @@
 	pinctrl-0 = <&pinctrl_usdhc3>;
 	bus-width = <4>;
 	cd-gpios = <&gpio2 0 GPIO_ACTIVE_LOW>;
-	wp-gpios = <&gpio2 1 GPIO_ACTIVE_HIGH>;
+	disable-wp;
 	vmmc-supply = <&reg_3p3v_sd>;
 	vqmmc-supply = <&reg_3p3v>;
 	no-1-8-v;
@@ -804,6 +804,7 @@
 &usbh1 {
 	vbus-supply = <&reg_5p0v_main>;
 	disable-over-current;
+	maximum-speed = "full-speed";
 	status = "okay";
 };
 
@@ -1081,7 +1082,6 @@
 			MX6QDL_PAD_SD2_DAT1__SD2_DATA1		0x17059
 			MX6QDL_PAD_SD2_DAT2__SD2_DATA2		0x17059
 			MX6QDL_PAD_SD2_DAT3__SD2_DATA3		0x17059
-			MX6QDL_PAD_NANDF_D3__GPIO2_IO03		0x40010040
 			MX6QDL_PAD_NANDF_D2__GPIO2_IO02		0x40010040
 		>;
 	};
@@ -1094,7 +1094,6 @@
 			MX6QDL_PAD_SD3_DAT1__SD3_DATA1		0x17059
 			MX6QDL_PAD_SD3_DAT2__SD3_DATA2		0x17059
 			MX6QDL_PAD_SD3_DAT3__SD3_DATA3		0x17059
-			MX6QDL_PAD_NANDF_D1__GPIO2_IO01		0x40010040
 			MX6QDL_PAD_NANDF_D0__GPIO2_IO00		0x40010040
 
 		>;

diff --git a/arch/arm/boot/dts/r8a7779.dtsi b/arch/arm/boot/dts/r8a7779.dtsi
index 0391971..f1c9b2b 100644
--- a/arch/arm/boot/dts/r8a7779.dtsi
+++ b/arch/arm/boot/dts/r8a7779.dtsi

@@ -68,6 +68,14 @@
 		      <0xf0000100 0x100>;
 	};
 
+	timer@f0000200 {
+		compatible = "arm,cortex-a9-global-timer";
+		reg = <0xf0000200 0x100>;
+		interrupts = <GIC_PPI 11
+			(GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
+		clocks = <&cpg_clocks R8A7779_CLK_ZS>;
+	};
+
 	timer@f0000600 {
 		compatible = "arm,cortex-a9-twd-timer";
 		reg = <0xf0000600 0x20>;

diff --git a/arch/arm/boot/dts/stihxxx-b2120.dtsi b/arch/arm/boot/dts/stihxxx-b2120.dtsi
index 4dedfcb..ac42d3c 100644
--- a/arch/arm/boot/dts/stihxxx-b2120.dtsi
+++ b/arch/arm/boot/dts/stihxxx-b2120.dtsi

@@ -45,7 +45,7 @@
 			/* DAC */
 			format = "i2s";
 			mclk-fs = <256>;
-			frame-inversion = <1>;
+			frame-inversion;
 			cpu {
 				sound-dai = <&sti_uni_player2>;
 			};

diff --git a/arch/arm/boot/dts/stm32f469-disco.dts b/arch/arm/boot/dts/stm32f469-disco.dts
index 3ee768c..eea979e 100644
--- a/arch/arm/boot/dts/stm32f469-disco.dts
+++ b/arch/arm/boot/dts/stm32f469-disco.dts

@@ -75,6 +75,13 @@
 		regulator-max-microvolt = <3300000>;
 	};
 
+	vdd_dsi: vdd-dsi {
+		compatible = "regulator-fixed";
+		regulator-name = "vdd_dsi";
+		regulator-min-microvolt = <3300000>;
+		regulator-max-microvolt = <3300000>;
+	};
+
 	soc {
 		dma-ranges = <0xc0000000 0x0 0x10000000>;
 	};
@@ -154,6 +161,7 @@
 		compatible = "orisetech,otm8009a";
 		reg = <0>; /* dsi virtual channel (0..3) */
 		reset-gpios = <&gpioh 7 GPIO_ACTIVE_LOW>;
+		power-supply = <&vdd_dsi>;
 		status = "okay";
 
 		port {

diff --git a/arch/arm/boot/dts/sun8i-h3.dtsi b/arch/arm/boot/dts/sun8i-h3.dtsi
index 9233ba3..11172fb 100644
--- a/arch/arm/boot/dts/sun8i-h3.dtsi
+++ b/arch/arm/boot/dts/sun8i-h3.dtsi

@@ -80,7 +80,7 @@
 			#cooling-cells = <2>;
 		};
 
-		cpu@1 {
+		cpu1: cpu@1 {
 			compatible = "arm,cortex-a7";
 			device_type = "cpu";
 			reg = <1>;
@@ -90,7 +90,7 @@
 			#cooling-cells = <2>;
 		};
 
-		cpu@2 {
+		cpu2: cpu@2 {
 			compatible = "arm,cortex-a7";
 			device_type = "cpu";
 			reg = <2>;
@@ -100,7 +100,7 @@
 			#cooling-cells = <2>;
 		};
 
-		cpu@3 {
+		cpu3: cpu@3 {
 			compatible = "arm,cortex-a7";
 			device_type = "cpu";
 			reg = <3>;
@@ -111,6 +111,15 @@
 		};
 	};
 
+	pmu {
+		compatible = "arm,cortex-a7-pmu";
+		interrupts = <GIC_SPI 120 IRQ_TYPE_LEVEL_HIGH>,
+			     <GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>,
+			     <GIC_SPI 122 IRQ_TYPE_LEVEL_HIGH>,
+			     <GIC_SPI 123 IRQ_TYPE_LEVEL_HIGH>;
+		interrupt-affinity = <&cpu0>, <&cpu1>, <&cpu2>, <&cpu3>;
+	};
+
 	timer {
 		compatible = "arm,armv7-timer";
 		interrupts = <GIC_PPI 13 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>,

diff --git a/arch/arm/kernel/jump_label.c b/arch/arm/kernel/jump_label.c
index 303b3ab..90bce3d 100644
--- a/arch/arm/kernel/jump_label.c
+++ b/arch/arm/kernel/jump_label.c

@@ -4,6 +4,8 @@
 #include <asm/patch.h>
 #include <asm/insn.h>
 
+#ifdef HAVE_JUMP_LABEL
+
 static void __arch_jump_label_transform(struct jump_entry *entry,
 					enum jump_label_type type,
 					bool is_static)
@@ -33,3 +35,5 @@
 {
 	__arch_jump_label_transform(entry, type, true);
 }
+
+#endif

diff --git a/arch/arm/mach-npcm/Kconfig b/arch/arm/mach-npcm/Kconfig
index 684c9c9..1d17515 100644
--- a/arch/arm/mach-npcm/Kconfig
+++ b/arch/arm/mach-npcm/Kconfig

@@ -10,7 +10,7 @@
 	depends on ARCH_MULTI_V7
 	select PINCTRL_NPCM7XX
 	select NPCM7XX_TIMER
-	select ARCH_REQUIRE_GPIOLIB
+	select GPIOLIB
 	select CACHE_L2X0
 	select ARM_GIC
 	select HAVE_ARM_TWD if SMP

diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi
index 72813e7..bd43912 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi
+++ b/arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi

@@ -69,6 +69,16 @@
 		clock-output-names = "osc32k";
 	};
 
+	pmu {
+		compatible = "arm,cortex-a53-pmu",
+			     "arm,armv8-pmuv3";
+		interrupts = <GIC_SPI 140 IRQ_TYPE_LEVEL_HIGH>,
+			     <GIC_SPI 141 IRQ_TYPE_LEVEL_HIGH>,
+			     <GIC_SPI 142 IRQ_TYPE_LEVEL_HIGH>,
+			     <GIC_SPI 143 IRQ_TYPE_LEVEL_HIGH>;
+		interrupt-affinity = <&cpu0>, <&cpu1>, <&cpu2>, <&cpu3>;
+	};
+
 	psci {
 		compatible = "arm,psci-0.2";
 		method = "smc";

diff --git a/arch/arm64/boot/dts/qcom/msm8996.dtsi b/arch/arm64/boot/dts/qcom/msm8996.dtsi
index 8c86c41..3e7baab 100644
--- a/arch/arm64/boot/dts/qcom/msm8996.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8996.dtsi

@@ -918,6 +918,8 @@
 				interrupts = <0 138 IRQ_TYPE_LEVEL_HIGH>;
 				phys = <&hsusb_phy2>;
 				phy-names = "usb2-phy";
+				snps,dis_u2_susphy_quirk;
+				snps,dis_enblslpm_quirk;
 			};
 		};
 
@@ -947,6 +949,8 @@
 				interrupts = <0 131 IRQ_TYPE_LEVEL_HIGH>;
 				phys = <&hsusb_phy1>, <&ssusb_phy_0>;
 				phy-names = "usb2-phy", "usb3-phy";
+				snps,dis_u2_susphy_quirk;
+				snps,dis_enblslpm_quirk;
 			};
 		};
 

diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h
index 4b650ec..887a851 100644
--- a/arch/arm64/include/asm/alternative.h
+++ b/arch/arm64/include/asm/alternative.h

@@ -35,13 +35,16 @@
 static inline void apply_alternatives_module(void *start, size_t length) { }
 #endif
 
-#define ALTINSTR_ENTRY(feature,cb)					      \
+#define ALTINSTR_ENTRY(feature)					              \
 	" .word 661b - .\n"				/* label           */ \
-	" .if " __stringify(cb) " == 0\n"				      \
 	" .word 663f - .\n"				/* new instruction */ \
-	" .else\n"							      \
+	" .hword " __stringify(feature) "\n"		/* feature bit     */ \
+	" .byte 662b-661b\n"				/* source len      */ \
+	" .byte 664f-663f\n"				/* replacement len */
+
+#define ALTINSTR_ENTRY_CB(feature, cb)					      \
+	" .word 661b - .\n"				/* label           */ \
 	" .word " __stringify(cb) "- .\n"		/* callback */	      \
-	" .endif\n"							      \
 	" .hword " __stringify(feature) "\n"		/* feature bit     */ \
 	" .byte 662b-661b\n"				/* source len      */ \
 	" .byte 664f-663f\n"				/* replacement len */
@@ -62,15 +65,14 @@
  *
  * Alternatives with callbacks do not generate replacement instructions.
  */
-#define __ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg_enabled, cb)	\
+#define __ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg_enabled)	\
 	".if "__stringify(cfg_enabled)" == 1\n"				\
 	"661:\n\t"							\
 	oldinstr "\n"							\
 	"662:\n"							\
 	".pushsection .altinstructions,\"a\"\n"				\
-	ALTINSTR_ENTRY(feature,cb)					\
+	ALTINSTR_ENTRY(feature)						\
 	".popsection\n"							\
-	" .if " __stringify(cb) " == 0\n"				\
 	".pushsection .altinstr_replacement, \"a\"\n"			\
 	"663:\n\t"							\
 	newinstr "\n"							\
@@ -78,17 +80,25 @@
 	".popsection\n\t"						\
 	".org	. - (664b-663b) + (662b-661b)\n\t"			\
 	".org	. - (662b-661b) + (664b-663b)\n"			\
-	".else\n\t"							\
+	".endif\n"
+
+#define __ALTERNATIVE_CFG_CB(oldinstr, feature, cfg_enabled, cb)	\
+	".if "__stringify(cfg_enabled)" == 1\n"				\
+	"661:\n\t"							\
+	oldinstr "\n"							\
+	"662:\n"							\
+	".pushsection .altinstructions,\"a\"\n"				\
+	ALTINSTR_ENTRY_CB(feature, cb)					\
+	".popsection\n"							\
 	"663:\n\t"							\
 	"664:\n\t"							\
-	".endif\n"							\
 	".endif\n"
 
 #define _ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg, ...)	\
-	__ALTERNATIVE_CFG(oldinstr, newinstr, feature, IS_ENABLED(cfg), 0)
+	__ALTERNATIVE_CFG(oldinstr, newinstr, feature, IS_ENABLED(cfg))
 
 #define ALTERNATIVE_CB(oldinstr, cb) \
-	__ALTERNATIVE_CFG(oldinstr, "NOT_AN_INSTRUCTION", ARM64_CB_PATCH, 1, cb)
+	__ALTERNATIVE_CFG_CB(oldinstr, ARM64_CB_PATCH, 1, cb)
 #else
 
 #include <asm/assembler.h>

diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 1375307..ac3126a 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c

@@ -42,9 +42,7 @@
 #define COMPAT_ELF_HWCAP_DEFAULT	\
 				(COMPAT_HWCAP_HALF|COMPAT_HWCAP_THUMB|\
 				 COMPAT_HWCAP_FAST_MULT|COMPAT_HWCAP_EDSP|\
-				 COMPAT_HWCAP_TLS|COMPAT_HWCAP_VFP|\
-				 COMPAT_HWCAP_VFPv3|COMPAT_HWCAP_VFPv4|\
-				 COMPAT_HWCAP_NEON|COMPAT_HWCAP_IDIV|\
+				 COMPAT_HWCAP_TLS|COMPAT_HWCAP_IDIV|\
 				 COMPAT_HWCAP_LPAE)
 unsigned int compat_elf_hwcap __read_mostly = COMPAT_ELF_HWCAP_DEFAULT;
 unsigned int compat_elf_hwcap2 __read_mostly;
@@ -1341,17 +1339,30 @@
 	{},
 };
 
-#define HWCAP_CAP(reg, field, s, min_value, cap_type, cap)	\
-	{							\
-		.desc = #cap,					\
-		.type = ARM64_CPUCAP_SYSTEM_FEATURE,		\
+
+#define HWCAP_CPUID_MATCH(reg, field, s, min_value)		\
 		.matches = has_cpuid_feature,			\
 		.sys_reg = reg,					\
 		.field_pos = field,				\
 		.sign = s,					\
 		.min_field_value = min_value,			\
+
+#define __HWCAP_CAP(name, cap_type, cap)			\
+		.desc = name,					\
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,		\
 		.hwcap_type = cap_type,				\
 		.hwcap = cap,					\
+
+#define HWCAP_CAP(reg, field, s, min_value, cap_type, cap)	\
+	{							\
+		__HWCAP_CAP(#cap, cap_type, cap)		\
+		HWCAP_CPUID_MATCH(reg, field, s, min_value)	\
+	}
+
+#define HWCAP_CAP_MATCH(match, cap_type, cap)			\
+	{							\
+		__HWCAP_CAP(#cap, cap_type, cap)		\
+		.matches = match,				\
 	}
 
 static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
@@ -1387,8 +1398,35 @@
 	{},
 };
 
+#ifdef CONFIG_COMPAT
+static bool compat_has_neon(const struct arm64_cpu_capabilities *cap, int scope)
+{
+	/*
+	 * Check that all of MVFR1_EL1.{SIMDSP, SIMDInt, SIMDLS} are available,
+	 * in line with that of arm32 as in vfp_init(). We make sure that the
+	 * check is future proof, by making sure value is non-zero.
+	 */
+	u32 mvfr1;
+
+	WARN_ON(scope == SCOPE_LOCAL_CPU && preemptible());
+	if (scope == SCOPE_SYSTEM)
+		mvfr1 = read_sanitised_ftr_reg(SYS_MVFR1_EL1);
+	else
+		mvfr1 = read_sysreg_s(SYS_MVFR1_EL1);
+
+	return cpuid_feature_extract_unsigned_field(mvfr1, MVFR1_SIMDSP_SHIFT) &&
+		cpuid_feature_extract_unsigned_field(mvfr1, MVFR1_SIMDINT_SHIFT) &&
+		cpuid_feature_extract_unsigned_field(mvfr1, MVFR1_SIMDLS_SHIFT);
+}
+#endif
+
 static const struct arm64_cpu_capabilities compat_elf_hwcaps[] = {
 #ifdef CONFIG_COMPAT
+	HWCAP_CAP_MATCH(compat_has_neon, CAP_COMPAT_HWCAP, COMPAT_HWCAP_NEON),
+	HWCAP_CAP(SYS_MVFR1_EL1, MVFR1_SIMDFMAC_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFPv4),
+	/* Arm v8 mandates MVFR0.FPDP == {0, 2}. So, piggy back on this for the presence of VFP support */
+	HWCAP_CAP(SYS_MVFR0_EL1, MVFR0_FPDP_SHIFT, FTR_UNSIGNED, 2, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFP),
+	HWCAP_CAP(SYS_MVFR0_EL1, MVFR0_FPDP_SHIFT, FTR_UNSIGNED, 2, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFPv3),
 	HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_AES_SHIFT, FTR_UNSIGNED, 2, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_PMULL),
 	HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_AES_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_AES),
 	HWCAP_CAP(SYS_ID_ISAR5_EL1, ID_ISAR5_SHA1_SHIFT, FTR_UNSIGNED, 1, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA1),

diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 58c53bc..14fdbaa 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c

@@ -218,6 +218,7 @@
 static void task_fpsimd_load(void)
 {
 	WARN_ON(!in_softirq() && !irqs_disabled());
+	WARN_ON(!system_supports_fpsimd());
 
 	if (system_supports_sve() && test_thread_flag(TIF_SVE))
 		sve_load_state(sve_pffr(&current->thread),
@@ -238,6 +239,7 @@
 	struct user_fpsimd_state *st = __this_cpu_read(fpsimd_last_state.st);
 	/* set by fpsimd_bind_task_to_cpu() or fpsimd_bind_state_to_cpu() */
 
+	WARN_ON(!system_supports_fpsimd());
 	WARN_ON(!in_softirq() && !irqs_disabled());
 
 	if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) {
@@ -977,6 +979,7 @@
 	struct fpsimd_last_state_struct *last =
 		this_cpu_ptr(&fpsimd_last_state);
 
+	WARN_ON(!system_supports_fpsimd());
 	last->st = &current->thread.uw.fpsimd_state;
 	current->thread.fpsimd_cpu = smp_processor_id();
 
@@ -996,6 +999,7 @@
 	struct fpsimd_last_state_struct *last =
 		this_cpu_ptr(&fpsimd_last_state);
 
+	WARN_ON(!system_supports_fpsimd());
 	WARN_ON(!in_softirq() && !irqs_disabled());
 
 	last->st = st;
@@ -1008,8 +1012,19 @@
  */
 void fpsimd_restore_current_state(void)
 {
-	if (!system_supports_fpsimd())
+	/*
+	 * For the tasks that were created before we detected the absence of
+	 * FP/SIMD, the TIF_FOREIGN_FPSTATE could be set via fpsimd_thread_switch(),
+	 * e.g, init. This could be then inherited by the children processes.
+	 * If we later detect that the system doesn't support FP/SIMD,
+	 * we must clear the flag for  all the tasks to indicate that the
+	 * FPSTATE is clean (as we can't have one) to avoid looping for ever in
+	 * do_notify_resume().
+	 */
+	if (!system_supports_fpsimd()) {
+		clear_thread_flag(TIF_FOREIGN_FPSTATE);
 		return;
+	}
 
 	local_bh_disable();
 
@@ -1028,7 +1043,7 @@
  */
 void fpsimd_update_current_state(struct user_fpsimd_state const *state)
 {
-	if (!system_supports_fpsimd())
+	if (WARN_ON(!system_supports_fpsimd()))
 		return;
 
 	local_bh_disable();
@@ -1055,6 +1070,7 @@
 
 void fpsimd_flush_cpu_state(void)
 {
+	WARN_ON(!system_supports_fpsimd());
 	__this_cpu_write(fpsimd_last_state.st, NULL);
 	set_thread_flag(TIF_FOREIGN_FPSTATE);
 }

diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c
index b90754a..e075641 100644
--- a/arch/arm64/kernel/jump_label.c
+++ b/arch/arm64/kernel/jump_label.c

@@ -20,6 +20,8 @@
 #include <linux/jump_label.h>
 #include <asm/insn.h>
 
+#ifdef HAVE_JUMP_LABEL
+
 void arch_jump_label_transform(struct jump_entry *entry,
 			       enum jump_label_type type)
 {
@@ -47,3 +49,5 @@
 	 * NOP needs to be replaced by a branch.
 	 */
 }
+
+#endif	/* HAVE_JUMP_LABEL */

diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index d668c13..d6a49bb 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c

@@ -414,6 +414,13 @@
 	if (unlikely(next->flags & PF_KTHREAD))
 		return;
 
+	/*
+	 * If all CPUs implement the SSBS extension, then we just need to
+	 * context-switch the PSTATE field.
+	 */
+	if (cpu_have_feature(cpu_feature(SSBS)))
+		return;
+
 	/* If the mitigation is enabled, then we leave SSBS clear. */
 	if ((arm64_get_ssbd_state() == ARM64_SSBD_FORCE_ENABLE) ||
 	    test_tsk_thread_flag(next, TIF_SSBD))

diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 6290a4e..f397893 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c

@@ -37,7 +37,15 @@
 /* Check whether the FP regs were dirtied while in the host-side run loop: */
 static bool __hyp_text update_fp_enabled(struct kvm_vcpu *vcpu)
 {
-	if (vcpu->arch.host_thread_info->flags & _TIF_FOREIGN_FPSTATE)
+	/*
+	 * When the system doesn't support FP/SIMD, we cannot rely on
+	 * the _TIF_FOREIGN_FPSTATE flag. However, we always inject an
+	 * abort on the very first access to FP and thus we should never
+	 * see KVM_ARM64_FP_ENABLED. For added safety, make sure we always
+	 * trap the accesses.
+	 */
+	if (!system_supports_fpsimd() ||
+	    vcpu->arch.host_thread_info->flags & _TIF_FOREIGN_FPSTATE)
 		vcpu->arch.flags &= ~(KVM_ARM64_FP_ENABLED |
 				      KVM_ARM64_FP_HOST);
 

diff --git a/arch/microblaze/kernel/cpu/cache.c b/arch/microblaze/kernel/cpu/cache.c
index 0bde47e..dcba538 100644
--- a/arch/microblaze/kernel/cpu/cache.c
+++ b/arch/microblaze/kernel/cpu/cache.c

@@ -92,7 +92,8 @@
 #define CACHE_LOOP_LIMITS(start, end, cache_line_length, cache_size)	\
 do {									\
 	int align = ~(cache_line_length - 1);				\
-	end = min(start + cache_size, end);				\
+	if (start <  UINT_MAX - cache_size)				\
+		end = min(start + cache_size, end);			\
 	start &= align;							\
 } while (0)
 

diff --git a/arch/mips/Makefile b/arch/mips/Makefile
index ad0a92f..39cab0e 100644
--- a/arch/mips/Makefile
+++ b/arch/mips/Makefile

@@ -128,7 +128,7 @@
 # clang's output will be based upon the build machine. So for clang we simply
 # unconditionally specify -EB or -EL as appropriate.
 #
-ifeq ($(cc-name),clang)
+ifdef CONFIG_CC_IS_CLANG
 cflags-$(CONFIG_CPU_BIG_ENDIAN)		+= -EB
 cflags-$(CONFIG_CPU_LITTLE_ENDIAN)	+= -EL
 else

diff --git a/arch/mips/kernel/jump_label.c b/arch/mips/kernel/jump_label.c
index ab94392..32e3168 100644
--- a/arch/mips/kernel/jump_label.c
+++ b/arch/mips/kernel/jump_label.c

@@ -16,6 +16,8 @@
 #include <asm/cacheflush.h>
 #include <asm/inst.h>
 
+#ifdef HAVE_JUMP_LABEL
+
 /*
  * Define parameters for the standard MIPS and the microMIPS jump
  * instruction encoding respectively:
@@ -68,3 +70,5 @@
 
 	mutex_unlock(&text_mutex);
 }
+
+#endif /* HAVE_JUMP_LABEL */

diff --git a/arch/mips/kernel/vpe.c b/arch/mips/kernel/vpe.c
index 0bef238..0d5f9c8 100644
--- a/arch/mips/kernel/vpe.c
+++ b/arch/mips/kernel/vpe.c

@@ -134,7 +134,7 @@
 {
 	list_del(&v->list);
 	if (v->load_addr)
-		release_progmem(v);
+		release_progmem(v->load_addr);
 	kfree(v);
 }
 

diff --git a/arch/mips/loongson64/loongson-3/platform.c b/arch/mips/loongson64/loongson-3/platform.c
index 25a97cc..0db4cc3 100644
--- a/arch/mips/loongson64/loongson-3/platform.c
+++ b/arch/mips/loongson64/loongson-3/platform.c

@@ -31,6 +31,9 @@
 			continue;
 
 		pdev = kzalloc(sizeof(struct platform_device), GFP_KERNEL);
+		if (!pdev)
+			return -ENOMEM;
+
 		pdev->name = loongson_sysconf.sensors[i].name;
 		pdev->id = loongson_sysconf.sensors[i].id;
 		pdev->dev.platform_data = &loongson_sysconf.sensors[i];

diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile
index c99fa1c..8b23bb1 100644
--- a/arch/mips/vdso/Makefile
+++ b/arch/mips/vdso/Makefile

@@ -12,7 +12,7 @@
 	$(filter -mno-loongson-%,$(KBUILD_CFLAGS)) \
 	-D__VDSO__
 
-ifeq ($(cc-name),clang)
+ifdef CONFIG_CC_IS_CLANG
 ccflags-vdso += $(filter --target=%,$(KBUILD_CFLAGS))
 endif
 

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 8954108..55ac368 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile

@@ -98,7 +98,7 @@
 endif
 endif
 
-ifneq ($(cc-name),clang)
+ifndef CONFIG_CC_IS_CLANG
   cflags-$(CONFIG_CPU_LITTLE_ENDIAN)	+= -mno-strict-align
 endif
 
@@ -179,7 +179,7 @@
 # Work around gcc code-gen bugs with -pg / -fno-omit-frame-pointer in gcc <= 4.8
 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=44199
 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52828
-ifneq ($(cc-name),clang)
+ifndef CONFIG_CC_IS_CLANG
 CC_FLAGS_FTRACE	+= $(call cc-ifversion, -lt, 0409, -mno-sched-epilog)
 endif
 endif

diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index d0609c1..95b2df1 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h

@@ -38,7 +38,7 @@
 void __trace_hcall_entry(unsigned long opcode, unsigned long *args);
 void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf);
 /* OPAL tracing */
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 extern struct static_key opal_tracepoint_key;
 #endif
 

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index af1f3d5..377d23f 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c

@@ -554,12 +554,6 @@
 
 		pci_iov_remove_virtfn(edev->physfn, pdn->vf_index);
 		edev->pdev = NULL;
-
-		/*
-		 * We have to set the VF PE number to invalid one, which is
-		 * required to plug the VF successfully.
-		 */
-		pdn->pe_number = IODA_INVALID_PE;
 #endif
 		if (rmv_data)
 			list_add(&edev->rmv_list, &rmv_data->edev_list);

diff --git a/arch/powerpc/kernel/jump_label.c b/arch/powerpc/kernel/jump_label.c
index 0080c5f..6472472 100644
--- a/arch/powerpc/kernel/jump_label.c
+++ b/arch/powerpc/kernel/jump_label.c

@@ -11,6 +11,7 @@
 #include <linux/jump_label.h>
 #include <asm/code-patching.h>
 
+#ifdef HAVE_JUMP_LABEL
 void arch_jump_label_transform(struct jump_entry *entry,
 			       enum jump_label_type type)
 {
@@ -21,3 +22,4 @@
 	else
 		patch_instruction(addr, PPC_INST_NOP);
 }
+#endif

diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index ab147a1..7cecc3b 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c

@@ -257,9 +257,22 @@
 				continue;
 
 #ifdef CONFIG_EEH
-			/* Release EEH device for the VF */
+			/*
+			 * Release EEH state for this VF. The PCI core
+			 * has already torn down the pci_dev for this VF, but
+			 * we're responsible to removing the eeh_dev since it
+			 * has the same lifetime as the pci_dn that spawned it.
+			 */
 			edev = pdn_to_eeh_dev(pdn);
 			if (edev) {
+				/*
+				 * We allocate pci_dn's for the totalvfs count,
+				 * but only only the vfs that were activated
+				 * have a configured PE.
+				 */
+				if (edev->pe)
+					eeh_rmv_from_parent_pe(edev);
+
 				pdn->edev = NULL;
 				kfree(edev);
 			}

diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index b3e8db3..57b3745 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c

@@ -200,14 +200,27 @@
 	 * normal/non-checkpointed stack pointer.
 	 */
 
+	unsigned long ret = tsk->thread.regs->gpr[1];
+
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	BUG_ON(tsk != current);
 
 	if (MSR_TM_ACTIVE(tsk->thread.regs->msr)) {
+		preempt_disable();
 		tm_reclaim_current(TM_CAUSE_SIGNAL);
 		if (MSR_TM_TRANSACTIONAL(tsk->thread.regs->msr))
-			return tsk->thread.ckpt_regs.gpr[1];
+			ret = tsk->thread.ckpt_regs.gpr[1];
+
+		/*
+		 * If we treclaim, we must clear the current thread's TM bits
+		 * before re-enabling preemption. Otherwise we might be
+		 * preempted and have the live MSR[TS] changed behind our back
+		 * (tm_recheckpoint_new_task() would recheckpoint). Besides, we
+		 * enter the signal handler in non-transactional state.
+		 */
+		tsk->thread.regs->msr &= ~MSR_TS_MASK;
+		preempt_enable();
 	}
 #endif
-	return tsk->thread.regs->gpr[1];
+	return ret;
 }

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 906b05c..06b4b82 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c

@@ -493,19 +493,11 @@
  */
 static int save_tm_user_regs(struct pt_regs *regs,
 			     struct mcontext __user *frame,
-			     struct mcontext __user *tm_frame, int sigret)
+			     struct mcontext __user *tm_frame, int sigret,
+			     unsigned long msr)
 {
-	unsigned long msr = regs->msr;
-
 	WARN_ON(tm_suspend_disabled);
 
-	/* Remove TM bits from thread's MSR.  The MSR in the sigcontext
-	 * just indicates to userland that we were doing a transaction, but we
-	 * don't want to return in transactional state.  This also ensures
-	 * that flush_fp_to_thread won't set TIF_RESTORE_TM again.
-	 */
-	regs->msr &= ~MSR_TS_MASK;
-
 	/* Save both sets of general registers */
 	if (save_general_regs(&current->thread.ckpt_regs, frame)
 	    || save_general_regs(regs, tm_frame))
@@ -916,6 +908,10 @@
 	int sigret;
 	unsigned long tramp;
 	struct pt_regs *regs = tsk->thread.regs;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/* Save the thread's msr before get_tm_stackpointer() changes it */
+	unsigned long msr = regs->msr;
+#endif
 
 	BUG_ON(tsk != current);
 
@@ -948,13 +944,13 @@
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	tm_frame = &rt_sf->uc_transact.uc_mcontext;
-	if (MSR_TM_ACTIVE(regs->msr)) {
+	if (MSR_TM_ACTIVE(msr)) {
 		if (__put_user((unsigned long)&rt_sf->uc_transact,
 			       &rt_sf->uc.uc_link) ||
 		    __put_user((unsigned long)tm_frame,
 			       &rt_sf->uc_transact.uc_regs))
 			goto badframe;
-		if (save_tm_user_regs(regs, frame, tm_frame, sigret))
+		if (save_tm_user_regs(regs, frame, tm_frame, sigret, msr))
 			goto badframe;
 	}
 	else
@@ -1365,6 +1361,10 @@
 	int sigret;
 	unsigned long tramp;
 	struct pt_regs *regs = tsk->thread.regs;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/* Save the thread's msr before get_tm_stackpointer() changes it */
+	unsigned long msr = regs->msr;
+#endif
 
 	BUG_ON(tsk != current);
 
@@ -1398,9 +1398,9 @@
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	tm_mctx = &frame->mctx_transact;
-	if (MSR_TM_ACTIVE(regs->msr)) {
+	if (MSR_TM_ACTIVE(msr)) {
 		if (save_tm_user_regs(regs, &frame->mctx, &frame->mctx_transact,
-				      sigret))
+				      sigret, msr))
 			goto badframe;
 	}
 	else

diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index b5933d72..b088b07 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c

@@ -196,7 +196,8 @@
 static long setup_tm_sigcontexts(struct sigcontext __user *sc,
 				 struct sigcontext __user *tm_sc,
 				 struct task_struct *tsk,
-				 int signr, sigset_t *set, unsigned long handler)
+				 int signr, sigset_t *set, unsigned long handler,
+				 unsigned long msr)
 {
 	/* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the
 	 * process never used altivec yet (MSR_VEC is zero in pt_regs of
@@ -211,12 +212,11 @@
 	elf_vrreg_t __user *tm_v_regs = sigcontext_vmx_regs(tm_sc);
 #endif
 	struct pt_regs *regs = tsk->thread.regs;
-	unsigned long msr = tsk->thread.regs->msr;
 	long err = 0;
 
 	BUG_ON(tsk != current);
 
-	BUG_ON(!MSR_TM_ACTIVE(regs->msr));
+	BUG_ON(!MSR_TM_ACTIVE(msr));
 
 	WARN_ON(tm_suspend_disabled);
 
@@ -226,13 +226,6 @@
 	 */
 	msr |= tsk->thread.ckpt_regs.msr & (MSR_FP | MSR_VEC | MSR_VSX);
 
-	/* Remove TM bits from thread's MSR.  The MSR in the sigcontext
-	 * just indicates to userland that we were doing a transaction, but we
-	 * don't want to return in transactional state.  This also ensures
-	 * that flush_fp_to_thread won't set TIF_RESTORE_TM again.
-	 */
-	regs->msr &= ~MSR_TS_MASK;
-
 #ifdef CONFIG_ALTIVEC
 	err |= __put_user(v_regs, &sc->v_regs);
 	err |= __put_user(tm_v_regs, &tm_sc->v_regs);
@@ -803,6 +796,10 @@
 	unsigned long newsp = 0;
 	long err = 0;
 	struct pt_regs *regs = tsk->thread.regs;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/* Save the thread's msr before get_tm_stackpointer() changes it */
+	unsigned long msr = regs->msr;
+#endif
 
 	BUG_ON(tsk != current);
 
@@ -820,7 +817,7 @@
 	err |= __put_user(0, &frame->uc.uc_flags);
 	err |= __save_altstack(&frame->uc.uc_stack, regs->gpr[1]);
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	if (MSR_TM_ACTIVE(regs->msr)) {
+	if (MSR_TM_ACTIVE(msr)) {
 		/* The ucontext_t passed to userland points to the second
 		 * ucontext_t (for transactional state) with its uc_link ptr.
 		 */
@@ -828,7 +825,8 @@
 		err |= setup_tm_sigcontexts(&frame->uc.uc_mcontext,
 					    &frame->uc_transact.uc_mcontext,
 					    tsk, ksig->sig, NULL,
-					    (unsigned long)ksig->ka.sa.sa_handler);
+					    (unsigned long)ksig->ka.sa.sa_handler,
+					    msr);
 	} else
 #endif
 	{

diff --git a/arch/powerpc/platforms/powernv/opal-tracepoints.c b/arch/powerpc/platforms/powernv/opal-tracepoints.c
index f16a435..1ab7d26 100644
--- a/arch/powerpc/platforms/powernv/opal-tracepoints.c
+++ b/arch/powerpc/platforms/powernv/opal-tracepoints.c

@@ -4,7 +4,7 @@
 #include <asm/trace.h>
 #include <asm/asm-prototypes.h>
 
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 struct static_key opal_tracepoint_key = STATIC_KEY_INIT;
 
 int opal_tracepoint_regfunc(void)

diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 74215eb..3f98158 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S

@@ -20,7 +20,7 @@
 	.section	".text"
 
 #ifdef CONFIG_TRACEPOINTS
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 #define OPAL_BRANCH(LABEL)					\
 	ARCH_STATIC_BRANCH(LABEL, opal_tracepoint_key)
 #else

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index ee63749..ecd211c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c

@@ -1552,6 +1552,10 @@
 
 	/* Reserve PE for each VF */
 	for (vf_index = 0; vf_index < num_vfs; vf_index++) {
+		int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index);
+		int vf_bus = pci_iov_virtfn_bus(pdev, vf_index);
+		struct pci_dn *vf_pdn;
+
 		if (pdn->m64_single_mode)
 			pe_num = pdn->pe_num_map[vf_index];
 		else
@@ -1564,13 +1568,11 @@
 		pe->pbus = NULL;
 		pe->parent_dev = pdev;
 		pe->mve_number = -1;
-		pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
-			   pci_iov_virtfn_devfn(pdev, vf_index);
+		pe->rid = (vf_bus << 8) | vf_devfn;
 
 		pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
 			hose->global_number, pdev->bus->number,
-			PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)),
-			PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num);
+			PCI_SLOT(vf_devfn), PCI_FUNC(vf_devfn), pe_num);
 
 		if (pnv_ioda_configure_pe(phb, pe)) {
 			/* XXX What do we do here ? */
@@ -1584,6 +1586,15 @@
 		list_add_tail(&pe->list, &phb->ioda.pe_list);
 		mutex_unlock(&phb->ioda.pe_list_mutex);
 
+		/* associate this pe to it's pdn */
+		list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) {
+			if (vf_pdn->busno == vf_bus &&
+			    vf_pdn->devfn == vf_devfn) {
+				vf_pdn->pe_number = pe_num;
+				break;
+			}
+		}
+
 		pnv_pci_ioda2_setup_dma_pe(phb, pe);
 	}
 }
@@ -3004,9 +3015,6 @@
 	struct pci_dn *pdn;
 	int mul, total_vfs;
 
-	if (!pdev->is_physfn || pci_dev_is_added(pdev))
-		return;
-
 	pdn = pci_get_pdn(pdev);
 	pdn->vfs_expanded = 0;
 	pdn->m64_single_mode = false;
@@ -3081,6 +3089,30 @@
 		res->end = res->start - 1;
 	}
 }
+
+static void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev)
+{
+	if (WARN_ON(pci_dev_is_added(pdev)))
+		return;
+
+	if (pdev->is_virtfn) {
+		struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev);
+
+		/*
+		 * VF PEs are single-device PEs so their pdev pointer needs to
+		 * be set. The pdev doesn't exist when the PE is allocated (in
+		 * (pcibios_sriov_enable()) so we fix it up here.
+		 */
+		pe->pdev = pdev;
+		WARN_ON(!(pe->flags & PNV_IODA_PE_VF));
+	} else if (pdev->is_physfn) {
+		/*
+		 * For PFs adjust their allocated IOV resources to match what
+		 * the PHB can support using it's M64 BAR table.
+		 */
+		pnv_pci_ioda_fixup_iov_resources(pdev);
+	}
+}
 #endif /* CONFIG_PCI_IOV */
 
 static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
@@ -3974,7 +4006,7 @@
 	ppc_md.pcibios_default_alignment = pnv_pci_default_alignment;
 
 #ifdef CONFIG_PCI_IOV
-	ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
+	ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov;
 	ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment;
 	ppc_md.pcibios_sriov_enable = pnv_pcibios_sriov_enable;
 	ppc_md.pcibios_sriov_disable = pnv_pcibios_sriov_disable;

diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index c846300..b6fa900 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c

@@ -820,24 +820,6 @@
 {
 	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
 	struct pnv_phb *phb = hose->private_data;
-#ifdef CONFIG_PCI_IOV
-	struct pnv_ioda_pe *pe;
-	struct pci_dn *pdn;
-
-	/* Fix the VF pdn PE number */
-	if (pdev->is_virtfn) {
-		pdn = pci_get_pdn(pdev);
-		WARN_ON(pdn->pe_number != IODA_INVALID_PE);
-		list_for_each_entry(pe, &phb->ioda.pe_list, list) {
-			if (pe->rid == ((pdev->bus->number << 8) |
-			    (pdev->devfn & 0xff))) {
-				pdn->pe_number = pe->pe_number;
-				pe->pdev = pdev;
-				break;
-			}
-		}
-	}
-#endif /* CONFIG_PCI_IOV */
 
 	if (phb && phb->dma_dev_setup)
 		phb->dma_dev_setup(phb, pdev);

diff --git a/arch/powerpc/platforms/pseries/hvCall.S b/arch/powerpc/platforms/pseries/hvCall.S
index 50dc942..d91412c 100644
--- a/arch/powerpc/platforms/pseries/hvCall.S
+++ b/arch/powerpc/platforms/pseries/hvCall.S

@@ -19,7 +19,7 @@
 	
 #ifdef CONFIG_TRACEPOINTS
 
-#ifndef CONFIG_JUMP_LABEL
+#ifndef HAVE_JUMP_LABEL
 	.section	".toc","aw"
 
 	.globl hcall_tracepoint_refcount
@@ -79,7 +79,7 @@
 	mr	r5,BUFREG;					\
 	__HCALL_INST_POSTCALL
 
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 #define HCALL_BRANCH(LABEL)					\
 	ARCH_STATIC_BRANCH(LABEL, hcall_tracepoint_key)
 #else

diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 49e3a88..751b51a 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c

@@ -833,7 +833,7 @@
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
 #ifdef CONFIG_TRACEPOINTS
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 struct static_key hcall_tracepoint_key = STATIC_KEY_INIT;
 
 int hcall_tracepoint_regfunc(void)

diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index e6c2e89..4bccde3 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile

@@ -63,7 +63,7 @@
 #
 cflags-$(CONFIG_FRAME_POINTER) += -fno-optimize-sibling-calls
 
-ifeq ($(call cc-option-yn,-mpacked-stack),y)
+ifeq ($(call cc-option-yn,-mpacked-stack -mbackchain -msoft-float),y)
 cflags-$(CONFIG_PACK_STACK)  += -mpacked-stack -D__PACK_STACK
 aflags-$(CONFIG_PACK_STACK)  += -D__PACK_STACK
 endif

diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c
index e68136c..0fc6522 100644
--- a/arch/s390/appldata/appldata_mem.c
+++ b/arch/s390/appldata/appldata_mem.c

@@ -63,6 +63,9 @@
 	u64 pgalloc;		/* page allocations */
 	u64 pgfault;		/* page faults (major+minor) */
 	u64 pgmajfault;		/* page faults (major only) */
+	u64 pgmajfault_s;	/* shmem page faults (major only) */
+	u64 pgmajfault_a;	/* anonymous page faults (major only) */
+	u64 pgmajfault_f;	/* file page faults (major only) */
 // <-- New in 2.6
 
 } __packed;
@@ -94,7 +97,11 @@
 	mem_data->pgalloc    = ev[PGALLOC_NORMAL];
 	mem_data->pgalloc    += ev[PGALLOC_DMA];
 	mem_data->pgfault    = ev[PGFAULT];
-	mem_data->pgmajfault = ev[PGMAJFAULT];
+	mem_data->pgmajfault =
+		ev[PGMAJFAULT_S] + ev[PGMAJFAULT_A] + ev[PGMAJFAULT_F];
+	mem_data->pgmajfault_s = ev[PGMAJFAULT_S];
+	mem_data->pgmajfault_a = ev[PGMAJFAULT_A];
+	mem_data->pgmajfault_f = ev[PGMAJFAULT_F];
 
 	si_meminfo(&val);
 	mem_data->sharedram = val.sharedram;

diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index ac3c86b..349b1c1 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h

@@ -42,7 +42,7 @@
 
 static inline void storage_key_init_range(unsigned long start, unsigned long end)
 {
-	if (PAGE_DEFAULT_KEY)
+	if (PAGE_DEFAULT_KEY != 0)
 		__storage_key_init_range(start, end);
 }
 

diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h
index 2dc9eb4..b6a4ce9 100644
--- a/arch/s390/include/asm/timex.h
+++ b/arch/s390/include/asm/timex.h

@@ -155,7 +155,7 @@
 
 static inline unsigned long long get_tod_clock(void)
 {
-	unsigned char clk[STORE_CLOCK_EXT_SIZE];
+	char clk[STORE_CLOCK_EXT_SIZE];
 
 	get_tod_clock_ext(clk);
 	return *((unsigned long long *)&clk[1]);

diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 762fc453..b524c15 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile

@@ -46,7 +46,7 @@
 obj-y	:= traps.o time.o process.o base.o early.o setup.o idle.o vtime.o
 obj-y	+= processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
 obj-y	+= debug.o irq.o ipl.o dis.o diag.o vdso.o early_nobss.o
-obj-y	+= sysinfo.o lgr.o os_info.o machine_kexec.o pgm_check.o
+obj-y	+= sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o
 obj-y	+= runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o
 obj-y	+= entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o
 obj-y	+= nospec-branch.o
@@ -70,7 +70,6 @@
 obj-$(CONFIG_FUNCTION_TRACER)	+= mcount.o ftrace.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
 obj-$(CONFIG_UPROBES)		+= uprobes.o
-obj-$(CONFIG_JUMP_LABEL)	+= jump_label.o
 
 obj-$(CONFIG_KEXEC_FILE)	+= machine_kexec_file.o kexec_image.o
 obj-$(CONFIG_KEXEC_FILE)	+= kexec_elf.o

diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c
index 68f415e..43f8430 100644
--- a/arch/s390/kernel/jump_label.c
+++ b/arch/s390/kernel/jump_label.c

@@ -10,6 +10,8 @@
 #include <linux/jump_label.h>
 #include <asm/ipl.h>
 
+#ifdef HAVE_JUMP_LABEL
+
 struct insn {
 	u16 opcode;
 	s32 offset;
@@ -100,3 +102,5 @@
 {
 	__jump_label_transform(entry, type, 1);
 }
+
+#endif

diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S
index e93fbf0..83afd5b 100644
--- a/arch/s390/kernel/mcount.S
+++ b/arch/s390/kernel/mcount.S

@@ -25,6 +25,12 @@
 #define STACK_PTREGS	  (STACK_FRAME_OVERHEAD)
 #define STACK_PTREGS_GPRS (STACK_PTREGS + __PT_GPRS)
 #define STACK_PTREGS_PSW  (STACK_PTREGS + __PT_PSW)
+#ifdef __PACK_STACK
+/* allocate just enough for r14, r15 and backchain */
+#define TRACED_FUNC_FRAME_SIZE	24
+#else
+#define TRACED_FUNC_FRAME_SIZE	STACK_FRAME_OVERHEAD
+#endif
 
 ENTRY(_mcount)
 	BR_EX	%r14
@@ -38,9 +44,16 @@
 #if !(defined(CC_USING_HOTPATCH) || defined(CC_USING_NOP_MCOUNT))
 	aghi	%r0,MCOUNT_RETURN_FIXUP
 #endif
-	aghi	%r15,-STACK_FRAME_SIZE
+	# allocate stack frame for ftrace_caller to contain traced function
+	aghi	%r15,-TRACED_FUNC_FRAME_SIZE
 	stg	%r1,__SF_BACKCHAIN(%r15)
+	stg	%r0,(__SF_GPRS+8*8)(%r15)
+	stg	%r15,(__SF_GPRS+9*8)(%r15)
+	# allocate pt_regs and stack frame for ftrace_trace_function
+	aghi	%r15,-STACK_FRAME_SIZE
 	stg	%r1,(STACK_PTREGS_GPRS+15*8)(%r15)
+	aghi	%r1,-TRACED_FUNC_FRAME_SIZE
+	stg	%r1,__SF_BACKCHAIN(%r15)
 	stg	%r0,(STACK_PTREGS_PSW+8)(%r15)
 	stmg	%r2,%r14,(STACK_PTREGS_GPRS+2*8)(%r15)
 #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 05ea466..3515f2b 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c

@@ -2109,7 +2109,7 @@
 		return -EINVAL;
 
 	if (!test_kvm_facility(kvm, 72))
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 
 	mutex_lock(&fi->ais_lock);
 	ais.simm = fi->simm;
@@ -2412,7 +2412,7 @@
 	int ret = 0;
 
 	if (!test_kvm_facility(kvm, 72))
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 
 	if (copy_from_user(&req, (void __user *)attr->addr, sizeof(req)))
 		return -EFAULT;
@@ -2492,7 +2492,7 @@
 	struct kvm_s390_ais_all ais;
 
 	if (!test_kvm_facility(kvm, 72))
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 
 	if (copy_from_user(&ais, (void __user *)attr->addr, sizeof(ais)))
 		return -EFAULT;

diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c
index 430c14b..0e11fc0 100644
--- a/arch/s390/pci/pci_sysfs.c
+++ b/arch/s390/pci/pci_sysfs.c

@@ -13,6 +13,8 @@
 #include <linux/stat.h>
 #include <linux/pci.h>
 
+#include "../../../drivers/pci/pci.h"
+
 #include <asm/sclp.h>
 
 #define zpci_attr(name, fmt, member)					\
@@ -40,31 +42,50 @@
 static ssize_t recover_store(struct device *dev, struct device_attribute *attr,
 			     const char *buf, size_t count)
 {
+	struct kernfs_node *kn;
 	struct pci_dev *pdev = to_pci_dev(dev);
 	struct zpci_dev *zdev = to_zpci(pdev);
-	int ret;
+	int ret = 0;
 
-	if (!device_remove_file_self(dev, attr))
-		return count;
+	/* Can't use device_remove_self() here as that would lead us to lock
+	 * the pci_rescan_remove_lock while holding the device' kernfs lock.
+	 * This would create a possible deadlock with disable_slot() which is
+	 * not directly protected by the device' kernfs lock but takes it
+	 * during the device removal which happens under
+	 * pci_rescan_remove_lock.
+	 *
+	 * This is analogous to sdev_store_delete() in
+	 * drivers/scsi/scsi_sysfs.c
+	 */
+	kn = sysfs_break_active_protection(&dev->kobj, &attr->attr);
+	WARN_ON_ONCE(!kn);
+	/* device_remove_file() serializes concurrent calls ignoring all but
+	 * the first
+	 */
+	device_remove_file(dev, attr);
 
+	/* A concurrent call to recover_store() may slip between
+	 * sysfs_break_active_protection() and the sysfs file removal.
+	 * Once it unblocks from pci_lock_rescan_remove() the original pdev
+	 * will already be removed.
+	 */
 	pci_lock_rescan_remove();
-	pci_stop_and_remove_bus_device(pdev);
-	ret = zpci_disable_device(zdev);
-	if (ret)
-		goto error;
+	if (pci_dev_is_added(pdev)) {
+		pci_stop_and_remove_bus_device(pdev);
+		ret = zpci_disable_device(zdev);
+		if (ret)
+			goto out;
 
-	ret = zpci_enable_device(zdev);
-	if (ret)
-		goto error;
-
-	pci_rescan_bus(zdev->bus);
+		ret = zpci_enable_device(zdev);
+		if (ret)
+			goto out;
+		pci_rescan_bus(zdev->bus);
+	}
+out:
 	pci_unlock_rescan_remove();
-
-	return count;
-
-error:
-	pci_unlock_rescan_remove();
-	return ret;
+	if (kn)
+		sysfs_unbreak_active_protection(kn);
+	return ret ? ret : count;
 }
 static DEVICE_ATTR_WO(recover);
 

diff --git a/arch/sh/include/cpu-sh2a/cpu/sh7269.h b/arch/sh/include/cpu-sh2a/cpu/sh7269.h
index d516e5d..b887cc4 100644
--- a/arch/sh/include/cpu-sh2a/cpu/sh7269.h
+++ b/arch/sh/include/cpu-sh2a/cpu/sh7269.h

@@ -78,8 +78,15 @@
 	GPIO_FN_WDTOVF,
 
 	/* CAN */
-	GPIO_FN_CTX1, GPIO_FN_CRX1, GPIO_FN_CTX0, GPIO_FN_CTX0_CTX1,
-	GPIO_FN_CRX0, GPIO_FN_CRX0_CRX1, GPIO_FN_CRX0_CRX1_CRX2,
+	GPIO_FN_CTX2, GPIO_FN_CRX2,
+	GPIO_FN_CTX1, GPIO_FN_CRX1,
+	GPIO_FN_CTX0, GPIO_FN_CRX0,
+	GPIO_FN_CTX0_CTX1, GPIO_FN_CRX0_CRX1,
+	GPIO_FN_CTX0_CTX1_CTX2, GPIO_FN_CRX0_CRX1_CRX2,
+	GPIO_FN_CTX2_PJ21, GPIO_FN_CRX2_PJ20,
+	GPIO_FN_CTX1_PJ23, GPIO_FN_CRX1_PJ22,
+	GPIO_FN_CTX0_CTX1_PJ23, GPIO_FN_CRX0_CRX1_PJ22,
+	GPIO_FN_CTX0_CTX1_CTX2_PJ21, GPIO_FN_CRX0_CRX1_CRX2_PJ20,
 
 	/* DMAC */
 	GPIO_FN_TEND0, GPIO_FN_DACK0, GPIO_FN_DREQ0,

diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index 97c0e19..cf86408 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile

@@ -118,4 +118,4 @@
 obj-$(CONFIG_SPARC64)	+= $(pc--y)
 
 obj-$(CONFIG_UPROBES)	+= uprobes.o
-obj-$(CONFIG_JUMP_LABEL) += jump_label.o
+obj-$(CONFIG_SPARC64)	+= jump_label.o

diff --git a/arch/sparc/kernel/jump_label.c b/arch/sparc/kernel/jump_label.c
index a4cfaee..7f8eac5 100644
--- a/arch/sparc/kernel/jump_label.c
+++ b/arch/sparc/kernel/jump_label.c

@@ -9,6 +9,8 @@
 
 #include <asm/cacheflush.h>
 
+#ifdef HAVE_JUMP_LABEL
+
 void arch_jump_label_transform(struct jump_entry *entry,
 			       enum jump_label_type type)
 {
@@ -45,3 +47,5 @@
 	flushi(insn);
 	mutex_unlock(&text_mutex);
 }
+
+#endif

diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index 61afd78..59b6df1 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S

@@ -172,12 +172,14 @@
 	}
 	PERCPU_SECTION(SMP_CACHE_BYTES)
 
-#ifdef CONFIG_JUMP_LABEL
 	. = ALIGN(PAGE_SIZE);
 	.exit.text : {
 		EXIT_TEXT
 	}
-#endif
+
+	.exit.data : {
+		EXIT_DATA
+	}
 
 	. = ALIGN(PAGE_SIZE);
 	__init_end = .;

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index af35f5c..fbbe59a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig

@@ -204,6 +204,7 @@
 	select USER_STACKTRACE_SUPPORT
 	select VIRT_TO_BUS
 	select X86_FEATURE_NAMES		if PROC_FS
+	select ARCH_HAS_ALT_SYSCALL		if X86_64
 
 config INSTRUCTION_DECODER
 	def_bool y
@@ -408,6 +409,17 @@
 
 	  If in doubt, say Y.
 
+config X86_FAST_FEATURE_TESTS
+	bool "Fast CPU feature tests" if EMBEDDED
+	default y
+	---help---
+	  Some fast-paths in the kernel depend on the capabilities of the CPU.
+	  Say Y here for the kernel to patch in the appropriate code at runtime
+	  based on the capabilities of the CPU. The infrastructure for patching
+	  code at runtime takes up some additional space; space-constrained
+	  embedded systems may wish to say N here to produce smaller, slightly
+	  slower code.
+
 config X86_X2APIC
 	bool "Support x2apic"
 	depends on X86_LOCAL_APIC && X86_64 && (IRQ_REMAP || HYPERVISOR_GUEST)

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 4833dd7..5a38de8 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile

@@ -306,10 +306,6 @@
 
 archprepare: checkbin
 checkbin:
-ifndef CONFIG_CC_HAS_ASM_GOTO
-	@echo Compiler lacks asm-goto support.
-	@exit 1
-endif
 ifdef CONFIG_RETPOLINE
 ifeq ($(RETPOLINE_CFLAGS),)
 	@echo "You are building kernel with non-retpoline compiler." >&2

diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 466f66c..de1e6e6 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile

@@ -28,6 +28,7 @@
 
 KBUILD_CFLAGS := -m$(BITS) -O2
 KBUILD_CFLAGS += -fno-strict-aliasing $(call cc-option, -fPIE, -fPIC)
+KBUILD_CFLAGS += -fomit-frame-pointer
 KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
 cflags-$(CONFIG_X86_32) := -march=i386
 cflags-$(CONFIG_X86_64) := -mcmodel=small

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 31fbb4a..9f912dc9 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h

@@ -339,7 +339,7 @@
  */
 .macro CALL_enter_from_user_mode
 #ifdef CONFIG_CONTEXT_TRACKING
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 	STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0
 #endif
 	call enter_from_user_mode

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 8353348..ad35f51 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c

@@ -288,10 +288,17 @@
 	 * regs->orig_ax, which changes the behavior of some syscalls.
 	 */
 	nr &= __SYSCALL_MASK;
+#ifdef CONFIG_ALT_SYSCALL
+	if (likely(nr < ti->nr_syscalls)) {
+		nr = array_index_nospec(nr, ti->nr_syscalls);
+		regs->ax = ti->sys_call_table[nr](regs);
+	}
+#else
 	if (likely(nr < NR_syscalls)) {
 		nr = array_index_nospec(nr, NR_syscalls);
 		regs->ax = sys_call_table[nr](regs);
 	}
+#endif
 
 	syscall_return_slowpath(regs);
 }
@@ -323,6 +330,12 @@
 		nr = syscall_trace_enter(regs);
 	}
 
+#ifdef CONFIG_ALT_SYSCALL
+	if (likely(nr < ti->ia32_nr_syscalls)) {
+		nr = array_index_nospec(nr, ti->ia32_nr_syscalls);
+		regs->ax = ti->ia32_sys_call_table[nr](regs);
+	}
+#else
 	if (likely(nr < IA32_NR_syscalls)) {
 		nr = array_index_nospec(nr, IA32_NR_syscalls);
 #ifdef CONFIG_IA32_EMULATION
@@ -340,6 +353,7 @@
 			(unsigned int)regs->di, (unsigned int)regs->bp);
 #endif /* CONFIG_IA32_EMULATION */
 	}
+#endif
 
 	syscall_return_slowpath(regs);
 }

diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
index 42d4c89..ddff0ca 100644
--- a/arch/x86/entry/vdso/vdso32-setup.c
+++ b/arch/x86/entry/vdso/vdso32-setup.c

@@ -11,6 +11,7 @@
 #include <linux/smp.h>
 #include <linux/kernel.h>
 #include <linux/mm_types.h>
+#include <linux/elf.h>
 
 #include <asm/processor.h>
 #include <asm/vdso.h>

diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index defb536..c3ec535 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c

@@ -245,6 +245,7 @@
 	[PERF_COUNT_HW_CPU_CYCLES]		= 0x0076,
 	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
 	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0xff60,
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x0964,
 	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c2,
 	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c3,
 	[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]	= 0x0287,

diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index e91814d..79caeba 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c

@@ -1402,6 +1402,8 @@
 	old = ((s64)(prev_raw_count << shift) >> shift);
 	local64_add(new - old + count * period, &event->count);
 
+	local64_set(&hwc->period_left, -new);
+
 	perf_event_update_userpage(event);
 
 	return 0;

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 68889ac..0ecc9ba 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h

@@ -140,20 +140,7 @@
 
 #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
 
-#if defined(__clang__) && !defined(CONFIG_CC_HAS_ASM_GOTO)
-
-/*
- * Workaround for the sake of BPF compilation which utilizes kernel
- * headers, but clang does not support ASM GOTO and fails the build.
- */
-#ifndef __BPF_TRACING__
-#warning "Compiler lacks ASM_GOTO support. Add -D __BPF_TRACING__ to your compiler arguments"
-#endif
-
-#define static_cpu_has(bit)            boot_cpu_has(bit)
-
-#else
-
+#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
 /*
  * Static testing of CPU features.  Used the same as boot_cpu_has().
  * These will statically patch the target code for additional
@@ -209,6 +196,12 @@
 		boot_cpu_has(bit) :				\
 		_static_cpu_has(bit)				\
 )
+#else
+/*
+ * Fall back to dynamic for gcc versions which don't support asm goto. Should be
+ * a minority now anyway.
+ */
+#define static_cpu_has(bit)		boot_cpu_has(bit)
 #endif
 
 #define cpu_has_bug(c, bit)		cpu_has(c, (bit))

diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 7010e1c..8c0de42 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h

@@ -2,6 +2,19 @@
 #ifndef _ASM_X86_JUMP_LABEL_H
 #define _ASM_X86_JUMP_LABEL_H
 
+#ifndef HAVE_JUMP_LABEL
+/*
+ * For better or for worse, if jump labels (the gcc extension) are missing,
+ * then the entire static branch patching infrastructure is compiled out.
+ * If that happens, the code in here will malfunction.  Raise a compiler
+ * error instead.
+ *
+ * In theory, jump labels and the static branch patching infrastructure
+ * could be decoupled to fix this.
+ */
+#error asm/jump_label.h included on a non-jump-label kernel
+#endif
+
 #define JUMP_LABEL_NOP_SIZE 5
 
 #ifdef CONFIG_X86_64

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 21a58fc..067288d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h

@@ -1040,7 +1040,7 @@
 	void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
 	void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
 	void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
-	void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
+	int (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
 	int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 0f4feee..d2c25a1 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h

@@ -455,6 +455,8 @@
 #define MSR_K7_HWCR			0xc0010015
 #define MSR_K7_HWCR_SMMLOCK_BIT		0
 #define MSR_K7_HWCR_SMMLOCK		BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT)
+#define MSR_K7_HWCR_IRPERF_EN_BIT	30
+#define MSR_K7_HWCR_IRPERF_EN		BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT)
 #define MSR_K7_FID_VID_CTL		0xc0010041
 #define MSR_K7_FID_VID_STATUS		0xc0010042
 

diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 75ded1d..9d5d949 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h

@@ -41,7 +41,6 @@
 	struct list_head	list;
 	nmi_handler_t		handler;
 	u64			max_duration;
-	struct irq_work		irq_work;
 	unsigned long		flags;
 	const char		*name;
 };

diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
index 033dc7c..4914a3e 100644
--- a/arch/x86/include/asm/rmwcc.h
+++ b/arch/x86/include/asm/rmwcc.h

@@ -4,7 +4,7 @@
 
 #define __CLOBBERS_MEM(clb...)	"memory", ## clb
 
-#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CONFIG_CC_HAS_ASM_GOTO)
+#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO)
 
 /* Use asm goto */
 
@@ -21,7 +21,7 @@
 #define __BINARY_RMWcc_ARG	" %1, "
 
 
-#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CONFIG_CC_HAS_ASM_GOTO) */
+#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
 
 /* Use flags output or a set instruction */
 
@@ -36,7 +36,7 @@
 
 #define __BINARY_RMWcc_ARG	" %2, "
 
-#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CONFIG_CC_HAS_ASM_GOTO) */
+#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
 
 #define GEN_UNARY_RMWcc(op, var, arg0, cc)				\
 	__GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM())

diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index d653139..dc4418f 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h

@@ -33,6 +33,7 @@
 #define ia32_sys_call_table sys_call_table
 #define __NR_syscall_compat_max __NR_syscall_max
 #define IA32_NR_syscalls NR_syscalls
+#define ia32_nr_syscalls nr_syscalls
 #endif
 
 #if defined(CONFIG_IA32_EMULATION)

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 82b73b7..7ca78ae 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h

@@ -50,17 +50,52 @@
  */
 #ifndef __ASSEMBLY__
 struct task_struct;
+
+/* same as sys_call_ptr_t from asm/syscall.h */
+typedef asmlinkage long (*ti_sys_call_ptr_t)(const struct pt_regs *);
+
 #include <asm/cpufeature.h>
 #include <linux/atomic.h>
 
 struct thread_info {
 	unsigned long		flags;		/* low level flags */
 	u32			status;		/* thread synchronous flags */
+#ifdef CONFIG_ALT_SYSCALL
+	/*
+	 * This uses nr_syscalls instead of nr_syscall_max because we want
+	 * to be able to entirely disable a syscall table (e.g. compat) by
+	 * setting nr_syscalls to 0. This requires some careful work in
+	 * the syscall entry assembly code, most variations use ..._max.
+	 */
+	unsigned int		nr_syscalls;	/* size of below */
+	const ti_sys_call_ptr_t	*sys_call_table;
+# ifdef CONFIG_IA32_EMULATION
+	unsigned int		ia32_nr_syscalls;	/* size of below */
+	const ti_sys_call_ptr_t	*ia32_sys_call_table;
+# endif
+#endif
 };
 
+#ifdef CONFIG_ALT_SYSCALL
+# ifdef CONFIG_IA32_EMULATION
+#  define INIT_THREAD_INFO_SYSCALL_COMPAT			\
+	.ia32_nr_syscalls	= IA32_NR_syscalls,		\
+	.ia32_sys_call_table	= ia32_sys_call_table,
+# else
+#  define INIT_THREAD_INFO_SYSCALL_COMPAT /* */
+# endif
+# define INIT_THREAD_INFO_SYSCALL \
+	.nr_syscalls	= NR_syscalls,		\
+	.sys_call_table	= sys_call_table,	\
+	INIT_THREAD_INFO_SYSCALL_COMPAT
+#else
+# define INIT_THREAD_INFO_SYSCALL /* */
+#endif
+
 #define INIT_THREAD_INFO(tsk)			\
 {						\
 	.flags		= 0,			\
+	INIT_THREAD_INFO_SYSCALL		\
 }
 
 #else /* !__ASSEMBLY__ */

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index da0b6bc..b7661a3 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile

@@ -49,8 +49,7 @@
 obj-y			+= traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y			+= time.o ioport.o dumpstack.o nmi.o
 obj-$(CONFIG_MODIFY_LDT_SYSCALL)	+= ldt.o
-obj-y			+= setup.o x86_init.o i8259.o irqinit.o
-obj-$(CONFIG_JUMP_LABEL)	+= jump_label.o
+obj-y			+= setup.o x86_init.o i8259.o irqinit.o jump_label.o
 obj-$(CONFIG_IRQ_WORK)  += irq_work.o
 obj-y			+= probe_roms.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o
@@ -140,6 +139,8 @@
 obj-$(CONFIG_UNWINDER_FRAME_POINTER)	+= unwind_frame.o
 obj-$(CONFIG_UNWINDER_GUESS)		+= unwind_guess.o
 
+obj-$(CONFIG_ALT_SYSCALL)		+= alt-syscall.o
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)

diff --git a/arch/x86/kernel/alt-syscall.c b/arch/x86/kernel/alt-syscall.c
new file mode 100644
index 0000000..09e7ed7
--- /dev/null
+++ b/arch/x86/kernel/alt-syscall.c

@@ -0,0 +1,70 @@
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/stddef.h>
+#include <linux/syscalls.h>
+#include <linux/alt-syscall.h>
+
+#include <asm/syscall.h>
+#include <asm/syscalls.h>
+
+int arch_dup_sys_call_table(struct alt_sys_call_table *entry)
+{
+	if (!entry)
+		return -EINVAL;
+	/* Table already allocated. */
+	if (entry->table)
+		return -EINVAL;
+#ifdef CONFIG_IA32_EMULATION
+	if (entry->compat_table)
+		return -EINVAL;
+#endif
+	entry->size = NR_syscalls;
+	entry->table = kcalloc(entry->size, sizeof(sys_call_ptr_t),
+			       GFP_KERNEL);
+	if (!entry->table)
+		goto failed;
+
+	memcpy(entry->table, sys_call_table,
+	       entry->size * sizeof(sys_call_ptr_t));
+
+#ifdef CONFIG_IA32_EMULATION
+	entry->compat_size = IA32_NR_syscalls;
+	entry->compat_table = kcalloc(entry->compat_size,
+				      sizeof(sys_call_ptr_t), GFP_KERNEL);
+	if (!entry->compat_table)
+		goto failed;
+	memcpy(entry->compat_table, ia32_sys_call_table,
+	       entry->compat_size * sizeof(sys_call_ptr_t));
+#endif
+
+	return 0;
+
+failed:
+	entry->size = 0;
+	kfree(entry->table);
+	entry->table = NULL;
+#ifdef CONFIG_IA32_EMULATION
+	entry->compat_size = 0;
+#endif
+	return -ENOMEM;
+}
+
+/* Operates on "current", which isn't racey, since it's _in_ a syscall. */
+int arch_set_sys_call_table(struct alt_sys_call_table *entry)
+{
+	if (!entry)
+		return -EINVAL;
+
+	current_thread_info()->nr_syscalls = entry->size;
+	current_thread_info()->sys_call_table = entry->table;
+#ifdef CONFIG_IA32_EMULATION
+	current_thread_info()->ia32_nr_syscalls = entry->compat_size;
+	current_thread_info()->ia32_sys_call_table = entry->compat_table;
+#endif
+
+	return 0;
+}

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 75715fa..1207699 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c

@@ -25,6 +25,7 @@
 
 static const int amd_erratum_383[];
 static const int amd_erratum_400[];
+static const int amd_erratum_1054[];
 static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
 
 /*
@@ -983,6 +984,15 @@
 	/* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */
 	if (!cpu_has(c, X86_FEATURE_XENPV))
 		set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+	/*
+	 * Turn on the Instructions Retired free counter on machines not
+	 * susceptible to erratum #1054 "Instructions Retired Performance
+	 * Counter May Be Inaccurate".
+	 */
+	if (cpu_has(c, X86_FEATURE_IRPERF) &&
+	    !cpu_has_amd_erratum(c, amd_erratum_1054))
+		msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
 }
 
 #ifdef CONFIG_X86_32
@@ -1110,6 +1120,10 @@
 static const int amd_erratum_383[] =
 	AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
 
+/* #1054: Instructions Retired Performance Counter May Be Inaccurate */
+static const int amd_erratum_1054[] =
+	AMD_OSVW_ERRATUM(0, AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf));
+
 
 static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
 {

diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index da0b696..f878d24 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c

@@ -1117,9 +1117,12 @@
 	.store			= store,
 };
 
+static void threshold_block_release(struct kobject *kobj);
+
 static struct kobj_type threshold_ktype = {
 	.sysfs_ops		= &threshold_ops,
 	.default_attrs		= default_attrs,
+	.release		= threshold_block_release,
 };
 
 static const char *get_name(unsigned int bank, struct threshold_block *b)
@@ -1152,8 +1155,9 @@
 	return buf_mcatype;
 }
 
-static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
-				     unsigned int block, u32 address)
+static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb,
+				     unsigned int bank, unsigned int block,
+				     u32 address)
 {
 	struct threshold_block *b = NULL;
 	u32 low, high;
@@ -1197,16 +1201,12 @@
 
 	INIT_LIST_HEAD(&b->miscj);
 
-	if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
-		list_add(&b->miscj,
-			 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
-	} else {
-		per_cpu(threshold_banks, cpu)[bank]->blocks = b;
-	}
+	if (tb->blocks)
+		list_add(&b->miscj, &tb->blocks->miscj);
+	else
+		tb->blocks = b;
 
-	err = kobject_init_and_add(&b->kobj, &threshold_ktype,
-				   per_cpu(threshold_banks, cpu)[bank]->kobj,
-				   get_name(bank, b));
+	err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(bank, b));
 	if (err)
 		goto out_free;
 recurse:
@@ -1214,7 +1214,7 @@
 	if (!address)
 		return 0;
 
-	err = allocate_threshold_blocks(cpu, bank, block, address);
+	err = allocate_threshold_blocks(cpu, tb, bank, block, address);
 	if (err)
 		goto out_free;
 
@@ -1299,8 +1299,6 @@
 		goto out_free;
 	}
 
-	per_cpu(threshold_banks, cpu)[bank] = b;
-
 	if (is_shared_bank(bank)) {
 		refcount_set(&b->cpus, 1);
 
@@ -1311,9 +1309,13 @@
 		}
 	}
 
-	err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank));
-	if (!err)
-		goto out;
+	err = allocate_threshold_blocks(cpu, b, bank, 0, msr_ops.misc(bank));
+	if (err)
+		goto out_free;
+
+	per_cpu(threshold_banks, cpu)[bank] = b;
+
+	return 0;
 
  out_free:
 	kfree(b);
@@ -1322,8 +1324,12 @@
 	return err;
 }
 
-static void deallocate_threshold_block(unsigned int cpu,
-						 unsigned int bank)
+static void threshold_block_release(struct kobject *kobj)
+{
+	kfree(to_block(kobj));
+}
+
+static void deallocate_threshold_block(unsigned int cpu, unsigned int bank)
 {
 	struct threshold_block *pos = NULL;
 	struct threshold_block *tmp = NULL;
@@ -1333,13 +1339,11 @@
 		return;
 
 	list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
-		kobject_put(&pos->kobj);
 		list_del(&pos->miscj);
-		kfree(pos);
+		kobject_put(&pos->kobj);
 	}
 
-	kfree(per_cpu(threshold_banks, cpu)[bank]->blocks);
-	per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
+	kobject_put(&head->blocks->kobj);
 }
 
 static void __threshold_remove_blocks(struct threshold_bank *b)

diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 4c3d9a3..eeea935 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c

@@ -16,6 +16,8 @@
 #include <asm/alternative.h>
 #include <asm/text-patching.h>
 
+#ifdef HAVE_JUMP_LABEL
+
 union jump_code_union {
 	char code[JUMP_LABEL_NOP_SIZE];
 	struct {
@@ -140,3 +142,5 @@
 	if (jlstate == JL_STATE_UPDATE)
 		__jump_label_transform(entry, type, text_poke_early, 1);
 }
+
+#endif

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 086cf1d..0f8b9b9 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c

@@ -102,18 +102,22 @@
 }
 fs_initcall(nmi_warning_debugfs);
 
-static void nmi_max_handler(struct irq_work *w)
+static void nmi_check_duration(struct nmiaction *action, u64 duration)
 {
-	struct nmiaction *a = container_of(w, struct nmiaction, irq_work);
+	u64 whole_msecs = READ_ONCE(action->max_duration);
 	int remainder_ns, decimal_msecs;
-	u64 whole_msecs = READ_ONCE(a->max_duration);
+
+	if (duration < nmi_longest_ns || duration < action->max_duration)
+		return;
+
+	action->max_duration = duration;
 
 	remainder_ns = do_div(whole_msecs, (1000 * 1000));
 	decimal_msecs = remainder_ns / 1000;
 
 	printk_ratelimited(KERN_INFO
 		"INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
-		a->handler, whole_msecs, decimal_msecs);
+		action->handler, whole_msecs, decimal_msecs);
 }
 
 static int nmi_handle(unsigned int type, struct pt_regs *regs)
@@ -140,11 +144,7 @@
 		delta = sched_clock() - delta;
 		trace_nmi_handler(a->handler, (int)delta, thishandled);
 
-		if (delta < nmi_longest_ns || delta < a->max_duration)
-			continue;
-
-		a->max_duration = delta;
-		irq_work_queue(&a->irq_work);
+		nmi_check_duration(a, delta);
 	}
 
 	rcu_read_unlock();
@@ -162,8 +162,6 @@
 	if (!action->handler)
 		return -EINVAL;
 
-	init_irq_work(&action->irq_work, nmi_max_handler);
-
 	raw_spin_lock_irqsave(&desc->lock, flags);
 
 	/*

diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c
index 85195d4..f321534 100644
--- a/arch/x86/kernel/sysfb_simplefb.c
+++ b/arch/x86/kernel/sysfb_simplefb.c

@@ -94,11 +94,11 @@
 	if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
 		size <<= 16;
 	length = mode->height * mode->stride;
-	length = PAGE_ALIGN(length);
 	if (length > size) {
 		printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n");
 		return -EINVAL;
 	}
+	length = PAGE_ALIGN(length);
 
 	/* setup IORESOURCE_MEM as framebuffer memory */
 	memset(&res, 0, sizeof(res));

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 03b7529..42053b2 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c

@@ -185,8 +185,7 @@
 /*
  * Secondary CPUs do not run through tsc_init(), so set up
  * all the scale factors for all CPUs, assuming the same
- * speed as the bootup CPU. (cpufreq notifiers will fix this
- * up if their speed diverges)
+ * speed as the bootup CPU.
  */
 static void __init cyc2ns_init_secondary_cpus(void)
 {
@@ -936,12 +935,12 @@
 }
 
 #ifdef CONFIG_CPU_FREQ
-/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
+/*
+ * Frequency scaling support. Adjust the TSC based timer when the CPU frequency
  * changes.
  *
- * RED-PEN: On SMP we assume all CPUs run with the same frequency.  It's
- * not that important because current Opteron setups do not support
- * scaling on SMP anyroads.
+ * NOTE: On SMP the situation is not fixable in general, so simply mark the TSC
+ * as unstable and give up in those cases.
  *
  * Should fix up last_tsc too. Currently gettimeofday in the
  * first tick after the change will be slightly wrong.
@@ -955,22 +954,22 @@
 				void *data)
 {
 	struct cpufreq_freqs *freq = data;
-	unsigned long *lpj;
 
-	lpj = &boot_cpu_data.loops_per_jiffy;
-#ifdef CONFIG_SMP
-	if (!(freq->flags & CPUFREQ_CONST_LOOPS))
-		lpj = &cpu_data(freq->cpu).loops_per_jiffy;
-#endif
+	if (num_online_cpus() > 1) {
+		mark_tsc_unstable("cpufreq changes on SMP");
+		return 0;
+	}
 
 	if (!ref_freq) {
 		ref_freq = freq->old;
-		loops_per_jiffy_ref = *lpj;
+		loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
 		tsc_khz_ref = tsc_khz;
 	}
+
 	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
-			(val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
-		*lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
+	    (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
+		boot_cpu_data.loops_per_jiffy =
+			cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
 
 		tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
 		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
@@ -1377,6 +1376,8 @@
 
 static bool __init determine_cpu_tsc_frequencies(bool early)
 {
+	u64 initial_tsc;
+
 	/* Make sure that cpu and tsc are not already calibrated */
 	WARN_ON(cpu_khz || tsc_khz);
 
@@ -1389,6 +1390,8 @@
 		cpu_khz = pit_hpet_ptimer_calibrate_cpu();
 	}
 
+	initial_tsc = rdtsc();
+
 	/*
 	 * Trust non-zero tsc_khz as authoritative,
 	 * and use it to sanity check cpu_khz,
@@ -1402,6 +1405,10 @@
 	if (tsc_khz == 0)
 		return false;
 
+	do_div(initial_tsc, cpu_khz / 1000);
+	pr_info("Initial usec timer %llu\n",
+		(unsigned long long)initial_tsc);
+
 	pr_info("Detected %lu.%03lu MHz processor\n",
 		(unsigned long)cpu_khz / KHZ,
 		(unsigned long)cpu_khz % KHZ);

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c91431b..69ce4b7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c

@@ -456,7 +456,7 @@
 
 /*
  * XXX: inoutclob user must know where the argument is being expanded.
- *      Relying on CONFIG_CC_HAS_ASM_GOTO would allow us to remove _fault.
+ *      Relying on CC_HAVE_ASM_GOTO would allow us to remove _fault.
  */
 #define asm_safe(insn, inoutclob...) \
 ({ \

diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 3cc3b2d..4d000aea 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c

@@ -427,7 +427,7 @@
 
 			kvm_set_msi_irq(vcpu->kvm, entry, &irq);
 
-			if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0,
+			if (irq.trig_mode && kvm_apic_match_dest(vcpu, NULL, 0,
 						irq.dest_id, irq.dest_mode))
 				__set_bit(irq.vector, ioapic_handled_vectors);
 		}

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0590596..8c63925 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c

@@ -633,9 +633,11 @@
 static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
 {
 	u8 val;
-	if (pv_eoi_get_user(vcpu, &val) < 0)
+	if (pv_eoi_get_user(vcpu, &val) < 0) {
 		apic_debug("Can't read EOI MSR value: 0x%llx\n",
 			   (unsigned long long)vcpu->arch.pv_eoi.msr_val);
+		return false;
+	}
 	return val & 0x1;
 }
 
@@ -1060,11 +1062,8 @@
 				apic_clear_vector(vector, apic->regs + APIC_TMR);
 		}
 
-		if (vcpu->arch.apicv_active)
-			kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
-		else {
+		if (kvm_x86_ops->deliver_posted_interrupt(vcpu, vector)) {
 			kvm_lapic_set_irr(vector, apic);
-
 			kvm_make_request(KVM_REQ_EVENT, vcpu);
 			kvm_vcpu_kick(vcpu);
 		}

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 100ae4f..61f10a4 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h

@@ -36,7 +36,7 @@
 	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
 	#define PT_HAVE_ACCESSED_DIRTY(mmu) true
 	#ifdef CONFIG_X86_64
-	#define PT_MAX_FULL_LEVELS 4
+	#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
 	#define CMPXCHG cmpxchg
 	#else
 	#define CMPXCHG cmpxchg64

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7657dcd..3f0565e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c

@@ -1298,6 +1298,47 @@
 				    control->pause_filter_count, old);
 }
 
+/*
+ * The default MMIO mask is a single bit (excluding the present bit),
+ * which could conflict with the memory encryption bit. Check for
+ * memory encryption support and override the default MMIO mask if
+ * memory encryption is enabled.
+ */
+static __init void svm_adjust_mmio_mask(void)
+{
+	unsigned int enc_bit, mask_bit;
+	u64 msr, mask;
+
+	/* If there is no memory encryption support, use existing mask */
+	if (cpuid_eax(0x80000000) < 0x8000001f)
+		return;
+
+	/* If memory encryption is not enabled, use existing mask */
+	rdmsrl(MSR_K8_SYSCFG, msr);
+	if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+		return;
+
+	enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
+	mask_bit = boot_cpu_data.x86_phys_bits;
+
+	/* Increment the mask bit if it is the same as the encryption bit */
+	if (enc_bit == mask_bit)
+		mask_bit++;
+
+	/*
+	 * If the mask bit location is below 52, then some bits above the
+	 * physical addressing limit will always be reserved, so use the
+	 * rsvd_bits() function to generate the mask. This mask, along with
+	 * the present bit, will be used to generate a page fault with
+	 * PFER.RSV = 1.
+	 *
+	 * If the mask bit location is 52 (or above), then clear the mask.
+	 */
+	mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
+
+	kvm_mmu_set_mmio_spte_mask(mask, PT_WRITABLE_MASK | PT_USER_MASK);
+}
+
 static __init int svm_hardware_setup(void)
 {
 	int cpu;
@@ -1352,6 +1393,8 @@
 		}
 	}
 
+	svm_adjust_mmio_mask();
+
 	for_each_possible_cpu(cpu) {
 		r = svm_cpu_init(cpu);
 		if (r)
@@ -5140,8 +5183,11 @@
 	return;
 }
 
-static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
+static int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
 {
+	if (!vcpu->arch.apicv_active)
+		return -1;
+
 	kvm_lapic_set_irr(vec, vcpu->arch.apic);
 	smp_mb__after_atomic();
 
@@ -5150,6 +5196,8 @@
 		       kvm_cpu_get_apicid(vcpu->cpu));
 	else
 		kvm_vcpu_wake_up(vcpu);
+
+	return 0;
 }
 
 static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2660c01e..a81d7d9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c

@@ -5302,6 +5302,9 @@
 
 static int get_ept_level(struct kvm_vcpu *vcpu)
 {
+	/* Nested EPT currently only supports 4-level walks. */
+	if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
+		return 4;
 	if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
 		return 5;
 	return 4;
@@ -5722,6 +5725,26 @@
 		 (ss.selector & SEGMENT_RPL_MASK));
 }
 
+static bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu,
+					unsigned int port, int size);
+static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
+				       struct vmcs12 *vmcs12)
+{
+	unsigned long exit_qualification;
+	unsigned short port;
+	int size;
+
+	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
+
+	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+	port = exit_qualification >> 16;
+	size = (exit_qualification & 7) + 1;
+
+	return nested_vmx_check_io_bitmaps(vcpu, port, size);
+}
+
 /*
  * Check if guest state is valid. Returns true if valid, false if
  * not.
@@ -6261,24 +6284,29 @@
  * 2. If target vcpu isn't running(root mode), kick it to pick up the
  * interrupt from PIR in next vmentry.
  */
-static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
+static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	int r;
 
 	r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
 	if (!r)
-		return;
+		return 0;
+
+	if (!vcpu->arch.apicv_active)
+		return -1;
 
 	if (pi_test_and_set_pir(vector, &vmx->pi_desc))
-		return;
+		return 0;
 
 	/* If a previous notification has sent the IPI, nothing to do.  */
 	if (pi_test_and_set_on(&vmx->pi_desc))
-		return;
+		return 0;
 
 	if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
 		kvm_vcpu_kick(vcpu);
+
+	return 0;
 }
 
 /*
@@ -9466,23 +9494,17 @@
 static const int kvm_vmx_max_exit_handlers =
 	ARRAY_SIZE(kvm_vmx_exit_handlers);
 
-static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
-				       struct vmcs12 *vmcs12)
+/*
+ * Return true if an IO instruction with the specified port and size should cause
+ * a VM-exit into L1.
+ */
+bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
+				 int size)
 {
-	unsigned long exit_qualification;
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	gpa_t bitmap, last_bitmap;
-	unsigned int port;
-	int size;
 	u8 b;
 
-	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
-		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
-
-	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-
-	port = exit_qualification >> 16;
-	size = (exit_qualification & 7) + 1;
-
 	last_bitmap = (gpa_t)-1;
 	b = -1;
 
@@ -13672,6 +13694,40 @@
 		to_vmx(vcpu)->nested.sync_shadow_vmcs = true;
 }
 
+static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
+				  struct x86_instruction_info *info)
+{
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	unsigned short port;
+	bool intercept;
+	int size;
+
+	if (info->intercept == x86_intercept_in ||
+	    info->intercept == x86_intercept_ins) {
+		port = info->src_val;
+		size = info->dst_bytes;
+	} else {
+		port = info->dst_val;
+		size = info->src_bytes;
+	}
+
+	/*
+	 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
+	 * VM-exits depend on the 'unconditional IO exiting' VM-execution
+	 * control.
+	 *
+	 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
+	 */
+	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+		intercept = nested_cpu_has(vmcs12,
+					   CPU_BASED_UNCOND_IO_EXITING);
+	else
+		intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
+
+	/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
+	return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+}
+
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
 			       struct x86_instruction_info *info,
 			       enum x86_intercept_stage stage)
@@ -13679,19 +13735,45 @@
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 
+	switch (info->intercept) {
 	/*
 	 * RDPID causes #UD if disabled through secondary execution controls.
 	 * Because it is marked as EmulateOnUD, we need to intercept it here.
 	 */
-	if (info->intercept == x86_intercept_rdtscp &&
-	    !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
-		ctxt->exception.vector = UD_VECTOR;
-		ctxt->exception.error_code_valid = false;
-		return X86EMUL_PROPAGATE_FAULT;
-	}
+	case x86_intercept_rdtscp:
+		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
+			ctxt->exception.vector = UD_VECTOR;
+			ctxt->exception.error_code_valid = false;
+			return X86EMUL_PROPAGATE_FAULT;
+		}
+		break;
+
+	case x86_intercept_in:
+	case x86_intercept_ins:
+	case x86_intercept_out:
+	case x86_intercept_outs:
+		return vmx_check_intercept_io(vcpu, info);
+
+	case x86_intercept_lgdt:
+	case x86_intercept_lidt:
+	case x86_intercept_lldt:
+	case x86_intercept_ltr:
+	case x86_intercept_sgdt:
+	case x86_intercept_sidt:
+	case x86_intercept_sldt:
+	case x86_intercept_str:
+		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
+			return X86EMUL_CONTINUE;
+
+		/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED.  */
+		break;
 
 	/* TODO: check more intercepts... */
-	return X86EMUL_CONTINUE;
+	default:
+		break;
+	}
+
+	return X86EMUL_UNHANDLEABLE;
 }
 
 #ifdef CONFIG_X86_64

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
deleted file mode 100644
index 3791ce8..0000000
--- a/arch/x86/kvm/vmx/vmx.c
+++ /dev/null

@@ -1,8033 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * Copyright (C) 2006 Qumranet, Inc.
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * Authors:
- *   Avi Kivity   <avi@qumranet.com>
- *   Yaniv Kamay  <yaniv@qumranet.com>
- */
-
-#include <linux/frame.h>
-#include <linux/highmem.h>
-#include <linux/hrtimer.h>
-#include <linux/kernel.h>
-#include <linux/kvm_host.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/mod_devicetable.h>
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/sched/smt.h>
-#include <linux/slab.h>
-#include <linux/tboot.h>
-#include <linux/trace_events.h>
-
-#include <asm/apic.h>
-#include <asm/asm.h>
-#include <asm/cpu.h>
-#include <asm/debugreg.h>
-#include <asm/desc.h>
-#include <asm/fpu/internal.h>
-#include <asm/io.h>
-#include <asm/irq_remapping.h>
-#include <asm/kexec.h>
-#include <asm/perf_event.h>
-#include <asm/mce.h>
-#include <asm/mmu_context.h>
-#include <asm/mshyperv.h>
-#include <asm/spec-ctrl.h>
-#include <asm/virtext.h>
-#include <asm/vmx.h>
-
-#include "capabilities.h"
-#include "cpuid.h"
-#include "evmcs.h"
-#include "irq.h"
-#include "kvm_cache_regs.h"
-#include "lapic.h"
-#include "mmu.h"
-#include "nested.h"
-#include "ops.h"
-#include "pmu.h"
-#include "trace.h"
-#include "vmcs.h"
-#include "vmcs12.h"
-#include "vmx.h"
-#include "x86.h"
-
-MODULE_AUTHOR("Qumranet");
-MODULE_LICENSE("GPL");
-
-static const struct x86_cpu_id vmx_cpu_id[] = {
-	X86_FEATURE_MATCH(X86_FEATURE_VMX),
-	{}
-};
-MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
-
-bool __read_mostly enable_vpid = 1;
-module_param_named(vpid, enable_vpid, bool, 0444);
-
-static bool __read_mostly enable_vnmi = 1;
-module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
-
-bool __read_mostly flexpriority_enabled = 1;
-module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
-
-bool __read_mostly enable_ept = 1;
-module_param_named(ept, enable_ept, bool, S_IRUGO);
-
-bool __read_mostly enable_unrestricted_guest = 1;
-module_param_named(unrestricted_guest,
-			enable_unrestricted_guest, bool, S_IRUGO);
-
-bool __read_mostly enable_ept_ad_bits = 1;
-module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
-
-static bool __read_mostly emulate_invalid_guest_state = true;
-module_param(emulate_invalid_guest_state, bool, S_IRUGO);
-
-static bool __read_mostly fasteoi = 1;
-module_param(fasteoi, bool, S_IRUGO);
-
-static bool __read_mostly enable_apicv = 1;
-module_param(enable_apicv, bool, S_IRUGO);
-
-/*
- * If nested=1, nested virtualization is supported, i.e., guests may use
- * VMX and be a hypervisor for its own guests. If nested=0, guests may not
- * use VMX instructions.
- */
-static bool __read_mostly nested = 1;
-module_param(nested, bool, S_IRUGO);
-
-bool __read_mostly enable_pml = 1;
-module_param_named(pml, enable_pml, bool, S_IRUGO);
-
-static bool __read_mostly dump_invalid_vmcs = 0;
-module_param(dump_invalid_vmcs, bool, 0644);
-
-#define MSR_BITMAP_MODE_X2APIC		1
-#define MSR_BITMAP_MODE_X2APIC_APICV	2
-
-#define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
-
-/* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
-static int __read_mostly cpu_preemption_timer_multi;
-static bool __read_mostly enable_preemption_timer = 1;
-#ifdef CONFIG_X86_64
-module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
-#endif
-
-#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
-#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
-#define KVM_VM_CR0_ALWAYS_ON				\
-	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | 	\
-	 X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
-#define KVM_CR4_GUEST_OWNED_BITS				      \
-	(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
-	 | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
-
-#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
-#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
-#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
-
-#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
-
-#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
-	RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
-	RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
-	RTIT_STATUS_BYTECNT))
-
-#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \
-	(~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f)
-
-/*
- * These 2 parameters are used to config the controls for Pause-Loop Exiting:
- * ple_gap:    upper bound on the amount of time between two successive
- *             executions of PAUSE in a loop. Also indicate if ple enabled.
- *             According to test, this time is usually smaller than 128 cycles.
- * ple_window: upper bound on the amount of time a guest is allowed to execute
- *             in a PAUSE loop. Tests indicate that most spinlocks are held for
- *             less than 2^12 cycles
- * Time is measured based on a counter that runs at the same rate as the TSC,
- * refer SDM volume 3b section 21.6.13 & 22.1.3.
- */
-static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
-module_param(ple_gap, uint, 0444);
-
-static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
-module_param(ple_window, uint, 0444);
-
-/* Default doubles per-vcpu window every exit. */
-static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
-module_param(ple_window_grow, uint, 0444);
-
-/* Default resets per-vcpu window every exit to ple_window. */
-static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
-module_param(ple_window_shrink, uint, 0444);
-
-/* Default is to compute the maximum so we can never overflow. */
-static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
-module_param(ple_window_max, uint, 0444);
-
-/* Default is SYSTEM mode, 1 for host-guest mode */
-int __read_mostly pt_mode = PT_MODE_SYSTEM;
-module_param(pt_mode, int, S_IRUGO);
-
-static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
-static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
-static DEFINE_MUTEX(vmx_l1d_flush_mutex);
-
-/* Storage for pre module init parameter parsing */
-static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
-
-static const struct {
-	const char *option;
-	bool for_parse;
-} vmentry_l1d_param[] = {
-	[VMENTER_L1D_FLUSH_AUTO]	 = {"auto", true},
-	[VMENTER_L1D_FLUSH_NEVER]	 = {"never", true},
-	[VMENTER_L1D_FLUSH_COND]	 = {"cond", true},
-	[VMENTER_L1D_FLUSH_ALWAYS]	 = {"always", true},
-	[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
-	[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
-};
-
-#define L1D_CACHE_ORDER 4
-static void *vmx_l1d_flush_pages;
-
-static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
-{
-	struct page *page;
-	unsigned int i;
-
-	if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
-		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
-		return 0;
-	}
-
-	if (!enable_ept) {
-		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
-		return 0;
-	}
-
-	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
-		u64 msr;
-
-		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
-		if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
-			l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
-			return 0;
-		}
-	}
-
-	/* If set to auto use the default l1tf mitigation method */
-	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
-		switch (l1tf_mitigation) {
-		case L1TF_MITIGATION_OFF:
-			l1tf = VMENTER_L1D_FLUSH_NEVER;
-			break;
-		case L1TF_MITIGATION_FLUSH_NOWARN:
-		case L1TF_MITIGATION_FLUSH:
-		case L1TF_MITIGATION_FLUSH_NOSMT:
-			l1tf = VMENTER_L1D_FLUSH_COND;
-			break;
-		case L1TF_MITIGATION_FULL:
-		case L1TF_MITIGATION_FULL_FORCE:
-			l1tf = VMENTER_L1D_FLUSH_ALWAYS;
-			break;
-		}
-	} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
-		l1tf = VMENTER_L1D_FLUSH_ALWAYS;
-	}
-
-	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
-	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
-		/*
-		 * This allocation for vmx_l1d_flush_pages is not tied to a VM
-		 * lifetime and so should not be charged to a memcg.
-		 */
-		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
-		if (!page)
-			return -ENOMEM;
-		vmx_l1d_flush_pages = page_address(page);
-
-		/*
-		 * Initialize each page with a different pattern in
-		 * order to protect against KSM in the nested
-		 * virtualization case.
-		 */
-		for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
-			memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
-			       PAGE_SIZE);
-		}
-	}
-
-	l1tf_vmx_mitigation = l1tf;
-
-	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
-		static_branch_enable(&vmx_l1d_should_flush);
-	else
-		static_branch_disable(&vmx_l1d_should_flush);
-
-	if (l1tf == VMENTER_L1D_FLUSH_COND)
-		static_branch_enable(&vmx_l1d_flush_cond);
-	else
-		static_branch_disable(&vmx_l1d_flush_cond);
-	return 0;
-}
-
-static int vmentry_l1d_flush_parse(const char *s)
-{
-	unsigned int i;
-
-	if (s) {
-		for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
-			if (vmentry_l1d_param[i].for_parse &&
-			    sysfs_streq(s, vmentry_l1d_param[i].option))
-				return i;
-		}
-	}
-	return -EINVAL;
-}
-
-static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
-{
-	int l1tf, ret;
-
-	l1tf = vmentry_l1d_flush_parse(s);
-	if (l1tf < 0)
-		return l1tf;
-
-	if (!boot_cpu_has(X86_BUG_L1TF))
-		return 0;
-
-	/*
-	 * Has vmx_init() run already? If not then this is the pre init
-	 * parameter parsing. In that case just store the value and let
-	 * vmx_init() do the proper setup after enable_ept has been
-	 * established.
-	 */
-	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
-		vmentry_l1d_flush_param = l1tf;
-		return 0;
-	}
-
-	mutex_lock(&vmx_l1d_flush_mutex);
-	ret = vmx_setup_l1d_flush(l1tf);
-	mutex_unlock(&vmx_l1d_flush_mutex);
-	return ret;
-}
-
-static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
-{
-	if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
-		return sprintf(s, "???\n");
-
-	return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
-}
-
-static const struct kernel_param_ops vmentry_l1d_flush_ops = {
-	.set = vmentry_l1d_flush_set,
-	.get = vmentry_l1d_flush_get,
-};
-module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
-
-static bool guest_state_valid(struct kvm_vcpu *vcpu);
-static u32 vmx_segment_access_rights(struct kvm_segment *var);
-static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
-							  u32 msr, int type);
-
-void vmx_vmexit(void);
-
-#define vmx_insn_failed(fmt...)		\
-do {					\
-	WARN_ONCE(1, fmt);		\
-	pr_warn_ratelimited(fmt);	\
-} while (0)
-
-asmlinkage void vmread_error(unsigned long field, bool fault)
-{
-	if (fault)
-		kvm_spurious_fault();
-	else
-		vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
-}
-
-noinline void vmwrite_error(unsigned long field, unsigned long value)
-{
-	vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n",
-			field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
-}
-
-noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
-{
-	vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr);
-}
-
-noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
-{
-	vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr);
-}
-
-noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
-{
-	vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
-			ext, vpid, gva);
-}
-
-noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
-{
-	vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
-			ext, eptp, gpa);
-}
-
-static DEFINE_PER_CPU(struct vmcs *, vmxarea);
-DEFINE_PER_CPU(struct vmcs *, current_vmcs);
-/*
- * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
- * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
- */
-static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
-
-/*
- * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
- * can find which vCPU should be waken up.
- */
-static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
-static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
-
-static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
-static DEFINE_SPINLOCK(vmx_vpid_lock);
-
-struct vmcs_config vmcs_config;
-struct vmx_capability vmx_capability;
-
-#define VMX_SEGMENT_FIELD(seg)					\
-	[VCPU_SREG_##seg] = {                                   \
-		.selector = GUEST_##seg##_SELECTOR,		\
-		.base = GUEST_##seg##_BASE,		   	\
-		.limit = GUEST_##seg##_LIMIT,		   	\
-		.ar_bytes = GUEST_##seg##_AR_BYTES,	   	\
-	}
-
-static const struct kvm_vmx_segment_field {
-	unsigned selector;
-	unsigned base;
-	unsigned limit;
-	unsigned ar_bytes;
-} kvm_vmx_segment_fields[] = {
-	VMX_SEGMENT_FIELD(CS),
-	VMX_SEGMENT_FIELD(DS),
-	VMX_SEGMENT_FIELD(ES),
-	VMX_SEGMENT_FIELD(FS),
-	VMX_SEGMENT_FIELD(GS),
-	VMX_SEGMENT_FIELD(SS),
-	VMX_SEGMENT_FIELD(TR),
-	VMX_SEGMENT_FIELD(LDTR),
-};
-
-u64 host_efer;
-static unsigned long host_idt_base;
-
-/*
- * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
- * will emulate SYSCALL in legacy mode if the vendor string in guest
- * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
- * support this emulation, IA32_STAR must always be included in
- * vmx_msr_index[], even in i386 builds.
- */
-const u32 vmx_msr_index[] = {
-#ifdef CONFIG_X86_64
-	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
-#endif
-	MSR_EFER, MSR_TSC_AUX, MSR_STAR,
-	MSR_IA32_TSX_CTRL,
-};
-
-#if IS_ENABLED(CONFIG_HYPERV)
-static bool __read_mostly enlightened_vmcs = true;
-module_param(enlightened_vmcs, bool, 0444);
-
-/* check_ept_pointer() should be under protection of ept_pointer_lock. */
-static void check_ept_pointer_match(struct kvm *kvm)
-{
-	struct kvm_vcpu *vcpu;
-	u64 tmp_eptp = INVALID_PAGE;
-	int i;
-
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (!VALID_PAGE(tmp_eptp)) {
-			tmp_eptp = to_vmx(vcpu)->ept_pointer;
-		} else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
-			to_kvm_vmx(kvm)->ept_pointers_match
-				= EPT_POINTERS_MISMATCH;
-			return;
-		}
-	}
-
-	to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
-}
-
-static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
-		void *data)
-{
-	struct kvm_tlb_range *range = data;
-
-	return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
-			range->pages);
-}
-
-static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
-		struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
-{
-	u64 ept_pointer = to_vmx(vcpu)->ept_pointer;
-
-	/*
-	 * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address
-	 * of the base of EPT PML4 table, strip off EPT configuration
-	 * information.
-	 */
-	if (range)
-		return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
-				kvm_fill_hv_flush_list_func, (void *)range);
-	else
-		return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
-}
-
-static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
-		struct kvm_tlb_range *range)
-{
-	struct kvm_vcpu *vcpu;
-	int ret = 0, i;
-
-	spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
-
-	if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
-		check_ept_pointer_match(kvm);
-
-	if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
-		kvm_for_each_vcpu(i, vcpu, kvm) {
-			/* If ept_pointer is invalid pointer, bypass flush request. */
-			if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
-				ret |= __hv_remote_flush_tlb_with_range(
-					kvm, vcpu, range);
-		}
-	} else {
-		ret = __hv_remote_flush_tlb_with_range(kvm,
-				kvm_get_vcpu(kvm, 0), range);
-	}
-
-	spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
-	return ret;
-}
-static int hv_remote_flush_tlb(struct kvm *kvm)
-{
-	return hv_remote_flush_tlb_with_range(kvm, NULL);
-}
-
-static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
-{
-	struct hv_enlightened_vmcs *evmcs;
-	struct hv_partition_assist_pg **p_hv_pa_pg =
-			&vcpu->kvm->arch.hyperv.hv_pa_pg;
-	/*
-	 * Synthetic VM-Exit is not enabled in current code and so All
-	 * evmcs in singe VM shares same assist page.
-	 */
-	if (!*p_hv_pa_pg)
-		*p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
-
-	if (!*p_hv_pa_pg)
-		return -ENOMEM;
-
-	evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
-
-	evmcs->partition_assist_page =
-		__pa(*p_hv_pa_pg);
-	evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
-	evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
-
-	return 0;
-}
-
-#endif /* IS_ENABLED(CONFIG_HYPERV) */
-
-/*
- * Comment's format: document - errata name - stepping - processor name.
- * Refer from
- * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
- */
-static u32 vmx_preemption_cpu_tfms[] = {
-/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
-0x000206E6,
-/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
-/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
-/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
-0x00020652,
-/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
-0x00020655,
-/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
-/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
-/*
- * 320767.pdf - AAP86  - B1 -
- * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
- */
-0x000106E5,
-/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
-0x000106A0,
-/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
-0x000106A1,
-/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
-0x000106A4,
- /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
- /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
- /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
-0x000106A5,
- /* Xeon E3-1220 V2 */
-0x000306A8,
-};
-
-static inline bool cpu_has_broken_vmx_preemption_timer(void)
-{
-	u32 eax = cpuid_eax(0x00000001), i;
-
-	/* Clear the reserved bits */
-	eax &= ~(0x3U << 14 | 0xfU << 28);
-	for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
-		if (eax == vmx_preemption_cpu_tfms[i])
-			return true;
-
-	return false;
-}
-
-static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
-{
-	return flexpriority_enabled && lapic_in_kernel(vcpu);
-}
-
-static inline bool report_flexpriority(void)
-{
-	return flexpriority_enabled;
-}
-
-static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
-{
-	int i;
-
-	for (i = 0; i < vmx->nmsrs; ++i)
-		if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
-			return i;
-	return -1;
-}
-
-struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
-{
-	int i;
-
-	i = __find_msr_index(vmx, msr);
-	if (i >= 0)
-		return &vmx->guest_msrs[i];
-	return NULL;
-}
-
-static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, u64 data)
-{
-	int ret = 0;
-
-	u64 old_msr_data = msr->data;
-	msr->data = data;
-	if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
-		preempt_disable();
-		ret = kvm_set_shared_msr(msr->index, msr->data,
-					 msr->mask);
-		preempt_enable();
-		if (ret)
-			msr->data = old_msr_data;
-	}
-	return ret;
-}
-
-void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
-{
-	vmcs_clear(loaded_vmcs->vmcs);
-	if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
-		vmcs_clear(loaded_vmcs->shadow_vmcs);
-	loaded_vmcs->cpu = -1;
-	loaded_vmcs->launched = 0;
-}
-
-#ifdef CONFIG_KEXEC_CORE
-/*
- * This bitmap is used to indicate whether the vmclear
- * operation is enabled on all cpus. All disabled by
- * default.
- */
-static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
-
-static inline void crash_enable_local_vmclear(int cpu)
-{
-	cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline void crash_disable_local_vmclear(int cpu)
-{
-	cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline int crash_local_vmclear_enabled(int cpu)
-{
-	return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static void crash_vmclear_local_loaded_vmcss(void)
-{
-	int cpu = raw_smp_processor_id();
-	struct loaded_vmcs *v;
-
-	if (!crash_local_vmclear_enabled(cpu))
-		return;
-
-	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
-			    loaded_vmcss_on_cpu_link)
-		vmcs_clear(v->vmcs);
-}
-#else
-static inline void crash_enable_local_vmclear(int cpu) { }
-static inline void crash_disable_local_vmclear(int cpu) { }
-#endif /* CONFIG_KEXEC_CORE */
-
-static void __loaded_vmcs_clear(void *arg)
-{
-	struct loaded_vmcs *loaded_vmcs = arg;
-	int cpu = raw_smp_processor_id();
-
-	if (loaded_vmcs->cpu != cpu)
-		return; /* vcpu migration can race with cpu offline */
-	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
-		per_cpu(current_vmcs, cpu) = NULL;
-	crash_disable_local_vmclear(cpu);
-	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
-
-	/*
-	 * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
-	 * is before setting loaded_vmcs->vcpu to -1 which is done in
-	 * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
-	 * then adds the vmcs into percpu list before it is deleted.
-	 */
-	smp_wmb();
-
-	loaded_vmcs_init(loaded_vmcs);
-	crash_enable_local_vmclear(cpu);
-}
-
-void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
-{
-	int cpu = loaded_vmcs->cpu;
-
-	if (cpu != -1)
-		smp_call_function_single(cpu,
-			 __loaded_vmcs_clear, loaded_vmcs, 1);
-}
-
-static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
-				       unsigned field)
-{
-	bool ret;
-	u32 mask = 1 << (seg * SEG_FIELD_NR + field);
-
-	if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
-		kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
-		vmx->segment_cache.bitmask = 0;
-	}
-	ret = vmx->segment_cache.bitmask & mask;
-	vmx->segment_cache.bitmask |= mask;
-	return ret;
-}
-
-static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
-{
-	u16 *p = &vmx->segment_cache.seg[seg].selector;
-
-	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
-		*p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
-	return *p;
-}
-
-static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
-{
-	ulong *p = &vmx->segment_cache.seg[seg].base;
-
-	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
-		*p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
-	return *p;
-}
-
-static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
-{
-	u32 *p = &vmx->segment_cache.seg[seg].limit;
-
-	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
-		*p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
-	return *p;
-}
-
-static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
-{
-	u32 *p = &vmx->segment_cache.seg[seg].ar;
-
-	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
-		*p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
-	return *p;
-}
-
-void update_exception_bitmap(struct kvm_vcpu *vcpu)
-{
-	u32 eb;
-
-	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
-	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
-	/*
-	 * Guest access to VMware backdoor ports could legitimately
-	 * trigger #GP because of TSS I/O permission bitmap.
-	 * We intercept those #GP and allow access to them anyway
-	 * as VMware does.
-	 */
-	if (enable_vmware_backdoor)
-		eb |= (1u << GP_VECTOR);
-	if ((vcpu->guest_debug &
-	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
-	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
-		eb |= 1u << BP_VECTOR;
-	if (to_vmx(vcpu)->rmode.vm86_active)
-		eb = ~0;
-	if (enable_ept)
-		eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
-
-	/* When we are running a nested L2 guest and L1 specified for it a
-	 * certain exception bitmap, we must trap the same exceptions and pass
-	 * them to L1. When running L2, we will only handle the exceptions
-	 * specified above if L1 did not want them.
-	 */
-	if (is_guest_mode(vcpu))
-		eb |= get_vmcs12(vcpu)->exception_bitmap;
-
-	vmcs_write32(EXCEPTION_BITMAP, eb);
-}
-
-/*
- * Check if MSR is intercepted for currently loaded MSR bitmap.
- */
-static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
-{
-	unsigned long *msr_bitmap;
-	int f = sizeof(unsigned long);
-
-	if (!cpu_has_vmx_msr_bitmap())
-		return true;
-
-	msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
-
-	if (msr <= 0x1fff) {
-		return !!test_bit(msr, msr_bitmap + 0x800 / f);
-	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
-		msr &= 0x1fff;
-		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
-	}
-
-	return true;
-}
-
-static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
-		unsigned long entry, unsigned long exit)
-{
-	vm_entry_controls_clearbit(vmx, entry);
-	vm_exit_controls_clearbit(vmx, exit);
-}
-
-int vmx_find_msr_index(struct vmx_msrs *m, u32 msr)
-{
-	unsigned int i;
-
-	for (i = 0; i < m->nr; ++i) {
-		if (m->val[i].index == msr)
-			return i;
-	}
-	return -ENOENT;
-}
-
-static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
-{
-	int i;
-	struct msr_autoload *m = &vmx->msr_autoload;
-
-	switch (msr) {
-	case MSR_EFER:
-		if (cpu_has_load_ia32_efer()) {
-			clear_atomic_switch_msr_special(vmx,
-					VM_ENTRY_LOAD_IA32_EFER,
-					VM_EXIT_LOAD_IA32_EFER);
-			return;
-		}
-		break;
-	case MSR_CORE_PERF_GLOBAL_CTRL:
-		if (cpu_has_load_perf_global_ctrl()) {
-			clear_atomic_switch_msr_special(vmx,
-					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
-					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
-			return;
-		}
-		break;
-	}
-	i = vmx_find_msr_index(&m->guest, msr);
-	if (i < 0)
-		goto skip_guest;
-	--m->guest.nr;
-	m->guest.val[i] = m->guest.val[m->guest.nr];
-	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
-
-skip_guest:
-	i = vmx_find_msr_index(&m->host, msr);
-	if (i < 0)
-		return;
-
-	--m->host.nr;
-	m->host.val[i] = m->host.val[m->host.nr];
-	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
-}
-
-static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
-		unsigned long entry, unsigned long exit,
-		unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
-		u64 guest_val, u64 host_val)
-{
-	vmcs_write64(guest_val_vmcs, guest_val);
-	if (host_val_vmcs != HOST_IA32_EFER)
-		vmcs_write64(host_val_vmcs, host_val);
-	vm_entry_controls_setbit(vmx, entry);
-	vm_exit_controls_setbit(vmx, exit);
-}
-
-static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
-				  u64 guest_val, u64 host_val, bool entry_only)
-{
-	int i, j = 0;
-	struct msr_autoload *m = &vmx->msr_autoload;
-
-	switch (msr) {
-	case MSR_EFER:
-		if (cpu_has_load_ia32_efer()) {
-			add_atomic_switch_msr_special(vmx,
-					VM_ENTRY_LOAD_IA32_EFER,
-					VM_EXIT_LOAD_IA32_EFER,
-					GUEST_IA32_EFER,
-					HOST_IA32_EFER,
-					guest_val, host_val);
-			return;
-		}
-		break;
-	case MSR_CORE_PERF_GLOBAL_CTRL:
-		if (cpu_has_load_perf_global_ctrl()) {
-			add_atomic_switch_msr_special(vmx,
-					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
-					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
-					GUEST_IA32_PERF_GLOBAL_CTRL,
-					HOST_IA32_PERF_GLOBAL_CTRL,
-					guest_val, host_val);
-			return;
-		}
-		break;
-	case MSR_IA32_PEBS_ENABLE:
-		/* PEBS needs a quiescent period after being disabled (to write
-		 * a record).  Disabling PEBS through VMX MSR swapping doesn't
-		 * provide that period, so a CPU could write host's record into
-		 * guest's memory.
-		 */
-		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
-	}
-
-	i = vmx_find_msr_index(&m->guest, msr);
-	if (!entry_only)
-		j = vmx_find_msr_index(&m->host, msr);
-
-	if ((i < 0 && m->guest.nr == NR_LOADSTORE_MSRS) ||
-		(j < 0 &&  m->host.nr == NR_LOADSTORE_MSRS)) {
-		printk_once(KERN_WARNING "Not enough msr switch entries. "
-				"Can't add msr %x\n", msr);
-		return;
-	}
-	if (i < 0) {
-		i = m->guest.nr++;
-		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
-	}
-	m->guest.val[i].index = msr;
-	m->guest.val[i].value = guest_val;
-
-	if (entry_only)
-		return;
-
-	if (j < 0) {
-		j = m->host.nr++;
-		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
-	}
-	m->host.val[j].index = msr;
-	m->host.val[j].value = host_val;
-}
-
-static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
-{
-	u64 guest_efer = vmx->vcpu.arch.efer;
-	u64 ignore_bits = 0;
-
-	/* Shadow paging assumes NX to be available.  */
-	if (!enable_ept)
-		guest_efer |= EFER_NX;
-
-	/*
-	 * LMA and LME handled by hardware; SCE meaningless outside long mode.
-	 */
-	ignore_bits |= EFER_SCE;
-#ifdef CONFIG_X86_64
-	ignore_bits |= EFER_LMA | EFER_LME;
-	/* SCE is meaningful only in long mode on Intel */
-	if (guest_efer & EFER_LMA)
-		ignore_bits &= ~(u64)EFER_SCE;
-#endif
-
-	/*
-	 * On EPT, we can't emulate NX, so we must switch EFER atomically.
-	 * On CPUs that support "load IA32_EFER", always switch EFER
-	 * atomically, since it's faster than switching it manually.
-	 */
-	if (cpu_has_load_ia32_efer() ||
-	    (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
-		if (!(guest_efer & EFER_LMA))
-			guest_efer &= ~EFER_LME;
-		if (guest_efer != host_efer)
-			add_atomic_switch_msr(vmx, MSR_EFER,
-					      guest_efer, host_efer, false);
-		else
-			clear_atomic_switch_msr(vmx, MSR_EFER);
-		return false;
-	} else {
-		clear_atomic_switch_msr(vmx, MSR_EFER);
-
-		guest_efer &= ~ignore_bits;
-		guest_efer |= host_efer & ignore_bits;
-
-		vmx->guest_msrs[efer_offset].data = guest_efer;
-		vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
-
-		return true;
-	}
-}
-
-#ifdef CONFIG_X86_32
-/*
- * On 32-bit kernels, VM exits still load the FS and GS bases from the
- * VMCS rather than the segment table.  KVM uses this helper to figure
- * out the current bases to poke them into the VMCS before entry.
- */
-static unsigned long segment_base(u16 selector)
-{
-	struct desc_struct *table;
-	unsigned long v;
-
-	if (!(selector & ~SEGMENT_RPL_MASK))
-		return 0;
-
-	table = get_current_gdt_ro();
-
-	if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
-		u16 ldt_selector = kvm_read_ldt();
-
-		if (!(ldt_selector & ~SEGMENT_RPL_MASK))
-			return 0;
-
-		table = (struct desc_struct *)segment_base(ldt_selector);
-	}
-	v = get_desc_base(&table[selector >> 3]);
-	return v;
-}
-#endif
-
-static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
-{
-	u32 i;
-
-	wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
-	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
-	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
-	wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
-	for (i = 0; i < addr_range; i++) {
-		wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
-		wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
-	}
-}
-
-static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
-{
-	u32 i;
-
-	rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
-	rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
-	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
-	rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
-	for (i = 0; i < addr_range; i++) {
-		rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
-		rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
-	}
-}
-
-static void pt_guest_enter(struct vcpu_vmx *vmx)
-{
-	if (pt_mode == PT_MODE_SYSTEM)
-		return;
-
-	/*
-	 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
-	 * Save host state before VM entry.
-	 */
-	rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
-	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
-		wrmsrl(MSR_IA32_RTIT_CTL, 0);
-		pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
-		pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
-	}
-}
-
-static void pt_guest_exit(struct vcpu_vmx *vmx)
-{
-	if (pt_mode == PT_MODE_SYSTEM)
-		return;
-
-	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
-		pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
-		pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
-	}
-
-	/* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
-	wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
-}
-
-void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
-			unsigned long fs_base, unsigned long gs_base)
-{
-	if (unlikely(fs_sel != host->fs_sel)) {
-		if (!(fs_sel & 7))
-			vmcs_write16(HOST_FS_SELECTOR, fs_sel);
-		else
-			vmcs_write16(HOST_FS_SELECTOR, 0);
-		host->fs_sel = fs_sel;
-	}
-	if (unlikely(gs_sel != host->gs_sel)) {
-		if (!(gs_sel & 7))
-			vmcs_write16(HOST_GS_SELECTOR, gs_sel);
-		else
-			vmcs_write16(HOST_GS_SELECTOR, 0);
-		host->gs_sel = gs_sel;
-	}
-	if (unlikely(fs_base != host->fs_base)) {
-		vmcs_writel(HOST_FS_BASE, fs_base);
-		host->fs_base = fs_base;
-	}
-	if (unlikely(gs_base != host->gs_base)) {
-		vmcs_writel(HOST_GS_BASE, gs_base);
-		host->gs_base = gs_base;
-	}
-}
-
-void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct vmcs_host_state *host_state;
-#ifdef CONFIG_X86_64
-	int cpu = raw_smp_processor_id();
-#endif
-	unsigned long fs_base, gs_base;
-	u16 fs_sel, gs_sel;
-	int i;
-
-	vmx->req_immediate_exit = false;
-
-	/*
-	 * Note that guest MSRs to be saved/restored can also be changed
-	 * when guest state is loaded. This happens when guest transitions
-	 * to/from long-mode by setting MSR_EFER.LMA.
-	 */
-	if (!vmx->guest_msrs_ready) {
-		vmx->guest_msrs_ready = true;
-		for (i = 0; i < vmx->save_nmsrs; ++i)
-			kvm_set_shared_msr(vmx->guest_msrs[i].index,
-					   vmx->guest_msrs[i].data,
-					   vmx->guest_msrs[i].mask);
-
-	}
-	if (vmx->guest_state_loaded)
-		return;
-
-	host_state = &vmx->loaded_vmcs->host_state;
-
-	/*
-	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
-	 * allow segment selectors with cpl > 0 or ti == 1.
-	 */
-	host_state->ldt_sel = kvm_read_ldt();
-
-#ifdef CONFIG_X86_64
-	savesegment(ds, host_state->ds_sel);
-	savesegment(es, host_state->es_sel);
-
-	gs_base = cpu_kernelmode_gs_base(cpu);
-	if (likely(is_64bit_mm(current->mm))) {
-		save_fsgs_for_kvm();
-		fs_sel = current->thread.fsindex;
-		gs_sel = current->thread.gsindex;
-		fs_base = current->thread.fsbase;
-		vmx->msr_host_kernel_gs_base = current->thread.gsbase;
-	} else {
-		savesegment(fs, fs_sel);
-		savesegment(gs, gs_sel);
-		fs_base = read_msr(MSR_FS_BASE);
-		vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
-	}
-
-	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
-#else
-	savesegment(fs, fs_sel);
-	savesegment(gs, gs_sel);
-	fs_base = segment_base(fs_sel);
-	gs_base = segment_base(gs_sel);
-#endif
-
-	vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
-	vmx->guest_state_loaded = true;
-}
-
-static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
-{
-	struct vmcs_host_state *host_state;
-
-	if (!vmx->guest_state_loaded)
-		return;
-
-	host_state = &vmx->loaded_vmcs->host_state;
-
-	++vmx->vcpu.stat.host_state_reload;
-
-#ifdef CONFIG_X86_64
-	rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
-#endif
-	if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
-		kvm_load_ldt(host_state->ldt_sel);
-#ifdef CONFIG_X86_64
-		load_gs_index(host_state->gs_sel);
-#else
-		loadsegment(gs, host_state->gs_sel);
-#endif
-	}
-	if (host_state->fs_sel & 7)
-		loadsegment(fs, host_state->fs_sel);
-#ifdef CONFIG_X86_64
-	if (unlikely(host_state->ds_sel | host_state->es_sel)) {
-		loadsegment(ds, host_state->ds_sel);
-		loadsegment(es, host_state->es_sel);
-	}
-#endif
-	invalidate_tss_limit();
-#ifdef CONFIG_X86_64
-	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
-#endif
-	load_fixmap_gdt(raw_smp_processor_id());
-	vmx->guest_state_loaded = false;
-	vmx->guest_msrs_ready = false;
-}
-
-#ifdef CONFIG_X86_64
-static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
-{
-	preempt_disable();
-	if (vmx->guest_state_loaded)
-		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
-	preempt_enable();
-	return vmx->msr_guest_kernel_gs_base;
-}
-
-static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
-{
-	preempt_disable();
-	if (vmx->guest_state_loaded)
-		wrmsrl(MSR_KERNEL_GS_BASE, data);
-	preempt_enable();
-	vmx->msr_guest_kernel_gs_base = data;
-}
-#endif
-
-static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
-{
-	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-	struct pi_desc old, new;
-	unsigned int dest;
-
-	/*
-	 * In case of hot-plug or hot-unplug, we may have to undo
-	 * vmx_vcpu_pi_put even if there is no assigned device.  And we
-	 * always keep PI.NDST up to date for simplicity: it makes the
-	 * code easier, and CPU migration is not a fast path.
-	 */
-	if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
-		return;
-
-	/*
-	 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
-	 * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
-	 * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
-	 * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
-	 * correctly.
-	 */
-	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
-		pi_clear_sn(pi_desc);
-		goto after_clear_sn;
-	}
-
-	/* The full case.  */
-	do {
-		old.control = new.control = pi_desc->control;
-
-		dest = cpu_physical_id(cpu);
-
-		if (x2apic_enabled())
-			new.ndst = dest;
-		else
-			new.ndst = (dest << 8) & 0xFF00;
-
-		new.sn = 0;
-	} while (cmpxchg64(&pi_desc->control, old.control,
-			   new.control) != old.control);
-
-after_clear_sn:
-
-	/*
-	 * Clear SN before reading the bitmap.  The VT-d firmware
-	 * writes the bitmap and reads SN atomically (5.2.3 in the
-	 * spec), so it doesn't really have a memory barrier that
-	 * pairs with this, but we cannot do that and we need one.
-	 */
-	smp_mb__after_atomic();
-
-	if (!pi_is_pir_empty(pi_desc))
-		pi_set_on(pi_desc);
-}
-
-void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
-
-	if (!already_loaded) {
-		loaded_vmcs_clear(vmx->loaded_vmcs);
-		local_irq_disable();
-		crash_disable_local_vmclear(cpu);
-
-		/*
-		 * Read loaded_vmcs->cpu should be before fetching
-		 * loaded_vmcs->loaded_vmcss_on_cpu_link.
-		 * See the comments in __loaded_vmcs_clear().
-		 */
-		smp_rmb();
-
-		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
-			 &per_cpu(loaded_vmcss_on_cpu, cpu));
-		crash_enable_local_vmclear(cpu);
-		local_irq_enable();
-	}
-
-	if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
-		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
-		vmcs_load(vmx->loaded_vmcs->vmcs);
-		indirect_branch_prediction_barrier();
-	}
-
-	if (!already_loaded) {
-		void *gdt = get_current_gdt_ro();
-		unsigned long sysenter_esp;
-
-		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-
-		/*
-		 * Linux uses per-cpu TSS and GDT, so set these when switching
-		 * processors.  See 22.2.4.
-		 */
-		vmcs_writel(HOST_TR_BASE,
-			    (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
-		vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */
-
-		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
-		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
-
-		vmx->loaded_vmcs->cpu = cpu;
-	}
-
-	/* Setup TSC multiplier */
-	if (kvm_has_tsc_control &&
-	    vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
-		decache_tsc_multiplier(vmx);
-}
-
-/*
- * Switches to specified vcpu, until a matching vcpu_put(), but assumes
- * vcpu mutex is already taken.
- */
-void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	vmx_vcpu_load_vmcs(vcpu, cpu);
-
-	vmx_vcpu_pi_load(vcpu, cpu);
-
-	vmx->host_pkru = read_pkru();
-	vmx->host_debugctlmsr = get_debugctlmsr();
-}
-
-static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
-{
-	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
-	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-		!irq_remapping_cap(IRQ_POSTING_CAP)  ||
-		!kvm_vcpu_apicv_active(vcpu))
-		return;
-
-	/* Set SN when the vCPU is preempted */
-	if (vcpu->preempted)
-		pi_set_sn(pi_desc);
-}
-
-static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
-{
-	vmx_vcpu_pi_put(vcpu);
-
-	vmx_prepare_switch_to_host(to_vmx(vcpu));
-}
-
-static bool emulation_required(struct kvm_vcpu *vcpu)
-{
-	return emulate_invalid_guest_state && !guest_state_valid(vcpu);
-}
-
-static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
-
-unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned long rflags, save_rflags;
-
-	if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
-		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
-		rflags = vmcs_readl(GUEST_RFLAGS);
-		if (vmx->rmode.vm86_active) {
-			rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
-			save_rflags = vmx->rmode.save_rflags;
-			rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
-		}
-		vmx->rflags = rflags;
-	}
-	return vmx->rflags;
-}
-
-void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned long old_rflags;
-
-	if (enable_unrestricted_guest) {
-		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
-		vmx->rflags = rflags;
-		vmcs_writel(GUEST_RFLAGS, rflags);
-		return;
-	}
-
-	old_rflags = vmx_get_rflags(vcpu);
-	vmx->rflags = rflags;
-	if (vmx->rmode.vm86_active) {
-		vmx->rmode.save_rflags = rflags;
-		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
-	}
-	vmcs_writel(GUEST_RFLAGS, rflags);
-
-	if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
-		vmx->emulation_required = emulation_required(vcpu);
-}
-
-u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
-{
-	u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-	int ret = 0;
-
-	if (interruptibility & GUEST_INTR_STATE_STI)
-		ret |= KVM_X86_SHADOW_INT_STI;
-	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
-		ret |= KVM_X86_SHADOW_INT_MOV_SS;
-
-	return ret;
-}
-
-void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
-{
-	u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-	u32 interruptibility = interruptibility_old;
-
-	interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
-
-	if (mask & KVM_X86_SHADOW_INT_MOV_SS)
-		interruptibility |= GUEST_INTR_STATE_MOV_SS;
-	else if (mask & KVM_X86_SHADOW_INT_STI)
-		interruptibility |= GUEST_INTR_STATE_STI;
-
-	if ((interruptibility != interruptibility_old))
-		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
-}
-
-static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned long value;
-
-	/*
-	 * Any MSR write that attempts to change bits marked reserved will
-	 * case a #GP fault.
-	 */
-	if (data & vmx->pt_desc.ctl_bitmask)
-		return 1;
-
-	/*
-	 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
-	 * result in a #GP unless the same write also clears TraceEn.
-	 */
-	if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
-		((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
-		return 1;
-
-	/*
-	 * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit
-	 * and FabricEn would cause #GP, if
-	 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
-	 */
-	if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
-		!(data & RTIT_CTL_FABRIC_EN) &&
-		!intel_pt_validate_cap(vmx->pt_desc.caps,
-					PT_CAP_single_range_output))
-		return 1;
-
-	/*
-	 * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that
-	 * utilize encodings marked reserved will casue a #GP fault.
-	 */
-	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
-	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
-			!test_bit((data & RTIT_CTL_MTC_RANGE) >>
-			RTIT_CTL_MTC_RANGE_OFFSET, &value))
-		return 1;
-	value = intel_pt_validate_cap(vmx->pt_desc.caps,
-						PT_CAP_cycle_thresholds);
-	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
-			!test_bit((data & RTIT_CTL_CYC_THRESH) >>
-			RTIT_CTL_CYC_THRESH_OFFSET, &value))
-		return 1;
-	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
-	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
-			!test_bit((data & RTIT_CTL_PSB_FREQ) >>
-			RTIT_CTL_PSB_FREQ_OFFSET, &value))
-		return 1;
-
-	/*
-	 * If ADDRx_CFG is reserved or the encodings is >2 will
-	 * cause a #GP fault.
-	 */
-	value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
-	if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
-		return 1;
-	value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
-	if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
-		return 1;
-	value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
-	if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
-		return 1;
-	value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
-	if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
-		return 1;
-
-	return 0;
-}
-
-static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
-{
-	unsigned long rip;
-
-	/*
-	 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
-	 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
-	 * set when EPT misconfig occurs.  In practice, real hardware updates
-	 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
-	 * (namely Hyper-V) don't set it due to it being undefined behavior,
-	 * i.e. we end up advancing IP with some random value.
-	 */
-	if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
-	    to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
-		rip = kvm_rip_read(vcpu);
-		rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
-		kvm_rip_write(vcpu, rip);
-	} else {
-		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
-			return 0;
-	}
-
-	/* skipping an emulated instruction also counts */
-	vmx_set_interrupt_shadow(vcpu, 0);
-
-	return 1;
-}
-
-static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
-{
-	/*
-	 * Ensure that we clear the HLT state in the VMCS.  We don't need to
-	 * explicitly skip the instruction because if the HLT state is set,
-	 * then the instruction is already executing and RIP has already been
-	 * advanced.
-	 */
-	if (kvm_hlt_in_guest(vcpu->kvm) &&
-			vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
-		vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
-}
-
-static void vmx_queue_exception(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned nr = vcpu->arch.exception.nr;
-	bool has_error_code = vcpu->arch.exception.has_error_code;
-	u32 error_code = vcpu->arch.exception.error_code;
-	u32 intr_info = nr | INTR_INFO_VALID_MASK;
-
-	kvm_deliver_exception_payload(vcpu);
-
-	if (has_error_code) {
-		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
-		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
-	}
-
-	if (vmx->rmode.vm86_active) {
-		int inc_eip = 0;
-		if (kvm_exception_is_soft(nr))
-			inc_eip = vcpu->arch.event_exit_inst_len;
-		kvm_inject_realmode_interrupt(vcpu, nr, inc_eip);
-		return;
-	}
-
-	WARN_ON_ONCE(vmx->emulation_required);
-
-	if (kvm_exception_is_soft(nr)) {
-		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
-			     vmx->vcpu.arch.event_exit_inst_len);
-		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
-	} else
-		intr_info |= INTR_TYPE_HARD_EXCEPTION;
-
-	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
-
-	vmx_clear_hlt(vcpu);
-}
-
-static bool vmx_rdtscp_supported(void)
-{
-	return cpu_has_vmx_rdtscp();
-}
-
-static bool vmx_invpcid_supported(void)
-{
-	return cpu_has_vmx_invpcid();
-}
-
-/*
- * Swap MSR entry in host/guest MSR entry array.
- */
-static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
-{
-	struct shared_msr_entry tmp;
-
-	tmp = vmx->guest_msrs[to];
-	vmx->guest_msrs[to] = vmx->guest_msrs[from];
-	vmx->guest_msrs[from] = tmp;
-}
-
-/*
- * Set up the vmcs to automatically save and restore system
- * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
- * mode, as fiddling with msrs is very expensive.
- */
-static void setup_msrs(struct vcpu_vmx *vmx)
-{
-	int save_nmsrs, index;
-
-	save_nmsrs = 0;
-#ifdef CONFIG_X86_64
-	/*
-	 * The SYSCALL MSRs are only needed on long mode guests, and only
-	 * when EFER.SCE is set.
-	 */
-	if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
-		index = __find_msr_index(vmx, MSR_STAR);
-		if (index >= 0)
-			move_msr_up(vmx, index, save_nmsrs++);
-		index = __find_msr_index(vmx, MSR_LSTAR);
-		if (index >= 0)
-			move_msr_up(vmx, index, save_nmsrs++);
-		index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
-		if (index >= 0)
-			move_msr_up(vmx, index, save_nmsrs++);
-	}
-#endif
-	index = __find_msr_index(vmx, MSR_EFER);
-	if (index >= 0 && update_transition_efer(vmx, index))
-		move_msr_up(vmx, index, save_nmsrs++);
-	index = __find_msr_index(vmx, MSR_TSC_AUX);
-	if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
-		move_msr_up(vmx, index, save_nmsrs++);
-	index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL);
-	if (index >= 0)
-		move_msr_up(vmx, index, save_nmsrs++);
-
-	vmx->save_nmsrs = save_nmsrs;
-	vmx->guest_msrs_ready = false;
-
-	if (cpu_has_vmx_msr_bitmap())
-		vmx_update_msr_bitmap(&vmx->vcpu);
-}
-
-static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
-{
-	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
-	if (is_guest_mode(vcpu) &&
-	    (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
-		return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
-
-	return vcpu->arch.tsc_offset;
-}
-
-static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
-{
-	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-	u64 g_tsc_offset = 0;
-
-	/*
-	 * We're here if L1 chose not to trap WRMSR to TSC. According
-	 * to the spec, this should set L1's TSC; The offset that L1
-	 * set for L2 remains unchanged, and still needs to be added
-	 * to the newly set TSC to get L2's TSC.
-	 */
-	if (is_guest_mode(vcpu) &&
-	    (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
-		g_tsc_offset = vmcs12->tsc_offset;
-
-	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
-				   vcpu->arch.tsc_offset - g_tsc_offset,
-				   offset);
-	vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
-	return offset + g_tsc_offset;
-}
-
-/*
- * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
- * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
- * all guests if the "nested" module option is off, and can also be disabled
- * for a single guest by disabling its VMX cpuid bit.
- */
-bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
-{
-	return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
-}
-
-static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
-						 uint64_t val)
-{
-	uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
-
-	return !(val & ~valid_bits);
-}
-
-static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
-{
-	switch (msr->index) {
-	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
-		if (!nested)
-			return 1;
-		return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
-	default:
-		return 1;
-	}
-}
-
-/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct shared_msr_entry *msr;
-	u32 index;
-
-	switch (msr_info->index) {
-#ifdef CONFIG_X86_64
-	case MSR_FS_BASE:
-		msr_info->data = vmcs_readl(GUEST_FS_BASE);
-		break;
-	case MSR_GS_BASE:
-		msr_info->data = vmcs_readl(GUEST_GS_BASE);
-		break;
-	case MSR_KERNEL_GS_BASE:
-		msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
-		break;
-#endif
-	case MSR_EFER:
-		return kvm_get_msr_common(vcpu, msr_info);
-	case MSR_IA32_TSX_CTRL:
-		if (!msr_info->host_initiated &&
-		    !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
-			return 1;
-		goto find_shared_msr;
-	case MSR_IA32_UMWAIT_CONTROL:
-		if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
-			return 1;
-
-		msr_info->data = vmx->msr_ia32_umwait_control;
-		break;
-	case MSR_IA32_SPEC_CTRL:
-		if (!msr_info->host_initiated &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
-			return 1;
-
-		msr_info->data = to_vmx(vcpu)->spec_ctrl;
-		break;
-	case MSR_IA32_SYSENTER_CS:
-		msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
-		break;
-	case MSR_IA32_SYSENTER_EIP:
-		msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
-		break;
-	case MSR_IA32_SYSENTER_ESP:
-		msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
-		break;
-	case MSR_IA32_BNDCFGS:
-		if (!kvm_mpx_supported() ||
-		    (!msr_info->host_initiated &&
-		     !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
-			return 1;
-		msr_info->data = vmcs_read64(GUEST_BNDCFGS);
-		break;
-	case MSR_IA32_MCG_EXT_CTL:
-		if (!msr_info->host_initiated &&
-		    !(vmx->msr_ia32_feature_control &
-		      FEATURE_CONTROL_LMCE))
-			return 1;
-		msr_info->data = vcpu->arch.mcg_ext_ctl;
-		break;
-	case MSR_IA32_FEATURE_CONTROL:
-		msr_info->data = vmx->msr_ia32_feature_control;
-		break;
-	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
-		if (!nested_vmx_allowed(vcpu))
-			return 1;
-		return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
-				       &msr_info->data);
-	case MSR_IA32_RTIT_CTL:
-		if (pt_mode != PT_MODE_HOST_GUEST)
-			return 1;
-		msr_info->data = vmx->pt_desc.guest.ctl;
-		break;
-	case MSR_IA32_RTIT_STATUS:
-		if (pt_mode != PT_MODE_HOST_GUEST)
-			return 1;
-		msr_info->data = vmx->pt_desc.guest.status;
-		break;
-	case MSR_IA32_RTIT_CR3_MATCH:
-		if ((pt_mode != PT_MODE_HOST_GUEST) ||
-			!intel_pt_validate_cap(vmx->pt_desc.caps,
-						PT_CAP_cr3_filtering))
-			return 1;
-		msr_info->data = vmx->pt_desc.guest.cr3_match;
-		break;
-	case MSR_IA32_RTIT_OUTPUT_BASE:
-		if ((pt_mode != PT_MODE_HOST_GUEST) ||
-			(!intel_pt_validate_cap(vmx->pt_desc.caps,
-					PT_CAP_topa_output) &&
-			 !intel_pt_validate_cap(vmx->pt_desc.caps,
-					PT_CAP_single_range_output)))
-			return 1;
-		msr_info->data = vmx->pt_desc.guest.output_base;
-		break;
-	case MSR_IA32_RTIT_OUTPUT_MASK:
-		if ((pt_mode != PT_MODE_HOST_GUEST) ||
-			(!intel_pt_validate_cap(vmx->pt_desc.caps,
-					PT_CAP_topa_output) &&
-			 !intel_pt_validate_cap(vmx->pt_desc.caps,
-					PT_CAP_single_range_output)))
-			return 1;
-		msr_info->data = vmx->pt_desc.guest.output_mask;
-		break;
-	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
-		index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
-		if ((pt_mode != PT_MODE_HOST_GUEST) ||
-			(index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
-					PT_CAP_num_address_ranges)))
-			return 1;
-		if (is_noncanonical_address(data, vcpu))
-			return 1;
-		if (index % 2)
-			msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
-		else
-			msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
-		break;
-	case MSR_TSC_AUX:
-		if (!msr_info->host_initiated &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
-			return 1;
-		goto find_shared_msr;
-	default:
-	find_shared_msr:
-		msr = find_msr_entry(vmx, msr_info->index);
-		if (msr) {
-			msr_info->data = msr->data;
-			break;
-		}
-		return kvm_get_msr_common(vcpu, msr_info);
-	}
-
-	return 0;
-}
-
-/*
- * Writes msr value into the appropriate "register".
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct shared_msr_entry *msr;
-	int ret = 0;
-	u32 msr_index = msr_info->index;
-	u64 data = msr_info->data;
-	u32 index;
-
-	switch (msr_index) {
-	case MSR_EFER:
-		ret = kvm_set_msr_common(vcpu, msr_info);
-		break;
-#ifdef CONFIG_X86_64
-	case MSR_FS_BASE:
-		vmx_segment_cache_clear(vmx);
-		vmcs_writel(GUEST_FS_BASE, data);
-		break;
-	case MSR_GS_BASE:
-		vmx_segment_cache_clear(vmx);
-		vmcs_writel(GUEST_GS_BASE, data);
-		break;
-	case MSR_KERNEL_GS_BASE:
-		vmx_write_guest_kernel_gs_base(vmx, data);
-		break;
-#endif
-	case MSR_IA32_SYSENTER_CS:
-		if (is_guest_mode(vcpu))
-			get_vmcs12(vcpu)->guest_sysenter_cs = data;
-		vmcs_write32(GUEST_SYSENTER_CS, data);
-		break;
-	case MSR_IA32_SYSENTER_EIP:
-		if (is_guest_mode(vcpu))
-			get_vmcs12(vcpu)->guest_sysenter_eip = data;
-		vmcs_writel(GUEST_SYSENTER_EIP, data);
-		break;
-	case MSR_IA32_SYSENTER_ESP:
-		if (is_guest_mode(vcpu))
-			get_vmcs12(vcpu)->guest_sysenter_esp = data;
-		vmcs_writel(GUEST_SYSENTER_ESP, data);
-		break;
-	case MSR_IA32_DEBUGCTLMSR:
-		if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
-						VM_EXIT_SAVE_DEBUG_CONTROLS)
-			get_vmcs12(vcpu)->guest_ia32_debugctl = data;
-
-		ret = kvm_set_msr_common(vcpu, msr_info);
-		break;
-
-	case MSR_IA32_BNDCFGS:
-		if (!kvm_mpx_supported() ||
-		    (!msr_info->host_initiated &&
-		     !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
-			return 1;
-		if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
-		    (data & MSR_IA32_BNDCFGS_RSVD))
-			return 1;
-		vmcs_write64(GUEST_BNDCFGS, data);
-		break;
-	case MSR_IA32_UMWAIT_CONTROL:
-		if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
-			return 1;
-
-		/* The reserved bit 1 and non-32 bit [63:32] should be zero */
-		if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
-			return 1;
-
-		vmx->msr_ia32_umwait_control = data;
-		break;
-	case MSR_IA32_SPEC_CTRL:
-		if (!msr_info->host_initiated &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
-			return 1;
-
-		/* The STIBP bit doesn't fault even if it's not advertised */
-		if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
-			return 1;
-
-		vmx->spec_ctrl = data;
-
-		if (!data)
-			break;
-
-		/*
-		 * For non-nested:
-		 * When it's written (to non-zero) for the first time, pass
-		 * it through.
-		 *
-		 * For nested:
-		 * The handling of the MSR bitmap for L2 guests is done in
-		 * nested_vmx_prepare_msr_bitmap. We should not touch the
-		 * vmcs02.msr_bitmap here since it gets completely overwritten
-		 * in the merging. We update the vmcs01 here for L1 as well
-		 * since it will end up touching the MSR anyway now.
-		 */
-		vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
-					      MSR_IA32_SPEC_CTRL,
-					      MSR_TYPE_RW);
-		break;
-	case MSR_IA32_TSX_CTRL:
-		if (!msr_info->host_initiated &&
-		    !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
-			return 1;
-		if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
-			return 1;
-		goto find_shared_msr;
-	case MSR_IA32_PRED_CMD:
-		if (!msr_info->host_initiated &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
-			return 1;
-
-		if (data & ~PRED_CMD_IBPB)
-			return 1;
-
-		if (!data)
-			break;
-
-		wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
-
-		/*
-		 * For non-nested:
-		 * When it's written (to non-zero) for the first time, pass
-		 * it through.
-		 *
-		 * For nested:
-		 * The handling of the MSR bitmap for L2 guests is done in
-		 * nested_vmx_prepare_msr_bitmap. We should not touch the
-		 * vmcs02.msr_bitmap here since it gets completely overwritten
-		 * in the merging.
-		 */
-		vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
-					      MSR_TYPE_W);
-		break;
-	case MSR_IA32_CR_PAT:
-		if (!kvm_pat_valid(data))
-			return 1;
-
-		if (is_guest_mode(vcpu) &&
-		    get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
-			get_vmcs12(vcpu)->guest_ia32_pat = data;
-
-		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
-			vmcs_write64(GUEST_IA32_PAT, data);
-			vcpu->arch.pat = data;
-			break;
-		}
-		ret = kvm_set_msr_common(vcpu, msr_info);
-		break;
-	case MSR_IA32_TSC_ADJUST:
-		ret = kvm_set_msr_common(vcpu, msr_info);
-		break;
-	case MSR_IA32_MCG_EXT_CTL:
-		if ((!msr_info->host_initiated &&
-		     !(to_vmx(vcpu)->msr_ia32_feature_control &
-		       FEATURE_CONTROL_LMCE)) ||
-		    (data & ~MCG_EXT_CTL_LMCE_EN))
-			return 1;
-		vcpu->arch.mcg_ext_ctl = data;
-		break;
-	case MSR_IA32_FEATURE_CONTROL:
-		if (!vmx_feature_control_msr_valid(vcpu, data) ||
-		    (to_vmx(vcpu)->msr_ia32_feature_control &
-		     FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
-			return 1;
-		vmx->msr_ia32_feature_control = data;
-		if (msr_info->host_initiated && data == 0)
-			vmx_leave_nested(vcpu);
-		break;
-	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
-		if (!msr_info->host_initiated)
-			return 1; /* they are read-only */
-		if (!nested_vmx_allowed(vcpu))
-			return 1;
-		return vmx_set_vmx_msr(vcpu, msr_index, data);
-	case MSR_IA32_RTIT_CTL:
-		if ((pt_mode != PT_MODE_HOST_GUEST) ||
-			vmx_rtit_ctl_check(vcpu, data) ||
-			vmx->nested.vmxon)
-			return 1;
-		vmcs_write64(GUEST_IA32_RTIT_CTL, data);
-		vmx->pt_desc.guest.ctl = data;
-		pt_update_intercept_for_msr(vmx);
-		break;
-	case MSR_IA32_RTIT_STATUS:
-		if ((pt_mode != PT_MODE_HOST_GUEST) ||
-			(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
-			(data & MSR_IA32_RTIT_STATUS_MASK))
-			return 1;
-		vmx->pt_desc.guest.status = data;
-		break;
-	case MSR_IA32_RTIT_CR3_MATCH:
-		if ((pt_mode != PT_MODE_HOST_GUEST) ||
-			(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
-			!intel_pt_validate_cap(vmx->pt_desc.caps,
-						PT_CAP_cr3_filtering))
-			return 1;
-		vmx->pt_desc.guest.cr3_match = data;
-		break;
-	case MSR_IA32_RTIT_OUTPUT_BASE:
-		if ((pt_mode != PT_MODE_HOST_GUEST) ||
-			(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
-			(!intel_pt_validate_cap(vmx->pt_desc.caps,
-					PT_CAP_topa_output) &&
-			 !intel_pt_validate_cap(vmx->pt_desc.caps,
-					PT_CAP_single_range_output)) ||
-			(data & MSR_IA32_RTIT_OUTPUT_BASE_MASK))
-			return 1;
-		vmx->pt_desc.guest.output_base = data;
-		break;
-	case MSR_IA32_RTIT_OUTPUT_MASK:
-		if ((pt_mode != PT_MODE_HOST_GUEST) ||
-			(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
-			(!intel_pt_validate_cap(vmx->pt_desc.caps,
-					PT_CAP_topa_output) &&
-			 !intel_pt_validate_cap(vmx->pt_desc.caps,
-					PT_CAP_single_range_output)))
-			return 1;
-		vmx->pt_desc.guest.output_mask = data;
-		break;
-	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
-		index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
-		if ((pt_mode != PT_MODE_HOST_GUEST) ||
-			(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
-			(index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
-					PT_CAP_num_address_ranges)))
-			return 1;
-		if (is_noncanonical_address(data, vcpu))
-			return 1;
-		if (index % 2)
-			vmx->pt_desc.guest.addr_b[index / 2] = data;
-		else
-			vmx->pt_desc.guest.addr_a[index / 2] = data;
-		break;
-	case MSR_TSC_AUX:
-		if (!msr_info->host_initiated &&
-		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
-			return 1;
-		/* Check reserved bit, higher 32 bits should be zero */
-		if ((data >> 32) != 0)
-			return 1;
-		goto find_shared_msr;
-
-	default:
-	find_shared_msr:
-		msr = find_msr_entry(vmx, msr_index);
-		if (msr)
-			ret = vmx_set_guest_msr(vmx, msr, data);
-		else
-			ret = kvm_set_msr_common(vcpu, msr_info);
-	}
-
-	return ret;
-}
-
-static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
-{
-	kvm_register_mark_available(vcpu, reg);
-
-	switch (reg) {
-	case VCPU_REGS_RSP:
-		vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
-		break;
-	case VCPU_REGS_RIP:
-		vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
-		break;
-	case VCPU_EXREG_PDPTR:
-		if (enable_ept)
-			ept_save_pdptrs(vcpu);
-		break;
-	case VCPU_EXREG_CR3:
-		if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
-			vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
-		break;
-	default:
-		WARN_ON_ONCE(1);
-		break;
-	}
-}
-
-static __init int cpu_has_kvm_support(void)
-{
-	return cpu_has_vmx();
-}
-
-static __init int vmx_disabled_by_bios(void)
-{
-	u64 msr;
-
-	rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
-	if (msr & FEATURE_CONTROL_LOCKED) {
-		/* launched w/ TXT and VMX disabled */
-		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
-			&& tboot_enabled())
-			return 1;
-		/* launched w/o TXT and VMX only enabled w/ TXT */
-		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
-			&& (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
-			&& !tboot_enabled()) {
-			printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
-				"activate TXT before enabling KVM\n");
-			return 1;
-		}
-		/* launched w/o TXT and VMX disabled */
-		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
-			&& !tboot_enabled())
-			return 1;
-	}
-
-	return 0;
-}
-
-static void kvm_cpu_vmxon(u64 addr)
-{
-	cr4_set_bits(X86_CR4_VMXE);
-	intel_pt_handle_vmx(1);
-
-	asm volatile ("vmxon %0" : : "m"(addr));
-}
-
-static int hardware_enable(void)
-{
-	int cpu = raw_smp_processor_id();
-	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
-	u64 old, test_bits;
-
-	if (cr4_read_shadow() & X86_CR4_VMXE)
-		return -EBUSY;
-
-	/*
-	 * This can happen if we hot-added a CPU but failed to allocate
-	 * VP assist page for it.
-	 */
-	if (static_branch_unlikely(&enable_evmcs) &&
-	    !hv_get_vp_assist_page(cpu))
-		return -EFAULT;
-
-	INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
-	INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
-	spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
-
-	/*
-	 * Now we can enable the vmclear operation in kdump
-	 * since the loaded_vmcss_on_cpu list on this cpu
-	 * has been initialized.
-	 *
-	 * Though the cpu is not in VMX operation now, there
-	 * is no problem to enable the vmclear operation
-	 * for the loaded_vmcss_on_cpu list is empty!
-	 */
-	crash_enable_local_vmclear(cpu);
-
-	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
-
-	test_bits = FEATURE_CONTROL_LOCKED;
-	test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
-	if (tboot_enabled())
-		test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
-
-	if ((old & test_bits) != test_bits) {
-		/* enable and lock */
-		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
-	}
-	kvm_cpu_vmxon(phys_addr);
-	if (enable_ept)
-		ept_sync_global();
-
-	return 0;
-}
-
-static void vmclear_local_loaded_vmcss(void)
-{
-	int cpu = raw_smp_processor_id();
-	struct loaded_vmcs *v, *n;
-
-	list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
-				 loaded_vmcss_on_cpu_link)
-		__loaded_vmcs_clear(v);
-}
-
-
-/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
- * tricks.
- */
-static void kvm_cpu_vmxoff(void)
-{
-	asm volatile (__ex("vmxoff"));
-
-	intel_pt_handle_vmx(0);
-	cr4_clear_bits(X86_CR4_VMXE);
-}
-
-static void hardware_disable(void)
-{
-	vmclear_local_loaded_vmcss();
-	kvm_cpu_vmxoff();
-}
-
-static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
-				      u32 msr, u32 *result)
-{
-	u32 vmx_msr_low, vmx_msr_high;
-	u32 ctl = ctl_min | ctl_opt;
-
-	rdmsr(msr, vmx_msr_low, vmx_msr_high);
-
-	ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
-	ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
-
-	/* Ensure minimum (required) set of control bits are supported. */
-	if (ctl_min & ~ctl)
-		return -EIO;
-
-	*result = ctl;
-	return 0;
-}
-
-static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
-				    struct vmx_capability *vmx_cap)
-{
-	u32 vmx_msr_low, vmx_msr_high;
-	u32 min, opt, min2, opt2;
-	u32 _pin_based_exec_control = 0;
-	u32 _cpu_based_exec_control = 0;
-	u32 _cpu_based_2nd_exec_control = 0;
-	u32 _vmexit_control = 0;
-	u32 _vmentry_control = 0;
-
-	memset(vmcs_conf, 0, sizeof(*vmcs_conf));
-	min = CPU_BASED_HLT_EXITING |
-#ifdef CONFIG_X86_64
-	      CPU_BASED_CR8_LOAD_EXITING |
-	      CPU_BASED_CR8_STORE_EXITING |
-#endif
-	      CPU_BASED_CR3_LOAD_EXITING |
-	      CPU_BASED_CR3_STORE_EXITING |
-	      CPU_BASED_UNCOND_IO_EXITING |
-	      CPU_BASED_MOV_DR_EXITING |
-	      CPU_BASED_USE_TSC_OFFSETTING |
-	      CPU_BASED_MWAIT_EXITING |
-	      CPU_BASED_MONITOR_EXITING |
-	      CPU_BASED_INVLPG_EXITING |
-	      CPU_BASED_RDPMC_EXITING;
-
-	opt = CPU_BASED_TPR_SHADOW |
-	      CPU_BASED_USE_MSR_BITMAPS |
-	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
-	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
-				&_cpu_based_exec_control) < 0)
-		return -EIO;
-#ifdef CONFIG_X86_64
-	if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
-		_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
-					   ~CPU_BASED_CR8_STORE_EXITING;
-#endif
-	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
-		min2 = 0;
-		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
-			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
-			SECONDARY_EXEC_WBINVD_EXITING |
-			SECONDARY_EXEC_ENABLE_VPID |
-			SECONDARY_EXEC_ENABLE_EPT |
-			SECONDARY_EXEC_UNRESTRICTED_GUEST |
-			SECONDARY_EXEC_PAUSE_LOOP_EXITING |
-			SECONDARY_EXEC_DESC |
-			SECONDARY_EXEC_RDTSCP |
-			SECONDARY_EXEC_ENABLE_INVPCID |
-			SECONDARY_EXEC_APIC_REGISTER_VIRT |
-			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
-			SECONDARY_EXEC_SHADOW_VMCS |
-			SECONDARY_EXEC_XSAVES |
-			SECONDARY_EXEC_RDSEED_EXITING |
-			SECONDARY_EXEC_RDRAND_EXITING |
-			SECONDARY_EXEC_ENABLE_PML |
-			SECONDARY_EXEC_TSC_SCALING |
-			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
-			SECONDARY_EXEC_PT_USE_GPA |
-			SECONDARY_EXEC_PT_CONCEAL_VMX |
-			SECONDARY_EXEC_ENABLE_VMFUNC |
-			SECONDARY_EXEC_ENCLS_EXITING;
-		if (adjust_vmx_controls(min2, opt2,
-					MSR_IA32_VMX_PROCBASED_CTLS2,
-					&_cpu_based_2nd_exec_control) < 0)
-			return -EIO;
-	}
-#ifndef CONFIG_X86_64
-	if (!(_cpu_based_2nd_exec_control &
-				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
-		_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
-#endif
-
-	if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
-		_cpu_based_2nd_exec_control &= ~(
-				SECONDARY_EXEC_APIC_REGISTER_VIRT |
-				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
-				SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
-
-	rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
-		&vmx_cap->ept, &vmx_cap->vpid);
-
-	if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
-		/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
-		   enabled */
-		_cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
-					     CPU_BASED_CR3_STORE_EXITING |
-					     CPU_BASED_INVLPG_EXITING);
-	} else if (vmx_cap->ept) {
-		vmx_cap->ept = 0;
-		pr_warn_once("EPT CAP should not exist if not support "
-				"1-setting enable EPT VM-execution control\n");
-	}
-	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
-		vmx_cap->vpid) {
-		vmx_cap->vpid = 0;
-		pr_warn_once("VPID CAP should not exist if not support "
-				"1-setting enable VPID VM-execution control\n");
-	}
-
-	min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
-#ifdef CONFIG_X86_64
-	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
-#endif
-	opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
-	      VM_EXIT_LOAD_IA32_PAT |
-	      VM_EXIT_LOAD_IA32_EFER |
-	      VM_EXIT_CLEAR_BNDCFGS |
-	      VM_EXIT_PT_CONCEAL_PIP |
-	      VM_EXIT_CLEAR_IA32_RTIT_CTL;
-	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
-				&_vmexit_control) < 0)
-		return -EIO;
-
-	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
-	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
-		 PIN_BASED_VMX_PREEMPTION_TIMER;
-	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
-				&_pin_based_exec_control) < 0)
-		return -EIO;
-
-	if (cpu_has_broken_vmx_preemption_timer())
-		_pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
-	if (!(_cpu_based_2nd_exec_control &
-		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
-		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
-
-	min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
-	opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
-	      VM_ENTRY_LOAD_IA32_PAT |
-	      VM_ENTRY_LOAD_IA32_EFER |
-	      VM_ENTRY_LOAD_BNDCFGS |
-	      VM_ENTRY_PT_CONCEAL_PIP |
-	      VM_ENTRY_LOAD_IA32_RTIT_CTL;
-	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
-				&_vmentry_control) < 0)
-		return -EIO;
-
-	/*
-	 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
-	 * can't be used due to an errata where VM Exit may incorrectly clear
-	 * IA32_PERF_GLOBAL_CTRL[34:32].  Workaround the errata by using the
-	 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
-	 */
-	if (boot_cpu_data.x86 == 0x6) {
-		switch (boot_cpu_data.x86_model) {
-		case 26: /* AAK155 */
-		case 30: /* AAP115 */
-		case 37: /* AAT100 */
-		case 44: /* BC86,AAY89,BD102 */
-		case 46: /* BA97 */
-			_vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
-			_vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
-			pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
-					"does not work properly. Using workaround\n");
-			break;
-		default:
-			break;
-		}
-	}
-
-
-	rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
-
-	/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
-	if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
-		return -EIO;
-
-#ifdef CONFIG_X86_64
-	/* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
-	if (vmx_msr_high & (1u<<16))
-		return -EIO;
-#endif
-
-	/* Require Write-Back (WB) memory type for VMCS accesses. */
-	if (((vmx_msr_high >> 18) & 15) != 6)
-		return -EIO;
-
-	vmcs_conf->size = vmx_msr_high & 0x1fff;
-	vmcs_conf->order = get_order(vmcs_conf->size);
-	vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
-
-	vmcs_conf->revision_id = vmx_msr_low;
-
-	vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
-	vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
-	vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
-	vmcs_conf->vmexit_ctrl         = _vmexit_control;
-	vmcs_conf->vmentry_ctrl        = _vmentry_control;
-
-	if (static_branch_unlikely(&enable_evmcs))
-		evmcs_sanitize_exec_ctrls(vmcs_conf);
-
-	return 0;
-}
-
-struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
-{
-	int node = cpu_to_node(cpu);
-	struct page *pages;
-	struct vmcs *vmcs;
-
-	pages = __alloc_pages_node(node, flags, vmcs_config.order);
-	if (!pages)
-		return NULL;
-	vmcs = page_address(pages);
-	memset(vmcs, 0, vmcs_config.size);
-
-	/* KVM supports Enlightened VMCS v1 only */
-	if (static_branch_unlikely(&enable_evmcs))
-		vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
-	else
-		vmcs->hdr.revision_id = vmcs_config.revision_id;
-
-	if (shadow)
-		vmcs->hdr.shadow_vmcs = 1;
-	return vmcs;
-}
-
-void free_vmcs(struct vmcs *vmcs)
-{
-	free_pages((unsigned long)vmcs, vmcs_config.order);
-}
-
-/*
- * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
- */
-void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
-{
-	if (!loaded_vmcs->vmcs)
-		return;
-	loaded_vmcs_clear(loaded_vmcs);
-	free_vmcs(loaded_vmcs->vmcs);
-	loaded_vmcs->vmcs = NULL;
-	if (loaded_vmcs->msr_bitmap)
-		free_page((unsigned long)loaded_vmcs->msr_bitmap);
-	WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
-}
-
-int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
-{
-	loaded_vmcs->vmcs = alloc_vmcs(false);
-	if (!loaded_vmcs->vmcs)
-		return -ENOMEM;
-
-	loaded_vmcs->shadow_vmcs = NULL;
-	loaded_vmcs->hv_timer_soft_disabled = false;
-	loaded_vmcs_init(loaded_vmcs);
-
-	if (cpu_has_vmx_msr_bitmap()) {
-		loaded_vmcs->msr_bitmap = (unsigned long *)
-				__get_free_page(GFP_KERNEL_ACCOUNT);
-		if (!loaded_vmcs->msr_bitmap)
-			goto out_vmcs;
-		memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
-
-		if (IS_ENABLED(CONFIG_HYPERV) &&
-		    static_branch_unlikely(&enable_evmcs) &&
-		    (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
-			struct hv_enlightened_vmcs *evmcs =
-				(struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
-
-			evmcs->hv_enlightenments_control.msr_bitmap = 1;
-		}
-	}
-
-	memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
-	memset(&loaded_vmcs->controls_shadow, 0,
-		sizeof(struct vmcs_controls_shadow));
-
-	return 0;
-
-out_vmcs:
-	free_loaded_vmcs(loaded_vmcs);
-	return -ENOMEM;
-}
-
-static void free_kvm_area(void)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		free_vmcs(per_cpu(vmxarea, cpu));
-		per_cpu(vmxarea, cpu) = NULL;
-	}
-}
-
-static __init int alloc_kvm_area(void)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		struct vmcs *vmcs;
-
-		vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
-		if (!vmcs) {
-			free_kvm_area();
-			return -ENOMEM;
-		}
-
-		/*
-		 * When eVMCS is enabled, alloc_vmcs_cpu() sets
-		 * vmcs->revision_id to KVM_EVMCS_VERSION instead of
-		 * revision_id reported by MSR_IA32_VMX_BASIC.
-		 *
-		 * However, even though not explicitly documented by
-		 * TLFS, VMXArea passed as VMXON argument should
-		 * still be marked with revision_id reported by
-		 * physical CPU.
-		 */
-		if (static_branch_unlikely(&enable_evmcs))
-			vmcs->hdr.revision_id = vmcs_config.revision_id;
-
-		per_cpu(vmxarea, cpu) = vmcs;
-	}
-	return 0;
-}
-
-static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
-		struct kvm_segment *save)
-{
-	if (!emulate_invalid_guest_state) {
-		/*
-		 * CS and SS RPL should be equal during guest entry according
-		 * to VMX spec, but in reality it is not always so. Since vcpu
-		 * is in the middle of the transition from real mode to
-		 * protected mode it is safe to assume that RPL 0 is a good
-		 * default value.
-		 */
-		if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
-			save->selector &= ~SEGMENT_RPL_MASK;
-		save->dpl = save->selector & SEGMENT_RPL_MASK;
-		save->s = 1;
-	}
-	vmx_set_segment(vcpu, save, seg);
-}
-
-static void enter_pmode(struct kvm_vcpu *vcpu)
-{
-	unsigned long flags;
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	/*
-	 * Update real mode segment cache. It may be not up-to-date if sement
-	 * register was written while vcpu was in a guest mode.
-	 */
-	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
-	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
-	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
-	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
-	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
-	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
-
-	vmx->rmode.vm86_active = 0;
-
-	vmx_segment_cache_clear(vmx);
-
-	vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
-
-	flags = vmcs_readl(GUEST_RFLAGS);
-	flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
-	flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
-	vmcs_writel(GUEST_RFLAGS, flags);
-
-	vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
-			(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
-
-	update_exception_bitmap(vcpu);
-
-	fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
-	fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
-	fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
-	fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
-	fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
-	fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
-}
-
-static void fix_rmode_seg(int seg, struct kvm_segment *save)
-{
-	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-	struct kvm_segment var = *save;
-
-	var.dpl = 0x3;
-	if (seg == VCPU_SREG_CS)
-		var.type = 0x3;
-
-	if (!emulate_invalid_guest_state) {
-		var.selector = var.base >> 4;
-		var.base = var.base & 0xffff0;
-		var.limit = 0xffff;
-		var.g = 0;
-		var.db = 0;
-		var.present = 1;
-		var.s = 1;
-		var.l = 0;
-		var.unusable = 0;
-		var.type = 0x3;
-		var.avl = 0;
-		if (save->base & 0xf)
-			printk_once(KERN_WARNING "kvm: segment base is not "
-					"paragraph aligned when entering "
-					"protected mode (seg=%d)", seg);
-	}
-
-	vmcs_write16(sf->selector, var.selector);
-	vmcs_writel(sf->base, var.base);
-	vmcs_write32(sf->limit, var.limit);
-	vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
-}
-
-static void enter_rmode(struct kvm_vcpu *vcpu)
-{
-	unsigned long flags;
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
-
-	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
-	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
-	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
-	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
-	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
-	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
-	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
-
-	vmx->rmode.vm86_active = 1;
-
-	/*
-	 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
-	 * vcpu. Warn the user that an update is overdue.
-	 */
-	if (!kvm_vmx->tss_addr)
-		printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
-			     "called before entering vcpu\n");
-
-	vmx_segment_cache_clear(vmx);
-
-	vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
-	vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
-	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
-
-	flags = vmcs_readl(GUEST_RFLAGS);
-	vmx->rmode.save_rflags = flags;
-
-	flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
-
-	vmcs_writel(GUEST_RFLAGS, flags);
-	vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
-	update_exception_bitmap(vcpu);
-
-	fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
-	fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
-	fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
-	fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
-	fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
-	fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
-
-	kvm_mmu_reset_context(vcpu);
-}
-
-void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
-
-	if (!msr)
-		return;
-
-	vcpu->arch.efer = efer;
-	if (efer & EFER_LMA) {
-		vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
-		msr->data = efer;
-	} else {
-		vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
-
-		msr->data = efer & ~EFER_LME;
-	}
-	setup_msrs(vmx);
-}
-
-#ifdef CONFIG_X86_64
-
-static void enter_lmode(struct kvm_vcpu *vcpu)
-{
-	u32 guest_tr_ar;
-
-	vmx_segment_cache_clear(to_vmx(vcpu));
-
-	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
-	if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
-		pr_debug_ratelimited("%s: tss fixup for long mode. \n",
-				     __func__);
-		vmcs_write32(GUEST_TR_AR_BYTES,
-			     (guest_tr_ar & ~VMX_AR_TYPE_MASK)
-			     | VMX_AR_TYPE_BUSY_64_TSS);
-	}
-	vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
-}
-
-static void exit_lmode(struct kvm_vcpu *vcpu)
-{
-	vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
-	vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
-}
-
-#endif
-
-static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
-{
-	int vpid = to_vmx(vcpu)->vpid;
-
-	if (!vpid_sync_vcpu_addr(vpid, addr))
-		vpid_sync_context(vpid);
-
-	/*
-	 * If VPIDs are not supported or enabled, then the above is a no-op.
-	 * But we don't really need a TLB flush in that case anyway, because
-	 * each VM entry/exit includes an implicit flush when VPID is 0.
-	 */
-}
-
-static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
-{
-	ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
-
-	vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
-	vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
-}
-
-static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
-{
-	ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
-
-	vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
-	vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
-}
-
-static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
-
-	if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
-		return;
-
-	if (is_pae_paging(vcpu)) {
-		vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
-		vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
-		vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
-		vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
-	}
-}
-
-void ept_save_pdptrs(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
-
-	if (is_pae_paging(vcpu)) {
-		mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
-		mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
-		mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
-		mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
-	}
-
-	kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
-}
-
-static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
-					unsigned long cr0,
-					struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
-		vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
-	if (!(cr0 & X86_CR0_PG)) {
-		/* From paging/starting to nonpaging */
-		exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
-					  CPU_BASED_CR3_STORE_EXITING);
-		vcpu->arch.cr0 = cr0;
-		vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
-	} else if (!is_paging(vcpu)) {
-		/* From nonpaging to paging */
-		exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
-					    CPU_BASED_CR3_STORE_EXITING);
-		vcpu->arch.cr0 = cr0;
-		vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
-	}
-
-	if (!(cr0 & X86_CR0_WP))
-		*hw_cr0 &= ~X86_CR0_WP;
-}
-
-void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned long hw_cr0;
-
-	hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
-	if (enable_unrestricted_guest)
-		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
-	else {
-		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
-
-		if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
-			enter_pmode(vcpu);
-
-		if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
-			enter_rmode(vcpu);
-	}
-
-#ifdef CONFIG_X86_64
-	if (vcpu->arch.efer & EFER_LME) {
-		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
-			enter_lmode(vcpu);
-		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
-			exit_lmode(vcpu);
-	}
-#endif
-
-	if (enable_ept && !enable_unrestricted_guest)
-		ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
-
-	vmcs_writel(CR0_READ_SHADOW, cr0);
-	vmcs_writel(GUEST_CR0, hw_cr0);
-	vcpu->arch.cr0 = cr0;
-
-	/* depends on vcpu->arch.cr0 to be set to a new value */
-	vmx->emulation_required = emulation_required(vcpu);
-}
-
-static int get_ept_level(struct kvm_vcpu *vcpu)
-{
-	if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
-		return 5;
-	return 4;
-}
-
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
-{
-	u64 eptp = VMX_EPTP_MT_WB;
-
-	eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
-
-	if (enable_ept_ad_bits &&
-	    (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
-		eptp |= VMX_EPTP_AD_ENABLE_BIT;
-	eptp |= (root_hpa & PAGE_MASK);
-
-	return eptp;
-}
-
-void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
-{
-	struct kvm *kvm = vcpu->kvm;
-	bool update_guest_cr3 = true;
-	unsigned long guest_cr3;
-	u64 eptp;
-
-	guest_cr3 = cr3;
-	if (enable_ept) {
-		eptp = construct_eptp(vcpu, cr3);
-		vmcs_write64(EPT_POINTER, eptp);
-
-		if (kvm_x86_ops->tlb_remote_flush) {
-			spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
-			to_vmx(vcpu)->ept_pointer = eptp;
-			to_kvm_vmx(kvm)->ept_pointers_match
-				= EPT_POINTERS_CHECK;
-			spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
-		}
-
-		/* Loading vmcs02.GUEST_CR3 is handled by nested VM-Enter. */
-		if (is_guest_mode(vcpu))
-			update_guest_cr3 = false;
-		else if (!enable_unrestricted_guest && !is_paging(vcpu))
-			guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
-		else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
-			guest_cr3 = vcpu->arch.cr3;
-		else /* vmcs01.GUEST_CR3 is already up-to-date. */
-			update_guest_cr3 = false;
-		ept_load_pdptrs(vcpu);
-	}
-
-	if (update_guest_cr3)
-		vmcs_writel(GUEST_CR3, guest_cr3);
-}
-
-int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	/*
-	 * Pass through host's Machine Check Enable value to hw_cr4, which
-	 * is in force while we are in guest mode.  Do not let guests control
-	 * this bit, even if host CR4.MCE == 0.
-	 */
-	unsigned long hw_cr4;
-
-	hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
-	if (enable_unrestricted_guest)
-		hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
-	else if (vmx->rmode.vm86_active)
-		hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
-	else
-		hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
-
-	if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
-		if (cr4 & X86_CR4_UMIP) {
-			secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
-			hw_cr4 &= ~X86_CR4_UMIP;
-		} else if (!is_guest_mode(vcpu) ||
-			!nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
-			secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
-		}
-	}
-
-	if (cr4 & X86_CR4_VMXE) {
-		/*
-		 * To use VMXON (and later other VMX instructions), a guest
-		 * must first be able to turn on cr4.VMXE (see handle_vmon()).
-		 * So basically the check on whether to allow nested VMX
-		 * is here.  We operate under the default treatment of SMM,
-		 * so VMX cannot be enabled under SMM.
-		 */
-		if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
-			return 1;
-	}
-
-	if (vmx->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
-		return 1;
-
-	vcpu->arch.cr4 = cr4;
-
-	if (!enable_unrestricted_guest) {
-		if (enable_ept) {
-			if (!is_paging(vcpu)) {
-				hw_cr4 &= ~X86_CR4_PAE;
-				hw_cr4 |= X86_CR4_PSE;
-			} else if (!(cr4 & X86_CR4_PAE)) {
-				hw_cr4 &= ~X86_CR4_PAE;
-			}
-		}
-
-		/*
-		 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
-		 * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
-		 * to be manually disabled when guest switches to non-paging
-		 * mode.
-		 *
-		 * If !enable_unrestricted_guest, the CPU is always running
-		 * with CR0.PG=1 and CR4 needs to be modified.
-		 * If enable_unrestricted_guest, the CPU automatically
-		 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
-		 */
-		if (!is_paging(vcpu))
-			hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
-	}
-
-	vmcs_writel(CR4_READ_SHADOW, cr4);
-	vmcs_writel(GUEST_CR4, hw_cr4);
-	return 0;
-}
-
-void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u32 ar;
-
-	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
-		*var = vmx->rmode.segs[seg];
-		if (seg == VCPU_SREG_TR
-		    || var->selector == vmx_read_guest_seg_selector(vmx, seg))
-			return;
-		var->base = vmx_read_guest_seg_base(vmx, seg);
-		var->selector = vmx_read_guest_seg_selector(vmx, seg);
-		return;
-	}
-	var->base = vmx_read_guest_seg_base(vmx, seg);
-	var->limit = vmx_read_guest_seg_limit(vmx, seg);
-	var->selector = vmx_read_guest_seg_selector(vmx, seg);
-	ar = vmx_read_guest_seg_ar(vmx, seg);
-	var->unusable = (ar >> 16) & 1;
-	var->type = ar & 15;
-	var->s = (ar >> 4) & 1;
-	var->dpl = (ar >> 5) & 3;
-	/*
-	 * Some userspaces do not preserve unusable property. Since usable
-	 * segment has to be present according to VMX spec we can use present
-	 * property to amend userspace bug by making unusable segment always
-	 * nonpresent. vmx_segment_access_rights() already marks nonpresent
-	 * segment as unusable.
-	 */
-	var->present = !var->unusable;
-	var->avl = (ar >> 12) & 1;
-	var->l = (ar >> 13) & 1;
-	var->db = (ar >> 14) & 1;
-	var->g = (ar >> 15) & 1;
-}
-
-static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
-{
-	struct kvm_segment s;
-
-	if (to_vmx(vcpu)->rmode.vm86_active) {
-		vmx_get_segment(vcpu, &s, seg);
-		return s.base;
-	}
-	return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
-}
-
-int vmx_get_cpl(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	if (unlikely(vmx->rmode.vm86_active))
-		return 0;
-	else {
-		int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
-		return VMX_AR_DPL(ar);
-	}
-}
-
-static u32 vmx_segment_access_rights(struct kvm_segment *var)
-{
-	u32 ar;
-
-	if (var->unusable || !var->present)
-		ar = 1 << 16;
-	else {
-		ar = var->type & 15;
-		ar |= (var->s & 1) << 4;
-		ar |= (var->dpl & 3) << 5;
-		ar |= (var->present & 1) << 7;
-		ar |= (var->avl & 1) << 12;
-		ar |= (var->l & 1) << 13;
-		ar |= (var->db & 1) << 14;
-		ar |= (var->g & 1) << 15;
-	}
-
-	return ar;
-}
-
-void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-
-	vmx_segment_cache_clear(vmx);
-
-	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
-		vmx->rmode.segs[seg] = *var;
-		if (seg == VCPU_SREG_TR)
-			vmcs_write16(sf->selector, var->selector);
-		else if (var->s)
-			fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
-		goto out;
-	}
-
-	vmcs_writel(sf->base, var->base);
-	vmcs_write32(sf->limit, var->limit);
-	vmcs_write16(sf->selector, var->selector);
-
-	/*
-	 *   Fix the "Accessed" bit in AR field of segment registers for older
-	 * qemu binaries.
-	 *   IA32 arch specifies that at the time of processor reset the
-	 * "Accessed" bit in the AR field of segment registers is 1. And qemu
-	 * is setting it to 0 in the userland code. This causes invalid guest
-	 * state vmexit when "unrestricted guest" mode is turned on.
-	 *    Fix for this setup issue in cpu_reset is being pushed in the qemu
-	 * tree. Newer qemu binaries with that qemu fix would not need this
-	 * kvm hack.
-	 */
-	if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
-		var->type |= 0x1; /* Accessed */
-
-	vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
-
-out:
-	vmx->emulation_required = emulation_required(vcpu);
-}
-
-static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
-{
-	u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
-
-	*db = (ar >> 14) & 1;
-	*l = (ar >> 13) & 1;
-}
-
-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
-{
-	dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
-	dt->address = vmcs_readl(GUEST_IDTR_BASE);
-}
-
-static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
-{
-	vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
-	vmcs_writel(GUEST_IDTR_BASE, dt->address);
-}
-
-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
-{
-	dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
-	dt->address = vmcs_readl(GUEST_GDTR_BASE);
-}
-
-static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
-{
-	vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
-	vmcs_writel(GUEST_GDTR_BASE, dt->address);
-}
-
-static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
-{
-	struct kvm_segment var;
-	u32 ar;
-
-	vmx_get_segment(vcpu, &var, seg);
-	var.dpl = 0x3;
-	if (seg == VCPU_SREG_CS)
-		var.type = 0x3;
-	ar = vmx_segment_access_rights(&var);
-
-	if (var.base != (var.selector << 4))
-		return false;
-	if (var.limit != 0xffff)
-		return false;
-	if (ar != 0xf3)
-		return false;
-
-	return true;
-}
-
-static bool code_segment_valid(struct kvm_vcpu *vcpu)
-{
-	struct kvm_segment cs;
-	unsigned int cs_rpl;
-
-	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
-	cs_rpl = cs.selector & SEGMENT_RPL_MASK;
-
-	if (cs.unusable)
-		return false;
-	if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
-		return false;
-	if (!cs.s)
-		return false;
-	if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
-		if (cs.dpl > cs_rpl)
-			return false;
-	} else {
-		if (cs.dpl != cs_rpl)
-			return false;
-	}
-	if (!cs.present)
-		return false;
-
-	/* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
-	return true;
-}
-
-static bool stack_segment_valid(struct kvm_vcpu *vcpu)
-{
-	struct kvm_segment ss;
-	unsigned int ss_rpl;
-
-	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
-	ss_rpl = ss.selector & SEGMENT_RPL_MASK;
-
-	if (ss.unusable)
-		return true;
-	if (ss.type != 3 && ss.type != 7)
-		return false;
-	if (!ss.s)
-		return false;
-	if (ss.dpl != ss_rpl) /* DPL != RPL */
-		return false;
-	if (!ss.present)
-		return false;
-
-	return true;
-}
-
-static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
-{
-	struct kvm_segment var;
-	unsigned int rpl;
-
-	vmx_get_segment(vcpu, &var, seg);
-	rpl = var.selector & SEGMENT_RPL_MASK;
-
-	if (var.unusable)
-		return true;
-	if (!var.s)
-		return false;
-	if (!var.present)
-		return false;
-	if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
-		if (var.dpl < rpl) /* DPL < RPL */
-			return false;
-	}
-
-	/* TODO: Add other members to kvm_segment_field to allow checking for other access
-	 * rights flags
-	 */
-	return true;
-}
-
-static bool tr_valid(struct kvm_vcpu *vcpu)
-{
-	struct kvm_segment tr;
-
-	vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
-
-	if (tr.unusable)
-		return false;
-	if (tr.selector & SEGMENT_TI_MASK)	/* TI = 1 */
-		return false;
-	if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
-		return false;
-	if (!tr.present)
-		return false;
-
-	return true;
-}
-
-static bool ldtr_valid(struct kvm_vcpu *vcpu)
-{
-	struct kvm_segment ldtr;
-
-	vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
-
-	if (ldtr.unusable)
-		return true;
-	if (ldtr.selector & SEGMENT_TI_MASK)	/* TI = 1 */
-		return false;
-	if (ldtr.type != 2)
-		return false;
-	if (!ldtr.present)
-		return false;
-
-	return true;
-}
-
-static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
-{
-	struct kvm_segment cs, ss;
-
-	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
-	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
-
-	return ((cs.selector & SEGMENT_RPL_MASK) ==
-		 (ss.selector & SEGMENT_RPL_MASK));
-}
-
-/*
- * Check if guest state is valid. Returns true if valid, false if
- * not.
- * We assume that registers are always usable
- */
-static bool guest_state_valid(struct kvm_vcpu *vcpu)
-{
-	if (enable_unrestricted_guest)
-		return true;
-
-	/* real mode guest state checks */
-	if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
-		if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
-			return false;
-		if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
-			return false;
-		if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
-			return false;
-		if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
-			return false;
-		if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
-			return false;
-		if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
-			return false;
-	} else {
-	/* protected mode guest state checks */
-		if (!cs_ss_rpl_check(vcpu))
-			return false;
-		if (!code_segment_valid(vcpu))
-			return false;
-		if (!stack_segment_valid(vcpu))
-			return false;
-		if (!data_segment_valid(vcpu, VCPU_SREG_DS))
-			return false;
-		if (!data_segment_valid(vcpu, VCPU_SREG_ES))
-			return false;
-		if (!data_segment_valid(vcpu, VCPU_SREG_FS))
-			return false;
-		if (!data_segment_valid(vcpu, VCPU_SREG_GS))
-			return false;
-		if (!tr_valid(vcpu))
-			return false;
-		if (!ldtr_valid(vcpu))
-			return false;
-	}
-	/* TODO:
-	 * - Add checks on RIP
-	 * - Add checks on RFLAGS
-	 */
-
-	return true;
-}
-
-static int init_rmode_tss(struct kvm *kvm)
-{
-	gfn_t fn;
-	u16 data = 0;
-	int idx, r;
-
-	idx = srcu_read_lock(&kvm->srcu);
-	fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
-	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
-	if (r < 0)
-		goto out;
-	data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
-	r = kvm_write_guest_page(kvm, fn++, &data,
-			TSS_IOPB_BASE_OFFSET, sizeof(u16));
-	if (r < 0)
-		goto out;
-	r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
-	if (r < 0)
-		goto out;
-	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
-	if (r < 0)
-		goto out;
-	data = ~0;
-	r = kvm_write_guest_page(kvm, fn, &data,
-				 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
-				 sizeof(u8));
-out:
-	srcu_read_unlock(&kvm->srcu, idx);
-	return r;
-}
-
-static int init_rmode_identity_map(struct kvm *kvm)
-{
-	struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
-	int i, idx, r = 0;
-	kvm_pfn_t identity_map_pfn;
-	u32 tmp;
-
-	/* Protect kvm_vmx->ept_identity_pagetable_done. */
-	mutex_lock(&kvm->slots_lock);
-
-	if (likely(kvm_vmx->ept_identity_pagetable_done))
-		goto out2;
-
-	if (!kvm_vmx->ept_identity_map_addr)
-		kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
-	identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
-
-	r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
-				    kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
-	if (r < 0)
-		goto out2;
-
-	idx = srcu_read_lock(&kvm->srcu);
-	r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
-	if (r < 0)
-		goto out;
-	/* Set up identity-mapping pagetable for EPT in real mode */
-	for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
-		tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
-			_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
-		r = kvm_write_guest_page(kvm, identity_map_pfn,
-				&tmp, i * sizeof(tmp), sizeof(tmp));
-		if (r < 0)
-			goto out;
-	}
-	kvm_vmx->ept_identity_pagetable_done = true;
-
-out:
-	srcu_read_unlock(&kvm->srcu, idx);
-
-out2:
-	mutex_unlock(&kvm->slots_lock);
-	return r;
-}
-
-static void seg_setup(int seg)
-{
-	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
-	unsigned int ar;
-
-	vmcs_write16(sf->selector, 0);
-	vmcs_writel(sf->base, 0);
-	vmcs_write32(sf->limit, 0xffff);
-	ar = 0x93;
-	if (seg == VCPU_SREG_CS)
-		ar |= 0x08; /* code segment */
-
-	vmcs_write32(sf->ar_bytes, ar);
-}
-
-static int alloc_apic_access_page(struct kvm *kvm)
-{
-	struct page *page;
-	int r = 0;
-
-	mutex_lock(&kvm->slots_lock);
-	if (kvm->arch.apic_access_page_done)
-		goto out;
-	r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
-				    APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
-	if (r)
-		goto out;
-
-	page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
-	if (is_error_page(page)) {
-		r = -EFAULT;
-		goto out;
-	}
-
-	/*
-	 * Do not pin the page in memory, so that memory hot-unplug
-	 * is able to migrate it.
-	 */
-	put_page(page);
-	kvm->arch.apic_access_page_done = true;
-out:
-	mutex_unlock(&kvm->slots_lock);
-	return r;
-}
-
-int allocate_vpid(void)
-{
-	int vpid;
-
-	if (!enable_vpid)
-		return 0;
-	spin_lock(&vmx_vpid_lock);
-	vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
-	if (vpid < VMX_NR_VPIDS)
-		__set_bit(vpid, vmx_vpid_bitmap);
-	else
-		vpid = 0;
-	spin_unlock(&vmx_vpid_lock);
-	return vpid;
-}
-
-void free_vpid(int vpid)
-{
-	if (!enable_vpid || vpid == 0)
-		return;
-	spin_lock(&vmx_vpid_lock);
-	__clear_bit(vpid, vmx_vpid_bitmap);
-	spin_unlock(&vmx_vpid_lock);
-}
-
-static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
-							  u32 msr, int type)
-{
-	int f = sizeof(unsigned long);
-
-	if (!cpu_has_vmx_msr_bitmap())
-		return;
-
-	if (static_branch_unlikely(&enable_evmcs))
-		evmcs_touch_msr_bitmap();
-
-	/*
-	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
-	 * have the write-low and read-high bitmap offsets the wrong way round.
-	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
-	 */
-	if (msr <= 0x1fff) {
-		if (type & MSR_TYPE_R)
-			/* read-low */
-			__clear_bit(msr, msr_bitmap + 0x000 / f);
-
-		if (type & MSR_TYPE_W)
-			/* write-low */
-			__clear_bit(msr, msr_bitmap + 0x800 / f);
-
-	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
-		msr &= 0x1fff;
-		if (type & MSR_TYPE_R)
-			/* read-high */
-			__clear_bit(msr, msr_bitmap + 0x400 / f);
-
-		if (type & MSR_TYPE_W)
-			/* write-high */
-			__clear_bit(msr, msr_bitmap + 0xc00 / f);
-
-	}
-}
-
-static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
-							 u32 msr, int type)
-{
-	int f = sizeof(unsigned long);
-
-	if (!cpu_has_vmx_msr_bitmap())
-		return;
-
-	if (static_branch_unlikely(&enable_evmcs))
-		evmcs_touch_msr_bitmap();
-
-	/*
-	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
-	 * have the write-low and read-high bitmap offsets the wrong way round.
-	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
-	 */
-	if (msr <= 0x1fff) {
-		if (type & MSR_TYPE_R)
-			/* read-low */
-			__set_bit(msr, msr_bitmap + 0x000 / f);
-
-		if (type & MSR_TYPE_W)
-			/* write-low */
-			__set_bit(msr, msr_bitmap + 0x800 / f);
-
-	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
-		msr &= 0x1fff;
-		if (type & MSR_TYPE_R)
-			/* read-high */
-			__set_bit(msr, msr_bitmap + 0x400 / f);
-
-		if (type & MSR_TYPE_W)
-			/* write-high */
-			__set_bit(msr, msr_bitmap + 0xc00 / f);
-
-	}
-}
-
-static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
-			     			      u32 msr, int type, bool value)
-{
-	if (value)
-		vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
-	else
-		vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
-}
-
-static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
-{
-	u8 mode = 0;
-
-	if (cpu_has_secondary_exec_ctrls() &&
-	    (secondary_exec_controls_get(to_vmx(vcpu)) &
-	     SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
-		mode |= MSR_BITMAP_MODE_X2APIC;
-		if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
-			mode |= MSR_BITMAP_MODE_X2APIC_APICV;
-	}
-
-	return mode;
-}
-
-static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
-					 u8 mode)
-{
-	int msr;
-
-	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
-		unsigned word = msr / BITS_PER_LONG;
-		msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
-		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
-	}
-
-	if (mode & MSR_BITMAP_MODE_X2APIC) {
-		/*
-		 * TPR reads and writes can be virtualized even if virtual interrupt
-		 * delivery is not in use.
-		 */
-		vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
-		if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
-			vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
-			vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
-			vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
-		}
-	}
-}
-
-void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
-	u8 mode = vmx_msr_bitmap_mode(vcpu);
-	u8 changed = mode ^ vmx->msr_bitmap_mode;
-
-	if (!changed)
-		return;
-
-	if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
-		vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
-
-	vmx->msr_bitmap_mode = mode;
-}
-
-void pt_update_intercept_for_msr(struct vcpu_vmx *vmx)
-{
-	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
-	bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
-	u32 i;
-
-	vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS,
-							MSR_TYPE_RW, flag);
-	vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE,
-							MSR_TYPE_RW, flag);
-	vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK,
-							MSR_TYPE_RW, flag);
-	vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH,
-							MSR_TYPE_RW, flag);
-	for (i = 0; i < vmx->pt_desc.addr_range; i++) {
-		vmx_set_intercept_for_msr(msr_bitmap,
-			MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
-		vmx_set_intercept_for_msr(msr_bitmap,
-			MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
-	}
-}
-
-static bool vmx_get_enable_apicv(struct kvm *kvm)
-{
-	return enable_apicv;
-}
-
-static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	void *vapic_page;
-	u32 vppr;
-	int rvi;
-
-	if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
-		!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
-		WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
-		return false;
-
-	rvi = vmx_get_rvi();
-
-	vapic_page = vmx->nested.virtual_apic_map.hva;
-	vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
-
-	return ((rvi & 0xf0) > (vppr & 0xf0));
-}
-
-static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
-						     bool nested)
-{
-#ifdef CONFIG_SMP
-	int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
-
-	if (vcpu->mode == IN_GUEST_MODE) {
-		/*
-		 * The vector of interrupt to be delivered to vcpu had
-		 * been set in PIR before this function.
-		 *
-		 * Following cases will be reached in this block, and
-		 * we always send a notification event in all cases as
-		 * explained below.
-		 *
-		 * Case 1: vcpu keeps in non-root mode. Sending a
-		 * notification event posts the interrupt to vcpu.
-		 *
-		 * Case 2: vcpu exits to root mode and is still
-		 * runnable. PIR will be synced to vIRR before the
-		 * next vcpu entry. Sending a notification event in
-		 * this case has no effect, as vcpu is not in root
-		 * mode.
-		 *
-		 * Case 3: vcpu exits to root mode and is blocked.
-		 * vcpu_block() has already synced PIR to vIRR and
-		 * never blocks vcpu if vIRR is not cleared. Therefore,
-		 * a blocked vcpu here does not wait for any requested
-		 * interrupts in PIR, and sending a notification event
-		 * which has no effect is safe here.
-		 */
-
-		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
-		return true;
-	}
-#endif
-	return false;
-}
-
-static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
-						int vector)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	if (is_guest_mode(vcpu) &&
-	    vector == vmx->nested.posted_intr_nv) {
-		/*
-		 * If a posted intr is not recognized by hardware,
-		 * we will accomplish it in the next vmentry.
-		 */
-		vmx->nested.pi_pending = true;
-		kvm_make_request(KVM_REQ_EVENT, vcpu);
-		/* the PIR and ON have been set by L1. */
-		if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
-			kvm_vcpu_kick(vcpu);
-		return 0;
-	}
-	return -1;
-}
-/*
- * Send interrupt to vcpu via posted interrupt way.
- * 1. If target vcpu is running(non-root mode), send posted interrupt
- * notification to vcpu and hardware will sync PIR to vIRR atomically.
- * 2. If target vcpu isn't running(root mode), kick it to pick up the
- * interrupt from PIR in next vmentry.
- */
-static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	int r;
-
-	r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
-	if (!r)
-		return;
-
-	if (pi_test_and_set_pir(vector, &vmx->pi_desc))
-		return;
-
-	/* If a previous notification has sent the IPI, nothing to do.  */
-	if (pi_test_and_set_on(&vmx->pi_desc))
-		return;
-
-	if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
-		kvm_vcpu_kick(vcpu);
-}
-
-/*
- * Set up the vmcs's constant host-state fields, i.e., host-state fields that
- * will not change in the lifetime of the guest.
- * Note that host-state that does change is set elsewhere. E.g., host-state
- * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
- */
-void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
-{
-	u32 low32, high32;
-	unsigned long tmpl;
-	unsigned long cr0, cr3, cr4;
-
-	cr0 = read_cr0();
-	WARN_ON(cr0 & X86_CR0_TS);
-	vmcs_writel(HOST_CR0, cr0);  /* 22.2.3 */
-
-	/*
-	 * Save the most likely value for this task's CR3 in the VMCS.
-	 * We can't use __get_current_cr3_fast() because we're not atomic.
-	 */
-	cr3 = __read_cr3();
-	vmcs_writel(HOST_CR3, cr3);		/* 22.2.3  FIXME: shadow tables */
-	vmx->loaded_vmcs->host_state.cr3 = cr3;
-
-	/* Save the most likely value for this task's CR4 in the VMCS. */
-	cr4 = cr4_read_shadow();
-	vmcs_writel(HOST_CR4, cr4);			/* 22.2.3, 22.2.5 */
-	vmx->loaded_vmcs->host_state.cr4 = cr4;
-
-	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
-#ifdef CONFIG_X86_64
-	/*
-	 * Load null selectors, so we can avoid reloading them in
-	 * vmx_prepare_switch_to_host(), in case userspace uses
-	 * the null selectors too (the expected case).
-	 */
-	vmcs_write16(HOST_DS_SELECTOR, 0);
-	vmcs_write16(HOST_ES_SELECTOR, 0);
-#else
-	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
-	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
-#endif
-	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
-	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
-
-	vmcs_writel(HOST_IDTR_BASE, host_idt_base);   /* 22.2.4 */
-
-	vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
-
-	rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
-	vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
-	rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
-	vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
-
-	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
-		rdmsr(MSR_IA32_CR_PAT, low32, high32);
-		vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
-	}
-
-	if (cpu_has_load_ia32_efer())
-		vmcs_write64(HOST_IA32_EFER, host_efer);
-}
-
-void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
-{
-	vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
-	if (enable_ept)
-		vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
-	if (is_guest_mode(&vmx->vcpu))
-		vmx->vcpu.arch.cr4_guest_owned_bits &=
-			~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
-	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
-}
-
-u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
-{
-	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
-
-	if (!kvm_vcpu_apicv_active(&vmx->vcpu))
-		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
-
-	if (!enable_vnmi)
-		pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
-
-	if (!enable_preemption_timer)
-		pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
-
-	return pin_based_exec_ctrl;
-}
-
-static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
-	if (cpu_has_secondary_exec_ctrls()) {
-		if (kvm_vcpu_apicv_active(vcpu))
-			secondary_exec_controls_setbit(vmx,
-				      SECONDARY_EXEC_APIC_REGISTER_VIRT |
-				      SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
-		else
-			secondary_exec_controls_clearbit(vmx,
-					SECONDARY_EXEC_APIC_REGISTER_VIRT |
-					SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
-	}
-
-	if (cpu_has_vmx_msr_bitmap())
-		vmx_update_msr_bitmap(vcpu);
-}
-
-u32 vmx_exec_control(struct vcpu_vmx *vmx)
-{
-	u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
-
-	if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
-		exec_control &= ~CPU_BASED_MOV_DR_EXITING;
-
-	if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
-		exec_control &= ~CPU_BASED_TPR_SHADOW;
-#ifdef CONFIG_X86_64
-		exec_control |= CPU_BASED_CR8_STORE_EXITING |
-				CPU_BASED_CR8_LOAD_EXITING;
-#endif
-	}
-	if (!enable_ept)
-		exec_control |= CPU_BASED_CR3_STORE_EXITING |
-				CPU_BASED_CR3_LOAD_EXITING  |
-				CPU_BASED_INVLPG_EXITING;
-	if (kvm_mwait_in_guest(vmx->vcpu.kvm))
-		exec_control &= ~(CPU_BASED_MWAIT_EXITING |
-				CPU_BASED_MONITOR_EXITING);
-	if (kvm_hlt_in_guest(vmx->vcpu.kvm))
-		exec_control &= ~CPU_BASED_HLT_EXITING;
-	return exec_control;
-}
-
-
-static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
-{
-	struct kvm_vcpu *vcpu = &vmx->vcpu;
-
-	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
-
-	if (pt_mode == PT_MODE_SYSTEM)
-		exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
-	if (!cpu_need_virtualize_apic_accesses(vcpu))
-		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-	if (vmx->vpid == 0)
-		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
-	if (!enable_ept) {
-		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
-		enable_unrestricted_guest = 0;
-	}
-	if (!enable_unrestricted_guest)
-		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
-	if (kvm_pause_in_guest(vmx->vcpu.kvm))
-		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
-	if (!kvm_vcpu_apicv_active(vcpu))
-		exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
-				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
-	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
-
-	/* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
-	 * in vmx_set_cr4.  */
-	exec_control &= ~SECONDARY_EXEC_DESC;
-
-	/* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
-	   (handle_vmptrld).
-	   We can NOT enable shadow_vmcs here because we don't have yet
-	   a current VMCS12
-	*/
-	exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
-
-	if (!enable_pml)
-		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
-
-	if (vmx_xsaves_supported()) {
-		/* Exposing XSAVES only when XSAVE is exposed */
-		bool xsaves_enabled =
-			guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
-			guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
-
-		vcpu->arch.xsaves_enabled = xsaves_enabled;
-
-		if (!xsaves_enabled)
-			exec_control &= ~SECONDARY_EXEC_XSAVES;
-
-		if (nested) {
-			if (xsaves_enabled)
-				vmx->nested.msrs.secondary_ctls_high |=
-					SECONDARY_EXEC_XSAVES;
-			else
-				vmx->nested.msrs.secondary_ctls_high &=
-					~SECONDARY_EXEC_XSAVES;
-		}
-	}
-
-	if (vmx_rdtscp_supported()) {
-		bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
-		if (!rdtscp_enabled)
-			exec_control &= ~SECONDARY_EXEC_RDTSCP;
-
-		if (nested) {
-			if (rdtscp_enabled)
-				vmx->nested.msrs.secondary_ctls_high |=
-					SECONDARY_EXEC_RDTSCP;
-			else
-				vmx->nested.msrs.secondary_ctls_high &=
-					~SECONDARY_EXEC_RDTSCP;
-		}
-	}
-
-	if (vmx_invpcid_supported()) {
-		/* Exposing INVPCID only when PCID is exposed */
-		bool invpcid_enabled =
-			guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
-			guest_cpuid_has(vcpu, X86_FEATURE_PCID);
-
-		if (!invpcid_enabled) {
-			exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
-			guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
-		}
-
-		if (nested) {
-			if (invpcid_enabled)
-				vmx->nested.msrs.secondary_ctls_high |=
-					SECONDARY_EXEC_ENABLE_INVPCID;
-			else
-				vmx->nested.msrs.secondary_ctls_high &=
-					~SECONDARY_EXEC_ENABLE_INVPCID;
-		}
-	}
-
-	if (vmx_rdrand_supported()) {
-		bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
-		if (rdrand_enabled)
-			exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
-
-		if (nested) {
-			if (rdrand_enabled)
-				vmx->nested.msrs.secondary_ctls_high |=
-					SECONDARY_EXEC_RDRAND_EXITING;
-			else
-				vmx->nested.msrs.secondary_ctls_high &=
-					~SECONDARY_EXEC_RDRAND_EXITING;
-		}
-	}
-
-	if (vmx_rdseed_supported()) {
-		bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
-		if (rdseed_enabled)
-			exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
-
-		if (nested) {
-			if (rdseed_enabled)
-				vmx->nested.msrs.secondary_ctls_high |=
-					SECONDARY_EXEC_RDSEED_EXITING;
-			else
-				vmx->nested.msrs.secondary_ctls_high &=
-					~SECONDARY_EXEC_RDSEED_EXITING;
-		}
-	}
-
-	if (vmx_waitpkg_supported()) {
-		bool waitpkg_enabled =
-			guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG);
-
-		if (!waitpkg_enabled)
-			exec_control &= ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
-
-		if (nested) {
-			if (waitpkg_enabled)
-				vmx->nested.msrs.secondary_ctls_high |=
-					SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
-			else
-				vmx->nested.msrs.secondary_ctls_high &=
-					~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
-		}
-	}
-
-	vmx->secondary_exec_control = exec_control;
-}
-
-static void ept_set_mmio_spte_mask(void)
-{
-	/*
-	 * EPT Misconfigurations can be generated if the value of bits 2:0
-	 * of an EPT paging-structure entry is 110b (write/execute).
-	 */
-	kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
-				   VMX_EPT_MISCONFIG_WX_VALUE, 0);
-}
-
-#define VMX_XSS_EXIT_BITMAP 0
-
-/*
- * Noting that the initialization of Guest-state Area of VMCS is in
- * vmx_vcpu_reset().
- */
-static void init_vmcs(struct vcpu_vmx *vmx)
-{
-	if (nested)
-		nested_vmx_set_vmcs_shadowing_bitmap();
-
-	if (cpu_has_vmx_msr_bitmap())
-		vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
-
-	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
-
-	/* Control */
-	pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
-
-	exec_controls_set(vmx, vmx_exec_control(vmx));
-
-	if (cpu_has_secondary_exec_ctrls()) {
-		vmx_compute_secondary_exec_control(vmx);
-		secondary_exec_controls_set(vmx, vmx->secondary_exec_control);
-	}
-
-	if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
-		vmcs_write64(EOI_EXIT_BITMAP0, 0);
-		vmcs_write64(EOI_EXIT_BITMAP1, 0);
-		vmcs_write64(EOI_EXIT_BITMAP2, 0);
-		vmcs_write64(EOI_EXIT_BITMAP3, 0);
-
-		vmcs_write16(GUEST_INTR_STATUS, 0);
-
-		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
-		vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
-	}
-
-	if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
-		vmcs_write32(PLE_GAP, ple_gap);
-		vmx->ple_window = ple_window;
-		vmx->ple_window_dirty = true;
-	}
-
-	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
-	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
-	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
-
-	vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
-	vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
-	vmx_set_constant_host_state(vmx);
-	vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
-	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
-
-	if (cpu_has_vmx_vmfunc())
-		vmcs_write64(VM_FUNCTION_CONTROL, 0);
-
-	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
-	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
-	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
-	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
-	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
-
-	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
-		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
-
-	vm_exit_controls_set(vmx, vmx_vmexit_ctrl());
-
-	/* 22.2.1, 20.8.1 */
-	vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
-
-	vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
-	vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
-
-	set_cr4_guest_host_mask(vmx);
-
-	if (vmx->vpid != 0)
-		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
-
-	if (vmx_xsaves_supported())
-		vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
-
-	if (enable_pml) {
-		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
-		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
-	}
-
-	if (cpu_has_vmx_encls_vmexit())
-		vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
-
-	if (pt_mode == PT_MODE_HOST_GUEST) {
-		memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
-		/* Bit[6~0] are forced to 1, writes are ignored. */
-		vmx->pt_desc.guest.output_mask = 0x7F;
-		vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
-	}
-}
-
-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct msr_data apic_base_msr;
-	u64 cr0;
-
-	vmx->rmode.vm86_active = 0;
-	vmx->spec_ctrl = 0;
-
-	vmx->msr_ia32_umwait_control = 0;
-
-	vcpu->arch.microcode_version = 0x100000000ULL;
-	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
-	vmx->hv_deadline_tsc = -1;
-	kvm_set_cr8(vcpu, 0);
-
-	if (!init_event) {
-		apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
-				     MSR_IA32_APICBASE_ENABLE;
-		if (kvm_vcpu_is_reset_bsp(vcpu))
-			apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
-		apic_base_msr.host_initiated = true;
-		kvm_set_apic_base(vcpu, &apic_base_msr);
-	}
-
-	vmx_segment_cache_clear(vmx);
-
-	seg_setup(VCPU_SREG_CS);
-	vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
-	vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
-
-	seg_setup(VCPU_SREG_DS);
-	seg_setup(VCPU_SREG_ES);
-	seg_setup(VCPU_SREG_FS);
-	seg_setup(VCPU_SREG_GS);
-	seg_setup(VCPU_SREG_SS);
-
-	vmcs_write16(GUEST_TR_SELECTOR, 0);
-	vmcs_writel(GUEST_TR_BASE, 0);
-	vmcs_write32(GUEST_TR_LIMIT, 0xffff);
-	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
-
-	vmcs_write16(GUEST_LDTR_SELECTOR, 0);
-	vmcs_writel(GUEST_LDTR_BASE, 0);
-	vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
-	vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
-
-	if (!init_event) {
-		vmcs_write32(GUEST_SYSENTER_CS, 0);
-		vmcs_writel(GUEST_SYSENTER_ESP, 0);
-		vmcs_writel(GUEST_SYSENTER_EIP, 0);
-		vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
-	}
-
-	kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
-	kvm_rip_write(vcpu, 0xfff0);
-
-	vmcs_writel(GUEST_GDTR_BASE, 0);
-	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
-
-	vmcs_writel(GUEST_IDTR_BASE, 0);
-	vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
-
-	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
-	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
-	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
-	if (kvm_mpx_supported())
-		vmcs_write64(GUEST_BNDCFGS, 0);
-
-	setup_msrs(vmx);
-
-	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
-
-	if (cpu_has_vmx_tpr_shadow() && !init_event) {
-		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
-		if (cpu_need_tpr_shadow(vcpu))
-			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
-				     __pa(vcpu->arch.apic->regs));
-		vmcs_write32(TPR_THRESHOLD, 0);
-	}
-
-	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
-
-	cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
-	vmx->vcpu.arch.cr0 = cr0;
-	vmx_set_cr0(vcpu, cr0); /* enter rmode */
-	vmx_set_cr4(vcpu, 0);
-	vmx_set_efer(vcpu, 0);
-
-	update_exception_bitmap(vcpu);
-
-	vpid_sync_context(vmx->vpid);
-	if (init_event)
-		vmx_clear_hlt(vcpu);
-}
-
-static void enable_irq_window(struct kvm_vcpu *vcpu)
-{
-	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
-}
-
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
-{
-	if (!enable_vnmi ||
-	    vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
-		enable_irq_window(vcpu);
-		return;
-	}
-
-	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
-}
-
-static void vmx_inject_irq(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	uint32_t intr;
-	int irq = vcpu->arch.interrupt.nr;
-
-	trace_kvm_inj_virq(irq);
-
-	++vcpu->stat.irq_injections;
-	if (vmx->rmode.vm86_active) {
-		int inc_eip = 0;
-		if (vcpu->arch.interrupt.soft)
-			inc_eip = vcpu->arch.event_exit_inst_len;
-		kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
-		return;
-	}
-	intr = irq | INTR_INFO_VALID_MASK;
-	if (vcpu->arch.interrupt.soft) {
-		intr |= INTR_TYPE_SOFT_INTR;
-		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
-			     vmx->vcpu.arch.event_exit_inst_len);
-	} else
-		intr |= INTR_TYPE_EXT_INTR;
-	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
-
-	vmx_clear_hlt(vcpu);
-}
-
-static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	if (!enable_vnmi) {
-		/*
-		 * Tracking the NMI-blocked state in software is built upon
-		 * finding the next open IRQ window. This, in turn, depends on
-		 * well-behaving guests: They have to keep IRQs disabled at
-		 * least as long as the NMI handler runs. Otherwise we may
-		 * cause NMI nesting, maybe breaking the guest. But as this is
-		 * highly unlikely, we can live with the residual risk.
-		 */
-		vmx->loaded_vmcs->soft_vnmi_blocked = 1;
-		vmx->loaded_vmcs->vnmi_blocked_time = 0;
-	}
-
-	++vcpu->stat.nmi_injections;
-	vmx->loaded_vmcs->nmi_known_unmasked = false;
-
-	if (vmx->rmode.vm86_active) {
-		kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
-		return;
-	}
-
-	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
-
-	vmx_clear_hlt(vcpu);
-}
-
-bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	bool masked;
-
-	if (!enable_vnmi)
-		return vmx->loaded_vmcs->soft_vnmi_blocked;
-	if (vmx->loaded_vmcs->nmi_known_unmasked)
-		return false;
-	masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
-	vmx->loaded_vmcs->nmi_known_unmasked = !masked;
-	return masked;
-}
-
-void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	if (!enable_vnmi) {
-		if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
-			vmx->loaded_vmcs->soft_vnmi_blocked = masked;
-			vmx->loaded_vmcs->vnmi_blocked_time = 0;
-		}
-	} else {
-		vmx->loaded_vmcs->nmi_known_unmasked = !masked;
-		if (masked)
-			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
-				      GUEST_INTR_STATE_NMI);
-		else
-			vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
-					GUEST_INTR_STATE_NMI);
-	}
-}
-
-static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
-{
-	if (to_vmx(vcpu)->nested.nested_run_pending)
-		return 0;
-
-	if (!enable_vnmi &&
-	    to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
-		return 0;
-
-	return	!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-		  (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
-		   | GUEST_INTR_STATE_NMI));
-}
-
-static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
-{
-	return (!to_vmx(vcpu)->nested.nested_run_pending &&
-		vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
-		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
-}
-
-static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
-{
-	int ret;
-
-	if (enable_unrestricted_guest)
-		return 0;
-
-	ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
-				    PAGE_SIZE * 3);
-	if (ret)
-		return ret;
-	to_kvm_vmx(kvm)->tss_addr = addr;
-	return init_rmode_tss(kvm);
-}
-
-static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
-{
-	to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
-	return 0;
-}
-
-static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
-{
-	switch (vec) {
-	case BP_VECTOR:
-		/*
-		 * Update instruction length as we may reinject the exception
-		 * from user space while in guest debugging mode.
-		 */
-		to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
-			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
-		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
-			return false;
-		/* fall through */
-	case DB_VECTOR:
-		if (vcpu->guest_debug &
-			(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
-			return false;
-		/* fall through */
-	case DE_VECTOR:
-	case OF_VECTOR:
-	case BR_VECTOR:
-	case UD_VECTOR:
-	case DF_VECTOR:
-	case SS_VECTOR:
-	case GP_VECTOR:
-	case MF_VECTOR:
-		return true;
-	break;
-	}
-	return false;
-}
-
-static int handle_rmode_exception(struct kvm_vcpu *vcpu,
-				  int vec, u32 err_code)
-{
-	/*
-	 * Instruction with address size override prefix opcode 0x67
-	 * Cause the #SS fault with 0 error code in VM86 mode.
-	 */
-	if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
-		if (kvm_emulate_instruction(vcpu, 0)) {
-			if (vcpu->arch.halt_request) {
-				vcpu->arch.halt_request = 0;
-				return kvm_vcpu_halt(vcpu);
-			}
-			return 1;
-		}
-		return 0;
-	}
-
-	/*
-	 * Forward all other exceptions that are valid in real mode.
-	 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
-	 *        the required debugging infrastructure rework.
-	 */
-	kvm_queue_exception(vcpu, vec);
-	return 1;
-}
-
-/*
- * Trigger machine check on the host. We assume all the MSRs are already set up
- * by the CPU and that we still run on the same CPU as the MCE occurred on.
- * We pass a fake environment to the machine check handler because we want
- * the guest to be always treated like user space, no matter what context
- * it used internally.
- */
-static void kvm_machine_check(void)
-{
-#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
-	struct pt_regs regs = {
-		.cs = 3, /* Fake ring 3 no matter what the guest ran on */
-		.flags = X86_EFLAGS_IF,
-	};
-
-	do_machine_check(&regs, 0);
-#endif
-}
-
-static int handle_machine_check(struct kvm_vcpu *vcpu)
-{
-	/* handled by vmx_vcpu_run() */
-	return 1;
-}
-
-static int handle_exception_nmi(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct kvm_run *kvm_run = vcpu->run;
-	u32 intr_info, ex_no, error_code;
-	unsigned long cr2, rip, dr6;
-	u32 vect_info;
-
-	vect_info = vmx->idt_vectoring_info;
-	intr_info = vmx->exit_intr_info;
-
-	if (is_machine_check(intr_info) || is_nmi(intr_info))
-		return 1; /* handled by handle_exception_nmi_irqoff() */
-
-	if (is_invalid_opcode(intr_info))
-		return handle_ud(vcpu);
-
-	error_code = 0;
-	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
-		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
-
-	if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
-		WARN_ON_ONCE(!enable_vmware_backdoor);
-
-		/*
-		 * VMware backdoor emulation on #GP interception only handles
-		 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
-		 * error code on #GP.
-		 */
-		if (error_code) {
-			kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
-			return 1;
-		}
-		return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
-	}
-
-	/*
-	 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing
-	 * MMIO, it is better to report an internal error.
-	 * See the comments in vmx_handle_exit.
-	 */
-	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
-	    !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
-		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
-		vcpu->run->internal.ndata = 3;
-		vcpu->run->internal.data[0] = vect_info;
-		vcpu->run->internal.data[1] = intr_info;
-		vcpu->run->internal.data[2] = error_code;
-		return 0;
-	}
-
-	if (is_page_fault(intr_info)) {
-		cr2 = vmcs_readl(EXIT_QUALIFICATION);
-		/* EPT won't cause page fault directly */
-		WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
-		return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
-	}
-
-	ex_no = intr_info & INTR_INFO_VECTOR_MASK;
-
-	if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
-		return handle_rmode_exception(vcpu, ex_no, error_code);
-
-	switch (ex_no) {
-	case AC_VECTOR:
-		kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
-		return 1;
-	case DB_VECTOR:
-		dr6 = vmcs_readl(EXIT_QUALIFICATION);
-		if (!(vcpu->guest_debug &
-		      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
-			vcpu->arch.dr6 &= ~DR_TRAP_BITS;
-			vcpu->arch.dr6 |= dr6 | DR6_RTM;
-			if (is_icebp(intr_info))
-				WARN_ON(!skip_emulated_instruction(vcpu));
-
-			kvm_queue_exception(vcpu, DB_VECTOR);
-			return 1;
-		}
-		kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
-		kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
-		/* fall through */
-	case BP_VECTOR:
-		/*
-		 * Update instruction length as we may reinject #BP from
-		 * user space while in guest debugging mode. Reading it for
-		 * #DB as well causes no harm, it is not used in that case.
-		 */
-		vmx->vcpu.arch.event_exit_inst_len =
-			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
-		kvm_run->exit_reason = KVM_EXIT_DEBUG;
-		rip = kvm_rip_read(vcpu);
-		kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
-		kvm_run->debug.arch.exception = ex_no;
-		break;
-	default:
-		kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
-		kvm_run->ex.exception = ex_no;
-		kvm_run->ex.error_code = error_code;
-		break;
-	}
-	return 0;
-}
-
-static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
-{
-	++vcpu->stat.irq_exits;
-	return 1;
-}
-
-static int handle_triple_fault(struct kvm_vcpu *vcpu)
-{
-	vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
-	vcpu->mmio_needed = 0;
-	return 0;
-}
-
-static int handle_io(struct kvm_vcpu *vcpu)
-{
-	unsigned long exit_qualification;
-	int size, in, string;
-	unsigned port;
-
-	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-	string = (exit_qualification & 16) != 0;
-
-	++vcpu->stat.io_exits;
-
-	if (string)
-		return kvm_emulate_instruction(vcpu, 0);
-
-	port = exit_qualification >> 16;
-	size = (exit_qualification & 7) + 1;
-	in = (exit_qualification & 8) != 0;
-
-	return kvm_fast_pio(vcpu, size, port, in);
-}
-
-static void
-vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
-{
-	/*
-	 * Patch in the VMCALL instruction:
-	 */
-	hypercall[0] = 0x0f;
-	hypercall[1] = 0x01;
-	hypercall[2] = 0xc1;
-}
-
-/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
-static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
-{
-	if (is_guest_mode(vcpu)) {
-		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-		unsigned long orig_val = val;
-
-		/*
-		 * We get here when L2 changed cr0 in a way that did not change
-		 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
-		 * but did change L0 shadowed bits. So we first calculate the
-		 * effective cr0 value that L1 would like to write into the
-		 * hardware. It consists of the L2-owned bits from the new
-		 * value combined with the L1-owned bits from L1's guest_cr0.
-		 */
-		val = (val & ~vmcs12->cr0_guest_host_mask) |
-			(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
-
-		if (!nested_guest_cr0_valid(vcpu, val))
-			return 1;
-
-		if (kvm_set_cr0(vcpu, val))
-			return 1;
-		vmcs_writel(CR0_READ_SHADOW, orig_val);
-		return 0;
-	} else {
-		if (to_vmx(vcpu)->nested.vmxon &&
-		    !nested_host_cr0_valid(vcpu, val))
-			return 1;
-
-		return kvm_set_cr0(vcpu, val);
-	}
-}
-
-static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
-{
-	if (is_guest_mode(vcpu)) {
-		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-		unsigned long orig_val = val;
-
-		/* analogously to handle_set_cr0 */
-		val = (val & ~vmcs12->cr4_guest_host_mask) |
-			(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
-		if (kvm_set_cr4(vcpu, val))
-			return 1;
-		vmcs_writel(CR4_READ_SHADOW, orig_val);
-		return 0;
-	} else
-		return kvm_set_cr4(vcpu, val);
-}
-
-static int handle_desc(struct kvm_vcpu *vcpu)
-{
-	WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
-	return kvm_emulate_instruction(vcpu, 0);
-}
-
-static int handle_cr(struct kvm_vcpu *vcpu)
-{
-	unsigned long exit_qualification, val;
-	int cr;
-	int reg;
-	int err;
-	int ret;
-
-	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-	cr = exit_qualification & 15;
-	reg = (exit_qualification >> 8) & 15;
-	switch ((exit_qualification >> 4) & 3) {
-	case 0: /* mov to cr */
-		val = kvm_register_readl(vcpu, reg);
-		trace_kvm_cr_write(cr, val);
-		switch (cr) {
-		case 0:
-			err = handle_set_cr0(vcpu, val);
-			return kvm_complete_insn_gp(vcpu, err);
-		case 3:
-			WARN_ON_ONCE(enable_unrestricted_guest);
-			err = kvm_set_cr3(vcpu, val);
-			return kvm_complete_insn_gp(vcpu, err);
-		case 4:
-			err = handle_set_cr4(vcpu, val);
-			return kvm_complete_insn_gp(vcpu, err);
-		case 8: {
-				u8 cr8_prev = kvm_get_cr8(vcpu);
-				u8 cr8 = (u8)val;
-				err = kvm_set_cr8(vcpu, cr8);
-				ret = kvm_complete_insn_gp(vcpu, err);
-				if (lapic_in_kernel(vcpu))
-					return ret;
-				if (cr8_prev <= cr8)
-					return ret;
-				/*
-				 * TODO: we might be squashing a
-				 * KVM_GUESTDBG_SINGLESTEP-triggered
-				 * KVM_EXIT_DEBUG here.
-				 */
-				vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
-				return 0;
-			}
-		}
-		break;
-	case 2: /* clts */
-		WARN_ONCE(1, "Guest should always own CR0.TS");
-		vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
-		trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
-		return kvm_skip_emulated_instruction(vcpu);
-	case 1: /*mov from cr*/
-		switch (cr) {
-		case 3:
-			WARN_ON_ONCE(enable_unrestricted_guest);
-			val = kvm_read_cr3(vcpu);
-			kvm_register_write(vcpu, reg, val);
-			trace_kvm_cr_read(cr, val);
-			return kvm_skip_emulated_instruction(vcpu);
-		case 8:
-			val = kvm_get_cr8(vcpu);
-			kvm_register_write(vcpu, reg, val);
-			trace_kvm_cr_read(cr, val);
-			return kvm_skip_emulated_instruction(vcpu);
-		}
-		break;
-	case 3: /* lmsw */
-		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
-		trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
-		kvm_lmsw(vcpu, val);
-
-		return kvm_skip_emulated_instruction(vcpu);
-	default:
-		break;
-	}
-	vcpu->run->exit_reason = 0;
-	vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
-	       (int)(exit_qualification >> 4) & 3, cr);
-	return 0;
-}
-
-static int handle_dr(struct kvm_vcpu *vcpu)
-{
-	unsigned long exit_qualification;
-	int dr, dr7, reg;
-
-	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-	dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
-
-	/* First, if DR does not exist, trigger UD */
-	if (!kvm_require_dr(vcpu, dr))
-		return 1;
-
-	/* Do not handle if the CPL > 0, will trigger GP on re-entry */
-	if (!kvm_require_cpl(vcpu, 0))
-		return 1;
-	dr7 = vmcs_readl(GUEST_DR7);
-	if (dr7 & DR7_GD) {
-		/*
-		 * As the vm-exit takes precedence over the debug trap, we
-		 * need to emulate the latter, either for the host or the
-		 * guest debugging itself.
-		 */
-		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
-			vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
-			vcpu->run->debug.arch.dr7 = dr7;
-			vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
-			vcpu->run->debug.arch.exception = DB_VECTOR;
-			vcpu->run->exit_reason = KVM_EXIT_DEBUG;
-			return 0;
-		} else {
-			vcpu->arch.dr6 &= ~DR_TRAP_BITS;
-			vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
-			kvm_queue_exception(vcpu, DB_VECTOR);
-			return 1;
-		}
-	}
-
-	if (vcpu->guest_debug == 0) {
-		exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
-
-		/*
-		 * No more DR vmexits; force a reload of the debug registers
-		 * and reenter on this instruction.  The next vmexit will
-		 * retrieve the full state of the debug registers.
-		 */
-		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
-		return 1;
-	}
-
-	reg = DEBUG_REG_ACCESS_REG(exit_qualification);
-	if (exit_qualification & TYPE_MOV_FROM_DR) {
-		unsigned long val;
-
-		if (kvm_get_dr(vcpu, dr, &val))
-			return 1;
-		kvm_register_write(vcpu, reg, val);
-	} else
-		if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
-			return 1;
-
-	return kvm_skip_emulated_instruction(vcpu);
-}
-
-static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.dr6;
-}
-
-static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
-{
-}
-
-static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
-{
-	get_debugreg(vcpu->arch.db[0], 0);
-	get_debugreg(vcpu->arch.db[1], 1);
-	get_debugreg(vcpu->arch.db[2], 2);
-	get_debugreg(vcpu->arch.db[3], 3);
-	get_debugreg(vcpu->arch.dr6, 6);
-	vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
-
-	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
-	exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
-}
-
-static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
-{
-	vmcs_writel(GUEST_DR7, val);
-}
-
-static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
-{
-	kvm_apic_update_ppr(vcpu);
-	return 1;
-}
-
-static int handle_interrupt_window(struct kvm_vcpu *vcpu)
-{
-	exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
-
-	kvm_make_request(KVM_REQ_EVENT, vcpu);
-
-	++vcpu->stat.irq_window_exits;
-	return 1;
-}
-
-static int handle_vmcall(struct kvm_vcpu *vcpu)
-{
-	return kvm_emulate_hypercall(vcpu);
-}
-
-static int handle_invd(struct kvm_vcpu *vcpu)
-{
-	return kvm_emulate_instruction(vcpu, 0);
-}
-
-static int handle_invlpg(struct kvm_vcpu *vcpu)
-{
-	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-
-	kvm_mmu_invlpg(vcpu, exit_qualification);
-	return kvm_skip_emulated_instruction(vcpu);
-}
-
-static int handle_rdpmc(struct kvm_vcpu *vcpu)
-{
-	int err;
-
-	err = kvm_rdpmc(vcpu);
-	return kvm_complete_insn_gp(vcpu, err);
-}
-
-static int handle_wbinvd(struct kvm_vcpu *vcpu)
-{
-	return kvm_emulate_wbinvd(vcpu);
-}
-
-static int handle_xsetbv(struct kvm_vcpu *vcpu)
-{
-	u64 new_bv = kvm_read_edx_eax(vcpu);
-	u32 index = kvm_rcx_read(vcpu);
-
-	if (kvm_set_xcr(vcpu, index, new_bv) == 0)
-		return kvm_skip_emulated_instruction(vcpu);
-	return 1;
-}
-
-static int handle_apic_access(struct kvm_vcpu *vcpu)
-{
-	if (likely(fasteoi)) {
-		unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-		int access_type, offset;
-
-		access_type = exit_qualification & APIC_ACCESS_TYPE;
-		offset = exit_qualification & APIC_ACCESS_OFFSET;
-		/*
-		 * Sane guest uses MOV to write EOI, with written value
-		 * not cared. So make a short-circuit here by avoiding
-		 * heavy instruction emulation.
-		 */
-		if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
-		    (offset == APIC_EOI)) {
-			kvm_lapic_set_eoi(vcpu);
-			return kvm_skip_emulated_instruction(vcpu);
-		}
-	}
-	return kvm_emulate_instruction(vcpu, 0);
-}
-
-static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
-{
-	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-	int vector = exit_qualification & 0xff;
-
-	/* EOI-induced VM exit is trap-like and thus no need to adjust IP */
-	kvm_apic_set_eoi_accelerated(vcpu, vector);
-	return 1;
-}
-
-static int handle_apic_write(struct kvm_vcpu *vcpu)
-{
-	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-	u32 offset = exit_qualification & 0xfff;
-
-	/* APIC-write VM exit is trap-like and thus no need to adjust IP */
-	kvm_apic_write_nodecode(vcpu, offset);
-	return 1;
-}
-
-static int handle_task_switch(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned long exit_qualification;
-	bool has_error_code = false;
-	u32 error_code = 0;
-	u16 tss_selector;
-	int reason, type, idt_v, idt_index;
-
-	idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
-	idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
-	type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
-
-	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-
-	reason = (u32)exit_qualification >> 30;
-	if (reason == TASK_SWITCH_GATE && idt_v) {
-		switch (type) {
-		case INTR_TYPE_NMI_INTR:
-			vcpu->arch.nmi_injected = false;
-			vmx_set_nmi_mask(vcpu, true);
-			break;
-		case INTR_TYPE_EXT_INTR:
-		case INTR_TYPE_SOFT_INTR:
-			kvm_clear_interrupt_queue(vcpu);
-			break;
-		case INTR_TYPE_HARD_EXCEPTION:
-			if (vmx->idt_vectoring_info &
-			    VECTORING_INFO_DELIVER_CODE_MASK) {
-				has_error_code = true;
-				error_code =
-					vmcs_read32(IDT_VECTORING_ERROR_CODE);
-			}
-			/* fall through */
-		case INTR_TYPE_SOFT_EXCEPTION:
-			kvm_clear_exception_queue(vcpu);
-			break;
-		default:
-			break;
-		}
-	}
-	tss_selector = exit_qualification;
-
-	if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
-		       type != INTR_TYPE_EXT_INTR &&
-		       type != INTR_TYPE_NMI_INTR))
-		WARN_ON(!skip_emulated_instruction(vcpu));
-
-	/*
-	 * TODO: What about debug traps on tss switch?
-	 *       Are we supposed to inject them and update dr6?
-	 */
-	return kvm_task_switch(vcpu, tss_selector,
-			       type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
-			       reason, has_error_code, error_code);
-}
-
-static int handle_ept_violation(struct kvm_vcpu *vcpu)
-{
-	unsigned long exit_qualification;
-	gpa_t gpa;
-	u64 error_code;
-
-	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-
-	/*
-	 * EPT violation happened while executing iret from NMI,
-	 * "blocked by NMI" bit has to be set before next VM entry.
-	 * There are errata that may cause this bit to not be set:
-	 * AAK134, BY25.
-	 */
-	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
-			enable_vnmi &&
-			(exit_qualification & INTR_INFO_UNBLOCK_NMI))
-		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
-
-	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
-	trace_kvm_page_fault(gpa, exit_qualification);
-
-	/* Is it a read fault? */
-	error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
-		     ? PFERR_USER_MASK : 0;
-	/* Is it a write fault? */
-	error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
-		      ? PFERR_WRITE_MASK : 0;
-	/* Is it a fetch fault? */
-	error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
-		      ? PFERR_FETCH_MASK : 0;
-	/* ept page table entry is present? */
-	error_code |= (exit_qualification &
-		       (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
-			EPT_VIOLATION_EXECUTABLE))
-		      ? PFERR_PRESENT_MASK : 0;
-
-	error_code |= (exit_qualification & 0x100) != 0 ?
-	       PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
-
-	vcpu->arch.exit_qualification = exit_qualification;
-	return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
-}
-
-static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
-{
-	gpa_t gpa;
-
-	/*
-	 * A nested guest cannot optimize MMIO vmexits, because we have an
-	 * nGPA here instead of the required GPA.
-	 */
-	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
-	if (!is_guest_mode(vcpu) &&
-	    !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
-		trace_kvm_fast_mmio(gpa);
-		return kvm_skip_emulated_instruction(vcpu);
-	}
-
-	return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
-}
-
-static int handle_nmi_window(struct kvm_vcpu *vcpu)
-{
-	WARN_ON_ONCE(!enable_vnmi);
-	exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
-	++vcpu->stat.nmi_window_exits;
-	kvm_make_request(KVM_REQ_EVENT, vcpu);
-
-	return 1;
-}
-
-static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	bool intr_window_requested;
-	unsigned count = 130;
-
-	/*
-	 * We should never reach the point where we are emulating L2
-	 * due to invalid guest state as that means we incorrectly
-	 * allowed a nested VMEntry with an invalid vmcs12.
-	 */
-	WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
-
-	intr_window_requested = exec_controls_get(vmx) &
-				CPU_BASED_INTR_WINDOW_EXITING;
-
-	while (vmx->emulation_required && count-- != 0) {
-		if (intr_window_requested && vmx_interrupt_allowed(vcpu))
-			return handle_interrupt_window(&vmx->vcpu);
-
-		if (kvm_test_request(KVM_REQ_EVENT, vcpu))
-			return 1;
-
-		if (!kvm_emulate_instruction(vcpu, 0))
-			return 0;
-
-		if (vmx->emulation_required && !vmx->rmode.vm86_active &&
-		    vcpu->arch.exception.pending) {
-			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-			vcpu->run->internal.suberror =
-						KVM_INTERNAL_ERROR_EMULATION;
-			vcpu->run->internal.ndata = 0;
-			return 0;
-		}
-
-		if (vcpu->arch.halt_request) {
-			vcpu->arch.halt_request = 0;
-			return kvm_vcpu_halt(vcpu);
-		}
-
-		/*
-		 * Note, return 1 and not 0, vcpu_run() is responsible for
-		 * morphing the pending signal into the proper return code.
-		 */
-		if (signal_pending(current))
-			return 1;
-
-		if (need_resched())
-			schedule();
-	}
-
-	return 1;
-}
-
-static void grow_ple_window(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned int old = vmx->ple_window;
-
-	vmx->ple_window = __grow_ple_window(old, ple_window,
-					    ple_window_grow,
-					    ple_window_max);
-
-	if (vmx->ple_window != old) {
-		vmx->ple_window_dirty = true;
-		trace_kvm_ple_window_update(vcpu->vcpu_id,
-					    vmx->ple_window, old);
-	}
-}
-
-static void shrink_ple_window(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned int old = vmx->ple_window;
-
-	vmx->ple_window = __shrink_ple_window(old, ple_window,
-					      ple_window_shrink,
-					      ple_window);
-
-	if (vmx->ple_window != old) {
-		vmx->ple_window_dirty = true;
-		trace_kvm_ple_window_update(vcpu->vcpu_id,
-					    vmx->ple_window, old);
-	}
-}
-
-/*
- * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
- */
-static void wakeup_handler(void)
-{
-	struct kvm_vcpu *vcpu;
-	int cpu = smp_processor_id();
-
-	spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
-	list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
-			blocked_vcpu_list) {
-		struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
-		if (pi_test_on(pi_desc) == 1)
-			kvm_vcpu_kick(vcpu);
-	}
-	spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
-}
-
-static void vmx_enable_tdp(void)
-{
-	kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
-		enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
-		enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
-		0ull, VMX_EPT_EXECUTABLE_MASK,
-		cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
-		VMX_EPT_RWX_MASK, 0ull);
-
-	ept_set_mmio_spte_mask();
-	kvm_enable_tdp();
-}
-
-/*
- * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
- * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
- */
-static int handle_pause(struct kvm_vcpu *vcpu)
-{
-	if (!kvm_pause_in_guest(vcpu->kvm))
-		grow_ple_window(vcpu);
-
-	/*
-	 * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting"
-	 * VM-execution control is ignored if CPL > 0. OTOH, KVM
-	 * never set PAUSE_EXITING and just set PLE if supported,
-	 * so the vcpu must be CPL=0 if it gets a PAUSE exit.
-	 */
-	kvm_vcpu_on_spin(vcpu, true);
-	return kvm_skip_emulated_instruction(vcpu);
-}
-
-static int handle_nop(struct kvm_vcpu *vcpu)
-{
-	return kvm_skip_emulated_instruction(vcpu);
-}
-
-static int handle_mwait(struct kvm_vcpu *vcpu)
-{
-	printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
-	return handle_nop(vcpu);
-}
-
-static int handle_invalid_op(struct kvm_vcpu *vcpu)
-{
-	kvm_queue_exception(vcpu, UD_VECTOR);
-	return 1;
-}
-
-static int handle_monitor_trap(struct kvm_vcpu *vcpu)
-{
-	return 1;
-}
-
-static int handle_monitor(struct kvm_vcpu *vcpu)
-{
-	printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
-	return handle_nop(vcpu);
-}
-
-static int handle_invpcid(struct kvm_vcpu *vcpu)
-{
-	u32 vmx_instruction_info;
-	unsigned long type;
-	bool pcid_enabled;
-	gva_t gva;
-	struct x86_exception e;
-	unsigned i;
-	unsigned long roots_to_free = 0;
-	struct {
-		u64 pcid;
-		u64 gla;
-	} operand;
-
-	if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
-		kvm_queue_exception(vcpu, UD_VECTOR);
-		return 1;
-	}
-
-	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
-	type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
-
-	if (type > 3) {
-		kvm_inject_gp(vcpu, 0);
-		return 1;
-	}
-
-	/* According to the Intel instruction reference, the memory operand
-	 * is read even if it isn't needed (e.g., for type==all)
-	 */
-	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
-				vmx_instruction_info, false,
-				sizeof(operand), &gva))
-		return 1;
-
-	if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
-		kvm_inject_page_fault(vcpu, &e);
-		return 1;
-	}
-
-	if (operand.pcid >> 12 != 0) {
-		kvm_inject_gp(vcpu, 0);
-		return 1;
-	}
-
-	pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
-
-	switch (type) {
-	case INVPCID_TYPE_INDIV_ADDR:
-		if ((!pcid_enabled && (operand.pcid != 0)) ||
-		    is_noncanonical_address(operand.gla, vcpu)) {
-			kvm_inject_gp(vcpu, 0);
-			return 1;
-		}
-		kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
-		return kvm_skip_emulated_instruction(vcpu);
-
-	case INVPCID_TYPE_SINGLE_CTXT:
-		if (!pcid_enabled && (operand.pcid != 0)) {
-			kvm_inject_gp(vcpu, 0);
-			return 1;
-		}
-
-		if (kvm_get_active_pcid(vcpu) == operand.pcid) {
-			kvm_mmu_sync_roots(vcpu);
-			kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-		}
-
-		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-			if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3)
-			    == operand.pcid)
-				roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
-
-		kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
-		/*
-		 * If neither the current cr3 nor any of the prev_roots use the
-		 * given PCID, then nothing needs to be done here because a
-		 * resync will happen anyway before switching to any other CR3.
-		 */
-
-		return kvm_skip_emulated_instruction(vcpu);
-
-	case INVPCID_TYPE_ALL_NON_GLOBAL:
-		/*
-		 * Currently, KVM doesn't mark global entries in the shadow
-		 * page tables, so a non-global flush just degenerates to a
-		 * global flush. If needed, we could optimize this later by
-		 * keeping track of global entries in shadow page tables.
-		 */
-
-		/* fall-through */
-	case INVPCID_TYPE_ALL_INCL_GLOBAL:
-		kvm_mmu_unload(vcpu);
-		return kvm_skip_emulated_instruction(vcpu);
-
-	default:
-		BUG(); /* We have already checked above that type <= 3 */
-	}
-}
-
-static int handle_pml_full(struct kvm_vcpu *vcpu)
-{
-	unsigned long exit_qualification;
-
-	trace_kvm_pml_full(vcpu->vcpu_id);
-
-	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-
-	/*
-	 * PML buffer FULL happened while executing iret from NMI,
-	 * "blocked by NMI" bit has to be set before next VM entry.
-	 */
-	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
-			enable_vnmi &&
-			(exit_qualification & INTR_INFO_UNBLOCK_NMI))
-		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
-				GUEST_INTR_STATE_NMI);
-
-	/*
-	 * PML buffer already flushed at beginning of VMEXIT. Nothing to do
-	 * here.., and there's no userspace involvement needed for PML.
-	 */
-	return 1;
-}
-
-static int handle_preemption_timer(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	if (!vmx->req_immediate_exit &&
-	    !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
-		kvm_lapic_expired_hv_timer(vcpu);
-
-	return 1;
-}
-
-/*
- * When nested=0, all VMX instruction VM Exits filter here.  The handlers
- * are overwritten by nested_vmx_setup() when nested=1.
- */
-static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
-{
-	kvm_queue_exception(vcpu, UD_VECTOR);
-	return 1;
-}
-
-static int handle_encls(struct kvm_vcpu *vcpu)
-{
-	/*
-	 * SGX virtualization is not yet supported.  There is no software
-	 * enable bit for SGX, so we have to trap ENCLS and inject a #UD
-	 * to prevent the guest from executing ENCLS.
-	 */
-	kvm_queue_exception(vcpu, UD_VECTOR);
-	return 1;
-}
-
-/*
- * The exit handlers return 1 if the exit was handled fully and guest execution
- * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
- * to be done to userspace and return 0.
- */
-static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
-	[EXIT_REASON_EXCEPTION_NMI]           = handle_exception_nmi,
-	[EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
-	[EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
-	[EXIT_REASON_NMI_WINDOW]	      = handle_nmi_window,
-	[EXIT_REASON_IO_INSTRUCTION]          = handle_io,
-	[EXIT_REASON_CR_ACCESS]               = handle_cr,
-	[EXIT_REASON_DR_ACCESS]               = handle_dr,
-	[EXIT_REASON_CPUID]                   = kvm_emulate_cpuid,
-	[EXIT_REASON_MSR_READ]                = kvm_emulate_rdmsr,
-	[EXIT_REASON_MSR_WRITE]               = kvm_emulate_wrmsr,
-	[EXIT_REASON_INTERRUPT_WINDOW]        = handle_interrupt_window,
-	[EXIT_REASON_HLT]                     = kvm_emulate_halt,
-	[EXIT_REASON_INVD]		      = handle_invd,
-	[EXIT_REASON_INVLPG]		      = handle_invlpg,
-	[EXIT_REASON_RDPMC]                   = handle_rdpmc,
-	[EXIT_REASON_VMCALL]                  = handle_vmcall,
-	[EXIT_REASON_VMCLEAR]		      = handle_vmx_instruction,
-	[EXIT_REASON_VMLAUNCH]		      = handle_vmx_instruction,
-	[EXIT_REASON_VMPTRLD]		      = handle_vmx_instruction,
-	[EXIT_REASON_VMPTRST]		      = handle_vmx_instruction,
-	[EXIT_REASON_VMREAD]		      = handle_vmx_instruction,
-	[EXIT_REASON_VMRESUME]		      = handle_vmx_instruction,
-	[EXIT_REASON_VMWRITE]		      = handle_vmx_instruction,
-	[EXIT_REASON_VMOFF]		      = handle_vmx_instruction,
-	[EXIT_REASON_VMON]		      = handle_vmx_instruction,
-	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
-	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
-	[EXIT_REASON_APIC_WRITE]              = handle_apic_write,
-	[EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
-	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
-	[EXIT_REASON_XSETBV]                  = handle_xsetbv,
-	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
-	[EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
-	[EXIT_REASON_GDTR_IDTR]		      = handle_desc,
-	[EXIT_REASON_LDTR_TR]		      = handle_desc,
-	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,
-	[EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
-	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
-	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_mwait,
-	[EXIT_REASON_MONITOR_TRAP_FLAG]       = handle_monitor_trap,
-	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
-	[EXIT_REASON_INVEPT]                  = handle_vmx_instruction,
-	[EXIT_REASON_INVVPID]                 = handle_vmx_instruction,
-	[EXIT_REASON_RDRAND]                  = handle_invalid_op,
-	[EXIT_REASON_RDSEED]                  = handle_invalid_op,
-	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
-	[EXIT_REASON_INVPCID]                 = handle_invpcid,
-	[EXIT_REASON_VMFUNC]		      = handle_vmx_instruction,
-	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
-	[EXIT_REASON_ENCLS]		      = handle_encls,
-};
-
-static const int kvm_vmx_max_exit_handlers =
-	ARRAY_SIZE(kvm_vmx_exit_handlers);
-
-static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
-{
-	*info1 = vmcs_readl(EXIT_QUALIFICATION);
-	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
-}
-
-static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
-{
-	if (vmx->pml_pg) {
-		__free_page(vmx->pml_pg);
-		vmx->pml_pg = NULL;
-	}
-}
-
-static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u64 *pml_buf;
-	u16 pml_idx;
-
-	pml_idx = vmcs_read16(GUEST_PML_INDEX);
-
-	/* Do nothing if PML buffer is empty */
-	if (pml_idx == (PML_ENTITY_NUM - 1))
-		return;
-
-	/* PML index always points to next available PML buffer entity */
-	if (pml_idx >= PML_ENTITY_NUM)
-		pml_idx = 0;
-	else
-		pml_idx++;
-
-	pml_buf = page_address(vmx->pml_pg);
-	for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
-		u64 gpa;
-
-		gpa = pml_buf[pml_idx];
-		WARN_ON(gpa & (PAGE_SIZE - 1));
-		kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
-	}
-
-	/* reset PML index */
-	vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
-}
-
-/*
- * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
- * Called before reporting dirty_bitmap to userspace.
- */
-static void kvm_flush_pml_buffers(struct kvm *kvm)
-{
-	int i;
-	struct kvm_vcpu *vcpu;
-	/*
-	 * We only need to kick vcpu out of guest mode here, as PML buffer
-	 * is flushed at beginning of all VMEXITs, and it's obvious that only
-	 * vcpus running in guest are possible to have unflushed GPAs in PML
-	 * buffer.
-	 */
-	kvm_for_each_vcpu(i, vcpu, kvm)
-		kvm_vcpu_kick(vcpu);
-}
-
-static void vmx_dump_sel(char *name, uint32_t sel)
-{
-	pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
-	       name, vmcs_read16(sel),
-	       vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
-	       vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
-	       vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
-}
-
-static void vmx_dump_dtsel(char *name, uint32_t limit)
-{
-	pr_err("%s                           limit=0x%08x, base=0x%016lx\n",
-	       name, vmcs_read32(limit),
-	       vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
-}
-
-void dump_vmcs(void)
-{
-	u32 vmentry_ctl, vmexit_ctl;
-	u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
-	unsigned long cr4;
-	u64 efer;
-	int i, n;
-
-	if (!dump_invalid_vmcs) {
-		pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
-		return;
-	}
-
-	vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
-	vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
-	cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-	pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
-	cr4 = vmcs_readl(GUEST_CR4);
-	efer = vmcs_read64(GUEST_IA32_EFER);
-	secondary_exec_control = 0;
-	if (cpu_has_secondary_exec_ctrls())
-		secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-
-	pr_err("*** Guest State ***\n");
-	pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
-	       vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
-	       vmcs_readl(CR0_GUEST_HOST_MASK));
-	pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
-	       cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
-	pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
-	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
-	    (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
-	{
-		pr_err("PDPTR0 = 0x%016llx  PDPTR1 = 0x%016llx\n",
-		       vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
-		pr_err("PDPTR2 = 0x%016llx  PDPTR3 = 0x%016llx\n",
-		       vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
-	}
-	pr_err("RSP = 0x%016lx  RIP = 0x%016lx\n",
-	       vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
-	pr_err("RFLAGS=0x%08lx         DR7 = 0x%016lx\n",
-	       vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
-	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
-	       vmcs_readl(GUEST_SYSENTER_ESP),
-	       vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
-	vmx_dump_sel("CS:  ", GUEST_CS_SELECTOR);
-	vmx_dump_sel("DS:  ", GUEST_DS_SELECTOR);
-	vmx_dump_sel("SS:  ", GUEST_SS_SELECTOR);
-	vmx_dump_sel("ES:  ", GUEST_ES_SELECTOR);
-	vmx_dump_sel("FS:  ", GUEST_FS_SELECTOR);
-	vmx_dump_sel("GS:  ", GUEST_GS_SELECTOR);
-	vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
-	vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
-	vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
-	vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
-	if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
-	    (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
-		pr_err("EFER =     0x%016llx  PAT = 0x%016llx\n",
-		       efer, vmcs_read64(GUEST_IA32_PAT));
-	pr_err("DebugCtl = 0x%016llx  DebugExceptions = 0x%016lx\n",
-	       vmcs_read64(GUEST_IA32_DEBUGCTL),
-	       vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
-	if (cpu_has_load_perf_global_ctrl() &&
-	    vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
-		pr_err("PerfGlobCtl = 0x%016llx\n",
-		       vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
-	if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
-		pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
-	pr_err("Interruptibility = %08x  ActivityState = %08x\n",
-	       vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
-	       vmcs_read32(GUEST_ACTIVITY_STATE));
-	if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
-		pr_err("InterruptStatus = %04x\n",
-		       vmcs_read16(GUEST_INTR_STATUS));
-
-	pr_err("*** Host State ***\n");
-	pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
-	       vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
-	pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
-	       vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
-	       vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
-	       vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
-	       vmcs_read16(HOST_TR_SELECTOR));
-	pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
-	       vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
-	       vmcs_readl(HOST_TR_BASE));
-	pr_err("GDTBase=%016lx IDTBase=%016lx\n",
-	       vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
-	pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
-	       vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
-	       vmcs_readl(HOST_CR4));
-	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
-	       vmcs_readl(HOST_IA32_SYSENTER_ESP),
-	       vmcs_read32(HOST_IA32_SYSENTER_CS),
-	       vmcs_readl(HOST_IA32_SYSENTER_EIP));
-	if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
-		pr_err("EFER = 0x%016llx  PAT = 0x%016llx\n",
-		       vmcs_read64(HOST_IA32_EFER),
-		       vmcs_read64(HOST_IA32_PAT));
-	if (cpu_has_load_perf_global_ctrl() &&
-	    vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
-		pr_err("PerfGlobCtl = 0x%016llx\n",
-		       vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
-
-	pr_err("*** Control State ***\n");
-	pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
-	       pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
-	pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
-	pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
-	       vmcs_read32(EXCEPTION_BITMAP),
-	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
-	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
-	pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
-	       vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
-	       vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
-	       vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
-	pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
-	       vmcs_read32(VM_EXIT_INTR_INFO),
-	       vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
-	       vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
-	pr_err("        reason=%08x qualification=%016lx\n",
-	       vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
-	pr_err("IDTVectoring: info=%08x errcode=%08x\n",
-	       vmcs_read32(IDT_VECTORING_INFO_FIELD),
-	       vmcs_read32(IDT_VECTORING_ERROR_CODE));
-	pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
-	if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
-		pr_err("TSC Multiplier = 0x%016llx\n",
-		       vmcs_read64(TSC_MULTIPLIER));
-	if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
-		if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
-			u16 status = vmcs_read16(GUEST_INTR_STATUS);
-			pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
-		}
-		pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
-		if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
-			pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
-		pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
-	}
-	if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
-		pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
-	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
-		pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
-	n = vmcs_read32(CR3_TARGET_COUNT);
-	for (i = 0; i + 1 < n; i += 4)
-		pr_err("CR3 target%u=%016lx target%u=%016lx\n",
-		       i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
-		       i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
-	if (i < n)
-		pr_err("CR3 target%u=%016lx\n",
-		       i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
-	if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
-		pr_err("PLE Gap=%08x Window=%08x\n",
-		       vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
-	if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
-		pr_err("Virtual processor ID = 0x%04x\n",
-		       vmcs_read16(VIRTUAL_PROCESSOR_ID));
-}
-
-/*
- * The guest has exited.  See if we can fix it or if we need userspace
- * assistance.
- */
-static int vmx_handle_exit(struct kvm_vcpu *vcpu,
-	enum exit_fastpath_completion exit_fastpath)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u32 exit_reason = vmx->exit_reason;
-	u32 vectoring_info = vmx->idt_vectoring_info;
-
-	trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
-
-	/*
-	 * Flush logged GPAs PML buffer, this will make dirty_bitmap more
-	 * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
-	 * querying dirty_bitmap, we only need to kick all vcpus out of guest
-	 * mode as if vcpus is in root mode, the PML buffer must has been
-	 * flushed already.
-	 */
-	if (enable_pml)
-		vmx_flush_pml_buffer(vcpu);
-
-	/* If guest state is invalid, start emulating */
-	if (vmx->emulation_required)
-		return handle_invalid_guest_state(vcpu);
-
-	if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
-		return nested_vmx_reflect_vmexit(vcpu, exit_reason);
-
-	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
-		dump_vmcs();
-		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
-		vcpu->run->fail_entry.hardware_entry_failure_reason
-			= exit_reason;
-		return 0;
-	}
-
-	if (unlikely(vmx->fail)) {
-		dump_vmcs();
-		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
-		vcpu->run->fail_entry.hardware_entry_failure_reason
-			= vmcs_read32(VM_INSTRUCTION_ERROR);
-		return 0;
-	}
-
-	/*
-	 * Note:
-	 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by
-	 * delivery event since it indicates guest is accessing MMIO.
-	 * The vm-exit can be triggered again after return to guest that
-	 * will cause infinite loop.
-	 */
-	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
-			(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
-			exit_reason != EXIT_REASON_EPT_VIOLATION &&
-			exit_reason != EXIT_REASON_PML_FULL &&
-			exit_reason != EXIT_REASON_TASK_SWITCH)) {
-		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
-		vcpu->run->internal.ndata = 3;
-		vcpu->run->internal.data[0] = vectoring_info;
-		vcpu->run->internal.data[1] = exit_reason;
-		vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
-		if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
-			vcpu->run->internal.ndata++;
-			vcpu->run->internal.data[3] =
-				vmcs_read64(GUEST_PHYSICAL_ADDRESS);
-		}
-		return 0;
-	}
-
-	if (unlikely(!enable_vnmi &&
-		     vmx->loaded_vmcs->soft_vnmi_blocked)) {
-		if (vmx_interrupt_allowed(vcpu)) {
-			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
-		} else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
-			   vcpu->arch.nmi_pending) {
-			/*
-			 * This CPU don't support us in finding the end of an
-			 * NMI-blocked window if the guest runs with IRQs
-			 * disabled. So we pull the trigger after 1 s of
-			 * futile waiting, but inform the user about this.
-			 */
-			printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
-			       "state on VCPU %d after 1 s timeout\n",
-			       __func__, vcpu->vcpu_id);
-			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
-		}
-	}
-
-	if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) {
-		kvm_skip_emulated_instruction(vcpu);
-		return 1;
-	} else if (exit_reason < kvm_vmx_max_exit_handlers
-	    && kvm_vmx_exit_handlers[exit_reason]) {
-#ifdef CONFIG_RETPOLINE
-		if (exit_reason == EXIT_REASON_MSR_WRITE)
-			return kvm_emulate_wrmsr(vcpu);
-		else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER)
-			return handle_preemption_timer(vcpu);
-		else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW)
-			return handle_interrupt_window(vcpu);
-		else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
-			return handle_external_interrupt(vcpu);
-		else if (exit_reason == EXIT_REASON_HLT)
-			return kvm_emulate_halt(vcpu);
-		else if (exit_reason == EXIT_REASON_EPT_MISCONFIG)
-			return handle_ept_misconfig(vcpu);
-#endif
-		return kvm_vmx_exit_handlers[exit_reason](vcpu);
-	} else {
-		vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
-				exit_reason);
-		dump_vmcs();
-		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-		vcpu->run->internal.suberror =
-			KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
-		vcpu->run->internal.ndata = 1;
-		vcpu->run->internal.data[0] = exit_reason;
-		return 0;
-	}
-}
-
-/*
- * Software based L1D cache flush which is used when microcode providing
- * the cache control MSR is not loaded.
- *
- * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
- * flush it is required to read in 64 KiB because the replacement algorithm
- * is not exactly LRU. This could be sized at runtime via topology
- * information but as all relevant affected CPUs have 32KiB L1D cache size
- * there is no point in doing so.
- */
-static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
-{
-	int size = PAGE_SIZE << L1D_CACHE_ORDER;
-
-	/*
-	 * This code is only executed when the the flush mode is 'cond' or
-	 * 'always'
-	 */
-	if (static_branch_likely(&vmx_l1d_flush_cond)) {
-		bool flush_l1d;
-
-		/*
-		 * Clear the per-vcpu flush bit, it gets set again
-		 * either from vcpu_run() or from one of the unsafe
-		 * VMEXIT handlers.
-		 */
-		flush_l1d = vcpu->arch.l1tf_flush_l1d;
-		vcpu->arch.l1tf_flush_l1d = false;
-
-		/*
-		 * Clear the per-cpu flush bit, it gets set again from
-		 * the interrupt handlers.
-		 */
-		flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
-		kvm_clear_cpu_l1tf_flush_l1d();
-
-		if (!flush_l1d)
-			return;
-	}
-
-	vcpu->stat.l1d_flush++;
-
-	if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
-		wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
-		return;
-	}
-
-	asm volatile(
-		/* First ensure the pages are in the TLB */
-		"xorl	%%eax, %%eax\n"
-		".Lpopulate_tlb:\n\t"
-		"movzbl	(%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
-		"addl	$4096, %%eax\n\t"
-		"cmpl	%%eax, %[size]\n\t"
-		"jne	.Lpopulate_tlb\n\t"
-		"xorl	%%eax, %%eax\n\t"
-		"cpuid\n\t"
-		/* Now fill the cache */
-		"xorl	%%eax, %%eax\n"
-		".Lfill_cache:\n"
-		"movzbl	(%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
-		"addl	$64, %%eax\n\t"
-		"cmpl	%%eax, %[size]\n\t"
-		"jne	.Lfill_cache\n\t"
-		"lfence\n"
-		:: [flush_pages] "r" (vmx_l1d_flush_pages),
-		    [size] "r" (size)
-		: "eax", "ebx", "ecx", "edx");
-}
-
-static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
-{
-	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-	int tpr_threshold;
-
-	if (is_guest_mode(vcpu) &&
-		nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
-		return;
-
-	tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
-	if (is_guest_mode(vcpu))
-		to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
-	else
-		vmcs_write32(TPR_THRESHOLD, tpr_threshold);
-}
-
-void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u32 sec_exec_control;
-
-	if (!lapic_in_kernel(vcpu))
-		return;
-
-	if (!flexpriority_enabled &&
-	    !cpu_has_vmx_virtualize_x2apic_mode())
-		return;
-
-	/* Postpone execution until vmcs01 is the current VMCS. */
-	if (is_guest_mode(vcpu)) {
-		vmx->nested.change_vmcs01_virtual_apic_mode = true;
-		return;
-	}
-
-	sec_exec_control = secondary_exec_controls_get(vmx);
-	sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
-			      SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
-
-	switch (kvm_get_apic_mode(vcpu)) {
-	case LAPIC_MODE_INVALID:
-		WARN_ONCE(true, "Invalid local APIC state");
-	case LAPIC_MODE_DISABLED:
-		break;
-	case LAPIC_MODE_XAPIC:
-		if (flexpriority_enabled) {
-			sec_exec_control |=
-				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-			vmx_flush_tlb(vcpu, true);
-		}
-		break;
-	case LAPIC_MODE_X2APIC:
-		if (cpu_has_vmx_virtualize_x2apic_mode())
-			sec_exec_control |=
-				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
-		break;
-	}
-	secondary_exec_controls_set(vmx, sec_exec_control);
-
-	vmx_update_msr_bitmap(vcpu);
-}
-
-static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
-{
-	if (!is_guest_mode(vcpu)) {
-		vmcs_write64(APIC_ACCESS_ADDR, hpa);
-		vmx_flush_tlb(vcpu, true);
-	}
-}
-
-static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
-{
-	u16 status;
-	u8 old;
-
-	if (max_isr == -1)
-		max_isr = 0;
-
-	status = vmcs_read16(GUEST_INTR_STATUS);
-	old = status >> 8;
-	if (max_isr != old) {
-		status &= 0xff;
-		status |= max_isr << 8;
-		vmcs_write16(GUEST_INTR_STATUS, status);
-	}
-}
-
-static void vmx_set_rvi(int vector)
-{
-	u16 status;
-	u8 old;
-
-	if (vector == -1)
-		vector = 0;
-
-	status = vmcs_read16(GUEST_INTR_STATUS);
-	old = (u8)status & 0xff;
-	if ((u8)vector != old) {
-		status &= ~0xff;
-		status |= (u8)vector;
-		vmcs_write16(GUEST_INTR_STATUS, status);
-	}
-}
-
-static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
-{
-	/*
-	 * When running L2, updating RVI is only relevant when
-	 * vmcs12 virtual-interrupt-delivery enabled.
-	 * However, it can be enabled only when L1 also
-	 * intercepts external-interrupts and in that case
-	 * we should not update vmcs02 RVI but instead intercept
-	 * interrupt. Therefore, do nothing when running L2.
-	 */
-	if (!is_guest_mode(vcpu))
-		vmx_set_rvi(max_irr);
-}
-
-static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	int max_irr;
-	bool max_irr_updated;
-
-	WARN_ON(!vcpu->arch.apicv_active);
-	if (pi_test_on(&vmx->pi_desc)) {
-		pi_clear_on(&vmx->pi_desc);
-		/*
-		 * IOMMU can write to PID.ON, so the barrier matters even on UP.
-		 * But on x86 this is just a compiler barrier anyway.
-		 */
-		smp_mb__after_atomic();
-		max_irr_updated =
-			kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
-
-		/*
-		 * If we are running L2 and L1 has a new pending interrupt
-		 * which can be injected, we should re-evaluate
-		 * what should be done with this new L1 interrupt.
-		 * If L1 intercepts external-interrupts, we should
-		 * exit from L2 to L1. Otherwise, interrupt should be
-		 * delivered directly to L2.
-		 */
-		if (is_guest_mode(vcpu) && max_irr_updated) {
-			if (nested_exit_on_intr(vcpu))
-				kvm_vcpu_exiting_guest_mode(vcpu);
-			else
-				kvm_make_request(KVM_REQ_EVENT, vcpu);
-		}
-	} else {
-		max_irr = kvm_lapic_find_highest_irr(vcpu);
-	}
-	vmx_hwapic_irr_update(vcpu, max_irr);
-	return max_irr;
-}
-
-static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
-{
-	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
-	return pi_test_on(pi_desc) ||
-		(pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
-}
-
-static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
-{
-	if (!kvm_vcpu_apicv_active(vcpu))
-		return;
-
-	vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
-	vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
-	vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
-	vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
-}
-
-static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	pi_clear_on(&vmx->pi_desc);
-	memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
-}
-
-static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
-{
-	vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
-	/* if exit due to PF check for async PF */
-	if (is_page_fault(vmx->exit_intr_info))
-		vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
-
-	/* Handle machine checks before interrupts are enabled */
-	if (is_machine_check(vmx->exit_intr_info))
-		kvm_machine_check();
-
-	/* We need to handle NMIs before interrupts are enabled */
-	if (is_nmi(vmx->exit_intr_info)) {
-		kvm_before_interrupt(&vmx->vcpu);
-		asm("int $2");
-		kvm_after_interrupt(&vmx->vcpu);
-	}
-}
-
-static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
-{
-	unsigned int vector;
-	unsigned long entry;
-#ifdef CONFIG_X86_64
-	unsigned long tmp;
-#endif
-	gate_desc *desc;
-	u32 intr_info;
-
-	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-	if (WARN_ONCE(!is_external_intr(intr_info),
-	    "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
-		return;
-
-	vector = intr_info & INTR_INFO_VECTOR_MASK;
-	desc = (gate_desc *)host_idt_base + vector;
-	entry = gate_offset(desc);
-
-	kvm_before_interrupt(vcpu);
-
-	asm volatile(
-#ifdef CONFIG_X86_64
-		"mov %%" _ASM_SP ", %[sp]\n\t"
-		"and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
-		"push $%c[ss]\n\t"
-		"push %[sp]\n\t"
-#endif
-		"pushf\n\t"
-		__ASM_SIZE(push) " $%c[cs]\n\t"
-		CALL_NOSPEC
-		:
-#ifdef CONFIG_X86_64
-		[sp]"=&r"(tmp),
-#endif
-		ASM_CALL_CONSTRAINT
-		:
-		THUNK_TARGET(entry),
-		[ss]"i"(__KERNEL_DS),
-		[cs]"i"(__KERNEL_CS)
-	);
-
-	kvm_after_interrupt(vcpu);
-}
-STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
-
-static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu,
-	enum exit_fastpath_completion *exit_fastpath)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
-		handle_external_interrupt_irqoff(vcpu);
-	else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
-		handle_exception_nmi_irqoff(vmx);
-	else if (!is_guest_mode(vcpu) &&
-		vmx->exit_reason == EXIT_REASON_MSR_WRITE)
-		*exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
-}
-
-static bool vmx_has_emulated_msr(int index)
-{
-	switch (index) {
-	case MSR_IA32_SMBASE:
-		/*
-		 * We cannot do SMM unless we can run the guest in big
-		 * real mode.
-		 */
-		return enable_unrestricted_guest || emulate_invalid_guest_state;
-	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
-		return nested;
-	case MSR_AMD64_VIRT_SPEC_CTRL:
-		/* This is AMD only.  */
-		return false;
-	default:
-		return true;
-	}
-}
-
-static bool vmx_pt_supported(void)
-{
-	return pt_mode == PT_MODE_HOST_GUEST;
-}
-
-static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
-{
-	u32 exit_intr_info;
-	bool unblock_nmi;
-	u8 vector;
-	bool idtv_info_valid;
-
-	idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
-
-	if (enable_vnmi) {
-		if (vmx->loaded_vmcs->nmi_known_unmasked)
-			return;
-		/*
-		 * Can't use vmx->exit_intr_info since we're not sure what
-		 * the exit reason is.
-		 */
-		exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
-		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
-		/*
-		 * SDM 3: 27.7.1.2 (September 2008)
-		 * Re-set bit "block by NMI" before VM entry if vmexit caused by
-		 * a guest IRET fault.
-		 * SDM 3: 23.2.2 (September 2008)
-		 * Bit 12 is undefined in any of the following cases:
-		 *  If the VM exit sets the valid bit in the IDT-vectoring
-		 *   information field.
-		 *  If the VM exit is due to a double fault.
-		 */
-		if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
-		    vector != DF_VECTOR && !idtv_info_valid)
-			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
-				      GUEST_INTR_STATE_NMI);
-		else
-			vmx->loaded_vmcs->nmi_known_unmasked =
-				!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
-				  & GUEST_INTR_STATE_NMI);
-	} else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
-		vmx->loaded_vmcs->vnmi_blocked_time +=
-			ktime_to_ns(ktime_sub(ktime_get(),
-					      vmx->loaded_vmcs->entry_time));
-}
-
-static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
-				      u32 idt_vectoring_info,
-				      int instr_len_field,
-				      int error_code_field)
-{
-	u8 vector;
-	int type;
-	bool idtv_info_valid;
-
-	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
-
-	vcpu->arch.nmi_injected = false;
-	kvm_clear_exception_queue(vcpu);
-	kvm_clear_interrupt_queue(vcpu);
-
-	if (!idtv_info_valid)
-		return;
-
-	kvm_make_request(KVM_REQ_EVENT, vcpu);
-
-	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
-	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
-
-	switch (type) {
-	case INTR_TYPE_NMI_INTR:
-		vcpu->arch.nmi_injected = true;
-		/*
-		 * SDM 3: 27.7.1.2 (September 2008)
-		 * Clear bit "block by NMI" before VM entry if a NMI
-		 * delivery faulted.
-		 */
-		vmx_set_nmi_mask(vcpu, false);
-		break;
-	case INTR_TYPE_SOFT_EXCEPTION:
-		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
-		/* fall through */
-	case INTR_TYPE_HARD_EXCEPTION:
-		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
-			u32 err = vmcs_read32(error_code_field);
-			kvm_requeue_exception_e(vcpu, vector, err);
-		} else
-			kvm_requeue_exception(vcpu, vector);
-		break;
-	case INTR_TYPE_SOFT_INTR:
-		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
-		/* fall through */
-	case INTR_TYPE_EXT_INTR:
-		kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
-		break;
-	default:
-		break;
-	}
-}
-
-static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
-{
-	__vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
-				  VM_EXIT_INSTRUCTION_LEN,
-				  IDT_VECTORING_ERROR_CODE);
-}
-
-static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
-{
-	__vmx_complete_interrupts(vcpu,
-				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
-				  VM_ENTRY_INSTRUCTION_LEN,
-				  VM_ENTRY_EXCEPTION_ERROR_CODE);
-
-	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
-}
-
-static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
-{
-	int i, nr_msrs;
-	struct perf_guest_switch_msr *msrs;
-
-	msrs = perf_guest_get_msrs(&nr_msrs);
-
-	if (!msrs)
-		return;
-
-	for (i = 0; i < nr_msrs; i++)
-		if (msrs[i].host == msrs[i].guest)
-			clear_atomic_switch_msr(vmx, msrs[i].msr);
-		else
-			add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
-					msrs[i].host, false);
-}
-
-static void atomic_switch_umwait_control_msr(struct vcpu_vmx *vmx)
-{
-	u32 host_umwait_control;
-
-	if (!vmx_has_waitpkg(vmx))
-		return;
-
-	host_umwait_control = get_umwait_control_msr();
-
-	if (vmx->msr_ia32_umwait_control != host_umwait_control)
-		add_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL,
-			vmx->msr_ia32_umwait_control,
-			host_umwait_control, false);
-	else
-		clear_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL);
-}
-
-static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u64 tscl;
-	u32 delta_tsc;
-
-	if (vmx->req_immediate_exit) {
-		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
-		vmx->loaded_vmcs->hv_timer_soft_disabled = false;
-	} else if (vmx->hv_deadline_tsc != -1) {
-		tscl = rdtsc();
-		if (vmx->hv_deadline_tsc > tscl)
-			/* set_hv_timer ensures the delta fits in 32-bits */
-			delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
-				cpu_preemption_timer_multi);
-		else
-			delta_tsc = 0;
-
-		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
-		vmx->loaded_vmcs->hv_timer_soft_disabled = false;
-	} else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
-		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
-		vmx->loaded_vmcs->hv_timer_soft_disabled = true;
-	}
-}
-
-void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
-{
-	if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
-		vmx->loaded_vmcs->host_state.rsp = host_rsp;
-		vmcs_writel(HOST_RSP, host_rsp);
-	}
-}
-
-bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
-
-static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned long cr3, cr4;
-
-	/* Record the guest's net vcpu time for enforced NMI injections. */
-	if (unlikely(!enable_vnmi &&
-		     vmx->loaded_vmcs->soft_vnmi_blocked))
-		vmx->loaded_vmcs->entry_time = ktime_get();
-
-	/* Don't enter VMX if guest state is invalid, let the exit handler
-	   start emulation until we arrive back to a valid state */
-	if (vmx->emulation_required)
-		return;
-
-	if (vmx->ple_window_dirty) {
-		vmx->ple_window_dirty = false;
-		vmcs_write32(PLE_WINDOW, vmx->ple_window);
-	}
-
-	if (vmx->nested.need_vmcs12_to_shadow_sync)
-		nested_sync_vmcs12_to_shadow(vcpu);
-
-	if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
-		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
-	if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
-		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
-
-	cr3 = __get_current_cr3_fast();
-	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
-		vmcs_writel(HOST_CR3, cr3);
-		vmx->loaded_vmcs->host_state.cr3 = cr3;
-	}
-
-	cr4 = cr4_read_shadow();
-	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
-		vmcs_writel(HOST_CR4, cr4);
-		vmx->loaded_vmcs->host_state.cr4 = cr4;
-	}
-
-	/* When single-stepping over STI and MOV SS, we must clear the
-	 * corresponding interruptibility bits in the guest state. Otherwise
-	 * vmentry fails as it then expects bit 14 (BS) in pending debug
-	 * exceptions being set, but that's not correct for the guest debugging
-	 * case. */
-	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-		vmx_set_interrupt_shadow(vcpu, 0);
-
-	kvm_load_guest_xsave_state(vcpu);
-
-	if (static_cpu_has(X86_FEATURE_PKU) &&
-	    kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
-	    vcpu->arch.pkru != vmx->host_pkru)
-		__write_pkru(vcpu->arch.pkru);
-
-	pt_guest_enter(vmx);
-
-	atomic_switch_perf_msrs(vmx);
-	atomic_switch_umwait_control_msr(vmx);
-
-	if (enable_preemption_timer)
-		vmx_update_hv_timer(vcpu);
-
-	if (lapic_in_kernel(vcpu) &&
-		vcpu->arch.apic->lapic_timer.timer_advance_ns)
-		kvm_wait_lapic_expire(vcpu);
-
-	/*
-	 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
-	 * it's non-zero. Since vmentry is serialising on affected CPUs, there
-	 * is no need to worry about the conditional branch over the wrmsr
-	 * being speculatively taken.
-	 */
-	x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
-
-	/* L1D Flush includes CPU buffer clear to mitigate MDS */
-	if (static_branch_unlikely(&vmx_l1d_should_flush))
-		vmx_l1d_flush(vcpu);
-	else if (static_branch_unlikely(&mds_user_clear))
-		mds_clear_cpu_buffers();
-
-	if (vcpu->arch.cr2 != read_cr2())
-		write_cr2(vcpu->arch.cr2);
-
-	vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
-				   vmx->loaded_vmcs->launched);
-
-	vcpu->arch.cr2 = read_cr2();
-
-	/*
-	 * We do not use IBRS in the kernel. If this vCPU has used the
-	 * SPEC_CTRL MSR it may have left it on; save the value and
-	 * turn it off. This is much more efficient than blindly adding
-	 * it to the atomic save/restore list. Especially as the former
-	 * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
-	 *
-	 * For non-nested case:
-	 * If the L01 MSR bitmap does not intercept the MSR, then we need to
-	 * save it.
-	 *
-	 * For nested case:
-	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
-	 * save it.
-	 */
-	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
-		vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
-
-	x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
-
-	/* All fields are clean at this point */
-	if (static_branch_unlikely(&enable_evmcs))
-		current_evmcs->hv_clean_fields |=
-			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
-
-	if (static_branch_unlikely(&enable_evmcs))
-		current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index;
-
-	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
-	if (vmx->host_debugctlmsr)
-		update_debugctlmsr(vmx->host_debugctlmsr);
-
-#ifndef CONFIG_X86_64
-	/*
-	 * The sysexit path does not restore ds/es, so we must set them to
-	 * a reasonable value ourselves.
-	 *
-	 * We can't defer this to vmx_prepare_switch_to_host() since that
-	 * function may be executed in interrupt context, which saves and
-	 * restore segments around it, nullifying its effect.
-	 */
-	loadsegment(ds, __USER_DS);
-	loadsegment(es, __USER_DS);
-#endif
-
-	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
-				  | (1 << VCPU_EXREG_RFLAGS)
-				  | (1 << VCPU_EXREG_PDPTR)
-				  | (1 << VCPU_EXREG_SEGMENTS)
-				  | (1 << VCPU_EXREG_CR3));
-	vcpu->arch.regs_dirty = 0;
-
-	pt_guest_exit(vmx);
-
-	/*
-	 * eager fpu is enabled if PKEY is supported and CR4 is switched
-	 * back on host, so it is safe to read guest PKRU from current
-	 * XSAVE.
-	 */
-	if (static_cpu_has(X86_FEATURE_PKU) &&
-	    kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
-		vcpu->arch.pkru = rdpkru();
-		if (vcpu->arch.pkru != vmx->host_pkru)
-			__write_pkru(vmx->host_pkru);
-	}
-
-	kvm_load_host_xsave_state(vcpu);
-
-	vmx->nested.nested_run_pending = 0;
-	vmx->idt_vectoring_info = 0;
-
-	vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
-	if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
-		kvm_machine_check();
-
-	if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
-		return;
-
-	vmx->loaded_vmcs->launched = 1;
-	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-
-	vmx_recover_nmi_blocking(vmx);
-	vmx_complete_interrupts(vmx);
-}
-
-static struct kvm *vmx_vm_alloc(void)
-{
-	struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx),
-					    GFP_KERNEL_ACCOUNT | __GFP_ZERO,
-					    PAGE_KERNEL);
-	return &kvm_vmx->kvm;
-}
-
-static void vmx_vm_free(struct kvm *kvm)
-{
-	kfree(kvm->arch.hyperv.hv_pa_pg);
-	vfree(to_kvm_vmx(kvm));
-}
-
-static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	if (enable_pml)
-		vmx_destroy_pml_buffer(vmx);
-	free_vpid(vmx->vpid);
-	nested_vmx_free_vcpu(vcpu);
-	free_loaded_vmcs(vmx->loaded_vmcs);
-	kvm_vcpu_uninit(vcpu);
-	kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu);
-	kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu);
-	kmem_cache_free(kvm_vcpu_cache, vmx);
-}
-
-static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
-{
-	int err;
-	struct vcpu_vmx *vmx;
-	unsigned long *msr_bitmap;
-	int i, cpu;
-
-	BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0,
-		"struct kvm_vcpu must be at offset 0 for arch usercopy region");
-
-	vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
-	if (!vmx)
-		return ERR_PTR(-ENOMEM);
-
-	vmx->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
-			GFP_KERNEL_ACCOUNT);
-	if (!vmx->vcpu.arch.user_fpu) {
-		printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n");
-		err = -ENOMEM;
-		goto free_partial_vcpu;
-	}
-
-	vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
-			GFP_KERNEL_ACCOUNT);
-	if (!vmx->vcpu.arch.guest_fpu) {
-		printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
-		err = -ENOMEM;
-		goto free_user_fpu;
-	}
-
-	vmx->vpid = allocate_vpid();
-
-	err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
-	if (err)
-		goto free_vcpu;
-
-	err = -ENOMEM;
-
-	/*
-	 * If PML is turned on, failure on enabling PML just results in failure
-	 * of creating the vcpu, therefore we can simplify PML logic (by
-	 * avoiding dealing with cases, such as enabling PML partially on vcpus
-	 * for the guest), etc.
-	 */
-	if (enable_pml) {
-		vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
-		if (!vmx->pml_pg)
-			goto uninit_vcpu;
-	}
-
-	BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != NR_SHARED_MSRS);
-
-	for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
-		u32 index = vmx_msr_index[i];
-		u32 data_low, data_high;
-		int j = vmx->nmsrs;
-
-		if (rdmsr_safe(index, &data_low, &data_high) < 0)
-			continue;
-		if (wrmsr_safe(index, data_low, data_high) < 0)
-			continue;
-
-		vmx->guest_msrs[j].index = i;
-		vmx->guest_msrs[j].data = 0;
-		switch (index) {
-		case MSR_IA32_TSX_CTRL:
-			/*
-			 * No need to pass TSX_CTRL_CPUID_CLEAR through, so
-			 * let's avoid changing CPUID bits under the host
-			 * kernel's feet.
-			 */
-			vmx->guest_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
-			break;
-		default:
-			vmx->guest_msrs[j].mask = -1ull;
-			break;
-		}
-		++vmx->nmsrs;
-	}
-
-	err = alloc_loaded_vmcs(&vmx->vmcs01);
-	if (err < 0)
-		goto free_pml;
-
-	msr_bitmap = vmx->vmcs01.msr_bitmap;
-	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R);
-	vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
-	vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
-	vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
-	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
-	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
-	vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
-	if (kvm_cstate_in_guest(kvm)) {
-		vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R);
-		vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
-		vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
-		vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
-	}
-	vmx->msr_bitmap_mode = 0;
-
-	vmx->loaded_vmcs = &vmx->vmcs01;
-	cpu = get_cpu();
-	vmx_vcpu_load(&vmx->vcpu, cpu);
-	vmx->vcpu.cpu = cpu;
-	init_vmcs(vmx);
-	vmx_vcpu_put(&vmx->vcpu);
-	put_cpu();
-	if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
-		err = alloc_apic_access_page(kvm);
-		if (err)
-			goto free_vmcs;
-	}
-
-	if (enable_ept && !enable_unrestricted_guest) {
-		err = init_rmode_identity_map(kvm);
-		if (err)
-			goto free_vmcs;
-	}
-
-	if (nested)
-		nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
-					   vmx_capability.ept,
-					   kvm_vcpu_apicv_active(&vmx->vcpu));
-	else
-		memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
-
-	vmx->nested.posted_intr_nv = -1;
-	vmx->nested.current_vmptr = -1ull;
-
-	vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
-
-	/*
-	 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
-	 * or POSTED_INTR_WAKEUP_VECTOR.
-	 */
-	vmx->pi_desc.nv = POSTED_INTR_VECTOR;
-	vmx->pi_desc.sn = 1;
-
-	vmx->ept_pointer = INVALID_PAGE;
-
-	return &vmx->vcpu;
-
-free_vmcs:
-	free_loaded_vmcs(vmx->loaded_vmcs);
-free_pml:
-	vmx_destroy_pml_buffer(vmx);
-uninit_vcpu:
-	kvm_vcpu_uninit(&vmx->vcpu);
-free_vcpu:
-	free_vpid(vmx->vpid);
-	kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu);
-free_user_fpu:
-	kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu);
-free_partial_vcpu:
-	kmem_cache_free(kvm_vcpu_cache, vmx);
-	return ERR_PTR(err);
-}
-
-#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
-#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
-
-static int vmx_vm_init(struct kvm *kvm)
-{
-	spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
-
-	if (!ple_gap)
-		kvm->arch.pause_in_guest = true;
-
-	if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
-		switch (l1tf_mitigation) {
-		case L1TF_MITIGATION_OFF:
-		case L1TF_MITIGATION_FLUSH_NOWARN:
-			/* 'I explicitly don't care' is set */
-			break;
-		case L1TF_MITIGATION_FLUSH:
-		case L1TF_MITIGATION_FLUSH_NOSMT:
-		case L1TF_MITIGATION_FULL:
-			/*
-			 * Warn upon starting the first VM in a potentially
-			 * insecure environment.
-			 */
-			if (sched_smt_active())
-				pr_warn_once(L1TF_MSG_SMT);
-			if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
-				pr_warn_once(L1TF_MSG_L1D);
-			break;
-		case L1TF_MITIGATION_FULL_FORCE:
-			/* Flush is enforced */
-			break;
-		}
-	}
-	return 0;
-}
-
-static int __init vmx_check_processor_compat(void)
-{
-	struct vmcs_config vmcs_conf;
-	struct vmx_capability vmx_cap;
-
-	if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
-		return -EIO;
-	if (nested)
-		nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept,
-					   enable_apicv);
-	if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
-		printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
-				smp_processor_id());
-		return -EIO;
-	}
-	return 0;
-}
-
-static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
-{
-	u8 cache;
-	u64 ipat = 0;
-
-	/* For VT-d and EPT combination
-	 * 1. MMIO: always map as UC
-	 * 2. EPT with VT-d:
-	 *   a. VT-d without snooping control feature: can't guarantee the
-	 *	result, try to trust guest.
-	 *   b. VT-d with snooping control feature: snooping control feature of
-	 *	VT-d engine can guarantee the cache correctness. Just set it
-	 *	to WB to keep consistent with host. So the same as item 3.
-	 * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
-	 *    consistent with host MTRR
-	 */
-	if (is_mmio) {
-		cache = MTRR_TYPE_UNCACHABLE;
-		goto exit;
-	}
-
-	if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
-		ipat = VMX_EPT_IPAT_BIT;
-		cache = MTRR_TYPE_WRBACK;
-		goto exit;
-	}
-
-	if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
-		ipat = VMX_EPT_IPAT_BIT;
-		if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
-			cache = MTRR_TYPE_WRBACK;
-		else
-			cache = MTRR_TYPE_UNCACHABLE;
-		goto exit;
-	}
-
-	cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
-
-exit:
-	return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
-}
-
-static int vmx_get_lpage_level(void)
-{
-	if (enable_ept && !cpu_has_vmx_ept_1g_page())
-		return PT_DIRECTORY_LEVEL;
-	else
-		/* For shadow and EPT supported 1GB page */
-		return PT_PDPE_LEVEL;
-}
-
-static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
-{
-	/*
-	 * These bits in the secondary execution controls field
-	 * are dynamic, the others are mostly based on the hypervisor
-	 * architecture and the guest's CPUID.  Do not touch the
-	 * dynamic bits.
-	 */
-	u32 mask =
-		SECONDARY_EXEC_SHADOW_VMCS |
-		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
-		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
-		SECONDARY_EXEC_DESC;
-
-	u32 new_ctl = vmx->secondary_exec_control;
-	u32 cur_ctl = secondary_exec_controls_get(vmx);
-
-	secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
-}
-
-/*
- * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
- * (indicating "allowed-1") if they are supported in the guest's CPUID.
- */
-static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct kvm_cpuid_entry2 *entry;
-
-	vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
-	vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
-
-#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do {		\
-	if (entry && (entry->_reg & (_cpuid_mask)))			\
-		vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask);	\
-} while (0)
-
-	entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
-	cr4_fixed1_update(X86_CR4_VME,        edx, bit(X86_FEATURE_VME));
-	cr4_fixed1_update(X86_CR4_PVI,        edx, bit(X86_FEATURE_VME));
-	cr4_fixed1_update(X86_CR4_TSD,        edx, bit(X86_FEATURE_TSC));
-	cr4_fixed1_update(X86_CR4_DE,         edx, bit(X86_FEATURE_DE));
-	cr4_fixed1_update(X86_CR4_PSE,        edx, bit(X86_FEATURE_PSE));
-	cr4_fixed1_update(X86_CR4_PAE,        edx, bit(X86_FEATURE_PAE));
-	cr4_fixed1_update(X86_CR4_MCE,        edx, bit(X86_FEATURE_MCE));
-	cr4_fixed1_update(X86_CR4_PGE,        edx, bit(X86_FEATURE_PGE));
-	cr4_fixed1_update(X86_CR4_OSFXSR,     edx, bit(X86_FEATURE_FXSR));
-	cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
-	cr4_fixed1_update(X86_CR4_VMXE,       ecx, bit(X86_FEATURE_VMX));
-	cr4_fixed1_update(X86_CR4_SMXE,       ecx, bit(X86_FEATURE_SMX));
-	cr4_fixed1_update(X86_CR4_PCIDE,      ecx, bit(X86_FEATURE_PCID));
-	cr4_fixed1_update(X86_CR4_OSXSAVE,    ecx, bit(X86_FEATURE_XSAVE));
-
-	entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
-	cr4_fixed1_update(X86_CR4_FSGSBASE,   ebx, bit(X86_FEATURE_FSGSBASE));
-	cr4_fixed1_update(X86_CR4_SMEP,       ebx, bit(X86_FEATURE_SMEP));
-	cr4_fixed1_update(X86_CR4_SMAP,       ebx, bit(X86_FEATURE_SMAP));
-	cr4_fixed1_update(X86_CR4_PKE,        ecx, bit(X86_FEATURE_PKU));
-	cr4_fixed1_update(X86_CR4_UMIP,       ecx, bit(X86_FEATURE_UMIP));
-	cr4_fixed1_update(X86_CR4_LA57,       ecx, bit(X86_FEATURE_LA57));
-
-#undef cr4_fixed1_update
-}
-
-static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	if (kvm_mpx_supported()) {
-		bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
-
-		if (mpx_enabled) {
-			vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
-			vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
-		} else {
-			vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
-			vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
-		}
-	}
-}
-
-static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct kvm_cpuid_entry2 *best = NULL;
-	int i;
-
-	for (i = 0; i < PT_CPUID_LEAVES; i++) {
-		best = kvm_find_cpuid_entry(vcpu, 0x14, i);
-		if (!best)
-			return;
-		vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
-		vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
-		vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
-		vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
-	}
-
-	/* Get the number of configurable Address Ranges for filtering */
-	vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps,
-						PT_CAP_num_address_ranges);
-
-	/* Initialize and clear the no dependency bits */
-	vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
-			RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC);
-
-	/*
-	 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise
-	 * will inject an #GP
-	 */
-	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
-		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
-
-	/*
-	 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
-	 * PSBFreq can be set
-	 */
-	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
-		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
-				RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
-
-	/*
-	 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and
-	 * MTCFreq can be set
-	 */
-	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
-		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
-				RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE);
-
-	/* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
-	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
-		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
-							RTIT_CTL_PTW_EN);
-
-	/* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
-	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
-		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
-
-	/* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
-	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
-		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
-
-	/* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabircEn can be set */
-	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
-		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
-
-	/* unmask address range configure area */
-	for (i = 0; i < vmx->pt_desc.addr_range; i++)
-		vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
-}
-
-static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	/* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
-	vcpu->arch.xsaves_enabled = false;
-
-	if (cpu_has_secondary_exec_ctrls()) {
-		vmx_compute_secondary_exec_control(vmx);
-		vmcs_set_secondary_exec_control(vmx);
-	}
-
-	if (nested_vmx_allowed(vcpu))
-		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
-			FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX |
-			FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
-	else
-		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
-			~(FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX |
-			  FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX);
-
-	if (nested_vmx_allowed(vcpu)) {
-		nested_vmx_cr_fixed1_bits_update(vcpu);
-		nested_vmx_entry_exit_ctls_update(vcpu);
-	}
-
-	if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
-			guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
-		update_intel_pt_cfg(vcpu);
-
-	if (boot_cpu_has(X86_FEATURE_RTM)) {
-		struct shared_msr_entry *msr;
-		msr = find_msr_entry(vmx, MSR_IA32_TSX_CTRL);
-		if (msr) {
-			bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
-			vmx_set_guest_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
-		}
-	}
-}
-
-static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
-{
-	if (func == 1 && nested)
-		entry->ecx |= bit(X86_FEATURE_VMX);
-}
-
-static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
-{
-	to_vmx(vcpu)->req_immediate_exit = true;
-}
-
-static int vmx_check_intercept(struct kvm_vcpu *vcpu,
-			       struct x86_instruction_info *info,
-			       enum x86_intercept_stage stage)
-{
-	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
-
-	/*
-	 * RDPID causes #UD if disabled through secondary execution controls.
-	 * Because it is marked as EmulateOnUD, we need to intercept it here.
-	 */
-	if (info->intercept == x86_intercept_rdtscp &&
-	    !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
-		ctxt->exception.vector = UD_VECTOR;
-		ctxt->exception.error_code_valid = false;
-		return X86EMUL_PROPAGATE_FAULT;
-	}
-
-	/* TODO: check more intercepts... */
-	return X86EMUL_CONTINUE;
-}
-
-#ifdef CONFIG_X86_64
-/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
-static inline int u64_shl_div_u64(u64 a, unsigned int shift,
-				  u64 divisor, u64 *result)
-{
-	u64 low = a << shift, high = a >> (64 - shift);
-
-	/* To avoid the overflow on divq */
-	if (high >= divisor)
-		return 1;
-
-	/* Low hold the result, high hold rem which is discarded */
-	asm("divq %2\n\t" : "=a" (low), "=d" (high) :
-	    "rm" (divisor), "0" (low), "1" (high));
-	*result = low;
-
-	return 0;
-}
-
-static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
-			    bool *expired)
-{
-	struct vcpu_vmx *vmx;
-	u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
-	struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
-
-	if (kvm_mwait_in_guest(vcpu->kvm) ||
-		kvm_can_post_timer_interrupt(vcpu))
-		return -EOPNOTSUPP;
-
-	vmx = to_vmx(vcpu);
-	tscl = rdtsc();
-	guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
-	delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
-	lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
-						    ktimer->timer_advance_ns);
-
-	if (delta_tsc > lapic_timer_advance_cycles)
-		delta_tsc -= lapic_timer_advance_cycles;
-	else
-		delta_tsc = 0;
-
-	/* Convert to host delta tsc if tsc scaling is enabled */
-	if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
-	    delta_tsc && u64_shl_div_u64(delta_tsc,
-				kvm_tsc_scaling_ratio_frac_bits,
-				vcpu->arch.tsc_scaling_ratio, &delta_tsc))
-		return -ERANGE;
-
-	/*
-	 * If the delta tsc can't fit in the 32 bit after the multi shift,
-	 * we can't use the preemption timer.
-	 * It's possible that it fits on later vmentries, but checking
-	 * on every vmentry is costly so we just use an hrtimer.
-	 */
-	if (delta_tsc >> (cpu_preemption_timer_multi + 32))
-		return -ERANGE;
-
-	vmx->hv_deadline_tsc = tscl + delta_tsc;
-	*expired = !delta_tsc;
-	return 0;
-}
-
-static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
-{
-	to_vmx(vcpu)->hv_deadline_tsc = -1;
-}
-#endif
-
-static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
-{
-	if (!kvm_pause_in_guest(vcpu->kvm))
-		shrink_ple_window(vcpu);
-}
-
-static void vmx_slot_enable_log_dirty(struct kvm *kvm,
-				     struct kvm_memory_slot *slot)
-{
-	kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
-	kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
-}
-
-static void vmx_slot_disable_log_dirty(struct kvm *kvm,
-				       struct kvm_memory_slot *slot)
-{
-	kvm_mmu_slot_set_dirty(kvm, slot);
-}
-
-static void vmx_flush_log_dirty(struct kvm *kvm)
-{
-	kvm_flush_pml_buffers(kvm);
-}
-
-static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
-{
-	struct vmcs12 *vmcs12;
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	gpa_t gpa, dst;
-
-	if (is_guest_mode(vcpu)) {
-		WARN_ON_ONCE(vmx->nested.pml_full);
-
-		/*
-		 * Check if PML is enabled for the nested guest.
-		 * Whether eptp bit 6 is set is already checked
-		 * as part of A/D emulation.
-		 */
-		vmcs12 = get_vmcs12(vcpu);
-		if (!nested_cpu_has_pml(vmcs12))
-			return 0;
-
-		if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
-			vmx->nested.pml_full = true;
-			return 1;
-		}
-
-		gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
-		dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
-
-		if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
-					 offset_in_page(dst), sizeof(gpa)))
-			return 0;
-
-		vmcs12->guest_pml_index--;
-	}
-
-	return 0;
-}
-
-static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
-					   struct kvm_memory_slot *memslot,
-					   gfn_t offset, unsigned long mask)
-{
-	kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
-}
-
-static void __pi_post_block(struct kvm_vcpu *vcpu)
-{
-	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-	struct pi_desc old, new;
-	unsigned int dest;
-
-	do {
-		old.control = new.control = pi_desc->control;
-		WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
-		     "Wakeup handler not enabled while the VCPU is blocked\n");
-
-		dest = cpu_physical_id(vcpu->cpu);
-
-		if (x2apic_enabled())
-			new.ndst = dest;
-		else
-			new.ndst = (dest << 8) & 0xFF00;
-
-		/* set 'NV' to 'notification vector' */
-		new.nv = POSTED_INTR_VECTOR;
-	} while (cmpxchg64(&pi_desc->control, old.control,
-			   new.control) != old.control);
-
-	if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
-		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
-		list_del(&vcpu->blocked_vcpu_list);
-		spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
-		vcpu->pre_pcpu = -1;
-	}
-}
-
-/*
- * This routine does the following things for vCPU which is going
- * to be blocked if VT-d PI is enabled.
- * - Store the vCPU to the wakeup list, so when interrupts happen
- *   we can find the right vCPU to wake up.
- * - Change the Posted-interrupt descriptor as below:
- *      'NDST' <-- vcpu->pre_pcpu
- *      'NV' <-- POSTED_INTR_WAKEUP_VECTOR
- * - If 'ON' is set during this process, which means at least one
- *   interrupt is posted for this vCPU, we cannot block it, in
- *   this case, return 1, otherwise, return 0.
- *
- */
-static int pi_pre_block(struct kvm_vcpu *vcpu)
-{
-	unsigned int dest;
-	struct pi_desc old, new;
-	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
-	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
-		!irq_remapping_cap(IRQ_POSTING_CAP)  ||
-		!kvm_vcpu_apicv_active(vcpu))
-		return 0;
-
-	WARN_ON(irqs_disabled());
-	local_irq_disable();
-	if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
-		vcpu->pre_pcpu = vcpu->cpu;
-		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
-		list_add_tail(&vcpu->blocked_vcpu_list,
-			      &per_cpu(blocked_vcpu_on_cpu,
-				       vcpu->pre_pcpu));
-		spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
-	}
-
-	do {
-		old.control = new.control = pi_desc->control;
-
-		WARN((pi_desc->sn == 1),
-		     "Warning: SN field of posted-interrupts "
-		     "is set before blocking\n");
-
-		/*
-		 * Since vCPU can be preempted during this process,
-		 * vcpu->cpu could be different with pre_pcpu, we
-		 * need to set pre_pcpu as the destination of wakeup
-		 * notification event, then we can find the right vCPU
-		 * to wakeup in wakeup handler if interrupts happen
-		 * when the vCPU is in blocked state.
-		 */
-		dest = cpu_physical_id(vcpu->pre_pcpu);
-
-		if (x2apic_enabled())
-			new.ndst = dest;
-		else
-			new.ndst = (dest << 8) & 0xFF00;
-
-		/* set 'NV' to 'wakeup vector' */
-		new.nv = POSTED_INTR_WAKEUP_VECTOR;
-	} while (cmpxchg64(&pi_desc->control, old.control,
-			   new.control) != old.control);
-
-	/* We should not block the vCPU if an interrupt is posted for it.  */
-	if (pi_test_on(pi_desc) == 1)
-		__pi_post_block(vcpu);
-
-	local_irq_enable();
-	return (vcpu->pre_pcpu == -1);
-}
-
-static int vmx_pre_block(struct kvm_vcpu *vcpu)
-{
-	if (pi_pre_block(vcpu))
-		return 1;
-
-	if (kvm_lapic_hv_timer_in_use(vcpu))
-		kvm_lapic_switch_to_sw_timer(vcpu);
-
-	return 0;
-}
-
-static void pi_post_block(struct kvm_vcpu *vcpu)
-{
-	if (vcpu->pre_pcpu == -1)
-		return;
-
-	WARN_ON(irqs_disabled());
-	local_irq_disable();
-	__pi_post_block(vcpu);
-	local_irq_enable();
-}
-
-static void vmx_post_block(struct kvm_vcpu *vcpu)
-{
-	if (kvm_x86_ops->set_hv_timer)
-		kvm_lapic_switch_to_hv_timer(vcpu);
-
-	pi_post_block(vcpu);
-}
-
-/*
- * vmx_update_pi_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
-			      uint32_t guest_irq, bool set)
-{
-	struct kvm_kernel_irq_routing_entry *e;
-	struct kvm_irq_routing_table *irq_rt;
-	struct kvm_lapic_irq irq;
-	struct kvm_vcpu *vcpu;
-	struct vcpu_data vcpu_info;
-	int idx, ret = 0;
-
-	if (!kvm_arch_has_assigned_device(kvm) ||
-		!irq_remapping_cap(IRQ_POSTING_CAP) ||
-		!kvm_vcpu_apicv_active(kvm->vcpus[0]))
-		return 0;
-
-	idx = srcu_read_lock(&kvm->irq_srcu);
-	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
-	if (guest_irq >= irq_rt->nr_rt_entries ||
-	    hlist_empty(&irq_rt->map[guest_irq])) {
-		pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
-			     guest_irq, irq_rt->nr_rt_entries);
-		goto out;
-	}
-
-	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
-		if (e->type != KVM_IRQ_ROUTING_MSI)
-			continue;
-		/*
-		 * VT-d PI cannot support posting multicast/broadcast
-		 * interrupts to a vCPU, we still use interrupt remapping
-		 * for these kind of interrupts.
-		 *
-		 * For lowest-priority interrupts, we only support
-		 * those with single CPU as the destination, e.g. user
-		 * configures the interrupts via /proc/irq or uses
-		 * irqbalance to make the interrupts single-CPU.
-		 *
-		 * We will support full lowest-priority interrupt later.
-		 *
-		 * In addition, we can only inject generic interrupts using
-		 * the PI mechanism, refuse to route others through it.
-		 */
-
-		kvm_set_msi_irq(kvm, e, &irq);
-		if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
-		    !kvm_irq_is_postable(&irq)) {
-			/*
-			 * Make sure the IRTE is in remapped mode if
-			 * we don't handle it in posted mode.
-			 */
-			ret = irq_set_vcpu_affinity(host_irq, NULL);
-			if (ret < 0) {
-				printk(KERN_INFO
-				   "failed to back to remapped mode, irq: %u\n",
-				   host_irq);
-				goto out;
-			}
-
-			continue;
-		}
-
-		vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
-		vcpu_info.vector = irq.vector;
-
-		trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
-				vcpu_info.vector, vcpu_info.pi_desc_addr, set);
-
-		if (set)
-			ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
-		else
-			ret = irq_set_vcpu_affinity(host_irq, NULL);
-
-		if (ret < 0) {
-			printk(KERN_INFO "%s: failed to update PI IRTE\n",
-					__func__);
-			goto out;
-		}
-	}
-
-	ret = 0;
-out:
-	srcu_read_unlock(&kvm->irq_srcu, idx);
-	return ret;
-}
-
-static void vmx_setup_mce(struct kvm_vcpu *vcpu)
-{
-	if (vcpu->arch.mcg_cap & MCG_LMCE_P)
-		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
-			FEATURE_CONTROL_LMCE;
-	else
-		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
-			~FEATURE_CONTROL_LMCE;
-}
-
-static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
-{
-	/* we need a nested vmexit to enter SMM, postpone if run is pending */
-	if (to_vmx(vcpu)->nested.nested_run_pending)
-		return 0;
-	return 1;
-}
-
-static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
-	if (vmx->nested.smm.guest_mode)
-		nested_vmx_vmexit(vcpu, -1, 0, 0);
-
-	vmx->nested.smm.vmxon = vmx->nested.vmxon;
-	vmx->nested.vmxon = false;
-	vmx_clear_hlt(vcpu);
-	return 0;
-}
-
-static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	int ret;
-
-	if (vmx->nested.smm.vmxon) {
-		vmx->nested.vmxon = true;
-		vmx->nested.smm.vmxon = false;
-	}
-
-	if (vmx->nested.smm.guest_mode) {
-		ret = nested_vmx_enter_non_root_mode(vcpu, false);
-		if (ret)
-			return ret;
-
-		vmx->nested.smm.guest_mode = false;
-	}
-	return 0;
-}
-
-static int enable_smi_window(struct kvm_vcpu *vcpu)
-{
-	return 0;
-}
-
-static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
-{
-	return false;
-}
-
-static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
-{
-	return to_vmx(vcpu)->nested.vmxon;
-}
-
-static __init int hardware_setup(void)
-{
-	unsigned long host_bndcfgs;
-	struct desc_ptr dt;
-	int r, i;
-
-	rdmsrl_safe(MSR_EFER, &host_efer);
-
-	store_idt(&dt);
-	host_idt_base = dt.address;
-
-	for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
-		kvm_define_shared_msr(i, vmx_msr_index[i]);
-
-	if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
-		return -EIO;
-
-	if (boot_cpu_has(X86_FEATURE_NX))
-		kvm_enable_efer_bits(EFER_NX);
-
-	if (boot_cpu_has(X86_FEATURE_MPX)) {
-		rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
-		WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
-	}
-
-	if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
-	    !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
-		enable_vpid = 0;
-
-	if (!cpu_has_vmx_ept() ||
-	    !cpu_has_vmx_ept_4levels() ||
-	    !cpu_has_vmx_ept_mt_wb() ||
-	    !cpu_has_vmx_invept_global())
-		enable_ept = 0;
-
-	if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
-		enable_ept_ad_bits = 0;
-
-	if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
-		enable_unrestricted_guest = 0;
-
-	if (!cpu_has_vmx_flexpriority())
-		flexpriority_enabled = 0;
-
-	if (!cpu_has_virtual_nmis())
-		enable_vnmi = 0;
-
-	/*
-	 * set_apic_access_page_addr() is used to reload apic access
-	 * page upon invalidation.  No need to do anything if not
-	 * using the APIC_ACCESS_ADDR VMCS field.
-	 */
-	if (!flexpriority_enabled)
-		kvm_x86_ops->set_apic_access_page_addr = NULL;
-
-	if (!cpu_has_vmx_tpr_shadow())
-		kvm_x86_ops->update_cr8_intercept = NULL;
-
-	if (enable_ept && !cpu_has_vmx_ept_2m_page())
-		kvm_disable_largepages();
-
-#if IS_ENABLED(CONFIG_HYPERV)
-	if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
-	    && enable_ept) {
-		kvm_x86_ops->tlb_remote_flush = hv_remote_flush_tlb;
-		kvm_x86_ops->tlb_remote_flush_with_range =
-				hv_remote_flush_tlb_with_range;
-	}
-#endif
-
-	if (!cpu_has_vmx_ple()) {
-		ple_gap = 0;
-		ple_window = 0;
-		ple_window_grow = 0;
-		ple_window_max = 0;
-		ple_window_shrink = 0;
-	}
-
-	if (!cpu_has_vmx_apicv()) {
-		enable_apicv = 0;
-		kvm_x86_ops->sync_pir_to_irr = NULL;
-	}
-
-	if (cpu_has_vmx_tsc_scaling()) {
-		kvm_has_tsc_control = true;
-		kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
-		kvm_tsc_scaling_ratio_frac_bits = 48;
-	}
-
-	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
-
-	if (enable_ept)
-		vmx_enable_tdp();
-	else
-		kvm_disable_tdp();
-
-	/*
-	 * Only enable PML when hardware supports PML feature, and both EPT
-	 * and EPT A/D bit features are enabled -- PML depends on them to work.
-	 */
-	if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
-		enable_pml = 0;
-
-	if (!enable_pml) {
-		kvm_x86_ops->slot_enable_log_dirty = NULL;
-		kvm_x86_ops->slot_disable_log_dirty = NULL;
-		kvm_x86_ops->flush_log_dirty = NULL;
-		kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
-	}
-
-	if (!cpu_has_vmx_preemption_timer())
-		enable_preemption_timer = false;
-
-	if (enable_preemption_timer) {
-		u64 use_timer_freq = 5000ULL * 1000 * 1000;
-		u64 vmx_msr;
-
-		rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
-		cpu_preemption_timer_multi =
-			vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
-
-		if (tsc_khz)
-			use_timer_freq = (u64)tsc_khz * 1000;
-		use_timer_freq >>= cpu_preemption_timer_multi;
-
-		/*
-		 * KVM "disables" the preemption timer by setting it to its max
-		 * value.  Don't use the timer if it might cause spurious exits
-		 * at a rate faster than 0.1 Hz (of uninterrupted guest time).
-		 */
-		if (use_timer_freq > 0xffffffffu / 10)
-			enable_preemption_timer = false;
-	}
-
-	if (!enable_preemption_timer) {
-		kvm_x86_ops->set_hv_timer = NULL;
-		kvm_x86_ops->cancel_hv_timer = NULL;
-		kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
-	}
-
-	kvm_set_posted_intr_wakeup_handler(wakeup_handler);
-
-	kvm_mce_cap_supported |= MCG_LMCE_P;
-
-	if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
-		return -EINVAL;
-	if (!enable_ept || !cpu_has_vmx_intel_pt())
-		pt_mode = PT_MODE_SYSTEM;
-
-	if (nested) {
-		nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
-					   vmx_capability.ept, enable_apicv);
-
-		r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
-		if (r)
-			return r;
-	}
-
-	r = alloc_kvm_area();
-	if (r)
-		nested_vmx_hardware_unsetup();
-	return r;
-}
-
-static __exit void hardware_unsetup(void)
-{
-	if (nested)
-		nested_vmx_hardware_unsetup();
-
-	free_kvm_area();
-}
-
-static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
-	.cpu_has_kvm_support = cpu_has_kvm_support,
-	.disabled_by_bios = vmx_disabled_by_bios,
-	.hardware_setup = hardware_setup,
-	.hardware_unsetup = hardware_unsetup,
-	.check_processor_compatibility = vmx_check_processor_compat,
-	.hardware_enable = hardware_enable,
-	.hardware_disable = hardware_disable,
-	.cpu_has_accelerated_tpr = report_flexpriority,
-	.has_emulated_msr = vmx_has_emulated_msr,
-
-	.vm_init = vmx_vm_init,
-	.vm_alloc = vmx_vm_alloc,
-	.vm_free = vmx_vm_free,
-
-	.vcpu_create = vmx_create_vcpu,
-	.vcpu_free = vmx_free_vcpu,
-	.vcpu_reset = vmx_vcpu_reset,
-
-	.prepare_guest_switch = vmx_prepare_switch_to_guest,
-	.vcpu_load = vmx_vcpu_load,
-	.vcpu_put = vmx_vcpu_put,
-
-	.update_bp_intercept = update_exception_bitmap,
-	.get_msr_feature = vmx_get_msr_feature,
-	.get_msr = vmx_get_msr,
-	.set_msr = vmx_set_msr,
-	.get_segment_base = vmx_get_segment_base,
-	.get_segment = vmx_get_segment,
-	.set_segment = vmx_set_segment,
-	.get_cpl = vmx_get_cpl,
-	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
-	.decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
-	.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
-	.set_cr0 = vmx_set_cr0,
-	.set_cr3 = vmx_set_cr3,
-	.set_cr4 = vmx_set_cr4,
-	.set_efer = vmx_set_efer,
-	.get_idt = vmx_get_idt,
-	.set_idt = vmx_set_idt,
-	.get_gdt = vmx_get_gdt,
-	.set_gdt = vmx_set_gdt,
-	.get_dr6 = vmx_get_dr6,
-	.set_dr6 = vmx_set_dr6,
-	.set_dr7 = vmx_set_dr7,
-	.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
-	.cache_reg = vmx_cache_reg,
-	.get_rflags = vmx_get_rflags,
-	.set_rflags = vmx_set_rflags,
-
-	.tlb_flush = vmx_flush_tlb,
-	.tlb_flush_gva = vmx_flush_tlb_gva,
-
-	.run = vmx_vcpu_run,
-	.handle_exit = vmx_handle_exit,
-	.skip_emulated_instruction = skip_emulated_instruction,
-	.set_interrupt_shadow = vmx_set_interrupt_shadow,
-	.get_interrupt_shadow = vmx_get_interrupt_shadow,
-	.patch_hypercall = vmx_patch_hypercall,
-	.set_irq = vmx_inject_irq,
-	.set_nmi = vmx_inject_nmi,
-	.queue_exception = vmx_queue_exception,
-	.cancel_injection = vmx_cancel_injection,
-	.interrupt_allowed = vmx_interrupt_allowed,
-	.nmi_allowed = vmx_nmi_allowed,
-	.get_nmi_mask = vmx_get_nmi_mask,
-	.set_nmi_mask = vmx_set_nmi_mask,
-	.enable_nmi_window = enable_nmi_window,
-	.enable_irq_window = enable_irq_window,
-	.update_cr8_intercept = update_cr8_intercept,
-	.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
-	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
-	.get_enable_apicv = vmx_get_enable_apicv,
-	.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
-	.load_eoi_exitmap = vmx_load_eoi_exitmap,
-	.apicv_post_state_restore = vmx_apicv_post_state_restore,
-	.hwapic_irr_update = vmx_hwapic_irr_update,
-	.hwapic_isr_update = vmx_hwapic_isr_update,
-	.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
-	.sync_pir_to_irr = vmx_sync_pir_to_irr,
-	.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
-	.dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt,
-
-	.set_tss_addr = vmx_set_tss_addr,
-	.set_identity_map_addr = vmx_set_identity_map_addr,
-	.get_tdp_level = get_ept_level,
-	.get_mt_mask = vmx_get_mt_mask,
-
-	.get_exit_info = vmx_get_exit_info,
-
-	.get_lpage_level = vmx_get_lpage_level,
-
-	.cpuid_update = vmx_cpuid_update,
-
-	.rdtscp_supported = vmx_rdtscp_supported,
-	.invpcid_supported = vmx_invpcid_supported,
-
-	.set_supported_cpuid = vmx_set_supported_cpuid,
-
-	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
-
-	.read_l1_tsc_offset = vmx_read_l1_tsc_offset,
-	.write_l1_tsc_offset = vmx_write_l1_tsc_offset,
-
-	.set_tdp_cr3 = vmx_set_cr3,
-
-	.check_intercept = vmx_check_intercept,
-	.handle_exit_irqoff = vmx_handle_exit_irqoff,
-	.mpx_supported = vmx_mpx_supported,
-	.xsaves_supported = vmx_xsaves_supported,
-	.umip_emulated = vmx_umip_emulated,
-	.pt_supported = vmx_pt_supported,
-
-	.request_immediate_exit = vmx_request_immediate_exit,
-
-	.sched_in = vmx_sched_in,
-
-	.slot_enable_log_dirty = vmx_slot_enable_log_dirty,
-	.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
-	.flush_log_dirty = vmx_flush_log_dirty,
-	.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
-	.write_log_dirty = vmx_write_pml_buffer,
-
-	.pre_block = vmx_pre_block,
-	.post_block = vmx_post_block,
-
-	.pmu_ops = &intel_pmu_ops,
-
-	.update_pi_irte = vmx_update_pi_irte,
-
-#ifdef CONFIG_X86_64
-	.set_hv_timer = vmx_set_hv_timer,
-	.cancel_hv_timer = vmx_cancel_hv_timer,
-#endif
-
-	.setup_mce = vmx_setup_mce,
-
-	.smi_allowed = vmx_smi_allowed,
-	.pre_enter_smm = vmx_pre_enter_smm,
-	.pre_leave_smm = vmx_pre_leave_smm,
-	.enable_smi_window = enable_smi_window,
-
-	.check_nested_events = NULL,
-	.get_nested_state = NULL,
-	.set_nested_state = NULL,
-	.get_vmcs12_pages = NULL,
-	.nested_enable_evmcs = NULL,
-	.nested_get_evmcs_version = NULL,
-	.need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
-	.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
-};
-
-static void vmx_cleanup_l1d_flush(void)
-{
-	if (vmx_l1d_flush_pages) {
-		free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
-		vmx_l1d_flush_pages = NULL;
-	}
-	/* Restore state so sysfs ignores VMX */
-	l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
-}
-
-static void vmx_exit(void)
-{
-#ifdef CONFIG_KEXEC_CORE
-	RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
-	synchronize_rcu();
-#endif
-
-	kvm_exit();
-
-#if IS_ENABLED(CONFIG_HYPERV)
-	if (static_branch_unlikely(&enable_evmcs)) {
-		int cpu;
-		struct hv_vp_assist_page *vp_ap;
-		/*
-		 * Reset everything to support using non-enlightened VMCS
-		 * access later (e.g. when we reload the module with
-		 * enlightened_vmcs=0)
-		 */
-		for_each_online_cpu(cpu) {
-			vp_ap =	hv_get_vp_assist_page(cpu);
-
-			if (!vp_ap)
-				continue;
-
-			vp_ap->nested_control.features.directhypercall = 0;
-			vp_ap->current_nested_vmcs = 0;
-			vp_ap->enlighten_vmentry = 0;
-		}
-
-		static_branch_disable(&enable_evmcs);
-	}
-#endif
-	vmx_cleanup_l1d_flush();
-}
-module_exit(vmx_exit);
-
-static int __init vmx_init(void)
-{
-	int r;
-
-#if IS_ENABLED(CONFIG_HYPERV)
-	/*
-	 * Enlightened VMCS usage should be recommended and the host needs
-	 * to support eVMCS v1 or above. We can also disable eVMCS support
-	 * with module parameter.
-	 */
-	if (enlightened_vmcs &&
-	    ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
-	    (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
-	    KVM_EVMCS_VERSION) {
-		int cpu;
-
-		/* Check that we have assist pages on all online CPUs */
-		for_each_online_cpu(cpu) {
-			if (!hv_get_vp_assist_page(cpu)) {
-				enlightened_vmcs = false;
-				break;
-			}
-		}
-
-		if (enlightened_vmcs) {
-			pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
-			static_branch_enable(&enable_evmcs);
-		}
-
-		if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
-			vmx_x86_ops.enable_direct_tlbflush
-				= hv_enable_direct_tlbflush;
-
-	} else {
-		enlightened_vmcs = false;
-	}
-#endif
-
-	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
-		     __alignof__(struct vcpu_vmx), THIS_MODULE);
-	if (r)
-		return r;
-
-	/*
-	 * Must be called after kvm_init() so enable_ept is properly set
-	 * up. Hand the parameter mitigation value in which was stored in
-	 * the pre module init parser. If no parameter was given, it will
-	 * contain 'auto' which will be turned into the default 'cond'
-	 * mitigation mode.
-	 */
-	r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
-	if (r) {
-		vmx_exit();
-		return r;
-	}
-
-#ifdef CONFIG_KEXEC_CORE
-	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
-			   crash_vmclear_local_loaded_vmcss);
-#endif
-	vmx_check_vmcs12_offsets();
-
-	return 0;
-}
-module_init(vmx_init);

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ade694f..2cb379e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c

@@ -8693,12 +8693,6 @@
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.apf.msr_val = 0;
-
-	vcpu_load(vcpu);
-	kvm_mmu_unload(vcpu);
-	vcpu_put(vcpu);
-
 	kvm_arch_vcpu_free(vcpu);
 }
 

diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 0a0e911..5cb9f00 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt

@@ -909,7 +909,7 @@
 
 GrpTable: Grp3_2
 0: TEST Ev,Iz
-1:
+1: TEST Ev,Iz
 2: NOT Ev
 3: NEG Ev
 4: MUL rAX,Ev

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e2d4b25..101f3ad 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c

@@ -2126,19 +2126,13 @@
 		.pgd = pgd,
 		.numpages = numpages,
 		.mask_set = __pgprot(0),
-		.mask_clr = __pgprot(0),
+		.mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)),
 		.flags = 0,
 	};
 
 	if (!(__supported_pte_mask & _PAGE_NX))
 		goto out;
 
-	if (!(page_flags & _PAGE_NX))
-		cpa.mask_clr = __pgprot(_PAGE_NX);
-
-	if (!(page_flags & _PAGE_RW))
-		cpa.mask_clr = __pgprot(_PAGE_RW);
-
 	if (!(page_flags & _PAGE_ENC))
 		cpa.mask_clr = pgprot_encrypted(cpa.mask_clr);
 

diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 335a62e..b129915 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c

@@ -86,6 +86,8 @@
 	pgd_t *save_pgd;
 
 	save_pgd = efi_call_phys_prolog();
+	if (!save_pgd)
+		return EFI_ABORTED;
 
 	/* Disable interrupts around EFI calls: */
 	local_irq_save(flags);
@@ -480,7 +482,6 @@
 	efi_char16_t *c16;
 	char vendor[100] = "unknown";
 	int i = 0;
-	void *tmp;
 
 #ifdef CONFIG_X86_32
 	if (boot_params.efi_info.efi_systab_hi ||
@@ -505,14 +506,16 @@
 	/*
 	 * Show what we know for posterity
 	 */
-	c16 = tmp = early_memremap(efi.systab->fw_vendor, 2);
+	c16 = early_memremap_ro(efi.systab->fw_vendor,
+				sizeof(vendor) * sizeof(efi_char16_t));
 	if (c16) {
-		for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i)
-			vendor[i] = *c16++;
+		for (i = 0; i < sizeof(vendor) - 1 && c16[i]; ++i)
+			vendor[i] = c16[i];
 		vendor[i] = '\0';
-	} else
+		early_memunmap(c16, sizeof(vendor) * sizeof(efi_char16_t));
+	} else {
 		pr_err("Could not map the firmware vendor!\n");
-	early_memunmap(tmp, 2);
+	}
 
 	pr_info("EFI v%u.%.02u by %s\n",
 		efi.systab->hdr.revision >> 16,
@@ -929,16 +932,14 @@
 
 	if (efi_alloc_page_tables()) {
 		pr_err("Failed to allocate EFI page tables\n");
-		clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
-		return;
+		goto err;
 	}
 
 	efi_merge_regions();
 	new_memmap = efi_map_regions(&count, &pg_shift);
 	if (!new_memmap) {
 		pr_err("Error reallocating memory, EFI runtime non-functional!\n");
-		clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
-		return;
+		goto err;
 	}
 
 	pa = __pa(new_memmap);
@@ -952,8 +953,7 @@
 
 	if (efi_memmap_init_late(pa, efi.memmap.desc_size * count)) {
 		pr_err("Failed to remap late EFI memory map\n");
-		clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
-		return;
+		goto err;
 	}
 
 	if (efi_enabled(EFI_DBG)) {
@@ -961,12 +961,11 @@
 		efi_print_memmap();
 	}
 
-	BUG_ON(!efi.systab);
+	if (WARN_ON(!efi.systab))
+		goto err;
 
-	if (efi_setup_page_tables(pa, 1 << pg_shift)) {
-		clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
-		return;
-	}
+	if (efi_setup_page_tables(pa, 1 << pg_shift))
+		goto err;
 
 	efi_sync_low_kernel_mappings();
 
@@ -986,9 +985,9 @@
 	}
 
 	if (status != EFI_SUCCESS) {
-		pr_alert("Unable to switch EFI into virtual mode (status=%lx)!\n",
-			 status);
-		panic("EFI call to SetVirtualAddressMap() failed!");
+		pr_err("Unable to switch EFI into virtual mode (status=%lx)!\n",
+		       status);
+		goto err;
 	}
 
 	/*
@@ -1015,6 +1014,10 @@
 
 	/* clean DUMMY object */
 	efi_delete_dummy_variable();
+	return;
+
+err:
+	clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
 }
 
 void __init efi_enter_virtual_mode(void)

diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index ee5d08f..0407bb9 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c

@@ -84,13 +84,15 @@
 
 	if (!efi_enabled(EFI_OLD_MEMMAP)) {
 		efi_switch_mm(&efi_mm);
-		return NULL;
+		return efi_mm.pgd;
 	}
 
 	early_code_mapping_set_exec(1);
 
 	n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE);
 	save_pgd = kmalloc_array(n_pgds, sizeof(*save_pgd), GFP_KERNEL);
+	if (!save_pgd)
+		return NULL;
 
 	/*
 	 * Build 1:1 identity mapping for efi=old_map usage. Note that
@@ -138,10 +140,11 @@
 		pgd_offset_k(pgd * PGDIR_SIZE)->pgd &= ~_PAGE_NX;
 	}
 
-out:
 	__flush_tlb_all();
-
 	return save_pgd;
+out:
+	efi_call_phys_epilog(save_pgd);
+	return NULL;
 }
 
 void __init efi_call_phys_epilog(pgd_t *save_pgd)
@@ -389,11 +392,12 @@
 		return 0;
 
 	page = alloc_page(GFP_KERNEL|__GFP_DMA32);
-	if (!page)
-		panic("Unable to allocate EFI runtime stack < 4GB\n");
+	if (!page) {
+		pr_err("Unable to allocate EFI runtime stack < 4GB\n");
+		return 1;
+	}
 
-	efi_scratch.phys_stack = virt_to_phys(page_address(page));
-	efi_scratch.phys_stack += PAGE_SIZE; /* stack grows down */
+	efi_scratch.phys_stack = page_to_phys(page + 1); /* stack grows down */
 
 	npages = (_etext - _text) >> PAGE_SHIFT;
 	text = __pa(_text);

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 9fe5952..b139f44 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c

@@ -578,7 +578,8 @@
 	bfqg_and_blkg_get(bfqg);
 
 	if (bfq_bfqq_busy(bfqq)) {
-		bfq_pos_tree_add_move(bfqd, bfqq);
+		if (unlikely(!bfqd->nonrot_with_queueing))
+			bfq_pos_tree_add_move(bfqd, bfqq);
 		bfq_activate_bfqq(bfqd, bfqq);
 	}
 
@@ -1102,7 +1103,7 @@
 	},
 #endif /* CONFIG_DEBUG_BLK_CGROUP */
 
-	/* the same statictics which cover the bfqg and its descendants */
+	/* the same statistics which cover the bfqg and its descendants */
 	{
 		.name = "bfq.io_service_bytes_recursive",
 		.private = (unsigned long)&blkcg_policy_bfq,

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 66b1ebc..b7515ac 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c

@@ -189,7 +189,7 @@
 /*
  * When a sync request is dispatched, the queue that contains that
  * request, and all the ancestor entities of that queue, are charged
- * with the number of sectors of the request. In constrast, if the
+ * with the number of sectors of the request. In contrast, if the
  * request is async, then the queue and its ancestor entities are
  * charged with the number of sectors of the request, multiplied by
  * the factor below. This throttles the bandwidth for async I/O,
@@ -217,7 +217,7 @@
  * queue merging.
  *
  * As can be deduced from the low time limit below, queue merging, if
- * successful, happens at the very beggining of the I/O of the involved
+ * successful, happens at the very beginning of the I/O of the involved
  * cooperating processes, as a consequence of the arrival of the very
  * first requests from each cooperator.  After that, there is very
  * little chance to find cooperators.
@@ -230,13 +230,26 @@
 #define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)
 
 /* hw_tag detection: parallel requests threshold and min samples needed. */
-#define BFQ_HW_QUEUE_THRESHOLD	4
+#define BFQ_HW_QUEUE_THRESHOLD	3
 #define BFQ_HW_QUEUE_SAMPLES	32
 
 #define BFQQ_SEEK_THR		(sector_t)(8 * 100)
 #define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
+#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \
+	(get_sdist(last_pos, rq) >			\
+	 BFQQ_SEEK_THR &&				\
+	 (!blk_queue_nonrot(bfqd->queue) ||		\
+	  blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT))
 #define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
 #define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 19)
+/*
+ * Sync random I/O is likely to be confused with soft real-time I/O,
+ * because it is characterized by limited throughput and apparently
+ * isochronous arrival pattern. To avoid false positives, queues
+ * containing only random (seeky) I/O are prevented from being tagged
+ * as soft real-time.
+ */
+#define BFQQ_TOTALLY_SEEKY(bfqq)	(bfqq->seek_history == -1)
 
 /* Min number of samples required to perform peak-rate update */
 #define BFQ_RATE_MIN_SAMPLES	32
@@ -428,7 +441,7 @@
 
 /*
  * Lifted from AS - choose which of rq1 and rq2 that is best served now.
- * We choose the request that is closesr to the head right now.  Distance
+ * We choose the request that is closer to the head right now.  Distance
  * behind the head is penalized and only allowed to a certain extent.
  */
 static struct request *bfq_choose_req(struct bfq_data *bfqd,
@@ -590,7 +603,16 @@
 				       bfq_merge_time_limit);
 }
 
-void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+/*
+ * The following function is not marked as __cold because it is
+ * actually cold, but for the same performance goal described in the
+ * comments on the likely() at the beginning of
+ * bfq_setup_cooperator(). Unexpectedly, to reach an even lower
+ * execution time for the case where this function is not invoked, we
+ * had to add an unlikely() in each involved if().
+ */
+void __cold
+bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 {
 	struct rb_node **p, *parent;
 	struct bfq_queue *__bfqq;
@@ -624,59 +646,73 @@
 }
 
 /*
- * Tell whether there are active queues or groups with differentiated weights.
- */
-static bool bfq_differentiated_weights(struct bfq_data *bfqd)
-{
-	/*
-	 * For weights to differ, at least one of the trees must contain
-	 * at least two nodes.
-	 */
-	return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
-		(bfqd->queue_weights_tree.rb_node->rb_left ||
-		 bfqd->queue_weights_tree.rb_node->rb_right)
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	       ) ||
-	       (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
-		(bfqd->group_weights_tree.rb_node->rb_left ||
-		 bfqd->group_weights_tree.rb_node->rb_right)
-#endif
-	       );
-}
-
-/*
- * The following function returns true if every queue must receive the
- * same share of the throughput (this condition is used when deciding
- * whether idling may be disabled, see the comments in the function
- * bfq_better_to_idle()).
+ * The following function returns false either if every active queue
+ * must receive the same share of the throughput (symmetric scenario),
+ * or, as a special case, if bfqq must receive a share of the
+ * throughput lower than or equal to the share that every other active
+ * queue must receive.  If bfqq does sync I/O, then these are the only
+ * two cases where bfqq happens to be guaranteed its share of the
+ * throughput even if I/O dispatching is not plugged when bfqq remains
+ * temporarily empty (for more details, see the comments in the
+ * function bfq_better_to_idle()). For this reason, the return value
+ * of this function is used to check whether I/O-dispatch plugging can
+ * be avoided.
  *
- * Such a scenario occurs when:
+ * The above first case (symmetric scenario) occurs when:
  * 1) all active queues have the same weight,
- * 2) all active groups at the same level in the groups tree have the same
- *    weight,
+ * 2) all active queues belong to the same I/O-priority class,
  * 3) all active groups at the same level in the groups tree have the same
+ *    weight,
+ * 4) all active groups at the same level in the groups tree have the same
  *    number of children.
  *
- * Unfortunately, keeping the necessary state for evaluating exactly the
- * above symmetry conditions would be quite complex and time-consuming.
- * Therefore this function evaluates, instead, the following stronger
- * sub-conditions, for which it is much easier to maintain the needed
- * state:
+ * Unfortunately, keeping the necessary state for evaluating exactly
+ * the last two symmetry sub-conditions above would be quite complex
+ * and time consuming. Therefore this function evaluates, instead,
+ * only the following stronger three sub-conditions, for which it is
+ * much easier to maintain the needed state:
  * 1) all active queues have the same weight,
- * 2) all active groups have the same weight,
- * 3) all active groups have at most one active child each.
- * In particular, the last two conditions are always true if hierarchical
- * support and the cgroups interface are not enabled, thus no state needs
- * to be maintained in this case.
+ * 2) all active queues belong to the same I/O-priority class,
+ * 3) there are no active groups.
+ * In particular, the last condition is always true if hierarchical
+ * support or the cgroups interface are not enabled, thus no state
+ * needs to be maintained in this case.
  */
-static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
+static bool bfq_asymmetric_scenario(struct bfq_data *bfqd,
+				   struct bfq_queue *bfqq)
 {
-	return !bfq_differentiated_weights(bfqd);
+	bool smallest_weight = bfqq &&
+		bfqq->weight_counter &&
+		bfqq->weight_counter ==
+		container_of(
+			rb_first_cached(&bfqd->queue_weights_tree),
+			struct bfq_weight_counter,
+			weights_node);
+
+	/*
+	 * For queue weights to differ, queue_weights_tree must contain
+	 * at least two nodes.
+	 */
+	bool varied_queue_weights = !smallest_weight &&
+		!RB_EMPTY_ROOT(&bfqd->queue_weights_tree.rb_root) &&
+		(bfqd->queue_weights_tree.rb_root.rb_node->rb_left ||
+		 bfqd->queue_weights_tree.rb_root.rb_node->rb_right);
+
+	bool multiple_classes_busy =
+		(bfqd->busy_queues[0] && bfqd->busy_queues[1]) ||
+		(bfqd->busy_queues[0] && bfqd->busy_queues[2]) ||
+		(bfqd->busy_queues[1] && bfqd->busy_queues[2]);
+
+	return varied_queue_weights || multiple_classes_busy
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	       || bfqd->num_groups_with_pending_reqs > 0
+#endif
+		;
 }
 
 /*
  * If the weight-counter tree passed as input contains no counter for
- * the weight of the input entity, then add that counter; otherwise just
+ * the weight of the input queue, then add that counter; otherwise just
  * increment the existing counter.
  *
  * Note that weight-counter trees contain few nodes in mostly symmetric
@@ -687,25 +723,26 @@
  * In most scenarios, the rate at which nodes are created/destroyed
  * should be low too.
  */
-void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
-			  struct rb_root *root)
+void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			  struct rb_root_cached *root)
 {
-	struct rb_node **new = &(root->rb_node), *parent = NULL;
+	struct bfq_entity *entity = &bfqq->entity;
+	struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL;
+	bool leftmost = true;
 
 	/*
-	 * Do not insert if the entity is already associated with a
+	 * Do not insert if the queue is already associated with a
 	 * counter, which happens if:
-	 *   1) the entity is associated with a queue,
-	 *   2) a request arrival has caused the queue to become both
+	 *   1) a request arrival has caused the queue to become both
 	 *      non-weight-raised, and hence change its weight, and
 	 *      backlogged; in this respect, each of the two events
 	 *      causes an invocation of this function,
-	 *   3) this is the invocation of this function caused by the
+	 *   2) this is the invocation of this function caused by the
 	 *      second event. This second invocation is actually useless,
 	 *      and we handle this fact by exiting immediately. More
 	 *      efficient or clearer solutions might possibly be adopted.
 	 */
-	if (entity->weight_counter)
+	if (bfqq->weight_counter)
 		return;
 
 	while (*new) {
@@ -715,77 +752,79 @@
 		parent = *new;
 
 		if (entity->weight == __counter->weight) {
-			entity->weight_counter = __counter;
+			bfqq->weight_counter = __counter;
 			goto inc_counter;
 		}
 		if (entity->weight < __counter->weight)
 			new = &((*new)->rb_left);
-		else
+		else {
 			new = &((*new)->rb_right);
+			leftmost = false;
+		}
 	}
 
-	entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
-					 GFP_ATOMIC);
+	bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
+				       GFP_ATOMIC);
 
 	/*
 	 * In the unlucky event of an allocation failure, we just
-	 * exit. This will cause the weight of entity to not be
-	 * considered in bfq_differentiated_weights, which, in its
-	 * turn, causes the scenario to be deemed wrongly symmetric in
-	 * case entity's weight would have been the only weight making
-	 * the scenario asymmetric. On the bright side, no unbalance
-	 * will however occur when entity becomes inactive again (the
+	 * exit. This will cause the weight of queue to not be
+	 * considered in bfq_asymmetric_scenario, which, in its turn,
+	 * causes the scenario to be deemed wrongly symmetric in case
+	 * bfqq's weight would have been the only weight making the
+	 * scenario asymmetric.  On the bright side, no unbalance will
+	 * however occur when bfqq becomes inactive again (the
 	 * invocation of this function is triggered by an activation
-	 * of entity). In fact, bfq_weights_tree_remove does nothing
-	 * if !entity->weight_counter.
+	 * of queue).  In fact, bfq_weights_tree_remove does nothing
+	 * if !bfqq->weight_counter.
 	 */
-	if (unlikely(!entity->weight_counter))
+	if (unlikely(!bfqq->weight_counter))
 		return;
 
-	entity->weight_counter->weight = entity->weight;
-	rb_link_node(&entity->weight_counter->weights_node, parent, new);
-	rb_insert_color(&entity->weight_counter->weights_node, root);
+	bfqq->weight_counter->weight = entity->weight;
+	rb_link_node(&bfqq->weight_counter->weights_node, parent, new);
+	rb_insert_color_cached(&bfqq->weight_counter->weights_node, root,
+				leftmost);
 
 inc_counter:
-	entity->weight_counter->num_active++;
+	bfqq->weight_counter->num_active++;
+	bfqq->ref++;
 }
 
 /*
- * Decrement the weight counter associated with the entity, and, if the
+ * Decrement the weight counter associated with the queue, and, if the
  * counter reaches 0, remove the counter from the tree.
  * See the comments to the function bfq_weights_tree_add() for considerations
  * about overhead.
  */
 void __bfq_weights_tree_remove(struct bfq_data *bfqd,
-			       struct bfq_entity *entity,
-			       struct rb_root *root)
+			       struct bfq_queue *bfqq,
+			       struct rb_root_cached *root)
 {
-	if (!entity->weight_counter)
+	if (!bfqq->weight_counter)
 		return;
 
-	entity->weight_counter->num_active--;
-	if (entity->weight_counter->num_active > 0)
+	bfqq->weight_counter->num_active--;
+	if (bfqq->weight_counter->num_active > 0)
 		goto reset_entity_pointer;
 
-	rb_erase(&entity->weight_counter->weights_node, root);
-	kfree(entity->weight_counter);
+	rb_erase_cached(&bfqq->weight_counter->weights_node, root);
+	kfree(bfqq->weight_counter);
 
 reset_entity_pointer:
-	entity->weight_counter = NULL;
+	bfqq->weight_counter = NULL;
+	bfq_put_queue(bfqq);
 }
 
 /*
- * Invoke __bfq_weights_tree_remove on bfqq and all its inactive
- * parent entities.
+ * Invoke __bfq_weights_tree_remove on bfqq and decrement the number
+ * of active groups for each queue's inactive parent entity.
  */
 void bfq_weights_tree_remove(struct bfq_data *bfqd,
 			     struct bfq_queue *bfqq)
 {
 	struct bfq_entity *entity = bfqq->entity.parent;
 
-	__bfq_weights_tree_remove(bfqd, &bfqq->entity,
-				  &bfqd->queue_weights_tree);
-
 	for_each_entity(entity) {
 		struct bfq_sched_data *sd = entity->my_sched_data;
 
@@ -797,18 +836,37 @@
 			 * next_in_service for details on why
 			 * in_service_entity must be checked too).
 			 *
-			 * As a consequence, the weight of entity is
-			 * not to be removed. In addition, if entity
-			 * is active, then its parent entities are
-			 * active as well, and thus their weights are
-			 * not to be removed either. In the end, this
-			 * loop must stop here.
+			 * As a consequence, its parent entities are
+			 * active as well, and thus this loop must
+			 * stop here.
 			 */
 			break;
 		}
-		__bfq_weights_tree_remove(bfqd, entity,
-					  &bfqd->group_weights_tree);
+
+		/*
+		 * The decrement of num_groups_with_pending_reqs is
+		 * not performed immediately upon the deactivation of
+		 * entity, but it is delayed to when it also happens
+		 * that the first leaf descendant bfqq of entity gets
+		 * all its pending requests completed. The following
+		 * instructions perform this delayed decrement, if
+		 * needed. See the comments on
+		 * num_groups_with_pending_reqs for details.
+		 */
+		if (entity->in_groups_with_pending_reqs) {
+			entity->in_groups_with_pending_reqs = false;
+			bfqd->num_groups_with_pending_reqs--;
+		}
 	}
+
+	/*
+	 * Next function is invoked last, because it causes bfqq to be
+	 * freed if the following holds: bfqq is not in service and
+	 * has no dispatched request. DO NOT use bfqq after the next
+	 * function invocation.
+	 */
+	__bfq_weights_tree_remove(bfqd, bfqq,
+				  &bfqd->queue_weights_tree);
 }
 
 /*
@@ -864,7 +922,8 @@
 static unsigned long bfq_serv_to_charge(struct request *rq,
 					struct bfq_queue *bfqq)
 {
-	if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
+	if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 ||
+	    bfq_asymmetric_scenario(bfqq->bfqd, bfqq))
 		return blk_rq_sectors(rq);
 
 	return blk_rq_sectors(rq) * bfq_async_charge_factor;
@@ -898,8 +957,10 @@
 		 */
 		return;
 
-	new_budget = max_t(unsigned long, bfqq->max_budget,
-			   bfq_serv_to_charge(next_rq, bfqq));
+	new_budget = max_t(unsigned long,
+			   max_t(unsigned long, bfqq->max_budget,
+				 bfq_serv_to_charge(next_rq, bfqq)),
+			   entity->service);
 	if (entity->budget != new_budget) {
 		entity->budget = new_budget;
 		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
@@ -928,7 +989,7 @@
 	 *   of several files
 	 * mplayer took 23 seconds to start, if constantly weight-raised.
 	 *
-	 * As for higher values than that accomodating the above bad
+	 * As for higher values than that accommodating the above bad
 	 * scenario, tests show that higher values would often yield
 	 * the opposite of the desired result, i.e., would worsen
 	 * responsiveness by allowing non-interactive applications to
@@ -967,6 +1028,7 @@
 	else
 		bfq_clear_bfqq_IO_bound(bfqq);
 
+	bfqq->entity.new_weight = bic->saved_weight;
 	bfqq->ttime = bic->saved_ttime;
 	bfqq->wr_coeff = bic->saved_wr_coeff;
 	bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
@@ -1002,7 +1064,8 @@
 
 static int bfqq_process_refs(struct bfq_queue *bfqq)
 {
-	return bfqq->ref - bfqq->allocated - bfqq->entity.on_st;
+	return bfqq->ref - bfqq->allocated - bfqq->entity.on_st -
+		(bfqq->weight_counter != NULL);
 }
 
 /* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
@@ -1013,8 +1076,18 @@
 
 	hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
 		hlist_del_init(&item->burst_list_node);
-	hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
-	bfqd->burst_size = 1;
+
+	/*
+	 * Start the creation of a new burst list only if there is no
+	 * active queue. See comments on the conditional invocation of
+	 * bfq_handle_burst().
+	 */
+	if (bfq_tot_busy_queues(bfqd) == 0) {
+		hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+		bfqd->burst_size = 1;
+	} else
+		bfqd->burst_size = 0;
+
 	bfqd->burst_parent_entity = bfqq->entity.parent;
 }
 
@@ -1070,7 +1143,8 @@
  * many parallel threads/processes. Examples are systemd during boot,
  * or git grep. To help these processes get their job done as soon as
  * possible, it is usually better to not grant either weight-raising
- * or device idling to their queues.
+ * or device idling to their queues, unless these queues must be
+ * protected from the I/O flowing through other active queues.
  *
  * In this comment we describe, firstly, the reasons why this fact
  * holds, and, secondly, the next function, which implements the main
@@ -1082,7 +1156,10 @@
  * cumulatively served, the sooner the target job of these queues gets
  * completed. As a consequence, weight-raising any of these queues,
  * which also implies idling the device for it, is almost always
- * counterproductive. In most cases it just lowers throughput.
+ * counterproductive, unless there are other active queues to isolate
+ * these new queues from. If there no other active queues, then
+ * weight-raising these new queues just lowers throughput in most
+ * cases.
  *
  * On the other hand, a burst of queue creations may be caused also by
  * the start of an application that does not consist of a lot of
@@ -1116,14 +1193,16 @@
  * are very rare. They typically occur if some service happens to
  * start doing I/O exactly when the interactive task starts.
  *
- * Turning back to the next function, it implements all the steps
- * needed to detect the occurrence of a large burst and to properly
- * mark all the queues belonging to it (so that they can then be
- * treated in a different way). This goal is achieved by maintaining a
- * "burst list" that holds, temporarily, the queues that belong to the
- * burst in progress. The list is then used to mark these queues as
- * belonging to a large burst if the burst does become large. The main
- * steps are the following.
+ * Turning back to the next function, it is invoked only if there are
+ * no active queues (apart from active queues that would belong to the
+ * same, possible burst bfqq would belong to), and it implements all
+ * the steps needed to detect the occurrence of a large burst and to
+ * properly mark all the queues belonging to it (so that they can then
+ * be treated in a different way). This goal is achieved by
+ * maintaining a "burst list" that holds, temporarily, the queues that
+ * belong to the burst in progress. The list is then used to mark
+ * these queues as belonging to a large burst if the burst does become
+ * large. The main steps are the following.
  *
  * . when the very first queue is created, the queue is inserted into the
  *   list (as it could be the first queue in a possible burst)
@@ -1371,7 +1450,15 @@
 {
 	struct bfq_entity *entity = &bfqq->entity;
 
-	if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
+	/*
+	 * In the next compound condition, we check also whether there
+	 * is some budget left, because otherwise there is no point in
+	 * trying to go on serving bfqq with this same budget: bfqq
+	 * would be expired immediately after being selected for
+	 * service. This would only cause useless overhead.
+	 */
+	if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time &&
+	    bfq_bfqq_budget_left(bfqq) > 0) {
 		/*
 		 * We do not clear the flag non_blocking_wait_rq here, as
 		 * the latter is used in bfq_activate_bfqq to signal
@@ -1560,6 +1647,7 @@
 	 */
 	in_burst = bfq_bfqq_in_large_burst(bfqq);
 	soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
+		!BFQQ_TOTALLY_SEEKY(bfqq) &&
 		!in_burst &&
 		time_is_before_jiffies(bfqq->soft_rt_next_start) &&
 		bfqq->dispatched == 0;
@@ -1656,6 +1744,72 @@
 				false, BFQQE_PREEMPTED);
 }
 
+static void bfq_reset_inject_limit(struct bfq_data *bfqd,
+				   struct bfq_queue *bfqq)
+{
+	/* invalidate baseline total service time */
+	bfqq->last_serv_time_ns = 0;
+
+	/*
+	 * Reset pointer in case we are waiting for
+	 * some request completion.
+	 */
+	bfqd->waited_rq = NULL;
+
+	/*
+	 * If bfqq has a short think time, then start by setting the
+	 * inject limit to 0 prudentially, because the service time of
+	 * an injected I/O request may be higher than the think time
+	 * of bfqq, and therefore, if one request was injected when
+	 * bfqq remains empty, this injected request might delay the
+	 * service of the next I/O request for bfqq significantly. In
+	 * case bfqq can actually tolerate some injection, then the
+	 * adaptive update will however raise the limit soon. This
+	 * lucky circumstance holds exactly because bfqq has a short
+	 * think time, and thus, after remaining empty, is likely to
+	 * get new I/O enqueued---and then completed---before being
+	 * expired. This is the very pattern that gives the
+	 * limit-update algorithm the chance to measure the effect of
+	 * injection on request service times, and then to update the
+	 * limit accordingly.
+	 *
+	 * However, in the following special case, the inject limit is
+	 * left to 1 even if the think time is short: bfqq's I/O is
+	 * synchronized with that of some other queue, i.e., bfqq may
+	 * receive new I/O only after the I/O of the other queue is
+	 * completed. Keeping the inject limit to 1 allows the
+	 * blocking I/O to be served while bfqq is in service. And
+	 * this is very convenient both for bfqq and for overall
+	 * throughput, as explained in detail in the comments in
+	 * bfq_update_has_short_ttime().
+	 *
+	 * On the opposite end, if bfqq has a long think time, then
+	 * start directly by 1, because:
+	 * a) on the bright side, keeping at most one request in
+	 * service in the drive is unlikely to cause any harm to the
+	 * latency of bfqq's requests, as the service time of a single
+	 * request is likely to be lower than the think time of bfqq;
+	 * b) on the downside, after becoming empty, bfqq is likely to
+	 * expire before getting its next request. With this request
+	 * arrival pattern, it is very hard to sample total service
+	 * times and update the inject limit accordingly (see comments
+	 * on bfq_update_inject_limit()). So the limit is likely to be
+	 * never, or at least seldom, updated.  As a consequence, by
+	 * setting the limit to 1, we avoid that no injection ever
+	 * occurs with bfqq. On the downside, this proactive step
+	 * further reduces chances to actually compute the baseline
+	 * total service time. Thus it reduces chances to execute the
+	 * limit-update algorithm and possibly raise the limit to more
+	 * than 1.
+	 */
+	if (bfq_bfqq_has_short_ttime(bfqq))
+		bfqq->inject_limit = 0;
+	else
+		bfqq->inject_limit = 1;
+
+	bfqq->decrease_time_jif = jiffies;
+}
+
 static void bfq_add_request(struct request *rq)
 {
 	struct bfq_queue *bfqq = RQ_BFQQ(rq);
@@ -1668,6 +1822,60 @@
 	bfqq->queued[rq_is_sync(rq)]++;
 	bfqd->queued++;
 
+	if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) {
+		/*
+		 * Periodically reset inject limit, to make sure that
+		 * the latter eventually drops in case workload
+		 * changes, see step (3) in the comments on
+		 * bfq_update_inject_limit().
+		 */
+		if (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
+					     msecs_to_jiffies(1000)))
+			bfq_reset_inject_limit(bfqd, bfqq);
+
+		/*
+		 * The following conditions must hold to setup a new
+		 * sampling of total service time, and then a new
+		 * update of the inject limit:
+		 * - bfqq is in service, because the total service
+		 *   time is evaluated only for the I/O requests of
+		 *   the queues in service;
+		 * - this is the right occasion to compute or to
+		 *   lower the baseline total service time, because
+		 *   there are actually no requests in the drive,
+		 *   or
+		 *   the baseline total service time is available, and
+		 *   this is the right occasion to compute the other
+		 *   quantity needed to update the inject limit, i.e.,
+		 *   the total service time caused by the amount of
+		 *   injection allowed by the current value of the
+		 *   limit. It is the right occasion because injection
+		 *   has actually been performed during the service
+		 *   hole, and there are still in-flight requests,
+		 *   which are very likely to be exactly the injected
+		 *   requests, or part of them;
+		 * - the minimum interval for sampling the total
+		 *   service time and updating the inject limit has
+		 *   elapsed.
+		 */
+		if (bfqq == bfqd->in_service_queue &&
+		    (bfqd->rq_in_driver == 0 ||
+		     (bfqq->last_serv_time_ns > 0 &&
+		      bfqd->rqs_injected && bfqd->rq_in_driver > 0)) &&
+		    time_is_before_eq_jiffies(bfqq->decrease_time_jif +
+					      msecs_to_jiffies(100))) {
+			bfqd->last_empty_occupied_ns = ktime_get_ns();
+			/*
+			 * Start the state machine for measuring the
+			 * total service time of rq: setting
+			 * wait_dispatch will cause bfqd->waited_rq to
+			 * be set when rq will be dispatched.
+			 */
+			bfqd->wait_dispatch = true;
+			bfqd->rqs_injected = false;
+		}
+	}
+
 	elv_rb_add(&bfqq->sort_list, rq);
 
 	/*
@@ -1679,8 +1887,9 @@
 
 	/*
 	 * Adjust priority tree position, if next_rq changes.
+	 * See comments on bfq_pos_tree_add_move() for the unlikely().
 	 */
-	if (prev != bfqq->next_rq)
+	if (unlikely(!bfqd->nonrot_with_queueing && prev != bfqq->next_rq))
 		bfq_pos_tree_add_move(bfqd, bfqq);
 
 	if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
@@ -1820,7 +2029,9 @@
 			bfqq->pos_root = NULL;
 		}
 	} else {
-		bfq_pos_tree_add_move(bfqd, bfqq);
+		/* see comments on bfq_pos_tree_add_move() for the unlikely() */
+		if (unlikely(!bfqd->nonrot_with_queueing))
+			bfq_pos_tree_add_move(bfqd, bfqq);
 	}
 
 	if (rq->cmd_flags & REQ_META)
@@ -1910,7 +2121,12 @@
 		 */
 		if (prev != bfqq->next_rq) {
 			bfq_updated_next_req(bfqd, bfqq);
-			bfq_pos_tree_add_move(bfqd, bfqq);
+			/*
+			 * See comments on bfq_pos_tree_add_move() for
+			 * the unlikely().
+			 */
+			if (unlikely(!bfqd->nonrot_with_queueing))
+				bfq_pos_tree_add_move(bfqd, bfqq);
 		}
 	}
 }
@@ -2196,6 +2412,46 @@
 	struct bfq_queue *in_service_bfqq, *new_bfqq;
 
 	/*
+	 * Do not perform queue merging if the device is non
+	 * rotational and performs internal queueing. In fact, such a
+	 * device reaches a high speed through internal parallelism
+	 * and pipelining. This means that, to reach a high
+	 * throughput, it must have many requests enqueued at the same
+	 * time. But, in this configuration, the internal scheduling
+	 * algorithm of the device does exactly the job of queue
+	 * merging: it reorders requests so as to obtain as much as
+	 * possible a sequential I/O pattern. As a consequence, with
+	 * the workload generated by processes doing interleaved I/O,
+	 * the throughput reached by the device is likely to be the
+	 * same, with and without queue merging.
+	 *
+	 * Disabling merging also provides a remarkable benefit in
+	 * terms of throughput. Merging tends to make many workloads
+	 * artificially more uneven, because of shared queues
+	 * remaining non empty for incomparably more time than
+	 * non-merged queues. This may accentuate workload
+	 * asymmetries. For example, if one of the queues in a set of
+	 * merged queues has a higher weight than a normal queue, then
+	 * the shared queue may inherit such a high weight and, by
+	 * staying almost always active, may force BFQ to perform I/O
+	 * plugging most of the time. This evidently makes it harder
+	 * for BFQ to let the device reach a high throughput.
+	 *
+	 * Finally, the likely() macro below is not used because one
+	 * of the two branches is more likely than the other, but to
+	 * have the code path after the following if() executed as
+	 * fast as possible for the case of a non rotational device
+	 * with queueing. We want it because this is the fastest kind
+	 * of device. On the opposite end, the likely() may lengthen
+	 * the execution time of BFQ for the case of slower devices
+	 * (rotational or at least without queueing). But in this case
+	 * the execution time of BFQ matters very little, if not at
+	 * all.
+	 */
+	if (likely(bfqd->nonrot_with_queueing))
+		return NULL;
+
+	/*
 	 * Prevent bfqq from being merged if it has been created too
 	 * long ago. The idea is that true cooperating processes, and
 	 * thus their associated bfq_queues, are supposed to be
@@ -2216,7 +2472,7 @@
 		return NULL;
 
 	/* If there is only one backlogged queue, don't search. */
-	if (bfqd->busy_queues == 1)
+	if (bfq_tot_busy_queues(bfqd) == 1)
 		return NULL;
 
 	in_service_bfqq = bfqd->in_service_queue;
@@ -2258,6 +2514,7 @@
 	if (!bic)
 		return;
 
+	bic->saved_weight = bfqq->entity.orig_weight;
 	bic->saved_ttime = bfqq->ttime;
 	bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq);
 	bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
@@ -2276,6 +2533,7 @@
 		 * to enjoy weight raising if split soon.
 		 */
 		bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff;
+		bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now();
 		bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd);
 		bic->saved_last_wr_start_finish = jiffies;
 	} else {
@@ -2346,6 +2604,16 @@
 	 *   assignment causes no harm).
 	 */
 	new_bfqq->bic = NULL;
+	/*
+	 * If the queue is shared, the pid is the pid of one of the associated
+	 * processes. Which pid depends on the exact sequence of merge events
+	 * the queue underwent. So printing such a pid is useless and confusing
+	 * because it reports a random pid between those of the associated
+	 * processes.
+	 * We mark such a queue with a pid -1, and then print SHARED instead of
+	 * a pid in logging messages.
+	 */
+	new_bfqq->pid = -1;
 	bfqq->bic = NULL;
 	/* release process reference to bfqq */
 	bfq_put_queue(bfqq);
@@ -2380,8 +2648,8 @@
 		/*
 		 * bic still points to bfqq, then it has not yet been
 		 * redirected to some other bfq_queue, and a queue
-		 * merge beween bfqq and new_bfqq can be safely
-		 * fulfillled, i.e., bic can be redirected to new_bfqq
+		 * merge between bfqq and new_bfqq can be safely
+		 * fulfilled, i.e., bic can be redirected to new_bfqq
 		 * and bfqq can be put.
 		 */
 		bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq,
@@ -2515,12 +2783,14 @@
 	 * queue).
 	 */
 	if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
-	    bfq_symmetric_scenario(bfqd))
+	    !bfq_asymmetric_scenario(bfqd, bfqq))
 		sl = min_t(u64, sl, BFQ_MIN_TT);
 	else if (bfqq->wr_coeff > 1)
 		sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC);
 
 	bfqd->last_idling_start = ktime_get();
+	bfqd->last_idling_start_jiffies = jiffies;
+
 	hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
 		      HRTIMER_MODE_REL);
 	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
@@ -2744,7 +3014,7 @@
 
 	if ((bfqd->rq_in_driver > 0 ||
 		now_ns - bfqd->last_completion < BFQ_MIN_TT)
-	     && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
+	    && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq))
 		bfqd->sequential_samples++;
 
 	bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
@@ -2796,7 +3066,7 @@
 	bfq_remove_request(q, rq);
 }
 
-static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 {
 	/*
 	 * If this bfqq is shared between multiple processes, check
@@ -2822,16 +3092,20 @@
 		bfq_requeue_bfqq(bfqd, bfqq, true);
 		/*
 		 * Resort priority tree of potential close cooperators.
+		 * See comments on bfq_pos_tree_add_move() for the unlikely().
 		 */
-		bfq_pos_tree_add_move(bfqd, bfqq);
+		if (unlikely(!bfqd->nonrot_with_queueing))
+			bfq_pos_tree_add_move(bfqd, bfqq);
 	}
 
 	/*
 	 * All in-service entities must have been properly deactivated
 	 * or requeued before executing the next function, which
-	 * resets all in-service entites as no more in service.
+	 * resets all in-service entities as no more in service. This
+	 * may cause bfqq to be freed. If this happens, the next
+	 * function returns true.
 	 */
-	__bfq_bfqd_reset_in_service(bfqd);
+	return __bfq_bfqd_reset_in_service(bfqd);
 }
 
 /**
@@ -3195,13 +3469,6 @@
 		    jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
 }
 
-static bool bfq_bfqq_injectable(struct bfq_queue *bfqq)
-{
-	return BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
-		blk_queue_nonrot(bfqq->bfqd->queue) &&
-		bfqq->bfqd->hw_tag;
-}
-
 /**
  * bfq_bfqq_expire - expire a queue.
  * @bfqd: device owning the queue.
@@ -3236,7 +3503,6 @@
 	bool slow;
 	unsigned long delta = 0;
 	struct bfq_entity *entity = &bfqq->entity;
-	int ref;
 
 	/*
 	 * Check whether the process is slow (see bfq_bfqq_is_slow).
@@ -3278,16 +3544,32 @@
 		 * requests, then the request pattern is isochronous
 		 * (see the comments on the function
 		 * bfq_bfqq_softrt_next_start()). Thus we can compute
-		 * soft_rt_next_start. If, instead, the queue still
-		 * has outstanding requests, then we have to wait for
-		 * the completion of all the outstanding requests to
-		 * discover whether the request pattern is actually
-		 * isochronous.
+		 * soft_rt_next_start. And we do it, unless bfqq is in
+		 * interactive weight raising. We do not do it in the
+		 * latter subcase, for the following reason. bfqq may
+		 * be conveying the I/O needed to load a soft
+		 * real-time application. Such an application will
+		 * actually exhibit a soft real-time I/O pattern after
+		 * it finally starts doing its job. But, if
+		 * soft_rt_next_start is computed here for an
+		 * interactive bfqq, and bfqq had received a lot of
+		 * service before remaining with no outstanding
+		 * request (likely to happen on a fast device), then
+		 * soft_rt_next_start would be assigned such a high
+		 * value that, for a very long time, bfqq would be
+		 * prevented from being possibly considered as soft
+		 * real time.
+		 *
+		 * If, instead, the queue still has outstanding
+		 * requests, then we have to wait for the completion
+		 * of all the outstanding requests to discover whether
+		 * the request pattern is actually isochronous.
 		 */
-		if (bfqq->dispatched == 0)
+		if (bfqq->dispatched == 0 &&
+		    bfqq->wr_coeff != bfqd->bfq_wr_coeff)
 			bfqq->soft_rt_next_start =
 				bfq_bfqq_softrt_next_start(bfqd, bfqq);
-		else {
+		else if (bfqq->dispatched > 0) {
 			/*
 			 * Schedule an update of soft_rt_next_start to when
 			 * the task may be discovered to be isochronous.
@@ -3301,18 +3583,22 @@
 		slow, bfqq->dispatched, bfq_bfqq_has_short_ttime(bfqq));
 
 	/*
+	 * bfqq expired, so no total service time needs to be computed
+	 * any longer: reset state machine for measuring total service
+	 * times.
+	 */
+	bfqd->rqs_injected = bfqd->wait_dispatch = false;
+	bfqd->waited_rq = NULL;
+
+	/*
 	 * Increase, decrease or leave budget unchanged according to
 	 * reason.
 	 */
 	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
-	ref = bfqq->ref;
-	__bfq_bfqq_expire(bfqd, bfqq);
-
-	if (ref == 1) /* bfqq is gone, no more actions on it */
+	if (__bfq_bfqq_expire(bfqd, bfqq))
+		/* bfqq is gone, no more actions on it */
 		return;
 
-	bfqq->injected_service = 0;
-
 	/* mark bfqq as waiting a request only if a bic still points to it */
 	if (!bfq_bfqq_busy(bfqq) &&
 	    reason != BFQQE_BUDGET_TIMEOUT &&
@@ -3380,53 +3666,13 @@
 		bfq_bfqq_budget_timeout(bfqq);
 }
 
-/*
- * For a queue that becomes empty, device idling is allowed only if
- * this function returns true for the queue. As a consequence, since
- * device idling plays a critical role in both throughput boosting and
- * service guarantees, the return value of this function plays a
- * critical role in both these aspects as well.
- *
- * In a nutshell, this function returns true only if idling is
- * beneficial for throughput or, even if detrimental for throughput,
- * idling is however necessary to preserve service guarantees (low
- * latency, desired throughput distribution, ...). In particular, on
- * NCQ-capable devices, this function tries to return false, so as to
- * help keep the drives' internal queues full, whenever this helps the
- * device boost the throughput without causing any service-guarantee
- * issue.
- *
- * In more detail, the return value of this function is obtained by,
- * first, computing a number of boolean variables that take into
- * account throughput and service-guarantee issues, and, then,
- * combining these variables in a logical expression. Most of the
- * issues taken into account are not trivial. We discuss these issues
- * individually while introducing the variables.
- */
-static bool bfq_better_to_idle(struct bfq_queue *bfqq)
+static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
+					     struct bfq_queue *bfqq)
 {
-	struct bfq_data *bfqd = bfqq->bfqd;
 	bool rot_without_queueing =
 		!blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag,
 		bfqq_sequential_and_IO_bound,
-		idling_boosts_thr, idling_boosts_thr_without_issues,
-		idling_needed_for_service_guarantees,
-		asymmetric_scenario;
-
-	if (bfqd->strict_guarantees)
-		return true;
-
-	/*
-	 * Idling is performed only if slice_idle > 0. In addition, we
-	 * do not idle if
-	 * (a) bfqq is async
-	 * (b) bfqq is in the idle io prio class: in this case we do
-	 * not idle because we want to minimize the bandwidth that
-	 * queues in this class can steal to higher-priority queues
-	 */
-	if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
-	    bfq_class_idle(bfqq))
-		return false;
+		idling_boosts_thr;
 
 	bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) &&
 		bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq);
@@ -3458,8 +3704,7 @@
 		 bfqq_sequential_and_IO_bound);
 
 	/*
-	 * The value of the next variable,
-	 * idling_boosts_thr_without_issues, is equal to that of
+	 * The return value of this function is equal to that of
 	 * idling_boosts_thr, unless a special case holds. In this
 	 * special case, described below, idling may cause problems to
 	 * weight-raised queues.
@@ -3476,169 +3721,259 @@
 	 * which enqueue several requests in advance, and further
 	 * reorder internally-queued requests.
 	 *
-	 * For this reason, we force to false the value of
-	 * idling_boosts_thr_without_issues if there are weight-raised
-	 * busy queues. In this case, and if bfqq is not weight-raised,
-	 * this guarantees that the device is not idled for bfqq (if,
-	 * instead, bfqq is weight-raised, then idling will be
-	 * guaranteed by another variable, see below). Combined with
-	 * the timestamping rules of BFQ (see [1] for details), this
-	 * behavior causes bfqq, and hence any sync non-weight-raised
-	 * queue, to get a lower number of requests served, and thus
-	 * to ask for a lower number of requests from the request
-	 * pool, before the busy weight-raised queues get served
-	 * again. This often mitigates starvation problems in the
-	 * presence of heavy write workloads and NCQ, thereby
-	 * guaranteeing a higher application and system responsiveness
-	 * in these hostile scenarios.
+	 * For this reason, we force to false the return value if
+	 * there are weight-raised busy queues. In this case, and if
+	 * bfqq is not weight-raised, this guarantees that the device
+	 * is not idled for bfqq (if, instead, bfqq is weight-raised,
+	 * then idling will be guaranteed by another variable, see
+	 * below). Combined with the timestamping rules of BFQ (see
+	 * [1] for details), this behavior causes bfqq, and hence any
+	 * sync non-weight-raised queue, to get a lower number of
+	 * requests served, and thus to ask for a lower number of
+	 * requests from the request pool, before the busy
+	 * weight-raised queues get served again. This often mitigates
+	 * starvation problems in the presence of heavy write
+	 * workloads and NCQ, thereby guaranteeing a higher
+	 * application and system responsiveness in these hostile
+	 * scenarios.
 	 */
-	idling_boosts_thr_without_issues = idling_boosts_thr &&
+	return idling_boosts_thr &&
 		bfqd->wr_busy_queues == 0;
+}
+
+/*
+ * There is a case where idling does not have to be performed for
+ * throughput concerns, but to preserve the throughput share of
+ * the process associated with bfqq.
+ *
+ * To introduce this case, we can note that allowing the drive
+ * to enqueue more than one request at a time, and hence
+ * delegating de facto final scheduling decisions to the
+ * drive's internal scheduler, entails loss of control on the
+ * actual request service order. In particular, the critical
+ * situation is when requests from different processes happen
+ * to be present, at the same time, in the internal queue(s)
+ * of the drive. In such a situation, the drive, by deciding
+ * the service order of the internally-queued requests, does
+ * determine also the actual throughput distribution among
+ * these processes. But the drive typically has no notion or
+ * concern about per-process throughput distribution, and
+ * makes its decisions only on a per-request basis. Therefore,
+ * the service distribution enforced by the drive's internal
+ * scheduler is likely to coincide with the desired throughput
+ * distribution only in a completely symmetric, or favorably
+ * skewed scenario where:
+ * (i-a) each of these processes must get the same throughput as
+ *	 the others,
+ * (i-b) in case (i-a) does not hold, it holds that the process
+ *       associated with bfqq must receive a lower or equal
+ *	 throughput than any of the other processes;
+ * (ii)  the I/O of each process has the same properties, in
+ *       terms of locality (sequential or random), direction
+ *       (reads or writes), request sizes, greediness
+ *       (from I/O-bound to sporadic), and so on;
+
+ * In fact, in such a scenario, the drive tends to treat the requests
+ * of each process in about the same way as the requests of the
+ * others, and thus to provide each of these processes with about the
+ * same throughput.  This is exactly the desired throughput
+ * distribution if (i-a) holds, or, if (i-b) holds instead, this is an
+ * even more convenient distribution for (the process associated with)
+ * bfqq.
+ *
+ * In contrast, in any asymmetric or unfavorable scenario, device
+ * idling (I/O-dispatch plugging) is certainly needed to guarantee
+ * that bfqq receives its assigned fraction of the device throughput
+ * (see [1] for details).
+ *
+ * The problem is that idling may significantly reduce throughput with
+ * certain combinations of types of I/O and devices. An important
+ * example is sync random I/O on flash storage with command
+ * queueing. So, unless bfqq falls in cases where idling also boosts
+ * throughput, it is important to check conditions (i-a), i(-b) and
+ * (ii) accurately, so as to avoid idling when not strictly needed for
+ * service guarantees.
+ *
+ * Unfortunately, it is extremely difficult to thoroughly check
+ * condition (ii). And, in case there are active groups, it becomes
+ * very difficult to check conditions (i-a) and (i-b) too.  In fact,
+ * if there are active groups, then, for conditions (i-a) or (i-b) to
+ * become false 'indirectly', it is enough that an active group
+ * contains more active processes or sub-groups than some other active
+ * group. More precisely, for conditions (i-a) or (i-b) to become
+ * false because of such a group, it is not even necessary that the
+ * group is (still) active: it is sufficient that, even if the group
+ * has become inactive, some of its descendant processes still have
+ * some request already dispatched but still waiting for
+ * completion. In fact, requests have still to be guaranteed their
+ * share of the throughput even after being dispatched. In this
+ * respect, it is easy to show that, if a group frequently becomes
+ * inactive while still having in-flight requests, and if, when this
+ * happens, the group is not considered in the calculation of whether
+ * the scenario is asymmetric, then the group may fail to be
+ * guaranteed its fair share of the throughput (basically because
+ * idling may not be performed for the descendant processes of the
+ * group, but it had to be).  We address this issue with the following
+ * bi-modal behavior, implemented in the function
+ * bfq_asymmetric_scenario().
+ *
+ * If there are groups with requests waiting for completion
+ * (as commented above, some of these groups may even be
+ * already inactive), then the scenario is tagged as
+ * asymmetric, conservatively, without checking any of the
+ * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq.
+ * This behavior matches also the fact that groups are created
+ * exactly if controlling I/O is a primary concern (to
+ * preserve bandwidth and latency guarantees).
+ *
+ * On the opposite end, if there are no groups with requests waiting
+ * for completion, then only conditions (i-a) and (i-b) are actually
+ * controlled, i.e., provided that conditions (i-a) or (i-b) holds,
+ * idling is not performed, regardless of whether condition (ii)
+ * holds.  In other words, only if conditions (i-a) and (i-b) do not
+ * hold, then idling is allowed, and the device tends to be prevented
+ * from queueing many requests, possibly of several processes. Since
+ * there are no groups with requests waiting for completion, then, to
+ * control conditions (i-a) and (i-b) it is enough to check just
+ * whether all the queues with requests waiting for completion also
+ * have the same weight.
+ *
+ * Not checking condition (ii) evidently exposes bfqq to the
+ * risk of getting less throughput than its fair share.
+ * However, for queues with the same weight, a further
+ * mechanism, preemption, mitigates or even eliminates this
+ * problem. And it does so without consequences on overall
+ * throughput. This mechanism and its benefits are explained
+ * in the next three paragraphs.
+ *
+ * Even if a queue, say Q, is expired when it remains idle, Q
+ * can still preempt the new in-service queue if the next
+ * request of Q arrives soon (see the comments on
+ * bfq_bfqq_update_budg_for_activation). If all queues and
+ * groups have the same weight, this form of preemption,
+ * combined with the hole-recovery heuristic described in the
+ * comments on function bfq_bfqq_update_budg_for_activation,
+ * are enough to preserve a correct bandwidth distribution in
+ * the mid term, even without idling. In fact, even if not
+ * idling allows the internal queues of the device to contain
+ * many requests, and thus to reorder requests, we can rather
+ * safely assume that the internal scheduler still preserves a
+ * minimum of mid-term fairness.
+ *
+ * More precisely, this preemption-based, idleless approach
+ * provides fairness in terms of IOPS, and not sectors per
+ * second. This can be seen with a simple example. Suppose
+ * that there are two queues with the same weight, but that
+ * the first queue receives requests of 8 sectors, while the
+ * second queue receives requests of 1024 sectors. In
+ * addition, suppose that each of the two queues contains at
+ * most one request at a time, which implies that each queue
+ * always remains idle after it is served. Finally, after
+ * remaining idle, each queue receives very quickly a new
+ * request. It follows that the two queues are served
+ * alternatively, preempting each other if needed. This
+ * implies that, although both queues have the same weight,
+ * the queue with large requests receives a service that is
+ * 1024/8 times as high as the service received by the other
+ * queue.
+ *
+ * The motivation for using preemption instead of idling (for
+ * queues with the same weight) is that, by not idling,
+ * service guarantees are preserved (completely or at least in
+ * part) without minimally sacrificing throughput. And, if
+ * there is no active group, then the primary expectation for
+ * this device is probably a high throughput.
+ *
+ * We are now left only with explaining the additional
+ * compound condition that is checked below for deciding
+ * whether the scenario is asymmetric. To explain this
+ * compound condition, we need to add that the function
+ * bfq_asymmetric_scenario checks the weights of only
+ * non-weight-raised queues, for efficiency reasons (see
+ * comments on bfq_weights_tree_add()). Then the fact that
+ * bfqq is weight-raised is checked explicitly here. More
+ * precisely, the compound condition below takes into account
+ * also the fact that, even if bfqq is being weight-raised,
+ * the scenario is still symmetric if all queues with requests
+ * waiting for completion happen to be
+ * weight-raised. Actually, we should be even more precise
+ * here, and differentiate between interactive weight raising
+ * and soft real-time weight raising.
+ *
+ * As a side note, it is worth considering that the above
+ * device-idling countermeasures may however fail in the
+ * following unlucky scenario: if idling is (correctly)
+ * disabled in a time period during which all symmetry
+ * sub-conditions hold, and hence the device is allowed to
+ * enqueue many requests, but at some later point in time some
+ * sub-condition stops to hold, then it may become impossible
+ * to let requests be served in the desired order until all
+ * the requests already queued in the device have been served.
+ */
+static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
+						 struct bfq_queue *bfqq)
+{
+	return (bfqq->wr_coeff > 1 &&
+		bfqd->wr_busy_queues <
+		bfq_tot_busy_queues(bfqd)) ||
+		bfq_asymmetric_scenario(bfqd, bfqq);
+}
+
+/*
+ * For a queue that becomes empty, device idling is allowed only if
+ * this function returns true for that queue. As a consequence, since
+ * device idling plays a critical role for both throughput boosting
+ * and service guarantees, the return value of this function plays a
+ * critical role as well.
+ *
+ * In a nutshell, this function returns true only if idling is
+ * beneficial for throughput or, even if detrimental for throughput,
+ * idling is however necessary to preserve service guarantees (low
+ * latency, desired throughput distribution, ...). In particular, on
+ * NCQ-capable devices, this function tries to return false, so as to
+ * help keep the drives' internal queues full, whenever this helps the
+ * device boost the throughput without causing any service-guarantee
+ * issue.
+ *
+ * Most of the issues taken into account to get the return value of
+ * this function are not trivial. We discuss these issues in the two
+ * functions providing the main pieces of information needed by this
+ * function.
+ */
+static bool bfq_better_to_idle(struct bfq_queue *bfqq)
+{
+	struct bfq_data *bfqd = bfqq->bfqd;
+	bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar;
+
+	if (unlikely(bfqd->strict_guarantees))
+		return true;
 
 	/*
-	 * There is then a case where idling must be performed not
-	 * for throughput concerns, but to preserve service
-	 * guarantees.
-	 *
-	 * To introduce this case, we can note that allowing the drive
-	 * to enqueue more than one request at a time, and hence
-	 * delegating de facto final scheduling decisions to the
-	 * drive's internal scheduler, entails loss of control on the
-	 * actual request service order. In particular, the critical
-	 * situation is when requests from different processes happen
-	 * to be present, at the same time, in the internal queue(s)
-	 * of the drive. In such a situation, the drive, by deciding
-	 * the service order of the internally-queued requests, does
-	 * determine also the actual throughput distribution among
-	 * these processes. But the drive typically has no notion or
-	 * concern about per-process throughput distribution, and
-	 * makes its decisions only on a per-request basis. Therefore,
-	 * the service distribution enforced by the drive's internal
-	 * scheduler is likely to coincide with the desired
-	 * device-throughput distribution only in a completely
-	 * symmetric scenario where:
-	 * (i)  each of these processes must get the same throughput as
-	 *      the others;
-	 * (ii) all these processes have the same I/O pattern
-		(either sequential or random).
-	 * In fact, in such a scenario, the drive will tend to treat
-	 * the requests of each of these processes in about the same
-	 * way as the requests of the others, and thus to provide
-	 * each of these processes with about the same throughput
-	 * (which is exactly the desired throughput distribution). In
-	 * contrast, in any asymmetric scenario, device idling is
-	 * certainly needed to guarantee that bfqq receives its
-	 * assigned fraction of the device throughput (see [1] for
-	 * details).
-	 *
-	 * We address this issue by controlling, actually, only the
-	 * symmetry sub-condition (i), i.e., provided that
-	 * sub-condition (i) holds, idling is not performed,
-	 * regardless of whether sub-condition (ii) holds. In other
-	 * words, only if sub-condition (i) holds, then idling is
-	 * allowed, and the device tends to be prevented from queueing
-	 * many requests, possibly of several processes. The reason
-	 * for not controlling also sub-condition (ii) is that we
-	 * exploit preemption to preserve guarantees in case of
-	 * symmetric scenarios, even if (ii) does not hold, as
-	 * explained in the next two paragraphs.
-	 *
-	 * Even if a queue, say Q, is expired when it remains idle, Q
-	 * can still preempt the new in-service queue if the next
-	 * request of Q arrives soon (see the comments on
-	 * bfq_bfqq_update_budg_for_activation). If all queues and
-	 * groups have the same weight, this form of preemption,
-	 * combined with the hole-recovery heuristic described in the
-	 * comments on function bfq_bfqq_update_budg_for_activation,
-	 * are enough to preserve a correct bandwidth distribution in
-	 * the mid term, even without idling. In fact, even if not
-	 * idling allows the internal queues of the device to contain
-	 * many requests, and thus to reorder requests, we can rather
-	 * safely assume that the internal scheduler still preserves a
-	 * minimum of mid-term fairness. The motivation for using
-	 * preemption instead of idling is that, by not idling,
-	 * service guarantees are preserved without minimally
-	 * sacrificing throughput. In other words, both a high
-	 * throughput and its desired distribution are obtained.
-	 *
-	 * More precisely, this preemption-based, idleless approach
-	 * provides fairness in terms of IOPS, and not sectors per
-	 * second. This can be seen with a simple example. Suppose
-	 * that there are two queues with the same weight, but that
-	 * the first queue receives requests of 8 sectors, while the
-	 * second queue receives requests of 1024 sectors. In
-	 * addition, suppose that each of the two queues contains at
-	 * most one request at a time, which implies that each queue
-	 * always remains idle after it is served. Finally, after
-	 * remaining idle, each queue receives very quickly a new
-	 * request. It follows that the two queues are served
-	 * alternatively, preempting each other if needed. This
-	 * implies that, although both queues have the same weight,
-	 * the queue with large requests receives a service that is
-	 * 1024/8 times as high as the service received by the other
-	 * queue.
-	 *
-	 * On the other hand, device idling is performed, and thus
-	 * pure sector-domain guarantees are provided, for the
-	 * following queues, which are likely to need stronger
-	 * throughput guarantees: weight-raised queues, and queues
-	 * with a higher weight than other queues. When such queues
-	 * are active, sub-condition (i) is false, which triggers
-	 * device idling.
-	 *
-	 * According to the above considerations, the next variable is
-	 * true (only) if sub-condition (i) holds. To compute the
-	 * value of this variable, we not only use the return value of
-	 * the function bfq_symmetric_scenario(), but also check
-	 * whether bfqq is being weight-raised, because
-	 * bfq_symmetric_scenario() does not take into account also
-	 * weight-raised queues (see comments on
-	 * bfq_weights_tree_add()). In particular, if bfqq is being
-	 * weight-raised, it is important to idle only if there are
-	 * other, non-weight-raised queues that may steal throughput
-	 * to bfqq. Actually, we should be even more precise, and
-	 * differentiate between interactive weight raising and
-	 * soft real-time weight raising.
-	 *
-	 * As a side note, it is worth considering that the above
-	 * device-idling countermeasures may however fail in the
-	 * following unlucky scenario: if idling is (correctly)
-	 * disabled in a time period during which all symmetry
-	 * sub-conditions hold, and hence the device is allowed to
-	 * enqueue many requests, but at some later point in time some
-	 * sub-condition stops to hold, then it may become impossible
-	 * to let requests be served in the desired order until all
-	 * the requests already queued in the device have been served.
+	 * Idling is performed only if slice_idle > 0. In addition, we
+	 * do not idle if
+	 * (a) bfqq is async
+	 * (b) bfqq is in the idle io prio class: in this case we do
+	 * not idle because we want to minimize the bandwidth that
+	 * queues in this class can steal to higher-priority queues
 	 */
-	asymmetric_scenario = (bfqq->wr_coeff > 1 &&
-			       bfqd->wr_busy_queues < bfqd->busy_queues) ||
-		!bfq_symmetric_scenario(bfqd);
+	if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) ||
+	   bfq_class_idle(bfqq))
+		return false;
+
+	idling_boosts_thr_with_no_issue =
+		idling_boosts_thr_without_issues(bfqd, bfqq);
+
+	idling_needed_for_service_guar =
+		idling_needed_for_service_guarantees(bfqd, bfqq);
 
 	/*
-	 * Finally, there is a case where maximizing throughput is the
-	 * best choice even if it may cause unfairness toward
-	 * bfqq. Such a case is when bfqq became active in a burst of
-	 * queue activations. Queues that became active during a large
-	 * burst benefit only from throughput, as discussed in the
-	 * comments on bfq_handle_burst. Thus, if bfqq became active
-	 * in a burst and not idling the device maximizes throughput,
-	 * then the device must no be idled, because not idling the
-	 * device provides bfqq and all other queues in the burst with
-	 * maximum benefit. Combining this and the above case, we can
-	 * now establish when idling is actually needed to preserve
-	 * service guarantees.
-	 */
-	idling_needed_for_service_guarantees =
-		asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
-
-	/*
-	 * We have now all the components we need to compute the
+	 * We have now the two components we need to compute the
 	 * return value of the function, which is true only if idling
 	 * either boosts the throughput (without issues), or is
 	 * necessary to preserve service guarantees.
 	 */
-	return idling_boosts_thr_without_issues ||
-		idling_needed_for_service_guarantees;
+	return idling_boosts_thr_with_no_issue ||
+		idling_needed_for_service_guar;
 }
 
 /*
@@ -3657,26 +3992,98 @@
 	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq);
 }
 
-static struct bfq_queue *bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
+/*
+ * This function chooses the queue from which to pick the next extra
+ * I/O request to inject, if it finds a compatible queue. See the
+ * comments on bfq_update_inject_limit() for details on the injection
+ * mechanism, and for the definitions of the quantities mentioned
+ * below.
+ */
+static struct bfq_queue *
+bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
 {
-	struct bfq_queue *bfqq;
+	struct bfq_queue *bfqq, *in_serv_bfqq = bfqd->in_service_queue;
+	unsigned int limit = in_serv_bfqq->inject_limit;
+	/*
+	 * If
+	 * - bfqq is not weight-raised and therefore does not carry
+	 *   time-critical I/O,
+	 * or
+	 * - regardless of whether bfqq is weight-raised, bfqq has
+	 *   however a long think time, during which it can absorb the
+	 *   effect of an appropriate number of extra I/O requests
+	 *   from other queues (see bfq_update_inject_limit for
+	 *   details on the computation of this number);
+	 * then injection can be performed without restrictions.
+	 */
+	bool in_serv_always_inject = in_serv_bfqq->wr_coeff == 1 ||
+		!bfq_bfqq_has_short_ttime(in_serv_bfqq);
 
 	/*
-	 * A linear search; but, with a high probability, very few
-	 * steps are needed to find a candidate queue, i.e., a queue
-	 * with enough budget left for its next request. In fact:
+	 * If
+	 * - the baseline total service time could not be sampled yet,
+	 *   so the inject limit happens to be still 0, and
+	 * - a lot of time has elapsed since the plugging of I/O
+	 *   dispatching started, so drive speed is being wasted
+	 *   significantly;
+	 * then temporarily raise inject limit to one request.
+	 */
+	if (limit == 0 && in_serv_bfqq->last_serv_time_ns == 0 &&
+	    bfq_bfqq_wait_request(in_serv_bfqq) &&
+	    time_is_before_eq_jiffies(bfqd->last_idling_start_jiffies +
+				      bfqd->bfq_slice_idle)
+		)
+		limit = 1;
+
+	if (bfqd->rq_in_driver >= limit)
+		return NULL;
+
+	/*
+	 * Linear search of the source queue for injection; but, with
+	 * a high probability, very few steps are needed to find a
+	 * candidate queue, i.e., a queue with enough budget left for
+	 * its next request. In fact:
 	 * - BFQ dynamically updates the budget of every queue so as
 	 *   to accommodate the expected backlog of the queue;
 	 * - if a queue gets all its requests dispatched as injected
 	 *   service, then the queue is removed from the active list
-	 *   (and re-added only if it gets new requests, but with
-	 *   enough budget for its new backlog).
+	 *   (and re-added only if it gets new requests, but then it
+	 *   is assigned again enough budget for its new backlog).
 	 */
 	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
 		if (!RB_EMPTY_ROOT(&bfqq->sort_list) &&
+		    (in_serv_always_inject || bfqq->wr_coeff > 1) &&
 		    bfq_serv_to_charge(bfqq->next_rq, bfqq) <=
-		    bfq_bfqq_budget_left(bfqq))
-			return bfqq;
+		    bfq_bfqq_budget_left(bfqq)) {
+			/*
+			 * Allow for only one large in-flight request
+			 * on non-rotational devices, for the
+			 * following reason. On non-rotationl drives,
+			 * large requests take much longer than
+			 * smaller requests to be served. In addition,
+			 * the drive prefers to serve large requests
+			 * w.r.t. to small ones, if it can choose. So,
+			 * having more than one large requests queued
+			 * in the drive may easily make the next first
+			 * request of the in-service queue wait for so
+			 * long to break bfqq's service guarantees. On
+			 * the bright side, large requests let the
+			 * drive reach a very high throughput, even if
+			 * there is only one in-flight large request
+			 * at a time.
+			 */
+			if (blk_queue_nonrot(bfqd->queue) &&
+			    blk_rq_sectors(bfqq->next_rq) >=
+			    BFQQ_SECT_THR_NONROT)
+				limit = min_t(unsigned int, 1, limit);
+			else
+				limit = in_serv_bfqq->inject_limit;
+
+			if (bfqd->rq_in_driver < limit) {
+				bfqd->rqs_injected = true;
+				return bfqq;
+			}
+		}
 
 	return NULL;
 }
@@ -3763,14 +4170,32 @@
 	 * for a new request, or has requests waiting for a completion and
 	 * may idle after their completion, then keep it anyway.
 	 *
-	 * Yet, to boost throughput, inject service from other queues if
-	 * possible.
+	 * Yet, inject service from other queues if it boosts
+	 * throughput and is possible.
 	 */
 	if (bfq_bfqq_wait_request(bfqq) ||
 	    (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
-		if (bfq_bfqq_injectable(bfqq) &&
-		    bfqq->injected_service * bfqq->inject_coeff <
-		    bfqq->entity.service * 10)
+		struct bfq_queue *async_bfqq =
+			bfqq->bic && bfqq->bic->bfqq[0] &&
+			bfq_bfqq_busy(bfqq->bic->bfqq[0]) ?
+			bfqq->bic->bfqq[0] : NULL;
+
+		/*
+		 * If the process associated with bfqq has also async
+		 * I/O pending, then inject it
+		 * unconditionally. Injecting I/O from the same
+		 * process can cause no harm to the process. On the
+		 * contrary, it can only increase bandwidth and reduce
+		 * latency for the process.
+		 */
+		if (async_bfqq &&
+		    icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic &&
+		    bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <=
+		    bfq_bfqq_budget_left(async_bfqq))
+			bfqq = bfqq->bic->bfqq[0];
+		else if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
+			 (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 ||
+			  !bfq_bfqq_has_short_ttime(bfqq)))
 			bfqq = bfq_choose_bfqq_for_injection(bfqd);
 		else
 			bfqq = NULL;
@@ -3862,15 +4287,15 @@
 
 	bfq_bfqq_served(bfqq, service_to_charge);
 
+	if (bfqq == bfqd->in_service_queue && bfqd->wait_dispatch) {
+		bfqd->wait_dispatch = false;
+		bfqd->waited_rq = rq;
+	}
+
 	bfq_dispatch_remove(bfqd->queue, rq);
 
-	if (bfqq != bfqd->in_service_queue) {
-		if (likely(bfqd->in_service_queue))
-			bfqd->in_service_queue->injected_service +=
-				bfq_serv_to_charge(rq, bfqq);
-
+	if (bfqq != bfqd->in_service_queue)
 		goto return_rq;
-	}
 
 	/*
 	 * If weight raising has to terminate for bfqq, then next
@@ -3890,7 +4315,7 @@
 	 * belongs to CLASS_IDLE and other queues are waiting for
 	 * service.
 	 */
-	if (!(bfqd->busy_queues > 1 && bfq_class_idle(bfqq)))
+	if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)))
 		goto return_rq;
 
 	bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
@@ -3908,7 +4333,7 @@
 	 * most a call to dispatch for nothing
 	 */
 	return !list_empty_careful(&bfqd->dispatch) ||
-		bfqd->busy_queues > 0;
+		bfq_tot_busy_queues(bfqd) > 0;
 }
 
 static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
@@ -3962,9 +4387,10 @@
 		goto start_rq;
 	}
 
-	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
+	bfq_log(bfqd, "dispatch requests: %d busy queues",
+		bfq_tot_busy_queues(bfqd));
 
-	if (bfqd->busy_queues == 0)
+	if (bfq_tot_busy_queues(bfqd) == 0)
 		goto exit;
 
 	/*
@@ -4301,13 +4727,6 @@
 			bfq_mark_bfqq_has_short_ttime(bfqq);
 		bfq_mark_bfqq_sync(bfqq);
 		bfq_mark_bfqq_just_created(bfqq);
-		/*
-		 * Aggressively inject a lot of service: up to 90%.
-		 * This coefficient remains constant during bfqq life,
-		 * but this behavior might be changed, after enough
-		 * testing and tuning.
-		 */
-		bfqq->inject_coeff = 1;
 	} else
 		bfq_clear_bfqq_sync(bfqq);
 
@@ -4445,17 +4864,19 @@
 		       struct request *rq)
 {
 	bfqq->seek_history <<= 1;
-	bfqq->seek_history |=
-		get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
-		(!blk_queue_nonrot(bfqd->queue) ||
-		 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
+	bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq);
+
+	if (bfqq->wr_coeff > 1 &&
+	    bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
+	    BFQQ_TOTALLY_SEEKY(bfqq))
+		bfq_bfqq_end_wr(bfqq);
 }
 
 static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
 				       struct bfq_queue *bfqq,
 				       struct bfq_io_cq *bic)
 {
-	bool has_short_ttime = true;
+	bool has_short_ttime = true, state_changed;
 
 	/*
 	 * No need to update has_short_ttime if bfqq is async or in
@@ -4480,13 +4901,93 @@
 	     bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle))
 		has_short_ttime = false;
 
-	bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d",
-		     has_short_ttime);
+	state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq);
 
 	if (has_short_ttime)
 		bfq_mark_bfqq_has_short_ttime(bfqq);
 	else
 		bfq_clear_bfqq_has_short_ttime(bfqq);
+
+	/*
+	 * Until the base value for the total service time gets
+	 * finally computed for bfqq, the inject limit does depend on
+	 * the think-time state (short|long). In particular, the limit
+	 * is 0 or 1 if the think time is deemed, respectively, as
+	 * short or long (details in the comments in
+	 * bfq_update_inject_limit()). Accordingly, the next
+	 * instructions reset the inject limit if the think-time state
+	 * has changed and the above base value is still to be
+	 * computed.
+	 *
+	 * However, the reset is performed only if more than 100 ms
+	 * have elapsed since the last update of the inject limit, or
+	 * (inclusive) if the change is from short to long think
+	 * time. The reason for this waiting is as follows.
+	 *
+	 * bfqq may have a long think time because of a
+	 * synchronization with some other queue, i.e., because the
+	 * I/O of some other queue may need to be completed for bfqq
+	 * to receive new I/O. This happens, e.g., if bfqq is
+	 * associated with a process that does some sync. A sync
+	 * generates extra blocking I/O, which must be completed
+	 * before the process associated with bfqq can go on with its
+	 * I/O.
+	 *
+	 * If such a synchronization is actually in place, then,
+	 * without injection on bfqq, the blocking I/O cannot happen
+	 * to served while bfqq is in service. As a consequence, if
+	 * bfqq is granted I/O-dispatch-plugging, then bfqq remains
+	 * empty, and no I/O is dispatched, until the idle timeout
+	 * fires. This is likely to result in lower bandwidth and
+	 * higher latencies for bfqq, and in a severe loss of total
+	 * throughput.
+	 *
+	 * On the opposite end, a non-zero inject limit may allow the
+	 * I/O that blocks bfqq to be executed soon, and therefore
+	 * bfqq to receive new I/O soon. But, if this actually
+	 * happens, then the next think-time sample for bfqq may be
+	 * very low. This in turn may cause bfqq's think time to be
+	 * deemed short. Without the 100 ms barrier, this new state
+	 * change would cause the body of the next if to be executed
+	 * immediately. But this would set to 0 the inject
+	 * limit. Without injection, the blocking I/O would cause the
+	 * think time of bfqq to become long again, and therefore the
+	 * inject limit to be raised again, and so on. The only effect
+	 * of such a steady oscillation between the two think-time
+	 * states would be to prevent effective injection on bfqq.
+	 *
+	 * In contrast, if the inject limit is not reset during such a
+	 * long time interval as 100 ms, then the number of short
+	 * think time samples can grow significantly before the reset
+	 * is allowed. As a consequence, the think time state can
+	 * become stable before the reset. There will be no state
+	 * change when the 100 ms elapse, and therefore no reset of
+	 * the inject limit. The inject limit remains steadily equal
+	 * to 1 both during and after the 100 ms. So injection can be
+	 * performed at all times, and throughput gets boosted.
+	 *
+	 * An inject limit equal to 1 is however in conflict, in
+	 * general, with the fact that the think time of bfqq is
+	 * short, because injection may be likely to delay bfqq's I/O
+	 * (as explained in the comments in
+	 * bfq_update_inject_limit()). But this does not happen in
+	 * this special case, because bfqq's low think time is due to
+	 * an effective handling of a synchronization, through
+	 * injection. In this special case, bfqq's I/O does not get
+	 * delayed by injection; on the contrary, bfqq's I/O is
+	 * brought forward, because it is not blocked for
+	 * milliseconds.
+	 *
+	 * In addition, during the 100 ms, the base value for the
+	 * total service time is likely to get finally computed,
+	 * freeing the inject limit from its relation with the think
+	 * time.
+	 */
+	if (state_changed && bfqq->last_serv_time_ns == 0 &&
+	    (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
+				      msecs_to_jiffies(100)) ||
+	     !has_short_ttime))
+		bfq_reset_inject_limit(bfqd, bfqq);
 }
 
 /*
@@ -4496,19 +4997,9 @@
 static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 			    struct request *rq)
 {
-	struct bfq_io_cq *bic = RQ_BIC(rq);
-
 	if (rq->cmd_flags & REQ_META)
 		bfqq->meta_pending++;
 
-	bfq_update_io_thinktime(bfqd, bfqq);
-	bfq_update_has_short_ttime(bfqd, bfqq, bic);
-	bfq_update_io_seektime(bfqd, bfqq, rq);
-
-	bfq_log_bfqq(bfqd, bfqq,
-		     "rq_enqueued: has_short_ttime=%d (seeky %d)",
-		     bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq));
-
 	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
 
 	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
@@ -4517,28 +5008,31 @@
 		bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
 
 		/*
-		 * There is just this request queued: if the request
-		 * is small and the queue is not to be expired, then
-		 * just exit.
+		 * There is just this request queued: if
+		 * - the request is small, and
+		 * - we are idling to boost throughput, and
+		 * - the queue is not to be expired,
+		 * then just exit.
 		 *
 		 * In this way, if the device is being idled to wait
 		 * for a new request from the in-service queue, we
 		 * avoid unplugging the device and committing the
-		 * device to serve just a small request. On the
-		 * contrary, we wait for the block layer to decide
-		 * when to unplug the device: hopefully, new requests
-		 * will be merged to this one quickly, then the device
-		 * will be unplugged and larger requests will be
-		 * dispatched.
+		 * device to serve just a small request. In contrast
+		 * we wait for the block layer to decide when to
+		 * unplug the device: hopefully, new requests will be
+		 * merged to this one quickly, then the device will be
+		 * unplugged and larger requests will be dispatched.
 		 */
-		if (small_req && !budget_timeout)
+		if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) &&
+		    !budget_timeout)
 			return;
 
 		/*
-		 * A large enough request arrived, or the queue is to
-		 * be expired: in both cases disk idling is to be
-		 * stopped, so clear wait_request flag and reset
-		 * timer.
+		 * A large enough request arrived, or idling is being
+		 * performed to preserve service guarantees, or
+		 * finally the queue is to be expired: in all these
+		 * cases disk idling is to be stopped, so clear
+		 * wait_request flag and reset timer.
 		 */
 		bfq_clear_bfqq_wait_request(bfqq);
 		hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
@@ -4564,8 +5058,6 @@
 	bool waiting, idle_timer_disabled = false;
 
 	if (new_bfqq) {
-		if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
-			new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
 		/*
 		 * Release the request's reference to the old bfqq
 		 * and make sure one is taken to the shared queue.
@@ -4595,6 +5087,10 @@
 		bfqq = new_bfqq;
 	}
 
+	bfq_update_io_thinktime(bfqd, bfqq);
+	bfq_update_has_short_ttime(bfqd, bfqq, RQ_BIC(rq));
+	bfq_update_io_seektime(bfqd, bfqq, rq);
+
 	waiting = bfqq && bfq_bfqq_wait_request(bfqq);
 	bfq_add_request(rq);
 	idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq);
@@ -4708,6 +5204,8 @@
 
 static void bfq_update_hw_tag(struct bfq_data *bfqd)
 {
+	struct bfq_queue *bfqq = bfqd->in_service_queue;
+
 	bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
 				       bfqd->rq_in_driver);
 
@@ -4720,7 +5218,18 @@
 	 * sum is not exact, as it's not taking into account deactivated
 	 * requests.
 	 */
-	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
+	if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD)
+		return;
+
+	/*
+	 * If active queue hasn't enough requests and can idle, bfq might not
+	 * dispatch sufficient requests to hardware. Don't zero hw_tag in this
+	 * case
+	 */
+	if (bfqq && bfq_bfqq_has_short_ttime(bfqq) &&
+	    bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] <
+	    BFQ_HW_QUEUE_THRESHOLD &&
+	    bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD)
 		return;
 
 	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
@@ -4729,6 +5238,9 @@
 	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
 	bfqd->max_rq_in_driver = 0;
 	bfqd->hw_tag_samples = 0;
+
+	bfqd->nonrot_with_queueing =
+		blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag;
 }
 
 static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
@@ -4791,11 +5303,14 @@
 	 * isochronous, and both requisites for this condition to hold
 	 * are now satisfied, then compute soft_rt_next_start (see the
 	 * comments on the function bfq_bfqq_softrt_next_start()). We
-	 * schedule this delayed check when bfqq expires, if it still
-	 * has in-flight requests.
+	 * do not compute soft_rt_next_start if bfqq is in interactive
+	 * weight raising (see the comments in bfq_bfqq_expire() for
+	 * an explanation). We schedule this delayed update when bfqq
+	 * expires, if it still has in-flight requests.
 	 */
 	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
-	    RB_EMPTY_ROOT(&bfqq->sort_list))
+	    RB_EMPTY_ROOT(&bfqq->sort_list) &&
+	    bfqq->wr_coeff != bfqd->bfq_wr_coeff)
 		bfqq->soft_rt_next_start =
 			bfq_bfqq_softrt_next_start(bfqd, bfqq);
 
@@ -4853,6 +5368,164 @@
 }
 
 /*
+ * The processes associated with bfqq may happen to generate their
+ * cumulative I/O at a lower rate than the rate at which the device
+ * could serve the same I/O. This is rather probable, e.g., if only
+ * one process is associated with bfqq and the device is an SSD. It
+ * results in bfqq becoming often empty while in service. In this
+ * respect, if BFQ is allowed to switch to another queue when bfqq
+ * remains empty, then the device goes on being fed with I/O requests,
+ * and the throughput is not affected. In contrast, if BFQ is not
+ * allowed to switch to another queue---because bfqq is sync and
+ * I/O-dispatch needs to be plugged while bfqq is temporarily
+ * empty---then, during the service of bfqq, there will be frequent
+ * "service holes", i.e., time intervals during which bfqq gets empty
+ * and the device can only consume the I/O already queued in its
+ * hardware queues. During service holes, the device may even get to
+ * remaining idle. In the end, during the service of bfqq, the device
+ * is driven at a lower speed than the one it can reach with the kind
+ * of I/O flowing through bfqq.
+ *
+ * To counter this loss of throughput, BFQ implements a "request
+ * injection mechanism", which tries to fill the above service holes
+ * with I/O requests taken from other queues. The hard part in this
+ * mechanism is finding the right amount of I/O to inject, so as to
+ * both boost throughput and not break bfqq's bandwidth and latency
+ * guarantees. In this respect, the mechanism maintains a per-queue
+ * inject limit, computed as below. While bfqq is empty, the injection
+ * mechanism dispatches extra I/O requests only until the total number
+ * of I/O requests in flight---i.e., already dispatched but not yet
+ * completed---remains lower than this limit.
+ *
+ * A first definition comes in handy to introduce the algorithm by
+ * which the inject limit is computed.  We define as first request for
+ * bfqq, an I/O request for bfqq that arrives while bfqq is in
+ * service, and causes bfqq to switch from empty to non-empty. The
+ * algorithm updates the limit as a function of the effect of
+ * injection on the service times of only the first requests of
+ * bfqq. The reason for this restriction is that these are the
+ * requests whose service time is affected most, because they are the
+ * first to arrive after injection possibly occurred.
+ *
+ * To evaluate the effect of injection, the algorithm measures the
+ * "total service time" of first requests. We define as total service
+ * time of an I/O request, the time that elapses since when the
+ * request is enqueued into bfqq, to when it is completed. This
+ * quantity allows the whole effect of injection to be measured. It is
+ * easy to see why. Suppose that some requests of other queues are
+ * actually injected while bfqq is empty, and that a new request R
+ * then arrives for bfqq. If the device does start to serve all or
+ * part of the injected requests during the service hole, then,
+ * because of this extra service, it may delay the next invocation of
+ * the dispatch hook of BFQ. Then, even after R gets eventually
+ * dispatched, the device may delay the actual service of R if it is
+ * still busy serving the extra requests, or if it decides to serve,
+ * before R, some extra request still present in its queues. As a
+ * conclusion, the cumulative extra delay caused by injection can be
+ * easily evaluated by just comparing the total service time of first
+ * requests with and without injection.
+ *
+ * The limit-update algorithm works as follows. On the arrival of a
+ * first request of bfqq, the algorithm measures the total time of the
+ * request only if one of the three cases below holds, and, for each
+ * case, it updates the limit as described below:
+ *
+ * (1) If there is no in-flight request. This gives a baseline for the
+ *     total service time of the requests of bfqq. If the baseline has
+ *     not been computed yet, then, after computing it, the limit is
+ *     set to 1, to start boosting throughput, and to prepare the
+ *     ground for the next case. If the baseline has already been
+ *     computed, then it is updated, in case it results to be lower
+ *     than the previous value.
+ *
+ * (2) If the limit is higher than 0 and there are in-flight
+ *     requests. By comparing the total service time in this case with
+ *     the above baseline, it is possible to know at which extent the
+ *     current value of the limit is inflating the total service
+ *     time. If the inflation is below a certain threshold, then bfqq
+ *     is assumed to be suffering from no perceivable loss of its
+ *     service guarantees, and the limit is even tentatively
+ *     increased. If the inflation is above the threshold, then the
+ *     limit is decreased. Due to the lack of any hysteresis, this
+ *     logic makes the limit oscillate even in steady workload
+ *     conditions. Yet we opted for it, because it is fast in reaching
+ *     the best value for the limit, as a function of the current I/O
+ *     workload. To reduce oscillations, this step is disabled for a
+ *     short time interval after the limit happens to be decreased.
+ *
+ * (3) Periodically, after resetting the limit, to make sure that the
+ *     limit eventually drops in case the workload changes. This is
+ *     needed because, after the limit has gone safely up for a
+ *     certain workload, it is impossible to guess whether the
+ *     baseline total service time may have changed, without measuring
+ *     it again without injection. A more effective version of this
+ *     step might be to just sample the baseline, by interrupting
+ *     injection only once, and then to reset/lower the limit only if
+ *     the total service time with the current limit does happen to be
+ *     too large.
+ *
+ * More details on each step are provided in the comments on the
+ * pieces of code that implement these steps: the branch handling the
+ * transition from empty to non empty in bfq_add_request(), the branch
+ * handling injection in bfq_select_queue(), and the function
+ * bfq_choose_bfqq_for_injection(). These comments also explain some
+ * exceptions, made by the injection mechanism in some special cases.
+ */
+static void bfq_update_inject_limit(struct bfq_data *bfqd,
+				    struct bfq_queue *bfqq)
+{
+	u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns;
+	unsigned int old_limit = bfqq->inject_limit;
+
+	if (bfqq->last_serv_time_ns > 0) {
+		u64 threshold = (bfqq->last_serv_time_ns * 3)>>1;
+
+		if (tot_time_ns >= threshold && old_limit > 0) {
+			bfqq->inject_limit--;
+			bfqq->decrease_time_jif = jiffies;
+		} else if (tot_time_ns < threshold &&
+			   old_limit < bfqd->max_rq_in_driver<<1)
+			bfqq->inject_limit++;
+	}
+
+	/*
+	 * Either we still have to compute the base value for the
+	 * total service time, and there seem to be the right
+	 * conditions to do it, or we can lower the last base value
+	 * computed.
+	 *
+	 * NOTE: (bfqd->rq_in_driver == 1) means that there is no I/O
+	 * request in flight, because this function is in the code
+	 * path that handles the completion of a request of bfqq, and,
+	 * in particular, this function is executed before
+	 * bfqd->rq_in_driver is decremented in such a code path.
+	 */
+	if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) ||
+	    tot_time_ns < bfqq->last_serv_time_ns) {
+		bfqq->last_serv_time_ns = tot_time_ns;
+		/*
+		 * Now we certainly have a base value: make sure we
+		 * start trying injection.
+		 */
+		bfqq->inject_limit = max_t(unsigned int, 1, old_limit);
+	} else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1)
+		/*
+		 * No I/O injected and no request still in service in
+		 * the drive: these are the exact conditions for
+		 * computing the base value of the total service time
+		 * for bfqq. So let's update this value, because it is
+		 * rather variable. For example, it varies if the size
+		 * or the spatial locality of the I/O requests in bfqq
+		 * change.
+		 */
+		bfqq->last_serv_time_ns = tot_time_ns;
+
+
+	/* update complete, not waiting for any request completion any longer */
+	bfqd->waited_rq = NULL;
+}
+
+/*
  * Handle either a requeue or a finish for rq. The things to do are
  * the same in both cases: all references to rq are to be dropped. In
  * particular, rq is considered completed from the point of view of
@@ -4896,6 +5569,9 @@
 
 		spin_lock_irqsave(&bfqd->lock, flags);
 
+		if (rq == bfqd->waited_rq)
+			bfq_update_inject_limit(bfqd, bfqq);
+
 		bfq_completed_request(bfqq, bfqd);
 		bfq_finish_requeue_request_body(bfqq);
 
@@ -5059,7 +5735,7 @@
  * preparation is that, after the prepare_request hook is invoked for
  * rq, rq may still be transformed into a request with no icq, i.e., a
  * request not associated with any queue. No bfq hook is invoked to
- * signal this tranformation. As a consequence, should these
+ * signal this transformation. As a consequence, should these
  * preparation operations be performed when the prepare_request hook
  * is invoked, and should rq be transformed one moment later, bfq
  * would end up in an inconsistent state, because it would have
@@ -5150,7 +5826,29 @@
 		}
 	}
 
-	if (unlikely(bfq_bfqq_just_created(bfqq)))
+	/*
+	 * Consider bfqq as possibly belonging to a burst of newly
+	 * created queues only if:
+	 * 1) A burst is actually happening (bfqd->burst_size > 0)
+	 * or
+	 * 2) There is no other active queue. In fact, if, in
+	 *    contrast, there are active queues not belonging to the
+	 *    possible burst bfqq may belong to, then there is no gain
+	 *    in considering bfqq as belonging to a burst, and
+	 *    therefore in not weight-raising bfqq. See comments on
+	 *    bfq_handle_burst().
+	 *
+	 * This filtering also helps eliminating false positives,
+	 * occurring when bfqq does not belong to an actual large
+	 * burst, but some background task (e.g., a service) happens
+	 * to trigger the creation of new queues very close to when
+	 * bfqq and its possible companion queues are created. See
+	 * comments on bfq_handle_burst() for further details also on
+	 * this issue.
+	 */
+	if (unlikely(bfq_bfqq_just_created(bfqq) &&
+		     (bfqd->burst_size > 0 ||
+		      bfq_tot_busy_queues(bfqd) == 0)))
 		bfq_handle_burst(bfqd, bfqq);
 
 	return bfqq;
@@ -5410,14 +6108,15 @@
 		     HRTIMER_MODE_REL);
 	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
 
-	bfqd->queue_weights_tree = RB_ROOT;
-	bfqd->group_weights_tree = RB_ROOT;
+	bfqd->queue_weights_tree = RB_ROOT_CACHED;
+	bfqd->num_groups_with_pending_reqs = 0;
 
 	INIT_LIST_HEAD(&bfqd->active_list);
 	INIT_LIST_HEAD(&bfqd->idle_list);
 	INIT_HLIST_HEAD(&bfqd->burst_list);
 
 	bfqd->hw_tag = -1;
+	bfqd->nonrot_with_queueing = blk_queue_nonrot(bfqd->queue);
 
 	bfqd->bfq_max_budget = bfq_default_max_budget;
 

diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index a41e988..eba7cd4 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h

@@ -32,6 +32,8 @@
 #define BFQ_DEFAULT_GRP_IOPRIO	0
 #define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE
 
+#define MAX_PID_STR_LENGTH 12
+
 /*
  * Soft real-time applications are extremely more latency sensitive
  * than interactive ones. Over-raise the weight of the former to
@@ -89,7 +91,7 @@
  * expiration. This peculiar definition allows for the following
  * optimization, not yet exploited: while a given entity is still in
  * service, we already know which is the best candidate for next
- * service among the other active entitities in the same parent
+ * service among the other active entities in the same parent
  * entity. We can then quickly compare the timestamps of the
  * in-service entity with those of such best candidate.
  *
@@ -108,15 +110,14 @@
 };
 
 /**
- * struct bfq_weight_counter - counter of the number of all active entities
+ * struct bfq_weight_counter - counter of the number of all active queues
  *                             with a given weight.
  */
 struct bfq_weight_counter {
-	unsigned int weight; /* weight of the entities this counter refers to */
-	unsigned int num_active; /* nr of active entities with this weight */
+	unsigned int weight; /* weight of the queues this counter refers to */
+	unsigned int num_active; /* nr of active queues with this weight */
 	/*
-	 * Weights tree member (see bfq_data's @queue_weights_tree and
-	 * @group_weights_tree)
+	 * Weights tree member (see bfq_data's @queue_weights_tree)
 	 */
 	struct rb_node weights_node;
 };
@@ -141,7 +142,7 @@
  *
  * Unless cgroups are used, the weight value is calculated from the
  * ioprio to export the same interface as CFQ.  When dealing with
- * ``well-behaved'' queues (i.e., queues that do not spend too much
+ * "well-behaved" queues (i.e., queues that do not spend too much
  * time to consume their budget and have true sequential behavior, and
  * when there are no external factors breaking anticipation) the
  * relative weights at each level of the cgroups hierarchy should be
@@ -151,8 +152,6 @@
 struct bfq_entity {
 	/* service_tree member */
 	struct rb_node rb_node;
-	/* pointer to the weight counter associated with this entity */
-	struct bfq_weight_counter *weight_counter;
 
 	/*
 	 * Flag, true if the entity is on a tree (either the active or
@@ -199,6 +198,9 @@
 
 	/* flag, set to request a weight, ioprio or ioprio_class change  */
 	int prio_changed;
+
+	/* flag, set if the entity is counted in groups_with_pending_reqs */
+	bool in_groups_with_pending_reqs;
 };
 
 struct bfq_group;
@@ -240,6 +242,13 @@
 	/* next ioprio and ioprio class if a change is in progress */
 	unsigned short new_ioprio, new_ioprio_class;
 
+	/* last total-service-time sample, see bfq_update_inject_limit() */
+	u64 last_serv_time_ns;
+	/* limit for request injection */
+	unsigned int inject_limit;
+	/* last time the inject limit has been decreased, in jiffies */
+	unsigned long decrease_time_jif;
+
 	/*
 	 * Shared bfq_queue if queue is cooperating with one or more
 	 * other queues.
@@ -266,6 +275,9 @@
 	/* entity representing this queue in the scheduler */
 	struct bfq_entity entity;
 
+	/* pointer to the weight counter associated with this entity */
+	struct bfq_weight_counter *weight_counter;
+
 	/* maximum budget allowed from the feedback mechanism */
 	int max_budget;
 	/* budget expiration (in jiffies) */
@@ -354,29 +366,6 @@
 
 	/* max service rate measured so far */
 	u32 max_service_rate;
-	/*
-	 * Ratio between the service received by bfqq while it is in
-	 * service, and the cumulative service (of requests of other
-	 * queues) that may be injected while bfqq is empty but still
-	 * in service. To increase precision, the coefficient is
-	 * measured in tenths of unit. Here are some example of (1)
-	 * ratios, (2) resulting percentages of service injected
-	 * w.r.t. to the total service dispatched while bfqq is in
-	 * service, and (3) corresponding values of the coefficient:
-	 * 1 (50%) -> 10
-	 * 2 (33%) -> 20
-	 * 10 (9%) -> 100
-	 * 9.9 (9%) -> 99
-	 * 1.5 (40%) -> 15
-	 * 0.5 (66%) -> 5
-	 * 0.1 (90%) -> 1
-	 *
-	 * So, if the coefficient is lower than 10, then
-	 * injected service is more than bfqq service.
-	 */
-	unsigned int inject_coeff;
-	/* amount of service injected in current service slot */
-	unsigned int injected_service;
 };
 
 /**
@@ -416,6 +405,15 @@
 	bool was_in_burst_list;
 
 	/*
+	 * Save the weight when a merge occurs, to be able
+	 * to restore it in case of split. If the weight is not
+	 * correctly resumed when the queue is recycled,
+	 * then the weight of the recycled queue could differ
+	 * from the weight of the original queue.
+	 */
+	unsigned int saved_weight;
+
+	/*
 	 * Similar to previous fields: save wr information.
 	 */
 	unsigned long saved_wr_coeff;
@@ -447,22 +445,62 @@
 	 * weight-raised @bfq_queue (see the comments to the functions
 	 * bfq_weights_tree_[add|remove] for further details).
 	 */
-	struct rb_root queue_weights_tree;
-	/*
-	 * rbtree of non-queue @bfq_entity weight counters, sorted by
-	 * weight. Used to keep track of whether all @bfq_groups have
-	 * the same weight. The tree contains one counter for each
-	 * distinct weight associated to some active @bfq_group (see
-	 * the comments to the functions bfq_weights_tree_[add|remove]
-	 * for further details).
-	 */
-	struct rb_root group_weights_tree;
+	struct rb_root_cached queue_weights_tree;
 
 	/*
-	 * Number of bfq_queues containing requests (including the
-	 * queue in service, even if it is idling).
+	 * Number of groups with at least one descendant process that
+	 * has at least one request waiting for completion. Note that
+	 * this accounts for also requests already dispatched, but not
+	 * yet completed. Therefore this number of groups may differ
+	 * (be larger) than the number of active groups, as a group is
+	 * considered active only if its corresponding entity has
+	 * descendant queues with at least one request queued. This
+	 * number is used to decide whether a scenario is symmetric.
+	 * For a detailed explanation see comments on the computation
+	 * of the variable asymmetric_scenario in the function
+	 * bfq_better_to_idle().
+	 *
+	 * However, it is hard to compute this number exactly, for
+	 * groups with multiple descendant processes. Consider a group
+	 * that is inactive, i.e., that has no descendant process with
+	 * pending I/O inside BFQ queues. Then suppose that
+	 * num_groups_with_pending_reqs is still accounting for this
+	 * group, because the group has descendant processes with some
+	 * I/O request still in flight. num_groups_with_pending_reqs
+	 * should be decremented when the in-flight request of the
+	 * last descendant process is finally completed (assuming that
+	 * nothing else has changed for the group in the meantime, in
+	 * terms of composition of the group and active/inactive state of child
+	 * groups and processes). To accomplish this, an additional
+	 * pending-request counter must be added to entities, and must
+	 * be updated correctly. To avoid this additional field and operations,
+	 * we resort to the following tradeoff between simplicity and
+	 * accuracy: for an inactive group that is still counted in
+	 * num_groups_with_pending_reqs, we decrement
+	 * num_groups_with_pending_reqs when the first descendant
+	 * process of the group remains with no request waiting for
+	 * completion.
+	 *
+	 * Even this simpler decrement strategy requires a little
+	 * carefulness: to avoid multiple decrements, we flag a group,
+	 * more precisely an entity representing a group, as still
+	 * counted in num_groups_with_pending_reqs when it becomes
+	 * inactive. Then, when the first descendant queue of the
+	 * entity remains with no request waiting for completion,
+	 * num_groups_with_pending_reqs is decremented, and this flag
+	 * is reset. After this flag is reset for the entity,
+	 * num_groups_with_pending_reqs won't be decremented any
+	 * longer in case a new descendant queue of the entity remains
+	 * with no request waiting for completion.
 	 */
-	int busy_queues;
+	unsigned int num_groups_with_pending_reqs;
+
+	/*
+	 * Per-class (RT, BE, IDLE) number of bfq_queues containing
+	 * requests (including the queue in service, even if it is
+	 * idling).
+	 */
+	unsigned int busy_queues[3];
 	/* number of weight-raised busy @bfq_queues */
 	int wr_busy_queues;
 	/* number of queued requests */
@@ -470,6 +508,9 @@
 	/* number of requests dispatched and waiting for completion */
 	int rq_in_driver;
 
+	/* true if the device is non rotational and performs queueing */
+	bool nonrot_with_queueing;
+
 	/*
 	 * Maximum number of requests in driver in the last
 	 * @hw_tag_samples completed requests.
@@ -501,6 +542,26 @@
 	/* time of last request completion (ns) */
 	u64 last_completion;
 
+	/* time of last transition from empty to non-empty (ns) */
+	u64 last_empty_occupied_ns;
+
+	/*
+	 * Flag set to activate the sampling of the total service time
+	 * of a just-arrived first I/O request (see
+	 * bfq_update_inject_limit()). This will cause the setting of
+	 * waited_rq when the request is finally dispatched.
+	 */
+	bool wait_dispatch;
+	/*
+	 *  If set, then bfq_update_inject_limit() is invoked when
+	 *  waited_rq is eventually completed.
+	 */
+	struct request *waited_rq;
+	/*
+	 * True if some request has been injected during the last service hole.
+	 */
+	bool rqs_injected;
+
 	/* time of first rq dispatch in current observation interval (ns) */
 	u64 first_dispatch;
 	/* time of last rq dispatch in current observation interval (ns) */
@@ -510,6 +571,7 @@
 	ktime_t last_budget_start;
 	/* beginning of the last idle slice */
 	ktime_t last_idling_start;
+	unsigned long last_idling_start_jiffies;
 
 	/* number of samples in current observation interval */
 	int peak_rate_samples;
@@ -854,11 +916,11 @@
 void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync);
 struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
 void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
-void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
-			  struct rb_root *root);
+void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			  struct rb_root_cached *root);
 void __bfq_weights_tree_remove(struct bfq_data *bfqd,
-			       struct bfq_entity *entity,
-			       struct rb_root *root);
+			       struct bfq_queue *bfqq,
+			       struct rb_root_cached *root);
 void bfq_weights_tree_remove(struct bfq_data *bfqd,
 			     struct bfq_queue *bfqq);
 void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
@@ -935,6 +997,7 @@
 
 struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq);
 struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
+unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd);
 struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
 struct bfq_entity *bfq_entity_of(struct rb_node *node);
 unsigned short bfq_ioprio_to_weight(int ioprio);
@@ -951,7 +1014,7 @@
 			     bool ins_into_idle_tree);
 bool next_queue_may_preempt(struct bfq_data *bfqd);
 struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd);
-void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd);
+bool __bfq_bfqd_reset_in_service(struct bfq_data *bfqd);
 void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 			 bool ins_into_idle_tree, bool expiration);
 void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
@@ -964,13 +1027,23 @@
 /* --------------- end of interface of B-WF2Q+ ---------------- */
 
 /* Logging facilities. */
+static inline void bfq_pid_to_str(int pid, char *str, int len)
+{
+	if (pid != -1)
+		snprintf(str, len, "%d", pid);
+	else
+		snprintf(str, len, "SHARED-");
+}
+
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
 
 #define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
+	char pid_str[MAX_PID_STR_LENGTH];	\
+	bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH);	\
 	blk_add_cgroup_trace_msg((bfqd)->queue,				\
 			bfqg_to_blkg(bfqq_group(bfqq))->blkcg,		\
-			"bfq%d%c " fmt, (bfqq)->pid,			\
+			"bfq%s%c " fmt, pid_str,			\
 			bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args);	\
 } while (0)
 
@@ -981,10 +1054,13 @@
 
 #else /* CONFIG_BFQ_GROUP_IOSCHED */
 
-#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	\
-	blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid,	\
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do {	\
+	char pid_str[MAX_PID_STR_LENGTH];	\
+	bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH);	\
+	blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str,	\
 			bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
-				##args)
+				##args);	\
+} while (0)
 #define bfq_log_bfqg(bfqd, bfqg, fmt, args...)		do {} while (0)
 
 #endif /* CONFIG_BFQ_GROUP_IOSCHED */

diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index ff7c2d4..48d899c 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c

@@ -44,6 +44,12 @@
 		BFQ_DEFAULT_GRP_CLASS - 1;
 }
 
+unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd)
+{
+	return bfqd->busy_queues[0] + bfqd->busy_queues[1] +
+		bfqd->busy_queues[2];
+}
+
 static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
 						 bool expiration);
 
@@ -53,7 +59,7 @@
  * bfq_update_next_in_service - update sd->next_in_service
  * @sd: sched_data for which to perform the update.
  * @new_entity: if not NULL, pointer to the entity whose activation,
- *		requeueing or repositionig triggered the invocation of
+ *		requeueing or repositioning triggered the invocation of
  *		this function.
  * @expiration: id true, this function is being invoked after the
  *             expiration of the in-service entity
@@ -84,7 +90,7 @@
 
 	/*
 	 * If this update is triggered by the activation, requeueing
-	 * or repositiong of an entity that does not coincide with
+	 * or repositioning of an entity that does not coincide with
 	 * sd->next_in_service, then a full lookup in the active tree
 	 * can be avoided. In fact, it is enough to check whether the
 	 * just-modified entity has the same priority as
@@ -731,7 +737,7 @@
 		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
 		unsigned int prev_weight, new_weight;
 		struct bfq_data *bfqd = NULL;
-		struct rb_root *root;
+		struct rb_root_cached *root;
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 		struct bfq_sched_data *sd;
 		struct bfq_group *bfqg;
@@ -788,25 +794,23 @@
 		new_weight = entity->orig_weight *
 			     (bfqq ? bfqq->wr_coeff : 1);
 		/*
-		 * If the weight of the entity changes, remove the entity
-		 * from its old weight counter (if there is a counter
-		 * associated with the entity), and add it to the counter
-		 * associated with its new weight.
+		 * If the weight of the entity changes, and the entity is a
+		 * queue, remove the entity from its old weight counter (if
+		 * there is a counter associated with the entity).
 		 */
-		if (prev_weight != new_weight) {
-			root = bfqq ? &bfqd->queue_weights_tree :
-				      &bfqd->group_weights_tree;
-			__bfq_weights_tree_remove(bfqd, entity, root);
+		if (prev_weight != new_weight && bfqq) {
+			root = &bfqd->queue_weights_tree;
+			__bfq_weights_tree_remove(bfqd, bfqq, root);
 		}
 		entity->weight = new_weight;
 		/*
-		 * Add the entity to its weights tree only if it is
-		 * not associated with a weight-raised queue.
+		 * Add the entity, if it is not a weight-raised queue,
+		 * to the counter associated with its new weight.
 		 */
-		if (prev_weight != new_weight &&
-		    (bfqq ? bfqq->wr_coeff == 1 : 1))
+		if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1) {
 			/* If we get here, root has been initialized. */
-			bfq_weights_tree_add(bfqd, entity, root);
+			bfq_weights_tree_add(bfqd, bfqq, root);
+		}
 
 		new_st->wsum += entity->weight;
 
@@ -1008,13 +1012,16 @@
 		entity->on_st = true;
 	}
 
-#ifdef BFQ_GROUP_IOSCHED_ENABLED
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 	if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */
 		struct bfq_group *bfqg =
 			container_of(entity, struct bfq_group, entity);
+		struct bfq_data *bfqd = bfqg->bfqd;
 
-		bfq_weights_tree_add(bfqg->bfqd, entity,
-				     &bfqd->group_weights_tree);
+		if (!entity->in_groups_with_pending_reqs) {
+			entity->in_groups_with_pending_reqs = true;
+			bfqd->num_groups_with_pending_reqs++;
+		}
 	}
 #endif
 
@@ -1153,15 +1160,14 @@
 }
 
 /**
- * __bfq_deactivate_entity - deactivate an entity from its service tree.
- * @entity: the entity to deactivate.
+ * __bfq_deactivate_entity - update sched_data and service trees for
+ * entity, so as to represent entity as inactive
+ * @entity: the entity being deactivated.
  * @ins_into_idle_tree: if false, the entity will not be put into the
  *			idle tree.
  *
- * Deactivates an entity, independently of its previous state.  Must
- * be invoked only if entity is on a service tree. Extracts the entity
- * from that tree, and if necessary and allowed, puts it into the idle
- * tree.
+ * If necessary and allowed, puts entity into the idle tree. NOTE:
+ * entity may be on no tree if in service.
  */
 bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree)
 {
@@ -1390,7 +1396,7 @@
  * In this first case, update the virtual time in @st too (see the
  * comments on this update inside the function).
  *
- * In constrast, if there is an in-service entity, then return the
+ * In contrast, if there is an in-service entity, then return the
  * entity that would be set in service if not only the above
  * conditions, but also the next one held true: the currently
  * in-service entity, on expiration,
@@ -1473,12 +1479,12 @@
 		 * is being invoked as a part of the expiration path
 		 * of the in-service queue. In this case, even if
 		 * sd->in_service_entity is not NULL,
-		 * sd->in_service_entiy at this point is actually not
+		 * sd->in_service_entity at this point is actually not
 		 * in service any more, and, if needed, has already
 		 * been properly queued or requeued into the right
 		 * tree. The reason why sd->in_service_entity is still
 		 * not NULL here, even if expiration is true, is that
-		 * sd->in_service_entiy is reset as a last step in the
+		 * sd->in_service_entity is reset as a last step in the
 		 * expiration path. So, if expiration is true, tell
 		 * __bfq_lookup_next_entity that there is no
 		 * sd->in_service_entity.
@@ -1513,7 +1519,7 @@
 	struct bfq_sched_data *sd;
 	struct bfq_queue *bfqq;
 
-	if (bfqd->busy_queues == 0)
+	if (bfq_tot_busy_queues(bfqd) == 0)
 		return NULL;
 
 	/*
@@ -1599,7 +1605,8 @@
 	return bfqq;
 }
 
-void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
+/* returns true if the in-service queue gets freed */
+bool __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
 {
 	struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
 	struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
@@ -1623,8 +1630,20 @@
 	 * service tree either, then release the service reference to
 	 * the queue it represents (taken with bfq_get_entity).
 	 */
-	if (!in_serv_entity->on_st)
+	if (!in_serv_entity->on_st) {
+		/*
+		 * If no process is referencing in_serv_bfqq any
+		 * longer, then the service reference may be the only
+		 * reference to the queue. If this is the case, then
+		 * bfqq gets freed here.
+		 */
+		int ref = in_serv_bfqq->ref;
 		bfq_put_queue(in_serv_bfqq);
+		if (ref == 1)
+			return true;
+	}
+
+	return false;
 }
 
 void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
@@ -1665,10 +1684,7 @@
 
 	bfq_clear_bfqq_busy(bfqq);
 
-	bfqd->busy_queues--;
-
-	if (!bfqq->dispatched)
-		bfq_weights_tree_remove(bfqd, bfqq);
+	bfqd->busy_queues[bfqq->ioprio_class - 1]--;
 
 	if (bfqq->wr_coeff > 1)
 		bfqd->wr_busy_queues--;
@@ -1676,6 +1692,9 @@
 	bfqg_stats_update_dequeue(bfqq_group(bfqq));
 
 	bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
+
+	if (!bfqq->dispatched)
+		bfq_weights_tree_remove(bfqd, bfqq);
 }
 
 /*
@@ -1688,11 +1707,11 @@
 	bfq_activate_bfqq(bfqd, bfqq);
 
 	bfq_mark_bfqq_busy(bfqq);
-	bfqd->busy_queues++;
+	bfqd->busy_queues[bfqq->ioprio_class - 1]++;
 
 	if (!bfqq->dispatched)
 		if (bfqq->wr_coeff == 1)
-			bfq_weights_tree_add(bfqd, &bfqq->entity,
+			bfq_weights_tree_add(bfqd, bfqq,
 					     &bfqd->queue_weights_tree);
 
 	if (bfqq->wr_coeff > 1)

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 5006a0d..4ca4f0b 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c

@@ -16,6 +16,18 @@
 
 static void blk_mq_sysfs_release(struct kobject *kobj)
 {
+	struct blk_mq_ctxs *ctxs = container_of(kobj, struct blk_mq_ctxs, kobj);
+
+	free_percpu(ctxs->queue_ctx);
+	kfree(ctxs);
+}
+
+static void blk_mq_ctx_sysfs_release(struct kobject *kobj)
+{
+	struct blk_mq_ctx *ctx = container_of(kobj, struct blk_mq_ctx, kobj);
+
+	/* ctx->ctxs won't be released until all ctx are freed */
+	kobject_put(&ctx->ctxs->kobj);
 }
 
 static void blk_mq_hw_sysfs_release(struct kobject *kobj)
@@ -214,7 +226,7 @@
 static struct kobj_type blk_mq_ctx_ktype = {
 	.sysfs_ops	= &blk_mq_sysfs_ops,
 	.default_attrs	= default_ctx_attrs,
-	.release	= blk_mq_sysfs_release,
+	.release	= blk_mq_ctx_sysfs_release,
 };
 
 static struct kobj_type blk_mq_hw_ktype = {
@@ -246,7 +258,7 @@
 	if (!hctx->nr_ctx)
 		return 0;
 
-	ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num);
+	ret = kobject_add(&hctx->kobj, q->mq_kobj, "%u", hctx->queue_num);
 	if (ret)
 		return ret;
 
@@ -269,8 +281,8 @@
 	queue_for_each_hw_ctx(q, hctx, i)
 		blk_mq_unregister_hctx(hctx);
 
-	kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
-	kobject_del(&q->mq_kobj);
+	kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
+	kobject_del(q->mq_kobj);
 	kobject_put(&dev->kobj);
 
 	q->mq_sysfs_init_done = false;
@@ -290,7 +302,7 @@
 		ctx = per_cpu_ptr(q->queue_ctx, cpu);
 		kobject_put(&ctx->kobj);
 	}
-	kobject_put(&q->mq_kobj);
+	kobject_put(q->mq_kobj);
 }
 
 void blk_mq_sysfs_init(struct request_queue *q)
@@ -298,10 +310,12 @@
 	struct blk_mq_ctx *ctx;
 	int cpu;
 
-	kobject_init(&q->mq_kobj, &blk_mq_ktype);
+	kobject_init(q->mq_kobj, &blk_mq_ktype);
 
 	for_each_possible_cpu(cpu) {
 		ctx = per_cpu_ptr(q->queue_ctx, cpu);
+
+		kobject_get(q->mq_kobj);
 		kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
 	}
 }
@@ -314,11 +328,11 @@
 	WARN_ON_ONCE(!q->kobj.parent);
 	lockdep_assert_held(&q->sysfs_lock);
 
-	ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
+	ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
 	if (ret < 0)
 		goto out;
 
-	kobject_uevent(&q->mq_kobj, KOBJ_ADD);
+	kobject_uevent(q->mq_kobj, KOBJ_ADD);
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		ret = blk_mq_register_hctx(hctx);
@@ -335,8 +349,8 @@
 	while (--i >= 0)
 		blk_mq_unregister_hctx(q->queue_hw_ctx[i]);
 
-	kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
-	kobject_del(&q->mq_kobj);
+	kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
+	kobject_del(q->mq_kobj);
 	kobject_put(&dev->kobj);
 	return ret;
 }

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 684acaa..dba55f3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c

@@ -2453,6 +2453,34 @@
 	mutex_unlock(&set->tag_list_lock);
 }
 
+/* All allocations will be freed in release handler of q->mq_kobj */
+static int blk_mq_alloc_ctxs(struct request_queue *q)
+{
+	struct blk_mq_ctxs *ctxs;
+	int cpu;
+
+	ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
+	if (!ctxs)
+		return -ENOMEM;
+
+	ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
+	if (!ctxs->queue_ctx)
+		goto fail;
+
+	for_each_possible_cpu(cpu) {
+		struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
+		ctx->ctxs = ctxs;
+	}
+
+	q->mq_kobj = &ctxs->kobj;
+	q->queue_ctx = ctxs->queue_ctx;
+
+	return 0;
+ fail:
+	kfree(ctxs);
+	return -ENOMEM;
+}
+
 /*
  * It is the actual release handler for mq, but we do it from
  * request queue's release handler for avoiding use-after-free
@@ -2480,8 +2508,6 @@
 	 * both share lifetime with request queue.
 	 */
 	blk_mq_sysfs_deinit(q);
-
-	free_percpu(q->queue_ctx);
 }
 
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
@@ -2586,8 +2612,7 @@
 	if (!q->poll_cb)
 		goto err_exit;
 
-	q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
-	if (!q->queue_ctx)
+	if (blk_mq_alloc_ctxs(q))
 		goto err_exit;
 
 	/* init q->mq_kobj and sw queues' kobjects */
@@ -2596,7 +2621,7 @@
 	q->queue_hw_ctx = kcalloc_node(nr_cpu_ids, sizeof(*(q->queue_hw_ctx)),
 						GFP_KERNEL, set->numa_node);
 	if (!q->queue_hw_ctx)
-		goto err_percpu;
+		goto err_sys_init;
 
 	q->mq_map = set->mq_map;
 
@@ -2653,8 +2678,8 @@
 
 err_hctxs:
 	kfree(q->queue_hw_ctx);
-err_percpu:
-	free_percpu(q->queue_ctx);
+err_sys_init:
+	blk_mq_sysfs_deinit(q);
 err_exit:
 	q->mq_ops = NULL;
 	return ERR_PTR(-ENOMEM);

diff --git a/block/blk-mq.h b/block/blk-mq.h
index 5ad9251..a6094c2 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h

@@ -7,6 +7,11 @@
 
 struct blk_mq_tag_set;
 
+struct blk_mq_ctxs {
+	struct kobject kobj;
+	struct blk_mq_ctx __percpu	*queue_ctx;
+};
+
 /**
  * struct blk_mq_ctx - State for a software queue facing the submitting CPUs
  */
@@ -27,6 +32,7 @@
 	unsigned long		____cacheline_aligned_in_smp rq_completed[2];
 
 	struct request_queue	*queue;
+	struct blk_mq_ctxs      *ctxs;
 	struct kobject		kobj;
 } ____cacheline_aligned_in_smp;
 

diff --git a/crypto/Makefile b/crypto/Makefile
index f6a234d..e7397bd 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile

@@ -124,7 +124,7 @@
 obj-$(CONFIG_CRYPTO_CRC32) += crc32_generic.o
 obj-$(CONFIG_CRYPTO_CRCT10DIF) += crct10dif_common.o crct10dif_generic.o
 obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o authencesn.o
-obj-$(CONFIG_CRYPTO_LZO) += lzo.o
+obj-$(CONFIG_CRYPTO_LZO) += lzo.o lzo-rle.o
 obj-$(CONFIG_CRYPTO_LZ4) += lz4.o
 obj-$(CONFIG_CRYPTO_LZ4HC) += lz4hc.o
 obj-$(CONFIG_CRYPTO_842) += 842.o

diff --git a/crypto/lzo-rle.c b/crypto/lzo-rle.c
new file mode 100644
index 0000000..ea9c75b
--- /dev/null
+++ b/crypto/lzo-rle.c

@@ -0,0 +1,175 @@
+/*
+ * Cryptographic API.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/lzo.h>
+#include <crypto/internal/scompress.h>
+
+struct lzorle_ctx {
+	void *lzorle_comp_mem;
+};
+
+static void *lzorle_alloc_ctx(struct crypto_scomp *tfm)
+{
+	void *ctx;
+
+	ctx = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
+	if (!ctx)
+		return ERR_PTR(-ENOMEM);
+
+	return ctx;
+}
+
+static int lzorle_init(struct crypto_tfm *tfm)
+{
+	struct lzorle_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->lzorle_comp_mem = lzorle_alloc_ctx(NULL);
+	if (IS_ERR(ctx->lzorle_comp_mem))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void lzorle_free_ctx(struct crypto_scomp *tfm, void *ctx)
+{
+	kvfree(ctx);
+}
+
+static void lzorle_exit(struct crypto_tfm *tfm)
+{
+	struct lzorle_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	lzorle_free_ctx(NULL, ctx->lzorle_comp_mem);
+}
+
+static int __lzorle_compress(const u8 *src, unsigned int slen,
+			  u8 *dst, unsigned int *dlen, void *ctx)
+{
+	size_t tmp_len = *dlen; /* size_t(ulong) <-> uint on 64 bit */
+	int err;
+
+	err = lzorle1x_1_compress(src, slen, dst, &tmp_len, ctx);
+
+	if (err != LZO_E_OK)
+		return -EINVAL;
+
+	*dlen = tmp_len;
+	return 0;
+}
+
+static int lzorle_compress(struct crypto_tfm *tfm, const u8 *src,
+			unsigned int slen, u8 *dst, unsigned int *dlen)
+{
+	struct lzorle_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	return __lzorle_compress(src, slen, dst, dlen, ctx->lzorle_comp_mem);
+}
+
+static int lzorle_scompress(struct crypto_scomp *tfm, const u8 *src,
+			 unsigned int slen, u8 *dst, unsigned int *dlen,
+			 void *ctx)
+{
+	return __lzorle_compress(src, slen, dst, dlen, ctx);
+}
+
+static int __lzorle_decompress(const u8 *src, unsigned int slen,
+			    u8 *dst, unsigned int *dlen)
+{
+	int err;
+	size_t tmp_len = *dlen; /* size_t(ulong) <-> uint on 64 bit */
+
+	err = lzo1x_decompress_safe(src, slen, dst, &tmp_len);
+
+	if (err != LZO_E_OK)
+		return -EINVAL;
+
+	*dlen = tmp_len;
+	return 0;
+}
+
+static int lzorle_decompress(struct crypto_tfm *tfm, const u8 *src,
+			  unsigned int slen, u8 *dst, unsigned int *dlen)
+{
+	return __lzorle_decompress(src, slen, dst, dlen);
+}
+
+static int lzorle_sdecompress(struct crypto_scomp *tfm, const u8 *src,
+			   unsigned int slen, u8 *dst, unsigned int *dlen,
+			   void *ctx)
+{
+	return __lzorle_decompress(src, slen, dst, dlen);
+}
+
+static struct crypto_alg alg = {
+	.cra_name		= "lzo-rle",
+	.cra_flags		= CRYPTO_ALG_TYPE_COMPRESS,
+	.cra_ctxsize		= sizeof(struct lzorle_ctx),
+	.cra_module		= THIS_MODULE,
+	.cra_init		= lzorle_init,
+	.cra_exit		= lzorle_exit,
+	.cra_u			= { .compress = {
+	.coa_compress		= lzorle_compress,
+	.coa_decompress		= lzorle_decompress } }
+};
+
+static struct scomp_alg scomp = {
+	.alloc_ctx		= lzorle_alloc_ctx,
+	.free_ctx		= lzorle_free_ctx,
+	.compress		= lzorle_scompress,
+	.decompress		= lzorle_sdecompress,
+	.base			= {
+		.cra_name	= "lzo-rle",
+		.cra_driver_name = "lzo-rle-scomp",
+		.cra_module	 = THIS_MODULE,
+	}
+};
+
+static int __init lzorle_mod_init(void)
+{
+	int ret;
+
+	ret = crypto_register_alg(&alg);
+	if (ret)
+		return ret;
+
+	ret = crypto_register_scomp(&scomp);
+	if (ret) {
+		crypto_unregister_alg(&alg);
+		return ret;
+	}
+
+	return ret;
+}
+
+static void __exit lzorle_mod_fini(void)
+{
+	crypto_unregister_alg(&alg);
+	crypto_unregister_scomp(&scomp);
+}
+
+module_init(lzorle_mod_init);
+module_exit(lzorle_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("LZO-RLE Compression Algorithm");
+MODULE_ALIAS_CRYPTO("lzo-rle");

diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index d332988..18948f8 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c

@@ -76,8 +76,8 @@
 	"cast6", "arc4", "michael_mic", "deflate", "crc32c", "tea", "xtea",
 	"khazad", "wp512", "wp384", "wp256", "tnepres", "xeta",  "fcrypt",
 	"camellia", "seed", "salsa20", "rmd128", "rmd160", "rmd256", "rmd320",
-	"lzo", "cts", "zlib", "sha3-224", "sha3-256", "sha3-384", "sha3-512",
-	NULL
+	"lzo", "lzo-rle", "cts", "zlib", "sha3-224", "sha3-256", "sha3-384",
+	"sha3-512", NULL
 };
 
 static u32 block_sizes[] = { 16, 64, 256, 1024, 8192, 0 };

diff --git a/drivers/acpi/acpi_watchdog.c b/drivers/acpi/acpi_watchdog.c
index 9560030..23cde3d 100644
--- a/drivers/acpi/acpi_watchdog.c
+++ b/drivers/acpi/acpi_watchdog.c

@@ -129,12 +129,11 @@
 		gas = &entries[i].register_region;
 
 		res.start = gas->address;
+		res.end = res.start + ACPI_ACCESS_BYTE_WIDTH(gas->access_width) - 1;
 		if (gas->space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) {
 			res.flags = IORESOURCE_MEM;
-			res.end = res.start + ALIGN(gas->access_width, 4) - 1;
 		} else if (gas->space_id == ACPI_ADR_SPACE_SYSTEM_IO) {
 			res.flags = IORESOURCE_IO;
-			res.end = res.start + gas->access_width - 1;
 		} else {
 			pr_warn("Unsupported address space: %u\n",
 				gas->space_id);

diff --git a/drivers/acpi/acpica/dsfield.c b/drivers/acpi/acpica/dsfield.c
index 30fe895..bcc6a7a 100644
--- a/drivers/acpi/acpica/dsfield.c
+++ b/drivers/acpi/acpica/dsfield.c

@@ -244,7 +244,7 @@
  * FUNCTION:    acpi_ds_get_field_names
  *
  * PARAMETERS:  info            - create_field info structure
- *  `           walk_state      - Current method state
+ *              walk_state      - Current method state
  *              arg             - First parser arg for the field name list
  *
  * RETURN:      Status

diff --git a/drivers/acpi/acpica/dswload.c b/drivers/acpi/acpica/dswload.c
index d06c414..ba53662 100644
--- a/drivers/acpi/acpica/dswload.c
+++ b/drivers/acpi/acpica/dswload.c

@@ -412,6 +412,27 @@
 	ACPI_DEBUG_PRINT((ACPI_DB_DISPATCH, "Op=%p State=%p\n", op,
 			  walk_state));
 
+	/*
+	 * Disassembler: handle create field operators here.
+	 *
+	 * create_buffer_field is a deferred op that is typically processed in load
+	 * pass 2. However, disassembly of control method contents walk the parse
+	 * tree with ACPI_PARSE_LOAD_PASS1 and AML_CREATE operators are processed
+	 * in a later walk. This is a problem when there is a control method that
+	 * has the same name as the AML_CREATE object. In this case, any use of the
+	 * name segment will be detected as a method call rather than a reference
+	 * to a buffer field.
+	 *
+	 * This earlier creation during disassembly solves this issue by inserting
+	 * the named object in the ACPI namespace so that references to this name
+	 * would be a name string rather than a method call.
+	 */
+	if ((walk_state->parse_flags & ACPI_PARSE_DISASSEMBLE) &&
+	    (walk_state->op_info->flags & AML_CREATE)) {
+		status = acpi_ds_create_buffer_field(op, walk_state);
+		return_ACPI_STATUS(status);
+	}
+
 	/* We are only interested in opcodes that have an associated name */
 
 	if (!(walk_state->op_info->flags & (AML_NAMED | AML_FIELD))) {

diff --git a/drivers/acpi/acpica/evgpe.c b/drivers/acpi/acpica/evgpe.c
index 4b5d3b4..4da586f 100644
--- a/drivers/acpi/acpica/evgpe.c
+++ b/drivers/acpi/acpica/evgpe.c

@@ -81,8 +81,12 @@
 
 	ACPI_FUNCTION_TRACE(ev_enable_gpe);
 
-	/* Enable the requested GPE */
+	/* Clear the GPE (of stale events) */
+	status = acpi_hw_clear_gpe(gpe_event_info);
+	if (ACPI_FAILURE(status))
+		return_ACPI_STATUS(status);
 
+	/* Enable the requested GPE */
 	status = acpi_hw_low_set_gpe(gpe_event_info, ACPI_GPE_ENABLE);
 	return_ACPI_STATUS(status);
 }

diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c
index a25d77b..d5c19e2 100644
--- a/drivers/acpi/button.c
+++ b/drivers/acpi/button.c

@@ -102,6 +102,17 @@
 		},
 		.driver_data = (void *)(long)ACPI_BUTTON_LID_INIT_OPEN,
 	},
+	{
+		/*
+		 * Razer Blade Stealth 13 late 2019, notification of the LID device
+		 * only happens on close, not on open and _LID always returns closed.
+		 */
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Razer"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Razer Blade Stealth 13 Late 2019"),
+		},
+		.driver_data = (void *)(long)ACPI_BUTTON_LID_INIT_OPEN,
+	},
 	{}
 };
 

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 8340c81b..ec374db 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c

@@ -2282,7 +2282,7 @@
 		offset = to_interleave_offset(offset, mmio);
 
 	writeq(cmd, mmio->addr.base + offset);
-	nvdimm_flush(nfit_blk->nd_region);
+	nvdimm_flush(nfit_blk->nd_region, NULL);
 
 	if (nfit_blk->dimm_flags & NFIT_BLK_DCR_LATCH)
 		readq(mmio->addr.base + offset);
@@ -2331,7 +2331,7 @@
 	}
 
 	if (rw)
-		nvdimm_flush(nfit_blk->nd_region);
+		nvdimm_flush(nfit_blk->nd_region, NULL);
 
 	rc = read_blk_stat(nfit_blk, lane) ? -EIO : 0;
 	return rc;

diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index 847db3e..6bd58d7 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c

@@ -584,6 +584,7 @@
 	acpi_status status = AE_OK;
 	u32 acpi_state = acpi_target_sleep_state;
 	int error;
+	u64 tsc;
 
 	ACPI_FLUSH_CPU_CACHE();
 
@@ -600,6 +601,9 @@
 		error = acpi_suspend_lowlevel();
 		if (error)
 			return error;
+		tsc = rdtsc_ordered();
+		printk(KERN_INFO "TSC at resume: %llu\n",
+				(unsigned long long)tsc);
 		pr_info(PREFIX "Low-level resume complete\n");
 		pm_set_resume_via_firmware();
 		break;

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index fa1c5a4..bbc8710 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c

@@ -96,6 +96,7 @@
 
 static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent);
 static void ahci_remove_one(struct pci_dev *dev);
+static void ahci_shutdown_one(struct pci_dev *dev);
 static int ahci_vt8251_hardreset(struct ata_link *link, unsigned int *class,
 				 unsigned long deadline);
 static int ahci_avn_hardreset(struct ata_link *link, unsigned int *class,
@@ -609,6 +610,7 @@
 	.id_table		= ahci_pci_tbl,
 	.probe			= ahci_init_one,
 	.remove			= ahci_remove_one,
+	.shutdown		= ahci_shutdown_one,
 	.driver = {
 		.pm		= &ahci_pci_pm_ops,
 	},
@@ -1897,6 +1899,11 @@
 	return 0;
 }
 
+static void ahci_shutdown_one(struct pci_dev *pdev)
+{
+	ata_pci_shutdown_one(pdev);
+}
+
 static void ahci_remove_one(struct pci_dev *pdev)
 {
 	pm_runtime_get_noresume(&pdev->dev);

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index b45b6f7..75d582c 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c

@@ -6780,6 +6780,26 @@
 	ata_host_detach(host);
 }
 
+void ata_pci_shutdown_one(struct pci_dev *pdev)
+{
+	struct ata_host *host = pci_get_drvdata(pdev);
+	int i;
+
+	for (i = 0; i < host->n_ports; i++) {
+		struct ata_port *ap = host->ports[i];
+
+		ap->pflags |= ATA_PFLAG_FROZEN;
+
+		/* Disable port interrupts */
+		if (ap->ops->freeze)
+			ap->ops->freeze(ap);
+
+		/* Stop the port DMA engines */
+		if (ap->ops->port_stop)
+			ap->ops->port_stop(ap);
+	}
+}
+
 /* move to PCI subsystem */
 int pci_test_config_bits(struct pci_dev *pdev, const struct pci_bits *bits)
 {
@@ -7400,6 +7420,7 @@
 
 #ifdef CONFIG_PCI
 EXPORT_SYMBOL_GPL(pci_test_config_bits);
+EXPORT_SYMBOL_GPL(ata_pci_shutdown_one);
 EXPORT_SYMBOL_GPL(ata_pci_remove_one);
 #ifdef CONFIG_PM
 EXPORT_SYMBOL_GPL(ata_pci_device_do_suspend);

diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c
index 99a3811..86aab14 100644
--- a/drivers/atm/fore200e.c
+++ b/drivers/atm/fore200e.c

@@ -1504,12 +1504,14 @@
 static void
 fore200e_close(struct atm_vcc* vcc)
 {
-    struct fore200e*        fore200e = FORE200E_DEV(vcc->dev);
     struct fore200e_vcc*    fore200e_vcc;
+    struct fore200e*        fore200e;
     struct fore200e_vc_map* vc_map;
     unsigned long           flags;
 
     ASSERT(vcc);
+    fore200e = FORE200E_DEV(vcc->dev);
+
     ASSERT((vcc->vpi >= 0) && (vcc->vpi < 1<<FORE200E_VPI_BITS));
     ASSERT((vcc->vci >= 0) && (vcc->vci < 1<<FORE200E_VCI_BITS));
 
@@ -1554,10 +1556,10 @@
 static int
 fore200e_send(struct atm_vcc *vcc, struct sk_buff *skb)
 {
-    struct fore200e*        fore200e     = FORE200E_DEV(vcc->dev);
-    struct fore200e_vcc*    fore200e_vcc = FORE200E_VCC(vcc);
+    struct fore200e*        fore200e;
+    struct fore200e_vcc*    fore200e_vcc;
     struct fore200e_vc_map* vc_map;
-    struct host_txq*        txq          = &fore200e->host_txq;
+    struct host_txq*        txq;
     struct host_txq_entry*  entry;
     struct tpd*             tpd;
     struct tpd_haddr        tpd_haddr;
@@ -1570,9 +1572,18 @@
     unsigned char*          data;
     unsigned long           flags;
 
-    ASSERT(vcc);
-    ASSERT(fore200e);
-    ASSERT(fore200e_vcc);
+    if (!vcc)
+        return -EINVAL;
+
+    fore200e = FORE200E_DEV(vcc->dev);
+    fore200e_vcc = FORE200E_VCC(vcc);
+
+    if (!fore200e)
+        return -EINVAL;
+
+    txq = &fore200e->host_txq;
+    if (!fore200e_vcc)
+        return -EINVAL;
 
     if (!test_bit(ATM_VF_READY, &vcc->flags)) {
 	DPRINTK(1, "VC %d.%d.%d not ready for tx\n", vcc->itf, vcc->vpi, vcc->vpi);

diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 3e63a90..3360746 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig

@@ -60,6 +60,15 @@
 	  rescue mode with init=/bin/sh, even when the /dev directory
 	  on the rootfs is completely empty.
 
+config DEVTMPFS_SAFE
+	bool "Automount devtmpfs with nosuid/noexec"
+	depends on DEVTMPFS_MOUNT
+	default y
+	help
+	  This instructs the kernel to automount devtmpfs with the
+	  MS_NOEXEC and MS_NOSUID mount flags, which can prevent
+	  certain kinds of code-execution attack on embedded platforms.
+
 config STANDALONE
 	bool "Select only drivers that don't need compile-time external firmware"
 	default y

diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 11d24a5..db5ab81 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c

@@ -470,7 +470,10 @@
 	atomic_inc(&probe_count);
 	pr_debug("bus: '%s': %s: probing driver %s with device %s\n",
 		 drv->bus->name, __func__, drv->name, dev_name(dev));
-	WARN_ON(!list_empty(&dev->devres_head));
+	if (!list_empty(&dev->devres_head)) {
+		dev_crit(dev, "Resources present before probing\n");
+		return -EBUSY;
+	}
 
 re_probe:
 	dev->driver = drv;
@@ -611,6 +614,7 @@
 		return -EBUSY;
 	return 0;
 }
+EXPORT_SYMBOL(driver_probe_done);
 
 /**
  * wait_for_device_probe

diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index f776807..5b6b1b7e 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c

@@ -349,6 +349,7 @@
 int devtmpfs_mount(const char *mntdir)
 {
 	int err;
+	int mflags = MS_SILENT;
 
 	if (!mount_dev)
 		return 0;
@@ -356,8 +357,10 @@
 	if (!thread)
 		return 0;
 
-	err = ksys_mount("devtmpfs", (char *)mntdir, "devtmpfs", MS_SILENT,
-			 NULL);
+#ifdef CONFIG_DEVTMPFS_SAFE
+	mflags |= MS_NOEXEC | MS_NOSUID;
+#endif
+	err = ksys_mount("devtmpfs", (char *)mntdir, "devtmpfs", mflags, NULL);
 	if (err)
 		printk(KERN_INFO "devtmpfs: error mounting %i\n", err);
 	else

diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index e9be1f5..d1f901b 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c

@@ -27,6 +27,7 @@
 #include <linux/limits.h>
 #include <linux/property.h>
 #include <linux/kmemleak.h>
+#include <linux/types.h>
 
 #include "base.h"
 #include "power/power.h"
@@ -67,7 +68,7 @@
 struct resource *platform_get_resource(struct platform_device *dev,
 				       unsigned int type, unsigned int num)
 {
-	int i;
+	u32 i;
 
 	for (i = 0; i < dev->num_resources; i++) {
 		struct resource *r = &dev->resource[i];
@@ -162,7 +163,7 @@
 					      unsigned int type,
 					      const char *name)
 {
-	int i;
+	u32 i;
 
 	for (i = 0; i < dev->num_resources; i++) {
 		struct resource *r = &dev->resource[i];
@@ -359,7 +360,8 @@
  */
 int platform_device_add(struct platform_device *pdev)
 {
-	int i, ret;
+	u32 i;
+	int ret;
 
 	if (!pdev)
 		return -EINVAL;
@@ -425,7 +427,7 @@
 		pdev->id = PLATFORM_DEVID_AUTO;
 	}
 
-	while (--i >= 0) {
+	while (i--) {
 		struct resource *r = &pdev->resource[i];
 		if (r->parent)
 			release_resource(r);
@@ -446,7 +448,7 @@
  */
 void platform_device_del(struct platform_device *pdev)
 {
-	int i;
+	u32 i;
 
 	if (pdev) {
 		device_remove_properties(&pdev->dev);

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 3b382a7..09678fa 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c

@@ -1773,7 +1773,12 @@
 		dev->power.direct_complete = false;
 
 	if (dev->power.direct_complete) {
-		if (pm_runtime_status_suspended(dev)) {
+		/*
+		 * Check if we're runtime suspended. If not, try to runtime
+		 * suspend for autosuspend cases.
+		 */
+		if (pm_runtime_status_suspended(dev) ||
+		    !pm_runtime_suspend(dev)) {
 			pm_runtime_disable(dev);
 			if (pm_runtime_status_suspended(dev))
 				goto Complete;

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 2dfa2e0..33773c5 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c

@@ -818,7 +818,7 @@
 	srcuidx = srcu_read_lock(&wakeup_srcu);
 	list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
 		if (ws->active) {
-			pr_debug("active wakeup source: %s\n", ws->name);
+			pm_pr_dbg("active wakeup source: %s\n", ws->name);
 			active = 1;
 		} else if (!active &&
 			   (!last_activity_ws ||
@@ -829,7 +829,7 @@
 	}
 
 	if (!active && last_activity_ws)
-		pr_debug("last active wakeup source: %s\n",
+		pm_pr_dbg("last active wakeup source: %s\n",
 			last_activity_ws->name);
 	srcu_read_unlock(&wakeup_srcu, srcuidx);
 }
@@ -859,7 +859,7 @@
 	raw_spin_unlock_irqrestore(&events_lock, flags);
 
 	if (ret) {
-		pr_debug("PM: Wakeup pending, aborting suspend\n");
+		pm_pr_dbg("Wakeup pending, aborting suspend\n");
 		pm_print_active_wakeup_sources();
 	}
 

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 17defbf..02e8fff 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c

@@ -463,6 +463,25 @@
 	return kobj;
 }
 
+static inline void brd_check_and_reset_par(void)
+{
+	if (unlikely(!max_part))
+		max_part = 1;
+
+	/*
+	 * make sure 'max_part' can be divided exactly by (1U << MINORBITS),
+	 * otherwise, it is possiable to get same dev_t when adding partitions.
+	 */
+	if ((1U << MINORBITS) % max_part != 0)
+		max_part = 1UL << fls(max_part);
+
+	if (max_part > DISK_MAX_PARTS) {
+		pr_info("brd: max_part can't be larger than %d, reset max_part = %d.\n",
+			DISK_MAX_PARTS, DISK_MAX_PARTS);
+		max_part = DISK_MAX_PARTS;
+	}
+}
+
 static int __init brd_init(void)
 {
 	struct brd_device *brd, *next;
@@ -486,8 +505,7 @@
 	if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
 		return -EIO;
 
-	if (unlikely(!max_part))
-		max_part = 1;
+	brd_check_and_reset_par();
 
 	for (i = 0; i < rd_nr; i++) {
 		brd = brd_alloc(i);

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index e71589e..bf222c4 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c

@@ -852,14 +852,17 @@
 /* selects the fdc and drive, and enables the fdc's input/dma. */
 static void set_fdc(int drive)
 {
+	unsigned int new_fdc = fdc;
+
 	if (drive >= 0 && drive < N_DRIVE) {
-		fdc = FDC(drive);
+		new_fdc = FDC(drive);
 		current_drive = drive;
 	}
-	if (fdc != 1 && fdc != 0) {
+	if (new_fdc >= N_FDC) {
 		pr_info("bad fdc value\n");
 		return;
 	}
+	fdc = new_fdc;
 	set_dor(fdc, ~0, 8);
 #if N_FDC > 1
 	set_dor(1 - fdc, ~8, 0);

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 9cd231a..345e987 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c

@@ -426,11 +426,12 @@
 	 * information.
 	 */
 	struct file *file = lo->lo_backing_file;
+	struct request_queue *q = lo->lo_queue;
 	int ret;
 
 	mode |= FALLOC_FL_KEEP_SIZE;
 
-	if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
+	if (!blk_queue_discard(q)) {
 		ret = -EOPNOTSUPP;
 		goto out;
 	}
@@ -459,7 +460,9 @@
 
 	if (!cmd->use_aio || cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) ||
 	    req_op(rq) != REQ_OP_READ) {
-		if (cmd->ret < 0)
+		if (cmd->ret == -EOPNOTSUPP)
+			ret = BLK_STS_NOTSUPP;
+		else if (cmd->ret < 0)
 			ret = BLK_STS_IOERR;
 		goto end_io;
 	}
@@ -863,6 +866,21 @@
 	struct file *file = lo->lo_backing_file;
 	struct inode *inode = file->f_mapping->host;
 	struct request_queue *q = lo->lo_queue;
+	struct request_queue *backingq;
+
+	/*
+	 * If the backing device is a block device, mirror its zeroing
+	 * capability. REQ_OP_DISCARD translates to a zero-out even when backed
+	 * by block devices to keep consistent behavior with file-backed loop
+	 * devices.
+	 */
+	if (S_ISBLK(inode->i_mode) && !lo->lo_encrypt_key_size) {
+		backingq = bdev_get_queue(inode->i_bdev);
+		blk_queue_max_discard_sectors(q,
+			backingq->limits.max_write_zeroes_sectors);
+
+		blk_queue_max_write_zeroes_sectors(q,
+			backingq->limits.max_write_zeroes_sectors);
 
 	/*
 	 * We use punch hole to reclaim the free space used by the
@@ -870,22 +888,24 @@
 	 * encryption is enabled, because it may give an attacker
 	 * useful information.
 	 */
-	if ((!file->f_op->fallocate) ||
-	    lo->lo_encrypt_key_size) {
+	} else if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
 		q->limits.discard_granularity = 0;
 		q->limits.discard_alignment = 0;
 		blk_queue_max_discard_sectors(q, 0);
 		blk_queue_max_write_zeroes_sectors(q, 0);
-		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
-		return;
+
+	} else {
+		q->limits.discard_granularity = inode->i_sb->s_blocksize;
+		q->limits.discard_alignment = 0;
+
+		blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
+		blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
 	}
 
-	q->limits.discard_granularity = inode->i_sb->s_blocksize;
-	q->limits.discard_alignment = 0;
-
-	blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
-	blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
-	blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
+	if (q->limits.max_write_zeroes_sectors)
+		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
+	else
+		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
 }
 
 static void loop_unprepare_queue(struct loop_device *lo)
@@ -911,6 +931,24 @@
 	return 0;
 }
 
+static void loop_update_rotational(struct loop_device *lo)
+{
+	struct file *file = lo->lo_backing_file;
+	struct inode *file_inode = file->f_mapping->host;
+	struct block_device *file_bdev = file_inode->i_sb->s_bdev;
+	struct request_queue *q = lo->lo_queue;
+	bool nonrot = true;
+
+	/* not all filesystems (e.g. tmpfs) have a sb->s_bdev */
+	if (file_bdev)
+		nonrot = blk_queue_nonrot(bdev_get_queue(file_bdev));
+
+	if (nonrot)
+		blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
+	else
+		blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
+}
+
 static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 		       struct block_device *bdev, unsigned int arg)
 {
@@ -974,6 +1012,7 @@
 	if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
 		blk_queue_write_cache(lo->lo_queue, true, false);
 
+	loop_update_rotational(lo);
 	loop_update_dio(lo);
 	set_capacity(lo->lo_disk, size);
 	bd_set_size(bdev, size << 9);
@@ -1904,7 +1943,10 @@
  failed:
 	/* complete non-aio request */
 	if (!cmd->use_aio || ret) {
-		cmd->ret = ret ? -EIO : 0;
+		if (ret == -EOPNOTSUPP)
+			cmd->ret = ret;
+		else
+			cmd->ret = ret ? -EIO : 0;
 		blk_mq_complete_request(rq);
 	}
 }

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index b9d321b..226103a 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c

@@ -1216,6 +1216,16 @@
 		args = kzalloc(sizeof(*args), GFP_KERNEL);
 		if (!args) {
 			sock_shutdown(nbd);
+			/*
+			 * If num_connections is m (2 < m),
+			 * and NO.1 ~ NO.n(1 < n < m) kzallocs are successful.
+			 * But NO.(n + 1) failed. We still have n recv threads.
+			 * So, add flush_workqueue here to prevent recv threads
+			 * dropping the last config_refs and trying to destroy
+			 * the workqueue from inside the workqueue.
+			 */
+			if (i)
+				flush_workqueue(nbd->recv_workq);
 			return -ENOMEM;
 		}
 		sk_set_memalloc(config->socks[i]->sock->sk);

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index b942f4c..d3ad1b8 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c

@@ -2097,7 +2097,7 @@
 			       u64 off, u64 len)
 {
 	struct ceph_file_extent ex = { off, len };
-	union rbd_img_fill_iter dummy;
+	union rbd_img_fill_iter dummy = {};
 	struct rbd_img_fill_ctx fctx = {
 		.pos_type = OBJ_REQUEST_NODATA,
 		.pos = &dummy,

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index dd64f58..d54cacc 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c

@@ -18,6 +18,7 @@
 
 #define PART_BITS 4
 #define VQ_NAME_LEN 16
+#define DISCARD_MAX_SEGMENTS 256
 
 static int major;
 static DEFINE_IDA(vd_index_ida);
@@ -172,10 +173,50 @@
 	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
 }
 
+
+static inline int virtblk_setup_discard_write_zeroes(struct request *req,
+						     bool unmap)
+{
+	unsigned short segments = blk_rq_nr_discard_segments(req);
+	unsigned short n = 0;
+	struct virtio_blk_discard_write_zeroes *range;
+	struct bio *bio;
+	u32 flags = 0;
+
+	if (unmap)
+		flags |= VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP;
+
+	range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
+	if (!range)
+		return -ENOMEM;
+
+	__rq_for_each_bio(bio, req) {
+		u64 sector = bio->bi_iter.bi_sector;
+		u32 num_sectors = bio->bi_iter.bi_size >> 9;
+
+		range[n].flags = cpu_to_le32(flags);
+		range[n].num_sectors = cpu_to_le32(num_sectors);
+		range[n].sector = cpu_to_le64(sector);
+		n++;
+	}
+
+	req->special_vec.bv_page = virt_to_page(range);
+	req->special_vec.bv_offset = offset_in_page(range);
+	req->special_vec.bv_len = sizeof(*range) * segments;
+	req->rq_flags |= RQF_SPECIAL_PAYLOAD;
+
+	return 0;
+}
+
 static inline void virtblk_request_done(struct request *req)
 {
 	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
 
+	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
+		kfree(page_address(req->special_vec.bv_page) +
+		      req->special_vec.bv_offset);
+	}
+
 	switch (req_op(req)) {
 	case REQ_OP_SCSI_IN:
 	case REQ_OP_SCSI_OUT:
@@ -225,6 +266,7 @@
 	int qid = hctx->queue_num;
 	int err;
 	bool notify = false;
+	bool unmap = false;
 	u32 type;
 
 	BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
@@ -237,6 +279,13 @@
 	case REQ_OP_FLUSH:
 		type = VIRTIO_BLK_T_FLUSH;
 		break;
+	case REQ_OP_DISCARD:
+		type = VIRTIO_BLK_T_DISCARD;
+		break;
+	case REQ_OP_WRITE_ZEROES:
+		type = VIRTIO_BLK_T_WRITE_ZEROES;
+		unmap = !(req->cmd_flags & REQ_NOUNMAP);
+		break;
 	case REQ_OP_SCSI_IN:
 	case REQ_OP_SCSI_OUT:
 		type = VIRTIO_BLK_T_SCSI_CMD;
@@ -256,6 +305,12 @@
 
 	blk_mq_start_request(req);
 
+	if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) {
+		err = virtblk_setup_discard_write_zeroes(req, unmap);
+		if (err)
+			return BLK_STS_RESOURCE;
+	}
+
 	num = blk_rq_map_sg(hctx->queue, req, vbr->sg);
 	if (num) {
 		if (rq_data_dir(req) == WRITE)
@@ -779,6 +834,42 @@
 	if (!err && opt_io_size)
 		blk_queue_io_opt(q, blk_size * opt_io_size);
 
+	if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
+		q->limits.discard_granularity = blk_size;
+
+		virtio_cread(vdev, struct virtio_blk_config,
+				discard_sector_alignment, &v);
+		if (v)
+			q->limits.discard_alignment = v << 9;
+		else
+			q->limits.discard_alignment = 0;
+
+		virtio_cread(vdev, struct virtio_blk_config,
+				max_discard_sectors, &v);
+		if (v)
+			blk_queue_max_discard_sectors(q, v);
+		else
+			blk_queue_max_discard_sectors(q, UINT_MAX);
+
+		virtio_cread(vdev, struct virtio_blk_config, max_discard_seg,
+				&v);
+		if (v && v <= DISCARD_MAX_SEGMENTS)
+			blk_queue_max_discard_segments(q, v);
+		else
+			blk_queue_max_discard_segments(q, DISCARD_MAX_SEGMENTS);
+
+		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
+	}
+
+	if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) {
+		virtio_cread(vdev, struct virtio_blk_config,
+				max_write_zeroes_sectors, &v);
+		if (v)
+			blk_queue_max_write_zeroes_sectors(q, v);
+		else
+			blk_queue_max_write_zeroes_sectors(q, UINT_MAX);
+	}
+
 	virtblk_update_capacity(vblk, false);
 	virtio_device_ready(vdev);
 
@@ -887,14 +978,14 @@
 	VIRTIO_BLK_F_SCSI,
 #endif
 	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
-	VIRTIO_BLK_F_MQ,
+	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
 }
 ;
 static unsigned int features[] = {
 	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
 	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
 	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
-	VIRTIO_BLK_F_MQ,
+	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
 };
 
 static struct virtio_driver virtio_blk = {

diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
index 4ed0a78..4d9a388 100644
--- a/drivers/block/zram/zcomp.c
+++ b/drivers/block/zram/zcomp.c

@@ -20,6 +20,7 @@
 
 static const char * const backends[] = {
 	"lzo",
+	"lzo-rle",
 #if IS_ENABLED(CONFIG_CRYPTO_LZ4)
 	"lz4",
 #endif

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 76abe40..e2c9e76 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c

@@ -41,7 +41,7 @@
 static DEFINE_MUTEX(zram_index_mutex);
 
 static int zram_major;
-static const char *default_compressor = "lzo";
+static const char *default_compressor = "lzo-rle";
 
 /* Module params (documentation at end) */
 static unsigned int num_devices = 1;
@@ -1588,23 +1588,55 @@
 	return len;
 }
 
-static int zram_open(struct block_device *bdev, fmode_t mode)
+int zram_open(struct block_device *bdev, fmode_t mode)
 {
-	int ret = 0;
 	struct zram *zram;
+	int open_count;
 
 	WARN_ON(!mutex_is_locked(&bdev->bd_mutex));
 
 	zram = bdev->bd_disk->private_data;
 	/* zram was claimed to reset so open request fails */
 	if (zram->claim)
-		ret = -EBUSY;
+		goto out_busy;
 
-	return ret;
+	/*
+	 * Chromium OS specific behavior:
+	 * sys_swapon opens the device once to populate its swapinfo->swap_file
+	 * and once when it claims the block device (blkdev_get).  By limiting
+	 * the maximum number of opens to 2, we ensure there are no prior open
+	 * references before swap is enabled.
+	 * (Note, kzalloc ensures nr_opens starts at 0.)
+	 */
+	open_count = atomic_inc_return(&zram->nr_opens);
+	if (open_count > 2)
+		goto out_busy_dec_nr_opens;
+	/*
+	 * swapon(2) claims the block device after setup.  If a zram is claimed
+	 * then open attempts are rejected. This is belt-and-suspenders as the
+	 * the block device and swap_file will both hold open nr_opens until
+	 * swapoff(2) is called.
+	 */
+	if (bdev->bd_holder != NULL)
+		goto out_busy_dec_nr_opens;
+
+	return 0;
+
+out_busy_dec_nr_opens:
+	atomic_dec(&zram->nr_opens);
+out_busy:
+	return -EBUSY;
+}
+
+void zram_release(struct gendisk *disk, fmode_t mode)
+{
+	struct zram *zram = disk->private_data;
+	atomic_dec(&zram->nr_opens);
 }
 
 static const struct block_device_operations zram_devops = {
 	.open = zram_open,
+	.release = zram_release,
 	.swap_slot_free_notify = zram_slot_free_notify,
 	.rw_page = zram_rw_page,
 	.owner = THIS_MODULE

diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index d1095df..e934528 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h

@@ -16,6 +16,8 @@
 #define _ZRAM_DRV_H_
 
 #include <linux/rwsem.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
 #include <linux/zsmalloc.h>
 #include <linux/crypto.h>
 
@@ -93,6 +95,8 @@
 	 * the number of pages zram can consume for storing compressed data
 	 */
 	unsigned long limit_pages;
+	int max_comp_streams;
+	atomic_t nr_opens;	/* number of active file handles */
 
 	struct zram_stats stats;
 	/*

diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c
index af44db2..fec6794 100644
--- a/drivers/char/ipmi/ipmi_ssif.c
+++ b/drivers/char/ipmi/ipmi_ssif.c

@@ -735,10 +735,14 @@
 	flags = ipmi_ssif_lock_cond(ssif_info, &oflags);
 	msg = ssif_info->curr_msg;
 	if (msg) {
+		if (data) {
+			if (len > IPMI_MAX_MSG_LENGTH)
+				len = IPMI_MAX_MSG_LENGTH;
+			memcpy(msg->rsp, data, len);
+		} else {
+			len = 0;
+		}
 		msg->rsp_size = len;
-		if (msg->rsp_size > IPMI_MAX_MSG_LENGTH)
-			msg->rsp_size = IPMI_MAX_MSG_LENGTH;
-		memcpy(msg->rsp, data, msg->rsp_size);
 		ssif_info->curr_msg = NULL;
 	}
 

diff --git a/drivers/clk/clk-bulk.c b/drivers/clk/clk-bulk.c
index 6904ed6..6a7118d 100644
--- a/drivers/clk/clk-bulk.c
+++ b/drivers/clk/clk-bulk.c

@@ -17,8 +17,65 @@
  */
 
 #include <linux/clk.h>
+#include <linux/clk-provider.h>
 #include <linux/device.h>
 #include <linux/export.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+
+static int __must_check of_clk_bulk_get(struct device_node *np, int num_clks,
+					struct clk_bulk_data *clks)
+{
+	int ret;
+	int i;
+
+	for (i = 0; i < num_clks; i++)
+		clks[i].clk = NULL;
+
+	for (i = 0; i < num_clks; i++) {
+		clks[i].clk = of_clk_get(np, i);
+		if (IS_ERR(clks[i].clk)) {
+			ret = PTR_ERR(clks[i].clk);
+			pr_err("%pOF: Failed to get clk index: %d ret: %d\n",
+			       np, i, ret);
+			clks[i].clk = NULL;
+			goto err;
+		}
+	}
+
+	return 0;
+
+err:
+	clk_bulk_put(i, clks);
+
+	return ret;
+}
+
+static int __must_check of_clk_bulk_get_all(struct device_node *np,
+					    struct clk_bulk_data **clks)
+{
+	struct clk_bulk_data *clk_bulk;
+	int num_clks;
+	int ret;
+
+	num_clks = of_clk_get_parent_count(np);
+	if (!num_clks)
+		return 0;
+
+	clk_bulk = kmalloc_array(num_clks, sizeof(*clk_bulk), GFP_KERNEL);
+	if (!clk_bulk)
+		return -ENOMEM;
+
+	ret = of_clk_bulk_get(np, num_clks, clk_bulk);
+	if (ret) {
+		kfree(clk_bulk);
+		return ret;
+	}
+
+	*clks = clk_bulk;
+
+	return num_clks;
+}
 
 void clk_bulk_put(int num_clks, struct clk_bulk_data *clks)
 {
@@ -59,6 +116,29 @@
 }
 EXPORT_SYMBOL(clk_bulk_get);
 
+void clk_bulk_put_all(int num_clks, struct clk_bulk_data *clks)
+{
+	if (IS_ERR_OR_NULL(clks))
+		return;
+
+	clk_bulk_put(num_clks, clks);
+
+	kfree(clks);
+}
+EXPORT_SYMBOL(clk_bulk_put_all);
+
+int __must_check clk_bulk_get_all(struct device *dev,
+				  struct clk_bulk_data **clks)
+{
+	struct device_node *np = dev_of_node(dev);
+
+	if (!np)
+		return 0;
+
+	return of_clk_bulk_get_all(np, clks);
+}
+EXPORT_SYMBOL(clk_bulk_get_all);
+
 #ifdef CONFIG_HAVE_CLK_PREPARE
 
 /**

diff --git a/drivers/clk/clk-devres.c b/drivers/clk/clk-devres.c
index d854e26..12c8745 100644
--- a/drivers/clk/clk-devres.c
+++ b/drivers/clk/clk-devres.c

@@ -70,6 +70,30 @@
 }
 EXPORT_SYMBOL_GPL(devm_clk_bulk_get);
 
+int __must_check devm_clk_bulk_get_all(struct device *dev,
+				       struct clk_bulk_data **clks)
+{
+	struct clk_bulk_devres *devres;
+	int ret;
+
+	devres = devres_alloc(devm_clk_bulk_release,
+			      sizeof(*devres), GFP_KERNEL);
+	if (!devres)
+		return -ENOMEM;
+
+	ret = clk_bulk_get_all(dev, &devres->clks);
+	if (ret > 0) {
+		*clks = devres->clks;
+		devres->num_clks = ret;
+		devres_add(dev, devres);
+	} else {
+		devres_free(devres);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(devm_clk_bulk_get_all);
+
 static int devm_clk_match(struct device *dev, void *res, void *data)
 {
 	struct clk **c = res;

diff --git a/drivers/clk/qcom/clk-rcg2.c b/drivers/clk/qcom/clk-rcg2.c
index 51b2388..ee693e1 100644
--- a/drivers/clk/qcom/clk-rcg2.c
+++ b/drivers/clk/qcom/clk-rcg2.c

@@ -203,6 +203,9 @@
 
 	clk_flags = clk_hw_get_flags(hw);
 	p = clk_hw_get_parent_by_index(hw, index);
+	if (!p)
+		return -EINVAL;
+
 	if (clk_flags & CLK_SET_RATE_PARENT) {
 		rate = f->freq;
 		if (f->pre_div) {

diff --git a/drivers/clk/sunxi-ng/ccu-sun50i-a64.c b/drivers/clk/sunxi-ng/ccu-sun50i-a64.c
index dec4a13..9ac6c29 100644
--- a/drivers/clk/sunxi-ng/ccu-sun50i-a64.c
+++ b/drivers/clk/sunxi-ng/ccu-sun50i-a64.c

@@ -901,11 +901,26 @@
 	.num_resets	= ARRAY_SIZE(sun50i_a64_ccu_resets),
 };
 
+static struct ccu_pll_nb sun50i_a64_pll_cpu_nb = {
+	.common	= &pll_cpux_clk.common,
+	/* copy from pll_cpux_clk */
+	.enable	= BIT(31),
+	.lock	= BIT(28),
+};
+
+static struct ccu_mux_nb sun50i_a64_cpu_nb = {
+	.common		= &cpux_clk.common,
+	.cm		= &cpux_clk.mux,
+	.delay_us	= 1, /* > 8 clock cycles at 24 MHz */
+	.bypass_index	= 1, /* index of 24 MHz oscillator */
+};
+
 static int sun50i_a64_ccu_probe(struct platform_device *pdev)
 {
 	struct resource *res;
 	void __iomem *reg;
 	u32 val;
+	int ret;
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	reg = devm_ioremap_resource(&pdev->dev, res);
@@ -919,7 +934,18 @@
 
 	writel(0x515, reg + SUN50I_A64_PLL_MIPI_REG);
 
-	return sunxi_ccu_probe(pdev->dev.of_node, reg, &sun50i_a64_ccu_desc);
+	ret = sunxi_ccu_probe(pdev->dev.of_node, reg, &sun50i_a64_ccu_desc);
+	if (ret)
+		return ret;
+
+	/* Gate then ungate PLL CPU after any rate changes */
+	ccu_pll_notifier_register(&sun50i_a64_pll_cpu_nb);
+
+	/* Reparent CPU during PLL CPU rate changes */
+	ccu_mux_notifier_register(pll_cpux_clk.common.hw.clk,
+				  &sun50i_a64_cpu_nb);
+
+	return 0;
 }
 
 static const struct of_device_id sun50i_a64_ccu_ids[] = {

diff --git a/drivers/clk/uniphier/clk-uniphier-peri.c b/drivers/clk/uniphier/clk-uniphier-peri.c
index 89b3ac3..8b75dc1 100644
--- a/drivers/clk/uniphier/clk-uniphier-peri.c
+++ b/drivers/clk/uniphier/clk-uniphier-peri.c

@@ -27,8 +27,8 @@
 #define UNIPHIER_PERI_CLK_FI2C(idx, ch)					\
 	UNIPHIER_CLK_GATE("i2c" #ch, (idx), "i2c", 0x24, 24 + (ch))
 
-#define UNIPHIER_PERI_CLK_SCSSI(idx)					\
-	UNIPHIER_CLK_GATE("scssi", (idx), "spi", 0x20, 17)
+#define UNIPHIER_PERI_CLK_SCSSI(idx, ch)				\
+	UNIPHIER_CLK_GATE("scssi" #ch, (idx), "spi", 0x20, 17 + (ch))
 
 #define UNIPHIER_PERI_CLK_MCSSI(idx)					\
 	UNIPHIER_CLK_GATE("mcssi", (idx), "spi", 0x24, 14)
@@ -44,7 +44,7 @@
 	UNIPHIER_PERI_CLK_I2C(6, 2),
 	UNIPHIER_PERI_CLK_I2C(7, 3),
 	UNIPHIER_PERI_CLK_I2C(8, 4),
-	UNIPHIER_PERI_CLK_SCSSI(11),
+	UNIPHIER_PERI_CLK_SCSSI(11, 0),
 	{ /* sentinel */ }
 };
 
@@ -60,7 +60,10 @@
 	UNIPHIER_PERI_CLK_FI2C(8, 4),
 	UNIPHIER_PERI_CLK_FI2C(9, 5),
 	UNIPHIER_PERI_CLK_FI2C(10, 6),
-	UNIPHIER_PERI_CLK_SCSSI(11),
-	UNIPHIER_PERI_CLK_MCSSI(12),
+	UNIPHIER_PERI_CLK_SCSSI(11, 0),
+	UNIPHIER_PERI_CLK_SCSSI(12, 1),
+	UNIPHIER_PERI_CLK_SCSSI(13, 2),
+	UNIPHIER_PERI_CLK_SCSSI(14, 3),
+	UNIPHIER_PERI_CLK_MCSSI(15),
 	{ /* sentinel */ }
 };

diff --git a/drivers/clocksource/bcm2835_timer.c b/drivers/clocksource/bcm2835_timer.c
index 60da253..1082dce 100644
--- a/drivers/clocksource/bcm2835_timer.c
+++ b/drivers/clocksource/bcm2835_timer.c

@@ -134,7 +134,7 @@
 	ret = setup_irq(irq, &timer->act);
 	if (ret) {
 		pr_err("Can't set up timer IRQ\n");
-		goto err_iounmap;
+		goto err_timer_free;
 	}
 
 	clockevents_config_and_register(&timer->evt, freq, 0xf, 0xffffffff);
@@ -143,6 +143,9 @@
 
 	return 0;
 
+err_timer_free:
+	kfree(timer);
+
 err_iounmap:
 	iounmap(base);
 	return ret;

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index e35c397..8cfd8f4 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c

@@ -2280,6 +2280,7 @@
 		ret = cpufreq_start_governor(policy);
 		if (!ret) {
 			pr_debug("cpufreq: governor change\n");
+			sched_cpufreq_governor_change(policy, old_gov);
 			return 0;
 		}
 		cpufreq_exit_governor(policy);

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 6df894d..96a3a9b 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c

@@ -221,7 +221,7 @@
 	}
 
 	/* Take note of the planned idle state. */
-	sched_idle_set_state(target_state);
+	sched_idle_set_state(target_state, index);
 
 	trace_cpu_idle_rcuidle(index, dev->cpu);
 	time_start = ns_to_ktime(local_clock());
@@ -235,7 +235,7 @@
 	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
 
 	/* The cpu is no longer idle or about to enter idle. */
-	sched_idle_set_state(NULL);
+	sched_idle_set_state(NULL, -1);
 
 	if (broadcast) {
 		if (WARN_ON_ONCE(!irqs_disabled()))

diff --git a/drivers/crypto/chelsio/chtls/chtls_cm.c b/drivers/crypto/chelsio/chtls/chtls_cm.c
index 8b749c7..28d2411 100644
--- a/drivers/crypto/chelsio/chtls/chtls_cm.c
+++ b/drivers/crypto/chelsio/chtls/chtls_cm.c

@@ -731,6 +731,14 @@
 	return 0;
 }
 
+static void chtls_purge_wr_queue(struct sock *sk)
+{
+	struct sk_buff *skb;
+
+	while ((skb = dequeue_wr(sk)) != NULL)
+		kfree_skb(skb);
+}
+
 static void chtls_release_resources(struct sock *sk)
 {
 	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
@@ -745,6 +753,11 @@
 	kfree_skb(csk->txdata_skb_cache);
 	csk->txdata_skb_cache = NULL;
 
+	if (csk->wr_credits != csk->wr_max_credits) {
+		chtls_purge_wr_queue(sk);
+		chtls_reset_wr_list(csk);
+	}
+
 	if (csk->l2t_entry) {
 		cxgb4_l2t_release(csk->l2t_entry);
 		csk->l2t_entry = NULL;
@@ -1714,6 +1727,7 @@
 		else
 			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
 	}
+	kfree_skb(skb);
 }
 
 static void chtls_close_con_rpl(struct sock *sk, struct sk_buff *skb)
@@ -2041,19 +2055,6 @@
 	return 0;
 }
 
-static struct sk_buff *dequeue_wr(struct sock *sk)
-{
-	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
-	struct sk_buff *skb = csk->wr_skb_head;
-
-	if (likely(skb)) {
-	/* Don't bother clearing the tail */
-		csk->wr_skb_head = WR_SKB_CB(skb)->next_wr;
-		WR_SKB_CB(skb)->next_wr = NULL;
-	}
-	return skb;
-}
-
 static void chtls_rx_ack(struct sock *sk, struct sk_buff *skb)
 {
 	struct cpl_fw4_ack *hdr = cplhdr(skb) + RSS_HDR;

diff --git a/drivers/crypto/chelsio/chtls/chtls_cm.h b/drivers/crypto/chelsio/chtls/chtls_cm.h
index 78eb3af..4282d8a 100644
--- a/drivers/crypto/chelsio/chtls/chtls_cm.h
+++ b/drivers/crypto/chelsio/chtls/chtls_cm.h

@@ -188,6 +188,12 @@
 	kfree_skb(skb);
 }
 
+static inline void chtls_reset_wr_list(struct chtls_sock *csk)
+{
+	csk->wr_skb_head = NULL;
+	csk->wr_skb_tail = NULL;
+}
+
 static inline void enqueue_wr(struct chtls_sock *csk, struct sk_buff *skb)
 {
 	WR_SKB_CB(skb)->next_wr = NULL;
@@ -200,4 +206,19 @@
 		WR_SKB_CB(csk->wr_skb_tail)->next_wr = skb;
 	csk->wr_skb_tail = skb;
 }
+
+static inline struct sk_buff *dequeue_wr(struct sock *sk)
+{
+	struct chtls_sock *csk = rcu_dereference_sk_user_data(sk);
+	struct sk_buff *skb = NULL;
+
+	skb = csk->wr_skb_head;
+
+	if (likely(skb)) {
+	 /* Don't bother clearing the tail */
+		csk->wr_skb_head = WR_SKB_CB(skb)->next_wr;
+		WR_SKB_CB(skb)->next_wr = NULL;
+	}
+	return skb;
+}
 #endif

diff --git a/drivers/crypto/chelsio/chtls/chtls_hw.c b/drivers/crypto/chelsio/chtls/chtls_hw.c
index 4909607..64d2482 100644
--- a/drivers/crypto/chelsio/chtls/chtls_hw.c
+++ b/drivers/crypto/chelsio/chtls/chtls_hw.c

@@ -361,6 +361,7 @@
 	kwr->sc_imm.cmd_more = cpu_to_be32(ULPTX_CMD_V(ULP_TX_SC_IMM));
 	kwr->sc_imm.len = cpu_to_be32(klen);
 
+	lock_sock(sk);
 	/* key info */
 	kctx = (struct _key_ctx *)(kwr + 1);
 	ret = chtls_key_info(csk, kctx, keylen, optname);
@@ -399,8 +400,10 @@
 		csk->tlshws.txkey = keyid;
 	}
 
+	release_sock(sk);
 	return ret;
 out_notcb:
+	release_sock(sk);
 	free_tls_keyid(sk);
 out_nokey:
 	kfree_skb(skb);

diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index a89ebd9..41aaac6 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c

@@ -658,7 +658,7 @@
 	 * No 'host' or dax_operations since there is no access to this
 	 * device outside of mmap of the resulting character device.
 	 */
-	dax_dev = alloc_dax(dev_dax, NULL, NULL);
+	dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC);
 	if (!dax_dev) {
 		rc = -ENOMEM;
 		goto err_dax;

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 6e928f3..e3234fc 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c

@@ -72,26 +72,18 @@
 EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
 #endif
 
-/**
- * __bdev_dax_supported() - Check if the device supports dax for filesystem
- * @bdev: block device to check
- * @blocksize: The block size of the device
- *
- * This is a library function for filesystems to check if the block device
- * can be mounted with dax option.
- *
- * Return: true if supported, false if unsupported
- */
-bool __bdev_dax_supported(struct block_device *bdev, int blocksize)
+bool __generic_fsdax_supported(struct dax_device *dax_dev,
+		struct block_device *bdev, int blocksize, sector_t start,
+		sector_t sectors)
 {
-	struct dax_device *dax_dev;
 	bool dax_enabled = false;
-	struct request_queue *q;
-	pgoff_t pgoff;
-	int err, id;
-	pfn_t pfn;
-	long len;
+	pgoff_t pgoff, pgoff_end;
 	char buf[BDEVNAME_SIZE];
+	void *kaddr, *end_kaddr;
+	pfn_t pfn, end_pfn;
+	sector_t last_page;
+	long len, len2;
+	int err, id;
 
 	if (blocksize != PAGE_SIZE) {
 		pr_debug("%s: error: unsupported blocksize for dax\n",
@@ -99,36 +91,29 @@
 		return false;
 	}
 
-	q = bdev_get_queue(bdev);
-	if (!q || !blk_queue_dax(q)) {
-		pr_debug("%s: error: request queue doesn't support dax\n",
-				bdevname(bdev, buf));
-		return false;
-	}
-
-	err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
+	err = bdev_dax_pgoff(bdev, start, PAGE_SIZE, &pgoff);
 	if (err) {
 		pr_debug("%s: error: unaligned partition for dax\n",
 				bdevname(bdev, buf));
 		return false;
 	}
 
-	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
-	if (!dax_dev) {
-		pr_debug("%s: error: device does not support dax\n",
+	last_page = PFN_DOWN((start + sectors - 1) * 512) * PAGE_SIZE / 512;
+	err = bdev_dax_pgoff(bdev, last_page, PAGE_SIZE, &pgoff_end);
+	if (err) {
+		pr_debug("%s: error: unaligned partition for dax\n",
 				bdevname(bdev, buf));
 		return false;
 	}
 
 	id = dax_read_lock();
-	len = dax_direct_access(dax_dev, pgoff, 1, NULL, &pfn);
+	len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
+	len2 = dax_direct_access(dax_dev, pgoff_end, 1, &end_kaddr, &end_pfn);
 	dax_read_unlock(id);
 
-	put_dax(dax_dev);
-
-	if (len < 1) {
+	if (len < 1 || len2 < 1) {
 		pr_debug("%s: error: dax access failed (%ld)\n",
-				bdevname(bdev, buf), len);
+				bdevname(bdev, buf), len < 1 ? len : len2);
 		return false;
 	}
 
@@ -143,13 +128,20 @@
 		 */
 		WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API));
 		dax_enabled = true;
-	} else if (pfn_t_devmap(pfn)) {
-		struct dev_pagemap *pgmap;
+	} else if (pfn_t_devmap(pfn) && pfn_t_devmap(end_pfn)) {
+		struct dev_pagemap *pgmap, *end_pgmap;
 
 		pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL);
-		if (pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX)
+		end_pgmap = get_dev_pagemap(pfn_t_to_pfn(end_pfn), NULL);
+		if (pgmap && pgmap == end_pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX
+				&& pfn_t_to_page(pfn)->pgmap == pgmap
+				&& pfn_t_to_page(end_pfn)->pgmap == pgmap
+				&& pfn_t_to_pfn(pfn) == PHYS_PFN(__pa(kaddr))
+				&& pfn_t_to_pfn(end_pfn) == PHYS_PFN(__pa(end_kaddr)))
 			dax_enabled = true;
 		put_dev_pagemap(pgmap);
+		put_dev_pagemap(end_pgmap);
+
 	}
 
 	if (!dax_enabled) {
@@ -159,6 +151,49 @@
 	}
 	return true;
 }
+EXPORT_SYMBOL_GPL(__generic_fsdax_supported);
+
+/**
+ * __bdev_dax_supported() - Check if the device supports dax for filesystem
+ * @bdev: block device to check
+ * @blocksize: The block size of the device
+ *
+ * This is a library function for filesystems to check if the block device
+ * can be mounted with dax option.
+ *
+ * Return: true if supported, false if unsupported
+ */
+bool __bdev_dax_supported(struct block_device *bdev, int blocksize)
+{
+	struct dax_device *dax_dev;
+	struct request_queue *q;
+	char buf[BDEVNAME_SIZE];
+	bool ret;
+	int id;
+
+	q = bdev_get_queue(bdev);
+	if (!q || !blk_queue_dax(q)) {
+		pr_debug("%s: error: request queue doesn't support dax\n",
+				bdevname(bdev, buf));
+		return false;
+	}
+
+	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+	if (!dax_dev) {
+		pr_debug("%s: error: device does not support dax\n",
+				bdevname(bdev, buf));
+		return false;
+	}
+
+	id = dax_read_lock();
+	ret = dax_supported(dax_dev, bdev, blocksize, 0,
+			i_size_read(bdev->bd_inode) / 512);
+	dax_read_unlock(id);
+
+	put_dax(dax_dev);
+
+	return ret;
+}
 EXPORT_SYMBOL_GPL(__bdev_dax_supported);
 #endif
 
@@ -167,6 +202,8 @@
 	DAXDEV_ALIVE,
 	/* gate whether dax_flush() calls the low level flush routine */
 	DAXDEV_WRITE_CACHE,
+	/* flag to check if device supports synchronous flush */
+	DAXDEV_SYNC,
 };
 
 /**
@@ -284,6 +321,15 @@
 }
 EXPORT_SYMBOL_GPL(dax_direct_access);
 
+bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
+		int blocksize, sector_t start, sector_t len)
+{
+	if (!dax_alive(dax_dev))
+		return false;
+
+	return dax_dev->ops->dax_supported(dax_dev, bdev, blocksize, start, len);
+}
+
 size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
 		size_t bytes, struct iov_iter *i)
 {
@@ -335,6 +381,18 @@
 }
 EXPORT_SYMBOL_GPL(dax_write_cache_enabled);
 
+bool __dax_synchronous(struct dax_device *dax_dev)
+{
+	return test_bit(DAXDEV_SYNC, &dax_dev->flags);
+}
+EXPORT_SYMBOL_GPL(__dax_synchronous);
+
+void __set_dax_synchronous(struct dax_device *dax_dev)
+{
+	set_bit(DAXDEV_SYNC, &dax_dev->flags);
+}
+EXPORT_SYMBOL_GPL(__set_dax_synchronous);
+
 bool dax_alive(struct dax_device *dax_dev)
 {
 	lockdep_assert_held(&dax_srcu);
@@ -488,7 +546,7 @@
 }
 
 struct dax_device *alloc_dax(void *private, const char *__host,
-		const struct dax_operations *ops)
+		const struct dax_operations *ops, unsigned long flags)
 {
 	struct dax_device *dax_dev;
 	const char *host;
@@ -511,6 +569,9 @@
 	dax_add_host(dax_dev, host);
 	dax_dev->ops = ops;
 	dax_dev->private = private;
+	if (flags & DAXDEV_F_SYNC)
+		set_dax_synchronous(dax_dev);
+
 	return dax_dev;
 
  err_dev:

diff --git a/drivers/devfreq/Kconfig b/drivers/devfreq/Kconfig
index 6a172d3..4c4ec68 100644
--- a/drivers/devfreq/Kconfig
+++ b/drivers/devfreq/Kconfig

@@ -103,7 +103,8 @@
 
 config ARM_RK3399_DMC_DEVFREQ
 	tristate "ARM RK3399 DMC DEVFREQ Driver"
-	depends on ARCH_ROCKCHIP
+	depends on (ARCH_ROCKCHIP && HAVE_ARM_SMCCC) || \
+		(COMPILE_TEST && HAVE_ARM_SMCCC)
 	select DEVFREQ_EVENT_ROCKCHIP_DFI
 	select DEVFREQ_GOV_SIMPLE_ONDEMAND
 	select PM_DEVFREQ_EVENT

diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index 8122d0e..06a981c 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c

@@ -600,7 +600,6 @@
 {
 	struct devfreq *devfreq;
 	struct devfreq_governor *governor;
-	static atomic_t devfreq_no = ATOMIC_INIT(-1);
 	int err = 0;
 
 	if (!dev || !profile || !governor_name) {
@@ -661,8 +660,7 @@
 	}
 	devfreq->max_freq = devfreq->scaling_max_freq;
 
-	dev_set_name(&devfreq->dev, "devfreq%d",
-				atomic_inc_return(&devfreq_no));
+	dev_set_name(&devfreq->dev, "%s", dev_name(dev));
 	err = device_register(&devfreq->dev);
 	if (err) {
 		mutex_unlock(&devfreq->lock);

diff --git a/drivers/devfreq/event/Kconfig b/drivers/devfreq/event/Kconfig
index cd94980..8851bc4 100644
--- a/drivers/devfreq/event/Kconfig
+++ b/drivers/devfreq/event/Kconfig

@@ -33,7 +33,7 @@
 
 config DEVFREQ_EVENT_ROCKCHIP_DFI
 	tristate "ROCKCHIP DFI DEVFREQ event Driver"
-	depends on ARCH_ROCKCHIP
+	depends on ARCH_ROCKCHIP || COMPILE_TEST
 	help
 	  This add the devfreq-event driver for Rockchip SoC. It provides DFI
 	  (DDR Monitor Module) driver to count ddr load.

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index f1a441ab..8a52a5e 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c

@@ -190,7 +190,7 @@
 
 static struct module *dma_chan_to_owner(struct dma_chan *chan)
 {
-	return chan->device->dev->driver->owner;
+	return chan->device->owner;
 }
 
 /**
@@ -923,6 +923,8 @@
 		return -EIO;
 	}
 
+	device->owner = device->dev->driver->owner;
+
 	if (dma_has_cap(DMA_MEMCPY, device->cap_mask) && !device->device_prep_dma_memcpy) {
 		dev_err(device->dev,
 			"Device claims capability %s, but op is not defined\n",

diff --git a/drivers/gpio/gpio-grgpio.c b/drivers/gpio/gpio-grgpio.c
index 60a1556..c1be299 100644
--- a/drivers/gpio/gpio-grgpio.c
+++ b/drivers/gpio/gpio-grgpio.c

@@ -258,17 +258,16 @@
 	lirq->irq = irq;
 	uirq = &priv->uirqs[lirq->index];
 	if (uirq->refcnt == 0) {
+		spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags);
 		ret = request_irq(uirq->uirq, grgpio_irq_handler, 0,
 				  dev_name(priv->dev), priv);
 		if (ret) {
 			dev_err(priv->dev,
 				"Could not request underlying irq %d\n",
 				uirq->uirq);
-
-			spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags);
-
 			return ret;
 		}
+		spin_lock_irqsave(&priv->gc.bgpio_lock, flags);
 	}
 	uirq->refcnt++;
 
@@ -314,8 +313,11 @@
 	if (index >= 0) {
 		uirq = &priv->uirqs[lirq->index];
 		uirq->refcnt--;
-		if (uirq->refcnt == 0)
+		if (uirq->refcnt == 0) {
+			spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags);
 			free_irq(uirq->uirq, priv);
+			return;
+		}
 	}
 
 	spin_unlock_irqrestore(&priv->gc.bgpio_lock, flags);

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
index bf872f6..d1fbaea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c

@@ -337,17 +337,9 @@
 		path_size += le16_to_cpu(path->usSize);
 
 		if (device_support & le16_to_cpu(path->usDeviceTag)) {
-			uint8_t con_obj_id, con_obj_num, con_obj_type;
-
-			con_obj_id =
+			uint8_t con_obj_id =
 			    (le16_to_cpu(path->usConnObjectId) & OBJECT_ID_MASK)
 			    >> OBJECT_ID_SHIFT;
-			con_obj_num =
-			    (le16_to_cpu(path->usConnObjectId) & ENUM_ID_MASK)
-			    >> ENUM_ID_SHIFT;
-			con_obj_type =
-			    (le16_to_cpu(path->usConnObjectId) &
-			     OBJECT_TYPE_MASK) >> OBJECT_TYPE_SHIFT;
 
 			/* Skip TV/CV support */
 			if ((le16_to_cpu(path->usDeviceTag) ==
@@ -372,14 +364,7 @@
 			router.ddc_valid = false;
 			router.cd_valid = false;
 			for (j = 0; j < ((le16_to_cpu(path->usSize) - 8) / 2); j++) {
-				uint8_t grph_obj_id, grph_obj_num, grph_obj_type;
-
-				grph_obj_id =
-				    (le16_to_cpu(path->usGraphicObjIds[j]) &
-				     OBJECT_ID_MASK) >> OBJECT_ID_SHIFT;
-				grph_obj_num =
-				    (le16_to_cpu(path->usGraphicObjIds[j]) &
-				     ENUM_ID_MASK) >> ENUM_ID_SHIFT;
+				uint8_t grph_obj_type=
 				grph_obj_type =
 				    (le16_to_cpu(path->usGraphicObjIds[j]) &
 				     OBJECT_TYPE_MASK) >> OBJECT_TYPE_SHIFT;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index bb5a47a4..5c76a815 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h

@@ -97,6 +97,7 @@
 	uint32_t                srbm_soft_reset;
 	bool			prt_warning;
 	uint64_t		stolen_size;
+	uint32_t		sdpif_register;
 	/* apertures */
 	u64			shared_aperture_start;
 	u64			shared_aperture_end;

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index ede27da..8b25940 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

@@ -993,6 +993,19 @@
 }
 
 /**
+ * gmc_v9_0_restore_registers - restores regs
+ *
+ * @adev: amdgpu_device pointer
+ *
+ * This restores register values, saved at suspend.
+ */
+static void gmc_v9_0_restore_registers(struct amdgpu_device *adev)
+{
+	if (adev->asic_type == CHIP_RAVEN)
+		WREG32(mmDCHUBBUB_SDPIF_MMIO_CNTRL_0, adev->gmc.sdpif_register);
+}
+
+/**
  * gmc_v9_0_gart_enable - gart enable
  *
  * @adev: amdgpu_device pointer
@@ -1081,6 +1094,20 @@
 }
 
 /**
+ * gmc_v9_0_save_registers - saves regs
+ *
+ * @adev: amdgpu_device pointer
+ *
+ * This saves potential register values that should be
+ * restored upon resume
+ */
+static void gmc_v9_0_save_registers(struct amdgpu_device *adev)
+{
+	if (adev->asic_type == CHIP_RAVEN)
+		adev->gmc.sdpif_register = RREG32(mmDCHUBBUB_SDPIF_MMIO_CNTRL_0);
+}
+
+/**
  * gmc_v9_0_gart_disable - gart disable
  *
  * @adev: amdgpu_device pointer
@@ -1112,9 +1139,16 @@
 
 static int gmc_v9_0_suspend(void *handle)
 {
+	int r;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
-	return gmc_v9_0_hw_fini(adev);
+	r = gmc_v9_0_hw_fini(adev);
+	if (r)
+		return r;
+
+	gmc_v9_0_save_registers(adev);
+
+	return 0;
 }
 
 static int gmc_v9_0_resume(void *handle)
@@ -1122,6 +1156,7 @@
 	int r;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+	gmc_v9_0_restore_registers(adev);
 	r = gmc_v9_0_hw_init(adev);
 	if (r)
 		return r;

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 83f2717..9e74f43 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c

@@ -205,7 +205,12 @@
 
 static u32 soc15_get_xclk(struct amdgpu_device *adev)
 {
-	return adev->clock.spll.reference_freq;
+	u32 reference_clock = adev->clock.spll.reference_freq;
+
+	if (adev->asic_type == CHIP_RAVEN)
+		return reference_clock / 4;
+
+	return reference_clock;
 }
 
 

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15_common.h b/drivers/gpu/drm/amd/amdgpu/soc15_common.h
index 0942f49..9d444f7 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15_common.h
+++ b/drivers/gpu/drm/amd/amdgpu/soc15_common.h

@@ -51,6 +51,7 @@
 	do {							\
 		uint32_t tmp_ = RREG32(adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg); \
 		uint32_t loop = adev->usec_timeout;		\
+		ret = 0;					\
 		while ((tmp_ & (mask)) != (expected_value)) {	\
 			udelay(2);				\
 			tmp_ = RREG32(adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg); \

diff --git a/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c b/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c
index 6342f64..b0956c3 100644
--- a/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c
+++ b/drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c

@@ -1346,6 +1346,7 @@
 	struct dc_context *ctx = dc->ctx;
 	struct dm_pp_clock_levels_with_voltage fclks = {0}, dcfclks = {0};
 	bool res;
+	unsigned vmin0p65_idx, vmid0p72_idx, vnom0p8_idx, vmax0p9_idx;
 
 	/* TODO: This is not the proper way to obtain fabric_and_dram_bandwidth, should be min(fclk, memclk) */
 	res = dm_pp_get_clock_levels_by_type_with_voltage(
@@ -1357,17 +1358,28 @@
 		res = verify_clock_values(&fclks);
 
 	if (res) {
-		ASSERT(fclks.num_levels >= 3);
-		dc->dcn_soc->fabric_and_dram_bandwidth_vmin0p65 = 32 * (fclks.data[0].clocks_in_khz / 1000.0) / 1000.0;
-		dc->dcn_soc->fabric_and_dram_bandwidth_vmid0p72 = dc->dcn_soc->number_of_channels *
-				(fclks.data[fclks.num_levels - (fclks.num_levels > 2 ? 3 : 2)].clocks_in_khz / 1000.0)
-				* ddr4_dram_factor_single_Channel / 1000.0;
-		dc->dcn_soc->fabric_and_dram_bandwidth_vnom0p8 = dc->dcn_soc->number_of_channels *
-				(fclks.data[fclks.num_levels - 2].clocks_in_khz / 1000.0)
-				* ddr4_dram_factor_single_Channel / 1000.0;
-		dc->dcn_soc->fabric_and_dram_bandwidth_vmax0p9 = dc->dcn_soc->number_of_channels *
-				(fclks.data[fclks.num_levels - 1].clocks_in_khz / 1000.0)
-				* ddr4_dram_factor_single_Channel / 1000.0;
+		ASSERT(fclks.num_levels);
+
+		vmin0p65_idx = 0;
+		vmid0p72_idx = fclks.num_levels -
+			(fclks.num_levels > 2 ? 3 : (fclks.num_levels > 1 ? 2 : 1));
+		vnom0p8_idx = fclks.num_levels - (fclks.num_levels > 1 ? 2 : 1);
+		vmax0p9_idx = fclks.num_levels - 1;
+
+		dc->dcn_soc->fabric_and_dram_bandwidth_vmin0p65 =
+			32 * (fclks.data[vmin0p65_idx].clocks_in_khz / 1000.0) / 1000.0;
+		dc->dcn_soc->fabric_and_dram_bandwidth_vmid0p72 =
+			dc->dcn_soc->number_of_channels *
+			(fclks.data[vmid0p72_idx].clocks_in_khz / 1000.0)
+			* ddr4_dram_factor_single_Channel / 1000.0;
+		dc->dcn_soc->fabric_and_dram_bandwidth_vnom0p8 =
+			dc->dcn_soc->number_of_channels *
+			(fclks.data[vnom0p8_idx].clocks_in_khz / 1000.0)
+			* ddr4_dram_factor_single_Channel / 1000.0;
+		dc->dcn_soc->fabric_and_dram_bandwidth_vmax0p9 =
+			dc->dcn_soc->number_of_channels *
+			(fclks.data[vmax0p9_idx].clocks_in_khz / 1000.0)
+			* ddr4_dram_factor_single_Channel / 1000.0;
 	} else
 		BREAK_TO_DEBUGGER();
 

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link.c b/drivers/gpu/drm/amd/display/dc/core/dc_link.c
index 2f42964..3abc029 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_link.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_link.c

@@ -780,8 +780,7 @@
 			same_edid = is_same_edid(&prev_sink->dc_edid, &sink->dc_edid);
 
 		if (link->connector_signal == SIGNAL_TYPE_DISPLAY_PORT &&
-			sink_caps.transaction_type == DDC_TRANSACTION_TYPE_I2C_OVER_AUX &&
-			reason != DETECT_REASON_HPDRX) {
+			sink_caps.transaction_type == DDC_TRANSACTION_TYPE_I2C_OVER_AUX) {
 			/*
 			 * TODO debug why Dell 2413 doesn't like
 			 *  two link trainings

diff --git a/drivers/gpu/drm/amd/display/dc/dml/dml_common_defs.c b/drivers/gpu/drm/amd/display/dc/dml/dml_common_defs.c
index b953b02..723af0b 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dml_common_defs.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/dml_common_defs.c

@@ -24,7 +24,7 @@
  */
 
 #include "dml_common_defs.h"
-#include "../calcs/dcn_calc_math.h"
+#include "dcn_calc_math.h"
 
 #include "dml_inline_defs.h"
 

diff --git a/drivers/gpu/drm/amd/display/dc/dml/dml_inline_defs.h b/drivers/gpu/drm/amd/display/dc/dml/dml_inline_defs.h
index e8ce085..e4f595a 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dml_inline_defs.h
+++ b/drivers/gpu/drm/amd/display/dc/dml/dml_inline_defs.h

@@ -27,7 +27,7 @@
 #define __DML_INLINE_DEFS_H__
 
 #include "dml_common_defs.h"
-#include "../calcs/dcn_calc_math.h"
+#include "dcn_calc_math.h"
 #include "dml_logger.h"
 
 static inline double dml_min(double a, double b)

diff --git a/drivers/gpu/drm/amd/display/dc/calcs/dcn_calc_math.h b/drivers/gpu/drm/amd/display/dc/inc/dcn_calc_math.h
similarity index 100%
rename from drivers/gpu/drm/amd/display/dc/calcs/dcn_calc_math.h
rename to drivers/gpu/drm/amd/display/dc/inc/dcn_calc_math.h


diff --git a/drivers/gpu/drm/amd/include/asic_reg/dce/dce_12_0_offset.h b/drivers/gpu/drm/amd/include/asic_reg/dce/dce_12_0_offset.h
index b6f74bf..27bb8c1 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/dce/dce_12_0_offset.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/dce/dce_12_0_offset.h

@@ -7376,6 +7376,8 @@
 #define mmCRTC4_CRTC_DRR_CONTROL                                                                       0x0f3e
 #define mmCRTC4_CRTC_DRR_CONTROL_BASE_IDX                                                              2
 
+#define mmDCHUBBUB_SDPIF_MMIO_CNTRL_0                                                                  0x395d
+#define mmDCHUBBUB_SDPIF_MMIO_CNTRL_0_BASE_IDX                                                         2
 
 // addressBlock: dce_dc_fmt4_dispdec
 // base address: 0x2000

diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/smu10_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/smu10_hwmgr.c
index 1546bc4..48e3171 100644
--- a/drivers/gpu/drm/amd/powerplay/hwmgr/smu10_hwmgr.c
+++ b/drivers/gpu/drm/amd/powerplay/hwmgr/smu10_hwmgr.c

@@ -994,12 +994,15 @@
 
 	clocks->num_levels = 0;
 	for (i = 0; i < pclk_vol_table->count; i++) {
-		clocks->data[i].clocks_in_khz = pclk_vol_table->entries[i].clk * 10;
-		clocks->data[i].latency_in_us = latency_required ?
-						smu10_get_mem_latency(hwmgr,
-						pclk_vol_table->entries[i].clk) :
-						0;
-		clocks->num_levels++;
+		if (pclk_vol_table->entries[i].clk) {
+			clocks->data[clocks->num_levels].clocks_in_khz =
+				pclk_vol_table->entries[i].clk * 10;
+			clocks->data[clocks->num_levels].latency_in_us = latency_required ?
+				smu10_get_mem_latency(hwmgr,
+						      pclk_vol_table->entries[i].clk) :
+				0;
+			clocks->num_levels++;
+		}
 	}
 
 	return 0;
@@ -1045,9 +1048,11 @@
 
 	clocks->num_levels = 0;
 	for (i = 0; i < pclk_vol_table->count; i++) {
-		clocks->data[i].clocks_in_khz = pclk_vol_table->entries[i].clk  * 10;
-		clocks->data[i].voltage_in_mv = pclk_vol_table->entries[i].vol;
-		clocks->num_levels++;
+		if (pclk_vol_table->entries[i].clk) {
+			clocks->data[clocks->num_levels].clocks_in_khz = pclk_vol_table->entries[i].clk  * 10;
+			clocks->data[clocks->num_levels].voltage_in_mv = pclk_vol_table->entries[i].vol;
+			clocks->num_levels++;
+		}
 	}
 
 	return 0;

diff --git a/drivers/gpu/drm/drm_debugfs_crc.c b/drivers/gpu/drm/drm_debugfs_crc.c
index c88e5ff..a3c7567 100644
--- a/drivers/gpu/drm/drm_debugfs_crc.c
+++ b/drivers/gpu/drm/drm_debugfs_crc.c

@@ -101,8 +101,8 @@
 	if (IS_ERR(source))
 		return PTR_ERR(source);
 
-	if (source[len] == '\n')
-		source[len] = '\0';
+	if (source[len - 1] == '\n')
+		source[len - 1] = '\0';
 
 	spin_lock_irq(&crc->lock);
 

diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c
index adefae5..b4035ef 100644
--- a/drivers/gpu/drm/gma500/framebuffer.c
+++ b/drivers/gpu/drm/gma500/framebuffer.c

@@ -480,6 +480,7 @@
 		container_of(helper, struct psb_fbdev, psb_fb_helper);
 	struct drm_device *dev = psb_fbdev->psb_fb_helper.dev;
 	struct drm_psb_private *dev_priv = dev->dev_private;
+	unsigned int fb_size;
 	int bytespp;
 
 	bytespp = sizes->surface_bpp / 8;
@@ -489,8 +490,11 @@
 	/* If the mode will not fit in 32bit then switch to 16bit to get
 	   a console on full resolution. The X mode setting server will
 	   allocate its own 32bit GEM framebuffer */
-	if (ALIGN(sizes->fb_width * bytespp, 64) * sizes->fb_height >
-	                dev_priv->vram_stolen_size) {
+	fb_size = ALIGN(sizes->surface_width * bytespp, 64) *
+		  sizes->surface_height;
+	fb_size = ALIGN(fb_size, PAGE_SIZE);
+
+	if (fb_size > dev_priv->vram_stolen_size) {
                 sizes->surface_bpp = 16;
                 sizes->surface_depth = 16;
         }

diff --git a/drivers/gpu/drm/i915/gvt/dmabuf.c b/drivers/gpu/drm/i915/gvt/dmabuf.c
index 51ed99a..6053f5a 100644
--- a/drivers/gpu/drm/i915/gvt/dmabuf.c
+++ b/drivers/gpu/drm/i915/gvt/dmabuf.c

@@ -95,12 +95,12 @@
 			dmabuf_obj = container_of(pos,
 					struct intel_vgpu_dmabuf_obj, list);
 			if (dmabuf_obj == obj) {
+				list_del(pos);
 				intel_gvt_hypervisor_put_vfio_device(vgpu);
 				idr_remove(&vgpu->object_idr,
 					   dmabuf_obj->dmabuf_id);
 				kfree(dmabuf_obj->info);
 				kfree(dmabuf_obj);
-				list_del(pos);
 				break;
 			}
 		}

diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c
index c628be0..69bba88 100644
--- a/drivers/gpu/drm/i915/gvt/vgpu.c
+++ b/drivers/gpu/drm/i915/gvt/vgpu.c

@@ -556,9 +556,9 @@
 
 		intel_vgpu_reset_mmio(vgpu, dmlr);
 		populate_pvinfo_page(vgpu);
-		intel_vgpu_reset_display(vgpu);
 
 		if (dmlr) {
+			intel_vgpu_reset_display(vgpu);
 			intel_vgpu_reset_cfg_space(vgpu);
 			/* only reset the failsafe mode when dmlr reset */
 			vgpu->failsafe = false;

diff --git a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c
index 92ecb9b..b86ee7d 100644
--- a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c
+++ b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c

@@ -308,6 +308,7 @@
 static void mtk_crtc_ddp_hw_fini(struct mtk_drm_crtc *mtk_crtc)
 {
 	struct drm_device *drm = mtk_crtc->base.dev;
+	struct drm_crtc *crtc = &mtk_crtc->base;
 	int i;
 
 	DRM_DEBUG_DRIVER("%s\n", __func__);
@@ -329,6 +330,13 @@
 	mtk_disp_mutex_unprepare(mtk_crtc->mutex);
 
 	pm_runtime_put(drm->dev);
+
+	if (crtc->state->event && !crtc->state->active) {
+		spin_lock_irq(&crtc->dev->event_lock);
+		drm_crtc_send_vblank_event(crtc, crtc->state->event);
+		crtc->state->event = NULL;
+		spin_unlock_irq(&crtc->dev->event_lock);
+	}
 }
 
 static void mtk_crtc_ddp_config(struct drm_crtc *crtc)

diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index dbfd2c0..6f81de8 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c

@@ -492,6 +492,14 @@
 	if (ret)
 		goto err_msm_uninit;
 
+	if (!dev->dma_parms) {
+		dev->dma_parms = devm_kzalloc(dev, sizeof(*dev->dma_parms),
+					      GFP_KERNEL);
+		if (!dev->dma_parms)
+			return -ENOMEM;
+	}
+	dma_set_max_seg_size(dev, DMA_BIT_MASK(32));
+
 	msm_gem_shrinker_init(ddev);
 
 	switch (get_mdp_ver(pdev)) {

diff --git a/drivers/gpu/drm/nouveau/dispnv50/wndw.c b/drivers/gpu/drm/nouveau/dispnv50/wndw.c
index b3db455..d343ae6 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/wndw.c
+++ b/drivers/gpu/drm/nouveau/dispnv50/wndw.c

@@ -405,6 +405,8 @@
 		asyw->clr.ntfy = armw->ntfy.handle != 0;
 		asyw->clr.sema = armw->sema.handle != 0;
 		asyw->clr.xlut = armw->xlut.handle != 0;
+		if (asyw->clr.xlut && asyw->visible)
+			asyw->set.xlut = asyw->xlut.handle != 0;
 		if (wndw->func->image_clr)
 			asyw->clr.image = armw->image.handle[0] != 0;
 	}

diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 412d49b..ba3883a 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c

@@ -157,7 +157,7 @@
 
 		fence = list_entry(fctx->pending.next, typeof(*fence), head);
 		chan = rcu_dereference_protected(fence->channel, lockdep_is_held(&fctx->lock));
-		if (nouveau_fence_update(fence->channel, fctx))
+		if (nouveau_fence_update(chan, fctx))
 			ret = NVIF_NOTIFY_DROP;
 	}
 	spin_unlock_irqrestore(&fctx->lock, flags);

diff --git a/drivers/gpu/drm/nouveau/nouveau_ttm.c b/drivers/gpu/drm/nouveau/nouveau_ttm.c
index e4b977c..37715a2 100644
--- a/drivers/gpu/drm/nouveau/nouveau_ttm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_ttm.c

@@ -63,14 +63,12 @@
 {
 	struct nouveau_bo *nvbo = nouveau_bo(bo);
 	struct nouveau_drm *drm = nouveau_bdev(bo->bdev);
-	struct nouveau_mem *mem;
 	int ret;
 
 	if (drm->client.device.info.ram_size == 0)
 		return -ENOMEM;
 
 	ret = nouveau_mem_new(&drm->master, nvbo->kind, nvbo->comp, reg);
-	mem = nouveau_mem(reg);
 	if (ret)
 		return ret;
 
@@ -103,11 +101,9 @@
 {
 	struct nouveau_bo *nvbo = nouveau_bo(bo);
 	struct nouveau_drm *drm = nouveau_bdev(bo->bdev);
-	struct nouveau_mem *mem;
 	int ret;
 
 	ret = nouveau_mem_new(&drm->master, nvbo->kind, nvbo->comp, reg);
-	mem = nouveau_mem(reg);
 	if (ret)
 		return ret;
 

diff --git a/drivers/gpu/drm/nouveau/nvkm/core/memory.c b/drivers/gpu/drm/nouveau/nvkm/core/memory.c
index e85a08e..4cc1862 100644
--- a/drivers/gpu/drm/nouveau/nvkm/core/memory.c
+++ b/drivers/gpu/drm/nouveau/nvkm/core/memory.c

@@ -91,8 +91,8 @@
 	}
 
 	refcount_set(&tags->refcount, 1);
+	*ptags = memory->tags = tags;
 	mutex_unlock(&fb->subdev.mutex);
-	*ptags = tags;
 	return 0;
 }
 

diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/channv50.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/channv50.c
index bcf32d9..50e3539 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/channv50.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/channv50.c

@@ -74,6 +74,8 @@
 
 	if (debug > subdev->debug)
 		return;
+	if (!mthd)
+		return;
 
 	for (i = 0; (list = mthd->data[i].mthd) != NULL; i++) {
 		u32 base = chan->head * mthd->addr;

diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk20a.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk20a.c
index 500cb08..b57ab5c 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk20a.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/gk20a.c

@@ -143,23 +143,24 @@
 
 	nent = (fuc.size / sizeof(struct gk20a_fw_av));
 
-	pack = vzalloc((sizeof(*pack) * max_classes) +
-		       (sizeof(*init) * (nent + 1)));
+	pack = vzalloc((sizeof(*pack) * (max_classes + 1)) +
+		       (sizeof(*init) * (nent + max_classes + 1)));
 	if (!pack) {
 		ret = -ENOMEM;
 		goto end;
 	}
 
-	init = (void *)(pack + max_classes);
+	init = (void *)(pack + max_classes + 1);
 
-	for (i = 0; i < nent; i++) {
-		struct gf100_gr_init *ent = &init[i];
+	for (i = 0; i < nent; i++, init++) {
 		struct gk20a_fw_av *av = &((struct gk20a_fw_av *)fuc.data)[i];
 		u32 class = av->addr & 0xffff;
 		u32 addr = (av->addr & 0xffff0000) >> 14;
 
 		if (prevclass != class) {
-			pack[classidx].init = ent;
+			if (prevclass) /* Add terminator to the method list. */
+				init++;
+			pack[classidx].init = init;
 			pack[classidx].type = class;
 			prevclass = class;
 			if (++classidx >= max_classes) {
@@ -169,10 +170,10 @@
 			}
 		}
 
-		ent->addr = addr;
-		ent->data = av->data;
-		ent->count = 1;
-		ent->pitch = 1;
+		init->addr = addr;
+		init->data = av->data;
+		init->count = 1;
+		init->pitch = 1;
 	}
 
 	*ppack = pack;

diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fault/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fault/base.c
index 16ad91c..f18ce6f 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/fault/base.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fault/base.c

@@ -150,6 +150,7 @@
 	struct nvkm_fault *fault = nvkm_fault(subdev);
 	int i;
 
+	nvkm_notify_fini(&fault->nrpfb);
 	nvkm_event_fini(&fault->event);
 
 	for (i = 0; i < fault->buffer_nr; i++) {

diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/secboot/gm20b.c b/drivers/gpu/drm/nouveau/nvkm/subdev/secboot/gm20b.c
index df8b919..ace6fef 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/secboot/gm20b.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/secboot/gm20b.c

@@ -108,6 +108,7 @@
 	struct gm200_secboot *gsb;
 	struct nvkm_acr *acr;
 
+	*psb = NULL;
 	acr = acr_r352_new(BIT(NVKM_SECBOOT_FALCON_FECS) |
 			   BIT(NVKM_SECBOOT_FALCON_PMU));
 	if (IS_ERR(acr))
@@ -116,10 +117,8 @@
 	acr->optional_falcons = BIT(NVKM_SECBOOT_FALCON_PMU);
 
 	gsb = kzalloc(sizeof(*gsb), GFP_KERNEL);
-	if (!gsb) {
-		psb = NULL;
+	if (!gsb)
 		return -ENOMEM;
-	}
 	*psb = &gsb->base;
 
 	ret = nvkm_secboot_ctor(&gm20b_secboot, acr, device, index, &gsb->base);

diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
index d8e2d7b..7d1e14f 100644
--- a/drivers/gpu/drm/radeon/radeon_display.c
+++ b/drivers/gpu/drm/radeon/radeon_display.c

@@ -121,6 +121,8 @@
 
 	DRM_DEBUG_KMS("%d\n", radeon_crtc->crtc_id);
 
+	msleep(10);
+
 	WREG32(NI_INPUT_CSC_CONTROL + radeon_crtc->crtc_offset,
 	       (NI_INPUT_CSC_GRPH_MODE(NI_INPUT_CSC_BYPASS) |
 		NI_INPUT_CSC_OVL_MODE(NI_INPUT_CSC_BYPASS)));

diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c b/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c
index 3b75af9..f27bd7c 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cmdbuf_res.c

@@ -210,8 +210,10 @@
 
 	cres->hash.key = user_key | (res_type << 24);
 	ret = drm_ht_insert_item(&man->resources, &cres->hash);
-	if (unlikely(ret != 0))
+	if (unlikely(ret != 0)) {
+		kfree(cres);
 		goto out_invalid_key;
+	}
 
 	cres->state = VMW_CMDBUF_RES_ADD;
 	cres->res = vmw_resource_reference(res);

diff --git a/drivers/hid/hid-alps.c b/drivers/hid/hid-alps.c
index 3cd7229..895f49b 100644
--- a/drivers/hid/hid-alps.c
+++ b/drivers/hid/hid-alps.c

@@ -734,7 +734,7 @@
 	if (data->has_sp) {
 		input2 = input_allocate_device();
 		if (!input2) {
-			input_free_device(input2);
+			ret = -ENOMEM;
 			goto exit;
 		}
 

diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index e723156..2c85d075 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c

@@ -1566,7 +1566,9 @@
 
 	rsize = ((report->size - 1) >> 3) + 1;
 
-	if (rsize > HID_MAX_BUFFER_SIZE)
+	if (report_enum->numbered && rsize >= HID_MAX_BUFFER_SIZE)
+		rsize = HID_MAX_BUFFER_SIZE - 1;
+	else if (rsize > HID_MAX_BUFFER_SIZE)
 		rsize = HID_MAX_BUFFER_SIZE;
 
 	if (csize < rsize) {

diff --git a/drivers/hid/hid-ite.c b/drivers/hid/hid-ite.c
index 2ce1eb0..f2e23f81 100644
--- a/drivers/hid/hid-ite.c
+++ b/drivers/hid/hid-ite.c

@@ -44,8 +44,9 @@
 	{ HID_USB_DEVICE(USB_VENDOR_ID_ITE, USB_DEVICE_ID_ITE8595) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_258A, USB_DEVICE_ID_258A_6A88) },
 	/* ITE8595 USB kbd ctlr, with Synaptics touchpad connected to it. */
-	{ HID_USB_DEVICE(USB_VENDOR_ID_SYNAPTICS,
-			 USB_DEVICE_ID_SYNAPTICS_ACER_SWITCH5_012) },
+	{ HID_DEVICE(BUS_USB, HID_GROUP_GENERIC,
+		     USB_VENDOR_ID_SYNAPTICS,
+		     USB_DEVICE_ID_SYNAPTICS_ACER_SWITCH5_012) },
 	{ }
 };
 MODULE_DEVICE_TABLE(hid, ite_devices);

diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c
index da00019..c34ef95 100644
--- a/drivers/hid/usbhid/hiddev.c
+++ b/drivers/hid/usbhid/hiddev.c

@@ -954,9 +954,9 @@
 	hiddev->exist = 0;
 
 	if (hiddev->open) {
-		mutex_unlock(&hiddev->existancelock);
 		hid_hw_close(hiddev->hid);
 		wake_up_interruptible(&hiddev->wait);
+		mutex_unlock(&hiddev->existancelock);
 	} else {
 		mutex_unlock(&hiddev->existancelock);
 		kfree(hiddev);

diff --git a/drivers/hwmon/pmbus/ltc2978.c b/drivers/hwmon/pmbus/ltc2978.c
index 58b789c..94eea2a 100644
--- a/drivers/hwmon/pmbus/ltc2978.c
+++ b/drivers/hwmon/pmbus/ltc2978.c

@@ -89,8 +89,8 @@
 
 #define LTC_POLL_TIMEOUT		100	/* in milli-seconds */
 
-#define LTC_NOT_BUSY			BIT(5)
-#define LTC_NOT_PENDING			BIT(4)
+#define LTC_NOT_BUSY			BIT(6)
+#define LTC_NOT_PENDING			BIT(5)
 
 /*
  * LTC2978 clears peak data whenever the CLEAR_FAULTS command is executed, which

diff --git a/drivers/i2c/busses/i2c-altera.c b/drivers/i2c/busses/i2c-altera.c
index f5e1941..a1cdcfc7 100644
--- a/drivers/i2c/busses/i2c-altera.c
+++ b/drivers/i2c/busses/i2c-altera.c

@@ -182,7 +182,7 @@
 	/* SCL Low Time */
 	writel(t_low, idev->base + ALTR_I2C_SCL_LOW);
 	/* SDA Hold Time, 300ns */
-	writel(div_u64(300 * clk_mhz, 1000), idev->base + ALTR_I2C_SDA_HOLD);
+	writel(3 * clk_mhz / 10, idev->base + ALTR_I2C_SDA_HOLD);
 
 	/* Mask all master interrupt bits */
 	altr_i2c_int_enable(idev, ALTR_I2C_ALL_IRQ, false);

diff --git a/drivers/i2c/busses/i2c-jz4780.c b/drivers/i2c/busses/i2c-jz4780.c
index 30132c3..41ca9ff 100644
--- a/drivers/i2c/busses/i2c-jz4780.c
+++ b/drivers/i2c/busses/i2c-jz4780.c

@@ -82,25 +82,6 @@
 #define JZ4780_I2C_STA_TFNF		BIT(1)
 #define JZ4780_I2C_STA_ACT		BIT(0)
 
-static const char * const jz4780_i2c_abrt_src[] = {
-	"ABRT_7B_ADDR_NOACK",
-	"ABRT_10ADDR1_NOACK",
-	"ABRT_10ADDR2_NOACK",
-	"ABRT_XDATA_NOACK",
-	"ABRT_GCALL_NOACK",
-	"ABRT_GCALL_READ",
-	"ABRT_HS_ACKD",
-	"SBYTE_ACKDET",
-	"ABRT_HS_NORSTRT",
-	"SBYTE_NORSTRT",
-	"ABRT_10B_RD_NORSTRT",
-	"ABRT_MASTER_DIS",
-	"ARB_LOST",
-	"SLVFLUSH_TXFIFO",
-	"SLV_ARBLOST",
-	"SLVRD_INTX",
-};
-
 #define JZ4780_I2C_INTST_IGC		BIT(11)
 #define JZ4780_I2C_INTST_ISTT		BIT(10)
 #define JZ4780_I2C_INTST_ISTP		BIT(9)
@@ -538,21 +519,8 @@
 
 static void jz4780_i2c_txabrt(struct jz4780_i2c *i2c, int src)
 {
-	int i;
-
-	dev_err(&i2c->adap.dev, "txabrt: 0x%08x\n", src);
-	dev_err(&i2c->adap.dev, "device addr=%x\n",
-		jz4780_i2c_readw(i2c, JZ4780_I2C_TAR));
-	dev_err(&i2c->adap.dev, "send cmd count:%d  %d\n",
-		i2c->cmd, i2c->cmd_buf[i2c->cmd]);
-	dev_err(&i2c->adap.dev, "receive data count:%d  %d\n",
-		i2c->cmd, i2c->data_buf[i2c->cmd]);
-
-	for (i = 0; i < 16; i++) {
-		if (src & BIT(i))
-			dev_dbg(&i2c->adap.dev, "I2C TXABRT[%d]=%s\n",
-				i, jz4780_i2c_abrt_src[i]);
-	}
+	dev_dbg(&i2c->adap.dev, "txabrt: 0x%08x, cmd: %d, send: %d, recv: %d\n",
+		src, i2c->cmd, i2c->cmd_buf[i2c->cmd], i2c->data_buf[i2c->cmd]);
 }
 
 static inline int jz4780_i2c_xfer_read(struct jz4780_i2c *i2c,

diff --git a/drivers/ide/cmd64x.c b/drivers/ide/cmd64x.c
index b127ed6..9dde839 100644
--- a/drivers/ide/cmd64x.c
+++ b/drivers/ide/cmd64x.c

@@ -65,6 +65,9 @@
 	struct ide_timing t;
 	u8 arttim = 0;
 
+	if (drive->dn >= ARRAY_SIZE(drwtim_regs))
+		return;
+
 	ide_timing_compute(drive, mode, &t, T, 0);
 
 	/*

diff --git a/drivers/ide/serverworks.c b/drivers/ide/serverworks.c
index a97affc..0f57d45 100644
--- a/drivers/ide/serverworks.c
+++ b/drivers/ide/serverworks.c

@@ -114,6 +114,9 @@
 	struct pci_dev *dev = to_pci_dev(hwif->dev);
 	const u8 pio = drive->pio_mode - XFER_PIO_0;
 
+	if (drive->dn >= ARRAY_SIZE(drive_pci))
+		return;
+
 	pci_write_config_byte(dev, drive_pci[drive->dn], pio_modes[pio]);
 
 	if (svwks_csb_check(dev)) {
@@ -140,6 +143,9 @@
 
 	u8 ultra_enable	 = 0, ultra_timing = 0, dma_timing = 0;
 
+	if (drive->dn >= ARRAY_SIZE(drive_pci2))
+		return;
+
 	pci_read_config_byte(dev, (0x56|hwif->channel), &ultra_timing);
 	pci_read_config_byte(dev, 0x54, &ultra_enable);
 

diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c
index b79b61b..4e2565c 100644
--- a/drivers/infiniband/core/security.c
+++ b/drivers/infiniband/core/security.c

@@ -336,22 +336,16 @@
 	if (!new_pps)
 		return NULL;
 
-	if (qp_attr_mask & (IB_QP_PKEY_INDEX | IB_QP_PORT)) {
-		if (!qp_pps) {
-			new_pps->main.port_num = qp_attr->port_num;
-			new_pps->main.pkey_index = qp_attr->pkey_index;
-		} else {
-			new_pps->main.port_num = (qp_attr_mask & IB_QP_PORT) ?
-						  qp_attr->port_num :
-						  qp_pps->main.port_num;
-
-			new_pps->main.pkey_index =
-					(qp_attr_mask & IB_QP_PKEY_INDEX) ?
-					 qp_attr->pkey_index :
-					 qp_pps->main.pkey_index;
-		}
+	if (qp_attr_mask & IB_QP_PORT)
+		new_pps->main.port_num =
+			(qp_pps) ? qp_pps->main.port_num : qp_attr->port_num;
+	if (qp_attr_mask & IB_QP_PKEY_INDEX)
+		new_pps->main.pkey_index = (qp_pps) ? qp_pps->main.pkey_index :
+						      qp_attr->pkey_index;
+	if ((qp_attr_mask & IB_QP_PKEY_INDEX) && (qp_attr_mask & IB_QP_PORT))
 		new_pps->main.state = IB_PORT_PKEY_VALID;
-	} else if (qp_pps) {
+
+	if (!(qp_attr_mask & (IB_QP_PKEY_INDEX || IB_QP_PORT)) && qp_pps) {
 		new_pps->main.port_num = qp_pps->main.port_num;
 		new_pps->main.pkey_index = qp_pps->main.pkey_index;
 		if (qp_pps->main.state != IB_PORT_PKEY_NOT_VALID)

diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index e012ca8..5e10a40 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c

@@ -2914,12 +2914,6 @@
 	return 0;
 }
 
-static size_t kern_spec_filter_sz(const struct ib_uverbs_flow_spec_hdr *spec)
-{
-	/* Returns user space filter size, includes padding */
-	return (spec->size - sizeof(struct ib_uverbs_flow_spec_hdr)) / 2;
-}
-
 static ssize_t spec_filter_size(const void *kern_spec_filter, u16 kern_filter_size,
 				u16 ib_real_filter_sz)
 {
@@ -3063,11 +3057,16 @@
 static int kern_spec_to_ib_spec_filter(struct ib_uverbs_flow_spec *kern_spec,
 				       union ib_flow_spec *ib_spec)
 {
-	ssize_t kern_filter_sz;
+	size_t kern_filter_sz;
 	void *kern_spec_mask;
 	void *kern_spec_val;
 
-	kern_filter_sz = kern_spec_filter_sz(&kern_spec->hdr);
+	if (check_sub_overflow((size_t)kern_spec->hdr.size,
+			       sizeof(struct ib_uverbs_flow_spec_hdr),
+			       &kern_filter_sz))
+		return -EINVAL;
+
+	kern_filter_sz /= 2;
 
 	kern_spec_val = (void *)kern_spec +
 		sizeof(struct ib_uverbs_flow_spec_hdr);

diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c
index bedd5fb..01ed0a6 100644
--- a/drivers/infiniband/hw/hfi1/affinity.c
+++ b/drivers/infiniband/hw/hfi1/affinity.c

@@ -478,6 +478,8 @@
 			  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu);
 	}
 
+	free_cpumask_var(available_cpus);
+	free_cpumask_var(non_intr_cpus);
 	return 0;
 
 fail:

diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index b09a4b1..1221fae 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c

@@ -1687,6 +1687,14 @@
 	return dd->verbs_dev.n_piodrain;
 }
 
+static u64 access_sw_ctx0_seq_drop(const struct cntr_entry *entry,
+				   void *context, int vl, int mode, u64 data)
+{
+	struct hfi1_devdata *dd = context;
+
+	return dd->ctx0_seq_drop;
+}
+
 static u64 access_sw_vtx_wait(const struct cntr_entry *entry,
 			      void *context, int vl, int mode, u64 data)
 {
@@ -4247,6 +4255,8 @@
 			    access_sw_cpu_intr),
 [C_SW_CPU_RCV_LIM] = CNTR_ELEM("RcvLimit", 0, 0, CNTR_NORMAL,
 			    access_sw_cpu_rcv_limit),
+[C_SW_CTX0_SEQ_DROP] = CNTR_ELEM("SeqDrop0", 0, 0, CNTR_NORMAL,
+			    access_sw_ctx0_seq_drop),
 [C_SW_VTX_WAIT] = CNTR_ELEM("vTxWait", 0, 0, CNTR_NORMAL,
 			    access_sw_vtx_wait),
 [C_SW_PIO_WAIT] = CNTR_ELEM("PioWait", 0, 0, CNTR_NORMAL,

diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h
index 36b04d6..c9a352d 100644
--- a/drivers/infiniband/hw/hfi1/chip.h
+++ b/drivers/infiniband/hw/hfi1/chip.h

@@ -909,6 +909,7 @@
 	C_DC_PG_STS_TX_MBE_CNT,
 	C_SW_CPU_INTR,
 	C_SW_CPU_RCV_LIM,
+	C_SW_CTX0_SEQ_DROP,
 	C_SW_VTX_WAIT,
 	C_SW_PIO_WAIT,
 	C_SW_PIO_DRAIN,

diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c
index d5277c2..769e114 100644
--- a/drivers/infiniband/hw/hfi1/driver.c
+++ b/drivers/infiniband/hw/hfi1/driver.c

@@ -734,6 +734,7 @@
 {
 	int ret;
 
+	packet->rcd->dd->ctx0_seq_drop++;
 	/* Set up for the next packet */
 	packet->rhqoff += packet->rsize;
 	if (packet->rhqoff >= packet->maxcnt)

diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index 34ffca6..adeb2594 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c

@@ -200,23 +200,24 @@
 
 	fd = kzalloc(sizeof(*fd), GFP_KERNEL);
 
-	if (fd) {
-		fd->rec_cpu_num = -1; /* no cpu affinity by default */
-		fd->mm = current->mm;
-		mmgrab(fd->mm);
-		fd->dd = dd;
-		kobject_get(&fd->dd->kobj);
-		fp->private_data = fd;
-	} else {
-		fp->private_data = NULL;
-
-		if (atomic_dec_and_test(&dd->user_refcount))
-			complete(&dd->user_comp);
-
-		return -ENOMEM;
-	}
-
+	if (!fd || init_srcu_struct(&fd->pq_srcu))
+		goto nomem;
+	spin_lock_init(&fd->pq_rcu_lock);
+	spin_lock_init(&fd->tid_lock);
+	spin_lock_init(&fd->invalid_lock);
+	fd->rec_cpu_num = -1; /* no cpu affinity by default */
+	fd->mm = current->mm;
+	mmgrab(fd->mm);
+	fd->dd = dd;
+	kobject_get(&fd->dd->kobj);
+	fp->private_data = fd;
 	return 0;
+nomem:
+	kfree(fd);
+	fp->private_data = NULL;
+	if (atomic_dec_and_test(&dd->user_refcount))
+		complete(&dd->user_comp);
+	return -ENOMEM;
 }
 
 static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
@@ -301,21 +302,30 @@
 static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
 {
 	struct hfi1_filedata *fd = kiocb->ki_filp->private_data;
-	struct hfi1_user_sdma_pkt_q *pq = fd->pq;
+	struct hfi1_user_sdma_pkt_q *pq;
 	struct hfi1_user_sdma_comp_q *cq = fd->cq;
 	int done = 0, reqs = 0;
 	unsigned long dim = from->nr_segs;
+	int idx;
 
-	if (!cq || !pq)
+	idx = srcu_read_lock(&fd->pq_srcu);
+	pq = srcu_dereference(fd->pq, &fd->pq_srcu);
+	if (!cq || !pq) {
+		srcu_read_unlock(&fd->pq_srcu, idx);
 		return -EIO;
+	}
 
-	if (!iter_is_iovec(from) || !dim)
+	if (!iter_is_iovec(from) || !dim) {
+		srcu_read_unlock(&fd->pq_srcu, idx);
 		return -EINVAL;
+	}
 
 	trace_hfi1_sdma_request(fd->dd, fd->uctxt->ctxt, fd->subctxt, dim);
 
-	if (atomic_read(&pq->n_reqs) == pq->n_max_reqs)
+	if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) {
+		srcu_read_unlock(&fd->pq_srcu, idx);
 		return -ENOSPC;
+	}
 
 	while (dim) {
 		int ret;
@@ -333,6 +343,7 @@
 		reqs++;
 	}
 
+	srcu_read_unlock(&fd->pq_srcu, idx);
 	return reqs;
 }
 
@@ -706,6 +717,7 @@
 	if (atomic_dec_and_test(&dd->user_refcount))
 		complete(&dd->user_comp);
 
+	cleanup_srcu_struct(&fdata->pq_srcu);
 	kfree(fdata);
 	return 0;
 }

diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index 232fc4b..e38de54 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h

@@ -1093,6 +1093,8 @@
 
 	char *boardname; /* human readable board info */
 
+	u64 ctx0_seq_drop;
+
 	/* reset value */
 	u64 z_int_counter;
 	u64 z_rcv_limit;
@@ -1376,10 +1378,13 @@
 
 /* Private data for file operations */
 struct hfi1_filedata {
+	struct srcu_struct pq_srcu;
 	struct hfi1_devdata *dd;
 	struct hfi1_ctxtdata *uctxt;
 	struct hfi1_user_sdma_comp_q *cq;
-	struct hfi1_user_sdma_pkt_q *pq;
+	/* update side lock for SRCU */
+	spinlock_t pq_rcu_lock;
+	struct hfi1_user_sdma_pkt_q __rcu *pq;
 	u16 subctxt;
 	/* for cpu affinity; -1 if none */
 	int rec_cpu_num;

diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
index 4e986ca..4e417ed 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c

@@ -90,9 +90,6 @@
 	struct hfi1_devdata *dd = uctxt->dd;
 	int ret = 0;
 
-	spin_lock_init(&fd->tid_lock);
-	spin_lock_init(&fd->invalid_lock);
-
 	fd->entry_to_rb = kcalloc(uctxt->expected_count,
 				  sizeof(struct rb_node *),
 				  GFP_KERNEL);
@@ -165,10 +162,12 @@
 	if (fd->handler) {
 		hfi1_mmu_rb_unregister(fd->handler);
 	} else {
+		mutex_lock(&uctxt->exp_mutex);
 		if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
 			unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
 		if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
 			unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
+		mutex_unlock(&uctxt->exp_mutex);
 	}
 
 	kfree(fd->invalid_tids);

diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
index 684a298..a3b08a9 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.c
+++ b/drivers/infiniband/hw/hfi1/user_sdma.c

@@ -179,7 +179,6 @@
 	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
 	if (!pq)
 		return -ENOMEM;
-
 	pq->dd = dd;
 	pq->ctxt = uctxt->ctxt;
 	pq->subctxt = fd->subctxt;
@@ -236,7 +235,7 @@
 		goto pq_mmu_fail;
 	}
 
-	fd->pq = pq;
+	rcu_assign_pointer(fd->pq, pq);
 	fd->cq = cq;
 
 	return 0;
@@ -264,8 +263,14 @@
 
 	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);
 
-	pq = fd->pq;
+	spin_lock(&fd->pq_rcu_lock);
+	pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
+				    lockdep_is_held(&fd->pq_rcu_lock));
 	if (pq) {
+		rcu_assign_pointer(fd->pq, NULL);
+		spin_unlock(&fd->pq_rcu_lock);
+		synchronize_srcu(&fd->pq_srcu);
+		/* at this point there can be no more new requests */
 		if (pq->handler)
 			hfi1_mmu_rb_unregister(pq->handler);
 		iowait_sdma_drain(&pq->busy);
@@ -277,7 +282,8 @@
 		kfree(pq->req_in_use);
 		kmem_cache_destroy(pq->txreq_cache);
 		kfree(pq);
-		fd->pq = NULL;
+	} else {
+		spin_unlock(&fd->pq_rcu_lock);
 	}
 	if (fd->cq) {
 		vfree(fd->cq->comps);
@@ -321,7 +327,8 @@
 {
 	int ret = 0, i;
 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
-	struct hfi1_user_sdma_pkt_q *pq = fd->pq;
+	struct hfi1_user_sdma_pkt_q *pq =
+		srcu_dereference(fd->pq, &fd->pq_srcu);
 	struct hfi1_user_sdma_comp_q *cq = fd->cq;
 	struct hfi1_devdata *dd = pq->dd;
 	unsigned long idx = 0;

diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 7d03680..fbc3167 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c

@@ -58,6 +58,8 @@
 #include "trace.h"
 
 static void rvt_rc_timeout(struct timer_list *t);
+static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+			 enum ib_qp_type type);
 
 /*
  * Convert the AETH RNR timeout code into the number of microseconds.
@@ -268,40 +270,41 @@
 }
 
 /**
- * free_all_qps - check for QPs still in use
+ * rvt_free_qp_cb - callback function to reset a qp
+ * @qp: the qp to reset
+ * @v: a 64-bit value
+ *
+ * This function resets the qp and removes it from the
+ * qp hash table.
+ */
+static void rvt_free_qp_cb(struct rvt_qp *qp, u64 v)
+{
+	unsigned int *qp_inuse = (unsigned int *)v;
+	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+
+	/* Reset the qp and remove it from the qp hash list */
+	rvt_reset_qp(rdi, qp, qp->ibqp.qp_type);
+
+	/* Increment the qp_inuse count */
+	(*qp_inuse)++;
+}
+
+/**
+ * rvt_free_all_qps - check for QPs still in use
  * @rdi: rvt device info structure
  *
  * There should not be any QPs still in use.
  * Free memory for table.
+ * Return the number of QPs still in use.
  */
 static unsigned rvt_free_all_qps(struct rvt_dev_info *rdi)
 {
-	unsigned long flags;
-	struct rvt_qp *qp;
-	unsigned n, qp_inuse = 0;
-	spinlock_t *ql; /* work around too long line below */
-
-	if (rdi->driver_f.free_all_qps)
-		qp_inuse = rdi->driver_f.free_all_qps(rdi);
+	unsigned int qp_inuse = 0;
 
 	qp_inuse += rvt_mcast_tree_empty(rdi);
 
-	if (!rdi->qp_dev)
-		return qp_inuse;
+	rvt_qp_iter(rdi, (u64)&qp_inuse, rvt_free_qp_cb);
 
-	ql = &rdi->qp_dev->qpt_lock;
-	spin_lock_irqsave(ql, flags);
-	for (n = 0; n < rdi->qp_dev->qp_table_size; n++) {
-		qp = rcu_dereference_protected(rdi->qp_dev->qp_table[n],
-					       lockdep_is_held(ql));
-		RCU_INIT_POINTER(rdi->qp_dev->qp_table[n], NULL);
-
-		for (; qp; qp = rcu_dereference_protected(qp->next,
-							  lockdep_is_held(ql)))
-			qp_inuse++;
-	}
-	spin_unlock_irqrestore(ql, flags);
-	synchronize_rcu();
 	return qp_inuse;
 }
 
@@ -684,14 +687,14 @@
 }
 
 /**
- * rvt_reset_qp - initialize the QP state to the reset state
+ * _rvt_reset_qp - initialize the QP state to the reset state
  * @qp: the QP to reset
  * @type: the QP type
  *
  * r_lock, s_hlock, and s_lock are required to be held by the caller
  */
-static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
-			 enum ib_qp_type type)
+static void _rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+			  enum ib_qp_type type)
 	__must_hold(&qp->s_lock)
 	__must_hold(&qp->s_hlock)
 	__must_hold(&qp->r_lock)
@@ -737,6 +740,27 @@
 	lockdep_assert_held(&qp->s_lock);
 }
 
+/**
+ * rvt_reset_qp - initialize the QP state to the reset state
+ * @rdi: the device info
+ * @qp: the QP to reset
+ * @type: the QP type
+ *
+ * This is the wrapper function to acquire the r_lock, s_hlock, and s_lock
+ * before calling _rvt_reset_qp().
+ */
+static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+			 enum ib_qp_type type)
+{
+	spin_lock_irq(&qp->r_lock);
+	spin_lock(&qp->s_hlock);
+	spin_lock(&qp->s_lock);
+	_rvt_reset_qp(rdi, qp, type);
+	spin_unlock(&qp->s_lock);
+	spin_unlock(&qp->s_hlock);
+	spin_unlock_irq(&qp->r_lock);
+}
+
 /** rvt_free_qpn - Free a qpn from the bit map
  * @qpt: QP table
  * @qpn: queue pair number to free
@@ -1285,7 +1309,7 @@
 	switch (new_state) {
 	case IB_QPS_RESET:
 		if (qp->state != IB_QPS_RESET)
-			rvt_reset_qp(rdi, qp, ibqp->qp_type);
+			_rvt_reset_qp(rdi, qp, ibqp->qp_type);
 		break;
 
 	case IB_QPS_RTR:
@@ -1434,13 +1458,7 @@
 	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
 	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
 
-	spin_lock_irq(&qp->r_lock);
-	spin_lock(&qp->s_hlock);
-	spin_lock(&qp->s_lock);
 	rvt_reset_qp(rdi, qp, ibqp->qp_type);
-	spin_unlock(&qp->s_lock);
-	spin_unlock(&qp->s_hlock);
-	spin_unlock_irq(&qp->r_lock);
 
 	wait_event(qp->wait, !atomic_read(&qp->refcount));
 	/* qpn is now available for use again */

diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
index ea089cb..dc06e98 100644
--- a/drivers/infiniband/sw/rxe/rxe_comp.c
+++ b/drivers/infiniband/sw/rxe/rxe_comp.c

@@ -329,7 +329,7 @@
 					qp->comp.psn = pkt->psn;
 					if (qp->req.wait_psn) {
 						qp->req.wait_psn = 0;
-						rxe_run_task(&qp->req.task, 1);
+						rxe_run_task(&qp->req.task, 0);
 					}
 				}
 				return COMPST_ERROR_RETRY;
@@ -457,7 +457,7 @@
 	 */
 	if (qp->req.wait_fence) {
 		qp->req.wait_fence = 0;
-		rxe_run_task(&qp->req.task, 1);
+		rxe_run_task(&qp->req.task, 0);
 	}
 }
 
@@ -473,7 +473,7 @@
 		if (qp->req.need_rd_atomic) {
 			qp->comp.timeout_retry = 0;
 			qp->req.need_rd_atomic = 0;
-			rxe_run_task(&qp->req.task, 1);
+			rxe_run_task(&qp->req.task, 0);
 		}
 	}
 
@@ -719,7 +719,7 @@
 							RXE_CNT_COMP_RETRY);
 					qp->req.need_retry = 1;
 					qp->comp.started_retry = 1;
-					rxe_run_task(&qp->req.task, 1);
+					rxe_run_task(&qp->req.task, 0);
 				}
 
 				if (pkt) {

diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index 6a75f96..b4e2436 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h

@@ -407,7 +407,7 @@
 	struct list_head	pending_mmaps;
 
 	spinlock_t		mmap_offset_lock; /* guard mmap_offset */
-	int			mmap_offset;
+	u64			mmap_offset;
 
 	atomic64_t		stats_counters[RXE_NUM_OF_COUNTERS];
 

diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index 9899f7e..f39670c 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c

@@ -2584,6 +2584,17 @@
 	}
 }
 
+static void
+isert_wait4cmds(struct iscsi_conn *conn)
+{
+	isert_info("iscsi_conn %p\n", conn);
+
+	if (conn->sess) {
+		target_sess_cmd_list_set_waiting(conn->sess->se_sess);
+		target_wait_for_sess_cmds(conn->sess->se_sess);
+	}
+}
+
 /**
  * isert_put_unsol_pending_cmds() - Drop commands waiting for
  *     unsolicitate dataout
@@ -2631,6 +2642,7 @@
 
 	ib_drain_qp(isert_conn->qp);
 	isert_put_unsol_pending_cmds(conn);
+	isert_wait4cmds(conn);
 	isert_wait4logout(isert_conn);
 
 	queue_work(isert_release_wq, &isert_conn->release_work);

diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c
index e8d1134..f47e3fc 100644
--- a/drivers/input/mouse/synaptics.c
+++ b/drivers/input/mouse/synaptics.c

@@ -149,7 +149,6 @@
 	"LEN0042", /* Yoga */
 	"LEN0045",
 	"LEN0047",
-	"LEN0049",
 	"LEN2000", /* S540 */
 	"LEN2001", /* Edge E431 */
 	"LEN2002", /* Edge E531 */
@@ -169,9 +168,11 @@
 	/* all of the topbuttonpad_pnp_ids are valid, we just add some extras */
 	"LEN0048", /* X1 Carbon 3 */
 	"LEN0046", /* X250 */
+	"LEN0049", /* Yoga 11e */
 	"LEN004a", /* W541 */
 	"LEN005b", /* P50 */
 	"LEN005e", /* T560 */
+	"LEN006c", /* T470s */
 	"LEN0071", /* T480 */
 	"LEN0072", /* X1 Carbon Gen 5 (2017) - Elan/ALPS trackpoint */
 	"LEN0073", /* X1 Carbon G5 (Elantech) */
@@ -182,6 +183,7 @@
 	"LEN0097", /* X280 -> ALPS trackpoint */
 	"LEN009b", /* T580 */
 	"LEN200f", /* T450s */
+	"LEN2044", /* L470  */
 	"LEN2054", /* E480 */
 	"LEN2055", /* E580 */
 	"SYN3052", /* HP EliteBook 840 G4 */

diff --git a/drivers/input/touchscreen/edt-ft5x06.c b/drivers/input/touchscreen/edt-ft5x06.c
index 1e18ca0..3fdaa64 100644
--- a/drivers/input/touchscreen/edt-ft5x06.c
+++ b/drivers/input/touchscreen/edt-ft5x06.c

@@ -968,6 +968,7 @@
 {
 	const struct edt_i2c_chip_data *chip_data;
 	struct edt_ft5x06_ts_data *tsdata;
+	u8 buf[2] = { 0xfc, 0x00 };
 	struct input_dev *input;
 	unsigned long irq_flags;
 	int error;
@@ -1037,6 +1038,12 @@
 		return error;
 	}
 
+	/*
+	 * Dummy read access. EP0700MLP1 returns bogus data on the first
+	 * register read access and ignores writes.
+	 */
+	edt_ft5x06_ts_readwrite(tsdata->client, 2, buf, 2, buf);
+
 	edt_ft5x06_ts_set_regs(tsdata);
 	edt_ft5x06_ts_get_defaults(&client->dev, tsdata);
 	edt_ft5x06_ts_get_parameters(tsdata);

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index eff1f3a..6b76640 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c

@@ -1185,7 +1185,8 @@
 	}
 
 	arm_smmu_sync_ste_for_sid(smmu, sid);
-	dst[0] = cpu_to_le64(val);
+	/* See comment in arm_smmu_write_ctx_desc() */
+	WRITE_ONCE(dst[0], cpu_to_le64(val));
 	arm_smmu_sync_ste_for_sid(smmu, sid);
 
 	/* It's likely that we'll want to use the new STE soon */

diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index 7f9824b..72994d6 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c

@@ -1345,7 +1345,6 @@
 	struct qi_desc desc;
 
 	if (mask) {
-		WARN_ON_ONCE(addr & ((1ULL << (VTD_PAGE_SHIFT + mask)) - 1));
 		addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1;
 		desc.high = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE;
 	} else

diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index fd8730b..5944d3b 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c

@@ -377,7 +377,7 @@
 		/* Do not use PASID 0 in caching mode (virtualised IOMMU) */
 		ret = intel_pasid_alloc_id(svm,
 					   !!cap_caching_mode(iommu->cap),
-					   pasid_max - 1, GFP_KERNEL);
+					   pasid_max, GFP_KERNEL);
 		if (ret < 0) {
 			kfree(svm);
 			kfree(sdev);

diff --git a/drivers/iommu/qcom_iommu.c b/drivers/iommu/qcom_iommu.c
index ee70e99..9a6ed5e 100644
--- a/drivers/iommu/qcom_iommu.c
+++ b/drivers/iommu/qcom_iommu.c

@@ -333,21 +333,19 @@
 {
 	struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
 
-	if (WARN_ON(qcom_domain->iommu))    /* forgot to detach? */
-		return;
-
 	iommu_put_dma_cookie(domain);
 
-	/* NOTE: unmap can be called after client device is powered off,
-	 * for example, with GPUs or anything involving dma-buf.  So we
-	 * cannot rely on the device_link.  Make sure the IOMMU is on to
-	 * avoid unclocked accesses in the TLB inv path:
-	 */
-	pm_runtime_get_sync(qcom_domain->iommu->dev);
-
-	free_io_pgtable_ops(qcom_domain->pgtbl_ops);
-
-	pm_runtime_put_sync(qcom_domain->iommu->dev);
+	if (qcom_domain->iommu) {
+		/*
+		 * NOTE: unmap can be called after client device is powered
+		 * off, for example, with GPUs or anything involving dma-buf.
+		 * So we cannot rely on the device_link.  Make sure the IOMMU
+		 * is on to avoid unclocked accesses in the TLB inv path:
+		 */
+		pm_runtime_get_sync(qcom_domain->iommu->dev);
+		free_io_pgtable_ops(qcom_domain->pgtbl_ops);
+		pm_runtime_put_sync(qcom_domain->iommu->dev);
+	}
 
 	kfree(qcom_domain);
 }
@@ -392,7 +390,7 @@
 	struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
 	unsigned i;
 
-	if (!qcom_domain->iommu)
+	if (WARN_ON(!qcom_domain->iommu))
 		return;
 
 	pm_runtime_get_sync(qcom_iommu->dev);
@@ -405,8 +403,6 @@
 		ctx->domain = NULL;
 	}
 	pm_runtime_put_sync(qcom_iommu->dev);
-
-	qcom_domain->iommu = NULL;
 }
 
 static int qcom_iommu_map(struct iommu_domain *domain, unsigned long iova,

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 050d6e0..f9b7333 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c

@@ -208,7 +208,7 @@
 
 static struct its_collection *valid_col(struct its_collection *col)
 {
-	if (WARN_ON_ONCE(col->target_address & GENMASK_ULL(0, 15)))
+	if (WARN_ON_ONCE(col->target_address & GENMASK_ULL(15, 0)))
 		return NULL;
 
 	return col;
@@ -578,7 +578,7 @@
 						   struct its_cmd_desc *desc)
 {
 	its_encode_cmd(cmd, GITS_CMD_INVALL);
-	its_encode_collection(cmd, desc->its_mapc_cmd.col->col_id);
+	its_encode_collection(cmd, desc->its_invall_cmd.col->col_id);
 
 	its_fixup_cmd(cmd);
 

diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index d5912f1..ac888d7 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c

@@ -1347,6 +1347,7 @@
 	struct redist_region *redist_regs;
 	u32 nr_redist_regions;
 	bool single_redist;
+	int enabled_rdists;
 	u32 maint_irq;
 	int maint_irq_mode;
 	phys_addr_t vcpu_base;
@@ -1441,8 +1442,10 @@
 	 * If GICC is enabled and has valid gicr base address, then it means
 	 * GICR base is presented via GICC
 	 */
-	if ((gicc->flags & ACPI_MADT_ENABLED) && gicc->gicr_base_address)
+	if ((gicc->flags & ACPI_MADT_ENABLED) && gicc->gicr_base_address) {
+		acpi_data.enabled_rdists++;
 		return 0;
+	}
 
 	/*
 	 * It's perfectly valid firmware can pass disabled GICC entry, driver
@@ -1472,8 +1475,10 @@
 
 	count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
 				      gic_acpi_match_gicc, 0);
-	if (count > 0)
+	if (count > 0) {
 		acpi_data.single_redist = true;
+		count = acpi_data.enabled_rdists;
+	}
 
 	return count;
 }

diff --git a/drivers/irqchip/irq-mbigen.c b/drivers/irqchip/irq-mbigen.c
index 98b6e1d..f7fdbf5 100644
--- a/drivers/irqchip/irq-mbigen.c
+++ b/drivers/irqchip/irq-mbigen.c

@@ -381,6 +381,7 @@
 		.name		= "Hisilicon MBIGEN-V2",
 		.of_match_table	= mbigen_of_match,
 		.acpi_match_table = ACPI_PTR(mbigen_acpi_match),
+		.suppress_bind_attrs = true,
 	},
 	.probe			= mbigen_device_probe,
 };

diff --git a/drivers/leds/leds-pca963x.c b/drivers/leds/leds-pca963x.c
index 5c09081..bbcde13b 100644
--- a/drivers/leds/leds-pca963x.c
+++ b/drivers/leds/leds-pca963x.c

@@ -43,6 +43,8 @@
 #define PCA963X_LED_PWM		0x2	/* Controlled through PWM */
 #define PCA963X_LED_GRP_PWM	0x3	/* Controlled through PWM/GRPPWM */
 
+#define PCA963X_MODE2_OUTDRV	0x04	/* Open-drain or totem pole */
+#define PCA963X_MODE2_INVRT	0x10	/* Normal or inverted direction */
 #define PCA963X_MODE2_DMBLNK	0x20	/* Enable blinking */
 
 #define PCA963X_MODE1		0x00
@@ -462,12 +464,12 @@
 						    PCA963X_MODE2);
 		/* Configure output: open-drain or totem pole (push-pull) */
 		if (pdata->outdrv == PCA963X_OPEN_DRAIN)
-			mode2 |= 0x01;
+			mode2 &= ~PCA963X_MODE2_OUTDRV;
 		else
-			mode2 |= 0x05;
+			mode2 |= PCA963X_MODE2_OUTDRV;
 		/* Configure direction: normal or inverted */
 		if (pdata->dir == PCA963X_INVERTED)
-			mode2 |= 0x10;
+			mode2 |= PCA963X_MODE2_INVRT;
 		i2c_smbus_write_byte_data(pca963x->chip->client, PCA963X_MODE2,
 					  mode2);
 	}

diff --git a/drivers/macintosh/therm_windtunnel.c b/drivers/macintosh/therm_windtunnel.c
index 8c74457..a0d87ed9 100644
--- a/drivers/macintosh/therm_windtunnel.c
+++ b/drivers/macintosh/therm_windtunnel.c

@@ -300,9 +300,11 @@
 /*	i2c probing and setup						*/
 /************************************************************************/
 
-static int
-do_attach( struct i2c_adapter *adapter )
+static void do_attach(struct i2c_adapter *adapter)
 {
+	struct i2c_board_info info = { };
+	struct device_node *np;
+
 	/* scan 0x48-0x4f (DS1775) and 0x2c-2x2f (ADM1030) */
 	static const unsigned short scan_ds1775[] = {
 		0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
@@ -313,25 +315,24 @@
 		I2C_CLIENT_END
 	};
 
-	if( strncmp(adapter->name, "uni-n", 5) )
-		return 0;
+	if (x.running || strncmp(adapter->name, "uni-n", 5))
+		return;
 
-	if( !x.running ) {
-		struct i2c_board_info info;
-
-		memset(&info, 0, sizeof(struct i2c_board_info));
-		strlcpy(info.type, "therm_ds1775", I2C_NAME_SIZE);
+	np = of_find_compatible_node(adapter->dev.of_node, NULL, "MAC,ds1775");
+	if (np) {
+		of_node_put(np);
+	} else {
+		strlcpy(info.type, "MAC,ds1775", I2C_NAME_SIZE);
 		i2c_new_probed_device(adapter, &info, scan_ds1775, NULL);
-
-		strlcpy(info.type, "therm_adm1030", I2C_NAME_SIZE);
-		i2c_new_probed_device(adapter, &info, scan_adm1030, NULL);
-
-		if( x.thermostat && x.fan ) {
-			x.running = 1;
-			x.poll_task = kthread_run(control_loop, NULL, "g4fand");
-		}
 	}
-	return 0;
+
+	np = of_find_compatible_node(adapter->dev.of_node, NULL, "MAC,adm1030");
+	if (np) {
+		of_node_put(np);
+	} else {
+		strlcpy(info.type, "MAC,adm1030", I2C_NAME_SIZE);
+		i2c_new_probed_device(adapter, &info, scan_adm1030, NULL);
+	}
 }
 
 static int
@@ -404,8 +405,8 @@
 enum chip { ds1775, adm1030 };
 
 static const struct i2c_device_id therm_windtunnel_id[] = {
-	{ "therm_ds1775", ds1775 },
-	{ "therm_adm1030", adm1030 },
+	{ "MAC,ds1775", ds1775 },
+	{ "MAC,adm1030", adm1030 },
 	{ }
 };
 MODULE_DEVICE_TABLE(i2c, therm_windtunnel_id);
@@ -414,6 +415,7 @@
 do_probe(struct i2c_client *cl, const struct i2c_device_id *id)
 {
 	struct i2c_adapter *adapter = cl->adapter;
+	int ret = 0;
 
 	if( !i2c_check_functionality(adapter, I2C_FUNC_SMBUS_WORD_DATA
 				     | I2C_FUNC_SMBUS_WRITE_BYTE) )
@@ -421,11 +423,19 @@
 
 	switch (id->driver_data) {
 	case adm1030:
-		return attach_fan( cl );
+		ret = attach_fan(cl);
+		break;
 	case ds1775:
-		return attach_thermostat(cl);
+		ret = attach_thermostat(cl);
+		break;
 	}
-	return 0;
+
+	if (!x.running && x.thermostat && x.fan) {
+		x.running = 1;
+		x.poll_task = kthread_run(control_loop, NULL, "g4fand");
+	}
+
+	return ret;
 }
 
 static struct i2c_driver g4fan_driver = {

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 8b8c123..9949fd9 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig

@@ -447,6 +447,18 @@
 
 	If unsure, say N.
 
+config DM_INIT
+	bool "DM \"dm-mod.create=\" parameter support"
+	depends on BLK_DEV_DM=y
+	---help---
+	Enable "dm-mod.create=" parameter to create mapped devices at init time.
+	This option is useful to allow mounting rootfs without requiring an
+	initramfs.
+	See Documentation/device-mapper/dm-init.txt for dm-mod.create="..."
+	format.
+
+	If unsure, say N.
+
 config DM_UEVENT
 	bool "DM uevents"
 	depends on BLK_DEV_DM

diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 822f4e8..a52b703 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile

@@ -69,6 +69,10 @@
 obj-$(CONFIG_DM_ZONED)		+= dm-zoned.o
 obj-$(CONFIG_DM_WRITECACHE)	+= dm-writecache.o
 
+ifeq ($(CONFIG_DM_INIT),y)
+dm-mod-objs			+= dm-init.o
+endif
+
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o
 endif

diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index c71365e..a50dcfd 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h

@@ -397,7 +397,8 @@
 
 /* Bkey utility code */
 
-#define bset_bkey_last(i)	bkey_idx((struct bkey *) (i)->d, (i)->keys)
+#define bset_bkey_last(i)	bkey_idx((struct bkey *) (i)->d, \
+					 (unsigned int)(i)->keys)
 
 static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned int idx)
 {

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index c45d9ad..5b5cbfa 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c

@@ -1226,6 +1226,9 @@
 
 	mutex_unlock(&bch_register_lock);
 
+	if (dc->sb_bio.bi_inline_vecs[0].bv_page)
+		put_page(bio_first_page_all(&dc->sb_bio));
+
 	if (!IS_ERR_OR_NULL(dc->bdev))
 		blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 

diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c
new file mode 100644
index 0000000..6f06e6b
--- /dev/null
+++ b/drivers/md/dm-init.c

@@ -0,0 +1,554 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * dm-init.c
+ * Copyright (C) 2017 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/device-mapper.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/moduleparam.h>
+
+#define DM_MSG_PREFIX "init"
+#define DM_MAX_DEVICES 256
+#define DM_MAX_TARGETS 256
+#define DM_MAX_STR_SIZE 4096
+
+static char *create;
+
+/*
+ * Format: dm-mod.create=<name>,<uuid>,<minor>,<flags>,<table>[,<table>+][;<name>,<uuid>,<minor>,<flags>,<table>[,<table>+]+]
+ * Table format: <start_sector> <num_sectors> <target_type> <target_args>
+ *
+ * See Documentation/device-mapper/dm-init.txt for dm-mod.create="..." format
+ * details.
+ */
+
+struct dm_device {
+	struct dm_ioctl dmi;
+	struct dm_target_spec *table[DM_MAX_TARGETS];
+	char *target_args_array[DM_MAX_TARGETS];
+	struct list_head list;
+};
+
+const char * const dm_allowed_targets[] __initconst = {
+	"crypt",
+	"delay",
+	"linear",
+	"snapshot-origin",
+	"striped",
+	"verity",
+};
+
+static int __init dm_verify_target_type(const char *target)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(dm_allowed_targets); i++) {
+		if (!strcmp(dm_allowed_targets[i], target))
+			return 0;
+	}
+	return -EINVAL;
+}
+
+static void __init dm_setup_cleanup(struct list_head *devices)
+{
+	struct dm_device *dev, *tmp;
+	unsigned int i;
+
+	list_for_each_entry_safe(dev, tmp, devices, list) {
+		list_del(&dev->list);
+		for (i = 0; i < dev->dmi.target_count; i++) {
+			kfree(dev->table[i]);
+			kfree(dev->target_args_array[i]);
+		}
+		kfree(dev);
+	}
+}
+
+/**
+ * str_field_delimit - delimit a string based on a separator char.
+ * @str: the pointer to the string to delimit.
+ * @separator: char that delimits the field
+ *
+ * Find a @separator and replace it by '\0'.
+ * Remove leading and trailing spaces.
+ * Return the remainder string after the @separator.
+ */
+static char __init *str_field_delimit(char **str, char separator)
+{
+	char *s;
+
+	/* TODO: add support for escaped characters */
+	*str = skip_spaces(*str);
+	s = strchr(*str, separator);
+	/* Delimit the field and remove trailing spaces */
+	if (s)
+		*s = '\0';
+	*str = strim(*str);
+	return s ? ++s : NULL;
+}
+
+/**
+ * dm_parse_table_entry - parse a table entry
+ * @dev: device to store the parsed information.
+ * @str: the pointer to a string with the format:
+ *	<start_sector> <num_sectors> <target_type> <target_args>[, ...]
+ *
+ * Return the remainder string after the table entry, i.e, after the comma which
+ * delimits the entry or NULL if reached the end of the string.
+ */
+static char __init *dm_parse_table_entry(struct dm_device *dev, char *str)
+{
+	const unsigned int n = dev->dmi.target_count - 1;
+	struct dm_target_spec *sp;
+	unsigned int i;
+	/* fields:  */
+	char *field[4];
+	char *next;
+
+	field[0] = str;
+	/* Delimit first 3 fields that are separated by space */
+	for (i = 0; i < ARRAY_SIZE(field) - 1; i++) {
+		field[i + 1] = str_field_delimit(&field[i], ' ');
+		if (!field[i + 1])
+			return ERR_PTR(-EINVAL);
+	}
+	/* Delimit last field that can be terminated by comma */
+	next = str_field_delimit(&field[i], ',');
+
+	sp = kzalloc(sizeof(*sp), GFP_KERNEL);
+	if (!sp)
+		return ERR_PTR(-ENOMEM);
+	dev->table[n] = sp;
+
+	/* start_sector */
+	if (kstrtoull(field[0], 0, &sp->sector_start))
+		return ERR_PTR(-EINVAL);
+	/* num_sector */
+	if (kstrtoull(field[1], 0, &sp->length))
+		return ERR_PTR(-EINVAL);
+	/* target_type */
+	strscpy(sp->target_type, field[2], sizeof(sp->target_type));
+	if (dm_verify_target_type(sp->target_type)) {
+		DMERR("invalid type \"%s\"", sp->target_type);
+		return ERR_PTR(-EINVAL);
+	}
+	/* target_args */
+	dev->target_args_array[n] = kstrndup(field[3], GFP_KERNEL,
+					     DM_MAX_STR_SIZE);
+	if (!dev->target_args_array[n])
+		return ERR_PTR(-ENOMEM);
+
+	return next;
+}
+
+/**
+ * dm_parse_table - parse "dm-mod.create=" table field
+ * @dev: device to store the parsed information.
+ * @str: the pointer to a string with the format:
+ *	<table>[,<table>+]
+ */
+static int __init dm_parse_table(struct dm_device *dev, char *str)
+{
+	char *table_entry = str;
+
+	while (table_entry) {
+		DMDEBUG("parsing table \"%s\"", str);
+		if (++dev->dmi.target_count > DM_MAX_TARGETS) {
+			DMERR("too many targets %u > %d",
+			      dev->dmi.target_count, DM_MAX_TARGETS);
+			return -EINVAL;
+		}
+		table_entry = dm_parse_table_entry(dev, table_entry);
+		if (IS_ERR(table_entry)) {
+			DMERR("couldn't parse table");
+			return PTR_ERR(table_entry);
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * dm_parse_device_entry - parse a device entry
+ * @dev: device to store the parsed information.
+ * @str: the pointer to a string with the format:
+ *	name,uuid,minor,flags,table[; ...]
+ *
+ * Return the remainder string after the table entry, i.e, after the semi-colon
+ * which delimits the entry or NULL if reached the end of the string.
+ */
+static char __init *dm_parse_device_entry(struct dm_device *dev, char *str)
+{
+	/* There are 5 fields: name,uuid,minor,flags,table; */
+	char *field[5];
+	unsigned int i;
+	char *next;
+
+	field[0] = str;
+	/* Delimit first 4 fields that are separated by comma */
+	for (i = 0; i < ARRAY_SIZE(field) - 1; i++) {
+		field[i+1] = str_field_delimit(&field[i], ',');
+		if (!field[i+1])
+			return ERR_PTR(-EINVAL);
+	}
+	/* Delimit last field that can be delimited by semi-colon */
+	next = str_field_delimit(&field[i], ';');
+
+	/* name */
+	strscpy(dev->dmi.name, field[0], sizeof(dev->dmi.name));
+	/* uuid */
+	strscpy(dev->dmi.uuid, field[1], sizeof(dev->dmi.uuid));
+	/* minor */
+	if (strlen(field[2])) {
+		if (kstrtoull(field[2], 0, &dev->dmi.dev))
+			return ERR_PTR(-EINVAL);
+		dev->dmi.flags |= DM_PERSISTENT_DEV_FLAG;
+	}
+	/* flags */
+	if (!strcmp(field[3], "ro"))
+		dev->dmi.flags |= DM_READONLY_FLAG;
+	else if (strcmp(field[3], "rw"))
+		return ERR_PTR(-EINVAL);
+	/* table */
+	if (dm_parse_table(dev, field[4]))
+		return ERR_PTR(-EINVAL);
+
+	return next;
+}
+
+/**
+ * dm_parse_devices - parse "dm-mod.create=" argument
+ * @devices: list of struct dm_device to store the parsed information.
+ * @str: the pointer to a string with the format:
+ *	<device>[;<device>+]
+ */
+static int __init dm_parse_devices(struct list_head *devices, char *str)
+{
+	unsigned long ndev = 0;
+	struct dm_device *dev;
+	char *device = str;
+
+	DMDEBUG("parsing \"%s\"", str);
+	while (device) {
+		dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+		if (!dev)
+			return -ENOMEM;
+		list_add_tail(&dev->list, devices);
+
+		if (++ndev > DM_MAX_DEVICES) {
+			DMERR("too many devices %lu > %d",
+			      ndev, DM_MAX_DEVICES);
+			return -EINVAL;
+		}
+
+		device = dm_parse_device_entry(dev, device);
+		if (IS_ERR(device)) {
+			DMERR("couldn't parse device");
+			return PTR_ERR(device);
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * dm_init_init - parse "dm-mod.create=" argument and configure drivers
+ */
+static int __init dm_init_init(void)
+{
+	struct dm_device *dev;
+	LIST_HEAD(devices);
+	char *str;
+	int r;
+
+	if (!create)
+		return 0;
+
+	if (strlen(create) >= DM_MAX_STR_SIZE) {
+		DMERR("Argument is too big. Limit is %d\n", DM_MAX_STR_SIZE);
+		return -EINVAL;
+	}
+	str = kstrndup(create, GFP_KERNEL, DM_MAX_STR_SIZE);
+	if (!str)
+		return -ENOMEM;
+
+	r = dm_parse_devices(&devices, str);
+	if (r)
+		goto out;
+
+	DMINFO("waiting for all devices to be available before creating mapped devices\n");
+	wait_for_device_probe();
+
+	list_for_each_entry(dev, &devices, list) {
+		if (dm_early_create(&dev->dmi, dev->table,
+				    dev->target_args_array))
+			break;
+	}
+out:
+	kfree(str);
+	dm_setup_cleanup(&devices);
+	return r;
+}
+
+late_initcall(dm_init_init);
+
+module_param(create, charp, 0);
+MODULE_PARM_DESC(create, "Create a mapped device in early boot");
+
+/* ---------------------------------------------------------------
+ * ChromeOS shim - convert dm= format to dm-mod.create= format
+ * ---------------------------------------------------------------
+ */
+
+struct dm_chrome_target {
+	char *field[4];
+};
+
+struct dm_chrome_dev {
+	char *name, *uuid, *mode;
+	unsigned int num_targets;
+	struct dm_chrome_target targets[DM_MAX_TARGETS];
+};
+
+static char __init *dm_chrome_parse_target(char *str, struct dm_chrome_target *tgt)
+{
+	unsigned int i;
+
+	tgt->field[0] = str;
+	/* Delimit first 3 fields that are separated by space */
+	for (i = 0; i < ARRAY_SIZE(tgt->field) - 1; i++) {
+		tgt->field[i + 1] = str_field_delimit(&tgt->field[i], ' ');
+		if (!tgt->field[i + 1])
+			return NULL;
+	}
+	/* Delimit last field that can be terminated by comma */
+	return str_field_delimit(&tgt->field[i], ',');
+}
+
+static char __init *dm_chrome_parse_dev(char *str, struct dm_chrome_dev *dev)
+{
+	char *target, *num;
+	unsigned int i;
+
+	if (!str)
+		return ERR_PTR(-EINVAL);
+
+	target = str_field_delimit(&str, ',');
+	if (!target)
+		return ERR_PTR(-EINVAL);
+
+	/* Delimit first 3 fields that are separated by space */
+	dev->name = str;
+	dev->uuid = str_field_delimit(&dev->name, ' ');
+	if (!dev->uuid)
+		return ERR_PTR(-EINVAL);
+
+	dev->mode = str_field_delimit(&dev->uuid, ' ');
+	if (!dev->mode)
+		return ERR_PTR(-EINVAL);
+
+	/* num is optional */
+	num = str_field_delimit(&dev->mode, ' ');
+	if (!num)
+		dev->num_targets = 1;
+	else {
+		/* Delimit num and check if it the last field */
+		if(str_field_delimit(&num, ' '))
+			return ERR_PTR(-EINVAL);
+		if (kstrtouint(num, 0, &dev->num_targets))
+			return ERR_PTR(-EINVAL);
+	}
+
+	if (dev->num_targets > DM_MAX_TARGETS) {
+		DMERR("too many targets %u > %d",
+		      dev->num_targets, DM_MAX_TARGETS);
+		return ERR_PTR(-EINVAL);
+	}
+
+	for (i = 0; i < dev->num_targets - 1; i++) {
+		target = dm_chrome_parse_target(target, &dev->targets[i]);
+		if (!target)
+			return ERR_PTR(-EINVAL);
+	}
+	/* The last one can return NULL if it reaches the end of str */
+	return dm_chrome_parse_target(target, &dev->targets[i]);
+}
+
+static char __init *dm_chrome_convert(struct dm_chrome_dev *devs, unsigned int num_devs)
+{
+	char *str = kmalloc(DM_MAX_STR_SIZE, GFP_KERNEL);
+	char *p = str;
+	unsigned int i, j;
+	int ret;
+
+	if (!str)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < num_devs; i++) {
+		if (!strcmp(devs[i].uuid, "none"))
+			devs[i].uuid = "";
+		ret = snprintf(p, DM_MAX_STR_SIZE - (p - str),
+			       "%s,%s,,%s",
+			       devs[i].name,
+			       devs[i].uuid,
+			       devs[i].mode);
+		if (ret < 0)
+			goto out;
+		p += ret;
+
+		for (j = 0; j < devs[i].num_targets; j++) {
+			ret = snprintf(p, DM_MAX_STR_SIZE - (p - str),
+				       ",%s %s %s %s",
+				       devs[i].targets[j].field[0],
+				       devs[i].targets[j].field[1],
+				       devs[i].targets[j].field[2],
+				       devs[i].targets[j].field[3]);
+			if (ret < 0)
+				goto out;
+			p += ret;
+		}
+		if (i < num_devs - 1) {
+			ret = snprintf(p, DM_MAX_STR_SIZE - (p - str), ";");
+			if (ret < 0)
+				goto out;
+			p += ret;
+		}
+	}
+
+	return str;
+
+out:
+	kfree(str);
+	return ERR_PTR(ret);
+}
+
+/**
+ * dm_chrome_shim - convert old dm= format used in chromeos to the new
+ * upstream format.
+ *
+ * ChromeOS old format
+ * -------------------
+ * <device>        ::= [<num>] <device-mapper>+
+ * <device-mapper> ::= <head> "," <target>+
+ * <head>          ::= <name> <uuid> <mode> [<num>]
+ * <target>        ::= <start> <length> <type> <options> ","
+ * <mode>          ::= "ro" | "rw"
+ * <uuid>          ::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | "none"
+ * <type>          ::= "verity" | "bootcache" | ...
+ *
+ * Example:
+ * 2 vboot none ro 1,
+ *     0 1768000 bootcache
+ *       device=aa55b119-2a47-8c45-946a-5ac57765011f+1
+ *       signature=76e9be054b15884a9fa85973e9cb274c93afadb6
+ *       cache_start=1768000 max_blocks=100000 size_limit=23 max_trace=20000,
+ *   vroot none ro 1,
+ *     0 1740800 verity payload=254:0 hashtree=254:0 hashstart=1740800 alg=sha1
+ *       root_hexdigest=76e9be054b15884a9fa85973e9cb274c93afadb6
+ *       salt=5b3549d54d6c7a3837b9b81ed72e49463a64c03680c47835bef94d768e5646fe
+ *
+ * Notes:
+ *  1. uuid is a label for the device and we set it to "none".
+ *  2. The <num> field will be optional initially and assumed to be 1.
+ *     Once all the scripts that set these fields have been set, it will
+ *     be made mandatory.
+ */
+
+static char *chrome_create;
+
+static int __init dm_chrome_shim(char *arg) {
+	if (!arg || create)
+		return -EINVAL;
+	chrome_create = arg;
+	return 0;
+}
+
+static int __init dm_chrome_parse_devices(void)
+{
+	struct dm_chrome_dev *devs;
+	unsigned int num_devs, i;
+	char *next, *base_str;
+	int ret = 0;
+
+	/* Verify if dm-mod.create was not used */
+	if (!chrome_create || create)
+		return -EINVAL;
+
+	if (strlen(chrome_create) >= DM_MAX_STR_SIZE) {
+		DMERR("Argument is too big. Limit is %d\n", DM_MAX_STR_SIZE);
+		return -EINVAL;
+	}
+
+	base_str = kstrdup(chrome_create, GFP_KERNEL);
+	if (!base_str)
+		return -ENOMEM;
+
+	next = str_field_delimit(&base_str, ' ');
+	if (!next) {
+		ret = -EINVAL;
+		goto out_str;
+	}
+
+	/* if first field is not the optional <num> field */
+	if (kstrtouint(base_str, 0, &num_devs)) {
+		num_devs = 1;
+		/* rewind next pointer */
+		next = base_str;
+	}
+
+	if (num_devs > DM_MAX_DEVICES) {
+		DMERR("too many devices %u > %d", num_devs, DM_MAX_DEVICES);
+		ret = -EINVAL;
+		goto out_str;
+	}
+
+	devs = kcalloc(num_devs, sizeof(*devs), GFP_KERNEL);
+	if (!devs)
+		return -ENOMEM;
+
+	/* restore string */
+	strcpy(base_str, chrome_create);
+
+	/* parse devices */
+	for (i = 0; i < num_devs; i++) {
+		next = dm_chrome_parse_dev(next, &devs[i]);
+		if (IS_ERR(next)) {
+			DMERR("couldn't parse device");
+			ret = PTR_ERR(next);
+			goto out_devs;
+		}
+	}
+
+	create = dm_chrome_convert(devs, num_devs);
+	if (IS_ERR(create)) {
+		ret = PTR_ERR(create);
+		goto out_devs;
+	}
+
+	DMDEBUG("Converting:\n\tdm=\"%s\"\n\tdm-mod.create=\"%s\"\n",
+		chrome_create, create);
+
+	/* Call upstream code */
+	dm_init_init();
+
+	kfree(create);
+
+out_devs:
+	create = NULL;
+	kfree(devs);
+out_str:
+	kfree(base_str);
+
+	return ret;
+}
+
+late_initcall(dm_chrome_parse_devices);
+
+__setup("dm=", dm_chrome_shim);

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index f666778..1e03bc8 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c

@@ -2018,3 +2018,110 @@
 
 	return r;
 }
+
+
+/**
+ * dm_early_create - create a mapped device in early boot.
+ *
+ * @dmi: Contains main information of the device mapping to be created.
+ * @spec_array: array of pointers to struct dm_target_spec. Describes the
+ * mapping table of the device.
+ * @target_params_array: array of strings with the parameters to a specific
+ * target.
+ *
+ * Instead of having the struct dm_target_spec and the parameters for every
+ * target embedded at the end of struct dm_ioctl (as performed in a normal
+ * ioctl), pass them as arguments, so the caller doesn't need to serialize them.
+ * The size of the spec_array and target_params_array is given by
+ * @dmi->target_count.
+ * This function is supposed to be called in early boot, so locking mechanisms
+ * to protect against concurrent loads are not required.
+ */
+int __init dm_early_create(struct dm_ioctl *dmi,
+			   struct dm_target_spec **spec_array,
+			   char **target_params_array)
+{
+	int r, m = DM_ANY_MINOR;
+	struct dm_table *t, *old_map;
+	struct mapped_device *md;
+	unsigned int i;
+
+	if (!dmi->target_count)
+		return -EINVAL;
+
+	r = check_name(dmi->name);
+	if (r)
+		return r;
+
+	if (dmi->flags & DM_PERSISTENT_DEV_FLAG)
+		m = MINOR(huge_decode_dev(dmi->dev));
+
+	/* alloc dm device */
+	r = dm_create(m, &md);
+	if (r)
+		return r;
+
+	/* hash insert */
+	r = dm_hash_insert(dmi->name, *dmi->uuid ? dmi->uuid : NULL, md);
+	if (r)
+		goto err_destroy_dm;
+
+	/* alloc table */
+	r = dm_table_create(&t, get_mode(dmi), dmi->target_count, md);
+	if (r)
+		goto err_hash_remove;
+
+	/* add targets */
+	for (i = 0; i < dmi->target_count; i++) {
+		r = dm_table_add_target(t, spec_array[i]->target_type,
+					(sector_t) spec_array[i]->sector_start,
+					(sector_t) spec_array[i]->length,
+					target_params_array[i]);
+		if (r) {
+			DMWARN("error adding target to table");
+			goto err_destroy_table;
+		}
+	}
+
+	/* finish table */
+	r = dm_table_complete(t);
+	if (r)
+		goto err_destroy_table;
+
+	md->type = dm_table_get_type(t);
+	/* setup md->queue to reflect md's type (may block) */
+	r = dm_setup_md_queue(md, t);
+	if (r) {
+		DMWARN("unable to set up device queue for new table.");
+		goto err_destroy_table;
+	}
+
+	/* Set new map */
+	dm_suspend(md, 0);
+	old_map = dm_swap_table(md, t);
+	if (IS_ERR(old_map)) {
+		r = PTR_ERR(old_map);
+		goto err_destroy_table;
+	}
+	set_disk_ro(dm_disk(md), !!(dmi->flags & DM_READONLY_FLAG));
+
+	/* resume device */
+	r = dm_resume(md);
+	if (r)
+		goto err_destroy_table;
+
+	DMINFO("%s (%s) is ready", md->disk->disk_name, dmi->name);
+	dm_put(md);
+	return 0;
+
+err_destroy_table:
+	dm_table_destroy(t);
+err_hash_remove:
+	(void) __hash_remove(__get_name_cell(dmi->name));
+	/* release reference from __get_name_cell */
+	dm_put(md);
+err_destroy_dm:
+	dm_put(md);
+	dm_destroy(md);
+	return r;
+}

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 36275c5..eed37b6 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c

@@ -882,13 +882,25 @@
 }
 EXPORT_SYMBOL_GPL(dm_table_set_type);
 
-static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
-			       sector_t start, sector_t len, void *data)
+/* validate the dax capability of the target device span */
+int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
+				       sector_t start, sector_t len, void *data)
 {
-	return bdev_dax_supported(dev->bdev, PAGE_SIZE);
+	int blocksize = *(int *) data;
+
+	return generic_fsdax_supported(dev->dax_dev, dev->bdev, blocksize,
+			start, len);
 }
 
-static bool dm_table_supports_dax(struct dm_table *t)
+/* Check devices support synchronous DAX */
+static int device_synchronous(struct dm_target *ti, struct dm_dev *dev,
+				       sector_t start, sector_t len, void *data)
+{
+	return dev->dax_dev && dax_synchronous(dev->dax_dev);
+}
+
+bool dm_table_supports_dax(struct dm_table *t,
+			  iterate_devices_callout_fn iterate_fn, int *blocksize)
 {
 	struct dm_target *ti;
 	unsigned i;
@@ -901,7 +913,7 @@
 			return false;
 
 		if (!ti->type->iterate_devices ||
-		    !ti->type->iterate_devices(ti, device_supports_dax, NULL))
+			!ti->type->iterate_devices(ti, iterate_fn, blocksize))
 			return false;
 	}
 
@@ -937,6 +949,7 @@
 	struct dm_target *tgt;
 	struct list_head *devices = dm_table_get_devices(t);
 	enum dm_queue_mode live_md_type = dm_get_md_type(t->md);
+	int page_size = PAGE_SIZE;
 
 	if (t->type != DM_TYPE_NONE) {
 		/* target already set the table's type */
@@ -981,7 +994,7 @@
 verify_bio_based:
 		/* We must use this table as bio-based */
 		t->type = DM_TYPE_BIO_BASED;
-		if (dm_table_supports_dax(t) ||
+		if (dm_table_supports_dax(t, device_supports_dax, &page_size) ||
 		    (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) {
 			t->type = DM_TYPE_DAX_BIO_BASED;
 		} else {
@@ -1909,6 +1922,7 @@
 			       struct queue_limits *limits)
 {
 	bool wc = false, fua = false;
+	int page_size = PAGE_SIZE;
 
 	/*
 	 * Copy table's limits to the DM device's request_queue
@@ -1936,8 +1950,11 @@
 	}
 	blk_queue_write_cache(q, wc, fua);
 
-	if (dm_table_supports_dax(t))
+	if (dm_table_supports_dax(t, device_supports_dax, &page_size)) {
 		blk_queue_flag_set(QUEUE_FLAG_DAX, q);
+		if (dm_table_supports_dax(t, device_synchronous, NULL))
+			set_dax_synchronous(t->md->dax_dev);
+	}
 	else
 		blk_queue_flag_clear(QUEUE_FLAG_DAX, q);
 

diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 684af08..8306ee0 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c

@@ -11,6 +11,7 @@
 
 #include "dm-verity-fec.h"
 #include <linux/math64.h>
+#include <linux/sysfs.h>
 
 #define DM_MSG_PREFIX	"verity-fec"
 
@@ -175,9 +176,11 @@
 	if (r < 0 && neras)
 		DMERR_LIMIT("%s: FEC %llu: failed to correct: %d",
 			    v->data_dev->name, (unsigned long long)rsb, r);
-	else if (r > 0)
+	else if (r > 0) {
 		DMWARN_LIMIT("%s: FEC %llu: corrected %d errors",
 			     v->data_dev->name, (unsigned long long)rsb, r);
+		atomic_add_unless(&v->fec->corrected, 1, INT_MAX);
+	}
 
 	return r;
 }
@@ -545,6 +548,7 @@
 void verity_fec_dtr(struct dm_verity *v)
 {
 	struct dm_verity_fec *f = v->fec;
+	struct kobject *kobj = &f->kobj_holder.kobj;
 
 	if (!verity_fec_is_enabled(v))
 		goto out;
@@ -561,6 +565,12 @@
 
 	if (f->dev)
 		dm_put_device(v->ti, f->dev);
+
+	if (kobj->state_initialized) {
+		kobject_put(kobj);
+		wait_for_completion(dm_get_completion_from_kobject(kobj));
+	}
+
 out:
 	kfree(f);
 	v->fec = NULL;
@@ -649,6 +659,28 @@
 	return 0;
 }
 
+static ssize_t corrected_show(struct kobject *kobj, struct kobj_attribute *attr,
+			      char *buf)
+{
+	struct dm_verity_fec *f = container_of(kobj, struct dm_verity_fec,
+					       kobj_holder.kobj);
+
+	return sprintf(buf, "%d\n", atomic_read(&f->corrected));
+}
+
+static struct kobj_attribute attr_corrected = __ATTR_RO(corrected);
+
+static struct attribute *fec_attrs[] = {
+	&attr_corrected.attr,
+	NULL
+};
+
+static struct kobj_type fec_ktype = {
+	.sysfs_ops = &kobj_sysfs_ops,
+	.default_attrs = fec_attrs,
+	.release = dm_kobject_release
+};
+
 /*
  * Allocate dm_verity_fec for v->fec. Must be called before verity_fec_ctr.
  */
@@ -672,8 +704,10 @@
  */
 int verity_fec_ctr(struct dm_verity *v)
 {
+	int r;
 	struct dm_verity_fec *f = v->fec;
 	struct dm_target *ti = v->ti;
+	struct mapped_device *md = dm_table_get_md(ti->table);
 	u64 hash_blocks;
 	int ret;
 
@@ -682,6 +716,16 @@
 		return 0;
 	}
 
+	/* Create a kobject and sysfs attributes */
+	init_completion(&f->kobj_holder.completion);
+
+	r = kobject_init_and_add(&f->kobj_holder.kobj, &fec_ktype,
+				 &disk_to_dev(dm_disk(md))->kobj, "%s", "fec");
+	if (r) {
+		ti->error = "Cannot create kobject";
+		return r;
+	}
+
 	/*
 	 * FEC is computed over data blocks, possible metadata, and
 	 * hash blocks. In other words, FEC covers total of fec_blocks

diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h
index 6ad803b..93af417 100644
--- a/drivers/md/dm-verity-fec.h
+++ b/drivers/md/dm-verity-fec.h

@@ -12,6 +12,8 @@
 #ifndef DM_VERITY_FEC_H
 #define DM_VERITY_FEC_H
 
+#include "dm.h"
+#include "dm-core.h"
 #include "dm-verity.h"
 #include <linux/rslib.h>
 
@@ -51,6 +53,8 @@
 	mempool_t extra_pool;	/* mempool for extra buffers */
 	mempool_t output_pool;	/* mempool for output */
 	struct kmem_cache *cache;	/* cache for buffers */
+	atomic_t corrected;		/* corrected errors */
+	struct dm_kobject_holder kobj_holder;	/* for sysfs attributes */
 };
 
 /* per-bio data */

diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index e3599b4..6332a83 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c

@@ -17,8 +17,12 @@
 #include "dm-verity.h"
 #include "dm-verity-fec.h"
 
+#include <linux/async.h>
+#include <linux/delay.h>
+#include <linux/device-mapper.h>
 #include <linux/module.h>
 #include <linux/reboot.h>
+#include <crypto/hash.h>
 
 #define DM_MSG_PREFIX			"verity"
 
@@ -28,6 +32,7 @@
 #define DM_VERITY_DEFAULT_PREFETCH_SIZE	262144
 
 #define DM_VERITY_MAX_CORRUPTED_ERRS	100
+#define DM_VERITY_NUM_POSITIONAL_ARGS	10
 
 #define DM_VERITY_OPT_LOGGING		"ignore_corruption"
 #define DM_VERITY_OPT_RESTART		"restart_on_corruption"
@@ -47,6 +52,118 @@
 	unsigned n_blocks;
 };
 
+/* Provide a lightweight means of specifying the global default for
+ * error behavior: eio, reboot, or none
+ * Legacy support for 0 = eio, 1 = reboot/panic, 2 = none, 3 = notify.
+ * This is matched to the enum in dm-verity.h.
+ */
+static const char *allowed_error_behaviors[] = { "eio", "panic", "none",
+						 "notify", NULL };
+static char *error_behavior = "eio";
+module_param(error_behavior, charp, 0644);
+MODULE_PARM_DESC(error_behavior, "Behavior on error "
+				 "(eio, panic, none, notify)");
+
+/* Controls whether verity_get_device will wait forever for a device. */
+static int dev_wait;
+module_param(dev_wait, int, 0444);
+MODULE_PARM_DESC(dev_wait, "Wait forever for a backing device");
+
+static BLOCKING_NOTIFIER_HEAD(verity_error_notifier);
+
+int dm_verity_register_error_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&verity_error_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(dm_verity_register_error_notifier);
+
+int dm_verity_unregister_error_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&verity_error_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(dm_verity_unregister_error_notifier);
+
+/* If the request is not successful, this handler takes action.
+ * TODO make this call a registered handler.
+ */
+static void verity_error(struct dm_verity *v, struct dm_verity_io *io,
+			 blk_status_t status)
+{
+	const char *message = v->hash_failed ? "integrity" : "block";
+	int error_behavior = DM_VERITY_ERROR_BEHAVIOR_PANIC;
+	dev_t devt = 0;
+	u64 block = ~0;
+	struct dm_verity_error_state error_state;
+	/* If the hash did not fail, then this is likely transient. */
+	int transient = !v->hash_failed;
+
+	devt = v->data_dev->bdev->bd_dev;
+	error_behavior = v->error_behavior;
+
+	DMERR_LIMIT("verification failure occurred: %s failure", message);
+
+	if (error_behavior == DM_VERITY_ERROR_BEHAVIOR_NOTIFY) {
+		error_state.code = status;
+		error_state.transient = transient;
+		error_state.block = block;
+		error_state.message = message;
+		error_state.dev_start = v->data_start;
+		error_state.dev_len = v->data_blocks;
+		error_state.dev = v->data_dev->bdev;
+		error_state.hash_dev_start = v->hash_start;
+		error_state.hash_dev_len = v->hash_blocks;
+		error_state.hash_dev = v->hash_dev->bdev;
+
+		/* Set default fallthrough behavior. */
+		error_state.behavior = DM_VERITY_ERROR_BEHAVIOR_PANIC;
+		error_behavior = DM_VERITY_ERROR_BEHAVIOR_PANIC;
+
+		if (!blocking_notifier_call_chain(
+		    &verity_error_notifier, transient, &error_state)) {
+			error_behavior = error_state.behavior;
+		}
+	}
+
+	switch (error_behavior) {
+	case DM_VERITY_ERROR_BEHAVIOR_EIO:
+		break;
+	case DM_VERITY_ERROR_BEHAVIOR_NONE:
+		break;
+	default:
+		if (!transient)
+			goto do_panic;
+	}
+	return;
+
+do_panic:
+	panic("dm-verity failure: "
+	      "device:%u:%u status:%d block:%llu message:%s",
+	      MAJOR(devt), MINOR(devt), status, (u64)block, message);
+}
+
+/**
+ * verity_parse_error_behavior - parse a behavior charp to the enum
+ * @behavior:	NUL-terminated char array
+ *
+ * Checks if the behavior is valid either as text or as an index digit
+ * and returns the proper enum value or -1 on error.
+ */
+static int verity_parse_error_behavior(const char *behavior)
+{
+	const char **allowed = allowed_error_behaviors;
+	char index = '0';
+
+	for (; *allowed; allowed++, index++)
+		if (!strcmp(*allowed, behavior) || behavior[0] == index)
+			break;
+
+	if (!*allowed)
+		return -1;
+
+	/* Convert to the integer index matching the enum. */
+	return allowed - allowed_error_behaviors;
+}
+
 /*
  * Auxiliary structure appended to each dm-bufio buffer. If the value
  * hash_verified is nonzero, hash of the block has been verified.
@@ -541,6 +658,8 @@
 	struct dm_verity *v = io->v;
 	struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
 
+	if (status && !verity_fec_is_enabled(io->v))
+		verity_error(v, io, status);
 	bio->bi_end_io = io->orig_bi_end_io;
 	bio->bi_status = status;
 
@@ -564,7 +683,6 @@
 		verity_finish_io(io, bio->bi_status);
 		return;
 	}
-
 	INIT_WORK(&io->work, verity_work);
 	queue_work(io->v->verify_wq, &io->work);
 }
@@ -913,6 +1031,187 @@
 	return r;
 }
 
+static int verity_get_device(struct dm_target *ti, const char *devname,
+			     struct dm_dev **dm_dev)
+{
+	do {
+		/* Try the normal path first since if everything is ready, it
+		 * will be the fastest.
+		 */
+		if (!dm_get_device(ti, devname, /*FMODE_READ*/
+				   dm_table_get_mode(ti->table), dm_dev))
+			return 0;
+
+		/* No need to be too aggressive since this is a slow path. */
+		msleep(500);
+	} while (dev_wait && (driver_probe_done() != 0 || *dm_dev == NULL));
+	async_synchronize_full();
+	return -1;
+}
+
+struct verity_args {
+	int version;
+	char *data_device;
+	char *hash_device;
+	int data_block_size_bits;
+	int hash_block_size_bits;
+	u64 num_data_blocks;
+	u64 hash_start_block;
+	char *algorithm;
+	char *digest;
+	char *salt;
+	char *error_behavior;
+};
+
+static void pr_args(struct verity_args *args)
+{
+	printk(KERN_INFO "VERITY args: version=%d data_device=%s hash_device=%s"
+		" data_block_size_bits=%d hash_block_size_bits=%d"
+		" num_data_blocks=%lld hash_start_block=%lld"
+		" algorithm=%s digest=%s salt=%s error_behavior=%s\n",
+		args->version,
+		args->data_device,
+		args->hash_device,
+		args->data_block_size_bits,
+		args->hash_block_size_bits,
+		args->num_data_blocks,
+		args->hash_start_block,
+		args->algorithm,
+		args->digest,
+		args->salt,
+		args->error_behavior);
+}
+
+/*
+ * positional_args - collects the argments using the positional
+ * parameters.
+ * arg# - parameter
+ *    0 - version
+ *    1 - data device
+ *    2 - hash device - may be same as data device
+ *    3 - data block size log2
+ *    4 - hash block size log2
+ *    5 - number of data blocks
+ *    6 - hash start block
+ *    7 - algorithm
+ *    8 - digest
+ *    9 - salt
+ */
+static char *positional_args(unsigned argc, char **argv,
+				struct verity_args *args)
+{
+	unsigned num;
+	unsigned long long num_ll;
+	char dummy;
+
+	if (argc < DM_VERITY_NUM_POSITIONAL_ARGS)
+		return "Invalid argument count: at least 10 arguments required";
+
+	if (sscanf(argv[0], "%d%c", &num, &dummy) != 1 ||
+	    num < 0 || num > 1)
+		return "Invalid version";
+	args->version = num;
+
+	args->data_device = argv[1];
+	args->hash_device = argv[2];
+
+
+	if (sscanf(argv[3], "%u%c", &num, &dummy) != 1 ||
+	    !num || (num & (num - 1)) ||
+	    num > PAGE_SIZE)
+		return "Invalid data device block size";
+	args->data_block_size_bits = ffs(num) - 1;
+
+	if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 ||
+	    !num || (num & (num - 1)) ||
+	    num > INT_MAX)
+		return "Invalid hash device block size";
+	args->hash_block_size_bits = ffs(num) - 1;
+
+	if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 ||
+	    (sector_t)(num_ll << (args->data_block_size_bits - SECTOR_SHIFT))
+	    >> (args->data_block_size_bits - SECTOR_SHIFT) != num_ll)
+		return "Invalid data blocks";
+	args->num_data_blocks = num_ll;
+
+
+	if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 ||
+	    (sector_t)(num_ll << (args->hash_block_size_bits - SECTOR_SHIFT))
+	    >> (args->hash_block_size_bits - SECTOR_SHIFT) != num_ll)
+		return "Invalid hash start";
+	args->hash_start_block = num_ll;
+
+
+	args->algorithm = argv[7];
+	args->digest = argv[8];
+	args->salt = argv[9];
+
+	return NULL;
+}
+
+static void splitarg(char *arg, char **key, char **val)
+{
+	*key = strsep(&arg, "=");
+	*val = strsep(&arg, "");
+}
+
+static char *chromeos_args(unsigned argc, char **argv, struct verity_args *args)
+{
+	char *key, *val;
+	unsigned long num;
+	int i;
+
+	args->version = 0;
+	args->data_block_size_bits = 12;
+	args->hash_block_size_bits = 12;
+	for (i = 0; i < argc; ++i) {
+		DMDEBUG("Argument %d: '%s'", i, argv[i]);
+		splitarg(argv[i], &key, &val);
+		if (!key) {
+			DMWARN("Bad argument %d: missing key?", i);
+			return "Bad argument: missing key";
+		}
+		if (!val) {
+			DMWARN("Bad argument %d='%s': missing value", i, key);
+			return "Bad argument: missing value";
+		}
+		if (!strcmp(key, "alg")) {
+			args->algorithm = val;
+		} else if (!strcmp(key, "payload")) {
+			args->data_device = val;
+		} else if (!strcmp(key, "hashtree")) {
+			args->hash_device = val;
+		} else if (!strcmp(key, "root_hexdigest")) {
+			args->digest = val;
+		} else if (!strcmp(key, "hashstart")) {
+			if (kstrtoul(val, 10, &num))
+				return "Invalid hashstart";
+			args->hash_start_block =
+				num >> (args->hash_block_size_bits - SECTOR_SHIFT);
+			args->num_data_blocks = args->hash_start_block;
+		} else if (!strcmp(key, "error_behavior")) {
+			args->error_behavior = val;
+		} else if (!strcmp(key, "salt")) {
+			args->salt = val;
+		}
+	}
+	if (!args->salt)
+		args->salt = "";
+
+#define NEEDARG(n) \
+	if (!(n)) { \
+		return "Missing argument: " #n; \
+	}
+
+	NEEDARG(args->algorithm);
+	NEEDARG(args->data_device);
+	NEEDARG(args->hash_device);
+	NEEDARG(args->digest);
+
+#undef NEEDARG
+	return NULL;
+}
+
 /*
  * Target parameters:
  *	<version>	The current format is version 1.
@@ -929,14 +1228,22 @@
  */
 static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 {
+	struct verity_args args = { 0 };
 	struct dm_verity *v;
 	struct dm_arg_set as;
-	unsigned int num;
-	unsigned long long num_ll;
 	int r;
 	int i;
 	sector_t hash_position;
-	char dummy;
+
+	args.error_behavior = error_behavior;
+	if (argc >= DM_VERITY_NUM_POSITIONAL_ARGS)
+		ti->error = positional_args(argc, argv, &args);
+	else
+		ti->error = chromeos_args(argc, argv, &args);
+	if (ti->error)
+		return -EINVAL;
+	if (0)
+		pr_args(&args);
 
 	v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
 	if (!v) {
@@ -949,84 +1256,46 @@
 	r = verity_fec_ctr_alloc(v);
 	if (r)
 		goto bad;
+	v->version = args.version;
 
-	if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) {
-		ti->error = "Device must be readonly";
-		r = -EINVAL;
-		goto bad;
-	}
-
-	if (argc < 10) {
-		ti->error = "Not enough arguments";
-		r = -EINVAL;
-		goto bad;
-	}
-
-	if (sscanf(argv[0], "%u%c", &num, &dummy) != 1 ||
-	    num > 1) {
-		ti->error = "Invalid version";
-		r = -EINVAL;
-		goto bad;
-	}
-	v->version = num;
-
-	r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev);
+	r = verity_get_device(ti, args.data_device, &v->data_dev);
 	if (r) {
 		ti->error = "Data device lookup failed";
 		goto bad;
 	}
 
-	r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev);
+	r = verity_get_device(ti, args.hash_device, &v->hash_dev);
 	if (r) {
 		ti->error = "Hash device lookup failed";
 		goto bad;
 	}
 
-	if (sscanf(argv[3], "%u%c", &num, &dummy) != 1 ||
-	    !num || (num & (num - 1)) ||
-	    num < bdev_logical_block_size(v->data_dev->bdev) ||
-	    num > PAGE_SIZE) {
+	v->data_dev_block_bits = args.data_block_size_bits;
+	if ((1 << v->data_dev_block_bits) <
+	    bdev_logical_block_size(v->data_dev->bdev)) {
 		ti->error = "Invalid data device block size";
 		r = -EINVAL;
 		goto bad;
 	}
-	v->data_dev_block_bits = __ffs(num);
 
-	if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 ||
-	    !num || (num & (num - 1)) ||
-	    num < bdev_logical_block_size(v->hash_dev->bdev) ||
-	    num > INT_MAX) {
+	v->hash_dev_block_bits = args.hash_block_size_bits;
+	if ((1 << v->data_dev_block_bits) <
+	    bdev_logical_block_size(v->hash_dev->bdev)) {
 		ti->error = "Invalid hash device block size";
 		r = -EINVAL;
 		goto bad;
 	}
-	v->hash_dev_block_bits = __ffs(num);
 
-	if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 ||
-	    (sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT))
-	    >> (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll) {
-		ti->error = "Invalid data blocks";
-		r = -EINVAL;
-		goto bad;
-	}
-	v->data_blocks = num_ll;
-
+	v->data_blocks = args.num_data_blocks;
 	if (ti->len > (v->data_blocks << (v->data_dev_block_bits - SECTOR_SHIFT))) {
 		ti->error = "Data device is too small";
 		r = -EINVAL;
 		goto bad;
 	}
 
-	if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 ||
-	    (sector_t)(num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT))
-	    >> (v->hash_dev_block_bits - SECTOR_SHIFT) != num_ll) {
-		ti->error = "Invalid hash start";
-		r = -EINVAL;
-		goto bad;
-	}
-	v->hash_start = num_ll;
+	v->hash_start = args.hash_start_block;
 
-	v->alg_name = kstrdup(argv[7], GFP_KERNEL);
+	v->alg_name = kstrdup(args.algorithm, GFP_KERNEL);
 	if (!v->alg_name) {
 		ti->error = "Cannot allocate algorithm name";
 		r = -ENOMEM;
@@ -1055,36 +1324,33 @@
 		r = -ENOMEM;
 		goto bad;
 	}
-	if (strlen(argv[8]) != v->digest_size * 2 ||
-	    hex2bin(v->root_digest, argv[8], v->digest_size)) {
+	if (strlen(args.digest) != v->digest_size * 2 ||
+	    hex2bin(v->root_digest, args.digest, v->digest_size)) {
 		ti->error = "Invalid root digest";
 		r = -EINVAL;
 		goto bad;
 	}
 
-	if (strcmp(argv[9], "-")) {
-		v->salt_size = strlen(argv[9]) / 2;
+	if (strcmp(args.salt, "-")) {
+		v->salt_size = strlen(args.salt) / 2;
 		v->salt = kmalloc(v->salt_size, GFP_KERNEL);
 		if (!v->salt) {
 			ti->error = "Cannot allocate salt";
 			r = -ENOMEM;
 			goto bad;
 		}
-		if (strlen(argv[9]) != v->salt_size * 2 ||
-		    hex2bin(v->salt, argv[9], v->salt_size)) {
+		if (strlen(args.salt) != v->salt_size * 2 ||
+		    hex2bin(v->salt, args.salt, v->salt_size)) {
 			ti->error = "Invalid salt";
 			r = -EINVAL;
 			goto bad;
 		}
 	}
 
-	argv += 10;
-	argc -= 10;
-
 	/* Optional parameters */
-	if (argc) {
-		as.argc = argc;
-		as.argv = argv;
+	if (argc > DM_VERITY_NUM_POSITIONAL_ARGS) {
+		as.argc = argc - DM_VERITY_NUM_POSITIONAL_ARGS;
+		as.argv = argv + DM_VERITY_NUM_POSITIONAL_ARGS;
 
 		r = verity_parse_opt_args(&as, v);
 		if (r < 0)
@@ -1156,6 +1422,16 @@
 	ti->per_io_data_size = roundup(ti->per_io_data_size,
 				       __alignof__(struct dm_verity_io));
 
+	/* chromeos allows setting error_behavior from both the module
+	 * parameters and the device args.
+	 */
+	v->error_behavior = verity_parse_error_behavior(args.error_behavior);
+	if (v->error_behavior == -1) {
+		ti->error = "Bad error_behavior supplied";
+		r = -EINVAL;
+		goto bad;
+	}
+
 	return 0;
 
 bad:

diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index 3441c10..b1a8442 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h

@@ -15,6 +15,7 @@
 #include <linux/dm-bufio.h>
 #include <linux/device-mapper.h>
 #include <crypto/hash.h>
+#include <linux/notifier.h>
 
 #define DM_VERITY_MAX_LEVELS		63
 
@@ -56,6 +57,7 @@
 	int hash_failed;	/* set to 1 if hash of any block failed */
 	enum verity_mode mode;	/* mode for handling verification errors */
 	unsigned corrupted_errs;/* Number of errors for corrupted blocks */
+	int error_behavior;	/* selects error behavior on io errors */
 
 	struct workqueue_struct *verify_wq;
 
@@ -91,6 +93,40 @@
 	 */
 };
 
+struct verity_result {
+	struct completion completion;
+	int err;
+};
+
+struct dm_verity_error_state {
+	int code;
+	int transient;  /* Likely to not happen after a reboot */
+	u64 block;
+	const char *message;
+
+	sector_t dev_start;
+	sector_t dev_len;
+	struct block_device *dev;
+
+	sector_t hash_dev_start;
+	sector_t hash_dev_len;
+	struct block_device *hash_dev;
+
+	/* Final behavior after all notifications are completed. */
+	int behavior;
+};
+
+/* This enum must be matched to allowed_error_behaviors in dm-verity.c */
+enum dm_verity_error_behavior {
+	DM_VERITY_ERROR_BEHAVIOR_EIO = 0,
+	DM_VERITY_ERROR_BEHAVIOR_PANIC,
+	DM_VERITY_ERROR_BEHAVIOR_NONE,
+	DM_VERITY_ERROR_BEHAVIOR_NOTIFY
+};
+
+int dm_verity_register_error_notifier(struct notifier_block *nb);
+int dm_verity_unregister_error_notifier(struct notifier_block *nb);
+
 static inline struct ahash_request *verity_io_hash_req(struct dm_verity *v,
 						     struct dm_verity_io *io)
 {

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 3965f3c..756ed70 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c

@@ -1070,6 +1070,25 @@
 	return ret;
 }
 
+static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
+		int blocksize, sector_t start, sector_t len)
+{
+	struct mapped_device *md = dax_get_private(dax_dev);
+	struct dm_table *map;
+	int srcu_idx;
+	bool ret;
+
+	map = dm_get_live_table(md, &srcu_idx);
+	if (!map)
+		return false;
+
+	ret = dm_table_supports_dax(map, device_supports_dax, &blocksize);
+
+	dm_put_live_table(md, srcu_idx);
+
+	return ret;
+}
+
 static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
 				    void *addr, size_t bytes, struct iov_iter *i)
 {
@@ -1941,7 +1960,8 @@
 	sprintf(md->disk->disk_name, "dm-%d", minor);
 
 	if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
-		dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
+		dax_dev = alloc_dax(md, md->disk->disk_name,
+					&dm_dax_ops, 0);
 		if (!dax_dev)
 			goto bad;
 	}
@@ -3184,6 +3204,7 @@
 
 static const struct dax_operations dm_dax_ops = {
 	.direct_access = dm_dax_direct_access,
+	.dax_supported = dm_dax_supported,
 	.copy_from_iter = dm_dax_copy_from_iter,
 	.copy_to_iter = dm_dax_copy_to_iter,
 };

diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 114a81b..5022e83 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h

@@ -73,6 +73,10 @@
 bool dm_table_all_blk_mq_devices(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
+bool dm_table_supports_dax(struct dm_table *t, iterate_devices_callout_fn fn,
+			   int *blocksize);
+int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
+			   sector_t start, sector_t len, void *data);
 
 void dm_lock_md_type(struct mapped_device *md);
 void dm_unlock_md_type(struct mapped_device *md);

diff --git a/drivers/media/i2c/mt9v032.c b/drivers/media/i2c/mt9v032.c
index f74730d..0478869 100644
--- a/drivers/media/i2c/mt9v032.c
+++ b/drivers/media/i2c/mt9v032.c

@@ -431,10 +431,12 @@
 				  struct v4l2_subdev_pad_config *cfg,
 				  struct v4l2_subdev_mbus_code_enum *code)
 {
+	struct mt9v032 *mt9v032 = to_mt9v032(subdev);
+
 	if (code->index > 0)
 		return -EINVAL;
 
-	code->code = MEDIA_BUS_FMT_SGRBG10_1X10;
+	code->code = mt9v032->format.code;
 	return 0;
 }
 
@@ -442,7 +444,11 @@
 				   struct v4l2_subdev_pad_config *cfg,
 				   struct v4l2_subdev_frame_size_enum *fse)
 {
-	if (fse->index >= 3 || fse->code != MEDIA_BUS_FMT_SGRBG10_1X10)
+	struct mt9v032 *mt9v032 = to_mt9v032(subdev);
+
+	if (fse->index >= 3)
+		return -EINVAL;
+	if (mt9v032->format.code != fse->code)
 		return -EINVAL;
 
 	fse->min_width = MT9V032_WINDOW_WIDTH_DEF / (1 << fse->index);

diff --git a/drivers/media/pci/cx23885/cx23885-cards.c b/drivers/media/pci/cx23885/cx23885-cards.c
index ed3210d..642aefd 100644
--- a/drivers/media/pci/cx23885/cx23885-cards.c
+++ b/drivers/media/pci/cx23885/cx23885-cards.c

@@ -811,6 +811,25 @@
 		.name		= "Hauppauge WinTV-Starburst2",
 		.portb		= CX23885_MPEG_DVB,
 	},
+	[CX23885_BOARD_AVERMEDIA_CE310B] = {
+		.name		= "AVerMedia CE310B",
+		.porta		= CX23885_ANALOG_VIDEO,
+		.force_bff	= 1,
+		.input          = {{
+			.type   = CX23885_VMUX_COMPOSITE1,
+			.vmux   = CX25840_VIN1_CH1 |
+				  CX25840_NONE_CH2 |
+				  CX25840_NONE0_CH3,
+			.amux   = CX25840_AUDIO7,
+		}, {
+			.type   = CX23885_VMUX_SVIDEO,
+			.vmux   = CX25840_VIN8_CH1 |
+				  CX25840_NONE_CH2 |
+				  CX25840_VIN7_CH3 |
+				  CX25840_SVIDEO_ON,
+			.amux   = CX25840_AUDIO7,
+		} },
+	},
 };
 const unsigned int cx23885_bcount = ARRAY_SIZE(cx23885_boards);
 
@@ -1134,6 +1153,10 @@
 		.subvendor = 0x0070,
 		.subdevice = 0xf02a,
 		.card      = CX23885_BOARD_HAUPPAUGE_STARBURST2,
+	}, {
+		.subvendor = 0x1461,
+		.subdevice = 0x3100,
+		.card      = CX23885_BOARD_AVERMEDIA_CE310B,
 	},
 };
 const unsigned int cx23885_idcount = ARRAY_SIZE(cx23885_subids);
@@ -2358,6 +2381,7 @@
 	case CX23885_BOARD_DVBSKY_T982:
 	case CX23885_BOARD_VIEWCAST_260E:
 	case CX23885_BOARD_VIEWCAST_460E:
+	case CX23885_BOARD_AVERMEDIA_CE310B:
 		dev->sd_cx25840 = v4l2_i2c_new_subdev(&dev->v4l2_dev,
 				&dev->i2c_bus[2].i2c_adap,
 				"cx25840", 0x88 >> 1, NULL);

diff --git a/drivers/media/pci/cx23885/cx23885-video.c b/drivers/media/pci/cx23885/cx23885-video.c
index f8a3dea..2a20c71 100644
--- a/drivers/media/pci/cx23885/cx23885-video.c
+++ b/drivers/media/pci/cx23885/cx23885-video.c

@@ -268,7 +268,8 @@
 		(dev->board == CX23885_BOARD_MYGICA_X8507) ||
 		(dev->board == CX23885_BOARD_AVERMEDIA_HC81R) ||
 		(dev->board == CX23885_BOARD_VIEWCAST_260E) ||
-		(dev->board == CX23885_BOARD_VIEWCAST_460E)) {
+		(dev->board == CX23885_BOARD_VIEWCAST_460E) ||
+		(dev->board == CX23885_BOARD_AVERMEDIA_CE310B)) {
 		/* Configure audio routing */
 		v4l2_subdev_call(dev->sd_cx25840, audio, s_routing,
 			INPUT(input)->amux, 0, 0);

diff --git a/drivers/media/pci/cx23885/cx23885.h b/drivers/media/pci/cx23885/cx23885.h
index cf965ef..7bbd62c 100644
--- a/drivers/media/pci/cx23885/cx23885.h
+++ b/drivers/media/pci/cx23885/cx23885.h

@@ -111,6 +111,7 @@
 #define CX23885_BOARD_HAUPPAUGE_STARBURST2     59
 #define CX23885_BOARD_HAUPPAUGE_QUADHD_DVB_885 60
 #define CX23885_BOARD_HAUPPAUGE_QUADHD_ATSC_885 61
+#define CX23885_BOARD_AVERMEDIA_CE310B         62
 
 #define GPIO_0 0x00000001
 #define GPIO_1 0x00000002

diff --git a/drivers/media/platform/sti/bdisp/bdisp-hw.c b/drivers/media/platform/sti/bdisp/bdisp-hw.c
index 26d9fa7..d57f659 100644
--- a/drivers/media/platform/sti/bdisp/bdisp-hw.c
+++ b/drivers/media/platform/sti/bdisp/bdisp-hw.c

@@ -14,8 +14,8 @@
 #define MAX_SRC_WIDTH           2048
 
 /* Reset & boot poll config */
-#define POLL_RST_MAX            50
-#define POLL_RST_DELAY_MS       20
+#define POLL_RST_MAX            500
+#define POLL_RST_DELAY_MS       2
 
 enum bdisp_target_plan {
 	BDISP_RGB,
@@ -382,7 +382,7 @@
 	for (i = 0; i < POLL_RST_MAX; i++) {
 		if (readl(bdisp->regs + BLT_STA1) & BLT_STA1_IDLE)
 			break;
-		msleep(POLL_RST_DELAY_MS);
+		udelay(POLL_RST_DELAY_MS * 1000);
 	}
 	if (i == POLL_RST_MAX)
 		dev_err(bdisp->dev, "Reset timeout\n");

diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c
index 92261c9..3afc0e5 100644
--- a/drivers/net/ethernet/amazon/ena/ena_com.c
+++ b/drivers/net/ethernet/amazon/ena/ena_com.c

@@ -201,6 +201,11 @@
 static struct ena_comp_ctx *get_comp_ctxt(struct ena_com_admin_queue *queue,
 					  u16 command_id, bool capture)
 {
+	if (unlikely(!queue->comp_ctx)) {
+		pr_err("Completion context is NULL\n");
+		return NULL;
+	}
+
 	if (unlikely(command_id >= queue->q_depth)) {
 		pr_err("command id is larger than the queue size. cmd_id: %u queue size %d\n",
 		       command_id, queue->q_depth);
@@ -842,6 +847,24 @@
 				      0);
 }
 
+static void ena_com_hash_key_fill_default_key(struct ena_com_dev *ena_dev)
+{
+	struct ena_admin_feature_rss_flow_hash_control *hash_key =
+		(ena_dev->rss).hash_key;
+
+	netdev_rss_key_fill(&hash_key->key, sizeof(hash_key->key));
+	/* The key is stored in the device in u32 array
+	 * as well as the API requires the key to be passed in this
+	 * format. Thus the size of our array should be divided by 4
+	 */
+	hash_key->keys_num = sizeof(hash_key->key) / sizeof(u32);
+}
+
+int ena_com_get_current_hash_function(struct ena_com_dev *ena_dev)
+{
+	return ena_dev->rss.hash_func;
+}
+
 static int ena_com_hash_key_allocate(struct ena_com_dev *ena_dev)
 {
 	struct ena_rss *rss = &ena_dev->rss;
@@ -2075,15 +2098,16 @@
 
 	switch (func) {
 	case ENA_ADMIN_TOEPLITZ:
-		if (key_len > sizeof(hash_key->key)) {
-			pr_err("key len (%hu) is bigger than the max supported (%zu)\n",
-			       key_len, sizeof(hash_key->key));
-			return -EINVAL;
+		if (key) {
+			if (key_len != sizeof(hash_key->key)) {
+				pr_err("key len (%hu) doesn't equal the supported size (%zu)\n",
+				       key_len, sizeof(hash_key->key));
+				return -EINVAL;
+			}
+			memcpy(hash_key->key, key, key_len);
+			rss->hash_init_val = init_val;
+			hash_key->keys_num = key_len >> 2;
 		}
-
-		memcpy(hash_key->key, key, key_len);
-		rss->hash_init_val = init_val;
-		hash_key->keys_num = key_len >> 2;
 		break;
 	case ENA_ADMIN_CRC32:
 		rss->hash_init_val = init_val;
@@ -2120,7 +2144,11 @@
 	if (unlikely(rc))
 		return rc;
 
-	rss->hash_func = get_resp.u.flow_hash_func.selected_func;
+	/* ffs() returns 1 in case the lsb is set */
+	rss->hash_func = ffs(get_resp.u.flow_hash_func.selected_func);
+	if (rss->hash_func)
+		rss->hash_func--;
+
 	if (func)
 		*func = rss->hash_func;
 
@@ -2408,6 +2436,8 @@
 	if (unlikely(rc))
 		goto err_hash_key;
 
+	ena_com_hash_key_fill_default_key(ena_dev);
+
 	rc = ena_com_hash_ctrl_init(ena_dev);
 	if (unlikely(rc))
 		goto err_hash_ctrl;

diff --git a/drivers/net/ethernet/amazon/ena/ena_com.h b/drivers/net/ethernet/amazon/ena/ena_com.h
index 7b784f8..7272fb0 100644
--- a/drivers/net/ethernet/amazon/ena/ena_com.h
+++ b/drivers/net/ethernet/amazon/ena/ena_com.h

@@ -42,6 +42,7 @@
 #include <linux/spinlock.h>
 #include <linux/types.h>
 #include <linux/wait.h>
+#include <linux/netdevice.h>
 
 #include "ena_common_defs.h"
 #include "ena_admin_defs.h"
@@ -631,6 +632,14 @@
  */
 void ena_com_rss_destroy(struct ena_com_dev *ena_dev);
 
+/* ena_com_get_current_hash_function - Get RSS hash function
+ * @ena_dev: ENA communication layer struct
+ *
+ * Return the current hash function.
+ * @return: 0 or one of the ena_admin_hash_functions values.
+ */
+int ena_com_get_current_hash_function(struct ena_com_dev *ena_dev);
+
 /* ena_com_fill_hash_function - Fill RSS hash function
  * @ena_dev: ENA communication layer struct
  * @func: The hash function (Toeplitz or crc)

diff --git a/drivers/net/ethernet/amazon/ena/ena_ethtool.c b/drivers/net/ethernet/amazon/ena/ena_ethtool.c
index eb9e07f..66f99251 100644
--- a/drivers/net/ethernet/amazon/ena/ena_ethtool.c
+++ b/drivers/net/ethernet/amazon/ena/ena_ethtool.c

@@ -649,6 +649,28 @@
 	return ENA_HASH_KEY_SIZE;
 }
 
+static int ena_indirection_table_get(struct ena_adapter *adapter, u32 *indir)
+{
+	struct ena_com_dev *ena_dev = adapter->ena_dev;
+	int i, rc;
+
+	if (!indir)
+		return 0;
+
+	rc = ena_com_indirect_table_get(ena_dev, indir);
+	if (rc)
+		return rc;
+
+	/* Our internal representation of the indices is: even indices
+	 * for Tx and uneven indices for Rx. We need to convert the Rx
+	 * indices to be consecutive
+	 */
+	for (i = 0; i < ENA_RX_RSS_TABLE_SIZE; i++)
+		indir[i] = ENA_IO_RXQ_IDX_TO_COMBINED_IDX(indir[i]);
+
+	return rc;
+}
+
 static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key,
 			u8 *hfunc)
 {
@@ -657,11 +679,25 @@
 	u8 func;
 	int rc;
 
-	rc = ena_com_indirect_table_get(adapter->ena_dev, indir);
+	rc = ena_indirection_table_get(adapter, indir);
 	if (rc)
 		return rc;
 
+	/* We call this function in order to check if the device
+	 * supports getting/setting the hash function.
+	 */
 	rc = ena_com_get_hash_function(adapter->ena_dev, &ena_func, key);
+
+	if (rc) {
+		if (rc == -EOPNOTSUPP) {
+			key = NULL;
+			hfunc = NULL;
+			rc = 0;
+		}
+
+		return rc;
+	}
+
 	if (rc)
 		return rc;
 
@@ -670,7 +706,7 @@
 		func = ETH_RSS_HASH_TOP;
 		break;
 	case ENA_ADMIN_CRC32:
-		func = ETH_RSS_HASH_XOR;
+		func = ETH_RSS_HASH_CRC32;
 		break;
 	default:
 		netif_err(adapter, drv, netdev,
@@ -713,10 +749,13 @@
 	}
 
 	switch (hfunc) {
+	case ETH_RSS_HASH_NO_CHANGE:
+		func = ena_com_get_current_hash_function(ena_dev);
+		break;
 	case ETH_RSS_HASH_TOP:
 		func = ENA_ADMIN_TOEPLITZ;
 		break;
-	case ETH_RSS_HASH_XOR:
+	case ETH_RSS_HASH_CRC32:
 		func = ENA_ADMIN_CRC32;
 		break;
 	default:
@@ -817,6 +856,7 @@
 	.get_channels		= ena_get_channels,
 	.get_tunable		= ena_get_tunable,
 	.set_tunable		= ena_set_tunable,
+	.get_ts_info            = ethtool_op_get_ts_info,
 };
 
 void ena_set_ethtool_ops(struct net_device *netdev)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 9afb19e..8736718 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c

@@ -2847,8 +2847,8 @@
 	if (adapter->keep_alive_timeout == ENA_HW_HINTS_NO_TIMEOUT)
 		return;
 
-	keep_alive_expired = round_jiffies(adapter->last_keep_alive_jiffies +
-					   adapter->keep_alive_timeout);
+	keep_alive_expired = adapter->last_keep_alive_jiffies +
+			     adapter->keep_alive_timeout;
 	if (unlikely(time_is_before_jiffies(keep_alive_expired))) {
 		netif_err(adapter, drv, adapter->netdev,
 			  "Keep alive watchdog timeout.\n");
@@ -2950,7 +2950,7 @@
 	}
 
 	/* Reset the timer */
-	mod_timer(&adapter->timer_service, jiffies + HZ);
+	mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ));
 }
 
 static int ena_calc_io_queue_num(struct pci_dev *pdev,

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h
index 7c7ae56..f4783ef 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.h
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h

@@ -113,6 +113,8 @@
 
 #define ENA_IO_TXQ_IDX(q)	(2 * (q))
 #define ENA_IO_RXQ_IDX(q)	(2 * (q) + 1)
+#define ENA_IO_TXQ_IDX_TO_COMBINED_IDX(q)	((q) / 2)
+#define ENA_IO_RXQ_IDX_TO_COMBINED_IDX(q)	(((q) - 1) / 2)
 
 #define ENA_MGMNT_IRQ_IDX		0
 #define ENA_IO_IRQ_FIRST_IDX		1

diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
index 50dd6bf..3a489b2 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c

@@ -2034,7 +2034,7 @@
 	int ret;
 
 	ndev = alloc_etherdev_mqs(sizeof(struct xgene_enet_pdata),
-				  XGENE_NUM_RX_RING, XGENE_NUM_TX_RING);
+				  XGENE_NUM_TX_RING, XGENE_NUM_RX_RING);
 	if (!ndev)
 		return -ENOMEM;
 

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index 8cc34b0..15dcfb6 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c

@@ -399,8 +399,10 @@
 				     dx_buff->len,
 				     DMA_TO_DEVICE);
 
-	if (unlikely(dma_mapping_error(aq_nic_get_dev(self), dx_buff->pa)))
+	if (unlikely(dma_mapping_error(aq_nic_get_dev(self), dx_buff->pa))) {
+		ret = 0;
 		goto exit;
+	}
 
 	first = dx_buff;
 	dx_buff->len_pkt = skb->len;
@@ -530,10 +532,6 @@
 	if (likely(frags)) {
 		err = self->aq_hw_ops->hw_ring_tx_xmit(self->aq_hw,
 						       ring, frags);
-		if (err >= 0) {
-			++ring->stats.tx.packets;
-			ring->stats.tx.bytes += skb->len;
-		}
 	} else {
 		err = NETDEV_TX_BUSY;
 	}

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
index b3c7994..b03e5fd 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c

@@ -162,9 +162,12 @@
 			}
 		}
 
-		if (unlikely(buff->is_eop))
-			dev_kfree_skb_any(buff->skb);
+		if (unlikely(buff->is_eop)) {
+			++self->stats.rx.packets;
+			self->stats.tx.bytes += buff->skb->len;
 
+			dev_kfree_skb_any(buff->skb);
+		}
 		buff->pa = 0U;
 		buff->eop_index = 0xffffU;
 		self->sw_head = aq_ring_next_dx(self, self->sw_head);

diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index 733d917..026a3bd 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c

@@ -2013,10 +2013,10 @@
 		napi_disable(&enic->napi[i]);
 
 	netif_carrier_off(netdev);
-	netif_tx_disable(netdev);
 	if (vnic_dev_get_intr_mode(enic->vdev) == VNIC_DEV_INTR_MODE_MSIX)
 		for (i = 0; i < enic->wq_count; i++)
 			napi_disable(&enic->napi[enic_cq_wq(enic, i)]);
+	netif_tx_disable(netdev);
 
 	if (!enic_is_dynamic(enic) && !enic_is_sriov_vf(enic))
 		enic_dev_del_station_addr(enic);

diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index c97c4ed..cf2d1e8 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c

@@ -2685,13 +2685,17 @@
 	skb_dirtytx = tx_queue->skb_dirtytx;
 
 	while ((skb = tx_queue->tx_skbuff[skb_dirtytx])) {
+		bool do_tstamp;
+
+		do_tstamp = (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) &&
+			    priv->hwts_tx_en;
 
 		frags = skb_shinfo(skb)->nr_frags;
 
 		/* When time stamping, one additional TxBD must be freed.
 		 * Also, we need to dma_unmap_single() the TxPAL.
 		 */
-		if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS))
+		if (unlikely(do_tstamp))
 			nr_txbds = frags + 2;
 		else
 			nr_txbds = frags + 1;
@@ -2705,7 +2709,7 @@
 		    (lstatus & BD_LENGTH_MASK))
 			break;
 
-		if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS)) {
+		if (unlikely(do_tstamp)) {
 			next = next_txbd(bdp, base, tx_ring_size);
 			buflen = be16_to_cpu(next->length) +
 				 GMAC_FCB_LEN + GMAC_TXPAL_LEN;
@@ -2715,7 +2719,7 @@
 		dma_unmap_single(priv->dev, be32_to_cpu(bdp->bufPtr),
 				 buflen, DMA_TO_DEVICE);
 
-		if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS)) {
+		if (unlikely(do_tstamp)) {
 			struct skb_shared_hwtstamps shhwtstamps;
 			u64 *ns = (u64 *)(((uintptr_t)skb->data + 0x10) &
 					  ~0x7UL);

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index 8255d79..9a68dee 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c

@@ -211,6 +211,9 @@
 			s->tx_tls_resync_bytes	+= sq_stats->tls_resync_bytes;
 #endif
 			s->tx_cqes		+= sq_stats->cqes;
+
+			/* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92657 */
+			barrier();
 		}
 	}
 

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c
index 41e607a..4fe193c4 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c

@@ -215,7 +215,7 @@
 start_again:
 	err = devlink_dpipe_entry_ctx_prepare(dump_ctx);
 	if (err)
-		return err;
+		goto err_ctx_prepare;
 	j = 0;
 	for (; i < rif_count; i++) {
 		struct mlxsw_sp_rif *rif = mlxsw_sp_rif_by_index(mlxsw_sp, i);
@@ -247,6 +247,7 @@
 	return 0;
 err_entry_append:
 err_entry_get:
+err_ctx_prepare:
 	rtnl_unlock();
 	devlink_dpipe_entry_clear(&entry);
 	return err;

diff --git a/drivers/net/ethernet/mscc/ocelot_board.c b/drivers/net/ethernet/mscc/ocelot_board.c
index 3cdf63e..4054cf9 100644
--- a/drivers/net/ethernet/mscc/ocelot_board.c
+++ b/drivers/net/ethernet/mscc/ocelot_board.c

@@ -105,6 +105,14 @@
 		if (err != 4)
 			break;
 
+		/* At this point the IFH was read correctly, so it is safe to
+		 * presume that there is no error. The err needs to be reset
+		 * otherwise a frame could come in CPU queue between the while
+		 * condition and the check for error later on. And in that case
+		 * the new frame is just removed and not processed.
+		 */
+		err = 0;
+
 		ocelot_parse_ifh(ifh, &info);
 
 		dev = ocelot->ports[info.port]->dev;

diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index d242a57..dc3be8a 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h

@@ -162,6 +162,8 @@
 	struct list_head entry;
 	struct list_head rdma_event_list;
 	struct workqueue_struct *rdma_wq;
+	struct kref refcnt;
+	struct completion event_comp;
 };
 
 struct qede_ptp;

diff --git a/drivers/net/ethernet/qlogic/qede/qede_rdma.c b/drivers/net/ethernet/qlogic/qede/qede_rdma.c
index 1900bf7..cd12fb91 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_rdma.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_rdma.c

@@ -57,6 +57,9 @@
 static int qede_rdma_create_wq(struct qede_dev *edev)
 {
 	INIT_LIST_HEAD(&edev->rdma_info.rdma_event_list);
+	kref_init(&edev->rdma_info.refcnt);
+	init_completion(&edev->rdma_info.event_comp);
+
 	edev->rdma_info.rdma_wq = create_singlethread_workqueue("rdma_wq");
 	if (!edev->rdma_info.rdma_wq) {
 		DP_NOTICE(edev, "qedr: Could not create workqueue\n");
@@ -81,8 +84,23 @@
 	}
 }
 
+static void qede_rdma_complete_event(struct kref *ref)
+{
+	struct qede_rdma_dev *rdma_dev =
+		container_of(ref, struct qede_rdma_dev, refcnt);
+
+	/* no more events will be added after this */
+	complete(&rdma_dev->event_comp);
+}
+
 static void qede_rdma_destroy_wq(struct qede_dev *edev)
 {
+	/* Avoid race with add_event flow, make sure it finishes before
+	 * we start accessing the list and cleaning up the work
+	 */
+	kref_put(&edev->rdma_info.refcnt, qede_rdma_complete_event);
+	wait_for_completion(&edev->rdma_info.event_comp);
+
 	qede_rdma_cleanup_event(edev);
 	destroy_workqueue(edev->rdma_info.rdma_wq);
 }
@@ -287,15 +305,24 @@
 	if (!edev->rdma_info.qedr_dev)
 		return;
 
+	/* We don't want the cleanup flow to start while we're allocating and
+	 * scheduling the work
+	 */
+	if (!kref_get_unless_zero(&edev->rdma_info.refcnt))
+		return; /* already being destroyed */
+
 	event_node = qede_rdma_get_free_event_node(edev);
 	if (!event_node)
-		return;
+		goto out;
 
 	event_node->event = event;
 	event_node->ptr = edev;
 
 	INIT_WORK(&event_node->work, qede_rdma_handle_event);
 	queue_work(edev->rdma_info.rdma_wq, &event_node->work);
+
+out:
+	kref_put(&edev->rdma_info.refcnt, qede_rdma_complete_event);
 }
 
 void qede_rdma_dev_event_open(struct qede_dev *edev)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 4ab87fe..6ea43e4 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c

@@ -7433,6 +7433,15 @@
 	int chipset, region, i;
 	int jumbo_max, rc;
 
+	/* Some tools for creating an initramfs don't consider softdeps, then
+	 * r8169.ko may be in initramfs, but realtek.ko not. Then the generic
+	 * PHY driver is used that doesn't work with most chip versions.
+	 */
+	if (!driver_find("RTL8201CP Ethernet", &mdio_bus_type)) {
+		dev_err(&pdev->dev, "realtek.ko not loaded, maybe it needs to be added to initramfs?\n");
+		return -ENOENT;
+	}
+
 	dev = devm_alloc_etherdev(&pdev->dev, sizeof (*tp));
 	if (!dev)
 		return -ENOMEM;

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index dbfd3a0..77a9a75 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c

@@ -110,7 +110,7 @@
 
 	init_waitqueue_head(&net_device->wait_drain);
 	net_device->destroy = false;
-	net_device->tx_disable = false;
+	net_device->tx_disable = true;
 
 	net_device->max_pkt = RNDIS_MAX_PKT_DEFAULT;
 	net_device->pkt_align = RNDIS_PKT_ALIGN_DEFAULT;

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 7ab576d..bdb55db 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c

@@ -984,6 +984,7 @@
 	}
 
 	/* In any case device is now ready */
+	nvdev->tx_disable = false;
 	netif_device_attach(ndev);
 
 	/* Note: enable and attach happen when sub-channels setup */
@@ -2336,6 +2337,8 @@
 	else
 		net->max_mtu = ETH_DATA_LEN;
 
+	nvdev->tx_disable = false;
+
 	ret = register_netdevice(net);
 	if (ret != 0) {
 		pr_err("Unable to register netdev.\n");

diff --git a/drivers/net/phy/mdio-bcm-iproc.c b/drivers/net/phy/mdio-bcm-iproc.c
index 46fe1ae9..51ce3ea 100644
--- a/drivers/net/phy/mdio-bcm-iproc.c
+++ b/drivers/net/phy/mdio-bcm-iproc.c

@@ -188,6 +188,23 @@
 	return 0;
 }
 
+#ifdef CONFIG_PM_SLEEP
+int iproc_mdio_resume(struct device *dev)
+{
+	struct platform_device *pdev = to_platform_device(dev);
+	struct iproc_mdio_priv *priv = platform_get_drvdata(pdev);
+
+	/* restore the mii clock configuration */
+	iproc_mdio_config_clk(priv->base);
+
+	return 0;
+}
+
+static const struct dev_pm_ops iproc_mdio_pm_ops = {
+	.resume = iproc_mdio_resume
+};
+#endif /* CONFIG_PM_SLEEP */
+
 static const struct of_device_id iproc_mdio_of_match[] = {
 	{ .compatible = "brcm,iproc-mdio", },
 	{ /* sentinel */ },
@@ -198,6 +215,9 @@
 	.driver = {
 		.name = "iproc-mdio",
 		.of_match_table = iproc_mdio_of_match,
+#ifdef CONFIG_PM_SLEEP
+		.pm = &iproc_mdio_pm_ops,
+#endif
 	},
 	.probe = iproc_mdio_probe,
 	.remove = iproc_mdio_remove,

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index 13c8788..a04f857 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c

@@ -63,7 +63,6 @@
 
 enum qmi_wwan_quirks {
 	QMI_WWAN_QUIRK_DTR = 1 << 0,	/* needs "set DTR" request */
-	QMI_WWAN_QUIRK_QUECTEL_DYNCFG = 1 << 1,	/* check num. endpoints */
 };
 
 struct qmimux_hdr {
@@ -853,16 +852,6 @@
 	.data           = QMI_WWAN_QUIRK_DTR,
 };
 
-static const struct driver_info	qmi_wwan_info_quirk_quectel_dyncfg = {
-	.description	= "WWAN/QMI device",
-	.flags		= FLAG_WWAN | FLAG_SEND_ZLP,
-	.bind		= qmi_wwan_bind,
-	.unbind		= qmi_wwan_unbind,
-	.manage_power	= qmi_wwan_manage_power,
-	.rx_fixup       = qmi_wwan_rx_fixup,
-	.data           = QMI_WWAN_QUIRK_DTR | QMI_WWAN_QUIRK_QUECTEL_DYNCFG,
-};
-
 #define HUAWEI_VENDOR_ID	0x12D1
 
 /* map QMI/wwan function by a fixed interface number */
@@ -883,14 +872,18 @@
 #define QMI_GOBI_DEVICE(vend, prod) \
 	QMI_FIXED_INTF(vend, prod, 0)
 
-/* Quectel does not use fixed interface numbers on at least some of their
- * devices. We need to check the number of endpoints to ensure that we bind to
- * the correct interface.
+/* Many devices have QMI and DIAG functions which are distinguishable
+ * from other vendor specific functions by class, subclass and
+ * protocol all being 0xff. The DIAG function has exactly 2 endpoints
+ * and is silently rejected when probed.
+ *
+ * This makes it possible to match dynamically numbered QMI functions
+ * as seen on e.g. many Quectel modems.
  */
-#define QMI_QUIRK_QUECTEL_DYNCFG(vend, prod) \
+#define QMI_MATCH_FF_FF_FF(vend, prod) \
 	USB_DEVICE_AND_INTERFACE_INFO(vend, prod, USB_CLASS_VENDOR_SPEC, \
 				      USB_SUBCLASS_VENDOR_SPEC, 0xff), \
-	.driver_info = (unsigned long)&qmi_wwan_info_quirk_quectel_dyncfg
+	.driver_info = (unsigned long)&qmi_wwan_info_quirk_dtr
 
 static const struct usb_device_id products[] = {
 	/* 1. CDC ECM like devices match on the control interface */
@@ -996,10 +989,10 @@
 		USB_DEVICE_AND_INTERFACE_INFO(0x03f0, 0x581d, USB_CLASS_VENDOR_SPEC, 1, 7),
 		.driver_info = (unsigned long)&qmi_wwan_info,
 	},
-	{QMI_QUIRK_QUECTEL_DYNCFG(0x2c7c, 0x0125)},	/* Quectel EC25, EC20 R2.0  Mini PCIe */
-	{QMI_QUIRK_QUECTEL_DYNCFG(0x2c7c, 0x0306)},	/* Quectel EP06/EG06/EM06 */
-	{QMI_QUIRK_QUECTEL_DYNCFG(0x2c7c, 0x0512)},	/* Quectel EG12/EM12 */
-	{QMI_QUIRK_QUECTEL_DYNCFG(0x2c7c, 0x0800)},	/* Quectel RM500Q-GL */
+	{QMI_MATCH_FF_FF_FF(0x2c7c, 0x0125)},	/* Quectel EC25, EC20 R2.0  Mini PCIe */
+	{QMI_MATCH_FF_FF_FF(0x2c7c, 0x0306)},	/* Quectel EP06/EG06/EM06 */
+	{QMI_MATCH_FF_FF_FF(0x2c7c, 0x0512)},	/* Quectel EG12/EM12 */
+	{QMI_MATCH_FF_FF_FF(0x2c7c, 0x0800)},	/* Quectel RM500Q-GL */
 
 	/* 3. Combined interface devices matching on interface number */
 	{QMI_FIXED_INTF(0x0408, 0xea42, 4)},	/* Yota / Megafon M100-1 */
@@ -1298,6 +1291,7 @@
 	{QMI_FIXED_INTF(0x413c, 0x81b6, 8)},	/* Dell Wireless 5811e */
 	{QMI_FIXED_INTF(0x413c, 0x81b6, 10)},	/* Dell Wireless 5811e */
 	{QMI_FIXED_INTF(0x413c, 0x81d7, 0)},	/* Dell Wireless 5821e */
+	{QMI_FIXED_INTF(0x413c, 0x81d7, 1)},	/* Dell Wireless 5821e preproduction config */
 	{QMI_FIXED_INTF(0x413c, 0x81e0, 0)},	/* Dell Wireless 5821e with eSIM support*/
 	{QMI_FIXED_INTF(0x03f0, 0x4e1d, 8)},	/* HP lt4111 LTE/EV-DO/HSPA+ Gobi 4G Module */
 	{QMI_FIXED_INTF(0x03f0, 0x9d1d, 1)},	/* HP lt4120 Snapdragon X5 LTE */
@@ -1389,7 +1383,6 @@
 {
 	struct usb_device_id *id = (struct usb_device_id *)prod;
 	struct usb_interface_descriptor *desc = &intf->cur_altsetting->desc;
-	const struct driver_info *info;
 
 	/* Workaround to enable dynamic IDs.  This disables usbnet
 	 * blacklisting functionality.  Which, if required, can be
@@ -1425,12 +1418,8 @@
 	 * different. Ignore the current interface if the number of endpoints
 	 * equals the number for the diag interface (two).
 	 */
-	info = (void *)id->driver_info;
-
-	if (info->data & QMI_WWAN_QUIRK_QUECTEL_DYNCFG) {
-		if (desc->bNumEndpoints == 2)
-			return -ENODEV;
-	}
+	if (desc->bNumEndpoints == 2)
+		return -ENODEV;
 
 	return usbnet_probe(intf, id);
 }

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index c88ee37..f6a69b5 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c

@@ -2232,6 +2232,53 @@
 	return 0;
 }
 
+static int virtnet_set_coalesce(struct net_device *dev,
+				struct ethtool_coalesce *ec)
+{
+	struct ethtool_coalesce ec_default = {
+		.cmd = ETHTOOL_SCOALESCE,
+		.rx_max_coalesced_frames = 1,
+	};
+	struct virtnet_info *vi = netdev_priv(dev);
+	int i, napi_weight;
+
+	if (ec->tx_max_coalesced_frames > 1)
+		return -EINVAL;
+
+	ec_default.tx_max_coalesced_frames = ec->tx_max_coalesced_frames;
+	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
+
+	/* disallow changes to fields not explicitly tested above */
+	if (memcmp(ec, &ec_default, sizeof(ec_default)))
+		return -EINVAL;
+
+	if (napi_weight ^ vi->sq[0].napi.weight) {
+		if (dev->flags & IFF_UP)
+			return -EBUSY;
+		for (i = 0; i < vi->max_queue_pairs; i++)
+			vi->sq[i].napi.weight = napi_weight;
+	}
+
+	return 0;
+}
+
+static int virtnet_get_coalesce(struct net_device *dev,
+				struct ethtool_coalesce *ec)
+{
+	struct ethtool_coalesce ec_default = {
+		.cmd = ETHTOOL_GCOALESCE,
+		.rx_max_coalesced_frames = 1,
+	};
+	struct virtnet_info *vi = netdev_priv(dev);
+
+	memcpy(ec, &ec_default, sizeof(ec_default));
+
+	if (vi->sq[0].napi.weight)
+		ec->tx_max_coalesced_frames = 1;
+
+	return 0;
+}
+
 static void virtnet_init_settings(struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
@@ -2270,6 +2317,8 @@
 	.get_ts_info = ethtool_op_get_ts_info,
 	.get_link_ksettings = virtnet_get_link_ksettings,
 	.set_link_ksettings = virtnet_set_link_ksettings,
+	.set_coalesce = virtnet_set_coalesce,
+	.get_coalesce = virtnet_get_coalesce,
 };
 
 static void virtnet_freeze_down(struct virtio_device *vdev)

diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c
index daeab33..9ab04ef 100644
--- a/drivers/net/wan/fsl_ucc_hdlc.c
+++ b/drivers/net/wan/fsl_ucc_hdlc.c

@@ -242,6 +242,11 @@
 		ret = -ENOMEM;
 		goto free_riptr;
 	}
+	if (riptr != (u16)riptr || tiptr != (u16)tiptr) {
+		dev_err(priv->dev, "MURAM allocation out of addressable range\n");
+		ret = -ENOMEM;
+		goto free_tiptr;
+	}
 
 	/* Set RIPTR, TIPTR */
 	iowrite16be(riptr, &priv->ucc_pram->riptr);

diff --git a/drivers/net/wan/ixp4xx_hss.c b/drivers/net/wan/ixp4xx_hss.c
index 6a505c2..a269ed6 100644
--- a/drivers/net/wan/ixp4xx_hss.c
+++ b/drivers/net/wan/ixp4xx_hss.c

@@ -261,7 +261,7 @@
 	struct hss_plat_info *plat;
 	buffer_t *rx_buff_tab[RX_DESCS], *tx_buff_tab[TX_DESCS];
 	struct desc *desc_tab;	/* coherent */
-	u32 desc_tab_phys;
+	dma_addr_t desc_tab_phys;
 	unsigned int id;
 	unsigned int clock_type, clock_rate, loopback;
 	unsigned int initialized, carrier;
@@ -861,7 +861,7 @@
 		dev->stats.tx_dropped++;
 		return NETDEV_TX_OK;
 	}
-	memcpy_swab32(mem, (u32 *)((int)skb->data & ~3), bytes / 4);
+	memcpy_swab32(mem, (u32 *)((uintptr_t)skb->data & ~3), bytes / 4);
 	dev_kfree_skb(skb);
 #endif
 

diff --git a/drivers/net/wireless/ath/ath10k/wmi.c b/drivers/net/wireless/ath/ath10k/wmi.c
index 0f6ff7a..3372dfa 100644
--- a/drivers/net/wireless/ath/ath10k/wmi.c
+++ b/drivers/net/wireless/ath/ath10k/wmi.c

@@ -9193,7 +9193,7 @@
 
 	msdu = pkt_addr->vaddr;
 	dma_unmap_single(ar->dev, pkt_addr->paddr,
-			 msdu->len, DMA_FROM_DEVICE);
+			 msdu->len, DMA_TO_DEVICE);
 	ieee80211_free_txskb(ar->hw, msdu);
 
 	return 0;

diff --git a/drivers/net/wireless/broadcom/b43legacy/main.c b/drivers/net/wireless/broadcom/b43legacy/main.c
index 55f4119..770cc21 100644
--- a/drivers/net/wireless/broadcom/b43legacy/main.c
+++ b/drivers/net/wireless/broadcom/b43legacy/main.c

@@ -1304,8 +1304,9 @@
 }
 
 /* Interrupt handler bottom-half */
-static void b43legacy_interrupt_tasklet(struct b43legacy_wldev *dev)
+static void b43legacy_interrupt_tasklet(unsigned long data)
 {
+	struct b43legacy_wldev *dev = (struct b43legacy_wldev *)data;
 	u32 reason;
 	u32 dma_reason[ARRAY_SIZE(dev->dma_reason)];
 	u32 merged_dma_reason = 0;
@@ -3775,7 +3776,7 @@
 	b43legacy_set_status(wldev, B43legacy_STAT_UNINIT);
 	wldev->bad_frames_preempt = modparam_bad_frames_preempt;
 	tasklet_init(&wldev->isr_tasklet,
-		     (void (*)(unsigned long))b43legacy_interrupt_tasklet,
+		     b43legacy_interrupt_tasklet,
 		     (unsigned long)wldev);
 	if (modparam_pio)
 		wldev->__using_pio = true;

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
index 5c3b62e..e0211321 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c

@@ -1934,6 +1934,7 @@
 					       BRCMF_SDIO_FT_NORMAL)) {
 				rd->len = 0;
 				brcmu_pkt_buf_free_skb(pkt);
+				continue;
 			}
 			bus->sdcnt.rx_readahead_cnt++;
 			if (rd->len != roundup(rd_new.len, 16)) {

diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c
index 910db46..a3a4709 100644
--- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c
+++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c

@@ -3220,8 +3220,9 @@
 	}
 }
 
-static void ipw2100_irq_tasklet(struct ipw2100_priv *priv)
+static void ipw2100_irq_tasklet(unsigned long data)
 {
+	struct ipw2100_priv *priv = (struct ipw2100_priv *)data;
 	struct net_device *dev = priv->net_dev;
 	unsigned long flags;
 	u32 inta, tmp;
@@ -6025,7 +6026,7 @@
 	spin_unlock_irqrestore(&priv->low_lock, flags);
 }
 
-static void ipw2100_irq_tasklet(struct ipw2100_priv *priv);
+static void ipw2100_irq_tasklet(unsigned long data);
 
 static const struct net_device_ops ipw2100_netdev_ops = {
 	.ndo_open		= ipw2100_open,
@@ -6155,7 +6156,7 @@
 	INIT_DELAYED_WORK(&priv->rf_kill, ipw2100_rf_kill);
 	INIT_DELAYED_WORK(&priv->scan_event, ipw2100_scan_event);
 
-	tasklet_init(&priv->irq_tasklet, (void (*)(unsigned long))
+	tasklet_init(&priv->irq_tasklet,
 		     ipw2100_irq_tasklet, (unsigned long)priv);
 
 	/* NOTE:  We do not start the deferred work for status checks yet */

diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.c b/drivers/net/wireless/intel/ipw2x00/ipw2200.c
index 9644e7b..04aee2f 100644
--- a/drivers/net/wireless/intel/ipw2x00/ipw2200.c
+++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.c

@@ -1959,8 +1959,9 @@
 	wireless_send_event(priv->net_dev, SIOCGIWAP, &wrqu, NULL);
 }
 
-static void ipw_irq_tasklet(struct ipw_priv *priv)
+static void ipw_irq_tasklet(unsigned long data)
 {
+	struct ipw_priv *priv = (struct ipw_priv *)data;
 	u32 inta, inta_mask, handled = 0;
 	unsigned long flags;
 	int rc = 0;
@@ -10694,7 +10695,7 @@
 	INIT_WORK(&priv->qos_activate, ipw_bg_qos_activate);
 #endif				/* CONFIG_IPW2200_QOS */
 
-	tasklet_init(&priv->irq_tasklet, (void (*)(unsigned long))
+	tasklet_init(&priv->irq_tasklet,
 		     ipw_irq_tasklet, (unsigned long)priv);
 
 	return ret;

diff --git a/drivers/net/wireless/intel/iwlegacy/3945-mac.c b/drivers/net/wireless/intel/iwlegacy/3945-mac.c
index 57e3b6c..b536ec2 100644
--- a/drivers/net/wireless/intel/iwlegacy/3945-mac.c
+++ b/drivers/net/wireless/intel/iwlegacy/3945-mac.c

@@ -1392,8 +1392,9 @@
 }
 
 static void
-il3945_irq_tasklet(struct il_priv *il)
+il3945_irq_tasklet(unsigned long data)
 {
+	struct il_priv *il = (struct il_priv *)data;
 	u32 inta, handled = 0;
 	u32 inta_fh;
 	unsigned long flags;
@@ -3419,7 +3420,7 @@
 	timer_setup(&il->watchdog, il_bg_watchdog, 0);
 
 	tasklet_init(&il->irq_tasklet,
-		     (void (*)(unsigned long))il3945_irq_tasklet,
+		     il3945_irq_tasklet,
 		     (unsigned long)il);
 }
 

diff --git a/drivers/net/wireless/intel/iwlegacy/4965-mac.c b/drivers/net/wireless/intel/iwlegacy/4965-mac.c
index 280cd8a..6fc51c7 100644
--- a/drivers/net/wireless/intel/iwlegacy/4965-mac.c
+++ b/drivers/net/wireless/intel/iwlegacy/4965-mac.c

@@ -4360,8 +4360,9 @@
 }
 
 static void
-il4965_irq_tasklet(struct il_priv *il)
+il4965_irq_tasklet(unsigned long data)
 {
+	struct il_priv *il = (struct il_priv *)data;
 	u32 inta, handled = 0;
 	u32 inta_fh;
 	unsigned long flags;
@@ -6257,7 +6258,7 @@
 	timer_setup(&il->watchdog, il_bg_watchdog, 0);
 
 	tasklet_init(&il->irq_tasklet,
-		     (void (*)(unsigned long))il4965_irq_tasklet,
+		     il4965_irq_tasklet,
 		     (unsigned long)il);
 }
 

diff --git a/drivers/net/wireless/intel/iwlegacy/common.c b/drivers/net/wireless/intel/iwlegacy/common.c
index 6514baf..e16f259 100644
--- a/drivers/net/wireless/intel/iwlegacy/common.c
+++ b/drivers/net/wireless/intel/iwlegacy/common.c

@@ -717,7 +717,7 @@
 	u32 gp = _il_rd(il, CSR_EEPROM_GP);
 	int sz;
 	int ret;
-	u16 addr;
+	int addr;
 
 	/* allocate eeprom */
 	sz = il->cfg->eeprom_size;

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tt.c b/drivers/net/wireless/intel/iwlwifi/mvm/tt.c
index 1232f63..319103f 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/tt.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tt.c

@@ -739,7 +739,8 @@
 static void iwl_mvm_thermal_zone_register(struct iwl_mvm *mvm)
 {
 	int i;
-	char name[] = "iwlwifi";
+	char name[16];
+	static atomic_t counter = ATOMIC_INIT(0);
 
 	if (!iwl_mvm_is_tt_in_fw(mvm)) {
 		mvm->tz_device.tzone = NULL;
@@ -749,6 +750,7 @@
 
 	BUILD_BUG_ON(ARRAY_SIZE(name) >= THERMAL_NAME_LENGTH);
 
+	sprintf(name, "iwlwifi_%u", atomic_inc_return(&counter) & 0xFF);
 	mvm->tz_device.tzone = thermal_zone_device_register(name,
 							IWL_MAX_DTS_TRIPS,
 							IWL_WRITABLE_TRIPS_MSK,

diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
index 4f55711..24da496 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c

@@ -3283,6 +3283,15 @@
 	spin_lock_init(&trans_pcie->reg_lock);
 	mutex_init(&trans_pcie->mutex);
 	init_waitqueue_head(&trans_pcie->ucode_write_waitq);
+
+	trans_pcie->rba.alloc_wq = alloc_workqueue("rb_allocator",
+						   WQ_HIGHPRI | WQ_UNBOUND, 1);
+	if (!trans_pcie->rba.alloc_wq) {
+		ret = -ENOMEM;
+		goto out_free_trans;
+	}
+	INIT_WORK(&trans_pcie->rba.rx_alloc, iwl_pcie_rx_allocator_work);
+
 	trans_pcie->tso_hdr_page = alloc_percpu(struct iwl_tso_hdr_page);
 	if (!trans_pcie->tso_hdr_page) {
 		ret = -ENOMEM;
@@ -3485,10 +3494,6 @@
 		trans_pcie->inta_mask = CSR_INI_SET_MASK;
 	 }
 
-	trans_pcie->rba.alloc_wq = alloc_workqueue("rb_allocator",
-						   WQ_HIGHPRI | WQ_UNBOUND, 1);
-	INIT_WORK(&trans_pcie->rba.rx_alloc, iwl_pcie_rx_allocator_work);
-
 #ifdef CONFIG_IWLWIFI_PCIE_RTPM
 	trans->runtime_pm_mode = IWL_PLAT_PM_MODE_D0I3;
 #else
@@ -3501,6 +3506,8 @@
 	iwl_pcie_free_ict(trans);
 out_no_pci:
 	free_percpu(trans_pcie->tso_hdr_page);
+	destroy_workqueue(trans_pcie->rba.alloc_wq);
+out_free_trans:
 	iwl_trans_free(trans);
 	return ERR_PTR(ret);
 }

diff --git a/drivers/net/wireless/intersil/hostap/hostap_ap.c b/drivers/net/wireless/intersil/hostap/hostap_ap.c
index 0094b1d..3ec46f4 100644
--- a/drivers/net/wireless/intersil/hostap/hostap_ap.c
+++ b/drivers/net/wireless/intersil/hostap/hostap_ap.c

@@ -2508,7 +2508,7 @@
 		sta->supported_rates[0] = 2;
 	if (sta->tx_supp_rates & WLAN_RATE_2M)
 		sta->supported_rates[1] = 4;
- 	if (sta->tx_supp_rates & WLAN_RATE_5M5)
+	if (sta->tx_supp_rates & WLAN_RATE_5M5)
 		sta->supported_rates[2] = 11;
 	if (sta->tx_supp_rates & WLAN_RATE_11M)
 		sta->supported_rates[3] = 22;

diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
index 2c7dd2a..b704e4b 100644
--- a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
+++ b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c

@@ -1364,7 +1364,8 @@
 	int retval;
 
 	BUG_ON(in_interrupt());
-	BUG_ON(!upriv);
+	if (!upriv)
+		return -EINVAL;
 
 	upriv->reply_count = 0;
 	/* Write the MAGIC number on the simulated registers to keep

diff --git a/drivers/net/wireless/marvell/mwifiex/main.h b/drivers/net/wireless/marvell/mwifiex/main.h
index e39bb5c..7e52601 100644
--- a/drivers/net/wireless/marvell/mwifiex/main.h
+++ b/drivers/net/wireless/marvell/mwifiex/main.h

@@ -1294,19 +1294,6 @@
 	return pos;
 }
 
-/* This function return interface number with the same bss_type.
- */
-static inline u8
-mwifiex_get_intf_num(struct mwifiex_adapter *adapter, u8 bss_type)
-{
-	u8 i, num = 0;
-
-	for (i = 0; i < adapter->priv_num; i++)
-		if (adapter->priv[i] && adapter->priv[i]->bss_type == bss_type)
-			num++;
-	return num;
-}
-
 /*
  * This function returns the correct private structure pointer based
  * upon the BSS type and BSS number.

diff --git a/drivers/net/wireless/marvell/mwifiex/tdls.c b/drivers/net/wireless/marvell/mwifiex/tdls.c
index 6058c48..b6b7bbe 100644
--- a/drivers/net/wireless/marvell/mwifiex/tdls.c
+++ b/drivers/net/wireless/marvell/mwifiex/tdls.c

@@ -897,7 +897,7 @@
 	u8 *peer, *pos, *end;
 	u8 i, action, basic;
 	u16 cap = 0;
-	int ie_len = 0;
+	int ies_len = 0;
 
 	if (len < (sizeof(struct ethhdr) + 3))
 		return;
@@ -919,7 +919,7 @@
 		pos = buf + sizeof(struct ethhdr) + 4;
 		/* payload 1+ category 1 + action 1 + dialog 1 */
 		cap = get_unaligned_le16(pos);
-		ie_len = len - sizeof(struct ethhdr) - TDLS_REQ_FIX_LEN;
+		ies_len = len - sizeof(struct ethhdr) - TDLS_REQ_FIX_LEN;
 		pos += 2;
 		break;
 
@@ -929,7 +929,7 @@
 		/* payload 1+ category 1 + action 1 + dialog 1 + status code 2*/
 		pos = buf + sizeof(struct ethhdr) + 6;
 		cap = get_unaligned_le16(pos);
-		ie_len = len - sizeof(struct ethhdr) - TDLS_RESP_FIX_LEN;
+		ies_len = len - sizeof(struct ethhdr) - TDLS_RESP_FIX_LEN;
 		pos += 2;
 		break;
 
@@ -937,7 +937,7 @@
 		if (len < (sizeof(struct ethhdr) + TDLS_CONFIRM_FIX_LEN))
 			return;
 		pos = buf + sizeof(struct ethhdr) + TDLS_CONFIRM_FIX_LEN;
-		ie_len = len - sizeof(struct ethhdr) - TDLS_CONFIRM_FIX_LEN;
+		ies_len = len - sizeof(struct ethhdr) - TDLS_CONFIRM_FIX_LEN;
 		break;
 	default:
 		mwifiex_dbg(priv->adapter, ERROR, "Unknown TDLS frame type.\n");
@@ -950,33 +950,33 @@
 
 	sta_ptr->tdls_cap.capab = cpu_to_le16(cap);
 
-	for (end = pos + ie_len; pos + 1 < end; pos += 2 + pos[1]) {
-		if (pos + 2 + pos[1] > end)
+	for (end = pos + ies_len; pos + 1 < end; pos += 2 + pos[1]) {
+		u8 ie_len = pos[1];
+
+		if (pos + 2 + ie_len > end)
 			break;
 
 		switch (*pos) {
 		case WLAN_EID_SUPP_RATES:
-			if (pos[1] > 32)
+			if (ie_len > sizeof(sta_ptr->tdls_cap.rates))
 				return;
-			sta_ptr->tdls_cap.rates_len = pos[1];
-			for (i = 0; i < pos[1]; i++)
+			sta_ptr->tdls_cap.rates_len = ie_len;
+			for (i = 0; i < ie_len; i++)
 				sta_ptr->tdls_cap.rates[i] = pos[i + 2];
 			break;
 
 		case WLAN_EID_EXT_SUPP_RATES:
-			if (pos[1] > 32)
+			if (ie_len > sizeof(sta_ptr->tdls_cap.rates))
 				return;
 			basic = sta_ptr->tdls_cap.rates_len;
-			if (pos[1] > 32 - basic)
+			if (ie_len > sizeof(sta_ptr->tdls_cap.rates) - basic)
 				return;
-			for (i = 0; i < pos[1]; i++)
+			for (i = 0; i < ie_len; i++)
 				sta_ptr->tdls_cap.rates[basic + i] = pos[i + 2];
-			sta_ptr->tdls_cap.rates_len += pos[1];
+			sta_ptr->tdls_cap.rates_len += ie_len;
 			break;
 		case WLAN_EID_HT_CAPABILITY:
-			if (pos > end - sizeof(struct ieee80211_ht_cap) - 2)
-				return;
-			if (pos[1] != sizeof(struct ieee80211_ht_cap))
+			if (ie_len != sizeof(struct ieee80211_ht_cap))
 				return;
 			/* copy the ie's value into ht_capb*/
 			memcpy((u8 *)&sta_ptr->tdls_cap.ht_capb, pos + 2,
@@ -984,59 +984,45 @@
 			sta_ptr->is_11n_enabled = 1;
 			break;
 		case WLAN_EID_HT_OPERATION:
-			if (pos > end -
-			    sizeof(struct ieee80211_ht_operation) - 2)
-				return;
-			if (pos[1] != sizeof(struct ieee80211_ht_operation))
+			if (ie_len != sizeof(struct ieee80211_ht_operation))
 				return;
 			/* copy the ie's value into ht_oper*/
 			memcpy(&sta_ptr->tdls_cap.ht_oper, pos + 2,
 			       sizeof(struct ieee80211_ht_operation));
 			break;
 		case WLAN_EID_BSS_COEX_2040:
-			if (pos > end - 3)
-				return;
-			if (pos[1] != 1)
+			if (ie_len != sizeof(pos[2]))
 				return;
 			sta_ptr->tdls_cap.coex_2040 = pos[2];
 			break;
 		case WLAN_EID_EXT_CAPABILITY:
-			if (pos > end - sizeof(struct ieee_types_header))
+			if (ie_len < sizeof(struct ieee_types_header))
 				return;
-			if (pos[1] < sizeof(struct ieee_types_header))
-				return;
-			if (pos[1] > 8)
+			if (ie_len > 8)
 				return;
 			memcpy((u8 *)&sta_ptr->tdls_cap.extcap, pos,
 			       sizeof(struct ieee_types_header) +
-			       min_t(u8, pos[1], 8));
+			       min_t(u8, ie_len, 8));
 			break;
 		case WLAN_EID_RSN:
-			if (pos > end - sizeof(struct ieee_types_header))
+			if (ie_len < sizeof(struct ieee_types_header))
 				return;
-			if (pos[1] < sizeof(struct ieee_types_header))
-				return;
-			if (pos[1] > IEEE_MAX_IE_SIZE -
+			if (ie_len > IEEE_MAX_IE_SIZE -
 			    sizeof(struct ieee_types_header))
 				return;
 			memcpy((u8 *)&sta_ptr->tdls_cap.rsn_ie, pos,
 			       sizeof(struct ieee_types_header) +
-			       min_t(u8, pos[1], IEEE_MAX_IE_SIZE -
+			       min_t(u8, ie_len, IEEE_MAX_IE_SIZE -
 				     sizeof(struct ieee_types_header)));
 			break;
 		case WLAN_EID_QOS_CAPA:
-			if (pos > end - 3)
-				return;
-			if (pos[1] != 1)
+			if (ie_len != sizeof(pos[2]))
 				return;
 			sta_ptr->tdls_cap.qos_info = pos[2];
 			break;
 		case WLAN_EID_VHT_OPERATION:
 			if (priv->adapter->is_hw_11ac_capable) {
-				if (pos > end -
-				    sizeof(struct ieee80211_vht_operation) - 2)
-					return;
-				if (pos[1] !=
+				if (ie_len !=
 				    sizeof(struct ieee80211_vht_operation))
 					return;
 				/* copy the ie's value into vhtoper*/
@@ -1046,10 +1032,7 @@
 			break;
 		case WLAN_EID_VHT_CAPABILITY:
 			if (priv->adapter->is_hw_11ac_capable) {
-				if (pos > end -
-				    sizeof(struct ieee80211_vht_cap) - 2)
-					return;
-				if (pos[1] != sizeof(struct ieee80211_vht_cap))
+				if (ie_len != sizeof(struct ieee80211_vht_cap))
 					return;
 				/* copy the ie's value into vhtcap*/
 				memcpy((u8 *)&sta_ptr->tdls_cap.vhtcap, pos + 2,
@@ -1059,9 +1042,7 @@
 			break;
 		case WLAN_EID_AID:
 			if (priv->adapter->is_hw_11ac_capable) {
-				if (pos > end - 4)
-					return;
-				if (pos[1] != 2)
+				if (ie_len != sizeof(u16))
 					return;
 				sta_ptr->tdls_cap.aid =
 					get_unaligned_le16((pos + 2));

diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c
index 5d1fda1..8374957 100644
--- a/drivers/net/wireless/realtek/rtlwifi/pci.c
+++ b/drivers/net/wireless/realtek/rtlwifi/pci.c

@@ -1082,13 +1082,15 @@
 	return ret;
 }
 
-static void _rtl_pci_irq_tasklet(struct ieee80211_hw *hw)
+static void _rtl_pci_irq_tasklet(unsigned long data)
 {
+	struct ieee80211_hw *hw = (struct ieee80211_hw *)data;
 	_rtl_pci_tx_chk_waitq(hw);
 }
 
-static void _rtl_pci_prepare_bcn_tasklet(struct ieee80211_hw *hw)
+static void _rtl_pci_prepare_bcn_tasklet(unsigned long data)
 {
+	struct ieee80211_hw *hw = (struct ieee80211_hw *)data;
 	struct rtl_priv *rtlpriv = rtl_priv(hw);
 	struct rtl_pci *rtlpci = rtl_pcidev(rtl_pcipriv(hw));
 	struct rtl_mac *mac = rtl_mac(rtl_priv(hw));
@@ -1214,10 +1216,10 @@
 
 	/*task */
 	tasklet_init(&rtlpriv->works.irq_tasklet,
-		     (void (*)(unsigned long))_rtl_pci_irq_tasklet,
+		     _rtl_pci_irq_tasklet,
 		     (unsigned long)hw);
 	tasklet_init(&rtlpriv->works.irq_prepare_bcn_tasklet,
-		     (void (*)(unsigned long))_rtl_pci_prepare_bcn_tasklet,
+		     _rtl_pci_prepare_bcn_tasklet,
 		     (unsigned long)hw);
 	INIT_WORK(&rtlpriv->works.lps_change_work,
 		  rtl_lps_change_work_callback);

diff --git a/drivers/nfc/pn544/i2c.c b/drivers/nfc/pn544/i2c.c
index d0207f8..dcef73e 100644
--- a/drivers/nfc/pn544/i2c.c
+++ b/drivers/nfc/pn544/i2c.c

@@ -236,6 +236,7 @@
 
 out:
 	gpiod_set_value_cansleep(phy->gpiod_en, !phy->en_polarity);
+	usleep_range(10000, 15000);
 }
 
 static void pn544_hci_i2c_enable_mode(struct pn544_i2c_phy *phy, int run_mode)

diff --git a/drivers/nfc/port100.c b/drivers/nfc/port100.c
index 60ae382..06bb226 100644
--- a/drivers/nfc/port100.c
+++ b/drivers/nfc/port100.c

@@ -574,7 +574,7 @@
 {
 	struct port100_frame *frame = _frame;
 
-	frame->datalen = cpu_to_le16(le16_to_cpu(frame->datalen) + len);
+	le16_add_cpu(&frame->datalen, len);
 }
 
 static bool port100_rx_frame_is_valid(void *_frame)

diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index fb667bf..13510ba 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c

@@ -263,7 +263,7 @@
 	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
 	unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
 	sector_t sector = offset >> 9;
-	int rc = 0;
+	int rc = 0, ret = 0;
 
 	if (unlikely(!size))
 		return 0;
@@ -301,7 +301,9 @@
 	}
 
 	memcpy_flushcache(nsio->addr + offset, buf, size);
-	nvdimm_flush(to_nd_region(ndns->dev.parent));
+	ret = nvdimm_flush(to_nd_region(ndns->dev.parent), NULL);
+	if (ret)
+		rc = ret;
 
 	return rc;
 }

diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 01e194a..fbb01a7 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h

@@ -163,6 +163,7 @@
 	struct badblocks bb;
 	struct nd_interleave_set *nd_set;
 	struct nd_percpu_lane __percpu *lane;
+	int (*flush)(struct nd_region *nd_region, struct bio *bio);
 	struct nd_mapping mapping[0];
 };
 

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index a7ce2f1..68b4a90 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c

@@ -192,6 +192,7 @@
 
 static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 {
+	int ret = 0;
 	blk_status_t rc = 0;
 	bool do_acct;
 	unsigned long start;
@@ -201,7 +202,7 @@
 	struct nd_region *nd_region = to_region(pmem);
 
 	if (bio->bi_opf & REQ_PREFLUSH)
-		nvdimm_flush(nd_region);
+		ret = nvdimm_flush(nd_region, bio);
 
 	do_acct = nd_iostat_start(bio, &start);
 	bio_for_each_segment(bvec, bio, iter) {
@@ -216,7 +217,10 @@
 		nd_iostat_end(bio, start);
 
 	if (bio->bi_opf & REQ_FUA)
-		nvdimm_flush(nd_region);
+		ret = nvdimm_flush(nd_region, bio);
+
+	if (ret)
+		bio->bi_status = errno_to_blk_status(ret);
 
 	bio_endio(bio);
 	return BLK_QC_T_NONE;
@@ -301,6 +305,7 @@
 
 static const struct dax_operations pmem_dax_ops = {
 	.direct_access = pmem_dax_direct_access,
+	.dax_supported = generic_fsdax_supported,
 	.copy_from_iter = pmem_copy_from_iter,
 	.copy_to_iter = pmem_copy_to_iter,
 };
@@ -371,6 +376,7 @@
 	struct gendisk *disk;
 	void *addr;
 	int rc;
+	unsigned long flags = 0UL;
 
 	pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
 	if (!pmem)
@@ -468,14 +474,15 @@
 	nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_res);
 	disk->bb = &pmem->bb;
 
-	dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops);
+	if (is_nvdimm_sync(nd_region))
+		flags = DAXDEV_F_SYNC;
+	dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops, flags);
 	if (!dax_dev) {
 		put_disk(disk);
 		return -ENOMEM;
 	}
 	dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
 	pmem->dax_dev = dax_dev;
-
 	gendev = disk_to_dev(disk);
 	gendev->groups = pmem_attribute_groups;
 
@@ -533,14 +540,14 @@
 		sysfs_put(pmem->bb_state);
 		pmem->bb_state = NULL;
 	}
-	nvdimm_flush(to_nd_region(dev->parent));
+	nvdimm_flush(to_nd_region(dev->parent), NULL);
 
 	return 0;
 }
 
 static void nd_pmem_shutdown(struct device *dev)
 {
-	nvdimm_flush(to_nd_region(dev->parent));
+	nvdimm_flush(to_nd_region(dev->parent), NULL);
 }
 
 static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)

diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 609fc45..aa0f6f5 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c

@@ -290,7 +290,9 @@
 		return rc;
 	if (!flush)
 		return -EINVAL;
-	nvdimm_flush(nd_region);
+	rc = nvdimm_flush(nd_region, NULL);
+	if (rc)
+		return rc;
 
 	return len;
 }
@@ -1076,6 +1078,11 @@
 	dev->of_node = ndr_desc->of_node;
 	nd_region->ndr_size = resource_size(ndr_desc->res);
 	nd_region->ndr_start = ndr_desc->res->start;
+	if (ndr_desc->flush)
+		nd_region->flush = ndr_desc->flush;
+	else
+		nd_region->flush = NULL;
+
 	nd_device_register(dev);
 
 	return nd_region;
@@ -1116,11 +1123,24 @@
 }
 EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create);
 
+int nvdimm_flush(struct nd_region *nd_region, struct bio *bio)
+{
+	int rc = 0;
+
+	if (!nd_region->flush)
+		rc = generic_nvdimm_flush(nd_region);
+	else {
+		if (nd_region->flush(nd_region, bio))
+			rc = -EIO;
+	}
+
+	return rc;
+}
 /**
  * nvdimm_flush - flush any posted write queues between the cpu and pmem media
  * @nd_region: blk or interleaved pmem region
  */
-void nvdimm_flush(struct nd_region *nd_region)
+int generic_nvdimm_flush(struct nd_region *nd_region)
 {
 	struct nd_region_data *ndrd = dev_get_drvdata(&nd_region->dev);
 	int i, idx;
@@ -1144,6 +1164,8 @@
 		if (ndrd_get_flush_wpq(ndrd, i, 0))
 			writeq(1, ndrd_get_flush_wpq(ndrd, i, idx));
 	wmb();
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(nvdimm_flush);
 
@@ -1188,6 +1210,13 @@
 }
 EXPORT_SYMBOL_GPL(nvdimm_has_cache);
 
+bool is_nvdimm_sync(struct nd_region *nd_region)
+{
+	return is_nd_pmem(&nd_region->dev) &&
+		!test_bit(ND_REGION_ASYNC, &nd_region->flags);
+}
+EXPORT_SYMBOL_GPL(is_nvdimm_sync);
+
 struct conflict_context {
 	struct nd_region *nd_region;
 	resource_size_t start, size;

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index b7bd89b..6d20356 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c

@@ -1055,15 +1055,15 @@
 	return id;
 }
 
-static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
-		      void *buffer, size_t buflen, u32 *result)
+static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
+		unsigned int dword11, void *buffer, size_t buflen, u32 *result)
 {
 	struct nvme_command c;
 	union nvme_result res;
 	int ret;
 
 	memset(&c, 0, sizeof(c));
-	c.features.opcode = nvme_admin_set_features;
+	c.features.opcode = op;
 	c.features.fid = cpu_to_le32(fid);
 	c.features.dword11 = cpu_to_le32(dword11);
 
@@ -1074,6 +1074,24 @@
 	return ret;
 }
 
+int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
+		      unsigned int dword11, void *buffer, size_t buflen,
+		      u32 *result)
+{
+	return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
+			     buflen, result);
+}
+EXPORT_SYMBOL_GPL(nvme_set_features);
+
+int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
+		      unsigned int dword11, void *buffer, size_t buflen,
+		      u32 *result)
+{
+	return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
+			     buflen, result);
+}
+EXPORT_SYMBOL_GPL(nvme_get_features);
+
 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
 {
 	u32 q_count = (*count - 1) | ((*count - 1) << 16);
@@ -3449,7 +3467,7 @@
 	if (!log)
 		return;
 
-	if (nvme_get_log(ctrl, NVME_NSID_ALL, 0, NVME_LOG_FW_SLOT, log,
+	if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log,
 			sizeof(*log), 0))
 		dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
 	kfree(log);
@@ -3761,6 +3779,17 @@
 }
 EXPORT_SYMBOL_GPL(nvme_start_queues);
 
+void nvme_sync_queues(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_sync_queue(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_sync_queues);
+
 int __init nvme_core_init(void)
 {
 	int result = -ENOMEM;

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 838ee58..e8bc25a 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c

@@ -569,6 +569,7 @@
 	}
 
 	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
+	kfree(ctrl->ana_log_buf);
 	ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
 	if (!ctrl->ana_log_buf) {
 		error = -ENOMEM;

diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index cc4273f..40192b6 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h

@@ -436,6 +436,7 @@
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
 void nvme_kill_queues(struct nvme_ctrl *ctrl);
+void nvme_sync_queues(struct nvme_ctrl *ctrl);
 void nvme_unfreeze(struct nvme_ctrl *ctrl);
 void nvme_wait_freeze(struct nvme_ctrl *ctrl);
 void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
@@ -453,6 +454,12 @@
 		union nvme_result *result, void *buffer, unsigned bufflen,
 		unsigned timeout, int qid, int at_head,
 		blk_mq_req_flags_t flags);
+int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
+		      unsigned int dword11, void *buffer, size_t buflen,
+		      u32 *result);
+int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
+		      unsigned int dword11, void *buffer, size_t buflen,
+		      u32 *result);
 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
 int nvme_reset_ctrl(struct nvme_ctrl *ctrl);

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 3c68a5b..7c00c85 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c

@@ -26,6 +26,7 @@
 #include <linux/mutex.h>
 #include <linux/once.h>
 #include <linux/pci.h>
+#include <linux/suspend.h>
 #include <linux/t10-pi.h>
 #include <linux/types.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
@@ -106,6 +107,7 @@
 	u32 cmbloc;
 	struct nvme_ctrl ctrl;
 	struct completion ioq_wait;
+	u32 last_ps;
 
 	mempool_t *iod_mempool;
 
@@ -1132,7 +1134,6 @@
 	struct nvme_dev *dev = nvmeq->dev;
 	struct request *abort_req;
 	struct nvme_command cmd;
-	bool shutdown = false;
 	u32 csts = readl(dev->bar + NVME_REG_CSTS);
 
 	/* If PCI error recovery process is happening, we cannot reset or
@@ -1169,16 +1170,18 @@
 	 * shutdown, so we return BLK_EH_DONE.
 	 */
 	switch (dev->ctrl.state) {
-	case NVME_CTRL_DELETING:
-		shutdown = true;
 	case NVME_CTRL_CONNECTING:
-	case NVME_CTRL_RESETTING:
+		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
+		/* fall through */
+	case NVME_CTRL_DELETING:
 		dev_warn_ratelimited(dev->ctrl.device,
 			 "I/O %d QID %d timeout, disable controller\n",
 			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, shutdown);
+		nvme_dev_disable(dev, true);
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
 		return BLK_EH_DONE;
+	case NVME_CTRL_RESETTING:
+		return BLK_EH_RESET_TIMER;
 	default:
 		break;
 	}
@@ -2150,7 +2153,7 @@
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 {
 	int i;
-	bool dead = true;
+	bool dead = true, freeze = false;
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
 	mutex_lock(&dev->shutdown_lock);
@@ -2158,8 +2161,10 @@
 		u32 csts = readl(dev->bar + NVME_REG_CSTS);
 
 		if (dev->ctrl.state == NVME_CTRL_LIVE ||
-		    dev->ctrl.state == NVME_CTRL_RESETTING)
+		    dev->ctrl.state == NVME_CTRL_RESETTING) {
+			freeze = true;
 			nvme_start_freeze(&dev->ctrl);
+		}
 		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
 			pdev->error_state  != pci_channel_io_normal);
 	}
@@ -2168,10 +2173,8 @@
 	 * Give the controller a chance to complete all entered requests if
 	 * doing a safe shutdown.
 	 */
-	if (!dead) {
-		if (shutdown)
-			nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
-	}
+	if (!dead && shutdown && freeze)
+		nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
 
 	nvme_stop_queues(&dev->ctrl);
 
@@ -2269,6 +2272,7 @@
 	 */
 	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
 		nvme_dev_disable(dev, false);
+	nvme_sync_queues(&dev->ctrl);
 
 	mutex_lock(&dev->shutdown_lock);
 	result = nvme_pci_enable(dev);
@@ -2608,12 +2612,68 @@
 }
 
 #ifdef CONFIG_PM_SLEEP
-static int nvme_suspend(struct device *dev)
+static int nvme_deep_state(struct nvme_dev *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
+	struct nvme_ctrl *ctrl = &dev->ctrl;
+	int ret = -EBUSY;;
+
+	nvme_start_freeze(ctrl);
+	nvme_wait_freeze(ctrl);
+	nvme_sync_queues(ctrl);
+
+	if (ctrl->state != NVME_CTRL_LIVE &&
+	    ctrl->state != NVME_CTRL_ADMIN_ONLY)
+		goto unfreeze;
+
+	dev->last_ps = 0;
+	ret = nvme_get_features(ctrl, NVME_FEAT_POWER_MGMT, 0, NULL, 0,
+				&dev->last_ps);
+	if (ret < 0)
+		goto unfreeze;
+
+	ret = nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, dev->ctrl.npss,
+				NULL, 0, NULL);
+	if (ret < 0)
+		goto unfreeze;
+	if (ret) {
+		/*
+		 * Clearing npss forces a controller reset on resume. The
+		 * correct value will be resdicovered then.
+		 */
+		ctrl->npss = 0;
+		nvme_dev_disable(dev, true);
+		ret = 0;
+	} else {
+		/*
+		 * A saved state prevents pci pm from generically controlling
+		 * the device's power. If we're using protocol specific
+		 * settings, we don't want pci interfering.
+		 */
+		pci_save_state(pdev);
+	}
+unfreeze:
+	nvme_unfreeze(ctrl);
+	return ret;
+}
+
+static int nvme_make_operational(struct nvme_dev *dev)
+{
+	struct nvme_ctrl *ctrl = &dev->ctrl;
+
+	if (nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, dev->last_ps,
+			      NULL, 0, NULL) == 0)
+		return 0;
+	nvme_reset_ctrl(ctrl);
+	return 0;
+}
+
+static int nvme_simple_resume(struct device *dev)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
 	struct nvme_dev *ndev = pci_get_drvdata(pdev);
 
-	nvme_dev_disable(ndev, true);
+	nvme_reset_ctrl(&ndev->ctrl);
 	return 0;
 }
 
@@ -2622,12 +2682,45 @@
 	struct pci_dev *pdev = to_pci_dev(dev);
 	struct nvme_dev *ndev = pci_get_drvdata(pdev);
 
-	nvme_reset_ctrl(&ndev->ctrl);
+	return pm_resume_via_firmware() || !ndev->ctrl.npss ?
+		nvme_simple_resume(dev) : nvme_make_operational(ndev);
+}
+
+static int nvme_simple_suspend(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct nvme_dev *ndev = pci_get_drvdata(pdev);
+
+	nvme_dev_disable(ndev, true);
 	return 0;
 }
-#endif
 
-static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);
+static int nvme_suspend(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct nvme_dev *ndev = pci_get_drvdata(pdev);
+
+	/*
+	 * The platform does not remove power for a kernel managed suspend so
+	 * use host managed nvme power settings for lowest idle power. This
+	 * should have quicker resume latency than a full device shutdown.
+	 */
+	return pm_suspend_via_firmware() || !ndev->ctrl.npss ?
+		nvme_simple_suspend(dev) : nvme_deep_state(ndev);
+}
+
+const struct dev_pm_ops nvme_dev_pm_ops = {
+	.suspend = nvme_suspend,
+	.resume = nvme_resume,
+	.freeze = nvme_simple_suspend,
+	.thaw = nvme_simple_resume,
+	.poweroff = nvme_simple_suspend,
+	.restore = nvme_simple_resume,
+};
+
+#else
+const struct dev_pm_ops nvme_dev_pm_ops = {};
+#endif
 
 static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
 						pci_channel_state_t state)

diff --git a/drivers/pci/controller/pcie-iproc.c b/drivers/pci/controller/pcie-iproc.c
index 9d5cbc7..ec86414 100644
--- a/drivers/pci/controller/pcie-iproc.c
+++ b/drivers/pci/controller/pcie-iproc.c

@@ -1526,6 +1526,30 @@
 DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_BROADCOM, 0xd804,
 			quirk_paxc_disable_msi_parsing);
 
+static void quirk_paxc_bridge(struct pci_dev *pdev)
+{
+	/*
+	 * The PCI config space is shared with the PAXC root port and the first
+	 * Ethernet device.  So, we need to workaround this by telling the PCI
+	 * code that the bridge is not an Ethernet device.
+	 */
+	if (pdev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
+		pdev->class = PCI_CLASS_BRIDGE_PCI << 8;
+
+	/*
+	 * MPSS is not being set properly (as it is currently 0).  This is
+	 * because that area of the PCI config space is hard coded to zero, and
+	 * is not modifiable by firmware.  Set this to 2 (e.g., 512 byte MPS)
+	 * so that the MPS can be set to the real max value.
+	 */
+	pdev->pcie_mpss = 2;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_BROADCOM, 0x16cd, quirk_paxc_bridge);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_BROADCOM, 0x16f0, quirk_paxc_bridge);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_BROADCOM, 0xd750, quirk_paxc_bridge);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_BROADCOM, 0xd802, quirk_paxc_bridge);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_BROADCOM, 0xd804, quirk_paxc_bridge);
+
 MODULE_AUTHOR("Ray Jui <rjui@broadcom.com>");
 MODULE_DESCRIPTION("Broadcom iPROC PCIe common driver");
 MODULE_LICENSE("GPL v2");

diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 5b4c36a..419dda6 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c

@@ -1848,19 +1848,40 @@
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	0x260a, quirk_intel_pcie_pm);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	0x260b, quirk_intel_pcie_pm);
 
+static void quirk_d3hot_delay(struct pci_dev *dev, unsigned int delay)
+{
+	if (dev->d3_delay >= delay)
+		return;
+
+	dev->d3_delay = delay;
+	pci_info(dev, "extending delay after power-on from D3hot to %d msec\n",
+		 dev->d3_delay);
+}
+
 static void quirk_radeon_pm(struct pci_dev *dev)
 {
 	if (dev->subsystem_vendor == PCI_VENDOR_ID_APPLE &&
-	    dev->subsystem_device == 0x00e2) {
-		if (dev->d3_delay < 20) {
-			dev->d3_delay = 20;
-			pci_info(dev, "extending delay after power-on from D3 to %d msec\n",
-				 dev->d3_delay);
-		}
-	}
+	    dev->subsystem_device == 0x00e2)
+		quirk_d3hot_delay(dev, 20);
 }
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x6741, quirk_radeon_pm);
 
+/*
+ * Ryzen5/7 XHCI controllers fail upon resume from runtime suspend or s2idle.
+ * https://bugzilla.kernel.org/show_bug.cgi?id=205587
+ *
+ * The kernel attempts to transition these devices to D3cold, but that seems
+ * to be ineffective on the platforms in question; the PCI device appears to
+ * remain on in D3hot state. The D3hot-to-D0 transition then requires an
+ * extended delay in order to succeed.
+ */
+static void quirk_ryzen_xhci_d3hot(struct pci_dev *dev)
+{
+	quirk_d3hot_delay(dev, 20);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x15e0, quirk_ryzen_xhci_d3hot);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x15e1, quirk_ryzen_xhci_d3hot);
+
 #ifdef CONFIG_X86_IO_APIC
 static int dmi_disable_ioapicreroute(const struct dmi_system_id *d)
 {
@@ -2358,32 +2379,6 @@
 			 PCI_DEVICE_ID_TIGON3_5719,
 			 quirk_brcm_5719_limit_mrrs);
 
-#ifdef CONFIG_PCIE_IPROC_PLATFORM
-static void quirk_paxc_bridge(struct pci_dev *pdev)
-{
-	/*
-	 * The PCI config space is shared with the PAXC root port and the first
-	 * Ethernet device.  So, we need to workaround this by telling the PCI
-	 * code that the bridge is not an Ethernet device.
-	 */
-	if (pdev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
-		pdev->class = PCI_CLASS_BRIDGE_PCI << 8;
-
-	/*
-	 * MPSS is not being set properly (as it is currently 0).  This is
-	 * because that area of the PCI config space is hard coded to zero, and
-	 * is not modifiable by firmware.  Set this to 2 (e.g., 512 byte MPS)
-	 * so that the MPS can be set to the real max value.
-	 */
-	pdev->pcie_mpss = 2;
-}
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_BROADCOM, 0x16cd, quirk_paxc_bridge);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_BROADCOM, 0x16f0, quirk_paxc_bridge);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_BROADCOM, 0xd750, quirk_paxc_bridge);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_BROADCOM, 0xd802, quirk_paxc_bridge);
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_BROADCOM, 0xd804, quirk_paxc_bridge);
-#endif
-
 /*
  * Originally in EDAC sources for i82875P: Intel tells BIOS developers to
  * hide device 6 which configures the overflow device access containing the

diff --git a/drivers/pinctrl/intel/pinctrl-baytrail.c b/drivers/pinctrl/intel/pinctrl-baytrail.c
index 021e28f..a760d8b 100644
--- a/drivers/pinctrl/intel/pinctrl-baytrail.c
+++ b/drivers/pinctrl/intel/pinctrl-baytrail.c

@@ -950,7 +950,13 @@
 
 	raw_spin_lock_irqsave(&byt_lock, flags);
 	value = readl(reg);
-	value &= ~(BYT_TRIG_POS | BYT_TRIG_NEG | BYT_TRIG_LVL);
+
+	/* Do not clear direct-irq enabled IRQs (from gpio_disable_free) */
+	if (value & BYT_DIRECT_IRQ_EN)
+		/* nothing to do */ ;
+	else
+		value &= ~(BYT_TRIG_POS | BYT_TRIG_NEG | BYT_TRIG_LVL);
+
 	writel(value, reg);
 	raw_spin_unlock_irqrestore(&byt_lock, flags);
 }

diff --git a/drivers/pinctrl/sh-pfc/pfc-sh7264.c b/drivers/pinctrl/sh-pfc/pfc-sh7264.c
index e1c34e1..3ddb956 100644
--- a/drivers/pinctrl/sh-pfc/pfc-sh7264.c
+++ b/drivers/pinctrl/sh-pfc/pfc-sh7264.c

@@ -500,17 +500,15 @@
 	SD_WP_MARK, SD_CLK_MARK, SD_CMD_MARK,
 	CRX0_MARK, CRX1_MARK,
 	CTX0_MARK, CTX1_MARK,
+	CRX0_CRX1_MARK, CTX0_CTX1_MARK,
 
 	PWM1A_MARK, PWM1B_MARK, PWM1C_MARK, PWM1D_MARK,
 	PWM1E_MARK, PWM1F_MARK, PWM1G_MARK, PWM1H_MARK,
 	PWM2A_MARK, PWM2B_MARK, PWM2C_MARK, PWM2D_MARK,
 	PWM2E_MARK, PWM2F_MARK, PWM2G_MARK, PWM2H_MARK,
 	IERXD_MARK, IETXD_MARK,
-	CRX0_CRX1_MARK,
 	WDTOVF_MARK,
 
-	CRX0X1_MARK,
-
 	/* DMAC */
 	TEND0_MARK, DACK0_MARK, DREQ0_MARK,
 	TEND1_MARK, DACK1_MARK, DREQ1_MARK,
@@ -998,12 +996,12 @@
 
 	PINMUX_DATA(PJ3_DATA, PJ3MD_00),
 	PINMUX_DATA(CRX1_MARK, PJ3MD_01),
-	PINMUX_DATA(CRX0X1_MARK, PJ3MD_10),
+	PINMUX_DATA(CRX0_CRX1_MARK, PJ3MD_10),
 	PINMUX_DATA(IRQ1_PJ_MARK, PJ3MD_11),
 
 	PINMUX_DATA(PJ2_DATA, PJ2MD_000),
 	PINMUX_DATA(CTX1_MARK, PJ2MD_001),
-	PINMUX_DATA(CRX0_CRX1_MARK, PJ2MD_010),
+	PINMUX_DATA(CTX0_CTX1_MARK, PJ2MD_010),
 	PINMUX_DATA(CS2_MARK, PJ2MD_011),
 	PINMUX_DATA(SCK0_MARK, PJ2MD_100),
 	PINMUX_DATA(LCD_M_DISP_MARK, PJ2MD_101),
@@ -1248,6 +1246,7 @@
 	GPIO_FN(CTX1),
 	GPIO_FN(CRX1),
 	GPIO_FN(CTX0),
+	GPIO_FN(CTX0_CTX1),
 	GPIO_FN(CRX0),
 	GPIO_FN(CRX0_CRX1),
 

diff --git a/drivers/pinctrl/sh-pfc/pfc-sh7269.c b/drivers/pinctrl/sh-pfc/pfc-sh7269.c
index cfdb4fc1..3df0c0d 100644
--- a/drivers/pinctrl/sh-pfc/pfc-sh7269.c
+++ b/drivers/pinctrl/sh-pfc/pfc-sh7269.c

@@ -740,13 +740,12 @@
 	CRX0_MARK, CTX0_MARK,
 	CRX1_MARK, CTX1_MARK,
 	CRX2_MARK, CTX2_MARK,
-	CRX0_CRX1_MARK,
-	CRX0_CRX1_CRX2_MARK,
-	CTX0CTX1CTX2_MARK,
+	CRX0_CRX1_MARK, CTX0_CTX1_MARK,
+	CRX0_CRX1_CRX2_MARK, CTX0_CTX1_CTX2_MARK,
 	CRX1_PJ22_MARK, CTX1_PJ23_MARK,
 	CRX2_PJ20_MARK, CTX2_PJ21_MARK,
-	CRX0CRX1_PJ22_MARK,
-	CRX0CRX1CRX2_PJ20_MARK,
+	CRX0_CRX1_PJ22_MARK, CTX0_CTX1_PJ23_MARK,
+	CRX0_CRX1_CRX2_PJ20_MARK, CTX0_CTX1_CTX2_PJ21_MARK,
 
 	/* VDC */
 	DV_CLK_MARK,
@@ -824,6 +823,7 @@
 	PINMUX_DATA(CS3_MARK, PC8MD_001),
 	PINMUX_DATA(TXD7_MARK, PC8MD_010),
 	PINMUX_DATA(CTX1_MARK, PC8MD_011),
+	PINMUX_DATA(CTX0_CTX1_MARK, PC8MD_100),
 
 	PINMUX_DATA(PC7_DATA, PC7MD_000),
 	PINMUX_DATA(CKE_MARK, PC7MD_001),
@@ -836,11 +836,12 @@
 	PINMUX_DATA(CAS_MARK, PC6MD_001),
 	PINMUX_DATA(SCK7_MARK, PC6MD_010),
 	PINMUX_DATA(CTX0_MARK, PC6MD_011),
+	PINMUX_DATA(CTX0_CTX1_CTX2_MARK, PC6MD_100),
 
 	PINMUX_DATA(PC5_DATA, PC5MD_000),
 	PINMUX_DATA(RAS_MARK, PC5MD_001),
 	PINMUX_DATA(CRX0_MARK, PC5MD_011),
-	PINMUX_DATA(CTX0CTX1CTX2_MARK, PC5MD_100),
+	PINMUX_DATA(CTX0_CTX1_CTX2_MARK, PC5MD_100),
 	PINMUX_DATA(IRQ0_PC_MARK, PC5MD_101),
 
 	PINMUX_DATA(PC4_DATA, PC4MD_00),
@@ -1292,30 +1293,32 @@
 	PINMUX_DATA(LCD_DATA23_PJ23_MARK, PJ23MD_010),
 	PINMUX_DATA(LCD_TCON6_MARK, PJ23MD_011),
 	PINMUX_DATA(IRQ3_PJ_MARK, PJ23MD_100),
-	PINMUX_DATA(CTX1_MARK, PJ23MD_101),
+	PINMUX_DATA(CTX1_PJ23_MARK, PJ23MD_101),
+	PINMUX_DATA(CTX0_CTX1_PJ23_MARK, PJ23MD_110),
 
 	PINMUX_DATA(PJ22_DATA, PJ22MD_000),
 	PINMUX_DATA(DV_DATA22_MARK, PJ22MD_001),
 	PINMUX_DATA(LCD_DATA22_PJ22_MARK, PJ22MD_010),
 	PINMUX_DATA(LCD_TCON5_MARK, PJ22MD_011),
 	PINMUX_DATA(IRQ2_PJ_MARK, PJ22MD_100),
-	PINMUX_DATA(CRX1_MARK, PJ22MD_101),
-	PINMUX_DATA(CRX0_CRX1_MARK, PJ22MD_110),
+	PINMUX_DATA(CRX1_PJ22_MARK, PJ22MD_101),
+	PINMUX_DATA(CRX0_CRX1_PJ22_MARK, PJ22MD_110),
 
 	PINMUX_DATA(PJ21_DATA, PJ21MD_000),
 	PINMUX_DATA(DV_DATA21_MARK, PJ21MD_001),
 	PINMUX_DATA(LCD_DATA21_PJ21_MARK, PJ21MD_010),
 	PINMUX_DATA(LCD_TCON4_MARK, PJ21MD_011),
 	PINMUX_DATA(IRQ1_PJ_MARK, PJ21MD_100),
-	PINMUX_DATA(CTX2_MARK, PJ21MD_101),
+	PINMUX_DATA(CTX2_PJ21_MARK, PJ21MD_101),
+	PINMUX_DATA(CTX0_CTX1_CTX2_PJ21_MARK, PJ21MD_110),
 
 	PINMUX_DATA(PJ20_DATA, PJ20MD_000),
 	PINMUX_DATA(DV_DATA20_MARK, PJ20MD_001),
 	PINMUX_DATA(LCD_DATA20_PJ20_MARK, PJ20MD_010),
 	PINMUX_DATA(LCD_TCON3_MARK, PJ20MD_011),
 	PINMUX_DATA(IRQ0_PJ_MARK, PJ20MD_100),
-	PINMUX_DATA(CRX2_MARK, PJ20MD_101),
-	PINMUX_DATA(CRX0CRX1CRX2_PJ20_MARK, PJ20MD_110),
+	PINMUX_DATA(CRX2_PJ20_MARK, PJ20MD_101),
+	PINMUX_DATA(CRX0_CRX1_CRX2_PJ20_MARK, PJ20MD_110),
 
 	PINMUX_DATA(PJ19_DATA, PJ19MD_000),
 	PINMUX_DATA(DV_DATA19_MARK, PJ19MD_001),
@@ -1666,12 +1669,24 @@
 	GPIO_FN(WDTOVF),
 
 	/* CAN */
+	GPIO_FN(CTX2),
+	GPIO_FN(CRX2),
 	GPIO_FN(CTX1),
 	GPIO_FN(CRX1),
 	GPIO_FN(CTX0),
 	GPIO_FN(CRX0),
+	GPIO_FN(CTX0_CTX1),
 	GPIO_FN(CRX0_CRX1),
+	GPIO_FN(CTX0_CTX1_CTX2),
 	GPIO_FN(CRX0_CRX1_CRX2),
+	GPIO_FN(CTX2_PJ21),
+	GPIO_FN(CRX2_PJ20),
+	GPIO_FN(CTX1_PJ23),
+	GPIO_FN(CRX1_PJ22),
+	GPIO_FN(CTX0_CTX1_PJ23),
+	GPIO_FN(CRX0_CRX1_PJ22),
+	GPIO_FN(CTX0_CTX1_CTX2_PJ21),
+	GPIO_FN(CRX0_CRX1_CRX2_PJ20),
 
 	/* DMAC */
 	GPIO_FN(TEND0),

diff --git a/drivers/pwm/pwm-omap-dmtimer.c b/drivers/pwm/pwm-omap-dmtimer.c
index f457986..527b879 100644
--- a/drivers/pwm/pwm-omap-dmtimer.c
+++ b/drivers/pwm/pwm-omap-dmtimer.c

@@ -259,7 +259,7 @@
 	if (!timer_pdev) {
 		dev_err(&pdev->dev, "Unable to find Timer pdev\n");
 		ret = -ENODEV;
-		goto put;
+		goto err_find_timer_pdev;
 	}
 
 	timer_pdata = dev_get_platdata(&timer_pdev->dev);
@@ -267,7 +267,7 @@
 		dev_dbg(&pdev->dev,
 			 "dmtimer pdata structure NULL, deferring probe\n");
 		ret = -EPROBE_DEFER;
-		goto put;
+		goto err_platdata;
 	}
 
 	pdata = timer_pdata->timer_ops;
@@ -286,30 +286,25 @@
 	    !pdata->write_counter) {
 		dev_err(&pdev->dev, "Incomplete dmtimer pdata structure\n");
 		ret = -EINVAL;
-		goto put;
+		goto err_platdata;
 	}
 
 	if (!of_get_property(timer, "ti,timer-pwm", NULL)) {
 		dev_err(&pdev->dev, "Missing ti,timer-pwm capability\n");
 		ret = -ENODEV;
-		goto put;
+		goto err_timer_property;
 	}
 
 	dm_timer = pdata->request_by_node(timer);
 	if (!dm_timer) {
 		ret = -EPROBE_DEFER;
-		goto put;
+		goto err_request_timer;
 	}
 
-put:
-	of_node_put(timer);
-	if (ret < 0)
-		return ret;
-
 	omap = devm_kzalloc(&pdev->dev, sizeof(*omap), GFP_KERNEL);
 	if (!omap) {
-		pdata->free(dm_timer);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto err_alloc_omap;
 	}
 
 	omap->pdata = pdata;
@@ -342,27 +337,56 @@
 	ret = pwmchip_add(&omap->chip);
 	if (ret < 0) {
 		dev_err(&pdev->dev, "failed to register PWM\n");
-		omap->pdata->free(omap->dm_timer);
-		return ret;
+		goto err_pwmchip_add;
 	}
 
+	of_node_put(timer);
+
 	platform_set_drvdata(pdev, omap);
 
 	return 0;
+
+err_pwmchip_add:
+
+	/*
+	 * *omap is allocated using devm_kzalloc,
+	 * so no free necessary here
+	 */
+err_alloc_omap:
+
+	pdata->free(dm_timer);
+err_request_timer:
+
+err_timer_property:
+err_platdata:
+
+	put_device(&timer_pdev->dev);
+err_find_timer_pdev:
+
+	of_node_put(timer);
+
+	return ret;
 }
 
 static int pwm_omap_dmtimer_remove(struct platform_device *pdev)
 {
 	struct pwm_omap_dmtimer_chip *omap = platform_get_drvdata(pdev);
+	int ret;
+
+	ret = pwmchip_remove(&omap->chip);
+	if (ret)
+		return ret;
 
 	if (pm_runtime_active(&omap->dm_timer_pdev->dev))
 		omap->pdata->stop(omap->dm_timer);
 
 	omap->pdata->free(omap->dm_timer);
 
+	put_device(&omap->dm_timer_pdev->dev);
+
 	mutex_destroy(&omap->mutex);
 
-	return pwmchip_remove(&omap->chip);
+	return 0;
 }
 
 static const struct of_device_id pwm_omap_dmtimer_of_match[] = {

diff --git a/drivers/pwm/pwm-pca9685.c b/drivers/pwm/pwm-pca9685.c
index 567f5e2..e1e5dfcb 100644
--- a/drivers/pwm/pwm-pca9685.c
+++ b/drivers/pwm/pwm-pca9685.c

@@ -170,13 +170,9 @@
 static void pca9685_pwm_gpio_free(struct gpio_chip *gpio, unsigned int offset)
 {
 	struct pca9685 *pca = gpiochip_get_data(gpio);
-	struct pwm_device *pwm;
 
 	pca9685_pwm_gpio_set(gpio, offset, 0);
 	pm_runtime_put(pca->chip.dev);
-	mutex_lock(&pca->lock);
-	pwm = &pca->chip.pwms[offset];
-	mutex_unlock(&pca->lock);
 }
 
 static int pca9685_pwm_gpio_get_direction(struct gpio_chip *chip,

diff --git a/drivers/regulator/rk808-regulator.c b/drivers/regulator/rk808-regulator.c
index 213b687..92498ac 100644
--- a/drivers/regulator/rk808-regulator.c
+++ b/drivers/regulator/rk808-regulator.c

@@ -714,7 +714,7 @@
 		}
 
 		if (!pdata->dvs_gpio[i]) {
-			dev_warn(dev, "there is no dvs%d gpio\n", i);
+			dev_info(dev, "there is no dvs%d gpio\n", i);
 			continue;
 		}
 

diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index aa62067..abbef17 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c

@@ -1786,7 +1786,7 @@
 
 	return 0;
 }
-module_init(remoteproc_init);
+subsys_initcall(remoteproc_init);
 
 static void __exit remoteproc_exit(void)
 {

diff --git a/drivers/reset/reset-uniphier.c b/drivers/reset/reset-uniphier.c
index 5605745..adbecb2 100644
--- a/drivers/reset/reset-uniphier.c
+++ b/drivers/reset/reset-uniphier.c

@@ -202,8 +202,8 @@
 #define UNIPHIER_PERI_RESET_FI2C(id, ch)		\
 	UNIPHIER_RESETX((id), 0x114, 24 + (ch))
 
-#define UNIPHIER_PERI_RESET_SCSSI(id)			\
-	UNIPHIER_RESETX((id), 0x110, 17)
+#define UNIPHIER_PERI_RESET_SCSSI(id, ch)		\
+	UNIPHIER_RESETX((id), 0x110, 17 + (ch))
 
 #define UNIPHIER_PERI_RESET_MCSSI(id)			\
 	UNIPHIER_RESETX((id), 0x114, 14)
@@ -218,7 +218,7 @@
 	UNIPHIER_PERI_RESET_I2C(6, 2),
 	UNIPHIER_PERI_RESET_I2C(7, 3),
 	UNIPHIER_PERI_RESET_I2C(8, 4),
-	UNIPHIER_PERI_RESET_SCSSI(11),
+	UNIPHIER_PERI_RESET_SCSSI(11, 0),
 	UNIPHIER_RESET_END,
 };
 
@@ -234,8 +234,11 @@
 	UNIPHIER_PERI_RESET_FI2C(8, 4),
 	UNIPHIER_PERI_RESET_FI2C(9, 5),
 	UNIPHIER_PERI_RESET_FI2C(10, 6),
-	UNIPHIER_PERI_RESET_SCSSI(11),
-	UNIPHIER_PERI_RESET_MCSSI(12),
+	UNIPHIER_PERI_RESET_SCSSI(11, 0),
+	UNIPHIER_PERI_RESET_SCSSI(12, 1),
+	UNIPHIER_PERI_RESET_SCSSI(13, 2),
+	UNIPHIER_PERI_RESET_SCSSI(14, 3),
+	UNIPHIER_PERI_RESET_MCSSI(15),
 	UNIPHIER_RESET_END,
 };
 

diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c
index ce051f9..c8242d4 100644
--- a/drivers/rtc/interface.c
+++ b/drivers/rtc/interface.c

@@ -579,7 +579,9 @@
 		struct rtc_time tm;
 		ktime_t now, onesec;
 
-		__rtc_read_time(rtc, &tm);
+		err = __rtc_read_time(rtc, &tm);
+		if (err)
+			goto out;
 		onesec = ktime_set(1, 0);
 		now = rtc_tm_to_ktime(tm);
 		rtc->uie_rtctimer.node.expires = ktime_add(now, onesec);

diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 23e526c..737dc0e 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c

@@ -59,6 +59,7 @@
 
 static const struct dax_operations dcssblk_dax_ops = {
 	.direct_access = dcssblk_dax_direct_access,
+	.dax_supported = generic_fsdax_supported,
 	.copy_from_iter = dcssblk_dax_copy_from_iter,
 	.copy_to_iter = dcssblk_dax_copy_to_iter,
 };
@@ -678,7 +679,7 @@
 		goto put_dev;
 
 	dev_info->dax_dev = alloc_dax(dev_info, dev_info->gd->disk_name,
-			&dcssblk_dax_ops);
+			&dcssblk_dax_ops, DAXDEV_F_SYNC);
 	if (!dev_info->dax_dev) {
 		rc = -ENOMEM;
 		goto put_dev;

diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h
index 7e85d23..1c799dd 100644
--- a/drivers/s390/crypto/ap_bus.h
+++ b/drivers/s390/crypto/ap_bus.h

@@ -158,7 +158,7 @@
 	unsigned int functions;		/* AP device function bitfield. */
 	int queue_depth;		/* AP queue depth.*/
 	int id;				/* AP card number. */
-	atomic_t total_request_count;	/* # requests ever for this AP device.*/
+	atomic64_t total_request_count;	/* # requests ever for this AP device.*/
 };
 
 #define to_ap_card(x) container_of((x), struct ap_card, ap_dev.device)
@@ -175,7 +175,7 @@
 	enum ap_state state;		/* State of the AP device. */
 	int pendingq_count;		/* # requests on pendingq list. */
 	int requestq_count;		/* # requests on requestq list. */
-	int total_request_count;	/* # requests ever for this AP device.*/
+	u64 total_request_count;	/* # requests ever for this AP device.*/
 	int request_timeout;		/* Request timeout in jiffies. */
 	struct timer_list timeout;	/* Timer for request timeouts. */
 	struct list_head pendingq;	/* List of message sent to AP queue. */

diff --git a/drivers/s390/crypto/ap_card.c b/drivers/s390/crypto/ap_card.c
index 63b4cc6..e85bfca 100644
--- a/drivers/s390/crypto/ap_card.c
+++ b/drivers/s390/crypto/ap_card.c

@@ -63,13 +63,13 @@
 				  char *buf)
 {
 	struct ap_card *ac = to_ap_card(dev);
-	unsigned int req_cnt;
+	u64 req_cnt;
 
 	req_cnt = 0;
 	spin_lock_bh(&ap_list_lock);
-	req_cnt = atomic_read(&ac->total_request_count);
+	req_cnt = atomic64_read(&ac->total_request_count);
 	spin_unlock_bh(&ap_list_lock);
-	return snprintf(buf, PAGE_SIZE, "%d\n", req_cnt);
+	return snprintf(buf, PAGE_SIZE, "%llu\n", req_cnt);
 }
 
 static ssize_t request_count_store(struct device *dev,
@@ -83,7 +83,7 @@
 	for_each_ap_queue(aq, ac)
 		aq->total_request_count = 0;
 	spin_unlock_bh(&ap_list_lock);
-	atomic_set(&ac->total_request_count, 0);
+	atomic64_set(&ac->total_request_count, 0);
 
 	return count;
 }

diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c
index 576ac08..e1647da 100644
--- a/drivers/s390/crypto/ap_queue.c
+++ b/drivers/s390/crypto/ap_queue.c

@@ -470,12 +470,12 @@
 				  char *buf)
 {
 	struct ap_queue *aq = to_ap_queue(dev);
-	unsigned int req_cnt;
+	u64 req_cnt;
 
 	spin_lock_bh(&aq->lock);
 	req_cnt = aq->total_request_count;
 	spin_unlock_bh(&aq->lock);
-	return snprintf(buf, PAGE_SIZE, "%d\n", req_cnt);
+	return snprintf(buf, PAGE_SIZE, "%llu\n", req_cnt);
 }
 
 static ssize_t request_count_store(struct device *dev,
@@ -667,7 +667,7 @@
 	list_add_tail(&ap_msg->list, &aq->requestq);
 	aq->requestq_count++;
 	aq->total_request_count++;
-	atomic_inc(&aq->card->total_request_count);
+	atomic64_inc(&aq->card->total_request_count);
 	/* Send/receive as many request from the queue as possible. */
 	ap_wait(ap_sm_event_loop(aq, AP_EVENT_POLL));
 	spin_unlock_bh(&aq->lock);

diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c
index b2737bf..23c24a6 100644
--- a/drivers/s390/crypto/zcrypt_api.c
+++ b/drivers/s390/crypto/zcrypt_api.c

@@ -190,8 +190,8 @@
 	weight += atomic_read(&zc->load);
 	pref_weight += atomic_read(&pref_zc->load);
 	if (weight == pref_weight)
-		return atomic_read(&zc->card->total_request_count) >
-			atomic_read(&pref_zc->card->total_request_count);
+		return atomic64_read(&zc->card->total_request_count) >
+			atomic64_read(&pref_zc->card->total_request_count);
 	return weight > pref_weight;
 }
 
@@ -719,11 +719,12 @@
 	spin_unlock(&zcrypt_list_lock);
 }
 
-static void zcrypt_perdev_reqcnt(int reqcnt[], size_t max_adapters)
+static void zcrypt_perdev_reqcnt(u32 reqcnt[], size_t max_adapters)
 {
 	struct zcrypt_card *zc;
 	struct zcrypt_queue *zq;
 	int card;
+	u64 cnt;
 
 	memset(reqcnt, 0, sizeof(int) * max_adapters);
 	spin_lock(&zcrypt_list_lock);
@@ -735,8 +736,9 @@
 			    || card >= max_adapters)
 				continue;
 			spin_lock(&zq->queue->lock);
-			reqcnt[card] = zq->queue->total_request_count;
+			cnt = zq->queue->total_request_count;
 			spin_unlock(&zq->queue->lock);
+			reqcnt[card] = (cnt < UINT_MAX) ? (u32) cnt : UINT_MAX;
 		}
 	}
 	local_bh_enable();
@@ -907,9 +909,9 @@
 		return 0;
 	}
 	case ZCRYPT_PERDEV_REQCNT: {
-		int *reqcnt;
+		u32 *reqcnt;
 
-		reqcnt = kcalloc(AP_DEVICES, sizeof(int), GFP_KERNEL);
+		reqcnt = kcalloc(AP_DEVICES, sizeof(u32), GFP_KERNEL);
 		if (!reqcnt)
 			return -ENOMEM;
 		zcrypt_perdev_reqcnt(reqcnt, AP_DEVICES);
@@ -966,7 +968,7 @@
 	}
 	case Z90STAT_PERDEV_REQCNT: {
 		/* the old ioctl supports only 64 adapters */
-		int reqcnt[MAX_ZDEV_CARDIDS];
+		u32 reqcnt[MAX_ZDEV_CARDIDS];
 
 		zcrypt_perdev_reqcnt(reqcnt, MAX_ZDEV_CARDIDS);
 		if (copy_to_user((int __user *) arg, reqcnt, sizeof(reqcnt)))

diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index aa90004..eb917e9 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c

@@ -2148,15 +2148,14 @@
 
 	QETH_CARD_TEXT(card, 2, "vniccsch");
 
-	/* do not change anything if BridgePort is enabled */
-	if (qeth_bridgeport_is_in_use(card))
-		return -EBUSY;
-
 	/* check if characteristic and enable/disable are supported */
 	if (!(card->options.vnicc.sup_chars & vnicc) ||
 	    !(card->options.vnicc.set_char_sup & vnicc))
 		return -EOPNOTSUPP;
 
+	if (qeth_bridgeport_is_in_use(card))
+		return -EBUSY;
+
 	/* set enable/disable command and store wanted characteristic */
 	if (state) {
 		cmd = IPA_VNICC_ENABLE;
@@ -2202,14 +2201,13 @@
 
 	QETH_CARD_TEXT(card, 2, "vniccgch");
 
-	/* do not get anything if BridgePort is enabled */
-	if (qeth_bridgeport_is_in_use(card))
-		return -EBUSY;
-
 	/* check if characteristic is supported */
 	if (!(card->options.vnicc.sup_chars & vnicc))
 		return -EOPNOTSUPP;
 
+	if (qeth_bridgeport_is_in_use(card))
+		return -EBUSY;
+
 	/* if card is ready, query current VNICC state */
 	if (qeth_card_hw_is_reachable(card))
 		rc = qeth_l2_vnicc_query_chars(card);
@@ -2227,15 +2225,14 @@
 
 	QETH_CARD_TEXT(card, 2, "vniccsto");
 
-	/* do not change anything if BridgePort is enabled */
-	if (qeth_bridgeport_is_in_use(card))
-		return -EBUSY;
-
 	/* check if characteristic and set_timeout are supported */
 	if (!(card->options.vnicc.sup_chars & QETH_VNICC_LEARNING) ||
 	    !(card->options.vnicc.getset_timeout_sup & QETH_VNICC_LEARNING))
 		return -EOPNOTSUPP;
 
+	if (qeth_bridgeport_is_in_use(card))
+		return -EBUSY;
+
 	/* do we need to do anything? */
 	if (card->options.vnicc.learning_timeout == timeout)
 		return rc;
@@ -2264,14 +2261,14 @@
 
 	QETH_CARD_TEXT(card, 2, "vniccgto");
 
-	/* do not get anything if BridgePort is enabled */
-	if (qeth_bridgeport_is_in_use(card))
-		return -EBUSY;
-
 	/* check if characteristic and get_timeout are supported */
 	if (!(card->options.vnicc.sup_chars & QETH_VNICC_LEARNING) ||
 	    !(card->options.vnicc.getset_timeout_sup & QETH_VNICC_LEARNING))
 		return -EOPNOTSUPP;
+
+	if (qeth_bridgeport_is_in_use(card))
+		return -EBUSY;
+
 	/* if card is ready, get timeout. Otherwise, just return stored value */
 	*timeout = card->options.vnicc.learning_timeout;
 	if (qeth_card_hw_is_reachable(card))

diff --git a/drivers/scsi/aic7xxx/aic7xxx_core.c b/drivers/scsi/aic7xxx/aic7xxx_core.c
index 915a34f..49e02e8 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_core.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_core.c

@@ -2321,7 +2321,7 @@
 			 * At some speeds, we only support
 			 * ST transfers.
 			 */
-		 	if ((syncrate->sxfr_u2 & ST_SXFR) != 0)
+			if ((syncrate->sxfr_u2 & ST_SXFR) != 0)
 				*ppr_options &= ~MSG_EXT_PPR_DT_REQ;
 			break;
 		}

diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index 55181d2..7212e3a 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c

@@ -892,6 +892,10 @@
 static void iscsi_sw_tcp_session_destroy(struct iscsi_cls_session *cls_session)
 {
 	struct Scsi_Host *shost = iscsi_session_to_shost(cls_session);
+	struct iscsi_session *session = cls_session->dd_data;
+
+	if (WARN_ON_ONCE(session->leadconn))
+		return;
 
 	iscsi_tcp_r2tpool_free(cls_session->dd_data);
 	iscsi_session_teardown(cls_session);

diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 4c4781e..c0fb9e7 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c

@@ -2945,6 +2945,24 @@
 	return err;
 }
 
+static int iscsi_session_has_conns(int sid)
+{
+	struct iscsi_cls_conn *conn;
+	unsigned long flags;
+	int found = 0;
+
+	spin_lock_irqsave(&connlock, flags);
+	list_for_each_entry(conn, &connlist, conn_list) {
+		if (iscsi_conn_get_sid(conn) == sid) {
+			found = 1;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&connlock, flags);
+
+	return found;
+}
+
 static int
 iscsi_set_iface_params(struct iscsi_transport *transport,
 		       struct iscsi_uevent *ev, uint32_t len)
@@ -3522,10 +3540,12 @@
 		break;
 	case ISCSI_UEVENT_DESTROY_SESSION:
 		session = iscsi_session_lookup(ev->u.d_session.sid);
-		if (session)
-			transport->destroy_session(session);
-		else
+		if (!session)
 			err = -EINVAL;
+		else if (iscsi_session_has_conns(ev->u.d_session.sid))
+			err = -EBUSY;
+		else
+			transport->destroy_session(session);
 		break;
 	case ISCSI_UEVENT_UNBIND_SESSION:
 		session = iscsi_session_lookup(ev->u.d_session.sid);

diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index f4fcaee..b3dee24 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c

@@ -4809,7 +4809,7 @@
 		break;
 	} /* end of switch */
 
-	if (host_byte(result) != DID_OK)
+	if ((host_byte(result) != DID_OK) && !hba->silence_err_logs)
 		ufshcd_print_trs(hba, 1 << lrbp->task_tag, true);
 	return result;
 }
@@ -5341,8 +5341,8 @@
 
 	/*
 	 * if host reset is required then skip clearing the pending
-	 * transfers forcefully because they will automatically get
-	 * cleared after link startup.
+	 * transfers forcefully because they will get cleared during
+	 * host reset and restore
 	 */
 	if (needs_reset)
 		goto skip_pending_xfer_clear;
@@ -5996,9 +5996,15 @@
 	int err;
 	unsigned long flags;
 
-	/* Reset the host controller */
+	/*
+	 * Stop the host controller and complete the requests
+	 * cleared by h/w
+	 */
 	spin_lock_irqsave(hba->host->host_lock, flags);
 	ufshcd_hba_stop(hba, false);
+	hba->silence_err_logs = true;
+	ufshcd_complete_requests(hba);
+	hba->silence_err_logs = false;
 	spin_unlock_irqrestore(hba->host->host_lock, flags);
 
 	/* scale up clocks to max frequency before full reinitialization */
@@ -6032,22 +6038,12 @@
 static int ufshcd_reset_and_restore(struct ufs_hba *hba)
 {
 	int err = 0;
-	unsigned long flags;
 	int retries = MAX_HOST_RESET_RETRIES;
 
 	do {
 		err = ufshcd_host_reset_and_restore(hba);
 	} while (err && --retries);
 
-	/*
-	 * After reset the door-bell might be cleared, complete
-	 * outstanding requests in s/w here.
-	 */
-	spin_lock_irqsave(hba->host->host_lock, flags);
-	ufshcd_transfer_req_compl(hba);
-	ufshcd_tmc_handler(hba);
-	spin_unlock_irqrestore(hba->host->host_lock, flags);
-
 	return err;
 }
 

diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
index 33fdd3f..4554a4b7 100644
--- a/drivers/scsi/ufs/ufshcd.h
+++ b/drivers/scsi/ufs/ufshcd.h

@@ -489,6 +489,7 @@
  * @uic_error: UFS interconnect layer error status
  * @saved_err: sticky error mask
  * @saved_uic_err: sticky UIC error mask
+ * @silence_err_logs: flag to silence error logs
  * @dev_cmd: ufs device management command information
  * @last_dme_cmd_tstamp: time stamp of the last completed DME command
  * @auto_bkops_enabled: to track whether bkops is enabled in device
@@ -645,6 +646,7 @@
 	u32 saved_err;
 	u32 saved_uic_err;
 	struct ufs_stats ufs_stats;
+	bool silence_err_logs;
 
 	/* Device management request data */
 	struct ufs_dev_cmd dev_cmd;

diff --git a/drivers/soc/tegra/fuse/fuse-tegra30.c b/drivers/soc/tegra/fuse/fuse-tegra30.c
index 257e254..0ec6385 100644
--- a/drivers/soc/tegra/fuse/fuse-tegra30.c
+++ b/drivers/soc/tegra/fuse/fuse-tegra30.c

@@ -47,7 +47,8 @@
     defined(CONFIG_ARCH_TEGRA_124_SOC) || \
     defined(CONFIG_ARCH_TEGRA_132_SOC) || \
     defined(CONFIG_ARCH_TEGRA_210_SOC) || \
-    defined(CONFIG_ARCH_TEGRA_186_SOC)
+    defined(CONFIG_ARCH_TEGRA_186_SOC) || \
+    defined(CONFIG_ARCH_TEGRA_194_SOC)
 static u32 tegra30_fuse_read_early(struct tegra_fuse *fuse, unsigned int offset)
 {
 	if (WARN_ON(!fuse->base))

diff --git a/drivers/soc/tegra/fuse/tegra-apbmisc.c b/drivers/soc/tegra/fuse/tegra-apbmisc.c
index e5a4d8f..d1cbb0f 100644
--- a/drivers/soc/tegra/fuse/tegra-apbmisc.c
+++ b/drivers/soc/tegra/fuse/tegra-apbmisc.c

@@ -135,7 +135,7 @@
 			apbmisc.flags = IORESOURCE_MEM;
 
 			/* strapping options */
-			if (tegra_get_chip_id() == TEGRA124) {
+			if (of_machine_is_compatible("nvidia,tegra124")) {
 				straps.start = 0x7000e864;
 				straps.end = 0x7000e867;
 			} else {

diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c
index be81533..e3df4bf 100644
--- a/drivers/staging/android/ashmem.c
+++ b/drivers/staging/android/ashmem.c

@@ -350,8 +350,23 @@
 	       _calc_vm_trans(prot, PROT_EXEC,  VM_MAYEXEC);
 }
 
+static int ashmem_vmfile_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	/* do not allow to mmap ashmem backing shmem file directly */
+	return -EPERM;
+}
+
+static unsigned long
+ashmem_vmfile_get_unmapped_area(struct file *file, unsigned long addr,
+				unsigned long len, unsigned long pgoff,
+				unsigned long flags)
+{
+	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+}
+
 static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	static struct file_operations vmfile_fops;
 	struct ashmem_area *asma = file->private_data;
 	int ret = 0;
 
@@ -392,6 +407,19 @@
 		}
 		vmfile->f_mode |= FMODE_LSEEK;
 		asma->file = vmfile;
+		/*
+		 * override mmap operation of the vmfile so that it can't be
+		 * remapped which would lead to creation of a new vma with no
+		 * asma permission checks. Have to override get_unmapped_area
+		 * as well to prevent VM_BUG_ON check for f_ops modification.
+		 */
+		if (!vmfile_fops.mmap) {
+			vmfile_fops = *vmfile->f_op;
+			vmfile_fops.mmap = ashmem_vmfile_mmap;
+			vmfile_fops.get_unmapped_area =
+					ashmem_vmfile_get_unmapped_area;
+		}
+		vmfile->f_op = &vmfile_fops;
 	}
 	get_file(asma->file);
 

diff --git a/drivers/staging/greybus/audio_manager.c b/drivers/staging/greybus/audio_manager.c
index d44b070..0f5c68e 100644
--- a/drivers/staging/greybus/audio_manager.c
+++ b/drivers/staging/greybus/audio_manager.c

@@ -89,8 +89,8 @@
 
 	list_for_each_entry_safe(module, next, &modules_list, list) {
 		list_del(&module->list);
-		kobject_put(&module->kobj);
 		ida_simple_remove(&module_id, module->id);
+		kobject_put(&module->kobj);
 	}
 
 	is_empty = list_empty(&modules_list);

diff --git a/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c b/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c
index bee3c3a..0003f0c 100644
--- a/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c
+++ b/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c

@@ -229,18 +229,21 @@
 
 	/* parsing WPA/WPA2 IE */
 	{
-		u8 buf[MAX_WPA_IE_LEN];
+		u8 *buf;
 		u8 wpa_ie[255], rsn_ie[255];
 		u16 wpa_len = 0, rsn_len = 0;
 		u8 *p;
 
+		buf = kzalloc(MAX_WPA_IE_LEN, GFP_ATOMIC);
+		if (!buf)
+			return start;
+
 		rtw_get_sec_ie(pnetwork->network.ies, pnetwork->network.ie_length, rsn_ie, &rsn_len, wpa_ie, &wpa_len);
 		RT_TRACE(_module_rtl871x_mlme_c_, _drv_info_, ("rtw_wx_get_scan: ssid =%s\n", pnetwork->network.Ssid.Ssid));
 		RT_TRACE(_module_rtl871x_mlme_c_, _drv_info_, ("rtw_wx_get_scan: wpa_len =%d rsn_len =%d\n", wpa_len, rsn_len));
 
 		if (wpa_len > 0) {
 			p = buf;
-			memset(buf, 0, MAX_WPA_IE_LEN);
 			p += sprintf(p, "wpa_ie=");
 			for (i = 0; i < wpa_len; i++)
 				p += sprintf(p, "%02x", wpa_ie[i]);
@@ -257,7 +260,6 @@
 		}
 		if (rsn_len > 0) {
 			p = buf;
-			memset(buf, 0, MAX_WPA_IE_LEN);
 			p += sprintf(p, "rsn_ie=");
 			for (i = 0; i < rsn_len; i++)
 				p += sprintf(p, "%02x", rsn_ie[i]);
@@ -271,6 +273,7 @@
 			iwe.u.data.length = rsn_len;
 			start = iwe_stream_add_point(info, start, stop, &iwe, rsn_ie);
 		}
+		kfree(buf);
 	}
 
 	{/* parsing WPS IE */
@@ -2023,7 +2026,7 @@
 	struct ieee_param *param;
 	uint ret = 0;
 
-	if (p->length < sizeof(struct ieee_param) || !p->pointer) {
+	if (!p->pointer || p->length != sizeof(struct ieee_param)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -2816,7 +2819,7 @@
 		goto out;
 	}
 
-	if (!p->pointer) {
+	if (!p->pointer || p->length != sizeof(struct ieee_param)) {
 		ret = -EINVAL;
 		goto out;
 	}

diff --git a/drivers/staging/rtl8723bs/hal/rtl8723bs_xmit.c b/drivers/staging/rtl8723bs/hal/rtl8723bs_xmit.c
index 10b3f97..4a27c39 100644
--- a/drivers/staging/rtl8723bs/hal/rtl8723bs_xmit.c
+++ b/drivers/staging/rtl8723bs/hal/rtl8723bs_xmit.c

@@ -478,14 +478,13 @@
 	s32 ret;
 	struct adapter *padapter;
 	struct xmit_priv *pxmitpriv;
-	u8 thread_name[20] = "RTWHALXT";
-
+	u8 thread_name[20];
 
 	ret = _SUCCESS;
 	padapter = context;
 	pxmitpriv = &padapter->xmitpriv;
 
-	rtw_sprintf(thread_name, 20, "%s-"ADPT_FMT, thread_name, ADPT_ARG(padapter));
+	rtw_sprintf(thread_name, 20, "RTWHALXT-" ADPT_FMT, ADPT_ARG(padapter));
 	thread_enter(thread_name);
 
 	DBG_871X("start "FUNC_ADPT_FMT"\n", FUNC_ADPT_ARG(padapter));

diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_linux.c b/drivers/staging/rtl8723bs/os_dep/ioctl_linux.c
index 4f120e7..466d25c 100644
--- a/drivers/staging/rtl8723bs/os_dep/ioctl_linux.c
+++ b/drivers/staging/rtl8723bs/os_dep/ioctl_linux.c

@@ -3400,7 +3400,7 @@
 
 	/* down(&ieee->wx_sem); */
 
-	if (p->length < sizeof(struct ieee_param) || !p->pointer) {
+	if (!p->pointer || p->length != sizeof(struct ieee_param)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -4236,7 +4236,7 @@
 
 
 	/* if (p->length < sizeof(struct ieee_param) || !p->pointer) { */
-	if (!p->pointer) {
+	if (!p->pointer || p->length != sizeof(*param)) {
 		ret = -EINVAL;
 		goto out;
 	}

diff --git a/drivers/staging/vt6656/dpc.c b/drivers/staging/vt6656/dpc.c
index 3b94e80f..879ceef 100644
--- a/drivers/staging/vt6656/dpc.c
+++ b/drivers/staging/vt6656/dpc.c

@@ -130,7 +130,7 @@
 
 	vnt_rf_rssi_to_dbm(priv, *rssi, &rx_dbm);
 
-	priv->bb_pre_ed_rssi = (u8)rx_dbm + 1;
+	priv->bb_pre_ed_rssi = (u8)-rx_dbm + 1;
 	priv->current_rssi = priv->bb_pre_ed_rssi;
 
 	skb_pull(skb, 8);

diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c
index 14bd54d..03e9cb1 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c

@@ -1157,9 +1157,7 @@
 		hdr->cmdsn, be32_to_cpu(hdr->data_length), payload_length,
 		conn->cid);
 
-	if (target_get_sess_cmd(&cmd->se_cmd, true) < 0)
-		return iscsit_add_reject_cmd(cmd,
-				ISCSI_REASON_WAITING_FOR_LOGOUT, buf);
+	target_get_sess_cmd(&cmd->se_cmd, true);
 
 	cmd->sense_reason = transport_lookup_cmd_lun(&cmd->se_cmd,
 						     scsilun_to_int(&hdr->lun));
@@ -2000,9 +1998,7 @@
 			      conn->sess->se_sess, 0, DMA_NONE,
 			      TCM_SIMPLE_TAG, cmd->sense_buffer + 2);
 
-	if (target_get_sess_cmd(&cmd->se_cmd, true) < 0)
-		return iscsit_add_reject_cmd(cmd,
-				ISCSI_REASON_WAITING_FOR_LOGOUT, buf);
+	target_get_sess_cmd(&cmd->se_cmd, true);
 
 	/*
 	 * TASK_REASSIGN for ERL=2 / connection stays inside of
@@ -4123,6 +4119,9 @@
 	iscsit_stop_nopin_response_timer(conn);
 	iscsit_stop_nopin_timer(conn);
 
+	if (conn->conn_transport->iscsit_wait_conn)
+		conn->conn_transport->iscsit_wait_conn(conn);
+
 	/*
 	 * During Connection recovery drop unacknowledged out of order
 	 * commands for this connection, and prepare the other commands
@@ -4205,11 +4204,6 @@
 	 * must wait until they have completed.
 	 */
 	iscsit_check_conn_usage_count(conn);
-	target_sess_cmd_list_set_waiting(sess->se_sess);
-	target_wait_for_sess_cmds(sess->se_sess);
-
-	if (conn->conn_transport->iscsit_wait_conn)
-		conn->conn_transport->iscsit_wait_conn(conn);
 
 	ahash_request_free(conn->conn_tx_hash);
 	if (conn->conn_rx_hash) {

diff --git a/drivers/thermal/broadcom/brcmstb_thermal.c b/drivers/thermal/broadcom/brcmstb_thermal.c
index 1919f91..8d16a41 100644
--- a/drivers/thermal/broadcom/brcmstb_thermal.c
+++ b/drivers/thermal/broadcom/brcmstb_thermal.c

@@ -58,7 +58,7 @@
 #define AVS_TMON_TP_TEST_ENABLE		0x20
 
 /* Default coefficients */
-#define AVS_TMON_TEMP_SLOPE		-487
+#define AVS_TMON_TEMP_SLOPE		487
 #define AVS_TMON_TEMP_OFFSET		410040
 
 /* HW related temperature constants */
@@ -117,23 +117,12 @@
 	struct thermal_zone_device *thermal;
 };
 
-static void avs_tmon_get_coeffs(struct thermal_zone_device *tz, int *slope,
-				int *offset)
-{
-	*slope = thermal_zone_get_slope(tz);
-	*offset = thermal_zone_get_offset(tz);
-}
-
 /* Convert a HW code to a temperature reading (millidegree celsius) */
 static inline int avs_tmon_code_to_temp(struct thermal_zone_device *tz,
 					u32 code)
 {
-	const int val = code & AVS_TMON_TEMP_MASK;
-	int slope, offset;
-
-	avs_tmon_get_coeffs(tz, &slope, &offset);
-
-	return slope * val + offset;
+	return (AVS_TMON_TEMP_OFFSET -
+		(int)((code & AVS_TMON_TEMP_MAX) * AVS_TMON_TEMP_SLOPE));
 }
 
 /*
@@ -145,20 +134,18 @@
 static inline u32 avs_tmon_temp_to_code(struct thermal_zone_device *tz,
 					int temp, bool low)
 {
-	int slope, offset;
-
 	if (temp < AVS_TMON_TEMP_MIN)
-		return AVS_TMON_TEMP_MAX; /* Maximum code value */
+		return AVS_TMON_TEMP_MAX;	/* Maximum code value */
 
-	avs_tmon_get_coeffs(tz, &slope, &offset);
-
-	if (temp >= offset)
+	if (temp >= AVS_TMON_TEMP_OFFSET)
 		return 0;	/* Minimum code value */
 
 	if (low)
-		return (u32)(DIV_ROUND_UP(offset - temp, abs(slope)));
+		return (u32)(DIV_ROUND_UP(AVS_TMON_TEMP_OFFSET - temp,
+					  AVS_TMON_TEMP_SLOPE));
 	else
-		return (u32)((offset - temp) / abs(slope));
+		return (u32)((AVS_TMON_TEMP_OFFSET - temp) /
+			      AVS_TMON_TEMP_SLOPE);
 }
 
 static int brcmstb_get_temp(void *data, int *temp)

diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c
index 678bf33..42d90ce 100644
--- a/drivers/thunderbolt/switch.c
+++ b/drivers/thunderbolt/switch.c

@@ -264,6 +264,12 @@
 	return ret;
 }
 
+static int tb_switch_nvm_no_read(void *priv, unsigned int offset, void *val,
+				 size_t bytes)
+{
+	return -EPERM;
+}
+
 static int tb_switch_nvm_write(void *priv, unsigned int offset, void *val,
 			       size_t bytes)
 {
@@ -309,6 +315,7 @@
 		config.read_only = true;
 	} else {
 		config.name = "nvm_non_active";
+		config.reg_read = tb_switch_nvm_no_read;
 		config.reg_write = tb_switch_nvm_write;
 		config.root_only = true;
 	}

diff --git a/drivers/tty/serdev/serdev-ttyport.c b/drivers/tty/serdev/serdev-ttyport.c
index fa16729..048a7bc 100644
--- a/drivers/tty/serdev/serdev-ttyport.c
+++ b/drivers/tty/serdev/serdev-ttyport.c

@@ -265,7 +265,6 @@
 					struct device *parent,
 					struct tty_driver *drv, int idx)
 {
-	const struct tty_port_client_operations *old_ops;
 	struct serdev_controller *ctrl;
 	struct serport *serport;
 	int ret;
@@ -284,7 +283,6 @@
 
 	ctrl->ops = &ctrl_ops;
 
-	old_ops = port->client_ops;
 	port->client_ops = &client_ops;
 	port->client_data = ctrl;
 
@@ -297,7 +295,7 @@
 
 err_reset_data:
 	port->client_data = NULL;
-	port->client_ops = old_ops;
+	port->client_ops = &tty_port_default_client_ops;
 	serdev_controller_put(ctrl);
 
 	return ERR_PTR(ret);
@@ -312,8 +310,8 @@
 		return -ENODEV;
 
 	serdev_controller_remove(ctrl);
-	port->client_ops = NULL;
 	port->client_data = NULL;
+	port->client_ops = &tty_port_default_client_ops;
 	serdev_controller_put(ctrl);
 
 	return 0;

diff --git a/drivers/tty/serial/8250/8250_aspeed_vuart.c b/drivers/tty/serial/8250/8250_aspeed_vuart.c
index 435bec4..2d5c364 100644
--- a/drivers/tty/serial/8250/8250_aspeed_vuart.c
+++ b/drivers/tty/serial/8250/8250_aspeed_vuart.c

@@ -375,7 +375,6 @@
 		port.port.line = rc;
 
 	port.port.irq = irq_of_parse_and_map(np, 0);
-	port.port.irqflags = IRQF_SHARED;
 	port.port.handle_irq = aspeed_vuart_handle_irq;
 	port.port.iotype = UPIO_MEM;
 	port.port.type = PORT_16550A;

diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index 69aaee5d..b9567ef 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c

@@ -177,7 +177,7 @@
 	struct hlist_head *h;
 	struct hlist_node *n;
 	struct irq_info *i;
-	int ret, irq_flags = up->port.flags & UPF_SHARE_IRQ ? IRQF_SHARED : 0;
+	int ret;
 
 	mutex_lock(&hash_mutex);
 
@@ -212,9 +212,8 @@
 		INIT_LIST_HEAD(&up->list);
 		i->head = &up->list;
 		spin_unlock_irq(&i->lock);
-		irq_flags |= up->port.irqflags;
 		ret = request_irq(up->port.irq, serial8250_interrupt,
-				  irq_flags, up->port.name, i);
+				  up->port.irqflags, up->port.name, i);
 		if (ret < 0)
 			serial_do_unlink(i, up);
 	}

diff --git a/drivers/tty/serial/8250/8250_of.c b/drivers/tty/serial/8250/8250_of.c
index 2488de1..8fedc07 100644
--- a/drivers/tty/serial/8250/8250_of.c
+++ b/drivers/tty/serial/8250/8250_of.c

@@ -171,7 +171,6 @@
 
 	port->type = type;
 	port->uartclk = clk;
-	port->irqflags |= IRQF_SHARED;
 
 	if (of_property_read_bool(np, "no-loopback-test"))
 		port->flags |= UPF_SKIP_TEST;

diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index aa4de69..5a04d4d 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c

@@ -2253,6 +2253,10 @@
 		}
 	}
 
+	/* Check if we need to have shared IRQs */
+	if (port->irq && (up->port.flags & UPF_SHARE_IRQ))
+		up->port.irqflags |= IRQF_SHARED;
+
 	if (port->irq && !(up->port.flags & UPF_NO_THRE_TEST)) {
 		unsigned char iir1;
 		/*

diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c
index f34520e..936d401 100644
--- a/drivers/tty/serial/atmel_serial.c
+++ b/drivers/tty/serial/atmel_serial.c

@@ -490,7 +490,8 @@
 	atmel_uart_writel(port, ATMEL_US_IDR, atmel_port->tx_done_mask);
 
 	if (atmel_uart_is_half_duplex(port))
-		atmel_start_rx(port);
+		if (!atomic_read(&atmel_port->tasklet_shutdown))
+			atmel_start_rx(port);
 
 }
 

diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c
index 672e979..4066cb2 100644
--- a/drivers/tty/serial/imx.c
+++ b/drivers/tty/serial/imx.c

@@ -608,7 +608,7 @@
 
 	sport->tx_bytes = uart_circ_chars_pending(xmit);
 
-	if (xmit->tail < xmit->head) {
+	if (xmit->tail < xmit->head || xmit->head == 0) {
 		sport->dma_tx_nents = 1;
 		sg_init_one(sgl, xmit->buf + xmit->tail, sport->tx_bytes);
 	} else {

diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c
index b3f7d1a..4458419 100644
--- a/drivers/tty/serial/qcom_geni_serial.c
+++ b/drivers/tty/serial/qcom_geni_serial.c

@@ -85,7 +85,7 @@
 #define DEF_FIFO_DEPTH_WORDS	16
 #define DEF_TX_WM		2
 #define DEF_FIFO_WIDTH_BITS	32
-#define UART_CONSOLE_RX_WM	2
+#define UART_RX_WM		2
 #define MAX_LOOPBACK_CFG	3
 
 #ifdef CONFIG_CONSOLE_POLL
@@ -101,10 +101,6 @@
 	u32 tx_fifo_depth;
 	u32 tx_fifo_width;
 	u32 rx_fifo_depth;
-	u32 tx_wm;
-	u32 rx_wm;
-	u32 rx_rfr;
-	enum geni_se_xfer_mode xfer_mode;
 	bool setup;
 	int (*handle_rx)(struct uart_port *uport, u32 bytes, bool drop);
 	unsigned int baud;
@@ -125,6 +121,7 @@
 static int handle_rx_uart(struct uart_port *uport, u32 bytes, bool drop);
 static unsigned int qcom_geni_serial_tx_empty(struct uart_port *port);
 static void qcom_geni_serial_stop_rx(struct uart_port *uport);
+static void qcom_geni_serial_handle_rx(struct uart_port *uport, bool drop);
 
 static const unsigned long root_freq[] = {7372800, 14745600, 19200000, 29491200,
 					32000000, 48000000, 64000000, 80000000,
@@ -226,7 +223,7 @@
 	if (uart_console(uport)) {
 		mctrl |= TIOCM_CTS;
 	} else {
-		geni_ios = readl_relaxed(uport->membase + SE_GENI_IOS);
+		geni_ios = readl(uport->membase + SE_GENI_IOS);
 		if (!(geni_ios & IO2_DATA_IN))
 			mctrl |= TIOCM_CTS;
 	}
@@ -244,7 +241,7 @@
 
 	if (!(mctrl & TIOCM_RTS))
 		uart_manual_rfr = UART_MANUAL_RFR_EN | UART_RFR_NOT_READY;
-	writel_relaxed(uart_manual_rfr, uport->membase + SE_UART_MANUAL_RFR);
+	writel(uart_manual_rfr, uport->membase + SE_UART_MANUAL_RFR);
 }
 
 static const char *qcom_geni_serial_get_type(struct uart_port *uport)
@@ -273,9 +270,6 @@
 	unsigned int fifo_bits;
 	unsigned long timeout_us = 20000;
 
-	/* Ensure polling is not re-ordered before the prior writes/reads */
-	mb();
-
 	if (uport->private_data) {
 		port = to_dev_port(uport, uport);
 		baud = port->baud;
@@ -295,7 +289,7 @@
 	 */
 	timeout_us = DIV_ROUND_UP(timeout_us, 10) * 10;
 	while (timeout_us) {
-		reg = readl_relaxed(uport->membase + offset);
+		reg = readl(uport->membase + offset);
 		if ((bool)(reg & field) == set)
 			return true;
 		udelay(10);
@@ -308,7 +302,7 @@
 {
 	u32 m_cmd;
 
-	writel_relaxed(xmit_size, uport->membase + SE_UART_TX_TRANS_LEN);
+	writel(xmit_size, uport->membase + SE_UART_TX_TRANS_LEN);
 	m_cmd = UART_START_TX << M_OPCODE_SHFT;
 	writel(m_cmd, uport->membase + SE_GENI_M_CMD0);
 }
@@ -321,13 +315,13 @@
 	done = qcom_geni_serial_poll_bit(uport, SE_GENI_M_IRQ_STATUS,
 						M_CMD_DONE_EN, true);
 	if (!done) {
-		writel_relaxed(M_GENI_CMD_ABORT, uport->membase +
+		writel(M_GENI_CMD_ABORT, uport->membase +
 						SE_GENI_M_CMD_CTRL_REG);
 		irq_clear |= M_CMD_ABORT_EN;
 		qcom_geni_serial_poll_bit(uport, SE_GENI_M_IRQ_STATUS,
 							M_CMD_ABORT_EN, true);
 	}
-	writel_relaxed(irq_clear, uport->membase + SE_GENI_M_IRQ_CLEAR);
+	writel(irq_clear, uport->membase + SE_GENI_M_IRQ_CLEAR);
 }
 
 static void qcom_geni_serial_abort_rx(struct uart_port *uport)
@@ -337,8 +331,8 @@
 	writel(S_GENI_CMD_ABORT, uport->membase + SE_GENI_S_CMD_CTRL_REG);
 	qcom_geni_serial_poll_bit(uport, SE_GENI_S_CMD_CTRL_REG,
 					S_GENI_CMD_ABORT, false);
-	writel_relaxed(irq_clear, uport->membase + SE_GENI_S_IRQ_CLEAR);
-	writel_relaxed(FORCE_DEFAULT, uport->membase + GENI_FORCE_DEFAULT_REG);
+	writel(irq_clear, uport->membase + SE_GENI_S_IRQ_CLEAR);
+	writel(FORCE_DEFAULT, uport->membase + GENI_FORCE_DEFAULT_REG);
 }
 
 #ifdef CONFIG_CONSOLE_POLL
@@ -347,19 +341,13 @@
 	u32 rx_fifo;
 	u32 status;
 
-	status = readl_relaxed(uport->membase + SE_GENI_M_IRQ_STATUS);
-	writel_relaxed(status, uport->membase + SE_GENI_M_IRQ_CLEAR);
+	status = readl(uport->membase + SE_GENI_M_IRQ_STATUS);
+	writel(status, uport->membase + SE_GENI_M_IRQ_CLEAR);
 
-	status = readl_relaxed(uport->membase + SE_GENI_S_IRQ_STATUS);
-	writel_relaxed(status, uport->membase + SE_GENI_S_IRQ_CLEAR);
+	status = readl(uport->membase + SE_GENI_S_IRQ_STATUS);
+	writel(status, uport->membase + SE_GENI_S_IRQ_CLEAR);
 
-	/*
-	 * Ensure the writes to clear interrupts is not re-ordered after
-	 * reading the data.
-	 */
-	mb();
-
-	status = readl_relaxed(uport->membase + SE_GENI_RX_FIFO_STATUS);
+	status = readl(uport->membase + SE_GENI_RX_FIFO_STATUS);
 	if (!(status & RX_FIFO_WC_MSK))
 		return NO_POLL_CHAR;
 
@@ -370,15 +358,12 @@
 static void qcom_geni_serial_poll_put_char(struct uart_port *uport,
 							unsigned char c)
 {
-	struct qcom_geni_serial_port *port = to_dev_port(uport, uport);
-
-	writel_relaxed(port->tx_wm, uport->membase + SE_GENI_TX_WATERMARK_REG);
+	writel(DEF_TX_WM, uport->membase + SE_GENI_TX_WATERMARK_REG);
 	qcom_geni_serial_setup_tx(uport, 1);
 	WARN_ON(!qcom_geni_serial_poll_bit(uport, SE_GENI_M_IRQ_STATUS,
 						M_TX_FIFO_WATERMARK_EN, true));
-	writel_relaxed(c, uport->membase + SE_GENI_TX_FIFOn);
-	writel_relaxed(M_TX_FIFO_WATERMARK_EN, uport->membase +
-							SE_GENI_M_IRQ_CLEAR);
+	writel(c, uport->membase + SE_GENI_TX_FIFOn);
+	writel(M_TX_FIFO_WATERMARK_EN, uport->membase + SE_GENI_M_IRQ_CLEAR);
 	qcom_geni_serial_poll_tx_done(uport);
 }
 #endif
@@ -386,7 +371,7 @@
 #ifdef CONFIG_SERIAL_QCOM_GENI_CONSOLE
 static void qcom_geni_serial_wr_char(struct uart_port *uport, int ch)
 {
-	writel_relaxed(ch, uport->membase + SE_GENI_TX_FIFOn);
+	writel(ch, uport->membase + SE_GENI_TX_FIFOn);
 }
 
 static void
@@ -405,7 +390,7 @@
 			bytes_to_send++;
 	}
 
-	writel_relaxed(DEF_TX_WM, uport->membase + SE_GENI_TX_WATERMARK_REG);
+	writel(DEF_TX_WM, uport->membase + SE_GENI_TX_WATERMARK_REG);
 	qcom_geni_serial_setup_tx(uport, bytes_to_send);
 	for (i = 0; i < count; ) {
 		size_t chars_to_write = 0;
@@ -423,7 +408,7 @@
 		chars_to_write = min_t(size_t, count - i, avail / 2);
 		uart_console_write(uport, s + i, chars_to_write,
 						qcom_geni_serial_wr_char);
-		writel_relaxed(M_TX_FIFO_WATERMARK_EN, uport->membase +
+		writel(M_TX_FIFO_WATERMARK_EN, uport->membase +
 							SE_GENI_M_IRQ_CLEAR);
 		i += chars_to_write;
 	}
@@ -438,6 +423,7 @@
 	bool locked = true;
 	unsigned long flags;
 	u32 geni_status;
+	u32 irq_en;
 
 	WARN_ON(co->index < 0 || co->index >= GENI_UART_CONS_PORTS);
 
@@ -451,7 +437,7 @@
 	else
 		spin_lock_irqsave(&uport->lock, flags);
 
-	geni_status = readl_relaxed(uport->membase + SE_GENI_STATUS);
+	geni_status = readl(uport->membase + SE_GENI_STATUS);
 
 	/* Cancel the current write to log the fault */
 	if (!locked) {
@@ -461,17 +447,22 @@
 			geni_se_abort_m_cmd(&port->se);
 			qcom_geni_serial_poll_bit(uport, SE_GENI_M_IRQ_STATUS,
 							M_CMD_ABORT_EN, true);
-			writel_relaxed(M_CMD_ABORT_EN, uport->membase +
+			writel(M_CMD_ABORT_EN, uport->membase +
 							SE_GENI_M_IRQ_CLEAR);
 		}
-		writel_relaxed(M_CMD_CANCEL_EN, uport->membase +
-							SE_GENI_M_IRQ_CLEAR);
+		writel(M_CMD_CANCEL_EN, uport->membase + SE_GENI_M_IRQ_CLEAR);
 	} else if ((geni_status & M_GENI_CMD_ACTIVE) && !port->tx_remaining) {
 		/*
 		 * It seems we can't interrupt existing transfers if all data
 		 * has been sent, in which case we need to look for done first.
 		 */
 		qcom_geni_serial_poll_tx_done(uport);
+
+		if (uart_circ_chars_pending(&uport->state->xmit)) {
+			irq_en = readl(uport->membase + SE_GENI_M_IRQ_EN);
+			writel(irq_en | M_TX_FIFO_WATERMARK_EN,
+					uport->membase + SE_GENI_M_IRQ_EN);
+		}
 	}
 
 	__qcom_geni_serial_console_write(uport, s, count);
@@ -556,29 +547,20 @@
 static void qcom_geni_serial_start_tx(struct uart_port *uport)
 {
 	u32 irq_en;
-	struct qcom_geni_serial_port *port = to_dev_port(uport, uport);
 	u32 status;
 
-	if (port->xfer_mode == GENI_SE_FIFO) {
-		/*
-		 * readl ensures reading & writing of IRQ_EN register
-		 * is not re-ordered before checking the status of the
-		 * Serial Engine.
-		 */
-		status = readl(uport->membase + SE_GENI_STATUS);
-		if (status & M_GENI_CMD_ACTIVE)
-			return;
+	status = readl(uport->membase + SE_GENI_STATUS);
+	if (status & M_GENI_CMD_ACTIVE)
+		return;
 
-		if (!qcom_geni_serial_tx_empty(uport))
-			return;
+	if (!qcom_geni_serial_tx_empty(uport))
+		return;
 
-		irq_en = readl_relaxed(uport->membase +	SE_GENI_M_IRQ_EN);
-		irq_en |= M_TX_FIFO_WATERMARK_EN | M_CMD_DONE_EN;
+	irq_en = readl(uport->membase +	SE_GENI_M_IRQ_EN);
+	irq_en |= M_TX_FIFO_WATERMARK_EN | M_CMD_DONE_EN;
 
-		writel_relaxed(port->tx_wm, uport->membase +
-						SE_GENI_TX_WATERMARK_REG);
-		writel_relaxed(irq_en, uport->membase +	SE_GENI_M_IRQ_EN);
-	}
+	writel(DEF_TX_WM, uport->membase + SE_GENI_TX_WATERMARK_REG);
+	writel(irq_en, uport->membase +	SE_GENI_M_IRQ_EN);
 }
 
 static void qcom_geni_serial_stop_tx(struct uart_port *uport)
@@ -587,35 +569,24 @@
 	u32 status;
 	struct qcom_geni_serial_port *port = to_dev_port(uport, uport);
 
-	irq_en = readl_relaxed(uport->membase + SE_GENI_M_IRQ_EN);
-	irq_en &= ~M_CMD_DONE_EN;
-	if (port->xfer_mode == GENI_SE_FIFO) {
-		irq_en &= ~M_TX_FIFO_WATERMARK_EN;
-		writel_relaxed(0, uport->membase +
-				     SE_GENI_TX_WATERMARK_REG);
-	}
-	writel_relaxed(irq_en, uport->membase + SE_GENI_M_IRQ_EN);
-	status = readl_relaxed(uport->membase + SE_GENI_STATUS);
+	irq_en = readl(uport->membase + SE_GENI_M_IRQ_EN);
+	irq_en &= ~(M_CMD_DONE_EN | M_TX_FIFO_WATERMARK_EN);
+	writel(0, uport->membase + SE_GENI_TX_WATERMARK_REG);
+	writel(irq_en, uport->membase + SE_GENI_M_IRQ_EN);
+	status = readl(uport->membase + SE_GENI_STATUS);
 	/* Possible stop tx is called multiple times. */
 	if (!(status & M_GENI_CMD_ACTIVE))
 		return;
 
-	/*
-	 * Ensure cancel command write is not re-ordered before checking
-	 * the status of the Primary Sequencer.
-	 */
-	mb();
-
 	geni_se_cancel_m_cmd(&port->se);
 	if (!qcom_geni_serial_poll_bit(uport, SE_GENI_M_IRQ_STATUS,
 						M_CMD_CANCEL_EN, true)) {
 		geni_se_abort_m_cmd(&port->se);
 		qcom_geni_serial_poll_bit(uport, SE_GENI_M_IRQ_STATUS,
 						M_CMD_ABORT_EN, true);
-		writel_relaxed(M_CMD_ABORT_EN, uport->membase +
-							SE_GENI_M_IRQ_CLEAR);
+		writel(M_CMD_ABORT_EN, uport->membase + SE_GENI_M_IRQ_CLEAR);
 	}
-	writel_relaxed(M_CMD_CANCEL_EN, uport->membase + SE_GENI_M_IRQ_CLEAR);
+	writel(M_CMD_CANCEL_EN, uport->membase + SE_GENI_M_IRQ_CLEAR);
 }
 
 static void qcom_geni_serial_start_rx(struct uart_port *uport)
@@ -624,27 +595,19 @@
 	u32 status;
 	struct qcom_geni_serial_port *port = to_dev_port(uport, uport);
 
-	status = readl_relaxed(uport->membase + SE_GENI_STATUS);
+	status = readl(uport->membase + SE_GENI_STATUS);
 	if (status & S_GENI_CMD_ACTIVE)
 		qcom_geni_serial_stop_rx(uport);
 
-	/*
-	 * Ensure setup command write is not re-ordered before checking
-	 * the status of the Secondary Sequencer.
-	 */
-	mb();
-
 	geni_se_setup_s_cmd(&port->se, UART_START_READ, 0);
 
-	if (port->xfer_mode == GENI_SE_FIFO) {
-		irq_en = readl_relaxed(uport->membase + SE_GENI_S_IRQ_EN);
-		irq_en |= S_RX_FIFO_WATERMARK_EN | S_RX_FIFO_LAST_EN;
-		writel_relaxed(irq_en, uport->membase + SE_GENI_S_IRQ_EN);
+	irq_en = readl(uport->membase + SE_GENI_S_IRQ_EN);
+	irq_en |= S_RX_FIFO_WATERMARK_EN | S_RX_FIFO_LAST_EN;
+	writel(irq_en, uport->membase + SE_GENI_S_IRQ_EN);
 
-		irq_en = readl_relaxed(uport->membase + SE_GENI_M_IRQ_EN);
-		irq_en |= M_RX_FIFO_WATERMARK_EN | M_RX_FIFO_LAST_EN;
-		writel_relaxed(irq_en, uport->membase + SE_GENI_M_IRQ_EN);
-	}
+	irq_en = readl(uport->membase + SE_GENI_M_IRQ_EN);
+	irq_en |= M_RX_FIFO_WATERMARK_EN | M_RX_FIFO_LAST_EN;
+	writel(irq_en, uport->membase + SE_GENI_M_IRQ_EN);
 }
 
 static void qcom_geni_serial_stop_rx(struct uart_port *uport)
@@ -652,34 +615,35 @@
 	u32 irq_en;
 	u32 status;
 	struct qcom_geni_serial_port *port = to_dev_port(uport, uport);
-	u32 irq_clear = S_CMD_DONE_EN;
+	u32 s_irq_status;
 
-	if (port->xfer_mode == GENI_SE_FIFO) {
-		irq_en = readl_relaxed(uport->membase + SE_GENI_S_IRQ_EN);
-		irq_en &= ~(S_RX_FIFO_WATERMARK_EN | S_RX_FIFO_LAST_EN);
-		writel_relaxed(irq_en, uport->membase + SE_GENI_S_IRQ_EN);
+	irq_en = readl(uport->membase + SE_GENI_S_IRQ_EN);
+	irq_en &= ~(S_RX_FIFO_WATERMARK_EN | S_RX_FIFO_LAST_EN);
+	writel(irq_en, uport->membase + SE_GENI_S_IRQ_EN);
 
-		irq_en = readl_relaxed(uport->membase + SE_GENI_M_IRQ_EN);
-		irq_en &= ~(M_RX_FIFO_WATERMARK_EN | M_RX_FIFO_LAST_EN);
-		writel_relaxed(irq_en, uport->membase + SE_GENI_M_IRQ_EN);
-	}
+	irq_en = readl(uport->membase + SE_GENI_M_IRQ_EN);
+	irq_en &= ~(M_RX_FIFO_WATERMARK_EN | M_RX_FIFO_LAST_EN);
+	writel(irq_en, uport->membase + SE_GENI_M_IRQ_EN);
 
-	status = readl_relaxed(uport->membase + SE_GENI_STATUS);
+	status = readl(uport->membase + SE_GENI_STATUS);
 	/* Possible stop rx is called multiple times. */
 	if (!(status & S_GENI_CMD_ACTIVE))
 		return;
 
-	/*
-	 * Ensure cancel command write is not re-ordered before checking
-	 * the status of the Secondary Sequencer.
-	 */
-	mb();
-
 	geni_se_cancel_s_cmd(&port->se);
-	qcom_geni_serial_poll_bit(uport, SE_GENI_S_CMD_CTRL_REG,
-					S_GENI_CMD_CANCEL, false);
-	status = readl_relaxed(uport->membase + SE_GENI_STATUS);
-	writel_relaxed(irq_clear, uport->membase + SE_GENI_S_IRQ_CLEAR);
+	qcom_geni_serial_poll_bit(uport, SE_GENI_S_IRQ_STATUS,
+					S_CMD_CANCEL_EN, true);
+	/*
+	 * If timeout occurs secondary engine remains active
+	 * and Abort sequence is executed.
+	 */
+	s_irq_status = readl(uport->membase + SE_GENI_S_IRQ_STATUS);
+	/* Flush the Rx buffer */
+	if (s_irq_status & S_RX_FIFO_LAST_EN)
+		qcom_geni_serial_handle_rx(uport, true);
+	writel(s_irq_status, uport->membase + SE_GENI_S_IRQ_CLEAR);
+
+	status = readl(uport->membase + SE_GENI_STATUS);
 	if (status & S_GENI_CMD_ACTIVE)
 		qcom_geni_serial_abort_rx(uport);
 }
@@ -693,7 +657,7 @@
 	u32 total_bytes;
 	struct qcom_geni_serial_port *port = to_dev_port(uport, uport);
 
-	status = readl_relaxed(uport->membase +	SE_GENI_RX_FIFO_STATUS);
+	status = readl(uport->membase +	SE_GENI_RX_FIFO_STATUS);
 	word_cnt = status & RX_FIFO_WC_MSK;
 	last_word_partial = status & RX_LAST;
 	last_word_byte_cnt = (status & RX_LAST_BYTE_VALID_MSK) >>
@@ -719,10 +683,11 @@
 	size_t pending;
 	int i;
 	u32 status;
+	u32 irq_en;
 	unsigned int chunk;
 	int tail;
 
-	status = readl_relaxed(uport->membase + SE_GENI_TX_FIFO_STATUS);
+	status = readl(uport->membase + SE_GENI_TX_FIFO_STATUS);
 
 	/* Complete the current tx command before taking newly added data */
 	if (active)
@@ -747,6 +712,11 @@
 	if (!port->tx_remaining) {
 		qcom_geni_serial_setup_tx(uport, pending);
 		port->tx_remaining = pending;
+
+		irq_en = readl(uport->membase + SE_GENI_M_IRQ_EN);
+		if (!(irq_en & M_TX_FIFO_WATERMARK_EN))
+			writel(irq_en | M_TX_FIFO_WATERMARK_EN,
+					uport->membase + SE_GENI_M_IRQ_EN);
 	}
 
 	remaining = chunk;
@@ -770,7 +740,23 @@
 	}
 
 	xmit->tail = tail & (UART_XMIT_SIZE - 1);
+
+	/*
+	 * The tx fifo watermark is level triggered and latched. Though we had
+	 * cleared it in qcom_geni_serial_isr it will have already reasserted
+	 * so we must clear it again here after our writes.
+	 */
+	writel(M_TX_FIFO_WATERMARK_EN,
+			uport->membase + SE_GENI_M_IRQ_CLEAR);
+
 out_write_wakeup:
+	if (!port->tx_remaining) {
+		irq_en = readl(uport->membase + SE_GENI_M_IRQ_EN);
+		if (irq_en & M_TX_FIFO_WATERMARK_EN)
+			writel(irq_en & ~M_TX_FIFO_WATERMARK_EN,
+					uport->membase + SE_GENI_M_IRQ_EN);
+	}
+
 	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
 		uart_write_wakeup(uport);
 }
@@ -791,12 +777,12 @@
 		return IRQ_NONE;
 
 	spin_lock_irqsave(&uport->lock, flags);
-	m_irq_status = readl_relaxed(uport->membase + SE_GENI_M_IRQ_STATUS);
-	s_irq_status = readl_relaxed(uport->membase + SE_GENI_S_IRQ_STATUS);
-	geni_status = readl_relaxed(uport->membase + SE_GENI_STATUS);
-	m_irq_en = readl_relaxed(uport->membase + SE_GENI_M_IRQ_EN);
-	writel_relaxed(m_irq_status, uport->membase + SE_GENI_M_IRQ_CLEAR);
-	writel_relaxed(s_irq_status, uport->membase + SE_GENI_S_IRQ_CLEAR);
+	m_irq_status = readl(uport->membase + SE_GENI_M_IRQ_STATUS);
+	s_irq_status = readl(uport->membase + SE_GENI_S_IRQ_STATUS);
+	geni_status = readl(uport->membase + SE_GENI_STATUS);
+	m_irq_en = readl(uport->membase + SE_GENI_M_IRQ_EN);
+	writel(m_irq_status, uport->membase + SE_GENI_M_IRQ_CLEAR);
+	writel(s_irq_status, uport->membase + SE_GENI_S_IRQ_CLEAR);
 
 	if (WARN_ON(m_irq_status & M_ILLEGAL_CMD_EN))
 		goto out_unlock;
@@ -806,8 +792,7 @@
 		tty_insert_flip_char(tport, 0, TTY_OVERRUN);
 	}
 
-	if (m_irq_status & (M_TX_FIFO_WATERMARK_EN | M_CMD_DONE_EN) &&
-	    m_irq_en & (M_TX_FIFO_WATERMARK_EN | M_CMD_DONE_EN))
+	if (m_irq_status & m_irq_en & (M_TX_FIFO_WATERMARK_EN | M_CMD_DONE_EN))
 		qcom_geni_serial_handle_tx(uport, m_irq_status & M_CMD_DONE_EN,
 					geni_status & M_GENI_CMD_ACTIVE);
 
@@ -842,17 +827,6 @@
 		(port->tx_fifo_depth * port->tx_fifo_width) / BITS_PER_BYTE;
 }
 
-static void set_rfr_wm(struct qcom_geni_serial_port *port)
-{
-	/*
-	 * Set RFR (Flow off) to FIFO_DEPTH - 2.
-	 * RX WM level at 10% RX_FIFO_DEPTH.
-	 * TX WM level at 10% TX_FIFO_DEPTH.
-	 */
-	port->rx_rfr = port->rx_fifo_depth - 2;
-	port->rx_wm = UART_CONSOLE_RX_WM;
-	port->tx_wm = DEF_TX_WM;
-}
 
 static void qcom_geni_serial_shutdown(struct uart_port *uport)
 {
@@ -891,21 +865,19 @@
 
 	get_tx_fifo_size(port);
 
-	set_rfr_wm(port);
-	writel_relaxed(rxstale, uport->membase + SE_UART_RX_STALE_CNT);
+	writel(rxstale, uport->membase + SE_UART_RX_STALE_CNT);
 	/*
 	 * Make an unconditional cancel on the main sequencer to reset
 	 * it else we could end up in data loss scenarios.
 	 */
-	port->xfer_mode = GENI_SE_FIFO;
 	if (uart_console(uport))
 		qcom_geni_serial_poll_tx_done(uport);
 	geni_se_config_packing(&port->se, BITS_PER_BYTE, port->tx_bytes_pw,
 						false, true, false);
 	geni_se_config_packing(&port->se, BITS_PER_BYTE, port->rx_bytes_pw,
 						false, false, true);
-	geni_se_init(&port->se, port->rx_wm, port->rx_rfr);
-	geni_se_select_mode(&port->se, port->xfer_mode);
+	geni_se_init(&port->se, UART_RX_WM, port->rx_fifo_depth - 2);
+	geni_se_select_mode(&port->se, GENI_SE_FIFO);
 	if (!uart_console(uport)) {
 		port->rx_fifo = devm_kcalloc(uport->dev,
 			port->rx_fifo_depth, sizeof(u32), GFP_KERNEL);
@@ -996,10 +968,10 @@
 	ser_clk_cfg |= clk_div << CLK_DIV_SHFT;
 
 	/* parity */
-	tx_trans_cfg = readl_relaxed(uport->membase + SE_UART_TX_TRANS_CFG);
-	tx_parity_cfg = readl_relaxed(uport->membase + SE_UART_TX_PARITY_CFG);
-	rx_trans_cfg = readl_relaxed(uport->membase + SE_UART_RX_TRANS_CFG);
-	rx_parity_cfg = readl_relaxed(uport->membase + SE_UART_RX_PARITY_CFG);
+	tx_trans_cfg = readl(uport->membase + SE_UART_TX_TRANS_CFG);
+	tx_parity_cfg = readl(uport->membase + SE_UART_TX_PARITY_CFG);
+	rx_trans_cfg = readl(uport->membase + SE_UART_RX_TRANS_CFG);
+	rx_parity_cfg = readl(uport->membase + SE_UART_RX_PARITY_CFG);
 	if (termios->c_cflag & PARENB) {
 		tx_trans_cfg |= UART_TX_PAR_EN;
 		rx_trans_cfg |= UART_RX_PAR_EN;
@@ -1055,17 +1027,17 @@
 		uart_update_timeout(uport, termios->c_cflag, baud);
 
 	if (!uart_console(uport))
-		writel_relaxed(port->loopback,
+		writel(port->loopback,
 				uport->membase + SE_UART_LOOPBACK_CFG);
-	writel_relaxed(tx_trans_cfg, uport->membase + SE_UART_TX_TRANS_CFG);
-	writel_relaxed(tx_parity_cfg, uport->membase + SE_UART_TX_PARITY_CFG);
-	writel_relaxed(rx_trans_cfg, uport->membase + SE_UART_RX_TRANS_CFG);
-	writel_relaxed(rx_parity_cfg, uport->membase + SE_UART_RX_PARITY_CFG);
-	writel_relaxed(bits_per_char, uport->membase + SE_UART_TX_WORD_LEN);
-	writel_relaxed(bits_per_char, uport->membase + SE_UART_RX_WORD_LEN);
-	writel_relaxed(stop_bit_len, uport->membase + SE_UART_TX_STOP_BIT_LEN);
-	writel_relaxed(ser_clk_cfg, uport->membase + GENI_SER_M_CLK_CFG);
-	writel_relaxed(ser_clk_cfg, uport->membase + GENI_SER_S_CLK_CFG);
+	writel(tx_trans_cfg, uport->membase + SE_UART_TX_TRANS_CFG);
+	writel(tx_parity_cfg, uport->membase + SE_UART_TX_PARITY_CFG);
+	writel(rx_trans_cfg, uport->membase + SE_UART_RX_TRANS_CFG);
+	writel(rx_parity_cfg, uport->membase + SE_UART_RX_PARITY_CFG);
+	writel(bits_per_char, uport->membase + SE_UART_TX_WORD_LEN);
+	writel(bits_per_char, uport->membase + SE_UART_RX_WORD_LEN);
+	writel(stop_bit_len, uport->membase + SE_UART_TX_STOP_BIT_LEN);
+	writel(ser_clk_cfg, uport->membase + GENI_SER_M_CLK_CFG);
+	writel(ser_clk_cfg, uport->membase + GENI_SER_S_CLK_CFG);
 out_restart_rx:
 	qcom_geni_serial_start_rx(uport);
 }
@@ -1156,13 +1128,13 @@
 	geni_se_init(&se, DEF_FIFO_DEPTH_WORDS / 2, DEF_FIFO_DEPTH_WORDS - 2);
 	geni_se_select_mode(&se, GENI_SE_FIFO);
 
-	writel_relaxed(tx_trans_cfg, uport->membase + SE_UART_TX_TRANS_CFG);
-	writel_relaxed(tx_parity_cfg, uport->membase + SE_UART_TX_PARITY_CFG);
-	writel_relaxed(rx_trans_cfg, uport->membase + SE_UART_RX_TRANS_CFG);
-	writel_relaxed(rx_parity_cfg, uport->membase + SE_UART_RX_PARITY_CFG);
-	writel_relaxed(bits_per_char, uport->membase + SE_UART_TX_WORD_LEN);
-	writel_relaxed(bits_per_char, uport->membase + SE_UART_RX_WORD_LEN);
-	writel_relaxed(stop_bit_len, uport->membase + SE_UART_TX_STOP_BIT_LEN);
+	writel(tx_trans_cfg, uport->membase + SE_UART_TX_TRANS_CFG);
+	writel(tx_parity_cfg, uport->membase + SE_UART_TX_PARITY_CFG);
+	writel(rx_trans_cfg, uport->membase + SE_UART_RX_TRANS_CFG);
+	writel(rx_parity_cfg, uport->membase + SE_UART_RX_PARITY_CFG);
+	writel(bits_per_char, uport->membase + SE_UART_TX_WORD_LEN);
+	writel(bits_per_char, uport->membase + SE_UART_RX_WORD_LEN);
+	writel(stop_bit_len, uport->membase + SE_UART_TX_STOP_BIT_LEN);
 
 	dev->con->write = qcom_geni_serial_earlycon_write;
 	dev->con->setup = NULL;

diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c
index b88ecf1..e9779b0 100644
--- a/drivers/tty/synclink_gt.c
+++ b/drivers/tty/synclink_gt.c

@@ -1335,10 +1335,10 @@
 	DBGINFO(("%s throttle\n", info->device_name));
 	if (I_IXOFF(tty))
 		send_xchar(tty, STOP_CHAR(tty));
- 	if (C_CRTSCTS(tty)) {
+	if (C_CRTSCTS(tty)) {
 		spin_lock_irqsave(&info->lock,flags);
 		info->signals &= ~SerialSignal_RTS;
-	 	set_signals(info);
+		set_signals(info);
 		spin_unlock_irqrestore(&info->lock,flags);
 	}
 }
@@ -1360,10 +1360,10 @@
 		else
 			send_xchar(tty, START_CHAR(tty));
 	}
- 	if (C_CRTSCTS(tty)) {
+	if (C_CRTSCTS(tty)) {
 		spin_lock_irqsave(&info->lock,flags);
 		info->signals |= SerialSignal_RTS;
-	 	set_signals(info);
+		set_signals(info);
 		spin_unlock_irqrestore(&info->lock,flags);
 	}
 }
@@ -2561,8 +2561,8 @@
 	info->read_status_mask = IRQ_RXOVER;
 	if (I_INPCK(info->port.tty))
 		info->read_status_mask |= MASK_PARITY | MASK_FRAMING;
- 	if (I_BRKINT(info->port.tty) || I_PARMRK(info->port.tty))
- 		info->read_status_mask |= MASK_BREAK;
+	if (I_BRKINT(info->port.tty) || I_PARMRK(info->port.tty))
+		info->read_status_mask |= MASK_BREAK;
 	if (I_IGNPAR(info->port.tty))
 		info->ignore_status_mask |= MASK_PARITY | MASK_FRAMING;
 	if (I_IGNBRK(info->port.tty)) {
@@ -3193,7 +3193,7 @@
 		info->signals &= ~SerialSignal_DTR;
 
 	spin_lock_irqsave(&info->lock,flags);
- 	set_signals(info);
+	set_signals(info);
 	spin_unlock_irqrestore(&info->lock,flags);
 	return 0;
 }
@@ -3204,7 +3204,7 @@
 	struct slgt_info *info = container_of(port, struct slgt_info, port);
 
 	spin_lock_irqsave(&info->lock,flags);
- 	get_signals(info);
+	get_signals(info);
 	spin_unlock_irqrestore(&info->lock,flags);
 	return (info->signals & SerialSignal_DCD) ? 1 : 0;
 }
@@ -3219,7 +3219,7 @@
 		info->signals |= SerialSignal_RTS | SerialSignal_DTR;
 	else
 		info->signals &= ~(SerialSignal_RTS | SerialSignal_DTR);
- 	set_signals(info);
+	set_signals(info);
 	spin_unlock_irqrestore(&info->lock,flags);
 }
 

diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c
index 1e4d5b9..57c2c64 100644
--- a/drivers/tty/synclinkmp.c
+++ b/drivers/tty/synclinkmp.c

@@ -1454,10 +1454,10 @@
 	if (I_IXOFF(tty))
 		send_xchar(tty, STOP_CHAR(tty));
 
- 	if (C_CRTSCTS(tty)) {
+	if (C_CRTSCTS(tty)) {
 		spin_lock_irqsave(&info->lock,flags);
 		info->serial_signals &= ~SerialSignal_RTS;
-	 	set_signals(info);
+		set_signals(info);
 		spin_unlock_irqrestore(&info->lock,flags);
 	}
 }
@@ -1483,10 +1483,10 @@
 			send_xchar(tty, START_CHAR(tty));
 	}
 
- 	if (C_CRTSCTS(tty)) {
+	if (C_CRTSCTS(tty)) {
 		spin_lock_irqsave(&info->lock,flags);
 		info->serial_signals |= SerialSignal_RTS;
-	 	set_signals(info);
+		set_signals(info);
 		spin_unlock_irqrestore(&info->lock,flags);
 	}
 }
@@ -2471,7 +2471,7 @@
 					if (status & SerialSignal_CTS) {
 						if ( debug_level >= DEBUG_LEVEL_ISR )
 							printk("CTS tx start...");
-			 			info->port.tty->hw_stopped = 0;
+						info->port.tty->hw_stopped = 0;
 						tx_start(info);
 						info->pending_bh |= BH_TRANSMIT;
 						return;
@@ -2480,7 +2480,7 @@
 					if (!(status & SerialSignal_CTS)) {
 						if ( debug_level >= DEBUG_LEVEL_ISR )
 							printk("CTS tx stop...");
-			 			info->port.tty->hw_stopped = 1;
+						info->port.tty->hw_stopped = 1;
 						tx_stop(info);
 					}
 				}
@@ -2807,8 +2807,8 @@
 	info->read_status_mask2 = OVRN;
 	if (I_INPCK(info->port.tty))
 		info->read_status_mask2 |= PE | FRME;
- 	if (I_BRKINT(info->port.tty) || I_PARMRK(info->port.tty))
- 		info->read_status_mask1 |= BRKD;
+	if (I_BRKINT(info->port.tty) || I_PARMRK(info->port.tty))
+		info->read_status_mask1 |= BRKD;
 	if (I_IGNPAR(info->port.tty))
 		info->ignore_status_mask2 |= PE | FRME;
 	if (I_IGNBRK(info->port.tty)) {
@@ -3178,7 +3178,7 @@
  	unsigned long flags;
 
 	spin_lock_irqsave(&info->lock,flags);
- 	get_signals(info);
+	get_signals(info);
 	spin_unlock_irqrestore(&info->lock,flags);
 
 	result = ((info->serial_signals & SerialSignal_RTS) ? TIOCM_RTS : 0) |
@@ -3216,7 +3216,7 @@
 		info->serial_signals &= ~SerialSignal_DTR;
 
 	spin_lock_irqsave(&info->lock,flags);
- 	set_signals(info);
+	set_signals(info);
 	spin_unlock_irqrestore(&info->lock,flags);
 
 	return 0;
@@ -3228,7 +3228,7 @@
 	unsigned long flags;
 
 	spin_lock_irqsave(&info->lock,flags);
- 	get_signals(info);
+	get_signals(info);
 	spin_unlock_irqrestore(&info->lock,flags);
 
 	return (info->serial_signals & SerialSignal_DCD) ? 1 : 0;
@@ -3244,7 +3244,7 @@
 		info->serial_signals |= SerialSignal_RTS | SerialSignal_DTR;
 	else
 		info->serial_signals &= ~(SerialSignal_RTS | SerialSignal_DTR);
- 	set_signals(info);
+	set_signals(info);
 	spin_unlock_irqrestore(&info->lock,flags);
 }
 

diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 06ed20d..cee0274 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c

@@ -546,7 +546,6 @@
 	 */
 	orig_log_level = console_loglevel;
 	console_loglevel = CONSOLE_LOGLEVEL_DEFAULT;
-	pr_info("SysRq : ");
 
         op_p = __sysrq_get_key_op(key);
         if (op_p) {
@@ -555,14 +554,15 @@
 		 * should not) and is the invoked operation enabled?
 		 */
 		if (!check_mask || sysrq_on_mask(op_p->enable_mask)) {
-			pr_cont("%s\n", op_p->action_msg);
+			pr_info("%s\n", op_p->action_msg);
 			console_loglevel = orig_log_level;
 			op_p->handler(key);
 		} else {
-			pr_cont("This sysrq operation is disabled.\n");
+			pr_info("This sysrq operation is disabled.\n");
+			console_loglevel = orig_log_level;
 		}
 	} else {
-		pr_cont("HELP : ");
+		pr_info("HELP : ");
 		/* Only print the help msg once per handler */
 		for (i = 0; i < ARRAY_SIZE(sysrq_key_table); i++) {
 			if (sysrq_key_table[i]) {

diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c
index c699d41..fbacb00 100644
--- a/drivers/tty/tty_port.c
+++ b/drivers/tty/tty_port.c

@@ -52,10 +52,11 @@
 	}
 }
 
-static const struct tty_port_client_operations default_client_ops = {
+const struct tty_port_client_operations tty_port_default_client_ops = {
 	.receive_buf = tty_port_default_receive_buf,
 	.write_wakeup = tty_port_default_wakeup,
 };
+EXPORT_SYMBOL_GPL(tty_port_default_client_ops);
 
 void tty_port_init(struct tty_port *port)
 {
@@ -68,7 +69,7 @@
 	spin_lock_init(&port->lock);
 	port->close_delay = (50 * HZ) / 100;
 	port->closing_wait = (3000 * HZ) / 100;
-	port->client_ops = &default_client_ops;
+	port->client_ops = &tty_port_default_client_ops;
 	kref_init(&port->kref);
 }
 EXPORT_SYMBOL(tty_port_init);

diff --git a/drivers/tty/vt/selection.c b/drivers/tty/vt/selection.c
index 07496c7..3ac4fe5 100644
--- a/drivers/tty/vt/selection.c
+++ b/drivers/tty/vt/selection.c

@@ -27,6 +27,8 @@
 #include <linux/console.h>
 #include <linux/tty_flip.h>
 
+#include <linux/sched/signal.h>
+
 /* Don't take this from <ctype.h>: 011-015 on the screen aren't spaces */
 #define isspace(c)	((c) == ' ')
 
@@ -337,6 +339,7 @@
 	unsigned int count;
 	struct  tty_ldisc *ld;
 	DECLARE_WAITQUEUE(wait, current);
+	int ret = 0;
 
 	console_lock();
 	poke_blanked_console();
@@ -350,6 +353,10 @@
 	add_wait_queue(&vc->paste_wait, &wait);
 	while (sel_buffer && sel_buffer_lth > pasted) {
 		set_current_state(TASK_INTERRUPTIBLE);
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
 		if (tty_throttled(tty)) {
 			schedule();
 			continue;
@@ -365,5 +372,5 @@
 
 	tty_buffer_unlock_exclusive(&vc->port);
 	tty_ldisc_deref(ld);
-	return 0;
+	return ret;
 }

diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index d673e35..ddaecb1 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c

@@ -936,10 +936,21 @@
 	WARN_CONSOLE_UNLOCKED();
 
 	set_origin(vc);
-	if (vc->vc_sw->con_flush_scrollback)
+	if (vc->vc_sw->con_flush_scrollback) {
 		vc->vc_sw->con_flush_scrollback(vc);
-	else
+	} else if (con_is_visible(vc)) {
+		/*
+		 * When no con_flush_scrollback method is provided then the
+		 * legacy way for flushing the scrollback buffer is to use
+		 * a side effect of the con_switch method. We do it only on
+		 * the foreground console as background consoles have no
+		 * scrollback buffers in that case and we obviously don't
+		 * want to switch to them.
+		 */
+		hide_cursor(vc);
 		vc->vc_sw->con_switch(vc);
+		set_cursor(vc);
+	}
 }
 
 /*

diff --git a/drivers/tty/vt/vt_ioctl.c b/drivers/tty/vt/vt_ioctl.c
index 73cdc0d..2bb6de8 100644
--- a/drivers/tty/vt/vt_ioctl.c
+++ b/drivers/tty/vt/vt_ioctl.c

@@ -876,15 +876,20 @@
 			return -EINVAL;
 
 		for (i = 0; i < MAX_NR_CONSOLES; i++) {
+			struct vc_data *vcp;
+
 			if (!vc_cons[i].d)
 				continue;
 			console_lock();
-			if (v.v_vlin)
-				vc_cons[i].d->vc_scan_lines = v.v_vlin;
-			if (v.v_clin)
-				vc_cons[i].d->vc_font.height = v.v_clin;
-			vc_cons[i].d->vc_resize_user = 1;
-			vc_resize(vc_cons[i].d, v.v_cols, v.v_rows);
+			vcp = vc_cons[i].d;
+			if (vcp) {
+				if (v.v_vlin)
+					vcp->vc_scan_lines = v.v_vlin;
+				if (v.v_clin)
+					vcp->vc_font.height = v.v_clin;
+				vcp->vc_resize_user = 1;
+				vc_resize(vcp, v.v_cols, v.v_rows);
+			}
 			console_unlock();
 		}
 		break;

diff --git a/drivers/uio/uio_dmem_genirq.c b/drivers/uio/uio_dmem_genirq.c
index e1134a4..a00b4ae 100644
--- a/drivers/uio/uio_dmem_genirq.c
+++ b/drivers/uio/uio_dmem_genirq.c

@@ -135,11 +135,13 @@
 	if (irq_on) {
 		if (test_and_clear_bit(0, &priv->flags))
 			enable_irq(dev_info->irq);
+		spin_unlock_irqrestore(&priv->lock, flags);
 	} else {
-		if (!test_and_set_bit(0, &priv->flags))
+		if (!test_and_set_bit(0, &priv->flags)) {
+			spin_unlock_irqrestore(&priv->lock, flags);
 			disable_irq(dev_info->irq);
+		}
 	}
-	spin_unlock_irqrestore(&priv->lock, flags);
 
 	return 0;
 }

diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c
index 0bf0e62..2025261 100644
--- a/drivers/usb/core/config.c
+++ b/drivers/usb/core/config.c

@@ -256,6 +256,7 @@
 		struct usb_host_interface *ifp, int num_ep,
 		unsigned char *buffer, int size)
 {
+	struct usb_device *udev = to_usb_device(ddev);
 	unsigned char *buffer0 = buffer;
 	struct usb_endpoint_descriptor *d;
 	struct usb_host_endpoint *endpoint;
@@ -297,6 +298,16 @@
 		goto skip_to_next_endpoint_or_interface_descriptor;
 	}
 
+	/* Ignore blacklisted endpoints */
+	if (udev->quirks & USB_QUIRK_ENDPOINT_BLACKLIST) {
+		if (usb_endpoint_is_blacklisted(udev, ifp, d)) {
+			dev_warn(ddev, "config %d interface %d altsetting %d has a blacklisted endpoint with address 0x%X, skipping\n",
+					cfgno, inum, asnum,
+					d->bEndpointAddress);
+			goto skip_to_next_endpoint_or_interface_descriptor;
+		}
+	}
+
 	endpoint = &ifp->endpoint[ifp->desc.bNumEndpoints];
 	++ifp->desc.bNumEndpoints;
 

diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 6ab4ca1..27486b0 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c

@@ -36,7 +36,9 @@
 #include "otg_whitelist.h"
 
 #define USB_VENDOR_GENESYS_LOGIC		0x05e3
+#define USB_VENDOR_SMSC				0x0424
 #define HUB_QUIRK_CHECK_PORT_AUTOSUSPEND	0x01
+#define HUB_QUIRK_DISABLE_AUTOSUSPEND		0x02
 
 #define USB_TP_TRANSMISSION_DELAY	40	/* ns */
 #define USB_TP_TRANSMISSION_DELAY_MAX	65535	/* ns */
@@ -1190,11 +1192,6 @@
 #ifdef CONFIG_PM
 			udev->reset_resume = 1;
 #endif
-			/* Don't set the change_bits when the device
-			 * was powered off.
-			 */
-			if (test_bit(port1, hub->power_bits))
-				set_bit(port1, hub->change_bits);
 
 		} else {
 			/* The power session is gone; tell hub_wq */
@@ -1700,6 +1697,10 @@
 	kfree(hub->buffer);
 
 	pm_suspend_ignore_children(&intf->dev, false);
+
+	if (hub->quirk_disable_autosuspend)
+		usb_autopm_put_interface(intf);
+
 	kref_put(&hub->kref, hub_release);
 }
 
@@ -1830,6 +1831,11 @@
 	if (id->driver_info & HUB_QUIRK_CHECK_PORT_AUTOSUSPEND)
 		hub->quirk_check_port_auto_suspend = 1;
 
+	if (id->driver_info & HUB_QUIRK_DISABLE_AUTOSUSPEND) {
+		hub->quirk_disable_autosuspend = 1;
+		usb_autopm_get_interface(intf);
+	}
+
 	if (hub_configure(hub, &desc->endpoint[0].desc) >= 0)
 		return 0;
 
@@ -5410,6 +5416,10 @@
 }
 
 static const struct usb_device_id hub_id_table[] = {
+    { .match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_INT_CLASS,
+      .idVendor = USB_VENDOR_SMSC,
+      .bInterfaceClass = USB_CLASS_HUB,
+      .driver_info = HUB_QUIRK_DISABLE_AUTOSUSPEND},
     { .match_flags = USB_DEVICE_ID_MATCH_VENDOR
 			| USB_DEVICE_ID_MATCH_INT_CLASS,
       .idVendor = USB_VENDOR_GENESYS_LOGIC,

diff --git a/drivers/usb/core/hub.h b/drivers/usb/core/hub.h
index 4accfb6..d0bbbd7 100644
--- a/drivers/usb/core/hub.h
+++ b/drivers/usb/core/hub.h

@@ -61,6 +61,7 @@
 	unsigned		quiescing:1;
 	unsigned		disconnected:1;
 	unsigned		in_reset:1;
+	unsigned		quirk_disable_autosuspend:1;
 
 	unsigned		quirk_check_port_auto_suspend:1;
 

diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
index 6b64130..2b24336 100644
--- a/drivers/usb/core/quirks.c
+++ b/drivers/usb/core/quirks.c

@@ -354,6 +354,10 @@
 	{ USB_DEVICE(0x0904, 0x6103), .driver_info =
 			USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL },
 
+	/* Sound Devices USBPre2 */
+	{ USB_DEVICE(0x0926, 0x0202), .driver_info =
+			USB_QUIRK_ENDPOINT_BLACKLIST },
+
 	/* Keytouch QWERTY Panel keyboard */
 	{ USB_DEVICE(0x0926, 0x3333), .driver_info =
 			USB_QUIRK_CONFIG_INTF_STRINGS },
@@ -445,6 +449,9 @@
 	/* INTEL VALUE SSD */
 	{ USB_DEVICE(0x8086, 0xf1a5), .driver_info = USB_QUIRK_RESET_RESUME },
 
+	/* novation SoundControl XL */
+	{ USB_DEVICE(0x1235, 0x0061), .driver_info = USB_QUIRK_RESET_RESUME },
+
 	{ }  /* terminating entry must be last */
 };
 
@@ -472,6 +479,39 @@
 	{ }  /* terminating entry must be last */
 };
 
+/*
+ * Entries for blacklisted endpoints that should be ignored when parsing
+ * configuration descriptors.
+ *
+ * Matched for devices with USB_QUIRK_ENDPOINT_BLACKLIST.
+ */
+static const struct usb_device_id usb_endpoint_blacklist[] = {
+	{ USB_DEVICE_INTERFACE_NUMBER(0x0926, 0x0202, 1), .driver_info = 0x85 },
+	{ }
+};
+
+bool usb_endpoint_is_blacklisted(struct usb_device *udev,
+		struct usb_host_interface *intf,
+		struct usb_endpoint_descriptor *epd)
+{
+	const struct usb_device_id *id;
+	unsigned int address;
+
+	for (id = usb_endpoint_blacklist; id->match_flags; ++id) {
+		if (!usb_match_device(udev, id))
+			continue;
+
+		if (!usb_match_one_id_intf(udev, intf, id))
+			continue;
+
+		address = id->driver_info;
+		if (address == epd->bEndpointAddress)
+			return true;
+	}
+
+	return false;
+}
+
 static bool usb_match_any_interface(struct usb_device *udev,
 				    const struct usb_device_id *id)
 {

diff --git a/drivers/usb/core/usb.h b/drivers/usb/core/usb.h
index d95a535..c0df5a4 100644
--- a/drivers/usb/core/usb.h
+++ b/drivers/usb/core/usb.h

@@ -37,6 +37,9 @@
 extern void usb_detect_quirks(struct usb_device *udev);
 extern void usb_detect_interface_quirks(struct usb_device *udev);
 extern void usb_release_quirk_list(void);
+extern bool usb_endpoint_is_blacklisted(struct usb_device *udev,
+		struct usb_host_interface *intf,
+		struct usb_endpoint_descriptor *epd);
 extern int usb_remove_device(struct usb_device *udev);
 
 extern int usb_get_device_descriptor(struct usb_device *dev,

diff --git a/drivers/usb/dwc2/gadget.c b/drivers/usb/dwc2/gadget.c
index f64d1cd..d842483 100644
--- a/drivers/usb/dwc2/gadget.c
+++ b/drivers/usb/dwc2/gadget.c

@@ -1004,11 +1004,6 @@
 	else
 		packets = 1;	/* send one packet if length is zero. */
 
-	if (hs_ep->isochronous && length > (hs_ep->mc * hs_ep->ep.maxpacket)) {
-		dev_err(hsotg->dev, "req length > maxpacket*mc\n");
-		return;
-	}
-
 	if (dir_in && index != 0)
 		if (hs_ep->isochronous)
 			epsize = DXEPTSIZ_MC(packets);
@@ -1312,6 +1307,13 @@
 	req->actual = 0;
 	req->status = -EINPROGRESS;
 
+	/* Don't queue ISOC request if length greater than mps*mc */
+	if (hs_ep->isochronous &&
+	    req->length > (hs_ep->mc * hs_ep->ep.maxpacket)) {
+		dev_err(hs->dev, "req length > maxpacket*mc\n");
+		return -EINVAL;
+	}
+
 	/* In DDMA mode for ISOC's don't queue request if length greater
 	 * than descriptor limits.
 	 */
@@ -1542,6 +1544,7 @@
 	struct dwc2_hsotg_ep *ep0 = hsotg->eps_out[0];
 	struct dwc2_hsotg_ep *ep;
 	__le16 reply;
+	u16 status;
 	int ret;
 
 	dev_dbg(hsotg->dev, "%s: USB_REQ_GET_STATUS\n", __func__);
@@ -1553,11 +1556,10 @@
 
 	switch (ctrl->bRequestType & USB_RECIP_MASK) {
 	case USB_RECIP_DEVICE:
-		/*
-		 * bit 0 => self powered
-		 * bit 1 => remote wakeup
-		 */
-		reply = cpu_to_le16(0);
+		status = 1 << USB_DEVICE_SELF_POWERED;
+		status |= hsotg->remote_wakeup_allowed <<
+			  USB_DEVICE_REMOTE_WAKEUP;
+		reply = cpu_to_le16(status);
 		break;
 
 	case USB_RECIP_INTERFACE:
@@ -1668,7 +1670,10 @@
 	case USB_RECIP_DEVICE:
 		switch (wValue) {
 		case USB_DEVICE_REMOTE_WAKEUP:
-			hsotg->remote_wakeup_allowed = 1;
+			if (set)
+				hsotg->remote_wakeup_allowed = 1;
+			else
+				hsotg->remote_wakeup_allowed = 0;
 			break;
 
 		case USB_DEVICE_TEST_MODE:
@@ -1678,16 +1683,17 @@
 				return -EINVAL;
 
 			hsotg->test_mode = wIndex >> 8;
-			ret = dwc2_hsotg_send_reply(hsotg, ep0, NULL, 0);
-			if (ret) {
-				dev_err(hsotg->dev,
-					"%s: failed to send reply\n", __func__);
-				return ret;
-			}
 			break;
 		default:
 			return -ENOENT;
 		}
+
+		ret = dwc2_hsotg_send_reply(hsotg, ep0, NULL, 0);
+		if (ret) {
+			dev_err(hsotg->dev,
+				"%s: failed to send reply\n", __func__);
+			return ret;
+		}
 		break;
 
 	case USB_RECIP_ENDPOINT:
@@ -3918,11 +3924,12 @@
 	 * a unique tx-fifo even if it is non-periodic.
 	 */
 	if (dir_in && hsotg->dedicated_fifos) {
+		unsigned fifo_count = dwc2_hsotg_tx_fifo_count(hsotg);
 		u32 fifo_index = 0;
 		u32 fifo_size = UINT_MAX;
 
 		size = hs_ep->ep.maxpacket * hs_ep->mc;
-		for (i = 1; i < hsotg->num_of_eps; ++i) {
+		for (i = 1; i <= fifo_count; ++i) {
 			if (hsotg->fifo_map & (1 << i))
 				continue;
 			val = dwc2_readl(hsotg, DPTXFSIZN(i));

diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c
index a6e682a..430cfd6 100644
--- a/drivers/usb/dwc3/gadget.c
+++ b/drivers/usb/dwc3/gadget.c

@@ -2224,7 +2224,8 @@
 	if (event->status & DEPEVT_STATUS_SHORT && !chain)
 		return 1;
 
-	if (event->status & DEPEVT_STATUS_IOC)
+	if ((trb->ctrl & DWC3_TRB_CTRL_IOC) ||
+	    (trb->ctrl & DWC3_TRB_CTRL_LST))
 		return 1;
 
 	return 0;

diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c
index 33115e1..fea7c7e 100644
--- a/drivers/usb/gadget/composite.c
+++ b/drivers/usb/gadget/composite.c

@@ -437,12 +437,10 @@
 		val = CONFIG_USB_GADGET_VBUS_DRAW;
 	if (!val)
 		return 0;
-	switch (speed) {
-	case USB_SPEED_SUPER:
-		return DIV_ROUND_UP(val, 8);
-	default:
+	if (speed < USB_SPEED_SUPER)
 		return DIV_ROUND_UP(val, 2);
-	}
+	else
+		return DIV_ROUND_UP(val, 8);
 }
 
 static int config_buf(struct usb_configuration *config,

diff --git a/drivers/usb/gadget/udc/gr_udc.c b/drivers/usb/gadget/udc/gr_udc.c
index 729e60e..e50108f 100644
--- a/drivers/usb/gadget/udc/gr_udc.c
+++ b/drivers/usb/gadget/udc/gr_udc.c

@@ -2180,8 +2180,6 @@
 		return -ENOMEM;
 	}
 
-	spin_lock(&dev->lock);
-
 	/* Inside lock so that no gadget can use this udc until probe is done */
 	retval = usb_add_gadget_udc(dev->dev, &dev->gadget);
 	if (retval) {
@@ -2190,15 +2188,21 @@
 	}
 	dev->added = 1;
 
-	retval = gr_udc_init(dev);
-	if (retval)
-		goto out;
+	spin_lock(&dev->lock);
 
-	gr_dfs_create(dev);
+	retval = gr_udc_init(dev);
+	if (retval) {
+		spin_unlock(&dev->lock);
+		goto out;
+	}
 
 	/* Clear all interrupt enables that might be left on since last boot */
 	gr_disable_interrupts_and_pullup(dev);
 
+	spin_unlock(&dev->lock);
+
+	gr_dfs_create(dev);
+
 	retval = gr_request_irq(dev, dev->irq);
 	if (retval) {
 		dev_err(dev->dev, "Failed to request irq %d\n", dev->irq);
@@ -2227,8 +2231,6 @@
 		dev_info(dev->dev, "regs: %p, irq %d\n", dev->regs, dev->irq);
 
 out:
-	spin_unlock(&dev->lock);
-
 	if (retval)
 		gr_remove(pdev);
 

diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c
index 9772c0d..a024230 100644
--- a/drivers/usb/host/xhci-hub.c
+++ b/drivers/usb/host/xhci-hub.c

@@ -55,6 +55,7 @@
 static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf,
 				     u16 wLength)
 {
+	struct xhci_port_cap *port_cap = NULL;
 	int i, ssa_count;
 	u32 temp;
 	u16 desc_size, ssp_cap_size, ssa_size = 0;
@@ -64,16 +65,24 @@
 	ssp_cap_size = sizeof(usb_bos_descriptor) - desc_size;
 
 	/* does xhci support USB 3.1 Enhanced SuperSpeed */
-	if (xhci->usb3_rhub.min_rev >= 0x01) {
+	for (i = 0; i < xhci->num_port_caps; i++) {
+		if (xhci->port_caps[i].maj_rev == 0x03 &&
+		    xhci->port_caps[i].min_rev >= 0x01) {
+			usb3_1 = true;
+			port_cap = &xhci->port_caps[i];
+			break;
+		}
+	}
+
+	if (usb3_1) {
 		/* does xhci provide a PSI table for SSA speed attributes? */
-		if (xhci->usb3_rhub.psi_count) {
+		if (port_cap->psi_count) {
 			/* two SSA entries for each unique PSI ID, RX and TX */
-			ssa_count = xhci->usb3_rhub.psi_uid_count * 2;
+			ssa_count = port_cap->psi_uid_count * 2;
 			ssa_size = ssa_count * sizeof(u32);
 			ssp_cap_size -= 16; /* skip copying the default SSA */
 		}
 		desc_size += ssp_cap_size;
-		usb3_1 = true;
 	}
 	memcpy(buf, &usb_bos_descriptor, min(desc_size, wLength));
 
@@ -99,7 +108,7 @@
 	}
 
 	/* If PSI table exists, add the custom speed attributes from it */
-	if (usb3_1 && xhci->usb3_rhub.psi_count) {
+	if (usb3_1 && port_cap->psi_count) {
 		u32 ssp_cap_base, bm_attrib, psi, psi_mant, psi_exp;
 		int offset;
 
@@ -111,7 +120,7 @@
 
 		/* attribute count SSAC bits 4:0 and ID count SSIC bits 8:5 */
 		bm_attrib = (ssa_count - 1) & 0x1f;
-		bm_attrib |= (xhci->usb3_rhub.psi_uid_count - 1) << 5;
+		bm_attrib |= (port_cap->psi_uid_count - 1) << 5;
 		put_unaligned_le32(bm_attrib, &buf[ssp_cap_base + 4]);
 
 		if (wLength < desc_size + ssa_size)
@@ -124,8 +133,8 @@
 		 * USB 3.1 requires two SSA entries (RX and TX) for every link
 		 */
 		offset = desc_size;
-		for (i = 0; i < xhci->usb3_rhub.psi_count; i++) {
-			psi = xhci->usb3_rhub.psi[i];
+		for (i = 0; i < port_cap->psi_count; i++) {
+			psi = port_cap->psi[i];
 			psi &= ~USB_SSP_SUBLINK_SPEED_RSVD;
 			psi_exp = XHCI_EXT_PORT_PSIE(psi);
 			psi_mant = XHCI_EXT_PORT_PSIM(psi);

diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c
index 82ce6d8..9e87c28 100644
--- a/drivers/usb/host/xhci-mem.c
+++ b/drivers/usb/host/xhci-mem.c

@@ -1475,9 +1475,15 @@
 	/* Allow 3 retries for everything but isoc, set CErr = 3 */
 	if (!usb_endpoint_xfer_isoc(&ep->desc))
 		err_count = 3;
-	/* Some devices get this wrong */
-	if (usb_endpoint_xfer_bulk(&ep->desc) && udev->speed == USB_SPEED_HIGH)
-		max_packet = 512;
+	/* HS bulk max packet should be 512, FS bulk supports 8, 16, 32 or 64 */
+	if (usb_endpoint_xfer_bulk(&ep->desc)) {
+		if (udev->speed == USB_SPEED_HIGH)
+			max_packet = 512;
+		if (udev->speed == USB_SPEED_FULL) {
+			max_packet = rounddown_pow_of_two(max_packet);
+			max_packet = clamp_val(max_packet, 8, 64);
+		}
+	}
 	/* xHCI 1.0 and 1.1 indicates that ctrl ep avg TRB Length should be 8 */
 	if (usb_endpoint_xfer_control(&ep->desc) && xhci->hci_version >= 0x100)
 		avg_trb_len = 8;
@@ -1909,17 +1915,17 @@
 	xhci->usb3_rhub.num_ports = 0;
 	xhci->num_active_eps = 0;
 	kfree(xhci->usb2_rhub.ports);
-	kfree(xhci->usb2_rhub.psi);
 	kfree(xhci->usb3_rhub.ports);
-	kfree(xhci->usb3_rhub.psi);
 	kfree(xhci->hw_ports);
 	kfree(xhci->rh_bw);
 	kfree(xhci->ext_caps);
+	for (i = 0; i < xhci->num_port_caps; i++)
+		kfree(xhci->port_caps[i].psi);
+	kfree(xhci->port_caps);
+	xhci->num_port_caps = 0;
 
 	xhci->usb2_rhub.ports = NULL;
-	xhci->usb2_rhub.psi = NULL;
 	xhci->usb3_rhub.ports = NULL;
-	xhci->usb3_rhub.psi = NULL;
 	xhci->hw_ports = NULL;
 	xhci->rh_bw = NULL;
 	xhci->ext_caps = NULL;
@@ -2120,6 +2126,7 @@
 	u8 major_revision, minor_revision;
 	struct xhci_hub *rhub;
 	struct device *dev = xhci_to_hcd(xhci)->self.sysdev;
+	struct xhci_port_cap *port_cap;
 
 	temp = readl(addr);
 	major_revision = XHCI_EXT_PORT_MAJOR(temp);
@@ -2154,31 +2161,39 @@
 		/* WTF? "Valid values are ‘1’ to MaxPorts" */
 		return;
 
-	rhub->psi_count = XHCI_EXT_PORT_PSIC(temp);
-	if (rhub->psi_count) {
-		rhub->psi = kcalloc_node(rhub->psi_count, sizeof(*rhub->psi),
-				    GFP_KERNEL, dev_to_node(dev));
-		if (!rhub->psi)
-			rhub->psi_count = 0;
+	port_cap = &xhci->port_caps[xhci->num_port_caps++];
+	if (xhci->num_port_caps > max_caps)
+		return;
 
-		rhub->psi_uid_count++;
-		for (i = 0; i < rhub->psi_count; i++) {
-			rhub->psi[i] = readl(addr + 4 + i);
+	port_cap->maj_rev = major_revision;
+	port_cap->min_rev = minor_revision;
+	port_cap->psi_count = XHCI_EXT_PORT_PSIC(temp);
+
+	if (port_cap->psi_count) {
+		port_cap->psi = kcalloc_node(port_cap->psi_count,
+					     sizeof(*port_cap->psi),
+					     GFP_KERNEL, dev_to_node(dev));
+		if (!port_cap->psi)
+			port_cap->psi_count = 0;
+
+		port_cap->psi_uid_count++;
+		for (i = 0; i < port_cap->psi_count; i++) {
+			port_cap->psi[i] = readl(addr + 4 + i);
 
 			/* count unique ID values, two consecutive entries can
 			 * have the same ID if link is assymetric
 			 */
-			if (i && (XHCI_EXT_PORT_PSIV(rhub->psi[i]) !=
-				  XHCI_EXT_PORT_PSIV(rhub->psi[i - 1])))
-				rhub->psi_uid_count++;
+			if (i && (XHCI_EXT_PORT_PSIV(port_cap->psi[i]) !=
+				  XHCI_EXT_PORT_PSIV(port_cap->psi[i - 1])))
+				port_cap->psi_uid_count++;
 
 			xhci_dbg(xhci, "PSIV:%d PSIE:%d PLT:%d PFD:%d LP:%d PSIM:%d\n",
-				  XHCI_EXT_PORT_PSIV(rhub->psi[i]),
-				  XHCI_EXT_PORT_PSIE(rhub->psi[i]),
-				  XHCI_EXT_PORT_PLT(rhub->psi[i]),
-				  XHCI_EXT_PORT_PFD(rhub->psi[i]),
-				  XHCI_EXT_PORT_LP(rhub->psi[i]),
-				  XHCI_EXT_PORT_PSIM(rhub->psi[i]));
+				  XHCI_EXT_PORT_PSIV(port_cap->psi[i]),
+				  XHCI_EXT_PORT_PSIE(port_cap->psi[i]),
+				  XHCI_EXT_PORT_PLT(port_cap->psi[i]),
+				  XHCI_EXT_PORT_PFD(port_cap->psi[i]),
+				  XHCI_EXT_PORT_LP(port_cap->psi[i]),
+				  XHCI_EXT_PORT_PSIM(port_cap->psi[i]));
 		}
 	}
 	/* cache usb2 port capabilities */
@@ -2225,6 +2240,7 @@
 			continue;
 		}
 		hw_port->rhub = rhub;
+		hw_port->port_cap = port_cap;
 		rhub->num_ports++;
 	}
 	/* FIXME: Should we disable ports not in the Extended Capabilities? */
@@ -2315,6 +2331,11 @@
 	if (!xhci->ext_caps)
 		return -ENOMEM;
 
+	xhci->port_caps = kcalloc_node(cap_count, sizeof(*xhci->port_caps),
+				flags, dev_to_node(dev));
+	if (!xhci->port_caps)
+		return -ENOMEM;
+
 	offset = cap_start;
 
 	while (offset) {

diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
index 075c49c..58cf551 100644
--- a/drivers/usb/host/xhci-pci.c
+++ b/drivers/usb/host/xhci-pci.c

@@ -41,6 +41,7 @@
 #define PCI_DEVICE_ID_INTEL_BROXTON_B_XHCI		0x1aa8
 #define PCI_DEVICE_ID_INTEL_APL_XHCI			0x5aa8
 #define PCI_DEVICE_ID_INTEL_DNV_XHCI			0x19d0
+#define PCI_DEVICE_ID_INTEL_CML_XHCI			0xa3af
 
 #define PCI_DEVICE_ID_AMD_PROMONTORYA_4			0x43b9
 #define PCI_DEVICE_ID_AMD_PROMONTORYA_3			0x43ba
@@ -179,7 +180,8 @@
 		 pdev->device == PCI_DEVICE_ID_INTEL_BROXTON_M_XHCI ||
 		 pdev->device == PCI_DEVICE_ID_INTEL_BROXTON_B_XHCI ||
 		 pdev->device == PCI_DEVICE_ID_INTEL_APL_XHCI ||
-		 pdev->device == PCI_DEVICE_ID_INTEL_DNV_XHCI)) {
+		 pdev->device == PCI_DEVICE_ID_INTEL_DNV_XHCI ||
+		 pdev->device == PCI_DEVICE_ID_INTEL_CML_XHCI)) {
 		xhci->quirks |= XHCI_PME_STUCK_QUIRK;
 	}
 	if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
@@ -283,6 +285,9 @@
 	if (!usb_hcd_is_primary_hcd(hcd))
 		return 0;
 
+	if (xhci->quirks & XHCI_PME_STUCK_QUIRK)
+		xhci_pme_acpi_rtd3_enable(pdev);
+
 	xhci_dbg(xhci, "Got SBRN %u\n", (unsigned int) xhci->sbrn);
 
 	/* Find any debug ports */
@@ -340,9 +345,6 @@
 			HCC_MAX_PSA(xhci->hcc_params) >= 4)
 		xhci->shared_hcd->can_do_streams = 1;
 
-	if (xhci->quirks & XHCI_PME_STUCK_QUIRK)
-		xhci_pme_acpi_rtd3_enable(dev);
-
 	/* USB-2 and USB-3 roothubs initialized, allow runtime pm suspend */
 	pm_runtime_put_noidle(&dev->dev);
 

diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index 98b6760..509a7fc 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c

@@ -2693,6 +2693,42 @@
 }
 
 /*
+ * Update Event Ring Dequeue Pointer:
+ * - When all events have finished
+ * - To avoid "Event Ring Full Error" condition
+ */
+static void xhci_update_erst_dequeue(struct xhci_hcd *xhci,
+		union xhci_trb *event_ring_deq)
+{
+	u64 temp_64;
+	dma_addr_t deq;
+
+	temp_64 = xhci_read_64(xhci, &xhci->ir_set->erst_dequeue);
+	/* If necessary, update the HW's version of the event ring deq ptr. */
+	if (event_ring_deq != xhci->event_ring->dequeue) {
+		deq = xhci_trb_virt_to_dma(xhci->event_ring->deq_seg,
+				xhci->event_ring->dequeue);
+		if (deq == 0)
+			xhci_warn(xhci, "WARN something wrong with SW event ring dequeue ptr\n");
+		/*
+		 * Per 4.9.4, Software writes to the ERDP register shall
+		 * always advance the Event Ring Dequeue Pointer value.
+		 */
+		if ((temp_64 & (u64) ~ERST_PTR_MASK) ==
+				((u64) deq & (u64) ~ERST_PTR_MASK))
+			return;
+
+		/* Update HC event ring dequeue pointer */
+		temp_64 &= ERST_PTR_MASK;
+		temp_64 |= ((u64) deq & (u64) ~ERST_PTR_MASK);
+	}
+
+	/* Clear the event handler busy flag (RW1C) */
+	temp_64 |= ERST_EHB;
+	xhci_write_64(xhci, temp_64, &xhci->ir_set->erst_dequeue);
+}
+
+/*
  * xHCI spec says we can get an interrupt, and if the HC has an error condition,
  * we might get bad data out of the event ring.  Section 4.10.2.7 has a list of
  * indicators of an event TRB error, but we check the status *first* to be safe.
@@ -2703,9 +2739,9 @@
 	union xhci_trb *event_ring_deq;
 	irqreturn_t ret = IRQ_NONE;
 	unsigned long flags;
-	dma_addr_t deq;
 	u64 temp_64;
 	u32 status;
+	int event_loop = 0;
 
 	spin_lock_irqsave(&xhci->lock, flags);
 	/* Check if the xHC generated the interrupt, or the irq is shared */
@@ -2759,24 +2795,14 @@
 	/* FIXME this should be a delayed service routine
 	 * that clears the EHB.
 	 */
-	while (xhci_handle_event(xhci) > 0) {}
-
-	temp_64 = xhci_read_64(xhci, &xhci->ir_set->erst_dequeue);
-	/* If necessary, update the HW's version of the event ring deq ptr. */
-	if (event_ring_deq != xhci->event_ring->dequeue) {
-		deq = xhci_trb_virt_to_dma(xhci->event_ring->deq_seg,
-				xhci->event_ring->dequeue);
-		if (deq == 0)
-			xhci_warn(xhci, "WARN something wrong with SW event "
-					"ring dequeue ptr.\n");
-		/* Update HC event ring dequeue pointer */
-		temp_64 &= ERST_PTR_MASK;
-		temp_64 |= ((u64) deq & (u64) ~ERST_PTR_MASK);
+	while (xhci_handle_event(xhci) > 0) {
+		if (event_loop++ < TRBS_PER_SEGMENT / 2)
+			continue;
+		xhci_update_erst_dequeue(xhci, event_ring_deq);
+		event_loop = 0;
 	}
 
-	/* Clear the event handler busy flag (RW1C); event ring is empty. */
-	temp_64 |= ERST_EHB;
-	xhci_write_64(xhci, temp_64, &xhci->ir_set->erst_dequeue);
+	xhci_update_erst_dequeue(xhci, event_ring_deq);
 	ret = IRQ_HANDLED;
 
 out:

diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
index 9b33031..4dedc82 100644
--- a/drivers/usb/host/xhci.h
+++ b/drivers/usb/host/xhci.h

@@ -1704,11 +1704,21 @@
 	else
 		return 1;
 }
+
+struct xhci_port_cap {
+	u32			*psi;	/* array of protocol speed ID entries */
+	u8			psi_count;
+	u8			psi_uid_count;
+	u8			maj_rev;
+	u8			min_rev;
+};
+
 struct xhci_port {
 	__le32 __iomem		*addr;
 	int			hw_portnum;
 	int			hcd_portnum;
 	struct xhci_hub		*rhub;
+	struct xhci_port_cap	*port_cap;
 };
 
 struct xhci_hub {
@@ -1718,9 +1728,6 @@
 	/* supported prococol extended capabiliy values */
 	u8			maj_rev;
 	u8			min_rev;
-	u32			*psi;	/* array of protocol speed ID entries */
-	u8			psi_count;
-	u8			psi_uid_count;
 };
 
 /* There is one xhci_hcd structure per controller */
@@ -1882,6 +1889,9 @@
 	/* cached usb2 extened protocol capabilites */
 	u32                     *ext_caps;
 	unsigned int            num_ext_caps;
+	/* cached extended protocol port capabilities */
+	struct xhci_port_cap	*port_caps;
+	unsigned int		num_port_caps;
 	/* Compliance Mode Recovery Data */
 	struct timer_list	comp_mode_recovery_timer;
 	u32			port_status_u0;

diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c
index 2d9d949..92875a2 100644
--- a/drivers/usb/misc/iowarrior.c
+++ b/drivers/usb/misc/iowarrior.c

@@ -33,6 +33,14 @@
 #define USB_DEVICE_ID_CODEMERCS_IOWPV2	0x1512
 /* full speed iowarrior */
 #define USB_DEVICE_ID_CODEMERCS_IOW56	0x1503
+/* fuller speed iowarrior */
+#define USB_DEVICE_ID_CODEMERCS_IOW28	0x1504
+#define USB_DEVICE_ID_CODEMERCS_IOW28L	0x1505
+#define USB_DEVICE_ID_CODEMERCS_IOW100	0x1506
+
+/* OEMed devices */
+#define USB_DEVICE_ID_CODEMERCS_IOW24SAG	0x158a
+#define USB_DEVICE_ID_CODEMERCS_IOW56AM		0x158b
 
 /* Get a minor range for your devices from the usb maintainer */
 #ifdef CONFIG_USB_DYNAMIC_MINORS
@@ -137,6 +145,11 @@
 	{USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOWPV1)},
 	{USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOWPV2)},
 	{USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW56)},
+	{USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW24SAG)},
+	{USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW56AM)},
+	{USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW28)},
+	{USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW28L)},
+	{USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW100)},
 	{}			/* Terminating entry */
 };
 MODULE_DEVICE_TABLE(usb, iowarrior_ids);
@@ -361,6 +374,7 @@
 	}
 	switch (dev->product_id) {
 	case USB_DEVICE_ID_CODEMERCS_IOW24:
+	case USB_DEVICE_ID_CODEMERCS_IOW24SAG:
 	case USB_DEVICE_ID_CODEMERCS_IOWPV1:
 	case USB_DEVICE_ID_CODEMERCS_IOWPV2:
 	case USB_DEVICE_ID_CODEMERCS_IOW40:
@@ -375,6 +389,10 @@
 		goto exit;
 		break;
 	case USB_DEVICE_ID_CODEMERCS_IOW56:
+	case USB_DEVICE_ID_CODEMERCS_IOW56AM:
+	case USB_DEVICE_ID_CODEMERCS_IOW28:
+	case USB_DEVICE_ID_CODEMERCS_IOW28L:
+	case USB_DEVICE_ID_CODEMERCS_IOW100:
 		/* The IOW56 uses asynchronous IO and more urbs */
 		if (atomic_read(&dev->write_busy) == MAX_WRITES_IN_FLIGHT) {
 			/* Wait until we are below the limit for submitted urbs */
@@ -499,6 +517,7 @@
 	switch (cmd) {
 	case IOW_WRITE:
 		if (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW24 ||
+		    dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW24SAG ||
 		    dev->product_id == USB_DEVICE_ID_CODEMERCS_IOWPV1 ||
 		    dev->product_id == USB_DEVICE_ID_CODEMERCS_IOWPV2 ||
 		    dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW40) {
@@ -782,7 +801,11 @@
 		goto error;
 	}
 
-	if (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56) {
+	if ((dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56) ||
+	    (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56AM) ||
+	    (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW28) ||
+	    (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW28L) ||
+	    (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW100)) {
 		res = usb_find_last_int_out_endpoint(iface_desc,
 				&dev->int_out_endpoint);
 		if (res) {
@@ -795,7 +818,11 @@
 	/* we have to check the report_size often, so remember it in the endianness suitable for our machine */
 	dev->report_size = usb_endpoint_maxp(dev->int_in_endpoint);
 	if ((dev->interface->cur_altsetting->desc.bInterfaceNumber == 0) &&
-	    (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56))
+	    ((dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56) ||
+	     (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56AM) ||
+	     (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW28) ||
+	     (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW28L) ||
+	     (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW100)))
 		/* IOWarrior56 has wMaxPacketSize different from report size */
 		dev->report_size = 7;
 

diff --git a/drivers/usb/musb/omap2430.c b/drivers/usb/musb/omap2430.c
index b1dd81f..24e622c 100644
--- a/drivers/usb/musb/omap2430.c
+++ b/drivers/usb/musb/omap2430.c

@@ -361,8 +361,6 @@
 	.init		= omap2430_musb_init,
 	.exit		= omap2430_musb_exit,
 
-	.set_vbus	= omap2430_musb_set_vbus,
-
 	.enable		= omap2430_musb_enable,
 	.disable	= omap2430_musb_disable,
 

diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c
index 1c6eb3a..62ca8e2 100644
--- a/drivers/usb/storage/uas.c
+++ b/drivers/usb/storage/uas.c

@@ -45,6 +45,7 @@
 	struct scsi_cmnd *cmnd[MAX_CMNDS];
 	spinlock_t lock;
 	struct work_struct work;
+	struct work_struct scan_work;      /* for async scanning */
 };
 
 enum {
@@ -114,6 +115,17 @@
 	spin_unlock_irqrestore(&devinfo->lock, flags);
 }
 
+static void uas_scan_work(struct work_struct *work)
+{
+	struct uas_dev_info *devinfo =
+		container_of(work, struct uas_dev_info, scan_work);
+	struct Scsi_Host *shost = usb_get_intfdata(devinfo->intf);
+
+	dev_dbg(&devinfo->intf->dev, "starting scan\n");
+	scsi_scan_host(shost);
+	dev_dbg(&devinfo->intf->dev, "scan complete\n");
+}
+
 static void uas_add_work(struct uas_cmd_info *cmdinfo)
 {
 	struct scsi_pointer *scp = (void *)cmdinfo;
@@ -989,6 +1001,7 @@
 	init_usb_anchor(&devinfo->data_urbs);
 	spin_lock_init(&devinfo->lock);
 	INIT_WORK(&devinfo->work, uas_do_work);
+	INIT_WORK(&devinfo->scan_work, uas_scan_work);
 
 	result = uas_configure_endpoints(devinfo);
 	if (result)
@@ -1005,7 +1018,9 @@
 	if (result)
 		goto free_streams;
 
-	scsi_scan_host(shost);
+	/* Submit the delayed_work for SCSI-device scanning */
+	schedule_work(&devinfo->scan_work);
+
 	return result;
 
 free_streams:
@@ -1173,6 +1188,12 @@
 	usb_kill_anchored_urbs(&devinfo->data_urbs);
 	uas_zap_pending(devinfo, DID_NO_CONNECT);
 
+	/*
+	 * Prevent SCSI scanning (if it hasn't started yet)
+	 * or wait for the SCSI-scanning routine to stop.
+	 */
+	cancel_work_sync(&devinfo->scan_work);
+
 	scsi_remove_host(shost);
 	uas_free_streams(devinfo);
 	scsi_host_put(shost);

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 124356d..88c8c15 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c

@@ -1187,10 +1187,6 @@
 
 static struct socket *get_raw_socket(int fd)
 {
-	struct {
-		struct sockaddr_ll sa;
-		char  buf[MAX_ADDR_LEN];
-	} uaddr;
 	int r;
 	struct socket *sock = sockfd_lookup(fd, &r);
 
@@ -1203,11 +1199,7 @@
 		goto err;
 	}
 
-	r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa, 0);
-	if (r < 0)
-		goto err;
-
-	if (uaddr.sa.sll_family != AF_PACKET) {
+	if (sock->sk->sk_family != AF_PACKET) {
 		r = -EPFNOSUPPORT;
 		goto err;
 	}

diff --git a/drivers/video/fbdev/pxa168fb.c b/drivers/video/fbdev/pxa168fb.c
index d059d04..20195d3 100644
--- a/drivers/video/fbdev/pxa168fb.c
+++ b/drivers/video/fbdev/pxa168fb.c

@@ -769,8 +769,8 @@
 failed_free_clk:
 	clk_disable_unprepare(fbi->clk);
 failed_free_fbmem:
-	dma_free_coherent(fbi->dev, info->fix.smem_len,
-			info->screen_base, fbi->fb_start_dma);
+	dma_free_wc(fbi->dev, info->fix.smem_len,
+		    info->screen_base, fbi->fb_start_dma);
 failed_free_info:
 	kfree(info);
 
@@ -804,7 +804,7 @@
 
 	irq = platform_get_irq(pdev, 0);
 
-	dma_free_wc(fbi->dev, PAGE_ALIGN(info->fix.smem_len),
+	dma_free_wc(fbi->dev, info->fix.smem_len,
 		    info->screen_base, info->fix.smem_start);
 
 	clk_disable_unprepare(fbi->clk);

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 14ac36c..2df7b1c 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c

@@ -126,6 +126,8 @@
 {
 	unsigned int i;
 
+	BUILD_BUG_ON(VIRTIO_BALLOON_PAGES_PER_PAGE > VIRTIO_BALLOON_ARRAY_PFNS_MAX);
+
 	/*
 	 * Set balloon pfns pointing at this page.
 	 * Note that the first pfn points at start of the page.
@@ -264,7 +266,10 @@
 				pages_to_bytes(events[PSWPIN]));
 	update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_OUT,
 				pages_to_bytes(events[PSWPOUT]));
-	update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]);
+	update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT,
+		    events[PGMAJFAULT_S] +
+		    events[PGMAJFAULT_A] +
+		    events[PGMAJFAULT_F]);
 	update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]);
 #ifdef CONFIG_HUGETLB_PAGE
 	update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC,

diff --git a/drivers/visorbus/visorchipset.c b/drivers/visorbus/visorchipset.c
index ca752b8..cb1eb7e 100644
--- a/drivers/visorbus/visorchipset.c
+++ b/drivers/visorbus/visorchipset.c

@@ -1210,14 +1210,17 @@
 {
 	struct controlvm_message local_crash_bus_msg;
 	struct controlvm_message local_crash_dev_msg;
-	struct controlvm_message msg;
+	struct controlvm_message msg = {
+		.hdr.id = CONTROLVM_CHIPSET_INIT,
+		.cmd.init_chipset = {
+			.bus_count = 23,
+			.switch_count = 0,
+		},
+	};
 	u32 local_crash_msg_offset;
 	u16 local_crash_msg_count;
 
 	/* send init chipset msg */
-	msg.hdr.id = CONTROLVM_CHIPSET_INIT;
-	msg.cmd.init_chipset.bus_count = 23;
-	msg.cmd.init_chipset.switch_count = 0;
 	chipset_init(&msg);
 	/* get saved message count */
 	if (visorchannel_read(chipset_dev->controlvm_channel,

diff --git a/drivers/vme/bridges/vme_fake.c b/drivers/vme/bridges/vme_fake.c
index 7d83691..685a43bd 100644
--- a/drivers/vme/bridges/vme_fake.c
+++ b/drivers/vme/bridges/vme_fake.c

@@ -418,8 +418,9 @@
 	}
 }
 
-static u8 fake_vmeread8(struct fake_driver *bridge, unsigned long long addr,
-		u32 aspace, u32 cycle)
+static noinline_for_stack u8 fake_vmeread8(struct fake_driver *bridge,
+					   unsigned long long addr,
+					   u32 aspace, u32 cycle)
 {
 	u8 retval = 0xff;
 	int i;
@@ -450,8 +451,9 @@
 	return retval;
 }
 
-static u16 fake_vmeread16(struct fake_driver *bridge, unsigned long long addr,
-		u32 aspace, u32 cycle)
+static noinline_for_stack u16 fake_vmeread16(struct fake_driver *bridge,
+					     unsigned long long addr,
+					     u32 aspace, u32 cycle)
 {
 	u16 retval = 0xffff;
 	int i;
@@ -482,8 +484,9 @@
 	return retval;
 }
 
-static u32 fake_vmeread32(struct fake_driver *bridge, unsigned long long addr,
-		u32 aspace, u32 cycle)
+static noinline_for_stack u32 fake_vmeread32(struct fake_driver *bridge,
+					     unsigned long long addr,
+					     u32 aspace, u32 cycle)
 {
 	u32 retval = 0xffffffff;
 	int i;
@@ -613,8 +616,9 @@
 	return retval;
 }
 
-static void fake_vmewrite8(struct fake_driver *bridge, u8 *buf,
-			   unsigned long long addr, u32 aspace, u32 cycle)
+static noinline_for_stack void fake_vmewrite8(struct fake_driver *bridge,
+					      u8 *buf, unsigned long long addr,
+					      u32 aspace, u32 cycle)
 {
 	int i;
 	unsigned long long start, end, offset;
@@ -643,8 +647,9 @@
 
 }
 
-static void fake_vmewrite16(struct fake_driver *bridge, u16 *buf,
-			    unsigned long long addr, u32 aspace, u32 cycle)
+static noinline_for_stack void fake_vmewrite16(struct fake_driver *bridge,
+					       u16 *buf, unsigned long long addr,
+					       u32 aspace, u32 cycle)
 {
 	int i;
 	unsigned long long start, end, offset;
@@ -673,8 +678,9 @@
 
 }
 
-static void fake_vmewrite32(struct fake_driver *bridge, u32 *buf,
-			    unsigned long long addr, u32 aspace, u32 cycle)
+static noinline_for_stack void fake_vmewrite32(struct fake_driver *bridge,
+					       u32 *buf, unsigned long long addr,
+					       u32 aspace, u32 cycle)
 {
 	int i;
 	unsigned long long start, end, offset;

diff --git a/drivers/watchdog/wdat_wdt.c b/drivers/watchdog/wdat_wdt.c
index 56ad196..9d91ed5 100644
--- a/drivers/watchdog/wdat_wdt.c
+++ b/drivers/watchdog/wdat_wdt.c

@@ -392,7 +392,7 @@
 
 		memset(&r, 0, sizeof(r));
 		r.start = gas->address;
-		r.end = r.start + gas->access_width - 1;
+		r.end = r.start + ACPI_ACCESS_BYTE_WIDTH(gas->access_width) - 1;
 		if (gas->space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) {
 			r.flags = IORESOURCE_MEM;
 		} else if (gas->space_id == ACPI_ADR_SPACE_SYSTEM_IO) {

diff --git a/drivers/xen/preempt.c b/drivers/xen/preempt.c
index 08cb419..5f6b77e 100644
--- a/drivers/xen/preempt.c
+++ b/drivers/xen/preempt.c

@@ -37,7 +37,9 @@
 		 * cpu.
 		 */
 		__this_cpu_write(xen_in_preemptible_hcall, false);
-		_cond_resched();
+		local_irq_enable();
+		cond_resched();
+		local_irq_disable();
 		__this_cpu_write(xen_in_preemptible_hcall, true);
 	}
 }

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 833cf3c..3b77c8a 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c

@@ -629,7 +629,6 @@
 static int btrfsic_process_superblock(struct btrfsic_state *state,
 				      struct btrfs_fs_devices *fs_devices)
 {
-	struct btrfs_fs_info *fs_info = state->fs_info;
 	struct btrfs_super_block *selected_super;
 	struct list_head *dev_head = &fs_devices->devices;
 	struct btrfs_device *device;
@@ -700,7 +699,7 @@
 			break;
 		}
 
-		num_copies = btrfs_num_copies(fs_info, next_bytenr,
+		num_copies = btrfs_num_copies(state->fs_info, next_bytenr,
 					      state->metablock_size);
 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 			pr_info("num_copies(log_bytenr=%llu) = %d\n",

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9e467e8..b5039b1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c

@@ -3117,6 +3117,7 @@
 	/* do not make disk changes in broken FS or nologreplay is given */
 	if (btrfs_super_log_root(disk_super) != 0 &&
 	    !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
+		btrfs_info(fs_info, "start tree-log replay");
 		ret = btrfs_replay_log(fs_info, fs_devices);
 		if (ret) {
 			err = ret;
@@ -3152,6 +3153,7 @@
 	if (IS_ERR(fs_info->fs_root)) {
 		err = PTR_ERR(fs_info->fs_root);
 		btrfs_warn(fs_info, "failed to read fs tree: %d", err);
+		fs_info->fs_root = NULL;
 		goto fail_qgroup;
 	}
 
@@ -4467,7 +4469,6 @@
 	wake_up(&fs_info->transaction_wait);
 
 	btrfs_destroy_delayed_inodes(fs_info);
-	btrfs_assert_delayed_root_empty(fs_info);
 
 	btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
 				     EXTENT_DIRTY);

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6648d55..813425d 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c

@@ -228,6 +228,17 @@
 	struct extent_map *merge = NULL;
 	struct rb_node *rb;
 
+	/*
+	 * We can't modify an extent map that is in the tree and that is being
+	 * used by another task, as it can cause that other task to see it in
+	 * inconsistent state during the merging. We always have 1 reference for
+	 * the tree and 1 for this task (which is unpinning the extent map or
+	 * clearing the logging flag), so anything > 2 means it's being used by
+	 * other tasks too.
+	 */
+	if (refcount_read(&em->refs) > 2)
+		return;
+
 	if (em->start != 0) {
 		rb = rb_prev(&em->rb_node);
 		if (rb)

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 4cf2817..f9e280d 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c

@@ -275,7 +275,8 @@
 		csum += count * csum_size;
 		nblocks -= count;
 next:
-		while (count--) {
+		while (count > 0) {
+			count--;
 			disk_bytenr += fs_info->sectorsize;
 			offset += fs_info->sectorsize;
 			page_bytes_left -= fs_info->sectorsize;

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4ea9dd9..dec508a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c

@@ -10348,6 +10348,7 @@
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
 	u64 cur_offset = start;
+	u64 clear_offset = start;
 	u64 i_size;
 	u64 cur_bytes;
 	u64 last_alloc = (u64)-1;
@@ -10382,6 +10383,15 @@
 				btrfs_end_transaction(trans);
 			break;
 		}
+
+		/*
+		 * We've reserved this space, and thus converted it from
+		 * ->bytes_may_use to ->bytes_reserved.  Any error that happens
+		 * from here on out we will only need to clear our reservation
+		 * for the remaining unreserved area, so advance our
+		 * clear_offset by our extent size.
+		 */
+		clear_offset += ins.offset;
 		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 
 		last_alloc = ins.offset;
@@ -10462,9 +10472,9 @@
 		if (own_trans)
 			btrfs_end_transaction(trans);
 	}
-	if (cur_offset < end)
-		btrfs_free_reserved_data_space(inode, NULL, cur_offset,
-			end - cur_offset + 1);
+	if (clear_offset < end)
+		btrfs_free_reserved_data_space(inode, NULL, clear_offset,
+			end - clear_offset + 1);
 	return ret;
 }
 

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 0c4ef20..0f6d53e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c

@@ -712,10 +712,15 @@
 		}
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		end = ordered->file_offset;
+		/*
+		 * If the ordered extent had an error save the error but don't
+		 * exit without waiting first for all other ordered extents in
+		 * the range to complete.
+		 */
 		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
 			ret = -EIO;
 		btrfs_put_ordered_extent(ordered);
-		if (ret || end == 0 || end == start)
+		if (end == 0 || end == start)
 			break;
 		end--;
 	}

diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index cd2a586..dbc685c 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c

@@ -747,6 +747,7 @@
 		 */
 		be = add_block_entry(root->fs_info, bytenr, num_bytes, ref_root);
 		if (IS_ERR(be)) {
+			kfree(ref);
 			kfree(ra);
 			ret = PTR_ERR(be);
 			goto out;
@@ -760,6 +761,8 @@
 			"re-allocated a block that still has references to it!");
 			dump_block_entry(fs_info, be);
 			dump_ref_action(fs_info, ra);
+			kfree(ref);
+			kfree(ra);
 			goto out_unlock;
 		}
 
@@ -822,6 +825,7 @@
 "dropping a ref for a existing root that doesn't have a ref on the block");
 				dump_block_entry(fs_info, be);
 				dump_ref_action(fs_info, ra);
+				kfree(ref);
 				kfree(ra);
 				goto out_unlock;
 			}
@@ -837,6 +841,7 @@
 "attempting to add another ref for an existing ref on a tree block");
 			dump_block_entry(fs_info, be);
 			dump_ref_action(fs_info, ra);
+			kfree(ref);
 			kfree(ra);
 			goto out_unlock;
 		}

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index db4002e..6a5b16a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c

@@ -1857,6 +1857,8 @@
 		}
 
 		if (btrfs_super_log_root(fs_info->super_copy) != 0) {
+			btrfs_warn(fs_info,
+		"mount required to replay tree-log, cannot remount read-write");
 			ret = -EINVAL;
 			goto restore;
 		}

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5bbcdcf..9c3b394 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c

@@ -7260,6 +7260,8 @@
 			else
 				btrfs_dev_stat_reset(dev, i);
 		}
+		btrfs_info(fs_info, "device stats zeroed by %s (%d)",
+			   current->comm, task_pid_nr(current));
 	} else {
 		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
 			if (stats->nr_items > i)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 09db6d0..a2e9032 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c

@@ -2343,8 +2343,7 @@
 		if (!(mdsc->fsc->mount_options->flags &
 		      CEPH_MOUNT_OPT_MOUNTWAIT) &&
 		    !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) {
-			err = -ENOENT;
-			pr_info("probably no mds server is up\n");
+			err = -EHOSTUNREACH;
 			goto finish;
 		}
 	}

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 2bd0b1e..c4314f4 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c

@@ -1106,6 +1106,11 @@
 	return res;
 
 out_splat:
+	if (!ceph_mdsmap_is_cluster_available(fsc->mdsc->mdsmap)) {
+		pr_info("No mds server is up or the cluster is laggy\n");
+		err = -EHOSTUNREACH;
+	}
+
 	ceph_mdsc_close_sessions(fsc->mdsc);
 	deactivate_locked_super(sb);
 	goto out_final;

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 1d377b7..130bdca 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c

@@ -603,7 +603,7 @@
 			((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
 		*pmode |= (S_IXUGO & (*pbits_to_set));
 
-	cifs_dbg(NOISY, "access flags 0x%x mode now 0x%x\n", flags, *pmode);
+	cifs_dbg(NOISY, "access flags 0x%x mode now %04o\n", flags, *pmode);
 	return;
 }
 
@@ -632,7 +632,7 @@
 	if (mode & S_IXUGO)
 		*pace_flags |= SET_FILE_EXEC_RIGHTS;
 
-	cifs_dbg(NOISY, "mode: 0x%x, access flags now 0x%x\n",
+	cifs_dbg(NOISY, "mode: %04o, access flags now 0x%x\n",
 		 mode, *pace_flags);
 	return;
 }

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 576cf71..975f800 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c

@@ -3342,8 +3342,10 @@
 {
 	struct cifs_sb_info *old = CIFS_SB(sb);
 	struct cifs_sb_info *new = mnt_data->cifs_sb;
-	bool old_set = old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH;
-	bool new_set = new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH;
+	bool old_set = (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) &&
+		old->prepath;
+	bool new_set = (new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) &&
+		new->prepath;
 
 	if (old_set && new_set && !strcmp(new->prepath, old->prepath))
 		return 1;
@@ -3792,7 +3794,7 @@
 	cifs_sb->mnt_gid = pvolume_info->linux_gid;
 	cifs_sb->mnt_file_mode = pvolume_info->file_mode;
 	cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
-	cifs_dbg(FYI, "file mode: 0x%hx  dir mode: 0x%hx\n",
+	cifs_dbg(FYI, "file mode: %04ho  dir mode: %04ho\n",
 		 cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
 
 	cifs_sb->actimeo = pvolume_info->actimeo;

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 26154db..fbebf24 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c

@@ -1579,7 +1579,7 @@
 	struct TCP_Server_Info *server;
 	char *full_path;
 
-	cifs_dbg(FYI, "In cifs_mkdir, mode = 0x%hx inode = 0x%p\n",
+	cifs_dbg(FYI, "In cifs_mkdir, mode = %04ho inode = 0x%p\n",
 		 mode, inode);
 
 	cifs_sb = CIFS_SB(inode->i_sb);

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 0d4e4d9..e2d2b74 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c

@@ -3425,6 +3425,9 @@
 				     wdata->cfile->fid.persistent_fid,
 				     tcon->tid, tcon->ses->Suid, wdata->offset,
 				     wdata->bytes, wdata->result);
+		if (wdata->result == -ENOSPC)
+			printk_once(KERN_WARNING "Out of space writing to %s\n",
+				    tcon->treeName);
 	} else
 		trace_smb3_write_done(0 /* no xid */,
 				      wdata->cfile->fid.persistent_fid,

diff --git a/fs/dax.c b/fs/dax.c
index f0d932f..d09701a 100644
--- a/fs/dax.c
+++ b/fs/dax.c

@@ -1301,6 +1301,9 @@
 		lockdep_assert_held(&inode->i_rwsem);
 	}
 
+	if (iocb->ki_flags & IOCB_NOWAIT)
+		flags |= IOMAP_NOWAIT;
+
 	while (iov_iter_count(iter)) {
 		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
 				iter, dax_iomap_actor);

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 708f931..8e5353bd 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c

@@ -325,8 +325,10 @@
 	struct extent_crypt_result ecr;
 	int rc = 0;
 
-	BUG_ON(!crypt_stat || !crypt_stat->tfm
-	       || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
+	if (!crypt_stat || !crypt_stat->tfm
+	       || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED))
+		return -EINVAL;
+
 	if (unlikely(ecryptfs_verbosity > 0)) {
 		ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
 				crypt_stat->key_size);

diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index e74fe84..250cb23 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c

@@ -1318,7 +1318,7 @@
 		printk(KERN_WARNING "Tag 1 packet contains key larger "
 		       "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n");
 		rc = -EINVAL;
-		goto out;
+		goto out_free;
 	}
 	memcpy((*new_auth_tok)->session_key.encrypted_key,
 	       &data[(*packet_size)], (body_size - (ECRYPTFS_SIG_SIZE + 2)));

diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 9fdd5bc..aa3ddb48 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c

@@ -392,6 +392,7 @@
 					* ecryptfs_message_buf_len),
 				       GFP_KERNEL);
 	if (!ecryptfs_msg_ctx_arr) {
+		kfree(ecryptfs_daemon_hash);
 		rc = -ENOMEM;
 		goto out;
 	}

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 58f48ea..8b4ded9 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c

@@ -34,6 +34,7 @@
 #include <linux/mutex.h>
 #include <linux/anon_inodes.h>
 #include <linux/device.h>
+#include <linux/freezer.h>
 #include <linux/uaccess.h>
 #include <asm/io.h>
 #include <asm/mman.h>
@@ -1816,7 +1817,8 @@
 			}
 
 			spin_unlock_irq(&ep->wq.lock);
-			if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+			if (!freezable_schedule_hrtimeout_range(to, slack,
+								HRTIMER_MODE_ABS))
 				timed_out = 1;
 
 			spin_lock_irq(&ep->wq.lock);

diff --git a/fs/exec.c b/fs/exec.c
index 561ea64..3edcdaa 100644
--- a/fs/exec.c
+++ b/fs/exec.c

@@ -67,6 +67,7 @@
 #include <asm/mmu_context.h>
 #include <asm/tlb.h>
 
+#include <trace/events/fs.h>
 #include <trace/events/task.h>
 #include "internal.h"
 
@@ -865,9 +866,12 @@
 	if (err)
 		goto exit;
 
-	if (name->name[0] != '\0')
+	if (name->name[0] != '\0') {
 		fsnotify_open(file);
 
+		trace_open_exec(name->name);
+	}
+
 out:
 	return file;
 

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e5d6ee6..f9645de 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c

@@ -270,6 +270,7 @@
 	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	struct ext4_group_desc *desc;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct buffer_head *bh_p;
 
 	if (block_group >= ngroups) {
 		ext4_error(sb, "block_group >= groups_count - block_group = %u,"
@@ -280,7 +281,14 @@
 
 	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
 	offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
-	if (!sbi->s_group_desc[group_desc]) {
+	bh_p = sbi_array_rcu_deref(sbi, s_group_desc, group_desc);
+	/*
+	 * sbi_array_rcu_deref returns with rcu unlocked, this is ok since
+	 * the pointer being dereferenced won't be dereferenced again. By
+	 * looking at the usage in add_new_gdb() the value isn't modified,
+	 * just the pointer, and so it remains valid.
+	 */
+	if (!bh_p) {
 		ext4_error(sb, "Group descriptor not loaded - "
 			   "block_group = %u, group_desc = %u, desc = %u",
 			   block_group, group_desc, offset);
@@ -288,10 +296,10 @@
 	}
 
 	desc = (struct ext4_group_desc *)(
-		(__u8 *)sbi->s_group_desc[group_desc]->b_data +
+		(__u8 *)bh_p->b_data +
 		offset * EXT4_DESC_SIZE(sb));
 	if (bh)
-		*bh = sbi->s_group_desc[group_desc];
+		*bh = bh_p;
 	return desc;
 }
 

diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 7edc817..d203cc9 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c

@@ -203,6 +203,7 @@
 		return PTR_ERR(inode);
 	num = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
 	while (i < num) {
+		cond_resched();
 		map.m_lblk = i;
 		map.m_len = num - i;
 		n = ext4_map_blocks(NULL, inode, &map, 0);

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index d947c5e..ae520a7 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c

@@ -126,12 +126,14 @@
 		if (err != ERR_BAD_DX_DIR) {
 			return err;
 		}
-		/*
-		 * We don't set the inode dirty flag since it's not
-		 * critical that it get flushed back to the disk.
-		 */
-		ext4_clear_inode_flag(file_inode(file),
-				      EXT4_INODE_INDEX);
+		/* Can we just clear INDEX flag to ignore htree information? */
+		if (!ext4_has_metadata_csum(sb)) {
+			/*
+			 * We don't set the inode dirty flag since it's not
+			 * critical that it gets flushed back to the disk.
+			 */
+			ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+		}
 	}
 
 	if (ext4_has_inline_data(inode)) {

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f8456a4..0a4461a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h

@@ -1372,7 +1372,7 @@
 	loff_t s_bitmap_maxbytes;	/* max bytes for bitmap files */
 	struct buffer_head * s_sbh;	/* Buffer containing the super block */
 	struct ext4_super_block *s_es;	/* Pointer to the super block in the buffer */
-	struct buffer_head **s_group_desc;
+	struct buffer_head * __rcu *s_group_desc;
 	unsigned int s_mount_opt;
 	unsigned int s_mount_opt2;
 	unsigned int s_mount_flags;
@@ -1430,7 +1430,7 @@
 #endif
 
 	/* for buddy allocator */
-	struct ext4_group_info ***s_group_info;
+	struct ext4_group_info ** __rcu *s_group_info;
 	struct inode *s_buddy_cache;
 	spinlock_t s_md_lock;
 	unsigned short *s_mb_offsets;
@@ -1480,7 +1480,7 @@
 	unsigned int s_extent_max_zeroout_kb;
 
 	unsigned int s_log_groups_per_flex;
-	struct flex_groups *s_flex_groups;
+	struct flex_groups * __rcu *s_flex_groups;
 	ext4_group_t s_flex_groups_allocated;
 
 	/* workqueue for reserved extent conversions (buffered io) */
@@ -1520,8 +1520,11 @@
 	struct ratelimit_state s_warning_ratelimit_state;
 	struct ratelimit_state s_msg_ratelimit_state;
 
-	/* Barrier between changing inodes' journal flags and writepages ops. */
-	struct percpu_rw_semaphore s_journal_flag_rwsem;
+	/*
+	 * Barrier between writepages ops and changing any inode's JOURNAL_DATA
+	 * or EXTENTS flag.
+	 */
+	struct percpu_rw_semaphore s_writepages_rwsem;
 	struct dax_device *s_daxdev;
 };
 
@@ -1542,6 +1545,23 @@
 }
 
 /*
+ * Returns: sbi->field[index]
+ * Used to access an array element from the following sbi fields which require
+ * rcu protection to avoid dereferencing an invalid pointer due to reassignment
+ * - s_group_desc
+ * - s_group_info
+ * - s_flex_group
+ */
+#define sbi_array_rcu_deref(sbi, field, index)				   \
+({									   \
+	typeof(*((sbi)->field)) _v;					   \
+	rcu_read_lock();						   \
+	_v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index];	   \
+	rcu_read_unlock();						   \
+	_v;								   \
+})
+
+/*
  * Inode dynamic state flags
  */
 enum {
@@ -2375,8 +2395,11 @@
 			struct ext4_filename *fname);
 static inline void ext4_update_dx_flag(struct inode *inode)
 {
-	if (!ext4_has_feature_dir_index(inode->i_sb))
+	if (!ext4_has_feature_dir_index(inode->i_sb)) {
+		/* ext4_iget() should have caught this... */
+		WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb));
 		ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+	}
 }
 static const unsigned char ext4_filetype_table[] = {
 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
@@ -2561,6 +2584,7 @@
 extern bool ext4_empty_dir(struct inode *inode);
 
 /* resize.c */
+extern void ext4_kvfree_array_rcu(void *to_free);
 extern int ext4_group_add(struct super_block *sb,
 				struct ext4_new_group_data *input);
 extern int ext4_group_extend(struct super_block *sb,
@@ -2808,13 +2832,13 @@
 struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
 					    ext4_group_t group)
 {
-	 struct ext4_group_info ***grp_info;
+	 struct ext4_group_info **grp_info;
 	 long indexv, indexh;
 	 BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
-	 grp_info = EXT4_SB(sb)->s_group_info;
 	 indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
 	 indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
-	 return grp_info[indexv][indexh];
+	 grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv);
+	 return grp_info[indexh];
 }
 
 /*
@@ -2864,7 +2888,7 @@
 		     !inode_is_locked(inode));
 	down_write(&EXT4_I(inode)->i_data_sem);
 	if (newsize > EXT4_I(inode)->i_disksize)
-		EXT4_I(inode)->i_disksize = newsize;
+		WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize);
 	up_write(&EXT4_I(inode)->i_data_sem);
 }
 

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f4a24a4..adcd424 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c

@@ -40,9 +40,10 @@
 	struct inode *inode = file_inode(iocb->ki_filp);
 	ssize_t ret;
 
-	if (!inode_trylock_shared(inode)) {
-		if (iocb->ki_flags & IOCB_NOWAIT)
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (!inode_trylock_shared(inode))
 			return -EAGAIN;
+	} else {
 		inode_lock_shared(inode);
 	}
 	/*
@@ -190,9 +191,10 @@
 	struct inode *inode = file_inode(iocb->ki_filp);
 	ssize_t ret;
 
-	if (!inode_trylock(inode)) {
-		if (iocb->ki_flags & IOCB_NOWAIT)
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (!inode_trylock(inode))
 			return -EAGAIN;
+	} else {
 		inode_lock(inode);
 	}
 	ret = ext4_write_checks(iocb, from);
@@ -371,15 +373,17 @@
 static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_mapping->host;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct dax_device *dax_dev = sbi->s_daxdev;
 
-	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+	if (unlikely(ext4_forced_shutdown(sbi)))
 		return -EIO;
 
 	/*
-	 * We don't support synchronous mappings for non-DAX files. At least
-	 * until someone comes with a sensible use case.
+	 * We don't support synchronous mappings for non-DAX files and
+	 * for DAX files if underneath dax_device is not synchronous.
 	 */
-	if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
+	if (!daxdev_mapping_supported(vma, dax_dev))
 		return -EOPNOTSUPP;
 
 	file_accessed(file);

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 091a18a..dafa7e4 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c

@@ -330,11 +330,13 @@
 
 	percpu_counter_inc(&sbi->s_freeinodes_counter);
 	if (sbi->s_log_groups_per_flex) {
-		ext4_group_t f = ext4_flex_group(sbi, block_group);
+		struct flex_groups *fg;
 
-		atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+		fg = sbi_array_rcu_deref(sbi, s_flex_groups,
+					 ext4_flex_group(sbi, block_group));
+		atomic_inc(&fg->free_inodes);
 		if (is_directory)
-			atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+			atomic_dec(&fg->used_dirs);
 	}
 	BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
 	fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
@@ -370,12 +372,13 @@
 			    int flex_size, struct orlov_stats *stats)
 {
 	struct ext4_group_desc *desc;
-	struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
 
 	if (flex_size > 1) {
-		stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
-		stats->free_clusters = atomic64_read(&flex_group[g].free_clusters);
-		stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
+		struct flex_groups *fg = sbi_array_rcu_deref(EXT4_SB(sb),
+							     s_flex_groups, g);
+		stats->free_inodes = atomic_read(&fg->free_inodes);
+		stats->free_clusters = atomic64_read(&fg->free_clusters);
+		stats->used_dirs = atomic_read(&fg->used_dirs);
 		return;
 	}
 
@@ -1056,7 +1059,8 @@
 		if (sbi->s_log_groups_per_flex) {
 			ext4_group_t f = ext4_flex_group(sbi, group);
 
-			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
+			atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups,
+							f)->used_dirs);
 		}
 	}
 	if (ext4_has_group_desc_csum(sb)) {
@@ -1079,7 +1083,8 @@
 
 	if (sbi->s_log_groups_per_flex) {
 		flex_group = ext4_flex_group(sbi, group);
-		atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
+		atomic_dec(&sbi_array_rcu_deref(sbi, s_flex_groups,
+						flex_group)->free_inodes);
 	}
 
 	inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 950e3dc..23b4b17 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c

@@ -2569,7 +2569,7 @@
 	 * truncate are avoided by checking i_size under i_data_sem.
 	 */
 	disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
-	if (disksize > EXT4_I(inode)->i_disksize) {
+	if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
 		int err2;
 		loff_t i_size;
 
@@ -2730,7 +2730,7 @@
 	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 		return -EIO;
 
-	percpu_down_read(&sbi->s_journal_flag_rwsem);
+	percpu_down_read(&sbi->s_writepages_rwsem);
 	trace_ext4_writepages(inode, wbc);
 
 	/*
@@ -2950,7 +2950,7 @@
 out_writepages:
 	trace_ext4_writepages_result(inode, wbc, ret,
 				     nr_to_write - wbc->nr_to_write);
-	percpu_up_read(&sbi->s_journal_flag_rwsem);
+	percpu_up_read(&sbi->s_writepages_rwsem);
 	return ret;
 }
 
@@ -2965,13 +2965,13 @@
 	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 		return -EIO;
 
-	percpu_down_read(&sbi->s_journal_flag_rwsem);
+	percpu_down_read(&sbi->s_writepages_rwsem);
 	trace_ext4_writepages(inode, wbc);
 
 	ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc);
 	trace_ext4_writepages_result(inode, wbc, ret,
 				     nr_to_write - wbc->nr_to_write);
-	percpu_up_read(&sbi->s_journal_flag_rwsem);
+	percpu_up_read(&sbi->s_writepages_rwsem);
 	return ret;
 }
 
@@ -4975,6 +4975,18 @@
 		ret = -EFSCORRUPTED;
 		goto bad_inode;
 	}
+	/*
+	 * If dir_index is not enabled but there's dir with INDEX flag set,
+	 * we'd normally treat htree data as empty space. But with metadata
+	 * checksumming that corrupts checksums so forbid that.
+	 */
+	if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) &&
+	    ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
+		ext4_error_inode(inode, function, line, 0,
+			 "iget: Dir with htree data on filesystem without dir_index feature.");
+		ret = -EFSCORRUPTED;
+		goto bad_inode;
+	}
 	ei->i_disksize = inode->i_size;
 #ifdef CONFIG_QUOTA
 	ei->i_reserved_quota = 0;
@@ -6195,7 +6207,7 @@
 		}
 	}
 
-	percpu_down_write(&sbi->s_journal_flag_rwsem);
+	percpu_down_write(&sbi->s_writepages_rwsem);
 	jbd2_journal_lock_updates(journal);
 
 	/*
@@ -6212,7 +6224,7 @@
 		err = jbd2_journal_flush(journal);
 		if (err < 0) {
 			jbd2_journal_unlock_updates(journal);
-			percpu_up_write(&sbi->s_journal_flag_rwsem);
+			percpu_up_write(&sbi->s_writepages_rwsem);
 			return err;
 		}
 		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
@@ -6220,7 +6232,7 @@
 	ext4_set_aops(inode);
 
 	jbd2_journal_unlock_updates(journal);
-	percpu_up_write(&sbi->s_journal_flag_rwsem);
+	percpu_up_write(&sbi->s_writepages_rwsem);
 
 	if (val)
 		up_write(&EXT4_I(inode)->i_mmap_sem);

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cc229f3..71121fc 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c

@@ -2356,7 +2356,7 @@
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	unsigned size;
-	struct ext4_group_info ***new_groupinfo;
+	struct ext4_group_info ***old_groupinfo, ***new_groupinfo;
 
 	size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
 		EXT4_DESC_PER_BLOCK_BITS(sb);
@@ -2369,13 +2369,16 @@
 		ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
 		return -ENOMEM;
 	}
-	if (sbi->s_group_info) {
-		memcpy(new_groupinfo, sbi->s_group_info,
+	rcu_read_lock();
+	old_groupinfo = rcu_dereference(sbi->s_group_info);
+	if (old_groupinfo)
+		memcpy(new_groupinfo, old_groupinfo,
 		       sbi->s_group_info_size * sizeof(*sbi->s_group_info));
-		kvfree(sbi->s_group_info);
-	}
-	sbi->s_group_info = new_groupinfo;
+	rcu_read_unlock();
+	rcu_assign_pointer(sbi->s_group_info, new_groupinfo);
 	sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
+	if (old_groupinfo)
+		ext4_kvfree_array_rcu(old_groupinfo);
 	ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", 
 		   sbi->s_group_info_size);
 	return 0;
@@ -2387,6 +2390,7 @@
 {
 	int i;
 	int metalen = 0;
+	int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_group_info **meta_group_info;
 	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
@@ -2405,12 +2409,12 @@
 				 "for a buddy group");
 			goto exit_meta_group_info;
 		}
-		sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
-			meta_group_info;
+		rcu_read_lock();
+		rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
+		rcu_read_unlock();
 	}
 
-	meta_group_info =
-		sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
+	meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx);
 	i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
 
 	meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
@@ -2458,8 +2462,13 @@
 exit_group_info:
 	/* If a meta_group_info table has been allocated, release it now */
 	if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
-		kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
-		sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
+		struct ext4_group_info ***group_info;
+
+		rcu_read_lock();
+		group_info = rcu_dereference(sbi->s_group_info);
+		kfree(group_info[idx]);
+		group_info[idx] = NULL;
+		rcu_read_unlock();
 	}
 exit_meta_group_info:
 	return -ENOMEM;
@@ -2472,6 +2481,7 @@
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	int err;
 	struct ext4_group_desc *desc;
+	struct ext4_group_info ***group_info;
 	struct kmem_cache *cachep;
 
 	err = ext4_mb_alloc_groupinfo(sb, ngroups);
@@ -2506,11 +2516,16 @@
 	while (i-- > 0)
 		kmem_cache_free(cachep, ext4_get_group_info(sb, i));
 	i = sbi->s_group_info_size;
+	rcu_read_lock();
+	group_info = rcu_dereference(sbi->s_group_info);
 	while (i-- > 0)
-		kfree(sbi->s_group_info[i]);
+		kfree(group_info[i]);
+	rcu_read_unlock();
 	iput(sbi->s_buddy_cache);
 err_freesgi:
-	kvfree(sbi->s_group_info);
+	rcu_read_lock();
+	kvfree(rcu_dereference(sbi->s_group_info));
+	rcu_read_unlock();
 	return -ENOMEM;
 }
 
@@ -2699,7 +2714,7 @@
 	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	ext4_group_t i;
 	int num_meta_group_infos;
-	struct ext4_group_info *grinfo;
+	struct ext4_group_info *grinfo, ***group_info;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 
@@ -2717,9 +2732,12 @@
 		num_meta_group_infos = (ngroups +
 				EXT4_DESC_PER_BLOCK(sb) - 1) >>
 			EXT4_DESC_PER_BLOCK_BITS(sb);
+		rcu_read_lock();
+		group_info = rcu_dereference(sbi->s_group_info);
 		for (i = 0; i < num_meta_group_infos; i++)
-			kfree(sbi->s_group_info[i]);
-		kvfree(sbi->s_group_info);
+			kfree(group_info[i]);
+		kvfree(group_info);
+		rcu_read_unlock();
 	}
 	kfree(sbi->s_mb_offsets);
 	kfree(sbi->s_mb_maxs);
@@ -3018,7 +3036,8 @@
 		ext4_group_t flex_group = ext4_flex_group(sbi,
 							  ac->ac_b_ex.fe_group);
 		atomic64_sub(ac->ac_b_ex.fe_len,
-			     &sbi->s_flex_groups[flex_group].free_clusters);
+			     &sbi_array_rcu_deref(sbi, s_flex_groups,
+						  flex_group)->free_clusters);
 	}
 
 	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -4912,7 +4931,8 @@
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
 		atomic64_add(count_clusters,
-			     &sbi->s_flex_groups[flex_group].free_clusters);
+			     &sbi_array_rcu_deref(sbi, s_flex_groups,
+						  flex_group)->free_clusters);
 	}
 
 	if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
@@ -5061,7 +5081,8 @@
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
 		atomic64_add(clusters_freed,
-			     &sbi->s_flex_groups[flex_group].free_clusters);
+			     &sbi_array_rcu_deref(sbi, s_flex_groups,
+						  flex_group)->free_clusters);
 	}
 
 	ext4_mb_unload_buddy(&e4b);

diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index a98bfca..bec4ad7 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c

@@ -427,6 +427,7 @@
 
 int ext4_ext_migrate(struct inode *inode)
 {
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	handle_t *handle;
 	int retval = 0, i;
 	__le32 *i_data;
@@ -451,6 +452,8 @@
 		 */
 		return retval;
 
+	percpu_down_write(&sbi->s_writepages_rwsem);
+
 	/*
 	 * Worst case we can touch the allocation bitmaps, a bgd
 	 * block, and a block to link in the orphan list.  We do need
@@ -461,7 +464,7 @@
 
 	if (IS_ERR(handle)) {
 		retval = PTR_ERR(handle);
-		return retval;
+		goto out_unlock;
 	}
 	goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
 		EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
@@ -472,7 +475,7 @@
 	if (IS_ERR(tmp_inode)) {
 		retval = PTR_ERR(tmp_inode);
 		ext4_journal_stop(handle);
-		return retval;
+		goto out_unlock;
 	}
 	i_size_write(tmp_inode, i_size_read(inode));
 	/*
@@ -514,7 +517,7 @@
 		 */
 		ext4_orphan_del(NULL, tmp_inode);
 		retval = PTR_ERR(handle);
-		goto out;
+		goto out_tmp_inode;
 	}
 
 	ei = EXT4_I(inode);
@@ -595,10 +598,11 @@
 	/* Reset the extent details */
 	ext4_ext_tree_init(handle, tmp_inode);
 	ext4_journal_stop(handle);
-out:
+out_tmp_inode:
 	unlock_new_inode(tmp_inode);
 	iput(tmp_inode);
-
+out_unlock:
+	percpu_up_write(&sbi->s_writepages_rwsem);
 	return retval;
 }
 
@@ -608,7 +612,8 @@
 int ext4_ind_migrate(struct inode *inode)
 {
 	struct ext4_extent_header	*eh;
-	struct ext4_super_block		*es = EXT4_SB(inode->i_sb)->s_es;
+	struct ext4_sb_info		*sbi = EXT4_SB(inode->i_sb);
+	struct ext4_super_block		*es = sbi->s_es;
 	struct ext4_inode_info		*ei = EXT4_I(inode);
 	struct ext4_extent		*ex;
 	unsigned int			i, len;
@@ -632,9 +637,13 @@
 	if (test_opt(inode->i_sb, DELALLOC))
 		ext4_alloc_da_blocks(inode);
 
+	percpu_down_write(&sbi->s_writepages_rwsem);
+
 	handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out_unlock;
+	}
 
 	down_write(&EXT4_I(inode)->i_data_sem);
 	ret = ext4_ext_check_inode(inode);
@@ -669,5 +678,7 @@
 errout:
 	ext4_journal_stop(handle);
 	up_write(&EXT4_I(inode)->i_data_sem);
+out_unlock:
+	percpu_up_write(&sbi->s_writepages_rwsem);
 	return ret;
 }

diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 2305b43..9d00e0d 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c

@@ -120,10 +120,10 @@
 {
 	__ext4_warning(sb, function, line, "%s", msg);
 	__ext4_warning(sb, function, line,
-		       "MMP failure info: last update time: %llu, last update "
-		       "node: %s, last update device: %s",
-		       (long long unsigned int) le64_to_cpu(mmp->mmp_time),
-		       mmp->mmp_nodename, mmp->mmp_bdevname);
+		       "MMP failure info: last update time: %llu, last update node: %.*s, last update device: %.*s",
+		       (unsigned long long)le64_to_cpu(mmp->mmp_time),
+		       (int)sizeof(mmp->mmp_nodename), mmp->mmp_nodename,
+		       (int)sizeof(mmp->mmp_bdevname), mmp->mmp_bdevname);
 }
 
 /*
@@ -154,6 +154,7 @@
 	mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
 				 EXT4_MMP_MIN_CHECK_INTERVAL);
 	mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+	BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE);
 	bdevname(bh->b_bdev, mmp->mmp_bdevname);
 
 	memcpy(mmp->mmp_nodename, init_utsname()->nodename,
@@ -375,7 +376,8 @@
 	/*
 	 * Start a kernel thread to update the MMP block periodically.
 	 */
-	EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
+	EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%.*s",
+					     (int)sizeof(mmp->mmp_bdevname),
 					     bdevname(bh->b_bdev,
 						      mmp->mmp_bdevname));
 	if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 43dcb91..a8f2e35 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c

@@ -1431,6 +1431,7 @@
 		/*
 		 * We deal with the read-ahead logic here.
 		 */
+		cond_resched();
 		if (ra_ptr >= ra_max) {
 			/* Refill the readahead buffer */
 			ra_ptr = 0;
@@ -2085,6 +2086,13 @@
 		retval = ext4_dx_add_entry(handle, &fname, dir, inode);
 		if (!retval || (retval != ERR_BAD_DX_DIR))
 			goto out;
+		/* Can we just ignore htree data? */
+		if (ext4_has_metadata_csum(sb)) {
+			EXT4_ERROR_INODE(dir,
+				"Directory has corrupted htree index.");
+			retval = -EFSCORRUPTED;
+			goto out;
+		}
 		ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
 		dx_fallback++;
 		ext4_mark_inode_dirty(handle, dir);

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 4d5c0fc..ef552d9 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c

@@ -17,6 +17,33 @@
 
 #include "ext4_jbd2.h"
 
+struct ext4_rcu_ptr {
+	struct rcu_head rcu;
+	void *ptr;
+};
+
+static void ext4_rcu_ptr_callback(struct rcu_head *head)
+{
+	struct ext4_rcu_ptr *ptr;
+
+	ptr = container_of(head, struct ext4_rcu_ptr, rcu);
+	kvfree(ptr->ptr);
+	kfree(ptr);
+}
+
+void ext4_kvfree_array_rcu(void *to_free)
+{
+	struct ext4_rcu_ptr *ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
+
+	if (ptr) {
+		ptr->ptr = to_free;
+		call_rcu(&ptr->rcu, ext4_rcu_ptr_callback);
+		return;
+	}
+	synchronize_rcu();
+	kvfree(to_free);
+}
+
 int ext4_resize_begin(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -560,8 +587,8 @@
 				brelse(gdb);
 				goto out;
 			}
-			memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data,
-			       gdb->b_size);
+			memcpy(gdb->b_data, sbi_array_rcu_deref(sbi,
+				s_group_desc, j)->b_data, gdb->b_size);
 			set_buffer_uptodate(gdb);
 
 			err = ext4_handle_dirty_metadata(handle, NULL, gdb);
@@ -879,13 +906,15 @@
 	}
 	brelse(dind);
 
-	o_group_desc = EXT4_SB(sb)->s_group_desc;
+	rcu_read_lock();
+	o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc);
 	memcpy(n_group_desc, o_group_desc,
 	       EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
+	rcu_read_unlock();
 	n_group_desc[gdb_num] = gdb_bh;
-	EXT4_SB(sb)->s_group_desc = n_group_desc;
+	rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc);
 	EXT4_SB(sb)->s_gdb_count++;
-	kvfree(o_group_desc);
+	ext4_kvfree_array_rcu(o_group_desc);
 
 	le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
 	err = ext4_handle_dirty_super(handle, sb);
@@ -929,9 +958,11 @@
 		return err;
 	}
 
-	o_group_desc = EXT4_SB(sb)->s_group_desc;
+	rcu_read_lock();
+	o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc);
 	memcpy(n_group_desc, o_group_desc,
 	       EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
+	rcu_read_unlock();
 	n_group_desc[gdb_num] = gdb_bh;
 
 	BUFFER_TRACE(gdb_bh, "get_write_access");
@@ -942,9 +973,9 @@
 		return err;
 	}
 
-	EXT4_SB(sb)->s_group_desc = n_group_desc;
+	rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc);
 	EXT4_SB(sb)->s_gdb_count++;
-	kvfree(o_group_desc);
+	ext4_kvfree_array_rcu(o_group_desc);
 	return err;
 }
 
@@ -1210,7 +1241,8 @@
 		 * use non-sparse filesystems anymore.  This is already checked above.
 		 */
 		if (gdb_off) {
-			gdb_bh = sbi->s_group_desc[gdb_num];
+			gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
+						     gdb_num);
 			BUFFER_TRACE(gdb_bh, "get_write_access");
 			err = ext4_journal_get_write_access(handle, gdb_bh);
 
@@ -1292,7 +1324,7 @@
 		/*
 		 * get_write_access() has been called on gdb_bh by ext4_add_new_desc().
 		 */
-		gdb_bh = sbi->s_group_desc[gdb_num];
+		gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_num);
 		/* Update group descriptor block for new group */
 		gdp = (struct ext4_group_desc *)(gdb_bh->b_data +
 						 gdb_off * EXT4_DESC_SIZE(sb));
@@ -1420,11 +1452,14 @@
 		   percpu_counter_read(&sbi->s_freeclusters_counter));
 	if (ext4_has_feature_flex_bg(sb) && sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group;
+		struct flex_groups *fg;
+
 		flex_group = ext4_flex_group(sbi, group_data[0].group);
+		fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group);
 		atomic64_add(EXT4_NUM_B2C(sbi, free_blocks),
-			     &sbi->s_flex_groups[flex_group].free_clusters);
+			     &fg->free_clusters);
 		atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,
-			   &sbi->s_flex_groups[flex_group].free_inodes);
+			   &fg->free_inodes);
 	}
 
 	/*
@@ -1519,7 +1554,8 @@
 		for (; gdb_num <= gdb_num_end; gdb_num++) {
 			struct buffer_head *gdb_bh;
 
-			gdb_bh = sbi->s_group_desc[gdb_num];
+			gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
+						     gdb_num);
 			if (old_gdb == gdb_bh->b_blocknr)
 				continue;
 			update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 32d8bdf..d44fc3f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c

@@ -969,6 +969,8 @@
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
+	struct buffer_head **group_desc;
+	struct flex_groups **flex_groups;
 	int aborted = 0;
 	int i, err;
 
@@ -999,15 +1001,23 @@
 	if (!sb_rdonly(sb))
 		ext4_commit_super(sb, 1);
 
+	rcu_read_lock();
+	group_desc = rcu_dereference(sbi->s_group_desc);
 	for (i = 0; i < sbi->s_gdb_count; i++)
-		brelse(sbi->s_group_desc[i]);
-	kvfree(sbi->s_group_desc);
-	kvfree(sbi->s_flex_groups);
+		brelse(group_desc[i]);
+	kvfree(group_desc);
+	flex_groups = rcu_dereference(sbi->s_flex_groups);
+	if (flex_groups) {
+		for (i = 0; i < sbi->s_flex_groups_allocated; i++)
+			kvfree(flex_groups[i]);
+		kvfree(flex_groups);
+	}
+	rcu_read_unlock();
 	percpu_counter_destroy(&sbi->s_freeclusters_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
-	percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
+	percpu_free_rwsem(&sbi->s_writepages_rwsem);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < EXT4_MAXQUOTAS; i++)
 		kfree(get_qf_name(sb, sbi, i));
@@ -2287,8 +2297,8 @@
 int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct flex_groups *new_groups;
-	int size;
+	struct flex_groups **old_groups, **new_groups;
+	int size, i, j;
 
 	if (!sbi->s_log_groups_per_flex)
 		return 0;
@@ -2297,22 +2307,37 @@
 	if (size <= sbi->s_flex_groups_allocated)
 		return 0;
 
-	size = roundup_pow_of_two(size * sizeof(struct flex_groups));
-	new_groups = kvzalloc(size, GFP_KERNEL);
+	new_groups = kvzalloc(roundup_pow_of_two(size *
+			      sizeof(*sbi->s_flex_groups)), GFP_KERNEL);
 	if (!new_groups) {
-		ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups",
-			 size / (int) sizeof(struct flex_groups));
+		ext4_msg(sb, KERN_ERR,
+			 "not enough memory for %d flex group pointers", size);
 		return -ENOMEM;
 	}
-
-	if (sbi->s_flex_groups) {
-		memcpy(new_groups, sbi->s_flex_groups,
-		       (sbi->s_flex_groups_allocated *
-			sizeof(struct flex_groups)));
-		kvfree(sbi->s_flex_groups);
+	for (i = sbi->s_flex_groups_allocated; i < size; i++) {
+		new_groups[i] = kvzalloc(roundup_pow_of_two(
+					 sizeof(struct flex_groups)),
+					 GFP_KERNEL);
+		if (!new_groups[i]) {
+			for (j = sbi->s_flex_groups_allocated; j < i; j++)
+				kvfree(new_groups[j]);
+			kvfree(new_groups);
+			ext4_msg(sb, KERN_ERR,
+				 "not enough memory for %d flex groups", size);
+			return -ENOMEM;
+		}
 	}
-	sbi->s_flex_groups = new_groups;
-	sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups);
+	rcu_read_lock();
+	old_groups = rcu_dereference(sbi->s_flex_groups);
+	if (old_groups)
+		memcpy(new_groups, old_groups,
+		       (sbi->s_flex_groups_allocated *
+			sizeof(struct flex_groups *)));
+	rcu_read_unlock();
+	rcu_assign_pointer(sbi->s_flex_groups, new_groups);
+	sbi->s_flex_groups_allocated = size;
+	if (old_groups)
+		ext4_kvfree_array_rcu(old_groups);
 	return 0;
 }
 
@@ -2320,6 +2345,7 @@
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_group_desc *gdp = NULL;
+	struct flex_groups *fg;
 	ext4_group_t flex_group;
 	int i, err;
 
@@ -2337,12 +2363,11 @@
 		gdp = ext4_get_group_desc(sb, i, NULL);
 
 		flex_group = ext4_flex_group(sbi, i);
-		atomic_add(ext4_free_inodes_count(sb, gdp),
-			   &sbi->s_flex_groups[flex_group].free_inodes);
+		fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group);
+		atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes);
 		atomic64_add(ext4_free_group_clusters(sb, gdp),
-			     &sbi->s_flex_groups[flex_group].free_clusters);
-		atomic_add(ext4_used_dirs_count(sb, gdp),
-			   &sbi->s_flex_groups[flex_group].used_dirs);
+			     &fg->free_clusters);
+		atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs);
 	}
 
 	return 1;
@@ -2923,17 +2948,11 @@
 		return 0;
 	}
 
-#ifndef CONFIG_QUOTA
-	if (ext4_has_feature_quota(sb) && !readonly) {
+#if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2)
+	if (!readonly && (ext4_has_feature_quota(sb) ||
+			  ext4_has_feature_project(sb))) {
 		ext4_msg(sb, KERN_ERR,
-			 "Filesystem with quota feature cannot be mounted RDWR "
-			 "without CONFIG_QUOTA");
-		return 0;
-	}
-	if (ext4_has_feature_project(sb) && !readonly) {
-		ext4_msg(sb, KERN_ERR,
-			 "Filesystem with project quota feature cannot be mounted RDWR "
-			 "without CONFIG_QUOTA");
+			 "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2");
 		return 0;
 	}
 #endif  /* CONFIG_QUOTA */
@@ -3554,9 +3573,10 @@
 {
 	struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
 	char *orig_data = kstrdup(data, GFP_KERNEL);
-	struct buffer_head *bh;
+	struct buffer_head *bh, **group_desc;
 	struct ext4_super_block *es = NULL;
 	struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+	struct flex_groups **flex_groups;
 	ext4_fsblk_t block;
 	ext4_fsblk_t sb_block = get_sb_block(&data);
 	ext4_fsblk_t logical_sb_block;
@@ -3727,6 +3747,15 @@
 	 */
 	sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
 
+	blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+	if (blocksize < EXT4_MIN_BLOCK_SIZE ||
+	    blocksize > EXT4_MAX_BLOCK_SIZE) {
+		ext4_msg(sb, KERN_ERR,
+		       "Unsupported filesystem blocksize %d (%d log_block_size)",
+			 blocksize, le32_to_cpu(es->s_log_block_size));
+		goto failed_mount;
+	}
+
 	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
 		sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
 		sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
@@ -3744,6 +3773,7 @@
 			ext4_msg(sb, KERN_ERR,
 			       "unsupported inode size: %d",
 			       sbi->s_inode_size);
+			ext4_msg(sb, KERN_ERR, "blocksize: %d", blocksize);
 			goto failed_mount;
 		}
 		/*
@@ -3907,14 +3937,6 @@
 	if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
 		goto failed_mount;
 
-	blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
-	if (blocksize < EXT4_MIN_BLOCK_SIZE ||
-	    blocksize > EXT4_MAX_BLOCK_SIZE) {
-		ext4_msg(sb, KERN_ERR,
-		       "Unsupported filesystem blocksize %d (%d log_block_size)",
-			 blocksize, le32_to_cpu(es->s_log_block_size));
-		goto failed_mount;
-	}
 	if (le32_to_cpu(es->s_log_block_size) >
 	    (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
 		ext4_msg(sb, KERN_ERR,
@@ -4170,9 +4192,10 @@
 			goto failed_mount;
 		}
 	}
-	sbi->s_group_desc = kvmalloc_array(db_count,
-					   sizeof(struct buffer_head *),
-					   GFP_KERNEL);
+	rcu_assign_pointer(sbi->s_group_desc,
+			   kvmalloc_array(db_count,
+					  sizeof(struct buffer_head *),
+					  GFP_KERNEL));
 	if (sbi->s_group_desc == NULL) {
 		ext4_msg(sb, KERN_ERR, "not enough memory");
 		ret = -ENOMEM;
@@ -4188,14 +4211,19 @@
 	}
 
 	for (i = 0; i < db_count; i++) {
+		struct buffer_head *bh;
+
 		block = descriptor_loc(sb, logical_sb_block, i);
-		sbi->s_group_desc[i] = sb_bread_unmovable(sb, block);
-		if (!sbi->s_group_desc[i]) {
+		bh = sb_bread_unmovable(sb, block);
+		if (!bh) {
 			ext4_msg(sb, KERN_ERR,
 			       "can't read group descriptor %d", i);
 			db_count = i;
 			goto failed_mount2;
 		}
+		rcu_read_lock();
+		rcu_dereference(sbi->s_group_desc)[i] = bh;
+		rcu_read_unlock();
 	}
 	sbi->s_gdb_count = db_count;
 	if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
@@ -4467,7 +4495,7 @@
 		err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
 					  GFP_KERNEL);
 	if (!err)
-		err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem);
+		err = percpu_init_rwsem(&sbi->s_writepages_rwsem);
 
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "insufficient memory");
@@ -4555,13 +4583,19 @@
 	ext4_unregister_li_request(sb);
 failed_mount6:
 	ext4_mb_release(sb);
-	if (sbi->s_flex_groups)
-		kvfree(sbi->s_flex_groups);
+	rcu_read_lock();
+	flex_groups = rcu_dereference(sbi->s_flex_groups);
+	if (flex_groups) {
+		for (i = 0; i < sbi->s_flex_groups_allocated; i++)
+			kvfree(flex_groups[i]);
+		kvfree(flex_groups);
+	}
+	rcu_read_unlock();
 	percpu_counter_destroy(&sbi->s_freeclusters_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
-	percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
+	percpu_free_rwsem(&sbi->s_writepages_rwsem);
 failed_mount5:
 	ext4_ext_release(sb);
 	ext4_release_system_zone(sb);
@@ -4592,9 +4626,12 @@
 	if (sbi->s_mmp_tsk)
 		kthread_stop(sbi->s_mmp_tsk);
 failed_mount2:
+	rcu_read_lock();
+	group_desc = rcu_dereference(sbi->s_group_desc);
 	for (i = 0; i < db_count; i++)
-		brelse(sbi->s_group_desc[i]);
-	kvfree(sbi->s_group_desc);
+		brelse(group_desc[i]);
+	kvfree(group_desc);
+	rcu_read_unlock();
 failed_mount:
 	if (sbi->s_chksum_driver)
 		crypto_free_shash(sbi->s_chksum_driver);

diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 0ace2c2..4f0cc0c 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c

@@ -769,6 +769,7 @@
 
 	if (whiteout) {
 		f2fs_i_links_write(inode, false);
+		inode->i_state |= I_LINKABLE;
 		*whiteout = inode;
 	} else {
 		d_tmpfile(dentry, inode);
@@ -835,6 +836,12 @@
 			F2FS_I(old_dentry->d_inode)->i_projid)))
 		return -EXDEV;
 
+	if (flags & RENAME_WHITEOUT) {
+		err = f2fs_create_whiteout(old_dir, &whiteout);
+		if (err)
+			return err;
+	}
+
 	err = dquot_initialize(old_dir);
 	if (err)
 		goto out;
@@ -865,17 +872,11 @@
 		}
 	}
 
-	if (flags & RENAME_WHITEOUT) {
-		err = f2fs_create_whiteout(old_dir, &whiteout);
-		if (err)
-			goto out_dir;
-	}
-
 	if (new_inode) {
 
 		err = -ENOTEMPTY;
 		if (old_dir_entry && !f2fs_empty_dir(new_inode))
-			goto out_whiteout;
+			goto out_dir;
 
 		err = -ENOENT;
 		new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name,
@@ -883,7 +884,7 @@
 		if (!new_entry) {
 			if (IS_ERR(new_page))
 				err = PTR_ERR(new_page);
-			goto out_whiteout;
+			goto out_dir;
 		}
 
 		f2fs_balance_fs(sbi, true);
@@ -915,7 +916,7 @@
 		err = f2fs_add_link(new_dentry, old_inode);
 		if (err) {
 			f2fs_unlock_op(sbi);
-			goto out_whiteout;
+			goto out_dir;
 		}
 
 		if (old_dir_entry)
@@ -939,7 +940,7 @@
 				if (IS_ERR(old_page))
 					err = PTR_ERR(old_page);
 				f2fs_unlock_op(sbi);
-				goto out_whiteout;
+				goto out_dir;
 			}
 		}
 	}
@@ -958,7 +959,6 @@
 	f2fs_delete_entry(old_entry, old_page, old_dir, NULL);
 
 	if (whiteout) {
-		whiteout->i_state |= I_LINKABLE;
 		set_inode_flag(whiteout, FI_INC_LINK);
 		err = f2fs_add_link(old_dentry, whiteout);
 		if (err)
@@ -992,15 +992,14 @@
 	f2fs_unlock_op(sbi);
 	if (new_page)
 		f2fs_put_page(new_page, 0);
-out_whiteout:
-	if (whiteout)
-		iput(whiteout);
 out_dir:
 	if (old_dir_entry)
 		f2fs_put_page(old_dir_page, 0);
 out_old:
 	f2fs_put_page(old_page, 0);
 out:
+	if (whiteout)
+		iput(whiteout);
 	return err;
 }
 

diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 9888718..9a59f49 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c

@@ -658,10 +658,12 @@
 
 	ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype,
 				   NULL, "features");
-	if (ret)
+	if (ret) {
+		kobject_put(&f2fs_feat);
 		kset_unregister(&f2fs_kset);
-	else
+	} else {
 		f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
+	}
 	return ret;
 }
 
@@ -682,8 +684,11 @@
 	init_completion(&sbi->s_kobj_unregister);
 	err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL,
 				"%s", sb->s_id);
-	if (err)
+	if (err) {
+		kobject_put(&sbi->s_kobj);
+		wait_for_completion(&sbi->s_kobj_unregister);
 		return err;
+	}
 
 	if (f2fs_proc_root)
 		sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
@@ -711,4 +716,5 @@
 		remove_proc_entry(sbi->sb->s_id, f2fs_proc_root);
 	}
 	kobject_del(&sbi->s_kobj);
+	kobject_put(&sbi->s_kobj);
 }

diff --git a/fs/file_table.c b/fs/file_table.c
index e49af4c..023fd1e 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c

@@ -276,6 +276,9 @@
 	}
 	if (file->f_op->release)
 		file->f_op->release(inode, file);
+
+	security_file_pre_free(file);
+
 	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
 		     !(file->f_mode & FMODE_PATH))) {
 		cdev_put(inode->i_cdev);

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 26f8d7e..66409cbd 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c

@@ -165,7 +165,7 @@
 				       "journal space in %s\n", __func__,
 				       journal->j_devname);
 				WARN_ON(1);
-				jbd2_journal_abort(journal, 0);
+				jbd2_journal_abort(journal, -EIO);
 			}
 			write_lock(&journal->j_state_lock);
 		} else {

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 020bd7a..4200a6f 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c

@@ -781,7 +781,7 @@
 		err = journal_submit_commit_record(journal, commit_transaction,
 						 &cbh, crc32_sum);
 		if (err)
-			__jbd2_journal_abort_hard(journal);
+			jbd2_journal_abort(journal, err);
 	}
 
 	blk_finish_plug(&plug);
@@ -874,7 +874,7 @@
 		err = journal_submit_commit_record(journal, commit_transaction,
 						&cbh, crc32_sum);
 		if (err)
-			__jbd2_journal_abort_hard(journal);
+			jbd2_journal_abort(journal, err);
 	}
 	if (cbh)
 		err = journal_wait_on_commit_record(journal, cbh);
@@ -971,29 +971,33 @@
 		 * it. */
 
 		/*
-		* A buffer which has been freed while still being journaled by
-		* a previous transaction.
-		*/
-		if (buffer_freed(bh)) {
+		 * A buffer which has been freed while still being journaled
+		 * by a previous transaction, refile the buffer to BJ_Forget of
+		 * the running transaction. If the just committed transaction
+		 * contains "add to orphan" operation, we can completely
+		 * invalidate the buffer now. We are rather through in that
+		 * since the buffer may be still accessible when blocksize <
+		 * pagesize and it is attached to the last partial page.
+		 */
+		if (buffer_freed(bh) && !jh->b_next_transaction) {
+			struct address_space *mapping;
+
+			clear_buffer_freed(bh);
+			clear_buffer_jbddirty(bh);
+
 			/*
-			 * If the running transaction is the one containing
-			 * "add to orphan" operation (b_next_transaction !=
-			 * NULL), we have to wait for that transaction to
-			 * commit before we can really get rid of the buffer.
-			 * So just clear b_modified to not confuse transaction
-			 * credit accounting and refile the buffer to
-			 * BJ_Forget of the running transaction. If the just
-			 * committed transaction contains "add to orphan"
-			 * operation, we can completely invalidate the buffer
-			 * now. We are rather through in that since the
-			 * buffer may be still accessible when blocksize <
-			 * pagesize and it is attached to the last partial
-			 * page.
+			 * Block device buffers need to stay mapped all the
+			 * time, so it is enough to clear buffer_jbddirty and
+			 * buffer_freed bits. For the file mapping buffers (i.e.
+			 * journalled data) we need to unmap buffer and clear
+			 * more bits. We also need to be careful about the check
+			 * because the data page mapping can get cleared under
+			 * out hands, which alse need not to clear more bits
+			 * because the page and buffers will be freed and can
+			 * never be reused once we are done with them.
 			 */
-			jh->b_modified = 0;
-			if (!jh->b_next_transaction) {
-				clear_buffer_freed(bh);
-				clear_buffer_jbddirty(bh);
+			mapping = READ_ONCE(bh->b_page->mapping);
+			if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
 				clear_buffer_mapped(bh);
 				clear_buffer_new(bh);
 				clear_buffer_req(bh);

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 1a2339f..a15a22d 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c

@@ -1701,6 +1701,11 @@
 		       journal->j_devname);
 		return -EFSCORRUPTED;
 	}
+	/*
+	 * clear JBD2_ABORT flag initialized in journal_init_common
+	 * here to update log tail information with the newest seq.
+	 */
+	journal->j_flags &= ~JBD2_ABORT;
 
 	/* OK, we've finished with the dynamic journal bits:
 	 * reinitialise the dynamic contents of the superblock in memory
@@ -1708,7 +1713,6 @@
 	if (journal_reset(journal))
 		goto recovery_error;
 
-	journal->j_flags &= ~JBD2_ABORT;
 	journal->j_flags |= JBD2_LOADED;
 	return 0;
 
@@ -2129,8 +2133,7 @@
 
 	if (journal->j_flags & JBD2_ABORT) {
 		write_unlock(&journal->j_state_lock);
-		if (!old_errno && old_errno != -ESHUTDOWN &&
-		    errno == -ESHUTDOWN)
+		if (old_errno != -ESHUTDOWN && errno == -ESHUTDOWN)
 			jbd2_journal_update_sb_errno(journal);
 		return;
 	}
@@ -2138,12 +2141,10 @@
 
 	__jbd2_journal_abort_hard(journal);
 
-	if (errno) {
-		jbd2_journal_update_sb_errno(journal);
-		write_lock(&journal->j_state_lock);
-		journal->j_flags |= JBD2_REC_ERR;
-		write_unlock(&journal->j_state_lock);
-	}
+	jbd2_journal_update_sb_errno(journal);
+	write_lock(&journal->j_state_lock);
+	journal->j_flags |= JBD2_REC_ERR;
+	write_unlock(&journal->j_state_lock);
 }
 
 /**
@@ -2185,11 +2186,6 @@
  * failure to disk.  ext3_error, for example, now uses this
  * functionality.
  *
- * Errors which originate from within the journaling layer will NOT
- * supply an errno; a null errno implies that absolutely no further
- * writes are done to the journal (unless there are any already in
- * progress).
- *
  */
 
 void jbd2_journal_abort(journal_t *journal, int errno)

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 911ff18..04ffef9 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c

@@ -831,8 +831,6 @@
 	char *frozen_buffer = NULL;
 	unsigned long start_lock, time_lock;
 
-	if (is_handle_aborted(handle))
-		return -EROFS;
 	journal = transaction->t_journal;
 
 	jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
@@ -1084,6 +1082,9 @@
 	struct journal_head *jh;
 	int rc;
 
+	if (is_handle_aborted(handle))
+		return -EROFS;
+
 	if (jbd2_write_access_granted(handle, bh, false))
 		return 0;
 
@@ -1221,6 +1222,9 @@
 	struct journal_head *jh;
 	char *committed_data = NULL;
 
+	if (is_handle_aborted(handle))
+		return -EROFS;
+
 	if (jbd2_write_access_granted(handle, bh, true))
 		return 0;
 
@@ -2228,14 +2232,16 @@
 			return -EBUSY;
 		}
 		/*
-		 * OK, buffer won't be reachable after truncate. We just set
-		 * j_next_transaction to the running transaction (if there is
-		 * one) and mark buffer as freed so that commit code knows it
-		 * should clear dirty bits when it is done with the buffer.
+		 * OK, buffer won't be reachable after truncate. We just clear
+		 * b_modified to not confuse transaction credit accounting, and
+		 * set j_next_transaction to the running transaction (if there
+		 * is one) and mark buffer as freed so that commit code knows
+		 * it should clear dirty bits when it is done with the buffer.
 		 */
 		set_buffer_freed(bh);
 		if (journal->j_running_transaction && buffer_jbddirty(bh))
 			jh->b_next_transaction = journal->j_running_transaction;
+		jh->b_modified = 0;
 		jbd2_journal_put_journal_head(jh);
 		spin_unlock(&journal->j_list_lock);
 		jbd_unlock_bh_state(bh);

diff --git a/fs/namei.c b/fs/namei.c
index c00a7e1..5742d73 100644
--- a/fs/namei.c
+++ b/fs/namei.c

@@ -885,12 +885,65 @@
 		path_put(&last->link);
 }
 
-int sysctl_protected_symlinks __read_mostly = 0;
-int sysctl_protected_hardlinks __read_mostly = 0;
+int sysctl_protected_symlinks __read_mostly = 1;
+int sysctl_protected_hardlinks __read_mostly = 1;
 int sysctl_protected_fifos __read_mostly;
 int sysctl_protected_regular __read_mostly;
 
 /**
+ * nameidata_set_temporary - Used by Chromium OS LSM to check
+ * whether a mount point includes traversing symlinks.
+ */
+int nameidata_set_temporary(const char __user *dir_name)
+{
+	struct nameidata *tmp;
+	struct filename *name;
+
+	tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
+	if (unlikely(!tmp))
+		return -ENOMEM;
+	name = getname_flags(dir_name, LOOKUP_FOLLOW, NULL);
+	if (IS_ERR(name)) {
+		kfree(tmp);
+		return PTR_ERR(name);
+	}
+	set_nameidata(tmp, AT_FDCWD, name);
+	return 0;
+}
+
+/**
+ * nameidata_restore_temporary - Used by Chromium OS LSM to check
+ * whether a mount point includes traversing symlinks.
+ */
+void nameidata_restore_temporary(void)
+{
+	struct nameidata *tmp = current->nameidata;
+
+	restore_nameidata();
+	putname(tmp->name);
+	kfree(tmp);
+}
+
+/**
+ * nameidata_get_total_link_count - Used by security/chromiumos/lsm.c to check
+ * whether a mount point includes traversing symlinks.
+ */
+int nameidata_get_total_link_count(void)
+{
+	struct nameidata *tmp = current->nameidata;
+
+	if (unlikely(!tmp)) {
+		WARN(1, "Unexpectedly got here with current->nameidata == NULL");
+		/* Pretend we did traverse symlinks, that is the safe/sane
+		 * result here from a security point of view...
+		 */
+		return MAXSYMLINKS;
+	}
+	return tmp->total_link_count;
+}
+EXPORT_SYMBOL(nameidata_get_total_link_count);
+
+/**
  * may_follow_link - Check symlink following for unsafe situations
  * @nd: nameidata pathwalk data
  *
@@ -1368,7 +1421,7 @@
 			nd->path.dentry = parent;
 			nd->seq = seq;
 			if (unlikely(!path_connected(&nd->path)))
-				return -ENOENT;
+				return -ECHILD;
 			break;
 		} else {
 			struct mount *mnt = real_mount(nd->path.mnt);

diff --git a/fs/namespace.c b/fs/namespace.c
index 1fce41b..e92241e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c

@@ -719,8 +719,14 @@
 			goto done;
 	}
 
-	if (!new)
-		new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
+	if (!new) {
+		/*
+		 * We are allocating as GFP_NOFS to appease lockdep:
+		 * since we are holding i_mutex we should not try to
+		 * recurse into filesystem code.
+		 */
+		new = kmalloc(sizeof(struct mountpoint), GFP_NOFS);
+	}
 	if (!new)
 		return ERR_PTR(-ENOMEM);
 
@@ -2736,12 +2742,19 @@
 		return -EINVAL;
 
 	/* ... and get the mountpoint */
-	retval = user_path(dir_name, &path);
+	retval = nameidata_set_temporary(dir_name);
 	if (retval)
 		return retval;
 
+	retval = user_path(dir_name, &path);
+	if (retval) {
+		nameidata_restore_temporary();
+		return retval;
+	}
+
 	retval = security_sb_mount(dev_name, &path,
 				   type_page, flags, data_page);
+	nameidata_restore_temporary();
 	if (!retval && !may_mount())
 		retval = -EPERM;
 	if (!retval && (flags & SB_MANDLOCK) && !may_mandlock())

diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 94f98e1..526441d 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c

@@ -283,14 +283,14 @@
 		status = handle_async_copy(res, server, src, dst,
 				&args->src_stateid);
 		if (status)
-			return status;
+			goto out;
 	}
 
 	if ((!res->synchronous || !args->sync) &&
 			res->write_res.verifier.committed != NFS_FILE_SYNC) {
 		status = process_copy_commit(dst, pos_dst, res);
 		if (status)
-			return status;
+			goto out;
 	}
 
 	truncate_pagecache_range(dst_inode, pos_dst,

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index fad7950..668b648 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c

@@ -5117,7 +5117,7 @@
 	hdr->timestamp   = jiffies;
 
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
-	nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 1, 0);
+	nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
 	nfs4_state_protect_write(server->nfs_client, clnt, msg, hdr);
 }
 

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 97a5169..61a440b 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c

@@ -702,6 +702,8 @@
 	struct fsnotify_group *group;
 	struct inode *inode;
 	struct path path;
+	struct path alteredpath;
+	struct path *canonical_path = &path;
 	struct fd f;
 	int ret;
 	unsigned flags = 0;
@@ -747,13 +749,22 @@
 	if (ret)
 		goto fput_and_out;
 
+	/* support stacked filesystems */
+	if(path.dentry && path.dentry->d_op) {
+		if (path.dentry->d_op->d_canonical_path) {
+			path.dentry->d_op->d_canonical_path(&path, &alteredpath);
+			canonical_path = &alteredpath;
+			path_put(&path);
+		}
+	}
+
 	/* inode held in place by reference to path; group by fget on fd */
-	inode = path.dentry->d_inode;
+	inode = canonical_path->dentry->d_inode;
 	group = f.file->private_data;
 
 	/* create/update an inode mark */
 	ret = inotify_update_watch(group, inode, mask);
-	path_put(&path);
+	path_put(canonical_path);
 fput_and_out:
 	fdput(f);
 	return ret;

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 30d150a4..ceab3a5f 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c

@@ -246,6 +246,7 @@
 	fput(file);
 	return ERR_PTR(-EINVAL);
 }
+EXPORT_SYMBOL(proc_ns_fget);
 
 static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
 {

diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 497a4171..bfb50fc 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h

@@ -637,9 +637,11 @@
 {
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
-	oi->i_sync_tid = handle->h_transaction->t_tid;
-	if (datasync)
-		oi->i_datasync_tid = handle->h_transaction->t_tid;
+	if (!is_handle_aborted(handle)) {
+		oi->i_sync_tid = handle->h_transaction->t_tid;
+		if (datasync)
+			oi->i_datasync_tid = handle->h_transaction->t_tid;
+	}
 }
 
 #endif /* OCFS2_JOURNAL_H */

diff --git a/fs/open.c b/fs/open.c
index 8784787..bc87e0a 100644
--- a/fs/open.c
+++ b/fs/open.c

@@ -34,8 +34,11 @@
 
 #include "internal.h"
 
-int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
-	struct file *filp)
+#define CREATE_TRACE_POINTS
+#include <trace/events/fs.h>
+
+int do_truncate2(struct vfsmount *mnt, struct dentry *dentry, loff_t length,
+		unsigned int time_attrs, struct file *filp)
 {
 	int ret;
 	struct iattr newattrs;
@@ -65,6 +68,12 @@
 	return ret;
 }
 
+int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
+	struct file *filp)
+{
+	return do_truncate2(NULL, dentry, length, time_attrs, filp);
+}
+
 long vfs_truncate(const struct path *path, loff_t length)
 {
 	struct inode *inode;
@@ -1092,6 +1101,7 @@
 		} else {
 			fsnotify_open(f);
 			fd_install(fd, f);
+			trace_do_sys_open(tmp->name, flags, mode);
 		}
 	}
 	putname(tmp);

diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 0732cb0..e24738c 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c

@@ -305,6 +305,7 @@
 
 static void *help_next(struct seq_file *m, void *v, loff_t *pos)
 {
+	(*pos)++;
 	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_next: start\n");
 
 	return NULL;

diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 817c02b1..4d96a7c 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig

@@ -97,3 +97,10 @@
 
 	  Say Y if you are running any user-space software which takes benefit from
 	  this interface. For example, rkt is such a piece of software.
+
+config PROC_UID
+	bool "Include /proc/uid/ files"
+	default y
+	depends on PROC_FS && RT_MUTEXES
+	help
+	Provides aggregated per-uid information under /proc/uid.

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index ead487e..3f849ca 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile

@@ -27,6 +27,7 @@
 proc-y	+= namespaces.o
 proc-y	+= self.o
 proc-y	+= thread_self.o
+proc-$(CONFIG_PROC_UID)  += uid.o
 proc-$(CONFIG_PROC_SYSCTL)	+= proc_sysctl.o
 proc-$(CONFIG_NET)		+= proc_net.o
 proc-$(CONFIG_PROC_KCORE)	+= kcore.o

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3b9b726..40089b8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c

@@ -144,6 +144,12 @@
 		NULL, &proc_single_file_operations,	\
 		{ .proc_show = show } )
 
+#ifdef CONFIG_SECURITY_CHROMIUMOS_READONLY_PROC_SELF_MEM
+# define PROC_PID_MEM_MODE S_IRUSR
+#else
+# define PROC_PID_MEM_MODE S_IRUSR|S_IWUSR
+#endif
+
 /*
  * Count the number of hardlinks for the pid_entry table, excluding the .
  * and .. links.
@@ -876,7 +882,11 @@
 static ssize_t mem_write(struct file *file, const char __user *buf,
 			 size_t count, loff_t *ppos)
 {
+#ifdef CONFIG_SECURITY_CHROMIUMOS_READONLY_PROC_SELF_MEM
+	return -EACCES;
+#else
 	return mem_rw(file, (char __user*)buf, count, ppos, 1);
+#endif
 }
 
 loff_t mem_lseek(struct file *file, loff_t offset, int orig)
@@ -2386,10 +2396,13 @@
 		return -ESRCH;
 
 	if (p != current) {
-		if (!capable(CAP_SYS_NICE)) {
+		rcu_read_lock();
+		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+			rcu_read_unlock();
 			count = -EPERM;
 			goto out;
 		}
+		rcu_read_unlock();
 
 		err = security_task_setscheduler(p);
 		if (err) {
@@ -2422,11 +2435,14 @@
 		return -ESRCH;
 
 	if (p != current) {
-
-		if (!capable(CAP_SYS_NICE)) {
+		rcu_read_lock();
+		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+			rcu_read_unlock();
 			err = -EPERM;
 			goto out;
 		}
+		rcu_read_unlock();
+
 		err = security_task_getscheduler(p);
 		if (err)
 			goto out;
@@ -2977,7 +2993,7 @@
 #ifdef CONFIG_NUMA
 	REG("numa_maps",  S_IRUGO, proc_pid_numa_maps_operations),
 #endif
-	REG("mem",        S_IRUSR|S_IWUSR, proc_mem_operations),
+	REG("mem",        PROC_PID_MEM_MODE, proc_mem_operations),
 	LNK("cwd",        proc_cwd_link),
 	LNK("root",       proc_root_link),
 	LNK("exe",        proc_exe_link),
@@ -2989,6 +3005,7 @@
 	REG("smaps",      S_IRUGO, proc_pid_smaps_operations),
 	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
 	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
+	REG("totmaps",    S_IRUGO, proc_totmaps_operations),
 #endif
 #ifdef CONFIG_SECURITY
 	DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -3363,7 +3380,7 @@
 #ifdef CONFIG_NUMA
 	REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
 #endif
-	REG("mem",       S_IRUSR|S_IWUSR, proc_mem_operations),
+	REG("mem",       PROC_PID_MEM_MODE, proc_mem_operations),
 	LNK("cwd",       proc_cwd_link),
 	LNK("root",      proc_root_link),
 	LNK("exe",       proc_exe_link),

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 95b1419..a4cd4f5 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h

@@ -84,6 +84,9 @@
 		struct task_struct *task);
 };
 
+
+extern const struct file_operations proc_totmaps_operations;
+
 struct proc_inode {
 	struct pid *pid;
 	unsigned int fd;
@@ -258,6 +261,15 @@
 #endif
 
 /*
+ * uid.c
+ */
+#ifdef CONFIG_PROC_UID
+extern int proc_uid_init(void);
+#else
+static inline void proc_uid_init(void) { }
+#endif
+
+/*
  * proc_tty.c
  */
 #ifdef CONFIG_TTY
@@ -285,6 +297,7 @@
 	struct mm_struct *mm;
 #ifdef CONFIG_MMU
 	struct vm_area_struct *tail_vma;
+	struct mem_size_stats *mss;
 #endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *task_mempolicy;

diff --git a/fs/proc/root.c b/fs/proc/root.c
index f4b1a9d..efc63a6 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c

@@ -130,6 +130,7 @@
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	proc_net_init();
+	proc_uid_init();
 	proc_mkdir("fs", NULL);
 	proc_mkdir("driver", NULL);
 	proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index efa6273..006ae59 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c

@@ -123,6 +123,56 @@
 }
 #endif
 
+static void seq_print_vma_name(struct seq_file *m, struct vm_area_struct *vma)
+{
+	const char __user *name = vma_get_anon_name(vma);
+	struct mm_struct *mm = vma->vm_mm;
+
+	unsigned long page_start_vaddr;
+	unsigned long page_offset;
+	unsigned long num_pages;
+	unsigned long max_len = NAME_MAX;
+	int i;
+
+	page_start_vaddr = (unsigned long)name & PAGE_MASK;
+	page_offset = (unsigned long)name - page_start_vaddr;
+	num_pages = DIV_ROUND_UP(page_offset + max_len, PAGE_SIZE);
+
+	seq_puts(m, "[anon:");
+
+	for (i = 0; i < num_pages; i++) {
+		int len;
+		int write_len;
+		const char *kaddr;
+		long pages_pinned;
+		struct page *page;
+
+		pages_pinned = get_user_pages_remote(current, mm,
+				page_start_vaddr, 1, 0, &page, NULL, NULL);
+		if (pages_pinned < 1) {
+			seq_puts(m, "<fault>]");
+			return;
+		}
+
+		kaddr = (const char *)kmap(page);
+		len = min(max_len, PAGE_SIZE - page_offset);
+		write_len = strnlen(kaddr + page_offset, len);
+		seq_write(m, kaddr + page_offset, write_len);
+		kunmap(page);
+		put_page(page);
+
+		/* if strnlen hit a null terminator then we're done */
+		if (write_len != len)
+			break;
+
+		max_len -= len;
+		page_offset = 0;
+		page_start_vaddr += PAGE_SIZE;
+	}
+
+	seq_putc(m, ']');
+}
+
 static void vma_stop(struct proc_maps_private *priv)
 {
 	struct mm_struct *mm = priv->mm;
@@ -348,8 +398,15 @@
 			goto done;
 		}
 
-		if (is_stack(vma))
+		if (is_stack(vma)) {
 			name = "[stack]";
+			goto done;
+		}
+
+		if (vma_get_anon_name(vma)) {
+			seq_pad(m, ' ');
+			seq_print_vma_name(m, vma);
+		}
 	}
 
 done:
@@ -421,17 +478,53 @@
 	unsigned long shared_hugetlb;
 	unsigned long private_hugetlb;
 	u64 pss;
+	u64 pss_anon;
+	u64 pss_file;
+	u64 pss_shmem;
 	u64 pss_locked;
 	u64 swap_pss;
 	bool check_shmem_swap;
 };
 
+static void smaps_page_accumulate(struct mem_size_stats *mss,
+		struct page *page, unsigned long size, unsigned long pss,
+		bool dirty, bool locked, bool private)
+{
+	mss->pss += pss;
+
+	if (PageAnon(page))
+		mss->pss_anon += pss;
+	else if (PageSwapBacked(page))
+		mss->pss_shmem += pss;
+	else
+		mss->pss_file += pss;
+
+	if (locked)
+		mss->pss_locked += pss;
+
+	if (dirty || PageDirty(page)) {
+		if (private)
+			mss->private_dirty += size;
+		else
+			mss->shared_dirty += size;
+	} else {
+		if (private)
+			mss->private_clean += size;
+		else
+			mss->shared_clean += size;
+	}
+}
+
 static void smaps_account(struct mem_size_stats *mss, struct page *page,
 		bool compound, bool young, bool dirty, bool locked)
 {
 	int i, nr = compound ? 1 << compound_order(page) : 1;
 	unsigned long size = nr * PAGE_SIZE;
 
+	/*
+	 * First accumulate quantities that depend only on |size| and the type
+	 * of the compound page.
+	 */
 	if (PageAnon(page)) {
 		mss->anonymous += size;
 		if (!PageSwapBacked(page) && !dirty && !PageDirty(page))
@@ -444,42 +537,26 @@
 		mss->referenced += size;
 
 	/*
+	 * Then accumulate quantities that may depend on sharing, or that may
+	 * differ page-by-page.
+	 *
 	 * page_count(page) == 1 guarantees the page is mapped exactly once.
 	 * If any subpage of the compound page mapped with PTE it would elevate
 	 * page_count().
 	 */
 	if (page_count(page) == 1) {
-		if (dirty || PageDirty(page))
-			mss->private_dirty += size;
-		else
-			mss->private_clean += size;
-		mss->pss += (u64)size << PSS_SHIFT;
-		if (locked)
-			mss->pss_locked += (u64)size << PSS_SHIFT;
+		smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
+				      locked, true);
 		return;
 	}
-
 	for (i = 0; i < nr; i++, page++) {
 		int mapcount = page_mapcount(page);
-		unsigned long pss = (PAGE_SIZE << PSS_SHIFT);
+		bool private = mapcount < 2;
+		unsigned long pss = private ? PAGE_SIZE << PSS_SHIFT :
+				    (PAGE_SIZE << PSS_SHIFT) / mapcount;
 
-		if (mapcount >= 2) {
-			if (dirty || PageDirty(page))
-				mss->shared_dirty += PAGE_SIZE;
-			else
-				mss->shared_clean += PAGE_SIZE;
-			mss->pss += pss / mapcount;
-			if (locked)
-				mss->pss_locked += pss / mapcount;
-		} else {
-			if (dirty || PageDirty(page))
-				mss->private_dirty += PAGE_SIZE;
-			else
-				mss->private_clean += PAGE_SIZE;
-			mss->pss += pss;
-			if (locked)
-				mss->pss_locked += pss;
-		}
+		smaps_page_accumulate(mss, page, PAGE_SIZE, pss,
+				      dirty, locked, private);
 	}
 }
 
@@ -758,10 +835,21 @@
 		seq_put_decimal_ull_width(m, str, (val) >> 10, 8)
 
 /* Show the contents common for smaps and smaps_rollup */
-static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss)
+static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
+	bool rollup_mode)
 {
 	SEQ_PUT_DEC("Rss:            ", mss->resident);
 	SEQ_PUT_DEC(" kB\nPss:            ", mss->pss >> PSS_SHIFT);
+	if (rollup_mode) {
+		// These are meaningful only for smaps_rollup, otherwise two of
+		// them are zero, and the other is the same as Pss.
+		SEQ_PUT_DEC(" kB\nPss_Anon:       ",
+			    mss->pss_anon >> PSS_SHIFT);
+		SEQ_PUT_DEC(" kB\nPss_File:       ",
+			    mss->pss_file >> PSS_SHIFT);
+		SEQ_PUT_DEC(" kB\nPss_Shmem:      ",
+			    mss->pss_shmem >> PSS_SHIFT);
+	}
 	SEQ_PUT_DEC(" kB\nShared_Clean:   ", mss->shared_clean);
 	SEQ_PUT_DEC(" kB\nShared_Dirty:   ", mss->shared_dirty);
 	SEQ_PUT_DEC(" kB\nPrivate_Clean:  ", mss->private_clean);
@@ -792,13 +880,18 @@
 	smap_gather_stats(vma, &mss);
 
 	show_map_vma(m, vma);
+	if (vma_get_anon_name(vma)) {
+		seq_puts(m, "Name:           ");
+		seq_print_vma_name(m, vma);
+		seq_putc(m, '\n');
+	}
 
 	SEQ_PUT_DEC("Size:           ", vma->vm_end - vma->vm_start);
 	SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma));
 	SEQ_PUT_DEC(" kB\nMMUPageSize:    ", vma_mmu_pagesize(vma));
 	seq_puts(m, " kB\n");
 
-	__show_smap(m, &mss);
+	__show_smap(m, &mss, false);
 
 	seq_printf(m, "THPeligible:    %d\n", transparent_hugepage_enabled(vma));
 
@@ -811,6 +904,84 @@
 	return 0;
 }
 
+static void add_smaps_sum(struct mem_size_stats *mss,
+		struct mem_size_stats *mss_sum)
+{
+	mss_sum->resident += mss->resident;
+	mss_sum->pss += mss->pss;
+	mss_sum->pss_anon += mss->pss_anon;
+	mss_sum->pss_file += mss->pss_file;
+	mss_sum->pss_shmem += mss->pss_shmem;
+	mss_sum->shared_clean += mss->shared_clean;
+	mss_sum->shared_dirty += mss->shared_dirty;
+	mss_sum->private_clean += mss->private_clean;
+	mss_sum->private_dirty += mss->private_dirty;
+	mss_sum->referenced += mss->referenced;
+	mss_sum->anonymous += mss->anonymous;
+	mss_sum->anonymous_thp += mss->anonymous_thp;
+	mss_sum->swap += mss->swap;
+}
+
+static int totmaps_proc_show(struct seq_file *m, void *data)
+{
+	struct proc_maps_private *priv = m->private;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	struct mem_size_stats *mss_sum = priv->mss;
+
+	/* reference to priv->task already taken */
+	/* but need to get the mm here because */
+	/* task could be in the process of exiting */
+	mm = get_task_mm(priv->task);
+	if (!mm || IS_ERR(mm))
+		return -EINVAL;
+
+	down_read(&mm->mmap_sem);
+	hold_task_mempolicy(priv);
+
+	for (vma = mm->mmap; vma != priv->tail_vma; vma = vma->vm_next) {
+		struct mem_size_stats mss;
+		struct mm_walk smaps_walk = {
+			.pmd_entry = smaps_pte_range,
+			.mm = vma->vm_mm,
+			.private = &mss,
+		};
+
+		if (vma->vm_mm && !is_vm_hugetlb_page(vma)) {
+			memset(&mss, 0, sizeof(mss));
+			walk_page_vma(vma, &smaps_walk);
+			add_smaps_sum(&mss, mss_sum);
+		}
+	}
+	seq_printf(m,
+		   "Rss:            %8lu kB\n"
+		   "Pss:            %8lu kB\n"
+		   "Shared_Clean:   %8lu kB\n"
+		   "Shared_Dirty:   %8lu kB\n"
+		   "Private_Clean:  %8lu kB\n"
+		   "Private_Dirty:  %8lu kB\n"
+		   "Referenced:     %8lu kB\n"
+		   "Anonymous:      %8lu kB\n"
+		   "AnonHugePages:  %8lu kB\n"
+		   "Swap:           %8lu kB\n",
+		   mss_sum->resident >> 10,
+		   (unsigned long)(mss_sum->pss >> (10 + PSS_SHIFT)),
+		   mss_sum->shared_clean  >> 10,
+		   mss_sum->shared_dirty  >> 10,
+		   mss_sum->private_clean >> 10,
+		   mss_sum->private_dirty >> 10,
+		   mss_sum->referenced >> 10,
+		   mss_sum->anonymous >> 10,
+		   mss_sum->anonymous_thp >> 10,
+		   mss_sum->swap >> 10);
+
+	release_task_mempolicy(priv);
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+
+	return 0;
+}
+
 static int show_smaps_rollup(struct seq_file *m, void *v)
 {
 	struct proc_maps_private *priv = m->private;
@@ -848,7 +1019,7 @@
 	seq_pad(m, ' ');
 	seq_puts(m, "[rollup]\n");
 
-	__show_smap(m, &mss);
+	__show_smap(m, &mss, true);
 
 	release_task_mempolicy(priv);
 	up_read(&mm->mmap_sem);
@@ -916,6 +1087,50 @@
 	return single_release(inode, file);
 }
 
+static int totmaps_open(struct inode *inode, struct file *file)
+{
+	struct proc_maps_private *priv;
+	int ret = -ENOMEM;
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (priv) {
+		priv->mss = kzalloc(sizeof(*priv->mss), GFP_KERNEL);
+		if (!priv->mss)
+			return -ENOMEM;
+
+		/* we need to grab references to the task_struct */
+		/* at open time, because there's a potential information */
+		/* leak where the totmaps file is opened and held open */
+		/* while the underlying pid to task mapping changes */
+		/* underneath it */
+		priv->task = get_pid_task(proc_pid(inode), PIDTYPE_PID);
+		if (!priv->task) {
+			kfree(priv->mss);
+			kfree(priv);
+			return -ESRCH;
+		}
+
+		ret = single_open(file, totmaps_proc_show, priv);
+		if (ret) {
+			put_task_struct(priv->task);
+			kfree(priv->mss);
+			kfree(priv);
+		}
+	}
+	return ret;
+}
+
+static int totmaps_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+	struct proc_maps_private *priv = m->private;
+
+	put_task_struct(priv->task);
+	kfree(priv->mss);
+	kfree(priv);
+	m->private = NULL;
+	return single_release(inode, file);
+}
+
 const struct file_operations proc_pid_smaps_operations = {
 	.open		= pid_smaps_open,
 	.read		= seq_read,
@@ -930,6 +1145,13 @@
 	.release	= smaps_rollup_release,
 };
 
+const struct file_operations proc_totmaps_operations = {
+	.open		= totmaps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= totmaps_release,
+};
+
 enum clear_refs_types {
 	CLEAR_REFS_ALL = 1,
 	CLEAR_REFS_ANON,

diff --git a/fs/proc/uid.c b/fs/proc/uid.c
new file mode 100644
index 0000000..ae720b9
--- /dev/null
+++ b/fs/proc/uid.c

@@ -0,0 +1,290 @@
+/*
+ * /proc/uid support
+ */
+
+#include <linux/fs.h>
+#include <linux/hashtable.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/rtmutex.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+static struct proc_dir_entry *proc_uid;
+
+#define UID_HASH_BITS 10
+
+static DECLARE_HASHTABLE(proc_uid_hash_table, UID_HASH_BITS);
+
+/*
+ * use rt_mutex here to avoid priority inversion between high-priority readers
+ * of these files and tasks calling proc_register_uid().
+ */
+static DEFINE_RT_MUTEX(proc_uid_lock); /* proc_uid_hash_table */
+
+struct uid_hash_entry {
+	uid_t uid;
+	struct hlist_node hash;
+};
+
+/* Caller must hold proc_uid_lock */
+static bool uid_hash_entry_exists_locked(uid_t uid)
+{
+	struct uid_hash_entry *entry;
+
+	hash_for_each_possible(proc_uid_hash_table, entry, hash, uid) {
+		if (entry->uid == uid)
+			return true;
+	}
+	return false;
+}
+
+void proc_register_uid(kuid_t kuid)
+{
+	struct uid_hash_entry *entry;
+	bool exists;
+	uid_t uid = from_kuid_munged(current_user_ns(), kuid);
+
+	rt_mutex_lock(&proc_uid_lock);
+	exists = uid_hash_entry_exists_locked(uid);
+	rt_mutex_unlock(&proc_uid_lock);
+	if (exists)
+		return;
+
+	entry = kzalloc(sizeof(struct uid_hash_entry), GFP_KERNEL);
+	if (!entry)
+		return;
+	entry->uid = uid;
+
+	rt_mutex_lock(&proc_uid_lock);
+	if (uid_hash_entry_exists_locked(uid))
+		kfree(entry);
+	else
+		hash_add(proc_uid_hash_table, &entry->hash, uid);
+	rt_mutex_unlock(&proc_uid_lock);
+}
+
+struct uid_entry {
+	const char *name;
+	int len;
+	umode_t mode;
+	const struct inode_operations *iop;
+	const struct file_operations *fop;
+};
+
+#define NOD(NAME, MODE, IOP, FOP) {			\
+	.name	= (NAME),				\
+	.len	= sizeof(NAME) - 1,			\
+	.mode	= MODE,					\
+	.iop	= IOP,					\
+	.fop	= FOP,					\
+}
+
+static const struct uid_entry uid_base_stuff[] = {};
+
+static const struct inode_operations proc_uid_def_inode_operations = {
+	.setattr	= proc_setattr,
+};
+
+static struct inode *proc_uid_make_inode(struct super_block *sb, kuid_t kuid)
+{
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (!inode)
+		return NULL;
+
+	inode->i_ino = get_next_ino();
+	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+	inode->i_op = &proc_uid_def_inode_operations;
+	inode->i_uid = kuid;
+
+	return inode;
+}
+
+static struct dentry *proc_uident_instantiate(struct dentry *dentry,
+				   struct task_struct *unused, const void *ptr)
+{
+	const struct uid_entry *u = ptr;
+	struct inode *inode;
+
+	uid_t uid = name_to_int(&dentry->d_name);
+	kuid_t kuid;
+	bool uid_exists;
+	rt_mutex_lock(&proc_uid_lock);
+	uid_exists = uid_hash_entry_exists_locked(uid);
+	rt_mutex_unlock(&proc_uid_lock);
+	if (uid_exists) {
+		kuid = make_kuid(current_user_ns(), uid);
+		inode = proc_uid_make_inode(dentry->d_sb, kuid);
+		if (!inode)
+			return ERR_PTR(-ENOENT);
+	} else {
+		return ERR_PTR(-ENOENT);
+	}
+
+	inode->i_mode = u->mode;
+	if (S_ISDIR(inode->i_mode))
+		set_nlink(inode, 2);
+	if (u->iop)
+		inode->i_op = u->iop;
+	if (u->fop)
+		inode->i_fop = u->fop;
+
+	return d_splice_alias(inode, dentry);
+}
+
+static struct dentry *proc_uid_base_lookup(struct inode *dir,
+					   struct dentry *dentry,
+					   unsigned int flags)
+{
+	const struct uid_entry *u, *last;
+	unsigned int nents = ARRAY_SIZE(uid_base_stuff);
+
+	if (nents == 0)
+		return ERR_PTR(-ENOENT);
+
+	last = &uid_base_stuff[nents - 1];
+	for (u = uid_base_stuff; u <= last; u++) {
+		if (u->len != dentry->d_name.len)
+			continue;
+		if (!memcmp(dentry->d_name.name, u->name, u->len))
+			break;
+	}
+	if (u > last)
+		return ERR_PTR(-ENOENT);
+
+	return proc_uident_instantiate(dentry, NULL, u);
+}
+
+static int proc_uid_base_readdir(struct file *file, struct dir_context *ctx)
+{
+	unsigned int nents = ARRAY_SIZE(uid_base_stuff);
+	const struct uid_entry *u;
+
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+
+	if (ctx->pos >= nents + 2)
+		return 0;
+
+	for (u = uid_base_stuff + (ctx->pos - 2);
+	     u < uid_base_stuff + nents; u++) {
+		if (!proc_fill_cache(file, ctx, u->name, u->len,
+				     proc_uident_instantiate, NULL, u))
+			break;
+		ctx->pos++;
+	}
+
+	return 0;
+}
+
+static const struct inode_operations proc_uid_base_inode_operations = {
+	.lookup		= proc_uid_base_lookup,
+	.setattr	= proc_setattr,
+};
+
+static const struct file_operations proc_uid_base_operations = {
+	.read		= generic_read_dir,
+	.iterate	= proc_uid_base_readdir,
+	.llseek		= default_llseek,
+};
+
+static struct dentry *proc_uid_instantiate(struct dentry *dentry,
+				struct task_struct *unused, const void *ptr)
+{
+	unsigned int i, len;
+	nlink_t nlinks;
+	kuid_t *kuid = (kuid_t *)ptr;
+	struct inode *inode = proc_uid_make_inode(dentry->d_sb, *kuid);
+
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+
+	inode->i_mode = S_IFDIR | 0555;
+	inode->i_op = &proc_uid_base_inode_operations;
+	inode->i_fop = &proc_uid_base_operations;
+	inode->i_flags |= S_IMMUTABLE;
+
+	nlinks = 2;
+	len = ARRAY_SIZE(uid_base_stuff);
+	for (i = 0; i < len; ++i) {
+		if (S_ISDIR(uid_base_stuff[i].mode))
+			++nlinks;
+	}
+	set_nlink(inode, nlinks);
+
+	return d_splice_alias(inode, dentry);
+}
+
+static int proc_uid_readdir(struct file *file, struct dir_context *ctx)
+{
+	int last_shown, i;
+	unsigned long bkt;
+	struct uid_hash_entry *entry;
+
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+
+	i = 0;
+	last_shown = ctx->pos - 2;
+	rt_mutex_lock(&proc_uid_lock);
+	hash_for_each(proc_uid_hash_table, bkt, entry, hash) {
+		int len;
+		char buf[PROC_NUMBUF];
+
+		if (i < last_shown)
+			continue;
+		len = snprintf(buf, sizeof(buf), "%u", entry->uid);
+		if (!proc_fill_cache(file, ctx, buf, len,
+				     proc_uid_instantiate, NULL, &entry->uid))
+			break;
+		i++;
+		ctx->pos++;
+	}
+	rt_mutex_unlock(&proc_uid_lock);
+	return 0;
+}
+
+static struct dentry *proc_uid_lookup(struct inode *dir, struct dentry *dentry,
+				      unsigned int flags)
+{
+	int result = -ENOENT;
+
+	uid_t uid = name_to_int(&dentry->d_name);
+	bool uid_exists;
+
+	rt_mutex_lock(&proc_uid_lock);
+	uid_exists = uid_hash_entry_exists_locked(uid);
+	rt_mutex_unlock(&proc_uid_lock);
+	if (uid_exists) {
+		kuid_t kuid = make_kuid(current_user_ns(), uid);
+
+		return proc_uid_instantiate(dentry, NULL, &kuid);
+	}
+	return ERR_PTR(result);
+}
+
+static const struct file_operations proc_uid_operations = {
+	.read		= generic_read_dir,
+	.iterate	= proc_uid_readdir,
+	.llseek		= default_llseek,
+};
+
+static const struct inode_operations proc_uid_inode_operations = {
+	.lookup		= proc_uid_lookup,
+	.setattr	= proc_setattr,
+};
+
+int __init proc_uid_init(void)
+{
+	proc_uid = proc_mkdir("uid", NULL);
+	if (!proc_uid)
+		return -ENOMEM;
+	proc_uid->proc_iops = &proc_uid_inode_operations;
+	proc_uid->proc_fops = &proc_uid_operations;
+
+	return 0;
+}

diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 0037aea..2946713 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c

@@ -2250,7 +2250,8 @@
 	/* also releases the path */
 	unfix_nodes(&s_ins_balance);
 #ifdef REISERQUOTA_DEBUG
-	reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
+	if (inode)
+		reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
 		       "reiserquota insert_item(): freeing %u id=%u type=%c",
 		       quota_bytes, inode->i_uid, head2type(ih));
 #endif

diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 6280efe..de5eda3 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c

@@ -1954,7 +1954,7 @@
 		if (!sbi->s_jdev) {
 			SWARN(silent, s, "", "Cannot allocate memory for "
 				"journal device name");
-			goto error;
+			goto error_unlocked;
 		}
 	}
 #ifdef CONFIG_QUOTA

diff --git a/fs/sync.c b/fs/sync.c
index b54e054..055daab 100644
--- a/fs/sync.c
+++ b/fs/sync.c

@@ -9,7 +9,7 @@
 #include <linux/slab.h>
 #include <linux/export.h>
 #include <linux/namei.h>
-#include <linux/sched.h>
+#include <linux/sched/xacct.h>
 #include <linux/writeback.h>
 #include <linux/syscalls.h>
 #include <linux/linkage.h>
@@ -220,6 +220,7 @@
 	if (f.file) {
 		ret = vfs_fsync(f.file, datasync);
 		fdput(f);
+		inc_syscfs(current);
 	}
 	return ret;
 }

diff --git a/fs/udf/super.c b/fs/udf/super.c
index 6fd0f14..1676a17 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c

@@ -2469,17 +2469,29 @@
 static unsigned int udf_count_free(struct super_block *sb)
 {
 	unsigned int accum = 0;
-	struct udf_sb_info *sbi;
+	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct udf_part_map *map;
+	unsigned int part = sbi->s_partition;
+	int ptype = sbi->s_partmaps[part].s_partition_type;
 
-	sbi = UDF_SB(sb);
+	if (ptype == UDF_METADATA_MAP25) {
+		part = sbi->s_partmaps[part].s_type_specific.s_metadata.
+							s_phys_partition_ref;
+	} else if (ptype == UDF_VIRTUAL_MAP15 || ptype == UDF_VIRTUAL_MAP20) {
+		/*
+		 * Filesystems with VAT are append-only and we cannot write to
+ 		 * them. Let's just report 0 here.
+		 */
+		return 0;
+	}
+
 	if (sbi->s_lvid_bh) {
 		struct logicalVolIntegrityDesc *lvid =
 			(struct logicalVolIntegrityDesc *)
 			sbi->s_lvid_bh->b_data;
-		if (le32_to_cpu(lvid->numOfPartitions) > sbi->s_partition) {
+		if (le32_to_cpu(lvid->numOfPartitions) > part) {
 			accum = le32_to_cpu(
-					lvid->freeSpaceTable[sbi->s_partition]);
+					lvid->freeSpaceTable[part]);
 			if (accum == 0xFFFFFFFF)
 				accum = 0;
 		}
@@ -2488,7 +2500,7 @@
 	if (accum)
 		return accum;
 
-	map = &sbi->s_partmaps[sbi->s_partition];
+	map = &sbi->s_partmaps[part];
 	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) {
 		accum += udf_count_free_bitmap(sb,
 					       map->s_uspace.s_bitmap);

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index d269d11..0e89a6d 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c

@@ -913,7 +913,9 @@
 					 new_flags, vma->anon_vma,
 					 vma->vm_file, vma->vm_pgoff,
 					 vma_policy(vma),
-					 NULL_VM_UFFD_CTX);
+					 NULL_VM_UFFD_CTX,
+					 vma_get_anon_name(vma));
+
 			if (prev)
 				vma = prev;
 			else
@@ -1463,7 +1465,8 @@
 		prev = vma_merge(mm, prev, start, vma_end, new_flags,
 				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
 				 vma_policy(vma),
-				 ((struct vm_userfaultfd_ctx){ ctx }));
+				 ((struct vm_userfaultfd_ctx){ ctx }),
+				 vma_get_anon_name(vma));
 		if (prev) {
 			vma = prev;
 			goto next;
@@ -1625,7 +1628,8 @@
 		prev = vma_merge(mm, prev, start, vma_end, new_flags,
 				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
 				 vma_policy(vma),
-				 NULL_VM_UFFD_CTX);
+				 NULL_VM_UFFD_CTX,
+				 vma_get_anon_name(vma));
 		if (prev) {
 			vma = prev;
 			goto next;

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 2595496..3a56299 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c

@@ -1151,11 +1151,14 @@
 	struct file	*filp,
 	struct vm_area_struct *vma)
 {
+	struct dax_device 	*dax_dev;
+
+	dax_dev = xfs_find_daxdev_for_inode(file_inode(filp));
 	/*
-	 * We don't support synchronous mappings for non-DAX files. At least
-	 * until someone comes with a sensible use case.
+	 * We don't support synchronous mappings for non-DAX files and
+	 * for DAX files if underneath dax_device is not synchronous.
 	 */
-	if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
+	if (!daxdev_mapping_supported(vma, dax_dev))
 		return -EOPNOTSUPP;
 
 	file_accessed(filp);

diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h
index 66ceb12..2939a6c 100644
--- a/include/acpi/actypes.h
+++ b/include/acpi/actypes.h

@@ -528,11 +528,12 @@
 #define ACPI_MAKE_RSDP_SIG(dest)        (memcpy (ACPI_CAST_PTR (char, (dest)), ACPI_SIG_RSDP, 8))
 
 /*
- * Algorithm to obtain access bit width.
+ * Algorithm to obtain access bit or byte width.
  * Can be used with access_width of struct acpi_generic_address and access_size of
  * struct acpi_resource_generic_register.
  */
 #define ACPI_ACCESS_BIT_WIDTH(size)     (1 << ((size) + 2))
+#define ACPI_ACCESS_BYTE_WIDTH(size)    (1 << ((size) - 1))
 
 /*******************************************************************************
  *

diff --git a/include/linux/alt-syscall.h b/include/linux/alt-syscall.h
new file mode 100644
index 0000000..00f37c0
--- /dev/null
+++ b/include/linux/alt-syscall.h

@@ -0,0 +1,59 @@
+#ifndef _ALT_SYSCALL_H
+#define _ALT_SYSCALL_H
+
+#include <linux/errno.h>
+
+#ifdef CONFIG_ALT_SYSCALL
+
+#include <linux/list.h>
+#include <asm/syscall.h>
+
+#define ALT_SYS_CALL_NAME_MAX	32
+
+struct alt_sys_call_table {
+	char name[ALT_SYS_CALL_NAME_MAX + 1];
+	sys_call_ptr_t *table;
+	int size;
+#ifdef CONFIG_IA32_EMULATION
+	sys_call_ptr_t *compat_table;
+	int compat_size;
+#endif
+	struct list_head node;
+};
+
+/*
+ * arch_dup_sys_call_table should return the default syscall table, not
+ * the current syscall table, since we want to explicitly not allow
+ * syscall table composition. A selected syscall table should be treated
+ * as a single execution personality.
+ */
+
+int arch_dup_sys_call_table(struct alt_sys_call_table *table);
+int arch_set_sys_call_table(struct alt_sys_call_table *table);
+
+int register_alt_sys_call_table(struct alt_sys_call_table *table);
+int set_alt_sys_call_table(char __user *name);
+
+#else
+
+struct alt_sys_call_table;
+
+static inline int arch_dup_sys_call_table(struct alt_sys_call_table *table)
+{
+	return -ENOSYS;
+}
+static inline int arch_set_sys_call_table(struct alt_sys_call_table *table)
+{
+	return -ENOSYS;
+}
+static inline int register_alt_sys_call_table(struct alt_sys_call_table *table)
+{
+	return -ENOSYS;
+}
+static inline int set_alt_sys_call_table(char __user *name)
+{
+	return -ENOSYS;
+}
+#endif
+
+#endif /* _ALT_SYSCALL_H */

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 9334fbe..bea2b15 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h

@@ -85,6 +85,17 @@
 	u32				op;
 };
 
+struct audit_task_info {
+	kuid_t			loginuid;
+	unsigned int		sessionid;
+	u64			contid;
+#ifdef CONFIG_AUDITSYSCALL
+	struct audit_context	*ctx;
+#endif
+};
+
+extern struct audit_task_info init_struct_audit;
+
 extern int is_audit_feature_set(int which);
 
 extern int __init audit_register_class(int class, unsigned *list);
@@ -123,6 +134,9 @@
 #ifdef CONFIG_AUDIT
 /* These are defined in audit.c */
 				/* Public API */
+extern int  audit_alloc(struct task_struct *task);
+extern void audit_free(struct task_struct *task);
+extern void __init audit_task_init(void);
 extern __printf(4, 5)
 void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
 	       const char *fmt, ...);
@@ -162,8 +176,39 @@
 extern int audit_rule_change(int type, int seq, void *data, size_t datasz);
 extern int audit_list_rules_send(struct sk_buff *request_skb, int seq);
 
+static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
+{
+	if (!tsk->audit)
+		return INVALID_UID;
+	return tsk->audit->loginuid;
+}
+
+static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
+{
+	if (!tsk->audit)
+		return AUDIT_SID_UNSET;
+	return tsk->audit->sessionid;
+}
+
+extern int audit_set_contid(struct task_struct *tsk, u64 contid);
+
+static inline u64 audit_get_contid(struct task_struct *tsk)
+{
+	if (!tsk->audit)
+		return AUDIT_CID_UNSET;
+	return tsk->audit->contid;
+}
+
 extern u32 audit_enabled;
 #else /* CONFIG_AUDIT */
+static inline int audit_alloc(struct task_struct *task)
+{
+	return 0;
+}
+static inline void audit_free(struct task_struct *task)
+{ }
+static inline void __init audit_task_init(void)
+{ }
 static inline __printf(4, 5)
 void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
 	       const char *fmt, ...)
@@ -205,6 +250,12 @@
 static inline void audit_log_task_info(struct audit_buffer *ab,
 				       struct task_struct *tsk)
 { }
+
+static inline u64 audit_get_contid(struct task_struct *tsk)
+{
+	return AUDIT_CID_UNSET;
+}
+
 #define audit_enabled AUDIT_OFF
 #endif /* CONFIG_AUDIT */
 
@@ -219,8 +270,6 @@
 
 /* These are defined in auditsc.c */
 				/* Public API */
-extern int  audit_alloc(struct task_struct *task);
-extern void __audit_free(struct task_struct *task);
 extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1,
 				  unsigned long a2, unsigned long a3);
 extern void __audit_syscall_exit(int ret_success, long ret_value);
@@ -242,12 +291,14 @@
 
 static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx)
 {
-	task->audit_context = ctx;
+	task->audit->ctx = ctx;
 }
 
 static inline struct audit_context *audit_context(void)
 {
-	return current->audit_context;
+	if (!current->audit)
+		return NULL;
+	return current->audit->ctx;
 }
 
 static inline bool audit_dummy_context(void)
@@ -255,11 +306,7 @@
 	void *p = audit_context();
 	return !p || *(int *)p;
 }
-static inline void audit_free(struct task_struct *task)
-{
-	if (unlikely(task->audit_context))
-		__audit_free(task);
-}
+
 static inline void audit_syscall_entry(int major, unsigned long a0,
 				       unsigned long a1, unsigned long a2,
 				       unsigned long a3)
@@ -329,16 +376,6 @@
 			      struct timespec64 *t, unsigned int *serial);
 extern int audit_set_loginuid(kuid_t loginuid);
 
-static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
-{
-	return tsk->loginuid;
-}
-
-static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
-{
-	return tsk->sessionid;
-}
-
 extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
 extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
 extern void __audit_bprm(struct linux_binprm *bprm);
@@ -461,12 +498,6 @@
 extern int audit_n_rules;
 extern int audit_signals;
 #else /* CONFIG_AUDITSYSCALL */
-static inline int audit_alloc(struct task_struct *task)
-{
-	return 0;
-}
-static inline void audit_free(struct task_struct *task)
-{ }
 static inline void audit_syscall_entry(int major, unsigned long a0,
 				       unsigned long a1, unsigned long a2,
 				       unsigned long a3)
@@ -595,6 +626,16 @@
 	return uid_valid(audit_get_loginuid(tsk));
 }
 
+static inline bool audit_contid_valid(u64 contid)
+{
+	return contid != AUDIT_CID_UNSET;
+}
+
+static inline bool audit_contid_set(struct task_struct *tsk)
+{
+	return audit_contid_valid(audit_get_contid(tsk));
+}
+
 static inline void audit_log_string(struct audit_buffer *ab, const char *buf)
 {
 	audit_log_n_string(ab, buf, strlen(buf));

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6e67aeb..63c9f44 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h

@@ -538,7 +538,7 @@
 	/*
 	 * mq queue kobject
 	 */
-	struct kobject mq_kobj;
+	struct kobject *mq_kobj;
 
 #ifdef  CONFIG_BLK_DEV_INTEGRITY
 	struct blk_integrity integrity;

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index acb77dcf..8996c09 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h

@@ -21,6 +21,10 @@
 SUBSYS(cpuacct)
 #endif
 
+#if IS_ENABLED(CONFIG_SCHED_TUNE)
+SUBSYS(schedtune)
+#endif
+
 #if IS_ENABLED(CONFIG_BLK_CGROUP)
 SUBSYS(io)
 #endif

diff --git a/include/linux/clk.h b/include/linux/clk.h
index 4f750c4..c705271d 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h

@@ -312,7 +312,26 @@
  */
 int __must_check clk_bulk_get(struct device *dev, int num_clks,
 			      struct clk_bulk_data *clks);
-
+/**
+ * clk_bulk_get_all - lookup and obtain all available references to clock
+ *		      producer.
+ * @dev: device for clock "consumer"
+ * @clks: pointer to the clk_bulk_data table of consumer
+ *
+ * This helper function allows drivers to get all clk consumers in one
+ * operation. If any of the clk cannot be acquired then any clks
+ * that were obtained will be freed before returning to the caller.
+ *
+ * Returns a positive value for the number of clocks obtained while the
+ * clock references are stored in the clk_bulk_data table in @clks field.
+ * Returns 0 if there're none and a negative value if something failed.
+ *
+ * Drivers must assume that the clock source is not enabled.
+ *
+ * clk_bulk_get should not be called from within interrupt context.
+ */
+int __must_check clk_bulk_get_all(struct device *dev,
+				  struct clk_bulk_data **clks);
 /**
  * devm_clk_bulk_get - managed get multiple clk consumers
  * @dev: device for clock "consumer"
@@ -327,6 +346,22 @@
  */
 int __must_check devm_clk_bulk_get(struct device *dev, int num_clks,
 				   struct clk_bulk_data *clks);
+/**
+ * devm_clk_bulk_get_all - managed get multiple clk consumers
+ * @dev: device for clock "consumer"
+ * @clks: pointer to the clk_bulk_data table of consumer
+ *
+ * Returns a positive value for the number of clocks obtained while the
+ * clock references are stored in the clk_bulk_data table in @clks field.
+ * Returns 0 if there're none and a negative value if something failed.
+ *
+ * This helper function allows drivers to get several clk
+ * consumers in one operation with management, the clks will
+ * automatically be freed when the device is unbound.
+ */
+
+int __must_check devm_clk_bulk_get_all(struct device *dev,
+				       struct clk_bulk_data **clks);
 
 /**
  * devm_clk_get - lookup and obtain a managed reference to a clock producer.
@@ -488,6 +523,19 @@
 void clk_bulk_put(int num_clks, struct clk_bulk_data *clks);
 
 /**
+ * clk_bulk_put_all - "free" all the clock source
+ * @num_clks: the number of clk_bulk_data
+ * @clks: the clk_bulk_data table of consumer
+ *
+ * Note: drivers must ensure that all clk_bulk_enable calls made on this
+ * clock source are balanced by clk_bulk_disable calls prior to calling
+ * this function.
+ *
+ * clk_bulk_put_all should not be called from within interrupt context.
+ */
+void clk_bulk_put_all(int num_clks, struct clk_bulk_data *clks);
+
+/**
  * devm_clk_put	- "free" a managed clock source
  * @dev: device used to acquire the clock
  * @clk: clock source acquired with devm_clk_get()
@@ -642,6 +690,12 @@
 	return 0;
 }
 
+static inline int __must_check clk_bulk_get_all(struct device *dev,
+					 struct clk_bulk_data **clks)
+{
+	return 0;
+}
+
 static inline struct clk *devm_clk_get(struct device *dev, const char *id)
 {
 	return NULL;
@@ -653,6 +707,13 @@
 	return 0;
 }
 
+static inline int __must_check devm_clk_bulk_get_all(struct device *dev,
+						     struct clk_bulk_data **clks)
+{
+
+	return 0;
+}
+
 static inline struct clk *devm_get_clk_from_child(struct device *dev,
 				struct device_node *np, const char *con_id)
 {
@@ -663,6 +724,8 @@
 
 static inline void clk_bulk_put(int num_clks, struct clk_bulk_data *clks) {}
 
+static inline void clk_bulk_put_all(int num_clks, struct clk_bulk_data *clks) {}
+
 static inline void devm_clk_put(struct device *dev, struct clk *clk) {}
 
 

diff --git a/include/linux/compat.h b/include/linux/compat.h
index de0c13b..eb67e078 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h

@@ -993,6 +993,13 @@
 
 #endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */
 
+#ifdef CONFIG_ALT_SYSCALL
+
+int compat_ksys_clock_adjtime(clockid_t which_clock,
+			      struct compat_timex __user *tp);
+int compat_ksys_adjtimex(struct compat_timex __user *utp);
+
+#endif
 
 /*
  * For most but not all architectures, "am I in a compat syscall?" and

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 3361663..a885e9c 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h

@@ -338,14 +338,15 @@
 };
 
 /* flags */
-#define CPUFREQ_STICKY		(1 << 0)	/* driver isn't removed even if
-						   all ->init() calls failed */
-#define CPUFREQ_CONST_LOOPS	(1 << 1)	/* loops_per_jiffy or other
-						   kernel "constants" aren't
-						   affected by frequency
-						   transitions */
-#define CPUFREQ_PM_NO_WARN	(1 << 2)	/* don't warn on suspend/resume
-						   speed mismatches */
+
+/* driver isn't removed even if all ->init() calls failed */
+#define CPUFREQ_STICKY				BIT(0)
+
+/* loops_per_jiffy or other kernel "constants" aren't affected by frequency transitions */
+#define CPUFREQ_CONST_LOOPS			BIT(1)
+
+/* don't warn on suspend/resume speed mismatches */
+#define CPUFREQ_PM_NO_WARN			BIT(2)
 
 /*
  * This should be set by platforms having multiple clock-domains, i.e.
@@ -353,14 +354,14 @@
  * be created in cpu/cpu<num>/cpufreq/ directory and so they can use the same
  * governor with different tunables for different clusters.
  */
-#define CPUFREQ_HAVE_GOVERNOR_PER_POLICY (1 << 3)
+#define CPUFREQ_HAVE_GOVERNOR_PER_POLICY	BIT(3)
 
 /*
  * Driver will do POSTCHANGE notifications from outside of their ->target()
  * routine and so must set cpufreq_driver->flags with this flag, so that core
  * can handle them specially.
  */
-#define CPUFREQ_ASYNC_NOTIFICATION  (1 << 4)
+#define CPUFREQ_ASYNC_NOTIFICATION		BIT(4)
 
 /*
  * Set by drivers which want cpufreq core to check if CPU is running at a
@@ -369,13 +370,13 @@
  * from the table. And if that fails, we will stop further boot process by
  * issuing a BUG_ON().
  */
-#define CPUFREQ_NEED_INITIAL_FREQ_CHECK	(1 << 5)
+#define CPUFREQ_NEED_INITIAL_FREQ_CHECK	BIT(5)
 
 /*
  * Set by drivers to disallow use of governors with "dynamic_switching" flag
  * set.
  */
-#define CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING (1 << 6)
+#define CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING	BIT(6)
 
 int cpufreq_register_driver(struct cpufreq_driver *driver_data);
 int cpufreq_unregister_driver(struct cpufreq_driver *driver_data);
@@ -931,6 +932,14 @@
 }
 #endif
 
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
+			struct cpufreq_governor *old_gov);
+#else
+static inline void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
+			struct cpufreq_governor *old_gov) { }
+#endif
+
 extern void arch_freq_prepare_all(void);
 extern unsigned int arch_freq_get_on_cpu(int cpu);
 

diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 317aeca..8ccffba8 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h

@@ -220,7 +220,7 @@
 #endif
 
 /* kernel/sched/idle.c */
-extern void sched_idle_set_state(struct cpuidle_state *idle_state);
+extern void sched_idle_set_state(struct cpuidle_state *idle_state, int index);
 extern void default_idle_call(void);
 
 #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED

diff --git a/include/linux/dax.h b/include/linux/dax.h
index 450b28d..836bd3f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h

@@ -7,6 +7,9 @@
 #include <linux/radix-tree.h>
 #include <asm/pgtable.h>
 
+/* Flag for synchronous flush */
+#define DAXDEV_F_SYNC (1UL << 0)
+
 struct iomap_ops;
 struct dax_device;
 struct dax_operations {
@@ -17,6 +20,12 @@
 	 */
 	long (*direct_access)(struct dax_device *, pgoff_t, long,
 			void **, pfn_t *);
+	/*
+	 * Validate whether this device is usable as an fsdax backing
+	 * device.
+	 */
+	bool (*dax_supported)(struct dax_device *, struct block_device *, int,
+			sector_t, sector_t);
 	/* copy_from_iter: required operation for fs-dax direct-i/o */
 	size_t (*copy_from_iter)(struct dax_device *, pgoff_t, void *, size_t,
 			struct iov_iter *);
@@ -30,18 +39,40 @@
 #if IS_ENABLED(CONFIG_DAX)
 struct dax_device *dax_get_by_host(const char *host);
 struct dax_device *alloc_dax(void *private, const char *host,
-		const struct dax_operations *ops);
+		const struct dax_operations *ops, unsigned long flags);
 void put_dax(struct dax_device *dax_dev);
 void kill_dax(struct dax_device *dax_dev);
 void dax_write_cache(struct dax_device *dax_dev, bool wc);
 bool dax_write_cache_enabled(struct dax_device *dax_dev);
+bool __dax_synchronous(struct dax_device *dax_dev);
+static inline bool dax_synchronous(struct dax_device *dax_dev)
+{
+	return  __dax_synchronous(dax_dev);
+}
+void __set_dax_synchronous(struct dax_device *dax_dev);
+static inline void set_dax_synchronous(struct dax_device *dax_dev)
+{
+	__set_dax_synchronous(dax_dev);
+}
+/*
+ * Check if given mapping is supported by the file / underlying device.
+ */
+static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
+					     struct dax_device *dax_dev)
+{
+	if (!(vma->vm_flags & VM_SYNC))
+		return true;
+	if (!IS_DAX(file_inode(vma->vm_file)))
+		return false;
+	return dax_synchronous(dax_dev);
+}
 #else
 static inline struct dax_device *dax_get_by_host(const char *host)
 {
 	return NULL;
 }
 static inline struct dax_device *alloc_dax(void *private, const char *host,
-		const struct dax_operations *ops)
+		const struct dax_operations *ops, unsigned long flags)
 {
 	/*
 	 * Callers should check IS_ENABLED(CONFIG_DAX) to know if this
@@ -62,6 +93,18 @@
 {
 	return false;
 }
+static inline bool dax_synchronous(struct dax_device *dax_dev)
+{
+	return true;
+}
+static inline void set_dax_synchronous(struct dax_device *dax_dev)
+{
+}
+static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
+				struct dax_device *dax_dev)
+{
+	return !(vma->vm_flags & VM_SYNC);
+}
 #endif
 
 struct writeback_control;
@@ -73,6 +116,17 @@
 	return __bdev_dax_supported(bdev, blocksize);
 }
 
+bool __generic_fsdax_supported(struct dax_device *dax_dev,
+		struct block_device *bdev, int blocksize, sector_t start,
+		sector_t sectors);
+static inline bool generic_fsdax_supported(struct dax_device *dax_dev,
+		struct block_device *bdev, int blocksize, sector_t start,
+		sector_t sectors)
+{
+	return __generic_fsdax_supported(dax_dev, bdev, blocksize, start,
+			sectors);
+}
+
 static inline struct dax_device *fs_dax_get_by_host(const char *host)
 {
 	return dax_get_by_host(host);
@@ -97,6 +151,13 @@
 	return false;
 }
 
+static inline bool generic_fsdax_supported(struct dax_device *dax_dev,
+		struct block_device *bdev, int blocksize, sector_t start,
+		sector_t sectors)
+{
+	return false;
+}
+
 static inline struct dax_device *fs_dax_get_by_host(const char *host)
 {
 	return NULL;
@@ -140,6 +201,8 @@
 void *dax_get_private(struct dax_device *dax_dev);
 long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
 		void **kaddr, pfn_t *pfn);
+bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
+		int blocksize, sector_t start, sector_t len);
 size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
 		size_t bytes, struct iov_iter *i);
 size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 0880bae..bd19969 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h

@@ -146,6 +146,7 @@
 	struct vfsmount *(*d_automount)(struct path *);
 	int (*d_manage)(const struct path *, bool);
 	struct dentry *(*d_real)(struct dentry *, const struct inode *);
+	void (*d_canonical_path)(const struct path *, struct path *);
 } ____cacheline_aligned;
 
 /*

diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 91f9f95..9f93f3ee 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h

@@ -10,6 +10,7 @@
 
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/dm-ioctl.h>
 #include <linux/math64.h>
 #include <linux/ratelimit.h>
 
@@ -425,6 +426,14 @@
 			  sector_t start);
 union map_info *dm_get_rq_mapinfo(struct request *rq);
 
+/*
+ * Device mapper functions to parse and create devices specified by the
+ * parameter "dm-mod.create="
+ */
+int __init dm_early_create(struct dm_ioctl *dmi,
+			   struct dm_target_spec **spec_array,
+			   char **target_params_array);
+
 struct queue_limits *dm_get_queue_limits(struct mapped_device *md);
 
 /*

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 0647f43..50128c3 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h

@@ -686,6 +686,7 @@
  * @fill_align: alignment shift for memset operations
  * @dev_id: unique device ID
  * @dev: struct device reference for dma mapping api
+ * @owner: owner module (automatically set based on the provided dev)
  * @src_addr_widths: bit mask of src addr widths the device supports
  *	Width is specified in bytes, e.g. for a device supporting
  *	a width of 4 the mask should have BIT(4) set.
@@ -749,6 +750,7 @@
 
 	int dev_id;
 	struct device *dev;
+	struct module *owner;
 
 	u32 src_addr_widths;
 	u32 dst_addr_widths;

diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
index b3419da..2fd8006 100644
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h

@@ -2,7 +2,7 @@
 #ifndef _DYNAMIC_DEBUG_H
 #define _DYNAMIC_DEBUG_H
 
-#if defined(CONFIG_JUMP_LABEL)
+#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL)
 #include <linux/jump_label.h>
 #endif
 
@@ -38,7 +38,7 @@
 #define _DPRINTK_FLAGS_DEFAULT 0
 #endif
 	unsigned int flags:8;
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 	union {
 		struct static_key_true dd_key_true;
 		struct static_key_false dd_key_false;
@@ -83,7 +83,7 @@
 		dd_key_init(key, init)				\
 	}
 
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 
 #define dd_key_init(key, init) key = (init)
 

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
new file mode 100644
index 0000000..aa027f7
--- /dev/null
+++ b/include/linux/energy_model.h

@@ -0,0 +1,187 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_ENERGY_MODEL_H
+#define _LINUX_ENERGY_MODEL_H
+#include <linux/cpumask.h>
+#include <linux/jump_label.h>
+#include <linux/kobject.h>
+#include <linux/rcupdate.h>
+#include <linux/sched/cpufreq.h>
+#include <linux/sched/topology.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_ENERGY_MODEL
+/**
+ * em_cap_state - Capacity state of a performance domain
+ * @frequency:	The CPU frequency in KHz, for consistency with CPUFreq
+ * @power:	The power consumed by 1 CPU at this level, in milli-watts
+ * @cost:	The cost coefficient associated with this level, used during
+ *		energy calculation. Equal to: power * max_frequency / frequency
+ */
+struct em_cap_state {
+	unsigned long frequency;
+	unsigned long power;
+	unsigned long cost;
+};
+
+/**
+ * em_perf_domain - Performance domain
+ * @table:		List of capacity states, in ascending order
+ * @nr_cap_states:	Number of capacity states
+ * @cpus:		Cpumask covering the CPUs of the domain
+ *
+ * A "performance domain" represents a group of CPUs whose performance is
+ * scaled together. All CPUs of a performance domain must have the same
+ * micro-architecture. Performance domains often have a 1-to-1 mapping with
+ * CPUFreq policies.
+ */
+struct em_perf_domain {
+	struct em_cap_state *table;
+	int nr_cap_states;
+	unsigned long cpus[0];
+};
+
+#define EM_CPU_MAX_POWER 0xFFFF
+
+struct em_data_callback {
+	/**
+	 * active_power() - Provide power at the next capacity state of a CPU
+	 * @power	: Active power at the capacity state in mW (modified)
+	 * @freq	: Frequency at the capacity state in kHz (modified)
+	 * @cpu		: CPU for which we do this operation
+	 *
+	 * active_power() must find the lowest capacity state of 'cpu' above
+	 * 'freq' and update 'power' and 'freq' to the matching active power
+	 * and frequency.
+	 *
+	 * The power is the one of a single CPU in the domain, expressed in
+	 * milli-watts. It is expected to fit in the [0, EM_CPU_MAX_POWER]
+	 * range.
+	 *
+	 * Return 0 on success.
+	 */
+	int (*active_power)(unsigned long *power, unsigned long *freq, int cpu);
+};
+#define EM_DATA_CB(_active_power_cb) { .active_power = &_active_power_cb }
+
+struct em_perf_domain *em_cpu_get(int cpu);
+int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
+						struct em_data_callback *cb);
+
+/**
+ * em_pd_energy() - Estimates the energy consumed by the CPUs of a perf. domain
+ * @pd		: performance domain for which energy has to be estimated
+ * @max_util	: highest utilization among CPUs of the domain
+ * @sum_util	: sum of the utilization of all CPUs in the domain
+ *
+ * Return: the sum of the energy consumed by the CPUs of the domain assuming
+ * a capacity state satisfying the max utilization of the domain.
+ */
+static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
+				unsigned long max_util, unsigned long sum_util)
+{
+	unsigned long freq, scale_cpu;
+	struct em_cap_state *cs;
+	int i, cpu;
+
+	/*
+	 * In order to predict the capacity state, map the utilization of the
+	 * most utilized CPU of the performance domain to a requested frequency,
+	 * like schedutil.
+	 */
+	cpu = cpumask_first(to_cpumask(pd->cpus));
+	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+	cs = &pd->table[pd->nr_cap_states - 1];
+	freq = map_util_freq(max_util, cs->frequency, scale_cpu);
+
+	/*
+	 * Find the lowest capacity state of the Energy Model above the
+	 * requested frequency.
+	 */
+	for (i = 0; i < pd->nr_cap_states; i++) {
+		cs = &pd->table[i];
+		if (cs->frequency >= freq)
+			break;
+	}
+
+	/*
+	 * The capacity of a CPU in the domain at that capacity state (cs)
+	 * can be computed as:
+	 *
+	 *             cs->freq * scale_cpu
+	 *   cs->cap = --------------------                          (1)
+	 *                 cpu_max_freq
+	 *
+	 * So, ignoring the costs of idle states (which are not available in
+	 * the EM), the energy consumed by this CPU at that capacity state is
+	 * estimated as:
+	 *
+	 *             cs->power * cpu_util
+	 *   cpu_nrg = --------------------                          (2)
+	 *                   cs->cap
+	 *
+	 * since 'cpu_util / cs->cap' represents its percentage of busy time.
+	 *
+	 *   NOTE: Although the result of this computation actually is in
+	 *         units of power, it can be manipulated as an energy value
+	 *         over a scheduling period, since it is assumed to be
+	 *         constant during that interval.
+	 *
+	 * By injecting (1) in (2), 'cpu_nrg' can be re-expressed as a product
+	 * of two terms:
+	 *
+	 *             cs->power * cpu_max_freq   cpu_util
+	 *   cpu_nrg = ------------------------ * ---------          (3)
+	 *                    cs->freq            scale_cpu
+	 *
+	 * The first term is static, and is stored in the em_cap_state struct
+	 * as 'cs->cost'.
+	 *
+	 * Since all CPUs of the domain have the same micro-architecture, they
+	 * share the same 'cs->cost', and the same CPU capacity. Hence, the
+	 * total energy of the domain (which is the simple sum of the energy of
+	 * all of its CPUs) can be factorized as:
+	 *
+	 *            cs->cost * \Sum cpu_util
+	 *   pd_nrg = ------------------------                       (4)
+	 *                  scale_cpu
+	 */
+	return cs->cost * sum_util / scale_cpu;
+}
+
+/**
+ * em_pd_nr_cap_states() - Get the number of capacity states of a perf. domain
+ * @pd		: performance domain for which this must be done
+ *
+ * Return: the number of capacity states in the performance domain table
+ */
+static inline int em_pd_nr_cap_states(struct em_perf_domain *pd)
+{
+	return pd->nr_cap_states;
+}
+
+#else
+struct em_perf_domain {};
+struct em_data_callback {};
+#define EM_DATA_CB(_active_power_cb) { }
+
+static inline int em_register_perf_domain(cpumask_t *span,
+			unsigned int nr_states, struct em_data_callback *cb)
+{
+	return -EINVAL;
+}
+static inline struct em_perf_domain *em_cpu_get(int cpu)
+{
+	return NULL;
+}
+static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
+			unsigned long max_util, unsigned long sum_util)
+{
+	return 0;
+}
+static inline int em_pd_nr_cap_states(struct em_perf_domain *pd)
+{
+	return 0;
+}
+#endif
+
+#endif

diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index fd1ce10..61b7251 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h

@@ -210,12 +210,19 @@
 static inline void fsnotify_open(struct file *file)
 {
 	const struct path *path = &file->f_path;
+	struct path lower_path;
 	struct inode *inode = file_inode(file);
 	__u32 mask = FS_OPEN;
 
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_ISDIR;
 
+	if (path->dentry->d_op && path->dentry->d_op->d_canonical_path) {
+		path->dentry->d_op->d_canonical_path(path, &lower_path);
+		fsnotify_parent(&lower_path, NULL, mask);
+		fsnotify(lower_path.dentry->d_inode, mask, &lower_path, FSNOTIFY_EVENT_PATH, NULL, 0);
+		path_put(&lower_path);
+	}
 	fsnotify_parent(path, NULL, mask);
 	fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
 }

diff --git a/include/linux/hid.h b/include/linux/hid.h
index 8b3e5e8..8506637 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h

@@ -495,7 +495,7 @@
 };
 
 #define HID_MIN_BUFFER_SIZE	64		/* make sure there is at least a packet size of space */
-#define HID_MAX_BUFFER_SIZE	4096		/* 4kb */
+#define HID_MAX_BUFFER_SIZE	8192		/* 8kb */
 #define HID_CONTROL_FIFO_SIZE	256		/* to init devices with >100 reports */
 #define HID_OUTPUT_FIFO_SIZE	64
 

diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h
index 99bc5b3..733eaf9 100644
--- a/include/linux/intel-svm.h
+++ b/include/linux/intel-svm.h

@@ -130,7 +130,7 @@
 	BUG();
 }
 
-static int intel_svm_is_pasid_valid(struct device *dev, int pasid)
+static inline int intel_svm_is_pasid_valid(struct device *dev, int pasid)
 {
 	return -EINVAL;
 }

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 8301f1d..0924455 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h

@@ -188,7 +188,7 @@
 	IRQ_DOMAIN_FLAG_HIERARCHY	= (1 << 0),
 
 	/* Irq domain name was allocated in __irq_domain_add() */
-	IRQ_DOMAIN_NAME_ALLOCATED	= (1 << 6),
+	IRQ_DOMAIN_NAME_ALLOCATED	= (1 << 1),
 
 	/* Irq domain is an IPI domain with virq per cpu */
 	IRQ_DOMAIN_FLAG_IPI_PER_CPU	= (1 << 2),

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 4c3e776..1a0b6f1 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h

@@ -71,6 +71,10 @@
  * Additional babbling in: Documentation/static-keys.txt
  */
 
+#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL)
+# define HAVE_JUMP_LABEL
+#endif
+
 #ifndef __ASSEMBLY__
 
 #include <linux/types.h>
@@ -82,7 +86,7 @@
 				    "%s(): static key '%pS' used before call to jump_label_init()", \
 				    __func__, (key))
 
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 
 struct static_key {
 	atomic_t enabled;
@@ -110,10 +114,10 @@
 struct static_key {
 	atomic_t enabled;
 };
-#endif	/* CONFIG_JUMP_LABEL */
+#endif	/* HAVE_JUMP_LABEL */
 #endif /* __ASSEMBLY__ */
 
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 #include <asm/jump_label.h>
 #endif
 
@@ -126,7 +130,7 @@
 
 struct module;
 
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 
 #define JUMP_TYPE_FALSE		0UL
 #define JUMP_TYPE_TRUE		1UL
@@ -180,7 +184,7 @@
 	{ .enabled = { 0 },					\
 	  { .entries = (void *)JUMP_TYPE_FALSE } }
 
-#else  /* !CONFIG_JUMP_LABEL */
+#else  /* !HAVE_JUMP_LABEL */
 
 #include <linux/atomic.h>
 #include <linux/bug.h>
@@ -267,7 +271,7 @@
 #define STATIC_KEY_INIT_TRUE	{ .enabled = ATOMIC_INIT(1) }
 #define STATIC_KEY_INIT_FALSE	{ .enabled = ATOMIC_INIT(0) }
 
-#endif	/* CONFIG_JUMP_LABEL */
+#endif	/* HAVE_JUMP_LABEL */
 
 #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE
 #define jump_label_enabled static_key_enabled
@@ -331,7 +335,7 @@
 	static_key_count((struct static_key *)x) > 0;				\
 })
 
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 
 /*
  * Combine the right initial value (type) with the right branch order
@@ -413,12 +417,12 @@
 	unlikely(branch);							\
 })
 
-#else /* !CONFIG_JUMP_LABEL */
+#else /* !HAVE_JUMP_LABEL */
 
 #define static_branch_likely(x)		likely(static_key_enabled(&(x)->key))
 #define static_branch_unlikely(x)	unlikely(static_key_enabled(&(x)->key))
 
-#endif /* CONFIG_JUMP_LABEL */
+#endif /* HAVE_JUMP_LABEL */
 
 /*
  * Advanced usage; refcount, branch is enabled when: count != 0

diff --git a/include/linux/jump_label_ratelimit.h b/include/linux/jump_label_ratelimit.h
index a49f2b4..baa8eab 100644
--- a/include/linux/jump_label_ratelimit.h
+++ b/include/linux/jump_label_ratelimit.h

@@ -5,19 +5,21 @@
 #include <linux/jump_label.h>
 #include <linux/workqueue.h>
 
-#if defined(CONFIG_JUMP_LABEL)
+#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL)
 struct static_key_deferred {
 	struct static_key key;
 	unsigned long timeout;
 	struct delayed_work work;
 };
+#endif
 
+#ifdef HAVE_JUMP_LABEL
 extern void static_key_slow_dec_deferred(struct static_key_deferred *key);
 extern void static_key_deferred_flush(struct static_key_deferred *key);
 extern void
 jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl);
 
-#else	/* !CONFIG_JUMP_LABEL */
+#else	/* !HAVE_JUMP_LABEL */
 struct static_key_deferred {
 	struct static_key  key;
 };
@@ -36,5 +38,5 @@
 {
 	STATIC_KEY_CHECK_USE(key);
 }
-#endif	/* CONFIG_JUMP_LABEL */
+#endif	/* HAVE_JUMP_LABEL */
 #endif	/* _LINUX_JUMP_LABEL_RATELIMIT_H */

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index c196176..1577a2d 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h

@@ -56,6 +56,7 @@
 int kthread_stop(struct task_struct *k);
 bool kthread_should_stop(void);
 bool kthread_should_park(void);
+bool __kthread_should_park(struct task_struct *k);
 bool kthread_freezable_should_stop(bool *was_frozen);
 void *kthread_data(struct task_struct *k);
 void *kthread_probe_data(struct task_struct *k);

diff --git a/include/linux/libata.h b/include/linux/libata.h
index aff09d0..75a916d 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h

@@ -1236,6 +1236,7 @@
 };
 
 extern int pci_test_config_bits(struct pci_dev *pdev, const struct pci_bits *bits);
+extern void ata_pci_shutdown_one(struct pci_dev *pdev);
 extern void ata_pci_remove_one(struct pci_dev *pdev);
 
 #ifdef CONFIG_PM

diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 097072c..a59cb67 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h

@@ -19,6 +19,7 @@
 #include <linux/types.h>
 #include <linux/uuid.h>
 #include <linux/spinlock.h>
+#include <linux/bio.h>
 
 struct badrange_entry {
 	u64 start;
@@ -59,6 +60,9 @@
 	 */
 	ND_REGION_PERSIST_MEMCTRL = 2,
 
+	/* Platform provides asynchronous flush mechanism */
+	ND_REGION_ASYNC = 3,
+
 	/* mark newly adjusted resources as requiring a label update */
 	DPA_RESOURCE_ADJUSTED = 1 << 0,
 };
@@ -115,6 +119,7 @@
 	int position;
 };
 
+struct nd_region;
 struct nd_region_desc {
 	struct resource *res;
 	struct nd_mapping_desc *mapping;
@@ -126,6 +131,7 @@
 	int numa_node;
 	unsigned long flags;
 	struct device_node *of_node;
+	int (*flush)(struct nd_region *nd_region, struct bio *bio);
 };
 
 struct device;
@@ -201,9 +207,11 @@
 unsigned int nd_region_acquire_lane(struct nd_region *nd_region);
 void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane);
 u64 nd_fletcher64(void *addr, size_t len, bool le);
-void nvdimm_flush(struct nd_region *nd_region);
+int nvdimm_flush(struct nd_region *nd_region, struct bio *bio);
+int generic_nvdimm_flush(struct nd_region *nd_region);
 int nvdimm_has_flush(struct nd_region *nd_region);
 int nvdimm_has_cache(struct nd_region *nd_region);
+bool is_nvdimm_sync(struct nd_region *nd_region);
 
 #ifdef CONFIG_ARCH_HAS_PMEM_API
 #define ARCH_MEMREMAP_PMEM MEMREMAP_WB

diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
index 3ef9674..1ecd356 100644
--- a/include/linux/list_nulls.h
+++ b/include/linux/list_nulls.h

@@ -72,10 +72,10 @@
 	struct hlist_nulls_node *first = h->first;
 
 	n->next = first;
-	n->pprev = &h->first;
+	WRITE_ONCE(n->pprev, &h->first);
 	h->first = n;
 	if (!is_a_nulls(first))
-		first->pprev = &n->next;
+		WRITE_ONCE(first->pprev, &n->next);
 }
 
 static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
@@ -85,13 +85,13 @@
 
 	WRITE_ONCE(*pprev, next);
 	if (!is_a_nulls(next))
-		next->pprev = pprev;
+		WRITE_ONCE(next->pprev, pprev);
 }
 
 static inline void hlist_nulls_del(struct hlist_nulls_node *n)
 {
 	__hlist_nulls_del(n);
-	n->pprev = LIST_POISON2;
+	WRITE_ONCE(n->pprev, LIST_POISON2);
 }
 
 /**

diff --git a/include/linux/list_sort.h b/include/linux/list_sort.h
index ba79956..20f178c 100644
--- a/include/linux/list_sort.h
+++ b/include/linux/list_sort.h

@@ -6,6 +6,7 @@
 
 struct list_head;
 
+__attribute__((nonnull(2,3)))
 void list_sort(void *priv, struct list_head *head,
 	       int (*cmp)(void *priv, struct list_head *a,
 			  struct list_head *b));

diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 3833c87..66489b4 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h

@@ -457,6 +457,10 @@
  * @file_free_security:
  *	Deallocate and free any security structures stored in file->f_security.
  *	@file contains the file structure being modified.
+ * @file_pre_free_security:
+ *	Perform any logging or LSM state updates for a file being deleted
+ *	using fields of the file before they have been cleared.
+ *	@file contains the file structure being freed
  * @file_ioctl:
  *	@file contains the file structure.
  *	@cmd contains the operation to perform.
@@ -533,6 +537,10 @@
  *	@clone_flags contains the flags indicating what should be shared.
  *	Handle allocation of task-related resources.
  *	Returns a zero on success, negative values on failure.
+ * @task_post_alloc:
+ *	@task task being allocated.
+ *	Handle allocation of task-related resources after all task fields are
+ *	filled in.
  * @task_free:
  *	@task task about to be freed.
  *	Handle release of task-related resources. (Note that this can be called
@@ -683,6 +691,9 @@
  *	@cred contains the cred of the process where the signal originated, or
  *	NULL if the current task is the originator.
  *	Return 0 if permission is granted.
+ * @task_exit:
+ *      Called early when a task is exiting before all state is lost.
+ *      @p contains the task_struct for process.
  * @task_prctl:
  *	Check permission before performing a process control operation on the
  *	current process.
@@ -1561,6 +1572,7 @@
 	int (*file_permission)(struct file *file, int mask);
 	int (*file_alloc_security)(struct file *file);
 	void (*file_free_security)(struct file *file);
+	void (*file_pre_free_security)(struct file *file);
 	int (*file_ioctl)(struct file *file, unsigned int cmd,
 				unsigned long arg);
 	int (*mmap_addr)(unsigned long addr);
@@ -1578,6 +1590,7 @@
 	int (*file_open)(struct file *file);
 
 	int (*task_alloc)(struct task_struct *task, unsigned long clone_flags);
+	void (*task_post_alloc)(struct task_struct *task); // Do not upstream.
 	void (*task_free)(struct task_struct *task);
 	int (*cred_alloc_blank)(struct cred *cred, gfp_t gfp);
 	void (*cred_free)(struct cred *cred);
@@ -1610,6 +1623,7 @@
 	int (*task_movememory)(struct task_struct *p);
 	int (*task_kill)(struct task_struct *p, struct siginfo *info,
 				int sig, const struct cred *cred);
+	void (*task_exit)(struct task_struct *p);
 	int (*task_prctl)(int option, unsigned long arg2, unsigned long arg3,
 				unsigned long arg4, unsigned long arg5);
 	void (*task_to_inode)(struct task_struct *p, struct inode *inode);
@@ -1860,6 +1874,7 @@
 	struct hlist_head file_permission;
 	struct hlist_head file_alloc_security;
 	struct hlist_head file_free_security;
+	struct hlist_head file_pre_free_security;
 	struct hlist_head file_ioctl;
 	struct hlist_head mmap_addr;
 	struct hlist_head mmap_file;
@@ -1871,6 +1886,7 @@
 	struct hlist_head file_receive;
 	struct hlist_head file_open;
 	struct hlist_head task_alloc;
+	struct hlist_head task_post_alloc;
 	struct hlist_head task_free;
 	struct hlist_head cred_alloc_blank;
 	struct hlist_head cred_free;
@@ -1897,6 +1913,7 @@
 	struct hlist_head task_getscheduler;
 	struct hlist_head task_movememory;
 	struct hlist_head task_kill;
+	struct hlist_head task_exit;
 	struct hlist_head task_prctl;
 	struct hlist_head task_to_inode;
 	struct hlist_head ipc_permission;

diff --git a/include/linux/lzo.h b/include/linux/lzo.h
index 2ae27cb..e95c7d1 100644
--- a/include/linux/lzo.h
+++ b/include/linux/lzo.h

@@ -18,12 +18,16 @@
 #define LZO1X_1_MEM_COMPRESS	(8192 * sizeof(unsigned short))
 #define LZO1X_MEM_COMPRESS	LZO1X_1_MEM_COMPRESS
 
-#define lzo1x_worst_compress(x) ((x) + ((x) / 16) + 64 + 3)
+#define lzo1x_worst_compress(x) ((x) + ((x) / 16) + 64 + 3 + 2)
 
 /* This requires 'wrkmem' of size LZO1X_1_MEM_COMPRESS */
 int lzo1x_1_compress(const unsigned char *src, size_t src_len,
 		     unsigned char *dst, size_t *dst_len, void *wrkmem);
 
+/* This requires 'wrkmem' of size LZO1X_1_MEM_COMPRESS */
+int lzorle1x_1_compress(const unsigned char *src, size_t src_len,
+		     unsigned char *dst, size_t *dst_len, void *wrkmem);
+
 /* safe decompression with overrun testing */
 int lzo1x_decompress_safe(const unsigned char *src, size_t src_len,
 			  unsigned char *dst, size_t *dst_len);

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 45f10f5..2e81131 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h

@@ -58,6 +58,8 @@
 #define sysctl_legacy_va_layout 0
 #endif
 
+extern int min_filelist_kbytes;
+
 #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
 extern const int mmap_rnd_bits_min;
 extern const int mmap_rnd_bits_max;
@@ -125,6 +127,7 @@
 #define DEFAULT_MAX_MAP_COUNT	(USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
 
 extern int sysctl_max_map_count;
+extern int sysctl_mmap_noexec_taint;
 
 extern unsigned long sysctl_user_reserve_kbytes;
 extern unsigned long sysctl_admin_reserve_kbytes;
@@ -2243,7 +2246,7 @@
 extern struct vm_area_struct *vma_merge(struct mm_struct *,
 	struct vm_area_struct *prev, unsigned long addr, unsigned long end,
 	unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
-	struct mempolicy *, struct vm_userfaultfd_ctx);
+	struct mempolicy *, struct vm_userfaultfd_ctx, const char __user *);
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
 extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
 	unsigned long addr, int new_below);
@@ -2812,5 +2815,9 @@
 static inline void setup_nr_node_ids(void) {}
 #endif
 
+#ifdef CONFIG_DISK_BASED_SWAP
+extern int sysctl_disk_based_swap;
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3a9a996..3f6ab00 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h

@@ -295,11 +295,18 @@
 	/*
 	 * For areas with an address space and backing store,
 	 * linkage into the address_space->i_mmap interval tree.
+	 *
+	 * For private anonymous mappings, a pointer to a null terminated string
+	 * in the user process containing the name given to the vma, or NULL
+	 * if unnamed.
 	 */
-	struct {
-		struct rb_node rb;
-		unsigned long rb_subtree_last;
-	} shared;
+	union {
+		struct {
+			struct rb_node rb;
+			unsigned long rb_subtree_last;
+		} shared;
+		const char __user *anon_name;
+	};
 
 	/*
 	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
@@ -654,4 +661,13 @@
 	unsigned long val;
 } swp_entry_t;
 
+/* Return the name for an anonymous mapping or NULL for a file-backed mapping */
+static inline const char __user *vma_get_anon_name(struct vm_area_struct *vma)
+{
+	if (vma->vm_file)
+		return NULL;
+
+	return vma->anon_name;
+}
+
 #endif /* _LINUX_MM_TYPES_H */

diff --git a/include/linux/module.h b/include/linux/module.h
index 9915397..ddefdcd 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h

@@ -433,7 +433,7 @@
 	unsigned int num_tracepoints;
 	tracepoint_ptr_t *tracepoints_ptrs;
 #endif
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 	struct jump_entry *jump_entries;
 	unsigned int num_jump_entries;
 #endif

diff --git a/include/linux/namei.h b/include/linux/namei.h
index a78606e..0d08bad 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h

@@ -94,6 +94,10 @@
 
 extern void nd_jump_link(struct path *path);
 
+extern int nameidata_set_temporary(const char __user *dir_name);
+extern void nameidata_restore_temporary(void);
+extern int nameidata_get_total_link_count(void);
+
 static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
 {
 	((char *) name)[min(len, maxlen)] = '\0';

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 72cb19c..bbe99d2 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h

@@ -176,7 +176,7 @@
 int nf_register_sockopt(struct nf_sockopt_ops *reg);
 void nf_unregister_sockopt(struct nf_sockopt_ops *reg);
 
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 #endif
 
@@ -198,7 +198,7 @@
 	struct nf_hook_entries *hook_head = NULL;
 	int ret = 1;
 
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 	if (__builtin_constant_p(pf) &&
 	    __builtin_constant_p(hook) &&
 	    !static_key_false(&nf_hooks_needed[pf][hook]))

diff --git a/include/linux/netfilter/xt_quota2.h b/include/linux/netfilter/xt_quota2.h
new file mode 100644
index 0000000..eadc69033
--- /dev/null
+++ b/include/linux/netfilter/xt_quota2.h

@@ -0,0 +1,25 @@
+#ifndef _XT_QUOTA_H
+#define _XT_QUOTA_H
+
+enum xt_quota_flags {
+	XT_QUOTA_INVERT    = 1 << 0,
+	XT_QUOTA_GROW      = 1 << 1,
+	XT_QUOTA_PACKET    = 1 << 2,
+	XT_QUOTA_NO_CHANGE = 1 << 3,
+	XT_QUOTA_MASK      = 0x0F,
+};
+
+struct xt_quota_counter;
+
+struct xt_quota_mtinfo2 {
+	char name[15];
+	u_int8_t flags;
+
+	/* Comparison-invariant */
+	aligned_u64 quota;
+
+	/* Used internally by the kernel */
+	struct xt_quota_counter *master __attribute__((aligned(8)));
+};
+
+#endif /* _XT_QUOTA_H */

diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h
index a13774b..554c920 100644
--- a/include/linux/netfilter_ingress.h
+++ b/include/linux/netfilter_ingress.h

@@ -8,7 +8,7 @@
 #ifdef CONFIG_NETFILTER_INGRESS
 static inline bool nf_hook_ingress_active(const struct sk_buff *skb)
 {
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 	if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS]))
 		return false;
 #endif

diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 49538b1..dded498 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h

@@ -45,6 +45,9 @@
 	int hide_pid;
 	int reboot;	/* group exit code if this pidns was rebooted */
 	struct ns_common ns;
+#ifdef CONFIG_SECURITY_CONTAINER_MONITOR
+	u64 cid;  /* Main container identifier, zero if not assigned. */
+#endif
 } __randomize_layout;
 
 extern struct pid_namespace init_pid_ns;

diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 776c546..3b5d728 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h

@@ -17,11 +17,36 @@
 #include <linux/notifier.h>
 #include <linux/spinlock.h>
 
-/* Defines used for the flags field in the struct generic_pm_domain */
-#define GENPD_FLAG_PM_CLK	 (1U << 0) /* PM domain uses PM clk */
-#define GENPD_FLAG_IRQ_SAFE	 (1U << 1) /* PM domain operates in atomic */
-#define GENPD_FLAG_ALWAYS_ON	 (1U << 2) /* PM domain is always powered on */
-#define GENPD_FLAG_ACTIVE_WAKEUP (1U << 3) /* Keep devices active if wakeup */
+/*
+ * Flags to control the behaviour of a genpd.
+ *
+ * These flags may be set in the struct generic_pm_domain's flags field by a
+ * genpd backend driver. The flags must be set before it calls pm_genpd_init(),
+ * which initializes a genpd.
+ *
+ * GENPD_FLAG_PM_CLK:		Instructs genpd to use the PM clk framework,
+ *				while powering on/off attached devices.
+ *
+ * GENPD_FLAG_IRQ_SAFE:		This informs genpd that its backend callbacks,
+ *				->power_on|off(), doesn't sleep. Hence, these
+ *				can be invoked from within atomic context, which
+ *				enables genpd to power on/off the PM domain,
+ *				even when pm_runtime_is_irq_safe() returns true,
+ *				for any of its attached devices. Note that, a
+ *				genpd having this flag set, requires its
+ *				masterdomains to also have it set.
+ *
+ * GENPD_FLAG_ALWAYS_ON:	Instructs genpd to always keep the PM domain
+ *				powered on.
+ *
+ * GENPD_FLAG_ACTIVE_WAKEUP:	Instructs genpd to keep the PM domain powered
+ *				on, in case any of its attached devices is used
+ *				in the wakeup path to serve system wakeups.
+ */
+#define GENPD_FLAG_PM_CLK	 (1U << 0)
+#define GENPD_FLAG_IRQ_SAFE	 (1U << 1)
+#define GENPD_FLAG_ALWAYS_ON	 (1U << 2)
+#define GENPD_FLAG_ACTIVE_WAKEUP (1U << 3)
 
 enum gpd_status {
 	GPD_STATE_ACTIVE = 0,	/* PM domain is active */

diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index d0e1f15..a50b1c9 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h

@@ -116,6 +116,12 @@
 
 #endif /* CONFIG_PROC_FS */
 
+#ifdef CONFIG_PROC_UID
+extern void proc_register_uid(kuid_t uid);
+#else
+static inline void proc_register_uid(kuid_t uid) {}
+#endif
+
 struct net;
 
 static inline struct proc_dir_entry *proc_net_mkdir(

diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index 61974c4..90f2e22 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h

@@ -34,7 +34,7 @@
 {
 	if (!hlist_nulls_unhashed(n)) {
 		__hlist_nulls_del(n);
-		n->pprev = NULL;
+		WRITE_ONCE(n->pprev, NULL);
 	}
 }
 
@@ -66,7 +66,7 @@
 static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n)
 {
 	__hlist_nulls_del(n);
-	n->pprev = LIST_POISON2;
+	WRITE_ONCE(n->pprev, LIST_POISON2);
 }
 
 /**
@@ -94,10 +94,10 @@
 	struct hlist_nulls_node *first = h->first;
 
 	n->next = first;
-	n->pprev = &h->first;
+	WRITE_ONCE(n->pprev, &h->first);
 	rcu_assign_pointer(hlist_nulls_first_rcu(h), n);
 	if (!is_a_nulls(first))
-		first->pprev = &n->next;
+		WRITE_ONCE(first->pprev, &n->next);
 }
 
 /**

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0530de9..4edbb06 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h

@@ -30,7 +30,6 @@
 #include <linux/rseq.h>
 
 /* task_struct member predeclarations (sorted alphabetically): */
-struct audit_context;
 struct backing_dev_info;
 struct bio_list;
 struct blk_plug;
@@ -355,12 +354,6 @@
  * For cfs_rq, it is the aggregated load_avg of all runnable and
  * blocked sched_entities.
  *
- * load_avg may also take frequency scaling into account:
- *
- *   load_avg = runnable% * scale_load_down(load) * freq%
- *
- * where freq% is the CPU frequency normalized to the highest frequency.
- *
  * [util_avg definition]
  *
  *   util_avg = running% * SCHED_CAPACITY_SCALE
@@ -369,17 +362,14 @@
  * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable
  * and blocked sched_entities.
  *
- * util_avg may also factor frequency scaling and CPU capacity scaling:
+ * load_avg and util_avg don't direcly factor frequency scaling and CPU
+ * capacity scaling. The scaling is done through the rq_clock_pelt that
+ * is used for computing those signals (see update_rq_clock_pelt())
  *
- *   util_avg = running% * SCHED_CAPACITY_SCALE * freq% * capacity%
- *
- * where freq% is the same as above, and capacity% is the CPU capacity
- * normalized to the greatest capacity (due to uarch differences, etc).
- *
- * N.B., the above ratios (runnable%, running%, freq%, and capacity%)
- * themselves are in the range of [0, 1]. To do fixed point arithmetics,
- * we therefore scale them to as large a range as necessary. This is for
- * example reflected by util_avg's SCHED_CAPACITY_SCALE.
+ * N.B., the above ratios (runnable% and running%) themselves are in the
+ * range of [0, 1]. To do fixed point arithmetics, we therefore scale them
+ * to as large a range as necessary. This is for example reflected by
+ * util_avg's SCHED_CAPACITY_SCALE.
  *
  * [Overflow issue]
  *
@@ -879,10 +869,8 @@
 
 	struct callback_head		*task_works;
 
-	struct audit_context		*audit_context;
-#ifdef CONFIG_AUDITSYSCALL
-	kuid_t				loginuid;
-	unsigned int			sessionid;
+#ifdef CONFIG_AUDIT
+	struct audit_task_info		*audit;
 #endif
 	struct seccomp			seccomp;
 

diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h
index a4530d7..cc6bcc1 100644
--- a/include/linux/sched/cpufreq.h
+++ b/include/linux/sched/cpufreq.h

@@ -23,6 +23,12 @@
 				    unsigned int flags));
 void cpufreq_remove_update_util_hook(int cpu);
 bool cpufreq_this_cpu_can_update(struct cpufreq_policy *policy);
+
+static inline unsigned long map_util_freq(unsigned long util,
+					unsigned long freq, unsigned long cap)
+{
+	return (freq + (freq >> 2)) * util / cap;
+}
 #endif /* CONFIG_CPU_FREQ */
 
 #endif /* _LINUX_SCHED_CPUFREQ_H */

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index a9c32da..77e9fa3c 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h

@@ -22,6 +22,8 @@
 
 extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_sync_hint_enable;
+extern unsigned int sysctl_sched_cstate_aware;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
 

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 15f3f61..beffd85 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h

@@ -23,10 +23,10 @@
 #define SD_BALANCE_FORK		0x0008	/* Balance on fork, clone */
 #define SD_BALANCE_WAKE		0x0010  /* Balance on wakeup */
 #define SD_WAKE_AFFINE		0x0020	/* Wake task to waking CPU */
-#define SD_ASYM_CPUCAPACITY	0x0040  /* Groups have different max cpu capacities */
-#define SD_SHARE_CPUCAPACITY	0x0080	/* Domain members share cpu capacity */
+#define SD_ASYM_CPUCAPACITY	0x0040  /* Domain members have different CPU capacities */
+#define SD_SHARE_CPUCAPACITY	0x0080	/* Domain members share CPU capacity */
 #define SD_SHARE_POWERDOMAIN	0x0100	/* Domain members share power domain */
-#define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
+#define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share CPU pkg resources */
 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
 #define SD_ASYM_PACKING		0x0800  /* Place busy groups earlier in the domain */
 #define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
@@ -202,6 +202,17 @@
 # define SD_INIT_NAME(type)
 #endif
 
+#ifndef arch_scale_cpu_capacity
+static __always_inline
+unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
+{
+	if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
+		return sd->smt_gain / sd->span_weight;
+
+	return SCHED_CAPACITY_SCALE;
+}
+#endif
+
 #else /* CONFIG_SMP */
 
 struct sched_domain_attr;
@@ -217,6 +228,14 @@
 	return true;
 }
 
+#ifndef arch_scale_cpu_capacity
+static __always_inline
+unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
+{
+	return SCHED_CAPACITY_SCALE;
+}
+#endif
+
 #endif	/* !CONFIG_SMP */
 
 static inline int task_node(const struct task_struct *p)

diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h
index 10b19a1..a3661e9 100644
--- a/include/linux/sched/wake_q.h
+++ b/include/linux/sched/wake_q.h

@@ -34,6 +34,7 @@
 struct wake_q_head {
 	struct wake_q_node *first;
 	struct wake_q_node **lastp;
+	int count;
 };
 
 #define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
@@ -45,6 +46,7 @@
 {
 	head->first = WAKE_Q_TAIL;
 	head->lastp = &head->first;
+	head->count = 0;
 }
 
 extern void wake_q_add(struct wake_q_head *head,

diff --git a/include/linux/sched/xacct.h b/include/linux/sched/xacct.h
index c078f0a..9544c9d 100644
--- a/include/linux/sched/xacct.h
+++ b/include/linux/sched/xacct.h

@@ -28,6 +28,11 @@
 {
 	tsk->ioac.syscw++;
 }
+
+static inline void inc_syscfs(struct task_struct *tsk)
+{
+	tsk->ioac.syscfs++;
+}
 #else
 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
 {
@@ -44,6 +49,10 @@
 static inline void inc_syscw(struct task_struct *tsk)
 {
 }
+
+static inline void inc_syscfs(struct task_struct *tsk)
+{
+}
 #endif
 
 #endif /* _LINUX_SCHED_XACCT_H */

diff --git a/include/linux/security.h b/include/linux/security.h
index d2240605..ded314a 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h

@@ -321,6 +321,7 @@
 int security_file_permission(struct file *file, int mask);
 int security_file_alloc(struct file *file);
 void security_file_free(struct file *file);
+void security_file_pre_free(struct file *file);
 int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 int security_mmap_file(struct file *file, unsigned long prot,
 			unsigned long flags);
@@ -335,6 +336,7 @@
 int security_file_receive(struct file *file);
 int security_file_open(struct file *file);
 int security_task_alloc(struct task_struct *task, unsigned long clone_flags);
+void security_task_post_alloc(struct task_struct *task);
 void security_task_free(struct task_struct *task);
 int security_cred_alloc_blank(struct cred *cred, gfp_t gfp);
 void security_cred_free(struct cred *cred);
@@ -366,6 +368,7 @@
 int security_task_movememory(struct task_struct *p);
 int security_task_kill(struct task_struct *p, struct siginfo *info,
 			int sig, const struct cred *cred);
+void security_task_exit(struct task_struct *p);
 int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 			unsigned long arg4, unsigned long arg5);
 void security_task_to_inode(struct task_struct *p, struct inode *inode);
@@ -828,6 +831,9 @@
 static inline void security_file_free(struct file *file)
 { }
 
+static inline void security_file_pre_free(struct file *file)
+{ }
+
 static inline int security_file_ioctl(struct file *file, unsigned int cmd,
 				      unsigned long arg)
 {
@@ -891,6 +897,9 @@
 	return 0;
 }
 
+static inline void security_task_post_alloc(struct task_struct *task)
+{ }
+
 static inline void security_task_free(struct task_struct *task)
 { }
 
@@ -1026,6 +1035,9 @@
 	return 0;
 }
 
+static inline void security_task_exit(struct task_struct *p)
+{ }
+
 static inline int security_task_prctl(int option, unsigned long arg2,
 				      unsigned long arg3,
 				      unsigned long arg4,

diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 3460b15..ff9d0ee 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h

@@ -22,6 +22,7 @@
 
 #include <linux/bitops.h>
 #include <linux/compiler.h>
+#include <linux/console.h>
 #include <linux/interrupt.h>
 #include <linux/circ_buf.h>
 #include <linux/spinlock.h>

diff --git a/include/linux/sort.h b/include/linux/sort.h
index 2b99a5d..61b96d0 100644
--- a/include/linux/sort.h
+++ b/include/linux/sort.h

@@ -4,6 +4,11 @@
 
 #include <linux/types.h>
 
+void sort_r(void *base, size_t num, size_t size,
+	    int (*cmp)(const void *, const void *, const void *),
+	    void (*swap)(void *, void *, int),
+	    const void *priv);
+
 void sort(void *base, size_t num, size_t size,
 	  int (*cmp)(const void *, const void *),
 	  void (*swap)(void *, void *, int));

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 2ff814c..635a660 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h

@@ -1159,6 +1159,10 @@
 	return -EINVAL;
 }
 #endif
+
+int ksys_prctl(int option, unsigned long arg2, unsigned long arg3,
+	       unsigned long arg4, unsigned long arg5);
+
 unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
 			      unsigned long prot, unsigned long flags,
 			      unsigned long fd, unsigned long pgoff);
@@ -1293,4 +1297,22 @@
 	return old;
 }
 
+#ifdef CONFIG_ALT_SYSCALL
+
+/* Only used with ALT_SYSCALL enabled */
+
+int ksys_prctl(int option, unsigned long arg2, unsigned long arg3,
+	       unsigned long arg4, unsigned long arg5);
+int ksys_setpriority(int which, int who, int niceval);
+int ksys_getpriority(int which, int who);
+int ksys_perf_event_open(
+		struct perf_event_attr __user *attr_uptr,
+		pid_t pid, int cpu, int group_fd, unsigned long flags);
+int ksys_adjtimex(struct timex __user *txc_p);
+int ksys_clock_adjtime(clockid_t which_clock,
+		       struct timex __user *tx);
+int ksys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache);
+
+#endif /* CONFIG_ALT_SYSCALL */
+
 #endif

diff --git a/include/linux/task_io_accounting.h b/include/linux/task_io_accounting.h
index 6f6acce..bb26108 100644
--- a/include/linux/task_io_accounting.h
+++ b/include/linux/task_io_accounting.h

@@ -19,6 +19,8 @@
 	u64 syscr;
 	/* # of write syscalls */
 	u64 syscw;
+	/* # of fsync syscalls */
+	u64 syscfs;
 #endif /* CONFIG_TASK_XACCT */
 
 #ifdef CONFIG_TASK_IO_ACCOUNTING

diff --git a/include/linux/task_io_accounting_ops.h b/include/linux/task_io_accounting_ops.h
index bb5498b..733ab62 100644
--- a/include/linux/task_io_accounting_ops.h
+++ b/include/linux/task_io_accounting_ops.h

@@ -97,6 +97,7 @@
 	dst->wchar += src->wchar;
 	dst->syscr += src->syscr;
 	dst->syscw += src->syscw;
+	dst->syscfs += src->syscfs;
 }
 #else
 static inline void task_chr_io_accounting_add(struct task_io_accounting *dst,

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 0643c08..3aa0559 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h

@@ -577,7 +577,8 @@
 			       bool perf_type_tracepoint);
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
-extern int  perf_uprobe_init(struct perf_event *event, bool is_retprobe);
+extern int  perf_uprobe_init(struct perf_event *event,
+			     unsigned long ref_ctr_offset, bool is_retprobe);
 extern void perf_uprobe_destroy(struct perf_event *event);
 extern int bpf_get_uprobe_info(const struct perf_event *event,
 			       u32 *fd_type, const char **filename,

diff --git a/include/linux/tty.h b/include/linux/tty.h
index 76db046..248a137 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h

@@ -225,6 +225,8 @@
 	void (*write_wakeup)(struct tty_port *port);
 };
 
+extern const struct tty_port_client_operations tty_port_default_client_ops;
+
 struct tty_port {
 	struct tty_bufhead	buf;		/* Locked internally */
 	struct tty_struct	*tty;		/* Back pointer */

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index bb9d208..103a48a 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h

@@ -123,6 +123,7 @@
 extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
 extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t);
 extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
+extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
 extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool);
 extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
 extern int uprobe_mmap(struct vm_area_struct *vma);
@@ -160,6 +161,10 @@
 {
 	return -ENOSYS;
 }
+static inline int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc)
+{
+	return -ENOSYS;
+}
 static inline int
 uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool add)
 {

diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h
index a1be64c..22c1f57 100644
--- a/include/linux/usb/quirks.h
+++ b/include/linux/usb/quirks.h

@@ -69,4 +69,7 @@
 /* Hub needs extra delay after resetting its port. */
 #define USB_QUIRK_HUB_SLOW_RESET		BIT(14)
 
+/* device has blacklisted endpoints */
+#define USB_QUIRK_ENDPOINT_BLACKLIST		BIT(15)
+
 #endif /* __LINUX_USB_QUIRKS_H */

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 47a3441..870d36e 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h

@@ -28,6 +28,7 @@
 		FOR_ALL_ZONES(PGSCAN_SKIP),
 		PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE,
 		PGFAULT, PGMAJFAULT,
+		PGMAJFAULT_S, PGMAJFAULT_A, PGMAJFAULT_F,
 		PGLAZYFREED,
 		PGREFILL,
 		PGSTEAL_KSWAPD,

diff --git a/include/linux/wakeup_reason.h b/include/linux/wakeup_reason.h
new file mode 100644
index 0000000..7ce50f0d
--- /dev/null
+++ b/include/linux/wakeup_reason.h

@@ -0,0 +1,23 @@
+/*
+ * include/linux/wakeup_reason.h
+ *
+ * Logs the reason which caused the kernel to resume
+ * from the suspend mode.
+ *
+ * Copyright (C) 2014 Google, Inc.
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _LINUX_WAKEUP_REASON_H
+#define _LINUX_WAKEUP_REASON_H
+
+void log_wakeup_reason(int irq);
+
+#endif /* _LINUX_WAKEUP_REASON_H */

diff --git a/include/media/v4l2-device.h b/include/media/v4l2-device.h
index b330e4a..40840fec 100644
--- a/include/media/v4l2-device.h
+++ b/include/media/v4l2-device.h

@@ -372,7 +372,7 @@
 		struct v4l2_subdev *__sd;				\
 									\
 		__v4l2_device_call_subdevs_p(v4l2_dev, __sd,		\
-			!(grpid) || __sd->grp_id == (grpid), o, f ,	\
+			(grpid) == 0 || __sd->grp_id == (grpid), o, f ,	\
 			##args);					\
 	} while (0)
 
@@ -404,7 +404,7 @@
 ({									\
 	struct v4l2_subdev *__sd;					\
 	__v4l2_device_call_subdevs_until_err_p(v4l2_dev, __sd,		\
-			!(grpid) || __sd->grp_id == (grpid), o, f ,	\
+			(grpid) == 0 || __sd->grp_id == (grpid), o, f ,	\
 			##args);					\
 })
 
@@ -432,8 +432,8 @@
 		struct v4l2_subdev *__sd;				\
 									\
 		__v4l2_device_call_subdevs_p(v4l2_dev, __sd,		\
-			!(grpmsk) || (__sd->grp_id & (grpmsk)), o, f ,	\
-			##args);					\
+			(grpmsk) == 0 || (__sd->grp_id & (grpmsk)), o,	\
+			f , ##args);					\
 	} while (0)
 
 /**
@@ -463,8 +463,8 @@
 ({									\
 	struct v4l2_subdev *__sd;					\
 	__v4l2_device_call_subdevs_until_err_p(v4l2_dev, __sd,		\
-			!(grpmsk) || (__sd->grp_id & (grpmsk)), o, f ,	\
-			##args);					\
+			(grpmsk) == 0 || (__sd->grp_id & (grpmsk)), o,	\
+			f , ##args);					\
 })
 
 

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 4618cbb..99f8580 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h

@@ -5,6 +5,7 @@
 #include <linux/types.h>
 #include <linux/in6.h>
 #include <linux/siphash.h>
+#include <linux/string.h>
 #include <uapi/linux/if_ether.h>
 
 /**
@@ -306,4 +307,12 @@
 	return ((char *)target_container) + flow_dissector->offset[key_id];
 }
 
+static inline void
+flow_dissector_init_keys(struct flow_dissector_key_control *key_control,
+			 struct flow_dissector_key_basic *key_basic)
+{
+	memset(key_control, 0, sizeof(*key_control));
+	memset(key_basic, 0, sizeof(*key_basic));
+}
+
 #endif

diff --git a/include/net/netfilter/br_netfilter.h b/include/net/netfilter/br_netfilter.h
index a4ba601..da7af8e 100644
--- a/include/net/netfilter/br_netfilter.h
+++ b/include/net/netfilter/br_netfilter.h

@@ -48,7 +48,8 @@
 	return port ? &port->br->fake_rtable : NULL;
 }
 
-struct net_device *setup_pre_routing(struct sk_buff *skb);
+struct net_device *setup_pre_routing(struct sk_buff *skb,
+				     const struct net *net);
 
 #if IS_ENABLED(CONFIG_IPV6)
 int br_validate_ipv6(struct net *net, struct sk_buff *skb);

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 366e2a6..a255a62 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h

@@ -161,6 +161,7 @@
 	int sysctl_tcp_invalid_ratelimit;
 	int sysctl_tcp_pacing_ss_ratio;
 	int sysctl_tcp_pacing_ca_ratio;
+	int sysctl_tcp_default_init_rwnd;
 	int sysctl_tcp_wmem[3];
 	int sysctl_tcp_rmem[3];
 	int sysctl_tcp_comp_sack_nr;

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 918bfd0..07597f5 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h

@@ -1329,7 +1329,7 @@
 	rx_opt->num_sacks = 0;
 }
 
-u32 tcp_default_init_rwnd(u32 mss);
+u32 tcp_default_init_rwnd(const struct sock *sk, u32 mss);
 void tcp_cwnd_restart(struct sock *sk, s32 delta);
 
 static inline void tcp_slow_start_after_idle_check(struct sock *sk)

diff --git a/include/scsi/iscsi_proto.h b/include/scsi/iscsi_proto.h
index f0a01a5..df156f1 100644
--- a/include/scsi/iscsi_proto.h
+++ b/include/scsi/iscsi_proto.h

@@ -638,7 +638,6 @@
 #define ISCSI_REASON_BOOKMARK_INVALID	9
 #define ISCSI_REASON_BOOKMARK_NO_RESOURCES	10
 #define ISCSI_REASON_NEGOTIATION_RESET	11
-#define ISCSI_REASON_WAITING_FOR_LOGOUT	12
 
 /* Max. number of Key=Value pairs in a text message */
 #define MAX_KEY_VALUE_PAIRS	8192

diff --git a/include/sound/rawmidi.h b/include/sound/rawmidi.h
index 6665cb2..c2a71fd 100644
--- a/include/sound/rawmidi.h
+++ b/include/sound/rawmidi.h

@@ -92,9 +92,9 @@
 	struct list_head list;		/* list of all substream for given stream */
 	int stream;			/* direction */
 	int number;			/* substream number */
-	unsigned int opened: 1,		/* open flag */
-		     append: 1,		/* append flag (merge more streams) */
-		     active_sensing: 1; /* send active sensing when close */
+	bool opened;			/* open flag */
+	bool append;			/* append flag (merge more streams) */
+	bool active_sensing;		/* send active sensing when close */
 	int use_count;			/* use counter (for output) */
 	size_t bytes;
 	struct snd_rawmidi *rmidi;

diff --git a/include/trace/events/fs.h b/include/trace/events/fs.h
new file mode 100644
index 0000000..fb634b7
--- /dev/null
+++ b/include/trace/events/fs.h

@@ -0,0 +1,53 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM fs
+
+#if !defined(_TRACE_FS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_FS_H
+
+#include <linux/fs.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(do_sys_open,
+
+	TP_PROTO(const char *filename, int flags, int mode),
+
+	TP_ARGS(filename, flags, mode),
+
+	TP_STRUCT__entry(
+		__string(	filename, filename		)
+		__field(	int, flags			)
+		__field(	int, mode			)
+	),
+
+	TP_fast_assign(
+		__assign_str(filename, filename);
+		__entry->flags = flags;
+		__entry->mode = mode;
+	),
+
+	TP_printk("\"%s\" %x %o",
+		  __get_str(filename), __entry->flags, __entry->mode)
+);
+
+TRACE_EVENT(open_exec,
+
+	TP_PROTO(const char *filename),
+
+	TP_ARGS(filename),
+
+	TP_STRUCT__entry(
+		__string(	filename, filename		)
+	),
+
+	TP_fast_assign(
+		__assign_str(filename, filename);
+	),
+
+	TP_printk("\"%s\"",
+		  __get_str(filename))
+);
+
+#endif /* _TRACE_FS_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 9a4bdfa..8067679 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h

@@ -241,7 +241,7 @@
 DEFINE_EVENT(sched_process_template, sched_process_free,
 	     TP_PROTO(struct task_struct *p),
 	     TP_ARGS(p));
-	     
+
 
 /*
  * Tracepoint for a task exiting:
@@ -396,6 +396,30 @@
 	     TP_ARGS(tsk, delay));
 
 /*
+ * Tracepoint for recording the cause of uninterruptible sleep.
+ */
+TRACE_EVENT(sched_blocked_reason,
+
+	TP_PROTO(struct task_struct *tsk),
+
+	TP_ARGS(tsk),
+
+	TP_STRUCT__entry(
+		__field( pid_t,	pid	)
+		__field( void*, caller	)
+		__field( bool, io_wait	)
+	),
+
+	TP_fast_assign(
+		__entry->pid	= tsk->pid;
+		__entry->caller = (void*)get_wchan(tsk);
+		__entry->io_wait = tsk->in_iowait;
+	),
+
+	TP_printk("pid=%d iowait=%d caller=%pS", __entry->pid, __entry->io_wait, __entry->caller)
+);
+
+/*
  * Tracepoint for accounting runtime (time the task is executing
  * on a CPU).
  */
@@ -587,6 +611,424 @@
 
 	TP_printk("cpu=%d", __entry->cpu)
 );
+
+#ifdef CONFIG_SMP
+#ifdef CREATE_TRACE_POINTS
+static inline
+int __trace_sched_cpu(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct rq *rq = cfs_rq ? cfs_rq->rq : NULL;
+#else
+	struct rq *rq = cfs_rq ? container_of(cfs_rq, struct rq, cfs) : NULL;
+#endif
+	return rq ? cpu_of(rq)
+		  : task_cpu((container_of(se, struct task_struct, se)));
+}
+
+static inline
+int __trace_sched_path(struct cfs_rq *cfs_rq, char *path, int len)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	int l = path ? len : 0;
+
+	if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
+		return autogroup_path(cfs_rq->tg, path, l) + 1;
+	else if (cfs_rq && cfs_rq->tg->css.cgroup)
+		return cgroup_path(cfs_rq->tg->css.cgroup, path, l) + 1;
+#endif
+	if (path)
+		strcpy(path, "(null)");
+
+	return strlen("(null)");
+}
+
+static inline
+struct cfs_rq *__trace_sched_group_cfs_rq(struct sched_entity *se)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	return se->my_q;
+#else
+	return NULL;
+#endif
+}
+#endif /* CREATE_TRACE_POINTS */
+
+/*
+ * Tracepoint for cfs_rq load tracking:
+ */
+TRACE_EVENT(sched_load_cfs_rq,
+
+	TP_PROTO(struct cfs_rq *cfs_rq),
+
+	TP_ARGS(cfs_rq),
+
+	TP_STRUCT__entry(
+		__field(	int,		cpu			)
+		__dynamic_array(char,		path,
+				__trace_sched_path(cfs_rq, NULL, 0)	)
+		__field(	unsigned long,	load			)
+		__field(	unsigned long,	rbl_load		)
+		__field(	unsigned long,	util			)
+	),
+
+	TP_fast_assign(
+		__entry->cpu		= __trace_sched_cpu(cfs_rq, NULL);
+		__trace_sched_path(cfs_rq, __get_dynamic_array(path),
+				   __get_dynamic_array_len(path));
+		__entry->load		= cfs_rq->avg.load_avg;
+		__entry->rbl_load 	= cfs_rq->avg.runnable_load_avg;
+		__entry->util		= cfs_rq->avg.util_avg;
+	),
+
+	TP_printk("cpu=%d path=%s load=%lu rbl_load=%lu util=%lu",
+		  __entry->cpu, __get_str(path), __entry->load,
+		  __entry->rbl_load,__entry->util)
+);
+
+/*
+ * Tracepoint for rt_rq load tracking:
+ */
+struct rq;
+TRACE_EVENT(sched_load_rt_rq,
+
+	TP_PROTO(struct rq *rq),
+
+	TP_ARGS(rq),
+
+	TP_STRUCT__entry(
+		__field(	int,		cpu			)
+		__field(	unsigned long,	util			)
+	),
+
+	TP_fast_assign(
+		__entry->cpu	= rq->cpu;
+		__entry->util	= rq->avg_rt.util_avg;
+	),
+
+	TP_printk("cpu=%d util=%lu", __entry->cpu,
+		  __entry->util)
+);
+
+/*
+ * Tracepoint for sched_entity load tracking:
+ */
+TRACE_EVENT(sched_load_se,
+
+	TP_PROTO(struct sched_entity *se),
+
+	TP_ARGS(se),
+
+	TP_STRUCT__entry(
+		__field(	int,		cpu			      )
+		__dynamic_array(char,		path,
+		  __trace_sched_path(__trace_sched_group_cfs_rq(se), NULL, 0) )
+		__array(	char,		comm,	TASK_COMM_LEN	      )
+		__field(	pid_t,		pid			      )
+		__field(	unsigned long,	load			      )
+		__field(	unsigned long,	rbl_load		      )
+		__field(	unsigned long,	util			      )
+	),
+
+	TP_fast_assign(
+		struct cfs_rq *gcfs_rq = __trace_sched_group_cfs_rq(se);
+		struct task_struct *p = gcfs_rq ? NULL
+				    : container_of(se, struct task_struct, se);
+
+		__entry->cpu		= __trace_sched_cpu(gcfs_rq, se);
+		__trace_sched_path(gcfs_rq, __get_dynamic_array(path),
+				   __get_dynamic_array_len(path));
+		memcpy(__entry->comm, p ? p->comm : "(null)",
+				      p ? TASK_COMM_LEN : sizeof("(null)"));
+		__entry->pid = p ? p->pid : -1;
+		__entry->load = se->avg.load_avg;
+		__entry->rbl_load = se->avg.runnable_load_avg;
+		__entry->util = se->avg.util_avg;
+	),
+
+	TP_printk("cpu=%d path=%s comm=%s pid=%d load=%lu rbl_load=%lu util=%lu",
+		  __entry->cpu, __get_str(path), __entry->comm, __entry->pid,
+		  __entry->load, __entry->rbl_load, __entry->util)
+);
+
+/*
+ * Tracepoint for task_group load tracking:
+ */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+TRACE_EVENT(sched_load_tg,
+
+	TP_PROTO(struct cfs_rq *cfs_rq),
+
+	TP_ARGS(cfs_rq),
+
+	TP_STRUCT__entry(
+		__field(	int,	cpu				)
+		__dynamic_array(char,	path,
+				__trace_sched_path(cfs_rq, NULL, 0)	)
+		__field(	long,	load				)
+	),
+
+	TP_fast_assign(
+		__entry->cpu	= cfs_rq->rq->cpu;
+		__trace_sched_path(cfs_rq, __get_dynamic_array(path),
+				   __get_dynamic_array_len(path));
+		__entry->load	= atomic_long_read(&cfs_rq->tg->load_avg);
+	),
+
+	TP_printk("cpu=%d path=%s load=%ld", __entry->cpu, __get_str(path),
+		  __entry->load)
+);
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+/*
+ * Tracepoint for tasks' estimated utilization.
+ */
+TRACE_EVENT(sched_util_est_task,
+
+	TP_PROTO(struct task_struct *tsk, struct sched_avg *avg),
+
+	TP_ARGS(tsk, avg),
+
+	TP_STRUCT__entry(
+		__array( char,	comm,	TASK_COMM_LEN		)
+		__field( pid_t,		pid			)
+		__field( int,		cpu			)
+		__field( unsigned int,	util_avg		)
+		__field( unsigned int,	est_enqueued		)
+		__field( unsigned int,	est_ewma		)
+
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+		__entry->pid			= tsk->pid;
+		__entry->cpu			= task_cpu(tsk);
+		__entry->util_avg		= avg->util_avg;
+		__entry->est_enqueued		= avg->util_est.enqueued;
+		__entry->est_ewma		= avg->util_est.ewma;
+	),
+
+	TP_printk("comm=%s pid=%d cpu=%d util_avg=%u util_est_ewma=%u util_est_enqueued=%u",
+		  __entry->comm,
+		  __entry->pid,
+		  __entry->cpu,
+		  __entry->util_avg,
+		  __entry->est_ewma,
+		  __entry->est_enqueued)
+);
+
+/*
+ * Tracepoint for root cfs_rq's estimated utilization.
+ */
+TRACE_EVENT(sched_util_est_cpu,
+
+	TP_PROTO(int cpu, struct cfs_rq *cfs_rq),
+
+	TP_ARGS(cpu, cfs_rq),
+
+	TP_STRUCT__entry(
+		__field( int,		cpu			)
+		__field( unsigned int,	util_avg		)
+		__field( unsigned int,	util_est_enqueued	)
+	),
+
+	TP_fast_assign(
+		__entry->cpu			= cpu;
+		__entry->util_avg		= cfs_rq->avg.util_avg;
+		__entry->util_est_enqueued	= cfs_rq->avg.util_est.enqueued;
+	),
+
+	TP_printk("cpu=%d util_avg=%u util_est_enqueued=%u",
+		  __entry->cpu,
+		  __entry->util_avg,
+		  __entry->util_est_enqueued)
+);
+
+/*
+ * Tracepoint for find_best_target
+ */
+TRACE_EVENT(sched_find_best_target,
+
+	TP_PROTO(struct task_struct *tsk, bool prefer_idle,
+		 unsigned long min_util, int best_idle, int best_active,
+		 int target, int backup),
+
+	TP_ARGS(tsk, prefer_idle, min_util, best_idle,
+		best_active, target, backup),
+
+	TP_STRUCT__entry(
+		__array( char,  comm,   TASK_COMM_LEN   )
+		__field( pid_t, pid                     )
+		__field( unsigned long, min_util        )
+		__field( bool,  prefer_idle             )
+		__field( int,   best_idle               )
+		__field( int,   best_active             )
+		__field( int,   target                  )
+		__field( int,   backup                  )
+		),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+		__entry->pid            = tsk->pid;
+		__entry->min_util       = min_util;
+		__entry->prefer_idle    = prefer_idle;
+		__entry->best_idle      = best_idle;
+		__entry->best_active    = best_active;
+		__entry->target         = target;
+		__entry->backup         = backup;
+		),
+
+	TP_printk("pid=%d comm=%s prefer_idle=%d "
+		  "best_idle=%d best_active=%d target=%d backup=%d",
+		  __entry->pid, __entry->comm, __entry->prefer_idle,
+		  __entry->best_idle, __entry->best_active,
+		  __entry->target, __entry->backup)
+);
+
+/*
+ * Tracepoint for accounting CPU  boosted utilization
+ */
+TRACE_EVENT(sched_boost_cpu,
+
+	TP_PROTO(int cpu, unsigned long util, long margin),
+
+	TP_ARGS(cpu, util, margin),
+
+	TP_STRUCT__entry(
+		__field( int,           cpu	)
+		__field( unsigned long, util	)
+		__field(long,           margin	)
+	),
+
+	TP_fast_assign(
+		__entry->cpu    = cpu;
+		__entry->util   = util;
+		__entry->margin = margin;
+	),
+
+	TP_printk("cpu=%d util=%lu margin=%ld",
+		__entry->cpu,
+		__entry->util,
+		__entry->margin)
+);
+
+/*
+ * Tracepoint for schedtune_tasks_update
+ */
+TRACE_EVENT(sched_tune_tasks_update,
+
+	TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx,
+		int boost, int max_boost, u64 group_ts),
+
+	TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost, group_ts),
+
+	TP_STRUCT__entry(
+		__array( char,  comm,   TASK_COMM_LEN   )
+		__field( pid_t,         pid             )
+		__field( int,           cpu             )
+		__field( int,           tasks           )
+		__field( int,           idx             )
+		__field( int,           boost           )
+		__field( int,           max_boost       )
+		__field( u64,		group_ts	)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+		__entry->pid            = tsk->pid;
+		__entry->cpu            = cpu;
+		__entry->tasks          = tasks;
+		__entry->idx            = idx;
+		__entry->boost          = boost;
+		__entry->max_boost      = max_boost;
+		__entry->group_ts	= group_ts;
+	),
+
+	TP_printk("pid=%d comm=%s "
+		"cpu=%d tasks=%d idx=%d boost=%d max_boost=%d timeout=%llu",
+		__entry->pid, __entry->comm,
+		__entry->cpu, __entry->tasks, __entry->idx,
+		__entry->boost, __entry->max_boost,
+		__entry->group_ts)
+);
+
+/*
+ * Tracepoint for schedtune_boostgroup_update
+ */
+TRACE_EVENT(sched_tune_boostgroup_update,
+
+	TP_PROTO(int cpu, int variation, int max_boost),
+
+	TP_ARGS(cpu, variation, max_boost),
+
+	TP_STRUCT__entry(
+		__field( int,   cpu		)
+		__field( int,   variation	)
+		__field( int,   max_boost	)
+	),
+
+	TP_fast_assign(
+		__entry->cpu            = cpu;
+		__entry->variation      = variation;
+		__entry->max_boost      = max_boost;
+	),
+
+	TP_printk("cpu=%d variation=%d max_boost=%d",
+		__entry->cpu, __entry->variation, __entry->max_boost)
+);
+
+/*
+ * Tracepoint for accounting task boosted utilization
+ */
+TRACE_EVENT(sched_boost_task,
+
+	TP_PROTO(struct task_struct *tsk, unsigned long util, long margin),
+
+	TP_ARGS(tsk, util, margin),
+
+	TP_STRUCT__entry(
+		__array( char,  comm,   TASK_COMM_LEN	)
+		__field( pid_t,         pid		)
+		__field( unsigned long, util		)
+		__field( long,          margin		)
+
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+		__entry->pid    = tsk->pid;
+		__entry->util   = util;
+		__entry->margin = margin;
+	),
+
+	TP_printk("comm=%s pid=%d util=%lu margin=%ld",
+		__entry->comm, __entry->pid,
+		__entry->util,
+		__entry->margin)
+);
+
+/*
+ * Tracepoint for system overutilized flag
+*/
+TRACE_EVENT(sched_overutilized,
+
+	TP_PROTO(int overutilized),
+
+	TP_ARGS(overutilized),
+
+	TP_STRUCT__entry(
+		__field( int,  overutilized    )
+	),
+
+	TP_fast_assign(
+		__entry->overutilized   = overutilized;
+	),
+
+	TP_printk("overutilized=%d",
+		__entry->overutilized)
+);
+
+#endif /* CONFIG_SMP */
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */

diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 818ae69..64b327dd 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h

@@ -71,6 +71,7 @@
 #define AUDIT_TTY_SET		1017	/* Set TTY auditing status */
 #define AUDIT_SET_FEATURE	1018	/* Turn an audit feature on or off */
 #define AUDIT_GET_FEATURE	1019	/* Get which features are enabled */
+#define AUDIT_CONTAINER_OP	1020	/* Define the container id and info */
 
 #define AUDIT_FIRST_USER_MSG	1100	/* Userspace messages mostly uninteresting to kernel */
 #define AUDIT_USER_AVC		1107	/* We filter this differently */
@@ -469,6 +470,7 @@
 
 #define AUDIT_UID_UNSET (unsigned int)-1
 #define AUDIT_SID_UNSET ((unsigned int)-1)
+#define AUDIT_CID_UNSET ((u64)-1)
 
 /* audit_rule_data supports filter rules with both integer and string
  * fields.  It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and

diff --git a/include/uapi/linux/netfilter/xt_IDLETIMER.h b/include/uapi/linux/netfilter/xt_IDLETIMER.h
index 3c586a1..c82a1c1 100644
--- a/include/uapi/linux/netfilter/xt_IDLETIMER.h
+++ b/include/uapi/linux/netfilter/xt_IDLETIMER.h

@@ -5,6 +5,7 @@
  * Header file for Xtables timer target module.
  *
  * Copyright (C) 2004, 2010 Nokia Corporation
+ *
  * Written by Timo Teras <ext-timo.teras@nokia.com>
  *
  * Converted to x_tables and forward-ported to 2.6.34
@@ -33,12 +34,19 @@
 #include <linux/types.h>
 
 #define MAX_IDLETIMER_LABEL_SIZE 28
+#define NLMSG_MAX_SIZE 64
+
+#define NL_EVENT_TYPE_INACTIVE 0
+#define NL_EVENT_TYPE_ACTIVE 1
 
 struct idletimer_tg_info {
 	__u32 timeout;
 
 	char label[MAX_IDLETIMER_LABEL_SIZE];
 
+	/* Use netlink messages for notification in addition to sysfs */
+	__u8 send_nl_msg;
+
 	/* for kernel module internal use only */
 	struct idletimer_tg *timer __attribute__((aligned(8)));
 };

diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index b17201e..e7aa960 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h

@@ -155,6 +155,9 @@
 #define PR_SET_PTRACER 0x59616d61
 # define PR_SET_PTRACER_ANY ((unsigned long)-1)
 
+#define PR_ALT_SYSCALL 0x43724f53
+# define PR_ALT_SYSCALL_SET_SYSCALL_TABLE 1
+
 #define PR_SET_CHILD_SUBREAPER	36
 #define PR_GET_CHILD_SUBREAPER	37
 
@@ -220,4 +223,7 @@
 # define PR_SPEC_DISABLE		(1UL << 2)
 # define PR_SPEC_FORCE_DISABLE		(1UL << 3)
 
+#define PR_SET_VMA		0x53564d41
+# define PR_SET_VMA_ANON_NAME		0
+
 #endif /* _LINUX_PRCTL_H */

diff --git a/include/uapi/linux/usb/charger.h b/include/uapi/linux/usb/charger.h
index 5f72af3..ad22079 100644
--- a/include/uapi/linux/usb/charger.h
+++ b/include/uapi/linux/usb/charger.h

@@ -14,18 +14,18 @@
  * ACA (Accessory Charger Adapters)
  */
 enum usb_charger_type {
-	UNKNOWN_TYPE,
-	SDP_TYPE,
-	DCP_TYPE,
-	CDP_TYPE,
-	ACA_TYPE,
+	UNKNOWN_TYPE = 0,
+	SDP_TYPE = 1,
+	DCP_TYPE = 2,
+	CDP_TYPE = 3,
+	ACA_TYPE = 4,
 };
 
 /* USB charger state */
 enum usb_charger_state {
-	USB_CHARGER_DEFAULT,
-	USB_CHARGER_PRESENT,
-	USB_CHARGER_ABSENT,
+	USB_CHARGER_DEFAULT = 0,
+	USB_CHARGER_PRESENT = 1,
+	USB_CHARGER_ABSENT = 2,
 };
 
 #endif /* _UAPI__LINUX_USB_CHARGER_H */

diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
index 9ebe4d9..682afbf 100644
--- a/include/uapi/linux/virtio_blk.h
+++ b/include/uapi/linux/virtio_blk.h

@@ -38,6 +38,8 @@
 #define VIRTIO_BLK_F_BLK_SIZE	6	/* Block size of disk is available*/
 #define VIRTIO_BLK_F_TOPOLOGY	10	/* Topology information is available */
 #define VIRTIO_BLK_F_MQ		12	/* support more than one vq */
+#define VIRTIO_BLK_F_DISCARD	13	/* DISCARD is supported */
+#define VIRTIO_BLK_F_WRITE_ZEROES	14	/* WRITE ZEROES is supported */
 
 /* Legacy feature bits */
 #ifndef VIRTIO_BLK_NO_LEGACY
@@ -86,6 +88,39 @@
 
 	/* number of vqs, only available when VIRTIO_BLK_F_MQ is set */
 	__u16 num_queues;
+
+	/* the next 3 entries are guarded by VIRTIO_BLK_F_DISCARD */
+	/*
+	 * The maximum discard sectors (in 512-byte sectors) for
+	 * one segment.
+	 */
+	__u32 max_discard_sectors;
+	/*
+	 * The maximum number of discard segments in a
+	 * discard command.
+	 */
+	__u32 max_discard_seg;
+	/* Discard commands must be aligned to this number of sectors. */
+	__u32 discard_sector_alignment;
+
+	/* the next 3 entries are guarded by VIRTIO_BLK_F_WRITE_ZEROES */
+	/*
+	 * The maximum number of write zeroes sectors (in 512-byte sectors) in
+	 * one segment.
+	 */
+	__u32 max_write_zeroes_sectors;
+	/*
+	 * The maximum number of segments in a write zeroes
+	 * command.
+	 */
+	__u32 max_write_zeroes_seg;
+	/*
+	 * Set if a VIRTIO_BLK_T_WRITE_ZEROES request may result in the
+	 * deallocation of one or more of the sectors.
+	 */
+	__u8 write_zeroes_may_unmap;
+
+	__u8 unused1[3];
 } __attribute__((packed));
 
 /*
@@ -114,6 +149,12 @@
 /* Get device ID command */
 #define VIRTIO_BLK_T_GET_ID    8
 
+/* Discard command */
+#define VIRTIO_BLK_T_DISCARD	11
+
+/* Write zeroes command */
+#define VIRTIO_BLK_T_WRITE_ZEROES	13
+
 #ifndef VIRTIO_BLK_NO_LEGACY
 /* Barrier before this op. */
 #define VIRTIO_BLK_T_BARRIER	0x80000000
@@ -133,6 +174,19 @@
 	__virtio64 sector;
 };
 
+/* Unmap this range (only valid for write zeroes command) */
+#define VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP	0x00000001
+
+/* Discard/write zeroes range for each request. */
+struct virtio_blk_discard_write_zeroes {
+	/* discard/write zeroes start sector */
+	__virtio64 sector;
+	/* number of discard/write zeroes sectors */
+	__virtio32 num_sectors;
+	/* flags for this range */
+	__virtio32 flags;
+};
+
 #ifndef VIRTIO_BLK_NO_LEGACY
 struct virtio_scsi_inhdr {
 	__virtio32 errors;

diff --git a/init/Kconfig b/init/Kconfig
index 47035b5..fe39bfd 100644
--- a/init/Kconfig
+++ b/init/Kconfig

@@ -23,9 +23,6 @@
 	int
 	default $(shell,$(srctree)/scripts/clang-version.sh $(CC))
 
-config CC_HAS_ASM_GOTO
-	def_bool $(success,$(srctree)/scripts/gcc-goto.sh $(CC))
-
 config CONSTRUCTORS
 	bool
 	depends on !UML
@@ -260,6 +257,15 @@
 	  used to provide more virtual memory than the actual RAM present
 	  in your computer.  If unsure say Y.
 
+config DISK_BASED_SWAP
+	bool "Allow disk-based swap files in Chromium OS kernels"
+	depends on SWAP
+	default n
+	help
+	  By default, the Chromium OS kernel allows swapping only to
+	  zram devices. This option allows you to use disk-based files
+	  as swap devices too.  If unsure say N.
+
 config SYSVIPC
 	bool "System V IPC"
 	---help---
@@ -999,6 +1005,29 @@
 	  desktop applications.  Task group autogeneration is currently based
 	  upon task session.
 
+config SCHED_TUNE
+	bool "Boosting for CFS tasks (EXPERIMENTAL)"
+	depends on SMP
+	help
+	  This option enables support for task classification using a new
+	  cgroup controller, schedtune. Schedtune allows tasks to be given
+	  a boost value and marked as latency-sensitive or not. This option
+	  provides the "schedtune" controller.
+
+	  This new controller:
+	  1. allows only a two layers hierarchy, where the root defines the
+	     system-wide boost value and its direct childrens define each one a
+	     different "class of tasks" to be boosted with a different value
+	  2. supports up to 16 different task classes, each one which could be
+	     configured with a different boost value
+
+	  Latency-sensitive tasks are not subject to energy-aware wakeup
+	  task placement. The boost value assigned to tasks is used to
+	  influence task placement and CPU frequency selection (if
+	  utilization-driven frequency selection is in use).
+
+	  If unsure, say N.
+
 config SYSFS_DEPRECATED
 	bool "Enable deprecated sysfs features to support old userspace tools"
 	depends on SYSFS

diff --git a/init/init_task.c b/init/init_task.c
index 5aebe3b..67fe17b 100644
--- a/init/init_task.c
+++ b/init/init_task.c

@@ -11,6 +11,8 @@
 #include <linux/mm.h>
 #include <linux/audit.h>
 
+#include <linux/alt-syscall.h>
+
 #include <asm/pgtable.h>
 #include <linux/uaccess.h>
 
@@ -121,9 +123,8 @@
 	.thread_pid	= &init_struct_pid,
 	.thread_group	= LIST_HEAD_INIT(init_task.thread_group),
 	.thread_node	= LIST_HEAD_INIT(init_signals.thread_head),
-#ifdef CONFIG_AUDITSYSCALL
-	.loginuid	= INVALID_UID,
-	.sessionid	= AUDIT_SID_UNSET,
+#ifdef CONFIG_AUDIT
+	.audit		= &init_struct_audit,
 #endif
 #ifdef CONFIG_PERF_EVENTS
 	.perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex),

diff --git a/init/main.c b/init/main.c
index 38a603f..a6301fb 100644
--- a/init/main.c
+++ b/init/main.c

@@ -92,6 +92,7 @@
 #include <linux/rodata_test.h>
 #include <linux/jump_label.h>
 #include <linux/mem_encrypt.h>
+#include <linux/audit.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -720,6 +721,7 @@
 	nsfs_init();
 	cpuset_init();
 	cgroup_init();
+	audit_task_init();
 	taskstats_init_early();
 	delayacct_init();
 

diff --git a/ipc/sem.c b/ipc/sem.c
index 26f8e37..2bf535d 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c

@@ -2345,11 +2345,9 @@
 		ipc_assert_locked_object(&sma->sem_perm);
 		list_del(&un->list_id);
 
-		/* we are the last process using this ulp, acquiring ulp->lock
-		 * isn't required. Besides that, we are also protected against
-		 * IPC_RMID as we hold sma->sem_perm lock now
-		 */
+		spin_lock(&ulp->lock);
 		list_del_rcu(&un->list_proc);
+		spin_unlock(&ulp->lock);
 
 		/* perform adjustments registered in un */
 		for (i = 0; i < sma->sem_nsems; i++) {

diff --git a/kernel/Makefile b/kernel/Makefile
index ad4b324..8bfac76 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile

@@ -44,6 +44,8 @@
 obj-y += livepatch/
 obj-y += dma/
 
+obj-$(CONFIG_ALT_SYSCALL) += alt-syscall.o
+
 obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o

diff --git a/kernel/alt-syscall.c b/kernel/alt-syscall.c
new file mode 100644
index 0000000..99599e1
--- /dev/null
+++ b/kernel/alt-syscall.c

@@ -0,0 +1,66 @@
+/*
+ * Alternate Syscall Table Infrastructure
+ *
+ * Copyright 2014 Google Inc. All Rights Reserved
+ *
+ * Authors:
+ *      Kees Cook   <keescook@chromium.org>
+ *      Will Drewry <wad@chromium.org>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/alt-syscall.h>
+
+static LIST_HEAD(alt_sys_call_tables);
+static DEFINE_SPINLOCK(alt_sys_call_tables_lock);
+
+/* XXX: there is no "unregister" yet. */
+int register_alt_sys_call_table(struct alt_sys_call_table *entry)
+{
+	if (!entry)
+		return -EINVAL;
+
+	spin_lock(&alt_sys_call_tables_lock);
+	list_add(&entry->node, &alt_sys_call_tables);
+	spin_unlock(&alt_sys_call_tables_lock);
+
+	pr_info("table '%s' available.\n", entry->name);
+
+	return 0;
+}
+
+int set_alt_sys_call_table(char * __user uname)
+{
+	char name[ALT_SYS_CALL_NAME_MAX + 1] = { };
+	struct alt_sys_call_table *entry;
+
+	if (copy_from_user(name, uname, ALT_SYS_CALL_NAME_MAX))
+		return -EFAULT;
+
+	spin_lock(&alt_sys_call_tables_lock);
+	list_for_each_entry(entry, &alt_sys_call_tables, node) {
+		if (!strcmp(entry->name, name)) {
+			if (arch_set_sys_call_table(entry))
+				continue;
+			spin_unlock(&alt_sys_call_tables_lock);
+			return 0;
+		}
+	}
+	spin_unlock(&alt_sys_call_tables_lock);
+
+	return -ENOENT;
+}

diff --git a/kernel/audit.c b/kernel/audit.c
index 2a80587..7d7755d 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c

@@ -216,6 +216,75 @@
 	struct sk_buff *skb;
 };
 
+static struct kmem_cache *audit_task_cache;
+
+void __init audit_task_init(void)
+{
+	audit_task_cache = kmem_cache_create("audit_task",
+					     sizeof(struct audit_task_info),
+					     0, SLAB_PANIC, NULL);
+}
+
+/**
+ * audit_alloc - allocate an audit info block for a task
+ * @tsk: task
+ *
+ * Call audit_alloc_syscall to filter on the task information and
+ * allocate a per-task audit context if necessary.  This is called from
+ * copy_process, so no lock is needed.
+ */
+int audit_alloc(struct task_struct *tsk)
+{
+	int ret = 0;
+	struct audit_task_info *info;
+
+	info = kmem_cache_alloc(audit_task_cache, GFP_KERNEL);
+	if (!info) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	info->loginuid = audit_get_loginuid(current);
+	info->sessionid = audit_get_sessionid(current);
+	info->contid = audit_get_contid(current);
+	tsk->audit = info;
+
+	ret = audit_alloc_syscall(tsk);
+	if (ret) {
+		tsk->audit = NULL;
+		kmem_cache_free(audit_task_cache, info);
+	}
+out:
+	return ret;
+}
+
+struct audit_task_info init_struct_audit = {
+	.loginuid = INVALID_UID,
+	.sessionid = AUDIT_SID_UNSET,
+	.contid = AUDIT_CID_UNSET,
+#ifdef CONFIG_AUDITSYSCALL
+	.ctx = NULL,
+#endif
+};
+
+/**
+ * audit_free - free per-task audit info
+ * @tsk: task whose audit info block to free
+ *
+ * Called from copy_process and do_exit
+ */
+void audit_free(struct task_struct *tsk)
+{
+	struct audit_task_info *info = tsk->audit;
+
+	audit_free_syscall(tsk);
+	/* Freeing the audit_task_info struct must be performed after
+	 * audit_log_exit() due to need for loginuid and sessionid.
+	 */
+	info = tsk->audit;
+	tsk->audit = NULL;
+	kmem_cache_free(audit_task_cache, info);
+}
+
 /**
  * auditd_test_task - Check to see if a given task is an audit daemon
  * @task: the task to check
@@ -1106,13 +1175,11 @@
 	audit_log_end(ab);
 }
 
-static int audit_set_feature(struct sk_buff *skb)
+static int audit_set_feature(struct audit_features *uaf)
 {
-	struct audit_features *uaf;
 	int i;
 
 	BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names));
-	uaf = nlmsg_data(nlmsg_hdr(skb));
 
 	/* if there is ever a version 2 we should handle that here */
 
@@ -1180,6 +1247,7 @@
 {
 	u32			seq;
 	void			*data;
+	int			data_len;
 	int			err;
 	struct audit_buffer	*ab;
 	u16			msg_type = nlh->nlmsg_type;
@@ -1193,6 +1261,7 @@
 
 	seq  = nlh->nlmsg_seq;
 	data = nlmsg_data(nlh);
+	data_len = nlmsg_len(nlh);
 
 	switch (msg_type) {
 	case AUDIT_GET: {
@@ -1216,7 +1285,7 @@
 		struct audit_status	s;
 		memset(&s, 0, sizeof(s));
 		/* guard against past and future API changes */
-		memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+		memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
 		if (s.mask & AUDIT_STATUS_ENABLED) {
 			err = audit_set_enabled(s.enabled);
 			if (err < 0)
@@ -1320,7 +1389,9 @@
 			return err;
 		break;
 	case AUDIT_SET_FEATURE:
-		err = audit_set_feature(skb);
+		if (data_len < sizeof(struct audit_features))
+			return -EINVAL;
+		err = audit_set_feature(data);
 		if (err)
 			return err;
 		break;
@@ -1332,6 +1403,8 @@
 
 		err = audit_filter(msg_type, AUDIT_FILTER_USER);
 		if (err == 1) { /* match or error */
+			char *str = data;
+
 			err = 0;
 			if (msg_type == AUDIT_USER_TTY) {
 				err = tty_audit_push();
@@ -1339,26 +1412,24 @@
 					break;
 			}
 			audit_log_common_recv_msg(&ab, msg_type);
-			if (msg_type != AUDIT_USER_TTY)
+			if (msg_type != AUDIT_USER_TTY) {
+				/* ensure NULL termination */
+				str[data_len - 1] = '\0';
 				audit_log_format(ab, " msg='%.*s'",
 						 AUDIT_MESSAGE_TEXT_MAX,
-						 (char *)data);
-			else {
-				int size;
-
+						 str);
+			} else {
 				audit_log_format(ab, " data=");
-				size = nlmsg_len(nlh);
-				if (size > 0 &&
-				    ((unsigned char *)data)[size - 1] == '\0')
-					size--;
-				audit_log_n_untrustedstring(ab, data, size);
+				if (data_len > 0 && str[data_len - 1] == '\0')
+					data_len--;
+				audit_log_n_untrustedstring(ab, str, data_len);
 			}
 			audit_log_end(ab);
 		}
 		break;
 	case AUDIT_ADD_RULE:
 	case AUDIT_DEL_RULE:
-		if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
+		if (data_len < sizeof(struct audit_rule_data))
 			return -EINVAL;
 		if (audit_enabled == AUDIT_LOCKED) {
 			audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
@@ -1366,7 +1437,7 @@
 			audit_log_end(ab);
 			return -EPERM;
 		}
-		err = audit_rule_change(msg_type, seq, data, nlmsg_len(nlh));
+		err = audit_rule_change(msg_type, seq, data, data_len);
 		break;
 	case AUDIT_LIST_RULES:
 		err = audit_list_rules_send(skb, seq);
@@ -1380,7 +1451,7 @@
 	case AUDIT_MAKE_EQUIV: {
 		void *bufp = data;
 		u32 sizes[2];
-		size_t msglen = nlmsg_len(nlh);
+		size_t msglen = data_len;
 		char *old, *new;
 
 		err = -EINVAL;
@@ -1456,7 +1527,7 @@
 
 		memset(&s, 0, sizeof(s));
 		/* guard against past and future API changes */
-		memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+		memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
 		/* check if new data is valid */
 		if ((s.enabled != 0 && s.enabled != 1) ||
 		    (s.log_passwd != 0 && s.log_passwd != 1))
@@ -2022,6 +2093,12 @@
 	if (prefix)
 		audit_log_format(ab, "%s", prefix);
 
+	/* The process may be exiting. */
+	if (!current->fs) {
+		audit_log_string(ab, "<unknown>");
+		return;
+	}
+
 	/* We will allow 11 spaces for ' (deleted)' to be appended */
 	pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
 	if (!pathname) {
@@ -2323,6 +2400,73 @@
 }
 
 /**
+ * audit_set_contid - set current task's audit contid
+ * @contid: contid value
+ *
+ * Returns 0 on success, -EPERM on permission failure.
+ *
+ * Called (set) from fs/proc/base.c::proc_contid_write().
+ */
+int audit_set_contid(struct task_struct *task, u64 contid)
+{
+	u64 oldcontid;
+	int rc = 0;
+	struct audit_buffer *ab;
+	uid_t uid;
+	struct tty_struct *tty;
+	char comm[sizeof(current->comm)];
+
+	task_lock(task);
+	/* Can't set if audit disabled */
+	if (!task->audit) {
+		task_unlock(task);
+		return -ENOPROTOOPT;
+	}
+	oldcontid = audit_get_contid(task);
+	read_lock(&tasklist_lock);
+	/* Don't allow the audit containerid to be unset */
+	if (!audit_contid_valid(contid))
+		rc = -EINVAL;
+	/* if we don't have caps, reject */
+	else if (!capable(CAP_AUDIT_CONTROL))
+		rc = -EPERM;
+	/* if task has children or is not single-threaded, deny */
+	else if (!list_empty(&task->children))
+		rc = -EBUSY;
+	else if (!(thread_group_leader(task) && thread_group_empty(task)))
+		rc = -EALREADY;
+	read_unlock(&tasklist_lock);
+	if (!rc)
+		task->audit->contid = contid;
+	task_unlock(task);
+
+	if (!audit_enabled)
+		return rc;
+
+	ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONTAINER_OP);
+	if (!ab)
+		return rc;
+
+	uid = from_kuid(&init_user_ns, task_uid(current));
+	tty = audit_get_tty(current);
+	audit_log_format(ab,
+			 "op=set opid=%d contid=%llu old-contid=%llu pid=%d uid=%u auid=%u tty=%s ses=%u",
+			 task_tgid_nr(task), contid, oldcontid,
+			 task_tgid_nr(current), uid,
+			 from_kuid(&init_user_ns, audit_get_loginuid(current)),
+			 tty ? tty_name(tty) : "(none)",
+			 audit_get_sessionid(current));
+	audit_put_tty(tty);
+	audit_log_task_context(ab);
+	audit_log_format(ab, " comm=");
+	audit_log_untrustedstring(ab, get_task_comm(comm, current));
+	audit_log_d_path_exe(ab, current->mm);
+	audit_log_format(ab, " res=%d", !rc);
+	audit_log_end(ab);
+	return rc;
+}
+
+/**
  * audit_log_end - end one audit record
  * @ab: the audit_buffer
  *

diff --git a/kernel/audit.h b/kernel/audit.h
index 214e149..6bf56f3 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h

@@ -147,6 +147,7 @@
 	kuid_t		    target_uid;
 	unsigned int	    target_sessionid;
 	u32		    target_sid;
+	u64		    target_cid;
 	char		    target_comm[TASK_COMM_LEN];
 
 	struct audit_tree_refs *trees, *first_trees;
@@ -267,6 +268,8 @@
 
 /* audit watch functions */
 #ifdef CONFIG_AUDIT_WATCH
+extern int audit_alloc_syscall(struct task_struct *tsk);
+extern void audit_free_syscall(struct task_struct *tsk);
 extern void audit_put_watch(struct audit_watch *watch);
 extern void audit_get_watch(struct audit_watch *watch);
 extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
@@ -284,6 +287,8 @@
 extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark);
 
 #else
+#define audit_alloc_syscall(t) 0
+#define audit_free_syscall(t) {}
 #define audit_put_watch(w) {}
 #define audit_get_watch(w) {}
 #define audit_to_watch(k, p, l, o) (-EINVAL)

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 425c67e..1c8a48a 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c

@@ -452,6 +452,7 @@
 	bufp = data->buf;
 	for (i = 0; i < data->field_count; i++) {
 		struct audit_field *f = &entry->rule.fields[i];
+		u32 f_val;
 
 		err = -EINVAL;
 
@@ -460,12 +461,12 @@
 			goto exit_free;
 
 		f->type = data->fields[i];
-		f->val = data->values[i];
+		f_val = data->values[i];
 
 		/* Support legacy tests for a valid loginuid */
-		if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
+		if ((f->type == AUDIT_LOGINUID) && (f_val == AUDIT_UID_UNSET)) {
 			f->type = AUDIT_LOGINUID_SET;
-			f->val = 0;
+			f_val = 0;
 			entry->rule.pflags |= AUDIT_LOGINUID_LEGACY;
 		}
 
@@ -481,7 +482,7 @@
 		case AUDIT_SUID:
 		case AUDIT_FSUID:
 		case AUDIT_OBJ_UID:
-			f->uid = make_kuid(current_user_ns(), f->val);
+			f->uid = make_kuid(current_user_ns(), f_val);
 			if (!uid_valid(f->uid))
 				goto exit_free;
 			break;
@@ -490,11 +491,12 @@
 		case AUDIT_SGID:
 		case AUDIT_FSGID:
 		case AUDIT_OBJ_GID:
-			f->gid = make_kgid(current_user_ns(), f->val);
+			f->gid = make_kgid(current_user_ns(), f_val);
 			if (!gid_valid(f->gid))
 				goto exit_free;
 			break;
 		case AUDIT_ARCH:
+			f->val = f_val;
 			entry->rule.arch_f = f;
 			break;
 		case AUDIT_SUBJ_USER:
@@ -507,11 +509,13 @@
 		case AUDIT_OBJ_TYPE:
 		case AUDIT_OBJ_LEV_LOW:
 		case AUDIT_OBJ_LEV_HIGH:
-			str = audit_unpack_string(&bufp, &remain, f->val);
-			if (IS_ERR(str))
+			str = audit_unpack_string(&bufp, &remain, f_val);
+			if (IS_ERR(str)) {
+				err = PTR_ERR(str);
 				goto exit_free;
-			entry->rule.buflen += f->val;
-
+			}
+			entry->rule.buflen += f_val;
+			f->lsm_str = str;
 			err = security_audit_rule_init(f->type, f->op, str,
 						       (void **)&f->lsm_rule);
 			/* Keep currently invalid fields around in case they
@@ -520,68 +524,71 @@
 				pr_warn("audit rule for LSM \'%s\' is invalid\n",
 					str);
 				err = 0;
-			}
-			if (err) {
-				kfree(str);
+			} else if (err)
 				goto exit_free;
-			} else
-				f->lsm_str = str;
 			break;
 		case AUDIT_WATCH:
-			str = audit_unpack_string(&bufp, &remain, f->val);
-			if (IS_ERR(str))
+			str = audit_unpack_string(&bufp, &remain, f_val);
+			if (IS_ERR(str)) {
+				err = PTR_ERR(str);
 				goto exit_free;
-			entry->rule.buflen += f->val;
-
-			err = audit_to_watch(&entry->rule, str, f->val, f->op);
+			}
+			err = audit_to_watch(&entry->rule, str, f_val, f->op);
 			if (err) {
 				kfree(str);
 				goto exit_free;
 			}
+			entry->rule.buflen += f_val;
 			break;
 		case AUDIT_DIR:
-			str = audit_unpack_string(&bufp, &remain, f->val);
-			if (IS_ERR(str))
+			str = audit_unpack_string(&bufp, &remain, f_val);
+			if (IS_ERR(str)) {
+				err = PTR_ERR(str);
 				goto exit_free;
-			entry->rule.buflen += f->val;
-
+			}
 			err = audit_make_tree(&entry->rule, str, f->op);
 			kfree(str);
 			if (err)
 				goto exit_free;
+			entry->rule.buflen += f_val;
 			break;
 		case AUDIT_INODE:
+			f->val = f_val;
 			err = audit_to_inode(&entry->rule, f);
 			if (err)
 				goto exit_free;
 			break;
 		case AUDIT_FILTERKEY:
-			if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
+			if (entry->rule.filterkey || f_val > AUDIT_MAX_KEY_LEN)
 				goto exit_free;
-			str = audit_unpack_string(&bufp, &remain, f->val);
-			if (IS_ERR(str))
-				goto exit_free;
-			entry->rule.buflen += f->val;
-			entry->rule.filterkey = str;
-			break;
-		case AUDIT_EXE:
-			if (entry->rule.exe || f->val > PATH_MAX)
-				goto exit_free;
-			str = audit_unpack_string(&bufp, &remain, f->val);
+			str = audit_unpack_string(&bufp, &remain, f_val);
 			if (IS_ERR(str)) {
 				err = PTR_ERR(str);
 				goto exit_free;
 			}
-			entry->rule.buflen += f->val;
-
-			audit_mark = audit_alloc_mark(&entry->rule, str, f->val);
+			entry->rule.buflen += f_val;
+			entry->rule.filterkey = str;
+			break;
+		case AUDIT_EXE:
+			if (entry->rule.exe || f_val > PATH_MAX)
+				goto exit_free;
+			str = audit_unpack_string(&bufp, &remain, f_val);
+			if (IS_ERR(str)) {
+				err = PTR_ERR(str);
+				goto exit_free;
+			}
+			audit_mark = audit_alloc_mark(&entry->rule, str, f_val);
 			if (IS_ERR(audit_mark)) {
 				kfree(str);
 				err = PTR_ERR(audit_mark);
 				goto exit_free;
 			}
+			entry->rule.buflen += f_val;
 			entry->rule.exe = audit_mark;
 			break;
+		default:
+			f->val = f_val;
+			break;
 		}
 	}
 

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1513873..9c32b70 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c

@@ -113,6 +113,7 @@
 	kuid_t			target_uid[AUDIT_AUX_PIDS];
 	unsigned int		target_sessionid[AUDIT_AUX_PIDS];
 	u32			target_sid[AUDIT_AUX_PIDS];
+	u64			target_cid[AUDIT_AUX_PIDS];
 	char 			target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
 	int			pid_count;
 };
@@ -841,7 +842,7 @@
 						      int return_valid,
 						      long return_code)
 {
-	struct audit_context *context = tsk->audit_context;
+	struct audit_context *context = tsk->audit->ctx;
 
 	if (!context)
 		return NULL;
@@ -926,23 +927,25 @@
 	return context;
 }
 
-/**
- * audit_alloc - allocate an audit context block for a task
+/*
+ * audit_alloc_syscall - allocate an audit context block for a task
  * @tsk: task
  *
  * Filter on the task information and allocate a per-task audit context
  * if necessary.  Doing so turns on system call auditing for the
- * specified task.  This is called from copy_process, so no lock is
- * needed.
+ * specified task.  This is called from copy_process via audit_alloc, so
+ * no lock is needed.
  */
-int audit_alloc(struct task_struct *tsk)
+int audit_alloc_syscall(struct task_struct *tsk)
 {
 	struct audit_context *context;
 	enum audit_state     state;
 	char *key = NULL;
 
-	if (likely(!audit_ever_enabled))
+	if (likely(!audit_ever_enabled)) {
+		audit_set_context(tsk, NULL);
 		return 0; /* Return if not auditing. */
+	}
 
 	state = audit_filter_task(tsk, &key);
 	if (state == AUDIT_DISABLED) {
@@ -952,7 +955,7 @@
 
 	if (!(context = audit_alloc_context(state))) {
 		kfree(key);
-		audit_log_lost("out of memory in audit_alloc");
+		audit_log_lost("out of memory in audit_alloc_syscall");
 		return -ENOMEM;
 	}
 	context->filterkey = key;
@@ -1473,16 +1476,16 @@
 }
 
 /**
- * __audit_free - free a per-task audit context
+ * audit_free_syscall - free per-task audit context info
  * @tsk: task whose audit context block to free
  *
- * Called from copy_process and do_exit
+ * Called from audit_free
  */
-void __audit_free(struct task_struct *tsk)
+void audit_free_syscall(struct task_struct *tsk)
 {
-	struct audit_context *context;
+	struct audit_task_info *info = tsk->audit;
+	struct audit_context *context = info->ctx;
 
-	context = audit_take_context(tsk, 0, 0);
 	if (!context)
 		return;
 
@@ -2075,8 +2078,8 @@
 			sessionid = (unsigned int)atomic_inc_return(&session_id);
 	}
 
-	task->sessionid = sessionid;
-	task->loginuid = loginuid;
+	task->audit->sessionid = sessionid;
+	task->audit->loginuid = loginuid;
 out:
 	audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
 	return rc;
@@ -2272,6 +2275,7 @@
 	context->target_uid = task_uid(t);
 	context->target_sessionid = audit_get_sessionid(t);
 	security_task_getsecid(t, &context->target_sid);
+	context->target_cid = audit_get_contid(t);
 	memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
 }
 
@@ -2312,6 +2316,7 @@
 		ctx->target_uid = t_uid;
 		ctx->target_sessionid = audit_get_sessionid(t);
 		security_task_getsecid(t, &ctx->target_sid);
+		ctx->target_cid = audit_get_contid(t);
 		memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
 		return 0;
 	}
@@ -2333,6 +2338,7 @@
 	axp->target_uid[axp->pid_count] = t_uid;
 	axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
 	security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
+	axp->target_cid[axp->pid_count] = audit_get_contid(t);
 	memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
 	axp->pid_count++;
 

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index dc9d7ac..c04815b 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c

@@ -198,6 +198,7 @@
 	void *key = map_iter(m)->key;
 	void *prev_key;
 
+	(*pos)++;
 	if (map_iter(m)->done)
 		return NULL;
 
@@ -210,8 +211,6 @@
 		map_iter(m)->done = true;
 		return NULL;
 	}
-
-	++(*pos);
 	return key;
 }
 

diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 86477f3..66e13aa 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c

@@ -289,7 +289,7 @@
 
 	ulen = info->jited_prog_len;
 	info->jited_prog_len = aux->offload->jited_len;
-	if (info->jited_prog_len & ulen) {
+	if (info->jited_prog_len && ulen) {
 		uinsns = u64_to_user_ptr(info->jited_prog_insns);
 		ulen = min_t(u32, info->jited_prog_len, ulen);
 		if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) {

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8d6b8b5..5bf779d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c

@@ -493,8 +493,7 @@
 	if (WARN_ON_ONCE((!cpu_online(cpu))))
 		return -ECANCELED;
 
-	/* Unpark the stopper thread and the hotplug thread of the target cpu */
-	stop_machine_unpark(cpu);
+	/* Unpark the hotplug thread of the target cpu */
 	kthread_unpark(st->thread);
 
 	/*
@@ -1048,8 +1047,8 @@
 
 /*
  * Called from the idle task. Wake up the controlling task which brings the
- * stopper and the hotplug thread of the upcoming CPU up and then delegates
- * the rest of the online bringup to the hotplug thread.
+ * hotplug thread of the upcoming CPU up and then delegates the rest of the
+ * online bringup to the hotplug thread.
  */
 void cpuhp_online_idle(enum cpuhp_state state)
 {
@@ -1059,6 +1058,12 @@
 	if (state != CPUHP_AP_ONLINE_IDLE)
 		return;
 
+	/*
+	 * Unpart the stopper thread before we start the idle loop (and start
+	 * scheduling); this ensures the stopper task is always available.
+	 */
+	stop_machine_unpark(smp_processor_id());
+
 	st->state = CPUHP_AP_ONLINE_IDLE;
 	complete_ap_thread(st, true);
 }
@@ -1221,6 +1226,7 @@
 void enable_nonboot_cpus(void)
 {
 	int cpu, error;
+	struct device *cpu_device;
 
 	/* Allow everyone to use the CPU hotplug again */
 	cpu_maps_update_begin();
@@ -1238,6 +1244,12 @@
 		trace_suspend_resume(TPS("CPU_ON"), cpu, false);
 		if (!error) {
 			pr_info("CPU%d is up\n", cpu);
+			cpu_device = get_cpu_device(cpu);
+			if (!cpu_device)
+				pr_err("%s: failed to get cpu%d device\n",
+				       __func__, cpu);
+			else
+				kobject_uevent(&cpu_device->kobj, KOBJ_ONLINE);
 			continue;
 		}
 		pr_warn("Error taking CPU%d up: %d\n", cpu, error);

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 8c70ee2..4fc598a4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c

@@ -8474,30 +8474,39 @@
  *
  * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
  *                               if not set, create kprobe/uprobe
+ *
+ * The following values specify a reference counter (or semaphore in the
+ * terminology of tools like dtrace, systemtap, etc.) Userspace Statically
+ * Defined Tracepoints (USDT). Currently, we use 40 bit for the offset.
+ *
+ * PERF_UPROBE_REF_CTR_OFFSET_BITS	# of bits in config as th offset
+ * PERF_UPROBE_REF_CTR_OFFSET_SHIFT	# of bits to shift left
  */
 enum perf_probe_config {
 	PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,  /* [k,u]retprobe */
+	PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
+	PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
 };
 
 PMU_FORMAT_ATTR(retprobe, "config:0");
+#endif
 
-static struct attribute *probe_attrs[] = {
+#ifdef CONFIG_KPROBE_EVENTS
+static struct attribute *kprobe_attrs[] = {
 	&format_attr_retprobe.attr,
 	NULL,
 };
 
-static struct attribute_group probe_format_group = {
+static struct attribute_group kprobe_format_group = {
 	.name = "format",
-	.attrs = probe_attrs,
+	.attrs = kprobe_attrs,
 };
 
-static const struct attribute_group *probe_attr_groups[] = {
-	&probe_format_group,
+static const struct attribute_group *kprobe_attr_groups[] = {
+	&kprobe_format_group,
 	NULL,
 };
-#endif
 
-#ifdef CONFIG_KPROBE_EVENTS
 static int perf_kprobe_event_init(struct perf_event *event);
 static struct pmu perf_kprobe = {
 	.task_ctx_nr	= perf_sw_context,
@@ -8507,7 +8516,7 @@
 	.start		= perf_swevent_start,
 	.stop		= perf_swevent_stop,
 	.read		= perf_swevent_read,
-	.attr_groups	= probe_attr_groups,
+	.attr_groups	= kprobe_attr_groups,
 };
 
 static int perf_kprobe_event_init(struct perf_event *event)
@@ -8539,6 +8548,24 @@
 #endif /* CONFIG_KPROBE_EVENTS */
 
 #ifdef CONFIG_UPROBE_EVENTS
+PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");
+
+static struct attribute *uprobe_attrs[] = {
+	&format_attr_retprobe.attr,
+	&format_attr_ref_ctr_offset.attr,
+	NULL,
+};
+
+static struct attribute_group uprobe_format_group = {
+	.name = "format",
+	.attrs = uprobe_attrs,
+};
+
+static const struct attribute_group *uprobe_attr_groups[] = {
+	&uprobe_format_group,
+	NULL,
+};
+
 static int perf_uprobe_event_init(struct perf_event *event);
 static struct pmu perf_uprobe = {
 	.task_ctx_nr	= perf_sw_context,
@@ -8548,12 +8575,13 @@
 	.start		= perf_swevent_start,
 	.stop		= perf_swevent_stop,
 	.read		= perf_swevent_read,
-	.attr_groups	= probe_attr_groups,
+	.attr_groups	= uprobe_attr_groups,
 };
 
 static int perf_uprobe_event_init(struct perf_event *event)
 {
 	int err;
+	unsigned long ref_ctr_offset;
 	bool is_retprobe;
 
 	if (event->attr.type != perf_uprobe.type)
@@ -8569,7 +8597,8 @@
 		return -EOPNOTSUPP;
 
 	is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
-	err = perf_uprobe_init(event, is_retprobe);
+	ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
+	err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
 	if (err)
 		return err;
 
@@ -10507,9 +10536,8 @@
  * @cpu:		target cpu
  * @group_fd:		group leader event fd
  */
-SYSCALL_DEFINE5(perf_event_open,
-		struct perf_event_attr __user *, attr_uptr,
-		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
+int ksys_perf_event_open(struct perf_event_attr __user * attr_uptr, pid_t pid,
+			 int cpu, int group_fd, unsigned long flags)
 {
 	struct perf_event *group_leader = NULL, *output_event = NULL;
 	struct perf_event *event, *sibling;
@@ -10937,6 +10965,13 @@
 	return err;
 }
 
+SYSCALL_DEFINE5(perf_event_open,
+		struct perf_event_attr __user *, attr_uptr,
+		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
+{
+	return ksys_perf_event_open(attr_uptr, pid, cpu, group_fd, flags);
+}
+
 /**
  * perf_event_create_kernel_counter
  *

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c173e41..223a7a5 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c

@@ -73,6 +73,7 @@
 	struct uprobe_consumer	*consumers;
 	struct inode		*inode;		/* Also hold a ref to inode */
 	loff_t			offset;
+	loff_t			ref_ctr_offset;
 	unsigned long		flags;
 
 	/*
@@ -88,6 +89,15 @@
 	struct arch_uprobe	arch;
 };
 
+struct delayed_uprobe {
+	struct list_head list;
+	struct uprobe *uprobe;
+	struct mm_struct *mm;
+};
+
+static DEFINE_MUTEX(delayed_uprobe_lock);
+static LIST_HEAD(delayed_uprobe_list);
+
 /*
  * Execute out of line area: anonymous executable mapping installed
  * by the probed task to execute the copy of the original instruction
@@ -282,6 +292,166 @@
 	return 1;
 }
 
+static struct delayed_uprobe *
+delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm)
+{
+	struct delayed_uprobe *du;
+
+	list_for_each_entry(du, &delayed_uprobe_list, list)
+		if (du->uprobe == uprobe && du->mm == mm)
+			return du;
+	return NULL;
+}
+
+static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm)
+{
+	struct delayed_uprobe *du;
+
+	if (delayed_uprobe_check(uprobe, mm))
+		return 0;
+
+	du  = kzalloc(sizeof(*du), GFP_KERNEL);
+	if (!du)
+		return -ENOMEM;
+
+	du->uprobe = uprobe;
+	du->mm = mm;
+	list_add(&du->list, &delayed_uprobe_list);
+	return 0;
+}
+
+static void delayed_uprobe_delete(struct delayed_uprobe *du)
+{
+	if (WARN_ON(!du))
+		return;
+	list_del(&du->list);
+	kfree(du);
+}
+
+static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm)
+{
+	struct list_head *pos, *q;
+	struct delayed_uprobe *du;
+
+	if (!uprobe && !mm)
+		return;
+
+	list_for_each_safe(pos, q, &delayed_uprobe_list) {
+		du = list_entry(pos, struct delayed_uprobe, list);
+
+		if (uprobe && du->uprobe != uprobe)
+			continue;
+		if (mm && du->mm != mm)
+			continue;
+
+		delayed_uprobe_delete(du);
+	}
+}
+
+static bool valid_ref_ctr_vma(struct uprobe *uprobe,
+			      struct vm_area_struct *vma)
+{
+	unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset);
+
+	return uprobe->ref_ctr_offset &&
+		vma->vm_file &&
+		file_inode(vma->vm_file) == uprobe->inode &&
+		(vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
+		vma->vm_start <= vaddr &&
+		vma->vm_end > vaddr;
+}
+
+static struct vm_area_struct *
+find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
+{
+	struct vm_area_struct *tmp;
+
+	for (tmp = mm->mmap; tmp; tmp = tmp->vm_next)
+		if (valid_ref_ctr_vma(uprobe, tmp))
+			return tmp;
+
+	return NULL;
+}
+
+static int
+__update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
+{
+	void *kaddr;
+	struct page *page;
+	struct vm_area_struct *vma;
+	int ret;
+	short *ptr;
+
+	if (!vaddr || !d)
+		return -EINVAL;
+
+	ret = get_user_pages_remote(NULL, mm, vaddr, 1,
+			FOLL_WRITE, &page, &vma, NULL);
+	if (unlikely(ret <= 0)) {
+		/*
+		 * We are asking for 1 page. If get_user_pages_remote() fails,
+		 * it may return 0, in that case we have to return error.
+		 */
+		return ret == 0 ? -EBUSY : ret;
+	}
+
+	kaddr = kmap_atomic(page);
+	ptr = kaddr + (vaddr & ~PAGE_MASK);
+
+	if (unlikely(*ptr + d < 0)) {
+		pr_warn("ref_ctr going negative. vaddr: 0x%lx, "
+			"curr val: %d, delta: %d\n", vaddr, *ptr, d);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	*ptr += d;
+	ret = 0;
+out:
+	kunmap_atomic(kaddr);
+	put_page(page);
+	return ret;
+}
+
+static void update_ref_ctr_warn(struct uprobe *uprobe,
+				struct mm_struct *mm, short d)
+{
+	pr_warn("ref_ctr %s failed for inode: 0x%lx offset: "
+		"0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n",
+		d > 0 ? "increment" : "decrement", uprobe->inode->i_ino,
+		(unsigned long long) uprobe->offset,
+		(unsigned long long) uprobe->ref_ctr_offset, mm);
+}
+
+static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
+			  short d)
+{
+	struct vm_area_struct *rc_vma;
+	unsigned long rc_vaddr;
+	int ret = 0;
+
+	rc_vma = find_ref_ctr_vma(uprobe, mm);
+
+	if (rc_vma) {
+		rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset);
+		ret = __update_ref_ctr(mm, rc_vaddr, d);
+		if (ret)
+			update_ref_ctr_warn(uprobe, mm, d);
+
+		if (d > 0)
+			return ret;
+	}
+
+	mutex_lock(&delayed_uprobe_lock);
+	if (d > 0)
+		ret = delayed_uprobe_add(uprobe, mm);
+	else
+		delayed_uprobe_remove(uprobe, mm);
+	mutex_unlock(&delayed_uprobe_lock);
+
+	return ret;
+}
+
 /*
  * NOTE:
  * Expect the breakpoint instruction to be the smallest size instruction for
@@ -302,9 +472,13 @@
 int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
 			unsigned long vaddr, uprobe_opcode_t opcode)
 {
+	struct uprobe *uprobe;
 	struct page *old_page, *new_page;
 	struct vm_area_struct *vma;
-	int ret;
+	int ret, is_register, ref_ctr_updated = 0;
+
+	is_register = is_swbp_insn(&opcode);
+	uprobe = container_of(auprobe, struct uprobe, arch);
 
 retry:
 	/* Read the page with vaddr into memory */
@@ -317,6 +491,15 @@
 	if (ret <= 0)
 		goto put_old;
 
+	/* We are going to replace instruction, update ref_ctr. */
+	if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
+		ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
+		if (ret)
+			goto put_old;
+
+		ref_ctr_updated = 1;
+	}
+
 	ret = anon_vma_prepare(vma);
 	if (ret)
 		goto put_old;
@@ -337,6 +520,11 @@
 
 	if (unlikely(ret == -EAGAIN))
 		goto retry;
+
+	/* Revert back reference counter if instruction update failed. */
+	if (ret && is_register && ref_ctr_updated)
+		update_ref_ctr(uprobe, mm, -1);
+
 	return ret;
 }
 
@@ -378,8 +566,15 @@
 
 static void put_uprobe(struct uprobe *uprobe)
 {
-	if (atomic_dec_and_test(&uprobe->ref))
+	if (atomic_dec_and_test(&uprobe->ref)) {
+		/*
+		 * If application munmap(exec_vma) before uprobe_unregister()
+		 * gets called, we don't get a chance to remove uprobe from
+		 * delayed_uprobe_list from remove_breakpoint(). Do it here.
+		 */
+		delayed_uprobe_remove(uprobe, NULL);
 		kfree(uprobe);
+	}
 }
 
 static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -484,7 +679,8 @@
 	return u;
 }
 
-static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
+static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
+				   loff_t ref_ctr_offset)
 {
 	struct uprobe *uprobe, *cur_uprobe;
 
@@ -494,6 +690,7 @@
 
 	uprobe->inode = inode;
 	uprobe->offset = offset;
+	uprobe->ref_ctr_offset = ref_ctr_offset;
 	init_rwsem(&uprobe->register_rwsem);
 	init_rwsem(&uprobe->consumer_rwsem);
 
@@ -895,7 +1092,7 @@
  * else return 0 (success)
  */
 static int __uprobe_register(struct inode *inode, loff_t offset,
-			     struct uprobe_consumer *uc)
+			     loff_t ref_ctr_offset, struct uprobe_consumer *uc)
 {
 	struct uprobe *uprobe;
 	int ret;
@@ -912,7 +1109,7 @@
 		return -EINVAL;
 
  retry:
-	uprobe = alloc_uprobe(inode, offset);
+	uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
 	if (!uprobe)
 		return -ENOMEM;
 	/*
@@ -938,10 +1135,17 @@
 int uprobe_register(struct inode *inode, loff_t offset,
 		    struct uprobe_consumer *uc)
 {
-	return __uprobe_register(inode, offset, uc);
+	return __uprobe_register(inode, offset, 0, uc);
 }
 EXPORT_SYMBOL_GPL(uprobe_register);
 
+int uprobe_register_refctr(struct inode *inode, loff_t offset,
+			   loff_t ref_ctr_offset, struct uprobe_consumer *uc)
+{
+	return __uprobe_register(inode, offset, ref_ctr_offset, uc);
+}
+EXPORT_SYMBOL_GPL(uprobe_register_refctr);
+
 /*
  * uprobe_apply - unregister an already registered probe.
  * @inode: the file in which the probe has to be removed.
@@ -1060,6 +1264,35 @@
 	spin_unlock(&uprobes_treelock);
 }
 
+/* @vma contains reference counter, not the probed instruction. */
+static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
+{
+	struct list_head *pos, *q;
+	struct delayed_uprobe *du;
+	unsigned long vaddr;
+	int ret = 0, err = 0;
+
+	mutex_lock(&delayed_uprobe_lock);
+	list_for_each_safe(pos, q, &delayed_uprobe_list) {
+		du = list_entry(pos, struct delayed_uprobe, list);
+
+		if (du->mm != vma->vm_mm ||
+		    !valid_ref_ctr_vma(du->uprobe, vma))
+			continue;
+
+		vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset);
+		ret = __update_ref_ctr(vma->vm_mm, vaddr, 1);
+		if (ret) {
+			update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1);
+			if (!err)
+				err = ret;
+		}
+		delayed_uprobe_delete(du);
+	}
+	mutex_unlock(&delayed_uprobe_lock);
+	return err;
+}
+
 /*
  * Called from mmap_region/vma_adjust with mm->mmap_sem acquired.
  *
@@ -1072,7 +1305,15 @@
 	struct uprobe *uprobe, *u;
 	struct inode *inode;
 
-	if (no_uprobe_events() || !valid_vma(vma, true))
+	if (no_uprobe_events())
+		return 0;
+
+	if (vma->vm_file &&
+	    (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
+	    test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags))
+		delayed_ref_ctr_inc(vma);
+
+	if (!valid_vma(vma, true))
 		return 0;
 
 	inode = file_inode(vma->vm_file);
@@ -1246,6 +1487,10 @@
 {
 	struct xol_area *area = mm->uprobes_state.xol_area;
 
+	mutex_lock(&delayed_uprobe_lock);
+	delayed_uprobe_remove(NULL, mm);
+	mutex_unlock(&delayed_uprobe_lock);
+
 	if (!area)
 		return;
 

diff --git a/kernel/exit.c b/kernel/exit.c
index 54c3269..34bb2b0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c

@@ -62,6 +62,7 @@
 #include <linux/random.h>
 #include <linux/rcuwait.h>
 #include <linux/compat.h>
+#include <linux/security.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -855,6 +856,8 @@
 #endif
 		if (tsk->mm)
 			setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
+
+		security_task_exit(tsk);
 	}
 	acct_collect(code, group_dead);
 	if (group_dead)

diff --git a/kernel/fork.c b/kernel/fork.c
index 1a2d18e..01b49a5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c

@@ -1815,7 +1815,6 @@
 	posix_cpu_timers_init(p);
 
 	p->io_context = NULL;
-	audit_set_context(p, NULL);
 	cgroup_fork(p);
 #ifdef CONFIG_NUMA
 	p->mempolicy = mpol_dup(p->mempolicy);
@@ -2084,6 +2083,8 @@
 	trace_task_newtask(p, clone_flags);
 	uprobe_copy_process(p, clone_flags);
 
+	security_task_post_alloc(p);
+
 	return p;
 
 bad_fork_cancel_cgroup:

diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 1e3823f..f71c1ad 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig

@@ -53,6 +53,7 @@
 choice
 	prompt "Specify GCOV format"
 	depends on GCOV_KERNEL
+	depends on CC_IS_GCC
 	---help---
 	The gcov format is usually determined by the GCC version, and the
 	default is chosen according to your GCC version. However, there are
@@ -62,7 +63,7 @@
 
 config GCOV_FORMAT_3_4
 	bool "GCC 3.4 format"
-	depends on CC_IS_GCC && GCC_VERSION < 40700
+	depends on GCC_VERSION < 40700
 	---help---
 	Select this option to use the format defined by GCC 3.4.
 

diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index ff06d64..d66a74b 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile

@@ -2,5 +2,6 @@
 ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
 
 obj-y := base.o fs.o
-obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o
-obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o
+obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o
+obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o
+obj-$(CONFIG_CC_IS_CLANG) += clang.o

diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 9c7c8d5..0ffe9f1 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c

@@ -22,88 +22,8 @@
 #include <linux/sched.h>
 #include "gcov.h"
 
-static int gcov_events_enabled;
-static DEFINE_MUTEX(gcov_lock);
-
-/*
- * __gcov_init is called by gcc-generated constructor code for each object
- * file compiled with -fprofile-arcs.
- */
-void __gcov_init(struct gcov_info *info)
-{
-	static unsigned int gcov_version;
-
-	mutex_lock(&gcov_lock);
-	if (gcov_version == 0) {
-		gcov_version = gcov_info_version(info);
-		/*
-		 * Printing gcc's version magic may prove useful for debugging
-		 * incompatibility reports.
-		 */
-		pr_info("version magic: 0x%x\n", gcov_version);
-	}
-	/*
-	 * Add new profiling data structure to list and inform event
-	 * listener.
-	 */
-	gcov_info_link(info);
-	if (gcov_events_enabled)
-		gcov_event(GCOV_ADD, info);
-	mutex_unlock(&gcov_lock);
-}
-EXPORT_SYMBOL(__gcov_init);
-
-/*
- * These functions may be referenced by gcc-generated profiling code but serve
- * no function for kernel profiling.
- */
-void __gcov_flush(void)
-{
-	/* Unused. */
-}
-EXPORT_SYMBOL(__gcov_flush);
-
-void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
-{
-	/* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_add);
-
-void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
-{
-	/* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_single);
-
-void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
-{
-	/* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_delta);
-
-void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
-{
-	/* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_ior);
-
-void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
-{
-	/* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_time_profile);
-
-void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
-{
-	/* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_icall_topn);
-
-void __gcov_exit(void)
-{
-	/* Unused. */
-}
-EXPORT_SYMBOL(__gcov_exit);
+int gcov_events_enabled;
+DEFINE_MUTEX(gcov_lock);
 
 /**
  * gcov_enable_events - enable event reporting through gcov_event()
@@ -144,7 +64,7 @@
 
 	/* Remove entries located in module from linked list. */
 	while ((info = gcov_info_next(info))) {
-		if (within_module((unsigned long)info, mod)) {
+		if (gcov_info_within_module(info, mod)) {
 			gcov_info_unlink(prev, info);
 			if (gcov_events_enabled)
 				gcov_event(GCOV_REMOVE, info);

diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c
new file mode 100644
index 0000000..c94b820
--- /dev/null
+++ b/kernel/gcov/clang.c

@@ -0,0 +1,581 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Google, Inc.
+ * modified from kernel/gcov/gcc_4_7.c
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ *
+ * LLVM uses profiling data that's deliberately similar to GCC, but has a
+ * very different way of exporting that data.  LLVM calls llvm_gcov_init() once
+ * per module, and provides a couple of callbacks that we can use to ask for
+ * more data.
+ *
+ * We care about the "writeout" callback, which in turn calls back into
+ * compiler-rt/this module to dump all the gathered coverage data to disk:
+ *
+ *    llvm_gcda_start_file()
+ *      llvm_gcda_emit_function()
+ *      llvm_gcda_emit_arcs()
+ *      llvm_gcda_emit_function()
+ *      llvm_gcda_emit_arcs()
+ *      [... repeats for each function ...]
+ *    llvm_gcda_summary_info()
+ *    llvm_gcda_end_file()
+ *
+ * This design is much more stateless and unstructured than gcc's, and is
+ * intended to run at process exit.  This forces us to keep some local state
+ * about which module we're dealing with at the moment.  On the other hand, it
+ * also means we don't depend as much on how LLVM represents profiling data
+ * internally.
+ *
+ * See LLVM's lib/Transforms/Instrumentation/GCOVProfiling.cpp for more
+ * details on how this works, particularly GCOVProfiler::emitProfileArcs(),
+ * GCOVProfiler::insertCounterWriteout(), and
+ * GCOVProfiler::insertFlush().
+ */
+
+#define pr_fmt(fmt)	"gcov: " fmt
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/printk.h>
+#include <linux/ratelimit.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include "gcov.h"
+
+typedef void (*llvm_gcov_callback)(void);
+
+struct gcov_info {
+	struct list_head head;
+
+	const char *filename;
+	unsigned int version;
+	u32 checksum;
+
+	struct list_head functions;
+};
+
+struct gcov_fn_info {
+	struct list_head head;
+
+	u32 ident;
+	u32 checksum;
+	u8 use_extra_checksum;
+	u32 cfg_checksum;
+
+	u32 num_counters;
+	u64 *counters;
+	const char *function_name;
+};
+
+static struct gcov_info *current_info;
+
+static LIST_HEAD(clang_gcov_list);
+
+void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush)
+{
+	struct gcov_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+	if (!info)
+		return;
+
+	INIT_LIST_HEAD(&info->head);
+	INIT_LIST_HEAD(&info->functions);
+
+	mutex_lock(&gcov_lock);
+
+	list_add_tail(&info->head, &clang_gcov_list);
+	current_info = info;
+	writeout();
+	current_info = NULL;
+	if (gcov_events_enabled)
+		gcov_event(GCOV_ADD, info);
+
+	mutex_unlock(&gcov_lock);
+}
+EXPORT_SYMBOL(llvm_gcov_init);
+
+void llvm_gcda_start_file(const char *orig_filename, const char version[4],
+		u32 checksum)
+{
+	current_info->filename = orig_filename;
+	memcpy(&current_info->version, version, sizeof(current_info->version));
+	current_info->checksum = checksum;
+}
+EXPORT_SYMBOL(llvm_gcda_start_file);
+
+void llvm_gcda_emit_function(u32 ident, const char *function_name,
+		u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum)
+{
+	struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+	if (!info)
+		return;
+
+	INIT_LIST_HEAD(&info->head);
+	info->ident = ident;
+	info->checksum = func_checksum;
+	info->use_extra_checksum = use_extra_checksum;
+	info->cfg_checksum = cfg_checksum;
+	if (function_name)
+		info->function_name = kstrdup(function_name, GFP_KERNEL);
+
+	list_add_tail(&info->head, &current_info->functions);
+}
+EXPORT_SYMBOL(llvm_gcda_emit_function);
+
+void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters)
+{
+	struct gcov_fn_info *info = list_last_entry(&current_info->functions,
+			struct gcov_fn_info, head);
+
+	info->num_counters = num_counters;
+	info->counters = counters;
+}
+EXPORT_SYMBOL(llvm_gcda_emit_arcs);
+
+void llvm_gcda_summary_info(void)
+{
+}
+EXPORT_SYMBOL(llvm_gcda_summary_info);
+
+void llvm_gcda_end_file(void)
+{
+}
+EXPORT_SYMBOL(llvm_gcda_end_file);
+
+/**
+ * gcov_info_filename - return info filename
+ * @info: profiling data set
+ */
+const char *gcov_info_filename(struct gcov_info *info)
+{
+	return info->filename;
+}
+
+/**
+ * gcov_info_version - return info version
+ * @info: profiling data set
+ */
+unsigned int gcov_info_version(struct gcov_info *info)
+{
+	return info->version;
+}
+
+/**
+ * gcov_info_next - return next profiling data set
+ * @info: profiling data set
+ *
+ * Returns next gcov_info following @info or first gcov_info in the chain if
+ * @info is %NULL.
+ */
+struct gcov_info *gcov_info_next(struct gcov_info *info)
+{
+	if (!info)
+		return list_first_entry_or_null(&clang_gcov_list,
+				struct gcov_info, head);
+	if (list_is_last(&info->head, &clang_gcov_list))
+		return NULL;
+	return list_next_entry(info, head);
+}
+
+/**
+ * gcov_info_link - link/add profiling data set to the list
+ * @info: profiling data set
+ */
+void gcov_info_link(struct gcov_info *info)
+{
+	list_add_tail(&info->head, &clang_gcov_list);
+}
+
+/**
+ * gcov_info_unlink - unlink/remove profiling data set from the list
+ * @prev: previous profiling data set
+ * @info: profiling data set
+ */
+void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
+{
+	/* Generic code unlinks while iterating. */
+	__list_del_entry(&info->head);
+}
+
+/**
+ * gcov_info_within_module - check if a profiling data set belongs to a module
+ * @info: profiling data set
+ * @mod: module
+ *
+ * Returns true if profiling data belongs module, false otherwise.
+ */
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
+{
+	return within_module((unsigned long)info->filename, mod);
+}
+
+/* Symbolic links to be created for each profiling data file. */
+const struct gcov_link gcov_link[] = {
+	{ OBJ_TREE, "gcno" },	/* Link to .gcno file in $(objtree). */
+	{ 0, NULL},
+};
+
+/**
+ * gcov_info_reset - reset profiling data to zero
+ * @info: profiling data set
+ */
+void gcov_info_reset(struct gcov_info *info)
+{
+	struct gcov_fn_info *fn;
+
+	list_for_each_entry(fn, &info->functions, head)
+		memset(fn->counters, 0,
+				sizeof(fn->counters[0]) * fn->num_counters);
+}
+
+/**
+ * gcov_info_is_compatible - check if profiling data can be added
+ * @info1: first profiling data set
+ * @info2: second profiling data set
+ *
+ * Returns non-zero if profiling data can be added, zero otherwise.
+ */
+int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
+{
+	struct gcov_fn_info *fn_ptr1 = list_first_entry_or_null(
+			&info1->functions, struct gcov_fn_info, head);
+	struct gcov_fn_info *fn_ptr2 = list_first_entry_or_null(
+			&info2->functions, struct gcov_fn_info, head);
+
+	if (info1->checksum != info2->checksum)
+		return false;
+	if (!fn_ptr1)
+		return fn_ptr1 == fn_ptr2;
+	while (!list_is_last(&fn_ptr1->head, &info1->functions) &&
+		!list_is_last(&fn_ptr2->head, &info2->functions)) {
+		if (fn_ptr1->checksum != fn_ptr2->checksum)
+			return false;
+		if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum)
+			return false;
+		if (fn_ptr1->use_extra_checksum &&
+			fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
+			return false;
+		fn_ptr1 = list_next_entry(fn_ptr1, head);
+		fn_ptr2 = list_next_entry(fn_ptr2, head);
+	}
+	return list_is_last(&fn_ptr1->head, &info1->functions) &&
+		list_is_last(&fn_ptr2->head, &info2->functions);
+}
+
+/**
+ * gcov_info_add - add up profiling data
+ * @dest: profiling data set to which data is added
+ * @source: profiling data set which is added
+ *
+ * Adds profiling counts of @source to @dest.
+ */
+void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
+{
+	struct gcov_fn_info *dfn_ptr;
+	struct gcov_fn_info *sfn_ptr = list_first_entry_or_null(&src->functions,
+			struct gcov_fn_info, head);
+
+	list_for_each_entry(dfn_ptr, &dst->functions, head) {
+		u32 i;
+
+		for (i = 0; i < sfn_ptr->num_counters; i++)
+			dfn_ptr->counters[i] += sfn_ptr->counters[i];
+	}
+}
+
+static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
+{
+	size_t cv_size; /* counter values size */
+	struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn),
+			GFP_KERNEL);
+	if (!fn_dup)
+		return NULL;
+	INIT_LIST_HEAD(&fn_dup->head);
+
+	fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL);
+	if (!fn_dup->function_name)
+		goto err_name;
+
+	cv_size = fn->num_counters * sizeof(fn->counters[0]);
+	fn_dup->counters = vmalloc(cv_size);
+	if (!fn_dup->counters)
+		goto err_counters;
+	memcpy(fn_dup->counters, fn->counters, cv_size);
+
+	return fn_dup;
+
+err_counters:
+	kfree(fn_dup->function_name);
+err_name:
+	kfree(fn_dup);
+	return NULL;
+}
+
+/**
+ * gcov_info_dup - duplicate profiling data set
+ * @info: profiling data set to duplicate
+ *
+ * Return newly allocated duplicate on success, %NULL on error.
+ */
+struct gcov_info *gcov_info_dup(struct gcov_info *info)
+{
+	struct gcov_info *dup;
+	struct gcov_fn_info *fn;
+
+	dup = kmemdup(info, sizeof(*dup), GFP_KERNEL);
+	if (!dup)
+		return NULL;
+	INIT_LIST_HEAD(&dup->head);
+	INIT_LIST_HEAD(&dup->functions);
+	dup->filename = kstrdup(info->filename, GFP_KERNEL);
+	if (!dup->filename)
+		goto err;
+
+	list_for_each_entry(fn, &info->functions, head) {
+		struct gcov_fn_info *fn_dup = gcov_fn_info_dup(fn);
+
+		if (!fn_dup)
+			goto err;
+		list_add_tail(&fn_dup->head, &dup->functions);
+	}
+
+	return dup;
+
+err:
+	gcov_info_free(dup);
+	return NULL;
+}
+
+/**
+ * gcov_info_free - release memory for profiling data set duplicate
+ * @info: profiling data set duplicate to free
+ */
+void gcov_info_free(struct gcov_info *info)
+{
+	struct gcov_fn_info *fn, *tmp;
+
+	list_for_each_entry_safe(fn, tmp, &info->functions, head) {
+		kfree(fn->function_name);
+		vfree(fn->counters);
+		list_del(&fn->head);
+		kfree(fn);
+	}
+	kfree(info->filename);
+	kfree(info);
+}
+
+#define ITER_STRIDE	PAGE_SIZE
+
+/**
+ * struct gcov_iterator - specifies current file position in logical records
+ * @info: associated profiling data
+ * @buffer: buffer containing file data
+ * @size: size of buffer
+ * @pos: current position in file
+ */
+struct gcov_iterator {
+	struct gcov_info *info;
+	void *buffer;
+	size_t size;
+	loff_t pos;
+};
+
+/**
+ * store_gcov_u32 - store 32 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
+ * store anything.
+ */
+static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
+{
+	u32 *data;
+
+	if (buffer) {
+		data = buffer + off;
+		*data = v;
+	}
+
+	return sizeof(*data);
+}
+
+/**
+ * store_gcov_u64 - store 64 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. 64 bit numbers are stored as two 32 bit numbers, the low part
+ * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
+ * anything.
+ */
+static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
+{
+	u32 *data;
+
+	if (buffer) {
+		data = buffer + off;
+
+		data[0] = (v & 0xffffffffUL);
+		data[1] = (v >> 32);
+	}
+
+	return sizeof(*data) * 2;
+}
+
+/**
+ * convert_to_gcda - convert profiling data set to gcda file format
+ * @buffer: the buffer to store file data or %NULL if no data should be stored
+ * @info: profiling data set to be converted
+ *
+ * Returns the number of bytes that were/would have been stored into the buffer.
+ */
+static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
+{
+	struct gcov_fn_info *fi_ptr;
+	size_t pos = 0;
+
+	/* File header. */
+	pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC);
+	pos += store_gcov_u32(buffer, pos, info->version);
+	pos += store_gcov_u32(buffer, pos, info->checksum);
+
+	list_for_each_entry(fi_ptr, &info->functions, head) {
+		u32 i;
+		u32 len = 2;
+
+		if (fi_ptr->use_extra_checksum)
+			len++;
+
+		pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
+		pos += store_gcov_u32(buffer, pos, len);
+		pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
+		pos += store_gcov_u32(buffer, pos, fi_ptr->checksum);
+		if (fi_ptr->use_extra_checksum)
+			pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
+
+		pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE);
+		pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2);
+		for (i = 0; i < fi_ptr->num_counters; i++)
+			pos += store_gcov_u64(buffer, pos, fi_ptr->counters[i]);
+	}
+
+	return pos;
+}
+
+/**
+ * gcov_iter_new - allocate and initialize profiling data iterator
+ * @info: profiling data set to be iterated
+ *
+ * Return file iterator on success, %NULL otherwise.
+ */
+struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
+{
+	struct gcov_iterator *iter;
+
+	iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
+	if (!iter)
+		goto err_free;
+
+	iter->info = info;
+	/* Dry-run to get the actual buffer size. */
+	iter->size = convert_to_gcda(NULL, info);
+	iter->buffer = vmalloc(iter->size);
+	if (!iter->buffer)
+		goto err_free;
+
+	convert_to_gcda(iter->buffer, info);
+
+	return iter;
+
+err_free:
+	kfree(iter);
+	return NULL;
+}
+
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+void gcov_iter_free(struct gcov_iterator *iter)
+{
+	vfree(iter->buffer);
+	kfree(iter);
+}
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
+{
+	return iter->info;
+}
+
+/**
+ * gcov_iter_start - reset file iterator to starting position
+ * @iter: file iterator
+ */
+void gcov_iter_start(struct gcov_iterator *iter)
+{
+	iter->pos = 0;
+}
+
+/**
+ * gcov_iter_next - advance file iterator to next logical record
+ * @iter: file iterator
+ *
+ * Return zero if new position is valid, non-zero if iterator has reached end.
+ */
+int gcov_iter_next(struct gcov_iterator *iter)
+{
+	if (iter->pos < iter->size)
+		iter->pos += ITER_STRIDE;
+
+	if (iter->pos >= iter->size)
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * gcov_iter_write - write data for current pos to seq_file
+ * @iter: file iterator
+ * @seq: seq_file handle
+ *
+ * Return zero on success, non-zero otherwise.
+ */
+int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
+{
+	size_t len;
+
+	if (iter->pos >= iter->size)
+		return -EINVAL;
+
+	len = ITER_STRIDE;
+	if (iter->pos + len > iter->size)
+		len = iter->size - iter->pos;
+
+	seq_write(seq, iter->buffer + iter->pos, len);
+
+	return 0;
+}

diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index 1e32e66..64d2dd9 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c

@@ -137,6 +137,18 @@
 		gcov_info_head = info->next;
 }
 
+/**
+ * gcov_info_within_module - check if a profiling data set belongs to a module
+ * @info: profiling data set
+ * @mod: module
+ *
+ * Returns true if profiling data belongs module, false otherwise.
+ */
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
+{
+	return within_module((unsigned long)info, mod);
+}
+
 /* Symbolic links to be created for each profiling data file. */
 const struct gcov_link gcov_link[] = {
 	{ OBJ_TREE, "gcno" },	/* Link to .gcno file in $(objtree). */

diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index ca5e5c0..ec37563 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c

@@ -150,6 +150,18 @@
 		gcov_info_head = info->next;
 }
 
+/**
+ * gcov_info_within_module - check if a profiling data set belongs to a module
+ * @info: profiling data set
+ * @mod: module
+ *
+ * Returns true if profiling data belongs module, false otherwise.
+ */
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
+{
+	return within_module((unsigned long)info, mod);
+}
+
 /* Symbolic links to be created for each profiling data file. */
 const struct gcov_link gcov_link[] = {
 	{ OBJ_TREE, "gcno" },	/* Link to .gcno file in $(objtree). */

diff --git a/kernel/gcov/gcc_base.c b/kernel/gcov/gcc_base.c
new file mode 100644
index 0000000..3cf736b
--- /dev/null
+++ b/kernel/gcov/gcc_base.c

@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include "gcov.h"
+
+/*
+ * __gcov_init is called by gcc-generated constructor code for each object
+ * file compiled with -fprofile-arcs.
+ */
+void __gcov_init(struct gcov_info *info)
+{
+	static unsigned int gcov_version;
+
+	mutex_lock(&gcov_lock);
+	if (gcov_version == 0) {
+		gcov_version = gcov_info_version(info);
+		/*
+		 * Printing gcc's version magic may prove useful for debugging
+		 * incompatibility reports.
+		 */
+		pr_info("version magic: 0x%x\n", gcov_version);
+	}
+	/*
+	 * Add new profiling data structure to list and inform event
+	 * listener.
+	 */
+	gcov_info_link(info);
+	if (gcov_events_enabled)
+		gcov_event(GCOV_ADD, info);
+	mutex_unlock(&gcov_lock);
+}
+EXPORT_SYMBOL(__gcov_init);
+
+/*
+ * These functions may be referenced by gcc-generated profiling code but serve
+ * no function for kernel profiling.
+ */
+void __gcov_flush(void)
+{
+	/* Unused. */
+}
+EXPORT_SYMBOL(__gcov_flush);
+
+void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
+{
+	/* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_add);
+
+void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
+{
+	/* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_single);
+
+void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
+{
+	/* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_delta);
+
+void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
+{
+	/* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_ior);
+
+void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
+{
+	/* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_time_profile);
+
+void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
+{
+	/* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_icall_topn);
+
+void __gcov_exit(void)
+{
+	/* Unused. */
+}
+EXPORT_SYMBOL(__gcov_exit);

diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index de118ad..6ab2c18 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h

@@ -15,6 +15,7 @@
 #ifndef GCOV_H
 #define GCOV_H GCOV_H
 
+#include <linux/module.h>
 #include <linux/types.h>
 
 /*
@@ -46,6 +47,7 @@
 struct gcov_info *gcov_info_next(struct gcov_info *info);
 void gcov_info_link(struct gcov_info *info);
 void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info);
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod);
 
 /* Base interface. */
 enum gcov_action {
@@ -83,4 +85,7 @@
 };
 extern const struct gcov_link gcov_link[];
 
+extern int gcov_events_enabled;
+extern struct mutex gcov_lock;
+
 #endif /* GCOV_H */

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 4a91916..3486f57 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c

@@ -200,6 +200,8 @@
 	if (hung_task_show_lock)
 		debug_show_all_locks();
 	if (hung_task_call_panic) {
+		/* Dump all tasks. */
+		show_state_filter(TASK_UNINTERRUPTIBLE);
 		trigger_all_cpu_backtrace();
 		panic("hung_task: blocked tasks");
 	}

diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index ea57f3d..3f46185 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h

@@ -126,8 +126,6 @@
 
 extern bool irq_can_set_affinity_usr(unsigned int irq);
 
-extern int irq_select_affinity_usr(unsigned int irq);
-
 extern void irq_set_thread_affinity(struct irq_desc *desc);
 
 extern int irq_do_set_affinity(struct irq_data *data,

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 23bcfa7..eb69b80 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c

@@ -441,23 +441,9 @@
 {
 	return irq_select_affinity(irq_desc_get_irq(desc));
 }
-#endif
+#endif /* CONFIG_AUTO_IRQ_AFFINITY */
+#endif /* CONFIG_SMP */
 
-/*
- * Called when a bogus affinity is set via /proc/irq
- */
-int irq_select_affinity_usr(unsigned int irq)
-{
-	struct irq_desc *desc = irq_to_desc(irq);
-	unsigned long flags;
-	int ret;
-
-	raw_spin_lock_irqsave(&desc->lock, flags);
-	ret = irq_setup_affinity(desc);
-	raw_spin_unlock_irqrestore(&desc->lock, flags);
-	return ret;
-}
-#endif
 
 /**
  *	irq_set_vcpu_affinity - Set vcpu affinity for the interrupt

diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index da9addb..e8c655b 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c

@@ -115,6 +115,28 @@
 	return show_irq_affinity(AFFINITY_LIST, m);
 }
 
+#ifndef CONFIG_AUTO_IRQ_AFFINITY
+static inline int irq_select_affinity_usr(unsigned int irq)
+{
+	/*
+	 * If the interrupt is started up already then this fails. The
+	 * interrupt is assigned to an online CPU already. There is no
+	 * point to move it around randomly. Tell user space that the
+	 * selected mask is bogus.
+	 *
+	 * If not then any change to the affinity is pointless because the
+	 * startup code invokes irq_setup_affinity() which will select
+	 * a online CPU anyway.
+	 */
+	return -EINVAL;
+}
+#else
+/* ALPHA magic affinity auto selector. Keep it for historical reasons. */
+static inline int irq_select_affinity_usr(unsigned int irq)
+{
+	return irq_select_affinity(irq);
+}
+#endif
 
 static ssize_t write_irq_affinity(int type, struct file *file,
 		const char __user *buffer, size_t count, loff_t *pos)

diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 7c82626..2e62503 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c

@@ -18,6 +18,8 @@
 #include <linux/cpu.h>
 #include <asm/sections.h>
 
+#ifdef HAVE_JUMP_LABEL
+
 /* mutex to protect coming/going of the the jump_label table */
 static DEFINE_MUTEX(jump_label_mutex);
 
@@ -58,13 +60,13 @@
 static void jump_label_update(struct static_key *key);
 
 /*
- * There are similar definitions for the !CONFIG_JUMP_LABEL case in jump_label.h.
+ * There are similar definitions for the !HAVE_JUMP_LABEL case in jump_label.h.
  * The use of 'atomic_read()' requires atomic.h and its problematic for some
  * kernel headers such as kernel.h and others. Since static_key_count() is not
- * used in the branch statements as it is for the !CONFIG_JUMP_LABEL case its ok
+ * used in the branch statements as it is for the !HAVE_JUMP_LABEL case its ok
  * to have it be a function here. Similarly, for 'static_key_enable()' and
  * 'static_key_disable()', which require bug.h. This should allow jump_label.h
- * to be included from most/all places for CONFIG_JUMP_LABEL.
+ * to be included from most/all places for HAVE_JUMP_LABEL.
  */
 int static_key_count(struct static_key *key)
 {
@@ -794,3 +796,5 @@
 }
 early_initcall(jump_label_test);
 #endif /* STATIC_KEYS_SELFTEST */
+
+#endif /* HAVE_JUMP_LABEL */

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index f4e4095e..00050a2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c

@@ -523,6 +523,8 @@
 	arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
 	/* Loop free_list for disarming */
 	list_for_each_entry_safe(op, tmp, &freeing_list, list) {
+		/* Switching from detour code to origin */
+		op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
 		/* Disarm probes if marked disabled */
 		if (kprobe_disabled(&op->kp))
 			arch_disarm_kprobe(&op->kp);
@@ -662,6 +664,7 @@
 {
 	lockdep_assert_cpus_held();
 	arch_unoptimize_kprobe(op);
+	op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
 	if (kprobe_disabled(&op->kp))
 		arch_disarm_kprobe(&op->kp);
 }
@@ -689,7 +692,6 @@
 		return;
 	}
 
-	op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
 	if (!list_empty(&op->list)) {
 		/* Dequeue from the optimization queue */
 		list_del_init(&op->list);

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 087d18d..65234c8 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c

@@ -101,6 +101,12 @@
 }
 EXPORT_SYMBOL(kthread_should_stop);
 
+bool __kthread_should_park(struct task_struct *k)
+{
+	return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags);
+}
+EXPORT_SYMBOL_GPL(__kthread_should_park);
+
 /**
  * kthread_should_park - should this kthread park now?
  *
@@ -114,7 +120,7 @@
  */
 bool kthread_should_park(void)
 {
-	return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
+	return __kthread_should_park(current);
 }
 EXPORT_SYMBOL_GPL(kthread_should_park);
 

diff --git a/kernel/module.c b/kernel/module.c
index 70a75a7..7746d59 100644
--- a/kernel/module.c
+++ b/kernel/module.c

@@ -2980,9 +2980,7 @@
 
 	/* Try to find a name early so we can log errors with a module name */
 	info->index.info = find_sec(info, ".modinfo");
-	if (!info->index.info)
-		info->name = "(missing .modinfo section)";
-	else
+	if (info->index.info)
 		info->name = get_modinfo(info, "name");
 
 	/* Find internal symbols and strings. */
@@ -2997,14 +2995,15 @@
 	}
 
 	if (info->index.sym == 0) {
-		pr_warn("%s: module has no symbols (stripped?)\n", info->name);
+		pr_warn("%s: module has no symbols (stripped?)\n",
+			info->name ?: "(missing .modinfo section or name field)");
 		return -ENOEXEC;
 	}
 
 	info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
 	if (!info->index.mod) {
 		pr_warn("%s: No module found in object\n",
-			info->name ?: "(missing .modinfo name field)");
+			info->name ?: "(missing .modinfo section or name field)");
 		return -ENOEXEC;
 	}
 	/* This is temporary: point mod into copy of data. */
@@ -3120,7 +3119,7 @@
 					     sizeof(*mod->tracepoints_ptrs),
 					     &mod->num_tracepoints);
 #endif
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 	mod->jump_entries = section_objs(info, "__jump_table",
 					sizeof(*mod->jump_entries),
 					&mod->num_jump_entries);

diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index f2075ce..6b9a926 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c

@@ -83,6 +83,7 @@
 	}
 
 	return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
-				      NULL, VERIFYING_MODULE_SIGNATURE,
+				      VERIFY_USE_SECONDARY_KEYRING,
+				      VERIFYING_MODULE_SIGNATURE,
 				      NULL, NULL);
 }

diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 3a6c2f8..f8fe57d 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig

@@ -298,3 +298,18 @@
 
 config CPU_PM
 	bool
+
+config ENERGY_MODEL
+	bool "Energy Model for CPUs"
+	depends on SMP
+	depends on CPU_FREQ
+	default n
+	help
+	  Several subsystems (thermal and/or the task scheduler for example)
+	  can leverage information about the energy consumed by CPUs to make
+	  smarter decisions. This config option enables the framework from
+	  which subsystems can access the energy models.
+
+	  The exact usage of the energy model is subsystem-dependent.
+
+	  If in doubt, say N.

diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index a3f79f0e..3f8db83 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile

@@ -15,3 +15,6 @@
 obj-$(CONFIG_PM_WAKELOCKS)	+= wakelock.o
 
 obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
+
+obj-$(CONFIG_SUSPEND)	+= wakeup_reason.o
+obj-$(CONFIG_ENERGY_MODEL)	+= energy_model.o

diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
new file mode 100644
index 0000000..7d66ee6
--- /dev/null
+++ b/kernel/power/energy_model.c

@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Energy Model of CPUs
+ *
+ * Copyright (c) 2018, Arm ltd.
+ * Written by: Quentin Perret, Arm ltd.
+ */
+
+#define pr_fmt(fmt) "energy_model: " fmt
+
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/debugfs.h>
+#include <linux/energy_model.h>
+#include <linux/sched/topology.h>
+#include <linux/slab.h>
+
+/* Mapping of each CPU to the performance domain to which it belongs. */
+static DEFINE_PER_CPU(struct em_perf_domain *, em_data);
+
+/*
+ * Mutex serializing the registrations of performance domains and letting
+ * callbacks defined by drivers sleep.
+ */
+static DEFINE_MUTEX(em_pd_mutex);
+
+#ifdef CONFIG_DEBUG_FS
+static struct dentry *rootdir;
+
+static void em_debug_create_cs(struct em_cap_state *cs, struct dentry *pd)
+{
+	struct dentry *d;
+	char name[24];
+
+	snprintf(name, sizeof(name), "cs:%lu", cs->frequency);
+
+	/* Create per-cs directory */
+	d = debugfs_create_dir(name, pd);
+	debugfs_create_ulong("frequency", 0444, d, &cs->frequency);
+	debugfs_create_ulong("power", 0444, d, &cs->power);
+	debugfs_create_ulong("cost", 0444, d, &cs->cost);
+}
+
+static int em_debug_cpus_show(struct seq_file *s, void *unused)
+{
+	seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private)));
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
+
+static void em_debug_create_pd(struct em_perf_domain *pd, int cpu)
+{
+	struct dentry *d;
+	char name[8];
+	int i;
+
+	snprintf(name, sizeof(name), "pd%d", cpu);
+
+	/* Create the directory of the performance domain */
+	d = debugfs_create_dir(name, rootdir);
+
+	debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops);
+
+	/* Create a sub-directory for each capacity state */
+	for (i = 0; i < pd->nr_cap_states; i++)
+		em_debug_create_cs(&pd->table[i], d);
+}
+
+static int __init em_debug_init(void)
+{
+	/* Create /sys/kernel/debug/energy_model directory */
+	rootdir = debugfs_create_dir("energy_model", NULL);
+
+	return 0;
+}
+core_initcall(em_debug_init);
+#else /* CONFIG_DEBUG_FS */
+static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {}
+#endif
+static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
+						struct em_data_callback *cb)
+{
+	unsigned long opp_eff, prev_opp_eff = ULONG_MAX;
+	unsigned long power, freq, prev_freq = 0;
+	int i, ret, cpu = cpumask_first(span);
+	struct em_cap_state *table;
+	struct em_perf_domain *pd;
+	u64 fmax;
+
+	if (!cb->active_power)
+		return NULL;
+
+	pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
+	if (!pd)
+		return NULL;
+
+	table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL);
+	if (!table)
+		goto free_pd;
+
+	/* Build the list of capacity states for this performance domain */
+	for (i = 0, freq = 0; i < nr_states; i++, freq++) {
+		/*
+		 * active_power() is a driver callback which ceils 'freq' to
+		 * lowest capacity state of 'cpu' above 'freq' and updates
+		 * 'power' and 'freq' accordingly.
+		 */
+		ret = cb->active_power(&power, &freq, cpu);
+		if (ret) {
+			pr_err("pd%d: invalid cap. state: %d\n", cpu, ret);
+			goto free_cs_table;
+		}
+
+		/*
+		 * We expect the driver callback to increase the frequency for
+		 * higher capacity states.
+		 */
+		if (freq <= prev_freq) {
+			pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq);
+			goto free_cs_table;
+		}
+
+		/*
+		 * The power returned by active_state() is expected to be
+		 * positive, in milli-watts and to fit into 16 bits.
+		 */
+		if (!power || power > EM_CPU_MAX_POWER) {
+			pr_err("pd%d: invalid power: %lu\n", cpu, power);
+			goto free_cs_table;
+		}
+
+		table[i].power = power;
+		table[i].frequency = prev_freq = freq;
+
+		/*
+		 * The hertz/watts efficiency ratio should decrease as the
+		 * frequency grows on sane platforms. But this isn't always
+		 * true in practice so warn the user if a higher OPP is more
+		 * power efficient than a lower one.
+		 */
+		opp_eff = freq / power;
+		if (opp_eff >= prev_opp_eff)
+			pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_cap_state %d >= em_cap_state%d\n",
+					cpu, i, i - 1);
+		prev_opp_eff = opp_eff;
+	}
+
+	/* Compute the cost of each capacity_state. */
+	fmax = (u64) table[nr_states - 1].frequency;
+	for (i = 0; i < nr_states; i++) {
+		table[i].cost = div64_u64(fmax * table[i].power,
+					  table[i].frequency);
+	}
+
+	pd->table = table;
+	pd->nr_cap_states = nr_states;
+	cpumask_copy(to_cpumask(pd->cpus), span);
+
+	em_debug_create_pd(pd, cpu);
+
+	return pd;
+
+free_cs_table:
+	kfree(table);
+free_pd:
+	kfree(pd);
+
+	return NULL;
+}
+
+/**
+ * em_cpu_get() - Return the performance domain for a CPU
+ * @cpu : CPU to find the performance domain for
+ *
+ * Return: the performance domain to which 'cpu' belongs, or NULL if it doesn't
+ * exist.
+ */
+struct em_perf_domain *em_cpu_get(int cpu)
+{
+	return READ_ONCE(per_cpu(em_data, cpu));
+}
+EXPORT_SYMBOL_GPL(em_cpu_get);
+
+/**
+ * em_register_perf_domain() - Register the Energy Model of a performance domain
+ * @span	: Mask of CPUs in the performance domain
+ * @nr_states	: Number of capacity states to register
+ * @cb		: Callback functions providing the data of the Energy Model
+ *
+ * Create Energy Model tables for a performance domain using the callbacks
+ * defined in cb.
+ *
+ * If multiple clients register the same performance domain, all but the first
+ * registration will be ignored.
+ *
+ * Return 0 on success
+ */
+int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
+						struct em_data_callback *cb)
+{
+	unsigned long cap, prev_cap = 0;
+	struct em_perf_domain *pd;
+	int cpu, ret = 0;
+
+	if (!span || !nr_states || !cb)
+		return -EINVAL;
+
+	/*
+	 * Use a mutex to serialize the registration of performance domains and
+	 * let the driver-defined callback functions sleep.
+	 */
+	mutex_lock(&em_pd_mutex);
+
+	for_each_cpu(cpu, span) {
+		/* Make sure we don't register again an existing domain. */
+		if (READ_ONCE(per_cpu(em_data, cpu))) {
+			ret = -EEXIST;
+			goto unlock;
+		}
+
+		/*
+		 * All CPUs of a domain must have the same micro-architecture
+		 * since they all share the same table.
+		 */
+		cap = arch_scale_cpu_capacity(NULL, cpu);
+		if (prev_cap && prev_cap != cap) {
+			pr_err("CPUs of %*pbl must have the same capacity\n",
+							cpumask_pr_args(span));
+			ret = -EINVAL;
+			goto unlock;
+		}
+		prev_cap = cap;
+	}
+
+	/* Create the performance domain and add it to the Energy Model. */
+	pd = em_create_pd(span, nr_states, cb);
+	if (!pd) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	for_each_cpu(cpu, span) {
+		/*
+		 * The per-cpu array can be read concurrently from em_cpu_get().
+		 * The barrier enforces the ordering needed to make sure readers
+		 * can only access well formed em_perf_domain structs.
+		 */
+		smp_store_release(per_cpu_ptr(&em_data, cpu), pd);
+	}
+
+	pr_debug("Created perf domain %*pbl\n", cpumask_pr_args(span));
+unlock:
+	mutex_unlock(&em_pd_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(em_register_perf_domain);

diff --git a/kernel/power/process.c b/kernel/power/process.c
index 7381d49..d76e616 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c

@@ -85,26 +85,27 @@
 	elapsed = ktime_sub(end, start);
 	elapsed_msecs = ktime_to_ms(elapsed);
 
-	if (todo) {
+	if (wakeup) {
 		pr_cont("\n");
-		pr_err("Freezing of tasks %s after %d.%03d seconds "
-		       "(%d tasks refusing to freeze, wq_busy=%d):\n",
-		       wakeup ? "aborted" : "failed",
+		pr_err("Freezing of tasks aborted after %d.%03d seconds",
+		       elapsed_msecs / 1000, elapsed_msecs % 1000);
+	} else if (todo) {
+		pr_cont("\n");
+		pr_err("Freezing of tasks failed after %d.%03d seconds"
+		       " (%d tasks refusing to freeze, wq_busy=%d):\n",
 		       elapsed_msecs / 1000, elapsed_msecs % 1000,
 		       todo - wq_busy, wq_busy);
 
 		if (wq_busy)
 			show_workqueue_state();
 
-		if (!wakeup) {
-			read_lock(&tasklist_lock);
-			for_each_process_thread(g, p) {
-				if (p != current && !freezer_should_skip(p)
-				    && freezing(p) && !frozen(p))
-					sched_show_task(p);
-			}
-			read_unlock(&tasklist_lock);
+		read_lock(&tasklist_lock);
+		for_each_process_thread(g, p) {
+			if (p != current && !freezer_should_skip(p)
+			    && freezing(p) && !frozen(p))
+				sched_show_task(p);
 		}
+		read_unlock(&tasklist_lock);
 	} else {
 		pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
 			elapsed_msecs % 1000);

diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c
new file mode 100644
index 0000000..904b65f
--- /dev/null
+++ b/kernel/power/wakeup_reason.c

@@ -0,0 +1,184 @@
+/*
+ * kernel/power/wakeup_reason.c
+ *
+ * Logs the reasons which caused the kernel to resume from
+ * the suspend mode.
+ *
+ * Copyright (C) 2014 Google, Inc.
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/wakeup_reason.h>
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/notifier.h>
+#include <linux/suspend.h>
+
+
+#define MAX_WAKEUP_REASON_IRQS 32
+static int irq_list[MAX_WAKEUP_REASON_IRQS];
+static int irqcount;
+static struct kobject *wakeup_reason;
+static spinlock_t resume_reason_lock;
+
+static ktime_t last_monotime; /* monotonic time before last suspend */
+static ktime_t curr_monotime; /* monotonic time after last suspend */
+static ktime_t last_stime; /* monotonic boottime offset before last suspend */
+static ktime_t curr_stime; /* monotonic boottime offset after last suspend */
+
+static ssize_t last_resume_reason_show(struct kobject *kobj, struct kobj_attribute *attr,
+		char *buf)
+{
+	int irq_no, buf_offset = 0;
+	struct irq_desc *desc;
+	spin_lock(&resume_reason_lock);
+	for (irq_no = 0; irq_no < irqcount; irq_no++) {
+		desc = irq_to_desc(irq_list[irq_no]);
+		if (desc && desc->action && desc->action->name)
+			buf_offset += sprintf(buf + buf_offset, "%d %s\n",
+					irq_list[irq_no], desc->action->name);
+		else
+			buf_offset += sprintf(buf + buf_offset, "%d\n",
+					irq_list[irq_no]);
+	}
+	spin_unlock(&resume_reason_lock);
+	return buf_offset;
+}
+
+static ssize_t last_suspend_time_show(struct kobject *kobj,
+			struct kobj_attribute *attr, char *buf)
+{
+	struct timespec sleep_time;
+	struct timespec total_time;
+	struct timespec suspend_resume_time;
+
+	/*
+	 * total_time is calculated from monotonic bootoffsets because
+	 * unlike CLOCK_MONOTONIC it include the time spent in suspend state.
+	 */
+	total_time = ktime_to_timespec(ktime_sub(curr_stime, last_stime));
+
+	/*
+	 * suspend_resume_time is calculated as monotonic (CLOCK_MONOTONIC)
+	 * time interval before entering suspend and post suspend.
+	 */
+	suspend_resume_time = ktime_to_timespec(ktime_sub(curr_monotime, last_monotime));
+
+	/* sleep_time = total_time - suspend_resume_time */
+	sleep_time = timespec_sub(total_time, suspend_resume_time);
+
+	/* Export suspend_resume_time and sleep_time in pair here. */
+	return sprintf(buf, "%lu.%09lu %lu.%09lu\n",
+				suspend_resume_time.tv_sec, suspend_resume_time.tv_nsec,
+				sleep_time.tv_sec, sleep_time.tv_nsec);
+}
+
+static struct kobj_attribute resume_reason = __ATTR_RO(last_resume_reason);
+static struct kobj_attribute suspend_time = __ATTR_RO(last_suspend_time);
+
+static struct attribute *attrs[] = {
+	&resume_reason.attr,
+	&suspend_time.attr,
+	NULL,
+};
+static struct attribute_group attr_group = {
+	.attrs = attrs,
+};
+
+/*
+ * logs all the wake up reasons to the kernel
+ * stores the irqs to expose them to the userspace via sysfs
+ */
+void log_wakeup_reason(int irq)
+{
+	struct irq_desc *desc;
+	desc = irq_to_desc(irq);
+	if (desc && desc->action && desc->action->name)
+		printk(KERN_INFO "Resume caused by IRQ %d, %s\n", irq,
+				desc->action->name);
+	else
+		printk(KERN_INFO "Resume caused by IRQ %d\n", irq);
+
+	spin_lock(&resume_reason_lock);
+	if (irqcount == MAX_WAKEUP_REASON_IRQS) {
+		spin_unlock(&resume_reason_lock);
+		printk(KERN_WARNING "Resume caused by more than %d IRQs\n",
+				MAX_WAKEUP_REASON_IRQS);
+		return;
+	}
+
+	irq_list[irqcount++] = irq;
+	spin_unlock(&resume_reason_lock);
+}
+
+/* Detects a suspend and clears all the previous wake up reasons*/
+static int wakeup_reason_pm_event(struct notifier_block *notifier,
+		unsigned long pm_event, void *unused)
+{
+	switch (pm_event) {
+	case PM_SUSPEND_PREPARE:
+		spin_lock(&resume_reason_lock);
+		irqcount = 0;
+		spin_unlock(&resume_reason_lock);
+		/* monotonic time since boot */
+		last_monotime = ktime_get();
+		/* monotonic time since boot including the time spent in suspend */
+		last_stime = ktime_get_boottime();
+		break;
+	case PM_POST_SUSPEND:
+		/* monotonic time since boot */
+		curr_monotime = ktime_get();
+		/* monotonic time since boot including the time spent in suspend */
+		curr_stime = ktime_get_boottime();
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block wakeup_reason_pm_notifier_block = {
+	.notifier_call = wakeup_reason_pm_event,
+};
+
+/* Initializes the sysfs parameter
+ * registers the pm_event notifier
+ */
+int __init wakeup_reason_init(void)
+{
+	int retval;
+	spin_lock_init(&resume_reason_lock);
+	retval = register_pm_notifier(&wakeup_reason_pm_notifier_block);
+	if (retval)
+		printk(KERN_WARNING "[%s] failed to register PM notifier %d\n",
+				__func__, retval);
+
+	wakeup_reason = kobject_create_and_add("wakeup_reasons", kernel_kobj);
+	if (!wakeup_reason) {
+		printk(KERN_WARNING "[%s] failed to create a sysfs kobject\n",
+				__func__);
+		return 1;
+	}
+	retval = sysfs_create_group(wakeup_reason, &attr_group);
+	if (retval) {
+		kobject_put(wakeup_reason);
+		printk(KERN_WARNING "[%s] failed to create a sysfs group %d\n",
+				__func__, retval);
+	}
+	return 0;
+}
+
+late_initcall(wakeup_reason_init);

diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7fe1834..2389350 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile

@@ -24,6 +24,7 @@
 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
+obj-$(CONFIG_SCHED_TUNE) += tune.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
 obj-$(CONFIG_CPU_FREQ) += cpufreq.o
 obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o

diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 2d4ff53..2067080 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c

@@ -259,7 +259,6 @@
 }
 #endif /* CONFIG_PROC_FS */
 
-#ifdef CONFIG_SCHED_DEBUG
 int autogroup_path(struct task_group *tg, char *buf, int buflen)
 {
 	if (!task_group_is_autogroup(tg))
@@ -267,4 +266,3 @@
 
 	return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
 }
-#endif

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2befd2c..2ed0779 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c

@@ -24,7 +24,7 @@
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
+#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
 /*
  * Debugging: various feature bits
  *
@@ -181,6 +181,7 @@
 	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
 		update_irq_load_avg(rq, irq_delta + steal);
 #endif
+	update_rq_clock_pelt(rq, delta);
 }
 
 void update_rq_clock(struct rq *rq)
@@ -413,6 +414,8 @@
 	if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))
 		return;
 
+	head->count++;
+
 	get_task_struct(task);
 
 	/*
@@ -422,6 +425,10 @@
 	head->lastp = &node->next;
 }
 
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
+	       int sibling_count_hint);
+
 void wake_up_q(struct wake_q_head *head)
 {
 	struct wake_q_node *node = head->first;
@@ -436,10 +443,10 @@
 		task->wake_q.next = NULL;
 
 		/*
-		 * wake_up_process() executes a full barrier, which pairs with
+		 * try_to_wake_up() executes a full barrier, which pairs with
 		 * the queueing in wake_q_add() so as not to miss wakeups.
 		 */
-		wake_up_process(task);
+		try_to_wake_up(task, TASK_NORMAL, 0, head->count);
 		put_task_struct(task);
 	}
 }
@@ -702,6 +709,7 @@
 	if (idle_policy(p->policy)) {
 		load->weight = scale_load(WEIGHT_IDLEPRIO);
 		load->inv_weight = WMULT_IDLEPRIO;
+		p->se.runnable_weight = load->weight;
 		return;
 	}
 
@@ -714,6 +722,7 @@
 	} else {
 		load->weight = scale_load(sched_prio_to_weight[prio]);
 		load->inv_weight = sched_prio_to_wmult[prio];
+		p->se.runnable_weight = load->weight;
 	}
 }
 
@@ -1524,12 +1533,14 @@
  * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
  */
 static inline
-int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
+		   int sibling_count_hint)
 {
 	lockdep_assert_held(&p->pi_lock);
 
 	if (p->nr_cpus_allowed > 1)
-		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
+						     sibling_count_hint);
 	else
 		cpu = cpumask_any(&p->cpus_allowed);
 
@@ -1932,6 +1943,8 @@
  * @p: the thread to be awakened
  * @state: the mask of task states that can be woken
  * @wake_flags: wake modifier flags (WF_*)
+ * @sibling_count_hint: A hint at the number of threads that are being woken up
+ *                      in this event.
  *
  * If (@state & @p->state) @p->state = TASK_RUNNING.
  *
@@ -1947,7 +1960,8 @@
  *	   %false otherwise.
  */
 static int
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
+	       int sibling_count_hint)
 {
 	unsigned long flags;
 	int cpu, success = 0;
@@ -2034,7 +2048,8 @@
 		atomic_dec(&task_rq(p)->nr_iowait);
 	}
 
-	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
+	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
+			     sibling_count_hint);
 	if (task_cpu(p) != cpu) {
 		wake_flags |= WF_MIGRATED;
 		set_task_cpu(p, cpu);
@@ -2121,13 +2136,13 @@
  */
 int wake_up_process(struct task_struct *p)
 {
-	return try_to_wake_up(p, TASK_NORMAL, 0);
+	return try_to_wake_up(p, TASK_NORMAL, 0, 1);
 }
 EXPORT_SYMBOL(wake_up_process);
 
 int wake_up_state(struct task_struct *p, unsigned int state)
 {
-	return try_to_wake_up(p, state, 0);
+	return try_to_wake_up(p, state, 0, 1);
 }
 
 /*
@@ -2409,7 +2424,7 @@
 	 * as we're not fully set-up yet.
 	 */
 	p->recent_used_cpu = task_cpu(p);
-	__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+	__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
 #endif
 	rq = __task_rq_lock(p, &rf);
 	update_rq_clock(rq);
@@ -2948,7 +2963,7 @@
 	int dest_cpu;
 
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
-	dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
+	dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
 	if (dest_cpu == smp_processor_id())
 		goto unlock;
 
@@ -3311,11 +3326,7 @@
 		print_ip_sym(preempt_disable_ip);
 		pr_cont("\n");
 	}
-	if (panic_on_warn)
-		panic("scheduling while atomic\n");
-
-	dump_stack();
-	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+	BUG();
 }
 
 /*
@@ -3750,7 +3761,7 @@
 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
 			  void *key)
 {
-	return try_to_wake_up(curr->private, mode, wake_flags);
+	return try_to_wake_up(curr->private, mode, wake_flags, 1);
 }
 EXPORT_SYMBOL(default_wake_function);
 

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 1b7ec82..4a00ea9 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c

@@ -13,11 +13,13 @@
 
 #include "sched.h"
 
+#include <linux/sched/cpufreq.h>
 #include <trace/events/power.h>
 
 struct sugov_tunables {
 	struct gov_attr_set	attr_set;
-	unsigned int		rate_limit_us;
+	unsigned int		up_rate_limit_us;
+	unsigned int		down_rate_limit_us;
 };
 
 struct sugov_policy {
@@ -28,7 +30,9 @@
 
 	raw_spinlock_t		update_lock;	/* For shared policies */
 	u64			last_freq_update_time;
-	s64			freq_update_delay_ns;
+	s64			min_rate_limit_ns;
+	s64			up_rate_delay_ns;
+	s64			down_rate_delay_ns;
 	unsigned int		next_freq;
 	unsigned int		cached_raw_freq;
 
@@ -95,9 +99,32 @@
 		return true;
 	}
 
+	/* No need to recalculate next freq for min_rate_limit_us
+	 * at least. However we might still decide to further rate
+	 * limit once frequency change direction is decided, according
+	 * to the separate rate limits.
+	 */
+
+	delta_ns = time - sg_policy->last_freq_update_time;
+	return delta_ns >= sg_policy->min_rate_limit_ns;
+}
+
+static bool sugov_up_down_rate_limit(struct sugov_policy *sg_policy, u64 time,
+				     unsigned int next_freq)
+{
+	s64 delta_ns;
+
 	delta_ns = time - sg_policy->last_freq_update_time;
 
-	return delta_ns >= sg_policy->freq_update_delay_ns;
+	if (next_freq > sg_policy->next_freq &&
+	    delta_ns < sg_policy->up_rate_delay_ns)
+			return true;
+
+	if (next_freq < sg_policy->next_freq &&
+	    delta_ns < sg_policy->down_rate_delay_ns)
+			return true;
+
+	return false;
 }
 
 static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
@@ -106,6 +133,9 @@
 	if (sg_policy->next_freq == next_freq)
 		return false;
 
+	if (sugov_up_down_rate_limit(sg_policy, time, next_freq))
+		return false;
+
 	sg_policy->next_freq = next_freq;
 	sg_policy->last_freq_update_time = time;
 
@@ -174,7 +204,7 @@
 	unsigned int freq = arch_scale_freq_invariant() ?
 				policy->cpuinfo.max_freq : policy->cur;
 
-	freq = (freq + (freq >> 2)) * util / max;
+	freq = map_util_freq(util, freq, max);
 
 	if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
 		return sg_policy->next_freq;
@@ -196,6 +226,9 @@
  * Where the cfs,rt and dl util numbers are tracked with the same metric and
  * synchronized windows and are thus directly comparable.
  *
+ * The @util parameter passed to this function is assumed to be the aggregation
+ * of RT and CFS util numbers. The cases of DL and IRQ are managed here.
+ *
  * The cfs,rt,dl utilization are the running times measured with rq->clock_task
  * which excludes things like IRQ and steal-time. These latter are then accrued
  * in the irq utilization.
@@ -204,15 +237,14 @@
  * based on the task model parameters and gives the minimal utilization
  * required to meet deadlines.
  */
-static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
+unsigned long schedutil_freq_util(int cpu, unsigned long util,
+				  unsigned long max, enum schedutil_type type)
 {
-	struct rq *rq = cpu_rq(sg_cpu->cpu);
-	unsigned long util, irq, max;
+	unsigned long dl_util, irq;
+	struct rq *rq = cpu_rq(cpu);
 
-	sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
-	sg_cpu->bw_dl = cpu_bw_dl(rq);
-
-	if (rt_rq_is_runnable(&rq->rt))
+	if (sched_feat(SUGOV_RT_MAX_FREQ) && type == FREQUENCY_UTIL &&
+						rt_rq_is_runnable(&rq->rt))
 		return max;
 
 	/*
@@ -225,27 +257,33 @@
 		return max;
 
 	/*
-	 * Because the time spend on RT/DL tasks is visible as 'lost' time to
-	 * CFS tasks and we use the same metric to track the effective
-	 * utilization (PELT windows are synchronized) we can directly add them
-	 * to obtain the CPU's actual utilization.
+	 * The function is called with @util defined as the aggregation (the
+	 * sum) of RT and CFS signals, hence leaving the special case of DL
+	 * to be delt with. The exact way of doing things depend on the calling
+	 * context.
 	 */
-	util = cpu_util_cfs(rq);
-	util += cpu_util_rt(rq);
+	dl_util = cpu_util_dl(rq);
 
 	/*
-	 * We do not make cpu_util_dl() a permanent part of this sum because we
-	 * want to use cpu_bw_dl() later on, but we need to check if the
-	 * CFS+RT+DL sum is saturated (ie. no idle time) such that we select
-	 * f_max when there is no idle time.
+	 * For frequency selection we do not make cpu_util_dl() a permanent part
+	 * of this sum because we want to use cpu_bw_dl() later on, but we need
+	 * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
+	 * that we select f_max when there is no idle time.
 	 *
 	 * NOTE: numerical errors or stop class might cause us to not quite hit
 	 * saturation when we should -- something for later.
 	 */
-	if ((util + cpu_util_dl(rq)) >= max)
+	if (util + dl_util >= max)
 		return max;
 
 	/*
+	 * OTOH, for energy computation we need the estimated running time, so
+	 * include util_dl and ignore dl_bw.
+	 */
+	if (type == ENERGY_UTIL)
+		util += dl_util;
+
+	/*
 	 * There is still idle time; further improve the number by using the
 	 * irq metric. Because IRQ/steal time is hidden from the task clock we
 	 * need to scale the task numbers:
@@ -267,7 +305,22 @@
 	 * bw_dl as requested freq. However, cpufreq is not yet ready for such
 	 * an interface. So, we only do the latter for now.
 	 */
-	return min(max, util + sg_cpu->bw_dl);
+	if (type == FREQUENCY_UTIL)
+		util += cpu_bw_dl(rq);
+
+	return min(max, util);
+}
+
+static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
+{
+	struct rq *rq = cpu_rq(sg_cpu->cpu);
+	unsigned long util = boosted_cpu_util(sg_cpu->cpu, cpu_util_rt(rq));
+	unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
+
+	sg_cpu->max = max;
+	sg_cpu->bw_dl = cpu_bw_dl(rq);
+
+	return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL);
 }
 
 /**
@@ -559,15 +612,32 @@
 	return container_of(attr_set, struct sugov_tunables, attr_set);
 }
 
-static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
+static DEFINE_MUTEX(min_rate_lock);
+
+static void update_min_rate_limit_ns(struct sugov_policy *sg_policy)
+{
+	mutex_lock(&min_rate_lock);
+	sg_policy->min_rate_limit_ns = min(sg_policy->up_rate_delay_ns,
+					   sg_policy->down_rate_delay_ns);
+	mutex_unlock(&min_rate_lock);
+}
+
+static ssize_t up_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
 {
 	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
 
-	return sprintf(buf, "%u\n", tunables->rate_limit_us);
+	return sprintf(buf, "%u\n", tunables->up_rate_limit_us);
 }
 
-static ssize_t
-rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
+static ssize_t down_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
+{
+	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+
+	return sprintf(buf, "%u\n", tunables->down_rate_limit_us);
+}
+
+static ssize_t up_rate_limit_us_store(struct gov_attr_set *attr_set,
+				      const char *buf, size_t count)
 {
 	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
 	struct sugov_policy *sg_policy;
@@ -576,18 +646,42 @@
 	if (kstrtouint(buf, 10, &rate_limit_us))
 		return -EINVAL;
 
-	tunables->rate_limit_us = rate_limit_us;
+	tunables->up_rate_limit_us = rate_limit_us;
 
-	list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
-		sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;
+	list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
+		sg_policy->up_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
+		update_min_rate_limit_ns(sg_policy);
+	}
 
 	return count;
 }
 
-static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);
+static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set,
+					const char *buf, size_t count)
+{
+	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+	struct sugov_policy *sg_policy;
+	unsigned int rate_limit_us;
+
+	if (kstrtouint(buf, 10, &rate_limit_us))
+		return -EINVAL;
+
+	tunables->down_rate_limit_us = rate_limit_us;
+
+	list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
+		sg_policy->down_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
+		update_min_rate_limit_ns(sg_policy);
+	}
+
+	return count;
+}
+
+static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us);
+static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us);
 
 static struct attribute *sugov_attributes[] = {
-	&rate_limit_us.attr,
+	&up_rate_limit_us.attr,
+	&down_rate_limit_us.attr,
 	NULL
 };
 
@@ -598,7 +692,7 @@
 
 /********************** cpufreq governor interface *********************/
 
-static struct cpufreq_governor schedutil_gov;
+struct cpufreq_governor schedutil_gov;
 
 static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
 {
@@ -743,7 +837,8 @@
 		goto stop_kthread;
 	}
 
-	tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);
+	tunables->up_rate_limit_us = cpufreq_policy_transition_delay_us(policy);
+	tunables->down_rate_limit_us = cpufreq_policy_transition_delay_us(policy);
 
 	policy->governor_data = sg_policy;
 	sg_policy->tunables = tunables;
@@ -802,7 +897,11 @@
 	struct sugov_policy *sg_policy = policy->governor_data;
 	unsigned int cpu;
 
-	sg_policy->freq_update_delay_ns	= sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
+	sg_policy->up_rate_delay_ns =
+		sg_policy->tunables->up_rate_limit_us * NSEC_PER_USEC;
+	sg_policy->down_rate_delay_ns =
+		sg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC;
+	update_min_rate_limit_ns(sg_policy);
 	sg_policy->last_freq_update_time	= 0;
 	sg_policy->next_freq			= 0;
 	sg_policy->work_in_progress		= false;
@@ -861,7 +960,7 @@
 	sg_policy->limits_changed = true;
 }
 
-static struct cpufreq_governor schedutil_gov = {
+struct cpufreq_governor schedutil_gov = {
 	.name			= "schedutil",
 	.owner			= THIS_MODULE,
 	.dynamic_switching	= true,
@@ -884,3 +983,36 @@
 	return cpufreq_register_governor(&schedutil_gov);
 }
 fs_initcall(sugov_register);
+
+#ifdef CONFIG_ENERGY_MODEL
+extern bool sched_energy_update;
+extern struct mutex sched_energy_mutex;
+
+static void rebuild_sd_workfn(struct work_struct *work)
+{
+	mutex_lock(&sched_energy_mutex);
+	sched_energy_update = true;
+	rebuild_sched_domains();
+	sched_energy_update = false;
+	mutex_unlock(&sched_energy_mutex);
+}
+static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
+
+/*
+ * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
+ * on governor changes to make sure the scheduler knows about it.
+ */
+void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
+				  struct cpufreq_governor *old_gov)
+{
+	if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) {
+		/*
+		 * When called from the cpufreq_register_driver() path, the
+		 * cpu_hotplug_lock is already held, so use a work item to
+		 * avoid nested locking in rebuild_sched_domains().
+		 */
+		schedule_work(&rebuild_sd_work);
+	}
+
+}
+#endif

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index ebec37c..84cfedc 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c

@@ -1599,7 +1599,8 @@
 static int find_later_rq(struct task_struct *task);
 
 static int
-select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags,
+		  int sibling_count_hint)
 {
 	struct task_struct *curr;
 	struct rq *rq;
@@ -1793,7 +1794,7 @@
 	deadline_queue_push_tasks(rq);
 
 	if (rq->curr->sched_class != &dl_sched_class)
-		update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+		update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
 
 	return p;
 }
@@ -1802,7 +1803,7 @@
 {
 	update_curr_dl(rq);
 
-	update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
+	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
 	if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
 		enqueue_pushable_dl_task(rq, p);
 }
@@ -1819,7 +1820,7 @@
 {
 	update_curr_dl(rq);
 
-	update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
+	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
 	/*
 	 * Even when we have runtime, update_curr_dl() might have resulted in us
 	 * not being the leftmost task anymore. In that case NEED_RESCHED will

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 78fadf0..141ea9f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c

@@ -73,7 +73,7 @@
 	return 0;
 }
 
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 
 #define jump_label_key__true  STATIC_KEY_INIT_TRUE
 #define jump_label_key__false STATIC_KEY_INIT_FALSE
@@ -99,7 +99,7 @@
 #else
 static void sched_feat_disable(int i) { };
 static void sched_feat_enable(int i) { };
-#endif /* CONFIG_JUMP_LABEL */
+#endif /* HAVE_JUMP_LABEL */
 
 static int sched_feat_set(char *cmp)
 {

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7f4f4ab..28c14e8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c

@@ -41,6 +41,16 @@
 unsigned int normalized_sysctl_sched_latency		= 6000000ULL;
 
 /*
+ * Enable/disable honoring sync flag in energy-aware wakeups.
+ */
+unsigned int sysctl_sched_sync_hint_enable = 1;
+
+/*
+ * Enable/disable using cstate knowledge in idle sibling selection
+ */
+unsigned int sysctl_sched_cstate_aware = 0;
+
+/*
  * The initial- and re-scaling of tunables is configurable
  *
  * Options are:
@@ -248,13 +258,6 @@
  */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-
-/* cpu runqueue to which this cfs_rq is attached */
-static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
-{
-	return cfs_rq->rq;
-}
-
 static inline struct task_struct *task_of(struct sched_entity *se)
 {
 	SCHED_WARN_ON(!entity_is_task(se));
@@ -353,6 +356,18 @@
 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	if (cfs_rq->on_list) {
+		struct rq *rq = rq_of(cfs_rq);
+
+		/*
+		 * With cfs_rq being unthrottled/throttled during an enqueue,
+		 * it can happen the tmp_alone_branch points the a leaf that
+		 * we finally want to del. In this case, tmp_alone_branch moves
+		 * to the prev element but it will point to rq->leaf_cfs_rq_list
+		 * at the end of the enqueue.
+		 */
+		if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
+			rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
+
 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
 		cfs_rq->on_list = 0;
 	}
@@ -363,9 +378,10 @@
 	SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
 }
 
-/* Iterate through all cfs_rq's on a runqueue in bottom-up order */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+/* Iterate thr' all leaf cfs_rq's on a runqueue */
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
+	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
+				 leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
@@ -421,12 +437,6 @@
 	return container_of(se, struct task_struct, se);
 }
 
-static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
-{
-	return container_of(cfs_rq, struct rq, cfs);
-}
-
-
 #define for_each_sched_entity(se) \
 		for (; se; se = NULL)
 
@@ -462,8 +472,8 @@
 {
 }
 
-#define for_each_leaf_cfs_rq(rq, cfs_rq)	\
-		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
+		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
@@ -702,12 +712,12 @@
 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
-#ifdef CONFIG_SMP
 #include "pelt.h"
-#include "sched-pelt.h"
+#ifdef CONFIG_SMP
 
 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
 static unsigned long task_h_load(struct task_struct *p);
+static unsigned long capacity_of(int cpu);
 
 /* Give new sched_entity start runnable values to heavy its load in infant time */
 void init_entity_runnable_average(struct sched_entity *se)
@@ -791,7 +801,7 @@
 			 * such that the next switched_to_fair() has the
 			 * expected state.
 			 */
-			se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
+			se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
 			return;
 		}
 	}
@@ -956,6 +966,7 @@
 			}
 
 			trace_sched_stat_blocked(tsk, delta);
+			trace_sched_blocked_reason(tsk);
 
 			/*
 			 * Blocking time is in units of nanosecs, so shift by
@@ -1502,7 +1513,6 @@
 static unsigned long weighted_cpuload(struct rq *rq);
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
-static unsigned long capacity_of(int cpu);
 
 /* Cached statistics for all CPUs within a node */
 struct numa_stats {
@@ -3159,6 +3169,8 @@
 	if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
 		atomic_long_add(delta, &cfs_rq->tg->load_avg);
 		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
+
+		trace_sched_load_tg(cfs_rq);
 	}
 }
 
@@ -3207,7 +3219,7 @@
 	p_last_update_time = prev->avg.last_update_time;
 	n_last_update_time = next->avg.last_update_time;
 #endif
-	__update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
+	__update_load_avg_blocked_se(p_last_update_time, se);
 	se->avg.last_update_time = n_last_update_time;
 }
 
@@ -3342,11 +3354,11 @@
 
 	/*
 	 * runnable_sum can't be lower than running_sum
-	 * As running sum is scale with CPU capacity wehreas the runnable sum
-	 * is not we rescale running_sum 1st
+	 * Rescale running sum to be in the same range as runnable sum
+	 * running_sum is in [0 : LOAD_AVG_MAX <<  SCHED_CAPACITY_SHIFT]
+	 * runnable_sum is in [0 : LOAD_AVG_MAX]
 	 */
-	running_sum = se->avg.util_sum /
-		arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+	running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
 	runnable_sum = max(runnable_sum, running_sum);
 
 	load_sum = (s64)se_weight(se) * runnable_sum;
@@ -3401,6 +3413,9 @@
 	update_tg_cfs_util(cfs_rq, se, gcfs_rq);
 	update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
 
+	trace_sched_load_cfs_rq(cfs_rq);
+	trace_sched_load_se(se);
+
 	return 1;
 }
 
@@ -3449,7 +3464,7 @@
 
 /**
  * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
- * @now: current time, as per cfs_rq_clock_task()
+ * @now: current time, as per cfs_rq_clock_pelt()
  * @cfs_rq: cfs_rq to update
  *
  * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
@@ -3494,7 +3509,7 @@
 		decayed = 1;
 	}
 
-	decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
+	decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
 
 #ifndef CONFIG_64BIT
 	smp_wmb();
@@ -3553,6 +3568,8 @@
 	add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
 
 	cfs_rq_util_change(cfs_rq, flags);
+
+	trace_sched_load_cfs_rq(cfs_rq);
 }
 
 /**
@@ -3572,6 +3589,8 @@
 	add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
 
 	cfs_rq_util_change(cfs_rq, 0);
+
+	trace_sched_load_cfs_rq(cfs_rq);
 }
 
 /*
@@ -3584,9 +3603,7 @@
 /* Update task and its cfs_rq load average */
 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
-	u64 now = cfs_rq_clock_task(cfs_rq);
-	struct rq *rq = rq_of(cfs_rq);
-	int cpu = cpu_of(rq);
+	u64 now = cfs_rq_clock_pelt(cfs_rq);
 	int decayed;
 
 	/*
@@ -3594,7 +3611,7 @@
 	 * track group sched_entity load average for task_h_load calc in migration
 	 */
 	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
-		__update_load_avg_se(now, cpu, cfs_rq, se);
+		__update_load_avg_se(now, cfs_rq, se);
 
 	decayed  = update_cfs_rq_load_avg(now, cfs_rq);
 	decayed |= propagate_entity_load_avg(se);
@@ -3646,7 +3663,7 @@
 	u64 last_update_time;
 
 	last_update_time = cfs_rq_last_update_time(cfs_rq);
-	__update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
+	__update_load_avg_blocked_se(last_update_time, se);
 }
 
 /*
@@ -3719,6 +3736,10 @@
 	enqueued  = cfs_rq->avg.util_est.enqueued;
 	enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
 	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+
+	/* Update plots for Task and CPU estimated utilization */
+	trace_sched_util_est_task(p, &p->se.avg);
+	trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
 }
 
 /*
@@ -3739,6 +3760,7 @@
 {
 	long last_ewma_diff;
 	struct util_est ue;
+	int cpu;
 
 	if (!sched_feat(UTIL_EST))
 		return;
@@ -3749,6 +3771,9 @@
 			     (_task_util_est(p) | UTIL_AVG_UNCHANGED));
 	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
 
+	/* Update plots for CPU's estimated utilization */
+	trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
+
 	/*
 	 * Skip update of task's estimated utilization when the task has not
 	 * yet completed an activation, e.g. being migrated.
@@ -3774,6 +3799,14 @@
 		return;
 
 	/*
+	 * To avoid overestimation of actual task utilization, skip updates if
+	 * we cannot grant there is idle time in this CPU.
+	 */
+	cpu = cpu_of(rq_of(cfs_rq));
+	if (task_util(p) > capacity_orig_of(cpu))
+		return;
+
+	/*
 	 * Update Task's estimated utilization
 	 *
 	 * When *p completes an activation we can consolidate another sample
@@ -3794,6 +3827,32 @@
 	ue.ewma  += last_ewma_diff;
 	ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
 	WRITE_ONCE(p->se.avg.util_est, ue);
+
+	/* Update plots for Task's estimated utilization */
+	trace_sched_util_est_task(p, &p->se.avg);
+}
+
+static inline int task_fits_capacity(struct task_struct *p, long capacity)
+{
+	return capacity * 1024 > task_util_est(p) * capacity_margin;
+}
+
+static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
+{
+	if (!static_branch_unlikely(&sched_asym_cpucapacity))
+		return;
+
+	if (!p) {
+		rq->misfit_task_load = 0;
+		return;
+	}
+
+	if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
+		rq->misfit_task_load = 0;
+		return;
+	}
+
+	rq->misfit_task_load = task_h_load(p);
 }
 
 #else /* CONFIG_SMP */
@@ -3825,6 +3884,7 @@
 static inline void
 util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
 		 bool task_sleep) {}
+static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
 
 #endif /* CONFIG_SMP */
 
@@ -4279,7 +4339,7 @@
 
 #ifdef CONFIG_CFS_BANDWIDTH
 
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 static struct static_key __cfs_bandwidth_used;
 
 static inline bool cfs_bandwidth_used(void)
@@ -4296,7 +4356,7 @@
 {
 	static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
 }
-#else /* CONFIG_JUMP_LABEL */
+#else /* HAVE_JUMP_LABEL */
 static bool cfs_bandwidth_used(void)
 {
 	return true;
@@ -4304,7 +4364,7 @@
 
 void cfs_bandwidth_usage_inc(void) {}
 void cfs_bandwidth_usage_dec(void) {}
-#endif /* CONFIG_JUMP_LABEL */
+#endif /* HAVE_JUMP_LABEL */
 
 /*
  * default period for cfs group bandwidth.
@@ -4441,6 +4501,10 @@
 		/* adjust cfs_rq_clock_task() */
 		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
 					     cfs_rq->throttled_clock_task;
+
+		/* Add cfs_rq with already running entity in the list */
+		if (cfs_rq->nr_running >= 1)
+			list_add_leaf_cfs_rq(cfs_rq);
 	}
 
 	return 0;
@@ -4452,8 +4516,10 @@
 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
 	/* group is entering throttled state, stop time */
-	if (!cfs_rq->throttle_count)
+	if (!cfs_rq->throttle_count) {
 		cfs_rq->throttled_clock_task = rq_clock_task(rq);
+		list_del_leaf_cfs_rq(cfs_rq);
+	}
 	cfs_rq->throttle_count++;
 
 	return 0;
@@ -4556,6 +4622,8 @@
 			break;
 	}
 
+	assert_list_leaf_cfs_rq(rq);
+
 	if (!se)
 		add_nr_running(rq, task_delta);
 
@@ -5122,6 +5190,26 @@
 }
 #endif
 
+#ifdef CONFIG_SMP
+static inline unsigned long cpu_util(int cpu);
+static unsigned long capacity_of(int cpu);
+
+static inline bool cpu_overutilized(int cpu)
+{
+	return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+}
+
+static inline void update_overutilized_status(struct rq *rq)
+{
+	if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
+		WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
+		trace_sched_overutilized(1);
+	}
+}
+#else
+static inline void update_overutilized_status(struct rq *rq) { }
+#endif
+
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
@@ -5142,6 +5230,24 @@
 	util_est_enqueue(&rq->cfs, p);
 
 	/*
+	 * The code below (indirectly) updates schedutil which looks at
+	 * the cfs_rq utilization to select a frequency.
+	 * Let's update schedtune here to ensure the boost value of the
+	 * current task is accounted for in the selection of the OPP.
+	 *
+	 * We do it also in the case where we enqueue a throttled task;
+	 * we could argue that a throttled task should not boost a CPU,
+	 * however:
+	 * a) properly implementing CPU boosting considering throttled
+	 *    tasks will increase a lot the complexity of the solution
+	 * b) it's not easy to quantify the benefits introduced by
+	 *    such a more complex solution.
+	 * Thus, for the time being we go for the simple solution and boost
+	 * also for throttled RQs.
+	 */
+	schedtune_enqueue_task(p, cpu_of(rq));
+
+	/*
 	 * If in_iowait is set, the code below may not trigger any cpufreq
 	 * utilization updates, so do it here explicitly with the IOWAIT flag
 	 * passed.
@@ -5179,8 +5285,26 @@
 		update_cfs_group(se);
 	}
 
-	if (!se)
+	if (!se) {
 		add_nr_running(rq, 1);
+		/*
+		 * Since new tasks are assigned an initial util_avg equal to
+		 * half of the spare capacity of their CPU, tiny tasks have the
+		 * ability to cross the overutilized threshold, which will
+		 * result in the load balancer ruining all the task placement
+		 * done by EAS. As a way to mitigate that effect, do not account
+		 * for the first enqueue operation of new tasks during the
+		 * overutilized flag detection.
+		 *
+		 * A better way of solving this problem would be to wait for
+		 * the PELT signals of tasks to converge before taking them
+		 * into account, but that is not straightforward to implement,
+		 * and the following generally works well enough in practice.
+		 */
+		if (flags & ENQUEUE_WAKEUP)
+			update_overutilized_status(rq);
+
+	}
 
 	if (cfs_bandwidth_used()) {
 		/*
@@ -5215,6 +5339,14 @@
 	struct sched_entity *se = &p->se;
 	int task_sleep = flags & DEQUEUE_SLEEP;
 
+	/*
+	 * The code below (indirectly) updates schedutil which looks at
+	 * the cfs_rq utilization to select a frequency.
+	 * Let's update schedtune here to ensure the boost value of the
+	 * current task is not more accounted for in the selection of the OPP.
+	 */
+	schedtune_dequeue_task(p, cpu_of(rq));
+
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
@@ -5578,11 +5710,6 @@
 	return cpu_rq(cpu)->cpu_capacity;
 }
 
-static unsigned long capacity_orig_of(int cpu)
-{
-	return cpu_rq(cpu)->cpu_capacity_orig;
-}
-
 static unsigned long cpu_avg_load_per_task(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -5629,15 +5756,18 @@
  * whatever is irrelevant, spread criteria is apparent partner count exceeds
  * socket size.
  */
-static int wake_wide(struct task_struct *p)
+static int wake_wide(struct task_struct *p, int sibling_count_hint)
 {
 	unsigned int master = current->wakee_flips;
 	unsigned int slave = p->wakee_flips;
-	int factor = this_cpu_read(sd_llc_size);
+	int llc_size = this_cpu_read(sd_llc_size);
+
+	if (sibling_count_hint >= llc_size)
+		return 1;
 
 	if (master < slave)
 		swap(master, slave);
-	if (slave < factor || master < slave * factor)
+	if (slave < llc_size || master < slave * llc_size)
 		return 0;
 	return 1;
 }
@@ -5741,6 +5871,100 @@
 	return target;
 }
 
+#ifdef CONFIG_SCHED_TUNE
+struct reciprocal_value schedtune_spc_rdiv;
+
+static long
+schedtune_margin(unsigned long signal, long boost)
+{
+	long long margin = 0;
+
+	/*
+	 * Signal proportional compensation (SPC)
+	 *
+	 * The Boost (B) value is used to compute a Margin (M) which is
+	 * proportional to the complement of the original Signal (S):
+	 *   M = B * (SCHED_CAPACITY_SCALE - S)
+	 * The obtained M could be used by the caller to "boost" S.
+	 */
+	if (boost >= 0) {
+		margin  = SCHED_CAPACITY_SCALE - signal;
+		margin *= boost;
+	} else
+		margin = -signal * boost;
+
+	margin  = reciprocal_divide(margin, schedtune_spc_rdiv);
+
+	if (boost < 0)
+		margin *= -1;
+	return margin;
+}
+
+static inline int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+	int boost = schedtune_cpu_boost(cpu);
+
+	if (boost == 0)
+		return 0;
+
+	return schedtune_margin(util, boost);
+}
+
+static inline long
+schedtune_task_margin(struct task_struct *task)
+{
+	int boost = schedtune_task_boost(task);
+	unsigned long util;
+	long margin;
+
+	if (boost == 0)
+		return 0;
+
+	util = task_util_est(task);
+	margin = schedtune_margin(util, boost);
+
+	return margin;
+}
+
+unsigned long
+boosted_cpu_util(int cpu, unsigned long other_util)
+{
+	unsigned long util = cpu_util_cfs(cpu_rq(cpu)) + other_util;
+	long margin = schedtune_cpu_margin(util, cpu);
+
+	trace_sched_boost_cpu(cpu, util, margin);
+
+	return util + margin;
+}
+
+#else /* CONFIG_SCHED_TUNE */
+
+static inline int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+	return 0;
+}
+
+static inline int
+schedtune_task_margin(struct task_struct *task)
+{
+	return 0;
+}
+
+#endif /* CONFIG_SCHED_TUNE */
+
+static inline unsigned long
+boosted_task_util(struct task_struct *task)
+{
+	unsigned long util = task_util_est(task);
+	long margin = schedtune_task_margin(task);
+
+	trace_sched_boost_task(task, util, margin);
+
+	return util + margin;
+}
+
 static unsigned long cpu_util_without(int cpu, struct task_struct *p);
 
 static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
@@ -6374,6 +6598,321 @@
 }
 
 /*
+ * Returns the current capacity of cpu after applying both
+ * cpu and freq scaling.
+ */
+unsigned long capacity_curr_of(int cpu)
+{
+	unsigned long max_cap = cpu_rq(cpu)->cpu_capacity_orig;
+	unsigned long scale_freq = arch_scale_freq_capacity(cpu);
+
+	return cap_scale(max_cap, scale_freq);
+}
+
+static void find_best_target(struct sched_domain *sd, cpumask_t *cpus,
+							struct task_struct *p)
+{
+	unsigned long min_util = boosted_task_util(p);
+	unsigned long target_capacity = ULONG_MAX;
+	unsigned long min_wake_util = ULONG_MAX;
+	unsigned long target_max_spare_cap = 0;
+	unsigned long target_util = ULONG_MAX;
+	bool prefer_idle = schedtune_prefer_idle(p);
+	bool boosted = schedtune_task_boost(p) > 0;
+	/* Initialise with deepest possible cstate (INT_MAX) */
+	int shallowest_idle_cstate = INT_MAX;
+	struct sched_group *sg;
+	int best_active_cpu = -1;
+	int best_idle_cpu = -1;
+	int target_cpu = -1;
+	int backup_cpu = -1;
+	int i;
+
+	/*
+	 * In most cases, target_capacity tracks capacity_orig of the most
+	 * energy efficient CPU candidate, thus requiring to minimise
+	 * target_capacity. For these cases target_capacity is already
+	 * initialized to ULONG_MAX.
+	 * However, for prefer_idle and boosted tasks we look for a high
+	 * performance CPU, thus requiring to maximise target_capacity. In this
+	 * case we initialise target_capacity to 0.
+	 */
+	if (prefer_idle && boosted)
+		target_capacity = 0;
+
+	/* Scan CPUs in all SDs */
+	sg = sd->groups;
+	do {
+		for_each_cpu_and(i, &p->cpus_allowed, sched_group_span(sg)) {
+			unsigned long capacity_curr = capacity_curr_of(i);
+			unsigned long capacity_orig = capacity_orig_of(i);
+			unsigned long wake_util, new_util;
+			long spare_cap;
+			int idle_idx = INT_MAX;
+
+			if (!cpu_online(i))
+				continue;
+
+			/*
+			 * p's blocked utilization is still accounted for on prev_cpu
+			 * so prev_cpu will receive a negative bias due to the double
+			 * accounting. However, the blocked utilization may be zero.
+			 */
+			wake_util = cpu_util_without(i, p);
+			new_util = wake_util + task_util_est(p);
+
+			/*
+			 * Ensure minimum capacity to grant the required boost.
+			 * The target CPU can be already at a capacity level higher
+			 * than the one required to boost the task.
+			 */
+			new_util = max(min_util, new_util);
+			if (new_util > capacity_orig)
+				continue;
+
+			/*
+			 * Pre-compute the maximum possible capacity we expect
+			 * to have available on this CPU once the task is
+			 * enqueued here.
+			 */
+			spare_cap = capacity_orig - new_util;
+
+			if (idle_cpu(i))
+				idle_idx = idle_get_state_idx(cpu_rq(i));
+
+
+			/*
+			 * Case A) Latency sensitive tasks
+			 *
+			 * Unconditionally favoring tasks that prefer idle CPU to
+			 * improve latency.
+			 *
+			 * Looking for:
+			 * - an idle CPU, whatever its idle_state is, since
+			 *   the first CPUs we explore are more likely to be
+			 *   reserved for latency sensitive tasks.
+			 * - a non idle CPU where the task fits in its current
+			 *   capacity and has the maximum spare capacity.
+			 * - a non idle CPU with lower contention from other
+			 *   tasks and running at the lowest possible OPP.
+			 *
+			 * The last two goals tries to favor a non idle CPU
+			 * where the task can run as if it is "almost alone".
+			 * A maximum spare capacity CPU is favoured since
+			 * the task already fits into that CPU's capacity
+			 * without waiting for an OPP chance.
+			 *
+			 * The following code path is the only one in the CPUs
+			 * exploration loop which is always used by
+			 * prefer_idle tasks. It exits the loop with wither a
+			 * best_active_cpu or a target_cpu which should
+			 * represent an optimal choice for latency sensitive
+			 * tasks.
+			 */
+			if (prefer_idle) {
+
+				/*
+				 * Case A.1: IDLE CPU
+				 * Return the best IDLE CPU we find:
+				 * - for boosted tasks: the CPU with the highest
+				 * performance (i.e. biggest capacity_orig)
+				 * - for !boosted tasks: the most energy
+				 * efficient CPU (i.e. smallest capacity_orig)
+				 */
+				if (idle_cpu(i)) {
+					if (boosted &&
+					    capacity_orig < target_capacity)
+						continue;
+					if (!boosted &&
+					    capacity_orig > target_capacity)
+						continue;
+					/*
+					 * Minimise value of idle state: skip
+					 * deeper idle states and pick the
+					 * shallowest.
+					 */
+					if (capacity_orig == target_capacity &&
+					    sysctl_sched_cstate_aware &&
+					    idle_idx >= shallowest_idle_cstate)
+						continue;
+
+					target_capacity = capacity_orig;
+					shallowest_idle_cstate = idle_idx;
+					best_idle_cpu = i;
+					continue;
+				}
+				if (best_idle_cpu != -1)
+					continue;
+
+				/*
+				 * Case A.2: Target ACTIVE CPU
+				 * Favor CPUs with max spare capacity.
+				 */
+				if (capacity_curr > new_util &&
+				    spare_cap > target_max_spare_cap) {
+					target_max_spare_cap = spare_cap;
+					target_cpu = i;
+					continue;
+				}
+				if (target_cpu != -1)
+					continue;
+
+
+				/*
+				 * Case A.3: Backup ACTIVE CPU
+				 * Favor CPUs with:
+				 * - lower utilization due to other tasks
+				 * - lower utilization with the task in
+				 */
+				if (wake_util > min_wake_util)
+					continue;
+				min_wake_util = wake_util;
+				best_active_cpu = i;
+				continue;
+			}
+
+			/*
+			 * Enforce EAS mode
+			 *
+			 * For non latency sensitive tasks, skip CPUs that
+			 * will be overutilized by moving the task there.
+			 *
+			 * The goal here is to remain in EAS mode as long as
+			 * possible at least for !prefer_idle tasks.
+			 */
+			if ((new_util * capacity_margin) >
+			    (capacity_orig * SCHED_CAPACITY_SCALE))
+				continue;
+
+			/*
+			 * Favor CPUs with smaller capacity for non latency
+			 * sensitive tasks.
+			 */
+			if (capacity_orig > target_capacity)
+				continue;
+
+			/*
+			 * Case B) Non latency sensitive tasks on IDLE CPUs.
+			 *
+			 * Find an optimal backup IDLE CPU for non latency
+			 * sensitive tasks.
+			 *
+			 * Looking for:
+			 * - minimizing the capacity_orig,
+			 *   i.e. preferring LITTLE CPUs
+			 * - favoring shallowest idle states
+			 *   i.e. avoid to wakeup deep-idle CPUs
+			 *
+			 * The following code path is used by non latency
+			 * sensitive tasks if IDLE CPUs are available. If at
+			 * least one of such CPUs are available it sets the
+			 * best_idle_cpu to the most suitable idle CPU to be
+			 * selected.
+			 *
+			 * If idle CPUs are available, favour these CPUs to
+			 * improve performances by spreading tasks.
+			 * Indeed, the energy_diff() computed by the caller
+			 * will take care to ensure the minimization of energy
+			 * consumptions without affecting performance.
+			 */
+			if (idle_cpu(i)) {
+				/*
+				 * Skip CPUs in deeper idle state, but only
+				 * if they are also less energy efficient.
+				 * IOW, prefer a deep IDLE LITTLE CPU vs a
+				 * shallow idle big CPU.
+				 */
+				if (capacity_orig == target_capacity &&
+				    sysctl_sched_cstate_aware &&
+				    idle_idx >= shallowest_idle_cstate)
+					continue;
+
+				target_capacity = capacity_orig;
+				shallowest_idle_cstate = idle_idx;
+				best_idle_cpu = i;
+				continue;
+			}
+
+			/*
+			 * Case C) Non latency sensitive tasks on ACTIVE CPUs.
+			 *
+			 * Pack tasks in the most energy efficient capacities.
+			 *
+			 * This task packing strategy prefers more energy
+			 * efficient CPUs (i.e. pack on smaller maximum
+			 * capacity CPUs) while also trying to spread tasks to
+			 * run them all at the lower OPP.
+			 *
+			 * This assumes for example that it's more energy
+			 * efficient to run two tasks on two CPUs at a lower
+			 * OPP than packing both on a single CPU but running
+			 * that CPU at an higher OPP.
+			 *
+			 * Thus, this case keep track of the CPU with the
+			 * smallest maximum capacity and highest spare maximum
+			 * capacity.
+			 */
+
+			/* Favor CPUs with maximum spare capacity */
+			if (capacity_orig == target_capacity &&
+			    spare_cap < target_max_spare_cap)
+				continue;
+
+			target_max_spare_cap = spare_cap;
+			target_capacity = capacity_orig;
+			target_util = new_util;
+			target_cpu = i;
+		}
+
+	} while (sg = sg->next, sg != sd->groups);
+
+	/*
+	 * For non latency sensitive tasks, cases B and C in the previous loop,
+	 * we pick the best IDLE CPU only if we was not able to find a target
+	 * ACTIVE CPU.
+	 *
+	 * Policies priorities:
+	 *
+	 * - prefer_idle tasks:
+	 *
+	 *   a) IDLE CPU available: best_idle_cpu
+	 *   b) ACTIVE CPU where task fits and has the bigger maximum spare
+	 *      capacity (i.e. target_cpu)
+	 *   c) ACTIVE CPU with less contention due to other tasks
+	 *      (i.e. best_active_cpu)
+	 *
+	 * - NON prefer_idle tasks:
+	 *
+	 *   a) ACTIVE CPU: target_cpu
+	 *   b) IDLE CPU: best_idle_cpu
+	 */
+
+	if (prefer_idle && (best_idle_cpu != -1)) {
+		target_cpu = best_idle_cpu;
+		goto target;
+	}
+
+	if (target_cpu == -1)
+		target_cpu = prefer_idle
+			? best_active_cpu
+			: best_idle_cpu;
+	else
+		backup_cpu = prefer_idle
+		? best_active_cpu
+		: best_idle_cpu;
+
+	if (backup_cpu >= 0)
+		cpumask_set_cpu(backup_cpu, cpus);
+	if (target_cpu >= 0) {
+target:
+		cpumask_set_cpu(target_cpu, cpus);
+	}
+
+	trace_sched_find_best_target(p, prefer_idle, min_util, best_idle_cpu,
+			             best_active_cpu, target_cpu, backup_cpu);
+}
+
+/*
  * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
  * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
  *
@@ -6384,8 +6923,11 @@
 {
 	long min_cap, max_cap;
 
+	if (!static_branch_unlikely(&sched_asym_cpucapacity))
+		return 0;
+
 	min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
-	max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
+	max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
 
 	/* Minimum capacity is close to max, no need to abort wake_affine */
 	if (max_cap - min_cap < max_cap >> 3)
@@ -6394,7 +6936,255 @@
 	/* Bring task utilization in sync with prev_cpu */
 	sync_entity_load_avg(&p->se);
 
-	return min_cap * 1024 < task_util(p) * capacity_margin;
+	return !task_fits_capacity(p, min_cap);
+}
+
+/*
+ * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
+ * to @dst_cpu.
+ */
+static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
+{
+	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+	unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
+
+	/*
+	 * If @p migrates from @cpu to another, remove its contribution. Or,
+	 * if @p migrates from another CPU to @cpu, add its contribution. In
+	 * the other cases, @cpu is not impacted by the migration, so the
+	 * util_avg should already be correct.
+	 */
+	if (task_cpu(p) == cpu && dst_cpu != cpu)
+		sub_positive(&util, task_util(p));
+	else if (task_cpu(p) != cpu && dst_cpu == cpu)
+		util += task_util(p);
+
+	if (sched_feat(UTIL_EST)) {
+		util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+		/*
+		 * During wake-up, the task isn't enqueued yet and doesn't
+		 * appear in the cfs_rq->avg.util_est.enqueued of any rq,
+		 * so just add it (if needed) to "simulate" what will be
+		 * cpu_util() after the task has been enqueued.
+		 */
+		if (dst_cpu == cpu)
+			util_est += _task_util_est(p);
+
+		util = max(util, util_est);
+	}
+
+	return min(util, capacity_orig_of(cpu));
+}
+
+/*
+ * compute_energy(): Estimates the energy that would be consumed if @p was
+ * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
+ * landscape of the * CPUs after the task migration, and uses the Energy Model
+ * to compute what would be the energy if we decided to actually migrate that
+ * task.
+ */
+static long
+compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
+{
+	long util, max_util, sum_util, energy = 0;
+	int cpu;
+
+	for (; pd; pd = pd->next) {
+		max_util = sum_util = 0;
+		/*
+		 * The capacity state of CPUs of the current rd can be driven by
+		 * CPUs of another rd if they belong to the same performance
+		 * domain. So, account for the utilization of these CPUs too
+		 * by masking pd with cpu_online_mask instead of the rd span.
+		 *
+		 * If an entire performance domain is outside of the current rd,
+		 * it will not appear in its pd list and will not be accounted
+		 * by compute_energy().
+		 */
+		for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) {
+			util = cpu_util_next(cpu, p, dst_cpu);
+			util += cpu_util_rt(cpu_rq(cpu));
+			util = schedutil_energy_util(cpu, util);
+			max_util = max(util, max_util);
+			sum_util += util;
+		}
+
+		energy += em_pd_energy(pd->em_pd, max_util, sum_util);
+	}
+
+	return energy;
+}
+
+static void select_max_spare_cap_cpus(struct sched_domain *sd, cpumask_t *cpus,
+		struct perf_domain *pd, struct task_struct *p)
+{
+	unsigned long spare_cap, max_spare_cap, util, cpu_cap;
+	int cpu, max_spare_cap_cpu;
+
+	for (; pd; pd = pd->next) {
+		max_spare_cap_cpu = -1;
+		max_spare_cap = 0;
+
+		for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
+			if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+				continue;
+
+			/* Skip CPUs that will be overutilized. */
+			util = cpu_util_next(cpu, p, cpu);
+			cpu_cap = capacity_of(cpu);
+			if (cpu_cap * 1024 < util * capacity_margin)
+				continue;
+
+			/*
+			 * Find the CPU with the maximum spare capacity in
+			 * the performance domain
+			 */
+			spare_cap = cpu_cap - util;
+			if (spare_cap > max_spare_cap) {
+				max_spare_cap = spare_cap;
+				max_spare_cap_cpu = cpu;
+			}
+		}
+
+		if (max_spare_cap_cpu >= 0)
+			cpumask_set_cpu(max_spare_cap_cpu, cpus);
+	}
+}
+
+static DEFINE_PER_CPU(cpumask_t, energy_cpus);
+
+/*
+ * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
+ * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
+ * spare capacity in each performance domain and uses it as a potential
+ * candidate to execute the task. Then, it uses the Energy Model to figure
+ * out which of the CPU candidates is the most energy-efficient.
+ *
+ * The rationale for this heuristic is as follows. In a performance domain,
+ * all the most energy efficient CPU candidates (according to the Energy
+ * Model) are those for which we'll request a low frequency. When there are
+ * several CPUs for which the frequency request will be the same, we don't
+ * have enough data to break the tie between them, because the Energy Model
+ * only includes active power costs. With this model, if we assume that
+ * frequency requests follow utilization (e.g. using schedutil), the CPU with
+ * the maximum spare capacity in a performance domain is guaranteed to be among
+ * the best candidates of the performance domain.
+ *
+ * In practice, it could be preferable from an energy standpoint to pack
+ * small tasks on a CPU in order to let other CPUs go in deeper idle states,
+ * but that could also hurt our chances to go cluster idle, and we have no
+ * ways to tell with the current Energy Model if this is actually a good
+ * idea or not. So, find_energy_efficient_cpu() basically favors
+ * cluster-packing, and spreading inside a cluster. That should at least be
+ * a good thing for latency, and this is consistent with the idea that most
+ * of the energy savings of EAS come from the asymmetry of the system, and
+ * not so much from breaking the tie between identical CPUs. That's also the
+ * reason why EAS is enabled in the topology code only for systems where
+ * SD_ASYM_CPUCAPACITY is set.
+ *
+ * NOTE: Forkees are not accepted in the energy-aware wake-up path because
+ * they don't have any useful utilization data yet and it's not possible to
+ * forecast their impact on energy consumption. Consequently, they will be
+ * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
+ * to be energy-inefficient in some use-cases. The alternative would be to
+ * bias new tasks towards specific types of CPUs first, or to try to infer
+ * their util_avg from the parent task, but those heuristics could hurt
+ * other use-cases too. So, until someone finds a better way to solve this,
+ * let's keep things simple by re-using the existing slow path.
+ */
+
+static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync)
+{
+	unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
+	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+	int weight, cpu, best_energy_cpu = prev_cpu;
+	unsigned long cur_energy;
+	struct perf_domain *pd;
+	struct sched_domain *sd;
+	cpumask_t *candidates;
+
+	if (sysctl_sched_sync_hint_enable && sync) {
+		cpu = smp_processor_id();
+		if (cpumask_test_cpu(cpu, &p->cpus_allowed))
+			return cpu;
+	}
+
+	rcu_read_lock();
+	pd = rcu_dereference(rd->pd);
+	if (!pd || READ_ONCE(rd->overutilized))
+		goto fail;
+
+	/*
+	 * Energy-aware wake-up happens on the lowest sched_domain starting
+	 * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
+	 */
+	sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
+	while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
+		sd = sd->parent;
+	if (!sd)
+		goto fail;
+
+	sync_entity_load_avg(&p->se);
+	if (!task_util_est(p))
+		goto unlock;
+
+	/* Pre-select a set of candidate CPUs. */
+	candidates = this_cpu_ptr(&energy_cpus);
+	cpumask_clear(candidates);
+
+	if (sched_feat(FIND_BEST_TARGET))
+		find_best_target(sd, candidates, p);
+	else
+		select_max_spare_cap_cpus(sd, candidates, pd, p);
+
+	/* Bail out if no candidate was found. */
+	weight = cpumask_weight(candidates);
+	if (!weight)
+		goto unlock;
+
+	/* If there is only one sensible candidate, select it now. */
+	cpu = cpumask_first(candidates);
+	if (weight == 1 && ((schedtune_prefer_idle(p) && idle_cpu(cpu)) ||
+			    (cpu == prev_cpu))) {
+		best_energy_cpu = cpu;
+		goto unlock;
+	}
+
+	if (cpumask_test_cpu(prev_cpu, &p->cpus_allowed))
+		prev_energy = best_energy = compute_energy(p, prev_cpu, pd);
+	else
+		prev_energy = best_energy = ULONG_MAX;
+
+	/* Select the best candidate energy-wise. */
+	for_each_cpu(cpu, candidates) {
+		if (cpu == prev_cpu)
+			continue;
+		cur_energy = compute_energy(p, cpu, pd);
+		if (cur_energy < best_energy) {
+			best_energy = cur_energy;
+			best_energy_cpu = cpu;
+		}
+	}
+unlock:
+	rcu_read_unlock();
+
+	/*
+	 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
+	 * least 6% of the energy used by prev_cpu.
+	 */
+	if (prev_energy == ULONG_MAX)
+		return best_energy_cpu;
+
+	if ((prev_energy - best_energy) > (prev_energy >> 4))
+		return best_energy_cpu;
+
+	return prev_cpu;
+
+fail:
+	rcu_read_unlock();
+
+	return -1;
 }
 
 /*
@@ -6410,7 +7200,8 @@
  * preempt must be disabled.
  */
 static int
-select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
+		    int sibling_count_hint)
 {
 	struct sched_domain *tmp, *sd = NULL;
 	int cpu = smp_processor_id();
@@ -6420,10 +7211,23 @@
 
 	if (sd_flag & SD_BALANCE_WAKE) {
 		record_wakee(p);
-		want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
-			      && cpumask_test_cpu(cpu, &p->cpus_allowed);
+
+		if (static_branch_unlikely(&sched_energy_present)) {
+			if (schedtune_prefer_idle(p) && !sched_feat(EAS_PREFER_IDLE) && !sync)
+				goto sd_loop;
+
+			new_cpu = find_energy_efficient_cpu(p, prev_cpu, sync);
+			if (new_cpu >= 0)
+				return new_cpu;
+			new_cpu = prev_cpu;
+		}
+
+		want_affine = !wake_wide(p, sibling_count_hint) &&
+			      !wake_cap(p, cpu, prev_cpu) &&
+			      cpumask_test_cpu(cpu, &p->cpus_allowed);
 	}
 
+sd_loop:
 	rcu_read_lock();
 	for_each_domain(cpu, tmp) {
 		if (!(tmp->flags & SD_LOAD_BALANCE))
@@ -6813,9 +7617,12 @@
 	if (hrtick_enabled(rq))
 		hrtick_start_fair(rq, p);
 
+	update_misfit_status(p, rq);
+
 	return p;
 
 idle:
+	update_misfit_status(NULL, rq);
 	new_tasks = idle_balance(rq, rf);
 
 	/*
@@ -6829,6 +7636,12 @@
 	if (new_tasks > 0)
 		goto again;
 
+	/*
+	 * rq is about to be idle, check if we need to update the
+	 * lost_idle_time of clock_pelt
+	 */
+	update_idle_rq_clock_pelt(rq);
+
 	return NULL;
 }
 
@@ -7021,6 +7834,13 @@
 
 enum fbq_type { regular, remote, all };
 
+enum group_type {
+	group_other = 0,
+	group_misfit_task,
+	group_imbalanced,
+	group_overloaded,
+};
+
 #define LBF_ALL_PINNED	0x01
 #define LBF_NEED_BREAK	0x02
 #define LBF_DST_PINNED  0x04
@@ -7041,6 +7861,7 @@
 	int			new_dst_cpu;
 	enum cpu_idle_type	idle;
 	long			imbalance;
+	unsigned int		src_grp_nr_running;
 	/* The set of CPUs under consideration for load-balancing */
 	struct cpumask		*cpus;
 
@@ -7051,6 +7872,7 @@
 	unsigned int		loop_max;
 
 	enum fbq_type		fbq_type;
+	enum group_type		src_grp_type;
 	struct list_head	tasks;
 };
 
@@ -7441,10 +8263,27 @@
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->load.weight)
+		return false;
+
+	if (cfs_rq->avg.load_sum)
+		return false;
+
+	if (cfs_rq->avg.util_sum)
+		return false;
+
+	if (cfs_rq->avg.runnable_load_sum)
+		return false;
+
+	return true;
+}
+
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 	const struct sched_class *curr_class;
 	struct rq_flags rf;
 	bool done = true;
@@ -7456,14 +8295,10 @@
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
+	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
 		struct sched_entity *se;
 
-		/* throttled entities do not contribute to load */
-		if (throttled_hierarchy(cfs_rq))
-			continue;
-
-		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+		if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
 			update_tg_load_avg(cfs_rq, 0);
 
 		/* Propagate pending load changes to the parent, if any: */
@@ -7471,14 +8306,21 @@
 		if (se && !skip_blocked_update(se))
 			update_load_avg(cfs_rq_of(se), se, 0);
 
+		/*
+		 * There can be a lot of idle CPU cgroups.  Don't let fully
+		 * decayed cfs_rqs linger on the list.
+		 */
+		if (cfs_rq_is_decayed(cfs_rq))
+			list_del_leaf_cfs_rq(cfs_rq);
+
 		/* Don't need periodic decay once load/util_avg are null */
 		if (cfs_rq_has_blocked(cfs_rq))
 			done = false;
 	}
 
 	curr_class = rq->curr->sched_class;
-	update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
-	update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
 	update_irq_load_avg(rq, 0);
 	/* Don't need periodic decay once load/util_avg are null */
 	if (others_have_blocked(rq))
@@ -7548,11 +8390,11 @@
 
 	rq_lock_irqsave(rq, &rf);
 	update_rq_clock(rq);
-	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+	update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
 
 	curr_class = rq->curr->sched_class;
-	update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
-	update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
 	update_irq_load_avg(rq, 0);
 #ifdef CONFIG_NO_HZ_COMMON
 	rq->last_blocked_load_update_tick = jiffies;
@@ -7570,12 +8412,6 @@
 
 /********** Helpers for find_busiest_group ************************/
 
-enum group_type {
-	group_other = 0,
-	group_imbalanced,
-	group_overloaded,
-};
-
 /*
  * sg_lb_stats - stats of a sched_group required for load_balancing
  */
@@ -7591,6 +8427,7 @@
 	unsigned int group_weight;
 	enum group_type group_type;
 	int group_no_capacity;
+	unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int nr_numa_running;
 	unsigned int nr_preferred_running;
@@ -7663,10 +8500,9 @@
 	return load_idx;
 }
 
-static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
+static unsigned long scale_rt_capacity(int cpu, unsigned long max)
 {
 	struct rq *rq = cpu_rq(cpu);
-	unsigned long max = arch_scale_cpu_capacity(sd, cpu);
 	unsigned long used, free;
 	unsigned long irq;
 
@@ -7686,12 +8522,46 @@
 	return scale_irq_capacity(free, irq, max);
 }
 
+void init_max_cpu_capacity(struct max_cpu_capacity *mcc) {
+	raw_spin_lock_init(&mcc->lock);
+	mcc->val = 0;
+	mcc->cpu = -1;
+}
+
 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 {
-	unsigned long capacity = scale_rt_capacity(sd, cpu);
+	unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
 	struct sched_group *sdg = sd->groups;
+	struct max_cpu_capacity *mcc;
+	unsigned long max_capacity;
+	int max_cap_cpu;
+	unsigned long flags;
 
-	cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu);
+	cpu_rq(cpu)->cpu_capacity_orig = capacity;
+
+	capacity *= arch_scale_max_freq_capacity(sd, cpu);
+	capacity >>= SCHED_CAPACITY_SHIFT;
+
+	mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
+
+	raw_spin_lock_irqsave(&mcc->lock, flags);
+	max_capacity = mcc->val;
+	max_cap_cpu = mcc->cpu;
+
+	if ((max_capacity > capacity && max_cap_cpu == cpu) ||
+	    (max_capacity < capacity)) {
+		mcc->val = capacity;
+		mcc->cpu = cpu;
+#ifdef CONFIG_SCHED_DEBUG
+		raw_spin_unlock_irqrestore(&mcc->lock, flags);
+		pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity);
+		goto skip_unlock;
+#endif
+	}
+	raw_spin_unlock_irqrestore(&mcc->lock, flags);
+
+skip_unlock: __attribute__ ((unused));
+	capacity = scale_rt_capacity(cpu, capacity);
 
 	if (!capacity)
 		capacity = 1;
@@ -7699,13 +8569,14 @@
 	cpu_rq(cpu)->cpu_capacity = capacity;
 	sdg->sgc->capacity = capacity;
 	sdg->sgc->min_capacity = capacity;
+	sdg->sgc->max_capacity = capacity;
 }
 
 void update_group_capacity(struct sched_domain *sd, int cpu)
 {
 	struct sched_domain *child = sd->child;
 	struct sched_group *group, *sdg = sd->groups;
-	unsigned long capacity, min_capacity;
+	unsigned long capacity, min_capacity, max_capacity;
 	unsigned long interval;
 
 	interval = msecs_to_jiffies(sd->balance_interval);
@@ -7719,6 +8590,7 @@
 
 	capacity = 0;
 	min_capacity = ULONG_MAX;
+	max_capacity = 0;
 
 	if (child->flags & SD_OVERLAP) {
 		/*
@@ -7749,6 +8621,7 @@
 			}
 
 			min_capacity = min(capacity, min_capacity);
+			max_capacity = max(capacity, max_capacity);
 		}
 	} else  {
 		/*
@@ -7762,12 +8635,14 @@
 
 			capacity += sgc->capacity;
 			min_capacity = min(sgc->min_capacity, min_capacity);
+			max_capacity = max(sgc->max_capacity, max_capacity);
 			group = group->next;
 		} while (group != child->groups);
 	}
 
 	sdg->sgc->capacity = capacity;
 	sdg->sgc->min_capacity = min_capacity;
+	sdg->sgc->max_capacity = max_capacity;
 }
 
 /*
@@ -7863,16 +8738,27 @@
 }
 
 /*
- * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
  * per-CPU capacity than sched_group ref.
  */
 static inline bool
-group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
 {
 	return sg->sgc->min_capacity * capacity_margin <
 						ref->sgc->min_capacity * 1024;
 }
 
+/*
+ * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-CPU capacity_orig than sched_group ref.
+ */
+static inline bool
+group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+	return sg->sgc->max_capacity * capacity_margin <
+						ref->sgc->max_capacity * 1024;
+}
+
 static inline enum
 group_type group_classify(struct sched_group *group,
 			  struct sg_lb_stats *sgs)
@@ -7883,6 +8769,9 @@
 	if (sg_imbalanced(group))
 		return group_imbalanced;
 
+	if (sgs->group_misfit_task_load)
+		return group_misfit_task;
+
 	return group_other;
 }
 
@@ -7912,16 +8801,16 @@
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
  * @group: sched_group whose statistics are to be updated.
- * @load_idx: Load index of sched_domain of this_cpu for load calc.
- * @local_group: Does group contain this_cpu.
  * @sgs: variable to hold the statistics for this group.
- * @overload: Indicate more than one runnable task for any CPU.
+ * @sg_status: Holds flag indicating the status of the sched_group
  */
 static inline void update_sg_lb_stats(struct lb_env *env,
-			struct sched_group *group, int load_idx,
-			int local_group, struct sg_lb_stats *sgs,
-			bool *overload)
+				      struct sched_group *group,
+				      struct sg_lb_stats *sgs,
+				      int *sg_status)
 {
+	int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
+	int load_idx = get_sd_load_idx(env->sd, env->idle);
 	unsigned long load;
 	int i, nr_running;
 
@@ -7945,7 +8834,10 @@
 
 		nr_running = rq->nr_running;
 		if (nr_running > 1)
-			*overload = true;
+			*sg_status |= SG_OVERLOAD;
+
+		if (cpu_overutilized(i))
+			*sg_status |= SG_OVERUTILIZED;
 
 #ifdef CONFIG_NUMA_BALANCING
 		sgs->nr_numa_running += rq->nr_numa_running;
@@ -7957,6 +8849,12 @@
 		 */
 		if (!nr_running && idle_cpu(i))
 			sgs->idle_cpus++;
+
+		if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
+		    sgs->group_misfit_task_load < rq->misfit_task_load) {
+			sgs->group_misfit_task_load = rq->misfit_task_load;
+			*sg_status |= SG_OVERLOAD;
+		}
 	}
 
 	/* Adjust by relative CPU capacity of the group */
@@ -7992,6 +8890,17 @@
 {
 	struct sg_lb_stats *busiest = &sds->busiest_stat;
 
+	/*
+	 * Don't try to pull misfit tasks we can't help.
+	 * We can use max_capacity here as reduction in capacity on some
+	 * CPUs in the group should either be possible to resolve
+	 * internally or be covered by avg_load imbalance (eventually).
+	 */
+	if (sgs->group_type == group_misfit_task &&
+	    (!group_smaller_max_cpu_capacity(sg, sds->local) ||
+	     !group_has_capacity(env, &sds->local_stat)))
+		return false;
+
 	if (sgs->group_type > busiest->group_type)
 		return true;
 
@@ -8011,7 +8920,14 @@
 	 * power/energy consequences are not considered.
 	 */
 	if (sgs->sum_nr_running <= sgs->group_weight &&
-	    group_smaller_cpu_capacity(sds->local, sg))
+	    group_smaller_min_cpu_capacity(sds->local, sg))
+		return false;
+
+	/*
+	 * If we have more than one misfit sg go with the biggest misfit.
+	 */
+	if (sgs->group_type == group_misfit_task &&
+	    sgs->group_misfit_task_load < busiest->group_misfit_task_load)
 		return false;
 
 asym_packing:
@@ -8082,19 +8998,14 @@
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
 	struct sg_lb_stats tmp_sgs;
-	int load_idx, prefer_sibling = 0;
-	bool overload = false;
-
-	if (child && child->flags & SD_PREFER_SIBLING)
-		prefer_sibling = 1;
+	bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
+	int sg_status = 0;
 
 #ifdef CONFIG_NO_HZ_COMMON
 	if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
 		env->flags |= LBF_NOHZ_STATS;
 #endif
 
-	load_idx = get_sd_load_idx(env->sd, env->idle);
-
 	do {
 		struct sg_lb_stats *sgs = &tmp_sgs;
 		int local_group;
@@ -8109,8 +9020,7 @@
 				update_group_capacity(env->sd, env->dst_cpu);
 		}
 
-		update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
-						&overload);
+		update_sg_lb_stats(env, sg, sgs, &sg_status);
 
 		if (local_group)
 			goto next_group;
@@ -8158,11 +9068,22 @@
 	if (env->sd->flags & SD_NUMA)
 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 
+	env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
+
 	if (!env->sd->parent) {
+		struct root_domain *rd = env->dst_rq->rd;
+
 		/* update overload indicator if we are at root domain */
-		if (env->dst_rq->rd->overload != overload)
-			env->dst_rq->rd->overload = overload;
+		WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
+
+		/* Update over-utilization (tipping point, U >= 0) indicator */
+		WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
+		trace_sched_overutilized(!!(sg_status & SG_OVERUTILIZED));
+	} else if (sg_status & SG_OVERUTILIZED) {
+		WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED);
+		trace_sched_overutilized(1);
 	}
+
 }
 
 /**
@@ -8278,7 +9199,22 @@
 	capa_move /= SCHED_CAPACITY_SCALE;
 
 	/* Move if we gain throughput */
-	if (capa_move > capa_now)
+	if (capa_move > capa_now) {
+		env->imbalance = busiest->load_per_task;
+		return;
+	}
+
+	/* We can't see throughput improvement with the load-based
+	 * method, but it is possible depending upon group size and
+	 * capacity range that there might still be an underutilized
+	 * cpu available in an asymmetric capacity system. Do one last
+	 * check just in case.
+	 */
+	if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
+		busiest->group_type == group_overloaded &&
+		busiest->sum_nr_running > busiest->group_weight &&
+		local->sum_nr_running < local->group_weight &&
+		local->group_capacity < busiest->group_capacity)
 		env->imbalance = busiest->load_per_task;
 }
 
@@ -8311,8 +9247,9 @@
 	 * factors in sg capacity and sgs with smaller group_type are
 	 * skipped when updating the busiest sg:
 	 */
-	if (busiest->avg_load <= sds->avg_load ||
-	    local->avg_load >= sds->avg_load) {
+	if (busiest->group_type != group_misfit_task &&
+	    (busiest->avg_load <= sds->avg_load ||
+	     local->avg_load >= sds->avg_load)) {
 		env->imbalance = 0;
 		return fix_small_imbalance(env, sds);
 	}
@@ -8346,6 +9283,22 @@
 		(sds->avg_load - local->avg_load) * local->group_capacity
 	) / SCHED_CAPACITY_SCALE;
 
+	/* Boost imbalance to allow misfit task to be balanced.
+	 * Always do this if we are doing a NEWLY_IDLE balance
+	 * on the assumption that any tasks we have must not be
+	 * long-running (and hence we cannot rely upon load).
+	 * However if we are not idle, we should assume the tasks
+	 * we have are longer running and not override load-based
+	 * calculations above unless we are sure that the local
+	 * group is underutilized.
+	 */
+	if (busiest->group_type == group_misfit_task &&
+		(env->idle == CPU_NEWLY_IDLE ||
+		local->sum_nr_running < local->group_weight)) {
+		env->imbalance = max_t(long, env->imbalance,
+				       busiest->group_misfit_task_load);
+	}
+
 	/*
 	 * if *imbalance is less than the average load per runnable task
 	 * there is no guarantee that any tasks will be moved so we'll have
@@ -8381,6 +9334,14 @@
 	 * this level.
 	 */
 	update_sd_lb_stats(env, &sds);
+
+	if (static_branch_unlikely(&sched_energy_present)) {
+		struct root_domain *rd = env->dst_rq->rd;
+
+		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
+			goto out_balanced;
+	}
+
 	local = &sds.local_stat;
 	busiest = &sds.busiest_stat;
 
@@ -8412,6 +9373,10 @@
 	    busiest->group_no_capacity)
 		goto force_balance;
 
+	/* Misfit tasks should be dealt with regardless of the avg load */
+	if (busiest->group_type == group_misfit_task)
+		goto force_balance;
+
 	/*
 	 * If the local group is busier than the selected busiest group
 	 * don't try and pull any tasks.
@@ -8449,6 +9414,7 @@
 
 force_balance:
 	/* Looks like there is an imbalance. Compute it */
+	env->src_grp_type = busiest->group_type;
 	calculate_imbalance(env, &sds);
 	return env->imbalance ? sds.busiest : NULL;
 
@@ -8496,8 +9462,32 @@
 		if (rt > env->fbq_type)
 			continue;
 
+		/*
+		 * For ASYM_CPUCAPACITY domains with misfit tasks we simply
+		 * seek the "biggest" misfit task.
+		 */
+		if (env->src_grp_type == group_misfit_task) {
+			if (rq->misfit_task_load > busiest_load) {
+				busiest_load = rq->misfit_task_load;
+				busiest = rq;
+			}
+
+			continue;
+		}
+
 		capacity = capacity_of(i);
 
+		/*
+		 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
+		 * eventually lead to active_balancing high->low capacity.
+		 * Higher per-CPU capacity is considered better than balancing
+		 * average load.
+		 */
+		if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
+		    capacity_of(env->dst_cpu) < capacity &&
+		    rq->nr_running == 1)
+			continue;
+
 		wl = weighted_cpuload(rq);
 
 		/*
@@ -8565,6 +9555,20 @@
 			return 1;
 	}
 
+	if (env->src_grp_type == group_misfit_task)
+		return 1;
+
+	if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
+				env->src_rq->cfs.h_nr_running == 1 &&
+				cpu_overutilized(env->src_cpu) &&
+				!cpu_overutilized(env->dst_cpu)) {
+		return 1;
+	}
+
+	if (env->src_grp_type == group_overloaded && env->src_rq->misfit_task_load)
+		return 1;
+
+
 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }
 
@@ -8783,7 +9787,8 @@
 		 * excessive cache_hot migrations and active balances.
 		 */
 		if (idle != CPU_NEWLY_IDLE)
-			sd->nr_balance_failed++;
+			if (env.src_grp_nr_running > 1)
+				sd->nr_balance_failed++;
 
 		if (need_active_balance(&env)) {
 			unsigned long flags;
@@ -9221,7 +10226,7 @@
 	if (time_before(now, nohz.next_balance))
 		goto out;
 
-	if (rq->nr_running >= 2) {
+	if (rq->nr_running >= 2 || rq->misfit_task_load) {
 		flags = NOHZ_KICK_MASK;
 		goto out;
 	}
@@ -9250,7 +10255,7 @@
 		}
 	}
 
-	sd = rcu_dereference(per_cpu(sd_asym, cpu));
+	sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
 	if (sd) {
 		for_each_cpu(i, sched_domain_span(sd)) {
 			if (i == cpu ||
@@ -9590,7 +10595,7 @@
 	rq_unpin_lock(this_rq, rf);
 
 	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
-	    !this_rq->rd->overload) {
+	    !READ_ONCE(this_rq->rd->overload)) {
 
 		rcu_read_lock();
 		sd = rcu_dereference_check_sched_domain(this_rq->sd);
@@ -9752,6 +10757,9 @@
 
 	if (static_branch_unlikely(&sched_numa_balancing))
 		task_tick_numa(rq, curr);
+
+	update_misfit_status(curr, rq);
+	update_overutilized_status(task_rq(curr));
 }
 
 /*
@@ -10256,10 +11264,10 @@
 #ifdef CONFIG_SCHED_DEBUG
 void print_cfs_stats(struct seq_file *m, int cpu)
 {
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 
 	rcu_read_lock();
-	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
 		print_cfs_rq(m, cpu, cfs_rq);
 	rcu_read_unlock();
 }

diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 85ae848..50bdfd7 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h

@@ -90,3 +90,33 @@
  * UtilEstimation. Use estimated CPU utilization.
  */
 SCHED_FEAT(UTIL_EST, true)
+
+/*
+ * Fast pre-selection of CPU candidates for EAS.
+ */
+SCHED_FEAT(FIND_BEST_TARGET, true)
+
+/*
+ * Energy aware scheduling algorithm choices:
+ * EAS_PREFER_IDLE
+ *   Direct tasks in a schedtune.prefer_idle=1 group through
+ *   the EAS path for wakeup task placement. Otherwise, put
+ *   those tasks through the mainline slow path.
+ */
+SCHED_FEAT(EAS_PREFER_IDLE, true)
+
+/*
+ * Request max frequency from schedutil whenever a RT task is running.
+ */
+SCHED_FEAT(SUGOV_RT_MAX_FREQ, false)
+
+/*
+ * Apply schedtune boost hold to tasks of all sched classes.
+ * If enabled, schedtune will hold the boost applied to a CPU
+ * for 50ms regardless of task activation - if the task is
+ * still running 50ms later, the boost hold expires and schedtune
+ * boost will expire immediately the task stops.
+ * If disabled, this behaviour will only apply to tasks of the
+ * RT class.
+ */
+SCHED_FEAT(SCHEDTUNE_BOOST_HOLD_ALL, false)

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 44a1736..8a88061 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c

@@ -16,9 +16,10 @@
  * sched_idle_set_state - Record idle state for the current CPU.
  * @idle_state: State to record.
  */
-void sched_idle_set_state(struct cpuidle_state *idle_state)
+void sched_idle_set_state(struct cpuidle_state *idle_state, int index)
 {
 	idle_set_state(this_rq(), idle_state);
+	idle_set_state_idx(this_rq(), index);
 }
 
 static int __read_mostly cpu_idle_force_poll;
@@ -375,7 +376,8 @@
 
 #ifdef CONFIG_SMP
 static int
-select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags,
+		    int sibling_count_hint)
 {
 	return task_cpu(p); /* IDLE tasks as never migrated */
 }

diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 48a1264..19ba404 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c

@@ -26,9 +26,10 @@
 
 #include <linux/sched.h>
 #include "sched.h"
-#include "sched-pelt.h"
 #include "pelt.h"
 
+#include <trace/events/sched.h>
+
 /*
  * Approximate:
  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
@@ -106,16 +107,12 @@
  *                     n=1
  */
 static __always_inline u32
-accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+accumulate_sum(u64 delta, struct sched_avg *sa,
 	       unsigned long load, unsigned long runnable, int running)
 {
-	unsigned long scale_freq, scale_cpu;
 	u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
 	u64 periods;
 
-	scale_freq = arch_scale_freq_capacity(cpu);
-	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
 	delta += sa->period_contrib;
 	periods = delta / 1024; /* A period is 1024us (~1ms) */
 
@@ -137,13 +134,12 @@
 	}
 	sa->period_contrib = delta;
 
-	contrib = cap_scale(contrib, scale_freq);
 	if (load)
 		sa->load_sum += load * contrib;
 	if (runnable)
 		sa->runnable_load_sum += runnable * contrib;
 	if (running)
-		sa->util_sum += contrib * scale_cpu;
+		sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;
 
 	return periods;
 }
@@ -177,7 +173,7 @@
  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
  */
 static __always_inline int
-___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
+___update_load_sum(u64 now, struct sched_avg *sa,
 		  unsigned long load, unsigned long runnable, int running)
 {
 	u64 delta;
@@ -221,7 +217,7 @@
 	 * Step 1: accumulate *_sum since last_update_time. If we haven't
 	 * crossed period boundaries, finish.
 	 */
-	if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
+	if (!accumulate_sum(delta, sa, load, runnable, running))
 		return 0;
 
 	return 1;
@@ -267,43 +263,46 @@
  *   runnable_load_avg = \Sum se->avg.runable_load_avg
  */
 
-int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
+int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
 {
-	if (entity_is_task(se))
-		se->runnable_weight = se->load.weight;
-
-	if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
+	if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
 		___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+
+		trace_sched_load_se(se);
+
 		return 1;
 	}
 
 	return 0;
 }
 
-int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
+int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	if (entity_is_task(se))
-		se->runnable_weight = se->load.weight;
-
-	if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
+	if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq,
 				cfs_rq->curr == se)) {
 
 		___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
 		cfs_se_util_change(&se->avg);
+
+		trace_sched_load_se(se);
+
 		return 1;
 	}
 
 	return 0;
 }
 
-int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
+int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
 {
-	if (___update_load_sum(now, cpu, &cfs_rq->avg,
+	if (___update_load_sum(now, &cfs_rq->avg,
 				scale_load_down(cfs_rq->load.weight),
 				scale_load_down(cfs_rq->runnable_weight),
 				cfs_rq->curr != NULL)) {
 
 		___update_load_avg(&cfs_rq->avg, 1, 1);
+
+		trace_sched_load_cfs_rq(cfs_rq);
+
 		return 1;
 	}
 
@@ -323,12 +322,15 @@
 
 int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
 {
-	if (___update_load_sum(now, rq->cpu, &rq->avg_rt,
+	if (___update_load_sum(now, &rq->avg_rt,
 				running,
 				running,
 				running)) {
 
 		___update_load_avg(&rq->avg_rt, 1, 1);
+
+		trace_sched_load_rt_rq(rq);
+
 		return 1;
 	}
 
@@ -346,7 +348,7 @@
 
 int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
 {
-	if (___update_load_sum(now, rq->cpu, &rq->avg_dl,
+	if (___update_load_sum(now, &rq->avg_dl,
 				running,
 				running,
 				running)) {
@@ -371,22 +373,31 @@
 int update_irq_load_avg(struct rq *rq, u64 running)
 {
 	int ret = 0;
+
+	/*
+	 * We can't use clock_pelt because irq time is not accounted in
+	 * clock_task. Instead we directly scale the running time to
+	 * reflect the real amount of computation
+	 */
+	running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
+	running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
+
 	/*
 	 * We know the time that has been used by interrupt since last update
 	 * but we don't when. Let be pessimistic and assume that interrupt has
 	 * happened just before the update. This is not so far from reality
 	 * because interrupt will most probably wake up task and trig an update
-	 * of rq clock during which the metric si updated.
+	 * of rq clock during which the metric is updated.
 	 * We start to decay with normal context time and then we add the
 	 * interrupt context time.
 	 * We can safely remove running from rq->clock because
 	 * rq->clock += delta with delta >= running
 	 */
-	ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq,
+	ret = ___update_load_sum(rq->clock - running, &rq->avg_irq,
 				0,
 				0,
 				0);
-	ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq,
+	ret += ___update_load_sum(rq->clock, &rq->avg_irq,
 				1,
 				1,
 				1);

diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 7e56b48..7489d5f 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h

@@ -1,8 +1,9 @@
 #ifdef CONFIG_SMP
+#include "sched-pelt.h"
 
-int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se);
-int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se);
-int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
+int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
+int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se);
+int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
 int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
 int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
 
@@ -42,6 +43,101 @@
 	WRITE_ONCE(avg->util_est.enqueued, enqueued);
 }
 
+/*
+ * The clock_pelt scales the time to reflect the effective amount of
+ * computation done during the running delta time but then sync back to
+ * clock_task when rq is idle.
+ *
+ *
+ * absolute time   | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16
+ * @ max capacity  ------******---------------******---------------
+ * @ half capacity ------************---------************---------
+ * clock pelt      | 1| 2|    3|    4| 7| 8| 9|   10|   11|14|15|16
+ *
+ */
+static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
+{
+	if (unlikely(is_idle_task(rq->curr))) {
+		/* The rq is idle, we can sync to clock_task */
+		rq->clock_pelt  = rq_clock_task(rq);
+		return;
+	}
+
+	/*
+	 * When a rq runs at a lower compute capacity, it will need
+	 * more time to do the same amount of work than at max
+	 * capacity. In order to be invariant, we scale the delta to
+	 * reflect how much work has been really done.
+	 * Running longer results in stealing idle time that will
+	 * disturb the load signal compared to max capacity. This
+	 * stolen idle time will be automatically reflected when the
+	 * rq will be idle and the clock will be synced with
+	 * rq_clock_task.
+	 */
+
+	/*
+	 * Scale the elapsed time to reflect the real amount of
+	 * computation
+	 */
+	delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
+	delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
+
+	rq->clock_pelt += delta;
+}
+
+/*
+ * When rq becomes idle, we have to check if it has lost idle time
+ * because it was fully busy. A rq is fully used when the /Sum util_sum
+ * is greater or equal to:
+ * (LOAD_AVG_MAX - 1024 + rq->cfs.avg.period_contrib) << SCHED_CAPACITY_SHIFT;
+ * For optimization and computing rounding purpose, we don't take into account
+ * the position in the current window (period_contrib) and we use the higher
+ * bound of util_sum to decide.
+ */
+static inline void update_idle_rq_clock_pelt(struct rq *rq)
+{
+	u32 divider = ((LOAD_AVG_MAX - 1024) << SCHED_CAPACITY_SHIFT) - LOAD_AVG_MAX;
+	u32 util_sum = rq->cfs.avg.util_sum;
+	util_sum += rq->avg_rt.util_sum;
+	util_sum += rq->avg_dl.util_sum;
+
+	/*
+	 * Reflecting stolen time makes sense only if the idle
+	 * phase would be present at max capacity. As soon as the
+	 * utilization of a rq has reached the maximum value, it is
+	 * considered as an always runnig rq without idle time to
+	 * steal. This potential idle time is considered as lost in
+	 * this case. We keep track of this lost idle time compare to
+	 * rq's clock_task.
+	 */
+	if (util_sum >= divider)
+		rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
+}
+
+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+	lockdep_assert_held(&rq->lock);
+	assert_clock_updated(rq);
+
+	return rq->clock_pelt - rq->lost_idle_time;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
+static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
+{
+	if (unlikely(cfs_rq->throttle_count))
+		return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
+
+	return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
+}
+#else
+static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
+{
+	return rq_clock_pelt(rq_of(cfs_rq));
+}
+#endif
+
 #else
 
 static inline int
@@ -67,6 +163,18 @@
 {
 	return 0;
 }
+
+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+	return rq_clock_task(rq);
+}
+
+static inline void
+update_rq_clock_pelt(struct rq *rq, s64 delta) { }
+
+static inline void
+update_idle_rq_clock_pelt(struct rq *rq) { }
+
 #endif
 
 

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index b980cc9..150cde3 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c

@@ -1329,6 +1329,8 @@
 {
 	struct sched_rt_entity *rt_se = &p->rt;
 
+	schedtune_enqueue_task(p, cpu_of(rq));
+
 	if (flags & ENQUEUE_WAKEUP)
 		rt_se->timeout = 0;
 
@@ -1342,6 +1344,8 @@
 {
 	struct sched_rt_entity *rt_se = &p->rt;
 
+	schedtune_dequeue_task(p, cpu_of(rq));
+
 	update_curr_rt(rq);
 	dequeue_rt_entity(rt_se, flags);
 
@@ -1386,7 +1390,8 @@
 static int find_lowest_rq(struct task_struct *task);
 
 static int
-select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
+		  int sibling_count_hint)
 {
 	struct task_struct *curr;
 	struct rq *rq;
@@ -1584,7 +1589,7 @@
 	 * rt task
 	 */
 	if (rq->curr->sched_class != &rt_sched_class)
-		update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
+		update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
 
 	return p;
 }
@@ -1593,7 +1598,7 @@
 {
 	update_curr_rt(rq);
 
-	update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
+	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
 
 	/*
 	 * The previous task needs to be made eligible for pushing
@@ -2324,7 +2329,7 @@
 	struct sched_rt_entity *rt_se = &p->rt;
 
 	update_curr_rt(rq);
-	update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
+	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
 
 	watchdog(rq, p);
 

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 94bec97..5625c0c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h

@@ -45,6 +45,7 @@
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
 #include <linux/delayacct.h>
+#include <linux/energy_model.h>
 #include <linux/init_task.h>
 #include <linux/kprobes.h>
 #include <linux/kthread.h>
@@ -80,6 +81,8 @@
 # define SCHED_WARN_ON(x)	({ (void)(x), 0; })
 #endif
 
+#include "tune.h"
+
 struct rq;
 struct cpuidle_state;
 
@@ -699,6 +702,22 @@
 	return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
 }
 
+struct perf_domain {
+	struct em_perf_domain *em_pd;
+	struct perf_domain *next;
+	struct rcu_head rcu;
+};
+
+struct max_cpu_capacity {
+	raw_spinlock_t lock;
+	unsigned long val;
+	int cpu;
+};
+
+/* Scheduling group status flags */
+#define SG_OVERLOAD		0x1 /* More than one runnable task on a CPU. */
+#define SG_OVERUTILIZED		0x2 /* One or more CPUs are over-utilized. */
+
 /*
  * We add the notion of a root-domain which will be used to define per-domain
  * variables. Each exclusive cpuset essentially defines an island domain by
@@ -714,8 +733,15 @@
 	cpumask_var_t		span;
 	cpumask_var_t		online;
 
-	/* Indicate more than one runnable task for any CPU */
-	bool			overload;
+	/*
+	 * Indicate pullable load on at least one CPU, e.g:
+	 * - More than one runnable task
+	 * - Running task is misfit
+	 */
+	int			overload;
+
+	/* Indicate one or more cpus over-utilized (tipping point) */
+	int			overutilized;
 
 	/*
 	 * The bit corresponding to a CPU gets set here if such CPU has more
@@ -746,13 +772,21 @@
 	cpumask_var_t		rto_mask;
 	struct cpupri		cpupri;
 
-	unsigned long		max_cpu_capacity;
+	/* Maximum cpu capacity in the system. */
+	struct max_cpu_capacity max_cpu_capacity;
+
+	/*
+	 * NULL-terminated list of performance domains intersecting with the
+	 * CPUs of the rd. Protected by RCU.
+	 */
+	struct perf_domain	*pd;
 };
 
 extern struct root_domain def_root_domain;
 extern struct mutex sched_domains_mutex;
 
 extern void init_defrootdomain(void);
+extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc);
 extern int sched_init_domains(const struct cpumask *cpu_map);
 extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
 extern void sched_get_rd(struct root_domain *rd);
@@ -827,7 +861,10 @@
 
 	unsigned int		clock_update_flags;
 	u64			clock;
-	u64			clock_task;
+	/* Ensure that all clocks are in the same cache line */
+	u64			clock_task ____cacheline_aligned;
+	u64			clock_pelt;
+	unsigned long		lost_idle_time;
 
 	atomic_t		nr_iowait;
 
@@ -842,6 +879,8 @@
 
 	unsigned char		idle_balance;
 
+	unsigned long		misfit_task_load;
+
 	/* For active balancing */
 	int			active_balance;
 	int			push_cpu;
@@ -912,9 +951,26 @@
 #ifdef CONFIG_CPU_IDLE
 	/* Must be inspected within a rcu lock section */
 	struct cpuidle_state	*idle_state;
+	int			idle_state_idx;
 #endif
 };
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* CPU runqueue to which this cfs_rq is attached */
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->rq;
+}
+
+#else
+
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+	return container_of(cfs_rq, struct rq, cfs);
+}
+#endif
+
 static inline int cpu_of(struct rq *rq)
 {
 #ifdef CONFIG_SMP
@@ -1180,7 +1236,9 @@
 DECLARE_PER_CPU(int, sd_llc_id);
 DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
 DECLARE_PER_CPU(struct sched_domain *, sd_numa);
-DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing);
+DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
+extern struct static_key_false sched_asym_cpucapacity;
 
 struct sched_group_capacity {
 	atomic_t		ref;
@@ -1190,6 +1248,7 @@
 	 */
 	unsigned long		capacity;
 	unsigned long		min_capacity;		/* Min per-CPU capacity in group */
+	unsigned long		max_capacity;		/* Max per-CPU capacity in group */
 	unsigned long		next_update;
 	int			imbalance;		/* XXX unrelated to capacity but shared group state */
 
@@ -1355,7 +1414,7 @@
 
 #undef SCHED_FEAT
 
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
+#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
 
 /*
  * To support run-time toggling of sched features, all the translation units
@@ -1375,7 +1434,7 @@
 extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
 
-#else /* !(SCHED_DEBUG && CONFIG_JUMP_LABEL) */
+#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
 
 /*
  * Each translation unit has its own copy of sysctl_sched_features to allow
@@ -1391,7 +1450,7 @@
 
 #define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 
-#endif /* SCHED_DEBUG && CONFIG_JUMP_LABEL */
+#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
 
 extern struct static_key_false sched_numa_balancing;
 extern struct static_key_false sched_schedstats;
@@ -1518,7 +1577,8 @@
 	void (*put_prev_task)(struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
-	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
+	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags,
+			       int subling_count_hint);
 	void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
 
 	void (*task_woken)(struct rq *this_rq, struct task_struct *task);
@@ -1606,6 +1666,17 @@
 
 	return rq->idle_state;
 }
+
+static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx)
+{
+	rq->idle_state_idx = idle_state_idx;
+}
+
+static inline int idle_get_state_idx(struct rq *rq)
+{
+	WARN_ON(!rcu_read_lock_held());
+	return rq->idle_state_idx;
+}
 #else
 static inline void idle_set_state(struct rq *rq,
 				  struct cpuidle_state *idle_state)
@@ -1616,6 +1687,15 @@
 {
 	return NULL;
 }
+
+static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx)
+{
+}
+
+static inline int idle_get_state_idx(struct rq *rq)
+{
+	return -1;
+}
 #endif
 
 extern void schedule_idle(void);
@@ -1689,8 +1769,8 @@
 
 	if (prev_nr < 2 && rq->nr_running >= 2) {
 #ifdef CONFIG_SMP
-		if (!rq->rd->overload)
-			rq->rd->overload = true;
+		if (!READ_ONCE(rq->rd->overload))
+			WRITE_ONCE(rq->rd->overload, 1);
 #endif
 	}
 
@@ -1749,26 +1829,14 @@
 }
 #endif
 
-#ifdef CONFIG_SMP
-#ifndef arch_scale_cpu_capacity
+#ifndef arch_scale_max_freq_capacity
+struct sched_domain;
 static __always_inline
-unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
-{
-	if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
-		return sd->smt_gain / sd->span_weight;
-
-	return SCHED_CAPACITY_SCALE;
-}
-#endif
-#else
-#ifndef arch_scale_cpu_capacity
-static __always_inline
-unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
+unsigned long arch_scale_max_freq_capacity(struct sched_domain *sd, int cpu)
 {
 	return SCHED_CAPACITY_SCALE;
 }
 #endif
-#endif
 
 struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
 	__acquires(rq->lock);
@@ -2181,7 +2249,46 @@
 # define arch_scale_freq_invariant()	false
 #endif
 
+#ifdef CONFIG_SMP
+static inline unsigned long capacity_orig_of(int cpu)
+{
+	return cpu_rq(cpu)->cpu_capacity_orig;
+}
+#endif
+
 #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+/**
+ * enum schedutil_type - CPU utilization type
+ * @FREQUENCY_UTIL:	Utilization used to select frequency
+ * @ENERGY_UTIL:	Utilization used during energy calculation
+ *
+ * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time
+ * need to be aggregated differently depending on the usage made of them. This
+ * enum is used within schedutil_freq_util() to differentiate the types of
+ * utilization expected by the callers, and adjust the aggregation accordingly.
+ */
+enum schedutil_type {
+	FREQUENCY_UTIL,
+	ENERGY_UTIL,
+};
+
+unsigned long schedutil_freq_util(int cpu, unsigned long util,
+			          unsigned long max, enum schedutil_type type);
+
+static inline unsigned long schedutil_energy_util(int cpu, unsigned long util)
+{
+	unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
+
+	return schedutil_freq_util(cpu, util, max, ENERGY_UTIL);
+}
+#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+static inline unsigned long schedutil_energy_util(int cpu, unsigned long util)
+{
+	return util;
+}
+#endif
+
+#ifdef CONFIG_SMP
 static inline unsigned long cpu_bw_dl(struct rq *rq)
 {
 	return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
@@ -2237,3 +2344,13 @@
 	return util;
 }
 #endif
+
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus)))
+#else
+#define perf_domain_span(pd) NULL
+#endif
+
+#ifdef CONFIG_SMP
+extern struct static_key_false sched_energy_present;
+#endif

diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index c183b79..6446d61 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c

@@ -11,7 +11,8 @@
 
 #ifdef CONFIG_SMP
 static int
-select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags,
+		    int sibling_count_hint)
 {
 	return task_cpu(p); /* stop tasks as never migrate */
 }

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 74b6943..7bc2cdd 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c

@@ -201,6 +201,199 @@
 	return 1;
 }
 
+DEFINE_STATIC_KEY_FALSE(sched_energy_present);
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+DEFINE_MUTEX(sched_energy_mutex);
+bool sched_energy_update;
+
+static void free_pd(struct perf_domain *pd)
+{
+	struct perf_domain *tmp;
+
+	while (pd) {
+		tmp = pd->next;
+		kfree(pd);
+		pd = tmp;
+	}
+}
+
+static struct perf_domain *find_pd(struct perf_domain *pd, int cpu)
+{
+	while (pd) {
+		if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
+			return pd;
+		pd = pd->next;
+	}
+
+	return NULL;
+}
+
+static struct perf_domain *pd_init(int cpu)
+{
+	struct em_perf_domain *obj = em_cpu_get(cpu);
+	struct perf_domain *pd;
+
+	if (!obj) {
+		if (sched_debug())
+			pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
+		return NULL;
+	}
+
+	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+	if (!pd)
+		return NULL;
+	pd->em_pd = obj;
+
+	return pd;
+}
+
+static void perf_domain_debug(const struct cpumask *cpu_map,
+						struct perf_domain *pd)
+{
+	if (!sched_debug() || !pd)
+		return;
+
+	printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
+
+	while (pd) {
+		printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
+				cpumask_first(perf_domain_span(pd)),
+				cpumask_pr_args(perf_domain_span(pd)),
+				em_pd_nr_cap_states(pd->em_pd));
+		pd = pd->next;
+	}
+
+	printk(KERN_CONT "\n");
+}
+
+static void destroy_perf_domain_rcu(struct rcu_head *rp)
+{
+	struct perf_domain *pd;
+
+	pd = container_of(rp, struct perf_domain, rcu);
+	free_pd(pd);
+}
+
+static void sched_energy_set(bool has_eas)
+{
+	if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
+		if (sched_debug())
+			pr_info("%s: stopping EAS\n", __func__);
+		static_branch_disable_cpuslocked(&sched_energy_present);
+	} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
+		if (sched_debug())
+			pr_info("%s: starting EAS\n", __func__);
+		static_branch_enable_cpuslocked(&sched_energy_present);
+	}
+}
+
+/*
+ * EAS can be used on a root domain if it meets all the following conditions:
+ *    1. an Energy Model (EM) is available;
+ *    2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
+ *    3. the EM complexity is low enough to keep scheduling overheads low;
+ *    4. schedutil is driving the frequency of all CPUs of the rd;
+ *
+ * The complexity of the Energy Model is defined as:
+ *
+ *              C = nr_pd * (nr_cpus + nr_cs)
+ *
+ * with parameters defined as:
+ *  - nr_pd:    the number of performance domains
+ *  - nr_cpus:  the number of CPUs
+ *  - nr_cs:    the sum of the number of capacity states of all performance
+ *              domains (for example, on a system with 2 performance domains,
+ *              with 10 capacity states each, nr_cs = 2 * 10 = 20).
+ *
+ * It is generally not a good idea to use such a model in the wake-up path on
+ * very complex platforms because of the associated scheduling overheads. The
+ * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
+ * with per-CPU DVFS and less than 8 capacity states each, for example.
+ */
+#define EM_MAX_COMPLEXITY 2048
+
+extern struct cpufreq_governor schedutil_gov;
+static bool build_perf_domains(const struct cpumask *cpu_map)
+{
+	int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
+	struct perf_domain *pd = NULL, *tmp;
+	int cpu = cpumask_first(cpu_map);
+	struct root_domain *rd = cpu_rq(cpu)->rd;
+	struct cpufreq_policy *policy;
+	struct cpufreq_governor *gov;
+
+	/* EAS is enabled for asymmetric CPU capacity topologies. */
+	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
+		if (sched_debug()) {
+			pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
+					cpumask_pr_args(cpu_map));
+		}
+		goto free;
+	}
+
+	for_each_cpu(i, cpu_map) {
+		/* Skip already covered CPUs. */
+		if (find_pd(pd, i))
+			continue;
+
+		/* Do not attempt EAS if schedutil is not being used. */
+		policy = cpufreq_cpu_get(i);
+		if (!policy)
+			goto free;
+		gov = policy->governor;
+		cpufreq_cpu_put(policy);
+		if (gov != &schedutil_gov) {
+			if (rd->pd)
+				pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
+						cpumask_pr_args(cpu_map));
+			goto free;
+		}
+
+		/* Create the new pd and add it to the local list. */
+		tmp = pd_init(i);
+		if (!tmp)
+			goto free;
+		tmp->next = pd;
+		pd = tmp;
+
+		/*
+		 * Count performance domains and capacity states for the
+		 * complexity check.
+		 */
+		nr_pd++;
+		nr_cs += em_pd_nr_cap_states(pd->em_pd);
+	}
+
+	/* Bail out if the Energy Model complexity is too high. */
+	if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) {
+		WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
+						cpumask_pr_args(cpu_map));
+		goto free;
+	}
+
+	perf_domain_debug(cpu_map, pd);
+
+	/* Attach the new list of performance domains to the root domain. */
+	tmp = rd->pd;
+	rcu_assign_pointer(rd->pd, pd);
+	if (tmp)
+		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
+
+	return !!pd;
+
+free:
+	free_pd(pd);
+	tmp = rd->pd;
+	rcu_assign_pointer(rd->pd, NULL);
+	if (tmp)
+		call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
+
+	return false;
+}
+#else
+static void free_pd(struct perf_domain *pd) { }
+#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/
+
 static void free_rootdomain(struct rcu_head *rcu)
 {
 	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
@@ -211,6 +404,7 @@
 	free_cpumask_var(rd->rto_mask);
 	free_cpumask_var(rd->online);
 	free_cpumask_var(rd->span);
+	free_pd(rd->pd);
 	kfree(rd);
 }
 
@@ -287,6 +481,9 @@
 
 	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_cpudl;
+
+	init_max_cpu_capacity(&rd->max_cpu_capacity);
+
 	return 0;
 
 free_cpudl:
@@ -397,7 +594,9 @@
 DEFINE_PER_CPU(int, sd_llc_id);
 DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym);
+DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing);
+DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
+DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -422,7 +621,10 @@
 	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
 
 	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
-	rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
+	rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
+
+	sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY);
+	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
 }
 
 /*
@@ -692,6 +894,7 @@
 	sg_span = sched_group_span(sg);
 	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
 	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
 }
 
 static int
@@ -851,6 +1054,7 @@
 
 	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
 	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
 
 	return sg;
 }
@@ -1061,7 +1265,6 @@
  *   SD_SHARE_PKG_RESOURCES - describes shared caches
  *   SD_NUMA                - describes NUMA topologies
  *   SD_SHARE_POWERDOMAIN   - describes shared power domain
- *   SD_ASYM_CPUCAPACITY    - describes mixed capacity topologies
  *
  * Odd one out, which beside describing the topology has a quirk also
  * prescribes the desired behaviour that goes along with it:
@@ -1073,13 +1276,12 @@
 	 SD_SHARE_PKG_RESOURCES |	\
 	 SD_NUMA		|	\
 	 SD_ASYM_PACKING	|	\
-	 SD_ASYM_CPUCAPACITY	|	\
 	 SD_SHARE_POWERDOMAIN)
 
 static struct sched_domain *
 sd_init(struct sched_domain_topology_level *tl,
 	const struct cpumask *cpu_map,
-	struct sched_domain *child, int cpu)
+	struct sched_domain *child, int dflags, int cpu)
 {
 	struct sd_data *sdd = &tl->data;
 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
@@ -1100,6 +1302,9 @@
 			"wrong sd_flags in topology description\n"))
 		sd_flags &= ~TOPOLOGY_SD_FLAGS;
 
+	/* Apply detected topology flags */
+	sd_flags |= dflags;
+
 	*sd = (struct sched_domain){
 		.min_interval		= sd_weight,
 		.max_interval		= 2*sd_weight,
@@ -1122,7 +1327,7 @@
 					| 0*SD_SHARE_CPUCAPACITY
 					| 0*SD_SHARE_PKG_RESOURCES
 					| 0*SD_SERIALIZE
-					| 0*SD_PREFER_SIBLING
+					| 1*SD_PREFER_SIBLING
 					| 0*SD_NUMA
 					| sd_flags
 					,
@@ -1148,17 +1353,21 @@
 	if (sd->flags & SD_ASYM_CPUCAPACITY) {
 		struct sched_domain *t = sd;
 
+		/*
+		 * Don't attempt to spread across CPUs of different capacities.
+		 */
+		if (sd->child)
+			sd->child->flags &= ~SD_PREFER_SIBLING;
+
 		for_each_lower_domain(t)
 			t->flags |= SD_BALANCE_WAKE;
 	}
 
 	if (sd->flags & SD_SHARE_CPUCAPACITY) {
-		sd->flags |= SD_PREFER_SIBLING;
 		sd->imbalance_pct = 110;
 		sd->smt_gain = 1178; /* ~15% */
 
 	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
-		sd->flags |= SD_PREFER_SIBLING;
 		sd->imbalance_pct = 117;
 		sd->cache_nice_tries = 1;
 		sd->busy_idx = 2;
@@ -1169,6 +1378,7 @@
 		sd->busy_idx = 3;
 		sd->idle_idx = 2;
 
+		sd->flags &= ~SD_PREFER_SIBLING;
 		sd->flags |= SD_SERIALIZE;
 		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
 			sd->flags &= ~(SD_BALANCE_EXEC |
@@ -1178,7 +1388,6 @@
 
 #endif
 	} else {
-		sd->flags |= SD_PREFER_SIBLING;
 		sd->cache_nice_tries = 1;
 		sd->busy_idx = 2;
 		sd->idle_idx = 1;
@@ -1604,9 +1813,9 @@
 
 static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
-		struct sched_domain *child, int cpu)
+		struct sched_domain *child, int dflags, int cpu)
 {
-	struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
+	struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu);
 
 	if (child) {
 		sd->level = child->level + 1;
@@ -1633,6 +1842,65 @@
 }
 
 /*
+ * Find the sched_domain_topology_level where all CPU capacities are visible
+ * for all CPUs.
+ */
+static struct sched_domain_topology_level
+*asym_cpu_capacity_level(const struct cpumask *cpu_map)
+{
+	int i, j, asym_level = 0;
+	bool asym = false;
+	struct sched_domain_topology_level *tl, *asym_tl = NULL;
+	unsigned long cap;
+
+	/* Is there any asymmetry? */
+	cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map));
+
+	for_each_cpu(i, cpu_map) {
+		if (arch_scale_cpu_capacity(NULL, i) != cap) {
+			asym = true;
+			break;
+		}
+	}
+
+	if (!asym)
+		return NULL;
+
+	/*
+	 * Examine topology from all CPU's point of views to detect the lowest
+	 * sched_domain_topology_level where a highest capacity CPU is visible
+	 * to everyone.
+	 */
+	for_each_cpu(i, cpu_map) {
+		unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i);
+		int tl_id = 0;
+
+		for_each_sd_topology(tl) {
+			if (tl_id < asym_level)
+				goto next_level;
+
+			for_each_cpu_and(j, tl->mask(i), cpu_map) {
+				unsigned long capacity;
+
+				capacity = arch_scale_cpu_capacity(NULL, j);
+
+				if (capacity <= max_capacity)
+					continue;
+
+				max_capacity = capacity;
+				asym_level = tl_id;
+				asym_tl = tl;
+			}
+next_level:
+			tl_id++;
+		}
+	}
+
+	return asym_tl;
+}
+
+
+/*
  * Build sched domains for a given set of CPUs and attach the sched domains
  * to the individual CPUs
  */
@@ -1642,20 +1910,31 @@
 	enum s_alloc alloc_state;
 	struct sched_domain *sd;
 	struct s_data d;
-	struct rq *rq = NULL;
 	int i, ret = -ENOMEM;
+	struct sched_domain_topology_level *tl_asym;
+	bool has_asym = false;
 
 	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
 	if (alloc_state != sa_rootdomain)
 		goto error;
 
+	tl_asym = asym_cpu_capacity_level(cpu_map);
+
 	/* Set up domains for CPUs specified by the cpu_map: */
 	for_each_cpu(i, cpu_map) {
 		struct sched_domain_topology_level *tl;
 
 		sd = NULL;
 		for_each_sd_topology(tl) {
-			sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+			int dflags = 0;
+
+			if (tl == tl_asym) {
+				dflags |= SD_ASYM_CPUCAPACITY;
+				has_asym = true;
+			}
+
+			sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
+
 			if (tl == sched_domain_topology)
 				*per_cpu_ptr(d.sd, i) = sd;
 			if (tl->flags & SDTL_OVERLAP)
@@ -1693,21 +1972,13 @@
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {
-		rq = cpu_rq(i);
 		sd = *per_cpu_ptr(d.sd, i);
-
-		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
-		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
-			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
-
 		cpu_attach_domain(sd, d.rd, i);
 	}
 	rcu_read_unlock();
 
-	if (rq && sched_debug_enabled) {
-		pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
-			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
-	}
+	if (has_asym)
+		static_branch_enable_cpuslocked(&sched_asym_cpucapacity);
 
 	ret = 0;
 error:
@@ -1852,6 +2123,7 @@
 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 			     struct sched_domain_attr *dattr_new)
 {
+	bool __maybe_unused has_eas = false;
 	int i, j, n;
 	int new_topology;
 
@@ -1879,8 +2151,8 @@
 	/* Destroy deleted domains: */
 	for (i = 0; i < ndoms_cur; i++) {
 		for (j = 0; j < n && !new_topology; j++) {
-			if (cpumask_equal(doms_cur[i], doms_new[j])
-			    && dattrs_equal(dattr_cur, i, dattr_new, j))
+			if (cpumask_equal(doms_cur[i], doms_new[j]) &&
+			    dattrs_equal(dattr_cur, i, dattr_new, j))
 				goto match1;
 		}
 		/* No match - a current sched domain not in new doms_new[] */
@@ -1900,8 +2172,8 @@
 	/* Build new domains: */
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < n && !new_topology; j++) {
-			if (cpumask_equal(doms_new[i], doms_cur[j])
-			    && dattrs_equal(dattr_new, i, dattr_cur, j))
+			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
+			    dattrs_equal(dattr_new, i, dattr_cur, j))
 				goto match2;
 		}
 		/* No match - add a new doms_new */
@@ -1910,6 +2182,24 @@
 		;
 	}
 
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+	/* Build perf. domains: */
+	for (i = 0; i < ndoms_new; i++) {
+		for (j = 0; j < n && !sched_energy_update; j++) {
+			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
+			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
+				has_eas = true;
+				goto match3;
+			}
+		}
+		/* No match - add perf. domains for a new rd */
+		has_eas |= build_perf_domains(doms_new[i]);
+match3:
+		;
+	}
+	sched_energy_set(has_eas);
+#endif
+
 	/* Remember the new sched domains: */
 	if (doms_cur != &fallback_doms)
 		free_sched_domains(doms_cur, ndoms_cur);

diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
new file mode 100644
index 0000000..6e228f7
--- /dev/null
+++ b/kernel/sched/tune.c

@@ -0,0 +1,686 @@
+#include <linux/cgroup.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+
+#include <trace/events/sched.h>
+
+#include "sched.h"
+
+bool schedtune_initialized = false;
+extern struct reciprocal_value schedtune_spc_rdiv;
+
+/* We hold schedtune boost in effect for at least this long */
+#define SCHEDTUNE_BOOST_HOLD_NS 50000000ULL
+
+/*
+ * EAS scheduler tunables for task groups.
+ *
+ * When CGroup support is enabled, we have to synchronize two different
+ * paths:
+ *  - slow path: where CGroups are created/updated/removed
+ *  - fast path: where tasks in a CGroups are accounted
+ *
+ * The slow path tracks (a limited number of) CGroups and maps each on a
+ * "boost_group" index. The fastpath accounts tasks currently RUNNABLE on each
+ * "boost_group".
+ *
+ * Once a new CGroup is created, a boost group idx is assigned and the
+ * corresponding "boost_group" marked as valid on each CPU.
+ * Once a CGroup is release, the corresponding "boost_group" is marked as
+ * invalid on each CPU. The CPU boost value (boost_max) is aggregated by
+ * considering only valid boost_groups with a non null tasks counter.
+ *
+ * .:: Locking strategy
+ *
+ * The fast path uses a spin lock for each CPU boost_group which protects the
+ * tasks counter.
+ *
+ * The "valid" and "boost" values of each CPU boost_group is instead
+ * protected by the RCU lock provided by the CGroups callbacks. Thus, only the
+ * slow path can access and modify the boost_group attribtues of each CPU.
+ * The fast path will catch up the most updated values at the next scheduling
+ * event (i.e. enqueue/dequeue).
+ *
+ *                                                        |
+ *                                             SLOW PATH  |   FAST PATH
+ *                              CGroup add/update/remove  |   Scheduler enqueue/dequeue events
+ *                                                        |
+ *                                                        |
+ *                                                        |     DEFINE_PER_CPU(struct boost_groups)
+ *                                                        |     +--------------+----+---+----+----+
+ *                                                        |     |  idle        |    |   |    |    |
+ *                                                        |     |  boost_max   |    |   |    |    |
+ *                                                        |  +---->lock        |    |   |    |    |
+ *  struct schedtune                  allocated_groups    |  |  |  group[    ] |    |   |    |    |
+ *  +------------------------------+         +-------+    |  |  +--+---------+-+----+---+----+----+
+ *  | idx                          |         |       |    |  |     |  valid  |
+ *  | boots / prefer_idle          |         |       |    |  |     |  boost  |
+ *  | perf_{boost/constraints}_idx | <---------+(*)  |    |  |     |  tasks  | <------------+
+ *  | css                          |         +-------+    |  |     +---------+              |
+ *  +-+----------------------------+         |       |    |  |     |         |              |
+ *    ^                                      |       |    |  |     |         |              |
+ *    |                                      +-------+    |  |     +---------+              |
+ *    |                                      |       |    |  |     |         |              |
+ *    |                                      |       |    |  |     |         |              |
+ *    |                                      +-------+    |  |     +---------+              |
+ *    | zmalloc                              |       |    |  |     |         |              |
+ *    |                                      |       |    |  |     |         |              |
+ *    |                                      +-------+    |  |     +---------+              |
+ *    +                              BOOSTGROUPS_COUNT    |  |     BOOSTGROUPS_COUNT        |
+ *  schedtune_boostgroup_init()                           |  +                              |
+ *                                                        |  schedtune_{en,de}queue_task()  |
+ *                                                        |                                 +
+ *                                                        |          schedtune_tasks_update()
+ *                                                        |
+ */
+
+/* SchdTune tunables for a group of tasks */
+struct schedtune {
+	/* SchedTune CGroup subsystem */
+	struct cgroup_subsys_state css;
+
+	/* Boost group allocated ID */
+	int idx;
+
+	/* Boost value for tasks on that SchedTune CGroup */
+	int boost;
+
+	/* Hint to bias scheduling of tasks on that SchedTune CGroup
+	 * towards idle CPUs */
+	int prefer_idle;
+};
+
+static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
+{
+	return css ? container_of(css, struct schedtune, css) : NULL;
+}
+
+static inline struct schedtune *task_schedtune(struct task_struct *tsk)
+{
+	return css_st(task_css(tsk, schedtune_cgrp_id));
+}
+
+static inline struct schedtune *parent_st(struct schedtune *st)
+{
+	return css_st(st->css.parent);
+}
+
+/*
+ * SchedTune root control group
+ * The root control group is used to defined a system-wide boosting tuning,
+ * which is applied to all tasks in the system.
+ * Task specific boost tuning could be specified by creating and
+ * configuring a child control group under the root one.
+ * By default, system-wide boosting is disabled, i.e. no boosting is applied
+ * to tasks which are not into a child control group.
+ */
+static struct schedtune
+root_schedtune = {
+	.boost	= 0,
+	.prefer_idle = 0,
+};
+
+/*
+ * Maximum number of boost groups to support
+ * When per-task boosting is used we still allow only limited number of
+ * boost groups for two main reasons:
+ * 1. on a real system we usually have only few classes of workloads which
+ *    make sense to boost with different values (e.g. background vs foreground
+ *    tasks, interactive vs low-priority tasks)
+ * 2. a limited number allows for a simpler and more memory/time efficient
+ *    implementation especially for the computation of the per-CPU boost
+ *    value
+ */
+#define BOOSTGROUPS_COUNT 16
+
+/* Array of configured boostgroups */
+static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
+	&root_schedtune,
+	NULL,
+};
+
+/* SchedTune boost groups
+ * Keep track of all the boost groups which impact on CPU, for example when a
+ * CPU has two RUNNABLE tasks belonging to two different boost groups and thus
+ * likely with different boost values.
+ * Since on each system we expect only a limited number of boost groups, here
+ * we use a simple array to keep track of the metrics required to compute the
+ * maximum per-CPU boosting value.
+ */
+struct boost_groups {
+	/* Maximum boost value for all RUNNABLE tasks on a CPU */
+	int boost_max;
+	u64 boost_ts;
+	struct {
+		/* True when this boost group maps an actual cgroup */
+		bool valid;
+		/* The boost for tasks on that boost group */
+		int boost;
+		/* Count of RUNNABLE tasks on that boost group */
+		unsigned tasks;
+		/* Timestamp of boost activation */
+		u64 ts;
+	} group[BOOSTGROUPS_COUNT];
+	/* CPU's boost group locking */
+	raw_spinlock_t lock;
+};
+
+/* Boost groups affecting each CPU in the system */
+DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
+
+static inline bool schedtune_boost_timeout(u64 now, u64 ts)
+{
+	return ((now - ts) > SCHEDTUNE_BOOST_HOLD_NS);
+}
+
+static inline bool
+schedtune_boost_group_active(int idx, struct boost_groups* bg, u64 now)
+{
+	if (bg->group[idx].tasks)
+		return true;
+
+	return !schedtune_boost_timeout(now, bg->group[idx].ts);
+}
+
+static void
+schedtune_cpu_update(int cpu, u64 now)
+{
+	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+	int boost_max;
+	u64 boost_ts;
+	int idx;
+
+	/* The root boost group is always active */
+	boost_max = bg->group[0].boost;
+	boost_ts = now;
+	for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
+
+		/* Ignore non boostgroups not mapping a cgroup */
+		if (!bg->group[idx].valid)
+			continue;
+
+		/*
+		 * A boost group affects a CPU only if it has
+		 * RUNNABLE tasks on that CPU or it has hold
+		 * in effect from a previous task.
+		 */
+		if (!schedtune_boost_group_active(idx, bg, now))
+			continue;
+
+		/* This boost group is active */
+		if (boost_max > bg->group[idx].boost)
+			continue;
+
+		boost_max = bg->group[idx].boost;
+		boost_ts =  bg->group[idx].ts;
+	}
+
+	/* Ensures boost_max is non-negative when all cgroup boost values
+	 * are neagtive. Avoids under-accounting of cpu capacity which may cause
+	 * task stacking and frequency spikes.*/
+	boost_max = max(boost_max, 0);
+	bg->boost_max = boost_max;
+	bg->boost_ts = boost_ts;
+}
+
+static int
+schedtune_boostgroup_update(int idx, int boost)
+{
+	struct boost_groups *bg;
+	int cur_boost_max;
+	int old_boost;
+	int cpu;
+	u64 now;
+
+	/* Update per CPU boost groups */
+	for_each_possible_cpu(cpu) {
+		bg = &per_cpu(cpu_boost_groups, cpu);
+
+		/* CGroups are never associated to non active cgroups */
+		BUG_ON(!bg->group[idx].valid);
+
+		/*
+		 * Keep track of current boost values to compute the per CPU
+		 * maximum only when it has been affected by the new value of
+		 * the updated boost group
+		 */
+		cur_boost_max = bg->boost_max;
+		old_boost = bg->group[idx].boost;
+
+		/* Update the boost value of this boost group */
+		bg->group[idx].boost = boost;
+
+		/* Check if this update increase current max */
+		now = sched_clock_cpu(cpu);
+		if (boost > cur_boost_max &&
+			schedtune_boost_group_active(idx, bg, now)) {
+			bg->boost_max = boost;
+			bg->boost_ts = bg->group[idx].ts;
+
+			trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);
+			continue;
+		}
+
+		/* Check if this update has decreased current max */
+		if (cur_boost_max == old_boost && old_boost > boost) {
+			schedtune_cpu_update(cpu, now);
+			trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);
+			continue;
+		}
+
+		trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max);
+	}
+
+	return 0;
+}
+
+#define ENQUEUE_TASK  1
+#define DEQUEUE_TASK -1
+
+static inline bool
+schedtune_update_timestamp(struct task_struct *p)
+{
+	if (sched_feat(SCHEDTUNE_BOOST_HOLD_ALL))
+		return true;
+
+	return task_has_rt_policy(p);
+}
+
+static inline void
+schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
+{
+	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+	int tasks = bg->group[idx].tasks + task_count;
+
+	/* Update boosted tasks count while avoiding to make it negative */
+	bg->group[idx].tasks = max(0, tasks);
+
+	/* Update timeout on enqueue */
+	if (task_count > 0) {
+		u64 now = sched_clock_cpu(cpu);
+
+		if (schedtune_update_timestamp(p))
+			bg->group[idx].ts = now;
+
+		/* Boost group activation or deactivation on that RQ */
+		if (bg->group[idx].tasks == 1)
+			schedtune_cpu_update(cpu, now);
+	}
+
+	trace_sched_tune_tasks_update(p, cpu, tasks, idx,
+			bg->group[idx].boost, bg->boost_max,
+			bg->group[idx].ts);
+}
+
+/*
+ * NOTE: This function must be called while holding the lock on the CPU RQ
+ */
+void schedtune_enqueue_task(struct task_struct *p, int cpu)
+{
+	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+	unsigned long irq_flags;
+	struct schedtune *st;
+	int idx;
+
+	if (unlikely(!schedtune_initialized))
+		return;
+
+	/*
+	 * Boost group accouting is protected by a per-cpu lock and requires
+	 * interrupt to be disabled to avoid race conditions for example on
+	 * do_exit()::cgroup_exit() and task migration.
+	 */
+	raw_spin_lock_irqsave(&bg->lock, irq_flags);
+	rcu_read_lock();
+
+	st = task_schedtune(p);
+	idx = st->idx;
+
+	schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);
+
+	rcu_read_unlock();
+	raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+int schedtune_can_attach(struct cgroup_taskset *tset)
+{
+	struct task_struct *task;
+	struct cgroup_subsys_state *css;
+	struct boost_groups *bg;
+	struct rq_flags rq_flags;
+	unsigned int cpu;
+	struct rq *rq;
+	int src_bg; /* Source boost group index */
+	int dst_bg; /* Destination boost group index */
+	int tasks;
+	u64 now;
+
+	if (unlikely(!schedtune_initialized))
+		return 0;
+
+
+	cgroup_taskset_for_each(task, css, tset) {
+
+		/*
+		 * Lock the CPU's RQ the task is enqueued to avoid race
+		 * conditions with migration code while the task is being
+		 * accounted
+		 */
+		rq = task_rq_lock(task, &rq_flags);
+
+		if (!task->on_rq) {
+			task_rq_unlock(rq, task, &rq_flags);
+			continue;
+		}
+
+		/*
+		 * Boost group accouting is protected by a per-cpu lock and requires
+		 * interrupt to be disabled to avoid race conditions on...
+		 */
+		cpu = cpu_of(rq);
+		bg = &per_cpu(cpu_boost_groups, cpu);
+		raw_spin_lock(&bg->lock);
+
+		dst_bg = css_st(css)->idx;
+		src_bg = task_schedtune(task)->idx;
+
+		/*
+		 * Current task is not changing boostgroup, which can
+		 * happen when the new hierarchy is in use.
+		 */
+		if (unlikely(dst_bg == src_bg)) {
+			raw_spin_unlock(&bg->lock);
+			task_rq_unlock(rq, task, &rq_flags);
+			continue;
+		}
+
+		/*
+		 * This is the case of a RUNNABLE task which is switching its
+		 * current boost group.
+		 */
+
+		/* Move task from src to dst boost group */
+		tasks = bg->group[src_bg].tasks - 1;
+		bg->group[src_bg].tasks = max(0, tasks);
+		bg->group[dst_bg].tasks += 1;
+
+		/* Update boost hold start for this group */
+		now = sched_clock_cpu(cpu);
+		bg->group[dst_bg].ts = now;
+
+		/* Force boost group re-evaluation at next boost check */
+		bg->boost_ts = now - SCHEDTUNE_BOOST_HOLD_NS;
+
+		raw_spin_unlock(&bg->lock);
+		task_rq_unlock(rq, task, &rq_flags);
+	}
+
+	return 0;
+}
+
+void schedtune_cancel_attach(struct cgroup_taskset *tset)
+{
+	/* This can happen only if SchedTune controller is mounted with
+	 * other hierarchies ane one of them fails. Since usually SchedTune is
+	 * mouted on its own hierarcy, for the time being we do not implement
+	 * a proper rollback mechanism */
+	WARN(1, "SchedTune cancel attach not implemented");
+}
+
+/*
+ * NOTE: This function must be called while holding the lock on the CPU RQ
+ */
+void schedtune_dequeue_task(struct task_struct *p, int cpu)
+{
+	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+	unsigned long irq_flags;
+	struct schedtune *st;
+	int idx;
+
+	if (unlikely(!schedtune_initialized))
+		return;
+
+	/*
+	 * Boost group accouting is protected by a per-cpu lock and requires
+	 * interrupt to be disabled to avoid race conditions on...
+	 */
+	raw_spin_lock_irqsave(&bg->lock, irq_flags);
+	rcu_read_lock();
+
+	st = task_schedtune(p);
+	idx = st->idx;
+
+	schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK);
+
+	rcu_read_unlock();
+	raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+int schedtune_cpu_boost(int cpu)
+{
+	struct boost_groups *bg;
+	u64 now;
+
+	bg = &per_cpu(cpu_boost_groups, cpu);
+	now = sched_clock_cpu(cpu);
+
+	/* Check to see if we have a hold in effect */
+	if (schedtune_boost_timeout(now, bg->boost_ts))
+		schedtune_cpu_update(cpu, now);
+
+	return bg->boost_max;
+}
+
+int schedtune_task_boost(struct task_struct *p)
+{
+	struct schedtune *st;
+	int task_boost;
+
+	if (unlikely(!schedtune_initialized))
+		return 0;
+
+	/* Get task boost value */
+	rcu_read_lock();
+	st = task_schedtune(p);
+	task_boost = st->boost;
+	rcu_read_unlock();
+
+	return task_boost;
+}
+
+int schedtune_prefer_idle(struct task_struct *p)
+{
+	struct schedtune *st;
+	int prefer_idle;
+
+	if (unlikely(!schedtune_initialized))
+		return 0;
+
+	/* Get prefer_idle value */
+	rcu_read_lock();
+	st = task_schedtune(p);
+	prefer_idle = st->prefer_idle;
+	rcu_read_unlock();
+
+	return prefer_idle;
+}
+
+static u64
+prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	struct schedtune *st = css_st(css);
+
+	return st->prefer_idle;
+}
+
+static int
+prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft,
+	    u64 prefer_idle)
+{
+	struct schedtune *st = css_st(css);
+	st->prefer_idle = !!prefer_idle;
+
+	return 0;
+}
+
+static s64
+boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	struct schedtune *st = css_st(css);
+
+	return st->boost;
+}
+
+static int
+boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
+	    s64 boost)
+{
+	struct schedtune *st = css_st(css);
+
+	if (boost < 0 || boost > 100)
+		return -EINVAL;
+
+	st->boost = boost;
+
+	/* Update CPU boost */
+	schedtune_boostgroup_update(st->idx, st->boost);
+
+	return 0;
+}
+
+static struct cftype files[] = {
+	{
+		.name = "boost",
+		.read_s64 = boost_read,
+		.write_s64 = boost_write,
+	},
+	{
+		.name = "prefer_idle",
+		.read_u64 = prefer_idle_read,
+		.write_u64 = prefer_idle_write,
+	},
+	{ }	/* terminate */
+};
+
+static void
+schedtune_boostgroup_init(struct schedtune *st, int idx)
+{
+	struct boost_groups *bg;
+	int cpu;
+
+	/* Initialize per CPUs boost group support */
+	for_each_possible_cpu(cpu) {
+		bg = &per_cpu(cpu_boost_groups, cpu);
+		bg->group[idx].boost = 0;
+		bg->group[idx].valid = true;
+		bg->group[idx].ts = 0;
+	}
+
+	/* Keep track of allocated boost groups */
+	allocated_group[idx] = st;
+	st->idx = idx;
+}
+
+static struct cgroup_subsys_state *
+schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+	struct schedtune *st;
+	int idx;
+
+	if (!parent_css)
+		return &root_schedtune.css;
+
+	/* Allow only a limited number of boosting groups */
+	for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx)
+		if (!allocated_group[idx])
+			break;
+	if (idx == BOOSTGROUPS_COUNT) {
+		pr_err("Trying to create more than %d SchedTune boosting groups\n",
+		       BOOSTGROUPS_COUNT);
+		return ERR_PTR(-ENOSPC);
+	}
+
+	st = kzalloc(sizeof(*st), GFP_KERNEL);
+	if (!st)
+		goto out;
+
+	/* Initialize per CPUs boost group support */
+	schedtune_boostgroup_init(st, idx);
+
+	return &st->css;
+
+out:
+	return ERR_PTR(-ENOMEM);
+}
+
+static void
+schedtune_boostgroup_release(struct schedtune *st)
+{
+	struct boost_groups *bg;
+	int cpu;
+
+	/* Reset per CPUs boost group support */
+	for_each_possible_cpu(cpu) {
+		bg = &per_cpu(cpu_boost_groups, cpu);
+		bg->group[st->idx].valid = false;
+		bg->group[st->idx].boost = 0;
+	}
+
+	/* Keep track of allocated boost groups */
+	allocated_group[st->idx] = NULL;
+}
+
+static void
+schedtune_css_free(struct cgroup_subsys_state *css)
+{
+	struct schedtune *st = css_st(css);
+
+	/* Release per CPUs boost group support */
+	schedtune_boostgroup_release(st);
+	kfree(st);
+}
+
+struct cgroup_subsys schedtune_cgrp_subsys = {
+	.css_alloc	= schedtune_css_alloc,
+	.css_free	= schedtune_css_free,
+	.can_attach     = schedtune_can_attach,
+	.cancel_attach  = schedtune_cancel_attach,
+	.legacy_cftypes	= files,
+	.early_init	= 1,
+};
+
+static inline void
+schedtune_init_cgroups(void)
+{
+	struct boost_groups *bg;
+	int cpu;
+
+	/* Initialize the per CPU boost groups */
+	for_each_possible_cpu(cpu) {
+		bg = &per_cpu(cpu_boost_groups, cpu);
+		memset(bg, 0, sizeof(struct boost_groups));
+		bg->group[0].valid = true;
+		raw_spin_lock_init(&bg->lock);
+	}
+
+	pr_info("schedtune: configured to support %d boost groups\n",
+		BOOSTGROUPS_COUNT);
+
+	schedtune_initialized = true;
+}
+
+/*
+ * Initialize the cgroup structures
+ */
+static int
+schedtune_init(void)
+{
+	schedtune_spc_rdiv = reciprocal_value(100);
+	schedtune_init_cgroups();
+	return 0;
+}
+postcore_initcall(schedtune_init);

diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h
new file mode 100644
index 0000000..821f026
--- /dev/null
+++ b/kernel/sched/tune.h

@@ -0,0 +1,37 @@
+
+#ifdef CONFIG_SCHED_TUNE
+
+#include <linux/reciprocal_div.h>
+
+/*
+ * System energy normalization constants
+ */
+struct target_nrg {
+	unsigned long min_power;
+	unsigned long max_power;
+	struct reciprocal_value rdiv;
+};
+
+int schedtune_cpu_boost(int cpu);
+int schedtune_task_boost(struct task_struct *tsk);
+
+int schedtune_prefer_idle(struct task_struct *tsk);
+
+void schedtune_enqueue_task(struct task_struct *p, int cpu);
+void schedtune_dequeue_task(struct task_struct *p, int cpu);
+
+unsigned long boosted_cpu_util(int cpu, unsigned long other_util);
+
+#else /* CONFIG_SCHED_TUNE */
+
+#define schedtune_cpu_boost(cpu)  0
+#define schedtune_task_boost(tsk) 0
+
+#define schedtune_prefer_idle(tsk) 0
+
+#define schedtune_enqueue_task(task, cpu) do { } while (0)
+#define schedtune_dequeue_task(task, cpu) do { } while (0)
+
+#define boosted_cpu_util(cpu, other_util) cpu_util_cfs(cpu_rq(cpu))
+
+#endif /* CONFIG_SCHED_TUNE */

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 6f58486..1946ac6 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c

@@ -89,7 +89,8 @@
 
 	if (pending & SOFTIRQ_NOW_MASK)
 		return false;
-	return tsk && (tsk->state == TASK_RUNNING);
+	return tsk && (tsk->state == TASK_RUNNING) &&
+		!__kthread_should_park(tsk);
 }
 
 /*

diff --git a/kernel/sys.c b/kernel/sys.c
index 096932a..1d1e673 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c

@@ -42,9 +42,12 @@
 #include <linux/syscore_ops.h>
 #include <linux/version.h>
 #include <linux/ctype.h>
+#include <linux/mm.h>
+#include <linux/mempolicy.h>
 
 #include <linux/compat.h>
 #include <linux/syscalls.h>
+#include <linux/alt-syscall.h>
 #include <linux/kprobes.h>
 #include <linux/user_namespace.h>
 #include <linux/binfmts.h>
@@ -190,7 +193,7 @@
 	return error;
 }
 
-SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
+int ksys_setpriority(int which, int who, int niceval)
 {
 	struct task_struct *g, *p;
 	struct user_struct *user;
@@ -254,13 +257,18 @@
 	return error;
 }
 
+SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
+{
+	return ksys_setpriority(which, who, niceval);
+}
+
 /*
  * Ugh. To avoid negative return values, "getpriority()" will
  * not return the normal nice-value, but a negated value that
  * has been offset by 20 (ie it returns 40..1 instead of -20..19)
  * to stay compatible.
  */
-SYSCALL_DEFINE2(getpriority, int, which, int, who)
+int ksys_getpriority(int which, int who)
 {
 	struct task_struct *g, *p;
 	struct user_struct *user;
@@ -325,6 +333,11 @@
 	return retval;
 }
 
+SYSCALL_DEFINE2(getpriority, int, which, int, who)
+{
+	return ksys_getpriority(which, who);
+}
+
 /*
  * Unprivileged users may change the real gid to the effective gid
  * or vice versa.  (BSD-style)
@@ -2258,8 +2271,155 @@
 	return -EINVAL;
 }
 
-SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
-		unsigned long, arg4, unsigned long, arg5)
+#ifdef CONFIG_MMU
+static int prctl_update_vma_anon_name(struct vm_area_struct *vma,
+		struct vm_area_struct **prev,
+		unsigned long start, unsigned long end,
+		const char __user *name_addr)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	int error = 0;
+	pgoff_t pgoff;
+
+	if (name_addr == vma_get_anon_name(vma)) {
+		*prev = vma;
+		goto out;
+	}
+
+	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+	*prev = vma_merge(mm, *prev, start, end, vma->vm_flags, vma->anon_vma,
+				vma->vm_file, pgoff, vma_policy(vma),
+				vma->vm_userfaultfd_ctx, name_addr);
+	if (*prev) {
+		vma = *prev;
+		goto success;
+	}
+
+	*prev = vma;
+
+	if (start != vma->vm_start) {
+		error = split_vma(mm, vma, start, 1);
+		if (error)
+			goto out;
+	}
+
+	if (end != vma->vm_end) {
+		error = split_vma(mm, vma, end, 0);
+		if (error)
+			goto out;
+	}
+
+success:
+	if (!vma->vm_file)
+		vma->anon_name = name_addr;
+
+out:
+	if (error == -ENOMEM)
+		error = -EAGAIN;
+	return error;
+}
+
+static int prctl_set_vma_anon_name(unsigned long start, unsigned long end,
+			unsigned long arg)
+{
+	unsigned long tmp;
+	struct vm_area_struct *vma, *prev;
+	int unmapped_error = 0;
+	int error = -EINVAL;
+
+	/*
+	 * If the interval [start,end) covers some unmapped address
+	 * ranges, just ignore them, but return -ENOMEM at the end.
+	 * - this matches the handling in madvise.
+	 */
+	vma = find_vma_prev(current->mm, start, &prev);
+	if (vma && start > vma->vm_start)
+		prev = vma;
+
+	for (;;) {
+		/* Still start < end. */
+		error = -ENOMEM;
+		if (!vma)
+			return error;
+
+		/* Here start < (end|vma->vm_end). */
+		if (start < vma->vm_start) {
+			unmapped_error = -ENOMEM;
+			start = vma->vm_start;
+			if (start >= end)
+				return error;
+		}
+
+		/* Here vma->vm_start <= start < (end|vma->vm_end) */
+		tmp = vma->vm_end;
+		if (end < tmp)
+			tmp = end;
+
+		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
+		error = prctl_update_vma_anon_name(vma, &prev, start, tmp,
+				(const char __user *)arg);
+		if (error)
+			return error;
+		start = tmp;
+		if (prev && start < prev->vm_end)
+			start = prev->vm_end;
+		error = unmapped_error;
+		if (start >= end)
+			return error;
+		if (prev)
+			vma = prev->vm_next;
+		else	/* madvise_remove dropped mmap_sem */
+			vma = find_vma(current->mm, start);
+	}
+}
+
+static int prctl_set_vma(unsigned long opt, unsigned long start,
+		unsigned long len_in, unsigned long arg)
+{
+	struct mm_struct *mm = current->mm;
+	int error;
+	unsigned long len;
+	unsigned long end;
+
+	if (start & ~PAGE_MASK)
+		return -EINVAL;
+	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+	/* Check to see whether len was rounded up from small -ve to zero */
+	if (len_in && !len)
+		return -EINVAL;
+
+	end = start + len;
+	if (end < start)
+		return -EINVAL;
+
+	if (end == start)
+		return 0;
+
+	down_write(&mm->mmap_sem);
+
+	switch (opt) {
+	case PR_SET_VMA_ANON_NAME:
+		error = prctl_set_vma_anon_name(start, end, arg);
+		break;
+	default:
+		error = -EINVAL;
+	}
+
+	up_write(&mm->mmap_sem);
+
+	return error;
+}
+#else /* CONFIG_MMU */
+static int prctl_set_vma(unsigned long opt, unsigned long start,
+		unsigned long len_in, unsigned long arg)
+{
+	return -EINVAL;
+}
+#endif
+
+int ksys_prctl(int option, unsigned long arg2, unsigned long arg3,
+	       unsigned long arg4, unsigned long arg5)
 {
 	struct task_struct *me = current;
 	unsigned char comm[sizeof(me->comm)];
@@ -2342,6 +2502,12 @@
 	case PR_SET_SECCOMP:
 		error = prctl_set_seccomp(arg2, (char __user *)arg3);
 		break;
+	case PR_ALT_SYSCALL:
+		if (arg2 == PR_ALT_SYSCALL_SET_SYSCALL_TABLE)
+			error = set_alt_sys_call_table((char __user *)arg3);
+		else
+			error = -EINVAL;
+		break;
 	case PR_GET_TSC:
 		error = GET_TSC_CTL(arg2);
 		break;
@@ -2476,6 +2642,9 @@
 			return -EINVAL;
 		error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
 		break;
+	case PR_SET_VMA:
+		error = prctl_set_vma(arg2, arg3, arg4, arg5);
+		break;
 	default:
 		error = -EINVAL;
 		break;
@@ -2483,8 +2652,14 @@
 	return error;
 }
 
-SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
-		struct getcpu_cache __user *, unused)
+SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
+		unsigned long, arg4, unsigned long, arg5)
+{
+	return ksys_prctl(option, arg2, arg3, arg4, arg5);
+}
+
+int ksys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
+		struct getcpu_cache __user *unused)
 {
 	int err = 0;
 	int cpu = raw_smp_processor_id();
@@ -2496,6 +2671,12 @@
 	return err ? -EFAULT : 0;
 }
 
+SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
+		struct getcpu_cache __user *, unused)
+{
+	return ksys_getcpu(cpup, nodep, unused);
+}
+
 /**
  * do_sysinfo - fill in sysinfo struct
  * @info: pointer to buffer to fill

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4c4fd43..7874979 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c

@@ -323,6 +323,13 @@
 	},
 #ifdef CONFIG_SCHED_DEBUG
 	{
+		.procname       = "sched_cstate_aware",
+		.data           = &sysctl_sched_cstate_aware,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{
 		.procname	= "sched_min_granularity_ns",
 		.data		= &sysctl_sched_min_granularity,
 		.maxlen		= sizeof(unsigned int),
@@ -341,6 +348,13 @@
 		.extra2		= &max_sched_granularity_ns,
 	},
 	{
+		.procname	= "sched_sync_hint_enable",
+		.data		= &sysctl_sched_sync_hint_enable,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		.procname	= "sched_wakeup_granularity_ns",
 		.data		= &sysctl_sched_wakeup_granularity,
 		.maxlen		= sizeof(unsigned int),
@@ -1573,6 +1587,15 @@
 		.mode		= 0644,
 		.proc_handler	= mmap_min_addr_handler,
 	},
+	{
+		.procname	= "mmap_noexec_taint",
+		.data		= &sysctl_mmap_noexec_taint,
+		.maxlen		= sizeof(sysctl_mmap_noexec_taint),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 #endif
 #ifdef CONFIG_NUMA
 	{
@@ -1644,6 +1667,13 @@
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
 	},
+	{
+		.procname	= "min_filelist_kbytes",
+		.data		= &min_filelist_kbytes,
+		.maxlen		= sizeof(min_filelist_kbytes),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
 	{
 		.procname	= "mmap_rnd_bits",
@@ -1666,6 +1696,17 @@
 		.extra2		= (void *)&mmap_rnd_compat_bits_max,
 	},
 #endif
+#ifdef CONFIG_DISK_BASED_SWAP
+	{
+		.procname	= "disk_based_swap",
+		.data		= &sysctl_disk_based_swap,
+		.maxlen		= sizeof(sysctl_disk_based_swap),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
 	{ }
 };
 

diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 5a01c4f..0cae15e 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c

@@ -1068,8 +1068,7 @@
 	return error;
 }
 
-SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
-		struct timex __user *, utx)
+int ksys_clock_adjtime(const clockid_t which_clock, struct timex __user * utx)
 {
 	const struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct timex ktx;
@@ -1091,6 +1090,12 @@
 	return err;
 }
 
+SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
+		struct timex __user *, utx)
+{
+	return ksys_clock_adjtime(which_clock, utx);
+}
+
 SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
 		struct __kernel_timespec __user *, tp)
 {
@@ -1148,8 +1153,7 @@
 
 #ifdef CONFIG_COMPAT
 
-COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
-		       struct compat_timex __user *, utp)
+int compat_ksys_clock_adjtime(clockid_t which_clock, struct compat_timex __user * utp)
 {
 	const struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct timex ktx;
@@ -1172,6 +1176,12 @@
 	return err;
 }
 
+COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
+		       struct compat_timex __user *, utp)
+{
+	return compat_ksys_clock_adjtime(which_clock, utp);
+}
+
 #endif
 
 #ifdef CONFIG_COMPAT_32BIT_TIME

diff --git a/kernel/time/time.c b/kernel/time/time.c
index f7d4fa5..e97e3ff 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c

@@ -266,7 +266,7 @@
 }
 #endif
 
-SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
+int ksys_adjtimex(struct timex __user * txc_p)
 {
 	struct timex txc;		/* Local copy of parameter */
 	int ret;
@@ -281,9 +281,14 @@
 	return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
 }
 
+SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
+{
+	return ksys_adjtimex(txc_p);
+}
+
 #ifdef CONFIG_COMPAT
 
-COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp)
+int compat_ksys_adjtimex(struct compat_timex __user * utp)
 {
 	struct timex txc;
 	int err, ret;
@@ -300,6 +305,12 @@
 
 	return ret;
 }
+
+COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp)
+{
+	return compat_ksys_adjtimex(utp);
+}
+
 #endif
 
 /*

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5379523..0c379cd4 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c

@@ -6525,9 +6525,10 @@
 	struct trace_array *tr = m->private;
 	struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids);
 
-	if (v == FTRACE_NO_PIDS)
+	if (v == FTRACE_NO_PIDS) {
+		(*pos)++;
 		return NULL;
-
+	}
 	return trace_pid_next(pid_list, v, pos);
 }
 

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e61aa1c..8749810 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c

@@ -1549,6 +1549,7 @@
 
 	pr_info("Running postponed tracer tests:\n");
 
+	tracing_selftest_running = true;
 	list_for_each_entry_safe(p, n, &postponed_selftests, list) {
 		ret = run_tracer_selftest(p->type);
 		/* If the test fails, then warn and remove from available_tracers */
@@ -1567,6 +1568,7 @@
 		list_del(&p->list);
 		kfree(p);
 	}
+	tracing_selftest_running = false;
 
  out:
 	mutex_unlock(&trace_types_lock);
@@ -3318,33 +3320,68 @@
 }
 
 static void
+get_total_entries_cpu(struct trace_buffer *buf, unsigned long *total,
+		      unsigned long *entries, int cpu)
+{
+	unsigned long count;
+
+	count = ring_buffer_entries_cpu(buf->buffer, cpu);
+	/*
+	 * If this buffer has skipped entries, then we hold all
+	 * entries for the trace and we need to ignore the
+	 * ones before the time stamp.
+	 */
+	if (per_cpu_ptr(buf->data, cpu)->skipped_entries) {
+		count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;
+		/* total is the same as the entries */
+		*total = count;
+	} else
+		*total = count +
+			ring_buffer_overrun_cpu(buf->buffer, cpu);
+	*entries = count;
+}
+
+static void
 get_total_entries(struct trace_buffer *buf,
 		  unsigned long *total, unsigned long *entries)
 {
-	unsigned long count;
+	unsigned long t, e;
 	int cpu;
 
 	*total = 0;
 	*entries = 0;
 
 	for_each_tracing_cpu(cpu) {
-		count = ring_buffer_entries_cpu(buf->buffer, cpu);
-		/*
-		 * If this buffer has skipped entries, then we hold all
-		 * entries for the trace and we need to ignore the
-		 * ones before the time stamp.
-		 */
-		if (per_cpu_ptr(buf->data, cpu)->skipped_entries) {
-			count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;
-			/* total is the same as the entries */
-			*total += count;
-		} else
-			*total += count +
-				ring_buffer_overrun_cpu(buf->buffer, cpu);
-		*entries += count;
+		get_total_entries_cpu(buf, &t, &e, cpu);
+		*total += t;
+		*entries += e;
 	}
 }
 
+unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu)
+{
+	unsigned long total, entries;
+
+	if (!tr)
+		tr = &global_trace;
+
+	get_total_entries_cpu(&tr->trace_buffer, &total, &entries, cpu);
+
+	return entries;
+}
+
+unsigned long trace_total_entries(struct trace_array *tr)
+{
+	unsigned long total, entries;
+
+	if (!tr)
+		tr = &global_trace;
+
+	get_total_entries(&tr->trace_buffer, &total, &entries);
+
+	return entries;
+}
+
 static void print_lat_help_header(struct seq_file *m)
 {
 	seq_puts(m, "#                  _------=> CPU#            \n"
@@ -4647,7 +4684,7 @@
   "place (kretprobe): [<module>:]<symbol>[+<offset>]|<memaddr>\n"
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
-	"\t    place: <path>:<offset>\n"
+  "   place (uprobe): <path>:<offset>[(ref_ctr_offset)]\n"
 #endif
 	"\t     args: <name>=fetcharg[:type]\n"
 	"\t fetcharg: %<register>, @<address>, @<symbol>[+|-<offset>],\n"

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ee0c6a3..7cdda42 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h

@@ -663,6 +663,9 @@
 
 void tracing_iter_reset(struct trace_iterator *iter, int cpu);
 
+unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu);
+unsigned long trace_total_entries(struct trace_array *tr);
+
 void trace_function(struct trace_array *tr,
 		    unsigned long ip,
 		    unsigned long parent_ip,

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index f5b3bf0..48ee92c 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c

@@ -294,7 +294,8 @@
 #endif /* CONFIG_KPROBE_EVENTS */
 
 #ifdef CONFIG_UPROBE_EVENTS
-int perf_uprobe_init(struct perf_event *p_event, bool is_retprobe)
+int perf_uprobe_init(struct perf_event *p_event,
+		     unsigned long ref_ctr_offset, bool is_retprobe)
 {
 	int ret;
 	char *path = NULL;
@@ -314,8 +315,8 @@
 		goto out;
 	}
 
-	tp_event = create_local_trace_uprobe(
-		path, p_event->attr.probe_offset, is_retprobe);
+	tp_event = create_local_trace_uprobe(path, p_event->attr.probe_offset,
+					     ref_ctr_offset, is_retprobe);
 	if (IS_ERR(tp_event)) {
 		ret = PTR_ERR(tp_event);
 		goto out;

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index b949c39..9be3d1d 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c

@@ -451,8 +451,10 @@
 
 		switch (*next) {
 		case '(':					/* #2 */
-			if (top - op_stack > nr_parens)
-				return ERR_PTR(-EINVAL);
+			if (top - op_stack > nr_parens) {
+				ret = -EINVAL;
+				goto out_free;
+			}
 			*(++top) = invert;
 			continue;
 		case '!':					/* #3 */

diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index b05d1b6..9300e8b 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c

@@ -115,9 +115,10 @@
 {
 	struct trace_event_file *event_file = event_file_data(m->private);
 
-	if (t == SHOW_AVAILABLE_TRIGGERS)
+	if (t == SHOW_AVAILABLE_TRIGGERS) {
+		(*pos)++;
 		return NULL;
-
+	}
 	return seq_list_next(t, &event_file->triggers, pos);
 }
 

diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 5f52668..03b10f3 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h

@@ -412,6 +412,7 @@
 extern void destroy_local_trace_kprobe(struct trace_event_call *event_call);
 
 extern struct trace_event_call *
-create_local_trace_uprobe(char *name, unsigned long offs, bool is_return);
+create_local_trace_uprobe(char *name, unsigned long offs,
+			  unsigned long ref_ctr_offset, bool is_return);
 extern void destroy_local_trace_uprobe(struct trace_event_call *event_call);
 #endif

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 75bf1bc..92b76f9e 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c

@@ -278,18 +278,22 @@
 
 	d_tracing = tracing_init_dentry();
 	if (IS_ERR(d_tracing))
-		return 0;
+		return -ENODEV;
 
 	stat_dir = tracefs_create_dir("trace_stat", d_tracing);
-	if (!stat_dir)
+	if (!stat_dir) {
 		pr_warn("Could not create tracefs 'trace_stat' entry\n");
+		return -ENOMEM;
+	}
 	return 0;
 }
 
 static int init_stat_file(struct stat_session *session)
 {
-	if (!stat_dir && tracing_stat_init())
-		return -ENODEV;
+	int ret;
+
+	if (!stat_dir && (ret = tracing_stat_init()))
+		return ret;
 
 	session->file = tracefs_create_file(session->ts->name, 0644,
 					    stat_dir,
@@ -302,7 +306,7 @@
 int register_stat_tracer(struct tracer_stat *trace)
 {
 	struct stat_session *session, *node;
-	int ret;
+	int ret = -EINVAL;
 
 	if (!trace)
 		return -EINVAL;
@@ -313,17 +317,15 @@
 	/* Already registered? */
 	mutex_lock(&all_stat_sessions_mutex);
 	list_for_each_entry(node, &all_stat_sessions, session_list) {
-		if (node->ts == trace) {
-			mutex_unlock(&all_stat_sessions_mutex);
-			return -EINVAL;
-		}
+		if (node->ts == trace)
+			goto out;
 	}
-	mutex_unlock(&all_stat_sessions_mutex);
 
+	ret = -ENOMEM;
 	/* Init the session */
 	session = kzalloc(sizeof(*session), GFP_KERNEL);
 	if (!session)
-		return -ENOMEM;
+		goto out;
 
 	session->ts = trace;
 	INIT_LIST_HEAD(&session->session_list);
@@ -332,15 +334,16 @@
 	ret = init_stat_file(session);
 	if (ret) {
 		destroy_session(session);
-		return ret;
+		goto out;
 	}
 
+	ret = 0;
 	/* Register */
-	mutex_lock(&all_stat_sessions_mutex);
 	list_add_tail(&session->session_list, &all_stat_sessions);
+ out:
 	mutex_unlock(&all_stat_sessions_mutex);
 
-	return 0;
+	return ret;
 }
 
 void unregister_stat_tracer(struct tracer_stat *trace)

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 0da379b..251865d 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c

@@ -47,6 +47,7 @@
 	struct inode			*inode;
 	char				*filename;
 	unsigned long			offset;
+	unsigned long			ref_ctr_offset;
 	unsigned long			nhit;
 	struct trace_probe		tp;
 };
@@ -359,10 +360,10 @@
 static int create_trace_uprobe(int argc, char **argv)
 {
 	struct trace_uprobe *tu;
-	char *arg, *event, *group, *filename;
+	char *arg, *event, *group, *filename, *rctr, *rctr_end;
 	char buf[MAX_EVENT_NAME_LEN];
 	struct path path;
-	unsigned long offset;
+	unsigned long offset, ref_ctr_offset;
 	bool is_delete, is_return;
 	int i, ret;
 
@@ -371,6 +372,7 @@
 	is_return = false;
 	event = NULL;
 	group = NULL;
+	ref_ctr_offset = 0;
 
 	/* argc must be >= 1 */
 	if (argv[0][0] == '-')
@@ -445,6 +447,26 @@
 		goto fail_address_parse;
 	}
 
+	/* Parse reference counter offset if specified. */
+	rctr = strchr(arg, '(');
+	if (rctr) {
+		rctr_end = strchr(rctr, ')');
+		if (rctr > rctr_end || *(rctr_end + 1) != 0) {
+			ret = -EINVAL;
+			pr_info("Invalid reference counter offset.\n");
+			goto fail_address_parse;
+		}
+
+		*rctr++ = '\0';
+		*rctr_end = '\0';
+		ret = kstrtoul(rctr, 0, &ref_ctr_offset);
+		if (ret) {
+			pr_info("Invalid reference counter offset.\n");
+			goto fail_address_parse;
+		}
+	}
+
+	/* Parse uprobe offset. */
 	ret = kstrtoul(arg, 0, &offset);
 	if (ret)
 		goto fail_address_parse;
@@ -479,6 +501,7 @@
 		goto fail_address_parse;
 	}
 	tu->offset = offset;
+	tu->ref_ctr_offset = ref_ctr_offset;
 	tu->path = path;
 	tu->filename = kstrdup(filename, GFP_KERNEL);
 
@@ -597,6 +620,9 @@
 			trace_event_name(&tu->tp.call), tu->filename,
 			(int)(sizeof(void *) * 2), tu->offset);
 
+	if (tu->ref_ctr_offset)
+		seq_printf(m, "(0x%lx)", tu->ref_ctr_offset);
+
 	for (i = 0; i < tu->tp.nr_args; i++)
 		seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
 
@@ -912,7 +938,13 @@
 
 	tu->consumer.filter = filter;
 	tu->inode = d_real_inode(tu->path.dentry);
-	ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+	if (tu->ref_ctr_offset) {
+		ret = uprobe_register_refctr(tu->inode, tu->offset,
+				tu->ref_ctr_offset, &tu->consumer);
+	} else {
+		ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+	}
+
 	if (ret)
 		goto err_buffer;
 
@@ -1347,7 +1379,8 @@
 
 #ifdef CONFIG_PERF_EVENTS
 struct trace_event_call *
-create_local_trace_uprobe(char *name, unsigned long offs, bool is_return)
+create_local_trace_uprobe(char *name, unsigned long offs,
+			  unsigned long ref_ctr_offset, bool is_return)
 {
 	struct trace_uprobe *tu;
 	struct path path;
@@ -1379,6 +1412,7 @@
 
 	tu->offset = offs;
 	tu->path = path;
+	tu->ref_ctr_offset = ref_ctr_offset;
 	tu->filename = kstrdup(name, GFP_KERNEL);
 	init_trace_event_call(tu, &tu->tp.call);
 

diff --git a/kernel/user.c b/kernel/user.c
index 0df9b16..7f74a8a 100644
--- a/kernel/user.c
+++ b/kernel/user.c

@@ -17,6 +17,7 @@
 #include <linux/interrupt.h>
 #include <linux/export.h>
 #include <linux/user_namespace.h>
+#include <linux/proc_fs.h>
 #include <linux/proc_ns.h>
 
 /*
@@ -208,6 +209,7 @@
 		}
 		spin_unlock_irq(&uidhash_lock);
 	}
+	proc_register_uid(uid);
 
 	return up;
 
@@ -229,6 +231,7 @@
 	spin_lock_irq(&uidhash_lock);
 	uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
 	spin_unlock_irq(&uidhash_lock);
+	proc_register_uid(GLOBAL_ROOT_UID);
 
 	return 0;
 }

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 923414a..4f5b7f3 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c

@@ -1322,6 +1322,7 @@
 	.owner		= userns_owner,
 	.get_parent	= ns_get_owner,
 };
+EXPORT_SYMBOL(userns_operations);
 
 static __init int user_namespaces_init(void)
 {

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index bbc4940..6d60701 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c

@@ -161,6 +161,8 @@
 
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
 
+#define SOFTLOCKUP_RESET	ULONG_MAX
+
 /* Global variables, exported for sysctl */
 unsigned int __read_mostly softlockup_panic =
 			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
@@ -267,7 +269,7 @@
 	 * Preemption can be enabled.  It doesn't matter which CPU's timestamp
 	 * gets zeroed here, so use the raw_ operation.
 	 */
-	raw_cpu_write(watchdog_touch_ts, 0);
+	raw_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET);
 }
 
 notrace void touch_softlockup_watchdog(void)
@@ -291,14 +293,14 @@
 	 * the softlockup check.
 	 */
 	for_each_cpu(cpu, &watchdog_allowed_mask)
-		per_cpu(watchdog_touch_ts, cpu) = 0;
+		per_cpu(watchdog_touch_ts, cpu) = SOFTLOCKUP_RESET;
 	wq_watchdog_touch(-1);
 }
 
 void touch_softlockup_watchdog_sync(void)
 {
 	__this_cpu_write(softlockup_touch_sync, true);
-	__this_cpu_write(watchdog_touch_ts, 0);
+	__this_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET);
 }
 
 static int is_softlockup(unsigned long touch_ts)
@@ -376,7 +378,7 @@
 	/* .. and repeat */
 	hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
 
-	if (touch_ts == 0) {
+	if (touch_ts == SOFTLOCKUP_RESET) {
 		if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
 			/*
 			 * If the time stamp was touched atomically

diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
index dbf2b45..c7c96bc7 100644
--- a/lib/dynamic_debug.c
+++ b/lib/dynamic_debug.c

@@ -188,7 +188,7 @@
 			newflags = (dp->flags & mask) | flags;
 			if (newflags == dp->flags)
 				continue;
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 			if (dp->flags & _DPRINTK_FLAGS_PRINT) {
 				if (!(flags & _DPRINTK_FLAGS_PRINT))
 					static_branch_disable(&dp->key.dd_key_true);

diff --git a/lib/list_sort.c b/lib/list_sort.c
index 8575992..52f0c25 100644
--- a/lib/list_sort.c
+++ b/lib/list_sort.c

@@ -7,33 +7,41 @@
 #include <linux/list_sort.h>
 #include <linux/list.h>
 
-#define MAX_LIST_LENGTH_BITS 20
+typedef int __attribute__((nonnull(2,3))) (*cmp_func)(void *,
+		struct list_head const *, struct list_head const *);
 
 /*
  * Returns a list organized in an intermediate format suited
  * to chaining of merge() calls: null-terminated, no reserved or
  * sentinel head node, "prev" links not maintained.
  */
-static struct list_head *merge(void *priv,
-				int (*cmp)(void *priv, struct list_head *a,
-					struct list_head *b),
+__attribute__((nonnull(2,3,4)))
+static struct list_head *merge(void *priv, cmp_func cmp,
 				struct list_head *a, struct list_head *b)
 {
-	struct list_head head, *tail = &head;
+	struct list_head *head, **tail = &head;
 
-	while (a && b) {
+	for (;;) {
 		/* if equal, take 'a' -- important for sort stability */
-		if ((*cmp)(priv, a, b) <= 0) {
-			tail->next = a;
+		if (cmp(priv, a, b) <= 0) {
+			*tail = a;
+			tail = &a->next;
 			a = a->next;
+			if (!a) {
+				*tail = b;
+				break;
+			}
 		} else {
-			tail->next = b;
+			*tail = b;
+			tail = &b->next;
 			b = b->next;
+			if (!b) {
+				*tail = a;
+				break;
+			}
 		}
-		tail = tail->next;
 	}
-	tail->next = a?:b;
-	return head.next;
+	return head;
 }
 
 /*
@@ -43,44 +51,52 @@
  * prev-link restoration pass, or maintaining the prev links
  * throughout.
  */
-static void merge_and_restore_back_links(void *priv,
-				int (*cmp)(void *priv, struct list_head *a,
-					struct list_head *b),
-				struct list_head *head,
-				struct list_head *a, struct list_head *b)
+__attribute__((nonnull(2,3,4,5)))
+static void merge_final(void *priv, cmp_func cmp, struct list_head *head,
+			struct list_head *a, struct list_head *b)
 {
 	struct list_head *tail = head;
 	u8 count = 0;
 
-	while (a && b) {
+	for (;;) {
 		/* if equal, take 'a' -- important for sort stability */
-		if ((*cmp)(priv, a, b) <= 0) {
+		if (cmp(priv, a, b) <= 0) {
 			tail->next = a;
 			a->prev = tail;
+			tail = a;
 			a = a->next;
+			if (!a)
+				break;
 		} else {
 			tail->next = b;
 			b->prev = tail;
+			tail = b;
 			b = b->next;
+			if (!b) {
+				b = a;
+				break;
+			}
 		}
-		tail = tail->next;
 	}
-	tail->next = a ? : b;
 
+	/* Finish linking remainder of list b on to tail */
+	tail->next = b;
 	do {
 		/*
-		 * In worst cases this loop may run many iterations.
+		 * If the merge is highly unbalanced (e.g. the input is
+		 * already sorted), this loop may run many iterations.
 		 * Continue callbacks to the client even though no
 		 * element comparison is needed, so the client's cmp()
 		 * routine can invoke cond_resched() periodically.
 		 */
-		if (unlikely(!(++count)))
-			(*cmp)(priv, tail->next, tail->next);
+		if (unlikely(!++count))
+			cmp(priv, b, b);
+		b->prev = tail;
+		tail = b;
+		b = b->next;
+	} while (b);
 
-		tail->next->prev = tail;
-		tail = tail->next;
-	} while (tail->next);
-
+	/* And the final links to make a circular doubly-linked list */
 	tail->next = head;
 	head->prev = tail;
 }
@@ -91,55 +107,152 @@
  * @head: the list to sort
  * @cmp: the elements comparison function
  *
- * This function implements "merge sort", which has O(nlog(n))
- * complexity.
+ * The comparison funtion @cmp must return > 0 if @a should sort after
+ * @b ("@a > @b" if you want an ascending sort), and <= 0 if @a should
+ * sort before @b *or* their original order should be preserved.  It is
+ * always called with the element that came first in the input in @a,
+ * and list_sort is a stable sort, so it is not necessary to distinguish
+ * the @a < @b and @a == @b cases.
  *
- * The comparison function @cmp must return a negative value if @a
- * should sort before @b, and a positive value if @a should sort after
- * @b. If @a and @b are equivalent, and their original relative
- * ordering is to be preserved, @cmp must return 0.
+ * This is compatible with two styles of @cmp function:
+ * - The traditional style which returns <0 / =0 / >0, or
+ * - Returning a boolean 0/1.
+ * The latter offers a chance to save a few cycles in the comparison
+ * (which is used by e.g. plug_ctx_cmp() in block/blk-mq.c).
+ *
+ * A good way to write a multi-word comparison is::
+ *
+ *	if (a->high != b->high)
+ *		return a->high > b->high;
+ *	if (a->middle != b->middle)
+ *		return a->middle > b->middle;
+ *	return a->low > b->low;
+ *
+ *
+ * This mergesort is as eager as possible while always performing at least
+ * 2:1 balanced merges.  Given two pending sublists of size 2^k, they are
+ * merged to a size-2^(k+1) list as soon as we have 2^k following elements.
+ *
+ * Thus, it will avoid cache thrashing as long as 3*2^k elements can
+ * fit into the cache.  Not quite as good as a fully-eager bottom-up
+ * mergesort, but it does use 0.2*n fewer comparisons, so is faster in
+ * the common case that everything fits into L1.
+ *
+ *
+ * The merging is controlled by "count", the number of elements in the
+ * pending lists.  This is beautiully simple code, but rather subtle.
+ *
+ * Each time we increment "count", we set one bit (bit k) and clear
+ * bits k-1 .. 0.  Each time this happens (except the very first time
+ * for each bit, when count increments to 2^k), we merge two lists of
+ * size 2^k into one list of size 2^(k+1).
+ *
+ * This merge happens exactly when the count reaches an odd multiple of
+ * 2^k, which is when we have 2^k elements pending in smaller lists,
+ * so it's safe to merge away two lists of size 2^k.
+ *
+ * After this happens twice, we have created two lists of size 2^(k+1),
+ * which will be merged into a list of size 2^(k+2) before we create
+ * a third list of size 2^(k+1), so there are never more than two pending.
+ *
+ * The number of pending lists of size 2^k is determined by the
+ * state of bit k of "count" plus two extra pieces of information:
+ *
+ * - The state of bit k-1 (when k == 0, consider bit -1 always set), and
+ * - Whether the higher-order bits are zero or non-zero (i.e.
+ *   is count >= 2^(k+1)).
+ *
+ * There are six states we distinguish.  "x" represents some arbitrary
+ * bits, and "y" represents some arbitrary non-zero bits:
+ * 0:  00x: 0 pending of size 2^k;           x pending of sizes < 2^k
+ * 1:  01x: 0 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
+ * 2: x10x: 0 pending of size 2^k; 2^k     + x pending of sizes < 2^k
+ * 3: x11x: 1 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
+ * 4: y00x: 1 pending of size 2^k; 2^k     + x pending of sizes < 2^k
+ * 5: y01x: 2 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
+ * (merge and loop back to state 2)
+ *
+ * We gain lists of size 2^k in the 2->3 and 4->5 transitions (because
+ * bit k-1 is set while the more significant bits are non-zero) and
+ * merge them away in the 5->2 transition.  Note in particular that just
+ * before the 5->2 transition, all lower-order bits are 11 (state 3),
+ * so there is one list of each smaller size.
+ *
+ * When we reach the end of the input, we merge all the pending
+ * lists, from smallest to largest.  If you work through cases 2 to
+ * 5 above, you can see that the number of elements we merge with a list
+ * of size 2^k varies from 2^(k-1) (cases 3 and 5 when x == 0) to
+ * 2^(k+1) - 1 (second merge of case 5 when x == 2^(k-1) - 1).
  */
+__attribute__((nonnull(2,3)))
 void list_sort(void *priv, struct list_head *head,
 		int (*cmp)(void *priv, struct list_head *a,
 			struct list_head *b))
 {
-	struct list_head *part[MAX_LIST_LENGTH_BITS+1]; /* sorted partial lists
-						-- last slot is a sentinel */
-	int lev;  /* index into part[] */
-	int max_lev = 0;
-	struct list_head *list;
+	struct list_head *list = head->next, *pending = NULL;
+	size_t count = 0;	/* Count of pending */
 
-	if (list_empty(head))
+	if (list == head->prev)	/* Zero or one elements */
 		return;
 
-	memset(part, 0, sizeof(part));
-
+	/* Convert to a null-terminated singly-linked list. */
 	head->prev->next = NULL;
-	list = head->next;
 
-	while (list) {
-		struct list_head *cur = list;
+	/*
+	 * Data structure invariants:
+	 * - All lists are singly linked and null-terminated; prev
+	 *   pointers are not maintained.
+	 * - pending is a prev-linked "list of lists" of sorted
+	 *   sublists awaiting further merging.
+	 * - Each of the sorted sublists is power-of-two in size.
+	 * - Sublists are sorted by size and age, smallest & newest at front.
+	 * - There are zero to two sublists of each size.
+	 * - A pair of pending sublists are merged as soon as the number
+	 *   of following pending elements equals their size (i.e.
+	 *   each time count reaches an odd multiple of that size).
+	 *   That ensures each later final merge will be at worst 2:1.
+	 * - Each round consists of:
+	 *   - Merging the two sublists selected by the highest bit
+	 *     which flips when count is incremented, and
+	 *   - Adding an element from the input as a size-1 sublist.
+	 */
+	do {
+		size_t bits;
+		struct list_head **tail = &pending;
+
+		/* Find the least-significant clear bit in count */
+		for (bits = count; bits & 1; bits >>= 1)
+			tail = &(*tail)->prev;
+		/* Do the indicated merge */
+		if (likely(bits)) {
+			struct list_head *a = *tail, *b = a->prev;
+
+			a = merge(priv, (cmp_func)cmp, b, a);
+			/* Install the merged result in place of the inputs */
+			a->prev = b->prev;
+			*tail = a;
+		}
+
+		/* Move one element from input list to pending */
+		list->prev = pending;
+		pending = list;
 		list = list->next;
-		cur->next = NULL;
+		pending->next = NULL;
+		count++;
+	} while (list);
 
-		for (lev = 0; part[lev]; lev++) {
-			cur = merge(priv, cmp, part[lev], cur);
-			part[lev] = NULL;
-		}
-		if (lev > max_lev) {
-			if (unlikely(lev >= ARRAY_SIZE(part)-1)) {
-				printk_once(KERN_DEBUG "list too long for efficiency\n");
-				lev--;
-			}
-			max_lev = lev;
-		}
-		part[lev] = cur;
+	/* End of input; merge together all the pending lists. */
+	list = pending;
+	pending = pending->prev;
+	for (;;) {
+		struct list_head *next = pending->prev;
+
+		if (!next)
+			break;
+		list = merge(priv, (cmp_func)cmp, pending, list);
+		pending = next;
 	}
-
-	for (lev = 0; lev < max_lev; lev++)
-		if (part[lev])
-			list = merge(priv, cmp, part[lev], list);
-
-	merge_and_restore_back_links(priv, cmp, head, part[max_lev], list);
+	/* The final merge, rebuilding prev links */
+	merge_final(priv, (cmp_func)cmp, head, pending, list);
 }
 EXPORT_SYMBOL(list_sort);

diff --git a/lib/lzo/lzo1x_compress.c b/lib/lzo/lzo1x_compress.c
index 236eb21..4525fb0 100644
--- a/lib/lzo/lzo1x_compress.c
+++ b/lib/lzo/lzo1x_compress.c

@@ -20,7 +20,8 @@
 static noinline size_t
 lzo1x_1_do_compress(const unsigned char *in, size_t in_len,
 		    unsigned char *out, size_t *out_len,
-		    size_t ti, void *wrkmem)
+		    size_t ti, void *wrkmem, signed char *state_offset,
+		    const unsigned char bitstream_version)
 {
 	const unsigned char *ip;
 	unsigned char *op;
@@ -35,27 +36,85 @@
 	ip += ti < 4 ? 4 - ti : 0;
 
 	for (;;) {
-		const unsigned char *m_pos;
+		const unsigned char *m_pos = NULL;
 		size_t t, m_len, m_off;
 		u32 dv;
+		u32 run_length = 0;
 literal:
 		ip += 1 + ((ip - ii) >> 5);
 next:
 		if (unlikely(ip >= ip_end))
 			break;
 		dv = get_unaligned_le32(ip);
-		t = ((dv * 0x1824429d) >> (32 - D_BITS)) & D_MASK;
-		m_pos = in + dict[t];
-		dict[t] = (lzo_dict_t) (ip - in);
-		if (unlikely(dv != get_unaligned_le32(m_pos)))
-			goto literal;
+
+		if (dv == 0 && bitstream_version) {
+			const unsigned char *ir = ip + 4;
+			const unsigned char *limit = ip_end
+				< (ip + MAX_ZERO_RUN_LENGTH + 1)
+				? ip_end : ip + MAX_ZERO_RUN_LENGTH + 1;
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && \
+	defined(LZO_FAST_64BIT_MEMORY_ACCESS)
+			u64 dv64;
+
+			for (; (ir + 32) <= limit; ir += 32) {
+				dv64 = get_unaligned((u64 *)ir);
+				dv64 |= get_unaligned((u64 *)ir + 1);
+				dv64 |= get_unaligned((u64 *)ir + 2);
+				dv64 |= get_unaligned((u64 *)ir + 3);
+				if (dv64)
+					break;
+			}
+			for (; (ir + 8) <= limit; ir += 8) {
+				dv64 = get_unaligned((u64 *)ir);
+				if (dv64) {
+#  if defined(__LITTLE_ENDIAN)
+					ir += __builtin_ctzll(dv64) >> 3;
+#  elif defined(__BIG_ENDIAN)
+					ir += __builtin_clzll(dv64) >> 3;
+#  else
+#    error "missing endian definition"
+#  endif
+					break;
+				}
+			}
+#else
+			while ((ir < (const unsigned char *)
+					ALIGN((uintptr_t)ir, 4)) &&
+					(ir < limit) && (*ir == 0))
+				ir++;
+			for (; (ir + 4) <= limit; ir += 4) {
+				dv = *((u32 *)ir);
+				if (dv) {
+#  if defined(__LITTLE_ENDIAN)
+					ir += __builtin_ctz(dv) >> 3;
+#  elif defined(__BIG_ENDIAN)
+					ir += __builtin_clz(dv) >> 3;
+#  else
+#    error "missing endian definition"
+#  endif
+					break;
+				}
+			}
+#endif
+			while (likely(ir < limit) && unlikely(*ir == 0))
+				ir++;
+			run_length = ir - ip;
+			if (run_length > MAX_ZERO_RUN_LENGTH)
+				run_length = MAX_ZERO_RUN_LENGTH;
+		} else {
+			t = ((dv * 0x1824429d) >> (32 - D_BITS)) & D_MASK;
+			m_pos = in + dict[t];
+			dict[t] = (lzo_dict_t) (ip - in);
+			if (unlikely(dv != get_unaligned_le32(m_pos)))
+				goto literal;
+		}
 
 		ii -= ti;
 		ti = 0;
 		t = ip - ii;
 		if (t != 0) {
 			if (t <= 3) {
-				op[-2] |= t;
+				op[*state_offset] |= t;
 				COPY4(op, ii);
 				op += t;
 			} else if (t <= 16) {
@@ -88,6 +147,17 @@
 			}
 		}
 
+		if (unlikely(run_length)) {
+			ip += run_length;
+			run_length -= MIN_ZERO_RUN_LENGTH;
+			put_unaligned_le32((run_length << 21) | 0xfffc18
+					   | (run_length & 0x7), op);
+			op += 4;
+			run_length = 0;
+			*state_offset = -3;
+			goto finished_writing_instruction;
+		}
+
 		m_len = 4;
 		{
 #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(LZO_USE_CTZ64)
@@ -170,7 +240,6 @@
 
 		m_off = ip - m_pos;
 		ip += m_len;
-		ii = ip;
 		if (m_len <= M2_MAX_LEN && m_off <= M2_MAX_OFFSET) {
 			m_off -= 1;
 			*op++ = (((m_len - 1) << 5) | ((m_off & 7) << 2));
@@ -207,29 +276,45 @@
 			*op++ = (m_off << 2);
 			*op++ = (m_off >> 6);
 		}
+		*state_offset = -2;
+finished_writing_instruction:
+		ii = ip;
 		goto next;
 	}
 	*out_len = op - out;
 	return in_end - (ii - ti);
 }
 
-int lzo1x_1_compress(const unsigned char *in, size_t in_len,
+int lzogeneric1x_1_compress(const unsigned char *in, size_t in_len,
 		     unsigned char *out, size_t *out_len,
-		     void *wrkmem)
+		     void *wrkmem, const unsigned char bitstream_version)
 {
 	const unsigned char *ip = in;
 	unsigned char *op = out;
 	size_t l = in_len;
 	size_t t = 0;
+	signed char state_offset = -2;
+	unsigned int m4_max_offset;
+
+	// LZO v0 will never write 17 as first byte,
+	// so this is used to version the bitstream
+	if (bitstream_version > 0) {
+		*op++ = 17;
+		*op++ = bitstream_version;
+		m4_max_offset = M4_MAX_OFFSET_V1;
+	} else {
+		m4_max_offset = M4_MAX_OFFSET_V0;
+	}
 
 	while (l > 20) {
-		size_t ll = l <= (M4_MAX_OFFSET + 1) ? l : (M4_MAX_OFFSET + 1);
+		size_t ll = l <= (m4_max_offset + 1) ? l : (m4_max_offset + 1);
 		uintptr_t ll_end = (uintptr_t) ip + ll;
 		if ((ll_end + ((t + ll) >> 5)) <= ll_end)
 			break;
 		BUILD_BUG_ON(D_SIZE * sizeof(lzo_dict_t) > LZO1X_1_MEM_COMPRESS);
 		memset(wrkmem, 0, D_SIZE * sizeof(lzo_dict_t));
-		t = lzo1x_1_do_compress(ip, ll, op, out_len, t, wrkmem);
+		t = lzo1x_1_do_compress(ip, ll, op, out_len, t, wrkmem,
+					&state_offset, bitstream_version);
 		ip += ll;
 		op += *out_len;
 		l  -= ll;
@@ -242,7 +327,7 @@
 		if (op == out && t <= 238) {
 			*op++ = (17 + t);
 		} else if (t <= 3) {
-			op[-2] |= t;
+			op[state_offset] |= t;
 		} else if (t <= 18) {
 			*op++ = (t - 3);
 		} else {
@@ -273,7 +358,24 @@
 	*out_len = op - out;
 	return LZO_E_OK;
 }
+
+int lzo1x_1_compress(const unsigned char *in, size_t in_len,
+		     unsigned char *out, size_t *out_len,
+		     void *wrkmem)
+{
+	return lzogeneric1x_1_compress(in, in_len, out, out_len, wrkmem, 0);
+}
+
+int lzorle1x_1_compress(const unsigned char *in, size_t in_len,
+		     unsigned char *out, size_t *out_len,
+		     void *wrkmem)
+{
+	return lzogeneric1x_1_compress(in, in_len, out, out_len,
+				       wrkmem, LZO_VERSION);
+}
+
 EXPORT_SYMBOL_GPL(lzo1x_1_compress);
+EXPORT_SYMBOL_GPL(lzorle1x_1_compress);
 
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("LZO1X-1 Compressor");

diff --git a/lib/lzo/lzo1x_decompress_safe.c b/lib/lzo/lzo1x_decompress_safe.c
index a1c387f..6d2600e 100644
--- a/lib/lzo/lzo1x_decompress_safe.c
+++ b/lib/lzo/lzo1x_decompress_safe.c

@@ -46,11 +46,23 @@
 	const unsigned char * const ip_end = in + in_len;
 	unsigned char * const op_end = out + *out_len;
 
+	unsigned char bitstream_version;
+
 	op = out;
 	ip = in;
 
 	if (unlikely(in_len < 3))
 		goto input_overrun;
+
+	if (likely(*ip == 17)) {
+		bitstream_version = ip[1];
+		ip += 2;
+		if (unlikely(in_len < 5))
+			goto input_overrun;
+	} else {
+		bitstream_version = 0;
+	}
+
 	if (*ip > 17) {
 		t = *ip++ - 17;
 		if (t < 4) {
@@ -154,32 +166,49 @@
 			m_pos -= next >> 2;
 			next &= 3;
 		} else {
-			m_pos = op;
-			m_pos -= (t & 8) << 11;
-			t = (t & 7) + (3 - 1);
-			if (unlikely(t == 2)) {
-				size_t offset;
-				const unsigned char *ip_last = ip;
-
-				while (unlikely(*ip == 0)) {
-					ip++;
-					NEED_IP(1);
-				}
-				offset = ip - ip_last;
-				if (unlikely(offset > MAX_255_COUNT))
-					return LZO_E_ERROR;
-
-				offset = (offset << 8) - offset;
-				t += offset + 7 + *ip++;
-				NEED_IP(2);
-			}
+			NEED_IP(2);
 			next = get_unaligned_le16(ip);
-			ip += 2;
-			m_pos -= next >> 2;
-			next &= 3;
-			if (m_pos == op)
-				goto eof_found;
-			m_pos -= 0x4000;
+			if (((next & 0xfffc) == 0xfffc) &&
+			    ((t & 0xf8) == 0x18) &&
+			    likely(bitstream_version)) {
+				NEED_IP(3);
+				t &= 7;
+				t |= ip[2] << 3;
+				t += MIN_ZERO_RUN_LENGTH;
+				NEED_OP(t);
+				memset(op, 0, t);
+				op += t;
+				next &= 3;
+				ip += 3;
+				goto match_next;
+			} else {
+				m_pos = op;
+				m_pos -= (t & 8) << 11;
+				t = (t & 7) + (3 - 1);
+				if (unlikely(t == 2)) {
+					size_t offset;
+					const unsigned char *ip_last = ip;
+
+					while (unlikely(*ip == 0)) {
+						ip++;
+						NEED_IP(1);
+					}
+					offset = ip - ip_last;
+					if (unlikely(offset > MAX_255_COUNT))
+						return LZO_E_ERROR;
+
+					offset = (offset << 8) - offset;
+					t += offset + 7 + *ip++;
+					NEED_IP(2);
+					next = get_unaligned_le16(ip);
+				}
+				ip += 2;
+				m_pos -= next >> 2;
+				next &= 3;
+				if (m_pos == op)
+					goto eof_found;
+				m_pos -= 0x4000;
+			}
 		}
 		TEST_LB(m_pos);
 #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)

diff --git a/lib/lzo/lzodefs.h b/lib/lzo/lzodefs.h
index 4edefd2..3b46f5f4 100644
--- a/lib/lzo/lzodefs.h
+++ b/lib/lzo/lzodefs.h

@@ -13,9 +13,15 @@
  */
 
 
+/* Version
+ * 0: original lzo version
+ * 1: lzo with support for RLE
+ */
+#define LZO_VERSION 1
+
 #define COPY4(dst, src)	\
 		put_unaligned(get_unaligned((const u32 *)(src)), (u32 *)(dst))
-#if defined(__x86_64__)
+#if defined(CONFIG_X86_64)
 #define COPY8(dst, src)	\
 		put_unaligned(get_unaligned((const u64 *)(src)), (u64 *)(dst))
 #else
@@ -25,19 +31,21 @@
 
 #if defined(__BIG_ENDIAN) && defined(__LITTLE_ENDIAN)
 #error "conflicting endian definitions"
-#elif defined(__x86_64__)
+#elif defined(CONFIG_X86_64)
 #define LZO_USE_CTZ64	1
 #define LZO_USE_CTZ32	1
-#elif defined(__i386__) || defined(__powerpc__)
+#define LZO_FAST_64BIT_MEMORY_ACCESS
+#elif defined(CONFIG_X86) || defined(CONFIG_PPC)
 #define LZO_USE_CTZ32	1
-#elif defined(__arm__) && (__LINUX_ARM_ARCH__ >= 5)
+#elif defined(CONFIG_ARM) && (__LINUX_ARM_ARCH__ >= 5)
 #define LZO_USE_CTZ32	1
 #endif
 
 #define M1_MAX_OFFSET	0x0400
 #define M2_MAX_OFFSET	0x0800
 #define M3_MAX_OFFSET	0x4000
-#define M4_MAX_OFFSET	0xbfff
+#define M4_MAX_OFFSET_V0	0xbfff
+#define M4_MAX_OFFSET_V1	0xbffe
 
 #define M1_MIN_LEN	2
 #define M1_MAX_LEN	2
@@ -53,6 +61,9 @@
 #define M3_MARKER	32
 #define M4_MARKER	16
 
+#define MIN_ZERO_RUN_LENGTH	4
+#define MAX_ZERO_RUN_LENGTH	(2047 + MIN_ZERO_RUN_LENGTH)
+
 #define lzo_dict_t      unsigned short
 #define D_BITS		13
 #define D_SIZE		(1u << D_BITS)

diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 8c3036c..60e7eca 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c

@@ -305,7 +305,7 @@
 			if (prv)
 				table->nents = ++table->orig_nents;
 
- 			return -ENOMEM;
+			return -ENOMEM;
 		}
 
 		sg_init_table(sg, alloc_size);

diff --git a/lib/sort.c b/lib/sort.c
index d6b7a20..d54cf97 100644
--- a/lib/sort.c
+++ b/lib/sort.c

@@ -1,8 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * A fast, small, non-recursive O(nlog n) sort for the Linux kernel
+ * A fast, small, non-recursive O(n log n) sort for the Linux kernel
  *
- * Jan 23 2005  Matt Mackall <mpm@selenic.com>
+ * This performs n*log2(n) + 0.37*n + o(n) comparisons on average,
+ * and 1.5*n*log2(n) + O(n) in the (very contrived) worst case.
+ *
+ * Glibc qsort() manages n*log2(n) - 1.26*n for random inputs (1.63*n
+ * better) at the expense of stack usage and much larger code to avoid
+ * quicksort's O(n^2) worst case.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -11,96 +16,262 @@
 #include <linux/export.h>
 #include <linux/sort.h>
 
-static int alignment_ok(const void *base, int align)
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required alignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
 {
-	return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
-		((unsigned long)base & (align - 1)) == 0;
-}
+	unsigned char lsbits = (unsigned char)size;
 
-static void u32_swap(void *a, void *b, int size)
-{
-	u32 t = *(u32 *)a;
-	*(u32 *)a = *(u32 *)b;
-	*(u32 *)b = t;
-}
-
-static void u64_swap(void *a, void *b, int size)
-{
-	u64 t = *(u64 *)a;
-	*(u64 *)a = *(u64 *)b;
-	*(u64 *)b = t;
-}
-
-static void generic_swap(void *a, void *b, int size)
-{
-	char t;
-
-	do {
-		t = *(char *)a;
-		*(char *)a++ = *(char *)b;
-		*(char *)b++ = t;
-	} while (--size > 0);
+	(void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+	return (lsbits & (align - 1)) == 0;
 }
 
 /**
- * sort - sort an array of elements
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory.  This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is stil valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static void swap_words_32(void *a, void *b, size_t n)
+{
+	do {
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+	} while (n);
+}
+
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory.  This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible.  If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not.  If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI).  Are there any cases the kernel needs to worry about?
+ */
+static void swap_words_64(void *a, void *b, size_t n)
+{
+	do {
+#ifdef CONFIG_64BIT
+		u64 t = *(u64 *)(a + (n -= 8));
+		*(u64 *)(a + n) = *(u64 *)(b + n);
+		*(u64 *)(b + n) = t;
+#else
+		/* Use two 32-bit transfers to avoid base+index+4 addressing */
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+
+		t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+#endif
+	} while (n);
+}
+
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static void swap_bytes(void *a, void *b, size_t n)
+{
+	do {
+		char t = ((char *)a)[--n];
+		((char *)a)[n] = ((char *)b)[n];
+		((char *)b)[n] = t;
+	} while (n);
+}
+
+typedef void (*swap_func_t)(void *a, void *b, int size);
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 (swap_func_t)0
+#define SWAP_WORDS_32 (swap_func_t)1
+#define SWAP_BYTES    (swap_func_t)2
+
+/*
+ * The function pointer is last to make tail calls most efficient if the
+ * compiler decides not to inline this function.
+ */
+static void do_swap(void *a, void *b, size_t size, swap_func_t swap_func)
+{
+	if (swap_func == SWAP_WORDS_64)
+		swap_words_64(a, b, size);
+	else if (swap_func == SWAP_WORDS_32)
+		swap_words_32(a, b, size);
+	else if (swap_func == SWAP_BYTES)
+		swap_bytes(a, b, size);
+	else
+		swap_func(a, b, (int)size);
+}
+
+typedef int (*cmp_func_t)(const void *, const void *);
+typedef int (*cmp_r_func_t)(const void *, const void *, const void *);
+#define _CMP_WRAPPER ((cmp_r_func_t)0L)
+
+static int do_cmp(const void *a, const void *b,
+		  cmp_r_func_t cmp, const void *priv)
+{
+	if (cmp == _CMP_WRAPPER)
+		return ((cmp_func_t)(priv))(a, b);
+	return cmp(a, b, priv);
+}
+
+/**
+ * parent - given the offset of the child, find the offset of the parent.
+ * @i: the offset of the heap element whose parent is sought.  Non-zero.
+ * @lsbit: a precomputed 1-bit mask, equal to "size & -size"
+ * @size: size of each element
+ *
+ * In terms of array indexes, the parent of element j = @i/@size is simply
+ * (j-1)/2.  But when working in byte offsets, we can't use implicit
+ * truncation of integer divides.
+ *
+ * Fortunately, we only need one bit of the quotient, not the full divide.
+ * @size has a least significant bit.  That bit will be clear if @i is
+ * an even multiple of @size, and set if it's an odd multiple.
+ *
+ * Logically, we're doing "if (i & lsbit) i -= size;", but since the
+ * branch is unpredictable, it's done with a bit of clever branch-free
+ * code instead.
+ */
+__attribute_const__ __always_inline
+static size_t parent(size_t i, unsigned int lsbit, size_t size)
+{
+	i -= size;
+	i -= size & -(i & lsbit);
+	return i / 2;
+}
+
+/**
+ * sort_r - sort an array of elements
  * @base: pointer to data to sort
  * @num: number of elements
  * @size: size of each element
  * @cmp_func: pointer to comparison function
  * @swap_func: pointer to swap function or NULL
+ * @priv: third argument passed to comparison function
  *
- * This function does a heapsort on the given array. You may provide a
- * swap_func function optimized to your element type.
+ * This function does a heapsort on the given array.  You may provide
+ * a swap_func function if you need to do something more than a memory
+ * copy (e.g. fix up pointers or auxiliary data), but the built-in swap
+ * avoids a slow retpoline and so is significantly faster.
  *
  * Sorting time is O(n log n) both on average and worst-case. While
- * qsort is about 20% faster on average, it suffers from exploitable
+ * quicksort is slightly faster on average, it suffers from exploitable
  * O(n*n) worst-case behavior and extra memory requirements that make
  * it less suitable for kernel use.
  */
+void sort_r(void *base, size_t num, size_t size,
+	    int (*cmp_func)(const void *, const void *, const void *),
+	    void (*swap_func)(void *, void *, int size),
+	    const void *priv)
+{
+	/* pre-scale counters for performance */
+	size_t n = num * size, a = (num/2) * size;
+	const unsigned int lsbit = size & -size;  /* Used to find parent */
+
+	if (!a)		/* num < 2 || size == 0 */
+		return;
+
+	if (!swap_func) {
+		if (is_aligned(base, size, 8))
+			swap_func = SWAP_WORDS_64;
+		else if (is_aligned(base, size, 4))
+			swap_func = SWAP_WORDS_32;
+		else
+			swap_func = SWAP_BYTES;
+	}
+
+	/*
+	 * Loop invariants:
+	 * 1. elements [a,n) satisfy the heap property (compare greater than
+	 *    all of their children),
+	 * 2. elements [n,num*size) are sorted, and
+	 * 3. a <= b <= c <= d <= n (whenever they are valid).
+	 */
+	for (;;) {
+		size_t b, c, d;
+
+		if (a)			/* Building heap: sift down --a */
+			a -= size;
+		else if (n -= size)	/* Sorting: Extract root to --n */
+			do_swap(base, base + n, size, swap_func);
+		else			/* Sort complete */
+			break;
+
+		/*
+		 * Sift element at "a" down into heap.  This is the
+		 * "bottom-up" variant, which significantly reduces
+		 * calls to cmp_func(): we find the sift-down path all
+		 * the way to the leaves (one compare per level), then
+		 * backtrack to find where to insert the target element.
+		 *
+		 * Because elements tend to sift down close to the leaves,
+		 * this uses fewer compares than doing two per level
+		 * on the way down.  (A bit more than half as many on
+		 * average, 3/4 worst-case.)
+		 */
+		for (b = a; c = 2*b + size, (d = c + size) < n;)
+			b = do_cmp(base + c, base + d, cmp_func, priv) >= 0 ? c : d;
+		if (d == n)	/* Special case last leaf with no sibling */
+			b = c;
+
+		/* Now backtrack from "b" to the correct location for "a" */
+		while (b != a && do_cmp(base + a, base + b, cmp_func, priv) >= 0)
+			b = parent(b, lsbit, size);
+		c = b;			/* Where "a" belongs */
+		while (b != a) {	/* Shift it into place */
+			b = parent(b, lsbit, size);
+			do_swap(base + b, base + c, size, swap_func);
+		}
+	}
+}
+EXPORT_SYMBOL(sort_r);
 
 void sort(void *base, size_t num, size_t size,
 	  int (*cmp_func)(const void *, const void *),
 	  void (*swap_func)(void *, void *, int size))
 {
-	/* pre-scale counters for performance */
-	int i = (num/2 - 1) * size, n = num * size, c, r;
-
-	if (!swap_func) {
-		if (size == 4 && alignment_ok(base, 4))
-			swap_func = u32_swap;
-		else if (size == 8 && alignment_ok(base, 8))
-			swap_func = u64_swap;
-		else
-			swap_func = generic_swap;
-	}
-
-	/* heapify */
-	for ( ; i >= 0; i -= size) {
-		for (r = i; r * 2 + size < n; r  = c) {
-			c = r * 2 + size;
-			if (c < n - size &&
-					cmp_func(base + c, base + c + size) < 0)
-				c += size;
-			if (cmp_func(base + r, base + c) >= 0)
-				break;
-			swap_func(base + r, base + c, size);
-		}
-	}
-
-	/* sort */
-	for (i = n - size; i > 0; i -= size) {
-		swap_func(base, base + i, size);
-		for (r = 0; r * 2 + size < i; r = c) {
-			c = r * 2 + size;
-			if (c < i - size &&
-					cmp_func(base + c, base + c + size) < 0)
-				c += size;
-			if (cmp_func(base + r, base + c) >= 0)
-				break;
-			swap_func(base + r, base + c, size);
-		}
-	}
+	return sort_r(base, num, size, _CMP_WRAPPER, swap_func, cmp_func);
 }
-
 EXPORT_SYMBOL(sort);

diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index e513459..3376a32 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c

@@ -92,15 +92,19 @@
 		return true;
 	if (stack_slabs[depot_index] == NULL) {
 		stack_slabs[depot_index] = *prealloc;
+		*prealloc = NULL;
 	} else {
-		stack_slabs[depot_index + 1] = *prealloc;
+		/* If this is the last depot slab, do not touch the next one. */
+		if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) {
+			stack_slabs[depot_index + 1] = *prealloc;
+			*prealloc = NULL;
+		}
 		/*
 		 * This smp_store_release pairs with smp_load_acquire() from
 		 * |next_slab_inited| above and in depot_save_stack().
 		 */
 		smp_store_release(&next_slab_inited, 1);
 	}
-	*prealloc = NULL;
 	return true;
 }
 

diff --git a/mm/Kconfig b/mm/Kconfig
index b457e94..8020040 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig

@@ -327,6 +327,23 @@
 	  This value can be changed after boot using the
 	  /proc/sys/vm/mmap_min_addr tunable.
 
+config MMAP_NOEXEC_TAINT
+	int "Turns on tainting of mmap()d files from noexec mountpoints"
+	default 1 if MMU
+	default 0 if !MMU
+	help
+	  By default, the ability to change the protections of a virtual
+	  memory area to allow execution depend on if the vma has the
+	  VM_MAYEXEC flag.  When mapping regions from files, VM_MAYEXEC
+	  will be unset if the containing mountpoint is mounted MNT_NOEXEC.
+	  By setting the value to 0, any mmap()d region may be later
+	  mprotect()d with PROT_EXEC.
+
+	  If unsure, keep the value set to 1.
+
+	  This value can be changed after boot using the
+	  /proc/sys/vm/mmap_noexec_taint tunable.
+
 config ARCH_SUPPORTS_MEMORY_FAILURE
 	bool
 

diff --git a/mm/filemap.c b/mm/filemap.c
index 45f1c6d..b19706a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c

@@ -2541,8 +2541,8 @@
 	} else if (!page) {
 		/* No page in the page cache at all */
 		do_sync_mmap_readahead(vmf->vma, ra, file, offset);
-		count_vm_event(PGMAJFAULT);
-		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
+		count_vm_event(PGMAJFAULT_F);
+		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT_F);
 		ret = VM_FAULT_MAJOR;
 retry_find:
 		page = find_get_page(mapping, offset);

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5bb93cf..1469983 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c

@@ -173,16 +173,13 @@
 {
 	ssize_t ret = count;
 
-	if (!memcmp("always", buf,
-		    min(sizeof("always")-1, count))) {
+	if (sysfs_streq(buf, "always")) {
 		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
 		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
-	} else if (!memcmp("madvise", buf,
-			   min(sizeof("madvise")-1, count))) {
+	} else if (sysfs_streq(buf, "madvise")) {
 		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
 		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
-	} else if (!memcmp("never", buf,
-			   min(sizeof("never")-1, count))) {
+	} else if (sysfs_streq(buf, "never")) {
 		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
 		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
 	} else
@@ -246,32 +243,27 @@
 			    struct kobj_attribute *attr,
 			    const char *buf, size_t count)
 {
-	if (!memcmp("always", buf,
-		    min(sizeof("always")-1, count))) {
+	if (sysfs_streq(buf, "always")) {
 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
 		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
-	} else if (!memcmp("defer+madvise", buf,
-		    min(sizeof("defer+madvise")-1, count))) {
+	} else if (sysfs_streq(buf, "defer+madvise")) {
 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
 		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
-	} else if (!memcmp("defer", buf,
-		    min(sizeof("defer")-1, count))) {
+	} else if (sysfs_streq(buf, "defer")) {
 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
 		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
-	} else if (!memcmp("madvise", buf,
-			   min(sizeof("madvise")-1, count))) {
+	} else if (sysfs_streq(buf, "madvise")) {
 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
 		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
-	} else if (!memcmp("never", buf,
-			   min(sizeof("never")-1, count))) {
+	} else if (sysfs_streq(buf, "never")) {
 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
 		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
@@ -2661,7 +2653,7 @@
 	unsigned long flags;
 	pgoff_t end;
 
-	VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
+	VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 

diff --git a/mm/madvise.c b/mm/madvise.c
index 71d21df..899b19e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c

@@ -138,7 +138,7 @@
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
 			  vma->vm_file, pgoff, vma_policy(vma),
-			  vma->vm_userfaultfd_ctx);
+			  vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
 	if (*prev) {
 		vma = *prev;
 		goto success;

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3a3d109..0b3e6b8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c

@@ -419,8 +419,10 @@
 		if (mem_cgroup_is_root(memcg))
 			continue;
 		ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
-		if (ret)
+		if (ret) {
+			mem_cgroup_iter_break(NULL, memcg);
 			goto unlock;
+		}
 	}
 unlock:
 	if (!ret)
@@ -3439,6 +3441,9 @@
 	PGPGOUT,
 	PGFAULT,
 	PGMAJFAULT,
+	PGMAJFAULT_S,
+	PGMAJFAULT_A,
+	PGMAJFAULT_F,
 };
 
 static const char *const memcg1_event_names[] = {
@@ -3446,6 +3451,9 @@
 	"pgpgout",
 	"pgfault",
 	"pgmajfault",
+	"pgmajfault_s",
+	"pgmajfault_a",
+	"pgmajfault_f",
 };
 
 static int memcg_stat_show(struct seq_file *m, void *v)
@@ -5673,6 +5681,9 @@
 
 	seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
 	seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
+	seq_printf(m, "pgmajfault_s %lu\n", acc.events[PGMAJFAULT_S]);
+	seq_printf(m, "pgmajfault_a %lu\n", acc.events[PGMAJFAULT_A]);
+	seq_printf(m, "pgmajfault_f %lu\n", acc.events[PGMAJFAULT_F]);
 
 	seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
 	seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +

diff --git a/mm/memory.c b/mm/memory.c
index bbf0cc4..3d3c947 100644
--- a/mm/memory.c
+++ b/mm/memory.c

@@ -2980,8 +2980,8 @@
 
 		/* Had to read the page from swap area: Major fault */
 		ret = VM_FAULT_MAJOR;
-		count_vm_event(PGMAJFAULT);
-		count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
+		count_vm_event(PGMAJFAULT_A);
+		count_memcg_event_mm(vma->vm_mm, PGMAJFAULT_A);
 	} else if (PageHWPoison(page)) {
 		/*
 		 * hwpoisoned dirty swapcache pages are kept for killing

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index adeb163..9e7c91a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c

@@ -760,7 +760,8 @@
 			((vmstart - vma->vm_start) >> PAGE_SHIFT);
 		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
 				 vma->anon_vma, vma->vm_file, pgoff,
-				 new_pol, vma->vm_userfaultfd_ctx);
+				 new_pol, vma->vm_userfaultfd_ctx,
+				 vma_get_anon_name(vma));
 		if (prev) {
 			vma = prev;
 			next = vma->vm_next;

diff --git a/mm/mlock.c b/mm/mlock.c
index 0ab8250..02d8a89 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c

@@ -535,7 +535,7 @@
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
 			  vma->vm_file, pgoff, vma_policy(vma),
-			  vma->vm_userfaultfd_ctx);
+			  vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
 	if (*prev) {
 		vma = *prev;
 		goto success;

diff --git a/mm/mmap.c b/mm/mmap.c
index a98f09b..0636423 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c

@@ -977,7 +977,8 @@
  */
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
 				struct file *file, unsigned long vm_flags,
-				struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+				struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+				const char __user *anon_name)
 {
 	/*
 	 * VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -995,6 +996,8 @@
 		return 0;
 	if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
 		return 0;
+	if (vma_get_anon_name(vma) != anon_name)
+		return 0;
 	return 1;
 }
 
@@ -1027,9 +1030,10 @@
 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
 		     struct anon_vma *anon_vma, struct file *file,
 		     pgoff_t vm_pgoff,
-		     struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+		     struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+		     const char __user *anon_name)
 {
-	if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
+	if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
 	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 		if (vma->vm_pgoff == vm_pgoff)
 			return 1;
@@ -1048,9 +1052,10 @@
 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
 		    struct anon_vma *anon_vma, struct file *file,
 		    pgoff_t vm_pgoff,
-		    struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+		    struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+		    const char __user *anon_name)
 {
-	if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
+	if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
 	    is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
 		pgoff_t vm_pglen;
 		vm_pglen = vma_pages(vma);
@@ -1061,9 +1066,9 @@
 }
 
 /*
- * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
- * whether that can be merged with its predecessor or its successor.
- * Or both (it neatly fills a hole).
+ * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
+ * figure out whether that can be merged with its predecessor or its
+ * successor.  Or both (it neatly fills a hole).
  *
  * In most cases - when called for mmap, brk or mremap - [addr,end) is
  * certain not to be mapped by the time vma_merge is called; but when
@@ -1105,7 +1110,8 @@
 			unsigned long end, unsigned long vm_flags,
 			struct anon_vma *anon_vma, struct file *file,
 			pgoff_t pgoff, struct mempolicy *policy,
-			struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+			struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+			const char __user *anon_name)
 {
 	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
 	struct vm_area_struct *area, *next;
@@ -1138,7 +1144,8 @@
 			mpol_equal(vma_policy(prev), policy) &&
 			can_vma_merge_after(prev, vm_flags,
 					    anon_vma, file, pgoff,
-					    vm_userfaultfd_ctx)) {
+					    vm_userfaultfd_ctx,
+					    anon_name)) {
 		/*
 		 * OK, it can.  Can we now merge in the successor as well?
 		 */
@@ -1147,7 +1154,8 @@
 				can_vma_merge_before(next, vm_flags,
 						     anon_vma, file,
 						     pgoff+pglen,
-						     vm_userfaultfd_ctx) &&
+						     vm_userfaultfd_ctx,
+						     anon_name) &&
 				is_mergeable_anon_vma(prev->anon_vma,
 						      next->anon_vma, NULL)) {
 							/* cases 1, 6 */
@@ -1170,7 +1178,8 @@
 			mpol_equal(policy, vma_policy(next)) &&
 			can_vma_merge_before(next, vm_flags,
 					     anon_vma, file, pgoff+pglen,
-					     vm_userfaultfd_ctx)) {
+					     vm_userfaultfd_ctx,
+					     anon_name)) {
 		if (prev && addr < prev->vm_end)	/* case 4 */
 			err = __vma_adjust(prev, prev->vm_start,
 					 addr, prev->vm_pgoff, NULL, next);
@@ -1479,7 +1488,8 @@
 			if (path_noexec(&file->f_path)) {
 				if (vm_flags & VM_EXEC)
 					return -EPERM;
-				vm_flags &= ~VM_MAYEXEC;
+				if (sysctl_mmap_noexec_taint)
+					vm_flags &= ~VM_MAYEXEC;
 			}
 
 			if (!file->f_op->mmap)
@@ -1715,7 +1725,7 @@
 	 * Can we just expand an old mapping?
 	 */
 	vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
-			NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
+			NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
 	if (vma)
 		goto out;
 
@@ -2785,6 +2795,7 @@
 
 	return 0;
 }
+EXPORT_SYMBOL(do_munmap);
 
 int vm_munmap(unsigned long start, size_t len)
 {
@@ -2971,7 +2982,7 @@
 
 	/* Can we just expand an old private anonymous mapping? */
 	vma = vma_merge(mm, prev, addr, addr + len, flags,
-			NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
+			NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
 	if (vma)
 		goto out;
 
@@ -3169,7 +3180,7 @@
 		return NULL;	/* should never get here */
 	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
 			    vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
-			    vma->vm_userfaultfd_ctx);
+			    vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
 	if (new_vma) {
 		/*
 		 * Source vma may have been merged into new_vma

diff --git a/mm/mprotect.c b/mm/mprotect.c
index 6d33162..58f591d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c

@@ -398,7 +398,7 @@
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*pprev = vma_merge(mm, *pprev, start, end, newflags,
 			   vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
-			   vma->vm_userfaultfd_ctx);
+			   vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
 	if (*pprev) {
 		vma = *pprev;
 		VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);

diff --git a/mm/shmem.c b/mm/shmem.c
index a6fc82a..b7314e0 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c

@@ -1690,8 +1690,8 @@
 			/* Or update major stats only when swapin succeeds?? */
 			if (fault_type) {
 				*fault_type |= VM_FAULT_MAJOR;
-				count_vm_event(PGMAJFAULT);
-				count_memcg_event_mm(charge_mm, PGMAJFAULT);
+				count_vm_event(PGMAJFAULT_S);
+				count_memcg_event_mm(charge_mm, PGMAJFAULT_S);
 			}
 			/* Here we actually start the io */
 			page = shmem_swapin(swap, gfp, info, index);

diff --git a/mm/slub.c b/mm/slub.c
index 9c3937c..8680ae0 100644
--- a/mm/slub.c
+++ b/mm/slub.c

@@ -5075,6 +5075,14 @@
 SLAB_ATTR_RO(cache_dma);
 #endif
 
+#ifdef CONFIG_ZONE_DMA32
+static ssize_t cache_dma32_show(struct kmem_cache *s, char *buf)
+{
+	return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA32));
+}
+SLAB_ATTR_RO(cache_dma32);
+#endif
+
 static ssize_t usersize_show(struct kmem_cache *s, char *buf)
 {
 	return sprintf(buf, "%u\n", s->usersize);
@@ -5415,6 +5423,9 @@
 #ifdef CONFIG_ZONE_DMA
 	&cache_dma_attr.attr,
 #endif
+#ifdef CONFIG_ZONE_DMA32
+	&cache_dma32_attr.attr,
+#endif
 #ifdef CONFIG_NUMA
 	&remote_node_defrag_ratio_attr.attr,
 #endif

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 0047dca..0563c7b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c

@@ -2532,6 +2532,8 @@
 	struct swap_cluster_info *cluster_info;
 	unsigned long *frontswap_map;
 	struct file *swap_file, *victim;
+	struct path path_holder;
+	struct path *victim_path = NULL;
 	struct address_space *mapping;
 	struct inode *inode;
 	struct filename *pathname;
@@ -2549,10 +2551,16 @@
 
 	victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
 	err = PTR_ERR(victim);
-	if (IS_ERR(victim))
-		goto out;
-
-	mapping = victim->f_mapping;
+	if (IS_ERR(victim)) {
+		/* Fallback to just the inode mapping if possible. */
+		if (kern_path(pathname->name, LOOKUP_FOLLOW, &path_holder))
+			goto out;  /* Propogate the original err. */
+		victim_path = &path_holder;
+		mapping = victim_path->dentry->d_inode->i_mapping;
+		victim = NULL;
+	} else {
+		mapping = victim->f_mapping;
+	}
 	spin_lock(&swap_lock);
 	plist_for_each_entry(p, &swap_active_head, list) {
 		if (p->flags & SWP_WRITEOK) {
@@ -2685,7 +2693,10 @@
 	wake_up_interruptible(&proc_poll_wait);
 
 out_dput:
-	filp_close(victim, NULL);
+	if (victim)
+		filp_close(victim, NULL);
+	if (victim_path)
+		path_put(victim_path);
 out:
 	putname(pathname);
 	return err;
@@ -2873,12 +2884,23 @@
 	return p;
 }
 
-static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
+/* This sysctl is only exposed when CONFIG_DISK_BASED_SWAP is enabled. */
+int sysctl_disk_based_swap;
+
+static int claim_swapfile(struct swap_info_struct *p, struct inode *inode,
+			  bool allow_disk_based_swap)
 {
 	int error;
-
+	/* On Chromium OS, we only support zram swap devices. */
 	if (S_ISBLK(inode->i_mode)) {
+		char name[BDEVNAME_SIZE];
 		p->bdev = bdgrab(I_BDEV(inode));
+		bdevname(p->bdev, name);
+		if (strncmp(name, "zram", strlen("zram"))) {
+			bdput(p->bdev);
+			p->bdev = NULL;
+			return -EINVAL;
+		}
 		error = blkdev_get(p->bdev,
 				   FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
 		if (error < 0) {
@@ -2890,7 +2912,7 @@
 		if (error < 0)
 			return error;
 		p->flags |= SWP_BLKDEV;
-	} else if (S_ISREG(inode->i_mode)) {
+	} else if (S_ISREG(inode->i_mode) && allow_disk_based_swap) {
 		p->bdev = inode->i_sb->s_bdev;
 		inode_lock(inode);
 		if (IS_SWAPFILE(inode))
@@ -3119,6 +3141,7 @@
 	struct page *page = NULL;
 	struct inode *inode = NULL;
 	bool inced_nr_rotate_swap = false;
+	bool allow_disk_based_swap = sysctl_disk_based_swap;
 
 	if (swap_flags & ~SWAP_FLAGS_VALID)
 		return -EINVAL;
@@ -3153,7 +3176,7 @@
 	inode = mapping->host;
 
 	/* If S_ISREG(inode->i_mode) will do inode_lock(inode); */
-	error = claim_swapfile(p, inode);
+	error = claim_swapfile(p, inode, allow_disk_based_swap);
 	if (unlikely(error))
 		goto bad_swap;
 
@@ -3321,7 +3344,8 @@
 		atomic_dec(&nr_rotate_swap);
 	if (swap_file) {
 		if (inode && S_ISREG(inode->i_mode)) {
-			inode_unlock(inode);
+			if (allow_disk_based_swap)
+				inode_unlock(inode);
 			inode = NULL;
 		}
 		filp_close(swap_file, NULL);

diff --git a/mm/util.c b/mm/util.c
index 6a24a10..d7628b1 100644
--- a/mm/util.c
+++ b/mm/util.c

@@ -563,6 +563,7 @@
 int sysctl_overcommit_ratio __read_mostly = 50;
 unsigned long sysctl_overcommit_kbytes __read_mostly;
 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
+int sysctl_mmap_noexec_taint __read_mostly = CONFIG_MMAP_NOEXEC_TAINT;
 unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
 unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
 

diff --git a/mm/vmscan.c b/mm/vmscan.c
index b37610c..d8e561b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c

@@ -166,6 +166,11 @@
  */
 unsigned long vm_total_pages;
 
+/*
+ * Low watermark used to prevent fscache thrashing during low memory.
+ */
+int min_filelist_kbytes;
+
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
@@ -2242,9 +2247,33 @@
 	return inactive * inactive_ratio < active;
 }
 
+/*
+ * Check low watermark used to prevent fscache thrashing during low memory.
+ */
+static int file_is_low(struct lruvec *lruvec, struct scan_control *sc)
+{
+	unsigned long pages_min, active, inactive;
+	enum lru_list inactive_lru = LRU_FILE;
+	enum lru_list active_lru = LRU_FILE + LRU_ACTIVE;
+
+	if (!mem_cgroup_disabled())
+		return false;
+
+	pages_min = min_filelist_kbytes >> (PAGE_SHIFT - 10);
+	inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
+	active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
+
+	return ((active + inactive) < pages_min);
+}
+
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
 				 struct lruvec *lruvec, struct scan_control *sc)
 {
+	int file = is_file_lru(lru);
+
+	if (file && file_is_low(lruvec, sc))
+		return 0;
+
 	if (is_active_lru(lru)) {
 		if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
 			shrink_active_list(nr_to_scan, lruvec, sc, lru);
@@ -2285,6 +2314,15 @@
 	unsigned long ap, fp;
 	enum lru_list lru;
 
+	/*
+	 * Do not scan file pages when swap is allowed by __GFP_IO and
+	 * file page count is low.
+	 */
+	if ((sc->gfp_mask & __GFP_IO) && file_is_low(lruvec, sc)) {
+		scan_balance = SCAN_ANON;
+		goto out;
+	}
+
 	/* If we have no swap space, do not bother scanning anon pages. */
 	if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
 		scan_balance = SCAN_FILE;
@@ -2446,10 +2484,13 @@
 			/*
 			 * Scan types proportional to swappiness and
 			 * their relative recent reclaim efficiency.
-			 * Make sure we don't miss the last page
-			 * because of a round-off error.
+			 * Make sure we don't miss the last page on
+			 * the offlined memory cgroups because of a
+			 * round-off error.
 			 */
-			scan = DIV64_U64_ROUND_UP(scan * fraction[file],
+			scan = mem_cgroup_online(memcg) ?
+			       div64_u64(scan * fraction[file], denominator) :
+			       DIV64_U64_ROUND_UP(scan * fraction[file],
 						  denominator);
 			break;
 		case SCAN_FILE:

diff --git a/mm/vmstat.c b/mm/vmstat.c
index ce81b0a..6bdb077 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c

@@ -1185,6 +1185,9 @@
 
 	"pgfault",
 	"pgmajfault",
+	"pgmajfault_s",
+	"pgmajfault_a",
+	"pgmajfault_f",
 	"pglazyfreed",
 
 	"pgrefill",
@@ -1688,6 +1691,8 @@
 	all_vm_events(v);
 	v[PGPGIN] /= 2;		/* sectors -> kbytes */
 	v[PGPGOUT] /= 2;
+	/* Add up page faults */
+	v[PGMAJFAULT] = v[PGMAJFAULT_S] + v[PGMAJFAULT_A] + v[PGMAJFAULT_F];
 #endif
 	return (unsigned long *)m->private + *pos;
 }

diff --git a/net/bridge/br.c b/net/bridge/br.c
index b0a0b82..e411e40 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c

@@ -175,6 +175,22 @@
 	.notifier_call = br_switchdev_event,
 };
 
+void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on)
+{
+	bool cur = !!br_opt_get(br, opt);
+
+	br_debug(br, "toggle option: %d state: %d -> %d\n",
+		 opt, cur, on);
+
+	if (cur == on)
+		return;
+
+	if (on)
+		set_bit(opt, &br->options);
+	else
+		clear_bit(opt, &br->options);
+}
+
 static void __net_exit br_net_exit(struct net *net)
 {
 	struct net_device *dev;

diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c
index d42e390..6b78e63 100644
--- a/net/bridge/br_arp_nd_proxy.c
+++ b/net/bridge/br_arp_nd_proxy.c

@@ -39,7 +39,7 @@
 		}
 	}
 
-	br->neigh_suppress_enabled = neigh_suppress;
+	br_opt_toggle(br, BROPT_NEIGH_SUPPRESS_ENABLED, neigh_suppress);
 }
 
 #if IS_ENABLED(CONFIG_INET)
@@ -155,7 +155,7 @@
 	    ipv4_is_multicast(tip))
 		return;
 
-	if (br->neigh_suppress_enabled) {
+	if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) {
 		if (p && (p->flags & BR_NEIGH_SUPPRESS))
 			return;
 		if (ipv4_is_zeronet(sip) || sip == tip) {
@@ -175,7 +175,8 @@
 			return;
 	}
 
-	if (br->neigh_suppress_enabled && br_is_local_ip(vlandev, tip)) {
+	if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) &&
+	    br_is_local_ip(vlandev, tip)) {
 		/* its our local ip, so don't proxy reply
 		 * and don't forward to neigh suppress ports
 		 */
@@ -213,7 +214,8 @@
 			/* If we have replied or as long as we know the
 			 * mac, indicate to arp replied
 			 */
-			if (replied || br->neigh_suppress_enabled)
+			if (replied ||
+			    br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED))
 				BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
 		}
 
@@ -460,7 +462,8 @@
 			 * mac, indicate to NEIGH_SUPPRESS ports that we
 			 * have replied
 			 */
-			if (replied || br->neigh_suppress_enabled)
+			if (replied ||
+			    br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED))
 				BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
 		}
 		neigh_release(n);

diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 9ce661e..e645983 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c

@@ -67,11 +67,11 @@
 	if (IS_ENABLED(CONFIG_INET) &&
 	    (eth->h_proto == htons(ETH_P_ARP) ||
 	     eth->h_proto == htons(ETH_P_RARP)) &&
-	    br->neigh_suppress_enabled) {
+	    br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) {
 		br_do_proxy_suppress_arp(skb, br, vid, NULL);
 	} else if (IS_ENABLED(CONFIG_IPV6) &&
 		   skb->protocol == htons(ETH_P_IPV6) &&
-		   br->neigh_suppress_enabled &&
+		   br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) &&
 		   pskb_may_pull(skb, sizeof(struct ipv6hdr) +
 				 sizeof(struct nd_msg)) &&
 		   ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
@@ -228,7 +228,7 @@
 	dev->mtu = new_mtu;
 
 	/* this flag will be cleared if the MTU was automatically adjusted */
-	br->mtu_set_by_user = true;
+	br_opt_toggle(br, BROPT_MTU_SET_BY_USER, true);
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	/* remember the MTU in the rtable for PMTU */
 	dst_metric_set(&br->fake_rtable.dst, RTAX_MTU, new_mtu);

diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index ed2b600..9c34cf6 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c

@@ -509,14 +509,14 @@
 	ASSERT_RTNL();
 
 	/* if the bridge MTU was manually configured don't mess with it */
-	if (br->mtu_set_by_user)
+	if (br_opt_get(br, BROPT_MTU_SET_BY_USER))
 		return;
 
 	/* change to the minimum MTU and clear the flag which was set by
 	 * the bridge ndo_change_mtu callback
 	 */
 	dev_set_mtu(br->dev, br_mtu_min(br));
-	br->mtu_set_by_user = false;
+	br_opt_toggle(br, BROPT_MTU_SET_BY_USER, false);
 }
 
 static void br_set_gso_limits(struct net_bridge *br)

diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 2532c1a..e96035f 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c

@@ -120,7 +120,7 @@
 		br_do_proxy_suppress_arp(skb, br, vid, p);
 	} else if (IS_ENABLED(CONFIG_IPV6) &&
 		   skb->protocol == htons(ETH_P_IPV6) &&
-		   br->neigh_suppress_enabled &&
+		   br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) &&
 		   pskb_may_pull(skb, sizeof(struct ipv6hdr) +
 				 sizeof(struct nd_msg)) &&
 		   ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {

diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 5519881..fb5026a 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c

@@ -84,7 +84,7 @@
 	int i, err = 0;
 	int idx = 0, s_idx = cb->args[1];
 
-	if (br->multicast_disabled)
+	if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
 		return 0;
 
 	mdb = rcu_dereference(br->mdb);
@@ -598,7 +598,7 @@
 	struct net_bridge_port *p;
 	int ret;
 
-	if (!netif_running(br->dev) || br->multicast_disabled)
+	if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED))
 		return -EINVAL;
 
 	dev = __dev_get_by_index(net, entry->ifindex);
@@ -673,7 +673,7 @@
 	struct br_ip ip;
 	int err = -EINVAL;
 
-	if (!netif_running(br->dev) || br->multicast_disabled)
+	if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED))
 		return -EINVAL;
 
 	__mdb_entry_to_br_ip(entry, &ip);

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 6a362da..526a83d 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c

@@ -158,7 +158,7 @@
 	struct net_bridge_mdb_htable *mdb = rcu_dereference(br->mdb);
 	struct br_ip ip;
 
-	if (br->multicast_disabled)
+	if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
 		return NULL;
 
 	if (BR_INPUT_SKB_CB(skb)->igmp)
@@ -411,7 +411,7 @@
 	iph->frag_off = htons(IP_DF);
 	iph->ttl = 1;
 	iph->protocol = IPPROTO_IGMP;
-	iph->saddr = br->multicast_query_use_ifaddr ?
+	iph->saddr = br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR) ?
 		     inet_select_addr(br->dev, 0, RT_SCOPE_LINK) : 0;
 	iph->daddr = htonl(INADDR_ALLHOSTS_GROUP);
 	((u8 *)&iph[1])[0] = IPOPT_RA;
@@ -503,11 +503,11 @@
 	if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0,
 			       &ip6h->saddr)) {
 		kfree_skb(skb);
-		br->has_ipv6_addr = 0;
+		br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, false);
 		return NULL;
 	}
 
-	br->has_ipv6_addr = 1;
+	br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true);
 	ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest);
 
 	hopopt = (u8 *)(ip6h + 1);
@@ -628,7 +628,7 @@
 				port ? port->dev->name : br->dev->name);
 			err = -E2BIG;
 disable:
-			br->multicast_disabled = 1;
+			br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false);
 			goto err;
 		}
 	}
@@ -894,7 +894,7 @@
 					 struct bridge_mcast_own_query *query)
 {
 	spin_lock(&br->multicast_lock);
-	if (!netif_running(br->dev) || br->multicast_disabled)
+	if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED))
 		goto out;
 
 	br_multicast_start_querier(br, query);
@@ -965,8 +965,9 @@
 	struct br_ip br_group;
 	unsigned long time;
 
-	if (!netif_running(br->dev) || br->multicast_disabled ||
-	    !br->multicast_querier)
+	if (!netif_running(br->dev) ||
+	    !br_opt_get(br, BROPT_MULTICAST_ENABLED) ||
+	    !br_opt_get(br, BROPT_MULTICAST_QUERIER))
 		return;
 
 	memset(&br_group.u, 0, sizeof(br_group.u));
@@ -1036,7 +1037,7 @@
 		.orig_dev = dev,
 		.id = SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
 		.flags = SWITCHDEV_F_DEFER,
-		.u.mc_disabled = value,
+		.u.mc_disabled = !value,
 	};
 
 	switchdev_port_attr_set(dev, &attr);
@@ -1054,7 +1055,8 @@
 	timer_setup(&port->ip6_own_query.timer,
 		    br_ip6_multicast_port_query_expired, 0);
 #endif
-	br_mc_disabled_update(port->dev, port->br->multicast_disabled);
+	br_mc_disabled_update(port->dev,
+			      br_opt_get(port->br, BROPT_MULTICAST_ENABLED));
 
 	port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats);
 	if (!port->mcast_stats)
@@ -1091,7 +1093,7 @@
 {
 	struct net_bridge *br = port->br;
 
-	if (br->multicast_disabled || !netif_running(br->dev))
+	if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) || !netif_running(br->dev))
 		return;
 
 	br_multicast_enable(&port->ip4_own_query);
@@ -1641,7 +1643,7 @@
 	if (timer_pending(&other_query->timer))
 		goto out;
 
-	if (br->multicast_querier) {
+	if (br_opt_get(br, BROPT_MULTICAST_QUERIER)) {
 		__br_multicast_send_query(br, port, &mp->addr);
 
 		time = jiffies + br->multicast_last_member_count *
@@ -1753,7 +1755,7 @@
 	struct bridge_mcast_stats __percpu *stats;
 	struct bridge_mcast_stats *pstats;
 
-	if (!br->multicast_stats_enabled)
+	if (!br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED))
 		return;
 
 	if (p)
@@ -1911,7 +1913,7 @@
 	BR_INPUT_SKB_CB(skb)->igmp = 0;
 	BR_INPUT_SKB_CB(skb)->mrouters_only = 0;
 
-	if (br->multicast_disabled)
+	if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
 		return 0;
 
 	switch (skb->protocol) {
@@ -1963,8 +1965,6 @@
 	br->hash_max = 512;
 
 	br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
-	br->multicast_querier = 0;
-	br->multicast_query_use_ifaddr = 0;
 	br->multicast_last_member_count = 2;
 	br->multicast_startup_query_count = 2;
 
@@ -1983,7 +1983,7 @@
 	br->ip6_other_query.delay_time = 0;
 	br->ip6_querier.port = NULL;
 #endif
-	br->has_ipv6_addr = 1;
+	br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true);
 
 	spin_lock_init(&br->multicast_lock);
 	timer_setup(&br->multicast_router_timer,
@@ -2005,7 +2005,7 @@
 {
 	query->startup_sent = 0;
 
-	if (br->multicast_disabled)
+	if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
 		return;
 
 	mod_timer(&query->timer, jiffies);
@@ -2182,12 +2182,12 @@
 	int err = 0;
 
 	spin_lock_bh(&br->multicast_lock);
-	if (br->multicast_disabled == !val)
+	if (!!br_opt_get(br, BROPT_MULTICAST_ENABLED) == !!val)
 		goto unlock;
 
-	br_mc_disabled_update(br->dev, !val);
-	br->multicast_disabled = !val;
-	if (br->multicast_disabled)
+	br_mc_disabled_update(br->dev, val);
+	br_opt_toggle(br, BROPT_MULTICAST_ENABLED, !!val);
+	if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
 		goto unlock;
 
 	if (!netif_running(br->dev))
@@ -2198,7 +2198,7 @@
 		if (mdb->old) {
 			err = -EEXIST;
 rollback:
-			br->multicast_disabled = !!val;
+			br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false);
 			goto unlock;
 		}
 
@@ -2222,7 +2222,7 @@
 {
 	struct net_bridge *br = netdev_priv(dev);
 
-	return !br->multicast_disabled;
+	return !!br_opt_get(br, BROPT_MULTICAST_ENABLED);
 }
 EXPORT_SYMBOL_GPL(br_multicast_enabled);
 
@@ -2245,10 +2245,10 @@
 	val = !!val;
 
 	spin_lock_bh(&br->multicast_lock);
-	if (br->multicast_querier == val)
+	if (br_opt_get(br, BROPT_MULTICAST_QUERIER) == val)
 		goto unlock;
 
-	br->multicast_querier = val;
+	br_opt_toggle(br, BROPT_MULTICAST_QUERIER, !!val);
 	if (!val)
 		goto unlock;
 
@@ -2569,7 +2569,7 @@
 	struct bridge_mcast_stats __percpu *stats;
 
 	/* if multicast_disabled is true then igmp type can't be set */
-	if (!type || !br->multicast_stats_enabled)
+	if (!type || !br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED))
 		return;
 
 	if (p)

diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index ccab290..ad207463 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c

@@ -51,25 +51,22 @@
 
 struct brnf_net {
 	bool enabled;
-};
 
 #ifdef CONFIG_SYSCTL
-static struct ctl_table_header *brnf_sysctl_header;
-static int brnf_call_iptables __read_mostly = 1;
-static int brnf_call_ip6tables __read_mostly = 1;
-static int brnf_call_arptables __read_mostly = 1;
-static int brnf_filter_vlan_tagged __read_mostly;
-static int brnf_filter_pppoe_tagged __read_mostly;
-static int brnf_pass_vlan_indev __read_mostly;
-#else
-#define brnf_call_iptables 1
-#define brnf_call_ip6tables 1
-#define brnf_call_arptables 1
-#define brnf_filter_vlan_tagged 0
-#define brnf_filter_pppoe_tagged 0
-#define brnf_pass_vlan_indev 0
+	struct ctl_table_header *ctl_hdr;
 #endif
 
+	/* default value is 1 */
+	int call_iptables;
+	int call_ip6tables;
+	int call_arptables;
+
+	/* default value is 0 */
+	int filter_vlan_tagged;
+	int filter_pppoe_tagged;
+	int pass_vlan_indev;
+};
+
 #define IS_IP(skb) \
 	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP))
 
@@ -89,17 +86,28 @@
 		return 0;
 }
 
-#define IS_VLAN_IP(skb) \
-	(vlan_proto(skb) == htons(ETH_P_IP) && \
-	 brnf_filter_vlan_tagged)
+static inline bool is_vlan_ip(const struct sk_buff *skb, const struct net *net)
+{
+	struct brnf_net *brnet = net_generic(net, brnf_net_id);
 
-#define IS_VLAN_IPV6(skb) \
-	(vlan_proto(skb) == htons(ETH_P_IPV6) && \
-	 brnf_filter_vlan_tagged)
+	return vlan_proto(skb) == htons(ETH_P_IP) && brnet->filter_vlan_tagged;
+}
 
-#define IS_VLAN_ARP(skb) \
-	(vlan_proto(skb) == htons(ETH_P_ARP) &&	\
-	 brnf_filter_vlan_tagged)
+static inline bool is_vlan_ipv6(const struct sk_buff *skb,
+				const struct net *net)
+{
+	struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+	return vlan_proto(skb) == htons(ETH_P_IPV6) &&
+	       brnet->filter_vlan_tagged;
+}
+
+static inline bool is_vlan_arp(const struct sk_buff *skb, const struct net *net)
+{
+	struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+	return vlan_proto(skb) == htons(ETH_P_ARP) && brnet->filter_vlan_tagged;
+}
 
 static inline __be16 pppoe_proto(const struct sk_buff *skb)
 {
@@ -107,15 +115,23 @@
 			    sizeof(struct pppoe_hdr)));
 }
 
-#define IS_PPPOE_IP(skb) \
-	(skb->protocol == htons(ETH_P_PPP_SES) && \
-	 pppoe_proto(skb) == htons(PPP_IP) && \
-	 brnf_filter_pppoe_tagged)
+static inline bool is_pppoe_ip(const struct sk_buff *skb, const struct net *net)
+{
+	struct brnf_net *brnet = net_generic(net, brnf_net_id);
 
-#define IS_PPPOE_IPV6(skb) \
-	(skb->protocol == htons(ETH_P_PPP_SES) && \
-	 pppoe_proto(skb) == htons(PPP_IPV6) && \
-	 brnf_filter_pppoe_tagged)
+	return skb->protocol == htons(ETH_P_PPP_SES) &&
+	       pppoe_proto(skb) == htons(PPP_IP) && brnet->filter_pppoe_tagged;
+}
+
+static inline bool is_pppoe_ipv6(const struct sk_buff *skb,
+				 const struct net *net)
+{
+	struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+	return skb->protocol == htons(ETH_P_PPP_SES) &&
+	       pppoe_proto(skb) == htons(PPP_IPV6) &&
+	       brnet->filter_pppoe_tagged;
+}
 
 /* largest possible L2 header, see br_nf_dev_queue_xmit() */
 #define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN)
@@ -425,12 +441,16 @@
 	return 0;
 }
 
-static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev)
+static struct net_device *brnf_get_logical_dev(struct sk_buff *skb,
+					       const struct net_device *dev,
+					       const struct net *net)
 {
 	struct net_device *vlan, *br;
+	struct brnf_net *brnet = net_generic(net, brnf_net_id);
 
 	br = bridge_parent(dev);
-	if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
+
+	if (brnet->pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
 		return br;
 
 	vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto,
@@ -440,7 +460,7 @@
 }
 
 /* Some common code for IPv4/IPv6 */
-struct net_device *setup_pre_routing(struct sk_buff *skb)
+struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net)
 {
 	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 
@@ -451,7 +471,7 @@
 
 	nf_bridge->in_prerouting = 1;
 	nf_bridge->physindev = skb->dev;
-	skb->dev = brnf_get_logical_dev(skb, skb->dev);
+	skb->dev = brnf_get_logical_dev(skb, skb->dev, net);
 
 	if (skb->protocol == htons(ETH_P_8021Q))
 		nf_bridge->orig_proto = BRNF_PROTO_8021Q;
@@ -477,6 +497,7 @@
 	struct net_bridge_port *p;
 	struct net_bridge *br;
 	__u32 len = nf_bridge_encap_header_len(skb);
+	struct brnf_net *brnet;
 
 	if (unlikely(!pskb_may_pull(skb, len)))
 		return NF_DROP;
@@ -486,18 +507,22 @@
 		return NF_DROP;
 	br = p->br;
 
-	if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) {
-		if (!brnf_call_ip6tables && !br->nf_call_ip6tables)
+	brnet = net_generic(state->net, brnf_net_id);
+	if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
+	    is_pppoe_ipv6(skb, state->net)) {
+		if (!brnet->call_ip6tables &&
+		    !br_opt_get(br, BROPT_NF_CALL_IP6TABLES))
 			return NF_ACCEPT;
 
 		nf_bridge_pull_encap_header_rcsum(skb);
 		return br_nf_pre_routing_ipv6(priv, skb, state);
 	}
 
-	if (!brnf_call_iptables && !br->nf_call_iptables)
+	if (!brnet->call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES))
 		return NF_ACCEPT;
 
-	if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb))
+	if (!IS_IP(skb) && !is_vlan_ip(skb, state->net) &&
+	    !is_pppoe_ip(skb, state->net))
 		return NF_ACCEPT;
 
 	nf_bridge_pull_encap_header_rcsum(skb);
@@ -508,7 +533,7 @@
 	nf_bridge_put(skb->nf_bridge);
 	if (!nf_bridge_alloc(skb))
 		return NF_DROP;
-	if (!setup_pre_routing(skb))
+	if (!setup_pre_routing(skb, state->net))
 		return NF_DROP;
 
 	nf_bridge = nf_bridge_info_get(skb);
@@ -531,7 +556,7 @@
 	struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 	struct net_device *in;
 
-	if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) {
+	if (!IS_ARP(skb) && !is_vlan_arp(skb, net)) {
 
 		if (skb->protocol == htons(ETH_P_IP))
 			nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
@@ -585,9 +610,11 @@
 	if (!parent)
 		return NF_DROP;
 
-	if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
+	if (IS_IP(skb) || is_vlan_ip(skb, state->net) ||
+	    is_pppoe_ip(skb, state->net))
 		pf = NFPROTO_IPV4;
-	else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
+	else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
+		 is_pppoe_ipv6(skb, state->net))
 		pf = NFPROTO_IPV6;
 	else
 		return NF_ACCEPT;
@@ -618,7 +645,7 @@
 		skb->protocol = htons(ETH_P_IPV6);
 
 	NF_HOOK(pf, NF_INET_FORWARD, state->net, NULL, skb,
-		brnf_get_logical_dev(skb, state->in),
+		brnf_get_logical_dev(skb, state->in, state->net),
 		parent,	br_nf_forward_finish);
 
 	return NF_STOLEN;
@@ -631,17 +658,19 @@
 	struct net_bridge_port *p;
 	struct net_bridge *br;
 	struct net_device **d = (struct net_device **)(skb->cb);
+	struct brnf_net *brnet;
 
 	p = br_port_get_rcu(state->out);
 	if (p == NULL)
 		return NF_ACCEPT;
 	br = p->br;
 
-	if (!brnf_call_arptables && !br->nf_call_arptables)
+	brnet = net_generic(state->net, brnf_net_id);
+	if (!brnet->call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES))
 		return NF_ACCEPT;
 
 	if (!IS_ARP(skb)) {
-		if (!IS_VLAN_ARP(skb))
+		if (!is_vlan_arp(skb, state->net))
 			return NF_ACCEPT;
 		nf_bridge_pull_encap_header(skb);
 	}
@@ -650,7 +679,7 @@
 		return NF_DROP;
 
 	if (arp_hdr(skb)->ar_pln != 4) {
-		if (IS_VLAN_ARP(skb))
+		if (is_vlan_arp(skb, state->net))
 			nf_bridge_push_encap_header(skb);
 		return NF_ACCEPT;
 	}
@@ -805,9 +834,11 @@
 	if (!realoutdev)
 		return NF_DROP;
 
-	if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
+	if (IS_IP(skb) || is_vlan_ip(skb, state->net) ||
+	    is_pppoe_ip(skb, state->net))
 		pf = NFPROTO_IPV4;
-	else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
+	else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
+		 is_pppoe_ipv6(skb, state->net))
 		pf = NFPROTO_IPV6;
 	else
 		return NF_ACCEPT;
@@ -955,23 +986,6 @@
 	return NOTIFY_OK;
 }
 
-static void __net_exit brnf_exit_net(struct net *net)
-{
-	struct brnf_net *brnet = net_generic(net, brnf_net_id);
-
-	if (!brnet->enabled)
-		return;
-
-	nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
-	brnet->enabled = false;
-}
-
-static struct pernet_operations brnf_net_ops __read_mostly = {
-	.exit = brnf_exit_net,
-	.id   = &brnf_net_id,
-	.size = sizeof(struct brnf_net),
-};
-
 static struct notifier_block brnf_notifier __read_mostly = {
 	.notifier_call = brnf_device_event,
 };
@@ -1030,50 +1044,125 @@
 static struct ctl_table brnf_table[] = {
 	{
 		.procname	= "bridge-nf-call-arptables",
-		.data		= &brnf_call_arptables,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{
 		.procname	= "bridge-nf-call-iptables",
-		.data		= &brnf_call_iptables,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{
 		.procname	= "bridge-nf-call-ip6tables",
-		.data		= &brnf_call_ip6tables,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{
 		.procname	= "bridge-nf-filter-vlan-tagged",
-		.data		= &brnf_filter_vlan_tagged,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{
 		.procname	= "bridge-nf-filter-pppoe-tagged",
-		.data		= &brnf_filter_pppoe_tagged,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{
 		.procname	= "bridge-nf-pass-vlan-input-dev",
-		.data		= &brnf_pass_vlan_indev,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= brnf_sysctl_call_tables,
 	},
 	{ }
 };
+
+static inline void br_netfilter_sysctl_default(struct brnf_net *brnf)
+{
+	brnf->call_iptables = 1;
+	brnf->call_ip6tables = 1;
+	brnf->call_arptables = 1;
+	brnf->filter_vlan_tagged = 0;
+	brnf->filter_pppoe_tagged = 0;
+	brnf->pass_vlan_indev = 0;
+}
+
+static int br_netfilter_sysctl_init_net(struct net *net)
+{
+	struct ctl_table *table = brnf_table;
+	struct brnf_net *brnet;
+
+	if (!net_eq(net, &init_net)) {
+		table = kmemdup(table, sizeof(brnf_table), GFP_KERNEL);
+		if (!table)
+			return -ENOMEM;
+	}
+
+	brnet = net_generic(net, brnf_net_id);
+	table[0].data = &brnet->call_arptables;
+	table[1].data = &brnet->call_iptables;
+	table[2].data = &brnet->call_ip6tables;
+	table[3].data = &brnet->filter_vlan_tagged;
+	table[4].data = &brnet->filter_pppoe_tagged;
+	table[5].data = &brnet->pass_vlan_indev;
+
+	br_netfilter_sysctl_default(brnet);
+
+	brnet->ctl_hdr = register_net_sysctl(net, "net/bridge", table);
+	if (!brnet->ctl_hdr) {
+		if (!net_eq(net, &init_net))
+			kfree(table);
+
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void br_netfilter_sysctl_exit_net(struct net *net,
+					 struct brnf_net *brnet)
+{
+	struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg;
+
+	unregister_net_sysctl_table(brnet->ctl_hdr);
+	if (!net_eq(net, &init_net))
+		kfree(table);
+}
+
+static int __net_init brnf_init_net(struct net *net)
+{
+	return br_netfilter_sysctl_init_net(net);
+}
 #endif
 
+static void __net_exit brnf_exit_net(struct net *net)
+{
+	struct brnf_net *brnet;
+
+	brnet = net_generic(net, brnf_net_id);
+	if (brnet->enabled) {
+		nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
+		brnet->enabled = false;
+	}
+
+#ifdef CONFIG_SYSCTL
+	br_netfilter_sysctl_exit_net(net, brnet);
+#endif
+}
+
+static struct pernet_operations brnf_net_ops __read_mostly = {
+#ifdef CONFIG_SYSCTL
+	.init = brnf_init_net,
+#endif
+	.exit = brnf_exit_net,
+	.id   = &brnf_net_id,
+	.size = sizeof(struct brnf_net),
+};
+
 static int __init br_netfilter_init(void)
 {
 	int ret;
@@ -1088,16 +1177,6 @@
 		return ret;
 	}
 
-#ifdef CONFIG_SYSCTL
-	brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table);
-	if (brnf_sysctl_header == NULL) {
-		printk(KERN_WARNING
-		       "br_netfilter: can't register to sysctl.\n");
-		unregister_netdevice_notifier(&brnf_notifier);
-		unregister_pernet_subsys(&brnf_net_ops);
-		return -ENOMEM;
-	}
-#endif
 	RCU_INIT_POINTER(nf_br_ops, &br_ops);
 	printk(KERN_NOTICE "Bridge firewalling registered\n");
 	return 0;
@@ -1108,9 +1187,6 @@
 	RCU_INIT_POINTER(nf_br_ops, NULL);
 	unregister_netdevice_notifier(&brnf_notifier);
 	unregister_pernet_subsys(&brnf_net_ops);
-#ifdef CONFIG_SYSCTL
-	unregister_net_sysctl_table(brnf_sysctl_header);
-#endif
 }
 
 module_init(br_netfilter_init);

diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index 09d5e0c..1e0fe6b 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c

@@ -228,7 +228,7 @@
 	nf_bridge_put(skb->nf_bridge);
 	if (!nf_bridge_alloc(skb))
 		return NF_DROP;
-	if (!setup_pre_routing(skb))
+	if (!setup_pre_routing(skb, state->net))
 		return NF_DROP;
 
 	nf_bridge = nf_bridge_info_get(skb);

diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index ec2b58a..e5a5bc5 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c

@@ -1139,7 +1139,7 @@
 		spin_lock_bh(&br->lock);
 		memcpy(br->group_addr, new_addr, sizeof(br->group_addr));
 		spin_unlock_bh(&br->lock);
-		br->group_addr_set = true;
+		br_opt_toggle(br, BROPT_GROUP_ADDR_SET, true);
 		br_recalculate_fwd_mask(br);
 	}
 
@@ -1167,7 +1167,7 @@
 		u8 val;
 
 		val = nla_get_u8(data[IFLA_BR_MCAST_QUERY_USE_IFADDR]);
-		br->multicast_query_use_ifaddr = !!val;
+		br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val);
 	}
 
 	if (data[IFLA_BR_MCAST_QUERIER]) {
@@ -1244,7 +1244,7 @@
 		__u8 mcast_stats;
 
 		mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]);
-		br->multicast_stats_enabled = !!mcast_stats;
+		br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!mcast_stats);
 	}
 
 	if (data[IFLA_BR_MCAST_IGMP_VERSION]) {
@@ -1271,19 +1271,19 @@
 	if (data[IFLA_BR_NF_CALL_IPTABLES]) {
 		u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IPTABLES]);
 
-		br->nf_call_iptables = val ? true : false;
+		br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val);
 	}
 
 	if (data[IFLA_BR_NF_CALL_IP6TABLES]) {
 		u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IP6TABLES]);
 
-		br->nf_call_ip6tables = val ? true : false;
+		br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val);
 	}
 
 	if (data[IFLA_BR_NF_CALL_ARPTABLES]) {
 		u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_ARPTABLES]);
 
-		br->nf_call_arptables = val ? true : false;
+		br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val);
 	}
 #endif
 
@@ -1416,17 +1416,20 @@
 #ifdef CONFIG_BRIDGE_VLAN_FILTERING
 	if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto) ||
 	    nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid) ||
-	    nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED, br->vlan_stats_enabled))
+	    nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED,
+		       br_opt_get(br, BROPT_VLAN_STATS_ENABLED)))
 		return -EMSGSIZE;
 #endif
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, br->multicast_router) ||
-	    nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING, !br->multicast_disabled) ||
+	    nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING,
+		       br_opt_get(br, BROPT_MULTICAST_ENABLED)) ||
 	    nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR,
-		       br->multicast_query_use_ifaddr) ||
-	    nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, br->multicast_querier) ||
+		       br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)) ||
+	    nla_put_u8(skb, IFLA_BR_MCAST_QUERIER,
+		       br_opt_get(br, BROPT_MULTICAST_QUERIER)) ||
 	    nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED,
-		       br->multicast_stats_enabled) ||
+		       br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) ||
 	    nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY,
 			br->hash_elasticity) ||
 	    nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) ||
@@ -1469,11 +1472,11 @@
 #endif
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	if (nla_put_u8(skb, IFLA_BR_NF_CALL_IPTABLES,
-		       br->nf_call_iptables ? 1 : 0) ||
+		       br_opt_get(br, BROPT_NF_CALL_IPTABLES) ? 1 : 0) ||
 	    nla_put_u8(skb, IFLA_BR_NF_CALL_IP6TABLES,
-		       br->nf_call_ip6tables ? 1 : 0) ||
+		       br_opt_get(br, BROPT_NF_CALL_IP6TABLES) ? 1 : 0) ||
 	    nla_put_u8(skb, IFLA_BR_NF_CALL_ARPTABLES,
-		       br->nf_call_arptables ? 1 : 0))
+		       br_opt_get(br, BROPT_NF_CALL_ARPTABLES) ? 1 : 0))
 		return -EMSGSIZE;
 #endif
 

diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 11ed202..c18bf39 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h

@@ -54,14 +54,12 @@
 typedef struct mac_addr mac_addr;
 typedef __u16 port_id;
 
-struct bridge_id
-{
+struct bridge_id {
 	unsigned char	prio[2];
 	unsigned char	addr[ETH_ALEN];
 };
 
-struct mac_addr
-{
+struct mac_addr {
 	unsigned char	addr[ETH_ALEN];
 };
 
@@ -206,8 +204,7 @@
 	unsigned char			eth_addr[ETH_ALEN];
 };
 
-struct net_bridge_mdb_entry
-{
+struct net_bridge_mdb_entry {
 	struct hlist_node		hlist[2];
 	struct net_bridge		*br;
 	struct net_bridge_port_group __rcu *ports;
@@ -217,8 +214,7 @@
 	bool				host_joined;
 };
 
-struct net_bridge_mdb_htable
-{
+struct net_bridge_mdb_htable {
 	struct hlist_head		*mhash;
 	struct rcu_head			rcu;
 	struct net_bridge_mdb_htable	*old;
@@ -309,16 +305,31 @@
 		rcu_dereference_rtnl(dev->rx_handler_data) : NULL;
 }
 
+enum net_bridge_opts {
+	BROPT_VLAN_ENABLED,
+	BROPT_VLAN_STATS_ENABLED,
+	BROPT_NF_CALL_IPTABLES,
+	BROPT_NF_CALL_IP6TABLES,
+	BROPT_NF_CALL_ARPTABLES,
+	BROPT_GROUP_ADDR_SET,
+	BROPT_MULTICAST_ENABLED,
+	BROPT_MULTICAST_QUERIER,
+	BROPT_MULTICAST_QUERY_USE_IFADDR,
+	BROPT_MULTICAST_STATS_ENABLED,
+	BROPT_HAS_IPV6_ADDR,
+	BROPT_NEIGH_SUPPRESS_ENABLED,
+	BROPT_MTU_SET_BY_USER,
+};
+
 struct net_bridge {
 	spinlock_t			lock;
 	spinlock_t			hash_lock;
 	struct list_head		port_list;
 	struct net_device		*dev;
 	struct pcpu_sw_netstats		__percpu *stats;
+	unsigned long			options;
 	/* These fields are accessed on each packet */
 #ifdef CONFIG_BRIDGE_VLAN_FILTERING
-	u8				vlan_enabled;
-	u8				vlan_stats_enabled;
 	__be16				vlan_proto;
 	u16				default_pvid;
 	struct net_bridge_vlan_group	__rcu *vlgrp;
@@ -330,9 +341,6 @@
 		struct rtable		fake_rtable;
 		struct rt6_info		fake_rt6_info;
 	};
-	bool				nf_call_iptables;
-	bool				nf_call_ip6tables;
-	bool				nf_call_arptables;
 #endif
 	u16				group_fwd_mask;
 	u16				group_fwd_mask_required;
@@ -340,7 +348,6 @@
 	/* STP */
 	bridge_id			designated_root;
 	bridge_id			bridge_id;
-	u32				root_path_cost;
 	unsigned char			topology_change;
 	unsigned char			topology_change_detected;
 	u16				root_port;
@@ -352,9 +359,9 @@
 	unsigned long			bridge_hello_time;
 	unsigned long			bridge_forward_delay;
 	unsigned long			bridge_ageing_time;
+	u32				root_path_cost;
 
 	u8				group_addr[ETH_ALEN];
-	bool				group_addr_set;
 
 	enum {
 		BR_NO_STP, 		/* no spanning tree */
@@ -363,13 +370,6 @@
 	} stp_enabled;
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
-	unsigned char			multicast_router;
-
-	u8				multicast_disabled:1;
-	u8				multicast_querier:1;
-	u8				multicast_query_use_ifaddr:1;
-	u8				has_ipv6_addr:1;
-	u8				multicast_stats_enabled:1;
 
 	u32				hash_elasticity;
 	u32				hash_max;
@@ -378,7 +378,11 @@
 	u32				multicast_startup_query_count;
 
 	u8				multicast_igmp_version;
-
+	u8				multicast_router;
+#if IS_ENABLED(CONFIG_IPV6)
+	u8				multicast_mld_version;
+#endif
+	spinlock_t			multicast_lock;
 	unsigned long			multicast_last_member_interval;
 	unsigned long			multicast_membership_interval;
 	unsigned long			multicast_querier_interval;
@@ -386,7 +390,6 @@
 	unsigned long			multicast_query_response_interval;
 	unsigned long			multicast_startup_query_interval;
 
-	spinlock_t			multicast_lock;
 	struct net_bridge_mdb_htable __rcu *mdb;
 	struct hlist_head		router_list;
 
@@ -399,7 +402,6 @@
 	struct bridge_mcast_other_query	ip6_other_query;
 	struct bridge_mcast_own_query	ip6_own_query;
 	struct bridge_mcast_querier	ip6_querier;
-	u8				multicast_mld_version;
 #endif /* IS_ENABLED(CONFIG_IPV6) */
 #endif
 
@@ -413,8 +415,6 @@
 #ifdef CONFIG_NET_SWITCHDEV
 	int offload_fwd_mark;
 #endif
-	bool				neigh_suppress_enabled;
-	bool				mtu_set_by_user;
 	struct hlist_head		fdb_list;
 };
 
@@ -492,6 +492,14 @@
 	return true;
 }
 
+static inline int br_opt_get(const struct net_bridge *br,
+			     enum net_bridge_opts opt)
+{
+	return test_bit(opt, &br->options);
+}
+
+void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on);
+
 /* br_device.c */
 void br_dev_setup(struct net_device *dev);
 void br_dev_delete(struct net_device *dev, struct list_head *list);
@@ -698,8 +706,8 @@
 {
 	bool own_querier_enabled;
 
-	if (br->multicast_querier) {
-		if (is_ipv6 && !br->has_ipv6_addr)
+	if (br_opt_get(br, BROPT_MULTICAST_QUERIER)) {
+		if (is_ipv6 && !br_opt_get(br, BROPT_HAS_IPV6_ADDR))
 			own_querier_enabled = false;
 		else
 			own_querier_enabled = true;

diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 0318a69..c93c572 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c

@@ -303,7 +303,7 @@
 	ether_addr_copy(br->group_addr, new_addr);
 	spin_unlock_bh(&br->lock);
 
-	br->group_addr_set = true;
+	br_opt_toggle(br, BROPT_GROUP_ADDR_SET, true);
 	br_recalculate_fwd_mask(br);
 	netdev_state_change(br->dev);
 
@@ -349,7 +349,7 @@
 				       char *buf)
 {
 	struct net_bridge *br = to_bridge(d);
-	return sprintf(buf, "%d\n", !br->multicast_disabled);
+	return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_ENABLED));
 }
 
 static ssize_t multicast_snooping_store(struct device *d,
@@ -365,12 +365,13 @@
 					       char *buf)
 {
 	struct net_bridge *br = to_bridge(d);
-	return sprintf(buf, "%d\n", br->multicast_query_use_ifaddr);
+	return sprintf(buf, "%d\n",
+		       br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR));
 }
 
 static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val)
 {
-	br->multicast_query_use_ifaddr = !!val;
+	br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val);
 	return 0;
 }
 
@@ -388,7 +389,7 @@
 				      char *buf)
 {
 	struct net_bridge *br = to_bridge(d);
-	return sprintf(buf, "%d\n", br->multicast_querier);
+	return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_QUERIER));
 }
 
 static ssize_t multicast_querier_store(struct device *d,
@@ -636,12 +637,13 @@
 {
 	struct net_bridge *br = to_bridge(d);
 
-	return sprintf(buf, "%u\n", br->multicast_stats_enabled);
+	return sprintf(buf, "%d\n",
+		       br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED));
 }
 
 static int set_stats_enabled(struct net_bridge *br, unsigned long val)
 {
-	br->multicast_stats_enabled = !!val;
+	br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!val);
 	return 0;
 }
 
@@ -678,12 +680,12 @@
 	struct device *d, struct device_attribute *attr, char *buf)
 {
 	struct net_bridge *br = to_bridge(d);
-	return sprintf(buf, "%u\n", br->nf_call_iptables);
+	return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IPTABLES));
 }
 
 static int set_nf_call_iptables(struct net_bridge *br, unsigned long val)
 {
-	br->nf_call_iptables = val ? true : false;
+	br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val);
 	return 0;
 }
 
@@ -699,12 +701,12 @@
 	struct device *d, struct device_attribute *attr, char *buf)
 {
 	struct net_bridge *br = to_bridge(d);
-	return sprintf(buf, "%u\n", br->nf_call_ip6tables);
+	return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IP6TABLES));
 }
 
 static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val)
 {
-	br->nf_call_ip6tables = val ? true : false;
+	br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val);
 	return 0;
 }
 
@@ -720,12 +722,12 @@
 	struct device *d, struct device_attribute *attr, char *buf)
 {
 	struct net_bridge *br = to_bridge(d);
-	return sprintf(buf, "%u\n", br->nf_call_arptables);
+	return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_ARPTABLES));
 }
 
 static int set_nf_call_arptables(struct net_bridge *br, unsigned long val)
 {
-	br->nf_call_arptables = val ? true : false;
+	br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val);
 	return 0;
 }
 
@@ -743,7 +745,7 @@
 				   char *buf)
 {
 	struct net_bridge *br = to_bridge(d);
-	return sprintf(buf, "%d\n", br->vlan_enabled);
+	return sprintf(buf, "%d\n", br_opt_get(br, BROPT_VLAN_ENABLED));
 }
 
 static ssize_t vlan_filtering_store(struct device *d,
@@ -791,7 +793,7 @@
 				       char *buf)
 {
 	struct net_bridge *br = to_bridge(d);
-	return sprintf(buf, "%u\n", br->vlan_stats_enabled);
+	return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_ENABLED));
 }
 
 static ssize_t vlan_stats_enabled_store(struct device *d,

diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 5f3950f..f1744b6 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c

@@ -386,7 +386,7 @@
 			return NULL;
 		}
 	}
-	if (br->vlan_stats_enabled) {
+	if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) {
 		stats = this_cpu_ptr(v->stats);
 		u64_stats_update_begin(&stats->syncp);
 		stats->tx_bytes += skb->len;
@@ -475,14 +475,14 @@
 			skb->vlan_tci |= pvid;
 
 		/* if stats are disabled we can avoid the lookup */
-		if (!br->vlan_stats_enabled)
+		if (!br_opt_get(br, BROPT_VLAN_STATS_ENABLED))
 			return true;
 	}
 	v = br_vlan_find(vg, *vid);
 	if (!v || !br_vlan_should_use(v))
 		goto drop;
 
-	if (br->vlan_stats_enabled) {
+	if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) {
 		stats = this_cpu_ptr(v->stats);
 		u64_stats_update_begin(&stats->syncp);
 		stats->rx_bytes += skb->len;
@@ -504,7 +504,7 @@
 	/* If VLAN filtering is disabled on the bridge, all packets are
 	 * permitted.
 	 */
-	if (!br->vlan_enabled) {
+	if (!br_opt_get(br, BROPT_VLAN_ENABLED)) {
 		BR_INPUT_SKB_CB(skb)->vlan_filtered = false;
 		return true;
 	}
@@ -538,7 +538,7 @@
 	struct net_bridge *br = p->br;
 
 	/* If filtering was disabled at input, let it pass. */
-	if (!br->vlan_enabled)
+	if (!br_opt_get(br, BROPT_VLAN_ENABLED))
 		return true;
 
 	vg = nbp_vlan_group_rcu(p);
@@ -700,11 +700,12 @@
 /* Must be protected by RTNL. */
 static void recalculate_group_addr(struct net_bridge *br)
 {
-	if (br->group_addr_set)
+	if (br_opt_get(br, BROPT_GROUP_ADDR_SET))
 		return;
 
 	spin_lock_bh(&br->lock);
-	if (!br->vlan_enabled || br->vlan_proto == htons(ETH_P_8021Q)) {
+	if (!br_opt_get(br, BROPT_VLAN_ENABLED) ||
+	    br->vlan_proto == htons(ETH_P_8021Q)) {
 		/* Bridge Group Address */
 		br->group_addr[5] = 0x00;
 	} else { /* vlan_enabled && ETH_P_8021AD */
@@ -717,7 +718,8 @@
 /* Must be protected by RTNL. */
 void br_recalculate_fwd_mask(struct net_bridge *br)
 {
-	if (!br->vlan_enabled || br->vlan_proto == htons(ETH_P_8021Q))
+	if (!br_opt_get(br, BROPT_VLAN_ENABLED) ||
+	    br->vlan_proto == htons(ETH_P_8021Q))
 		br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT;
 	else /* vlan_enabled && ETH_P_8021AD */
 		br->group_fwd_mask_required = BR_GROUPFWD_8021AD &
@@ -734,14 +736,14 @@
 	};
 	int err;
 
-	if (br->vlan_enabled == val)
+	if (br_opt_get(br, BROPT_VLAN_ENABLED) == !!val)
 		return 0;
 
 	err = switchdev_port_attr_set(br->dev, &attr);
 	if (err && err != -EOPNOTSUPP)
 		return err;
 
-	br->vlan_enabled = val;
+	br_opt_toggle(br, BROPT_VLAN_ENABLED, !!val);
 	br_manage_promisc(br);
 	recalculate_group_addr(br);
 	br_recalculate_fwd_mask(br);
@@ -758,7 +760,7 @@
 {
 	struct net_bridge *br = netdev_priv(dev);
 
-	return !!br->vlan_enabled;
+	return br_opt_get(br, BROPT_VLAN_ENABLED);
 }
 EXPORT_SYMBOL_GPL(br_vlan_enabled);
 
@@ -824,7 +826,7 @@
 	switch (val) {
 	case 0:
 	case 1:
-		br->vlan_stats_enabled = val;
+		br_opt_toggle(br, BROPT_VLAN_STATS_ENABLED, !!val);
 		break;
 	default:
 		return -EINVAL;
@@ -970,7 +972,7 @@
 		goto out;
 
 	/* Only allow default pvid change when filtering is disabled */
-	if (br->vlan_enabled) {
+	if (br_opt_get(br, BROPT_VLAN_ENABLED)) {
 		pr_info_once("Please disable vlan filtering to change default_pvid\n");
 		err = -EPERM;
 		goto out;
@@ -1024,7 +1026,7 @@
 		.orig_dev = p->br->dev,
 		.id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
 		.flags = SWITCHDEV_F_SKIP_EOPNOTSUPP,
-		.u.vlan_filtering = p->br->vlan_enabled,
+		.u.vlan_filtering = br_opt_get(p->br, BROPT_VLAN_ENABLED),
 	};
 	struct net_bridge_vlan_group *vg;
 	int ret = -ENOMEM;

diff --git a/net/core/dev.c b/net/core/dev.c
index 1c0224e..4f655e5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c

@@ -1822,7 +1822,7 @@
 #endif
 
 static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 static atomic_t netstamp_needed_deferred;
 static atomic_t netstamp_wanted;
 static void netstamp_clear(struct work_struct *work)
@@ -1841,7 +1841,7 @@
 
 void net_enable_timestamp(void)
 {
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 	int wanted;
 
 	while (1) {
@@ -1861,7 +1861,7 @@
 
 void net_disable_timestamp(void)
 {
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 	int wanted;
 
 	while (1) {
@@ -4306,14 +4306,14 @@
 	/* Reinjected packets coming from act_mirred or similar should
 	 * not get XDP generic processing.
 	 */
-	if (skb_cloned(skb) || skb_is_tc_redirected(skb))
+	if (skb_is_tc_redirected(skb))
 		return XDP_PASS;
 
 	/* XDP packets must be linear and must have sufficient headroom
 	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
 	 * native XDP provides, thus we need to do it here as well.
 	 */
-	if (skb_is_nonlinear(skb) ||
+	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
 	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
 		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
 		int troom = skb->tail + skb->data_len - skb->end;

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 0ff3953..8916c5d 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c

@@ -968,7 +968,7 @@
 
 	frh = nlmsg_data(nlh);
 	frh->family = ops->family;
-	frh->table = rule->table;
+	frh->table = rule->table < 256 ? rule->table : RT_TABLE_COMPAT;
 	if (nla_put_u32(skb, FRA_TABLE, rule->table))
 		goto nla_put_failure;
 	if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen))

diff --git a/net/core/filter.c b/net/core/filter.c
index 9daf1a4..40b3af0 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c

@@ -3207,7 +3207,7 @@
 		return err;
 	}
 	default:
-		break;
+		return -EBADRQC;
 	}
 	return 0;
 }

diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
index 66a9521..9c0dd31 100644
--- a/net/dsa/tag_qca.c
+++ b/net/dsa/tag_qca.c

@@ -41,7 +41,7 @@
 	struct dsa_port *dp = dsa_slave_to_port(dev);
 	u16 *phdr, hdr;
 
-	if (skb_cow_head(skb, 0) < 0)
+	if (skb_cow_head(skb, QCA_HDR_LEN) < 0)
 		return NULL;
 
 	skb_push(skb, QCA_HDR_LEN);

diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 7446b98..433e35a 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile

@@ -20,6 +20,7 @@
 
 obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
+obj-$(CONFIG_SYSFS) += sysfs_net_ipv4.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
 obj-$(CONFIG_IP_MROUTE) += ipmr.o

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 3955a6d..fee29dc 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c

@@ -348,12 +348,18 @@
 
 static inline void empty_child_inc(struct key_vector *n)
 {
-	++tn_info(n)->empty_children ? : ++tn_info(n)->full_children;
+	tn_info(n)->empty_children++;
+
+	if (!tn_info(n)->empty_children)
+		tn_info(n)->full_children++;
 }
 
 static inline void empty_child_dec(struct key_vector *n)
 {
-	tn_info(n)->empty_children-- ? : tn_info(n)->full_children--;
+	if (!tn_info(n)->empty_children)
+		tn_info(n)->full_children--;
+
+	tn_info(n)->empty_children--;
 }
 
 static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 82f341e..aa3fd61 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c

@@ -343,6 +343,8 @@
 		return -EINVAL;
 
 	new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
+	if (on && !new_ra)
+		return -ENOMEM;
 
 	mutex_lock(&net->ipv4.ra_mutex);
 	for (rap = &net->ipv4.ra_chain;

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index ad132b6..6357b03 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c

@@ -222,6 +222,21 @@
 	return ret;
 }
 
+/* Validate changes from /proc interface. */
+static int proc_tcp_default_init_rwnd(struct ctl_table *ctl, int write,
+				      void __user *buffer,
+				      size_t *lenp, loff_t *ppos)
+{
+	int old_value = *(int *)ctl->data;
+	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+	int new_value = *(int *)ctl->data;
+
+	if (write && ret == 0 && (new_value < 3 || new_value > 100))
+		*(int *)ctl->data = old_value;
+
+	return ret;
+}
+
 static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
 				       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -1191,6 +1206,13 @@
 		.extra2		= &thousand,
 	},
 	{
+		.procname       = "tcp_default_init_rwnd",
+		.data           = &init_net.ipv4.sysctl_tcp_default_init_rwnd,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_tcp_default_init_rwnd
+	},
+	{
 		.procname	= "tcp_wmem",
 		.data		= &init_net.ipv4.sysctl_tcp_wmem,
 		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_wmem),

diff --git a/net/ipv4/sysfs_net_ipv4.c b/net/ipv4/sysfs_net_ipv4.c
new file mode 100644
index 0000000..35a651aa
--- /dev/null
+++ b/net/ipv4/sysfs_net_ipv4.c

@@ -0,0 +1,88 @@
+/*
+ * net/ipv4/sysfs_net_ipv4.c
+ *
+ * sysfs-based networking knobs (so we can, unlike with sysctl, control perms)
+ *
+ * Copyright (C) 2008 Google, Inc.
+ *
+ * Robert Love <rlove@google.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <net/tcp.h>
+
+#define CREATE_IPV4_FILE(_name, _var) \
+static ssize_t _name##_show(struct kobject *kobj, \
+			    struct kobj_attribute *attr, char *buf) \
+{ \
+	return sprintf(buf, "%d\n", _var); \
+} \
+static ssize_t _name##_store(struct kobject *kobj, \
+			     struct kobj_attribute *attr, \
+			     const char *buf, size_t count) \
+{ \
+	int val, ret; \
+	ret = sscanf(buf, "%d", &val); \
+	if (ret != 1) \
+		return -EINVAL; \
+	if (val < 0) \
+		return -EINVAL; \
+	_var = val; \
+	return count; \
+} \
+static struct kobj_attribute _name##_attr = \
+	__ATTR(_name, 0644, _name##_show, _name##_store)
+
+CREATE_IPV4_FILE(tcp_wmem_min, init_net.ipv4.sysctl_tcp_wmem[0]);
+CREATE_IPV4_FILE(tcp_wmem_def, init_net.ipv4.sysctl_tcp_wmem[1]);
+CREATE_IPV4_FILE(tcp_wmem_max, init_net.ipv4.sysctl_tcp_wmem[2]);
+
+CREATE_IPV4_FILE(tcp_rmem_min, init_net.ipv4.sysctl_tcp_rmem[0]);
+CREATE_IPV4_FILE(tcp_rmem_def, init_net.ipv4.sysctl_tcp_rmem[1]);
+CREATE_IPV4_FILE(tcp_rmem_max, init_net.ipv4.sysctl_tcp_rmem[2]);
+
+static struct attribute *ipv4_attrs[] = {
+	&tcp_wmem_min_attr.attr,
+	&tcp_wmem_def_attr.attr,
+	&tcp_wmem_max_attr.attr,
+	&tcp_rmem_min_attr.attr,
+	&tcp_rmem_def_attr.attr,
+	&tcp_rmem_max_attr.attr,
+	NULL
+};
+
+static struct attribute_group ipv4_attr_group = {
+	.attrs = ipv4_attrs,
+};
+
+static __init int sysfs_ipv4_init(void)
+{
+	struct kobject *ipv4_kobject;
+	int ret;
+
+	ipv4_kobject = kobject_create_and_add("ipv4", kernel_kobj);
+	if (!ipv4_kobject)
+		return -ENOMEM;
+
+	ret = sysfs_create_group(ipv4_kobject, &ipv4_attr_group);
+	if (ret) {
+		kobject_put(ipv4_kobject);
+		return ret;
+	}
+
+	return 0;
+}
+
+subsys_initcall(sysfs_ipv4_init);

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 6da3930..5e7d66b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c

@@ -2581,6 +2581,7 @@
 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
+	net->ipv4.sysctl_tcp_default_init_rwnd = TCP_INIT_CWND * 2;
 	if (net != &init_net) {
 		memcpy(net->ipv4.sysctl_tcp_rmem,
 		       init_net.ipv4.sysctl_tcp_rmem,

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index cc4ba42..fc4ed9e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c

@@ -232,6 +232,7 @@
 			(*rcv_wscale)++;
 		}
 	}
+
 	/* Set the clamp no higher than max representable value */
 	(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
 }

diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index ae365df..1af240dc 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c

@@ -166,15 +166,15 @@
  * to explore inner IPv6 header, eg. ICMPv6 error messages.
  *
  * If target header is found, its offset is set in *offset and return protocol
- * number. Otherwise, return -1.
+ * number. Otherwise, return -ENOENT or -EBADMSG.
  *
  * If the first fragment doesn't contain the final protocol header or
  * NEXTHDR_NONE it is considered invalid.
  *
  * Note that non-1st fragment is special case that "the protocol number
  * of last header" is "next header" field in Fragment header. In this case,
- * *offset is meaningless and fragment offset is stored in *fragoff if fragoff
- * isn't NULL.
+ * *offset is meaningless. If fragoff is not NULL, the fragment offset is
+ * stored in *fragoff; if it is NULL, return -EINVAL.
  *
  * if flags is not NULL and it's a fragment, then the frag flag
  * IP6_FH_F_FRAG will be set. If it's an AH header, the
@@ -251,9 +251,12 @@
 				if (target < 0 &&
 				    ((!ipv6_ext_hdr(hp->nexthdr)) ||
 				     hp->nexthdr == NEXTHDR_NONE)) {
-					if (fragoff)
+					if (fragoff) {
 						*fragoff = _frag_off;
-					return hp->nexthdr;
+						return hp->nexthdr;
+					} else {
+						return -EINVAL;
+					}
 				}
 				if (!found)
 					return -ENOENT;

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 7091568..5e8979c 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c

@@ -981,8 +981,7 @@
 					found++;
 					break;
 				}
-				if (rt_can_ecmp)
-					fallback_ins = fallback_ins ?: ins;
+				fallback_ins = fallback_ins ?: ins;
 				goto next_iter;
 			}
 
@@ -1025,7 +1024,9 @@
 	}
 
 	if (fallback_ins && !found) {
-		/* No ECMP-able route found, replace first non-ECMP one */
+		/* No matching route with same ecmp-able-ness found, replace
+		 * first matching route
+		 */
 		ins = fallback_ins;
 		iter = rcu_dereference_protected(*ins,
 				    lockdep_is_held(&rt->fib6_table->tb6_lock));

diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index c0cac9c..4bc97b1 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c

@@ -68,6 +68,8 @@
 		return -ENOPROTOOPT;
 
 	new_ra = (sel >= 0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
+	if (sel >= 0 && !new_ra)
+		return -ENOMEM;
 
 	write_lock_bh(&ip6_ra_lock);
 	for (rap = &ip6_ra_chain; (ra = *rap) != NULL; rap = &ra->next) {

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f8fe4c9..9c36a74 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c

@@ -4514,6 +4514,7 @@
 		 */
 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
 						     NLM_F_REPLACE);
+		cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
 		nhn++;
 	}
 

diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index f101a64..7fa9871 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c

@@ -945,16 +945,22 @@
 				elem_parse_failed = true;
 			break;
 		case WLAN_EID_VHT_OPERATION:
-			if (elen >= sizeof(struct ieee80211_vht_operation))
+			if (elen >= sizeof(struct ieee80211_vht_operation)) {
 				elems->vht_operation = (void *)pos;
-			else
-				elem_parse_failed = true;
+				if (calc_crc)
+					crc = crc32_be(crc, pos - 2, elen + 2);
+				break;
+			}
+			elem_parse_failed = true;
 			break;
 		case WLAN_EID_OPMODE_NOTIF:
-			if (elen > 0)
+			if (elen > 0) {
 				elems->opmode_notif = pos;
-			else
-				elem_parse_failed = true;
+				if (calc_crc)
+					crc = crc32_be(crc, pos - 2, elen + 2);
+				break;
+			}
+			elem_parse_failed = true;
 			break;
 		case WLAN_EID_MESH_ID:
 			elems->mesh_id = pos;

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index e0fb56d..bc6983f 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig

@@ -1461,6 +1461,29 @@
 	  If you want to compile it as a module, say M here and read
 	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
 
+config NETFILTER_XT_MATCH_QUOTA2
+	tristate '"quota2" match support'
+	depends on NETFILTER_ADVANCED
+	help
+	  This option adds a `quota2' match, which allows to match on a
+	  byte counter correctly and not per CPU.
+	  It allows naming the quotas.
+	  This is based on http://xtables-addons.git.sourceforge.net
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
+
+config NETFILTER_XT_MATCH_QUOTA2_LOG
+	bool '"quota2" Netfilter LOG support'
+	depends on NETFILTER_XT_MATCH_QUOTA2
+	default n
+	help
+	  This option allows `quota2' to log ONCE when a quota limit
+	  is passed. It logs via NETLINK using the NETLINK_NFLOG family.
+	  It logs similarly to how ipt_ULOG would without data.
+
+	  If unsure, say `N'.
+
 config NETFILTER_XT_MATCH_RATEEST
 	tristate '"rateest" match support'
 	depends on NETFILTER_ADVANCED

diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 16895e0..9c87ed5 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile

@@ -191,6 +191,7 @@
 obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA2) += xt_quota2.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_RATEEST) += xt_rateest.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_RECENT) += xt_recent.o

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 93aaec3..dc240cb 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c

@@ -33,7 +33,7 @@
 DEFINE_PER_CPU(bool, nf_skb_duplicated);
 EXPORT_SYMBOL_GPL(nf_skb_duplicated);
 
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 EXPORT_SYMBOL(nf_hooks_needed);
 #endif
@@ -347,7 +347,7 @@
 	if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
 		net_inc_ingress_queue();
 #endif
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 	static_key_slow_inc(&nf_hooks_needed[pf][reg->hooknum]);
 #endif
 	BUG_ON(p == new_hooks);
@@ -405,7 +405,7 @@
 		if (pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS)
 			net_dec_ingress_queue();
 #endif
-#ifdef CONFIG_JUMP_LABEL
+#ifdef HAVE_JUMP_LABEL
 		static_key_slow_dec(&nf_hooks_needed[pf][reg->hooknum]);
 #endif
 	} else {

diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index e5444f3..1c6d15e 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c

@@ -218,8 +218,9 @@
 }
 
 static const struct nla_policy nft_tunnel_opts_erspan_policy[NFTA_TUNNEL_KEY_ERSPAN_MAX + 1] = {
+	[NFTA_TUNNEL_KEY_ERSPAN_VERSION]	= { .type = NLA_U32 },
 	[NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX]	= { .type = NLA_U32 },
-	[NFTA_TUNNEL_KEY_ERSPAN_V2_DIR]	= { .type = NLA_U8 },
+	[NFTA_TUNNEL_KEY_ERSPAN_V2_DIR]		= { .type = NLA_U8 },
 	[NFTA_TUNNEL_KEY_ERSPAN_V2_HWID]	= { .type = NLA_U8 },
 };
 
@@ -466,8 +467,8 @@
 static int nft_tunnel_ports_dump(struct sk_buff *skb,
 				 struct ip_tunnel_info *info)
 {
-	if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, htons(info->key.tp_src)) < 0 ||
-	    nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, htons(info->key.tp_dst)) < 0)
+	if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, info->key.tp_src) < 0 ||
+	    nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, info->key.tp_dst) < 0)
 		return -1;
 
 	return 0;

diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index 25453a1..559acfa 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c

@@ -5,6 +5,7 @@
  * After timer expires a kevent will be sent.
  *
  * Copyright (C) 2004, 2010 Nokia Corporation
+ *
  * Written by Timo Teras <ext-timo.teras@nokia.com>
  *
  * Converted to x_tables and reworked for upstream inclusion
@@ -38,8 +39,17 @@
 #include <linux/netfilter/xt_IDLETIMER.h>
 #include <linux/kdev_t.h>
 #include <linux/kobject.h>
+#include <linux/skbuff.h>
 #include <linux/workqueue.h>
 #include <linux/sysfs.h>
+#include <linux/rtc.h>
+#include <linux/time.h>
+#include <linux/math64.h>
+#include <linux/suspend.h>
+#include <linux/notifier.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/inet_sock.h>
 
 struct idletimer_tg_attr {
 	struct attribute attr;
@@ -55,14 +65,110 @@
 	struct kobject *kobj;
 	struct idletimer_tg_attr attr;
 
+	struct timespec delayed_timer_trigger;
+	struct timespec last_modified_timer;
+	struct timespec last_suspend_time;
+	struct notifier_block pm_nb;
+
+	int timeout;
 	unsigned int refcnt;
+	bool work_pending;
+	bool send_nl_msg;
+	bool active;
+	uid_t uid;
 };
 
 static LIST_HEAD(idletimer_tg_list);
 static DEFINE_MUTEX(list_mutex);
+static DEFINE_SPINLOCK(timestamp_lock);
 
 static struct kobject *idletimer_tg_kobj;
 
+static bool check_for_delayed_trigger(struct idletimer_tg *timer,
+		struct timespec *ts)
+{
+	bool state;
+	struct timespec temp;
+	spin_lock_bh(&timestamp_lock);
+	timer->work_pending = false;
+	if ((ts->tv_sec - timer->last_modified_timer.tv_sec) > timer->timeout ||
+			timer->delayed_timer_trigger.tv_sec != 0) {
+		state = false;
+		temp.tv_sec = timer->timeout;
+		temp.tv_nsec = 0;
+		if (timer->delayed_timer_trigger.tv_sec != 0) {
+			temp = timespec_add(timer->delayed_timer_trigger, temp);
+			ts->tv_sec = temp.tv_sec;
+			ts->tv_nsec = temp.tv_nsec;
+			timer->delayed_timer_trigger.tv_sec = 0;
+			timer->work_pending = true;
+			schedule_work(&timer->work);
+		} else {
+			temp = timespec_add(timer->last_modified_timer, temp);
+			ts->tv_sec = temp.tv_sec;
+			ts->tv_nsec = temp.tv_nsec;
+		}
+	} else {
+		state = timer->active;
+	}
+	spin_unlock_bh(&timestamp_lock);
+	return state;
+}
+
+static void notify_netlink_uevent(const char *iface, struct idletimer_tg *timer)
+{
+	char iface_msg[NLMSG_MAX_SIZE];
+	char state_msg[NLMSG_MAX_SIZE];
+	char timestamp_msg[NLMSG_MAX_SIZE];
+	char uid_msg[NLMSG_MAX_SIZE];
+	char *envp[] = { iface_msg, state_msg, timestamp_msg, uid_msg, NULL };
+	int res;
+	struct timespec ts;
+	uint64_t time_ns;
+	bool state;
+
+	res = snprintf(iface_msg, NLMSG_MAX_SIZE, "INTERFACE=%s",
+		       iface);
+	if (NLMSG_MAX_SIZE <= res) {
+		pr_err("message too long (%d)", res);
+		return;
+	}
+
+	get_monotonic_boottime(&ts);
+	state = check_for_delayed_trigger(timer, &ts);
+	res = snprintf(state_msg, NLMSG_MAX_SIZE, "STATE=%s",
+			state ? "active" : "inactive");
+
+	if (NLMSG_MAX_SIZE <= res) {
+		pr_err("message too long (%d)", res);
+		return;
+	}
+
+	if (state) {
+		res = snprintf(uid_msg, NLMSG_MAX_SIZE, "UID=%u", timer->uid);
+		if (NLMSG_MAX_SIZE <= res)
+			pr_err("message too long (%d)", res);
+	} else {
+		res = snprintf(uid_msg, NLMSG_MAX_SIZE, "UID=");
+		if (NLMSG_MAX_SIZE <= res)
+			pr_err("message too long (%d)", res);
+	}
+
+	time_ns = timespec_to_ns(&ts);
+	res = snprintf(timestamp_msg, NLMSG_MAX_SIZE, "TIME_NS=%llu", time_ns);
+	if (NLMSG_MAX_SIZE <= res) {
+		timestamp_msg[0] = '\0';
+		pr_err("message too long (%d)", res);
+	}
+
+	pr_debug("putting nlmsg: <%s> <%s> <%s> <%s>\n", iface_msg, state_msg,
+		 timestamp_msg, uid_msg);
+	kobject_uevent_env(idletimer_tg_kobj, KOBJ_CHANGE, envp);
+	return;
+
+
+}
+
 static
 struct idletimer_tg *__idletimer_tg_find_by_label(const char *label)
 {
@@ -83,6 +189,7 @@
 {
 	struct idletimer_tg *timer;
 	unsigned long expires = 0;
+	unsigned long now = jiffies;
 
 	mutex_lock(&list_mutex);
 
@@ -92,11 +199,15 @@
 
 	mutex_unlock(&list_mutex);
 
-	if (time_after(expires, jiffies))
+	if (time_after(expires, now))
 		return sprintf(buf, "%u\n",
-			       jiffies_to_msecs(expires - jiffies) / 1000);
+			       jiffies_to_msecs(expires - now) / 1000);
 
-	return sprintf(buf, "0\n");
+	if (timer->send_nl_msg)
+		return sprintf(buf, "0 %d\n",
+			jiffies_to_msecs(now - expires) / 1000);
+	else
+		return sprintf(buf, "0\n");
 }
 
 static void idletimer_tg_work(struct work_struct *work)
@@ -105,6 +216,9 @@
 						  work);
 
 	sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name);
+
+	if (timer->send_nl_msg)
+		notify_netlink_uevent(timer->attr.attr.name, timer);
 }
 
 static void idletimer_tg_expired(struct timer_list *t)
@@ -112,8 +226,55 @@
 	struct idletimer_tg *timer = from_timer(timer, t, timer);
 
 	pr_debug("timer %s expired\n", timer->attr.attr.name);
-
+	spin_lock_bh(&timestamp_lock);
+	timer->active = false;
+	timer->work_pending = true;
 	schedule_work(&timer->work);
+	spin_unlock_bh(&timestamp_lock);
+}
+
+static int idletimer_resume(struct notifier_block *notifier,
+		unsigned long pm_event, void *unused)
+{
+	struct timespec ts;
+	unsigned long time_diff, now = jiffies;
+	struct idletimer_tg *timer = container_of(notifier,
+			struct idletimer_tg, pm_nb);
+	if (!timer)
+		return NOTIFY_DONE;
+	switch (pm_event) {
+	case PM_SUSPEND_PREPARE:
+		get_monotonic_boottime(&timer->last_suspend_time);
+		break;
+	case PM_POST_SUSPEND:
+		spin_lock_bh(&timestamp_lock);
+		if (!timer->active) {
+			spin_unlock_bh(&timestamp_lock);
+			break;
+		}
+		/* since jiffies are not updated when suspended now represents
+		 * the time it would have suspended */
+		if (time_after(timer->timer.expires, now)) {
+			get_monotonic_boottime(&ts);
+			ts = timespec_sub(ts, timer->last_suspend_time);
+			time_diff = timespec_to_jiffies(&ts);
+			if (timer->timer.expires > (time_diff + now)) {
+				mod_timer_pending(&timer->timer,
+						(timer->timer.expires - time_diff));
+			} else {
+				del_timer(&timer->timer);
+				timer->timer.expires = 0;
+				timer->active = false;
+				timer->work_pending = true;
+				schedule_work(&timer->work);
+			}
+		}
+		spin_unlock_bh(&timestamp_lock);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_DONE;
 }
 
 static int idletimer_check_sysfs_name(const char *name, unsigned int size)
@@ -165,6 +326,21 @@
 
 	timer_setup(&info->timer->timer, idletimer_tg_expired, 0);
 	info->timer->refcnt = 1;
+	info->timer->send_nl_msg = (info->send_nl_msg == 0) ? false : true;
+	info->timer->active = true;
+	info->timer->timeout = info->timeout;
+
+	info->timer->delayed_timer_trigger.tv_sec = 0;
+	info->timer->delayed_timer_trigger.tv_nsec = 0;
+	info->timer->work_pending = false;
+	info->timer->uid = 0;
+	get_monotonic_boottime(&info->timer->last_modified_timer);
+
+	info->timer->pm_nb.notifier_call = idletimer_resume;
+	ret = register_pm_notifier(&info->timer->pm_nb);
+	if (ret)
+		printk(KERN_WARNING "[%s] Failed to register pm notifier %d\n",
+				__func__, ret);
 
 	INIT_WORK(&info->timer->work, idletimer_tg_work);
 
@@ -181,6 +357,42 @@
 	return ret;
 }
 
+static void reset_timer(const struct idletimer_tg_info *info,
+			struct sk_buff *skb)
+{
+	unsigned long now = jiffies;
+	struct idletimer_tg *timer = info->timer;
+	bool timer_prev;
+
+	spin_lock_bh(&timestamp_lock);
+	timer_prev = timer->active;
+	timer->active = true;
+	/* timer_prev is used to guard overflow problem in time_before*/
+	if (!timer_prev || time_before(timer->timer.expires, now)) {
+		pr_debug("Starting Checkentry timer (Expired, Jiffies): %lu, %lu\n",
+				timer->timer.expires, now);
+
+		/* Stores the uid resposible for waking up the radio */
+		if (skb && (skb->sk)) {
+			timer->uid = from_kuid_munged(current_user_ns(),
+					sock_i_uid(skb_to_full_sk(skb)));
+		}
+
+		/* checks if there is a pending inactive notification*/
+		if (timer->work_pending)
+			timer->delayed_timer_trigger = timer->last_modified_timer;
+		else {
+			timer->work_pending = true;
+			schedule_work(&timer->work);
+		}
+	}
+
+	get_monotonic_boottime(&timer->last_modified_timer);
+	mod_timer(&timer->timer,
+			msecs_to_jiffies(info->timeout * 1000) + now);
+	spin_unlock_bh(&timestamp_lock);
+}
+
 /*
  * The actual xt_tables plugin.
  */
@@ -188,15 +400,23 @@
 					 const struct xt_action_param *par)
 {
 	const struct idletimer_tg_info *info = par->targinfo;
+	unsigned long now = jiffies;
 
 	pr_debug("resetting timer %s, timeout period %u\n",
 		 info->label, info->timeout);
 
 	BUG_ON(!info->timer);
 
-	mod_timer(&info->timer->timer,
-		  msecs_to_jiffies(info->timeout * 1000) + jiffies);
+	info->timer->active = true;
 
+	if (time_before(info->timer->timer.expires, now)) {
+		schedule_work(&info->timer->work);
+		pr_debug("Starting timer %s (Expired, Jiffies): %lu, %lu\n",
+			 info->label, info->timer->timer.expires, now);
+	}
+
+	/* TODO: Avoid modifying timers on each packet */
+	reset_timer(info, skb);
 	return XT_CONTINUE;
 }
 
@@ -205,7 +425,7 @@
 	struct idletimer_tg_info *info = par->targinfo;
 	int ret;
 
-	pr_debug("checkentry targinfo%s\n", info->label);
+	pr_debug("checkentry targinfo %s\n", info->label);
 
 	if (info->timeout == 0) {
 		pr_debug("timeout value is zero\n");
@@ -227,9 +447,7 @@
 	info->timer = __idletimer_tg_find_by_label(info->label);
 	if (info->timer) {
 		info->timer->refcnt++;
-		mod_timer(&info->timer->timer,
-			  msecs_to_jiffies(info->timeout * 1000) + jiffies);
-
+		reset_timer(info, NULL);
 		pr_debug("increased refcnt of timer %s to %u\n",
 			 info->label, info->timer->refcnt);
 	} else {
@@ -242,6 +460,7 @@
 	}
 
 	mutex_unlock(&list_mutex);
+
 	return 0;
 }
 
@@ -258,13 +477,14 @@
 
 		list_del(&info->timer->entry);
 		del_timer_sync(&info->timer->timer);
-		cancel_work_sync(&info->timer->work);
 		sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
+		unregister_pm_notifier(&info->timer->pm_nb);
+		cancel_work_sync(&info->timer->work);
 		kfree(info->timer->attr.attr.name);
 		kfree(info->timer);
 	} else {
 		pr_debug("decreased refcnt of timer %s to %u\n",
-			 info->label, info->timer->refcnt);
+		info->label, info->timer->refcnt);
 	}
 
 	mutex_unlock(&list_mutex);
@@ -272,6 +492,7 @@
 
 static struct xt_target idletimer_tg __read_mostly = {
 	.name		= "IDLETIMER",
+	.revision	= 1,
 	.family		= NFPROTO_UNSPEC,
 	.target		= idletimer_tg_target,
 	.targetsize     = sizeof(struct idletimer_tg_info),
@@ -338,3 +559,4 @@
 MODULE_LICENSE("GPL v2");
 MODULE_ALIAS("ipt_IDLETIMER");
 MODULE_ALIAS("ip6t_IDLETIMER");
+MODULE_ALIAS("arpt_IDLETIMER");

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 1ad4017..0c2dc6d 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c

@@ -845,6 +845,8 @@
 	return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 3);
 }
 
+#define HASHLIMIT_MAX_SIZE 1048576
+
 static int hashlimit_mt_check_common(const struct xt_mtchk_param *par,
 				     struct xt_hashlimit_htable **hinfo,
 				     struct hashlimit_cfg3 *cfg,
@@ -855,6 +857,14 @@
 
 	if (cfg->gc_interval == 0 || cfg->expire == 0)
 		return -EINVAL;
+	if (cfg->size > HASHLIMIT_MAX_SIZE) {
+		cfg->size = HASHLIMIT_MAX_SIZE;
+		pr_info_ratelimited("size too large, truncated to %u\n", cfg->size);
+	}
+	if (cfg->max > HASHLIMIT_MAX_SIZE) {
+		cfg->max = HASHLIMIT_MAX_SIZE;
+		pr_info_ratelimited("max too large, truncated to %u\n", cfg->max);
+	}
 	if (par->family == NFPROTO_IPV4) {
 		if (cfg->srcmask > 32 || cfg->dstmask > 32)
 			return -EINVAL;

diff --git a/net/netfilter/xt_quota2.c b/net/netfilter/xt_quota2.c
new file mode 100644
index 0000000..24b7742
--- /dev/null
+++ b/net/netfilter/xt_quota2.c

@@ -0,0 +1,401 @@
+/*
+ * xt_quota2 - enhanced xt_quota that can count upwards and in packets
+ * as a minimal accounting match.
+ * by Jan Engelhardt <jengelh@medozas.de>, 2008
+ *
+ * Originally based on xt_quota.c:
+ * 	netfilter module to enforce network quotas
+ * 	Sam Johnston <samj@samj.net>
+ *
+ *	This program is free software; you can redistribute it and/or modify
+ *	it under the terms of the GNU General Public License; either
+ *	version 2 of the License, as published by the Free Software Foundation.
+ */
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_quota2.h>
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+/* For compatibility, these definitions are copied from the
+ * deprecated header file <linux/netfilter_ipv4/ipt_ULOG.h> */
+#define ULOG_MAC_LEN	80
+#define ULOG_PREFIX_LEN	32
+
+/* Format of the ULOG packets passed through netlink */
+typedef struct ulog_packet_msg {
+	unsigned long mark;
+	long timestamp_sec;
+	long timestamp_usec;
+	unsigned int hook;
+	char indev_name[IFNAMSIZ];
+	char outdev_name[IFNAMSIZ];
+	size_t data_len;
+	char prefix[ULOG_PREFIX_LEN];
+	unsigned char mac_len;
+	unsigned char mac[ULOG_MAC_LEN];
+	unsigned char payload[0];
+} ulog_packet_msg_t;
+#endif
+
+/**
+ * @lock:	lock to protect quota writers from each other
+ */
+struct xt_quota_counter {
+	u_int64_t quota;
+	spinlock_t lock;
+	struct list_head list;
+	atomic_t ref;
+	char name[sizeof(((struct xt_quota_mtinfo2 *)NULL)->name)];
+	struct proc_dir_entry *procfs_entry;
+};
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+/* Harald's favorite number +1 :D From ipt_ULOG.C */
+static int qlog_nl_event = 112;
+module_param_named(event_num, qlog_nl_event, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(event_num,
+		 "Event number for NETLINK_NFLOG message. 0 disables log."
+		 "111 is what ipt_ULOG uses.");
+static struct sock *nflognl;
+#endif
+
+static LIST_HEAD(counter_list);
+static DEFINE_SPINLOCK(counter_list_lock);
+
+static struct proc_dir_entry *proc_xt_quota;
+static unsigned int quota_list_perms = S_IRUGO | S_IWUSR;
+static kuid_t quota_list_uid = KUIDT_INIT(0);
+static kgid_t quota_list_gid = KGIDT_INIT(0);
+module_param_named(perms, quota_list_perms, uint, S_IRUGO | S_IWUSR);
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+static void quota2_log(unsigned int hooknum,
+		       const struct sk_buff *skb,
+		       const struct net_device *in,
+		       const struct net_device *out,
+		       const char *prefix)
+{
+	ulog_packet_msg_t *pm;
+	struct sk_buff *log_skb;
+	size_t size;
+	struct nlmsghdr *nlh;
+
+	if (!qlog_nl_event)
+		return;
+
+	size = NLMSG_SPACE(sizeof(*pm));
+	size = max(size, (size_t)NLMSG_GOODSIZE);
+	log_skb = alloc_skb(size, GFP_ATOMIC);
+	if (!log_skb) {
+		pr_err("xt_quota2: cannot alloc skb for logging\n");
+		return;
+	}
+
+	nlh = nlmsg_put(log_skb, /*pid*/0, /*seq*/0, qlog_nl_event,
+			sizeof(*pm), 0);
+	if (!nlh) {
+		pr_err("xt_quota2: nlmsg_put failed\n");
+		kfree_skb(log_skb);
+		return;
+	}
+	pm = nlmsg_data(nlh);
+	if (skb->tstamp == 0)
+		__net_timestamp((struct sk_buff *)skb);
+	pm->data_len = 0;
+	pm->hook = hooknum;
+	if (prefix != NULL)
+		strlcpy(pm->prefix, prefix, sizeof(pm->prefix));
+	else
+		*(pm->prefix) = '\0';
+	if (in)
+		strlcpy(pm->indev_name, in->name, sizeof(pm->indev_name));
+	else
+		pm->indev_name[0] = '\0';
+
+	if (out)
+		strlcpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
+	else
+		pm->outdev_name[0] = '\0';
+
+	NETLINK_CB(log_skb).dst_group = 1;
+	pr_debug("throwing 1 packets to netlink group 1\n");
+	netlink_broadcast(nflognl, log_skb, 0, 1, GFP_ATOMIC);
+}
+#else
+static void quota2_log(unsigned int hooknum,
+		       const struct sk_buff *skb,
+		       const struct net_device *in,
+		       const struct net_device *out,
+		       const char *prefix)
+{
+}
+#endif  /* if+else CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG */
+
+static ssize_t quota_proc_read(struct file *file, char __user *buf,
+			   size_t size, loff_t *ppos)
+{
+	struct xt_quota_counter *e = PDE_DATA(file_inode(file));
+	char tmp[24];
+	size_t tmp_size;
+
+	spin_lock_bh(&e->lock);
+	tmp_size = scnprintf(tmp, sizeof(tmp), "%llu\n", e->quota);
+	spin_unlock_bh(&e->lock);
+	return simple_read_from_buffer(buf, size, ppos, tmp, tmp_size);
+}
+
+static ssize_t quota_proc_write(struct file *file, const char __user *input,
+                            size_t size, loff_t *ppos)
+{
+	struct xt_quota_counter *e = PDE_DATA(file_inode(file));
+	char buf[sizeof("18446744073709551616")];
+
+	if (size > sizeof(buf))
+		size = sizeof(buf);
+	if (copy_from_user(buf, input, size) != 0)
+		return -EFAULT;
+	buf[sizeof(buf)-1] = '\0';
+
+	spin_lock_bh(&e->lock);
+	e->quota = simple_strtoull(buf, NULL, 0);
+	spin_unlock_bh(&e->lock);
+	return size;
+}
+
+static const struct file_operations q2_counter_fops = {
+	.read		= quota_proc_read,
+	.write		= quota_proc_write,
+	.llseek		= default_llseek,
+};
+
+static struct xt_quota_counter *
+q2_new_counter(const struct xt_quota_mtinfo2 *q, bool anon)
+{
+	struct xt_quota_counter *e;
+	unsigned int size;
+
+	/* Do not need all the procfs things for anonymous counters. */
+	size = anon ? offsetof(typeof(*e), list) : sizeof(*e);
+	e = kmalloc(size, GFP_KERNEL);
+	if (e == NULL)
+		return NULL;
+
+	e->quota = q->quota;
+	spin_lock_init(&e->lock);
+	if (!anon) {
+		INIT_LIST_HEAD(&e->list);
+		atomic_set(&e->ref, 1);
+		strlcpy(e->name, q->name, sizeof(e->name));
+	}
+	return e;
+}
+
+/**
+ * q2_get_counter - get ref to counter or create new
+ * @name:	name of counter
+ */
+static struct xt_quota_counter *
+q2_get_counter(const struct xt_quota_mtinfo2 *q)
+{
+	struct proc_dir_entry *p;
+	struct xt_quota_counter *e = NULL;
+	struct xt_quota_counter *new_e;
+
+	if (*q->name == '\0')
+		return q2_new_counter(q, true);
+
+	/* No need to hold a lock while getting a new counter */
+	new_e = q2_new_counter(q, false);
+	if (new_e == NULL)
+		goto out;
+
+	spin_lock_bh(&counter_list_lock);
+	list_for_each_entry(e, &counter_list, list)
+		if (strcmp(e->name, q->name) == 0) {
+			atomic_inc(&e->ref);
+			spin_unlock_bh(&counter_list_lock);
+			kfree(new_e);
+			pr_debug("xt_quota2: old counter name=%s", e->name);
+			return e;
+		}
+	e = new_e;
+	pr_debug("xt_quota2: new_counter name=%s", e->name);
+	list_add_tail(&e->list, &counter_list);
+	/* The entry having a refcount of 1 is not directly destructible.
+	 * This func has not yet returned the new entry, thus iptables
+	 * has not references for destroying this entry.
+	 * For another rule to try to destroy it, it would 1st need for this
+	 * func* to be re-invoked, acquire a new ref for the same named quota.
+	 * Nobody will access the e->procfs_entry either.
+	 * So release the lock. */
+	spin_unlock_bh(&counter_list_lock);
+
+	/* create_proc_entry() is not spin_lock happy */
+	p = e->procfs_entry = proc_create_data(e->name, quota_list_perms,
+	                      proc_xt_quota, &q2_counter_fops, e);
+
+	if (IS_ERR_OR_NULL(p)) {
+		spin_lock_bh(&counter_list_lock);
+		list_del(&e->list);
+		spin_unlock_bh(&counter_list_lock);
+		goto out;
+	}
+	proc_set_user(p, quota_list_uid, quota_list_gid);
+	return e;
+
+ out:
+	kfree(e);
+	return NULL;
+}
+
+static int quota_mt2_check(const struct xt_mtchk_param *par)
+{
+	struct xt_quota_mtinfo2 *q = par->matchinfo;
+
+	pr_debug("xt_quota2: check() flags=0x%04x", q->flags);
+
+	if (q->flags & ~XT_QUOTA_MASK)
+		return -EINVAL;
+
+	q->name[sizeof(q->name)-1] = '\0';
+	if (*q->name == '.' || strchr(q->name, '/') != NULL) {
+		printk(KERN_ERR "xt_quota.3: illegal name\n");
+		return -EINVAL;
+	}
+
+	q->master = q2_get_counter(q);
+	if (q->master == NULL) {
+		printk(KERN_ERR "xt_quota.3: memory alloc failure\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void quota_mt2_destroy(const struct xt_mtdtor_param *par)
+{
+	struct xt_quota_mtinfo2 *q = par->matchinfo;
+	struct xt_quota_counter *e = q->master;
+
+	if (*q->name == '\0') {
+		kfree(e);
+		return;
+	}
+
+	spin_lock_bh(&counter_list_lock);
+	if (!atomic_dec_and_test(&e->ref)) {
+		spin_unlock_bh(&counter_list_lock);
+		return;
+	}
+
+	list_del(&e->list);
+	remove_proc_entry(e->name, proc_xt_quota);
+	spin_unlock_bh(&counter_list_lock);
+	kfree(e);
+}
+
+static bool
+quota_mt2(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	struct xt_quota_mtinfo2 *q = (void *)par->matchinfo;
+	struct xt_quota_counter *e = q->master;
+	bool ret = q->flags & XT_QUOTA_INVERT;
+
+	spin_lock_bh(&e->lock);
+	if (q->flags & XT_QUOTA_GROW) {
+		/*
+		 * While no_change is pointless in "grow" mode, we will
+		 * implement it here simply to have a consistent behavior.
+		 */
+		if (!(q->flags & XT_QUOTA_NO_CHANGE)) {
+			e->quota += (q->flags & XT_QUOTA_PACKET) ? 1 : skb->len;
+		}
+		ret = true;
+	} else {
+		if (e->quota >= skb->len) {
+			if (!(q->flags & XT_QUOTA_NO_CHANGE))
+				e->quota -= (q->flags & XT_QUOTA_PACKET) ? 1 : skb->len;
+			ret = !ret;
+		} else {
+			/* We are transitioning, log that fact. */
+			if (e->quota) {
+				quota2_log(xt_hooknum(par),
+					   skb,
+					   xt_in(par),
+					   xt_out(par),
+					   q->name);
+			}
+			/* we do not allow even small packets from now on */
+			e->quota = 0;
+		}
+	}
+	spin_unlock_bh(&e->lock);
+	return ret;
+}
+
+static struct xt_match quota_mt2_reg[] __read_mostly = {
+	{
+		.name       = "quota2",
+		.revision   = 3,
+		.family     = NFPROTO_IPV4,
+		.checkentry = quota_mt2_check,
+		.match      = quota_mt2,
+		.destroy    = quota_mt2_destroy,
+		.matchsize  = sizeof(struct xt_quota_mtinfo2),
+		.me         = THIS_MODULE,
+	},
+	{
+		.name       = "quota2",
+		.revision   = 3,
+		.family     = NFPROTO_IPV6,
+		.checkentry = quota_mt2_check,
+		.match      = quota_mt2,
+		.destroy    = quota_mt2_destroy,
+		.matchsize  = sizeof(struct xt_quota_mtinfo2),
+		.me         = THIS_MODULE,
+	},
+};
+
+static int __init quota_mt2_init(void)
+{
+	int ret;
+	pr_debug("xt_quota2: init()");
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+	nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, NULL);
+	if (!nflognl)
+		return -ENOMEM;
+#endif
+
+	proc_xt_quota = proc_mkdir("xt_quota", init_net.proc_net);
+	if (proc_xt_quota == NULL)
+		return -EACCES;
+
+	ret = xt_register_matches(quota_mt2_reg, ARRAY_SIZE(quota_mt2_reg));
+	if (ret < 0)
+		remove_proc_entry("xt_quota", init_net.proc_net);
+	pr_debug("xt_quota2: init() %d", ret);
+	return ret;
+}
+
+static void __exit quota_mt2_exit(void)
+{
+	xt_unregister_matches(quota_mt2_reg, ARRAY_SIZE(quota_mt2_reg));
+	remove_proc_entry("xt_quota", init_net.proc_net);
+}
+
+module_init(quota_mt2_init);
+module_exit(quota_mt2_exit);
+MODULE_DESCRIPTION("Xtables: countdown quota match; up counter");
+MODULE_AUTHOR("Sam Johnston <samj@samj.net>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_quota2");
+MODULE_ALIAS("ip6t_quota2");

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 930d17f..4a1b1bb 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c

@@ -1029,7 +1029,8 @@
 	if (nlk->netlink_bind && groups) {
 		int group;
 
-		for (group = 0; group < nlk->ngroups; group++) {
+		/* nl_groups is a u32, so cap the maximum groups we can bind */
+		for (group = 0; group < BITS_PER_TYPE(u32); group++) {
 			if (!test_bit(group, &groups))
 				continue;
 			err = nlk->netlink_bind(net, group + 1);
@@ -1048,7 +1049,7 @@
 			netlink_insert(sk, nladdr->nl_pid) :
 			netlink_autobind(sock);
 		if (err) {
-			netlink_undo_bind(nlk->ngroups, groups, sk);
+			netlink_undo_bind(BITS_PER_TYPE(u32), groups, sk);
 			goto unlock;
 		}
 	}

diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 17fdfce..964c4e4 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c

@@ -647,11 +647,11 @@
 }
 
 /*
- * Final call destruction under RCU.
+ * Final call destruction - but must be done in process context.
  */
-static void rxrpc_rcu_destroy_call(struct rcu_head *rcu)
+static void rxrpc_destroy_call(struct work_struct *work)
 {
-	struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu);
+	struct rxrpc_call *call = container_of(work, struct rxrpc_call, processor);
 	struct rxrpc_net *rxnet = call->rxnet;
 
 	rxrpc_put_connection(call->conn);
@@ -664,6 +664,22 @@
 }
 
 /*
+ * Final call destruction under RCU.
+ */
+static void rxrpc_rcu_destroy_call(struct rcu_head *rcu)
+{
+	struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu);
+
+	if (in_softirq()) {
+		INIT_WORK(&call->processor, rxrpc_destroy_call);
+		if (!rxrpc_queue_work(&call->processor))
+			BUG();
+	} else {
+		rxrpc_destroy_call(&call->processor);
+	}
+}
+
+/*
  * clean up a call
  */
 void rxrpc_cleanup_call(struct rxrpc_call *call)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 2241531..44ca31f 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c

@@ -196,6 +196,7 @@
 	struct fl_flow_key skb_mkey;
 
 	list_for_each_entry_rcu(mask, &head->masks, list) {
+		flow_dissector_init_keys(&skb_key.control, &skb_key.basic);
 		fl_clear_masked_range(&skb_key, mask);
 
 		skb_key.indev_ifindex = skb->skb_iif;
@@ -486,6 +487,7 @@
 	[TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 },
 	[TCA_FLOWER_KEY_ENC_OPTS]	= { .type = NLA_NESTED },
 	[TCA_FLOWER_KEY_ENC_OPTS_MASK]	= { .type = NLA_NESTED },
+	[TCA_FLOWER_FLAGS]		= { .type = NLA_U32 },
 };
 
 static const struct nla_policy

diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 40be745..74863b0 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c

@@ -137,6 +137,7 @@
 static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = {
 	[TCA_MATCHALL_UNSPEC]		= { .type = NLA_UNSPEC },
 	[TCA_MATCHALL_CLASSID]		= { .type = NLA_U32 },
+	[TCA_MATCHALL_FLAGS]		= { .type = NLA_U32 },
 };
 
 static int mall_set_parms(struct net *net, struct tcf_proto *tp,

diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 559f09a..9f4d325 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c

@@ -185,6 +185,16 @@
 	return true;
 }
 
+/* Check for format error in an ABORT chunk */
+static inline bool sctp_err_chunk_valid(struct sctp_chunk *chunk)
+{
+	struct sctp_errhdr *err;
+
+	sctp_walk_errors(err, chunk->chunk_hdr);
+
+	return (void *)err == (void *)chunk->chunk_end;
+}
+
 /**********************************************************
  * These are the state functions for handling chunk events.
  **********************************************************/
@@ -2270,6 +2280,9 @@
 		    sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
 		return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
 
+	if (!sctp_err_chunk_valid(chunk))
+		return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
 	return __sctp_sf_do_9_1_abort(net, ep, asoc, type, arg, commands);
 }
 
@@ -2313,6 +2326,9 @@
 		    sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
 		return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
 
+	if (!sctp_err_chunk_valid(chunk))
+		return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
 	/* Stop the T2-shutdown timer. */
 	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
 			SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
@@ -2580,6 +2596,9 @@
 		    sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
 		return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
 
+	if (!sctp_err_chunk_valid(chunk))
+		return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
 	return __sctp_sf_do_9_1_abort(net, ep, asoc, type, arg, commands);
 }
 
@@ -2597,16 +2616,8 @@
 
 	/* See if we have an error cause code in the chunk.  */
 	len = ntohs(chunk->chunk_hdr->length);
-	if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) {
-		struct sctp_errhdr *err;
-
-		sctp_walk_errors(err, chunk->chunk_hdr);
-		if ((void *)err != (void *)chunk->chunk_end)
-			return sctp_sf_pdiscard(net, ep, asoc, type, arg,
-						commands);
-
+	if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr))
 		error = ((struct sctp_errhdr *)chunk->skb->data)->cause;
-	}
 
 	sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNRESET));
 	/* ASSOC_FAILED will DELETE_TCB. */

diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 52241d6..aa9a17a 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c

@@ -364,7 +364,9 @@
 	dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline));
 	dclc.hdr.version = SMC_CLC_V1;
 	dclc.hdr.flag = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 1 : 0;
-	memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid));
+	if (smc->conn.lgr && !smc->conn.lgr->is_smcd)
+		memcpy(dclc.id_for_peer, local_systemid,
+		       sizeof(local_systemid));
 	dclc.peer_diagnosis = htonl(peer_diag_info);
 	memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 

diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index 371b4cf..2379a02 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c

@@ -38,16 +38,15 @@
 {
 	struct smc_sock *smc = smc_sk(sk);
 
+	memset(r, 0, sizeof(*r));
 	r->diag_family = sk->sk_family;
+	sock_diag_save_cookie(sk, r->id.idiag_cookie);
 	if (!smc->clcsock)
 		return;
 	r->id.idiag_sport = htons(smc->clcsock->sk->sk_num);
 	r->id.idiag_dport = smc->clcsock->sk->sk_dport;
 	r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if;
-	sock_diag_save_cookie(sk, r->id.idiag_cookie);
 	if (sk->sk_protocol == SMCPROTO_SMC) {
-		memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
-		memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
 		r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr;
 		r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr;
 #if IS_ENABLED(CONFIG_IPV6)

diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 0a613e0..8f40bbfd 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c

@@ -506,7 +506,7 @@
 				       u32 seq, u64 *p_record_sn)
 {
 	u64 record_sn = context->hint_record_sn;
-	struct tls_record_info *info;
+	struct tls_record_info *info, *last;
 
 	info = context->retransmit_hint;
 	if (!info ||
@@ -516,6 +516,25 @@
 		 */
 		info = list_first_entry(&context->records_list,
 					struct tls_record_info, list);
+
+		/* send the start_marker record if seq number is before the
+		 * tls offload start marker sequence number. This record is
+		 * required to handle TCP packets which are before TLS offload
+		 * started.
+		 *  And if it's not start marker, look if this seq number
+		 * belongs to the list.
+		 */
+		if (likely(!tls_record_is_start_marker(info))) {
+			/* we have the first record, get the last record to see
+			 * if this seq number belongs to the list.
+			 */
+			last = list_last_entry(&context->records_list,
+					       struct tls_record_info, list);
+
+			if (!between(seq, tls_record_start_seq(info),
+				     last->end_seq))
+				return NULL;
+		}
 		record_sn = context->unacked_record_sn;
 	}
 

diff --git a/net/wireless/ethtool.c b/net/wireless/ethtool.c
index a9c0f36..24e1840 100644
--- a/net/wireless/ethtool.c
+++ b/net/wireless/ethtool.c

@@ -7,9 +7,13 @@
 void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct device *pdev = wiphy_dev(wdev->wiphy);
 
-	strlcpy(info->driver, wiphy_dev(wdev->wiphy)->driver->name,
-		sizeof(info->driver));
+	if (pdev->driver)
+		strlcpy(info->driver, pdev->driver->name,
+			sizeof(info->driver));
+	else
+		strlcpy(info->driver, "N/A", sizeof(info->driver));
 
 	strlcpy(info->version, init_utsname()->release, sizeof(info->version));
 

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 823dea1..dfde06b 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c

@@ -323,6 +323,7 @@
 	[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT] = { .type = NLA_FLAG },
 	[NL80211_ATTR_CONTROL_PORT_OVER_NL80211] = { .type = NLA_FLAG },
 	[NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG },
+	[NL80211_ATTR_STATUS_CODE] = { .type = NLA_U16 },
 	[NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 },
 	[NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 },
 	[NL80211_ATTR_PID] = { .type = NLA_U32 },

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 85a6e8f..23c4653 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile

@@ -268,7 +268,7 @@
 	@echo "  CLANG-bpf " $@
 	$(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \
 		-I$(srctree)/tools/testing/selftests/bpf/ \
-		-D__KERNEL__ -D__BPF_TRACING__ -Wno-unused-value -Wno-pointer-sign \
+		-D__KERNEL__ -Wno-unused-value -Wno-pointer-sign \
 		-D__TARGET_ARCH_$(ARCH) -Wno-compare-distinct-pointer-types \
 		-Wno-gnu-variable-sized-type-not-at-end \
 		-Wno-address-of-packed-member -Wno-tautological-compare \

diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include
index ce53639..df0f0f4 100644
--- a/scripts/Kbuild.include
+++ b/scripts/Kbuild.include

@@ -138,10 +138,6 @@
 cc-disable-warning = $(call try-run,\
 	$(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1)))
 
-# cc-name
-# Expands to either gcc or clang
-cc-name = $(shell $(CC) -v 2>&1 | grep -q "clang version" && echo clang || echo gcc)
-
 # cc-version
 cc-version = $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-version.sh $(CC))
 

diff --git a/scripts/Kconfig.include b/scripts/Kconfig.include
index 3b2861f..79455ad 100644
--- a/scripts/Kconfig.include
+++ b/scripts/Kconfig.include

@@ -20,7 +20,7 @@
 
 # $(cc-option,<flag>)
 # Return y if the compiler supports <flag>, n otherwise
-cc-option = $(success,$(CC) -Werror $(CLANG_FLAGS) $(1) -E -x c /dev/null -o /dev/null)
+cc-option = $(success,$(CC) -Werror $(CLANG_FLAGS) $(1) -S -x c /dev/null -o /dev/null)
 
 # $(ld-option,<flag>)
 # Return y if the linker supports <flag>, n otherwise

diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn
index 8d53570..e235ac6 100644
--- a/scripts/Makefile.extrawarn
+++ b/scripts/Makefile.extrawarn

@@ -23,14 +23,16 @@
 warning-1 := -Wextra -Wunused -Wno-unused-parameter
 warning-1 += -Wmissing-declarations
 warning-1 += -Wmissing-format-attribute
-warning-1 += $(call cc-option, -Wmissing-prototypes)
+warning-1 += -Wmissing-prototypes
 warning-1 += -Wold-style-definition
-warning-1 += $(call cc-option, -Wmissing-include-dirs)
+warning-1 += -Wmissing-include-dirs
 warning-1 += $(call cc-option, -Wunused-but-set-variable)
 warning-1 += $(call cc-option, -Wunused-const-variable)
 warning-1 += $(call cc-option, -Wpacked-not-aligned)
-warning-1 += $(call cc-disable-warning, missing-field-initializers)
-warning-1 += $(call cc-disable-warning, sign-compare)
+warning-1 += $(call cc-option, -Wstringop-truncation)
+# The following turn off the warnings enabled by -Wextra
+warning-1 += -Wno-missing-field-initializers
+warning-1 += -Wno-sign-compare
 
 warning-2 := -Waggregate-return
 warning-2 += -Wcast-align
@@ -38,8 +40,8 @@
 warning-2 += -Wnested-externs
 warning-2 += -Wshadow
 warning-2 += $(call cc-option, -Wlogical-op)
-warning-2 += $(call cc-option, -Wmissing-field-initializers)
-warning-2 += $(call cc-option, -Wsign-compare)
+warning-2 += -Wmissing-field-initializers
+warning-2 += -Wsign-compare
 warning-2 += $(call cc-option, -Wmaybe-uninitialized)
 warning-2 += $(call cc-option, -Wunused-macros)
 
@@ -65,12 +67,11 @@
 KBUILD_CFLAGS += $(warning)
 else
 
-ifeq ($(cc-name),clang)
-KBUILD_CFLAGS += $(call cc-disable-warning, initializer-overrides)
-KBUILD_CFLAGS += $(call cc-disable-warning, unused-value)
-KBUILD_CFLAGS += $(call cc-disable-warning, format)
-KBUILD_CFLAGS += $(call cc-disable-warning, sign-compare)
-KBUILD_CFLAGS += $(call cc-disable-warning, format-zero-length)
-KBUILD_CFLAGS += $(call cc-disable-warning, uninitialized)
+ifdef CONFIG_CC_IS_CLANG
+KBUILD_CFLAGS += -Wno-initializer-overrides
+KBUILD_CFLAGS += -Wno-format
+KBUILD_CFLAGS += -Wno-sign-compare
+KBUILD_CFLAGS += -Wno-format-zero-length
+KBUILD_CFLAGS += -Wno-uninitialized
 endif
 endif

diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst
index ff5ca98..8ff0669 100644
--- a/scripts/Makefile.modinst
+++ b/scripts/Makefile.modinst

@@ -30,7 +30,7 @@
 INSTALL_MOD_DIR ?= extra
 ext-mod-dir = $(INSTALL_MOD_DIR)$(subst $(patsubst %/,%,$(KBUILD_EXTMOD)),,$(@D))
 
-modinst_dir = $(if $(KBUILD_EXTMOD),$(ext-mod-dir),kernel/$(@D))
+modinst_dir ?= $(if $(KBUILD_EXTMOD),$(ext-mod-dir),kernel/$(@D))
 
 $(modules):
 	$(call cmd,modules_install,$(MODLIB)/$(modinst_dir))

diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh
index 5aa75a0a..092ead2 100755
--- a/scripts/decode_stacktrace.sh
+++ b/scripts/decode_stacktrace.sh

@@ -28,7 +28,7 @@
 		local objfile=${modcache[$module]}
 	else
 		[[ $modpath == "" ]] && return
-		local objfile=$(find "$modpath" -name $module.ko -print -quit)
+		local objfile=$(find "$modpath" -name "${module//_/[-_]}.ko*" -print -quit)
 		[[ $objfile == "" ]] && return
 		modcache[$module]=$objfile
 	fi

diff --git a/scripts/gcc-goto.sh b/scripts/gcc-goto.sh
index 8b980fb22..083c526 100755
--- a/scripts/gcc-goto.sh
+++ b/scripts/gcc-goto.sh

@@ -3,7 +3,7 @@
 # Test for gcc 'asm goto' support
 # Copyright (C) 2010, Jason Baron <jbaron@redhat.com>
 
-cat << "END" | $@ -x c - -fno-PIE -c -o /dev/null
+cat << "END" | $@ -x c - -c -o /dev/null >/dev/null 2>&1 && echo "y"
 int main(void)
 {
 #if defined(__arm__) || defined(__aarch64__)

diff --git a/scripts/gen_compile_commands.py b/scripts/gen_compile_commands.py
new file mode 100755
index 0000000..7915823
--- /dev/null
+++ b/scripts/gen_compile_commands.py

@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) Google LLC, 2018
+#
+# Author: Tom Roeder <tmroeder@google.com>
+#
+"""A tool for generating compile_commands.json in the Linux kernel."""
+
+import argparse
+import json
+import logging
+import os
+import re
+
+_DEFAULT_OUTPUT = 'compile_commands.json'
+_DEFAULT_LOG_LEVEL = 'WARNING'
+
+_FILENAME_PATTERN = r'^\..*\.cmd$'
+_LINE_PATTERN = r'^cmd_[^ ]*\.o := (.* )([^ ]*\.c)$'
+_VALID_LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
+
+# A kernel build generally has over 2000 entries in its compile_commands.json
+# database. If this code finds 500 or fewer, then warn the user that they might
+# not have all the .cmd files, and they might need to compile the kernel.
+_LOW_COUNT_THRESHOLD = 500
+
+
+def parse_arguments():
+    """Sets up and parses command-line arguments.
+
+    Returns:
+        log_level: A logging level to filter log output.
+        directory: The directory to search for .cmd files.
+        output: Where to write the compile-commands JSON file.
+    """
+    usage = 'Creates a compile_commands.json database from kernel .cmd files'
+    parser = argparse.ArgumentParser(description=usage)
+
+    directory_help = ('Path to the kernel source directory to search '
+                      '(defaults to the working directory)')
+    parser.add_argument('-d', '--directory', type=str, help=directory_help)
+
+    output_help = ('The location to write compile_commands.json (defaults to '
+                   'compile_commands.json in the search directory)')
+    parser.add_argument('-o', '--output', type=str, help=output_help)
+
+    log_level_help = ('The level of log messages to produce (one of ' +
+                      ', '.join(_VALID_LOG_LEVELS) + '; defaults to ' +
+                      _DEFAULT_LOG_LEVEL + ')')
+    parser.add_argument(
+        '--log_level', type=str, default=_DEFAULT_LOG_LEVEL,
+        help=log_level_help)
+
+    args = parser.parse_args()
+
+    log_level = args.log_level
+    if log_level not in _VALID_LOG_LEVELS:
+        raise ValueError('%s is not a valid log level' % log_level)
+
+    directory = args.directory or os.getcwd()
+    output = args.output or os.path.join(directory, _DEFAULT_OUTPUT)
+    directory = os.path.abspath(directory)
+
+    return log_level, directory, output
+
+
+def process_line(root_directory, file_directory, command_prefix, relative_path):
+    """Extracts information from a .cmd line and creates an entry from it.
+
+    Args:
+        root_directory: The directory that was searched for .cmd files. Usually
+            used directly in the "directory" entry in compile_commands.json.
+        file_directory: The path to the directory the .cmd file was found in.
+        command_prefix: The extracted command line, up to the last element.
+        relative_path: The .c file from the end of the extracted command.
+            Usually relative to root_directory, but sometimes relative to
+            file_directory and sometimes neither.
+
+    Returns:
+        An entry to append to compile_commands.
+
+    Raises:
+        ValueError: Could not find the extracted file based on relative_path and
+            root_directory or file_directory.
+    """
+    # The .cmd files are intended to be included directly by Make, so they
+    # escape the pound sign '#', either as '\#' or '$(pound)' (depending on the
+    # kernel version). The compile_commands.json file is not interepreted
+    # by Make, so this code replaces the escaped version with '#'.
+    prefix = command_prefix.replace('\#', '#').replace('$(pound)', '#')
+
+    cur_dir = root_directory
+    expected_path = os.path.join(cur_dir, relative_path)
+    if not os.path.exists(expected_path):
+        # Try using file_directory instead. Some of the tools have a different
+        # style of .cmd file than the kernel.
+        cur_dir = file_directory
+        expected_path = os.path.join(cur_dir, relative_path)
+        if not os.path.exists(expected_path):
+            raise ValueError('File %s not in %s or %s' %
+                             (relative_path, root_directory, file_directory))
+    return {
+        'directory': cur_dir,
+        'file': relative_path,
+        'command': prefix + relative_path,
+    }
+
+
+def main():
+    """Walks through the directory and finds and parses .cmd files."""
+    log_level, directory, output = parse_arguments()
+
+    level = getattr(logging, log_level)
+    logging.basicConfig(format='%(levelname)s: %(message)s', level=level)
+
+    filename_matcher = re.compile(_FILENAME_PATTERN)
+    line_matcher = re.compile(_LINE_PATTERN)
+
+    compile_commands = []
+    for dirpath, _, filenames in os.walk(directory):
+        for filename in filenames:
+            if not filename_matcher.match(filename):
+                continue
+            filepath = os.path.join(dirpath, filename)
+
+            with open(filepath, 'rt') as f:
+                for line in f:
+                    result = line_matcher.match(line)
+                    if not result:
+                        continue
+
+                    try:
+                        entry = process_line(directory, dirpath,
+                                             result.group(1), result.group(2))
+                        compile_commands.append(entry)
+                    except ValueError as err:
+                        logging.info('Could not add line from %s: %s',
+                                     filepath, err)
+
+    with open(output, 'wt') as f:
+        json.dump(compile_commands, f, indent=2, sort_keys=True)
+
+    count = len(compile_commands)
+    if count < _LOW_COUNT_THRESHOLD:
+        logging.warning(
+            'Found %s entries. Have you compiled the kernel?', count)
+
+
+if __name__ == '__main__':
+    main()

diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c
index 0dde19c..2caf5fa 100644
--- a/scripts/kconfig/confdata.c
+++ b/scripts/kconfig/confdata.c

@@ -1314,7 +1314,7 @@
 
 		sym_calc_value(csym);
 		if (mode == def_random)
-			has_changed = randomize_choice_values(csym);
+			has_changed |= randomize_choice_values(csym);
 		else {
 			set_all_choice_values(csym);
 			has_changed = true;

diff --git a/security/Kconfig b/security/Kconfig
index d9aa521..07e9d4c 100644
--- a/security/Kconfig
+++ b/security/Kconfig

@@ -236,6 +236,8 @@
 source security/apparmor/Kconfig
 source security/loadpin/Kconfig
 source security/yama/Kconfig
+source security/chromiumos/Kconfig
+source security/container/Kconfig
 
 source security/integrity/Kconfig
 
@@ -276,5 +278,13 @@
 	default "apparmor" if DEFAULT_SECURITY_APPARMOR
 	default "" if DEFAULT_SECURITY_DAC
 
-endmenu
+config ARCH_HAS_ALT_SYSCALL
+	def_bool n
 
+config ALT_SYSCALL
+	bool "Alternate syscall table support"
+	depends on ARCH_HAS_ALT_SYSCALL
+	help
+	  Allow syscall table to be swapped on a running process.
+
+endmenu

diff --git a/security/Makefile b/security/Makefile
index 4d2d378..e08af6f 100644
--- a/security/Makefile
+++ b/security/Makefile

@@ -10,6 +10,8 @@
 subdir-$(CONFIG_SECURITY_APPARMOR)	+= apparmor
 subdir-$(CONFIG_SECURITY_YAMA)		+= yama
 subdir-$(CONFIG_SECURITY_LOADPIN)	+= loadpin
+subdir-$(CONFIG_SECURITY_CHROMIUMOS)	+= chromiumos
+subdir-$(CONFIG_SECURITY_CONTAINER_MONITOR) += container
 
 # always enable default capabilities
 obj-y					+= commoncap.o
@@ -18,6 +20,7 @@
 # Object file lists
 obj-$(CONFIG_SECURITY)			+= security.o
 obj-$(CONFIG_SECURITYFS)		+= inode.o
+obj-$(CONFIG_SECURITY_CHROMIUMOS)	+= chromiumos/
 obj-$(CONFIG_SECURITY_SELINUX)		+= selinux/
 obj-$(CONFIG_SECURITY_SMACK)		+= smack/
 obj-$(CONFIG_AUDIT)			+= lsm_audit.o
@@ -26,6 +29,7 @@
 obj-$(CONFIG_SECURITY_YAMA)		+= yama/
 obj-$(CONFIG_SECURITY_LOADPIN)		+= loadpin/
 obj-$(CONFIG_CGROUP_DEVICE)		+= device_cgroup.o
+obj-$(CONFIG_SECURITY_CONTAINER_MONITOR) += container/
 
 # Object integrity file lists
 subdir-$(CONFIG_INTEGRITY)		+= integrity

diff --git a/security/chromiumos/Kconfig b/security/chromiumos/Kconfig
new file mode 100644
index 0000000..aaaa295
--- /dev/null
+++ b/security/chromiumos/Kconfig

@@ -0,0 +1,50 @@
+config SECURITY_CHROMIUMOS
+	bool "Chromium OS Security Module"
+	depends on SECURITY
+	depends on X86_64 || ARM64
+	help
+	  The purpose of the Chromium OS security module is to reduce attacking
+	  surface by preventing access to general purpose access modes not
+	  required by Chromium OS. Currently: the mount operation is
+	  restricted by requiring a mount point path without symbolic links,
+	  and loading modules is limited to only the root filesystem. This
+	  LSM is stacked ahead of any primary "full" LSM.
+
+config SECURITY_CHROMIUMOS_NO_SYMLINK_MOUNT
+	bool "Chromium OS Security: prohibit mount to symlinked target"
+	depends on SECURITY_CHROMIUMOS
+	default y
+	help
+	  When enabled mount() syscall will return ELOOP whenever target path
+	  contains any symlinks.
+
+config SECURITY_CHROMIUMOS_NO_UNPRIVILEGED_UNSAFE_MOUNTS
+	bool "Chromium OS Security: prohibit unsafe mounts in unprivileged user namespaces"
+	depends on SECURITY_CHROMIUMOS
+	default y
+	help
+	  When enabled, mount() syscall will return EPERM whenever a new mount
+	  is attempted that would cause the filesystem to have the exec, suid,
+	  or dev flags if the caller does not have the CAP_SYS_ADMIN capability
+	  in the init namespace.
+
+config ALT_SYSCALL_CHROMIUMOS
+	bool "Chromium OS Alt-Syscall Tables"
+	depends on ALT_SYSCALL
+	help
+	  Register restricted, alternate syscall tables used by Chromium OS
+	  using the alt-syscall infrastructure.  Alternate syscall tables
+	  can be selected with prctl(PR_ALT_SYSCALL).
+
+config ALT_SYSCALL_CHROMIUMOS_LEGACY_API
+       bool
+       default y
+       depends on ALT_SYSCALL_CHROMIUMOS && ARM
+
+config SECURITY_CHROMIUMOS_READONLY_PROC_SELF_MEM
+	bool "Force /proc/<pid>/mem paths to be read-only"
+	default y
+	help
+	  When enabled, attempts to open /proc/self/mem for write access
+	  will always fail.  Write access to this file allows bypassing
+	  of memory map permissions (such as modifying read-only code).

diff --git a/security/chromiumos/Makefile b/security/chromiumos/Makefile
new file mode 100644
index 0000000..a59b4ec
--- /dev/null
+++ b/security/chromiumos/Makefile

@@ -0,0 +1,5 @@
+obj-$(CONFIG_SECURITY_CHROMIUMOS) := chromiumos_lsm.o
+
+chromiumos_lsm-y := inode_mark.o lsm.o securityfs.o utils.o
+
+obj-$(CONFIG_ALT_SYSCALL_CHROMIUMOS) += alt-syscall.o

diff --git a/security/chromiumos/alt-syscall.c b/security/chromiumos/alt-syscall.c
new file mode 100644
index 0000000..c4adfcd
--- /dev/null
+++ b/security/chromiumos/alt-syscall.c

@@ -0,0 +1,492 @@
+/*
+ * Chromium OS alt-syscall tables
+ *
+ * Copyright (C) 2015 Google, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/alt-syscall.h>
+#include <linux/compat.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/prctl.h>
+#include <linux/sched/types.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/syscalls.h>
+#include <linux/timex.h>
+
+#include <asm/unistd.h>
+
+#include "alt-syscall.h"
+#include "android_whitelists.h"
+#include "complete_whitelists.h"
+#include "read_write_test_whitelists.h"
+#include "third_party_whitelists.h"
+
+#ifdef CONFIG_ALT_SYSCALL_CHROMIUMOS_LEGACY_API
+
+/* Intercept and log blocked syscalls. */
+static asmlinkage long block_syscall(void)
+{
+	struct task_struct *task = current;
+	struct pt_regs *regs = task_pt_regs(task);
+
+	pr_warn_ratelimited("[%d] %s: blocked syscall %d\n", task_pid_nr(task),
+			    task->comm, syscall_get_nr(task, regs));
+
+	return -ENOSYS;
+}
+
+/*
+ * In permissive mode, warn that the syscall was blocked, but still allow
+ * it to go through.  Note that since we don't have an easy way to map from
+ * syscall to number of arguments, we pass the maximum (6).
+ */
+static asmlinkage long warn_syscall(void)
+{
+	struct task_struct *task = current;
+	struct pt_regs *regs = task_pt_regs(task);
+	int nr = syscall_get_nr(task, regs);
+	sys_call_ptr_t fn = default_table.table[nr];
+	unsigned long args[6];
+
+	pr_warn_ratelimited("[%d] %s: syscall %d not whitelisted\n",
+			    task_pid_nr(task), task->comm, nr);
+
+	syscall_get_arguments(task, regs, 0, ARRAY_SIZE(args), args);
+
+	return fn(args[0], args[1], args[2], args[3], args[4], args[5]);
+}
+
+#else
+
+/* Intercept and log blocked syscalls. */
+static asmlinkage long block_syscall(struct pt_regs *regs)
+{
+	struct task_struct *task = current;
+
+	pr_warn_ratelimited("[%d] %s: blocked syscall %d\n", task_pid_nr(task),
+		task->comm, syscall_get_nr(task, regs));
+
+	return -ENOSYS;
+}
+
+/*
+ * In permissive mode, warn that the syscall was blocked, but still allow
+ * it to go through.  Note that since we don't have an easy way to map from
+ * syscall to number of arguments, we pass the maximum (6).
+ */
+static asmlinkage long warn_syscall(struct pt_regs *regs)
+{
+	struct task_struct *task = current;
+	int nr = syscall_get_nr(task, regs);
+	sys_call_ptr_t fn = (sys_call_ptr_t)default_table.table[nr];
+
+	pr_warn_ratelimited("[%d] %s: syscall %d not whitelisted\n",
+			    task_pid_nr(task), task->comm, nr);
+
+	return fn(regs);
+}
+
+#ifdef CONFIG_COMPAT
+static asmlinkage long warn_compat_syscall(struct pt_regs *regs)
+{
+	struct task_struct *task = current;
+	int nr = syscall_get_nr(task, regs);
+	sys_call_ptr_t fn = (sys_call_ptr_t)default_table.compat_table[nr];
+
+	pr_warn_ratelimited("[%d] %s: compat syscall %d not whitelisted\n",
+			    task_pid_nr(task), task->comm, nr);
+
+	return fn(regs);
+}
+#endif /* CONFIG_COMPAT */
+#endif /* CONFIG_ALT_SYSCALL_CHROMIUMOS_LEGACY_API */
+
+static inline long do_alt_sys_prctl(int option, unsigned long arg2,
+				    unsigned long arg3, unsigned long arg4,
+				    unsigned long arg5)
+
+{
+	if (option == PR_ALT_SYSCALL &&
+	    arg2 == PR_ALT_SYSCALL_SET_SYSCALL_TABLE)
+		return -EPERM;
+
+	return ksys_prctl(option, arg2, arg3, arg4, arg5);
+}
+DEF_ALT_SYS(alt_sys_prctl, 5, int, unsigned long, unsigned long,
+	    unsigned long, unsigned long);
+
+/* Thread priority used by Android. */
+#define ANDROID_PRIORITY_FOREGROUND     -2
+#define ANDROID_PRIORITY_DISPLAY        -4
+#define ANDROID_PRIORITY_URGENT_DISPLAY -8
+#define ANDROID_PRIORITY_AUDIO         -16
+#define ANDROID_PRIORITY_URGENT_AUDIO  -19
+#define ANDROID_PRIORITY_HIGHEST       -20
+
+/* Reduced priority when running inside container. */
+#define CONTAINER_PRIORITY_FOREGROUND     -1
+#define CONTAINER_PRIORITY_DISPLAY        -2
+#define CONTAINER_PRIORITY_URGENT_DISPLAY -4
+#define CONTAINER_PRIORITY_AUDIO          -8
+#define CONTAINER_PRIORITY_URGENT_AUDIO   -9
+#define CONTAINER_PRIORITY_HIGHEST       -10
+
+/*
+ * TODO(mortonm): Move the implementation of these Android-specific
+ * alt-syscalls (starting with android_*) to their own .c file.
+ */
+static long do_android_getpriority(int which, int who)
+{
+	int prio, nice;
+
+	prio = ksys_getpriority(which, who);
+	if (prio <= 20)
+		return prio;
+
+	nice = -(prio - 20);
+	switch (nice) {
+	case CONTAINER_PRIORITY_FOREGROUND:
+		nice = ANDROID_PRIORITY_FOREGROUND;
+		break;
+	case CONTAINER_PRIORITY_DISPLAY:
+		nice = ANDROID_PRIORITY_DISPLAY;
+		break;
+	case CONTAINER_PRIORITY_URGENT_DISPLAY:
+		nice = ANDROID_PRIORITY_URGENT_DISPLAY;
+		break;
+	case CONTAINER_PRIORITY_AUDIO:
+		nice = ANDROID_PRIORITY_AUDIO;
+		break;
+	case CONTAINER_PRIORITY_URGENT_AUDIO:
+		nice = ANDROID_PRIORITY_URGENT_AUDIO;
+		break;
+	case CONTAINER_PRIORITY_HIGHEST:
+		nice = ANDROID_PRIORITY_HIGHEST;
+		break;
+	}
+
+	return -nice + 20;
+}
+DEF_ALT_SYS(android_getpriority, 2, int, int);
+
+static inline long do_android_keyctl(int cmd, unsigned long arg2,
+				     unsigned long arg3, unsigned long arg4,
+				     unsigned long arg5)
+{
+	return -EACCES;
+}
+DEF_ALT_SYS(android_keyctl, 5, int, unsigned long, unsigned long,
+	    unsigned long, unsigned long);
+
+static inline int do_android_setpriority(int which, int who, int niceval)
+{
+	if (niceval < 0) {
+		if (niceval < -20)
+			niceval = -20;
+
+		niceval = niceval / 2;
+	}
+
+	return ksys_setpriority(which, who, niceval);
+}
+DEF_ALT_SYS(android_setpriority, 3, int, int, int);
+
+static inline long
+do_android_sched_setscheduler(pid_t pid, int policy,
+			      struct sched_param __user *param)
+{
+	struct sched_param lparam;
+	struct task_struct *p;
+	long retval;
+
+	if (policy < 0)
+		return -EINVAL;
+
+	if (!param || pid < 0)
+		return -EINVAL;
+	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+		return -EFAULT;
+
+	rcu_read_lock();
+	retval = -ESRCH;
+	p = pid ? find_task_by_vpid(pid) : current;
+	if (p != NULL) {
+		const struct cred *cred = current_cred();
+		kuid_t android_root_uid, android_system_uid;
+
+		/*
+		 * Allow root(0) and system(1000) processes to set RT scheduler.
+		 *
+		 * The system_server process run under system provides
+		 * SchedulingPolicyService which is used by audioflinger and
+		 * other services to boost their threads, so allow it to set RT
+		 * scheduler for other threads.
+		 */
+		android_root_uid = make_kuid(cred->user_ns, 0);
+		android_system_uid = make_kuid(cred->user_ns, 1000);
+		if ((uid_eq(cred->euid, android_root_uid) ||
+		     uid_eq(cred->euid, android_system_uid)) &&
+		    ns_capable(cred->user_ns, CAP_SYS_NICE))
+			retval = sched_setscheduler_nocheck(p, policy, &lparam);
+		else
+			retval = sched_setscheduler(p, policy, &lparam);
+	}
+	rcu_read_unlock();
+
+	return retval;
+}
+DEF_ALT_SYS(android_sched_setscheduler, 3, pid_t, int,
+	    struct sched_param __user *);
+
+/*
+ * sched_setparam() passes in -1 for its policy, to let the functions
+ * it calls know not to change it.
+ */
+#define SETPARAM_POLICY -1
+
+static inline long do_android_sched_setparam(pid_t pid,
+					     struct sched_param __user *param)
+{
+	return do_android_sched_setscheduler(pid, SETPARAM_POLICY, param);
+}
+DEF_ALT_SYS(android_sched_setparam, 2, pid_t, struct sched_param __user *);
+
+static inline long do_android_socket(int domain, int type, int protocol)
+{
+	if (domain == AF_VSOCK)
+	       return -EACCES;
+
+	return __sys_socket(domain, type, protocol);
+}
+DEF_ALT_SYS(android_socket, 3, int, int, int);
+
+static inline long do_android_perf_event_open(
+	struct perf_event_attr __user *attr_uptr, pid_t pid, int cpu,
+	int group_fd, unsigned long flags)
+{
+	if (!allow_devmode_syscalls)
+		return -EACCES;
+
+	return ksys_perf_event_open(attr_uptr, pid, cpu, group_fd, flags);
+}
+DEF_ALT_SYS(android_perf_event_open, 5, struct perf_event_attr __user *,
+	    pid_t, int, int, unsigned long);
+
+static inline long do_android_adjtimex(struct timex __user *buf)
+{
+	struct timex kbuf;
+
+	/* adjtimex() is allowed only for read. */
+	if (copy_from_user(&kbuf, buf, sizeof(struct timex)))
+		return -EFAULT;
+	if (kbuf.modes != 0)
+		return -EPERM;
+
+	return ksys_adjtimex(buf);
+}
+DEF_ALT_SYS(android_adjtimex, 1, struct timex __user *);
+
+static inline long do_android_clock_adjtime(clockid_t which_clock,
+					    struct timex __user *buf)
+{
+	struct timex kbuf;
+
+	/* clock_adjtime() is allowed only for read. */
+	if (copy_from_user(&kbuf, buf, sizeof(struct timex)))
+		return -EFAULT;
+
+	if (kbuf.modes != 0)
+		return -EPERM;
+
+	return ksys_clock_adjtime(which_clock, buf);
+}
+DEF_ALT_SYS(android_clock_adjtime, 2, clockid_t, struct timex __user *);
+
+static inline long do_android_getcpu(unsigned __user *cpu,
+				     unsigned __user *node,
+				     struct getcpu_cache __user *cache)
+{
+	if (node || cache)
+		return -EPERM;
+
+	return ksys_getcpu(cpu, node, cache);
+}
+DEF_ALT_SYS(android_getcpu, 3, unsigned __user *, unsigned __user *,
+	    struct getcpu_cache __user *);
+
+#ifdef CONFIG_COMPAT
+static inline long do_android_compat_adjtimex(struct compat_timex __user *buf)
+{
+	struct compat_timex kbuf;
+
+	/* adjtimex() is allowed only for read. */
+	if (copy_from_user(&kbuf, buf, sizeof(struct compat_timex)))
+		return -EFAULT;
+
+	if (kbuf.modes != 0)
+		return -EPERM;
+
+	return compat_ksys_adjtimex(buf);
+}
+DEF_ALT_SYS(android_compat_adjtimex, 1, struct compat_timex __user *);
+
+static inline long
+do_android_compat_clock_adjtime(clockid_t which_clock,
+                                struct compat_timex __user *buf)
+{
+	struct compat_timex kbuf;
+
+	/* clock_adjtime() is allowed only for read. */
+	if (copy_from_user(&kbuf, buf, sizeof(struct compat_timex)))
+		return -EFAULT;
+
+	if (kbuf.modes != 0)
+		return -EPERM;
+
+	return compat_ksys_clock_adjtime(which_clock, buf);
+}
+DEF_ALT_SYS(android_compat_clock_adjtime, 2, clockid_t,
+	    struct compat_timex __user *);
+
+#endif /* CONFIG_COMPAT */
+
+static struct syscall_whitelist whitelists[] = {
+	SYSCALL_WHITELIST(read_write_test),
+	SYSCALL_WHITELIST(android),
+	PERMISSIVE_SYSCALL_WHITELIST(android),
+	SYSCALL_WHITELIST(third_party),
+	PERMISSIVE_SYSCALL_WHITELIST(third_party),
+	SYSCALL_WHITELIST(complete),
+	PERMISSIVE_SYSCALL_WHITELIST(complete)
+};
+
+static int alt_syscall_apply_whitelist(const struct syscall_whitelist *wl,
+				       struct alt_sys_call_table *t)
+{
+	unsigned int i;
+	DECLARE_BITMAP(whitelist, t->size);
+
+	bitmap_zero(whitelist, t->size);
+	for (i = 0; i < wl->nr_whitelist; i++) {
+		unsigned int nr = wl->whitelist[i].nr;
+
+		if (nr >= t->size)
+			return -EINVAL;
+		bitmap_set(whitelist, nr, 1);
+		if (wl->whitelist[i].alt)
+			t->table[nr] = wl->whitelist[i].alt;
+	}
+
+	for (i = 0; i < t->size; i++) {
+		if (!test_bit(i, whitelist)) {
+			t->table[i] = wl->permissive ?
+				(sys_call_ptr_t)warn_syscall :
+				(sys_call_ptr_t)block_syscall;
+		}
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_COMPAT
+static int
+alt_syscall_apply_compat_whitelist(const struct syscall_whitelist *wl,
+				   struct alt_sys_call_table *t)
+{
+	unsigned int i;
+	DECLARE_BITMAP(whitelist, t->compat_size);
+
+	bitmap_zero(whitelist, t->compat_size);
+	for (i = 0; i < wl->nr_compat_whitelist; i++) {
+		unsigned int nr = wl->compat_whitelist[i].nr;
+
+		if (nr >= t->compat_size)
+			return -EINVAL;
+		bitmap_set(whitelist, nr, 1);
+		if (wl->compat_whitelist[i].alt)
+			t->compat_table[nr] = wl->compat_whitelist[i].alt;
+	}
+
+	for (i = 0; i < t->compat_size; i++) {
+		if (!test_bit(i, whitelist)) {
+			t->compat_table[i] = wl->permissive ?
+				(sys_call_ptr_t)warn_compat_syscall :
+				(sys_call_ptr_t)block_syscall;
+		}
+	}
+
+	return 0;
+}
+#else
+static inline int
+alt_syscall_apply_compat_whitelist(const struct syscall_whitelist *wl,
+				   struct alt_sys_call_table *t)
+{
+	return 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static int alt_syscall_init_one(const struct syscall_whitelist *wl)
+{
+	struct alt_sys_call_table *t;
+	int err;
+
+	t = kzalloc(sizeof(*t), GFP_KERNEL);
+	if (!t)
+		return -ENOMEM;
+	strncpy(t->name, wl->name, sizeof(t->name));
+
+	err = arch_dup_sys_call_table(t);
+	if (err)
+		return err;
+
+	err = alt_syscall_apply_whitelist(wl, t);
+	if (err)
+		return err;
+	err = alt_syscall_apply_compat_whitelist(wl, t);
+	if (err)
+		return err;
+
+	return register_alt_sys_call_table(t);
+}
+
+/*
+ * Register an alternate syscall table for each whitelist.  Note that the
+ * lack of a module_exit() is intentional - once a syscall table is registered
+ * it cannot be unregistered.
+ *
+ * TODO(abrestic) Support unregistering syscall tables?
+ */
+static int chromiumos_alt_syscall_init(void)
+{
+	unsigned int i;
+	int err;
+
+#ifdef CONFIG_SYSCTL
+	if (!register_sysctl_paths(chromiumos_sysctl_path,
+				   chromiumos_sysctl_table))
+		pr_warn("Failed to register sysctl\n");
+#endif
+
+	err = arch_dup_sys_call_table(&default_table);
+	if (err)
+		return err;
+
+	for (i = 0; i < ARRAY_SIZE(whitelists); i++) {
+		err = alt_syscall_init_one(&whitelists[i]);
+		if (err)
+			pr_warn("Failed to register syscall table %s: %d\n",
+				whitelists[i].name, err);
+	}
+
+	return 0;
+}
+module_init(chromiumos_alt_syscall_init);

diff --git a/security/chromiumos/alt-syscall.h b/security/chromiumos/alt-syscall.h
new file mode 100644
index 0000000..16473a2
--- /dev/null
+++ b/security/chromiumos/alt-syscall.h

@@ -0,0 +1,440 @@
+/*
+ * Linux Security Module for Chromium OS
+ *
+ * Copyright 2018 Google LLC. All Rights Reserved
+ *
+ * Authors:
+ *      Micah Morton <mortonm@chromium.org>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef ALT_SYSCALL_H
+#define ALT_SYSCALL_H
+
+/*
+ * NOTE: this file uses the 'static' keyword for variable and function
+ * definitions because alt-syscall.c is the only .c file that is expected to
+ * include this header. Definitions were pulled out from alt-syscall.c into
+ * this header and the *_whitelists.h headers for the sake of readability.
+ */
+
+static int allow_devmode_syscalls;
+
+#ifdef CONFIG_SYSCTL
+static int zero;
+static int one = 1;
+
+static struct ctl_path chromiumos_sysctl_path[] = {
+        { .procname = "kernel", },
+        { .procname = "chromiumos", },
+        { .procname = "alt_syscall", },
+        { }
+};
+
+static struct ctl_table chromiumos_sysctl_table[] = {
+        {
+                .procname       = "allow_devmode_syscalls",
+                .data           = &allow_devmode_syscalls,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = proc_dointvec_minmax,
+                .extra1         = &zero,
+                .extra2         = &one,
+        },
+        { }
+};
+#endif
+
+struct syscall_whitelist_entry {
+        unsigned int nr;
+        sys_call_ptr_t alt;
+};
+
+struct syscall_whitelist {
+        const char *name;
+        const struct syscall_whitelist_entry *whitelist;
+        unsigned int nr_whitelist;
+#ifdef CONFIG_COMPAT
+        const struct syscall_whitelist_entry *compat_whitelist;
+        unsigned int nr_compat_whitelist;
+#endif
+        bool permissive;
+};
+
+static struct alt_sys_call_table default_table;
+
+#define SYSCALL_ENTRY_ALT(name, func)                                   \
+        {                                                               \
+                .nr = __NR_ ## name,                                    \
+                .alt = (sys_call_ptr_t)func,                            \
+        }
+#define SYSCALL_ENTRY(name) SYSCALL_ENTRY_ALT(name, NULL)
+#define COMPAT_SYSCALL_ENTRY_ALT(name, func)                            \
+        {                                                               \
+                .nr = __NR_compat_ ## name,                             \
+                .alt = (sys_call_ptr_t)func,                            \
+        }
+#define COMPAT_SYSCALL_ENTRY(name) COMPAT_SYSCALL_ENTRY_ALT(name, NULL)
+
+
+#ifdef CONFIG_ALT_SYSCALL_CHROMIUMOS_LEGACY_API
+#define ALT_SYS_ARG(n)	arg ## n
+#else
+#define ALT_SYS_ARG(n)	args[n - 1]
+#endif
+
+#define ALT_SYS_ARGS1_CALL(dt1) \
+	(dt1)ALT_SYS_ARG(1)
+#define ALT_SYS_ARGS2_CALL(dt1, dt2) \
+	ALT_SYS_ARGS1_CALL(dt1), (dt2)ALT_SYS_ARG(2)
+#define ALT_SYS_ARGS3_CALL(dt1, dt2, dt3) \
+	ALT_SYS_ARGS2_CALL(dt1, dt2), (dt3)ALT_SYS_ARG(3)
+#define ALT_SYS_ARGS4_CALL(dt1, dt2, dt3, dt4) \
+	ALT_SYS_ARGS3_CALL(dt1, dt2, dt3), (dt4)ALT_SYS_ARG(4)
+#define ALT_SYS_ARGS5_CALL(dt1, dt2, dt3, dt4, dt5) \
+	ALT_SYS_ARGS4_CALL(dt1, dt2, dt3, dt4), (dt5)ALT_SYS_ARG(5)
+#define ALT_SYS_ARGS6_CALL(dt1, dt2, dt3, dt4, dt5, dt6) \
+	ALT_SYS_ARGS5_CALL(dt1, dt2, dt3, dt4, dt5), (dt6)ALT_SYS_ARG(6)
+
+#ifdef CONFIG_ALT_SYSCALL_CHROMIUMOS_LEGACY_API
+
+#define ALT_SYS_ARGS1_HDR	unsigned long arg1
+#define ALT_SYS_ARGS2_HDR	ALT_SYS_ARGS1_HDR, unsigned long arg2
+#define ALT_SYS_ARGS3_HDR	ALT_SYS_ARGS2_HDR, unsigned long arg3
+#define ALT_SYS_ARGS4_HDR	ALT_SYS_ARGS3_HDR, unsigned long arg4
+#define ALT_SYS_ARGS5_HDR	ALT_SYS_ARGS4_HDR, unsigned long arg5
+#define ALT_SYS_ARGS6_HDR	ALT_SYS_ARGS5_HDR, unsigned long arg6
+
+#define DEF_ALT_SYS(func_name, nargs, ...)					\
+static asmlinkage long func_name(ALT_SYS_ARGS ## nargs ## _HDR)			\
+{										\
+	return do_##func_name(ALT_SYS_ARGS ## nargs ## _CALL(__VA_ARGS__));	\
+}
+
+#define DECL_ALT_SYS(func_name, nargs) \
+static asmlinkage long func_name(ALT_SYS_ARGS ## nargs ## _HDR)
+
+#else
+
+#define DEF_ALT_SYS(func_name, nargs, ...)			\
+static asmlinkage long func_name(struct pt_regs *regs)		\
+{								\
+	struct task_struct *task = current;			\
+	unsigned long args[nargs];				\
+								\
+	syscall_get_arguments(task, regs, 0, nargs, args);	\
+								\
+	return do_##func_name(ALT_SYS_ARGS ## nargs ## _CALL(__VA_ARGS__));	\
+}
+
+#define DECL_ALT_SYS(func_name, nargs)			\
+static asmlinkage long func_name(struct pt_regs *regs)
+
+#endif /* CONFIG_ALT_SYSCALL_CHROMIUMOS_LEGACY_API */
+
+/*
+ * If an alt_syscall table allows prctl(), override it to prevent a process
+ * from changing its syscall table.
+ */
+DECL_ALT_SYS(alt_sys_prctl, 5);
+
+#ifdef CONFIG_COMPAT
+#define SYSCALL_WHITELIST_COMPAT(x)                                     \
+        .compat_whitelist = x ## _compat_whitelist,                     \
+        .nr_compat_whitelist = ARRAY_SIZE(x ## _compat_whitelist),
+#else
+#define SYSCALL_WHITELIST_COMPAT(x)
+#endif
+
+#define SYSCALL_WHITELIST(x)                                            \
+        {                                                               \
+                .name = #x,                                             \
+                .whitelist = x ## _whitelist,                           \
+                .nr_whitelist = ARRAY_SIZE(x ## _whitelist),            \
+                SYSCALL_WHITELIST_COMPAT(x)                             \
+        }
+
+#define PERMISSIVE_SYSCALL_WHITELIST(x)                                 \
+        {                                                               \
+                .name = #x "_permissive",                               \
+                .permissive = true,                                     \
+                .whitelist = x ## _whitelist,                           \
+                .nr_whitelist = ARRAY_SIZE(x ## _whitelist),            \
+                SYSCALL_WHITELIST_COMPAT(x)                             \
+        }
+
+#ifdef CONFIG_COMPAT
+#ifdef CONFIG_X86_64
+#define __NR_compat_access      __NR_ia32_access
+#define __NR_compat_adjtimex    __NR_ia32_adjtimex
+#define __NR_compat_brk __NR_ia32_brk
+#define __NR_compat_capget      __NR_ia32_capget
+#define __NR_compat_capset      __NR_ia32_capset
+#define __NR_compat_chdir       __NR_ia32_chdir
+#define __NR_compat_chmod       __NR_ia32_chmod
+#define __NR_compat_clock_adjtime       __NR_ia32_clock_adjtime
+#define __NR_compat_clock_getres        __NR_ia32_clock_getres
+#define __NR_compat_clock_gettime       __NR_ia32_clock_gettime
+#define __NR_compat_clock_nanosleep     __NR_ia32_clock_nanosleep
+#define __NR_compat_clock_settime       __NR_ia32_clock_settime
+#define __NR_compat_clone       __NR_ia32_clone
+#define __NR_compat_close       __NR_ia32_close
+#define __NR_compat_creat       __NR_ia32_creat
+#define __NR_compat_dup __NR_ia32_dup
+#define __NR_compat_dup2        __NR_ia32_dup2
+#define __NR_compat_dup3        __NR_ia32_dup3
+#define __NR_compat_epoll_create        __NR_ia32_epoll_create
+#define __NR_compat_epoll_create1       __NR_ia32_epoll_create1
+#define __NR_compat_epoll_ctl   __NR_ia32_epoll_ctl
+#define __NR_compat_epoll_wait  __NR_ia32_epoll_wait
+#define __NR_compat_epoll_pwait __NR_ia32_epoll_pwait
+#define __NR_compat_eventfd     __NR_ia32_eventfd
+#define __NR_compat_eventfd2    __NR_ia32_eventfd2
+#define __NR_compat_execve      __NR_ia32_execve
+#define __NR_compat_exit        __NR_ia32_exit
+#define __NR_compat_exit_group  __NR_ia32_exit_group
+#define __NR_compat_faccessat   __NR_ia32_faccessat
+#define __NR_compat_fallocate   __NR_ia32_fallocate
+#define __NR_compat_fchdir      __NR_ia32_fchdir
+#define __NR_compat_fchmod      __NR_ia32_fchmod
+#define __NR_compat_fchmodat    __NR_ia32_fchmodat
+#define __NR_compat_fchown      __NR_ia32_fchown
+#define __NR_compat_fchownat    __NR_ia32_fchownat
+#define __NR_compat_fcntl       __NR_ia32_fcntl
+#define __NR_compat_fdatasync   __NR_ia32_fdatasync
+#define __NR_compat_fgetxattr   __NR_ia32_fgetxattr
+#define __NR_compat_flistxattr  __NR_ia32_flistxattr
+#define __NR_compat_flock       __NR_ia32_flock
+#define __NR_compat_fork        __NR_ia32_fork
+#define __NR_compat_fremovexattr        __NR_ia32_fremovexattr
+#define __NR_compat_fsetxattr   __NR_ia32_fsetxattr
+#define __NR_compat_fstat       __NR_ia32_fstat
+#define __NR_compat_fstatfs     __NR_ia32_fstatfs
+#define __NR_compat_fsync       __NR_ia32_fsync
+#define __NR_compat_ftruncate   __NR_ia32_ftruncate
+#define __NR_compat_futex       __NR_ia32_futex
+#define __NR_compat_futimesat   __NR_ia32_futimesat
+#define __NR_compat_getcpu      __NR_ia32_getcpu
+#define __NR_compat_getcwd      __NR_ia32_getcwd
+#define __NR_compat_getdents    __NR_ia32_getdents
+#define __NR_compat_getdents64  __NR_ia32_getdents64
+#define __NR_compat_getegid     __NR_ia32_getegid
+#define __NR_compat_geteuid     __NR_ia32_geteuid
+#define __NR_compat_getgid      __NR_ia32_getgid
+#define __NR_compat_getgroups32 __NR_ia32_getgroups32
+#define __NR_compat_getpgid     __NR_ia32_getpgid
+#define __NR_compat_getpgrp     __NR_ia32_getpgrp
+#define __NR_compat_getpid      __NR_ia32_getpid
+#define __NR_compat_getppid     __NR_ia32_getppid
+#define __NR_compat_getpriority __NR_ia32_getpriority
+#define __NR_compat_getrandom   __NR_ia32_getrandom
+#define __NR_compat_getresgid   __NR_ia32_getresgid
+#define __NR_compat_getresuid   __NR_ia32_getresuid
+#define __NR_compat_getrlimit   __NR_ia32_getrlimit
+#define __NR_compat_getrusage   __NR_ia32_getrusage
+#define __NR_compat_getsid      __NR_ia32_getsid
+#define __NR_compat_gettid      __NR_ia32_gettid
+#define __NR_compat_gettimeofday        __NR_ia32_gettimeofday
+#define __NR_compat_getuid      __NR_ia32_getuid
+#define __NR_compat_getxattr    __NR_ia32_getxattr
+#define __NR_compat_inotify_add_watch   __NR_ia32_inotify_add_watch
+#define __NR_compat_inotify_init        __NR_ia32_inotify_init
+#define __NR_compat_inotify_init1       __NR_ia32_inotify_init1
+#define __NR_compat_inotify_rm_watch    __NR_ia32_inotify_rm_watch
+#define __NR_compat_ioctl       __NR_ia32_ioctl
+#define __NR_compat_io_destroy  __NR_ia32_io_destroy
+#define __NR_compat_io_getevents      __NR_ia32_io_getevents
+#define __NR_compat_io_setup  __NR_ia32_io_setup
+#define __NR_compat_io_submit __NR_ia32_io_submit
+#define __NR_compat_ioprio_set  __NR_ia32_ioprio_set
+#define __NR_compat_keyctl      __NR_ia32_keyctl
+#define __NR_compat_kill        __NR_ia32_kill
+#define __NR_compat_lgetxattr   __NR_ia32_lgetxattr
+#define __NR_compat_link        __NR_ia32_link
+#define __NR_compat_linkat      __NR_ia32_linkat
+#define __NR_compat_listxattr   __NR_ia32_listxattr
+#define __NR_compat_llistxattr  __NR_ia32_llistxattr
+#define __NR_compat_lremovexattr        __NR_ia32_lremovexattr
+#define __NR_compat_lseek       __NR_ia32_lseek
+#define __NR_compat_lsetxattr   __NR_ia32_lsetxattr
+#define __NR_compat_lstat       __NR_ia32_lstat
+#define __NR_compat_madvise     __NR_ia32_madvise
+#define __NR_compat_memfd_create        __NR_ia32_memfd_create
+#define __NR_compat_mincore     __NR_ia32_mincore
+#define __NR_compat_mkdir       __NR_ia32_mkdir
+#define __NR_compat_mkdirat     __NR_ia32_mkdirat
+#define __NR_compat_mknod       __NR_ia32_mknod
+#define __NR_compat_mknodat     __NR_ia32_mknodat
+#define __NR_compat_mlock       __NR_ia32_mlock
+#define __NR_compat_munlock     __NR_ia32_munlock
+#define __NR_compat_mlockall    __NR_ia32_mlockall
+#define __NR_compat_munlockall  __NR_ia32_munlockall
+#define __NR_compat_modify_ldt  __NR_ia32_modify_ldt
+#define __NR_compat_mount       __NR_ia32_mount
+#define __NR_compat_mprotect    __NR_ia32_mprotect
+#define __NR_compat_mremap      __NR_ia32_mremap
+#define __NR_compat_msync       __NR_ia32_msync
+#define __NR_compat_munmap      __NR_ia32_munmap
+#define __NR_compat_name_to_handle_at   __NR_ia32_name_to_handle_at
+#define __NR_compat_nanosleep   __NR_ia32_nanosleep
+#define __NR_compat_open        __NR_ia32_open
+#define __NR_compat_open_by_handle_at   __NR_ia32_open_by_handle_at
+#define __NR_compat_openat      __NR_ia32_openat
+#define __NR_compat_perf_event_open     __NR_ia32_perf_event_open
+#define __NR_compat_personality __NR_ia32_personality
+#define __NR_compat_pipe        __NR_ia32_pipe
+#define __NR_compat_pipe2       __NR_ia32_pipe2
+#define __NR_compat_poll        __NR_ia32_poll
+#define __NR_compat_ppoll       __NR_ia32_ppoll
+#define __NR_compat_prctl       __NR_ia32_prctl
+#define __NR_compat_pread64     __NR_ia32_pread64
+#define __NR_compat_preadv      __NR_ia32_preadv
+#define __NR_compat_prlimit64   __NR_ia32_prlimit64
+#define __NR_compat_process_vm_readv    __NR_ia32_process_vm_readv
+#define __NR_compat_process_vm_writev   __NR_ia32_process_vm_writev
+#define __NR_compat_pselect6    __NR_ia32_pselect6
+#define __NR_compat_ptrace      __NR_ia32_ptrace
+#define __NR_compat_pwrite64    __NR_ia32_pwrite64
+#define __NR_compat_pwritev     __NR_ia32_pwritev
+#define __NR_compat_read        __NR_ia32_read
+#define __NR_compat_readahead   __NR_ia32_readahead
+#define __NR_compat_readv       __NR_ia32_readv
+#define __NR_compat_readlink    __NR_ia32_readlink
+#define __NR_compat_readlinkat  __NR_ia32_readlinkat
+#define __NR_compat_recvmmsg    __NR_ia32_recvmmsg
+#define __NR_compat_remap_file_pages    __NR_ia32_remap_file_pages
+#define __NR_compat_removexattr __NR_ia32_removexattr
+#define __NR_compat_rename      __NR_ia32_rename
+#define __NR_compat_renameat    __NR_ia32_renameat
+#define __NR_compat_restart_syscall     __NR_ia32_restart_syscall
+#define __NR_compat_rmdir       __NR_ia32_rmdir
+#define __NR_compat_rt_sigaction        __NR_ia32_rt_sigaction
+#define __NR_compat_rt_sigpending       __NR_ia32_rt_sigpending
+#define __NR_compat_rt_sigprocmask      __NR_ia32_rt_sigprocmask
+#define __NR_compat_rt_sigqueueinfo     __NR_ia32_rt_sigqueueinfo
+#define __NR_compat_rt_sigreturn        __NR_ia32_rt_sigreturn
+#define __NR_compat_rt_sigsuspend       __NR_ia32_rt_sigsuspend
+#define __NR_compat_rt_sigtimedwait     __NR_ia32_rt_sigtimedwait
+#define __NR_compat_rt_tgsigqueueinfo   __NR_ia32_rt_tgsigqueueinfo
+#define __NR_compat_sched_get_priority_max      __NR_ia32_sched_get_priority_max
+#define __NR_compat_sched_get_priority_min      __NR_ia32_sched_get_priority_min
+#define __NR_compat_sched_getaffinity   __NR_ia32_sched_getaffinity
+#define __NR_compat_sched_getparam      __NR_ia32_sched_getparam
+#define __NR_compat_sched_getscheduler  __NR_ia32_sched_getscheduler
+#define __NR_compat_sched_setaffinity   __NR_ia32_sched_setaffinity
+#define __NR_compat_sched_setparam      __NR_ia32_sched_setparam
+#define __NR_compat_sched_setscheduler  __NR_ia32_sched_setscheduler
+#define __NR_compat_sched_yield __NR_ia32_sched_yield
+#define __NR_compat_seccomp     __NR_ia32_seccomp
+#define __NR_compat_sendfile    __NR_ia32_sendfile
+#define __NR_compat_sendfile64  __NR_ia32_sendfile64
+#define __NR_compat_sendmmsg    __NR_ia32_sendmmsg
+#define __NR_compat_setdomainname       __NR_ia32_setdomainname
+#define __NR_compat_set_robust_list     __NR_ia32_set_robust_list
+#define __NR_compat_set_tid_address     __NR_ia32_set_tid_address
+#define __NR_compat_set_thread_area     __NR_ia32_set_thread_area
+#define __NR_compat_setgid      __NR_ia32_setgid
+#define __NR_compat_setgroups   __NR_ia32_setgroups
+#define __NR_compat_setitimer   __NR_ia32_setitimer
+#define __NR_compat_setns       __NR_ia32_setns
+#define __NR_compat_setpgid     __NR_ia32_setpgid
+#define __NR_compat_setpriority __NR_ia32_setpriority
+#define __NR_compat_setregid    __NR_ia32_setregid
+#define __NR_compat_setresgid   __NR_ia32_setresgid
+#define __NR_compat_setresuid   __NR_ia32_setresuid
+#define __NR_compat_setrlimit   __NR_ia32_setrlimit
+#define __NR_compat_setsid      __NR_ia32_setsid
+#define __NR_compat_settimeofday        __NR_ia32_settimeofday
+#define __NR_compat_setuid      __NR_ia32_setuid
+#define __NR_compat_setxattr    __NR_ia32_setxattr
+#define __NR_compat_signalfd4   __NR_ia32_signalfd4
+#define __NR_compat_sigaltstack __NR_ia32_sigaltstack
+#define __NR_compat_socketcall  __NR_ia32_socketcall
+#define __NR_compat_splice      __NR_ia32_splice
+#define __NR_compat_stat        __NR_ia32_stat
+#define __NR_compat_statfs      __NR_ia32_statfs
+#define __NR_compat_symlink     __NR_ia32_symlink
+#define __NR_compat_symlinkat   __NR_ia32_symlinkat
+#define __NR_compat_sync        __NR_ia32_sync
+#define __NR_compat_syncfs      __NR_ia32_syncfs
+#define __NR_compat_sync_file_range     __NR_ia32_sync_file_range
+#define __NR_compat_sysinfo     __NR_ia32_sysinfo
+#define __NR_compat_syslog      __NR_ia32_syslog
+#define __NR_compat_tee         __NR_ia32_tee
+#define __NR_compat_tgkill      __NR_ia32_tgkill
+#define __NR_compat_tkill       __NR_ia32_tkill
+#define __NR_compat_time        __NR_ia32_time
+#define __NR_compat_timer_create        __NR_ia32_timer_create
+#define __NR_compat_timer_delete        __NR_ia32_timer_delete
+#define __NR_compat_timer_getoverrun    __NR_ia32_timer_getoverrun
+#define __NR_compat_timer_gettime       __NR_ia32_timer_gettime
+#define __NR_compat_timer_settime       __NR_ia32_timer_settime
+#define __NR_compat_timerfd_create      __NR_ia32_timerfd_create
+#define __NR_compat_timerfd_gettime     __NR_ia32_timerfd_gettime
+#define __NR_compat_timerfd_settime     __NR_ia32_timerfd_settime
+#define __NR_compat_times               __NR_ia32_times
+#define __NR_compat_truncate    __NR_ia32_truncate
+#define __NR_compat_umask       __NR_ia32_umask
+#define __NR_compat_umount2     __NR_ia32_umount2
+#define __NR_compat_uname       __NR_ia32_uname
+#define __NR_compat_unlink      __NR_ia32_unlink
+#define __NR_compat_unlinkat    __NR_ia32_unlinkat
+#define __NR_compat_unshare     __NR_ia32_unshare
+#define __NR_compat_ustat       __NR_ia32_ustat
+#define __NR_compat_utimensat   __NR_ia32_utimensat
+#define __NR_compat_utimes      __NR_ia32_utimes
+#define __NR_compat_vfork       __NR_ia32_vfork
+#define __NR_compat_vmsplice    __NR_ia32_vmsplice
+#define __NR_compat_wait4       __NR_ia32_wait4
+#define __NR_compat_waitid      __NR_ia32_waitid
+#define __NR_compat_waitpid     __NR_ia32_waitpid
+#define __NR_compat_write       __NR_ia32_write
+#define __NR_compat_writev      __NR_ia32_writev
+#define __NR_compat_chown32     __NR_ia32_chown32
+#define __NR_compat_fadvise64   __NR_ia32_fadvise64
+#define __NR_compat_fadvise64_64        __NR_ia32_fadvise64_64
+#define __NR_compat_fchown32    __NR_ia32_fchown32
+#define __NR_compat_fcntl64     __NR_ia32_fcntl64
+#define __NR_compat_fstat64     __NR_ia32_fstat64
+#define __NR_compat_fstatat64   __NR_ia32_fstatat64
+#define __NR_compat_fstatfs64   __NR_ia32_fstatfs64
+#define __NR_compat_ftruncate64 __NR_ia32_ftruncate64
+#define __NR_compat_getegid32   __NR_ia32_getegid32
+#define __NR_compat_geteuid32   __NR_ia32_geteuid32
+#define __NR_compat_getgid32    __NR_ia32_getgid32
+#define __NR_compat_getresgid32 __NR_ia32_getresgid32
+#define __NR_compat_getresuid32 __NR_ia32_getresuid32
+#define __NR_compat_getuid32    __NR_ia32_getuid32
+#define __NR_compat_lchown32    __NR_ia32_lchown32
+#define __NR_compat_lstat64     __NR_ia32_lstat64
+#define __NR_compat_mmap2       __NR_ia32_mmap2
+#define __NR_compat__newselect  __NR_ia32__newselect
+#define __NR_compat__llseek     __NR_ia32__llseek
+#define __NR_compat_sigaction   __NR_ia32_sigaction
+#define __NR_compat_sigpending  __NR_ia32_sigpending
+#define __NR_compat_sigprocmask __NR_ia32_sigprocmask
+#define __NR_compat_sigreturn   __NR_ia32_sigreturn
+#define __NR_compat_sigsuspend  __NR_ia32_sigsuspend
+#define __NR_compat_setgid32    __NR_ia32_setgid32
+#define __NR_compat_setgroups32 __NR_ia32_setgroups32
+#define __NR_compat_setregid32  __NR_ia32_setregid32
+#define __NR_compat_setresgid32 __NR_ia32_setresgid32
+#define __NR_compat_setresuid32 __NR_ia32_setresuid32
+#define __NR_compat_setreuid32  __NR_ia32_setreuid32
+#define __NR_compat_setuid32    __NR_ia32_setuid32
+#define __NR_compat_stat64      __NR_ia32_stat64
+#define __NR_compat_statfs64    __NR_ia32_statfs64
+#define __NR_compat_truncate64  __NR_ia32_truncate64
+#define __NR_compat_ugetrlimit  __NR_ia32_ugetrlimit
+#endif
+#endif
+
+#endif /* ALT_SYSCALL_H */

diff --git a/security/chromiumos/android_whitelists.h b/security/chromiumos/android_whitelists.h
new file mode 100644
index 0000000..56ae74a
--- /dev/null
+++ b/security/chromiumos/android_whitelists.h

@@ -0,0 +1,697 @@
+/*
+ * Linux Security Module for Chromium OS
+ *
+ * Copyright 2018 Google LLC. All Rights Reserved
+ *
+ * Authors:
+ *      Micah Morton <mortonm@chromium.org>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef ANDROID_WHITELISTS_H
+#define ANDROID_WHITELISTS_H
+
+/*
+ * NOTE: the purpose of this header is only to pull out the definition of this
+ * array from alt-syscall.c for the purposes of readability. It should not be
+ * included in other .c files.
+ */
+
+#include "alt-syscall.h"
+
+/*
+ * Syscall overrides for android.
+ */
+
+/*
+ * Reflect the priority adjustment done by android_setpriority.
+ * Note that the prio returned by getpriority has been offset by 20.
+ * (returns 40..1 instead of -20..19)
+ */
+DECL_ALT_SYS(android_getpriority, 2);
+/* Android does not get to call keyctl. */
+DECL_ALT_SYS(android_keyctl, 5);
+/* Make sure nothing sets a nice value more favorable than -10. */
+DECL_ALT_SYS(android_setpriority, 3);
+DECL_ALT_SYS(android_sched_setscheduler, 3);
+DECL_ALT_SYS(android_sched_setparam, 2);
+DECL_ALT_SYS(android_socket, 3);
+DECL_ALT_SYS(android_perf_event_open, 5);
+DECL_ALT_SYS(android_adjtimex, 1);
+DECL_ALT_SYS(android_clock_adjtime, 2);
+DECL_ALT_SYS(android_getcpu, 3);
+
+#ifdef CONFIG_COMPAT
+DECL_ALT_SYS(android_compat_adjtimex, 1);
+DECL_ALT_SYS(android_compat_clock_adjtime, 2);
+#endif /* CONFIG_COMPAT */
+
+static struct syscall_whitelist_entry android_whitelist[] = {
+	SYSCALL_ENTRY(accept),
+	SYSCALL_ENTRY(accept4),
+	SYSCALL_ENTRY_ALT(adjtimex, android_adjtimex),
+	SYSCALL_ENTRY(bind),
+	SYSCALL_ENTRY(brk),
+	SYSCALL_ENTRY(capget),
+	SYSCALL_ENTRY(capset),
+	SYSCALL_ENTRY(chdir),
+	SYSCALL_ENTRY_ALT(clock_adjtime, android_clock_adjtime),
+	SYSCALL_ENTRY(clock_getres),
+	SYSCALL_ENTRY(clock_gettime),
+	SYSCALL_ENTRY(clock_nanosleep),
+	SYSCALL_ENTRY(clock_settime),
+	SYSCALL_ENTRY(clone),
+	SYSCALL_ENTRY(close),
+	SYSCALL_ENTRY(connect),
+	SYSCALL_ENTRY(dup),
+	SYSCALL_ENTRY(dup3),
+	SYSCALL_ENTRY(epoll_create1),
+	SYSCALL_ENTRY(epoll_ctl),
+	SYSCALL_ENTRY(epoll_pwait),
+	SYSCALL_ENTRY(eventfd2),
+	SYSCALL_ENTRY(execve),
+	SYSCALL_ENTRY(exit),
+	SYSCALL_ENTRY(exit_group),
+	SYSCALL_ENTRY(faccessat),
+	SYSCALL_ENTRY(fallocate),
+	SYSCALL_ENTRY(fchdir),
+	SYSCALL_ENTRY(fchmod),
+	SYSCALL_ENTRY(fchmodat),
+	SYSCALL_ENTRY(fchownat),
+	SYSCALL_ENTRY(fcntl),
+	SYSCALL_ENTRY(fdatasync),
+	SYSCALL_ENTRY(fgetxattr),
+	SYSCALL_ENTRY(flistxattr),
+	SYSCALL_ENTRY(flock),
+	SYSCALL_ENTRY(fremovexattr),
+	SYSCALL_ENTRY(fsetxattr),
+	SYSCALL_ENTRY(fstat),
+	SYSCALL_ENTRY(fstatfs),
+	SYSCALL_ENTRY(fsync),
+	SYSCALL_ENTRY(ftruncate),
+	SYSCALL_ENTRY(futex),
+	SYSCALL_ENTRY_ALT(getcpu, android_getcpu),
+	SYSCALL_ENTRY(getcwd),
+	SYSCALL_ENTRY(getdents64),
+	SYSCALL_ENTRY(getpeername),
+	SYSCALL_ENTRY(getpgid),
+	SYSCALL_ENTRY(getpid),
+	SYSCALL_ENTRY(getppid),
+	SYSCALL_ENTRY_ALT(getpriority, android_getpriority),
+        SYSCALL_ENTRY(getrandom),
+	SYSCALL_ENTRY(getrusage),
+	SYSCALL_ENTRY(getsid),
+	SYSCALL_ENTRY(getsockname),
+	SYSCALL_ENTRY(getsockopt),
+	SYSCALL_ENTRY(gettid),
+	SYSCALL_ENTRY(gettimeofday),
+	SYSCALL_ENTRY(getxattr),
+	SYSCALL_ENTRY(inotify_add_watch),
+	SYSCALL_ENTRY(inotify_init1),
+	SYSCALL_ENTRY(inotify_rm_watch),
+	SYSCALL_ENTRY(ioctl),
+        SYSCALL_ENTRY(io_destroy),
+        SYSCALL_ENTRY(io_getevents),
+        SYSCALL_ENTRY(io_setup),
+        SYSCALL_ENTRY(io_submit),
+	SYSCALL_ENTRY(ioprio_set),
+        SYSCALL_ENTRY_ALT(keyctl, android_keyctl),
+	SYSCALL_ENTRY(kill),
+	SYSCALL_ENTRY(lgetxattr),
+	SYSCALL_ENTRY(linkat),
+	SYSCALL_ENTRY(listxattr),
+	SYSCALL_ENTRY(listen),
+	SYSCALL_ENTRY(llistxattr),
+	SYSCALL_ENTRY(lremovexattr),
+	SYSCALL_ENTRY(lseek),
+	SYSCALL_ENTRY(lsetxattr),
+	SYSCALL_ENTRY(madvise),
+        SYSCALL_ENTRY(memfd_create),
+	SYSCALL_ENTRY(mincore),
+	SYSCALL_ENTRY(mkdirat),
+	SYSCALL_ENTRY(mknodat),
+	SYSCALL_ENTRY(mlock),
+	SYSCALL_ENTRY(mlockall),
+	SYSCALL_ENTRY(munlock),
+	SYSCALL_ENTRY(munlockall),
+	SYSCALL_ENTRY(mount),
+	SYSCALL_ENTRY(mprotect),
+	SYSCALL_ENTRY(mremap),
+	SYSCALL_ENTRY(msync),
+	SYSCALL_ENTRY(munmap),
+	SYSCALL_ENTRY(name_to_handle_at),
+	SYSCALL_ENTRY(nanosleep),
+	SYSCALL_ENTRY(open_by_handle_at),
+	SYSCALL_ENTRY(openat),
+	SYSCALL_ENTRY_ALT(perf_event_open, android_perf_event_open),
+	SYSCALL_ENTRY(personality),
+	SYSCALL_ENTRY(pipe2),
+	SYSCALL_ENTRY(ppoll),
+	SYSCALL_ENTRY_ALT(prctl, alt_sys_prctl),
+	SYSCALL_ENTRY(pread64),
+	SYSCALL_ENTRY(preadv),
+	SYSCALL_ENTRY(prlimit64),
+	SYSCALL_ENTRY(process_vm_readv),
+	SYSCALL_ENTRY(process_vm_writev),
+	SYSCALL_ENTRY(pselect6),
+	SYSCALL_ENTRY(ptrace),
+	SYSCALL_ENTRY(pwrite64),
+	SYSCALL_ENTRY(pwritev),
+	SYSCALL_ENTRY(read),
+	SYSCALL_ENTRY(readahead),
+	SYSCALL_ENTRY(readv),
+	SYSCALL_ENTRY(readlinkat),
+	SYSCALL_ENTRY(recvfrom),
+	SYSCALL_ENTRY(recvmmsg),
+	SYSCALL_ENTRY(recvmsg),
+	SYSCALL_ENTRY(remap_file_pages),
+	SYSCALL_ENTRY(removexattr),
+	SYSCALL_ENTRY(renameat),
+	SYSCALL_ENTRY(restart_syscall),
+	SYSCALL_ENTRY(rt_sigaction),
+	SYSCALL_ENTRY(rt_sigpending),
+	SYSCALL_ENTRY(rt_sigprocmask),
+	SYSCALL_ENTRY(rt_sigqueueinfo),
+	SYSCALL_ENTRY(rt_sigreturn),
+	SYSCALL_ENTRY(rt_sigsuspend),
+	SYSCALL_ENTRY(rt_sigtimedwait),
+	SYSCALL_ENTRY(rt_tgsigqueueinfo),
+	SYSCALL_ENTRY(sched_get_priority_max),
+	SYSCALL_ENTRY(sched_get_priority_min),
+	SYSCALL_ENTRY(sched_getaffinity),
+	SYSCALL_ENTRY(sched_getparam),
+	SYSCALL_ENTRY(sched_getscheduler),
+	SYSCALL_ENTRY(sched_setaffinity),
+        SYSCALL_ENTRY_ALT(sched_setparam, android_sched_setparam),
+	SYSCALL_ENTRY_ALT(sched_setscheduler, android_sched_setscheduler),
+	SYSCALL_ENTRY(sched_yield),
+	SYSCALL_ENTRY(seccomp),
+	SYSCALL_ENTRY(sendfile),
+	SYSCALL_ENTRY(sendmmsg),
+	SYSCALL_ENTRY(sendmsg),
+	SYSCALL_ENTRY(sendto),
+        SYSCALL_ENTRY(setdomainname),
+	SYSCALL_ENTRY(set_robust_list),
+	SYSCALL_ENTRY(set_tid_address),
+	SYSCALL_ENTRY(setitimer),
+	SYSCALL_ENTRY(setns),
+	SYSCALL_ENTRY(setpgid),
+	SYSCALL_ENTRY_ALT(setpriority, android_setpriority),
+	SYSCALL_ENTRY(setrlimit),
+	SYSCALL_ENTRY(setsid),
+	SYSCALL_ENTRY(setsockopt),
+	SYSCALL_ENTRY(settimeofday),
+	SYSCALL_ENTRY(setxattr),
+	SYSCALL_ENTRY(shutdown),
+	SYSCALL_ENTRY(signalfd4),
+	SYSCALL_ENTRY(sigaltstack),
+	SYSCALL_ENTRY_ALT(socket, android_socket),
+	SYSCALL_ENTRY(socketpair),
+	SYSCALL_ENTRY(splice),
+	SYSCALL_ENTRY(statfs),
+	SYSCALL_ENTRY(symlinkat),
+        SYSCALL_ENTRY(sync),
+        SYSCALL_ENTRY(syncfs),
+	SYSCALL_ENTRY(sysinfo),
+	SYSCALL_ENTRY(syslog),
+	SYSCALL_ENTRY(tee),
+	SYSCALL_ENTRY(tgkill),
+	SYSCALL_ENTRY(tkill),
+	SYSCALL_ENTRY(timer_create),
+	SYSCALL_ENTRY(timer_delete),
+	SYSCALL_ENTRY(timer_gettime),
+	SYSCALL_ENTRY(timer_getoverrun),
+	SYSCALL_ENTRY(timer_settime),
+	SYSCALL_ENTRY(timerfd_create),
+	SYSCALL_ENTRY(timerfd_gettime),
+	SYSCALL_ENTRY(timerfd_settime),
+	SYSCALL_ENTRY(times),
+	SYSCALL_ENTRY(truncate),
+	SYSCALL_ENTRY(umask),
+	SYSCALL_ENTRY(umount2),
+	SYSCALL_ENTRY(uname),
+	SYSCALL_ENTRY(unlinkat),
+	SYSCALL_ENTRY(unshare),
+	SYSCALL_ENTRY(utimensat),
+	SYSCALL_ENTRY(vmsplice),
+	SYSCALL_ENTRY(wait4),
+	SYSCALL_ENTRY(waitid),
+	SYSCALL_ENTRY(write),
+	SYSCALL_ENTRY(writev),
+
+	/*
+	 * Deprecated syscalls which are not wired up on new architectures
+	 * such as ARM64.
+	 */
+#ifndef CONFIG_ARM64
+	SYSCALL_ENTRY(access),
+	SYSCALL_ENTRY(chmod),
+	SYSCALL_ENTRY(open),
+	SYSCALL_ENTRY(creat),
+	SYSCALL_ENTRY(dup2),
+	SYSCALL_ENTRY(epoll_create),
+	SYSCALL_ENTRY(epoll_wait),
+	SYSCALL_ENTRY(eventfd),
+	SYSCALL_ENTRY(fork),
+	SYSCALL_ENTRY(futimesat),
+	SYSCALL_ENTRY(getdents),
+	SYSCALL_ENTRY(getpgrp),
+	SYSCALL_ENTRY(inotify_init),
+	SYSCALL_ENTRY(link),
+	SYSCALL_ENTRY(lstat),
+	SYSCALL_ENTRY(mkdir),
+	SYSCALL_ENTRY(mknod),
+	SYSCALL_ENTRY(pipe),
+	SYSCALL_ENTRY(poll),
+	SYSCALL_ENTRY(readlink),
+	SYSCALL_ENTRY(rename),
+	SYSCALL_ENTRY(rmdir),
+	SYSCALL_ENTRY(stat),
+	SYSCALL_ENTRY(symlink),
+	SYSCALL_ENTRY(unlink),
+	SYSCALL_ENTRY(ustat),
+	SYSCALL_ENTRY(utimes),
+	SYSCALL_ENTRY(vfork),
+#endif
+
+	/*
+	 * recv(2)/send(2) are officially deprecated, but their entry-points
+	 * still exist on ARM.
+	 */
+#ifdef CONFIG_ARM
+	SYSCALL_ENTRY(recv),
+	SYSCALL_ENTRY(send),
+#endif
+
+	/*
+	 * posix_fadvise(2) and sync_file_range(2) have ARM-specific wrappers
+	 * to deal with register alignment.
+	 */
+#ifdef CONFIG_ARM
+	SYSCALL_ENTRY(arm_fadvise64_64),
+	SYSCALL_ENTRY(sync_file_range2),
+#else
+	SYSCALL_ENTRY(fadvise64),
+	SYSCALL_ENTRY(sync_file_range),
+#endif
+
+	/* 64-bit only syscalls. */
+#if defined(CONFIG_X86_64) || defined(CONFIG_ARM64)
+	SYSCALL_ENTRY(fchown),
+	SYSCALL_ENTRY(getegid),
+	SYSCALL_ENTRY(geteuid),
+	SYSCALL_ENTRY(getgid),
+	SYSCALL_ENTRY(getgroups),
+	SYSCALL_ENTRY(getresgid),
+	SYSCALL_ENTRY(getresuid),
+	SYSCALL_ENTRY(getrlimit),
+	SYSCALL_ENTRY(getuid),
+	SYSCALL_ENTRY(newfstatat),
+	SYSCALL_ENTRY(mmap),
+	SYSCALL_ENTRY(setgid),
+	SYSCALL_ENTRY(setgroups),
+	SYSCALL_ENTRY(setregid),
+	SYSCALL_ENTRY(setresgid),
+	SYSCALL_ENTRY(setresuid),
+	SYSCALL_ENTRY(setreuid),
+	SYSCALL_ENTRY(setuid),
+	/*
+	 * chown(2) and lchown(2) are deprecated and not wired up
+	 * on ARM64.
+	 */
+#ifndef CONFIG_ARM64
+	SYSCALL_ENTRY(chown),
+	SYSCALL_ENTRY(lchown),
+#endif
+#endif
+
+	/* ARM32 only syscalls. */
+#if defined(CONFIG_ARM)
+	SYSCALL_ENTRY(chown32),
+	SYSCALL_ENTRY(fchown32),
+	SYSCALL_ENTRY(fcntl64),
+	SYSCALL_ENTRY(fstat64),
+	SYSCALL_ENTRY(fstatat64),
+	SYSCALL_ENTRY(fstatfs64),
+	SYSCALL_ENTRY(ftruncate64),
+	SYSCALL_ENTRY(getegid32),
+	SYSCALL_ENTRY(geteuid32),
+	SYSCALL_ENTRY(getgid32),
+	SYSCALL_ENTRY(getgroups32),
+	SYSCALL_ENTRY(getresgid32),
+	SYSCALL_ENTRY(getresuid32),
+	SYSCALL_ENTRY(getuid32),
+	SYSCALL_ENTRY(lchown32),
+	SYSCALL_ENTRY(lstat64),
+	SYSCALL_ENTRY(mmap2),
+	SYSCALL_ENTRY(_newselect),
+	SYSCALL_ENTRY(_llseek),
+	SYSCALL_ENTRY(sigaction),
+	SYSCALL_ENTRY(sigpending),
+	SYSCALL_ENTRY(sigprocmask),
+	SYSCALL_ENTRY(sigreturn),
+	SYSCALL_ENTRY(sigsuspend),
+	SYSCALL_ENTRY(sendfile64),
+	SYSCALL_ENTRY(setgid32),
+	SYSCALL_ENTRY(setgroups32),
+	SYSCALL_ENTRY(setregid32),
+	SYSCALL_ENTRY(setresgid32),
+	SYSCALL_ENTRY(setresuid32),
+	SYSCALL_ENTRY(setreuid32),
+	SYSCALL_ENTRY(setuid32),
+	SYSCALL_ENTRY(stat64),
+	SYSCALL_ENTRY(statfs64),
+	SYSCALL_ENTRY(truncate64),
+	SYSCALL_ENTRY(ugetrlimit),
+#endif
+
+	/* X86_64-specific syscalls. */
+#ifdef CONFIG_X86_64
+	SYSCALL_ENTRY(arch_prctl),
+	SYSCALL_ENTRY(modify_ldt),
+	SYSCALL_ENTRY(select),
+	SYSCALL_ENTRY(set_thread_area),
+	SYSCALL_ENTRY(time),
+#endif
+
+}; /* end android_whitelist */
+
+#ifdef CONFIG_COMPAT
+static struct syscall_whitelist_entry android_compat_whitelist[] = {
+	COMPAT_SYSCALL_ENTRY(access),
+	COMPAT_SYSCALL_ENTRY_ALT(adjtimex, android_compat_adjtimex),
+	COMPAT_SYSCALL_ENTRY(brk),
+	COMPAT_SYSCALL_ENTRY(capget),
+	COMPAT_SYSCALL_ENTRY(capset),
+	COMPAT_SYSCALL_ENTRY(chdir),
+	COMPAT_SYSCALL_ENTRY(chmod),
+	COMPAT_SYSCALL_ENTRY_ALT(clock_adjtime, android_compat_clock_adjtime),
+	COMPAT_SYSCALL_ENTRY(clock_getres),
+	COMPAT_SYSCALL_ENTRY(clock_gettime),
+	COMPAT_SYSCALL_ENTRY(clock_nanosleep),
+	COMPAT_SYSCALL_ENTRY(clock_settime),
+	COMPAT_SYSCALL_ENTRY(clone),
+	COMPAT_SYSCALL_ENTRY(close),
+	COMPAT_SYSCALL_ENTRY(creat),
+	COMPAT_SYSCALL_ENTRY(dup),
+	COMPAT_SYSCALL_ENTRY(dup2),
+	COMPAT_SYSCALL_ENTRY(dup3),
+	COMPAT_SYSCALL_ENTRY(epoll_create),
+	COMPAT_SYSCALL_ENTRY(epoll_create1),
+	COMPAT_SYSCALL_ENTRY(epoll_ctl),
+	COMPAT_SYSCALL_ENTRY(epoll_wait),
+	COMPAT_SYSCALL_ENTRY(epoll_pwait),
+	COMPAT_SYSCALL_ENTRY(eventfd),
+	COMPAT_SYSCALL_ENTRY(eventfd2),
+	COMPAT_SYSCALL_ENTRY(execve),
+	COMPAT_SYSCALL_ENTRY(exit),
+	COMPAT_SYSCALL_ENTRY(exit_group),
+	COMPAT_SYSCALL_ENTRY(faccessat),
+	COMPAT_SYSCALL_ENTRY(fallocate),
+	COMPAT_SYSCALL_ENTRY(fchdir),
+	COMPAT_SYSCALL_ENTRY(fchmod),
+	COMPAT_SYSCALL_ENTRY(fchmodat),
+	COMPAT_SYSCALL_ENTRY(fchownat),
+	COMPAT_SYSCALL_ENTRY(fcntl),
+	COMPAT_SYSCALL_ENTRY(fdatasync),
+	COMPAT_SYSCALL_ENTRY(fgetxattr),
+	COMPAT_SYSCALL_ENTRY(flistxattr),
+	COMPAT_SYSCALL_ENTRY(flock),
+	COMPAT_SYSCALL_ENTRY(fork),
+	COMPAT_SYSCALL_ENTRY(fremovexattr),
+	COMPAT_SYSCALL_ENTRY(fsetxattr),
+	COMPAT_SYSCALL_ENTRY(fstat),
+	COMPAT_SYSCALL_ENTRY(fstatfs),
+	COMPAT_SYSCALL_ENTRY(fsync),
+	COMPAT_SYSCALL_ENTRY(ftruncate),
+	COMPAT_SYSCALL_ENTRY(futex),
+	COMPAT_SYSCALL_ENTRY(futimesat),
+	COMPAT_SYSCALL_ENTRY_ALT(getcpu, android_getcpu),
+	COMPAT_SYSCALL_ENTRY(getcwd),
+	COMPAT_SYSCALL_ENTRY(getdents),
+	COMPAT_SYSCALL_ENTRY(getdents64),
+	COMPAT_SYSCALL_ENTRY(getpgid),
+	COMPAT_SYSCALL_ENTRY(getpgrp),
+	COMPAT_SYSCALL_ENTRY(getpid),
+	COMPAT_SYSCALL_ENTRY(getppid),
+	COMPAT_SYSCALL_ENTRY_ALT(getpriority, android_getpriority),
+        COMPAT_SYSCALL_ENTRY(getrandom),
+	COMPAT_SYSCALL_ENTRY(getrusage),
+	COMPAT_SYSCALL_ENTRY(getsid),
+	COMPAT_SYSCALL_ENTRY(gettid),
+	COMPAT_SYSCALL_ENTRY(gettimeofday),
+	COMPAT_SYSCALL_ENTRY(getxattr),
+	COMPAT_SYSCALL_ENTRY(inotify_add_watch),
+	COMPAT_SYSCALL_ENTRY(inotify_init),
+	COMPAT_SYSCALL_ENTRY(inotify_init1),
+	COMPAT_SYSCALL_ENTRY(inotify_rm_watch),
+	COMPAT_SYSCALL_ENTRY(ioctl),
+        COMPAT_SYSCALL_ENTRY(io_destroy),
+        COMPAT_SYSCALL_ENTRY(io_getevents),
+        COMPAT_SYSCALL_ENTRY(io_setup),
+        COMPAT_SYSCALL_ENTRY(io_submit),
+	COMPAT_SYSCALL_ENTRY(ioprio_set),
+        COMPAT_SYSCALL_ENTRY_ALT(keyctl, android_keyctl),
+	COMPAT_SYSCALL_ENTRY(kill),
+	COMPAT_SYSCALL_ENTRY(lgetxattr),
+	COMPAT_SYSCALL_ENTRY(link),
+	COMPAT_SYSCALL_ENTRY(linkat),
+	COMPAT_SYSCALL_ENTRY(listxattr),
+	COMPAT_SYSCALL_ENTRY(llistxattr),
+	COMPAT_SYSCALL_ENTRY(lremovexattr),
+	COMPAT_SYSCALL_ENTRY(lseek),
+	COMPAT_SYSCALL_ENTRY(lsetxattr),
+	COMPAT_SYSCALL_ENTRY(lstat),
+	COMPAT_SYSCALL_ENTRY(madvise),
+        COMPAT_SYSCALL_ENTRY(memfd_create),
+	COMPAT_SYSCALL_ENTRY(mincore),
+	COMPAT_SYSCALL_ENTRY(mkdir),
+	COMPAT_SYSCALL_ENTRY(mkdirat),
+	COMPAT_SYSCALL_ENTRY(mknod),
+	COMPAT_SYSCALL_ENTRY(mknodat),
+	COMPAT_SYSCALL_ENTRY(mlock),
+	COMPAT_SYSCALL_ENTRY(mlockall),
+	COMPAT_SYSCALL_ENTRY(munlock),
+	COMPAT_SYSCALL_ENTRY(munlockall),
+	COMPAT_SYSCALL_ENTRY(mount),
+	COMPAT_SYSCALL_ENTRY(mprotect),
+	COMPAT_SYSCALL_ENTRY(mremap),
+	COMPAT_SYSCALL_ENTRY(msync),
+	COMPAT_SYSCALL_ENTRY(munmap),
+	COMPAT_SYSCALL_ENTRY(name_to_handle_at),
+	COMPAT_SYSCALL_ENTRY(nanosleep),
+	COMPAT_SYSCALL_ENTRY(open),
+	COMPAT_SYSCALL_ENTRY(open_by_handle_at),
+	COMPAT_SYSCALL_ENTRY(openat),
+	COMPAT_SYSCALL_ENTRY_ALT(perf_event_open, android_perf_event_open),
+	COMPAT_SYSCALL_ENTRY(personality),
+	COMPAT_SYSCALL_ENTRY(pipe),
+	COMPAT_SYSCALL_ENTRY(pipe2),
+	COMPAT_SYSCALL_ENTRY(poll),
+	COMPAT_SYSCALL_ENTRY(ppoll),
+	COMPAT_SYSCALL_ENTRY_ALT(prctl, alt_sys_prctl),
+	COMPAT_SYSCALL_ENTRY(pread64),
+	COMPAT_SYSCALL_ENTRY(preadv),
+	COMPAT_SYSCALL_ENTRY(prlimit64),
+	COMPAT_SYSCALL_ENTRY(process_vm_readv),
+	COMPAT_SYSCALL_ENTRY(process_vm_writev),
+	COMPAT_SYSCALL_ENTRY(pselect6),
+	COMPAT_SYSCALL_ENTRY(ptrace),
+	COMPAT_SYSCALL_ENTRY(pwrite64),
+	COMPAT_SYSCALL_ENTRY(pwritev),
+	COMPAT_SYSCALL_ENTRY(read),
+	COMPAT_SYSCALL_ENTRY(readahead),
+	COMPAT_SYSCALL_ENTRY(readv),
+	COMPAT_SYSCALL_ENTRY(readlink),
+	COMPAT_SYSCALL_ENTRY(readlinkat),
+	COMPAT_SYSCALL_ENTRY(recvmmsg),
+	COMPAT_SYSCALL_ENTRY(remap_file_pages),
+	COMPAT_SYSCALL_ENTRY(removexattr),
+	COMPAT_SYSCALL_ENTRY(rename),
+	COMPAT_SYSCALL_ENTRY(renameat),
+	COMPAT_SYSCALL_ENTRY(restart_syscall),
+	COMPAT_SYSCALL_ENTRY(rmdir),
+	COMPAT_SYSCALL_ENTRY(rt_sigaction),
+	COMPAT_SYSCALL_ENTRY(rt_sigpending),
+	COMPAT_SYSCALL_ENTRY(rt_sigprocmask),
+	COMPAT_SYSCALL_ENTRY(rt_sigqueueinfo),
+	COMPAT_SYSCALL_ENTRY(rt_sigreturn),
+	COMPAT_SYSCALL_ENTRY(rt_sigsuspend),
+	COMPAT_SYSCALL_ENTRY(rt_sigtimedwait),
+	COMPAT_SYSCALL_ENTRY(rt_tgsigqueueinfo),
+	COMPAT_SYSCALL_ENTRY(sched_get_priority_max),
+	COMPAT_SYSCALL_ENTRY(sched_get_priority_min),
+	COMPAT_SYSCALL_ENTRY(sched_getaffinity),
+	COMPAT_SYSCALL_ENTRY(sched_getparam),
+	COMPAT_SYSCALL_ENTRY(sched_getscheduler),
+	COMPAT_SYSCALL_ENTRY(sched_setaffinity),
+        COMPAT_SYSCALL_ENTRY_ALT(sched_setparam,
+                                 android_sched_setparam),
+	COMPAT_SYSCALL_ENTRY_ALT(sched_setscheduler,
+				 android_sched_setscheduler),
+	COMPAT_SYSCALL_ENTRY(sched_yield),
+	COMPAT_SYSCALL_ENTRY(seccomp),
+	COMPAT_SYSCALL_ENTRY(sendfile),
+	COMPAT_SYSCALL_ENTRY(sendfile64),
+	COMPAT_SYSCALL_ENTRY(sendmmsg),
+        COMPAT_SYSCALL_ENTRY(setdomainname),
+	COMPAT_SYSCALL_ENTRY(set_robust_list),
+	COMPAT_SYSCALL_ENTRY(set_tid_address),
+	COMPAT_SYSCALL_ENTRY(setitimer),
+	COMPAT_SYSCALL_ENTRY(setns),
+	COMPAT_SYSCALL_ENTRY(setpgid),
+	COMPAT_SYSCALL_ENTRY_ALT(setpriority, android_setpriority),
+	COMPAT_SYSCALL_ENTRY(setrlimit),
+	COMPAT_SYSCALL_ENTRY(setsid),
+	COMPAT_SYSCALL_ENTRY(settimeofday),
+	COMPAT_SYSCALL_ENTRY(setxattr),
+	COMPAT_SYSCALL_ENTRY(signalfd4),
+	COMPAT_SYSCALL_ENTRY(sigaltstack),
+	COMPAT_SYSCALL_ENTRY(splice),
+	COMPAT_SYSCALL_ENTRY(stat),
+	COMPAT_SYSCALL_ENTRY(statfs),
+	COMPAT_SYSCALL_ENTRY(symlink),
+	COMPAT_SYSCALL_ENTRY(symlinkat),
+        COMPAT_SYSCALL_ENTRY(sync),
+        COMPAT_SYSCALL_ENTRY(syncfs),
+	COMPAT_SYSCALL_ENTRY(sysinfo),
+	COMPAT_SYSCALL_ENTRY(syslog),
+	COMPAT_SYSCALL_ENTRY(tgkill),
+	COMPAT_SYSCALL_ENTRY(tee),
+	COMPAT_SYSCALL_ENTRY(tkill),
+	COMPAT_SYSCALL_ENTRY(timer_create),
+	COMPAT_SYSCALL_ENTRY(timer_delete),
+	COMPAT_SYSCALL_ENTRY(timer_gettime),
+	COMPAT_SYSCALL_ENTRY(timer_getoverrun),
+	COMPAT_SYSCALL_ENTRY(timer_settime),
+	COMPAT_SYSCALL_ENTRY(timerfd_create),
+	COMPAT_SYSCALL_ENTRY(timerfd_gettime),
+	COMPAT_SYSCALL_ENTRY(timerfd_settime),
+	COMPAT_SYSCALL_ENTRY(times),
+	COMPAT_SYSCALL_ENTRY(truncate),
+	COMPAT_SYSCALL_ENTRY(umask),
+	COMPAT_SYSCALL_ENTRY(umount2),
+	COMPAT_SYSCALL_ENTRY(uname),
+	COMPAT_SYSCALL_ENTRY(unlink),
+	COMPAT_SYSCALL_ENTRY(unlinkat),
+	COMPAT_SYSCALL_ENTRY(unshare),
+	COMPAT_SYSCALL_ENTRY(ustat),
+	COMPAT_SYSCALL_ENTRY(utimensat),
+	COMPAT_SYSCALL_ENTRY(utimes),
+	COMPAT_SYSCALL_ENTRY(vfork),
+	COMPAT_SYSCALL_ENTRY(vmsplice),
+	COMPAT_SYSCALL_ENTRY(wait4),
+	COMPAT_SYSCALL_ENTRY(waitid),
+	COMPAT_SYSCALL_ENTRY(write),
+	COMPAT_SYSCALL_ENTRY(writev),
+	COMPAT_SYSCALL_ENTRY(chown32),
+	COMPAT_SYSCALL_ENTRY(fchown32),
+	COMPAT_SYSCALL_ENTRY(fcntl64),
+	COMPAT_SYSCALL_ENTRY(fstat64),
+	COMPAT_SYSCALL_ENTRY(fstatat64),
+	COMPAT_SYSCALL_ENTRY(fstatfs64),
+	COMPAT_SYSCALL_ENTRY(ftruncate64),
+	COMPAT_SYSCALL_ENTRY(getegid),
+	COMPAT_SYSCALL_ENTRY(getegid32),
+	COMPAT_SYSCALL_ENTRY(geteuid),
+	COMPAT_SYSCALL_ENTRY(geteuid32),
+	COMPAT_SYSCALL_ENTRY(getgid),
+	COMPAT_SYSCALL_ENTRY(getgid32),
+	COMPAT_SYSCALL_ENTRY(getgroups32),
+	COMPAT_SYSCALL_ENTRY(getresgid32),
+	COMPAT_SYSCALL_ENTRY(getresuid32),
+	COMPAT_SYSCALL_ENTRY(getuid),
+	COMPAT_SYSCALL_ENTRY(getuid32),
+	COMPAT_SYSCALL_ENTRY(lchown32),
+	COMPAT_SYSCALL_ENTRY(lstat64),
+	COMPAT_SYSCALL_ENTRY(mmap2),
+	COMPAT_SYSCALL_ENTRY(_newselect),
+	COMPAT_SYSCALL_ENTRY(_llseek),
+	COMPAT_SYSCALL_ENTRY(sigaction),
+	COMPAT_SYSCALL_ENTRY(sigpending),
+	COMPAT_SYSCALL_ENTRY(sigprocmask),
+	COMPAT_SYSCALL_ENTRY(sigreturn),
+	COMPAT_SYSCALL_ENTRY(sigsuspend),
+	COMPAT_SYSCALL_ENTRY(setgid32),
+	COMPAT_SYSCALL_ENTRY(setgroups32),
+	COMPAT_SYSCALL_ENTRY(setregid32),
+	COMPAT_SYSCALL_ENTRY(setresgid32),
+	COMPAT_SYSCALL_ENTRY(setresuid32),
+	COMPAT_SYSCALL_ENTRY(setreuid32),
+	COMPAT_SYSCALL_ENTRY(setuid32),
+	COMPAT_SYSCALL_ENTRY(stat64),
+	COMPAT_SYSCALL_ENTRY(statfs64),
+	COMPAT_SYSCALL_ENTRY(truncate64),
+	COMPAT_SYSCALL_ENTRY(ugetrlimit),
+
+#ifdef CONFIG_X86_64
+	/*
+	 * waitpid(2) is deprecated on most architectures, but still exists
+	 * on IA32.
+	 */
+	COMPAT_SYSCALL_ENTRY(waitpid),
+
+	/* IA32 uses the common socketcall(2) entrypoint for socket calls. */
+	COMPAT_SYSCALL_ENTRY(socketcall),
+#endif
+
+#ifdef CONFIG_ARM64
+	COMPAT_SYSCALL_ENTRY(accept),
+	COMPAT_SYSCALL_ENTRY(accept4),
+	COMPAT_SYSCALL_ENTRY(bind),
+	COMPAT_SYSCALL_ENTRY(connect),
+	COMPAT_SYSCALL_ENTRY(getpeername),
+	COMPAT_SYSCALL_ENTRY(getsockname),
+	COMPAT_SYSCALL_ENTRY(getsockopt),
+	COMPAT_SYSCALL_ENTRY(listen),
+	COMPAT_SYSCALL_ENTRY(recvfrom),
+	COMPAT_SYSCALL_ENTRY(recvmsg),
+	COMPAT_SYSCALL_ENTRY(sendmsg),
+	COMPAT_SYSCALL_ENTRY(sendto),
+	COMPAT_SYSCALL_ENTRY(setsockopt),
+	COMPAT_SYSCALL_ENTRY(shutdown),
+	COMPAT_SYSCALL_ENTRY(socket),
+	COMPAT_SYSCALL_ENTRY(socketpair),
+	COMPAT_SYSCALL_ENTRY(recv),
+	COMPAT_SYSCALL_ENTRY(send),
+#endif
+
+	/*
+	 * posix_fadvise(2) and sync_file_range(2) have ARM-specific wrappers
+	 * to deal with register alignment.
+	 */
+#ifdef CONFIG_ARM64
+	COMPAT_SYSCALL_ENTRY(arm_fadvise64_64),
+	COMPAT_SYSCALL_ENTRY(sync_file_range2),
+#else
+	COMPAT_SYSCALL_ENTRY(fadvise64_64),
+	COMPAT_SYSCALL_ENTRY(fadvise64),
+	COMPAT_SYSCALL_ENTRY(sync_file_range),
+#endif
+
+	/*
+	 * getrlimit(2) and time(2) are deprecated and not wired in the ARM
+         * compat table on ARM64.
+	 */
+#ifndef CONFIG_ARM64
+	COMPAT_SYSCALL_ENTRY(getrlimit),
+        COMPAT_SYSCALL_ENTRY(time),
+#endif
+
+	/* x86-specific syscalls. */
+#ifdef CONFIG_X86_64
+	COMPAT_SYSCALL_ENTRY(modify_ldt),
+	COMPAT_SYSCALL_ENTRY(set_thread_area),
+#endif
+}; /* end android_compat_whitelist */
+#endif /* CONFIG_COMPAT */
+
+#endif /* ANDROID_WHITELISTS_H */

diff --git a/security/chromiumos/complete_whitelists.h b/security/chromiumos/complete_whitelists.h
new file mode 100644
index 0000000..cb3a8f3
--- /dev/null
+++ b/security/chromiumos/complete_whitelists.h

@@ -0,0 +1,401 @@
+/*
+ * Linux Security Module for Chromium OS
+ *
+ * Copyright 2018 Google LLC. All Rights Reserved
+ *
+ * Authors:
+ *      Micah Morton <mortonm@chromium.org>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef COMPLETE_WHITELISTS_H
+#define COMPLETE_WHITELISTS_H
+
+/*
+ * NOTE: the purpose of this header is only to pull out the definition of this
+ * array from alt-syscall.c for the purposes of readability. It should not be
+ * included in other .c files.
+ */
+
+#include "alt-syscall.h"
+
+static struct syscall_whitelist_entry complete_whitelist[] = {
+	/* Syscalls wired up on ARM32/ARM64 and x86_64. */
+	SYSCALL_ENTRY(accept),
+	SYSCALL_ENTRY(accept4),
+	SYSCALL_ENTRY(acct),
+	SYSCALL_ENTRY(add_key),
+	SYSCALL_ENTRY(adjtimex),
+	SYSCALL_ENTRY(bind),
+	SYSCALL_ENTRY(brk),
+	SYSCALL_ENTRY(capget),
+	SYSCALL_ENTRY(capset),
+	SYSCALL_ENTRY(chdir),
+	SYSCALL_ENTRY(chroot),
+	SYSCALL_ENTRY(clock_adjtime),
+	SYSCALL_ENTRY(clock_getres),
+	SYSCALL_ENTRY(clock_gettime),
+	SYSCALL_ENTRY(clock_nanosleep),
+	SYSCALL_ENTRY(clock_settime),
+	SYSCALL_ENTRY(clone),
+	SYSCALL_ENTRY(close),
+	SYSCALL_ENTRY(connect),
+	SYSCALL_ENTRY(copy_file_range),
+	SYSCALL_ENTRY(delete_module),
+	SYSCALL_ENTRY(dup),
+	SYSCALL_ENTRY(dup3),
+	SYSCALL_ENTRY(epoll_create1),
+	SYSCALL_ENTRY(epoll_ctl),
+	SYSCALL_ENTRY(epoll_pwait),
+	SYSCALL_ENTRY(eventfd2),
+	SYSCALL_ENTRY(execve),
+	SYSCALL_ENTRY(exit),
+	SYSCALL_ENTRY(exit_group),
+	SYSCALL_ENTRY(faccessat),
+	SYSCALL_ENTRY(fallocate),
+	SYSCALL_ENTRY(fanotify_init),
+	SYSCALL_ENTRY(fanotify_mark),
+	SYSCALL_ENTRY(fchdir),
+	SYSCALL_ENTRY(fchmod),
+	SYSCALL_ENTRY(fchmodat),
+	SYSCALL_ENTRY(fchown),
+	SYSCALL_ENTRY(fchownat),
+	SYSCALL_ENTRY(fcntl),
+	SYSCALL_ENTRY(fdatasync),
+	SYSCALL_ENTRY(fgetxattr),
+	SYSCALL_ENTRY(finit_module),
+	SYSCALL_ENTRY(flistxattr),
+	SYSCALL_ENTRY(flock),
+	SYSCALL_ENTRY(fremovexattr),
+	SYSCALL_ENTRY(fsetxattr),
+	SYSCALL_ENTRY(fstatfs),
+	SYSCALL_ENTRY(fsync),
+	SYSCALL_ENTRY(ftruncate),
+	SYSCALL_ENTRY(futex),
+	SYSCALL_ENTRY(getcpu),
+	SYSCALL_ENTRY(getcwd),
+	SYSCALL_ENTRY(getdents64),
+	SYSCALL_ENTRY(getegid),
+	SYSCALL_ENTRY(geteuid),
+	SYSCALL_ENTRY(getgid),
+	SYSCALL_ENTRY(getgroups),
+	SYSCALL_ENTRY(getitimer),
+	SYSCALL_ENTRY(get_mempolicy),
+	SYSCALL_ENTRY(getpeername),
+	SYSCALL_ENTRY(getpgid),
+	SYSCALL_ENTRY(getpid),
+	SYSCALL_ENTRY(getppid),
+	SYSCALL_ENTRY(getpriority),
+	SYSCALL_ENTRY(getrandom),
+	SYSCALL_ENTRY(getresgid),
+	SYSCALL_ENTRY(getresuid),
+	SYSCALL_ENTRY(get_robust_list),
+	SYSCALL_ENTRY(getrusage),
+	SYSCALL_ENTRY(getsid),
+	SYSCALL_ENTRY(getsockname),
+	SYSCALL_ENTRY(getsockopt),
+	SYSCALL_ENTRY(gettid),
+	SYSCALL_ENTRY(gettimeofday),
+	SYSCALL_ENTRY(getuid),
+	SYSCALL_ENTRY(getxattr),
+	SYSCALL_ENTRY(init_module),
+	SYSCALL_ENTRY(inotify_add_watch),
+	SYSCALL_ENTRY(inotify_init1),
+	SYSCALL_ENTRY(inotify_rm_watch),
+	SYSCALL_ENTRY(io_cancel),
+	SYSCALL_ENTRY(ioctl),
+	SYSCALL_ENTRY(io_destroy),
+	SYSCALL_ENTRY(io_getevents),
+	SYSCALL_ENTRY(ioprio_get),
+	SYSCALL_ENTRY(ioprio_set),
+	SYSCALL_ENTRY(io_setup),
+	SYSCALL_ENTRY(io_submit),
+	SYSCALL_ENTRY(kcmp),
+	SYSCALL_ENTRY(kexec_load),
+	SYSCALL_ENTRY(keyctl),
+	SYSCALL_ENTRY(kill),
+	SYSCALL_ENTRY(lgetxattr),
+	SYSCALL_ENTRY(linkat),
+	SYSCALL_ENTRY(listen),
+	SYSCALL_ENTRY(listxattr),
+	SYSCALL_ENTRY(llistxattr),
+	SYSCALL_ENTRY(lookup_dcookie),
+	SYSCALL_ENTRY(lremovexattr),
+	SYSCALL_ENTRY(lseek),
+	SYSCALL_ENTRY(lsetxattr),
+	SYSCALL_ENTRY(madvise),
+	SYSCALL_ENTRY(mbind),
+	SYSCALL_ENTRY(memfd_create),
+	SYSCALL_ENTRY(mincore),
+	SYSCALL_ENTRY(mkdirat),
+	SYSCALL_ENTRY(mknodat),
+	SYSCALL_ENTRY(mlock),
+	SYSCALL_ENTRY(mlockall),
+	SYSCALL_ENTRY(mount),
+	SYSCALL_ENTRY(move_pages),
+	SYSCALL_ENTRY(mprotect),
+	SYSCALL_ENTRY(mq_getsetattr),
+	SYSCALL_ENTRY(mq_notify),
+	SYSCALL_ENTRY(mq_open),
+	SYSCALL_ENTRY(mq_timedreceive),
+	SYSCALL_ENTRY(mq_timedsend),
+	SYSCALL_ENTRY(mq_unlink),
+	SYSCALL_ENTRY(mremap),
+	SYSCALL_ENTRY(msgctl),
+	SYSCALL_ENTRY(msgget),
+	SYSCALL_ENTRY(msgrcv),
+	SYSCALL_ENTRY(msgsnd),
+	SYSCALL_ENTRY(msync),
+	SYSCALL_ENTRY(munlock),
+	SYSCALL_ENTRY(munlockall),
+	SYSCALL_ENTRY(munmap),
+	SYSCALL_ENTRY(name_to_handle_at),
+	SYSCALL_ENTRY(nanosleep),
+	SYSCALL_ENTRY(openat),
+	SYSCALL_ENTRY(open_by_handle_at),
+	SYSCALL_ENTRY(perf_event_open),
+	SYSCALL_ENTRY(personality),
+	SYSCALL_ENTRY(pipe2),
+	SYSCALL_ENTRY(pivot_root),
+	SYSCALL_ENTRY(pkey_alloc),
+	SYSCALL_ENTRY(pkey_free),
+	SYSCALL_ENTRY(pkey_mprotect),
+	SYSCALL_ENTRY(ppoll),
+	SYSCALL_ENTRY_ALT(prctl, alt_sys_prctl),
+	SYSCALL_ENTRY(pread64),
+	SYSCALL_ENTRY(preadv),
+	SYSCALL_ENTRY(preadv2),
+	SYSCALL_ENTRY(pwritev2),
+	SYSCALL_ENTRY(prlimit64),
+	SYSCALL_ENTRY(process_vm_readv),
+	SYSCALL_ENTRY(process_vm_writev),
+	SYSCALL_ENTRY(pselect6),
+	SYSCALL_ENTRY(ptrace),
+	SYSCALL_ENTRY(pwrite64),
+	SYSCALL_ENTRY(pwritev),
+	SYSCALL_ENTRY(quotactl),
+	SYSCALL_ENTRY(read),
+	SYSCALL_ENTRY(readahead),
+	SYSCALL_ENTRY(readlinkat),
+	SYSCALL_ENTRY(readv),
+	SYSCALL_ENTRY(reboot),
+	SYSCALL_ENTRY(recvfrom),
+	SYSCALL_ENTRY(recvmmsg),
+	SYSCALL_ENTRY(recvmsg),
+	SYSCALL_ENTRY(remap_file_pages),
+	SYSCALL_ENTRY(removexattr),
+	SYSCALL_ENTRY(renameat),
+	SYSCALL_ENTRY(request_key),
+	SYSCALL_ENTRY(restart_syscall),
+	SYSCALL_ENTRY(rt_sigaction),
+	SYSCALL_ENTRY(rt_sigpending),
+	SYSCALL_ENTRY(rt_sigprocmask),
+	SYSCALL_ENTRY(rt_sigqueueinfo),
+	SYSCALL_ENTRY(rt_sigsuspend),
+	SYSCALL_ENTRY(rt_sigtimedwait),
+	SYSCALL_ENTRY(rt_tgsigqueueinfo),
+	SYSCALL_ENTRY(sched_getaffinity),
+	SYSCALL_ENTRY(sched_getattr),
+	SYSCALL_ENTRY(sched_getparam),
+	SYSCALL_ENTRY(sched_get_priority_max),
+	SYSCALL_ENTRY(sched_get_priority_min),
+	SYSCALL_ENTRY(sched_getscheduler),
+	SYSCALL_ENTRY(sched_rr_get_interval),
+	SYSCALL_ENTRY(sched_setaffinity),
+	SYSCALL_ENTRY(sched_setattr),
+	SYSCALL_ENTRY(sched_setparam),
+	SYSCALL_ENTRY(sched_setscheduler),
+	SYSCALL_ENTRY(sched_yield),
+	SYSCALL_ENTRY(seccomp),
+	SYSCALL_ENTRY(semctl),
+	SYSCALL_ENTRY(semget),
+	SYSCALL_ENTRY(semop),
+	SYSCALL_ENTRY(semtimedop),
+	SYSCALL_ENTRY(sendfile),
+	SYSCALL_ENTRY(sendmmsg),
+	SYSCALL_ENTRY(sendmsg),
+	SYSCALL_ENTRY(sendto),
+	SYSCALL_ENTRY(setdomainname),
+	SYSCALL_ENTRY(setfsgid),
+	SYSCALL_ENTRY(setfsuid),
+	SYSCALL_ENTRY(setgid),
+	SYSCALL_ENTRY(setgroups),
+	SYSCALL_ENTRY(sethostname),
+	SYSCALL_ENTRY(setitimer),
+	SYSCALL_ENTRY(set_mempolicy),
+	SYSCALL_ENTRY(setns),
+	SYSCALL_ENTRY(setpgid),
+	SYSCALL_ENTRY(setpriority),
+	SYSCALL_ENTRY(setregid),
+	SYSCALL_ENTRY(setresgid),
+	SYSCALL_ENTRY(setresuid),
+	SYSCALL_ENTRY(setreuid),
+	SYSCALL_ENTRY(setrlimit),
+	SYSCALL_ENTRY(set_robust_list),
+	SYSCALL_ENTRY(setsid),
+	SYSCALL_ENTRY(setsockopt),
+	SYSCALL_ENTRY(set_tid_address),
+	SYSCALL_ENTRY(settimeofday),
+	SYSCALL_ENTRY(setuid),
+	SYSCALL_ENTRY(setxattr),
+	SYSCALL_ENTRY(shmat),
+	SYSCALL_ENTRY(shmctl),
+	SYSCALL_ENTRY(shmdt),
+	SYSCALL_ENTRY(shmget),
+	SYSCALL_ENTRY(shutdown),
+	SYSCALL_ENTRY(sigaltstack),
+	SYSCALL_ENTRY(signalfd4),
+	SYSCALL_ENTRY(socket),
+	SYSCALL_ENTRY(socketpair),
+	SYSCALL_ENTRY(splice),
+	SYSCALL_ENTRY(statfs),
+	SYSCALL_ENTRY(statx),
+	SYSCALL_ENTRY(swapoff),
+	SYSCALL_ENTRY(swapon),
+	SYSCALL_ENTRY(symlinkat),
+	SYSCALL_ENTRY(sync),
+	SYSCALL_ENTRY(syncfs),
+	SYSCALL_ENTRY(sysinfo),
+	SYSCALL_ENTRY(syslog),
+	SYSCALL_ENTRY(tee),
+	SYSCALL_ENTRY(tgkill),
+	SYSCALL_ENTRY(timer_create),
+	SYSCALL_ENTRY(timer_delete),
+	SYSCALL_ENTRY(timerfd_create),
+	SYSCALL_ENTRY(timerfd_gettime),
+	SYSCALL_ENTRY(timerfd_settime),
+	SYSCALL_ENTRY(timer_getoverrun),
+	SYSCALL_ENTRY(timer_gettime),
+	SYSCALL_ENTRY(timer_settime),
+	SYSCALL_ENTRY(times),
+	SYSCALL_ENTRY(tkill),
+	SYSCALL_ENTRY(truncate),
+	SYSCALL_ENTRY(umask),
+	SYSCALL_ENTRY(unlinkat),
+	SYSCALL_ENTRY(unshare),
+	SYSCALL_ENTRY(utimensat),
+	SYSCALL_ENTRY(vhangup),
+	SYSCALL_ENTRY(vmsplice),
+	SYSCALL_ENTRY(wait4),
+	SYSCALL_ENTRY(waitid),
+	SYSCALL_ENTRY(write),
+	SYSCALL_ENTRY(writev),
+
+	/* Exist for x86_64 and ARM32 but not ARM64. */
+#ifndef CONFIG_ARM64
+	SYSCALL_ENTRY(access),
+	SYSCALL_ENTRY(chmod),
+	SYSCALL_ENTRY(chown),
+	SYSCALL_ENTRY(creat),
+	SYSCALL_ENTRY(dup2),
+	SYSCALL_ENTRY(epoll_create),
+	SYSCALL_ENTRY(epoll_wait),
+	SYSCALL_ENTRY(eventfd),
+	SYSCALL_ENTRY(fork),
+	SYSCALL_ENTRY(futimesat),
+	SYSCALL_ENTRY(getdents),
+	SYSCALL_ENTRY(getpgrp),
+	SYSCALL_ENTRY(inotify_init),
+	SYSCALL_ENTRY(lchown),
+	SYSCALL_ENTRY(link),
+	SYSCALL_ENTRY(mkdir),
+	SYSCALL_ENTRY(mknod),
+	SYSCALL_ENTRY(open),
+	SYSCALL_ENTRY(pause),
+	SYSCALL_ENTRY(pipe),
+	SYSCALL_ENTRY(poll),
+	SYSCALL_ENTRY(readlink),
+	SYSCALL_ENTRY(rename),
+	SYSCALL_ENTRY(rmdir),
+	SYSCALL_ENTRY(signalfd),
+	SYSCALL_ENTRY(symlink),
+	SYSCALL_ENTRY(sysfs),
+	SYSCALL_ENTRY(unlink),
+	SYSCALL_ENTRY(ustat),
+	SYSCALL_ENTRY(utimes),
+	SYSCALL_ENTRY(vfork),
+#endif
+
+	/* Exist for x86_64 and ARM64 but not ARM32 */
+#if defined(CONFIG_ARM64) || defined(CONFIG_X86_64)
+	SYSCALL_ENTRY(fadvise64),
+	SYSCALL_ENTRY(fstat),
+	SYSCALL_ENTRY(getrlimit),
+	SYSCALL_ENTRY(migrate_pages),
+	SYSCALL_ENTRY(mmap),
+	SYSCALL_ENTRY(rt_sigreturn),
+	SYSCALL_ENTRY(sync_file_range),
+	SYSCALL_ENTRY(umount2),
+	SYSCALL_ENTRY(uname),
+#endif
+
+	/* Unique to ARM32. */
+#ifdef CONFIG_ARM
+	SYSCALL_ENTRY(arm_fadvise64_64),
+	SYSCALL_ENTRY(bdflush),
+	SYSCALL_ENTRY(fcntl64),
+	SYSCALL_ENTRY(fstat64),
+	SYSCALL_ENTRY(fstatat64),
+	SYSCALL_ENTRY(ftruncate64),
+	SYSCALL_ENTRY(lstat64),
+	SYSCALL_ENTRY(mmap2),
+	SYSCALL_ENTRY(nice),
+	SYSCALL_ENTRY(pciconfig_iobase),
+	SYSCALL_ENTRY(pciconfig_read),
+	SYSCALL_ENTRY(pciconfig_write),
+	SYSCALL_ENTRY(recv),
+	SYSCALL_ENTRY(send),
+	SYSCALL_ENTRY(sendfile64),
+	SYSCALL_ENTRY(sigaction),
+	SYSCALL_ENTRY(sigpending),
+	SYSCALL_ENTRY(sigprocmask),
+	SYSCALL_ENTRY(sigsuspend),
+	SYSCALL_ENTRY(stat64),
+	SYSCALL_ENTRY(truncate64),
+	SYSCALL_ENTRY(uselib),
+#endif
+
+	/* Unique to x86_64. */
+#ifdef CONFIG_X86_64
+	SYSCALL_ENTRY(alarm),
+	SYSCALL_ENTRY(arch_prctl),
+	SYSCALL_ENTRY(ioperm),
+	SYSCALL_ENTRY(iopl),
+	SYSCALL_ENTRY(kexec_file_load),
+	SYSCALL_ENTRY(lstat),
+	SYSCALL_ENTRY(modify_ldt),
+	SYSCALL_ENTRY(newfstatat),
+	SYSCALL_ENTRY(select),
+	SYSCALL_ENTRY(stat),
+	SYSCALL_ENTRY(time),
+	SYSCALL_ENTRY(_sysctl),
+	SYSCALL_ENTRY(utime),
+#endif
+
+	/* Unique to ARM64. */
+#ifdef CONFIG_ARM64
+	SYSCALL_ENTRY(nfsservctl),
+	SYSCALL_ENTRY(renameat2),
+#endif
+}; /* end complete_whitelist */
+
+#ifdef CONFIG_COMPAT
+/*
+ * For now not adding a 32-bit-compatible version of the complete whitelist.
+ * Since we are not whitelisting any compat syscalls here, a call into the
+ * compat section of this "complete" alt syscall table will be redirected to
+ * block_syscall() (unless the permissive mode is used in which case the call
+ * will be redirected to warn_compat_syscall()).
+ */
+static struct syscall_whitelist_entry complete_compat_whitelist[] = {};
+#endif /* CONFIG_COMPAT */
+
+#endif /* COMPLETE_WHITELISTS_H */

diff --git a/security/chromiumos/inode_mark.c b/security/chromiumos/inode_mark.c
new file mode 100644
index 0000000..009debb
--- /dev/null
+++ b/security/chromiumos/inode_mark.c

@@ -0,0 +1,354 @@
+/*
+ * Linux Security Module for Chromium OS
+ *
+ * Copyright 2016 Google Inc. All Rights Reserved
+ *
+ * Authors:
+ *      Mattias Nissler <mnissler@chromium.org>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/hash.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include "inode_mark.h"
+
+/*
+ * This file implements facilities to pin inodes in core and attach some
+ * meta data to them. We use fsnotify inode marks as a vehicle to attach the
+ * meta data.
+ */
+struct chromiumos_inode_mark {
+	struct fsnotify_mark mark;
+	struct inode *inode;
+	enum chromiumos_inode_security_policy
+		policies[CHROMIUMOS_NUMBER_OF_POLICIES];
+};
+
+static inline struct chromiumos_inode_mark *
+chromiumos_to_inode_mark(struct fsnotify_mark *mark)
+{
+	return container_of(mark, struct chromiumos_inode_mark, mark);
+}
+
+/*
+ * Hashtable entry that contains tracking information specific to the file
+ * system identified by the corresponding super_block. This contains the
+ * fsnotify group that holds all the marks for inodes belonging to the
+ * super_block.
+ */
+struct chromiumos_super_block_mark {
+	atomic_t refcnt;
+	struct hlist_node node;
+	struct super_block *sb;
+	struct fsnotify_group *fsn_group;
+};
+
+#define CHROMIUMOS_SUPER_BLOCK_HASH_BITS 8
+#define CHROMIUMOS_SUPER_BLOCK_HASH_SIZE (1 << CHROMIUMOS_SUPER_BLOCK_HASH_BITS)
+
+static struct hlist_head chromiumos_super_block_hash_table
+	[CHROMIUMOS_SUPER_BLOCK_HASH_SIZE] __read_mostly;
+static DEFINE_MUTEX(chromiumos_super_block_hash_lock);
+
+static struct hlist_head *chromiumos_super_block_hlist(struct super_block *sb)
+{
+	return &chromiumos_super_block_hash_table[hash_ptr(
+		sb, CHROMIUMOS_SUPER_BLOCK_HASH_BITS)];
+}
+
+static void chromiumos_super_block_put(struct chromiumos_super_block_mark *sbm)
+{
+	if (atomic_dec_and_test(&sbm->refcnt)) {
+		mutex_lock(&chromiumos_super_block_hash_lock);
+		hlist_del_rcu(&sbm->node);
+		mutex_unlock(&chromiumos_super_block_hash_lock);
+
+		synchronize_rcu();
+
+		fsnotify_destroy_group(sbm->fsn_group);
+		kfree(sbm);
+	}
+}
+
+static struct chromiumos_super_block_mark *
+chromiumos_super_block_lookup(struct super_block *sb)
+{
+	struct hlist_head *hlist = chromiumos_super_block_hlist(sb);
+	struct chromiumos_super_block_mark *sbm;
+	struct chromiumos_super_block_mark *matching_sbm = NULL;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(sbm, hlist, node) {
+		if (sbm->sb == sb && atomic_inc_not_zero(&sbm->refcnt)) {
+			matching_sbm = sbm;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return matching_sbm;
+}
+
+static int chromiumos_handle_fsnotify_event(struct fsnotify_group *group,
+					    struct inode *inode,
+					    u32 mask, const void *data,
+					    int data_type,
+					    const unsigned char *file_name,
+					    u32 cookie,
+					    struct fsnotify_iter_info *iter_info)
+{
+	/*
+	 * This should never get called because a zero mask is set on the inode
+	 * marks. All cases of marks going away (inode deletion, unmount,
+	 * explicit removal) are handled in chromiumos_freeing_mark.
+	 */
+	WARN_ON_ONCE(1);
+	return 0;
+}
+
+static void chromiumos_freeing_mark(struct fsnotify_mark *mark,
+				    struct fsnotify_group *group)
+{
+	struct chromiumos_inode_mark *inode_mark =
+		chromiumos_to_inode_mark(mark);
+
+	iput(inode_mark->inode);
+	inode_mark->inode = NULL;
+	chromiumos_super_block_put(group->private);
+}
+
+static void chromiumos_free_mark(struct fsnotify_mark *mark)
+{
+	iput(chromiumos_to_inode_mark(mark)->inode);
+	kfree(mark);
+}
+
+static const struct fsnotify_ops chromiumos_fsn_ops = {
+	.handle_event = chromiumos_handle_fsnotify_event,
+	.freeing_mark = chromiumos_freeing_mark,
+	.free_mark = chromiumos_free_mark,
+};
+
+static struct chromiumos_super_block_mark *
+chromiumos_super_block_create(struct super_block *sb)
+{
+	struct hlist_head *hlist = chromiumos_super_block_hlist(sb);
+	struct chromiumos_super_block_mark *sbm = NULL;
+
+	WARN_ON(!mutex_is_locked(&chromiumos_super_block_hash_lock));
+
+	/* No match found, create a new entry. */
+	sbm = kzalloc(sizeof(*sbm), GFP_KERNEL);
+	if (!sbm)
+		return ERR_PTR(-ENOMEM);
+
+	atomic_set(&sbm->refcnt, 1);
+	sbm->sb = sb;
+	sbm->fsn_group = fsnotify_alloc_group(&chromiumos_fsn_ops);
+	if (IS_ERR(sbm->fsn_group)) {
+		int ret = PTR_ERR(sbm->fsn_group);
+
+		kfree(sbm);
+		return ERR_PTR(ret);
+	}
+	sbm->fsn_group->private = sbm;
+	hlist_add_head_rcu(&sbm->node, hlist);
+
+	return sbm;
+}
+
+static struct chromiumos_super_block_mark *
+chromiumos_super_block_get(struct super_block *sb)
+{
+	struct chromiumos_super_block_mark *sbm;
+
+	mutex_lock(&chromiumos_super_block_hash_lock);
+	sbm = chromiumos_super_block_lookup(sb);
+	if (!sbm)
+		sbm = chromiumos_super_block_create(sb);
+
+	mutex_unlock(&chromiumos_super_block_hash_lock);
+	return sbm;
+}
+
+/*
+ * This will only ever get called if the metadata does not already exist for
+ * an inode, so no need to worry about freeing an existing mark.
+ */
+static int
+chromiumos_inode_mark_create(
+	struct chromiumos_super_block_mark *sbm,
+	struct inode *inode,
+	enum chromiumos_inode_security_policy_type type,
+	enum chromiumos_inode_security_policy policy)
+{
+	struct chromiumos_inode_mark *inode_mark;
+	int ret;
+	size_t i;
+
+	WARN_ON(!mutex_is_locked(&sbm->fsn_group->mark_mutex));
+
+	inode_mark = kzalloc(sizeof(*inode_mark), GFP_KERNEL);
+	if (!inode_mark)
+		return -ENOMEM;
+
+	fsnotify_init_mark(&inode_mark->mark, sbm->fsn_group);
+	inode_mark->inode = igrab(inode);
+	if (!inode_mark->inode) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	/* Initialize all policies to inherit. */
+	for (i = 0; i < CHROMIUMOS_NUMBER_OF_POLICIES; i++)
+		inode_mark->policies[i] = CHROMIUMOS_INODE_POLICY_INHERIT;
+
+	inode_mark->policies[type] = policy;
+	ret = fsnotify_add_mark_locked(&inode_mark->mark, &inode->i_fsnotify_marks,
+				       type, false);
+	if (ret)
+		goto out;
+
+	/* Take an sbm reference so the created mark is accounted for. */
+	atomic_inc(&sbm->refcnt);
+
+out:
+	fsnotify_put_mark(&inode_mark->mark);
+	return ret;
+}
+
+int chromiumos_update_inode_security_policy(
+	struct inode *inode,
+	enum chromiumos_inode_security_policy_type type,
+	enum chromiumos_inode_security_policy policy)
+{
+	struct chromiumos_super_block_mark *sbm;
+	struct fsnotify_mark *mark;
+	bool free_mark = false;
+	int ret;
+	size_t i;
+
+	sbm = chromiumos_super_block_get(inode->i_sb);
+	if (IS_ERR(sbm))
+		return PTR_ERR(sbm);
+
+	mutex_lock(&sbm->fsn_group->mark_mutex);
+
+	mark = fsnotify_find_mark(&inode->i_fsnotify_marks, sbm->fsn_group);
+	if (mark) {
+		WRITE_ONCE(chromiumos_to_inode_mark(mark)->policies[type],
+				   policy);
+		/*
+		 * Frees mark if all policies are
+		 * CHROMIUM_INODE_POLICY_INHERIT.
+		 */
+		free_mark = true;
+		for (i = 0; i < CHROMIUMOS_NUMBER_OF_POLICIES; i++) {
+			if (chromiumos_to_inode_mark(mark)->policies[i]
+				!= CHROMIUMOS_INODE_POLICY_INHERIT) {
+				free_mark = false;
+				break;
+			}
+		}
+		if (free_mark)
+			fsnotify_detach_mark(mark);
+		ret = 0;
+	} else {
+		ret = chromiumos_inode_mark_create(sbm, inode, type, policy);
+	}
+
+	mutex_unlock(&sbm->fsn_group->mark_mutex);
+	chromiumos_super_block_put(sbm);
+
+	/* This must happen after dropping the mark mutex. */
+	if (free_mark)
+		fsnotify_free_mark(mark);
+	if (mark)
+		fsnotify_put_mark(mark);
+
+	return ret;
+}
+
+/* Flushes all inode security policies. */
+int chromiumos_flush_inode_security_policies(struct super_block *sb)
+{
+	struct chromiumos_super_block_mark *sbm;
+
+	sbm = chromiumos_super_block_lookup(sb);
+	if (sbm) {
+		fsnotify_clear_marks_by_group(sbm->fsn_group,
+					      FSNOTIFY_OBJ_ALL_TYPES_MASK);
+		chromiumos_super_block_put(sbm);
+	}
+
+	return 0;
+}
+
+enum chromiumos_inode_security_policy chromiumos_get_inode_security_policy(
+	struct dentry *dentry, struct inode *inode,
+	enum chromiumos_inode_security_policy_type type)
+{
+	struct chromiumos_super_block_mark *sbm;
+	/*
+	 * Initializes policy to CHROMIUM_INODE_POLICY_INHERIT, which is
+	 * the value that will be returned if neither |dentry| nor any
+	 * directory in its path has been asigned an inode security policy
+	 * value for the given type.
+	 */
+	enum chromiumos_inode_security_policy policy =
+		CHROMIUMOS_INODE_POLICY_INHERIT;
+
+	if (!dentry || !inode || type >= CHROMIUMOS_NUMBER_OF_POLICIES)
+		return policy;
+
+	sbm = chromiumos_super_block_lookup(inode->i_sb);
+	if (!sbm)
+		return policy;
+
+	/* Walk the dentry path and look for a traversal policy. */
+	rcu_read_lock();
+	while (1) {
+		struct fsnotify_mark *mark = fsnotify_find_mark(
+			&inode->i_fsnotify_marks, sbm->fsn_group);
+		if (mark) {
+			struct chromiumos_inode_mark *inode_mark =
+				chromiumos_to_inode_mark(mark);
+			policy = READ_ONCE(inode_mark->policies[type]);
+			fsnotify_put_mark(mark);
+
+			if (policy != CHROMIUMOS_INODE_POLICY_INHERIT)
+				break;
+		}
+
+		if (IS_ROOT(dentry))
+			break;
+		dentry = READ_ONCE(dentry->d_parent);
+		if (!dentry)
+			break;
+		inode = d_inode_rcu(dentry);
+		if (!inode)
+			break;
+	}
+	rcu_read_unlock();
+
+	chromiumos_super_block_put(sbm);
+
+	return policy;
+}

diff --git a/security/chromiumos/inode_mark.h b/security/chromiumos/inode_mark.h
new file mode 100644
index 0000000..ec00bb4
--- /dev/null
+++ b/security/chromiumos/inode_mark.h

@@ -0,0 +1,47 @@
+/*
+ * Linux Security Module for Chromium OS
+ *
+ * Copyright 2016 Google Inc. All Rights Reserved
+ *
+ * Authors:
+ *      Mattias Nissler <mnissler@chromium.org>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+/* FS feature availability policy for inode. */
+enum chromiumos_inode_security_policy {
+	CHROMIUMOS_INODE_POLICY_INHERIT, /* Inherit policy from parent dir */
+	CHROMIUMOS_INODE_POLICY_ALLOW,
+	CHROMIUMOS_INODE_POLICY_BLOCK,
+};
+
+/*
+ * Inode security policy types available for use. To add an additional
+ * security policy, simply add a new member here, add the corresponding policy
+ * files in securityfs.c, and associate the files being added with the new enum
+ * member.
+ */
+enum chromiumos_inode_security_policy_type {
+	CHROMIUMOS_SYMLINK_TRAVERSAL = 0,
+	CHROMIUMOS_FIFO_ACCESS,
+	CHROMIUMOS_NUMBER_OF_POLICIES, /* Do not add entries after this line. */
+};
+
+extern int chromiumos_update_inode_security_policy(
+	struct inode *inode,
+	enum chromiumos_inode_security_policy_type type,
+	enum chromiumos_inode_security_policy policy);
+int chromiumos_flush_inode_security_policies(struct super_block *sb);
+
+extern enum chromiumos_inode_security_policy
+chromiumos_get_inode_security_policy(
+	struct dentry *dentry, struct inode *inode,
+	enum chromiumos_inode_security_policy_type type);

diff --git a/security/chromiumos/lsm.c b/security/chromiumos/lsm.c
new file mode 100644
index 0000000..c2128de
--- /dev/null
+++ b/security/chromiumos/lsm.c

@@ -0,0 +1,442 @@
+/*
+ * Linux Security Module for Chromium OS
+ *
+ * Copyright 2011 Google Inc. All Rights Reserved
+ *
+ * Authors:
+ *      Stephan Uphoff  <ups@google.com>
+ *      Kees Cook       <keescook@chromium.org>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt) "Chromium OS LSM: " fmt
+
+#include <asm/syscall.h>
+#include <linux/cred.h>
+#include <linux/fs.h>
+#include <linux/fs_struct.h>
+#include <linux/hashtable.h>
+#include <linux/lsm_hooks.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/namei.h>	/* for nameidata_get_total_link_count */
+#include <linux/path.h>
+#include <linux/ptrace.h>
+#include <linux/sched/task_stack.h>
+#include <linux/sched.h>	/* current and other task related stuff */
+#include <linux/security.h>
+
+#include "inode_mark.h"
+#include "utils.h"
+
+#define NUM_BITS 8 // 128 buckets in hash table
+
+static DEFINE_HASHTABLE(sb_nosymfollow_hashtable, NUM_BITS);
+
+struct sb_entry {
+	struct hlist_node next;
+	struct hlist_node dlist; /* for deletion cleanup */
+	uintptr_t sb;
+};
+
+#if defined(CONFIG_SECURITY_CHROMIUMOS_NO_UNPRIVILEGED_UNSAFE_MOUNTS) || \
+	defined(CONFIG_SECURITY_CHROMIUMOS_NO_SYMLINK_MOUNT)
+static void report(const char *origin, const struct path *path, char *operation)
+{
+	char *alloced = NULL, *cmdline;
+	char *pathname; /* Pointer to either static string or "alloced". */
+
+	if (!path)
+		pathname = "<unknown>";
+	else {
+		/* We will allow 11 spaces for ' (deleted)' to be appended */
+		alloced = pathname = kmalloc(PATH_MAX+11, GFP_KERNEL);
+		if (!pathname)
+			pathname = "<no_memory>";
+		else {
+			pathname = d_path(path, pathname, PATH_MAX+11);
+			if (IS_ERR(pathname))
+				pathname = "<too_long>";
+			else {
+				pathname = printable(pathname, PATH_MAX+11);
+				kfree(alloced);
+				alloced = pathname;
+			}
+		}
+	}
+
+	cmdline = printable_cmdline(current);
+
+	pr_notice("%s %s obj=%s pid=%d cmdline=%s\n", origin,
+		  operation, pathname, task_pid_nr(current), cmdline);
+
+	kfree(cmdline);
+	kfree(alloced);
+}
+#endif
+
+static int chromiumos_security_sb_mount(const char *dev_name,
+					const struct path *path,
+					const char *type, unsigned long flags,
+					void *data)
+{
+#ifdef CONFIG_SECURITY_CHROMIUMOS_NO_SYMLINK_MOUNT
+	if (nameidata_get_total_link_count()) {
+		report("sb_mount", path, "Mount path with symlinks prohibited");
+		pr_notice("sb_mount dev=%s type=%s flags=%#lx\n",
+			  dev_name, type, flags);
+		return -ELOOP;
+	}
+#endif
+
+#ifdef CONFIG_SECURITY_CHROMIUMOS_NO_UNPRIVILEGED_UNSAFE_MOUNTS
+	if ((!(flags & (MS_BIND | MS_MOVE | MS_SHARED | MS_PRIVATE | MS_SLAVE |
+			MS_UNBINDABLE)) ||
+	     ((flags & MS_REMOUNT) && (flags & MS_BIND))) &&
+	    !capable(CAP_SYS_ADMIN)) {
+		int required_mnt_flags = MNT_NOEXEC | MNT_NOSUID | MNT_NODEV;
+
+		if (flags & MS_REMOUNT) {
+			/*
+			 * If this is a remount, we only require that the
+			 * requested flags are a superset of the original mount
+			 * flags.
+			 */
+			required_mnt_flags &= path->mnt->mnt_flags;
+		}
+		/*
+		 * The three flags we are interested in disallowing in
+		 * unprivileged user namespaces (MS_NOEXEC, MS_NOSUID, MS_NODEV)
+		 * cannot be modified when doing a bind-mount. The kernel
+		 * attempts to dispatch calls to do_mount() within
+		 * fs/namespace.c in the following order:
+		 *
+		 * * If the MS_REMOUNT flag is present, it calls do_remount().
+		 *   When MS_BIND is also present, it only allows to modify the
+		 *   per-mount flags, which are copied into
+		 *   |required_mnt_flags|.  Otherwise it bails in the absence of
+		 *   the CAP_SYS_ADMIN in the init ns.
+		 * * If the MS_BIND flag is present, the only other flag checked
+		 *   is MS_REC.
+		 * * If any of the mount propagation flags are present
+		 *   (MS_SHARED, MS_PRIVATE, MS_SLAVE, MS_UNBINDABLE),
+		 *   flags_to_propagation_type() filters out any additional
+		 *   flags.
+		 * * If MS_MOVE flag is present, all other flags are ignored.
+		 */
+		if ((required_mnt_flags & MNT_NOEXEC) && !(flags & MS_NOEXEC)) {
+			report("sb_mount", path,
+			       "Mounting a filesystem with 'exec' flag requires CAP_SYS_ADMIN in init ns");
+			pr_notice("sb_mount dev=%s type=%s flags=%#lx\n",
+				  dev_name, type, flags);
+			return -EPERM;
+		}
+		if ((required_mnt_flags & MNT_NOSUID) && !(flags & MS_NOSUID)) {
+			report("sb_mount", path,
+			       "Mounting a filesystem with 'suid' flag requires CAP_SYS_ADMIN in init ns");
+			pr_notice("sb_mount dev=%s type=%s flags=%#lx\n",
+				  dev_name, type, flags);
+			return -EPERM;
+		}
+		if ((required_mnt_flags & MNT_NODEV) && !(flags & MS_NODEV) &&
+		    strcmp(type, "devpts")) {
+			report("sb_mount", path,
+			       "Mounting a filesystem with 'dev' flag requires CAP_SYS_ADMIN in init ns");
+			pr_notice("sb_mount dev=%s type=%s flags=%#lx\n",
+				  dev_name, type, flags);
+			return -EPERM;
+		}
+	}
+#endif
+
+	return 0;
+}
+
+static DEFINE_SPINLOCK(sb_nosymfollow_hashtable_spinlock);
+
+/* Check for entry in hash table. */
+static bool chromiumos_check_sb_nosymfollow_hashtable(struct super_block *sb)
+{
+	struct sb_entry *entry;
+	uintptr_t sb_pointer = (uintptr_t)sb;
+	bool found = false;
+
+	rcu_read_lock();
+	hash_for_each_possible_rcu(sb_nosymfollow_hashtable,
+				   entry, next, sb_pointer) {
+		if (entry->sb == sb_pointer) {
+			found = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	/*
+	 * Its possible that a policy gets added in between the time we check
+	 * above and when we return false here. Such a race condition should
+	 * not affect this check however, since it would only be relevant if
+	 * userspace tried to traverse a symlink on a filesystem before that
+	 * filesystem was done being mounted (or potentially while it was being
+	 * remounted with new mount flags).
+	 */
+	return found;
+}
+
+/* Add entry to hash table. */
+static int chromiumos_add_sb_nosymfollow_hashtable(struct super_block *sb)
+{
+	struct sb_entry *new;
+	uintptr_t sb_pointer = (uintptr_t)sb;
+
+	/* Return if entry already exists */
+	if (chromiumos_check_sb_nosymfollow_hashtable(sb))
+		return 0;
+
+	new = kzalloc(sizeof(struct sb_entry), GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+	new->sb = sb_pointer;
+	spin_lock(&sb_nosymfollow_hashtable_spinlock);
+	hash_add_rcu(sb_nosymfollow_hashtable, &new->next, sb_pointer);
+	spin_unlock(&sb_nosymfollow_hashtable_spinlock);
+	return 0;
+}
+
+/* Flush all entries from hash table. */
+void chromiumos_flush_sb_nosymfollow_hashtable(void)
+{
+	struct sb_entry *entry;
+	struct hlist_node *hlist_node;
+	unsigned int bkt_loop_cursor;
+	HLIST_HEAD(free_list);
+
+	/*
+	 * Could probably use hash_for_each_rcu here instead, but this should
+	 * be fine as well.
+	 */
+	spin_lock(&sb_nosymfollow_hashtable_spinlock);
+	hash_for_each_safe(sb_nosymfollow_hashtable, bkt_loop_cursor,
+			   hlist_node, entry, next) {
+		hash_del_rcu(&entry->next);
+		hlist_add_head(&entry->dlist, &free_list);
+	}
+	spin_unlock(&sb_nosymfollow_hashtable_spinlock);
+	synchronize_rcu();
+	hlist_for_each_entry_safe(entry, hlist_node, &free_list, dlist)
+		kfree(entry);
+}
+
+/* Remove entry from hash table. */
+static void chromiumos_remove_sb_nosymfollow_hashtable(struct super_block *sb)
+{
+	struct sb_entry *entry;
+	struct hlist_node *hlist_node;
+	uintptr_t sb_pointer = (uintptr_t)sb;
+	bool free_entry = false;
+
+	/*
+	 * Could probably use hash_for_each_rcu here instead, but this should
+	 * be fine as well.
+	 */
+	spin_lock(&sb_nosymfollow_hashtable_spinlock);
+	hash_for_each_possible_safe(sb_nosymfollow_hashtable, entry,
+			   hlist_node, next, sb_pointer) {
+		if (entry->sb == sb_pointer) {
+			hash_del_rcu(&entry->next);
+			free_entry = true;
+			break;
+		}
+	}
+	spin_unlock(&sb_nosymfollow_hashtable_spinlock);
+	if (free_entry) {
+		synchronize_rcu();
+		kfree(entry);
+	}
+}
+
+int chromiumos_security_sb_umount(struct vfsmount *mnt, int flags)
+{
+	/* If mnt->mnt_sb is in nosymfollow hashtable, remove it. */
+	chromiumos_remove_sb_nosymfollow_hashtable(mnt->mnt_sb);
+
+	return 0;
+}
+
+/*
+ * NOTE: The WARN() calls will emit a warning in cases of blocked symlink
+ * traversal attempts. These will show up in kernel warning reports
+ * collected by the crash reporter, so we have some insight on spurious
+ * failures that need addressing.
+ */
+static int chromiumos_security_inode_follow_link(struct dentry *dentry,
+						 struct inode *inode, bool rcu)
+{
+	static char accessed_path[PATH_MAX];
+	enum chromiumos_inode_security_policy policy;
+
+	/* Deny if symlinks have been disabled on this superblock. */
+	if (chromiumos_check_sb_nosymfollow_hashtable(dentry->d_sb)) {
+		WARN(1,
+		     "Blocked symlink traversal for path %x:%x:%s (symlinks were disabled on this FS through the 'nosymfollow' mount option)\n",
+		     MAJOR(dentry->d_sb->s_dev),
+		     MINOR(dentry->d_sb->s_dev),
+		     dentry_path(dentry, accessed_path, PATH_MAX));
+		return -EACCES;
+	}
+
+	policy = chromiumos_get_inode_security_policy(
+		dentry, inode,
+		CHROMIUMOS_SYMLINK_TRAVERSAL);
+
+	WARN(policy == CHROMIUMOS_INODE_POLICY_BLOCK,
+	     "Blocked symlink traversal for path %x:%x:%s (see https://goo.gl/8xICW6 for context and rationale)\n",
+	     MAJOR(dentry->d_sb->s_dev), MINOR(dentry->d_sb->s_dev),
+	     dentry_path(dentry, accessed_path, PATH_MAX));
+
+	return policy == CHROMIUMOS_INODE_POLICY_BLOCK ? -EACCES : 0;
+}
+
+static int chromiumos_security_file_open(struct file *file)
+{
+	static char accessed_path[PATH_MAX];
+	enum chromiumos_inode_security_policy policy;
+	struct dentry *dentry = file->f_path.dentry;
+
+	/* Returns 0 if file is not a FIFO */
+	if (!S_ISFIFO(file->f_inode->i_mode))
+		return 0;
+
+	policy = chromiumos_get_inode_security_policy(
+		dentry, dentry->d_inode,
+		CHROMIUMOS_FIFO_ACCESS);
+
+	/*
+	 * Emit a warning in cases of blocked fifo access attempts. These will
+	 * show up in kernel warning reports collected by the crash reporter,
+	 * so we have some insight on spurious failures that need addressing.
+	 */
+	WARN(policy == CHROMIUMOS_INODE_POLICY_BLOCK,
+	     "Blocked fifo access for path %x:%x:%s\n (see https://goo.gl/8xICW6 for context and rationale)\n",
+	     MAJOR(dentry->d_sb->s_dev), MINOR(dentry->d_sb->s_dev),
+	     dentry_path(dentry, accessed_path, PATH_MAX));
+
+	return policy == CHROMIUMOS_INODE_POLICY_BLOCK ? -EACCES : 0;
+}
+
+/*
+ * This hook inspects the string pointed to by the first parameter, looking for
+ * the "nosymfollow" mount option. The second parameter points to an empty
+ * page-sized buffer that is used for holding LSM-specific mount options that
+ * are grabbed (after this function executes, in security_sb_copy_data) from
+ * the mount string in the first parameter. Since the chromiumos LSM is stacked
+ * ahead of SELinux for ChromeOS, the page-sized buffer is empty when this
+ * function is called. If the "nosymfollow" mount option is encountered in this
+ * function, we write "nosymflw" to the empty page-sized buffer which lets us
+ * transmit information which will be visible in chromiumos_sb_kern_mount
+ * signifying that symlinks should be disabled for the sb. We store this token
+ * at a spot in the buffer that is at a greater offset than the bytes needed to
+ * record the rest of the LSM-specific mount options (e.g. those for SELinux).
+ * The "nosymfollow" option will be stripped from the mount string if it is
+ * encountered.
+ */
+int chromiumos_sb_copy_data(char *orig, char *copy)
+{
+	char *orig_copy;
+	char *orig_copy_cur;
+	char *option;
+	size_t offset = 0;
+	bool found = false;
+
+	if (!orig || *orig == 0)
+		return 0;
+
+	orig_copy = alloc_secdata();
+	if (!orig_copy)
+		return -ENOMEM;
+	strncpy(orig_copy, orig, PAGE_SIZE);
+
+	memset(orig, 0, strlen(orig));
+
+	orig_copy_cur = orig_copy;
+	while (orig_copy_cur) {
+		option = strsep(&orig_copy_cur, ",");
+		if (strcmp(option, "nosymfollow") == 0) {
+			if (found) /* Found multiple times. */
+				return -EINVAL;
+			found = true;
+		} else {
+			if (offset > 0) {
+				orig[offset] = ',';
+				offset++;
+			}
+			strcpy(orig + offset, option);
+			offset += strlen(option);
+		}
+	}
+
+	if (found)
+		strcpy(copy + offset + 1, "nosymflw");
+
+	free_secdata(orig_copy);
+	return 0;
+}
+
+/* Unfortunately the kernel doesn't implement memmem function. */
+static void *search_buffer(void *haystack, size_t haystacklen,
+			   const void *needle, size_t needlelen)
+{
+	if (!needlelen)
+		return (void *)haystack;
+	while (haystacklen >= needlelen) {
+		haystacklen--;
+		if (!memcmp(haystack, needle, needlelen))
+			return (void *)haystack;
+		haystack++;
+	}
+	return NULL;
+}
+
+int chromiumos_sb_kern_mount(struct super_block *sb, int flags, void *data)
+{
+	int ret;
+	char search_str[10] = "\0nosymflw";
+
+	if (!data)
+		return 0;
+
+	if (search_buffer(data, PAGE_SIZE, search_str, 10)) {
+		ret = chromiumos_add_sb_nosymfollow_hashtable(sb);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static struct security_hook_list chromiumos_security_hooks[] = {
+	LSM_HOOK_INIT(sb_mount, chromiumos_security_sb_mount),
+	LSM_HOOK_INIT(inode_follow_link, chromiumos_security_inode_follow_link),
+	LSM_HOOK_INIT(file_open, chromiumos_security_file_open),
+	LSM_HOOK_INIT(sb_copy_data, chromiumos_sb_copy_data),
+	LSM_HOOK_INIT(sb_kern_mount, chromiumos_sb_kern_mount),
+	LSM_HOOK_INIT(sb_umount, chromiumos_security_sb_umount)
+};
+
+static int __init chromiumos_security_init(void)
+{
+	security_add_hooks(chromiumos_security_hooks,
+			   ARRAY_SIZE(chromiumos_security_hooks), "chromiumos");
+
+	pr_info("enabled");
+
+	return 0;
+}
+security_initcall(chromiumos_security_init);

diff --git a/security/chromiumos/read_write_test_whitelists.h b/security/chromiumos/read_write_test_whitelists.h
new file mode 100644
index 0000000..5aa7370
--- /dev/null
+++ b/security/chromiumos/read_write_test_whitelists.h

@@ -0,0 +1,56 @@
+/*
+ * Linux Security Module for Chromium OS
+ *
+ * Copyright 2018 Google LLC. All Rights Reserved
+ *
+ * Authors:
+ *      Micah Morton <mortonm@chromium.org>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef READ_WRITE_TESTS_WHITELISTS_H
+#define READ_WRITE_TESTS_WHITELISTS_H
+
+/*
+ * NOTE: the purpose of this header is only to pull out the definition of this
+ * array from alt-syscall.c for the purposes of readability. It should not be
+ * included in other .c files.
+ */
+
+#include "alt-syscall.h"
+
+static struct syscall_whitelist_entry read_write_test_whitelist[] = {
+	SYSCALL_ENTRY(exit),
+	SYSCALL_ENTRY(openat),
+	SYSCALL_ENTRY(close),
+	SYSCALL_ENTRY(read),
+	SYSCALL_ENTRY(write),
+	SYSCALL_ENTRY_ALT(prctl, alt_sys_prctl),
+
+	/* open(2) is deprecated and not wired up on ARM64. */
+#ifndef CONFIG_ARM64
+	SYSCALL_ENTRY(open),
+#endif
+}; /* end read_write_test_whitelist */
+
+#ifdef CONFIG_COMPAT
+static struct syscall_whitelist_entry read_write_test_compat_whitelist[] = {
+	COMPAT_SYSCALL_ENTRY(exit),
+	COMPAT_SYSCALL_ENTRY(open),
+	COMPAT_SYSCALL_ENTRY(openat),
+	COMPAT_SYSCALL_ENTRY(close),
+	COMPAT_SYSCALL_ENTRY(read),
+	COMPAT_SYSCALL_ENTRY(write),
+	COMPAT_SYSCALL_ENTRY_ALT(prctl, alt_sys_prctl),
+}; /* end read_write_test_compat_whitelist */
+#endif /* CONFIG_COMPAT */
+
+#endif /* READ_WRITE_TESTS_WHITELISTS_H */

diff --git a/security/chromiumos/securityfs.c b/security/chromiumos/securityfs.c
new file mode 100644
index 0000000..ae2d76a
--- /dev/null
+++ b/security/chromiumos/securityfs.c

@@ -0,0 +1,241 @@
+/*
+ * Linux Security Module for Chromium OS
+ *
+ * Copyright 2016 Google Inc. All Rights Reserved
+ *
+ * Authors:
+ *      Mattias Nissler <mnissler@chromium.org>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/capability.h>
+#include <linux/cred.h>
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/sched.h>
+#include <linux/security.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+
+#include "inode_mark.h"
+
+static struct dentry *chromiumos_dir;
+static struct dentry *chromiumos_inode_policy_dir;
+
+struct chromiumos_inode_policy_file_entry {
+	const char *name;
+	int (*handle_write)(struct chromiumos_inode_policy_file_entry *,
+			    struct dentry *);
+	enum chromiumos_inode_security_policy_type type;
+	enum chromiumos_inode_security_policy policy;
+	struct dentry *dentry;
+};
+
+static int chromiumos_inode_policy_file_write(
+	struct chromiumos_inode_policy_file_entry *file_entry,
+	struct dentry *dentry)
+{
+	return chromiumos_update_inode_security_policy(dentry->d_inode,
+		file_entry->type, file_entry->policy);
+}
+
+/*
+ * Causes all marks to be removed from inodes thus removing all inode security
+ * policies.
+ */
+static int chromiumos_inode_policy_file_flush_write(
+	struct chromiumos_inode_policy_file_entry *file_entry,
+	struct dentry *dentry)
+{
+	return chromiumos_flush_inode_security_policies(dentry->d_sb);
+}
+
+static struct chromiumos_inode_policy_file_entry
+		chromiumos_inode_policy_files[] = {
+	{.name = "block_symlink",
+	 .handle_write = chromiumos_inode_policy_file_write,
+	 .type = CHROMIUMOS_SYMLINK_TRAVERSAL,
+	 .policy = CHROMIUMOS_INODE_POLICY_BLOCK},
+	{.name = "allow_symlink",
+	 .handle_write = chromiumos_inode_policy_file_write,
+	 .type = CHROMIUMOS_SYMLINK_TRAVERSAL,
+	 .policy = CHROMIUMOS_INODE_POLICY_ALLOW},
+	{.name = "reset_symlink",
+	 .handle_write = chromiumos_inode_policy_file_write,
+	 .type = CHROMIUMOS_SYMLINK_TRAVERSAL,
+	 .policy = CHROMIUMOS_INODE_POLICY_INHERIT},
+	{.name = "block_fifo",
+	 .handle_write = chromiumos_inode_policy_file_write,
+	 .type = CHROMIUMOS_FIFO_ACCESS,
+	 .policy = CHROMIUMOS_INODE_POLICY_BLOCK},
+	{.name = "allow_fifo",
+	 .handle_write = chromiumos_inode_policy_file_write,
+	 .type = CHROMIUMOS_FIFO_ACCESS,
+	 .policy = CHROMIUMOS_INODE_POLICY_ALLOW},
+	{.name = "reset_fifo",
+	 .handle_write = chromiumos_inode_policy_file_write,
+	 .type = CHROMIUMOS_FIFO_ACCESS,
+	 .policy = CHROMIUMOS_INODE_POLICY_INHERIT},
+	{.name = "flush_policies",
+	 .handle_write = &chromiumos_inode_policy_file_flush_write},
+};
+
+static int chromiumos_resolve_path(const char __user *buf, size_t len,
+				   struct path *path)
+{
+	char *filename = NULL;
+	char *canonical_buf = NULL;
+	char *canonical;
+	int ret;
+
+	if (len + 1 > PATH_MAX)
+		return -EINVAL;
+
+	/*
+	 * Copy the path to a kernel buffer. We can't use user_path_at()
+	 * since it expects a zero-terminated path, which we generally don't
+	 * have here.
+	 */
+	filename = kzalloc(len + 1, GFP_KERNEL);
+	if (!filename)
+		return -ENOMEM;
+
+	if (copy_from_user(filename, buf, len)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	ret = kern_path(filename, 0, path);
+	if (ret)
+		goto out;
+
+	/*
+	 * Make sure the path is canonical, i.e. it didn't contain symlinks. To
+	 * check this we convert |path| back to an absolute path (within the
+	 * global root) and compare the resulting path name with the passed-in
+	 * |filename|. This is stricter than needed (i.e. consecutive slashes
+	 * don't get ignored), but that's fine for our purposes.
+	 */
+	canonical_buf = kzalloc(len + 1, GFP_KERNEL);
+	if (!canonical_buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	canonical = d_absolute_path(path, canonical_buf, len + 1);
+	if (IS_ERR(canonical)) {
+		ret = PTR_ERR(canonical);
+
+		/* Buffer too short implies |filename| wasn't canonical. */
+		if (ret == -ENAMETOOLONG)
+			ret = -EMLINK;
+
+		goto out;
+	}
+
+	ret = strcmp(filename, canonical) ? -EMLINK : 0;
+
+out:
+	kfree(canonical_buf);
+	if (ret < 0)
+		path_put(path);
+	kfree(filename);
+	return ret;
+}
+
+static ssize_t chromiumos_inode_file_write(
+	struct file *file,
+	const char __user *buf,
+	size_t len,
+	loff_t *ppos)
+{
+	struct chromiumos_inode_policy_file_entry *file_entry =
+		file->f_inode->i_private;
+	struct path path = {};
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (*ppos != 0)
+		return -EINVAL;
+
+	ret = chromiumos_resolve_path(buf, len, &path);
+	if (ret)
+		return ret;
+
+	ret = file_entry->handle_write(file_entry, path.dentry);
+	path_put(&path);
+	return ret < 0 ? ret : len;
+}
+
+static const struct file_operations chromiumos_inode_policy_file_fops = {
+	.write = chromiumos_inode_file_write,
+};
+
+static void chromiumos_shutdown_securityfs(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(chromiumos_inode_policy_files); ++i) {
+		struct chromiumos_inode_policy_file_entry *entry =
+			&chromiumos_inode_policy_files[i];
+		securityfs_remove(entry->dentry);
+		entry->dentry = NULL;
+	}
+
+	securityfs_remove(chromiumos_inode_policy_dir);
+	chromiumos_inode_policy_dir = NULL;
+
+	securityfs_remove(chromiumos_dir);
+	chromiumos_dir = NULL;
+}
+
+static int chromiumos_init_securityfs(void)
+{
+	int i;
+	int ret;
+
+	chromiumos_dir = securityfs_create_dir("chromiumos", NULL);
+	if (!chromiumos_dir) {
+		ret = PTR_ERR(chromiumos_dir);
+		goto error;
+	}
+
+	chromiumos_inode_policy_dir =
+		securityfs_create_dir(
+			"inode_security_policies",
+			chromiumos_dir);
+	if (!chromiumos_inode_policy_dir) {
+		ret = PTR_ERR(chromiumos_inode_policy_dir);
+		goto error;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(chromiumos_inode_policy_files); ++i) {
+		struct chromiumos_inode_policy_file_entry *entry =
+			&chromiumos_inode_policy_files[i];
+		entry->dentry = securityfs_create_file(
+			entry->name, 0200, chromiumos_inode_policy_dir,
+			entry, &chromiumos_inode_policy_file_fops);
+		if (IS_ERR(entry->dentry)) {
+			ret = PTR_ERR(entry->dentry);
+			goto error;
+		}
+	}
+
+	return 0;
+
+error:
+	chromiumos_shutdown_securityfs();
+	return ret;
+}
+fs_initcall(chromiumos_init_securityfs);

diff --git a/security/chromiumos/third_party_whitelists.h b/security/chromiumos/third_party_whitelists.h
new file mode 100644
index 0000000..dfc4e01
--- /dev/null
+++ b/security/chromiumos/third_party_whitelists.h

@@ -0,0 +1,256 @@
+/*
+ * Linux Security Module for Chromium OS
+ *
+ * Copyright 2018 Google LLC. All Rights Reserved
+ *
+ * Authors:
+ *      Micah Morton <mortonm@chromium.org>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef THIRD_PARTY_WHITELISTS_H
+#define THIRD_PARTY_WHITELISTS_H
+
+/*
+ * NOTE: the purpose of this header is only to pull out the definition of this
+ * array from alt-syscall.c for the purposes of readability. It should not be
+ * included in other .c files.
+ */
+
+#include "alt-syscall.h"
+
+static struct syscall_whitelist_entry third_party_whitelist[] = {
+	SYSCALL_ENTRY(accept),
+	SYSCALL_ENTRY(bind),
+	SYSCALL_ENTRY(brk),
+	SYSCALL_ENTRY(chdir),
+	SYSCALL_ENTRY(clock_gettime),
+	SYSCALL_ENTRY(clone),
+	SYSCALL_ENTRY(close),
+	SYSCALL_ENTRY(connect),
+	SYSCALL_ENTRY(dup),
+	SYSCALL_ENTRY(execve),
+	SYSCALL_ENTRY(exit),
+	SYSCALL_ENTRY(exit_group),
+	SYSCALL_ENTRY(fcntl),
+	SYSCALL_ENTRY(fstat),
+	SYSCALL_ENTRY(futex),
+	SYSCALL_ENTRY(getcwd),
+	SYSCALL_ENTRY(getdents64),
+	SYSCALL_ENTRY(getpid),
+	SYSCALL_ENTRY(getpgid),
+	SYSCALL_ENTRY(getppid),
+	SYSCALL_ENTRY(getpriority),
+	SYSCALL_ENTRY(getsid),
+	SYSCALL_ENTRY(gettimeofday),
+	SYSCALL_ENTRY(ioctl),
+	SYSCALL_ENTRY(listen),
+	SYSCALL_ENTRY(lseek),
+	SYSCALL_ENTRY(madvise),
+        SYSCALL_ENTRY(memfd_create),
+	SYSCALL_ENTRY(mprotect),
+	SYSCALL_ENTRY(munmap),
+	SYSCALL_ENTRY(nanosleep),
+	SYSCALL_ENTRY(openat),
+	SYSCALL_ENTRY(prlimit64),
+	SYSCALL_ENTRY(read),
+	SYSCALL_ENTRY(recvfrom),
+	SYSCALL_ENTRY(recvmsg),
+	SYSCALL_ENTRY(rt_sigaction),
+	SYSCALL_ENTRY(rt_sigprocmask),
+	SYSCALL_ENTRY(rt_sigreturn),
+	SYSCALL_ENTRY(sendfile),
+	SYSCALL_ENTRY(sendmsg),
+	SYSCALL_ENTRY(sendto),
+	SYSCALL_ENTRY(set_robust_list),
+	SYSCALL_ENTRY(set_tid_address),
+	SYSCALL_ENTRY(setpgid),
+	SYSCALL_ENTRY(setpriority),
+	SYSCALL_ENTRY(setsid),
+	SYSCALL_ENTRY(setsockopt),
+	SYSCALL_ENTRY(socket),
+	SYSCALL_ENTRY(socketpair),
+	SYSCALL_ENTRY(syslog),
+	SYSCALL_ENTRY(statfs),
+	SYSCALL_ENTRY(umask),
+	SYSCALL_ENTRY(uname),
+	SYSCALL_ENTRY(wait4),
+	SYSCALL_ENTRY(write),
+	SYSCALL_ENTRY(writev),
+
+	/*
+	 * Deprecated syscalls which are not wired up on new architectures
+	 * such as ARM64.
+	 */
+#ifndef CONFIG_ARM64
+	SYSCALL_ENTRY(access),
+	SYSCALL_ENTRY(creat),
+	SYSCALL_ENTRY(dup2),
+	SYSCALL_ENTRY(getdents),
+	SYSCALL_ENTRY(getpgrp),
+	SYSCALL_ENTRY(lstat),
+	SYSCALL_ENTRY(mkdir),
+	SYSCALL_ENTRY(open),
+	SYSCALL_ENTRY(pipe),
+	SYSCALL_ENTRY(poll),
+	SYSCALL_ENTRY(readlink),
+	SYSCALL_ENTRY(stat),
+	SYSCALL_ENTRY(unlink),
+#endif
+
+	/* ARM32 only syscalls. */
+#if defined(CONFIG_ARM)
+	SYSCALL_ENTRY(fcntl64),
+	SYSCALL_ENTRY(fstat64),
+	SYSCALL_ENTRY(geteuid32),
+	SYSCALL_ENTRY(getuid32),
+	SYSCALL_ENTRY(_llseek),
+	SYSCALL_ENTRY(lstat64),
+	SYSCALL_ENTRY(_newselect),
+	SYSCALL_ENTRY(mmap2),
+	SYSCALL_ENTRY(stat64),
+	SYSCALL_ENTRY(ugetrlimit),
+#endif
+
+	/* 64-bit only syscalls. */
+#if defined(CONFIG_X86_64) || defined(CONFIG_ARM64)
+	SYSCALL_ENTRY(getegid),
+	SYSCALL_ENTRY(geteuid),
+	SYSCALL_ENTRY(getgid),
+	SYSCALL_ENTRY(getrlimit),
+	SYSCALL_ENTRY(getuid),
+	SYSCALL_ENTRY(mmap),
+	SYSCALL_ENTRY(setgid),
+	SYSCALL_ENTRY(setuid),
+	/*
+	 * chown(2), lchown(2), and select(2) are deprecated and not wired up
+	 * on ARM64.
+	 */
+#ifndef CONFIG_ARM64
+	SYSCALL_ENTRY(select),
+#endif
+#endif
+
+	/* X86_64-specific syscalls. */
+#ifdef CONFIG_X86_64
+	SYSCALL_ENTRY(arch_prctl),
+#endif
+}; /* end third_party_whitelist */
+
+#ifdef CONFIG_COMPAT
+static struct syscall_whitelist_entry third_party_compat_whitelist[] = {
+	COMPAT_SYSCALL_ENTRY(access),
+	COMPAT_SYSCALL_ENTRY(brk),
+	COMPAT_SYSCALL_ENTRY(chdir),
+	COMPAT_SYSCALL_ENTRY(clock_gettime),
+	COMPAT_SYSCALL_ENTRY(clone),
+	COMPAT_SYSCALL_ENTRY(close),
+	COMPAT_SYSCALL_ENTRY(creat),
+	COMPAT_SYSCALL_ENTRY(dup),
+	COMPAT_SYSCALL_ENTRY(dup2),
+	COMPAT_SYSCALL_ENTRY(execve),
+	COMPAT_SYSCALL_ENTRY(exit),
+	COMPAT_SYSCALL_ENTRY(exit_group),
+	COMPAT_SYSCALL_ENTRY(fcntl),
+	COMPAT_SYSCALL_ENTRY(fcntl64),
+	COMPAT_SYSCALL_ENTRY(fstat),
+	COMPAT_SYSCALL_ENTRY(fstat64),
+	COMPAT_SYSCALL_ENTRY(futex),
+	COMPAT_SYSCALL_ENTRY(getcwd),
+	COMPAT_SYSCALL_ENTRY(getdents),
+	COMPAT_SYSCALL_ENTRY(getdents64),
+	COMPAT_SYSCALL_ENTRY(getegid),
+	COMPAT_SYSCALL_ENTRY(geteuid),
+	COMPAT_SYSCALL_ENTRY(geteuid32),
+	COMPAT_SYSCALL_ENTRY(getgid),
+	COMPAT_SYSCALL_ENTRY(getpgid),
+	COMPAT_SYSCALL_ENTRY(getpgrp),
+	COMPAT_SYSCALL_ENTRY(getpid),
+	COMPAT_SYSCALL_ENTRY(getpriority),
+	COMPAT_SYSCALL_ENTRY(getppid),
+	COMPAT_SYSCALL_ENTRY(getsid),
+	COMPAT_SYSCALL_ENTRY(gettimeofday),
+	COMPAT_SYSCALL_ENTRY(getuid),
+	COMPAT_SYSCALL_ENTRY(getuid32),
+	COMPAT_SYSCALL_ENTRY(ioctl),
+	COMPAT_SYSCALL_ENTRY(_llseek),
+	COMPAT_SYSCALL_ENTRY(lseek),
+	COMPAT_SYSCALL_ENTRY(lstat),
+	COMPAT_SYSCALL_ENTRY(lstat64),
+	COMPAT_SYSCALL_ENTRY(madvise),
+        COMPAT_SYSCALL_ENTRY(memfd_create),
+	COMPAT_SYSCALL_ENTRY(mkdir),
+	COMPAT_SYSCALL_ENTRY(mmap2),
+	COMPAT_SYSCALL_ENTRY(mprotect),
+	COMPAT_SYSCALL_ENTRY(munmap),
+	COMPAT_SYSCALL_ENTRY(nanosleep),
+	COMPAT_SYSCALL_ENTRY(_newselect),
+	COMPAT_SYSCALL_ENTRY(open),
+	COMPAT_SYSCALL_ENTRY(openat),
+	COMPAT_SYSCALL_ENTRY(pipe),
+	COMPAT_SYSCALL_ENTRY(poll),
+	COMPAT_SYSCALL_ENTRY(prlimit64),
+	COMPAT_SYSCALL_ENTRY(read),
+	COMPAT_SYSCALL_ENTRY(readlink),
+	COMPAT_SYSCALL_ENTRY(rt_sigaction),
+	COMPAT_SYSCALL_ENTRY(rt_sigprocmask),
+	COMPAT_SYSCALL_ENTRY(rt_sigreturn),
+	COMPAT_SYSCALL_ENTRY(sendfile),
+	COMPAT_SYSCALL_ENTRY(set_robust_list),
+	COMPAT_SYSCALL_ENTRY(set_tid_address),
+	COMPAT_SYSCALL_ENTRY(setgid32),
+	COMPAT_SYSCALL_ENTRY(setuid32),
+	COMPAT_SYSCALL_ENTRY(setpgid),
+	COMPAT_SYSCALL_ENTRY(setpriority),
+	COMPAT_SYSCALL_ENTRY(setsid),
+	COMPAT_SYSCALL_ENTRY(stat),
+	COMPAT_SYSCALL_ENTRY(stat64),
+	COMPAT_SYSCALL_ENTRY(statfs),
+	COMPAT_SYSCALL_ENTRY(syslog),
+	COMPAT_SYSCALL_ENTRY(ugetrlimit),
+	COMPAT_SYSCALL_ENTRY(umask),
+	COMPAT_SYSCALL_ENTRY(uname),
+	COMPAT_SYSCALL_ENTRY(unlink),
+	COMPAT_SYSCALL_ENTRY(wait4),
+	COMPAT_SYSCALL_ENTRY(write),
+	COMPAT_SYSCALL_ENTRY(writev),
+
+	/* IA32 uses the common socketcall(2) entrypoint for socket calls. */
+#ifdef CONFIG_X86_64
+	COMPAT_SYSCALL_ENTRY(socketcall),
+#endif
+
+#ifdef CONFIG_ARM64
+	COMPAT_SYSCALL_ENTRY(accept),
+	COMPAT_SYSCALL_ENTRY(bind),
+	COMPAT_SYSCALL_ENTRY(connect),
+	COMPAT_SYSCALL_ENTRY(listen),
+	COMPAT_SYSCALL_ENTRY(recvfrom),
+	COMPAT_SYSCALL_ENTRY(recvmsg),
+	COMPAT_SYSCALL_ENTRY(sendmsg),
+	COMPAT_SYSCALL_ENTRY(sendto),
+	COMPAT_SYSCALL_ENTRY(setsockopt),
+	COMPAT_SYSCALL_ENTRY(socket),
+	COMPAT_SYSCALL_ENTRY(socketpair),
+#endif
+
+	/*
+	 * getrlimit(2) is deprecated and not wired in the ARM compat table
+	 * on ARM64.
+	 */
+#ifndef CONFIG_ARM64
+	COMPAT_SYSCALL_ENTRY(getrlimit),
+#endif
+
+}; /* end third_party_compat_whitelist */
+#endif /* CONFIG_COMPAT */
+
+#endif /* THIRD_PARTY_WHITELISTS_H */

diff --git a/security/chromiumos/utils.c b/security/chromiumos/utils.c
new file mode 100644
index 0000000..d0d82d7
--- /dev/null
+++ b/security/chromiumos/utils.c

@@ -0,0 +1,157 @@
+/*
+ * Utilities for the Linux Security Module for Chromium OS
+ * (Since CONFIG_AUDIT is disabled for Chrome OS, we must repurpose
+ * a bunch of the audit string handling logic here instead.)
+ *
+ * Copyright 2012 Google Inc. All Rights Reserved
+ *
+ * Author:
+ *      Kees Cook       <keescook@chromium.org>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/sched/mm.h>
+#include <linux/security.h>
+
+#include "utils.h"
+
+/* Disallow double-quote and control characters other than space. */
+static int contains_unprintable(const char *source, size_t len)
+{
+	const unsigned char *p;
+	for (p = source; p < (const unsigned char *)source + len; p++) {
+		if (*p == '"' || *p < 0x20 || *p > 0x7e)
+			return 1;
+	}
+	return 0;
+}
+
+static char *hex_printable(const char *source, size_t len)
+{
+	size_t i;
+	char *dest, *ptr;
+	const char *hex = "0123456789ABCDEF";
+
+	/* Need to double the length of the string, plus a NULL. */
+	if (len > (INT_MAX - 1) / 2)
+		return NULL;
+	dest = kmalloc((len * 2) + 1, GFP_KERNEL);
+	if (!dest)
+		return NULL;
+
+	for (ptr = dest, i = 0; i < len; i++) {
+		*ptr++ = hex[(source[i] & 0xF0) >> 4];
+		*ptr++ = hex[source[i] & 0x0F];
+	}
+	*ptr = '\0';
+
+	return dest;
+}
+
+static char *quoted_printable(const char *source, size_t len)
+{
+	char *dest;
+
+	/* Need to add 2 double quotes and a NULL. */
+	if (len > INT_MAX - 3)
+		return NULL;
+	dest = kmalloc(len + 3, GFP_KERNEL);
+	if (!dest)
+		return NULL;
+
+	dest[0] = '"';
+	strncpy(dest + 1, source, len);
+	dest[len + 1] = '"';
+	dest[len + 2] = '\0';
+	return dest;
+}
+
+/* Return a string that has been sanitized and is safe to log. It is either
+ * in double-quotes, or is a series of hex digits.
+ */
+char *printable(char *source, size_t max_len)
+{
+	size_t len;
+
+	if (!source)
+		return NULL;
+
+	len = strnlen(source, max_len);
+	if (contains_unprintable(source, len))
+		return hex_printable(source, len);
+	else
+		return quoted_printable(source, len);
+}
+
+/* Repurposed from fs/proc/base.c, with NULL-replacement for saner printing.
+ * Allocates the buffer itself.
+ */
+char *printable_cmdline(struct task_struct *task)
+{
+	char *buffer = NULL, *sanitized;
+	int res, i;
+	unsigned int len;
+	struct mm_struct *mm;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out;
+
+	if (!mm->arg_end)
+		goto out_mm;	/* Shh! No looking before we're done */
+
+	buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buffer)
+		goto out_mm;
+
+	len = mm->arg_end - mm->arg_start;
+
+	if (len > PAGE_SIZE)
+		len = PAGE_SIZE;
+
+	res = access_process_vm(task, mm->arg_start, buffer, len, 0);
+
+	/* Space-fill NULLs. */
+	if (res > 1)
+		for (i = 0; i < res - 2; ++i)
+			if (buffer[i] == '\0')
+				buffer[i] = ' ';
+
+	/* If the NULL at the end of args has been overwritten, then
+	 * assume application is using setproctitle(3).
+	 */
+	if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) {
+		len = strnlen(buffer, res);
+		if (len < res) {
+			res = len;
+		} else {
+			len = mm->env_end - mm->env_start;
+			if (len > PAGE_SIZE - res)
+				len = PAGE_SIZE - res;
+			res += access_process_vm(task, mm->env_start,
+						 buffer+res, len, 0);
+		}
+	}
+
+	/* Make sure the buffer is always NULL-terminated. */
+	buffer[PAGE_SIZE-1] = 0;
+
+	/* Make sure result is printable. */
+	sanitized = printable(buffer, res);
+	kfree(buffer);
+	buffer = sanitized;
+
+out_mm:
+	mmput(mm);
+out:
+	return buffer;
+}

diff --git a/security/chromiumos/utils.h b/security/chromiumos/utils.h
new file mode 100644
index 0000000..7151bba
--- /dev/null
+++ b/security/chromiumos/utils.h

@@ -0,0 +1,30 @@
+/*
+ * Utilities for the Linux Security Module for Chromium OS
+ * (Since CONFIG_AUDIT is disabled for Chrome OS, we must repurpose
+ * a bunch of the audit string handling logic here instead.)
+ *
+ * Copyright 2012 Google Inc. All Rights Reserved
+ *
+ * Author:
+ *      Kees Cook       <keescook@chromium.org>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _SECURITY_CHROMIUMOS_UTILS_H
+#define _SECURITY_CHROMIUMOS_UTILS_H
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+
+char *printable(char *source, size_t max_len);
+char *printable_cmdline(struct task_struct *task);
+
+#endif /* _SECURITY_CHROMIUMOS_UTILS_H */

diff --git a/security/container/Kconfig b/security/container/Kconfig
new file mode 100644
index 0000000..827fe0a
--- /dev/null
+++ b/security/container/Kconfig

@@ -0,0 +1,18 @@
+config SECURITY_CONTAINER_MONITOR
+	bool "Monitor containerized processes"
+	depends on SECURITY
+	depends on MMU
+	depends on VSOCKETS=y
+	depends on X86_64
+	select SECURITYFS
+	help
+	  Instrument the Linux kernel to collect more information about containers
+	  and identify security threats.
+
+config SECURITY_CONTAINER_MONITOR_DEBUG
+    bool "Enable debug pr_devel logs"
+	depends on SECURITY_CONTAINER_MONITOR
+	help
+	  Define DEBUG for CSM files to compile verbose debugging messages.
+
+	  Only for debugging/testing do not enable for production.

diff --git a/security/container/Makefile b/security/container/Makefile
new file mode 100644
index 0000000..678d0f7
--- /dev/null
+++ b/security/container/Makefile

@@ -0,0 +1,16 @@
+PB_CCFLAGS := -DPB_SYSTEM_HEADER="<pbsystem.h>" \
+	-DPB_NO_ERRMSG \
+	-DPB_FIELD_16BIT \
+	-DPB_BUFFER_ONLY
+export PB_CCFLAGS
+
+subdir-$(CONFIG_SECURITY_CONTAINER_MONITOR) += protos
+
+obj-$(CONFIG_SECURITY_CONTAINER_MONITOR) += protos/
+obj-$(CONFIG_SECURITY_CONTAINER_MONITOR) += monitor.o pb.o process.o vsock.o
+
+ccflags-y := -I$(srctree)/security/container/protos \
+	-I$(srctree)/security/container/protos/nanopb \
+	-I$(srctree)/fs \
+	$(PB_CCFLAGS)
+ccflags-$(CONFIG_SECURITY_CONTAINER_MONITOR_DEBUG) += -DDEBUG

diff --git a/security/container/monitor.c b/security/container/monitor.c
new file mode 100644
index 0000000..05e54e5
--- /dev/null
+++ b/security/container/monitor.c

@@ -0,0 +1,782 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Container Security Monitor module
+ *
+ * Copyright (c) 2018 Google, Inc
+ */
+
+#include "monitor.h"
+#include "process.h"
+
+#include <linux/audit.h>
+#include <linux/lsm_hooks.h>
+#include <linux/module.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/rwsem.h>
+#include <linux/string.h>
+#include <linux/sysctl.h>
+#include <linux/socket.h>
+#include <net/sock.h>
+#include <linux/vm_sockets.h>
+#include <linux/file.h>
+
+/* protects csm_*_enabled and configurations. */
+DECLARE_RWSEM(csm_rwsem_config);
+
+/* protects csm_host_port and csm_vsocket. */
+DECLARE_RWSEM(csm_rwsem_vsocket);
+
+/* queue used for poll wait on config changes. */
+static DECLARE_WAIT_QUEUE_HEAD(config_wait);
+
+/* increase each time a new configuration is applied. */
+static unsigned long config_version;
+
+/* Stats gathered from the LSM. */
+struct container_stats csm_stats;
+
+struct container_stats_mapping {
+	const char *key;
+	size_t *value;
+};
+
+/* Key value pair mapping for the sysfs entry. */
+struct container_stats_mapping csm_stats_mapping[] = {
+	{ "ProtoEncodingFailed", &csm_stats.proto_encoding_failed },
+	{ "WorkQueueFailed", &csm_stats.workqueue_failed },
+	{ "EventWritingFailed", &csm_stats.event_writing_failed },
+	{ "SizePickingFailed", &csm_stats.size_picking_failed },
+	{ "PipeAlreadyOpened", &csm_stats.pipe_already_opened },
+};
+
+/*
+ * Is monitoring enabled? Defaults to disabled.
+ * These variables might be used without locking csm_rwsem_config to check if an
+ * LSM hook can bail quickly. The semaphore is taken later to ensure CSM is
+ * still enabled.
+ *
+ * csm_enabled is true if any collector is enabled.
+ */
+bool csm_enabled;
+static bool csm_container_enabled;
+bool csm_execute_enabled;
+bool csm_memexec_enabled;
+
+/* securityfs control files */
+static struct dentry *csm_dir;
+static struct dentry *csm_enabled_file;
+static struct dentry *csm_container_file;
+static struct dentry *csm_config_file;
+static struct dentry *csm_config_vers_file;
+static struct dentry *csm_pipe_file;
+static struct dentry *csm_stats_file;
+
+/* Pipes to forward data to user-mode. */
+DECLARE_RWSEM(csm_rwsem_pipe);
+static struct file *csm_user_read_pipe;
+struct file *csm_user_write_pipe;
+
+/* Option to disable the CSM features at boot. */
+static bool cmdline_boot_disabled;
+bool cmdline_boot_vsock_enabled;
+
+/* Options disabled by default. */
+static bool cmdline_boot_pipe_enabled;
+static bool cmdline_boot_config_enabled;
+
+/* Option to fully enabled the LSM at boot for automated testing. */
+static bool cmdline_default_enabled;
+
+static int csm_boot_disabled_setup(char *str)
+{
+	return kstrtobool(str, &cmdline_boot_disabled);
+}
+early_param("csm.disabled", csm_boot_disabled_setup);
+
+static int csm_default_enabled_setup(char *str)
+{
+	return kstrtobool(str, &cmdline_default_enabled);
+}
+early_param("csm.default.enabled", csm_default_enabled_setup);
+
+static int csm_boot_vsock_enabled_setup(char *str)
+{
+	return kstrtobool(str, &cmdline_boot_vsock_enabled);
+}
+early_param("csm.vsock.enabled", csm_boot_vsock_enabled_setup);
+
+static int csm_boot_pipe_enabled_setup(char *str)
+{
+	return kstrtobool(str, &cmdline_boot_pipe_enabled);
+}
+early_param("csm.pipe.enabled", csm_boot_pipe_enabled_setup);
+
+static int csm_boot_config_enabled_setup(char *str)
+{
+	return kstrtobool(str, &cmdline_boot_config_enabled);
+}
+early_param("csm.config.enabled", csm_boot_config_enabled_setup);
+
+static bool pipe_in_use(void)
+{
+	struct pipe_inode_info *pipe;
+
+	lockdep_assert_held_exclusive(&csm_rwsem_config);
+	if (csm_user_read_pipe) {
+		pipe = get_pipe_info(csm_user_read_pipe);
+		if (pipe)
+			return READ_ONCE(pipe->readers) > 1;
+	}
+	return false;
+}
+
+/* Close pipe, force has to be true to close pipe if it is still being used. */
+int close_pipe_files(bool force)
+{
+	if (csm_user_read_pipe) {
+		/* Pipe is still used. */
+		if (pipe_in_use()) {
+			if (!force)
+				return -EBUSY;
+			pr_warn("pipe is closed while it is still being used.\n");
+		}
+
+		fput(csm_user_read_pipe);
+		fput(csm_user_write_pipe);
+		csm_user_read_pipe = NULL;
+		csm_user_write_pipe = NULL;
+	}
+	return 0;
+}
+
+static void csm_update_config(schema_ConfigurationRequest *req)
+{
+	schema_ExecuteCollectorConfig *econf;
+	size_t i;
+	bool enumerate_processes = false;
+
+	/* Expect the lock to be held for write before this call. */
+	lockdep_assert_held_exclusive(&csm_rwsem_config);
+
+	/* This covers the scenario where a client is connected and the config
+	 * transitions the execute collector from disabled to enabled. In that
+	 * case there may have been execute events not sent. So they are
+	 * enumerated.
+	 */
+	if (!csm_execute_enabled && req->execute_config.enabled &&
+	    pipe_in_use())
+		enumerate_processes = true;
+
+	csm_container_enabled = req->container_config.enabled;
+	csm_execute_enabled = req->execute_config.enabled;
+	csm_memexec_enabled = req->memexec_config.enabled;
+
+	/* csm_enabled is true if any collector is enabled. */
+	csm_enabled = csm_container_enabled || csm_execute_enabled ||
+		csm_memexec_enabled;
+
+	/* Clean-up existing configurations. */
+	kfree(csm_execute_config.envp_allowlist);
+	memset(&csm_execute_config, 0, sizeof(csm_execute_config));
+
+	if (csm_execute_enabled) {
+		econf = &req->execute_config;
+		csm_execute_config.argv_limit = econf->argv_limit;
+		csm_execute_config.envp_limit = econf->envp_limit;
+
+		/* Swap the allowlist so it is not freed on return. */
+		csm_execute_config.envp_allowlist = econf->envp_allowlist.arg;
+		econf->envp_allowlist.arg = NULL;
+	}
+
+	/* Reset all stats and close pipe if disabled. */
+	if (!csm_enabled) {
+		for (i = 0; i < ARRAY_SIZE(csm_stats_mapping); i++)
+			*csm_stats_mapping[i].value = 0;
+
+		close_pipe_files(true);
+	}
+
+	config_version++;
+	if (enumerate_processes)
+		csm_enumerate_processes();
+	wake_up(&config_wait);
+}
+
+int csm_update_config_from_buffer(void *data, size_t size)
+{
+	schema_ConfigurationRequest c = schema_ConfigurationRequest_init_zero;
+	pb_istream_t istream;
+
+	c.execute_config.envp_allowlist.funcs.decode = pb_decode_string_array;
+
+	istream = pb_istream_from_buffer(data, size);
+	if (!pb_decode(&istream, schema_ConfigurationRequest_fields, &c)) {
+		kfree(c.execute_config.envp_allowlist.arg);
+		return -EINVAL;
+	}
+
+	down_write(&csm_rwsem_config);
+	csm_update_config(&c);
+	up_write(&csm_rwsem_config);
+
+	return 0;
+}
+
+static ssize_t csm_config_write(struct file *file, const char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	ssize_t err = 0;
+	void *mem;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* No partial writes. */
+	if (*ppos != 0)
+		return -EINVAL;
+
+	/* Duplicate user memory to safely parse protobuf. */
+	mem = memdup_user(buf, count);
+	if (IS_ERR(mem))
+		return PTR_ERR(mem);
+
+	err = csm_update_config_from_buffer(mem, count);
+	if (!err)
+		err = count;
+
+	kfree(mem);
+	return err;
+}
+
+static const struct file_operations csm_config_fops = {
+	.write = csm_config_write,
+};
+
+static void csm_enable(void)
+{
+	schema_ConfigurationRequest req = schema_ConfigurationRequest_init_zero;
+
+	/* Expect the lock to be held for write before this call. */
+	lockdep_assert_held_exclusive(&csm_rwsem_config);
+
+	/* Default configuration */
+	req.container_config.enabled = true;
+	req.execute_config.enabled = true;
+	req.execute_config.argv_limit = UINT_MAX;
+	req.execute_config.envp_limit = UINT_MAX;
+	req.memexec_config.enabled = true;
+	csm_update_config(&req);
+}
+
+static void csm_disable(void)
+{
+	schema_ConfigurationRequest req = schema_ConfigurationRequest_init_zero;
+
+	/* Expect the lock to be held for write before this call. */
+	lockdep_assert_held_exclusive(&csm_rwsem_config);
+
+	/* Zero configuration disable all collectors. */
+	csm_update_config(&req);
+	pr_info("disabled\n");
+}
+
+static ssize_t csm_enabled_read(struct file *file, char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	const char *str = csm_enabled ? "1\n" : "0\n";
+
+	return simple_read_from_buffer(buf, count, ppos, str, 2);
+}
+
+static ssize_t csm_enabled_write(struct file *file, const char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	bool enabled;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (count <= 0 || count > PAGE_SIZE || *ppos)
+		return -EINVAL;
+
+	err = kstrtobool_from_user(buf, count, &enabled);
+	if (err)
+		return err;
+
+	down_write(&csm_rwsem_config);
+
+	if (enabled)
+		csm_enable();
+	else
+		csm_disable();
+
+	up_write(&csm_rwsem_config);
+
+	return count;
+}
+
+static const struct file_operations csm_enabled_fops = {
+	.read = csm_enabled_read,
+	.write = csm_enabled_write,
+};
+
+static int csm_config_version_open(struct inode *inode, struct file *file)
+{
+	/* private_data is used to keep the latest config version read. */
+	file->private_data = (void*)-1;
+	return 0;
+}
+
+static ssize_t csm_config_version_read(struct file *file, char __user *buf,
+				       size_t count, loff_t *ppos)
+{
+	unsigned long version = config_version;
+	file->private_data = (void*)version;
+	return simple_read_from_buffer(buf, count, ppos, &version,
+				       sizeof(version));
+}
+
+static __poll_t csm_config_version_poll(struct file *file,
+					struct poll_table_struct *poll_tab)
+{
+	if ((unsigned long)file->private_data != config_version)
+		return EPOLLIN;
+	poll_wait(file, &config_wait, poll_tab);
+	if ((unsigned long)file->private_data != config_version)
+		return EPOLLIN;
+	return 0;
+}
+
+static const struct file_operations csm_config_version_fops = {
+	.open = csm_config_version_open,
+	.read = csm_config_version_read,
+	.poll = csm_config_version_poll,
+};
+
+static int csm_pipe_open(struct inode *inode, struct file *file)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (!csm_enabled)
+		return -EAGAIN;
+	return 0;
+}
+
+/* Similar to file_clone_open that is available only in 4.19 and up. */
+static inline struct file *pipe_clone_open(struct file *file)
+{
+	return dentry_open(&file->f_path, file->f_flags, file->f_cred);
+}
+
+/* Check if the pipe is still used, else recreate and dup it. */
+static struct file *csm_dup_pipe(void)
+{
+	long pipe_size = 1024 * PAGE_SIZE;
+	long actual_size;
+	struct file *pipes[2] = {NULL, NULL};
+	struct file *ret;
+	int err;
+
+	down_write(&csm_rwsem_pipe);
+
+	err = close_pipe_files(false);
+	if (err) {
+		ret = ERR_PTR(err);
+		csm_stats.pipe_already_opened++;
+		goto out;
+	}
+
+	err = create_pipe_files(pipes, O_NONBLOCK);
+	if (err) {
+		ret = ERR_PTR(err);
+		goto out;
+	}
+
+	/*
+	 * Try to increase the pipe size to 1024 pages, if there is not
+	 * enough memory, pipes will stay unchanged.
+	 */
+	actual_size = pipe_fcntl(pipes[0], F_SETPIPE_SZ, pipe_size);
+	if (actual_size != pipe_size)
+		pr_err("failed to resize pipe to 1024 pages, error: %ld, fallback to the default value\n",
+		       actual_size);
+
+	csm_user_read_pipe = pipes[0];
+	csm_user_write_pipe = pipes[1];
+
+	/* Clone the file so we can track if the reader is still used. */
+	ret = pipe_clone_open(csm_user_read_pipe);
+
+out:
+	up_write(&csm_rwsem_pipe);
+	return ret;
+}
+
+static ssize_t csm_pipe_read(struct file *file, char __user *buf,
+				       size_t count, loff_t *ppos)
+{
+	int fd;
+	ssize_t err;
+	struct file *local_pipe;
+
+	/* No partial reads. */
+	if (*ppos != 0)
+		return -EINVAL;
+
+	fd = get_unused_fd_flags(0);
+	if (fd < 0)
+		return fd;
+
+	local_pipe = csm_dup_pipe();
+	if (IS_ERR(local_pipe)) {
+		err = PTR_ERR(local_pipe);
+		local_pipe = NULL;
+		goto error;
+	}
+
+	err = simple_read_from_buffer(buf, count, ppos, &fd, sizeof(fd));
+	if (err < 0)
+		goto error;
+
+	if (err < sizeof(fd)) {
+		err = -EINVAL;
+		goto error;
+	}
+
+	/* Install the file descriptor when we know everything succeeded. */
+	fd_install(fd, local_pipe);
+
+	csm_enumerate_processes();
+
+	return err;
+
+error:
+	if (local_pipe)
+		fput(local_pipe);
+	put_unused_fd(fd);
+	return err;
+}
+
+
+static const struct file_operations csm_pipe_fops = {
+	.open = csm_pipe_open,
+	.read = csm_pipe_read,
+};
+
+static void set_container_decode_callbacks(schema_Container *container)
+{
+	container->pod_namespace.funcs.decode = pb_decode_string_field;
+	container->pod_name.funcs.decode = pb_decode_string_field;
+	container->container_name.funcs.decode = pb_decode_string_field;
+	container->container_image_uri.funcs.decode = pb_decode_string_field;
+	container->labels.funcs.decode = pb_decode_string_array;
+}
+
+static void set_container_encode_callbacks(schema_Container *container)
+{
+	container->pod_namespace.funcs.encode = pb_encode_string_field;
+	container->pod_name.funcs.encode = pb_encode_string_field;
+	container->container_name.funcs.encode = pb_encode_string_field;
+	container->container_image_uri.funcs.encode = pb_encode_string_field;
+	container->labels.funcs.encode = pb_encode_string_array;
+}
+
+static void free_container_callbacks_args(schema_Container *container)
+{
+	kfree(container->pod_namespace.arg);
+	kfree(container->pod_name.arg);
+	kfree(container->container_name.arg);
+	kfree(container->container_image_uri.arg);
+	kfree(container->labels.arg);
+}
+
+static ssize_t csm_container_write(struct file *file, const char __user *buf,
+				   size_t count, loff_t *ppos)
+{
+	ssize_t err = 0;
+	void *mem;
+	u64 cid;
+	pb_istream_t istream;
+	struct task_struct *task;
+	schema_ContainerReport report = schema_ContainerReport_init_zero;
+	schema_Event event = schema_Event_init_zero;
+	schema_Container *container;
+	char *uuid = NULL;
+
+	/* Notify that this collector is not yet enabled. */
+	if (!csm_container_enabled)
+		return -EAGAIN;
+
+	/* No partial writes. */
+	if (*ppos != 0)
+		return -EINVAL;
+
+	/* Duplicate user memory to safely parse protobuf. */
+	mem = memdup_user(buf, count);
+	if (IS_ERR(mem))
+		return PTR_ERR(mem);
+
+	/* Callback to decode string in protobuf. */
+	set_container_decode_callbacks(&report.container);
+
+	istream = pb_istream_from_buffer(mem, count);
+	if (!pb_decode(&istream, schema_ContainerReport_fields, &report)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* Check protobuf is as expected */
+	if (report.pid == 0 ||
+	    report.container.container_id != 0) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* Find if the process id is linked to an existing container-id. */
+	rcu_read_lock();
+	task = find_task_by_pid_ns(report.pid, &init_pid_ns);
+	if (task) {
+		cid = audit_get_contid(task);
+		if (cid == AUDIT_CID_UNSET)
+			err = -ENOENT;
+	} else {
+		err = -ENOENT;
+	}
+	rcu_read_unlock();
+
+	if (err)
+		goto out;
+
+	uuid = kzalloc(PROCESS_UUID_SIZE, GFP_KERNEL);
+	if (!uuid)
+		goto out;
+
+	/* Provide the uuid for the top process of the container. */
+	err = get_process_uuid_by_pid(report.pid, uuid, PROCESS_UUID_SIZE);
+	if (err)
+		goto out;
+
+	/* Correct the container-id and feed the event to vsock */
+	report.container.container_id = cid;
+	report.container.init_uuid.funcs.encode = pb_encode_uuid_field;
+	report.container.init_uuid.arg = uuid;
+	container = &event.event.container.container;
+	*container = report.container;
+
+	/* Use encode callback to generate the final proto. */
+	set_container_encode_callbacks(container);
+
+	event.which_event = schema_Event_container_tag;
+
+	err = csm_sendeventproto(schema_Event_fields, &event);
+	if (!err)
+		err = count;
+
+out:
+	/* Free any allocated nanopb callback arguments. */
+	free_container_callbacks_args(&report.container);
+	kfree(uuid);
+	kfree(mem);
+	return err;
+}
+
+static const struct file_operations csm_container_fops = {
+	.write = csm_container_write,
+};
+
+static int csm_show_stats(struct seq_file *p, void *v)
+{
+	size_t i;
+
+	for (i = 0; i < ARRAY_SIZE(csm_stats_mapping); i++) {
+		seq_printf(p, "%s:\t%zu\n",
+			   csm_stats_mapping[i].key,
+			   *csm_stats_mapping[i].value);
+	}
+
+	return 0;
+}
+
+static int csm_stats_open(struct inode *inode, struct file *file)
+{
+	size_t i, size = 1; /* Start at one for the null byte. */
+
+	for (i = 0; i < ARRAY_SIZE(csm_stats_mapping); i++) {
+		/*
+		 * Calculate the maximum length:
+		 * - Length of the key
+		 * - 3 additional chars :\t\n
+		 * - longest unsigned 64-bit integer.
+		 */
+		size += strlen(csm_stats_mapping[i].key)
+			+ 3 + sizeof("18446744073709551615");
+	}
+
+	return single_open_size(file, csm_show_stats, NULL, size);
+}
+
+static const struct file_operations csm_stats_fops = {
+	.open		= csm_stats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+/* Prevent user-mode from using vsock on our port. */
+static int csm_socket_connect(struct socket *sock, struct sockaddr *address,
+			      int addrlen)
+{
+	struct sockaddr_vm *vaddr = (struct sockaddr_vm *)address;
+
+	/* Filter only vsock sockets */
+	if (!sock->sk || sock->sk->sk_family != AF_VSOCK)
+		return 0;
+
+	/* Allow kernel sockets. */
+	if (sock->sk->sk_kern_sock)
+		return 0;
+
+	if (addrlen < sizeof(*vaddr))
+		return -EINVAL;
+
+	/* Forbid access to the CSM VMM backend port. */
+	if (vaddr->svm_port == CSM_HOST_PORT)
+		return -EPERM;
+
+	return 0;
+}
+
+static int csm_setxattr(struct dentry *dentry, const char *name,
+			const void *value, size_t size, int flags)
+{
+	if (csm_enabled && !strcmp(name, XATTR_SECURITY_CSM))
+		return -EPERM;
+	return 0;
+}
+
+static struct security_hook_list csm_hooks[] __lsm_ro_after_init = {
+	/* Track process execution. */
+	LSM_HOOK_INIT(bprm_check_security, csm_bprm_check_security),
+	LSM_HOOK_INIT(task_post_alloc, csm_task_post_alloc),
+	LSM_HOOK_INIT(task_exit, csm_task_exit),
+
+	/* Block vsock access when relevant. */
+	LSM_HOOK_INIT(socket_connect, csm_socket_connect),
+
+	/* Track memory execution */
+	LSM_HOOK_INIT(file_mprotect, csm_mprotect),
+	LSM_HOOK_INIT(mmap_file, csm_mmap_file),
+
+	/* Track file modification provenance. */
+	LSM_HOOK_INIT(file_pre_free_security, csm_file_pre_free),
+
+	/* Block modyfing csm xattr. */
+	LSM_HOOK_INIT(inode_setxattr, csm_setxattr),
+};
+
+static int __init csm_init(void)
+{
+	int err;
+
+	if (cmdline_boot_disabled)
+		return 0;
+
+	/*
+	 * If cmdline_boot_vsock_enabled is false, only the event pool will be
+	 * allocated. The destroy function will clean-up only what was reserved.
+	 */
+	err = vsock_initialize();
+	if (err)
+		return err;
+
+	csm_dir = securityfs_create_dir("container_monitor", NULL);
+	if (IS_ERR(csm_dir)) {
+		err = PTR_ERR(csm_dir);
+		goto error;
+	}
+
+	csm_enabled_file = securityfs_create_file("enabled", 0644, csm_dir,
+						  NULL, &csm_enabled_fops);
+	if (IS_ERR(csm_enabled_file)) {
+		err = PTR_ERR(csm_enabled_file);
+		goto error_rmdir;
+	}
+
+	csm_container_file = securityfs_create_file("container", 0200, csm_dir,
+						  NULL, &csm_container_fops);
+	if (IS_ERR(csm_container_file)) {
+		err = PTR_ERR(csm_container_file);
+		goto error_rm_enabled;
+	}
+
+	csm_config_vers_file = securityfs_create_file("config_version", 0400,
+						      csm_dir, NULL,
+						      &csm_config_version_fops);
+	if (IS_ERR(csm_config_vers_file)) {
+		err = PTR_ERR(csm_config_vers_file);
+		goto error_rm_container;
+	}
+
+	if (cmdline_boot_config_enabled) {
+		csm_config_file = securityfs_create_file("config", 0200,
+							 csm_dir, NULL,
+							 &csm_config_fops);
+		if (IS_ERR(csm_config_file)) {
+			err = PTR_ERR(csm_config_file);
+			goto error_rm_config_vers;
+		}
+	}
+
+	if (cmdline_boot_pipe_enabled) {
+		csm_pipe_file = securityfs_create_file("pipe", 0400, csm_dir,
+						       NULL, &csm_pipe_fops);
+		if (IS_ERR(csm_pipe_file)) {
+			err = PTR_ERR(csm_pipe_file);
+			goto error_rm_config;
+		}
+	}
+
+	csm_stats_file = securityfs_create_file("stats", 0400, csm_dir,
+						 NULL, &csm_stats_fops);
+	if (IS_ERR(csm_stats_file)) {
+		err = PTR_ERR(csm_stats_file);
+		goto error_rm_pipe;
+	}
+
+	pr_debug("created securityfs control files\n");
+
+	security_add_hooks(csm_hooks, ARRAY_SIZE(csm_hooks), "csm");
+	pr_debug("registered hooks\n");
+
+	/* Off-by-default, only used for testing images. */
+	if (cmdline_default_enabled) {
+		down_write(&csm_rwsem_config);
+		csm_enable();
+		up_write(&csm_rwsem_config);
+	}
+
+	return 0;
+
+error_rm_pipe:
+	if (cmdline_boot_pipe_enabled)
+		securityfs_remove(csm_pipe_file);
+error_rm_config:
+	if (cmdline_boot_config_enabled)
+		securityfs_remove(csm_config_file);
+error_rm_config_vers:
+	securityfs_remove(csm_config_vers_file);
+error_rm_container:
+	securityfs_remove(csm_container_file);
+error_rm_enabled:
+	securityfs_remove(csm_enabled_file);
+error_rmdir:
+	securityfs_remove(csm_dir);
+error:
+	vsock_destroy();
+	pr_warn("fs initialization error: %d", err);
+	return err;
+}
+
+late_initcall(csm_init);

diff --git a/security/container/monitor.h b/security/container/monitor.h
new file mode 100644
index 0000000..cab3d5b
--- /dev/null
+++ b/security/container/monitor.h

@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Container Security Monitor module
+ *
+ * Copyright (c) 2018 Google, Inc
+ */
+
+#define pr_fmt(fmt)	"container-security-monitor: " fmt
+
+#include <linux/kernel.h>
+#include <linux/security.h>
+#include <linux/fs.h>
+#include <linux/rwsem.h>
+#include <linux/binfmts.h>
+#include <linux/xattr.h>
+#include <config.pb.h>
+#include <event.pb.h>
+#include <pb_encode.h>
+#include <pb_decode.h>
+
+#include "monitoring_protocol.h"
+
+/* Part of the CSM configuration response. */
+#define CSM_VERSION 1
+
+/* protects csm_*_enabled and configurations. */
+extern struct rw_semaphore csm_rwsem_config;
+
+/* protects csm_host_port and csm_vsocket. */
+extern struct rw_semaphore csm_rwsem_vsocket;
+
+/* Port to connect to the host on, over virtio-vsock */
+#define CSM_HOST_PORT 4444
+
+/*
+ * Is monitoring enabled? Defaults to disabled.
+ * These variables might be used as gates without locking (as processor ensures
+ * valid proper access for native scalar values) so it can bail quickly.
+ */
+extern bool csm_enabled;
+extern bool csm_execute_enabled;
+extern bool csm_memexec_enabled;
+
+/* Configuration options for execute collector. */
+struct execute_config {
+	size_t argv_limit;
+	size_t envp_limit;
+	char *envp_allowlist;
+};
+
+extern struct execute_config csm_execute_config;
+
+/* pipe to forward vsock packets to user-mode. */
+extern struct rw_semaphore csm_rwsem_pipe;
+extern struct file *csm_user_write_pipe;
+
+/* Was vsock enabled at boot time? */
+extern bool cmdline_boot_vsock_enabled;
+
+/* Stats on LSM events. */
+struct container_stats {
+	size_t proto_encoding_failed;
+	size_t event_writing_failed;
+	size_t workqueue_failed;
+	size_t size_picking_failed;
+	size_t pipe_already_opened;
+};
+
+extern struct container_stats csm_stats;
+
+/* Streams file numbers are unknown from the kernel */
+#define STDIN_FILENO	0
+#define STDOUT_FILENO	1
+#define STDERR_FILENO	2
+
+/* security attribute for file provenance. */
+#define XATTR_SECURITY_CSM XATTR_SECURITY_PREFIX "csm"
+
+/* monitor functions */
+int csm_update_config_from_buffer(void *data, size_t size);
+
+/* vsock functions */
+int vsock_initialize(void);
+void vsock_destroy(void);
+int vsock_late_initialize(void);
+int csm_sendeventproto(const pb_field_t fields[], schema_Event *event);
+int csm_sendconfigrespproto(const pb_field_t fields[],
+			    schema_ConfigurationResponse *resp);
+
+/* process events functions */
+int csm_bprm_check_security(struct linux_binprm *bprm);
+void csm_task_exit(struct task_struct *task);
+void csm_task_post_alloc(struct task_struct *task);
+int get_process_uuid_by_pid(pid_t pid_nr, char *buffer, size_t size);
+
+/* memory execution events functions */
+int csm_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
+					  unsigned long prot);
+int csm_mmap_file(struct file *file, unsigned long reqprot,
+				  unsigned long prot, unsigned long flags);
+
+/* Tracking of file modification provenance. */
+void csm_file_pre_free(struct file *file);
+
+/* nano functions */
+bool pb_encode_string_field(pb_ostream_t *stream, const pb_field_t *field,
+			    void * const *arg);
+bool pb_decode_string_field(pb_istream_t *stream, const pb_field_t *field,
+		      void **arg);
+ssize_t pb_encode_string_field_limit(pb_ostream_t *stream,
+				     const pb_field_t *field,
+				     void * const *arg, size_t limit);
+bool pb_encode_string_array(pb_ostream_t *stream, const pb_field_t *field,
+			    void * const *arg);
+bool pb_decode_string_array(pb_istream_t *stream, const pb_field_t *field,
+			    void **arg);
+bool pb_encode_uuid_field(pb_ostream_t *stream, const pb_field_t *field,
+			  void * const *arg);
+bool pb_encode_ip4(pb_ostream_t *stream, const pb_field_t *field,
+		   void * const *arg);
+bool pb_encode_ip6(pb_ostream_t *stream, const pb_field_t *field,
+		   void * const *arg);
+

diff --git a/security/container/monitoring_protocol.h b/security/container/monitoring_protocol.h
new file mode 100644
index 0000000..dbdfc9c
--- /dev/null
+++ b/security/container/monitoring_protocol.h

@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+/* Container security monitoring protocol definitions */
+
+#include <linux/types.h>
+
+enum csm_msgtype {
+	CSM_MSG_TYPE_HEARTBEAT = 1,
+	CSM_MSG_EVENT_PROTO = 2,
+	CSM_MSG_CONFIG_REQUEST_PROTO = 3,
+	CSM_MSG_CONFIG_RESPONSE_PROTO = 4,
+};
+
+struct csm_msg_hdr {
+	__le32 msg_type;
+	__le32 msg_length;
+};
+
+/* The process uuid is a 128-bits identifier */
+#define PROCESS_UUID_SIZE 16
+
+/* The entire structure forms the collision domain. */
+union process_uuid {
+	struct {
+		__u32 machineid;
+		__u64 start_time;
+		__u32 tgid;
+	} __attribute__((packed));
+	__u8 data[PROCESS_UUID_SIZE];
+};

diff --git a/security/container/pb.c b/security/container/pb.c
new file mode 100644
index 0000000..f24e58f
--- /dev/null
+++ b/security/container/pb.c

@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Container Security Monitor module
+ *
+ * Copyright (c) 2018 Google, Inc
+ */
+
+#include "monitor.h"
+
+#include <linux/string.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/ipv6.h>
+
+bool pb_encode_string_field(pb_ostream_t *stream, const pb_field_t *field,
+			    void * const *arg)
+{
+	const uint8_t *str = (const uint8_t *)*arg;
+
+	/* If the string is not set, skip this string. */
+	if (!str)
+		return true;
+
+	if (!pb_encode_tag_for_field(stream, field))
+		return false;
+
+	return pb_encode_string(stream, str, strlen(str));
+}
+
+bool pb_decode_string_field(pb_istream_t *stream, const pb_field_t *field,
+			    void **arg)
+{
+	size_t size;
+	void *data;
+
+	*arg = NULL;
+
+	size = stream->bytes_left;
+
+	/* Ensure a null-byte at the end */
+	if (size + 1 < size)
+		return false;
+
+	data = kzalloc(size + 1, GFP_KERNEL);
+	if (!data)
+		return false;
+
+	if (!pb_read(stream, data, size)) {
+		kfree(data);
+		return false;
+	}
+
+	*arg = data;
+
+	return true;
+}
+
+bool pb_encode_string_array(pb_ostream_t *stream, const pb_field_t *field,
+			    void * const *arg)
+{
+	char *strs = (char *)*arg;
+
+	/* If the string array is not set, skip this string array. */
+	if (!strs)
+		return true;
+
+	do {
+		if (!pb_encode_string_field(stream, field,
+					    (void * const *) &strs))
+			return false;
+
+		strs += strlen(strs) + 1;
+	} while (*strs != 0);
+
+	return true;
+}
+
+/* Limit the encoded string size and return how many characters were added. */
+ssize_t pb_encode_string_field_limit(pb_ostream_t *stream,
+				     const pb_field_t *field,
+				     void * const *arg, size_t limit)
+{
+	char *str = (char *)*arg;
+	size_t length;
+
+	/* If the string is not set, skip this string. */
+	if (!str)
+		return 0;
+
+	if (!pb_encode_tag_for_field(stream, field))
+		return -EINVAL;
+
+	length = strlen(str);
+	if (length > limit)
+		length = limit;
+
+	if (!pb_encode_string(stream, (uint8_t *)str, length))
+		return -EINVAL;
+
+	return length;
+}
+
+bool pb_decode_string_array(pb_istream_t *stream, const pb_field_t *field,
+			    void **arg)
+{
+	size_t needed, used = 0;
+	char *data, *strs;
+
+	/* String length, and two null-bytes for the end of the list. */
+	needed = stream->bytes_left + 2;
+	if (needed < stream->bytes_left)
+		return false;
+
+	if (*arg) {
+		/* Calculate used space from the current list. */
+		strs = (char *)*arg;
+		do {
+			used += strlen(strs + used) + 1;
+		} while (strs[used] != 0);
+
+		if (used + needed < needed)
+			return false;
+	}
+
+	data = krealloc(*arg, used + needed, GFP_KERNEL);
+	if (!data)
+		return false;
+
+	/* Will always be freed by the caller */
+	*arg = data;
+
+	/* Reset the new part of the buffer. */
+	memset(data + used, 0, needed);
+
+	/* Read what's in the stream buffer only. */
+	if (!pb_read(stream, data + used, stream->bytes_left))
+		return false;
+
+	return true;
+}
+
+bool pb_encode_fixed_string(pb_ostream_t *stream, const pb_field_t *field,
+			    const uint8_t *data, size_t length)
+{
+	/* If the data is not set, skip this string. */
+	if (!data)
+		return true;
+
+	if (!pb_encode_tag_for_field(stream, field))
+		return false;
+
+	return pb_encode_string(stream, data, length);
+}
+
+
+bool pb_encode_uuid_field(pb_ostream_t *stream, const pb_field_t *field,
+			  void * const *arg)
+{
+	return pb_encode_fixed_string(stream, field, (const uint8_t *)*arg,
+				      PROCESS_UUID_SIZE);
+}
+
+bool pb_encode_ip4(pb_ostream_t *stream, const pb_field_t *field,
+		   void * const *arg)
+{
+	return pb_encode_fixed_string(stream, field, (const uint8_t *)*arg,
+				      sizeof(struct in_addr));
+}
+
+bool pb_encode_ip6(pb_ostream_t *stream, const pb_field_t *field,
+		   void * const *arg)
+{
+	return pb_encode_fixed_string(stream, field, (const uint8_t *)*arg,
+				      sizeof(struct in6_addr));
+}

diff --git a/security/container/process.c b/security/container/process.c
new file mode 100644
index 0000000..a0c33e7
--- /dev/null
+++ b/security/container/process.c

@@ -0,0 +1,1149 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Container Security Monitor module
+ *
+ * Copyright (c) 2018 Google, Inc
+ */
+
+#include "monitor.h"
+
+#include <linux/atomic.h>
+#include <linux/audit.h>
+#include <linux/file.h>
+#include <linux/highmem.h>
+#include <linux/mempool.h>
+#include <linux/mm.h>
+#include <linux/mount.h>
+#include <linux/notifier.h>
+#include <linux/net.h>
+#include <linux/path.h>
+#include <linux/pid.h>
+#include <linux/pid_namespace.h>
+#include <linux/random.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/timekeeping.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+#include <linux/xattr.h>
+#include <net/ipv6.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <overlayfs/overlayfs.h>
+#include <uapi/linux/magic.h>
+#include <uapi/asm/mman.h>
+
+/* Configuration options for execute collector. */
+struct execute_config csm_execute_config;
+
+/* unique atomic value for the machine boot instance */
+static atomic_t machine_rand = ATOMIC_INIT(0);
+
+/* sequential container identifier */
+static atomic_t contid = ATOMIC_INIT(0);
+
+/* Generation id for each enumeration invocation. */
+static atomic_t enumeration_count = ATOMIC_INIT(0);
+
+struct file_provenance {
+	/* pid of the process doing the first write. */
+	pid_t tgid;
+	/* start_time of the process to uniquely identify it. */
+	u64 start_time;
+};
+
+struct csm_enumerate_processes_work_data {
+	struct work_struct work;
+	int enumeration_count;
+};
+
+static void *kmap_argument_stack(struct linux_binprm *bprm, void **ctx)
+{
+	char *argv;
+	int err;
+	unsigned long i, pos, count;
+	void *map;
+	struct page *page;
+
+	/* vma_pages() returns the number of pages reserved for the stack */
+	count = vma_pages(bprm->vma);
+
+	if (likely(count == 1)) {
+		err = get_user_pages_remote(current, bprm->mm, bprm->p, 1,
+					    FOLL_FORCE, &page, NULL, NULL);
+		if (err != 1)
+			return NULL;
+
+		argv = kmap(page);
+		*ctx = page;
+	} else {
+		/*
+		 * If more than one pages is needed, copy all of them to a set
+		 * of pages. Parsing the argument across kmap pages in different
+		 * addresses would make it impractical.
+		 */
+		argv = vmalloc(count * PAGE_SIZE);
+		if (!argv)
+			return NULL;
+
+		for (i = 0; i < count; i++) {
+			pos = ALIGN_DOWN(bprm->p, PAGE_SIZE) + i * PAGE_SIZE;
+			err = get_user_pages_remote(current, bprm->mm, pos, 1,
+						    FOLL_FORCE, &page, NULL,
+						    NULL);
+			if (err <= 0) {
+				vfree(argv);
+				return NULL;
+			}
+
+			map = kmap(page);
+			memcpy(argv + i * PAGE_SIZE, map, PAGE_SIZE);
+			kunmap(page);
+			put_page(page);
+		}
+		*ctx = bprm;
+	}
+
+	return argv;
+}
+
+static void kunmap_argument_stack(struct linux_binprm *bprm, void *addr,
+				  void *ctx)
+{
+	struct page *page;
+
+	if (!addr)
+		return;
+
+	if (likely(vma_pages(bprm->vma) == 1)) {
+		page = (struct page *)ctx;
+		kunmap(page);
+		put_page(ctx);
+	} else {
+		vfree(addr);
+	}
+}
+
+static char *find_array_next_entry(char *array, unsigned long *offset,
+				   unsigned long end)
+{
+	char *entry;
+	unsigned long off = *offset;
+
+	if (off >= end)
+		return NULL;
+
+	/* Check the entry is null terminated and in bound */
+	entry = array + off;
+	while (array[off]) {
+		if (++off >= end)
+			return NULL;
+	}
+
+	/* Pass the null byte for the next iteration */
+	*offset = off + 1;
+
+	return entry;
+}
+
+struct string_arr_ctx {
+	struct linux_binprm *bprm;
+	void *stack;
+};
+
+static size_t get_config_limit(size_t *config_ptr)
+{
+	lockdep_assert_held_read(&csm_rwsem_config);
+
+	/*
+	 * If execute is not enabled, do not capture arguments.
+	 * The vsock packet won't be sent anyway.
+	 */
+	if (!csm_execute_enabled)
+		return 0;
+
+	return *config_ptr;
+}
+
+static bool encode_current_argv(pb_ostream_t *stream, const pb_field_t *field,
+				void * const *arg)
+{
+	struct string_arr_ctx *ctx = (struct string_arr_ctx *)*arg;
+	int i;
+	struct linux_binprm *bprm = ctx->bprm;
+	unsigned long offset = bprm->p % PAGE_SIZE;
+	unsigned long end = vma_pages(bprm->vma) * PAGE_SIZE;
+	char *argv = ctx->stack;
+	char *entry;
+	size_t limit, used = 0;
+	ssize_t ret;
+
+	limit = get_config_limit(&csm_execute_config.argv_limit);
+	if (!limit)
+		return true;
+
+	for (i = 0; i < bprm->argc; i++) {
+		entry = find_array_next_entry(argv, &offset, end);
+		if (!entry)
+			return false;
+
+		ret = pb_encode_string_field_limit(stream, field,
+						   (void * const *)&entry,
+						   limit - used);
+		if (ret < 0)
+			return false;
+
+		used += ret;
+
+		if (used >= limit)
+			break;
+	}
+
+	return true;
+}
+
+static bool check_envp_allowlist(char *envp)
+{
+	bool ret = false;
+	char *strs, *equal;
+	size_t str_size, equal_pos;
+
+	/* If execute is not enabled, skip all. */
+	if (!csm_execute_enabled)
+		goto out;
+
+	/* No filter, allow all. */
+	strs = csm_execute_config.envp_allowlist;
+	if (!strs) {
+		ret = true;
+		goto out;
+	}
+
+	/*
+	 * Identify the key=value separation.
+	 * If none exists use the whole string as a key.
+	 */
+	equal = strchr(envp, '=');
+	equal_pos = equal ? (equal - envp) : strlen(envp);
+
+	/* Default to skip if no match found. */
+	ret = false;
+
+	do {
+		str_size = strlen(strs);
+
+		/*
+		 * If the filter length align with the key value equal sign,
+		 * it might be a match, check the key value.
+		 */
+		if (str_size == equal_pos &&
+		    !strncmp(strs, envp, str_size)) {
+			ret = true;
+			goto out;
+		}
+
+		strs += str_size + 1;
+	} while (*strs != 0);
+
+out:
+	return ret;
+}
+
+static bool encode_current_envp(pb_ostream_t *stream, const pb_field_t *field,
+				void * const *arg)
+{
+	struct string_arr_ctx *ctx = (struct string_arr_ctx *)*arg;
+	int i;
+	struct linux_binprm *bprm = ctx->bprm;
+	unsigned long offset = bprm->p % PAGE_SIZE;
+	unsigned long end = vma_pages(bprm->vma) * PAGE_SIZE;
+	char *argv = ctx->stack;
+	char *entry;
+	size_t limit, used = 0;
+	ssize_t ret;
+
+	limit = get_config_limit(&csm_execute_config.envp_limit);
+	if (!limit)
+		return true;
+
+	/* Skip arguments */
+	for (i = 0; i < bprm->argc; i++) {
+		if (!find_array_next_entry(argv, &offset, end))
+			return false;
+	}
+
+	for (i = 0; i < bprm->envc; i++) {
+		entry = find_array_next_entry(argv, &offset, end);
+		if (!entry)
+			return false;
+
+		if (!check_envp_allowlist(entry))
+			continue;
+
+		ret = pb_encode_string_field_limit(stream, field,
+						   (void * const *)&entry,
+						   limit - used);
+		if (ret < 0)
+			return false;
+
+		used += ret;
+
+		if (used >= limit)
+			break;
+	}
+
+	return true;
+}
+
+static bool is_overlayfs_mounted(struct file *file)
+{
+	struct vfsmount *mnt;
+	struct super_block *mnt_sb;
+
+	mnt = file->f_path.mnt;
+	if (mnt == NULL)
+		return false;
+
+	mnt_sb = mnt->mnt_sb;
+	if (mnt_sb == NULL || mnt_sb->s_magic != OVERLAYFS_SUPER_MAGIC)
+		return false;
+
+	return true;
+}
+
+/*
+ * Before the process starts, identify a possible container by checking if the
+ * task is on a pid namespace and the target file is using an overlayfs mounting
+ * point. This check is valid for COS and GKE but not all existing containers.
+ */
+static bool is_possible_container(struct task_struct *task,
+				  struct file *file)
+{
+	if (task_active_pid_ns(task) == &init_pid_ns)
+		return false;
+
+	return is_overlayfs_mounted(file);
+}
+
+/*
+ * Generates a random identifier for this boot instance.
+ * This identifier is generated only when needed to increase the entropy
+ * available compared to doing it at early boot.
+ */
+static u32 get_machine_id(void)
+{
+	int machineid, old;
+
+	machineid = atomic_read(&machine_rand);
+
+	if (unlikely(machineid == 0)) {
+		machineid = (int)get_random_int();
+		if (machineid == 0)
+			machineid = 1;
+		old = atomic_cmpxchg(&machine_rand, 0, machineid);
+
+		/* If someone beat us, use their value. */
+		if (old != 0)
+			machineid = old;
+	}
+
+	return (u32)machineid;
+}
+
+/*
+ * Generate a 128-bit unique identifier for the process by appending:
+ *  - A machine identifier unique per boot.
+ *  - The start time of the process in nanoseconds.
+ *  - The tgid for the set of threads in a process.
+ */
+static int get_process_uuid(struct task_struct *task, char *buffer, size_t size)
+{
+	union process_uuid *id = (union process_uuid *)buffer;
+
+	memset(buffer, 0, size);
+
+	if (WARN_ON(size < PROCESS_UUID_SIZE))
+		return -EINVAL;
+
+	id->machineid = get_machine_id();
+	id->start_time = ktime_mono_to_real(task->group_leader->start_time);
+	id->tgid = task_tgid_nr(task);
+
+	return 0;
+}
+
+int get_process_uuid_by_pid(pid_t pid_nr, char *buffer, size_t size)
+{
+	int err;
+	struct task_struct *task = NULL;
+
+	rcu_read_lock();
+	task = find_task_by_pid_ns(pid_nr, &init_pid_ns);
+	if (!task) {
+		err = -ENOENT;
+		goto out;
+	}
+	err = get_process_uuid(task, buffer, size);
+out:
+	rcu_read_unlock();
+	return err;
+}
+
+static int get_process_uuid_from_xattr(struct file *file, char *buffer,
+				       size_t size)
+{
+	struct dentry *dentry;
+	int err;
+	struct file_provenance prov;
+	union process_uuid *id = (union process_uuid *)buffer;
+
+	memset(buffer, 0, size);
+
+	if (WARN_ON(size < PROCESS_UUID_SIZE))
+		return -EINVAL;
+
+	/* The file is part of overlayfs on the upper layer. */
+	if (!is_overlayfs_mounted(file))
+		return -ENODATA;
+
+	dentry = ovl_dentry_upper(file->f_path.dentry);
+	if (!dentry)
+		return -ENODATA;
+
+	err = __vfs_getxattr(dentry, dentry->d_inode,
+			     XATTR_SECURITY_CSM, &prov, sizeof(prov));
+	/* returns -ENODATA if the xattr does not exist. */
+	if (err < 0)
+		return err;
+	if (err != sizeof(prov)) {
+		pr_err("unexpected size for xattr: %zu -> %d\n",
+		       size, err);
+		return -ENODATA;
+	}
+
+	id->machineid = get_machine_id();
+	id->start_time = prov.start_time;
+	id->tgid = prov.tgid;
+	return 0;
+}
+
+u64 csm_set_contid(struct task_struct *task)
+{
+	u64 cid;
+	struct pid_namespace *ns;
+
+	ns = task_active_pid_ns(task);
+	if (WARN_ON(!task->audit) || WARN_ON(!ns))
+		return AUDIT_CID_UNSET;
+
+	cid = atomic_inc_return(&contid);
+	task->audit->contid = cid;
+
+	/*
+	 * If the namespace container-id is not set, use the one assigned
+	 * to the first process created.
+	 */
+	cmpxchg(&ns->cid, 0, cid);
+	return cid;
+}
+
+u64 csm_get_ns_contid(struct pid_namespace *ns)
+{
+	if (!ns || !ns->cid)
+		return AUDIT_CID_UNSET;
+
+	return ns->cid;
+}
+
+union ip_data {
+	struct in_addr ip4;
+	struct in6_addr ip6;
+};
+
+struct file_data {
+	void *allocated;
+	union ip_data local;
+	union ip_data remote;
+	char modified_uuid[PROCESS_UUID_SIZE];
+};
+
+static void free_file_data(struct file_data *fdata)
+{
+	free_page((unsigned long)fdata->allocated);
+	fdata->allocated = NULL;
+}
+
+static void fill_socket_description(struct sockaddr_storage *saddr,
+				   union ip_data *idata,
+				   schema_SocketIp *schema_socketip)
+{
+	struct sockaddr_in *sin4 = (struct sockaddr_in *)saddr;
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)saddr;
+
+	schema_socketip->family = saddr->ss_family;
+
+	switch (saddr->ss_family) {
+	case AF_INET:
+		schema_socketip->port = ntohs(sin4->sin_port);
+		idata->ip4 = sin4->sin_addr;
+		schema_socketip->ip.funcs.encode = pb_encode_ip4;
+		schema_socketip->ip.arg = &idata->ip4;
+		break;
+	case AF_INET6:
+		schema_socketip->port = ntohs(sin6->sin6_port);
+		idata->ip6 = sin6->sin6_addr;
+		schema_socketip->ip.funcs.encode = pb_encode_ip6;
+		schema_socketip->ip.arg = &idata->ip6;
+		break;
+	}
+}
+
+static int fill_file_overlayfs(struct file *file, schema_File *schema_file,
+			       struct file_data *fdata)
+{
+	struct dentry *dentry;
+	int err;
+	schema_Overlay *overlayfs;
+
+	/* If not an overlayfs superblock, done. */
+	if (!is_overlayfs_mounted(file))
+		return 0;
+
+	dentry = file->f_path.dentry;
+	schema_file->which_filesystem = schema_File_overlayfs_tag;
+	overlayfs = &schema_file->filesystem.overlayfs;
+	overlayfs->lower_layer = ovl_dentry_lower(dentry);
+	overlayfs->upper_layer = ovl_dentry_upper(dentry);
+
+	err = get_process_uuid_from_xattr(file, fdata->modified_uuid,
+					  sizeof(fdata->modified_uuid));
+	/* If there is no xattr, just skip the modified_uuid field. */
+	if (err == -ENODATA)
+		return 0;
+	if (err < 0)
+		return err;
+
+	overlayfs->modified_uuid.funcs.encode = pb_encode_uuid_field;
+	overlayfs->modified_uuid.arg = fdata->modified_uuid;
+	return 0;
+}
+
+static int fill_file_description(struct file *file, schema_File *schema_file,
+				 struct file_data *fdata)
+{
+	char *buf;
+	int err;
+	u32 mode;
+	char *path;
+	struct socket *socket;
+	schema_Socket *socketfs;
+	struct sockaddr_storage saddr;
+
+	memset(fdata, 0, sizeof(*fdata));
+
+	if (file == NULL)
+		return 0;
+
+	schema_file->ino = file_inode(file)->i_ino;
+	mode = file_inode(file)->i_mode;
+
+	/* For pipes, no need to resolve the path. */
+	if (S_ISFIFO(mode))
+		return 0;
+
+	if (S_ISSOCK(mode)) {
+		socket = (struct socket *)file->private_data;
+		socketfs = &schema_file->filesystem.socket;
+
+		/* Local socket */
+		err = kernel_getsockname(socket, (struct sockaddr *)&saddr);
+		if (err >= 0) {
+			fill_socket_description(&saddr, &fdata->local,
+					       &socketfs->local);
+		}
+
+		/* Remote socket, might not be connected. */
+		err = kernel_getpeername(socket, (struct sockaddr *)&saddr);
+		if (err >= 0) {
+			fill_socket_description(&saddr, &fdata->remote,
+					       &socketfs->remote);
+		}
+
+		schema_file->which_filesystem = schema_File_socket_tag;
+		return 0;
+	}
+
+	/*
+	 * From this point, we care about all the other types of files as their
+	 * path provides interesting insight.
+	 */
+	buf = (char *)__get_free_page(GFP_KERNEL);
+	if (buf == NULL)
+		return -ENOMEM;
+
+	fdata->allocated = buf;
+
+	path = d_path(&file->f_path, buf, PAGE_SIZE);
+	if (IS_ERR(path)) {
+		free_file_data(fdata);
+		return PTR_ERR(path);
+	}
+
+	schema_file->fullpath.funcs.encode = pb_encode_string_field;
+	schema_file->fullpath.arg = path; /* buf is freed in free_file_data. */
+
+	err = fill_file_overlayfs(file, schema_file, fdata);
+	if (err) {
+		free_file_data(fdata);
+		return err;
+	}
+
+	return 0;
+}
+
+static int fill_stream_description(schema_Descriptor *desc, int fd,
+				   struct file_data *fdata)
+{
+	struct fd sfd;
+	struct file *file;
+	int err = 0;
+
+	sfd = fdget(fd);
+	file = sfd.file;
+
+	if (file == NULL) {
+		memset(fdata, 0, sizeof(*fdata));
+		goto end;
+	}
+
+	desc->mode = file_inode(file)->i_mode;
+	err = fill_file_description(file, &desc->file, fdata);
+
+end:
+	fdput(sfd);
+	return err;
+}
+
+static int populate_proc_uuid_common(schema_Process *proc, char *uuid,
+				     size_t uuid_size, char *parent_uuid,
+				     size_t parent_uuid_size,
+				     struct task_struct *task)
+{
+	int err;
+	struct task_struct *parent;
+	/* Generate unique identifier for the process and its parent */
+	err = get_process_uuid(task, uuid, uuid_size);
+	if (err)
+		return err;
+
+	proc->uuid.funcs.encode = pb_encode_uuid_field;
+	proc->uuid.arg = uuid;
+
+	rcu_read_lock();
+
+	if (!pid_alive(task))
+		goto out;
+	/*
+	 * I don't think this needs to be task_rcu_dereference because
+	 * real_parent is only supposed to be accessed using RCU.
+	 */
+	parent = rcu_dereference(task->real_parent);
+
+	if (parent) {
+		err = get_process_uuid(parent, parent_uuid, parent_uuid_size);
+		if (!err) {
+			proc->parent_uuid.funcs.encode = pb_encode_uuid_field;
+			proc->parent_uuid.arg = parent_uuid;
+		}
+	}
+
+out:
+	rcu_read_unlock();
+
+	return err;
+}
+
+/* Populate the fields that we always want to set in Process messages. */
+static int populate_proc_common(schema_Process *proc, char *uuid,
+				size_t uuid_size, char *parent_uuid,
+				size_t parent_uuid_size,
+				struct task_struct *task)
+{
+	u64 cid;
+	struct pid_namespace *ns = task_active_pid_ns(task);
+
+	/* Container identifier for the current namespace. */
+	proc->container_id = csm_get_ns_contid(ns);
+
+	/*
+	 * If the process container-id is different, the process tree is part of
+	 * a different session within the namespace (kubectl/docker exec,
+	 * liveness probe or others).
+	 */
+	cid = audit_get_contid(task);
+	if (proc->container_id != cid)
+		proc->exec_session_id = cid;
+
+	/* Add information about pid in different namespaces */
+	proc->pid = task_pid_nr(task);
+	proc->parent_pid = task_ppid_nr(task);
+	proc->container_pid = task_pid_nr_ns(task, ns);
+	proc->container_parent_pid = task_ppid_nr_ns(task, ns);
+
+	return populate_proc_uuid_common(proc, uuid, uuid_size, parent_uuid,
+					 parent_uuid_size, task);
+}
+
+int csm_bprm_check_security(struct linux_binprm *bprm)
+{
+	char uuid[PROCESS_UUID_SIZE];
+	char parent_uuid[PROCESS_UUID_SIZE];
+	int err;
+	schema_Event event = schema_Event_init_zero;
+	schema_Process *proc;
+	struct string_arr_ctx argv_ctx;
+	void *stack = NULL, *ctx = NULL;
+	u64 cid;
+	struct file_data path_data = {};
+	struct file_data stdin_data = {};
+	struct file_data stdout_data = {};
+	struct file_data stderr_data = {};
+
+	/*
+	 * Always create a container-id for containerized processes.
+	 * If the LSM is enabled later, we can track existing containers.
+	 */
+	cid = audit_get_contid(current);
+
+	if (cid == AUDIT_CID_UNSET) {
+		if (!is_possible_container(current, bprm->file))
+			return 0;
+
+		cid = csm_set_contid(current);
+
+		if (cid == AUDIT_CID_UNSET)
+			return 0;
+	}
+
+	if (!csm_execute_enabled)
+		return 0;
+
+	/* The interpreter will call us again with more context. */
+	if (bprm->buf[0] == '#' && bprm->buf[1] == '!')
+		return 0;
+
+	proc = &event.event.execute.proc;
+	err = populate_proc_common(proc, uuid, sizeof(uuid), parent_uuid,
+				   sizeof(parent_uuid), current);
+	if (err)
+		goto out_free_buf;
+
+	proc->creation_timestamp = ktime_get_real_ns();
+
+	/* Provide information about the launched binary. */
+	err = fill_file_description(bprm->file, &proc->binary, &path_data);
+	if (err)
+		goto out_free_buf;
+
+	/* Information about streams */
+	err = fill_stream_description(&proc->streams.stdin, STDIN_FILENO,
+				      &stdin_data);
+	if (err)
+		goto out_free_buf;
+
+	err = fill_stream_description(&proc->streams.stdout, STDOUT_FILENO,
+				      &stdout_data);
+	if (err)
+		goto out_free_buf;
+
+	err = fill_stream_description(&proc->streams.stderr, STDERR_FILENO,
+				      &stderr_data);
+	if (err)
+		goto out_free_buf;
+
+	stack = kmap_argument_stack(bprm, &ctx);
+	if (!stack) {
+		err = -EFAULT;
+		goto out_free_buf;
+	}
+
+	/* Capture process argument */
+	argv_ctx.bprm = bprm;
+	argv_ctx.stack = stack;
+	proc->args.argv.funcs.encode = encode_current_argv;
+	proc->args.argv.arg = &argv_ctx;
+
+	/* Capture process environment variables */
+	proc->args.envp.funcs.encode = encode_current_envp;
+	proc->args.envp.arg = &argv_ctx;
+
+	event.which_event = schema_Event_execute_tag;
+
+	/*
+	 * Configurations options are checked when computing the serialized
+	 * protobufs.
+	 */
+	down_read(&csm_rwsem_config);
+	err = csm_sendeventproto(schema_Event_fields, &event);
+	up_read(&csm_rwsem_config);
+
+	if (err)
+		pr_err("csm_sendeventproto returned %d on execve\n", err);
+	err = 0;
+
+out_free_buf:
+	kunmap_argument_stack(bprm, stack, ctx);
+	free_file_data(&path_data);
+	free_file_data(&stdin_data);
+	free_file_data(&stdout_data);
+	free_file_data(&stderr_data);
+
+	/*
+	 * On failure, enforce it only if the execute config is enabled.
+	 * If the collector was disabled, prefer to succeed to not impact the
+	 * system.
+	 */
+	if (unlikely(err < 0 && !csm_execute_enabled))
+		err = 0;
+
+	return err;
+}
+
+/* Create a clone event when a new task leader is created. */
+void csm_task_post_alloc(struct task_struct *task)
+{
+	int err;
+	char uuid[PROCESS_UUID_SIZE];
+	char parent_uuid[PROCESS_UUID_SIZE];
+	schema_Event event = schema_Event_init_zero;
+	schema_Process *proc;
+
+	if (!csm_execute_enabled ||
+	    audit_get_contid(task) == AUDIT_CID_UNSET ||
+	    !thread_group_leader(task))
+		return;
+
+	proc = &event.event.clone.proc;
+
+	err = populate_proc_uuid_common(proc, uuid, sizeof(uuid), parent_uuid,
+					sizeof(parent_uuid), task);
+
+	event.which_event = schema_Event_clone_tag;
+	err = csm_sendeventproto(schema_Event_fields, &event);
+	if (err)
+		pr_err("csm_sendeventproto returned %d on exit\n", err);
+}
+
+/*
+ * This LSM hook callback doesn't exist upstream and is called only when the
+ * last thread of a thread group exit.
+ */
+void csm_task_exit(struct task_struct *task)
+{
+	int err;
+	schema_Event event = schema_Event_init_zero;
+	schema_ExitEvent *exit;
+	char uuid[PROCESS_UUID_SIZE];
+
+	if (!csm_execute_enabled ||
+	    audit_get_contid(task) == AUDIT_CID_UNSET)
+		return;
+
+	exit = &event.event.exit;
+
+	/* Fetch the unique identifier for this process */
+	err = get_process_uuid(task, uuid, sizeof(uuid));
+	if (err) {
+		pr_err("failed to get process uuid on exit\n");
+		return;
+	}
+
+	exit->process_uuid.funcs.encode = pb_encode_uuid_field;
+	exit->process_uuid.arg = uuid;
+
+	event.which_event = schema_Event_exit_tag;
+
+	err = csm_sendeventproto(schema_Event_fields, &event);
+	if (err)
+		pr_err("csm_sendeventproto returned %d on exit\n", err);
+}
+
+int csm_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
+		unsigned long prot)
+{
+	char uuid[PROCESS_UUID_SIZE];
+	char parent_uuid[PROCESS_UUID_SIZE];
+	int err;
+	schema_Event event = schema_Event_init_zero;
+	schema_MemoryExecEvent *memexec;
+	u64 cid;
+	struct file_data path_data = {};
+
+	cid = audit_get_contid(current);
+
+	if (!csm_memexec_enabled ||
+	    !(prot & PROT_EXEC) ||
+	    vma->vm_file == NULL ||
+	    cid == AUDIT_CID_UNSET)
+		return 0;
+
+	memexec = &event.event.memexec;
+
+	err = fill_file_description(vma->vm_file, &memexec->mapped_file,
+				    &path_data);
+	if (err)
+		return err;
+
+	err = populate_proc_common(&memexec->proc, uuid, sizeof(uuid),
+				   parent_uuid, sizeof(parent_uuid), current);
+	if (err)
+		goto out;
+
+	memexec->prot_exec_timestamp = ktime_get_real_ns();
+	memexec->new_flags = prot;
+	memexec->req_flags = reqprot;
+	memexec->old_vm_flags = vma->vm_flags;
+
+	memexec->action = schema_MemoryExecEvent_Action_MPROTECT;
+	memexec->start_addr = vma->vm_start;
+	memexec->end_addr = vma->vm_end;
+
+	event.which_event = schema_Event_memexec_tag;
+
+	err = csm_sendeventproto(schema_Event_fields, &event);
+	if (err)
+		pr_err("csm_sendeventproto returned %d on mprotect\n", err);
+	err = 0;
+
+	if (unlikely(err < 0 && !csm_memexec_enabled))
+		err = 0;
+
+out:
+	free_file_data(&path_data);
+	return err;
+}
+
+int csm_mmap_file(struct file *file, unsigned long reqprot,
+		unsigned long prot, unsigned long flags)
+{
+	char uuid[PROCESS_UUID_SIZE];
+	char parent_uuid[PROCESS_UUID_SIZE];
+	int err;
+	schema_Event event = schema_Event_init_zero;
+	schema_MemoryExecEvent *memexec;
+	struct file *exe_file;
+	u64 cid;
+	struct file_data path_data = {};
+
+	cid = audit_get_contid(current);
+
+	if (!csm_memexec_enabled ||
+	    !(prot & PROT_EXEC) ||
+	    file == NULL ||
+	    cid == AUDIT_CID_UNSET)
+		return 0;
+
+	memexec = &event.event.memexec;
+	err = fill_file_description(file, &memexec->mapped_file,
+				    &path_data);
+	if (err)
+		return err;
+
+	err = populate_proc_common(&memexec->proc, uuid, sizeof(uuid),
+				   parent_uuid, sizeof(parent_uuid), current);
+	if (err)
+		goto out;
+
+	/* get_mm_exe_file does its own locking on mm_sem. */
+	exe_file = get_mm_exe_file(current->mm);
+	if (exe_file) {
+		if (path_equal(&file->f_path, &exe_file->f_path))
+			memexec->is_initial_mmap = 1;
+		fput(exe_file);
+	}
+
+	memexec->prot_exec_timestamp = ktime_get_real_ns();
+	memexec->new_flags = prot;
+	memexec->req_flags = reqprot;
+	memexec->mmap_flags = flags;
+	memexec->action = schema_MemoryExecEvent_Action_MMAP_FILE;
+	event.which_event = schema_Event_memexec_tag;
+
+	err = csm_sendeventproto(schema_Event_fields, &event);
+	if (err)
+		pr_err("csm_sendeventproto returned %d on mmap_file\n", err);
+	err = 0;
+
+	if (unlikely(err < 0 && !csm_memexec_enabled))
+		err = 0;
+
+out:
+	free_file_data(&path_data);
+	return err;
+}
+
+void csm_file_pre_free(struct file *file)
+{
+	struct dentry *dentry;
+	int err;
+	struct file_provenance prov;
+
+	/* The file was opened to be modified and the LSM is enabled */
+	if (!(file->f_mode & FMODE_WRITE) ||
+	    !csm_enabled)
+		return;
+
+	/* The current process is containerized. */
+	if (audit_get_contid(current) == AUDIT_CID_UNSET)
+		return;
+
+	/* The file is part of overlayfs on the upper layer. */
+	if (!is_overlayfs_mounted(file))
+		return;
+
+	dentry = ovl_dentry_upper(file->f_path.dentry);
+	if (!dentry)
+		return;
+
+	err = __vfs_getxattr(dentry, dentry->d_inode, XATTR_SECURITY_CSM,
+			     NULL, 0);
+	if (err != -ENODATA) {
+		if (err < 0)
+			pr_err("failed to get security attribute: %d\n", err);
+		return;
+	}
+
+	prov.tgid = task_tgid_nr(current);
+	prov.start_time = ktime_mono_to_real(current->group_leader->start_time);
+
+	err = __vfs_setxattr(dentry, dentry->d_inode, XATTR_SECURITY_CSM, &prov,
+			     sizeof(prov), 0);
+	if (err < 0)
+		pr_err("failed to set security attribute: %d\n", err);
+}
+
+/*
+ * Based off of fs/proc/base.c:next_tgid
+ *
+ * next_thread_group_leader returns the task_struct of the next task with a pid
+ * greater than or equal to tgid. The reference count is increased so that
+ * rcu_read_unlock may be called, and preemption reenabled.
+ */
+static struct task_struct *next_thread_group_leader(pid_t *tgid)
+{
+	struct pid *pid;
+	struct task_struct *task;
+
+	cond_resched();
+	rcu_read_lock();
+retry:
+	task = NULL;
+	pid = find_ge_pid(*tgid, &init_pid_ns);
+	if (pid) {
+		*tgid = pid_nr_ns(pid, &init_pid_ns);
+		task = pid_task(pid, PIDTYPE_PID);
+		if (!task || !has_group_leader_pid(task) ||
+		    audit_get_contid(task) == AUDIT_CID_UNSET) {
+			(*tgid) += 1;
+			goto retry;
+		}
+
+		/*
+		 * Increment the reference count on the task before leaving
+		 * the RCU grace period.
+		 */
+		get_task_struct(task);
+		(*tgid) += 1;
+	}
+
+	rcu_read_unlock();
+	return task;
+}
+
+void delayed_enumerate_processes(struct work_struct *work)
+{
+	pid_t tgid = 0;
+	struct task_struct *task;
+	struct csm_enumerate_processes_work_data *wd = container_of(
+		work, struct csm_enumerate_processes_work_data, work);
+	int wd_enumeration_count = wd->enumeration_count;
+
+	kfree(wd);
+	wd = NULL;
+	work = NULL;
+
+	/*
+	 * Try for only a single enumeration routine at a time, as long as the
+	 * execute collector is enabled.
+	 */
+	while ((wd_enumeration_count == atomic_read(&enumeration_count)) &&
+	       READ_ONCE(csm_execute_enabled) &&
+	       (task = next_thread_group_leader(&tgid))) {
+		int err;
+		char uuid[PROCESS_UUID_SIZE];
+		char parent_uuid[PROCESS_UUID_SIZE];
+		struct file *exe_file = NULL;
+		struct file_data path_data = {};
+		schema_Event event = schema_Event_init_zero;
+		schema_Process *proc = &event.event.enumproc.proc;
+
+		exe_file = get_task_exe_file(task);
+		if (!exe_file) {
+			pr_err("failed to get enumerated process executable, pid: %u\n",
+			       task_pid_nr(task));
+			goto next;
+		}
+
+		err = fill_file_description(exe_file, &proc->binary,
+					    &path_data);
+		if (err) {
+			pr_err("failed to fill enumerated process %u executable description: %d\n",
+			       task_pid_nr(task), err);
+			goto next;
+		}
+
+		err = populate_proc_common(proc, uuid, sizeof(uuid),
+					   parent_uuid, sizeof(parent_uuid),
+					   task);
+		if (err) {
+			pr_err("failed to set pid %u common fields: %d\n",
+			       task_pid_nr(task), err);
+			goto next;
+		}
+
+		if (task->flags & PF_EXITING)
+			goto next;
+
+		event.which_event = schema_Event_enumproc_tag;
+		err = csm_sendeventproto(schema_Event_fields,
+					 &event);
+		if (err) {
+			pr_err("failed to send pid %u enumerated process: %d\n",
+			       task_pid_nr(task), err);
+			goto next;
+		}
+next:
+		free_file_data(&path_data);
+		if (exe_file)
+			fput(exe_file);
+
+		put_task_struct(task);
+	}
+}
+
+void csm_enumerate_processes(unsigned long const config_version)
+{
+	struct csm_enumerate_processes_work_data *wd;
+
+	wd = kmalloc(sizeof(*wd), GFP_KERNEL);
+	if (!wd)
+		return;
+
+	INIT_WORK(&wd->work, delayed_enumerate_processes);
+	wd->enumeration_count = atomic_add_return(1, &enumeration_count);
+	schedule_work(&wd->work);
+}

diff --git a/security/container/process.h b/security/container/process.h
new file mode 100644
index 0000000..1c98134
--- /dev/null
+++ b/security/container/process.h

@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Container Security Monitor module
+ *
+ * Copyright (c) 2019 Google, Inc
+ */
+
+void csm_enumerate_processes(void);

diff --git a/security/container/protos/Makefile b/security/container/protos/Makefile
new file mode 100644
index 0000000..a88068b
--- /dev/null
+++ b/security/container/protos/Makefile

@@ -0,0 +1,10 @@
+subdir-$(CONFIG_SECURITY_CONTAINER_MONITOR) += nanopb
+
+obj-$(CONFIG_SECURITY_CONTAINER_MONITOR) += nanopb/
+obj-$(CONFIG_SECURITY_CONTAINER_MONITOR) += protos.o
+
+protos-y := config.pb.o event.pb.o
+
+ccflags-y := -I$(srctree)/security/container/protos \
+	-I$(srctree)/security/container/protos/nanopb \
+	$(PB_CCFLAGS)

diff --git a/security/container/protos/README b/security/container/protos/README
new file mode 100644
index 0000000..1b0628a
--- /dev/null
+++ b/security/container/protos/README

@@ -0,0 +1,18 @@
+This document provides guidance on how to change the protos used in this directory.
+
+Any change made to a proto file require to reformat it and regenerate nanopb
+sources. It also requires the proto files to be compatible to previously released versions.
+
+To reformat any proto file run: "clang-format -style=Google -i <file.proto>"
+
+To regenerate nanopb files:
+ - Install protoc
+   - apt-get install protobuf-compiler
+ - Clone/setup nanopb for version 0.3.9.1 (or clone the internal depot)
+   - git clone --depth=1 https://github.com/nanopb/nanopb.git
+   - cd nanopb
+   - git fetch --tags
+   - git checkout tags/0.3.9.1
+   - make -C generator/proto
+ - Run protoc with the nanopb definition
+   - protoc --plugin=<path_to_nanopb>/generator/protoc-gen-nanopb --nanopb_out=<path_to_linux>/security/container/protos/ <path_to_linux>/security/container/protos/<file.proto> --proto_path=<path_to_linux>/security/container/protos

diff --git a/security/container/protos/config.pb.c b/security/container/protos/config.pb.c
new file mode 100644
index 0000000..211bab0
--- /dev/null
+++ b/security/container/protos/config.pb.c

@@ -0,0 +1,72 @@
+/* Automatically generated nanopb constant definitions */
+/* Generated by nanopb-0.3.9.3 at Wed Jun  5 11:00:24 2019. */
+
+#include "config.pb.h"
+
+/* @@protoc_insertion_point(includes) */
+#if PB_PROTO_HEADER_VERSION != 30
+#error Regenerate this file with the current version of nanopb generator.
+#endif
+
+
+
+const pb_field_t schema_ContainerCollectorConfig_fields[2] = {
+    PB_FIELD(  1, BOOL    , SINGULAR, STATIC  , FIRST, schema_ContainerCollectorConfig, enabled, enabled, 0),
+    PB_LAST_FIELD
+};
+
+const pb_field_t schema_ExecuteCollectorConfig_fields[5] = {
+    PB_FIELD(  1, BOOL    , SINGULAR, STATIC  , FIRST, schema_ExecuteCollectorConfig, enabled, enabled, 0),
+    PB_FIELD(  2, UINT32  , SINGULAR, STATIC  , OTHER, schema_ExecuteCollectorConfig, argv_limit, enabled, 0),
+    PB_FIELD(  3, UINT32  , SINGULAR, STATIC  , OTHER, schema_ExecuteCollectorConfig, envp_limit, argv_limit, 0),
+    PB_FIELD(  4, STRING  , REPEATED, CALLBACK, OTHER, schema_ExecuteCollectorConfig, envp_allowlist, envp_limit, 0),
+    PB_LAST_FIELD
+};
+
+const pb_field_t schema_MemExecCollectorConfig_fields[2] = {
+    PB_FIELD(  1, BOOL    , SINGULAR, STATIC  , FIRST, schema_MemExecCollectorConfig, enabled, enabled, 0),
+    PB_LAST_FIELD
+};
+
+const pb_field_t schema_ConfigurationRequest_fields[4] = {
+    PB_FIELD(  1, MESSAGE , SINGULAR, STATIC  , FIRST, schema_ConfigurationRequest, container_config, container_config, &schema_ContainerCollectorConfig_fields),
+    PB_FIELD(  2, MESSAGE , SINGULAR, STATIC  , OTHER, schema_ConfigurationRequest, execute_config, container_config, &schema_ExecuteCollectorConfig_fields),
+    PB_FIELD(  3, MESSAGE , SINGULAR, STATIC  , OTHER, schema_ConfigurationRequest, memexec_config, execute_config, &schema_MemExecCollectorConfig_fields),
+    PB_LAST_FIELD
+};
+
+const pb_field_t schema_ConfigurationResponse_fields[5] = {
+    PB_FIELD(  1, UENUM   , SINGULAR, STATIC  , FIRST, schema_ConfigurationResponse, error, error, 0),
+    PB_FIELD(  2, STRING  , SINGULAR, CALLBACK, OTHER, schema_ConfigurationResponse, msg, error, 0),
+    PB_FIELD(  3, UINT64  , SINGULAR, STATIC  , OTHER, schema_ConfigurationResponse, version, msg, 0),
+    PB_FIELD(  4, UINT32  , SINGULAR, STATIC  , OTHER, schema_ConfigurationResponse, kernel_version, version, 0),
+    PB_LAST_FIELD
+};
+
+
+
+/* Check that field information fits in pb_field_t */
+#if !defined(PB_FIELD_32BIT)
+/* If you get an error here, it means that you need to define PB_FIELD_32BIT
+ * compile-time option. You can do that in pb.h or on compiler command line.
+ * 
+ * The reason you need to do this is that some of your messages contain tag
+ * numbers or field sizes that are larger than what can fit in 8 or 16 bit
+ * field descriptors.
+ */
+PB_STATIC_ASSERT((pb_membersize(schema_ConfigurationRequest, container_config) < 65536 && pb_membersize(schema_ConfigurationRequest, execute_config) < 65536 && pb_membersize(schema_ConfigurationRequest, memexec_config) < 65536), YOU_MUST_DEFINE_PB_FIELD_32BIT_FOR_MESSAGES_schema_ContainerCollectorConfig_schema_ExecuteCollectorConfig_schema_MemExecCollectorConfig_schema_ConfigurationRequest_schema_ConfigurationResponse)
+#endif
+
+#if !defined(PB_FIELD_16BIT) && !defined(PB_FIELD_32BIT)
+/* If you get an error here, it means that you need to define PB_FIELD_16BIT
+ * compile-time option. You can do that in pb.h or on compiler command line.
+ * 
+ * The reason you need to do this is that some of your messages contain tag
+ * numbers or field sizes that are larger than what can fit in the default
+ * 8 bit descriptors.
+ */
+PB_STATIC_ASSERT((pb_membersize(schema_ConfigurationRequest, container_config) < 256 && pb_membersize(schema_ConfigurationRequest, execute_config) < 256 && pb_membersize(schema_ConfigurationRequest, memexec_config) < 256), YOU_MUST_DEFINE_PB_FIELD_16BIT_FOR_MESSAGES_schema_ContainerCollectorConfig_schema_ExecuteCollectorConfig_schema_MemExecCollectorConfig_schema_ConfigurationRequest_schema_ConfigurationResponse)
+#endif
+
+
+/* @@protoc_insertion_point(eof) */

diff --git a/security/container/protos/config.pb.h b/security/container/protos/config.pb.h
new file mode 100644
index 0000000..2685d2b
--- /dev/null
+++ b/security/container/protos/config.pb.h

@@ -0,0 +1,116 @@
+/* Automatically generated nanopb header */
+/* Generated by nanopb-0.3.9.3 at Wed Jun  5 11:00:24 2019. */
+
+#ifndef PB_SCHEMA_CONFIG_PB_H_INCLUDED
+#define PB_SCHEMA_CONFIG_PB_H_INCLUDED
+#include <pb.h>
+
+/* @@protoc_insertion_point(includes) */
+#if PB_PROTO_HEADER_VERSION != 30
+#error Regenerate this file with the current version of nanopb generator.
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Enum definitions */
+typedef enum _schema_ConfigurationResponse_ErrorCode {
+    schema_ConfigurationResponse_ErrorCode_NO_ERROR = 0,
+    schema_ConfigurationResponse_ErrorCode_UNKNOWN = 2
+} schema_ConfigurationResponse_ErrorCode;
+#define _schema_ConfigurationResponse_ErrorCode_MIN schema_ConfigurationResponse_ErrorCode_NO_ERROR
+#define _schema_ConfigurationResponse_ErrorCode_MAX schema_ConfigurationResponse_ErrorCode_UNKNOWN
+#define _schema_ConfigurationResponse_ErrorCode_ARRAYSIZE ((schema_ConfigurationResponse_ErrorCode)(schema_ConfigurationResponse_ErrorCode_UNKNOWN+1))
+
+/* Struct definitions */
+typedef struct _schema_ConfigurationResponse {
+    schema_ConfigurationResponse_ErrorCode error;
+    pb_callback_t msg;
+    uint64_t version;
+    uint32_t kernel_version;
+/* @@protoc_insertion_point(struct:schema_ConfigurationResponse) */
+} schema_ConfigurationResponse;
+
+typedef struct _schema_ContainerCollectorConfig {
+    bool enabled;
+/* @@protoc_insertion_point(struct:schema_ContainerCollectorConfig) */
+} schema_ContainerCollectorConfig;
+
+typedef struct _schema_ExecuteCollectorConfig {
+    bool enabled;
+    uint32_t argv_limit;
+    uint32_t envp_limit;
+    pb_callback_t envp_allowlist;
+/* @@protoc_insertion_point(struct:schema_ExecuteCollectorConfig) */
+} schema_ExecuteCollectorConfig;
+
+typedef struct _schema_MemExecCollectorConfig {
+    bool enabled;
+/* @@protoc_insertion_point(struct:schema_MemExecCollectorConfig) */
+} schema_MemExecCollectorConfig;
+
+typedef struct _schema_ConfigurationRequest {
+    schema_ContainerCollectorConfig container_config;
+    schema_ExecuteCollectorConfig execute_config;
+    schema_MemExecCollectorConfig memexec_config;
+/* @@protoc_insertion_point(struct:schema_ConfigurationRequest) */
+} schema_ConfigurationRequest;
+
+/* Default values for struct fields */
+
+/* Initializer values for message structs */
+#define schema_ContainerCollectorConfig_init_default {0}
+#define schema_ExecuteCollectorConfig_init_default {0, 0, 0, {{NULL}, NULL}}
+#define schema_MemExecCollectorConfig_init_default {0}
+#define schema_ConfigurationRequest_init_default {schema_ContainerCollectorConfig_init_default, schema_ExecuteCollectorConfig_init_default, schema_MemExecCollectorConfig_init_default}
+#define schema_ConfigurationResponse_init_default {_schema_ConfigurationResponse_ErrorCode_MIN, {{NULL}, NULL}, 0, 0}
+#define schema_ContainerCollectorConfig_init_zero {0}
+#define schema_ExecuteCollectorConfig_init_zero  {0, 0, 0, {{NULL}, NULL}}
+#define schema_MemExecCollectorConfig_init_zero  {0}
+#define schema_ConfigurationRequest_init_zero    {schema_ContainerCollectorConfig_init_zero, schema_ExecuteCollectorConfig_init_zero, schema_MemExecCollectorConfig_init_zero}
+#define schema_ConfigurationResponse_init_zero   {_schema_ConfigurationResponse_ErrorCode_MIN, {{NULL}, NULL}, 0, 0}
+
+/* Field tags (for use in manual encoding/decoding) */
+#define schema_ConfigurationResponse_error_tag   1
+#define schema_ConfigurationResponse_msg_tag     2
+#define schema_ConfigurationResponse_version_tag 3
+#define schema_ConfigurationResponse_kernel_version_tag 4
+#define schema_ContainerCollectorConfig_enabled_tag 1
+#define schema_ExecuteCollectorConfig_enabled_tag 1
+#define schema_ExecuteCollectorConfig_argv_limit_tag 2
+#define schema_ExecuteCollectorConfig_envp_limit_tag 3
+#define schema_ExecuteCollectorConfig_envp_allowlist_tag 4
+#define schema_MemExecCollectorConfig_enabled_tag 1
+#define schema_ConfigurationRequest_container_config_tag 1
+#define schema_ConfigurationRequest_execute_config_tag 2
+#define schema_ConfigurationRequest_memexec_config_tag 3
+
+/* Struct field encoding specification for nanopb */
+extern const pb_field_t schema_ContainerCollectorConfig_fields[2];
+extern const pb_field_t schema_ExecuteCollectorConfig_fields[5];
+extern const pb_field_t schema_MemExecCollectorConfig_fields[2];
+extern const pb_field_t schema_ConfigurationRequest_fields[4];
+extern const pb_field_t schema_ConfigurationResponse_fields[5];
+
+/* Maximum encoded size of messages (where known) */
+#define schema_ContainerCollectorConfig_size     2
+/* schema_ExecuteCollectorConfig_size depends on runtime parameters */
+#define schema_MemExecCollectorConfig_size       2
+/* schema_ConfigurationRequest_size depends on runtime parameters */
+/* schema_ConfigurationResponse_size depends on runtime parameters */
+
+/* Message IDs (where set with "msgid" option) */
+#ifdef PB_MSGID
+
+#define CONFIG_MESSAGES \
+
+
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+/* @@protoc_insertion_point(eof) */
+
+#endif

diff --git a/security/container/protos/config.proto b/security/container/protos/config.proto
new file mode 100644
index 0000000..e32a517
--- /dev/null
+++ b/security/container/protos/config.proto

@@ -0,0 +1,51 @@
+syntax = "proto3";
+
+package schema;
+
+// Collect information about running containers
+message ContainerCollectorConfig {
+  bool enabled = 1;
+}
+
+message ExecuteCollectorConfig {
+  bool enabled = 1;
+
+  // truncate argv/envp if cumulative length exceeds limit
+  uint32 argv_limit = 2;
+  uint32 envp_limit = 3;
+
+  // If specified, only report the named environment variables.  An
+  // empty envp_allowlist indicates that all environment variables
+  // should be reported up to a cumulative total of envp_limit bytes.
+  repeated string envp_allowlist = 4;
+}
+
+// Collect information about executable memory mappings.
+message MemExecCollectorConfig {
+  bool enabled = 1;
+}
+
+// Convey configuration information to Guest LSM
+message ConfigurationRequest {
+  ContainerCollectorConfig container_config = 1;
+  ExecuteCollectorConfig execute_config = 2;
+  MemExecCollectorConfig memexec_config = 3;
+
+  // Additional configuration messages will be added as new collectors
+  // are implemented
+}
+
+// Report success or failure of previous ConfigurationRequest
+message ConfigurationResponse {
+  enum ErrorCode {
+    // Keep values in sync with
+    // https://github.com/googleapis/googleapis/blob/master/google/rpc/code.proto
+    NO_ERROR = 0;
+    UNKNOWN = 2;
+  }
+
+  ErrorCode error = 1;
+  string msg = 2;
+  uint64 version = 3;         // Version of the LSM
+  uint32 kernel_version = 4;  // LINUX_VERSION_CODE
+}

diff --git a/security/container/protos/event.pb.c b/security/container/protos/event.pb.c
new file mode 100644
index 0000000..1018ce2
--- /dev/null
+++ b/security/container/protos/event.pb.c

@@ -0,0 +1,174 @@
+/* Automatically generated nanopb constant definitions */
+/* Generated by nanopb-0.3.9.1 at Mon Nov 11 16:11:19 2019. */
+
+#include "event.pb.h"
+
+/* @@protoc_insertion_point(includes) */
+#if PB_PROTO_HEADER_VERSION != 30
+#error Regenerate this file with the current version of nanopb generator.
+#endif
+
+
+
+const pb_field_t schema_SocketIp_fields[4] = {
+    PB_FIELD(  1, UINT32  , SINGULAR, STATIC  , FIRST, schema_SocketIp, family, family, 0),
+    PB_FIELD(  2, BYTES   , SINGULAR, CALLBACK, OTHER, schema_SocketIp, ip, family, 0),
+    PB_FIELD(  3, UINT32  , SINGULAR, STATIC  , OTHER, schema_SocketIp, port, ip, 0),
+    PB_LAST_FIELD
+};
+
+const pb_field_t schema_Socket_fields[3] = {
+    PB_FIELD(  1, MESSAGE , SINGULAR, STATIC  , FIRST, schema_Socket, local, local, &schema_SocketIp_fields),
+    PB_FIELD(  2, MESSAGE , SINGULAR, STATIC  , OTHER, schema_Socket, remote, local, &schema_SocketIp_fields),
+    PB_LAST_FIELD
+};
+
+const pb_field_t schema_Overlay_fields[4] = {
+    PB_FIELD(  1, BOOL    , SINGULAR, STATIC  , FIRST, schema_Overlay, lower_layer, lower_layer, 0),
+    PB_FIELD(  2, BOOL    , SINGULAR, STATIC  , OTHER, schema_Overlay, upper_layer, lower_layer, 0),
+    PB_FIELD(  3, BYTES   , SINGULAR, CALLBACK, OTHER, schema_Overlay, modified_uuid, upper_layer, 0),
+    PB_LAST_FIELD
+};
+
+const pb_field_t schema_File_fields[5] = {
+    PB_FIELD(  1, BYTES   , SINGULAR, CALLBACK, FIRST, schema_File, fullpath, fullpath, 0),
+    PB_ONEOF_FIELD(filesystem,   2, MESSAGE , ONEOF, STATIC  , OTHER, schema_File, overlayfs, fullpath, &schema_Overlay_fields),
+    PB_ONEOF_FIELD(filesystem,   4, MESSAGE , ONEOF, STATIC  , UNION, schema_File, socket, fullpath, &schema_Socket_fields),
+    PB_FIELD(  3, UINT32  , SINGULAR, STATIC  , OTHER, schema_File, ino, filesystem.socket, 0),
+    PB_LAST_FIELD
+};
+
+const pb_field_t schema_ProcessArguments_fields[5] = {
+    PB_FIELD(  1, BYTES   , REPEATED, CALLBACK, FIRST, schema_ProcessArguments, argv, argv, 0),
+    PB_FIELD(  2, UINT32  , SINGULAR, STATIC  , OTHER, schema_ProcessArguments, argv_truncated, argv, 0),
+    PB_FIELD(  3, BYTES   , REPEATED, CALLBACK, OTHER, schema_ProcessArguments, envp, argv_truncated, 0),
+    PB_FIELD(  4, UINT32  , SINGULAR, STATIC  , OTHER, schema_ProcessArguments, envp_truncated, envp, 0),
+    PB_LAST_FIELD
+};
+
+const pb_field_t schema_Descriptor_fields[3] = {
+    PB_FIELD(  1, UINT32  , SINGULAR, STATIC  , FIRST, schema_Descriptor, mode, mode, 0),
+    PB_FIELD(  2, MESSAGE , SINGULAR, STATIC  , OTHER, schema_Descriptor, file, mode, &schema_File_fields),
+    PB_LAST_FIELD
+};
+
+const pb_field_t schema_Streams_fields[4] = {
+    PB_FIELD(  1, MESSAGE , SINGULAR, STATIC  , FIRST, schema_Streams, stdin, stdin, &schema_Descriptor_fields),
+    PB_FIELD(  2, MESSAGE , SINGULAR, STATIC  , OTHER, schema_Streams, stdout, stdin, &schema_Descriptor_fields),
+    PB_FIELD(  3, MESSAGE , SINGULAR, STATIC  , OTHER, schema_Streams, stderr, stdout, &schema_Descriptor_fields),
+    PB_LAST_FIELD
+};
+
+const pb_field_t schema_Process_fields[13] = {
+    PB_FIELD(  1, UINT64  , SINGULAR, STATIC  , FIRST, schema_Process, creation_timestamp, creation_timestamp, 0),
+    PB_FIELD(  2, BYTES   , SINGULAR, CALLBACK, OTHER, schema_Process, uuid, creation_timestamp, 0),
+    PB_FIELD(  3, UINT32  , SINGULAR, STATIC  , OTHER, schema_Process, pid, uuid, 0),
+    PB_FIELD(  4, MESSAGE , SINGULAR, STATIC  , OTHER, schema_Process, binary, pid, &schema_File_fields),
+    PB_FIELD(  5, UINT32  , SINGULAR, STATIC  , OTHER, schema_Process, parent_pid, binary, 0),
+    PB_FIELD(  6, BYTES   , SINGULAR, CALLBACK, OTHER, schema_Process, parent_uuid, parent_pid, 0),
+    PB_FIELD(  7, UINT64  , SINGULAR, STATIC  , OTHER, schema_Process, container_id, parent_uuid, 0),
+    PB_FIELD(  8, UINT32  , SINGULAR, STATIC  , OTHER, schema_Process, container_pid, container_id, 0),
+    PB_FIELD(  9, UINT32  , SINGULAR, STATIC  , OTHER, schema_Process, container_parent_pid, container_pid, 0),
+    PB_FIELD( 10, MESSAGE , SINGULAR, STATIC  , OTHER, schema_Process, args, container_parent_pid, &schema_ProcessArguments_fields),
+    PB_FIELD( 11, MESSAGE , SINGULAR, STATIC  , OTHER, schema_Process, streams, args, &schema_Streams_fields),
+    PB_FIELD( 12, UINT64  , SINGULAR, STATIC  , OTHER, schema_Process, exec_session_id, streams, 0),
+    PB_LAST_FIELD
+};
+
+const pb_field_t schema_Container_fields[10] = {
+    PB_FIELD(  1, UINT64  , SINGULAR, STATIC  , FIRST, schema_Container, creation_timestamp, creation_timestamp, 0),
+    PB_FIELD(  2, BYTES   , SINGULAR, CALLBACK, OTHER, schema_Container, pod_namespace, creation_timestamp, 0),
+    PB_FIELD(  3, BYTES   , SINGULAR, CALLBACK, OTHER, schema_Container, pod_name, pod_namespace, 0),
+    PB_FIELD(  4, UINT64  , SINGULAR, STATIC  , OTHER, schema_Container, container_id, pod_name, 0),
+    PB_FIELD(  5, BYTES   , SINGULAR, CALLBACK, OTHER, schema_Container, container_name, container_id, 0),
+    PB_FIELD(  6, BYTES   , SINGULAR, CALLBACK, OTHER, schema_Container, container_image_uri, container_name, 0),
+    PB_FIELD(  7, BYTES   , REPEATED, CALLBACK, OTHER, schema_Container, labels, container_image_uri, 0),
+    PB_FIELD(  8, BYTES   , SINGULAR, CALLBACK, OTHER, schema_Container, init_uuid, labels, 0),
+    PB_FIELD(  9, BYTES   , SINGULAR, CALLBACK, OTHER, schema_Container, container_image_id, init_uuid, 0),
+    PB_LAST_FIELD
+};
+
+const pb_field_t schema_ExecuteEvent_fields[2] = {
+    PB_FIELD(  1, MESSAGE , SINGULAR, STATIC  , FIRST, schema_ExecuteEvent, proc, proc, &schema_Process_fields),
+    PB_LAST_FIELD
+};
+
+const pb_field_t schema_CloneEvent_fields[2] = {
+    PB_FIELD(  1, MESSAGE , SINGULAR, STATIC  , FIRST, schema_CloneEvent, proc, proc, &schema_Process_fields),
+    PB_LAST_FIELD
+};
+
+const pb_field_t schema_EnumerateProcessEvent_fields[2] = {
+    PB_FIELD(  1, MESSAGE , SINGULAR, STATIC  , FIRST, schema_EnumerateProcessEvent, proc, proc, &schema_Process_fields),
+    PB_LAST_FIELD
+};
+
+const pb_field_t schema_MemoryExecEvent_fields[12] = {
+    PB_FIELD(  1, MESSAGE , SINGULAR, STATIC  , FIRST, schema_MemoryExecEvent, proc, proc, &schema_Process_fields),
+    PB_FIELD(  2, UINT64  , SINGULAR, STATIC  , OTHER, schema_MemoryExecEvent, prot_exec_timestamp, proc, 0),
+    PB_FIELD(  3, UINT64  , SINGULAR, STATIC  , OTHER, schema_MemoryExecEvent, new_flags, prot_exec_timestamp, 0),
+    PB_FIELD(  4, UINT64  , SINGULAR, STATIC  , OTHER, schema_MemoryExecEvent, req_flags, new_flags, 0),
+    PB_FIELD(  5, UINT64  , SINGULAR, STATIC  , OTHER, schema_MemoryExecEvent, old_vm_flags, req_flags, 0),
+    PB_FIELD(  6, UINT64  , SINGULAR, STATIC  , OTHER, schema_MemoryExecEvent, mmap_flags, old_vm_flags, 0),
+    PB_FIELD(  7, MESSAGE , SINGULAR, STATIC  , OTHER, schema_MemoryExecEvent, mapped_file, mmap_flags, &schema_File_fields),
+    PB_FIELD(  8, UENUM   , SINGULAR, STATIC  , OTHER, schema_MemoryExecEvent, action, mapped_file, 0),
+    PB_FIELD(  9, UINT64  , SINGULAR, STATIC  , OTHER, schema_MemoryExecEvent, start_addr, action, 0),
+    PB_FIELD( 10, UINT64  , SINGULAR, STATIC  , OTHER, schema_MemoryExecEvent, end_addr, start_addr, 0),
+    PB_FIELD( 11, BOOL    , SINGULAR, STATIC  , OTHER, schema_MemoryExecEvent, is_initial_mmap, end_addr, 0),
+    PB_LAST_FIELD
+};
+
+const pb_field_t schema_ContainerInfoEvent_fields[2] = {
+    PB_FIELD(  1, MESSAGE , SINGULAR, STATIC  , FIRST, schema_ContainerInfoEvent, container, container, &schema_Container_fields),
+    PB_LAST_FIELD
+};
+
+const pb_field_t schema_ExitEvent_fields[2] = {
+    PB_FIELD(  1, BYTES   , SINGULAR, CALLBACK, FIRST, schema_ExitEvent, process_uuid, process_uuid, 0),
+    PB_LAST_FIELD
+};
+
+const pb_field_t schema_Event_fields[8] = {
+    PB_ONEOF_FIELD(event,   1, MESSAGE , ONEOF, STATIC  , FIRST, schema_Event, execute, execute, &schema_ExecuteEvent_fields),
+    PB_ONEOF_FIELD(event,   2, MESSAGE , ONEOF, STATIC  , UNION, schema_Event, container, container, &schema_ContainerInfoEvent_fields),
+    PB_ONEOF_FIELD(event,   3, MESSAGE , ONEOF, STATIC  , UNION, schema_Event, exit, exit, &schema_ExitEvent_fields),
+    PB_ONEOF_FIELD(event,   4, MESSAGE , ONEOF, STATIC  , UNION, schema_Event, memexec, memexec, &schema_MemoryExecEvent_fields),
+    PB_ONEOF_FIELD(event,   5, MESSAGE , ONEOF, STATIC  , UNION, schema_Event, clone, clone, &schema_CloneEvent_fields),
+    PB_ONEOF_FIELD(event,   7, MESSAGE , ONEOF, STATIC  , UNION, schema_Event, enumproc, enumproc, &schema_EnumerateProcessEvent_fields),
+    PB_FIELD(  6, UINT64  , SINGULAR, STATIC  , OTHER, schema_Event, timestamp, event.enumproc, 0),
+    PB_LAST_FIELD
+};
+
+const pb_field_t schema_ContainerReport_fields[3] = {
+    PB_FIELD(  1, UINT32  , SINGULAR, STATIC  , FIRST, schema_ContainerReport, pid, pid, 0),
+    PB_FIELD(  2, MESSAGE , SINGULAR, STATIC  , OTHER, schema_ContainerReport, container, pid, &schema_Container_fields),
+    PB_LAST_FIELD
+};
+
+
+
+/* Check that field information fits in pb_field_t */
+#if !defined(PB_FIELD_32BIT)
+/* If you get an error here, it means that you need to define PB_FIELD_32BIT
+ * compile-time option. You can do that in pb.h or on compiler command line.
+ * 
+ * The reason you need to do this is that some of your messages contain tag
+ * numbers or field sizes that are larger than what can fit in 8 or 16 bit
+ * field descriptors.
+ */
+PB_STATIC_ASSERT((pb_membersize(schema_Socket, local) < 65536 && pb_membersize(schema_Socket, remote) < 65536 && pb_membersize(schema_File, filesystem.overlayfs) < 65536 && pb_membersize(schema_File, filesystem.socket) < 65536 && pb_membersize(schema_Descriptor, file) < 65536 && pb_membersize(schema_Streams, stdin) < 65536 && pb_membersize(schema_Streams, stdout) < 65536 && pb_membersize(schema_Streams, stderr) < 65536 && pb_membersize(schema_Process, binary) < 65536 && pb_membersize(schema_Process, args) < 65536 && pb_membersize(schema_Process, streams) < 65536 && pb_membersize(schema_ExecuteEvent, proc) < 65536 && pb_membersize(schema_CloneEvent, proc) < 65536 && pb_membersize(schema_EnumerateProcessEvent, proc) < 65536 && pb_membersize(schema_MemoryExecEvent, proc) < 65536 && pb_membersize(schema_MemoryExecEvent, mapped_file) < 65536 && pb_membersize(schema_ContainerInfoEvent, container) < 65536 && pb_membersize(schema_Event, event.execute) < 65536 && pb_membersize(schema_Event, event.container) < 65536 && pb_membersize(schema_Event, event.exit) < 65536 && pb_membersize(schema_Event, event.memexec) < 65536 && pb_membersize(schema_Event, event.clone) < 65536 && pb_membersize(schema_Event, event.enumproc) < 65536 && pb_membersize(schema_ContainerReport, container) < 65536), YOU_MUST_DEFINE_PB_FIELD_32BIT_FOR_MESSAGES_schema_SocketIp_schema_Socket_schema_Overlay_schema_File_schema_ProcessArguments_schema_Descriptor_schema_Streams_schema_Process_schema_Container_schema_ExecuteEvent_schema_CloneEvent_schema_EnumerateProcessEvent_schema_MemoryExecEvent_schema_ContainerInfoEvent_schema_ExitEvent_schema_Event_schema_ContainerReport)
+#endif
+
+#if !defined(PB_FIELD_16BIT) && !defined(PB_FIELD_32BIT)
+/* If you get an error here, it means that you need to define PB_FIELD_16BIT
+ * compile-time option. You can do that in pb.h or on compiler command line.
+ * 
+ * The reason you need to do this is that some of your messages contain tag
+ * numbers or field sizes that are larger than what can fit in the default
+ * 8 bit descriptors.
+ */
+PB_STATIC_ASSERT((pb_membersize(schema_Socket, local) < 256 && pb_membersize(schema_Socket, remote) < 256 && pb_membersize(schema_File, filesystem.overlayfs) < 256 && pb_membersize(schema_File, filesystem.socket) < 256 && pb_membersize(schema_Descriptor, file) < 256 && pb_membersize(schema_Streams, stdin) < 256 && pb_membersize(schema_Streams, stdout) < 256 && pb_membersize(schema_Streams, stderr) < 256 && pb_membersize(schema_Process, binary) < 256 && pb_membersize(schema_Process, args) < 256 && pb_membersize(schema_Process, streams) < 256 && pb_membersize(schema_ExecuteEvent, proc) < 256 && pb_membersize(schema_CloneEvent, proc) < 256 && pb_membersize(schema_EnumerateProcessEvent, proc) < 256 && pb_membersize(schema_MemoryExecEvent, proc) < 256 && pb_membersize(schema_MemoryExecEvent, mapped_file) < 256 && pb_membersize(schema_ContainerInfoEvent, container) < 256 && pb_membersize(schema_Event, event.execute) < 256 && pb_membersize(schema_Event, event.container) < 256 && pb_membersize(schema_Event, event.exit) < 256 && pb_membersize(schema_Event, event.memexec) < 256 && pb_membersize(schema_Event, event.clone) < 256 && pb_membersize(schema_Event, event.enumproc) < 256 && pb_membersize(schema_ContainerReport, container) < 256), YOU_MUST_DEFINE_PB_FIELD_16BIT_FOR_MESSAGES_schema_SocketIp_schema_Socket_schema_Overlay_schema_File_schema_ProcessArguments_schema_Descriptor_schema_Streams_schema_Process_schema_Container_schema_ExecuteEvent_schema_CloneEvent_schema_EnumerateProcessEvent_schema_MemoryExecEvent_schema_ContainerInfoEvent_schema_ExitEvent_schema_Event_schema_ContainerReport)
+#endif
+
+
+/* @@protoc_insertion_point(eof) */

diff --git a/security/container/protos/event.pb.h b/security/container/protos/event.pb.h
new file mode 100644
index 0000000..0634e8e
--- /dev/null
+++ b/security/container/protos/event.pb.h

@@ -0,0 +1,327 @@
+/* Automatically generated nanopb header */
+/* Generated by nanopb-0.3.9.1 at Mon Nov 11 16:11:19 2019. */
+
+#ifndef PB_SCHEMA_EVENT_PB_H_INCLUDED
+#define PB_SCHEMA_EVENT_PB_H_INCLUDED
+#include <pb.h>
+
+/* @@protoc_insertion_point(includes) */
+#if PB_PROTO_HEADER_VERSION != 30
+#error Regenerate this file with the current version of nanopb generator.
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Enum definitions */
+typedef enum _schema_MemoryExecEvent_Action {
+    schema_MemoryExecEvent_Action_UNDEFINED = 0,
+    schema_MemoryExecEvent_Action_MPROTECT = 1,
+    schema_MemoryExecEvent_Action_MMAP_FILE = 2
+} schema_MemoryExecEvent_Action;
+#define _schema_MemoryExecEvent_Action_MIN schema_MemoryExecEvent_Action_UNDEFINED
+#define _schema_MemoryExecEvent_Action_MAX schema_MemoryExecEvent_Action_MMAP_FILE
+#define _schema_MemoryExecEvent_Action_ARRAYSIZE ((schema_MemoryExecEvent_Action)(schema_MemoryExecEvent_Action_MMAP_FILE+1))
+
+/* Struct definitions */
+typedef struct _schema_ExitEvent {
+    pb_callback_t process_uuid;
+/* @@protoc_insertion_point(struct:schema_ExitEvent) */
+} schema_ExitEvent;
+
+typedef struct _schema_Container {
+    uint64_t creation_timestamp;
+    pb_callback_t pod_namespace;
+    pb_callback_t pod_name;
+    uint64_t container_id;
+    pb_callback_t container_name;
+    pb_callback_t container_image_uri;
+    pb_callback_t labels;
+    pb_callback_t init_uuid;
+    pb_callback_t container_image_id;
+/* @@protoc_insertion_point(struct:schema_Container) */
+} schema_Container;
+
+typedef struct _schema_Overlay {
+    bool lower_layer;
+    bool upper_layer;
+    pb_callback_t modified_uuid;
+/* @@protoc_insertion_point(struct:schema_Overlay) */
+} schema_Overlay;
+
+typedef struct _schema_ProcessArguments {
+    pb_callback_t argv;
+    uint32_t argv_truncated;
+    pb_callback_t envp;
+    uint32_t envp_truncated;
+/* @@protoc_insertion_point(struct:schema_ProcessArguments) */
+} schema_ProcessArguments;
+
+typedef struct _schema_SocketIp {
+    uint32_t family;
+    pb_callback_t ip;
+    uint32_t port;
+/* @@protoc_insertion_point(struct:schema_SocketIp) */
+} schema_SocketIp;
+
+typedef struct _schema_ContainerInfoEvent {
+    schema_Container container;
+/* @@protoc_insertion_point(struct:schema_ContainerInfoEvent) */
+} schema_ContainerInfoEvent;
+
+typedef struct _schema_ContainerReport {
+    uint32_t pid;
+    schema_Container container;
+/* @@protoc_insertion_point(struct:schema_ContainerReport) */
+} schema_ContainerReport;
+
+typedef struct _schema_Socket {
+    schema_SocketIp local;
+    schema_SocketIp remote;
+/* @@protoc_insertion_point(struct:schema_Socket) */
+} schema_Socket;
+
+typedef struct _schema_File {
+    pb_callback_t fullpath;
+    pb_size_t which_filesystem;
+    union {
+        schema_Overlay overlayfs;
+        schema_Socket socket;
+    } filesystem;
+    uint32_t ino;
+/* @@protoc_insertion_point(struct:schema_File) */
+} schema_File;
+
+typedef struct _schema_Descriptor {
+    uint32_t mode;
+    schema_File file;
+/* @@protoc_insertion_point(struct:schema_Descriptor) */
+} schema_Descriptor;
+
+typedef struct _schema_Streams {
+    schema_Descriptor stdin;
+    schema_Descriptor stdout;
+    schema_Descriptor stderr;
+/* @@protoc_insertion_point(struct:schema_Streams) */
+} schema_Streams;
+
+typedef struct _schema_Process {
+    uint64_t creation_timestamp;
+    pb_callback_t uuid;
+    uint32_t pid;
+    schema_File binary;
+    uint32_t parent_pid;
+    pb_callback_t parent_uuid;
+    uint64_t container_id;
+    uint32_t container_pid;
+    uint32_t container_parent_pid;
+    schema_ProcessArguments args;
+    schema_Streams streams;
+    uint64_t exec_session_id;
+/* @@protoc_insertion_point(struct:schema_Process) */
+} schema_Process;
+
+typedef struct _schema_CloneEvent {
+    schema_Process proc;
+/* @@protoc_insertion_point(struct:schema_CloneEvent) */
+} schema_CloneEvent;
+
+typedef struct _schema_EnumerateProcessEvent {
+    schema_Process proc;
+/* @@protoc_insertion_point(struct:schema_EnumerateProcessEvent) */
+} schema_EnumerateProcessEvent;
+
+typedef struct _schema_ExecuteEvent {
+    schema_Process proc;
+/* @@protoc_insertion_point(struct:schema_ExecuteEvent) */
+} schema_ExecuteEvent;
+
+typedef struct _schema_MemoryExecEvent {
+    schema_Process proc;
+    uint64_t prot_exec_timestamp;
+    uint64_t new_flags;
+    uint64_t req_flags;
+    uint64_t old_vm_flags;
+    uint64_t mmap_flags;
+    schema_File mapped_file;
+    schema_MemoryExecEvent_Action action;
+    uint64_t start_addr;
+    uint64_t end_addr;
+    bool is_initial_mmap;
+/* @@protoc_insertion_point(struct:schema_MemoryExecEvent) */
+} schema_MemoryExecEvent;
+
+typedef struct _schema_Event {
+    pb_size_t which_event;
+    union {
+        schema_ExecuteEvent execute;
+        schema_ContainerInfoEvent container;
+        schema_ExitEvent exit;
+        schema_MemoryExecEvent memexec;
+        schema_CloneEvent clone;
+        schema_EnumerateProcessEvent enumproc;
+    } event;
+    uint64_t timestamp;
+/* @@protoc_insertion_point(struct:schema_Event) */
+} schema_Event;
+
+/* Default values for struct fields */
+
+/* Initializer values for message structs */
+#define schema_SocketIp_init_default             {0, {{NULL}, NULL}, 0}
+#define schema_Socket_init_default               {schema_SocketIp_init_default, schema_SocketIp_init_default}
+#define schema_Overlay_init_default              {0, 0, {{NULL}, NULL}}
+#define schema_File_init_default                 {{{NULL}, NULL}, 0, {schema_Overlay_init_default}, 0}
+#define schema_ProcessArguments_init_default     {{{NULL}, NULL}, 0, {{NULL}, NULL}, 0}
+#define schema_Descriptor_init_default           {0, schema_File_init_default}
+#define schema_Streams_init_default              {schema_Descriptor_init_default, schema_Descriptor_init_default, schema_Descriptor_init_default}
+#define schema_Process_init_default              {0, {{NULL}, NULL}, 0, schema_File_init_default, 0, {{NULL}, NULL}, 0, 0, 0, schema_ProcessArguments_init_default, schema_Streams_init_default, 0}
+#define schema_Container_init_default            {0, {{NULL}, NULL}, {{NULL}, NULL}, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}}
+#define schema_ExecuteEvent_init_default         {schema_Process_init_default}
+#define schema_CloneEvent_init_default           {schema_Process_init_default}
+#define schema_EnumerateProcessEvent_init_default {schema_Process_init_default}
+#define schema_MemoryExecEvent_init_default      {schema_Process_init_default, 0, 0, 0, 0, 0, schema_File_init_default, _schema_MemoryExecEvent_Action_MIN, 0, 0, 0}
+#define schema_ContainerInfoEvent_init_default   {schema_Container_init_default}
+#define schema_ExitEvent_init_default            {{{NULL}, NULL}}
+#define schema_Event_init_default                {0, {schema_ExecuteEvent_init_default}, 0}
+#define schema_ContainerReport_init_default      {0, schema_Container_init_default}
+#define schema_SocketIp_init_zero                {0, {{NULL}, NULL}, 0}
+#define schema_Socket_init_zero                  {schema_SocketIp_init_zero, schema_SocketIp_init_zero}
+#define schema_Overlay_init_zero                 {0, 0, {{NULL}, NULL}}
+#define schema_File_init_zero                    {{{NULL}, NULL}, 0, {schema_Overlay_init_zero}, 0}
+#define schema_ProcessArguments_init_zero        {{{NULL}, NULL}, 0, {{NULL}, NULL}, 0}
+#define schema_Descriptor_init_zero              {0, schema_File_init_zero}
+#define schema_Streams_init_zero                 {schema_Descriptor_init_zero, schema_Descriptor_init_zero, schema_Descriptor_init_zero}
+#define schema_Process_init_zero                 {0, {{NULL}, NULL}, 0, schema_File_init_zero, 0, {{NULL}, NULL}, 0, 0, 0, schema_ProcessArguments_init_zero, schema_Streams_init_zero, 0}
+#define schema_Container_init_zero               {0, {{NULL}, NULL}, {{NULL}, NULL}, 0, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}, {{NULL}, NULL}}
+#define schema_ExecuteEvent_init_zero            {schema_Process_init_zero}
+#define schema_CloneEvent_init_zero              {schema_Process_init_zero}
+#define schema_EnumerateProcessEvent_init_zero   {schema_Process_init_zero}
+#define schema_MemoryExecEvent_init_zero         {schema_Process_init_zero, 0, 0, 0, 0, 0, schema_File_init_zero, _schema_MemoryExecEvent_Action_MIN, 0, 0, 0}
+#define schema_ContainerInfoEvent_init_zero      {schema_Container_init_zero}
+#define schema_ExitEvent_init_zero               {{{NULL}, NULL}}
+#define schema_Event_init_zero                   {0, {schema_ExecuteEvent_init_zero}, 0}
+#define schema_ContainerReport_init_zero         {0, schema_Container_init_zero}
+
+/* Field tags (for use in manual encoding/decoding) */
+#define schema_ExitEvent_process_uuid_tag        1
+#define schema_Container_creation_timestamp_tag  1
+#define schema_Container_pod_namespace_tag       2
+#define schema_Container_pod_name_tag            3
+#define schema_Container_container_id_tag        4
+#define schema_Container_container_name_tag      5
+#define schema_Container_container_image_uri_tag 6
+#define schema_Container_labels_tag              7
+#define schema_Container_init_uuid_tag           8
+#define schema_Container_container_image_id_tag  9
+#define schema_Overlay_lower_layer_tag           1
+#define schema_Overlay_upper_layer_tag           2
+#define schema_Overlay_modified_uuid_tag         3
+#define schema_ProcessArguments_argv_tag         1
+#define schema_ProcessArguments_argv_truncated_tag 2
+#define schema_ProcessArguments_envp_tag         3
+#define schema_ProcessArguments_envp_truncated_tag 4
+#define schema_SocketIp_family_tag               1
+#define schema_SocketIp_ip_tag                   2
+#define schema_SocketIp_port_tag                 3
+#define schema_ContainerInfoEvent_container_tag  1
+#define schema_ContainerReport_pid_tag           1
+#define schema_ContainerReport_container_tag     2
+#define schema_Socket_local_tag                  1
+#define schema_Socket_remote_tag                 2
+#define schema_File_overlayfs_tag                2
+#define schema_File_socket_tag                   4
+#define schema_File_fullpath_tag                 1
+#define schema_File_ino_tag                      3
+#define schema_Descriptor_mode_tag               1
+#define schema_Descriptor_file_tag               2
+#define schema_Streams_stdin_tag                 1
+#define schema_Streams_stdout_tag                2
+#define schema_Streams_stderr_tag                3
+#define schema_Process_creation_timestamp_tag    1
+#define schema_Process_uuid_tag                  2
+#define schema_Process_pid_tag                   3
+#define schema_Process_binary_tag                4
+#define schema_Process_parent_pid_tag            5
+#define schema_Process_parent_uuid_tag           6
+#define schema_Process_container_id_tag          7
+#define schema_Process_container_pid_tag         8
+#define schema_Process_container_parent_pid_tag  9
+#define schema_Process_args_tag                  10
+#define schema_Process_streams_tag               11
+#define schema_Process_exec_session_id_tag       12
+#define schema_CloneEvent_proc_tag               1
+#define schema_EnumerateProcessEvent_proc_tag    1
+#define schema_ExecuteEvent_proc_tag             1
+#define schema_MemoryExecEvent_proc_tag          1
+#define schema_MemoryExecEvent_prot_exec_timestamp_tag 2
+#define schema_MemoryExecEvent_new_flags_tag     3
+#define schema_MemoryExecEvent_req_flags_tag     4
+#define schema_MemoryExecEvent_old_vm_flags_tag  5
+#define schema_MemoryExecEvent_mmap_flags_tag    6
+#define schema_MemoryExecEvent_mapped_file_tag   7
+#define schema_MemoryExecEvent_action_tag        8
+#define schema_MemoryExecEvent_start_addr_tag    9
+#define schema_MemoryExecEvent_end_addr_tag      10
+#define schema_MemoryExecEvent_is_initial_mmap_tag 11
+#define schema_Event_execute_tag                 1
+#define schema_Event_container_tag               2
+#define schema_Event_exit_tag                    3
+#define schema_Event_memexec_tag                 4
+#define schema_Event_clone_tag                   5
+#define schema_Event_enumproc_tag                7
+#define schema_Event_timestamp_tag               6
+
+/* Struct field encoding specification for nanopb */
+extern const pb_field_t schema_SocketIp_fields[4];
+extern const pb_field_t schema_Socket_fields[3];
+extern const pb_field_t schema_Overlay_fields[4];
+extern const pb_field_t schema_File_fields[5];
+extern const pb_field_t schema_ProcessArguments_fields[5];
+extern const pb_field_t schema_Descriptor_fields[3];
+extern const pb_field_t schema_Streams_fields[4];
+extern const pb_field_t schema_Process_fields[13];
+extern const pb_field_t schema_Container_fields[10];
+extern const pb_field_t schema_ExecuteEvent_fields[2];
+extern const pb_field_t schema_CloneEvent_fields[2];
+extern const pb_field_t schema_EnumerateProcessEvent_fields[2];
+extern const pb_field_t schema_MemoryExecEvent_fields[12];
+extern const pb_field_t schema_ContainerInfoEvent_fields[2];
+extern const pb_field_t schema_ExitEvent_fields[2];
+extern const pb_field_t schema_Event_fields[8];
+extern const pb_field_t schema_ContainerReport_fields[3];
+
+/* Maximum encoded size of messages (where known) */
+/* schema_SocketIp_size depends on runtime parameters */
+#define schema_Socket_size                       (12 + schema_SocketIp_size + schema_SocketIp_size)
+/* schema_Overlay_size depends on runtime parameters */
+/* schema_File_size depends on runtime parameters */
+/* schema_ProcessArguments_size depends on runtime parameters */
+#define schema_Descriptor_size                   (12 + schema_File_size)
+#define schema_Streams_size                      (54 + schema_File_size + schema_File_size + schema_File_size)
+/* schema_Process_size depends on runtime parameters */
+/* schema_Container_size depends on runtime parameters */
+#define schema_ExecuteEvent_size                 (6 + schema_Process_size)
+#define schema_CloneEvent_size                   (6 + schema_Process_size)
+#define schema_EnumerateProcessEvent_size        (6 + schema_Process_size)
+#define schema_MemoryExecEvent_size              (93 + schema_Process_size + schema_File_size)
+#define schema_ContainerInfoEvent_size           (6 + schema_Container_size)
+/* schema_ExitEvent_size depends on runtime parameters */
+#define schema_Event_size                        (11 + ((((((schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) > schema_ExecuteEvent_size ? (schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) : schema_ExecuteEvent_size) > schema_ExitEvent_size ? ((schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) > schema_ExecuteEvent_size ? (schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) : schema_ExecuteEvent_size) : schema_ExitEvent_size) > schema_ContainerInfoEvent_size ? (((schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) > schema_ExecuteEvent_size ? (schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) : schema_ExecuteEvent_size) > schema_ExitEvent_size ? ((schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) > schema_ExecuteEvent_size ? (schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) : schema_ExecuteEvent_size) : schema_ExitEvent_size) : schema_ContainerInfoEvent_size) > schema_MemoryExecEvent_size ? ((((schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) > schema_ExecuteEvent_size ? (schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) : schema_ExecuteEvent_size) > schema_ExitEvent_size ? ((schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) > schema_ExecuteEvent_size ? (schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) : schema_ExecuteEvent_size) : schema_ExitEvent_size) > schema_ContainerInfoEvent_size ? (((schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) > schema_ExecuteEvent_size ? (schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) : schema_ExecuteEvent_size) > schema_ExitEvent_size ? ((schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) > schema_ExecuteEvent_size ? (schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) : schema_ExecuteEvent_size) : schema_ExitEvent_size) : schema_ContainerInfoEvent_size) : schema_MemoryExecEvent_size) > 0 ? (((((schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) > schema_ExecuteEvent_size ? (schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) : schema_ExecuteEvent_size) > schema_ExitEvent_size ? ((schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) > schema_ExecuteEvent_size ? (schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) : schema_ExecuteEvent_size) : schema_ExitEvent_size) > schema_ContainerInfoEvent_size ? (((schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) > schema_ExecuteEvent_size ? (schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) : schema_ExecuteEvent_size) > schema_ExitEvent_size ? ((schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) > schema_ExecuteEvent_size ? (schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) : schema_ExecuteEvent_size) : schema_ExitEvent_size) : schema_ContainerInfoEvent_size) > schema_MemoryExecEvent_size ? ((((schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) > schema_ExecuteEvent_size ? (schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) : schema_ExecuteEvent_size) > schema_ExitEvent_size ? ((schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) > schema_ExecuteEvent_size ? (schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) : schema_ExecuteEvent_size) : schema_ExitEvent_size) > schema_ContainerInfoEvent_size ? (((schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) > schema_ExecuteEvent_size ? (schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) : schema_ExecuteEvent_size) > schema_ExitEvent_size ? ((schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) > schema_ExecuteEvent_size ? (schema_CloneEvent_size > schema_EnumerateProcessEvent_size ? schema_CloneEvent_size : schema_EnumerateProcessEvent_size) : schema_ExecuteEvent_size) : schema_ExitEvent_size) : schema_ContainerInfoEvent_size) : schema_MemoryExecEvent_size) : 0))
+#define schema_ContainerReport_size              (12 + schema_Container_size)
+
+/* Message IDs (where set with "msgid" option) */
+#ifdef PB_MSGID
+
+#define EVENT_MESSAGES \
+
+
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+/* @@protoc_insertion_point(eof) */
+
+#endif

diff --git a/security/container/protos/event.proto b/security/container/protos/event.proto
new file mode 100644
index 0000000..dfe483f
--- /dev/null
+++ b/security/container/protos/event.proto

@@ -0,0 +1,151 @@
+syntax = "proto3";
+
+package schema;
+
+message SocketIp {
+  uint32 family = 1;  // AF_* for socket type.
+  bytes ip = 2;       // ip4 or ip6 address.
+  uint32 port = 3;    // port bind or connected.
+}
+
+message Socket {
+  SocketIp local = 1;
+  SocketIp remote = 2;  // unset if not connected.
+}
+
+message Overlay {
+  bool lower_layer = 1;
+  bool upper_layer = 2;
+  bytes modified_uuid = 3;  // The process who first modified the file.
+}
+
+message File {
+  bytes fullpath = 1;
+  uint32 ino = 3;  // inode number.
+  oneof filesystem {
+    Overlay overlayfs = 2;
+    Socket socket = 4;
+  }
+}
+
+message ProcessArguments {
+  repeated bytes argv = 1;    // process arguments
+  uint32 argv_truncated = 2;  // number of characters truncated from argv
+  repeated bytes envp = 3;    // process environment variables
+  uint32 envp_truncated = 4;  // number of characters truncated from envp
+}
+
+message Descriptor {
+  uint32 mode = 1;  // file mode (stat st_mode)
+  File file = 2;
+}
+
+message Streams {
+  Descriptor stdin = 1;
+  Descriptor stdout = 2;
+  Descriptor stderr = 3;
+}
+
+message Process {
+  uint64 creation_timestamp = 1;  // Only populated in ExecuteEvent, in ns.
+  bytes uuid = 2;
+  uint32 pid = 3;
+  File binary = 4;  // Only populated in ExecuteEvent.
+  uint32 parent_pid = 5;
+  bytes parent_uuid = 6;
+  uint64 container_id = 7;          // unique id of process's container
+  uint32 container_pid = 8;         // pid inside the container namespace pid
+  uint32 container_parent_pid = 9;  // optional
+  ProcessArguments args = 10;       // Only populated in ExecuteEvent.
+  Streams streams = 11;             // Only populated in ExecuteEvent.
+  uint64 exec_session_id = 12;      // identifier set for kubectl exec sessions.
+}
+
+message Container {
+  uint64 creation_timestamp = 1;  // container create time in ns
+  bytes pod_namespace = 2;
+  bytes pod_name = 3;
+  uint64 container_id = 4;  // unique across lifetime of Node
+  bytes container_name = 5;
+  bytes container_image_uri = 6;
+  repeated bytes labels = 7;
+  bytes init_uuid = 8;
+  bytes container_image_id = 9;
+}
+
+// A binary being executed.
+// e.g., execve()
+message ExecuteEvent {
+  Process proc = 1;
+}
+
+// A process clone is being created. This message means that a cloning operation
+// is being attempted. It may be sent even if fork fails.
+message CloneEvent {
+  Process proc = 1;
+}
+
+// Processes that are enumerated at startup will be sent with this event. There
+// is no distinction from events we would have seen from fork or exec.
+message EnumerateProcessEvent {
+  Process proc = 1;
+}
+
+// Collect information about mmap/mprotect calls with the PROT_EXEC flag set.
+message MemoryExecEvent {
+  Process proc = 1;  // The origin process
+  // The timestamp in ns when the memory was set executable
+  uint64 prot_exec_timestamp = 2;
+  // The prot flags granted by the kernel for the operation
+  uint64 new_flags = 3;
+  // The prot flags requested for the mprotect/mmap operation
+  uint64 req_flags = 4;
+  // The vm_flags prior to the mprotect operation, if relevant
+  uint64 old_vm_flags = 5;
+  // The operational flags for the mmap operation, if relevant
+  uint64 mmap_flags = 6;
+  // Derived from the file struct describing the fd being mapped
+  File mapped_file = 7;
+  enum Action {
+    UNDEFINED = 0;
+    MPROTECT = 1;
+    MMAP_FILE = 2;
+  }
+  Action action = 8;
+
+  uint64 start_addr = 9;  // The executable memory region start addr
+  uint64 end_addr = 10;   // The executable memory region end addr
+  // True if this event is a mmap of the process' binary
+  bool is_initial_mmap = 11;
+}
+
+// Associate the following container information with all processes
+// that have the indicated container_id.
+message ContainerInfoEvent {
+  Container container = 1;
+}
+
+// The process with the indicated pid has exited.
+message ExitEvent {
+  bytes process_uuid = 1;
+}
+
+// Next ID: 8
+message Event {
+  oneof event {
+    ExecuteEvent execute = 1;
+    ContainerInfoEvent container = 2;
+    ExitEvent exit = 3;
+    MemoryExecEvent memexec = 4;
+    CloneEvent clone = 5;
+    EnumerateProcessEvent enumproc = 7;
+  }
+
+  uint64 timestamp = 6;  // In nanoseconds
+}
+
+// Message sent by the daemonset to the LSM for container enlightenment.
+message ContainerReport {
+  uint32 pid = 1;           // Top pid of the running container.
+  Container container = 2;  // Information collected about the container.
+}

diff --git a/security/container/protos/nanopb/LICENSE b/security/container/protos/nanopb/LICENSE
new file mode 100644
index 0000000..a83630a
--- /dev/null
+++ b/security/container/protos/nanopb/LICENSE

@@ -0,0 +1,20 @@
+Copyright (c) 2011 Petteri Aimonen <jpa at nanopb.mail.kapsi.fi>
+
+This software is provided 'as-is', without any express or
+implied warranty. In no event will the authors be held liable
+for any damages arising from the use of this software.
+
+Permission is granted to anyone to use this software for any
+purpose, including commercial applications, and to alter it and
+redistribute it freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you
+   must not claim that you wrote the original software. If you use
+   this software in a product, an acknowledgment in the product
+   documentation would be appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and
+   must not be misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+   distribution.

diff --git a/security/container/protos/nanopb/Makefile b/security/container/protos/nanopb/Makefile
new file mode 100644
index 0000000..b7e15f8
--- /dev/null
+++ b/security/container/protos/nanopb/Makefile

@@ -0,0 +1,7 @@
+obj-$(CONFIG_SECURITY_CONTAINER_MONITOR) += nanopb.o
+
+nanopb-y := pb_encode.o pb_decode.o pb_common.o
+
+ccflags-y := -I$(srctree)/security/container/protos \
+	-I$(srctree)/security/container/protos/nanopb \
+	$(PB_CCFLAGS)

diff --git a/security/container/protos/nanopb/pb.h b/security/container/protos/nanopb/pb.h
new file mode 100644
index 0000000..174a84b
--- /dev/null
+++ b/security/container/protos/nanopb/pb.h

@@ -0,0 +1,593 @@
+/* Common parts of the nanopb library. Most of these are quite low-level
+ * stuff. For the high-level interface, see pb_encode.h and pb_decode.h.
+ */
+
+#ifndef PB_H_INCLUDED
+#define PB_H_INCLUDED
+
+/*****************************************************************
+ * Nanopb compilation time options. You can change these here by *
+ * uncommenting the lines, or on the compiler command line.      *
+ *****************************************************************/
+
+/* Enable support for dynamically allocated fields */
+/* #define PB_ENABLE_MALLOC 1 */
+
+/* Define this if your CPU / compiler combination does not support
+ * unaligned memory access to packed structures. */
+/* #define PB_NO_PACKED_STRUCTS 1 */
+
+/* Increase the number of required fields that are tracked.
+ * A compiler warning will tell if you need this. */
+/* #define PB_MAX_REQUIRED_FIELDS 256 */
+
+/* Add support for tag numbers > 255 and fields larger than 255 bytes. */
+/* #define PB_FIELD_16BIT 1 */
+
+/* Add support for tag numbers > 65536 and fields larger than 65536 bytes. */
+/* #define PB_FIELD_32BIT 1 */
+
+/* Disable support for error messages in order to save some code space. */
+/* #define PB_NO_ERRMSG 1 */
+
+/* Disable support for custom streams (support only memory buffers). */
+/* #define PB_BUFFER_ONLY 1 */
+
+/* Switch back to the old-style callback function signature.
+ * This was the default until nanopb-0.2.1. */
+/* #define PB_OLD_CALLBACK_STYLE */
+
+
+/******************************************************************
+ * You usually don't need to change anything below this line.     *
+ * Feel free to look around and use the defined macros, though.   *
+ ******************************************************************/
+
+
+/* Version of the nanopb library. Just in case you want to check it in
+ * your own program. */
+#define NANOPB_VERSION nanopb-0.3.9.1
+
+/* Include all the system headers needed by nanopb. You will need the
+ * definitions of the following:
+ * - strlen, memcpy, memset functions
+ * - [u]int_least8_t, uint_fast8_t, [u]int_least16_t, [u]int32_t, [u]int64_t
+ * - size_t
+ * - bool
+ *
+ * If you don't have the standard header files, you can instead provide
+ * a custom header that defines or includes all this. In that case,
+ * define PB_SYSTEM_HEADER to the path of this file.
+ */
+#ifdef PB_SYSTEM_HEADER
+#include PB_SYSTEM_HEADER
+#else
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+
+#ifdef PB_ENABLE_MALLOC
+#include <stdlib.h>
+#endif
+#endif
+
+/* Macro for defining packed structures (compiler dependent).
+ * This just reduces memory requirements, but is not required.
+ */
+#if defined(PB_NO_PACKED_STRUCTS)
+    /* Disable struct packing */
+#   define PB_PACKED_STRUCT_START
+#   define PB_PACKED_STRUCT_END
+#   define pb_packed
+#elif defined(__GNUC__) || defined(__clang__)
+    /* For GCC and clang */
+#   define PB_PACKED_STRUCT_START
+#   define PB_PACKED_STRUCT_END
+#   define pb_packed __attribute__((packed))
+#elif defined(__ICCARM__) || defined(__CC_ARM)
+    /* For IAR ARM and Keil MDK-ARM compilers */
+#   define PB_PACKED_STRUCT_START _Pragma("pack(push, 1)")
+#   define PB_PACKED_STRUCT_END _Pragma("pack(pop)")
+#   define pb_packed
+#elif defined(_MSC_VER) && (_MSC_VER >= 1500)
+    /* For Microsoft Visual C++ */
+#   define PB_PACKED_STRUCT_START __pragma(pack(push, 1))
+#   define PB_PACKED_STRUCT_END __pragma(pack(pop))
+#   define pb_packed
+#else
+    /* Unknown compiler */
+#   define PB_PACKED_STRUCT_START
+#   define PB_PACKED_STRUCT_END
+#   define pb_packed
+#endif
+
+/* Handly macro for suppressing unreferenced-parameter compiler warnings. */
+#ifndef PB_UNUSED
+#define PB_UNUSED(x) (void)(x)
+#endif
+
+/* Compile-time assertion, used for checking compatible compilation options.
+ * If this does not work properly on your compiler, use
+ * #define PB_NO_STATIC_ASSERT to disable it.
+ *
+ * But before doing that, check carefully the error message / place where it
+ * comes from to see if the error has a real cause. Unfortunately the error
+ * message is not always very clear to read, but you can see the reason better
+ * in the place where the PB_STATIC_ASSERT macro was called.
+ */
+#ifndef PB_NO_STATIC_ASSERT
+#ifndef PB_STATIC_ASSERT
+#define PB_STATIC_ASSERT(COND,MSG) typedef char PB_STATIC_ASSERT_MSG(MSG, __LINE__, __COUNTER__)[(COND)?1:-1];
+#define PB_STATIC_ASSERT_MSG(MSG, LINE, COUNTER) PB_STATIC_ASSERT_MSG_(MSG, LINE, COUNTER)
+#define PB_STATIC_ASSERT_MSG_(MSG, LINE, COUNTER) pb_static_assertion_##MSG##LINE##COUNTER
+#endif
+#else
+#define PB_STATIC_ASSERT(COND,MSG)
+#endif
+
+/* Number of required fields to keep track of. */
+#ifndef PB_MAX_REQUIRED_FIELDS
+#define PB_MAX_REQUIRED_FIELDS 64
+#endif
+
+#if PB_MAX_REQUIRED_FIELDS < 64
+#error You should not lower PB_MAX_REQUIRED_FIELDS from the default value (64).
+#endif
+
+/* List of possible field types. These are used in the autogenerated code.
+ * Least-significant 4 bits tell the scalar type
+ * Most-significant 4 bits specify repeated/required/packed etc.
+ */
+
+typedef uint_least8_t pb_type_t;
+
+/**** Field data types ****/
+
+/* Numeric types */
+#define PB_LTYPE_VARINT  0x00 /* int32, int64, enum, bool */
+#define PB_LTYPE_UVARINT 0x01 /* uint32, uint64 */
+#define PB_LTYPE_SVARINT 0x02 /* sint32, sint64 */
+#define PB_LTYPE_FIXED32 0x03 /* fixed32, sfixed32, float */
+#define PB_LTYPE_FIXED64 0x04 /* fixed64, sfixed64, double */
+
+/* Marker for last packable field type. */
+#define PB_LTYPE_LAST_PACKABLE 0x04
+
+/* Byte array with pre-allocated buffer.
+ * data_size is the length of the allocated PB_BYTES_ARRAY structure. */
+#define PB_LTYPE_BYTES 0x05
+
+/* String with pre-allocated buffer.
+ * data_size is the maximum length. */
+#define PB_LTYPE_STRING 0x06
+
+/* Submessage
+ * submsg_fields is pointer to field descriptions */
+#define PB_LTYPE_SUBMESSAGE 0x07
+
+/* Extension pseudo-field
+ * The field contains a pointer to pb_extension_t */
+#define PB_LTYPE_EXTENSION 0x08
+
+/* Byte array with inline, pre-allocated byffer.
+ * data_size is the length of the inline, allocated buffer.
+ * This differs from PB_LTYPE_BYTES by defining the element as
+ * pb_byte_t[data_size] rather than pb_bytes_array_t. */
+#define PB_LTYPE_FIXED_LENGTH_BYTES 0x09
+
+/* Number of declared LTYPES */
+#define PB_LTYPES_COUNT 0x0A
+#define PB_LTYPE_MASK 0x0F
+
+/**** Field repetition rules ****/
+
+#define PB_HTYPE_REQUIRED 0x00
+#define PB_HTYPE_OPTIONAL 0x10
+#define PB_HTYPE_REPEATED 0x20
+#define PB_HTYPE_ONEOF    0x30
+#define PB_HTYPE_MASK     0x30
+
+/**** Field allocation types ****/
+ 
+#define PB_ATYPE_STATIC   0x00
+#define PB_ATYPE_POINTER  0x80
+#define PB_ATYPE_CALLBACK 0x40
+#define PB_ATYPE_MASK     0xC0
+
+#define PB_ATYPE(x) ((x) & PB_ATYPE_MASK)
+#define PB_HTYPE(x) ((x) & PB_HTYPE_MASK)
+#define PB_LTYPE(x) ((x) & PB_LTYPE_MASK)
+
+/* Data type used for storing sizes of struct fields
+ * and array counts.
+ */
+#if defined(PB_FIELD_32BIT)
+    typedef uint32_t pb_size_t;
+    typedef int32_t pb_ssize_t;
+#elif defined(PB_FIELD_16BIT)
+    typedef uint_least16_t pb_size_t;
+    typedef int_least16_t pb_ssize_t;
+#else
+    typedef uint_least8_t pb_size_t;
+    typedef int_least8_t pb_ssize_t;
+#endif
+#define PB_SIZE_MAX ((pb_size_t)-1)
+
+/* Data type for storing encoded data and other byte streams.
+ * This typedef exists to support platforms where uint8_t does not exist.
+ * You can regard it as equivalent on uint8_t on other platforms.
+ */
+typedef uint_least8_t pb_byte_t;
+
+/* This structure is used in auto-generated constants
+ * to specify struct fields.
+ * You can change field sizes if you need structures
+ * larger than 256 bytes or field tags larger than 256.
+ * The compiler should complain if your .proto has such
+ * structures. Fix that by defining PB_FIELD_16BIT or
+ * PB_FIELD_32BIT.
+ */
+PB_PACKED_STRUCT_START
+typedef struct pb_field_s pb_field_t;
+struct pb_field_s {
+    pb_size_t tag;
+    pb_type_t type;
+    pb_size_t data_offset; /* Offset of field data, relative to previous field. */
+    pb_ssize_t size_offset; /* Offset of array size or has-boolean, relative to data */
+    pb_size_t data_size; /* Data size in bytes for a single item */
+    pb_size_t array_size; /* Maximum number of entries in array */
+    
+    /* Field definitions for submessage
+     * OR default value for all other non-array, non-callback types
+     * If null, then field will zeroed. */
+    const void *ptr;
+} pb_packed;
+PB_PACKED_STRUCT_END
+
+/* Make sure that the standard integer types are of the expected sizes.
+ * Otherwise fixed32/fixed64 fields can break.
+ *
+ * If you get errors here, it probably means that your stdint.h is not
+ * correct for your platform.
+ */
+#ifndef PB_WITHOUT_64BIT
+PB_STATIC_ASSERT(sizeof(int64_t) == 2 * sizeof(int32_t), INT64_T_WRONG_SIZE)
+PB_STATIC_ASSERT(sizeof(uint64_t) == 2 * sizeof(uint32_t), UINT64_T_WRONG_SIZE)
+#endif
+
+/* This structure is used for 'bytes' arrays.
+ * It has the number of bytes in the beginning, and after that an array.
+ * Note that actual structs used will have a different length of bytes array.
+ */
+#define PB_BYTES_ARRAY_T(n) struct { pb_size_t size; pb_byte_t bytes[n]; }
+#define PB_BYTES_ARRAY_T_ALLOCSIZE(n) ((size_t)n + offsetof(pb_bytes_array_t, bytes))
+
+struct pb_bytes_array_s {
+    pb_size_t size;
+    pb_byte_t bytes[1];
+};
+typedef struct pb_bytes_array_s pb_bytes_array_t;
+
+/* This structure is used for giving the callback function.
+ * It is stored in the message structure and filled in by the method that
+ * calls pb_decode.
+ *
+ * The decoding callback will be given a limited-length stream
+ * If the wire type was string, the length is the length of the string.
+ * If the wire type was a varint/fixed32/fixed64, the length is the length
+ * of the actual value.
+ * The function may be called multiple times (especially for repeated types,
+ * but also otherwise if the message happens to contain the field multiple
+ * times.)
+ *
+ * The encoding callback will receive the actual output stream.
+ * It should write all the data in one call, including the field tag and
+ * wire type. It can write multiple fields.
+ *
+ * The callback can be null if you want to skip a field.
+ */
+typedef struct pb_istream_s pb_istream_t;
+typedef struct pb_ostream_s pb_ostream_t;
+typedef struct pb_callback_s pb_callback_t;
+struct pb_callback_s {
+#ifdef PB_OLD_CALLBACK_STYLE
+    /* Deprecated since nanopb-0.2.1 */
+    union {
+        bool (*decode)(pb_istream_t *stream, const pb_field_t *field, void *arg);
+        bool (*encode)(pb_ostream_t *stream, const pb_field_t *field, const void *arg);
+    } funcs;
+#else
+    /* New function signature, which allows modifying arg contents in callback. */
+    union {
+        bool (*decode)(pb_istream_t *stream, const pb_field_t *field, void **arg);
+        bool (*encode)(pb_ostream_t *stream, const pb_field_t *field, void * const *arg);
+    } funcs;
+#endif    
+    
+    /* Free arg for use by callback */
+    void *arg;
+};
+
+/* Wire types. Library user needs these only in encoder callbacks. */
+typedef enum {
+    PB_WT_VARINT = 0,
+    PB_WT_64BIT  = 1,
+    PB_WT_STRING = 2,
+    PB_WT_32BIT  = 5
+} pb_wire_type_t;
+
+/* Structure for defining the handling of unknown/extension fields.
+ * Usually the pb_extension_type_t structure is automatically generated,
+ * while the pb_extension_t structure is created by the user. However,
+ * if you want to catch all unknown fields, you can also create a custom
+ * pb_extension_type_t with your own callback.
+ */
+typedef struct pb_extension_type_s pb_extension_type_t;
+typedef struct pb_extension_s pb_extension_t;
+struct pb_extension_type_s {
+    /* Called for each unknown field in the message.
+     * If you handle the field, read off all of its data and return true.
+     * If you do not handle the field, do not read anything and return true.
+     * If you run into an error, return false.
+     * Set to NULL for default handler.
+     */
+    bool (*decode)(pb_istream_t *stream, pb_extension_t *extension,
+                   uint32_t tag, pb_wire_type_t wire_type);
+    
+    /* Called once after all regular fields have been encoded.
+     * If you have something to write, do so and return true.
+     * If you do not have anything to write, just return true.
+     * If you run into an error, return false.
+     * Set to NULL for default handler.
+     */
+    bool (*encode)(pb_ostream_t *stream, const pb_extension_t *extension);
+    
+    /* Free field for use by the callback. */
+    const void *arg;
+};
+
+struct pb_extension_s {
+    /* Type describing the extension field. Usually you'll initialize
+     * this to a pointer to the automatically generated structure. */
+    const pb_extension_type_t *type;
+    
+    /* Destination for the decoded data. This must match the datatype
+     * of the extension field. */
+    void *dest;
+    
+    /* Pointer to the next extension handler, or NULL.
+     * If this extension does not match a field, the next handler is
+     * automatically called. */
+    pb_extension_t *next;
+
+    /* The decoder sets this to true if the extension was found.
+     * Ignored for encoding. */
+    bool found;
+};
+
+/* Memory allocation functions to use. You can define pb_realloc and
+ * pb_free to custom functions if you want. */
+#ifdef PB_ENABLE_MALLOC
+#   ifndef pb_realloc
+#       define pb_realloc(ptr, size) realloc(ptr, size)
+#   endif
+#   ifndef pb_free
+#       define pb_free(ptr) free(ptr)
+#   endif
+#endif
+
+/* This is used to inform about need to regenerate .pb.h/.pb.c files. */
+#define PB_PROTO_HEADER_VERSION 30
+
+/* These macros are used to declare pb_field_t's in the constant array. */
+/* Size of a structure member, in bytes. */
+#define pb_membersize(st, m) (sizeof ((st*)0)->m)
+/* Number of entries in an array. */
+#define pb_arraysize(st, m) (pb_membersize(st, m) / pb_membersize(st, m[0]))
+/* Delta from start of one member to the start of another member. */
+#define pb_delta(st, m1, m2) ((int)offsetof(st, m1) - (int)offsetof(st, m2))
+/* Marks the end of the field list */
+#define PB_LAST_FIELD {0,(pb_type_t) 0,0,0,0,0,0}
+
+/* Macros for filling in the data_offset field */
+/* data_offset for first field in a message */
+#define PB_DATAOFFSET_FIRST(st, m1, m2) (offsetof(st, m1))
+/* data_offset for subsequent fields */
+#define PB_DATAOFFSET_OTHER(st, m1, m2) (offsetof(st, m1) - offsetof(st, m2) - pb_membersize(st, m2))
+/* data offset for subsequent fields inside an union (oneof) */
+#define PB_DATAOFFSET_UNION(st, m1, m2) (PB_SIZE_MAX)
+/* Choose first/other based on m1 == m2 (deprecated, remains for backwards compatibility) */
+#define PB_DATAOFFSET_CHOOSE(st, m1, m2) (int)(offsetof(st, m1) == offsetof(st, m2) \
+                                  ? PB_DATAOFFSET_FIRST(st, m1, m2) \
+                                  : PB_DATAOFFSET_OTHER(st, m1, m2))
+
+/* Required fields are the simplest. They just have delta (padding) from
+ * previous field end, and the size of the field. Pointer is used for
+ * submessages and default values.
+ */
+#define PB_REQUIRED_STATIC(tag, st, m, fd, ltype, ptr) \
+    {tag, PB_ATYPE_STATIC | PB_HTYPE_REQUIRED | ltype, \
+    fd, 0, pb_membersize(st, m), 0, ptr}
+
+/* Optional fields add the delta to the has_ variable. */
+#define PB_OPTIONAL_STATIC(tag, st, m, fd, ltype, ptr) \
+    {tag, PB_ATYPE_STATIC | PB_HTYPE_OPTIONAL | ltype, \
+    fd, \
+    pb_delta(st, has_ ## m, m), \
+    pb_membersize(st, m), 0, ptr}
+
+#define PB_SINGULAR_STATIC(tag, st, m, fd, ltype, ptr) \
+    {tag, PB_ATYPE_STATIC | PB_HTYPE_OPTIONAL | ltype, \
+    fd, 0, pb_membersize(st, m), 0, ptr}
+
+/* Repeated fields have a _count field and also the maximum number of entries. */
+#define PB_REPEATED_STATIC(tag, st, m, fd, ltype, ptr) \
+    {tag, PB_ATYPE_STATIC | PB_HTYPE_REPEATED | ltype, \
+    fd, \
+    pb_delta(st, m ## _count, m), \
+    pb_membersize(st, m[0]), \
+    pb_arraysize(st, m), ptr}
+
+/* Allocated fields carry the size of the actual data, not the pointer */
+#define PB_REQUIRED_POINTER(tag, st, m, fd, ltype, ptr) \
+    {tag, PB_ATYPE_POINTER | PB_HTYPE_REQUIRED | ltype, \
+    fd, 0, pb_membersize(st, m[0]), 0, ptr}
+
+/* Optional fields don't need a has_ variable, as information would be redundant */
+#define PB_OPTIONAL_POINTER(tag, st, m, fd, ltype, ptr) \
+    {tag, PB_ATYPE_POINTER | PB_HTYPE_OPTIONAL | ltype, \
+    fd, 0, pb_membersize(st, m[0]), 0, ptr}
+
+/* Same as optional fields*/
+#define PB_SINGULAR_POINTER(tag, st, m, fd, ltype, ptr) \
+    {tag, PB_ATYPE_POINTER | PB_HTYPE_OPTIONAL | ltype, \
+    fd, 0, pb_membersize(st, m[0]), 0, ptr}
+
+/* Repeated fields have a _count field and a pointer to array of pointers */
+#define PB_REPEATED_POINTER(tag, st, m, fd, ltype, ptr) \
+    {tag, PB_ATYPE_POINTER | PB_HTYPE_REPEATED | ltype, \
+    fd, pb_delta(st, m ## _count, m), \
+    pb_membersize(st, m[0]), 0, ptr}
+
+/* Callbacks are much like required fields except with special datatype. */
+#define PB_REQUIRED_CALLBACK(tag, st, m, fd, ltype, ptr) \
+    {tag, PB_ATYPE_CALLBACK | PB_HTYPE_REQUIRED | ltype, \
+    fd, 0, pb_membersize(st, m), 0, ptr}
+
+#define PB_OPTIONAL_CALLBACK(tag, st, m, fd, ltype, ptr) \
+    {tag, PB_ATYPE_CALLBACK | PB_HTYPE_OPTIONAL | ltype, \
+    fd, 0, pb_membersize(st, m), 0, ptr}
+
+#define PB_SINGULAR_CALLBACK(tag, st, m, fd, ltype, ptr) \
+    {tag, PB_ATYPE_CALLBACK | PB_HTYPE_OPTIONAL | ltype, \
+    fd, 0, pb_membersize(st, m), 0, ptr}
+    
+#define PB_REPEATED_CALLBACK(tag, st, m, fd, ltype, ptr) \
+    {tag, PB_ATYPE_CALLBACK | PB_HTYPE_REPEATED | ltype, \
+    fd, 0, pb_membersize(st, m), 0, ptr}
+
+/* Optional extensions don't have the has_ field, as that would be redundant.
+ * Furthermore, the combination of OPTIONAL without has_ field is used
+ * for indicating proto3 style fields. Extensions exist in proto2 mode only,
+ * so they should be encoded according to proto2 rules. To avoid the conflict,
+ * extensions are marked as REQUIRED instead.
+ */
+#define PB_OPTEXT_STATIC(tag, st, m, fd, ltype, ptr) \
+    {tag, PB_ATYPE_STATIC | PB_HTYPE_REQUIRED | ltype, \
+    0, \
+    0, \
+    pb_membersize(st, m), 0, ptr}
+
+#define PB_OPTEXT_POINTER(tag, st, m, fd, ltype, ptr) \
+    PB_OPTIONAL_POINTER(tag, st, m, fd, ltype, ptr)
+
+#define PB_OPTEXT_CALLBACK(tag, st, m, fd, ltype, ptr) \
+    PB_OPTIONAL_CALLBACK(tag, st, m, fd, ltype, ptr)
+
+/* The mapping from protobuf types to LTYPEs is done using these macros. */
+#define PB_LTYPE_MAP_BOOL               PB_LTYPE_VARINT
+#define PB_LTYPE_MAP_BYTES              PB_LTYPE_BYTES
+#define PB_LTYPE_MAP_DOUBLE             PB_LTYPE_FIXED64
+#define PB_LTYPE_MAP_ENUM               PB_LTYPE_VARINT
+#define PB_LTYPE_MAP_UENUM              PB_LTYPE_UVARINT
+#define PB_LTYPE_MAP_FIXED32            PB_LTYPE_FIXED32
+#define PB_LTYPE_MAP_FIXED64            PB_LTYPE_FIXED64
+#define PB_LTYPE_MAP_FLOAT              PB_LTYPE_FIXED32
+#define PB_LTYPE_MAP_INT32              PB_LTYPE_VARINT
+#define PB_LTYPE_MAP_INT64              PB_LTYPE_VARINT
+#define PB_LTYPE_MAP_MESSAGE            PB_LTYPE_SUBMESSAGE
+#define PB_LTYPE_MAP_SFIXED32           PB_LTYPE_FIXED32
+#define PB_LTYPE_MAP_SFIXED64           PB_LTYPE_FIXED64
+#define PB_LTYPE_MAP_SINT32             PB_LTYPE_SVARINT
+#define PB_LTYPE_MAP_SINT64             PB_LTYPE_SVARINT
+#define PB_LTYPE_MAP_STRING             PB_LTYPE_STRING
+#define PB_LTYPE_MAP_UINT32             PB_LTYPE_UVARINT
+#define PB_LTYPE_MAP_UINT64             PB_LTYPE_UVARINT
+#define PB_LTYPE_MAP_EXTENSION          PB_LTYPE_EXTENSION
+#define PB_LTYPE_MAP_FIXED_LENGTH_BYTES PB_LTYPE_FIXED_LENGTH_BYTES
+
+/* This is the actual macro used in field descriptions.
+ * It takes these arguments:
+ * - Field tag number
+ * - Field type:   BOOL, BYTES, DOUBLE, ENUM, UENUM, FIXED32, FIXED64,
+ *                 FLOAT, INT32, INT64, MESSAGE, SFIXED32, SFIXED64
+ *                 SINT32, SINT64, STRING, UINT32, UINT64 or EXTENSION
+ * - Field rules:  REQUIRED, OPTIONAL or REPEATED
+ * - Allocation:   STATIC, CALLBACK or POINTER
+ * - Placement: FIRST or OTHER, depending on if this is the first field in structure.
+ * - Message name
+ * - Field name
+ * - Previous field name (or field name again for first field)
+ * - Pointer to default value or submsg fields.
+ */
+
+#define PB_FIELD(tag, type, rules, allocation, placement, message, field, prevfield, ptr) \
+        PB_ ## rules ## _ ## allocation(tag, message, field, \
+        PB_DATAOFFSET_ ## placement(message, field, prevfield), \
+        PB_LTYPE_MAP_ ## type, ptr)
+
+/* Field description for repeated static fixed count fields.*/
+#define PB_REPEATED_FIXED_COUNT(tag, type, placement, message, field, prevfield, ptr) \
+    {tag, PB_ATYPE_STATIC | PB_HTYPE_REPEATED | PB_LTYPE_MAP_ ## type, \
+    PB_DATAOFFSET_ ## placement(message, field, prevfield), \
+    0, \
+    pb_membersize(message, field[0]), \
+    pb_arraysize(message, field), ptr}
+
+/* Field description for oneof fields. This requires taking into account the
+ * union name also, that's why a separate set of macros is needed.
+ */
+#define PB_ONEOF_STATIC(u, tag, st, m, fd, ltype, ptr) \
+    {tag, PB_ATYPE_STATIC | PB_HTYPE_ONEOF | ltype, \
+    fd, pb_delta(st, which_ ## u, u.m), \
+    pb_membersize(st, u.m), 0, ptr}
+
+#define PB_ONEOF_POINTER(u, tag, st, m, fd, ltype, ptr) \
+    {tag, PB_ATYPE_POINTER | PB_HTYPE_ONEOF | ltype, \
+    fd, pb_delta(st, which_ ## u, u.m), \
+    pb_membersize(st, u.m[0]), 0, ptr}
+
+#define PB_ONEOF_FIELD(union_name, tag, type, rules, allocation, placement, message, field, prevfield, ptr) \
+        PB_ONEOF_ ## allocation(union_name, tag, message, field, \
+        PB_DATAOFFSET_ ## placement(message, union_name.field, prevfield), \
+        PB_LTYPE_MAP_ ## type, ptr)
+
+#define PB_ANONYMOUS_ONEOF_STATIC(u, tag, st, m, fd, ltype, ptr) \
+    {tag, PB_ATYPE_STATIC | PB_HTYPE_ONEOF | ltype, \
+    fd, pb_delta(st, which_ ## u, m), \
+    pb_membersize(st, m), 0, ptr}
+
+#define PB_ANONYMOUS_ONEOF_POINTER(u, tag, st, m, fd, ltype, ptr) \
+    {tag, PB_ATYPE_POINTER | PB_HTYPE_ONEOF | ltype, \
+    fd, pb_delta(st, which_ ## u, m), \
+    pb_membersize(st, m[0]), 0, ptr}
+
+#define PB_ANONYMOUS_ONEOF_FIELD(union_name, tag, type, rules, allocation, placement, message, field, prevfield, ptr) \
+        PB_ANONYMOUS_ONEOF_ ## allocation(union_name, tag, message, field, \
+        PB_DATAOFFSET_ ## placement(message, field, prevfield), \
+        PB_LTYPE_MAP_ ## type, ptr)
+
+/* These macros are used for giving out error messages.
+ * They are mostly a debugging aid; the main error information
+ * is the true/false return value from functions.
+ * Some code space can be saved by disabling the error
+ * messages if not used.
+ *
+ * PB_SET_ERROR() sets the error message if none has been set yet.
+ *                msg must be a constant string literal.
+ * PB_GET_ERROR() always returns a pointer to a string.
+ * PB_RETURN_ERROR() sets the error and returns false from current
+ *                   function.
+ */
+#ifdef PB_NO_ERRMSG
+#define PB_SET_ERROR(stream, msg) PB_UNUSED(stream)
+#define PB_GET_ERROR(stream) "(errmsg disabled)"
+#else
+#define PB_SET_ERROR(stream, msg) (stream->errmsg = (stream)->errmsg ? (stream)->errmsg : (msg))
+#define PB_GET_ERROR(stream) ((stream)->errmsg ? (stream)->errmsg : "(none)")
+#endif
+
+#define PB_RETURN_ERROR(stream, msg) return PB_SET_ERROR(stream, msg), false
+
+#endif

diff --git a/security/container/protos/nanopb/pb_common.c b/security/container/protos/nanopb/pb_common.c
new file mode 100644
index 0000000..4fb7186
--- /dev/null
+++ b/security/container/protos/nanopb/pb_common.c

@@ -0,0 +1,97 @@
+/* pb_common.c: Common support functions for pb_encode.c and pb_decode.c.
+ *
+ * 2014 Petteri Aimonen <jpa@kapsi.fi>
+ */
+
+#include "pb_common.h"
+
+bool pb_field_iter_begin(pb_field_iter_t *iter, const pb_field_t *fields, void *dest_struct)
+{
+    iter->start = fields;
+    iter->pos = fields;
+    iter->required_field_index = 0;
+    iter->dest_struct = dest_struct;
+    iter->pData = (char*)dest_struct + iter->pos->data_offset;
+    iter->pSize = (char*)iter->pData + iter->pos->size_offset;
+    
+    return (iter->pos->tag != 0);
+}
+
+bool pb_field_iter_next(pb_field_iter_t *iter)
+{
+    const pb_field_t *prev_field = iter->pos;
+
+    if (prev_field->tag == 0)
+    {
+        /* Handle empty message types, where the first field is already the terminator.
+         * In other cases, the iter->pos never points to the terminator. */
+        return false;
+    }
+    
+    iter->pos++;
+    
+    if (iter->pos->tag == 0)
+    {
+        /* Wrapped back to beginning, reinitialize */
+        (void)pb_field_iter_begin(iter, iter->start, iter->dest_struct);
+        return false;
+    }
+    else
+    {
+        /* Increment the pointers based on previous field size */
+        size_t prev_size = prev_field->data_size;
+    
+        if (PB_HTYPE(prev_field->type) == PB_HTYPE_ONEOF &&
+            PB_HTYPE(iter->pos->type) == PB_HTYPE_ONEOF &&
+            iter->pos->data_offset == PB_SIZE_MAX)
+        {
+            /* Don't advance pointers inside unions */
+            return true;
+        }
+        else if (PB_ATYPE(prev_field->type) == PB_ATYPE_STATIC &&
+                 PB_HTYPE(prev_field->type) == PB_HTYPE_REPEATED)
+        {
+            /* In static arrays, the data_size tells the size of a single entry and
+             * array_size is the number of entries */
+            prev_size *= prev_field->array_size;
+        }
+        else if (PB_ATYPE(prev_field->type) == PB_ATYPE_POINTER)
+        {
+            /* Pointer fields always have a constant size in the main structure.
+             * The data_size only applies to the dynamically allocated area. */
+            prev_size = sizeof(void*);
+        }
+
+        if (PB_HTYPE(prev_field->type) == PB_HTYPE_REQUIRED)
+        {
+            /* Count the required fields, in order to check their presence in the
+             * decoder. */
+            iter->required_field_index++;
+        }
+    
+        iter->pData = (char*)iter->pData + prev_size + iter->pos->data_offset;
+        iter->pSize = (char*)iter->pData + iter->pos->size_offset;
+        return true;
+    }
+}
+
+bool pb_field_iter_find(pb_field_iter_t *iter, uint32_t tag)
+{
+    const pb_field_t *start = iter->pos;
+    
+    do {
+        if (iter->pos->tag == tag &&
+            PB_LTYPE(iter->pos->type) != PB_LTYPE_EXTENSION)
+        {
+            /* Found the wanted field */
+            return true;
+        }
+        
+        (void)pb_field_iter_next(iter);
+    } while (iter->pos != start);
+    
+    /* Searched all the way back to start, and found nothing. */
+    return false;
+}
+
+

diff --git a/security/container/protos/nanopb/pb_common.h b/security/container/protos/nanopb/pb_common.h
new file mode 100644
index 0000000..60b3d37
--- /dev/null
+++ b/security/container/protos/nanopb/pb_common.h

@@ -0,0 +1,42 @@
+/* pb_common.h: Common support functions for pb_encode.c and pb_decode.c.
+ * These functions are rarely needed by applications directly.
+ */
+
+#ifndef PB_COMMON_H_INCLUDED
+#define PB_COMMON_H_INCLUDED
+
+#include "pb.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Iterator for pb_field_t list */
+struct pb_field_iter_s {
+    const pb_field_t *start;       /* Start of the pb_field_t array */
+    const pb_field_t *pos;         /* Current position of the iterator */
+    unsigned required_field_index; /* Zero-based index that counts only the required fields */
+    void *dest_struct;             /* Pointer to start of the structure */
+    void *pData;                   /* Pointer to current field value */
+    void *pSize;                   /* Pointer to count/has field */
+};
+typedef struct pb_field_iter_s pb_field_iter_t;
+
+/* Initialize the field iterator structure to beginning.
+ * Returns false if the message type is empty. */
+bool pb_field_iter_begin(pb_field_iter_t *iter, const pb_field_t *fields, void *dest_struct);
+
+/* Advance the iterator to the next field.
+ * Returns false when the iterator wraps back to the first field. */
+bool pb_field_iter_next(pb_field_iter_t *iter);
+
+/* Advance the iterator until it points at a field with the given tag.
+ * Returns false if no such field exists. */
+bool pb_field_iter_find(pb_field_iter_t *iter, uint32_t tag);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif
+

diff --git a/security/container/protos/nanopb/pb_decode.c b/security/container/protos/nanopb/pb_decode.c
new file mode 100644
index 0000000..4b80e81
--- /dev/null
+++ b/security/container/protos/nanopb/pb_decode.c

@@ -0,0 +1,1508 @@
+/* pb_decode.c -- decode a protobuf using minimal resources
+ *
+ * 2011 Petteri Aimonen <jpa@kapsi.fi>
+ */
+
+/* Use the GCC warn_unused_result attribute to check that all return values
+ * are propagated correctly. On other compilers and gcc before 3.4.0 just
+ * ignore the annotation.
+ */
+#if !defined(__GNUC__) || ( __GNUC__ < 3) || (__GNUC__ == 3 && __GNUC_MINOR__ < 4)
+    #define checkreturn
+#else
+    #define checkreturn __attribute__((warn_unused_result))
+#endif
+
+#include "pb.h"
+#include "pb_decode.h"
+#include "pb_common.h"
+
+/**************************************
+ * Declarations internal to this file *
+ **************************************/
+
+typedef bool (*pb_decoder_t)(pb_istream_t *stream, const pb_field_t *field, void *dest) checkreturn;
+
+static bool checkreturn buf_read(pb_istream_t *stream, pb_byte_t *buf, size_t count);
+static bool checkreturn read_raw_value(pb_istream_t *stream, pb_wire_type_t wire_type, pb_byte_t *buf, size_t *size);
+static bool checkreturn decode_static_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *iter);
+static bool checkreturn decode_callback_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *iter);
+static bool checkreturn decode_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *iter);
+static void iter_from_extension(pb_field_iter_t *iter, pb_extension_t *extension);
+static bool checkreturn default_extension_decoder(pb_istream_t *stream, pb_extension_t *extension, uint32_t tag, pb_wire_type_t wire_type);
+static bool checkreturn decode_extension(pb_istream_t *stream, uint32_t tag, pb_wire_type_t wire_type, pb_field_iter_t *iter);
+static bool checkreturn find_extension_field(pb_field_iter_t *iter);
+static void pb_field_set_to_default(pb_field_iter_t *iter);
+static void pb_message_set_to_defaults(const pb_field_t fields[], void *dest_struct);
+static bool checkreturn pb_dec_varint(pb_istream_t *stream, const pb_field_t *field, void *dest);
+static bool checkreturn pb_decode_varint32_eof(pb_istream_t *stream, uint32_t *dest, bool *eof);
+static bool checkreturn pb_dec_uvarint(pb_istream_t *stream, const pb_field_t *field, void *dest);
+static bool checkreturn pb_dec_svarint(pb_istream_t *stream, const pb_field_t *field, void *dest);
+static bool checkreturn pb_dec_fixed32(pb_istream_t *stream, const pb_field_t *field, void *dest);
+static bool checkreturn pb_dec_fixed64(pb_istream_t *stream, const pb_field_t *field, void *dest);
+static bool checkreturn pb_dec_bytes(pb_istream_t *stream, const pb_field_t *field, void *dest);
+static bool checkreturn pb_dec_string(pb_istream_t *stream, const pb_field_t *field, void *dest);
+static bool checkreturn pb_dec_submessage(pb_istream_t *stream, const pb_field_t *field, void *dest);
+static bool checkreturn pb_dec_fixed_length_bytes(pb_istream_t *stream, const pb_field_t *field, void *dest);
+static bool checkreturn pb_skip_varint(pb_istream_t *stream);
+static bool checkreturn pb_skip_string(pb_istream_t *stream);
+
+#ifdef PB_ENABLE_MALLOC
+static bool checkreturn allocate_field(pb_istream_t *stream, void *pData, size_t data_size, size_t array_size);
+static bool checkreturn pb_release_union_field(pb_istream_t *stream, pb_field_iter_t *iter);
+static void pb_release_single_field(const pb_field_iter_t *iter);
+#endif
+
+#ifdef PB_WITHOUT_64BIT
+#define pb_int64_t int32_t
+#define pb_uint64_t uint32_t
+#else
+#define pb_int64_t int64_t
+#define pb_uint64_t uint64_t
+#endif
+
+/* --- Function pointers to field decoders ---
+ * Order in the array must match pb_action_t LTYPE numbering.
+ */
+static const pb_decoder_t PB_DECODERS[PB_LTYPES_COUNT] = {
+    &pb_dec_varint,
+    &pb_dec_uvarint,
+    &pb_dec_svarint,
+    &pb_dec_fixed32,
+    &pb_dec_fixed64,
+    
+    &pb_dec_bytes,
+    &pb_dec_string,
+    &pb_dec_submessage,
+    NULL, /* extensions */
+    &pb_dec_fixed_length_bytes
+};
+
+/*******************************
+ * pb_istream_t implementation *
+ *******************************/
+
+static bool checkreturn buf_read(pb_istream_t *stream, pb_byte_t *buf, size_t count)
+{
+    size_t i;
+    const pb_byte_t *source = (const pb_byte_t*)stream->state;
+    stream->state = (pb_byte_t*)stream->state + count;
+    
+    if (buf != NULL)
+    {
+        for (i = 0; i < count; i++)
+            buf[i] = source[i];
+    }
+    
+    return true;
+}
+
+bool checkreturn pb_read(pb_istream_t *stream, pb_byte_t *buf, size_t count)
+{
+#ifndef PB_BUFFER_ONLY
+	if (buf == NULL && stream->callback != buf_read)
+	{
+		/* Skip input bytes */
+		pb_byte_t tmp[16];
+		while (count > 16)
+		{
+			if (!pb_read(stream, tmp, 16))
+				return false;
+			
+			count -= 16;
+		}
+		
+		return pb_read(stream, tmp, count);
+	}
+#endif
+
+    if (stream->bytes_left < count)
+        PB_RETURN_ERROR(stream, "end-of-stream");
+    
+#ifndef PB_BUFFER_ONLY
+    if (!stream->callback(stream, buf, count))
+        PB_RETURN_ERROR(stream, "io error");
+#else
+    if (!buf_read(stream, buf, count))
+        return false;
+#endif
+    
+    stream->bytes_left -= count;
+    return true;
+}
+
+/* Read a single byte from input stream. buf may not be NULL.
+ * This is an optimization for the varint decoding. */
+static bool checkreturn pb_readbyte(pb_istream_t *stream, pb_byte_t *buf)
+{
+    if (stream->bytes_left == 0)
+        PB_RETURN_ERROR(stream, "end-of-stream");
+
+#ifndef PB_BUFFER_ONLY
+    if (!stream->callback(stream, buf, 1))
+        PB_RETURN_ERROR(stream, "io error");
+#else
+    *buf = *(const pb_byte_t*)stream->state;
+    stream->state = (pb_byte_t*)stream->state + 1;
+#endif
+
+    stream->bytes_left--;
+    
+    return true;    
+}
+
+pb_istream_t pb_istream_from_buffer(const pb_byte_t *buf, size_t bufsize)
+{
+    pb_istream_t stream;
+    /* Cast away the const from buf without a compiler error.  We are
+     * careful to use it only in a const manner in the callbacks.
+     */
+    union {
+        void *state;
+        const void *c_state;
+    } state;
+#ifdef PB_BUFFER_ONLY
+    stream.callback = NULL;
+#else
+    stream.callback = &buf_read;
+#endif
+    state.c_state = buf;
+    stream.state = state.state;
+    stream.bytes_left = bufsize;
+#ifndef PB_NO_ERRMSG
+    stream.errmsg = NULL;
+#endif
+    return stream;
+}
+
+/********************
+ * Helper functions *
+ ********************/
+
+static bool checkreturn pb_decode_varint32_eof(pb_istream_t *stream, uint32_t *dest, bool *eof)
+{
+    pb_byte_t byte;
+    uint32_t result;
+    
+    if (!pb_readbyte(stream, &byte))
+    {
+        if (stream->bytes_left == 0)
+        {
+            if (eof)
+            {
+                *eof = true;
+            }
+        }
+
+        return false;
+    }
+    
+    if ((byte & 0x80) == 0)
+    {
+        /* Quick case, 1 byte value */
+        result = byte;
+    }
+    else
+    {
+        /* Multibyte case */
+        uint_fast8_t bitpos = 7;
+        result = byte & 0x7F;
+        
+        do
+        {
+            if (!pb_readbyte(stream, &byte))
+                return false;
+            
+            if (bitpos >= 32)
+            {
+                /* Note: The varint could have trailing 0x80 bytes, or 0xFF for negative. */
+                uint8_t sign_extension = (bitpos < 63) ? 0xFF : 0x01;
+                
+                if ((byte & 0x7F) != 0x00 && ((result >> 31) == 0 || byte != sign_extension))
+                {
+                    PB_RETURN_ERROR(stream, "varint overflow");
+                }
+            }
+            else
+            {
+                result |= (uint32_t)(byte & 0x7F) << bitpos;
+            }
+            bitpos = (uint_fast8_t)(bitpos + 7);
+        } while (byte & 0x80);
+        
+        if (bitpos == 35 && (byte & 0x70) != 0)
+        {
+            /* The last byte was at bitpos=28, so only bottom 4 bits fit. */
+            PB_RETURN_ERROR(stream, "varint overflow");
+        }
+   }
+   
+   *dest = result;
+   return true;
+}
+
+bool checkreturn pb_decode_varint32(pb_istream_t *stream, uint32_t *dest)
+{
+    return pb_decode_varint32_eof(stream, dest, NULL);
+}
+
+#ifndef PB_WITHOUT_64BIT
+bool checkreturn pb_decode_varint(pb_istream_t *stream, uint64_t *dest)
+{
+    pb_byte_t byte;
+    uint_fast8_t bitpos = 0;
+    uint64_t result = 0;
+    
+    do
+    {
+        if (bitpos >= 64)
+            PB_RETURN_ERROR(stream, "varint overflow");
+        
+        if (!pb_readbyte(stream, &byte))
+            return false;
+
+        result |= (uint64_t)(byte & 0x7F) << bitpos;
+        bitpos = (uint_fast8_t)(bitpos + 7);
+    } while (byte & 0x80);
+    
+    *dest = result;
+    return true;
+}
+#endif
+
+bool checkreturn pb_skip_varint(pb_istream_t *stream)
+{
+    pb_byte_t byte;
+    do
+    {
+        if (!pb_read(stream, &byte, 1))
+            return false;
+    } while (byte & 0x80);
+    return true;
+}
+
+bool checkreturn pb_skip_string(pb_istream_t *stream)
+{
+    uint32_t length;
+    if (!pb_decode_varint32(stream, &length))
+        return false;
+    
+    return pb_read(stream, NULL, length);
+}
+
+bool checkreturn pb_decode_tag(pb_istream_t *stream, pb_wire_type_t *wire_type, uint32_t *tag, bool *eof)
+{
+    uint32_t temp;
+    *eof = false;
+    *wire_type = (pb_wire_type_t) 0;
+    *tag = 0;
+    
+    if (!pb_decode_varint32_eof(stream, &temp, eof))
+    {
+        return false;
+    }
+    
+    if (temp == 0)
+    {
+        *eof = true; /* Special feature: allow 0-terminated messages. */
+        return false;
+    }
+    
+    *tag = temp >> 3;
+    *wire_type = (pb_wire_type_t)(temp & 7);
+    return true;
+}
+
+bool checkreturn pb_skip_field(pb_istream_t *stream, pb_wire_type_t wire_type)
+{
+    switch (wire_type)
+    {
+        case PB_WT_VARINT: return pb_skip_varint(stream);
+        case PB_WT_64BIT: return pb_read(stream, NULL, 8);
+        case PB_WT_STRING: return pb_skip_string(stream);
+        case PB_WT_32BIT: return pb_read(stream, NULL, 4);
+        default: PB_RETURN_ERROR(stream, "invalid wire_type");
+    }
+}
+
+/* Read a raw value to buffer, for the purpose of passing it to callback as
+ * a substream. Size is maximum size on call, and actual size on return.
+ */
+static bool checkreturn read_raw_value(pb_istream_t *stream, pb_wire_type_t wire_type, pb_byte_t *buf, size_t *size)
+{
+    size_t max_size = *size;
+    switch (wire_type)
+    {
+        case PB_WT_VARINT:
+            *size = 0;
+            do
+            {
+                (*size)++;
+                if (*size > max_size) return false;
+                if (!pb_read(stream, buf, 1)) return false;
+            } while (*buf++ & 0x80);
+            return true;
+            
+        case PB_WT_64BIT:
+            *size = 8;
+            return pb_read(stream, buf, 8);
+        
+        case PB_WT_32BIT:
+            *size = 4;
+            return pb_read(stream, buf, 4);
+        
+        case PB_WT_STRING:
+            /* Calling read_raw_value with a PB_WT_STRING is an error.
+             * Explicitly handle this case and fallthrough to default to avoid
+             * compiler warnings.
+             */
+
+        default: PB_RETURN_ERROR(stream, "invalid wire_type");
+    }
+}
+
+/* Decode string length from stream and return a substream with limited length.
+ * Remember to close the substream using pb_close_string_substream().
+ */
+bool checkreturn pb_make_string_substream(pb_istream_t *stream, pb_istream_t *substream)
+{
+    uint32_t size;
+    if (!pb_decode_varint32(stream, &size))
+        return false;
+    
+    *substream = *stream;
+    if (substream->bytes_left < size)
+        PB_RETURN_ERROR(stream, "parent stream too short");
+    
+    substream->bytes_left = size;
+    stream->bytes_left -= size;
+    return true;
+}
+
+bool checkreturn pb_close_string_substream(pb_istream_t *stream, pb_istream_t *substream)
+{
+    if (substream->bytes_left) {
+        if (!pb_read(substream, NULL, substream->bytes_left))
+            return false;
+    }
+
+    stream->state = substream->state;
+
+#ifndef PB_NO_ERRMSG
+    stream->errmsg = substream->errmsg;
+#endif
+    return true;
+}
+
+/*************************
+ * Decode a single field *
+ *************************/
+
+static bool checkreturn decode_static_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *iter)
+{
+    pb_type_t type;
+    pb_decoder_t func;
+    
+    type = iter->pos->type;
+    func = PB_DECODERS[PB_LTYPE(type)];
+
+    switch (PB_HTYPE(type))
+    {
+        case PB_HTYPE_REQUIRED:
+            return func(stream, iter->pos, iter->pData);
+            
+        case PB_HTYPE_OPTIONAL:
+            if (iter->pSize != iter->pData)
+                *(bool*)iter->pSize = true;
+            return func(stream, iter->pos, iter->pData);
+    
+        case PB_HTYPE_REPEATED:
+            if (wire_type == PB_WT_STRING
+                && PB_LTYPE(type) <= PB_LTYPE_LAST_PACKABLE)
+            {
+                /* Packed array */
+                bool status = true;
+                pb_size_t *size = (pb_size_t*)iter->pSize;
+
+                pb_istream_t substream;
+                if (!pb_make_string_substream(stream, &substream))
+                    return false;
+
+                while (substream.bytes_left > 0 && *size < iter->pos->array_size)
+                {
+                    void *pItem = (char*)iter->pData + iter->pos->data_size * (*size);
+                    if (!func(&substream, iter->pos, pItem))
+                    {
+                        status = false;
+                        break;
+                    }
+                    (*size)++;
+                }
+
+                if (substream.bytes_left != 0)
+                    PB_RETURN_ERROR(stream, "array overflow");
+                if (!pb_close_string_substream(stream, &substream))
+                    return false;
+
+                return status;
+            }
+            else
+            {
+                /* Repeated field */
+                pb_size_t *size = (pb_size_t*)iter->pSize;
+                char *pItem = (char*)iter->pData + iter->pos->data_size * (*size);
+
+                if ((*size)++ >= iter->pos->array_size)
+                    PB_RETURN_ERROR(stream, "array overflow");
+
+                return func(stream, iter->pos, pItem);
+            }
+
+        case PB_HTYPE_ONEOF:
+            *(pb_size_t*)iter->pSize = iter->pos->tag;
+            if (PB_LTYPE(type) == PB_LTYPE_SUBMESSAGE)
+            {
+                /* We memset to zero so that any callbacks are set to NULL.
+                 * Then set any default values. */
+                memset(iter->pData, 0, iter->pos->data_size);
+                pb_message_set_to_defaults((const pb_field_t*)iter->pos->ptr, iter->pData);
+            }
+            return func(stream, iter->pos, iter->pData);
+
+        default:
+            PB_RETURN_ERROR(stream, "invalid field type");
+    }
+}
+
+#ifdef PB_ENABLE_MALLOC
+/* Allocate storage for the field and store the pointer at iter->pData.
+ * array_size is the number of entries to reserve in an array.
+ * Zero size is not allowed, use pb_free() for releasing.
+ */
+static bool checkreturn allocate_field(pb_istream_t *stream, void *pData, size_t data_size, size_t array_size)
+{    
+    void *ptr = *(void**)pData;
+    
+    if (data_size == 0 || array_size == 0)
+        PB_RETURN_ERROR(stream, "invalid size");
+    
+    /* Check for multiplication overflows.
+     * This code avoids the costly division if the sizes are small enough.
+     * Multiplication is safe as long as only half of bits are set
+     * in either multiplicand.
+     */
+    {
+        const size_t check_limit = (size_t)1 << (sizeof(size_t) * 4);
+        if (data_size >= check_limit || array_size >= check_limit)
+        {
+            const size_t size_max = (size_t)-1;
+            if (size_max / array_size < data_size)
+            {
+                PB_RETURN_ERROR(stream, "size too large");
+            }
+        }
+    }
+    
+    /* Allocate new or expand previous allocation */
+    /* Note: on failure the old pointer will remain in the structure,
+     * the message must be freed by caller also on error return. */
+    ptr = pb_realloc(ptr, array_size * data_size);
+    if (ptr == NULL)
+        PB_RETURN_ERROR(stream, "realloc failed");
+    
+    *(void**)pData = ptr;
+    return true;
+}
+
+/* Clear a newly allocated item in case it contains a pointer, or is a submessage. */
+static void initialize_pointer_field(void *pItem, pb_field_iter_t *iter)
+{
+    if (PB_LTYPE(iter->pos->type) == PB_LTYPE_STRING ||
+        PB_LTYPE(iter->pos->type) == PB_LTYPE_BYTES)
+    {
+        *(void**)pItem = NULL;
+    }
+    else if (PB_LTYPE(iter->pos->type) == PB_LTYPE_SUBMESSAGE)
+    {
+        /* We memset to zero so that any callbacks are set to NULL.
+         * Then set any default values. */
+        memset(pItem, 0, iter->pos->data_size);
+        pb_message_set_to_defaults((const pb_field_t *) iter->pos->ptr, pItem);
+    }
+}
+#endif
+
+static bool checkreturn decode_pointer_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *iter)
+{
+#ifndef PB_ENABLE_MALLOC
+    PB_UNUSED(wire_type);
+    PB_UNUSED(iter);
+    PB_RETURN_ERROR(stream, "no malloc support");
+#else
+    pb_type_t type;
+    pb_decoder_t func;
+    
+    type = iter->pos->type;
+    func = PB_DECODERS[PB_LTYPE(type)];
+    
+    switch (PB_HTYPE(type))
+    {
+        case PB_HTYPE_REQUIRED:
+        case PB_HTYPE_OPTIONAL:
+        case PB_HTYPE_ONEOF:
+            if (PB_LTYPE(type) == PB_LTYPE_SUBMESSAGE &&
+                *(void**)iter->pData != NULL)
+            {
+                /* Duplicate field, have to release the old allocation first. */
+                pb_release_single_field(iter);
+            }
+        
+            if (PB_HTYPE(type) == PB_HTYPE_ONEOF)
+            {
+                *(pb_size_t*)iter->pSize = iter->pos->tag;
+            }
+
+            if (PB_LTYPE(type) == PB_LTYPE_STRING ||
+                PB_LTYPE(type) == PB_LTYPE_BYTES)
+            {
+                return func(stream, iter->pos, iter->pData);
+            }
+            else
+            {
+                if (!allocate_field(stream, iter->pData, iter->pos->data_size, 1))
+                    return false;
+                
+                initialize_pointer_field(*(void**)iter->pData, iter);
+                return func(stream, iter->pos, *(void**)iter->pData);
+            }
+    
+        case PB_HTYPE_REPEATED:
+            if (wire_type == PB_WT_STRING
+                && PB_LTYPE(type) <= PB_LTYPE_LAST_PACKABLE)
+            {
+                /* Packed array, multiple items come in at once. */
+                bool status = true;
+                pb_size_t *size = (pb_size_t*)iter->pSize;
+                size_t allocated_size = *size;
+                void *pItem;
+                pb_istream_t substream;
+                
+                if (!pb_make_string_substream(stream, &substream))
+                    return false;
+                
+                while (substream.bytes_left)
+                {
+                    if ((size_t)*size + 1 > allocated_size)
+                    {
+                        /* Allocate more storage. This tries to guess the
+                         * number of remaining entries. Round the division
+                         * upwards. */
+                        allocated_size += (substream.bytes_left - 1) / iter->pos->data_size + 1;
+                        
+                        if (!allocate_field(&substream, iter->pData, iter->pos->data_size, allocated_size))
+                        {
+                            status = false;
+                            break;
+                        }
+                    }
+
+                    /* Decode the array entry */
+                    pItem = *(char**)iter->pData + iter->pos->data_size * (*size);
+                    initialize_pointer_field(pItem, iter);
+                    if (!func(&substream, iter->pos, pItem))
+                    {
+                        status = false;
+                        break;
+                    }
+                    
+                    if (*size == PB_SIZE_MAX)
+                    {
+#ifndef PB_NO_ERRMSG
+                        stream->errmsg = "too many array entries";
+#endif
+                        status = false;
+                        break;
+                    }
+                    
+                    (*size)++;
+                }
+                if (!pb_close_string_substream(stream, &substream))
+                    return false;
+                
+                return status;
+            }
+            else
+            {
+                /* Normal repeated field, i.e. only one item at a time. */
+                pb_size_t *size = (pb_size_t*)iter->pSize;
+                void *pItem;
+                
+                if (*size == PB_SIZE_MAX)
+                    PB_RETURN_ERROR(stream, "too many array entries");
+                
+                (*size)++;
+                if (!allocate_field(stream, iter->pData, iter->pos->data_size, *size))
+                    return false;
+            
+                pItem = *(char**)iter->pData + iter->pos->data_size * (*size - 1);
+                initialize_pointer_field(pItem, iter);
+                return func(stream, iter->pos, pItem);
+            }
+
+        default:
+            PB_RETURN_ERROR(stream, "invalid field type");
+    }
+#endif
+}
+
+static bool checkreturn decode_callback_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *iter)
+{
+    pb_callback_t *pCallback = (pb_callback_t*)iter->pData;
+    
+#ifdef PB_OLD_CALLBACK_STYLE
+    void *arg = pCallback->arg;
+#else
+    void **arg = &(pCallback->arg);
+#endif
+    
+    if (pCallback == NULL || pCallback->funcs.decode == NULL)
+        return pb_skip_field(stream, wire_type);
+    
+    if (wire_type == PB_WT_STRING)
+    {
+        pb_istream_t substream;
+        
+        if (!pb_make_string_substream(stream, &substream))
+            return false;
+        
+        do
+        {
+            if (!pCallback->funcs.decode(&substream, iter->pos, arg))
+                PB_RETURN_ERROR(stream, "callback failed");
+        } while (substream.bytes_left);
+        
+        if (!pb_close_string_substream(stream, &substream))
+            return false;
+
+        return true;
+    }
+    else
+    {
+        /* Copy the single scalar value to stack.
+         * This is required so that we can limit the stream length,
+         * which in turn allows to use same callback for packed and
+         * not-packed fields. */
+        pb_istream_t substream;
+        pb_byte_t buffer[10];
+        size_t size = sizeof(buffer);
+        
+        if (!read_raw_value(stream, wire_type, buffer, &size))
+            return false;
+        substream = pb_istream_from_buffer(buffer, size);
+        
+        return pCallback->funcs.decode(&substream, iter->pos, arg);
+    }
+}
+
+static bool checkreturn decode_field(pb_istream_t *stream, pb_wire_type_t wire_type, pb_field_iter_t *iter)
+{
+#ifdef PB_ENABLE_MALLOC
+    /* When decoding an oneof field, check if there is old data that must be
+     * released first. */
+    if (PB_HTYPE(iter->pos->type) == PB_HTYPE_ONEOF)
+    {
+        if (!pb_release_union_field(stream, iter))
+            return false;
+    }
+#endif
+
+    switch (PB_ATYPE(iter->pos->type))
+    {
+        case PB_ATYPE_STATIC:
+            return decode_static_field(stream, wire_type, iter);
+        
+        case PB_ATYPE_POINTER:
+            return decode_pointer_field(stream, wire_type, iter);
+        
+        case PB_ATYPE_CALLBACK:
+            return decode_callback_field(stream, wire_type, iter);
+        
+        default:
+            PB_RETURN_ERROR(stream, "invalid field type");
+    }
+}
+
+static void iter_from_extension(pb_field_iter_t *iter, pb_extension_t *extension)
+{
+    /* Fake a field iterator for the extension field.
+     * It is not actually safe to advance this iterator, but decode_field
+     * will not even try to. */
+    const pb_field_t *field = (const pb_field_t*)extension->type->arg;
+    (void)pb_field_iter_begin(iter, field, extension->dest);
+    iter->pData = extension->dest;
+    iter->pSize = &extension->found;
+    
+    if (PB_ATYPE(field->type) == PB_ATYPE_POINTER)
+    {
+        /* For pointer extensions, the pointer is stored directly
+         * in the extension structure. This avoids having an extra
+         * indirection. */
+        iter->pData = &extension->dest;
+    }
+}
+
+/* Default handler for extension fields. Expects a pb_field_t structure
+ * in extension->type->arg. */
+static bool checkreturn default_extension_decoder(pb_istream_t *stream,
+    pb_extension_t *extension, uint32_t tag, pb_wire_type_t wire_type)
+{
+    const pb_field_t *field = (const pb_field_t*)extension->type->arg;
+    pb_field_iter_t iter;
+    
+    if (field->tag != tag)
+        return true;
+    
+    iter_from_extension(&iter, extension);
+    extension->found = true;
+    return decode_field(stream, wire_type, &iter);
+}
+
+/* Try to decode an unknown field as an extension field. Tries each extension
+ * decoder in turn, until one of them handles the field or loop ends. */
+static bool checkreturn decode_extension(pb_istream_t *stream,
+    uint32_t tag, pb_wire_type_t wire_type, pb_field_iter_t *iter)
+{
+    pb_extension_t *extension = *(pb_extension_t* const *)iter->pData;
+    size_t pos = stream->bytes_left;
+    
+    while (extension != NULL && pos == stream->bytes_left)
+    {
+        bool status;
+        if (extension->type->decode)
+            status = extension->type->decode(stream, extension, tag, wire_type);
+        else
+            status = default_extension_decoder(stream, extension, tag, wire_type);
+
+        if (!status)
+            return false;
+        
+        extension = extension->next;
+    }
+    
+    return true;
+}
+
+/* Step through the iterator until an extension field is found or until all
+ * entries have been checked. There can be only one extension field per
+ * message. Returns false if no extension field is found. */
+static bool checkreturn find_extension_field(pb_field_iter_t *iter)
+{
+    const pb_field_t *start = iter->pos;
+    
+    do {
+        if (PB_LTYPE(iter->pos->type) == PB_LTYPE_EXTENSION)
+            return true;
+        (void)pb_field_iter_next(iter);
+    } while (iter->pos != start);
+    
+    return false;
+}
+
+/* Initialize message fields to default values, recursively */
+static void pb_field_set_to_default(pb_field_iter_t *iter)
+{
+    pb_type_t type;
+    type = iter->pos->type;
+    
+    if (PB_LTYPE(type) == PB_LTYPE_EXTENSION)
+    {
+        pb_extension_t *ext = *(pb_extension_t* const *)iter->pData;
+        while (ext != NULL)
+        {
+            pb_field_iter_t ext_iter;
+            ext->found = false;
+            iter_from_extension(&ext_iter, ext);
+            pb_field_set_to_default(&ext_iter);
+            ext = ext->next;
+        }
+    }
+    else if (PB_ATYPE(type) == PB_ATYPE_STATIC)
+    {
+        bool init_data = true;
+        if (PB_HTYPE(type) == PB_HTYPE_OPTIONAL && iter->pSize != iter->pData)
+        {
+            /* Set has_field to false. Still initialize the optional field
+             * itself also. */
+            *(bool*)iter->pSize = false;
+        }
+        else if (PB_HTYPE(type) == PB_HTYPE_REPEATED ||
+                 PB_HTYPE(type) == PB_HTYPE_ONEOF)
+        {
+            /* REPEATED: Set array count to 0, no need to initialize contents.
+               ONEOF: Set which_field to 0. */
+            *(pb_size_t*)iter->pSize = 0;
+            init_data = false;
+        }
+
+        if (init_data)
+        {
+            if (PB_LTYPE(iter->pos->type) == PB_LTYPE_SUBMESSAGE)
+            {
+                /* Initialize submessage to defaults */
+                pb_message_set_to_defaults((const pb_field_t *) iter->pos->ptr, iter->pData);
+            }
+            else if (iter->pos->ptr != NULL)
+            {
+                /* Initialize to default value */
+                memcpy(iter->pData, iter->pos->ptr, iter->pos->data_size);
+            }
+            else
+            {
+                /* Initialize to zeros */
+                memset(iter->pData, 0, iter->pos->data_size);
+            }
+        }
+    }
+    else if (PB_ATYPE(type) == PB_ATYPE_POINTER)
+    {
+        /* Initialize the pointer to NULL. */
+        *(void**)iter->pData = NULL;
+        
+        /* Initialize array count to 0. */
+        if (PB_HTYPE(type) == PB_HTYPE_REPEATED ||
+            PB_HTYPE(type) == PB_HTYPE_ONEOF)
+        {
+            *(pb_size_t*)iter->pSize = 0;
+        }
+    }
+    else if (PB_ATYPE(type) == PB_ATYPE_CALLBACK)
+    {
+        /* Don't overwrite callback */
+    }
+}
+
+static void pb_message_set_to_defaults(const pb_field_t fields[], void *dest_struct)
+{
+    pb_field_iter_t iter;
+
+    if (!pb_field_iter_begin(&iter, fields, dest_struct))
+        return; /* Empty message type */
+    
+    do
+    {
+        pb_field_set_to_default(&iter);
+    } while (pb_field_iter_next(&iter));
+}
+
+/*********************
+ * Decode all fields *
+ *********************/
+
+bool checkreturn pb_decode_noinit(pb_istream_t *stream, const pb_field_t fields[], void *dest_struct)
+{
+    uint32_t fields_seen[(PB_MAX_REQUIRED_FIELDS + 31) / 32] = {0, 0};
+    const uint32_t allbits = ~(uint32_t)0;
+    uint32_t extension_range_start = 0;
+    pb_field_iter_t iter;
+
+    /* 'fixed_count_field' and 'fixed_count_size' track position of a repeated fixed
+     * count field. This can only handle _one_ repeated fixed count field that
+     * is unpacked and unordered among other (non repeated fixed count) fields.
+     */
+    const pb_field_t *fixed_count_field = NULL;
+    pb_size_t fixed_count_size = 0;
+
+    /* Return value ignored, as empty message types will be correctly handled by
+     * pb_field_iter_find() anyway. */
+    (void)pb_field_iter_begin(&iter, fields, dest_struct);
+
+    while (stream->bytes_left)
+    {
+        uint32_t tag;
+        pb_wire_type_t wire_type;
+        bool eof;
+
+        if (!pb_decode_tag(stream, &wire_type, &tag, &eof))
+        {
+            if (eof)
+                break;
+            else
+                return false;
+        }
+
+        if (!pb_field_iter_find(&iter, tag))
+        {
+            /* No match found, check if it matches an extension. */
+            if (tag >= extension_range_start)
+            {
+                if (!find_extension_field(&iter))
+                    extension_range_start = (uint32_t)-1;
+                else
+                    extension_range_start = iter.pos->tag;
+
+                if (tag >= extension_range_start)
+                {
+                    size_t pos = stream->bytes_left;
+
+                    if (!decode_extension(stream, tag, wire_type, &iter))
+                        return false;
+
+                    if (pos != stream->bytes_left)
+                    {
+                        /* The field was handled */
+                        continue;
+                    }
+                }
+            }
+
+            /* No match found, skip data */
+            if (!pb_skip_field(stream, wire_type))
+                return false;
+            continue;
+        }
+
+        /* If a repeated fixed count field was found, get size from
+         * 'fixed_count_field' as there is no counter contained in the struct.
+         */
+        if (PB_HTYPE(iter.pos->type) == PB_HTYPE_REPEATED
+            && iter.pSize == iter.pData)
+        {
+            if (fixed_count_field != iter.pos) {
+                /* If the new fixed count field does not match the previous one,
+                 * check that the previous one is NULL or that it finished
+                 * receiving all the expected data.
+                 */
+                if (fixed_count_field != NULL &&
+                    fixed_count_size != fixed_count_field->array_size)
+                {
+                    PB_RETURN_ERROR(stream, "wrong size for fixed count field");
+                }
+
+                fixed_count_field = iter.pos;
+                fixed_count_size = 0;
+            }
+
+            iter.pSize = &fixed_count_size;
+        }
+
+        if (PB_HTYPE(iter.pos->type) == PB_HTYPE_REQUIRED
+            && iter.required_field_index < PB_MAX_REQUIRED_FIELDS)
+        {
+            uint32_t tmp = ((uint32_t)1 << (iter.required_field_index & 31));
+            fields_seen[iter.required_field_index >> 5] |= tmp;
+        }
+
+        if (!decode_field(stream, wire_type, &iter))
+            return false;
+    }
+
+    /* Check that all elements of the last decoded fixed count field were present. */
+    if (fixed_count_field != NULL &&
+        fixed_count_size != fixed_count_field->array_size)
+    {
+        PB_RETURN_ERROR(stream, "wrong size for fixed count field");
+    }
+
+    /* Check that all required fields were present. */
+    {
+        /* First figure out the number of required fields by
+         * seeking to the end of the field array. Usually we
+         * are already close to end after decoding.
+         */
+        unsigned req_field_count;
+        pb_type_t last_type;
+        unsigned i;
+        do {
+            req_field_count = iter.required_field_index;
+            last_type = iter.pos->type;
+        } while (pb_field_iter_next(&iter));
+        
+        /* Fixup if last field was also required. */
+        if (PB_HTYPE(last_type) == PB_HTYPE_REQUIRED && iter.pos->tag != 0)
+            req_field_count++;
+        
+        if (req_field_count > PB_MAX_REQUIRED_FIELDS)
+            req_field_count = PB_MAX_REQUIRED_FIELDS;
+
+        if (req_field_count > 0)
+        {
+            /* Check the whole words */
+            for (i = 0; i < (req_field_count >> 5); i++)
+            {
+                if (fields_seen[i] != allbits)
+                    PB_RETURN_ERROR(stream, "missing required field");
+            }
+            
+            /* Check the remaining bits (if any) */
+            if ((req_field_count & 31) != 0)
+            {
+                if (fields_seen[req_field_count >> 5] !=
+                    (allbits >> (32 - (req_field_count & 31))))
+                {
+                    PB_RETURN_ERROR(stream, "missing required field");
+                }
+            }
+        }
+    }
+    
+    return true;
+}
+
+bool checkreturn pb_decode(pb_istream_t *stream, const pb_field_t fields[], void *dest_struct)
+{
+    bool status;
+    pb_message_set_to_defaults(fields, dest_struct);
+    status = pb_decode_noinit(stream, fields, dest_struct);
+    
+#ifdef PB_ENABLE_MALLOC
+    if (!status)
+        pb_release(fields, dest_struct);
+#endif
+    
+    return status;
+}
+
+bool pb_decode_delimited_noinit(pb_istream_t *stream, const pb_field_t fields[], void *dest_struct)
+{
+    pb_istream_t substream;
+    bool status;
+
+    if (!pb_make_string_substream(stream, &substream))
+        return false;
+
+    status = pb_decode_noinit(&substream, fields, dest_struct);
+
+    if (!pb_close_string_substream(stream, &substream))
+        return false;
+    return status;
+}
+
+bool pb_decode_delimited(pb_istream_t *stream, const pb_field_t fields[], void *dest_struct)
+{
+    pb_istream_t substream;
+    bool status;
+    
+    if (!pb_make_string_substream(stream, &substream))
+        return false;
+    
+    status = pb_decode(&substream, fields, dest_struct);
+
+    if (!pb_close_string_substream(stream, &substream))
+        return false;
+    return status;
+}
+
+bool pb_decode_nullterminated(pb_istream_t *stream, const pb_field_t fields[], void *dest_struct)
+{
+    /* This behaviour will be separated in nanopb-0.4.0, see issue #278. */
+    return pb_decode(stream, fields, dest_struct);
+}
+
+#ifdef PB_ENABLE_MALLOC
+/* Given an oneof field, if there has already been a field inside this oneof,
+ * release it before overwriting with a different one. */
+static bool pb_release_union_field(pb_istream_t *stream, pb_field_iter_t *iter)
+{
+    pb_size_t old_tag = *(pb_size_t*)iter->pSize; /* Previous which_ value */
+    pb_size_t new_tag = iter->pos->tag; /* New which_ value */
+
+    if (old_tag == 0)
+        return true; /* Ok, no old data in union */
+
+    if (old_tag == new_tag)
+        return true; /* Ok, old data is of same type => merge */
+
+    /* Release old data. The find can fail if the message struct contains
+     * invalid data. */
+    if (!pb_field_iter_find(iter, old_tag))
+        PB_RETURN_ERROR(stream, "invalid union tag");
+
+    pb_release_single_field(iter);
+
+    /* Restore iterator to where it should be.
+     * This shouldn't fail unless the pb_field_t structure is corrupted. */
+    if (!pb_field_iter_find(iter, new_tag))
+        PB_RETURN_ERROR(stream, "iterator error");
+    
+    return true;
+}
+
+static void pb_release_single_field(const pb_field_iter_t *iter)
+{
+    pb_type_t type;
+    type = iter->pos->type;
+
+    if (PB_HTYPE(type) == PB_HTYPE_ONEOF)
+    {
+        if (*(pb_size_t*)iter->pSize != iter->pos->tag)
+            return; /* This is not the current field in the union */
+    }
+
+    /* Release anything contained inside an extension or submsg.
+     * This has to be done even if the submsg itself is statically
+     * allocated. */
+    if (PB_LTYPE(type) == PB_LTYPE_EXTENSION)
+    {
+        /* Release fields from all extensions in the linked list */
+        pb_extension_t *ext = *(pb_extension_t**)iter->pData;
+        while (ext != NULL)
+        {
+            pb_field_iter_t ext_iter;
+            iter_from_extension(&ext_iter, ext);
+            pb_release_single_field(&ext_iter);
+            ext = ext->next;
+        }
+    }
+    else if (PB_LTYPE(type) == PB_LTYPE_SUBMESSAGE)
+    {
+        /* Release fields in submessage or submsg array */
+        void *pItem = iter->pData;
+        pb_size_t count = 1;
+        
+        if (PB_ATYPE(type) == PB_ATYPE_POINTER)
+        {
+            pItem = *(void**)iter->pData;
+        }
+        
+        if (PB_HTYPE(type) == PB_HTYPE_REPEATED)
+        {
+            if (PB_ATYPE(type) == PB_ATYPE_STATIC && iter->pSize == iter->pData) {
+                /* No _count field so use size of the array */
+                count = iter->pos->array_size;
+            } else {
+                count = *(pb_size_t*)iter->pSize;
+            }
+
+            if (PB_ATYPE(type) == PB_ATYPE_STATIC && count > iter->pos->array_size)
+            {
+                /* Protect against corrupted _count fields */
+                count = iter->pos->array_size;
+            }
+        }
+        
+        if (pItem)
+        {
+            while (count--)
+            {
+                pb_release((const pb_field_t*)iter->pos->ptr, pItem);
+                pItem = (char*)pItem + iter->pos->data_size;
+            }
+        }
+    }
+    
+    if (PB_ATYPE(type) == PB_ATYPE_POINTER)
+    {
+        if (PB_HTYPE(type) == PB_HTYPE_REPEATED &&
+            (PB_LTYPE(type) == PB_LTYPE_STRING ||
+             PB_LTYPE(type) == PB_LTYPE_BYTES))
+        {
+            /* Release entries in repeated string or bytes array */
+            void **pItem = *(void***)iter->pData;
+            pb_size_t count = *(pb_size_t*)iter->pSize;
+            while (count--)
+            {
+                pb_free(*pItem);
+                *pItem++ = NULL;
+            }
+        }
+        
+        if (PB_HTYPE(type) == PB_HTYPE_REPEATED)
+        {
+            /* We are going to release the array, so set the size to 0 */
+            *(pb_size_t*)iter->pSize = 0;
+        }
+        
+        /* Release main item */
+        pb_free(*(void**)iter->pData);
+        *(void**)iter->pData = NULL;
+    }
+}
+
+void pb_release(const pb_field_t fields[], void *dest_struct)
+{
+    pb_field_iter_t iter;
+    
+    if (!dest_struct)
+        return; /* Ignore NULL pointers, similar to free() */
+
+    if (!pb_field_iter_begin(&iter, fields, dest_struct))
+        return; /* Empty message type */
+    
+    do
+    {
+        pb_release_single_field(&iter);
+    } while (pb_field_iter_next(&iter));
+}
+#endif
+
+/* Field decoders */
+
+bool pb_decode_svarint(pb_istream_t *stream, pb_int64_t *dest)
+{
+    pb_uint64_t value;
+    if (!pb_decode_varint(stream, &value))
+        return false;
+    
+    if (value & 1)
+        *dest = (pb_int64_t)(~(value >> 1));
+    else
+        *dest = (pb_int64_t)(value >> 1);
+    
+    return true;
+}
+
+bool pb_decode_fixed32(pb_istream_t *stream, void *dest)
+{
+    pb_byte_t bytes[4];
+
+    if (!pb_read(stream, bytes, 4))
+        return false;
+    
+    *(uint32_t*)dest = ((uint32_t)bytes[0] << 0) |
+                       ((uint32_t)bytes[1] << 8) |
+                       ((uint32_t)bytes[2] << 16) |
+                       ((uint32_t)bytes[3] << 24);
+    return true;
+}
+
+#ifndef PB_WITHOUT_64BIT
+bool pb_decode_fixed64(pb_istream_t *stream, void *dest)
+{
+    pb_byte_t bytes[8];
+
+    if (!pb_read(stream, bytes, 8))
+        return false;
+    
+    *(uint64_t*)dest = ((uint64_t)bytes[0] << 0) |
+                       ((uint64_t)bytes[1] << 8) |
+                       ((uint64_t)bytes[2] << 16) |
+                       ((uint64_t)bytes[3] << 24) |
+                       ((uint64_t)bytes[4] << 32) |
+                       ((uint64_t)bytes[5] << 40) |
+                       ((uint64_t)bytes[6] << 48) |
+                       ((uint64_t)bytes[7] << 56);
+    
+    return true;
+}
+#endif
+
+static bool checkreturn pb_dec_varint(pb_istream_t *stream, const pb_field_t *field, void *dest)
+{
+    pb_uint64_t value;
+    pb_int64_t svalue;
+    pb_int64_t clamped;
+    if (!pb_decode_varint(stream, &value))
+        return false;
+    
+    /* See issue 97: Google's C++ protobuf allows negative varint values to
+     * be cast as int32_t, instead of the int64_t that should be used when
+     * encoding. Previous nanopb versions had a bug in encoding. In order to
+     * not break decoding of such messages, we cast <=32 bit fields to
+     * int32_t first to get the sign correct.
+     */
+    if (field->data_size == sizeof(pb_int64_t))
+        svalue = (pb_int64_t)value;
+    else
+        svalue = (int32_t)value;
+
+    /* Cast to the proper field size, while checking for overflows */
+    if (field->data_size == sizeof(pb_int64_t))
+        clamped = *(pb_int64_t*)dest = svalue;
+    else if (field->data_size == sizeof(int32_t))
+        clamped = *(int32_t*)dest = (int32_t)svalue;
+    else if (field->data_size == sizeof(int_least16_t))
+        clamped = *(int_least16_t*)dest = (int_least16_t)svalue;
+    else if (field->data_size == sizeof(int_least8_t))
+        clamped = *(int_least8_t*)dest = (int_least8_t)svalue;
+    else
+        PB_RETURN_ERROR(stream, "invalid data_size");
+
+    if (clamped != svalue)
+        PB_RETURN_ERROR(stream, "integer too large");
+    
+    return true;
+}
+
+static bool checkreturn pb_dec_uvarint(pb_istream_t *stream, const pb_field_t *field, void *dest)
+{
+    pb_uint64_t value, clamped;
+    if (!pb_decode_varint(stream, &value))
+        return false;
+    
+    /* Cast to the proper field size, while checking for overflows */
+    if (field->data_size == sizeof(pb_uint64_t))
+        clamped = *(pb_uint64_t*)dest = value;
+    else if (field->data_size == sizeof(uint32_t))
+        clamped = *(uint32_t*)dest = (uint32_t)value;
+    else if (field->data_size == sizeof(uint_least16_t))
+        clamped = *(uint_least16_t*)dest = (uint_least16_t)value;
+    else if (field->data_size == sizeof(uint_least8_t))
+        clamped = *(uint_least8_t*)dest = (uint_least8_t)value;
+    else
+        PB_RETURN_ERROR(stream, "invalid data_size");
+    
+    if (clamped != value)
+        PB_RETURN_ERROR(stream, "integer too large");
+
+    return true;
+}
+
+static bool checkreturn pb_dec_svarint(pb_istream_t *stream, const pb_field_t *field, void *dest)
+{
+    pb_int64_t value, clamped;
+    if (!pb_decode_svarint(stream, &value))
+        return false;
+    
+    /* Cast to the proper field size, while checking for overflows */
+    if (field->data_size == sizeof(pb_int64_t))
+        clamped = *(pb_int64_t*)dest = value;
+    else if (field->data_size == sizeof(int32_t))
+        clamped = *(int32_t*)dest = (int32_t)value;
+    else if (field->data_size == sizeof(int_least16_t))
+        clamped = *(int_least16_t*)dest = (int_least16_t)value;
+    else if (field->data_size == sizeof(int_least8_t))
+        clamped = *(int_least8_t*)dest = (int_least8_t)value;
+    else
+        PB_RETURN_ERROR(stream, "invalid data_size");
+
+    if (clamped != value)
+        PB_RETURN_ERROR(stream, "integer too large");
+    
+    return true;
+}
+
+static bool checkreturn pb_dec_fixed32(pb_istream_t *stream, const pb_field_t *field, void *dest)
+{
+    PB_UNUSED(field);
+    return pb_decode_fixed32(stream, dest);
+}
+
+static bool checkreturn pb_dec_fixed64(pb_istream_t *stream, const pb_field_t *field, void *dest)
+{
+    PB_UNUSED(field);
+#ifndef PB_WITHOUT_64BIT
+    return pb_decode_fixed64(stream, dest);
+#else
+    PB_UNUSED(dest);
+    PB_RETURN_ERROR(stream, "no 64bit support");
+#endif
+}
+
+static bool checkreturn pb_dec_bytes(pb_istream_t *stream, const pb_field_t *field, void *dest)
+{
+    uint32_t size;
+    size_t alloc_size;
+    pb_bytes_array_t *bdest;
+    
+    if (!pb_decode_varint32(stream, &size))
+        return false;
+    
+    if (size > PB_SIZE_MAX)
+        PB_RETURN_ERROR(stream, "bytes overflow");
+    
+    alloc_size = PB_BYTES_ARRAY_T_ALLOCSIZE(size);
+    if (size > alloc_size)
+        PB_RETURN_ERROR(stream, "size too large");
+    
+    if (PB_ATYPE(field->type) == PB_ATYPE_POINTER)
+    {
+#ifndef PB_ENABLE_MALLOC
+        PB_RETURN_ERROR(stream, "no malloc support");
+#else
+        if (!allocate_field(stream, dest, alloc_size, 1))
+            return false;
+        bdest = *(pb_bytes_array_t**)dest;
+#endif
+    }
+    else
+    {
+        if (alloc_size > field->data_size)
+            PB_RETURN_ERROR(stream, "bytes overflow");
+        bdest = (pb_bytes_array_t*)dest;
+    }
+
+    bdest->size = (pb_size_t)size;
+    return pb_read(stream, bdest->bytes, size);
+}
+
+static bool checkreturn pb_dec_string(pb_istream_t *stream, const pb_field_t *field, void *dest)
+{
+    uint32_t size;
+    size_t alloc_size;
+    bool status;
+    if (!pb_decode_varint32(stream, &size))
+        return false;
+    
+    /* Space for null terminator */
+    alloc_size = size + 1;
+    
+    if (alloc_size < size)
+        PB_RETURN_ERROR(stream, "size too large");
+    
+    if (PB_ATYPE(field->type) == PB_ATYPE_POINTER)
+    {
+#ifndef PB_ENABLE_MALLOC
+        PB_RETURN_ERROR(stream, "no malloc support");
+#else
+        if (!allocate_field(stream, dest, alloc_size, 1))
+            return false;
+        dest = *(void**)dest;
+#endif
+    }
+    else
+    {
+        if (alloc_size > field->data_size)
+            PB_RETURN_ERROR(stream, "string overflow");
+    }
+    
+    status = pb_read(stream, (pb_byte_t*)dest, size);
+    *((pb_byte_t*)dest + size) = 0;
+    return status;
+}
+
+static bool checkreturn pb_dec_submessage(pb_istream_t *stream, const pb_field_t *field, void *dest)
+{
+    bool status;
+    pb_istream_t substream;
+    const pb_field_t* submsg_fields = (const pb_field_t*)field->ptr;
+    
+    if (!pb_make_string_substream(stream, &substream))
+        return false;
+    
+    if (field->ptr == NULL)
+        PB_RETURN_ERROR(stream, "invalid field descriptor");
+    
+    /* New array entries need to be initialized, while required and optional
+     * submessages have already been initialized in the top-level pb_decode. */
+    if (PB_HTYPE(field->type) == PB_HTYPE_REPEATED)
+        status = pb_decode(&substream, submsg_fields, dest);
+    else
+        status = pb_decode_noinit(&substream, submsg_fields, dest);
+    
+    if (!pb_close_string_substream(stream, &substream))
+        return false;
+    return status;
+}
+
+static bool checkreturn pb_dec_fixed_length_bytes(pb_istream_t *stream, const pb_field_t *field, void *dest)
+{
+    uint32_t size;
+
+    if (!pb_decode_varint32(stream, &size))
+        return false;
+
+    if (size > PB_SIZE_MAX)
+        PB_RETURN_ERROR(stream, "bytes overflow");
+
+    if (size == 0)
+    {
+        /* As a special case, treat empty bytes string as all zeros for fixed_length_bytes. */
+        memset(dest, 0, field->data_size);
+        return true;
+    }
+
+    if (size != field->data_size)
+        PB_RETURN_ERROR(stream, "incorrect fixed length bytes size");
+
+    return pb_read(stream, (pb_byte_t*)dest, field->data_size);
+}

diff --git a/security/container/protos/nanopb/pb_decode.h b/security/container/protos/nanopb/pb_decode.h
new file mode 100644
index 0000000..398b24a
--- /dev/null
+++ b/security/container/protos/nanopb/pb_decode.h

@@ -0,0 +1,175 @@
+/* pb_decode.h: Functions to decode protocol buffers. Depends on pb_decode.c.
+ * The main function is pb_decode. You also need an input stream, and the
+ * field descriptions created by nanopb_generator.py.
+ */
+
+#ifndef PB_DECODE_H_INCLUDED
+#define PB_DECODE_H_INCLUDED
+
+#include "pb.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Structure for defining custom input streams. You will need to provide
+ * a callback function to read the bytes from your storage, which can be
+ * for example a file or a network socket.
+ * 
+ * The callback must conform to these rules:
+ *
+ * 1) Return false on IO errors. This will cause decoding to abort.
+ * 2) You can use state to store your own data (e.g. buffer pointer),
+ *    and rely on pb_read to verify that no-body reads past bytes_left.
+ * 3) Your callback may be used with substreams, in which case bytes_left
+ *    is different than from the main stream. Don't use bytes_left to compute
+ *    any pointers.
+ */
+struct pb_istream_s
+{
+#ifdef PB_BUFFER_ONLY
+    /* Callback pointer is not used in buffer-only configuration.
+     * Having an int pointer here allows binary compatibility but
+     * gives an error if someone tries to assign callback function.
+     */
+    int *callback;
+#else
+    bool (*callback)(pb_istream_t *stream, pb_byte_t *buf, size_t count);
+#endif
+
+    void *state; /* Free field for use by callback implementation */
+    size_t bytes_left;
+    
+#ifndef PB_NO_ERRMSG
+    const char *errmsg;
+#endif
+};
+
+/***************************
+ * Main decoding functions *
+ ***************************/
+ 
+/* Decode a single protocol buffers message from input stream into a C structure.
+ * Returns true on success, false on any failure.
+ * The actual struct pointed to by dest must match the description in fields.
+ * Callback fields of the destination structure must be initialized by caller.
+ * All other fields will be initialized by this function.
+ *
+ * Example usage:
+ *    MyMessage msg = {};
+ *    uint8_t buffer[64];
+ *    pb_istream_t stream;
+ *    
+ *    // ... read some data into buffer ...
+ *
+ *    stream = pb_istream_from_buffer(buffer, count);
+ *    pb_decode(&stream, MyMessage_fields, &msg);
+ */
+bool pb_decode(pb_istream_t *stream, const pb_field_t fields[], void *dest_struct);
+
+/* Same as pb_decode, except does not initialize the destination structure
+ * to default values. This is slightly faster if you need no default values
+ * and just do memset(struct, 0, sizeof(struct)) yourself.
+ *
+ * This can also be used for 'merging' two messages, i.e. update only the
+ * fields that exist in the new message.
+ *
+ * Note: If this function returns with an error, it will not release any
+ * dynamically allocated fields. You will need to call pb_release() yourself.
+ */
+bool pb_decode_noinit(pb_istream_t *stream, const pb_field_t fields[], void *dest_struct);
+
+/* Same as pb_decode, except expects the stream to start with the message size
+ * encoded as varint. Corresponds to parseDelimitedFrom() in Google's
+ * protobuf API.
+ */
+bool pb_decode_delimited(pb_istream_t *stream, const pb_field_t fields[], void *dest_struct);
+
+/* Same as pb_decode_delimited, except that it does not initialize the destination structure.
+ * See pb_decode_noinit
+ */
+bool pb_decode_delimited_noinit(pb_istream_t *stream, const pb_field_t fields[], void *dest_struct);
+
+/* Same as pb_decode, except allows the message to be terminated with a null byte.
+ * NOTE: Until nanopb-0.4.0, pb_decode() also allows null-termination. This behaviour
+ * is not supported in most other protobuf implementations, so pb_decode_delimited()
+ * is a better option for compatibility.
+ */
+bool pb_decode_nullterminated(pb_istream_t *stream, const pb_field_t fields[], void *dest_struct);
+
+#ifdef PB_ENABLE_MALLOC
+/* Release any allocated pointer fields. If you use dynamic allocation, you should
+ * call this for any successfully decoded message when you are done with it. If
+ * pb_decode() returns with an error, the message is already released.
+ */
+void pb_release(const pb_field_t fields[], void *dest_struct);
+#endif
+
+
+/**************************************
+ * Functions for manipulating streams *
+ **************************************/
+
+/* Create an input stream for reading from a memory buffer.
+ *
+ * Alternatively, you can use a custom stream that reads directly from e.g.
+ * a file or a network socket.
+ */
+pb_istream_t pb_istream_from_buffer(const pb_byte_t *buf, size_t bufsize);
+
+/* Function to read from a pb_istream_t. You can use this if you need to
+ * read some custom header data, or to read data in field callbacks.
+ */
+bool pb_read(pb_istream_t *stream, pb_byte_t *buf, size_t count);
+
+
+/************************************************
+ * Helper functions for writing field callbacks *
+ ************************************************/
+
+/* Decode the tag for the next field in the stream. Gives the wire type and
+ * field tag. At end of the message, returns false and sets eof to true. */
+bool pb_decode_tag(pb_istream_t *stream, pb_wire_type_t *wire_type, uint32_t *tag, bool *eof);
+
+/* Skip the field payload data, given the wire type. */
+bool pb_skip_field(pb_istream_t *stream, pb_wire_type_t wire_type);
+
+/* Decode an integer in the varint format. This works for bool, enum, int32,
+ * int64, uint32 and uint64 field types. */
+#ifndef PB_WITHOUT_64BIT
+bool pb_decode_varint(pb_istream_t *stream, uint64_t *dest);
+#else
+#define pb_decode_varint pb_decode_varint32
+#endif
+
+/* Decode an integer in the varint format. This works for bool, enum, int32,
+ * and uint32 field types. */
+bool pb_decode_varint32(pb_istream_t *stream, uint32_t *dest);
+
+/* Decode an integer in the zig-zagged svarint format. This works for sint32
+ * and sint64. */
+#ifndef PB_WITHOUT_64BIT
+bool pb_decode_svarint(pb_istream_t *stream, int64_t *dest);
+#else
+bool pb_decode_svarint(pb_istream_t *stream, int32_t *dest);
+#endif
+
+/* Decode a fixed32, sfixed32 or float value. You need to pass a pointer to
+ * a 4-byte wide C variable. */
+bool pb_decode_fixed32(pb_istream_t *stream, void *dest);
+
+#ifndef PB_WITHOUT_64BIT
+/* Decode a fixed64, sfixed64 or double value. You need to pass a pointer to
+ * a 8-byte wide C variable. */
+bool pb_decode_fixed64(pb_istream_t *stream, void *dest);
+#endif
+
+/* Make a limited-length substream for reading a PB_WT_STRING field. */
+bool pb_make_string_substream(pb_istream_t *stream, pb_istream_t *substream);
+bool pb_close_string_substream(pb_istream_t *stream, pb_istream_t *substream);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif

diff --git a/security/container/protos/nanopb/pb_encode.c b/security/container/protos/nanopb/pb_encode.c
new file mode 100644
index 0000000..089172c
--- /dev/null
+++ b/security/container/protos/nanopb/pb_encode.c

@@ -0,0 +1,869 @@
+/* pb_encode.c -- encode a protobuf using minimal resources
+ *
+ * 2011 Petteri Aimonen <jpa@kapsi.fi>
+ */
+
+#include "pb.h"
+#include "pb_encode.h"
+#include "pb_common.h"
+
+/* Use the GCC warn_unused_result attribute to check that all return values
+ * are propagated correctly. On other compilers and gcc before 3.4.0 just
+ * ignore the annotation.
+ */
+#if !defined(__GNUC__) || ( __GNUC__ < 3) || (__GNUC__ == 3 && __GNUC_MINOR__ < 4)
+    #define checkreturn
+#else
+    #define checkreturn __attribute__((warn_unused_result))
+#endif
+
+/**************************************
+ * Declarations internal to this file *
+ **************************************/
+typedef bool (*pb_encoder_t)(pb_ostream_t *stream, const pb_field_t *field, const void *src) checkreturn;
+
+static bool checkreturn buf_write(pb_ostream_t *stream, const pb_byte_t *buf, size_t count);
+static bool checkreturn encode_array(pb_ostream_t *stream, const pb_field_t *field, const void *pData, size_t count, pb_encoder_t func);
+static bool checkreturn encode_field(pb_ostream_t *stream, const pb_field_t *field, const void *pData);
+static bool checkreturn default_extension_encoder(pb_ostream_t *stream, const pb_extension_t *extension);
+static bool checkreturn encode_extension_field(pb_ostream_t *stream, const pb_field_t *field, const void *pData);
+static void *pb_const_cast(const void *p);
+static bool checkreturn pb_enc_varint(pb_ostream_t *stream, const pb_field_t *field, const void *src);
+static bool checkreturn pb_enc_uvarint(pb_ostream_t *stream, const pb_field_t *field, const void *src);
+static bool checkreturn pb_enc_svarint(pb_ostream_t *stream, const pb_field_t *field, const void *src);
+static bool checkreturn pb_enc_fixed32(pb_ostream_t *stream, const pb_field_t *field, const void *src);
+static bool checkreturn pb_enc_fixed64(pb_ostream_t *stream, const pb_field_t *field, const void *src);
+static bool checkreturn pb_enc_bytes(pb_ostream_t *stream, const pb_field_t *field, const void *src);
+static bool checkreturn pb_enc_string(pb_ostream_t *stream, const pb_field_t *field, const void *src);
+static bool checkreturn pb_enc_submessage(pb_ostream_t *stream, const pb_field_t *field, const void *src);
+static bool checkreturn pb_enc_fixed_length_bytes(pb_ostream_t *stream, const pb_field_t *field, const void *src);
+
+#ifdef PB_WITHOUT_64BIT
+#define pb_int64_t int32_t
+#define pb_uint64_t uint32_t
+
+static bool checkreturn pb_encode_negative_varint(pb_ostream_t *stream, pb_uint64_t value);
+#else
+#define pb_int64_t int64_t
+#define pb_uint64_t uint64_t
+#endif
+
+/* --- Function pointers to field encoders ---
+ * Order in the array must match pb_action_t LTYPE numbering.
+ */
+static const pb_encoder_t PB_ENCODERS[PB_LTYPES_COUNT] = {
+    &pb_enc_varint,
+    &pb_enc_uvarint,
+    &pb_enc_svarint,
+    &pb_enc_fixed32,
+    &pb_enc_fixed64,
+    
+    &pb_enc_bytes,
+    &pb_enc_string,
+    &pb_enc_submessage,
+    NULL, /* extensions */
+    &pb_enc_fixed_length_bytes
+};
+
+/*******************************
+ * pb_ostream_t implementation *
+ *******************************/
+
+static bool checkreturn buf_write(pb_ostream_t *stream, const pb_byte_t *buf, size_t count)
+{
+    size_t i;
+    pb_byte_t *dest = (pb_byte_t*)stream->state;
+    stream->state = dest + count;
+    
+    for (i = 0; i < count; i++)
+        dest[i] = buf[i];
+    
+    return true;
+}
+
+pb_ostream_t pb_ostream_from_buffer(pb_byte_t *buf, size_t bufsize)
+{
+    pb_ostream_t stream;
+#ifdef PB_BUFFER_ONLY
+    stream.callback = (void*)1; /* Just a marker value */
+#else
+    stream.callback = &buf_write;
+#endif
+    stream.state = buf;
+    stream.max_size = bufsize;
+    stream.bytes_written = 0;
+#ifndef PB_NO_ERRMSG
+    stream.errmsg = NULL;
+#endif
+    return stream;
+}
+
+bool checkreturn pb_write(pb_ostream_t *stream, const pb_byte_t *buf, size_t count)
+{
+    if (stream->callback != NULL)
+    {
+        if (stream->bytes_written + count > stream->max_size)
+            PB_RETURN_ERROR(stream, "stream full");
+
+#ifdef PB_BUFFER_ONLY
+        if (!buf_write(stream, buf, count))
+            PB_RETURN_ERROR(stream, "io error");
+#else        
+        if (!stream->callback(stream, buf, count))
+            PB_RETURN_ERROR(stream, "io error");
+#endif
+    }
+    
+    stream->bytes_written += count;
+    return true;
+}
+
+/*************************
+ * Encode a single field *
+ *************************/
+
+/* Encode a static array. Handles the size calculations and possible packing. */
+static bool checkreturn encode_array(pb_ostream_t *stream, const pb_field_t *field,
+                         const void *pData, size_t count, pb_encoder_t func)
+{
+    size_t i;
+    const void *p;
+    size_t size;
+    
+    if (count == 0)
+        return true;
+
+    if (PB_ATYPE(field->type) != PB_ATYPE_POINTER && count > field->array_size)
+        PB_RETURN_ERROR(stream, "array max size exceeded");
+    
+    /* We always pack arrays if the datatype allows it. */
+    if (PB_LTYPE(field->type) <= PB_LTYPE_LAST_PACKABLE)
+    {
+        if (!pb_encode_tag(stream, PB_WT_STRING, field->tag))
+            return false;
+        
+        /* Determine the total size of packed array. */
+        if (PB_LTYPE(field->type) == PB_LTYPE_FIXED32)
+        {
+            size = 4 * count;
+        }
+        else if (PB_LTYPE(field->type) == PB_LTYPE_FIXED64)
+        {
+            size = 8 * count;
+        }
+        else
+        { 
+            pb_ostream_t sizestream = PB_OSTREAM_SIZING;
+            p = pData;
+            for (i = 0; i < count; i++)
+            {
+                if (!func(&sizestream, field, p))
+                    return false;
+                p = (const char*)p + field->data_size;
+            }
+            size = sizestream.bytes_written;
+        }
+        
+        if (!pb_encode_varint(stream, (pb_uint64_t)size))
+            return false;
+        
+        if (stream->callback == NULL)
+            return pb_write(stream, NULL, size); /* Just sizing.. */
+        
+        /* Write the data */
+        p = pData;
+        for (i = 0; i < count; i++)
+        {
+            if (!func(stream, field, p))
+                return false;
+            p = (const char*)p + field->data_size;
+        }
+    }
+    else
+    {
+        p = pData;
+        for (i = 0; i < count; i++)
+        {
+            if (!pb_encode_tag_for_field(stream, field))
+                return false;
+
+            /* Normally the data is stored directly in the array entries, but
+             * for pointer-type string and bytes fields, the array entries are
+             * actually pointers themselves also. So we have to dereference once
+             * more to get to the actual data. */
+            if (PB_ATYPE(field->type) == PB_ATYPE_POINTER &&
+                (PB_LTYPE(field->type) == PB_LTYPE_STRING ||
+                 PB_LTYPE(field->type) == PB_LTYPE_BYTES))
+            {
+                if (!func(stream, field, *(const void* const*)p))
+                    return false;
+            }
+            else
+            {
+                if (!func(stream, field, p))
+                    return false;
+            }
+            p = (const char*)p + field->data_size;
+        }
+    }
+    
+    return true;
+}
+
+/* In proto3, all fields are optional and are only encoded if their value is "non-zero".
+ * This function implements the check for the zero value. */
+static bool pb_check_proto3_default_value(const pb_field_t *field, const void *pData)
+{
+    pb_type_t type = field->type;
+    const void *pSize = (const char*)pData + field->size_offset;
+
+    if (PB_HTYPE(type) == PB_HTYPE_REQUIRED)
+    {
+        /* Required proto2 fields inside proto3 submessage, pretty rare case */
+        return false;
+    }
+    else if (PB_HTYPE(type) == PB_HTYPE_REPEATED)
+    {
+        /* Repeated fields inside proto3 submessage: present if count != 0 */
+        return *(const pb_size_t*)pSize == 0;
+    }
+    else if (PB_HTYPE(type) == PB_HTYPE_ONEOF)
+    {
+        /* Oneof fields */
+        return *(const pb_size_t*)pSize == 0;
+    }
+    else if (PB_HTYPE(type) == PB_HTYPE_OPTIONAL && field->size_offset)
+    {
+        /* Proto2 optional fields inside proto3 submessage */
+        return *(const bool*)pSize == false;
+    }
+
+    /* Rest is proto3 singular fields */
+
+    if (PB_ATYPE(type) == PB_ATYPE_STATIC)
+    {
+        if (PB_LTYPE(type) == PB_LTYPE_BYTES)
+        {
+            const pb_bytes_array_t *bytes = (const pb_bytes_array_t*)pData;
+            return bytes->size == 0;
+        }
+        else if (PB_LTYPE(type) == PB_LTYPE_STRING)
+        {
+            return *(const char*)pData == '\0';
+        }
+        else if (PB_LTYPE(type) == PB_LTYPE_FIXED_LENGTH_BYTES)
+        {
+            /* Fixed length bytes is only empty if its length is fixed
+             * as 0. Which would be pretty strange, but we can check
+             * it anyway. */
+            return field->data_size == 0;
+        }
+        else if (PB_LTYPE(type) == PB_LTYPE_SUBMESSAGE)
+        {
+            /* Check all fields in the submessage to find if any of them
+             * are non-zero. The comparison cannot be done byte-per-byte
+             * because the C struct may contain padding bytes that must
+             * be skipped.
+             */
+            pb_field_iter_t iter;
+            if (pb_field_iter_begin(&iter, (const pb_field_t*)field->ptr, pb_const_cast(pData)))
+            {
+                do
+                {
+                    if (!pb_check_proto3_default_value(iter.pos, iter.pData))
+                    {
+                        return false;
+                    }
+                } while (pb_field_iter_next(&iter));
+            }
+            return true;
+        }
+    }
+    
+	{
+	    /* Catch-all branch that does byte-per-byte comparison for zero value.
+	     *
+	     * This is for all pointer fields, and for static PB_LTYPE_VARINT,
+	     * UVARINT, SVARINT, FIXED32, FIXED64, EXTENSION fields, and also
+	     * callback fields. These all have integer or pointer value which
+	     * can be compared with 0.
+	     */
+	    pb_size_t i;
+	    const char *p = (const char*)pData;
+	    for (i = 0; i < field->data_size; i++)
+	    {
+	        if (p[i] != 0)
+	        {
+	            return false;
+	        }
+	    }
+
+	    return true;
+	}
+}
+
+/* Encode a field with static or pointer allocation, i.e. one whose data
+ * is available to the encoder directly. */
+static bool checkreturn encode_basic_field(pb_ostream_t *stream,
+    const pb_field_t *field, const void *pData)
+{
+    pb_encoder_t func;
+    bool implicit_has;
+    const void *pSize = &implicit_has;
+    
+    func = PB_ENCODERS[PB_LTYPE(field->type)];
+    
+    if (field->size_offset)
+    {
+        /* Static optional, repeated or oneof field */
+        pSize = (const char*)pData + field->size_offset;
+    }
+    else if (PB_HTYPE(field->type) == PB_HTYPE_OPTIONAL)
+    {
+        /* Proto3 style field, optional but without explicit has_ field. */
+        implicit_has = !pb_check_proto3_default_value(field, pData);
+    }
+    else
+    {
+        /* Required field, always present */
+        implicit_has = true;
+    }
+
+    if (PB_ATYPE(field->type) == PB_ATYPE_POINTER)
+    {
+        /* pData is a pointer to the field, which contains pointer to
+         * the data. If the 2nd pointer is NULL, it is interpreted as if
+         * the has_field was false.
+         */
+        pData = *(const void* const*)pData;
+        implicit_has = (pData != NULL);
+    }
+
+    switch (PB_HTYPE(field->type))
+    {
+        case PB_HTYPE_REQUIRED:
+            if (!pData)
+                PB_RETURN_ERROR(stream, "missing required field");
+            if (!pb_encode_tag_for_field(stream, field))
+                return false;
+            if (!func(stream, field, pData))
+                return false;
+            break;
+        
+        case PB_HTYPE_OPTIONAL:
+            if (*(const bool*)pSize)
+            {
+                if (!pb_encode_tag_for_field(stream, field))
+                    return false;
+            
+                if (!func(stream, field, pData))
+                    return false;
+            }
+            break;
+        
+        case PB_HTYPE_REPEATED: {
+            pb_size_t count;
+            if (field->size_offset != 0) {
+                count = *(const pb_size_t*)pSize;
+            } else {
+                count = field->array_size;
+            }
+            if (!encode_array(stream, field, pData, count, func))
+                return false;
+            break;
+        }
+        
+        case PB_HTYPE_ONEOF:
+            if (*(const pb_size_t*)pSize == field->tag)
+            {
+                if (!pb_encode_tag_for_field(stream, field))
+                    return false;
+
+                if (!func(stream, field, pData))
+                    return false;
+            }
+            break;
+            
+        default:
+            PB_RETURN_ERROR(stream, "invalid field type");
+    }
+    
+    return true;
+}
+
+/* Encode a field with callback semantics. This means that a user function is
+ * called to provide and encode the actual data. */
+static bool checkreturn encode_callback_field(pb_ostream_t *stream,
+    const pb_field_t *field, const void *pData)
+{
+    const pb_callback_t *callback = (const pb_callback_t*)pData;
+    
+#ifdef PB_OLD_CALLBACK_STYLE
+    const void *arg = callback->arg;
+#else
+    void * const *arg = &(callback->arg);
+#endif    
+    
+    if (callback->funcs.encode != NULL)
+    {
+        if (!callback->funcs.encode(stream, field, arg))
+            PB_RETURN_ERROR(stream, "callback error");
+    }
+    return true;
+}
+
+/* Encode a single field of any callback or static type. */
+static bool checkreturn encode_field(pb_ostream_t *stream,
+    const pb_field_t *field, const void *pData)
+{
+    switch (PB_ATYPE(field->type))
+    {
+        case PB_ATYPE_STATIC:
+        case PB_ATYPE_POINTER:
+            return encode_basic_field(stream, field, pData);
+        
+        case PB_ATYPE_CALLBACK:
+            return encode_callback_field(stream, field, pData);
+        
+        default:
+            PB_RETURN_ERROR(stream, "invalid field type");
+    }
+}
+
+/* Default handler for extension fields. Expects to have a pb_field_t
+ * pointer in the extension->type->arg field. */
+static bool checkreturn default_extension_encoder(pb_ostream_t *stream,
+    const pb_extension_t *extension)
+{
+    const pb_field_t *field = (const pb_field_t*)extension->type->arg;
+    
+    if (PB_ATYPE(field->type) == PB_ATYPE_POINTER)
+    {
+        /* For pointer extensions, the pointer is stored directly
+         * in the extension structure. This avoids having an extra
+         * indirection. */
+        return encode_field(stream, field, &extension->dest);
+    }
+    else
+    {
+        return encode_field(stream, field, extension->dest);
+    }
+}
+
+/* Walk through all the registered extensions and give them a chance
+ * to encode themselves. */
+static bool checkreturn encode_extension_field(pb_ostream_t *stream,
+    const pb_field_t *field, const void *pData)
+{
+    const pb_extension_t *extension = *(const pb_extension_t* const *)pData;
+    PB_UNUSED(field);
+    
+    while (extension)
+    {
+        bool status;
+        if (extension->type->encode)
+            status = extension->type->encode(stream, extension);
+        else
+            status = default_extension_encoder(stream, extension);
+
+        if (!status)
+            return false;
+        
+        extension = extension->next;
+    }
+    
+    return true;
+}
+
+/*********************
+ * Encode all fields *
+ *********************/
+
+static void *pb_const_cast(const void *p)
+{
+    /* Note: this casts away const, in order to use the common field iterator
+     * logic for both encoding and decoding. */
+    union {
+        void *p1;
+        const void *p2;
+    } t;
+    t.p2 = p;
+    return t.p1;
+}
+
+bool checkreturn pb_encode(pb_ostream_t *stream, const pb_field_t fields[], const void *src_struct)
+{
+    pb_field_iter_t iter;
+    if (!pb_field_iter_begin(&iter, fields, pb_const_cast(src_struct)))
+        return true; /* Empty message type */
+    
+    do {
+        if (PB_LTYPE(iter.pos->type) == PB_LTYPE_EXTENSION)
+        {
+            /* Special case for the extension field placeholder */
+            if (!encode_extension_field(stream, iter.pos, iter.pData))
+                return false;
+        }
+        else
+        {
+            /* Regular field */
+            if (!encode_field(stream, iter.pos, iter.pData))
+                return false;
+        }
+    } while (pb_field_iter_next(&iter));
+    
+    return true;
+}
+
+bool pb_encode_delimited(pb_ostream_t *stream, const pb_field_t fields[], const void *src_struct)
+{
+    return pb_encode_submessage(stream, fields, src_struct);
+}
+
+bool pb_encode_nullterminated(pb_ostream_t *stream, const pb_field_t fields[], const void *src_struct)
+{
+    const pb_byte_t zero = 0;
+
+    if (!pb_encode(stream, fields, src_struct))
+        return false;
+
+    return pb_write(stream, &zero, 1);
+}
+
+bool pb_get_encoded_size(size_t *size, const pb_field_t fields[], const void *src_struct)
+{
+    pb_ostream_t stream = PB_OSTREAM_SIZING;
+    
+    if (!pb_encode(&stream, fields, src_struct))
+        return false;
+    
+    *size = stream.bytes_written;
+    return true;
+}
+
+/********************
+ * Helper functions *
+ ********************/
+
+#ifdef PB_WITHOUT_64BIT
+bool checkreturn pb_encode_negative_varint(pb_ostream_t *stream, pb_uint64_t value)
+{
+  pb_byte_t buffer[10];
+  size_t i = 0;
+  size_t compensation = 32;/* we need to compensate 32 bits all set to 1 */
+
+  while (value)
+  {
+    buffer[i] = (pb_byte_t)((value & 0x7F) | 0x80);
+    value >>= 7;
+    if (compensation)
+    {
+      /* re-set all the compensation bits we can or need */
+      size_t bits = compensation > 7 ? 7 : compensation;
+      value ^= (pb_uint64_t)((0xFFu >> (8 - bits)) << 25); /* set the number of bits needed on the lowest of the most significant 7 bits */
+      compensation -= bits;
+    }
+    i++;
+  }
+  buffer[i - 1] &= 0x7F; /* Unset top bit on last byte */
+
+  return pb_write(stream, buffer, i);
+}
+#endif
+
+bool checkreturn pb_encode_varint(pb_ostream_t *stream, pb_uint64_t value)
+{
+    pb_byte_t buffer[10];
+    size_t i = 0;
+    
+    if (value <= 0x7F)
+    {
+        pb_byte_t v = (pb_byte_t)value;
+        return pb_write(stream, &v, 1);
+    }
+    
+    while (value)
+    {
+        buffer[i] = (pb_byte_t)((value & 0x7F) | 0x80);
+        value >>= 7;
+        i++;
+    }
+    buffer[i-1] &= 0x7F; /* Unset top bit on last byte */
+    
+    return pb_write(stream, buffer, i);
+}
+
+bool checkreturn pb_encode_svarint(pb_ostream_t *stream, pb_int64_t value)
+{
+    pb_uint64_t zigzagged;
+    if (value < 0)
+        zigzagged = ~((pb_uint64_t)value << 1);
+    else
+        zigzagged = (pb_uint64_t)value << 1;
+    
+    return pb_encode_varint(stream, zigzagged);
+}
+
+bool checkreturn pb_encode_fixed32(pb_ostream_t *stream, const void *value)
+{
+    uint32_t val = *(const uint32_t*)value;
+    pb_byte_t bytes[4];
+    bytes[0] = (pb_byte_t)(val & 0xFF);
+    bytes[1] = (pb_byte_t)((val >> 8) & 0xFF);
+    bytes[2] = (pb_byte_t)((val >> 16) & 0xFF);
+    bytes[3] = (pb_byte_t)((val >> 24) & 0xFF);
+    return pb_write(stream, bytes, 4);
+}
+
+#ifndef PB_WITHOUT_64BIT
+bool checkreturn pb_encode_fixed64(pb_ostream_t *stream, const void *value)
+{
+    uint64_t val = *(const uint64_t*)value;
+    pb_byte_t bytes[8];
+    bytes[0] = (pb_byte_t)(val & 0xFF);
+    bytes[1] = (pb_byte_t)((val >> 8) & 0xFF);
+    bytes[2] = (pb_byte_t)((val >> 16) & 0xFF);
+    bytes[3] = (pb_byte_t)((val >> 24) & 0xFF);
+    bytes[4] = (pb_byte_t)((val >> 32) & 0xFF);
+    bytes[5] = (pb_byte_t)((val >> 40) & 0xFF);
+    bytes[6] = (pb_byte_t)((val >> 48) & 0xFF);
+    bytes[7] = (pb_byte_t)((val >> 56) & 0xFF);
+    return pb_write(stream, bytes, 8);
+}
+#endif
+
+bool checkreturn pb_encode_tag(pb_ostream_t *stream, pb_wire_type_t wiretype, uint32_t field_number)
+{
+    pb_uint64_t tag = ((pb_uint64_t)field_number << 3) | wiretype;
+    return pb_encode_varint(stream, tag);
+}
+
+bool checkreturn pb_encode_tag_for_field(pb_ostream_t *stream, const pb_field_t *field)
+{
+    pb_wire_type_t wiretype;
+    switch (PB_LTYPE(field->type))
+    {
+        case PB_LTYPE_VARINT:
+        case PB_LTYPE_UVARINT:
+        case PB_LTYPE_SVARINT:
+            wiretype = PB_WT_VARINT;
+            break;
+        
+        case PB_LTYPE_FIXED32:
+            wiretype = PB_WT_32BIT;
+            break;
+        
+        case PB_LTYPE_FIXED64:
+            wiretype = PB_WT_64BIT;
+            break;
+        
+        case PB_LTYPE_BYTES:
+        case PB_LTYPE_STRING:
+        case PB_LTYPE_SUBMESSAGE:
+        case PB_LTYPE_FIXED_LENGTH_BYTES:
+            wiretype = PB_WT_STRING;
+            break;
+        
+        default:
+            PB_RETURN_ERROR(stream, "invalid field type");
+    }
+    
+    return pb_encode_tag(stream, wiretype, field->tag);
+}
+
+bool checkreturn pb_encode_string(pb_ostream_t *stream, const pb_byte_t *buffer, size_t size)
+{
+    if (!pb_encode_varint(stream, (pb_uint64_t)size))
+        return false;
+    
+    return pb_write(stream, buffer, size);
+}
+
+bool checkreturn pb_encode_submessage(pb_ostream_t *stream, const pb_field_t fields[], const void *src_struct)
+{
+    /* First calculate the message size using a non-writing substream. */
+    pb_ostream_t substream = PB_OSTREAM_SIZING;
+    size_t size;
+    bool status;
+    
+    if (!pb_encode(&substream, fields, src_struct))
+    {
+#ifndef PB_NO_ERRMSG
+        stream->errmsg = substream.errmsg;
+#endif
+        return false;
+    }
+    
+    size = substream.bytes_written;
+    
+    if (!pb_encode_varint(stream, (pb_uint64_t)size))
+        return false;
+    
+    if (stream->callback == NULL)
+        return pb_write(stream, NULL, size); /* Just sizing */
+    
+    if (stream->bytes_written + size > stream->max_size)
+        PB_RETURN_ERROR(stream, "stream full");
+        
+    /* Use a substream to verify that a callback doesn't write more than
+     * what it did the first time. */
+    substream.callback = stream->callback;
+    substream.state = stream->state;
+    substream.max_size = size;
+    substream.bytes_written = 0;
+#ifndef PB_NO_ERRMSG
+    substream.errmsg = NULL;
+#endif
+    
+    status = pb_encode(&substream, fields, src_struct);
+    
+    stream->bytes_written += substream.bytes_written;
+    stream->state = substream.state;
+#ifndef PB_NO_ERRMSG
+    stream->errmsg = substream.errmsg;
+#endif
+    
+    if (substream.bytes_written != size)
+        PB_RETURN_ERROR(stream, "submsg size changed");
+    
+    return status;
+}
+
+/* Field encoders */
+
+static bool checkreturn pb_enc_varint(pb_ostream_t *stream, const pb_field_t *field, const void *src)
+{
+    pb_int64_t value = 0;
+    
+    if (field->data_size == sizeof(int_least8_t))
+        value = *(const int_least8_t*)src;
+    else if (field->data_size == sizeof(int_least16_t))
+        value = *(const int_least16_t*)src;
+    else if (field->data_size == sizeof(int32_t))
+        value = *(const int32_t*)src;
+    else if (field->data_size == sizeof(pb_int64_t))
+        value = *(const pb_int64_t*)src;
+    else
+        PB_RETURN_ERROR(stream, "invalid data_size");
+    
+#ifdef PB_WITHOUT_64BIT
+    if (value < 0)
+      return pb_encode_negative_varint(stream, (pb_uint64_t)value);
+    else
+#endif
+      return pb_encode_varint(stream, (pb_uint64_t)value);
+}
+
+static bool checkreturn pb_enc_uvarint(pb_ostream_t *stream, const pb_field_t *field, const void *src)
+{
+    pb_uint64_t value = 0;
+    
+    if (field->data_size == sizeof(uint_least8_t))
+        value = *(const uint_least8_t*)src;
+    else if (field->data_size == sizeof(uint_least16_t))
+        value = *(const uint_least16_t*)src;
+    else if (field->data_size == sizeof(uint32_t))
+        value = *(const uint32_t*)src;
+    else if (field->data_size == sizeof(pb_uint64_t))
+        value = *(const pb_uint64_t*)src;
+    else
+        PB_RETURN_ERROR(stream, "invalid data_size");
+    
+    return pb_encode_varint(stream, value);
+}
+
+static bool checkreturn pb_enc_svarint(pb_ostream_t *stream, const pb_field_t *field, const void *src)
+{
+    pb_int64_t value = 0;
+    
+    if (field->data_size == sizeof(int_least8_t))
+        value = *(const int_least8_t*)src;
+    else if (field->data_size == sizeof(int_least16_t))
+        value = *(const int_least16_t*)src;
+    else if (field->data_size == sizeof(int32_t))
+        value = *(const int32_t*)src;
+    else if (field->data_size == sizeof(pb_int64_t))
+        value = *(const pb_int64_t*)src;
+    else
+        PB_RETURN_ERROR(stream, "invalid data_size");
+    
+    return pb_encode_svarint(stream, value);
+}
+
+static bool checkreturn pb_enc_fixed64(pb_ostream_t *stream, const pb_field_t *field, const void *src)
+{
+    PB_UNUSED(field);
+#ifndef PB_WITHOUT_64BIT
+    return pb_encode_fixed64(stream, src);
+#else
+    PB_UNUSED(src);
+    PB_RETURN_ERROR(stream, "no 64bit support");
+#endif
+}
+
+static bool checkreturn pb_enc_fixed32(pb_ostream_t *stream, const pb_field_t *field, const void *src)
+{
+    PB_UNUSED(field);
+    return pb_encode_fixed32(stream, src);
+}
+
+static bool checkreturn pb_enc_bytes(pb_ostream_t *stream, const pb_field_t *field, const void *src)
+{
+    const pb_bytes_array_t *bytes = NULL;
+
+    bytes = (const pb_bytes_array_t*)src;
+    
+    if (src == NULL)
+    {
+        /* Treat null pointer as an empty bytes field */
+        return pb_encode_string(stream, NULL, 0);
+    }
+    
+    if (PB_ATYPE(field->type) == PB_ATYPE_STATIC &&
+        PB_BYTES_ARRAY_T_ALLOCSIZE(bytes->size) > field->data_size)
+    {
+        PB_RETURN_ERROR(stream, "bytes size exceeded");
+    }
+    
+    return pb_encode_string(stream, bytes->bytes, bytes->size);
+}
+
+static bool checkreturn pb_enc_string(pb_ostream_t *stream, const pb_field_t *field, const void *src)
+{
+    size_t size = 0;
+    size_t max_size = field->data_size;
+    const char *p = (const char*)src;
+    
+    if (PB_ATYPE(field->type) == PB_ATYPE_POINTER)
+        max_size = (size_t)-1;
+
+    if (src == NULL)
+    {
+        size = 0; /* Treat null pointer as an empty string */
+    }
+    else
+    {
+        /* strnlen() is not always available, so just use a loop */
+        while (size < max_size && *p != '\0')
+        {
+            size++;
+            p++;
+        }
+    }
+
+    return pb_encode_string(stream, (const pb_byte_t*)src, size);
+}
+
+static bool checkreturn pb_enc_submessage(pb_ostream_t *stream, const pb_field_t *field, const void *src)
+{
+    if (field->ptr == NULL)
+        PB_RETURN_ERROR(stream, "invalid field descriptor");
+    
+    return pb_encode_submessage(stream, (const pb_field_t*)field->ptr, src);
+}
+
+static bool checkreturn pb_enc_fixed_length_bytes(pb_ostream_t *stream, const pb_field_t *field, const void *src)
+{
+    return pb_encode_string(stream, (const pb_byte_t*)src, field->data_size);
+}
+

diff --git a/security/container/protos/nanopb/pb_encode.h b/security/container/protos/nanopb/pb_encode.h
new file mode 100644
index 0000000..8bf78dd
--- /dev/null
+++ b/security/container/protos/nanopb/pb_encode.h

@@ -0,0 +1,170 @@
+/* pb_encode.h: Functions to encode protocol buffers. Depends on pb_encode.c.
+ * The main function is pb_encode. You also need an output stream, and the
+ * field descriptions created by nanopb_generator.py.
+ */
+
+#ifndef PB_ENCODE_H_INCLUDED
+#define PB_ENCODE_H_INCLUDED
+
+#include "pb.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Structure for defining custom output streams. You will need to provide
+ * a callback function to write the bytes to your storage, which can be
+ * for example a file or a network socket.
+ *
+ * The callback must conform to these rules:
+ *
+ * 1) Return false on IO errors. This will cause encoding to abort.
+ * 2) You can use state to store your own data (e.g. buffer pointer).
+ * 3) pb_write will update bytes_written after your callback runs.
+ * 4) Substreams will modify max_size and bytes_written. Don't use them
+ *    to calculate any pointers.
+ */
+struct pb_ostream_s
+{
+#ifdef PB_BUFFER_ONLY
+    /* Callback pointer is not used in buffer-only configuration.
+     * Having an int pointer here allows binary compatibility but
+     * gives an error if someone tries to assign callback function.
+     * Also, NULL pointer marks a 'sizing stream' that does not
+     * write anything.
+     */
+    int *callback;
+#else
+    bool (*callback)(pb_ostream_t *stream, const pb_byte_t *buf, size_t count);
+#endif
+    void *state;          /* Free field for use by callback implementation. */
+    size_t max_size;      /* Limit number of output bytes written (or use SIZE_MAX). */
+    size_t bytes_written; /* Number of bytes written so far. */
+    
+#ifndef PB_NO_ERRMSG
+    const char *errmsg;
+#endif
+};
+
+/***************************
+ * Main encoding functions *
+ ***************************/
+
+/* Encode a single protocol buffers message from C structure into a stream.
+ * Returns true on success, false on any failure.
+ * The actual struct pointed to by src_struct must match the description in fields.
+ * All required fields in the struct are assumed to have been filled in.
+ *
+ * Example usage:
+ *    MyMessage msg = {};
+ *    uint8_t buffer[64];
+ *    pb_ostream_t stream;
+ *
+ *    msg.field1 = 42;
+ *    stream = pb_ostream_from_buffer(buffer, sizeof(buffer));
+ *    pb_encode(&stream, MyMessage_fields, &msg);
+ */
+bool pb_encode(pb_ostream_t *stream, const pb_field_t fields[], const void *src_struct);
+
+/* Same as pb_encode, but prepends the length of the message as a varint.
+ * Corresponds to writeDelimitedTo() in Google's protobuf API.
+ */
+bool pb_encode_delimited(pb_ostream_t *stream, const pb_field_t fields[], const void *src_struct);
+
+/* Same as pb_encode, but appends a null byte to the message for termination.
+ * NOTE: This behaviour is not supported in most other protobuf implementations, so pb_encode_delimited()
+ * is a better option for compatibility.
+ */
+bool pb_encode_nullterminated(pb_ostream_t *stream, const pb_field_t fields[], const void *src_struct);
+
+/* Encode the message to get the size of the encoded data, but do not store
+ * the data. */
+bool pb_get_encoded_size(size_t *size, const pb_field_t fields[], const void *src_struct);
+
+/**************************************
+ * Functions for manipulating streams *
+ **************************************/
+
+/* Create an output stream for writing into a memory buffer.
+ * The number of bytes written can be found in stream.bytes_written after
+ * encoding the message.
+ *
+ * Alternatively, you can use a custom stream that writes directly to e.g.
+ * a file or a network socket.
+ */
+pb_ostream_t pb_ostream_from_buffer(pb_byte_t *buf, size_t bufsize);
+
+/* Pseudo-stream for measuring the size of a message without actually storing
+ * the encoded data.
+ * 
+ * Example usage:
+ *    MyMessage msg = {};
+ *    pb_ostream_t stream = PB_OSTREAM_SIZING;
+ *    pb_encode(&stream, MyMessage_fields, &msg);
+ *    printf("Message size is %d\n", stream.bytes_written);
+ */
+#ifndef PB_NO_ERRMSG
+#define PB_OSTREAM_SIZING {0,0,0,0,0}
+#else
+#define PB_OSTREAM_SIZING {0,0,0,0}
+#endif
+
+/* Function to write into a pb_ostream_t stream. You can use this if you need
+ * to append or prepend some custom headers to the message.
+ */
+bool pb_write(pb_ostream_t *stream, const pb_byte_t *buf, size_t count);
+
+
+/************************************************
+ * Helper functions for writing field callbacks *
+ ************************************************/
+
+/* Encode field header based on type and field number defined in the field
+ * structure. Call this from the callback before writing out field contents. */
+bool pb_encode_tag_for_field(pb_ostream_t *stream, const pb_field_t *field);
+
+/* Encode field header by manually specifing wire type. You need to use this
+ * if you want to write out packed arrays from a callback field. */
+bool pb_encode_tag(pb_ostream_t *stream, pb_wire_type_t wiretype, uint32_t field_number);
+
+/* Encode an integer in the varint format.
+ * This works for bool, enum, int32, int64, uint32 and uint64 field types. */
+#ifndef PB_WITHOUT_64BIT
+bool pb_encode_varint(pb_ostream_t *stream, uint64_t value);
+#else
+bool pb_encode_varint(pb_ostream_t *stream, uint32_t value);
+#endif
+
+/* Encode an integer in the zig-zagged svarint format.
+ * This works for sint32 and sint64. */
+#ifndef PB_WITHOUT_64BIT
+bool pb_encode_svarint(pb_ostream_t *stream, int64_t value);
+#else
+bool pb_encode_svarint(pb_ostream_t *stream, int32_t value);
+#endif
+
+/* Encode a string or bytes type field. For strings, pass strlen(s) as size. */
+bool pb_encode_string(pb_ostream_t *stream, const pb_byte_t *buffer, size_t size);
+
+/* Encode a fixed32, sfixed32 or float value.
+ * You need to pass a pointer to a 4-byte wide C variable. */
+bool pb_encode_fixed32(pb_ostream_t *stream, const void *value);
+
+#ifndef PB_WITHOUT_64BIT
+/* Encode a fixed64, sfixed64 or double value.
+ * You need to pass a pointer to a 8-byte wide C variable. */
+bool pb_encode_fixed64(pb_ostream_t *stream, const void *value);
+#endif
+
+/* Encode a submessage field.
+ * You need to pass the pb_field_t array and pointer to struct, just like
+ * with pb_encode(). This internally encodes the submessage twice, first to
+ * calculate message size and then to actually write it out.
+ */
+bool pb_encode_submessage(pb_ostream_t *stream, const pb_field_t fields[], const void *src_struct);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif

diff --git a/security/container/protos/pbsystem.h b/security/container/protos/pbsystem.h
new file mode 100644
index 0000000..f2308f8
--- /dev/null
+++ b/security/container/protos/pbsystem.h

@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Header and types for nanopb to work with the Linux kernel */
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+/* Small types.  */
+
+/* Signed.  */
+typedef signed char		int_least8_t;
+typedef short int		int_least16_t;
+typedef int			int_least32_t;
+typedef long int		int_least64_t;
+
+/* Unsigned.  */
+typedef unsigned char		uint_least8_t;
+typedef unsigned short int	uint_least16_t;
+typedef unsigned int		uint_least32_t;
+typedef unsigned long int	uint_least64_t;
+
+/* Fast types.  */
+
+/* Signed.  */
+typedef signed char		int_fast8_t;
+typedef long int		int_fast16_t;
+typedef long int		int_fast32_t;
+typedef long int		int_fast64_t;
+
+/* Unsigned.  */
+typedef unsigned char		uint_fast8_t;
+typedef unsigned long int	uint_fast16_t;
+typedef unsigned long int	uint_fast32_t;
+typedef unsigned long int	uint_fast64_t;

diff --git a/security/container/vsock.c b/security/container/vsock.c
new file mode 100644
index 0000000..8d1710239
--- /dev/null
+++ b/security/container/vsock.c

@@ -0,0 +1,559 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Container Security Monitor module
+ *
+ * Copyright (c) 2018 Google, Inc
+ */
+
+#include "monitor.h"
+
+#include <net/net_namespace.h>
+#include <net/vsock_addr.h>
+#include <net/sock.h>
+#include <linux/socket.h>
+#include <linux/workqueue.h>
+#include <linux/jiffies.h>
+#include <linux/mutex.h>
+#include <linux/version.h>
+#include <linux/kthread.h>
+#include <linux/printk.h>
+#include <linux/delay.h>
+#include <linux/timekeeping.h>
+
+/*
+ * virtio vsocket over which to send events to the host.
+ * NULL if monitoring is disabled, or if the socket was disconnected and we're
+ * trying to reconnect to the host.
+ */
+static struct socket *csm_vsocket;
+
+/* reconnect delay */
+#define CSM_RECONNECT_FREQ_MSEC 5000
+
+/* config pull delay */
+#define CSM_CONFIG_FREQ_MSEC 1000
+
+/* vsock receive attempts and delay until giving up */
+#define CSM_RECV_ATTEMPTS 2
+#define CSM_RECV_DELAY_MSEC 100
+
+/* heartbeat work */
+#define CSM_HEARTBEAT_FREQ msecs_to_jiffies(5000)
+static void csm_heartbeat(struct work_struct *work);
+static DECLARE_DELAYED_WORK(csm_heartbeat_work, csm_heartbeat);
+
+/* csm protobuf work */
+static void csm_sendmsg_pipe_handler(struct work_struct *work);
+
+/* csm message work container*/
+struct msg_work_data {
+	struct work_struct msg_work;
+	size_t pos_bytes_written;
+	char msg[];
+};
+
+/* size used for the config error message. */
+#define CSM_ERROR_BUF_SIZE 40
+
+/* Running thread to manage vsock connections. */
+static struct task_struct *socket_thread;
+
+/* Mutex to ensure sequential dumping of protos */
+static DEFINE_MUTEX(protodump);
+
+static struct socket *csm_create_socket(void)
+{
+	int err;
+	struct sockaddr_vm host_addr;
+	struct socket *sock;
+
+	err = sock_create_kern(&init_net, AF_VSOCK, SOCK_STREAM, 0,
+			       &sock);
+	if (err) {
+		pr_debug("error creating AF_VSOCK socket: %d\n", err);
+		return ERR_PTR(err);
+	}
+
+	vsock_addr_init(&host_addr, VMADDR_CID_HYPERVISOR, CSM_HOST_PORT);
+
+	err = kernel_connect(sock, (struct sockaddr *)&host_addr,
+			     sizeof(host_addr), 0);
+	if (err) {
+		if (err != -ECONNRESET) {
+			pr_debug("error connecting AF_VSOCK socket to host port %u: %d\n",
+				CSM_HOST_PORT, err);
+		}
+		goto error_release;
+	}
+
+	return sock;
+
+error_release:
+	sock_release(sock);
+	return ERR_PTR(err);
+}
+
+static void csm_destroy_socket(void)
+{
+	down_write(&csm_rwsem_vsocket);
+	if (csm_vsocket) {
+		sock_release(csm_vsocket);
+		csm_vsocket = NULL;
+	}
+	up_write(&csm_rwsem_vsocket);
+}
+
+static int csm_vsock_sendmsg(struct kvec *vecs, size_t vecs_size,
+			     size_t total_length)
+{
+	struct msghdr msg = { };
+	int res = -EPIPE;
+
+	if (!cmdline_boot_vsock_enabled)
+		return 0;
+
+	down_read(&csm_rwsem_vsocket);
+	if (csm_vsocket) {
+		res = kernel_sendmsg(csm_vsocket, &msg, vecs, vecs_size,
+				     total_length);
+		if (res > 0)
+			res = 0;
+	}
+	up_read(&csm_rwsem_vsocket);
+
+	return res;
+}
+
+static ssize_t csm_user_pipe_write(struct kvec *vecs, size_t vecs_size,
+				   size_t total_length)
+{
+	ssize_t perr = 0;
+	struct iov_iter io = { };
+	loff_t pos = 0;
+	struct pipe_inode_info *pipe;
+	unsigned int readers;
+
+	if (!csm_user_write_pipe)
+		return 0;
+
+	down_read(&csm_rwsem_pipe);
+
+	if (csm_user_write_pipe == NULL)
+		goto end;
+
+	/* The pipe info is the same for reader and write files. */
+	pipe = get_pipe_info(csm_user_write_pipe);
+
+	/* If nobody is listening, don't write events. */
+	readers = READ_ONCE(pipe->readers);
+	if (readers <= 1) {
+		WARN_ON(readers == 0);
+		goto end;
+	}
+
+
+	iov_iter_kvec(&io, ITER_KVEC|WRITE, vecs, vecs_size,
+		      total_length);
+
+	file_start_write(csm_user_write_pipe);
+	perr = vfs_iter_write(csm_user_write_pipe, &io, &pos, 0);
+	file_end_write(csm_user_write_pipe);
+
+end:
+	up_read(&csm_rwsem_pipe);
+	return perr;
+}
+
+static int csm_sendmsg(int type, const void *buf, size_t len)
+{
+	struct csm_msg_hdr hdr = {
+		.msg_type = cpu_to_le32(type),
+		.msg_length = cpu_to_le32(sizeof(hdr) + len),
+	};
+	struct kvec vecs[] = {
+		{
+			.iov_base = &hdr,
+			.iov_len = sizeof(hdr),
+		}, {
+			.iov_base = (void *)buf,
+			.iov_len = len,
+		}
+	};
+	int res;
+	ssize_t perr;
+
+	res = csm_vsock_sendmsg(vecs, ARRAY_SIZE(vecs),
+				le32_to_cpu(hdr.msg_length));
+	if (res < 0) {
+		pr_warn_ratelimited("sendmsg error (msg_type=%d, msg_length=%u): %d\n",
+				    type, le32_to_cpu(hdr.msg_length), res);
+	}
+
+	perr = csm_user_pipe_write(vecs, ARRAY_SIZE(vecs),
+				   le32_to_cpu(hdr.msg_length));
+	if (perr < 0) {
+		pr_warn_ratelimited("vfs_iter_write error (msg_type=%d, msg_length=%u): %zd\n",
+				    type, le32_to_cpu(hdr.msg_length), perr);
+	}
+
+	/* If one of them failed, increase the stats once. */
+	if (res < 0 || perr < 0)
+		csm_stats.event_writing_failed++;
+
+	return res;
+}
+
+static bool csm_get_expected_size(size_t *size, const pb_field_t fields[],
+				    const void *src_struct)
+{
+	schema_Event *event;
+
+	if (fields != schema_Event_fields)
+		goto other;
+
+	/* Size above 99% of the 100 containers tested running k8s. */
+	event = (schema_Event *)src_struct;
+	switch (event->which_event) {
+	case schema_Event_execute_tag:
+		*size = 3344;
+		return true;
+	case schema_Event_memexec_tag:
+		*size = 176;
+		return true;
+	case schema_Event_clone_tag:
+		*size = 50;
+		return true;
+	case schema_Event_exit_tag:
+		*size = 30;
+		return true;
+	}
+
+other:
+	/* If unknown, do the pre-computation. */
+	return pb_get_encoded_size(size, fields, src_struct);
+}
+
+static struct msg_work_data *csm_encodeproto(size_t size,
+					     const pb_field_t fields[],
+					     const void *src_struct)
+{
+	pb_ostream_t pos;
+	struct msg_work_data *wd;
+	size_t total;
+
+	total = size + sizeof(*wd);
+	if (total < size)
+		return ERR_PTR(-EINVAL);
+
+	wd = kmalloc(total, GFP_KERNEL);
+	if (!wd)
+		return ERR_PTR(-ENOMEM);
+
+	pos = pb_ostream_from_buffer(wd->msg, size);
+	if (!pb_encode(&pos, fields, src_struct)) {
+		kfree(wd);
+		return ERR_PTR(-EINVAL);
+	}
+
+	INIT_WORK(&wd->msg_work, csm_sendmsg_pipe_handler);
+	wd->pos_bytes_written = pos.bytes_written;
+	return wd;
+}
+
+static int csm_sendproto(int type, const pb_field_t fields[],
+			 const void *src_struct)
+{
+	int err = 0;
+	size_t size, previous_size;
+	struct msg_work_data *wd;
+
+	/* Use the expected size first. */
+	if (!csm_get_expected_size(&size, fields, src_struct))
+		return -EINVAL;
+
+	wd = csm_encodeproto(size, fields, src_struct);
+	if (unlikely(IS_ERR(wd))) {
+		/* If it failed, retry with the exact size. */
+		csm_stats.size_picking_failed++;
+		previous_size = size;
+
+		if (!pb_get_encoded_size(&size, fields, src_struct))
+			return -EINVAL;
+
+		wd = csm_encodeproto(size, fields, src_struct);
+		if (IS_ERR(wd)) {
+			csm_stats.proto_encoding_failed++;
+			return PTR_ERR(wd);
+		}
+
+		pr_debug("size picking failed %lu vs %lu\n", previous_size,
+			 size);
+	}
+
+	/* The work handler takes care of cleanup, if successfully scheduled. */
+	if (likely(schedule_work(&wd->msg_work)))
+		return 0;
+
+	csm_stats.workqueue_failed++;
+	pr_err_ratelimited("Sent msg to workqueue unsuccessfully (assume dropped).\n");
+
+	kfree(wd);
+	return err;
+}
+
+static void csm_sendmsg_pipe_handler(struct work_struct *work)
+{
+	int err;
+	int type = CSM_MSG_EVENT_PROTO;
+	struct msg_work_data *wd = container_of(work, struct msg_work_data,
+						msg_work);
+
+	err = csm_sendmsg(type, wd->msg, wd->pos_bytes_written);
+	if (err)
+		pr_err_ratelimited("csm_sendmsg failed in work handler %s\n",
+				   __func__);
+
+	kfree(wd);
+}
+
+int csm_sendeventproto(const pb_field_t fields[], schema_Event *event)
+{
+	/* Last check before generating and sending an event. */
+	if (!csm_enabled)
+		return -ENOTSUPP;
+
+	event->timestamp = ktime_get_real_ns();
+
+	return csm_sendproto(CSM_MSG_EVENT_PROTO, fields, event);
+}
+
+int csm_sendconfigrespproto(const pb_field_t fields[],
+			    schema_ConfigurationResponse *resp)
+{
+	return csm_sendproto(CSM_MSG_CONFIG_RESPONSE_PROTO, fields, resp);
+}
+
+static void csm_heartbeat(struct work_struct *work)
+{
+	csm_sendmsg(CSM_MSG_TYPE_HEARTBEAT, NULL, 0);
+	schedule_delayed_work(&csm_heartbeat_work, CSM_HEARTBEAT_FREQ);
+}
+
+static int config_send_response(int err)
+{
+	char buf[CSM_ERROR_BUF_SIZE] = {};
+	schema_ConfigurationResponse resp =
+		schema_ConfigurationResponse_init_zero;
+
+	resp.error = schema_ConfigurationResponse_ErrorCode_NO_ERROR;
+	resp.version = CSM_VERSION;
+	resp.kernel_version = LINUX_VERSION_CODE;
+
+	if (err) {
+		resp.error = schema_ConfigurationResponse_ErrorCode_UNKNOWN;
+		snprintf(buf, sizeof(buf) - 1, "error code: %d", err);
+		resp.msg.funcs.encode = pb_encode_string_field;
+		resp.msg.arg = buf;
+	}
+
+	return csm_sendconfigrespproto(schema_ConfigurationResponse_fields,
+				       &resp);
+}
+
+static int csm_recvmsg(void *buf, size_t len, bool expected)
+{
+	int err = 0;
+	struct msghdr msg = {};
+	struct kvec vecs;
+	size_t pos = 0;
+	size_t attempts = 0;
+
+	while (pos < len) {
+		vecs.iov_base = (char *)buf + pos;
+		vecs.iov_len = len - pos;
+
+		down_read(&csm_rwsem_vsocket);
+		if (csm_vsocket) {
+			err = kernel_recvmsg(csm_vsocket, &msg, &vecs, 1, len,
+					     MSG_DONTWAIT);
+		} else {
+			pr_err("csm_vsocket was unset while the config thread was running\n");
+			err = -ENOENT;
+		}
+		up_read(&csm_rwsem_vsocket);
+
+		if (err == 0) {
+			err = -ENOTCONN;
+			pr_warn_ratelimited("vsock connection was reset\n");
+			break;
+		}
+
+		if (err == -EAGAIN) {
+			/*
+			 * If nothing is received and nothing was expected
+			 * just bail.
+			 */
+			if (!expected && pos == 0) {
+				err = -EAGAIN;
+				break;
+			}
+
+			/*
+			 * If we missing data after multiple attempts
+			 * reset the connection.
+			 */
+			if (++attempts >= CSM_RECV_ATTEMPTS) {
+				err = -EPIPE;
+				break;
+			}
+
+			msleep(CSM_RECV_DELAY_MSEC);
+			continue;
+		}
+
+		if (err < 0) {
+			pr_err_ratelimited("kernel_recvmsg failed with %d\n",
+					   err);
+			break;
+		}
+
+		pos += err;
+	}
+
+	return err;
+}
+
+/*
+ * Listen for configuration until connection is closed or desynchronize.
+ * If something wrong happens while parsing the packet buffer that may
+ * desynchronize the thread with the backend, the connection is reset.
+ */
+
+static void listen_configuration(void *buf)
+{
+	int err;
+	struct csm_msg_hdr hdr = {};
+	uint32_t msg_type, msg_length;
+
+	pr_debug("listening for configuration messages\n");
+
+	while (true) {
+		err = csm_recvmsg(&hdr, sizeof(hdr), false);
+
+		/* Nothing available, wait and try again. */
+		if (err == -EAGAIN) {
+			msleep(CSM_CONFIG_FREQ_MSEC);
+			continue;
+		}
+
+		if (err < 0)
+			break;
+
+		msg_type = le32_to_cpu(hdr.msg_type);
+
+		if (msg_type != CSM_MSG_CONFIG_REQUEST_PROTO) {
+			pr_warn_ratelimited("unexpected message type: %d\n",
+					    msg_type);
+			break;
+		}
+
+		msg_length = le32_to_cpu(hdr.msg_length);
+
+		if (msg_length <= sizeof(hdr) || msg_length > PAGE_SIZE) {
+			pr_warn_ratelimited("unexpected message length: %d\n",
+					    msg_length);
+			break;
+		}
+
+		/* The message length include the size of the header. */
+		msg_length -= sizeof(hdr);
+
+		err = csm_recvmsg(buf, msg_length, true);
+		if (err < 0) {
+			pr_warn_ratelimited("failed to gather configuration: %d\n",
+					    err);
+			break;
+		}
+
+		err = csm_update_config_from_buffer(buf, msg_length);
+		if (err < 0) {
+			/*
+			 * Warn of the error but continue listening for
+			 * configuration changes.
+			 */
+			pr_warn_ratelimited("config update failed: %d\n", err);
+		} else {
+			pr_debug("config received and applied\n");
+		}
+
+		err = config_send_response(err);
+		if (err < 0) {
+			pr_err_ratelimited("config response failed: %d\n", err);
+			break;
+		}
+
+		pr_debug("config response sent\n");
+	}
+}
+
+/* Thread managing connection and listening for new configurations. */
+static int socket_thread_fn(void *unsued)
+{
+	void *buf;
+	struct socket *sock;
+
+	/* One page should be enough for current configurations. */
+	buf = (void *)__get_free_page(GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	while (true) {
+		sock = csm_create_socket();
+		if (IS_ERR(sock)) {
+			pr_debug("unable to connect to host (port %u), will retry in %u ms\n",
+				 CSM_HOST_PORT, CSM_RECONNECT_FREQ_MSEC);
+			msleep(CSM_RECONNECT_FREQ_MSEC);
+			continue;
+		}
+
+		down_write(&csm_rwsem_vsocket);
+		csm_vsocket = sock;
+		up_write(&csm_rwsem_vsocket);
+
+		schedule_delayed_work(&csm_heartbeat_work, 0);
+
+		listen_configuration(buf);
+
+		pr_warn("vsock state incorrect, disconnecting. Messages will be lost.\n");
+
+		cancel_delayed_work_sync(&csm_heartbeat_work);
+		csm_destroy_socket();
+	}
+
+	return 0;
+}
+
+void __init vsock_destroy(void)
+{
+	if (socket_thread) {
+		kthread_stop(socket_thread);
+		socket_thread = NULL;
+	}
+}
+
+int __init vsock_initialize(void)
+{
+	struct task_struct *task;
+
+	if (cmdline_boot_vsock_enabled) {
+		task = kthread_run(socket_thread_fn, NULL, "csm-vsock-thread");
+		if (IS_ERR(task)) {
+			pr_err("failed to create socket thread: %ld\n", PTR_ERR(task));
+			vsock_destroy();
+			return PTR_ERR(task);
+		}
+
+		socket_thread = task;
+	}
+	return 0;
+}

diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c
index 0716af2..379fcb8 100644
--- a/security/loadpin/loadpin.c
+++ b/security/loadpin/loadpin.c

@@ -45,6 +45,8 @@
 }
 
 static int enabled = IS_ENABLED(CONFIG_SECURITY_LOADPIN_ENABLED);
+static char *exclude_read_files[READING_MAX_ID];
+static int ignore_read_file_id[READING_MAX_ID] __ro_after_init;
 static struct super_block *pinned_root;
 static DEFINE_SPINLOCK(pinned_root_spinlock);
 
@@ -126,6 +128,13 @@
 	struct super_block *load_root;
 	const char *origin = kernel_read_file_id_str(id);
 
+	/* If the file id is excluded, ignore the pinning. */
+	if ((unsigned int)id < ARRAY_SIZE(ignore_read_file_id) &&
+	    ignore_read_file_id[id]) {
+		report_load(origin, file, "pinning-excluded");
+		return 0;
+	}
+
 	/* This handles the older init_module API that has a NULL file. */
 	if (!file) {
 		if (!enabled) {
@@ -184,12 +193,51 @@
 	LSM_HOOK_INIT(kernel_load_data, loadpin_load_data),
 };
 
+static void __init parse_exclude(void)
+{
+	int i, j;
+	char *cur;
+
+	/*
+	 * Make sure all the arrays stay within expected sizes. This
+	 * is slightly weird because kernel_read_file_str[] includes
+	 * READING_MAX_ID, which isn't actually meaningful here.
+	 */
+	BUILD_BUG_ON(ARRAY_SIZE(exclude_read_files) !=
+		     ARRAY_SIZE(ignore_read_file_id));
+	BUILD_BUG_ON(ARRAY_SIZE(kernel_read_file_str) <
+		     ARRAY_SIZE(ignore_read_file_id));
+
+	for (i = 0; i < ARRAY_SIZE(exclude_read_files); i++) {
+		cur = exclude_read_files[i];
+		if (!cur)
+			break;
+		if (*cur == '\0')
+			continue;
+
+		for (j = 0; j < ARRAY_SIZE(ignore_read_file_id); j++) {
+			if (strcmp(cur, kernel_read_file_str[j]) == 0) {
+				pr_info("excluding: %s\n",
+					kernel_read_file_str[j]);
+				ignore_read_file_id[j] = 1;
+				/*
+				 * Can not break, because one read_file_str
+				 * may map to more than on read_file_id.
+				 */
+			}
+		}
+	}
+}
+
 void __init loadpin_add_hooks(void)
 {
 	pr_info("ready to pin (currently %sabled)", enabled ? "en" : "dis");
+	parse_exclude();
 	security_add_hooks(loadpin_hooks, ARRAY_SIZE(loadpin_hooks), "loadpin");
 }
 
 /* Should not be mutable after boot, so not listed in sysfs (perm == 0). */
 module_param(enabled, int, 0);
 MODULE_PARM_DESC(enabled, "Pin module/firmware loading (default: true)");
+module_param_array_named(exclude, exclude_read_files, charp, NULL, 0);
+MODULE_PARM_DESC(exclude, "Exclude pinning specific read file types");

diff --git a/security/security.c b/security/security.c
index 9478444..82dcca2 100644
--- a/security/security.c
+++ b/security/security.c

@@ -885,6 +885,11 @@
 	call_void_hook(file_free_security, file);
 }
 
+void security_file_pre_free(struct file *file)
+{
+	call_void_hook(file_pre_free_security, file);
+}
+
 int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	return call_int_hook(file_ioctl, 0, file, cmd, arg);
@@ -987,6 +992,11 @@
 	return call_int_hook(task_alloc, 0, task, clone_flags);
 }
 
+void security_task_post_alloc(struct task_struct *task)
+{
+	call_void_hook(task_post_alloc, task);
+}
+
 void security_task_free(struct task_struct *task)
 {
 	call_void_hook(task_free, task);
@@ -1156,6 +1166,11 @@
 	return call_int_hook(task_kill, 0, p, info, sig, cred);
 }
 
+void security_task_exit(struct task_struct *p)
+{
+	call_void_hook(task_exit, p);
+}
+
 int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 			 unsigned long arg4, unsigned long arg5)
 {

diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index 5de18a6..d52be7b 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c

@@ -496,7 +496,7 @@
 	if (likely(!audited))
 		return 0;
 	return slow_avc_audit(state, ssid, tsid, tclass, requested,
-			audited, denied, result, ad, 0);
+			audited, denied, result, ad);
 }
 
 static void avc_node_free(struct rcu_head *rhead)
@@ -689,40 +689,37 @@
 	struct avc_node *pos, *node = NULL;
 	int hvalue;
 	unsigned long flag;
+	spinlock_t *lock;
+	struct hlist_head *head;
 
 	if (avc_latest_notif_update(avc, avd->seqno, 1))
-		goto out;
+		return NULL;
 
 	node = avc_alloc_node(avc);
-	if (node) {
-		struct hlist_head *head;
-		spinlock_t *lock;
-		int rc = 0;
+	if (!node)
+		return NULL;
 
-		hvalue = avc_hash(ssid, tsid, tclass);
-		avc_node_populate(node, ssid, tsid, tclass, avd);
-		rc = avc_xperms_populate(node, xp_node);
-		if (rc) {
-			kmem_cache_free(avc_node_cachep, node);
-			return NULL;
-		}
-		head = &avc->avc_cache.slots[hvalue];
-		lock = &avc->avc_cache.slots_lock[hvalue];
-
-		spin_lock_irqsave(lock, flag);
-		hlist_for_each_entry(pos, head, list) {
-			if (pos->ae.ssid == ssid &&
-			    pos->ae.tsid == tsid &&
-			    pos->ae.tclass == tclass) {
-				avc_node_replace(avc, node, pos);
-				goto found;
-			}
-		}
-		hlist_add_head_rcu(&node->list, head);
-found:
-		spin_unlock_irqrestore(lock, flag);
+	avc_node_populate(node, ssid, tsid, tclass, avd);
+	if (avc_xperms_populate(node, xp_node)) {
+		avc_node_kill(avc, node);
+		return NULL;
 	}
-out:
+
+	hvalue = avc_hash(ssid, tsid, tclass);
+	head = &avc->avc_cache.slots[hvalue];
+	lock = &avc->avc_cache.slots_lock[hvalue];
+	spin_lock_irqsave(lock, flag);
+	hlist_for_each_entry(pos, head, list) {
+		if (pos->ae.ssid == ssid &&
+			pos->ae.tsid == tsid &&
+			pos->ae.tclass == tclass) {
+			avc_node_replace(avc, node, pos);
+			goto found;
+		}
+	}
+	hlist_add_head_rcu(&node->list, head);
+found:
+	spin_unlock_irqrestore(lock, flag);
 	return node;
 }
 
@@ -766,8 +763,7 @@
 noinline int slow_avc_audit(struct selinux_state *state,
 			    u32 ssid, u32 tsid, u16 tclass,
 			    u32 requested, u32 audited, u32 denied, int result,
-			    struct common_audit_data *a,
-			    unsigned int flags)
+			    struct common_audit_data *a)
 {
 	struct common_audit_data stack_data;
 	struct selinux_audit_data sad;
@@ -777,17 +773,6 @@
 		a->type = LSM_AUDIT_DATA_NONE;
 	}
 
-	/*
-	 * When in a RCU walk do the audit on the RCU retry.  This is because
-	 * the collection of the dname in an inode audit message is not RCU
-	 * safe.  Note this may drop some audits when the situation changes
-	 * during retry. However this is logically just as if the operation
-	 * happened a little later.
-	 */
-	if ((a->type == LSM_AUDIT_DATA_INODE) &&
-	    (flags & MAY_NOT_BLOCK))
-		return -ECHILD;
-
 	sad.tclass = tclass;
 	sad.requested = requested;
 	sad.ssid = ssid;
@@ -860,16 +845,14 @@
 	/*
 	 * If we are in a non-blocking code path, e.g. VFS RCU walk,
 	 * then we must not add permissions to a cache entry
-	 * because we cannot safely audit the denial.  Otherwise,
+	 * because we will not audit the denial.  Otherwise,
 	 * during the subsequent blocking retry (e.g. VFS ref walk), we
 	 * will find the permissions already granted in the cache entry
 	 * and won't audit anything at all, leading to silent denials in
 	 * permissive mode that only appear when in enforcing mode.
 	 *
-	 * See the corresponding handling in slow_avc_audit(), and the
-	 * logic in selinux_inode_follow_link and selinux_inode_permission
-	 * for the VFS MAY_NOT_BLOCK flag, which is transliterated into
-	 * AVC_NONBLOCKING for avc_has_perm_noaudit().
+	 * See the corresponding handling of MAY_NOT_BLOCK in avc_audit()
+	 * and selinux_inode_permission().
 	 */
 	if (flags & AVC_NONBLOCKING)
 		return 0;
@@ -913,7 +896,7 @@
 	if (orig->ae.xp_node) {
 		rc = avc_xperms_populate(node, orig->ae.xp_node);
 		if (rc) {
-			kmem_cache_free(avc_node_cachep, node);
+			avc_node_kill(avc, node);
 			goto out_unlock;
 		}
 	}

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 040c843..c574285 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c

@@ -3171,8 +3171,7 @@
 
 static noinline int audit_inode_permission(struct inode *inode,
 					   u32 perms, u32 audited, u32 denied,
-					   int result,
-					   unsigned flags)
+					   int result)
 {
 	struct common_audit_data ad;
 	struct inode_security_struct *isec = inode->i_security;
@@ -3183,7 +3182,7 @@
 
 	rc = slow_avc_audit(&selinux_state,
 			    current_sid(), isec->sid, isec->sclass, perms,
-			    audited, denied, result, &ad, flags);
+			    audited, denied, result, &ad);
 	if (rc)
 		return rc;
 	return 0;
@@ -3230,7 +3229,11 @@
 	if (likely(!audited))
 		return rc;
 
-	rc2 = audit_inode_permission(inode, perms, audited, denied, rc, flags);
+	/* fall back to ref-walk if we have to generate audit */
+	if (flags & MAY_NOT_BLOCK)
+		return -ECHILD;
+
+	rc2 = audit_inode_permission(inode, perms, audited, denied, rc);
 	if (rc2)
 		return rc2;
 	return rc;

diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h
index 74ea509..cf4cc3e 100644
--- a/security/selinux/include/avc.h
+++ b/security/selinux/include/avc.h

@@ -100,8 +100,7 @@
 int slow_avc_audit(struct selinux_state *state,
 		   u32 ssid, u32 tsid, u16 tclass,
 		   u32 requested, u32 audited, u32 denied, int result,
-		   struct common_audit_data *a,
-		   unsigned flags);
+		   struct common_audit_data *a);
 
 /**
  * avc_audit - Audit the granting or denial of permissions.
@@ -135,9 +134,12 @@
 	audited = avc_audit_required(requested, avd, result, 0, &denied);
 	if (likely(!audited))
 		return 0;
+	/* fall back to ref-walk if we have to generate audit */
+	if (flags & MAY_NOT_BLOCK)
+		return -ECHILD;
 	return slow_avc_audit(state, ssid, tsid, tclass,
 			      requested, audited, denied, result,
-			      a, flags);
+			      a);
 }
 
 #define AVC_STRICT 1 /* Ignore permissive mode. */

diff --git a/sound/core/control.c b/sound/core/control.c
index 649d321..d1312f1 100644
--- a/sound/core/control.c
+++ b/sound/core/control.c

@@ -1468,8 +1468,9 @@
 	if (kctl->tlv.c == NULL)
 		return -ENXIO;
 
-	/* When locked, this is unavailable. */
-	if (vd->owner != NULL && vd->owner != file)
+	/* Write and command operations are not allowed for locked element. */
+	if (op_flag != SNDRV_CTL_TLV_OP_READ &&
+	    vd->owner != NULL && vd->owner != file)
 		return -EPERM;
 
 	return kctl->tlv.c(kctl, op_flag, size, buf);

diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c
index bd3d68e0..aaf9c41 100644
--- a/sound/core/seq/seq_clientmgr.c
+++ b/sound/core/seq/seq_clientmgr.c

@@ -563,7 +563,7 @@
 	event->queue = queue;
 	event->flags &= ~SNDRV_SEQ_TIME_STAMP_MASK;
 	if (real_time) {
-		event->time.time = snd_seq_timer_get_cur_time(q->timer);
+		event->time.time = snd_seq_timer_get_cur_time(q->timer, true);
 		event->flags |= SNDRV_SEQ_TIME_STAMP_REAL;
 	} else {
 		event->time.tick = snd_seq_timer_get_cur_tick(q->timer);
@@ -1642,7 +1642,7 @@
 	tmr = queue->timer;
 	status->events = queue->tickq->cells + queue->timeq->cells;
 
-	status->time = snd_seq_timer_get_cur_time(tmr);
+	status->time = snd_seq_timer_get_cur_time(tmr, true);
 	status->tick = snd_seq_timer_get_cur_tick(tmr);
 
 	status->running = tmr->running;

diff --git a/sound/core/seq/seq_queue.c b/sound/core/seq/seq_queue.c
index 3b3ac96..28b4dd4 100644
--- a/sound/core/seq/seq_queue.c
+++ b/sound/core/seq/seq_queue.c

@@ -251,6 +251,8 @@
 {
 	unsigned long flags;
 	struct snd_seq_event_cell *cell;
+	snd_seq_tick_time_t cur_tick;
+	snd_seq_real_time_t cur_time;
 
 	if (q == NULL)
 		return;
@@ -267,17 +269,18 @@
 
       __again:
 	/* Process tick queue... */
+	cur_tick = snd_seq_timer_get_cur_tick(q->timer);
 	for (;;) {
-		cell = snd_seq_prioq_cell_out(q->tickq,
-					      &q->timer->tick.cur_tick);
+		cell = snd_seq_prioq_cell_out(q->tickq, &cur_tick);
 		if (!cell)
 			break;
 		snd_seq_dispatch_event(cell, atomic, hop);
 	}
 
 	/* Process time queue... */
+	cur_time = snd_seq_timer_get_cur_time(q->timer, false);
 	for (;;) {
-		cell = snd_seq_prioq_cell_out(q->timeq, &q->timer->cur_time);
+		cell = snd_seq_prioq_cell_out(q->timeq, &cur_time);
 		if (!cell)
 			break;
 		snd_seq_dispatch_event(cell, atomic, hop);
@@ -405,6 +408,7 @@
 int snd_seq_queue_set_owner(int queueid, int client, int locked)
 {
 	struct snd_seq_queue *q = queueptr(queueid);
+	unsigned long flags;
 
 	if (q == NULL)
 		return -EINVAL;
@@ -414,8 +418,10 @@
 		return -EPERM;
 	}
 
+	spin_lock_irqsave(&q->owner_lock, flags);
 	q->locked = locked ? 1 : 0;
 	q->owner = client;
+	spin_unlock_irqrestore(&q->owner_lock, flags);
 	queue_access_unlock(q);
 	queuefree(q);
 
@@ -552,15 +558,17 @@
 	unsigned long flags;
 	int i;
 	struct snd_seq_queue *q;
+	bool matched;
 
 	for (i = 0; i < SNDRV_SEQ_MAX_QUEUES; i++) {
 		if ((q = queueptr(i)) == NULL)
 			continue;
 		spin_lock_irqsave(&q->owner_lock, flags);
-		if (q->owner == client)
+		matched = (q->owner == client);
+		if (matched)
 			q->klocked = 1;
 		spin_unlock_irqrestore(&q->owner_lock, flags);
-		if (q->owner == client) {
+		if (matched) {
 			if (q->timer->running)
 				snd_seq_timer_stop(q->timer);
 			snd_seq_timer_reset(q->timer);
@@ -752,6 +760,8 @@
 	int i, bpm;
 	struct snd_seq_queue *q;
 	struct snd_seq_timer *tmr;
+	bool locked;
+	int owner;
 
 	for (i = 0; i < SNDRV_SEQ_MAX_QUEUES; i++) {
 		if ((q = queueptr(i)) == NULL)
@@ -763,9 +773,14 @@
 		else
 			bpm = 0;
 
+		spin_lock_irq(&q->owner_lock);
+		locked = q->locked;
+		owner = q->owner;
+		spin_unlock_irq(&q->owner_lock);
+
 		snd_iprintf(buffer, "queue %d: [%s]\n", q->queue, q->name);
-		snd_iprintf(buffer, "owned by client    : %d\n", q->owner);
-		snd_iprintf(buffer, "lock status        : %s\n", q->locked ? "Locked" : "Free");
+		snd_iprintf(buffer, "owned by client    : %d\n", owner);
+		snd_iprintf(buffer, "lock status        : %s\n", locked ? "Locked" : "Free");
 		snd_iprintf(buffer, "queued time events : %d\n", snd_seq_prioq_avail(q->timeq));
 		snd_iprintf(buffer, "queued tick events : %d\n", snd_seq_prioq_avail(q->tickq));
 		snd_iprintf(buffer, "timer state        : %s\n", tmr->running ? "Running" : "Stopped");

diff --git a/sound/core/seq/seq_timer.c b/sound/core/seq/seq_timer.c
index aed8e1c..3da44a4 100644
--- a/sound/core/seq/seq_timer.c
+++ b/sound/core/seq/seq_timer.c

@@ -437,14 +437,15 @@
 }
 
 /* return current 'real' time. use timeofday() to get better granularity. */
-snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr)
+snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr,
+					       bool adjust_ktime)
 {
 	snd_seq_real_time_t cur_time;
 	unsigned long flags;
 
 	spin_lock_irqsave(&tmr->lock, flags);
 	cur_time = tmr->cur_time;
-	if (tmr->running) { 
+	if (adjust_ktime && tmr->running) {
 		struct timespec64 tm;
 
 		ktime_get_ts64(&tm);
@@ -461,7 +462,13 @@
  high PPQ values) */
 snd_seq_tick_time_t snd_seq_timer_get_cur_tick(struct snd_seq_timer *tmr)
 {
-	return tmr->tick.cur_tick;
+	snd_seq_tick_time_t cur_tick;
+	unsigned long flags;
+
+	spin_lock_irqsave(&tmr->lock, flags);
+	cur_tick = tmr->tick.cur_tick;
+	spin_unlock_irqrestore(&tmr->lock, flags);
+	return cur_tick;
 }
 
 

diff --git a/sound/core/seq/seq_timer.h b/sound/core/seq/seq_timer.h
index 62f3906..44f52f5 100644
--- a/sound/core/seq/seq_timer.h
+++ b/sound/core/seq/seq_timer.h

@@ -135,7 +135,8 @@
 int snd_seq_timer_set_position_tick(struct snd_seq_timer *tmr, snd_seq_tick_time_t position);
 int snd_seq_timer_set_position_time(struct snd_seq_timer *tmr, snd_seq_real_time_t position);
 int snd_seq_timer_set_skew(struct snd_seq_timer *tmr, unsigned int skew, unsigned int base);
-snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr);
+snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr,
+					       bool adjust_ktime);
 snd_seq_tick_time_t snd_seq_timer_get_cur_tick(struct snd_seq_timer *tmr);
 
 extern int seq_default_timer_class;

diff --git a/sound/hda/hdmi_chmap.c b/sound/hda/hdmi_chmap.c
index f21633c..acbe61b 100644
--- a/sound/hda/hdmi_chmap.c
+++ b/sound/hda/hdmi_chmap.c

@@ -249,7 +249,7 @@
 
 	for (i = 0, j = 0; i < ARRAY_SIZE(cea_speaker_allocation_names); i++) {
 		if (spk_alloc & (1 << i))
-			j += snprintf(buf + j, buflen - j,  " %s",
+			j += scnprintf(buf + j, buflen - j,  " %s",
 					cea_speaker_allocation_names[i]);
 	}
 	buf[j] = '\0';	/* necessary when j == 0 */

diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c
index 82b0dc9..f3a6b1d 100644
--- a/sound/pci/hda/hda_codec.c
+++ b/sound/pci/hda/hda_codec.c

@@ -4019,7 +4019,7 @@
 
 	for (i = 0, j = 0; i < ARRAY_SIZE(bits); i++)
 		if (pcm & (AC_SUPPCM_BITS_8 << i))
-			j += snprintf(buf + j, buflen - j,  " %d", bits[i]);
+			j += scnprintf(buf + j, buflen - j,  " %d", bits[i]);
 
 	buf[j] = '\0'; /* necessary when j == 0 */
 }

diff --git a/sound/pci/hda/hda_eld.c b/sound/pci/hda/hda_eld.c
index ba7fe9b..864cc8c 100644
--- a/sound/pci/hda/hda_eld.c
+++ b/sound/pci/hda/hda_eld.c

@@ -373,7 +373,7 @@
 
 	for (i = 0, j = 0; i < ARRAY_SIZE(alsa_rates); i++)
 		if (pcm & (1 << i))
-			j += snprintf(buf + j, buflen - j,  " %d",
+			j += scnprintf(buf + j, buflen - j,  " %d",
 				alsa_rates[i]);
 
 	buf[j] = '\0'; /* necessary when j == 0 */

diff --git a/sound/pci/hda/hda_sysfs.c b/sound/pci/hda/hda_sysfs.c
index 6ec79c5..6535155 100644
--- a/sound/pci/hda/hda_sysfs.c
+++ b/sound/pci/hda/hda_sysfs.c

@@ -221,7 +221,7 @@
 	int i, len = 0;
 	mutex_lock(&codec->user_mutex);
 	snd_array_for_each(&codec->init_verbs, i, v) {
-		len += snprintf(buf + len, PAGE_SIZE - len,
+		len += scnprintf(buf + len, PAGE_SIZE - len,
 				"0x%02x 0x%03x 0x%04x\n",
 				v->nid, v->verb, v->param);
 	}
@@ -271,7 +271,7 @@
 	int i, len = 0;
 	mutex_lock(&codec->user_mutex);
 	snd_array_for_each(&codec->hints, i, hint) {
-		len += snprintf(buf + len, PAGE_SIZE - len,
+		len += scnprintf(buf + len, PAGE_SIZE - len,
 				"%s = %s\n", hint->key, hint->val);
 	}
 	mutex_unlock(&codec->user_mutex);

diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c
index 5500dd43..78bb962 100644
--- a/sound/pci/hda/patch_conexant.c
+++ b/sound/pci/hda/patch_conexant.c

@@ -935,6 +935,7 @@
 	SND_PCI_QUIRK(0x17aa, 0x215f, "Lenovo T510", CXT_PINCFG_LENOVO_TP410),
 	SND_PCI_QUIRK(0x17aa, 0x21ce, "Lenovo T420", CXT_PINCFG_LENOVO_TP410),
 	SND_PCI_QUIRK(0x17aa, 0x21cf, "Lenovo T520", CXT_PINCFG_LENOVO_TP410),
+	SND_PCI_QUIRK(0x17aa, 0x21d2, "Lenovo T420s", CXT_PINCFG_LENOVO_TP410),
 	SND_PCI_QUIRK(0x17aa, 0x21da, "Lenovo X220", CXT_PINCFG_LENOVO_TP410),
 	SND_PCI_QUIRK(0x17aa, 0x21db, "Lenovo X220-tablet", CXT_PINCFG_LENOVO_TP410),
 	SND_PCI_QUIRK(0x17aa, 0x38af, "Lenovo IdeaPad Z560", CXT_FIXUP_MUTE_LED_EAPD),

diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c
index c827a2a..c67fadd 100644
--- a/sound/pci/hda/patch_hdmi.c
+++ b/sound/pci/hda/patch_hdmi.c

@@ -2604,9 +2604,12 @@
 /* parse and post-process for Intel codecs */
 static int parse_intel_hdmi(struct hda_codec *codec)
 {
-	int err;
+	int err, retries = 3;
 
-	err = hdmi_parse_codec(codec);
+	do {
+		err = hdmi_parse_codec(codec);
+	} while (err < 0 && retries--);
+
 	if (err < 0) {
 		generic_spec_free(codec);
 		return err;

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index ae735bc..94fffc0 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c

@@ -2442,6 +2442,9 @@
 	SND_PCI_QUIRK(0x1071, 0x8258, "Evesham Voyaeger", ALC882_FIXUP_EAPD),
 	SND_PCI_QUIRK(0x1458, 0xa002, "Gigabyte EP45-DS3/Z87X-UD3H", ALC889_FIXUP_FRONT_HP_NO_PRESENCE),
 	SND_PCI_QUIRK(0x1458, 0xa0b8, "Gigabyte AZ370-Gaming", ALC1220_FIXUP_GB_DUAL_CODECS),
+	SND_PCI_QUIRK(0x1462, 0x1228, "MSI-GP63", ALC1220_FIXUP_CLEVO_P950),
+	SND_PCI_QUIRK(0x1462, 0x1276, "MSI-GL73", ALC1220_FIXUP_CLEVO_P950),
+	SND_PCI_QUIRK(0x1462, 0x1293, "MSI-GP65", ALC1220_FIXUP_CLEVO_P950),
 	SND_PCI_QUIRK(0x1462, 0x7350, "MSI-7350", ALC889_FIXUP_CD),
 	SND_PCI_QUIRK(0x1462, 0xda57, "MSI Z270-Gaming", ALC1220_FIXUP_GB_DUAL_CODECS),
 	SND_PCI_QUIRK_VENDOR(0x1462, "MSI", ALC882_FIXUP_GPIO3),

diff --git a/sound/sh/aica.c b/sound/sh/aica.c
index ad3f713..69ac44b 100644
--- a/sound/sh/aica.c
+++ b/sound/sh/aica.c

@@ -117,10 +117,10 @@
 }
 
 /* spu_memload - write to SPU address space */
-static void spu_memload(u32 toi, void *from, int length)
+static void spu_memload(u32 toi, const void *from, int length)
 {
 	unsigned long flags;
-	u32 *froml = from;
+	const u32 *froml = from;
 	u32 __iomem *to = (u32 __iomem *) (SPU_MEMORY_BASE + toi);
 	int i;
 	u32 val;

diff --git a/sound/sh/sh_dac_audio.c b/sound/sh/sh_dac_audio.c
index 834b257..6251b5e 100644
--- a/sound/sh/sh_dac_audio.c
+++ b/sound/sh/sh_dac_audio.c

@@ -190,7 +190,6 @@
 {
 	/* channel is not used (interleaved data) */
 	struct snd_sh_dac *chip = snd_pcm_substream_chip(substream);
-	struct snd_pcm_runtime *runtime = substream->runtime;
 
 	if (copy_from_user_toio(chip->data_buffer + pos, src, count))
 		return -EFAULT;
@@ -210,7 +209,6 @@
 {
 	/* channel is not used (interleaved data) */
 	struct snd_sh_dac *chip = snd_pcm_substream_chip(substream);
-	struct snd_pcm_runtime *runtime = substream->runtime;
 
 	memcpy_toio(chip->data_buffer + pos, src, count);
 	chip->buffer_end = chip->data_buffer + pos + count;
@@ -229,7 +227,6 @@
 {
 	/* channel is not used (interleaved data) */
 	struct snd_sh_dac *chip = snd_pcm_substream_chip(substream);
-	struct snd_pcm_runtime *runtime = substream->runtime;
 
 	memset_io(chip->data_buffer + pos, 0, count);
 	chip->buffer_end = chip->data_buffer + pos + count;

diff --git a/sound/soc/atmel/Kconfig b/sound/soc/atmel/Kconfig
index 64b784e..dad778e 100644
--- a/sound/soc/atmel/Kconfig
+++ b/sound/soc/atmel/Kconfig

@@ -25,6 +25,8 @@
 
 config SND_ATMEL_SOC_SSC_DMA
 	tristate
+	select SND_ATMEL_SOC_DMA
+	select SND_ATMEL_SOC_PDC
 
 config SND_ATMEL_SOC_SSC
 	tristate

diff --git a/sound/soc/sunxi/sun8i-codec.c b/sound/soc/sunxi/sun8i-codec.c
index a3db6a6..8bcdeb2 100644
--- a/sound/soc/sunxi/sun8i-codec.c
+++ b/sound/soc/sunxi/sun8i-codec.c

@@ -89,6 +89,7 @@
 
 #define SUN8I_SYS_SR_CTRL_AIF1_FS_MASK		GENMASK(15, 12)
 #define SUN8I_SYS_SR_CTRL_AIF2_FS_MASK		GENMASK(11, 8)
+#define SUN8I_AIF1CLK_CTRL_AIF1_DATA_FMT_MASK	GENMASK(3, 2)
 #define SUN8I_AIF1CLK_CTRL_AIF1_WORD_SIZ_MASK	GENMASK(5, 4)
 #define SUN8I_AIF1CLK_CTRL_AIF1_LRCK_DIV_MASK	GENMASK(8, 6)
 #define SUN8I_AIF1CLK_CTRL_AIF1_BCLK_DIV_MASK	GENMASK(12, 9)
@@ -250,7 +251,7 @@
 		return -EINVAL;
 	}
 	regmap_update_bits(scodec->regmap, SUN8I_AIF1CLK_CTRL,
-			   BIT(SUN8I_AIF1CLK_CTRL_AIF1_DATA_FMT),
+			   SUN8I_AIF1CLK_CTRL_AIF1_DATA_FMT_MASK,
 			   value << SUN8I_AIF1CLK_CTRL_AIF1_DATA_FMT);
 
 	return 0;

diff --git a/sound/usb/clock.c b/sound/usb/clock.c
index e313498..bfe5540 100644
--- a/sound/usb/clock.c
+++ b/sound/usb/clock.c

@@ -165,8 +165,34 @@
 	return ret;
 }
 
+/*
+ * Assume the clock is valid if clock source supports only one single sample
+ * rate, the terminal is connected directly to it (there is no clock selector)
+ * and clock type is internal. This is to deal with some Denon DJ controllers
+ * that always reports that clock is invalid.
+ */
+static bool uac_clock_source_is_valid_quirk(struct snd_usb_audio *chip,
+					    struct audioformat *fmt,
+					    int source_id)
+{
+	if (fmt->protocol == UAC_VERSION_2) {
+		struct uac_clock_source_descriptor *cs_desc =
+			snd_usb_find_clock_source(chip->ctrl_intf, source_id);
+
+		if (!cs_desc)
+			return false;
+
+		return (fmt->nr_rates == 1 &&
+			(fmt->clock & 0xff) == cs_desc->bClockID &&
+			(cs_desc->bmAttributes & 0x3) !=
+				UAC_CLOCK_SOURCE_TYPE_EXT);
+	}
+
+	return false;
+}
+
 static bool uac_clock_source_is_valid(struct snd_usb_audio *chip,
-				      int protocol,
+				      struct audioformat *fmt,
 				      int source_id)
 {
 	int err;
@@ -174,26 +200,26 @@
 	struct usb_device *dev = chip->dev;
 	u32 bmControls;
 
-	if (protocol == UAC_VERSION_3) {
+	if (fmt->protocol == UAC_VERSION_3) {
 		struct uac3_clock_source_descriptor *cs_desc =
 			snd_usb_find_clock_source_v3(chip->ctrl_intf, source_id);
 
 		if (!cs_desc)
-			return 0;
+			return false;
 		bmControls = le32_to_cpu(cs_desc->bmControls);
 	} else { /* UAC_VERSION_1/2 */
 		struct uac_clock_source_descriptor *cs_desc =
 			snd_usb_find_clock_source(chip->ctrl_intf, source_id);
 
 		if (!cs_desc)
-			return 0;
+			return false;
 		bmControls = cs_desc->bmControls;
 	}
 
 	/* If a clock source can't tell us whether it's valid, we assume it is */
 	if (!uac_v2v3_control_is_readable(bmControls,
 				      UAC2_CS_CONTROL_CLOCK_VALID))
-		return 1;
+		return true;
 
 	err = snd_usb_ctl_msg(dev, usb_rcvctrlpipe(dev, 0), UAC2_CS_CUR,
 			      USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_IN,
@@ -205,13 +231,17 @@
 		dev_warn(&dev->dev,
 			 "%s(): cannot get clock validity for id %d\n",
 			   __func__, source_id);
-		return 0;
+		return false;
 	}
 
-	return !!data;
+	if (data)
+		return true;
+	else
+		return uac_clock_source_is_valid_quirk(chip, fmt, source_id);
 }
 
-static int __uac_clock_find_source(struct snd_usb_audio *chip, int entity_id,
+static int __uac_clock_find_source(struct snd_usb_audio *chip,
+				   struct audioformat *fmt, int entity_id,
 				   unsigned long *visited, bool validate)
 {
 	struct uac_clock_source_descriptor *source;
@@ -231,7 +261,7 @@
 	source = snd_usb_find_clock_source(chip->ctrl_intf, entity_id);
 	if (source) {
 		entity_id = source->bClockID;
-		if (validate && !uac_clock_source_is_valid(chip, UAC_VERSION_2,
+		if (validate && !uac_clock_source_is_valid(chip, fmt,
 								entity_id)) {
 			usb_audio_err(chip,
 				"clock source %d is not valid, cannot use\n",
@@ -262,8 +292,9 @@
 		}
 
 		cur = ret;
-		ret = __uac_clock_find_source(chip, selector->baCSourceID[ret - 1],
-					       visited, validate);
+		ret = __uac_clock_find_source(chip, fmt,
+					      selector->baCSourceID[ret - 1],
+					      visited, validate);
 		if (!validate || ret > 0 || !chip->autoclock)
 			return ret;
 
@@ -274,8 +305,9 @@
 			if (i == cur)
 				continue;
 
-			ret = __uac_clock_find_source(chip, selector->baCSourceID[i - 1],
-				visited, true);
+			ret = __uac_clock_find_source(chip, fmt,
+						      selector->baCSourceID[i - 1],
+						      visited, true);
 			if (ret < 0)
 				continue;
 
@@ -295,14 +327,16 @@
 	/* FIXME: multipliers only act as pass-thru element for now */
 	multiplier = snd_usb_find_clock_multiplier(chip->ctrl_intf, entity_id);
 	if (multiplier)
-		return __uac_clock_find_source(chip, multiplier->bCSourceID,
-						visited, validate);
+		return __uac_clock_find_source(chip, fmt,
+					       multiplier->bCSourceID,
+					       visited, validate);
 
 	return -EINVAL;
 }
 
-static int __uac3_clock_find_source(struct snd_usb_audio *chip, int entity_id,
-				   unsigned long *visited, bool validate)
+static int __uac3_clock_find_source(struct snd_usb_audio *chip,
+				    struct audioformat *fmt, int entity_id,
+				    unsigned long *visited, bool validate)
 {
 	struct uac3_clock_source_descriptor *source;
 	struct uac3_clock_selector_descriptor *selector;
@@ -321,7 +355,7 @@
 	source = snd_usb_find_clock_source_v3(chip->ctrl_intf, entity_id);
 	if (source) {
 		entity_id = source->bClockID;
-		if (validate && !uac_clock_source_is_valid(chip, UAC_VERSION_3,
+		if (validate && !uac_clock_source_is_valid(chip, fmt,
 								entity_id)) {
 			usb_audio_err(chip,
 				"clock source %d is not valid, cannot use\n",
@@ -352,7 +386,8 @@
 		}
 
 		cur = ret;
-		ret = __uac3_clock_find_source(chip, selector->baCSourceID[ret - 1],
+		ret = __uac3_clock_find_source(chip, fmt,
+					       selector->baCSourceID[ret - 1],
 					       visited, validate);
 		if (!validate || ret > 0 || !chip->autoclock)
 			return ret;
@@ -364,8 +399,9 @@
 			if (i == cur)
 				continue;
 
-			ret = __uac3_clock_find_source(chip, selector->baCSourceID[i - 1],
-				visited, true);
+			ret = __uac3_clock_find_source(chip, fmt,
+						       selector->baCSourceID[i - 1],
+						       visited, true);
 			if (ret < 0)
 				continue;
 
@@ -386,7 +422,8 @@
 	multiplier = snd_usb_find_clock_multiplier_v3(chip->ctrl_intf,
 						      entity_id);
 	if (multiplier)
-		return __uac3_clock_find_source(chip, multiplier->bCSourceID,
+		return __uac3_clock_find_source(chip, fmt,
+						multiplier->bCSourceID,
 						visited, validate);
 
 	return -EINVAL;
@@ -403,18 +440,18 @@
  *
  * Returns the clock source UnitID (>=0) on success, or an error.
  */
-int snd_usb_clock_find_source(struct snd_usb_audio *chip, int protocol,
-			      int entity_id, bool validate)
+int snd_usb_clock_find_source(struct snd_usb_audio *chip,
+			      struct audioformat *fmt, bool validate)
 {
 	DECLARE_BITMAP(visited, 256);
 	memset(visited, 0, sizeof(visited));
 
-	switch (protocol) {
+	switch (fmt->protocol) {
 	case UAC_VERSION_2:
-		return __uac_clock_find_source(chip, entity_id, visited,
+		return __uac_clock_find_source(chip, fmt, fmt->clock, visited,
 					       validate);
 	case UAC_VERSION_3:
-		return __uac3_clock_find_source(chip, entity_id, visited,
+		return __uac3_clock_find_source(chip, fmt, fmt->clock, visited,
 					       validate);
 	default:
 		return -EINVAL;
@@ -515,8 +552,7 @@
 	 * automatic clock selection if the current clock is not
 	 * valid.
 	 */
-	clock = snd_usb_clock_find_source(chip, fmt->protocol,
-					  fmt->clock, true);
+	clock = snd_usb_clock_find_source(chip, fmt, true);
 	if (clock < 0) {
 		/* We did not find a valid clock, but that might be
 		 * because the current sample rate does not match an
@@ -524,8 +560,7 @@
 		 * and we will do another validation after setting the
 		 * rate.
 		 */
-		clock = snd_usb_clock_find_source(chip, fmt->protocol,
-						  fmt->clock, false);
+		clock = snd_usb_clock_find_source(chip, fmt, false);
 		if (clock < 0)
 			return clock;
 	}
@@ -591,7 +626,7 @@
 
 validation:
 	/* validate clock after rate change */
-	if (!uac_clock_source_is_valid(chip, fmt->protocol, clock))
+	if (!uac_clock_source_is_valid(chip, fmt, clock))
 		return -ENXIO;
 	return 0;
 }

diff --git a/sound/usb/clock.h b/sound/usb/clock.h
index 076e31b..68df0fb 100644
--- a/sound/usb/clock.h
+++ b/sound/usb/clock.h

@@ -6,7 +6,7 @@
 			     struct usb_host_interface *alts,
 			     struct audioformat *fmt, int rate);
 
-int snd_usb_clock_find_source(struct snd_usb_audio *chip, int protocol,
-			     int entity_id, bool validate);
+int snd_usb_clock_find_source(struct snd_usb_audio *chip,
+			      struct audioformat *fmt, bool validate);
 
 #endif /* __USBAUDIO_CLOCK_H */

diff --git a/sound/usb/format.c b/sound/usb/format.c
index fd13ac1..9d27429 100644
--- a/sound/usb/format.c
+++ b/sound/usb/format.c

@@ -306,8 +306,7 @@
 	struct usb_device *dev = chip->dev;
 	unsigned char tmp[2], *data;
 	int nr_triplets, data_size, ret = 0;
-	int clock = snd_usb_clock_find_source(chip, fp->protocol,
-					      fp->clock, false);
+	int clock = snd_usb_clock_find_source(chip, fp, false);
 
 	if (clock < 0) {
 		dev_err(&dev->dev,

diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c
index 6ac6a09..f2e173b 100644
--- a/sound/usb/mixer.c
+++ b/sound/usb/mixer.c

@@ -912,6 +912,15 @@
 	return 0;
 }
 
+static int parse_term_effect_unit(struct mixer_build *state,
+				  struct usb_audio_term *term,
+				  void *p1, int id)
+{
+	term->type = UAC3_EFFECT_UNIT << 16; /* virtual type */
+	term->id = id;
+	return 0;
+}
+
 static int parse_term_uac2_clock_source(struct mixer_build *state,
 					struct usb_audio_term *term,
 					void *p1, int id)
@@ -996,8 +1005,7 @@
 						    UAC3_PROCESSING_UNIT);
 		case PTYPE(UAC_VERSION_2, UAC2_EFFECT_UNIT):
 		case PTYPE(UAC_VERSION_3, UAC3_EFFECT_UNIT):
-			return parse_term_proc_unit(state, term, p1, id,
-						    UAC3_EFFECT_UNIT);
+			return parse_term_effect_unit(state, term, p1, id);
 		case PTYPE(UAC_VERSION_1, UAC1_EXTENSION_UNIT):
 		case PTYPE(UAC_VERSION_2, UAC2_EXTENSION_UNIT_V2):
 		case PTYPE(UAC_VERSION_3, UAC3_EXTENSION_UNIT):

diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c
index 49f6f61..5bbfd75 100644
--- a/sound/usb/quirks.c
+++ b/sound/usb/quirks.c

@@ -1182,6 +1182,7 @@
 	case USB_ID(0x1395, 0x740a): /* Sennheiser DECT */
 	case USB_ID(0x1901, 0x0191): /* GE B850V3 CP2114 audio interface */
 	case USB_ID(0x21B4, 0x0081): /* AudioQuest DragonFly */
+	case USB_ID(0x2912, 0x30c8): /* Audioengine D1 */
 		return true;
 	}
 

diff --git a/sound/usb/usx2y/usX2Yhwdep.c b/sound/usb/usx2y/usX2Yhwdep.c
index c1dd9a7..36b3459 100644
--- a/sound/usb/usx2y/usX2Yhwdep.c
+++ b/sound/usb/usx2y/usX2Yhwdep.c

@@ -131,7 +131,7 @@
 	info->num_dsps = 2;		// 0: Prepad Data, 1: FPGA Code
 	if (us428->chip_status & USX2Y_STAT_CHIP_INIT)
 		info->chip_ready = 1;
- 	info->version = USX2Y_DRIVER_VERSION; 
+	info->version = USX2Y_DRIVER_VERSION;
 	return 0;
 }
 

diff --git a/tools/arch/x86/include/asm/rmwcc.h b/tools/arch/x86/include/asm/rmwcc.h
index fee7983..dc90c0c 100644
--- a/tools/arch/x86/include/asm/rmwcc.h
+++ b/tools/arch/x86/include/asm/rmwcc.h

@@ -2,7 +2,7 @@
 #ifndef _TOOLS_LINUX_ASM_X86_RMWcc
 #define _TOOLS_LINUX_ASM_X86_RMWcc
 
-#ifdef CONFIG_CC_HAS_ASM_GOTO
+#ifdef CC_HAVE_ASM_GOTO
 
 #define __GEN_RMWcc(fullop, var, cc, ...)				\
 do {									\
@@ -20,7 +20,7 @@
 #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc)			\
 	__GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val))
 
-#else /* !CONFIG_CC_HAS_ASM_GOTO */
+#else /* !CC_HAVE_ASM_GOTO */
 
 #define __GEN_RMWcc(fullop, var, cc, ...)				\
 do {									\
@@ -37,6 +37,6 @@
 #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc)			\
 	__GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val))
 
-#endif /* CONFIG_CC_HAS_ASM_GOTO */
+#endif /* CC_HAVE_ASM_GOTO */
 
 #endif /* _TOOLS_LINUX_ASM_X86_RMWcc */

diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c
index 7aba824..bd021a0 100644
--- a/tools/lib/api/fs/fs.c
+++ b/tools/lib/api/fs/fs.c

@@ -210,6 +210,7 @@
 	size_t name_len = strlen(fs->name);
 	/* name + "_PATH" + '\0' */
 	char upper_name[name_len + 5 + 1];
+
 	memcpy(upper_name, fs->name, name_len);
 	mem_toupper(upper_name, name_len);
 	strcpy(&upper_name[name_len], "_PATH");
@@ -219,7 +220,8 @@
 		return false;
 
 	fs->found = true;
-	strncpy(fs->path, override_path, sizeof(fs->path));
+	strncpy(fs->path, override_path, sizeof(fs->path) - 1);
+	fs->path[sizeof(fs->path) - 1] = '\0';
 	return true;
 }
 

diff --git a/tools/objtool/arch/x86/lib/x86-opcode-map.txt b/tools/objtool/arch/x86/lib/x86-opcode-map.txt
index 0a0e911..5cb9f00 100644
--- a/tools/objtool/arch/x86/lib/x86-opcode-map.txt
+++ b/tools/objtool/arch/x86/lib/x86-opcode-map.txt

@@ -909,7 +909,7 @@
 
 GrpTable: Grp3_2
 0: TEST Ev,Iz
-1:
+1: TEST Ev,Iz
 2: NOT Ev
 3: NEG Ev
 4: MUL rAX,Ev

diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index 692d2fa..ed34902 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c

@@ -2931,6 +2931,7 @@
 
 				continue;
 			}
+			actions->ms.map = map;
 			top = pstack__peek(browser->pstack);
 			if (top == &browser->hists->dso_filter) {
 				/*

diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index bbb0e04..5947528 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c

@@ -209,12 +209,12 @@
 				    int cpu, struct runtime_stat *st)
 {
 	int ctx = evsel_context(counter);
+	u64 count_ns = count;
 
 	count *= counter->scale;
 
-	if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK) ||
-	    perf_evsel__match(counter, SOFTWARE, SW_CPU_CLOCK))
-		update_runtime_stat(st, STAT_NSECS, 0, cpu, count);
+	if (perf_evsel__is_clock(counter))
+		update_runtime_stat(st, STAT_NSECS, 0, cpu, count_ns);
 	else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
 		update_runtime_stat(st, STAT_CYCLES, ctx, cpu, count);
 	else if (perf_stat_evsel__is(counter, CYCLES_IN_TX))

diff --git a/tools/testing/selftests/bpf/test_select_reuseport.c b/tools/testing/selftests/bpf/test_select_reuseport.c
index 75646d9..cdbbdab 100644
--- a/tools/testing/selftests/bpf/test_select_reuseport.c
+++ b/tools/testing/selftests/bpf/test_select_reuseport.c

@@ -30,7 +30,7 @@
 #define REUSEPORT_ARRAY_SIZE 32
 
 static int result_map, tmp_index_ovr_map, linum_map, data_check_map;
-static enum result expected_results[NR_RESULTS];
+static __u32 expected_results[NR_RESULTS];
 static int sk_fds[REUSEPORT_ARRAY_SIZE];
 static int reuseport_array, outer_map;
 static int select_by_skb_data_prog;
@@ -610,7 +610,19 @@
 
 static void cleanup_per_test(void)
 {
-	int i, err;
+	int i, err, zero = 0;
+
+	memset(expected_results, 0, sizeof(expected_results));
+
+	for (i = 0; i < NR_RESULTS; i++) {
+		err = bpf_map_update_elem(result_map, &i, &zero, BPF_ANY);
+		RET_IF(err, "reset elem in result_map",
+		       "i:%u err:%d errno:%d\n", i, err, errno);
+	}
+
+	err = bpf_map_update_elem(linum_map, &zero, &zero, BPF_ANY);
+	RET_IF(err, "reset line number in linum_map", "err:%d errno:%d\n",
+	       err, errno);
 
 	for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++)
 		close(sk_fds[i]);

diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index c0885fb..7d1a7c0d 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh

@@ -848,6 +848,12 @@
 	check_route6 "2001:db8:104::/64 via 2001:db8:101::3 dev veth1 metric 1024"
 	log_test $? 0 "Multipath with single path via multipath attribute"
 
+	# multipath with dev-only
+	add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+	run_cmd "$IP -6 ro replace 2001:db8:104::/64 dev veth1"
+	check_route6 "2001:db8:104::/64 dev veth1 metric 1024"
+	log_test $? 0 "Multipath with dev-only"
+
 	# route replace fails - invalid nexthop 1
 	add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
 	run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:111::3 nexthop via 2001:db8:103::3"

diff --git a/tools/testing/selftests/size/get_size.c b/tools/testing/selftests/size/get_size.c
index d4b59ab..f55943b 100644
--- a/tools/testing/selftests/size/get_size.c
+++ b/tools/testing/selftests/size/get_size.c

@@ -12,23 +12,35 @@
  * own execution.  It also attempts to have as few dependencies
  * on kernel features as possible.
  *
- * It should be statically linked, with startup libs avoided.
- * It uses no library calls, and only the following 3 syscalls:
+ * It should be statically linked, with startup libs avoided.  It uses
+ * no library calls except the syscall() function for the following 3
+ * syscalls:
  *   sysinfo(), write(), and _exit()
  *
  * For output, it avoids printf (which in some C libraries
  * has large external dependencies) by  implementing it's own
  * number output and print routines, and using __builtin_strlen()
+ *
+ * The test may crash if any of the above syscalls fails because in some
+ * libc implementations (e.g. the GNU C Library) errno is saved in
+ * thread-local storage, which does not get initialized due to avoiding
+ * startup libs.
  */
 
 #include <sys/sysinfo.h>
 #include <unistd.h>
+#include <sys/syscall.h>
 
 #define STDOUT_FILENO 1
 
 static int print(const char *s)
 {
-	return write(STDOUT_FILENO, s, __builtin_strlen(s));
+	size_t len = 0;
+
+	while (s[len] != '\0')
+		len++;
+
+	return syscall(SYS_write, STDOUT_FILENO, s, len);
 }
 
 static inline char *num_to_str(unsigned long num, char *buf, int len)
@@ -80,12 +92,12 @@
 	print("TAP version 13\n");
 	print("# Testing system size.\n");
 
-	ccode = sysinfo(&info);
+	ccode = syscall(SYS_sysinfo, &info);
 	if (ccode < 0) {
 		print("not ok 1");
 		print(test_name);
 		print(" ---\n reason: \"could not get sysinfo\"\n ...\n");
-		_exit(ccode);
+		syscall(SYS_exit, ccode);
 	}
 	print("ok 1");
 	print(test_name);
@@ -101,5 +113,5 @@
 	print(" ...\n");
 	print("1..1\n");
 
-	_exit(0);
+	syscall(SYS_exit, 0);
 }

diff --git a/tools/usb/usbip/src/usbip_network.c b/tools/usb/usbip/src/usbip_network.c
index 8ffcd47..902f552 100644
--- a/tools/usb/usbip/src/usbip_network.c
+++ b/tools/usb/usbip/src/usbip_network.c

@@ -62,39 +62,39 @@
 	info("using port %d (\"%s\")", usbip_port, usbip_port_string);
 }
 
-void usbip_net_pack_uint32_t(int pack, uint32_t *num)
+uint32_t usbip_net_pack_uint32_t(int pack, uint32_t num)
 {
 	uint32_t i;
 
 	if (pack)
-		i = htonl(*num);
+		i = htonl(num);
 	else
-		i = ntohl(*num);
+		i = ntohl(num);
 
-	*num = i;
+	return i;
 }
 
-void usbip_net_pack_uint16_t(int pack, uint16_t *num)
+uint16_t usbip_net_pack_uint16_t(int pack, uint16_t num)
 {
 	uint16_t i;
 
 	if (pack)
-		i = htons(*num);
+		i = htons(num);
 	else
-		i = ntohs(*num);
+		i = ntohs(num);
 
-	*num = i;
+	return i;
 }
 
 void usbip_net_pack_usb_device(int pack, struct usbip_usb_device *udev)
 {
-	usbip_net_pack_uint32_t(pack, &udev->busnum);
-	usbip_net_pack_uint32_t(pack, &udev->devnum);
-	usbip_net_pack_uint32_t(pack, &udev->speed);
+	udev->busnum = usbip_net_pack_uint32_t(pack, udev->busnum);
+	udev->devnum = usbip_net_pack_uint32_t(pack, udev->devnum);
+	udev->speed = usbip_net_pack_uint32_t(pack, udev->speed);
 
-	usbip_net_pack_uint16_t(pack, &udev->idVendor);
-	usbip_net_pack_uint16_t(pack, &udev->idProduct);
-	usbip_net_pack_uint16_t(pack, &udev->bcdDevice);
+	udev->idVendor = usbip_net_pack_uint16_t(pack, udev->idVendor);
+	udev->idProduct = usbip_net_pack_uint16_t(pack, udev->idProduct);
+	udev->bcdDevice = usbip_net_pack_uint16_t(pack, udev->bcdDevice);
 }
 
 void usbip_net_pack_usb_interface(int pack __attribute__((unused)),
@@ -141,6 +141,14 @@
 	return usbip_net_xmit(sockfd, buff, bufflen, 1);
 }
 
+static inline void usbip_net_pack_op_common(int pack,
+					    struct op_common *op_common)
+{
+	op_common->version = usbip_net_pack_uint16_t(pack, op_common->version);
+	op_common->code = usbip_net_pack_uint16_t(pack, op_common->code);
+	op_common->status = usbip_net_pack_uint32_t(pack, op_common->status);
+}
+
 int usbip_net_send_op_common(int sockfd, uint32_t code, uint32_t status)
 {
 	struct op_common op_common;
@@ -152,7 +160,7 @@
 	op_common.code    = code;
 	op_common.status  = status;
 
-	PACK_OP_COMMON(1, &op_common);
+	usbip_net_pack_op_common(1, &op_common);
 
 	rc = usbip_net_send(sockfd, &op_common, sizeof(op_common));
 	if (rc < 0) {
@@ -176,7 +184,7 @@
 		goto err;
 	}
 
-	PACK_OP_COMMON(0, &op_common);
+	usbip_net_pack_op_common(0, &op_common);
 
 	if (op_common.version != USBIP_VERSION) {
 		err("USBIP Kernel and tool version mismatch: %d %d:",

diff --git a/tools/usb/usbip/src/usbip_network.h b/tools/usb/usbip/src/usbip_network.h
index 555215e..83b4c53 100644
--- a/tools/usb/usbip/src/usbip_network.h
+++ b/tools/usb/usbip/src/usbip_network.h

@@ -32,12 +32,6 @@
 
 } __attribute__((packed));
 
-#define PACK_OP_COMMON(pack, op_common)  do {\
-	usbip_net_pack_uint16_t(pack, &(op_common)->version);\
-	usbip_net_pack_uint16_t(pack, &(op_common)->code);\
-	usbip_net_pack_uint32_t(pack, &(op_common)->status);\
-} while (0)
-
 /* ---------------------------------------------------------------------- */
 /* Dummy Code */
 #define OP_UNSPEC	0x00
@@ -163,11 +157,11 @@
 } while (0)
 
 #define PACK_OP_DEVLIST_REPLY(pack, reply)  do {\
-	usbip_net_pack_uint32_t(pack, &(reply)->ndev);\
+	(reply)->ndev = usbip_net_pack_uint32_t(pack, (reply)->ndev);\
 } while (0)
 
-void usbip_net_pack_uint32_t(int pack, uint32_t *num);
-void usbip_net_pack_uint16_t(int pack, uint16_t *num);
+uint32_t usbip_net_pack_uint32_t(int pack, uint32_t num);
+uint16_t usbip_net_pack_uint16_t(int pack, uint16_t num);
 void usbip_net_pack_usb_device(int pack, struct usbip_usb_device *udev);
 void usbip_net_pack_usb_interface(int pack, struct usbip_usb_interface *uinf);
 

diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c
index 334b16d..4ee1bf6 100644
--- a/tools/vm/slabinfo.c
+++ b/tools/vm/slabinfo.c

@@ -29,7 +29,7 @@
 	char *name;
 	int alias;
 	int refs;
-	int aliases, align, cache_dma, cpu_slabs, destroy_by_rcu;
+	int aliases, align, cache_dma, cache_dma32, cpu_slabs, destroy_by_rcu;
 	unsigned int hwcache_align, object_size, objs_per_slab;
 	unsigned int sanity_checks, slab_size, store_user, trace;
 	int order, poison, reclaim_account, red_zone;
@@ -531,6 +531,8 @@
 		printf("** Hardware cacheline aligned\n");
 	if (s->cache_dma)
 		printf("** Memory is allocated in a special DMA zone\n");
+	if (s->cache_dma32)
+		printf("** Memory is allocated in a special DMA32 zone\n");
 	if (s->destroy_by_rcu)
 		printf("** Slabs are destroyed via RCU\n");
 	if (s->reclaim_account)
@@ -599,6 +601,8 @@
 		*p++ = '*';
 	if (s->cache_dma)
 		*p++ = 'd';
+	if (s->cache_dma32)
+		*p++ = 'D';
 	if (s->hwcache_align)
 		*p++ = 'A';
 	if (s->poison)
@@ -1205,6 +1209,7 @@
 			slab->aliases = get_obj("aliases");
 			slab->align = get_obj("align");
 			slab->cache_dma = get_obj("cache_dma");
+			slab->cache_dma32 = get_obj("cache_dma32");
 			slab->cpu_slabs = get_obj("cpu_slabs");
 			slab->destroy_by_rcu = get_obj("destroy_by_rcu");
 			slab->hwcache_align = get_obj("hwcache_align");

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index beec19f..4e499b7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c

@@ -2024,12 +2024,12 @@
 	if (slots->generation != ghc->generation)
 		__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
 
-	if (unlikely(!ghc->memslot))
-		return kvm_write_guest(kvm, gpa, data, len);
-
 	if (kvm_is_error_hva(ghc->hva))
 		return -EFAULT;
 
+	if (unlikely(!ghc->memslot))
+		return kvm_write_guest(kvm, gpa, data, len);
+
 	r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
 	if (r)
 		return -EFAULT;
@@ -2057,12 +2057,12 @@
 	if (slots->generation != ghc->generation)
 		__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
 
-	if (unlikely(!ghc->memslot))
-		return kvm_read_guest(kvm, ghc->gpa, data, len);
-
 	if (kvm_is_error_hva(ghc->hva))
 		return -EFAULT;
 
+	if (unlikely(!ghc->memslot))
+		return kvm_read_guest(kvm, ghc->gpa, data, len);
+
 	r = __copy_from_user(data, (void __user *)ghc->hva, len);
 	if (r)
 		return -EFAULT;
commit	8bd983177d60daba1e1df7e90bf02cfd0294e9b9	[log] [tgz]
author	Vaibhav Rustagi <vaibhavrustagi@google.com>	Tue Mar 10 15:10:28 2020 -0700
committer	Vaibhav Rustagi <vaibhavrustagi@google.com>	Tue Mar 10 15:11:30 2020 -0700
tree	1493a270e26d1bedd8cbd8507e1b896415e31d79
parent	b0f6c7137076f6294fffb18fed6f715573372042 [diff]
parent	cbfc70d9eef0ebbd53fcf581cf1af76777e7071b [diff]