From 833b45de69a6016c4b0cebe6765d526a31a81580 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 30 Sep 2019 18:48:44 +0200 Subject: kvm: x86, powerpc: do not allow clearing largepages debugfs entry The largepages debugfs entry is incremented/decremented as shadow pages are created or destroyed. Clearing it will result in an underflow, which is harmless to KVM but ugly (and could be misinterpreted by tools that use debugfs information), so make this particular statistic read-only. Cc: kvm-ppc@vger.kernel.org Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index fcb46b3374c6..719fc3e15ea4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1090,6 +1090,7 @@ enum kvm_stat_kind { struct kvm_stat_data { int offset; + int mode; struct kvm *kvm; }; @@ -1097,6 +1098,7 @@ struct kvm_stats_debugfs_item { const char *name; int offset; enum kvm_stat_kind kind; + int mode; }; extern struct kvm_stats_debugfs_item debugfs_entries[]; extern struct dentry *kvm_debugfs_dir; -- cgit v1.2.3-59-g8ed1b From f5a1a536fa14895ccff4e94e6a5af90901ce86aa Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 1 Oct 2019 11:10:52 +1000 Subject: lib: introduce copy_struct_from_user() helper A common pattern for syscall extensions is increasing the size of a struct passed from userspace, such that the zero-value of the new fields results in the old kernel behaviour (allowing for a mix of userspace and kernel vintages to operate on one another in most cases). While this interface exists for communication in both directions, only one direction (userspace passing a struct to the kernel) has straightforward, reasonable semantics. For kernel returns to userspace, what the correct semantics are (whether there should be an error if userspace is unaware of a new extension) is very syscall-dependent and thus probably cannot be unified between syscalls (a good example of this problem is [1]). Previously there was no common lib/ function that implemented the necessary extension-checking semantics (and different syscalls implemented them slightly differently or incompletely[2]). Future patches replace common uses of this pattern to make use of copy_struct_from_user(). In-kernel selftests are also added to ensure that the handling of alignment and various byte patterns is identical to memchr_inv() usage. [1]: commit 1251201c0d34 ("sched/core: Fix uclamp ABI bug, clean up and robustify sched_read_attr() ABI logic and code") [2]: For instance {sched_setattr,perf_event_open,clone3}(2) all do similar checks to copy_struct_from_user() while rt_sigprocmask(2) always rejects differently-sized struct arguments.
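To make the direction of compatibility concrete, here is a minimal userspace-side sketch. It is illustrative only: struct foo_v0/foo_v1 and the foobar syscall are hypothetical (they mirror the kernel-doc example added further down), and the comments describe what copy_struct_from_user() does for each size.

#include <stdint.h>
#include <unistd.h>

/* Hypothetical extensible syscall argument: v1 appends one field to v0. */
struct foo_v0 { uint64_t flags; };                   /* usize == 8  */
struct foo_v1 { uint64_t flags; uint64_t timeout; }; /* usize == 16 */

int main(void)
{
	/* Old userspace against a newer kernel (usize < ksize): the kernel
	 * zero-fills the 'timeout' it knows about but was not given, so the
	 * old behaviour is preserved. */
	struct foo_v0 old_arg = { .flags = 1 };

	/* New userspace against an older kernel (usize > ksize): the kernel
	 * checks the trailing bytes it does not understand; a non-zero
	 * 'timeout' is rejected with -E2BIG, an all-zero tail is accepted. */
	struct foo_v1 new_arg = { .flags = 1, .timeout = 0 };

	/* __NR_foobar is made up, so pass -1: this compiles and merely
	 * returns -ENOSYS if actually run. */
	syscall(-1, &old_arg, sizeof(old_arg));
	syscall(-1, &new_arg, sizeof(new_arg));
	return 0;
}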
Suggested-by: Rasmus Villemoes Signed-off-by: Aleksa Sarai Reviewed-by: Kees Cook Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20191001011055.19283-2-cyphar@cyphar.com Signed-off-by: Christian Brauner --- include/linux/bitops.h | 7 +++ include/linux/uaccess.h | 70 +++++++++++++++++++++++++ lib/strnlen_user.c | 8 +-- lib/test_user_copy.c | 136 +++++++++++++++++++++++++++++++++++++++++++++--- lib/usercopy.c | 55 ++++++++++++++++++++ 5 files changed, 263 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index cf074bce3eb3..c94a9ff9f082 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -4,6 +4,13 @@ #include #include +/* Set bits in the first 'n' bytes when loaded from memory */ +#ifdef __LITTLE_ENDIAN +# define aligned_byte_mask(n) ((1UL << 8*(n))-1) +#else +# define aligned_byte_mask(n) (~0xffUL << (BITS_PER_LONG - 8 - 8*(n))) +#endif + #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long)) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 70bbdc38dc37..e47d0522a1f4 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -231,6 +231,76 @@ __copy_from_user_inatomic_nocache(void *to, const void __user *from, #endif /* ARCH_HAS_NOCACHE_UACCESS */ +extern __must_check int check_zeroed_user(const void __user *from, size_t size); + +/** + * copy_struct_from_user: copy a struct from userspace + * @dst: Destination address, in kernel space. This buffer must be @ksize + * bytes long. + * @ksize: Size of @dst struct. + * @src: Source address, in userspace. + * @usize: (Alleged) size of @src struct. + * + * Copies a struct from userspace to kernel space, in a way that guarantees + * backwards-compatibility for struct syscall arguments (as long as future + * struct extensions are made such that all new fields are *appended* to the + * old struct, and zeroed-out new fields have the same meaning as the old + * struct). + * + * @ksize is just sizeof(*dst), and @usize should've been passed by userspace. + * The recommended usage is something like the following: + * + * SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize) + * { + * int err; + * struct foo karg = {}; + * + * if (usize > PAGE_SIZE) + * return -E2BIG; + * if (usize < FOO_SIZE_VER0) + * return -EINVAL; + * + * err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize); + * if (err) + * return err; + * + * // ... + * } + * + * There are three cases to consider: + * * If @usize == @ksize, then it's copied verbatim. + * * If @usize < @ksize, then the userspace has passed an old struct to a + * newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize) + * are to be zero-filled. + * * If @usize > @ksize, then the userspace has passed a new struct to an + * older kernel. The trailing bytes unknown to the kernel (@usize - @ksize) + * are checked to ensure they are zeroed, otherwise -E2BIG is returned. + * + * Returns (in all cases, some data may have been copied): + * * -E2BIG: (@usize > @ksize) and there are non-zero trailing bytes in @src. + * * -EFAULT: access to userspace failed. + */ +static __always_inline __must_check int +copy_struct_from_user(void *dst, size_t ksize, const void __user *src, + size_t usize) +{ + size_t size = min(ksize, usize); + size_t rest = max(ksize, usize) - size; + + /* Deal with trailing bytes. 
*/ + if (usize < ksize) { + memset(dst + size, 0, rest); + } else if (usize > ksize) { + int ret = check_zeroed_user(src + size, rest); + if (ret <= 0) + return ret ?: -E2BIG; + } + /* Copy the interoperable parts of the struct. */ + if (copy_from_user(dst, src, size)) + return -EFAULT; + return 0; +} + /* * probe_kernel_read(): safely attempt to read from a location * @dst: pointer to the buffer that shall take the data diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 28ff554a1be8..6c0005d5dd5c 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -3,16 +3,10 @@ #include #include #include +#include #include -/* Set bits in the first 'n' bytes when loaded from memory */ -#ifdef __LITTLE_ENDIAN -# define aligned_byte_mask(n) ((1ul << 8*(n))-1) -#else -# define aligned_byte_mask(n) (~0xfful << (BITS_PER_LONG - 8 - 8*(n))) -#endif - /* * Do a strnlen, return length of string *with* final '\0'. * 'count' is the user-supplied count, while 'max' is the diff --git a/lib/test_user_copy.c b/lib/test_user_copy.c index 67bcd5dfd847..950ee88cd6ac 100644 --- a/lib/test_user_copy.c +++ b/lib/test_user_copy.c @@ -31,14 +31,133 @@ # define TEST_U64 #endif -#define test(condition, msg) \ -({ \ - int cond = (condition); \ - if (cond) \ - pr_warn("%s\n", msg); \ - cond; \ +#define test(condition, msg, ...) \ +({ \ + int cond = (condition); \ + if (cond) \ + pr_warn("[%d] " msg "\n", __LINE__, ##__VA_ARGS__); \ + cond; \ }) +static bool is_zeroed(void *from, size_t size) +{ + return memchr_inv(from, 0x0, size) == NULL; +} + +static int test_check_nonzero_user(char *kmem, char __user *umem, size_t size) +{ + int ret = 0; + size_t start, end, i; + size_t zero_start = size / 4; + size_t zero_end = size - zero_start; + + /* + * We conduct a series of check_nonzero_user() tests on a block of memory + * with the following byte-pattern (trying every possible [start,end] + * pair): + * + * [ 00 ff 00 ff ... 00 00 00 00 ... ff 00 ff 00 ] + * + * And we verify that check_nonzero_user() acts identically to memchr_inv(). + */ + + memset(kmem, 0x0, size); + for (i = 1; i < zero_start; i += 2) + kmem[i] = 0xff; + for (i = zero_end; i < size; i += 2) + kmem[i] = 0xff; + + ret |= test(copy_to_user(umem, kmem, size), + "legitimate copy_to_user failed"); + + for (start = 0; start <= size; start++) { + for (end = start; end <= size; end++) { + size_t len = end - start; + int retval = check_zeroed_user(umem + start, len); + int expected = is_zeroed(kmem + start, len); + + ret |= test(retval != expected, + "check_nonzero_user(=%d) != memchr_inv(=%d) mismatch (start=%zu, end=%zu)", + retval, expected, start, end); + } + } + + return ret; +} + +static int test_copy_struct_from_user(char *kmem, char __user *umem, + size_t size) +{ + int ret = 0; + char *umem_src = NULL, *expected = NULL; + size_t ksize, usize; + + umem_src = kmalloc(size, GFP_KERNEL); + if (ret |= test(umem_src == NULL, "kmalloc failed")) + goto out_free; + + expected = kmalloc(size, GFP_KERNEL); + if (ret |= test(expected == NULL, "kmalloc failed")) + goto out_free; + + /* Fill umem with a fixed byte pattern. */ + memset(umem_src, 0x3e, size); + ret |= test(copy_to_user(umem, umem_src, size), + "legitimate copy_to_user failed"); + + /* Check basic case -- (usize == ksize). 
*/ + ksize = size; + usize = size; + + memcpy(expected, umem_src, ksize); + + memset(kmem, 0x0, size); + ret |= test(copy_struct_from_user(kmem, ksize, umem, usize), + "copy_struct_from_user(usize == ksize) failed"); + ret |= test(memcmp(kmem, expected, ksize), + "copy_struct_from_user(usize == ksize) gives unexpected copy"); + + /* Old userspace case -- (usize < ksize). */ + ksize = size; + usize = size / 2; + + memcpy(expected, umem_src, usize); + memset(expected + usize, 0x0, ksize - usize); + + memset(kmem, 0x0, size); + ret |= test(copy_struct_from_user(kmem, ksize, umem, usize), + "copy_struct_from_user(usize < ksize) failed"); + ret |= test(memcmp(kmem, expected, ksize), + "copy_struct_from_user(usize < ksize) gives unexpected copy"); + + /* New userspace (-E2BIG) case -- (usize > ksize). */ + ksize = size / 2; + usize = size; + + memset(kmem, 0x0, size); + ret |= test(copy_struct_from_user(kmem, ksize, umem, usize) != -E2BIG, + "copy_struct_from_user(usize > ksize) didn't give E2BIG"); + + /* New userspace (success) case -- (usize > ksize). */ + ksize = size / 2; + usize = size; + + memcpy(expected, umem_src, ksize); + ret |= test(clear_user(umem + ksize, usize - ksize), + "legitimate clear_user failed"); + + memset(kmem, 0x0, size); + ret |= test(copy_struct_from_user(kmem, ksize, umem, usize), + "copy_struct_from_user(usize > ksize) failed"); + ret |= test(memcmp(kmem, expected, ksize), + "copy_struct_from_user(usize > ksize) gives unexpected copy"); + +out_free: + kfree(expected); + kfree(umem_src); + return ret; +} + static int __init test_user_copy_init(void) { int ret = 0; @@ -106,6 +225,11 @@ static int __init test_user_copy_init(void) #endif #undef test_legit + /* Test usage of check_nonzero_user(). */ + ret |= test_check_nonzero_user(kmem, usermem, 2 * PAGE_SIZE); + /* Test usage of copy_struct_from_user(). */ + ret |= test_copy_struct_from_user(kmem, usermem, 2 * PAGE_SIZE); + /* * Invalid usage: none of these copies should succeed. */ diff --git a/lib/usercopy.c b/lib/usercopy.c index c2bfbcaeb3dc..cbb4d9ec00f2 100644 --- a/lib/usercopy.c +++ b/lib/usercopy.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include /* out-of-line parts */ @@ -31,3 +32,57 @@ unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) } EXPORT_SYMBOL(_copy_to_user); #endif + +/** + * check_zeroed_user: check if a userspace buffer only contains zero bytes + * @from: Source address, in userspace. + * @size: Size of buffer. + * + * This is effectively shorthand for "memchr_inv(from, 0, size) == NULL" for + * userspace addresses (and is more efficient because we don't care where the + * first non-zero byte is). + * + * Returns: + * * 0: There were non-zero bytes present in the buffer. + * * 1: The buffer was full of zero bytes. + * * -EFAULT: access to userspace failed. 
+ */ +int check_zeroed_user(const void __user *from, size_t size) +{ + unsigned long val; + uintptr_t align = (uintptr_t) from % sizeof(unsigned long); + + if (unlikely(size == 0)) + return 1; + + from -= align; + size += align; + + if (!user_access_begin(from, size)) + return -EFAULT; + + unsafe_get_user(val, (unsigned long __user *) from, err_fault); + if (align) + val &= ~aligned_byte_mask(align); + + while (size > sizeof(unsigned long)) { + if (unlikely(val)) + goto done; + + from += sizeof(unsigned long); + size -= sizeof(unsigned long); + + unsafe_get_user(val, (unsigned long __user *) from, err_fault); + } + + if (size < sizeof(unsigned long)) + val &= aligned_byte_mask(size); + +done: + user_access_end(); + return (val == 0); +err_fault: + user_access_end(); + return -EFAULT; +} +EXPORT_SYMBOL(check_zeroed_user); -- cgit v1.2.3-59-g8ed1b From 895b5c9f206eb7d25dc1360a8ccfc5958895eb89 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 29 Sep 2019 20:54:03 +0200 Subject: netfilter: drop bridge nf reset from nf_reset commit 174e23810cd31 ("sk_buff: drop all skb extensions on free and skb scrubbing") made napi recycle always drop skb extensions. The additional skb_ext_del() that is performed via nf_reset on napi skb recycle is not needed anymore. Most nf_reset() calls in the stack are there so queued skb won't block 'rmmod nf_conntrack' indefinitely. This removes the skb_ext_del from nf_reset, and renames it to a more fitting nf_reset_ct(). In a few selected places, add a call to skb_ext_reset to make sure that no active extensions remain. I am submitting this for "net", because we're still early in the release cycle. The patch applies to net-next too, but I think the rename causes needless divergence between those trees. Suggested-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- drivers/net/ppp/pptp.c | 4 ++-- drivers/net/tun.c | 2 +- drivers/net/virtio_net.c | 2 +- drivers/net/vrf.c | 8 ++++---- drivers/net/wireless/mac80211_hwsim.c | 4 ++-- drivers/staging/octeon/ethernet-tx.c | 6 ++---- include/linux/skbuff.h | 5 +---- net/batman-adv/soft-interface.c | 2 +- net/core/skbuff.c | 2 +- net/dccp/ipv4.c | 2 +- net/ipv4/ip_input.c | 2 +- net/ipv4/ipmr.c | 4 ++-- net/ipv4/netfilter/nf_dup_ipv4.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/udp.c | 4 ++-- net/ipv6/ip6_input.c | 2 +- net/ipv6/netfilter/nf_dup_ipv6.c | 2 +- net/ipv6/raw.c | 2 +- net/l2tp/l2tp_core.c | 2 +- net/l2tp/l2tp_eth.c | 2 +- net/l2tp/l2tp_ip.c | 2 +- net/l2tp/l2tp_ip6.c | 2 +- net/netfilter/ipvs/ip_vs_xmit.c | 2 +- net/openvswitch/vport-internal_dev.c | 2 +- net/packet/af_packet.c | 4 ++-- net/sctp/input.c | 2 +- net/xfrm/xfrm_input.c | 2 +- net/xfrm/xfrm_interface.c | 2 +- net/xfrm/xfrm_output.c | 2 +- net/xfrm/xfrm_policy.c | 2 +- 31 files changed, 40 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index 734de7de03f7..e1fabb3e3246 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -238,7 +238,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb) skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); - nf_reset(skb); + nf_reset_ct(skb); skb->ip_summed = CHECKSUM_NONE; ip_select_ident(net, skb, NULL); @@ -358,7 +358,7 @@ static int pptp_rcv(struct sk_buff *skb) po = lookup_chan(htons(header->call_id), iph->saddr); if (po) { skb_dst_drop(skb); - nf_reset(skb); + nf_reset_ct(skb); return sk_receive_skb(sk_pppox(po), skb, 0); } drop: diff --git 
a/drivers/net/tun.c b/drivers/net/tun.c index aab0be40d443..812dc3a65efb 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1104,7 +1104,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) */ skb_orphan(skb); - nf_reset(skb); + nf_reset_ct(skb); if (ptr_ring_produce(&tfile->tx_ring, skb)) goto drop; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index ba98e0971b84..5a635f028bdc 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1585,7 +1585,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) /* Don't wait up for transmitted skbs to be freed. */ if (!use_napi) { skb_orphan(skb); - nf_reset(skb); + nf_reset_ct(skb); } /* If running out of space, stop queue to avoid getting packets that we diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index a4b38a980c3c..ee52bde058df 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -366,7 +366,7 @@ static int vrf_finish_output6(struct net *net, struct sock *sk, struct neighbour *neigh; int ret; - nf_reset(skb); + nf_reset_ct(skb); skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; @@ -459,7 +459,7 @@ static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev, /* reset skb device */ if (likely(err == 1)) - nf_reset(skb); + nf_reset_ct(skb); else skb = NULL; @@ -560,7 +560,7 @@ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *s bool is_v6gw = false; int ret = -EINVAL; - nf_reset(skb); + nf_reset_ct(skb); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { @@ -670,7 +670,7 @@ static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev, /* reset skb device */ if (likely(err == 1)) - nf_reset(skb); + nf_reset_ct(skb); else skb = NULL; diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 635956024e88..45c73a6f09a1 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -1261,8 +1261,8 @@ static bool mac80211_hwsim_tx_frame_no_nl(struct ieee80211_hw *hw, skb_orphan(skb); skb_dst_drop(skb); skb->mark = 0; - secpath_reset(skb); - nf_reset(skb); + skb_ext_reset(skb); + nf_reset_ct(skb); /* * Get absolute mactime here so all HWs RX at the "same time", and diff --git a/drivers/staging/octeon/ethernet-tx.c b/drivers/staging/octeon/ethernet-tx.c index c64728fc21f2..a62057555d1b 100644 --- a/drivers/staging/octeon/ethernet-tx.c +++ b/drivers/staging/octeon/ethernet-tx.c @@ -349,10 +349,8 @@ int cvm_oct_xmit(struct sk_buff *skb, struct net_device *dev) */ dst_release(skb_dst(skb)); skb_dst_set(skb, NULL); -#ifdef CONFIG_XFRM - secpath_reset(skb); -#endif - nf_reset(skb); + skb_ext_reset(skb); + nf_reset_ct(skb); #ifdef CONFIG_NET_SCHED skb->tc_index = 0; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e7d3b1a513ef..4351577b14d7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4160,15 +4160,12 @@ static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {} static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {} #endif /* CONFIG_SKB_EXTENSIONS */ -static inline void nf_reset(struct sk_buff *skb) +static inline void nf_reset_ct(struct sk_buff *skb) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb_nfct(skb)); skb->_nfct = 0; #endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - skb_ext_del(skb, SKB_EXT_BRIDGE_NF); -#endif } static inline void 
nf_reset_trace(struct sk_buff *skb) diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index a1146cb10919..9cbed6f5a85a 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -436,7 +436,7 @@ void batadv_interface_rx(struct net_device *soft_iface, /* clean the netfilter state now that the batman-adv header has been * removed */ - nf_reset(skb); + nf_reset_ct(skb); if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) goto dropped; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 01d65206f4fb..529133611ea2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5120,7 +5120,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->ignore_df = 0; skb_dst_drop(skb); skb_ext_reset(skb); - nf_reset(skb); + nf_reset_ct(skb); nf_reset_trace(skb); #ifdef CONFIG_NET_SWITCHDEV diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index b685bc82f8d0..d9b4200ed12d 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -871,7 +871,7 @@ lookup: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - nf_reset(skb); + nf_reset_ct(skb); return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, refcounted); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 1e2392b7c64e..c59a78a267c3 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -199,7 +199,7 @@ resubmit: kfree_skb(skb); return; } - nf_reset(skb); + nf_reset_ct(skb); } ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv, skb); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 313470f6bb14..716d5472c022 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1794,7 +1794,7 @@ static void ip_encap(struct net *net, struct sk_buff *skb, ip_send_check(iph); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - nf_reset(skb); + nf_reset_ct(skb); } static inline int ipmr_forward_finish(struct net *net, struct sock *sk, @@ -2140,7 +2140,7 @@ int ip_mr_input(struct sk_buff *skb) mroute_sk = rcu_dereference(mrt->mroute_sk); if (mroute_sk) { - nf_reset(skb); + nf_reset_ct(skb); raw_rcv(mroute_sk, skb); return 0; } diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index af3fbf76dbd3..6cc5743c553a 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -65,7 +65,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* Avoid counting cloned packets towards the original connection. 
*/ - nf_reset(skb); + nf_reset_ct(skb); nf_ct_set(skb, NULL, IP_CT_UNTRACKED); #endif /* diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 80da5a66d5d7..3183413ebc6c 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -332,7 +332,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); return NET_RX_DROP; } - nf_reset(skb); + nf_reset_ct(skb); skb_push(skb, skb->data - skb_network_header(skb)); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2ee45e3755e9..bf124b1742df 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1916,7 +1916,7 @@ process: if (tcp_v4_inbound_md5_hash(sk, skb)) goto discard_and_relse; - nf_reset(skb); + nf_reset_ct(skb); if (tcp_filter(sk, skb)) goto discard_and_relse; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cf755156a684..e8443cc5c1ab 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1969,7 +1969,7 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) */ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto drop; - nf_reset(skb); + nf_reset_ct(skb); if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) { int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); @@ -2298,7 +2298,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - nf_reset(skb); + nf_reset_ct(skb); /* No socket. Drop packet silently, if checksum is wrong */ if (udp_lib_checksum_complete(skb)) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index d432d0011c16..7e5df23cbe7b 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -371,7 +371,7 @@ resubmit_final: /* Free reference early: we don't need it any more, and it may hold ip_conntrack module loaded indefinitely. */ - nf_reset(skb); + nf_reset_ct(skb); skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index e6c9da9866b1..a0a2de30be3e 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -54,7 +54,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, return; #if IS_ENABLED(CONFIG_NF_CONNTRACK) - nf_reset(skb); + nf_reset_ct(skb); nf_ct_set(skb, NULL, IP_CT_UNTRACKED); #endif if (hooknum == NF_INET_PRE_ROUTING || diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 6e1888ee4036..a77f6b7d3a7c 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -215,7 +215,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) /* Not releasing hash table! 
*/ if (clone) { - nf_reset(clone); + nf_reset_ct(clone); rawv6_rcv(sk, clone); } } diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 105e5a7092e7..f82ea12bac37 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1078,7 +1078,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); - nf_reset(skb); + nf_reset_ct(skb); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index bd3f39349d40..fd5ac2788e45 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -151,7 +151,7 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, skb->ip_summed = CHECKSUM_NONE; skb_dst_drop(skb); - nf_reset(skb); + nf_reset_ct(skb); rcu_read_lock(); dev = rcu_dereference(spriv->dev); diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 622833317dcb..0d7c887a2b75 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -193,7 +193,7 @@ pass_up: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; - nf_reset(skb); + nf_reset_ct(skb); return sk_receive_skb(sk, skb, 1); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 687e23a8b326..802f19aba7e3 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -206,7 +206,7 @@ pass_up: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; - nf_reset(skb); + nf_reset_ct(skb); return sk_receive_skb(sk, skb, 1); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 9c464d24beec..888d3068a492 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -613,7 +613,7 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, if (unlikely(cp->flags & IP_VS_CONN_F_NFCT)) ret = ip_vs_confirm_conntrack(skb); if (ret == NF_ACCEPT) { - nf_reset(skb); + nf_reset_ct(skb); skb_forward_csum(skb); } return ret; diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index d2437b5b2f6a..21c90d3a7ebf 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -237,7 +237,7 @@ static netdev_tx_t internal_dev_recv(struct sk_buff *skb) } skb_dst_drop(skb); - nf_reset(skb); + nf_reset_ct(skb); secpath_reset(skb); skb->pkt_type = PACKET_HOST; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e2742b006d25..82a50e850245 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1821,7 +1821,7 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, skb_dst_drop(skb); /* drop conntrack reference */ - nf_reset(skb); + nf_reset_ct(skb); spkt = &PACKET_SKB_CB(skb)->sa.pkt; @@ -2121,7 +2121,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, skb_dst_drop(skb); /* drop conntrack reference */ - nf_reset(skb); + nf_reset_ct(skb); spin_lock(&sk->sk_receive_queue.lock); po->stats.stats1.tp_packets++; diff --git a/net/sctp/input.c b/net/sctp/input.c index 1008cdc44dd6..5a070fb5b278 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -201,7 +201,7 @@ int sctp_rcv(struct sk_buff *skb) if (!xfrm_policy_check(sk, XFRM_POLICY_IN, skb, family)) goto discard_release; - nf_reset(skb); + nf_reset_ct(skb); if (sk_filter(sk, skb)) goto discard_release; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 6088bc2dc11e..9b599ed66d97 100644 --- a/net/xfrm/xfrm_input.c +++ 
b/net/xfrm/xfrm_input.c @@ -706,7 +706,7 @@ resume: if (err) goto drop; - nf_reset(skb); + nf_reset_ct(skb); if (decaps) { sp = skb_sec_path(skb); diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 2ab4859df55a..0f5131bc3342 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -185,7 +185,7 @@ static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); - nf_reset(skb); + nf_reset_ct(skb); nf_reset_trace(skb); if (!xnet) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 9499b35feb92..b1db55b50ba1 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -502,7 +502,7 @@ int xfrm_output_resume(struct sk_buff *skb, int err) struct net *net = xs_net(skb_dst(skb)->xfrm); while (likely((err = xfrm_output_one(skb, err)) == 0)) { - nf_reset(skb); + nf_reset_ct(skb); err = skb_dst(skb)->ops->local_out(net, skb->sk, skb); if (unlikely(err != 1)) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 21e939235b39..f2d1e573ea55 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2808,7 +2808,7 @@ static void xfrm_policy_queue_process(struct timer_list *t) continue; } - nf_reset(skb); + nf_reset_ct(skb); skb_dst_drop(skb); skb_dst_set(skb, dst); -- cgit v1.2.3-59-g8ed1b From 82af5b6609673f1cc7d3453d0be3d292dfffc813 Mon Sep 17 00:00:00 2001 From: Nayna Jain Date: Tue, 1 Oct 2019 19:37:18 -0400 Subject: sysfs: Fixes __BIN_ATTR_WO() macro This patch fixes the size and write parameters for the macro __BIN_ATTR_WO(). Fixes: 7f905761e15a8 ("sysfs: add BIN_ATTR_WO() macro") Signed-off-by: Nayna Jain Link: https://lore.kernel.org/r/1569973038-2710-1-git-send-email-nayna@linux.ibm.com Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 5420817ed317..fa7ee503fb76 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -196,9 +196,9 @@ struct bin_attribute { .size = _size, \ } -#define __BIN_ATTR_WO(_name) { \ +#define __BIN_ATTR_WO(_name, _size) { \ .attr = { .name = __stringify(_name), .mode = 0200 }, \ - .store = _name##_store, \ + .write = _name##_write, \ .size = _size, \ } -- cgit v1.2.3-59-g8ed1b From 30945d31e5761436d9eba6b8cff468a5f7c9c266 Mon Sep 17 00:00:00 2001 From: Nuno Sá Date: Tue, 24 Sep 2019 14:49:43 +0200 Subject: hwmon: Fix HWMON_P_MIN_ALARM mask Both HWMON_P_MIN_ALARM and HWMON_P_MAX_ALARM were using BIT(hwmon_power_max_alarm).
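A stand-alone sketch of why the old definition was harmful; this is not kernel code, the enum values below are assumptions chosen only so the snippet compiles on its own (the real values live in enum hwmon_power_attributes in the hunk below), and the *_BUG/*_FIX names are made up.

#include <stdio.h>

/* Stand-ins for the relevant members of enum hwmon_power_attributes. */
enum { hwmon_power_min_alarm = 12, hwmon_power_max_alarm = 13 };
#define BIT(nr) (1UL << (nr))

#define HWMON_P_MAX_ALARM     BIT(hwmon_power_max_alarm)
#define HWMON_P_MIN_ALARM_BUG BIT(hwmon_power_max_alarm) /* pre-fix definition */
#define HWMON_P_MIN_ALARM_FIX BIT(hwmon_power_min_alarm) /* post-fix definition */

int main(void)
{
	/* With the old definition the two alarms shared one bit, so a driver
	 * advertising HWMON_P_MIN_ALARM effectively exposed max_alarm. */
	printf("buggy min == max: %d\n", HWMON_P_MIN_ALARM_BUG == HWMON_P_MAX_ALARM); /* 1 */
	printf("fixed min == max: %d\n", HWMON_P_MIN_ALARM_FIX == HWMON_P_MAX_ALARM); /* 0 */
	return 0;
}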
Fixes: aa7f29b07c870 ("hwmon: Add support for power min, lcrit, min_alarm and lcrit_alarm") CC: Signed-off-by: Nuno Sá Link: https://lore.kernel.org/r/20190924124945.491326-2-nuno.sa@analog.com Signed-off-by: Guenter Roeck --- include/linux/hwmon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 04c36b7a61dd..72579168189d 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -235,7 +235,7 @@ enum hwmon_power_attributes { #define HWMON_P_LABEL BIT(hwmon_power_label) #define HWMON_P_ALARM BIT(hwmon_power_alarm) #define HWMON_P_CAP_ALARM BIT(hwmon_power_cap_alarm) -#define HWMON_P_MIN_ALARM BIT(hwmon_power_max_alarm) +#define HWMON_P_MIN_ALARM BIT(hwmon_power_min_alarm) #define HWMON_P_MAX_ALARM BIT(hwmon_power_max_alarm) #define HWMON_P_LCRIT_ALARM BIT(hwmon_power_lcrit_alarm) #define HWMON_P_CRIT_ALARM BIT(hwmon_power_crit_alarm) -- cgit v1.2.3-59-g8ed1b From 3e8db7e56082156a37b71d7334860c10fcea8025 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 1 Oct 2019 21:58:19 +0300 Subject: net: dsa: sja1105: Fix sleeping while atomic in .port_hwtstamp_set Currently this stack trace can be seen with CONFIG_DEBUG_ATOMIC_SLEEP=y: [ 41.568348] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:909 [ 41.576757] in_atomic(): 1, irqs_disabled(): 0, pid: 208, name: ptp4l [ 41.583212] INFO: lockdep is turned off. [ 41.587123] CPU: 1 PID: 208 Comm: ptp4l Not tainted 5.3.0-rc6-01445-ge950f2d4bc7f-dirty #1827 [ 41.599873] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 41.607584] [] (show_stack) from [] (dump_stack+0xd4/0x100) [ 41.614863] [] (dump_stack) from [] (___might_sleep+0x1c8/0x2b4) [ 41.622574] [] (___might_sleep) from [] (__mutex_lock+0x48/0xab8) [ 41.630368] [] (__mutex_lock) from [] (mutex_lock_nested+0x1c/0x24) [ 41.638340] [] (mutex_lock_nested) from [] (sja1105_static_config_reload+0x30/0x27c) [ 41.647779] [] (sja1105_static_config_reload) from [] (sja1105_hwtstamp_set+0x108/0x1cc) [ 41.657562] [] (sja1105_hwtstamp_set) from [] (dev_ifsioc+0x18c/0x330) [ 41.665788] [] (dev_ifsioc) from [] (dev_ioctl+0x320/0x6e8) [ 41.673064] [] (dev_ioctl) from [] (sock_ioctl+0x334/0x5e8) [ 41.680340] [] (sock_ioctl) from [] (do_vfs_ioctl+0xb0/0xa10) [ 41.687789] [] (do_vfs_ioctl) from [] (ksys_ioctl+0x34/0x58) [ 41.695151] [] (ksys_ioctl) from [] (ret_fast_syscall+0x0/0x28) [ 41.702768] Exception stack(0xe8495fa8 to 0xe8495ff0) [ 41.707796] 5fa0: beff4a8c 00000001 00000011 000089b0 beff4a8c beff4a80 [ 41.715933] 5fc0: beff4a8c 00000001 0000000c 00000036 b6fa98c8 004e19c1 00000001 00000000 [ 41.724069] 5fe0: 004dcedc beff4a6c 004c0738 b6e7af4c [ 41.729860] BUG: scheduling while atomic: ptp4l/208/0x00000002 [ 41.735682] INFO: lockdep is turned off. Enabling RX timestamping will logically disturb the fastpath (processing of meta frames). Replace bool hwts_rx_en with a bit that is checked atomically from the fastpath and temporarily unset from the sleepable context during a change of the RX timestamping process (a destructive operation anyways, requires switch reset). If found unset, the fastpath (net/dsa/tag_sja1105.c) will just drop any received meta frame and not take the meta_lock at all. Fixes: a602afd200f5 ("net: dsa: sja1105: Expose PTP timestamping ioctls to userspace") Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 19 +++++++++++-------- include/linux/dsa/sja1105.h | 4 +++- net/dsa/tag_sja1105.c | 12 +++++++++++- 3 files changed, 25 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index ea2e7f4f96d0..7687ddcae159 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -1897,7 +1897,9 @@ static int sja1105_set_ageing_time(struct dsa_switch *ds, return sja1105_static_config_reload(priv); } -/* Caller must hold priv->tagger_data.meta_lock */ +/* Must be called only with priv->tagger_data.state bit + * SJA1105_HWTS_RX_EN cleared + */ static int sja1105_change_rxtstamping(struct sja1105_private *priv, bool on) { @@ -1954,16 +1956,17 @@ static int sja1105_hwtstamp_set(struct dsa_switch *ds, int port, break; } - if (rx_on != priv->tagger_data.hwts_rx_en) { - spin_lock(&priv->tagger_data.meta_lock); + if (rx_on != test_bit(SJA1105_HWTS_RX_EN, &priv->tagger_data.state)) { + clear_bit(SJA1105_HWTS_RX_EN, &priv->tagger_data.state); + rc = sja1105_change_rxtstamping(priv, rx_on); - spin_unlock(&priv->tagger_data.meta_lock); if (rc < 0) { dev_err(ds->dev, "Failed to change RX timestamping: %d\n", rc); - return -EFAULT; + return rc; } - priv->tagger_data.hwts_rx_en = rx_on; + if (rx_on) + set_bit(SJA1105_HWTS_RX_EN, &priv->tagger_data.state); } if (copy_to_user(ifr->ifr_data, &config, sizeof(config))) @@ -1982,7 +1985,7 @@ static int sja1105_hwtstamp_get(struct dsa_switch *ds, int port, config.tx_type = HWTSTAMP_TX_ON; else config.tx_type = HWTSTAMP_TX_OFF; - if (priv->tagger_data.hwts_rx_en) + if (test_bit(SJA1105_HWTS_RX_EN, &priv->tagger_data.state)) config.rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT; else config.rx_filter = HWTSTAMP_FILTER_NONE; @@ -2031,7 +2034,7 @@ static bool sja1105_port_rxtstamp(struct dsa_switch *ds, int port, struct sja1105_private *priv = ds->priv; struct sja1105_tagger_data *data = &priv->tagger_data; - if (!data->hwts_rx_en) + if (!test_bit(SJA1105_HWTS_RX_EN, &data->state)) return false; /* We need to read the full PTP clock to reconstruct the Rx diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 79435cfc20eb..897e799dbcb9 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -31,6 +31,8 @@ #define SJA1105_META_SMAC 0x222222222222ull #define SJA1105_META_DMAC 0x0180C200000Eull +#define SJA1105_HWTS_RX_EN 0 + /* Global tagger data: each struct sja1105_port has a reference to * the structure defined in struct sja1105_private. */ @@ -42,7 +44,7 @@ struct sja1105_tagger_data { * from taggers running on multiple ports on SMP systems */ spinlock_t meta_lock; - bool hwts_rx_en; + unsigned long state; }; struct sja1105_skb_cb { diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 9c9aff3e52cf..63ef2a14c934 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -156,7 +156,11 @@ static struct sk_buff /* Step 1: A timestampable frame was received. * Buffer it until we get its meta frame. */ - if (is_link_local && sp->data->hwts_rx_en) { + if (is_link_local) { + if (!test_bit(SJA1105_HWTS_RX_EN, &sp->data->state)) + /* Do normal processing. */ + return skb; + spin_lock(&sp->data->meta_lock); /* Was this a link-local frame instead of the meta * that we were expecting? 
@@ -187,6 +191,12 @@ static struct sk_buff } else if (is_meta) { struct sk_buff *stampable_skb; + /* Drop the meta frame if we're not in the right state + * to process it. + */ + if (!test_bit(SJA1105_HWTS_RX_EN, &sp->data->state)) + return NULL; + spin_lock(&sp->data->meta_lock); stampable_skb = sp->data->stampable_skb; -- cgit v1.2.3-59-g8ed1b From 4cf6c57e61fee954f7b7685de31b80ec26843d27 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 4 Oct 2019 17:05:58 +0100 Subject: net: phy: fix write to mii-ctrl1000 register When userspace writes to the MII_ADVERTISE register, we update phylib's advertising mask and trigger a renegotiation. However, writing to the MII_CTRL1000 register, which contains the gigabit advertisement, does neither. This can lead to phylib's copy of the advertisement becoming de-synced with the values in the PHY register set, which can result in incorrect negotiation resolution. Fixes: 5502b218e001 ("net: phy: use phy_resolve_aneg_linkmode in genphy_read_status") Reviewed-by: Andrew Lunn Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 5 +++++ include/linux/mii.h | 9 +++++++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 7c92afd36bbe..119e6f466056 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -457,6 +457,11 @@ int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd) val); change_autoneg = true; break; + case MII_CTRL1000: + mii_ctrl1000_mod_linkmode_adv_t(phydev->advertising, + val); + change_autoneg = true; + break; default: /* do nothing */ break; diff --git a/include/linux/mii.h b/include/linux/mii.h index 5cd824c1c0ca..4ce8901a1af6 100644 --- a/include/linux/mii.h +++ b/include/linux/mii.h @@ -455,6 +455,15 @@ static inline void mii_lpa_mod_linkmode_lpa_t(unsigned long *lp_advertising, lp_advertising, lpa & LPA_LPACK); } +static inline void mii_ctrl1000_mod_linkmode_adv_t(unsigned long *advertising, + u32 ctrl1000) +{ + linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, advertising, + ctrl1000 & ADVERTISE_1000HALF); + linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, advertising, + ctrl1000 & ADVERTISE_1000FULL); +} + /** * linkmode_adv_to_lcl_adv_t * @advertising:pointer to linkmode advertising -- cgit v1.2.3-59-g8ed1b From 8d3dc3ac9dd6801c732a72ca6979698c38451b4f Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 4 Oct 2019 17:06:04 +0100 Subject: net: phy: extract link partner advertisement reading Move reading the link partner advertisement out of genphy_read_status() into its own separate function. This will allow re-use of this code by PHY drivers that are able to read the resolved status from the PHY. Tested-by: tinywrkb Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/phy/phy_device.c | 65 +++++++++++++++++++++++++++----------------- include/linux/phy.h | 1 + 2 files changed, 41 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index d347ddcac45b..9d2bbb13293e 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1783,32 +1783,9 @@ done: } EXPORT_SYMBOL(genphy_update_link); -/** - * genphy_read_status - check the link status and update current link state - * @phydev: target phy_device struct - * - * Description: Check the link, then figure out the current state - * by comparing what we advertise with what the link partner - * advertises. Start by checking the gigabit possibilities, - * then move on to 10/100. - */ -int genphy_read_status(struct phy_device *phydev) +int genphy_read_lpa(struct phy_device *phydev) { - int lpa, lpagb, err, old_link = phydev->link; - - /* Update the link, but return if there was an error */ - err = genphy_update_link(phydev); - if (err) - return err; - - /* why bother the PHY if nothing can have changed */ - if (phydev->autoneg == AUTONEG_ENABLE && old_link && phydev->link) - return 0; - - phydev->speed = SPEED_UNKNOWN; - phydev->duplex = DUPLEX_UNKNOWN; - phydev->pause = 0; - phydev->asym_pause = 0; + int lpa, lpagb; if (phydev->autoneg == AUTONEG_ENABLE && phydev->autoneg_complete) { if (phydev->is_gigabit_capable) { @@ -1838,6 +1815,44 @@ int genphy_read_status(struct phy_device *phydev) return lpa; mii_lpa_mod_linkmode_lpa_t(phydev->lp_advertising, lpa); + } + + return 0; +} +EXPORT_SYMBOL(genphy_read_lpa); + +/** + * genphy_read_status - check the link status and update current link state + * @phydev: target phy_device struct + * + * Description: Check the link, then figure out the current state + * by comparing what we advertise with what the link partner + * advertises. Start by checking the gigabit possibilities, + * then move on to 10/100. 
+ */ +int genphy_read_status(struct phy_device *phydev) +{ + int err, old_link = phydev->link; + + /* Update the link, but return if there was an error */ + err = genphy_update_link(phydev); + if (err) + return err; + + /* why bother the PHY if nothing can have changed */ + if (phydev->autoneg == AUTONEG_ENABLE && old_link && phydev->link) + return 0; + + phydev->speed = SPEED_UNKNOWN; + phydev->duplex = DUPLEX_UNKNOWN; + phydev->pause = 0; + phydev->asym_pause = 0; + + err = genphy_read_lpa(phydev); + if (err < 0) + return err; + + if (phydev->autoneg == AUTONEG_ENABLE && phydev->autoneg_complete) { phy_resolve_aneg_linkmode(phydev); } else if (phydev->autoneg == AUTONEG_DISABLE) { int bmcr = phy_read(phydev, MII_BMCR); diff --git a/include/linux/phy.h b/include/linux/phy.h index a7ecbe0e55aa..7abee820d05c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1076,6 +1076,7 @@ int genphy_config_eee_advert(struct phy_device *phydev); int __genphy_config_aneg(struct phy_device *phydev, bool changed); int genphy_aneg_done(struct phy_device *phydev); int genphy_update_link(struct phy_device *phydev); +int genphy_read_lpa(struct phy_device *phydev); int genphy_read_status(struct phy_device *phydev); int genphy_suspend(struct phy_device *phydev); int genphy_resume(struct phy_device *phydev); -- cgit v1.2.3-59-g8ed1b From 2d880b8709c013d47472f85a9d42ea1aca3bce47 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 4 Oct 2019 17:06:09 +0100 Subject: net: phy: extract pause mode Extract the update of phylib's software pause mode state from genphy_read_status(), so that we can re-use this functionality with PHYs that have alternative ways to read the negotiation results. Tested-by: tinywrkb Reviewed-by: Andrew Lunn Signed-off-by: Russell King Signed-off-by: David S. 
Miller --- drivers/net/phy/phy-core.c | 20 +++++++++++++------- include/linux/phy.h | 1 + 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index 369903d9b6ec..9412669b579c 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -283,6 +283,18 @@ void of_set_phy_eee_broken(struct phy_device *phydev) phydev->eee_broken_modes = broken; } +void phy_resolve_aneg_pause(struct phy_device *phydev) +{ + if (phydev->duplex == DUPLEX_FULL) { + phydev->pause = linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, + phydev->lp_advertising); + phydev->asym_pause = linkmode_test_bit( + ETHTOOL_LINK_MODE_Asym_Pause_BIT, + phydev->lp_advertising); + } +} +EXPORT_SYMBOL_GPL(phy_resolve_aneg_pause); + /** * phy_resolve_aneg_linkmode - resolve the advertisements into phy settings * @phydev: The phy_device struct @@ -305,13 +317,7 @@ void phy_resolve_aneg_linkmode(struct phy_device *phydev) break; } - if (phydev->duplex == DUPLEX_FULL) { - phydev->pause = linkmode_test_bit(ETHTOOL_LINK_MODE_Pause_BIT, - phydev->lp_advertising); - phydev->asym_pause = linkmode_test_bit( - ETHTOOL_LINK_MODE_Asym_Pause_BIT, - phydev->lp_advertising); - } + phy_resolve_aneg_pause(phydev); } EXPORT_SYMBOL_GPL(phy_resolve_aneg_linkmode); diff --git a/include/linux/phy.h b/include/linux/phy.h index 7abee820d05c..9a0e981df502 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -678,6 +678,7 @@ static inline bool phy_is_started(struct phy_device *phydev) return phydev->state >= PHY_UP; } +void phy_resolve_aneg_pause(struct phy_device *phydev); void phy_resolve_aneg_linkmode(struct phy_device *phydev); /** -- cgit v1.2.3-59-g8ed1b From f1da567f1dc1b55d178b8f2d0cfe8353858aac19 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 5 Oct 2019 23:04:47 +0200 Subject: driver core: platform: Add platform_get_irq_byname_optional() Some drivers (e.g dwc3) first try to get an IRQ byname and then fall back to the one at index 0. In this case we do not want the error(s) printed by platform_get_irq_byname(). This commit adds a new platform_get_irq_byname_optional(), which does not print errors, for this. While at it also improve the kdoc text for platform_get_irq_byname() a bit. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=205037 Signed-off-by: Hans de Goede Reviewed-by: Rafael J. 
Wysocki Link: https://lore.kernel.org/r/20191005210449.3926-2-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/platform.c | 46 ++++++++++++++++++++++++++++++++++------- include/linux/platform_device.h | 2 ++ 2 files changed, 41 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/platform.c b/drivers/base/platform.c index b6c6c7d97d5b..b230beb6ccb4 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -241,12 +241,8 @@ struct resource *platform_get_resource_byname(struct platform_device *dev, } EXPORT_SYMBOL_GPL(platform_get_resource_byname); -/** - * platform_get_irq_byname - get an IRQ for a device by name - * @dev: platform device - * @name: IRQ name - */ -int platform_get_irq_byname(struct platform_device *dev, const char *name) +static int __platform_get_irq_byname(struct platform_device *dev, + const char *name) { struct resource *r; @@ -262,11 +258,47 @@ int platform_get_irq_byname(struct platform_device *dev, const char *name) if (r) return r->start; - dev_err(&dev->dev, "IRQ %s not found\n", name); return -ENXIO; } + +/** + * platform_get_irq_byname - get an IRQ for a device by name + * @dev: platform device + * @name: IRQ name + * + * Get an IRQ like platform_get_irq(), but then by name rather then by index. + * + * Return: IRQ number on success, negative error number on failure. + */ +int platform_get_irq_byname(struct platform_device *dev, const char *name) +{ + int ret; + + ret = __platform_get_irq_byname(dev, name); + if (ret < 0 && ret != -EPROBE_DEFER) + dev_err(&dev->dev, "IRQ %s not found\n", name); + + return ret; +} EXPORT_SYMBOL_GPL(platform_get_irq_byname); +/** + * platform_get_irq_byname_optional - get an optional IRQ for a device by name + * @dev: platform device + * @name: IRQ name + * + * Get an optional IRQ by name like platform_get_irq_byname(). Except that it + * does not print an error message if an IRQ can not be obtained. + * + * Return: IRQ number on success, negative error number on failure. + */ +int platform_get_irq_byname_optional(struct platform_device *dev, + const char *name) +{ + return __platform_get_irq_byname(dev, name); +} +EXPORT_SYMBOL_GPL(platform_get_irq_byname_optional); + /** * platform_add_devices - add a numbers of platform devices * @devs: array of platform devices to add diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 1b5cec067533..f2688404d1cd 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -64,6 +64,8 @@ extern struct resource *platform_get_resource_byname(struct platform_device *, unsigned int, const char *); extern int platform_get_irq_byname(struct platform_device *, const char *); +extern int platform_get_irq_byname_optional(struct platform_device *dev, + const char *name); extern int platform_add_devices(struct platform_device **, int); struct platform_device_info { -- cgit v1.2.3-59-g8ed1b From 047d50aee341d940350897c85799e56ae57c3849 Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Wed, 2 Oct 2019 18:59:00 +0200 Subject: efi/tpm: Don't access event->count when it isn't mapped Some machines generate a lot of event log entries. When we're iterating over them, the code removes the old mapping and adds a new one, so once we cross the page boundary we're unmapping the page with the count on it. Hilarity ensues. This patch keeps the info from the header in local variables so we don't need to access that page again or keep track of if it's mapped. 
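For clarity, a condensed sketch of the pattern the fix below applies inside __calc_tpm2_event_size(); the helper name is made up, and the snippet only illustrates latching the header fields while the page backing them is still mapped.

#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/tpm_eventlog.h>

/* Illustrative helper, not part of the kernel: copy the header fields we
 * still need while 'event' is guaranteed to be mapped, so the per-digest
 * map/unmap loop that follows cannot invalidate them. */
static void tpm2_event_snapshot(const struct tcg_pcr_event2_head *event,
				u32 *count, u32 *event_type)
{
	*count = READ_ONCE(event->count);
	*event_type = READ_ONCE(event->event_type);
}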
Tested-by: Lyude Paul Signed-off-by: Peter Jones Signed-off-by: Jarkko Sakkinen Signed-off-by: Ard Biesheuvel Reviewed-by: Jarkko Sakkinen Acked-by: Matthew Garrett Acked-by: Ard Biesheuvel Cc: Ben Dooks Cc: Dave Young Cc: Jerry Snitselaar Cc: Linus Torvalds Cc: Lukas Wunner Cc: Octavian Purdila Cc: Peter Zijlstra Cc: Scott Talbert Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Cc: linux-integrity@vger.kernel.org Cc: stable@vger.kernel.org Fixes: 44038bc514a2 ("tpm: Abstract crypto agile event size calculations") Link: https://lkml.kernel.org/r/20191002165904.8819-4-ard.biesheuvel@linaro.org [ Minor edits. ] Signed-off-by: Ingo Molnar --- include/linux/tpm_eventlog.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h index 63238c84dc0b..b50cc3adca18 100644 --- a/include/linux/tpm_eventlog.h +++ b/include/linux/tpm_eventlog.h @@ -170,6 +170,7 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, u16 halg; int i; int j; + u32 count, event_type; marker = event; marker_start = marker; @@ -190,16 +191,22 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, } event = (struct tcg_pcr_event2_head *)mapping; + /* + * The loop below will unmap these fields if the log is larger than + * one page, so save them here for reference: + */ + count = READ_ONCE(event->count); + event_type = READ_ONCE(event->event_type); efispecid = (struct tcg_efi_specid_event_head *)event_header->event; /* Check if event is malformed. */ - if (event->count > efispecid->num_algs) { + if (count > efispecid->num_algs) { size = 0; goto out; } - for (i = 0; i < event->count; i++) { + for (i = 0; i < count; i++) { halg_size = sizeof(event->digests[i].alg_id); /* Map the digest's algorithm identifier */ @@ -256,8 +263,9 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, + event_field->event_size; size = marker - marker_start; - if ((event->event_type == 0) && (event_field->event_size == 0)) + if (event_type == 0 && event_field->event_size == 0) size = 0; + out: if (do_mapping) TPM_MEMUNMAP(mapping, mapping_size); -- cgit v1.2.3-59-g8ed1b From e658c82be5561412c5e83b5e74e9da4830593f3e Mon Sep 17 00:00:00 2001 From: Jerry Snitselaar Date: Wed, 2 Oct 2019 18:59:02 +0200 Subject: efi/tpm: Only set 'efi_tpm_final_log_size' after successful event log parsing If __calc_tpm2_event_size() fails to parse an event it will return 0, resulting tpm2_calc_event_log_size() returning -1. Currently there is no check of this return value, and 'efi_tpm_final_log_size' can end up being set to this negative value resulting in a crash like this one: BUG: unable to handle page fault for address: ffffbc8fc00866ad #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page RIP: 0010:memcpy_erms+0x6/0x10 Call Trace: tpm_read_log_efi() tpm_bios_log_setup() tpm_chip_register() tpm_tis_core_init.cold.9+0x28c/0x466 tpm_tis_plat_probe() platform_drv_probe() ... Also __calc_tpm2_event_size() returns a size of 0 when it fails to parse an event, so update function documentation to reflect this. The root cause of the issue that caused the failure of event parsing in this case is resolved by Peter Jone's patchset dealing with large event logs where crossing over a page boundary causes the page with the event count to be unmapped. 
Signed-off-by: Jerry Snitselaar Signed-off-by: Ard Biesheuvel Cc: Ben Dooks Cc: Dave Young Cc: Jarkko Sakkinen Cc: Linus Torvalds Cc: Lukas Wunner Cc: Lyude Paul Cc: Matthew Garrett Cc: Octavian Purdila Cc: Peter Jones Cc: Peter Zijlstra Cc: Scott Talbert Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Cc: linux-integrity@vger.kernel.org Cc: stable@vger.kernel.org Fixes: c46f3405692de ("tpm: Reserve the TPM final events table") Link: https://lkml.kernel.org/r/20191002165904.8819-6-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- drivers/firmware/efi/tpm.c | 9 ++++++++- include/linux/tpm_eventlog.h | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/efi/tpm.c b/drivers/firmware/efi/tpm.c index b9ae5c6f9b9c..703469c1ab8e 100644 --- a/drivers/firmware/efi/tpm.c +++ b/drivers/firmware/efi/tpm.c @@ -85,11 +85,18 @@ int __init efi_tpm_eventlog_init(void) final_tbl->nr_events, log_tbl->log); } + + if (tbl_size < 0) { + pr_err(FW_BUG "Failed to parse event in TPM Final Events Log\n"); + goto out_calc; + } + memblock_reserve((unsigned long)final_tbl, tbl_size + sizeof(*final_tbl)); - early_memunmap(final_tbl, sizeof(*final_tbl)); efi_tpm_final_log_size = tbl_size; +out_calc: + early_memunmap(final_tbl, sizeof(*final_tbl)); out: early_memunmap(log_tbl, sizeof(*log_tbl)); return ret; diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h index b50cc3adca18..131ea1bad458 100644 --- a/include/linux/tpm_eventlog.h +++ b/include/linux/tpm_eventlog.h @@ -152,7 +152,7 @@ struct tcg_algorithm_info { * total. Once we've done this we know the offset of the data length field, * and can calculate the total size of the event. * - * Return: size of the event on success, <0 on failure + * Return: size of the event on success, 0 on failure */ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, -- cgit v1.2.3-59-g8ed1b From bf70b0503abd19194dba25fe383d143d0229dc6a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 3 Oct 2019 16:58:21 +0900 Subject: module: swap the order of symbol.namespace Currently, EXPORT_SYMBOL_NS(_GPL) constructs the kernel symbol as follows: __ksymtab_SYMBOL.NAMESPACE The sym_extract_namespace() in modpost allocates memory for the part SYMBOL.NAMESPACE when '.' is contained. One problem is that the pointer returned by strdup() is lost because the symbol name will be copied to malloc'ed memory by alloc_symbol(). No one will keep track of the pointer of strdup'ed memory. sym->namespace still points to the NAMESPACE part. So, you can free it with complicated code like this: free(sym->namespace - strlen(sym->name) - 1); It complicates memory free. To fix it elegantly, I swapped the order of the symbol and the namespace as follows: __ksymtab_NAMESPACE.SYMBOL then, simplified sym_extract_namespace() so that it allocates memory only for the NAMESPACE part. I prefer this order because it is intuitive and also matches to major languages. For example, NAMESPACE::NAME in C++, MODULE.NAME in Python. 
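As an illustration of the resulting symbol names, a small sketch follows; the exported function and the MY_NS namespace are made up.

#include <linux/export.h>

/* Made-up export, only to show the generated __ksymtab entry name. */
int mydrv_do_stuff(void)
{
	return 0;
}
EXPORT_SYMBOL_NS(mydrv_do_stuff, MY_NS);

/*
 * Generated kernel symbol:
 *   before this patch: __ksymtab_mydrv_do_stuff.MY_NS
 *   after this patch:  __ksymtab_MY_NS.mydrv_do_stuff
 * which lets modpost recover the namespace with a single strndup() of the
 * text before the first '.', instead of duplicating the whole string.
 */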
Reviewed-by: Matthias Maennich Signed-off-by: Masahiro Yamada Signed-off-by: Jessica Yu --- include/linux/export.h | 4 ++-- scripts/mod/modpost.c | 16 +++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/export.h b/include/linux/export.h index 95f55b7f83a0..0695d4e847d9 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -52,7 +52,7 @@ extern struct module __this_module; __ADDRESSABLE(sym) \ asm(" .section \"___ksymtab" sec "+" #sym "\", \"a\" \n" \ " .balign 4 \n" \ - "__ksymtab_" #sym NS_SEPARATOR #ns ": \n" \ + "__ksymtab_" #ns NS_SEPARATOR #sym ": \n" \ " .long " #sym "- . \n" \ " .long __kstrtab_" #sym "- . \n" \ " .long __kstrtab_ns_" #sym "- . \n" \ @@ -76,7 +76,7 @@ struct kernel_symbol { #else #define __KSYMTAB_ENTRY_NS(sym, sec, ns) \ static const struct kernel_symbol __ksymtab_##sym##__##ns \ - asm("__ksymtab_" #sym NS_SEPARATOR #ns) \ + asm("__ksymtab_" #ns NS_SEPARATOR #sym) \ __attribute__((section("___ksymtab" sec "+" #sym), used)) \ __aligned(sizeof(void *)) \ = { (unsigned long)&sym, __kstrtab_##sym, __kstrtab_ns_##sym } diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 3961941e8e7a..c4b536ac1327 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -350,18 +350,16 @@ static enum export export_from_sec(struct elf_info *elf, unsigned int sec) static const char *sym_extract_namespace(const char **symname) { - size_t n; - char *dupsymname; + char *namespace = NULL; + char *ns_separator; - n = strcspn(*symname, "."); - if (n < strlen(*symname) - 1) { - dupsymname = NOFAIL(strdup(*symname)); - dupsymname[n] = '\0'; - *symname = dupsymname; - return dupsymname + n + 1; + ns_separator = strchr(*symname, '.'); + if (ns_separator) { + namespace = NOFAIL(strndup(*symname, ns_separator - *symname)); + *symname = ns_separator + 1; } - return NULL; + return namespace; } /** -- cgit v1.2.3-59-g8ed1b From fa6643cdc5cd726b10d30eec45ff8dca267de735 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 3 Oct 2019 16:58:23 +0900 Subject: module: rename __kstrtab_ns_* to __kstrtabns_* to avoid symbol conflict The module namespace produces __strtab_ns_ symbols to store namespace strings, but it does not guarantee the name uniqueness. This is a potential problem because we have exported symbols starting with "ns_". For example, kernel/capability.c exports the following symbols: EXPORT_SYMBOL(ns_capable); EXPORT_SYMBOL(capable); Assume a situation where those are converted as follows: EXPORT_SYMBOL_NS(ns_capable, some_namespace); EXPORT_SYMBOL_NS(capable, some_namespace); The former expands to "__kstrtab_ns_capable" and "__kstrtab_ns_ns_capable", and the latter to "__kstrtab_capable" and "__kstrtab_ns_capable". Then, we have the duplicated "__kstrtab_ns_capable". To ensure the uniqueness, rename "__kstrtab_ns_*" to "__kstrtabns_*". Reviewed-by: Matthias Maennich Signed-off-by: Masahiro Yamada Signed-off-by: Jessica Yu --- include/linux/export.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/export.h b/include/linux/export.h index 0695d4e847d9..621158ecd2e2 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -55,7 +55,7 @@ extern struct module __this_module; "__ksymtab_" #ns NS_SEPARATOR #sym ": \n" \ " .long " #sym "- . \n" \ " .long __kstrtab_" #sym "- . \n" \ - " .long __kstrtab_ns_" #sym "- . \n" \ + " .long __kstrtabns_" #sym "- . 
\n" \ " .previous \n") #define __KSYMTAB_ENTRY(sym, sec) \ @@ -79,7 +79,7 @@ struct kernel_symbol { asm("__ksymtab_" #ns NS_SEPARATOR #sym) \ __attribute__((section("___ksymtab" sec "+" #sym), used)) \ __aligned(sizeof(void *)) \ - = { (unsigned long)&sym, __kstrtab_##sym, __kstrtab_ns_##sym } + = { (unsigned long)&sym, __kstrtab_##sym, __kstrtabns_##sym } #define __KSYMTAB_ENTRY(sym, sec) \ static const struct kernel_symbol __ksymtab_##sym \ @@ -112,7 +112,7 @@ struct kernel_symbol { /* For every exported symbol, place a struct in the __ksymtab section */ #define ___EXPORT_SYMBOL_NS(sym, sec, ns) \ ___export_symbol_common(sym, sec); \ - static const char __kstrtab_ns_##sym[] \ + static const char __kstrtabns_##sym[] \ __attribute__((section("__ksymtab_strings"), used, aligned(1))) \ = #ns; \ __KSYMTAB_ENTRY_NS(sym, sec, ns) -- cgit v1.2.3-59-g8ed1b From c512c69187197fe08026cb5bbe7b9709f4f89b73 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 7 Oct 2019 12:56:48 -0700 Subject: uaccess: implement a proper unsafe_copy_to_user() and switch filldir over to it In commit 9f79b78ef744 ("Convert filldir[64]() from __put_user() to unsafe_put_user()") I made filldir() use unsafe_put_user(), which improves code generation on x86 enormously. But because we didn't have a "unsafe_copy_to_user()", the dirent name copy was also done by hand with unsafe_put_user() in a loop, and it turns out that a lot of other architectures didn't like that, because unlike x86, they have various alignment issues. Most non-x86 architectures trap and fix it up, and some (like xtensa) will just fail unaligned put_user() accesses unconditionally. Which makes that "copy using put_user() in a loop" not work for them at all. I could make that code do explicit alignment etc, but the architectures that don't like unaligned accesses also don't really use the fancy "user_access_begin/end()" model, so they might just use the regular old __copy_to_user() interface. So this commit takes that looping implementation, turns it into the x86 version of "unsafe_copy_to_user()", and makes other architectures implement the unsafe copy version as __copy_to_user() (the same way they do for the other unsafe_xyz() accessor functions). Note that it only does this for the copying _to_ user space, and we still don't have a unsafe version of copy_from_user(). That's partly because we have no current users of it, but also partly because the copy_from_user() case is slightly different and cannot efficiently be implemented in terms of a unsafe_get_user() loop (because gcc can't do asm goto with outputs). It would be trivial to do this using "rep movsb", which would work really nicely on newer x86 cores, but really badly on some older ones. Al Viro is looking at cleaning up all our user copy routines to make this all a non-issue, but for now we have this simple-but-stupid version for x86 that works fine for the dirent name copy case because those names are short strings and we simply don't need anything fancier. 
Fixes: 9f79b78ef744 ("Convert filldir[64]() from __put_user() to unsafe_put_user()") Reported-by: Guenter Roeck Reported-and-tested-by: Tony Luck Cc: Al Viro Cc: Max Filippov Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess.h | 23 ++++++++++++++++++++++ fs/readdir.c | 44 ++---------------------------------------- include/linux/uaccess.h | 6 ++++-- 3 files changed, 29 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 35c225ede0e4..61d93f062a36 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -734,5 +734,28 @@ do { \ if (unlikely(__gu_err)) goto err_label; \ } while (0) +/* + * We want the unsafe accessors to always be inlined and use + * the error labels - thus the macro games. + */ +#define unsafe_copy_loop(dst, src, len, type, label) \ + while (len >= sizeof(type)) { \ + unsafe_put_user(*(type *)src,(type __user *)dst,label); \ + dst += sizeof(type); \ + src += sizeof(type); \ + len -= sizeof(type); \ + } + +#define unsafe_copy_to_user(_dst,_src,_len,label) \ +do { \ + char __user *__ucu_dst = (_dst); \ + const char *__ucu_src = (_src); \ + size_t __ucu_len = (_len); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \ +} while (0) + #endif /* _ASM_X86_UACCESS_H */ diff --git a/fs/readdir.c b/fs/readdir.c index 19bea591c3f1..6e2623e57b2e 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -27,53 +27,13 @@ /* * Note the "unsafe_put_user() semantics: we goto a * label for errors. - * - * Also note how we use a "while()" loop here, even though - * only the biggest size needs to loop. The compiler (well, - * at least gcc) is smart enough to turn the smaller sizes - * into just if-statements, and this way we don't need to - * care whether 'u64' or 'u32' is the biggest size. - */ -#define unsafe_copy_loop(dst, src, len, type, label) \ - while (len >= sizeof(type)) { \ - unsafe_put_user(get_unaligned((type *)src), \ - (type __user *)dst, label); \ - dst += sizeof(type); \ - src += sizeof(type); \ - len -= sizeof(type); \ - } - -/* - * We avoid doing 64-bit copies on 32-bit architectures. They - * might be better, but the component names are mostly small, - * and the 64-bit cases can end up being much more complex and - * put much more register pressure on the code, so it's likely - * not worth the pain of unaligned accesses etc. - * - * So limit the copies to "unsigned long" size. I did verify - * that at least the x86-32 case is ok without this limiting, - * but I worry about random other legacy 32-bit cases that - * might not do as well. - */ -#define unsafe_copy_type(dst, src, len, type, label) do { \ - if (sizeof(type) <= sizeof(unsigned long)) \ - unsafe_copy_loop(dst, src, len, type, label); \ -} while (0) - -/* - * Copy the dirent name to user space, and NUL-terminate - * it. This should not be a function call, since we're doing - * the copy inside a "user_access_begin/end()" section. 
*/ #define unsafe_copy_dirent_name(_dst, _src, _len, label) do { \ char __user *dst = (_dst); \ const char *src = (_src); \ size_t len = (_len); \ - unsafe_copy_type(dst, src, len, u64, label); \ - unsafe_copy_type(dst, src, len, u32, label); \ - unsafe_copy_type(dst, src, len, u16, label); \ - unsafe_copy_type(dst, src, len, u8, label); \ - unsafe_put_user(0, dst, label); \ + unsafe_put_user(0, dst+len, label); \ + unsafe_copy_to_user(dst, src, len, label); \ } while (0) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index e47d0522a1f4..d4ee6e942562 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -355,8 +355,10 @@ extern long strnlen_unsafe_user(const void __user *unsafe_addr, long count); #ifndef user_access_begin #define user_access_begin(ptr,len) access_ok(ptr, len) #define user_access_end() do { } while (0) -#define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0) -#define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0) +#define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0) +#define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e) +#define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e) +#define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e) static inline unsigned long user_access_save(void) { return 0UL; } static inline void user_access_restore(unsigned long flags) { } #endif -- cgit v1.2.3-59-g8ed1b From 08d1d0e6d0a00c6e687201774f3bf61177741e80 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Sun, 6 Oct 2019 17:58:15 -0700 Subject: memcg: only record foreign writebacks with dirty pages when memcg is not disabled In kdump kernel, memcg usually is disabled with 'cgroup_disable=memory' for saving memory. Now kdump kernel will always panic when dump vmcore to local disk: BUG: kernel NULL pointer dereference, address: 0000000000000ab8 Oops: 0000 [#1] SMP NOPTI CPU: 0 PID: 598 Comm: makedumpfile Not tainted 5.3.0+ #26 Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 10/02/2018 RIP: 0010:mem_cgroup_track_foreign_dirty_slowpath+0x38/0x140 Call Trace: __set_page_dirty+0x52/0xc0 iomap_set_page_dirty+0x50/0x90 iomap_write_end+0x6e/0x270 iomap_write_actor+0xce/0x170 iomap_apply+0xba/0x11e iomap_file_buffered_write+0x62/0x90 xfs_file_buffered_aio_write+0xca/0x320 [xfs] new_sync_write+0x12d/0x1d0 vfs_write+0xa5/0x1a0 ksys_write+0x59/0xd0 do_syscall_64+0x59/0x1e0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 And this will corrupt the 1st kernel too with 'cgroup_disable=memory'. Via the trace and with debugging, it is pointing to commit 97b27821b485 ("writeback, memcg: Implement foreign dirty flushing") which introduced this regression. Disabling memcg causes the null pointer dereference at uninitialized data in function mem_cgroup_track_foreign_dirty_slowpath(). Fix it by returning directly if memcg is disabled, but not trying to record the foreign writebacks with dirty pages. 
Link: http://lkml.kernel.org/r/20190924141928.GD31919@MiWiFi-R3L-srv Fixes: 97b27821b485 ("writeback, memcg: Implement foreign dirty flushing") Signed-off-by: Baoquan He Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Jan Kara Cc: Tejun Heo Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9b60863429cc..98380779f6d5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1264,6 +1264,9 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, static inline void mem_cgroup_track_foreign_dirty(struct page *page, struct bdi_writeback *wb) { + if (mem_cgroup_disabled()) + return; + if (unlikely(&page->mem_cgroup->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(page, wb); } -- cgit v1.2.3-59-g8ed1b From 9783aa9917f8ae24759e67bf882f1aba32fe4ea1 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Sun, 6 Oct 2019 17:58:32 -0700 Subject: mm, memcg: proportional memory.{low,min} reclaim cgroup v2 introduces two memory protection thresholds: memory.low (best-effort) and memory.min (hard protection). While they generally do what they say on the tin, there is a limitation in their implementation that makes them difficult to use effectively: that cliff behaviour often manifests when they become eligible for reclaim. This patch implements more intuitive and usable behaviour, where we gradually mount more reclaim pressure as cgroups further and further exceed their protection thresholds. This cliff edge behaviour happens because we only choose whether or not to reclaim based on whether the memcg is within its protection limits (see the use of mem_cgroup_protected in shrink_node), but we don't vary our reclaim behaviour based on this information. Imagine the following timeline, with the numbers the lruvec size in this zone: 1. memory.low=1000000, memory.current=999999. 0 pages may be scanned. 2. memory.low=1000000, memory.current=1000000. 0 pages may be scanned. 3. memory.low=1000000, memory.current=1000001. 1000001* pages may be scanned. (?!) * Of course, we won't usually scan all available pages in the zone even without this patch because of scan control priority, over-reclaim protection, etc. However, as shown by the tests at the end, these techniques don't sufficiently throttle such an extreme change in input, so cliff-like behaviour isn't really averted by their existence alone. Here's an example of how this plays out in practice. At Facebook, we are trying to protect various workloads from "system" software, like configuration management tools, metric collectors, etc (see this[0] case study). In order to find a suitable memory.low value, we start by determining the expected memory range within which the workload will be comfortable operating. This isn't an exact science -- memory usage deemed "comfortable" will vary over time due to user behaviour, differences in composition of work, etc, etc. As such we need to ballpark memory.low, but doing this is currently problematic: 1. If we end up setting it too low for the workload, it won't have *any* effect (see discussion above). The group will receive the full weight of reclaim and won't have any priority while competing with the less important system software, as if we had no memory.low configured at all. 2. 
Because of this behaviour, we end up erring on the side of setting it too high, such that the comfort range is reliably covered. However, protected memory is completely unavailable to the rest of the system, so we might cause undue memory and IO pressure there when we *know* we have some elasticity in the workload. 3. Even if we get the value totally right, smack in the middle of the comfort zone, we get extreme jumps between no pressure and full pressure that cause unpredictable pressure spikes in the workload due to the current binary reclaim behaviour. With this patch, we can set it to our ballpark estimation without too much worry. Any undesirable behaviour, such as too much or too little reclaim pressure on the workload or system will be proportional to how far our estimation is off. This means we can set memory.low much more conservatively and thus waste less resources *without* the risk of the workload falling off a cliff if we overshoot. As a more abstract technical description, this unintuitive behaviour results in having to give high-priority workloads a large protection buffer on top of their expected usage to function reliably, as otherwise we have abrupt periods of dramatically increased memory pressure which hamper performance. Having to set these thresholds so high wastes resources and generally works against the principle of work conservation. In addition, having proportional memory reclaim behaviour has other benefits. Most notably, before this patch it's basically mandatory to set memory.low to a higher than desirable value because otherwise as soon as you exceed memory.low, all protection is lost, and all pages are eligible to scan again. By contrast, having a gradual ramp in reclaim pressure means that you now still get some protection when thresholds are exceeded, which means that one can now be more comfortable setting memory.low to lower values without worrying that all protection will be lost. This is important because workingset size is really hard to know exactly, especially with variable workloads, so at least getting *some* protection if your workingset size grows larger than you expect increases user confidence in setting memory.low without a huge buffer on top being needed. Thanks a lot to Johannes Weiner and Tejun Heo for their advice and assistance in thinking about how to make this work better. In testing these changes, I intended to verify that: 1. Changes in page scanning become gradual and proportional instead of binary. To test this, I experimented stepping further and further down memory.low protection on a workload that floats around 19G workingset when under memory.low protection, watching page scan rates for the workload cgroup: +------------+-----------------+--------------------+--------------+ | memory.low | test (pgscan/s) | control (pgscan/s) | % of control | +------------+-----------------+--------------------+--------------+ | 21G | 0 | 0 | N/A | | 17G | 867 | 3799 | 23% | | 12G | 1203 | 3543 | 34% | | 8G | 2534 | 3979 | 64% | | 4G | 3980 | 4147 | 96% | | 0 | 3799 | 3980 | 95% | +------------+-----------------+--------------------+--------------+ As you can see, the test kernel (with a kernel containing this patch) ramps up page scanning significantly more gradually than the control kernel (without this patch). 2. More gradual ramp up in reclaim aggression doesn't result in premature OOMs. 
To test this, I wrote a script that slowly increments the number of pages held by stress(1)'s --vm-keep mode until a production system entered severe overall memory contention. This script runs in a highly protected slice taking up the majority of available system memory. Watching vmstat revealed that page scanning continued essentially nominally between test and control, without causing forward reclaim progress to become arrested. [0]: https://facebookmicrosites.github.io/cgroup2/docs/overview.html#case-study-the-fbtax2-project [akpm@linux-foundation.org: reflow block comments to fit in 80 cols] [chris@chrisdown.name: handle cgroup_disable=memory when getting memcg protection] Link: http://lkml.kernel.org/r/20190201045711.GA18302@chrisdown.name Link: http://lkml.kernel.org/r/20190124014455.GA6396@chrisdown.name Signed-off-by: Chris Down Acked-by: Johannes Weiner Reviewed-by: Roman Gushchin Cc: Michal Hocko Cc: Tejun Heo Cc: Dennis Zhou Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 20 +++++--- include/linux/memcontrol.h | 20 ++++++++ mm/memcontrol.c | 5 ++ mm/vmscan.c | 82 ++++++++++++++++++++++++++++++--- 4 files changed, 115 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 0fa8c0e615c2..5361ebec3361 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -615,8 +615,8 @@ on an IO device and is an example of this type. Protections ----------- -A cgroup is protected to be allocated upto the configured amount of -the resource if the usages of all its ancestors are under their +A cgroup is protected upto the configured amount of the resource +as long as the usages of all its ancestors are under their protected levels. Protections can be hard guarantees or best effort soft boundaries. Protections can also be over-committed in which case only upto the amount available to the parent is protected among @@ -1096,7 +1096,10 @@ PAGE_SIZE multiple when read back. is within its effective min boundary, the cgroup's memory won't be reclaimed under any conditions. If there is no unprotected reclaimable memory available, OOM killer - is invoked. + is invoked. Above the effective min boundary (or + effective low boundary if it is higher), pages are reclaimed + proportionally to the overage, reducing reclaim pressure for + smaller overages. Effective min boundary is limited by memory.min values of all ancestor cgroups. If there is memory.min overcommitment @@ -1118,7 +1121,10 @@ PAGE_SIZE multiple when read back. Best-effort memory protection. If the memory usage of a cgroup is within its effective low boundary, the cgroup's memory won't be reclaimed unless memory can be reclaimed - from unprotected cgroups. + from unprotected cgroups. Above the effective low boundary (or + effective min boundary if it is higher), pages are reclaimed + proportionally to the overage, reducing reclaim pressure for + smaller overages. Effective low boundary is limited by memory.low values of all ancestor cgroups. If there is memory.low overcommitment @@ -2482,8 +2488,10 @@ system performance due to overreclaim, to the point where the feature becomes self-defeating. The memory.low boundary on the other hand is a top-down allocated -reserve. A cgroup enjoys reclaim protection when it's within its low, -which makes delegation of subtrees possible. +reserve. 
A cgroup enjoys reclaim protection when it's within its +effective low, which makes delegation of subtrees possible. It also +enjoys having reclaim pressure proportional to its overage when +above its effective low. The original high boundary, the hard limit, is defined as a strict limit that can not budge, even if the OOM killer has to be called. diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 98380779f6d5..fa9ba2edf7e0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -356,6 +356,14 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg) +{ + if (mem_cgroup_disabled()) + return 0; + + return max(READ_ONCE(memcg->memory.emin), READ_ONCE(memcg->memory.elow)); +} + enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, struct mem_cgroup *memcg); @@ -537,6 +545,8 @@ void mem_cgroup_handle_over_high(void); unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); +unsigned long mem_cgroup_size(struct mem_cgroup *memcg); + void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p); @@ -829,6 +839,11 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, { } +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg) +{ + return 0; +} + static inline enum mem_cgroup_protection mem_cgroup_protected( struct mem_cgroup *root, struct mem_cgroup *memcg) { @@ -968,6 +983,11 @@ static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) return 0; } +static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg) +{ + return 0; +} + static inline void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c313c49074ca..bdac56009a38 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1567,6 +1567,11 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) return max; } +unsigned long mem_cgroup_size(struct mem_cgroup *memcg) +{ + return page_counter_read(&memcg->memory); +} + static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { diff --git a/mm/vmscan.c b/mm/vmscan.c index e5d52d6a24af..dfefa1d99d1b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2459,17 +2459,80 @@ out: *lru_pages = 0; for_each_evictable_lru(lru) { int file = is_file_lru(lru); - unsigned long size; + unsigned long lruvec_size; unsigned long scan; + unsigned long protection; + + lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); + protection = mem_cgroup_protection(memcg); + + if (protection > 0) { + /* + * Scale a cgroup's reclaim pressure by proportioning + * its current usage to its memory.low or memory.min + * setting. + * + * This is important, as otherwise scanning aggression + * becomes extremely binary -- from nothing as we + * approach the memory protection threshold, to totally + * nominal as we exceed it. This results in requiring + * setting extremely liberal protection thresholds. It + * also means we simply get no protection at all if we + * set it too low, which is not ideal. + */ + unsigned long cgroup_size = mem_cgroup_size(memcg); + unsigned long baseline = 0; + + /* + * During the reclaim first pass, we only consider + * cgroups in excess of their protection setting, but if + * that doesn't produce free pages, we come back for a + * second pass where we reclaim from all groups. 
+ * + * To maintain fairness in both cases, the first pass + * targets groups in proportion to their overage, and + * the second pass targets groups in proportion to their + * protection utilization. + * + * So on the first pass, a group whose size is 130% of + * its protection will be targeted at 30% of its size. + * On the second pass, a group whose size is at 40% of + * its protection will be + * targeted at 40% of its size. + */ + if (!sc->memcg_low_reclaim) + baseline = lruvec_size; + scan = lruvec_size * cgroup_size / protection - baseline; + + /* + * Don't allow the scan target to exceed the lruvec + * size, which otherwise could happen if we have >200% + * overage in the normal case, or >100% overage when + * sc->memcg_low_reclaim is set. + * + * This is important because other cgroups without + * memory.low have their scan target initially set to + * their lruvec size, so allowing values >100% of the + * lruvec size here could result in penalising cgroups + * with memory.low set even *more* than their peers in + * some cases in the case of large overages. + * + * Also, minimally target SWAP_CLUSTER_MAX pages to keep + * reclaim moving forwards. + */ + scan = clamp(scan, SWAP_CLUSTER_MAX, lruvec_size); + } else { + scan = lruvec_size; + } + + scan >>= sc->priority; - size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - scan = size >> sc->priority; /* * If the cgroup's already been deleted, make sure to * scrape out the remaining cache. */ if (!scan && !mem_cgroup_online(memcg)) - scan = min(size, SWAP_CLUSTER_MAX); + scan = min(lruvec_size, SWAP_CLUSTER_MAX); switch (scan_balance) { case SCAN_EQUAL: @@ -2489,7 +2552,7 @@ out: case SCAN_ANON: /* Scan one type exclusively */ if ((scan_balance == SCAN_FILE) != file) { - size = 0; + lruvec_size = 0; scan = 0; } break; @@ -2498,7 +2561,7 @@ out: BUG(); } - *lru_pages += size; + *lru_pages += lruvec_size; nr[lru] = scan; } } @@ -2742,6 +2805,13 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) memcg_memory_event(memcg, MEMCG_LOW); break; case MEMCG_PROT_NONE: + /* + * All protection thresholds breached. We may + * still choose to vary the scan pressure + * applied based on by how much the cgroup in + * question has exceeded its protection + * thresholds (see get_scan_count). + */ break; } -- cgit v1.2.3-59-g8ed1b From 9de7ca46ad2688bd51e80f7119fefa301ad7f3fa Mon Sep 17 00:00:00 2001 From: Chris Down Date: Sun, 6 Oct 2019 17:58:35 -0700 Subject: mm, memcg: make memory.emin the baseline for utilisation determination Roman points out that when when we do the low reclaim pass, we scale the reclaim pressure relative to position between 0 and the maximum protection threshold. However, if the maximum protection is based on memory.elow, and memory.emin is above zero, this means we still may get binary behaviour on second-pass low reclaim. This is because we scale starting at 0, not starting at memory.emin, and since we don't scan at all below emin, we end up with cliff behaviour. This should be a fairly uncommon case since usually we don't go into the second pass, but it makes sense to scale our low reclaim pressure starting at emin. You can test this by catting two large sparse files, one in a cgroup with emin set to some moderate size compared to physical RAM, and another cgroup without any emin. In both cgroups, set an elow larger than 50% of physical RAM. The one with emin will have less page scanning, as reclaim pressure is lower. 
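To make the scaling concrete, here is an editor-added worked example with invented numbers, following the formula introduced in the vmscan.c hunk below:

    /*
     * Illustrative numbers only:
     *   emin = 4G, elow = 8G, current usage = 6G, lruvec_size = 100000 pages
     *
     * Second-pass (sc->memcg_low_reclaim) scan target:
     *   scan = lruvec_size * (usage - emin) / (elow - emin)
     *        = 100000 * (6G - 4G) / (8G - 4G)
     *        = 50000 pages
     *
     * Pressure now ramps from roughly zero at emin up to the full lruvec
     * size at elow, instead of being scaled against a baseline of zero,
     * which is what produced the cliff at emin.
     */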
Rebase on top of and apply the same idea as what was applied to handle cgroup_memory=disable properly for the original proportional patch http://lkml.kernel.org/r/20190201045711.GA18302@chrisdown.name ("mm, memcg: Handle cgroup_disable=memory when getting memcg protection"). Link: http://lkml.kernel.org/r/20190201051810.GA18895@chrisdown.name Signed-off-by: Chris Down Suggested-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Dennis Zhou Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 19 +++++++++++----- mm/vmscan.c | 55 +++++++++++++++++++++++++++------------------- 2 files changed, 46 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fa9ba2edf7e0..1cbad1248e5a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -356,12 +356,17 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg) +static inline void mem_cgroup_protection(struct mem_cgroup *memcg, + unsigned long *min, unsigned long *low) { - if (mem_cgroup_disabled()) - return 0; + if (mem_cgroup_disabled()) { + *min = 0; + *low = 0; + return; + } - return max(READ_ONCE(memcg->memory.emin), READ_ONCE(memcg->memory.elow)); + *min = READ_ONCE(memcg->memory.emin); + *low = READ_ONCE(memcg->memory.elow); } enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, @@ -839,9 +844,11 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, { } -static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg) +static inline void mem_cgroup_protection(struct mem_cgroup *memcg, + unsigned long *min, unsigned long *low) { - return 0; + *min = 0; + *low = 0; } static inline enum mem_cgroup_protection mem_cgroup_protected( diff --git a/mm/vmscan.c b/mm/vmscan.c index dfefa1d99d1b..70347d626fb3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2461,12 +2461,12 @@ out: int file = is_file_lru(lru); unsigned long lruvec_size; unsigned long scan; - unsigned long protection; + unsigned long min, low; lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - protection = mem_cgroup_protection(memcg); + mem_cgroup_protection(memcg, &min, &low); - if (protection > 0) { + if (min || low) { /* * Scale a cgroup's reclaim pressure by proportioning * its current usage to its memory.low or memory.min @@ -2481,28 +2481,38 @@ out: * set it too low, which is not ideal. */ unsigned long cgroup_size = mem_cgroup_size(memcg); - unsigned long baseline = 0; /* - * During the reclaim first pass, we only consider - * cgroups in excess of their protection setting, but if - * that doesn't produce free pages, we come back for a - * second pass where we reclaim from all groups. + * If there is any protection in place, we adjust scan + * pressure in proportion to how much a group's current + * usage exceeds that, in percent. * - * To maintain fairness in both cases, the first pass - * targets groups in proportion to their overage, and - * the second pass targets groups in proportion to their - * protection utilization. - * - * So on the first pass, a group whose size is 130% of - * its protection will be targeted at 30% of its size. - * On the second pass, a group whose size is at 40% of - * its protection will be - * targeted at 40% of its size. 
+ * There is one special case: in the first reclaim pass, + * we skip over all groups that are within their low + * protection. If that fails to reclaim enough pages to + * satisfy the reclaim goal, we come back and override + * the best-effort low protection. However, we still + * ideally want to honor how well-behaved groups are in + * that case instead of simply punishing them all + * equally. As such, we reclaim them based on how much + * of their best-effort protection they are using. Usage + * below memory.min is excluded from consideration when + * calculating utilisation, as it isn't ever + * reclaimable, so it might as well not exist for our + * purposes. */ - if (!sc->memcg_low_reclaim) - baseline = lruvec_size; - scan = lruvec_size * cgroup_size / protection - baseline; + if (sc->memcg_low_reclaim && low > min) { + /* + * Reclaim according to utilisation between min + * and low + */ + scan = lruvec_size * (cgroup_size - min) / + (low - min); + } else { + /* Reclaim according to protection overage */ + scan = lruvec_size * cgroup_size / + max(min, low) - lruvec_size; + } /* * Don't allow the scan target to exceed the lruvec @@ -2518,7 +2528,8 @@ out: * some cases in the case of large overages. * * Also, minimally target SWAP_CLUSTER_MAX pages to keep - * reclaim moving forwards. + * reclaim moving forwards, avoiding decremeting + * sc->priority further than desirable. */ scan = clamp(scan, SWAP_CLUSTER_MAX, lruvec_size); } else { -- cgit v1.2.3-59-g8ed1b From 1bc63fb1272be0773e925f78c0fbd06c89701d55 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Sun, 6 Oct 2019 17:58:38 -0700 Subject: mm, memcg: make scan aggression always exclude protection This patch is an incremental improvement on the existing memory.{low,min} relative reclaim work to base its scan pressure calculations on how much protection is available compared to the current usage, rather than how much the current usage is over some protection threshold. This change doesn't change the experience for the user in the normal case too much. One benefit is that it replaces the (somewhat arbitrary) 100% cutoff with an indefinite slope, which makes it easier to ballpark a memory.low value. As well as this, the old methodology doesn't quite apply generically to machines with varying amounts of physical memory. Let's say we have a top level cgroup, workload.slice, and another top level cgroup, system-management.slice. We want to roughly give 12G to system-management.slice, so on a 32GB machine we set memory.low to 20GB in workload.slice, and on a 64GB machine we set memory.low to 52GB. However, because these are relative amounts to the total machine size, while the amount of memory we want to generally be willing to yield to system.slice is absolute (12G), we end up putting more pressure on system.slice just because we have a larger machine and a larger workload to fill it, which seems fairly unintuitive. With this new behaviour, we don't end up with this unintended side effect. Previously the way that memory.low protection works is that if you are 50% over a certain baseline, you get 50% of your normal scan pressure. This is certainly better than the previous cliff-edge behaviour, but it can be improved even further by always considering memory under the currently enforced protection threshold to be out of bounds. 
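For concreteness, an editor-added worked example with invented numbers, using the new calculation in the vmscan.c hunk below:

    /*
     * Illustrative numbers only:
     *   protection = 10G (emin during low reclaim, max(emin, elow) otherwise)
     *   current usage (cgroup_size) = 15G, lruvec_size = 120000 pages
     *
     *   scan = lruvec_size - lruvec_size * protection / cgroup_size
     *        = 120000 - 120000 * 10G / 15G
     *        = 40000 pages
     *
     * The protected 10G is excluded from the calculation entirely, so the
     * scan target tracks the unprotected third of the usage rather than
     * how far the usage sits above some threshold.
     */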
This means that we can set relatively low memory.low thresholds for variable or bursty workloads while still getting a reasonable level of protection, whereas with the previous version we may still trivially hit the 100% clamp. The previous 100% clamp is also somewhat arbitrary, whereas this one is more concretely based on the currently enforced protection threshold, which is likely easier to reason about. There is also a subtle issue with the way that proportional reclaim worked previously -- it promotes having no memory.low, since it makes pressure higher during low reclaim. This happens because we base our scan pressure modulation on how far memory.current is between memory.min and memory.low, but if memory.low is unset, we only use the overage method. In most cromulent configurations, this then means that we end up with *more* pressure than with no memory.low at all when we're in low reclaim, which is not really very usable or expected. With this patch, memory.low and memory.min affect reclaim pressure in a more understandable and composable way. For example, from a user standpoint, "protected" memory now remains untouchable from a reclaim aggression standpoint, and users can also have more confidence that bursty workloads will still receive some amount of guaranteed protection. Link: http://lkml.kernel.org/r/20190322160307.GA3316@chrisdown.name Signed-off-by: Chris Down Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Dennis Zhou Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 25 +++++++++---------- mm/vmscan.c | 61 +++++++++++++++------------------------------- 2 files changed, 32 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1cbad1248e5a..ae703ea3ef48 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -356,17 +356,17 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -static inline void mem_cgroup_protection(struct mem_cgroup *memcg, - unsigned long *min, unsigned long *low) +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg, + bool in_low_reclaim) { - if (mem_cgroup_disabled()) { - *min = 0; - *low = 0; - return; - } + if (mem_cgroup_disabled()) + return 0; + + if (in_low_reclaim) + return READ_ONCE(memcg->memory.emin); - *min = READ_ONCE(memcg->memory.emin); - *low = READ_ONCE(memcg->memory.elow); + return max(READ_ONCE(memcg->memory.emin), + READ_ONCE(memcg->memory.elow)); } enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, @@ -844,11 +844,10 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, { } -static inline void mem_cgroup_protection(struct mem_cgroup *memcg, - unsigned long *min, unsigned long *low) +static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg, + bool in_low_reclaim) { - *min = 0; - *low = 0; + return 0; } static inline enum mem_cgroup_protection mem_cgroup_protected( diff --git a/mm/vmscan.c b/mm/vmscan.c index 70347d626fb3..c6659bb758a4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2461,12 +2461,13 @@ out: int file = is_file_lru(lru); unsigned long lruvec_size; unsigned long scan; - unsigned long min, low; + unsigned long protection; lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - mem_cgroup_protection(memcg, &min, &low); + protection = mem_cgroup_protection(memcg, + sc->memcg_low_reclaim); - if 
(min || low) { + if (protection) { /* * Scale a cgroup's reclaim pressure by proportioning * its current usage to its memory.low or memory.min @@ -2479,13 +2480,10 @@ out: * setting extremely liberal protection thresholds. It * also means we simply get no protection at all if we * set it too low, which is not ideal. - */ - unsigned long cgroup_size = mem_cgroup_size(memcg); - - /* - * If there is any protection in place, we adjust scan - * pressure in proportion to how much a group's current - * usage exceeds that, in percent. + * + * If there is any protection in place, we reduce scan + * pressure by how much of the total memory used is + * within protection thresholds. * * There is one special case: in the first reclaim pass, * we skip over all groups that are within their low @@ -2495,43 +2493,24 @@ out: * ideally want to honor how well-behaved groups are in * that case instead of simply punishing them all * equally. As such, we reclaim them based on how much - * of their best-effort protection they are using. Usage - * below memory.min is excluded from consideration when - * calculating utilisation, as it isn't ever - * reclaimable, so it might as well not exist for our - * purposes. + * memory they are using, reducing the scan pressure + * again by how much of the total memory used is under + * hard protection. */ - if (sc->memcg_low_reclaim && low > min) { - /* - * Reclaim according to utilisation between min - * and low - */ - scan = lruvec_size * (cgroup_size - min) / - (low - min); - } else { - /* Reclaim according to protection overage */ - scan = lruvec_size * cgroup_size / - max(min, low) - lruvec_size; - } + unsigned long cgroup_size = mem_cgroup_size(memcg); + + /* Avoid TOCTOU with earlier protection check */ + cgroup_size = max(cgroup_size, protection); + + scan = lruvec_size - lruvec_size * protection / + cgroup_size; /* - * Don't allow the scan target to exceed the lruvec - * size, which otherwise could happen if we have >200% - * overage in the normal case, or >100% overage when - * sc->memcg_low_reclaim is set. - * - * This is important because other cgroups without - * memory.low have their scan target initially set to - * their lruvec size, so allowing values >100% of the - * lruvec size here could result in penalising cgroups - * with memory.low set even *more* than their peers in - * some cases in the case of large overages. - * - * Also, minimally target SWAP_CLUSTER_MAX pages to keep + * Minimally target SWAP_CLUSTER_MAX pages to keep * reclaim moving forwards, avoiding decremeting * sc->priority further than desirable. */ - scan = clamp(scan, SWAP_CLUSTER_MAX, lruvec_size); + scan = max(scan, SWAP_CLUSTER_MAX); } else { scan = lruvec_size; } -- cgit v1.2.3-59-g8ed1b From 59bb47985c1db229ccff8c5deebecd54fc77d2a9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sun, 6 Oct 2019 17:58:45 -0700 Subject: mm, sl[aou]b: guarantee natural alignment for kmalloc(power-of-two) In most configurations, kmalloc() happens to return naturally aligned (i.e. aligned to the block size itself) blocks for power of two sizes. That means some kmalloc() users might unknowingly rely on that alignment, until stuff breaks when the kernel is built with e.g. CONFIG_SLUB_DEBUG or CONFIG_SLOB, and blocks stop being aligned. Then developers have to devise workaround such as own kmem caches with specified alignment [1], which is not always practical, as recently evidenced in [2]. The topic has been discussed at LSF/MM 2019 [3]. 
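An editor-added example of the kind of implicit reliance this patch turns into a guarantee (hypothetical code, not taken from the reports above):

    /* Hypothetical caller that needs a 512-byte buffer at a 512-byte-aligned address. */
    static void *alloc_sector_buffer(gfp_t gfp)
    {
    	/*
    	 * 512 is a power of two, so with this patch the returned pointer
    	 * is guaranteed to be 512-byte aligned on SLAB, SLUB and SLOB,
    	 * even with redzoning/debug options enabled. Previously this
    	 * alignment merely happened to hold in most configurations.
    	 */
    	return kmalloc(512, gfp);
    }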
Adding a 'kmalloc_aligned()' variant would not help with code unknowingly relying on the implicit alignment. For slab implementations it would either require creating more kmalloc caches, or allocate a larger size and only give back part of it. That would be wasteful, especially with a generic alignment parameter (in contrast with a fixed alignment to size). Ideally we should provide to mm users what they need without difficult workarounds or own reimplementations, so let's make the kmalloc() alignment to size explicitly guaranteed for power-of-two sizes under all configurations. What this means for the three available allocators? * SLAB object layout happens to be mostly unchanged by the patch. The implicitly provided alignment could be compromised with CONFIG_DEBUG_SLAB due to redzoning, however SLAB disables redzoning for caches with alignment larger than unsigned long long. Practically on at least x86 this includes kmalloc caches as they use cache line alignment, which is larger than that. Still, this patch ensures alignment on all arches and cache sizes. * SLUB layout is also unchanged unless redzoning is enabled through CONFIG_SLUB_DEBUG and boot parameter for the particular kmalloc cache. With this patch, explicit alignment is guaranteed with redzoning as well. This will result in more memory being wasted, but that should be acceptable in a debugging scenario. * SLOB has no implicit alignment so this patch adds it explicitly for kmalloc(). The potential downside is increased fragmentation. While pathological allocation scenarios are certainly possible, in my testing, after booting a x86_64 kernel+userspace with virtme, around 16MB memory was consumed by slab pages both before and after the patch, with difference in the noise. [1] https://lore.kernel.org/linux-btrfs/c3157c8e8e0e7588312b40c853f65c02fe6c957a.1566399731.git.christophe.leroy@c-s.fr/ [2] https://lore.kernel.org/linux-fsdevel/20190225040904.5557-1-ming.lei@redhat.com/ [3] https://lwn.net/Articles/787740/ [akpm@linux-foundation.org: documentation fixlet, per Matthew] Link: http://lkml.kernel.org/r/20190826111627.7505-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Michal Hocko Acked-by: Kirill A. Shutemov Acked-by: Christoph Hellwig Cc: David Sterba Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Ming Lei Cc: Dave Chinner Cc: "Darrick J . Wong" Cc: Christoph Hellwig Cc: James Bottomley Cc: Vlastimil Babka Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/memory-allocation.rst | 4 +++ include/linux/slab.h | 4 +++ mm/slab_common.c | 11 +++++++- mm/slob.c | 42 ++++++++++++++++++++-------- 4 files changed, 49 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/Documentation/core-api/memory-allocation.rst b/Documentation/core-api/memory-allocation.rst index 7744aa3bf2e0..939e3dfc86e9 100644 --- a/Documentation/core-api/memory-allocation.rst +++ b/Documentation/core-api/memory-allocation.rst @@ -98,6 +98,10 @@ limited. The actual limit depends on the hardware and the kernel configuration, but it is a good practice to use `kmalloc` for objects smaller than page size. +The address of a chunk allocated with `kmalloc` is aligned to at least +ARCH_KMALLOC_MINALIGN bytes. For sizes which are a power of two, the +alignment is also guaranteed to be at least the respective size. + For large allocations you can use :c:func:`vmalloc` and :c:func:`vzalloc`, or directly request pages from the page allocator. 
The memory allocated by `vmalloc` and related functions is diff --git a/include/linux/slab.h b/include/linux/slab.h index ab2b98ad76e1..4d2a2fa55ed5 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -493,6 +493,10 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) * kmalloc is the normal method of allocating memory * for objects smaller than page size in the kernel. * + * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN + * bytes. For @size of power of two bytes, the alignment is also guaranteed + * to be at least to the size. + * * The @flags argument may be one of the GFP flags defined at * include/linux/gfp.h and described at * :ref:`Documentation/core-api/mm-api.rst ` diff --git a/mm/slab_common.c b/mm/slab_common.c index 0a94cf858aa4..c29f03adca91 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1030,10 +1030,19 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, unsigned int useroffset, unsigned int usersize) { int err; + unsigned int align = ARCH_KMALLOC_MINALIGN; s->name = name; s->size = s->object_size = size; - s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); + + /* + * For power of two sizes, guarantee natural alignment for kmalloc + * caches, regardless of SL*B debugging options. + */ + if (is_power_of_2(size)) + align = max(align, size); + s->align = calculate_alignment(flags, align, size); + s->useroffset = useroffset; s->usersize = usersize; diff --git a/mm/slob.c b/mm/slob.c index 835088d55645..fa53e9f73893 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -224,6 +224,7 @@ static void slob_free_pages(void *b, int order) * @sp: Page to look in. * @size: Size of the allocation. * @align: Allocation alignment. + * @align_offset: Offset in the allocated block that will be aligned. * @page_removed_from_list: Return parameter. * * Tries to find a chunk of memory at least @size bytes big within @page. @@ -234,7 +235,7 @@ static void slob_free_pages(void *b, int order) * true (set to false otherwise). */ static void *slob_page_alloc(struct page *sp, size_t size, int align, - bool *page_removed_from_list) + int align_offset, bool *page_removed_from_list) { slob_t *prev, *cur, *aligned = NULL; int delta = 0, units = SLOB_UNITS(size); @@ -243,8 +244,17 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align, for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) { slobidx_t avail = slob_units(cur); + /* + * 'aligned' will hold the address of the slob block so that the + * address 'aligned'+'align_offset' is aligned according to the + * 'align' parameter. This is for kmalloc() which prepends the + * allocated block with its size, so that the block itself is + * aligned when needed. + */ if (align) { - aligned = (slob_t *)ALIGN((unsigned long)cur, align); + aligned = (slob_t *) + (ALIGN((unsigned long)cur + align_offset, align) + - align_offset); delta = aligned - cur; } if (avail >= units + delta) { /* room enough? */ @@ -288,7 +298,8 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align, /* * slob_alloc: entry point into the slob allocator. 
*/ -static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) +static void *slob_alloc(size_t size, gfp_t gfp, int align, int node, + int align_offset) { struct page *sp; struct list_head *slob_list; @@ -319,7 +330,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) if (sp->units < SLOB_UNITS(size)) continue; - b = slob_page_alloc(sp, size, align, &page_removed_from_list); + b = slob_page_alloc(sp, size, align, align_offset, &page_removed_from_list); if (!b) continue; @@ -356,7 +367,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) INIT_LIST_HEAD(&sp->slab_list); set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); set_slob_page_free(sp, slob_list); - b = slob_page_alloc(sp, size, align, &_unused); + b = slob_page_alloc(sp, size, align, align_offset, &_unused); BUG_ON(!b); spin_unlock_irqrestore(&slob_lock, flags); } @@ -458,7 +469,7 @@ static __always_inline void * __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) { unsigned int *m; - int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + int minalign = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); void *ret; gfp &= gfp_allowed_mask; @@ -466,19 +477,28 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) fs_reclaim_acquire(gfp); fs_reclaim_release(gfp); - if (size < PAGE_SIZE - align) { + if (size < PAGE_SIZE - minalign) { + int align = minalign; + + /* + * For power of two sizes, guarantee natural alignment for + * kmalloc()'d objects. + */ + if (is_power_of_2(size)) + align = max(minalign, (int) size); + if (!size) return ZERO_SIZE_PTR; - m = slob_alloc(size + align, gfp, align, node); + m = slob_alloc(size + minalign, gfp, align, node, minalign); if (!m) return NULL; *m = size; - ret = (void *)m + align; + ret = (void *)m + minalign; trace_kmalloc_node(caller, ret, - size, size + align, gfp, node); + size, size + minalign, gfp, node); } else { unsigned int order = get_order(size); @@ -579,7 +599,7 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) fs_reclaim_release(flags); if (c->size < PAGE_SIZE) { - b = slob_alloc(c->size, flags, c->align, node); + b = slob_alloc(c->size, flags, c->align, node, 0); trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, SLOB_UNITS(c->size) * SLOB_UNIT, flags, node); -- cgit v1.2.3-59-g8ed1b From bec500777089b3c96c53681fc0aa6fee59711d4a Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Mon, 7 Oct 2019 18:00:02 -0400 Subject: lib/string: Make memzero_explicit() inline instead of external With the use of the barrier implied by barrier_data(), there is no need for memzero_explicit() to be extern. Making it inline saves the overhead of a function call, and allows the code to be reused in arch/*/purgatory without having to duplicate the implementation. Tested-by: Hans de Goede Signed-off-by: Arvind Sankar Reviewed-by: Hans de Goede Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: H . 
Peter Anvin Cc: Herbert Xu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephan Mueller Cc: Thomas Gleixner Cc: linux-crypto@vger.kernel.org Cc: linux-s390@vger.kernel.org Fixes: 906a4bb97f5d ("crypto: sha256 - Use get/put_unaligned_be32 to get input, memzero_explicit") Link: https://lkml.kernel.org/r/20191007220000.GA408752@rani.riverdale.lan Signed-off-by: Ingo Molnar --- include/linux/string.h | 21 ++++++++++++++++++++- lib/string.c | 21 --------------------- 2 files changed, 20 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index b2f9df7f0761..b6ccdc2c7f02 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -227,7 +227,26 @@ static inline bool strstarts(const char *str, const char *prefix) } size_t memweight(const void *ptr, size_t bytes); -void memzero_explicit(void *s, size_t count); + +/** + * memzero_explicit - Fill a region of memory (e.g. sensitive + * keying data) with 0s. + * @s: Pointer to the start of the area. + * @count: The size of the area. + * + * Note: usually using memset() is just fine (!), but in cases + * where clearing out _local_ data at the end of a scope is + * necessary, memzero_explicit() should be used instead in + * order to prevent the compiler from optimising away zeroing. + * + * memzero_explicit() doesn't need an arch-specific version as + * it just invokes the one of memset() implicitly. + */ +static inline void memzero_explicit(void *s, size_t count) +{ + memset(s, 0, count); + barrier_data(s); +} /** * kbasename - return the last part of a pathname. diff --git a/lib/string.c b/lib/string.c index cd7a10c19210..08ec58cc673b 100644 --- a/lib/string.c +++ b/lib/string.c @@ -748,27 +748,6 @@ void *memset(void *s, int c, size_t count) EXPORT_SYMBOL(memset); #endif -/** - * memzero_explicit - Fill a region of memory (e.g. sensitive - * keying data) with 0s. - * @s: Pointer to the start of the area. - * @count: The size of the area. - * - * Note: usually using memset() is just fine (!), but in cases - * where clearing out _local_ data at the end of a scope is - * necessary, memzero_explicit() should be used instead in - * order to prevent the compiler from optimising away zeroing. - * - * memzero_explicit() doesn't need an arch-specific version as - * it just invokes the one of memset() implicitly. - */ -void memzero_explicit(void *s, size_t count) -{ - memset(s, 0, count); - barrier_data(s); -} -EXPORT_SYMBOL(memzero_explicit); - #ifndef __HAVE_ARCH_MEMSET16 /** * memset16() - Fill a memory area with a uint16_t -- cgit v1.2.3-59-g8ed1b From e3f1271474182682638654021123b94e6ec1626b Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 2 Oct 2019 07:40:42 -0500 Subject: leds: core: Fix leds.h structure documentation Update the leds.h structure documentation to define the correct arguments. Signed-off-by: Dan Murphy Signed-off-by: Jacek Anaszewski --- include/linux/leds.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index b8df71193329..efb309dba914 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -247,7 +247,7 @@ extern void led_set_brightness(struct led_classdev *led_cdev, /** * led_set_brightness_sync - set LED brightness synchronously * @led_cdev: the LED to set - * @brightness: the brightness to set it to + * @value: the brightness to set it to * * Set an LED's brightness immediately. 
This function will block * the caller for the time required for accessing device registers, @@ -301,8 +301,7 @@ extern void led_sysfs_enable(struct led_classdev *led_cdev); /** * led_compose_name - compose LED class device name * @dev: LED controller device object - * @child: child fwnode_handle describing a LED or a group of synchronized LEDs; - * it must be provided only for fwnode based LEDs + * @init_data: the LED class device initialization data * @led_classdev_name: composed LED class device name * * Create LED class device name basing on the provided init_data argument. -- cgit v1.2.3-59-g8ed1b From af84537dbd1b39505d1f3d8023029b4a59666513 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 2 Oct 2019 10:40:55 -0400 Subject: SUNRPC: fix race to sk_err after xs_error_report Since commit 4f8943f80883 ("SUNRPC: Replace direct task wakeups from softirq context") there has been a race to the value of the sk_err if both XPRT_SOCK_WAKE_ERROR and XPRT_SOCK_WAKE_DISCONNECT are set. In that case, we may end up losing the sk_err value that existed when xs_error_report was called. Fix this by reverting to the previous behavior: instead of using SO_ERROR to retrieve the value at a later time (which might also return sk_err_soft), copy the sk_err value onto struct sock_xprt, and use that value to wake pending tasks. Signed-off-by: Benjamin Coddington Fixes: 4f8943f80883 ("SUNRPC: Replace direct task wakeups from softirq context") Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprtsock.h | 1 + net/sunrpc/xprtsock.c | 17 ++++++++--------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h index 7638dbe7bc50..a940de03808d 100644 --- a/include/linux/sunrpc/xprtsock.h +++ b/include/linux/sunrpc/xprtsock.h @@ -61,6 +61,7 @@ struct sock_xprt { struct mutex recv_mutex; struct sockaddr_storage srcaddr; unsigned short srcport; + int xprt_err; /* * UDP socket buffer size parameters diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9ac88722fa83..70e52f567b2a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1249,19 +1249,21 @@ static void xs_error_report(struct sock *sk) { struct sock_xprt *transport; struct rpc_xprt *xprt; - int err; read_lock_bh(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) goto out; transport = container_of(xprt, struct sock_xprt, xprt); - err = -sk->sk_err; - if (err == 0) + transport->xprt_err = -sk->sk_err; + if (transport->xprt_err == 0) goto out; dprintk("RPC: xs_error_report client %p, error=%d...\n", - xprt, -err); - trace_rpc_socket_error(xprt, sk->sk_socket, err); + xprt, -transport->xprt_err); + trace_rpc_socket_error(xprt, sk->sk_socket, transport->xprt_err); + + /* barrier ensures xprt_err is set before XPRT_SOCK_WAKE_ERROR */ + smp_mb__before_atomic(); xs_run_error_worker(transport, XPRT_SOCK_WAKE_ERROR); out: read_unlock_bh(&sk->sk_callback_lock); @@ -2476,7 +2478,6 @@ static void xs_wake_write(struct sock_xprt *transport) static void xs_wake_error(struct sock_xprt *transport) { int sockerr; - int sockerr_len = sizeof(sockerr); if (!test_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state)) return; @@ -2485,9 +2486,7 @@ static void xs_wake_error(struct sock_xprt *transport) goto out; if (!test_and_clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state)) goto out; - if (kernel_getsockopt(transport->sock, SOL_SOCKET, SO_ERROR, - (char *)&sockerr, &sockerr_len) != 0) - goto out; + sockerr = 
xchg(&transport->xprt_err, 0); if (sockerr < 0) xprt_wake_pending_tasks(&transport->xprt, sockerr); out: -- cgit v1.2.3-59-g8ed1b From 294f69e662d1570703e9b56e95be37a9fd3afba5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 5 Oct 2019 09:46:42 -0700 Subject: compiler_attributes.h: Add 'fallthrough' pseudo keyword for switch/case use Reserve the pseudo keyword 'fallthrough' for the ability to convert the various case block /* fallthrough */ style comments to appear to be an actual reserved word with the same gcc case block missing fallthrough warning capability. All switch/case blocks now should end in one of: break; fallthrough; goto