From 6899b32b5b2dee358936b82b8363b716607a138f Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Mon, 23 Apr 2018 18:09:21 +0100
Subject: bpf: disable and restore preemption in __BPF_PROG_RUN_ARRAY

Running bpf programs requires disabled preemption,
however at least some* of the BPF_PROG_RUN_ARRAY users
do not follow this rule.

To fix this bug, and also to make it not happen in the future,
let's add explicit preemption disabling/re-enabling
to the __BPF_PROG_RUN_ARRAY code.

* for example:
 [   17.624472] RIP: 0010:__cgroup_bpf_run_filter_sk+0x1c4/0x1d0
 ...
 [   17.640890]  inet6_create+0x3eb/0x520
 [   17.641405]  __sock_create+0x242/0x340
 [   17.641939]  __sys_socket+0x57/0xe0
 [   17.642370]  ? trace_hardirqs_off_thunk+0x1a/0x1c
 [   17.642944]  SyS_socket+0xa/0x10
 [   17.643357]  do_syscall_64+0x79/0x220
 [   17.643879]  entry_SYSCALL_64_after_hwframe+0x42/0xb7

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 486e65e3db26..dc586cc64bc2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -351,6 +351,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 		struct bpf_prog **_prog, *__prog;	\
 		struct bpf_prog_array *_array;		\
 		u32 _ret = 1;				\
+		preempt_disable();			\
 		rcu_read_lock();			\
 		_array = rcu_dereference(array);	\
 		if (unlikely(check_non_null && !_array))\
@@ -362,6 +363,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 		}					\
 _out:							\
 		rcu_read_unlock();			\
+		preempt_enable_no_resched();		\
 		_ret;					\
 	 })
 
-- 
cgit v1.2.3


From ba6b8de423f8d0dee48d6030288ed81c03ddf9f0 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 23 Apr 2018 15:39:23 -0700
Subject: bpf: sockmap, map_release does not hold refcnt for pinned maps

Relying on map_release hook to decrement the reference counts when a
map is removed only works if the map is not being pinned. In the
pinned case the ref is decremented immediately and the BPF programs
released. After this BPF programs may not be in-use which is not
what the user would expect.

This patch moves the release logic into bpf_map_put_uref() and brings
sockmap in-line with how a similar case is handled in prog array maps.

Fixes: 3d9e952697de ("bpf: sockmap, fix leaking maps with attached but not detached progs")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index dc586cc64bc2..469b20e1dd7e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -31,6 +31,7 @@ struct bpf_map_ops {
 	void (*map_release)(struct bpf_map *map, struct file *map_file);
 	void (*map_free)(struct bpf_map *map);
 	int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
+	void (*map_release_uref)(struct bpf_map *map);
 
 	/* funcs callable from userspace and from eBPF programs */
 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
@@ -436,7 +437,6 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value);
 int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
 				 void *key, void *value, u64 map_flags);
 int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value);
-void bpf_fd_array_map_clear(struct bpf_map *map);
 int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
 				void *key, void *value, u64 map_flags);
 int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value);
-- 
cgit v1.2.3


From 6082d9c9c94a408d7409b5f2e4e42ac9e8b16d0d Mon Sep 17 00:00:00 2001
From: Israel Rukshin <israelr@mellanox.com>
Date: Thu, 12 Apr 2018 09:49:11 +0000
Subject: net/mlx5: Fix mlx5_get_vector_affinity function

Adding the vector offset when calling to mlx5_vector2eqn() is wrong.
This is because mlx5_vector2eqn() checks if EQ index is equal to vector number
and the fact that the internal completion vectors that mlx5 allocates
don't get an EQ index.

The second problem here is that using effective_affinity_mask gives the same
CPU for different vectors.
This leads to unmapped queues when calling it from blk_mq_rdma_map_queues().
This doesn't happen when using affinity_hint mask.

Fixes: 2572cf57d75a ("mlx5: fix mlx5_get_vector_affinity to start from completion vector 0")
Fixes: 05e0cc84e00c ("net/mlx5: Fix get vector affinity helper function")
Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
---
 include/linux/mlx5/driver.h | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 767d193c269a..2a156c5dfadd 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1284,25 +1284,19 @@ enum {
 };
 
 static inline const struct cpumask *
-mlx5_get_vector_affinity(struct mlx5_core_dev *dev, int vector)
+mlx5_get_vector_affinity_hint(struct mlx5_core_dev *dev, int vector)
 {
-	const struct cpumask *mask;
 	struct irq_desc *desc;
 	unsigned int irq;
 	int eqn;
 	int err;
 
-	err = mlx5_vector2eqn(dev, MLX5_EQ_VEC_COMP_BASE + vector, &eqn, &irq);
+	err = mlx5_vector2eqn(dev, vector, &eqn, &irq);
 	if (err)
 		return NULL;
 
 	desc = irq_to_desc(irq);
-#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
-	mask = irq_data_get_effective_affinity_mask(&desc->irq_data);
-#else
-	mask = desc->irq_common_data.affinity;
-#endif
-	return mask;
+	return desc->affinity_hint;
 }
 
 #endif /* MLX5_DRIVER_H */
-- 
cgit v1.2.3


From de08481a253ac658433a8304a303ce9f018d71e5 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 27 Apr 2018 19:02:05 +0300
Subject: vhost: make msg padding explicit

There's a 32 bit hole just after type. It's best to
give it a name, this way compiler is forced to initialize
it with rest of the structure.

Reported-by: Kevin Easton <kevin@guarana.org>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/vhost.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index c51f8e5cc608..5a8ad064445b 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -68,6 +68,7 @@ struct vhost_iotlb_msg {
 
 struct vhost_msg {
 	int type;
+	int padding0;
 	union {
 		struct vhost_iotlb_msg iotlb;
 		__u8 padding[64];
-- 
cgit v1.2.3


From edd7ceb78296fb1574958991b6655c3c2cedf124 Mon Sep 17 00:00:00 2001
From: Thomas Winter <Thomas.Winter@alliedtelesis.co.nz>
Date: Tue, 1 May 2018 09:15:29 +1200
Subject: ipv6: Allow non-gateway ECMP for IPv6

It is valid to have static routes where the nexthop
is an interface not an address such as tunnels.
For IPv4 it was possible to use ECMP on these routes
but not for IPv6.

Signed-off-by: Thomas Winter <Thomas.Winter@alliedtelesis.co.nz>
Cc: David Ahern <dsahern@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 08b132381984..abceb5864d99 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -68,8 +68,7 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 
 static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt)
 {
-	return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
-	       RTF_GATEWAY;
+	return (rt->rt6i_flags & (RTF_ADDRCONF | RTF_DYNAMIC)) == 0;
 }
 
 void ip6_route_input(struct sk_buff *skb);
-- 
cgit v1.2.3


From c212d2c7fc4736d49be102fb7a1a545cdc2f1fea Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Tue, 1 May 2018 13:05:39 -0700
Subject: net/tls: Don't recursively call push_record during tls_write_space
 callbacks

It is reported that in some cases, write_space may be called in
do_tcp_sendpages, such that we recursively invoke do_tcp_sendpages again:

[  660.468802]  ? do_tcp_sendpages+0x8d/0x580
[  660.468826]  ? tls_push_sg+0x74/0x130 [tls]
[  660.468852]  ? tls_push_record+0x24a/0x390 [tls]
[  660.468880]  ? tls_write_space+0x6a/0x80 [tls]
...

tls_push_sg already does a loop over all sending sg's, so ignore
any tls_write_space notifications until we are done sending.
We then have to call the previous write_space to wake up
poll() waiters after we are done with the send loop.

Reported-by: Andre Tomt <andre@tomt.net>
Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 3da8e13a6d96..b400d0bb7448 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -148,6 +148,7 @@ struct tls_context {
 	struct scatterlist *partially_sent_record;
 	u16 partially_sent_offset;
 	unsigned long flags;
+	bool in_tcp_sendpages;
 
 	u16 pending_open_record_frags;
 	int (*push_pending_record)(struct sock *sk, int flags);
-- 
cgit v1.2.3


From c818aa88d2d0cfc4938bfa9e226c0792af2dc45f Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Wed, 2 May 2018 17:19:05 +0300
Subject: Revert "vhost: make msg padding explicit"

This reverts commit 93c0d549c4c5a7382ad70de6b86610b7aae57406.

Unfortunately the padding will break 32 bit userspace.
Ouch. Need to add some compat code, revert for now.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/vhost.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index 5a8ad064445b..c51f8e5cc608 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -68,7 +68,6 @@ struct vhost_iotlb_msg {
 
 struct vhost_msg {
 	int type;
-	int padding0;
 	union {
 		struct vhost_iotlb_msg iotlb;
 		__u8 padding[64];
-- 
cgit v1.2.3


From 30ca22e4a5d0063dd9a9cdf35cd139c5807cbeb3 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Wed, 2 May 2018 22:41:56 +0300
Subject: ipv6: Revert "ipv6: Allow non-gateway ECMP for IPv6"

This reverts commit edd7ceb78296 ("ipv6: Allow non-gateway ECMP for
IPv6").

Eric reported a division by zero in rt6_multipath_rebalance() which is
caused by above commit that considers identical local routes to be
siblings. The division by zero happens because a nexthop weight is not
set for local routes.

Revert the commit as it does not fix a bug and has side effects.

To reproduce:

# ip -6 address add 2001:db8::1/64 dev dummy0
# ip -6 address add 2001:db8::1/64 dev dummy1

Fixes: edd7ceb78296 ("ipv6: Allow non-gateway ECMP for IPv6")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Tested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index abceb5864d99..08b132381984 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -68,7 +68,8 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 
 static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt)
 {
-	return (rt->rt6i_flags & (RTF_ADDRCONF | RTF_DYNAMIC)) == 0;
+	return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
+	       RTF_GATEWAY;
 }
 
 void ip6_route_input(struct sk_buff *skb);
-- 
cgit v1.2.3