From 6899b32b5b2dee358936b82b8363b716607a138f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 23 Apr 2018 18:09:21 +0100 Subject: bpf: disable and restore preemption in __BPF_PROG_RUN_ARRAY Running bpf programs requires disabled preemption, however at least some* of the BPF_PROG_RUN_ARRAY users do not follow this rule. To fix this bug, and also to make it not happen in the future, let's add explicit preemption disabling/re-enabling to the __BPF_PROG_RUN_ARRAY code. * for example: [ 17.624472] RIP: 0010:__cgroup_bpf_run_filter_sk+0x1c4/0x1d0 ... [ 17.640890] inet6_create+0x3eb/0x520 [ 17.641405] __sock_create+0x242/0x340 [ 17.641939] __sys_socket+0x57/0xe0 [ 17.642370] ? trace_hardirqs_off_thunk+0x1a/0x1c [ 17.642944] SyS_socket+0xa/0x10 [ 17.643357] do_syscall_64+0x79/0x220 [ 17.643879] entry_SYSCALL_64_after_hwframe+0x42/0xb7 Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 486e65e3db26..dc586cc64bc2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -351,6 +351,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, struct bpf_prog **_prog, *__prog; \ struct bpf_prog_array *_array; \ u32 _ret = 1; \ + preempt_disable(); \ rcu_read_lock(); \ _array = rcu_dereference(array); \ if (unlikely(check_non_null && !_array))\ @@ -362,6 +363,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, } \ _out: \ rcu_read_unlock(); \ + preempt_enable_no_resched(); \ _ret; \ }) -- cgit v1.2.3 From ba6b8de423f8d0dee48d6030288ed81c03ddf9f0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 23 Apr 2018 15:39:23 -0700 Subject: bpf: sockmap, map_release does not hold refcnt for pinned maps Relying on map_release hook to decrement the reference counts when a map is removed only works if the map is not being 
pinned. In the pinned case the ref is decremented immediately and the BPF programs are released. After this, the BPF programs may no longer be in use, which is not what the user would expect. This patch moves the release logic into bpf_map_put_uref() and brings sockmap in line with how a similar case is handled in prog array maps. Fixes: 3d9e952697de ("bpf: sockmap, fix leaking maps with attached but not detached progs") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dc586cc64bc2..469b20e1dd7e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -31,6 +31,7 @@ struct bpf_map_ops { void (*map_release)(struct bpf_map *map, struct file *map_file); void (*map_free)(struct bpf_map *map); int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key); + void (*map_release_uref)(struct bpf_map *map); /* funcs callable from userspace and from eBPF programs */ void *(*map_lookup_elem)(struct bpf_map *map, void *key); @@ -436,7 +437,6 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); -void bpf_fd_array_map_clear(struct bpf_map *map); int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); -- cgit v1.2.3 From 6082d9c9c94a408d7409b5f2e4e42ac9e8b16d0d Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Thu, 12 Apr 2018 09:49:11 +0000 Subject: net/mlx5: Fix mlx5_get_vector_affinity function Adding the vector offset when calling mlx5_vector2eqn() is wrong. 
This is because mlx5_vector2eqn() checks whether the EQ index is equal to the vector number, and the internal completion vectors that mlx5 allocates don't get an EQ index. The second problem here is that using effective_affinity_mask gives the same CPU for different vectors. This leads to unmapped queues when calling it from blk_mq_rdma_map_queues(). This doesn't happen when using the affinity_hint mask. Fixes: 2572cf57d75a ("mlx5: fix mlx5_get_vector_affinity to start from completion vector 0") Fixes: 05e0cc84e00c ("net/mlx5: Fix get vector affinity helper function") Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg --- include/linux/mlx5/driver.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 767d193c269a..2a156c5dfadd 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1284,25 +1284,19 @@ enum { }; static inline const struct cpumask * -mlx5_get_vector_affinity(struct mlx5_core_dev *dev, int vector) +mlx5_get_vector_affinity_hint(struct mlx5_core_dev *dev, int vector) { - const struct cpumask *mask; struct irq_desc *desc; unsigned int irq; int eqn; int err; - err = mlx5_vector2eqn(dev, MLX5_EQ_VEC_COMP_BASE + vector, &eqn, &irq); + err = mlx5_vector2eqn(dev, vector, &eqn, &irq); if (err) return NULL; desc = irq_to_desc(irq); -#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK - mask = irq_data_get_effective_affinity_mask(&desc->irq_data); -#else - mask = desc->irq_common_data.affinity; -#endif - return mask; + return desc->affinity_hint; } #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From de08481a253ac658433a8304a303ce9f018d71e5 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 27 Apr 2018 19:02:05 +0300 Subject: vhost: make msg padding explicit There's a 32-bit hole just after type. It's best to give it a name; this way the compiler is forced to initialize it with the rest of the structure. 
Reported-by: Kevin Easton Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/uapi/linux/vhost.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index c51f8e5cc608..5a8ad064445b 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -68,6 +68,7 @@ struct vhost_iotlb_msg { struct vhost_msg { int type; + int padding0; union { struct vhost_iotlb_msg iotlb; __u8 padding[64]; -- cgit v1.2.3 From edd7ceb78296fb1574958991b6655c3c2cedf124 Mon Sep 17 00:00:00 2001 From: Thomas Winter Date: Tue, 1 May 2018 09:15:29 +1200 Subject: ipv6: Allow non-gateway ECMP for IPv6 It is valid to have static routes where the nexthop is an interface rather than an address, such as for tunnels. For IPv4 it was possible to use ECMP on these routes, but not for IPv6. Signed-off-by: Thomas Winter Cc: David Ahern Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Acked-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/ip6_route.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 08b132381984..abceb5864d99 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -68,8 +68,7 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt) { - return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == - RTF_GATEWAY; + return (rt->rt6i_flags & (RTF_ADDRCONF | RTF_DYNAMIC)) == 0; } void ip6_route_input(struct sk_buff *skb); -- cgit v1.2.3 From c212d2c7fc4736d49be102fb7a1a545cdc2f1fea Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Tue, 1 May 2018 13:05:39 -0700 Subject: net/tls: Don't recursively call push_record during tls_write_space callbacks It is reported that in some cases, write_space may be called in do_tcp_sendpages, such that we recursively invoke do_tcp_sendpages again: [ 660.468802] ? do_tcp_sendpages+0x8d/0x580 [ 660.468826] ? tls_push_sg+0x74/0x130 [tls] [ 660.468852] ? tls_push_record+0x24a/0x390 [tls] [ 660.468880] ? tls_write_space+0x6a/0x80 [tls] ... tls_push_sg already does a loop over all sending sg's, so ignore any tls_write_space notifications until we are done sending. We then have to call the previous write_space to wake up poll() waiters after we are done with the send loop. Reported-by: Andre Tomt Signed-off-by: Dave Watson Signed-off-by: David S. 
Miller --- include/net/tls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 3da8e13a6d96..b400d0bb7448 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -148,6 +148,7 @@ struct tls_context { struct scatterlist *partially_sent_record; u16 partially_sent_offset; unsigned long flags; + bool in_tcp_sendpages; u16 pending_open_record_frags; int (*push_pending_record)(struct sock *sk, int flags); -- cgit v1.2.3 From c818aa88d2d0cfc4938bfa9e226c0792af2dc45f Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 2 May 2018 17:19:05 +0300 Subject: Revert "vhost: make msg padding explicit" This reverts commit 93c0d549c4c5a7382ad70de6b86610b7aae57406. Unfortunately the padding will break 32 bit userspace. Ouch. Need to add some compat code, revert for now. Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/uapi/linux/vhost.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 5a8ad064445b..c51f8e5cc608 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -68,7 +68,6 @@ struct vhost_iotlb_msg { struct vhost_msg { int type; - int padding0; union { struct vhost_iotlb_msg iotlb; __u8 padding[64]; -- cgit v1.2.3 From 30ca22e4a5d0063dd9a9cdf35cd139c5807cbeb3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 2 May 2018 22:41:56 +0300 Subject: ipv6: Revert "ipv6: Allow non-gateway ECMP for IPv6" This reverts commit edd7ceb78296 ("ipv6: Allow non-gateway ECMP for IPv6"). Eric reported a division by zero in rt6_multipath_rebalance() which is caused by above commit that considers identical local routes to be siblings. The division by zero happens because a nexthop weight is not set for local routes. Revert the commit as it does not fix a bug and has side effects. 
To reproduce: # ip -6 address add 2001:db8::1/64 dev dummy0 # ip -6 address add 2001:db8::1/64 dev dummy1 Fixes: edd7ceb78296 ("ipv6: Allow non-gateway ECMP for IPv6") Signed-off-by: Ido Schimmel Reported-by: Eric Dumazet Tested-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_route.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index abceb5864d99..08b132381984 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -68,7 +68,8 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt) { - return (rt->rt6i_flags & (RTF_ADDRCONF | RTF_DYNAMIC)) == 0; + return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == + RTF_GATEWAY; } void ip6_route_input(struct sk_buff *skb); -- cgit v1.2.3