From becf2319f320cae43e20cf179cc51a355a0deb5f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 25 Mar 2019 23:11:53 +0100 Subject: selftests: netfilter: check icmp pkttoobig errors are set as related When an icmp error such as pkttoobig is received, conntrack checks if the "inner" header (header of packet that did not fit link mtu) is matches an existing connection, and, if so, sets that packet as being related to the conntrack entry it found. It was recently reported that this "related" setting also works if the inner header is from another, different connection (i.e., artificial/forged icmp error). Add a test, followup patch will add additional "inner dst matches outer dst in reverse direction" check before setting related state. Link: https://www.synacktiv.com/posts/systems/icmp-reachable.html Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/Makefile | 2 +- .../selftests/netfilter/conntrack_icmp_related.sh | 283 +++++++++++++++++++++ 2 files changed, 284 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/netfilter/conntrack_icmp_related.sh diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile index c9ff2b47bd1c..a37cb1192c6a 100644 --- a/tools/testing/selftests/netfilter/Makefile +++ b/tools/testing/selftests/netfilter/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for netfilter selftests -TEST_PROGS := nft_trans_stress.sh nft_nat.sh +TEST_PROGS := nft_trans_stress.sh nft_nat.sh conntrack_icmp_related.sh include ../lib.mk diff --git a/tools/testing/selftests/netfilter/conntrack_icmp_related.sh b/tools/testing/selftests/netfilter/conntrack_icmp_related.sh new file mode 100755 index 000000000000..b48e1833bc89 --- /dev/null +++ b/tools/testing/selftests/netfilter/conntrack_icmp_related.sh @@ -0,0 +1,283 @@ +#!/bin/bash +# +# check that ICMP df-needed/pkttoobig icmp are set are set as related +# state +# +# Setup is: +# +# nsclient1 -> nsrouter1 -> nsrouter2 -> nsclient2 +# MTU 1500, except for nsrouter2 <-> nsclient2 link (1280). +# ping nsclient2 from nsclient1, checking that conntrack did set RELATED +# 'fragmentation needed' icmp packet. +# +# In addition, nsrouter1 will perform IP masquerading, i.e. also +# check the icmp errors are propagated to the correct host as per +# nat of "established" icmp-echo "connection". + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 +ret=0 + +nft --version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without nft tool" + exit $ksft_skip +fi + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +cleanup() { + for i in 1 2;do ip netns del nsclient$i;done + for i in 1 2;do ip netns del nsrouter$i;done +} + +ipv4() { + echo -n 192.168.$1.2 +} + +ipv6 () { + echo -n dead:$1::2 +} + +check_counter() +{ + ns=$1 + name=$2 + expect=$3 + local lret=0 + + cnt=$(ip netns exec $ns nft list counter inet filter "$name" | grep -q "$expect") + if [ $? -ne 0 ]; then + echo "ERROR: counter $name in $ns has unexpected value (expected $expect)" 1>&2 + ip netns exec $ns nft list counter inet filter "$name" 1>&2 + lret=1 + fi + + return $lret +} + +check_unknown() +{ + expect="packets 0 bytes 0" + for n in nsclient1 nsclient2 nsrouter1 nsrouter2; do + check_counter $n "unknown" "$expect" + if [ $? -ne 0 ] ;then + return 1 + fi + done + + return 0 +} + +for n in nsclient1 nsclient2 nsrouter1 nsrouter2; do + ip netns add $n + ip -net $n link set lo up +done + +DEV=veth0 +ip link add $DEV netns nsclient1 type veth peer name eth1 netns nsrouter1 +DEV=veth0 +ip link add $DEV netns nsclient2 type veth peer name eth1 netns nsrouter2 + +DEV=veth0 +ip link add $DEV netns nsrouter1 type veth peer name eth2 netns nsrouter2 + +DEV=veth0 +for i in 1 2; do + ip -net nsclient$i link set $DEV up + ip -net nsclient$i addr add $(ipv4 $i)/24 dev $DEV + ip -net nsclient$i addr add $(ipv6 $i)/64 dev $DEV +done + +ip -net nsrouter1 link set eth1 up +ip -net nsrouter1 link set veth0 up + +ip -net nsrouter2 link set eth1 up +ip -net nsrouter2 link set eth2 up + +ip -net nsclient1 route add default via 192.168.1.1 +ip -net nsclient1 -6 route add default via dead:1::1 + +ip -net nsclient2 route add default via 192.168.2.1 +ip -net nsclient2 route add default via dead:2::1 + +i=3 +ip -net nsrouter1 addr add 192.168.1.1/24 dev eth1 +ip -net nsrouter1 addr add 192.168.3.1/24 dev veth0 +ip -net nsrouter1 addr add dead:1::1/64 dev eth1 +ip -net nsrouter1 addr add dead:3::1/64 dev veth0 +ip -net nsrouter1 route add default via 192.168.3.10 +ip -net nsrouter1 -6 route add default via dead:3::10 + +ip -net nsrouter2 addr add 192.168.2.1/24 dev eth1 +ip -net nsrouter2 addr add 192.168.3.10/24 dev eth2 +ip -net nsrouter2 addr add dead:2::1/64 dev eth1 +ip -net nsrouter2 addr add dead:3::10/64 dev eth2 +ip -net nsrouter2 route add default via 192.168.3.1 +ip -net nsrouter2 route add default via dead:3::1 + +sleep 2 +for i in 4 6; do + ip netns exec nsrouter1 sysctl -q net.ipv$i.conf.all.forwarding=1 + ip netns exec nsrouter2 sysctl -q net.ipv$i.conf.all.forwarding=1 +done + +for netns in nsrouter1 nsrouter2; do +ip netns exec $netns nft -f - </dev/null +if [ $? -ne 0 ]; then + echo "ERROR: netns ip routing/connectivity broken" 1>&2 + cleanup + exit 1 +fi +ip netns exec nsclient1 ping6 -q -c 1 -s 1000 dead:2::2 >/dev/null +if [ $? -ne 0 ]; then + echo "ERROR: netns ipv6 routing/connectivity broken" 1>&2 + cleanup + exit 1 +fi + +check_unknown +if [ $? -ne 0 ]; then + ret=1 +fi + +expect="packets 0 bytes 0" +for netns in nsrouter1 nsrouter2 nsclient1;do + check_counter "$netns" "related" "$expect" + if [ $? -ne 0 ]; then + ret=1 + fi +done + +expect="packets 2 bytes 2076" +check_counter nsclient2 "new" "$expect" +if [ $? -ne 0 ]; then + ret=1 +fi + +ip netns exec nsclient1 ping -q -c 1 -s 1300 -M do 192.168.2.2 > /dev/null +if [ $? -eq 0 ]; then + echo "ERROR: ping should have failed with PMTU too big error" 1>&2 + ret=1 +fi + +# nsrouter2 should have generated the icmp error, so +# related counter should be 0 (its in forward). +expect="packets 0 bytes 0" +check_counter "nsrouter2" "related" "$expect" +if [ $? -ne 0 ]; then + ret=1 +fi + +# but nsrouter1 should have seen it, same for nsclient1. +expect="packets 1 bytes 576" +for netns in nsrouter1 nsclient1;do + check_counter "$netns" "related" "$expect" + if [ $? -ne 0 ]; then + ret=1 + fi +done + +ip netns exec nsclient1 ping6 -c 1 -s 1300 dead:2::2 > /dev/null +if [ $? -eq 0 ]; then + echo "ERROR: ping6 should have failed with PMTU too big error" 1>&2 + ret=1 +fi + +expect="packets 2 bytes 1856" +for netns in nsrouter1 nsclient1;do + check_counter "$netns" "related" "$expect" + if [ $? -ne 0 ]; then + ret=1 + fi +done + +if [ $ret -eq 0 ];then + echo "PASS: icmp mtu error had RELATED state" +else + echo "ERROR: icmp error RELATED state test has failed" +fi + +cleanup +exit $ret -- cgit v1.2.3 From 1025ce75212bf06d93910297a03ed6a4d41d8213 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 25 Mar 2019 23:11:54 +0100 Subject: netfilter: conntrack: don't set related state for different outer address Luca Moro says: ------ The issue lies in the filtering of ICMP and ICMPv6 errors that include an inner IP datagram. For these packets, icmp_error_message() extract the ICMP error and inner layer to search of a known state. If a state is found the packet is tagged as related (IP_CT_RELATED). The problem is that there is no correlation check between the inner and outer layer of the packet. So one can encapsulate an error with an inner layer matching a known state, while its outer layer is directed to a filtered host. In this case the whole packet will be tagged as related. This has various implications from a rule bypass (if a rule to related trafic is allow), to a known state oracle. Unfortunately, we could not find a real statement in a RFC on how this case should be filtered. The closest we found is RFC5927 (Section 4.3) but it is not very clear. A possible fix would be to check that the inner IP source is the same than the outer destination. We believed this kind of attack was not documented yet, so we started to write a blog post about it. You can find it attached to this mail (sorry for the extract quality). It contains more technical details, PoC and discussion about the identified behavior. We discovered later that https://www.gont.com.ar/papers/filtering-of-icmp-error-messages.pdf described a similar attack concept in 2004 but without the stateful filtering in mind. ----- This implements above suggested fix: In icmp(v6) error handler, take outer destination address, then pass that into the common function that does the "related" association. After obtaining the nf_conn of the matching inner-headers connection, check that the destination address of the opposite direction tuple is the same as the outer address and only set RELATED if thats the case. Reported-by: Luca Moro Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 6 ++ net/netfilter/nf_conntrack_proto_icmp.c | 93 +++++++++++++++++++++------- net/netfilter/nf_conntrack_proto_icmpv6.c | 52 ++-------------- 3 files changed, 84 insertions(+), 67 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 778087591983..a49edfdf47e8 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -75,6 +75,12 @@ bool nf_conntrack_invert_icmp_tuple(struct nf_conntrack_tuple *tuple, bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig); +int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb, + unsigned int dataoff, + const struct nf_hook_state *state, + u8 l4proto, + union nf_inet_addr *outer_daddr); + int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index 7df477996b16..9becac953587 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -103,49 +103,94 @@ int nf_conntrack_icmp_packet(struct nf_conn *ct, return NF_ACCEPT; } -/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ -static int -icmp_error_message(struct nf_conn *tmpl, struct sk_buff *skb, - const struct nf_hook_state *state) +/* Check inner header is related to any of the existing connections */ +int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb, + unsigned int dataoff, + const struct nf_hook_state *state, + u8 l4proto, union nf_inet_addr *outer_daddr) { struct nf_conntrack_tuple innertuple, origtuple; const struct nf_conntrack_tuple_hash *h; const struct nf_conntrack_zone *zone; enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; + union nf_inet_addr *ct_daddr; + enum ip_conntrack_dir dir; + struct nf_conn *ct; WARN_ON(skb_nfct(skb)); zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); /* Are they talking about one of our connections? */ - if (!nf_ct_get_tuplepr(skb, - skb_network_offset(skb) + ip_hdrlen(skb) - + sizeof(struct icmphdr), - PF_INET, state->net, &origtuple)) { - pr_debug("icmp_error_message: failed to get tuple\n"); + if (!nf_ct_get_tuplepr(skb, dataoff, + state->pf, state->net, &origtuple)) return -NF_ACCEPT; - } /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. */ - if (!nf_ct_invert_tuple(&innertuple, &origtuple)) { - pr_debug("icmp_error_message: no match\n"); + if (!nf_ct_invert_tuple(&innertuple, &origtuple)) return -NF_ACCEPT; - } - - ctinfo = IP_CT_RELATED; h = nf_conntrack_find_get(state->net, zone, &innertuple); - if (!h) { - pr_debug("icmp_error_message: no match\n"); + if (!h) + return -NF_ACCEPT; + + /* Consider: A -> T (=This machine) -> B + * Conntrack entry will look like this: + * Original: A->B + * Reply: B->T (SNAT case) OR A + * + * When this function runs, we got packet that looks like this: + * iphdr|icmphdr|inner_iphdr|l4header (tcp, udp, ..). + * + * Above nf_conntrack_find_get() makes lookup based on inner_hdr, + * so we should expect that destination of the found connection + * matches outer header destination address. + * + * In above example, we can consider these two cases: + * 1. Error coming in reply direction from B or M (middle box) to + * T (SNAT case) or A. + * Inner saddr will be B, dst will be T or A. + * The found conntrack will be reply tuple (B->T/A). + * 2. Error coming in original direction from A or M to B. + * Inner saddr will be A, inner daddr will be B. + * The found conntrack will be original tuple (A->B). + * + * In both cases, conntrack[dir].dst == inner.dst. + * + * A bogus packet could look like this: + * Inner: B->T + * Outer: B->X (other machine reachable by T). + * + * In this case, lookup yields connection A->B and will + * set packet from B->X as *RELATED*, even though no connection + * from X was ever seen. + */ + ct = nf_ct_tuplehash_to_ctrack(h); + dir = NF_CT_DIRECTION(h); + ct_daddr = &ct->tuplehash[dir].tuple.dst.u3; + if (!nf_inet_addr_cmp(outer_daddr, ct_daddr)) { + if (state->pf == AF_INET) { + nf_l4proto_log_invalid(skb, state->net, state->pf, + l4proto, + "outer daddr %pI4 != inner %pI4", + &outer_daddr->ip, &ct_daddr->ip); + } else if (state->pf == AF_INET6) { + nf_l4proto_log_invalid(skb, state->net, state->pf, + l4proto, + "outer daddr %pI6 != inner %pI6", + &outer_daddr->ip6, &ct_daddr->ip6); + } + nf_ct_put(ct); return -NF_ACCEPT; } - if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + ctinfo = IP_CT_RELATED; + if (dir == IP_CT_DIR_REPLY) ctinfo += IP_CT_IS_REPLY; /* Update skb to refer to this connection */ - nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); + nf_ct_set(skb, ct, ctinfo); return NF_ACCEPT; } @@ -162,11 +207,12 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { + union nf_inet_addr outer_daddr; const struct icmphdr *icmph; struct icmphdr _ih; /* Not enough header? */ - icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); + icmph = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); if (icmph == NULL) { icmp_error_log(skb, state, "short packet"); return -NF_ACCEPT; @@ -199,7 +245,12 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, icmph->type != ICMP_REDIRECT) return NF_ACCEPT; - return icmp_error_message(tmpl, skb, state); + memset(&outer_daddr, 0, sizeof(outer_daddr)); + outer_daddr.ip = ip_hdr(skb)->daddr; + + dataoff += sizeof(*icmph); + return nf_conntrack_inet_error(tmpl, skb, dataoff, state, + IPPROTO_ICMP, &outer_daddr); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index bec4a3211658..c63ee3612855 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -123,51 +123,6 @@ int nf_conntrack_icmpv6_packet(struct nf_conn *ct, return NF_ACCEPT; } -static int -icmpv6_error_message(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, - unsigned int icmp6off) -{ - struct nf_conntrack_tuple intuple, origtuple; - const struct nf_conntrack_tuple_hash *h; - enum ip_conntrack_info ctinfo; - struct nf_conntrack_zone tmp; - - WARN_ON(skb_nfct(skb)); - - /* Are they talking about one of our connections? */ - if (!nf_ct_get_tuplepr(skb, - skb_network_offset(skb) - + sizeof(struct ipv6hdr) - + sizeof(struct icmp6hdr), - PF_INET6, net, &origtuple)) { - pr_debug("icmpv6_error: Can't get tuple\n"); - return -NF_ACCEPT; - } - - /* Ordinarily, we'd expect the inverted tupleproto, but it's - been preserved inside the ICMP. */ - if (!nf_ct_invert_tuple(&intuple, &origtuple)) { - pr_debug("icmpv6_error: Can't invert tuple\n"); - return -NF_ACCEPT; - } - - ctinfo = IP_CT_RELATED; - - h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp), - &intuple); - if (!h) { - pr_debug("icmpv6_error: no match\n"); - return -NF_ACCEPT; - } else { - if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) - ctinfo += IP_CT_IS_REPLY; - } - - /* Update skb to refer to this connection */ - nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); - return NF_ACCEPT; -} static void icmpv6_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, @@ -182,6 +137,7 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, unsigned int dataoff, const struct nf_hook_state *state) { + union nf_inet_addr outer_daddr; const struct icmp6hdr *icmp6h; struct icmp6hdr _ih; int type; @@ -210,7 +166,11 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; - return icmpv6_error_message(state->net, tmpl, skb, dataoff); + memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr, + sizeof(outer_daddr.ip6)); + dataoff += sizeof(*icmp6h); + return nf_conntrack_inet_error(tmpl, skb, dataoff, state, + IPPROTO_ICMPV6, &outer_daddr); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) -- cgit v1.2.3 From 8176c8332751bf27597488d6e45c9b8f530593bf Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 28 Mar 2019 10:47:20 +0100 Subject: netfilter: conntrack: initialize ct->timeout KMSAN started reporting an error when accessing ct->timeout for the first time without initialization: BUG: KMSAN: uninit-value in __nf_ct_refresh_acct+0x1ae/0x470 net/netfilter/nf_conntrack_core.c:1765 ... dump_stack+0x173/0x1d0 lib/dump_stack.c:113 kmsan_report+0x131/0x2a0 mm/kmsan/kmsan.c:624 __msan_warning+0x7a/0xf0 mm/kmsan/kmsan_instr.c:310 __nf_ct_refresh_acct+0x1ae/0x470 net/netfilter/nf_conntrack_core.c:1765 nf_ct_refresh_acct ./include/net/netfilter/nf_conntrack.h:201 nf_conntrack_udp_packet+0xb44/0x1040 net/netfilter/nf_conntrack_proto_udp.c:122 nf_conntrack_handle_packet net/netfilter/nf_conntrack_core.c:1605 nf_conntrack_in+0x1250/0x26c9 net/netfilter/nf_conntrack_core.c:1696 ... Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:205 kmsan_internal_poison_shadow+0x92/0x150 mm/kmsan/kmsan.c:159 kmsan_kmalloc+0xa9/0x130 mm/kmsan/kmsan_hooks.c:173 kmem_cache_alloc+0x554/0xb10 mm/slub.c:2789 __nf_conntrack_alloc+0x16f/0x690 net/netfilter/nf_conntrack_core.c:1342 init_conntrack+0x6cb/0x2490 net/netfilter/nf_conntrack_core.c:1421 Signed-off-by: Alexander Potapenko Fixes: cc16921351d8ba1 ("netfilter: conntrack: avoid same-timeout update") Cc: Florian Westphal Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 82bfbeef46af..a137d4e7f218 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1350,6 +1350,7 @@ __nf_conntrack_alloc(struct net *net, /* save hash for reusing when confirming */ *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; ct->status = 0; + ct->timeout = 0; write_pnet(&ct->ct_net, net); memset(&ct->__nfct_init_offset[0], 0, offsetof(struct nf_conn, proto) - -- cgit v1.2.3 From 0261ea1bd1eb0da5c0792a9119b8655cf33c80a3 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 31 Mar 2019 13:24:52 +0300 Subject: ipvs: do not schedule icmp errors from tunnels We can receive ICMP errors from client or from tunneling real server. While the former can be scheduled to real server, the latter should not be scheduled, they are decapsulated only when existing connection is found. Fixes: 6044eeffafbe ("ipvs: attempt to schedule icmp packets") Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 43bbaa32b1d6..14457551bcb4 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1678,7 +1678,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, if (!cp) { int v; - if (!sysctl_schedule_icmp(ipvs)) + if (ipip || !sysctl_schedule_icmp(ipvs)) return NF_ACCEPT; if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph)) -- cgit v1.2.3 From 3c79107631db1f7fd32cf3f7368e4672004a3010 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 1 Apr 2019 13:08:54 +0200 Subject: netfilter: ctnetlink: don't use conntrack/expect object addresses as id else, we leak the addresses to userspace via ctnetlink events and dumps. Compute an ID on demand based on the immutable parts of nf_conn struct. Another advantage compared to using an address is that there is no immediate re-use of the same ID in case the conntrack entry is freed and reallocated again immediately. Fixes: 3583240249ef ("[NETFILTER]: nf_conntrack_expect: kill unique ID") Fixes: 7f85f914721f ("[NETFILTER]: nf_conntrack: kill unique ID") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 2 ++ net/netfilter/nf_conntrack_core.c | 35 +++++++++++++++++++++++++++++++++++ net/netfilter/nf_conntrack_netlink.c | 34 +++++++++++++++++++++++++++++----- 3 files changed, 66 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 5ee7b30b4917..d2bc733a2ef1 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -316,6 +316,8 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, gfp_t flags); void nf_ct_tmpl_free(struct nf_conn *tmpl); +u32 nf_ct_get_id(const struct nf_conn *ct); + static inline void nf_ct_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info info) { diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index a137d4e7f218..3c48d44d6fff 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -449,6 +450,40 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, } EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); +/* Generate a almost-unique pseudo-id for a given conntrack. + * + * intentionally doesn't re-use any of the seeds used for hash + * table location, we assume id gets exposed to userspace. + * + * Following nf_conn items do not change throughout lifetime + * of the nf_conn after it has been committed to main hash table: + * + * 1. nf_conn address + * 2. nf_conn->ext address + * 3. nf_conn->master address (normally NULL) + * 4. tuple + * 5. the associated net namespace + */ +u32 nf_ct_get_id(const struct nf_conn *ct) +{ + static __read_mostly siphash_key_t ct_id_seed; + unsigned long a, b, c, d; + + net_get_random_once(&ct_id_seed, sizeof(ct_id_seed)); + + a = (unsigned long)ct; + b = (unsigned long)ct->master ^ net_hash_mix(nf_ct_net(ct)); + c = (unsigned long)ct->ext; + d = (unsigned long)siphash(&ct->tuplehash, sizeof(ct->tuplehash), + &ct_id_seed); +#ifdef CONFIG_64BIT + return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed); +#else + return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed); +#endif +} +EXPORT_SYMBOL_GPL(nf_ct_get_id); + static void clean_from_lists(struct nf_conn *ct) { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 66c596d287a5..d7f61b0547c6 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -485,7 +486,9 @@ nla_put_failure: static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) { - if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct))) + __be32 id = (__force __be32)nf_ct_get_id(ct); + + if (nla_put_be32(skb, CTA_ID, id)) goto nla_put_failure; return 0; @@ -1286,8 +1289,9 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, } if (cda[CTA_ID]) { - u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID])); - if (id != (u32)(unsigned long)ct) { + __be32 id = nla_get_be32(cda[CTA_ID]); + + if (id != (__force __be32)nf_ct_get_id(ct)) { nf_ct_put(ct); return -ENOENT; } @@ -2692,6 +2696,25 @@ nla_put_failure: static const union nf_inet_addr any_addr; +static __be32 nf_expect_get_id(const struct nf_conntrack_expect *exp) +{ + static __read_mostly siphash_key_t exp_id_seed; + unsigned long a, b, c, d; + + net_get_random_once(&exp_id_seed, sizeof(exp_id_seed)); + + a = (unsigned long)exp; + b = (unsigned long)exp->helper; + c = (unsigned long)exp->master; + d = (unsigned long)siphash(&exp->tuple, sizeof(exp->tuple), &exp_id_seed); + +#ifdef CONFIG_64BIT + return (__force __be32)siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &exp_id_seed); +#else + return (__force __be32)siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &exp_id_seed); +#endif +} + static int ctnetlink_exp_dump_expect(struct sk_buff *skb, const struct nf_conntrack_expect *exp) @@ -2739,7 +2762,7 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb, } #endif if (nla_put_be32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)) || - nla_put_be32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)) || + nla_put_be32(skb, CTA_EXPECT_ID, nf_expect_get_id(exp)) || nla_put_be32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)) || nla_put_be32(skb, CTA_EXPECT_CLASS, htonl(exp->class))) goto nla_put_failure; @@ -3044,7 +3067,8 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl, if (cda[CTA_EXPECT_ID]) { __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); - if (ntohl(id) != (u32)(unsigned long)exp) { + + if (id != nf_expect_get_id(exp)) { nf_ct_expect_put(exp); return -ENOENT; } -- cgit v1.2.3 From 33d1c018179d0a30c39cc5f1682b77867282694b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 6 Apr 2019 08:26:52 +0300 Subject: netfilter: nf_tables: prevent shift wrap in nft_chain_parse_hook() I believe that "hook->num" can be up to UINT_MAX. Shifting more than 31 bits would is undefined in C but in practice it would lead to shift wrapping. That would lead to an array overflow in nf_tables_addchain(): ops->hook = hook.type->hooks[ops->hooknum]; Fixes: fe19c04ca137 ("netfilter: nf_tables: remove nhooks field from struct nft_af_info") Signed-off-by: Dan Carpenter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ef7772e976cc..1606eaa5ae0d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1545,7 +1545,7 @@ static int nft_chain_parse_hook(struct net *net, if (IS_ERR(type)) return PTR_ERR(type); } - if (!(type->hook_mask & (1 << hook->num))) + if (hook->num > NF_MAX_HOOKS || !(type->hook_mask & (1 << hook->num))) return -EOPNOTSUPP; if (type->type == NFT_CHAIN_T_NAT && -- cgit v1.2.3 From 5bdac418f33f60b07a34e01e722889140ee8fac9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Apr 2019 14:45:20 +0200 Subject: netfilter: nat: fix icmp id randomization Sven Auhagen reported that a 2nd ping request will fail if 'fully-random' mode is used. Reason is that if no proto information is given, min/max are both 0, so we set the icmp id to 0 instead of chosing a random value between 0 and 65535. Update test case as well to catch this, without fix this yields: [..] ERROR: cannot ping ns1 from ns2 with ip masquerade fully-random (attempt 2) ERROR: cannot ping ns1 from ns2 with ipv6 masquerade fully-random (attempt 2) ... becaus 2nd ping clashes with existing 'id 0' icmp conntrack and gets dropped. Fixes: 203f2e78200c27e ("netfilter: nat: remove l4proto->unique_tuple") Reported-by: Sven Auhagen Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 11 ++++++--- tools/testing/selftests/netfilter/nft_nat.sh | 36 +++++++++++++++++++++------- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index af7dc6537758..000952719adf 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -415,9 +415,14 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, case IPPROTO_ICMPV6: /* id is same for either direction... */ keyptr = &tuple->src.u.icmp.id; - min = range->min_proto.icmp.id; - range_size = ntohs(range->max_proto.icmp.id) - - ntohs(range->min_proto.icmp.id) + 1; + if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { + min = 0; + range_size = 65536; + } else { + min = ntohs(range->min_proto.icmp.id); + range_size = ntohs(range->max_proto.icmp.id) - + ntohs(range->min_proto.icmp.id) + 1; + } goto find_free_id; #if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) case IPPROTO_GRE: diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/netfilter/nft_nat.sh index 8ec76681605c..3194007cf8d1 100755 --- a/tools/testing/selftests/netfilter/nft_nat.sh +++ b/tools/testing/selftests/netfilter/nft_nat.sh @@ -321,6 +321,7 @@ EOF test_masquerade6() { + local natflags=$1 local lret=0 ip netns exec ns0 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null @@ -354,13 +355,13 @@ ip netns exec ns0 nft -f - < /dev/null # ping ns2->ns1 if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping ns1 from ns2 with active ipv6 masquerading" + echo "ERROR: cannot ping ns1 from ns2 with active ipv6 masquerade $natflags" lret=1 fi @@ -397,19 +398,26 @@ EOF fi done + ip netns exec ns2 ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 + if [ $? -ne 0 ] ; then + echo "ERROR: cannot ping ns1 from ns2 with active ipv6 masquerade $natflags (attempt 2)" + lret=1 + fi + ip netns exec ns0 nft flush chain ip6 nat postrouting if [ $? -ne 0 ]; then echo "ERROR: Could not flush ip6 nat postrouting" 1>&2 lret=1 fi - test $lret -eq 0 && echo "PASS: IPv6 masquerade for ns2" + test $lret -eq 0 && echo "PASS: IPv6 masquerade $natflags for ns2" return $lret } test_masquerade() { + local natflags=$1 local lret=0 ip netns exec ns0 sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null @@ -417,7 +425,7 @@ test_masquerade() ip netns exec ns2 ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 if [ $? -ne 0 ] ; then - echo "ERROR: canot ping ns1 from ns2" + echo "ERROR: cannot ping ns1 from ns2 $natflags" lret=1 fi @@ -443,13 +451,13 @@ ip netns exec ns0 nft -f - < /dev/null # ping ns2->ns1 if [ $? -ne 0 ] ; then - echo "ERROR: cannot ping ns1 from ns2 with active ip masquerading" + echo "ERROR: cannot ping ns1 from ns2 with active ip masquere $natflags" lret=1 fi @@ -485,13 +493,19 @@ EOF fi done + ip netns exec ns2 ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 + if [ $? -ne 0 ] ; then + echo "ERROR: cannot ping ns1 from ns2 with active ip masquerade $natflags (attempt 2)" + lret=1 + fi + ip netns exec ns0 nft flush chain ip nat postrouting if [ $? -ne 0 ]; then echo "ERROR: Could not flush nat postrouting" 1>&2 lret=1 fi - test $lret -eq 0 && echo "PASS: IP masquerade for ns2" + test $lret -eq 0 && echo "PASS: IP masquerade $natflags for ns2" return $lret } @@ -750,8 +764,12 @@ test_local_dnat test_local_dnat6 reset_counters -test_masquerade -test_masquerade6 +test_masquerade "" +test_masquerade6 "" + +reset_counters +test_masquerade "fully-random" +test_masquerade6 "fully-random" reset_counters test_redirect -- cgit v1.2.3 From 19fad20d15a6494f47f85d869f00b11343ee5c78 Mon Sep 17 00:00:00 2001 From: ZhangXiaoxu Date: Tue, 16 Apr 2019 09:47:24 +0800 Subject: ipv4: set the tcp_min_rtt_wlen range from 0 to one day There is a UBSAN report as below: UBSAN: Undefined behaviour in net/ipv4/tcp_input.c:2877:56 signed integer overflow: 2147483647 * 1000 cannot be represented in type 'int' CPU: 3 PID: 0 Comm: swapper/3 Not tainted 5.1.0-rc4-00058-g582549e #1 Call Trace: dump_stack+0x8c/0xba ubsan_epilogue+0x11/0x60 handle_overflow+0x12d/0x170 ? ttwu_do_wakeup+0x21/0x320 __ubsan_handle_mul_overflow+0x12/0x20 tcp_ack_update_rtt+0x76c/0x780 tcp_clean_rtx_queue+0x499/0x14d0 tcp_ack+0x69e/0x1240 ? __wake_up_sync_key+0x2c/0x50 ? update_group_capacity+0x50/0x680 tcp_rcv_established+0x4e2/0xe10 tcp_v4_do_rcv+0x22b/0x420 tcp_v4_rcv+0xfe8/0x1190 ip_protocol_deliver_rcu+0x36/0x180 ip_local_deliver+0x15b/0x1a0 ip_rcv+0xac/0xd0 __netif_receive_skb_one_core+0x7f/0xb0 __netif_receive_skb+0x33/0xc0 netif_receive_skb_internal+0x84/0x1c0 napi_gro_receive+0x2a0/0x300 receive_buf+0x3d4/0x2350 ? detach_buf_split+0x159/0x390 virtnet_poll+0x198/0x840 ? reweight_entity+0x243/0x4b0 net_rx_action+0x25c/0x770 __do_softirq+0x19b/0x66d irq_exit+0x1eb/0x230 do_IRQ+0x7a/0x150 common_interrupt+0xf/0xf It can be reproduced by: echo 2147483647 > /proc/sys/net/ipv4/tcp_min_rtt_wlen Fixes: f672258391b42 ("tcp: track min RTT using windowed min-filter") Signed-off-by: ZhangXiaoxu Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 1 + net/ipv4/sysctl_net_ipv4.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index acdfb5d2bcaa..e2142fe40cda 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -422,6 +422,7 @@ tcp_min_rtt_wlen - INTEGER minimum RTT when it is moved to a longer path (e.g., due to traffic engineering). A longer window makes the filter more resistant to RTT inflations such as transient congestion. The unit is seconds. + Possible values: 0 - 86400 (1 day) Default: 300 tcp_moderate_rcvbuf - BOOLEAN diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ba0fc4b18465..eeb4041fa5f9 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -49,6 +49,7 @@ static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static int comp_sack_nr_max = 255; static u32 u32_max_div_HZ = UINT_MAX / HZ; +static int one_day_secs = 24 * 3600; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -1151,7 +1152,9 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_tcp_min_rtt_wlen, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one_day_secs }, { .procname = "tcp_autocorking", -- cgit v1.2.3 From f87db4dbd52f2f8a170a2b51cb0926221ca7c9e2 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 17 Apr 2019 09:49:39 +0800 Subject: net: stmmac: Use bfsize1 in ndesc_init_rx_desc gcc warn this: drivers/net/ethernet/stmicro/stmmac/norm_desc.c: In function ndesc_init_rx_desc: drivers/net/ethernet/stmicro/stmmac/norm_desc.c:138:6: warning: variable 'bfsize1' set but not used [-Wunused-but-set-variable] Like enh_desc_init_rx_desc, we should use bfsize1 in ndesc_init_rx_desc to calculate 'p->des1' Fixes: 583e63614149 ("net: stmmac: use correct DMA buffer size in the RX descriptor") Signed-off-by: YueHaibing Reviewed-by: Aaro Koskinen Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/norm_desc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/norm_desc.c b/drivers/net/ethernet/stmicro/stmmac/norm_desc.c index b7dd4e3c760d..6d690678c20e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/norm_desc.c +++ b/drivers/net/ethernet/stmicro/stmmac/norm_desc.c @@ -140,7 +140,7 @@ static void ndesc_init_rx_desc(struct dma_desc *p, int disable_rx_ic, int mode, p->des0 |= cpu_to_le32(RDES0_OWN); bfsize1 = min(bfsize, BUF_SIZE_2KiB - 1); - p->des1 |= cpu_to_le32(bfsize & RDES1_BUFFER1_SIZE_MASK); + p->des1 |= cpu_to_le32(bfsize1 & RDES1_BUFFER1_SIZE_MASK); if (mode == STMMAC_CHAIN_MODE) ndesc_rx_set_on_chain(p, end); -- cgit v1.2.3 From d003d772e64df08af04ee63609d47169ee82ae0e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 17 Apr 2019 14:15:00 +0100 Subject: nfp: abm: fix spelling mistake "offseting" -> "offsetting" There are a couple of spelling mistakes in NL_SET_ERR_MSG_MOD error messages. Fix these. Signed-off-by: Colin Ian King Acked-by: Jakub Kicinski Reviewed-by: Mukesh Ojha Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/abm/cls.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/abm/cls.c b/drivers/net/ethernet/netronome/nfp/abm/cls.c index 9852080cf454..ff3913085665 100644 --- a/drivers/net/ethernet/netronome/nfp/abm/cls.c +++ b/drivers/net/ethernet/netronome/nfp/abm/cls.c @@ -39,7 +39,7 @@ nfp_abm_u32_check_knode(struct nfp_abm *abm, struct tc_cls_u32_knode *knode, } if (knode->sel->off || knode->sel->offshift || knode->sel->offmask || knode->sel->offoff || knode->fshift) { - NL_SET_ERR_MSG_MOD(extack, "variable offseting not supported"); + NL_SET_ERR_MSG_MOD(extack, "variable offsetting not supported"); return false; } if (knode->sel->hoff || knode->sel->hmask) { @@ -78,7 +78,7 @@ nfp_abm_u32_check_knode(struct nfp_abm *abm, struct tc_cls_u32_knode *knode, k = &knode->sel->keys[0]; if (k->offmask) { - NL_SET_ERR_MSG_MOD(extack, "offset mask - variable offseting not supported"); + NL_SET_ERR_MSG_MOD(extack, "offset mask - variable offsetting not supported"); return false; } if (k->off) { -- cgit v1.2.3 From 27b141fc234a3670d21bd742c35d7205d03cbb3a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 Apr 2019 18:29:13 +0200 Subject: s390: ctcm: fix ctcm_new_device error return code clang points out that the return code from this function is undefined for one of the error paths: ../drivers/s390/net/ctcm_main.c:1595:7: warning: variable 'result' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized] if (priv->channel[direction] == NULL) { ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/s390/net/ctcm_main.c:1638:9: note: uninitialized use occurs here return result; ^~~~~~ ../drivers/s390/net/ctcm_main.c:1595:3: note: remove the 'if' if its condition is always false if (priv->channel[direction] == NULL) { ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/s390/net/ctcm_main.c:1539:12: note: initialize the variable 'result' to silence this warning int result; ^ Make it return -ENODEV here, as in the related failure cases. gcc has a known bug in underreporting some of these warnings when it has already eliminated the assignment of the return code based on some earlier optimization step. Reviewed-by: Nathan Chancellor Signed-off-by: Arnd Bergmann Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/ctcm_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/s390/net/ctcm_main.c b/drivers/s390/net/ctcm_main.c index 7617d21cb296..f63c5c871d3d 100644 --- a/drivers/s390/net/ctcm_main.c +++ b/drivers/s390/net/ctcm_main.c @@ -1595,6 +1595,7 @@ static int ctcm_new_device(struct ccwgroup_device *cgdev) if (priv->channel[direction] == NULL) { if (direction == CTCM_WRITE) channel_free(priv->channel[CTCM_READ]); + result = -ENODEV; goto out_dev; } priv->channel[direction]->netdev = dev; -- cgit v1.2.3 From f476b3f809fa02f47af6333ed63715058c3fc348 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 18 Apr 2019 07:14:13 +0000 Subject: mlxsw: spectrum: Put MC TCs into DWRR mode Both Spectrum-1 and Spectrum-2 chips are currently configured such that pairs of TC n (which is used for UC traffic) and TC n+8 (which is used for MC traffic) are feeding into the same subgroup. Strict prioritization is configured between the two TCs, and by enabling MC-aware mode on the switch, the lower-numbered (UC) TCs are favored over the higher-numbered (MC) TCs. On Spectrum-2 however, there is an issue in configuration of the MC-aware mode. As a result, MC traffic is prioritized over UC traffic. To work around the issue, configure the MC TCs with DWRR mode (while keeping the UC TCs in strict mode). With this patch, the multicast-unicast arbitration results in the same behavior on both Spectrum-1 and Spectrum-2 chips. Fixes: 7b8195306694 ("mlxsw: spectrum: Configure MC-aware mode on mlxsw ports") Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 9eb63300c1d3..3745ea194632 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -3316,7 +3316,7 @@ static int mlxsw_sp_port_ets_init(struct mlxsw_sp_port *mlxsw_sp_port) err = mlxsw_sp_port_ets_set(mlxsw_sp_port, MLXSW_REG_QEEC_HIERARCY_TC, i + 8, i, - false, 0); + true, 100); if (err) return err; } -- cgit v1.2.3 From 1ab3030193d25878b3b1409060e1e0a879800c95 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 18 Apr 2019 07:14:14 +0000 Subject: mlxsw: pci: Reincrease PCI reset timeout During driver initialization the driver sends a reset to the device and waits for the firmware to signal that it is ready to continue. Commit d2f372ba0914 ("mlxsw: pci: Increase PCI SW reset timeout") increased the timeout to 13 seconds due to longer PHY calibration in Spectrum-2 compared to Spectrum-1. Recently it became apparent that this timeout is too short and therefore this patch increases it again to a safer limit that will be reduced in the future. Fixes: c3ab435466d5 ("mlxsw: spectrum: Extend to support Spectrum-2 ASIC") Fixes: d2f372ba0914 ("mlxsw: pci: Increase PCI SW reset timeout") Signed-off-by: Ido Schimmel Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/pci_hw.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h index ffee38e36ce8..8648ca171254 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h +++ b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h @@ -27,7 +27,7 @@ #define MLXSW_PCI_SW_RESET 0xF0010 #define MLXSW_PCI_SW_RESET_RST_BIT BIT(0) -#define MLXSW_PCI_SW_RESET_TIMEOUT_MSECS 13000 +#define MLXSW_PCI_SW_RESET_TIMEOUT_MSECS 20000 #define MLXSW_PCI_SW_RESET_WAIT_MSECS 100 #define MLXSW_PCI_FW_READY 0xA1844 #define MLXSW_PCI_FW_READY_MASK 0xFFFF -- cgit v1.2.3 From 151f0dddbbfe4c35c9c5b64873115aafd436af9d Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Thu, 18 Apr 2019 07:14:16 +0000 Subject: mlxsw: spectrum: Fix autoneg status in ethtool If link is down and autoneg is set to on/off, the status in ethtool does not change. The reason is when the link is down the function returns with zero before changing autoneg value. Move the checking of link state (up/down) to be performed after setting autoneg value, in order to be sure that autoneg will change in any case. Fixes: 56ade8fe3fe1 ("mlxsw: spectrum: Add initial support for Spectrum ASIC") Signed-off-by: Amit Cohen Signed-off-by: Ido Schimmel Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 3745ea194632..6b8aa3761899 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -3126,11 +3126,11 @@ mlxsw_sp_port_set_link_ksettings(struct net_device *dev, if (err) return err; + mlxsw_sp_port->link.autoneg = autoneg; + if (!netif_running(dev)) return 0; - mlxsw_sp_port->link.autoneg = autoneg; - mlxsw_sp_port_admin_status_set(mlxsw_sp_port, false); mlxsw_sp_port_admin_status_set(mlxsw_sp_port, true); -- cgit v1.2.3 From d5f6db353829fe3867bbf9cd73fd8d631e2346f1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 18 Apr 2019 11:39:18 +0100 Subject: net: ipv6: addrlabel: fix spelling mistake "requewst" -> "request" There is a spelling mistake in a NL_SET_ERR_MSG_MOD error message, fix it. Signed-off-by: Colin Ian King Reviewed-by: Mukesh Ojha Signed-off-by: David S. Miller --- net/ipv6/addrlabel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index d43d076c98f5..1766325423b5 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -476,7 +476,7 @@ static int ip6addrlbl_valid_dump_req(const struct nlmsghdr *nlh, } if (nlmsg_attrlen(nlh, sizeof(*ifal))) { - NL_SET_ERR_MSG_MOD(extack, "Invalid data after header for address label dump requewst"); + NL_SET_ERR_MSG_MOD(extack, "Invalid data after header for address label dump request"); return -EINVAL; } -- cgit v1.2.3 From a7cf2cc3cd3622eae9d5619cdde56b4462398c7f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 18 Apr 2019 18:03:50 +0100 Subject: firestream: fix spelling mistake "tramsitted" -> "transmitted" There is a spelling mistake in a debug message. Fix it. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/atm/firestream.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c index 11e1663bdc4d..b2c06da4f62e 100644 --- a/drivers/atm/firestream.c +++ b/drivers/atm/firestream.c @@ -1646,7 +1646,7 @@ static irqreturn_t fs_irq (int irq, void *dev_id) } if (status & ISR_TBRQ_W) { - fs_dprintk (FS_DEBUG_IRQ, "Data tramsitted!\n"); + fs_dprintk (FS_DEBUG_IRQ, "Data transmitted!\n"); process_txdone_queue (dev, &dev->tx_relq); } -- cgit v1.2.3 From e0c1d14a1a3211dccf0540a6703ffbd5d2a75bdb Mon Sep 17 00:00:00 2001 From: Su Bao Cheng Date: Thu, 18 Apr 2019 11:14:56 +0200 Subject: stmmac: pci: Adjust IOT2000 matching Since there are more IOT2040 variants with identical hardware but different asset tags, the asset tag matching should be adjusted to support them. For the board name "SIMATIC IOT2000", currently there are 2 types of hardware, IOT2020 and IOT2040. The IOT2020 is identified by its unique asset tag. Match on it first. If we then match on the board name only, we will catch all IOT2040 variants. In the future there will be no other devices with the "SIMATIC IOT2000" DMI board name but different hardware. Signed-off-by: Su Bao Cheng Reviewed-by: Jan Kiszka Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c index d819e8eaba12..cc1e887e47b5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c @@ -159,6 +159,12 @@ static const struct dmi_system_id quark_pci_dmi[] = { }, .driver_data = (void *)&galileo_stmmac_dmi_data, }, + /* + * There are 2 types of SIMATIC IOT2000: IOT20202 and IOT2040. + * The asset tag "6ES7647-0AA00-0YA2" is only for IOT2020 which + * has only one pci network device while other asset tags are + * for IOT2040 which has two. + */ { .matches = { DMI_EXACT_MATCH(DMI_BOARD_NAME, "SIMATIC IOT2000"), @@ -170,8 +176,6 @@ static const struct dmi_system_id quark_pci_dmi[] = { { .matches = { DMI_EXACT_MATCH(DMI_BOARD_NAME, "SIMATIC IOT2000"), - DMI_EXACT_MATCH(DMI_BOARD_ASSET_TAG, - "6ES7647-0AA00-1YA2"), }, .driver_data = (void *)&iot2040_stmmac_dmi_data, }, -- cgit v1.2.3 From 9188d5ca454fd665145904267e726e9e8d122f5c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 17 Apr 2019 10:51:19 -0700 Subject: net/tls: fix refcount adjustment in fallback Unlike atomic_add(), refcount_add() does not deal well with a negative argument. TLS fallback code reallocates the skb and is very likely to shrink the truesize, leading to: [ 189.513254] WARNING: CPU: 5 PID: 0 at lib/refcount.c:81 refcount_add_not_zero_checked+0x15c/0x180 Call Trace: refcount_add_checked+0x6/0x40 tls_enc_skb+0xb93/0x13e0 [tls] Once wmem_allocated count saturates the application can no longer send data on the socket. This is similar to Eric's fixes for GSO, TCP: commit 7ec318feeed1 ("tcp: gso: avoid refcount_t warning from tcp_gso_segment()") and UDP: commit 575b65bc5bff ("udp: avoid refcount_t saturation in __udp_gso_segment()"). Unlike the GSO case, for TLS fallback it's likely that the skb has shrunk, so the "likely" annotation is the other way around (likely branch being "sub"). Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Signed-off-by: Jakub Kicinski Reviewed-by: John Hurley Signed-off-by: David S. Miller --- net/tls/tls_device_fallback.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 54c3a758f2a7..a3ebd4b02714 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -194,6 +194,9 @@ static void update_chksum(struct sk_buff *skb, int headln) static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln) { + struct sock *sk = skb->sk; + int delta; + skb_copy_header(nskb, skb); skb_put(nskb, skb->len); @@ -201,11 +204,15 @@ static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln) update_chksum(nskb, headln); nskb->destructor = skb->destructor; - nskb->sk = skb->sk; + nskb->sk = sk; skb->destructor = NULL; skb->sk = NULL; - refcount_add(nskb->truesize - skb->truesize, - &nskb->sk->sk_wmem_alloc); + + delta = nskb->truesize - skb->truesize; + if (likely(delta < 0)) + WARN_ON_ONCE(refcount_sub_and_test(-delta, &sk->sk_wmem_alloc)); + else if (delta) + refcount_add(delta, &sk->sk_wmem_alloc); } /* This function may be called after the user socket is already -- cgit v1.2.3 From 36ad7022536e0c65f8baeeaa5efde11dec44808a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20=C5=A0tetiar?= Date: Wed, 17 Apr 2019 22:09:12 +0200 Subject: of_net: Fix residues after of_get_nvmem_mac_address removal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I've discovered following discrepancy in the bindings/net/ethernet.txt documentation, where it states following: - nvmem-cells: phandle, reference to an nvmem node for the MAC address; - nvmem-cell-names: string, should be "mac-address" if nvmem is to be.. which is actually misleading and confusing. There are only two ethernet drivers in the tree, cadence/macb and davinci which supports this properties. This nvmem-cell* properties were introduced in commit 9217e566bdee ("of_net: Implement of_get_nvmem_mac_address helper"), but commit afa64a72b862 ("of: net: kill of_get_nvmem_mac_address()") forget to properly clean up this parts. So this patch fixes the documentation by moving the nvmem-cell* properties at the appropriate places. While at it, I've removed unused include as well. Cc: Bartosz Golaszewski Fixes: afa64a72b862 ("of: net: kill of_get_nvmem_mac_address()") Signed-off-by: Petr Štetiar Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/davinci_emac.txt | 2 ++ Documentation/devicetree/bindings/net/ethernet.txt | 2 -- Documentation/devicetree/bindings/net/macb.txt | 4 ++++ drivers/of/of_net.c | 1 - 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/net/davinci_emac.txt b/Documentation/devicetree/bindings/net/davinci_emac.txt index 24c5cdaba8d2..ca83dcc84fb8 100644 --- a/Documentation/devicetree/bindings/net/davinci_emac.txt +++ b/Documentation/devicetree/bindings/net/davinci_emac.txt @@ -20,6 +20,8 @@ Required properties: Optional properties: - phy-handle: See ethernet.txt file in the same directory. If absent, davinci_emac driver defaults to 100/FULL. +- nvmem-cells: phandle, reference to an nvmem node for the MAC address +- nvmem-cell-names: string, should be "mac-address" if nvmem is to be used - ti,davinci-rmii-en: 1 byte, 1 means use RMII - ti,davinci-no-bd-ram: boolean, does EMAC have BD RAM? diff --git a/Documentation/devicetree/bindings/net/ethernet.txt b/Documentation/devicetree/bindings/net/ethernet.txt index cfc376bc977a..2974e63ba311 100644 --- a/Documentation/devicetree/bindings/net/ethernet.txt +++ b/Documentation/devicetree/bindings/net/ethernet.txt @@ -10,8 +10,6 @@ Documentation/devicetree/bindings/phy/phy-bindings.txt. the boot program; should be used in cases where the MAC address assigned to the device by the boot program is different from the "local-mac-address" property; -- nvmem-cells: phandle, reference to an nvmem node for the MAC address; -- nvmem-cell-names: string, should be "mac-address" if nvmem is to be used; - max-speed: number, specifies maximum speed in Mbit/s supported by the device; - max-frame-size: number, maximum transfer unit (IEEE defined MTU), rather than the maximum frame size (there's contradiction in the Devicetree diff --git a/Documentation/devicetree/bindings/net/macb.txt b/Documentation/devicetree/bindings/net/macb.txt index 174f292d8a3e..8b80515729d7 100644 --- a/Documentation/devicetree/bindings/net/macb.txt +++ b/Documentation/devicetree/bindings/net/macb.txt @@ -26,6 +26,10 @@ Required properties: Optional elements: 'tsu_clk' - clocks: Phandles to input clocks. +Optional properties: +- nvmem-cells: phandle, reference to an nvmem node for the MAC address +- nvmem-cell-names: string, should be "mac-address" if nvmem is to be used + Optional properties for PHY child node: - reset-gpios : Should specify the gpio for phy reset - magic-packet : If present, indicates that the hardware supports waking diff --git a/drivers/of/of_net.c b/drivers/of/of_net.c index 810ab0fbcccb..d820f3edd431 100644 --- a/drivers/of/of_net.c +++ b/drivers/of/of_net.c @@ -7,7 +7,6 @@ */ #include #include -#include #include #include #include -- cgit v1.2.3 From 12fc512f5741443a03adde2ead20724da8ad550a Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Fri, 15 Mar 2019 16:41:43 +0200 Subject: net/mlx5e: Fix use-after-free after xdp_return_frame xdp_return_frame releases the frame. It leads to releasing the page, so it's not allowed to access xdpi.xdpf->len after that, because xdpi.xdpf is at xdp->data_hard_start after convert_to_xdp_frame. This patch moves the memory access to precede the return of the frame. Fixes: 58b99ee3e3ebe ("net/mlx5e: Add support for XDP_REDIRECT in device-out side") Signed-off-by: Maxim Mikityanskiy Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 03b2a9f9c589..10a99cd3e598 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -304,9 +304,9 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq) mlx5e_xdpi_fifo_pop(xdpi_fifo); if (is_redirect) { - xdp_return_frame(xdpi.xdpf); dma_unmap_single(sq->pdev, xdpi.dma_addr, xdpi.xdpf->len, DMA_TO_DEVICE); + xdp_return_frame(xdpi.xdpf); } else { /* Recycle RX page */ mlx5e_page_release(rq, &xdpi.di, true); @@ -345,9 +345,9 @@ void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq) mlx5e_xdpi_fifo_pop(xdpi_fifo); if (is_redirect) { - xdp_return_frame(xdpi.xdpf); dma_unmap_single(sq->pdev, xdpi.dma_addr, xdpi.xdpf->len, DMA_TO_DEVICE); + xdp_return_frame(xdpi.xdpf); } else { /* Recycle RX page */ mlx5e_page_release(rq, &xdpi.di, false); -- cgit v1.2.3 From d460c2718906252a2a69bc6f89b537071f792e6e Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Mon, 8 Apr 2019 15:12:45 +0300 Subject: net/mlx5e: Fix the max MTU check in case of XDP MLX5E_XDP_MAX_MTU was calculated incorrectly. It didn't account for NET_IP_ALIGN and MLX5E_HW2SW_MTU, and it also misused MLX5_SKB_FRAG_SZ. This commit fixes the calculations and adds a brief explanation for the formula used. Fixes: a26a5bdf3ee2d ("net/mlx5e: Restrict the combination of large MTU and XDP") Signed-off-by: Maxim Mikityanskiy Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 20 ++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h | 3 +-- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 5 +++-- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 10a99cd3e598..cad34d6f5f45 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -33,6 +33,26 @@ #include #include "en/xdp.h" +int mlx5e_xdp_max_mtu(struct mlx5e_params *params) +{ + int hr = NET_IP_ALIGN + XDP_PACKET_HEADROOM; + + /* Let S := SKB_DATA_ALIGN(sizeof(struct skb_shared_info)). + * The condition checked in mlx5e_rx_is_linear_skb is: + * SKB_DATA_ALIGN(sw_mtu + hard_mtu + hr) + S <= PAGE_SIZE (1) + * (Note that hw_mtu == sw_mtu + hard_mtu.) + * What is returned from this function is: + * max_mtu = PAGE_SIZE - S - hr - hard_mtu (2) + * After assigning sw_mtu := max_mtu, the left side of (1) turns to + * SKB_DATA_ALIGN(PAGE_SIZE - S) + S, which is equal to PAGE_SIZE, + * because both PAGE_SIZE and S are already aligned. Any number greater + * than max_mtu would make the left side of (1) greater than PAGE_SIZE, + * so max_mtu is the maximum MTU allowed. + */ + + return MLX5E_HW2SW_MTU(params, SKB_MAX_HEAD(hr)); +} + static inline bool mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_dma_info *di, struct xdp_buff *xdp) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h index ee27a7c8cd87..553956cadc8a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h @@ -34,13 +34,12 @@ #include "en.h" -#define MLX5E_XDP_MAX_MTU ((int)(PAGE_SIZE - \ - MLX5_SKB_FRAG_SZ(XDP_PACKET_HEADROOM))) #define MLX5E_XDP_MIN_INLINE (ETH_HLEN + VLAN_HLEN) #define MLX5E_XDP_TX_EMPTY_DS_COUNT \ (sizeof(struct mlx5e_tx_wqe) / MLX5_SEND_WQE_DS) #define MLX5E_XDP_TX_DS_COUNT (MLX5E_XDP_TX_EMPTY_DS_COUNT + 1 /* SG DS */) +int mlx5e_xdp_max_mtu(struct mlx5e_params *params); bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, void *va, u16 *rx_headroom, u32 *len); bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index f7eb521db580..46157e2a1e5a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3777,7 +3777,7 @@ int mlx5e_change_mtu(struct net_device *netdev, int new_mtu, if (params->xdp_prog && !mlx5e_rx_is_linear_skb(priv->mdev, &new_channels.params)) { netdev_err(netdev, "MTU(%d) > %d is not allowed while XDP enabled\n", - new_mtu, MLX5E_XDP_MAX_MTU); + new_mtu, mlx5e_xdp_max_mtu(params)); err = -EINVAL; goto out; } @@ -4212,7 +4212,8 @@ static int mlx5e_xdp_allowed(struct mlx5e_priv *priv, struct bpf_prog *prog) if (!mlx5e_rx_is_linear_skb(priv->mdev, &new_channels.params)) { netdev_warn(netdev, "XDP is not allowed with MTU(%d) > %d\n", - new_channels.params.sw_mtu, MLX5E_XDP_MAX_MTU); + new_channels.params.sw_mtu, + mlx5e_xdp_max_mtu(&new_channels.params)); return -EINVAL; } -- cgit v1.2.3 From ace329f4ab3ba434be2adf618073c752d083b524 Mon Sep 17 00:00:00 2001 From: Erez Alfasi Date: Thu, 11 Apr 2019 10:41:03 +0300 Subject: net/mlx5e: ethtool, Remove unsupported SFP EEPROM high pages query Querying EEPROM high pages data for SFP module is currently not supported by our driver and yet queried, resulting in invalid FW queries. Set the EEPROM ethtool data length to 256 for SFP module will limit the reading for page 0 only and prevent invalid FW queries. Fixes: bb64143eee8c ("net/mlx5e: Add ethtool support for dump module EEPROM") Signed-off-by: Erez Alfasi Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/port.c | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 76a3d01a489e..78dc8fe2a83c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1586,7 +1586,7 @@ static int mlx5e_get_module_info(struct net_device *netdev, break; case MLX5_MODULE_ID_SFP: modinfo->type = ETH_MODULE_SFF_8472; - modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; + modinfo->eeprom_len = MLX5_EEPROM_PAGE_LENGTH; break; default: netdev_err(priv->netdev, "%s: cable type not recognized:0x%x\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 21b7f05b16a5..361468e0435d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -317,10 +317,6 @@ int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, size -= offset + size - MLX5_EEPROM_PAGE_LENGTH; i2c_addr = MLX5_I2C_ADDR_LOW; - if (offset >= MLX5_EEPROM_PAGE_LENGTH) { - i2c_addr = MLX5_I2C_ADDR_HIGH; - offset -= MLX5_EEPROM_PAGE_LENGTH; - } MLX5_SET(mcia_reg, in, l, 0); MLX5_SET(mcia_reg, in, module, module_num); -- cgit v1.2.3 From 30c04d796b693e22405c38e9b78e9a364e4c77e6 Mon Sep 17 00:00:00 2001 From: Po-Hsu Lin Date: Thu, 18 Apr 2019 19:57:25 +0800 Subject: selftests/net: correct the return value for run_netsocktests The run_netsocktests will be marked as passed regardless the actual test result from the ./socket: selftests: net: run_netsocktests ======================================== -------------------- running socket test -------------------- [FAIL] ok 1..6 selftests: net: run_netsocktests [PASS] This is because the test script itself has been successfully executed. Fix this by exit 1 when the test failed. Signed-off-by: Po-Hsu Lin Signed-off-by: David S. Miller --- tools/testing/selftests/net/run_netsocktests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/run_netsocktests b/tools/testing/selftests/net/run_netsocktests index b093f39c298c..14e41faf2c57 100755 --- a/tools/testing/selftests/net/run_netsocktests +++ b/tools/testing/selftests/net/run_netsocktests @@ -7,7 +7,7 @@ echo "--------------------" ./socket if [ $? -ne 0 ]; then echo "[FAIL]" + exit 1 else echo "[PASS]" fi - -- cgit v1.2.3 From 925b0c841e066b488cc3a60272472b2c56300704 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 19 Apr 2019 14:31:00 +0800 Subject: team: fix possible recursive locking when add slaves If we add a bond device which is already the master of the team interface, we will hold the team->lock in team_add_slave() first and then request the lock in team_set_mac_address() again. The functions are called like: - team_add_slave() - team_port_add() - team_port_enter() - team_modeop_port_enter() - __set_port_dev_addr() - dev_set_mac_address() - bond_set_mac_address() - dev_set_mac_address() - team_set_mac_address Although team_upper_dev_link() would check the upper devices but it is called too late. Fix it by adding a checking before processing the slave. v2: Do not split the string in netdev_err() Fixes: 3d249d4ca7d0 ("net: introduce ethernet teaming device") Acked-by: Jiri Pirko Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- drivers/net/team/team.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 9ce61b019aad..16963f7a88f7 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1156,6 +1156,13 @@ static int team_port_add(struct team *team, struct net_device *port_dev, return -EINVAL; } + if (netdev_has_upper_dev(dev, port_dev)) { + NL_SET_ERR_MSG(extack, "Device is already an upper device of the team interface"); + netdev_err(dev, "Device %s is already an upper device of the team interface\n", + portname); + return -EBUSY; + } + if (port_dev->features & NETIF_F_VLAN_CHALLENGED && vlan_uses_dev(dev)) { NL_SET_ERR_MSG(extack, "Device is VLAN challenged and team device has VLAN set up"); -- cgit v1.2.3 From 8c03557c3f25271e62e39154af66ebdd1b59c9ca Mon Sep 17 00:00:00 2001 From: Po-Hsu Lin Date: Fri, 19 Apr 2019 19:01:13 +0800 Subject: selftests/net: correct the return value for run_afpackettests The run_afpackettests will be marked as passed regardless the return value of those sub-tests in the script: -------------------- running psock_tpacket test -------------------- [FAIL] selftests: run_afpackettests [PASS] Fix this by changing the return value for each tests. Signed-off-by: Po-Hsu Lin Signed-off-by: David S. Miller --- tools/testing/selftests/net/run_afpackettests | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/net/run_afpackettests b/tools/testing/selftests/net/run_afpackettests index 2dc95fda7ef7..ea5938ec009a 100755 --- a/tools/testing/selftests/net/run_afpackettests +++ b/tools/testing/selftests/net/run_afpackettests @@ -6,12 +6,14 @@ if [ $(id -u) != 0 ]; then exit 0 fi +ret=0 echo "--------------------" echo "running psock_fanout test" echo "--------------------" ./in_netns.sh ./psock_fanout if [ $? -ne 0 ]; then echo "[FAIL]" + ret=1 else echo "[PASS]" fi @@ -22,6 +24,7 @@ echo "--------------------" ./in_netns.sh ./psock_tpacket if [ $? -ne 0 ]; then echo "[FAIL]" + ret=1 else echo "[PASS]" fi @@ -32,6 +35,8 @@ echo "--------------------" ./in_netns.sh ./txring_overwrite if [ $? -ne 0 ]; then echo "[FAIL]" + ret=1 else echo "[PASS]" fi +exit $ret -- cgit v1.2.3 From 62ef81d5632634d5e310ed25b9b940b2b6612b46 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Apr 2019 16:51:38 -0700 Subject: net/tls: avoid potential deadlock in tls_set_device_offload_rx() If device supports offload, but offload fails tls_set_device_offload_rx() will call tls_sw_free_resources_rx() which (unhelpfully) releases and reacquires the socket lock. For a small fix release and reacquire the device_offload_lock. Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- net/tls/tls_device.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 9f3bdbc1e593..1b5f55cac613 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -904,7 +904,9 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) goto release_netdev; free_sw_resources: + up_read(&device_offload_lock); tls_sw_free_resources_rx(sk); + down_read(&device_offload_lock); release_ctx: ctx->priv_ctx_rx = NULL; release_netdev: -- cgit v1.2.3 From 12c7686111326148b4b5db189130522a4ad1be4a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Apr 2019 16:52:19 -0700 Subject: net/tls: don't leak IV and record seq when offload fails When device refuses the offload in tls_set_device_offload_rx() it calls tls_sw_free_resources_rx() to clean up software context state. Unfortunately, tls_sw_free_resources_rx() does not free all the state tls_set_sw_offload() allocated - it leaks IV and sequence number buffers. All other code paths which lead to tls_sw_release_resources_rx() (which tls_sw_free_resources_rx() calls) free those right before the call. Avoid the leak by moving freeing of iv and rec_seq into tls_sw_release_resources_rx(). Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- net/tls/tls_device.c | 2 -- net/tls/tls_main.c | 5 +---- net/tls/tls_sw.c | 3 +++ 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 1b5f55cac613..cc0256939eb6 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -941,8 +941,6 @@ void tls_device_offload_cleanup_rx(struct sock *sk) } out: up_read(&device_offload_lock); - kfree(tls_ctx->rx.rec_seq); - kfree(tls_ctx->rx.iv); tls_sw_release_resources_rx(sk); } diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 9547cea0ce3b..478603f43964 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -293,11 +293,8 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) #endif } - if (ctx->rx_conf == TLS_SW) { - kfree(ctx->rx.rec_seq); - kfree(ctx->rx.iv); + if (ctx->rx_conf == TLS_SW) tls_sw_free_resources_rx(sk); - } #ifdef CONFIG_TLS_DEVICE if (ctx->rx_conf == TLS_HW) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index b50ced862f6f..29d6af43dd24 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2078,6 +2078,9 @@ void tls_sw_release_resources_rx(struct sock *sk) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + kfree(tls_ctx->rx.rec_seq); + kfree(tls_ctx->rx.iv); + if (ctx->aead_recv) { kfree_skb(ctx->recv_pkt); ctx->recv_pkt = NULL; -- cgit v1.2.3 From 39420fe04f093c15e1674ef56dbae0df109738ec Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Sat, 20 Apr 2019 18:14:33 +0000 Subject: dt-bindings: add an explanation for internal phy-mode When working on the Allwinner internal PHY, the first work was to use the "internal" mode, but some answer was made my mail on what are really internal mean for PHY. This patch write that in the doc. Signed-off-by: Corentin Labbe Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/ethernet.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/ethernet.txt b/Documentation/devicetree/bindings/net/ethernet.txt index 2974e63ba311..a68621580584 100644 --- a/Documentation/devicetree/bindings/net/ethernet.txt +++ b/Documentation/devicetree/bindings/net/ethernet.txt @@ -16,7 +16,8 @@ Documentation/devicetree/bindings/phy/phy-bindings.txt. Specification). - phy-mode: string, operation mode of the PHY interface. This is now a de-facto standard property; supported values are: - * "internal" + * "internal" (Internal means there is not a standard bus between the MAC and + the PHY, something proprietary is being used to embed the PHY in the MAC.) * "mii" * "gmii" * "sgmii" -- cgit v1.2.3 From 26d1b8586b4fe14814ff4fd471cfc56014359e59 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Sat, 20 Apr 2019 16:43:01 +0000 Subject: Documentation: decnet: remove reference to CONFIG_DECNET_ROUTE_FWMARK CONFIG_DECNET_ROUTE_FWMARK was removed in commit 47dcf0cb1005 ("[NET]: Rethink mark field in struct flowi") Since nothing replace it (and nothindg need to replace it, simply remove it from documentation. Signed-off-by: Corentin Labbe Signed-off-by: David S. Miller --- Documentation/networking/decnet.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/Documentation/networking/decnet.txt b/Documentation/networking/decnet.txt index e12a4900cf72..d192f8b9948b 100644 --- a/Documentation/networking/decnet.txt +++ b/Documentation/networking/decnet.txt @@ -22,8 +22,6 @@ you'll need the following options as well... CONFIG_DECNET_ROUTER (to be able to add/delete routes) CONFIG_NETFILTER (will be required for the DECnet routing daemon) - CONFIG_DECNET_ROUTE_FWMARK is optional - Don't turn on SIOCGIFCONF support for DECnet unless you are really sure that you need it, in general you won't and it can cause ifconfig to malfunction. -- cgit v1.2.3 From 7caa56f006e9d712b44f27b32520c66420d5cbc6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 15 Apr 2019 00:43:00 +0200 Subject: netfilter: ebtables: CONFIG_COMPAT: drop a bogus WARN_ON It means userspace gave us a ruleset where there is some other data after the ebtables target but before the beginning of the next rule. Fixes: 81e675c227ec ("netfilter: ebtables: add CONFIG_COMPAT support") Reported-by: syzbot+659574e7bcc7f7eb4df7@syzkaller.appspotmail.com Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index eb15891f8b9f..3cad01ac64e4 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -2032,7 +2032,8 @@ static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32, if (match_kern) match_kern->match_size = ret; - if (WARN_ON(type == EBT_COMPAT_TARGET && size_left)) + /* rule should have no remaining data after target */ + if (type == EBT_COMPAT_TARGET && size_left) return -EINVAL; match32 = (struct compat_ebt_entry_mwt *) buf; -- cgit v1.2.3 From 916f6efae62305796e012e7c3a7884a267cbacbf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 17 Apr 2019 02:17:23 +0200 Subject: netfilter: never get/set skb->tstamp setting net.netfilter.nf_conntrack_timestamp=1 breaks xmit with fq scheduler. skb->tstamp might be "refreshed" using ktime_get_real(), but fq expects CLOCK_MONOTONIC. This patch removes all places in netfilter that check/set skb->tstamp: 1. To fix the bogus "start" time seen with conntrack timestamping for outgoing packets, never use skb->tstamp and always use current time. 2. In nfqueue and nflog, only use skb->tstamp for incoming packets, as determined by current hook (prerouting, input, forward). 3. xt_time has to use system clock as well rather than skb->tstamp. We could still use skb->tstamp for prerouting/input/foward, but I see no advantage to make this conditional. Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC") Cc: Eric Dumazet Reported-by: Michal Soltys Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 7 ++----- net/netfilter/nfnetlink_log.c | 2 +- net/netfilter/nfnetlink_queue.c | 2 +- net/netfilter/xt_time.c | 23 ++++++++++++++--------- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3c48d44d6fff..2a714527cde1 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1017,12 +1017,9 @@ __nf_conntrack_confirm(struct sk_buff *skb) /* set conntrack timestamp, if enabled. */ tstamp = nf_conn_tstamp_find(ct); - if (tstamp) { - if (skb->tstamp == 0) - __net_timestamp(skb); + if (tstamp) + tstamp->start = ktime_get_real_ns(); - tstamp->start = ktime_to_ns(skb->tstamp); - } /* Since the lookup is lockless, hash insertion must be done after * starting the timer and setting the CONFIRMED bit. The RCU barriers * guarantee that no other CPU can find the conntrack before the above diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index b1f9c5303f02..0b3347570265 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -540,7 +540,7 @@ __build_packet_message(struct nfnl_log_net *log, goto nla_put_failure; } - if (skb->tstamp) { + if (hooknum <= NF_INET_FORWARD && skb->tstamp) { struct nfulnl_msg_packet_timestamp ts; struct timespec64 kts = ktime_to_timespec64(skb->tstamp); ts.sec = cpu_to_be64(kts.tv_sec); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 0dcc3592d053..e057b2961d31 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -582,7 +582,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, if (nfqnl_put_bridge(entry, skb) < 0) goto nla_put_failure; - if (entskb->tstamp) { + if (entry->state.hook <= NF_INET_FORWARD && entskb->tstamp) { struct nfqnl_msg_packet_timestamp ts; struct timespec64 kts = ktime_to_timespec64(entskb->tstamp); diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index c13bcd0ab491..8dbb4d48f2ed 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -163,19 +163,24 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par) s64 stamp; /* - * We cannot use get_seconds() instead of __net_timestamp() here. + * We need real time here, but we can neither use skb->tstamp + * nor __net_timestamp(). + * + * skb->tstamp and skb->skb_mstamp_ns overlap, however, they + * use different clock types (real vs monotonic). + * * Suppose you have two rules: - * 1. match before 13:00 - * 2. match after 13:00 + * 1. match before 13:00 + * 2. match after 13:00 + * * If you match against processing time (get_seconds) it * may happen that the same packet matches both rules if - * it arrived at the right moment before 13:00. + * it arrived at the right moment before 13:00, so it would be + * better to check skb->tstamp and set it via __net_timestamp() + * if needed. This however breaks outgoing packets tx timestamp, + * and causes them to get delayed forever by fq packet scheduler. */ - if (skb->tstamp == 0) - __net_timestamp((struct sk_buff *)skb); - - stamp = ktime_to_ns(skb->tstamp); - stamp = div_s64(stamp, NSEC_PER_SEC); + stamp = get_seconds(); if (info->flags & XT_TIME_LOCAL_TZ) /* Adjust for local timezone */ -- cgit v1.2.3 From d48668052b2603b6262459625c86108c493588dd Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 17 Apr 2019 09:49:44 -0700 Subject: netfilter: fix nf_l4proto_log_invalid to log invalid packets It doesn't log a packet if sysctl_log_invalid isn't equal to protonum OR sysctl_log_invalid isn't equal to IPPROTO_RAW. This sentence is always true. I believe we need to replace OR to AND. Cc: Florian Westphal Fixes: c4f3db1595827 ("netfilter: conntrack: add and use nf_l4proto_log_invalid") Signed-off-by: Andrei Vagin Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index b9403a266a2e..37bb530d848f 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -55,7 +55,7 @@ void nf_l4proto_log_invalid(const struct sk_buff *skb, struct va_format vaf; va_list args; - if (net->ct.sysctl_log_invalid != protonum || + if (net->ct.sysctl_log_invalid != protonum && net->ct.sysctl_log_invalid != IPPROTO_RAW) return; -- cgit v1.2.3 From b561af36b1841088552464cdc3f6371d92f17710 Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Mon, 22 Apr 2019 15:15:32 +0530 Subject: net: stmmac: move stmmac_check_ether_addr() to driver probe stmmac_check_ether_addr() checks the MAC address and assigns one in driver open(). In many cases when we create slave netdevice, the dev addr is inherited from master but the master dev addr maybe NULL at that time, so move this call to driver probe so that address is always valid. Signed-off-by: Xiaofei Shen Tested-by: Xiaofei Shen Signed-off-by: Sneh Shah Signed-off-by: Vinod Koul Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index a26e36dbb5df..48712437d0da 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2616,8 +2616,6 @@ static int stmmac_open(struct net_device *dev) u32 chan; int ret; - stmmac_check_ether_addr(priv); - if (priv->hw->pcs != STMMAC_PCS_RGMII && priv->hw->pcs != STMMAC_PCS_TBI && priv->hw->pcs != STMMAC_PCS_RTBI) { @@ -4303,6 +4301,8 @@ int stmmac_dvr_probe(struct device *device, if (ret) goto error_hw_init; + stmmac_check_ether_addr(priv); + /* Configure real RX and TX queues */ netif_set_real_num_rx_queues(ndev, priv->plat->rx_queues_to_use); netif_set_real_num_tx_queues(ndev, priv->plat->tx_queues_to_use); -- cgit v1.2.3 From 2f23a2a768bee7ad2ff1e9527c3f7e279e794a46 Mon Sep 17 00:00:00 2001 From: Daniel Gomez Date: Mon, 22 Apr 2019 21:08:03 +0200 Subject: spi: Micrel eth switch: declare missing of table Add missing table for SPI driver relying on SPI device match since compatible is in a DT binding or in a DTS. Before this patch: modinfo drivers/net/phy/spi_ks8995.ko | grep alias alias: spi:ksz8795 alias: spi:ksz8864 alias: spi:ks8995 After this patch: modinfo drivers/net/phy/spi_ks8995.ko | grep alias alias: spi:ksz8795 alias: spi:ksz8864 alias: spi:ks8995 alias: of:N*T*Cmicrel,ksz8795C* alias: of:N*T*Cmicrel,ksz8795 alias: of:N*T*Cmicrel,ksz8864C* alias: of:N*T*Cmicrel,ksz8864 alias: of:N*T*Cmicrel,ks8995C* alias: of:N*T*Cmicrel,ks8995 Reported-by: Javier Martinez Canillas Signed-off-by: Daniel Gomez Signed-off-by: David S. Miller --- drivers/net/phy/spi_ks8995.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/phy/spi_ks8995.c b/drivers/net/phy/spi_ks8995.c index 92b64e254b44..7475cef17cf7 100644 --- a/drivers/net/phy/spi_ks8995.c +++ b/drivers/net/phy/spi_ks8995.c @@ -159,6 +159,14 @@ static const struct spi_device_id ks8995_id[] = { }; MODULE_DEVICE_TABLE(spi, ks8995_id); +static const struct of_device_id ks8895_spi_of_match[] = { + { .compatible = "micrel,ks8995" }, + { .compatible = "micrel,ksz8864" }, + { .compatible = "micrel,ksz8795" }, + { }, + }; +MODULE_DEVICE_TABLE(of, ks8895_spi_of_match); + static inline u8 get_chip_id(u8 val) { return (val >> ID1_CHIPID_S) & ID1_CHIPID_M; @@ -526,6 +534,7 @@ static int ks8995_remove(struct spi_device *spi) static struct spi_driver ks8995_driver = { .driver = { .name = "spi-ks8995", + .of_match_table = of_match_ptr(ks8895_spi_of_match), }, .probe = ks8995_probe, .remove = ks8995_remove, -- cgit v1.2.3 From d04830531d0c4a99c897a44038e5da3d23331d2f Mon Sep 17 00:00:00 2001 From: Daniel Gomez Date: Mon, 22 Apr 2019 21:08:04 +0200 Subject: spi: ST ST95HF NFC: declare missing of table Add missing table for SPI driver relying on SPI device match since compatible is in a DT binding or in a DTS. Before this patch: modinfo drivers/nfc/st95hf/st95hf.ko | grep alias alias: spi:st95hf After this patch: modinfo drivers/nfc/st95hf/st95hf.ko | grep alias alias: spi:st95hf alias: of:N*T*Cst,st95hfC* alias: of:N*T*Cst,st95hf Reported-by: Javier Martinez Canillas Signed-off-by: Daniel Gomez Signed-off-by: David S. Miller --- drivers/nfc/st95hf/core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/nfc/st95hf/core.c b/drivers/nfc/st95hf/core.c index 2b26f762fbc3..01acb6e53365 100644 --- a/drivers/nfc/st95hf/core.c +++ b/drivers/nfc/st95hf/core.c @@ -1074,6 +1074,12 @@ static const struct spi_device_id st95hf_id[] = { }; MODULE_DEVICE_TABLE(spi, st95hf_id); +static const struct of_device_id st95hf_spi_of_match[] = { + { .compatible = "st,st95hf" }, + { }, +}; +MODULE_DEVICE_TABLE(of, st95hf_spi_of_match); + static int st95hf_probe(struct spi_device *nfc_spi_dev) { int ret; @@ -1260,6 +1266,7 @@ static struct spi_driver st95hf_driver = { .driver = { .name = "st95hf", .owner = THIS_MODULE, + .of_match_table = of_match_ptr(st95hf_spi_of_match), }, .id_table = st95hf_id, .probe = st95hf_probe, -- cgit v1.2.3 From 66c031716bcd9647cd2304e4875163663b086405 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 23 Apr 2019 15:30:07 +0100 Subject: net: atheros: fix spelling mistake "underun" -> "underrun" There are spelling mistakes in structure elements, fix these. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/ethernet/atheros/atlx/atl1.c | 4 ++-- drivers/net/ethernet/atheros/atlx/atl1.h | 2 +- drivers/net/ethernet/atheros/atlx/atl2.c | 2 +- drivers/net/ethernet/atheros/atlx/atl2.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/atheros/atlx/atl1.c b/drivers/net/ethernet/atheros/atlx/atl1.c index 9e07b469066a..156fbc5601ca 100644 --- a/drivers/net/ethernet/atheros/atlx/atl1.c +++ b/drivers/net/ethernet/atheros/atlx/atl1.c @@ -1721,7 +1721,7 @@ static void atl1_inc_smb(struct atl1_adapter *adapter) adapter->soft_stats.scc += smb->tx_1_col; adapter->soft_stats.mcc += smb->tx_2_col; adapter->soft_stats.latecol += smb->tx_late_col; - adapter->soft_stats.tx_underun += smb->tx_underrun; + adapter->soft_stats.tx_underrun += smb->tx_underrun; adapter->soft_stats.tx_trunc += smb->tx_trunc; adapter->soft_stats.tx_pause += smb->tx_pause; @@ -3179,7 +3179,7 @@ static struct atl1_stats atl1_gstrings_stats[] = { {"tx_deferred_ok", ATL1_STAT(soft_stats.deffer)}, {"tx_single_coll_ok", ATL1_STAT(soft_stats.scc)}, {"tx_multi_coll_ok", ATL1_STAT(soft_stats.mcc)}, - {"tx_underun", ATL1_STAT(soft_stats.tx_underun)}, + {"tx_underrun", ATL1_STAT(soft_stats.tx_underrun)}, {"tx_trunc", ATL1_STAT(soft_stats.tx_trunc)}, {"tx_pause", ATL1_STAT(soft_stats.tx_pause)}, {"rx_pause", ATL1_STAT(soft_stats.rx_pause)}, diff --git a/drivers/net/ethernet/atheros/atlx/atl1.h b/drivers/net/ethernet/atheros/atlx/atl1.h index 34a58cd846a0..eacff19ea05b 100644 --- a/drivers/net/ethernet/atheros/atlx/atl1.h +++ b/drivers/net/ethernet/atheros/atlx/atl1.h @@ -681,7 +681,7 @@ struct atl1_sft_stats { u64 scc; /* packets TX after a single collision */ u64 mcc; /* packets TX after multiple collisions */ u64 latecol; /* TX packets w/ late collisions */ - u64 tx_underun; /* TX packets aborted due to TX FIFO underrun + u64 tx_underrun; /* TX packets aborted due to TX FIFO underrun * or TRD FIFO underrun */ u64 tx_trunc; /* TX packets truncated due to size > MTU */ u64 rx_pause; /* num Pause packets received. */ diff --git a/drivers/net/ethernet/atheros/atlx/atl2.c b/drivers/net/ethernet/atheros/atlx/atl2.c index d99317b3d891..98da0fa27192 100644 --- a/drivers/net/ethernet/atheros/atlx/atl2.c +++ b/drivers/net/ethernet/atheros/atlx/atl2.c @@ -553,7 +553,7 @@ static void atl2_intr_tx(struct atl2_adapter *adapter) netdev->stats.tx_aborted_errors++; if (txs->late_col) netdev->stats.tx_window_errors++; - if (txs->underun) + if (txs->underrun) netdev->stats.tx_fifo_errors++; } while (1); diff --git a/drivers/net/ethernet/atheros/atlx/atl2.h b/drivers/net/ethernet/atheros/atlx/atl2.h index c64a6bdfa7ae..25ec84cb4853 100644 --- a/drivers/net/ethernet/atheros/atlx/atl2.h +++ b/drivers/net/ethernet/atheros/atlx/atl2.h @@ -260,7 +260,7 @@ struct tx_pkt_status { unsigned multi_col:1; unsigned late_col:1; unsigned abort_col:1; - unsigned underun:1; /* current packet is aborted + unsigned underrun:1; /* current packet is aborted * due to txram underrun */ unsigned:3; /* reserved */ unsigned update:1; /* always 1'b1 in tx_status_buf */ -- cgit v1.2.3 From ffbf9870dcf1342592a1a26f4cf70bda39046134 Mon Sep 17 00:00:00 2001 From: Ilias Apalodimas Date: Tue, 23 Apr 2019 09:01:41 +0300 Subject: net: socionext: replace napi_alloc_frag with the netdev variant on init The netdev variant is usable on any context since it disables interrupts. The napi variant of the call should only be used within softirq context. Replace napi_alloc_frag on driver init with the correct netdev_alloc_frag call Changes since v1: - Adjusted commit message Acked-by: Ard Biesheuvel Acked-by: Jassi Brar Fixes: 4acb20b46214 ("net: socionext: different approach on DMA") Signed-off-by: Ilias Apalodimas Signed-off-by: David S. Miller --- drivers/net/ethernet/socionext/netsec.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index a18149720aa2..cba5881b2746 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -673,7 +673,8 @@ static void netsec_process_tx(struct netsec_priv *priv) } static void *netsec_alloc_rx_data(struct netsec_priv *priv, - dma_addr_t *dma_handle, u16 *desc_len) + dma_addr_t *dma_handle, u16 *desc_len, + bool napi) { size_t total_len = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); size_t payload_len = NETSEC_RX_BUF_SZ; @@ -682,7 +683,7 @@ static void *netsec_alloc_rx_data(struct netsec_priv *priv, total_len += SKB_DATA_ALIGN(payload_len + NETSEC_SKB_PAD); - buf = napi_alloc_frag(total_len); + buf = napi ? napi_alloc_frag(total_len) : netdev_alloc_frag(total_len); if (!buf) return NULL; @@ -765,7 +766,8 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget) /* allocate a fresh buffer and map it to the hardware. * This will eventually replace the old buffer in the hardware */ - buf_addr = netsec_alloc_rx_data(priv, &dma_handle, &desc_len); + buf_addr = netsec_alloc_rx_data(priv, &dma_handle, &desc_len, + true); if (unlikely(!buf_addr)) break; @@ -1069,7 +1071,8 @@ static int netsec_setup_rx_dring(struct netsec_priv *priv) void *buf; u16 len; - buf = netsec_alloc_rx_data(priv, &dma_handle, &len); + buf = netsec_alloc_rx_data(priv, &dma_handle, &len, + false); if (!buf) { netsec_uninit_pkt_dring(priv, NETSEC_RING_RX); goto err_out; -- cgit v1.2.3 From 1c5c12ee308aacf635c8819cd4baa3bd58f8a8b7 Mon Sep 17 00:00:00 2001 From: Tao Ren Date: Wed, 24 Apr 2019 01:43:32 +0000 Subject: net/ncsi: handle overflow when incrementing mac address Previously BMC's MAC address is calculated by simply adding 1 to the last byte of network controller's MAC address, and it produces incorrect result when network controller's MAC address ends with 0xFF. The problem can be fixed by calling eth_addr_inc() function to increment MAC address; besides, the MAC address is also validated before assigning to BMC. Fixes: cb10c7c0dfd9 ("net/ncsi: Add NCSI Broadcom OEM command") Signed-off-by: Tao Ren Acked-by: Jakub Kicinski Acked-by: Samuel Mendoza-Jonas Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 12 ++++++++++++ net/ncsi/ncsi-rsp.c | 6 +++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index e2f3b21cd72a..aa8bfd6f738c 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -448,6 +448,18 @@ static inline void eth_addr_dec(u8 *addr) u64_to_ether_addr(u, addr); } +/** + * eth_addr_inc() - Increment the given MAC address. + * @addr: Pointer to a six-byte array containing Ethernet address to increment. + */ +static inline void eth_addr_inc(u8 *addr) +{ + u64 u = ether_addr_to_u64(addr); + + u++; + u64_to_ether_addr(u, addr); +} + /** * is_etherdev_addr - Tell if given Ethernet address belongs to the device. * @dev: Pointer to a device structure diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index dc07fcc7938e..802db01e3075 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -667,7 +668,10 @@ static int ncsi_rsp_handler_oem_bcm_gma(struct ncsi_request *nr) ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; memcpy(saddr.sa_data, &rsp->data[BCM_MAC_ADDR_OFFSET], ETH_ALEN); /* Increase mac address by 1 for BMC's address */ - saddr.sa_data[ETH_ALEN - 1]++; + eth_addr_inc((u8 *)saddr.sa_data); + if (!is_valid_ether_addr((const u8 *)saddr.sa_data)) + return -ENXIO; + ret = ops->ndo_set_mac_address(ndev, &saddr); if (ret < 0) netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n"); -- cgit v1.2.3 From 6819e3f6d83a24777813b0d031ebe0861694db5a Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 20 Apr 2019 12:09:39 +0800 Subject: net: vrf: Fix operation not supported when set vrf mac Vrf device is not able to change mac address now because lack of ndo_set_mac_address. Complete this in case some apps need to do this. Reported-by: Hui Wang Signed-off-by: Miaohe Lin Signed-off-by: David S. Miller --- drivers/net/vrf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index cd15c32b2e43..9ee4d7402ca2 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -875,6 +875,7 @@ static const struct net_device_ops vrf_netdev_ops = { .ndo_init = vrf_dev_init, .ndo_uninit = vrf_dev_uninit, .ndo_start_xmit = vrf_xmit, + .ndo_set_mac_address = eth_mac_addr, .ndo_get_stats64 = vrf_get_stats64, .ndo_add_slave = vrf_add_slave, .ndo_del_slave = vrf_del_slave, @@ -1274,6 +1275,7 @@ static void vrf_setup(struct net_device *dev) /* default to no qdisc; user can add if desired */ dev->priv_flags |= IFF_NO_QUEUE; dev->priv_flags |= IFF_NO_RX_HANDLER; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; /* VRF devices do not care about MTU, but if the MTU is set * too low then the ipv4 and ipv6 protocols are disabled -- cgit v1.2.3 From 4b9fc7146249a6e0e3175d0acc033fdcd2bfcb17 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Wed, 24 Apr 2019 02:56:42 -0400 Subject: net: rds: exchange of 8K and 1M pool Before the commit 490ea5967b0d ("RDS: IB: move FMR code to its own file"), when the dirty_count is greater than 9/10 of max_items of 8K pool, 1M pool is used, Vice versa. After the commit 490ea5967b0d ("RDS: IB: move FMR code to its own file"), the above is removed. When we make the following tests. Server: rds-stress -r 1.1.1.16 -D 1M Client: rds-stress -r 1.1.1.14 -s 1.1.1.16 -D 1M The following will appear. " connecting to 1.1.1.16:4000 negotiated options, tasks will start in 2 seconds Starting up..header from 1.1.1.166:4001 to id 4001 bogus .. tsks tx/s rx/s tx+rx K/s mbi K/s mbo K/s tx us/c rtt us cpu % 1 0 0 0.00 0.00 0.00 0.00 0.00 -1.00 1 0 0 0.00 0.00 0.00 0.00 0.00 -1.00 1 0 0 0.00 0.00 0.00 0.00 0.00 -1.00 1 0 0 0.00 0.00 0.00 0.00 0.00 -1.00 1 0 0 0.00 0.00 0.00 0.00 0.00 -1.00 ... " So this exchange between 8K and 1M pool is added back. Fixes: commit 490ea5967b0d ("RDS: IB: move FMR code to its own file") Signed-off-by: Zhu Yanjun Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_fmr.c | 11 +++++++++++ net/rds/ib_rdma.c | 3 --- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c index 31cf37da4510..93c0437e6a5f 100644 --- a/net/rds/ib_fmr.c +++ b/net/rds/ib_fmr.c @@ -44,6 +44,17 @@ struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages) else pool = rds_ibdev->mr_1m_pool; + if (atomic_read(&pool->dirty_count) >= pool->max_items / 10) + queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); + + /* Switch pools if one of the pool is reaching upper limit */ + if (atomic_read(&pool->dirty_count) >= pool->max_items * 9 / 10) { + if (pool->pool_type == RDS_IB_MR_8K_POOL) + pool = rds_ibdev->mr_1m_pool; + else + pool = rds_ibdev->mr_8k_pool; + } + ibmr = rds_ib_try_reuse_ibmr(pool); if (ibmr) return ibmr; diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 63c8d107adcf..d664e9ade74d 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -454,9 +454,6 @@ struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool) struct rds_ib_mr *ibmr = NULL; int iter = 0; - if (atomic_read(&pool->dirty_count) >= pool->max_items_soft / 10) - queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); - while (1) { ibmr = rds_ib_reuse_mr(pool); if (ibmr) -- cgit v1.2.3 From 032be5f19a94de51093851757089133dcc1e92aa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 Apr 2019 09:44:11 -0700 Subject: rxrpc: fix race condition in rxrpc_input_packet() After commit 5271953cad31 ("rxrpc: Use the UDP encap_rcv hook"), rxrpc_input_packet() is directly called from lockless UDP receive path, under rcu_read_lock() protection. It must therefore use RCU rules : - udp_sk->sk_user_data can be cleared at any point in this function. rcu_dereference_sk_user_data() is what we need here. - Also, since sk_user_data might have been set in rxrpc_open_socket() we must observe a proper RCU grace period before kfree(local) in rxrpc_lookup_local() v4: @local can be NULL in xrpc_lookup_local() as reported by kbuild test robot and Julia Lawall , thanks ! v3,v2 : addressed David Howells feedback, thanks ! syzbot reported : kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 19236 Comm: syz-executor703 Not tainted 5.1.0-rc6 #79 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__lock_acquire+0xbef/0x3fb0 kernel/locking/lockdep.c:3573 Code: 00 0f 85 a5 1f 00 00 48 81 c4 10 01 00 00 5b 41 5c 41 5d 41 5e 41 5f 5d c3 48 b8 00 00 00 00 00 fc ff df 4c 89 ea 48 c1 ea 03 <80> 3c 02 00 0f 85 4a 21 00 00 49 81 7d 00 20 54 9c 89 0f 84 cf f4 RSP: 0018:ffff88809d7aef58 EFLAGS: 00010002 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000026 RSI: 0000000000000000 RDI: 0000000000000001 RBP: ffff88809d7af090 R08: 0000000000000001 R09: 0000000000000001 R10: ffffed1015d05bc7 R11: ffff888089428600 R12: 0000000000000000 R13: 0000000000000130 R14: 0000000000000001 R15: 0000000000000001 FS: 00007f059044d700(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004b6040 CR3: 00000000955ca000 CR4: 00000000001406f0 Call Trace: lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:4211 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x95/0xcd kernel/locking/spinlock.c:152 skb_queue_tail+0x26/0x150 net/core/skbuff.c:2972 rxrpc_reject_packet net/rxrpc/input.c:1126 [inline] rxrpc_input_packet+0x4a0/0x5536 net/rxrpc/input.c:1414 udp_queue_rcv_one_skb+0xaf2/0x1780 net/ipv4/udp.c:2011 udp_queue_rcv_skb+0x128/0x730 net/ipv4/udp.c:2085 udp_unicast_rcv_skb.isra.0+0xb9/0x360 net/ipv4/udp.c:2245 __udp4_lib_rcv+0x701/0x2ca0 net/ipv4/udp.c:2301 udp_rcv+0x22/0x30 net/ipv4/udp.c:2482 ip_protocol_deliver_rcu+0x60/0x8f0 net/ipv4/ip_input.c:208 ip_local_deliver_finish+0x23b/0x390 net/ipv4/ip_input.c:234 NF_HOOK include/linux/netfilter.h:289 [inline] NF_HOOK include/linux/netfilter.h:283 [inline] ip_local_deliver+0x1e9/0x520 net/ipv4/ip_input.c:255 dst_input include/net/dst.h:450 [inline] ip_rcv_finish+0x1e1/0x300 net/ipv4/ip_input.c:413 NF_HOOK include/linux/netfilter.h:289 [inline] NF_HOOK include/linux/netfilter.h:283 [inline] ip_rcv+0xe8/0x3f0 net/ipv4/ip_input.c:523 __netif_receive_skb_one_core+0x115/0x1a0 net/core/dev.c:4987 __netif_receive_skb+0x2c/0x1c0 net/core/dev.c:5099 netif_receive_skb_internal+0x117/0x660 net/core/dev.c:5202 napi_frags_finish net/core/dev.c:5769 [inline] napi_gro_frags+0xade/0xd10 net/core/dev.c:5843 tun_get_user+0x2f24/0x3fb0 drivers/net/tun.c:1981 tun_chr_write_iter+0xbd/0x156 drivers/net/tun.c:2027 call_write_iter include/linux/fs.h:1866 [inline] do_iter_readv_writev+0x5e1/0x8e0 fs/read_write.c:681 do_iter_write fs/read_write.c:957 [inline] do_iter_write+0x184/0x610 fs/read_write.c:938 vfs_writev+0x1b3/0x2f0 fs/read_write.c:1002 do_writev+0x15e/0x370 fs/read_write.c:1037 __do_sys_writev fs/read_write.c:1110 [inline] __se_sys_writev fs/read_write.c:1107 [inline] __x64_sys_writev+0x75/0xb0 fs/read_write.c:1107 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Fixes: 5271953cad31 ("rxrpc: Use the UDP encap_rcv hook") Signed-off-by: Eric Dumazet Reported-by: syzbot Acked-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/input.c | 12 ++++++++---- net/rxrpc/local_object.c | 3 ++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 4c6f9d0a00e7..c2c35cf4e308 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1161,19 +1161,19 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) * handle data received on the local endpoint * - may be called in interrupt context * - * The socket is locked by the caller and this prevents the socket from being - * shut down and the local endpoint from going away, thus sk_user_data will not - * be cleared until this function returns. + * [!] Note that as this is called from the encap_rcv hook, the socket is not + * held locked by the caller and nothing prevents sk_user_data on the UDP from + * being cleared in the middle of processing this function. * * Called with the RCU read lock held from the IP layer via UDP. */ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) { + struct rxrpc_local *local = rcu_dereference_sk_user_data(udp_sk); struct rxrpc_connection *conn; struct rxrpc_channel *chan; struct rxrpc_call *call = NULL; struct rxrpc_skb_priv *sp; - struct rxrpc_local *local = udp_sk->sk_user_data; struct rxrpc_peer *peer = NULL; struct rxrpc_sock *rx = NULL; unsigned int channel; @@ -1181,6 +1181,10 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) _enter("%p", udp_sk); + if (unlikely(!local)) { + kfree_skb(skb); + return 0; + } if (skb->tstamp == 0) skb->tstamp = ktime_get_real(); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 15cf42d5b53a..01959db51445 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -304,7 +304,8 @@ nomem: ret = -ENOMEM; sock_error: mutex_unlock(&rxnet->local_mutex); - kfree(local); + if (local) + call_rcu(&local->rcu, rxrpc_local_rcu); _leave(" = %d", ret); return ERR_PTR(ret); -- cgit v1.2.3 From 0453c682459583910d611a96de928f4442205493 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 Apr 2019 05:35:00 -0700 Subject: net/rose: fix unbound loop in rose_loopback_timer() This patch adds a limit on the number of skbs that fuzzers can queue into loopback_queue. 1000 packets for rose loopback seems more than enough. Then, since we now have multiple cpus in most linux hosts, we also need to limit the number of skbs rose_loopback_timer() can dequeue at each round. rose_loopback_queue() can be drop-monitor friendly, calling consume_skb() or kfree_skb() appropriately. Finally, use mod_timer() instead of del_timer() + add_timer() syzbot report was : rcu: INFO: rcu_preempt self-detected stall on CPU rcu: 0-...!: (10499 ticks this GP) idle=536/1/0x4000000000000002 softirq=103291/103291 fqs=34 rcu: (t=10500 jiffies g=140321 q=323) rcu: rcu_preempt kthread starved for 10426 jiffies! g140321 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x402 ->cpu=1 rcu: RCU grace-period kthread stack dump: rcu_preempt I29168 10 2 0x80000000 Call Trace: context_switch kernel/sched/core.c:2877 [inline] __schedule+0x813/0x1cc0 kernel/sched/core.c:3518 schedule+0x92/0x180 kernel/sched/core.c:3562 schedule_timeout+0x4db/0xfd0 kernel/time/timer.c:1803 rcu_gp_fqs_loop kernel/rcu/tree.c:1971 [inline] rcu_gp_kthread+0x962/0x17b0 kernel/rcu/tree.c:2128 kthread+0x357/0x430 kernel/kthread.c:253 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:352 NMI backtrace for cpu 0 CPU: 0 PID: 7632 Comm: kworker/0:4 Not tainted 5.1.0-rc5+ #172 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events iterate_cleanup_work Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 nmi_cpu_backtrace.cold+0x63/0xa4 lib/nmi_backtrace.c:101 nmi_trigger_cpumask_backtrace+0x1be/0x236 lib/nmi_backtrace.c:62 arch_trigger_cpumask_backtrace+0x14/0x20 arch/x86/kernel/apic/hw_nmi.c:38 trigger_single_cpu_backtrace include/linux/nmi.h:164 [inline] rcu_dump_cpu_stacks+0x183/0x1cf kernel/rcu/tree.c:1223 print_cpu_stall kernel/rcu/tree.c:1360 [inline] check_cpu_stall kernel/rcu/tree.c:1434 [inline] rcu_pending kernel/rcu/tree.c:3103 [inline] rcu_sched_clock_irq.cold+0x500/0xa4a kernel/rcu/tree.c:2544 update_process_times+0x32/0x80 kernel/time/timer.c:1635 tick_sched_handle+0xa2/0x190 kernel/time/tick-sched.c:161 tick_sched_timer+0x47/0x130 kernel/time/tick-sched.c:1271 __run_hrtimer kernel/time/hrtimer.c:1389 [inline] __hrtimer_run_queues+0x33e/0xde0 kernel/time/hrtimer.c:1451 hrtimer_interrupt+0x314/0x770 kernel/time/hrtimer.c:1509 local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1035 [inline] smp_apic_timer_interrupt+0x120/0x570 arch/x86/kernel/apic/apic.c:1060 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:807 RIP: 0010:__sanitizer_cov_trace_pc+0x0/0x50 kernel/kcov.c:95 Code: 89 25 b4 6e ec 08 41 bc f4 ff ff ff e8 cd 5d ea ff 48 c7 05 9e 6e ec 08 00 00 00 00 e9 a4 e9 ff ff 90 90 90 90 90 90 90 90 90 <55> 48 89 e5 48 8b 75 08 65 48 8b 04 25 00 ee 01 00 65 8b 15 c8 60 RSP: 0018:ffff8880ae807ce0 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff13 RAX: ffff88806fd40640 RBX: dffffc0000000000 RCX: ffffffff863fbc56 RDX: 0000000000000100 RSI: ffffffff863fbc1d RDI: ffff88808cf94228 RBP: ffff8880ae807d10 R08: ffff88806fd40640 R09: ffffed1015d00f8b R10: ffffed1015d00f8a R11: 0000000000000003 R12: ffff88808cf941c0 R13: 00000000fffff034 R14: ffff8882166cd840 R15: 0000000000000000 rose_loopback_timer+0x30d/0x3f0 net/rose/rose_loopback.c:91 call_timer_fn+0x190/0x720 kernel/time/timer.c:1325 expire_timers kernel/time/timer.c:1362 [inline] __run_timers kernel/time/timer.c:1681 [inline] __run_timers kernel/time/timer.c:1649 [inline] run_timer_softirq+0x652/0x1700 kernel/time/timer.c:1694 __do_softirq+0x266/0x95a kernel/softirq.c:293 do_softirq_own_stack+0x2a/0x40 arch/x86/entry/entry_64.S:1027 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/rose/rose_loopback.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c index 7af4f99c4a93..094a6621f8e8 100644 --- a/net/rose/rose_loopback.c +++ b/net/rose/rose_loopback.c @@ -16,6 +16,7 @@ #include static struct sk_buff_head loopback_queue; +#define ROSE_LOOPBACK_LIMIT 1000 static struct timer_list loopback_timer; static void rose_set_loopback_timer(void); @@ -35,29 +36,27 @@ static int rose_loopback_running(void) int rose_loopback_queue(struct sk_buff *skb, struct rose_neigh *neigh) { - struct sk_buff *skbn; + struct sk_buff *skbn = NULL; - skbn = skb_clone(skb, GFP_ATOMIC); + if (skb_queue_len(&loopback_queue) < ROSE_LOOPBACK_LIMIT) + skbn = skb_clone(skb, GFP_ATOMIC); - kfree_skb(skb); - - if (skbn != NULL) { + if (skbn) { + consume_skb(skb); skb_queue_tail(&loopback_queue, skbn); if (!rose_loopback_running()) rose_set_loopback_timer(); + } else { + kfree_skb(skb); } return 1; } - static void rose_set_loopback_timer(void) { - del_timer(&loopback_timer); - - loopback_timer.expires = jiffies + 10; - add_timer(&loopback_timer); + mod_timer(&loopback_timer, jiffies + 10); } static void rose_loopback_timer(struct timer_list *unused) @@ -68,8 +67,12 @@ static void rose_loopback_timer(struct timer_list *unused) struct sock *sk; unsigned short frametype; unsigned int lci_i, lci_o; + int count; - while ((skb = skb_dequeue(&loopback_queue)) != NULL) { + for (count = 0; count < ROSE_LOOPBACK_LIMIT; count++) { + skb = skb_dequeue(&loopback_queue); + if (!skb) + return; if (skb->len < ROSE_MIN_LEN) { kfree_skb(skb); continue; @@ -106,6 +109,8 @@ static void rose_loopback_timer(struct timer_list *unused) kfree_skb(skb); } } + if (!skb_queue_empty(&loopback_queue)) + mod_timer(&loopback_timer, jiffies + 1); } void __exit rose_loopback_clear(void) -- cgit v1.2.3 From 20ff83f10f113c88d0bb74589389b05250994c16 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 Apr 2019 08:04:05 -0700 Subject: ipv4: add sanity checks in ipv4_link_failure() Before calling __ip_options_compile(), we need to ensure the network header is a an IPv4 one, and that it is already pulled in skb->head. RAW sockets going through a tunnel can end up calling ipv4_link_failure() with total garbage in the skb, or arbitrary lengthes. syzbot report : BUG: KASAN: stack-out-of-bounds in memcpy include/linux/string.h:355 [inline] BUG: KASAN: stack-out-of-bounds in __ip_options_echo+0x294/0x1120 net/ipv4/ip_options.c:123 Write of size 69 at addr ffff888096abf068 by task syz-executor.4/9204 CPU: 0 PID: 9204 Comm: syz-executor.4 Not tainted 5.1.0-rc5+ #77 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 print_address_description.cold+0x7c/0x20d mm/kasan/report.c:187 kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317 check_memory_region_inline mm/kasan/generic.c:185 [inline] check_memory_region+0x123/0x190 mm/kasan/generic.c:191 memcpy+0x38/0x50 mm/kasan/common.c:133 memcpy include/linux/string.h:355 [inline] __ip_options_echo+0x294/0x1120 net/ipv4/ip_options.c:123 __icmp_send+0x725/0x1400 net/ipv4/icmp.c:695 ipv4_link_failure+0x29f/0x550 net/ipv4/route.c:1204 dst_link_failure include/net/dst.h:427 [inline] vti6_xmit net/ipv6/ip6_vti.c:514 [inline] vti6_tnl_xmit+0x10d4/0x1c0c net/ipv6/ip6_vti.c:553 __netdev_start_xmit include/linux/netdevice.h:4414 [inline] netdev_start_xmit include/linux/netdevice.h:4423 [inline] xmit_one net/core/dev.c:3292 [inline] dev_hard_start_xmit+0x1b2/0x980 net/core/dev.c:3308 __dev_queue_xmit+0x271d/0x3060 net/core/dev.c:3878 dev_queue_xmit+0x18/0x20 net/core/dev.c:3911 neigh_direct_output+0x16/0x20 net/core/neighbour.c:1527 neigh_output include/net/neighbour.h:508 [inline] ip_finish_output2+0x949/0x1740 net/ipv4/ip_output.c:229 ip_finish_output+0x73c/0xd50 net/ipv4/ip_output.c:317 NF_HOOK_COND include/linux/netfilter.h:278 [inline] ip_output+0x21f/0x670 net/ipv4/ip_output.c:405 dst_output include/net/dst.h:444 [inline] NF_HOOK include/linux/netfilter.h:289 [inline] raw_send_hdrinc net/ipv4/raw.c:432 [inline] raw_sendmsg+0x1d2b/0x2f20 net/ipv4/raw.c:663 inet_sendmsg+0x147/0x5d0 net/ipv4/af_inet.c:798 sock_sendmsg_nosec net/socket.c:651 [inline] sock_sendmsg+0xdd/0x130 net/socket.c:661 sock_write_iter+0x27c/0x3e0 net/socket.c:988 call_write_iter include/linux/fs.h:1866 [inline] new_sync_write+0x4c7/0x760 fs/read_write.c:474 __vfs_write+0xe4/0x110 fs/read_write.c:487 vfs_write+0x20c/0x580 fs/read_write.c:549 ksys_write+0x14f/0x2d0 fs/read_write.c:599 __do_sys_write fs/read_write.c:611 [inline] __se_sys_write fs/read_write.c:608 [inline] __x64_sys_write+0x73/0xb0 fs/read_write.c:608 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x458c29 Code: ad b8 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 7b b8 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f293b44bc78 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000458c29 RDX: 0000000000000014 RSI: 00000000200002c0 RDI: 0000000000000003 RBP: 000000000073bf00 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f293b44c6d4 R13: 00000000004c8623 R14: 00000000004ded68 R15: 00000000ffffffff The buggy address belongs to the page: page:ffffea00025aafc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0 flags: 0x1fffc0000000000() raw: 01fffc0000000000 0000000000000000 ffffffff025a0101 0000000000000000 raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888096abef80: 00 00 00 f2 f2 f2 f2 f2 00 00 00 00 00 00 00 f2 ffff888096abf000: f2 f2 f2 f2 00 00 00 00 00 00 00 00 00 00 00 00 >ffff888096abf080: 00 00 f3 f3 f3 f3 00 00 00 00 00 00 00 00 00 00 ^ ffff888096abf100: 00 00 00 00 f1 f1 f1 f1 00 00 f3 f3 00 00 00 00 ffff888096abf180: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 Fixes: ed0de45a1008 ("ipv4: recompile ip options in ipv4_link_failure") Signed-off-by: Eric Dumazet Cc: Stephen Suryaputra Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/route.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 88ce038dd495..6fdf1c195d8e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1183,25 +1183,39 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) return dst; } -static void ipv4_link_failure(struct sk_buff *skb) +static void ipv4_send_dest_unreach(struct sk_buff *skb) { struct ip_options opt; - struct rtable *rt; int res; /* Recompile ip options since IPCB may not be valid anymore. + * Also check we have a reasonable ipv4 header. */ - memset(&opt, 0, sizeof(opt)); - opt.optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr); + if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) || + ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5) + return; - rcu_read_lock(); - res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL); - rcu_read_unlock(); + memset(&opt, 0, sizeof(opt)); + if (ip_hdr(skb)->ihl > 5) { + if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4)) + return; + opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); - if (res) - return; + rcu_read_lock(); + res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL); + rcu_read_unlock(); + if (res) + return; + } __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt); +} + +static void ipv4_link_failure(struct sk_buff *skb) +{ + struct rtable *rt; + + ipv4_send_dest_unreach(skb); rt = skb_rtable(skb); if (rt) -- cgit v1.2.3