From 988cf74deb45bd6ee27433b7b5d1be6004d842b8 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 3 Jul 2017 07:07:18 -0700
Subject: inet: Stop generating UFO packets.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_output.c | 76 ---------------------------------------------------
 1 file changed, 76 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 1422d6c08377..c6ec06465ce0 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1114,69 +1114,6 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
 }
 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
 
-static inline int ip6_ufo_append_data(struct sock *sk,
-			struct sk_buff_head *queue,
-			int getfrag(void *from, char *to, int offset, int len,
-			int odd, struct sk_buff *skb),
-			void *from, int length, int hh_len, int fragheaderlen,
-			int exthdrlen, int transhdrlen, int mtu,
-			unsigned int flags, const struct flowi6 *fl6)
-
-{
-	struct sk_buff *skb;
-	int err;
-
-	/* There is support for UDP large send offload by network
-	 * device, so create one single skb packet containing complete
-	 * udp datagram
-	 */
-	skb = skb_peek_tail(queue);
-	if (!skb) {
-		skb = sock_alloc_send_skb(sk,
-			hh_len + fragheaderlen + transhdrlen + 20,
-			(flags & MSG_DONTWAIT), &err);
-		if (!skb)
-			return err;
-
-		/* reserve space for Hardware header */
-		skb_reserve(skb, hh_len);
-
-		/* create space for UDP/IP header */
-		skb_put(skb, fragheaderlen + transhdrlen);
-
-		/* initialize network header pointer */
-		skb_set_network_header(skb, exthdrlen);
-
-		/* initialize protocol header pointer */
-		skb->transport_header = skb->network_header + fragheaderlen;
-
-		skb->protocol = htons(ETH_P_IPV6);
-		skb->csum = 0;
-
-		if (flags & MSG_CONFIRM)
-			skb_set_dst_pending_confirm(skb, 1);
-
-		__skb_queue_tail(queue, skb);
-	} else if (skb_is_gso(skb)) {
-		goto append;
-	}
-
-	skb->ip_summed = CHECKSUM_PARTIAL;
-	/* Specify the length of each IPv6 datagram fragment.
-	 * It has to be a multiple of 8.
-	 */
-	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
-				     sizeof(struct frag_hdr)) & ~7;
-	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
-	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
-							 &fl6->daddr,
-							 &fl6->saddr);
-
-append:
-	return skb_append_datato_frags(sk, skb, getfrag, from,
-				       (length - transhdrlen));
-}
-
 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
 					       gfp_t gfp)
 {
@@ -1385,19 +1322,6 @@ emsgsize:
 	 */
 
 	cork->length += length;
-	if ((((length + (skb ? skb->len : headersize)) > mtu) ||
-	     (skb && skb_is_gso(skb))) &&
-	    (sk->sk_protocol == IPPROTO_UDP) &&
-	    (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
-	    (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
-		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
-					  hh_len, fragheaderlen, exthdrlen,
-					  transhdrlen, mtu, flags, fl6);
-		if (err)
-			goto error;
-		return 0;
-	}
-
 	if (!skb)
 		goto alloc_new_skb;
 
-- 
cgit v1.2.3


From 6800b2e040edda01f593aba28203c2ebf1679f4c Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 7 Jul 2017 10:30:55 +0100
Subject: inet: Remove software UFO fragmenting code.

Rename udp{4,6}_ufo_fragment() to udp{4,6}_tunnel_segment() and only
handle tunnel segmentation.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/udp_offload.c | 100 ++-----------------------------------------------
 1 file changed, 3 insertions(+), 97 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index a2267f80febb..455fd4e39333 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -17,109 +17,15 @@
 #include <net/ip6_checksum.h>
 #include "ip6_offload.h"
 
-static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
-					 netdev_features_t features)
+static struct sk_buff *udp6_tunnel_segment(struct sk_buff *skb,
+					   netdev_features_t features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
-	unsigned int mss;
-	unsigned int unfrag_ip6hlen, unfrag_len;
-	struct frag_hdr *fptr;
-	u8 *packet_start, *prevhdr;
-	u8 nexthdr;
-	u8 frag_hdr_sz = sizeof(struct frag_hdr);
-	__wsum csum;
-	int tnl_hlen;
-	int err;
-
-	mss = skb_shinfo(skb)->gso_size;
-	if (unlikely(skb->len <= mss))
-		goto out;
-
-	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
-		/* Packet is from an untrusted source, reset gso_segs. */
-
-		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
-
-		/* Set the IPv6 fragment id if not set yet */
-		if (!skb_shinfo(skb)->ip6_frag_id)
-			ipv6_proxy_select_ident(dev_net(skb->dev), skb);
-
-		segs = NULL;
-		goto out;
-	}
 
 	if (skb->encapsulation && skb_shinfo(skb)->gso_type &
 	    (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))
 		segs = skb_udp_tunnel_segment(skb, features, true);
-	else {
-		const struct ipv6hdr *ipv6h;
-		struct udphdr *uh;
-
-		if (!pskb_may_pull(skb, sizeof(struct udphdr)))
-			goto out;
-
-		/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
-		 * do checksum of UDP packets sent as multiple IP fragments.
-		 */
-
-		uh = udp_hdr(skb);
-		ipv6h = ipv6_hdr(skb);
-
-		uh->check = 0;
-		csum = skb_checksum(skb, 0, skb->len, 0);
-		uh->check = udp_v6_check(skb->len, &ipv6h->saddr,
-					  &ipv6h->daddr, csum);
-		if (uh->check == 0)
-			uh->check = CSUM_MANGLED_0;
-
-		skb->ip_summed = CHECKSUM_NONE;
-
-		/* If there is no outer header we can fake a checksum offload
-		 * due to the fact that we have already done the checksum in
-		 * software prior to segmenting the frame.
-		 */
-		if (!skb->encap_hdr_csum)
-			features |= NETIF_F_HW_CSUM;
-
-		/* Check if there is enough headroom to insert fragment header. */
-		tnl_hlen = skb_tnl_header_len(skb);
-		if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) {
-			if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz))
-				goto out;
-		}
-
-		/* Find the unfragmentable header and shift it left by frag_hdr_sz
-		 * bytes to insert fragment header.
-		 */
-		err = ip6_find_1stfragopt(skb, &prevhdr);
-		if (err < 0)
-			return ERR_PTR(err);
-		unfrag_ip6hlen = err;
-		nexthdr = *prevhdr;
-		*prevhdr = NEXTHDR_FRAGMENT;
-		unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) +
-			     unfrag_ip6hlen + tnl_hlen;
-		packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset;
-		memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len);
-
-		SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz;
-		skb->mac_header -= frag_hdr_sz;
-		skb->network_header -= frag_hdr_sz;
-
-		fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen);
-		fptr->nexthdr = nexthdr;
-		fptr->reserved = 0;
-		if (!skb_shinfo(skb)->ip6_frag_id)
-			ipv6_proxy_select_ident(dev_net(skb->dev), skb);
-		fptr->identification = skb_shinfo(skb)->ip6_frag_id;
-
-		/* Fragment the skb. ipv6 header and the remaining fields of the
-		 * fragment header are updated in ipv6_gso_segment()
-		 */
-		segs = skb_segment(skb, features);
-	}
 
-out:
 	return segs;
 }
 
@@ -169,7 +75,7 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff)
 
 static const struct net_offload udpv6_offload = {
 	.callbacks = {
-		.gso_segment	=	udp6_ufo_fragment,
+		.gso_segment	=	udp6_tunnel_segment,
 		.gro_receive	=	udp6_gro_receive,
 		.gro_complete	=	udp6_gro_complete,
 	},
-- 
cgit v1.2.3


From 6b1c42e9726bdb00370342909d95efdc331d10ac Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 17 Jul 2017 13:57:19 +0200
Subject: vti: revert flush x-netns xfrm cache when vti interface is removed

flow cache is removed in next commit.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_vti.c | 31 -------------------------------
 1 file changed, 31 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 486c2305f53c..79444a4bfd6d 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -1145,33 +1145,6 @@ static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = {
 	.priority	=	100,
 };
 
-static bool is_vti6_tunnel(const struct net_device *dev)
-{
-	return dev->netdev_ops == &vti6_netdev_ops;
-}
-
-static int vti6_device_event(struct notifier_block *unused,
-			     unsigned long event, void *ptr)
-{
-	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-	struct ip6_tnl *t = netdev_priv(dev);
-
-	if (!is_vti6_tunnel(dev))
-		return NOTIFY_DONE;
-
-	switch (event) {
-	case NETDEV_DOWN:
-		if (!net_eq(t->net, dev_net(dev)))
-			xfrm_garbage_collect(t->net);
-		break;
-	}
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block vti6_notifier_block __read_mostly = {
-	.notifier_call = vti6_device_event,
-};
-
 /**
  * vti6_tunnel_init - register protocol and reserve needed resources
  *
@@ -1182,8 +1155,6 @@ static int __init vti6_tunnel_init(void)
 	const char *msg;
 	int err;
 
-	register_netdevice_notifier(&vti6_notifier_block);
-
 	msg = "tunnel device";
 	err = register_pernet_device(&vti6_net_ops);
 	if (err < 0)
@@ -1216,7 +1187,6 @@ xfrm_proto_ah_failed:
 xfrm_proto_esp_failed:
 	unregister_pernet_device(&vti6_net_ops);
 pernet_dev_failed:
-	unregister_netdevice_notifier(&vti6_notifier_block);
 	pr_err("vti6 init: failed to register %s\n", msg);
 	return err;
 }
@@ -1231,7 +1201,6 @@ static void __exit vti6_tunnel_cleanup(void)
 	xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH);
 	xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP);
 	unregister_pernet_device(&vti6_net_ops);
-	unregister_netdevice_notifier(&vti6_notifier_block);
 }
 
 module_init(vti6_tunnel_init);
-- 
cgit v1.2.3


From 3c2a89ddc11896cf5498115c0380ab54b1c424b7 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 17 Jul 2017 13:57:20 +0200
Subject: net: xfrm: revert to lower xfrm dst gc limit

revert c386578f1cdb4dac230395 ("xfrm: Let the flowcache handle its size by default.").

Once we remove flow cache, we don't have a flow cache limit anymore.
We must not allow (virtually) unlimited allocations of xfrm dst entries.
Revert back to the old xfrm dst gc limits.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/xfrm6_policy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 79651bc71bf0..ae30dc4973e8 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -286,7 +286,7 @@ static struct dst_ops xfrm6_dst_ops_template = {
 	.destroy =		xfrm6_dst_destroy,
 	.ifdown =		xfrm6_dst_ifdown,
 	.local_out =		__ip6_local_out,
-	.gc_thresh =		INT_MAX,
+	.gc_thresh =		32768,
 };
 
 static const struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
-- 
cgit v1.2.3


From 09c7570480f7544ffbf8e6db365208b0b0c154c6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 17 Jul 2017 13:57:26 +0200
Subject: xfrm: remove flow cache

After rcu conversions performance degradation in forward tests isn't that
noticeable anymore.

See next patch for some numbers.

A followup patch could then also remove genid from the policies
as we do not cache bundles anymore.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/xfrm6_policy.c | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index ae30dc4973e8..f44b25a48478 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -214,14 +214,6 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
 	}
 }
 
-static inline int xfrm6_garbage_collect(struct dst_ops *ops)
-{
-	struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops);
-
-	xfrm_garbage_collect_deferred(net);
-	return dst_entries_get_fast(ops) > ops->gc_thresh * 2;
-}
-
 static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk,
 			      struct sk_buff *skb, u32 mtu)
 {
@@ -279,7 +271,6 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 
 static struct dst_ops xfrm6_dst_ops_template = {
 	.family =		AF_INET6,
-	.gc =			xfrm6_garbage_collect,
 	.update_pmtu =		xfrm6_update_pmtu,
 	.redirect =		xfrm6_redirect,
 	.cow_metrics =		dst_cow_metrics_generic,
-- 
cgit v1.2.3


From e42e24c3cc072088756d84ef07b492ac2a3ae2e5 Mon Sep 17 00:00:00 2001
From: Matvejchikov Ilya <matvejchikov@gmail.com>
Date: Mon, 24 Jul 2017 16:02:12 +0400
Subject: tcp: remove redundant argument from tcp_rcv_established()

The last (4th) argument of tcp_rcv_established() is redundant as it
always equals skb->len and the skb itself is always passed as the 2nd
argument. There is no reason to have it.

Signed-off-by: Ilya V. Matveychikov <matvejchikov@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2521690d62d6..90a32576c3d0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1296,7 +1296,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 			}
 		}
 
-		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
+		tcp_rcv_established(sk, skb, tcp_hdr(skb));
 		if (opt_skb)
 			goto ipv6_pktoptions;
 		return 0;
-- 
cgit v1.2.3


From 3a3a4e3054137c5ff5d4d306ec834f6d25d7f95b Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia.lawall@lip6.fr>
Date: Fri, 28 Jul 2017 22:18:57 +0200
Subject: ipv6: constify inet6_protocol structures

The inet6_protocol structure is only passed as the first argument to
inet6_add_protocol or inet6_del_protocol, both of which are declared as
const.  Thus the inet6_protocol structure itself can be const.

Also drop __read_mostly where present on the newly const structures.

Done with the help of Coccinelle.

Signed-off-by: Julia Lawall <Julia.Lawall@lip6.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c  | 2 +-
 net/ipv6/tcp_ipv6.c | 2 +-
 net/ipv6/udp.c      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 67ff2aaf5dcb..33865d67bcb4 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1080,7 +1080,7 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev)
 }
 
 
-static struct inet6_protocol ip6gre_protocol __read_mostly = {
+static const struct inet6_protocol ip6gre_protocol = {
 	.handler     = gre_rcv,
 	.err_handler = ip6gre_err,
 	.flags       = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 90a32576c3d0..2968a33cca7d 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1945,7 +1945,7 @@ struct proto tcpv6_prot = {
 	.diag_destroy		= tcp_abort,
 };
 
-static struct inet6_protocol tcpv6_protocol = {
+static const struct inet6_protocol tcpv6_protocol = {
 	.early_demux	=	tcp_v6_early_demux,
 	.early_demux_handler =  tcp_v6_early_demux,
 	.handler	=	tcp_v6_rcv,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 4a3e65626e8b..5f8b8d766c63 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1448,7 +1448,7 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
 }
 #endif
 
-static struct inet6_protocol udpv6_protocol = {
+static const struct inet6_protocol udpv6_protocol = {
 	.early_demux	=	udp_v6_early_demux,
 	.early_demux_handler =  udp_v6_early_demux,
 	.handler	=	udpv6_rcv,
-- 
cgit v1.2.3


From f347ec852c7a83e1803192d2c1fce4e42e0715a5 Mon Sep 17 00:00:00 2001
From: "Pablo M. Bermudo Garay" <pablombg@gmail.com>
Date: Fri, 21 Jul 2017 01:54:37 +0200
Subject: netfilter: nf_tables: fib: use skb_header_pointer

This is a preparatory patch for adding fib support to the netdev family.

The netdev family receives the packets from ingress hook. At this point
we have no guarantee that the ip header is linear. So this patch
replaces ip_hdr with skb_header_pointer in order to address that
possible situation.

Signed-off-by: Pablo M. Bermudo Garay <pablombg@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nft_fib_ipv6.c | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c
index 43f91d9b086c..54b5899543ef 100644
--- a/net/ipv6/netfilter/nft_fib_ipv6.c
+++ b/net/ipv6/netfilter/nft_fib_ipv6.c
@@ -25,9 +25,9 @@ static int get_ifindex(const struct net_device *dev)
 
 static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv,
 			       const struct nft_pktinfo *pkt,
-			       const struct net_device *dev)
+			       const struct net_device *dev,
+			       struct ipv6hdr *iph)
 {
-	const struct ipv6hdr *iph = ipv6_hdr(pkt->skb);
 	int lookup_flags = 0;
 
 	if (priv->flags & NFTA_FIB_F_DADDR) {
@@ -55,7 +55,8 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv,
 }
 
 static u32 __nft_fib6_eval_type(const struct nft_fib *priv,
-				const struct nft_pktinfo *pkt)
+				const struct nft_pktinfo *pkt,
+				struct ipv6hdr *iph)
 {
 	const struct net_device *dev = NULL;
 	const struct nf_ipv6_ops *v6ops;
@@ -77,7 +78,7 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv,
 	else if (priv->flags & NFTA_FIB_F_OIF)
 		dev = nft_out(pkt);
 
-	nft_fib6_flowi_init(&fl6, priv, pkt, dev);
+	nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph);
 
 	v6ops = nf_get_ipv6_ops();
 	if (dev && v6ops && v6ops->chk_addr(nft_net(pkt), &fl6.daddr, dev, true))
@@ -131,9 +132,17 @@ void nft_fib6_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
 			const struct nft_pktinfo *pkt)
 {
 	const struct nft_fib *priv = nft_expr_priv(expr);
+	int noff = skb_network_offset(pkt->skb);
 	u32 *dest = &regs->data[priv->dreg];
+	struct ipv6hdr *iph, _iph;
 
-	*dest = __nft_fib6_eval_type(priv, pkt);
+	iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph);
+	if (!iph) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+
+	*dest = __nft_fib6_eval_type(priv, pkt, iph);
 }
 EXPORT_SYMBOL_GPL(nft_fib6_eval_type);
 
@@ -141,8 +150,10 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
 		   const struct nft_pktinfo *pkt)
 {
 	const struct nft_fib *priv = nft_expr_priv(expr);
+	int noff = skb_network_offset(pkt->skb);
 	const struct net_device *oif = NULL;
 	u32 *dest = &regs->data[priv->dreg];
+	struct ipv6hdr *iph, _iph;
 	struct flowi6 fl6 = {
 		.flowi6_iif = LOOPBACK_IFINDEX,
 		.flowi6_proto = pkt->tprot,
@@ -155,7 +166,13 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
 	else if (priv->flags & NFTA_FIB_F_OIF)
 		oif = nft_out(pkt);
 
-	lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif);
+	iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph);
+	if (!iph) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+
+	lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph);
 
 	if (nft_hook(pkt) == NF_INET_PRE_ROUTING &&
 	    nft_fib_is_loopback(pkt->skb, nft_in(pkt))) {
-- 
cgit v1.2.3


From 591bb2789bc2a93f379b13d277f441f1b427102d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 26 Jul 2017 11:40:52 +0200
Subject: netfilter: nf_hook_ops structs can be const

We no longer place these on a list so they can be const.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/ila/ila_xlat.c                        | 2 +-
 net/ipv6/netfilter/ip6t_SYNPROXY.c             | 2 +-
 net/ipv6/netfilter/ip6table_nat.c              | 2 +-
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 2 +-
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c      | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 77f7f8c7d93d..5bd419c1abc8 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -208,7 +208,7 @@ ila_nf_input(void *priv,
 	return NF_ACCEPT;
 }
 
-static struct nf_hook_ops ila_nf_hook_ops[] __read_mostly = {
+static const struct nf_hook_ops ila_nf_hook_ops[] = {
 	{
 		.hook = ila_nf_input,
 		.pf = NFPROTO_IPV6,
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index ce203dd729e0..a5cd43d75393 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -438,7 +438,7 @@ static unsigned int ipv6_synproxy_hook(void *priv,
 	return NF_ACCEPT;
 }
 
-static struct nf_hook_ops ipv6_synproxy_ops[] __read_mostly = {
+static const struct nf_hook_ops ipv6_synproxy_ops[] = {
 	{
 		.hook		= ipv6_synproxy_hook,
 		.pf		= NFPROTO_IPV6,
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index 7d2bd940291f..991512576c8c 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -69,7 +69,7 @@ static unsigned int ip6table_nat_local_fn(void *priv,
 	return nf_nat_ipv6_local_fn(priv, skb, state, ip6table_nat_do_chain);
 }
 
-static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = {
+static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
 	/* Before packet filtering, change destination */
 	{
 		.hook		= ip6table_nat_in,
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 4e3402486833..f2d2f4a9294b 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -191,7 +191,7 @@ static unsigned int ipv6_conntrack_local(void *priv,
 	return nf_conntrack_in(state->net, PF_INET6, state->hook, skb);
 }
 
-static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = {
+static const struct nf_hook_ops ipv6_conntrack_ops[] = {
 	{
 		.hook		= ipv6_conntrack_in,
 		.pf		= NFPROTO_IPV6,
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index ada60d1a991b..b326da59257f 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -74,7 +74,7 @@ static unsigned int ipv6_defrag(void *priv,
 	return err == 0 ? NF_ACCEPT : NF_DROP;
 }
 
-static struct nf_hook_ops ipv6_defrag_ops[] = {
+static const struct nf_hook_ops ipv6_defrag_ops[] = {
 	{
 		.hook		= ipv6_defrag,
 		.pf		= NFPROTO_IPV6,
-- 
cgit v1.2.3


From 4d3a57f23dec59f0a2362e63540b2d01b37afe0a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 28 Jul 2017 11:22:04 +0200
Subject: netfilter: conntrack: do not enable connection tracking unless needed

Discussion during NFWS 2017 in Faro has shown that the current
conntrack behaviour is unreasonable.

Even if conntrack module is loaded on behalf of a single net namespace,
it's turned on for all namespaces, which is expensive.  Commit
481fa373476 ("netfilter: conntrack: add nf_conntrack_default_on sysctl")
attempted to provide an alternative to the 'default on' behaviour by
adding a sysctl to change it.

However, as Eric points out, the sysctl only becomes available
once the module is loaded, and then it's too late.

So we either have to move the sysctl to the core, or, alternatively,
change conntrack to become active only once the rule set requires this.

This does the latter, conntrack is only enabled when a rule needs it.

Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index f2d2f4a9294b..ddef5ee9e0a8 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -398,25 +398,12 @@ static struct nf_conntrack_l4proto *builtin_l4proto6[] = {
 
 static int ipv6_net_init(struct net *net)
 {
-	int ret = 0;
-
-	ret = nf_ct_l4proto_pernet_register(net, builtin_l4proto6,
-					    ARRAY_SIZE(builtin_l4proto6));
-	if (ret < 0)
-		return ret;
-
-	ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv6);
-	if (ret < 0) {
-		pr_err("nf_conntrack_ipv6: pernet registration failed.\n");
-		nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6,
-						ARRAY_SIZE(builtin_l4proto6));
-	}
-	return ret;
+	return nf_ct_l4proto_pernet_register(net, builtin_l4proto6,
+					     ARRAY_SIZE(builtin_l4proto6));
 }
 
 static void ipv6_net_exit(struct net *net)
 {
-	nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv6);
 	nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6,
 					ARRAY_SIZE(builtin_l4proto6));
 }
-- 
cgit v1.2.3


From e7942d0633c47c791ece6afa038be9cf977226de Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sun, 30 Jul 2017 03:57:18 +0200
Subject: tcp: remove prequeue support

prequeue is a tcp receive optimization that moves part of rx processing
from bh to process context.

This only works if the socket being processed belongs to a process that
is blocked in recv on that socket.

In practice, this doesn't happen anymore that often because nowadays
servers tend to use an event driven (epoll) model.

Even normal client applications (web browsers) commonly use many tcp
connections in parallel.

This has measurable impact only in netperf (which uses plain recv and
thus allows prequeue use) from host to locally running vm (~4%), however,
there were no changes when using netperf between two physical hosts with
ixgbe interfaces.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2968a33cca7d..39ee8e7fc4bd 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1505,8 +1505,7 @@ process:
 	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
-		if (!tcp_prequeue(sk, skb))
-			ret = tcp_v6_do_rcv(sk, skb);
+		ret = tcp_v6_do_rcv(sk, skb);
 	} else if (tcp_add_backlog(sk, skb)) {
 		goto discard_and_relse;
 	}
-- 
cgit v1.2.3


From 1f139ed9ec40521b4497aa34e107d38bb082b0e0 Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jkbs@redhat.com>
Date: Mon, 31 Jul 2017 10:09:41 +0200
Subject: ipv6: Avoid going through ->sk_net to access the netns

There is no need to go through sk->sk_net to access the net namespace
and its sysctl variables because we allocate the sock and initialize
sk_net just a few lines earlier in the same routine.

Signed-off-by: Jakub Sitnicki <jkbs@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/af_inet6.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index a88b5b5b7955..0a7c74049a0c 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -210,7 +210,7 @@ lookup_protocol:
 	np->mcast_hops	= IPV6_DEFAULT_MCASTHOPS;
 	np->mc_loop	= 1;
 	np->pmtudisc	= IPV6_PMTUDISC_WANT;
-	np->autoflowlabel = ip6_default_np_autolabel(sock_net(sk));
+	np->autoflowlabel = ip6_default_np_autolabel(net);
 	sk->sk_ipv6only	= net->ipv6.sysctl.bindv6only;
 
 	/* Init the ipv4 part of the socket since we can have sockets
-- 
cgit v1.2.3


From 39294c3df2a8dc5ec64513633446511f7e842acd Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia.lawall@lip6.fr>
Date: Tue, 1 Aug 2017 18:27:28 +0200
Subject: Revert "ipv6: constify inet6_protocol structures"

This reverts commit 3a3a4e3054137c5ff5d4d306ec834f6d25d7f95b.

inet6_add_protocol and inet6_del_protocol include casts that remove the
effect of the const annotation on their parameter, leading to possible
runtime crashes.

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Julia Lawall <Julia.Lawall@lip6.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c  | 2 +-
 net/ipv6/tcp_ipv6.c | 2 +-
 net/ipv6/udp.c      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 33865d67bcb4..67ff2aaf5dcb 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1080,7 +1080,7 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev)
 }
 
 
-static const struct inet6_protocol ip6gre_protocol = {
+static struct inet6_protocol ip6gre_protocol __read_mostly = {
 	.handler     = gre_rcv,
 	.err_handler = ip6gre_err,
 	.flags       = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 39ee8e7fc4bd..ced5dcf37465 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1944,7 +1944,7 @@ struct proto tcpv6_prot = {
 	.diag_destroy		= tcp_abort,
 };
 
-static const struct inet6_protocol tcpv6_protocol = {
+static struct inet6_protocol tcpv6_protocol = {
 	.early_demux	=	tcp_v6_early_demux,
 	.early_demux_handler =  tcp_v6_early_demux,
 	.handler	=	tcp_v6_rcv,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 5f8b8d766c63..4a3e65626e8b 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1448,7 +1448,7 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
 }
 #endif
 
-static const struct inet6_protocol udpv6_protocol = {
+static struct inet6_protocol udpv6_protocol = {
 	.early_demux	=	udp_v6_early_demux,
 	.early_demux_handler =  udp_v6_early_demux,
 	.handler	=	udpv6_rcv,
-- 
cgit v1.2.3


From e51a64727079f46fc3a99f380de384d5ab01fffa Mon Sep 17 00:00:00 2001
From: Ilan Tayari <ilant@mellanox.com>
Date: Tue, 1 Aug 2017 12:49:05 +0300
Subject: esp6: Support RX checksum with crypto offload

Keep the device's reported ip_summed indication in case crypto
was offloaded by the device. Subtract the csum values of the
stripped parts (esp header+iv, esp trailer+auth_data) to keep
value correct.

Note: CHECKSUM_COMPLETE should be indicated only if skb->csum
has the post-decryption offload csum value.

Signed-off-by: Ariel Levkovich <lariel@mellanox.com>
Signed-off-by: Ilan Tayari <ilant@mellanox.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/esp6.c         | 14 +++++++++++---
 net/ipv6/esp6_offload.c |  4 +++-
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 9ed35473dcb5..0ca1db62e381 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -470,7 +470,8 @@ int esp6_input_done2(struct sk_buff *skb, int err)
 	int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
 	int elen = skb->len - hlen;
 	int hdr_len = skb_network_header_len(skb);
-	int padlen;
+	int padlen, trimlen;
+	__wsum csumdiff;
 	u8 nexthdr[2];
 
 	if (!xo || (xo && !(xo->flags & CRYPTO_DONE)))
@@ -492,8 +493,15 @@ int esp6_input_done2(struct sk_buff *skb, int err)
 
 	/* ... check padding bits here. Silly. :-) */
 
-	pskb_trim(skb, skb->len - alen - padlen - 2);
-	__skb_pull(skb, hlen);
+	trimlen = alen + padlen + 2;
+	if (skb->ip_summed == CHECKSUM_COMPLETE) {
+		csumdiff = skb_checksum(skb, skb->len - trimlen, trimlen, 0);
+		skb->csum = csum_block_sub(skb->csum, csumdiff,
+					   skb->len - trimlen);
+	}
+	pskb_trim(skb, skb->len - trimlen);
+
+	skb_pull_rcsum(skb, hlen);
 	if (x->props.mode == XFRM_MODE_TUNNEL)
 		skb_reset_transport_header(skb);
 	else
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index f02f131f6435..eec3add177fe 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -209,11 +209,13 @@ out:
 static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb)
 {
 	struct crypto_aead *aead = x->data;
+	struct xfrm_offload *xo = xfrm_offload(skb);
 
 	if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead)))
 		return -EINVAL;
 
-	skb->ip_summed = CHECKSUM_NONE;
+	if (!(xo->flags & CRYPTO_DONE))
+		skb->ip_summed = CHECKSUM_NONE;
 
 	return esp6_input_done2(skb, 0);
 }
-- 
cgit v1.2.3


From e9cba69448df7686e1c35e74be48fc715d41ac45 Mon Sep 17 00:00:00 2001
From: Yossi Kuperman <yossiku@mellanox.com>
Date: Tue, 1 Aug 2017 12:49:06 +0300
Subject: xfrm6: Fix CHECKSUM_COMPLETE after IPv6 header push

xfrm6_transport_finish rebuilds the IPv6 header based on the
original one and pushes it back without fixing skb->csum.
Therefore, CHECKSUM_COMPLETE is no longer valid and the packet
gets dropped.

Fix skb->csum by calling skb_postpush_rcsum.

Note: A valid IPv4 header has checksum 0, unlike IPv6. Thus,
the change is not needed in the sibling xfrm4_transport_finish
function.

Signed-off-by: Yossi Kuperman <yossiku@mellanox.com>
Signed-off-by: Ilan Tayari <ilant@mellanox.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/xfrm6_input.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 3ef5d913e7a3..f95943a13abc 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -34,6 +34,7 @@ EXPORT_SYMBOL(xfrm6_rcv_spi);
 int xfrm6_transport_finish(struct sk_buff *skb, int async)
 {
 	struct xfrm_offload *xo = xfrm_offload(skb);
+	int nhlen = skb->data - skb_network_header(skb);
 
 	skb_network_header(skb)[IP6CB(skb)->nhoff] =
 		XFRM_MODE_SKB_CB(skb)->protocol;
@@ -43,8 +44,9 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
 		return 1;
 #endif
 
-	__skb_push(skb, skb->data - skb_network_header(skb));
+	__skb_push(skb, nhlen);
 	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+	skb_postpush_rcsum(skb, skb_network_header(skb), nhlen);
 
 	if (xo && (xo->flags & XFRM_GRO)) {
 		skb_mac_header_rebuild(skb);
-- 
cgit v1.2.3


From a9b28c2bf05d9d9998d5d3c6453fd75bc4cf8a6d Mon Sep 17 00:00:00 2001
From: Yossi Kuperman <yossiku@mellanox.com>
Date: Tue, 1 Aug 2017 12:49:07 +0300
Subject: esp6: Fix RX checksum after header pull

Both ip6_input_finish (non-GRO) and esp6_gro_receive (GRO) strip
the IPv6 header without adjusting skb->csum accordingly. As a
result CHECKSUM_COMPLETE breaks and "hw csum failure" is written
to the kernel log by netdev_rx_csum_fault (dev.c).

Fix skb->csum by subtracting the checksum value of the pulled IPv6
header using a call to skb_postpull_rcsum.

This affects both transport and tunnel modes.

Note that the fix occurs far from the place that the header was
pulled. This is based on existing code, see:
ipv6_srh_rcv() in exthdrs.c and rawv6_rcv() in raw.c

Signed-off-by: Yossi Kuperman <yossiku@mellanox.com>
Signed-off-by: Ilan Tayari <ilant@mellanox.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/esp6.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 0ca1db62e381..74bde202eb9a 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -495,6 +495,8 @@ int esp6_input_done2(struct sk_buff *skb, int err)
 
 	trimlen = alen + padlen + 2;
 	if (skb->ip_summed == CHECKSUM_COMPLETE) {
+		skb_postpull_rcsum(skb, skb_network_header(skb),
+				   skb_network_header_len(skb));
 		csumdiff = skb_checksum(skb, skb->len - trimlen, trimlen, 0);
 		skb->csum = csum_block_sub(skb->csum, csumdiff,
 					   skb->len - trimlen);
-- 
cgit v1.2.3


From ffdb5211da1c20354f1b40c204b6cf6c29c68161 Mon Sep 17 00:00:00 2001
From: Ilan Tayari <ilant@mellanox.com>
Date: Tue, 1 Aug 2017 12:49:08 +0300
Subject: xfrm: Auto-load xfrm offload modules

IPSec crypto offload depends on the protocol-specific
offload module (such as esp_offload.ko).

When the user installs an SA with crypto-offload, load
the offload module automatically, in the same way
that the protocol module is loaded (such as esp.ko).

Signed-off-by: Ilan Tayari <ilant@mellanox.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/esp6_offload.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index eec3add177fe..8d4e2ba9163d 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -334,3 +334,4 @@ module_init(esp6_offload_init);
 module_exit(esp6_offload_exit);
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
+MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET6, XFRM_PROTO_ESP);
-- 
cgit v1.2.3


From 549d2d41c1a448380872858302ee91be5a3ed499 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia.lawall@lip6.fr>
Date: Tue, 1 Aug 2017 12:48:03 +0200
Subject: netfilter: constify nf_loginfo structures

The nf_loginfo structures are only passed as the seventh argument to
nf_log_trace, which is declared as const or stored in a local const
variable.  Thus the nf_loginfo structures themselves can be const.

Done with the help of Coccinelle.

// <smpl>
@r disable optional_qualifier@
identifier i;
position p;
@@
static struct nf_loginfo i@p = { ... };

@ok1@
identifier r.i;
expression list[6] es;
position p;
@@
 nf_log_trace(es,&i@p,...)

@ok2@
identifier r.i;
const struct nf_loginfo *e;
position p;
@@
 e = &i@p

@bad@
position p != {r.p,ok1.p,ok2.p};
identifier r.i;
struct nf_loginfo e;
@@
e@i@p

@depends on !bad disable optional_qualifier@
identifier r.i;
@@
static
+const
 struct nf_loginfo i = { ... };
// </smpl>

Signed-off-by: Julia Lawall <Julia.Lawall@lip6.fr>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/ip6_tables.c  | 2 +-
 net/ipv6/netfilter/nf_log_ipv6.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 1f90644056ac..9f6644958e5e 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -176,7 +176,7 @@ static const char *const comments[] = {
 	[NF_IP6_TRACE_COMMENT_POLICY]	= "policy",
 };
 
-static struct nf_loginfo trace_loginfo = {
+static const struct nf_loginfo trace_loginfo = {
 	.type = NF_LOG_TYPE_LOG,
 	.u = {
 		.log = {
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index 97c724224da7..b397a8fe88b9 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -25,7 +25,7 @@
 #include <linux/netfilter/xt_LOG.h>
 #include <net/netfilter/nf_log.h>
 
-static struct nf_loginfo default_loginfo = {
+static const struct nf_loginfo default_loginfo = {
 	.type	= NF_LOG_TYPE_LOG,
 	.u = {
 		.log = {
-- 
cgit v1.2.3


From e3ea973159d53559c5ae9a9dbc824da9aba6cac0 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Thu, 3 Aug 2017 13:28:15 +0200
Subject: ipv6: fib_rules: Check if rule is a default rule

As explained in commit 3c71006d15fd ("ipv4: fib_rules: Check if rule is
a default rule"), drivers supporting IPv6 FIB offload need to be able to
sanitize the rules they don't support and potentially flush their
tables.

Add an IPv6 helper to check if a FIB rule is a default rule.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/fib6_rules.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index ec849d88a662..ef1fcee6bf16 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -29,6 +29,26 @@ struct fib6_rule {
 	u8			tclass;
 };
 
+static bool fib6_rule_matchall(const struct fib_rule *rule)
+{
+	struct fib6_rule *r = container_of(rule, struct fib6_rule, common);
+
+	if (r->dst.plen || r->src.plen || r->tclass)
+		return false;
+	return fib_rule_matchall(rule);
+}
+
+bool fib6_rule_default(const struct fib_rule *rule)
+{
+	if (!fib6_rule_matchall(rule) || rule->action != FR_ACT_TO_TBL ||
+	    rule->l3mdev)
+		return false;
+	if (rule->table != RT6_TABLE_LOCAL && rule->table != RT6_TABLE_MAIN)
+		return false;
+	return true;
+}
+EXPORT_SYMBOL_GPL(fib6_rule_default);
+
 struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   int flags, pol_lookup_t lookup)
 {
-- 
cgit v1.2.3


From 16ab6d7d4d8cc037bb4be12c2b849ac92787e1ff Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Thu, 3 Aug 2017 13:28:16 +0200
Subject: ipv6: fib: Add FIB notifiers callbacks

We're about to add IPv6 FIB offload support, so implement the necessary
callbacks in IPv6 code, which will later allow us to add routes and
rules notifications.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/Makefile        |  2 +-
 net/ipv6/fib6_notifier.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/ip6_fib.c       |  7 ++++++
 3 files changed, 63 insertions(+), 1 deletion(-)
 create mode 100644 net/ipv6/fib6_notifier.c

(limited to 'net/ipv6')

diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 217e9ff0e24b..f8b24c2e0d77 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -9,7 +9,7 @@ ipv6-objs :=	af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
 		route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
 		raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \
 		exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \
-		udp_offload.o seg6.o
+		udp_offload.o seg6.o fib6_notifier.o
 
 ipv6-offload :=	ip6_offload.o tcpv6_offload.o exthdrs_offload.o
 
diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c
new file mode 100644
index 000000000000..c2bb1ab5b5eb
--- /dev/null
+++ b/net/ipv6/fib6_notifier.c
@@ -0,0 +1,55 @@
+#include <linux/notifier.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <net/net_namespace.h>
+#include <net/fib_notifier.h>
+#include <net/netns/ipv6.h>
+#include <net/ip6_fib.h>
+
+int call_fib6_notifier(struct notifier_block *nb, struct net *net,
+		       enum fib_event_type event_type,
+		       struct fib_notifier_info *info)
+{
+	info->family = AF_INET6;
+	return call_fib_notifier(nb, net, event_type, info);
+}
+
+int call_fib6_notifiers(struct net *net, enum fib_event_type event_type,
+			struct fib_notifier_info *info)
+{
+	info->family = AF_INET6;
+	return call_fib_notifiers(net, event_type, info);
+}
+
+static unsigned int fib6_seq_read(struct net *net)
+{
+	return 0;
+}
+
+static int fib6_dump(struct net *net, struct notifier_block *nb)
+{
+	return 0;
+}
+
+static const struct fib_notifier_ops fib6_notifier_ops_template = {
+	.family		= AF_INET6,
+	.fib_seq_read	= fib6_seq_read,
+	.fib_dump	= fib6_dump,
+};
+
+int __net_init fib6_notifier_init(struct net *net)
+{
+	struct fib_notifier_ops *ops;
+
+	ops = fib_notifier_ops_register(&fib6_notifier_ops_template, net);
+	if (IS_ERR(ops))
+		return PTR_ERR(ops);
+	net->ipv6.notifier_ops = ops;
+
+	return 0;
+}
+
+void __net_exit fib6_notifier_exit(struct net *net)
+{
+	fib_notifier_ops_unregister(net->ipv6.notifier_ops);
+}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index ebb299cf72b7..f93976e3f65c 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1839,6 +1839,11 @@ static void fib6_gc_timer_cb(unsigned long arg)
 static int __net_init fib6_net_init(struct net *net)
 {
 	size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
+	int err;
+
+	err = fib6_notifier_init(net);
+	if (err)
+		return err;
 
 	spin_lock_init(&net->ipv6.fib6_gc_lock);
 	rwlock_init(&net->ipv6.fib6_walker_lock);
@@ -1891,6 +1896,7 @@ out_fib_table_hash:
 out_rt6_stats:
 	kfree(net->ipv6.rt6_stats);
 out_timer:
+	fib6_notifier_exit(net);
 	return -ENOMEM;
 }
 
@@ -1907,6 +1913,7 @@ static void fib6_net_exit(struct net *net)
 	kfree(net->ipv6.fib6_main_tbl);
 	kfree(net->ipv6.fib_table_hash);
 	kfree(net->ipv6.rt6_stats);
+	fib6_notifier_exit(net);
 }
 
 static struct pernet_operations fib6_net_ops = {
-- 
cgit v1.2.3


From df77fe4d9865c6354372876632bcbceeda84f6c8 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Thu, 3 Aug 2017 13:28:17 +0200
Subject: ipv6: fib: Add in-kernel notifications for route add / delete

As with IPv4, allow listeners of the FIB notification chain to receive
notifications whenever a route is added, replaced or deleted. This is
done by placing calls to the FIB notification chain in the two lowest
level functions that end up performing these operations - namely,
fib6_add_rt2node() and fib6_del_route().

Unlike IPv4, APPEND notifications aren't sent as the kernel doesn't
distinguish between "append" (NLM_F_CREATE|NLM_F_APPEND) and "prepend"
(NLM_F_CREATE). If NLM_F_EXCL isn't set, duplicate routes are always
added after the existing duplicate routes.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index f93976e3f65c..595a57cbbc7b 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -33,6 +33,7 @@
 #include <net/ndisc.h>
 #include <net/addrconf.h>
 #include <net/lwtunnel.h>
+#include <net/fib_notifier.h>
 
 #include <net/ip6_fib.h>
 #include <net/ip6_route.h>
@@ -302,6 +303,17 @@ static void __net_init fib6_tables_init(struct net *net)
 
 #endif
 
+static int call_fib6_entry_notifiers(struct net *net,
+				     enum fib_event_type event_type,
+				     struct rt6_info *rt)
+{
+	struct fib6_entry_notifier_info info = {
+		.rt = rt,
+	};
+
+	return call_fib6_notifiers(net, event_type, &info.info);
+}
+
 static int fib6_dump_node(struct fib6_walker *w)
 {
 	int res;
@@ -879,6 +891,8 @@ add:
 		*ins = rt;
 		rt->rt6i_node = fn;
 		atomic_inc(&rt->rt6i_ref);
+		call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD,
+					  rt);
 		if (!info->skip_notify)
 			inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
 		info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
@@ -906,6 +920,8 @@ add:
 		rt->rt6i_node = fn;
 		rt->dst.rt6_next = iter->dst.rt6_next;
 		atomic_inc(&rt->rt6i_ref);
+		call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE,
+					  rt);
 		if (!info->skip_notify)
 			inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
 		if (!(fn->fn_flags & RTN_RTINFO)) {
@@ -1459,6 +1475,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
 
 	fib6_purge_rt(rt, fn, net);
 
+	call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt);
 	if (!info->skip_notify)
 		inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
 	rt6_release(rt);
-- 
cgit v1.2.3


From dcb18f762f6ac83a6dc9cdc26dd694dcc167beb7 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Thu, 3 Aug 2017 13:28:18 +0200
Subject: ipv6: fib_rules: Dump rules during registration to FIB chain

Allow users of the FIB notification chain to receive a complete view of
the IPv6 FIB rules upon registration to the chain.

The integrity of the dump is ensured by a per-family sequence counter
that is incremented (under RTNL) whenever a rule is added or deleted.

All the sequence counters are read (under RTNL) and summed, prior and
after the dump. In case the counters differ, then the dump is either
restarted or the registration fails.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/fib6_notifier.c |  4 ++--
 net/ipv6/fib6_rules.c    | 11 +++++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c
index c2bb1ab5b5eb..298efc678f3b 100644
--- a/net/ipv6/fib6_notifier.c
+++ b/net/ipv6/fib6_notifier.c
@@ -23,12 +23,12 @@ int call_fib6_notifiers(struct net *net, enum fib_event_type event_type,
 
 static unsigned int fib6_seq_read(struct net *net)
 {
-	return 0;
+	return fib6_rules_seq_read(net);
 }
 
 static int fib6_dump(struct net *net, struct notifier_block *nb)
 {
-	return 0;
+	return fib6_rules_dump(net, nb);
 }
 
 static const struct fib_notifier_ops fib6_notifier_ops_template = {
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index ef1fcee6bf16..2f29e4e33bd3 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -14,6 +14,7 @@
  */
 
 #include <linux/netdevice.h>
+#include <linux/notifier.h>
 #include <linux/export.h>
 
 #include <net/fib_rules.h>
@@ -49,6 +50,16 @@ bool fib6_rule_default(const struct fib_rule *rule)
 }
 EXPORT_SYMBOL_GPL(fib6_rule_default);
 
+int fib6_rules_dump(struct net *net, struct notifier_block *nb)
+{
+	return fib_rules_dump(net, nb, AF_INET6);
+}
+
+unsigned int fib6_rules_seq_read(struct net *net)
+{
+	return fib_rules_seq_read(net, AF_INET6);
+}
+
 struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   int flags, pol_lookup_t lookup)
 {
-- 
cgit v1.2.3


From e1ee0a5ba35d999caef94d659b4cb842e63aeb68 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Thu, 3 Aug 2017 13:28:19 +0200
Subject: ipv6: fib: Dump tables during registration to FIB chain

Dump all the FIB tables in each net namespace upon registration to the
FIB notification chain so that the callee will have a complete view of
the tables.

The integrity of the dump is ensured by a per-table sequence counter
that is incremented (under write lock) whenever a route is added or
deleted from the table.

All the sequence counters are read (under each table's read lock) and
summed, prior and after the dump. In case the counters differ, then the
dump is either restarted or the registration fails.

While it's possible for a table to be modified after its counter has
been read, this isn't really a problem. In case it happened before it
was read the second time, then the comparison at the end will fail. If
it happened afterwards, then we're guaranteed to be notified about the
change, as the notification block is registered prior to the second
read.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/fib6_notifier.c | 10 ++++--
 net/ipv6/ip6_fib.c       | 92 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 100 insertions(+), 2 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c
index 298efc678f3b..66a103ef7e86 100644
--- a/net/ipv6/fib6_notifier.c
+++ b/net/ipv6/fib6_notifier.c
@@ -23,12 +23,18 @@ int call_fib6_notifiers(struct net *net, enum fib_event_type event_type,
 
 static unsigned int fib6_seq_read(struct net *net)
 {
-	return fib6_rules_seq_read(net);
+	return fib6_tables_seq_read(net) + fib6_rules_seq_read(net);
 }
 
 static int fib6_dump(struct net *net, struct notifier_block *nb)
 {
-	return fib6_rules_dump(net, nb);
+	int err;
+
+	err = fib6_rules_dump(net, nb);
+	if (err)
+		return err;
+
+	return fib6_tables_dump(net, nb);
 }
 
 static const struct fib_notifier_ops fib6_notifier_ops_template = {
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 595a57cbbc7b..719c10480c74 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -303,6 +303,37 @@ static void __net_init fib6_tables_init(struct net *net)
 
 #endif
 
+unsigned int fib6_tables_seq_read(struct net *net)
+{
+	unsigned int h, fib_seq = 0;
+
+	rcu_read_lock();
+	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
+		struct hlist_head *head = &net->ipv6.fib_table_hash[h];
+		struct fib6_table *tb;
+
+		hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
+			read_lock_bh(&tb->tb6_lock);
+			fib_seq += tb->fib_seq;
+			read_unlock_bh(&tb->tb6_lock);
+		}
+	}
+	rcu_read_unlock();
+
+	return fib_seq;
+}
+
+static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
+				    enum fib_event_type event_type,
+				    struct rt6_info *rt)
+{
+	struct fib6_entry_notifier_info info = {
+		.rt = rt,
+	};
+
+	return call_fib6_notifier(nb, net, event_type, &info.info);
+}
+
 static int call_fib6_entry_notifiers(struct net *net,
 				     enum fib_event_type event_type,
 				     struct rt6_info *rt)
@@ -311,9 +342,70 @@ static int call_fib6_entry_notifiers(struct net *net,
 		.rt = rt,
 	};
 
+	rt->rt6i_table->fib_seq++;
 	return call_fib6_notifiers(net, event_type, &info.info);
 }
 
+struct fib6_dump_arg {
+	struct net *net;
+	struct notifier_block *nb;
+};
+
+static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg)
+{
+	if (rt == arg->net->ipv6.ip6_null_entry)
+		return;
+	call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt);
+}
+
+static int fib6_node_dump(struct fib6_walker *w)
+{
+	struct rt6_info *rt;
+
+	for (rt = w->leaf; rt; rt = rt->dst.rt6_next)
+		fib6_rt_dump(rt, w->args);
+	w->leaf = NULL;
+	return 0;
+}
+
+static void fib6_table_dump(struct net *net, struct fib6_table *tb,
+			    struct fib6_walker *w)
+{
+	w->root = &tb->tb6_root;
+	read_lock_bh(&tb->tb6_lock);
+	fib6_walk(net, w);
+	read_unlock_bh(&tb->tb6_lock);
+}
+
+/* Called with rcu_read_lock() */
+int fib6_tables_dump(struct net *net, struct notifier_block *nb)
+{
+	struct fib6_dump_arg arg;
+	struct fib6_walker *w;
+	unsigned int h;
+
+	w = kzalloc(sizeof(*w), GFP_ATOMIC);
+	if (!w)
+		return -ENOMEM;
+
+	w->func = fib6_node_dump;
+	arg.net = net;
+	arg.nb = nb;
+	w->args = &arg;
+
+	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
+		struct hlist_head *head = &net->ipv6.fib_table_hash[h];
+		struct fib6_table *tb;
+
+		hlist_for_each_entry_rcu(tb, head, tb6_hlist)
+			fib6_table_dump(net, tb, w);
+	}
+
+	kfree(w);
+
+	return 0;
+}
+
 static int fib6_dump_node(struct fib6_walker *w)
 {
 	int res;
-- 
cgit v1.2.3


From 61e4d01e16acddadb9723143637a20417fa67ac9 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Thu, 3 Aug 2017 13:28:20 +0200
Subject: ipv6: fib: Add offload indication to routes

Allow user space applications to see which routes are offloaded and
which aren't by setting the RTNH_F_OFFLOAD flag when dumping them.

To be consistent with IPv4, offload indication is provided on a
per-nexthop basis.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 4d30c96a819d..aba07fce67fb 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1820,6 +1820,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 		goto out;
 	}
 
+	if (cfg->fc_flags & RTF_OFFLOAD) {
+		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_OFFLOAD");
+		goto out;
+	}
+
 	if (cfg->fc_dst_len > 128) {
 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
 		goto out;
@@ -3327,6 +3332,9 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
 			goto nla_put_failure;
 	}
 
+	if (rt->rt6i_flags & RTF_OFFLOAD)
+		*flags |= RTNH_F_OFFLOAD;
+
 	/* not needed for multipath encoding b/c it has a rtnexthop struct */
 	if (!skip_oif && rt->dst.dev &&
 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
-- 
cgit v1.2.3


From c5b12410fa591acb1d48e167b9bd0d2a7a38498d Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Thu, 3 Aug 2017 13:28:21 +0200
Subject: ipv6: fib: Don't assume only nodes hold a reference on routes

The code currently assumes that only FIB nodes can hold a reference on
routes. Therefore, after fib6_purge_rt() has run and the route is no
longer present in any intermediate nodes, it's assumed that its
reference count would be 1 - taken by the node where it's currently
stored.

However, we're going to allow users other than the FIB to take a
reference on a route, so this assumption is no longer valid and the
BUG_ON() needs to be removed.

Note that purging only takes place if the initial reference count is
different than 1. I've left that check intact, as in the majority of
systems (where routes are only referenced by the FIB), it does actually
mean the route is present in intermediate nodes.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 719c10480c74..fa27905de92e 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -837,8 +837,6 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
 			}
 			fn = fn->parent;
 		}
-		/* No more references are possible at this point. */
-		BUG_ON(atomic_read(&rt->rt6i_ref) != 1);
 	}
 }
 
-- 
cgit v1.2.3


From 7483cea79957312e9f8e9cf760a1bc5d6c507113 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Thu, 3 Aug 2017 13:28:22 +0200
Subject: ipv6: fib: Unlink replaced routes from their nodes

When a route is deleted its node pointer is set to NULL to indicate it's
no longer linked to its node. Do the same for routes that are replaced.

This will later allow us to test if a route is still in the FIB by
checking its node pointer instead of its reference count.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index fa27905de92e..fe193b48ef61 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1019,6 +1019,7 @@ add:
 			fn->fn_flags |= RTN_RTINFO;
 		}
 		nsiblings = iter->rt6i_nsiblings;
+		iter->rt6i_node = NULL;
 		fib6_purge_rt(iter, fn, info->nl_net);
 		rt6_release(iter);
 
@@ -1031,6 +1032,7 @@ add:
 					break;
 				if (rt6_qualify_for_ecmp(iter)) {
 					*ins = iter->dst.rt6_next;
+					iter->rt6i_node = NULL;
 					fib6_purge_rt(iter, fn, info->nl_net);
 					rt6_release(iter);
 					nsiblings--;
-- 
cgit v1.2.3


From 9217d8c2fe743f02a3ce6d430fe3b5d514fd5f1c Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Thu, 3 Aug 2017 13:28:23 +0200
Subject: ipv6: Regenerate host route according to node pointer upon loopback
 up

When the loopback device is brought back up we need to check if the host
route attached to the address is still in the FIB and regenerate one in
case it's not.

Host routes using the loopback device are always inserted into and
removed from the FIB under RTNL (under which this function is called),
so we can test their node pointer instead of the reference count in
order to check if the route is in the FIB or not.

Tested using the following script from Nicolas mentioned in
commit a220445f9f43 ("ipv6: correctly add local routes when lo goes up"):

$ ip link add dummy1 type dummy
$ ip link set dummy1 up
$ ip link set lo down ; ip link set lo up

The host route is correctly regenerated.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 3c46e9513a31..e8e4f867b994 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3066,7 +3066,7 @@ static void init_loopback(struct net_device *dev)
 				 * lo device down, release this obsolete dst and
 				 * reallocate a new router for ifa.
 				 */
-				if (!atomic_read(&sp_ifa->rt->rt6i_ref)) {
+				if (!sp_ifa->rt->rt6i_node) {
 					ip6_rt_put(sp_ifa->rt);
 					sp_ifa->rt = NULL;
 				} else {
-- 
cgit v1.2.3


From fc882fcff1ee774cb6be9d3c714ae5ab9eec5aa4 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Thu, 3 Aug 2017 13:28:24 +0200
Subject: ipv6: Regenerate host route according to node pointer upon interface
 up

When an interface is brought back up, the kernel tries to restore the
host routes tied to its permanent addresses.

However, if the host route was removed from the FIB, then we need to
reinsert it. This is done by releasing the current dst and allocating a
new one, so as not to reuse a dst with obsolete values.

Since this function is called under RTNL and using the same explanation
from the previous patch, we can test if the route is in the FIB by
checking its node pointer instead of its reference count.

Tested using the following script and Andrey's reproducer mentioned
in commit 8048ced9beb2 ("net: ipv6: regenerate host route if moved to gc
list") and linked below:

$ ip link set dev lo up
$ ip link add dummy1 type dummy
$ ip -6 address add cafe::1/64 dev dummy1
$ ip link set dev lo down	# cafe::1/128 is removed
$ ip link set dev dummy1 up
$ ip link set dev lo up

The host route is correctly regenerated.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Link: http://lkml.kernel.org/r/CAAeHK+zSe82vc5gCRgr_EoUwiALPnWVdWJBPwJZBpbxYz=kGJw@mail.gmail.com
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index e8e4f867b994..30ee23eef268 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3321,11 +3321,11 @@ static void addrconf_gre_config(struct net_device *dev)
 static int fixup_permanent_addr(struct inet6_dev *idev,
 				struct inet6_ifaddr *ifp)
 {
-	/* rt6i_ref == 0 means the host route was removed from the
+	/* !rt6i_node means the host route was removed from the
 	 * FIB, for example, if 'lo' device is taken down. In that
 	 * case regenerate the host route.
 	 */
-	if (!ifp->rt || !atomic_read(&ifp->rt->rt6i_ref)) {
+	if (!ifp->rt || !ifp->rt->rt6i_node) {
 		struct rt6_info *rt, *prev;
 
 		rt = addrconf_dst_alloc(idev, &ifp->addr, false);
-- 
cgit v1.2.3


From a460aa83963b185a32a6377eb486b6e613ac8e38 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Thu, 3 Aug 2017 13:28:25 +0200
Subject: ipv6: fib: Add helpers to hold / drop a reference on rt6_info

Similar to commit 1c677b3d2828 ("ipv4: fib: Add fib_info_hold() helper")
and commit b423cb10807b ("ipv4: fib: Export free_fib_info()") add a
helper to hold a reference on rt6_info and export rt6_release() to drop
it and potentially release the route.

This is needed so that drivers capable of FIB offload could hold a
reference on the route before queueing it for offload and drop it after
the route has been programmed to the device's tables.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index fe193b48ef61..69ed0043d117 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -178,7 +178,7 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
 	non_pcpu_rt->rt6i_pcpu = NULL;
 }
 
-static void rt6_release(struct rt6_info *rt)
+void rt6_release(struct rt6_info *rt)
 {
 	if (atomic_dec_and_test(&rt->rt6i_ref)) {
 		rt6_free_pcpu(rt);
@@ -186,6 +186,7 @@ static void rt6_release(struct rt6_info *rt)
 		dst_release(&rt->dst);
 	}
 }
+EXPORT_SYMBOL_GPL(rt6_release);
 
 static void fib6_link_table(struct net *net, struct fib6_table *tb)
 {
-- 
cgit v1.2.3


From 1801b570dd2ae50b90231f283e79a9a94fbe7875 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 7 Aug 2017 08:44:20 -0700
Subject: net: ipv6: add second dif to udp socket lookups

Add a second device index, sdif, to udp socket lookups. sdif is the
index for ingress devices enslaved to an l3mdev. It allows the lookups
to consider the enslaved device as well as the L3 domain when searching
for a socket.

Early demux lookups are handled in the next patch as part of INET_MATCH
changes.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/udp.c | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 578142b7ca3e..d96a877798a7 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -129,7 +129,7 @@ static void udp_v6_rehash(struct sock *sk)
 static int compute_score(struct sock *sk, struct net *net,
 			 const struct in6_addr *saddr, __be16 sport,
 			 const struct in6_addr *daddr, unsigned short hnum,
-			 int dif, bool exact_dif)
+			 int dif, int sdif, bool exact_dif)
 {
 	int score;
 	struct inet_sock *inet;
@@ -161,9 +161,13 @@ static int compute_score(struct sock *sk, struct net *net,
 	}
 
 	if (sk->sk_bound_dev_if || exact_dif) {
-		if (sk->sk_bound_dev_if != dif)
+		bool dev_match = (sk->sk_bound_dev_if == dif ||
+				  sk->sk_bound_dev_if == sdif);
+
+		if (exact_dif && !dev_match)
 			return -1;
-		score++;
+		if (sk->sk_bound_dev_if && dev_match)
+			score++;
 	}
 
 	if (sk->sk_incoming_cpu == raw_smp_processor_id())
@@ -175,9 +179,9 @@ static int compute_score(struct sock *sk, struct net *net,
 /* called with rcu_read_lock() */
 static struct sock *udp6_lib_lookup2(struct net *net,
 		const struct in6_addr *saddr, __be16 sport,
-		const struct in6_addr *daddr, unsigned int hnum, int dif,
-		bool exact_dif, struct udp_hslot *hslot2,
-		struct sk_buff *skb)
+		const struct in6_addr *daddr, unsigned int hnum,
+		int dif, int sdif, bool exact_dif,
+		struct udp_hslot *hslot2, struct sk_buff *skb)
 {
 	struct sock *sk, *result;
 	int score, badness, matches = 0, reuseport = 0;
@@ -187,7 +191,7 @@ static struct sock *udp6_lib_lookup2(struct net *net,
 	badness = -1;
 	udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
 		score = compute_score(sk, net, saddr, sport,
-				      daddr, hnum, dif, exact_dif);
+				      daddr, hnum, dif, sdif, exact_dif);
 		if (score > badness) {
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
@@ -214,10 +218,10 @@ static struct sock *udp6_lib_lookup2(struct net *net,
 
 /* rcu_read_lock() must be held */
 struct sock *__udp6_lib_lookup(struct net *net,
-				      const struct in6_addr *saddr, __be16 sport,
-				      const struct in6_addr *daddr, __be16 dport,
-				      int dif, struct udp_table *udptable,
-				      struct sk_buff *skb)
+			       const struct in6_addr *saddr, __be16 sport,
+			       const struct in6_addr *daddr, __be16 dport,
+			       int dif, int sdif, struct udp_table *udptable,
+			       struct sk_buff *skb)
 {
 	struct sock *sk, *result;
 	unsigned short hnum = ntohs(dport);
@@ -235,7 +239,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
 			goto begin;
 
 		result = udp6_lib_lookup2(net, saddr, sport,
-					  daddr, hnum, dif, exact_dif,
+					  daddr, hnum, dif, sdif, exact_dif,
 					  hslot2, skb);
 		if (!result) {
 			unsigned int old_slot2 = slot2;
@@ -250,7 +254,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
 				goto begin;
 
 			result = udp6_lib_lookup2(net, saddr, sport,
-						  daddr, hnum, dif,
+						  daddr, hnum, dif, sdif,
 						  exact_dif, hslot2,
 						  skb);
 		}
@@ -261,7 +265,7 @@ begin:
 	badness = -1;
 	sk_for_each_rcu(sk, &hslot->head) {
 		score = compute_score(sk, net, saddr, sport, daddr, hnum, dif,
-				      exact_dif);
+				      sdif, exact_dif);
 		if (score > badness) {
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
@@ -294,7 +298,7 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
 
 	return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport,
 				 &iph->daddr, dport, inet6_iif(skb),
-				 udptable, skb);
+				 inet6_sdif(skb), udptable, skb);
 }
 
 struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
@@ -304,7 +308,7 @@ struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
 
 	return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport,
 				 &iph->daddr, dport, inet6_iif(skb),
-				 &udp_table, skb);
+				 inet6_sdif(skb), &udp_table, skb);
 }
 EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb);
 
@@ -320,7 +324,7 @@ struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be
 	struct sock *sk;
 
 	sk =  __udp6_lib_lookup(net, saddr, sport, daddr, dport,
-				dif, &udp_table, NULL);
+				dif, 0, &udp_table, NULL);
 	if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
 		sk = NULL;
 	return sk;
@@ -501,7 +505,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	struct net *net = dev_net(skb->dev);
 
 	sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
-			       inet6_iif(skb), udptable, skb);
+			       inet6_iif(skb), 0, udptable, skb);
 	if (!sk) {
 		__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
 				  ICMP6_MIB_INERRORS);
-- 
cgit v1.2.3


From 4297a0ef085729af98adab9131d128c576ed3044 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 7 Aug 2017 08:44:21 -0700
Subject: net: ipv6: add second dif to inet6 socket lookups

Add a second device index, sdif, to inet6 socket lookups. sdif is the
index for ingress devices enslaved to an l3mdev. It allows the lookups
to consider the enslaved device as well as the L3 domain when searching
for a socket.

TCP moves the data in the cb. Prior to tcp_v6_rcv (e.g., early demux) the
ingress index is obtained from IP6CB using inet6_sdif and after tcp_v6_rcv
tcp_v6_sdif is used.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/inet6_hashtables.c | 28 +++++++++++++++++-----------
 net/ipv6/tcp_ipv6.c         | 13 ++++++++-----
 net/ipv6/udp.c              |  7 ++++---
 3 files changed, 29 insertions(+), 19 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index b13b8f93079d..b01858f5deb1 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -56,7 +56,7 @@ struct sock *__inet6_lookup_established(struct net *net,
 					   const __be16 sport,
 					   const struct in6_addr *daddr,
 					   const u16 hnum,
-					   const int dif)
+					   const int dif, const int sdif)
 {
 	struct sock *sk;
 	const struct hlist_nulls_node *node;
@@ -73,12 +73,12 @@ begin:
 	sk_nulls_for_each_rcu(sk, node, &head->chain) {
 		if (sk->sk_hash != hash)
 			continue;
-		if (!INET6_MATCH(sk, net, saddr, daddr, ports, dif))
+		if (!INET6_MATCH(sk, net, saddr, daddr, ports, dif, sdif))
 			continue;
 		if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
 			goto out;
 
-		if (unlikely(!INET6_MATCH(sk, net, saddr, daddr, ports, dif))) {
+		if (unlikely(!INET6_MATCH(sk, net, saddr, daddr, ports, dif, sdif))) {
 			sock_gen_put(sk);
 			goto begin;
 		}
@@ -96,7 +96,7 @@ EXPORT_SYMBOL(__inet6_lookup_established);
 static inline int compute_score(struct sock *sk, struct net *net,
 				const unsigned short hnum,
 				const struct in6_addr *daddr,
-				const int dif, bool exact_dif)
+				const int dif, const int sdif, bool exact_dif)
 {
 	int score = -1;
 
@@ -110,9 +110,13 @@ static inline int compute_score(struct sock *sk, struct net *net,
 			score++;
 		}
 		if (sk->sk_bound_dev_if || exact_dif) {
-			if (sk->sk_bound_dev_if != dif)
+			bool dev_match = (sk->sk_bound_dev_if == dif ||
+					  sk->sk_bound_dev_if == sdif);
+
+			if (exact_dif && !dev_match)
 				return -1;
-			score++;
+			if (sk->sk_bound_dev_if && dev_match)
+				score++;
 		}
 		if (sk->sk_incoming_cpu == raw_smp_processor_id())
 			score++;
@@ -126,7 +130,7 @@ struct sock *inet6_lookup_listener(struct net *net,
 		struct sk_buff *skb, int doff,
 		const struct in6_addr *saddr,
 		const __be16 sport, const struct in6_addr *daddr,
-		const unsigned short hnum, const int dif)
+		const unsigned short hnum, const int dif, const int sdif)
 {
 	unsigned int hash = inet_lhashfn(net, hnum);
 	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
@@ -136,7 +140,7 @@ struct sock *inet6_lookup_listener(struct net *net,
 	u32 phash = 0;
 
 	sk_for_each(sk, &ilb->head) {
-		score = compute_score(sk, net, hnum, daddr, dif, exact_dif);
+		score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif);
 		if (score > hiscore) {
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
@@ -171,7 +175,7 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
 	bool refcounted;
 
 	sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr,
-			    ntohs(dport), dif, &refcounted);
+			    ntohs(dport), dif, 0, &refcounted);
 	if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt))
 		sk = NULL;
 	return sk;
@@ -187,8 +191,9 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
 	const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr;
 	const struct in6_addr *saddr = &sk->sk_v6_daddr;
 	const int dif = sk->sk_bound_dev_if;
-	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
 	struct net *net = sock_net(sk);
+	const int sdif = l3mdev_master_ifindex_by_index(net, dif);
+	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
 	const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr,
 						inet->inet_dport);
 	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
@@ -203,7 +208,8 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
 		if (sk2->sk_hash != hash)
 			continue;
 
-		if (likely(INET6_MATCH(sk2, net, saddr, daddr, ports, dif))) {
+		if (likely(INET6_MATCH(sk2, net, saddr, daddr, ports,
+				       dif, sdif))) {
 			if (sk2->sk_state == TCP_TIME_WAIT) {
 				tw = inet_twsk(sk2);
 				if (twsk_unique(sk, sk2, twp))
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ced5dcf37465..f776ec4ecf6d 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -350,7 +350,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	sk = __inet6_lookup_established(net, &tcp_hashinfo,
 					&hdr->daddr, th->dest,
 					&hdr->saddr, ntohs(th->source),
-					skb->dev->ifindex);
+					skb->dev->ifindex, inet6_sdif(skb));
 
 	if (!sk) {
 		__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
@@ -918,7 +918,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 					   &tcp_hashinfo, NULL, 0,
 					   &ipv6h->saddr,
 					   th->source, &ipv6h->daddr,
-					   ntohs(th->source), tcp_v6_iif(skb));
+					   ntohs(th->source), tcp_v6_iif(skb),
+					   tcp_v6_sdif(skb));
 		if (!sk1)
 			goto out;
 
@@ -1397,6 +1398,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
 
 static int tcp_v6_rcv(struct sk_buff *skb)
 {
+	int sdif = inet6_sdif(skb);
 	const struct tcphdr *th;
 	const struct ipv6hdr *hdr;
 	bool refcounted;
@@ -1430,7 +1432,7 @@ static int tcp_v6_rcv(struct sk_buff *skb)
 
 lookup:
 	sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th),
-				th->source, th->dest, inet6_iif(skb),
+				th->source, th->dest, inet6_iif(skb), sdif,
 				&refcounted);
 	if (!sk)
 		goto no_tcp_socket;
@@ -1563,7 +1565,8 @@ do_time_wait:
 					    skb, __tcp_hdrlen(th),
 					    &ipv6_hdr(skb)->saddr, th->source,
 					    &ipv6_hdr(skb)->daddr,
-					    ntohs(th->dest), tcp_v6_iif(skb));
+					    ntohs(th->dest), tcp_v6_iif(skb),
+					    sdif);
 		if (sk2) {
 			struct inet_timewait_sock *tw = inet_twsk(sk);
 			inet_twsk_deschedule_put(tw);
@@ -1610,7 +1613,7 @@ static void tcp_v6_early_demux(struct sk_buff *skb)
 	sk = __inet6_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
 					&hdr->saddr, th->source,
 					&hdr->daddr, ntohs(th->dest),
-					inet6_iif(skb));
+					inet6_iif(skb), inet6_sdif(skb));
 	if (sk) {
 		skb->sk = sk;
 		skb->destructor = sock_edemux;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index d96a877798a7..19afcaf4a22e 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -897,7 +897,7 @@ discard:
 static struct sock *__udp6_lib_demux_lookup(struct net *net,
 			__be16 loc_port, const struct in6_addr *loc_addr,
 			__be16 rmt_port, const struct in6_addr *rmt_addr,
-			int dif)
+			int dif, int sdif)
 {
 	unsigned short hnum = ntohs(loc_port);
 	unsigned int hash2 = udp6_portaddr_hash(net, loc_addr, hnum);
@@ -908,7 +908,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net,
 
 	udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
 		if (sk->sk_state == TCP_ESTABLISHED &&
-		    INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif))
+		    INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif, sdif))
 			return sk;
 		/* Only check first socket in chain */
 		break;
@@ -923,6 +923,7 @@ static void udp_v6_early_demux(struct sk_buff *skb)
 	struct sock *sk;
 	struct dst_entry *dst;
 	int dif = skb->dev->ifindex;
+	int sdif = inet6_sdif(skb);
 
 	if (!pskb_may_pull(skb, skb_transport_offset(skb) +
 	    sizeof(struct udphdr)))
@@ -934,7 +935,7 @@ static void udp_v6_early_demux(struct sk_buff *skb)
 		sk = __udp6_lib_demux_lookup(net, uh->dest,
 					     &ipv6_hdr(skb)->daddr,
 					     uh->source, &ipv6_hdr(skb)->saddr,
-					     dif);
+					     dif, sdif);
 	else
 		return;
 
-- 
cgit v1.2.3


From 5108ab4bf446fa9ad2c71f5fc1d839067b72636f Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 7 Aug 2017 08:44:22 -0700
Subject: net: ipv6: add second dif to raw socket lookups

Add a second device index, sdif, to raw socket lookups. sdif is the
index for ingress devices enslaved to an l3mdev. It allows the lookups
to consider the enslaved device as well as the L3 domain when searching
for a socket.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/raw.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 60be012fe708..e4462b0ff801 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -72,7 +72,7 @@ EXPORT_SYMBOL_GPL(raw_v6_hashinfo);
 
 struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
 		unsigned short num, const struct in6_addr *loc_addr,
-		const struct in6_addr *rmt_addr, int dif)
+		const struct in6_addr *rmt_addr, int dif, int sdif)
 {
 	bool is_multicast = ipv6_addr_is_multicast(loc_addr);
 
@@ -86,7 +86,9 @@ struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
 			    !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr))
 				continue;
 
-			if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
+			if (sk->sk_bound_dev_if &&
+			    sk->sk_bound_dev_if != dif &&
+			    sk->sk_bound_dev_if != sdif)
 				continue;
 
 			if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
@@ -178,7 +180,8 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
 		goto out;
 
 	net = dev_net(skb->dev);
-	sk = __raw_v6_lookup(net, sk, nexthdr, daddr, saddr, inet6_iif(skb));
+	sk = __raw_v6_lookup(net, sk, nexthdr, daddr, saddr,
+			     inet6_iif(skb), inet6_sdif(skb));
 
 	while (sk) {
 		int filtered;
@@ -222,7 +225,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
 			}
 		}
 		sk = __raw_v6_lookup(net, sk_next(sk), nexthdr, daddr, saddr,
-				     inet6_iif(skb));
+				     inet6_iif(skb), inet6_sdif(skb));
 	}
 out:
 	read_unlock(&raw_v6_hashinfo.lock);
@@ -378,7 +381,7 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
 		net = dev_net(skb->dev);
 
 		while ((sk = __raw_v6_lookup(net, sk, nexthdr, saddr, daddr,
-						inet6_iif(skb)))) {
+					     inet6_iif(skb), inet6_iif(skb)))) {
 			rawv6_err(sk, skb, NULL, type, code,
 					inner_offset, info);
 			sk = sk_next(sk);
-- 
cgit v1.2.3


From 925615ceda94532ba612e641549c7b1e41ebff10 Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Sat, 5 Aug 2017 12:38:24 +0200
Subject: ipv6: sr: allow SRH insertion with arbitrary segments_left value

The seg6_validate_srh() function only allows SRHs whose active segment is
the first segment of the path. However, an application may insert an SRH
whose active segment is not the first one. Such an application might be
for example an SR-aware Virtual Network Function.

This patch allows SRHs with an arbitrary active segment to be inserted.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/exthdrs.c | 4 ++--
 net/ipv6/seg6.c    | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 3cec529c6113..95516138e861 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -882,7 +882,7 @@ static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto,
 	       (hops - 1) * sizeof(struct in6_addr));
 
 	sr_phdr->segments[0] = **addr_p;
-	*addr_p = &sr_ihdr->segments[hops - 1];
+	*addr_p = &sr_ihdr->segments[sr_ihdr->segments_left];
 
 #ifdef CONFIG_IPV6_SEG6_HMAC
 	if (sr_has_hmac(sr_phdr)) {
@@ -1174,7 +1174,7 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
 	{
 		struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)opt->srcrt;
 
-		fl6->daddr = srh->segments[srh->first_segment];
+		fl6->daddr = srh->segments[srh->segments_left];
 		break;
 	}
 	default:
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index 15fba55e3da8..81c2339b3285 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -40,7 +40,7 @@ bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len)
 	if (((srh->hdrlen + 1) << 3) != len)
 		return false;
 
-	if (srh->segments_left != srh->first_segment)
+	if (srh->segments_left > srh->first_segment)
 		return false;
 
 	tlv_offset = sizeof(*srh) + ((srh->first_segment + 1) << 4);
-- 
cgit v1.2.3


From b04c80d3a7e228cfb832cdb1c9ce8151f174669c Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Sat, 5 Aug 2017 12:38:25 +0200
Subject: ipv6: sr: export SRH insertion functions

This patch exports the seg6_do_srh_encap() and seg6_do_srh_inline()
functions. It also removes the CONFIG_IPV6_SEG6_INLINE knob
that enabled the compilation of seg6_do_srh_inline(). This function
is now built-in.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/Kconfig         | 12 ------------
 net/ipv6/seg6_iptunnel.c | 12 ++++--------
 2 files changed, 4 insertions(+), 20 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 48c452959d2c..50181a96923e 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -315,18 +315,6 @@ config IPV6_SEG6_LWTUNNEL
 
 	  If unsure, say N.
 
-config IPV6_SEG6_INLINE
-	bool "IPv6: direct Segment Routing Header insertion "
-	depends on IPV6_SEG6_LWTUNNEL
-	---help---
-	  Support for direct insertion of the Segment Routing Header,
-	  also known as inline mode. Be aware that direct insertion of
-	  extension headers (as opposed to encapsulation) may break
-	  multiple mechanisms such as PMTUD or IPSec AH. Use this feature
-	  only if you know exactly what you are doing.
-
-	  If unsure, say N.
-
 config IPV6_SEG6_HMAC
 	bool "IPv6: Segment Routing HMAC support"
 	depends on IPV6
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 264d772d3c7d..501233040570 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -91,7 +91,7 @@ static void set_tun_src(struct net *net, struct net_device *dev,
 }
 
 /* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */
-static int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
+int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
 {
 	struct net *net = dev_net(skb_dst(skb)->dev);
 	struct ipv6hdr *hdr, *inner_hdr;
@@ -141,10 +141,10 @@ static int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(seg6_do_srh_encap);
 
 /* insert an SRH within an IPv6 packet, just after the IPv6 header */
-#ifdef CONFIG_IPV6_SEG6_INLINE
-static int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
+int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
 {
 	struct ipv6hdr *hdr, *oldhdr;
 	struct ipv6_sr_hdr *isrh;
@@ -193,7 +193,7 @@ static int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
 
 	return 0;
 }
-#endif
+EXPORT_SYMBOL_GPL(seg6_do_srh_inline);
 
 static int seg6_do_srh(struct sk_buff *skb)
 {
@@ -209,12 +209,10 @@ static int seg6_do_srh(struct sk_buff *skb)
 	}
 
 	switch (tinfo->mode) {
-#ifdef CONFIG_IPV6_SEG6_INLINE
 	case SEG6_IPTUN_MODE_INLINE:
 		err = seg6_do_srh_inline(skb, tinfo->srh);
 		skb_reset_inner_headers(skb);
 		break;
-#endif
 	case SEG6_IPTUN_MODE_ENCAP:
 		err = seg6_do_srh_encap(skb, tinfo->srh);
 		break;
@@ -357,10 +355,8 @@ static int seg6_build_state(struct nlattr *nla,
 		return -EINVAL;
 
 	switch (tuninfo->mode) {
-#ifdef CONFIG_IPV6_SEG6_INLINE
 	case SEG6_IPTUN_MODE_INLINE:
 		break;
-#endif
 	case SEG6_IPTUN_MODE_ENCAP:
 		break;
 	default:
-- 
cgit v1.2.3


From d1df6fd8a1d22d37cffa0075ab8ad423ce656777 Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Sat, 5 Aug 2017 12:38:26 +0200
Subject: ipv6: sr: define core operations for seg6local lightweight tunnel

This patch implements a new type of lightweight tunnel named seg6local.
A seg6local lwt is defined by a type of action and a set of parameters.
The action represents the operation to perform on the packets matching the
lwt's route, and is not necessarily an encapsulation. The set of parameters
are arguments for the processing function.

Each action is defined in a struct seg6_action_desc within
seg6_action_table[]. This structure contains the action, mandatory
attributes, the processing function, and a static headroom size required by
the action. The mandatory attributes are encoded as a bitmask field. The
static headroom is set to a non-zero value when the processing function
always adds a constant number of bytes to the skb (e.g. the header size for
encapsulations).

To facilitate rtnetlink-related operations such as parsing, fill_encap,
and cmp_encap, each type of action parameter is associated with three
function pointers, in seg6_action_params[].

All actions defined in seg6_local.h are detailed in [1].

[1] https://tools.ietf.org/html/draft-filsfils-spring-srv6-network-programming-01

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/Kconfig      |   3 +-
 net/ipv6/Makefile     |   2 +-
 net/ipv6/seg6.c       |   5 +
 net/ipv6/seg6_local.c | 320 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 328 insertions(+), 2 deletions(-)
 create mode 100644 net/ipv6/seg6_local.c

(limited to 'net/ipv6')

diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 50181a96923e..0d722396dce6 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -311,7 +311,8 @@ config IPV6_SEG6_LWTUNNEL
 	---help---
 	  Support for encapsulation of packets within an outer IPv6
 	  header and a Segment Routing Header using the lightweight
-	  tunnels mechanism.
+	  tunnels mechanism. Also enable support for advanced local
+	  processing of SRv6 packets based on their active segment.
 
 	  If unsure, say N.
 
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index f8b24c2e0d77..10e342363793 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -23,7 +23,7 @@ ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o
 ipv6-$(CONFIG_PROC_FS) += proc.o
 ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o
 ipv6-$(CONFIG_NETLABEL) += calipso.o
-ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o
+ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o seg6_local.o
 ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o
 
 ipv6-objs += $(ipv6-y)
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index 81c2339b3285..c81407770956 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -456,6 +456,10 @@ int __init seg6_init(void)
 	err = seg6_iptunnel_init();
 	if (err)
 		goto out_unregister_pernet;
+
+	err = seg6_local_init();
+	if (err)
+		goto out_unregister_pernet;
 #endif
 
 #ifdef CONFIG_IPV6_SEG6_HMAC
@@ -471,6 +475,7 @@ out:
 #ifdef CONFIG_IPV6_SEG6_HMAC
 out_unregister_iptun:
 #ifdef CONFIG_IPV6_SEG6_LWTUNNEL
+	seg6_local_exit();
 	seg6_iptunnel_exit();
 #endif
 #endif
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
new file mode 100644
index 000000000000..53615d7e0723
--- /dev/null
+++ b/net/ipv6/seg6_local.c
@@ -0,0 +1,320 @@
+/*
+ *  SR-IPv6 implementation
+ *
+ *  Author:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *        modify it under the terms of the GNU General Public License
+ *        as published by the Free Software Foundation; either version
+ *        2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/types.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/lwtunnel.h>
+#include <net/netevent.h>
+#include <net/netns/generic.h>
+#include <net/ip6_fib.h>
+#include <net/route.h>
+#include <net/seg6.h>
+#include <linux/seg6.h>
+#include <linux/seg6_local.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#include <net/dst_cache.h>
+#ifdef CONFIG_IPV6_SEG6_HMAC
+#include <net/seg6_hmac.h>
+#endif
+
+struct seg6_local_lwt;
+
+struct seg6_action_desc {
+	int action;
+	unsigned long attrs;
+	int (*input)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
+	int static_headroom;
+};
+
+struct seg6_local_lwt {
+	int action;
+	struct ipv6_sr_hdr *srh;
+	int table;
+	struct in_addr nh4;
+	struct in6_addr nh6;
+	int iif;
+	int oif;
+
+	int headroom;
+	struct seg6_action_desc *desc;
+};
+
+static struct seg6_local_lwt *seg6_local_lwtunnel(struct lwtunnel_state *lwt)
+{
+	return (struct seg6_local_lwt *)lwt->data;
+}
+
+static struct seg6_action_desc seg6_action_table[] = {
+	{
+		.action		= SEG6_LOCAL_ACTION_END,
+		.attrs		= 0,
+	},
+};
+
+static struct seg6_action_desc *__get_action_desc(int action)
+{
+	struct seg6_action_desc *desc;
+	int i, count;
+
+	count = sizeof(seg6_action_table) / sizeof(struct seg6_action_desc);
+	for (i = 0; i < count; i++) {
+		desc = &seg6_action_table[i];
+		if (desc->action == action)
+			return desc;
+	}
+
+	return NULL;
+}
+
+static int seg6_local_input(struct sk_buff *skb)
+{
+	struct dst_entry *orig_dst = skb_dst(skb);
+	struct seg6_action_desc *desc;
+	struct seg6_local_lwt *slwt;
+
+	slwt = seg6_local_lwtunnel(orig_dst->lwtstate);
+	desc = slwt->desc;
+
+	return desc->input(skb, slwt);
+}
+
+static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
+	[SEG6_LOCAL_ACTION]	= { .type = NLA_U32 },
+	[SEG6_LOCAL_SRH]	= { .type = NLA_BINARY },
+	[SEG6_LOCAL_TABLE]	= { .type = NLA_U32 },
+	[SEG6_LOCAL_NH4]	= { .type = NLA_BINARY,
+				    .len = sizeof(struct in_addr) },
+	[SEG6_LOCAL_NH6]	= { .type = NLA_BINARY,
+				    .len = sizeof(struct in6_addr) },
+	[SEG6_LOCAL_IIF]	= { .type = NLA_U32 },
+	[SEG6_LOCAL_OIF]	= { .type = NLA_U32 },
+};
+
+struct seg6_action_param {
+	int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt);
+	int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
+	int (*cmp)(struct seg6_local_lwt *a, struct seg6_local_lwt *b);
+};
+
+static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = {
+	[SEG6_LOCAL_SRH]	= { .parse = NULL,
+				    .put = NULL,
+				    .cmp = NULL },
+
+	[SEG6_LOCAL_TABLE]	= { .parse = NULL,
+				    .put = NULL,
+				    .cmp = NULL },
+
+	[SEG6_LOCAL_NH4]	= { .parse = NULL,
+				    .put = NULL,
+				    .cmp = NULL },
+
+	[SEG6_LOCAL_NH6]	= { .parse = NULL,
+				    .put = NULL,
+				    .cmp = NULL },
+
+	[SEG6_LOCAL_IIF]	= { .parse = NULL,
+				    .put = NULL,
+				    .cmp = NULL },
+
+	[SEG6_LOCAL_OIF]	= { .parse = NULL,
+				    .put = NULL,
+				    .cmp = NULL },
+};
+
+static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+{
+	struct seg6_action_param *param;
+	struct seg6_action_desc *desc;
+	int i, err;
+
+	desc = __get_action_desc(slwt->action);
+	if (!desc)
+		return -EINVAL;
+
+	if (!desc->input)
+		return -EOPNOTSUPP;
+
+	slwt->desc = desc;
+	slwt->headroom += desc->static_headroom;
+
+	for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) {
+		if (desc->attrs & (1 << i)) {
+			if (!attrs[i])
+				return -EINVAL;
+
+			param = &seg6_action_params[i];
+
+			err = param->parse(attrs, slwt);
+			if (err < 0)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+static int seg6_local_build_state(struct nlattr *nla, unsigned int family,
+				  const void *cfg, struct lwtunnel_state **ts,
+				  struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[SEG6_LOCAL_MAX + 1];
+	struct lwtunnel_state *newts;
+	struct seg6_local_lwt *slwt;
+	int err;
+
+	err = nla_parse_nested(tb, SEG6_LOCAL_MAX, nla, seg6_local_policy,
+			       extack);
+
+	if (err < 0)
+		return err;
+
+	if (!tb[SEG6_LOCAL_ACTION])
+		return -EINVAL;
+
+	newts = lwtunnel_state_alloc(sizeof(*slwt));
+	if (!newts)
+		return -ENOMEM;
+
+	slwt = seg6_local_lwtunnel(newts);
+	slwt->action = nla_get_u32(tb[SEG6_LOCAL_ACTION]);
+
+	err = parse_nla_action(tb, slwt);
+	if (err < 0)
+		goto out_free;
+
+	newts->type = LWTUNNEL_ENCAP_SEG6_LOCAL;
+	newts->flags = LWTUNNEL_STATE_INPUT_REDIRECT;
+	newts->headroom = slwt->headroom;
+
+	*ts = newts;
+
+	return 0;
+
+out_free:
+	kfree(slwt->srh);
+	kfree(newts);
+	return err;
+}
+
+static void seg6_local_destroy_state(struct lwtunnel_state *lwt)
+{
+	struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt);
+
+	kfree(slwt->srh);
+}
+
+static int seg6_local_fill_encap(struct sk_buff *skb,
+				 struct lwtunnel_state *lwt)
+{
+	struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt);
+	struct seg6_action_param *param;
+	int i, err;
+
+	if (nla_put_u32(skb, SEG6_LOCAL_ACTION, slwt->action))
+		return -EMSGSIZE;
+
+	for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) {
+		if (slwt->desc->attrs & (1 << i)) {
+			param = &seg6_action_params[i];
+			err = param->put(skb, slwt);
+			if (err < 0)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+static int seg6_local_get_encap_size(struct lwtunnel_state *lwt)
+{
+	struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt);
+	unsigned long attrs;
+	int nlsize;
+
+	nlsize = nla_total_size(4); /* action */
+
+	attrs = slwt->desc->attrs;
+
+	if (attrs & (1 << SEG6_LOCAL_SRH))
+		nlsize += nla_total_size((slwt->srh->hdrlen + 1) << 3);
+
+	if (attrs & (1 << SEG6_LOCAL_TABLE))
+		nlsize += nla_total_size(4);
+
+	if (attrs & (1 << SEG6_LOCAL_NH4))
+		nlsize += nla_total_size(4);
+
+	if (attrs & (1 << SEG6_LOCAL_NH6))
+		nlsize += nla_total_size(16);
+
+	if (attrs & (1 << SEG6_LOCAL_IIF))
+		nlsize += nla_total_size(4);
+
+	if (attrs & (1 << SEG6_LOCAL_OIF))
+		nlsize += nla_total_size(4);
+
+	return nlsize;
+}
+
+static int seg6_local_cmp_encap(struct lwtunnel_state *a,
+				struct lwtunnel_state *b)
+{
+	struct seg6_local_lwt *slwt_a, *slwt_b;
+	struct seg6_action_param *param;
+	int i;
+
+	slwt_a = seg6_local_lwtunnel(a);
+	slwt_b = seg6_local_lwtunnel(b);
+
+	if (slwt_a->action != slwt_b->action)
+		return 1;
+
+	if (slwt_a->desc->attrs != slwt_b->desc->attrs)
+		return 1;
+
+	for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) {
+		if (slwt_a->desc->attrs & (1 << i)) {
+			param = &seg6_action_params[i];
+			if (param->cmp(slwt_a, slwt_b))
+				return 1;
+		}
+	}
+
+	return 0;
+}
+
+static const struct lwtunnel_encap_ops seg6_local_ops = {
+	.build_state	= seg6_local_build_state,
+	.destroy_state	= seg6_local_destroy_state,
+	.input		= seg6_local_input,
+	.fill_encap	= seg6_local_fill_encap,
+	.get_encap_size	= seg6_local_get_encap_size,
+	.cmp_encap	= seg6_local_cmp_encap,
+	.owner		= THIS_MODULE,
+};
+
+int __init seg6_local_init(void)
+{
+	return lwtunnel_encap_add_ops(&seg6_local_ops,
+				      LWTUNNEL_ENCAP_SEG6_LOCAL);
+}
+
+void seg6_local_exit(void)
+{
+	lwtunnel_encap_del_ops(&seg6_local_ops, LWTUNNEL_ENCAP_SEG6_LOCAL);
+}
-- 
cgit v1.2.3


From 2d9cc60aee6bfbd72a06516615af9cde9cb6189a Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Sat, 5 Aug 2017 12:38:27 +0200
Subject: ipv6: sr: add rtnetlink functions for seg6local action parameters

This patch adds the necessary functions to parse, fill, and compare
seg6local rtnetlink attributes, for all defined action parameters.

- The SRH parameter defines an SRH to be inserted or encapsulated.
- The TABLE parameter defines the table to use for the route lookup of
  the next segment or the inner decapsulated packet.
- The NH4 parameter defines the IPv4 next-hop for an inner decapsulated
  IPv4 packet.
- The NH6 parameter defines the IPv6 next-hop for the next segment or
  for an inner decapsulated IPv6 packet.
- The IIF parameter defines an ingress interface index.
- The OIF parameter defines an egress interface index.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/seg6_local.c | 211 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 193 insertions(+), 18 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 53615d7e0723..ab1fc1b17ddf 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -104,6 +104,181 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
 	[SEG6_LOCAL_OIF]	= { .type = NLA_U32 },
 };
 
+static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+{
+	struct ipv6_sr_hdr *srh;
+	int len;
+
+	srh = nla_data(attrs[SEG6_LOCAL_SRH]);
+	len = nla_len(attrs[SEG6_LOCAL_SRH]);
+
+	/* SRH must contain at least one segment */
+	if (len < sizeof(*srh) + sizeof(struct in6_addr))
+		return -EINVAL;
+
+	if (!seg6_validate_srh(srh, len))
+		return -EINVAL;
+
+	slwt->srh = kmalloc(len, GFP_KERNEL);
+	if (!slwt->srh)
+		return -ENOMEM;
+
+	memcpy(slwt->srh, srh, len);
+
+	slwt->headroom += len;
+
+	return 0;
+}
+
+static int put_nla_srh(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+	struct ipv6_sr_hdr *srh;
+	struct nlattr *nla;
+	int len;
+
+	srh = slwt->srh;
+	len = (srh->hdrlen + 1) << 3;
+
+	nla = nla_reserve(skb, SEG6_LOCAL_SRH, len);
+	if (!nla)
+		return -EMSGSIZE;
+
+	memcpy(nla_data(nla), srh, len);
+
+	return 0;
+}
+
+static int cmp_nla_srh(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+	int len = (a->srh->hdrlen + 1) << 3;
+
+	if (len != ((b->srh->hdrlen + 1) << 3))
+		return 1;
+
+	return memcmp(a->srh, b->srh, len);
+}
+
+static int parse_nla_table(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+{
+	slwt->table = nla_get_u32(attrs[SEG6_LOCAL_TABLE]);
+
+	return 0;
+}
+
+static int put_nla_table(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+	if (nla_put_u32(skb, SEG6_LOCAL_TABLE, slwt->table))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int cmp_nla_table(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+	if (a->table != b->table)
+		return 1;
+
+	return 0;
+}
+
+static int parse_nla_nh4(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+{
+	memcpy(&slwt->nh4, nla_data(attrs[SEG6_LOCAL_NH4]),
+	       sizeof(struct in_addr));
+
+	return 0;
+}
+
+static int put_nla_nh4(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+	struct nlattr *nla;
+
+	nla = nla_reserve(skb, SEG6_LOCAL_NH4, sizeof(struct in_addr));
+	if (!nla)
+		return -EMSGSIZE;
+
+	memcpy(nla_data(nla), &slwt->nh4, sizeof(struct in_addr));
+
+	return 0;
+}
+
+static int cmp_nla_nh4(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+	return memcmp(&a->nh4, &b->nh4, sizeof(struct in_addr));
+}
+
+static int parse_nla_nh6(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+{
+	memcpy(&slwt->nh6, nla_data(attrs[SEG6_LOCAL_NH6]),
+	       sizeof(struct in6_addr));
+
+	return 0;
+}
+
+static int put_nla_nh6(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+	struct nlattr *nla;
+
+	nla = nla_reserve(skb, SEG6_LOCAL_NH6, sizeof(struct in6_addr));
+	if (!nla)
+		return -EMSGSIZE;
+
+	memcpy(nla_data(nla), &slwt->nh6, sizeof(struct in6_addr));
+
+	return 0;
+}
+
+static int cmp_nla_nh6(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+	return memcmp(&a->nh6, &b->nh6, sizeof(struct in6_addr));
+}
+
+static int parse_nla_iif(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+{
+	slwt->iif = nla_get_u32(attrs[SEG6_LOCAL_IIF]);
+
+	return 0;
+}
+
+static int put_nla_iif(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+	if (nla_put_u32(skb, SEG6_LOCAL_IIF, slwt->iif))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int cmp_nla_iif(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+	if (a->iif != b->iif)
+		return 1;
+
+	return 0;
+}
+
+static int parse_nla_oif(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+{
+	slwt->oif = nla_get_u32(attrs[SEG6_LOCAL_OIF]);
+
+	return 0;
+}
+
+static int put_nla_oif(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+	if (nla_put_u32(skb, SEG6_LOCAL_OIF, slwt->oif))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int cmp_nla_oif(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+	if (a->oif != b->oif)
+		return 1;
+
+	return 0;
+}
+
 struct seg6_action_param {
 	int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt);
 	int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
@@ -111,29 +286,29 @@ struct seg6_action_param {
 };
 
 static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = {
-	[SEG6_LOCAL_SRH]	= { .parse = NULL,
-				    .put = NULL,
-				    .cmp = NULL },
+	[SEG6_LOCAL_SRH]	= { .parse = parse_nla_srh,
+				    .put = put_nla_srh,
+				    .cmp = cmp_nla_srh },
 
-	[SEG6_LOCAL_TABLE]	= { .parse = NULL,
-				    .put = NULL,
-				    .cmp = NULL },
+	[SEG6_LOCAL_TABLE]	= { .parse = parse_nla_table,
+				    .put = put_nla_table,
+				    .cmp = cmp_nla_table },
 
-	[SEG6_LOCAL_NH4]	= { .parse = NULL,
-				    .put = NULL,
-				    .cmp = NULL },
+	[SEG6_LOCAL_NH4]	= { .parse = parse_nla_nh4,
+				    .put = put_nla_nh4,
+				    .cmp = cmp_nla_nh4 },
 
-	[SEG6_LOCAL_NH6]	= { .parse = NULL,
-				    .put = NULL,
-				    .cmp = NULL },
+	[SEG6_LOCAL_NH6]	= { .parse = parse_nla_nh6,
+				    .put = put_nla_nh6,
+				    .cmp = cmp_nla_nh6 },
 
-	[SEG6_LOCAL_IIF]	= { .parse = NULL,
-				    .put = NULL,
-				    .cmp = NULL },
+	[SEG6_LOCAL_IIF]	= { .parse = parse_nla_iif,
+				    .put = put_nla_iif,
+				    .cmp = cmp_nla_iif },
 
-	[SEG6_LOCAL_OIF]	= { .parse = NULL,
-				    .put = NULL,
-				    .cmp = NULL },
+	[SEG6_LOCAL_OIF]	= { .parse = parse_nla_oif,
+				    .put = put_nla_oif,
+				    .cmp = cmp_nla_oif },
 };
 
 static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt)
-- 
cgit v1.2.3


From 140f04c33bbcf89440dcaf8bef04918abff5a52d Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Sat, 5 Aug 2017 12:39:48 +0200
Subject: ipv6: sr: implement several seg6local actions

This patch implements the following seg6local actions.

- SEG6_LOCAL_ACTION_END: regular SRH processing. The DA of the packet
  is updated to the next segment and forwarded accordingly.

- SEG6_LOCAL_ACTION_END_X: same as above, except that the packet is
  forwarded to the specified IPv6 next-hop.

- SEG6_LOCAL_ACTION_END_DX6: decapsulate the packet and forward the
  inner IPv6 packet to the specified IPv6 next-hop.

- SEG6_LOCAL_ACTION_END_B6: insert the specified SRH directly after
  the IPv6 header of the packet.

- SEG6_LOCAL_ACTION_END_B6_ENCAP: encapsulate the packet within
  an outer IPv6 header, containing the specified SRH.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/seg6_local.c | 271 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 271 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index ab1fc1b17ddf..147680e7a00c 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -58,11 +58,282 @@ static struct seg6_local_lwt *seg6_local_lwtunnel(struct lwtunnel_state *lwt)
 	return (struct seg6_local_lwt *)lwt->data;
 }
 
+static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb)
+{
+	struct ipv6_sr_hdr *srh;
+	struct ipv6hdr *hdr;
+	int len;
+
+	hdr = ipv6_hdr(skb);
+	if (hdr->nexthdr != IPPROTO_ROUTING)
+		return NULL;
+
+	srh = (struct ipv6_sr_hdr *)(hdr + 1);
+	len = (srh->hdrlen + 1) << 3;
+
+	if (!pskb_may_pull(skb, sizeof(*hdr) + len))
+		return NULL;
+
+	if (!seg6_validate_srh(srh, len))
+		return NULL;
+
+	return srh;
+}
+
+static struct ipv6_sr_hdr *get_and_validate_srh(struct sk_buff *skb)
+{
+	struct ipv6_sr_hdr *srh;
+
+	srh = get_srh(skb);
+	if (!srh)
+		return NULL;
+
+	if (srh->segments_left == 0)
+		return NULL;
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+	if (!seg6_hmac_validate_skb(skb))
+		return NULL;
+#endif
+
+	return srh;
+}
+
+/* regular endpoint function */
+static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+	struct ipv6_sr_hdr *srh;
+	struct in6_addr *addr;
+
+	srh = get_and_validate_srh(skb);
+	if (!srh)
+		goto drop;
+
+	srh->segments_left--;
+	addr = srh->segments + srh->segments_left;
+
+	ipv6_hdr(skb)->daddr = *addr;
+
+	skb_dst_drop(skb);
+	ip6_route_input(skb);
+
+	return dst_input(skb);
+
+drop:
+	kfree_skb(skb);
+	return -EINVAL;
+}
+
+/* regular endpoint, and forward to specified nexthop */
+static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+	struct net *net = dev_net(skb->dev);
+	struct ipv6_sr_hdr *srh;
+	struct dst_entry *dst;
+	struct in6_addr *addr;
+	struct ipv6hdr *hdr;
+	struct flowi6 fl6;
+	int flags;
+
+	srh = get_and_validate_srh(skb);
+	if (!srh)
+		goto drop;
+
+	srh->segments_left--;
+	addr = srh->segments + srh->segments_left;
+
+	hdr = ipv6_hdr(skb);
+	hdr->daddr = *addr;
+
+	skb_dst_drop(skb);
+
+	fl6.flowi6_iif = skb->dev->ifindex;
+	fl6.daddr = slwt->nh6;
+	fl6.saddr = hdr->saddr;
+	fl6.flowlabel = ip6_flowinfo(hdr);
+	fl6.flowi6_mark = skb->mark;
+	fl6.flowi6_proto = hdr->nexthdr;
+
+	flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE |
+		RT6_LOOKUP_F_REACHABLE;
+
+	dst = ip6_route_input_lookup(net, skb->dev, &fl6, flags);
+	if (dst->dev->flags & IFF_LOOPBACK)
+		goto drop;
+
+	skb_dst_set(skb, dst);
+
+	return dst_input(skb);
+
+drop:
+	kfree_skb(skb);
+	return -EINVAL;
+}
+
+/* decapsulate and forward to specified nexthop */
+static int input_action_end_dx6(struct sk_buff *skb,
+				struct seg6_local_lwt *slwt)
+{
+	struct net *net = dev_net(skb->dev);
+	struct ipv6hdr *inner_hdr;
+	struct ipv6_sr_hdr *srh;
+	struct dst_entry *dst;
+	unsigned int off = 0;
+	struct flowi6 fl6;
+	bool use_nh;
+	int flags;
+
+	/* this function accepts IPv6 encapsulated packets, with either
+	 * an SRH with SL=0, or no SRH.
+	 */
+
+	srh = get_srh(skb);
+	if (srh && srh->segments_left > 0)
+		goto drop;
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+	if (srh && !seg6_hmac_validate_skb(skb))
+		goto drop;
+#endif
+
+	if (ipv6_find_hdr(skb, &off, IPPROTO_IPV6, NULL, NULL) < 0)
+		goto drop;
+
+	if (!pskb_pull(skb, off))
+		goto drop;
+
+	skb_postpull_rcsum(skb, skb_network_header(skb), off);
+
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+	skb->encapsulation = 0;
+
+	inner_hdr = ipv6_hdr(skb);
+
+	/* The inner packet is not associated to any local interface,
+	 * so we do not call netif_rx().
+	 *
+	 * If slwt->nh6 is set to ::, then lookup the nexthop for the
+	 * inner packet's DA. Otherwise, use the specified nexthop.
+	 */
+
+	use_nh = !ipv6_addr_any(&slwt->nh6);
+
+	skb_dst_drop(skb);
+
+	fl6.flowi6_iif = skb->dev->ifindex;
+	fl6.daddr = use_nh ? slwt->nh6 : inner_hdr->daddr;
+	fl6.saddr = inner_hdr->saddr;
+	fl6.flowlabel = ip6_flowinfo(inner_hdr);
+	fl6.flowi6_mark = skb->mark;
+	fl6.flowi6_proto = inner_hdr->nexthdr;
+
+	flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_REACHABLE;
+	if (use_nh)
+		flags |= RT6_LOOKUP_F_IFACE;
+
+	dst = ip6_route_input_lookup(net, skb->dev, &fl6, flags);
+	if (dst->dev->flags & IFF_LOOPBACK)
+		goto drop;
+
+	skb_dst_set(skb, dst);
+
+	return dst_input(skb);
+drop:
+	kfree_skb(skb);
+	return -EINVAL;
+}
+
+/* push an SRH on top of the current one */
+static int input_action_end_b6(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+	struct ipv6_sr_hdr *srh;
+	int err = -EINVAL;
+
+	srh = get_and_validate_srh(skb);
+	if (!srh)
+		goto drop;
+
+	err = seg6_do_srh_inline(skb, slwt->srh);
+	if (err)
+		goto drop;
+
+	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+	skb_dst_drop(skb);
+	ip6_route_input(skb);
+
+	return dst_input(skb);
+
+drop:
+	kfree_skb(skb);
+	return err;
+}
+
+/* encapsulate within an outer IPv6 header and a specified SRH */
+static int input_action_end_b6_encap(struct sk_buff *skb,
+				     struct seg6_local_lwt *slwt)
+{
+	struct ipv6_sr_hdr *srh;
+	struct in6_addr *addr;
+	int err = -EINVAL;
+
+	srh = get_and_validate_srh(skb);
+	if (!srh)
+		goto drop;
+
+	srh->segments_left--;
+	addr = srh->segments + srh->segments_left;
+	ipv6_hdr(skb)->daddr = *addr;
+
+	skb_reset_inner_headers(skb);
+	skb->encapsulation = 1;
+
+	err = seg6_do_srh_encap(skb, slwt->srh);
+	if (err)
+		goto drop;
+
+	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+	skb_dst_drop(skb);
+	ip6_route_input(skb);
+
+	return dst_input(skb);
+
+drop:
+	kfree_skb(skb);
+	return err;
+}
+
 static struct seg6_action_desc seg6_action_table[] = {
 	{
 		.action		= SEG6_LOCAL_ACTION_END,
 		.attrs		= 0,
+		.input		= input_action_end,
+	},
+	{
+		.action		= SEG6_LOCAL_ACTION_END_X,
+		.attrs		= (1 << SEG6_LOCAL_NH6),
+		.input		= input_action_end_x,
 	},
+	{
+		.action		= SEG6_LOCAL_ACTION_END_DX6,
+		.attrs		= (1 << SEG6_LOCAL_NH6),
+		.input		= input_action_end_dx6,
+	},
+	{
+		.action		= SEG6_LOCAL_ACTION_END_B6,
+		.attrs		= (1 << SEG6_LOCAL_SRH),
+		.input		= input_action_end_b6,
+	},
+	{
+		.action		= SEG6_LOCAL_ACTION_END_B6_ENCAP,
+		.attrs		= (1 << SEG6_LOCAL_SRH),
+		.input		= input_action_end_b6_encap,
+		.static_headroom	= sizeof(struct ipv6hdr),
+	}
 };
 
 static struct seg6_action_desc *__get_action_desc(int action)
-- 
cgit v1.2.3


From feca7d8c135bc1527b244fe817b8b6498066ccec Mon Sep 17 00:00:00 2001
From: Vincent Bernat <vincent@bernat.im>
Date: Tue, 8 Aug 2017 20:23:49 +0200
Subject: net: ipv6: avoid overhead when no custom FIB rules are installed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If the user hasn't installed any custom rules, don't go through the
whole FIB rules layer. This is pretty similar to f4530fa574df (ipv4:
Avoid overhead when no custom FIB rules are installed).

Using a micro-benchmark module [1], timing ip6_route_output() with
get_cycles(), with 40,000 routes in the main routing table, before this
patch:

    min=606 max=12911 count=627 average=1959 95th=4903 90th=3747 50th=1602 mad=821
    table=254 avgdepth=21.8 maxdepth=39
    value │                         ┊                            count
      600 │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒                                         199
      880 │▒▒▒░░░░░░░░░░░░░░░░                                      43
     1160 │▒▒▒░░░░░░░░░░░░░░░░░░░░                                  48
     1440 │▒▒▒░░░░░░░░░░░░░░░░░░░░░░░                               43
     1720 │▒▒▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░░                          59
     2000 │▒▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░                      50
     2280 │▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░                    26
     2560 │▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░                  31
     2840 │▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░               28
     3120 │▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░              17
     3400 │▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░             17
     3680 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░             8
     3960 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░           11
     4240 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░            6
     4520 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░           6
     4800 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░           9

After:

    min=544 max=11687 count=627 average=1776 95th=4546 90th=3585 50th=1227 mad=565
    table=254 avgdepth=21.8 maxdepth=39
    value │                         ┊                            count
      540 │▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒                                        201
      800 │▒▒▒▒▒░░░░░░░░░░░░░░░░                                    63
     1060 │▒▒▒▒▒░░░░░░░░░░░░░░░░░░░░░                               68
     1320 │▒▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░                            39
     1580 │▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░                         32
     1840 │▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░                       32
     2100 │▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░                    34
     2360 │▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░                 33
     2620 │▒▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░               26
     2880 │▒░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░              22
     3140 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░              9
     3400 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░             8
     3660 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░             9
     3920 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░            8
     4180 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░           8
     4440 │░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░           8

At the frequency of the host during the bench (~ 3.7 GHz), this is
about a 100 ns difference on the median value.

A next step would be to collapse local and main tables, as in
0ddcf43d5d4a (ipv4: FIB Local/MAIN table collapse).

[1]: https://github.com/vincentbernat/network-lab/blob/master/lab-routes-ipv6/kbench_mod.c

Signed-off-by: Vincent Bernat <vincent@bernat.im>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/fib6_rules.c | 40 +++++++++++++++++++++++++++-------------
 net/ipv6/route.c      |  1 +
 2 files changed, 28 insertions(+), 13 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 2f29e4e33bd3..b240f24a6e52 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -63,19 +63,32 @@ unsigned int fib6_rules_seq_read(struct net *net)
 struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 				   int flags, pol_lookup_t lookup)
 {
-	struct fib_lookup_arg arg = {
-		.lookup_ptr = lookup,
-		.flags = FIB_LOOKUP_NOREF,
-	};
-
-	/* update flow if oif or iif point to device enslaved to l3mdev */
-	l3mdev_update_flow(net, flowi6_to_flowi(fl6));
-
-	fib_rules_lookup(net->ipv6.fib6_rules_ops,
-			 flowi6_to_flowi(fl6), flags, &arg);
-
-	if (arg.result)
-		return arg.result;
+	if (net->ipv6.fib6_has_custom_rules) {
+		struct fib_lookup_arg arg = {
+			.lookup_ptr = lookup,
+			.flags = FIB_LOOKUP_NOREF,
+		};
+
+		/* update flow if oif or iif point to device enslaved to l3mdev */
+		l3mdev_update_flow(net, flowi6_to_flowi(fl6));
+
+		fib_rules_lookup(net->ipv6.fib6_rules_ops,
+				 flowi6_to_flowi(fl6), flags, &arg);
+
+		if (arg.result)
+			return arg.result;
+	} else {
+		struct rt6_info *rt;
+
+		rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, flags);
+		if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN)
+			return &rt->dst;
+		ip6_rt_put(rt);
+		rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags);
+		if (rt->dst.error != -EAGAIN)
+			return &rt->dst;
+		ip6_rt_put(rt);
+	}
 
 	dst_hold(&net->ipv6.ip6_null_entry->dst);
 	return &net->ipv6.ip6_null_entry->dst;
@@ -245,6 +258,7 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 	rule6->dst.plen = frh->dst_len;
 	rule6->tclass = frh->tos;
 
+	net->ipv6.fib6_has_custom_rules = true;
 	err = 0;
 errout:
 	return err;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index aba07fce67fb..7ecbe5eb19f8 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3934,6 +3934,7 @@ static int __net_init ip6_route_net_init(struct net *net)
 			 ip6_template_metrics, true);
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
+	net->ipv6.fib6_has_custom_rules = false;
 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
 					       sizeof(*net->ipv6.ip6_prohibit_entry),
 					       GFP_KERNEL);
-- 
cgit v1.2.3


From b97bac64a589d0158cf866e8995e831030f68f4f Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 9 Aug 2017 20:41:48 +0200
Subject: rtnetlink: make rtnl_register accept a flags parameter

This change allows us to later indicate to rtnetlink core that certain
doit functions should be called without acquiring rtnl_mutex.

This change should have no effect, we simply replace the last (now
unused) calcit argument with the new flag.

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c  | 14 +++++++-------
 net/ipv6/addrlabel.c |  6 +++---
 net/ipv6/ip6_fib.c   |  2 +-
 net/ipv6/ip6mr.c     |  2 +-
 net/ipv6/route.c     |  6 +++---
 5 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 30ee23eef268..640792e1ecb7 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -6605,21 +6605,21 @@ int __init addrconf_init(void)
 	rtnl_af_register(&inet6_ops);
 
 	err = __rtnl_register(PF_INET6, RTM_GETLINK, NULL, inet6_dump_ifinfo,
-			      NULL);
+			      0);
 	if (err < 0)
 		goto errout;
 
 	/* Only the first call to __rtnl_register can fail */
-	__rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, NULL);
-	__rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, NULL);
+	__rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, 0);
+	__rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, 0);
 	__rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr,
-			inet6_dump_ifaddr, NULL);
+			inet6_dump_ifaddr, 0);
 	__rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL,
-			inet6_dump_ifmcaddr, NULL);
+			inet6_dump_ifmcaddr, 0);
 	__rtnl_register(PF_INET6, RTM_GETANYCAST, NULL,
-			inet6_dump_ifacaddr, NULL);
+			inet6_dump_ifacaddr, 0);
 	__rtnl_register(PF_INET6, RTM_GETNETCONF, inet6_netconf_get_devconf,
-			inet6_netconf_dump_devconf, NULL);
+			inet6_netconf_dump_devconf, 0);
 
 	ipv6_addr_label_rtnl_register();
 
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index 7a428f65c7ec..cea5eb488013 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -593,10 +593,10 @@ out:
 void __init ipv6_addr_label_rtnl_register(void)
 {
 	__rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel,
-			NULL, NULL);
+			NULL, 0);
 	__rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel,
-			NULL, NULL);
+			NULL, 0);
 	__rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get,
-			ip6addrlbl_dump, NULL);
+			ip6addrlbl_dump, 0);
 }
 
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 69ed0043d117..8c58c7558de0 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -2038,7 +2038,7 @@ int __init fib6_init(void)
 		goto out_kmem_cache_create;
 
 	ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib,
-			      NULL);
+			      0);
 	if (ret)
 		goto out_unregister_subsys;
 
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 7454850f2098..f5500f5444e9 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1427,7 +1427,7 @@ int __init ip6_mr_init(void)
 	}
 #endif
 	rtnl_register(RTNL_FAMILY_IP6MR, RTM_GETROUTE, NULL,
-		      ip6mr_rtm_dumproute, NULL);
+		      ip6mr_rtm_dumproute, 0);
 	return 0;
 #ifdef CONFIG_IPV6_PIMSM_V2
 add_proto_fail:
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c73e61750642..035762fed07d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4105,9 +4105,9 @@ int __init ip6_route_init(void)
 		goto fib6_rules_init;
 
 	ret = -ENOBUFS;
-	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
-	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
-	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
+	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
+	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
+	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, 0))
 		goto out_register_late_subsys;
 
 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
-- 
cgit v1.2.3


From 6eb7939371104e4ff78261c165c6f33ff6d90031 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 8 Aug 2017 15:51:02 -0600
Subject: net: ipv6: lower ndisc notifier priority below addrconf

ndisc_notify is used to send unsolicited neighbor advertisements
(e.g., on a link up). Currently, the ndisc notifier is run before the
addrconf notifier, which means NAs are not sent for link-local addresses
which are added by the addrconf notifier.

Fix by lowering the priority of the ndisc notifier. Setting the priority
to ADDRCONF_NOTIFY_PRIORITY - 5 means it runs after addrconf and before
the route notifier which is ADDRCONF_NOTIFY_PRIORITY - 10.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ndisc.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 0327c1f2e6fc..5e338eb89509 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1779,6 +1779,7 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event,
 
 static struct notifier_block ndisc_netdev_notifier = {
 	.notifier_call = ndisc_netdev_event,
+	.priority = ADDRCONF_NOTIFY_PRIORITY - 5,
 };
 
 #ifdef CONFIG_SYSCTL
-- 
cgit v1.2.3


From 077fbac405bfc6d41419ad6c1725804ad4e9887c Mon Sep 17 00:00:00 2001
From: Lorenzo Colitti <lorenzo@google.com>
Date: Fri, 11 Aug 2017 02:11:33 +0900
Subject: net: xfrm: support setting an output mark.

On systems that use mark-based routing it may be necessary for
routing lookups to use marks in order for packets to be routed
correctly. An example of such a system is Android, which uses
socket marks to route packets via different networks.

Currently, routing lookups in tunnel mode always use a mark of
zero, making routing incorrect on such systems.

This patch adds a new output_mark element to the xfrm state and
a corresponding XFRMA_OUTPUT_MARK netlink attribute. The output
mark differs from the existing xfrm mark in two ways:

1. The xfrm mark is used to match xfrm policies and states, while
   the xfrm output mark is used to set the mark (and influence
   the routing) of the packets emitted by those states.
2. The existing mark is constrained to be a subset of the bits of
   the originating socket or transformed packet, but the output
   mark is arbitrary and depends only on the state.

The use of a separate mark provides additional flexibility. For
example:

- A packet subject to two transforms (e.g., transport mode inside
  tunnel mode) can have two different output marks applied to it,
  one for the transport mode SA and one for the tunnel mode SA.
- On a system where socket marks determine routing, the packets
  emitted by an IPsec tunnel can be routed based on a mark that
  is determined by the tunnel, not by the marks of the
  unencrypted packets.
- Support for setting the output marks can be introduced without
  breaking any existing setups that employ both mark-based
  routing and xfrm tunnel mode. Simply changing the code to use
  the xfrm mark for routing output packets could
  change behaviour in a way that breaks these setups.

If the output mark is unspecified or set to zero, the mark is not
set or changed.

Tested: make allyesconfig; make -j64
Tested: https://android-review.googlesource.com/452776
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/xfrm6_policy.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index f44b25a48478..11d1314ab6c5 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -27,7 +27,8 @@
 
 static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,
 					  const xfrm_address_t *saddr,
-					  const xfrm_address_t *daddr)
+					  const xfrm_address_t *daddr,
+					  u32 mark)
 {
 	struct flowi6 fl6;
 	struct dst_entry *dst;
@@ -36,6 +37,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,
 	memset(&fl6, 0, sizeof(fl6));
 	fl6.flowi6_oif = l3mdev_master_ifindex_by_index(net, oif);
 	fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF;
+	fl6.flowi6_mark = mark;
 	memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr));
 	if (saddr)
 		memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr));
@@ -52,12 +54,13 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,
 }
 
 static int xfrm6_get_saddr(struct net *net, int oif,
-			   xfrm_address_t *saddr, xfrm_address_t *daddr)
+			   xfrm_address_t *saddr, xfrm_address_t *daddr,
+			   u32 mark)
 {
 	struct dst_entry *dst;
 	struct net_device *dev;
 
-	dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr);
+	dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr, mark);
 	if (IS_ERR(dst))
 		return -EHOSTUNREACH;
 
-- 
cgit v1.2.3


From fe4007999599c02598c17b643e8de43e487d48e8 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Tue, 15 Aug 2017 09:09:49 +0200
Subject: ipv6: fib: Provide offload indication using nexthop flags

IPv6 routes currently lack nexthop flags as in IPv4. This has several
implications.

In the forwarding path, it requires us to check the carrier state of the
nexthop device and potentially ignore a linkdown route, instead of
checking for RTNH_F_LINKDOWN.

It also requires capable drivers to use the user facing IPv6-specific
route flags to provide offload indication, instead of using the nexthop
flags as in IPv4.

Add nexthop flags to IPv6 routes in the 40 bytes hole and use it to
provide offload indication instead of the RTF_OFFLOAD flag, which is
removed while it's still not part of any official kernel release.

In the near future we would like to use the field for the
RTNH_F_{LINKDOWN,DEAD} flags, but this change is more involved and might
not be ready in time for the current cycle.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 035762fed07d..6793135d49db 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1820,11 +1820,6 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 		goto out;
 	}
 
-	if (cfg->fc_flags & RTF_OFFLOAD) {
-		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_OFFLOAD");
-		goto out;
-	}
-
 	if (cfg->fc_dst_len > 128) {
 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
 		goto out;
@@ -3335,7 +3330,7 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
 			goto nla_put_failure;
 	}
 
-	if (rt->rt6i_flags & RTF_OFFLOAD)
+	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
 		*flags |= RTNH_F_OFFLOAD;
 
 	/* not needed for multipath encoding b/c it has a rtnexthop struct */
-- 
cgit v1.2.3


From 121622dba8da9c709b72d801eae7664fa7da7c36 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 15 Aug 2017 16:34:42 +0200
Subject: ipv6: route: make rtm_getroute not assume rtnl is locked

__dev_get_by_index assumes RTNL is held, use _rcu version instead.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 6793135d49db..60705b4d2c62 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3611,8 +3611,11 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		struct net_device *dev;
 		int flags = 0;
 
-		dev = __dev_get_by_index(net, iif);
+		rcu_read_lock();
+
+		dev = dev_get_by_index_rcu(net, iif);
 		if (!dev) {
+			rcu_read_unlock();
 			err = -ENODEV;
 			goto errout;
 		}
@@ -3624,6 +3627,8 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 
 		if (!fibmatch)
 			dst = ip6_route_input_lookup(net, dev, &fl6, flags);
+
+		rcu_read_unlock();
 	} else {
 		fl6.flowi6_oif = oif;
 
-- 
cgit v1.2.3


From e3a22b7f5cfb3b422669fbf3d668315ac7634e5a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 15 Aug 2017 16:34:43 +0200
Subject: ipv6: route: set ipv6 RTM_GETROUTE to not use rtnl

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 60705b4d2c62..11ff19ba7efd 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4107,7 +4107,8 @@ int __init ip6_route_init(void)
 	ret = -ENOBUFS;
 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
-	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, 0))
+	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
+			    RTNL_FLAG_DOIT_UNLOCKED))
 		goto out_register_late_subsys;
 
 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
-- 
cgit v1.2.3


From 45f91bdcd5c5ba559a4bb7c3a0e0709476cf570f Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 15 Aug 2017 22:31:10 -0700
Subject: net: add sendmsg_locked and sendpage_locked to af_inet6

To complete the sendmsg_locked and sendpage_locked implementation add
the hooks for af_inet6 as well.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/af_inet6.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 0a7c74049a0c..3b58ee709f33 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -554,6 +554,8 @@ const struct proto_ops inet6_stream_ops = {
 	.recvmsg	   = inet_recvmsg,		/* ok		*/
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = inet_sendpage,
+	.sendmsg_locked    = tcp_sendmsg_locked,
+	.sendpage_locked   = tcp_sendpage_locked,
 	.splice_read	   = tcp_splice_read,
 	.read_sock	   = tcp_read_sock,
 	.peek_len	   = tcp_peek_len,
-- 
cgit v1.2.3


From 401481e06099533892b3d8f1db498cbc480b5b24 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 18 Aug 2017 13:34:22 +0200
Subject: ipv6: fix false-positive maybe-uninitialized warning

Adding a lock around one of the assignments prevents gcc from
tracking the state of the local 'fibmatch' variable, so it can no
longer prove that 'dst' is always initialized, leading to a bogus
warning:

net/ipv6/route.c: In function 'inet6_rtm_getroute':
net/ipv6/route.c:3659:2: error: 'dst' may be used uninitialized in this function [-Werror=maybe-uninitialized]

This moves the other assignment into the same lock to shut up the
warning.

Fixes: 121622dba8da ("ipv6: route: make rtm_getroute not assume rtnl is locked")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index dc021ed6dd37..bec12ae3e6b7 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3624,6 +3624,8 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 
 		if (!fibmatch)
 			dst = ip6_route_input_lookup(net, dev, &fl6, flags);
+		else
+			dst = ip6_route_lookup(net, &fl6, 0);
 
 		rcu_read_unlock();
 	} else {
@@ -3631,10 +3633,10 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 
 		if (!fibmatch)
 			dst = ip6_route_output(net, NULL, &fl6);
+		else
+			dst = ip6_route_lookup(net, &fl6, 0);
 	}
 
-	if (fibmatch)
-		dst = ip6_route_lookup(net, &fl6, 0);
 
 	rt = container_of(dst, struct rt6_info, dst);
 	if (rt->dst.error) {
-- 
cgit v1.2.3


From 4832c30d5458387ff2533ff66fbde26ad8bb5a2d Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 17 Aug 2017 12:17:20 -0700
Subject: net: ipv6: put host and anycast routes on device with address

One nagging difference between ipv4 and ipv6 is host routes for ipv6
addresses are installed using the loopback device or VRF / L3 Master
device. e.g.,

    2001:db8:1::/120 dev veth0 proto kernel metric 256 pref medium
    local 2001:db8:1::1 dev lo table local proto kernel metric 0 pref medium

Using the loopback device is convenient -- necessary for local tx, but
has some nasty side effects, most notably setting the 'lo' device down
causes all host routes for all local IPv6 addresses to be removed from the
FIB and completely breaks IPv6 networking across all interfaces.

This patch puts FIB entries for IPv6 routes against the device. This
simplifies the routes in the FIB, for example by making dst->dev and
rt6i_idev->dev the same (a future patch can look at removing the device
reference taken for rt6i_idev for FIB entries).

When copies are made on FIB lookups, the cloned route has dst->dev
set to loopback (or the L3 master device). This is needed for the
local Tx of packets to local addresses.

With fib entries allocated against the real network device, the addrconf
code that reinserts host routes on admin up of 'lo' is no longer needed.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 42 ------------------------------------------
 net/ipv6/icmp.c     | 15 +++++++++++++--
 net/ipv6/route.c    | 46 ++++++++++++++++++++++++++++++++++------------
 3 files changed, 47 insertions(+), 56 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 640792e1ecb7..45d0a24644de 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3030,9 +3030,6 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
 static void init_loopback(struct net_device *dev)
 {
 	struct inet6_dev  *idev;
-	struct net_device *sp_dev;
-	struct inet6_ifaddr *sp_ifa;
-	struct rt6_info *sp_rt;
 
 	/* ::1 */
 
@@ -3045,45 +3042,6 @@ static void init_loopback(struct net_device *dev)
 	}
 
 	add_addr(idev, &in6addr_loopback, 128, IFA_HOST);
-
-	/* Add routes to other interface's IPv6 addresses */
-	for_each_netdev(dev_net(dev), sp_dev) {
-		if (!strcmp(sp_dev->name, dev->name))
-			continue;
-
-		idev = __in6_dev_get(sp_dev);
-		if (!idev)
-			continue;
-
-		read_lock_bh(&idev->lock);
-		list_for_each_entry(sp_ifa, &idev->addr_list, if_list) {
-
-			if (sp_ifa->flags & (IFA_F_DADFAILED | IFA_F_TENTATIVE))
-				continue;
-
-			if (sp_ifa->rt) {
-				/* This dst has been added to garbage list when
-				 * lo device down, release this obsolete dst and
-				 * reallocate a new router for ifa.
-				 */
-				if (!sp_ifa->rt->rt6i_node) {
-					ip6_rt_put(sp_ifa->rt);
-					sp_ifa->rt = NULL;
-				} else {
-					continue;
-				}
-			}
-
-			sp_rt = addrconf_dst_alloc(idev, &sp_ifa->addr, false);
-
-			/* Failure cases are ignored */
-			if (!IS_ERR(sp_rt)) {
-				sp_ifa->rt = sp_rt;
-				ip6_ins_rt(sp_rt);
-			}
-		}
-		read_unlock_bh(&idev->lock);
-	}
 }
 
 void addrconf_add_linklocal(struct inet6_dev *idev,
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 8d7b113958b1..4f82830fc068 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -459,9 +459,20 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	 *	Source addr check
 	 */
 
-	if (__ipv6_addr_needs_scope_id(addr_type))
+	if (__ipv6_addr_needs_scope_id(addr_type)) {
 		iif = skb->dev->ifindex;
-	else {
+
+		/* for local packets, get the real device index */
+		if (iif == LOOPBACK_IFINDEX) {
+			dst = skb_dst(skb);
+			if (dst) {
+				struct rt6_info *rt;
+
+				rt = container_of(dst, struct rt6_info, dst);
+				iif = rt->rt6i_idev->dev->ifindex;
+			}
+		}
+	} else {
 		dst = skb_dst(skb);
 		iif = l3mdev_master_ifindex(dst ? dst->dev : skb->dev);
 	}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index bec12ae3e6b7..9b02064c3335 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -958,10 +958,34 @@ int ip6_ins_rt(struct rt6_info *rt)
 	return __ip6_ins_rt(rt, &info, &mxc, NULL);
 }
 
+/* called with rcu_lock held */
+static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
+{
+	struct net_device *dev = rt->dst.dev;
+
+	if (rt->rt6i_flags & RTF_LOCAL) {
+		/* for copies of local routes, dst->dev needs to be the
+		 * device if it is a master device, the master device if
+		 * device is enslaved, and the loopback as the default
+		 */
+		if (netif_is_l3_slave(dev) &&
+		    !rt6_need_strict(&rt->rt6i_dst.addr))
+			dev = l3mdev_master_dev_rcu(dev);
+		else if (!netif_is_l3_master(dev))
+			dev = dev_net(dev)->loopback_dev;
+		/* last case is netif_is_l3_master(dev) is true in which
+		 * case we want dev returned to be dev
+		 */
+	}
+
+	return dev;
+}
+
 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
 					   const struct in6_addr *daddr,
 					   const struct in6_addr *saddr)
 {
+	struct net_device *dev;
 	struct rt6_info *rt;
 
 	/*
@@ -971,8 +995,10 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
 		ort = (struct rt6_info *)ort->dst.from;
 
-	rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
-
+	rcu_read_lock();
+	dev = ip6_rt_get_dev_rcu(ort);
+	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
+	rcu_read_unlock();
 	if (!rt)
 		return NULL;
 
@@ -1000,11 +1026,13 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
 
 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
 {
+	struct net_device *dev;
 	struct rt6_info *pcpu_rt;
 
-	pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
-				  rt->dst.dev, rt->dst.flags);
-
+	rcu_read_lock();
+	dev = ip6_rt_get_dev_rcu(rt);
+	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
+	rcu_read_unlock();
 	if (!pcpu_rt)
 		return NULL;
 	ip6_rt_copy_init(pcpu_rt, rt);
@@ -2688,15 +2716,9 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
 {
 	u32 tb_id;
 	struct net *net = dev_net(idev->dev);
-	struct net_device *dev = net->loopback_dev;
+	struct net_device *dev = idev->dev;
 	struct rt6_info *rt;
 
-	/* use L3 Master device as loopback for host routes if device
-	 * is enslaved and address is not link local or multicast
-	 */
-	if (!rt6_need_strict(addr))
-		dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
-
 	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
 	if (!rt)
 		return ERR_PTR(-ENOMEM);
-- 
cgit v1.2.3


From 98aaa913b4ed250324429f0a9e6d5f77a3b5276c Mon Sep 17 00:00:00 2001
From: Mike Maloney <maloney@google.com>
Date: Tue, 22 Aug 2017 17:08:48 -0400
Subject: tcp: Extend SOF_TIMESTAMPING_RX_SOFTWARE to TCP recvmsg

When SOF_TIMESTAMPING_RX_SOFTWARE is enabled for tcp sockets, return the
timestamp corresponding to the highest sequence number data returned.

Previously, skb->tstamp was overwritten when a TCP packet was placed
in the out-of-order queue.  While the packet is in the ooo queue, save the
timestamp in TCP_SKB_CB.  This space is shared with the gso_*
options, which are only used on the tx path, and a previously unused
4-byte hole.

When skbs are coalesced either in the sk_receive_queue or the
out_of_order_queue always choose the timestamp of the appended skb to
maintain the invariant of returning the timestamp of the last byte in
the recvmsg buffer.

Signed-off-by: Mike Maloney <maloney@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d79a1af3252e..abba3bc2a3d9 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1394,6 +1394,8 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
 	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
 	TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr);
 	TCP_SKB_CB(skb)->sacked = 0;
+	TCP_SKB_CB(skb)->has_rxtstamp =
+			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
 }
 
 static int tcp_v6_rcv(struct sk_buff *skb)
-- 
cgit v1.2.3


From 0d03510038bda70b5a4a252e8216822e6ce0cbdb Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 12 Aug 2017 00:57:02 +0200
Subject: netfilter: conntrack: compute l3proto nla size at compile time

avoids a pointer and allows struct to be const later on.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index ddef5ee9e0a8..6b4d59fd0214 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -308,11 +308,6 @@ static int ipv6_nlattr_to_tuple(struct nlattr *tb[],
 
 	return 0;
 }
-
-static int ipv6_nlattr_tuple_size(void)
-{
-	return nla_policy_len(ipv6_nla_policy, CTA_IP_MAX + 1);
-}
 #endif
 
 static int ipv6_hooks_register(struct net *net)
@@ -360,9 +355,10 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = {
 	.get_l4proto		= ipv6_get_l4proto,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.tuple_to_nlattr	= ipv6_tuple_to_nlattr,
-	.nlattr_tuple_size	= ipv6_nlattr_tuple_size,
 	.nlattr_to_tuple	= ipv6_nlattr_to_tuple,
 	.nla_policy		= ipv6_nla_policy,
+	.nla_size		= NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])) +
+				  NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])),
 #endif
 	.net_ns_get		= ipv6_hooks_register,
 	.net_ns_put		= ipv6_hooks_unregister,
@@ -421,6 +417,12 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
 
 	need_conntrack();
 
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+	if (WARN_ON(nla_policy_len(ipv6_nla_policy, CTA_IP_MAX + 1) !=
+	    nf_conntrack_l3proto_ipv6.nla_size))
+		return -EINVAL;
+#endif
+
 	ret = nf_register_sockopt(&so_getorigdst6);
 	if (ret < 0) {
 		pr_err("Unable to register netfilter socket option\n");
-- 
cgit v1.2.3


From a3134d537f8209f5b149d7ed9f287047158845f0 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 12 Aug 2017 00:57:03 +0200
Subject: netfilter: conntrack: remove protocol name from l3proto struct

no need to waste storage for something that is only needed
in one place and can be deduced from protocol number.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 6b4d59fd0214..91d37fbe28de 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -348,7 +348,6 @@ static void ipv6_hooks_unregister(struct net *net)
 
 struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = {
 	.l3proto		= PF_INET6,
-	.name			= "ipv6",
 	.pkt_to_tuple		= ipv6_pkt_to_tuple,
 	.invert_tuple		= ipv6_invert_tuple,
 	.print_tuple		= ipv6_print_tuple,
-- 
cgit v1.2.3


From 09ec82f5af99d1e35614eb0844b920fc335a313d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 12 Aug 2017 00:57:04 +0200
Subject: netfilter: conntrack: remove protocol name from l4proto struct

no need to waste storage for something that is only needed
in one place and can be deduced from protocol number.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index d5f028e33f65..805ab122767a 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -367,7 +367,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly =
 {
 	.l3proto		= PF_INET6,
 	.l4proto		= IPPROTO_ICMPV6,
-	.name			= "icmpv6",
 	.pkt_to_tuple		= icmpv6_pkt_to_tuple,
 	.invert_tuple		= icmpv6_invert_tuple,
 	.print_tuple		= icmpv6_print_tuple,
-- 
cgit v1.2.3


From 91950833dd5a34ac6336aa88da6d43aaeb56ac6d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 12 Aug 2017 00:57:06 +0200
Subject: netfilter: conntrack: place print_tuple in procfs part

CONFIG_NF_CONNTRACK_PROCFS is deprecated, no need to use a function
pointer in the trackers for this. Place the printf formatting in
the one place that uses it.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c |  8 --------
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 11 -----------
 2 files changed, 19 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 91d37fbe28de..fe01dc953c56 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -67,13 +67,6 @@ static bool ipv6_invert_tuple(struct nf_conntrack_tuple *tuple,
 	return true;
 }
 
-static void ipv6_print_tuple(struct seq_file *s,
-			    const struct nf_conntrack_tuple *tuple)
-{
-	seq_printf(s, "src=%pI6 dst=%pI6 ",
-		   tuple->src.u3.ip6, tuple->dst.u3.ip6);
-}
-
 static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
 			    unsigned int *dataoff, u_int8_t *protonum)
 {
@@ -350,7 +343,6 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = {
 	.l3proto		= PF_INET6,
 	.pkt_to_tuple		= ipv6_pkt_to_tuple,
 	.invert_tuple		= ipv6_invert_tuple,
-	.print_tuple		= ipv6_print_tuple,
 	.get_l4proto		= ipv6_get_l4proto,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.tuple_to_nlattr	= ipv6_tuple_to_nlattr,
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 805ab122767a..808f63e2e1ff 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -84,16 +84,6 @@ static bool icmpv6_invert_tuple(struct nf_conntrack_tuple *tuple,
 	return true;
 }
 
-/* Print out the per-protocol part of the tuple. */
-static void icmpv6_print_tuple(struct seq_file *s,
-			      const struct nf_conntrack_tuple *tuple)
-{
-	seq_printf(s, "type=%u code=%u id=%u ",
-		   tuple->dst.u.icmp.type,
-		   tuple->dst.u.icmp.code,
-		   ntohs(tuple->src.u.icmp.id));
-}
-
 static unsigned int *icmpv6_get_timeouts(struct net *net)
 {
 	return &icmpv6_pernet(net)->timeout;
@@ -369,7 +359,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly =
 	.l4proto		= IPPROTO_ICMPV6,
 	.pkt_to_tuple		= icmpv6_pkt_to_tuple,
 	.invert_tuple		= icmpv6_invert_tuple,
-	.print_tuple		= icmpv6_print_tuple,
 	.packet			= icmpv6_packet,
 	.get_timeouts		= icmpv6_get_timeouts,
 	.new			= icmpv6_new,
-- 
cgit v1.2.3


From 22b6722bfa591ba03d6a0c5521b600d4ab2d9a27 Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jkbs@redhat.com>
Date: Wed, 23 Aug 2017 09:55:41 +0200
Subject: ipv6: Add sysctl for per namespace flow label reflection

Reflecting IPv6 Flow Label at server nodes is useful in environments
that employ multipath routing to load balance the requests. As the "IPv6
Flow Label Reflection" standard draft [1] points out, ICMPv6 PTB error
messages generated in response to downstream packets from the server
can be routed by a load balancer back to the original server without
looking at transport headers, if the server applies the flow label
reflection. This enables the Path MTU Discovery past the ECMP router in
load-balance or anycast environments where each server node is reachable
by only one path.

Introduce a sysctl to enable flow label reflection per net namespace for
all newly created sockets. Previously, the same could be achieved only
per socket by setting the IPV6_FL_F_REFLECT flag for the
IPV6_FLOWLABEL_MGR socket option.

[1] https://tools.ietf.org/html/draft-wang-6man-flow-label-reflection-01

Signed-off-by: Jakub Sitnicki <jkbs@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/af_inet6.c        | 1 +
 net/ipv6/sysctl_net_ipv6.c | 8 ++++++++
 2 files changed, 9 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 3b58ee709f33..fe5262fd6aa5 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -211,6 +211,7 @@ lookup_protocol:
 	np->mc_loop	= 1;
 	np->pmtudisc	= IPV6_PMTUDISC_WANT;
 	np->autoflowlabel = ip6_default_np_autolabel(net);
+	np->repflow	= net->ipv6.sysctl.flowlabel_reflect;
 	sk->sk_ipv6only	= net->ipv6.sysctl.bindv6only;
 
 	/* Init the ipv4 part of the socket since we can have sockets
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 69c50e737c54..6fbf8ae5e52c 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -90,6 +90,13 @@ static struct ctl_table ipv6_table_template[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "flowlabel_reflect",
+		.data		= &init_net.ipv6.sysctl.flowlabel_reflect,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{ }
 };
 
@@ -149,6 +156,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net)
 	ipv6_table[6].data = &net->ipv6.sysctl.idgen_delay;
 	ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges;
 	ipv6_table[8].data = &net->ipv6.sysctl.ip_nonlocal_bind;
+	ipv6_table[9].data = &net->ipv6.sysctl.flowlabel_reflect;
 
 	ipv6_route_table = ipv6_route_sysctl_init(net);
 	if (!ipv6_route_table)
-- 
cgit v1.2.3


From 23aebdacb05dab9efdf22b9e0413491cbd5f128f Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jkbs@redhat.com>
Date: Wed, 23 Aug 2017 09:58:29 +0200
Subject: ipv6: Compute multipath hash for ICMP errors from offending packet

When forwarding or sending out an ICMPv6 error, look at the embedded
packet that triggered the error and compute a flow hash over its
headers.

This lets us route the ICMP error together with the flow it belongs to
when multipath (ECMP) routing is in use, which in turn makes Path MTU
Discovery work in ECMP load-balanced or anycast setups (RFC 7690).

Granted, end-hosts behind the ECMP router (aka servers) need to reflect
the IPv6 Flow Label for PMTUD to work.

The code is organized to be in parallel with ipv4 stack:

  ip_multipath_l3_keys -> ip6_multipath_l3_keys
  fib_multipath_hash   -> rt6_multipath_hash

Signed-off-by: Jakub Sitnicki <jkbs@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/icmp.c  |  1 +
 net/ipv6/route.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 4f82830fc068..dd7608cf1d72 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -519,6 +519,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	fl6.fl6_icmp_type = type;
 	fl6.fl6_icmp_code = code;
 	fl6.flowi6_uid = sock_net_uid(net, NULL);
+	fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
 	security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
 
 	sk = icmpv6_xmit_lock(net);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9b02064c3335..6c4dd5796a31 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1214,6 +1214,54 @@ struct dst_entry *ip6_route_input_lookup(struct net *net,
 }
 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
 
+static void ip6_multipath_l3_keys(const struct sk_buff *skb,
+				  struct flow_keys *keys)
+{
+	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
+	const struct ipv6hdr *key_iph = outer_iph;
+	const struct ipv6hdr *inner_iph;
+	const struct icmp6hdr *icmph;
+	struct ipv6hdr _inner_iph;
+
+	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
+		goto out;
+
+	icmph = icmp6_hdr(skb);
+	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
+	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
+	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
+	    icmph->icmp6_type != ICMPV6_PARAMPROB)
+		goto out;
+
+	inner_iph = skb_header_pointer(skb,
+				       skb_transport_offset(skb) + sizeof(*icmph),
+				       sizeof(_inner_iph), &_inner_iph);
+	if (!inner_iph)
+		goto out;
+
+	key_iph = inner_iph;
+out:
+	memset(keys, 0, sizeof(*keys));
+	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+	keys->addrs.v6addrs.src = key_iph->saddr;
+	keys->addrs.v6addrs.dst = key_iph->daddr;
+	keys->tags.flow_label = ip6_flowinfo(key_iph);
+	keys->basic.ip_proto = key_iph->nexthdr;
+}
+
+/* if skb is set it will be used and fl6 can be NULL */
+u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
+{
+	struct flow_keys hash_keys;
+
+	if (skb) {
+		ip6_multipath_l3_keys(skb, &hash_keys);
+		return flow_hash_from_keys(&hash_keys);
+	}
+
+	return get_hash_from_flowi6(fl6);
+}
+
 void ip6_route_input(struct sk_buff *skb)
 {
 	const struct ipv6hdr *iph = ipv6_hdr(skb);
@@ -1232,6 +1280,8 @@ void ip6_route_input(struct sk_buff *skb)
 	tun_info = skb_tunnel_info(skb);
 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
+	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
+		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
 	skb_dst_drop(skb);
 	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
 }
-- 
cgit v1.2.3


From 956b45318a27cefa5e99fbbbe703ea8a2a3aee13 Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jkbs@redhat.com>
Date: Wed, 23 Aug 2017 09:58:30 +0200
Subject: ipv6: Fold rt6_info_hash_nhsfn() into its only caller

Commit 644d0e656958 ("ipv6 Use get_hash_from_flowi6 for rt6 hash") has
turned rt6_info_hash_nhsfn() into a one-liner, so it no longer makes
sense to keep it around. Also remove the accompanying comment that has
become outdated.

Signed-off-by: Jakub Sitnicki <jkbs@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 6c4dd5796a31..246e7d7d30a4 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -445,16 +445,6 @@ static bool rt6_check_expired(const struct rt6_info *rt)
 	return false;
 }
 
-/* Multipath route selection:
- *   Hash based function using packet header and flowlabel.
- * Adapted from fib_info_hashfn()
- */
-static int rt6_info_hash_nhsfn(unsigned int candidate_count,
-			       const struct flowi6 *fl6)
-{
-	return get_hash_from_flowi6(fl6) % candidate_count;
-}
-
 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 					     struct flowi6 *fl6, int oif,
 					     int strict)
@@ -462,7 +452,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 	struct rt6_info *sibling, *next_sibling;
 	int route_choosen;
 
-	route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
+	route_choosen = get_hash_from_flowi6(fl6) % (match->rt6i_nsiblings + 1);
 	/* Don't change the route, if route_choosen == 0
 	 * (siblings does not include ourself)
 	 */
-- 
cgit v1.2.3


From b673d6cceae2b3e106addd27b79d321cc739bd43 Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jkbs@redhat.com>
Date: Wed, 23 Aug 2017 09:58:31 +0200
Subject: ipv6: Use multipath hash from flow info if available

Allow our callers to influence the choice of ECMP link by honoring the
hash passed together with the flow info. This allows for special
treatment of ICMP errors which we would like to route over the same path
as the IPv6 datagram that triggered the error.

Also go through rt6_multipath_hash(), in the usual case when we aren't
dealing with an ICMP error, so that there is one central place where
multipath hash is computed.

Signed-off-by: Jakub Sitnicki <jkbs@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 246e7d7d30a4..4d0273459d49 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -452,7 +452,13 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 	struct rt6_info *sibling, *next_sibling;
 	int route_choosen;
 
-	route_choosen = get_hash_from_flowi6(fl6) % (match->rt6i_nsiblings + 1);
+	/* We might have already computed the hash for ICMPv6 errors. In such
+	 * case it will always be non-zero. Otherwise now is the time to do it.
+	 */
+	if (!fl6->mp_hash)
+		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
+
+	route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
 	/* Don't change the route, if route_choosen == 0
 	 * (siblings does not include ourself)
 	 */
-- 
cgit v1.2.3


From 32d99d0b670299720dd0db92a974c9612c230889 Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Fri, 25 Aug 2017 09:56:44 +0200
Subject: ipv6: sr: add support for ip4ip6 encapsulation

This patch enables the SRv6 encapsulation mode to carry an IPv4 payload.
All the infrastructure was already present, I just had to add a parameter
to seg6_do_srh_encap() to specify the inner packet protocol, and perform
some additional checks.

Usage example:
ip route add 1.2.3.4 encap seg6 mode encap segs fc00::1,fc00::2 dev eth0

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/seg6_iptunnel.c | 47 +++++++++++++++++++++++++++++++++++++----------
 net/ipv6/seg6_local.c    |  2 +-
 2 files changed, 38 insertions(+), 11 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 501233040570..5bec7817a7b9 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -91,7 +91,7 @@ static void set_tun_src(struct net *net, struct net_device *dev,
 }
 
 /* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */
-int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
+int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
 {
 	struct net *net = dev_net(skb_dst(skb)->dev);
 	struct ipv6hdr *hdr, *inner_hdr;
@@ -116,15 +116,22 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
 	 * hlim will be decremented in ip6_forward() afterwards and
 	 * decapsulation will overwrite inner hlim with outer hlim
 	 */
-	ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)),
-		     ip6_flowlabel(inner_hdr));
-	hdr->hop_limit = inner_hdr->hop_limit;
+
+	if (skb->protocol == htons(ETH_P_IPV6)) {
+		ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)),
+			     ip6_flowlabel(inner_hdr));
+		hdr->hop_limit = inner_hdr->hop_limit;
+	} else {
+		ip6_flow_hdr(hdr, 0, 0);
+		hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb));
+	}
+
 	hdr->nexthdr = NEXTHDR_ROUTING;
 
 	isrh = (void *)hdr + sizeof(*hdr);
 	memcpy(isrh, osrh, hdrlen);
 
-	isrh->nexthdr = NEXTHDR_IPV6;
+	isrh->nexthdr = proto;
 
 	hdr->daddr = isrh->segments[isrh->first_segment];
 	set_tun_src(net, skb->dev, &hdr->daddr, &hdr->saddr);
@@ -199,7 +206,7 @@ static int seg6_do_srh(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
 	struct seg6_iptunnel_encap *tinfo;
-	int err = 0;
+	int proto, err = 0;
 
 	tinfo = seg6_encap_lwtunnel(dst->lwtstate);
 
@@ -210,17 +217,31 @@ static int seg6_do_srh(struct sk_buff *skb)
 
 	switch (tinfo->mode) {
 	case SEG6_IPTUN_MODE_INLINE:
+		if (skb->protocol != htons(ETH_P_IPV6))
+			return -EINVAL;
+
 		err = seg6_do_srh_inline(skb, tinfo->srh);
+		if (err)
+			return err;
+
 		skb_reset_inner_headers(skb);
 		break;
 	case SEG6_IPTUN_MODE_ENCAP:
-		err = seg6_do_srh_encap(skb, tinfo->srh);
+		if (skb->protocol == htons(ETH_P_IPV6))
+			proto = IPPROTO_IPV6;
+		else if (skb->protocol == htons(ETH_P_IP))
+			proto = IPPROTO_IPIP;
+		else
+			return -EINVAL;
+
+		err = seg6_do_srh_encap(skb, tinfo->srh, proto);
+		if (err)
+			return err;
+
+		skb->protocol = htons(ETH_P_IPV6);
 		break;
 	}
 
-	if (err)
-		return err;
-
 	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
 	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
 
@@ -334,6 +355,9 @@ static int seg6_build_state(struct nlattr *nla,
 	struct seg6_lwt *slwt;
 	int err;
 
+	if (family != AF_INET && family != AF_INET6)
+		return -EINVAL;
+
 	err = nla_parse_nested(tb, SEG6_IPTUNNEL_MAX, nla,
 			       seg6_iptunnel_policy, extack);
 
@@ -356,6 +380,9 @@ static int seg6_build_state(struct nlattr *nla,
 
 	switch (tuninfo->mode) {
 	case SEG6_IPTUN_MODE_INLINE:
+		if (family != AF_INET6)
+			return -EINVAL;
+
 		break;
 	case SEG6_IPTUN_MODE_ENCAP:
 		break;
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 147680e7a00c..609b94e970de 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -290,7 +290,7 @@ static int input_action_end_b6_encap(struct sk_buff *skb,
 	skb_reset_inner_headers(skb);
 	skb->encapsulation = 1;
 
-	err = seg6_do_srh_encap(skb, slwt->srh);
+	err = seg6_do_srh_encap(skb, slwt->srh, IPPROTO_IPV6);
 	if (err)
 		goto drop;
 
-- 
cgit v1.2.3


From 38ee7f2d47565689f35662d488d25e7afc43477d Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Fri, 25 Aug 2017 09:56:45 +0200
Subject: ipv6: sr: add support for encapsulation of L2 frames

This patch implements the L2 frame encapsulation mechanism, referred to
as T.Encaps.L2 in the SRv6 specifications [1].

A new type of SRv6 tunnel mode is added (SEG6_IPTUN_MODE_L2ENCAP). It only
accepts packets with an existing MAC header (i.e., it will not work for
locally generated packets). The resulting packet looks like IPv6 -> SRH ->
Ethernet -> original L3 payload. The next header field of the SRH is set to
NEXTHDR_NONE.

[1] https://tools.ietf.org/html/draft-filsfils-spring-srv6-network-programming-01

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/seg6_iptunnel.c | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 5bec7817a7b9..bd6cc688bd19 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -238,6 +238,22 @@ static int seg6_do_srh(struct sk_buff *skb)
 		if (err)
 			return err;
 
+		skb->protocol = htons(ETH_P_IPV6);
+		break;
+	case SEG6_IPTUN_MODE_L2ENCAP:
+		if (!skb_mac_header_was_set(skb))
+			return -EINVAL;
+
+		if (pskb_expand_head(skb, skb->mac_len, 0, GFP_ATOMIC) < 0)
+			return -ENOMEM;
+
+		skb_mac_header_rebuild(skb);
+		skb_push(skb, skb->mac_len);
+
+		err = seg6_do_srh_encap(skb, tinfo->srh, NEXTHDR_NONE);
+		if (err)
+			return err;
+
 		skb->protocol = htons(ETH_P_IPV6);
 		break;
 	}
@@ -386,6 +402,8 @@ static int seg6_build_state(struct nlattr *nla,
 		break;
 	case SEG6_IPTUN_MODE_ENCAP:
 		break;
+	case SEG6_IPTUN_MODE_L2ENCAP:
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -409,8 +427,11 @@ static int seg6_build_state(struct nlattr *nla,
 	memcpy(&slwt->tuninfo, tuninfo, tuninfo_len);
 
 	newts->type = LWTUNNEL_ENCAP_SEG6;
-	newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT |
-			LWTUNNEL_STATE_INPUT_REDIRECT;
+	newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
+
+	if (tuninfo->mode != SEG6_IPTUN_MODE_L2ENCAP)
+		newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+
 	newts->headroom = seg6_lwt_headroom(tuninfo);
 
 	*ts = newts;
-- 
cgit v1.2.3


From 6285217f0c29e68b744533a9ddf50e110d36d65e Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Fri, 25 Aug 2017 09:56:46 +0200
Subject: ipv6: sr: enforce IPv6 packets for seg6local lwt

This patch ensures that the seg6local lightweight tunnel is used solely
with IPv6 routes and processes only IPv6 packets.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/seg6_local.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 609b94e970de..c6263256fcf6 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -357,6 +357,11 @@ static int seg6_local_input(struct sk_buff *skb)
 	struct seg6_action_desc *desc;
 	struct seg6_local_lwt *slwt;
 
+	if (skb->protocol != htons(ETH_P_IPV6)) {
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
 	slwt = seg6_local_lwtunnel(orig_dst->lwtstate);
 	desc = slwt->desc;
 
@@ -623,6 +628,9 @@ static int seg6_local_build_state(struct nlattr *nla, unsigned int family,
 	struct seg6_local_lwt *slwt;
 	int err;
 
+	if (family != AF_INET6)
+		return -EINVAL;
+
 	err = nla_parse_nested(tb, SEG6_LOCAL_MAX, nla, seg6_local_policy,
 			       extack);
 
-- 
cgit v1.2.3


From d7a669dd2f8ba07a17423f4ad586dfc0379882f7 Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Fri, 25 Aug 2017 09:56:47 +0200
Subject: ipv6: sr: add helper functions for seg6local

This patch adds three helper functions to be used with the seg6local packet
processing actions.

The decap_and_validate() function will be used by the End.D* actions, that
decapsulate an SR-enabled packet.

The advance_nextseg() function applies the fundamental operations to update
an SRH for the next segment.

The lookup_nexthop() function helps select the next-hop for the processed
SR packets. It supports an optional next-hop address to route the packet
specifically through it, and an optional routing table to use.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/Kconfig      |   1 +
 net/ipv6/seg6_local.c | 189 ++++++++++++++++++++++++++------------------------
 2 files changed, 101 insertions(+), 89 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 0d722396dce6..ea71e4b0ab7a 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -308,6 +308,7 @@ config IPV6_SEG6_LWTUNNEL
 	depends on IPV6
 	select LWTUNNEL
 	select DST_CACHE
+	select IPV6_MULTIPLE_TABLES
 	---help---
 	  Support for encapsulation of packets within an outer IPv6
 	  header and a Segment Routing Header using the lightweight
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index c6263256fcf6..26db4d3e8755 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -99,23 +99,105 @@ static struct ipv6_sr_hdr *get_and_validate_srh(struct sk_buff *skb)
 	return srh;
 }
 
+static bool decap_and_validate(struct sk_buff *skb, int proto)
+{
+	struct ipv6_sr_hdr *srh;
+	unsigned int off = 0;
+
+	srh = get_srh(skb);
+	if (srh && srh->segments_left > 0)
+		return false;
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+	if (srh && !seg6_hmac_validate_skb(skb))
+		return false;
+#endif
+
+	if (ipv6_find_hdr(skb, &off, proto, NULL, NULL) < 0)
+		return false;
+
+	if (!pskb_pull(skb, off))
+		return false;
+
+	skb_postpull_rcsum(skb, skb_network_header(skb), off);
+
+	skb_reset_network_header(skb);
+	skb_reset_transport_header(skb);
+	skb->encapsulation = 0;
+
+	return true;
+}
+
+static void advance_nextseg(struct ipv6_sr_hdr *srh, struct in6_addr *daddr)
+{
+	struct in6_addr *addr;
+
+	srh->segments_left--;
+	addr = srh->segments + srh->segments_left;
+	*daddr = *addr;
+}
+
+static void lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
+			   u32 tbl_id)
+{
+	struct net *net = dev_net(skb->dev);
+	struct ipv6hdr *hdr = ipv6_hdr(skb);
+	int flags = RT6_LOOKUP_F_HAS_SADDR;
+	struct dst_entry *dst = NULL;
+	struct rt6_info *rt;
+	struct flowi6 fl6;
+
+	fl6.flowi6_iif = skb->dev->ifindex;
+	fl6.daddr = nhaddr ? *nhaddr : hdr->daddr;
+	fl6.saddr = hdr->saddr;
+	fl6.flowlabel = ip6_flowinfo(hdr);
+	fl6.flowi6_mark = skb->mark;
+	fl6.flowi6_proto = hdr->nexthdr;
+
+	if (nhaddr)
+		fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;
+
+	if (!tbl_id) {
+		dst = ip6_route_input_lookup(net, skb->dev, &fl6, flags);
+	} else {
+		struct fib6_table *table;
+
+		table = fib6_get_table(net, tbl_id);
+		if (!table)
+			goto out;
+
+		rt = ip6_pol_route(net, table, 0, &fl6, flags);
+		dst = &rt->dst;
+	}
+
+	if (dst && dst->dev->flags & IFF_LOOPBACK && !dst->error) {
+		dst_release(dst);
+		dst = NULL;
+	}
+
+out:
+	if (!dst) {
+		rt = net->ipv6.ip6_blk_hole_entry;
+		dst = &rt->dst;
+		dst_hold(dst);
+	}
+
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst);
+}
+
 /* regular endpoint function */
 static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt)
 {
 	struct ipv6_sr_hdr *srh;
-	struct in6_addr *addr;
 
 	srh = get_and_validate_srh(skb);
 	if (!srh)
 		goto drop;
 
-	srh->segments_left--;
-	addr = srh->segments + srh->segments_left;
-
-	ipv6_hdr(skb)->daddr = *addr;
+	advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
 
-	skb_dst_drop(skb);
-	ip6_route_input(skb);
+	lookup_nexthop(skb, NULL, 0);
 
 	return dst_input(skb);
 
@@ -127,41 +209,15 @@ drop:
 /* regular endpoint, and forward to specified nexthop */
 static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt)
 {
-	struct net *net = dev_net(skb->dev);
 	struct ipv6_sr_hdr *srh;
-	struct dst_entry *dst;
-	struct in6_addr *addr;
-	struct ipv6hdr *hdr;
-	struct flowi6 fl6;
-	int flags;
 
 	srh = get_and_validate_srh(skb);
 	if (!srh)
 		goto drop;
 
-	srh->segments_left--;
-	addr = srh->segments + srh->segments_left;
-
-	hdr = ipv6_hdr(skb);
-	hdr->daddr = *addr;
-
-	skb_dst_drop(skb);
-
-	fl6.flowi6_iif = skb->dev->ifindex;
-	fl6.daddr = slwt->nh6;
-	fl6.saddr = hdr->saddr;
-	fl6.flowlabel = ip6_flowinfo(hdr);
-	fl6.flowi6_mark = skb->mark;
-	fl6.flowi6_proto = hdr->nexthdr;
-
-	flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE |
-		RT6_LOOKUP_F_REACHABLE;
+	advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
 
-	dst = ip6_route_input_lookup(net, skb->dev, &fl6, flags);
-	if (dst->dev->flags & IFF_LOOPBACK)
-		goto drop;
-
-	skb_dst_set(skb, dst);
+	lookup_nexthop(skb, &slwt->nh6, 0);
 
 	return dst_input(skb);
 
@@ -174,42 +230,18 @@ drop:
 static int input_action_end_dx6(struct sk_buff *skb,
 				struct seg6_local_lwt *slwt)
 {
-	struct net *net = dev_net(skb->dev);
-	struct ipv6hdr *inner_hdr;
-	struct ipv6_sr_hdr *srh;
-	struct dst_entry *dst;
-	unsigned int off = 0;
-	struct flowi6 fl6;
-	bool use_nh;
-	int flags;
+	struct in6_addr *nhaddr = NULL;
 
 	/* this function accepts IPv6 encapsulated packets, with either
 	 * an SRH with SL=0, or no SRH.
 	 */
 
-	srh = get_srh(skb);
-	if (srh && srh->segments_left > 0)
-		goto drop;
-
-#ifdef CONFIG_IPV6_SEG6_HMAC
-	if (srh && !seg6_hmac_validate_skb(skb))
+	if (!decap_and_validate(skb, IPPROTO_IPV6))
 		goto drop;
-#endif
 
-	if (ipv6_find_hdr(skb, &off, IPPROTO_IPV6, NULL, NULL) < 0)
-		goto drop;
-
-	if (!pskb_pull(skb, off))
+	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
 		goto drop;
 
-	skb_postpull_rcsum(skb, skb_network_header(skb), off);
-
-	skb_reset_network_header(skb);
-	skb_reset_transport_header(skb);
-	skb->encapsulation = 0;
-
-	inner_hdr = ipv6_hdr(skb);
-
 	/* The inner packet is not associated to any local interface,
 	 * so we do not call netif_rx().
 	 *
@@ -217,26 +249,10 @@ static int input_action_end_dx6(struct sk_buff *skb,
 	 * inner packet's DA. Otherwise, use the specified nexthop.
 	 */
 
-	use_nh = !ipv6_addr_any(&slwt->nh6);
+	if (!ipv6_addr_any(&slwt->nh6))
+		nhaddr = &slwt->nh6;
 
-	skb_dst_drop(skb);
-
-	fl6.flowi6_iif = skb->dev->ifindex;
-	fl6.daddr = use_nh ? slwt->nh6 : inner_hdr->daddr;
-	fl6.saddr = inner_hdr->saddr;
-	fl6.flowlabel = ip6_flowinfo(inner_hdr);
-	fl6.flowi6_mark = skb->mark;
-	fl6.flowi6_proto = inner_hdr->nexthdr;
-
-	flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_REACHABLE;
-	if (use_nh)
-		flags |= RT6_LOOKUP_F_IFACE;
-
-	dst = ip6_route_input_lookup(net, skb->dev, &fl6, flags);
-	if (dst->dev->flags & IFF_LOOPBACK)
-		goto drop;
-
-	skb_dst_set(skb, dst);
+	lookup_nexthop(skb, nhaddr, 0);
 
 	return dst_input(skb);
 drop:
@@ -261,8 +277,7 @@ static int input_action_end_b6(struct sk_buff *skb, struct seg6_local_lwt *slwt)
 	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
 	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
 
-	skb_dst_drop(skb);
-	ip6_route_input(skb);
+	lookup_nexthop(skb, NULL, 0);
 
 	return dst_input(skb);
 
@@ -276,16 +291,13 @@ static int input_action_end_b6_encap(struct sk_buff *skb,
 				     struct seg6_local_lwt *slwt)
 {
 	struct ipv6_sr_hdr *srh;
-	struct in6_addr *addr;
 	int err = -EINVAL;
 
 	srh = get_and_validate_srh(skb);
 	if (!srh)
 		goto drop;
 
-	srh->segments_left--;
-	addr = srh->segments + srh->segments_left;
-	ipv6_hdr(skb)->daddr = *addr;
+	advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
 
 	skb_reset_inner_headers(skb);
 	skb->encapsulation = 1;
@@ -297,8 +309,7 @@ static int input_action_end_b6_encap(struct sk_buff *skb,
 	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
 	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
 
-	skb_dst_drop(skb);
-	ip6_route_input(skb);
+	lookup_nexthop(skb, NULL, 0);
 
 	return dst_input(skb);
 
-- 
cgit v1.2.3


From 891ef8dd2a8d14e4e73a81dcdb135b574c57f556 Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Fri, 25 Aug 2017 09:58:17 +0200
Subject: ipv6: sr: implement additional seg6local actions

This patch implements the following seg6local actions.

- SEG6_LOCAL_ACTION_END_T: regular SRH processing and forward to the
  next-hop looked up in the specified routing table.

- SEG6_LOCAL_ACTION_END_DX2: decapsulate an L2 frame and forward it to
  the specified network interface.

- SEG6_LOCAL_ACTION_END_DX4: decapsulate an IPv4 packet and forward it,
  possibly to the specified next-hop.

- SEG6_LOCAL_ACTION_END_DT6: decapsulate an IPv6 packet and forward it
  to the next-hop looked up in the specified routing table.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/seg6_local.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 26db4d3e8755..9c1a885ee482 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -30,6 +30,7 @@
 #ifdef CONFIG_IPV6_SEG6_HMAC
 #include <net/seg6_hmac.h>
 #endif
+#include <linux/etherdevice.h>
 
 struct seg6_local_lwt;
 
@@ -226,6 +227,82 @@ drop:
 	return -EINVAL;
 }
 
+static int input_action_end_t(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+	struct ipv6_sr_hdr *srh;
+
+	srh = get_and_validate_srh(skb);
+	if (!srh)
+		goto drop;
+
+	advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
+
+	lookup_nexthop(skb, NULL, slwt->table);
+
+	return dst_input(skb);
+
+drop:
+	kfree_skb(skb);
+	return -EINVAL;
+}
+
+/* decapsulate and forward inner L2 frame on specified interface */
+static int input_action_end_dx2(struct sk_buff *skb,
+				struct seg6_local_lwt *slwt)
+{
+	struct net *net = dev_net(skb->dev);
+	struct net_device *odev;
+	struct ethhdr *eth;
+
+	if (!decap_and_validate(skb, NEXTHDR_NONE))
+		goto drop;
+
+	if (!pskb_may_pull(skb, ETH_HLEN))
+		goto drop;
+
+	skb_reset_mac_header(skb);
+	eth = (struct ethhdr *)skb->data;
+
+	/* To determine the frame's protocol, we assume it is 802.3. This avoids
+	 * a call to eth_type_trans(), which is not really relevant for our
+	 * use case.
+	 */
+	if (!eth_proto_is_802_3(eth->h_proto))
+		goto drop;
+
+	odev = dev_get_by_index_rcu(net, slwt->oif);
+	if (!odev)
+		goto drop;
+
+	/* As we accept Ethernet frames, make sure the egress device is of
+	 * the correct type.
+	 */
+	if (odev->type != ARPHRD_ETHER)
+		goto drop;
+
+	if (!(odev->flags & IFF_UP) || !netif_carrier_ok(odev))
+		goto drop;
+
+	skb_orphan(skb);
+
+	if (skb_warn_if_lro(skb))
+		goto drop;
+
+	skb_forward_csum(skb);
+
+	if (skb->len - ETH_HLEN > odev->mtu)
+		goto drop;
+
+	skb->dev = odev;
+	skb->protocol = eth->h_proto;
+
+	return dev_queue_xmit(skb);
+
+drop:
+	kfree_skb(skb);
+	return -EINVAL;
+}
+
 /* decapsulate and forward to specified nexthop */
 static int input_action_end_dx6(struct sk_buff *skb,
 				struct seg6_local_lwt *slwt)
@@ -260,6 +337,56 @@ drop:
 	return -EINVAL;
 }
 
+static int input_action_end_dx4(struct sk_buff *skb,
+				struct seg6_local_lwt *slwt)
+{
+	struct iphdr *iph;
+	__be32 nhaddr;
+	int err;
+
+	if (!decap_and_validate(skb, IPPROTO_IPIP))
+		goto drop;
+
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto drop;
+
+	skb->protocol = htons(ETH_P_IP);
+
+	iph = ip_hdr(skb);
+
+	nhaddr = slwt->nh4.s_addr ?: iph->daddr;
+
+	skb_dst_drop(skb);
+
+	err = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev);
+	if (err)
+		goto drop;
+
+	return dst_input(skb);
+
+drop:
+	kfree_skb(skb);
+	return -EINVAL;
+}
+
+static int input_action_end_dt6(struct sk_buff *skb,
+				struct seg6_local_lwt *slwt)
+{
+	if (!decap_and_validate(skb, IPPROTO_IPV6))
+		goto drop;
+
+	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+		goto drop;
+
+	lookup_nexthop(skb, NULL, slwt->table);
+
+	return dst_input(skb);
+
+drop:
+	kfree_skb(skb);
+	return -EINVAL;
+}
+
 /* push an SRH on top of the current one */
 static int input_action_end_b6(struct sk_buff *skb, struct seg6_local_lwt *slwt)
 {
@@ -329,11 +456,31 @@ static struct seg6_action_desc seg6_action_table[] = {
 		.attrs		= (1 << SEG6_LOCAL_NH6),
 		.input		= input_action_end_x,
 	},
+	{
+		.action		= SEG6_LOCAL_ACTION_END_T,
+		.attrs		= (1 << SEG6_LOCAL_TABLE),
+		.input		= input_action_end_t,
+	},
+	{
+		.action		= SEG6_LOCAL_ACTION_END_DX2,
+		.attrs		= (1 << SEG6_LOCAL_OIF),
+		.input		= input_action_end_dx2,
+	},
 	{
 		.action		= SEG6_LOCAL_ACTION_END_DX6,
 		.attrs		= (1 << SEG6_LOCAL_NH6),
 		.input		= input_action_end_dx6,
 	},
+	{
+		.action		= SEG6_LOCAL_ACTION_END_DX4,
+		.attrs		= (1 << SEG6_LOCAL_NH4),
+		.input		= input_action_end_dx4,
+	},
+	{
+		.action		= SEG6_LOCAL_ACTION_END_DT6,
+		.attrs		= (1 << SEG6_LOCAL_TABLE),
+		.input		= input_action_end_dt6,
+	},
 	{
 		.action		= SEG6_LOCAL_ACTION_END_B6,
 		.attrs		= (1 << SEG6_LOCAL_SRH),
-- 
cgit v1.2.3


From e2f387d2df0ece6d4418bb09bef7802cfaf7142d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 25 Aug 2017 02:59:41 +0200
Subject: netfilter: conntrack: don't log "invalid" icmpv6 connections

When enabling logging for invalid connections we currently also log most
icmpv6 types, which we don't track intentionally (e.g. neigh discovery).
"invalid" should really mean "invalid", i.e. short header or bad checksum.

We don't do any logging for icmp(v4) either; it's just useless noise.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 808f63e2e1ff..43544b975eae 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -121,11 +121,6 @@ static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb,
 		pr_debug("icmpv6: can't create new conn with type %u\n",
 			 type + 128);
 		nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple);
-		if (LOG_INVALID(nf_ct_net(ct), IPPROTO_ICMPV6))
-			nf_log_packet(nf_ct_net(ct), PF_INET6, 0, skb, NULL,
-				      NULL, NULL,
-				      "nf_ct_icmpv6: invalid new with type %d ",
-				      type + 128);
 		return false;
 	}
 	return true;
-- 
cgit v1.2.3


From a8e3bb347d8c32c25830af55466e08979df4e2be Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 28 Aug 2017 15:14:20 -0700
Subject: net: Add comment that early_demux can change via sysctl

Twice patches trying to constify inet{6}_protocol have been reverted:
39294c3df2a8 ("Revert "ipv6: constify inet6_protocol structures"") to
revert 3a3a4e3054137 and then 03157937fe0b5 ("Revert "ipv4: make
net_protocol const"") to revert aa8db499ea67.

Add a comment that the structures can not be const because the
early_demux field can change based on a sysctl.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 3 +++
 net/ipv6/udp.c      | 3 +++
 2 files changed, 6 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index abba3bc2a3d9..38f76d8b231e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1949,6 +1949,9 @@ struct proto tcpv6_prot = {
 	.diag_destroy		= tcp_abort,
 };
 
+/* thinking of making this const? Don't.
+ * early_demux can change based on sysctl.
+ */
 static struct inet6_protocol tcpv6_protocol = {
 	.early_demux	=	tcp_v6_early_demux,
 	.early_demux_handler =  tcp_v6_early_demux,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 2a15f1bb6ef8..976f30391356 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1472,6 +1472,9 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
 }
 #endif
 
+/* thinking of making this const? Don't.
+ * early_demux can change based on sysctl.
+ */
 static struct inet6_protocol udpv6_protocol = {
 	.early_demux	=	udp_v6_early_demux,
 	.early_demux_handler =  udp_v6_early_demux,
-- 
cgit v1.2.3


From a6f57028d60328c06ea23982464de2f2e4a9ea3f Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 29 Aug 2017 13:29:42 +0200
Subject: addrlabel: add/delete/get can run without rtnl

There appears to be no need to use rtnl, addrlabel entries are refcounted
and add/delete is serialized by the addrlabel table spinlock.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrlabel.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index cea5eb488013..b055bc79f56d 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -405,6 +405,18 @@ static const struct nla_policy ifal_policy[IFAL_MAX+1] = {
 	[IFAL_LABEL]		= { .len = sizeof(u32), },
 };
 
+static bool addrlbl_ifindex_exists(struct net *net, int ifindex)
+{
+
+	struct net_device *dev;
+
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(net, ifindex);
+	rcu_read_unlock();
+
+	return dev != NULL;
+}
+
 static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh,
 			     struct netlink_ext_ack *extack)
 {
@@ -439,7 +451,7 @@ static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh,
 	switch (nlh->nlmsg_type) {
 	case RTM_NEWADDRLABEL:
 		if (ifal->ifal_index &&
-		    !__dev_get_by_index(net, ifal->ifal_index))
+		    !addrlbl_ifindex_exists(net, ifal->ifal_index))
 			return -EINVAL;
 
 		err = ip6addrlbl_add(net, pfx, ifal->ifal_prefixlen,
@@ -548,7 +560,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		return -EINVAL;
 
 	if (ifal->ifal_index &&
-	    !__dev_get_by_index(net, ifal->ifal_index))
+	    !addrlbl_ifindex_exists(net, ifal->ifal_index))
 		return -EINVAL;
 
 	if (!tb[IFAL_ADDRESS])
@@ -593,10 +605,10 @@ out:
 void __init ipv6_addr_label_rtnl_register(void)
 {
 	__rtnl_register(PF_INET6, RTM_NEWADDRLABEL, ip6addrlbl_newdel,
-			NULL, 0);
+			NULL, RTNL_FLAG_DOIT_UNLOCKED);
 	__rtnl_register(PF_INET6, RTM_DELADDRLABEL, ip6addrlbl_newdel,
-			NULL, 0);
+			NULL, RTNL_FLAG_DOIT_UNLOCKED);
 	__rtnl_register(PF_INET6, RTM_GETADDRLABEL, ip6addrlbl_get,
-			ip6addrlbl_dump, 0);
+			ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED);
 }
 
-- 
cgit v1.2.3


From 1b70d792cf6775fb5d0737524387893daeb5374a Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Mon, 28 Aug 2017 13:53:34 -0700
Subject: ipv6: Use rt6i_idev index for echo replies to a local address

Tariq reported that local pings to a link-local address are failing:
$ ifconfig ens8
ens8: flags=4163<UP,BROADCAST,RUNNING,MULTICAST>  mtu 1500
        inet 11.141.16.6  netmask 255.255.0.0  broadcast 11.141.255.255
        inet6 fe80::7efe:90ff:fecb:7502  prefixlen 64  scopeid 0x20<link>
        ether 7c:fe:90:cb:75:02  txqueuelen 1000  (Ethernet)
        RX packets 12  bytes 1164 (1.1 KiB)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 30  bytes 2484 (2.4 KiB)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0

$  /bin/ping6 -c 3 fe80::7efe:90ff:fecb:7502%ens8
PING fe80::7efe:90ff:fecb:7502%ens8(fe80::7efe:90ff:fecb:7502) 56 data bytes

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/icmp.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index dd7608cf1d72..5acb54405b10 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -399,6 +399,24 @@ relookup_failed:
 	return ERR_PTR(err);
 }
 
+static int icmp6_iif(const struct sk_buff *skb)
+{
+	int iif = skb->dev->ifindex;
+
+	/* for local traffic to local address, skb dev is the loopback
+	 * device. Check if there is a dst attached to the skb and if so
+	 * get the real device index.
+	 */
+	if (unlikely(iif == LOOPBACK_IFINDEX)) {
+		const struct rt6_info *rt6 = skb_rt6_info(skb);
+
+		if (rt6)
+			iif = rt6->rt6i_idev->dev->ifindex;
+	}
+
+	return iif;
+}
+
 /*
  *	Send an ICMP message in response to a packet in error
  */
@@ -460,18 +478,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	 */
 
 	if (__ipv6_addr_needs_scope_id(addr_type)) {
-		iif = skb->dev->ifindex;
-
-		/* for local packets, get the real device index */
-		if (iif == LOOPBACK_IFINDEX) {
-			dst = skb_dst(skb);
-			if (dst) {
-				struct rt6_info *rt;
-
-				rt = container_of(dst, struct rt6_info, dst);
-				iif = rt->rt6i_idev->dev->ifindex;
-			}
-		}
+		iif = icmp6_iif(skb);
 	} else {
 		dst = skb_dst(skb);
 		iif = l3mdev_master_ifindex(dst ? dst->dev : skb->dev);
@@ -694,7 +701,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
 	fl6.daddr = ipv6_hdr(skb)->saddr;
 	if (saddr)
 		fl6.saddr = *saddr;
-	fl6.flowi6_oif = skb->dev->ifindex;
+	fl6.flowi6_oif = icmp6_iif(skb);
 	fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
 	fl6.flowi6_mark = mark;
 	fl6.flowi6_uid = sock_net_uid(net, NULL);
-- 
cgit v1.2.3


From eaa72dc47488d599439cd0fd0f8c4f1bcb3906bb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 29 Aug 2017 15:16:01 -0700
Subject: neigh: increase queue_len_bytes to match wmem_default

Florian reported UDP xmit drops that could be root caused to the
too small neigh limit.

Current limit is 64 KB, meaning that even a single UDP socket would hit
it, since its default sk_sndbuf comes from net.core.wmem_default
(~212992 bytes on 64bit arches).

Once ARP/ND resolution is in progress, we should allow a few more
packets to be queued, at least for one producer.

Once neigh arp_queue is filled, a rogue socket should hit its sk_sndbuf
limit and either block in sendmsg() or return -EAGAIN.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ndisc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 5e338eb89509..266a530414d7 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -127,7 +127,7 @@ struct neigh_table nd_tbl = {
 			[NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME,
 			[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
 			[NEIGH_VAR_GC_STALETIME] = 60 * HZ,
-			[NEIGH_VAR_QUEUE_LEN_BYTES] = 64 * 1024,
+			[NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
 			[NEIGH_VAR_PROXY_QLEN] = 64,
 			[NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
 			[NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,
-- 
cgit v1.2.3


From 5829d70b0b6cd055e7a9af3abe573927e919e30b Mon Sep 17 00:00:00 2001
From: Ahmed Abdelsalam <amsalam20@gmail.com>
Date: Wed, 30 Aug 2017 10:50:37 +0200
Subject: ipv6: sr: fix get_srh() to comply with IPv6 standard "RFC 8200"

An IPv6 packet may carry more than one extension header, and IPv6 nodes
must accept and attempt to process extension headers in any order and
occurring any number of times in the same packet. Hence, there should be
no assumption that the Segment Routing extension header appears
immediately after the IPv6 header.

Moreover, section 4.1 of RFC 8200 gives a recommendation on the order of
appearance of those extension headers within an IPv6 packet. According to
this recommendation, the Segment Routing extension header should appear
after the Hop-by-Hop and Destination Options headers (if they are present).

This patch fixes get_srh() so that it finds the segment routing header
regardless of its position in the chain of extension headers in an IPv6
packet, and makes sure that the IPv6 routing extension header is of Type 4.

Signed-off-by: Ahmed Abdelsalam <amsalam20@gmail.com>
Acked-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/seg6_local.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 9c1a885ee482..7ff54db73a48 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -62,17 +62,23 @@ static struct seg6_local_lwt *seg6_local_lwtunnel(struct lwtunnel_state *lwt)
 static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb)
 {
 	struct ipv6_sr_hdr *srh;
-	struct ipv6hdr *hdr;
-	int len;
+	int len, srhoff = 0;
+
+	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
+		return NULL;
+
+	if (!pskb_may_pull(skb, srhoff + sizeof(*srh)))
+		return NULL;
+
+	srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
 
-	hdr = ipv6_hdr(skb);
-	if (hdr->nexthdr != IPPROTO_ROUTING)
+	/* make sure it's a Segment Routing header (Routing Type 4) */
+	if (srh->type != IPV6_SRCRT_TYPE_4)
 		return NULL;
 
-	srh = (struct ipv6_sr_hdr *)(hdr + 1);
 	len = (srh->hdrlen + 1) << 3;
 
-	if (!pskb_may_pull(skb, sizeof(*hdr) + len))
+	if (!pskb_may_pull(skb, srhoff + len))
 		return NULL;
 
 	if (!seg6_validate_srh(srh, len))
-- 
cgit v1.2.3


From 47ebcc0bb1d5eb7f1b1eeab675409ea7f67b4a5c Mon Sep 17 00:00:00 2001
From: Yossi Kuperman <yossiku@mellanox.com>
Date: Wed, 30 Aug 2017 11:30:39 +0300
Subject: xfrm: Add support for network devices capable of removing the ESP
 trailer

In conjunction with crypto offload [1], removing the ESP trailer by
hardware can potentially improve the performance by avoiding (1) a
cache miss incurred by reading the nexthdr field and (2) the necessity
to calculate the csum value of the trailer in order to keep skb->csum
valid.

This patch introduces the changes to the xfrm stack and merely serves
as infrastructure. A subsequent patch to the mlx5 driver will put this
to good use.

[1] https://www.mail-archive.com/netdev@vger.kernel.org/msg175733.html

Signed-off-by: Yossi Kuperman <yossiku@mellanox.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/esp6.c | 51 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 36 insertions(+), 15 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 74bde202eb9a..7fb41b0ad437 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -461,29 +461,30 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 	return esp6_output_tail(x, skb, &esp);
 }
 
-int esp6_input_done2(struct sk_buff *skb, int err)
+static inline int esp_remove_trailer(struct sk_buff *skb)
 {
 	struct xfrm_state *x = xfrm_input_state(skb);
 	struct xfrm_offload *xo = xfrm_offload(skb);
 	struct crypto_aead *aead = x->data;
-	int alen = crypto_aead_authsize(aead);
-	int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
-	int elen = skb->len - hlen;
-	int hdr_len = skb_network_header_len(skb);
+	int alen, hlen, elen;
 	int padlen, trimlen;
 	__wsum csumdiff;
 	u8 nexthdr[2];
+	int ret;
 
-	if (!xo || (xo && !(xo->flags & CRYPTO_DONE)))
-		kfree(ESP_SKB_CB(skb)->tmp);
+	alen = crypto_aead_authsize(aead);
+	hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
+	elen = skb->len - hlen;
 
-	if (unlikely(err))
+	if (xo && (xo->flags & XFRM_ESP_NO_TRAILER)) {
+		ret = xo->proto;
 		goto out;
+	}
 
 	if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2))
 		BUG();
 
-	err = -EINVAL;
+	ret = -EINVAL;
 	padlen = nexthdr[0];
 	if (padlen + 2 + alen >= elen) {
 		net_dbg_ratelimited("ipsec esp packet is garbage padlen=%d, elen=%d\n",
@@ -491,26 +492,46 @@ int esp6_input_done2(struct sk_buff *skb, int err)
 		goto out;
 	}
 
-	/* ... check padding bits here. Silly. :-) */
-
 	trimlen = alen + padlen + 2;
 	if (skb->ip_summed == CHECKSUM_COMPLETE) {
-		skb_postpull_rcsum(skb, skb_network_header(skb),
-				   skb_network_header_len(skb));
 		csumdiff = skb_checksum(skb, skb->len - trimlen, trimlen, 0);
 		skb->csum = csum_block_sub(skb->csum, csumdiff,
 					   skb->len - trimlen);
 	}
 	pskb_trim(skb, skb->len - trimlen);
 
+	ret = nexthdr[1];
+
+out:
+	return ret;
+}
+
+int esp6_input_done2(struct sk_buff *skb, int err)
+{
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct xfrm_offload *xo = xfrm_offload(skb);
+	struct crypto_aead *aead = x->data;
+	int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
+	int hdr_len = skb_network_header_len(skb);
+
+	if (!xo || (xo && !(xo->flags & CRYPTO_DONE)))
+		kfree(ESP_SKB_CB(skb)->tmp);
+
+	if (unlikely(err))
+		goto out;
+
+	err = esp_remove_trailer(skb);
+	if (unlikely(err < 0))
+		goto out;
+
+	skb_postpull_rcsum(skb, skb_network_header(skb),
+			   skb_network_header_len(skb));
 	skb_pull_rcsum(skb, hlen);
 	if (x->props.mode == XFRM_MODE_TUNNEL)
 		skb_reset_transport_header(skb);
 	else
 		skb_set_transport_header(skb, -hdr_len);
 
-	err = nexthdr[1];
-
 	/* RFC4303: Drop dummy packets without any error */
 	if (err == IPPROTO_NONE)
 		err = -EINVAL;
-- 
cgit v1.2.3


From 6391c4f67ad1643a17b54376e381038b9e660a0f Mon Sep 17 00:00:00 2001
From: Thomas Meyer <thomas@m3y3r.de>
Date: Thu, 31 Aug 2017 16:18:15 +0200
Subject: ipv6: sr: Use ARRAY_SIZE macro

Grepping for "sizeof\(.+\) / sizeof\(" found this as one of the first
candidates.
Maybe a Coccinelle script can catch all of those.

Signed-off-by: Thomas Meyer <thomas@m3y3r.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/seg6_hmac.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index f950cb53d5e3..33fb35cbfac1 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -12,6 +12,7 @@
  */
 
 #include <linux/errno.h>
+#include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/socket.h>
 #include <linux/sockios.h>
@@ -110,7 +111,7 @@ static struct seg6_hmac_algo *__hmac_get_algo(u8 alg_id)
 	struct seg6_hmac_algo *algo;
 	int i, alg_count;
 
-	alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo);
+	alg_count = ARRAY_SIZE(hmac_algos);
 	for (i = 0; i < alg_count; i++) {
 		algo = &hmac_algos[i];
 		if (algo->alg_id == alg_id)
@@ -360,7 +361,7 @@ static int seg6_hmac_init_algo(void)
 	struct shash_desc *shash;
 	int i, alg_count, cpu;
 
-	alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo);
+	alg_count = ARRAY_SIZE(hmac_algos);
 
 	for (i = 0; i < alg_count; i++) {
 		struct crypto_shash **p_tfm;
@@ -421,7 +422,7 @@ void seg6_hmac_exit(void)
 	struct seg6_hmac_algo *algo = NULL;
 	int i, alg_count, cpu;
 
-	alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo);
+	alg_count = ARRAY_SIZE(hmac_algos);
 	for (i = 0; i < alg_count; i++) {
 		algo = &hmac_algos[i];
 		for_each_possible_cpu(cpu) {
-- 
cgit v1.2.3


From 864150dfa31dceab6ec5ca4579a2d35ede985cb7 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Fri, 1 Sep 2017 12:15:17 +0300
Subject: net: Add module reference to FIB notifiers

When a listener registers to the FIB notification chain it receives a
dump of the FIB entries and rules from existing address families by
invoking their dump operations.

While we call into these modules we need to make sure they aren't
removed. Do that by increasing their reference count before invoking
their dump operations and decreasing it afterwards.

Fixes: 04b1d4e50e82 ("net: core: Make the FIB notification chain generic")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/fib6_notifier.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c
index 66a103ef7e86..05f82baaa99e 100644
--- a/net/ipv6/fib6_notifier.c
+++ b/net/ipv6/fib6_notifier.c
@@ -1,6 +1,7 @@
 #include <linux/notifier.h>
 #include <linux/socket.h>
 #include <linux/kernel.h>
+#include <linux/export.h>
 #include <net/net_namespace.h>
 #include <net/fib_notifier.h>
 #include <net/netns/ipv6.h>
@@ -41,6 +42,7 @@ static const struct fib_notifier_ops fib6_notifier_ops_template = {
 	.family		= AF_INET6,
 	.fib_seq_read	= fib6_seq_read,
 	.fib_dump	= fib6_dump,
+	.owner		= THIS_MODULE,
 };
 
 int __net_init fib6_notifier_init(struct net *net)
-- 
cgit v1.2.3


From 5a63643e583b6a9789d7a225ae076fb4e603991c Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Fri, 1 Sep 2017 11:26:13 +0200
Subject: Revert "net: fix percpu memory leaks"

This reverts commit 1d6119baf0610f813eb9d9580eb4fd16de5b4ceb.

After reverting commit 6d7b857d541e ("net: use lib/percpu_counter API
for fragmentation mem accounting") there is no need for this fix-up
patch.  As percpu_counter is no longer used, it can no longer leak
memory.

Fixes: 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem accounting")
Fixes: 1d6119baf061 ("net: fix percpu memory leaks")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 12 +++---------
 net/ipv6/reassembly.c                   | 12 +++---------
 2 files changed, 6 insertions(+), 18 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 986d4ca38832..b263bf3a19f7 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -622,18 +622,12 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
 
 static int nf_ct_net_init(struct net *net)
 {
-	int res;
-
 	net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
-	res = inet_frags_init_net(&net->nf_frag.frags);
-	if (res)
-		return res;
-	res = nf_ct_frag6_sysctl_register(net);
-	if (res)
-		inet_frags_uninit_net(&net->nf_frag.frags);
-	return res;
+	inet_frags_init_net(&net->nf_frag.frags);
+
+	return nf_ct_frag6_sysctl_register(net);
 }
 
 static void nf_ct_net_exit(struct net *net)
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index e1da5b888cc4..846012eae526 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -714,19 +714,13 @@ static void ip6_frags_sysctl_unregister(void)
 
 static int __net_init ipv6_frags_init_net(struct net *net)
 {
-	int res;
-
 	net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
 
-	res = inet_frags_init_net(&net->ipv6.frags);
-	if (res)
-		return res;
-	res = ip6_frags_ns_sysctl_register(net);
-	if (res)
-		inet_frags_uninit_net(&net->ipv6.frags);
-	return res;
+	inet_frags_init_net(&net->ipv6.frags);
+
+	return ip6_frags_ns_sysctl_register(net);
 }
 
 static void __net_exit ipv6_frags_exit_net(struct net *net)
-- 
cgit v1.2.3


From d1c1e39de8357d66163da39e893e38ea1410e8f8 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 29 Aug 2017 12:04:10 +0200
Subject: netfilter: remove unused hooknum arg from packet functions

Tested with an allmodconfig build.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 43544b975eae..30e34c4de003 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -95,7 +95,6 @@ static int icmpv6_packet(struct nf_conn *ct,
 		       unsigned int dataoff,
 		       enum ip_conntrack_info ctinfo,
 		       u_int8_t pf,
-		       unsigned int hooknum,
 		       unsigned int *timeout)
 {
 	/* Do not immediately delete the connection after the first
@@ -129,8 +128,7 @@ static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb,
 static int
 icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
 		     struct sk_buff *skb,
-		     unsigned int icmp6off,
-		     unsigned int hooknum)
+		     unsigned int icmp6off)
 {
 	struct nf_conntrack_tuple intuple, origtuple;
 	const struct nf_conntrack_tuple_hash *h;
@@ -214,7 +212,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl,
 	if (icmp6h->icmp6_type >= 128)
 		return NF_ACCEPT;
 
-	return icmpv6_error_message(net, tmpl, skb, dataoff, hooknum);
+	return icmpv6_error_message(net, tmpl, skb, dataoff);
 }
 
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-- 
cgit v1.2.3


From 44d6e2f27328b254111dd716fde45b3b59b8a4f7 Mon Sep 17 00:00:00 2001
From: Varsha Rao <rvarsha016@gmail.com>
Date: Wed, 30 Aug 2017 13:37:11 +0530
Subject: net: Replace NF_CT_ASSERT() with WARN_ON().

This patch removes NF_CT_ASSERT() and instead uses WARN_ON().

Signed-off-by: Varsha Rao <rvarsha016@gmail.com>
---
 net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 2 +-
 net/ipv6/netfilter/nf_nat_l3proto_ipv6.c       | 6 +++---
 net/ipv6/netfilter/nf_nat_masquerade_ipv6.c    | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 30e34c4de003..a9e1fd1a8536 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -136,7 +136,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
 	enum ip_conntrack_info ctinfo;
 	struct nf_conntrack_zone tmp;
 
-	NF_CT_ASSERT(!skb_nfct(skb));
+	WARN_ON(skb_nfct(skb));
 
 	/* Are they talking about one of our connections? */
 	if (!nf_ct_get_tuplepr(skb,
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
index b2b4f031b3a1..46d6dba50698 100644
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -196,7 +196,7 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb,
 	struct nf_conntrack_tuple target;
 	unsigned long statusbit;
 
-	NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY);
+	WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY);
 
 	if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
 		return 0;
@@ -319,8 +319,8 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
 
 	default:
 		/* ESTABLISHED */
-		NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
-			     ctinfo == IP_CT_ESTABLISHED_REPLY);
+		WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
+			ctinfo != IP_CT_ESTABLISHED_REPLY);
 		if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
 			goto oif_changed;
 	}
diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
index d7b679037bae..98f61fcb9108 100644
--- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
@@ -36,8 +36,8 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
 	struct nf_nat_range newrange;
 
 	ct = nf_ct_get(skb, &ctinfo);
-	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
-			    ctinfo == IP_CT_RELATED_REPLY));
+	WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
+			 ctinfo == IP_CT_RELATED_REPLY)));
 
 	if (ipv6_dev_get_saddr(nf_ct_net(ct), out,
 			       &ipv6_hdr(skb)->daddr, 0, &src) < 0)
-- 
cgit v1.2.3


From 9efdb14f76f4d7591cd4d7a436ebd716b19703b6 Mon Sep 17 00:00:00 2001
From: Varsha Rao <rvarsha016@gmail.com>
Date: Wed, 30 Aug 2017 13:37:12 +0530
Subject: net: Remove CONFIG_NETFILTER_DEBUG and _ASSERT() macros.

This patch removes CONFIG_NETFILTER_DEBUG and _ASSERT() macros as they
are no longer required. Replace _ASSERT() macros with WARN_ON().

Signed-off-by: Varsha Rao <rvarsha016@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/ip6_tables.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 9f6644958e5e..54b1e75eded1 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -39,12 +39,6 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
 MODULE_DESCRIPTION("IPv6 packet filter");
 
-#ifdef CONFIG_NETFILTER_DEBUG
-#define IP_NF_ASSERT(x)	WARN_ON(!(x))
-#else
-#define IP_NF_ASSERT(x)
-#endif
-
 void *ip6t_alloc_initial_table(const struct xt_table *info)
 {
 	return xt_alloc_initial_table(ip6t, IP6T);
@@ -284,7 +278,7 @@ ip6t_do_table(struct sk_buff *skb,
 	acpar.hotdrop = false;
 	acpar.state   = state;
 
-	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
+	WARN_ON(!(table->valid_hooks & (1 << hook)));
 
 	local_bh_disable();
 	addend = xt_write_recseq_begin();
@@ -315,7 +309,7 @@ ip6t_do_table(struct sk_buff *skb,
 		const struct xt_entry_match *ematch;
 		struct xt_counters *counter;
 
-		IP_NF_ASSERT(e);
+		WARN_ON(!e);
 		acpar.thoff = 0;
 		if (!ip6_packet_match(skb, indev, outdev, &e->ipv6,
 		    &acpar.thoff, &acpar.fragoff, &acpar.hotdrop)) {
@@ -335,7 +329,7 @@ ip6t_do_table(struct sk_buff *skb,
 		ADD_COUNTER(*counter, skb->len, 1);
 
 		t = ip6t_get_target_c(e);
-		IP_NF_ASSERT(t->u.kernel.target);
+		WARN_ON(!t->u.kernel.target);
 
 #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
 		/* The packet is traced: log it */
-- 
cgit v1.2.3