From 568af6de058cb2b0c5b98d98ffcf37cdc6bc38a7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 4 Mar 2017 19:53:47 +0100 Subject: netfilter: nf_tables: set pktinfo->thoff at AH header if found Phil Sutter reports that IPv6 AH header matching is broken. From userspace, nft generates bytecode that expects to find the AH header at NFT_PAYLOAD_TRANSPORT_HEADER both for IPv4 and IPv6. However, pktinfo->thoff is set to the inner header after the AH header in IPv6, while in IPv4 pktinfo->thoff points to the AH header indeed. This behaviour is inconsistent. This patch fixes this problem by updating ipv6_find_hdr() to get the IP6_FH_F_AUTH flag so this function stops at the AH header, so both IPv4 and IPv6 pktinfo->thoff point to the AH header. This is also inconsistent when trying to match encapsulated headers: 1) A packet that looks like IPv4 + AH + TCP dport 22 will *not* match. 2) A packet that looks like IPv6 + AH + TCP dport 22 will match. Reported-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_ipv6.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index d150b5066201..97983d1c05e4 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -9,12 +9,13 @@ nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt, struct sk_buff *skb, const struct nf_hook_state *state) { + unsigned int flags = IP6_FH_F_AUTH; int protohdr, thoff = 0; unsigned short frag_off; nft_set_pktinfo(pkt, skb, state); - protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL); + protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0) { nft_set_pktinfo_proto_unspec(pkt, skb); return; @@ -32,6 +33,7 @@ __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, const struct nf_hook_state *state) { #if IS_ENABLED(CONFIG_IPV6) + unsigned int flags = IP6_FH_F_AUTH; 
struct ipv6hdr *ip6h, _ip6h; unsigned int thoff = 0; unsigned short frag_off; @@ -50,7 +52,7 @@ __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, if (pkt_len + sizeof(*ip6h) > skb->len) return -1; - protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL); + protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0) return -1; -- cgit v1.2.3 From 10596608c4d62cb8c1c2b806debcbd32fe657e71 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Wed, 8 Mar 2017 22:54:18 +0800 Subject: netfilter: nf_tables: fix mismatch in big-endian system Currently, there are two different methods to store an u16 integer to the u32 data register. For example: u32 *dest = &regs->data[priv->dreg]; 1. *dest = 0; *(u16 *) dest = val_u16; 2. *dest = val_u16; For method 1, the u16 value will be stored like this, either in big-endian or little-endian system: 0 15 31 +-+-+-+-+-+-+-+-+-+-+-+-+ | Value | 0 | +-+-+-+-+-+-+-+-+-+-+-+-+ For method 2, in little-endian system, the u16 value will be the same as listed above. But in big-endian system, the u16 value will be stored like this: 0 15 31 +-+-+-+-+-+-+-+-+-+-+-+-+ | 0 | Value | +-+-+-+-+-+-+-+-+-+-+-+-+ So later we use "memcmp(&regs->data[priv->sreg], data, 2);" to do compare in nft_cmp, nft_lookup expr ..., method 2 will get the wrong result in big-endian system, as 0~15 bits will always be zero. For the similar reason, when loading an u16 value from the u32 data register, we should use "*(u16 *) sreg;" instead of "(u16)*sreg;", the 2nd method will get the wrong value in the big-endian system. So introduce some wrapper functions to store/load an u8 or u16 integer to/from the u32 data register, and use them in the right place. 
Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2aa8a9d80fbe..70c5ca0c60b1 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -103,6 +103,35 @@ struct nft_regs { }; }; +/* Store/load an u16 or u8 integer to/from the u32 data register. + * + * Note, when using concatenations, register allocation happens at 32-bit + * level. So for store instruction, pad the rest part with zero to avoid + * garbage values. + */ + +static inline void nft_reg_store16(u32 *dreg, u16 val) +{ + *dreg = 0; + *(u16 *)dreg = val; +} + +static inline void nft_reg_store8(u32 *dreg, u8 val) +{ + *dreg = 0; + *(u8 *)dreg = val; +} + +static inline u16 nft_reg_load16(u32 *sreg) +{ + return *(u16 *)sreg; +} + +static inline u8 nft_reg_load8(u32 *sreg) +{ + return *(u8 *)sreg; +} + static inline void nft_data_copy(u32 *dst, const struct nft_data *src, unsigned int len) { -- cgit v1.2.3 From 170a1fb9c01bc40b7e8fd57a32ac9a0e131ec5b6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sat, 11 Mar 2017 00:25:26 -0500 Subject: netfilter: Force fake conntrack entry to be at least 8 bytes aligned Since the nfct and nfctinfo have been combined, the nf_conn structure must be at least 8 bytes aligned, as the 3 LSB bits are used for the nfctinfo. But there's a fake nf_conn structure to denote untracked connections, which is created by a PER_CPU construct. This does not guarantee that it will be 8 bytes aligned and can break the logic in determining the correct nfctinfo. 
I triggered this on a 32bit machine with the following error: BUG: unable to handle kernel NULL pointer dereference at 00000af4 IP: nf_ct_deliver_cached_events+0x1b/0xfb *pdpt = 0000000031962001 *pde = 0000000000000000 Oops: 0000 [#1] SMP [Modules linked in: ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter ip6_tables ipv6 crc_ccitt ppdev r8169 parport_pc parport OK ] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.10.0-test+ #75 Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014 task: c126ec00 task.stack: c1258000 EIP: nf_ct_deliver_cached_events+0x1b/0xfb EFLAGS: 00010202 CPU: 0 EAX: 0021cd01 EBX: 00000000 ECX: 27b0c767 EDX: 32bcb17a ESI: f34135c0 EDI: f34135c0 EBP: f2debd60 ESP: f2debd3c DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 80050033 CR2: 00000af4 CR3: 309a0440 CR4: 001406f0 Call Trace: ? ipv6_skip_exthdr+0xac/0xcb ipv6_confirm+0x10c/0x119 [nf_conntrack_ipv6] nf_hook_slow+0x22/0xc7 nf_hook+0x9a/0xad [ipv6] ? ip6t_do_table+0x356/0x379 [ip6_tables] ? ip6_fragment+0x9e9/0x9e9 [ipv6] ip6_output+0xee/0x107 [ipv6] ? ip6_fragment+0x9e9/0x9e9 [ipv6] dst_output+0x36/0x4d [ipv6] NF_HOOK.constprop.37+0xb2/0xba [ipv6] ? icmp6_dst_alloc+0x2c/0xfd [ipv6] ? local_bh_enable+0x14/0x14 [ipv6] mld_sendpack+0x1c5/0x281 [ipv6] ? mark_held_locks+0x40/0x5c mld_ifc_timer_expire+0x1f6/0x21e [ipv6] call_timer_fn+0x135/0x283 ? detach_if_pending+0x55/0x55 ? mld_dad_timer_expire+0x3e/0x3e [ipv6] __run_timers+0x111/0x14b ? mld_dad_timer_expire+0x3e/0x3e [ipv6] run_timer_softirq+0x1c/0x36 __do_softirq+0x185/0x37c ? test_ti_thread_flag.constprop.19+0xd/0xd do_softirq_own_stack+0x22/0x28 irq_exit+0x5a/0xa4 smp_apic_timer_interrupt+0x2a/0x34 apic_timer_interrupt+0x37/0x3c By using DEFINE/DECLARE_PER_CPU_ALIGNED we can enforce at least 8 byte alignment as all cache line sizes are at least 8 bytes or more. 
Fixes: a9e419dc7be6 ("netfilter: merge ctinfo into nfct pointer storage area") Signed-off-by: Steven Rostedt (VMware) Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index f540f9ad2af4..19605878da47 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -244,7 +244,7 @@ extern s32 (*nf_ct_nat_offset)(const struct nf_conn *ct, u32 seq); /* Fake conntrack entry for untracked connections */ -DECLARE_PER_CPU(struct nf_conn, nf_conntrack_untracked); +DECLARE_PER_CPU_ALIGNED(struct nf_conn, nf_conntrack_untracked); static inline struct nf_conn *nf_ct_untracked_get(void) { return raw_cpu_ptr(&nf_conntrack_untracked); -- cgit v1.2.3 From 04166f48d9593af4513ae06c0f966c0cee300a20 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 13 Mar 2017 13:24:03 +0100 Subject: Revert "netfilter: nf_tables: add flush field to struct nft_set_iter" This reverts commit 1f48ff6c5393aa7fe290faf5d633164f105b0aa7. This patch is not required anymore now that we keep a dummy list of set elements in the bitmap set implementation, so revert this before we forget this code has no clients. 
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 70c5ca0c60b1..0136028652bd 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -232,7 +232,6 @@ struct nft_set_elem { struct nft_set; struct nft_set_iter { u8 genmask; - bool flush; unsigned int count; unsigned int skip; int err; -- cgit v1.2.3 From 4cbe4dac82e423ecc9a0ba46af24a860853259f4 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Mon, 13 Mar 2017 19:29:08 +0200 Subject: net/mlx4_core: Avoid delays during VF driver device shutdown Some Hypervisors detach VFs from VMs by instantly causing an FLR event to be generated for a VF. In the mlx4 case, this will cause that VF's comm channel to be disabled before the VM has an opportunity to invoke the VF device's "shutdown" method. For such Hypervisors, there is a race condition between the VF's shutdown method and its internal-error detection/reset thread. The internal-error detection/reset thread (which runs every 5 seconds) also detects a disabled comm channel. If the internal-error detection/reset flow wins the race, we still get delays (while that flow tries repeatedly to detect comm-channel recovery). The cited commit fixed the command timeout problem when the internal-error detection/reset flow loses the race. This commit avoids the unneeded delays when the internal-error detection/reset flow wins. Fixes: d585df1c5ccf ("net/mlx4_core: Avoid command timeouts during VF driver device shutdown") Signed-off-by: Jack Morgenstein Reported-by: Simon Xiao Signed-off-by: Tariq Toukan Signed-off-by: David S. 
Miller --- include/linux/mlx4/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 7e66e4f62858..1beb1ec2fbdf 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -476,6 +476,7 @@ enum { enum { MLX4_INTERFACE_STATE_UP = 1 << 0, MLX4_INTERFACE_STATE_DELETION = 1 << 1, + MLX4_INTERFACE_STATE_NOWAIT = 1 << 2, }; #define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \ -- cgit v1.2.3 From 36d277bac8080202684e67162ebb157f16631581 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Wed, 15 Mar 2017 09:32:14 +0800 Subject: vsock: track pkt owner vsock So that we can cancel a queued pkt later if necessary. Signed-off-by: Peng Tao Signed-off-by: David S. Miller --- include/linux/virtio_vsock.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 9638bfeb0d1f..584f9a647ad4 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -48,6 +48,8 @@ struct virtio_vsock_pkt { struct virtio_vsock_hdr hdr; struct work_struct work; struct list_head list; + /* socket refcnt not held, only use for cancellation */ + struct vsock_sock *vsk; void *buf; u32 len; u32 off; @@ -56,6 +58,7 @@ struct virtio_vsock_pkt { struct virtio_vsock_pkt_info { u32 remote_cid, remote_port; + struct vsock_sock *vsk; struct msghdr *msg; u32 pkt_len; u16 type; -- cgit v1.2.3 From 16320f363ae128d9b9c70e60f00f2a572f57c23d Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Wed, 15 Mar 2017 09:32:15 +0800 Subject: vhost-vsock: add pkt cancel capability To allow canceling all packets of a connection. Reviewed-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: Peng Tao Signed-off-by: David S. 
Miller --- include/net/af_vsock.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index f2758964ce6f..f32ed9ac181a 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -100,6 +100,9 @@ struct vsock_transport { void (*destruct)(struct vsock_sock *); void (*release)(struct vsock_sock *); + /* Cancel all pending packets sent on vsock. */ + int (*cancel_pkt)(struct vsock_sock *vsk); + /* Connections. */ int (*connect)(struct vsock_sock *); -- cgit v1.2.3 From 1f904495b79003cd3d881de8731377d48fcbc7e3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 18 Mar 2017 19:27:23 +0800 Subject: sctp: define dst_pending_confirm as a bit in sctp_transport As tp->dst_pending_confirm's value can only be set 0 or 1, this patch is to change to define it as a bit instead of __u32. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 07a0b128625a..4f645198e9bd 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -753,6 +753,8 @@ struct sctp_transport { /* Is the Path MTU update pending on this tranport */ pmtu_pending:1, + dst_pending_confirm:1, /* need to confirm neighbour */ + /* Has this transport moved the ctsn since we last sacked */ sack_generation:1; u32 dst_cookie; @@ -806,8 +808,6 @@ struct sctp_transport { __u32 burst_limited; /* Holds old cwnd when max.burst is applied */ - __u32 dst_pending_confirm; /* need to confirm neighbour */ - /* Destination */ struct dst_entry *dst; /* Source address. 
*/ -- cgit v1.2.3 From 4ef1b2869447411ad3ef91ad7d4891a83c1a509a Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Sat, 18 Mar 2017 17:03:00 -0400 Subject: tcp: mark skbs with SCM_TIMESTAMPING_OPT_STATS SOF_TIMESTAMPING_OPT_STATS can be enabled and disabled while packets are collected on the error queue. So, checking SOF_TIMESTAMPING_OPT_STATS in sk->sk_tsflags is not enough to safely assume that the skb contains OPT_STATS data. Add a bit in sock_exterr_skb to indicate whether the skb contains opt_stats data. Fixes: 1c885808e456 ("tcp: SOF_TIMESTAMPING_OPT_STATS option for SO_TIMESTAMPING") Reported-by: JongHwan Kim Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Eric Dumazet Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/errqueue.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/errqueue.h b/include/linux/errqueue.h index 9ca23fcfb5d7..6fdfc884fdeb 100644 --- a/include/linux/errqueue.h +++ b/include/linux/errqueue.h @@ -20,6 +20,8 @@ struct sock_exterr_skb { struct sock_extended_err ee; u16 addr_offset; __be16 port; + u8 opt_stats:1, + unused:7; }; #endif -- cgit v1.2.3 From 1511949c61ec63e4b646c34d602ac6990b38ce30 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 20 Mar 2017 17:46:27 +0800 Subject: sctp: declare struct sctp_stream before using it sctp_stream_free uses struct sctp_stream as a param, but struct sctp_stream is defined after its declaration. This patch is to declare struct sctp_stream before sctp_stream_free. Fixes: a83863174a61 ("sctp: prepare asoc stream for stream reconf") Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. 
Miller --- include/net/sctp/structs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 4f645198e9bd..592decebac75 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -83,6 +83,7 @@ struct sctp_bind_addr; struct sctp_ulpq; struct sctp_ep_common; struct crypto_shash; +struct sctp_stream; #include <net/sctp/tsnmap.h> -- cgit v1.2.3