From c4b0e771f906f5beb7d90c3d28fe55ff9dbd038c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:15 +0100 Subject: [PATCH 01/13] netfilter: avoid using skb->nf_bridge directly This pointer is going to be removed soon, so use the existing helpers in more places to avoid noise when the removal happens. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/netfilter_bridge.h | 33 +++++++++++++----- include/net/netfilter/br_netfilter.h | 6 ---- net/bridge/br_netfilter_hooks.c | 19 ++++++++--- net/ipv4/netfilter/nf_reject_ipv4.c | 6 ++-- net/ipv6/netfilter/nf_reject_ipv6.c | 10 ++++-- net/netfilter/nf_log_common.c | 20 +++++------ net/netfilter/nf_queue.c | 50 ++++++++++++++++++---------- net/netfilter/nfnetlink_queue.c | 23 ++++++------- net/netfilter/xt_physdev.c | 2 +- 9 files changed, 103 insertions(+), 66 deletions(-) diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index fa06865009702..0a65a422587ce 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -17,43 +17,58 @@ static inline void br_drop_fake_rtable(struct sk_buff *skb) skb_dst_drop(skb); } +static inline struct nf_bridge_info * +nf_bridge_info_get(const struct sk_buff *skb) +{ + return skb->nf_bridge; +} + +static inline bool nf_bridge_info_exists(const struct sk_buff *skb) +{ + return skb->nf_bridge != NULL; +} + static inline int nf_bridge_get_physinif(const struct sk_buff *skb) { - struct nf_bridge_info *nf_bridge; + const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - if (skb->nf_bridge == NULL) + if (!nf_bridge) return 0; - nf_bridge = skb->nf_bridge; return nf_bridge->physindev ? nf_bridge->physindev->ifindex : 0; } static inline int nf_bridge_get_physoutif(const struct sk_buff *skb) { - struct nf_bridge_info *nf_bridge; + const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - if (skb->nf_bridge == NULL) + if (!nf_bridge) return 0; - nf_bridge = skb->nf_bridge; return nf_bridge->physoutdev ? nf_bridge->physoutdev->ifindex : 0; } static inline struct net_device * nf_bridge_get_physindev(const struct sk_buff *skb) { - return skb->nf_bridge ? skb->nf_bridge->physindev : NULL; + const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + return nf_bridge ? nf_bridge->physindev : NULL; } static inline struct net_device * nf_bridge_get_physoutdev(const struct sk_buff *skb) { - return skb->nf_bridge ? skb->nf_bridge->physoutdev : NULL; + const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + return nf_bridge ? nf_bridge->physoutdev : NULL; } static inline bool nf_bridge_in_prerouting(const struct sk_buff *skb) { - return skb->nf_bridge && skb->nf_bridge->in_prerouting; + const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + return nf_bridge && nf_bridge->in_prerouting; } #else #define br_drop_fake_rtable(skb) do { } while (0) diff --git a/include/net/netfilter/br_netfilter.h b/include/net/netfilter/br_netfilter.h index 74af19c3a8f74..6efc0153987b3 100644 --- a/include/net/netfilter/br_netfilter.h +++ b/include/net/netfilter/br_netfilter.h @@ -22,12 +22,6 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, struct sock *sk, int (*okfn)(struct net *, struct sock *, struct sk_buff *)); -static inline struct nf_bridge_info * -nf_bridge_info_get(const struct sk_buff *skb) -{ - return skb->nf_bridge; -} - unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb); static inline void nf_bridge_push_encap_header(struct sk_buff *skb) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index c9383c470a838..c58cf68b45c5f 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -247,7 +247,9 @@ static int br_validate_ipv4(struct net *net, struct sk_buff *skb) void nf_bridge_update_protocol(struct sk_buff *skb) { - switch (skb->nf_bridge->orig_proto) { + const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + switch (nf_bridge->orig_proto) { case BRNF_PROTO_8021Q: skb->protocol = htons(ETH_P_8021Q); break; @@ -569,7 +571,8 @@ static unsigned int br_nf_forward_ip(void *priv, struct net_device *parent; u_int8_t pf; - if (!skb->nf_bridge) + nf_bridge = nf_bridge_info_get(skb); + if (!nf_bridge) return NF_ACCEPT; /* Need exclusive nf_bridge_info since we might have multiple @@ -701,7 +704,9 @@ br_nf_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb) { - if (skb->nf_bridge->orig_proto == BRNF_PROTO_PPPOE) + const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + if (nf_bridge->orig_proto == BRNF_PROTO_PPPOE) return PPPOE_SES_HLEN; return 0; } @@ -839,7 +844,9 @@ static unsigned int ip_sabotage_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (skb->nf_bridge && !skb->nf_bridge->in_prerouting && + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + if (nf_bridge && !nf_bridge->in_prerouting && !netif_is_l3_master(skb->dev)) { state->okfn(state->net, state->sk, skb); return NF_STOLEN; @@ -877,7 +884,9 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) static int br_nf_dev_xmit(struct sk_buff *skb) { - if (skb->nf_bridge && skb->nf_bridge->bridged_dnat) { + const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + if (nf_bridge && nf_bridge->bridged_dnat) { br_nf_pre_routing_finish_bridge_slow(skb); return 1; } diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 5cd06ba3535df..aa8304c618b80 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -102,6 +102,7 @@ EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put); /* Send RST reply */ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) { + struct net_device *br_indev __maybe_unused; struct sk_buff *nskb; struct iphdr *niph; const struct tcphdr *oth; @@ -147,10 +148,11 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) * build the eth header using the original destination's MAC as the * source, and send the RST packet directly. */ - if (oldskb->nf_bridge) { + br_indev = nf_bridge_get_physindev(oldskb); + if (br_indev) { struct ethhdr *oeth = eth_hdr(oldskb); - nskb->dev = nf_bridge_get_physindev(oldskb); + nskb->dev = br_indev; niph->tot_len = htons(nskb->len); ip_send_check(niph); if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 24858402e3748..b9c8a763c863c 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -131,6 +131,7 @@ EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_put); void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) { + struct net_device *br_indev __maybe_unused; struct sk_buff *nskb; struct tcphdr _otcph; const struct tcphdr *otcph; @@ -197,15 +198,18 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) * build the eth header using the original destination's MAC as the * source, and send the RST packet directly. */ - if (oldskb->nf_bridge) { + br_indev = nf_bridge_get_physindev(oldskb); + if (br_indev) { struct ethhdr *oeth = eth_hdr(oldskb); - nskb->dev = nf_bridge_get_physindev(oldskb); + nskb->dev = br_indev; nskb->protocol = htons(ETH_P_IPV6); ip6h->payload_len = htons(sizeof(struct tcphdr)); if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), - oeth->h_source, oeth->h_dest, nskb->len) < 0) + oeth->h_source, oeth->h_dest, nskb->len) < 0) { + kfree_skb(nskb); return; + } dev_queue_xmit(nskb); } else #endif diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c index a8c5c846aec10..3a0d6880b7c9f 100644 --- a/net/netfilter/nf_log_common.c +++ b/net/netfilter/nf_log_common.c @@ -156,22 +156,20 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, const struct net_device *out, const struct nf_loginfo *loginfo, const char *prefix) { + const struct net_device *physoutdev __maybe_unused; + const struct net_device *physindev __maybe_unused; + nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ", '0' + loginfo->u.log.level, prefix, in ? in->name : "", out ? out->name : ""); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (skb->nf_bridge) { - const struct net_device *physindev; - const struct net_device *physoutdev; - - physindev = nf_bridge_get_physindev(skb); - if (physindev && in != physindev) - nf_log_buf_add(m, "PHYSIN=%s ", physindev->name); - physoutdev = nf_bridge_get_physoutdev(skb); - if (physoutdev && out != physoutdev) - nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name); - } + physindev = nf_bridge_get_physindev(skb); + if (physindev && in != physindev) + nf_log_buf_add(m, "PHYSIN=%s ", physindev->name); + physoutdev = nf_bridge_get_physoutdev(skb); + if (physoutdev && out != physoutdev) + nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name); #endif } EXPORT_SYMBOL_GPL(nf_log_dump_packet_common); diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index d67a96a25a681..a36a77bae1d6d 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -46,6 +46,24 @@ void nf_unregister_queue_handler(struct net *net) } EXPORT_SYMBOL(nf_unregister_queue_handler); +static void nf_queue_entry_release_br_nf_refs(struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + if (nf_bridge) { + struct net_device *physdev; + + physdev = nf_bridge_get_physindev(skb); + if (physdev) + dev_put(physdev); + physdev = nf_bridge_get_physoutdev(skb); + if (physdev) + dev_put(physdev); + } +#endif +} + void nf_queue_entry_release_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; @@ -57,20 +75,28 @@ void nf_queue_entry_release_refs(struct nf_queue_entry *entry) dev_put(state->out); if (state->sk) sock_put(state->sk); + + nf_queue_entry_release_br_nf_refs(entry->skb); +} +EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs); + +static void nf_queue_entry_get_br_nf_refs(struct sk_buff *skb) +{ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (entry->skb->nf_bridge) { + struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + + if (nf_bridge) { struct net_device *physdev; - physdev = nf_bridge_get_physindev(entry->skb); + physdev = nf_bridge_get_physindev(skb); if (physdev) - dev_put(physdev); - physdev = nf_bridge_get_physoutdev(entry->skb); + dev_hold(physdev); + physdev = nf_bridge_get_physoutdev(skb); if (physdev) - dev_put(physdev); + dev_hold(physdev); } #endif } -EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs); /* Bump dev refs so they don't vanish while packet is out */ void nf_queue_entry_get_refs(struct nf_queue_entry *entry) @@ -83,18 +109,8 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry) dev_hold(state->out); if (state->sk) sock_hold(state->sk); -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (entry->skb->nf_bridge) { - struct net_device *physdev; - physdev = nf_bridge_get_physindev(entry->skb); - if (physdev) - dev_hold(physdev); - physdev = nf_bridge_get_physoutdev(entry->skb); - if (physdev) - dev_hold(physdev); - } -#endif + nf_queue_entry_get_br_nf_refs(entry->skb); } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 1ce30efe6854a..0dcc3592d053f 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -727,13 +727,13 @@ nf_queue_entry_dup(struct nf_queue_entry *e) */ static void nf_bridge_adjust_skb_data(struct sk_buff *skb) { - if (skb->nf_bridge) + if (nf_bridge_info_get(skb)) __skb_push(skb, skb->network_header - skb->mac_header); } static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) { - if (skb->nf_bridge) + if (nf_bridge_info_get(skb)) __skb_pull(skb, skb->network_header - skb->mac_header); } #else @@ -904,23 +904,22 @@ nfqnl_set_mode(struct nfqnl_instance *queue, static int dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) { +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + int physinif, physoutif; + + physinif = nf_bridge_get_physinif(entry->skb); + physoutif = nf_bridge_get_physoutif(entry->skb); + + if (physinif == ifindex || physoutif == ifindex) + return 1; +#endif if (entry->state.in) if (entry->state.in->ifindex == ifindex) return 1; if (entry->state.out) if (entry->state.out->ifindex == ifindex) return 1; -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (entry->skb->nf_bridge) { - int physinif, physoutif; - physinif = nf_bridge_get_physinif(entry->skb); - physoutif = nf_bridge_get_physoutif(entry->skb); - - if (physinif == ifindex || physoutif == ifindex) - return 1; - } -#endif return 0; } diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index 9d6d67b953ac8..4034d70bff394 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -33,7 +33,7 @@ physdev_mt(const struct sk_buff *skb, struct xt_action_param *par) /* Not a bridged IP packet or no info available yet: * LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if * the destination device will be a bridge. */ - if (!skb->nf_bridge) { + if (!nf_bridge_info_exists(skb)) { /* Return MATCH if the invert flags of the used options are on */ if ((info->bitmask & XT_PHYSDEV_OP_BRIDGED) && !(info->invert & XT_PHYSDEV_OP_BRIDGED)) From df5042f4c5b9326c593bf2e31ed859ebc3b4130a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:16 +0100 Subject: [PATCH 02/13] sk_buff: add skb extension infrastructure This adds an optional extension infrastructure, with ispec (xfrm) and bridge netfilter as first users. objdiff shows no changes if kernel is built without xfrm and br_netfilter support. The third (planned future) user is Multipath TCP which is still out-of-tree. MPTCP needs to map logical mptcp sequence numbers to the tcp sequence numbers used by individual subflows. This DSS mapping is read/written from tcp option space on receive and written to tcp option space on transmitted tcp packets that are part of and MPTCP connection. Extending skb_shared_info or adding a private data field to skb fclones doesn't work for incoming skb, so a different DSS propagation method would be required for the receive side. mptcp has same requirements as secpath/bridge netfilter: 1. extension memory is released when the sk_buff is free'd. 2. data is shared after cloning an skb (clone inherits extension) 3. adding extension to an skb will COW the extension buffer if needed. The "MPTCP upstreaming" effort adds SKB_EXT_MPTCP extension to store the mapping for tx and rx processing. Two new members are added to sk_buff: 1. 'active_extensions' byte (filling a hole), telling which extensions are available for this skb. This has two purposes. a) avoids the need to initialize the pointer. b) allows to "delete" an extension by clearing its bit value in ->active_extensions. While it would be possible to store the active_extensions byte in the extension struct instead of sk_buff, there is one problem with this: When an extension has to be disabled, we can always clear the bit in skb->active_extensions. But in case it would be stored in the extension buffer itself, we might have to COW it first, if we are dealing with a cloned skb. On kmalloc failure we would be unable to turn an extension off. 2. extension pointer, located at the end of the sk_buff. If the active_extensions byte is 0, the pointer is undefined, it is not initialized on skb allocation. This adds extra code to skb clone and free paths (to deal with refcount/free of extension area) but this replaces similar code that manages skb->nf_bridge and skb->sp structs in the followup patches of the series. It is possible to add support for extensions that are not preseved on clones/copies. To do this, it would be needed to define a bitmask of all extensions that need copy/cow semantics, and change __skb_ext_copy() to check ->active_extensions & SKB_EXT_PRESERVE_ON_CLONE, then just set ->active_extensions to 0 on the new clone. This isn't done here because all extensions that get added here need the copy/cow semantics. v2: Allocate entire extension space using kmem_cache. Upside is that this allows better tracking of used memory, downside is that we will allocate more space than strictly needed in most cases (its unlikely that all extensions are active/needed at same time for same skb). The allocated memory (except the small extension header) is not cleared, so no additonal overhead aside from memory usage. Avoid atomic_dec_and_test operation on skb_ext_put() by using similar trick as kfree_skbmem() does with fclone_ref: If recount is 1, there is no concurrent user and we can free right away. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 111 ++++++++++++++++++++++++++++- net/Kconfig | 3 + net/core/skbuff.c | 155 +++++++++++++++++++++++++++++++++++++++++ net/ipv4/ip_output.c | 1 + net/ipv6/ip6_output.c | 1 + 5 files changed, 270 insertions(+), 1 deletion(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b1831a5ca1733..88f7541837e34 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -245,6 +245,7 @@ struct iov_iter; struct napi_struct; struct bpf_prog; union bpf_attr; +struct skb_ext; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack { @@ -636,6 +637,7 @@ typedef unsigned char *sk_buff_data_t; * @queue_mapping: Queue mapping for multiqueue devices * @xmit_more: More SKBs are pending for this queue * @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves + * @active_extensions: active extensions (skb_ext_id types) * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport @@ -665,6 +667,7 @@ typedef unsigned char *sk_buff_data_t; * @data: Data head pointer * @truesize: Buffer size * @users: User count - see {datagram,tcp}.c + * @extensions: allocated extensions, valid if active_extensions is nonzero */ struct sk_buff { @@ -747,7 +750,9 @@ struct sk_buff { head_frag:1, xmit_more:1, pfmemalloc:1; - +#ifdef CONFIG_SKB_EXTENSIONS + __u8 active_extensions; +#endif /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() */ @@ -869,6 +874,11 @@ struct sk_buff { *data; unsigned int truesize; refcount_t users; + +#ifdef CONFIG_SKB_EXTENSIONS + /* only useable after checking ->active_extensions != 0 */ + struct skb_ext *extensions; +#endif }; #ifdef __KERNEL__ @@ -3896,6 +3906,105 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct) atomic_inc(&nfct->use); } #endif + +#ifdef CONFIG_SKB_EXTENSIONS +enum skb_ext_id { +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + SKB_EXT_BRIDGE_NF, +#endif + SKB_EXT_NUM, /* must be last */ +}; + +/** + * struct skb_ext - sk_buff extensions + * @refcnt: 1 on allocation, deallocated on 0 + * @offset: offset to add to @data to obtain extension address + * @chunks: size currently allocated, stored in SKB_EXT_ALIGN_SHIFT units + * @data: start of extension data, variable sized + * + * Note: offsets/lengths are stored in chunks of 8 bytes, this allows + * to use 'u8' types while allowing up to 2kb worth of extension data. + */ +struct skb_ext { + refcount_t refcnt; + u8 offset[SKB_EXT_NUM]; /* in chunks of 8 bytes */ + u8 chunks; /* same */ + char data[0] __aligned(8); +}; + +void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id); +void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id); +void __skb_ext_put(struct skb_ext *ext); + +static inline void skb_ext_put(struct sk_buff *skb) +{ + if (skb->active_extensions) + __skb_ext_put(skb->extensions); +} + +static inline void skb_ext_get(struct sk_buff *skb) +{ + if (skb->active_extensions) { + struct skb_ext *ext = skb->extensions; + + if (ext) + refcount_inc(&ext->refcnt); + } +} + +static inline void __skb_ext_copy(struct sk_buff *dst, + const struct sk_buff *src) +{ + dst->active_extensions = src->active_extensions; + + if (src->active_extensions) { + struct skb_ext *ext = src->extensions; + + refcount_inc(&ext->refcnt); + dst->extensions = ext; + } +} + +static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src) +{ + skb_ext_put(dst); + __skb_ext_copy(dst, src); +} + +static inline bool __skb_ext_exist(const struct skb_ext *ext, enum skb_ext_id i) +{ + return !!ext->offset[i]; +} + +static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id) +{ + return skb->active_extensions & (1 << id); +} + +static inline void skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) +{ + if (skb_ext_exist(skb, id)) + __skb_ext_del(skb, id); +} + +static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id) +{ + if (skb_ext_exist(skb, id)) { + struct skb_ext *ext = skb->extensions; + + return (void *)ext + (ext->offset[id] << 3); + } + + return NULL; +} +#else +static inline void skb_ext_put(struct sk_buff *skb) {} +static inline void skb_ext_get(struct sk_buff *skb) {} +static inline void skb_ext_del(struct sk_buff *skb, int unused) {} +static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {} +static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {} +#endif /* CONFIG_SKB_EXTENSIONS */ + #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge) { diff --git a/net/Kconfig b/net/Kconfig index f235edb593ba3..93b2912928609 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -51,6 +51,9 @@ config NET_INGRESS config NET_EGRESS bool +config SKB_EXTENSIONS + bool + menu "Networking options" source "net/packet/Kconfig" diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 40552547c69a7..d2dfad33e6864 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -79,6 +79,9 @@ struct kmem_cache *skbuff_head_cache __ro_after_init; static struct kmem_cache *skbuff_fclone_cache __ro_after_init; +#ifdef CONFIG_SKB_EXTENSIONS +static struct kmem_cache *skbuff_ext_cache __ro_after_init; +#endif int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; EXPORT_SYMBOL(sysctl_max_skb_frags); @@ -617,6 +620,7 @@ void skb_release_head_state(struct sk_buff *skb) #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nf_bridge_put(skb->nf_bridge); #endif + skb_ext_put(skb); } /* Free everything but the sk_buff shell. */ @@ -796,6 +800,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->dev = old->dev; memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); + __skb_ext_copy(new, old); #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); #endif @@ -3902,6 +3907,40 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(skb_gro_receive); +#ifdef CONFIG_SKB_EXTENSIONS +#define SKB_EXT_ALIGN_VALUE 8 +#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) + +static const u8 skb_ext_type_len[] = { +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), +#endif +}; + +static __always_inline unsigned int skb_ext_total_length(void) +{ + return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + skb_ext_type_len[SKB_EXT_BRIDGE_NF] + +#endif + 0; +} + +static void skb_extensions_init(void) +{ + BUILD_BUG_ON(SKB_EXT_NUM >= 8); + BUILD_BUG_ON(skb_ext_total_length() > 255); + + skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", + SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), + 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, + NULL); +} +#else +static void skb_extensions_init(void) {} +#endif + void __init skb_init(void) { skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache", @@ -3916,6 +3955,7 @@ void __init skb_init(void) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + skb_extensions_init(); } static int @@ -5554,3 +5594,118 @@ void skb_condense(struct sk_buff *skb) */ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); } + +#ifdef CONFIG_SKB_EXTENSIONS +static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) +{ + return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); +} + +static struct skb_ext *skb_ext_alloc(void) +{ + struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); + + if (new) { + memset(new->offset, 0, sizeof(new->offset)); + refcount_set(&new->refcnt, 1); + } + + return new; +} + +static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old) +{ + struct skb_ext *new; + + if (refcount_read(&old->refcnt) == 1) + return old; + + new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); + if (!new) + return NULL; + + memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); + refcount_set(&new->refcnt, 1); + + __skb_ext_put(old); + return new; +} + +/** + * skb_ext_add - allocate space for given extension, COW if needed + * @skb: buffer + * @id: extension to allocate space for + * + * Allocates enough space for the given extension. + * If the extension is already present, a pointer to that extension + * is returned. + * + * If the skb was cloned, COW applies and the returned memory can be + * modified without changing the extension space of clones buffers. + * + * Returns pointer to the extension or NULL on allocation failure. + */ +void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) +{ + struct skb_ext *new, *old = NULL; + unsigned int newlen, newoff; + + if (skb->active_extensions) { + old = skb->extensions; + + new = skb_ext_maybe_cow(old); + if (!new) + return NULL; + + if (__skb_ext_exist(old, id)) { + if (old != new) + skb->extensions = new; + goto set_active; + } + + newoff = old->chunks; + } else { + newoff = SKB_EXT_CHUNKSIZEOF(*new); + + new = skb_ext_alloc(); + if (!new) + return NULL; + } + + newlen = newoff + skb_ext_type_len[id]; + new->chunks = newlen; + new->offset[id] = newoff; + skb->extensions = new; +set_active: + skb->active_extensions |= 1 << id; + return skb_ext_get_ptr(new, id); +} +EXPORT_SYMBOL(skb_ext_add); + +void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) +{ + struct skb_ext *ext = skb->extensions; + + skb->active_extensions &= ~(1 << id); + if (skb->active_extensions == 0) { + skb->extensions = NULL; + __skb_ext_put(ext); + } +} +EXPORT_SYMBOL(__skb_ext_del); + +void __skb_ext_put(struct skb_ext *ext) +{ + /* If this is last clone, nothing can increment + * it after check passes. Avoids one atomic op. + */ + if (refcount_read(&ext->refcnt) == 1) + goto free_now; + + if (!refcount_dec_and_test(&ext->refcnt)) + return; +free_now: + kmem_cache_free(skbuff_ext_cache, ext); +} +EXPORT_SYMBOL(__skb_ext_put); +#endif /* CONFIG_SKB_EXTENSIONS */ diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index ab6618036afed..c80188875f392 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -533,6 +533,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif nf_copy(to, from); + skb_ext_copy(to, from); #if IS_ENABLED(CONFIG_IP_VS) to->ipvs_property = from->ipvs_property; #endif diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 9d55ee33b7f95..703a8e801c5cb 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -581,6 +581,7 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) to->tc_index = from->tc_index; #endif nf_copy(to, from); + skb_ext_copy(to, from); skb_copy_secmark(to, from); } From de8bda1d22d38b7d5cd08b33f86efd94d4c86630 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:17 +0100 Subject: [PATCH 03/13] net: convert bridge_nf to use skb extension infrastructure This converts the bridge netfilter (calling iptables hooks from bridge) facility to use the extension infrastructure. The bridge_nf specific hooks in skb clone and free paths are removed, they have been replaced by the skb_ext hooks that do the same as the bridge nf allocations hooks did. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/netfilter_bridge.h | 4 ++-- include/linux/skbuff.h | 28 ++-------------------------- include/net/netfilter/br_netfilter.h | 8 ++++---- net/Kconfig | 1 + net/bridge/br_netfilter_hooks.c | 20 ++------------------ net/bridge/br_netfilter_ipv6.c | 4 ++-- net/core/skbuff.c | 3 --- 7 files changed, 13 insertions(+), 55 deletions(-) diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index 0a65a422587ce..5f2614d02e03a 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -20,12 +20,12 @@ static inline void br_drop_fake_rtable(struct sk_buff *skb) static inline struct nf_bridge_info * nf_bridge_info_get(const struct sk_buff *skb) { - return skb->nf_bridge; + return skb_ext_find(skb, SKB_EXT_BRIDGE_NF); } static inline bool nf_bridge_info_exists(const struct sk_buff *skb) { - return skb->nf_bridge != NULL; + return skb_ext_exist(skb, SKB_EXT_BRIDGE_NF); } static inline int nf_bridge_get_physinif(const struct sk_buff *skb) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 88f7541837e34..2f42d2e99f17f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -255,7 +255,6 @@ struct nf_conntrack { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct nf_bridge_info { - refcount_t use; enum { BRNF_PROTO_UNCHANGED, BRNF_PROTO_8021Q, @@ -720,9 +719,6 @@ struct sk_buff { #endif #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) unsigned long _nfct; -#endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - struct nf_bridge_info *nf_bridge; #endif unsigned int len, data_len; @@ -4005,18 +4001,6 @@ static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {} static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {} #endif /* CONFIG_SKB_EXTENSIONS */ -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) -static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge) -{ - if (nf_bridge && refcount_dec_and_test(&nf_bridge->use)) - kfree(nf_bridge); -} -static inline void nf_bridge_get(struct nf_bridge_info *nf_bridge) -{ - if (nf_bridge) - refcount_inc(&nf_bridge->use); -} -#endif /* CONFIG_BRIDGE_NETFILTER */ static inline void nf_reset(struct sk_buff *skb) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) @@ -4024,8 +4008,7 @@ static inline void nf_reset(struct sk_buff *skb) skb->_nfct = 0; #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - nf_bridge_put(skb->nf_bridge); - skb->nf_bridge = NULL; + skb_ext_del(skb, SKB_EXT_BRIDGE_NF); #endif } @@ -4043,7 +4026,7 @@ static inline void ipvs_reset(struct sk_buff *skb) #endif } -/* Note: This doesn't put any conntrack and bridge info in dst. */ +/* Note: This doesn't put any conntrack info in dst. */ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src, bool copy) { @@ -4051,10 +4034,6 @@ static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src, dst->_nfct = src->_nfct; nf_conntrack_get(skb_nfct(src)); #endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - dst->nf_bridge = src->nf_bridge; - nf_bridge_get(src->nf_bridge); -#endif #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES) if (copy) dst->nf_trace = src->nf_trace; @@ -4065,9 +4044,6 @@ static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src) { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) nf_conntrack_put(skb_nfct(dst)); -#endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - nf_bridge_put(dst->nf_bridge); #endif __nf_copy(dst, src, true); } diff --git a/include/net/netfilter/br_netfilter.h b/include/net/netfilter/br_netfilter.h index 6efc0153987b3..4cd56808ac4e1 100644 --- a/include/net/netfilter/br_netfilter.h +++ b/include/net/netfilter/br_netfilter.h @@ -6,12 +6,12 @@ static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb) { - skb->nf_bridge = kzalloc(sizeof(struct nf_bridge_info), GFP_ATOMIC); + struct nf_bridge_info *b = skb_ext_add(skb, SKB_EXT_BRIDGE_NF); - if (likely(skb->nf_bridge)) - refcount_set(&(skb->nf_bridge->use), 1); + if (b) + memset(b, 0, sizeof(*b)); - return skb->nf_bridge; + return b; } void nf_bridge_update_protocol(struct sk_buff *skb); diff --git a/net/Kconfig b/net/Kconfig index 93b2912928609..5cb9de1aaf886 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -187,6 +187,7 @@ config BRIDGE_NETFILTER depends on NETFILTER && INET depends on NETFILTER_ADVANCED select NETFILTER_FAMILY_BRIDGE + select SKB_EXTENSIONS default m ---help--- Enabling this option will let arptables resp. iptables see bridged diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index c58cf68b45c5f..d21a236984101 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -132,10 +132,7 @@ static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage); static void nf_bridge_info_free(struct sk_buff *skb) { - if (skb->nf_bridge) { - nf_bridge_put(skb->nf_bridge); - skb->nf_bridge = NULL; - } + skb_ext_del(skb, SKB_EXT_BRIDGE_NF); } static inline struct net_device *bridge_parent(const struct net_device *dev) @@ -148,19 +145,7 @@ static inline struct net_device *bridge_parent(const struct net_device *dev) static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb) { - struct nf_bridge_info *nf_bridge = skb->nf_bridge; - - if (refcount_read(&nf_bridge->use) > 1) { - struct nf_bridge_info *tmp = nf_bridge_alloc(skb); - - if (tmp) { - memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info)); - refcount_set(&tmp->use, 1); - } - nf_bridge_put(nf_bridge); - nf_bridge = tmp; - } - return nf_bridge; + return skb_ext_add(skb, SKB_EXT_BRIDGE_NF); } unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) @@ -508,7 +493,6 @@ static unsigned int br_nf_pre_routing(void *priv, if (br_validate_ipv4(state->net, skb)) return NF_DROP; - nf_bridge_put(skb->nf_bridge); if (!nf_bridge_alloc(skb)) return NF_DROP; if (!setup_pre_routing(skb)) diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index 96c072e71ea2e..94039f588f1dd 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -224,8 +224,8 @@ unsigned int br_nf_pre_routing_ipv6(void *priv, if (br_validate_ipv6(state->net, skb)) return NF_DROP; - nf_bridge_put(skb->nf_bridge); - if (!nf_bridge_alloc(skb)) + nf_bridge = nf_bridge_alloc(skb); + if (!nf_bridge) return NF_DROP; if (!setup_pre_routing(skb)) return NF_DROP; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d2dfad33e6864..0c65723591d7b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -616,9 +616,6 @@ void skb_release_head_state(struct sk_buff *skb) } #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_conntrack_put(skb_nfct(skb)); -#endif -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - nf_bridge_put(skb->nf_bridge); #endif skb_ext_put(skb); } From 0ca64da128b816b2826e9b469f47239c47f1df31 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:18 +0100 Subject: [PATCH 04/13] xfrm: change secpath_set to return secpath struct, not error value It can only return 0 (success) or -ENOMEM. Change return value to a pointer to secpath struct. This avoids direct access to skb->sp: err = secpath_set(skb); if (!err) .. skb->sp-> ... Becomes: sp = secpath_set(skb) if (!sp) .. sp-> .. This reduces noise in followup patch which is going to remove skb->sp. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/xfrm.h | 2 +- net/ipv4/esp4_offload.c | 11 ++++++----- net/ipv6/esp6_offload.c | 11 ++++++----- net/ipv6/xfrm6_input.c | 6 ++++-- net/xfrm/xfrm_input.c | 16 +++++++++------- 5 files changed, 26 insertions(+), 20 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 0a8d70d169188..62ca62177bc63 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1131,7 +1131,7 @@ secpath_put(struct sec_path *sp) } struct sec_path *secpath_dup(struct sec_path *src); -int secpath_set(struct sk_buff *skb); +struct sec_path *secpath_set(struct sk_buff *skb); static inline void secpath_reset(struct sk_buff *skb) diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 58834a10c0be7..19bd22aa05f90 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -46,11 +46,12 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, xo = xfrm_offload(skb); if (!xo || !(xo->flags & CRYPTO_DONE)) { - err = secpath_set(skb); - if (err) + struct sec_path *sp = secpath_set(skb); + + if (!sp) goto out; - if (skb->sp->len == XFRM_MAX_DEPTH) + if (sp->len == XFRM_MAX_DEPTH) goto out; x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, @@ -59,8 +60,8 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, if (!x) goto out; - skb->sp->xvec[skb->sp->len++] = x; - skb->sp->olen++; + sp->xvec[sp->len++] = x; + sp->olen++; xo = xfrm_offload(skb); if (!xo) { diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 6177e21711718..01a97f5dfa4e2 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -68,11 +68,12 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, xo = xfrm_offload(skb); if (!xo || !(xo->flags & CRYPTO_DONE)) { - err = secpath_set(skb); - if (err) + struct sec_path *sp = secpath_set(skb); + + if (!sp) goto out; - if (skb->sp->len == XFRM_MAX_DEPTH) + if (sp->len == XFRM_MAX_DEPTH) goto out; x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, @@ -81,8 +82,8 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, if (!x) goto out; - skb->sp->xvec[skb->sp->len++] = x; - skb->sp->olen++; + sp->xvec[sp->len++] = x; + sp->olen++; xo = xfrm_offload(skb); if (!xo) { diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 9ef490dddcea2..97c69df1b3298 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -86,14 +86,16 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, { struct net *net = dev_net(skb->dev); struct xfrm_state *x = NULL; + struct sec_path *sp; int i = 0; - if (secpath_set(skb)) { + sp = secpath_set(skb); + if (!sp) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); goto drop; } - if (1 + skb->sp->len == XFRM_MAX_DEPTH) { + if (1 + sp->len == XFRM_MAX_DEPTH) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto drop; } diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 684c0bc01e2c1..bda929b9ff35f 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -145,21 +145,22 @@ struct sec_path *secpath_dup(struct sec_path *src) } EXPORT_SYMBOL(secpath_dup); -int secpath_set(struct sk_buff *skb) +struct sec_path *secpath_set(struct sk_buff *skb) { - struct sec_path *sp; + struct sec_path *sp = skb->sp; /* Allocate new secpath or COW existing one. */ - if (!skb->sp || refcount_read(&skb->sp->refcnt) != 1) { + if (!sp || refcount_read(&sp->refcnt) != 1) { sp = secpath_dup(skb->sp); if (!sp) - return -ENOMEM; + return NULL; if (skb->sp) secpath_put(skb->sp); skb->sp = sp; } - return 0; + + return sp; } EXPORT_SYMBOL(secpath_set); @@ -236,6 +237,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) bool xfrm_gro = false; bool crypto_done = false; struct xfrm_offload *xo = xfrm_offload(skb); + struct sec_path *sp; if (encap_type < 0) { x = xfrm_input_state(skb); @@ -312,8 +314,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) break; } - err = secpath_set(skb); - if (err) { + sp = secpath_set(skb); + if (!sp) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); goto drop; } From 7af8f4ca314a592e2ba49cb5ea1de1325974998e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:19 +0100 Subject: [PATCH 05/13] net: move secpath_exist helper to sk_buff.h Future patch will remove skb->sp pointer. To reduce noise in those patches, move existing helper to sk_buff and use it in more places to ease skb->sp replacement later. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 13 ++++++++++--- include/net/xfrm.h | 9 --------- net/netfilter/nft_meta.c | 2 +- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2f42d2e99f17f..70ac58240ec0e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4066,12 +4066,19 @@ static inline void skb_init_secmark(struct sk_buff *skb) { } #endif +static inline int secpath_exists(const struct sk_buff *skb) +{ +#ifdef CONFIG_XFRM + return skb->sp != NULL; +#else + return 0; +#endif +} + static inline bool skb_irq_freeable(const struct sk_buff *skb) { return !skb->destructor && -#if IS_ENABLED(CONFIG_XFRM) - !skb->sp && -#endif + !secpath_exists(skb) && !skb_nfct(skb) && !skb->_skb_refdst && !skb_has_frag_list(skb); diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 62ca62177bc63..9cb506d09b983 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1104,15 +1104,6 @@ struct sec_path { struct xfrm_offload ovec[XFRM_MAX_OFFLOAD_DEPTH]; }; -static inline int secpath_exists(struct sk_buff *skb) -{ -#ifdef CONFIG_XFRM - return skb->sp != NULL; -#else - return 0; -#endif -} - static inline struct sec_path * secpath_get(struct sec_path *sp) { diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 6180626c3f80b..6df486c5ebd3b 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -229,7 +229,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, } #ifdef CONFIG_XFRM case NFT_META_SECPATH: - nft_reg_store8(dest, !!skb->sp); + nft_reg_store8(dest, secpath_exists(skb)); break; #endif #ifdef CONFIG_NF_TABLES_BRIDGE From 2294be0f11e22b6197d025e5d3ab42888879ec4e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:20 +0100 Subject: [PATCH 06/13] net: use skb_sec_path helper in more places skb_sec_path gains 'const' qualifier to avoid xt_policy.c: 'skb_sec_path' discards 'const' qualifier from pointer target type same reasoning as previous conversions: Won't need to touch these spots anymore when skb->sp is removed. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- include/net/xfrm.h | 6 ++++-- net/ipv4/esp4.c | 9 ++++++--- net/ipv4/esp4_offload.c | 4 +++- net/ipv6/esp6.c | 9 ++++++--- net/ipv6/esp6_offload.c | 4 +++- net/ipv6/xfrm6_input.c | 2 +- net/netfilter/nft_xfrm.c | 2 +- net/netfilter/xt_policy.c | 2 +- net/xfrm/xfrm_device.c | 4 +++- net/xfrm/xfrm_input.c | 16 ++++++++++------ net/xfrm/xfrm_policy.c | 19 +++++++++++-------- security/selinux/xfrm.c | 4 ++-- 13 files changed, 52 insertions(+), 31 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 70ac58240ec0e..d0f254a016bf0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4124,7 +4124,7 @@ static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb) return skb->dst_pending_confirm != 0; } -static inline struct sec_path *skb_sec_path(struct sk_buff *skb) +static inline struct sec_path *skb_sec_path(const struct sk_buff *skb) { #ifdef CONFIG_XFRM return skb->sp; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 9cb506d09b983..af723448c972b 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1896,14 +1896,16 @@ static inline void xfrm_states_delete(struct xfrm_state **states, int n) #ifdef CONFIG_XFRM static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb) { - return skb->sp->xvec[skb->sp->len - 1]; + struct sec_path *sp = skb_sec_path(skb); + + return sp->xvec[sp->len - 1]; } #endif static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb) { #ifdef CONFIG_XFRM - struct sec_path *sp = skb->sp; + struct sec_path *sp = skb_sec_path(skb); if (!sp || !sp->olen || sp->len != sp->olen) return NULL; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 9e1c840596c5c..5459f41fc26fa 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -125,10 +125,13 @@ static void esp_output_done(struct crypto_async_request *base, int err) void *tmp; struct xfrm_state *x; - if (xo && (xo->flags & XFRM_DEV_RESUME)) - x = skb->sp->xvec[skb->sp->len - 1]; - else + if (xo && (xo->flags & XFRM_DEV_RESUME)) { + struct sec_path *sp = skb_sec_path(skb); + + x = sp->xvec[sp->len - 1]; + } else { x = skb_dst(skb)->xfrm; + } tmp = ESP_SKB_CB(skb)->tmp; esp_ssg_unref(x, tmp); diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 19bd22aa05f90..8756e0e790d2a 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -115,6 +115,7 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, struct crypto_aead *aead; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); + struct sec_path *sp; if (!xo) return ERR_PTR(-EINVAL); @@ -122,7 +123,8 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP)) return ERR_PTR(-EINVAL); - x = skb->sp->xvec[skb->sp->len - 1]; + sp = skb_sec_path(skb); + x = sp->xvec[sp->len - 1]; aead = x->data; esph = ip_esp_hdr(skb); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 63b2b66f9dfae..5afe9f83374de 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -145,10 +145,13 @@ static void esp_output_done(struct crypto_async_request *base, int err) void *tmp; struct xfrm_state *x; - if (xo && (xo->flags & XFRM_DEV_RESUME)) - x = skb->sp->xvec[skb->sp->len - 1]; - else + if (xo && (xo->flags & XFRM_DEV_RESUME)) { + struct sec_path *sp = skb_sec_path(skb); + + x = sp->xvec[sp->len - 1]; + } else { x = skb_dst(skb)->xfrm; + } tmp = ESP_SKB_CB(skb)->tmp; esp_ssg_unref(x, tmp); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 01a97f5dfa4e2..d46b4eb645c2e 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -142,6 +142,7 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb, struct crypto_aead *aead; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); + struct sec_path *sp; if (!xo) return ERR_PTR(-EINVAL); @@ -149,7 +150,8 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb, if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP)) return ERR_PTR(-EINVAL); - x = skb->sp->xvec[skb->sp->len - 1]; + sp = skb_sec_path(skb); + x = sp->xvec[sp->len - 1]; aead = x->data; esph = ip_esp_hdr(skb); diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 97c69df1b3298..a52cb3fc6df5a 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -147,7 +147,7 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, goto drop; } - skb->sp->xvec[skb->sp->len++] = x; + sp->xvec[sp->len++] = x; spin_lock(&x->lock); diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c index 5322609f7662a..b08865ec5ed36 100644 --- a/net/netfilter/nft_xfrm.c +++ b/net/netfilter/nft_xfrm.c @@ -161,7 +161,7 @@ static void nft_xfrm_get_eval_in(const struct nft_xfrm *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - const struct sec_path *sp = pkt->skb->sp; + const struct sec_path *sp = skb_sec_path(pkt->skb); const struct xfrm_state *state; if (sp == NULL || sp->len <= priv->spnum) { diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c index 13f8ccf946d62..aa84e8121c93e 100644 --- a/net/netfilter/xt_policy.c +++ b/net/netfilter/xt_policy.c @@ -56,7 +56,7 @@ match_policy_in(const struct sk_buff *skb, const struct xt_policy_info *info, unsigned short family) { const struct xt_policy_elem *e; - const struct sec_path *sp = skb->sp; + const struct sec_path *sp = skb_sec_path(skb); int strict = info->flags & XT_POLICY_MATCH_STRICT; int i, pos; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 144c137886b16..b8736f56e7f7b 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -32,6 +32,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur struct softnet_data *sd; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); + struct sec_path *sp; if (!xo) return skb; @@ -39,7 +40,8 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur if (!(features & NETIF_F_HW_ESP)) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); - x = skb->sp->xvec[skb->sp->len - 1]; + sp = skb_sec_path(skb); + x = sp->xvec[sp->len - 1]; if (xo->flags & XFRM_GRO || x->xso.flags & XFRM_OFFLOAD_INBOUND) return skb; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index bda929b9ff35f..b4db25b244fa5 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -330,7 +330,9 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) daddr = (xfrm_address_t *)(skb_network_header(skb) + XFRM_SPI_SKB_CB(skb)->daddroff); do { - if (skb->sp->len == XFRM_MAX_DEPTH) { + sp = skb_sec_path(skb); + + if (sp->len == XFRM_MAX_DEPTH) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto drop; @@ -346,7 +348,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) skb->mark = xfrm_smark_get(skb->mark, x); - skb->sp->xvec[skb->sp->len++] = x; + sp->xvec[sp->len++] = x; lock: spin_lock(&x->lock); @@ -470,8 +472,9 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) nf_reset(skb); if (decaps) { - if (skb->sp) - skb->sp->olen = 0; + sp = skb_sec_path(skb); + if (sp) + sp->olen = 0; skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return 0; @@ -482,8 +485,9 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) err = x->inner_mode->afinfo->transport_finish(skb, xfrm_gro || async); if (xfrm_gro) { - if (skb->sp) - skb->sp->olen = 0; + sp = skb_sec_path(skb); + if (sp) + sp->olen = 0; skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return err; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index be04091eb7db4..d6acba07bdc96 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3200,11 +3200,12 @@ EXPORT_SYMBOL(xfrm_lookup_route); static inline int xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl) { + struct sec_path *sp = skb_sec_path(skb); struct xfrm_state *x; - if (!skb->sp || idx < 0 || idx >= skb->sp->len) + if (!sp || idx < 0 || idx >= sp->len) return 0; - x = skb->sp->xvec[idx]; + x = sp->xvec[idx]; if (!x->type->reject) return 0; return x->type->reject(x, skb, fl); @@ -3304,6 +3305,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, struct flowi fl; int xerr_idx = -1; const struct xfrm_if_cb *ifcb; + struct sec_path *sp; struct xfrm_if *xi; u32 if_id = 0; @@ -3328,11 +3330,12 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, nf_nat_decode_session(skb, &fl, family); /* First, check used SA against their selectors. */ - if (skb->sp) { + sp = skb_sec_path(skb); + if (sp) { int i; - for (i = skb->sp->len-1; i >= 0; i--) { - struct xfrm_state *x = skb->sp->xvec[i]; + for (i = sp->len - 1; i >= 0; i--) { + struct xfrm_state *x = sp->xvec[i]; if (!xfrm_selector_match(&x->sel, &fl, family)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH); return 0; @@ -3359,7 +3362,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } if (!pol) { - if (skb->sp && secpath_has_nontransport(skb->sp, 0, &xerr_idx)) { + if (sp && secpath_has_nontransport(sp, 0, &xerr_idx)) { xfrm_secpath_reject(xerr_idx, skb, &fl); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); return 0; @@ -3388,7 +3391,6 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, #endif if (pol->action == XFRM_POLICY_ALLOW) { - struct sec_path *sp; static struct sec_path dummy; struct xfrm_tmpl *tp[XFRM_MAX_DEPTH]; struct xfrm_tmpl *stp[XFRM_MAX_DEPTH]; @@ -3396,7 +3398,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, int ti = 0; int i, k; - if ((sp = skb->sp) == NULL) + sp = skb_sec_path(skb); + if (!sp) sp = &dummy; for (pi = 0; pi < npols; pi++) { diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index 91dc3783ed944..bd7d18bdb147a 100644 --- a/security/selinux/xfrm.c +++ b/security/selinux/xfrm.c @@ -230,7 +230,7 @@ static int selinux_xfrm_skb_sid_ingress(struct sk_buff *skb, u32 *sid, int ckall) { u32 sid_session = SECSID_NULL; - struct sec_path *sp = skb->sp; + struct sec_path *sp = skb_sec_path(skb); if (sp) { int i; @@ -408,7 +408,7 @@ int selinux_xfrm_sock_rcv_skb(u32 sk_sid, struct sk_buff *skb, struct common_audit_data *ad) { int i; - struct sec_path *sp = skb->sp; + struct sec_path *sp = skb_sec_path(skb); u32 peer_sid = SECINITSID_UNLABELED; if (sp) { From 2fdb435bc0d7492b8137d4c9db6a465511154e29 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:21 +0100 Subject: [PATCH 07/13] drivers: net: intel: use secpath helpers in more places Use skb_sec_path and secpath_exists helpers where possible. This reduces noise in followup patch that removes skb->sp pointer. v2: no changes, preseve acks from v1. Acked-by: Shannon Nelson Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 6 ++++-- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 5 +++-- drivers/net/ethernet/intel/ixgbevf/ipsec.c | 6 ++++-- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 2 +- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c index 4d77f42e035c5..8befc7a50f8c6 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c @@ -1065,11 +1065,13 @@ int ixgbe_ipsec_tx(struct ixgbe_ring *tx_ring, struct ixgbe_adapter *adapter = netdev_priv(tx_ring->netdev); struct ixgbe_ipsec *ipsec = adapter->ipsec; struct xfrm_state *xs; + struct sec_path *sp; struct tx_sa *tsa; - if (unlikely(!first->skb->sp->len)) { + sp = skb_sec_path(first->skb); + if (unlikely(!sp->len)) { netdev_err(tx_ring->netdev, "%s: no xfrm state len = %d\n", - __func__, first->skb->sp->len); + __func__, sp->len); return 0; } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index f1e40734c9754..2cd8c42d1403a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -8695,7 +8695,8 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb, #endif /* IXGBE_FCOE */ #ifdef CONFIG_IXGBE_IPSEC - if (skb->sp && !ixgbe_ipsec_tx(tx_ring, first, &ipsec_tx)) + if (secpath_exists(skb) && + !ixgbe_ipsec_tx(tx_ring, first, &ipsec_tx)) goto out_drop; #endif tso = ixgbe_tso(tx_ring, first, &hdr_len, &ipsec_tx); @@ -10192,7 +10193,7 @@ ixgbe_features_check(struct sk_buff *skb, struct net_device *dev, */ if (skb->encapsulation && !(features & NETIF_F_TSO_MANGLEID)) { #ifdef CONFIG_IXGBE_IPSEC - if (!skb->sp) + if (!secpath_exists(skb)) #endif features &= ~NETIF_F_TSO; } diff --git a/drivers/net/ethernet/intel/ixgbevf/ipsec.c b/drivers/net/ethernet/intel/ixgbevf/ipsec.c index e8a3231be0bf3..07644e6bf4983 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ipsec.c +++ b/drivers/net/ethernet/intel/ixgbevf/ipsec.c @@ -450,12 +450,14 @@ int ixgbevf_ipsec_tx(struct ixgbevf_ring *tx_ring, struct ixgbevf_adapter *adapter = netdev_priv(tx_ring->netdev); struct ixgbevf_ipsec *ipsec = adapter->ipsec; struct xfrm_state *xs; + struct sec_path *sp; struct tx_sa *tsa; u16 sa_idx; - if (unlikely(!first->skb->sp->len)) { + sp = skb_sec_path(first->skb); + if (unlikely(!sp->len)) { netdev_err(tx_ring->netdev, "%s: no xfrm state len = %d\n", - __func__, first->skb->sp->len); + __func__, sp->len); return 0; } diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 2de81f046fb55..49e23afa05a2e 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -4157,7 +4157,7 @@ static int ixgbevf_xmit_frame_ring(struct sk_buff *skb, first->protocol = vlan_get_protocol(skb); #ifdef CONFIG_IXGBEVF_IPSEC - if (skb->sp && !ixgbevf_ipsec_tx(tx_ring, first, &ipsec_tx)) + if (secpath_exists(skb) && !ixgbevf_ipsec_tx(tx_ring, first, &ipsec_tx)) goto out_drop; #endif tso = ixgbevf_tso(tx_ring, first, &hdr_len, &ipsec_tx); From 6362a6a04031cacc8ff54b65b84385d9fbab853d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:22 +0100 Subject: [PATCH 08/13] drivers: net: ethernet: mellanox: use skb_sec_path helper Will avoid touching this when sp pointer is removed from sk_buff struct. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- .../ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c index 128a82b1dbfc6..f6717c287ff4b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c @@ -254,11 +254,13 @@ struct sk_buff *mlx5e_ipsec_handle_tx_skb(struct net_device *netdev, struct mlx5e_ipsec_metadata *mdata; struct mlx5e_ipsec_sa_entry *sa_entry; struct xfrm_state *x; + struct sec_path *sp; if (!xo) return skb; - if (unlikely(skb->sp->len != 1)) { + sp = skb_sec_path(skb); + if (unlikely(sp->len != 1)) { atomic64_inc(&priv->ipsec->sw_stats.ipsec_tx_drop_bundle); goto drop; } @@ -372,10 +374,11 @@ struct sk_buff *mlx5e_ipsec_handle_rx_skb(struct net_device *netdev, bool mlx5e_ipsec_feature_check(struct sk_buff *skb, struct net_device *netdev, netdev_features_t features) { + struct sec_path *sp = skb_sec_path(skb); struct xfrm_state *x; - if (skb->sp && skb->sp->len) { - x = skb->sp->xvec[0]; + if (sp && sp->len) { + x = sp->xvec[0]; if (x && x->xso.offload_handle) return true; } From 56d1ac3260dadfc66c81ef2689bd1a09b05731a2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:23 +0100 Subject: [PATCH 09/13] drivers: net: netdevsim: use skb_sec_path helper ... so this won't have to be changed when skb->sp goes away. v2: no changes, preserve ack. Acked-by: Shannon Nelson Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- drivers/net/netdevsim/ipsec.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/netdevsim/ipsec.c b/drivers/net/netdevsim/ipsec.c index 2dcf6cc269d0d..76e11d889bb6e 100644 --- a/drivers/net/netdevsim/ipsec.c +++ b/drivers/net/netdevsim/ipsec.c @@ -227,18 +227,19 @@ static const struct xfrmdev_ops nsim_xfrmdev_ops = { bool nsim_ipsec_tx(struct netdevsim *ns, struct sk_buff *skb) { + struct sec_path *sp = skb_sec_path(skb); struct nsim_ipsec *ipsec = &ns->ipsec; struct xfrm_state *xs; struct nsim_sa *tsa; u32 sa_idx; /* do we even need to check this packet? */ - if (!skb->sp) + if (!sp) return true; - if (unlikely(!skb->sp->len)) { + if (unlikely(!sp->len)) { netdev_err(ns->netdev, "no xfrm state len = %d\n", - skb->sp->len); + sp->len); return false; } From 26912e3756d0a13b188142d1ba0ab279cd3b657a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:24 +0100 Subject: [PATCH 10/13] xfrm: use secpath_exist where applicable Will reduce noise when skb->sp is removed later in this series. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/xfrm.h | 2 +- net/xfrm/xfrm_interface.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index af723448c972b..31220edcce952 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1185,7 +1185,7 @@ static inline int __xfrm_policy_check2(struct sock *sk, int dir, if (sk && sk->sk_policy[XFRM_POLICY_IN]) return __xfrm_policy_check(sk, ndir, skb, family); - return (!net->xfrm.policy_count[dir] && !skb->sp) || + return (!net->xfrm.policy_count[dir] && !secpath_exists(skb)) || (skb_dst(skb)->flags & DST_NOPOLICY) || __xfrm_policy_check(sk, ndir, skb, family); } diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index d679fa0f44b31..6be8c7df15bb2 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -251,7 +251,7 @@ static int xfrmi_rcv_cb(struct sk_buff *skb, int err) struct xfrm_if *xi; bool xnet; - if (err && !skb->sp) + if (err && !secpath_exists(skb)) return 0; x = xfrm_input_state(skb); From a053c866496d0c3647727f6a282a1db3afea3ed7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:25 +0100 Subject: [PATCH 11/13] drivers: chelsio: use skb_sec_path helper reduce noise when skb->sp is removed later in the series. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- drivers/crypto/chelsio/chcr_ipsec.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/chelsio/chcr_ipsec.c b/drivers/crypto/chelsio/chcr_ipsec.c index 461b97e2f1fdc..ceaa16b8f72ef 100644 --- a/drivers/crypto/chelsio/chcr_ipsec.c +++ b/drivers/crypto/chelsio/chcr_ipsec.c @@ -570,6 +570,7 @@ int chcr_ipsec_xmit(struct sk_buff *skb, struct net_device *dev) struct sge_eth_txq *q; struct port_info *pi; dma_addr_t addr[MAX_SKB_FRAGS + 1]; + struct sec_path *sp; bool immediate = false; if (!x->xso.offload_handle) @@ -578,7 +579,8 @@ int chcr_ipsec_xmit(struct sk_buff *skb, struct net_device *dev) sa_entry = (struct ipsec_sa_entry *)x->xso.offload_handle; kctx_len = sa_entry->kctx_len; - if (skb->sp->len != 1) { + sp = skb_sec_path(skb); + if (sp->len != 1) { out_free: dev_kfree_skb_any(skb); return NETDEV_TX_OK; } From a84e3f533324e40e4a99f50dee2188bf140d8098 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:26 +0100 Subject: [PATCH 12/13] xfrm: prefer secpath_set over secpath_dup secpath_set is a wrapper for secpath_dup that will not perform an allocation if the secpath attached to the skb has a reference count of one, i.e., it doesn't need to be COW'ed. Also, secpath_dup doesn't attach the secpath to the skb, it leaves this to the caller. Use secpath_set in places that immediately assign the return value to skb. This allows to remove skb->sp without touching these spots again. secpath_dup can eventually be removed in followup patch. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 9 +++++---- drivers/net/ethernet/intel/ixgbevf/ipsec.c | 9 +++++---- .../ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c | 10 ++++++---- net/xfrm/Kconfig | 1 + net/xfrm/xfrm_output.c | 7 ++----- 5 files changed, 19 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c index 8befc7a50f8c6..ff85ce5791a36 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c @@ -1161,6 +1161,7 @@ void ixgbe_ipsec_rx(struct ixgbe_ring *rx_ring, struct xfrm_state *xs = NULL; struct ipv6hdr *ip6 = NULL; struct iphdr *ip4 = NULL; + struct sec_path *sp; void *daddr; __be32 spi; u8 *c_hdr; @@ -1200,12 +1201,12 @@ void ixgbe_ipsec_rx(struct ixgbe_ring *rx_ring, if (unlikely(!xs)) return; - skb->sp = secpath_dup(skb->sp); - if (unlikely(!skb->sp)) + sp = secpath_set(skb); + if (unlikely(!sp)) return; - skb->sp->xvec[skb->sp->len++] = xs; - skb->sp->olen++; + sp->xvec[sp->len++] = xs; + sp->olen++; xo = xfrm_offload(skb); xo->flags = CRYPTO_DONE; xo->status = CRYPTO_SUCCESS; diff --git a/drivers/net/ethernet/intel/ixgbevf/ipsec.c b/drivers/net/ethernet/intel/ixgbevf/ipsec.c index 07644e6bf4983..5170dd9d8705b 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ipsec.c +++ b/drivers/net/ethernet/intel/ixgbevf/ipsec.c @@ -548,6 +548,7 @@ void ixgbevf_ipsec_rx(struct ixgbevf_ring *rx_ring, struct xfrm_state *xs = NULL; struct ipv6hdr *ip6 = NULL; struct iphdr *ip4 = NULL; + struct sec_path *sp; void *daddr; __be32 spi; u8 *c_hdr; @@ -587,12 +588,12 @@ void ixgbevf_ipsec_rx(struct ixgbevf_ring *rx_ring, if (unlikely(!xs)) return; - skb->sp = secpath_dup(skb->sp); - if (unlikely(!skb->sp)) + sp = secpath_set(skb); + if (unlikely(!sp)) return; - skb->sp->xvec[skb->sp->len++] = xs; - skb->sp->olen++; + sp->xvec[sp->len++] = xs; + sp->olen++; xo = xfrm_offload(skb); xo->flags = CRYPTO_DONE; xo->status = CRYPTO_SUCCESS; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c index f6717c287ff4b..53608afd39b6c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c @@ -307,10 +307,11 @@ mlx5e_ipsec_build_sp(struct net_device *netdev, struct sk_buff *skb, struct mlx5e_priv *priv = netdev_priv(netdev); struct xfrm_offload *xo; struct xfrm_state *xs; + struct sec_path *sp; u32 sa_handle; - skb->sp = secpath_dup(skb->sp); - if (unlikely(!skb->sp)) { + sp = secpath_set(skb); + if (unlikely(!sp)) { atomic64_inc(&priv->ipsec->sw_stats.ipsec_rx_drop_sp_alloc); return NULL; } @@ -322,8 +323,9 @@ mlx5e_ipsec_build_sp(struct net_device *netdev, struct sk_buff *skb, return NULL; } - skb->sp->xvec[skb->sp->len++] = xs; - skb->sp->olen++; + sp = skb_sec_path(skb); + sp->xvec[sp->len++] = xs; + sp->olen++; xo = xfrm_offload(skb); xo->flags = CRYPTO_DONE; diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 140270a13d54f..5d43aaa170273 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -5,6 +5,7 @@ config XFRM bool depends on NET select GRO_CELLS + select SKB_EXTENSIONS config XFRM_OFFLOAD bool diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 4ae87c5ce2e35..757c4d11983b7 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -218,19 +218,16 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) if (xfrm_dev_offload_ok(skb, x)) { struct sec_path *sp; - sp = secpath_dup(skb->sp); + sp = secpath_set(skb); if (!sp) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); kfree_skb(skb); return -ENOMEM; } - if (skb->sp) - secpath_put(skb->sp); - skb->sp = sp; skb->encapsulation = 1; sp->olen++; - sp->xvec[skb->sp->len++] = x; + sp->xvec[sp->len++] = x; xfrm_state_hold(x); if (skb_is_gso(skb)) { From 4165079ba328dd47262a2183049d3591f0a750b1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Dec 2018 17:15:27 +0100 Subject: [PATCH 13/13] net: switch secpath to use skb extension infrastructure Remove skb->sp and allocate secpath storage via extension infrastructure. This also reduces sk_buff by 8 bytes on x86_64. Total size of allyesconfig kernel is reduced slightly, as there is less inlined code (one conditional atomic op instead of two on skb_clone). No differences in throughput in following ipsec performance tests: - transport mode with aes on 10GB link - tunnel mode between two network namespaces with aes and null cipher Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- Documentation/networking/xfrm_device.txt | 7 +-- include/linux/skbuff.h | 10 ++--- include/net/xfrm.h | 22 +--------- net/core/skbuff.c | 47 +++++++++++++++++--- net/xfrm/xfrm_input.c | 56 ++++-------------------- 5 files changed, 59 insertions(+), 83 deletions(-) diff --git a/Documentation/networking/xfrm_device.txt b/Documentation/networking/xfrm_device.txt index 267f55b5f54a9..a1c904dc70dcc 100644 --- a/Documentation/networking/xfrm_device.txt +++ b/Documentation/networking/xfrm_device.txt @@ -111,9 +111,10 @@ the stack in xfrm_input(). xfrm_state_hold(xs); store the state information into the skb - skb->sp = secpath_dup(skb->sp); - skb->sp->xvec[skb->sp->len++] = xs; - skb->sp->olen++; + sp = secpath_set(skb); + if (!sp) return; + sp->xvec[sp->len++] = xs; + sp->olen++; indicate the success and/or error status of the offload xo = xfrm_offload(skb); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d0f254a016bf0..3f741b04e55db 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -714,9 +714,6 @@ struct sk_buff { struct list_head tcp_tsorted_anchor; }; -#ifdef CONFIG_XFRM - struct sec_path *sp; -#endif #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) unsigned long _nfct; #endif @@ -3907,6 +3904,9 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct) enum skb_ext_id { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) SKB_EXT_BRIDGE_NF, +#endif +#ifdef CONFIG_XFRM + SKB_EXT_SEC_PATH, #endif SKB_EXT_NUM, /* must be last */ }; @@ -4069,7 +4069,7 @@ static inline void skb_init_secmark(struct sk_buff *skb) static inline int secpath_exists(const struct sk_buff *skb) { #ifdef CONFIG_XFRM - return skb->sp != NULL; + return skb_ext_exist(skb, SKB_EXT_SEC_PATH); #else return 0; #endif @@ -4127,7 +4127,7 @@ static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb) static inline struct sec_path *skb_sec_path(const struct sk_buff *skb) { #ifdef CONFIG_XFRM - return skb->sp; + return skb_ext_find(skb, SKB_EXT_SEC_PATH); #else return NULL; #endif diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 31220edcce952..38c232861a64b 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1096,7 +1096,6 @@ struct xfrm_offload { }; struct sec_path { - refcount_t refcnt; int len; int olen; @@ -1104,32 +1103,13 @@ struct sec_path { struct xfrm_offload ovec[XFRM_MAX_OFFLOAD_DEPTH]; }; -static inline struct sec_path * -secpath_get(struct sec_path *sp) -{ - if (sp) - refcount_inc(&sp->refcnt); - return sp; -} - -void __secpath_destroy(struct sec_path *sp); - -static inline void -secpath_put(struct sec_path *sp) -{ - if (sp && refcount_dec_and_test(&sp->refcnt)) - __secpath_destroy(sp); -} - -struct sec_path *secpath_dup(struct sec_path *src); struct sec_path *secpath_set(struct sk_buff *skb); static inline void secpath_reset(struct sk_buff *skb) { #ifdef CONFIG_XFRM - secpath_put(skb->sp); - skb->sp = NULL; + skb_ext_del(skb, SKB_EXT_SEC_PATH); #endif } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0c65723591d7b..cb0bf4215745a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -609,7 +609,6 @@ static void kfree_skbmem(struct sk_buff *skb) void skb_release_head_state(struct sk_buff *skb) { skb_dst_drop(skb); - secpath_reset(skb); if (skb->destructor) { WARN_ON(in_irq()); skb->destructor(skb); @@ -798,9 +797,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) memcpy(new->cb, old->cb, sizeof(old->cb)); skb_dst_copy(new, old); __skb_ext_copy(new, old); -#ifdef CONFIG_XFRM - new->sp = secpath_get(old->sp); -#endif __nf_copy(new, old, false); /* Note : this field could be in headers_start/headers_end section @@ -3912,6 +3908,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), #endif +#ifdef CONFIG_XFRM + [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) @@ -3919,6 +3918,9 @@ static __always_inline unsigned int skb_ext_total_length(void) return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) skb_ext_type_len[SKB_EXT_BRIDGE_NF] + +#endif +#ifdef CONFIG_XFRM + skb_ext_type_len[SKB_EXT_SEC_PATH] + #endif 0; } @@ -5610,7 +5612,8 @@ static struct skb_ext *skb_ext_alloc(void) return new; } -static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old) +static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, + unsigned int old_active) { struct skb_ext *new; @@ -5624,6 +5627,15 @@ static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old) memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); refcount_set(&new->refcnt, 1); +#ifdef CONFIG_XFRM + if (old_active & (1 << SKB_EXT_SEC_PATH)) { + struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH); + unsigned int i; + + for (i = 0; i < sp->len; i++) + xfrm_state_hold(sp->xvec[i]); + } +#endif __skb_ext_put(old); return new; } @@ -5650,7 +5662,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) if (skb->active_extensions) { old = skb->extensions; - new = skb_ext_maybe_cow(old); + new = skb_ext_maybe_cow(old, skb->active_extensions); if (!new) return NULL; @@ -5679,6 +5691,16 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) } EXPORT_SYMBOL(skb_ext_add); +#ifdef CONFIG_XFRM +static void skb_ext_put_sp(struct sec_path *sp) +{ + unsigned int i; + + for (i = 0; i < sp->len; i++) + xfrm_state_put(sp->xvec[i]); +} +#endif + void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) { struct skb_ext *ext = skb->extensions; @@ -5687,6 +5709,14 @@ void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) if (skb->active_extensions == 0) { skb->extensions = NULL; __skb_ext_put(ext); +#ifdef CONFIG_XFRM + } else if (id == SKB_EXT_SEC_PATH && + refcount_read(&ext->refcnt) == 1) { + struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH); + + skb_ext_put_sp(sp); + sp->len = 0; +#endif } } EXPORT_SYMBOL(__skb_ext_del); @@ -5702,6 +5732,11 @@ void __skb_ext_put(struct skb_ext *ext) if (!refcount_dec_and_test(&ext->refcnt)) return; free_now: +#ifdef CONFIG_XFRM + if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH)) + skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH)); +#endif + kmem_cache_free(skbuff_ext_cache, ext); } EXPORT_SYMBOL(__skb_ext_put); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index b4db25b244fa5..6bc817359b580 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -38,8 +38,6 @@ struct xfrm_trans_cb { #define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0])) -static struct kmem_cache *secpath_cachep __ro_after_init; - static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1]; @@ -111,54 +109,21 @@ static int xfrm_rcv_cb(struct sk_buff *skb, unsigned int family, u8 protocol, return ret; } -void __secpath_destroy(struct sec_path *sp) -{ - int i; - for (i = 0; i < sp->len; i++) - xfrm_state_put(sp->xvec[i]); - kmem_cache_free(secpath_cachep, sp); -} -EXPORT_SYMBOL(__secpath_destroy); - -struct sec_path *secpath_dup(struct sec_path *src) +struct sec_path *secpath_set(struct sk_buff *skb) { - struct sec_path *sp; + struct sec_path *sp, *tmp = skb_ext_find(skb, SKB_EXT_SEC_PATH); - sp = kmem_cache_alloc(secpath_cachep, GFP_ATOMIC); + sp = skb_ext_add(skb, SKB_EXT_SEC_PATH); if (!sp) return NULL; - sp->len = 0; - sp->olen = 0; + if (tmp) /* reused existing one (was COW'd if needed) */ + return sp; + /* allocated new secpath */ memset(sp->ovec, 0, sizeof(sp->ovec)); - - if (src) { - int i; - - memcpy(sp, src, sizeof(*sp)); - for (i = 0; i < sp->len; i++) - xfrm_state_hold(sp->xvec[i]); - } - refcount_set(&sp->refcnt, 1); - return sp; -} -EXPORT_SYMBOL(secpath_dup); - -struct sec_path *secpath_set(struct sk_buff *skb) -{ - struct sec_path *sp = skb->sp; - - /* Allocate new secpath or COW existing one. */ - if (!sp || refcount_read(&sp->refcnt) != 1) { - sp = secpath_dup(skb->sp); - if (!sp) - return NULL; - - if (skb->sp) - secpath_put(skb->sp); - skb->sp = sp; - } + sp->olen = 0; + sp->len = 0; return sp; } @@ -552,11 +517,6 @@ void __init xfrm_input_init(void) if (err) gro_cells.cells = NULL; - secpath_cachep = kmem_cache_create("secpath_cache", - sizeof(struct sec_path), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, - NULL); - for_each_possible_cpu(i) { struct xfrm_trans_tasklet *trans;