diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index fbfc13d81f66a..7c53e06b31c32 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -70,6 +70,7 @@ struct geneve_dev { bool collect_md; bool use_udp6_rx_checksums; bool ttl_inherit; + enum ifla_geneve_df df; }; struct geneve_sock { @@ -387,6 +388,57 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) return 0; } +/* Callback from net/ipv{4,6}/udp.c to check that we have a tunnel for errors */ +static int geneve_udp_encap_err_lookup(struct sock *sk, struct sk_buff *skb) +{ + struct genevehdr *geneveh; + struct geneve_sock *gs; + u8 zero_vni[3] = { 0 }; + u8 *vni = zero_vni; + + if (skb->len < GENEVE_BASE_HLEN) + return -EINVAL; + + geneveh = geneve_hdr(skb); + if (geneveh->ver != GENEVE_VER) + return -EINVAL; + + if (geneveh->proto_type != htons(ETH_P_TEB)) + return -EINVAL; + + gs = rcu_dereference_sk_user_data(sk); + if (!gs) + return -ENOENT; + + if (geneve_get_sk_family(gs) == AF_INET) { + struct iphdr *iph = ip_hdr(skb); + __be32 addr4 = 0; + + if (!gs->collect_md) { + vni = geneve_hdr(skb)->vni; + addr4 = iph->daddr; + } + + return geneve_lookup(gs, addr4, vni) ? 0 : -ENOENT; + } + +#if IS_ENABLED(CONFIG_IPV6) + if (geneve_get_sk_family(gs) == AF_INET6) { + struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct in6_addr addr6 = { 0 }; + + if (!gs->collect_md) { + vni = geneve_hdr(skb)->vni; + addr6 = ip6h->daddr; + } + + return geneve6_lookup(gs, addr6, vni) ? 0 : -ENOENT; + } +#endif + + return -EPFNOSUPPORT; +} + static struct socket *geneve_create_sock(struct net *net, bool ipv6, __be16 port, bool ipv6_rx_csum) { @@ -544,6 +596,7 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, tunnel_cfg.gro_receive = geneve_gro_receive; tunnel_cfg.gro_complete = geneve_gro_complete; tunnel_cfg.encap_rcv = geneve_udp_encap_recv; + tunnel_cfg.encap_err_lookup = geneve_udp_encap_err_lookup; tunnel_cfg.encap_destroy = NULL; setup_udp_tunnel_sock(net, sock, &tunnel_cfg); list_add(&gs->list, &gn->sock_list); @@ -823,8 +876,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct rtable *rt; struct flowi4 fl4; __u8 tos, ttl; + __be16 df = 0; __be16 sport; - __be16 df; int err; rt = geneve_get_v4_rt(skb, dev, gs4, &fl4, info); @@ -838,6 +891,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, if (geneve->collect_md) { tos = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb); ttl = key->ttl; + + df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; } else { tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, ip_hdr(skb), skb); if (geneve->ttl_inherit) @@ -845,8 +900,22 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, else ttl = key->ttl; ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); + + if (geneve->df == GENEVE_DF_SET) { + df = htons(IP_DF); + } else if (geneve->df == GENEVE_DF_INHERIT) { + struct ethhdr *eth = eth_hdr(skb); + + if (ntohs(eth->h_proto) == ETH_P_IPV6) { + df = htons(IP_DF); + } else if (ntohs(eth->h_proto) == ETH_P_IP) { + struct iphdr *iph = ip_hdr(skb); + + if (iph->frag_off & htons(IP_DF)) + df = htons(IP_DF); + } + } } - df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr)); if (unlikely(err)) @@ -1093,6 +1162,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = { [IFLA_GENEVE_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, [IFLA_GENEVE_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 }, [IFLA_GENEVE_TTL_INHERIT] = { .type = NLA_U8 }, + [IFLA_GENEVE_DF] = { .type = NLA_U8 }, }; static int geneve_validate(struct nlattr *tb[], struct nlattr *data[], @@ -1128,6 +1198,16 @@ static int geneve_validate(struct nlattr *tb[], struct nlattr *data[], } } + if (data[IFLA_GENEVE_DF]) { + enum ifla_geneve_df df = nla_get_u8(data[IFLA_GENEVE_DF]); + + if (df < 0 || df > GENEVE_DF_MAX) { + NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_GENEVE_DF], + "Invalid DF attribute"); + return -EINVAL; + } + } + return 0; } @@ -1173,7 +1253,7 @@ static int geneve_configure(struct net *net, struct net_device *dev, struct netlink_ext_ack *extack, const struct ip_tunnel_info *info, bool metadata, bool ipv6_rx_csum, - bool ttl_inherit) + bool ttl_inherit, enum ifla_geneve_df df) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_dev *t, *geneve = netdev_priv(dev); @@ -1223,6 +1303,7 @@ static int geneve_configure(struct net *net, struct net_device *dev, geneve->collect_md = metadata; geneve->use_udp6_rx_checksums = ipv6_rx_csum; geneve->ttl_inherit = ttl_inherit; + geneve->df = df; err = register_netdevice(dev); if (err) @@ -1242,7 +1323,7 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack, struct ip_tunnel_info *info, bool *metadata, bool *use_udp6_rx_checksums, bool *ttl_inherit, - bool changelink) + enum ifla_geneve_df *df, bool changelink) { int attrtype; @@ -1330,6 +1411,9 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[], if (data[IFLA_GENEVE_TOS]) info->key.tos = nla_get_u8(data[IFLA_GENEVE_TOS]); + if (data[IFLA_GENEVE_DF]) + *df = nla_get_u8(data[IFLA_GENEVE_DF]); + if (data[IFLA_GENEVE_LABEL]) { info->key.label = nla_get_be32(data[IFLA_GENEVE_LABEL]) & IPV6_FLOWLABEL_MASK; @@ -1448,6 +1532,7 @@ static int geneve_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { + enum ifla_geneve_df df = GENEVE_DF_UNSET; bool use_udp6_rx_checksums = false; struct ip_tunnel_info info; bool ttl_inherit = false; @@ -1456,12 +1541,12 @@ static int geneve_newlink(struct net *net, struct net_device *dev, init_tnl_info(&info, GENEVE_UDP_PORT); err = geneve_nl2info(tb, data, extack, &info, &metadata, - &use_udp6_rx_checksums, &ttl_inherit, false); + &use_udp6_rx_checksums, &ttl_inherit, &df, false); if (err) return err; err = geneve_configure(net, dev, extack, &info, metadata, - use_udp6_rx_checksums, ttl_inherit); + use_udp6_rx_checksums, ttl_inherit, df); if (err) return err; @@ -1524,6 +1609,7 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_info info; bool metadata; bool use_udp6_rx_checksums; + enum ifla_geneve_df df; bool ttl_inherit; int err; @@ -1539,7 +1625,7 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[], use_udp6_rx_checksums = geneve->use_udp6_rx_checksums; ttl_inherit = geneve->ttl_inherit; err = geneve_nl2info(tb, data, extack, &info, &metadata, - &use_udp6_rx_checksums, &ttl_inherit, true); + &use_udp6_rx_checksums, &ttl_inherit, &df, true); if (err) return err; @@ -1572,6 +1658,7 @@ static size_t geneve_get_size(const struct net_device *dev) nla_total_size(sizeof(struct in6_addr)) + /* IFLA_GENEVE_REMOTE{6} */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TOS */ + nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_DF */ nla_total_size(sizeof(__be32)) + /* IFLA_GENEVE_LABEL */ nla_total_size(sizeof(__be16)) + /* IFLA_GENEVE_PORT */ nla_total_size(0) + /* IFLA_GENEVE_COLLECT_METADATA */ @@ -1620,6 +1707,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_be32(skb, IFLA_GENEVE_LABEL, info->key.label)) goto nla_put_failure; + if (nla_put_u8(skb, IFLA_GENEVE_DF, geneve->df)) + goto nla_put_failure; + if (nla_put_be16(skb, IFLA_GENEVE_PORT, info->key.tp_dst)) goto nla_put_failure; @@ -1671,7 +1761,8 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name, return dev; init_tnl_info(&info, dst_port); - err = geneve_configure(net, dev, NULL, &info, true, true, false); + err = geneve_configure(net, dev, NULL, &info, + true, true, false, GENEVE_DF_UNSET); if (err) { free_netdev(dev); return ERR_PTR(err); diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index ae969f806d565..c3e65e78f0156 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1552,6 +1552,34 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) return 0; } +/* Callback from net/ipv{4,6}/udp.c to check that we have a VNI for errors */ +static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb) +{ + struct vxlan_dev *vxlan; + struct vxlan_sock *vs; + struct vxlanhdr *hdr; + __be32 vni; + + if (skb->len < VXLAN_HLEN) + return -EINVAL; + + hdr = vxlan_hdr(skb); + + if (!(hdr->vx_flags & VXLAN_HF_VNI)) + return -EINVAL; + + vs = rcu_dereference_sk_user_data(sk); + if (!vs) + return -ENOENT; + + vni = vxlan_vni(hdr->vx_vni); + vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni); + if (!vxlan) + return -ENOENT; + + return 0; +} + static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); @@ -2250,13 +2278,24 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - /* Bypass encapsulation if the destination is local */ if (!info) { + /* Bypass encapsulation if the destination is local */ err = encap_bypass_if_local(skb, dev, vxlan, dst, dst_port, ifindex, vni, &rt->dst, rt->rt_flags); if (err) goto out_unlock; + + if (vxlan->cfg.df == VXLAN_DF_SET) { + df = htons(IP_DF); + } else if (vxlan->cfg.df == VXLAN_DF_INHERIT) { + struct ethhdr *eth = eth_hdr(skb); + + if (ntohs(eth->h_proto) == ETH_P_IPV6 || + (ntohs(eth->h_proto) == ETH_P_IP && + old_iph->frag_off & htons(IP_DF))) + df = htons(IP_DF); + } } else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) { df = htons(IP_DF); } @@ -2809,6 +2848,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_GPE] = { .type = NLA_FLAG, }, [IFLA_VXLAN_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG }, [IFLA_VXLAN_TTL_INHERIT] = { .type = NLA_FLAG }, + [IFLA_VXLAN_DF] = { .type = NLA_U8 }, }; static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], @@ -2865,6 +2905,16 @@ static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], } } + if (data[IFLA_VXLAN_DF]) { + enum ifla_vxlan_df df = nla_get_u8(data[IFLA_VXLAN_DF]); + + if (df < 0 || df > VXLAN_DF_MAX) { + NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_DF], + "Invalid DF attribute"); + return -EINVAL; + } + } + return 0; } @@ -2948,6 +2998,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6, tunnel_cfg.sk_user_data = vs; tunnel_cfg.encap_type = 1; tunnel_cfg.encap_rcv = vxlan_rcv; + tunnel_cfg.encap_err_lookup = vxlan_err_lookup; tunnel_cfg.encap_destroy = NULL; tunnel_cfg.gro_receive = vxlan_gro_receive; tunnel_cfg.gro_complete = vxlan_gro_complete; @@ -3509,6 +3560,9 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], conf->mtu = nla_get_u32(tb[IFLA_MTU]); } + if (data[IFLA_VXLAN_DF]) + conf->df = nla_get_u8(data[IFLA_VXLAN_DF]); + return 0; } @@ -3601,6 +3655,7 @@ static size_t vxlan_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL_INHERIT */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_DF */ nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_PROXY */ @@ -3667,6 +3722,7 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT, !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) || nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) || + nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) || nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) || nla_put_u8(skb, IFLA_VXLAN_LEARNING, !!(vxlan->cfg.flags & VXLAN_F_LEARN)) || diff --git a/include/linux/udp.h b/include/linux/udp.h index 0a9c54e76305a..2725c83395bfd 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -77,6 +77,7 @@ struct udp_sock { * For encapsulation sockets. */ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); + int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb); void (*encap_destroy)(struct sock *sk); /* GRO functions for UDP socket */ diff --git a/include/net/icmp.h b/include/net/icmp.h index 3ef2743a8eecc..6ac3a5bd0117c 100644 --- a/include/net/icmp.h +++ b/include/net/icmp.h @@ -41,7 +41,7 @@ struct net; void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info); int icmp_rcv(struct sk_buff *skb); -void icmp_err(struct sk_buff *skb, u32 info); +int icmp_err(struct sk_buff *skb, u32 info); int icmp_init(void); void icmp_out_count(struct net *net, unsigned char type); diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 236e40ba06bf1..69b4bcf880c9e 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -69,6 +69,8 @@ struct ip6_tnl_encap_ops { size_t (*encap_hlen)(struct ip_tunnel_encap *e); int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi6 *fl6); + int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info); }; #ifdef CONFIG_INET diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index b0d022ff6ea17..db6b2218a2ad2 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -311,6 +311,7 @@ struct ip_tunnel_encap_ops { size_t (*encap_hlen)(struct ip_tunnel_encap *e); int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4); + int (*err_handler)(struct sk_buff *skb, u32 info); }; #define MAX_IPTUN_ENCAP_OPS 8 diff --git a/include/net/protocol.h b/include/net/protocol.h index 4fc75f7ae23be..92b3eaad60888 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -42,7 +42,10 @@ struct net_protocol { int (*early_demux)(struct sk_buff *skb); int (*early_demux_handler)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); - void (*err_handler)(struct sk_buff *skb, u32 info); + + /* This returns an error if we weren't able to handle the error. */ + int (*err_handler)(struct sk_buff *skb, u32 info); + unsigned int no_policy:1, netns_ok:1, /* does the protocol do more stringent @@ -58,10 +61,12 @@ struct inet6_protocol { void (*early_demux_handler)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); - void (*err_handler)(struct sk_buff *skb, + /* This returns an error if we weren't able to handle the error. */ + int (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info); + unsigned int flags; /* INET6_PROTO_xxx */ }; diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 8c2caa370e0f6..9a3b48a35e90c 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -151,7 +151,7 @@ int sctp_primitive_RECONF(struct net *net, struct sctp_association *asoc, * sctp/input.c */ int sctp_rcv(struct sk_buff *skb); -void sctp_v4_err(struct sk_buff *skb, u32 info); +int sctp_v4_err(struct sk_buff *skb, u32 info); void sctp_hash_endpoint(struct sctp_endpoint *); void sctp_unhash_endpoint(struct sctp_endpoint *); struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *, diff --git a/include/net/tcp.h b/include/net/tcp.h index a18914d204864..4743836bed2e2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -313,7 +313,7 @@ extern struct proto tcp_prot; void tcp_tasklet_init(void); -void tcp_v4_err(struct sk_buff *skb, u32); +int tcp_v4_err(struct sk_buff *skb, u32); void tcp_shutdown(struct sock *sk, int how); diff --git a/include/net/udp.h b/include/net/udp.h index eccca2325ee6d..fd6d948755c81 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -283,7 +283,7 @@ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); int udp_get_port(struct sock *sk, unsigned short snum, int (*saddr_cmp)(const struct sock *, const struct sock *)); -void udp_err(struct sk_buff *, u32); +int udp_err(struct sk_buff *, u32); int udp_abort(struct sock *sk, int err); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int udp_push_pending_frames(struct sock *sk); diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 3fbe56430e3b3..dc8d804af3b4d 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -64,6 +64,8 @@ static inline int udp_sock_create(struct net *net, } typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb); +typedef int (*udp_tunnel_encap_err_lookup_t)(struct sock *sk, + struct sk_buff *skb); typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk); typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk, struct list_head *head, @@ -76,6 +78,7 @@ struct udp_tunnel_sock_cfg { /* Used for setting up udp_sock fields, see udp.h for details */ __u8 encap_type; udp_tunnel_encap_rcv_t encap_rcv; + udp_tunnel_encap_err_lookup_t encap_err_lookup; udp_tunnel_encap_destroy_t encap_destroy; udp_tunnel_gro_receive_t gro_receive; udp_tunnel_gro_complete_t gro_complete; diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 03431c148e167..ec999c49df1f3 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -216,6 +216,7 @@ struct vxlan_config { unsigned long age_interval; unsigned int addrmax; bool no_share; + enum ifla_vxlan_df df; }; struct vxlan_dev_node { diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 1debfa42cba1a..f42c069d81db3 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -533,6 +533,7 @@ enum { IFLA_VXLAN_LABEL, IFLA_VXLAN_GPE, IFLA_VXLAN_TTL_INHERIT, + IFLA_VXLAN_DF, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) @@ -542,6 +543,14 @@ struct ifla_vxlan_port_range { __be16 high; }; +enum ifla_vxlan_df { + VXLAN_DF_UNSET = 0, + VXLAN_DF_SET, + VXLAN_DF_INHERIT, + __VXLAN_DF_END, + VXLAN_DF_MAX = __VXLAN_DF_END - 1, +}; + /* GENEVE section */ enum { IFLA_GENEVE_UNSPEC, @@ -557,10 +566,19 @@ enum { IFLA_GENEVE_UDP_ZERO_CSUM6_RX, IFLA_GENEVE_LABEL, IFLA_GENEVE_TTL_INHERIT, + IFLA_GENEVE_DF, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) +enum ifla_geneve_df { + GENEVE_DF_UNSET = 0, + GENEVE_DF_SET, + GENEVE_DF_INHERIT, + __GENEVE_DF_END, + GENEVE_DF_MAX = __GENEVE_DF_END - 1, +}; + /* PPP section */ enum { IFLA_PPP_UNSPEC, diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 8e08cea6f1786..26a21d97b6b07 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -231,7 +231,7 @@ EXPORT_SYMBOL(dccp_req_err); * check at all. A more general error queue to queue errors for later handling * is probably better. */ -static void dccp_v4_err(struct sk_buff *skb, u32 info) +static int dccp_v4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (struct iphdr *)skb->data; const u8 offset = iph->ihl << 2; @@ -259,16 +259,18 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) inet_iif(skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); - return; + return -ENOENT; } if (sk->sk_state == DCCP_TIME_WAIT) { inet_twsk_put(inet_twsk(sk)); - return; + return 0; } seq = dccp_hdr_seq(dh); - if (sk->sk_state == DCCP_NEW_SYN_RECV) - return dccp_req_err(sk, seq); + if (sk->sk_state == DCCP_NEW_SYN_RECV) { + dccp_req_err(sk, seq); + return 0; + } bh_lock_sock(sk); /* If too many ICMPs get dropped on busy @@ -357,6 +359,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) out: bh_unlock_sock(sk); sock_put(sk); + return 0; } static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 6344f1b18a6a1..d5740bad5b181 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -68,7 +68,7 @@ static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb) } -static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; @@ -96,16 +96,18 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); - return; + return -ENOENT; } if (sk->sk_state == DCCP_TIME_WAIT) { inet_twsk_put(inet_twsk(sk)); - return; + return 0; } seq = dccp_hdr_seq(dh); - if (sk->sk_state == DCCP_NEW_SYN_RECV) - return dccp_req_err(sk, seq); + if (sk->sk_state == DCCP_NEW_SYN_RECV) { + dccp_req_err(sk, seq); + return 0; + } bh_lock_sock(sk); if (sock_owned_by_user(sk)) @@ -183,6 +185,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, out: bh_unlock_sock(sk); sock_put(sk); + return 0; } diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 500a59906b871..0d0ad19ecb870 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -1003,15 +1004,82 @@ static int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, return 0; } +static int gue_err_proto_handler(int proto, struct sk_buff *skb, u32 info) +{ + const struct net_protocol *ipprot = rcu_dereference(inet_protos[proto]); + + if (ipprot && ipprot->err_handler) { + if (!ipprot->err_handler(skb, info)) + return 0; + } + + return -ENOENT; +} + +static int gue_err(struct sk_buff *skb, u32 info) +{ + int transport_offset = skb_transport_offset(skb); + struct guehdr *guehdr; + size_t optlen; + int ret; + + if (skb->len < sizeof(struct udphdr) + sizeof(struct guehdr)) + return -EINVAL; + + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; + + switch (guehdr->version) { + case 0: /* Full GUE header present */ + break; + case 1: { + /* Direct encasulation of IPv4 or IPv6 */ + skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); + + switch (((struct iphdr *)guehdr)->version) { + case 4: + ret = gue_err_proto_handler(IPPROTO_IPIP, skb, info); + goto out; +#if IS_ENABLED(CONFIG_IPV6) + case 6: + ret = gue_err_proto_handler(IPPROTO_IPV6, skb, info); + goto out; +#endif + default: + ret = -EOPNOTSUPP; + goto out; + } + } + default: /* Undefined version */ + return -EOPNOTSUPP; + } + + if (guehdr->control) + return -ENOENT; + + optlen = guehdr->hlen << 2; + + if (validate_gue_flags(guehdr, optlen)) + return -EINVAL; + + skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); + ret = gue_err_proto_handler(guehdr->proto_ctype, skb, info); + +out: + skb_set_transport_header(skb, transport_offset); + return ret; +} + static const struct ip_tunnel_encap_ops fou_iptun_ops = { .encap_hlen = fou_encap_hlen, .build_header = fou_build_header, + .err_handler = gue_err, }; static const struct ip_tunnel_encap_ops gue_iptun_ops = { .encap_hlen = gue_encap_hlen, .build_header = gue_build_header, + .err_handler = gue_err, }; static int ip_tunnel_encap_add_fou_ops(void) diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 7efe740c06ebf..a4bf22ee3aedb 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -151,20 +151,25 @@ static int gre_rcv(struct sk_buff *skb) return NET_RX_DROP; } -static void gre_err(struct sk_buff *skb, u32 info) +static int gre_err(struct sk_buff *skb, u32 info) { const struct gre_protocol *proto; const struct iphdr *iph = (const struct iphdr *)skb->data; u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f; + int err = 0; if (ver >= GREPROTO_MAX) - return; + return -EINVAL; rcu_read_lock(); proto = rcu_dereference(gre_proto[ver]); if (proto && proto->err_handler) proto->err_handler(skb, info); + else + err = -EPROTONOSUPPORT; rcu_read_unlock(); + + return err; } static const struct net_protocol net_gre_protocol = { diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index d832beed6e3a3..065997f414e61 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1079,7 +1079,7 @@ int icmp_rcv(struct sk_buff *skb) goto drop; } -void icmp_err(struct sk_buff *skb, u32 info) +int icmp_err(struct sk_buff *skb, u32 info) { struct iphdr *iph = (struct iphdr *)skb->data; int offset = iph->ihl<<2; @@ -1094,13 +1094,15 @@ void icmp_err(struct sk_buff *skb, u32 info) */ if (icmph->type != ICMP_ECHOREPLY) { ping_err(skb, offset, info); - return; + return 0; } if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP); else if (type == ICMP_REDIRECT) ipv4_redirect(skb, net, 0, IPPROTO_ICMP); + + return 0; } /* diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 2c67af644e64b..76a9a5f7a40e6 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -121,8 +121,8 @@ static unsigned int ipgre_net_id __read_mostly; static unsigned int gre_tap_net_id __read_mostly; static unsigned int erspan_net_id __read_mostly; -static void ipgre_err(struct sk_buff *skb, u32 info, - const struct tnl_ptk_info *tpi) +static int ipgre_err(struct sk_buff *skb, u32 info, + const struct tnl_ptk_info *tpi) { /* All the routers (except for Linux) return only @@ -146,17 +146,32 @@ static void ipgre_err(struct sk_buff *skb, u32 info, unsigned int data_len = 0; struct ip_tunnel *t; + if (tpi->proto == htons(ETH_P_TEB)) + itn = net_generic(net, gre_tap_net_id); + else if (tpi->proto == htons(ETH_P_ERSPAN) || + tpi->proto == htons(ETH_P_ERSPAN2)) + itn = net_generic(net, erspan_net_id); + else + itn = net_generic(net, ipgre_net_id); + + iph = (const struct iphdr *)(icmp_hdr(skb) + 1); + t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, + iph->daddr, iph->saddr, tpi->key); + + if (!t) + return -ENOENT; + switch (type) { default: case ICMP_PARAMETERPROB: - return; + return 0; case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: case ICMP_PORT_UNREACH: /* Impossible event. */ - return; + return 0; default: /* All others are translated to HOST_UNREACH. rfc2003 contains "deep thoughts" about NET_UNREACH, @@ -168,7 +183,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info, case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) - return; + return 0; data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ break; @@ -176,40 +191,27 @@ static void ipgre_err(struct sk_buff *skb, u32 info, break; } - if (tpi->proto == htons(ETH_P_TEB)) - itn = net_generic(net, gre_tap_net_id); - else if (tpi->proto == htons(ETH_P_ERSPAN) || - tpi->proto == htons(ETH_P_ERSPAN2)) - itn = net_generic(net, erspan_net_id); - else - itn = net_generic(net, ipgre_net_id); - - iph = (const struct iphdr *)(icmp_hdr(skb) + 1); - t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, - iph->daddr, iph->saddr, tpi->key); - - if (!t) - return; - #if IS_ENABLED(CONFIG_IPV6) if (tpi->proto == htons(ETH_P_IPV6) && !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, type, data_len)) - return; + return 0; #endif if (t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) - return; + return 0; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) - return; + return 0; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; + + return 0; } static void gre_err(struct sk_buff *skb, u32 info) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index e65287c27e3d8..57c5dd283a2c1 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -140,6 +140,13 @@ static int ipip_err(struct sk_buff *skb, u32 info) struct ip_tunnel *t; int err = 0; + t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, + iph->daddr, iph->saddr, 0); + if (!t) { + err = -ENOENT; + goto out; + } + switch (type) { case ICMP_DEST_UNREACH: switch (code) { @@ -167,13 +174,6 @@ static int ipip_err(struct sk_buff *skb, u32 info) goto out; } - t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, - iph->daddr, iph->saddr, 0); - if (!t) { - err = -ENOENT; - goto out; - } - if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol); goto out; diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 32a691b7ce2c7..92d249e053be6 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -29,6 +29,7 @@ #include struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; +EXPORT_SYMBOL(inet_protos); const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; EXPORT_SYMBOL(inet_offloads); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index de47038afdf02..a336787d75e5a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -423,7 +423,7 @@ EXPORT_SYMBOL(tcp_req_err); * */ -void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) +int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); @@ -446,20 +446,21 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) inet_iif(icmp_skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); - return; + return -ENOENT; } if (sk->sk_state == TCP_TIME_WAIT) { inet_twsk_put(inet_twsk(sk)); - return; + return 0; } seq = ntohl(th->seq); - if (sk->sk_state == TCP_NEW_SYN_RECV) - return tcp_req_err(sk, seq, - type == ICMP_PARAMETERPROB || - type == ICMP_TIME_EXCEEDED || - (type == ICMP_DEST_UNREACH && - (code == ICMP_NET_UNREACH || - code == ICMP_HOST_UNREACH))); + if (sk->sk_state == TCP_NEW_SYN_RECV) { + tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB || + type == ICMP_TIME_EXCEEDED || + (type == ICMP_DEST_UNREACH && + (code == ICMP_NET_UNREACH || + code == ICMP_HOST_UNREACH))); + return 0; + } bh_lock_sock(sk); /* If too many ICMPs get dropped on busy @@ -613,6 +614,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) out: bh_unlock_sock(sk); sock_put(sk); + return 0; } void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index c0630013c1aed..33bf8e9c86630 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -149,34 +149,40 @@ static int tunnelmpls4_rcv(struct sk_buff *skb) } #endif -static void tunnel4_err(struct sk_buff *skb, u32 info) +static int tunnel4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } #if IS_ENABLED(CONFIG_IPV6) -static void tunnel64_err(struct sk_buff *skb, u32 info) +static int tunnel64_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } #endif #if IS_ENABLED(CONFIG_MPLS) -static void tunnelmpls4_err(struct sk_buff *skb, u32 info) +static int tunnelmpls4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnelmpls4_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } #endif diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3488650b90ac3..6f8890c5bc7eb 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -105,6 +105,7 @@ #include #include #include +#include #include #include #include @@ -583,6 +584,89 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, return true; } +DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); +void udp_encap_enable(void) +{ + static_branch_enable(&udp_encap_needed_key); +} +EXPORT_SYMBOL(udp_encap_enable); + +/* Handler for tunnels with arbitrary destination ports: no socket lookup, go + * through error handlers in encapsulations looking for a match. + */ +static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info) +{ + int i; + + for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) { + int (*handler)(struct sk_buff *skb, u32 info); + + if (!iptun_encaps[i]) + continue; + handler = rcu_dereference(iptun_encaps[i]->err_handler); + if (handler && !handler(skb, info)) + return 0; + } + + return -ENOENT; +} + +/* Try to match ICMP errors to UDP tunnels by looking up a socket without + * reversing source and destination port: this will match tunnels that force the + * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that + * lwtunnels might actually break this assumption by being configured with + * different destination ports on endpoints, in this case we won't be able to + * trace ICMP messages back to them. + * + * If this doesn't match any socket, probe tunnels with arbitrary destination + * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port + * we've sent packets to won't necessarily match the local destination port. + * + * Then ask the tunnel implementation to match the error against a valid + * association. + * + * Return an error if we can't find a match, the socket if we need further + * processing, zero otherwise. + */ +static struct sock *__udp4_lib_err_encap(struct net *net, + const struct iphdr *iph, + struct udphdr *uh, + struct udp_table *udptable, + struct sk_buff *skb, u32 info) +{ + int network_offset, transport_offset; + struct sock *sk; + + network_offset = skb_network_offset(skb); + transport_offset = skb_transport_offset(skb); + + /* Network header needs to point to the outer IPv4 header inside ICMP */ + skb_reset_network_header(skb); + + /* Transport header needs to point to the UDP header */ + skb_set_transport_header(skb, iph->ihl << 2); + + sk = __udp4_lib_lookup(net, iph->daddr, uh->source, + iph->saddr, uh->dest, skb->dev->ifindex, 0, + udptable, NULL); + if (sk) { + int (*lookup)(struct sock *sk, struct sk_buff *skb); + struct udp_sock *up = udp_sk(sk); + + lookup = READ_ONCE(up->encap_err_lookup); + if (!lookup || lookup(sk, skb)) + sk = NULL; + } + + if (!sk) + sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info)); + + skb_set_transport_header(skb, transport_offset); + skb_set_network_header(skb, network_offset); + + return sk; +} + /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -594,13 +678,14 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, * to find the appropriate port. */ -void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) +int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) { struct inet_sock *inet; const struct iphdr *iph = (const struct iphdr *)skb->data; struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2)); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; + bool tunnel = false; struct sock *sk; int harderr; int err; @@ -610,8 +695,21 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) iph->saddr, uh->source, skb->dev->ifindex, inet_sdif(skb), udptable, NULL); if (!sk) { - __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); - return; /* No socket for error */ + /* No socket for error: try tunnels before discarding */ + sk = ERR_PTR(-ENOENT); + if (static_branch_unlikely(&udp_encap_needed_key)) { + sk = __udp4_lib_err_encap(net, iph, uh, udptable, skb, + info); + if (!sk) + return 0; + } + + if (IS_ERR(sk)) { + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); + return PTR_ERR(sk); + } + + tunnel = true; } err = 0; @@ -654,6 +752,10 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) * RFC1122: OK. Passes ICMP errors back to application, as per * 4.1.3.3. */ + if (tunnel) { + /* ...not for tunnels though: we don't have a sending socket */ + goto out; + } if (!inet->recverr) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; @@ -663,12 +765,12 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) sk->sk_err = err; sk->sk_error_report(sk); out: - return; + return 0; } -void udp_err(struct sk_buff *skb, u32 info) +int udp_err(struct sk_buff *skb, u32 info) { - __udp4_lib_err(skb, info, &udp_table); + return __udp4_lib_err(skb, info, &udp_table); } /* @@ -1891,13 +1993,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return 0; } -DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); -void udp_encap_enable(void) -{ - static_branch_enable(&udp_encap_needed_key); -} -EXPORT_SYMBOL(udp_encap_enable); - /* returns: * -1: error * 0: success diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index e7d18b140287f..3226726554196 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -7,7 +7,7 @@ #include int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int); -void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); +int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *); int udp_v4_get_port(struct sock *sk, unsigned short snum); diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 6539ff15e9a34..d0c412fc56adc 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -68,6 +68,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_sk(sk)->encap_type = cfg->encap_type; udp_sk(sk)->encap_rcv = cfg->encap_rcv; + udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup; udp_sk(sk)->encap_destroy = cfg->encap_destroy; udp_sk(sk)->gro_receive = cfg->gro_receive; udp_sk(sk)->gro_complete = cfg->gro_complete; diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 8545457752fb9..39c7f17d916fe 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -25,9 +25,9 @@ static int udplite_rcv(struct sk_buff *skb) return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); } -static void udplite_err(struct sk_buff *skb, u32 info) +static int udplite_err(struct sk_buff *skb, u32 info) { - __udp4_lib_err(skb, info, &udplite_table); + return __udp4_lib_err(skb, info, &udplite_table); } static const struct net_protocol udplite_protocol = { diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index 8dd0e6ab86065..35c54865dc423 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -106,13 +106,15 @@ static int xfrm4_esp_rcv(struct sk_buff *skb) return 0; } -static void xfrm4_esp_err(struct sk_buff *skb, u32 info) +static int xfrm4_esp_err(struct sk_buff *skb, u32 info) { struct xfrm4_protocol *handler; for_each_protocol_rcu(esp4_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } static int xfrm4_ah_rcv(struct sk_buff *skb) @@ -132,13 +134,15 @@ static int xfrm4_ah_rcv(struct sk_buff *skb) return 0; } -static void xfrm4_ah_err(struct sk_buff *skb, u32 info) +static int xfrm4_ah_err(struct sk_buff *skb, u32 info) { struct xfrm4_protocol *handler; for_each_protocol_rcu(ah4_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } static int xfrm4_ipcomp_rcv(struct sk_buff *skb) @@ -158,13 +162,15 @@ static int xfrm4_ipcomp_rcv(struct sk_buff *skb) return 0; } -static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info) +static int xfrm4_ipcomp_err(struct sk_buff *skb, u32 info) { struct xfrm4_protocol *handler; for_each_protocol_rcu(ipcomp4_handlers, handler) if (!handler->err_handler(skb, info)) - break; + return 0; + + return -ENOENT; } static const struct net_protocol esp4_protocol = { diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c index 6de3c04b0f30f..bd675c61deb1f 100644 --- a/net/ipv6/fou6.c +++ b/net/ipv6/fou6.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -69,14 +70,87 @@ static int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, return 0; } +static int gue6_err_proto_handler(int proto, struct sk_buff *skb, + struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, u32 info) +{ + const struct inet6_protocol *ipprot; + + ipprot = rcu_dereference(inet6_protos[proto]); + if (ipprot && ipprot->err_handler) { + if (!ipprot->err_handler(skb, opt, type, code, offset, info)) + return 0; + } + + return -ENOENT; +} + +static int gue6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + int transport_offset = skb_transport_offset(skb); + struct guehdr *guehdr; + size_t optlen; + int ret; + + if (skb->len < sizeof(struct udphdr) + sizeof(struct guehdr)) + return -EINVAL; + + guehdr = (struct guehdr *)&udp_hdr(skb)[1]; + + switch (guehdr->version) { + case 0: /* Full GUE header present */ + break; + case 1: { + /* Direct encasulation of IPv4 or IPv6 */ + skb_set_transport_header(skb, -(int)sizeof(struct icmp6hdr)); + + switch (((struct iphdr *)guehdr)->version) { + case 4: + ret = gue6_err_proto_handler(IPPROTO_IPIP, skb, opt, + type, code, offset, info); + goto out; + case 6: + ret = gue6_err_proto_handler(IPPROTO_IPV6, skb, opt, + type, code, offset, info); + goto out; + default: + ret = -EOPNOTSUPP; + goto out; + } + } + default: /* Undefined version */ + return -EOPNOTSUPP; + } + + if (guehdr->control) + return -ENOENT; + + optlen = guehdr->hlen << 2; + + if (validate_gue_flags(guehdr, optlen)) + return -EINVAL; + + skb_set_transport_header(skb, -(int)sizeof(struct icmp6hdr)); + ret = gue6_err_proto_handler(guehdr->proto_ctype, skb, + opt, type, code, offset, info); + +out: + skb_set_transport_header(skb, transport_offset); + return ret; +} + + static const struct ip6_tnl_encap_ops fou_ip6tun_ops = { .encap_hlen = fou_encap_hlen, .build_header = fou6_build_header, + .err_handler = gue6_err, }; static const struct ip6_tnl_encap_ops gue_ip6tun_ops = { .encap_hlen = gue_encap_hlen, .build_header = gue6_build_header, + .err_handler = gue6_err, }; static int ip6_tnl_encap_add_fou_ops(void) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index c9c53ade55c3c..5d7aa2c2770ca 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -84,7 +84,7 @@ static inline struct sock *icmpv6_sk(struct net *net) return net->ipv6.icmp_sk[smp_processor_id()]; } -static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { /* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */ @@ -100,6 +100,8 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!(type & ICMPV6_INFOMSG_MASK)) if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST) ping_err(skb, offset, ntohl(info)); + + return 0; } static int icmpv6_rcv(struct sk_buff *skb); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 515adbdba1d27..81b69bcee7146 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -423,7 +423,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev) } -static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct net *net = dev_net(skb->dev); @@ -433,13 +433,13 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IPV6), offset) < 0) - return; + return -EINVAL; ipv6h = (const struct ipv6hdr *)skb->data; t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr, tpi.key, tpi.proto); if (!t) - return; + return -ENOENT; switch (type) { struct ipv6_tlv_tnl_enc_lim *tel; @@ -449,14 +449,14 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, t->parms.name); if (code != ICMPV6_PORT_UNREACH) break; - return; + return 0; case ICMPV6_TIME_EXCEED: if (code == ICMPV6_EXC_HOPLIMIT) { net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", t->parms.name); break; } - return; + return 0; case ICMPV6_PARAMPROB: teli = 0; if (code == ICMPV6_HDR_FIELD) @@ -472,14 +472,14 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", t->parms.name); } - return; + return 0; case ICMPV6_PKT_TOOBIG: ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); - return; + return 0; case NDISC_REDIRECT: ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); - return; + return 0; } if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO)) @@ -487,6 +487,8 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, else t->err_count = 1; t->err_time = jiffies; + + return 0; } static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 03e6b7a2bc530..a3f5591625215 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -349,7 +349,7 @@ static void tcp_v6_mtu_reduced(struct sock *sk) } } -static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; @@ -371,17 +371,19 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); - return; + return -ENOENT; } if (sk->sk_state == TCP_TIME_WAIT) { inet_twsk_put(inet_twsk(sk)); - return; + return 0; } seq = ntohl(th->seq); fatal = icmpv6_err_convert(type, code, &err); - if (sk->sk_state == TCP_NEW_SYN_RECV) - return tcp_req_err(sk, seq, fatal); + if (sk->sk_state == TCP_NEW_SYN_RECV) { + tcp_req_err(sk, seq, fatal); + return 0; + } bh_lock_sock(sk); if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) @@ -467,6 +469,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, out: bh_unlock_sock(sk); sock_put(sk); + return 0; } diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index dae25cad05cd7..1991dede73677 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -134,24 +134,28 @@ static int tunnel46_rcv(struct sk_buff *skb) return 0; } -static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_tunnel *handler; for_each_tunnel_rcu(tunnel6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) - break; + return 0; + + return -ENOENT; } -static void tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_tunnel *handler; for_each_tunnel_rcu(tunnel46_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) - break; + return 0; + + return -ENOENT; } static const struct inet6_protocol tunnel6_protocol = { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c55698d19d684..0c0cb1611aef2 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -462,15 +463,106 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, goto try_again; } -void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info, - struct udp_table *udptable) +DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); +void udpv6_encap_enable(void) +{ + static_branch_enable(&udpv6_encap_needed_key); +} +EXPORT_SYMBOL(udpv6_encap_enable); + +/* Handler for tunnels with arbitrary destination ports: no socket lookup, go + * through error handlers in encapsulations looking for a match. + */ +static int __udp6_lib_err_encap_no_sk(struct sk_buff *skb, + struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, u32 info) +{ + int i; + + for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) { + int (*handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, u32 info); + + if (!ip6tun_encaps[i]) + continue; + handler = rcu_dereference(ip6tun_encaps[i]->err_handler); + if (handler && !handler(skb, opt, type, code, offset, info)) + return 0; + } + + return -ENOENT; +} + +/* Try to match ICMP errors to UDP tunnels by looking up a socket without + * reversing source and destination port: this will match tunnels that force the + * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that + * lwtunnels might actually break this assumption by being configured with + * different destination ports on endpoints, in this case we won't be able to + * trace ICMP messages back to them. + * + * If this doesn't match any socket, probe tunnels with arbitrary destination + * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port + * we've sent packets to won't necessarily match the local destination port. + * + * Then ask the tunnel implementation to match the error against a valid + * association. + * + * Return an error if we can't find a match, the socket if we need further + * processing, zero otherwise. + */ +static struct sock *__udp6_lib_err_encap(struct net *net, + const struct ipv6hdr *hdr, int offset, + struct udphdr *uh, + struct udp_table *udptable, + struct sk_buff *skb, + struct inet6_skb_parm *opt, + u8 type, u8 code, __be32 info) +{ + int network_offset, transport_offset; + struct sock *sk; + + network_offset = skb_network_offset(skb); + transport_offset = skb_transport_offset(skb); + + /* Network header needs to point to the outer IPv6 header inside ICMP */ + skb_reset_network_header(skb); + + /* Transport header needs to point to the UDP header */ + skb_set_transport_header(skb, offset); + + sk = __udp6_lib_lookup(net, &hdr->daddr, uh->source, + &hdr->saddr, uh->dest, + inet6_iif(skb), 0, udptable, skb); + if (sk) { + int (*lookup)(struct sock *sk, struct sk_buff *skb); + struct udp_sock *up = udp_sk(sk); + + lookup = READ_ONCE(up->encap_err_lookup); + if (!lookup || lookup(sk, skb)) + sk = NULL; + } + + if (!sk) { + sk = ERR_PTR(__udp6_lib_err_encap_no_sk(skb, opt, type, code, + offset, info)); + } + + skb_set_transport_header(skb, transport_offset); + skb_set_network_header(skb, network_offset); + + return sk; +} + +int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info, + struct udp_table *udptable) { struct ipv6_pinfo *np; const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct in6_addr *saddr = &hdr->saddr; const struct in6_addr *daddr = &hdr->daddr; struct udphdr *uh = (struct udphdr *)(skb->data+offset); + bool tunnel = false; struct sock *sk; int harderr; int err; @@ -479,9 +571,23 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, inet6_iif(skb), inet6_sdif(skb), udptable, skb); if (!sk) { - __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), - ICMP6_MIB_INERRORS); - return; + /* No socket for error: try tunnels before discarding */ + sk = ERR_PTR(-ENOENT); + if (static_branch_unlikely(&udpv6_encap_needed_key)) { + sk = __udp6_lib_err_encap(net, hdr, offset, uh, + udptable, skb, + opt, type, code, info); + if (!sk) + return 0; + } + + if (IS_ERR(sk)) { + __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), + ICMP6_MIB_INERRORS); + return PTR_ERR(sk); + } + + tunnel = true; } harderr = icmpv6_err_convert(type, code, &err); @@ -495,10 +601,19 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, harderr = 1; } if (type == NDISC_REDIRECT) { - ip6_sk_redirect(skb, sk); + if (tunnel) { + ip6_redirect(skb, sock_net(sk), inet6_iif(skb), + sk->sk_mark, sk->sk_uid); + } else { + ip6_sk_redirect(skb, sk); + } goto out; } + /* Tunnels don't have an application socket: don't pass errors back */ + if (tunnel) + goto out; + if (!np->recverr) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; @@ -509,7 +624,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk->sk_err = err; sk->sk_error_report(sk); out: - return; + return 0; } static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) @@ -540,20 +655,13 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return 0; } -static __inline__ void udpv6_err(struct sk_buff *skb, - struct inet6_skb_parm *opt, u8 type, - u8 code, int offset, __be32 info) +static __inline__ int udpv6_err(struct sk_buff *skb, + struct inet6_skb_parm *opt, u8 type, + u8 code, int offset, __be32 info) { - __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table); + return __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table); } -DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); -void udpv6_encap_enable(void) -{ - static_branch_enable(&udpv6_encap_needed_key); -} -EXPORT_SYMBOL(udpv6_encap_enable); - static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index 7903e21c178b9..5730e6503cb49 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -9,8 +9,8 @@ #include int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int); -void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int, - __be32, struct udp_table *); +int __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int, + __be32, struct udp_table *); int udp_v6_get_port(struct sock *sk, unsigned short snum); diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index 5000ad6878e6f..a125aebc29e5e 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -20,11 +20,12 @@ static int udplitev6_rcv(struct sk_buff *skb) return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); } -static void udplitev6_err(struct sk_buff *skb, +static int udplitev6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { - __udp6_lib_err(skb, opt, type, code, offset, info, &udplite_table); + return __udp6_lib_err(skb, opt, type, code, offset, info, + &udplite_table); } static const struct inet6_protocol udplitev6_protocol = { diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c index b2dc8ce493784..cc979b702c898 100644 --- a/net/ipv6/xfrm6_protocol.c +++ b/net/ipv6/xfrm6_protocol.c @@ -80,14 +80,16 @@ static int xfrm6_esp_rcv(struct sk_buff *skb) return 0; } -static void xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_protocol *handler; for_each_protocol_rcu(esp6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) - break; + return 0; + + return -ENOENT; } static int xfrm6_ah_rcv(struct sk_buff *skb) @@ -107,14 +109,16 @@ static int xfrm6_ah_rcv(struct sk_buff *skb) return 0; } -static void xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_protocol *handler; for_each_protocol_rcu(ah6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) - break; + return 0; + + return -ENOENT; } static int xfrm6_ipcomp_rcv(struct sk_buff *skb) @@ -134,14 +138,16 @@ static int xfrm6_ipcomp_rcv(struct sk_buff *skb) return 0; } -static void xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_protocol *handler; for_each_protocol_rcu(ipcomp6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) - break; + return 0; + + return -ENOENT; } static const struct inet6_protocol esp6_protocol = { diff --git a/net/sctp/input.c b/net/sctp/input.c index 5c36a99882ed1..7ab08a5b36dcb 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -574,7 +574,7 @@ void sctp_err_finish(struct sock *sk, struct sctp_transport *t) * is probably better. * */ -void sctp_v4_err(struct sk_buff *skb, __u32 info) +int sctp_v4_err(struct sk_buff *skb, __u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; const int ihlen = iph->ihl * 4; @@ -599,7 +599,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) skb->transport_header = savesctp; if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); - return; + return -ENOENT; } /* Warning: The sock lock is held. Remember to call * sctp_err_finish! @@ -653,6 +653,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) out_unlock: sctp_err_finish(sk, transport); + return 0; } /* diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index fc6c5e4bffa54..6e27c62646e99 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -138,7 +138,7 @@ static struct notifier_block sctp_inet6addr_notifier = { }; /* ICMP error handler. */ -static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, +static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct inet6_dev *idev; @@ -147,7 +147,7 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct sctp_transport *transport; struct ipv6_pinfo *np; __u16 saveip, savesctp; - int err; + int err, ret = 0; struct net *net = dev_net(skb->dev); idev = in6_dev_get(skb->dev); @@ -163,6 +163,7 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, skb->transport_header = savesctp; if (!sk) { __ICMP6_INC_STATS(net, idev, ICMP6_MIB_INERRORS); + ret = -ENOENT; goto out; } @@ -202,6 +203,8 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, out: if (likely(idev != NULL)) in6_dev_put(idev); + + return ret; } static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index a369d616b390d..e2c94e47707cb 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -26,6 +26,47 @@ # - pmtu_ipv6 # Same as pmtu_ipv4, except for locked PMTU tests, using IPv6 # +# - pmtu_ipv4_vxlan4_exception +# Set up the same network topology as pmtu_ipv4, create a VXLAN tunnel +# over IPv4 between A and B, routed via R1. On the link between R1 and B, +# set a MTU lower than the VXLAN MTU and the MTU on the link between A and +# R1. Send IPv4 packets, exceeding the MTU between R1 and B, over VXLAN +# from A to B and check that the PMTU exception is created with the right +# value on A +# +# - pmtu_ipv6_vxlan4_exception +# Same as pmtu_ipv4_vxlan4_exception, but send IPv6 packets from A to B +# +# - pmtu_ipv4_vxlan6_exception +# Same as pmtu_ipv4_vxlan4_exception, but use IPv6 transport from A to B +# +# - pmtu_ipv6_vxlan6_exception +# Same as pmtu_ipv4_vxlan6_exception, but send IPv6 packets from A to B +# +# - pmtu_ipv4_geneve4_exception +# Same as pmtu_ipv4_vxlan4_exception, but using a GENEVE tunnel instead of +# VXLAN +# +# - pmtu_ipv6_geneve4_exception +# Same as pmtu_ipv6_vxlan4_exception, but using a GENEVE tunnel instead of +# VXLAN +# +# - pmtu_ipv4_geneve6_exception +# Same as pmtu_ipv4_vxlan6_exception, but using a GENEVE tunnel instead of +# VXLAN +# +# - pmtu_ipv6_geneve6_exception +# Same as pmtu_ipv6_vxlan6_exception, but using a GENEVE tunnel instead of +# VXLAN +# +# - pmtu_ipv{4,6}_fou{4,6}_exception +# Same as pmtu_ipv4_vxlan4, but using a direct IPv4/IPv6 encapsulation +# (FoU) over IPv4/IPv6, instead of VXLAN +# +# - pmtu_ipv{4,6}_fou{4,6}_exception +# Same as pmtu_ipv4_vxlan4, but using a generic UDP IPv4/IPv6 +# encapsulation (GUE) over IPv4/IPv6, instead of VXLAN +# # - pmtu_vti4_exception # Set up vti tunnel on top of veth, with xfrm states and policies, in two # namespaces with matching endpoints. Check that route exception is not @@ -72,6 +113,22 @@ which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping) tests=" pmtu_ipv4_exception ipv4: PMTU exceptions pmtu_ipv6_exception ipv6: PMTU exceptions + pmtu_ipv4_vxlan4_exception IPv4 over vxlan4: PMTU exceptions + pmtu_ipv6_vxlan4_exception IPv6 over vxlan4: PMTU exceptions + pmtu_ipv4_vxlan6_exception IPv4 over vxlan6: PMTU exceptions + pmtu_ipv6_vxlan6_exception IPv6 over vxlan6: PMTU exceptions + pmtu_ipv4_geneve4_exception IPv4 over geneve4: PMTU exceptions + pmtu_ipv6_geneve4_exception IPv6 over geneve4: PMTU exceptions + pmtu_ipv4_geneve6_exception IPv4 over geneve6: PMTU exceptions + pmtu_ipv6_geneve6_exception IPv6 over geneve6: PMTU exceptions + pmtu_ipv4_fou4_exception IPv4 over fou4: PMTU exceptions + pmtu_ipv6_fou4_exception IPv6 over fou4: PMTU exceptions + pmtu_ipv4_fou6_exception IPv4 over fou6: PMTU exceptions + pmtu_ipv6_fou6_exception IPv6 over fou6: PMTU exceptions + pmtu_ipv4_gue4_exception IPv4 over gue4: PMTU exceptions + pmtu_ipv6_gue4_exception IPv6 over gue4: PMTU exceptions + pmtu_ipv4_gue6_exception IPv4 over gue6: PMTU exceptions + pmtu_ipv6_gue6_exception IPv6 over gue6: PMTU exceptions pmtu_vti6_exception vti6: PMTU exceptions pmtu_vti4_exception vti4: PMTU exceptions pmtu_vti4_default_mtu vti4: default MTU assignment @@ -95,8 +152,8 @@ ns_r2="ip netns exec ${NS_R2}" # Addresses are: # - IPv4: PREFIX4.SEGMENT.ID (/24) # - IPv6: PREFIX6:SEGMENT::ID (/64) -prefix4="192.168" -prefix6="fd00" +prefix4="10.0" +prefix6="fc00" a_r1=1 a_r2=2 b_r1=3 @@ -129,12 +186,12 @@ veth6_a_addr="fd00:1::a" veth6_b_addr="fd00:1::b" veth6_mask="64" -vti4_a_addr="192.168.2.1" -vti4_b_addr="192.168.2.2" -vti4_mask="24" -vti6_a_addr="fd00:2::a" -vti6_b_addr="fd00:2::b" -vti6_mask="64" +tunnel4_a_addr="192.168.2.1" +tunnel4_b_addr="192.168.2.2" +tunnel4_mask="24" +tunnel6_a_addr="fd00:2::a" +tunnel6_b_addr="fd00:2::b" +tunnel6_mask="64" dummy6_0_addr="fc00:1000::0" dummy6_1_addr="fc00:1001::0" @@ -159,6 +216,89 @@ nsname() { eval echo \$NS_$1 } +setup_fou_or_gue() { + outer="${1}" + inner="${2}" + encap="${3}" + + if [ "${outer}" = "4" ]; then + modprobe fou || return 2 + a_addr="${prefix4}.${a_r1}.1" + b_addr="${prefix4}.${b_r1}.1" + if [ "${inner}" = "4" ]; then + type="ipip" + ipproto="4" + else + type="sit" + ipproto="41" + fi + else + modprobe fou6 || return 2 + a_addr="${prefix6}:${a_r1}::1" + b_addr="${prefix6}:${b_r1}::1" + if [ "${inner}" = "4" ]; then + type="ip6tnl" + mode="mode ipip6" + ipproto="4 -6" + else + type="ip6tnl" + mode="mode ip6ip6" + ipproto="41 -6" + fi + fi + + ${ns_a} ip fou add port 5555 ipproto ${ipproto} || return 2 + ${ns_a} ip link add ${encap}_a type ${type} ${mode} local ${a_addr} remote ${b_addr} encap ${encap} encap-sport auto encap-dport 5556 || return 2 + + ${ns_b} ip fou add port 5556 ipproto ${ipproto} + ${ns_b} ip link add ${encap}_b type ${type} ${mode} local ${b_addr} remote ${a_addr} encap ${encap} encap-sport auto encap-dport 5555 + + if [ "${inner}" = "4" ]; then + ${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask} dev ${encap}_a + ${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ${encap}_b + else + ${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask} dev ${encap}_a + ${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask} dev ${encap}_b + fi + + ${ns_a} ip link set ${encap}_a up + ${ns_b} ip link set ${encap}_b up + + sleep 1 +} + +setup_fou44() { + setup_fou_or_gue 4 4 fou +} + +setup_fou46() { + setup_fou_or_gue 4 6 fou +} + +setup_fou64() { + setup_fou_or_gue 6 4 fou +} + +setup_fou66() { + setup_fou_or_gue 6 6 fou +} + +setup_gue44() { + setup_fou_or_gue 4 4 gue +} + +setup_gue46() { + setup_fou_or_gue 4 6 gue +} + +setup_gue64() { + setup_fou_or_gue 6 4 gue +} + +setup_gue66() { + setup_fou_or_gue 6 6 gue +} + setup_namespaces() { for n in ${NS_A} ${NS_B} ${NS_R1} ${NS_R2}; do ip netns add ${n} || return 1 @@ -202,11 +342,57 @@ setup_vti() { } setup_vti4() { - setup_vti 4 ${veth4_a_addr} ${veth4_b_addr} ${vti4_a_addr} ${vti4_b_addr} ${vti4_mask} + setup_vti 4 ${veth4_a_addr} ${veth4_b_addr} ${tunnel4_a_addr} ${tunnel4_b_addr} ${tunnel4_mask} } setup_vti6() { - setup_vti 6 ${veth6_a_addr} ${veth6_b_addr} ${vti6_a_addr} ${vti6_b_addr} ${vti6_mask} + setup_vti 6 ${veth6_a_addr} ${veth6_b_addr} ${tunnel6_a_addr} ${tunnel6_b_addr} ${tunnel6_mask} +} + +setup_vxlan_or_geneve() { + type="${1}" + a_addr="${2}" + b_addr="${3}" + opts="${4}" + + if [ "${type}" = "vxlan" ]; then + opts="${opts} ttl 64 dstport 4789" + opts_a="local ${a_addr}" + opts_b="local ${b_addr}" + else + opts_a="" + opts_b="" + fi + + ${ns_a} ip link add ${type}_a type ${type} id 1 ${opts_a} remote ${b_addr} ${opts} || return 1 + ${ns_b} ip link add ${type}_b type ${type} id 1 ${opts_b} remote ${a_addr} ${opts} + + ${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask} dev ${type}_a + ${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ${type}_b + + ${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask} dev ${type}_a + ${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask} dev ${type}_b + + ${ns_a} ip link set ${type}_a up + ${ns_b} ip link set ${type}_b up + + sleep 1 +} + +setup_geneve4() { + setup_vxlan_or_geneve geneve ${prefix4}.${a_r1}.1 ${prefix4}.${b_r1}.1 "df set" +} + +setup_vxlan4() { + setup_vxlan_or_geneve vxlan ${prefix4}.${a_r1}.1 ${prefix4}.${b_r1}.1 "df set" +} + +setup_geneve6() { + setup_vxlan_or_geneve geneve ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1 +} + +setup_vxlan6() { + setup_vxlan_or_geneve vxlan ${prefix6}:${a_r1}::1 ${prefix6}:${b_r1}::1 } setup_xfrm() { @@ -465,6 +651,161 @@ test_pmtu_ipv6_exception() { test_pmtu_ipvX 6 } +test_pmtu_ipvX_over_vxlanY_or_geneveY_exception() { + type=${1} + family=${2} + outer_family=${3} + ll_mtu=4000 + + if [ ${outer_family} -eq 4 ]; then + setup namespaces routing ${type}4 || return 2 + # IPv4 header UDP header VXLAN/GENEVE header Ethernet header + exp_mtu=$((${ll_mtu} - 20 - 8 - 8 - 14)) + else + setup namespaces routing ${type}6 || return 2 + # IPv6 header UDP header VXLAN/GENEVE header Ethernet header + exp_mtu=$((${ll_mtu} - 40 - 8 - 8 - 14)) + fi + + trace "${ns_a}" ${type}_a "${ns_b}" ${type}_b \ + "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \ + "${ns_b}" veth_B-R1 "${ns_r1}" veth_R1-B + + if [ ${family} -eq 4 ]; then + ping=ping + dst=${tunnel4_b_addr} + else + ping=${ping6} + dst=${tunnel6_b_addr} + fi + + # Create route exception by exceeding link layer MTU + mtu "${ns_a}" veth_A-R1 $((${ll_mtu} + 1000)) + mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000)) + mtu "${ns_b}" veth_B-R1 ${ll_mtu} + mtu "${ns_r1}" veth_R1-B ${ll_mtu} + + mtu "${ns_a}" ${type}_a $((${ll_mtu} + 1000)) + mtu "${ns_b}" ${type}_b $((${ll_mtu} + 1000)) + ${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s $((${ll_mtu} + 500)) ${dst} > /dev/null + + # Check that exception was created + pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})" + check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on ${type} interface" +} + +test_pmtu_ipv4_vxlan4_exception() { + test_pmtu_ipvX_over_vxlanY_or_geneveY_exception vxlan 4 4 +} + +test_pmtu_ipv6_vxlan4_exception() { + test_pmtu_ipvX_over_vxlanY_or_geneveY_exception vxlan 6 4 +} + +test_pmtu_ipv4_geneve4_exception() { + test_pmtu_ipvX_over_vxlanY_or_geneveY_exception geneve 4 4 +} + +test_pmtu_ipv6_geneve4_exception() { + test_pmtu_ipvX_over_vxlanY_or_geneveY_exception geneve 6 4 +} + +test_pmtu_ipv4_vxlan6_exception() { + test_pmtu_ipvX_over_vxlanY_or_geneveY_exception vxlan 4 6 +} + +test_pmtu_ipv6_vxlan6_exception() { + test_pmtu_ipvX_over_vxlanY_or_geneveY_exception vxlan 6 6 +} + +test_pmtu_ipv4_geneve6_exception() { + test_pmtu_ipvX_over_vxlanY_or_geneveY_exception geneve 4 6 +} + +test_pmtu_ipv6_geneve6_exception() { + test_pmtu_ipvX_over_vxlanY_or_geneveY_exception geneve 6 6 +} + +test_pmtu_ipvX_over_fouY_or_gueY() { + inner_family=${1} + outer_family=${2} + encap=${3} + ll_mtu=4000 + + setup namespaces routing ${encap}${outer_family}${inner_family} || return 2 + trace "${ns_a}" ${encap}_a "${ns_b}" ${encap}_b \ + "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \ + "${ns_b}" veth_B-R1 "${ns_r1}" veth_R1-B + + if [ ${inner_family} -eq 4 ]; then + ping=ping + dst=${tunnel4_b_addr} + else + ping=${ping6} + dst=${tunnel6_b_addr} + fi + + if [ "${encap}" = "gue" ]; then + encap_overhead=4 + else + encap_overhead=0 + fi + + if [ ${outer_family} -eq 4 ]; then + # IPv4 header UDP header + exp_mtu=$((${ll_mtu} - 20 - 8 - ${encap_overhead})) + else + # IPv6 header Option 4 UDP header + exp_mtu=$((${ll_mtu} - 40 - 8 - 8 - ${encap_overhead})) + fi + + # Create route exception by exceeding link layer MTU + mtu "${ns_a}" veth_A-R1 $((${ll_mtu} + 1000)) + mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000)) + mtu "${ns_b}" veth_B-R1 ${ll_mtu} + mtu "${ns_r1}" veth_R1-B ${ll_mtu} + + mtu "${ns_a}" ${encap}_a $((${ll_mtu} + 1000)) + mtu "${ns_b}" ${encap}_b $((${ll_mtu} + 1000)) + ${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s $((${ll_mtu} + 500)) ${dst} > /dev/null + + # Check that exception was created + pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})" + check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on ${encap} interface" +} + +test_pmtu_ipv4_fou4_exception() { + test_pmtu_ipvX_over_fouY_or_gueY 4 4 fou +} + +test_pmtu_ipv6_fou4_exception() { + test_pmtu_ipvX_over_fouY_or_gueY 6 4 fou +} + +test_pmtu_ipv4_fou6_exception() { + test_pmtu_ipvX_over_fouY_or_gueY 4 6 fou +} + +test_pmtu_ipv6_fou6_exception() { + test_pmtu_ipvX_over_fouY_or_gueY 6 6 fou +} + +test_pmtu_ipv4_gue4_exception() { + test_pmtu_ipvX_over_fouY_or_gueY 4 4 gue +} + +test_pmtu_ipv6_gue4_exception() { + test_pmtu_ipvX_over_fouY_or_gueY 6 4 gue +} + +test_pmtu_ipv4_gue6_exception() { + test_pmtu_ipvX_over_fouY_or_gueY 4 6 gue +} + +test_pmtu_ipv6_gue6_exception() { + test_pmtu_ipvX_over_fouY_or_gueY 6 6 gue +} + test_pmtu_vti4_exception() { setup namespaces veth vti4 xfrm4 || return 2 trace "${ns_a}" veth_a "${ns_b}" veth_b \ @@ -484,14 +825,14 @@ test_pmtu_vti4_exception() { # Send DF packet without exceeding link layer MTU, check that no # exception is created - ${ns_a} ping -q -M want -i 0.1 -w 2 -s ${ping_payload} ${vti4_b_addr} > /dev/null - pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti4_b_addr})" + ${ns_a} ping -q -M want -i 0.1 -w 2 -s ${ping_payload} ${tunnel4_b_addr} > /dev/null + pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel4_b_addr})" check_pmtu_value "" "${pmtu}" "sending packet smaller than PMTU (IP payload length ${esp_payload_rfc4106})" || return 1 # Now exceed link layer MTU by one byte, check that exception is created # with the right PMTU value - ${ns_a} ping -q -M want -i 0.1 -w 2 -s $((ping_payload + 1)) ${vti4_b_addr} > /dev/null - pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti4_b_addr})" + ${ns_a} ping -q -M want -i 0.1 -w 2 -s $((ping_payload + 1)) ${tunnel4_b_addr} > /dev/null + pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel4_b_addr})" check_pmtu_value "${esp_payload_rfc4106}" "${pmtu}" "exceeding PMTU (IP payload length $((esp_payload_rfc4106 + 1)))" } @@ -506,20 +847,20 @@ test_pmtu_vti6_exception() { mtu "${ns_b}" veth_b 4000 mtu "${ns_a}" vti6_a 5000 mtu "${ns_b}" vti6_b 5000 - ${ns_a} ${ping6} -q -i 0.1 -w 2 -s 60000 ${vti6_b_addr} > /dev/null + ${ns_a} ${ping6} -q -i 0.1 -w 2 -s 60000 ${tunnel6_b_addr} > /dev/null # Check that exception was created - pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti6_b_addr})" + pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})" check_pmtu_value any "${pmtu}" "creating tunnel exceeding link layer MTU" || return 1 # Decrease tunnel MTU, check for PMTU decrease in route exception mtu "${ns_a}" vti6_a 3000 - pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti6_b_addr})" + pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})" check_pmtu_value "3000" "${pmtu}" "decreasing tunnel MTU" || fail=1 # Increase tunnel MTU, check for PMTU increase in route exception mtu "${ns_a}" vti6_a 9000 - pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${vti6_b_addr})" + pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})" check_pmtu_value "9000" "${pmtu}" "increasing tunnel MTU" || fail=1 return ${fail}