diff --git a/Documentation/networking/mpls-sysctl.txt b/Documentation/networking/mpls-sysctl.txt index 15d8d16934fd1..2f24a1912a48c 100644 --- a/Documentation/networking/mpls-sysctl.txt +++ b/Documentation/networking/mpls-sysctl.txt @@ -19,6 +19,25 @@ platform_labels - INTEGER Possible values: 0 - 1048575 Default: 0 +ip_ttl_propagate - BOOL + Control whether TTL is propagated from the IPv4/IPv6 header to + the MPLS header on imposing labels and propagated from the + MPLS header to the IPv4/IPv6 header on popping the last label. + + If disabled, the MPLS transport network will appear as a + single hop to transit traffic. + + 0 - disabled / RFC 3443 [Short] Pipe Model + 1 - enabled / RFC 3443 Uniform Model (default) + +default_ttl - BOOL + Default TTL value to use for MPLS packets where it cannot be + propagated from an IP header, either because one isn't present + or ip_ttl_propagate has been disabled. + + Possible values: 1 - 255 + Default: 255 + conf//input - BOOL Control whether packets can be input on this interface. diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h index 179253f9dcfd9..a18af6a16eb52 100644 --- a/include/net/mpls_iptunnel.h +++ b/include/net/mpls_iptunnel.h @@ -19,6 +19,8 @@ struct mpls_iptunnel_encap { u32 label[MAX_NEW_LABELS]; u8 labels; + u8 ttl_propagate; + u8 default_ttl; }; static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct lwtunnel_state *lwtstate) diff --git a/include/net/netns/mpls.h b/include/net/netns/mpls.h index d29203651c017..6608b3693385e 100644 --- a/include/net/netns/mpls.h +++ b/include/net/netns/mpls.h @@ -9,8 +9,11 @@ struct mpls_route; struct ctl_table_header; struct netns_mpls { + int ip_ttl_propagate; + int default_ttl; size_t platform_labels; struct mpls_route __rcu * __rcu *platform_label; + struct ctl_table_header *ctl; }; diff --git a/include/uapi/linux/mpls_iptunnel.h b/include/uapi/linux/mpls_iptunnel.h index d80a0498f77ed..f5e45095b0bb5 100644 --- a/include/uapi/linux/mpls_iptunnel.h +++ b/include/uapi/linux/mpls_iptunnel.h @@ -16,11 +16,13 @@ /* MPLS tunnel attributes * [RTA_ENCAP] = { * [MPLS_IPTUNNEL_DST] + * [MPLS_IPTUNNEL_TTL] * } */ enum { MPLS_IPTUNNEL_UNSPEC, MPLS_IPTUNNEL_DST, + MPLS_IPTUNNEL_TTL, __MPLS_IPTUNNEL_MAX, }; #define MPLS_IPTUNNEL_MAX (__MPLS_IPTUNNEL_MAX - 1) diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 75fcf5eff093a..3dd72aee4d32a 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -319,6 +319,7 @@ enum rtattr_type_t { RTA_EXPIRES, RTA_PAD, RTA_UID, + RTA_TTL_PROPAGATE, __RTA_MAX }; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 3818686182b21..0c5d111abe363 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -32,7 +32,9 @@ #define MPLS_NEIGH_TABLE_UNSPEC (NEIGH_LINK_TABLE + 1) static int zero = 0; +static int one = 1; static int label_limit = (1 << 20) - 1; +static int ttl_max = 255; static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, struct nlmsghdr *nlh, struct net *net, u32 portid, @@ -220,8 +222,8 @@ static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt, return &rt->rt_nh[nh_index]; } -static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb, - struct mpls_entry_decoded dec) +static bool mpls_egress(struct net *net, struct mpls_route *rt, + struct sk_buff *skb, struct mpls_entry_decoded dec) { enum mpls_payload_type payload_type; bool success = false; @@ -246,22 +248,46 @@ static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb, switch (payload_type) { case MPT_IPV4: { struct iphdr *hdr4 = ip_hdr(skb); + u8 new_ttl; skb->protocol = htons(ETH_P_IP); + + /* If propagating TTL, take the decremented TTL from + * the incoming MPLS header, otherwise decrement the + * TTL, but only if not 0 to avoid underflow. + */ + if (rt->rt_ttl_propagate == MPLS_TTL_PROP_ENABLED || + (rt->rt_ttl_propagate == MPLS_TTL_PROP_DEFAULT && + net->mpls.ip_ttl_propagate)) + new_ttl = dec.ttl; + else + new_ttl = hdr4->ttl ? hdr4->ttl - 1 : 0; + csum_replace2(&hdr4->check, htons(hdr4->ttl << 8), - htons(dec.ttl << 8)); - hdr4->ttl = dec.ttl; + htons(new_ttl << 8)); + hdr4->ttl = new_ttl; success = true; break; } case MPT_IPV6: { struct ipv6hdr *hdr6 = ipv6_hdr(skb); skb->protocol = htons(ETH_P_IPV6); - hdr6->hop_limit = dec.ttl; + + /* If propagating TTL, take the decremented TTL from + * the incoming MPLS header, otherwise decrement the + * hop limit, but only if not 0 to avoid underflow. + */ + if (rt->rt_ttl_propagate == MPLS_TTL_PROP_ENABLED || + (rt->rt_ttl_propagate == MPLS_TTL_PROP_DEFAULT && + net->mpls.ip_ttl_propagate)) + hdr6->hop_limit = dec.ttl; + else if (hdr6->hop_limit) + hdr6->hop_limit = hdr6->hop_limit - 1; success = true; break; } case MPT_UNSPEC: + /* Should have decided which protocol it is by now */ break; } @@ -361,7 +387,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, if (unlikely(!new_header_size && dec.bos)) { /* Penultimate hop popping */ - if (!mpls_egress(rt, skb, dec)) + if (!mpls_egress(dev_net(out_dev), rt, skb, dec)) goto err; } else { bool bos; @@ -412,6 +438,7 @@ static struct packet_type mpls_packet_type __read_mostly = { static const struct nla_policy rtm_mpls_policy[RTA_MAX+1] = { [RTA_DST] = { .type = NLA_U32 }, [RTA_OIF] = { .type = NLA_U32 }, + [RTA_TTL_PROPAGATE] = { .type = NLA_U8 }, }; struct mpls_route_config { @@ -421,6 +448,7 @@ struct mpls_route_config { u8 rc_via_alen; u8 rc_via[MAX_VIA_ALEN]; u32 rc_label; + u8 rc_ttl_propagate; u8 rc_output_labels; u32 rc_output_label[MAX_NEW_LABELS]; u32 rc_nlflags; @@ -856,6 +884,7 @@ static int mpls_route_add(struct mpls_route_config *cfg) rt->rt_protocol = cfg->rc_protocol; rt->rt_payload_type = cfg->rc_payload_type; + rt->rt_ttl_propagate = cfg->rc_ttl_propagate; if (cfg->rc_mp) err = mpls_nh_build_multi(cfg, rt); @@ -1576,6 +1605,7 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->rc_label = LABEL_NOT_SPECIFIED; cfg->rc_protocol = rtm->rtm_protocol; cfg->rc_via_table = MPLS_NEIGH_TABLE_UNSPEC; + cfg->rc_ttl_propagate = MPLS_TTL_PROP_DEFAULT; cfg->rc_nlflags = nlh->nlmsg_flags; cfg->rc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->rc_nlinfo.nlh = nlh; @@ -1622,6 +1652,17 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->rc_mp_len = nla_len(nla); break; } + case RTA_TTL_PROPAGATE: + { + u8 ttl_propagate = nla_get_u8(nla); + + if (ttl_propagate > 1) + goto errout; + cfg->rc_ttl_propagate = ttl_propagate ? + MPLS_TTL_PROP_ENABLED : + MPLS_TTL_PROP_DISABLED; + break; + } default: /* Unsupported attribute */ goto errout; @@ -1682,6 +1723,15 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event, if (nla_put_labels(skb, RTA_DST, 1, &label)) goto nla_put_failure; + + if (rt->rt_ttl_propagate != MPLS_TTL_PROP_DEFAULT) { + bool ttl_propagate = + rt->rt_ttl_propagate == MPLS_TTL_PROP_ENABLED; + + if (nla_put_u8(skb, RTA_TTL_PROPAGATE, + ttl_propagate)) + goto nla_put_failure; + } if (rt->rt_nhn == 1) { const struct mpls_nh *nh = rt->rt_nh; @@ -1792,7 +1842,8 @@ static inline size_t lfib_nlmsg_size(struct mpls_route *rt) { size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg)) - + nla_total_size(4); /* RTA_DST */ + + nla_total_size(4) /* RTA_DST */ + + nla_total_size(1); /* RTA_TTL_PROPAGATE */ if (rt->rt_nhn == 1) { struct mpls_nh *nh = rt->rt_nh; @@ -1876,6 +1927,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) RCU_INIT_POINTER(rt0->rt_nh->nh_dev, lo); rt0->rt_protocol = RTPROT_KERNEL; rt0->rt_payload_type = MPT_IPV4; + rt0->rt_ttl_propagate = MPLS_TTL_PROP_DEFAULT; rt0->rt_nh->nh_via_table = NEIGH_LINK_TABLE; rt0->rt_nh->nh_via_alen = lo->addr_len; memcpy(__mpls_nh_via(rt0, rt0->rt_nh), lo->dev_addr, @@ -1889,6 +1941,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) RCU_INIT_POINTER(rt2->rt_nh->nh_dev, lo); rt2->rt_protocol = RTPROT_KERNEL; rt2->rt_payload_type = MPT_IPV6; + rt0->rt_ttl_propagate = MPLS_TTL_PROP_DEFAULT; rt2->rt_nh->nh_via_table = NEIGH_LINK_TABLE; rt2->rt_nh->nh_via_alen = lo->addr_len; memcpy(__mpls_nh_via(rt2, rt2->rt_nh), lo->dev_addr, @@ -1970,6 +2023,9 @@ static int mpls_platform_labels(struct ctl_table *table, int write, return ret; } +#define MPLS_NS_SYSCTL_OFFSET(field) \ + (&((struct net *)0)->field) + static const struct ctl_table mpls_table[] = { { .procname = "platform_labels", @@ -1978,21 +2034,47 @@ static const struct ctl_table mpls_table[] = { .mode = 0644, .proc_handler = mpls_platform_labels, }, + { + .procname = "ip_ttl_propagate", + .data = MPLS_NS_SYSCTL_OFFSET(mpls.ip_ttl_propagate), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "default_ttl", + .data = MPLS_NS_SYSCTL_OFFSET(mpls.default_ttl), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &ttl_max, + }, { } }; static int mpls_net_init(struct net *net) { struct ctl_table *table; + int i; net->mpls.platform_labels = 0; net->mpls.platform_label = NULL; + net->mpls.ip_ttl_propagate = 1; + net->mpls.default_ttl = 255; table = kmemdup(mpls_table, sizeof(mpls_table), GFP_KERNEL); if (table == NULL) return -ENOMEM; - table[0].data = net; + /* Table data contains only offsets relative to the base of + * the mdev at this point, so make them absolute. + */ + for (i = 0; i < ARRAY_SIZE(mpls_table) - 1; i++) + table[i].data = (char *)net + (uintptr_t)table[i].data; + net->mpls.ctl = register_net_sysctl(net, "net/mpls", table); if (net->mpls.ctl == NULL) { kfree(table); diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 76360d8b95798..62928d8fabd1b 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -90,6 +90,12 @@ struct mpls_nh { /* next hop label forwarding entry */ u8 nh_via_table; }; +enum mpls_ttl_propagation { + MPLS_TTL_PROP_DEFAULT, + MPLS_TTL_PROP_ENABLED, + MPLS_TTL_PROP_DISABLED, +}; + /* The route, nexthops and vias are stored together in the same memory * block: * @@ -116,6 +122,7 @@ struct mpls_route { /* next hop label forwarding entry */ u8 rt_protocol; u8 rt_payload_type; u8 rt_max_alen; + u8 rt_ttl_propagate; unsigned int rt_nhn; unsigned int rt_nhn_alive; struct mpls_nh rt_nh[0]; diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index e4e4424f9eb1f..22f71fce0bfb8 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -29,6 +29,7 @@ static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = { [MPLS_IPTUNNEL_DST] = { .type = NLA_U32 }, + [MPLS_IPTUNNEL_TTL] = { .type = NLA_U8 }, }; static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en) @@ -49,6 +50,7 @@ static int mpls_xmit(struct sk_buff *skb) struct rtable *rt = NULL; struct rt6_info *rt6 = NULL; struct mpls_dev *out_mdev; + struct net *net; int err = 0; bool bos; int i; @@ -56,17 +58,7 @@ static int mpls_xmit(struct sk_buff *skb) /* Find the output device */ out_dev = dst->dev; - - /* Obtain the ttl */ - if (dst->ops->family == AF_INET) { - ttl = ip_hdr(skb)->ttl; - rt = (struct rtable *)dst; - } else if (dst->ops->family == AF_INET6) { - ttl = ipv6_hdr(skb)->hop_limit; - rt6 = (struct rt6_info *)dst; - } else { - goto drop; - } + net = dev_net(out_dev); skb_orphan(skb); @@ -78,6 +70,38 @@ static int mpls_xmit(struct sk_buff *skb) tun_encap_info = mpls_lwtunnel_encap(dst->lwtstate); + /* Obtain the ttl using the following set of rules. + * + * LWT ttl propagation setting: + * - disabled => use default TTL value from LWT + * - enabled => use TTL value from IPv4/IPv6 header + * - default => + * Global ttl propagation setting: + * - disabled => use default TTL value from global setting + * - enabled => use TTL value from IPv4/IPv6 header + */ + if (dst->ops->family == AF_INET) { + if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DISABLED) + ttl = tun_encap_info->default_ttl; + else if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DEFAULT && + !net->mpls.ip_ttl_propagate) + ttl = net->mpls.default_ttl; + else + ttl = ip_hdr(skb)->ttl; + rt = (struct rtable *)dst; + } else if (dst->ops->family == AF_INET6) { + if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DISABLED) + ttl = tun_encap_info->default_ttl; + else if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DEFAULT && + !net->mpls.ip_ttl_propagate) + ttl = net->mpls.default_ttl; + else + ttl = ipv6_hdr(skb)->hop_limit; + rt6 = (struct rt6_info *)dst; + } else { + goto drop; + } + /* Verify the destination can hold the packet */ new_header_size = mpls_encap_size(tun_encap_info); mtu = mpls_dev_mtu(out_dev); @@ -160,6 +184,17 @@ static int mpls_build_state(struct nlattr *nla, &tun_encap_info->labels, tun_encap_info->label); if (ret) goto errout; + + tun_encap_info->ttl_propagate = MPLS_TTL_PROP_DEFAULT; + + if (tb[MPLS_IPTUNNEL_TTL]) { + tun_encap_info->default_ttl = nla_get_u8(tb[MPLS_IPTUNNEL_TTL]); + /* TTL 0 implies propagate from IP header */ + tun_encap_info->ttl_propagate = tun_encap_info->default_ttl ? + MPLS_TTL_PROP_DISABLED : + MPLS_TTL_PROP_ENABLED; + } + newts->type = LWTUNNEL_ENCAP_MPLS; newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; newts->headroom = mpls_encap_size(tun_encap_info); @@ -186,6 +221,10 @@ static int mpls_fill_encap_info(struct sk_buff *skb, tun_encap_info->label)) goto nla_put_failure; + if (tun_encap_info->ttl_propagate != MPLS_TTL_PROP_DEFAULT && + nla_put_u8(skb, MPLS_IPTUNNEL_TTL, tun_encap_info->default_ttl)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -195,10 +234,16 @@ static int mpls_fill_encap_info(struct sk_buff *skb, static int mpls_encap_nlsize(struct lwtunnel_state *lwtstate) { struct mpls_iptunnel_encap *tun_encap_info; + int nlsize; tun_encap_info = mpls_lwtunnel_encap(lwtstate); - return nla_total_size(tun_encap_info->labels * 4); + nlsize = nla_total_size(tun_encap_info->labels * 4); + + if (tun_encap_info->ttl_propagate != MPLS_TTL_PROP_DEFAULT) + nlsize += nla_total_size(1); + + return nlsize; } static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) @@ -207,7 +252,9 @@ static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) struct mpls_iptunnel_encap *b_hdr = mpls_lwtunnel_encap(b); int l; - if (a_hdr->labels != b_hdr->labels) + if (a_hdr->labels != b_hdr->labels || + a_hdr->ttl_propagate != b_hdr->ttl_propagate || + a_hdr->default_ttl != b_hdr->default_ttl) return 1; for (l = 0; l < MAX_NEW_LABELS; l++)