diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 1ce7420322ee3..55674b0e65b7d 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -137,6 +137,20 @@ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, } #if IS_ENABLED(CONFIG_IPV6) +static int vrf_ip6_local_out(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + int err; + + err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, + sk, skb, NULL, skb_dst(skb)->dev, dst_output); + + if (likely(err == 1)) + err = dst_output(net, sk, skb); + + return err; +} + static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { @@ -151,7 +165,7 @@ static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, .flowlabel = ip6_flowinfo(iph), .flowi6_mark = skb->mark, .flowi6_proto = iph->nexthdr, - .flowi6_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF, + .flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF, }; int ret = NET_XMIT_DROP; struct dst_entry *dst; @@ -207,7 +221,7 @@ static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); - ret = ip6_local_out(net, skb->sk, skb); + ret = vrf_ip6_local_out(net, skb->sk, skb); if (unlikely(net_xmit_eval(ret))) dev->stats.tx_errors++; else @@ -227,6 +241,20 @@ static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, } #endif +/* based on ip_local_out; can't use it b/c the dst is switched pointing to us */ +static int vrf_ip_local_out(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + int err; + + err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, + skb, NULL, skb_dst(skb)->dev, dst_output); + if (likely(err == 1)) + err = dst_output(net, sk, skb); + + return err; +} + static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, struct net_device *vrf_dev) { @@ -237,8 +265,7 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, .flowi4_oif = vrf_dev->ifindex, .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_tos = RT_TOS(ip4h->tos), - .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_L3MDEV_SRC | - FLOWI_FLAG_SKIP_NH_OIF, + .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_SKIP_NH_OIF, .daddr = ip4h->daddr, }; struct net *net = dev_net(vrf_dev); @@ -292,7 +319,7 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, RT_SCOPE_LINK); } - ret = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); + ret = vrf_ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(net_xmit_eval(ret))) vrf_dev->stats.tx_errors++; else @@ -377,6 +404,43 @@ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb) !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } +/* set dst on skb to send packet to us via dev_xmit path. Allows + * packet to go through device based features such as qdisc, netfilter + * hooks and packet sockets with skb->dev set to vrf device. + */ +static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, + struct sock *sk, + struct sk_buff *skb) +{ + struct net_vrf *vrf = netdev_priv(vrf_dev); + struct dst_entry *dst = NULL; + struct rt6_info *rt6; + + /* don't divert link scope packets */ + if (rt6_need_strict(&ipv6_hdr(skb)->daddr)) + return skb; + + rcu_read_lock(); + + rt6 = rcu_dereference(vrf->rt6); + if (likely(rt6)) { + dst = &rt6->dst; + dst_hold(dst); + } + + rcu_read_unlock(); + + if (unlikely(!dst)) { + vrf_tx_error(vrf_dev, skb); + return NULL; + } + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + return skb; +} + /* holding rtnl */ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { @@ -463,6 +527,13 @@ static int vrf_rt6_create(struct net_device *dev) return rc; } #else +static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, + struct sock *sk, + struct sk_buff *skb) +{ + return skb; +} + static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { } @@ -531,6 +602,55 @@ static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb) !(IPCB(skb)->flags & IPSKB_REROUTED)); } +/* set dst on skb to send packet to us via dev_xmit path. Allows + * packet to go through device based features such as qdisc, netfilter + * hooks and packet sockets with skb->dev set to vrf device. + */ +static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev, + struct sock *sk, + struct sk_buff *skb) +{ + struct net_vrf *vrf = netdev_priv(vrf_dev); + struct dst_entry *dst = NULL; + struct rtable *rth; + + rcu_read_lock(); + + rth = rcu_dereference(vrf->rth); + if (likely(rth)) { + dst = &rth->dst; + dst_hold(dst); + } + + rcu_read_unlock(); + + if (unlikely(!dst)) { + vrf_tx_error(vrf_dev, skb); + return NULL; + } + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + return skb; +} + +/* called with rcu lock held */ +static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev, + struct sock *sk, + struct sk_buff *skb, + u16 proto) +{ + switch (proto) { + case AF_INET: + return vrf_ip_out(vrf_dev, sk, skb); + case AF_INET6: + return vrf_ip6_out(vrf_dev, sk, skb); + } + + return skb; +} + /* holding rtnl */ static void vrf_rtable_release(struct net_device *dev, struct net_vrf *vrf) { @@ -722,63 +842,6 @@ static u32 vrf_fib_table(const struct net_device *dev) return vrf->tb_id; } -static struct rtable *vrf_get_rtable(const struct net_device *dev, - const struct flowi4 *fl4) -{ - struct rtable *rth = NULL; - - if (!(fl4->flowi4_flags & FLOWI_FLAG_L3MDEV_SRC)) { - struct net_vrf *vrf = netdev_priv(dev); - - rcu_read_lock(); - - rth = rcu_dereference(vrf->rth); - if (likely(rth)) - dst_hold(&rth->dst); - - rcu_read_unlock(); - } - - return rth; -} - -/* called under rcu_read_lock */ -static int vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4) -{ - struct fib_result res = { .tclassid = 0 }; - struct net *net = dev_net(dev); - u32 orig_tos = fl4->flowi4_tos; - u8 flags = fl4->flowi4_flags; - u8 scope = fl4->flowi4_scope; - u8 tos = RT_FL_TOS(fl4); - int rc; - - if (unlikely(!fl4->daddr)) - return 0; - - fl4->flowi4_flags |= FLOWI_FLAG_SKIP_NH_OIF; - fl4->flowi4_iif = LOOPBACK_IFINDEX; - /* make sure oif is set to VRF device for lookup */ - fl4->flowi4_oif = dev->ifindex; - fl4->flowi4_tos = tos & IPTOS_RT_MASK; - fl4->flowi4_scope = ((tos & RTO_ONLINK) ? - RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); - - rc = fib_lookup(net, fl4, &res, 0); - if (!rc) { - if (res.type == RTN_LOCAL) - fl4->saddr = res.fi->fib_prefsrc ? : fl4->daddr; - else - fib_select_path(net, &res, fl4, -1); - } - - fl4->flowi4_flags = flags; - fl4->flowi4_tos = orig_tos; - fl4->flowi4_scope = scope; - - return rc; -} - static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { return 0; @@ -970,106 +1033,44 @@ static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev, } #if IS_ENABLED(CONFIG_IPV6) -static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev, - struct flowi6 *fl6) +/* send to link-local or multicast address via interface enslaved to + * VRF device. Force lookup to VRF table without changing flow struct + */ +static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, + struct flowi6 *fl6) { - bool need_strict = rt6_need_strict(&fl6->daddr); - struct net_vrf *vrf = netdev_priv(dev); struct net *net = dev_net(dev); + int flags = RT6_LOOKUP_F_IFACE; struct dst_entry *dst = NULL; struct rt6_info *rt; - /* send to link-local or multicast address */ - if (need_strict) { - int flags = RT6_LOOKUP_F_IFACE; - - /* VRF device does not have a link-local address and - * sending packets to link-local or mcast addresses over - * a VRF device does not make sense - */ - if (fl6->flowi6_oif == dev->ifindex) { - struct dst_entry *dst = &net->ipv6.ip6_null_entry->dst; - - dst_hold(dst); - return dst; - } - - if (!ipv6_addr_any(&fl6->saddr)) - flags |= RT6_LOOKUP_F_HAS_SADDR; - - rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, flags); - if (rt) - dst = &rt->dst; - - } else if (!(fl6->flowi6_flags & FLOWI_FLAG_L3MDEV_SRC)) { - - rcu_read_lock(); - - rt = rcu_dereference(vrf->rt6); - if (likely(rt)) { - dst = &rt->dst; - dst_hold(dst); - } - - rcu_read_unlock(); + /* VRF device does not have a link-local address and + * sending packets to link-local or mcast addresses over + * a VRF device does not make sense + */ + if (fl6->flowi6_oif == dev->ifindex) { + dst = &net->ipv6.ip6_null_entry->dst; + dst_hold(dst); + return dst; } - /* make sure oif is set to VRF device for lookup */ - if (!need_strict) - fl6->flowi6_oif = dev->ifindex; - - return dst; -} - -/* called under rcu_read_lock */ -static int vrf_get_saddr6(struct net_device *dev, const struct sock *sk, - struct flowi6 *fl6) -{ - struct net *net = dev_net(dev); - struct dst_entry *dst; - struct rt6_info *rt; - int err; - - if (rt6_need_strict(&fl6->daddr)) { - rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, - RT6_LOOKUP_F_IFACE); - if (unlikely(!rt)) - return 0; + if (!ipv6_addr_any(&fl6->saddr)) + flags |= RT6_LOOKUP_F_HAS_SADDR; + rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, flags); + if (rt) dst = &rt->dst; - } else { - __u8 flags = fl6->flowi6_flags; - fl6->flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC; - fl6->flowi6_flags |= FLOWI_FLAG_SKIP_NH_OIF; - - dst = ip6_route_output(net, sk, fl6); - rt = (struct rt6_info *)dst; - - fl6->flowi6_flags = flags; - } - - err = dst->error; - if (!err) { - err = ip6_route_get_saddr(net, rt, &fl6->daddr, - sk ? inet6_sk(sk)->srcprefs : 0, - &fl6->saddr); - } - - dst_release(dst); - - return err; + return dst; } #endif static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_fib_table = vrf_fib_table, - .l3mdev_get_rtable = vrf_get_rtable, - .l3mdev_get_saddr = vrf_get_saddr, .l3mdev_l3_rcv = vrf_l3_rcv, + .l3mdev_l3_out = vrf_l3_out, #if IS_ENABLED(CONFIG_IPV6) - .l3mdev_get_rt6_dst = vrf_get_rt6_dst, - .l3mdev_get_saddr6 = vrf_get_saddr6, + .l3mdev_link_scope_lookup = vrf_link_scope_lookup, #endif }; diff --git a/include/net/flow.h b/include/net/flow.h index d47ef4bb5423a..035aa77169678 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -34,8 +34,7 @@ struct flowi_common { __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 -#define FLOWI_FLAG_L3MDEV_SRC 0x04 -#define FLOWI_FLAG_SKIP_NH_OIF 0x08 +#define FLOWI_FLAG_SKIP_NH_OIF 0x04 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; }; diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index e90095091aa0d..3832099289c5a 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -11,6 +11,7 @@ #ifndef _NET_L3MDEV_H_ #define _NET_L3MDEV_H_ +#include #include /** @@ -18,30 +19,24 @@ * * @l3mdev_fib_table: Get FIB table id to use for lookups * - * @l3mdev_get_rtable: Get cached IPv4 rtable (dst_entry) for device + * @l3mdev_l3_rcv: Hook in L3 receive path * - * @l3mdev_get_saddr: Get source address for a flow + * @l3mdev_l3_out: Hook in L3 output path * - * @l3mdev_get_rt6_dst: Get cached IPv6 rt6_info (dst_entry) for device + * @l3mdev_link_scope_lookup: IPv6 lookup for linklocal and mcast destinations */ struct l3mdev_ops { u32 (*l3mdev_fib_table)(const struct net_device *dev); struct sk_buff * (*l3mdev_l3_rcv)(struct net_device *dev, struct sk_buff *skb, u16 proto); - - /* IPv4 ops */ - struct rtable * (*l3mdev_get_rtable)(const struct net_device *dev, - const struct flowi4 *fl4); - int (*l3mdev_get_saddr)(struct net_device *dev, - struct flowi4 *fl4); + struct sk_buff * (*l3mdev_l3_out)(struct net_device *dev, + struct sock *sk, struct sk_buff *skb, + u16 proto); /* IPv6 ops */ - struct dst_entry * (*l3mdev_get_rt6_dst)(const struct net_device *dev, + struct dst_entry * (*l3mdev_link_scope_lookup)(const struct net_device *dev, struct flowi6 *fl6); - int (*l3mdev_get_saddr6)(struct net_device *dev, - const struct sock *sk, - struct flowi6 *fl6); }; #ifdef CONFIG_NET_L3_MASTER_DEV @@ -49,6 +44,8 @@ struct l3mdev_ops { int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg); +void l3mdev_update_flow(struct net *net, struct flowi *fl); + int l3mdev_master_ifindex_rcu(const struct net_device *dev); static inline int l3mdev_master_ifindex(struct net_device *dev) { @@ -80,7 +77,7 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) } static inline -const struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev) +struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev) { /* netdev_master_upper_dev_get_rcu calls * list_first_or_null_rcu to walk the upper dev list. @@ -89,7 +86,7 @@ const struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev) * typecast to remove the const */ struct net_device *dev = (struct net_device *)_dev; - const struct net_device *master; + struct net_device *master; if (!dev) return NULL; @@ -104,26 +101,6 @@ const struct net_device *l3mdev_master_dev_rcu(const struct net_device *_dev) return master; } -/* get index of an interface to use for FIB lookups. For devices - * enslaved to an L3 master device FIB lookups are based on the - * master index - */ -static inline int l3mdev_fib_oif_rcu(struct net_device *dev) -{ - return l3mdev_master_ifindex_rcu(dev) ? : dev->ifindex; -} - -static inline int l3mdev_fib_oif(struct net_device *dev) -{ - int oif; - - rcu_read_lock(); - oif = l3mdev_fib_oif_rcu(dev); - rcu_read_unlock(); - - return oif; -} - u32 l3mdev_fib_table_rcu(const struct net_device *dev); u32 l3mdev_fib_table_by_index(struct net *net, int ifindex); static inline u32 l3mdev_fib_table(const struct net_device *dev) @@ -137,15 +114,6 @@ static inline u32 l3mdev_fib_table(const struct net_device *dev) return tb_id; } -static inline struct rtable *l3mdev_get_rtable(const struct net_device *dev, - const struct flowi4 *fl4) -{ - if (netif_is_l3_master(dev) && dev->l3mdev_ops->l3mdev_get_rtable) - return dev->l3mdev_ops->l3mdev_get_rtable(dev, fl4); - - return NULL; -} - static inline bool netif_index_is_l3_master(struct net *net, int ifindex) { struct net_device *dev; @@ -165,11 +133,7 @@ static inline bool netif_index_is_l3_master(struct net *net, int ifindex) return rc; } -int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4); - -struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6); -int l3mdev_get_saddr6(struct net *net, const struct sock *sk, - struct flowi6 *fl6); +struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6); static inline struct sk_buff *l3mdev_l3_rcv(struct sk_buff *skb, u16 proto) @@ -199,6 +163,34 @@ struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb) return l3mdev_l3_rcv(skb, AF_INET6); } +static inline +struct sk_buff *l3mdev_l3_out(struct sock *sk, struct sk_buff *skb, u16 proto) +{ + struct net_device *dev = skb_dst(skb)->dev; + + if (netif_is_l3_slave(dev)) { + struct net_device *master; + + master = netdev_master_upper_dev_get_rcu(dev); + if (master && master->l3mdev_ops->l3mdev_l3_out) + skb = master->l3mdev_ops->l3mdev_l3_out(master, sk, + skb, proto); + } + + return skb; +} + +static inline +struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb) +{ + return l3mdev_l3_out(sk, skb, AF_INET); +} + +static inline +struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb) +{ + return l3mdev_l3_out(sk, skb, AF_INET6); +} #else static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev) @@ -216,20 +208,11 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) } static inline -const struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev) +struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev) { return NULL; } -static inline int l3mdev_fib_oif_rcu(struct net_device *dev) -{ - return dev ? dev->ifindex : 0; -} -static inline int l3mdev_fib_oif(struct net_device *dev) -{ - return dev ? dev->ifindex : 0; -} - static inline u32 l3mdev_fib_table_rcu(const struct net_device *dev) { return 0; @@ -243,43 +226,37 @@ static inline u32 l3mdev_fib_table_by_index(struct net *net, int ifindex) return 0; } -static inline struct rtable *l3mdev_get_rtable(const struct net_device *dev, - const struct flowi4 *fl4) -{ - return NULL; -} - static inline bool netif_index_is_l3_master(struct net *net, int ifindex) { return false; } -static inline int l3mdev_get_saddr(struct net *net, int ifindex, - struct flowi4 *fl4) +static inline +struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6) { - return 0; + return NULL; } static inline -struct dst_entry *l3mdev_get_rt6_dst(struct net *net, struct flowi6 *fl6) +struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb) { - return NULL; + return skb; } -static inline int l3mdev_get_saddr6(struct net *net, const struct sock *sk, - struct flowi6 *fl6) +static inline +struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb) { - return 0; + return skb; } static inline -struct sk_buff *l3mdev_ip_rcv(struct sk_buff *skb) +struct sk_buff *l3mdev_ip_out(struct sock *sk, struct sk_buff *skb) { return skb; } static inline -struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb) +struct sk_buff *l3mdev_ip6_out(struct sock *sk, struct sk_buff *skb) { return skb; } @@ -290,6 +267,10 @@ int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, { return 1; } +static inline +void l3mdev_update_flow(struct net *net, struct flowi *fl) +{ +} #endif #endif /* _NET_L3MDEV_H_ */ diff --git a/include/net/route.h b/include/net/route.h index ad777d79af945..0429d47cad25c 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include @@ -285,15 +284,6 @@ static inline struct rtable *ip_route_connect(struct flowi4 *fl4, ip_route_connect_init(fl4, dst, src, tos, oif, protocol, sport, dport, sk); - if (!src && oif) { - int rc; - - rc = l3mdev_get_saddr(net, oif, fl4); - if (rc < 0) - return ERR_PTR(rc); - - src = fl4->saddr; - } if (!dst || !src) { rt = __ip_route_output_key(net, fl4); if (IS_ERR(rt)) diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 6e9ea69e5f751..770bebed6b288 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -56,6 +56,9 @@ int __fib_lookup(struct net *net, struct flowi4 *flp, }; int err; + /* update flow if oif or iif point to device enslaved to l3mdev */ + l3mdev_update_flow(net, flowi4_to_flowi(flp)); + err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); #ifdef CONFIG_IP_ROUTE_CLASSID if (arg.rule) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b913f5bf07576..05d105832bdbb 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -99,6 +99,14 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) iph->tot_len = htons(skb->len); ip_send_check(iph); + + /* if egress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip_out(sk, skb); + if (unlikely(!skb)) + return 0; + return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); @@ -1574,8 +1582,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, } oif = arg->bound_dev_if; - if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) - oif = skb->skb_iif; + oif = oif ? : skb->skb_iif; flowi4_init_output(&fl4, oif, IP4_REPLY_MARK(net, skb->mark), diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 438f50c1a6765..90a85c9558724 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -606,12 +606,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0); - if (!saddr && ipc.oif) { - err = l3mdev_get_saddr(net, ipc.oif, &fl4); - if (err < 0) - goto done; - } - if (!inet->hdrincl) { rfv.msg = msg; rfv.hlen = 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3e992783c1d0c..b52496fd51075 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1831,7 +1831,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, * Now we are ready to route packet. */ fl4.flowi4_oif = 0; - fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev); + fl4.flowi4_iif = dev->ifindex; fl4.flowi4_mark = skb->mark; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; @@ -2018,7 +2018,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res, return ERR_PTR(-EINVAL); if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) - if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) + if (ipv4_is_loopback(fl4->saddr) && + !(dev_out->flags & IFF_LOOPBACK) && + !netif_is_l3_master(dev_out)) return ERR_PTR(-EINVAL); if (ipv4_is_lbcast(fl4->daddr)) @@ -2148,7 +2150,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, unsigned int flags = 0; struct fib_result res; struct rtable *rth; - int master_idx; int orig_oif; int err = -ENETUNREACH; @@ -2158,9 +2159,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, orig_oif = fl4->flowi4_oif; - master_idx = l3mdev_master_ifindex_by_index(net, fl4->flowi4_oif); - if (master_idx) - fl4->flowi4_oif = master_idx; fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_tos = tos & IPTOS_RT_MASK; fl4->flowi4_scope = ((tos & RTO_ONLINK) ? @@ -2244,10 +2242,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } - - rth = l3mdev_get_rtable(dev_out, fl4); - if (rth) - goto out; } if (!fl4->daddr) { @@ -2265,8 +2259,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, if (err) { res.fi = NULL; res.table = NULL; - if (fl4->flowi4_oif && - !netif_index_is_l3_master(net, fl4->flowi4_oif)) { + if (fl4->flowi4_oif) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. @@ -2302,7 +2295,9 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, else fl4->saddr = fl4->daddr; } - dev_out = net->loopback_dev; + + /* L3 master device is the loopback for that domain */ + dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev; fl4->flowi4_oif = dev_out->ifindex; flags |= RTCF_LOCAL; goto make_route; @@ -2577,9 +2572,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; - if (netif_index_is_l3_master(net, fl4.flowi4_oif)) - fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF; - if (iif) { struct net_device *dev; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 058c31286ce17..7d96dc2d3d08f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1021,12 +1021,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flow_flags, faddr, saddr, dport, inet->inet_sport); - if (!saddr && ipc.oif) { - err = l3mdev_get_saddr(net, ipc.oif, fl4); - if (err < 0) - goto out; - } - security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) { diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index b644a23c3db06..3155ed73d3b3c 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -112,7 +112,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) int oif = 0; if (skb_dst(skb)) - oif = l3mdev_fib_oif(skb_dst(skb)->dev); + oif = skb_dst(skb)->dev->ifindex; memset(fl4, 0, sizeof(struct flowi4)); fl4->flowi4_mark = skb->mark; diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 5857c1fc8b672..eea23b57c6a5a 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -38,6 +38,9 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, .flags = FIB_LOOKUP_NOREF, }; + /* update flow if oif or iif point to device enslaved to l3mdev */ + l3mdev_update_flow(net, flowi6_to_flowi(fl6)); + fib_rules_lookup(net->ipv6.fib6_rules_ops, flowi6_to_flowi(fl6), flags, &arg); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 993fd9666f1b3..6001e781164eb 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -236,6 +236,14 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUT, skb->len); + + /* if egress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_out((struct sock *)sk, skb); + if (unlikely(!skb)) + return 0; + /* hooks should never assume socket lock is held. * we promote our socket to non const */ @@ -918,13 +926,6 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, int err; int flags = 0; - if (ipv6_addr_any(&fl6->saddr) && fl6->flowi6_oif && - (!*dst || !(*dst)->error)) { - err = l3mdev_get_saddr6(net, sk, fl6); - if (err) - goto out_err; - } - /* The correct way to handle this would be to do * ip6_route_get_saddr, and then ip6_route_output; however, * the route-specific preferred source forces the @@ -1016,7 +1017,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, out_err_release: dst_release(*dst); *dst = NULL; -out_err: + if (err == -ENETUNREACH) IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); return err; @@ -1062,8 +1063,6 @@ struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6, return ERR_PTR(err); if (final_dst) fl6->daddr = *final_dst; - if (!fl6->flowi6_oif) - fl6->flowi6_oif = l3mdev_fib_oif(dst->dev); return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index fe65cdc28a459..d8e671457d105 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -67,7 +67,6 @@ #include #include #include -#include #include #include @@ -457,11 +456,9 @@ static void ndisc_send_skb(struct sk_buff *skb, if (!dst) { struct flowi6 fl6; - int oif = l3mdev_fib_oif(skb->dev); + int oif = skb->dev->ifindex; icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif); - if (oif != skb->dev->ifindex) - fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC; dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { kfree_skb(skb); @@ -1538,7 +1535,6 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) int rd_len; u8 ha_buf[MAX_ADDR_LEN], *ha = NULL, ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL; - int oif = l3mdev_fib_oif(dev); bool ret; if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { @@ -1555,10 +1551,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) } icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT, - &saddr_buf, &ipv6_hdr(skb)->saddr, oif); - - if (oif != skb->dev->ifindex) - fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC; + &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex); dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 462f2a76b5c22..7cca8ac66fe9b 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -148,6 +148,13 @@ int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) ipv6_hdr(skb)->payload_len = htons(len); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); + /* if egress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_out(sk, skb); + if (unlikely(!skb)) + return 0; + return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 590dd1f7746fc..54404f08efcca 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -653,6 +653,13 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, if (err) goto error_fault; + /* if egress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_out(sk, skb); + if (unlikely(!skb)) + return 0; + IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, dst_output); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 09d43ff11a8d2..ad4a7ff301fcc 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1164,7 +1164,7 @@ void ip6_route_input(struct sk_buff *skb) int flags = RT6_LOOKUP_F_HAS_SADDR; struct ip_tunnel_info *tun_info; struct flowi6 fl6 = { - .flowi6_iif = l3mdev_fib_oif(skb->dev), + .flowi6_iif = skb->dev->ifindex, .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), @@ -1188,12 +1188,15 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags) { - struct dst_entry *dst; bool any_src; - dst = l3mdev_get_rt6_dst(net, fl6); - if (dst) - return dst; + if (rt6_need_strict(&fl6->daddr)) { + struct dst_entry *dst; + + dst = l3mdev_link_scope_lookup(net, fl6); + if (dst) + return dst; + } fl6->flowi6_iif = LOOPBACK_IFINDEX; @@ -2558,8 +2561,16 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, { u32 tb_id; struct net *net = dev_net(idev->dev); - struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, - DST_NOCOUNT); + struct net_device *dev = net->loopback_dev; + struct rt6_info *rt; + + /* use L3 Master device as loopback for host routes if device + * is enslaved and address is not link local or multicast + */ + if (!rt6_need_strict(addr)) + dev = l3mdev_master_dev_rcu(idev->dev) ? : dev; + + rt = ip6_dst_alloc(net, dev, DST_NOCOUNT); if (!rt) return ERR_PTR(-ENOMEM); @@ -3338,11 +3349,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) } else { fl6.flowi6_oif = oif; - if (netif_index_is_l3_master(net, oif)) { - fl6.flowi6_flags = FLOWI_FLAG_L3MDEV_SRC | - FLOWI_FLAG_SKIP_NH_OIF; - } - rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 04529a3d42cb9..54cf7197c7ab5 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -818,12 +818,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 fl6.flowi6_proto = IPPROTO_TCP; if (rt6_need_strict(&fl6.daddr) && !oif) fl6.flowi6_oif = tcp_v6_iif(skb); - else { - if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) - oif = skb->skb_iif; - - fl6.flowi6_oif = oif; - } + else + fl6.flowi6_oif = oif ? : skb->skb_iif; fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); fl6.fl6_dport = t1->dest; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 6cc97003e4a95..b7b7e863a2bbd 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -134,7 +134,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) nexthdr = nh[nhoff]; if (skb_dst(skb)) - oif = l3mdev_fib_oif(skb_dst(skb)->dev); + oif = skb_dst(skb)->dev->ifindex; memset(fl6, 0, sizeof(struct flowi6)); fl6->flowi6_mark = skb->mark; diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index c4a1c3e84e126..8da86ceca33dd 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -100,15 +100,14 @@ u32 l3mdev_fib_table_by_index(struct net *net, int ifindex) EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); /** - * l3mdev_get_rt6_dst - IPv6 route lookup based on flow. Returns - * cached route for L3 master device if relevant - * to flow + * l3mdev_link_scope_lookup - IPv6 route lookup based on flow for link + * local and multicast addresses * @net: network namespace for device index lookup * @fl6: IPv6 flow struct for lookup */ -struct dst_entry *l3mdev_get_rt6_dst(struct net *net, - struct flowi6 *fl6) +struct dst_entry *l3mdev_link_scope_lookup(struct net *net, + struct flowi6 *fl6) { struct dst_entry *dst = NULL; struct net_device *dev; @@ -121,70 +120,15 @@ struct dst_entry *l3mdev_get_rt6_dst(struct net *net, dev = netdev_master_upper_dev_get_rcu(dev); if (dev && netif_is_l3_master(dev) && - dev->l3mdev_ops->l3mdev_get_rt6_dst) - dst = dev->l3mdev_ops->l3mdev_get_rt6_dst(dev, fl6); + dev->l3mdev_ops->l3mdev_link_scope_lookup) + dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6); rcu_read_unlock(); } return dst; } -EXPORT_SYMBOL_GPL(l3mdev_get_rt6_dst); - -/** - * l3mdev_get_saddr - get source address for a flow based on an interface - * enslaved to an L3 master device - * @net: network namespace for device index lookup - * @ifindex: Interface index - * @fl4: IPv4 flow struct - */ - -int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4) -{ - struct net_device *dev; - int rc = 0; - - if (ifindex) { - rcu_read_lock(); - - dev = dev_get_by_index_rcu(net, ifindex); - if (dev && netif_is_l3_slave(dev)) - dev = netdev_master_upper_dev_get_rcu(dev); - - if (dev && netif_is_l3_master(dev) && - dev->l3mdev_ops->l3mdev_get_saddr) - rc = dev->l3mdev_ops->l3mdev_get_saddr(dev, fl4); - - rcu_read_unlock(); - } - - return rc; -} -EXPORT_SYMBOL_GPL(l3mdev_get_saddr); - -int l3mdev_get_saddr6(struct net *net, const struct sock *sk, - struct flowi6 *fl6) -{ - struct net_device *dev; - int rc = 0; - - if (fl6->flowi6_oif) { - rcu_read_lock(); - - dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); - if (dev && netif_is_l3_slave(dev)) - dev = netdev_master_upper_dev_get_rcu(dev); - - if (dev && netif_is_l3_master(dev) && - dev->l3mdev_ops->l3mdev_get_saddr6) - rc = dev->l3mdev_ops->l3mdev_get_saddr6(dev, sk, fl6); - - rcu_read_unlock(); - } - - return rc; -} -EXPORT_SYMBOL_GPL(l3mdev_get_saddr6); +EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup); /** * l3mdev_fib_rule_match - Determine if flowi references an @@ -222,3 +166,38 @@ int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, return rc; } + +void l3mdev_update_flow(struct net *net, struct flowi *fl) +{ + struct net_device *dev; + int ifindex; + + rcu_read_lock(); + + if (fl->flowi_oif) { + dev = dev_get_by_index_rcu(net, fl->flowi_oif); + if (dev) { + ifindex = l3mdev_master_ifindex_rcu(dev); + if (ifindex) { + fl->flowi_oif = ifindex; + fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF; + goto out; + } + } + } + + if (fl->flowi_iif) { + dev = dev_get_by_index_rcu(net, fl->flowi_iif); + if (dev) { + ifindex = l3mdev_master_ifindex_rcu(dev); + if (ifindex) { + fl->flowi_iif = ifindex; + fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF; + } + } + } + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(l3mdev_update_flow);