Skip to content

Commit

Permalink
Merge branch 'ovs-L3-encap'
Browse files Browse the repository at this point in the history
Jiri Benc says:

====================
openvswitch: support for layer 3 encapsulated packets

At the core of this patch set is removing the assumption in the Open vSwitch
datapath that all packets have an Ethernet header.

The implementation relies on the presence of pop_eth and push_eth actions
in datapath flows to facilitate adding and removing Ethernet headers as
appropriate. The construction of such flows is left up to user-space.

This series is based on work by Simon Horman, Lorand Jakab, Thomas Morin and
others. I kept Lorand's and Simon's s-o-b in the patches that are derived
from v11 to record their authorship of parts of the code.

Changes from v12 to v13:

* Addressed Pravin's feedback.
* Removed the GRE vport conversion patch; L3 GRE ports should be created by
  rtnetlink instead.

Main changes from v11 to v12:

* The patches were restructured and split differently for easier review.
* They were rebased and adjusted to the current net-next. Especially MPLS
  handling is different (and easier) thanks to the recent MPLS GSO rework.
* Several bugs were discovered and fixed. The most notable is fragment
  handling: header adjustment for ARPHRD_NONE devices on tx needs to be done
  after refragmentation, not before it. This required significant changes in
  the patchset. Another one is stricter checking of attributes (match on L2
  vs. L3 packet) at the kernel level.
* Instead of is_layer3 bool, a mac_proto field is used.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Nov 13, 2016
2 parents c540594 + 217ac77 commit f0a4040
Show file tree
Hide file tree
Showing 9 changed files with 353 additions and 134 deletions.
15 changes: 15 additions & 0 deletions include/uapi/linux/openvswitch.h
Original file line number Diff line number Diff line change
Expand Up @@ -705,6 +705,15 @@ enum ovs_nat_attr {

#define OVS_NAT_ATTR_MAX (__OVS_NAT_ATTR_MAX - 1)

/*
* struct ovs_action_push_eth - %OVS_ACTION_ATTR_PUSH_ETH action argument.
* @addresses: Source and destination MAC addresses.
*
* The argument carries no Ethernet type member: the datapath's push_eth()
* fills the new header's EtherType from the packet's current skb->protocol.
*/
struct ovs_action_push_eth {
struct ovs_key_ethernet addresses;
};

/**
* enum ovs_action_attr - Action types.
*
Expand Down Expand Up @@ -738,6 +747,10 @@ enum ovs_nat_attr {
* is no MPLS label stack, as determined by ethertype, no action is taken.
* @OVS_ACTION_ATTR_CT: Track the connection. Populate the conntrack-related
* entries in the flow key.
* @OVS_ACTION_ATTR_PUSH_ETH: Push a new outermost Ethernet header onto the
* packet.
* @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the
* packet.
*
* Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all
* fields within a header are modifiable, e.g. the IPv4 protocol and fragment
Expand Down Expand Up @@ -765,6 +778,8 @@ enum ovs_action_attr {
* bits. */
OVS_ACTION_ATTR_CT, /* Nested OVS_CT_ATTR_* . */
OVS_ACTION_ATTR_TRUNC, /* u32 struct ovs_action_trunc. */
OVS_ACTION_ATTR_PUSH_ETH, /* struct ovs_action_push_eth. */
OVS_ACTION_ATTR_POP_ETH, /* No argument. */

__OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted
* from userspace. */
Expand Down
111 changes: 81 additions & 30 deletions net/openvswitch/actions.c
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ struct ovs_frag_data {
u16 vlan_tci;
__be16 vlan_proto;
unsigned int l2_len;
u8 mac_proto;
u8 l2_data[MAX_L2_LEN];
};

Expand Down Expand Up @@ -137,12 +138,12 @@ static struct deferred_action *add_deferred_actions(struct sk_buff *skb,

static void invalidate_flow_key(struct sw_flow_key *key)
{
key->eth.type = htons(0);
key->mac_proto |= SW_FLOW_KEY_INVALID;
}

static bool is_flow_key_valid(const struct sw_flow_key *key)
{
return !!key->eth.type;
return !(key->mac_proto & SW_FLOW_KEY_INVALID);
}

static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
Expand Down Expand Up @@ -186,7 +187,8 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,

skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);

update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype);
if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET)
update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype);
skb->protocol = mpls->mpls_ethertype;

invalidate_flow_key(key);
Expand All @@ -196,7 +198,6 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
const __be16 ethertype)
{
struct ethhdr *hdr;
int err;

err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
Expand All @@ -212,11 +213,15 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
skb_reset_mac_header(skb);
skb_set_network_header(skb, skb->mac_len);

/* mpls_hdr() is used to locate the ethertype field correctly in the
* presence of VLAN tags.
*/
hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
update_ethertype(skb, hdr, ethertype);
if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET) {
struct ethhdr *hdr;

/* mpls_hdr() is used to locate the ethertype field correctly in the
* presence of VLAN tags.
*/
hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
update_ethertype(skb, hdr, ethertype);
}
if (eth_p_mpls(skb->protocol))
skb->protocol = ethertype;

Expand Down Expand Up @@ -312,6 +317,47 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key,
return 0;
}

/* pop_eth does not support VLAN packets as this action is never called
* for them.
*/
static int pop_eth(struct sk_buff *skb, struct sw_flow_key *key)
{
skb_pull_rcsum(skb, ETH_HLEN);
skb_reset_mac_header(skb);
skb_reset_mac_len(skb);

/* safe right before invalidate_flow_key */
key->mac_proto = MAC_PROTO_NONE;
invalidate_flow_key(key);
return 0;
}

/* push_eth - prepend a new outermost Ethernet header to a packet.
 * @skb: packet to modify
 * @key: flow key of the packet; switched to MAC_PROTO_ETHERNET and invalidated
 * @ethh: action argument supplying the source and destination MAC addresses
 *
 * The EtherType of the new header is copied from skb->protocol.
 *
 * Returns 0 on success, or -ENOMEM when skb_cow_head() fails.
 */
static int push_eth(struct sk_buff *skb, struct sw_flow_key *key,
		    const struct ovs_action_push_eth *ethh)
{
	struct ethhdr *eth;
	int err;

	/* Make room for the new Ethernet header */
	err = skb_cow_head(skb, ETH_HLEN);
	if (err < 0)
		return -ENOMEM;

	skb_push(skb, ETH_HLEN);
	skb_reset_mac_header(skb);
	skb_reset_mac_len(skb);

	/* Fill in the freshly pushed header */
	eth = eth_hdr(skb);
	eth->h_proto = skb->protocol;
	ether_addr_copy(eth->h_dest, ethh->addresses.eth_dst);
	ether_addr_copy(eth->h_source, ethh->addresses.eth_src);

	skb_postpush_rcsum(skb, eth, ETH_HLEN);

	/* safe right before invalidate_flow_key */
	key->mac_proto = MAC_PROTO_ETHERNET;
	invalidate_flow_key(key);
	return 0;
}

static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh,
__be32 addr, __be32 new_addr)
{
Expand Down Expand Up @@ -673,7 +719,7 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *sk
skb_reset_mac_len(skb);
}

ovs_vport_send(vport, skb);
ovs_vport_send(vport, skb, data->mac_proto);
return 0;
}

Expand All @@ -692,7 +738,7 @@ static struct dst_ops ovs_dst_ops = {
* ovs_vport_output(), which is called once per fragmented packet.
*/
static void prepare_frag(struct vport *vport, struct sk_buff *skb,
u16 orig_network_offset)
u16 orig_network_offset, u8 mac_proto)
{
unsigned int hlen = skb_network_offset(skb);
struct ovs_frag_data *data;
Expand All @@ -705,6 +751,7 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb,
data->network_offset = orig_network_offset;
data->vlan_tci = skb->vlan_tci;
data->vlan_proto = skb->vlan_proto;
data->mac_proto = mac_proto;
data->l2_len = hlen;
memcpy(&data->l2_data, skb->data, hlen);

Expand All @@ -713,7 +760,8 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb,
}

static void ovs_fragment(struct net *net, struct vport *vport,
struct sk_buff *skb, u16 mru, __be16 ethertype)
struct sk_buff *skb, u16 mru,
struct sw_flow_key *key)
{
u16 orig_network_offset = 0;

Expand All @@ -727,11 +775,12 @@ static void ovs_fragment(struct net *net, struct vport *vport,
goto err;
}

if (ethertype == htons(ETH_P_IP)) {
if (key->eth.type == htons(ETH_P_IP)) {
struct dst_entry ovs_dst;
unsigned long orig_dst;

prepare_frag(vport, skb, orig_network_offset);
prepare_frag(vport, skb, orig_network_offset,
ovs_key_mac_proto(key));
dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1,
DST_OBSOLETE_NONE, DST_NOCOUNT);
ovs_dst.dev = vport->dev;
Expand All @@ -742,7 +791,7 @@ static void ovs_fragment(struct net *net, struct vport *vport,

ip_do_fragment(net, skb->sk, skb, ovs_vport_output);
refdst_drop(orig_dst);
} else if (ethertype == htons(ETH_P_IPV6)) {
} else if (key->eth.type == htons(ETH_P_IPV6)) {
const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
unsigned long orig_dst;
struct rt6_info ovs_rt;
Expand All @@ -751,7 +800,8 @@ static void ovs_fragment(struct net *net, struct vport *vport,
goto err;
}

prepare_frag(vport, skb, orig_network_offset);
prepare_frag(vport, skb, orig_network_offset,
ovs_key_mac_proto(key));
memset(&ovs_rt, 0, sizeof(ovs_rt));
dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1,
DST_OBSOLETE_NONE, DST_NOCOUNT);
Expand All @@ -765,7 +815,7 @@ static void ovs_fragment(struct net *net, struct vport *vport,
refdst_drop(orig_dst);
} else {
WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.",
ovs_vport_name(vport), ntohs(ethertype), mru,
ovs_vport_name(vport), ntohs(key->eth.type), mru,
vport->dev->mtu);
goto err;
}
Expand All @@ -785,26 +835,19 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
u32 cutlen = OVS_CB(skb)->cutlen;

if (unlikely(cutlen > 0)) {
if (skb->len - cutlen > ETH_HLEN)
if (skb->len - cutlen > ovs_mac_header_len(key))
pskb_trim(skb, skb->len - cutlen);
else
pskb_trim(skb, ETH_HLEN);
pskb_trim(skb, ovs_mac_header_len(key));
}

if (likely(!mru || (skb->len <= mru + ETH_HLEN))) {
ovs_vport_send(vport, skb);
if (likely(!mru ||
(skb->len <= mru + vport->dev->hard_header_len))) {
ovs_vport_send(vport, skb, ovs_key_mac_proto(key));
} else if (mru <= vport->dev->mtu) {
struct net *net = read_pnet(&dp->net);
__be16 ethertype = key->eth.type;

if (!is_flow_key_valid(key)) {
if (eth_p_mpls(skb->protocol))
ethertype = skb->inner_protocol;
else
ethertype = vlan_get_protocol(skb);
}

ovs_fragment(net, vport, skb, mru, ethertype);
ovs_fragment(net, vport, skb, mru, key);
} else {
kfree_skb(skb);
}
Expand Down Expand Up @@ -1198,6 +1241,14 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
if (err)
return err == -EINPROGRESS ? 0 : err;
break;

case OVS_ACTION_ATTR_PUSH_ETH:
err = push_eth(skb, key, nla_data(a));
break;

case OVS_ACTION_ATTR_POP_ETH:
err = pop_eth(skb, key);
break;
}

if (unlikely(err)) {
Expand Down
13 changes: 1 addition & 12 deletions net/openvswitch/datapath.c
Original file line number Diff line number Diff line change
Expand Up @@ -560,7 +560,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
struct sw_flow *flow;
struct sw_flow_actions *sf_acts;
struct datapath *dp;
struct ethhdr *eth;
struct vport *input_vport;
u16 mru = 0;
int len;
Expand All @@ -581,17 +580,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)

nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len);

skb_reset_mac_header(packet);
eth = eth_hdr(packet);

/* Normally, setting the skb 'protocol' field would be handled by a
* call to eth_type_trans(), but it assumes there's a sending
* device, which we may not have. */
if (eth_proto_is_802_3(eth->h_proto))
packet->protocol = eth->h_proto;
else
packet->protocol = htons(ETH_P_802_2);

/* Set packet's mru */
if (a[OVS_PACKET_ATTR_MRU]) {
mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]);
Expand All @@ -618,6 +606,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
rcu_assign_pointer(flow->sf_acts, acts);
packet->priority = flow->key.phy.priority;
packet->mark = flow->key.phy.skb_mark;
packet->protocol = flow->key.eth.type;

rcu_read_lock();
dp = get_dp_rcu(net, ovs_header->dp_ifindex);
Expand Down
Loading

0 comments on commit f0a4040

Please sign in to comment.