diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 52457ae3b259e..7f591d71ab28d 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -189,6 +189,11 @@ enum nfp_bpf_map_use {
 	NFP_MAP_USE_ATOMIC_CNT,
 };
 
+struct nfp_bpf_map_word {
+	unsigned char type		:4;
+	unsigned char non_zero_update	:1;
+};
+
 /**
  * struct nfp_bpf_map - private per-map data attached to BPF maps for offload
  * @offmap:	pointer to the offloaded BPF map
@@ -202,7 +207,7 @@ struct nfp_bpf_map {
 	struct nfp_app_bpf *bpf;
 	u32 tid;
 	struct list_head l;
-	enum nfp_bpf_map_use use_map[];
+	struct nfp_bpf_map_word use_map[];
 };
 
 struct nfp_bpf_neutral_map {
@@ -436,6 +441,7 @@ struct nfp_bpf_subprog_info {
  * @prog: machine code
  * @prog_len: number of valid instructions in @prog array
  * @__prog_alloc_len: alloc size of @prog array
+ * @stack_size: total amount of stack used
  * @verifier_meta: temporary storage for verifier's insn meta
  * @type: BPF program type
  * @last_bpf_off: address of the last instruction translated from BPF
@@ -460,6 +466,8 @@ struct nfp_prog {
 	unsigned int prog_len;
 	unsigned int __prog_alloc_len;
 
+	unsigned int stack_size;
+
 	struct nfp_insn_meta *verifier_meta;
 
 	enum bpf_prog_type type;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 927e038d9f77d..ba8ceedcf6a28 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -262,10 +262,25 @@ static void nfp_map_bpf_byte_swap(struct nfp_bpf_map *nfp_map, void *value)
 	unsigned int i;
 
 	for (i = 0; i < DIV_ROUND_UP(nfp_map->offmap->map.value_size, 4); i++)
-		if (nfp_map->use_map[i] == NFP_MAP_USE_ATOMIC_CNT)
+		if (nfp_map->use_map[i].type == NFP_MAP_USE_ATOMIC_CNT)
 			word[i] = (__force u32)cpu_to_be32(word[i]);
 }
 
+/* Mark the value as unsafely initialized in case it becomes atomic later
+ * and we didn't byte swap something that is not byte-swap neutral.
+ */
+static void
+nfp_map_bpf_byte_swap_record(struct nfp_bpf_map *nfp_map, void *value)
+{
+	u32 *word = value;
+	unsigned int i;
+
+	for (i = 0; i < DIV_ROUND_UP(nfp_map->offmap->map.value_size, 4); i++)
+		if (nfp_map->use_map[i].type == NFP_MAP_UNUSED &&
+		    word[i] != (__force u32)cpu_to_be32(word[i]))
+			nfp_map->use_map[i].non_zero_update = 1;
+}
+
 static int
 nfp_bpf_map_lookup_entry(struct bpf_offloaded_map *offmap,
 			 void *key, void *value)
@@ -285,6 +300,7 @@ nfp_bpf_map_update_entry(struct bpf_offloaded_map *offmap,
 			 void *key, void *value, u64 flags)
 {
 	nfp_map_bpf_byte_swap(offmap->dev_priv, value);
+	nfp_map_bpf_byte_swap_record(offmap->dev_priv, value);
 	return nfp_bpf_ctrl_update_entry(offmap, key, value, flags);
 }
 
@@ -473,7 +489,7 @@ nfp_net_bpf_load(struct nfp_net *nn, struct bpf_prog *prog,
 		 struct netlink_ext_ack *extack)
 {
 	struct nfp_prog *nfp_prog = prog->aux->offload->dev_priv;
-	unsigned int max_mtu;
+	unsigned int max_mtu, max_stack, max_prog_len;
 	dma_addr_t dma_addr;
 	void *img;
 	int err;
@@ -484,6 +500,18 @@ nfp_net_bpf_load(struct nfp_net *nn, struct bpf_prog *prog,
 		return -EOPNOTSUPP;
 	}
 
+	max_stack = nn_readb(nn, NFP_NET_CFG_BPF_STACK_SZ) * 64;
+	if (nfp_prog->stack_size > max_stack) {
+		NL_SET_ERR_MSG_MOD(extack, "stack too large");
+		return -EOPNOTSUPP;
+	}
+
+	max_prog_len = nn_readw(nn, NFP_NET_CFG_BPF_MAX_LEN);
+	if (nfp_prog->prog_len > max_prog_len) {
+		NL_SET_ERR_MSG_MOD(extack, "program too long");
+		return -EOPNOTSUPP;
+	}
+
 	img = nfp_bpf_relo_for_vnic(nfp_prog, nn->app_priv);
 	if (IS_ERR(img))
 		return PTR_ERR(img);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index 193dd685b3658..99f977bfd8ccd 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -80,6 +80,46 @@ nfp_record_adjust_head(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog,
 	nfp_prog->adjust_head_location = location;
 }
 
+static bool nfp_bpf_map_update_value_ok(struct bpf_verifier_env *env)
+{
+	const struct bpf_reg_state *reg1 = cur_regs(env) + BPF_REG_1;
+	const struct bpf_reg_state *reg3 = cur_regs(env) + BPF_REG_3;
+	struct bpf_offloaded_map *offmap;
+	struct bpf_func_state *state;
+	struct nfp_bpf_map *nfp_map;
+	int off, i;
+
+	state = env->cur_state->frame[reg3->frameno];
+
+	/* We need to record each time an update happens with non-zero words,
+	 * in case such a word is later used in atomic operations.
+	 * This implicitly depends on nfp_bpf_stack_arg_ok(reg3) having run
+	 * before us.
+	 */
+
+	offmap = map_to_offmap(reg1->map_ptr);
+	nfp_map = offmap->dev_priv;
+	off = reg3->off + reg3->var_off.value;
+
+	for (i = 0; i < offmap->map.value_size; i++) {
+		struct bpf_stack_state *stack_entry;
+		unsigned int soff;
+
+		soff = -(off + i) - 1;
+		stack_entry = &state->stack[soff / BPF_REG_SIZE];
+		if (stack_entry->slot_type[soff % BPF_REG_SIZE] == STACK_ZERO)
+			continue;
+
+		if (nfp_map->use_map[i / 4].type == NFP_MAP_USE_ATOMIC_CNT) {
+			pr_vlog(env, "value at offset %d/%d may be non-zero, bpf_map_update_elem() is required to initialize atomic counters to zero to avoid offload endian issues\n",
+				i, soff);
+			return false;
+		}
+		nfp_map->use_map[i / 4].non_zero_update = 1;
+	}
+
+	return true;
+}
+
 static int
 nfp_bpf_stack_arg_ok(const char *fname, struct bpf_verifier_env *env,
 		     const struct bpf_reg_state *reg,
@@ -171,7 +211,8 @@ nfp_bpf_check_helper_call(struct nfp_prog *nfp_prog,
 					 bpf->helpers.map_update, reg1) ||
 		    !nfp_bpf_stack_arg_ok("map_update", env, reg2,
 					  meta->func_id ? &meta->arg2 : NULL) ||
-		    !nfp_bpf_stack_arg_ok("map_update", env, reg3, NULL))
+		    !nfp_bpf_stack_arg_ok("map_update", env, reg3, NULL) ||
+		    !nfp_bpf_map_update_value_ok(env))
 			return -EOPNOTSUPP;
 		break;
 
@@ -352,15 +393,22 @@ nfp_bpf_map_mark_used_one(struct bpf_verifier_env *env,
 			  struct nfp_bpf_map *nfp_map,
 			  unsigned int off, enum nfp_bpf_map_use use)
 {
-	if (nfp_map->use_map[off / 4] != NFP_MAP_UNUSED &&
-	    nfp_map->use_map[off / 4] != use) {
+	if (nfp_map->use_map[off / 4].type != NFP_MAP_UNUSED &&
+	    nfp_map->use_map[off / 4].type != use) {
 		pr_vlog(env, "map value use type conflict %s vs %s off: %u\n",
-			nfp_bpf_map_use_name(nfp_map->use_map[off / 4]),
+			nfp_bpf_map_use_name(nfp_map->use_map[off / 4].type),
 			nfp_bpf_map_use_name(use), off);
 		return -EOPNOTSUPP;
 	}
 
-	nfp_map->use_map[off / 4] = use;
+	if (nfp_map->use_map[off / 4].non_zero_update &&
+	    use == NFP_MAP_USE_ATOMIC_CNT) {
+		pr_vlog(env, "atomic counter in map value may already be initialized to non-zero value off: %u\n",
+			off);
+		return -EOPNOTSUPP;
+	}
+
+	nfp_map->use_map[off / 4].type = use;
 
 	return 0;
 }
@@ -699,9 +747,9 @@ nfp_bpf_get_stack_usage(struct nfp_prog *nfp_prog, unsigned int cnt)
 
 static int nfp_bpf_finalize(struct bpf_verifier_env *env)
 {
-	unsigned int stack_size, stack_needed;
 	struct bpf_subprog_info *info;
 	struct nfp_prog *nfp_prog;
+	unsigned int max_stack;
 	struct nfp_net *nn;
 	int i;
 
@@ -729,11 +777,12 @@ static int nfp_bpf_finalize(struct bpf_verifier_env *env)
 	}
 
 	nn = netdev_priv(env->prog->aux->offload->netdev);
-	stack_size = nn_readb(nn, NFP_NET_CFG_BPF_STACK_SZ) * 64;
-	stack_needed = nfp_bpf_get_stack_usage(nfp_prog, env->prog->len);
-	if (stack_needed > stack_size) {
+	max_stack = nn_readb(nn, NFP_NET_CFG_BPF_STACK_SZ) * 64;
+	nfp_prog->stack_size = nfp_bpf_get_stack_usage(nfp_prog,
+						       env->prog->len);
+	if (nfp_prog->stack_size > max_stack) {
 		pr_vlog(env, "stack too large: program %dB > FW stack %dB\n",
-			stack_needed, stack_size);
+			nfp_prog->stack_size, max_stack);
 		return -EOPNOTSUPP;
 	}
 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e60fff48288b6..33014ae73103f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -39,6 +39,9 @@ struct bpf_map_ops {
 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
 	int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
 	int (*map_delete_elem)(struct bpf_map *map, void *key);
+	int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags);
+	int (*map_pop_elem)(struct bpf_map *map, void *value);
+	int (*map_peek_elem)(struct bpf_map *map, void *value);
 
 	/* funcs called by prog_array and perf_event_array map */
 	void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
@@ -138,6 +141,7 @@ enum bpf_arg_type {
 	ARG_CONST_MAP_PTR,	/* const argument used as pointer to bpf_map */
 	ARG_PTR_TO_MAP_KEY,	/* pointer to stack used as map key */
 	ARG_PTR_TO_MAP_VALUE,	/* pointer to stack used as map value */
+	ARG_PTR_TO_UNINIT_MAP_VALUE,	/* pointer to valid memory used to store a map value */
 
 	/* the following constraints used to prototype bpf_memcmp() and other
 	 * functions that access data on eBPF program stack
@@ -810,6 +814,9 @@ static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map,
 extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
 extern const struct bpf_func_proto bpf_map_update_elem_proto;
 extern const struct bpf_func_proto bpf_map_delete_elem_proto;
+extern const struct bpf_func_proto bpf_map_push_elem_proto;
+extern const struct bpf_func_proto bpf_map_pop_elem_proto;
+extern const struct bpf_func_proto bpf_map_peek_elem_proto;
 
 extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
 extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index fa48343a5ea17..44d9ab4809bdb 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -51,7 +51,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_HASH, htab_lru_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_PERCPU_HASH, htab_lru_percpu_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_LPM_TRIE, trie_map_ops)
 #ifdef CONFIG_PERF_EVENTS
-BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_trace_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
@@ -69,3 +69,5 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops)
 #endif
 #endif
+BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 5771874bc01e9..91b4c934f02e7 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -548,6 +548,27 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb)
 	cb->data_end  = skb->data + skb_headlen(skb);
 }
 
+/* Similar to bpf_compute_data_pointers(), except that it saves the
+ * original cb->data_end so that it can be restored afterwards.
+ */
+static inline void bpf_compute_and_save_data_end(
+	struct sk_buff *skb, void **saved_data_end)
+{
+	struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
+
+	*saved_data_end = cb->data_end;
+	cb->data_end  = skb->data + skb_headlen(skb);
+}
+
+/* Restore the data_end saved by bpf_compute_and_save_data_end(). */
+static inline void bpf_restore_data_end(
+	struct sk_buff *skb, void *saved_data_end)
+{
+	struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
+
+	cb->data_end = saved_data_end;
+}
+
 static inline u8 *bpf_skb_cb(struct sk_buff *skb)
 {
 	/* eBPF programs may read/write skb->cb[] area to transfer meta
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 0b919f0bc6d66..2a11e9d91dfa8 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -176,6 +176,7 @@ static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src,
 {
 	dst->sg.data[which] = src->sg.data[which];
 	dst->sg.data[which].length  = size;
+	dst->sg.size		   += size;
 	src->sg.data[which].length -= size;
 	src->sg.data[which].offset += size;
 }
@@ -186,21 +187,29 @@ static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src)
 	sk_msg_init(src);
 }
 
+static inline bool sk_msg_full(const struct sk_msg *msg)
+{
+	return (msg->sg.end == msg->sg.start) && msg->sg.size;
+}
+
 static inline u32 sk_msg_elem_used(const struct sk_msg *msg)
 {
+	if (sk_msg_full(msg))
+		return MAX_MSG_FRAGS;
+
 	return msg->sg.end >= msg->sg.start ?
 		msg->sg.end - msg->sg.start :
 		msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start);
 }
 
-static inline bool sk_msg_full(const struct sk_msg *msg)
+static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which)
 {
-	return (msg->sg.end == msg->sg.start) && msg->sg.size;
+	return &msg->sg.data[which];
 }
 
-static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which)
+static inline struct scatterlist sk_msg_elem_cpy(struct sk_msg *msg, int which)
 {
-	return &msg->sg.data[which];
+	return msg->sg.data[which];
 }
 
 static inline struct page *sk_msg_page(struct sk_msg *msg, int which)
@@ -266,11 +275,6 @@ static inline struct sk_psock *sk_psock(const struct sock *sk)
 	return rcu_dereference_sk_user_data(sk);
 }
 
-static inline bool sk_has_psock(struct sock *sk)
-{
-	return sk_psock(sk) != NULL && sk->sk_prot->recvmsg == tcp_bpf_recvmsg;
-}
-
 static inline void sk_psock_queue_msg(struct sk_psock *psock,
 				      struct sk_msg *msg)
 {
@@ -370,6 +374,26 @@ static inline bool sk_psock_test_state(const struct sk_psock *psock,
 	return test_bit(bit, &psock->state);
 }
 
+static inline struct sk_psock *sk_psock_get_checked(struct sock *sk)
+{
+	struct sk_psock *psock;
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (psock) {
+		if (sk->sk_prot->recvmsg != tcp_bpf_recvmsg) {
+			psock = ERR_PTR(-EBUSY);
+			goto out;
+		}
+
+		if (!refcount_inc_not_zero(&psock->refcnt))
+			psock = ERR_PTR(-EBUSY);
+	}
+out:
+	rcu_read_unlock();
+	return psock;
+}
+
 static inline struct sk_psock *sk_psock_get(struct sock *sk)
 {
 	struct sk_psock *psock;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3600ae0f25c34..8a61c3e8c15df 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2051,11 +2051,6 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);
 #define TCP_ULP_MAX		128
 #define TCP_ULP_BUF_MAX		(TCP_ULP_NAME_MAX*TCP_ULP_MAX)
 
-enum {
-	TCP_ULP_TLS,
-	TCP_ULP_BPF,
-};
-
 struct tcp_ulp_ops {
 	struct list_head	list;
 
@@ -2064,9 +2059,7 @@ struct tcp_ulp_ops {
 	/* cleanup ulp */
 	void (*release)(struct sock *sk);
 
-	int		uid;
 	char		name[TCP_ULP_NAME_MAX];
-	bool		user_visible;
 	struct module	*owner;
 };
 int tcp_register_ulp(struct tcp_ulp_ops *type);
@@ -2089,7 +2082,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
 int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		    int nonblock, int flags, int *addr_len);
 int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
-		      struct msghdr *msg, int len);
+		      struct msghdr *msg, int len, int flags);
 
 /* Call BPF_SOCK_OPS program that returns an int. If the return value
  * is < 0, then the BPF op failed (for example if the loaded BPF
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f9187b41dff62..852dc17ab47a0 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -103,6 +103,7 @@ enum bpf_cmd {
 	BPF_BTF_LOAD,
 	BPF_BTF_GET_FD_BY_ID,
 	BPF_TASK_FD_QUERY,
+	BPF_MAP_LOOKUP_AND_DELETE_ELEM,
 };
 
 enum bpf_map_type {
@@ -128,6 +129,8 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_CGROUP_STORAGE,
 	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
 	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
+	BPF_MAP_TYPE_QUEUE,
+	BPF_MAP_TYPE_STACK,
 };
 
 enum bpf_prog_type {
@@ -462,6 +465,28 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags)
+ * 	Description
+ * 		Push an element *value* into *map*. *flags* is one of:
+ *
+ * 		**BPF_EXIST**
+ * 		If the queue/stack is full, the oldest element is removed to
+ * 		make room for the new one.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_pop_elem(struct bpf_map *map, void *value)
+ * 	Description
+ * 		Pop an element from *map*.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_peek_elem(struct bpf_map *map, void *value)
+ * 	Description
+ * 		Get an element from *map* without removing it.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * int bpf_probe_read(void *dst, u32 size, const void *src)
  * 	Description
  * 		For tracing programs, safely attempt to read *size* bytes from
@@ -1433,7 +1458,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
+ * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags)
  * 	Description
  * 		Grow or shrink the room for data in the packet associated to
  * 		*skb* by *len_diff*, and according to the selected *mode*.
@@ -2215,6 +2240,23 @@ union bpf_attr {
  *		pointer that was returned from bpf_sk_lookup_xxx\ ().
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags)
+ *	Description
+ *		For socket policies, insert *len* bytes into *msg* at offset
+ *		*start*.
+ *
+ *		If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+ *		*msg* it may want to insert metadata or options into the
+ *		*msg*. This can later be read and used by any of the lower
+ *		layer BPF hooks.
+ *
+ *		This helper may fail if under memory pressure (an allocation
+ *		fails); in that case the BPF program will get an appropriate
+ *		error and will need to handle it.
+ *
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2303,7 +2345,11 @@ union bpf_attr {
 	FN(skb_ancestor_cgroup_id),	\
 	FN(sk_lookup_tcp),		\
 	FN(sk_lookup_udp),		\
-	FN(sk_release),
+	FN(sk_release),			\
+	FN(map_push_elem),		\
+	FN(map_pop_elem),		\
+	FN(map_peek_elem),		\
+	FN(msg_push_data),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index ff8262626b8fe..4c2fa3ac56f63 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -3,7 +3,7 @@ obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
-obj-$(CONFIG_BPF_SYSCALL) += local_storage.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 obj-$(CONFIG_BPF_SYSCALL) += btf.o
 ifeq ($(CONFIG_NET),y)
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 00f6ed2e4f9a8..9425c2fb872f7 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -553,6 +553,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 {
 	unsigned int offset = skb->data - skb_network_header(skb);
 	struct sock *save_sk;
+	void *saved_data_end;
 	struct cgroup *cgrp;
 	int ret;
 
@@ -566,8 +567,13 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	save_sk = skb->sk;
 	skb->sk = sk;
 	__skb_push(skb, offset);
+
+	/* compute pointers for the bpf prog */
+	bpf_compute_and_save_data_end(skb, &saved_data_end);
+
 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
 				 bpf_prog_run_save_cb);
+	bpf_restore_data_end(skb, saved_data_end);
 	__skb_pull(skb, offset);
 	skb->sk = save_sk;
 	return ret == 1 ? 0 : -EPERM;
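With the data pointers computed and saved around the program run above,
BPF_PROG_TYPE_CGROUP_SKB programs gain direct packet access (the verifier
side of this is cg_skb_is_valid_access() later in this series). Below is a
minimal, hypothetical program-side sketch; SEC() and the helper
declarations are assumed to come from the selftests' bpf_helpers.h, and the
return convention (1 = pass, 0 = drop) follows
__cgroup_bpf_run_filter_skb():

	/* Hypothetical cgroup/skb program using direct packet access.
	 * skb->data points at the network header here.
	 */
	#include <linux/bpf.h>
	#include <linux/ip.h>
	#include "bpf_helpers.h"

	SEC("cgroup/skb")
	int allow_only_ipv4(struct __sk_buff *skb)
	{
		void *data = (void *)(long)skb->data;
		void *data_end = (void *)(long)skb->data_end;
		struct iphdr *iph = data;

		if (data + sizeof(*iph) > data_end)
			return 1;	/* too short to judge, let it pass */

		return iph->version == 4;
	}

	char _license[] SEC("license") = "GPL";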
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index defcf4df6d912..7c7eeea8cffcc 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1783,6 +1783,9 @@ BPF_CALL_0(bpf_user_rnd_u32)
 const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
 const struct bpf_func_proto bpf_map_update_elem_proto __weak;
 const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
+const struct bpf_func_proto bpf_map_push_elem_proto __weak;
+const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
+const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
 
 const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
 const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 6502115e8f55a..ab0d5e3f9892b 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -76,6 +76,49 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = {
 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
 };
 
+BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags)
+{
+	return map->ops->map_push_elem(map, value, flags);
+}
+
+const struct bpf_func_proto bpf_map_push_elem_proto = {
+	.func		= bpf_map_push_elem,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_PTR_TO_MAP_VALUE,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value)
+{
+	return map->ops->map_pop_elem(map, value);
+}
+
+const struct bpf_func_proto bpf_map_pop_elem_proto = {
+	.func		= bpf_map_pop_elem,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MAP_VALUE,
+};
+
+BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value)
+{
+	return map->ops->map_peek_elem(map, value);
+}
+
+const struct bpf_func_proto bpf_map_peek_elem_proto = {
+	.func		= bpf_map_peek_elem,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MAP_VALUE,
+};
+
 const struct bpf_func_proto bpf_get_prandom_u32_proto = {
 	.func		= bpf_user_rnd_u32,
 	.gpl_only	= false,
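The push/pop/peek protos above simply dispatch to the map's ops, so from a
program they are used like the existing map helpers. A hedged sketch of the
BPF-program side; the bpf_map_def/SEC() convention and the helper
declarations are assumed from the selftests accompanying this series:

	/* Hypothetical producer pushing PIDs into a queue map. */
	struct bpf_map_def SEC("maps") event_queue = {
		.type		= BPF_MAP_TYPE_QUEUE,
		.key_size	= 0,	/* queue/stack maps have no key */
		.value_size	= sizeof(__u32),
		.max_entries	= 128,
	};

	SEC("kprobe/sys_nanosleep")
	int producer(struct pt_regs *ctx)
	{
		__u32 pid = bpf_get_current_pid_tgid() >> 32;

		/* BPF_EXIST drops the oldest element if the queue is full */
		bpf_map_push_elem(&event_queue, &pid, BPF_EXIST);
		return 0;
	}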
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
new file mode 100644
index 0000000000000..12a93fb374492
--- /dev/null
+++ b/kernel/bpf/queue_stack_maps.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * queue_stack_maps.c: BPF queue and stack maps
+ *
+ * Copyright (c) 2018 Politecnico di Torino
+ */
+#include <linux/bpf.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include "percpu_freelist.h"
+
+#define QUEUE_STACK_CREATE_FLAG_MASK \
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
+
+struct bpf_queue_stack {
+	struct bpf_map map;
+	raw_spinlock_t lock;
+	u32 head, tail;
+	u32 size; /* max_entries + 1 */
+
+	char elements[0] __aligned(8);
+};
+
+static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map)
+{
+	return container_of(map, struct bpf_queue_stack, map);
+}
+
+static bool queue_stack_map_is_empty(struct bpf_queue_stack *qs)
+{
+	return qs->head == qs->tail;
+}
+
+static bool queue_stack_map_is_full(struct bpf_queue_stack *qs)
+{
+	u32 head = qs->head + 1;
+
+	if (unlikely(head >= qs->size))
+		head = 0;
+
+	return head == qs->tail;
+}
+
+/* Called from syscall */
+static int queue_stack_map_alloc_check(union bpf_attr *attr)
+{
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->key_size != 0 ||
+	    attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK)
+		return -EINVAL;
+
+	if (attr->value_size > KMALLOC_MAX_SIZE)
+		/* if value_size is bigger, the user space won't be able to
+		 * access the elements.
+		 */
+		return -E2BIG;
+
+	return 0;
+}
+
+static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
+{
+	int ret, numa_node = bpf_map_attr_numa_node(attr);
+	struct bpf_queue_stack *qs;
+	u32 size, value_size;
+	u64 queue_size, cost;
+
+	size = attr->max_entries + 1;
+	value_size = attr->value_size;
+
+	queue_size = sizeof(*qs) + (u64) value_size * size;
+
+	cost = queue_size;
+	if (cost >= U32_MAX - PAGE_SIZE)
+		return ERR_PTR(-E2BIG);
+
+	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	ret = bpf_map_precharge_memlock(cost);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	qs = bpf_map_area_alloc(queue_size, numa_node);
+	if (!qs)
+		return ERR_PTR(-ENOMEM);
+
+	memset(qs, 0, sizeof(*qs));
+
+	bpf_map_init_from_attr(&qs->map, attr);
+
+	qs->map.pages = cost;
+	qs->size = size;
+
+	raw_spin_lock_init(&qs->lock);
+
+	return &qs->map;
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void queue_stack_map_free(struct bpf_map *map)
+{
+	struct bpf_queue_stack *qs = bpf_queue_stack(map);
+
+	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+	 * so the programs (can be more than one that used this map) were
+	 * disconnected from events. Wait for outstanding critical sections in
+	 * these programs to complete
+	 */
+	synchronize_rcu();
+
+	bpf_map_area_free(qs);
+}
+
+static int __queue_map_get(struct bpf_map *map, void *value, bool delete)
+{
+	struct bpf_queue_stack *qs = bpf_queue_stack(map);
+	unsigned long flags;
+	int err = 0;
+	void *ptr;
+
+	raw_spin_lock_irqsave(&qs->lock, flags);
+
+	if (queue_stack_map_is_empty(qs)) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	ptr = &qs->elements[qs->tail * qs->map.value_size];
+	memcpy(value, ptr, qs->map.value_size);
+
+	if (delete) {
+		if (unlikely(++qs->tail >= qs->size))
+			qs->tail = 0;
+	}
+
+out:
+	raw_spin_unlock_irqrestore(&qs->lock, flags);
+	return err;
+}
+
+
+static int __stack_map_get(struct bpf_map *map, void *value, bool delete)
+{
+	struct bpf_queue_stack *qs = bpf_queue_stack(map);
+	unsigned long flags;
+	int err = 0;
+	void *ptr;
+	u32 index;
+
+	raw_spin_lock_irqsave(&qs->lock, flags);
+
+	if (queue_stack_map_is_empty(qs)) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	index = qs->head - 1;
+	if (unlikely(index >= qs->size))
+		index = qs->size - 1;
+
+	ptr = &qs->elements[index * qs->map.value_size];
+	memcpy(value, ptr, qs->map.value_size);
+
+	if (delete)
+		qs->head = index;
+
+out:
+	raw_spin_unlock_irqrestore(&qs->lock, flags);
+	return err;
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_map_peek_elem(struct bpf_map *map, void *value)
+{
+	return __queue_map_get(map, value, false);
+}
+
+/* Called from syscall or from eBPF program */
+static int stack_map_peek_elem(struct bpf_map *map, void *value)
+{
+	return __stack_map_get(map, value, false);
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_map_pop_elem(struct bpf_map *map, void *value)
+{
+	return __queue_map_get(map, value, true);
+}
+
+/* Called from syscall or from eBPF program */
+static int stack_map_pop_elem(struct bpf_map *map, void *value)
+{
+	return __stack_map_get(map, value, true);
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_stack_map_push_elem(struct bpf_map *map, void *value,
+				     u64 flags)
+{
+	struct bpf_queue_stack *qs = bpf_queue_stack(map);
+	unsigned long irq_flags;
+	int err = 0;
+	void *dst;
+
+	/* BPF_EXIST is used to force making room for a new element in case the
+	 * map is full.
+	 */
+	bool replace = (flags & BPF_EXIST);
+
+	/* Check supported flags for queue and stack maps */
+	if (flags & BPF_NOEXIST || flags > BPF_EXIST)
+		return -EINVAL;
+
+	raw_spin_lock_irqsave(&qs->lock, irq_flags);
+
+	if (queue_stack_map_is_full(qs)) {
+		if (!replace) {
+			err = -E2BIG;
+			goto out;
+		}
+		/* advance tail pointer to overwrite oldest element */
+		if (unlikely(++qs->tail >= qs->size))
+			qs->tail = 0;
+	}
+
+	dst = &qs->elements[qs->head * qs->map.value_size];
+	memcpy(dst, value, qs->map.value_size);
+
+	if (unlikely(++qs->head >= qs->size))
+		qs->head = 0;
+
+out:
+	raw_spin_unlock_irqrestore(&qs->lock, irq_flags);
+	return err;
+}
+
+/* Called from syscall or from eBPF program */
+static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	return NULL;
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_stack_map_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 flags)
+{
+	return -EINVAL;
+}
+
+/* Called from syscall or from eBPF program */
+static int queue_stack_map_delete_elem(struct bpf_map *map, void *key)
+{
+	return -EINVAL;
+}
+
+/* Called from syscall */
+static int queue_stack_map_get_next_key(struct bpf_map *map, void *key,
+					void *next_key)
+{
+	return -EINVAL;
+}
+
+const struct bpf_map_ops queue_map_ops = {
+	.map_alloc_check = queue_stack_map_alloc_check,
+	.map_alloc = queue_stack_map_alloc,
+	.map_free = queue_stack_map_free,
+	.map_lookup_elem = queue_stack_map_lookup_elem,
+	.map_update_elem = queue_stack_map_update_elem,
+	.map_delete_elem = queue_stack_map_delete_elem,
+	.map_push_elem = queue_stack_map_push_elem,
+	.map_pop_elem = queue_map_pop_elem,
+	.map_peek_elem = queue_map_peek_elem,
+	.map_get_next_key = queue_stack_map_get_next_key,
+};
+
+const struct bpf_map_ops stack_map_ops = {
+	.map_alloc_check = queue_stack_map_alloc_check,
+	.map_alloc = queue_stack_map_alloc,
+	.map_free = queue_stack_map_free,
+	.map_lookup_elem = queue_stack_map_lookup_elem,
+	.map_update_elem = queue_stack_map_update_elem,
+	.map_delete_elem = queue_stack_map_delete_elem,
+	.map_push_elem = queue_stack_map_push_elem,
+	.map_pop_elem = stack_map_pop_elem,
+	.map_peek_elem = stack_map_peek_elem,
+	.map_get_next_key = queue_stack_map_get_next_key,
+};
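From user space the same map is driven through the generic syscall
commands: BPF_MAP_UPDATE_ELEM acts as push, BPF_MAP_LOOKUP_ELEM as peek,
and the new BPF_MAP_LOOKUP_AND_DELETE_ELEM as pop, all with a NULL key. A
minimal sketch using the libbpf wrappers (bpf_map_lookup_and_delete_elem()
is added to libbpf elsewhere in this series; its availability is assumed
here):

	#include <assert.h>
	#include <bpf/bpf.h>

	int main(void)
	{
		__u32 val = 42, out = 0;
		int fd;

		fd = bpf_create_map(BPF_MAP_TYPE_QUEUE, 0 /* key_size */,
				    sizeof(__u32), 8 /* max_entries */, 0);
		assert(fd >= 0);

		assert(!bpf_map_update_elem(fd, NULL, &val, 0));	  /* push */
		assert(!bpf_map_lookup_elem(fd, NULL, &out));		  /* peek */
		assert(!bpf_map_lookup_and_delete_elem(fd, NULL, &out)); /* pop */
		assert(out == 42);
		return 0;
	}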
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index b2ade10f7ec3e..90daf285de032 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -600,7 +600,7 @@ static void stack_map_free(struct bpf_map *map)
 	put_callchain_buffers();
 }
 
-const struct bpf_map_ops stack_map_ops = {
+const struct bpf_map_ops stack_trace_map_ops = {
 	.map_alloc = stack_map_alloc,
 	.map_free = stack_map_free,
 	.map_get_next_key = stack_map_get_next_key,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f4ecd6ed22522..ccb93277aae2c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -651,6 +651,17 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
 	return -ENOTSUPP;
 }
 
+static void *__bpf_copy_key(void __user *ukey, u64 key_size)
+{
+	if (key_size)
+		return memdup_user(ukey, key_size);
+
+	if (ukey)
+		return ERR_PTR(-EINVAL);
+
+	return NULL;
+}
+
 /* last field in 'union bpf_attr' used by this command */
 #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
 
@@ -678,7 +689,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
-	key = memdup_user(ukey, map->key_size);
+	key = __bpf_copy_key(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
 		goto err_put;
@@ -716,6 +727,9 @@ static int map_lookup_elem(union bpf_attr *attr)
 		err = bpf_fd_htab_map_lookup_elem(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
 		err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+		   map->map_type == BPF_MAP_TYPE_STACK) {
+		err = map->ops->map_peek_elem(map, value);
 	} else {
 		rcu_read_lock();
 		ptr = map->ops->map_lookup_elem(map, key);
@@ -785,7 +799,7 @@ static int map_update_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
-	key = memdup_user(ukey, map->key_size);
+	key = __bpf_copy_key(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
 		goto err_put;
@@ -846,6 +860,9 @@ static int map_update_elem(union bpf_attr *attr)
 		/* rcu_read_lock() is not needed */
 		err = bpf_fd_reuseport_array_update_elem(map, key, value,
 							 attr->flags);
+	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+		   map->map_type == BPF_MAP_TYPE_STACK) {
+		err = map->ops->map_push_elem(map, value, attr->flags);
 	} else {
 		rcu_read_lock();
 		err = map->ops->map_update_elem(map, key, value, attr->flags);
@@ -888,7 +905,7 @@ static int map_delete_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
-	key = memdup_user(ukey, map->key_size);
+	key = __bpf_copy_key(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
 		goto err_put;
@@ -941,7 +958,7 @@ static int map_get_next_key(union bpf_attr *attr)
 	}
 
 	if (ukey) {
-		key = memdup_user(ukey, map->key_size);
+		key = __bpf_copy_key(ukey, map->key_size);
 		if (IS_ERR(key)) {
 			err = PTR_ERR(key);
 			goto err_put;
@@ -982,6 +999,69 @@ static int map_get_next_key(union bpf_attr *attr)
 	return err;
 }
 
+#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
+
+static int map_lookup_and_delete_elem(union bpf_attr *attr)
+{
+	void __user *ukey = u64_to_user_ptr(attr->key);
+	void __user *uvalue = u64_to_user_ptr(attr->value);
+	int ufd = attr->map_fd;
+	struct bpf_map *map;
+	void *key, *value;
+	u32 value_size;
+	struct fd f;
+	int err;
+
+	if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
+		return -EINVAL;
+
+	f = fdget(ufd);
+	map = __bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
+	key = __bpf_copy_key(ukey, map->key_size);
+	if (IS_ERR(key)) {
+		err = PTR_ERR(key);
+		goto err_put;
+	}
+
+	value_size = map->value_size;
+
+	err = -ENOMEM;
+	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+	if (!value)
+		goto free_key;
+
+	if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+	    map->map_type == BPF_MAP_TYPE_STACK) {
+		err = map->ops->map_pop_elem(map, value);
+	} else {
+		err = -ENOTSUPP;
+	}
+
+	if (err)
+		goto free_value;
+
+	if (copy_to_user(uvalue, value, value_size) != 0) {
+		err = -EFAULT;
+		goto free_value;
+	}
+
+	err = 0;
+
+free_value:
+	kfree(value);
+free_key:
+	kfree(key);
+err_put:
+	fdput(f);
+	return err;
+}
+
 static const struct bpf_prog_ops * const bpf_prog_types[] = {
 #define BPF_PROG_TYPE(_id, _name) \
 	[_id] = & _name ## _prog_ops,
@@ -2455,6 +2535,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_TASK_FD_QUERY:
 		err = bpf_task_fd_query(&attr, uattr);
 		break;
+	case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
+		err = map_lookup_and_delete_elem(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
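For callers not going through libbpf, the new command is reachable with a
raw bpf(2) call. A hypothetical wrapper (the function name is
illustrative):

	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/bpf.h>

	static int map_pop(int fd, void *value)
	{
		union bpf_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.map_fd = fd;
		attr.key = 0;	/* queue/stack maps take a NULL key */
		attr.value = (__u64)(unsigned long)value;

		return syscall(__NR_bpf, BPF_MAP_LOOKUP_AND_DELETE_ELEM,
			       &attr, sizeof(attr));
	}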
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3f93a548a6426..98fa0be35370b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1528,14 +1528,19 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
 	return reg->type != SCALAR_VALUE;
 }
 
+static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
+{
+	return cur_regs(env) + regno;
+}
+
 static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
 {
-	return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno);
+	return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno));
 }
 
 static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
 {
-	const struct bpf_reg_state *reg = cur_regs(env) + regno;
+	const struct bpf_reg_state *reg = reg_state(env, regno);
 
 	return reg->type == PTR_TO_CTX ||
 	       reg->type == PTR_TO_SOCKET;
@@ -1543,11 +1548,19 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
 
 static bool is_pkt_reg(struct bpf_verifier_env *env, int regno)
 {
-	const struct bpf_reg_state *reg = cur_regs(env) + regno;
+	const struct bpf_reg_state *reg = reg_state(env, regno);
 
 	return type_is_pkt_pointer(reg->type);
 }
 
+static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno)
+{
+	const struct bpf_reg_state *reg = reg_state(env, regno);
+
+	/* Kept separate from is_ctx_reg() since we still want to allow BPF_ST here. */
+	return reg->type == PTR_TO_FLOW_KEYS;
+}
+
 static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
 				   const struct bpf_reg_state *reg,
 				   int off, int size, bool strict)
@@ -1956,9 +1969,11 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
 	}
 
 	if (is_ctx_reg(env, insn->dst_reg) ||
-	    is_pkt_reg(env, insn->dst_reg)) {
+	    is_pkt_reg(env, insn->dst_reg) ||
+	    is_flow_key_reg(env, insn->dst_reg)) {
 		verbose(env, "BPF_XADD stores into R%d %s is not allowed\n",
-			insn->dst_reg, reg_type_str[insn->dst_reg]);
+			insn->dst_reg,
+			reg_type_str[reg_state(env, insn->dst_reg)->type]);
 		return -EACCES;
 	}
 
@@ -1983,7 +1998,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 				int access_size, bool zero_size_allowed,
 				struct bpf_call_arg_meta *meta)
 {
-	struct bpf_reg_state *reg = cur_regs(env) + regno;
+	struct bpf_reg_state *reg = reg_state(env, regno);
 	struct bpf_func_state *state = func(env, reg);
 	int off, i, slot, spi;
 
@@ -2062,8 +2077,6 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 	case PTR_TO_PACKET_META:
 		return check_packet_access(env, regno, reg->off, access_size,
 					   zero_size_allowed);
-	case PTR_TO_FLOW_KEYS:
-		return check_flow_keys_access(env, reg->off, access_size);
 	case PTR_TO_MAP_VALUE:
 		return check_map_access(env, regno, reg->off, access_size,
 					zero_size_allowed);
@@ -2117,7 +2130,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 	}
 
 	if (arg_type == ARG_PTR_TO_MAP_KEY ||
-	    arg_type == ARG_PTR_TO_MAP_VALUE) {
+	    arg_type == ARG_PTR_TO_MAP_VALUE ||
+	    arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
 		expected_type = PTR_TO_STACK;
 		if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
 		    type != expected_type)
@@ -2187,7 +2201,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		err = check_helper_mem_access(env, regno,
 					      meta->map_ptr->key_size, false,
 					      NULL);
-	} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
+	} else if (arg_type == ARG_PTR_TO_MAP_VALUE ||
+		   arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
 		/* bpf_map_xxx(..., map_ptr, ..., value) call:
 		 * check [value, value + map->value_size) validity
 		 */
@@ -2196,9 +2211,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose(env, "invalid map_ptr to access map->value\n");
 			return -EACCES;
 		}
+		meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE);
 		err = check_helper_mem_access(env, regno,
 					      meta->map_ptr->value_size, false,
-					      NULL);
+					      meta);
 	} else if (arg_type_is_mem_size(arg_type)) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 
@@ -2321,6 +2337,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (func_id != BPF_FUNC_sk_select_reuseport)
 			goto error;
 		break;
+	case BPF_MAP_TYPE_QUEUE:
+	case BPF_MAP_TYPE_STACK:
+		if (func_id != BPF_FUNC_map_peek_elem &&
+		    func_id != BPF_FUNC_map_pop_elem &&
+		    func_id != BPF_FUNC_map_push_elem)
+			goto error;
+		break;
 	default:
 		break;
 	}
@@ -2377,6 +2400,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY)
 			goto error;
 		break;
+	case BPF_FUNC_map_peek_elem:
+	case BPF_FUNC_map_pop_elem:
+	case BPF_FUNC_map_push_elem:
+		if (map->map_type != BPF_MAP_TYPE_QUEUE &&
+		    map->map_type != BPF_MAP_TYPE_STACK)
+			goto error;
+		break;
 	default:
 		break;
 	}
@@ -2672,7 +2702,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 	if (func_id != BPF_FUNC_tail_call &&
 	    func_id != BPF_FUNC_map_lookup_elem &&
 	    func_id != BPF_FUNC_map_update_elem &&
-	    func_id != BPF_FUNC_map_delete_elem)
+	    func_id != BPF_FUNC_map_delete_elem &&
+	    func_id != BPF_FUNC_map_push_elem &&
+	    func_id != BPF_FUNC_map_pop_elem &&
+	    func_id != BPF_FUNC_map_peek_elem)
 		return 0;
 
 	if (meta->map_ptr == NULL) {
@@ -5244,7 +5277,8 @@ static int do_check(struct bpf_verifier_env *env)
 
 			if (is_ctx_reg(env, insn->dst_reg)) {
 				verbose(env, "BPF_ST stores into R%d %s is not allowed\n",
-					insn->dst_reg, reg_type_str[insn->dst_reg]);
+					insn->dst_reg,
+					reg_type_str[reg_state(env, insn->dst_reg)->type]);
 				return -EACCES;
 			}
 
@@ -6144,7 +6178,10 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 		if (prog->jit_requested && BITS_PER_LONG == 64 &&
 		    (insn->imm == BPF_FUNC_map_lookup_elem ||
 		     insn->imm == BPF_FUNC_map_update_elem ||
-		     insn->imm == BPF_FUNC_map_delete_elem)) {
+		     insn->imm == BPF_FUNC_map_delete_elem ||
+		     insn->imm == BPF_FUNC_map_push_elem   ||
+		     insn->imm == BPF_FUNC_map_pop_elem    ||
+		     insn->imm == BPF_FUNC_map_peek_elem)) {
 			aux = &env->insn_aux_data[i + delta];
 			if (bpf_map_ptr_poisoned(aux))
 				goto patch_call_imm;
@@ -6177,6 +6214,14 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			BUILD_BUG_ON(!__same_type(ops->map_update_elem,
 				     (int (*)(struct bpf_map *map, void *key, void *value,
 					      u64 flags))NULL));
+			BUILD_BUG_ON(!__same_type(ops->map_push_elem,
+				     (int (*)(struct bpf_map *map, void *value,
+					      u64 flags))NULL));
+			BUILD_BUG_ON(!__same_type(ops->map_pop_elem,
+				     (int (*)(struct bpf_map *map, void *value))NULL));
+			BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
+				     (int (*)(struct bpf_map *map, void *value))NULL));
+
 			switch (insn->imm) {
 			case BPF_FUNC_map_lookup_elem:
 				insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) -
@@ -6190,6 +6235,18 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 				insn->imm = BPF_CAST_CALL(ops->map_delete_elem) -
 					    __bpf_call_base;
 				continue;
+			case BPF_FUNC_map_push_elem:
+				insn->imm = BPF_CAST_CALL(ops->map_push_elem) -
+					    __bpf_call_base;
+				continue;
+			case BPF_FUNC_map_pop_elem:
+				insn->imm = BPF_CAST_CALL(ops->map_pop_elem) -
+					    __bpf_call_base;
+				continue;
+			case BPF_FUNC_map_peek_elem:
+				insn->imm = BPF_CAST_CALL(ops->map_peek_elem) -
+					    __bpf_call_base;
+				continue;
 			}
 
 			goto patch_call_imm;
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 0c423b8cd75cc..c89c22c49015f 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -10,6 +10,8 @@
 #include <linux/etherdevice.h>
 #include <linux/filter.h>
 #include <linux/sched/signal.h>
+#include <net/sock.h>
+#include <net/tcp.h>
 
 static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx,
 		struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
@@ -115,6 +117,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 	u32 retval, duration;
 	int hh_len = ETH_HLEN;
 	struct sk_buff *skb;
+	struct sock *sk;
 	void *data;
 	int ret;
 
@@ -137,11 +140,21 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 		break;
 	}
 
+	sk = kzalloc(sizeof(struct sock), GFP_USER);
+	if (!sk) {
+		kfree(data);
+		return -ENOMEM;
+	}
+	sock_net_set(sk, current->nsproxy->net_ns);
+	sock_init_data(NULL, sk);
+
 	skb = build_skb(data, 0);
 	if (!skb) {
 		kfree(data);
+		kfree(sk);
 		return -ENOMEM;
 	}
+	skb->sk = sk;
 
 	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
 	__skb_put(skb, size);
@@ -159,6 +172,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 
 			if (pskb_expand_head(skb, nhead, 0, GFP_USER)) {
 				kfree_skb(skb);
+				kfree(sk);
 				return -ENOMEM;
 			}
 		}
@@ -171,6 +185,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 		size = skb_headlen(skb);
 	ret = bpf_test_finish(kattr, uattr, skb->data, size, retval, duration);
 	kfree_skb(skb);
+	kfree(sk);
 	return ret;
 }
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 1a3ac6c468735..35c6933c26229 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2297,6 +2297,137 @@ static const struct bpf_func_proto bpf_msg_pull_data_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
+	   u32, len, u64, flags)
+{
+	struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
+	u32 new, i = 0, l, space, copy = 0, offset = 0;
+	u8 *raw, *to, *from;
+	struct page *page;
+
+	if (unlikely(flags))
+		return -EINVAL;
+
+	/* First find the starting scatterlist element */
+	i = msg->sg.start;
+	do {
+		l = sk_msg_elem(msg, i)->length;
+
+		if (start < offset + l)
+			break;
+		offset += l;
+		sk_msg_iter_var_next(i);
+	} while (i != msg->sg.end);
+
+	if (start >= offset + l)
+		return -EINVAL;
+
+	space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
+
+	/* If no space is available we will fall back to copy: we need at
+	 * least one scatterlist elem available to push data into when
+	 * start aligns to the beginning of an element, or two when it
+	 * falls inside an element. We handle the start equals offset
+	 * case separately because it is the common case for inserting
+	 * a header.
+	 */
+	if (!space || (space == 1 && start != offset))
+		copy = msg->sg.data[i].length;
+
+	page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
+			   get_order(copy + len));
+	if (unlikely(!page))
+		return -ENOMEM;
+
+	if (copy) {
+		int front, back;
+
+		raw = page_address(page);
+
+		psge = sk_msg_elem(msg, i);
+		front = start - offset;
+		back = psge->length - front;
+		from = sg_virt(psge);
+
+		if (front)
+			memcpy(raw, from, front);
+
+		if (back) {
+			from += front;
+			to = raw + front + len;
+
+			memcpy(to, from, back);
+		}
+
+		put_page(sg_page(psge));
+	} else if (start - offset) {
+		psge = sk_msg_elem(msg, i);
+		rsge = sk_msg_elem_cpy(msg, i);
+
+		psge->length = start - offset;
+		rsge.length -= psge->length;
+		rsge.offset += start;
+
+		sk_msg_iter_var_next(i);
+		sg_unmark_end(psge);
+		sk_msg_iter_next(msg, end);
+	}
+
+	/* Slot(s) to place newly allocated data */
+	new = i;
+
+	/* Shift one or two slots as needed */
+	if (!copy) {
+		sge = sk_msg_elem_cpy(msg, i);
+
+		sk_msg_iter_var_next(i);
+		sg_unmark_end(&sge);
+		sk_msg_iter_next(msg, end);
+
+		nsge = sk_msg_elem_cpy(msg, i);
+		if (rsge.length) {
+			sk_msg_iter_var_next(i);
+			nnsge = sk_msg_elem_cpy(msg, i);
+		}
+
+		while (i != msg->sg.end) {
+			msg->sg.data[i] = sge;
+			sge = nsge;
+			sk_msg_iter_var_next(i);
+			if (rsge.length) {
+				nsge = nnsge;
+				nnsge = sk_msg_elem_cpy(msg, i);
+			} else {
+				nsge = sk_msg_elem_cpy(msg, i);
+			}
+		}
+	}
+
+	/* Place newly allocated data buffer */
+	sk_mem_charge(msg->sk, len);
+	msg->sg.size += len;
+	msg->sg.copy[new] = false;
+	sg_set_page(&msg->sg.data[new], page, len + copy, 0);
+	if (rsge.length) {
+		get_page(sg_page(&rsge));
+		sk_msg_iter_var_next(new);
+		msg->sg.data[new] = rsge;
+	}
+
+	sk_msg_compute_data_pointers(msg);
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_push_data_proto = {
+	.func		= bpf_msg_push_data,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@@ -4854,6 +4985,7 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_xdp_adjust_head ||
 	    func == bpf_xdp_adjust_meta ||
 	    func == bpf_msg_pull_data ||
+	    func == bpf_msg_push_data ||
 	    func == bpf_xdp_adjust_tail ||
 #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
 	    func == bpf_lwt_seg6_store_bytes ||
@@ -4876,6 +5008,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_map_update_elem_proto;
 	case BPF_FUNC_map_delete_elem:
 		return &bpf_map_delete_elem_proto;
+	case BPF_FUNC_map_push_elem:
+		return &bpf_map_push_elem_proto;
+	case BPF_FUNC_map_pop_elem:
+		return &bpf_map_pop_elem_proto;
+	case BPF_FUNC_map_peek_elem:
+		return &bpf_map_peek_elem_proto;
 	case BPF_FUNC_get_prandom_u32:
 		return &bpf_get_prandom_u32_proto;
 	case BPF_FUNC_get_smp_processor_id:
@@ -5124,6 +5262,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_msg_cork_bytes_proto;
 	case BPF_FUNC_msg_pull_data:
 		return &bpf_msg_pull_data_proto;
+	case BPF_FUNC_msg_push_data:
+		return &bpf_msg_push_data_proto;
 	case BPF_FUNC_get_local_storage:
 		return &bpf_get_local_storage_proto;
 	default:
@@ -5346,6 +5486,40 @@ static bool sk_filter_is_valid_access(int off, int size,
 	return bpf_skb_is_valid_access(off, size, type, prog, info);
 }
 
+static bool cg_skb_is_valid_access(int off, int size,
+				   enum bpf_access_type type,
+				   const struct bpf_prog *prog,
+				   struct bpf_insn_access_aux *info)
+{
+	switch (off) {
+	case bpf_ctx_range(struct __sk_buff, tc_classid):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
+	case bpf_ctx_range(struct __sk_buff, flow_keys):
+		return false;
+	}
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case bpf_ctx_range(struct __sk_buff, mark):
+		case bpf_ctx_range(struct __sk_buff, priority):
+		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+			break;
+		default:
+			return false;
+		}
+	}
+
+	switch (off) {
+	case bpf_ctx_range(struct __sk_buff, data):
+		info->reg_type = PTR_TO_PACKET;
+		break;
+	case bpf_ctx_range(struct __sk_buff, data_end):
+		info->reg_type = PTR_TO_PACKET_END;
+		break;
+	}
+
+	return bpf_skb_is_valid_access(off, size, type, prog, info);
+}
+
 static bool lwt_is_valid_access(int off, int size,
 				enum bpf_access_type type,
 				const struct bpf_prog *prog,
@@ -7038,7 +7212,7 @@ const struct bpf_prog_ops xdp_prog_ops = {
 
 const struct bpf_verifier_ops cg_skb_verifier_ops = {
 	.get_func_proto		= cg_skb_func_proto,
-	.is_valid_access	= sk_filter_is_valid_access,
+	.is_valid_access	= cg_skb_is_valid_access,
 	.convert_ctx_access	= bpf_convert_ctx_access,
 };
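A hedged sketch of bpf_msg_push_data() from the program side, reserving
room for a small header in front of each message. SEC() and the helper
declaration are assumed from the selftests' bpf_helpers.h; note the new
bytes are not zeroed, so the program is expected to fill them in:

	SEC("sk_msg")
	int prepend_header(struct sk_msg_md *msg)
	{
		/* open an 8-byte gap at the start of the message */
		if (bpf_msg_push_data(msg, 0, 8, 0))
			return SK_DROP;	/* allocation failed under pressure */

		return SK_PASS;
	}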
 
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 3c0e44cb811a4..be6092ac69f8a 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -175,12 +175,13 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
 		}
 	}
 
-	psock = sk_psock_get(sk);
+	psock = sk_psock_get_checked(sk);
+	if (IS_ERR(psock)) {
+		ret = PTR_ERR(psock);
+		goto out_progs;
+	}
+
 	if (psock) {
-		if (!sk_has_psock(sk)) {
-			ret = -EBUSY;
-			goto out_progs;
-		}
 		if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
 		    (skb_progs  && READ_ONCE(psock->progs.skb_parser))) {
 			sk_psock_put(sk, psock);
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 80debb0daf374..b7918d4caa300 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -39,17 +39,19 @@ static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
 }
 
 int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
-		      struct msghdr *msg, int len)
+		      struct msghdr *msg, int len, int flags)
 {
 	struct iov_iter *iter = &msg->msg_iter;
+	int peek = flags & MSG_PEEK;
 	int i, ret, copied = 0;
+	struct sk_msg *msg_rx;
+
+	msg_rx = list_first_entry_or_null(&psock->ingress_msg,
+					  struct sk_msg, list);
 
 	while (copied != len) {
 		struct scatterlist *sge;
-		struct sk_msg *msg_rx;
 
-		msg_rx = list_first_entry_or_null(&psock->ingress_msg,
-						  struct sk_msg, list);
 		if (unlikely(!msg_rx))
 			break;
 
@@ -70,21 +72,30 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 			}
 
 			copied += copy;
-			sge->offset += copy;
-			sge->length -= copy;
-			sk_mem_uncharge(sk, copy);
-			if (!sge->length) {
-				i++;
-				if (i == MAX_SKB_FRAGS)
-					i = 0;
-				if (!msg_rx->skb)
-					put_page(page);
+			if (likely(!peek)) {
+				sge->offset += copy;
+				sge->length -= copy;
+				sk_mem_uncharge(sk, copy);
+				msg_rx->sg.size -= copy;
+
+				if (!sge->length) {
+					sk_msg_iter_var_next(i);
+					if (!msg_rx->skb)
+						put_page(page);
+				}
+			} else {
+				sk_msg_iter_var_next(i);
 			}
 
 			if (copied == len)
 				break;
 		} while (i != msg_rx->sg.end);
 
+		if (unlikely(peek)) {
+			msg_rx = list_next_entry(msg_rx, list);
+			continue;
+		}
+
 		msg_rx->sg.start = i;
 		if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
 			list_del(&msg_rx->list);
@@ -92,6 +103,8 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 				consume_skb(msg_rx->skb);
 			kfree(msg_rx);
 		}
+		msg_rx = list_first_entry_or_null(&psock->ingress_msg,
+						  struct sk_msg, list);
 	}
 
 	return copied;
@@ -114,7 +127,7 @@ int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
 	lock_sock(sk);
 msg_bytes_ready:
-	copied = __tcp_bpf_recvmsg(sk, psock, msg, len);
+	copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
 	if (!copied) {
 		int data, err = 0;
 		long timeo;
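With the flags argument plumbed through, a receiver on a sockmap ingress
queue can inspect data without consuming it. A small user-space sketch:

	#include <sys/socket.h>

	static ssize_t peek_then_read(int fd, char *buf, size_t len)
	{
		ssize_t n;

		n = recv(fd, buf, len, MSG_PEEK);	/* data stays queued */
		if (n <= 0)
			return n;

		return recv(fd, buf, len, 0);		/* now consume it */
	}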
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index e90b6d5370774..311cec8e533de 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -715,8 +715,6 @@ EXPORT_SYMBOL(tls_unregister_device);
 
 static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
 	.name			= "tls",
-	.uid			= TCP_ULP_TLS,
-	.user_visible		= true,
 	.owner			= THIS_MODULE,
 	.init			= tls_init,
 };
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index a525fc4c2a4b9..5cd88ba8acd17 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1478,7 +1478,8 @@ int tls_sw_recvmsg(struct sock *sk,
 		skb = tls_wait_data(sk, psock, flags, timeo, &err);
 		if (!skb) {
 			if (psock) {
-				int ret = __tcp_bpf_recvmsg(sk, psock, msg, len);
+				int ret = __tcp_bpf_recvmsg(sk, psock,
+							    msg, len, flags);
 
 				if (ret > 0) {
 					copied += ret;
diff --git a/tools/arch/arm64/include/asm/barrier.h b/tools/arch/arm64/include/asm/barrier.h
index 40bde6b235013..12835ea0e4173 100644
--- a/tools/arch/arm64/include/asm/barrier.h
+++ b/tools/arch/arm64/include/asm/barrier.h
@@ -14,4 +14,74 @@
 #define wmb()		asm volatile("dmb ishst" ::: "memory")
 #define rmb()		asm volatile("dmb ishld" ::: "memory")
 
+#define smp_store_release(p, v)					\
+do {								\
+	union { typeof(*p) __val; char __c[1]; } __u =		\
+		{ .__val = (__force typeof(*p)) (v) }; 		\
+								\
+	switch (sizeof(*p)) {					\
+	case 1:							\
+		asm volatile ("stlrb %w1, %0"			\
+				: "=Q" (*p)			\
+				: "r" (*(__u8 *)__u.__c)	\
+				: "memory");			\
+		break;						\
+	case 2:							\
+		asm volatile ("stlrh %w1, %0"			\
+				: "=Q" (*p)			\
+				: "r" (*(__u16 *)__u.__c)	\
+				: "memory");			\
+		break;						\
+	case 4:							\
+		asm volatile ("stlr %w1, %0"			\
+				: "=Q" (*p)			\
+				: "r" (*(__u32 *)__u.__c)	\
+				: "memory");			\
+		break;						\
+	case 8:							\
+		asm volatile ("stlr %1, %0"			\
+				: "=Q" (*p)			\
+				: "r" (*(__u64 *)__u.__c)	\
+				: "memory");			\
+		break;						\
+	default:						\
+		/* Only to shut up gcc ... */			\
+		mb();						\
+		break;						\
+	}							\
+} while (0)
+
+#define smp_load_acquire(p)					\
+({								\
+	union { typeof(*p) __val; char __c[1]; } __u;		\
+								\
+	switch (sizeof(*p)) {					\
+	case 1:							\
+		asm volatile ("ldarb %w0, %1"			\
+			: "=r" (*(__u8 *)__u.__c)		\
+			: "Q" (*p) : "memory");			\
+		break;						\
+	case 2:							\
+		asm volatile ("ldarh %w0, %1"			\
+			: "=r" (*(__u16 *)__u.__c)		\
+			: "Q" (*p) : "memory");			\
+		break;						\
+	case 4:							\
+		asm volatile ("ldar %w0, %1"			\
+			: "=r" (*(__u32 *)__u.__c)		\
+			: "Q" (*p) : "memory");			\
+		break;						\
+	case 8:							\
+		asm volatile ("ldar %0, %1"			\
+			: "=r" (*(__u64 *)__u.__c)		\
+			: "Q" (*p) : "memory");			\
+		break;						\
+	default:						\
+		/* Only to shut up gcc ... */			\
+		mb();						\
+		break;						\
+	}							\
+	__u.__val;						\
+})
+
 #endif /* _TOOLS_LINUX_ASM_AARCH64_BARRIER_H */
diff --git a/tools/arch/ia64/include/asm/barrier.h b/tools/arch/ia64/include/asm/barrier.h
index d808ee0e77b58..4d471d9511a54 100644
--- a/tools/arch/ia64/include/asm/barrier.h
+++ b/tools/arch/ia64/include/asm/barrier.h
@@ -46,4 +46,17 @@
 #define rmb()		mb()
 #define wmb()		mb()
 
+#define smp_store_release(p, v)			\
+do {						\
+	barrier();				\
+	WRITE_ONCE(*p, v);			\
+} while (0)
+
+#define smp_load_acquire(p)			\
+({						\
+	typeof(*p) ___p1 = READ_ONCE(*p);	\
+	barrier();				\
+	___p1;					\
+})
+
 #endif /* _TOOLS_LINUX_ASM_IA64_BARRIER_H */
diff --git a/tools/arch/powerpc/include/asm/barrier.h b/tools/arch/powerpc/include/asm/barrier.h
index a634da05bc973..905a2c66d96d9 100644
--- a/tools/arch/powerpc/include/asm/barrier.h
+++ b/tools/arch/powerpc/include/asm/barrier.h
@@ -27,4 +27,20 @@
 #define rmb()  __asm__ __volatile__ ("sync" : : : "memory")
 #define wmb()  __asm__ __volatile__ ("sync" : : : "memory")
 
+#if defined(__powerpc64__)
+#define smp_lwsync()	__asm__ __volatile__ ("lwsync" : : : "memory")
+
+#define smp_store_release(p, v)			\
+do {						\
+	smp_lwsync();				\
+	WRITE_ONCE(*p, v);			\
+} while (0)
+
+#define smp_load_acquire(p)			\
+({						\
+	typeof(*p) ___p1 = READ_ONCE(*p);	\
+	smp_lwsync();				\
+	___p1;					\
+})
+#endif /* defined(__powerpc64__) */
 #endif /* _TOOLS_LINUX_ASM_POWERPC_BARRIER_H */
diff --git a/tools/arch/s390/include/asm/barrier.h b/tools/arch/s390/include/asm/barrier.h
index 5030c99f47d20..de362fa664d4f 100644
--- a/tools/arch/s390/include/asm/barrier.h
+++ b/tools/arch/s390/include/asm/barrier.h
@@ -28,4 +28,17 @@
 #define rmb()				mb()
 #define wmb()				mb()
 
+#define smp_store_release(p, v)			\
+do {						\
+	barrier();				\
+	WRITE_ONCE(*p, v);			\
+} while (0)
+
+#define smp_load_acquire(p)			\
+({						\
+	typeof(*p) ___p1 = READ_ONCE(*p);	\
+	barrier();				\
+	___p1;					\
+})
+
 #endif /* __TOOLS_LIB_ASM_BARRIER_H */
diff --git a/tools/arch/sparc/include/asm/barrier_64.h b/tools/arch/sparc/include/asm/barrier_64.h
index ba61344287d50..cfb0fdc8ccf09 100644
--- a/tools/arch/sparc/include/asm/barrier_64.h
+++ b/tools/arch/sparc/include/asm/barrier_64.h
@@ -40,4 +40,17 @@ do {	__asm__ __volatile__("ba,pt	%%xcc, 1f\n\t" \
 #define rmb()	__asm__ __volatile__("":::"memory")
 #define wmb()	__asm__ __volatile__("":::"memory")
 
+#define smp_store_release(p, v)			\
+do {						\
+	barrier();				\
+	WRITE_ONCE(*p, v);			\
+} while (0)
+
+#define smp_load_acquire(p)			\
+({						\
+	typeof(*p) ___p1 = READ_ONCE(*p);	\
+	barrier();				\
+	___p1;					\
+})
+
 #endif /* !(__TOOLS_LINUX_SPARC64_BARRIER_H) */
diff --git a/tools/arch/x86/include/asm/barrier.h b/tools/arch/x86/include/asm/barrier.h
index 8774dee27471e..58919868473c1 100644
--- a/tools/arch/x86/include/asm/barrier.h
+++ b/tools/arch/x86/include/asm/barrier.h
@@ -26,4 +26,18 @@
 #define wmb()	asm volatile("sfence" ::: "memory")
 #endif
 
+#if defined(__x86_64__)
+#define smp_store_release(p, v)			\
+do {						\
+	barrier();				\
+	WRITE_ONCE(*p, v);			\
+} while (0)
+
+#define smp_load_acquire(p)			\
+({						\
+	typeof(*p) ___p1 = READ_ONCE(*p);	\
+	barrier();				\
+	___p1;					\
+})
+#endif /* defined(__x86_64__) */
 #endif /* _TOOLS_LINUX_ASM_X86_BARRIER_H */
diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index 3497f2d803284..f55a2daed59b7 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -86,7 +86,9 @@ DESCRIPTION
 	**bpftool map pin**     *MAP*  *FILE*
 		  Pin map *MAP* as *FILE*.
 
-		  Note: *FILE* must be located in *bpffs* mount.
+		  Note: *FILE* must be located in *bpffs* mount. It must not
+		  contain a dot character ('.'), which is reserved for future
+		  extensions of *bpffs*.
 
 	**bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*]
 		  Read events from a BPF_MAP_TYPE_PERF_EVENT_ARRAY map.
diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index 12c803003ab29..ac4e904b10fbd 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -75,7 +75,9 @@ DESCRIPTION
 	**bpftool prog pin** *PROG* *FILE*
 		  Pin program *PROG* as *FILE*.
 
-		  Note: *FILE* must be located in *bpffs* mount.
+		  Note: *FILE* must be located in *bpffs* mount. It must not
+		  contain a dot character ('.'), which is reserved for future
+		  extensions of *bpffs*.
 
 	**bpftool prog load** *OBJ* *FILE* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*]
 		  Load bpf program from binary *OBJ* and pin as *FILE*.
@@ -91,7 +93,9 @@ DESCRIPTION
 		  If **dev** *NAME* is specified program will be loaded onto
 		  given networking device (offload).
 
-		  Note: *FILE* must be located in *bpffs* mount.
+		  Note: *FILE* must be located in *bpffs* mount. It must not
+		  contain a dot character ('.'), which is reserved for future
+		  extensions of *bpffs*.
 
         **bpftool prog attach** *PROG* *ATTACH_TYPE* *MAP*
                   Attach bpf program *PROG* (with type specified by *ATTACH_TYPE*)
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index c56545e87b0dd..3f78e6404589f 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -143,7 +143,7 @@ _bpftool_map_update_map_type()
     local type
     type=$(bpftool -jp map show $keyword $ref | \
         command sed -n 's/.*"type": "\(.*\)",$/\1/p')
-    printf $type
+    [[ -n $type ]] && printf $type
 }
 
 _bpftool_map_update_get_id()
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index 3318da8060bd1..25af85304ebee 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -554,7 +554,9 @@ static int read_sysfs_netdev_hex_int(char *devname, const char *entry_name)
 	return read_sysfs_hex_int(full_path);
 }
 
-const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino)
+const char *
+ifindex_to_bfd_params(__u32 ifindex, __u64 ns_dev, __u64 ns_ino,
+		      const char **opt)
 {
 	char devname[IF_NAMESIZE];
 	int vendor_id;
@@ -579,6 +581,7 @@ const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino)
 		    device_id != 0x6000 &&
 		    device_id != 0x6003)
 			p_info("Unknown NFP device ID, assuming it is NFP-6xxx arch");
+		*opt = "ctx4";
 		return "NFP-6xxx";
 	default:
 		p_err("Can't get bfd arch name for device vendor id 0x%04x",
diff --git a/tools/bpf/bpftool/jit_disasm.c b/tools/bpf/bpftool/jit_disasm.c
index 87439320ef70e..c75ffd9ce2bb3 100644
--- a/tools/bpf/bpftool/jit_disasm.c
+++ b/tools/bpf/bpftool/jit_disasm.c
@@ -77,7 +77,7 @@ static int fprintf_json(void *out, const char *fmt, ...)
 }
 
 void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes,
-		       const char *arch)
+		       const char *arch, const char *disassembler_options)
 {
 	disassembler_ftype disassemble;
 	struct disassemble_info info;
@@ -116,6 +116,8 @@ void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes,
 
 	info.arch = bfd_get_arch(bfdf);
 	info.mach = bfd_get_mach(bfdf);
+	if (disassembler_options)
+		info.disassembler_options = disassembler_options;
 	info.buffer = image;
 	info.buffer_length = len;
 
diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index 828dde30e9ecd..75a3296dc0bc8 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -321,7 +321,8 @@ static int do_batch(int argc, char **argv)
 		p_err("reading batch file failed: %s", strerror(errno));
 		err = -1;
 	} else {
-		p_info("processed %d commands", lines);
+		if (!json_output)
+			printf("processed %d commands\n", lines);
 		err = 0;
 	}
 err_close:
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 28ee769bd11b7..28322ace28565 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -145,13 +145,15 @@ int map_parse_fd(int *argc, char ***argv);
 int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len);
 
 void disasm_print_insn(unsigned char *image, ssize_t len, int opcodes,
-		       const char *arch);
+		       const char *arch, const char *disassembler_options);
 void print_data_json(uint8_t *data, size_t len);
 void print_hex_data_json(uint8_t *data, size_t len);
 
 unsigned int get_page_size(void);
 unsigned int get_possible_cpus(void);
-const char *ifindex_to_bfd_name_ns(__u32 ifindex, __u64 ns_dev, __u64 ns_ino);
+const char *
+ifindex_to_bfd_params(__u32 ifindex, __u64 ns_dev, __u64 ns_ino,
+		      const char **opt);
 
 struct btf_dumper {
 	const struct btf *btf;
diff --git a/tools/bpf/bpftool/map_perf_ring.c b/tools/bpf/bpftool/map_perf_ring.c
index 6d41323be291b..bdaf4062e26e0 100644
--- a/tools/bpf/bpftool/map_perf_ring.c
+++ b/tools/bpf/bpftool/map_perf_ring.c
@@ -50,15 +50,17 @@ static void int_exit(int signo)
 	stop = true;
 }
 
-static enum bpf_perf_event_ret print_bpf_output(void *event, void *priv)
+static enum bpf_perf_event_ret
+print_bpf_output(struct perf_event_header *event, void *private_data)
 {
-	struct event_ring_info *ring = priv;
-	struct perf_event_sample *e = event;
+	struct perf_event_sample *e = container_of(event, struct perf_event_sample,
+						   header);
+	struct event_ring_info *ring = private_data;
 	struct {
 		struct perf_event_header header;
 		__u64 id;
 		__u64 lost;
-	} *lost = event;
+	} *lost = (typeof(lost))event;
 
 	if (json_output) {
 		jsonw_start_object(json_wtr);
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 335028968dfb9..5302ee282409e 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -449,6 +449,7 @@ static int do_dump(int argc, char **argv)
 	unsigned long *func_ksyms = NULL;
 	struct bpf_prog_info info = {};
 	unsigned int *func_lens = NULL;
+	const char *disasm_opt = NULL;
 	unsigned int nr_func_ksyms;
 	unsigned int nr_func_lens;
 	struct dump_data dd = {};
@@ -607,9 +608,10 @@ static int do_dump(int argc, char **argv)
 		const char *name = NULL;
 
 		if (info.ifindex) {
-			name = ifindex_to_bfd_name_ns(info.ifindex,
-						      info.netns_dev,
-						      info.netns_ino);
+			name = ifindex_to_bfd_params(info.ifindex,
+						     info.netns_dev,
+						     info.netns_ino,
+						     &disasm_opt);
 			if (!name)
 				goto err_free;
 		}
@@ -651,7 +653,8 @@ static int do_dump(int argc, char **argv)
 					printf("%s:\n", sym_name);
 				}
 
-				disasm_print_insn(img, lens[i], opcodes, name);
+				disasm_print_insn(img, lens[i], opcodes, name,
+						  disasm_opt);
 				img += lens[i];
 
 				if (json_output)
@@ -663,7 +666,8 @@ static int do_dump(int argc, char **argv)
 			if (json_output)
 				jsonw_end_array(json_wtr);
 		} else {
-			disasm_print_insn(buf, *member_len, opcodes, name);
+			disasm_print_insn(buf, *member_len, opcodes, name,
+					  disasm_opt);
 		}
 	} else if (visual) {
 		if (json_output)
diff --git a/tools/include/asm/barrier.h b/tools/include/asm/barrier.h
index 391d942536e53..8d378c57cb011 100644
--- a/tools/include/asm/barrier.h
+++ b/tools/include/asm/barrier.h
@@ -1,4 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/compiler.h>
 #if defined(__i386__) || defined(__x86_64__)
 #include "../../arch/x86/include/asm/barrier.h"
 #elif defined(__arm__)
@@ -26,3 +27,37 @@
 #else
 #include <asm-generic/barrier.h>
 #endif
+
+/*
+ * Generic fallback smp_*() definitions for archs that haven't
+ * been updated yet.
+ */
+
+#ifndef smp_rmb
+# define smp_rmb()	rmb()
+#endif
+
+#ifndef smp_wmb
+# define smp_wmb()	wmb()
+#endif
+
+#ifndef smp_mb
+# define smp_mb()	mb()
+#endif
+
+#ifndef smp_store_release
+# define smp_store_release(p, v)		\
+do {						\
+	smp_mb();				\
+	WRITE_ONCE(*p, v);			\
+} while (0)
+#endif
+
+#ifndef smp_load_acquire
+# define smp_load_acquire(p)			\
+({						\
+	typeof(*p) ___p1 = READ_ONCE(*p);	\
+	smp_mb();				\
+	___p1;					\
+})
+#endif
diff --git a/tools/include/linux/ring_buffer.h b/tools/include/linux/ring_buffer.h
new file mode 100644
index 0000000000000..9a083ae604730
--- /dev/null
+++ b/tools/include/linux/ring_buffer.h
@@ -0,0 +1,73 @@
+#ifndef _TOOLS_LINUX_RING_BUFFER_H_
+#define _TOOLS_LINUX_RING_BUFFER_H_
+
+#include <asm/barrier.h>
+
+/*
+ * Contract with kernel for walking the perf ring buffer from
+ * user space requires the following barrier pairing (quote
+ * from kernel/events/ring_buffer.c):
+ *
+ *   Since the mmap() consumer (userspace) can run on a
+ *   different CPU:
+ *
+ *   kernel                             user
+ *
+ *   if (LOAD ->data_tail) {            LOAD ->data_head
+ *                      (A)             smp_rmb()       (C)
+ *      STORE $data                     LOAD $data
+ *      smp_wmb()       (B)             smp_mb()        (D)
+ *      STORE ->data_head               STORE ->data_tail
+ *   }
+ *
+ *   Where A pairs with D, and B pairs with C.
+ *
+ *   In our case A is a control dependency that separates the
+ *   load of the ->data_tail and the stores of $data. In case
+ *   ->data_tail indicates there is no room in the buffer to
+ *   store $data we do not.
+ *
+ *   D needs to be a full barrier since it separates the data
+ *   READ from the tail WRITE.
+ *
+ *   For B a WMB is sufficient since it separates two WRITEs,
+ *   and for C an RMB is sufficient since it separates two READs.
+ *
+ * Note, instead of B, C, D we could also use smp_store_release()
+ * in B and D as well as smp_load_acquire() in C.
+ *
+ * However, this optimization does not make sense on all
+ * kernel-supported architectures, since on a fair number of them
+ * smp_load_acquire() would resolve into a READ_ONCE() + smp_mb()
+ * pair, and smp_store_release() into an smp_mb() + WRITE_ONCE() pair.
+ *
+ * Thus, on those architectures smp_wmb() in B and smp_rmb() in C
+ * would still be less expensive. For D this has either the same
+ * cost or is less expensive; for example, thanks to TSO, x86 can
+ * avoid the CPU barrier entirely.
+ */
+
+static inline u64 ring_buffer_read_head(struct perf_event_mmap_page *base)
+{
+/*
+ * Architectures where smp_load_acquire() does not fall back to a
+ * READ_ONCE() + smp_mb() pair.
+ */
+#if defined(__x86_64__) || defined(__aarch64__) || defined(__powerpc64__) || \
+    defined(__ia64__) || (defined(__sparc__) && defined(__arch64__))
+	return smp_load_acquire(&base->data_head);
+#else
+	u64 head = READ_ONCE(base->data_head);
+
+	smp_rmb();
+	return head;
+#endif
+}
+
+static inline void ring_buffer_write_tail(struct perf_event_mmap_page *base,
+					  u64 tail)
+{
+	smp_store_release(&base->data_tail, tail);
+}
+
+#endif /* _TOOLS_LINUX_RING_BUFFER_H_ */
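
A sketch of the consumer loop these helpers are designed for
(process_event() is an assumed callback, and mmap_size must be a
power of two, as perf ring buffers are):

	static void drain_ring(void *mmap_mem, size_t mmap_size,
			       size_t page_size)
	{
		struct perf_event_mmap_page *header = mmap_mem;
		void *base = (__u8 *)header + page_size;
		__u64 head = ring_buffer_read_head(header);	/* C */
		__u64 tail = header->data_tail;

		while (tail != head) {
			struct perf_event_header *ehdr;

			ehdr = base + (tail & (mmap_size - 1));
			process_event(ehdr);
			tail += ehdr->size;
		}
		ring_buffer_write_tail(header, tail);		/* D */
	}
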
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f9187b41dff62..852dc17ab47a0 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -103,6 +103,7 @@ enum bpf_cmd {
 	BPF_BTF_LOAD,
 	BPF_BTF_GET_FD_BY_ID,
 	BPF_TASK_FD_QUERY,
+	BPF_MAP_LOOKUP_AND_DELETE_ELEM,
 };
 
 enum bpf_map_type {
@@ -128,6 +129,8 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_CGROUP_STORAGE,
 	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
 	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
+	BPF_MAP_TYPE_QUEUE,
+	BPF_MAP_TYPE_STACK,
 };
 
 enum bpf_prog_type {
@@ -462,6 +465,28 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags)
+ * 	Description
+ * 		Push an element *value* into *map*. *flags* is one of:
+ *
+ * 		**BPF_EXIST**
+ * 		If the queue/stack is full, the oldest element is removed to
+ * 		make room for it.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_pop_elem(struct bpf_map *map, void *value)
+ * 	Description
+ * 		Pop an element from *map*.
+ * Return
+ * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_map_peek_elem(struct bpf_map *map, void *value)
+ * 	Description
+ * 		Get an element from *map* without removing it.
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
+ *
  * int bpf_probe_read(void *dst, u32 size, const void *src)
  * 	Description
  * 		For tracing programs, safely attempt to read *size* bytes from
@@ -1433,7 +1458,7 @@ union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
- * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags)
+ * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags)
  * 	Description
  * 		Grow or shrink the room for data in the packet associated to
  * 		*skb* by *len_diff*, and according to the selected *mode*.
@@ -2215,6 +2240,23 @@ union bpf_attr {
  *		pointer that was returned from bpf_sk_lookup_xxx\ ().
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags)
+ *	Description
+ *		For socket policies, insert *len* bytes into *msg* at offset
+ *		*start*.
+ *
+ *		If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+ *		*msg*, it may want to insert metadata or options into it.
+ *		These can later be read and used by any of the lower-layer
+ *		BPF hooks.
+ *
+ *		This helper may fail under memory pressure (when an
+ *		allocation fails); in that case the BPF program receives an
+ *		appropriate error and must handle it.
+ *
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2303,7 +2345,11 @@ union bpf_attr {
 	FN(skb_ancestor_cgroup_id),	\
 	FN(sk_lookup_tcp),		\
 	FN(sk_lookup_udp),		\
-	FN(sk_release),
+	FN(sk_release),			\
+	FN(map_push_elem),		\
+	FN(map_pop_elem),		\
+	FN(map_peek_elem),		\
+	FN(msg_push_data),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
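
A minimal BPF-side sketch of the new msg helper (assuming the usual
selftest scaffolding from bpf_helpers.h; SK_PASS/SK_DROP are the
standard sk_msg verdicts):

	SEC("sk_msg")
	int insert_meta(struct sk_msg_md *msg)
	{
		/* Make room for 8 bytes of metadata at offset 0. */
		if (bpf_msg_push_data(msg, 0, 8, 0))
			return SK_DROP;
		return SK_PASS;
	}
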
diff --git a/tools/include/uapi/linux/tls.h b/tools/include/uapi/linux/tls.h
new file mode 100644
index 0000000000000..ff02287495ac5
--- /dev/null
+++ b/tools/include/uapi/linux/tls.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */
+/*
+ * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _UAPI_LINUX_TLS_H
+#define _UAPI_LINUX_TLS_H
+
+#include <linux/types.h>
+
+/* TLS socket options */
+#define TLS_TX			1	/* Set transmit parameters */
+#define TLS_RX			2	/* Set receive parameters */
+
+/* Supported versions */
+#define TLS_VERSION_MINOR(ver)	((ver) & 0xFF)
+#define TLS_VERSION_MAJOR(ver)	(((ver) >> 8) & 0xFF)
+
+#define TLS_VERSION_NUMBER(id)	((((id##_VERSION_MAJOR) & 0xFF) << 8) |	\
+				 ((id##_VERSION_MINOR) & 0xFF))
+
+#define TLS_1_2_VERSION_MAJOR	0x3
+#define TLS_1_2_VERSION_MINOR	0x3
+#define TLS_1_2_VERSION		TLS_VERSION_NUMBER(TLS_1_2)
+
+/* Supported ciphers */
+#define TLS_CIPHER_AES_GCM_128				51
+#define TLS_CIPHER_AES_GCM_128_IV_SIZE			8
+#define TLS_CIPHER_AES_GCM_128_KEY_SIZE		16
+#define TLS_CIPHER_AES_GCM_128_SALT_SIZE		4
+#define TLS_CIPHER_AES_GCM_128_TAG_SIZE		16
+#define TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE		8
+
+#define TLS_SET_RECORD_TYPE	1
+#define TLS_GET_RECORD_TYPE	2
+
+struct tls_crypto_info {
+	__u16 version;
+	__u16 cipher_type;
+};
+
+struct tls12_crypto_info_aes_gcm_128 {
+	struct tls_crypto_info info;
+	unsigned char iv[TLS_CIPHER_AES_GCM_128_IV_SIZE];
+	unsigned char key[TLS_CIPHER_AES_GCM_128_KEY_SIZE];
+	unsigned char salt[TLS_CIPHER_AES_GCM_128_SALT_SIZE];
+	unsigned char rec_seq[TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE];
+};
+
+#endif /* _UAPI_LINUX_TLS_H */
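
For context, a hedged sketch of how this header is consumed when
enabling kernel TLS on an established TCP socket (the TCP_ULP and
SOL_TLS fallbacks mirror the ones added to test_sockmap.c below;
real key material would come from a TLS handshake):

	#include <string.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <linux/tls.h>

	#ifndef TCP_ULP
	# define TCP_ULP 31
	#endif
	#ifndef SOL_TLS
	# define SOL_TLS 282
	#endif

	static int enable_ktls_tx(int fd)
	{
		struct tls12_crypto_info_aes_gcm_128 crypto = {};

		crypto.info.version = TLS_1_2_VERSION;
		crypto.info.cipher_type = TLS_CIPHER_AES_GCM_128;
		/* iv/key/salt/rec_seq left zeroed here for brevity. */

		if (setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls",
			       sizeof("tls")))
			return -1;
		return setsockopt(fd, SOL_TLS, TLS_TX, &crypto,
				  sizeof(crypto));
	}
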
diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
index 79d84413ddf20..425b480bda752 100644
--- a/tools/lib/bpf/Makefile
+++ b/tools/lib/bpf/Makefile
@@ -125,6 +125,7 @@ override CFLAGS += $(EXTRA_WARNINGS)
 override CFLAGS += -Werror -Wall
 override CFLAGS += -fPIC
 override CFLAGS += $(INCLUDES)
+override CFLAGS += -fvisibility=hidden
 
 ifeq ($(VERBOSE),1)
   Q =
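
With -fvisibility=hidden every symbol compiled into libbpf defaults
to hidden, so only declarations carrying the LIBBPF_API attribute
introduced below are exported from the shared object. A minimal
sketch of the mechanism (names are illustrative):

	#define LIBBPF_API __attribute__((visibility("default")))

	LIBBPF_API int exported_call(void);	/* visible in libbpf.so */
	int internal_helper(void);		/* hidden by default */

The exported surface can be inspected with nm -D libbpf.so after the
build.
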
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index d70a255cb05e7..03f9bcc4ef501 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -278,6 +278,18 @@ int bpf_map_lookup_elem(int fd, const void *key, void *value)
 	return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
 }
 
+int bpf_map_lookup_and_delete_elem(int fd, const void *key, void *value)
+{
+	union bpf_attr attr;
+
+	bzero(&attr, sizeof(attr));
+	attr.map_fd = fd;
+	attr.key = ptr_to_u64(key);
+	attr.value = ptr_to_u64(value);
+
+	return sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, sizeof(attr));
+}
+
 int bpf_map_delete_elem(int fd, const void *key)
 {
 	union bpf_attr attr;
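
Together with bpf_map_update_elem(), the new wrapper gives user
space the full push/peek/pop interface for the queue and stack maps
added in this series; a sketch (assuming fd refers to a
BPF_MAP_TYPE_QUEUE map):

	__u32 val = 42, out;

	/* Push: queue/stack maps take no key, so pass NULL. */
	bpf_map_update_elem(fd, NULL, &val, 0);

	/* Peek at the next element without consuming it. */
	bpf_map_lookup_elem(fd, NULL, &out);

	/* Pop: lookup and delete in a single syscall. */
	bpf_map_lookup_and_delete_elem(fd, NULL, &out);
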
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 69a4d40c42273..26a51538213cd 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -27,6 +27,10 @@
 #include <stdbool.h>
 #include <stddef.h>
 
+#ifndef LIBBPF_API
+#define LIBBPF_API __attribute__((visibility("default")))
+#endif
+
 struct bpf_create_map_attr {
 	const char *name;
 	enum bpf_map_type map_type;
@@ -42,21 +46,24 @@ struct bpf_create_map_attr {
 	__u32 inner_map_fd;
 };
 
-int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr);
-int bpf_create_map_node(enum bpf_map_type map_type, const char *name,
-			int key_size, int value_size, int max_entries,
-			__u32 map_flags, int node);
-int bpf_create_map_name(enum bpf_map_type map_type, const char *name,
-			int key_size, int value_size, int max_entries,
-			__u32 map_flags);
-int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
-		   int max_entries, __u32 map_flags);
-int bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name,
-			       int key_size, int inner_map_fd, int max_entries,
-			       __u32 map_flags, int node);
-int bpf_create_map_in_map(enum bpf_map_type map_type, const char *name,
-			  int key_size, int inner_map_fd, int max_entries,
-			  __u32 map_flags);
+LIBBPF_API int
+bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr);
+LIBBPF_API int bpf_create_map_node(enum bpf_map_type map_type, const char *name,
+				   int key_size, int value_size,
+				   int max_entries, __u32 map_flags, int node);
+LIBBPF_API int bpf_create_map_name(enum bpf_map_type map_type, const char *name,
+				   int key_size, int value_size,
+				   int max_entries, __u32 map_flags);
+LIBBPF_API int bpf_create_map(enum bpf_map_type map_type, int key_size,
+			      int value_size, int max_entries, __u32 map_flags);
+LIBBPF_API int bpf_create_map_in_map_node(enum bpf_map_type map_type,
+					  const char *name, int key_size,
+					  int inner_map_fd, int max_entries,
+					  __u32 map_flags, int node);
+LIBBPF_API int bpf_create_map_in_map(enum bpf_map_type map_type,
+				     const char *name, int key_size,
+				     int inner_map_fd, int max_entries,
+				     __u32 map_flags);
 
 struct bpf_load_program_attr {
 	enum bpf_prog_type prog_type;
@@ -74,44 +81,51 @@ struct bpf_load_program_attr {
 
 /* Recommend log buffer size */
 #define BPF_LOG_BUF_SIZE (256 * 1024)
-int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
-			   char *log_buf, size_t log_buf_sz);
-int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
-		     size_t insns_cnt, const char *license,
-		     __u32 kern_version, char *log_buf,
-		     size_t log_buf_sz);
-int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns,
-		       size_t insns_cnt, int strict_alignment,
-		       const char *license, __u32 kern_version,
-		       char *log_buf, size_t log_buf_sz, int log_level);
+LIBBPF_API int
+bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
+		       char *log_buf, size_t log_buf_sz);
+LIBBPF_API int bpf_load_program(enum bpf_prog_type type,
+				const struct bpf_insn *insns, size_t insns_cnt,
+				const char *license, __u32 kern_version,
+				char *log_buf, size_t log_buf_sz);
+LIBBPF_API int bpf_verify_program(enum bpf_prog_type type,
+				  const struct bpf_insn *insns,
+				  size_t insns_cnt, int strict_alignment,
+				  const char *license, __u32 kern_version,
+				  char *log_buf, size_t log_buf_sz,
+				  int log_level);
 
-int bpf_map_update_elem(int fd, const void *key, const void *value,
-			__u64 flags);
+LIBBPF_API int bpf_map_update_elem(int fd, const void *key, const void *value,
+				   __u64 flags);
 
-int bpf_map_lookup_elem(int fd, const void *key, void *value);
-int bpf_map_delete_elem(int fd, const void *key);
-int bpf_map_get_next_key(int fd, const void *key, void *next_key);
-int bpf_obj_pin(int fd, const char *pathname);
-int bpf_obj_get(const char *pathname);
-int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type,
-		    unsigned int flags);
-int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type);
-int bpf_prog_detach2(int prog_fd, int attachable_fd, enum bpf_attach_type type);
-int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size,
-		      void *data_out, __u32 *size_out, __u32 *retval,
-		      __u32 *duration);
-int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id);
-int bpf_map_get_next_id(__u32 start_id, __u32 *next_id);
-int bpf_prog_get_fd_by_id(__u32 id);
-int bpf_map_get_fd_by_id(__u32 id);
-int bpf_btf_get_fd_by_id(__u32 id);
-int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len);
-int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags,
-		   __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt);
-int bpf_raw_tracepoint_open(const char *name, int prog_fd);
-int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
-		 bool do_log);
-int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len,
-		      __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset,
-		      __u64 *probe_addr);
+LIBBPF_API int bpf_map_lookup_elem(int fd, const void *key, void *value);
+LIBBPF_API int bpf_map_lookup_and_delete_elem(int fd, const void *key,
+					      void *value);
+LIBBPF_API int bpf_map_delete_elem(int fd, const void *key);
+LIBBPF_API int bpf_map_get_next_key(int fd, const void *key, void *next_key);
+LIBBPF_API int bpf_obj_pin(int fd, const char *pathname);
+LIBBPF_API int bpf_obj_get(const char *pathname);
+LIBBPF_API int bpf_prog_attach(int prog_fd, int attachable_fd,
+			       enum bpf_attach_type type, unsigned int flags);
+LIBBPF_API int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type);
+LIBBPF_API int bpf_prog_detach2(int prog_fd, int attachable_fd,
+				enum bpf_attach_type type);
+LIBBPF_API int bpf_prog_test_run(int prog_fd, int repeat, void *data,
+				 __u32 size, void *data_out, __u32 *size_out,
+				 __u32 *retval, __u32 *duration);
+LIBBPF_API int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id);
+LIBBPF_API int bpf_map_get_next_id(__u32 start_id, __u32 *next_id);
+LIBBPF_API int bpf_prog_get_fd_by_id(__u32 id);
+LIBBPF_API int bpf_map_get_fd_by_id(__u32 id);
+LIBBPF_API int bpf_btf_get_fd_by_id(__u32 id);
+LIBBPF_API int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len);
+LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type,
+			      __u32 query_flags, __u32 *attach_flags,
+			      __u32 *prog_ids, __u32 *prog_cnt);
+LIBBPF_API int bpf_raw_tracepoint_open(const char *name, int prog_fd);
+LIBBPF_API int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf,
+			    __u32 log_buf_size, bool do_log);
+LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf,
+				 __u32 *buf_len, __u32 *prog_id, __u32 *fd_type,
+				 __u64 *probe_offset, __u64 *probe_addr);
 #endif /* __LIBBPF_BPF_H */
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index 6db5462bb2ef9..b77e7080f7e73 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -6,6 +6,10 @@
 
 #include <linux/types.h>
 
+#ifndef LIBBPF_API
+#define LIBBPF_API __attribute__((visibility("default")))
+#endif
+
 #define BTF_ELF_SEC ".BTF"
 
 struct btf;
@@ -14,13 +18,15 @@ struct btf_type;
 typedef int (*btf_print_fn_t)(const char *, ...)
 	__attribute__((format(printf, 1, 2)));
 
-void btf__free(struct btf *btf);
-struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log);
-__s32 btf__find_by_name(const struct btf *btf, const char *type_name);
-const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 id);
-__s64 btf__resolve_size(const struct btf *btf, __u32 type_id);
-int btf__resolve_type(const struct btf *btf, __u32 type_id);
-int btf__fd(const struct btf *btf);
-const char *btf__name_by_offset(const struct btf *btf, __u32 offset);
+LIBBPF_API void btf__free(struct btf *btf);
+LIBBPF_API struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log);
+LIBBPF_API __s32 btf__find_by_name(const struct btf *btf,
+				   const char *type_name);
+LIBBPF_API const struct btf_type *btf__type_by_id(const struct btf *btf,
+						  __u32 id);
+LIBBPF_API __s64 btf__resolve_size(const struct btf *btf, __u32 type_id);
+LIBBPF_API int btf__resolve_type(const struct btf *btf, __u32 type_id);
+LIBBPF_API int btf__fd(const struct btf *btf);
+LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset);
 
 #endif /* __LIBBPF_BTF_H */
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index bd71efcc53bed..b607be7236d3e 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -27,6 +27,7 @@
 #include <linux/list.h>
 #include <linux/limits.h>
 #include <linux/perf_event.h>
+#include <linux/ring_buffer.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/vfs.h>
@@ -2414,61 +2415,49 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
 }
 
 enum bpf_perf_event_ret
-bpf_perf_event_read_simple(void *mem, unsigned long size,
-			   unsigned long page_size, void **buf, size_t *buf_len,
-			   bpf_perf_event_print_t fn, void *priv)
+bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size,
+			   void **copy_mem, size_t *copy_size,
+			   bpf_perf_event_print_t fn, void *private_data)
 {
-	volatile struct perf_event_mmap_page *header = mem;
+	struct perf_event_mmap_page *header = mmap_mem;
+	__u64 data_head = ring_buffer_read_head(header);
 	__u64 data_tail = header->data_tail;
-	__u64 data_head = header->data_head;
-	int ret = LIBBPF_PERF_EVENT_ERROR;
-	void *base, *begin, *end;
-
-	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
-	if (data_head == data_tail)
-		return LIBBPF_PERF_EVENT_CONT;
-
-	base = ((char *)header) + page_size;
-
-	begin = base + data_tail % size;
-	end = base + data_head % size;
-
-	while (begin != end) {
-		struct perf_event_header *ehdr;
-
-		ehdr = begin;
-		if (begin + ehdr->size > base + size) {
-			long len = base + size - begin;
-
-			if (*buf_len < ehdr->size) {
-				free(*buf);
-				*buf = malloc(ehdr->size);
-				if (!*buf) {
+	void *base = ((__u8 *)header) + page_size;
+	int ret = LIBBPF_PERF_EVENT_CONT;
+	struct perf_event_header *ehdr;
+	size_t ehdr_size;
+
+	while (data_head != data_tail) {
+		ehdr = base + (data_tail & (mmap_size - 1));
+		ehdr_size = ehdr->size;
+
+		if (((void *)ehdr) + ehdr_size > base + mmap_size) {
+			void *copy_start = ehdr;
+			size_t len_first = base + mmap_size - copy_start;
+			size_t len_secnd = ehdr_size - len_first;
+
+			if (*copy_size < ehdr_size) {
+				free(*copy_mem);
+				*copy_mem = malloc(ehdr_size);
+				if (!*copy_mem) {
+					*copy_size = 0;
 					ret = LIBBPF_PERF_EVENT_ERROR;
 					break;
 				}
-				*buf_len = ehdr->size;
+				*copy_size = ehdr_size;
 			}
 
-			memcpy(*buf, begin, len);
-			memcpy(*buf + len, base, ehdr->size - len);
-			ehdr = (void *)*buf;
-			begin = base + ehdr->size - len;
-		} else if (begin + ehdr->size == base + size) {
-			begin = base;
-		} else {
-			begin += ehdr->size;
+			memcpy(*copy_mem, copy_start, len_first);
+			memcpy(*copy_mem + len_first, base, len_secnd);
+			ehdr = *copy_mem;
 		}
 
-		ret = fn(ehdr, priv);
+		ret = fn(ehdr, private_data);
+		data_tail += ehdr_size;
 		if (ret != LIBBPF_PERF_EVENT_CONT)
 			break;
-
-		data_tail += ehdr->size;
 	}
 
-	__sync_synchronize(); /* smp_mb() */
-	header->data_tail = data_tail;
-
+	ring_buffer_write_tail(header, data_tail);
 	return ret;
 }
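
Callers now receive a struct perf_event_header pointer rather than a
raw void pointer; a sketch of the updated calling convention
(print_sample() and the mmap parameters are placeholders):

	static enum bpf_perf_event_ret
	print_sample(struct perf_event_header *hdr, void *private_data)
	{
		/* hdr->type and hdr->size describe the record. */
		return LIBBPF_PERF_EVENT_CONT;
	}

	static void drain(void *mmap_mem, size_t mmap_size,
			  size_t page_size)
	{
		void *copy_mem = NULL;
		size_t copy_size = 0;

		bpf_perf_event_read_simple(mmap_mem, mmap_size, page_size,
					   &copy_mem, &copy_size,
					   print_sample, NULL);
		free(copy_mem);
	}
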
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 7e9c801a9fddb..1f3468dad8b2c 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -16,6 +16,10 @@
 #include <sys/types.h>  // for size_t
 #include <linux/bpf.h>
 
+#ifndef LIBBPF_API
+#define LIBBPF_API __attribute__((visibility("default")))
+#endif
+
 enum libbpf_errno {
 	__LIBBPF_ERRNO__START = 4000,
 
@@ -37,7 +41,7 @@ enum libbpf_errno {
 	__LIBBPF_ERRNO__END,
 };
 
-int libbpf_strerror(int err, char *buf, size_t size);
+LIBBPF_API int libbpf_strerror(int err, char *buf, size_t size);
 
 /*
  * __printf is defined in include/linux/compiler-gcc.h. However,
@@ -47,9 +51,9 @@ int libbpf_strerror(int err, char *buf, size_t size);
 typedef int (*libbpf_print_fn_t)(const char *, ...)
 	__attribute__((format(printf, 1, 2)));
 
-void libbpf_set_print(libbpf_print_fn_t warn,
-		      libbpf_print_fn_t info,
-		      libbpf_print_fn_t debug);
+LIBBPF_API void libbpf_set_print(libbpf_print_fn_t warn,
+				 libbpf_print_fn_t info,
+				 libbpf_print_fn_t debug);
 
 /* Hide internal to user */
 struct bpf_object;
@@ -59,27 +63,28 @@ struct bpf_object_open_attr {
 	enum bpf_prog_type prog_type;
 };
 
-struct bpf_object *bpf_object__open(const char *path);
-struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr);
+LIBBPF_API struct bpf_object *bpf_object__open(const char *path);
+LIBBPF_API struct bpf_object *
+bpf_object__open_xattr(struct bpf_object_open_attr *attr);
 struct bpf_object *__bpf_object__open_xattr(struct bpf_object_open_attr *attr,
 					    int flags);
-struct bpf_object *bpf_object__open_buffer(void *obj_buf,
-					   size_t obj_buf_sz,
-					   const char *name);
-int bpf_object__pin(struct bpf_object *object, const char *path);
-void bpf_object__close(struct bpf_object *object);
+LIBBPF_API struct bpf_object *bpf_object__open_buffer(void *obj_buf,
+						      size_t obj_buf_sz,
+						      const char *name);
+LIBBPF_API int bpf_object__pin(struct bpf_object *object, const char *path);
+LIBBPF_API void bpf_object__close(struct bpf_object *object);
 
 /* Load/unload object into/from kernel */
-int bpf_object__load(struct bpf_object *obj);
-int bpf_object__unload(struct bpf_object *obj);
-const char *bpf_object__name(struct bpf_object *obj);
-unsigned int bpf_object__kversion(struct bpf_object *obj);
-int bpf_object__btf_fd(const struct bpf_object *obj);
+LIBBPF_API int bpf_object__load(struct bpf_object *obj);
+LIBBPF_API int bpf_object__unload(struct bpf_object *obj);
+LIBBPF_API const char *bpf_object__name(struct bpf_object *obj);
+LIBBPF_API unsigned int bpf_object__kversion(struct bpf_object *obj);
+LIBBPF_API int bpf_object__btf_fd(const struct bpf_object *obj);
 
-struct bpf_program *
+LIBBPF_API struct bpf_program *
 bpf_object__find_program_by_title(struct bpf_object *obj, const char *title);
 
-struct bpf_object *bpf_object__next(struct bpf_object *prev);
+LIBBPF_API struct bpf_object *bpf_object__next(struct bpf_object *prev);
 #define bpf_object__for_each_safe(pos, tmp)			\
 	for ((pos) = bpf_object__next(NULL),		\
 		(tmp) = bpf_object__next(pos);		\
@@ -87,19 +92,20 @@ struct bpf_object *bpf_object__next(struct bpf_object *prev);
 	     (pos) = (tmp), (tmp) = bpf_object__next(tmp))
 
 typedef void (*bpf_object_clear_priv_t)(struct bpf_object *, void *);
-int bpf_object__set_priv(struct bpf_object *obj, void *priv,
-			 bpf_object_clear_priv_t clear_priv);
-void *bpf_object__priv(struct bpf_object *prog);
+LIBBPF_API int bpf_object__set_priv(struct bpf_object *obj, void *priv,
+				    bpf_object_clear_priv_t clear_priv);
+LIBBPF_API void *bpf_object__priv(struct bpf_object *prog);
 
-int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type,
-			     enum bpf_attach_type *expected_attach_type);
-int libbpf_attach_type_by_name(const char *name,
-			       enum bpf_attach_type *attach_type);
+LIBBPF_API int
+libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type,
+			 enum bpf_attach_type *expected_attach_type);
+LIBBPF_API int libbpf_attach_type_by_name(const char *name,
+					  enum bpf_attach_type *attach_type);
 
 /* Accessors of bpf_program */
 struct bpf_program;
-struct bpf_program *bpf_program__next(struct bpf_program *prog,
-				      struct bpf_object *obj);
+LIBBPF_API struct bpf_program *bpf_program__next(struct bpf_program *prog,
+						 struct bpf_object *obj);
 
 #define bpf_object__for_each_program(pos, obj)		\
 	for ((pos) = bpf_program__next(NULL, (obj));	\
@@ -109,21 +115,24 @@ struct bpf_program *bpf_program__next(struct bpf_program *prog,
 typedef void (*bpf_program_clear_priv_t)(struct bpf_program *,
 					 void *);
 
-int bpf_program__set_priv(struct bpf_program *prog, void *priv,
-			  bpf_program_clear_priv_t clear_priv);
+LIBBPF_API int bpf_program__set_priv(struct bpf_program *prog, void *priv,
+				     bpf_program_clear_priv_t clear_priv);
 
-void *bpf_program__priv(struct bpf_program *prog);
-void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex);
+LIBBPF_API void *bpf_program__priv(struct bpf_program *prog);
+LIBBPF_API void bpf_program__set_ifindex(struct bpf_program *prog,
+					 __u32 ifindex);
 
-const char *bpf_program__title(struct bpf_program *prog, bool needs_copy);
+LIBBPF_API const char *bpf_program__title(struct bpf_program *prog,
+					  bool needs_copy);
 
-int bpf_program__load(struct bpf_program *prog, char *license,
-		      __u32 kern_version);
-int bpf_program__fd(struct bpf_program *prog);
-int bpf_program__pin_instance(struct bpf_program *prog, const char *path,
-			      int instance);
-int bpf_program__pin(struct bpf_program *prog, const char *path);
-void bpf_program__unload(struct bpf_program *prog);
+LIBBPF_API int bpf_program__load(struct bpf_program *prog, char *license,
+				 __u32 kern_version);
+LIBBPF_API int bpf_program__fd(struct bpf_program *prog);
+LIBBPF_API int bpf_program__pin_instance(struct bpf_program *prog,
+					 const char *path,
+					 int instance);
+LIBBPF_API int bpf_program__pin(struct bpf_program *prog, const char *path);
+LIBBPF_API void bpf_program__unload(struct bpf_program *prog);
 
 struct bpf_insn;
 
@@ -184,34 +193,36 @@ typedef int (*bpf_program_prep_t)(struct bpf_program *prog, int n,
 				  struct bpf_insn *insns, int insns_cnt,
 				  struct bpf_prog_prep_result *res);
 
-int bpf_program__set_prep(struct bpf_program *prog, int nr_instance,
-			  bpf_program_prep_t prep);
+LIBBPF_API int bpf_program__set_prep(struct bpf_program *prog, int nr_instance,
+				     bpf_program_prep_t prep);
 
-int bpf_program__nth_fd(struct bpf_program *prog, int n);
+LIBBPF_API int bpf_program__nth_fd(struct bpf_program *prog, int n);
 
 /*
  * Adjust type of BPF program. Default is kprobe.
  */
-int bpf_program__set_socket_filter(struct bpf_program *prog);
-int bpf_program__set_tracepoint(struct bpf_program *prog);
-int bpf_program__set_raw_tracepoint(struct bpf_program *prog);
-int bpf_program__set_kprobe(struct bpf_program *prog);
-int bpf_program__set_sched_cls(struct bpf_program *prog);
-int bpf_program__set_sched_act(struct bpf_program *prog);
-int bpf_program__set_xdp(struct bpf_program *prog);
-int bpf_program__set_perf_event(struct bpf_program *prog);
-void bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type);
-void bpf_program__set_expected_attach_type(struct bpf_program *prog,
-					   enum bpf_attach_type type);
-
-bool bpf_program__is_socket_filter(struct bpf_program *prog);
-bool bpf_program__is_tracepoint(struct bpf_program *prog);
-bool bpf_program__is_raw_tracepoint(struct bpf_program *prog);
-bool bpf_program__is_kprobe(struct bpf_program *prog);
-bool bpf_program__is_sched_cls(struct bpf_program *prog);
-bool bpf_program__is_sched_act(struct bpf_program *prog);
-bool bpf_program__is_xdp(struct bpf_program *prog);
-bool bpf_program__is_perf_event(struct bpf_program *prog);
+LIBBPF_API int bpf_program__set_socket_filter(struct bpf_program *prog);
+LIBBPF_API int bpf_program__set_tracepoint(struct bpf_program *prog);
+LIBBPF_API int bpf_program__set_raw_tracepoint(struct bpf_program *prog);
+LIBBPF_API int bpf_program__set_kprobe(struct bpf_program *prog);
+LIBBPF_API int bpf_program__set_sched_cls(struct bpf_program *prog);
+LIBBPF_API int bpf_program__set_sched_act(struct bpf_program *prog);
+LIBBPF_API int bpf_program__set_xdp(struct bpf_program *prog);
+LIBBPF_API int bpf_program__set_perf_event(struct bpf_program *prog);
+LIBBPF_API void bpf_program__set_type(struct bpf_program *prog,
+				      enum bpf_prog_type type);
+LIBBPF_API void
+bpf_program__set_expected_attach_type(struct bpf_program *prog,
+				      enum bpf_attach_type type);
+
+LIBBPF_API bool bpf_program__is_socket_filter(struct bpf_program *prog);
+LIBBPF_API bool bpf_program__is_tracepoint(struct bpf_program *prog);
+LIBBPF_API bool bpf_program__is_raw_tracepoint(struct bpf_program *prog);
+LIBBPF_API bool bpf_program__is_kprobe(struct bpf_program *prog);
+LIBBPF_API bool bpf_program__is_sched_cls(struct bpf_program *prog);
+LIBBPF_API bool bpf_program__is_sched_act(struct bpf_program *prog);
+LIBBPF_API bool bpf_program__is_xdp(struct bpf_program *prog);
+LIBBPF_API bool bpf_program__is_perf_event(struct bpf_program *prog);
 
 /*
  * No need for __attribute__((packed)), all members of 'bpf_map_def'
@@ -232,39 +243,39 @@ struct bpf_map_def {
  * so no need to worry about a name clash.
  */
 struct bpf_map;
-struct bpf_map *
+LIBBPF_API struct bpf_map *
 bpf_object__find_map_by_name(struct bpf_object *obj, const char *name);
 
 /*
  * Get bpf_map through the offset of corresponding struct bpf_map_def
  * in the BPF object file.
  */
-struct bpf_map *
+LIBBPF_API struct bpf_map *
 bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset);
 
-struct bpf_map *
+LIBBPF_API struct bpf_map *
 bpf_map__next(struct bpf_map *map, struct bpf_object *obj);
 #define bpf_map__for_each(pos, obj)		\
 	for ((pos) = bpf_map__next(NULL, (obj));	\
 	     (pos) != NULL;				\
 	     (pos) = bpf_map__next((pos), (obj)))
 
-int bpf_map__fd(struct bpf_map *map);
-const struct bpf_map_def *bpf_map__def(struct bpf_map *map);
-const char *bpf_map__name(struct bpf_map *map);
-__u32 bpf_map__btf_key_type_id(const struct bpf_map *map);
-__u32 bpf_map__btf_value_type_id(const struct bpf_map *map);
+LIBBPF_API int bpf_map__fd(struct bpf_map *map);
+LIBBPF_API const struct bpf_map_def *bpf_map__def(struct bpf_map *map);
+LIBBPF_API const char *bpf_map__name(struct bpf_map *map);
+LIBBPF_API __u32 bpf_map__btf_key_type_id(const struct bpf_map *map);
+LIBBPF_API __u32 bpf_map__btf_value_type_id(const struct bpf_map *map);
 
 typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *);
-int bpf_map__set_priv(struct bpf_map *map, void *priv,
-		      bpf_map_clear_priv_t clear_priv);
-void *bpf_map__priv(struct bpf_map *map);
-int bpf_map__reuse_fd(struct bpf_map *map, int fd);
-bool bpf_map__is_offload_neutral(struct bpf_map *map);
-void bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex);
-int bpf_map__pin(struct bpf_map *map, const char *path);
+LIBBPF_API int bpf_map__set_priv(struct bpf_map *map, void *priv,
+				 bpf_map_clear_priv_t clear_priv);
+LIBBPF_API void *bpf_map__priv(struct bpf_map *map);
+LIBBPF_API int bpf_map__reuse_fd(struct bpf_map *map, int fd);
+LIBBPF_API bool bpf_map__is_offload_neutral(struct bpf_map *map);
+LIBBPF_API void bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex);
+LIBBPF_API int bpf_map__pin(struct bpf_map *map, const char *path);
 
-long libbpf_get_error(const void *ptr);
+LIBBPF_API long libbpf_get_error(const void *ptr);
 
 struct bpf_prog_load_attr {
 	const char *file;
@@ -273,12 +284,12 @@ struct bpf_prog_load_attr {
 	int ifindex;
 };
 
-int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
-			struct bpf_object **pobj, int *prog_fd);
-int bpf_prog_load(const char *file, enum bpf_prog_type type,
-		  struct bpf_object **pobj, int *prog_fd);
+LIBBPF_API int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
+				   struct bpf_object **pobj, int *prog_fd);
+LIBBPF_API int bpf_prog_load(const char *file, enum bpf_prog_type type,
+			     struct bpf_object **pobj, int *prog_fd);
 
-int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
+LIBBPF_API int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
 
 enum bpf_perf_event_ret {
 	LIBBPF_PERF_EVENT_DONE	= 0,
@@ -286,12 +297,14 @@ enum bpf_perf_event_ret {
 	LIBBPF_PERF_EVENT_CONT	= -2,
 };
 
-typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(void *event,
-							  void *priv);
-int bpf_perf_event_read_simple(void *mem, unsigned long size,
-			       unsigned long page_size,
-			       void **buf, size_t *buf_len,
-			       bpf_perf_event_print_t fn, void *priv);
+struct perf_event_header;
+typedef enum bpf_perf_event_ret
+	(*bpf_perf_event_print_t)(struct perf_event_header *hdr,
+				  void *private_data);
+LIBBPF_API enum bpf_perf_event_ret
+bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size,
+			   void **copy_mem, size_t *copy_size,
+			   bpf_perf_event_print_t fn, void *private_data);
 
 struct nlattr;
 typedef int (*libbpf_dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb);
diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h
index 05a6d47c79561..8f6531fd4dadb 100644
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
@@ -4,7 +4,7 @@
 #include <linux/compiler.h>
 #include <linux/refcount.h>
 #include <linux/types.h>
-#include <asm/barrier.h>
+#include <linux/ring_buffer.h>
 #include <stdbool.h>
 #include "auxtrace.h"
 #include "event.h"
@@ -71,21 +71,12 @@ void perf_mmap__consume(struct perf_mmap *map);
 
 static inline u64 perf_mmap__read_head(struct perf_mmap *mm)
 {
-	struct perf_event_mmap_page *pc = mm->base;
-	u64 head = READ_ONCE(pc->data_head);
-	rmb();
-	return head;
+	return ring_buffer_read_head(mm->base);
 }
 
 static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail)
 {
-	struct perf_event_mmap_page *pc = md->base;
-
-	/*
-	 * ensure all reads are done before we write the tail out.
-	 */
-	mb();
-	pc->data_tail = tail;
+	ring_buffer_write_tail(md->base, tail);
 }
 
 union perf_event *perf_mmap__read_forward(struct perf_mmap *map);
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 8a60c9b9892d2..1b799e30c06d3 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -25,3 +25,5 @@ test_cgroup_storage
 test_select_reuseport
 test_flow_dissector
 flow_dissector_load
+test_netcnt
+test_section_names
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index d99dd6fc3fbe4..e39dfb4e7970f 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -37,7 +37,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \
 	get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
 	test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o \
-	test_sk_lookup_kern.o test_xdp_vlan.o
+	test_sk_lookup_kern.o test_xdp_vlan.o test_queue_map.o test_stack_map.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -118,6 +118,9 @@ CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi \
 $(OUTPUT)/test_l4lb_noinline.o: CLANG_FLAGS += -fno-inline
 $(OUTPUT)/test_xdp_noinline.o: CLANG_FLAGS += -fno-inline
 
+$(OUTPUT)/test_queue_map.o: test_queue_stack_map.h
+$(OUTPUT)/test_stack_map.o: test_queue_stack_map.h
+
 BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris)
 BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF)
 BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 'usage.*llvm')
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index fda8c162d0dfa..686e57ce40f43 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -16,6 +16,13 @@ static int (*bpf_map_update_elem)(void *map, void *key, void *value,
 	(void *) BPF_FUNC_map_update_elem;
 static int (*bpf_map_delete_elem)(void *map, void *key) =
 	(void *) BPF_FUNC_map_delete_elem;
+static int (*bpf_map_push_elem)(void *map, void *value,
+				unsigned long long flags) =
+	(void *) BPF_FUNC_map_push_elem;
+static int (*bpf_map_pop_elem)(void *map, void *value) =
+	(void *) BPF_FUNC_map_pop_elem;
+static int (*bpf_map_peek_elem)(void *map, void *value) =
+	(void *) BPF_FUNC_map_peek_elem;
 static int (*bpf_probe_read)(void *dst, int size, void *unsafe_ptr) =
 	(void *) BPF_FUNC_probe_read;
 static unsigned long long (*bpf_ktime_get_ns)(void) =
@@ -104,6 +111,8 @@ static int (*bpf_msg_cork_bytes)(void *ctx, int len) =
 	(void *) BPF_FUNC_msg_cork_bytes;
 static int (*bpf_msg_pull_data)(void *ctx, int start, int end, int flags) =
 	(void *) BPF_FUNC_msg_pull_data;
+static int (*bpf_msg_push_data)(void *ctx, int start, int len, int flags) =
+	(void *) BPF_FUNC_msg_push_data;
 static int (*bpf_bind)(void *ctx, void *addr, int addr_len) =
 	(void *) BPF_FUNC_bind;
 static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) =
diff --git a/tools/testing/selftests/bpf/test_libbpf.sh b/tools/testing/selftests/bpf/test_libbpf.sh
index d97dc914cd496..156d89f1edcc4 100755
--- a/tools/testing/selftests/bpf/test_libbpf.sh
+++ b/tools/testing/selftests/bpf/test_libbpf.sh
@@ -6,7 +6,7 @@ export TESTNAME=test_libbpf
 # Determine selftest success via shell exit code
 exit_handler()
 {
-	if (( $? == 0 )); then
+	if [ $? -eq 0 ]; then
 		echo "selftests: $TESTNAME [PASS]";
 	else
 		echo "$TESTNAME: failed at file $LAST_LOADED" 1>&2
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index 9b552c0fc47db..4db2116e52be4 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -15,6 +15,7 @@
 #include <string.h>
 #include <assert.h>
 #include <stdlib.h>
+#include <time.h>
 
 #include <sys/wait.h>
 #include <sys/socket.h>
@@ -471,6 +472,122 @@ static void test_devmap(int task, void *data)
 	close(fd);
 }
 
+static void test_queuemap(int task, void *data)
+{
+	const int MAP_SIZE = 32;
+	__u32 vals[MAP_SIZE + MAP_SIZE/2], val;
+	int fd, i;
+
+	/* Fill test values to be used */
+	for (i = 0; i < MAP_SIZE + MAP_SIZE/2; i++)
+		vals[i] = rand();
+
+	/* Invalid key size */
+	fd = bpf_create_map(BPF_MAP_TYPE_QUEUE, 4, sizeof(val), MAP_SIZE,
+			    map_flags);
+	assert(fd < 0 && errno == EINVAL);
+
+	fd = bpf_create_map(BPF_MAP_TYPE_QUEUE, 0, sizeof(val), MAP_SIZE,
+			    map_flags);
+	/* Queue map does not support BPF_F_NO_PREALLOC */
+	if (map_flags & BPF_F_NO_PREALLOC) {
+		assert(fd < 0 && errno == EINVAL);
+		return;
+	}
+	if (fd < 0) {
+		printf("Failed to create queuemap '%s'!\n", strerror(errno));
+		exit(1);
+	}
+
+	/* Push MAP_SIZE elements */
+	for (i = 0; i < MAP_SIZE; i++)
+		assert(bpf_map_update_elem(fd, NULL, &vals[i], 0) == 0);
+
+	/* Check that element cannot be pushed due to max_entries limit */
+	assert(bpf_map_update_elem(fd, NULL, &val, 0) == -1 &&
+	       errno == E2BIG);
+
+	/* Peek element */
+	assert(bpf_map_lookup_elem(fd, NULL, &val) == 0 && val == vals[0]);
+
+	/* Replace half elements */
+	for (i = MAP_SIZE; i < MAP_SIZE + MAP_SIZE/2; i++)
+		assert(bpf_map_update_elem(fd, NULL, &vals[i], BPF_EXIST) == 0);
+
+	/* Pop all elements */
+	for (i = MAP_SIZE/2; i < MAP_SIZE + MAP_SIZE/2; i++)
+		assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) == 0 &&
+		       val == vals[i]);
+
+	/* Check that there are no elements left */
+	assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) == -1 &&
+	       errno == ENOENT);
+
+	/* Check that unsupported functions set errno to EINVAL */
+	assert(bpf_map_delete_elem(fd, NULL) == -1 && errno == EINVAL);
+	assert(bpf_map_get_next_key(fd, NULL, NULL) == -1 && errno == EINVAL);
+
+	close(fd);
+}
+
+static void test_stackmap(int task, void *data)
+{
+	const int MAP_SIZE = 32;
+	__u32 vals[MAP_SIZE + MAP_SIZE/2], val;
+	int fd, i;
+
+	/* Fill test values to be used */
+	for (i = 0; i < MAP_SIZE + MAP_SIZE/2; i++)
+		vals[i] = rand();
+
+	/* Invalid key size */
+	fd = bpf_create_map(BPF_MAP_TYPE_STACK, 4, sizeof(val), MAP_SIZE,
+			    map_flags);
+	assert(fd < 0 && errno == EINVAL);
+
+	fd = bpf_create_map(BPF_MAP_TYPE_STACK, 0, sizeof(val), MAP_SIZE,
+			    map_flags);
+	/* Stack map does not support BPF_F_NO_PREALLOC */
+	if (map_flags & BPF_F_NO_PREALLOC) {
+		assert(fd < 0 && errno == EINVAL);
+		return;
+	}
+	if (fd < 0) {
+		printf("Failed to create stackmap '%s'!\n", strerror(errno));
+		exit(1);
+	}
+
+	/* Push MAP_SIZE elements */
+	for (i = 0; i < MAP_SIZE; i++)
+		assert(bpf_map_update_elem(fd, NULL, &vals[i], 0) == 0);
+
+	/* Check that element cannot be pushed due to max_entries limit */
+	assert(bpf_map_update_elem(fd, NULL, &val, 0) == -1 &&
+	       errno == E2BIG);
+
+	/* Peek element */
+	assert(bpf_map_lookup_elem(fd, NULL, &val) == 0 && val == vals[i - 1]);
+
+	/* Replace half elements */
+	for (i = MAP_SIZE; i < MAP_SIZE + MAP_SIZE/2; i++)
+		assert(bpf_map_update_elem(fd, NULL, &vals[i], BPF_EXIST) == 0);
+
+	/* Pop all elements */
+	for (i = MAP_SIZE + MAP_SIZE/2 - 1; i >= MAP_SIZE/2; i--)
+		assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) == 0 &&
+		       val == vals[i]);
+
+	/* Check that there are no elements left */
+	assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) == -1 &&
+	       errno == ENOENT);
+
+	/* Check that unsupported functions set errno to EINVAL */
+	assert(bpf_map_delete_elem(fd, NULL) == -1 && errno == EINVAL);
+	assert(bpf_map_get_next_key(fd, NULL, NULL) == -1 && errno == EINVAL);
+
+	close(fd);
+}
+
 #include <sys/socket.h>
 #include <sys/ioctl.h>
 #include <arpa/inet.h>
@@ -1434,10 +1551,15 @@ static void run_all_tests(void)
 	test_map_wronly();
 
 	test_reuseport_array();
+
+	test_queuemap(0, NULL);
+	test_stackmap(0, NULL);
 }
 
 int main(void)
 {
+	srand(time(NULL));
+
 	map_flags = 0;
 	run_all_tests();
 
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index e8becca9c5211..2d3c04f455302 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1735,8 +1735,105 @@ static void test_reference_tracking()
 	bpf_object__close(obj);
 }
 
+enum {
+	QUEUE,
+	STACK,
+};
+
+static void test_queue_stack_map(int type)
+{
+	const int MAP_SIZE = 32;
+	__u32 vals[MAP_SIZE], duration, retval, size, val;
+	int i, err, prog_fd, map_in_fd, map_out_fd;
+	char file[32], buf[128];
+	struct bpf_object *obj;
+	struct iphdr *iph = (void *)buf + sizeof(struct ethhdr);
+
+	/* Fill test values to be used */
+	for (i = 0; i < MAP_SIZE; i++)
+		vals[i] = rand();
+
+	if (type == QUEUE)
+		strncpy(file, "./test_queue_map.o", sizeof(file));
+	else if (type == STACK)
+		strncpy(file, "./test_stack_map.o", sizeof(file));
+	else
+		return;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
+	if (err) {
+		error_cnt++;
+		return;
+	}
+
+	map_in_fd = bpf_find_map(__func__, obj, "map_in");
+	if (map_in_fd < 0)
+		goto out;
+
+	map_out_fd = bpf_find_map(__func__, obj, "map_out");
+	if (map_out_fd < 0)
+		goto out;
+
+	/* Push 32 elements to the input map */
+	for (i = 0; i < MAP_SIZE; i++) {
+		err = bpf_map_update_elem(map_in_fd, NULL, &vals[i], 0);
+		if (err) {
+			error_cnt++;
+			goto out;
+		}
+	}
+
+	/* The eBPF program pops a value from the input map into
+	 * iph.daddr and pushes iph.saddr into the output map.
+	 */
+	for (i = 0; i < MAP_SIZE; i++) {
+		if (type == QUEUE) {
+			val = vals[i];
+			pkt_v4.iph.saddr = vals[i] * 5;
+		} else if (type == STACK) {
+			val = vals[MAP_SIZE - 1 - i];
+			pkt_v4.iph.saddr = vals[MAP_SIZE - 1 - i] * 5;
+		}
+
+		err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+					buf, &size, &retval, &duration);
+		if (err || retval || size != sizeof(pkt_v4) ||
+		    iph->daddr != val)
+			break;
+	}
+
+	CHECK(err || retval || size != sizeof(pkt_v4) || iph->daddr != val,
+	      "bpf_map_pop_elem",
+	      "err %d errno %d retval %d size %d iph->daddr %u\n",
+	      err, errno, retval, size, iph->daddr);
+
+	/* Queue is empty, program should return TC_ACT_SHOT */
+	err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+				buf, &size, &retval, &duration);
+	CHECK(err || retval != 2 /* TC_ACT_SHOT */ || size != sizeof(pkt_v4),
+	      "check-queue-stack-map-empty",
+	      "err %d errno %d retval %d size %d\n",
+	      err, errno, retval, size);
+
+	/* Check that the program pushed elements correctly */
+	for (i = 0; i < MAP_SIZE; i++) {
+		err = bpf_map_lookup_and_delete_elem(map_out_fd, NULL, &val);
+		if (err || val != vals[i] * 5)
+			break;
+	}
+
+	CHECK(i != MAP_SIZE && (err || val != vals[i] * 5),
+	      "bpf_map_push_elem", "err %d value %u\n", err, val);
+
+out:
+	pkt_v4.iph.saddr = 0;
+	bpf_object__close(obj);
+}
+
 int main(void)
 {
+	srand(time(NULL));
+
 	jit_enabled = is_jit_enabled();
 
 	test_pkt_access();
@@ -1757,6 +1854,8 @@ int main(void)
 	test_task_fd_query_rawtp();
 	test_task_fd_query_tp();
 	test_reference_tracking();
+	test_queue_stack_map(QUEUE);
+	test_queue_stack_map(STACK);
 
 	printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
 	return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
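
For readers following along: bpf_find_map() used above is a small wrapper defined earlier in test_progs.c; in essence it resolves a map file descriptor through the public libbpf API, roughly as in this sketch (assuming the error_cnt counter and printf conventions of this file):

    static int bpf_find_map(const char *test, struct bpf_object *obj,
    			    const char *name)
    {
    	struct bpf_map *map;

    	map = bpf_object__find_map_by_name(obj, name);
    	if (!map) {
    		printf("%s:FAIL:map '%s' not found\n", test, name);
    		error_cnt++;
    		return -1;
    	}
    	return bpf_map__fd(map);
    }
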
diff --git a/tools/testing/selftests/bpf/test_queue_map.c b/tools/testing/selftests/bpf/test_queue_map.c
new file mode 100644
index 0000000000000..87db1f9da33db
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_queue_map.c
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Politecnico di Torino
+#define MAP_TYPE BPF_MAP_TYPE_QUEUE
+#include "test_queue_stack_map.h"
diff --git a/tools/testing/selftests/bpf/test_queue_stack_map.h b/tools/testing/selftests/bpf/test_queue_stack_map.h
new file mode 100644
index 0000000000000..295b9b3bc5c72
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_queue_stack_map.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2018 Politecnico di Torino
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/pkt_cls.h>
+#include "bpf_helpers.h"
+
+int _version SEC("version") = 1;
+
+struct bpf_map_def __attribute__ ((section("maps"), used)) map_in = {
+	.type = MAP_TYPE,
+	.key_size = 0,
+	.value_size = sizeof(__u32),
+	.max_entries = 32,
+	.map_flags = 0,
+};
+
+struct bpf_map_def __attribute__ ((section("maps"), used)) map_out = {
+	.type = MAP_TYPE,
+	.key_size = 0,
+	.value_size = sizeof(__u32),
+	.max_entries = 32,
+	.map_flags = 0,
+};
+
+SEC("test")
+int _test(struct __sk_buff *skb)
+{
+	void *data_end = (void *)(long)skb->data_end;
+	void *data = (void *)(long)skb->data;
+	struct ethhdr *eth = (struct ethhdr *)(data);
+	__u32 value;
+	int err;
+
+	if (eth + 1 > data_end)
+		return TC_ACT_SHOT;
+
+	struct iphdr *iph = (struct iphdr *)(eth + 1);
+
+	if (iph + 1 > data_end)
+		return TC_ACT_SHOT;
+
+	err = bpf_map_pop_elem(&map_in, &value);
+	if (err)
+		return TC_ACT_SHOT;
+
+	iph->daddr = value;
+
+	err = bpf_map_push_elem(&map_out, &iph->saddr, 0);
+	if (err)
+		return TC_ACT_SHOT;
+
+	return TC_ACT_OK;
+}
+
+char _license[] SEC("license") = "GPL";
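
Only pop and push are exercised by this header; assuming bpf_map_peek_elem() from the same helper family is also available, a peek/pop consistency check appended to this header could look like the sketch below (section name "test_peek" is illustrative):

    SEC("test_peek")
    int _test_peek(struct __sk_buff *skb)
    {
    	__u32 head, value;

    	/* peek reads the head element without consuming it ... */
    	if (bpf_map_peek_elem(&map_in, &head))
    		return TC_ACT_SHOT;

    	/* ... so the following pop must return the same value */
    	if (bpf_map_pop_elem(&map_in, &value))
    		return TC_ACT_SHOT;

    	return head == value ? TC_ACT_OK : TC_ACT_SHOT;
    }
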
diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 10a5fa83c75a1..622ade0a09578 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -28,6 +28,7 @@
 #include <linux/sock_diag.h>
 #include <linux/bpf.h>
 #include <linux/if_link.h>
+#include <linux/tls.h>
 #include <assert.h>
 #include <libgen.h>
 
@@ -43,6 +44,13 @@
 int running;
 static void running_handler(int a);
 
+#ifndef TCP_ULP
+# define TCP_ULP 31
+#endif
+#ifndef SOL_TLS
+# define SOL_TLS 282
+#endif
+
 /* randomly selected ports for testing on lo */
 #define S1_PORT 10000
 #define S2_PORT 10001
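
The TCP_ULP and SOL_TLS fallback defines above exist so the test still builds against older uapi headers; the --ktls mode they support layers TLS onto the TCP socket before the sockmap tests run. The canonical setup sequence is two setsockopt() calls, roughly as sketched here (helper name and zeroed AES-GCM-128 key material are illustrative):

    static int init_ktls_tx(int fd)
    {
    	struct tls12_crypto_info_aes_gcm_128 tls_tx = {
    		.info = {
    			.version = TLS_1_2_VERSION,
    			.cipher_type = TLS_CIPHER_AES_GCM_128,
    		},
    	};

    	/* attach the TLS ULP, then install TX key material */
    	if (setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")))
    		return -errno;
    	if (setsockopt(fd, SOL_TLS, TLS_TX, &tls_tx, sizeof(tls_tx)))
    		return -errno;
    	return 0;
    }
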
@@ -69,9 +77,12 @@ int txmsg_apply;
 int txmsg_cork;
 int txmsg_start;
 int txmsg_end;
+int txmsg_start_push;
+int txmsg_end_push;
 int txmsg_ingress;
 int txmsg_skb;
 int ktls;
+int peek_flag;
 
 static const struct option long_options[] = {
 	{"help",	no_argument,		NULL, 'h' },
@@ -91,9 +102,12 @@ static const struct option long_options[] = {
 	{"txmsg_cork",	required_argument,	NULL, 'k'},
 	{"txmsg_start", required_argument,	NULL, 's'},
 	{"txmsg_end",	required_argument,	NULL, 'e'},
+	{"txmsg_start_push", required_argument,	NULL, 'p'},
+	{"txmsg_end_push",   required_argument,	NULL, 'q'},
 	{"txmsg_ingress", no_argument,		&txmsg_ingress, 1 },
 	{"txmsg_skb", no_argument,		&txmsg_skb, 1 },
 	{"ktls", no_argument,			&ktls, 1 },
+	{"peek", no_argument,			&peek_flag, 1 },
 	{0, 0, NULL, 0 }
 };
 
@@ -114,11 +128,6 @@ static void usage(char *argv[])
 	printf("\n");
 }
 
-#define TCP_ULP 31
-#define TLS_TX 1
-#define TLS_RX 2
-#include <linux/tls.h>
-
 char *sock_to_string(int s)
 {
 	if (s == c1)
@@ -349,33 +358,40 @@ static int msg_loop_sendpage(int fd, int iov_length, int cnt,
 	return 0;
 }
 
-static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
-		    struct msg_stats *s, bool tx,
-		    struct sockmap_options *opt)
+static void msg_free_iov(struct msghdr *msg)
 {
-	struct msghdr msg = {0};
-	int err, i, flags = MSG_NOSIGNAL;
+	int i;
+
+	for (i = 0; i < msg->msg_iovlen; i++)
+		free(msg->msg_iov[i].iov_base);
+	free(msg->msg_iov);
+	msg->msg_iov = NULL;
+	msg->msg_iovlen = 0;
+}
+
+static int msg_alloc_iov(struct msghdr *msg,
+			 int iov_count, int iov_length,
+			 bool data, bool xmit)
+{
+	unsigned char k = 0;
 	struct iovec *iov;
-	unsigned char k;
-	bool data_test = opt->data_test;
-	bool drop = opt->drop_expected;
+	int i;
 
 	iov = calloc(iov_count, sizeof(struct iovec));
 	if (!iov)
 		return errno;
 
-	k = 0;
 	for (i = 0; i < iov_count; i++) {
 		unsigned char *d = calloc(iov_length, sizeof(char));
 
 		if (!d) {
 			fprintf(stderr, "iov_count %i/%i OOM\n", i, iov_count);
-			goto out_errno;
+			goto unwind_iov;
 		}
 		iov[i].iov_base = d;
 		iov[i].iov_len = iov_length;
 
-		if (data_test && tx) {
+		if (data && xmit) {
 			int j;
 
 			for (j = 0; j < iov_length; j++)
@@ -383,9 +399,60 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 		}
 	}
 
-	msg.msg_iov = iov;
-	msg.msg_iovlen = iov_count;
-	k = 0;
+	msg->msg_iov = iov;
+	msg->msg_iovlen = iov_count;
+
+	return 0;
+unwind_iov:
+	for (i--; i >= 0; i--)
+		free(iov[i].iov_base);
+	free(iov);
+	return -ENOMEM;
+}
+
+static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz)
+{
+	int i, j, bytes_cnt = 0;
+	unsigned char k = 0;
+
+	for (i = 0; i < msg->msg_iovlen; i++) {
+		unsigned char *d = msg->msg_iov[i].iov_base;
+
+		for (j = 0;
+		     j < msg->msg_iov[i].iov_len && size; j++) {
+			if (d[j] != k++) {
+				fprintf(stderr,
+					"detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n",
+					i, j, d[j], k - 1, d[j+1], k);
+				return -EIO;
+			}
+			bytes_cnt++;
+			if (bytes_cnt == chunk_sz) {
+				k = 0;
+				bytes_cnt = 0;
+			}
+			size--;
+		}
+	}
+	return 0;
+}
+
+static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
+		    struct msg_stats *s, bool tx,
+		    struct sockmap_options *opt)
+{
+	struct msghdr msg = {0}, msg_peek = {0};
+	int err, i, flags = MSG_NOSIGNAL;
+	bool drop = opt->drop_expected;
+	bool data = opt->data_test;
+
+	err = msg_alloc_iov(&msg, iov_count, iov_length, data, tx);
+	if (err)
+		goto out_errno;
+	if (peek_flag) {
+		err = msg_alloc_iov(&msg_peek, iov_count, iov_length, data, tx);
+		if (err)
+			goto out_errno;
+	}
 
 	if (tx) {
 		clock_gettime(CLOCK_MONOTONIC, &s->start);
@@ -405,19 +472,12 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 		}
 		clock_gettime(CLOCK_MONOTONIC, &s->end);
 	} else {
-		int slct, recv, max_fd = fd;
+		int slct, recvp = 0, recv, max_fd = fd;
 		int fd_flags = O_NONBLOCK;
 		struct timeval timeout;
 		float total_bytes;
-		int bytes_cnt = 0;
-		int chunk_sz;
 		fd_set w;
 
-		if (opt->sendpage)
-			chunk_sz = iov_length * cnt;
-		else
-			chunk_sz = iov_length * iov_count;
-
 		fcntl(fd, fd_flags);
 		total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
 		err = clock_gettime(CLOCK_MONOTONIC, &s->start);
@@ -449,6 +509,19 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 				goto out_errno;
 			}
 
+			errno = 0;
+			if (peek_flag) {
+				flags |= MSG_PEEK;
+				recvp = recvmsg(fd, &msg_peek, flags);
+				if (recvp < 0) {
+					if (errno != EWOULDBLOCK) {
+						clock_gettime(CLOCK_MONOTONIC, &s->end);
+						goto out_errno;
+					}
+				}
+				flags = 0;
+			}
+
 			recv = recvmsg(fd, &msg, flags);
 			if (recv < 0) {
 				if (errno != EWOULDBLOCK) {
@@ -460,27 +533,23 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 
 			s->bytes_recvd += recv;
 
-			if (data_test) {
-				int j;
-
-				for (i = 0; i < msg.msg_iovlen; i++) {
-					unsigned char *d = iov[i].iov_base;
-
-					for (j = 0;
-					     j < iov[i].iov_len && recv; j++) {
-						if (d[j] != k++) {
-							errno = -EIO;
-							fprintf(stderr,
-								"detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n",
-								i, j, d[j], k - 1, d[j+1], k);
-							goto out_errno;
-						}
-						bytes_cnt++;
-						if (bytes_cnt == chunk_sz) {
-							k = 0;
-							bytes_cnt = 0;
-						}
-						recv--;
+			if (data) {
+				int chunk_sz = opt->sendpage ?
+						iov_length * cnt :
+						iov_length * iov_count;
+
+				errno = msg_verify_data(&msg, recv, chunk_sz);
+				if (errno) {
+					perror("data verify msg failed");
+					goto out_errno;
+				}
+				if (recvp) {
+					errno = msg_verify_data(&msg_peek,
+								recvp,
+								chunk_sz);
+					if (errno) {
+						perror("data verify msg_peek failed");
+						goto out_errno;
 					}
 				}
 			}
@@ -488,14 +557,12 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 		clock_gettime(CLOCK_MONOTONIC, &s->end);
 	}
 
-	for (i = 0; i < iov_count; i++)
-		free(iov[i].iov_base);
-	free(iov);
-	return 0;
+	msg_free_iov(&msg);
+	msg_free_iov(&msg_peek);
+	return err;
 out_errno:
-	for (i = 0; i < iov_count; i++)
-		free(iov[i].iov_base);
-	free(iov);
+	msg_free_iov(&msg);
+	msg_free_iov(&msg_peek);
 	return errno;
 }
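
The peek path added to msg_loop() relies on a recvmsg() invariant: MSG_PEEK returns queued data without consuming it, so the plain recvmsg() that follows must observe the same bytes. Condensed into a stand-alone sketch (helper name illustrative; sys/socket.h and string.h are already included by this file):

    static int peek_then_read(int fd, char *peeked, char *readb, size_t len)
    {
    	struct iovec iov = { .iov_base = peeked, .iov_len = len };
    	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
    	ssize_t p, r;

    	p = recvmsg(fd, &msg, MSG_PEEK | MSG_NOSIGNAL); /* data stays queued */
    	iov.iov_base = readb;
    	r = recvmsg(fd, &msg, MSG_NOSIGNAL);		/* data is consumed */
    	if (p <= 0 || r < p)
    		return -1;

    	/* peeked and read bytes must match */
    	return memcmp(peeked, readb, p) ? -1 : 0;
    }
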
 
@@ -562,9 +629,10 @@ static int sendmsg_test(struct sockmap_options *opt)
 		}
 		if (opt->verbose)
 			fprintf(stdout,
-				"rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s\n",
+				"rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s %s\n",
 				s.bytes_sent, sent_Bps, sent_Bps/giga,
-				s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
+				s.bytes_recvd, recvd_Bps, recvd_Bps/giga,
+				peek_flag ? "(peek_msg)" : "");
 		if (err && txmsg_cork)
 			err = 0;
 		exit(err ? 1 : 0);
@@ -839,6 +907,30 @@ static int run_options(struct sockmap_options *options, int cg_fd,  int test)
 			}
 		}
 
+		if (txmsg_start_push) {
+			i = 2;
+			err = bpf_map_update_elem(map_fd[5],
+						  &i, &txmsg_start_push, BPF_ANY);
+			if (err) {
+				fprintf(stderr,
+					"ERROR: bpf_map_update_elem (txmsg_start_push):  %d (%s)\n",
+					err, strerror(errno));
+				goto out;
+			}
+		}
+
+		if (txmsg_end_push) {
+			i = 3;
+			err = bpf_map_update_elem(map_fd[5],
+						  &i, &txmsg_end_push, BPF_ANY);
+			if (err) {
+				fprintf(stderr,
+					"ERROR: bpf_map_update_elem %i@%i (txmsg_end_push):  %d (%s)\n",
+					txmsg_end_push, i, err, strerror(errno));
+				goto out;
+			}
+		}
+
 		if (txmsg_ingress) {
 			int in = BPF_F_INGRESS;
 
@@ -996,6 +1088,8 @@ static void test_options(char *options)
 		strncat(options, "skb,", OPTSTRING);
 	if (ktls)
 		strncat(options, "ktls,", OPTSTRING);
+	if (peek_flag)
+		strncat(options, "peek,", OPTSTRING);
 }
 
 static int __test_exec(int cgrp, int test, struct sockmap_options *opt)
@@ -1169,6 +1263,8 @@ static int test_mixed(int cgrp)
 	txmsg_pass = txmsg_noisy = txmsg_redir_noisy = txmsg_drop = 0;
 	txmsg_apply = txmsg_cork = 0;
 	txmsg_start = txmsg_end = 0;
+	txmsg_start_push = txmsg_end_push = 0;
+
 	/* Test small and large iov_count values with pass/redir/apply/cork */
 	txmsg_pass = 1;
 	txmsg_redir = 0;
@@ -1285,6 +1381,8 @@ static int test_start_end(int cgrp)
 	/* Test basic start/end with lots of iov_count and iov_lengths */
 	txmsg_start = 1;
 	txmsg_end = 2;
+	txmsg_start_push = 1;
+	txmsg_end_push = 2;
 	err = test_txmsg(cgrp);
 	if (err)
 		goto out;
@@ -1298,6 +1396,8 @@ static int test_start_end(int cgrp)
 	for (i = 99; i <= 1600; i += 500) {
 		txmsg_start = 0;
 		txmsg_end = i;
+		txmsg_start_push = 0;
+		txmsg_end_push = i;
 		err = test_exec(cgrp, &opt);
 		if (err)
 			goto out;
@@ -1307,6 +1407,8 @@ static int test_start_end(int cgrp)
 	for (i = 199; i <= 1600; i += 500) {
 		txmsg_start = 100;
 		txmsg_end = i;
+		txmsg_start_push = 100;
+		txmsg_end_push = i;
 		err = test_exec(cgrp, &opt);
 		if (err)
 			goto out;
@@ -1315,6 +1417,8 @@ static int test_start_end(int cgrp)
 	/* Test start/end with cork pulling last sg entry */
 	txmsg_start = 1500;
 	txmsg_end = 1600;
+	txmsg_start_push = 1500;
+	txmsg_end_push = 1600;
 	err = test_exec(cgrp, &opt);
 	if (err)
 		goto out;
@@ -1322,6 +1426,8 @@ static int test_start_end(int cgrp)
 	/* Test start/end pull of single byte in last page */
 	txmsg_start = 1111;
 	txmsg_end = 1112;
+	txmsg_start_push = 1111;
+	txmsg_end_push = 1112;
 	err = test_exec(cgrp, &opt);
 	if (err)
 		goto out;
@@ -1329,6 +1435,8 @@ static int test_start_end(int cgrp)
 	/* Test start/end with end < start */
 	txmsg_start = 1111;
 	txmsg_end = 0;
+	txmsg_start_push = 1111;
+	txmsg_end_push = 0;
 	err = test_exec(cgrp, &opt);
 	if (err)
 		goto out;
@@ -1336,6 +1444,8 @@ static int test_start_end(int cgrp)
 	/* Test start/end with end > data */
 	txmsg_start = 0;
 	txmsg_end = 1601;
+	txmsg_start_push = 0;
+	txmsg_end_push = 1601;
 	err = test_exec(cgrp, &opt);
 	if (err)
 		goto out;
@@ -1343,6 +1453,8 @@ static int test_start_end(int cgrp)
 	/* Test start/end with start > data */
 	txmsg_start = 1601;
 	txmsg_end = 1600;
+	txmsg_start_push = 1601;
+	txmsg_end_push = 1600;
 	err = test_exec(cgrp, &opt);
 
 out:
@@ -1358,7 +1470,7 @@ char *map_names[] = {
 	"sock_map_redir",
 	"sock_apply_bytes",
 	"sock_cork_bytes",
-	"sock_pull_bytes",
+	"sock_bytes",
 	"sock_redir_flags",
 	"sock_skb_opts",
 };
@@ -1465,7 +1577,7 @@ static int __test_suite(int cg_fd, char *bpf_file)
 	}
 
 	/* Tests basic commands and APIs with range of iov values */
-	txmsg_start = txmsg_end = 0;
+	txmsg_start = txmsg_end = txmsg_start_push = txmsg_end_push = 0;
 	err = test_txmsg(cg_fd);
 	if (err)
 		goto out;
@@ -1514,7 +1626,7 @@ int main(int argc, char **argv)
 	if (argc < 2)
 		return test_suite(-1);
 
-	while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:",
+	while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:",
 				  long_options, &longindex)) != -1) {
 		switch (opt) {
 		case 's':
@@ -1523,6 +1635,12 @@ int main(int argc, char **argv)
 		case 'e':
 			txmsg_end = atoi(optarg);
 			break;
+		case 'p':
+			txmsg_start_push = atoi(optarg);
+			break;
+		case 'q':
+			txmsg_end_push = atoi(optarg);
+			break;
 		case 'a':
 			txmsg_apply = atoi(optarg);
 			break;
diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.h b/tools/testing/selftests/bpf/test_sockmap_kern.h
index 8e8e41780bb9f..14b8bbac004f8 100644
--- a/tools/testing/selftests/bpf/test_sockmap_kern.h
+++ b/tools/testing/selftests/bpf/test_sockmap_kern.h
@@ -70,11 +70,11 @@ struct bpf_map_def SEC("maps") sock_cork_bytes = {
 	.max_entries = 1
 };
 
-struct bpf_map_def SEC("maps") sock_pull_bytes = {
+struct bpf_map_def SEC("maps") sock_bytes = {
 	.type = BPF_MAP_TYPE_ARRAY,
 	.key_size = sizeof(int),
 	.value_size = sizeof(int),
-	.max_entries = 2
+	.max_entries = 4
 };
 
 struct bpf_map_def SEC("maps") sock_redir_flags = {
@@ -181,8 +181,8 @@ int bpf_sockmap(struct bpf_sock_ops *skops)
 SEC("sk_msg1")
 int bpf_prog4(struct sk_msg_md *msg)
 {
-	int *bytes, zero = 0, one = 1;
-	int *start, *end;
+	int *bytes, zero = 0, one = 1, two = 2, three = 3;
+	int *start, *end, *start_push, *end_push;
 
 	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
 	if (bytes)
@@ -190,18 +190,24 @@ int bpf_prog4(struct sk_msg_md *msg)
 	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
 	if (bytes)
 		bpf_msg_cork_bytes(msg, *bytes);
-	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
-	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
+	start = bpf_map_lookup_elem(&sock_bytes, &zero);
+	end = bpf_map_lookup_elem(&sock_bytes, &one);
 	if (start && end)
 		bpf_msg_pull_data(msg, *start, *end, 0);
+	start_push = bpf_map_lookup_elem(&sock_bytes, &two);
+	end_push = bpf_map_lookup_elem(&sock_bytes, &three);
+	if (start_push && end_push)
+		bpf_msg_push_data(msg, *start_push, *end_push, 0);
 	return SK_PASS;
 }
 
 SEC("sk_msg2")
 int bpf_prog5(struct sk_msg_md *msg)
 {
-	int err1 = -1, err2 = -1, zero = 0, one = 1;
-	int *bytes, *start, *end, len1, len2;
+	int zero = 0, one = 1, two = 2, three = 3;
+	int *start, *end, *start_push, *end_push;
+	int *bytes, len1, len2 = 0, len3;
+	int err1 = -1, err2 = -1;
 
 	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
 	if (bytes)
@@ -210,8 +216,8 @@ int bpf_prog5(struct sk_msg_md *msg)
 	if (bytes)
 		err2 = bpf_msg_cork_bytes(msg, *bytes);
 	len1 = (__u64)msg->data_end - (__u64)msg->data;
-	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
-	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
+	start = bpf_map_lookup_elem(&sock_bytes, &zero);
+	end = bpf_map_lookup_elem(&sock_bytes, &one);
 	if (start && end) {
 		int err;
 
@@ -225,6 +231,23 @@ int bpf_prog5(struct sk_msg_md *msg)
 		bpf_printk("sk_msg2: length update %i->%i\n",
 			   len1, len2);
 	}
+
+	start_push = bpf_map_lookup_elem(&sock_bytes, &two);
+	end_push = bpf_map_lookup_elem(&sock_bytes, &three);
+	if (start_push && end_push) {
+		int err;
+
+		bpf_printk("sk_msg2: push(%i:%i)\n",
+			   start_push ? *start_push : 0,
+			   end_push ? *end_push : 0);
+		err = bpf_msg_push_data(msg, *start_push, *end_push, 0);
+		if (err)
+			bpf_printk("sk_msg2: push_data err %i\n", err);
+		len3 = (__u64)msg->data_end - (__u64)msg->data;
+		bpf_printk("sk_msg2: length push_update %i->%i\n",
+			   len2 ? len2 : len1, len3);
+	}
+
 	bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n",
 		   len1, err1, err2);
 	return SK_PASS;
@@ -233,8 +256,8 @@ int bpf_prog5(struct sk_msg_md *msg)
 SEC("sk_msg3")
 int bpf_prog6(struct sk_msg_md *msg)
 {
-	int *bytes, zero = 0, one = 1, key = 0;
-	int *start, *end, *f;
+	int *bytes, *start, *end, *start_push, *end_push, *f;
+	int zero = 0, one = 1, two = 2, three = 3, key = 0;
 	__u64 flags = 0;
 
 	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
@@ -243,10 +266,17 @@ int bpf_prog6(struct sk_msg_md *msg)
 	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
 	if (bytes)
 		bpf_msg_cork_bytes(msg, *bytes);
-	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
-	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
+
+	start = bpf_map_lookup_elem(&sock_bytes, &zero);
+	end = bpf_map_lookup_elem(&sock_bytes, &one);
 	if (start && end)
 		bpf_msg_pull_data(msg, *start, *end, 0);
+
+	start_push = bpf_map_lookup_elem(&sock_bytes, &two);
+	end_push = bpf_map_lookup_elem(&sock_bytes, &three);
+	if (start_push && end_push)
+		bpf_msg_push_data(msg, *start_push, *end_push, 0);
+
 	f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
 	if (f && *f) {
 		key = 2;
@@ -262,8 +292,9 @@ int bpf_prog6(struct sk_msg_md *msg)
 SEC("sk_msg4")
 int bpf_prog7(struct sk_msg_md *msg)
 {
-	int err1 = 0, err2 = 0, zero = 0, one = 1, key = 0;
-	int *f, *bytes, *start, *end, len1, len2;
+	int zero = 0, one = 1, two = 2, three = 3, len1, len2 = 0, len3;
+	int *bytes, *start, *end, *start_push, *end_push, *f;
+	int err1 = 0, err2 = 0, key = 0;
 	__u64 flags = 0;
 
 		int err;
@@ -274,10 +305,10 @@ int bpf_prog7(struct sk_msg_md *msg)
 	if (bytes)
 		err2 = bpf_msg_cork_bytes(msg, *bytes);
 	len1 = (__u64)msg->data_end - (__u64)msg->data;
-	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
-	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
-	if (start && end) {
 
+	start = bpf_map_lookup_elem(&sock_bytes, &zero);
+	end = bpf_map_lookup_elem(&sock_bytes, &one);
+	if (start && end) {
 		bpf_printk("sk_msg2: pull(%i:%i)\n",
 			   start ? *start : 0, end ? *end : 0);
 		err = bpf_msg_pull_data(msg, *start, *end, 0);
@@ -288,6 +319,22 @@ int bpf_prog7(struct sk_msg_md *msg)
 		bpf_printk("sk_msg2: length update %i->%i\n",
 			   len1, len2);
 	}
+
+	start_push = bpf_map_lookup_elem(&sock_bytes, &two);
+	end_push = bpf_map_lookup_elem(&sock_bytes, &three);
+	if (start_push && end_push) {
+		bpf_printk("sk_msg4: push(%i:%i)\n",
+			   start_push ? *start_push : 0,
+			   end_push ? *end_push : 0);
+		err = bpf_msg_push_data(msg, *start_push, *end_push, 0);
+		if (err)
+			bpf_printk("sk_msg4: push_data err %i\n",
+				   err);
+		len3 = (__u64)msg->data_end - (__u64)msg->data;
+		bpf_printk("sk_msg4: length push_update %i->%i\n",
+			   len2 ? len2 : len1, len3);
+	}
+
 	f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
 	if (f && *f) {
 		key = 2;
@@ -342,8 +389,8 @@ int bpf_prog9(struct sk_msg_md *msg)
 SEC("sk_msg7")
 int bpf_prog10(struct sk_msg_md *msg)
 {
-	int *bytes, zero = 0, one = 1;
-	int *start, *end;
+	int *bytes, *start, *end, *start_push, *end_push;
+	int zero = 0, one = 1, two = 2, three = 3;
 
 	bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
 	if (bytes)
@@ -351,10 +398,14 @@ int bpf_prog10(struct sk_msg_md *msg)
 	bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
 	if (bytes)
 		bpf_msg_cork_bytes(msg, *bytes);
-	start = bpf_map_lookup_elem(&sock_pull_bytes, &zero);
-	end = bpf_map_lookup_elem(&sock_pull_bytes, &one);
+	start = bpf_map_lookup_elem(&sock_bytes, &zero);
+	end = bpf_map_lookup_elem(&sock_bytes, &one);
 	if (start && end)
 		bpf_msg_pull_data(msg, *start, *end, 0);
+	start_push = bpf_map_lookup_elem(&sock_bytes, &two);
+	end_push = bpf_map_lookup_elem(&sock_bytes, &three);
+	if (start_push && end_push)
+		bpf_msg_push_data(msg, *start_push, *end_push, 0);
 
 	return SK_DROP;
 }
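
All of the sk_msg programs above read the same slot layout out of sock_bytes: indexes 0/1 form the bpf_msg_pull_data() range and indexes 2/3 the bpf_msg_push_data() range. The user-space side (run_options() in test_sockmap.c) fills those slots; condensed, it amounts to the following sketch (function name illustrative, fd being map_fd[5] as used above):

    static int set_sock_bytes(int fd)
    {
    	int i, vals[4] = { txmsg_start, txmsg_end,
    			   txmsg_start_push, txmsg_end_push };

    	/* slots 0/1: pull range, slots 2/3: push range */
    	for (i = 0; i < 4; i++)
    		if (vals[i] &&
    		    bpf_map_update_elem(fd, &i, &vals[i], BPF_ANY))
    			return -1;
    	return 0;
    }
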
diff --git a/tools/testing/selftests/bpf/test_stack_map.c b/tools/testing/selftests/bpf/test_stack_map.c
new file mode 100644
index 0000000000000..31c3880e6da00
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_stack_map.c
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Politecnico di Torino
+#define MAP_TYPE BPF_MAP_TYPE_STACK
+#include "test_queue_stack_map.h"
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index cf4cd32b67725..769d68a48f303 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -3430,7 +3430,7 @@ static struct bpf_test tests[] = {
 			BPF_ST_MEM(BPF_DW, BPF_REG_1, offsetof(struct __sk_buff, mark), 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "BPF_ST stores into R1 inv is not allowed",
+		.errstr = "BPF_ST stores into R1 ctx is not allowed",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
@@ -3442,7 +3442,7 @@ static struct bpf_test tests[] = {
 				     BPF_REG_0, offsetof(struct __sk_buff, mark), 0),
 			BPF_EXIT_INSN(),
 		},
-		.errstr = "BPF_XADD stores into R1 inv is not allowed",
+		.errstr = "BPF_XADD stores into R1 ctx is not allowed",
 		.result = REJECT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
@@ -4862,6 +4862,177 @@ static struct bpf_test tests[] = {
 		.result = REJECT,
 		.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 	},
+	{
+		"direct packet read test#1 for CGROUP_SKB",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct __sk_buff, data)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_end)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+				    offsetof(struct __sk_buff, len)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+				    offsetof(struct __sk_buff, pkt_type)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_6,
+				    offsetof(struct __sk_buff, mark)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct __sk_buff, queue_mapping)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_8, BPF_REG_1,
+				    offsetof(struct __sk_buff, protocol)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_1,
+				    offsetof(struct __sk_buff, vlan_present)),
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+			BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+	},
+	{
+		"direct packet read test#2 for CGROUP_SKB",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+				    offsetof(struct __sk_buff, vlan_tci)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+				    offsetof(struct __sk_buff, vlan_proto)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, priority)),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_6,
+				    offsetof(struct __sk_buff, priority)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct __sk_buff,
+					     ingress_ifindex)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_8, BPF_REG_1,
+				    offsetof(struct __sk_buff, tc_index)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_1,
+				    offsetof(struct __sk_buff, hash)),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+	},
+	{
+		"direct packet read test#3 for CGROUP_SKB",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[0])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[1])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[2])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[3])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_8, BPF_REG_1,
+				    offsetof(struct __sk_buff, cb[4])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_1,
+				    offsetof(struct __sk_buff, napi_id)),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_4,
+				    offsetof(struct __sk_buff, cb[0])),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_5,
+				    offsetof(struct __sk_buff, cb[1])),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_6,
+				    offsetof(struct __sk_buff, cb[2])),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_7,
+				    offsetof(struct __sk_buff, cb[3])),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_8,
+				    offsetof(struct __sk_buff, cb[4])),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+	},
+	{
+		"direct packet read test#4 for CGROUP_SKB",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+				    offsetof(struct __sk_buff, family)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+				    offsetof(struct __sk_buff, remote_ip4)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+				    offsetof(struct __sk_buff, local_ip4)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+				    offsetof(struct __sk_buff, remote_ip6[0])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+				    offsetof(struct __sk_buff, remote_ip6[1])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+				    offsetof(struct __sk_buff, remote_ip6[2])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+				    offsetof(struct __sk_buff, remote_ip6[3])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, local_ip6[0])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, local_ip6[1])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, local_ip6[2])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+				    offsetof(struct __sk_buff, local_ip6[3])),
+			BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+				    offsetof(struct __sk_buff, remote_port)),
+			BPF_LDX_MEM(BPF_W, BPF_REG_8, BPF_REG_1,
+				    offsetof(struct __sk_buff, local_port)),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+	},
+	{
+		"invalid access of tc_classid for CGROUP_SKB",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, tc_classid)),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid bpf_context access",
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+	},
+	{
+		"invalid access of data_meta for CGROUP_SKB",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, data_meta)),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid bpf_context access",
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+	},
+	{
+		"invalid access of flow_keys for CGROUP_SKB",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+				    offsetof(struct __sk_buff, flow_keys)),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid bpf_context access",
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+	},
+	{
+		"invalid write access to napi_id for CGROUP_SKB",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_1,
+				    offsetof(struct __sk_buff, napi_id)),
+			BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_9,
+				    offsetof(struct __sk_buff, napi_id)),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.result = REJECT,
+		.errstr = "invalid bpf_context access",
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+	},
 	{
 		"valid cgroup storage access",
 		.insns = {
@@ -5499,7 +5670,7 @@ static struct bpf_test tests[] = {
 		.errstr_unpriv = "R2 leaks addr into mem",
 		.result_unpriv = REJECT,
 		.result = REJECT,
-		.errstr = "BPF_XADD stores into R1 inv is not allowed",
+		.errstr = "BPF_XADD stores into R1 ctx is not allowed",
 	},
 	{
 		"leak pointer into ctx 2",
@@ -5514,7 +5685,7 @@ static struct bpf_test tests[] = {
 		.errstr_unpriv = "R10 leaks addr into mem",
 		.result_unpriv = REJECT,
 		.result = REJECT,
-		.errstr = "BPF_XADD stores into R1 inv is not allowed",
+		.errstr = "BPF_XADD stores into R1 ctx is not allowed",
 	},
 	{
 		"leak pointer into ctx 3",
@@ -12463,7 +12634,7 @@ static struct bpf_test tests[] = {
 			BPF_EXIT_INSN(),
 		},
 		.result = REJECT,
-		.errstr = "BPF_XADD stores into R2 ctx",
+		.errstr = "BPF_XADD stores into R2 pkt is not allowed",
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},
 	{
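
For reference, the access pattern the ACCEPT cases above encode as raw instructions corresponds to this restricted-C shape: direct packet loads in a CGROUP_SKB program are only valid behind a verifier-visible bounds check against data_end. A sketch (return-value policy is illustrative; 1 allows, 0 drops):

    #include <linux/bpf.h>
    #include "bpf_helpers.h"

    SEC("cgroup/skb")
    int cg_skb_pkt_read(struct __sk_buff *skb)
    {
    	void *data = (void *)(long)skb->data;
    	void *data_end = (void *)(long)skb->data_end;
    	unsigned char first = 0;

    	/* mirrors the BPF_JMP_REG(BPF_JGT, ...) guard in the asm test */
    	if (data + 8 <= data_end)
    		first = *(unsigned char *)data;

    	return first != 0xff;
    }
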
diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
index cabe2a3a3b300..4cdb63bf0521d 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -41,6 +41,7 @@ int load_kallsyms(void)
 		syms[i].name = strdup(func);
 		i++;
 	}
+	fclose(f);
 	sym_cnt = i;
 	qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
 	return 0;
@@ -124,10 +125,11 @@ struct perf_event_sample {
 	char data[];
 };
 
-static enum bpf_perf_event_ret bpf_perf_event_print(void *event, void *priv)
+static enum bpf_perf_event_ret
+bpf_perf_event_print(struct perf_event_header *hdr, void *private_data)
 {
-	struct perf_event_sample *e = event;
-	perf_event_print_fn fn = priv;
+	struct perf_event_sample *e = (struct perf_event_sample *)hdr;
+	perf_event_print_fn fn = private_data;
 	int ret;
 
 	if (e->header.type == PERF_RECORD_SAMPLE) {