Skip to content

Commit

Permalink
Merge branch 'udp-pernetns-hash'
Browse files Browse the repository at this point in the history
Kuniyuki Iwashima says:

====================
udp: Introduce optional per-netns hash table.

This series is the UDP version of the per-netns ehash series [0];
the two were initially part of the same patch set. [1]

The notable difference with TCP is the max table size is 64K and the min
size is 128.  This is because the possible hash range by udp_hashfn()
always fits in 64K within the same netns and because we want to keep a
bitmap in udp_lib_get_port() on the stack.  Also, the UDP per-netns table
isolates both 1-tuple and 2-tuple tables.

For details, please see the last patch.

  patch 1 - 4: prep for per-netns hash table
  patch     5: add per-netns hash table

[0]: https://lore.kernel.org/netdev/20220908011022.45342-1-kuniyu@amazon.com/
[1]: https://lore.kernel.org/netdev/20220826000445.46552-1-kuniyu@amazon.com/
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
David S. Miller committed Nov 16, 2022
2 parents e882256 + 9804985 commit fd258f2
Show file tree
Hide file tree
Showing 10 changed files with 261 additions and 58 deletions.
27 changes: 27 additions & 0 deletions Documentation/networking/ip-sysctl.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1177,6 +1177,33 @@ udp_rmem_min - INTEGER
udp_wmem_min - INTEGER
UDP does not have tx memory accounting and this tunable has no effect.

udp_hash_entries - INTEGER
Show the number of hash buckets for UDP sockets in the current
networking namespace.

A negative value means the networking namespace does not own its
hash buckets and shares those of the initial networking namespace.

udp_child_hash_entries - INTEGER
Control the number of hash buckets for UDP sockets in the child
networking namespace, which must be set before clone() or unshare().

If the value is not 0, the kernel uses a value rounded up to 2^n
as the actual hash bucket size. 0 is a special value, meaning
the child networking namespace will share the initial networking
namespace's hash buckets.

Note that the child will use the global one in case the kernel
fails to allocate enough memory. In addition, the global hash
buckets are spread over available NUMA nodes, but the allocation
of the child hash table depends on the current process's NUMA
policy, which could result in performance differences.

Possible values: 0, 2^n (n: 7 (128) - 16 (64K))

Default: 0


RAW variables
=============

Expand Down
2 changes: 2 additions & 0 deletions include/linux/udp.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ static inline struct udphdr *udp_hdr(const struct sk_buff *skb)
return (struct udphdr *)skb_transport_header(skb);
}

/* Smallest bucket count for a per-netns UDP hash table. */
#define UDP_HTABLE_SIZE_MIN_PERNET 128
/* Lower bound on the UDP hash table size (presumably the global table);
 * the smaller value is selected when CONFIG_BASE_SMALL is set.
 */
#define UDP_HTABLE_SIZE_MIN (CONFIG_BASE_SMALL ? 128 : 256)
/* Upper bound on the UDP hash table size: 64K buckets. */
#define UDP_HTABLE_SIZE_MAX 65536

static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
{
Expand Down
3 changes: 3 additions & 0 deletions include/net/netns/ipv4.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ struct tcp_fastopen_context;

struct netns_ipv4 {
struct inet_timewait_death_row tcp_death_row;
struct udp_table *udp_table;

#ifdef CONFIG_SYSCTL
struct ctl_table_header *forw_hdr;
Expand Down Expand Up @@ -207,6 +208,8 @@ struct netns_ipv4 {

atomic_t dev_addr_genid;

unsigned int sysctl_udp_child_hash_entries;

#ifdef CONFIG_SYSCTL
unsigned long *sysctl_local_reserved_ports;
int sysctl_ip_prot_sock;
Expand Down
4 changes: 2 additions & 2 deletions net/core/filter.c
Original file line number Diff line number Diff line change
Expand Up @@ -6432,7 +6432,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
else
sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
dst4, tuple->ipv4.dport,
dif, sdif, &udp_table, NULL);
dif, sdif, net->ipv4.udp_table, NULL);
#if IS_ENABLED(CONFIG_IPV6)
} else {
struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
Expand All @@ -6448,7 +6448,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
src6, tuple->ipv6.sport,
dst6, tuple->ipv6.dport,
dif, sdif,
&udp_table, NULL);
net->ipv4.udp_table, NULL);
#endif
}

Expand Down
40 changes: 40 additions & 0 deletions net/ipv4/sysctl_net_ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ static int one_day_secs = 24 * 3600;
static u32 fib_multipath_hash_fields_all_mask __maybe_unused =
FIB_MULTIPATH_HASH_FIELD_ALL_MASK;
static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
static int tcp_plb_max_rounds = 31;
static int tcp_plb_max_cong_thresh = 256;

Expand Down Expand Up @@ -402,12 +403,36 @@ static int proc_tcp_ehash_entries(struct ctl_table *table, int write,
if (!net_eq(net, &init_net) && !hinfo->pernet)
tcp_ehash_entries *= -1;

memset(&tbl, 0, sizeof(tbl));
tbl.data = &tcp_ehash_entries;
tbl.maxlen = sizeof(int);

return proc_dointvec(&tbl, write, buffer, lenp, ppos);
}

/*
 * proc handler backing the "udp_hash_entries" sysctl.
 *
 * Recovers the owning netns from @table->data, which points at that
 * netns' ipv4.sysctl_udp_child_hash_entries field, and reports the
 * bucket count (mask + 1) of the UDP hash table the netns is using.
 * The count is negated when a netns other than init_net is using the
 * global udp_table instead of a table of its own.
 *
 * The value is exposed through a temporary ctl_table that points at a
 * local int; the return value of proc_dointvec() is passed through.
 */
static int proc_udp_hash_entries(struct ctl_table *table, int write,
				 void *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net = container_of(table->data, struct net,
				       ipv4.sysctl_udp_child_hash_entries);
	struct ctl_table tmp;
	int entries;

	/* The mask is (size - 1), so the bucket count is one more. */
	entries = net->ipv4.udp_table->mask + 1;

	/* Flip the sign for a child netns that shares the global table. */
	if (net->ipv4.udp_table == &udp_table && !net_eq(net, &init_net))
		entries = -entries;

	/* Hand proc_dointvec() a scratch table describing the local value. */
	memset(&tmp, 0, sizeof(tmp));
	tmp.maxlen = sizeof(int);
	tmp.data = &entries;

	return proc_dointvec(&tmp, write, buffer, lenp, ppos);
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
void *buffer, size_t *lenp,
Expand Down Expand Up @@ -1361,6 +1386,21 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = &tcp_child_ehash_entries_max,
},
{
.procname = "udp_hash_entries",
.data = &init_net.ipv4.sysctl_udp_child_hash_entries,
.mode = 0444,
.proc_handler = proc_udp_hash_entries,
},
{
.procname = "udp_child_hash_entries",
.data = &init_net.ipv4.sysctl_udp_child_hash_entries,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_douintvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = &udp_child_hash_entries_max,
},
{
.procname = "udp_rmem_min",
.data = &init_net.ipv4.sysctl_udp_rmem_min,
Expand Down
Loading

0 comments on commit fd258f2

Please sign in to comment.