Merge branch 'net-reduce-tcp_memory_allocated-inflation'
Eric Dumazet says:

====================
net: reduce tcp_memory_allocated inflation

Hosts with a lot of sockets tend to hit so-called TCP memory pressure,
leading to very bad TCP performance and/or OOM.

The problem is that some TCP sockets can hold up to 2MB of 'forward
allocations' in their per-socket cache (sk->sk_forward_alloc),
and there is no mechanism to make them relinquish their share
under memory pressure.
Their share is reclaimed only on certain, potentially rare, events,
one socket at a time.

In this series, I implemented a per-cpu cache instead of a per-socket one.

Each CPU has a +1/-1 MB (256 pages on x86) forward-alloc cache, so that
the shared tcp_memory_allocated cache line is not dirtied too often.

We keep sk->sk_forward_alloc values as small as possible, to meet
the memcg page-granularity constraint.

Note that memcg already has a per-cpu cache, although MEMCG_CHARGE_BATCH
is defined as 32 pages, which seems a bit small.

Note that while this cover letter mentions TCP, this work is generic
and covers TCP, UDP, DECNET, and SCTP.
====================

Link: https://lore.kernel.org/r/20220609063412.2205738-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Committed by Jakub Kicinski on Jun 10, 2022. Parents: 5c281b4 + 0f2c269. Commit: e10b02e.
Showing 23 changed files with 114 additions and 126 deletions.
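For readers skimming the diffs below, here is a minimal, single-threaded C model of the per-CPU batching idea from the cover letter. It is a sketch, not the kernel code: pcpu_reserve and memory_allocated stand in for the real per-CPU counter and the shared atomic_long_t, charge_pages()/uncharge_pages() are illustrative names, and preemption/SMP are ignored.

/*
 * Single-threaded model of the per-CPU forward-alloc batch.
 * Illustrative only; the kernel uses __this_cpu_add_return()
 * on a per-CPU int and an atomic_long_t shared counter.
 */
#include <stdio.h>

#define PAGE_SHIFT 12                                   /* 4KB pages, as on x86 */
#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT)) /* 1MB = 256 pages */

static long memory_allocated; /* stands in for the shared atomic */
static int pcpu_reserve;      /* stands in for this CPU's counter */

static void charge_pages(int amt)
{
    pcpu_reserve += amt;
    if (pcpu_reserve >= SK_MEMORY_PCPU_RESERVE) {
        /* flush the local batch to the shared counter */
        memory_allocated += pcpu_reserve;
        pcpu_reserve = 0;
    }
}

static void uncharge_pages(int amt)
{
    pcpu_reserve -= amt;
    if (pcpu_reserve <= -SK_MEMORY_PCPU_RESERVE) {
        /* negative batch: give pages back to the shared counter */
        memory_allocated += pcpu_reserve;
        pcpu_reserve = 0;
    }
}

int main(void)
{
    int i;

    for (i = 0; i < 300; i++)
        charge_pages(1);
    /* prints: allocated=256 reserve=44 */
    printf("allocated=%ld reserve=%d\n", memory_allocated, pcpu_reserve);

    uncharge_pages(300);
    /* prints: allocated=0 reserve=0 */
    printf("allocated=%ld reserve=%d\n", memory_allocated, pcpu_reserve);
    return 0;
}

Charging 300 pages one at a time writes the shared counter once (when the local batch reaches 256 pages) instead of 300 times; uncharging flushes the negative batch the same way.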
100 changes: 44 additions & 56 deletions include/net/sock.h
@@ -1254,6 +1254,7 @@ struct proto {
void (*enter_memory_pressure)(struct sock *sk);
void (*leave_memory_pressure)(struct sock *sk);
atomic_long_t *memory_allocated; /* Current allocated memory. */
int __percpu *per_cpu_fw_alloc;
struct percpu_counter *sockets_allocated; /* Current number of sockets. */

/*
@@ -1396,22 +1397,48 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
return !!*sk->sk_prot->memory_pressure;
}

static inline long
proto_memory_allocated(const struct proto *prot)
{
return max(0L, atomic_long_read(prot->memory_allocated));
}

static inline long
sk_memory_allocated(const struct sock *sk)
{
return atomic_long_read(sk->sk_prot->memory_allocated);
return proto_memory_allocated(sk->sk_prot);
}

/* 1 MB per cpu, in page units */
#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT))

static inline long
sk_memory_allocated_add(struct sock *sk, int amt)
{
return atomic_long_add_return(amt, sk->sk_prot->memory_allocated);
int local_reserve;

preempt_disable();
local_reserve = __this_cpu_add_return(*sk->sk_prot->per_cpu_fw_alloc, amt);
if (local_reserve >= SK_MEMORY_PCPU_RESERVE) {
__this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve);
atomic_long_add(local_reserve, sk->sk_prot->memory_allocated);
}
preempt_enable();
return sk_memory_allocated(sk);
}

static inline void
sk_memory_allocated_sub(struct sock *sk, int amt)
{
atomic_long_sub(amt, sk->sk_prot->memory_allocated);
int local_reserve;

preempt_disable();
local_reserve = __this_cpu_sub_return(*sk->sk_prot->per_cpu_fw_alloc, amt);
if (local_reserve <= -SK_MEMORY_PCPU_RESERVE) {
__this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve);
atomic_long_add(local_reserve, sk->sk_prot->memory_allocated);
}
preempt_enable();
}

#define SK_ALLOC_PERCPU_COUNTER_BATCH 16
@@ -1440,12 +1467,6 @@ proto_sockets_allocated_sum_positive(struct proto *prot)
return percpu_counter_sum_positive(prot->sockets_allocated);
}

static inline long
proto_memory_allocated(struct proto *prot)
{
return atomic_long_read(prot->memory_allocated);
}

static inline bool
proto_memory_pressure(struct proto *prot)
{
@@ -1532,30 +1553,18 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind);
void __sk_mem_reduce_allocated(struct sock *sk, int amount);
void __sk_mem_reclaim(struct sock *sk, int amount);

/* We used to have PAGE_SIZE here, but systems with 64KB pages
* do not necessarily have 16x time more memory than 4KB ones.
*/
#define SK_MEM_QUANTUM 4096
#define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
#define SK_MEM_SEND 0
#define SK_MEM_RECV 1

/* sysctl_mem values are in pages, we convert them in SK_MEM_QUANTUM units */
/* sysctl_mem values are in pages */
static inline long sk_prot_mem_limits(const struct sock *sk, int index)
{
long val = sk->sk_prot->sysctl_mem[index];

#if PAGE_SIZE > SK_MEM_QUANTUM
val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT;
#elif PAGE_SIZE < SK_MEM_QUANTUM
val >>= SK_MEM_QUANTUM_SHIFT - PAGE_SHIFT;
#endif
return val;
return sk->sk_prot->sysctl_mem[index];
}

static inline int sk_mem_pages(int amt)
{
return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT;
return (amt + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

static inline bool sk_has_account(struct sock *sk)
@@ -1566,19 +1575,23 @@ static inline bool sk_has_account(struct sock *sk)

static inline bool sk_wmem_schedule(struct sock *sk, int size)
{
int delta;

if (!sk_has_account(sk))
return true;
return size <= sk->sk_forward_alloc ||
__sk_mem_schedule(sk, size, SK_MEM_SEND);
delta = size - sk->sk_forward_alloc;
return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_SEND);
}

static inline bool
sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
{
int delta;

if (!sk_has_account(sk))
return true;
return size <= sk->sk_forward_alloc ||
__sk_mem_schedule(sk, size, SK_MEM_RECV) ||
delta = size - sk->sk_forward_alloc;
return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) ||
skb_pfmemalloc(skb);
}

@@ -1604,7 +1617,7 @@ static inline void sk_mem_reclaim(struct sock *sk)

reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);

if (reclaimable >= SK_MEM_QUANTUM)
if (reclaimable >= (int)PAGE_SIZE)
__sk_mem_reclaim(sk, reclaimable);
}

@@ -1614,49 +1627,24 @@ static inline void sk_mem_reclaim_final(struct sock *sk)
sk_mem_reclaim(sk);
}

static inline void sk_mem_reclaim_partial(struct sock *sk)
{
int reclaimable;

if (!sk_has_account(sk))
return;

reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);

if (reclaimable > SK_MEM_QUANTUM)
__sk_mem_reclaim(sk, reclaimable - 1);
}

static inline void sk_mem_charge(struct sock *sk, int size)
{
if (!sk_has_account(sk))
return;
sk->sk_forward_alloc -= size;
}

/* the following macros control memory reclaiming in sk_mem_uncharge()
/* the following macros control memory reclaiming in mptcp_rmem_uncharge()
*/
#define SK_RECLAIM_THRESHOLD (1 << 21)
#define SK_RECLAIM_CHUNK (1 << 20)

static inline void sk_mem_uncharge(struct sock *sk, int size)
{
int reclaimable;

if (!sk_has_account(sk))
return;
sk->sk_forward_alloc += size;
reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);

/* Avoid a possible overflow.
* TCP send queues can make this happen, if sk_mem_reclaim()
* is not called and more than 2 GBytes are released at once.
*
* If we reach 2 MBytes, reclaim 1 MBytes right now, there is
* no need to hold that much forward allocation anyway.
*/
if (unlikely(reclaimable >= SK_RECLAIM_THRESHOLD))
__sk_mem_reclaim(sk, SK_RECLAIM_CHUNK);
sk_mem_reclaim(sk);
}

/*
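Two details in the include/net/sock.h hunks above are easy to miss: sk_mem_pages() now rounds byte counts up to whole PAGE_SIZE units, and sk_wmem_schedule()/sk_rmem_schedule() charge only the shortfall (delta) beyond the existing sk->sk_forward_alloc credit rather than the full size. Below is a standalone sketch of that path (single socket, no memcg, limits never refuse; mem_schedule() and wmem_charge() are simplified stand-ins, not the kernel functions).

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE  4096
#define PAGE_SHIFT 12

static int sk_forward_alloc; /* per-socket byte credit */
static long pages_charged;   /* pages drawn from the global pool */

/* round a byte count up to whole pages, as the new sk_mem_pages() does */
static int sk_mem_pages(int amt)
{
    return (amt + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

/* models __sk_mem_schedule(): charge whole pages, credit the bytes */
static bool mem_schedule(int size)
{
    int amt = sk_mem_pages(size);

    pages_charged += amt;
    sk_forward_alloc += amt << PAGE_SHIFT;
    return true; /* assume the limits always allow it */
}

/* models the new sk_wmem_schedule() plus the sk_mem_charge() step */
static void wmem_charge(int size)
{
    int delta = size - sk_forward_alloc;

    if (delta > 0)
        mem_schedule(delta); /* the old code scheduled all of `size` */
    sk_forward_alloc -= size;
}

int main(void)
{
    wmem_charge(1500); /* 1 page charged, 2596 bytes of credit left */
    wmem_charge(5000); /* shortfall 2404 -> 1 page; old code: 2 pages */
    /* prints: pages=2 forward_alloc=1692 */
    printf("pages=%ld forward_alloc=%d\n", pages_charged, sk_forward_alloc);
    return 0;
}

Because only the shortfall is scheduled, sk_forward_alloc stays below one page after any charge that had to hit the pool, which is what keeps per-socket hoarding small.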
2 changes: 2 additions & 0 deletions include/net/tcp.h
@@ -253,6 +253,8 @@ extern long sysctl_tcp_mem[3];
#define TCP_RACK_NO_DUPTHRESH 0x4 /* Do not use DUPACK threshold in RACK */

extern atomic_long_t tcp_memory_allocated;
DECLARE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);

extern struct percpu_counter tcp_sockets_allocated;
extern unsigned long tcp_memory_pressure;

1 change: 1 addition & 0 deletions include/net/udp.h
@@ -95,6 +95,7 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
extern struct proto udp_prot;

extern atomic_long_t udp_memory_allocated;
DECLARE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);

/* sysctl variables for udp */
extern long sysctl_udp_mem[3];
3 changes: 0 additions & 3 deletions net/core/datagram.c
@@ -320,7 +320,6 @@ EXPORT_SYMBOL(skb_recv_datagram);
void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
consume_skb(skb);
sk_mem_reclaim_partial(sk);
}
EXPORT_SYMBOL(skb_free_datagram);

@@ -336,7 +335,6 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
slow = lock_sock_fast(sk);
sk_peek_offset_bwd(sk, len);
skb_orphan(skb);
sk_mem_reclaim_partial(sk);
unlock_sock_fast(sk, slow);

/* skb is now orphaned, can be freed outside of locked section */
@@ -396,7 +394,6 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
NULL);

kfree_skb(skb);
sk_mem_reclaim_partial(sk);
return err;
}
EXPORT_SYMBOL(skb_kill_datagram);
22 changes: 12 additions & 10 deletions net/core/sock.c
@@ -991,7 +991,7 @@ EXPORT_SYMBOL(sock_set_mark);
static void sock_release_reserved_memory(struct sock *sk, int bytes)
{
/* Round down bytes to multiple of pages */
bytes &= ~(SK_MEM_QUANTUM - 1);
bytes = round_down(bytes, PAGE_SIZE);

WARN_ON(bytes > sk->sk_reserved_mem);
sk->sk_reserved_mem -= bytes;
@@ -1028,9 +1028,9 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
return -ENOMEM;
}
sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT;
sk->sk_forward_alloc += pages << PAGE_SHIFT;

sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT;
sk->sk_reserved_mem += pages << PAGE_SHIFT;

return 0;
}
@@ -2987,7 +2987,6 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)

return 0;
}
EXPORT_SYMBOL(__sk_mem_raise_allocated);

/**
* __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
@@ -3003,10 +3002,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
int ret, amt = sk_mem_pages(size);

sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
sk->sk_forward_alloc += amt << PAGE_SHIFT;
ret = __sk_mem_raise_allocated(sk, size, amt, kind);
if (!ret)
sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
sk->sk_forward_alloc -= amt << PAGE_SHIFT;
return ret;
}
EXPORT_SYMBOL(__sk_mem_schedule);
@@ -3029,17 +3028,16 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount)
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reduce_allocated);

/**
* __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
* @sk: socket
* @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
* @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
*/
void __sk_mem_reclaim(struct sock *sk, int amount)
{
amount >>= SK_MEM_QUANTUM_SHIFT;
sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
amount >>= PAGE_SHIFT;
sk->sk_forward_alloc -= amount << PAGE_SHIFT;
__sk_mem_reduce_allocated(sk, amount);
}
EXPORT_SYMBOL(__sk_mem_reclaim);
@@ -3798,6 +3796,10 @@ int proto_register(struct proto *prot, int alloc_slab)
pr_err("%s: missing sysctl_mem\n", prot->name);
return -EINVAL;
}
if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
return -EINVAL;
}
if (alloc_slab) {
prot->slab = kmem_cache_create_usercopy(prot->name,
prot->obj_size, 0,
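The proto_register() hunk above makes per_cpu_fw_alloc mandatory for any protocol that supplies memory_allocated. A toy, self-contained model of just that sanity check (struct proto is trimmed to the two fields involved; the names mirror the kernel's but the code is illustrative):

/*
 * Toy model of the new proto_register() check: a protocol that
 * supplies memory_allocated must also supply per_cpu_fw_alloc.
 */
#include <errno.h>
#include <stdio.h>

struct proto {
    const char *name;
    long *memory_allocated;
    int *per_cpu_fw_alloc;
};

static int proto_register(struct proto *prot)
{
    if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
        fprintf(stderr, "%s: missing per_cpu_fw_alloc\n", prot->name);
        return -EINVAL;
    }
    return 0;
}

int main(void)
{
    static long mem;
    struct proto bad = { .name = "demo", .memory_allocated = &mem };

    /* prints -22 (-EINVAL): registration is refused */
    printf("register -> %d\n", proto_register(&bad));
    return 0;
}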
4 changes: 4 additions & 0 deletions net/decnet/af_decnet.c
@@ -149,6 +149,7 @@ static DEFINE_RWLOCK(dn_hash_lock);
static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE];
static struct hlist_head dn_wild_sk;
static atomic_long_t decnet_memory_allocated;
static DEFINE_PER_CPU(int, decnet_memory_per_cpu_fw_alloc);

static int __dn_setsockopt(struct socket *sock, int level, int optname,
sockptr_t optval, unsigned int optlen, int flags);
@@ -454,7 +455,10 @@ static struct proto dn_proto = {
.owner = THIS_MODULE,
.enter_memory_pressure = dn_enter_memory_pressure,
.memory_pressure = &dn_memory_pressure,

.memory_allocated = &decnet_memory_allocated,
.per_cpu_fw_alloc = &decnet_memory_per_cpu_fw_alloc,

.sysctl_mem = sysctl_decnet_mem,
.sysctl_wmem = sysctl_decnet_wmem,
.sysctl_rmem = sysctl_decnet_rmem,
13 changes: 4 additions & 9 deletions net/ipv4/tcp.c
@@ -294,6 +294,8 @@ EXPORT_SYMBOL(sysctl_tcp_mem);

atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; /* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);
DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc);

#if IS_ENABLED(CONFIG_SMC)
DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
@@ -856,9 +858,6 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
{
struct sk_buff *skb;

if (unlikely(tcp_under_memory_pressure(sk)))
sk_mem_reclaim_partial(sk);

skb = alloc_skb_fclone(size + MAX_TCP_HEADER, gfp);
if (likely(skb)) {
bool mem_scheduled;
@@ -2762,8 +2761,6 @@ void __tcp_close(struct sock *sk, long timeout)
__kfree_skb(skb);
}

sk_mem_reclaim(sk);

/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
if (sk->sk_state == TCP_CLOSE)
goto adjudge_to_death;
@@ -2871,7 +2868,6 @@ void __tcp_close(struct sock *sk, long timeout)
}
}
if (sk->sk_state != TCP_CLOSE) {
sk_mem_reclaim(sk);
if (tcp_check_oom(sk, 0)) {
tcp_set_state(sk, TCP_CLOSE);
tcp_send_active_reset(sk, GFP_ATOMIC);
@@ -2949,7 +2945,6 @@ void tcp_write_queue_purge(struct sock *sk)
}
tcp_rtx_queue_purge(sk);
INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
sk_mem_reclaim(sk);
tcp_clear_all_retrans_hints(tcp_sk(sk));
tcp_sk(sk)->packets_out = 0;
inet_csk(sk)->icsk_backoff = 0;
@@ -4661,11 +4656,11 @@ void __init tcp_init(void)
max_wshare = min(4UL*1024*1024, limit);
max_rshare = min(6UL*1024*1024, limit);

init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);

init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
init_net.ipv4.sysctl_tcp_rmem[0] = PAGE_SIZE;
init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);

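One side effect of dropping SK_MEM_QUANTUM in tcp_init() above: the tcp_wmem[0]/tcp_rmem[0] floors now track PAGE_SIZE instead of a fixed 4096 bytes. A trivial standalone check (the page sizes below are common configurations, assumed here rather than taken from this diff):

/*
 * tcp_{w,r}mem[0] now defaults to PAGE_SIZE rather than the fixed
 * 4KB SK_MEM_QUANTUM, so the floor scales with the page size.
 */
#include <stdio.h>

int main(void)
{
    int page_sizes[] = { 4096, 16384, 65536 };
    int i;

    for (i = 0; i < 3; i++)
        printf("PAGE_SIZE=%-6d tcp_wmem[0]: old=4096 new=%d (%dx)\n",
               page_sizes[i], page_sizes[i], page_sizes[i] / 4096);
    return 0;
}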
[diff truncated: the remaining changed files are not shown]
