Merge branch 'af_unix-rework-gc'
Kuniyuki Iwashima says:

====================
af_unix: Rework GC.

When we pass a file descriptor to an AF_UNIX socket via SCM_RIGHTS,
the underlying struct file of the inflight fd gets its refcount bumped.
If the fd itself refers to an AF_UNIX socket, we must track it in case
it forms cyclic references.

Let's say we send a fd of AF_UNIX socket A to B and vice versa, and
then close() both sockets.

When created, each socket's struct file initially has one reference.
After the fd exchange, both refcounts are bumped up to 2.  Then, close()
decreases both to 1.  From this point on, no one can touch the file/socket.

However, each struct file still holds one reference, so its refcount
never drops to zero, and the release() function of the AF_UNIX socket
is never called.

That's why we need to track all inflight AF_UNIX sockets and run garbage
collection.
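
As an illustration, the scenario above can be reproduced from userspace
with a few lines of C (a minimal sketch: error handling is omitted, and
the send_fd() helper is ours, not part of the series):

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

/* Pass fd over sock as SCM_RIGHTS ancillary data. */
static void send_fd(int sock, int fd)
{
	char data = 'x';
	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
	char cbuf[CMSG_SPACE(sizeof(int))] = { 0 };
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = cbuf,
		.msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
	sendmsg(sock, &msg, 0);
}

int main(void)
{
	int sk[2];	/* A = sk[0], B = sk[1] */

	socketpair(AF_UNIX, SOCK_DGRAM, 0, sk);
	send_fd(sk[0], sk[0]);	/* A's fd is now inflight in B's queue */
	send_fd(sk[1], sk[1]);	/* B's fd is now inflight in A's queue */
	close(sk[0]);		/* both refcounts drop from 2 to 1, */
	close(sk[1]);		/* and only the GC can break the cycle */
	return 0;
}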

This series replaces the current GC implementation that locks each inflight
socket's receive queue and requires trickiness in other places.

The new GC does not lock each socket's receive queue, minimising its
impact on other paths, and stays lightweight when there is no cyclic
reference or no change in the shape of the inflight fd graph.

The new implementation is based on Tarjan's Strongly Connected
Components algorithm: each inflight AF_UNIX socket is treated as a
vertex, and a fd passed from one socket to another as a directed edge
between them.
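
To make the idea concrete, here is a compact standalone sketch of
Tarjan's algorithm on a toy four-vertex graph (our own example, not
code from the series; the kernel operates on the unix_vertex and
unix_edge structures added below):

#include <stdio.h>

#define NV 4	/* toy graph: 0 <-> 1 form a cycle; 2 -> 3 do not */

static const int adj[NV][NV] = {
	{ 0, 1, 0, 0 },
	{ 1, 0, 0, 0 },
	{ 0, 0, 0, 1 },
	{ 0, 0, 0, 0 },
};

static int idx[NV], lowlink[NV], on_stack[NV];
static int stack[NV], sp;
static int next_index = 1;	/* 0 means "not visited yet" */

static void strongconnect(int v)
{
	int w;

	idx[v] = lowlink[v] = next_index++;
	stack[sp++] = v;
	on_stack[v] = 1;

	for (w = 0; w < NV; w++) {
		if (!adj[v][w])
			continue;
		if (!idx[w]) {		/* unvisited: follow the tree edge */
			strongconnect(w);
			if (lowlink[w] < lowlink[v])
				lowlink[v] = lowlink[w];
		} else if (on_stack[w] && idx[w] < lowlink[v]) {
			lowlink[v] = idx[w];	/* back edge into the stack */
		}
	}

	if (lowlink[v] == idx[v]) {	/* v is the root of an SCC */
		printf("SCC:");
		do {
			w = stack[--sp];
			on_stack[w] = 0;
			printf(" %d", w);
		} while (w != v);
		printf("\n");
	}
}

int main(void)
{
	int v;

	for (v = 0; v < NV; v++)
		if (!idx[v])
			strongconnect(v);
	return 0;
}

Vertices 0 and 1 come out as one SCC because they lie on a cycle, while
2 and 3 are singletons; only an SCC that forms a cycle can be a garbage
candidate.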

For the details, please see each patch.

  patch 1  -  3 : Add struct to express inflight socket graphs
  patch       4 : Optimise inflight fd counting
  patch 5  -  6 : Group SCC possibly forming a cycle
  patch 7  -  8 : Support embryo socket
  patch 9  - 11 : Make GC lightweight
  patch 12 - 13 : Detect dead cyclic references
  patch      14 : Replace GC algorithm
  patch      15 : selftest

After this series is applied, we can remove the two ugly tricks used to
avoid races, the scm_fp_dup() call in unix_attach_fds() and the
spin_lock dance in unix_peek_fds(), as done in patch 14/15 of v1.

Also, we will add cond_resched_lock() in __unix_gc() and convert it to
use a dedicated kthread instead of the global system workqueue, as
suggested by Paolo in the v4 thread.

v4: https://lore.kernel.org/netdev/20240301022243.73908-1-kuniyu@amazon.com/
v3: https://lore.kernel.org/netdev/20240223214003.17369-1-kuniyu@amazon.com/
v2: https://lore.kernel.org/netdev/20240216210556.65913-1-kuniyu@amazon.com/
v1: https://lore.kernel.org/netdev/20240203030058.60750-1-kuniyu@amazon.com/
====================

Link: https://lore.kernel.org/r/20240325202425.60930-1-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Jakub Kicinski committed Mar 29, 2024
2 parents 50e2907 + 2aa0cff commit da493db
Showing 8 changed files with 735 additions and 205 deletions.
31 changes: 23 additions & 8 deletions include/net/af_unix.h
@@ -19,12 +19,30 @@ static inline struct unix_sock *unix_get_socket(struct file *filp)

extern spinlock_t unix_gc_lock;
extern unsigned int unix_tot_inflight;

void unix_inflight(struct user_struct *user, struct file *fp);
void unix_notinflight(struct user_struct *user, struct file *fp);
void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
void unix_del_edges(struct scm_fp_list *fpl);
void unix_update_edges(struct unix_sock *receiver);
int unix_prepare_fpl(struct scm_fp_list *fpl);
void unix_destroy_fpl(struct scm_fp_list *fpl);
void unix_gc(void);
void wait_for_unix_gc(struct scm_fp_list *fpl);
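
/*
 * A gloss on the new entry points, not a comment from the patch itself:
 * unix_prepare_fpl() preallocates vertices and edges when AF_UNIX fds
 * are attached to an skb, unix_add_edges()/unix_del_edges() wire the
 * edges up and tear them down as that skb enters and leaves a receive
 * queue, unix_update_edges() re-points edges from the listener to the
 * child socket at accept() time, and unix_destroy_fpl() frees it all.
 */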

struct unix_vertex {
	struct list_head edges;
	struct list_head entry;
	struct list_head scc_entry;
	unsigned long out_degree;
	unsigned long index;
	unsigned long scc_index;
};

struct unix_edge {
	struct unix_sock *predecessor;
	struct unix_sock *successor;
	struct list_head vertex_entry;
	struct list_head stack_entry;
};
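
/*
 * A gloss, not from the patch: each inflight AF_UNIX socket owns one
 * unix_vertex, and each inflight fd becomes a unix_edge from the socket
 * being passed (predecessor) to its receiver (successor).  index and
 * scc_index carry Tarjan state; scc_entry links vertices that end up in
 * the same strongly connected component.
 */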

struct sock *unix_peer_get(struct sock *sk);

#define UNIX_HASH_MOD (256 - 1)
@@ -62,12 +80,9 @@ struct unix_sock {
	struct path path;
	struct mutex iolock, bindlock;
	struct sock *peer;
	struct list_head link;
	unsigned long inflight;
	struct sock *listener;
	struct unix_vertex *vertex;
	spinlock_t lock;
	unsigned long gc_flags;
#define UNIX_GC_CANDIDATE 0
#define UNIX_GC_MAYBE_CYCLE 1
	struct socket_wq peer_wq;
	wait_queue_entry_t peer_wake;
	struct scm_stat scm_stat;
9 changes: 9 additions & 0 deletions include/net/scm.h
@@ -23,10 +23,19 @@ struct scm_creds {
	kgid_t gid;
};

#ifdef CONFIG_UNIX
struct unix_edge;
#endif

struct scm_fp_list {
	short count;
	short count_unix;
	short max;
#ifdef CONFIG_UNIX
	bool inflight;
	struct list_head vertices;
	struct unix_edge *edges;
#endif
	struct user_struct *user;
	struct file *fp[SCM_MAX_FD];
};
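
/*
 * A gloss, not from the patch: inflight says whether these fds are
 * currently counted in the fd graph; vertices and edges hold the
 * preallocated unix_vertex list and unix_edge array for this list.
 */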
11 changes: 11 additions & 0 deletions net/core/scm.c
@@ -89,6 +89,11 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
		fpl->count_unix = 0;
		fpl->max = SCM_MAX_FD;
		fpl->user = NULL;
#if IS_ENABLED(CONFIG_UNIX)
		fpl->inflight = false;
		fpl->edges = NULL;
		INIT_LIST_HEAD(&fpl->vertices);
#endif
	}
	fpp = &fpl->fp[fpl->count];

@@ -376,8 +381,14 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
	if (new_fpl) {
		for (i = 0; i < fpl->count; i++)
			get_file(fpl->fp[i]);

		new_fpl->max = new_fpl->count;
		new_fpl->user = get_uid(fpl->user);
#if IS_ENABLED(CONFIG_UNIX)
		new_fpl->inflight = false;
		new_fpl->edges = NULL;
		INIT_LIST_HEAD(&new_fpl->vertices);
#endif
	}
	return new_fpl;
}
27 changes: 14 additions & 13 deletions net/unix/af_unix.c
@@ -979,11 +979,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
	sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen;
	sk->sk_destruct = unix_sock_destructor;
	u = unix_sk(sk);
	u->inflight = 0;
	u->listener = NULL;
	u->vertex = NULL;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
@@ -1597,6 +1597,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
	newsk->sk_type = sk->sk_type;
	init_peercred(newsk);
	newu = unix_sk(newsk);
	newu->listener = other;
	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
	otheru = unix_sk(other);

@@ -1692,8 +1693,8 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
		       bool kern)
{
	struct sock *sk = sock->sk;
	struct sock *tsk;
	struct sk_buff *skb;
	struct sock *tsk;
	int err;

	err = -EOPNOTSUPP;
@@ -1718,6 +1719,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
	}

	tsk = skb->sk;
	unix_update_edges(unix_sk(tsk));
	skb_free_datagram(sk, skb);
	wake_up_interruptible(&unix_sk(sk)->peer_wait);

@@ -1789,8 +1791,6 @@ static inline bool too_many_unix_fds(struct task_struct *p)

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

@@ -1802,21 +1802,18 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
	if (!UNIXCB(skb).fp)
		return -ENOMEM;

	for (i = scm->fp->count - 1; i >= 0; i--)
		unix_inflight(scm->fp->user, scm->fp->fp[i]);
	if (unix_prepare_fpl(UNIXCB(skb).fp))
		return -ENOMEM;

	return 0;
}

static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	for (i = scm->fp->count - 1; i >= 0; i--)
		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
	unix_destroy_fpl(scm->fp);
}

static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
@@ -1937,17 +1934,21 @@ static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count))
	if (unlikely(fp && fp->count)) {
		atomic_add(fp->count, &u->scm_stat.nr_fds);
		unix_add_edges(fp, u);
	}
}

static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
{
	struct scm_fp_list *fp = UNIXCB(skb).fp;
	struct unix_sock *u = unix_sk(sk);

	if (unlikely(fp && fp->count))
	if (unlikely(fp && fp->count)) {
		atomic_sub(fp->count, &u->scm_stat.nr_fds);
		unix_del_edges(fp);
	}
}
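
/*
 * A gloss, not from the patch: with the two hooks above, graph edges
 * mirror skb queueing.  unix_add_edges() runs when an skb carrying
 * AF_UNIX fds lands in a receive queue and unix_del_edges() when it
 * leaves, so the graph always matches the inflight state.
 */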

