Skip to content

Commit

Permalink
Merge tag 'vfs-6.12.file' of git://git.kernel.org/pub/scm/linux/kerne…
Browse files Browse the repository at this point in the history
…l/git/vfs/vfs

Pull vfs file updates from Christian Brauner:
 "This is the work to cleanup and shrink struct file significantly.

  Right now, (focusing on x86) struct file is 232 bytes. After this
  series struct file will be 184 bytes aka 3 cacheline and a spare 8
  bytes for future extensions at the end of the struct.

  With struct file being as ubiquitous as it is this should make a
  difference for file heavy workloads and allow further optimizations in
  the future.

   - struct fown_struct was embedded into struct file letting it take up
     32 bytes in total when really it shouldn't even be embedded in
     struct file in the first place. Instead, actual users of struct
     fown_struct now allocate the struct on demand. This frees up 24
     bytes.

   - Move struct file_ra_state into the union containg the cleanup hooks
     and move f_iocb_flags out of the union. This closes a 4 byte hole
     we created earlier and brings struct file to 192 bytes. Which means
     struct file is 3 cachelines and we managed to shrink it by 40
     bytes.

   - Reorder struct file so that nothing crosses a cacheline.

     I suspect that in the future we will end up reordering some members
     to mitigate false sharing issues or just because someone does
     actually provide really good perf data.

   - Shrinking struct file to 192 bytes is only part of the work.

     Files use a slab that is SLAB_TYPESAFE_BY_RCU and when a kmem cache
     is created with SLAB_TYPESAFE_BY_RCU the free pointer must be
     located outside of the object because the cache doesn't know what
     part of the memory can safely be overwritten as it may be needed to
     prevent object recycling.

     That has the consequence that SLAB_TYPESAFE_BY_RCU may end up
     adding a new cacheline.

     So this also contains work to add a new kmem_cache_create_rcu()
     function that allows the caller to specify an offset where the
     freelist pointer is supposed to be placed. Thus avoiding the
     implicit addition of a fourth cacheline.

   - And finally this removes the f_version member in struct file.

     The f_version member isn't particularly well-defined. It is mainly
     used as a cookie to detect concurrent seeks when iterating
     directories. But it is also abused by some subsystems for
     completely unrelated things.

     It is mostly a directory and filesystem specific thing that doesn't
     really need to live in struct file and with its wonky semantics it
     really lacks a specific function.

     For pipes, f_version is (ab)used to defer poll notifications until
     a write has happened. And struct pipe_inode_info is used by
     multiple struct files in their ->private_data so there's no chance
     of pushing that down into file->private_data without introducing
     another pointer indirection.

     But pipes don't rely on f_pos_lock so this adds a union into struct
     file encompassing f_pos_lock and a pipe specific f_pipe member that
     pipes can use. This union of course can be extended to other file
     types and is similar to what we do in struct inode already"

* tag 'vfs-6.12.file' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (26 commits)
  fs: remove f_version
  pipe: use f_pipe
  fs: add f_pipe
  ubifs: store cookie in private data
  ufs: store cookie in private data
  udf: store cookie in private data
  proc: store cookie in private data
  ocfs2: store cookie in private data
  input: remove f_version abuse
  ext4: store cookie in private data
  ext2: store cookie in private data
  affs: store cookie in private data
  fs: add generic_llseek_cookie()
  fs: use must_set_pos()
  fs: add must_set_pos()
  fs: add vfs_setpos_cookie()
  s390: remove unused f_version
  ceph: remove unused f_version
  adi: remove unused f_version
  mm: Removed @freeptr_offset to prevent doc warning
  ...
  • Loading branch information
Linus Torvalds committed Sep 16, 2024
2 parents 2775df6 + 24a988f commit 3352633
Show file tree
Hide file tree
Showing 33 changed files with 744 additions and 271 deletions.
1 change: 0 additions & 1 deletion drivers/char/adi.c
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,6 @@ static loff_t adi_llseek(struct file *file, loff_t offset, int whence)

if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
ret = offset;
}

Expand Down
47 changes: 22 additions & 25 deletions drivers/input/input.c
Original file line number Diff line number Diff line change
Expand Up @@ -1079,33 +1079,31 @@ static inline void input_wakeup_procfs_readers(void)
wake_up(&input_devices_poll_wait);
}

struct input_seq_state {
unsigned short pos;
bool mutex_acquired;
int input_devices_state;
};

static __poll_t input_proc_devices_poll(struct file *file, poll_table *wait)
{
struct seq_file *seq = file->private_data;
struct input_seq_state *state = seq->private;

poll_wait(file, &input_devices_poll_wait, wait);
if (file->f_version != input_devices_state) {
file->f_version = input_devices_state;
if (state->input_devices_state != input_devices_state) {
state->input_devices_state = input_devices_state;
return EPOLLIN | EPOLLRDNORM;
}

return 0;
}

union input_seq_state {
struct {
unsigned short pos;
bool mutex_acquired;
};
void *p;
};

static void *input_devices_seq_start(struct seq_file *seq, loff_t *pos)
{
union input_seq_state *state = (union input_seq_state *)&seq->private;
struct input_seq_state *state = seq->private;
int error;

/* We need to fit into seq->private pointer */
BUILD_BUG_ON(sizeof(union input_seq_state) != sizeof(seq->private));

error = mutex_lock_interruptible(&input_mutex);
if (error) {
state->mutex_acquired = false;
Expand All @@ -1124,7 +1122,7 @@ static void *input_devices_seq_next(struct seq_file *seq, void *v, loff_t *pos)

static void input_seq_stop(struct seq_file *seq, void *v)
{
union input_seq_state *state = (union input_seq_state *)&seq->private;
struct input_seq_state *state = seq->private;

if (state->mutex_acquired)
mutex_unlock(&input_mutex);
Expand Down Expand Up @@ -1210,25 +1208,23 @@ static const struct seq_operations input_devices_seq_ops = {

static int input_proc_devices_open(struct inode *inode, struct file *file)
{
return seq_open(file, &input_devices_seq_ops);
return seq_open_private(file, &input_devices_seq_ops,
sizeof(struct input_seq_state));
}

static const struct proc_ops input_devices_proc_ops = {
.proc_open = input_proc_devices_open,
.proc_poll = input_proc_devices_poll,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
.proc_release = seq_release,
.proc_release = seq_release_private,
};

static void *input_handlers_seq_start(struct seq_file *seq, loff_t *pos)
{
union input_seq_state *state = (union input_seq_state *)&seq->private;
struct input_seq_state *state = seq->private;
int error;

/* We need to fit into seq->private pointer */
BUILD_BUG_ON(sizeof(union input_seq_state) != sizeof(seq->private));

error = mutex_lock_interruptible(&input_mutex);
if (error) {
state->mutex_acquired = false;
Expand All @@ -1243,7 +1239,7 @@ static void *input_handlers_seq_start(struct seq_file *seq, loff_t *pos)

static void *input_handlers_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
union input_seq_state *state = (union input_seq_state *)&seq->private;
struct input_seq_state *state = seq->private;

state->pos = *pos + 1;
return seq_list_next(v, &input_handler_list, pos);
Expand All @@ -1252,7 +1248,7 @@ static void *input_handlers_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static int input_handlers_seq_show(struct seq_file *seq, void *v)
{
struct input_handler *handler = container_of(v, struct input_handler, node);
union input_seq_state *state = (union input_seq_state *)&seq->private;
struct input_seq_state *state = seq->private;

seq_printf(seq, "N: Number=%u Name=%s", state->pos, handler->name);
if (handler->filter)
Expand All @@ -1273,14 +1269,15 @@ static const struct seq_operations input_handlers_seq_ops = {

static int input_proc_handlers_open(struct inode *inode, struct file *file)
{
return seq_open(file, &input_handlers_seq_ops);
return seq_open_private(file, &input_handlers_seq_ops,
sizeof(struct input_seq_state));
}

static const struct proc_ops input_handlers_proc_ops = {
.proc_open = input_proc_handlers_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
.proc_release = seq_release,
.proc_release = seq_release_private,
};

static int __init input_proc_init(void)
Expand Down
6 changes: 6 additions & 0 deletions drivers/net/tun.c
Original file line number Diff line number Diff line change
Expand Up @@ -3452,6 +3452,12 @@ static int tun_chr_fasync(int fd, struct file *file, int on)
struct tun_file *tfile = file->private_data;
int ret;

if (on) {
ret = file_f_owner_allocate(file);
if (ret)
goto out;
}

if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0)
goto out;

Expand Down
3 changes: 0 additions & 3 deletions drivers/s390/char/hmcdrv_dev.c
Original file line number Diff line number Diff line change
Expand Up @@ -186,9 +186,6 @@ static loff_t hmcdrv_dev_seek(struct file *fp, loff_t pos, int whence)
if (pos < 0)
return -EINVAL;

if (fp->f_pos != pos)
++fp->f_version;

fp->f_pos = pos;
return pos;
}
Expand Down
6 changes: 6 additions & 0 deletions drivers/tty/tty_io.c
Original file line number Diff line number Diff line change
Expand Up @@ -2225,6 +2225,12 @@ static int __tty_fasync(int fd, struct file *filp, int on)
if (tty_paranoia_check(tty, file_inode(filp), "tty_fasync"))
goto out;

if (on) {
retval = file_f_owner_allocate(filp);
if (retval)
goto out;
}

retval = fasync_helper(fd, filp, on, &tty->fasync);
if (retval <= 0)
goto out;
Expand Down
44 changes: 38 additions & 6 deletions fs/affs/dir.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,44 @@
#include <linux/iversion.h>
#include "affs.h"

struct affs_dir_data {
unsigned long ino;
u64 cookie;
};

static int affs_readdir(struct file *, struct dir_context *);

static loff_t affs_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct affs_dir_data *data = file->private_data;

return generic_llseek_cookie(file, offset, whence, &data->cookie);
}

static int affs_dir_open(struct inode *inode, struct file *file)
{
struct affs_dir_data *data;

data = kzalloc(sizeof(struct affs_dir_data), GFP_KERNEL);
if (!data)
return -ENOMEM;
file->private_data = data;
return 0;
}

static int affs_dir_release(struct inode *inode, struct file *file)
{
kfree(file->private_data);
return 0;
}

const struct file_operations affs_dir_operations = {
.open = affs_dir_open,
.read = generic_read_dir,
.llseek = generic_file_llseek,
.llseek = affs_dir_llseek,
.iterate_shared = affs_readdir,
.fsync = affs_file_fsync,
.release = affs_dir_release,
};

/*
Expand All @@ -45,6 +76,7 @@ static int
affs_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
struct affs_dir_data *data = file->private_data;
struct super_block *sb = inode->i_sb;
struct buffer_head *dir_bh = NULL;
struct buffer_head *fh_bh = NULL;
Expand All @@ -59,7 +91,7 @@ affs_readdir(struct file *file, struct dir_context *ctx)
pr_debug("%s(ino=%lu,f_pos=%llx)\n", __func__, inode->i_ino, ctx->pos);

if (ctx->pos < 2) {
file->private_data = (void *)0;
data->ino = 0;
if (!dir_emit_dots(file, ctx))
return 0;
}
Expand All @@ -80,8 +112,8 @@ affs_readdir(struct file *file, struct dir_context *ctx)
/* If the directory hasn't changed since the last call to readdir(),
* we can jump directly to where we left off.
*/
ino = (u32)(long)file->private_data;
if (ino && inode_eq_iversion(inode, file->f_version)) {
ino = data->ino;
if (ino && inode_eq_iversion(inode, data->cookie)) {
pr_debug("readdir() left off=%d\n", ino);
goto inside;
}
Expand Down Expand Up @@ -131,8 +163,8 @@ affs_readdir(struct file *file, struct dir_context *ctx)
} while (ino);
}
done:
file->f_version = inode_query_iversion(inode);
file->private_data = (void *)(long)ino;
data->cookie = inode_query_iversion(inode);
data->ino = ino;
affs_brelse(fh_bh);

out_brelse_dir:
Expand Down
1 change: 0 additions & 1 deletion fs/ceph/dir.c
Original file line number Diff line number Diff line change
Expand Up @@ -707,7 +707,6 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)

if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
dfi->file_info.flags &= ~CEPH_F_ATEND;
}
retval = offset;
Expand Down
28 changes: 25 additions & 3 deletions fs/ext2/dir.c
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
unsigned long n = pos >> PAGE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
bool need_revalidate = !inode_eq_iversion(inode, file->f_version);
bool need_revalidate = !inode_eq_iversion(inode, *(u64 *)file->private_data);
bool has_filetype;

if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
Expand All @@ -290,7 +290,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
offset = ext2_validate_entry(kaddr, offset, chunk_mask);
ctx->pos = (n<<PAGE_SHIFT) + offset;
}
file->f_version = inode_query_iversion(inode);
*(u64 *)file->private_data = inode_query_iversion(inode);
need_revalidate = false;
}
de = (ext2_dirent *)(kaddr+offset);
Expand Down Expand Up @@ -703,8 +703,30 @@ int ext2_empty_dir(struct inode *inode)
return 0;
}

static int ext2_dir_open(struct inode *inode, struct file *file)
{
file->private_data = kzalloc(sizeof(u64), GFP_KERNEL);
if (!file->private_data)
return -ENOMEM;
return 0;
}

static int ext2_dir_release(struct inode *inode, struct file *file)
{
kfree(file->private_data);
return 0;
}

static loff_t ext2_dir_llseek(struct file *file, loff_t offset, int whence)
{
return generic_llseek_cookie(file, offset, whence,
(u64 *)file->private_data);
}

const struct file_operations ext2_dir_operations = {
.llseek = generic_file_llseek,
.open = ext2_dir_open,
.release = ext2_dir_release,
.llseek = ext2_dir_llseek,
.read = generic_read_dir,
.iterate_shared = ext2_readdir,
.unlocked_ioctl = ext2_ioctl,
Expand Down
Loading

0 comments on commit 3352633

Please sign in to comment.