Skip to content

Commit

Permalink
Merge tag 'ceph-for-5.18-rc1' of https://github.com/ceph/ceph-client
Browse files Browse the repository at this point in the history
Pull ceph updates from Ilya Dryomov:
 "The highlights are:

   - several changes to how snap context and snap realms are tracked
     (Xiubo Li). In particular, this should resolve a long-standing
     issue of high kworker CPU usage and various stalls caused by
     needless iteration over all inodes in the snap realm.

   - async create fixes to address hangs in some edge cases (Jeff
     Layton)

   - support for getvxattr MDS op for querying server-side xattrs, such
     as file/directory layouts and ephemeral pins (Milind Changire)

   - average latency is now maintained for all metrics (Venky Shankar)

   - some tweaks around handling inline data to make it fit better with
     netfs helper library (David Howells)

  Also a couple of memory leaks got plugged along with a few assorted
  fixups. Last but not least, Xiubo has stepped up to serve as a CephFS
  co-maintainer"

* tag 'ceph-for-5.18-rc1' of https://github.com/ceph/ceph-client: (27 commits)
  ceph: fix memory leak in ceph_readdir when note_last_dentry returns error
  ceph: uninitialized variable in debug output
  ceph: use tracked average r/w/m latencies to display metrics in debugfs
  ceph: include average/stdev r/w/m latency in mds metrics
  ceph: track average r/w/m latency
  ceph: use ktime_to_timespec64() rather than jiffies_to_timespec64()
  ceph: assign the ci only when the inode isn't NULL
  ceph: fix inode reference leakage in ceph_get_snapdir()
  ceph: misc fix for code style and logs
  ceph: allocate capsnap memory outside of ceph_queue_cap_snap()
  ceph: do not release the global snaprealm until unmounting
  ceph: remove incorrect and unused CEPH_INO_DOTDOT macro
  MAINTAINERS: add Xiubo Li as cephfs co-maintainer
  ceph: eliminate the recursion when rebuilding the snap context
  ceph: do not update snapshot context when there is no new snapshot
  ceph: zero the dir_entries memory when allocating it
  ceph: move to a dedicated slabcache for ceph_cap_snap
  ceph: add getvxattr op
  libceph: drop else branches in prepare_read_data{,_cont}
  ceph: fix comments mentioning i_mutex
  ...
  • Loading branch information
Linus Torvalds committed Mar 25, 2022
2 parents b1b07ba + f639d98 commit 85c7000
Show file tree
Hide file tree
Showing 20 changed files with 577 additions and 376 deletions.
2 changes: 2 additions & 0 deletions MAINTAINERS
Original file line number Diff line number Diff line change
Expand Up @@ -4456,6 +4456,7 @@ F: drivers/power/supply/cw2015_battery.c
CEPH COMMON CODE (LIBCEPH)
M: Ilya Dryomov <idryomov@gmail.com>
M: Jeff Layton <jlayton@kernel.org>
M: Xiubo Li <xiubli@redhat.com>
L: ceph-devel@vger.kernel.org
S: Supported
W: http://ceph.com/
Expand All @@ -4466,6 +4467,7 @@ F: net/ceph/

CEPH DISTRIBUTED FILE SYSTEM CLIENT (CEPH)
M: Jeff Layton <jlayton@kernel.org>
M: Xiubo Li <xiubli@redhat.com>
M: Ilya Dryomov <idryomov@gmail.com>
L: ceph-devel@vger.kernel.org
S: Supported
Expand Down
240 changes: 112 additions & 128 deletions fs/ceph/addr.c
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ static int ceph_releasepage(struct page *page, gfp_t gfp)

static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq)
{
struct inode *inode = rreq->mapping->host;
struct inode *inode = rreq->inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_layout *lo = &ci->i_layout;
u32 blockoff;
Expand All @@ -201,7 +201,7 @@ static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq)

static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq)
{
struct inode *inode = subreq->rreq->mapping->host;
struct inode *inode = subreq->rreq->inode;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u64 objno, objoff;
Expand Down Expand Up @@ -244,10 +244,63 @@ static void finish_netfs_read(struct ceph_osd_request *req)
iput(req->r_inode);
}

/*
 * Try to satisfy a netfs read subrequest from the inode's inline data.
 *
 * A GETATTR asking for CEPH_STAT_CAP_INLINE_DATA is sent to the MDS and the
 * inline blob carried in the reply is copied into the pagecache pages that
 * back the subrequest.
 *
 * Returns true when the subrequest has been terminated here (successfully or
 * with an error) and false when the MDS reply reports CEPH_INLINE_NONE, i.e.
 * the data was uninlined in the meantime and the caller has to issue a
 * regular read instead.
 */
static bool ceph_netfs_issue_op_inline(struct netfs_read_subrequest *subreq)
{
	struct netfs_read_request *rreq = subreq->rreq;
	struct inode *inode = rreq->inode;
	struct ceph_mds_reply_info_parsed *rinfo;
	struct ceph_mds_reply_info_in *iinfo;
	struct ceph_mds_request *req;
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct iov_iter iter;
	ssize_t err = 0;
	size_t len;

	/*
	 * NOTE(review): CLEAR_TAIL presumably makes netfs zero whatever part
	 * of the subrequest is not filled below - confirm against the netfs
	 * helper library.
	 */
	__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
	__clear_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags);

	/* Nothing to fetch at or beyond EOF. */
	if (subreq->start >= inode->i_size)
		goto out;

	/* We need to fetch the inline data. */
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}
	req->r_ino1 = ci->i_vino;
	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);
	req->r_num_caps = 2;

	err = ceph_mdsc_do_request(mdsc, NULL, req);
	if (err < 0)
		goto out_put_req;	/* still holding our request reference */

	rinfo = &req->r_reply_info;
	iinfo = &rinfo->targeti;
	if (iinfo->inline_version == CEPH_INLINE_NONE) {
		/* The data got uninlined */
		ceph_mdsc_put_request(req);
		return false;
	}

	/*
	 * The inline blob may be shorter than i_size.  Without this check
	 * "inline_len - start" would go negative, min_t() would then pick
	 * subreq->len and the copy below would read past inline_data.
	 * Report zero bytes transferred and rely on CLEAR_TAIL, exactly as
	 * the EOF case above does.
	 */
	if (subreq->start >= iinfo->inline_len) {
		err = 0;
		goto out_put_req;
	}

	len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
	iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
	err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter);
	if (err == 0)
		err = -EFAULT;

out_put_req:
	ceph_mdsc_put_request(req);
out:
	netfs_subreq_terminated(subreq, err, false);
	return true;
}

static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
{
struct netfs_read_request *rreq = subreq->rreq;
struct inode *inode = rreq->mapping->host;
struct inode *inode = rreq->inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_request *req;
Expand All @@ -258,6 +311,10 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
int err = 0;
u64 len = subreq->len;

if (ci->i_inline_version != CEPH_INLINE_NONE &&
ceph_netfs_issue_op_inline(subreq))
return;

req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
0, 1, CEPH_OSD_OP_READ,
CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
Expand Down Expand Up @@ -326,23 +383,9 @@ static int ceph_readpage(struct file *file, struct page *subpage)
size_t len = folio_size(folio);
u64 off = folio_file_pos(folio);

if (ci->i_inline_version != CEPH_INLINE_NONE) {
/*
* Uptodate inline data should have been added
* into page cache while getting Fcr caps.
*/
if (off == 0) {
folio_unlock(folio);
return -EINVAL;
}
zero_user_segment(&folio->page, 0, folio_size(folio));
folio_mark_uptodate(folio);
folio_unlock(folio);
return 0;
}

dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n",
vino.ino, vino.snap, file, off, len, folio, folio_index(folio));
dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n inline %d",
vino.ino, vino.snap, file, off, len, folio, folio_index(folio),
ci->i_inline_version != CEPH_INLINE_NONE);

return netfs_readpage(file, folio, &ceph_netfs_read_ops, NULL);
}
Expand Down Expand Up @@ -1281,45 +1324,11 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct folio *folio = NULL;
pgoff_t index = pos >> PAGE_SHIFT;
int r;

/*
* Uninlining should have already been done and everything updated, EXCEPT
* for inline_version sent to the MDS.
*/
if (ci->i_inline_version != CEPH_INLINE_NONE) {
unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
if (aop_flags & AOP_FLAG_NOFS)
fgp_flags |= FGP_NOFS;
folio = __filemap_get_folio(mapping, index, fgp_flags,
mapping_gfp_mask(mapping));
if (!folio)
return -ENOMEM;

/*
* The inline_version on a new inode is set to 1. If that's the
* case, then the folio is brand new and isn't yet Uptodate.
*/
r = 0;
if (index == 0 && ci->i_inline_version != 1) {
if (!folio_test_uptodate(folio)) {
WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n",
ci->i_inline_version);
r = -EINVAL;
}
goto out;
}
zero_user_segment(&folio->page, 0, folio_size(folio));
folio_mark_uptodate(folio);
goto out;
}

r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL,
&ceph_netfs_read_ops, NULL);
out:
if (r == 0)
folio_wait_fscache(folio);
if (r < 0) {
Expand Down Expand Up @@ -1515,19 +1524,6 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
ceph_block_sigs(&oldset);

if (ci->i_inline_version != CEPH_INLINE_NONE) {
struct page *locked_page = NULL;
if (off == 0) {
lock_page(page);
locked_page = page;
}
err = ceph_uninline_data(vma->vm_file, locked_page);
if (locked_page)
unlock_page(locked_page);
if (err < 0)
goto out_free;
}

if (off + thp_size(page) <= size)
len = thp_size(page);
else
Expand Down Expand Up @@ -1584,11 +1580,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
ceph_put_snap_context(snapc);
} while (err == 0);

if (ret == VM_FAULT_LOCKED ||
ci->i_inline_version != CEPH_INLINE_NONE) {
if (ret == VM_FAULT_LOCKED) {
int dirty;
spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
Expand Down Expand Up @@ -1652,16 +1646,30 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
}
}

int ceph_uninline_data(struct file *filp, struct page *locked_page)
int ceph_uninline_data(struct file *file)
{
struct inode *inode = file_inode(filp);
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_request *req;
struct page *page = NULL;
u64 len, inline_version;
struct ceph_cap_flush *prealloc_cf;
struct folio *folio = NULL;
u64 inline_version = CEPH_INLINE_NONE;
struct page *pages[1];
int err = 0;
bool from_pagecache = false;
u64 len;

prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return -ENOMEM;

folio = read_mapping_folio(inode->i_mapping, 0, file);
if (IS_ERR(folio)) {
err = PTR_ERR(folio);
goto out;
}

folio_lock(folio);

spin_lock(&ci->i_ceph_lock);
inline_version = ci->i_inline_version;
Expand All @@ -1672,53 +1680,19 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)

if (inline_version == 1 || /* initial version, no data */
inline_version == CEPH_INLINE_NONE)
goto out;

if (locked_page) {
page = locked_page;
WARN_ON(!PageUptodate(page));
} else if (ceph_caps_issued(ci) &
(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) {
page = find_get_page(inode->i_mapping, 0);
if (page) {
if (PageUptodate(page)) {
from_pagecache = true;
lock_page(page);
} else {
put_page(page);
page = NULL;
}
}
}
goto out_unlock;

if (page) {
len = i_size_read(inode);
if (len > PAGE_SIZE)
len = PAGE_SIZE;
} else {
page = __page_cache_alloc(GFP_NOFS);
if (!page) {
err = -ENOMEM;
goto out;
}
err = __ceph_do_getattr(inode, page,
CEPH_STAT_CAP_INLINE_DATA, true);
if (err < 0) {
/* no inline data */
if (err == -ENODATA)
err = 0;
goto out;
}
len = err;
}
len = i_size_read(inode);
if (len > folio_size(folio))
len = folio_size(folio);

req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), 0, &len, 0, 1,
CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE,
NULL, 0, 0, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto out;
goto out_unlock;
}

req->r_mtime = inode->i_mtime;
Expand All @@ -1727,7 +1701,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
err = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_osdc_put_request(req);
if (err < 0)
goto out;
goto out_unlock;

req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), 0, &len, 1, 3,
Expand All @@ -1736,10 +1710,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ci->i_truncate_size, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto out;
goto out_unlock;
}

osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false);
pages[0] = folio_page(folio, 0);
osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false);

{
__le64 xattr_buf = cpu_to_le64(inline_version);
Expand All @@ -1749,7 +1724,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
CEPH_OSD_CMPXATTR_OP_GT,
CEPH_OSD_CMPXATTR_MODE_U64);
if (err)
goto out_put;
goto out_put_req;
}

{
Expand All @@ -1760,7 +1735,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
"inline_version",
xattr_buf, xattr_len, 0, 0);
if (err)
goto out_put;
goto out_put_req;
}

req->r_mtime = inode->i_mtime;
Expand All @@ -1771,19 +1746,28 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, err);

out_put:
if (!err) {
int dirty;

/* Set to CAP_INLINE_NONE and dirty the caps */
down_read(&fsc->mdsc->snap_rwsem);
spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
up_read(&fsc->mdsc->snap_rwsem);
if (dirty)
__mark_inode_dirty(inode, dirty);
}
out_put_req:
ceph_osdc_put_request(req);
if (err == -ECANCELED)
err = 0;
out_unlock:
folio_unlock(folio);
folio_put(folio);
out:
if (page && page != locked_page) {
if (from_pagecache) {
unlock_page(page);
put_page(page);
} else
__free_pages(page, 0);
}

ceph_free_cap_flush(prealloc_cf);
dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
inode, ceph_vinop(inode), inline_version, err);
return err;
Expand Down
Loading

0 comments on commit 85c7000

Please sign in to comment.