Skip to content

Commit

Permalink
xfs: garbage collect old cowextsz reservations
Browse files Browse the repository at this point in the history
Trim CoW reservations made on behalf of a cowextsz hint if they get too
old or we run low on quota, so long as we don't have dirty data awaiting
writeback or directio operations in progress.

Garbage collection of the cowextsize extents is kept separate from
prealloc extent reaping because setting the CoW prealloc lifetime to a
(much) higher value than the regular prealloc extent lifetime has been
useful for combatting CoW fragmentation on VM hosts where the VMs
experience bursty write behaviors and we can keep the utilization ratios
low enough that we don't start to run out of space.  IOWs, it benefits
us to keep the CoW fork reservations around for as long as we can unless
we run out of blocks or hit inode reclaim.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
  • Loading branch information
Darrick J. Wong committed Oct 5, 2016
1 parent 90e2056 commit 83104d4
Show file tree
Hide file tree
Showing 15 changed files with 287 additions and 32 deletions.
2 changes: 2 additions & 0 deletions fs/xfs/xfs_bmap_util.c
Original file line number Diff line number Diff line change
Expand Up @@ -1891,6 +1891,8 @@ xfs_swap_extents(
cowfp = ip->i_cowfp;
ip->i_cowfp = tip->i_cowfp;
tip->i_cowfp = cowfp;
xfs_inode_set_cowblocks_tag(ip);
xfs_inode_set_cowblocks_tag(tip);
}

xfs_trans_log_inode(tp, ip, src_log_flags);
Expand Down
3 changes: 3 additions & 0 deletions fs/xfs/xfs_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -782,6 +782,9 @@ xfs_file_buffered_aio_write(
enospc = xfs_inode_free_quota_eofblocks(ip);
if (enospc)
goto write_retry;
enospc = xfs_inode_free_quota_cowblocks(ip);
if (enospc)
goto write_retry;
} else if (ret == -ENOSPC && !enospc) {
struct xfs_eofblocks eofb = {0};

Expand Down
5 changes: 3 additions & 2 deletions fs/xfs/xfs_globals.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
/*
* Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n,
* other XFS code uses these values. Times are measured in centisecs (i.e.
* 100ths of a second) with the exception of eofb_timer, which is measured in
* seconds.
* 100ths of a second) with the exception of eofb_timer and cowb_timer, which
* are measured in seconds.
*/
xfs_param_t xfs_params = {
/* MIN DFLT MAX */
Expand All @@ -42,6 +42,7 @@ xfs_param_t xfs_params = {
.inherit_nodfrg = { 0, 1, 1 },
.fstrm_timer = { 1, 30*100, 3600*100},
.eofb_timer = { 1, 300, 3600*24},
.cowb_timer = { 1, 1800, 3600*24},
};

struct xfs_globals xfs_globals = {
Expand Down
238 changes: 209 additions & 29 deletions fs/xfs/xfs_icache.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include "xfs_bmap_util.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_reflink.h"

#include <linux/kthread.h>
#include <linux/freezer.h>
Expand Down Expand Up @@ -792,6 +793,33 @@ xfs_eofblocks_worker(
xfs_queue_eofblocks(mp);
}

/*
 * Background scanning to trim preallocated CoW space.  The scan is queued
 * with a delay of xfs_cowb_secs seconds, which is controlled by the
 * 'speculative_cow_prealloc_lifetime' tunable.  The cowb_timer default in
 * xfs_params is 1800 seconds, i.e. 30 minutes (presumably the value backing
 * xfs_cowb_secs -- confirm against the sysctl definitions).
 * (We'll just piggyback on the post-EOF prealloc space workqueue.)
 *
 * Nothing is queued unless at least one entry in the perag tree carries
 * XFS_ICI_COWBLOCKS_TAG.
 */
STATIC void
xfs_queue_cowblocks(
	struct xfs_mount	*mp)
{
	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG))
		queue_delayed_work(mp->m_eofblocks_workqueue,
				   &mp->m_cowblocks_work,
				   msecs_to_jiffies(xfs_cowb_secs * 1000));
	rcu_read_unlock();
}

/*
 * Delayed-work handler for the CoW preallocation scanner.  Recovers the
 * owning mount from the embedded m_cowblocks_work item, runs one
 * xfs_icache_free_cowblocks() pass over it, and then asks
 * xfs_queue_cowblocks() to schedule the next pass.
 */
void
xfs_cowblocks_worker(
	struct work_struct	*work)
{
	struct delayed_work	*dwork = to_delayed_work(work);
	struct xfs_mount	*mp;

	mp = container_of(dwork, struct xfs_mount, m_cowblocks_work);

	/* A NULL eofb means the scan applies no owner or file size filter. */
	xfs_icache_free_cowblocks(mp, NULL);
	xfs_queue_cowblocks(mp);
}

int
xfs_inode_ag_iterator(
struct xfs_mount *mp,
Expand Down Expand Up @@ -1348,18 +1376,30 @@ xfs_inode_free_eofblocks(
return ret;
}

int
xfs_icache_free_eofblocks(
static int
__xfs_icache_free_eofblocks(
struct xfs_mount *mp,
struct xfs_eofblocks *eofb)
struct xfs_eofblocks *eofb,
int (*execute)(struct xfs_inode *ip, int flags,
void *args),
int tag)
{
int flags = SYNC_TRYLOCK;

if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
flags = SYNC_WAIT;

return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags,
eofb, XFS_ICI_EOFBLOCKS_TAG);
return xfs_inode_ag_iterator_tag(mp, execute, flags,
eofb, tag);
}

/*
 * Scan the inodes tagged XFS_ICI_EOFBLOCKS_TAG and run
 * xfs_inode_free_eofblocks() on each of them, using the shared
 * __xfs_icache_free_eofblocks() helper.  That helper scans with SYNC_WAIT
 * when @eofb is non-NULL and has XFS_EOF_FLAGS_SYNC set, and with
 * SYNC_TRYLOCK otherwise.  Returns whatever the helper returns.
 */
int
xfs_icache_free_eofblocks(
struct xfs_mount *mp,
struct xfs_eofblocks *eofb)
{
return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks,
XFS_ICI_EOFBLOCKS_TAG);
}

/*
Expand All @@ -1368,9 +1408,11 @@ xfs_icache_free_eofblocks(
* failure. We make a best effort by including each quota under low free space
* conditions (less than 1% free space) in the scan.
*/
int
xfs_inode_free_quota_eofblocks(
struct xfs_inode *ip)
static int
__xfs_inode_free_quota_eofblocks(
struct xfs_inode *ip,
int (*execute)(struct xfs_mount *mp,
struct xfs_eofblocks *eofb))
{
int scan = 0;
struct xfs_eofblocks eofb = {0};
Expand Down Expand Up @@ -1406,14 +1448,25 @@ xfs_inode_free_quota_eofblocks(
}

if (scan)
xfs_icache_free_eofblocks(ip->i_mount, &eofb);
execute(ip->i_mount, &eofb);

return scan;
}

void
xfs_inode_set_eofblocks_tag(
xfs_inode_t *ip)
/*
 * Quota-driven post-EOF preallocation scan for @ip.  Passes
 * xfs_icache_free_eofblocks() to __xfs_inode_free_quota_eofblocks() as the
 * scan routine and returns that helper's "scan" indicator (non-zero when a
 * scan was actually run).
 */
int
xfs_inode_free_quota_eofblocks(
struct xfs_inode *ip)
{
return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks);
}

static void
__xfs_inode_set_eofblocks_tag(
xfs_inode_t *ip,
void (*execute)(struct xfs_mount *mp),
void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
int error, unsigned long caller_ip),
int tag)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_perag *pag;
Expand All @@ -1431,35 +1484,44 @@ xfs_inode_set_eofblocks_tag(

pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
spin_lock(&pag->pag_ici_lock);
trace_xfs_inode_set_eofblocks_tag(ip);

tagged = radix_tree_tagged(&pag->pag_ici_root,
XFS_ICI_EOFBLOCKS_TAG);
tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
radix_tree_tag_set(&pag->pag_ici_root,
XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
XFS_ICI_EOFBLOCKS_TAG);
XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
if (!tagged) {
/* propagate the eofblocks tag up into the perag radix tree */
spin_lock(&ip->i_mount->m_perag_lock);
radix_tree_tag_set(&ip->i_mount->m_perag_tree,
XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
XFS_ICI_EOFBLOCKS_TAG);
tag);
spin_unlock(&ip->i_mount->m_perag_lock);

/* kick off background trimming */
xfs_queue_eofblocks(ip->i_mount);
execute(ip->i_mount);

trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno,
-1, _RET_IP_);
set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
}

spin_unlock(&pag->pag_ici_lock);
xfs_perag_put(pag);
}

void
xfs_inode_clear_eofblocks_tag(
xfs_inode_set_eofblocks_tag(
xfs_inode_t *ip)
{
trace_xfs_inode_set_eofblocks_tag(ip);
return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_eofblocks,
trace_xfs_perag_set_eofblocks,
XFS_ICI_EOFBLOCKS_TAG);
}

static void
__xfs_inode_clear_eofblocks_tag(
xfs_inode_t *ip,
void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
int error, unsigned long caller_ip),
int tag)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_perag *pag;
Expand All @@ -1470,23 +1532,141 @@ xfs_inode_clear_eofblocks_tag(

pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
spin_lock(&pag->pag_ici_lock);
trace_xfs_inode_clear_eofblocks_tag(ip);

radix_tree_tag_clear(&pag->pag_ici_root,
XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
XFS_ICI_EOFBLOCKS_TAG);
if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) {
XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
if (!radix_tree_tagged(&pag->pag_ici_root, tag)) {
/* clear the eofblocks tag from the perag radix tree */
spin_lock(&ip->i_mount->m_perag_lock);
radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
XFS_ICI_EOFBLOCKS_TAG);
tag);
spin_unlock(&ip->i_mount->m_perag_lock);
trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno,
-1, _RET_IP_);
clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
}

spin_unlock(&pag->pag_ici_lock);
xfs_perag_put(pag);
}

/*
 * Remove the XFS_ICI_EOFBLOCKS_TAG mark from @ip.  Emits the per-inode
 * tracepoint here and leaves the radix tree work (including clearing the
 * tag from the perag tree once no inode in the AG carries it) to
 * __xfs_inode_clear_eofblocks_tag(), which fires the per-AG tracepoint.
 */
void
xfs_inode_clear_eofblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_clear_eofblocks_tag(ip);

	/* The helper returns void, so call it as a plain statement. */
	__xfs_inode_clear_eofblocks_tag(ip, trace_xfs_perag_clear_eofblocks,
					XFS_ICI_EOFBLOCKS_TAG);
}

/*
 * Automatic CoW Reservation Freeing
 *
 * These functions automatically garbage collect leftover CoW reservations
 * that were made on behalf of a cowextsize hint when we start to run out
 * of quota or when the reservations sit around for too long.  If the file
 * has dirty pages or is undergoing writeback, its CoW reservations will
 * be retained.
 *
 * The actual garbage collection piggybacks off the same code that runs
 * the speculative EOF preallocation garbage collector.
 */

/*
 * Per-inode callback for the CoW reservation scan.
 *
 * @ip:    inode to examine
 * @flags: scan flags handed in by the AG iterator; not consulted here
 * @args:  optional struct xfs_eofblocks filter, may be NULL
 *
 * Returns 0 when the inode is skipped, otherwise the result of
 * xfs_reflink_cancel_cow_range() over the whole file.
 */
STATIC int
xfs_inode_free_cowblocks(
	struct xfs_inode	*ip,
	int			flags,
	void			*args)
{
	struct xfs_eofblocks	*eofb = args;
	struct address_space	*mapping = VFS_I(ip)->i_mapping;
	bool			need_iolock = true;
	int			match;
	int			ret;

	/* A filter, when supplied, must name a scan owner. */
	ASSERT(!eofb || eofb->eof_scan_owner != 0);

	/* Nothing to collect: drop the tag so later scans skip this inode. */
	if (!xfs_reflink_has_real_cow_blocks(ip)) {
		trace_xfs_inode_free_cowblocks_invalid(ip);
		xfs_inode_clear_cowblocks_tag(ip);
		return 0;
	}

	/*
	 * If the mapping is dirty or under writeback we cannot touch the
	 * CoW fork.  Leave it alone if we're in the midst of a directio.
	 * Note that this check runs before any inode lock is taken.
	 */
	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) ||
	    mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK) ||
	    atomic_read(&VFS_I(ip)->i_dio_count))
		return 0;

	if (eofb) {
		if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
			match = xfs_inode_match_id_union(ip, eofb);
		else
			match = xfs_inode_match_id(ip, eofb);
		if (!match)
			return 0;

		/* skip the inode if the file size is too small */
		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
		    XFS_ISIZE(ip) < eofb->eof_min_file_size)
			return 0;

		/*
		 * A scan owner implies we already hold the iolock.  Do not
		 * take the locks again below, to avoid deadlocking against
		 * ourselves.
		 * NOTE(review): the MMAPLOCK is skipped as well in this
		 * case -- confirm that scan owners expect that.
		 */
		if (eofb->eof_scan_owner == ip->i_ino)
			need_iolock = false;
	}

	/* Free the CoW blocks */
	if (need_iolock) {
		xfs_ilock(ip, XFS_IOLOCK_EXCL);
		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
	}

	ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);

	/* Release in the reverse order of acquisition. */
	if (need_iolock) {
		xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	}

	return ret;
}

/*
 * Scan the inodes tagged XFS_ICI_COWBLOCKS_TAG and run
 * xfs_inode_free_cowblocks() on each of them, using the shared
 * __xfs_icache_free_eofblocks() helper.  @eofb may be NULL, in which case
 * no filter is applied; when it has XFS_EOF_FLAGS_SYNC set the helper scans
 * with SYNC_WAIT instead of SYNC_TRYLOCK.  Returns whatever the helper
 * returns.
 */
int
xfs_icache_free_cowblocks(
struct xfs_mount *mp,
struct xfs_eofblocks *eofb)
{
return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks,
XFS_ICI_COWBLOCKS_TAG);
}

/*
 * Quota-driven CoW reservation scan for @ip.  Passes
 * xfs_icache_free_cowblocks() to __xfs_inode_free_quota_eofblocks() as the
 * scan routine and returns that helper's "scan" indicator (non-zero when a
 * scan was actually run); the buffered write path uses it to decide whether
 * to retry the write.
 */
int
xfs_inode_free_quota_cowblocks(
struct xfs_inode *ip)
{
return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks);
}

/*
 * Mark @ip with XFS_ICI_COWBLOCKS_TAG so the CoW reservation scanner will
 * visit it.  The radix tree work is done by __xfs_inode_set_eofblocks_tag(),
 * which calls xfs_queue_cowblocks() to kick off background trimming when
 * this is the first tagged inode in its AG.
 *
 * NOTE(review): both tracepoints used here are the *eofblocks* ones even
 * though the tag being set is the cowblocks tag.  This looks like a
 * copy-and-paste leftover; switch to cowblocks-specific tracepoints if the
 * trace header defines them.
 */
void
xfs_inode_set_cowblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_set_eofblocks_tag(ip);

	/* The helper returns void, so call it as a plain statement. */
	__xfs_inode_set_eofblocks_tag(ip, xfs_queue_cowblocks,
				      trace_xfs_perag_set_eofblocks,
				      XFS_ICI_COWBLOCKS_TAG);
}

/*
 * Remove the XFS_ICI_COWBLOCKS_TAG mark from @ip.  The radix tree work
 * (including clearing the tag from the perag tree once no inode in the AG
 * carries it) is done by __xfs_inode_clear_eofblocks_tag().
 *
 * NOTE(review): both tracepoints used here are the *eofblocks* ones even
 * though the tag being cleared is the cowblocks tag.  This looks like a
 * copy-and-paste leftover; switch to cowblocks-specific tracepoints if the
 * trace header defines them.
 */
void
xfs_inode_clear_cowblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_clear_eofblocks_tag(ip);

	/* The helper returns void, so call it as a plain statement. */
	__xfs_inode_clear_eofblocks_tag(ip, trace_xfs_perag_clear_eofblocks,
					XFS_ICI_COWBLOCKS_TAG);
}
7 changes: 7 additions & 0 deletions fs/xfs/xfs_icache.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ struct xfs_eofblocks {
in xfs_inode_ag_iterator */
#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */
#define XFS_ICI_COWBLOCKS_TAG 2 /* inode can have cow blocks to gc */

/*
* Flags for xfs_iget()
Expand Down Expand Up @@ -70,6 +71,12 @@ int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip);
void xfs_eofblocks_worker(struct work_struct *);
void xfs_queue_eofblocks(struct xfs_mount *);

void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip);
void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip);
int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *);
int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip);
void xfs_cowblocks_worker(struct work_struct *);

int xfs_inode_ag_iterator(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, int flags, void *args),
int flags, void *args);
Expand Down
4 changes: 3 additions & 1 deletion fs/xfs/xfs_inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1629,8 +1629,10 @@ xfs_itruncate_extents(
/*
* Clear the reflink flag if we truncated everything.
*/
if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip))
if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) {
ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
xfs_inode_clear_cowblocks_tag(ip);
}

/*
* Always re-log the inode so that our permanent transaction can keep
Expand Down
Loading

0 comments on commit 83104d4

Please sign in to comment.