Skip to content

Commit

Permalink
xfs: allocate sparse inode chunks on full chunk allocation failure
Browse files Browse the repository at this point in the history
xfs_ialloc_ag_alloc() makes several attempts to allocate a full inode
chunk. If all else fails, reduce the allocation to the sparse length and
alignment and attempt to allocate a sparse inode chunk.

If sparse chunk allocation succeeds, check whether an inobt record
already exists that can track the chunk. If so, inherit and update the
existing record. Otherwise, insert a new record for the sparse chunk.

Create helpers to align sparse chunk inode records and insert or update
existing records in the inode btrees. The xfs_inobt_insert_sprec()
helper implements the merge or update semantics required for sparse
inode records with respect to both the inobt and finobt. To update the
inobt, either insert a new record or merge with an existing record. To
update the finobt, use the updated inobt record to either insert or
replace an existing record.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
  • Loading branch information
Brian Foster authored and Dave Chinner committed May 28, 2015
1 parent 4148c34 commit 56d1115
Show file tree
Hide file tree
Showing 4 changed files with 401 additions and 14 deletions.
330 changes: 316 additions & 14 deletions fs/xfs/libxfs/xfs_ialloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,214 @@ xfs_ialloc_inode_init(
return 0;
}

/*
* Align startino and allocmask for a recently allocated sparse chunk such that
* they are fit for insertion (or merge) into the on-disk inode btrees.
*
* Background:
*
* When enabled, sparse inode support increases the inode alignment from cluster
* size to inode chunk size. This means that the minimum range between two
* non-adjacent inode records in the inobt is large enough for a full inode
* record. This allows for cluster sized, cluster aligned block allocation
* without need to worry about whether the resulting inode record overlaps with
* another record in the tree. Without this basic rule, we would have to deal
* with the consequences of overlap by potentially undoing recent allocations in
* the inode allocation codepath.
*
* Because of this alignment rule (which is enforced on mount), there are two
* inobt possibilities for newly allocated sparse chunks. One is that the
* aligned inode record for the chunk covers a range of inodes not already
* covered in the inobt (i.e., it is safe to insert a new sparse record). The
* other is that a record already exists at the aligned startino that considers
* the newly allocated range as sparse. In the latter case, record content is
* merged in hope that sparse inode chunks fill to full chunks over time.
*/
STATIC void
xfs_align_sparse_ino(
	struct xfs_mount		*mp,
	xfs_agino_t			*startino,
	uint16_t			*allocmask)
{
	xfs_agblock_t			chunk_agbno;
	xfs_agblock_t			misalign;
	int				ino_delta;

	/*
	 * Work out how many blocks the chunk start sits past the previous
	 * inode alignment (sb_inoalignmt) boundary.
	 */
	chunk_agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
	misalign = chunk_agbno % mp->m_sb.sb_inoalignmt;

	if (misalign != 0) {
		/*
		 * Convert the block misalignment into an inode count and pull
		 * startino back by that many inodes.
		 */
		ino_delta = misalign << mp->m_sb.sb_inopblog;
		*startino -= ino_delta;

		/*
		 * startino moved down, so the mask has to move up by the
		 * matching number of holemask bits in order to keep describing
		 * the same physical inodes relative to the new startino.
		 */
		*allocmask <<= ino_delta / XFS_INODES_PER_HOLEMASK_BIT;
	}
}

/*
* Determine whether the source inode record can merge into the target. Both
* records must be sparse, the inode ranges must match and there must be no
* allocation overlap between the records.
*/
STATIC bool
__xfs_inobt_can_merge(
	struct xfs_inobt_rec_incore	*trec,	/* tgt record */
	struct xfs_inobt_rec_incore	*srec)	/* src record */
{
	/* a merge only makes sense for records at the same startino */
	if (trec->ir_startino != srec->ir_startino)
		return false;

	/* a non-sparse record on either side rules the merge out */
	if (!(xfs_inobt_issparse(trec->ir_holemask) &&
	      xfs_inobt_issparse(srec->ir_holemask)))
		return false;

	/* an empty record on either side rules the merge out */
	if (trec->ir_count == 0 || srec->ir_count == 0)
		return false;

	/* the combined inode count must still fit in one chunk */
	if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
		return false;

	/* mergeable only if the two allocation bitmaps share no bits */
	return (xfs_inobt_irec_to_allocmask(trec) &
		xfs_inobt_irec_to_allocmask(srec)) == 0;
}

/*
* Merge the source inode record into the target. The caller must call
* __xfs_inobt_can_merge() to ensure the merge is valid.
*/
STATIC void
__xfs_inobt_rec_merge(
	struct xfs_inobt_rec_incore	*trec,	/* target */
	struct xfs_inobt_rec_incore	*srec)	/* src */
{
	ASSERT(trec->ir_startino == srec->ir_startino);

	/*
	 * In both the holemask and the free mask a cleared bit stands for an
	 * allocated inode, so ANDing the masks yields the union of the
	 * allocated ranges of the two records.
	 */
	trec->ir_holemask = trec->ir_holemask & srec->ir_holemask;
	trec->ir_free = trec->ir_free & srec->ir_free;

	/* the target now accounts for the inodes of both records */
	trec->ir_freecount += srec->ir_freecount;
	trec->ir_count += srec->ir_count;
}

/*
* Insert a new sparse inode chunk into the associated inode btree. The inode
* record for the sparse chunk is pre-aligned to a startino that should match
* any pre-existing sparse inode record in the tree. This allows sparse chunks
* to fill over time.
*
* This function supports two modes of handling preexisting records depending on
* the merge flag. If merge is true, the provided record is merged with the
* existing record and updated in place. The merged record is returned in nrec.
* If merge is false, an existing record is replaced with the provided record.
* If no preexisting record exists, the provided record is always inserted.
*
* It is considered corruption if a merge is requested and not possible. Given
* the sparse inode alignment constraints, this should never happen.
*/
STATIC int
xfs_inobt_insert_sprec(
struct xfs_mount *mp,
struct xfs_trans *tp,
struct xfs_buf *agbp,
int btnum,
struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */
bool merge) /* merge or replace */
{
struct xfs_btree_cur *cur;
struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
int error;
int i;
struct xfs_inobt_rec_incore rec;

cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);

/* the new record is pre-aligned so we know where to look */
error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
if (error)
goto error;
/* if nothing there, insert a new record and return */
if (i == 0) {
error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
nrec->ir_count, nrec->ir_freecount,
nrec->ir_free, &i);
if (error)
goto error;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);

goto out;
}

/*
* A record exists at this startino. Merge or replace the record
* depending on what we've been asked to do.
*/
if (merge) {
error = xfs_inobt_get_rec(cur, &rec, &i);
if (error)
goto error;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
XFS_WANT_CORRUPTED_GOTO(mp,
rec.ir_startino == nrec->ir_startino,
error);

/*
* This should never fail. If we have coexisting records that
* cannot merge, something is seriously wrong.
*/
XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec),
error);

trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino,
rec.ir_holemask, nrec->ir_startino,
nrec->ir_holemask);

/* merge to nrec to output the updated record */
__xfs_inobt_rec_merge(nrec, &rec);

trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino,
nrec->ir_holemask);

error = xfs_inobt_rec_check_count(mp, nrec);
if (error)
goto error;
}

error = xfs_inobt_update(cur, nrec);
if (error)
goto error;

out:
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
return 0;
error:
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
return error;
}

/*
* Allocate new inodes in the allocation group specified by agbp.
* Return 0 for success, else error code.
Expand All @@ -395,6 +603,8 @@ xfs_ialloc_ag_alloc(
xfs_agino_t newlen; /* new number of inodes */
int isaligned = 0; /* inode allocation at stripe unit */
/* boundary */
uint16_t allocmask = (uint16_t) -1; /* init. to full chunk */
struct xfs_inobt_rec_incore rec;
struct xfs_perag *pag;

memset(&args, 0, sizeof(args));
Expand Down Expand Up @@ -511,6 +721,45 @@ xfs_ialloc_ag_alloc(
return error;
}

/*
* Finally, try a sparse allocation if the filesystem supports it and
* the sparse allocation length is smaller than a full chunk.
*/
if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks &&
args.fsbno == NULLFSBLOCK) {
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.agbno = be32_to_cpu(agi->agi_root);
args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
args.alignment = args.mp->m_sb.sb_spino_align;
args.prod = 1;

args.minlen = args.mp->m_ialloc_min_blks;
args.maxlen = args.minlen;

/*
* The inode record will be aligned to full chunk size. We must
* prevent sparse allocation from AG boundaries that result in
* invalid inode records, such as records that start at agbno 0
* or extend beyond the AG.
*
* Set min agbno to the first aligned, non-zero agbno and max to
* the last aligned agbno that is at least one full chunk from
* the end of the AG.
*/
args.min_agbno = args.mp->m_sb.sb_inoalignmt;
args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
args.mp->m_sb.sb_inoalignmt) -
args.mp->m_ialloc_blks;

error = xfs_alloc_vextent(&args);
if (error)
return error;

newlen = args.len << args.mp->m_sb.sb_inopblog;
allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
}

if (args.fsbno == NULLFSBLOCK) {
*alloc = 0;
return 0;
Expand All @@ -535,27 +784,80 @@ xfs_ialloc_ag_alloc(
* Convert the results.
*/
newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);

if (xfs_inobt_issparse(~allocmask)) {
/*
* We've allocated a sparse chunk. Align the startino and mask.
*/
xfs_align_sparse_ino(args.mp, &newino, &allocmask);

rec.ir_startino = newino;
rec.ir_holemask = ~allocmask;
rec.ir_count = newlen;
rec.ir_freecount = newlen;
rec.ir_free = XFS_INOBT_ALL_FREE;

/*
* Insert the sparse record into the inobt and allow for a merge
* if necessary. If a merge does occur, rec is updated to the
* merged record.
*/
error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO,
&rec, true);
if (error == -EFSCORRUPTED) {
xfs_alert(args.mp,
"invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
XFS_AGINO_TO_INO(args.mp, agno,
rec.ir_startino),
rec.ir_holemask, rec.ir_count);
xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
}
if (error)
return error;

/*
* We can't merge the part we've just allocated as for the inobt
* due to finobt semantics. The original record may or may not
* exist independent of whether physical inodes exist in this
* sparse chunk.
*
* We must update the finobt record based on the inobt record.
* rec contains the fully merged and up to date inobt record
* from the previous call. Set merge false to replace any
* existing record with this one.
*/
if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
error = xfs_inobt_insert_sprec(args.mp, tp, agbp,
XFS_BTNUM_FINO, &rec,
false);
if (error)
return error;
}
} else {
/* full chunk - insert new records to both btrees */
error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
XFS_BTNUM_INO);
if (error)
return error;

if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
error = xfs_inobt_insert(args.mp, tp, agbp, newino,
newlen, XFS_BTNUM_FINO);
if (error)
return error;
}
}

/*
* Update AGI counts and newino.
*/
be32_add_cpu(&agi->agi_count, newlen);
be32_add_cpu(&agi->agi_freecount, newlen);
pag = xfs_perag_get(args.mp, agno);
pag->pagi_freecount += newlen;
xfs_perag_put(pag);
agi->agi_newino = cpu_to_be32(newino);

/*
* Insert records describing the new inode chunk into the btrees.
*/
error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
XFS_BTNUM_INO);
if (error)
return error;

if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
XFS_BTNUM_FINO);
if (error)
return error;
}
/*
* Log allocation group header fields
*/
Expand Down
31 changes: 31 additions & 0 deletions fs/xfs/libxfs/xfs_ialloc_btree.c
Original file line number Diff line number Diff line change
Expand Up @@ -478,3 +478,34 @@ xfs_inobt_irec_to_allocmask(

return bitmap;
}

#if defined(DEBUG) || defined(XFS_WARN)
/*
* Verify that an in-core inode record has a valid inode count.
*/
int
xfs_inobt_rec_check_count(
	struct xfs_mount		*mp,
	struct xfs_inobt_rec_incore	*rec)
{
	uint64_t			allocbmap;
	int				wordsz;
	int				inocount = 0;
	int				bit;

	/* xfs_next_bit() is handed the 64-bit bitmap as unsigned int words */
	wordsz = sizeof(allocbmap) / sizeof(unsigned int);
	allocbmap = xfs_inobt_irec_to_allocmask(rec);

	/* tally every bit position reported until xfs_next_bit() returns -1 */
	for (bit = xfs_next_bit((uint *) &allocbmap, wordsz, 0);
	     bit != -1;
	     bit = xfs_next_bit((uint *) &allocbmap, wordsz, bit + 1))
		inocount++;

	/* the tally must agree with the count stored in the record */
	return inocount == rec->ir_count ? 0 : -EFSCORRUPTED;
}
#endif /* DEBUG */
Loading

0 comments on commit 56d1115

Please sign in to comment.