From f85c10e24ab9fd8ccb6de3d6061a3110ff3581df Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 13 Jun 2018 08:52:47 -0500 Subject: [PATCH 01/25] gfs2: eliminate rs_inum and reduce the size of gfs2 inodes Before this patch, block reservations kept track of the inode number. At one point, that was a valid thing to do. However, since we made the reservation a part of the inode (rather than a pointer to a separate allocated object) the reservation can determine the inode number by using container_of. This saves us a little memory in our inode. Signed-off-by: Bob Peterson Acked-by: Steven Whitehouse Reviewed-by: Andreas Gruenbacher --- fs/gfs2/incore.h | 1 - fs/gfs2/rgrp.c | 5 +++-- fs/gfs2/trace_gfs2.h | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index d2ad817e089f9..e9cd2cc292d33 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -295,7 +295,6 @@ struct gfs2_blkreserv { struct rb_node rs_node; /* link to other block reservations */ struct gfs2_rbm rs_rbm; /* Start of reservation */ u32 rs_free; /* how many blocks are still free */ - u64 rs_inum; /* Inode number for reservation */ }; /* diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 6bc5cfe710d18..7a001f6e8aee7 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -607,8 +607,10 @@ int gfs2_rsqa_alloc(struct gfs2_inode *ip) static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs) { + struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res); + gfs2_print_dbg(seq, " B: n:%llu s:%llu b:%u f:%u\n", - (unsigned long long)rs->rs_inum, + (unsigned long long)ip->i_no_addr, (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm), rs->rs_rbm.offset, rs->rs_free); } @@ -1528,7 +1530,6 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, if (ret == 0) { rs->rs_rbm = rbm; rs->rs_free = extlen; - rs->rs_inum = ip->i_no_addr; rs_insert(ip); } else { if (goal == rgd->rd_last_alloc + rgd->rd_data0) diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index cb10b95efe0f7..e0025258107a8 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -606,7 +606,8 @@ TRACE_EVENT(gfs2_rs, __entry->rd_addr = rs->rs_rbm.rgd->rd_addr; __entry->rd_free_clone = rs->rs_rbm.rgd->rd_free_clone; __entry->rd_reserved = rs->rs_rbm.rgd->rd_reserved; - __entry->inum = rs->rs_inum; + __entry->inum = container_of(rs, struct gfs2_inode, + i_res)->i_no_addr; __entry->start = gfs2_rbm_to_block(&rs->rs_rbm); __entry->free = rs->rs_free; __entry->func = func; From 9e1a9ecd13b9bb421c88135b178577caf4d54f6a Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 7 Jun 2018 11:56:46 +0100 Subject: [PATCH 02/25] gfs2: Don't withdraw under a spin lock In two places, the gfs2_io_error_bh macro is called while holding the sd_ail_lock spin lock. This isn't allowed because gfs2_io_error_bh withdraws the filesystem, which can sleep because it issues a uevent. To fix that, add a gfs2_io_error_bh_wd macro that does withdraw the filesystem and change gfs2_io_error_bh to not withdraw the filesystem. In those places where the new gfs2_io_error_bh is used, withdraw the filesystem after releasing sd_ail_lock. Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson Reviewed-by: Andrew Price --- fs/gfs2/log.c | 26 +++++++++++++++++++------- fs/gfs2/lops.c | 2 +- fs/gfs2/meta_io.c | 4 ++-- fs/gfs2/util.c | 38 ++++++++++++++++++++------------------ fs/gfs2/util.h | 10 +++++++--- 5 files changed, 49 insertions(+), 31 deletions(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 0248835625f13..a767fad023860 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -92,7 +92,8 @@ static void gfs2_remove_from_ail(struct gfs2_bufdata *bd) static int gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct writeback_control *wbc, - struct gfs2_trans *tr) + struct gfs2_trans *tr, + bool *withdraw) __releases(&sdp->sd_ail_lock) __acquires(&sdp->sd_ail_lock) { @@ -107,8 +108,10 @@ __acquires(&sdp->sd_ail_lock) gfs2_assert(sdp, bd->bd_tr == tr); if (!buffer_busy(bh)) { - if (!buffer_uptodate(bh)) + if (!buffer_uptodate(bh)) { gfs2_io_error_bh(sdp, bh); + *withdraw = true; + } list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list); continue; } @@ -148,6 +151,7 @@ void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc) struct list_head *head = &sdp->sd_ail1_list; struct gfs2_trans *tr; struct blk_plug plug; + bool withdraw = false; trace_gfs2_ail_flush(sdp, wbc, 1); blk_start_plug(&plug); @@ -156,11 +160,13 @@ void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc) list_for_each_entry_reverse(tr, head, tr_list) { if (wbc->nr_to_write <= 0) break; - if (gfs2_ail1_start_one(sdp, wbc, tr)) + if (gfs2_ail1_start_one(sdp, wbc, tr, &withdraw)) goto restart; } spin_unlock(&sdp->sd_ail_lock); blk_finish_plug(&plug); + if (withdraw) + gfs2_lm_withdraw(sdp, NULL); trace_gfs2_ail_flush(sdp, wbc, 0); } @@ -188,7 +194,8 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp) * */ -static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + bool *withdraw) { struct gfs2_bufdata *bd, *s; struct buffer_head *bh; @@ -199,11 +206,12 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr) gfs2_assert(sdp, bd->bd_tr == tr); if (buffer_busy(bh)) continue; - if (!buffer_uptodate(bh)) + if (!buffer_uptodate(bh)) { gfs2_io_error_bh(sdp, bh); + *withdraw = true; + } list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list); } - } /** @@ -218,10 +226,11 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp) struct gfs2_trans *tr, *s; int oldest_tr = 1; int ret; + bool withdraw = false; spin_lock(&sdp->sd_ail_lock); list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) { - gfs2_ail1_empty_one(sdp, tr); + gfs2_ail1_empty_one(sdp, tr, &withdraw); if (list_empty(&tr->tr_ail1_list) && oldest_tr) list_move(&tr->tr_list, &sdp->sd_ail2_list); else @@ -230,6 +239,9 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp) ret = list_empty(&sdp->sd_ail1_list); spin_unlock(&sdp->sd_ail_lock); + if (withdraw) + gfs2_lm_withdraw(sdp, "fatal: I/O error(s)\n"); + return ret; } diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 4d6567990baf2..f2567f958d002 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -49,7 +49,7 @@ void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) if (test_set_buffer_pinned(bh)) gfs2_assert_withdraw(sdp, 0); if (!buffer_uptodate(bh)) - gfs2_io_error_bh(sdp, bh); + gfs2_io_error_bh_wd(sdp, bh); bd = bh->b_private; /* If this buffer is in the AIL and it has already been written * to in-place disk block, remove it from the AIL. diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 52de1036d9f92..be9c0bf697fe6 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -293,7 +293,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, if (unlikely(!buffer_uptodate(bh))) { struct gfs2_trans *tr = current->journal_info; if (tr && test_bit(TR_TOUCHED, &tr->tr_flags)) - gfs2_io_error_bh(sdp, bh); + gfs2_io_error_bh_wd(sdp, bh); brelse(bh); *bhp = NULL; return -EIO; @@ -320,7 +320,7 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) if (!buffer_uptodate(bh)) { struct gfs2_trans *tr = current->journal_info; if (tr && test_bit(TR_TOUCHED, &tr->tr_flags)) - gfs2_io_error_bh(sdp, bh); + gfs2_io_error_bh_wd(sdp, bh); return -EIO; } if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 763d659db91b4..59c811de0dc7a 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -46,14 +46,16 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...) test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) return 0; - va_start(args, fmt); + if (fmt) { + va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; + vaf.fmt = fmt; + vaf.va = &args; - fs_err(sdp, "%pV", &vaf); + fs_err(sdp, "%pV", &vaf); - va_end(args); + va_end(args); + } if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) { fs_err(sdp, "about to withdraw this file system\n"); @@ -246,21 +248,21 @@ int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, } /** - * gfs2_io_error_bh_i - Flag a buffer I/O error and withdraw - * Returns: -1 if this call withdrew the machine, - * 0 if it was already withdrawn + * gfs2_io_error_bh_i - Flag a buffer I/O error + * @withdraw: withdraw the filesystem */ -int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, - const char *function, char *file, unsigned int line) +void gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *function, char *file, unsigned int line, + bool withdraw) { - int rv; - rv = gfs2_lm_withdraw(sdp, - "fatal: I/O error\n" - " block = %llu\n" - " function = %s, file = %s, line = %u\n", - (unsigned long long)bh->b_blocknr, - function, file, line); - return rv; + fs_err(sdp, + "fatal: I/O error\n" + " block = %llu\n" + " function = %s, file = %s, line = %u\n", + (unsigned long long)bh->b_blocknr, + function, file, line); + if (withdraw) + gfs2_lm_withdraw(sdp, NULL); } diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 3926f95a6eb70..96ac4aba47387 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -136,11 +136,15 @@ int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, gfs2_io_error_i((sdp), __func__, __FILE__, __LINE__); -int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, - const char *function, char *file, unsigned int line); +void gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *function, char *file, unsigned int line, + bool withdraw); + +#define gfs2_io_error_bh_wd(sdp, bh) \ +gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__, true); #define gfs2_io_error_bh(sdp, bh) \ -gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__); +gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__, false); extern struct kmem_cache *gfs2_glock_cachep; From 00251a16d7f9eb380437b402def05cd7c1b16c09 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 18 Jun 2018 16:34:59 +0100 Subject: [PATCH 03/25] gfs2: Minor clarification to __gfs2_punch_hole Rename end_off to end_len to make the code less confusing. Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson --- fs/gfs2/bmap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index ed6699705c13c..c7287afeeef5b 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -2154,11 +2154,11 @@ int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length) if (error) goto out; } else { - unsigned int start_off, end_off, blocksize; + unsigned int start_off, end_len, blocksize; blocksize = i_blocksize(inode); start_off = offset & (blocksize - 1); - end_off = (offset + length) & (blocksize - 1); + end_len = (offset + length) & (blocksize - 1); if (start_off) { unsigned int len = length; if (length > blocksize - start_off) @@ -2167,11 +2167,11 @@ int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length) if (error) goto out; if (start_off + length < blocksize) - end_off = 0; + end_len = 0; } - if (end_off) { + if (end_len) { error = gfs2_block_zero_range(inode, - offset + length - end_off, end_off); + offset + length - end_len, end_len); if (error) goto out; } From ee9c7f9ae3d4fb9fb5c9cacbe3880d5dd66feb16 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 20 Jun 2018 15:15:24 -0500 Subject: [PATCH 04/25] gfs2: call ktime_get_coarse_real_ts64() directly current_kernel_time64() is now just a deprecated wrapper around ktime_get_coarse_real_ts64(), so let's just call that directly. Signed-off-by: Arnd Bergmann Signed-off-by: Bob Peterson --- fs/gfs2/log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index a767fad023860..ee20ea42e7b59 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -701,7 +701,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, hash = ~crc32(~0, lh, LH_V1_SIZE); lh->lh_hash = cpu_to_be32(hash); - tv = current_kernel_time64(); + ktime_get_coarse_real_ts64(&tv); lh->lh_nsec = cpu_to_be32(tv.tv_nsec); lh->lh_sec = cpu_to_be64(tv.tv_sec); addr = gfs2_log_bmap(sdp); From d505a96a3b16f46455035dc0296bc2da6014e163 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sun, 24 Jun 2018 10:43:49 +0100 Subject: [PATCH 05/25] gfs2: Further iomap cleanups In gfs2_iomap_alloc, set the type of newly allocated extents to IOMAP_MAPPED so that iomap_to_bh will set the bh states correctly: otherwise, the bhs would not be marked as mapped, confusing __mpage_writepage. This means that we need to check for the IOMAP_F_NEW flag in fallocate_chunk now. Further clean up gfs2_iomap_get and implement gfs2_stuffed_iomap here directly. For reads beyond the end of the file, return holes instead of failing with -ENOENT so that we can get rid of that special case in gfs2_block_map. Signed-off-by: Andreas Gruenbacher Reviewed-by: Bob Peterson --- fs/gfs2/bmap.c | 74 ++++++++++++++++++++++++++++---------------------- fs/gfs2/file.c | 2 +- 2 files changed, 43 insertions(+), 33 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index ed6699705c13c..33ee93344d18c 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -750,6 +750,7 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, } } while (iomap->addr == IOMAP_NULL_ADDR); + iomap->type = IOMAP_MAPPED; iomap->length = (u64)dblks << inode->i_blkbits; ip->i_height = mp->mp_fheight; gfs2_add_inode_blocks(&ip->i_inode, alloced); @@ -759,17 +760,6 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, return ret; } -static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap) -{ - struct gfs2_inode *ip = GFS2_I(inode); - - iomap->addr = (ip->i_no_addr << inode->i_blkbits) + - sizeof(struct gfs2_dinode); - iomap->offset = 0; - iomap->length = i_size_read(inode); - iomap->type = IOMAP_INLINE; -} - #define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE /** @@ -789,37 +779,61 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + loff_t size = i_size_read(inode); __be64 *ptr; sector_t lblock; sector_t lblock_stop; int ret; int eob; u64 len; - struct buffer_head *bh; + struct buffer_head *dibh = NULL, *bh; u8 height; if (!length) return -EINVAL; + down_read(&ip->i_rw_mutex); + + ret = gfs2_meta_inode_buffer(ip, &dibh); + if (ret) + goto unlock; + if (gfs2_is_stuffed(ip)) { - if (flags & IOMAP_REPORT) { - if (pos >= i_size_read(inode)) - return -ENOENT; - gfs2_stuffed_iomap(inode, iomap); - return 0; + if (flags & IOMAP_WRITE) { + loff_t max_size = gfs2_max_stuffed_size(ip); + + if (pos + length > max_size) + goto unstuff; + iomap->length = max_size; + } else { + if (pos >= size) { + if (flags & IOMAP_REPORT) { + ret = -ENOENT; + goto unlock; + } else { + /* report a hole */ + iomap->offset = pos; + iomap->length = length; + goto do_alloc; + } + } + iomap->length = size; } - BUG_ON(!(flags & IOMAP_WRITE)); + iomap->addr = (ip->i_no_addr << inode->i_blkbits) + + sizeof(struct gfs2_dinode); + iomap->type = IOMAP_INLINE; + goto out; } + +unstuff: lblock = pos >> inode->i_blkbits; iomap->offset = lblock << inode->i_blkbits; lblock_stop = (pos + length - 1) >> inode->i_blkbits; len = lblock_stop - lblock + 1; + iomap->length = len << inode->i_blkbits; - down_read(&ip->i_rw_mutex); - - ret = gfs2_meta_inode_buffer(ip, &mp->mp_bh[0]); - if (ret) - goto unlock; + get_bh(dibh); + mp->mp_bh[0] = dibh; height = ip->i_height; while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height]) @@ -853,21 +867,23 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, iomap->bdev = inode->i_sb->s_bdev; unlock: up_read(&ip->i_rw_mutex); + if (dibh) + brelse(dibh); return ret; do_alloc: iomap->addr = IOMAP_NULL_ADDR; - iomap->length = len << inode->i_blkbits; iomap->type = IOMAP_HOLE; - iomap->flags = 0; if (flags & IOMAP_REPORT) { - loff_t size = i_size_read(inode); if (pos >= size) ret = -ENOENT; else if (height == ip->i_height) ret = gfs2_hole_size(inode, lblock, len, mp, iomap); else iomap->length = size - pos; + } else if (!(flags & IOMAP_WRITE)) { + if (pos < size && height == ip->i_height) + ret = gfs2_hole_size(inode, lblock, len, mp, iomap); } goto out; } @@ -941,12 +957,6 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, } else { ret = gfs2_iomap_get(inode, pos, length, 0, &iomap, &mp); release_metapath(&mp); - - /* Return unmapped buffer beyond the end of file. */ - if (ret == -ENOENT) { - ret = 0; - goto out; - } } if (ret) goto out; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 7137db7b0119d..6f6bbfbff13d3 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -754,7 +754,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, if (error) goto out; offset = iomap.offset + iomap.length; - if (iomap.type != IOMAP_HOLE) + if (!(iomap.flags & IOMAP_F_NEW)) continue; error = sb_issue_zeroout(sb, iomap.addr >> inode->i_blkbits, iomap.length >> inode->i_blkbits, From 64bc06bb32ee9cf458f432097113c8b495d75757 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sun, 24 Jun 2018 15:04:04 +0100 Subject: [PATCH 06/25] gfs2: iomap buffered write support With the traditional page-based writes, blocks are allocated separately for each page written to. With iomap writes, we can allocate a lot more blocks at once, with a fraction of the allocation overhead for each page. Split calculating the number of blocks that can be allocated at a given position (gfs2_alloc_size) off from gfs2_iomap_alloc: that size determines the number of blocks to allocate and reserve in the journal. Signed-off-by: Andreas Gruenbacher Reviewed-by: Bob Peterson --- fs/gfs2/aops.c | 20 ++-- fs/gfs2/aops.h | 19 ++++ fs/gfs2/bmap.c | 298 ++++++++++++++++++++++++++++++++++++++++++++----- fs/gfs2/file.c | 44 +++++++- 4 files changed, 338 insertions(+), 43 deletions(-) create mode 100644 fs/gfs2/aops.h diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 35f5ee23566d6..ecfbca9c88ff8 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -36,10 +37,11 @@ #include "super.h" #include "util.h" #include "glops.h" +#include "aops.h" -static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, - unsigned int from, unsigned int len) +void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, + unsigned int from, unsigned int len) { struct buffer_head *head = page_buffers(page); unsigned int bsize = head->b_size; @@ -462,7 +464,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping, * Returns: errno */ -static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) +int stuffed_readpage(struct gfs2_inode *ip, struct page *page) { struct buffer_head *dibh; u64 dsize = i_size_read(&ip->i_inode); @@ -776,7 +778,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, * adjust_fs_space - Adjusts the free space available due to gfs2_grow * @inode: the rindex inode */ -static void adjust_fs_space(struct inode *inode) +void adjust_fs_space(struct inode *inode) { struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); @@ -822,11 +824,11 @@ static void adjust_fs_space(struct inode *inode) * This copies the data from the page into the inode block after * the inode data structure itself. * - * Returns: errno + * Returns: copied bytes or errno */ -static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, - loff_t pos, unsigned copied, - struct page *page) +int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, + loff_t pos, unsigned copied, + struct page *page) { struct gfs2_inode *ip = GFS2_I(inode); u64 to = pos + copied; @@ -865,7 +867,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, * The main write_end function for GFS2. We just put our locking around the VFS * provided functions. * - * Returns: errno + * Returns: copied bytes or errno */ static int gfs2_write_end(struct file *file, struct address_space *mapping, diff --git a/fs/gfs2/aops.h b/fs/gfs2/aops.h new file mode 100644 index 0000000000000..fa8e5d0144ddf --- /dev/null +++ b/fs/gfs2/aops.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018 Red Hat, Inc. All rights reserved. + */ + +#ifndef __AOPS_DOT_H__ +#define __AOPS_DOT_H__ + +#include "incore.h" + +extern int stuffed_readpage(struct gfs2_inode *ip, struct page *page); +extern int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, + loff_t pos, unsigned copied, + struct page *page); +extern void adjust_fs_space(struct inode *inode); +extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, + unsigned int from, unsigned int len); + +#endif /* __AOPS_DOT_H__ */ diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 33ee93344d18c..9a699c0a5df17 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -28,6 +28,7 @@ #include "trans.h" #include "dir.h" #include "util.h" +#include "aops.h" #include "trace_gfs2.h" /* This doesn't need to be that large as max 64 bit pointers in a 4k @@ -41,6 +42,8 @@ struct metapath { int mp_aheight; /* actual height (lookup height) */ }; +static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length); + /** * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page * @ip: the inode @@ -389,7 +392,7 @@ static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h) return mp->mp_aheight - x - 1; } -static inline void release_metapath(struct metapath *mp) +static void release_metapath(struct metapath *mp) { int i; @@ -397,6 +400,7 @@ static inline void release_metapath(struct metapath *mp) if (mp->mp_bh[i] == NULL) break; brelse(mp->mp_bh[i]); + mp->mp_bh[i] = NULL; } } @@ -609,11 +613,13 @@ enum alloc_state { * ii) Indirect blocks to fill in lower part of the metadata tree * iii) Data blocks * - * The function is in two parts. The first part works out the total - * number of blocks which we need. The second part does the actual - * allocation asking for an extent at a time (if enough contiguous free - * blocks are available, there will only be one request per bmap call) - * and uses the state machine to initialise the blocks in order. + * This function is called after gfs2_iomap_get, which works out the + * total number of blocks which we need via gfs2_alloc_size. + * + * We then do the actual allocation asking for an extent at a time (if + * enough contiguous free blocks are available, there will only be one + * allocation request per call) and uses the state machine to initialise + * the blocks in order. * * Right now, this function will allocate at most one indirect block * worth of data -- with a default block size of 4K, that's slightly @@ -633,39 +639,26 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, struct buffer_head *dibh = mp->mp_bh[0]; u64 bn; unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; - unsigned dblks = 0; - unsigned ptrs_per_blk; + size_t dblks = iomap->length >> inode->i_blkbits; const unsigned end_of_metadata = mp->mp_fheight - 1; int ret; enum alloc_state state; __be64 *ptr; __be64 zero_bn = 0; - size_t maxlen = iomap->length >> inode->i_blkbits; BUG_ON(mp->mp_aheight < 1); BUG_ON(dibh == NULL); + BUG_ON(dblks < 1); gfs2_trans_add_meta(ip->i_gl, dibh); down_write(&ip->i_rw_mutex); if (mp->mp_fheight == mp->mp_aheight) { - struct buffer_head *bh; - int eob; - - /* Bottom indirect block exists, find unalloced extent size */ - ptr = metapointer(end_of_metadata, mp); - bh = mp->mp_bh[end_of_metadata]; - dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, - maxlen, &eob); - BUG_ON(dblks < 1); + /* Bottom indirect block exists */ state = ALLOC_DATA; } else { /* Need to allocate indirect blocks */ - ptrs_per_blk = mp->mp_fheight > 1 ? sdp->sd_inptrs : - sdp->sd_diptrs; - dblks = min(maxlen, (size_t)(ptrs_per_blk - - mp->mp_list[end_of_metadata])); if (mp->mp_fheight == ip->i_height) { /* Writing into existing tree, extend tree down */ iblks = mp->mp_fheight - mp->mp_aheight; @@ -762,6 +755,50 @@ static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, #define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE +/** + * gfs2_alloc_size - Compute the maximum allocation size + * @inode: The inode + * @mp: The metapath + * @size: Requested size in blocks + * + * Compute the maximum size of the next allocation at @mp. + * + * Returns: size in blocks + */ +static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + const __be64 *first, *ptr, *end; + + /* + * For writes to stuffed files, this function is called twice via + * gfs2_iomap_get, before and after unstuffing. The size we return the + * first time needs to be large enough to get the reservation and + * allocation sizes right. The size we return the second time must + * be exact or else gfs2_iomap_alloc won't do the right thing. + */ + + if (gfs2_is_stuffed(ip) || mp->mp_fheight != mp->mp_aheight) { + unsigned int maxsize = mp->mp_fheight > 1 ? + sdp->sd_inptrs : sdp->sd_diptrs; + maxsize -= mp->mp_list[mp->mp_fheight - 1]; + if (size > maxsize) + size = maxsize; + return size; + } + + first = metapointer(ip->i_height - 1, mp); + end = metaend(ip->i_height - 1, mp); + if (end - first > size) + end = first + size; + for (ptr = first; ptr < end; ptr++) { + if (*ptr) + break; + } + return ptr - first; +} + /** * gfs2_iomap_get - Map blocks from an inode to disk blocks * @inode: The inode @@ -797,6 +834,7 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, ret = gfs2_meta_inode_buffer(ip, &dibh); if (ret) goto unlock; + iomap->private = dibh; if (gfs2_is_stuffed(ip)) { if (flags & IOMAP_WRITE) { @@ -822,6 +860,7 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, iomap->addr = (ip->i_no_addr << inode->i_blkbits) + sizeof(struct gfs2_dinode); iomap->type = IOMAP_INLINE; + iomap->inline_data = dibh->b_data + sizeof(struct gfs2_dinode); goto out; } @@ -867,7 +906,7 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, iomap->bdev = inode->i_sb->s_bdev; unlock: up_read(&ip->i_rw_mutex); - if (dibh) + if (ret && dibh) brelse(dibh); return ret; @@ -881,13 +920,168 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, ret = gfs2_hole_size(inode, lblock, len, mp, iomap); else iomap->length = size - pos; - } else if (!(flags & IOMAP_WRITE)) { + } else if (flags & IOMAP_WRITE) { + u64 alloc_size; + + len = gfs2_alloc_size(inode, mp, len); + alloc_size = len << inode->i_blkbits; + if (alloc_size < iomap->length) + iomap->length = alloc_size; + } else { if (pos < size && height == ip->i_height) ret = gfs2_hole_size(inode, lblock, len, mp, iomap); } goto out; } +static int gfs2_write_lock(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + int error; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); + error = gfs2_glock_nq(&ip->i_gh); + if (error) + goto out_uninit; + if (&ip->i_inode == sdp->sd_rindex) { + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, &m_ip->i_gh); + if (error) + goto out_unlock; + } + return 0; + +out_unlock: + gfs2_glock_dq(&ip->i_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_gh); + return error; +} + +static void gfs2_write_unlock(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + + if (&ip->i_inode == sdp->sd_rindex) { + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + + gfs2_glock_dq_uninit(&m_ip->i_gh); + } + gfs2_glock_dq_uninit(&ip->i_gh); +} + +static void gfs2_iomap_journaled_page_done(struct inode *inode, loff_t pos, + unsigned copied, struct page *page, + struct iomap *iomap) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied); +} + +static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, + loff_t length, unsigned flags, + struct iomap *iomap) +{ + struct metapath mp = { .mp_aheight = 1, }; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; + bool unstuff, alloc_required; + int ret; + + ret = gfs2_write_lock(inode); + if (ret) + return ret; + + unstuff = gfs2_is_stuffed(ip) && + pos + length > gfs2_max_stuffed_size(ip); + + ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp); + if (ret) + goto out_release; + + alloc_required = unstuff || iomap->type == IOMAP_HOLE; + + if (alloc_required || gfs2_is_jdata(ip)) + gfs2_write_calc_reserv(ip, iomap->length, &data_blocks, + &ind_blocks); + + if (alloc_required) { + struct gfs2_alloc_parms ap = { + .target = data_blocks + ind_blocks + }; + + ret = gfs2_quota_lock_check(ip, &ap); + if (ret) + goto out_release; + + ret = gfs2_inplace_reserve(ip, &ap); + if (ret) + goto out_qunlock; + } + + rblocks = RES_DINODE + ind_blocks; + if (gfs2_is_jdata(ip)) + rblocks += data_blocks; + if (ind_blocks || data_blocks) + rblocks += RES_STATFS + RES_QUOTA; + if (inode == sdp->sd_rindex) + rblocks += 2 * RES_STATFS; + if (alloc_required) + rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks); + + ret = gfs2_trans_begin(sdp, rblocks, iomap->length >> inode->i_blkbits); + if (ret) + goto out_trans_fail; + + if (unstuff) { + ret = gfs2_unstuff_dinode(ip, NULL); + if (ret) + goto out_trans_end; + release_metapath(&mp); + brelse(iomap->private); + iomap->private = NULL; + ret = gfs2_iomap_get(inode, iomap->offset, iomap->length, + flags, iomap, &mp); + if (ret) + goto out_trans_end; + } + + if (iomap->type == IOMAP_HOLE) { + ret = gfs2_iomap_alloc(inode, iomap, flags, &mp); + if (ret) { + gfs2_trans_end(sdp); + gfs2_inplace_release(ip); + punch_hole(ip, iomap->offset, iomap->length); + goto out_qunlock; + } + } + release_metapath(&mp); + if (gfs2_is_jdata(ip)) + iomap->page_done = gfs2_iomap_journaled_page_done; + return 0; + +out_trans_end: + gfs2_trans_end(sdp); +out_trans_fail: + if (alloc_required) + gfs2_inplace_release(ip); +out_qunlock: + if (alloc_required) + gfs2_quota_unlock(ip); +out_release: + if (iomap->private) + brelse(iomap->private); + release_metapath(&mp); + gfs2_write_unlock(inode); + return ret; +} + static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, unsigned flags, struct iomap *iomap) { @@ -897,10 +1091,7 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, trace_gfs2_iomap_start(ip, pos, length, flags); if (flags & IOMAP_WRITE) { - ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp); - if (!ret && iomap->type == IOMAP_HOLE) - ret = gfs2_iomap_alloc(inode, iomap, flags, &mp); - release_metapath(&mp); + ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap); } else { ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp); release_metapath(&mp); @@ -909,8 +1100,59 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, return ret; } +static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned flags, struct iomap *iomap) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_trans *tr = current->journal_info; + struct buffer_head *dibh = iomap->private; + + if (!(flags & IOMAP_WRITE)) + goto out; + + if (iomap->type != IOMAP_INLINE) { + gfs2_ordered_add_inode(ip); + + if (tr->tr_num_buf_new) + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + else + gfs2_trans_add_meta(ip->i_gl, dibh); + } + + if (inode == sdp->sd_rindex) { + adjust_fs_space(inode); + sdp->sd_rindex_uptodate = 0; + } + + gfs2_trans_end(sdp); + gfs2_inplace_release(ip); + + if (length != written && (iomap->flags & IOMAP_F_NEW)) { + /* Deallocate blocks that were just allocated. */ + loff_t blockmask = i_blocksize(inode) - 1; + loff_t end = (pos + length) & ~blockmask; + + pos = (pos + written + blockmask) & ~blockmask; + if (pos < end) { + truncate_pagecache_range(inode, pos, end - 1); + punch_hole(ip, pos, end - pos); + } + } + + if (ip->i_qadata && ip->i_qadata->qa_qd_num) + gfs2_quota_unlock(ip); + gfs2_write_unlock(inode); + +out: + if (dibh) + brelse(dibh); + return 0; +} + const struct iomap_ops gfs2_iomap_ops = { .iomap_begin = gfs2_iomap_begin, + .iomap_end = gfs2_iomap_end, }; /** diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 6f6bbfbff13d3..16dd395479a54 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -26,10 +26,12 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" #include "bmap.h" +#include "aops.h" #include "dir.h" #include "glock.h" #include "glops.h" @@ -691,9 +693,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, /** * gfs2_file_write_iter - Perform a write to a file * @iocb: The io context - * @iov: The data to write - * @nr_segs: Number of @iov segments - * @pos: The file position + * @from: The data to write * * We have to do a lock/unlock here to refresh the inode size for * O_APPEND writes, otherwise we can land up writing at the wrong @@ -705,8 +705,9 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct gfs2_inode *ip = GFS2_I(file_inode(file)); - int ret; + struct inode *inode = file_inode(file); + struct gfs2_inode *ip = GFS2_I(inode); + ssize_t ret; ret = gfs2_rsqa_alloc(ip); if (ret) @@ -723,7 +724,38 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) gfs2_glock_dq_uninit(&gh); } - return generic_file_write_iter(iocb, from); + if (iocb->ki_flags & IOCB_DIRECT) + return generic_file_write_iter(iocb, from); + + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = inode_to_bdi(inode); + + ret = file_remove_privs(file); + if (ret) + goto out2; + + ret = file_update_time(file); + if (ret) + goto out2; + + ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); + +out2: + current->backing_dev_info = NULL; +out: + inode_unlock(inode); + if (likely(ret > 0)) { + iocb->ki_pos += ret; + + /* Handle various SYNC-type writes */ + ret = generic_write_sync(iocb, ret); + } + return ret; } static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, From bcfe94139a45fae128844558d6e27a0258860a90 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 11 May 2018 17:44:19 +0100 Subject: [PATCH 07/25] gfs2: gfs2_extent_length cleanup Now that gfs2_extent_length is no longer used for determining the size of a hole and always with an upper size limit, the function can be simplified. Signed-off-by: Andreas Gruenbacher Reviewed-by: Bob Peterson --- fs/gfs2/bmap.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 9a699c0a5df17..8b5876e19ecf7 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -406,22 +406,17 @@ static void release_metapath(struct metapath *mp) /** * gfs2_extent_length - Returns length of an extent of blocks - * @start: Start of the buffer - * @len: Length of the buffer in bytes - * @ptr: Current position in the buffer - * @limit: Max extent length to return (0 = unlimited) + * @bh: The metadata block + * @ptr: Current position in @bh + * @limit: Max extent length to return * @eob: Set to 1 if we hit "end of block" * - * If the first block is zero (unallocated) it will return the number of - * unallocated blocks in the extent, otherwise it will return the number - * of contiguous blocks in the extent. - * * Returns: The length of the extent (minimum of one block) */ -static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob) +static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, size_t limit, int *eob) { - const __be64 *end = (start + len); + const __be64 *end = (__be64 *)(bh->b_data + bh->b_size); const __be64 *first = ptr; u64 d = be64_to_cpu(*ptr); @@ -430,14 +425,11 @@ static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __b ptr++; if (ptr >= end) break; - if (limit && --limit == 0) - break; - if (d) - d++; + d++; } while(be64_to_cpu(*ptr) == d); if (ptr >= end) *eob = 1; - return (ptr - first); + return ptr - first; } typedef const __be64 *(*gfs2_metadata_walker)( @@ -893,7 +885,7 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, goto do_alloc; bh = mp->mp_bh[ip->i_height - 1]; - len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, len, &eob); + len = gfs2_extent_length(bh, ptr, len, &eob); iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits; iomap->length = len << inode->i_blkbits; From 967bcc91b044936e85dbb5848952dc1335a846f4 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 19 Jun 2018 15:08:02 +0100 Subject: [PATCH 08/25] gfs2: iomap direct I/O support The page unmapping previously done in gfs2_direct_IO is now done generically in iomap_dio_rw. Signed-off-by: Andreas Gruenbacher Reviewed-by: Bob Peterson --- fs/gfs2/aops.c | 100 +------------------------------------ fs/gfs2/bmap.c | 14 +++++- fs/gfs2/file.c | 132 +++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 136 insertions(+), 110 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index ecfbca9c88ff8..1054cc4a96db9 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -84,12 +84,6 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock, return 0; } -static int gfs2_get_block_direct(struct inode *inode, sector_t lblock, - struct buffer_head *bh_result, int create) -{ - return gfs2_block_map(inode, lblock, bh_result, 0); -} - /** * gfs2_writepage_common - Common bits of writepage * @page: The page to be written @@ -1024,96 +1018,6 @@ static void gfs2_invalidatepage(struct page *page, unsigned int offset, try_to_release_page(page, 0); } -/** - * gfs2_ok_for_dio - check that dio is valid on this file - * @ip: The inode - * @offset: The offset at which we are reading or writing - * - * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o) - * 1 (to accept the i/o request) - */ -static int gfs2_ok_for_dio(struct gfs2_inode *ip, loff_t offset) -{ - /* - * Should we return an error here? I can't see that O_DIRECT for - * a stuffed file makes any sense. For now we'll silently fall - * back to buffered I/O - */ - if (gfs2_is_stuffed(ip)) - return 0; - - if (offset >= i_size_read(&ip->i_inode)) - return 0; - return 1; -} - - - -static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct address_space *mapping = inode->i_mapping; - struct gfs2_inode *ip = GFS2_I(inode); - loff_t offset = iocb->ki_pos; - struct gfs2_holder gh; - int rv; - - /* - * Deferred lock, even if its a write, since we do no allocation - * on this path. All we need change is atime, and this lock mode - * ensures that other nodes have flushed their buffered read caches - * (i.e. their page cache entries for this inode). We do not, - * unfortunately have the option of only flushing a range like - * the VFS does. - */ - gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh); - rv = gfs2_glock_nq(&gh); - if (rv) - goto out_uninit; - rv = gfs2_ok_for_dio(ip, offset); - if (rv != 1) - goto out; /* dio not valid, fall back to buffered i/o */ - - /* - * Now since we are holding a deferred (CW) lock at this point, you - * might be wondering why this is ever needed. There is a case however - * where we've granted a deferred local lock against a cached exclusive - * glock. That is ok provided all granted local locks are deferred, but - * it also means that it is possible to encounter pages which are - * cached and possibly also mapped. So here we check for that and sort - * them out ahead of the dio. The glock state machine will take care of - * everything else. - * - * If in fact the cached glock state (gl->gl_state) is deferred (CW) in - * the first place, mapping->nr_pages will always be zero. - */ - if (mapping->nrpages) { - loff_t lstart = offset & ~(PAGE_SIZE - 1); - loff_t len = iov_iter_count(iter); - loff_t end = PAGE_ALIGN(offset + len) - 1; - - rv = 0; - if (len == 0) - goto out; - if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) - unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len); - rv = filemap_write_and_wait_range(mapping, lstart, end); - if (rv) - goto out; - if (iov_iter_rw(iter) == WRITE) - truncate_inode_pages_range(mapping, lstart, end); - } - - rv = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, - gfs2_get_block_direct, NULL, NULL, 0); -out: - gfs2_glock_dq(&gh); -out_uninit: - gfs2_holder_uninit(&gh); - return rv; -} - /** * gfs2_releasepage - free the metadata associated with a page * @page: the page that's being released @@ -1194,7 +1098,7 @@ static const struct address_space_operations gfs2_writeback_aops = { .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, .releasepage = gfs2_releasepage, - .direct_IO = gfs2_direct_IO, + .direct_IO = noop_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, @@ -1211,7 +1115,7 @@ static const struct address_space_operations gfs2_ordered_aops = { .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, .releasepage = gfs2_releasepage, - .direct_IO = gfs2_direct_IO, + .direct_IO = noop_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 8b5876e19ecf7..29391090d5b79 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -915,6 +915,9 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, } else if (flags & IOMAP_WRITE) { u64 alloc_size; + if (flags & IOMAP_DIRECT) + goto out; /* (see gfs2_file_direct_write) */ + len = gfs2_alloc_size(inode, mp, len); alloc_size = len << inode->i_blkbits; if (alloc_size < iomap->length) @@ -1082,11 +1085,18 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, int ret; trace_gfs2_iomap_start(ip, pos, length, flags); - if (flags & IOMAP_WRITE) { + if ((flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT)) { ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap); } else { ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp); release_metapath(&mp); + /* + * Silently fall back to buffered I/O for stuffed files or if + * we've hot a hole (see gfs2_file_direct_write). + */ + if ((flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT) && + iomap->type != IOMAP_MAPPED) + ret = -ENOTBLK; } trace_gfs2_iomap_end(ip, iomap, ret); return ret; @@ -1100,7 +1110,7 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length, struct gfs2_trans *tr = current->journal_info; struct buffer_head *dibh = iomap->private; - if (!(flags & IOMAP_WRITE)) + if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != IOMAP_WRITE) goto out; if (iomap->type != IOMAP_INLINE) { diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 16dd395479a54..89280515169e9 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -690,6 +690,85 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, return ret ? ret : ret1; } +static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + size_t count = iov_iter_count(to); + struct gfs2_holder gh; + ssize_t ret; + + if (!count) + return 0; /* skip atime */ + + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret) + goto out_uninit; + + /* fall back to buffered I/O for stuffed files */ + ret = -ENOTBLK; + if (gfs2_is_stuffed(ip)) + goto out; + + ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL); + +out: + gfs2_glock_dq(&gh); +out_uninit: + gfs2_holder_uninit(&gh); + return ret; +} + +static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + size_t len = iov_iter_count(from); + loff_t offset = iocb->ki_pos; + struct gfs2_holder gh; + ssize_t ret; + + /* + * Deferred lock, even if its a write, since we do no allocation on + * this path. All we need to change is the atime, and this lock mode + * ensures that other nodes have flushed their buffered read caches + * (i.e. their page cache entries for this inode). We do not, + * unfortunately, have the option of only flushing a range like the + * VFS does. + */ + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret) + goto out_uninit; + + /* Silently fall back to buffered I/O when writing beyond EOF */ + if (offset + len > i_size_read(&ip->i_inode)) + goto out; + + ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL); + +out: + gfs2_glock_dq(&gh); +out_uninit: + gfs2_holder_uninit(&gh); + return ret; +} + +static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + ssize_t ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = gfs2_file_direct_read(iocb, to); + if (likely(ret != -ENOTBLK)) + return ret; + iocb->ki_flags &= ~IOCB_DIRECT; + } + return generic_file_read_iter(iocb, to); +} + /** * gfs2_file_write_iter - Perform a write to a file * @iocb: The io context @@ -707,7 +786,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct gfs2_inode *ip = GFS2_I(inode); - ssize_t ret; + ssize_t written = 0, ret; ret = gfs2_rsqa_alloc(ip); if (ret) @@ -724,9 +803,6 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) gfs2_glock_dq_uninit(&gh); } - if (iocb->ki_flags & IOCB_DIRECT) - return generic_file_write_iter(iocb, from); - inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret <= 0) @@ -743,19 +819,55 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret) goto out2; - ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); + if (iocb->ki_flags & IOCB_DIRECT) { + struct address_space *mapping = file->f_mapping; + loff_t pos, endbyte; + ssize_t buffered; + + written = gfs2_file_direct_write(iocb, from); + if (written < 0 || !iov_iter_count(from)) + goto out2; + + ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); + if (unlikely(ret < 0)) + goto out2; + buffered = ret; + + /* + * We need to ensure that the page cache pages are written to + * disk and invalidated to preserve the expected O_DIRECT + * semantics. + */ + pos = iocb->ki_pos; + endbyte = pos + buffered - 1; + ret = filemap_write_and_wait_range(mapping, pos, endbyte); + if (!ret) { + iocb->ki_pos += buffered; + written += buffered; + invalidate_mapping_pages(mapping, + pos >> PAGE_SHIFT, + endbyte >> PAGE_SHIFT); + } else { + /* + * We don't know how much we wrote, so just return + * the number of bytes which were direct-written + */ + } + } else { + ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); + if (likely(ret > 0)) + iocb->ki_pos += ret; + } out2: current->backing_dev_info = NULL; out: inode_unlock(inode); if (likely(ret > 0)) { - iocb->ki_pos += ret; - /* Handle various SYNC-type writes */ ret = generic_write_sync(iocb, ret); } - return ret; + return written ? written : ret; } static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, @@ -1157,7 +1269,7 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) const struct file_operations gfs2_file_fops = { .llseek = gfs2_llseek, - .read_iter = generic_file_read_iter, + .read_iter = gfs2_file_read_iter, .write_iter = gfs2_file_write_iter, .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, @@ -1187,7 +1299,7 @@ const struct file_operations gfs2_dir_fops = { const struct file_operations gfs2_file_fops_nolock = { .llseek = gfs2_llseek, - .read_iter = generic_file_read_iter, + .read_iter = gfs2_file_read_iter, .write_iter = gfs2_file_write_iter, .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, From 025d0e7f73c6a9cc3ca2fe7de821792a8f3269bf Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 19 Mar 2018 23:10:52 +0000 Subject: [PATCH 09/25] gfs2: Remove gfs2_write_{begin,end} Now that generic_file_write_iter is no longer used, there are no remaining users of these address space operations. Signed-off-by: Andreas Gruenbacher Reviewed-by: Bob Peterson --- fs/gfs2/aops.c | 213 ------------------------------------------------- 1 file changed, 213 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 1054cc4a96db9..cc80fd71f3dd5 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -639,135 +639,6 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping, return ret; } -/** - * gfs2_write_begin - Begin to write to a file - * @file: The file to write to - * @mapping: The mapping in which to write - * @pos: The file offset at which to start writing - * @len: Length of the write - * @flags: Various flags - * @pagep: Pointer to return the page - * @fsdata: Pointer to return fs data (unused by GFS2) - * - * Returns: errno - */ - -static int gfs2_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - struct gfs2_inode *ip = GFS2_I(mapping->host); - struct gfs2_sbd *sdp = GFS2_SB(mapping->host); - struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); - unsigned int data_blocks = 0, ind_blocks = 0, rblocks; - unsigned requested = 0; - int alloc_required; - int error = 0; - pgoff_t index = pos >> PAGE_SHIFT; - unsigned from = pos & (PAGE_SIZE - 1); - struct page *page; - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); - error = gfs2_glock_nq(&ip->i_gh); - if (unlikely(error)) - goto out_uninit; - if (&ip->i_inode == sdp->sd_rindex) { - error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, - GL_NOCACHE, &m_ip->i_gh); - if (unlikely(error)) { - gfs2_glock_dq(&ip->i_gh); - goto out_uninit; - } - } - - alloc_required = gfs2_write_alloc_required(ip, pos, len); - - if (alloc_required || gfs2_is_jdata(ip)) - gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); - - if (alloc_required) { - struct gfs2_alloc_parms ap = { .aflags = 0, }; - requested = data_blocks + ind_blocks; - ap.target = requested; - error = gfs2_quota_lock_check(ip, &ap); - if (error) - goto out_unlock; - - error = gfs2_inplace_reserve(ip, &ap); - if (error) - goto out_qunlock; - } - - rblocks = RES_DINODE + ind_blocks; - if (gfs2_is_jdata(ip)) - rblocks += data_blocks ? data_blocks : 1; - if (ind_blocks || data_blocks) - rblocks += RES_STATFS + RES_QUOTA; - if (&ip->i_inode == sdp->sd_rindex) - rblocks += 2 * RES_STATFS; - if (alloc_required) - rblocks += gfs2_rg_blocks(ip, requested); - - error = gfs2_trans_begin(sdp, rblocks, - PAGE_SIZE/sdp->sd_sb.sb_bsize); - if (error) - goto out_trans_fail; - - error = -ENOMEM; - flags |= AOP_FLAG_NOFS; - page = grab_cache_page_write_begin(mapping, index, flags); - *pagep = page; - if (unlikely(!page)) - goto out_endtrans; - - if (gfs2_is_stuffed(ip)) { - error = 0; - if (pos + len > gfs2_max_stuffed_size(ip)) { - error = gfs2_unstuff_dinode(ip, page); - if (error == 0) - goto prepare_write; - } else if (!PageUptodate(page)) { - error = stuffed_readpage(ip, page); - } - goto out; - } - -prepare_write: - error = __block_write_begin(page, from, len, gfs2_block_map); -out: - if (error == 0) - return 0; - - unlock_page(page); - put_page(page); - - gfs2_trans_end(sdp); - if (alloc_required) { - gfs2_inplace_release(ip); - if (pos + len > ip->i_inode.i_size) - gfs2_trim_blocks(&ip->i_inode); - } - goto out_qunlock; - -out_endtrans: - gfs2_trans_end(sdp); -out_trans_fail: - if (alloc_required) - gfs2_inplace_release(ip); -out_qunlock: - if (alloc_required) - gfs2_quota_unlock(ip); -out_unlock: - if (&ip->i_inode == sdp->sd_rindex) { - gfs2_glock_dq(&m_ip->i_gh); - gfs2_holder_uninit(&m_ip->i_gh); - } - gfs2_glock_dq(&ip->i_gh); -out_uninit: - gfs2_holder_uninit(&ip->i_gh); - return error; -} - /** * adjust_fs_space - Adjusts the free space available due to gfs2_grow * @inode: the rindex inode @@ -848,84 +719,6 @@ int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, return copied; } -/** - * gfs2_write_end - * @file: The file to write to - * @mapping: The address space to write to - * @pos: The file position - * @len: The length of the data - * @copied: How much was actually copied by the VFS - * @page: The page that has been written - * @fsdata: The fsdata (unused in GFS2) - * - * The main write_end function for GFS2. We just put our locking around the VFS - * provided functions. - * - * Returns: copied bytes or errno - */ - -static int gfs2_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = page->mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); - struct buffer_head *dibh; - int ret; - struct gfs2_trans *tr = current->journal_info; - BUG_ON(!tr); - - BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL); - - ret = gfs2_meta_inode_buffer(ip, &dibh); - if (unlikely(ret)) - goto out; - - if (gfs2_is_stuffed(ip)) { - ret = gfs2_stuffed_write_end(inode, dibh, pos, copied, page); - page = NULL; - goto out2; - } - - if (gfs2_is_jdata(ip)) - gfs2_page_add_databufs(ip, page, pos & ~PAGE_MASK, len); - else - gfs2_ordered_add_inode(ip); - - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - page = NULL; - if (tr->tr_num_buf_new) - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - else - gfs2_trans_add_meta(ip->i_gl, dibh); - -out2: - if (inode == sdp->sd_rindex) { - adjust_fs_space(inode); - sdp->sd_rindex_uptodate = 0; - } - - brelse(dibh); -out: - if (page) { - unlock_page(page); - put_page(page); - } - gfs2_trans_end(sdp); - gfs2_inplace_release(ip); - if (ip->i_qadata && ip->i_qadata->qa_qd_num) - gfs2_quota_unlock(ip); - if (inode == sdp->sd_rindex) { - gfs2_glock_dq(&m_ip->i_gh); - gfs2_holder_uninit(&m_ip->i_gh); - } - gfs2_glock_dq(&ip->i_gh); - gfs2_holder_uninit(&ip->i_gh); - return ret; -} - /** * jdata_set_page_dirty - Page dirtying function * @page: The page to dirty @@ -1093,8 +886,6 @@ static const struct address_space_operations gfs2_writeback_aops = { .writepages = gfs2_writepages, .readpage = gfs2_readpage, .readpages = gfs2_readpages, - .write_begin = gfs2_write_begin, - .write_end = gfs2_write_end, .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, .releasepage = gfs2_releasepage, @@ -1109,8 +900,6 @@ static const struct address_space_operations gfs2_ordered_aops = { .writepages = gfs2_writepages, .readpage = gfs2_readpage, .readpages = gfs2_readpages, - .write_begin = gfs2_write_begin, - .write_end = gfs2_write_end, .set_page_dirty = __set_page_dirty_buffers, .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, @@ -1126,8 +915,6 @@ static const struct address_space_operations gfs2_jdata_aops = { .writepages = gfs2_jdata_writepages, .readpage = gfs2_readpage, .readpages = gfs2_readpages, - .write_begin = gfs2_write_begin, - .write_end = gfs2_write_end, .set_page_dirty = jdata_set_page_dirty, .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, From 03f8c41c73da849ec2b73aa678ce6380e8318920 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 21 Jun 2018 07:22:12 -0500 Subject: [PATCH 10/25] gfs2: Stop messing with ip->i_rgd in the rlist code In the resource group list code, keep the last resource group added in the last position in the array. Check against that instead of messing with ip->i_rgd. Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson --- fs/gfs2/rgrp.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 7a001f6e8aee7..ecdc4cb5b6ad1 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -2559,19 +2559,35 @@ void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, if (gfs2_assert_warn(sdp, !rlist->rl_ghs)) return; - if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block)) - rgd = ip->i_rgd; - else + /* + * The resource group last accessed is kept in the last position. + */ + + if (rlist->rl_rgrps) { + rgd = rlist->rl_rgd[rlist->rl_rgrps - 1]; + if (rgrp_contains_block(rgd, block)) + return; rgd = gfs2_blk2rgrpd(sdp, block, 1); + } else { + rgd = ip->i_rgd; + if (!rgd || !rgrp_contains_block(rgd, block)) + rgd = gfs2_blk2rgrpd(sdp, block, 1); + } + if (!rgd) { - fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block); + fs_err(sdp, "rlist_add: no rgrp for block %llu\n", + (unsigned long long)block); return; } ip->i_rgd = rgd; - for (x = 0; x < rlist->rl_rgrps; x++) - if (rlist->rl_rgd[x] == rgd) + for (x = 0; x < rlist->rl_rgrps; x++) { + if (rlist->rl_rgd[x] == rgd) { + swap(rlist->rl_rgd[x], + rlist->rl_rgd[rlist->rl_rgrps - 1]); return; + } + } if (rlist->rl_rgrps == rlist->rl_space) { new_space = rlist->rl_space + 10; From b7eba890a228f591fea2889b901267ba5de7839b Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 21 Jun 2018 07:42:37 -0500 Subject: [PATCH 11/25] gfs2: Eliminate redundant ip->i_rgd GFS2 remembers the last rgrp used for allocations in ip->i_rgd. However, block allocations are made by way of a reservations structure, ip->i_res, which keeps the last rgrp in ip->i_res.rs_rgd, and ip->i_res is kept in sync with ip->i_res.rs_rgd, so it's redundant. Get rid of ip->i_rgd and just use ip->i_res.rs_rgd in its place. Based on patches by Robert Peterson. Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson --- fs/gfs2/incore.h | 1 - fs/gfs2/rgrp.c | 13 ++++++------- fs/gfs2/super.c | 1 - fs/gfs2/trans.h | 6 ++++-- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index e9cd2cc292d33..b50908211b69b 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -397,7 +397,6 @@ struct gfs2_inode { struct gfs2_holder i_gh; /* for prepare/commit_write only */ struct gfs2_qadata *i_qadata; /* quota allocation data */ struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */ - struct gfs2_rgrpd *i_rgd; u64 i_goal; /* goal block for allocations */ struct rw_semaphore i_rw_mutex; struct list_head i_ordered; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index ecdc4cb5b6ad1..60c86532782ed 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1992,8 +1992,9 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) return -EINVAL; if (gfs2_rs_active(rs)) { begin = rs->rs_rbm.rgd; - } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) { - rs->rs_rbm.rgd = begin = ip->i_rgd; + } else if (rs->rs_rbm.rgd && + rgrp_contains_block(rs->rs_rbm.rgd, ip->i_goal)) { + begin = rs->rs_rbm.rgd; } else { check_and_update_goal(ip); rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); @@ -2057,8 +2058,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) if (rs->rs_rbm.rgd->rd_free_clone >= ap->target || (loops == 2 && ap->min_target && rs->rs_rbm.rgd->rd_free_clone >= ap->min_target)) { - ip->i_rgd = rs->rs_rbm.rgd; - ap->allowed = ip->i_rgd->rd_free_clone; + ap->allowed = rs->rs_rbm.rgd->rd_free_clone; return 0; } check_rgrp: @@ -2336,7 +2336,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *dibh; - struct gfs2_rbm rbm = { .rgd = ip->i_rgd, }; + struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rbm.rgd, }; unsigned int ndata; u64 block; /* block, within the file system scope */ int error; @@ -2569,7 +2569,7 @@ void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, return; rgd = gfs2_blk2rgrpd(sdp, block, 1); } else { - rgd = ip->i_rgd; + rgd = ip->i_res.rs_rbm.rgd; if (!rgd || !rgrp_contains_block(rgd, block)) rgd = gfs2_blk2rgrpd(sdp, block, 1); } @@ -2579,7 +2579,6 @@ void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, (unsigned long long)block); return; } - ip->i_rgd = rgd; for (x = 0; x < rlist->rl_rgrps; x++) { if (rlist->rl_rgd[x] == rgd) { diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index cf5c7f3080d24..685dc6fff5edd 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1729,7 +1729,6 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) if (ip) { ip->i_flags = 0; ip->i_gl = NULL; - ip->i_rgd = NULL; memset(&ip->i_res, 0, sizeof(ip->i_res)); RB_CLEAR_NODE(&ip->i_res.rs_node); ip->i_rahead = 0; diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index 1e6e7da25a172..ad70087d05971 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -30,9 +30,11 @@ struct gfs2_glock; * block, or all of the blocks in the rg, whichever is smaller */ static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip, unsigned requested) { - if (requested < ip->i_rgd->rd_length) + struct gfs2_rgrpd *rgd = ip->i_res.rs_rbm.rgd; + + if (requested < rgd->rd_length) return requested + 1; - return ip->i_rgd->rd_length; + return rgd->rd_length; } extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, From e79e0e1428188b24c3b57309ffa54a33c4ae40c4 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Mon, 18 Jun 2018 13:24:13 -0500 Subject: [PATCH 12/25] gfs2: Don't reject a supposedly full bitmap if we have blocks reserved Before this patch, you could get into situations like this: 1. Process 1 searches for X free blocks, finds them, makes a reservation 2. Process 2 searches for free blocks in the same rgrp, but now the bitmap is full because process 1's reservation is skipped over. So it marks the bitmap as GBF_FULL. 3. Process 1 tries to allocate blocks from its own reservation, but since the GBF_FULL bit is set, it skips over the rgrp and searches elsewhere, thus not using its own reservation. This patch adds an additional check to allow processes to use their own reservations. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/rgrp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 60c86532782ed..bce75f25e53a0 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1687,7 +1687,8 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, while(1) { bi = rbm_bi(rbm); - if (test_bit(GBF_FULL, &bi->bi_flags) && + if ((ip == NULL || !gfs2_rs_active(&ip->i_res)) && + test_bit(GBF_FULL, &bi->bi_flags) && (state == GFS2_BLKST_FREE)) goto next_bitmap; From 910f3d58d0d40691534b77cc01588ffa22ee7dee Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Fri, 22 Jun 2018 09:51:43 +0800 Subject: [PATCH 13/25] gfs2: using posix_acl_xattr_size instead of posix_acl_to_xattr It seems better to get size by calling posix_acl_xattr_size() instead of calling posix_acl_to_xattr() with NULL buffer argument. posix_acl_xattr_size() never returns 0, so remove the unnecessary check. Signed-off-by: Chengguang Xu Signed-off-by: Andreas Gruenbacher --- fs/gfs2/acl.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 776717f1eeea4..af5f87a493d9b 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -82,14 +82,12 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type) int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int error; - int len; + size_t len; char *data; const char *name = gfs2_acl_name(type); if (acl) { - len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); - if (len == 0) - return 0; + len = posix_acl_xattr_size(acl->a_count); data = kmalloc(len, GFP_NOFS); if (data == NULL) return -ENOMEM; From 109dbb1e6f27fb8f80ee61953485c7c3b1717951 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Mon, 2 Jul 2018 22:16:13 +0530 Subject: [PATCH 14/25] fs: gfs2: Adding new return type vm_fault_t Use new return type vm_fault_t for gfs2_page_mkwrite handler. see commit 1c8f422059ae ("mm: change return type to vm_fault_t") for reference. Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 7137db7b0119d..8cb278ee9a0ea 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -387,7 +387,7 @@ static int gfs2_allocate_page_backing(struct page *page) * blocks allocated on disk to back that page. */ -static int gfs2_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); From 4a7727725dc7d73769c5ab24c566df454093285f Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 5 Jul 2018 14:40:46 -0500 Subject: [PATCH 15/25] GFS2: Fix recovery issues for spectators This patch fixes a couple problems dealing with spectators who remain with gfs2 mounts after the last non-spectator node fails. Before this patch, spectator mounts would try to acquire the dlm's mounted lock EX as part of its normal recovery sequence. The mounted lock is only used to determine whether the node is the first mounter, the first node to mount the file system, for the purposes of file system recovery and journal replay. It's not necessary for spectators: they should never do journal recovery. If they acquire the lock it will prevent another "real" first-mounter from acquiring the lock in EX mode, which means it also cannot do journal recovery because it doesn't think it's the first node to mount the file system. This patch checks if the mounter is a spectator, and if so, avoids grabbing the mounted lock. This allows a secondary mounter who is really the first non-spectator mounter, to do journal recovery: since the spectator doesn't acquire the lock, it can grab it in EX mode, and therefore consider itself to be the first mounter both as a "real" first mount, and as a first-real-after-spectator. Note that the control lock still needs to be taken in PR mode in order to fetch the lvb value so it has the current status of all journal's recovery. This is used as it is today by a first mounter to replay the journals. For spectators, it's merely used to fetch the status bits. All recovery is bypassed and the node waits until recovery is completed by a non-spectator node. I also improved the cryptic message given by control_mount when a spectator is waiting for a non-spectator to perform recovery. It also fixes a problem in gfs2_recover_set whereby spectators were never queueing recovery work for their own journal. They cannot do recovery themselves, but they still need to queue the work so they can check the recovery bits and clear the DFL_BLOCK_LOCKS bit once the recovery happens on another node. When the work queue runs on a spectator, it bypasses most of the work so it won't print a bunch of annoying messages. All it will print is a bunch of messages that look like this until recovery completes on the non-spectator node: GFS2: fsid=mycluster:scratch.s: recover generation 3 jid 0 GFS2: fsid=mycluster:scratch.s: recover jid 0 result busy These continue every 1.5 seconds until the recovery is done by the non-spectator, at which time it says: GFS2: fsid=mycluster:scratch.s: recover generation 4 done Then it proceeds with its mount. If the file system is mounted in spectator node and the last remaining non-spectator is fenced, any IO to the file system is blocked by dlm and the spectator waits until recovery is performed by a non-spectator. If a spectator tries to mount the file system before any non-spectators, it blocks and repeatedly gives this kernel message: GFS2: fsid=mycluster:scratch: Recovery is required. Waiting for a non-spectator to mount. GFS2: fsid=mycluster:scratch: Recovery is required. Waiting for a non-spectator to mount. Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/lock_dlm.c | 20 +++++++++++++++++--- fs/gfs2/recovery.c | 7 ++++--- fs/gfs2/sys.c | 11 +++++++++-- 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 006c6164f7595..ac7caa267ed60 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -821,6 +821,13 @@ static int control_mount(struct gfs2_sbd *sdp) goto fail; } + /** + * If we're a spectator, we don't want to take the lock in EX because + * we cannot do the first-mount responsibility it implies: recovery. + */ + if (sdp->sd_args.ar_spectator) + goto locks_done; + error = mounted_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE); if (!error) { mounted_mode = DLM_LOCK_EX; @@ -896,9 +903,16 @@ static int control_mount(struct gfs2_sbd *sdp) if (lvb_gen < mount_gen) { /* wait for mounted nodes to update control_lock lvb to our generation, which might include new recovery bits set */ - fs_info(sdp, "control_mount wait1 block %u start %u mount %u " - "lvb %u flags %lx\n", block_gen, start_gen, mount_gen, - lvb_gen, ls->ls_recover_flags); + if (sdp->sd_args.ar_spectator) { + fs_info(sdp, "Recovery is required. Waiting for a " + "non-spectator to mount.\n"); + msleep_interruptible(1000); + } else { + fs_info(sdp, "control_mount wait1 block %u start %u " + "mount %u lvb %u flags %lx\n", block_gen, + start_gen, mount_gen, lvb_gen, + ls->ls_recover_flags); + } spin_unlock(&ls->ls_recover_spin); goto restart; } diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index d8b622c375ab8..0f501f938d1cf 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -413,12 +413,13 @@ void gfs2_recover_func(struct work_struct *work) ktime_t t_start, t_jlck, t_jhd, t_tlck, t_rep; int ro = 0; unsigned int pass; - int error; + int error = 0; int jlocked = 0; t_start = ktime_get(); - if (sdp->sd_args.ar_spectator || - (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) { + if (sdp->sd_args.ar_spectator) + goto fail; + if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", jd->jd_jid); jlocked = 1; diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index c191fa58a1df0..1787d295834e7 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -429,11 +429,18 @@ int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid) spin_lock(&sdp->sd_jindex_spin); rv = -EBUSY; - if (sdp->sd_jdesc->jd_jid == jid) + /** + * If we're a spectator, we use journal0, but it's not really ours. + * So we need to wait for its recovery too. If we skip it we'd never + * queue work to the recovery workqueue, and so its completion would + * never clear the DFL_BLOCK_LOCKS flag, so all our locks would + * permanently stop working. + */ + if (sdp->sd_jdesc->jd_jid == jid && !sdp->sd_args.ar_spectator) goto out; rv = -ENOENT; list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { - if (jd->jd_jid != jid) + if (jd->jd_jid != jid && !sdp->sd_args.ar_spectator) continue; rv = gfs2_recover_journal(jd, false); break; From c25892827c7996eb19ca2a5b1cf596218122e994 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 6 Jul 2018 23:05:41 +0100 Subject: [PATCH 16/25] gfs2: fallocate_chunk: Always initialize struct iomap In fallocate_chunk, always initialize the iomap before calling gfs2_iomap_get_alloc: future changes could otherwise cause things like iomap.flags to leak across calls. Signed-off-by: Andreas Gruenbacher Reviewed-by: Bob Peterson --- fs/gfs2/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index f3c6d78659b15..6d895d39158ac 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -877,7 +877,6 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, struct gfs2_inode *ip = GFS2_I(inode); loff_t end = offset + len; struct buffer_head *dibh; - struct iomap iomap = { }; int error; error = gfs2_meta_inode_buffer(ip, &dibh); @@ -893,6 +892,8 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, } while (offset < end) { + struct iomap iomap = { }; + error = gfs2_iomap_get_alloc(inode, offset, end - offset, &iomap); if (error) From 1d45bb7f9d2a5cbae1e5d9a5f72adad84db4d318 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 27 Jun 2018 01:59:18 +0100 Subject: [PATCH 17/25] gfs2: Use iomap for stuffed direct I/O reads Remove the fallback code from direct to buffered I/O for stuffed reads. For stuffed writes, we must keep the fallback code: the deferred glock we are holding under direct I/O doesn't allow to write to the inode or change the file size. Signed-off-by: Andreas Gruenbacher Reviewed-by: Bob Peterson --- fs/gfs2/file.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 6d895d39158ac..08369c6cd127a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -706,14 +706,8 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to) if (ret) goto out_uninit; - /* fall back to buffered I/O for stuffed files */ - ret = -ENOTBLK; - if (gfs2_is_stuffed(ip)) - goto out; - ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL); -out: gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); From f95cbb44abf9d6545769147d5abec4770c89872d Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 6 Jun 2018 20:30:38 +0100 Subject: [PATCH 18/25] gfs2: use iomap_readpage for blocksize == PAGE_SIZE We only use iomap_readpage for pages that don't have buffer heads attached yet: iomap_readpage would otherwise read pages from disk that are marked buffer_uptodate() but not PageUptodate(). Those pages may actually contain data more recent than what's on disk. Signed-off-by: Andreas Gruenbacher Reviewed-by: Bob Peterson --- fs/gfs2/aops.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index cc80fd71f3dd5..31e8270d0b266 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -508,9 +508,13 @@ static int __gfs2_readpage(void *file, struct page *page) { struct gfs2_inode *ip = GFS2_I(page->mapping->host); struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + int error; - if (gfs2_is_stuffed(ip)) { + if (i_blocksize(page->mapping->host) == PAGE_SIZE && + !page_has_buffers(page)) { + error = iomap_readpage(page, &gfs2_iomap_ops); + } else if (gfs2_is_stuffed(ip)) { error = stuffed_readpage(ip, page); unlock_page(page); } else { From d1b0cb933c8e638947ea72f3ab4e3dad4325bb96 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 17 Jul 2018 15:59:28 +0100 Subject: [PATCH 19/25] gfs2: remove redundant variable 'moved' Variable 'moved' s being assigned but is never used hence it is redundant and can be removed. This has been the case ever since commit c752666c. Cleans up clang warning: warning: variable 'moved' set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Signed-off-by: Andreas Gruenbacher --- fs/gfs2/dir.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index d97ad89955d14..e37002560c11f 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1011,7 +1011,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) u64 bn, leaf_no; __be64 *lp; u32 index; - int x, moved = 0; + int x; int error; index = name->hash >> (32 - dip->i_depth); @@ -1113,8 +1113,6 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) if (!prev) prev = dent; - - moved = 1; } else { prev = dent; } From f6753df35c32f17b7abf0de37aa52850ca9733c9 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 30 May 2018 14:05:15 -0500 Subject: [PATCH 20/25] GFS2: rgrp free blocks used incorrectly Before this patch, several functions in rgrp.c checked the value of rgd->rd_free_clone. That does not take into account blocks that were reserved by a multi-block reservation. This causes a problem when space gets tight in the file system. For example, when function gfs2_inplace_reserve checks to see if a rgrp has enough blocks to satisfy the request, it can accept a rgrp that it should reject because, although there are enough blocks to satisfy the request _now_, those blocks may be reserved for another running process. A second problem with this occurs when we've reserved the remaining blocks in an rgrp: function rg_mblk_search() can reject an rgrp improperly because it calculates: u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved; But rd_reserved includes blocks that the current process just reserved in its own call to inplace_reserve. For example, it can reserve the last 128 blocks of an rgrp, then reject that same rgrp because the above calculates out to free_blocks = 0; Consequences include, but are not limited to, (1) leaving holes, and thus increasing file system fragmentation, and (2) reporting file system is full long before it actually is. This patch introduces a new function, rgd_free, which returns the number of clone-free blocks (blocks that are truly free as opposed to blocks that are still being used because an unlinked file is still open) minus the number of blocks reserved by processes, but not counting the blocks we ourselves reserved (because obviously we need to allocate them). Signed-off-by: Bob Peterson Signed-off-by: Andreas Gruenbacher --- fs/gfs2/rgrp.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 0a484a009ba2c..68a81afd3b4a7 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1489,6 +1489,34 @@ static void rs_insert(struct gfs2_inode *ip) trace_gfs2_rs(rs, TRACE_RS_INSERT); } +/** + * rgd_free - return the number of free blocks we can allocate. + * @rgd: the resource group + * + * This function returns the number of free blocks for an rgrp. + * That's the clone-free blocks (blocks that are free, not including those + * still being used for unlinked files that haven't been deleted.) + * + * It also subtracts any blocks reserved by someone else, but does not + * include free blocks that are still part of our current reservation, + * because obviously we can (and will) allocate them. + */ +static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs) +{ + u32 tot_reserved, tot_free; + + if (WARN_ON_ONCE(rgd->rd_reserved < rs->rs_free)) + return 0; + tot_reserved = rgd->rd_reserved - rs->rs_free; + + if (rgd->rd_free_clone < tot_reserved) + tot_reserved = 0; + + tot_free = rgd->rd_free_clone - tot_reserved; + + return tot_free; +} + /** * rg_mblk_search - find a group of multiple free blocks to form a reservation * @rgd: the resource group descriptor @@ -1504,7 +1532,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, u64 goal; struct gfs2_blkreserv *rs = &ip->i_res; u32 extlen; - u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved; + u32 free_blocks = rgd_free(rgd, rs); int ret; struct inode *inode = &ip->i_inode; @@ -1985,7 +2013,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) int error = 0, rg_locked, flags = 0; u64 last_unlinked = NO_BLOCK; int loops = 0; - u32 skip = 0; + u32 free_blocks, skip = 0; if (sdp->sd_args.ar_rgrplvb) flags |= GL_SKIP; @@ -2056,10 +2084,11 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) goto check_rgrp; /* If rgrp has enough free space, use it */ - if (rs->rs_rbm.rgd->rd_free_clone >= ap->target || + free_blocks = rgd_free(rs->rs_rbm.rgd, rs); + if (free_blocks >= ap->target || (loops == 2 && ap->min_target && - rs->rs_rbm.rgd->rd_free_clone >= ap->min_target)) { - ap->allowed = rs->rs_rbm.rgd->rd_free_clone; + free_blocks >= ap->min_target)) { + ap->allowed = free_blocks; return 0; } check_rgrp: From 776125785a87ff05d49938bd5b9f336f2a05bff6 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 25 Jul 2018 18:45:08 +0100 Subject: [PATCH 21/25] gfs2: Special-case rindex for gfs2_grow To speed up the common case of appending to a file, gfs2_write_alloc_required presumes that writing beyond the end of a file will always require additional blocks to be allocated. This assumption is incorrect for preallocates files, but there are no negative consequences as long as *some* space is still left on the filesystem. One special file that always has some space preallocated beyond the end of the file is the rindex: when growing a filesystem, gfs2_grow adds one or more new resource groups and appends records describing those resource groups to the rindex; the preallocated space ensures that this is always possible. However, when a filesystem is completely full, gfs2_write_alloc_required will indicate that an additional allocation is required, and appending the next record to the rindex will fail even though space for that record has already been preallocated. To fix that, skip the incorrect optimization in gfs2_write_alloc_required, but for the rindex only. Other writes to preallocated space beyond the end of the file are still allowed to fail on completely full filesystems. Signed-off-by: Andreas Gruenbacher Reviewed-by: Bob Peterson --- fs/gfs2/bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 89f1f7d3186de..03128ed1f34e8 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -2316,7 +2316,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift; lblock = offset >> shift; lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; - if (lblock_stop > end_of_file) + if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex)) return 1; size = (lblock_stop - lblock) << shift; From 3f30f929bb17877ebc1653c6f3ff41863f1ba524 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 26 Jul 2018 12:59:13 -0500 Subject: [PATCH 22/25] gfs2: cleanup: call gfs2_rgrp_ondisk2lvb from gfs2_rgrp_out Before this patch gfs2_rgrp_ondisk2lvb was called after every call to gfs2_rgrp_out. This patch just calls it directly from within gfs2_rgrp_out, and moves the function to be before it so we don't need a function prototype. Signed-off-by: Bob Peterson Reviewed-by: Andreas Gruenbacher --- fs/gfs2/rgrp.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 68a81afd3b4a7..7c5afeba8888a 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1053,6 +1053,18 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) /* rd_data0, rd_data and rd_bitbytes already set from rindex */ } +static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf) +{ + const struct gfs2_rgrp *str = buf; + + rgl->rl_magic = cpu_to_be32(GFS2_MAGIC); + rgl->rl_flags = str->rg_flags; + rgl->rl_free = str->rg_free; + rgl->rl_dinodes = str->rg_dinodes; + rgl->rl_igeneration = str->rg_igeneration; + rgl->__pad = 0UL; +} + static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) { struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd); @@ -1075,6 +1087,7 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) str->rg_crc = cpu_to_be32(crc); memset(&str->rg_reserved, 0, sizeof(str->rg_reserved)); + gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, buf); } static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd) @@ -1089,18 +1102,6 @@ static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd) return 1; } -static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf) -{ - const struct gfs2_rgrp *str = buf; - - rgl->rl_magic = cpu_to_be32(GFS2_MAGIC); - rgl->rl_flags = str->rg_flags; - rgl->rl_free = str->rg_free; - rgl->rl_dinodes = str->rg_dinodes; - rgl->rl_igeneration = str->rg_igeneration; - rgl->__pad = 0UL; -} - static void update_rgrp_lvb_unlinked(struct gfs2_rgrpd *rgd, u32 change) { struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl; @@ -1426,7 +1427,6 @@ int gfs2_fitrim(struct file *filp, void __user *argp) rgd->rd_flags |= GFS2_RGF_TRIMMED; gfs2_trans_add_meta(rgd->rd_gl, bh); gfs2_rgrp_out(rgd, bh->b_data); - gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, bh->b_data); gfs2_trans_end(sdp); } } @@ -2424,7 +2424,6 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data); - gfs2_rgrp_ondisk2lvb(rbm.rgd->rd_rgl, rbm.rgd->rd_bits[0].bi_bh->b_data); gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0); if (dinode) @@ -2465,7 +2464,6 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta) rgd->rd_flags &= ~GFS2_RGF_TRIMMED; gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); /* Directories keep their data in the metadata address space */ if (meta || ip->i_depth) @@ -2502,7 +2500,6 @@ void gfs2_unlink_di(struct inode *inode) trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED); gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); update_rgrp_lvb_unlinked(rgd, 1); } @@ -2523,7 +2520,6 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); update_rgrp_lvb_unlinked(rgd, -1); gfs2_statfs_change(sdp, 0, +1, -1); From 21e2156f3c4b2ad8b780a6d02342ca0e028a8acd Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 3 Aug 2018 10:57:52 +0100 Subject: [PATCH 23/25] gfs2: Get rid of gfs2_ea_strlen Function gfs2_ea_strlen is only called from ea_list_i, so inline it there. Remove the duplicate switch statement and the creative use of memcpy to set a null byte. Signed-off-by: Andreas Gruenbacher Reviewed-by: Andrew Price Reviewed-by: Bob Peterson --- fs/gfs2/xattr.c | 59 ++++++++++++++++++------------------------------- 1 file changed, 22 insertions(+), 37 deletions(-) diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index f2bce1e0f6fb5..38515988aaf74 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -343,60 +343,45 @@ struct ea_list { unsigned int ei_size; }; -static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea) -{ - switch (ea->ea_type) { - case GFS2_EATYPE_USR: - return 5 + ea->ea_name_len + 1; - case GFS2_EATYPE_SYS: - return 7 + ea->ea_name_len + 1; - case GFS2_EATYPE_SECURITY: - return 9 + ea->ea_name_len + 1; - default: - return 0; - } -} - static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, void *private) { struct ea_list *ei = private; struct gfs2_ea_request *er = ei->ei_er; - unsigned int ea_size = gfs2_ea_strlen(ea); + unsigned int ea_size; + char *prefix; + unsigned int l; if (ea->ea_type == GFS2_EATYPE_UNUSED) return 0; - if (er->er_data_len) { - char *prefix = NULL; - unsigned int l = 0; - char c = 0; + switch (ea->ea_type) { + case GFS2_EATYPE_USR: + prefix = "user."; + l = 5; + break; + case GFS2_EATYPE_SYS: + prefix = "system."; + l = 7; + break; + case GFS2_EATYPE_SECURITY: + prefix = "security."; + l = 9; + break; + default: + BUG(); + } + ea_size = l + ea->ea_name_len + 1; + if (er->er_data_len) { if (ei->ei_size + ea_size > er->er_data_len) return -ERANGE; - switch (ea->ea_type) { - case GFS2_EATYPE_USR: - prefix = "user."; - l = 5; - break; - case GFS2_EATYPE_SYS: - prefix = "system."; - l = 7; - break; - case GFS2_EATYPE_SECURITY: - prefix = "security."; - l = 9; - break; - } - - BUG_ON(l == 0); - memcpy(er->er_data + ei->ei_size, prefix, l); memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea), ea->ea_name_len); - memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1); + er->er_data[ei->ei_size + ea_size - 1] = 0; } ei->ei_size += ea_size; From dffe12a82826082d2129ef91b17b257254cb60fc Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 7 Aug 2018 10:07:00 -0500 Subject: [PATCH 24/25] gfs2: Fix gfs2_testbit to use clone bitmaps Function gfs2_testbit is called in three places. Two of those places, gfs2_alloc_extent and gfs2_unaligned_extlen, should be using the clone bitmaps, not the "real" bitmaps. Function gfs2_unaligned_extlen is used by the block reservations scheme to determine the length of an extent of free blocks. Before this patch, it wasn't using the clone bitmap, which means recently-freed blocks were treated as free blocks for the purposes of an allocation. This patch adds a new parameter to gfs2_testbit to indicate whether or not the clone bitmaps should be used (if available). Signed-off-by: Bob Peterson Reviewed-by: Andreas Gruenbacher --- fs/gfs2/incore.h | 21 +++++++++++++++++++++ fs/gfs2/rgrp.c | 44 +++++++++++++++++++------------------------- 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index b50908211b69b..b96d39c28e17c 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -65,6 +65,27 @@ struct gfs2_log_operations { #define GBF_FULL 1 +/** + * Clone bitmaps (bi_clone): + * + * - When a block is freed, we remember the previous state of the block in the + * clone bitmap, and only mark the block as free in the real bitmap. + * + * - When looking for a block to allocate, we check for a free block in the + * clone bitmap, and if no clone bitmap exists, in the real bitmap. + * + * - For allocating a block, we mark it as allocated in the real bitmap, and if + * a clone bitmap exists, also in the clone bitmap. + * + * - At the end of a log_flush, we copy the real bitmap into the clone bitmap + * to make the clone bitmap reflect the current allocation state. + * (Alternatively, we could remove the clone bitmap.) + * + * The clone bitmaps are in-core only, and is never written to disk. + * + * These steps ensure that blocks which have been freed in a transaction cannot + * be reallocated in that same transaction. + */ struct gfs2_bitmap { struct buffer_head *bi_bh; char *bi_clone; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 7c5afeba8888a..ef50fe9b880a0 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -123,17 +123,26 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone, /** * gfs2_testbit - test a bit in the bitmaps * @rbm: The bit to test + * @use_clone: If true, test the clone bitmap, not the official bitmap. + * + * Some callers like gfs2_unaligned_extlen need to test the clone bitmaps, + * not the "real" bitmaps, to avoid allocating recently freed blocks. * * Returns: The two bit block state of the requested bit */ -static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm) +static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone) { struct gfs2_bitmap *bi = rbm_bi(rbm); - const u8 *buffer = bi->bi_bh->b_data + bi->bi_offset; + const u8 *buffer; const u8 *byte; unsigned int bit; + if (use_clone && bi->bi_clone) + buffer = bi->bi_clone; + else + buffer = bi->bi_bh->b_data; + buffer += bi->bi_offset; byte = buffer + (rbm->offset / GFS2_NBBY); bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE; @@ -322,7 +331,7 @@ static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *le u8 res; for (n = 0; n < n_unaligned; n++) { - res = gfs2_testbit(rbm); + res = gfs2_testbit(rbm, true); if (res != GFS2_BLKST_FREE) return true; (*len)--; @@ -2146,26 +2155,6 @@ void gfs2_inplace_release(struct gfs2_inode *ip) gfs2_glock_dq_uninit(&rs->rs_rgd_gh); } -/** - * gfs2_get_block_type - Check a block in a RG is of given type - * @rgd: the resource group holding the block - * @block: the block number - * - * Returns: The block type (GFS2_BLKST_*) - */ - -static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) -{ - struct gfs2_rbm rbm = { .rgd = rgd, }; - int ret; - - ret = gfs2_rbm_from_block(&rbm, block); - WARN_ON_ONCE(ret != 0); - - return gfs2_testbit(&rbm); -} - - /** * gfs2_alloc_extent - allocate an extent from a given bitmap * @rbm: the resource group information @@ -2190,7 +2179,7 @@ static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode, block++; while (*n < elen) { ret = gfs2_rbm_from_block(&pos, block); - if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE) + if (ret || gfs2_testbit(&pos, true) != GFS2_BLKST_FREE) break; gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh); gfs2_setbit(&pos, true, GFS2_BLKST_USED); @@ -2543,6 +2532,7 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) { struct gfs2_rgrpd *rgd; struct gfs2_holder rgd_gh; + struct gfs2_rbm rbm; int error = -EINVAL; rgd = gfs2_blk2rgrpd(sdp, no_addr, 1); @@ -2553,7 +2543,11 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) if (error) goto fail; - if (gfs2_get_block_type(rgd, no_addr) != type) + rbm.rgd = rgd; + error = gfs2_rbm_from_block(&rbm, no_addr); + WARN_ON_ONCE(error != 0); + + if (gfs2_testbit(&rbm, false) != type) error = -ESTALE; gfs2_glock_dq_uninit(&rgd_gh); From f5580d0f8bf60993a5fbc73ee04678070ffbba57 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 8 Aug 2018 09:53:30 -0500 Subject: [PATCH 25/25] gfs2: eliminate update_rgrp_lvb_unlinked Function update_rgrp_lvb_unlinked used to do the same thing as be32_add_cpu. This patch removes it in favor of using be32_add_cpu directly. Signed-off-by: Bob Peterson Reviewed-by: Andrew Price --- fs/gfs2/rgrp.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index ef50fe9b880a0..1ad3256b9cbc3 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1111,13 +1111,6 @@ static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd) return 1; } -static void update_rgrp_lvb_unlinked(struct gfs2_rgrpd *rgd, u32 change) -{ - struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl; - u32 unlinked = be32_to_cpu(rgl->rl_unlinked) + change; - rgl->rl_unlinked = cpu_to_be32(unlinked); -} - static u32 count_unlinked(struct gfs2_rgrpd *rgd) { struct gfs2_bitmap *bi; @@ -2489,7 +2482,7 @@ void gfs2_unlink_di(struct inode *inode) trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED); gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - update_rgrp_lvb_unlinked(rgd, 1); + be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1); } void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) @@ -2509,7 +2502,7 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - update_rgrp_lvb_unlinked(rgd, -1); + be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1); gfs2_statfs_change(sdp, 0, +1, -1); trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);