Skip to content

Commit

Permalink
NFS: support large reads and writes on the wire
Browse files Browse the repository at this point in the history
 Most NFS server implementations allow up to 64KB reads and writes on the
 wire.  The Solaris NFS server allows up to a megabyte, for instance.

 Now the Linux NFS client supports transfer sizes up to 1MB, too.  This will
 help reduce protocol and context switch overhead on read/write intensive NFS
 workloads, and support larger atomic read and write operations on servers
 that support them.

 Test-plan:
 Connectathon and iozone on mount point with wsize=rsize>32768 over TCP.
 Tests with NFS over UDP to verify the maximum RPC payload size cap.

 Signed-off-by: Chuck Lever <cel@netapp.com>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
  • Loading branch information
Chuck Lever authored and Trond Myklebust committed Jan 6, 2006
1 parent 325cfed commit 40859d7
Show file tree
Hide file tree
Showing 8 changed files with 91 additions and 53 deletions.
5 changes: 3 additions & 2 deletions fs/nfs/direct.c
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int
struct list_head *list;
struct nfs_direct_req *dreq;
unsigned int reads = 0;
unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL);
if (!dreq)
Expand All @@ -167,7 +168,7 @@ static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int

list = &dreq->list;
for(;;) {
struct nfs_read_data *data = nfs_readdata_alloc();
struct nfs_read_data *data = nfs_readdata_alloc(rpages);

if (unlikely(!data)) {
while (!list_empty(list)) {
Expand Down Expand Up @@ -431,7 +432,7 @@ static ssize_t nfs_direct_write_seg(struct inode *inode,
struct nfs_writeverf first_verf;
struct nfs_write_data *wdata;

wdata = nfs_writedata_alloc();
wdata = nfs_writedata_alloc(NFS_SERVER(inode)->wpages);
if (!wdata)
return -ENOMEM;

Expand Down
25 changes: 10 additions & 15 deletions fs/nfs/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -221,10 +221,10 @@ nfs_calc_block_size(u64 tsize)
/*
 * Clamp a requested I/O size, then hand it to nfs_block_bits() and return
 * that function's result.
 *
 * NOTE(review): this listing appears to be a diff rendered without +/-
 * markers: the first four body lines are the old bounds (literal 1024 and
 * the NFS_*_FILE_IO_BUFFER_SIZE macros) and the next four are their
 * replacements.  With the replacement bounds, a size below
 * NFS_MIN_FILE_IO_SIZE becomes NFS_DEF_FILE_IO_SIZE, and a size at or
 * above NFS_MAX_FILE_IO_SIZE becomes NFS_MAX_FILE_IO_SIZE.
 */
static inline unsigned long
nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
{
if (bsize < 1024)
bsize = NFS_DEF_FILE_IO_BUFFER_SIZE;
else if (bsize >= NFS_MAX_FILE_IO_BUFFER_SIZE)
bsize = NFS_MAX_FILE_IO_BUFFER_SIZE;
if (bsize < NFS_MIN_FILE_IO_SIZE)
bsize = NFS_DEF_FILE_IO_SIZE;
else if (bsize >= NFS_MAX_FILE_IO_SIZE)
bsize = NFS_MAX_FILE_IO_SIZE;

return nfs_block_bits(bsize, nrbitsp);
}
Expand Down Expand Up @@ -307,20 +307,15 @@ nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
if (server->rsize > max_rpc_payload)
server->rsize = max_rpc_payload;
if (server->wsize > max_rpc_payload)
server->wsize = max_rpc_payload;

if (server->rsize > NFS_MAX_FILE_IO_SIZE)
server->rsize = NFS_MAX_FILE_IO_SIZE;
server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (server->rpages > NFS_READ_MAXIOV) {
server->rpages = NFS_READ_MAXIOV;
server->rsize = server->rpages << PAGE_CACHE_SHIFT;
}

if (server->wsize > max_rpc_payload)
server->wsize = max_rpc_payload;
if (server->wsize > NFS_MAX_FILE_IO_SIZE)
server->wsize = NFS_MAX_FILE_IO_SIZE;
server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (server->wpages > NFS_WRITE_MAXIOV) {
server->wpages = NFS_WRITE_MAXIOV;
server->wsize = server->wpages << PAGE_CACHE_SHIFT;
}

if (sb->s_blocksize == 0)
sb->s_blocksize = nfs_block_bits(server->wsize,
Expand Down
4 changes: 2 additions & 2 deletions fs/nfs/nfsroot.c
Original file line number Diff line number Diff line change
Expand Up @@ -296,8 +296,8 @@ static int __init root_nfs_name(char *name)
nfs_port = -1;
nfs_data.version = NFS_MOUNT_VERSION;
nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */
nfs_data.rsize = NFS_DEF_FILE_IO_BUFFER_SIZE;
nfs_data.wsize = NFS_DEF_FILE_IO_BUFFER_SIZE;
nfs_data.rsize = NFS_DEF_FILE_IO_SIZE;
nfs_data.wsize = NFS_DEF_FILE_IO_SIZE;
nfs_data.acregmin = 3;
nfs_data.acregmax = 60;
nfs_data.acdirmin = 30;
Expand Down
6 changes: 3 additions & 3 deletions fs/nfs/read.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode,
int result;
struct nfs_read_data *rdata;

rdata = nfs_readdata_alloc();
rdata = nfs_readdata_alloc(1);
if (!rdata)
return -ENOMEM;

Expand Down Expand Up @@ -283,7 +283,7 @@ static int nfs_pagein_multi(struct list_head *head, struct inode *inode)

nbytes = req->wb_bytes;
for(;;) {
data = nfs_readdata_alloc();
data = nfs_readdata_alloc(1);
if (!data)
goto out_bad;
INIT_LIST_HEAD(&data->pages);
Expand Down Expand Up @@ -339,7 +339,7 @@ static int nfs_pagein_one(struct list_head *head, struct inode *inode)
if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
return nfs_pagein_multi(head, inode);

data = nfs_readdata_alloc();
data = nfs_readdata_alloc(NFS_SERVER(inode)->rpages);
if (!data)
goto out_bad;

Expand Down
29 changes: 22 additions & 7 deletions fs/nfs/write.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,18 +89,33 @@ static mempool_t *nfs_commit_mempool;

static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);

/*
 * Allocate a zeroed commit descriptor from nfs_commit_mempool and attach a
 * page vector sized for @pagecount pages.
 *
 * When @pagecount is below NFS_PAGEVEC_SIZE the descriptor's embedded
 * page_array is used; otherwise a zeroed array of pagecount + 1 page
 * pointers is kmalloc()ed with GFP_NOFS.  Returns NULL if either allocation
 * fails; when only the vector allocation fails the descriptor is handed
 * back to the mempool first.
 *
 * NOTE(review): the two signature lines below look like the old (void) and
 * new (pagecount) forms from a diff rendered without +/- markers.
 */
static inline struct nfs_write_data *nfs_commit_alloc(void)
static inline struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount)
{
struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, SLAB_NOFS);

if (p) {
memset(p, 0, sizeof(*p));
INIT_LIST_HEAD(&p->pages);
/*
 * NOTE(review): page_array is declared with NFS_PAGEVEC_SIZE + 1 slots,
 * so pagecount == NFS_PAGEVEC_SIZE would also fit inline yet takes the
 * kmalloc path here -- confirm whether '<=' was intended.
 */
if (pagecount < NFS_PAGEVEC_SIZE)
p->pagevec = &p->page_array[0];
else {
/* Pre-increment: the vector gets one slot more than pagecount. */
size_t size = ++pagecount * sizeof(struct page *);
p->pagevec = kmalloc(size, GFP_NOFS);
if (p->pagevec) {
memset(p->pagevec, 0, size);
} else {
/* Vector allocation failed: give the descriptor back and report failure. */
mempool_free(p, nfs_commit_mempool);
p = NULL;
}
}
}
return p;
}

/*
 * Release a commit descriptor.
 *
 * A page vector that does not alias the descriptor's embedded page_array is
 * kfree()d first; the descriptor itself is then passed to mempool_free() on
 * nfs_commit_mempool.  mempool_free() is called even when @p is NULL.
 */
static inline void nfs_commit_free(struct nfs_write_data *p)
{
	if (p) {
		struct page **embedded = p->page_array;

		if (p->pagevec != embedded)
			kfree(p->pagevec);
	}
	mempool_free(p, nfs_commit_mempool);
}

Expand Down Expand Up @@ -167,7 +182,7 @@ static int nfs_writepage_sync(struct nfs_open_context *ctx, struct inode *inode,
int result, written = 0;
struct nfs_write_data *wdata;

wdata = nfs_writedata_alloc();
wdata = nfs_writedata_alloc(1);
if (!wdata)
return -ENOMEM;

Expand Down Expand Up @@ -909,7 +924,7 @@ static int nfs_flush_multi(struct list_head *head, struct inode *inode, int how)

nbytes = req->wb_bytes;
for (;;) {
data = nfs_writedata_alloc();
data = nfs_writedata_alloc(1);
if (!data)
goto out_bad;
list_add(&data->pages, &list);
Expand Down Expand Up @@ -973,7 +988,7 @@ static int nfs_flush_one(struct list_head *head, struct inode *inode, int how)
if (NFS_SERVER(inode)->wsize < PAGE_CACHE_SIZE)
return nfs_flush_multi(head, inode, how);

data = nfs_writedata_alloc();
data = nfs_writedata_alloc(NFS_SERVER(inode)->wpages);
if (!data)
goto out_bad;

Expand Down Expand Up @@ -1241,12 +1256,12 @@ static void nfs_commit_rpcsetup(struct list_head *head,
* Commit dirty pages
*/
static int
nfs_commit_list(struct list_head *head, int how)
nfs_commit_list(struct inode *inode, struct list_head *head, int how)
{
struct nfs_write_data *data;
struct nfs_page *req;

data = nfs_commit_alloc();
data = nfs_commit_alloc(NFS_SERVER(inode)->wpages);

if (!data)
goto out_bad;
Expand Down Expand Up @@ -1351,7 +1366,7 @@ int nfs_commit_inode(struct inode *inode, int how)
res = nfs_scan_commit(inode, &head, 0, 0);
spin_unlock(&nfsi->req_lock);
if (res) {
error = nfs_commit_list(&head, how);
error = nfs_commit_list(inode, &head, how);
if (error < 0)
return error;
}
Expand Down
41 changes: 35 additions & 6 deletions include/linux/nfs_fs.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,6 @@
# define NFS_DEBUG
#endif

#define NFS_MAX_FILE_IO_BUFFER_SIZE 32768
#define NFS_DEF_FILE_IO_BUFFER_SIZE 4096

/* Default timeout values */
#define NFS_MAX_UDP_TIMEOUT (60*HZ)
#define NFS_MAX_TCP_TIMEOUT (600*HZ)
Expand Down Expand Up @@ -462,18 +459,33 @@ static inline int nfs_wb_page(struct inode *inode, struct page* page)
*/
extern mempool_t *nfs_wdata_mempool;

/*
 * Allocate a zeroed write descriptor from nfs_wdata_mempool and attach a
 * page vector sized for @pagecount pages.
 *
 * When @pagecount is below NFS_PAGEVEC_SIZE the descriptor's embedded
 * page_array is used; otherwise a zeroed array of pagecount + 1 page
 * pointers is kmalloc()ed with GFP_NOFS.  Returns NULL if either allocation
 * fails; when only the vector allocation fails the descriptor is handed
 * back to the mempool first.
 *
 * NOTE(review): the two signature lines below look like the old (void) and
 * new (pagecount) forms from a diff rendered without +/- markers.
 */
static inline struct nfs_write_data *nfs_writedata_alloc(void)
static inline struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
{
struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, SLAB_NOFS);

if (p) {
memset(p, 0, sizeof(*p));
INIT_LIST_HEAD(&p->pages);
/*
 * NOTE(review): page_array is declared with NFS_PAGEVEC_SIZE + 1 slots,
 * so pagecount == NFS_PAGEVEC_SIZE would also fit inline yet takes the
 * kmalloc path here -- confirm whether '<=' was intended.
 */
if (pagecount < NFS_PAGEVEC_SIZE)
p->pagevec = &p->page_array[0];
else {
/* Pre-increment: the vector gets one slot more than pagecount. */
size_t size = ++pagecount * sizeof(struct page *);
p->pagevec = kmalloc(size, GFP_NOFS);
if (p->pagevec) {
memset(p->pagevec, 0, size);
} else {
/* Vector allocation failed: give the descriptor back and report failure. */
mempool_free(p, nfs_wdata_mempool);
p = NULL;
}
}
}
return p;
}

/*
 * Release a write descriptor.
 *
 * A page vector that does not alias the descriptor's embedded page_array is
 * kfree()d first; the descriptor itself is then passed to mempool_free() on
 * nfs_wdata_mempool.  mempool_free() is called even when @p is NULL.
 */
static inline void nfs_writedata_free(struct nfs_write_data *p)
{
	if (p) {
		struct page **embedded = p->page_array;

		if (p->pagevec != embedded)
			kfree(p->pagevec);
	}
	mempool_free(p, nfs_wdata_mempool);
}

Expand All @@ -492,16 +504,33 @@ extern void nfs_readdata_release(void *data);
*/
extern mempool_t *nfs_rdata_mempool;

/*
 * Allocate a zeroed read descriptor from nfs_rdata_mempool and attach a
 * page vector sized for @pagecount pages.
 *
 * When @pagecount is below NFS_PAGEVEC_SIZE the descriptor's embedded
 * page_array is used; otherwise a zeroed array of pagecount + 1 page
 * pointers is kmalloc()ed with GFP_NOFS.  Returns NULL if either allocation
 * fails; when only the vector allocation fails the descriptor is handed
 * back to the mempool first.
 *
 * NOTE(review): the doubled signature and the bare "if (p)" ahead of
 * "if (p) {" look like old and new lines from a diff rendered without +/-
 * markers.
 */
static inline struct nfs_read_data *nfs_readdata_alloc(void)
static inline struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
{
struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS);
if (p)

if (p) {
memset(p, 0, sizeof(*p));
INIT_LIST_HEAD(&p->pages);
/*
 * NOTE(review): page_array is declared with NFS_PAGEVEC_SIZE + 1 slots,
 * so pagecount == NFS_PAGEVEC_SIZE would also fit inline yet takes the
 * kmalloc path here -- confirm whether '<=' was intended.
 */
if (pagecount < NFS_PAGEVEC_SIZE)
p->pagevec = &p->page_array[0];
else {
/* Pre-increment: the vector gets one slot more than pagecount. */
size_t size = ++pagecount * sizeof(struct page *);
p->pagevec = kmalloc(size, GFP_NOFS);
if (p->pagevec) {
memset(p->pagevec, 0, size);
} else {
/* Vector allocation failed: give the descriptor back and report failure. */
mempool_free(p, nfs_rdata_mempool);
p = NULL;
}
}
}
return p;
}

/*
 * Release a read descriptor.
 *
 * A page vector that does not alias the descriptor's embedded page_array is
 * kfree()d first; the descriptor itself is then passed to mempool_free() on
 * nfs_rdata_mempool.  mempool_free() is called even when @p is NULL.
 */
static inline void nfs_readdata_free(struct nfs_read_data *p)
{
	if (p) {
		struct page **embedded = p->page_array;

		if (p->pagevec != embedded)
			kfree(p->pagevec);
	}
	mempool_free(p, nfs_rdata_mempool);
}

Expand Down
29 changes: 16 additions & 13 deletions include/linux/nfs_xdr.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,16 @@
#include <linux/sunrpc/xprt.h>
#include <linux/nfsacl.h>

/*
 * To change the maximum rsize and wsize supported by the NFS client, adjust
 * NFS_MAX_FILE_IO_SIZE. 64KB is a typical maximum, but some servers can
 * support a megabyte or more. The default is left at 4096 bytes, which is
 * reasonable for NFS over UDP.
 *
 * NFS_MIN_FILE_IO_SIZE is the lower bound: nfs_block_size() substitutes
 * NFS_DEF_FILE_IO_SIZE for any requested size below it, and caps sizes at
 * or above NFS_MAX_FILE_IO_SIZE to NFS_MAX_FILE_IO_SIZE.
 */
#define NFS_MAX_FILE_IO_SIZE (1048576U)
#define NFS_DEF_FILE_IO_SIZE (4096U)
#define NFS_MIN_FILE_IO_SIZE (1024U)

struct nfs4_fsid {
__u64 major;
__u64 minor;
Expand Down Expand Up @@ -215,12 +225,6 @@ struct nfs4_delegreturnargs {
/*
* Arguments to the read call.
*/

#define NFS_READ_MAXIOV (9U)
#if (NFS_READ_MAXIOV > (MAX_IOVEC -2))
#error "NFS_READ_MAXIOV is too large"
#endif

struct nfs_readargs {
struct nfs_fh * fh;
struct nfs_open_context *context;
Expand All @@ -239,11 +243,6 @@ struct nfs_readres {
/*
* Arguments to the write call.
*/
#define NFS_WRITE_MAXIOV (9U)
#if (NFS_WRITE_MAXIOV > (MAX_IOVEC -2))
#error "NFS_WRITE_MAXIOV is too large"
#endif

struct nfs_writeargs {
struct nfs_fh * fh;
struct nfs_open_context *context;
Expand Down Expand Up @@ -674,6 +673,8 @@ struct nfs4_server_caps_res {

struct nfs_page;

/*
 * Threshold used by nfs_readdata_alloc(), nfs_writedata_alloc() and
 * nfs_commit_alloc(): page counts below this value use the descriptor's
 * embedded page_array (declared with NFS_PAGEVEC_SIZE + 1 slots); counts at
 * or above it get a separately kmalloc()ed page vector.
 */
#define NFS_PAGEVEC_SIZE (8U)

struct nfs_read_data {
int flags;
struct rpc_task task;
Expand All @@ -682,13 +683,14 @@ struct nfs_read_data {
struct nfs_fattr fattr; /* fattr storage */
struct list_head pages; /* Coalesced read requests */
struct nfs_page *req; /* multi ops per nfs_page */
struct page *pagevec[NFS_READ_MAXIOV];
struct page **pagevec;
struct nfs_readargs args;
struct nfs_readres res;
#ifdef CONFIG_NFS_V4
unsigned long timestamp; /* For lease renewal */
#endif
void (*complete) (struct nfs_read_data *, int);
struct page *page_array[NFS_PAGEVEC_SIZE + 1];
};

struct nfs_write_data {
Expand All @@ -700,13 +702,14 @@ struct nfs_write_data {
struct nfs_writeverf verf;
struct list_head pages; /* Coalesced requests we wish to flush */
struct nfs_page *req; /* multi ops per nfs_page */
struct page *pagevec[NFS_WRITE_MAXIOV];
struct page **pagevec;
struct nfs_writeargs args; /* argument struct */
struct nfs_writeres res; /* result struct */
#ifdef CONFIG_NFS_V4
unsigned long timestamp; /* For lease renewal */
#endif
void (*complete) (struct nfs_write_data *, int);
struct page *page_array[NFS_PAGEVEC_SIZE + 1];
};

struct nfs_access_entry;
Expand Down
5 changes: 0 additions & 5 deletions include/linux/sunrpc/xdr.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,11 +134,6 @@ xdr_adjust_iovec(struct kvec *iov, u32 *p)
return iov->iov_len = ((u8 *) p - (u8 *) iov->iov_base);
}

/*
* Maximum number of iov's we use.
*/
#define MAX_IOVEC (12)

/*
* XDR buffer helper functions
*/
Expand Down

0 comments on commit 40859d7

Please sign in to comment.