diff -druN Linux-2.4.4/fs/nfs/Makefile linux-2.4.4-odirect/fs/nfs/Makefile --- Linux-2.4.4/fs/nfs/Makefile Fri Dec 29 17:07:23 2000 +++ linux-2.4.4-odirect/fs/nfs/Makefile Thu Nov 15 11:42:39 2001 @@ -10,7 +10,7 @@ O_TARGET := nfs.o obj-y := inode.o file.o read.o write.o dir.o symlink.o proc.o \ - nfs2xdr.o flushd.o unlink.o + nfs2xdr.o flushd.o unlink.o direct.o obj-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o obj-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o diff -druN Linux-2.4.4/fs/nfs/direct.c linux-2.4.4-odirect/fs/nfs/direct.c --- Linux-2.4.4/fs/nfs/direct.c Wed Dec 31 19:00:00 1969 +++ linux-2.4.4-odirect/fs/nfs/direct.c Thu Nov 29 17:30:07 2001 @@ -0,0 +1,396 @@ +/* + * linux/fs/nfs/direct.c + * + * High-performance direct I/O for the NFS client + * + * When an application requests uncached I/O, all read and write requests + * are made directly to the server; data stored or fetched via these + * requests is not cached in the Linux page cache. The client does not + * correct unaligned requests from applications. All requested bytes are + * held on permanent storage before a direct write system call returns to + * an application. Applications that manage their own data caching, such + * as databases, can make very good use of direct I/O. + * + * Solaris implements an uncached I/O facility called directio() that + * is used for backups and sequential I/O to very large files. Solaris + * also supports uncaching whole NFS partitions with "-o forcedirectio," + * an undocumented mount option. + * + * Simple-minded prototype November 2001, by Chuck Lever + * + * TODO: + * + * 1. Eliminate the buffer copy operation in the read and write + * paths. The RPC checksum bug must be fixed before this will + * be possible. + * + * 2. Use concurrent asynchronous network requests rather than + * serialized synchronous network requests for normal (non-sync) + * direct I/O. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#define NFSDBG_FACILITY		(NFSDBG_PAGECACHE | NFSDBG_VFS)
+#define ROUND_UP(x, mod)	(((x) + ((mod) - 1)) & (~((mod) - 1)))
+#define VERF_SIZE		(2 * sizeof(__u32))
+
+static kmem_cache_t *nfs_direct_cachep;
+
+static inline void *
+nfs_alloc_direct_buffer(void)
+{
+	return kmem_cache_alloc(nfs_direct_cachep, SLAB_NFS);
+}
+
+static inline void
+nfs_free_direct_buffer(void *buffer)
+{
+	kmem_cache_free(nfs_direct_cachep, buffer);
+}
+/* Run the protocol's read method once; -EISDIR is reported as -EINVAL. */
+static inline int
+nfs_direct_read_one(struct file *file, struct inode *inode, loff_t offset,
+					size_t count, void *buf)
+{
+	int eof, result;
+	struct nfs_fattr fattr;
+
+	lock_kernel();
+	result = NFS_PROTO(inode)->read(inode, nfs_file_cred(file), &fattr,
+					0, offset, count, buf, &eof);
+	nfs_refresh_inode(inode, &fattr);
+	unlock_kernel();
+
+	if (result == -EISDIR)
+		return -EINVAL;
+
+	return result;
+}
+/* Run the protocol's write method once; an unsynced stable write is -EIO. */
+static inline int
+nfs_direct_write_one(struct file *file, struct inode *inode, void *buf,
+	loff_t offset, size_t count, int flags, struct nfs_writeverf *verf)
+{
+	int result;
+	struct nfs_fattr fattr;
+
+	lock_kernel();
+	result = NFS_PROTO(inode)->write(inode, nfs_file_cred(file), &fattr,
+					flags, offset, count, buf, verf);
+	nfs_write_attributes(inode, &fattr);
+	unlock_kernel();
+	/* Callers pass *verf uninitialized: only trust it after success. */
+	if (result < 0)
+		return result;
+	if ((flags == NFS_RW_SYNC) && (verf->committed != NFS_FILE_SYNC)) {
+		printk(KERN_ERR __FUNCTION__
+			": server didn't sync stable write request\n");
+		return -EIO;
+	}
+	if (result != count)
+		printk(KERN_INFO __FUNCTION__
+			": short write, count=%u, result=%d\n", count, result);
+	return result;
+}
+/* Run the protocol's commit method (V3 builds); otherwise a no-op stub. */
+#ifdef CONFIG_NFS_V3
+static inline int
+nfs_direct_commit_one(struct inode *inode, loff_t offset, size_t count,
+	struct nfs_writeverf *verf)
+{
+	int result;
+	struct nfs_fattr fattr;
+
+	lock_kernel();
+	result = NFS_PROTO(inode)->commit(inode, &fattr, offset, count, verf);
+	nfs_write_attributes(inode, &fattr);
+	unlock_kernel();
+
+	return result;
+}
+#else
+static inline int
+nfs_direct_commit_one(struct inode *inode, loff_t offset, size_t count,
+	struct nfs_writeverf *verf)
+{
+	return 0;
+}
+#endif
+
+/*
+ * Push one or more network reads.
+ */
+static inline int
+nfs_direct_read_multiple(struct file *file, const char *dst, size_t count,
+					loff_t offset, void *buffer)
+{
+	int total = 0;
+	struct inode *inode = file->f_dentry->d_inode;
+	int rsize = NFS_SERVER(inode)->rsize;
+
+	while (count) {
+		int request, result;
+
+		request = (count > rsize) ? rsize : count;
+		result = nfs_direct_read_one(file, inode, offset, request,
+								buffer);
+		if (result < 0) {
+			total = result;
+			break;
+		}
+		/* Bytes that never reached user space must not be counted. */
+		if (__copy_to_user((void *) dst, buffer, result)) {
+			total = -EFAULT;
+			break;
+		}
+		total += result;
+		count -= result;
+		offset += result;
+		dst += result;
+		if (result < request)	/* NFSv2ism */
+			break;
+	};
+
+	return total;
+}
+
+/*
+ * Push one stable network write.
+ */
+static inline int
+nfs_direct_write_single(struct file *file, struct inode *inode,
+	const char *dst, size_t count, loff_t pos, void *buffer)
+{
+	struct nfs_writeverf verf;
+
+	if (count && __copy_from_user(buffer, (void *) dst, count))
+		return -EFAULT;
+
+	return nfs_direct_write_one(file, inode, buffer, pos, count,
+							NFS_RW_SYNC, &verf);
+}
+
+/*
+ * Push multiple network writes. If this is V3, push unstable writes
+ * and finish with a commit.
+ *
+ * The "sync" mount flag causes all V3 writes to be stable, causing
+ * partial writes to hit the server's *disk* in byte order. This means
+ * every separate network write hits the server's disk synchronously and
+ * in byte order, providing an added degree of durability for database
+ * applications.
+ */
+static inline int
+nfs_direct_write_multiple(struct file *file, struct inode *inode,
+	const char *user, size_t count, loff_t pos, void *buffer)
+{
+	int flags, total, remaining, need_commit;
+	loff_t offset;
+	const char *dst;
+	struct nfs_writeverf first_verf, ret_verf;
+
+	flags = NFS_RW_SYNC;
+#ifdef CONFIG_NFS_V3
+	if ((NFS_PROTO(inode)->version == 3) && !IS_SYNC(inode))
+		flags = 0;
+#endif
+
+retry:
+	need_commit = 0;
+	total = 0;
+	remaining = count;
+	offset = pos;
+	dst = user;
+	while (remaining) {
+		int request, result;
+		int wsize = NFS_SERVER(inode)->wsize;
+
+		request = (remaining > wsize) ? wsize : remaining;
+		if (__copy_from_user(buffer, (void *) dst, request)) {
+			total = -EFAULT;
+			break;
+		}
+		result = nfs_direct_write_one(file, inode, buffer, offset,
+						request, flags, &ret_verf);
+		if (result < 0) {
+			total = result;
+			break;
+		}
+
+		if (!total)
+			memcpy(&first_verf.verifier, &ret_verf.verifier,
+								VERF_SIZE);
+		if (ret_verf.committed != NFS_FILE_SYNC) {
+			need_commit = 1;
+			if (memcmp(&first_verf.verifier, &ret_verf.verifier,
+								VERF_SIZE))
+				goto print_retry;
+		}
+
+		total += result;
+		remaining -= result;
+		offset += result;
+		dst += result;
+	};
+
+	if (need_commit) {
+		if (nfs_direct_commit_one(inode, pos, count - remaining,
+								&ret_verf))
+			goto print_retry;
+		if (memcmp(&first_verf.verifier, &ret_verf.verifier,
+								VERF_SIZE))
+			goto print_retry;
+	}
+
+	return total;
+
+print_retry:
+	printk(KERN_INFO __FUNCTION__
+		": detected server restart; retrying with FILE_SYNC\n");
+	flags = NFS_RW_SYNC;
+	goto retry;
+}
+/* Read count bytes at *ppos from the server into the user buffer dst. */
+int
+nfs_direct_read(struct file *file, const char *dst, size_t count,
+							loff_t *ppos)
+{
+	int result;
+	loff_t pos = *ppos;
+	struct dentry *dentry = file->f_dentry;
+	void *buffer;
+
+	dfprintk(VFS, "NFS: direct_read(%s/%s, %d@%Lu)\n",
+		dentry->d_parent->d_name.name, dentry->d_name.name,
+		count, pos);
+
+	if (((ssize_t) count < 0) || (pos < 0))
+		return -EINVAL;
+	if (!access_ok(VERIFY_WRITE, dst, count))
+		return -EFAULT;
+
+	buffer = nfs_alloc_direct_buffer();
+	if (!buffer)
+		return -ENOMEM;
+
+	result = nfs_direct_read_multiple(file, dst, count, pos, buffer);
+	if (result > 0)
+		*ppos = pos + result;
+
+	nfs_free_direct_buffer(buffer);
+	return result;
+}
+/* Uncached write: send count bytes from user buffer dst to the server. */
+int
+nfs_direct_write(struct file *file, const char *dst, size_t count,
+							loff_t *ppos)
+{
+	int result;
+	unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
+	loff_t pos = *ppos;
+	void *buffer;
+	struct dentry *dentry = file->f_dentry;
+	struct inode *inode = dentry->d_inode;
+
+	dfprintk(VFS, "NFS: direct_write(%s/%s, %d@%Lu)\n",
+		dentry->d_parent->d_name.name, dentry->d_name.name,
+		count, pos);
+
+	if (((ssize_t) count < 0) || (pos < 0))
+		return -EINVAL;
+	if (!access_ok(VERIFY_READ, dst, count))
+		return -EFAULT;
+
+	buffer = nfs_alloc_direct_buffer();
+	if (!buffer)
+		return -ENOMEM;
+
+	down(&inode->i_sem);
+
+	/*
+	 * Flush pending cached writes for the requested range.
+	 * Don't wait -- we just need to maintain write ordering.
+	 */
+	if (nfs_have_writebacks(inode))
+		nfs_sync_file(inode, file, pos,
+				(count >> PAGE_CACHE_SHIFT) + 1, 0);
+
+	if (file->f_flags & O_APPEND)
+		pos = inode->i_size;
+
+	result = -EFBIG;
+	if (limit != RLIM_INFINITY) {
+		if (pos >= limit) {
+			send_sig(SIGXFSZ, current, 0);
+			goto out;
+		}
+		if ((pos > 0xFFFFFFFFULL) || (count > (limit - (u32)pos)))
+			count = limit - (u32)pos;
+	}
+	if (((pos + count) > MAX_NON_LFS) && !(file->f_flags & O_LARGEFILE)) {
+		if (pos >= MAX_NON_LFS) {
+			send_sig(SIGXFSZ, current, 0);
+			goto out;
+		}
+		if (count > (MAX_NON_LFS - (u32)pos))
+			count = MAX_NON_LFS - (u32)pos;
+	}
+
+	if (count <= NFS_SERVER(inode)->wsize)
+		result = nfs_direct_write_single(file, inode, dst, count,
+								pos, buffer);
+	else
+		result = nfs_direct_write_multiple(file, inode, dst, count,
+								pos, buffer);
+
+	/*
+	 * Because we didn't write into the page cache, anything cached
+	 * there is now invalid. We used nfs_write_attributes to
+	 * update the attribute cache after each network write.
+	 * This prevents data cache purging after each network write.
+	 * We do it once here at the end of the whole write operation.
+	 */
+	invalidate_inode_pages(inode);
+
+	if (result > 0)
+		*ppos = pos + result;
+
+out:
+	/* Both -EFBIG bail-outs land here with i_sem still held. */
+	up(&inode->i_sem);
+	nfs_free_direct_buffer(buffer);
+	return result;
+}
+/* Create the slab cache that supplies the per-call transfer buffers. */
+int nfs_init_directcache(void)
+{
+	nfs_direct_cachep = kmem_cache_create("nfs_direct",
+#ifdef CONFIG_NFS_V3
+						NFS3_MAXDATA,
+#else
+						NFS2_MAXDATA,
+#endif
+						0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (nfs_direct_cachep == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+/* Destroy the transfer-buffer cache; complain if buffers are still out. */
+void nfs_destroy_directcache(void)
+{
+	if (kmem_cache_destroy(nfs_direct_cachep))
+		printk(KERN_INFO __FUNCTION__
+			": not all buffers were freed\n");
+}
diff -druN Linux-2.4.4/fs/nfs/file.c linux-2.4.4-odirect/fs/nfs/file.c
--- Linux-2.4.4/fs/nfs/file.c	Tue Sep 4 13:34:53 2001
+++ linux-2.4.4-odirect/fs/nfs/file.c	Thu Nov 15 11:47:27 2001
@@ -98,6 +98,9 @@
 		dentry->d_parent->d_name.name, dentry->d_name.name,
 		(unsigned long) count, (unsigned long) *ppos);
 
+	if (NFS_SERVER(inode)->flags & NFS_MOUNT_NODC)
+		return nfs_direct_read( file, buf, count, ppos);
+
 	result = nfs_revalidate_inode(NFS_SERVER(inode), inode);
 	if (!result)
 		result = generic_file_read(file, buf, count, ppos);
@@ -224,6 +227,10 @@
 	result = -EBUSY;
 	if (IS_SWAPFILE(inode))
 		goto out_swapfile;
+
+	if (NFS_SERVER(inode)->flags & NFS_MOUNT_NODC)
+		return nfs_direct_write( file, buf, count, ppos);
+
 	result = nfs_revalidate_inode(NFS_SERVER(inode), inode);
 	if (result)
 		goto out;
diff -druN Linux-2.4.4/fs/nfs/inode.c linux-2.4.4-odirect/fs/nfs/inode.c
--- Linux-2.4.4/fs/nfs/inode.c	Tue Sep 4 13:34:53 2001
+++ linux-2.4.4-odirect/fs/nfs/inode.c	Thu Nov 15 11:49:20 2001
@@ -1053,7 +1053,9 @@
 extern int nfs_init_nfspagecache(void);
 extern void nfs_destroy_nfspagecache(void);
 extern int nfs_init_readpagecache(void);
-extern int nfs_destroy_readpagecache(void);
+extern void nfs_destroy_readpagecache(void);
+extern int nfs_init_directcache(void);
+extern void nfs_destroy_directcache(void);
 
 /*
  * Initialize NFS
@@ -1070,6 +1072,10 @@
 	if (err)
 		return err;
 
+	err = nfs_init_directcache();
+	if (err)
+		return err;
+
 #ifdef CONFIG_PROC_FS
 	rpc_proc_register(&nfs_rpcstat);
 #endif
@@ -1080,6 +1086,7 @@
 {
 	nfs_destroy_readpagecache();
 	nfs_destroy_nfspagecache();
+	nfs_destroy_directcache();
 #ifdef CONFIG_PROC_FS
 	rpc_proc_unregister("nfs");
 #endif
diff -druN Linux-2.4.4/fs/nfs/nfs3proc.c linux-2.4.4-odirect/fs/nfs/nfs3proc.c
--- Linux-2.4.4/fs/nfs/nfs3proc.c	Sun Dec 3 21:01:01 2000
+++ linux-2.4.4-odirect/fs/nfs/nfs3proc.c	Thu Nov 15 11:50:53 2001
@@ -181,6 +181,27 @@
 	return status < 0? status : res.count;
 }
 
+static int
+nfs3_proc_commit(struct inode *inode, struct nfs_fattr *fattr, loff_t offset,
+		 unsigned int count, struct nfs_writeverf *verf)
+{
+	struct nfs_writeargs	arg = { NFS_FH(inode), offset, count, 0, 0,
+					{{0, 0}, {0,0}, {0,0}, {0,0},
+					 {0,0}, {0,0}, {0,0}, {0,0}} };
+	struct nfs_writeres	res = { fattr, verf, 0 };
+	struct rpc_message	msg = { NFS3PROC_COMMIT, &arg, &res, NULL };
+	int			status;
+
+	dprintk("NFS call commit %d @ %Ld\n", count, (long long)offset);
+
+	fattr->valid = 0;
+
+	status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
+
+	dprintk("NFS reply commit: %d\n", status);
+	return status;
+}
+
 /*
  * Create a regular file.
  * For now, we don't implement O_EXCL.
@@ -495,7 +516,7 @@ nfs3_proc_readlink, nfs3_proc_read, nfs3_proc_write, - NULL, /* commit */ + nfs3_proc_commit, nfs3_proc_create, nfs3_proc_remove, nfs3_proc_unlink_setup, diff -druN Linux-2.4.4/fs/nfs/nfsroot.c linux-2.4.4-odirect/fs/nfs/nfsroot.c --- Linux-2.4.4/fs/nfs/nfsroot.c Mon Sep 25 16:13:53 2000 +++ linux-2.4.4-odirect/fs/nfs/nfsroot.c Thu Nov 15 11:52:24 2001 @@ -158,6 +158,8 @@ { "udp", ~NFS_MOUNT_TCP, 0 }, { "tcp", ~NFS_MOUNT_TCP, NFS_MOUNT_TCP }, { "broken_suid",~NFS_MOUNT_BROKEN_SUID, NFS_MOUNT_BROKEN_SUID }, + { "dc", ~NFS_MOUNT_NODC, 0 }, + { "nodc", ~NFS_MOUNT_NODC, NFS_MOUNT_NODC }, { NULL, 0, 0 } }; diff -druN Linux-2.4.4/fs/nfs/write.c linux-2.4.4-odirect/fs/nfs/write.c --- Linux-2.4.4/fs/nfs/write.c Tue Sep 4 13:34:53 2001 +++ linux-2.4.4-odirect/fs/nfs/write.c Thu Nov 15 11:52:34 2001 @@ -147,23 +147,6 @@ } /* - * This function will be used to simulate weak cache consistency - * under NFSv2 when the NFSv3 attribute patch is included. - * For the moment, we just call nfs_refresh_inode(). - */ -static __inline__ int -nfs_write_attributes(struct inode *inode, struct nfs_fattr *fattr) -{ - if ((fattr->valid & NFS_ATTR_FATTR) && !(fattr->valid & NFS_ATTR_WCC)) { - fattr->pre_size = NFS_CACHE_ISIZE(inode); - fattr->pre_mtime = NFS_CACHE_MTIME(inode); - fattr->pre_ctime = NFS_CACHE_CTIME(inode); - fattr->valid |= NFS_ATTR_WCC; - } - return nfs_refresh_inode(inode, fattr); -} - -/* * Write a page synchronously. * Offset is the data offset within the page. 
*/ diff -druN Linux-2.4.4/fs/super.c linux-2.4.4-odirect/fs/super.c --- Linux-2.4.4/fs/super.c Tue Sep 4 13:34:53 2001 +++ linux-2.4.4-odirect/fs/super.c Thu Nov 15 11:42:26 2001 @@ -486,6 +486,7 @@ { NFS_MOUNT_NOAC, ",noac", "" }, { NFS_MOUNT_NONLM, ",nolock", ",lock" }, { NFS_MOUNT_BROKEN_SUID, ",broken_suid", "" }, + { NFS_MOUNT_NODC, ",nodc", "" }, { 0, NULL, NULL } }; diff -druN Linux-2.4.4/include/linux/nfs_fs.h linux-2.4.4-odirect/include/linux/nfs_fs.h --- Linux-2.4.4/include/linux/nfs_fs.h Tue Sep 4 13:34:54 2001 +++ linux-2.4.4-odirect/include/linux/nfs_fs.h Thu Nov 15 11:58:03 2001 @@ -256,6 +256,12 @@ extern int nfs_pagein_timeout(struct inode *); /* + * linux/fs/nfs/direct.c + */ +extern int nfs_direct_write(struct file *, const char *, size_t, loff_t *); +extern int nfs_direct_read(struct file *, const char *, size_t, loff_t *); + +/* * linux/fs/mount_clnt.c * (Used only by nfsroot module) */ @@ -271,6 +277,23 @@ if (time_before(jiffies, NFS_READTIME(inode)+NFS_ATTRTIMEO(inode))) return NFS_STALE(inode) ? -ESTALE : 0; return __nfs_revalidate_inode(server, inode); +} + +/* + * This function will be used to simulate weak cache consistency + * under NFSv2 when the NFSv3 attribute patch is included. + * For the moment, we just call nfs_refresh_inode(). 
+ */ +static __inline__ int +nfs_write_attributes(struct inode *inode, struct nfs_fattr *fattr) +{ + if ((fattr->valid & NFS_ATTR_FATTR) && !(fattr->valid & NFS_ATTR_WCC)) { + fattr->pre_size = NFS_CACHE_ISIZE(inode); + fattr->pre_mtime = NFS_CACHE_MTIME(inode); + fattr->pre_ctime = NFS_CACHE_CTIME(inode); + fattr->valid |= NFS_ATTR_WCC; + } + return nfs_refresh_inode(inode, fattr); } static inline loff_t diff -druN Linux-2.4.4/include/linux/nfs_mount.h linux-2.4.4-odirect/include/linux/nfs_mount.h --- Linux-2.4.4/include/linux/nfs_mount.h Tue Sep 4 13:34:54 2001 +++ linux-2.4.4-odirect/include/linux/nfs_mount.h Thu Nov 15 12:00:14 2001 @@ -53,6 +53,7 @@ #define NFS_MOUNT_KERBEROS 0x0100 /* 3 */ #define NFS_MOUNT_NONLM 0x0200 /* 3 */ #define NFS_MOUNT_BROKEN_SUID 0x0400 /* 4 */ +#define NFS_MOUNT_NODC 0x0800 /* 4 */ #define NFS_MOUNT_FLAGMASK 0xFFFF #endif diff -druN Linux-2.4.4/include/linux/nfs_xdr.h linux-2.4.4-odirect/include/linux/nfs_xdr.h --- Linux-2.4.4/include/linux/nfs_xdr.h Tue Sep 4 13:34:54 2001 +++ linux-2.4.4-odirect/include/linux/nfs_xdr.h Thu Nov 15 11:55:31 2001 @@ -331,8 +331,8 @@ struct nfs_fattr *, int, loff_t, unsigned int, void *buffer, struct nfs_writeverf *verfp); - int (*commit) (struct inode *, struct nfs_fattr *, - unsigned long, unsigned int); + int (*commit) (struct inode *, struct nfs_fattr *, loff_t, + unsigned int, struct nfs_writeverf *verfp); int (*create) (struct inode *, struct qstr *, struct iattr *, int, struct nfs_fh *, struct nfs_fattr *); int (*remove) (struct inode *, struct qstr *); diff -druN Linux-2.4.4/net/sunrpc/xprt.c linux-2.4.4-odirect/net/sunrpc/xprt.c --- Linux-2.4.4/net/sunrpc/xprt.c Tue Sep 4 13:34:55 2001 +++ linux-2.4.4-odirect/net/sunrpc/xprt.c Thu Nov 29 17:24:47 2001 @@ -605,10 +605,14 @@ int to_move = cur_len; if (to_move > copied) to_move = copied; - if (need_csum) - csum = skb_copy_and_csum_bits(skb, offset, cur_ptr, - to_move, csum); - else + if (need_csum) { + unsigned int csum2; + + csum2 = 
skb_copy_and_csum_bits(skb, offset, + cur_ptr, + to_move, 0); + csum = csum_block_add(csum, csum2, offset); + } else skb_copy_bits(skb, offset, cur_ptr, to_move); offset += to_move; copied -= to_move; @@ -622,8 +626,12 @@ } } if (need_csum) { - if (slack > 0) - csum = skb_checksum(skb, offset, slack, csum); + if (slack > 0) { + unsigned int csum2; + + csum2 = skb_checksum(skb, offset, slack, 0); + csum = csum_block_add(csum, csum2, offset); + } if ((unsigned short)csum_fold(csum)) return -1; }