--- linux-2.6.14-pnfs-current-dhildebz/fs/nfs/Makefile | 3 linux-2.6.14-pnfs-current-dhildebz/fs/nfs/nfs4filelayout.c | 469 ++++++++++ linux-2.6.14-pnfs-current-dhildebz/fs/nfs/nfs4filelayout.h | 96 ++ linux-2.6.14-pnfs-current-dhildebz/fs/nfs/nfs4filelayoutdev.c | 406 ++++++++ 4 files changed, 974 insertions(+) diff -puN /dev/null fs/nfs/nfs4filelayout.c --- /dev/null 2006-01-09 05:56:56.224752500 -0500 +++ linux-2.6.14-pnfs-current-dhildebz/fs/nfs/nfs4filelayout.c 2006-01-18 20:12:38.047788000 -0500 @@ -0,0 +1,469 @@ +/* + * linux/fs/nfs/nfs4filelayout-mod.c + * + * Module for the pnfs nfs4 file layout driver. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Dean Hildebrand + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nfs4filelayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dean Hildebrand "); +MODULE_DESCRIPTION("The NFSv4 file layout driver"); + +extern void nfs_execute_read(struct nfs_read_data *data); +extern void nfs_readdata_release(struct rpc_task *task); +extern int nfs_flush_task_priority(int how); +extern void nfs_writedata_release(struct rpc_task *task); +extern void nfs_execute_write(struct nfs_write_data *data); +extern void nfs_commit_rpcsetup(struct nfs_write_data *data, int sync); + +/* Callback operations to the pNFS client */ +struct pnfs_client_operations * pnfs_callback_ops; + +/* Initialize a mountpoint by retrieving the list of + * available devices for it. + * Return the pnfs_mount_type structure so the + * pNFS_client can refer to the mount point later on. 
+ */ +struct pnfs_mount_type* +filelayout_initialize_mountpoint(struct super_block* sb) +{ + struct filelayout_mount_type* fl_mt; + struct pnfs_mount_type* mt; + struct pnfs_devicelist dlist; + int status; + + fl_mt = kmalloc(sizeof(struct filelayout_mount_type), GFP_KERNEL); + if (!fl_mt) + return NULL; + mt = kmalloc(sizeof(struct pnfs_mount_type), GFP_KERNEL); + if (!mt) + return NULL; + + fl_mt->fl_sb = sb; + mt->mountid = (void*)fl_mt; + + /* Retrieve device list from server*/ + status = pnfs_callback_ops->nfs_getdevicelist(sb, &dlist); + + /* Initialize nfs4 file layout specific device list structure */ + fl_mt->hlist = kmalloc(sizeof(struct nfs4_pnfs_dev_hlist), GFP_KERNEL); + if (!fl_mt->hlist) + return NULL; + status = nfs4_pnfs_devlist_init(fl_mt->hlist); + if (status) + return NULL; + + /* Decode opaque devicelist and add to list of available + * devices (data servers. + */ + status = decode_and_add_devicelist(fl_mt, &dlist); + if (status) + return NULL; + + return mt; +} + +/* Uninitialize a mountpoint by destroying its device list. + */ +int +filelayout_uninitialize_mountpoint(struct pnfs_mount_type* mountid) +{ + struct filelayout_mount_type* fl_mt = NULL; + + if (mountid) + fl_mt = (struct filelayout_mount_type*)mountid->mountid; + + nfs4_pnfs_devlist_destroy(fl_mt->hlist); + + if (fl_mt != NULL) + kfree(fl_mt); + kfree(mountid); + return 0; +} + +/* Perform sync or async reads. + * + * An optimization for the NFS file layout driver + * allows the original read/write data structs to be passed in the + * last argument. + * + * This is called after the pNFS client has already created, so I pass it + * in via the last argument (void*). I think this is the only way as there + * are just too many NFS specific arguments in the read/write data structs + * to pass to the layout drivers. + * + * TODO: + * 1. This is a lot of arguments, create special non-nfs-specific structure? 
+ */ +ssize_t filelayout_read_pagelist( + struct pnfs_layout_type * layoutid, + struct inode * inode, + struct page **pages, + unsigned int pgbase, + unsigned nr_pages, + loff_t offset, + size_t count, + void* read_data) +{ + struct nfs_read_data *data = (struct nfs_read_data*)read_data; + struct nfs4_filelayout* nfslay = (struct nfs4_filelayout*)layoutid->layoutid; + struct nfs4_pnfs_dserver dserver; + int status; + + /* Retrieve the correct rpc_client for the byte range */ + status = nfs4_pnfs_dserver_get(inode, + nfslay, + offset, + count, + &dserver); + if(!status) { + data->pnfs_client = dserver.dev_item->rpc_clnt; + data->args.fh = dserver.fh; + } + + /* Perform a syncronous or asyncronous read */ + if (IS_SYNC(inode)) + { + /* sync */ + status = NFS_PROTO(inode)->read(data); + } + else + { + /* async */ + NFS_PROTO(inode)->read_setup(data); + data->task.tk_cookie = (unsigned long)inode; + data->task.tk_calldata = data; + /* Release requests */ + data->task.tk_release = nfs_readdata_release; + + dprintk("NFS: %4d initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + count, + (unsigned long long)data->args.offset); + nfs_execute_read(data); + status = 0; + } + return status; +} + +/* Perform sync or async writes. + * + * TODO: See filelayout_read_pagelist. 
+ */ +ssize_t filelayout_write_pagelist( + struct pnfs_layout_type * layoutid, + struct inode * inode, + struct page **pages, + unsigned int pgbase, + unsigned nr_pages, + loff_t offset, + size_t count, + int sync, + void* write_data) +{ + struct nfs_write_data *data = (struct nfs_write_data*)write_data; + struct nfs4_filelayout* nfslay = (struct nfs4_filelayout*)layoutid->layoutid; + struct nfs4_pnfs_dserver dserver; + int status; + + /* Retrieve the correct rpc_client for the byte range */ + status = nfs4_pnfs_dserver_get(inode, + nfslay, + offset, + count, + &dserver); + if(!status) { + data->pnfs_client = dserver.dev_item->rpc_clnt; + data->args.fh = dserver.fh; + } + + /* Perform a syncronous or asyncronous read */ + /* TODO: What is the different between file and data sync? */ + if (sync == NFS_FILE_SYNC || sync == NFS_DATA_SYNC) + { + /* sync */ + status = NFS_PROTO(inode)->write(data); + } + else + { + /* async */ + NFS_PROTO(inode)->write_setup(data, sync); + data->task.tk_priority = nfs_flush_task_priority(sync); + data->task.tk_cookie = (unsigned long)inode; + data->task.tk_calldata = data; + /* Release requests */ + data->task.tk_release = nfs_writedata_release; + + dprintk("NFS: %4d initiated write call (req %s/%Ld, %u bytes @ offset %Lu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + count, + (unsigned long long)data->args.offset); + nfs_execute_write(data); + status = 0; + } + return status; +} + +/* Create a filelayout layout structure and return it. The pNFS client + * will use the pnfs_layout_type type to refer to the layout for this + * inode from now on. 
+ */ +struct pnfs_layout_type* +filelayout_alloc_layout(struct pnfs_mount_type * mountid, struct inode * inode) +{ + struct pnfs_layout_type* pnfslay = NULL; + struct nfs4_filelayout* nfslay = NULL; + + pnfslay = kmalloc(sizeof(struct pnfs_layout_type), GFP_KERNEL); + if (!pnfslay) + return NULL; + nfslay = kmalloc(sizeof(struct nfs4_filelayout), GFP_KERNEL); + if (!nfslay) + return NULL; + + pnfslay->layoutid = (void*)nfslay; + pnfslay->mountid = mountid; + return pnfslay; +} + +/* Free a filelayout layout structure + */ +void +filelayout_free_layout(struct pnfs_layout_type * layoutid, struct inode * inode) +{ + struct nfs4_filelayout* nfslay = NULL; + if (layoutid) + nfslay = (struct nfs4_filelayout*)layoutid->layoutid; + if (nfslay != NULL) + kfree(nfslay); + kfree(layoutid); +} + +/* Decode layout and store in layoutid. Overwrite any existing layout + * information for this file. + */ +struct pnfs_layout_type* +filelayout_set_layout(struct pnfs_layout_type* layoutid, + struct inode* inode, + void* layout) +{ + struct nfs4_filelayout* fl = NULL; + int i; + uint32_t *p = (uint32_t*)layout, class; + + dprintk("%s set_layout_map Begin\n", __FUNCTION__); + + if (!layoutid) + goto nfserr; + fl = (struct nfs4_filelayout*)layoutid->layoutid; + if (!fl) + goto nfserr; + + /* Decode the layout here */ + READ64(fl->offset); + READ64(fl->length); + READ32(fl->iomode); + READ32(class); /* class */ + + dprintk("DEBUG: %s: class %d\n", __func__, class); + if (class != LAYOUT_NFSV4_FILES) { + return NULL; + } + + /* layout */ + READ32(fl->stripe_type); + READ64(fl->stripe_unit); + READ64(fl->file_size); + READ32(fl->num_devs); + + dprintk("DEBUG: %s: stripe_unit %lld file_size %lld devs %d\n", + __func__, fl->stripe_unit, fl->file_size, fl->num_devs); + for (i = 0; i < fl->num_devs; i++) { + /* dev_id */ + READ32(fl->devs[i].dev_id); + + /* fh */ + memset(&fl->devs[i].fh, 0, sizeof(struct nfs_fh)); + READ32(fl->devs[i].fh.size); + COPYMEM(fl->devs[i].fh.data, 
fl->devs[i].fh.size); + dprintk("DEBUG: %s: dev %d len %d\n", __func__, + fl->devs[i].dev_id,fl->devs[i].fh.size); + } + + return layoutid; +nfserr: + return NULL; +} + +/* Call nfs fsync function to flush buffers and eventually call + * the filelayout_write_pagelist and filelayout_commit functions. + */ +int +filelayout_fsync( + struct pnfs_layout_type * layoutid, + struct file *file, + struct dentry *dentry, + int datasync) +{ + return pnfs_callback_ops->nfs_fsync(file, dentry, datasync); +} + +/* Technically we would need to execute a COMMIT op to each + * data server on which a page in 'pages' exists. + * With a cluster file system, all we need is a single commit to the MDS. + */ +int +filelayout_commit(struct pnfs_layout_type * layoutid, struct inode* ino, struct list_head *pages, int sync, void* private) +{ + struct nfs_write_data *data = (struct nfs_write_data*)private; + + nfs_commit_rpcsetup(data, sync); + nfs_execute_write(data); + return 0; +} + +/* Return the stripesize for the specified file. 
+ */ +ssize_t +filelayout_get_stripesize(struct pnfs_layout_type* layoutid, struct inode* inode) +{ + struct nfs4_filelayout* fl = (struct nfs4_filelayout*)layoutid->layoutid; + ssize_t stripesize = fl->stripe_unit; + return stripesize; +} + +/* Split wsize/rsize chunks so they do not span multiple data servers + */ +int +filelayout_gather_across_stripes(struct pnfs_mount_type* mountid) +{ + return 0; +} + +/* Use the NFSv4 page cache +*/ +int +filelayout_use_pagecache(struct pnfs_layout_type* layoutid, struct inode* inode) +{ + return 1; +} + +/* Issue a layoutget in the same compound as OPEN + */ +int +filelayout_layoutget_on_open(struct pnfs_mount_type* mountid) +{ + return 1; +} + +struct layoutdriver_io_operations filelayout_io_operations = +{ + .fsync = filelayout_fsync, + .commit = filelayout_commit, + .read_pagelist = filelayout_read_pagelist, + .write_pagelist = filelayout_write_pagelist, + .set_layout = filelayout_set_layout, + .alloc_layout = filelayout_alloc_layout, + .free_layout = filelayout_free_layout, + .initialize_mountpoint = filelayout_initialize_mountpoint, + .uninitialize_mountpoint = filelayout_uninitialize_mountpoint, +}; + +struct layoutdriver_policy_operations filelayout_policy_operations = +{ + .get_stripesize = filelayout_get_stripesize, + .gather_across_stripes = filelayout_gather_across_stripes, + .use_pagecache = filelayout_use_pagecache, + .layoutget_on_open = filelayout_layoutget_on_open, +}; + + +struct pnfs_layoutdriver_type filelayout_type = +{ + .id = LAYOUT_NFSV4_FILES, + .name = "LAYOUT_NFSV4_FILES", + .ld_io_ops = &filelayout_io_operations, + .ld_policy_ops = &filelayout_policy_operations, +}; + +static int __init nfs4filelayout_init(void) +{ + printk("%s: NFSv4 File Layout Driver Registering...\n", __FUNCTION__); + + /* Need to register file_operations struct with global list to indicate + * that NFS4 file layout is a possible pNFS I/O module + */ + pnfs_callback_ops = pnfs_register_layoutdriver(&filelayout_type); + + return 
0; +} + +static void __exit nfs4filelayout_exit(void) +{ + printk("%s: NFSv4 File Layout Driver Unregistering...\n", __FUNCTION__); + + /* Unregister NFS4 file layout driver with pNFS client*/ + pnfs_unregister_layoutdriver(&filelayout_type); +} + +module_init(nfs4filelayout_init); +module_exit(nfs4filelayout_exit); + +/* + * Local variables: + * c-indent-level: 4 + * c-basic-offset: 4 + * End: + * + * vim: ts=8 sts=4 sw=4 expandtab + */ diff -puN fs/nfs/Makefile~client-nfs4filelayoutdriver fs/nfs/Makefile --- linux-2.6.14-pnfs-current/fs/nfs/Makefile~client-nfs4filelayoutdriver 2006-01-18 20:11:57.168279000 -0500 +++ linux-2.6.14-pnfs-current-dhildebz/fs/nfs/Makefile 2006-01-18 20:11:57.190280000 -0500 @@ -15,3 +15,6 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4x pnfs.o nfs-$(CONFIG_NFS_DIRECTIO) += direct.o nfs-objs := $(nfs-y) + +obj-m += nfslayoutdriver.o +nfslayoutdriver-objs := nfs4filelayout.o nfs4filelayoutdev.o diff -puN /dev/null fs/nfs/nfs4filelayout.h --- /dev/null 2006-01-09 05:56:56.224752500 -0500 +++ linux-2.6.14-pnfs-current-dhildebz/fs/nfs/nfs4filelayout.h 2006-01-18 20:11:57.196280000 -0500 @@ -0,0 +1,96 @@ +/* + * pnfs_nfs4filelayout.h + * + * NFSv4 file layout driver data structures. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. 
+ *
+ * Dean Hildebrand
+ */
+
+#ifndef FS_NFS_NFS4FILELAYOUT_H
+#define FS_NFS_NFS4FILELAYOUT_H
+
+#include
+
+#define NFS4_PNFS_DEV_HASH_BITS 5 /* bit count passed to hash_long() when bucketing a dev_id */
+#define NFS4_PNFS_DEV_HASH (1 << NFS4_PNFS_DEV_HASH_BITS) /* number of buckets in dev_list[] */
+
+#define NFS4_PNFS_MAX_DEVS 16 /* capacity of devs[] in struct nfs4_filelayout */
+
+struct nfs4_pnfs_dev_item { /* one data server kept on a mount point's device list */
+    struct hlist_node hash_node; /* links the item into a dev_list[] bucket */
+    u32 dev_id; /* lookup key, hashed with hash_long() */
+    u32 ip_addr; /* IPv4 address, stored through htonl() by decode_device() */
+    u32 port; /* port, stored through htons() by decode_device() */
+    atomic_t count; /* set to 0 by decode_device(); decremented by nfs4_pnfs_device_put() */
+    struct rpc_clnt *rpc_clnt; /* assigned by device_create(), shut down by device_destroy() */
+};
+
+struct nfs4_pnfs_dev_hlist { /* per mount point hash table of devices */
+    rwlock_t dev_lock; /* taken with read_lock()/write_lock() around lookup and insert */
+    struct hlist_head dev_list[NFS4_PNFS_DEV_HASH]; /* buckets of struct nfs4_pnfs_dev_item */
+};
+
+struct nfs4_pnfs_devaddr { /* NOTE(review): not referenced by the driver code in this patch */
+    u32 dev_id;
+    u32 ip;
+    u16 port;
+};
+
+struct nfs4_pnfs_devlist { /* NOTE(review): not referenced by the driver code in this patch */
+    struct list_head devlist;
+    struct nfs4_pnfs_devaddr devaddr;
+};
+
+struct nfs4_pnfs_dserver { /* result filled in by nfs4_pnfs_dserver_get() */
+    struct nfs_fh *fh; /* points at the devs[].fh of the stripe chosen for the byte range */
+    struct nfs4_pnfs_dev_item *dev_item; /* device returned by nfs4_pnfs_device_get() */
+};
+
+struct nfs4_filelayout_devs { /* one entry per device listed in a layout */
+    u32 dev_id; /* decoded by filelayout_set_layout() */
+    struct nfs_fh fh; /* filehandle decoded by filelayout_set_layout() */
+};
+
+struct nfs4_filelayout { /* driver private layout, hung off pnfs_layout_type.layoutid */
+    int uncommitted_write; /* not written by filelayout_set_layout() */
+    loff_t last_commit_size; /* not written by filelayout_set_layout() */
+    u64 layout_id; /* not written by filelayout_set_layout() */
+    u64 offset; /* decoded by filelayout_set_layout() */
+    u64 length; /* decoded by filelayout_set_layout() */
+    u32 iomode; /* decoded by filelayout_set_layout() */
+    u64 file_size; /* decoded by filelayout_set_layout() */
+    u32 stripe_type; /* decoded by filelayout_set_layout() */
+    u64 stripe_unit; /* divisor used by nfs4_pnfs_dserver_get() to turn an offset into a stripe number */
+    unsigned int num_devs; /* number of devs[] entries decoded by filelayout_set_layout() */
+    struct nfs4_filelayout_devs devs[NFS4_PNFS_MAX_DEVS]; /* indexed by stripe number modulo num_devs */
+};
+
+struct filelayout_mount_type { /* driver private mount data, hung off pnfs_mount_type.mountid */
+    struct super_block* fl_sb; /* super block passed to filelayout_initialize_mountpoint() */
+    struct nfs4_pnfs_dev_hlist *hlist; /* device table allocated at mount time */
+};
+
+int nfs4_pnfs_devlist_init(struct nfs4_pnfs_dev_hlist *hlist); /* prepares lock and buckets; returns 0 */
+void nfs4_pnfs_devlist_destroy(struct nfs4_pnfs_dev_hlist *hlist); /* destroys every device; NULL is ignored */
+
+int nfs4_pnfs_dserver_get(struct inode *inode,
+                          struct nfs4_filelayout *layout,
+                          u64 offset,
+                          u32 count,
+                          struct nfs4_pnfs_dserver *dserver); /* returns 0 on success, 1 on failure */
+int decode_and_add_devicelist(struct filelayout_mount_type *mt, struct pnfs_devicelist* devlist); /* returns 0 on success, 1 on failure */
+
+#define READ32(x) (x) = ntohl(*p++) /* this and the two macros below read through a local 'uint32_t *p' cursor and advance it */
+#define READ64(x) do { \
+    (x) = (u64)ntohl(*p++) << 32; \
+    (x) |= ntohl(*p++); \
+} while (0) /* high word first, then low word */
+#define COPYMEM(x,nbytes) do { \
+    memcpy((x), p, nbytes); \
+    p += XDR_QUADLEN(nbytes); \
+} while (0) /* cursor advances by XDR_QUADLEN(nbytes) words */
+
+#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff -puN /dev/null fs/nfs/nfs4filelayoutdev.c
--- /dev/null 2006-01-09 05:56:56.224752500 -0500
+++ 
linux-2.6.14-pnfs-current-dhildebz/fs/nfs/nfs4filelayoutdev.c 2006-01-18 20:11:57.203279000 -0500 @@ -0,0 +1,406 @@ +/* + * linux/fs/nfs/nfs4filelayout-mod.c + * + * Module for the pnfs nfs4 file layout driver. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Dean Hildebrand + * Garth Goodson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "nfs4filelayout.h" + +extern struct pnfs_client_operations * pnfs_callback_ops; + +struct rpc_clnt* create_nfs_rpcclient(struct rpc_xprt *xprt, + char* server_name, + u32 version, + rpc_authflavor_t authflavor, + int *err); + +/* Assumes lock is held */ +static inline struct nfs4_pnfs_dev_item * +_device_lookup(struct nfs4_pnfs_dev_hlist *hlist, u32 dev_id) +{ + unsigned long hash; + struct hlist_node *np; + + printk("_device_lookup: dev_id=%u\n", dev_id); + + hash = hash_long(dev_id, NFS4_PNFS_DEV_HASH_BITS); + + hlist_for_each(np, &hlist->dev_list[hash]) { + struct nfs4_pnfs_dev_item *dev; + dev = hlist_entry(np, struct nfs4_pnfs_dev_item, hash_node); + if (dev->dev_id == dev_id) { + return dev; + } + } + return NULL; +} + +/* Assumes lock is held */ +static inline void +_device_add(struct nfs4_pnfs_dev_hlist *hlist, struct nfs4_pnfs_dev_item *dev) +{ + unsigned long hash; + + printk("_device_add: dev_id=%u, ip=%x, port=%hu\n", dev->dev_id, + dev->ip_addr, dev->port); + + hash = hash_long(dev->dev_id, NFS4_PNFS_DEV_HASH_BITS); + hlist_add_head(&dev->hash_node, &hlist->dev_list[hash]); +} + +/* Create an rpc to the data server defined in 'dev' */ +static int +device_create(struct rpc_clnt *mds_rpc, struct nfs4_pnfs_dev_item *dev) +{ + struct rpc_clnt *clnt; + struct rpc_xprt *xprt; + struct sockaddr_in sin; + int err = 0; + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = dev->ip_addr; + sin.sin_port = dev->port; + + printk("device_create: dev_id=%u, ip=%x, port=%hu\n", dev->dev_id, dev->ip_addr, dev->port); + + xprt = xprt_create_proto(IPPROTO_TCP, &sin, + &mds_rpc->cl_xprt->timeout); + if (IS_ERR(xprt)) { + err = PTR_ERR(xprt); + goto out; + } + + clnt = create_nfs_rpcclient(xprt, "nfs4_pnfs_dserver", mds_rpc->cl_vers, mds_rpc->cl_auth->au_flavor, &err); + if (clnt == NULL) { + printk("%s: Can't create nfs rpc client!\n", __FUNCTION__); + 
goto out; + } + + dev->rpc_clnt = clnt; + + out: + return err; +} + +static void +device_destroy(struct nfs4_pnfs_dev_item *dev) +{ + if (!dev) + return; +/* BUG_ON(!atomic_sub_and_test(0, &dev->count)); */ + rpc_shutdown_client(dev->rpc_clnt); + kfree(dev); +} + +int +nfs4_pnfs_devlist_init(struct nfs4_pnfs_dev_hlist *hlist) +{ + int i; + + hlist->dev_lock = RW_LOCK_UNLOCKED; + + for (i = 0; i < NFS4_PNFS_DEV_HASH; i++) { + INIT_HLIST_HEAD(&hlist->dev_list[i]); + } + + return 0; +} + +/* De-alloc all devices for a mount point. This is called in + * nfs4_kill_super. + */ +void +nfs4_pnfs_devlist_destroy(struct nfs4_pnfs_dev_hlist *hlist) +{ + int i; + + if(hlist == NULL) + return; + + /* No lock held, as synchronization should occur at upper levels */ + for (i = 0; i < NFS4_PNFS_DEV_HASH; i++) { + struct hlist_node *np, *next; + + hlist_for_each_safe(np, next, &hlist->dev_list[i]) { + struct nfs4_pnfs_dev_item *dev; + dev = hlist_entry(np, struct nfs4_pnfs_dev_item, hash_node); + hlist_del_rcu(&dev->hash_node); + device_destroy(dev); + } + } +} + +/* Create the rpc client to the data server specific in + * 'dev', and add it to the list of available devices + * for this mount point. 
+ */ +static int +nfs4_pnfs_device_add(struct filelayout_mount_type *mt, + struct nfs4_pnfs_dev_item *dev) +{ + struct nfs4_pnfs_dev_item *tmp_dev; + int err; + struct nfs4_pnfs_dev_hlist *hlist = mt->hlist; + struct nfs_server *server = NFS_SB(mt->fl_sb); + + printk("nfs4_pnfs_device_add\n"); + + /* Create device */ + err = device_create(server->client, dev); + if (err) + return err; + + /* Write lock, do lookup again, and then add device */ + write_lock(&hlist->dev_lock); + tmp_dev = _device_lookup(hlist, dev->dev_id); + if (tmp_dev == NULL) { + _device_add(hlist, dev); + } + write_unlock(&hlist->dev_lock); + + /* Cleanup, if device was recently added */ + if (tmp_dev != NULL) { + printk(" device found, not adding (after creation)\n"); + device_destroy(dev); + } + + return 0; +} + +/* Decode opaque device data and return the result + */ +static struct nfs4_pnfs_dev_item* +decode_device(struct pnfs_device* dev) +{ + int len; + int tmp[6]; + uint32_t *p = (uint32_t*)dev->dev_addr_buf; + struct nfs4_pnfs_dev_item* file_dev; + char *r_addr; + + if ((file_dev = kmalloc(sizeof(struct nfs4_pnfs_dev_item), GFP_KERNEL)) == NULL) + { + return NULL; + } + + /* Initialize dev */ + INIT_HLIST_NODE(&file_dev->hash_node); + atomic_set(&file_dev->count, 0); + + /* Device id */ + file_dev->dev_id = dev->dev_id; + + /* Decode contents of device*/ + + /* device addr -- r_netid, r_addr */ + + /* check and skip r_netid */ + READ32(len); + if (len != 3) /* "tcp" */ + return NULL; + /* TODO: Don't we read the tcp bytes? */ + + READ32(len); + r_addr = (char*)p; + sscanf(r_addr, "%d.%d.%d.%d.%d.%d", &tmp[0], &tmp[1], + &tmp[2], &tmp[3], &tmp[4], &tmp[5]); + file_dev->ip_addr = htonl((tmp[0]<<24) | (tmp[1]<<16) | + (tmp[2]<<8) | (tmp[3])); + file_dev->port = htons((tmp[4] << 8) | (tmp[5])); + + return file_dev; +} + +/* Decode the opaque device specified in 'dev' + * and add it to the list of available devices for this + * mount point. 
+ * Must at some point be followed up with device_destroy + */ +static struct nfs4_pnfs_dev_item* +decode_and_add_device(struct filelayout_mount_type *mt, struct pnfs_device* dev) +{ + struct nfs4_pnfs_dev_item* file_dev; + + file_dev = decode_device(dev); + + if (!file_dev) + { + printk("%s Could not decode device\n", __FUNCTION__); + return NULL; + } + + if (nfs4_pnfs_device_add(mt, file_dev)) + return NULL; + return file_dev; +} + +/* Decode the opaque device list in 'devlist' + * and add it to the list of available devices for this + * mount point. + * Must at some point be followed up with device_destroy. + */ +int +decode_and_add_devicelist(struct filelayout_mount_type *mt, struct pnfs_devicelist* devlist) +{ + int i, cnt; + + for (i = 0,cnt=0; i < devlist->num_devs && cnt < NFS4_PNFS_DEV_MAXCOUNT; i++) { + if (!decode_and_add_device(mt, &devlist->devs[cnt])) + return 1; + cnt++; + } + return 0; +} + +/* Retrieve the information for dev_id, add it to the list + * of available devices, and return it. + */ +static struct nfs4_pnfs_dev_item * +get_device_info(struct filelayout_mount_type *mt, u32 dev_id) +{ + int rc; + struct pnfs_device *pdev = NULL; + + if ((pdev = kmalloc(sizeof(struct pnfs_device), GFP_KERNEL)) == NULL) + { + return NULL; + } + rc = pnfs_callback_ops->nfs_getdeviceinfo(mt->fl_sb, dev_id, pdev); + if (rc) { + return NULL; + } + + /* Found new device, need to decode it and then add it to the + * list of known devices for this mountpoint. 
+ */ + return decode_and_add_device(mt, pdev); +} + +/* Lookup and return the device dev_id + */ +static struct nfs4_pnfs_dev_item * +nfs4_pnfs_device_get(struct inode *inode, u32 dev_id) +{ + struct nfs4_pnfs_dev_item *dev; + struct nfs_server* server = NFS_SERVER(inode); + struct filelayout_mount_type *mt = (struct filelayout_mount_type*)server->pnfs_mountid->mountid; + struct nfs4_pnfs_dev_hlist *hlist = mt->hlist; + + read_lock(&hlist->dev_lock); + dev = _device_lookup(hlist, dev_id); +/* + if (dev) { + atomic_inc(&dev->count); + } +*/ + read_unlock(&hlist->dev_lock); + if (dev == NULL) + dev = get_device_info(mt, dev_id); + + return dev; +} + +/* Retrieve the rpc client for a specified byte range + * in 'inode' by filling in the contents of 'dserver'. + */ +int +nfs4_pnfs_dserver_get(struct inode *inode, + struct nfs4_filelayout *layout, + u64 offset, + u32 count, + struct nfs4_pnfs_dserver *dserver) +{ + u32 dev_id; + u64 tmp; + u32 stripe_idx, dbg_stripe_idx; + + if(!layout) + return 1; + + tmp = offset; + /* Want ((offset / layout->stripe_unit) % layout->num_devs) */ + do_div(tmp, layout->stripe_unit); + stripe_idx = do_div(tmp, layout->num_devs); + + /* For debugging */ + tmp = offset + count - 1; + do_div(tmp, layout->stripe_unit); + dbg_stripe_idx = do_div(tmp, layout->num_devs); + + printk("nfsv4_pnfs_dserver_get: offset=%Lu, count=%u, si=%u, dsi=%u, " + "num_devs=%u, stripe_unit=%Lu\n", + offset, count, stripe_idx, dbg_stripe_idx, layout->num_devs, + layout->stripe_unit); + + BUG_ON(dbg_stripe_idx != stripe_idx); + + dev_id = layout->devs[stripe_idx].dev_id; + + dserver->dev_item = nfs4_pnfs_device_get(inode, dev_id); + if (dserver->dev_item == NULL) + return 1; + dserver->fh = &layout->devs[stripe_idx].fh; + + printk("nfs4_pnfs_dserver_get: dev_id=%u, idx=%u, offset=%Lu, count=%u\n", + dev_id, stripe_idx, offset, count); + + return 0; +} + +/* Currently not used. 
+ * I have disabled checking the device count until we can think of a good way
+ * to call nfs4_pnfs_device_put in a generic way from the pNFS client.
+ * The only way I think think of is to put the nfs4_pnfs_dev_item directly
+ * in the nfs4_write/read_data structure, which breaks the clear line between
+ * the pNFS client and layout drivers. If I did do this, then I could call
+ * an ioctl on the NFSv4 file layout driver to decrement the device count.
+ */
+static void
+nfs4_pnfs_device_put(struct nfs4_pnfs_dev_hlist *hlist, struct nfs4_pnfs_dev_item *dev) /* 'hlist' is not referenced in the body; 'dev' must not be NULL */
+{
+    printk("nfs4_pnfs_dserver_put: dev_id=%u\n", dev->dev_id); /* NOTE(review): the message names "dserver_put", not this function */
+    atomic_dec(&dev->count); /* drops one reference; nothing visible in this file increments 'count' */
+}
_