Copyright (c) 2006 The Regents of the University of Michigan All rights reserved. Permission is granted to use, copy, create derivative works and redistribute this software and such derivative works for any purpose, so long as the name of the University of Michigan is not used in any advertising or publicity pertaining to the use or distribution of this software without specific, written prior authorization. If the above copyright notice or any other identification of the university of michigan is included in any copy of any portion of this software, then the disclaimer below must also be included. This software is provided as is, without representation from the University of Michigan as to its fitness for any purpose, and without warranty by the university of michigan of any kind, either express or implied, including without limitation the implied warranties of merchantability and fitness for a particular purpose. The Regents of the University of Michigan shall not be liable for any damages, including special, indirect, incidental, or consequential damages, with respect to any claim arising out or in connection with the use of the software, even if it has been or is hereafter advised of the possibility of such damages. Signed-off-by: George Dunlap diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/fs/Kconfig linux-2.6.14.3-RPCRDMA/fs/Kconfig --- linux-2.6.14.3/fs/Kconfig 2005-11-24 17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/fs/Kconfig 2005-12-08 11:50:23.000000000 -0500 @@ -1447,6 +1447,15 @@ config NFSD_TCP TCP connections usually perform better than the default UDP when the network is lossy or congested. If unsure, say Y. +config NFSD_RDMA + tristate "Provide NFS server over RDMA support" + select SUNRPC_RDMA + default n + help + If you want your NFS server to support RDMA connections, say Y here. + Automatically selects SUNRPC_RDMA as well. + If unsure, say N. 
+ config ROOT_NFS bool "Root file system on NFS" depends on NFS_FS=y && IP_PNP @@ -1471,6 +1480,9 @@ config LOCKD_V4 config EXPORTFS tristate + +config SUNRPC_RDMA + tristate config NFS_ACL_SUPPORT tristate diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/fs/nfs/callback.c linux-2.6.14.3-RPCRDMA/fs/nfs/callback.c --- linux-2.6.14.3/fs/nfs/callback.c 2005-11-24 17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/fs/nfs/callback.c 2005-12-08 11:50:23.000000000 -0500 @@ -81,7 +81,7 @@ static void nfs_callback_svc(struct svc_ int nfs_callback_up(void) { struct svc_serv *serv; - struct svc_sock *svsk; + struct svc_xprt *svsk; int ret = 0; lock_kernel(); @@ -98,9 +98,9 @@ int nfs_callback_up(void) ret = svc_makesock(serv, IPPROTO_TCP, 0); if (ret < 0) goto out_destroy; - if (!list_empty(&serv->sv_permsocks)) { - svsk = list_entry(serv->sv_permsocks.next, - struct svc_sock, sk_list); + if (!list_empty(&serv->sv_permxprts)) { + svsk = list_entry(serv->sv_permxprts.next, + struct svc_xprt, sk_list); nfs_callback_tcpport = ntohs(inet_sk(svsk->sk_sk)->sport); dprintk ("Callback port = 0x%x\n", nfs_callback_tcpport); } else diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/fs/nfsd/nfs3proc.c linux-2.6.14.3-RPCRDMA/fs/nfsd/nfs3proc.c --- linux-2.6.14.3/fs/nfsd/nfs3proc.c 2005-11-24 17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/fs/nfsd/nfs3proc.c 2005-12-08 11:50:23.000000000 -0500 @@ -547,6 +547,8 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqst nfserr = fh_verify(rqstp, &argp->fh, 0, MAY_NOP); + dprintk("%s: post fh_verify nfserr = %d\n", __FUNCTION__, nfserr); + /* Check special features of the file system. 
May request * different read/write sizes for file systems known to have * problems with large blocks */ @@ -561,6 +563,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqst } fh_put(&argp->fh); + dprintk("%s: nfserr = %d\n", __FUNCTION__, nfserr); RETURN_STATUS(nfserr); } diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/fs/nfsd/nfs3xdr.c linux-2.6.14.3-RPCRDMA/fs/nfsd/nfs3xdr.c --- linux-2.6.14.3/fs/nfsd/nfs3xdr.c 2005-11-24 17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/fs/nfsd/nfs3xdr.c 2005-12-08 11:50:23.000000000 -0500 @@ -1032,8 +1032,12 @@ int nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, u32 *p, struct nfsd3_fsinfores *resp) { + dprintk("%s: writing response starting at %p\n", __FUNCTION__, p); + *p++ = xdr_zero; /* no post_op_attr */ + dprintk("%s: resp->status = %d\n", __FUNCTION__, resp->status); + if (resp->status == 0) { *p++ = htonl(resp->f_rtmax); *p++ = htonl(resp->f_rtpref); @@ -1048,6 +1052,8 @@ nfs3svc_encode_fsinfores(struct svc_rqst *p++ = htonl(resp->f_properties); } + dprintk("%s: response ending at %p\n", __FUNCTION__, p); + return xdr_ressize_check(rqstp, p); } diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/fs/nfsd/nfsfh.c linux-2.6.14.3-RPCRDMA/fs/nfsd/nfsfh.c --- linux-2.6.14.3/fs/nfsd/nfsfh.c 2005-11-24 17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/fs/nfsd/nfsfh.c 2005-12-08 11:50:23.000000000 -0500 @@ -130,6 +130,7 @@ fh_verify(struct svc_rqst *rqstp, struct int data_left = fh->fh_size/4; error = nfserr_stale; + dprintk("%s: rqstp->rq_client = %p\n", __FUNCTION__, rqstp->rq_client); if (rqstp->rq_client == NULL) goto out; if (rqstp->rq_vers > 2) diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/fs/nfsd/nfssvc.c linux-2.6.14.3-RPCRDMA/fs/nfsd/nfssvc.c --- linux-2.6.14.3/fs/nfsd/nfssvc.c 2005-11-24 17:10:21.000000000 -0500 +++ 
linux-2.6.14.3-RPCRDMA/fs/nfsd/nfssvc.c 2005-12-08 11:50:23.000000000 -0500 @@ -104,15 +104,28 @@ nfsd_svc(unsigned short port, int nrserv nfsd_serv = svc_create(&nfsd_program, NFSD_BUFSIZE); if (nfsd_serv == NULL) goto out; + dprintk("nfsd: calling svc_makesock for UDP\n"); error = svc_makesock(nfsd_serv, IPPROTO_UDP, port); + dprintk("%s: error = %d\n", __FUNCTION__, error); if (error < 0) goto failure; #ifdef CONFIG_NFSD_TCP + dprintk("nfsd: calling svc_makesock for TCP\n"); error = svc_makesock(nfsd_serv, IPPROTO_TCP, port); + dprintk("%s: error = %d\n", __FUNCTION__, error); if (error < 0) goto failure; #endif +/* XXX There should be a condition on CONFIG_NFSD_RDMA here */ + /* XXX we need a new protocol constant for RDMA */ + dprintk("nfsd: calling svc_makesock for RDMA\n"); + error = svc_makesock(nfsd_serv, IPPROTO_MAX + 1, port); +/* error = svc_makexprt(nfsd_serv, IPPROTO_MAX + 1, port); */ + dprintk("%s: error = %d\n", __FUNCTION__, error); + if (error < 0) + goto failure; + do_gettimeofday(&nfssvc_boot); /* record boot time */ } else nfsd_serv->sv_nrthreads++; @@ -349,6 +362,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, u3 */ if (!(nfserr && rqstp->rq_vers == 2)) { xdr = proc->pc_encode; + if (xdr && !xdr(rqstp, nfserrp, rqstp->rq_resp)) { /* Failed to encode result. Release cache entry */ diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/linux/sunrpc/svc.h linux-2.6.14.3-RPCRDMA/include/linux/sunrpc/svc.h --- linux-2.6.14.3/include/linux/sunrpc/svc.h 2005-11-24 17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/linux/sunrpc/svc.h 2005-12-08 11:50:23.000000000 -0500 @@ -26,6 +26,9 @@ * a list of idle threads waiting for input. * * We currently do not support more than one RPC program per daemon. 
+ * + * Deprecated fields to remove + * - sv_sockets */ struct svc_serv { struct list_head sv_threads; /* idle server threads */ @@ -37,8 +40,8 @@ struct svc_serv { unsigned int sv_bufsz; /* datagram buffer size */ unsigned int sv_xdrsize; /* XDR buffer size */ - struct list_head sv_permsocks; /* all permanent sockets */ - struct list_head sv_tempsocks; /* all temporary sockets */ + struct list_head sv_permxprts; /* all permanent transports */ + struct list_head sv_tempxprts; /* all temporary transports */ int sv_tmpcnt; /* count of temporary sockets */ char * sv_name; /* service name */ @@ -110,7 +113,7 @@ static inline void svc_putu32(struct kve */ struct svc_rqst { struct list_head rq_list; /* idle list */ - struct svc_sock * rq_sock; /* socket */ + struct svc_xprt * rq_sock; /* socket */ struct sockaddr_in rq_addr; /* peer address */ int rq_addrlen; @@ -244,7 +247,7 @@ static inline void svc_free_allpages(str struct svc_deferred_req { u32 prot; /* protocol (UDP or TCP) */ struct sockaddr_in addr; - struct svc_sock *svsk; /* where reply must go */ + struct svc_xprt *svsk; /* where reply must go */ struct cache_deferred_req handle; int argslen; u32 args[0]; diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/linux/sunrpc/svcsock.h linux-2.6.14.3-RPCRDMA/include/linux/sunrpc/svcsock.h --- linux-2.6.14.3/include/linux/sunrpc/svcsock.h 2005-11-24 17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/linux/sunrpc/svcsock.h 2005-12-08 11:50:23.000000000 -0500 @@ -1,7 +1,7 @@ /* * linux/include/linux/sunrpc/svcsock.h * - * RPC server socket I/O. + * RPC server transport-independent I/O. * * Copyright (C) 1995, 1996 Olaf Kirch */ @@ -12,15 +12,18 @@ #include /* - * RPC server socket. 
+ * RPC transport instance */ -struct svc_sock { +struct svc_xprt { struct list_head sk_ready; /* list of ready sockets */ - struct list_head sk_list; /* list of all sockets */ + struct list_head sk_list; /* list of all transports */ + + void * sx_data; /* transport-specific data */ + struct socket * sk_sock; /* berkeley socket layer */ struct sock * sk_sk; /* INET layer */ - struct svc_serv * sk_server; /* service for this socket */ + struct svc_serv * sx_server; /* service for this socket */ unsigned int sk_inuse; /* use count */ unsigned long sk_flags; #define SK_BUSY 0 /* enqueued/receiving */ @@ -38,14 +41,22 @@ struct svc_sock { * be revisted */ struct semaphore sk_sem; /* to serialize sending data */ - int (*sk_recvfrom)(struct svc_rqst *rqstp); - int (*sk_sendto)(struct svc_rqst *rqstp); + int (*sx_recvfrom)(struct svc_rqst *rqstp); + int (*sx_sendto)(struct svc_rqst *rqstp); + void (*sx_delete)(struct svc_xprt *xprt); + void (*sx_put)(struct svc_xprt *xprt); + /* return 0 if insufficient space to write reply */ + int (*sx_has_wspace)(struct svc_xprt *xprt); + /* We keep the old state_change and data_ready CB's here */ void (*sk_ostate)(struct sock *); void (*sk_odata)(struct sock *, int bytes); void (*sk_owspace)(struct sock *); + /* if non-zero, the transport manages XDR buffers */ + int sx_manages_buffers; + /* private TCP part */ int sk_reclen; /* length of record */ int sk_tcplen; /* current read length */ @@ -56,10 +67,12 @@ struct svc_sock { * Function prototypes. 
*/ int svc_makesock(struct svc_serv *, int, unsigned short); -void svc_delete_socket(struct svc_sock *); +int svc_makexprt(struct svc_serv *, int, unsigned short); +void svc_delete_socket(struct svc_xprt *); int svc_recv(struct svc_serv *, struct svc_rqst *, long); int svc_send(struct svc_rqst *); void svc_drop(struct svc_rqst *); void svc_sock_update_bufs(struct svc_serv *serv); +void svc_sock_enqueue(struct svc_xprt *svsk); #endif /* SUNRPC_SVCSOCK_H */ diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/linux/sunrpc/svcxprt_rdma.h linux-2.6.14.3-RPCRDMA/include/linux/sunrpc/svcxprt_rdma.h --- linux-2.6.14.3/include/linux/sunrpc/svcxprt_rdma.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/linux/sunrpc/svcxprt_rdma.h 2005-12-08 11:50:23.000000000 -0500 @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan + * All rights reserved. + * + * Permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation from the + * University of Michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. 
The Regents + * of the University of Michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ + +#ifndef SVCXPRT_RDMA_H +#define SVCXPRT_RDMA_H + +int svc_create_rdma_xprt(struct svc_serv *serv, int protocol, unsigned short port); + +#endif /* SVCXPRT_RDMA_H */ diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/linux/sunrpc/svcxprt_rdma_kdapl.h linux-2.6.14.3-RPCRDMA/include/linux/sunrpc/svcxprt_rdma_kdapl.h --- linux-2.6.14.3/include/linux/sunrpc/svcxprt_rdma_kdapl.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/linux/sunrpc/svcxprt_rdma_kdapl.h 2005-12-08 11:50:23.000000000 -0500 @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan + * All rights reserved. + * + * Permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation from the + * University of Michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. 
The Regents + * of the University of Michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + * + * Defines types and functions used for the implementation of an RDMA RPC + * transport. + * + * This header file depends on the inclusion of the following others: + * - dat/kdat.h: KDAT API types + * - linux/sunrpc/svcsock.h: struct svc_xprt + */ + +#ifndef SVCXPRT_RDMA_KDAPL_H +#define SVCXPRT_RDMA_KDAPL_H + +#define RDMA_MAX_PAGES 1 /* Max # pages posted for RDMA Recv */ +#define RDMA_CREDIT_PAGES 4 /* # pages for each credit buffer */ + +enum rdma_proc { + RDMA_MSG = 0, + RDMA_NOMSG = 1, + RDMA_MSGP = 2, + RDMA_DONE = 3, + RDMA_ERROR = 4 +}; + +/* + * Public Service Point & associated structures + */ +struct rdma_psp { + DAT_PSP_HANDLE rp_psp_handle; + DAT_EVD_HANDLE rp_cr_evd; +}; + +/* + * Interface Adapter & associated structures + */ +struct rdma_ia { + struct list_head ri_endpoints; + spinlock_t ri_lock; + unsigned int ri_initialized; + DAT_IA_HANDLE ri_ia_handle; + DAT_EVD_HANDLE ri_async_evd; + DAT_IA_ATTR ri_ia_attr; + DAT_PROVIDER_ATTR ri_pv_attr; + struct rdma_psp ri_psp; + DAT_PZ_HANDLE ri_pz_handle; + DAT_LMR_HANDLE ri_lmr_handle; + DAT_LMR_TRIPLET ri_lmr_iov; +#if 0 /* JFL: fixes compiler error */ + DAT_CR_HANDLE * ri_conn_req; /* XXX should be list... 
*/ +#else + DAT_CR_HANDLE ri_conn_req; +#endif +/* + int ri_memreg_strategy; +#if RPCRDMA_DEBUG + DAT_RMR_CONTEXT ri_bind_rmr; +#endif +*/ +}; + +/* + * RDMA Credit + * Represents the storage corresponding to an RDMA credit + */ +struct rdma_credit { + struct page * rb_pages[RDMA_CREDIT_PAGES]; +}; + +/* + * RDMA endpoint + */ +struct rdma_ep { + struct list_head re_list; /* list of all endpoints */ + struct rdma_credit re_credits[1]; /* XXX will be list */ + int re_connected; + int re_closing; + wait_queue_head_t re_connect_wait; + DAT_EP_HANDLE re_handle; + DAT_EVD_HANDLE re_conn_evd; + DAT_EVD_HANDLE re_in_dto_evd; + DAT_EVD_HANDLE re_out_dto_evd; + struct rdma_ia * re_ia; +#if 0 /* JFL: store the address data rather than a pointer to the provider's */ + /* address data, which the provider might delete on us. */ + struct sockaddr_in * re_raddr; /* the remote IA address */ +#else + struct sockaddr_in re_raddr; +#endif + struct svc_xprt * re_xprt; + DAT_LMR_TRIPLET re_resp[3]; /* head, page data & tail */ + DAT_VLEN re_recvlen; + struct page * re_pages[RDMA_MAX_PAGES + 1]; /* 1 page for reply */ +}; + +int rdma_init_ia(struct rdma_ia *ia, const DAT_NAME_PTR ia_name, DAT_COUNT evd_min_qlen); +int rdma_create_pub_svc_point(struct svc_xprt *xprt, DAT_CONN_QUAL connection_qualifier); +void rdma_close_ia(struct rdma_ia *ia); +/* void rdma_close_ep(struct rdma_ep *ep); */ +int svc_rdma_ia_recvfrom(struct svc_rqst *rqstp); +int svc_rdma_ep_recvfrom(struct svc_rqst *rqstp); + +#endif /* SVCXPRT_RDMA_KDAPL_H */ diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/rdma/ib_addr.h linux-2.6.14.3-RPCRDMA/include/rdma/ib_addr.h --- linux-2.6.14.3/include/rdma/ib_addr.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/rdma/ib_addr.h 2005-12-08 11:16:53.000000000 -0500 @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. + * Copyright (c) 2005 Intel Corporation. 
All rights reserved. + * + * This Software is licensed under one of the following licenses: + * + * 1) under the terms of the "Common Public License 1.0" a copy of which is + * available from the Open Source Initiative, see + * http://www.opensource.org/licenses/cpl.php. + * + * 2) under the terms of the "The BSD License" a copy of which is + * available from the Open Source Initiative, see + * http://www.opensource.org/licenses/bsd-license.php. + * + * 3) under the terms of the "GNU General Public License (GPL) Version 2" a + * copy of which is available from the Open Source Initiative, see + * http://www.opensource.org/licenses/gpl-license.php. + * + * Licensee has the right to choose one of the above licenses. + * + * Redistributions of source code must retain the above copyright + * notice and one of the license notices. + * + * Redistributions in binary form must reproduce both the above copyright + * notice, one of the license notices in the documentation + * and/or other materials provided with the distribution. + * + */ + +#if !defined(IB_ADDR_H) +#define IB_ADDR_H + +#include +#include + +struct ib_addr { + union ib_gid sgid; + union ib_gid dgid; + u16 pkey; +}; + +/** + * ib_translate_addr - Translate a local IP address to an Infiniband GID and + * PKey. + */ +int ib_translate_addr(struct sockaddr *addr, union ib_gid *gid, u16 *pkey); + +/** + * ib_resolve_addr - Resolve source and destination IP addresses to + * Infiniband network addresses. + * @src_addr: An optional source address to use in the resolution. If a + * source address is not provided, a usable address will be returned via + * the callback. + * @dst_addr: The destination address to resolve. + * @addr: A reference to a data location that will receive the resolved + * addresses. The data location must remain valid until the callback has + * been invoked. + * @timeout_ms: Amount of time to wait for the address resolution to complete. 
+ * @callback: Call invoked once address resolution has completed, timed out, + * or been canceled. A status of 0 indicates success. + * @context: User-specified context associated with the call. + */ +int ib_resolve_addr(struct sockaddr *src_addr, struct sockaddr *dst_addr, + struct ib_addr *addr, int timeout_ms, + void (*callback)(int status, struct sockaddr *src_addr, + struct ib_addr *addr, void *context), + void *context); + +void ib_addr_cancel(struct ib_addr *addr); + +#endif /* IB_ADDR_H */ + diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/rdma/ib_at.h linux-2.6.14.3-RPCRDMA/include/rdma/ib_at.h --- linux-2.6.14.3/include/rdma/ib_at.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/rdma/ib_at.h 2005-12-08 11:16:53.000000000 -0500 @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2004,2005 Voltaire Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * + * $Id: ib_at.h 3202 2005-08-26 17:11:34Z roland $ + */ + +#if !defined( IB_AT_H ) +#define IB_AT_H + +#include +#include + +enum ib_at_multipathing_type { + IB_AT_PATH_SAME_PORT = 0, + IB_AT_PATH_SAME_HCA = 1, /* but different ports if applicable */ + IB_AT_PATH_SAME_SYSTEM = 2, /* but different ports if applicable */ + IB_AT_PATH_INDEPENDENT_HCA = 3, + IB_AT_PATH_SRC_ROUTE = 4, /* application controlled multipathing */ +}; + +enum ib_at_route_flags { + IB_AT_ROUTE_USE_DEFAULTS = 0, + IB_AT_ROUTE_FORCE_ATS = 1, + IB_AT_ROUTE_FORCE_ARP = 2, + IB_AT_ROUTE_FORCE_RESOLVE = 4, +}; + +struct ib_at_path_attr { + u16 qos_tag; + __be16 pkey; + u8 multi_path_type; +}; + +struct ib_at_ib_route { + union ib_gid sgid; + union ib_gid dgid; + struct ib_device *out_dev; + int out_port; + struct ib_at_path_attr attr; +}; + +enum ib_at_op_status { + IB_AT_STATUS_INVALID = 0, + IB_AT_STATUS_PENDING = 1, + IB_AT_STATUS_COMPLETED = 2, + IB_AT_STATUS_ERROR = 3, + IB_AT_STATUS_CANCELED = 4, +}; + +/* + * ib_at_completion structure - callback function parameters structure + * @fn: completion call back function + * @context: user defined context pointer + * @req_id: asynchronous request ID - optional, out + * + * The asynchronous resolution function behavior is as follows: + * If the resolve operation can be fulfilled immediately, then the output + * structures are set and the number of filled structures is returned. + * + * If the resolve operation cannot be fulfilled immediately and + * an ib_at_completion structure is not provided, + * then the function immediately returns -EWOULDBLOCK. 
+ * + * If ib_at_completion structure is provided and an asynchronous + * operation is started, the function immediately returns zero, + * and the request ID field (req_id) is set if the pointer is + * non NULL. This request ID may be used to cancel the operation, + * or to poll its status. + * + * When an asynchronous operation completes (successfully or not), + * the callback function is called, passing the request ID, + * the supplied user context and the number of output structures. + * If the asynchronous operation did not complete, a negative + * error code is returned as the 'rec_num'. + * Valid error codes are: + * -EINTR: operation is canceled + * -EIO: request send failed + * -ETIMEDOUT: operation timed out + * + * Returned value of zero records means that the resolution process + * completed, but the given address could not be resolved at this time. + */ +struct ib_at_completion { + void (*fn)(u64 req_id, void *context, int rec_num); + void *context; + u64 req_id; +}; + +/** + * ib_at_route_by_ip - asynchronously resolve ip address to ib route + * @dst_ip: destination ip + * @src_ip: source ip - optional + * @tos: ip type of service + * @flags: ib_at_route_flags + * @ib_route: out structure + * @async_comp: asynchronous callback structure - optional + * + * Resolve the specified dst_ip to a &struct ib_at_ib_route structure. + * src_ip can be provided to force specific output interface. + * flags can be used to select resolving method; currently IB-ARP or ATS. + * + * See ib_at_completion structure documentation for asynchronous + * operation details. 
+ */ +int ib_at_route_by_ip(u32 dst_ip, u32 src_ip, int tos, u16 flags, + struct ib_at_ib_route *ib_route, + struct ib_at_completion *async_comp); + +/** + * ib_at_paths_by_route - asynchronously resolve ib route to ib path records + * @ib_route: ib route to resolve + * @mpath_type: ib_at_multipathing_type + * @path_arr: SA path record array - out + * @npath: maximal number of paths to return + * @async_comp: asynchronous callback structure - optional + * + * Resolve the specified ib_route to a SA path record array. + * Number of returned paths will not exceed npath. + * Multipathing type may be used to obtain redundant paths for APM, + * other failover schemes, bandwidth aggregation or source based routing. + * Note that multipathing request is meaningless unless npath is greater than 1. + * + * Returned ib_route structure includes the recommended pkey and qos_tag for + * this route. + * + * See ib_at_completion structure documentation for asynchronous operation + * details. + */ +int ib_at_paths_by_route(struct ib_at_ib_route *ib_route, u32 mpath_type, + struct ib_sa_path_rec *path_arr, int npath, + struct ib_at_completion *async_comp); + +/** + * ib_at_ips_by_gid - asynchronously resolve GID to IP addresses + * @gid: GID to resolve + * @dst_ips: array of IPs, out + * @nips: number of IP entries in dst_ips array + * @async_comp: asynchronous callback structure - optional + * + * Resolve the gid to IP addresses, but not more than nips. + * This function relies on the IB-ATS mechanism. + * + * See ib_at_completion structure documentation for asynchronous + * operation details. 
+ */ +int ib_at_ips_by_gid(union ib_gid *gid, u32 *dst_ips, int nips, + struct ib_at_completion *async_comp); + +/** + * ib_at_ips_by_subnet - return local IP addresses by IP subnet + * @network: network to resolve - optional + * @netmask: subnet net mask - optional + * @dst_ips: array of IPs, out + * @nips: number of IP entries in dst_ips array + * + * Return local IP addresses matching the network and netmask, + * but not more than nips. + * + * Note that network and netmask as 0x0 or 0xffffffff returns all local IPs. + */ +int ib_at_ips_by_subnet(u32 network, u32 netmask, u32 *dst_ips, int nips); + +/** + * ib_at_invalidate_paths - invalidate possibly cached paths keyed by ib_route + * @ib_route: paths key - optional + * + * Returns number of invalidated paths. + * If ib_route is NULL, then the entire cache will be flushed. + */ +int ib_at_invalidate_paths(struct ib_at_ib_route *ib_route); + +/** + * ib_at_cancel - cancel possible active asynchronous operation + * @req_id: asynchronous request ID + * + * Return 0 if canceled, -1 if cancel failed (e.g. bad ID) + */ +int ib_at_cancel(u64 req_id); + +/** + * ib_at_status - poll asynchronous operation's status + * @req_id: asynchronous request ID ib_at_op_status + * + * Return non-negative ib_at_op_status value, + * or -EINVAL if the request ID is invalid. + */ +int ib_at_status(u64 req_id); + +#endif /* IB_AT_H */ diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/rdma/ib_cache.h linux-2.6.14.3-RPCRDMA/include/rdma/ib_cache.h --- linux-2.6.14.3/include/rdma/ib_cache.h 2005-11-24 17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/rdma/ib_cache.h 2005-12-08 11:16:53.000000000 -0500 @@ -31,7 +31,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* - * $Id: ib_cache.h 1349 2004-12-16 21:09:43Z roland $ + * $Id: ib_cache.h 3202 2005-08-26 17:11:34Z roland $ */ #ifndef _IB_CACHE_H diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/rdma/ib_cm.h linux-2.6.14.3-RPCRDMA/include/rdma/ib_cm.h --- linux-2.6.14.3/include/rdma/ib_cm.h 2005-11-24 17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/rdma/ib_cm.h 2005-12-08 11:16:53.000000000 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004 Intel Corporation. All rights reserved. + * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. @@ -32,7 +32,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * - * $Id: ib_cm.h 2730 2005-06-28 16:43:03Z sean.hefty $ + * $Id: ib_cm.h 4311 2005-12-05 18:42:01Z sean.hefty $ */ #if !defined(IB_CM_H) #define IB_CM_H @@ -102,14 +102,14 @@ enum ib_cm_data_size { IB_CM_APR_INFO_LENGTH = 72, IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE = 216, IB_CM_SIDR_REP_PRIVATE_DATA_SIZE = 136, - IB_CM_SIDR_REP_INFO_LENGTH = 72 + IB_CM_SIDR_REP_INFO_LENGTH = 72, + IB_CM_PRIVATE_DATA_COMPARE_SIZE = 64 }; struct ib_cm_id; struct ib_cm_req_event_param { struct ib_cm_id *listen_id; - struct ib_device *device; u8 port; struct ib_sa_path_rec *primary_path; @@ -220,7 +220,6 @@ struct ib_cm_apr_event_param { struct ib_cm_sidr_req_event_param { struct ib_cm_id *listen_id; - struct ib_device *device; u8 port; u16 pkey; }; @@ -240,7 +239,6 @@ struct ib_cm_sidr_rep_event_param { u32 qpn; void *info; u8 info_len; - }; struct ib_cm_event { @@ -284,6 +282,7 @@ typedef int (*ib_cm_handler)(struct ib_c struct ib_cm_id { ib_cm_handler cm_handler; void *context; + struct ib_device *device; __be64 service_id; __be64 service_mask; enum ib_cm_state state; /* internal CM/debug 
use */ @@ -295,6 +294,8 @@ struct ib_cm_id { /** * ib_create_cm_id - Allocate a communication identifier. + * @device: Device associated with the cm_id. All related communication will + * be associated with the specified device. * @cm_handler: Callback invoked to notify the user of CM events. * @context: User specified context associated with the communication * identifier. @@ -302,7 +303,8 @@ struct ib_cm_id { * Communication identifiers are used to track connection states, service * ID resolution requests, and listen requests. */ -struct ib_cm_id *ib_create_cm_id(ib_cm_handler cm_handler, +struct ib_cm_id *ib_create_cm_id(struct ib_device *device, + ib_cm_handler cm_handler, void *context); /** @@ -316,6 +318,11 @@ void ib_destroy_cm_id(struct ib_cm_id *c #define IB_SERVICE_ID_AGN_MASK __constant_cpu_to_be64(0xFF00000000000000ULL) #define IB_CM_ASSIGN_SERVICE_ID __constant_cpu_to_be64(0x0200000000000000ULL) +struct ib_cm_private_data_compare { + u8 data[IB_CM_PRIVATE_DATA_COMPARE_SIZE]; + u8 mask[IB_CM_PRIVATE_DATA_COMPARE_SIZE]; +}; + /** * ib_cm_listen - Initiates listening on the specified service ID for * connection and service ID resolution requests. @@ -328,10 +335,12 @@ void ib_destroy_cm_id(struct ib_cm_id *c * range of service IDs. If set to 0, the service ID is matched * exactly. This parameter is ignored if %service_id is set to * IB_CM_ASSIGN_SERVICE_ID. + * @compare_data: This parameter is optional. It specifies data that must + * appear in the private data of a connection request for the specified + * listen request. 
*/ -int ib_cm_listen(struct ib_cm_id *cm_id, - __be64 service_id, - __be64 service_mask); +int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask, + struct ib_cm_private_data_compare *compare_data); struct ib_cm_req_param { struct ib_sa_path_rec *primary_path; diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/rdma/ib_fmr_pool.h linux-2.6.14.3-RPCRDMA/include/rdma/ib_fmr_pool.h --- linux-2.6.14.3/include/rdma/ib_fmr_pool.h 2005-11-24 17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/rdma/ib_fmr_pool.h 2005-12-08 11:16:53.000000000 -0500 @@ -30,7 +30,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * - * $Id: ib_fmr_pool.h 2730 2005-06-28 16:43:03Z sean.hefty $ + * $Id: ib_fmr_pool.h 3202 2005-08-26 17:11:34Z roland $ */ #if !defined(IB_FMR_POOL_H) diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/rdma/ib_mad.h linux-2.6.14.3-RPCRDMA/include/rdma/ib_mad.h --- linux-2.6.14.3/include/rdma/ib_mad.h 2005-11-24 17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/rdma/ib_mad.h 2005-12-08 11:16:53.000000000 -0500 @@ -33,7 +33,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * - * $Id: ib_mad.h 2775 2005-07-02 13:42:12Z halr $ + * $Id: ib_mad.h 3925 2005-10-31 23:03:17Z roland $ */ #if !defined( IB_MAD_H ) @@ -109,10 +109,14 @@ #define IB_QP_SET_QKEY 0x80000000 enum { + IB_MGMT_MAD_HDR = 24, IB_MGMT_MAD_DATA = 232, + IB_MGMT_RMPP_HDR = 36, IB_MGMT_RMPP_DATA = 220, + IB_MGMT_VENDOR_HDR = 40, IB_MGMT_VENDOR_DATA = 216, - IB_MGMT_SA_DATA = 200 + IB_MGMT_SA_HDR = 56, + IB_MGMT_SA_DATA = 200, }; struct ib_mad_hdr { @@ -203,26 +207,25 @@ struct ib_class_port_info /** * ib_mad_send_buf - MAD data buffer and work request for sends. - * @mad: References an allocated MAD data buffer. 
The size of the data - * buffer is specified in the @send_wr.length field. - * @mapping: DMA mapping information. + * @next: A pointer used to chain together MADs for posting. + * @mad: References an allocated MAD data buffer. * @mad_agent: MAD agent that allocated the buffer. + * @ah: The address handle to use when sending the MAD. * @context: User-controlled context fields. - * @send_wr: An initialized work request structure used when sending the MAD. - * The wr_id field of the work request is initialized to reference this - * data structure. - * @sge: A scatter-gather list referenced by the work request. + * @timeout_ms: Time to wait for a response. + * @retries: Number of times to retry a request for a response. * * Users are responsible for initializing the MAD buffer itself, with the * exception of specifying the payload length field in any RMPP MAD. */ struct ib_mad_send_buf { - struct ib_mad *mad; - DECLARE_PCI_UNMAP_ADDR(mapping) + struct ib_mad_send_buf *next; + void *mad; struct ib_mad_agent *mad_agent; + struct ib_ah *ah; void *context[2]; - struct ib_send_wr send_wr; - struct ib_sge sge; + int timeout_ms; + int retries; }; /** @@ -287,7 +290,7 @@ typedef void (*ib_mad_send_handler)(stru * or @mad_send_wc. */ typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent, - struct ib_send_wr *send_wr, + struct ib_mad_send_buf *send_buf, struct ib_mad_send_wc *mad_send_wc); /** @@ -296,7 +299,7 @@ typedef void (*ib_mad_snoop_handler)(str * @mad_recv_wc: Received work completion information on the received MAD. * * MADs received in response to a send request operation will be handed to - * the user after the send operation completes. All data buffers given + * the user before the send operation completes. All data buffers given * to registered agents through this routine are owned by the receiving * client, except for snooping agents. Clients snooping MADs should not * modify the data referenced by @mad_recv_wc. 
@@ -334,13 +337,13 @@ struct ib_mad_agent { /** * ib_mad_send_wc - MAD send completion information. - * @wr_id: Work request identifier associated with the send MAD request. + * @send_buf: Send MAD data buffer associated with the send MAD request. * @status: Completion status. * @vendor_err: Optional vendor error information returned with a failed * request. */ struct ib_mad_send_wc { - u64 wr_id; + struct ib_mad_send_buf *send_buf; enum ib_wc_status status; u32 vendor_err; }; @@ -366,7 +369,7 @@ struct ib_mad_recv_buf { * @rmpp_list: Specifies a list of RMPP reassembled received MAD buffers. * @mad_len: The length of the received MAD, without duplicated headers. * - * For received response, the wr_id field of the wc is set to the wr_id + * For received response, the wr_id contains a pointer to the ib_mad_send_buf * for the corresponding send request. */ struct ib_mad_recv_wc { @@ -463,9 +466,9 @@ int ib_unregister_mad_agent(struct ib_ma /** * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated * with the registered client. - * @mad_agent: Specifies the associated registration to post the send to. - * @send_wr: Specifies the information needed to send the MAD(s). - * @bad_send_wr: Specifies the MAD on which an error was encountered. + * @send_buf: Specifies the information needed to send the MAD(s). + * @bad_send_buf: Specifies the MAD on which an error was encountered. This + * parameter is optional if only a single MAD is posted. * * Sent MADs are not guaranteed to complete in the order that they were posted. * @@ -479,9 +482,8 @@ int ib_unregister_mad_agent(struct ib_ma * defined data being transferred. The paylen_newwin field should be * specified in network-byte order. 
*/ -int ib_post_send_mad(struct ib_mad_agent *mad_agent, - struct ib_send_wr *send_wr, - struct ib_send_wr **bad_send_wr); +int ib_post_send_mad(struct ib_mad_send_buf *send_buf, + struct ib_mad_send_buf **bad_send_buf); /** * ib_coalesce_recv_mad - Coalesces received MAD data into a single buffer. @@ -507,23 +509,25 @@ void ib_free_recv_mad(struct ib_mad_recv /** * ib_cancel_mad - Cancels an outstanding send MAD operation. * @mad_agent: Specifies the registration associated with sent MAD. - * @wr_id: Indicates the work request identifier of the MAD to cancel. + * @send_buf: Indicates the MAD to cancel. * * MADs will be returned to the user through the corresponding * ib_mad_send_handler. */ -void ib_cancel_mad(struct ib_mad_agent *mad_agent, u64 wr_id); +void ib_cancel_mad(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf); /** * ib_modify_mad - Modifies an outstanding send MAD operation. * @mad_agent: Specifies the registration associated with sent MAD. - * @wr_id: Indicates the work request identifier of the MAD to modify. + * @send_buf: Indicates the MAD to modify. * @timeout_ms: New timeout value for sent MAD. * * This call will reset the timeout value for a sent MAD to the specified * value. */ -int ib_modify_mad(struct ib_mad_agent *mad_agent, u64 wr_id, u32 timeout_ms); +int ib_modify_mad(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, u32 timeout_ms); /** * ib_redirect_mad_qp - Registers a QP for MAD services. @@ -572,7 +576,6 @@ int ib_process_mad_wc(struct ib_mad_agen * @remote_qpn: Specifies the QPN of the receiving node. * @pkey_index: Specifies which PKey the MAD will be sent using. This field * is valid only if the remote_qpn is QP 1. - * @ah: References the address handle used to transfer to the remote node. * @rmpp_active: Indicates if the send will enable RMPP. * @hdr_len: Indicates the size of the data header of the MAD. 
This length * should include the common MAD header, RMPP header, plus any class @@ -582,11 +585,10 @@ int ib_process_mad_wc(struct ib_mad_agen * additional padding that may be necessary. * @gfp_mask: GFP mask used for the memory allocation. * - * This is a helper routine that may be used to allocate a MAD. Users are - * not required to allocate outbound MADs using this call. The returned - * MAD send buffer will reference a data buffer usable for sending a MAD, along + * This routine allocates a MAD for sending. The returned MAD send buffer + * will reference a data buffer usable for sending a MAD, along * with an initialized work request structure. Users may modify the returned - * MAD data buffer or work request before posting the send. + * MAD data buffer before posting the send. * * The returned data buffer will be cleared. Users are responsible for * initializing the common MAD and any class specific headers. If @rmpp_active @@ -594,7 +596,7 @@ int ib_process_mad_wc(struct ib_mad_agen */ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, u32 remote_qpn, u16 pkey_index, - struct ib_ah *ah, int rmpp_active, + int rmpp_active, int hdr_len, int data_len, gfp_t gfp_mask); diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/rdma/ib_marshall.h linux-2.6.14.3-RPCRDMA/include/rdma/ib_marshall.h --- linux-2.6.14.3/include/rdma/ib_marshall.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/rdma/ib_marshall.h 2005-12-08 11:16:53.000000000 -0500 @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if !defined(IB_USER_MARSHALL_H) +#define IB_USER_MARSHALL_H + +#include <rdma/ib_verbs.h> +#include <rdma/ib_sa.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_user_sa.h> + +void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, + struct ib_qp_attr *src); + +void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, + struct ib_sa_path_rec *src); + +void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst, + struct ib_user_path_rec *src); + +#endif /* IB_USER_MARSHALL_H */ diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/rdma/ib_pack.h linux-2.6.14.3-RPCRDMA/include/rdma/ib_pack.h --- linux-2.6.14.3/include/rdma/ib_pack.h 2005-11-24 17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/rdma/ib_pack.h 2005-12-08 11:16:53.000000000 -0500 @@ -29,7 +29,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * - * $Id: ib_pack.h 1349 2004-12-16 21:09:43Z roland $ + * $Id: ib_pack.h 3202 2005-08-26 17:11:34Z roland $ */ #ifndef IB_PACK_H diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/rdma/ib_sa.h linux-2.6.14.3-RPCRDMA/include/rdma/ib_sa.h --- linux-2.6.14.3/include/rdma/ib_sa.h 2005-11-24 17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/rdma/ib_sa.h 2005-12-08 11:16:53.000000000 -0500 @@ -30,7 +30,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* - * $Id: ib_sa.h 2811 2005-07-06 18:11:43Z halr $ + * $Id: ib_sa.h 3925 2005-10-31 23:03:17Z roland $ */ #ifndef IB_SA_H diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/rdma/ib_smi.h linux-2.6.14.3-RPCRDMA/include/rdma/ib_smi.h --- linux-2.6.14.3/include/rdma/ib_smi.h 2005-11-24 17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/rdma/ib_smi.h 2005-12-08 11:16:53.000000000 -0500 @@ -33,7 +33,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * - * $Id: ib_smi.h 1389 2004-12-27 22:56:47Z roland $ + * $Id: ib_smi.h 3202 2005-08-26 17:11:34Z roland $ */ #if !defined( IB_SMI_H ) diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/rdma/ib_user_at.h linux-2.6.14.3-RPCRDMA/include/rdma/ib_user_at.h --- linux-2.6.14.3/include/rdma/ib_user_at.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/rdma/ib_user_at.h 2005-12-08 11:16:53.000000000 -0500 @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id: ib_user_at.h 3202 2005-08-26 17:11:34Z roland $ + */ + +#ifndef IB_USER_AT_H +#define IB_USER_AT_H + +#include +#include + +#define IB_USER_AT_ABI_VERSION 1 + +enum { + IB_USER_AT_CMD_ROUTE_BY_IP, + IB_USER_AT_CMD_PATHS_BY_ROUTE, + IB_USER_AT_CMD_IPS_BY_GID, + IB_USER_AT_CMD_IPS_BY_SUBNET, + IB_USER_AT_CMD_INVALIDATE_PATHS, + IB_USER_AT_CMD_CANCEL, + IB_USER_AT_CMD_STATUS, + + IB_USER_AT_CMD_EVENT, +}; + +/* + * command ABI structures. + */ +struct ib_uat_cmd_hdr { + __u32 cmd; + __u16 in; + __u16 out; +}; + +enum ib_uat_multipathing_type { + IB_USER_AT_PATH_SAME_PORT = 0, + IB_USER_AT_PATH_SAME_HCA = 1, /* but different ports if applicable */ + IB_USER_AT_PATH_SAME_SYSTEM = 2, /* but different ports if applicable */ + IB_USER_AT_PATH_INDEPENDENT_HCA = 3, + IB_USER_AT_PATH_SRC_ROUTE = 4, /* application controlled multipathing */ +}; + +enum ib_uat_route_flags { + IB_USER_AT_ROUTE_USE_DEFAULTS = 0, + IB_USER_AT_ROUTE_FORCE_ATS = 1, + IB_USER_AT_ROUTE_FORCE_ARP = 2, + IB_USER_AT_ROUTE_FORCE_RESOLVE = 4, +}; + +struct ib_uat_path_attr { + __u16 qos_tag; + __u16 pkey; + __u8 multi_path_type; +}; + +struct ib_uat_ib_route { + __u8 sgid[16]; + __u8 dgid[16]; + struct ibv_device *out_dev; + int out_port; + struct ib_uat_path_attr attr; +}; + +enum ib_uat_op_status { + IB_USER_AT_STATUS_INVALID = 0, + IB_USER_AT_STATUS_PENDING = 1, + IB_USER_AT_STATUS_COMPLETED = 2, + IB_USER_AT_STATUS_CANCELED = 3, +}; + +struct ib_uat_completion { + void (*fn)(__u64 req_id, void 
*context, int rec_num); + void *context; + __u64 req_id; +}; + +struct ib_uat_paths_by_route_req { + struct ib_uat_ib_route *ib_route; + __u32 mpath_type; + struct ib_sa_path_rec *path_arr; + int npath; + struct ib_uat_completion *async_comp; + __u64 response; +}; + +struct ib_uat_paths_by_route_resp { + __u64 req_id; +}; + +struct ib_uat_route_by_ip_req { + __u32 dst_ip; + __u32 src_ip; + int tos; + __u16 flags; + struct ib_uat_ib_route *ib_route; + struct ib_uat_completion *async_comp; + __u64 response; +}; + +struct ib_uat_route_by_ip_resp { + __u64 req_id; +}; + +struct ib_uat_ips_by_gid_req { + union ibv_gid *gid; + __u32 *dst_ips; + int nips; + struct ib_uat_completion *async_comp; + __u64 response; +}; + +struct ib_uat_ips_by_gid_resp { + __u64 req_id; +}; + +struct ib_uat_ips_by_subnet_req { + __u32 network; + __u32 netmask; + __u32 *dst_ips; + int nips; +}; + +struct ib_uat_invalidate_paths_req { + struct ib_uat_ib_route *ib_route; +}; + +struct ib_uat_cancel_req { + __u64 req_id; +}; + +struct ib_uat_status_req { + __u64 req_id; +}; + +/* + * event notification ABI structures. + */ +struct ib_uat_event_get { + __u64 response; +}; + +struct ib_uat_event_resp { + __u64 callback; + __u64 context; + __u64 req_id; + int rec_num; +}; +#endif /* IB_USER_AT_H */ diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/rdma/ib_user_cm.h linux-2.6.14.3-RPCRDMA/include/rdma/ib_user_cm.h --- linux-2.6.14.3/include/rdma/ib_user_cm.h 2005-11-24 17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/rdma/ib_user_cm.h 2005-12-08 11:16:53.000000000 -0500 @@ -30,15 +30,15 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* - * $Id: ib_user_cm.h 2576 2005-06-09 17:00:30Z libor $ + * $Id: ib_user_cm.h 4019 2005-11-11 00:33:09Z sean.hefty $ */ #ifndef IB_USER_CM_H #define IB_USER_CM_H -#include <linux/types.h> +#include <rdma/ib_user_sa.h> -#define IB_USER_CM_ABI_VERSION 2 +#define IB_USER_CM_ABI_VERSION 4 enum { IB_USER_CM_CMD_CREATE_ID, @@ -84,6 +84,7 @@ struct ib_ucm_create_id_resp { struct ib_ucm_destroy_id { __u64 response; __u32 id; + __u32 reserved; }; struct ib_ucm_destroy_id_resp { @@ -93,6 +94,7 @@ struct ib_ucm_destroy_id_resp { struct ib_ucm_attr_id { __u64 response; __u32 id; + __u32 reserved; }; struct ib_ucm_attr_id_resp { @@ -108,62 +110,11 @@ struct ib_ucm_init_qp_attr { __u32 qp_state; }; -struct ib_ucm_ah_attr { - __u8 grh_dgid[16]; - __u32 grh_flow_label; - __u16 dlid; - __u16 reserved; - __u8 grh_sgid_index; - __u8 grh_hop_limit; - __u8 grh_traffic_class; - __u8 sl; - __u8 src_path_bits; - __u8 static_rate; - __u8 is_global; - __u8 port_num; -}; - -struct ib_ucm_init_qp_attr_resp { - __u32 qp_attr_mask; - __u32 qp_state; - __u32 cur_qp_state; - __u32 path_mtu; - __u32 path_mig_state; - __u32 qkey; - __u32 rq_psn; - __u32 sq_psn; - __u32 dest_qp_num; - __u32 qp_access_flags; - - struct ib_ucm_ah_attr ah_attr; - struct ib_ucm_ah_attr alt_ah_attr; - - /* ib_qp_cap */ - __u32 max_send_wr; - __u32 max_recv_wr; - __u32 max_send_sge; - __u32 max_recv_sge; - __u32 max_inline_data; - - __u16 pkey_index; - __u16 alt_pkey_index; - __u8 en_sqd_async_notify; - __u8 sq_draining; - __u8 max_rd_atomic; - __u8 max_dest_rd_atomic; - __u8 min_rnr_timer; - __u8 port_num; - __u8 timeout; - __u8 retry_cnt; - __u8 rnr_retry; - __u8 alt_port_num; - __u8 alt_timeout; -}; - struct ib_ucm_listen { __be64 service_id; __be64 service_mask; __u32 id; + __u32 reserved; }; struct ib_ucm_establish { @@ -177,28 +128,6 @@ struct ib_ucm_private_data { __u8 reserved[3]; }; -struct ib_ucm_path_rec { - __u8 dgid[16]; - __u8 sgid[16]; - __be16 dlid; - __be16 slid; - __u32 raw_traffic; - __be32 flow_label; - __u32 reversible; - __u32 mtu; - 
__be16 pkey; - __u8 hop_limit; - __u8 traffic_class; - __u8 numb_path; - __u8 sl; - __u8 mtu_selector; - __u8 rate_selector; - __u8 rate; - __u8 packet_life_time_selector; - __u8 packet_life_time; - __u8 preference; -}; - struct ib_ucm_req { __u32 id; __u32 qpn; @@ -219,7 +148,7 @@ struct ib_ucm_req { __u8 rnr_retry_count; __u8 max_cm_retries; __u8 srq; - __u8 reserved[1]; + __u8 reserved[5]; }; struct ib_ucm_rep { @@ -236,6 +165,7 @@ struct ib_ucm_rep { __u8 flow_control; __u8 rnr_retry_count; __u8 srq; + __u8 reserved[4]; }; struct ib_ucm_info { @@ -245,7 +175,7 @@ struct ib_ucm_info { __u64 data; __u8 info_len; __u8 data_len; - __u8 reserved[2]; + __u8 reserved[6]; }; struct ib_ucm_mra { @@ -273,6 +203,7 @@ struct ib_ucm_sidr_req { __u16 pkey; __u8 len; __u8 max_cm_retries; + __u8 reserved[4]; }; struct ib_ucm_sidr_rep { @@ -284,7 +215,7 @@ struct ib_ucm_sidr_rep { __u64 data; __u8 info_len; __u8 data_len; - __u8 reserved[2]; + __u8 reserved[6]; }; /* * event notification ABI structures. 
@@ -295,14 +226,12 @@ struct ib_ucm_event_get { __u64 info; __u8 data_len; __u8 info_len; - __u8 reserved[2]; + __u8 reserved[6]; }; struct ib_ucm_req_event_resp { - /* device */ - /* port */ - struct ib_ucm_path_rec primary_path; - struct ib_ucm_path_rec alternate_path; + struct ib_user_path_rec primary_path; + struct ib_user_path_rec alternate_path; __be64 remote_ca_guid; __u32 remote_qkey; __u32 remote_qpn; @@ -316,6 +245,8 @@ struct ib_ucm_req_event_resp { __u8 retry_count; __u8 rnr_retry_count; __u8 srq; + __u8 port; + __u8 reserved[7]; }; struct ib_ucm_rep_event_resp { @@ -330,7 +261,7 @@ struct ib_ucm_rep_event_resp { __u8 flow_control; __u8 rnr_retry_count; __u8 srq; - __u8 reserved[1]; + __u8 reserved[5]; }; struct ib_ucm_rej_event_resp { @@ -344,7 +275,7 @@ struct ib_ucm_mra_event_resp { }; struct ib_ucm_lap_event_resp { - struct ib_ucm_path_rec path; + struct ib_user_path_rec path; }; struct ib_ucm_apr_event_resp { @@ -353,10 +284,9 @@ struct ib_ucm_apr_event_resp { }; struct ib_ucm_sidr_req_event_resp { - /* device */ - /* port */ __u16 pkey; - __u8 reserved[2]; + __u8 port; + __u8 reserved; }; struct ib_ucm_sidr_rep_event_resp { @@ -376,6 +306,7 @@ struct ib_ucm_event_resp { __u32 id; __u32 event; __u32 present; + __u32 reserved; union { struct ib_ucm_req_event_resp req_resp; struct ib_ucm_rep_event_resp rep_resp; diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/rdma/ib_user_mad.h linux-2.6.14.3-RPCRDMA/include/rdma/ib_user_mad.h --- linux-2.6.14.3/include/rdma/ib_user_mad.h 2005-11-24 17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/rdma/ib_user_mad.h 2005-12-08 11:16:53.000000000 -0500 @@ -30,7 +30,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* - * $Id: ib_user_mad.h 2814 2005-07-06 19:14:09Z halr $ + * $Id: ib_user_mad.h 3202 2005-08-26 17:11:34Z roland $ */ #ifndef IB_USER_MAD_H diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/rdma/ib_user_sa.h linux-2.6.14.3-RPCRDMA/include/rdma/ib_user_sa.h --- linux-2.6.14.3/include/rdma/ib_user_sa.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/rdma/ib_user_sa.h 2005-12-08 11:16:53.000000000 -0500 @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef IB_USER_SA_H +#define IB_USER_SA_H + +#include <linux/types.h> + +struct ib_user_path_rec { + __u8 dgid[16]; + __u8 sgid[16]; + __be16 dlid; + __be16 slid; + __u32 raw_traffic; + __be32 flow_label; + __u32 reversible; + __u32 mtu; + __be16 pkey; + __u8 hop_limit; + __u8 traffic_class; + __u8 numb_path; + __u8 sl; + __u8 mtu_selector; + __u8 rate_selector; + __u8 rate; + __u8 packet_life_time_selector; + __u8 packet_life_time; + __u8 preference; +}; + +#endif /* IB_USER_SA_H */ diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/rdma/ib_user_verbs.h linux-2.6.14.3-RPCRDMA/include/rdma/ib_user_verbs.h --- linux-2.6.14.3/include/rdma/ib_user_verbs.h 2005-11-24 17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/rdma/ib_user_verbs.h 2005-12-08 11:16:53.000000000 -0500 @@ -1,6 +1,7 @@ /* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -30,7 +31,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * - * $Id: ib_user_verbs.h 2708 2005-06-24 17:27:21Z roland $ + * $Id: ib_user_verbs.h 4019 2005-11-11 00:33:09Z sean.hefty $ */ #ifndef IB_USER_VERBS_H @@ -42,15 +43,12 @@ * Increment this value if any changes that break userspace ABI * compatibility are made. 
*/ -#define IB_USER_VERBS_ABI_VERSION 2 +#define IB_USER_VERBS_ABI_VERSION 4 enum { - IB_USER_VERBS_CMD_QUERY_PARAMS, IB_USER_VERBS_CMD_GET_CONTEXT, IB_USER_VERBS_CMD_QUERY_DEVICE, IB_USER_VERBS_CMD_QUERY_PORT, - IB_USER_VERBS_CMD_QUERY_GID, - IB_USER_VERBS_CMD_QUERY_PKEY, IB_USER_VERBS_CMD_ALLOC_PD, IB_USER_VERBS_CMD_DEALLOC_PD, IB_USER_VERBS_CMD_CREATE_AH, @@ -65,6 +63,7 @@ enum { IB_USER_VERBS_CMD_ALLOC_MW, IB_USER_VERBS_CMD_BIND_MW, IB_USER_VERBS_CMD_DEALLOC_MW, + IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL, IB_USER_VERBS_CMD_CREATE_CQ, IB_USER_VERBS_CMD_RESIZE_CQ, IB_USER_VERBS_CMD_DESTROY_CQ, @@ -90,8 +89,11 @@ enum { * Make sure that all structs defined in this file remain laid out so * that they pack the same way on 32-bit and 64-bit architectures (to * avoid incompatibility between 32-bit userspace and 64-bit kernels). - * In particular do not use pointer types -- pass pointers in __u64 - * instead. + * Specifically: + * - Do not use pointer types -- pass pointers in __u64 instead. + * - Make sure that any structure larger than 4 bytes is padded to a + * multiple of 8 bytes. Otherwise the structure size will be + * different between 32-bit and 64-bit architectures. */ struct ib_uverbs_async_event_desc { @@ -118,27 +120,14 @@ struct ib_uverbs_cmd_hdr { __u16 out_words; }; -/* - * No driver_data for "query params" command, since this is intended - * to be a core function with no possible device dependence. 
- */ -struct ib_uverbs_query_params { - __u64 response; -}; - -struct ib_uverbs_query_params_resp { - __u32 num_cq_events; -}; - struct ib_uverbs_get_context { __u64 response; - __u64 cq_fd_tab; __u64 driver_data[0]; }; struct ib_uverbs_get_context_resp { __u32 async_fd; - __u32 reserved; + __u32 num_comp_vectors; }; struct ib_uverbs_query_device { @@ -220,31 +209,6 @@ struct ib_uverbs_query_port_resp { __u8 reserved[3]; }; -struct ib_uverbs_query_gid { - __u64 response; - __u8 port_num; - __u8 index; - __u8 reserved[6]; - __u64 driver_data[0]; -}; - -struct ib_uverbs_query_gid_resp { - __u8 gid[16]; -}; - -struct ib_uverbs_query_pkey { - __u64 response; - __u8 port_num; - __u8 index; - __u8 reserved[6]; - __u64 driver_data[0]; -}; - -struct ib_uverbs_query_pkey_resp { - __u16 pkey; - __u16 reserved; -}; - struct ib_uverbs_alloc_pd { __u64 response; __u64 driver_data[0]; @@ -278,11 +242,21 @@ struct ib_uverbs_dereg_mr { __u32 mr_handle; }; +struct ib_uverbs_create_comp_channel { + __u64 response; +}; + +struct ib_uverbs_create_comp_channel_resp { + __u32 fd; +}; + struct ib_uverbs_create_cq { __u64 response; __u64 user_handle; __u32 cqe; - __u32 event_handler; + __u32 comp_vector; + __s32 comp_channel; + __u32 reserved; __u64 driver_data[0]; }; @@ -291,6 +265,41 @@ struct ib_uverbs_create_cq_resp { __u32 cqe; }; +struct ib_uverbs_poll_cq { + __u64 response; + __u32 cq_handle; + __u32 ne; +}; + +struct ib_uverbs_wc { + __u64 wr_id; + __u32 status; + __u32 opcode; + __u32 vendor_err; + __u32 byte_len; + __u32 imm_data; + __u32 qp_num; + __u32 src_qp; + __u32 wc_flags; + __u16 pkey_index; + __u16 slid; + __u8 sl; + __u8 dlid_path_bits; + __u8 port_num; + __u8 reserved; +}; + +struct ib_uverbs_poll_cq_resp { + __u32 count; + __u32 reserved; + struct ib_uverbs_wc wc[0]; +}; + +struct ib_uverbs_req_notify_cq { + __u32 cq_handle; + __u32 solicited_only; +}; + struct ib_uverbs_destroy_cq { __u64 response; __u32 cq_handle; @@ -302,6 +311,64 @@ struct 
ib_uverbs_destroy_cq_resp { __u32 async_events_reported; }; +struct ib_uverbs_global_route { + __u8 dgid[16]; + __u32 flow_label; + __u8 sgid_index; + __u8 hop_limit; + __u8 traffic_class; + __u8 reserved; +}; + +struct ib_uverbs_ah_attr { + struct ib_uverbs_global_route grh; + __u16 dlid; + __u8 sl; + __u8 src_path_bits; + __u8 static_rate; + __u8 is_global; + __u8 port_num; + __u8 reserved; +}; + +struct ib_uverbs_qp_attr { + __u32 qp_attr_mask; + __u32 qp_state; + __u32 cur_qp_state; + __u32 path_mtu; + __u32 path_mig_state; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + + struct ib_uverbs_ah_attr ah_attr; + struct ib_uverbs_ah_attr alt_ah_attr; + + /* ib_qp_cap */ + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 en_sqd_async_notify; + __u8 sq_draining; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 reserved[5]; +}; + struct ib_uverbs_create_qp { __u64 response; __u64 user_handle; @@ -324,6 +391,11 @@ struct ib_uverbs_create_qp { struct ib_uverbs_create_qp_resp { __u32 qp_handle; __u32 qpn; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; }; /* @@ -388,6 +460,107 @@ struct ib_uverbs_destroy_qp_resp { __u32 events_reported; }; +/* + * The ib_uverbs_sge structure isn't used anywhere, since we assume + * the ib_sge structure is packed the same way on 32-bit and 64-bit + * architectures in both kernel and user space. It's just here to + * document the ABI. 
+ */ +struct ib_uverbs_sge { + __u64 addr; + __u32 length; + __u32 lkey; +}; + +struct ib_uverbs_send_wr { + __u64 wr_id; + __u32 num_sge; + __u32 opcode; + __u32 send_flags; + __u32 imm_data; + union { + struct { + __u64 remote_addr; + __u32 rkey; + __u32 reserved; + } rdma; + struct { + __u64 remote_addr; + __u64 compare_add; + __u64 swap; + __u32 rkey; + __u32 reserved; + } atomic; + struct { + __u32 ah; + __u32 remote_qpn; + __u32 remote_qkey; + __u32 reserved; + } ud; + } wr; +}; + +struct ib_uverbs_post_send { + __u64 response; + __u32 qp_handle; + __u32 wr_count; + __u32 sge_count; + __u32 wqe_size; + struct ib_uverbs_send_wr send_wr[0]; +}; + +struct ib_uverbs_post_send_resp { + __u32 bad_wr; +}; + +struct ib_uverbs_recv_wr { + __u64 wr_id; + __u32 num_sge; + __u32 reserved; +}; + +struct ib_uverbs_post_recv { + __u64 response; + __u32 qp_handle; + __u32 wr_count; + __u32 sge_count; + __u32 wqe_size; + struct ib_uverbs_recv_wr recv_wr[0]; +}; + +struct ib_uverbs_post_recv_resp { + __u32 bad_wr; +}; + +struct ib_uverbs_post_srq_recv { + __u64 response; + __u32 srq_handle; + __u32 wr_count; + __u32 sge_count; + __u32 wqe_size; + struct ib_uverbs_recv_wr recv[0]; +}; + +struct ib_uverbs_post_srq_recv_resp { + __u32 bad_wr; +}; + +struct ib_uverbs_create_ah { + __u64 response; + __u64 user_handle; + __u32 pd_handle; + __u32 reserved; + struct ib_uverbs_ah_attr attr; +}; + +struct ib_uverbs_create_ah_resp { + __u32 ah_handle; +}; + +struct ib_uverbs_destroy_ah { + __u32 ah_handle; +}; + struct ib_uverbs_attach_mcast { __u8 gid[16]; __u32 qp_handle; @@ -422,9 +595,7 @@ struct ib_uverbs_modify_srq { __u32 srq_handle; __u32 attr_mask; __u32 max_wr; - __u32 max_sge; __u32 srq_limit; - __u32 reserved; __u64 driver_data[0]; }; diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/rdma/ib_verbs.h linux-2.6.14.3-RPCRDMA/include/rdma/ib_verbs.h --- linux-2.6.14.3/include/rdma/ib_verbs.h 2005-11-24 
17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/rdma/ib_verbs.h 2005-12-08 11:16:53.000000000 -0500 @@ -35,7 +35,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * - * $Id: ib_verbs.h 1349 2004-12-16 21:09:43Z roland $ + * $Id: ib_verbs.h 4030 2005-11-13 04:48:01Z roland $ */ #if !defined(IB_VERBS_H) @@ -48,6 +48,14 @@ #include #include +/* XXX remove this compatibility hack when 2.6.15 is released */ +#include + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,15) +#define class_device_create(cls, parent, devt, device, fmt, arg...) \ + class_device_create(cls, devt, device, fmt, ## arg) +#endif /* XXX end of hack */ + union ib_gid { u8 raw[16]; struct { @@ -595,11 +603,8 @@ struct ib_send_wr { } atomic; struct { struct ib_ah *ah; - struct ib_mad_hdr *mad_hdr; u32 remote_qpn; u32 remote_qkey; - int timeout_ms; /* valid for MADs only */ - int retries; /* valid for MADs only */ u16 pkey_index; /* valid for GSI only */ u8 port_num; /* valid for DR SMPs on switch only */ } ud; @@ -884,7 +889,7 @@ struct ib_device { struct ib_ucontext *context, struct ib_udata *udata); int (*destroy_cq)(struct ib_cq *cq); - int (*resize_cq)(struct ib_cq *cq, int *cqe); + int (*resize_cq)(struct ib_cq *cq, int cqe); int (*poll_cq)(struct ib_cq *cq, int num_entries, struct ib_wc *wc); int (*peek_cq)(struct ib_cq *cq, int wc_cnt); @@ -951,6 +956,10 @@ struct ib_device { IB_DEV_UNREGISTERED } reg_state; + u64 uverbs_cmd_mask; + int uverbs_abi_ver; + + __be64 node_guid; u8 node_type; u8 phys_port_cnt; }; diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/rdma/rdma_cm.h linux-2.6.14.3-RPCRDMA/include/rdma/rdma_cm.h --- linux-2.6.14.3/include/rdma/rdma_cm.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/rdma/rdma_cm.h 2005-12-08 11:16:53.000000000 -0500 @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2005 Voltaire Inc. All rights reserved. 
+ * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This Software is licensed under one of the following licenses: + * + * 1) under the terms of the "Common Public License 1.0" a copy of which is + * available from the Open Source Initiative, see + * http://www.opensource.org/licenses/cpl.php. + * + * 2) under the terms of the "The BSD License" a copy of which is + * available from the Open Source Initiative, see + * http://www.opensource.org/licenses/bsd-license.php. + * + * 3) under the terms of the "GNU General Public License (GPL) Version 2" a + * copy of which is available from the Open Source Initiative, see + * http://www.opensource.org/licenses/gpl-license.php. + * + * Licensee has the right to choose one of the above licenses. + * + * Redistributions of source code must retain the above copyright + * notice and one of the license notices. + * + * Redistributions in binary form must reproduce both the above copyright + * notice, one of the license notices in the documentation + * and/or other materials provided with the distribution. + * + */ + +#if !defined(RDMA_CM_H) +#define RDMA_CM_H + +#include <linux/socket.h> +#include <linux/in6.h> +#include <rdma/ib_addr.h> +#include <rdma/ib_sa.h> + +/* + * Upon receiving a device removal event, users must destroy the associated + * RDMA identifier and release all resources allocated with the device.
+ */ +enum rdma_cm_event_type { + RDMA_CM_EVENT_ADDR_RESOLVED, + RDMA_CM_EVENT_ADDR_ERROR, + RDMA_CM_EVENT_ROUTE_RESOLVED, + RDMA_CM_EVENT_ROUTE_ERROR, + RDMA_CM_EVENT_CONNECT_REQUEST, + RDMA_CM_EVENT_CONNECT_RESPONSE, + RDMA_CM_EVENT_CONNECT_ERROR, + RDMA_CM_EVENT_UNREACHABLE, + RDMA_CM_EVENT_REJECTED, + RDMA_CM_EVENT_ESTABLISHED, + RDMA_CM_EVENT_DISCONNECTED, + RDMA_CM_EVENT_DEVICE_REMOVAL, +}; + +struct rdma_addr { + struct sockaddr src_addr; + u8 src_pad[sizeof(struct sockaddr_in6) - + sizeof(struct sockaddr)]; + struct sockaddr dst_addr; + u8 dst_pad[sizeof(struct sockaddr_in6) - + sizeof(struct sockaddr)]; + union { + struct ib_addr ibaddr; + } addr; +}; + +struct rdma_route { + struct rdma_addr addr; + struct ib_sa_path_rec *path_rec; + int num_paths; +}; + +struct rdma_cm_event { + enum rdma_cm_event_type event; + int status; + void *private_data; + u8 private_data_len; +}; + +struct rdma_cm_id; + +/** + * rdma_cm_event_handler - Callback used to report user events. + * + * Notes: Users may not call rdma_destroy_id from this callback to destroy + * the passed in id, or a corresponding listen id. Returning a + * non-zero value from the callback will destroy the corresponding id. + */ +typedef int (*rdma_cm_event_handler)(struct rdma_cm_id *id, + struct rdma_cm_event *event); + +struct rdma_cm_id { + struct ib_device *device; + void *context; + struct ib_qp *qp; + rdma_cm_event_handler event_handler; + struct rdma_route route; + u8 port_num; +}; + +struct rdma_cm_id* rdma_create_id(rdma_cm_event_handler event_handler, + void *context); + +void rdma_destroy_id(struct rdma_cm_id *id); + +/** + * rdma_bind_addr - Bind an RDMA identifier to a source address and + * associated RDMA device, if needed. + * + * @id: RDMA identifier. + * @addr: Local address information. Wildcard values are permitted. + * + * This associates a source address with the RDMA identifier before calling + * rdma_listen. 
If a specific local address is given, the RDMA identifier will + * be bound to a local RDMA device. + */ +int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr); + +/** + * rdma_resolve_addr - Resolve destination and optional source addresses + * from IP addresses to an RDMA address. If successful, the specified + * rdma_cm_id will be bound to a local device. + * + * @id: RDMA identifier. + * @src_addr: Source address information. This parameter may be NULL. + * @dst_addr: Destination address information. + * @timeout_ms: Time to wait for resolution to complete. + */ +int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + struct sockaddr *dst_addr, int timeout_ms); + +/** + * rdma_resolve_route - Resolve the RDMA address bound to the RDMA identifier + * into route information needed to establish a connection. + * + * This is called on the client side of a connection. + * Users must have first called rdma_resolve_addr to resolve a dst_addr + * into an RDMA address before calling this routine. + */ +int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms); + +/** + * rdma_create_qp - Allocate a QP and associate it with the specified RDMA + * identifier. + * + * QPs allocated to an rdma_cm_id will automatically be transitioned by the CMA + * through their states. + */ +int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr); + +/** + * rdma_destroy_qp - Deallocate the QP associated with the specified RDMA + * identifier. + * + * Users must destroy any QP associated with an RDMA identifier before + * destroying the RDMA ID. + */ +void rdma_destroy_qp(struct rdma_cm_id *id); + +/** + * rdma_init_qp_attr - Initializes the QP attributes for use in transitioning + * to a specified QP state. + * @id: Communication identifier associated with the QP attributes to + * initialize. + * @qp_attr: On input, specifies the desired QP state. 
On output, the + * mandatory and desired optional attributes will be set in order to + * modify the QP to the specified state. + * @qp_attr_mask: The QP attribute mask that may be used to transition the + * QP to the specified state. + * + * Users must set the @qp_attr->qp_state to the desired QP state. This call + * will set all required attributes for the given transition, along with + * known optional attributes. Users may override the attributes returned from + * this call before calling ib_modify_qp. + * + * Users that wish to have their QP automatically transitioned through its + * states can associate a QP with the rdma_cm_id by calling rdma_create_qp(). + */ +int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, + int *qp_attr_mask); + +struct rdma_conn_param { + const void *private_data; + u8 private_data_len; + u8 responder_resources; + u8 initiator_depth; + u8 flow_control; + u8 retry_count; /* ignored when accepting */ + u8 rnr_retry_count; + /* Fields below ignored if a QP is created on the rdma_cm_id. */ + u8 srq; + u32 qp_num; + enum ib_qp_type qp_type; +}; + +/** + * rdma_connect - Initiate an active connection request. + * + * Users must have resolved a route for the rdma_cm_id to connect with + * by having called rdma_resolve_route before calling this routine. + */ +int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param); + +/** + * rdma_listen - This function is called by the passive side to + * listen for incoming connection requests. + * + * Users must have bound the rdma_cm_id to a local address by calling + * rdma_bind_addr before calling this routine. + */ +int rdma_listen(struct rdma_cm_id *id, int backlog); + +/** + * rdma_accept - Called to accept a connection request or response. + * @id: Connection identifier associated with the request. + * @conn_param: Information needed to establish the connection. This must be + * provided if accepting a connection request. 
If accepting a connection + * response, this parameter must be NULL. + * + * Typically, this routine is only called by the listener to accept a connection + * request. It must also be called on the active side of a connection if the + * user is performing their own QP transitions. + */ +int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param); + +/** + * rdma_reject - Called on the passive side to reject a connection request. + */ +int rdma_reject(struct rdma_cm_id *id, const void *private_data, + u8 private_data_len); + +/** + * rdma_disconnect - This function disconnects the associated QP. + */ +int rdma_disconnect(struct rdma_cm_id *id); + +#endif /* RDMA_CM_H */ + diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/include/rdma/rdma_user_cm.h linux-2.6.14.3-RPCRDMA/include/rdma/rdma_user_cm.h --- linux-2.6.14.3/include/rdma/rdma_user_cm.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/include/rdma/rdma_user_cm.h 2005-12-08 11:16:53.000000000 -0500 @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2005 Intel Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RDMA_USER_CM_H +#define RDMA_USER_CM_H + +#include <linux/types.h> +#include <linux/in6.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_user_sa.h> + +#define RDMA_USER_CM_ABI_VERSION 1 + +#define RDMA_MAX_PRIVATE_DATA 256 + +enum { + RDMA_USER_CM_CMD_CREATE_ID, + RDMA_USER_CM_CMD_DESTROY_ID, + RDMA_USER_CM_CMD_BIND_ADDR, + RDMA_USER_CM_CMD_RESOLVE_ADDR, + RDMA_USER_CM_CMD_RESOLVE_ROUTE, + RDMA_USER_CM_CMD_QUERY_ROUTE, + RDMA_USER_CM_CMD_CONNECT, + RDMA_USER_CM_CMD_LISTEN, + RDMA_USER_CM_CMD_ACCEPT, + RDMA_USER_CM_CMD_REJECT, + RDMA_USER_CM_CMD_DISCONNECT, + RDMA_USER_CM_CMD_INIT_QP_ATTR, + RDMA_USER_CM_CMD_GET_EVENT +}; + +/* + * command ABI structures.
+ */ +struct rdma_ucm_cmd_hdr { + __u32 cmd; + __u16 in; + __u16 out; +}; + +struct rdma_ucm_create_id { + __u64 uid; + __u64 response; +}; + +struct rdma_ucm_create_id_resp { + __u32 id; +}; + +struct rdma_ucm_destroy_id { + __u64 response; + __u32 id; + __u32 reserved; +}; + +struct rdma_ucm_destroy_id_resp { + __u32 events_reported; +}; + +struct rdma_ucm_bind_addr { + __u64 response; + struct sockaddr_in6 addr; + __u32 id; +}; + +struct rdma_ucm_resolve_addr { + struct sockaddr_in6 src_addr; + struct sockaddr_in6 dst_addr; + __u32 id; + __u32 timeout_ms; +}; + +struct rdma_ucm_resolve_route { + __u32 id; + __u32 timeout_ms; +}; + +struct rdma_ucm_query_route { + __u64 response; + __u32 id; + __u32 reserved; +}; + +struct rdma_ucm_query_route_resp { + __u64 node_guid; + struct ib_user_path_rec ib_route[2]; + struct sockaddr_in6 src_addr; + struct sockaddr_in6 dst_addr; + __u32 num_paths; + __u8 port_num; + __u8 reserved[3]; +}; + +struct rdma_ucm_conn_param { + __u32 qp_num; + __u32 qp_type; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; + __u8 private_data_len; + __u8 srq; + __u8 responder_resources; + __u8 initiator_depth; + __u8 flow_control; + __u8 retry_count; + __u8 rnr_retry_count; + __u8 valid; +}; + +struct rdma_ucm_connect { + struct rdma_ucm_conn_param conn_param; + __u32 id; + __u32 reserved; +}; + +struct rdma_ucm_listen { + __u32 id; + __u32 backlog; +}; + +struct rdma_ucm_accept { + __u64 uid; + struct rdma_ucm_conn_param conn_param; + __u32 id; + __u32 reserved; +}; + +struct rdma_ucm_reject { + __u32 id; + __u8 private_data_len; + __u8 reserved[3]; + __u8 private_data[RDMA_MAX_PRIVATE_DATA]; +}; + +struct rdma_ucm_disconnect { + __u32 id; +}; + +struct rdma_ucm_init_qp_attr { + __u64 response; + __u32 id; + __u32 qp_state; +}; + +struct rdma_ucm_get_event { + __u64 response; +}; + +struct rdma_ucm_event_resp { + __u64 uid; + __u32 id; + __u32 event; + __u32 status; + __u8 private_data_len; + __u8 reserved[3]; + __u8 
private_data[RDMA_MAX_PRIVATE_DATA]; +}; + +#endif /* RDMA_USER_CM_H */ diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/net/ipv4/fib_frontend.c linux-2.6.14.3-RPCRDMA/net/ipv4/fib_frontend.c --- linux-2.6.14.3/net/ipv4/fib_frontend.c 2005-11-24 17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/net/ipv4/fib_frontend.c 2005-12-08 11:25:14.000000000 -0500 @@ -661,4 +661,5 @@ void __init ip_fib_init(void) } EXPORT_SYMBOL(inet_addr_type); +EXPORT_SYMBOL(ip_dev_find); EXPORT_SYMBOL(ip_rt_ioctl); diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/net/sunrpc/Makefile linux-2.6.14.3-RPCRDMA/net/sunrpc/Makefile --- linux-2.6.14.3/net/sunrpc/Makefile 2005-11-24 17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/net/sunrpc/Makefile 2005-12-08 11:51:27.000000000 -0500 @@ -2,6 +2,7 @@ # Makefile for Linux kernel SUN RPC # +EXTRA_CFLAGS += -Idrivers/infiniband/ulp/kdapl/ obj-$(CONFIG_SUNRPC) += sunrpc.o obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ @@ -10,6 +11,8 @@ sunrpc-y := clnt.o xprt.o sched.o \ auth.o auth_null.o auth_unix.o \ svc.o svcsock.o svcauth.o svcauth_unix.o \ pmap_clnt.o timer.o xdr.o \ - sunrpc_syms.o cache.o rpc_pipe.o + sunrpc_syms.o cache.o rpc_pipe.o \ + svcxprt_rdma.o svcxprt_rdma_kdapl.o sunrpc-$(CONFIG_PROC_FS) += stats.o sunrpc-$(CONFIG_SYSCTL) += sysctl.o +sunrpc-$(CONFIG_SUNRPC_RDMA) += svcxprt_rdma.o svcxprt_rdma_kdapl.o diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/net/sunrpc/svc.c linux-2.6.14.3-RPCRDMA/net/sunrpc/svc.c --- linux-2.6.14.3/net/sunrpc/svc.c 2005-11-24 17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/net/sunrpc/svc.c 2005-12-08 11:50:23.000000000 -0500 @@ -56,8 +56,8 @@ svc_create(struct svc_program *prog, uns serv->sv_xdrsize = xdrsize; INIT_LIST_HEAD(&serv->sv_threads); INIT_LIST_HEAD(&serv->sv_sockets); - 
INIT_LIST_HEAD(&serv->sv_tempsocks); - INIT_LIST_HEAD(&serv->sv_permsocks); + INIT_LIST_HEAD(&serv->sv_tempxprts); + INIT_LIST_HEAD(&serv->sv_permxprts); spin_lock_init(&serv->sv_lock); /* Remove any stale portmap registrations */ @@ -72,7 +72,7 @@ svc_create(struct svc_program *prog, uns void svc_destroy(struct svc_serv *serv) { - struct svc_sock *svsk; + struct svc_xprt *xprt; dprintk("RPC: svc_destroy(%s, %d)\n", serv->sv_program->pg_name, @@ -86,17 +86,28 @@ svc_destroy(struct svc_serv *serv) } else printk("svc_destroy: no threads for serv=%p!\n", serv); - while (!list_empty(&serv->sv_tempsocks)) { - svsk = list_entry(serv->sv_tempsocks.next, - struct svc_sock, + while (!list_empty(&serv->sv_tempxprts)) { + xprt = list_entry(serv->sv_tempxprts.next, + struct svc_xprt, sk_list); - svc_delete_socket(svsk); + dprintk("%s: deleting temp xprt %p\n", __FUNCTION__, xprt); + if (xprt->sx_delete) { + xprt->sx_delete(xprt); + } else { + dprintk("%s: xprt->sx_delete is NULL\n", __FUNCTION__); + } } - while (!list_empty(&serv->sv_permsocks)) { - svsk = list_entry(serv->sv_permsocks.next, - struct svc_sock, - sk_list); - svc_delete_socket(svsk); + + while (!list_empty(&serv->sv_permxprts)) { + xprt = list_entry(serv->sv_permxprts.next, + struct svc_xprt, + sk_list); + dprintk("%s: deleting perm xprt %p\n", __FUNCTION__, xprt); + if (xprt->sx_delete) { + xprt->sx_delete(xprt); + } else { + dprintk("%s: xprt->sx_delete is NULL\n", __FUNCTION__); + } } cache_clean_deferred(serv); @@ -273,18 +284,20 @@ svc_process(struct svc_serv *serv, struc if (argv->iov_len < 6*4) goto err_short_len; - /* setup response xdr_buf. 
- * Initially it has just one page - */ - svc_take_page(rqstp); /* must succeed */ - resv->iov_base = page_address(rqstp->rq_respages[0]); - resv->iov_len = 0; - rqstp->rq_res.pages = rqstp->rq_respages+1; - rqstp->rq_res.len = 0; - rqstp->rq_res.page_base = 0; - rqstp->rq_res.page_len = 0; - rqstp->rq_res.buflen = PAGE_SIZE; - rqstp->rq_res.tail[0].iov_len = 0; + if (!rqstp->rq_sock->sx_manages_buffers) { + /* setup response xdr_buf. + * Initially it has just one page + */ + svc_take_page(rqstp); /* must succeed */ + resv->iov_base = page_address(rqstp->rq_respages[0]); + resv->iov_len = 0; + rqstp->rq_res.pages = rqstp->rq_respages+1; + rqstp->rq_res.len = 0; + rqstp->rq_res.page_base = 0; + rqstp->rq_res.page_len = 0; + rqstp->rq_res.tail[0].iov_len = 0; + } + /* tcp needs a space for the record length... */ if (rqstp->rq_prot == IPPROTO_TCP) svc_putu32(resv, 0); @@ -324,6 +337,11 @@ svc_process(struct svc_serv *serv, struc auth_stat = rpc_autherr_badcred; auth_res = progp->pg_authenticate(rqstp); } + + if (auth_res != SVC_OK) { + dprintk("%s: auth_res = %u\n", __FUNCTION__, auth_res); + } + switch (auth_res) { case SVC_OK: break; diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/net/sunrpc/svcsock.c linux-2.6.14.3-RPCRDMA/net/sunrpc/svcsock.c --- linux-2.6.14.3/net/sunrpc/svcsock.c 2005-11-24 17:10:21.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/net/sunrpc/svcsock.c 2005-12-08 11:50:23.000000000 -0500 @@ -43,6 +43,10 @@ #include #include +/* XXX move to module? */ +#include + + /* SMP locking strategy: * * svc_serv->sv_lock protects most stuff for that service. 
@@ -63,13 +67,13 @@ #define RPCDBG_FACILITY RPCDBG_SVCSOCK -static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, +static struct svc_xprt *svc_setup_socket(struct svc_serv *, struct socket *, int *errp, int pmap_reg); static void svc_udp_data_ready(struct sock *, int); static int svc_udp_recvfrom(struct svc_rqst *); static int svc_udp_sendto(struct svc_rqst *); -static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk); +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *svsk); static int svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); @@ -119,7 +123,7 @@ svc_release_skb(struct svc_rqst *rqstp) * Any space to write? */ static inline unsigned long -svc_sock_wspace(struct svc_sock *svsk) +svc_sock_wspace(struct svc_xprt *svsk) { int wspace; @@ -131,22 +135,67 @@ svc_sock_wspace(struct svc_sock *svsk) return wspace; } +/** + * svc_sock_has_write_space - Checks if there is enough space for the response + * @xprt: the transport to write on + * @wspace: the number of bytes available for writing + */ +static inline int +svc_sock_has_write_space(struct svc_xprt *xprt, long int wspace) { + struct svc_serv *serv = xprt->sx_server; + + set_bit(SOCK_NOSPACE, &xprt->sk_sock->flags); + if (((xprt->sk_reserved + serv->sv_bufsz)*2 > wspace) + && !test_bit(SK_CLOSE, &xprt->sk_flags) + && !test_bit(SK_CONN, &xprt->sk_flags)) { + /* Don't enqueue while not enough space for reply */ + dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n", + xprt->sk_sk, xprt->sk_reserved+serv->sv_bufsz, wspace); + return 0; + } + clear_bit(SOCK_NOSPACE, &xprt->sk_sock->flags); + + return 1; +} + +/** + * svc_tcp_has_write_space - Checks if there is enough space for the response + * @xprt: The transport instance the reply will be written to + **/ +static inline int +svc_tcp_has_write_space(struct svc_xprt *xprt) { + return svc_sock_has_write_space(xprt,
sk_stream_wspace(xprt->sk_sk)); +} + +/** + * svc_udp_has_write_space - Checks if there is enough space for the response + * @xprt: The transport instance the reply will be written to + **/ +static inline int +svc_udp_has_write_space(struct svc_xprt *xprt) { + return svc_sock_has_write_space(xprt, sock_wspace(xprt->sk_sk)); +} + /* * Queue up a socket with data pending. If there are idle nfsd * processes, wake 'em up. * */ -static void -svc_sock_enqueue(struct svc_sock *svsk) +void +svc_sock_enqueue(struct svc_xprt *xprt) { - struct svc_serv *serv = svsk->sk_server; + struct svc_serv *serv = xprt->sx_server; struct svc_rqst *rqstp; - if (!(svsk->sk_flags & + dprintk("%s: xprt = %p, serv = %p\n", __FUNCTION__, xprt, serv); + + if (!(xprt->sk_flags & ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) )) return; - if (test_bit(SK_DEAD, &svsk->sk_flags)) + if (test_bit(SK_DEAD, &xprt->sk_flags)) { + dprintk("%s: xprt is dead, will not be enqueued\n", __FUNCTION__); return; + } spin_lock_bh(&serv->sv_lock); @@ -155,56 +204,47 @@ svc_sock_enqueue(struct svc_sock *svsk) printk(KERN_ERR "svc_sock_enqueue: threads and sockets both waiting??\n"); - if (test_bit(SK_DEAD, &svsk->sk_flags)) { + if (test_bit(SK_DEAD, &xprt->sk_flags)) { /* Don't enqueue dead sockets */ - dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk); + dprintk("svc: socket %p is dead, not enqueued\n", xprt->sk_sk); goto out_unlock; } - if (test_bit(SK_BUSY, &svsk->sk_flags)) { + if (test_bit(SK_BUSY, &xprt->sk_flags)) { /* Don't enqueue socket while daemon is receiving */ - dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk); + dprintk("svc: socket %p busy, not enqueued\n", xprt->sk_sk); goto out_unlock; } - set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - if (((svsk->sk_reserved + serv->sv_bufsz)*2 - > svc_sock_wspace(svsk)) - && !test_bit(SK_CLOSE, &svsk->sk_flags) - && !test_bit(SK_CONN, &svsk->sk_flags)) { - /* Don't enqueue while not enough space for reply */ - dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n", - svsk->sk_sk,
svsk->sk_reserved+serv->sv_bufsz, - svc_sock_wspace(svsk)); + if (xprt->sx_has_wspace && !xprt->sx_has_wspace(xprt)) { goto out_unlock; } - clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); /* Mark socket as busy. It will remain in this state until the * server has processed all pending data and put the socket back * on the idle list. */ - set_bit(SK_BUSY, &svsk->sk_flags); + set_bit(SK_BUSY, &xprt->sk_flags); if (!list_empty(&serv->sv_threads)) { rqstp = list_entry(serv->sv_threads.next, struct svc_rqst, rq_list); dprintk("svc: socket %p served by daemon %p\n", - svsk->sk_sk, rqstp); + xprt->sk_sk, rqstp); svc_serv_dequeue(serv, rqstp); if (rqstp->rq_sock) printk(KERN_ERR "svc_sock_enqueue: server %p, rq_sock=%p!\n", rqstp, rqstp->rq_sock); - rqstp->rq_sock = svsk; - svsk->sk_inuse++; + rqstp->rq_sock = xprt; + xprt->sk_inuse++; rqstp->rq_reserved = serv->sv_bufsz; - svsk->sk_reserved += rqstp->rq_reserved; + xprt->sk_reserved += rqstp->rq_reserved; wake_up(&rqstp->rq_wait); } else { - dprintk("svc: socket %p put into queue\n", svsk->sk_sk); - list_add_tail(&svsk->sk_ready, &serv->sv_sockets); + dprintk("svc: socket %p put into queue\n", xprt->sk_sk); + list_add_tail(&xprt->sk_ready, &serv->sv_sockets); } out_unlock: @@ -214,16 +254,16 @@ out_unlock: /* * Dequeue the first socket. Must be called with the serv->sv_lock held. */ -static inline struct svc_sock * +static inline struct svc_xprt * svc_sock_dequeue(struct svc_serv *serv) { - struct svc_sock *svsk; + struct svc_xprt *svsk; if (list_empty(&serv->sv_sockets)) return NULL; svsk = list_entry(serv->sv_sockets.next, - struct svc_sock, sk_ready); + struct svc_xprt, sk_ready); list_del_init(&svsk->sk_ready); dprintk("svc: socket %p dequeued, inuse=%d\n", @@ -239,7 +279,7 @@ svc_sock_dequeue(struct svc_serv *serv) * no (or insufficient) data. 
*/ static inline void -svc_sock_received(struct svc_sock *svsk) +svc_sock_received(struct svc_xprt *svsk) { clear_bit(SK_BUSY, &svsk->sk_flags); svc_sock_enqueue(svsk); @@ -261,11 +301,11 @@ void svc_reserve(struct svc_rqst *rqstp, space += rqstp->rq_res.head[0].iov_len; if (space < rqstp->rq_reserved) { - struct svc_sock *svsk = rqstp->rq_sock; - spin_lock_bh(&svsk->sk_server->sv_lock); + struct svc_xprt *svsk = rqstp->rq_sock; + spin_lock_bh(&svsk->sx_server->sv_lock); svsk->sk_reserved -= (rqstp->rq_reserved - space); rqstp->rq_reserved = space; - spin_unlock_bh(&svsk->sk_server->sv_lock); + spin_unlock_bh(&svsk->sx_server->sv_lock); svc_sock_enqueue(svsk); } @@ -275,9 +315,9 @@ void svc_reserve(struct svc_rqst *rqstp, * Release a socket after use. */ static inline void -svc_sock_put(struct svc_sock *svsk) +svc_sock_put(struct svc_xprt *svsk) { - struct svc_serv *serv = svsk->sk_server; + struct svc_serv *serv = svsk->sx_server; spin_lock_bh(&serv->sv_lock); if (!--(svsk->sk_inuse) && test_bit(SK_DEAD, &svsk->sk_flags)) { @@ -290,10 +330,20 @@ svc_sock_put(struct svc_sock *svsk) spin_unlock_bh(&serv->sv_lock); } +/** + * svc_xprt_release - prepares thread context for processing next request + * @rqstp: context of a server thread containing request/reply data + * + * This function removes the association between a particular server thread + * and a particular transport instance as well as returning the server + * thread context to a pristine state for handling the next request. 
+ */ static void -svc_sock_release(struct svc_rqst *rqstp) +svc_xprt_release(struct svc_rqst *rqstp) { - struct svc_sock *svsk = rqstp->rq_sock; + struct svc_xprt *svsk = rqstp->rq_sock; + + dprintk("%s: rqstp = %p\n", __FUNCTION__, rqstp); svc_release_skb(rqstp); @@ -316,7 +366,7 @@ svc_sock_release(struct svc_rqst *rqstp) svc_reserve(rqstp, 0); rqstp->rq_sock = NULL; - svc_sock_put(svsk); + if (svsk->sx_put) svsk->sx_put(svsk); } /* @@ -348,7 +398,7 @@ svc_wake_up(struct svc_serv *serv) static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) { - struct svc_sock *svsk = rqstp->rq_sock; + struct svc_xprt *svsk = rqstp->rq_sock; struct socket *sock = svsk->sk_sock; int slen; char buffer[CMSG_SPACE(sizeof(struct in_pktinfo))]; @@ -432,7 +482,7 @@ out: * Check input queue length */ static int -svc_recv_available(struct svc_sock *svsk) +svc_recv_available(struct svc_xprt *svsk) { mm_segment_t oldfs; struct socket *sock = svsk->sk_sock; @@ -468,7 +518,7 @@ svc_recvfrom(struct svc_rqst *rqstp, str len = kernel_recvmsg(sock, &msg, iov, nr, buflen, MSG_DONTWAIT); /* sock_recvmsg doesn't fill in the name/namelen, so we must.. - * possibly we should cache this in the svc_sock structure + * possibly we should cache this in the svc_xprt structure * at accept time. 
FIXME */ alen = sizeof(rqstp->rq_addr); @@ -512,7 +562,7 @@ svc_sock_setbufsize(struct socket *sock, static void svc_udp_data_ready(struct sock *sk, int count) { - struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; + struct svc_xprt *svsk = (struct svc_xprt *)sk->sk_user_data; if (svsk) { dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", @@ -530,7 +580,7 @@ svc_udp_data_ready(struct sock *sk, int static void svc_write_space(struct sock *sk) { - struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); + struct svc_xprt *svsk = (struct svc_xprt *)(sk->sk_user_data); if (svsk) { dprintk("svc: socket %p(inet %p), write_space busy=%d\n", @@ -554,8 +604,8 @@ csum_partial_copy_to_xdr(struct xdr_buf static int svc_udp_recvfrom(struct svc_rqst *rqstp) { - struct svc_sock *svsk = rqstp->rq_sock; - struct svc_serv *serv = svsk->sk_server; + struct svc_xprt *svsk = rqstp->rq_sock; + struct svc_serv *serv = svsk->sx_server; struct sk_buff *skb; int err, len; @@ -665,20 +715,23 @@ svc_udp_sendto(struct svc_rqst *rqstp) } static void -svc_udp_init(struct svc_sock *svsk) +svc_udp_init(struct svc_xprt *svsk) { svsk->sk_sk->sk_data_ready = svc_udp_data_ready; svsk->sk_sk->sk_write_space = svc_write_space; - svsk->sk_recvfrom = svc_udp_recvfrom; - svsk->sk_sendto = svc_udp_sendto; + svsk->sx_sendto = svc_udp_sendto; + svsk->sx_recvfrom = svc_udp_recvfrom; + svsk->sx_delete = svc_delete_socket; + svsk->sx_put = svc_sock_put; + svsk->sx_has_wspace = svc_udp_has_write_space; /* initialise setting must have enough space to * receive and respond to one request. 
* svc_udp_recvfrom will re-adjust if necessary */ svc_sock_setbufsize(svsk->sk_sock, - 3 * svsk->sk_server->sv_bufsz, - 3 * svsk->sk_server->sv_bufsz); + 3 * svsk->sx_server->sv_bufsz, + 3 * svsk->sx_server->sv_bufsz); set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */ set_bit(SK_CHNGBUF, &svsk->sk_flags); @@ -691,7 +744,7 @@ svc_udp_init(struct svc_sock *svsk) static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused) { - struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; + struct svc_xprt *svsk = (struct svc_xprt *)sk->sk_user_data; dprintk("svc: socket %p TCP (listen) state change %d\n", sk, sk->sk_state); @@ -724,7 +777,7 @@ svc_tcp_listen_data_ready(struct sock *s static void svc_tcp_state_change(struct sock *sk) { - struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; + struct svc_xprt *svsk = (struct svc_xprt *)sk->sk_user_data; dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n", sk, sk->sk_state, sk->sk_user_data); @@ -742,7 +795,7 @@ svc_tcp_state_change(struct sock *sk) static void svc_tcp_data_ready(struct sock *sk, int count) { - struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; + struct svc_xprt *svsk = (struct svc_xprt *)sk->sk_user_data; dprintk("svc: socket %p TCP data ready (svsk %p)\n", sk, sk->sk_user_data); @@ -758,14 +811,14 @@ svc_tcp_data_ready(struct sock *sk, int * Accept a TCP connection */ static void -svc_tcp_accept(struct svc_sock *svsk) +svc_tcp_accept(struct svc_xprt *svsk) { struct sockaddr_in sin; - struct svc_serv *serv = svsk->sk_server; + struct svc_serv *serv = svsk->sx_server; struct socket *sock = svsk->sk_sock; struct socket *newsock; struct proto_ops *ops; - struct svc_sock *newsvsk; + struct svc_xprt *newsvsk; int err, slen; dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); @@ -834,12 +887,12 @@ svc_tcp_accept(struct svc_sock *svsk) * * The only somewhat efficient mechanism would be if drop * old connections from the same IP 
first. But right now - * we don't even record the client IP in svc_sock. + * we don't even record the client IP in svc_xprt. */ if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { - struct svc_sock *svsk = NULL; + struct svc_xprt *svsk = NULL; spin_lock_bh(&serv->sv_lock); - if (!list_empty(&serv->sv_tempsocks)) { + if (!list_empty(&serv->sv_tempxprts)) { if (net_ratelimit()) { /* Try to help the admin */ printk(KERN_NOTICE "%s: too many open TCP " @@ -856,8 +909,8 @@ svc_tcp_accept(struct svc_sock *svsk) * Always select the oldest socket. It's not fair, * but so is life */ - svsk = list_entry(serv->sv_tempsocks.prev, - struct svc_sock, + svsk = list_entry(serv->sv_tempxprts.prev, + struct svc_xprt, sk_list); set_bit(SK_CLOSE, &svsk->sk_flags); svsk->sk_inuse ++; @@ -866,7 +919,7 @@ svc_tcp_accept(struct svc_sock *svsk) if (svsk) { svc_sock_enqueue(svsk); - svc_sock_put(svsk); + if (svsk->sx_put) svsk->sx_put(svsk); } } @@ -887,8 +940,8 @@ failed: static int svc_tcp_recvfrom(struct svc_rqst *rqstp) { - struct svc_sock *svsk = rqstp->rq_sock; - struct svc_serv *serv = svsk->sk_server; + struct svc_xprt *svsk = rqstp->rq_sock; + struct svc_serv *serv = svsk->sx_server; int len; struct kvec vec[RPCSVC_MAXPAGES]; int pnum, vlen; @@ -1031,7 +1084,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) svc_sock_received(svsk); } else { printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", - svsk->sk_server->sv_name, -len); + svsk->sx_server->sv_name, -len); svc_sock_received(svsk); } @@ -1061,7 +1114,7 @@ svc_tcp_sendto(struct svc_rqst *rqstp) sent = svc_sendto(rqstp, &rqstp->rq_res); if (sent != xbufp->len) { printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", - rqstp->rq_sock->sk_server->sv_name, + rqstp->rq_sock->sx_server->sv_name, (sent<0)?"got error":"sent only", sent, xbufp->len); svc_delete_socket(rqstp->rq_sock); @@ -1071,13 +1124,16 @@ svc_tcp_sendto(struct svc_rqst *rqstp) } static void -svc_tcp_init(struct svc_sock *svsk) 
+svc_tcp_init(struct svc_xprt *svsk) { struct sock *sk = svsk->sk_sk; struct tcp_sock *tp = tcp_sk(sk); - svsk->sk_recvfrom = svc_tcp_recvfrom; - svsk->sk_sendto = svc_tcp_sendto; + svsk->sx_recvfrom = svc_tcp_recvfrom; + svsk->sx_sendto = svc_tcp_sendto; + svsk->sx_delete = svc_delete_socket; + svsk->sx_put = svc_sock_put; + svsk->sx_has_wspace = svc_tcp_has_write_space; if (sk->sk_state == TCP_LISTEN) { dprintk("setting up TCP socket for listening\n"); @@ -1099,8 +1155,8 @@ svc_tcp_init(struct svc_sock *svsk) * svc_tcp_recvfrom will re-adjust if necessary */ svc_sock_setbufsize(svsk->sk_sock, - 3 * svsk->sk_server->sv_bufsz, - 3 * svsk->sk_server->sv_bufsz); + 3 * svsk->sx_server->sv_bufsz, + 3 * svsk->sx_server->sv_bufsz); set_bit(SK_CHNGBUF, &svsk->sk_flags); set_bit(SK_DATA, &svsk->sk_flags); @@ -1119,14 +1175,14 @@ svc_sock_update_bufs(struct svc_serv *se struct list_head *le; spin_lock_bh(&serv->sv_lock); - list_for_each(le, &serv->sv_permsocks) { - struct svc_sock *svsk = - list_entry(le, struct svc_sock, sk_list); + list_for_each(le, &serv->sv_permxprts) { + struct svc_xprt *svsk = + list_entry(le, struct svc_xprt, sk_list); set_bit(SK_CHNGBUF, &svsk->sk_flags); } - list_for_each(le, &serv->sv_tempsocks) { - struct svc_sock *svsk = - list_entry(le, struct svc_sock, sk_list); + list_for_each(le, &serv->sv_tempxprts) { + struct svc_xprt *svsk = + list_entry(le, struct svc_xprt, sk_list); set_bit(SK_CHNGBUF, &svsk->sk_flags); } spin_unlock_bh(&serv->sv_lock); @@ -1138,7 +1194,7 @@ svc_sock_update_bufs(struct svc_serv *se int svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) { - struct svc_sock *svsk =NULL; + struct svc_xprt *svsk =NULL; int len; int pages; struct xdr_buf *arg; @@ -1188,9 +1244,9 @@ svc_recv(struct svc_serv *serv, struct s return -EINTR; spin_lock_bh(&serv->sv_lock); - if (!list_empty(&serv->sv_tempsocks)) { - svsk = list_entry(serv->sv_tempsocks.next, - struct svc_sock, sk_list); + if (!list_empty(&serv->sv_tempxprts)) { + 
svsk = list_entry(serv->sv_tempxprts.next, + struct svc_xprt, sk_list); /* apparently the "standard" is that clients close * idle connections after 5 minutes, servers after * 6 minutes @@ -1222,7 +1278,9 @@ svc_recv(struct svc_serv *serv, struct s add_wait_queue(&rqstp->rq_wait, &wait); spin_unlock_bh(&serv->sv_lock); + dprintk("%s: going to sleep...\n", __FUNCTION__); schedule_timeout(timeout); + dprintk("%s: ...woke up\n", __FUNCTION__); try_to_freeze(); @@ -1240,13 +1298,27 @@ svc_recv(struct svc_serv *serv, struct s dprintk("svc: server %p, socket %p, inuse=%d\n", rqstp, svsk, svsk->sk_inuse); - len = svsk->sk_recvfrom(rqstp); + len = svsk->sx_recvfrom(rqstp); dprintk("svc: got len=%d\n", len); + { + unsigned char *recv_buf = (unsigned char *) rqstp->rq_arg.head[0].iov_base; + int i; /* debug dump: stay inside the head kvec, which may hold fewer than len bytes */ + + for (i = 0; i + 3 < len && i + 3 < (int) rqstp->rq_arg.head[0].iov_len; i += 4) { + dprintk("%p: %.2x %.2x %.2x %.2x\n", + &recv_buf[i], + recv_buf[i], + recv_buf[i + 1], + recv_buf[i + 2], + recv_buf[i + 3]); + } + } + /* No data, incomplete (TCP) read, or accept() */ if (len == 0 || len == -EAGAIN) { rqstp->rq_res.len = 0; - svc_sock_release(rqstp); + svc_xprt_release(rqstp); return -EAGAIN; } svsk->sk_lastrecv = get_seconds(); @@ -1254,7 +1326,7 @@ svc_recv(struct svc_serv *serv, struct s /* push active sockets to end of list */ spin_lock_bh(&serv->sv_lock); if (!list_empty(&svsk->sk_list)) - list_move_tail(&svsk->sk_list, &serv->sv_tempsocks); + list_move_tail(&svsk->sk_list, &serv->sv_tempxprts); spin_unlock_bh(&serv->sv_lock); } @@ -1273,7 +1345,7 @@ void svc_drop(struct svc_rqst *rqstp) { dprintk("svc: socket %p dropped request\n", rqstp->rq_sock); - svc_sock_release(rqstp); + svc_xprt_release(rqstp); } /* @@ -1282,7 +1354,7 @@ svc_drop(struct svc_rqst *rqstp) int svc_send(struct svc_rqst *rqstp) { - struct svc_sock *svsk; + struct svc_xprt *svsk; int len; struct xdr_buf *xb; @@ -1301,29 +1373,77 @@ svc_send(struct svc_rqst *rqstp) xb->page_len + xb->tail[0].iov_len; + { + unsigned char *buf; + int i; + + dprintk("%s: 
response length = %u (%u head + %u page + %u tail)\n", + __FUNCTION__, + xb->len, xb->head[0].iov_len, xb->page_len, xb->tail[0].iov_len); + + buf = xb->head[0].iov_base; + dprintk("%s: reply head:\n", __FUNCTION__); + for (i = 0; i + 3 < xb->head[0].iov_len; i += 4) { + dprintk("%p: %.2x %.2x %.2x %.2x\n", + &buf[i], + buf[i], + buf[i + 1], + buf[i + 2], + buf[i + 3]); + } + + if (xb->page_len != 0) { + buf = page_address(xb->pages[0]) + xb->page_base; /* only the first page is dumped; later pages are not contiguous with it */ + dprintk("%s: reply page data:\n", __FUNCTION__); + for (i = 0; i + 3 < xb->page_len && xb->page_base + i + 3 < PAGE_SIZE; i += 4) { + dprintk("%p: %.2x %.2x %.2x %.2x\n", + &buf[i], + buf[i], + buf[i + 1], + buf[i + 2], + buf[i + 3]); + } + } + + if (xb->tail[0].iov_len != 0) { + buf = xb->tail[0].iov_base; + dprintk("%s: reply tail:\n", __FUNCTION__); + for (i = 0; i + 3 < xb->tail[0].iov_len; i += 4) { + dprintk("%p: %.2x %.2x %.2x %.2x\n", + &buf[i], + buf[i], + buf[i + 1], + buf[i + 2], + buf[i + 3]); + } + } + + } + /* Grab svsk->sk_sem to serialize outgoing data. */ down(&svsk->sk_sem); if (test_bit(SK_DEAD, &svsk->sk_flags)) len = -ENOTCONN; else - len = svsk->sk_sendto(rqstp); + len = svsk->sx_sendto(rqstp); up(&svsk->sk_sem); - svc_sock_release(rqstp); + svc_xprt_release(rqstp); if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) return 0; + dprintk("%s: sent %d bytes\n", __FUNCTION__, len); return len; } /* - * Initialize socket for RPC use and create svc_sock struct + * Initialize socket for RPC use and create svc_xprt struct * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. 
*/ -static struct svc_sock * +static struct svc_xprt * svc_setup_socket(struct svc_serv *serv, struct socket *sock, int *errp, int pmap_register) { - struct svc_sock *svsk; + struct svc_xprt *svsk; struct sock *inet; dprintk("svc: svc_setup_socket %p\n", sock); @@ -1352,7 +1472,7 @@ svc_setup_socket(struct svc_serv *serv, svsk->sk_ostate = inet->sk_state_change; svsk->sk_odata = inet->sk_data_ready; svsk->sk_owspace = inet->sk_write_space; - svsk->sk_server = serv; + svsk->sx_server = serv; svsk->sk_lastrecv = get_seconds(); INIT_LIST_HEAD(&svsk->sk_deferred); INIT_LIST_HEAD(&svsk->sk_ready); @@ -1367,11 +1487,11 @@ svc_setup_socket(struct svc_serv *serv, spin_lock_bh(&serv->sv_lock); if (!pmap_register) { set_bit(SK_TEMP, &svsk->sk_flags); - list_add(&svsk->sk_list, &serv->sv_tempsocks); + list_add(&svsk->sk_list, &serv->sv_tempxprts); serv->sv_tmpcnt++; } else { clear_bit(SK_TEMP, &svsk->sk_flags); - list_add(&svsk->sk_list, &serv->sv_permsocks); + list_add(&svsk->sk_list, &serv->sv_permxprts); } spin_unlock_bh(&serv->sv_lock); @@ -1389,7 +1509,7 @@ svc_setup_socket(struct svc_serv *serv, static int svc_create_socket(struct svc_serv *serv, int protocol, struct sockaddr_in *sin) { - struct svc_sock *svsk; + struct svc_xprt *svsk; struct socket *sock; int error; int type; @@ -1436,14 +1556,14 @@ bummer: * Remove a dead socket */ void -svc_delete_socket(struct svc_sock *svsk) +svc_delete_socket(struct svc_xprt *svsk) { struct svc_serv *serv; struct sock *sk; dprintk("svc: svc_delete_socket(%p)\n", svsk); - serv = svsk->sk_server; + serv = svsk->sx_server; sk = svsk->sk_sk; sk->sk_state_change = svsk->sk_ostate; @@ -1465,23 +1585,60 @@ svc_delete_socket(struct svc_sock *svsk) } else { spin_unlock_bh(&serv->sv_lock); dprintk(KERN_NOTICE "svc: server socket destroy delayed\n"); - /* svsk->sk_server = NULL; */ + /* svsk->sx_server = NULL; */ } } /* * Make a socket for nfsd and lockd + * @deprecated switch to svc_makexprt */ int svc_makesock(struct svc_serv *serv, int 
protocol, unsigned short port) { struct sockaddr_in sin; - dprintk("svc: creating socket proto = %d\n", protocol); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = INADDR_ANY; - sin.sin_port = htons(port); - return svc_create_socket(serv, protocol, &sin); +/* dprintk("svc: creating socket proto = %d\n", protocol); */ +/* sin.sin_family = AF_INET; */ +/* sin.sin_addr.s_addr = INADDR_ANY; */ +/* sin.sin_port = htons(port); */ +/* return svc_create_socket(serv, protocol, &sin); */ + + + if (protocol == IPPROTO_UDP || protocol == IPPROTO_TCP) { + dprintk("svc: creating socket proto = %d\n", protocol); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = INADDR_ANY; + sin.sin_port = htons(port); + return svc_create_socket(serv, protocol, &sin); + } else { /* XXX need RDMA protocol constant */ + dprintk("svc: creating RDMA transport\n"); + return svc_create_rdma_xprt(serv, protocol, htons(port)); + } +} + +/* + * Make a transport instance for nfsd and lockd + * @param serv the RPC service this instance will belong to + * @param protocol the protocol for the instance + * @param port the port to listen on (host byte order; converted to network order for the transport) + * @return 0 on success, negative value for errors + */ +int +svc_makexprt(struct svc_serv *serv, int protocol, unsigned short port) +{ + struct sockaddr_in sin; + + if (protocol == IPPROTO_UDP || protocol == IPPROTO_TCP) { + dprintk("svc: creating socket proto = %d\n", protocol); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = INADDR_ANY; + sin.sin_port = htons(port); + return svc_create_socket(serv, protocol, &sin); + } else { /* XXX need RDMA protocol constant */ + dprintk("svc: creating RDMA transport\n"); + return svc_create_rdma_xprt(serv, protocol, htons(port)); + } +} /* @@ -1492,10 +1649,10 @@ static void svc_revisit(struct cache_def { struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); struct svc_serv *serv = dreq->owner; - struct svc_sock *svsk; + struct svc_xprt *svsk; if (too_many) { - svc_sock_put(dr->svsk); + if 
(dr->svsk->sx_put) dr->svsk->sx_put(dr->svsk); kfree(dr); return; } @@ -1507,7 +1664,7 @@ static void svc_revisit(struct cache_def spin_unlock_bh(&serv->sv_lock); set_bit(SK_DEFERRED, &svsk->sk_flags); svc_sock_enqueue(svsk); - svc_sock_put(svsk); + if (svsk->sx_put) svsk->sx_put(svsk); } static struct cache_deferred_req * @@ -1561,10 +1718,10 @@ static int svc_deferred_recv(struct svc_ } -static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *svsk) { struct svc_deferred_req *dr = NULL; - struct svc_serv *serv = svsk->sk_server; + struct svc_serv *serv = svsk->sx_server; if (!test_bit(SK_DEFERRED, &svsk->sk_flags)) return NULL; diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/net/sunrpc/svcxprt_rdma.c linux-2.6.14.3-RPCRDMA/net/sunrpc/svcxprt_rdma.c --- linux-2.6.14.3/net/sunrpc/svcxprt_rdma.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/net/sunrpc/svcxprt_rdma.c 2005-12-08 12:37:37.000000000 -0500 @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan + * All rights reserved. + * + * Permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. 
+ * + * This software is provided as is, without representation from the + * University of Michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. The Regents + * of the University of Michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + * + * RDMA Interface layer for RPC + * This file uses kDAPL-specific knowledge to implement an abstract transport + * implementation for RPC to use without needing to include kDAPL information. + */ + +#if 1 /* JFL: needed for spin_lock_bh() and spin_unlock_bh() */ +#include +#endif +#include +#include /* change to svcxprt.h later */ +#ifdef KDAPL12 +#include +#else +#include +#endif +#include +#include +#include + +# define RPCDBG_FACILITY RPCDBG_XPRT + +static struct rdma_ia default_ia = { .ri_initialized = 0 }; + +/* XXX these are hard-wired for now */ +/* JFL: OpenIB kDAPL provider is called mthcaXy where */ +/* X=HCA number=0,1,... and y=port number=a,b,... 
*/ +static const DAT_NAME_PTR ia_name = "mthca0a"; +/* static const DAT_NAME_PTR ia_name = "ccil0"; */ +static DAT_COUNT evd_min_queue_length = 4; + +/** + * svc_rdma_ia_destroy - Remove the RDMA IA transport from the RPC service + * @xprt: the transport instance representing the RDMA IA + * + * Does a clean teardown of all the RDMA structures and removes the reference + * to them from the RPC service + **/ +void +svc_rdma_ia_destroy(struct svc_xprt *xprt) { + dprintk("%s: removing the RDMA IA transport from RPC\n", __FUNCTION__); + list_del_init(&xprt->sk_list); + rdma_close_ia(xprt->sx_data); + kfree(xprt); +} + +/** + * svc_create_rdma_xprt - Create an RDMA PSP for the RPC service + * @serv: the RPC service the PSP will belong to + * @protocol: currently unused + * @port: the port to listen on in network byte-order + * + * Returns 0 on success, -ENOMEM if the transport can't be allocated, + * -1 if the interface adapter can't be initialized, + * -2 if the public service point can't be created + **/ +int +svc_create_rdma_xprt(struct svc_serv *serv, int protocol, unsigned short port) +{ + DAT_CONN_QUAL connection_qualifier = (DAT_CONN_QUAL) ntohs(port); + int err; + struct svc_xprt *xprt = NULL; + + dprintk("%s: default_ia.ri_initialized = %d\n", __FUNCTION__, + default_ia.ri_initialized); + + if (!default_ia.ri_initialized) { + if (rdma_init_ia(&default_ia, ia_name, evd_min_queue_length)) { + /* no xprt exists yet: the fail path would dereference NULL */ + return -1; + } + } + + if (!(xprt = kmalloc(sizeof(*xprt), GFP_KERNEL))) { + printk("%s: Not enough memory for transport structure\n", __FUNCTION__); + return -ENOMEM; + } + memset(xprt, 0, sizeof(*xprt)); + + set_bit(SK_BUSY, &xprt->sk_flags); + xprt->sx_server = serv; + xprt->sk_lastrecv = get_seconds(); + xprt->sx_data = &default_ia; + xprt->sx_delete = svc_rdma_ia_destroy; + xprt->sx_recvfrom = svc_rdma_ia_recvfrom; + INIT_LIST_HEAD(&xprt->sk_deferred); + INIT_LIST_HEAD(&xprt->sk_ready); + sema_init(&xprt->sk_sem, 1); + + /* XXX more stuff to set? 
*/ + + spin_lock_bh(&serv->sv_lock); + list_add(&xprt->sk_list, &serv->sv_permxprts); + spin_unlock_bh(&serv->sv_lock); + + clear_bit(SK_BUSY, &xprt->sk_flags); + + dprintk("%s: created RDMA IA transport: %p\n", __FUNCTION__, xprt); + + if (rdma_create_pub_svc_point(xprt, connection_qualifier)) { + err = -2; + goto fail; + } + + return 0; + fail: + svc_rdma_ia_destroy(xprt); + return err; +} diff -purN -x '#*#' -x infiniband -x include/rdma -x config -X linux-2.6.14.3/Documentation/dontdiff linux-2.6.14.3/net/sunrpc/svcxprt_rdma_kdapl.c linux-2.6.14.3-RPCRDMA/net/sunrpc/svcxprt_rdma_kdapl.c --- linux-2.6.14.3/net/sunrpc/svcxprt_rdma_kdapl.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.14.3-RPCRDMA/net/sunrpc/svcxprt_rdma_kdapl.c 2005-12-19 23:07:06.000000000 -0500 @@ -0,0 +1,1288 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan + * All rights reserved. + * + * Permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the University of Michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. If + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * This software is provided as is, without representation from the + * University of Michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. 
The Regents + * of the University of Michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ +#ifdef KDAPL12 +#include +#else +#include +#endif +#include +#include +#include +#include +#include /* num_physpages */ +#include /* spinlock_t, spinlock functions */ +#include /* SOCK_NOSPACE */ +#include /* sk_stream_wspace, sock_wspace */ +#include /* memset() */ + +# define RPCDBG_FACILITY RPCDBG_XPRT + +/* handle operations in tasklet context, using a single, global list */ +static void rdma_op_tasklet(unsigned long data); +DECLARE_TASKLET(rdma_op_tasklets, rdma_op_tasklet, 0UL); + +static spinlock_t rdma_ops_lock = SPIN_LOCK_UNLOCKED; +static LIST_HEAD(rdma_ops); + +struct rdma_op { + struct list_head list; + struct svc_xprt *xprt; +}; + +static const char * +ststatus(unsigned int status) +{ + static const char * const ststrings[] = { + "success", + "flushed", + "local length error", + "local endpoint error", + "local protection error", + "bad response", + "remote access error", + "remote responder error", + "transport error", + "receiver not ready", + "partial packet", + "rmr operation error" + + }; + if (status <= 11) + return ststrings[status]; + return "unknown"; +} + +static void +dprintk_dat_err(const char *message, const char *function, DAT_RETURN datstatus) { + const char *major_message = NULL; + const char *minor_message = NULL; + + if (dat_strerror(datstatus, &major_message, &minor_message) == DAT_SUCCESS) { + dprintk("%s: %s. 
%s: %s\n", function, message, major_message, minor_message); + } else { + dprintk("%s: %s\n", function, message); + } +} + +/** + * async_evd_upcall - Upcall for "catastrophic" errors on the interface adapter + * @instance_data: pointer to the &rdma_ia struct + * @event: the event that triggered the upcall + * @more_events: whether there are additional events on the EVD + **/ +static void +async_evd_upcall(DAT_PVOID instance_data, + const DAT_EVENT *event, + DAT_BOOLEAN more_events) +{ + printk("%s: got event %#.8x\n", __FUNCTION__, event->event_number); +} +/* Queue @xprt on the global rdma_ops list and schedule the tasklet that drains it. */ +static inline void rdma_schedule_op_tasklet(struct svc_xprt *xprt) +{ + unsigned long lock_flags; + struct rdma_op *op = kmalloc(sizeof *op, GFP_ATOMIC); + + if (!op) + return; /* out of memory: drop this wakeup instead of dereferencing NULL */ + op->xprt = xprt; + spin_lock_irqsave(&rdma_ops_lock, lock_flags); + list_add_tail(&op->list, &rdma_ops); + spin_unlock_irqrestore(&rdma_ops_lock, lock_flags); + tasklet_schedule(&rdma_op_tasklets); +} + +static void rdma_op_tasklet(unsigned long data) +{ + unsigned long lock_flags; + struct rdma_op *op; + struct svc_xprt *xprt; + + dprintk("%s: entering\n", __FUNCTION__); + spin_lock_irqsave(&rdma_ops_lock, lock_flags); + while (!list_empty(&rdma_ops)) { + op = list_entry(rdma_ops.next, struct rdma_op, list); + list_del(&op->list); + spin_unlock_irqrestore(&rdma_ops_lock, lock_flags); + + xprt = op->xprt; + kfree(op); + + set_bit(SK_DATA, &xprt->sk_flags); + svc_sock_enqueue(xprt); + + spin_lock_irqsave(&rdma_ops_lock, lock_flags); + } + spin_unlock_irqrestore(&rdma_ops_lock, lock_flags); + dprintk("%s: leaving\n", __FUNCTION__); +} + +/** + * dto_event_upcall - Handles data transfer events + * + * @instance_data: pointer to the rdma_ia struct + * @event: the event that triggered the upcall + * @more_events: whether there are additional events on the EVD + * + * This upcall handles DTO, (recv, send, bind and unbind) events. 
+ * It is reentrant but has been specified using DAT_UPCALL_SINGLE_INSTANCE + * in order to maintain ordering of receives to keep server credits. + * It must also be prepared to be called from interrupt context, + * so it must not block or perform blocking calls. + * + * It is the responsibility of the scheduled tasklet to return + * recv buffers to the pool. NOTE: this affects synchronization of + * connection shutdown. That is, the structures required for + * the completion of the reply handler must remain intact until + * all memory has been reclaimed. There is some work here TBD. + * + * Note that send events are suppressed and do not result in an upcall. + **/ +static void +in_dto_event_upcall(DAT_PVOID instance_data, + const DAT_EVENT *event, DAT_BOOLEAN more_events) +{ + const DAT_DTO_COMPLETION_EVENT_DATA *dto_data = NULL; + struct svc_xprt *xprt; + struct rdma_ep *ep; + + if (!event) { + dprintk("%s: event is NULL\n", __FUNCTION__); + return; + } + + switch (event->event_number) { + case DAT_DTO_COMPLETION_EVENT: + dto_data = &event->event_data.dto_completion_event_data; + + dprintk("%s: got DAT_DTO_COMPLETION_EVENT; status: %s\n", + __FUNCTION__, + ststatus(dto_data->status)); + if (dto_data->status != DAT_DTO_SUCCESS) break; + + if (!dto_data->user_cookie.as_ptr) { + dprintk("%s: cookie is NULL\n", __FUNCTION__); + return; + } + + ep = (struct rdma_ep *) dto_data->user_cookie.as_ptr; + xprt = (struct svc_xprt *) ep->re_xprt; + ep->re_recvlen = dto_data->transfered_length; + dprintk("%s: received %llu bytes on ep %p\n", __FUNCTION__, ep->re_recvlen, ep); + rdma_schedule_op_tasklet(xprt); + + return; + default: + dprintk("%s: got event %#.8x\n", __FUNCTION__, event->event_number); + return; + } +} + +static void +out_dto_event_upcall(DAT_PVOID instance_data, + const DAT_EVENT *event, DAT_BOOLEAN more_events) +{ + const DAT_DTO_COMPLETION_EVENT_DATA dto_data = + event->event_data.dto_completion_event_data; + + if (dto_data.status == DAT_DTO_SUCCESS) { + 
dprintk("%s: succesfully sent %llu bytes\n", __FUNCTION__, + dto_data.transfered_length); + } else { + dprintk("%s: got DAT_DTO_COMPLETION_EVENT; status: %s\n", + __FUNCTION__, + ststatus(dto_data.status)); + } +} + +/** + * free_evd - Cleans up an EVD to allow it to be reclaimed. + * @evd_handle: pointer to the handle of the EVD to be retired + * + * Modifies the upcall policy prior to calling dat_evd_free to + * ensure graceful retirement. After freeing, sets the handle + * to DAT_HANDLE_NULL to prevent subsequent use. + **/ +static void +free_evd(DAT_EVD_HANDLE *evd_handle) { + DAT_RETURN datstatus; + DAT_EVENT junk; + + if (*evd_handle == DAT_HANDLE_NULL) { + return; + } + + /* XXX do we need to synchronize this? */ + datstatus = dat_evd_modify_upcall(*evd_handle, + DAT_UPCALL_TEARDOWN, + DAT_UPCALL_SAME); + if (datstatus != DAT_SUCCESS) { + dprintk_dat_err("dat_evd_modify_upcall failed", __FUNCTION__, datstatus); + return; + } + + /* drain any remaining events from the EVD */ + while ((datstatus = dat_evd_dequeue(*evd_handle, &junk)) == DAT_SUCCESS) { + dprintk("%s: drained outstanding event\n", __FUNCTION__); + } + + if (DAT_GET_TYPE(datstatus) == DAT_QUEUE_EMPTY) { + dprintk("%s: EVD queue empty\n", __FUNCTION__); + } else { + dprintk_dat_err("dat_evd_dequeue failed", __FUNCTION__, datstatus); + } + + datstatus = dat_evd_free(*evd_handle); + if (datstatus != DAT_SUCCESS) { + dprintk_dat_err("dat_evd_free failed", __FUNCTION__, datstatus); + } else { + *evd_handle = DAT_HANDLE_NULL; + } +} + +/** + * rdma_close_ep - closes and deallocates an RDMA endpoint structure + * @ep: the endpoint to close + * + * dat_ep_disconnect is called and we wait for the disconnect event, + * then the EVDs are drained and freed and dat_ep_free is called. The + * receive and reply pages are freed last, once the endpoint is torn down. 
+ **/ +static void +rdma_close_ep(struct rdma_ep *ep) { + u32 i; + DAT_RETURN datstatus; + + dprintk("%s: closing rdma_ep %p\n", __FUNCTION__, ep); + + ep->re_closing = 1; /* XXX synchronize this? */ + + list_del_init(&ep->re_list); /* remove from IA list of EPs */ + + /* XXX synchronize the connected variable */ + if (ep->re_connected) { + datstatus = dat_ep_disconnect(ep->re_handle, DAT_CLOSE_ABRUPT_FLAG); + if (datstatus != DAT_SUCCESS) { + printk("%s: dat_ep_disconnect failed (%x)\n", + __FUNCTION__, datstatus); + } else { + dprintk("%s: dat_ep_disconnect on %p succeeded\n", + __FUNCTION__, ep->re_handle); + } + + dprintk("%s: waiting for disconnect event\n", __FUNCTION__); + wait_event_interruptible_timeout(ep->re_connect_wait, + ep->re_connected == 0, 5 * HZ); + dprintk("%s: disconnect event recieved\n", __FUNCTION__); + } else { + dprintk("%s: EP already disconnected by peer\n", __FUNCTION__); + } + + free_evd(&ep->re_conn_evd); + free_evd(&ep->re_in_dto_evd); + free_evd(&ep->re_out_dto_evd); + + dprintk("%s: EP EVDs freed\n", __FUNCTION__); + + datstatus = dat_ep_free(ep->re_handle); + + dprintk("%s: datstatus = %x\n", __FUNCTION__, datstatus); + dprintk_dat_err("dat_ep_free", __FUNCTION__, datstatus); + + if (datstatus != DAT_SUCCESS) { + printk("%s: dat_ep_free failed (%x)\n", __FUNCTION__, datstatus); + } else { + dprintk("%s: dat_ep_free on %p succeeded\n", __FUNCTION__, ep->re_handle); + } + + for (i = 0; i < RDMA_MAX_PAGES && ep->re_pages[i]; i++) { + free_page((unsigned long) ep->re_pages[i]); + } + free_page((unsigned long) ep->re_pages[RDMA_MAX_PAGES]); /* reply page */ + /* rdma_ep structure is freed by svc_rdma_ep_destroy */ +} + +/** + * svc_rdma_ep_destroy - Remove an RDMA endpoint transport instance + * @xprt: the transport instance representing the RDMA endpoint + * + * Closes the endpoint and deallocates the associated structures + * XXX This probably doesn't correctly handle the closing of a xprt + * that is currently in use + **/ +void
+svc_rdma_ep_destroy(struct svc_xprt *xprt) { + struct svc_serv *serv = xprt->sx_server; + struct svc_xprt *xprt1 = NULL; + struct svc_xprt *temp = NULL; + + dprintk("%s: removing EP xprt %p from server %p\n", + _